4 * Copyright (c) 2003-2008 Fabrice Bellard
5 * Copyright (c) 2011-2015 Red Hat Inc
8 * Juan Quintela <quintela@redhat.com>
10 * Permission is hereby granted, free of charge, to any person obtaining a copy
11 * of this software and associated documentation files (the "Software"), to deal
12 * in the Software without restriction, including without limitation the rights
13 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14 * copies of the Software, and to permit persons to whom the Software is
15 * furnished to do so, subject to the following conditions:
17 * The above copyright notice and this permission notice shall be included in
18 * all copies or substantial portions of the Software.
20 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
23 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
29 #include "qemu/osdep.h"
30 #include "qemu/cutils.h"
31 #include "qemu/bitops.h"
32 #include "qemu/bitmap.h"
33 #include "qemu/madvise.h"
34 #include "qemu/main-loop.h"
35 #include "io/channel-null.h"
38 #include "migration.h"
39 #include "migration/register.h"
40 #include "migration/misc.h"
41 #include "qemu-file.h"
42 #include "postcopy-ram.h"
43 #include "page_cache.h"
44 #include "qemu/error-report.h"
45 #include "qapi/error.h"
46 #include "qapi/qapi-types-migration.h"
47 #include "qapi/qapi-events-migration.h"
48 #include "qapi/qmp/qerror.h"
50 #include "exec/ram_addr.h"
51 #include "exec/target_page.h"
52 #include "qemu/rcu_queue.h"
53 #include "migration/colo.h"
55 #include "sysemu/cpu-throttle.h"
59 #include "sysemu/runstate.h"
62 #include "hw/boards.h" /* for machine_dump_guest_core() */
64 #if defined(__linux__)
65 #include "qemu/userfaultfd.h"
66 #endif /* defined(__linux__) */
68 /***********************************************************/
69 /* ram save/restore */
72 * RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS; it
73 * worked for pages that were filled with the same char. We switched
74 * it to only search for the zero value, and renamed it to avoid
75 * confusion with RAM_SAVE_FLAG_COMPRESS_PAGE.
78 * RAM_SAVE_FLAG_FULL was obsoleted in 2009; it can be reused now
80 #define RAM_SAVE_FLAG_FULL 0x01
81 #define RAM_SAVE_FLAG_ZERO 0x02
82 #define RAM_SAVE_FLAG_MEM_SIZE 0x04
83 #define RAM_SAVE_FLAG_PAGE 0x08
84 #define RAM_SAVE_FLAG_EOS 0x10
85 #define RAM_SAVE_FLAG_CONTINUE 0x20
86 #define RAM_SAVE_FLAG_XBZRLE 0x40
87 /* 0x80 is reserved in qemu-file.h for RAM_SAVE_FLAG_HOOK */
88 #define RAM_SAVE_FLAG_COMPRESS_PAGE 0x100
89 #define RAM_SAVE_FLAG_MULTIFD_FLUSH 0x200
90 /* We can't use any flag that is bigger than 0x200 */
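/*
 * XBZRLE encoder entry point. It defaults to the generic C implementation;
 * when built with CONFIG_AVX512BW_OPT and the host CPU has usable AVX512BW
 * support, the init_cpu_flag() constructor below switches it to the
 * accelerated variant.
 */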
92 int (*xbzrle_encode_buffer_func)(uint8_t *, uint8_t *, int,
93 uint8_t *, int) = xbzrle_encode_buffer;
94 #if defined(CONFIG_AVX512BW_OPT)
95 #include "qemu/cpuid.h"
96 static void __attribute__((constructor)) init_cpu_flag(void)
98 unsigned max = __get_cpuid_max(0, NULL);
101 __cpuid(1, a, b, c, d);
102 /* We must check that AVX is not just available, but usable. */
103 if ((c & bit_OSXSAVE) && (c & bit_AVX) && max >= 7) {
105 __asm("xgetbv" : "=a"(bv), "=d"(d) : "c"(0));
106 __cpuid_count(7, 0, a, b, c, d);
108 * XCR0[7:5] = 111b (OPMASK state, upper 256-bit of ZMM0-ZMM15
109 * and ZMM16-ZMM31 state are enabled by OS)
110 * XCR0[2:1] = 11b (XMM state and YMM state are enabled by OS)
112 if ((bv & 0xe6) == 0xe6 && (b & bit_AVX512BW)) {
113 xbzrle_encode_buffer_func = xbzrle_encode_buffer_avx512;
120 XBZRLECacheStats xbzrle_counters;
122 /* used by the search for pages to send */
123 struct PageSearchStatus {
124 /* The migration channel used for a specific host page */
125 QEMUFile *pss_channel;
126 /* Last block from where we have sent data */
127 RAMBlock *last_sent_block;
128 /* Current block being searched */
130 /* Current page to search from */
132 /* Set once we wrap around */
134 /* Whether we're sending a host page */
135 bool host_page_sending;
136 /* The start/end of current host page. Invalid if host_page_sending==false */
137 unsigned long host_page_start;
138 unsigned long host_page_end;
140 typedef struct PageSearchStatus PageSearchStatus;
142 /* This struct contains the XBZRLE cache and a static page
143 used by the compression */
145 /* buffer used for XBZRLE encoding */
146 uint8_t *encoded_buf;
147 /* buffer for storing page content */
148 uint8_t *current_buf;
149 /* Cache for XBZRLE, protected by lock. */
152 /* it will store a page full of zeros */
153 uint8_t *zero_target_page;
154 /* buffer used for XBZRLE decoding */
155 uint8_t *decoded_buf;
158 static void XBZRLE_cache_lock(void)
160 if (migrate_xbzrle()) {
161 qemu_mutex_lock(&XBZRLE.lock);
165 static void XBZRLE_cache_unlock(void)
167 if (migrate_xbzrle()) {
168 qemu_mutex_unlock(&XBZRLE.lock);
173 * xbzrle_cache_resize: resize the xbzrle cache
175 * This function is called from migrate_params_apply in the main
176 * thread, possibly while a migration is in progress. A running
177 * migration may be using the cache and might finish during this call,
178 * hence changes to the cache are protected by XBZRLE.lock.
180 * Returns 0 for success or -1 for error
182 * @new_size: new cache size
183 * @errp: set *errp with the reason if the check failed
185 int xbzrle_cache_resize(uint64_t new_size, Error **errp)
187 PageCache *new_cache;
190 /* Check for truncation */
191 if (new_size != (size_t)new_size) {
192 error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
193 "exceeding address space");
197 if (new_size == migrate_xbzrle_cache_size()) {
204 if (XBZRLE.cache != NULL) {
205 new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp);
211 cache_fini(XBZRLE.cache);
212 XBZRLE.cache = new_cache;
215 XBZRLE_cache_unlock();
219 static bool postcopy_preempt_active(void)
221 return migrate_postcopy_preempt() && migration_in_postcopy();
224 bool ramblock_is_ignored(RAMBlock *block)
226 return !qemu_ram_is_migratable(block) ||
227 (migrate_ignore_shared() && qemu_ram_is_shared(block));
230 #undef RAMBLOCK_FOREACH
232 int foreach_not_ignored_block(RAMBlockIterFunc func, void *opaque)
237 RCU_READ_LOCK_GUARD();
239 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
240 ret = func(block, opaque);
248 static void ramblock_recv_map_init(void)
252 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
253 assert(!rb->receivedmap);
254 rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits());
258 int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr)
260 return test_bit(ramblock_recv_bitmap_offset(host_addr, rb),
264 bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset)
266 return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap);
269 void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr)
271 set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap);
274 void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr,
277 bitmap_set_atomic(rb->receivedmap,
278 ramblock_recv_bitmap_offset(host_addr, rb),
282 #define RAMBLOCK_RECV_BITMAP_ENDING (0x0123456789abcdefULL)
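/*
 * Sentinel appended after the bitmap on the wire so the reader of the
 * stream can detect a corrupted or truncated bitmap (see
 * ramblock_recv_bitmap_send() below).
 */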
285 * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes).
287 * Returns >0 (the number of bytes sent) on success, or <0 on error.
289 int64_t ramblock_recv_bitmap_send(QEMUFile *file,
290 const char *block_name)
292 RAMBlock *block = qemu_ram_block_by_name(block_name);
293 unsigned long *le_bitmap, nbits;
297 error_report("%s: invalid block name: %s", __func__, block_name);
301 nbits = block->postcopy_length >> TARGET_PAGE_BITS;
304 * Make sure the tmp bitmap buffer is big enough, e.g., on 32bit
305 * machines we may need 4 more bytes for padding (see below
306 * comment). So extend it a bit beforehand.
308 le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
311 * Always use little endian when sending the bitmap. This is
312 * required when the source and destination VMs are not using the
313 * same endianness. (Note: big endian won't work.)
315 bitmap_to_le(le_bitmap, block->receivedmap, nbits);
317 /* Size of the bitmap, in bytes */
318 size = DIV_ROUND_UP(nbits, 8);
321 * size is always aligned to 8 bytes on 64bit machines, but that
322 * may not be true on 32bit machines. We need this padding to
323 * make sure the migration can survive even between 32bit and
326 size = ROUND_UP(size, 8);
328 qemu_put_be64(file, size);
329 qemu_put_buffer(file, (const uint8_t *)le_bitmap, size);
331 * Mark as an end, in case the middle part is screwed up due to
332 * some "mysterious" reason.
334 qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING);
339 if (qemu_file_get_error(file)) {
340 return qemu_file_get_error(file);
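/* Return the bitmap size plus the size field itself; the ending marker is not counted. */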
343 return size + sizeof(size);
347 * An outstanding page request, on the source, having been received
350 struct RAMSrcPageRequest {
355 QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
358 /* State of RAM for migration */
361 * PageSearchStatus structures for the channels when sending pages.
362 * Protected by the bitmap_mutex.
364 PageSearchStatus pss[RAM_CHANNEL_MAX];
365 /* UFFD file descriptor, used in 'write-tracking' migration */
367 /* total ram size in bytes */
368 uint64_t ram_bytes_total;
369 /* Last block that we have visited searching for dirty pages */
370 RAMBlock *last_seen_block;
371 /* Last dirty target page we have sent */
372 ram_addr_t last_page;
373 /* last ram version we have seen */
374 uint32_t last_version;
375 /* How many times we have had too many dirty pages */
376 int dirty_rate_high_cnt;
377 /* these variables are used for bitmap sync */
378 /* last time we did a full bitmap_sync */
379 int64_t time_last_bitmap_sync;
380 /* bytes transferred at start_time */
381 uint64_t bytes_xfer_prev;
382 /* number of dirty pages since start_time */
383 uint64_t num_dirty_pages_period;
384 /* xbzrle misses since the beginning of the period */
385 uint64_t xbzrle_cache_miss_prev;
386 /* Amount of xbzrle pages since the beginning of the period */
387 uint64_t xbzrle_pages_prev;
388 /* Amount of xbzrle encoded bytes since the beginning of the period */
389 uint64_t xbzrle_bytes_prev;
390 /* Start using XBZRLE (e.g., after the first round). */
392 /* Are we on the last stage of migration */
394 /* compression statistics since the beginning of the period */
395 /* number of times there was no free thread to compress data */
396 uint64_t compress_thread_busy_prev;
397 /* amount of bytes after compression */
398 uint64_t compressed_size_prev;
399 /* amount of compressed pages */
400 uint64_t compress_pages_prev;
402 /* total handled target pages at the beginning of period */
403 uint64_t target_page_count_prev;
404 /* total handled target pages since start */
405 uint64_t target_page_count;
406 /* number of dirty bits in the bitmap */
407 uint64_t migration_dirty_pages;
410 * - dirty/clear bitmap
411 * - migration_dirty_pages
414 QemuMutex bitmap_mutex;
415 /* The RAMBlock used in the last src_page_requests */
416 RAMBlock *last_req_rb;
417 /* Queue of outstanding page requests from the destination */
418 QemuMutex src_page_req_mutex;
419 QSIMPLEQ_HEAD(, RAMSrcPageRequest) src_page_requests;
421 typedef struct RAMState RAMState;
423 static RAMState *ram_state;
425 static NotifierWithReturnList precopy_notifier_list;
427 /* Whether postcopy has queued requests? */
428 static bool postcopy_has_request(RAMState *rs)
430 return !QSIMPLEQ_EMPTY_ATOMIC(&rs->src_page_requests);
433 void precopy_infrastructure_init(void)
435 notifier_with_return_list_init(&precopy_notifier_list);
438 void precopy_add_notifier(NotifierWithReturn *n)
440 notifier_with_return_list_add(&precopy_notifier_list, n);
443 void precopy_remove_notifier(NotifierWithReturn *n)
445 notifier_with_return_remove(n);
448 int precopy_notify(PrecopyNotifyReason reason, Error **errp)
450 PrecopyNotifyData pnd;
454 return notifier_with_return_list_notify(&precopy_notifier_list, &pnd);
457 uint64_t ram_bytes_remaining(void)
459 return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) :
463 RAMStats ram_counters;
465 void ram_transferred_add(uint64_t bytes)
467 if (runstate_is_running()) {
468 stat64_add(&ram_counters.precopy_bytes, bytes);
469 } else if (migration_in_postcopy()) {
470 stat64_add(&ram_counters.postcopy_bytes, bytes);
472 stat64_add(&ram_counters.downtime_bytes, bytes);
474 stat64_add(&ram_counters.transferred, bytes);
477 struct MigrationOps {
478 int (*ram_save_target_page)(RAMState *rs, PageSearchStatus *pss);
480 typedef struct MigrationOps MigrationOps;
482 MigrationOps *migration_ops;
484 CompressionStats compression_counters;
486 struct CompressParam {
496 /* internally used fields */
500 typedef struct CompressParam CompressParam;
502 struct DecompressParam {
512 typedef struct DecompressParam DecompressParam;
514 static CompressParam *comp_param;
515 static QemuThread *compress_threads;
516 /* comp_done_cond is used to wake up the migration thread when
517 * one of the compression threads has finished the compression.
518 * comp_done_lock is used together with comp_done_cond.
520 static QemuMutex comp_done_lock;
521 static QemuCond comp_done_cond;
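/* The decomp_* counterparts below serve the same purpose for the decompression threads. */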
523 static QEMUFile *decomp_file;
524 static DecompressParam *decomp_param;
525 static QemuThread *decompress_threads;
526 static QemuMutex decomp_done_lock;
527 static QemuCond decomp_done_cond;
529 static int ram_save_host_page_urgent(PageSearchStatus *pss);
531 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
532 ram_addr_t offset, uint8_t *source_buf);
534 /* NOTE: page is the PFN, not a real ram_addr_t. */
535 static void pss_init(PageSearchStatus *pss, RAMBlock *rb, ram_addr_t page)
539 pss->complete_round = false;
543 * Check whether two PSSs are actively sending the same page. Return true
544 * if they are, false otherwise.
546 static bool pss_overlap(PageSearchStatus *pss1, PageSearchStatus *pss2)
548 return pss1->host_page_sending && pss2->host_page_sending &&
549 (pss1->host_page_start == pss2->host_page_start);
552 static void *do_data_compress(void *opaque)
554 CompressParam *param = opaque;
559 qemu_mutex_lock(&param->mutex);
560 while (!param->quit) {
562 block = param->block;
563 offset = param->offset;
565 qemu_mutex_unlock(&param->mutex);
567 zero_page = do_compress_ram_page(param->file, &param->stream,
568 block, offset, param->originbuf);
570 qemu_mutex_lock(&comp_done_lock);
572 param->zero_page = zero_page;
573 qemu_cond_signal(&comp_done_cond);
574 qemu_mutex_unlock(&comp_done_lock);
576 qemu_mutex_lock(&param->mutex);
578 qemu_cond_wait(&param->cond, &param->mutex);
581 qemu_mutex_unlock(&param->mutex);
586 static void compress_threads_save_cleanup(void)
590 if (!migrate_compress() || !comp_param) {
594 thread_count = migrate_compress_threads();
595 for (i = 0; i < thread_count; i++) {
597 * we use it as an indicator of whether the thread is
598 * properly initialized or not
600 if (!comp_param[i].file) {
604 qemu_mutex_lock(&comp_param[i].mutex);
605 comp_param[i].quit = true;
606 qemu_cond_signal(&comp_param[i].cond);
607 qemu_mutex_unlock(&comp_param[i].mutex);
609 qemu_thread_join(compress_threads + i);
610 qemu_mutex_destroy(&comp_param[i].mutex);
611 qemu_cond_destroy(&comp_param[i].cond);
612 deflateEnd(&comp_param[i].stream);
613 g_free(comp_param[i].originbuf);
614 qemu_fclose(comp_param[i].file);
615 comp_param[i].file = NULL;
617 qemu_mutex_destroy(&comp_done_lock);
618 qemu_cond_destroy(&comp_done_cond);
619 g_free(compress_threads);
621 compress_threads = NULL;
625 static int compress_threads_save_setup(void)
629 if (!migrate_compress()) {
632 thread_count = migrate_compress_threads();
633 compress_threads = g_new0(QemuThread, thread_count);
634 comp_param = g_new0(CompressParam, thread_count);
635 qemu_cond_init(&comp_done_cond);
636 qemu_mutex_init(&comp_done_lock);
637 for (i = 0; i < thread_count; i++) {
638 comp_param[i].originbuf = g_try_malloc(TARGET_PAGE_SIZE);
639 if (!comp_param[i].originbuf) {
643 if (deflateInit(&comp_param[i].stream,
644 migrate_compress_level()) != Z_OK) {
645 g_free(comp_param[i].originbuf);
649 /* comp_param[i].file is just used as a dummy buffer to save data,
650 * set its ops to empty.
652 comp_param[i].file = qemu_file_new_output(
653 QIO_CHANNEL(qio_channel_null_new()));
654 comp_param[i].done = true;
655 comp_param[i].quit = false;
656 qemu_mutex_init(&comp_param[i].mutex);
657 qemu_cond_init(&comp_param[i].cond);
658 qemu_thread_create(compress_threads + i, "compress",
659 do_data_compress, comp_param + i,
660 QEMU_THREAD_JOINABLE);
665 compress_threads_save_cleanup();
670 * save_page_header: write page header to wire
672 * If this is the 1st block, it also writes the block identification
674 * Returns the number of bytes written
676 * @pss: current PSS channel status
677 * @block: block that contains the page we want to send
678 * @offset: offset inside the block for the page
679 * in the lower bits, it contains flags
681 static size_t save_page_header(PageSearchStatus *pss, QEMUFile *f,
682 RAMBlock *block, ram_addr_t offset)
685 bool same_block = (block == pss->last_sent_block);
688 offset |= RAM_SAVE_FLAG_CONTINUE;
690 qemu_put_be64(f, offset);
694 len = strlen(block->idstr);
695 qemu_put_byte(f, len);
696 qemu_put_buffer(f, (uint8_t *)block->idstr, len);
698 pss->last_sent_block = block;
704 * mig_throttle_guest_down: throttle down the guest
706 * Reduce amount of guest cpu execution to hopefully slow down memory
707 * writes. If guest dirty memory rate is reduced below the rate at
708 * which we can transfer pages to the destination then we should be
709 * able to complete migration. Some workloads dirty memory way too
710 * fast and will not effectively converge, even with auto-converge.
712 static void mig_throttle_guest_down(uint64_t bytes_dirty_period,
713 uint64_t bytes_dirty_threshold)
715 uint64_t pct_initial = migrate_cpu_throttle_initial();
716 uint64_t pct_increment = migrate_cpu_throttle_increment();
717 bool pct_tailslow = migrate_cpu_throttle_tailslow();
718 int pct_max = migrate_max_cpu_throttle();
720 uint64_t throttle_now = cpu_throttle_get_percentage();
721 uint64_t cpu_now, cpu_ideal, throttle_inc;
723 /* We have not started throttling yet. Let's start it. */
724 if (!cpu_throttle_active()) {
725 cpu_throttle_set(pct_initial);
727 /* Throttling already on, just increase the rate */
729 throttle_inc = pct_increment;
731 /* Compute the ideal CPU percentage used by the guest, which may
732 * make the dirty rate match the dirty rate threshold. */
733 cpu_now = 100 - throttle_now;
734 cpu_ideal = cpu_now * (bytes_dirty_threshold * 1.0 /
736 throttle_inc = MIN(cpu_now - cpu_ideal, pct_increment);
738 cpu_throttle_set(MIN(throttle_now + throttle_inc, pct_max));
742 void mig_throttle_counter_reset(void)
744 RAMState *rs = ram_state;
746 rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
747 rs->num_dirty_pages_period = 0;
748 rs->bytes_xfer_prev = stat64_get(&ram_counters.transferred);
752 * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
754 * @rs: current RAM state
755 * @current_addr: address for the zero page
757 * Update the xbzrle cache to reflect a page that's been sent as all 0.
758 * The important thing is that a stale (not-yet-0'd) page be replaced
760 * As a bonus, if the page wasn't in the cache it gets added so that
761 * when a small write is made into the 0'd page it gets XBZRLE sent.
763 static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
765 /* We don't care if this fails to allocate a new cache page
766 * as long as it updated an old one */
767 cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
768 stat64_get(&ram_counters.dirty_sync_count));
771 #define ENCODING_FLAG_XBZRLE 0x1
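/*
 * Sub-header byte written right after the page header of a
 * RAM_SAVE_FLAG_XBZRLE page; it is followed by a 16-bit encoded length and
 * the encoded data (see save_xbzrle_page() below).
 */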
774 * save_xbzrle_page: compress and send current page
776 * Returns: 1 means that we wrote the page
777 * 0 means that page is identical to the one already sent
778 * -1 means that xbzrle would be longer than normal
780 * @rs: current RAM state
781 * @pss: current PSS channel
782 * @current_data: pointer to the address of the page contents
783 * @current_addr: addr of the page
784 * @block: block that contains the page we want to send
785 * @offset: offset inside the block for the page
787 static int save_xbzrle_page(RAMState *rs, PageSearchStatus *pss,
788 uint8_t **current_data, ram_addr_t current_addr,
789 RAMBlock *block, ram_addr_t offset)
791 int encoded_len = 0, bytes_xbzrle;
792 uint8_t *prev_cached_page;
793 QEMUFile *file = pss->pss_channel;
794 uint64_t generation = stat64_get(&ram_counters.dirty_sync_count);
796 if (!cache_is_cached(XBZRLE.cache, current_addr, generation)) {
797 xbzrle_counters.cache_miss++;
798 if (!rs->last_stage) {
799 if (cache_insert(XBZRLE.cache, current_addr, *current_data,
803 /* update *current_data when the page has been
804 inserted into cache */
805 *current_data = get_cached_data(XBZRLE.cache, current_addr);
812 * Reaching here means the page has hit the xbzrle cache; no matter what
813 * the encoding result is (normal encoding, overflow or skipping the page),
814 * count the page as encoded. This is used to calculate the encoding rate.
816 * Example: 2 pages (8KB) being encoded, first page encoding generates 2KB,
817 * 2nd page turns out to be skipped (i.e. no new bytes written to the
818 * page), the overall encoding rate will be 8KB / 2KB = 4, which has the
819 * skipped page included. In this way, the encoding rate can tell if the
820 * guest page is good for xbzrle encoding.
822 xbzrle_counters.pages++;
823 prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
825 /* save current buffer into memory */
826 memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
828 /* XBZRLE encoding (if there is no overflow) */
829 encoded_len = xbzrle_encode_buffer_func(prev_cached_page, XBZRLE.current_buf,
830 TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
834 * Update the cache contents, so that it corresponds to the data
835 * sent, in all cases except where we skip the page.
837 if (!rs->last_stage && encoded_len != 0) {
838 memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
840 * In the case where we couldn't compress, ensure that the caller
841 * sends the data from the cache, since the guest might have
842 * changed the RAM since we copied it.
844 *current_data = prev_cached_page;
847 if (encoded_len == 0) {
848 trace_save_xbzrle_page_skipping();
850 } else if (encoded_len == -1) {
851 trace_save_xbzrle_page_overflow();
852 xbzrle_counters.overflow++;
853 xbzrle_counters.bytes += TARGET_PAGE_SIZE;
857 /* Send XBZRLE based compressed page */
858 bytes_xbzrle = save_page_header(pss, pss->pss_channel, block,
859 offset | RAM_SAVE_FLAG_XBZRLE);
860 qemu_put_byte(file, ENCODING_FLAG_XBZRLE);
861 qemu_put_be16(file, encoded_len);
862 qemu_put_buffer(file, XBZRLE.encoded_buf, encoded_len);
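/* Account for the ENCODING_FLAG_XBZRLE byte and the 16-bit length field. */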
863 bytes_xbzrle += encoded_len + 1 + 2;
865 * Like compressed_size (please see update_compress_thread_counts),
866 * the xbzrle encoded bytes don't count the 8 byte header with
867 * RAM_SAVE_FLAG_CONTINUE.
869 xbzrle_counters.bytes += bytes_xbzrle - 8;
870 ram_transferred_add(bytes_xbzrle);
876 * pss_find_next_dirty: find the next dirty page of current ramblock
878 * This function updates pss->page to point to the next dirty page index
879 * within the ramblock to migrate, or the end of the ramblock when nothing is
880 * found. Note that when pss->host_page_sending==true it means we're
881 * in the middle of sending a host page, so we won't look for a dirty page
882 * outside the host page boundary.
884 * @pss: the current page search status
886 static void pss_find_next_dirty(PageSearchStatus *pss)
888 RAMBlock *rb = pss->block;
889 unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
890 unsigned long *bitmap = rb->bmap;
892 if (ramblock_is_ignored(rb)) {
893 /* Points directly to the end, so we know no dirty page */
899 * If we are in the middle of sending a host page, only look for dirty
900 * pages within the current host page being sent.
902 if (pss->host_page_sending) {
903 assert(pss->host_page_end);
904 size = MIN(size, pss->host_page_end);
907 pss->page = find_next_bit(bitmap, size, pss->page);
910 static void migration_clear_memory_region_dirty_bitmap(RAMBlock *rb,
916 if (!rb->clear_bmap || !clear_bmap_test_and_clear(rb, page)) {
920 shift = rb->clear_bmap_shift;
922 * CLEAR_BITMAP_SHIFT_MIN should always guarantee this... this
923 * can make things easier sometimes since then the start address
924 * of the small chunk will always be aligned to 64 pages, so the
925 * bitmap will always be aligned to unsigned long. We should
926 * even be able to remove this restriction but I'm simply
931 size = 1ULL << (TARGET_PAGE_BITS + shift);
932 start = QEMU_ALIGN_DOWN((ram_addr_t)page << TARGET_PAGE_BITS, size);
933 trace_migration_bitmap_clear_dirty(rb->idstr, start, size, page);
934 memory_region_clear_dirty_bitmap(rb->mr, start, size);
938 migration_clear_memory_region_dirty_bitmap_range(RAMBlock *rb,
940 unsigned long npages)
942 unsigned long i, chunk_pages = 1UL << rb->clear_bmap_shift;
943 unsigned long chunk_start = QEMU_ALIGN_DOWN(start, chunk_pages);
944 unsigned long chunk_end = QEMU_ALIGN_UP(start + npages, chunk_pages);
947 * Clear pages from start to start + npages - 1, so the end boundary is
950 for (i = chunk_start; i < chunk_end; i += chunk_pages) {
951 migration_clear_memory_region_dirty_bitmap(rb, i);
956 * colo_bitmap_find_dirty: find contiguous dirty pages from start
958 * Returns the page offset within memory region of the start of the contiguous
961 * @rs: current RAM state
962 * @rb: RAMBlock where to search for dirty pages
963 * @start: page where we start the search
964 * @num: the number of contiguous dirty pages
967 unsigned long colo_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
968 unsigned long start, unsigned long *num)
970 unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
971 unsigned long *bitmap = rb->bmap;
972 unsigned long first, next;
976 if (ramblock_is_ignored(rb)) {
980 first = find_next_bit(bitmap, size, start);
984 next = find_next_zero_bit(bitmap, size, first + 1);
985 assert(next >= first);
990 static inline bool migration_bitmap_clear_dirty(RAMState *rs,
997 * Clear the dirty bitmap if needed. This _must_ be called before we
998 * send any of the pages in the chunk, because we need to make sure
999 * we can capture further page content changes when we sync the dirty
1000 * log the next time. So as long as we are going to send any of
1001 * the pages in the chunk, we clear the remote dirty bitmap for all.
1002 * Clearing it earlier won't be a problem, but clearing it too late will.
1004 migration_clear_memory_region_dirty_bitmap(rb, page);
1006 ret = test_and_clear_bit(page, rb->bmap);
1008 rs->migration_dirty_pages--;
1014 static void dirty_bitmap_clear_section(MemoryRegionSection *section,
1017 const hwaddr offset = section->offset_within_region;
1018 const hwaddr size = int128_get64(section->size);
1019 const unsigned long start = offset >> TARGET_PAGE_BITS;
1020 const unsigned long npages = size >> TARGET_PAGE_BITS;
1021 RAMBlock *rb = section->mr->ram_block;
1022 uint64_t *cleared_bits = opaque;
1025 * We don't grab ram_state->bitmap_mutex because we expect to run
1026 * only when starting migration or during postcopy recovery where
1027 * we don't have concurrent access.
1029 if (!migration_in_postcopy() && !migrate_background_snapshot()) {
1030 migration_clear_memory_region_dirty_bitmap_range(rb, start, npages);
1032 *cleared_bits += bitmap_count_one_with_offset(rb->bmap, start, npages);
1033 bitmap_clear(rb->bmap, start, npages);
1037 * Exclude all dirty pages from migration that fall into a discarded range as
1038 * managed by a RamDiscardManager responsible for the mapped memory region of
1039 * the RAMBlock. Clear the corresponding bits in the dirty bitmaps.
1041 * Discarded pages ("logically unplugged") have undefined content and must
1042 * not get migrated, because even reading these pages for migration might
1043 * result in undesired behavior.
1045 * Returns the number of cleared bits in the RAMBlock dirty bitmap.
1047 * Note: The result is only stable while migrating (precopy/postcopy).
1049 static uint64_t ramblock_dirty_bitmap_clear_discarded_pages(RAMBlock *rb)
1051 uint64_t cleared_bits = 0;
1053 if (rb->mr && rb->bmap && memory_region_has_ram_discard_manager(rb->mr)) {
1054 RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
1055 MemoryRegionSection section = {
1057 .offset_within_region = 0,
1058 .size = int128_make64(qemu_ram_get_used_length(rb)),
1061 ram_discard_manager_replay_discarded(rdm, &section,
1062 dirty_bitmap_clear_section,
1065 return cleared_bits;
1069 * Check if a host-page aligned page falls into a discarded range as managed by
1070 * a RamDiscardManager responsible for the mapped memory region of the RAMBlock.
1072 * Note: The result is only stable while migrating (precopy/postcopy).
1074 bool ramblock_page_is_discarded(RAMBlock *rb, ram_addr_t start)
1076 if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
1077 RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
1078 MemoryRegionSection section = {
1080 .offset_within_region = start,
1081 .size = int128_make64(qemu_ram_pagesize(rb)),
1084 return !ram_discard_manager_is_populated(rdm, &section);
1089 /* Called with RCU critical section */
1090 static void ramblock_sync_dirty_bitmap(RAMState *rs, RAMBlock *rb)
1092 uint64_t new_dirty_pages =
1093 cpu_physical_memory_sync_dirty_bitmap(rb, 0, rb->used_length);
1095 rs->migration_dirty_pages += new_dirty_pages;
1096 rs->num_dirty_pages_period += new_dirty_pages;
1100 * ram_pagesize_summary: calculate all the pagesizes of a VM
1102 * Returns a summary bitmap of the page sizes of all RAMBlocks
1104 * For VMs with just normal pages this is equivalent to the host page
1105 * size. If it's got some huge pages then it's the OR of all the
1106 * different page sizes.
1108 uint64_t ram_pagesize_summary(void)
1111 uint64_t summary = 0;
1113 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1114 summary |= block->page_size;
1120 uint64_t ram_get_total_transferred_pages(void)
1122 return stat64_get(&ram_counters.normal_pages) +
1123 stat64_get(&ram_counters.zero_pages) +
1124 compression_counters.pages + xbzrle_counters.pages;
1127 static void migration_update_rates(RAMState *rs, int64_t end_time)
1129 uint64_t page_count = rs->target_page_count - rs->target_page_count_prev;
1130 double compressed_size;
1132 /* calculate period counters */
1133 ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000
1134 / (end_time - rs->time_last_bitmap_sync);
1140 if (migrate_xbzrle()) {
1141 double encoded_size, unencoded_size;
1143 xbzrle_counters.cache_miss_rate = (double)(xbzrle_counters.cache_miss -
1144 rs->xbzrle_cache_miss_prev) / page_count;
1145 rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
1146 unencoded_size = (xbzrle_counters.pages - rs->xbzrle_pages_prev) *
1148 encoded_size = xbzrle_counters.bytes - rs->xbzrle_bytes_prev;
1149 if (xbzrle_counters.pages == rs->xbzrle_pages_prev || !encoded_size) {
1150 xbzrle_counters.encoding_rate = 0;
1152 xbzrle_counters.encoding_rate = unencoded_size / encoded_size;
1154 rs->xbzrle_pages_prev = xbzrle_counters.pages;
1155 rs->xbzrle_bytes_prev = xbzrle_counters.bytes;
1158 if (migrate_compress()) {
1159 compression_counters.busy_rate = (double)(compression_counters.busy -
1160 rs->compress_thread_busy_prev) / page_count;
1161 rs->compress_thread_busy_prev = compression_counters.busy;
1163 compressed_size = compression_counters.compressed_size -
1164 rs->compressed_size_prev;
1165 if (compressed_size) {
1166 double uncompressed_size = (compression_counters.pages -
1167 rs->compress_pages_prev) * TARGET_PAGE_SIZE;
1169 /* Compression-Ratio = Uncompressed-size / Compressed-size */
1170 compression_counters.compression_rate =
1171 uncompressed_size / compressed_size;
1173 rs->compress_pages_prev = compression_counters.pages;
1174 rs->compressed_size_prev = compression_counters.compressed_size;
1179 static void migration_trigger_throttle(RAMState *rs)
1181 uint64_t threshold = migrate_throttle_trigger_threshold();
1182 uint64_t bytes_xfer_period =
1183 stat64_get(&ram_counters.transferred) - rs->bytes_xfer_prev;
1184 uint64_t bytes_dirty_period = rs->num_dirty_pages_period * TARGET_PAGE_SIZE;
1185 uint64_t bytes_dirty_threshold = bytes_xfer_period * threshold / 100;
1187 /* During block migration the auto-converge logic incorrectly detects
1188 * that ram migration makes no progress. Avoid this by disabling the
1189 * throttling logic during the bulk phase of block migration. */
1190 if (migrate_auto_converge() && !blk_mig_bulk_active()) {
1191 /* The following detection logic can be refined later. For now:
1192 Check to see if the ratio between dirtied bytes and the approx.
1193 amount of bytes that just got transferred since the last time
1194 we were in this routine reaches the threshold. If that happens
1195 twice, start or increase throttling. */
1197 if ((bytes_dirty_period > bytes_dirty_threshold) &&
1198 (++rs->dirty_rate_high_cnt >= 2)) {
1199 trace_migration_throttle();
1200 rs->dirty_rate_high_cnt = 0;
1201 mig_throttle_guest_down(bytes_dirty_period,
1202 bytes_dirty_threshold);
1207 static void migration_bitmap_sync(RAMState *rs)
1212 stat64_add(&ram_counters.dirty_sync_count, 1);
1214 if (!rs->time_last_bitmap_sync) {
1215 rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1218 trace_migration_bitmap_sync_start();
1219 memory_global_dirty_log_sync();
1221 qemu_mutex_lock(&rs->bitmap_mutex);
1222 WITH_RCU_READ_LOCK_GUARD() {
1223 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1224 ramblock_sync_dirty_bitmap(rs, block);
1226 ram_counters.remaining = ram_bytes_remaining();
1228 qemu_mutex_unlock(&rs->bitmap_mutex);
1230 memory_global_after_dirty_log_sync();
1231 trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);
1233 end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1235 /* more than 1 second = 1000 milliseconds */
1236 if (end_time > rs->time_last_bitmap_sync + 1000) {
1237 migration_trigger_throttle(rs);
1239 migration_update_rates(rs, end_time);
1241 rs->target_page_count_prev = rs->target_page_count;
1243 /* reset period counters */
1244 rs->time_last_bitmap_sync = end_time;
1245 rs->num_dirty_pages_period = 0;
1246 rs->bytes_xfer_prev = stat64_get(&ram_counters.transferred);
1248 if (migrate_events()) {
1249 uint64_t generation = stat64_get(&ram_counters.dirty_sync_count);
1250 qapi_event_send_migration_pass(generation);
1254 static void migration_bitmap_sync_precopy(RAMState *rs)
1256 Error *local_err = NULL;
1259 * The current notifier usage is just an optimization for migration, so we
1260 * don't stop the normal migration process in the error case.
1262 if (precopy_notify(PRECOPY_NOTIFY_BEFORE_BITMAP_SYNC, &local_err)) {
1263 error_report_err(local_err);
1267 migration_bitmap_sync(rs);
1269 if (precopy_notify(PRECOPY_NOTIFY_AFTER_BITMAP_SYNC, &local_err)) {
1270 error_report_err(local_err);
1274 void ram_release_page(const char *rbname, uint64_t offset)
1276 if (!migrate_release_ram() || !migration_in_postcopy()) {
1280 ram_discard_range(rbname, offset, TARGET_PAGE_SIZE);
1284 * save_zero_page_to_file: send the zero page to the file
1286 * Returns the size of data written to the file, 0 means the page is not
1289 * @pss: current PSS channel
1290 * @block: block that contains the page we want to send
1291 * @offset: offset inside the block for the page
1293 static int save_zero_page_to_file(PageSearchStatus *pss, QEMUFile *file,
1294 RAMBlock *block, ram_addr_t offset)
1296 uint8_t *p = block->host + offset;
1299 if (buffer_is_zero(p, TARGET_PAGE_SIZE)) {
1300 len += save_page_header(pss, file, block, offset | RAM_SAVE_FLAG_ZERO);
1301 qemu_put_byte(file, 0);
1303 ram_release_page(block->idstr, offset);
1309 * save_zero_page: send the zero page to the stream
1311 * Returns the number of pages written.
1313 * @pss: current PSS channel
1314 * @block: block that contains the page we want to send
1315 * @offset: offset inside the block for the page
1317 static int save_zero_page(PageSearchStatus *pss, QEMUFile *f, RAMBlock *block,
1320 int len = save_zero_page_to_file(pss, f, block, offset);
1323 stat64_add(&ram_counters.zero_pages, 1);
1324 ram_transferred_add(len);
1331 * @pages: the number of pages written by the control path,
1333 * > 0 - number of pages written
1335 * Return true if the page has been saved, otherwise false is returned.
1337 static bool control_save_page(PageSearchStatus *pss, RAMBlock *block,
1338 ram_addr_t offset, int *pages)
1340 uint64_t bytes_xmit = 0;
1344 ret = ram_control_save_page(pss->pss_channel, block->offset, offset,
1345 TARGET_PAGE_SIZE, &bytes_xmit);
1346 if (ret == RAM_SAVE_CONTROL_NOT_SUPP) {
1351 ram_transferred_add(bytes_xmit);
1355 if (ret == RAM_SAVE_CONTROL_DELAYED) {
1359 if (bytes_xmit > 0) {
1360 stat64_add(&ram_counters.normal_pages, 1);
1361 } else if (bytes_xmit == 0) {
1362 stat64_add(&ram_counters.zero_pages, 1);
1369 * directly send the page to the stream
1371 * Returns the number of pages written.
1373 * @pss: current PSS channel
1374 * @block: block that contains the page we want to send
1375 * @offset: offset inside the block for the page
1376 * @buf: the page to be sent
1377 * @async: send the page asynchronously
1379 static int save_normal_page(PageSearchStatus *pss, RAMBlock *block,
1380 ram_addr_t offset, uint8_t *buf, bool async)
1382 QEMUFile *file = pss->pss_channel;
1384 ram_transferred_add(save_page_header(pss, pss->pss_channel, block,
1385 offset | RAM_SAVE_FLAG_PAGE));
1387 qemu_put_buffer_async(file, buf, TARGET_PAGE_SIZE,
1388 migrate_release_ram() &&
1389 migration_in_postcopy());
1391 qemu_put_buffer(file, buf, TARGET_PAGE_SIZE);
1393 ram_transferred_add(TARGET_PAGE_SIZE);
1394 stat64_add(&ram_counters.normal_pages, 1);
1399 * ram_save_page: send the given page to the stream
1401 * Returns the number of pages written.
1403 * >=0 - Number of pages written - this might legally be 0
1404 * if xbzrle noticed the page was the same.
1406 * @rs: current RAM state
1407 * @block: block that contains the page we want to send
1408 * @offset: offset inside the block for the page
1410 static int ram_save_page(RAMState *rs, PageSearchStatus *pss)
1414 bool send_async = true;
1415 RAMBlock *block = pss->block;
1416 ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
1417 ram_addr_t current_addr = block->offset + offset;
1419 p = block->host + offset;
1420 trace_ram_save_page(block->idstr, (uint64_t)offset, p);
1422 XBZRLE_cache_lock();
1423 if (rs->xbzrle_enabled && !migration_in_postcopy()) {
1424 pages = save_xbzrle_page(rs, pss, &p, current_addr,
1426 if (!rs->last_stage) {
1427 /* Can't send this cached data async, since the cache page
1428 * might get updated before it gets to the wire
1434 /* XBZRLE overflow or normal page */
1436 pages = save_normal_page(pss, block, offset, p, send_async);
1439 XBZRLE_cache_unlock();
1444 static int ram_save_multifd_page(QEMUFile *file, RAMBlock *block,
1447 if (multifd_queue_page(file, block, offset) < 0) {
1450 stat64_add(&ram_counters.normal_pages, 1);
1455 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
1456 ram_addr_t offset, uint8_t *source_buf)
1458 RAMState *rs = ram_state;
1459 PageSearchStatus *pss = &rs->pss[RAM_CHANNEL_PRECOPY];
1460 uint8_t *p = block->host + offset;
1463 if (save_zero_page_to_file(pss, f, block, offset)) {
1467 save_page_header(pss, f, block, offset | RAM_SAVE_FLAG_COMPRESS_PAGE);
1470 * copy it to an internal buffer to avoid it being modified by the VM
1471 * so that we can catch the error during compression and
1474 memcpy(source_buf, p, TARGET_PAGE_SIZE);
1475 ret = qemu_put_compression_data(f, stream, source_buf, TARGET_PAGE_SIZE);
1477 qemu_file_set_error(migrate_get_current()->to_dst_file, ret);
1478 error_report("compressed data failed!");
1484 update_compress_thread_counts(const CompressParam *param, int bytes_xmit)
1486 ram_transferred_add(bytes_xmit);
1488 if (param->zero_page) {
1489 stat64_add(&ram_counters.zero_pages, 1);
1493 /* 8 means a header with RAM_SAVE_FLAG_CONTINUE. */
1494 compression_counters.compressed_size += bytes_xmit - 8;
1495 compression_counters.pages++;
1498 static bool save_page_use_compression(RAMState *rs);
1500 static void flush_compressed_data(RAMState *rs)
1502 MigrationState *ms = migrate_get_current();
1503 int idx, len, thread_count;
1505 if (!save_page_use_compression(rs)) {
1508 thread_count = migrate_compress_threads();
1510 qemu_mutex_lock(&comp_done_lock);
1511 for (idx = 0; idx < thread_count; idx++) {
1512 while (!comp_param[idx].done) {
1513 qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1516 qemu_mutex_unlock(&comp_done_lock);
1518 for (idx = 0; idx < thread_count; idx++) {
1519 qemu_mutex_lock(&comp_param[idx].mutex);
1520 if (!comp_param[idx].quit) {
1521 len = qemu_put_qemu_file(ms->to_dst_file, comp_param[idx].file);
1523 * it's safe to fetch zero_page without holding comp_done_lock
1524 * as there is no further request submitted to the thread,
1525 * i.e., the thread should be waiting for a request at this point.
1527 update_compress_thread_counts(&comp_param[idx], len);
1529 qemu_mutex_unlock(&comp_param[idx].mutex);
1533 static inline void set_compress_params(CompressParam *param, RAMBlock *block,
1536 param->block = block;
1537 param->offset = offset;
1540 static int compress_page_with_multi_thread(RAMBlock *block, ram_addr_t offset)
1542 int idx, thread_count, bytes_xmit = -1, pages = -1;
1543 bool wait = migrate_compress_wait_thread();
1544 MigrationState *ms = migrate_get_current();
1546 thread_count = migrate_compress_threads();
1547 qemu_mutex_lock(&comp_done_lock);
1549 for (idx = 0; idx < thread_count; idx++) {
1550 if (comp_param[idx].done) {
1551 comp_param[idx].done = false;
1552 bytes_xmit = qemu_put_qemu_file(ms->to_dst_file,
1553 comp_param[idx].file);
1554 qemu_mutex_lock(&comp_param[idx].mutex);
1555 set_compress_params(&comp_param[idx], block, offset);
1556 qemu_cond_signal(&comp_param[idx].cond);
1557 qemu_mutex_unlock(&comp_param[idx].mutex);
1559 update_compress_thread_counts(&comp_param[idx], bytes_xmit);
1565 * Wait for a free thread if the user specifies 'compress-wait-thread';
1566 * otherwise we will post the page out in the main thread as a normal page.
1568 if (pages < 0 && wait) {
1569 qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1572 qemu_mutex_unlock(&comp_done_lock);
1577 #define PAGE_ALL_CLEAN 0
1578 #define PAGE_TRY_AGAIN 1
1579 #define PAGE_DIRTY_FOUND 2
1581 * find_dirty_block: find the next dirty page and update any state
1582 * associated with the search process.
1585 * <0: An error happened
1586 * PAGE_ALL_CLEAN: no dirty page found, give up
1587 * PAGE_TRY_AGAIN: no dirty page found, retry for next block
1588 * PAGE_DIRTY_FOUND: dirty page found
1590 * @rs: current RAM state
1591 * @pss: data about the state of the current dirty page scan
1594 static int find_dirty_block(RAMState *rs, PageSearchStatus *pss)
1596 /* Update pss->page for the next dirty bit in ramblock */
1597 pss_find_next_dirty(pss);
1599 if (pss->complete_round && pss->block == rs->last_seen_block &&
1600 pss->page >= rs->last_page) {
1602 * We've been once around the RAM and haven't found anything.
1605 return PAGE_ALL_CLEAN;
1607 if (!offset_in_ramblock(pss->block,
1608 ((ram_addr_t)pss->page) << TARGET_PAGE_BITS)) {
1609 /* Didn't find anything in this RAM Block */
1611 pss->block = QLIST_NEXT_RCU(pss->block, next);
1613 if (!migrate_multifd_flush_after_each_section()) {
1614 QEMUFile *f = rs->pss[RAM_CHANNEL_PRECOPY].pss_channel;
1615 int ret = multifd_send_sync_main(f);
1619 qemu_put_be64(f, RAM_SAVE_FLAG_MULTIFD_FLUSH);
1623 * If memory migration starts over, we will meet a dirtied page
1624 * which may still exist in the compression threads' ring, so we
1625 * should flush the compressed data to make sure the new page
1626 * is not overwritten by the old one in the destination.
1628 * Also, if xbzrle is on, stop using the data compression at this
1629 * point. In theory, xbzrle can do better than compression.
1631 flush_compressed_data(rs);
1633 /* Hit the end of the list */
1634 pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
1635 /* Flag that we've looped */
1636 pss->complete_round = true;
1637 /* After the first round, enable XBZRLE. */
1638 if (migrate_xbzrle()) {
1639 rs->xbzrle_enabled = true;
1642 /* Didn't find anything this time, but try again on the new block */
1643 return PAGE_TRY_AGAIN;
1645 /* We've found something */
1646 return PAGE_DIRTY_FOUND;
1651 * unqueue_page: gets a page off the queue
1653 * Helper for 'get_queued_page' - gets a page off the queue
1655 * Returns the block of the page (or NULL if none available)
1657 * @rs: current RAM state
1658 * @offset: used to return the offset within the RAMBlock
1660 static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
1662 struct RAMSrcPageRequest *entry;
1663 RAMBlock *block = NULL;
1665 if (!postcopy_has_request(rs)) {
1669 QEMU_LOCK_GUARD(&rs->src_page_req_mutex);
1672 * This should _never_ change even after we take the lock, because no one
1673 * should be taking anything off the request list other than us.
1675 assert(postcopy_has_request(rs));
1677 entry = QSIMPLEQ_FIRST(&rs->src_page_requests);
1679 *offset = entry->offset;
1681 if (entry->len > TARGET_PAGE_SIZE) {
1682 entry->len -= TARGET_PAGE_SIZE;
1683 entry->offset += TARGET_PAGE_SIZE;
1685 memory_region_unref(block->mr);
1686 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1688 migration_consume_urgent_request();
1694 #if defined(__linux__)
1696 * poll_fault_page: try to get next UFFD write fault page and, if pending fault
1697 * is found, return RAM block pointer and page offset
1699 * Returns pointer to the RAMBlock containing faulting page,
1700 * NULL if no write faults are pending
1702 * @rs: current RAM state
1703 * @offset: page offset from the beginning of the block
1705 static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
1707 struct uffd_msg uffd_msg;
1712 if (!migrate_background_snapshot()) {
1716 res = uffd_read_events(rs->uffdio_fd, &uffd_msg, 1);
1721 page_address = (void *)(uintptr_t) uffd_msg.arg.pagefault.address;
1722 block = qemu_ram_block_from_host(page_address, false, offset);
1723 assert(block && (block->flags & RAM_UF_WRITEPROTECT) != 0);
1728 * ram_save_release_protection: release UFFD write protection after
1729 * a range of pages has been saved
1731 * @rs: current RAM state
1732 * @pss: page-search-status structure
1733 * @start_page: index of the first page in the range relative to pss->block
1735 * Returns 0 on success, negative value in case of an error
1737 static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
1738 unsigned long start_page)
1742 /* Check if page is from UFFD-managed region. */
1743 if (pss->block->flags & RAM_UF_WRITEPROTECT) {
1744 void *page_address = pss->block->host + (start_page << TARGET_PAGE_BITS);
1745 uint64_t run_length = (pss->page - start_page) << TARGET_PAGE_BITS;
1747 /* Flush async buffers before un-protect. */
1748 qemu_fflush(pss->pss_channel);
1749 /* Un-protect memory range. */
1750 res = uffd_change_protection(rs->uffdio_fd, page_address, run_length,
1757 /* ram_write_tracking_available: check if kernel supports required UFFD features
1759 * Returns true if it does, false otherwise
1761 bool ram_write_tracking_available(void)
1763 uint64_t uffd_features;
1766 res = uffd_query_features(&uffd_features);
1768 (uffd_features & UFFD_FEATURE_PAGEFAULT_FLAG_WP) != 0);
1771 /* ram_write_tracking_compatible: check if guest configuration is
1772 * compatible with 'write-tracking'
1774 * Returns true if compatible, false otherwise
1776 bool ram_write_tracking_compatible(void)
1778 const uint64_t uffd_ioctls_mask = BIT(_UFFDIO_WRITEPROTECT);
1783 /* Open UFFD file descriptor */
1784 uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, false);
1789 RCU_READ_LOCK_GUARD();
1791 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1792 uint64_t uffd_ioctls;
1794 /* Nothing to do for read-only and MMIO-writable regions */
1795 if (block->mr->readonly || block->mr->rom_device) {
1798 /* Try to register block memory via UFFD-IO to track writes */
1799 if (uffd_register_memory(uffd_fd, block->host, block->max_length,
1800 UFFDIO_REGISTER_MODE_WP, &uffd_ioctls)) {
1803 if ((uffd_ioctls & uffd_ioctls_mask) != uffd_ioctls_mask) {
1810 uffd_close_fd(uffd_fd);
1814 static inline void populate_read_range(RAMBlock *block, ram_addr_t offset,
1817 const ram_addr_t end = offset + size;
1820 * We read one byte of each page; this will preallocate page tables if
1821 * required and populate the shared zeropage on MAP_PRIVATE anonymous memory
1822 * where no page was populated yet. This might require adaptation when
1823 * supporting other mappings, like shmem.
1825 for (; offset < end; offset += block->page_size) {
1826 char tmp = *((char *)block->host + offset);
1828 /* Don't optimize the read out */
1829 asm volatile("" : "+r" (tmp));
1833 static inline int populate_read_section(MemoryRegionSection *section,
1836 const hwaddr size = int128_get64(section->size);
1837 hwaddr offset = section->offset_within_region;
1838 RAMBlock *block = section->mr->ram_block;
1840 populate_read_range(block, offset, size);
1845 * ram_block_populate_read: preallocate page tables and populate pages in the
1846 * RAM block by reading a byte of each page.
1848 * Since it's solely used for the userfault_fd WP feature, here we just
1849 * hardcode the page size to qemu_real_host_page_size.
1851 * @block: RAM block to populate
1853 static void ram_block_populate_read(RAMBlock *rb)
1856 * Skip populating all pages that fall into a discarded range as managed by
1857 * a RamDiscardManager responsible for the mapped memory region of the
1858 * RAMBlock. Such discarded ("logically unplugged") parts of a RAMBlock
1859 * must not get populated automatically. We don't have to track
1860 * modifications via userfaultfd WP reliably, because these pages will
1861 * not be part of the migration stream either way -- see
1862 * ramblock_dirty_bitmap_exclude_discarded_pages().
1864 * Note: The result is only stable while migrating (precopy/postcopy).
1866 if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
1867 RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
1868 MemoryRegionSection section = {
1870 .offset_within_region = 0,
1871 .size = rb->mr->size,
1874 ram_discard_manager_replay_populated(rdm, &section,
1875 populate_read_section, NULL);
1877 populate_read_range(rb, 0, rb->used_length);
1882 * ram_write_tracking_prepare: prepare for UFFD-WP memory tracking
1884 void ram_write_tracking_prepare(void)
1888 RCU_READ_LOCK_GUARD();
1890 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1891 /* Nothing to do for read-only and MMIO-writable regions */
1892 if (block->mr->readonly || block->mr->rom_device) {
1897 * Populate pages of the RAM block before enabling userfault_fd
1900 * This stage is required since ioctl(UFFDIO_WRITEPROTECT) with
1901 * UFFDIO_WRITEPROTECT_MODE_WP mode setting would silently skip
1902 * pages with pte_none() entries in page table.
1904 ram_block_populate_read(block);
1908 static inline int uffd_protect_section(MemoryRegionSection *section,
1911 const hwaddr size = int128_get64(section->size);
1912 const hwaddr offset = section->offset_within_region;
1913 RAMBlock *rb = section->mr->ram_block;
1914 int uffd_fd = (uintptr_t)opaque;
1916 return uffd_change_protection(uffd_fd, rb->host + offset, size, true,
1920 static int ram_block_uffd_protect(RAMBlock *rb, int uffd_fd)
1922 assert(rb->flags & RAM_UF_WRITEPROTECT);
1924 /* See ram_block_populate_read() */
1925 if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
1926 RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
1927 MemoryRegionSection section = {
1929 .offset_within_region = 0,
1930 .size = rb->mr->size,
1933 return ram_discard_manager_replay_populated(rdm, &section,
1934 uffd_protect_section,
1935 (void *)(uintptr_t)uffd_fd);
1937 return uffd_change_protection(uffd_fd, rb->host,
1938 rb->used_length, true, false);
1942 * ram_write_tracking_start: start UFFD-WP memory tracking
1944 * Returns 0 for success or negative value in case of error
1946 int ram_write_tracking_start(void)
1949 RAMState *rs = ram_state;
1952 /* Open UFFD file descriptor */
1953 uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, true);
1957 rs->uffdio_fd = uffd_fd;
1959 RCU_READ_LOCK_GUARD();
1961 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1962 /* Nothing to do for read-only and MMIO-writable regions */
1963 if (block->mr->readonly || block->mr->rom_device) {
1967 /* Register block memory with UFFD to track writes */
1968 if (uffd_register_memory(rs->uffdio_fd, block->host,
1969 block->max_length, UFFDIO_REGISTER_MODE_WP, NULL)) {
1972 block->flags |= RAM_UF_WRITEPROTECT;
1973 memory_region_ref(block->mr);
1975 /* Apply UFFD write protection to the block memory range */
1976 if (ram_block_uffd_protect(block, uffd_fd)) {
1980 trace_ram_write_tracking_ramblock_start(block->idstr, block->page_size,
1981 block->host, block->max_length);
1987 error_report("ram_write_tracking_start() failed: restoring initial memory state");
1989 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1990 if ((block->flags & RAM_UF_WRITEPROTECT) == 0) {
1993 uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length);
1994 /* Cleanup flags and remove reference */
1995 block->flags &= ~RAM_UF_WRITEPROTECT;
1996 memory_region_unref(block->mr);
1999 uffd_close_fd(uffd_fd);
2005 * ram_write_tracking_stop: stop UFFD-WP memory tracking and remove protection
2007 void ram_write_tracking_stop(void)
2009 RAMState *rs = ram_state;
2012 RCU_READ_LOCK_GUARD();
2014 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2015 if ((block->flags & RAM_UF_WRITEPROTECT) == 0) {
2018 uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length);
2020 trace_ram_write_tracking_ramblock_stop(block->idstr, block->page_size,
2021 block->host, block->max_length);
2023 /* Cleanup flags and remove reference */
2024 block->flags &= ~RAM_UF_WRITEPROTECT;
2025 memory_region_unref(block->mr);
2028 /* Finally close UFFD file descriptor */
2029 uffd_close_fd(rs->uffdio_fd);
2034 /* No target OS support, stubs just fail or ignore */
2036 static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
2044 static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
2045 unsigned long start_page)
2054 bool ram_write_tracking_available(void)
2059 bool ram_write_tracking_compatible(void)
2065 int ram_write_tracking_start(void)
2071 void ram_write_tracking_stop(void)
2075 #endif /* defined(__linux__) */
2078 * get_queued_page: unqueue a page from the postcopy requests
2080 * Skips pages that are already sent (!dirty)
2082 * Returns true if a queued page is found
2084 * @rs: current RAM state
2085 * @pss: data about the state of the current dirty page scan
2087 static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
2094 block = unqueue_page(rs, &offset);
2096 * We're sending this page, and since it's postcopy nothing else
2097 * will dirty it, and we must make sure it doesn't get sent again
2098 * even if this queue request was received after the background
2099 * search already sent it.
2104 page = offset >> TARGET_PAGE_BITS;
2105 dirty = test_bit(page, block->bmap);
2107 trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
2110 trace_get_queued_page(block->idstr, (uint64_t)offset, page);
2114 } while (block && !dirty);
2118 * Poll write faults too if background snapshot is enabled; that's
2119 * when we have vCPUs blocked by the write-protected pages.
2121 block = poll_fault_page(rs, &offset);
2126 * We want the background search to continue from the queued page
2127 * since the guest is likely to want other pages near to the page
2128 * it just requested.
2131 pss->page = offset >> TARGET_PAGE_BITS;
2134 * This unqueued page would break the "one round" check, even if
2137 pss->complete_round = false;
2144 * migration_page_queue_free: drop any remaining pages in the ram
2147 * It should be empty at the end anyway, but in error cases there may
2148 * be some left. In case there is any page left, we drop it.
2151 static void migration_page_queue_free(RAMState *rs)
2153 struct RAMSrcPageRequest *mspr, *next_mspr;
2154 /* This queue should generally be empty - but in the case of a failed
2155 * migration it might have some droppings in it.
2157 RCU_READ_LOCK_GUARD();
2158 QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
2159 memory_region_unref(mspr->rb->mr);
2160 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
2166 * ram_save_queue_pages: queue the page for transmission
2168 * A request from postcopy destination for example.
2170 * Returns zero on success or negative on error
2172 * @rbname: Name of the RAMBlock of the request. NULL means the
2173 * same as the last one.
2174 * @start: starting address from the start of the RAMBlock
2175 * @len: length (in bytes) to send
2177 int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
2180 RAMState *rs = ram_state;
2182 stat64_add(&ram_counters.postcopy_requests, 1);
2183 RCU_READ_LOCK_GUARD();
2186 /* Reuse last RAMBlock */
2187 ramblock = rs->last_req_rb;
2191 * Shouldn't happen, we can't reuse the last RAMBlock if
2192 * it's the 1st request.
2194 error_report("ram_save_queue_pages no previous block");
2198 ramblock = qemu_ram_block_by_name(rbname);
2201 /* We shouldn't be asked for a non-existent RAMBlock */
2202 error_report("ram_save_queue_pages no block '%s'", rbname);
2205 rs->last_req_rb = ramblock;
2207 trace_ram_save_queue_pages(ramblock->idstr, start, len);
2208 if (!offset_in_ramblock(ramblock, start + len - 1)) {
2209 error_report("%s request overrun start=" RAM_ADDR_FMT " len="
2210 RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
2211 __func__, start, len, ramblock->used_length);
2216 * With postcopy preempt enabled, we send back the page directly in the
2219 if (postcopy_preempt_active()) {
2220 ram_addr_t page_start = start >> TARGET_PAGE_BITS;
2221 size_t page_size = qemu_ram_pagesize(ramblock);
2222 PageSearchStatus *pss = &ram_state->pss[RAM_CHANNEL_POSTCOPY];
2225 qemu_mutex_lock(&rs->bitmap_mutex);
2227 pss_init(pss, ramblock, page_start);
2229 * Always use the preempt channel, and make sure it's there. It's
2230 * safe to access without lock, because when rp-thread is running
2231 * we should be the only one who operates on the qemufile
2233 pss->pss_channel = migrate_get_current()->postcopy_qemufile_src;
2234 assert(pss->pss_channel);
2237 * It must be one host page or a multiple of the host page size. Just
2238 * assert; if something is wrong we're mostly split-brain anyway.
2240 assert(len % page_size == 0);
2242 if (ram_save_host_page_urgent(pss)) {
2243 error_report("%s: ram_save_host_page_urgent() failed: "
2244 "ramblock=%s, start_addr=0x"RAM_ADDR_FMT,
2245 __func__, ramblock->idstr, start);
2250 * NOTE: after ram_save_host_page_urgent() succeeded, pss->page
2251 * will automatically be moved and point to the next host page
2252 * we're going to send, so no need to update here.
2254 * Normally QEMU never sends >1 host page in requests, so
2255 * logically we don't even need that as the loop should only
2256 * run once, but just to be consistent.
2260 qemu_mutex_unlock(&rs->bitmap_mutex);
2265 struct RAMSrcPageRequest *new_entry =
2266 g_new0(struct RAMSrcPageRequest, 1);
2267 new_entry->rb = ramblock;
2268 new_entry->offset = start;
2269 new_entry->len = len;
2271 memory_region_ref(ramblock->mr);
2272 qemu_mutex_lock(&rs->src_page_req_mutex);
2273 QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
2274 migration_make_urgent_request();
2275 qemu_mutex_unlock(&rs->src_page_req_mutex);
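/*
 * Illustrative only (not part of the original file): a hypothetical call as
 * made on behalf of a postcopy destination that faulted on one page of the
 * "pc.ram" block; the offset value is made up.
 */
#if 0
    if (ram_save_queue_pages("pc.ram", 0x40000000, TARGET_PAGE_SIZE)) {
        /* negative return: unknown block, or the range overran the block */
    }
#endif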
2280 static bool save_page_use_compression(RAMState *rs)
2282 if (!migrate_compress()) {
2287 * If xbzrle is enabled (e.g., after first round of migration), stop
2288 * using the data compression. In theory, xbzrle can do better than
2291 if (rs->xbzrle_enabled) {
2299 * try to compress the page before posting it out, return true if the page
2300 * has been properly handled by compression, otherwise it needs other
2301 * paths to handle it
2303 static bool save_compress_page(RAMState *rs, PageSearchStatus *pss,
2304 RAMBlock *block, ram_addr_t offset)
2306 if (!save_page_use_compression(rs)) {
2311 * When starting the process of a new block, the first page of
2312 * the block should be sent out before other pages in the same
2313 * block, and all the pages in the last block should have been sent
2314 * out; keeping this order is important, because the 'cont' flag
2315 * is used to avoid resending the block name.
2317 * We post the first page as a normal page, as compression takes
2318 * a lot of CPU resources.
2320 if (block != pss->last_sent_block) {
2321 flush_compressed_data(rs);
2325 if (compress_page_with_multi_thread(block, offset) > 0) {
2329 compression_counters.busy++;
2334 * ram_save_target_page_legacy: save one target page
2336 * Returns the number of pages written
2338 * @rs: current RAM state
2339 * @pss: data about the page we want to send
2341 static int ram_save_target_page_legacy(RAMState *rs, PageSearchStatus *pss)
2343 RAMBlock *block = pss->block;
2344 ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
2347 if (control_save_page(pss, block, offset, &res)) {
2351 if (save_compress_page(rs, pss, block, offset)) {
2355 res = save_zero_page(pss, pss->pss_channel, block, offset);
2357 /* Must let xbzrle know, otherwise a previous (now 0'd) cached
2358 * page would be stale
2360 if (rs->xbzrle_enabled) {
2361 XBZRLE_cache_lock();
2362 xbzrle_cache_zero_page(rs, block->offset + offset);
2363 XBZRLE_cache_unlock();
2369 * Do not use multifd in postcopy as one whole host page should be
2370 * placed. Meanwhile postcopy requires atomic update of pages, so even
2371 * if host page size == guest page size, the destination guest may,
2372 * while running, still see partially copied pages, which is data corruption.
2374 if (migrate_multifd() && !migration_in_postcopy()) {
2375 return ram_save_multifd_page(pss->pss_channel, block, offset);
2378 return ram_save_page(rs, pss);
2381 /* Should be called before sending a host page */
2382 static void pss_host_page_prepare(PageSearchStatus *pss)
2384 /* How many guest pages are there in one host page? */
2385 size_t guest_pfns = qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
2387 pss->host_page_sending = true;
2388 if (guest_pfns <= 1) {
2390 * This covers both when guest psize == host psize, or when guest
2391 * has larger psize than the host (guest_pfns==0).
2393 * For the latter, we always send one whole guest page per
2394 * iteration of the host page (example: an Alpha VM on x86 host
2395 * will have guest psize 8K while host psize 4K).
2397 pss->host_page_start = pss->page;
2398 pss->host_page_end = pss->page + 1;
2401 * The host page spans over multiple guest pages, we send them
2402 * within the same host page iteration.
2404 pss->host_page_start = ROUND_DOWN(pss->page, guest_pfns);
2405 pss->host_page_end = ROUND_UP(pss->page + 1, guest_pfns);
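/*
 * Worked example (illustrative, not from the original source): with a
 * hypothetical 2MiB huge-page backed RAMBlock and 4KiB target pages,
 * guest_pfns is 512, so pss->page == 1000 expands to the host-page
 * range [512, 1024). A minimal sketch of that boundary math:
 */
#if 0
    unsigned long example_page = 1000;              /* target page index */
    size_t example_guest_pfns = 512;                /* 2MiB / 4KiB */
    unsigned long example_start = (example_page / example_guest_pfns)
                                  * example_guest_pfns;            /* 512 */
    unsigned long example_end = ((example_page + 1 + example_guest_pfns - 1)
                                 / example_guest_pfns)
                                * example_guest_pfns;              /* 1024 */
#endif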
2410 * Whether the page pointed by PSS is within the host page being sent.
2411 * Must be called after a previous pss_host_page_prepare().
2413 static bool pss_within_range(PageSearchStatus *pss)
2415 ram_addr_t ram_addr;
2417 assert(pss->host_page_sending);
2419 /* Over host-page boundary? */
2420 if (pss->page >= pss->host_page_end) {
2424 ram_addr = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
2426 return offset_in_ramblock(pss->block, ram_addr);
2429 static void pss_host_page_finish(PageSearchStatus *pss)
2431 pss->host_page_sending = false;
2432 /* This is not needed, but just to reset it */
2433 pss->host_page_start = pss->host_page_end = 0;
2437 * Send an urgent host page specified by `pss'. Must be called with
2438 * bitmap_mutex held.
2440 * Returns 0 if saving the host page succeeded, negative otherwise.
2442 static int ram_save_host_page_urgent(PageSearchStatus *pss)
2444 bool page_dirty, sent = false;
2445 RAMState *rs = ram_state;
2448 trace_postcopy_preempt_send_host_page(pss->block->idstr, pss->page);
2449 pss_host_page_prepare(pss);
2452 * If precopy is sending the same page, let it be done in precopy, or
2453 * we could send the same page in two channels and neither of them
2454 * would receive the whole page.
2456 if (pss_overlap(pss, &ram_state->pss[RAM_CHANNEL_PRECOPY])) {
2457 trace_postcopy_preempt_hit(pss->block->idstr,
2458 pss->page << TARGET_PAGE_BITS);
2463 page_dirty = migration_bitmap_clear_dirty(rs, pss->block, pss->page);
2466 /* Be strict about the return code; it must be exactly 1 (one page sent) */
2467 if (migration_ops->ram_save_target_page(rs, pss) != 1) {
2468 error_report_once("%s: ram_save_target_page failed", __func__);
2474 pss_find_next_dirty(pss);
2475 } while (pss_within_range(pss));
2477 pss_host_page_finish(pss);
2478 /* For urgent requests, flush immediately if sent */
2480 qemu_fflush(pss->pss_channel);
2486 * ram_save_host_page: save a whole host page
2488 * Starting at *offset send pages up to the end of the current host
2489 * page. It's valid for the initial offset to point into the middle of
2490 * a host page, in which case the remainder of the host page is sent.
2491 * Only dirty target pages are sent. Note that the host page size may
2492 * be a huge page for this block.
2494 * The saving stops at the boundary of the used_length of the block
2495 * if the RAMBlock isn't a multiple of the host page size.
2497 * The caller must hold ram_state.bitmap_mutex to call this
2498 * function. Note that this function can temporarily release the lock, but
2499 * it makes sure the lock is held again before it returns.
2501 * Returns the number of pages written or negative on error
2503 * @rs: current RAM state
2504 * @pss: data about the page we want to send
2506 static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss)
2508 bool page_dirty, preempt_active = postcopy_preempt_active();
2509 int tmppages, pages = 0;
2510 size_t pagesize_bits =
2511 qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
2512 unsigned long start_page = pss->page;
2515 if (ramblock_is_ignored(pss->block)) {
2516 error_report("block %s should not be migrated !", pss->block->idstr);
2520 /* Update host page boundary information */
2521 pss_host_page_prepare(pss);
2524 page_dirty = migration_bitmap_clear_dirty(rs, pss->block, pss->page);
2526 /* Check whether the page is dirty and, if it is, send it */
2529 * Properly yield the lock only in postcopy preempt mode
2530 * because both migration thread and rp-return thread can
2531 * operate on the bitmaps.
2533 if (preempt_active) {
2534 qemu_mutex_unlock(&rs->bitmap_mutex);
2536 tmppages = migration_ops->ram_save_target_page(rs, pss);
2537 if (tmppages >= 0) {
2540 * Allow rate limiting to happen in the middle of huge pages if
2541 * something is sent in the current iteration.
2543 if (pagesize_bits > 1 && tmppages > 0) {
2544 migration_rate_limit();
2547 if (preempt_active) {
2548 qemu_mutex_lock(&rs->bitmap_mutex);
2555 pss_host_page_finish(pss);
2559 pss_find_next_dirty(pss);
2560 } while (pss_within_range(pss));
2562 pss_host_page_finish(pss);
2564 res = ram_save_release_protection(rs, pss, start_page);
2565 return (res < 0 ? res : pages);
2569 * ram_find_and_save_block: finds a dirty page and sends it to f
2571 * Called within an RCU critical section.
2573 * Returns the number of pages written where zero means no dirty pages,
2574 * or negative on error
2576 * @rs: current RAM state
2578 * On systems where host-page-size > target-page-size it will send all the
2579 * pages in a host page that are dirty.
2581 static int ram_find_and_save_block(RAMState *rs)
2583 PageSearchStatus *pss = &rs->pss[RAM_CHANNEL_PRECOPY];
2586 /* No dirty page as there is zero RAM */
2587 if (!rs->ram_bytes_total) {
2592 * Always keep last_seen_block/last_page valid during this procedure,
2593 * because find_dirty_block() relies on these values (e.g., we compare
2594 * last_seen_block with pss.block to see whether we searched all the
2595 * ramblocks) to detect the completion of migration. Having a NULL
2596 * last_seen_block can cause the loop below to run forever.
2598 if (!rs->last_seen_block) {
2599 rs->last_seen_block = QLIST_FIRST_RCU(&ram_list.blocks);
2603 pss_init(pss, rs->last_seen_block, rs->last_page);
2606 if (!get_queued_page(rs, pss)) {
2607 /* priority queue empty, so just search for something dirty */
2608 int res = find_dirty_block(rs, pss);
2609 if (res != PAGE_DIRTY_FOUND) {
2610 if (res == PAGE_ALL_CLEAN) {
2612 } else if (res == PAGE_TRY_AGAIN) {
2614 } else if (res < 0) {
2620 pages = ram_save_host_page(rs, pss);
2626 rs->last_seen_block = pss->block;
2627 rs->last_page = pss->page;
2632 void acct_update_position(QEMUFile *f, size_t size, bool zero)
2634 uint64_t pages = size / TARGET_PAGE_SIZE;
2637 stat64_add(&ram_counters.zero_pages, pages);
2639 stat64_add(&ram_counters.normal_pages, pages);
2640 ram_transferred_add(size);
2641 qemu_file_credit_transfer(f, size);
2645 static uint64_t ram_bytes_total_with_ignored(void)
2650 RCU_READ_LOCK_GUARD();
2652 RAMBLOCK_FOREACH_MIGRATABLE(block) {
2653 total += block->used_length;
2658 uint64_t ram_bytes_total(void)
2663 RCU_READ_LOCK_GUARD();
2665 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2666 total += block->used_length;
2671 static void xbzrle_load_setup(void)
2673 XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE);
2676 static void xbzrle_load_cleanup(void)
2678 g_free(XBZRLE.decoded_buf);
2679 XBZRLE.decoded_buf = NULL;
2682 static void ram_state_cleanup(RAMState **rsp)
2685 migration_page_queue_free(*rsp);
2686 qemu_mutex_destroy(&(*rsp)->bitmap_mutex);
2687 qemu_mutex_destroy(&(*rsp)->src_page_req_mutex);
2693 static void xbzrle_cleanup(void)
2695 XBZRLE_cache_lock();
2697 cache_fini(XBZRLE.cache);
2698 g_free(XBZRLE.encoded_buf);
2699 g_free(XBZRLE.current_buf);
2700 g_free(XBZRLE.zero_target_page);
2701 XBZRLE.cache = NULL;
2702 XBZRLE.encoded_buf = NULL;
2703 XBZRLE.current_buf = NULL;
2704 XBZRLE.zero_target_page = NULL;
2706 XBZRLE_cache_unlock();
2709 static void ram_save_cleanup(void *opaque)
2711 RAMState **rsp = opaque;
2714 /* We don't use dirty log with background snapshots */
2715 if (!migrate_background_snapshot()) {
2716 /* the caller holds the iothread lock or is in a bottom half, so there is
2717 * no write race against the migration bitmap
2719 if (global_dirty_tracking & GLOBAL_DIRTY_MIGRATION) {
2721 * do not stop the dirty log without having started it, since
2722 * memory_global_dirty_log_stop will assert that
2723 * memory_global_dirty_log_start/stop are used in pairs
2725 memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION);
2729 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2730 g_free(block->clear_bmap);
2731 block->clear_bmap = NULL;
2732 g_free(block->bmap);
2737 compress_threads_save_cleanup();
2738 ram_state_cleanup(rsp);
2739 g_free(migration_ops);
2740 migration_ops = NULL;
2743 static void ram_state_reset(RAMState *rs)
2747 for (i = 0; i < RAM_CHANNEL_MAX; i++) {
2748 rs->pss[i].last_sent_block = NULL;
2751 rs->last_seen_block = NULL;
2753 rs->last_version = ram_list.version;
2754 rs->xbzrle_enabled = false;
2757 #define MAX_WAIT 50 /* ms, half buffered_file limit */
2759 /* **** functions for postcopy ***** */
2761 void ram_postcopy_migrated_memory_release(MigrationState *ms)
2763 struct RAMBlock *block;
2765 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2766 unsigned long *bitmap = block->bmap;
2767 unsigned long range = block->used_length >> TARGET_PAGE_BITS;
2768 unsigned long run_start = find_next_zero_bit(bitmap, range, 0);
2770 while (run_start < range) {
2771 unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
2772 ram_discard_range(block->idstr,
2773 ((ram_addr_t)run_start) << TARGET_PAGE_BITS,
2774 ((ram_addr_t)(run_end - run_start))
2775 << TARGET_PAGE_BITS);
2776 run_start = find_next_zero_bit(bitmap, range, run_end + 1);
2782 * postcopy_send_discard_bm_ram: discard a RAMBlock
2784 * Callback from postcopy_each_ram_send_discard for each RAMBlock
2786 * @ms: current migration state
2787 * @block: RAMBlock to discard
2789 static void postcopy_send_discard_bm_ram(MigrationState *ms, RAMBlock *block)
2791 unsigned long end = block->used_length >> TARGET_PAGE_BITS;
2792 unsigned long current;
2793 unsigned long *bitmap = block->bmap;
2795 for (current = 0; current < end; ) {
2796 unsigned long one = find_next_bit(bitmap, end, current);
2797 unsigned long zero, discard_length;
2803 zero = find_next_zero_bit(bitmap, end, one + 1);
2806 discard_length = end - one;
2808 discard_length = zero - one;
2810 postcopy_discard_send_range(ms, one, discard_length);
2811 current = one + discard_length;
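/*
 * Worked example (hypothetical numbers): if the dirty target pages of a
 * block were {3, 4, 5, 9} with end = 16, the loop above would emit the
 * following two discard ranges:
 */
#if 0
    postcopy_discard_send_range(ms, 3, 3);      /* run of pages 3..5 */
    postcopy_discard_send_range(ms, 9, 1);      /* single page 9 */
#endif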
2815 static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block);
2818 * postcopy_each_ram_send_discard: discard all RAMBlocks
2820 * Utility for the outgoing postcopy code.
2821 * Calls postcopy_send_discard_bm_ram for each RAMBlock
2822 * passing it bitmap indexes and name.
2823 * (qemu_ram_foreach_block ends up passing unscaled lengths
2824 * which would mean postcopy code would have to deal with target page)
2826 * @ms: current migration state
2828 static void postcopy_each_ram_send_discard(MigrationState *ms)
2830 struct RAMBlock *block;
2832 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2833 postcopy_discard_send_init(ms, block->idstr);
2836 * Deal with TPS != HPS and huge pages. It discards any partially sent
2837 * host-page size chunks and marks any partially dirty host-page size
2838 * chunks as all dirty. In this case the host page is the host page
2839 * for the particular RAMBlock, i.e. it might be a huge page.
2841 postcopy_chunk_hostpages_pass(ms, block);
2844 * Postcopy sends chunks of bitmap over the wire, but it
2845 * just needs indexes at this point, which avoids it having
2846 * target-page-specific code.
2848 postcopy_send_discard_bm_ram(ms, block);
2849 postcopy_discard_send_finish(ms);
2854 * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages
2856 * Helper for postcopy_chunk_hostpages; it's called twice to
2857 * canonicalize the two bitmaps, that are similar, but one is
2860 * Postcopy requires that all target pages in a host page are dirty or
2861 * clean, not a mix. This function canonicalizes the bitmaps.
2863 * @ms: current migration state
2864 * @block: block that contains the page we want to canonicalize
2866 static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block)
2868 RAMState *rs = ram_state;
2869 unsigned long *bitmap = block->bmap;
2870 unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
2871 unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
2872 unsigned long run_start;
2874 if (block->page_size == TARGET_PAGE_SIZE) {
2875 /* Easy case - TPS==HPS for a non-huge page RAMBlock */
2879 /* Find a dirty page */
2880 run_start = find_next_bit(bitmap, pages, 0);
2882 while (run_start < pages) {
2885 * If the start of this run of pages is in the middle of a host
2886 * page, then we need to fixup this host page.
2888 if (QEMU_IS_ALIGNED(run_start, host_ratio)) {
2889 /* Find the end of this run */
2890 run_start = find_next_zero_bit(bitmap, pages, run_start + 1);
2892 * If the end isn't at the start of a host page, then the
2893 * run doesn't finish at the end of a host page
2894 * and we need to discard.
2898 if (!QEMU_IS_ALIGNED(run_start, host_ratio)) {
2900 unsigned long fixup_start_addr = QEMU_ALIGN_DOWN(run_start,
2902 run_start = QEMU_ALIGN_UP(run_start, host_ratio);
2904 /* Clean up the bitmap */
2905 for (page = fixup_start_addr;
2906 page < fixup_start_addr + host_ratio; page++) {
2908 * Remark them as dirty, updating the count for any pages
2909 * that weren't previously dirty.
2911 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
2915 /* Find the next dirty page for the next iteration */
2916 run_start = find_next_bit(bitmap, pages, run_start);
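/*
 * Worked example (hypothetical numbers): with host_ratio = 4 and an initial
 * dirty set of target pages {6, 7, 8, 9}, the run both starts and ends in
 * the middle of a host page, so both affected host pages get re-marked in
 * full and the dirty set becomes {4 .. 11}:
 */
#if 0
    unsigned long example_before[] = { 6, 7, 8, 9 };
    unsigned long example_after[]  = { 4, 5, 6, 7, 8, 9, 10, 11 };
#endif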
2921 * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
2923 * Transmit the set of pages to be discarded after precopy to the target;
2924 * these are pages that:
2925 * a) Have been previously transmitted but are now dirty again
2926 * b) Have never been transmitted; this ensures that
2927 * any pages on the destination that have been mapped by background
2928 * tasks get discarded (transparent huge pages are the specific concern)
2929 * Hopefully this is pretty sparse
2931 * @ms: current migration state
2933 void ram_postcopy_send_discard_bitmap(MigrationState *ms)
2935 RAMState *rs = ram_state;
2937 RCU_READ_LOCK_GUARD();
2939 /* This should be our last sync, the src is now paused */
2940 migration_bitmap_sync(rs);
2942 /* Easiest way to make sure we don't resume in the middle of a host-page */
2943 rs->pss[RAM_CHANNEL_PRECOPY].last_sent_block = NULL;
2944 rs->last_seen_block = NULL;
2947 postcopy_each_ram_send_discard(ms);
2949 trace_ram_postcopy_send_discard_bitmap();
2953 * ram_discard_range: discard dirtied pages at the beginning of postcopy
2955 * Returns zero on success
2957 * @rbname: name of the RAMBlock of the request. NULL means the
2958 * same as the last one.
2959 * @start: starting offset (in bytes) within the RAMBlock
2960 * @length: length (in bytes) to discard
2962 int ram_discard_range(const char *rbname, uint64_t start, size_t length)
2964 trace_ram_discard_range(rbname, start, length);
2966 RCU_READ_LOCK_GUARD();
2967 RAMBlock *rb = qemu_ram_block_by_name(rbname);
2970 error_report("ram_discard_range: Failed to find block '%s'", rbname);
2975 * On source VM, we don't need to update the received bitmap since
2976 * we don't even have one.
2978 if (rb->receivedmap) {
2979 bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(),
2980 length >> qemu_target_page_bits());
2983 return ram_block_discard_range(rb, start, length);
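/*
 * Illustrative call (values hypothetical): the destination dropping 1MiB of
 * the "pc.ram" block, starting 2MiB into it, at the beginning of postcopy.
 */
#if 0
    if (ram_discard_range("pc.ram", 0x200000, 0x100000)) {
        /* non-zero means the range could not be discarded */
    }
#endif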
2987 * For every allocation, we will try not to crash the VM if the
2988 * allocation fails.
2990 static int xbzrle_init(void)
2992 Error *local_err = NULL;
2994 if (!migrate_xbzrle()) {
2998 XBZRLE_cache_lock();
3000 XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE);
3001 if (!XBZRLE.zero_target_page) {
3002 error_report("%s: Error allocating zero page", __func__);
3006 XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(),
3007 TARGET_PAGE_SIZE, &local_err);
3008 if (!XBZRLE.cache) {
3009 error_report_err(local_err);
3010 goto free_zero_page;
3013 XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
3014 if (!XBZRLE.encoded_buf) {
3015 error_report("%s: Error allocating encoded_buf", __func__);
3019 XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
3020 if (!XBZRLE.current_buf) {
3021 error_report("%s: Error allocating current_buf", __func__);
3022 goto free_encoded_buf;
3025 /* We are all good */
3026 XBZRLE_cache_unlock();
3030 g_free(XBZRLE.encoded_buf);
3031 XBZRLE.encoded_buf = NULL;
3033 cache_fini(XBZRLE.cache);
3034 XBZRLE.cache = NULL;
3036 g_free(XBZRLE.zero_target_page);
3037 XBZRLE.zero_target_page = NULL;
3039 XBZRLE_cache_unlock();
3043 static int ram_state_init(RAMState **rsp)
3045 *rsp = g_try_new0(RAMState, 1);
3048 error_report("%s: Init ramstate fail", __func__);
3052 qemu_mutex_init(&(*rsp)->bitmap_mutex);
3053 qemu_mutex_init(&(*rsp)->src_page_req_mutex);
3054 QSIMPLEQ_INIT(&(*rsp)->src_page_requests);
3055 (*rsp)->ram_bytes_total = ram_bytes_total();
3058 * Count the total number of pages used by ram blocks not including any
3059 * gaps due to alignment or unplugs.
3060 * This must match the initial value of the dirty bitmap.
3062 (*rsp)->migration_dirty_pages = (*rsp)->ram_bytes_total >> TARGET_PAGE_BITS;
3063 ram_state_reset(*rsp);
3068 static void ram_list_init_bitmaps(void)
3070 MigrationState *ms = migrate_get_current();
3072 unsigned long pages;
3075 /* Skip setting bitmap if there is no RAM */
3076 if (ram_bytes_total()) {
3077 shift = ms->clear_bitmap_shift;
3078 if (shift > CLEAR_BITMAP_SHIFT_MAX) {
3079 error_report("clear_bitmap_shift (%u) too big, using "
3080 "max value (%u)", shift, CLEAR_BITMAP_SHIFT_MAX);
3081 shift = CLEAR_BITMAP_SHIFT_MAX;
3082 } else if (shift < CLEAR_BITMAP_SHIFT_MIN) {
3083 error_report("clear_bitmap_shift (%u) too small, using "
3084 "min value (%u)", shift, CLEAR_BITMAP_SHIFT_MIN);
3085 shift = CLEAR_BITMAP_SHIFT_MIN;
3088 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3089 pages = block->max_length >> TARGET_PAGE_BITS;
3091 * The initial dirty bitmap for migration must be set with all
3092 * ones to make sure we'll migrate every guest RAM page to
3094 * Here we set RAMBlock.bmap all to 1 because when we restart a
3095 * new migration after a failed one, ram_list.
3096 * dirty_memory[DIRTY_MEMORY_MIGRATION] doesn't cover the whole
3099 block->bmap = bitmap_new(pages);
3100 bitmap_set(block->bmap, 0, pages);
3101 block->clear_bmap_shift = shift;
3102 block->clear_bmap = bitmap_new(clear_bmap_size(pages, shift));
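/*
 * Sizing sketch (hypothetical values), assuming each clear_bmap bit covers
 * (1 << shift) target pages: a 4GiB block with 4KiB target pages has
 * pages = 1048576, so with shift = 18 the clear bitmap needs only 4 bits.
 */
#if 0
    unsigned long example_pages = 1048576;                   /* 4GiB / 4KiB */
    uint8_t example_shift = 18;                               /* 1GiB chunks */
    unsigned long example_chunk_bits =
        (example_pages + (1ul << example_shift) - 1) >> example_shift; /* 4 */
#endif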
3107 static void migration_bitmap_clear_discarded_pages(RAMState *rs)
3109 unsigned long pages;
3112 RCU_READ_LOCK_GUARD();
3114 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3115 pages = ramblock_dirty_bitmap_clear_discarded_pages(rb);
3116 rs->migration_dirty_pages -= pages;
3120 static void ram_init_bitmaps(RAMState *rs)
3122 /* For memory_global_dirty_log_start below. */
3123 qemu_mutex_lock_iothread();
3124 qemu_mutex_lock_ramlist();
3126 WITH_RCU_READ_LOCK_GUARD() {
3127 ram_list_init_bitmaps();
3128 /* We don't use dirty log with background snapshots */
3129 if (!migrate_background_snapshot()) {
3130 memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION);
3131 migration_bitmap_sync_precopy(rs);
3134 qemu_mutex_unlock_ramlist();
3135 qemu_mutex_unlock_iothread();
3138 * After an eventual first bitmap sync, fixup the initial bitmap
3139 * containing all 1s to exclude any discarded pages from migration.
3141 migration_bitmap_clear_discarded_pages(rs);
3144 static int ram_init_all(RAMState **rsp)
3146 if (ram_state_init(rsp)) {
3150 if (xbzrle_init()) {
3151 ram_state_cleanup(rsp);
3155 ram_init_bitmaps(*rsp);
3160 static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out)
3166 * Postcopy is not using xbzrle/compression, so no need for that.
3167 * Also, since the source is already halted, we don't need to care
3168 * about dirty page logging either.
3171 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3172 pages += bitmap_count_one(block->bmap,
3173 block->used_length >> TARGET_PAGE_BITS);
3176 /* This may not be aligned with current bitmaps. Recalculate. */
3177 rs->migration_dirty_pages = pages;
3179 ram_state_reset(rs);
3181 /* Update RAMState cache of output QEMUFile */
3182 rs->pss[RAM_CHANNEL_PRECOPY].pss_channel = out;
3184 trace_ram_state_resume_prepare(pages);
3188 * This function clears bits of the free pages reported by the caller from the
3189 * migration dirty bitmap. @addr is the host address corresponding to the
3190 * start of the continuous guest free pages, and @len is the total bytes of
3193 void qemu_guest_free_page_hint(void *addr, size_t len)
3197 size_t used_len, start, npages;
3198 MigrationState *s = migrate_get_current();
3200 /* This function is currently expected to be used during live migration */
3201 if (!migration_is_setup_or_active(s->state)) {
3205 for (; len > 0; len -= used_len, addr += used_len) {
3206 block = qemu_ram_block_from_host(addr, false, &offset);
3207 if (unlikely(!block || offset >= block->used_length)) {
3209 * The implementation might not support RAMBlock resize during
3210 * live migration, but it could happen in theory with future
3211 * updates. So we add a check here to capture that case.
3213 error_report_once("%s unexpected error", __func__);
3217 if (len <= block->used_length - offset) {
3220 used_len = block->used_length - offset;
3223 start = offset >> TARGET_PAGE_BITS;
3224 npages = used_len >> TARGET_PAGE_BITS;
3226 qemu_mutex_lock(&ram_state->bitmap_mutex);
3228 * The skipped free pages are equivalent to having been sent from clear_bmap's
3229 * perspective, so clear the bits from the memory region bitmap which
3230 * are initially set. Otherwise those skipped pages will be sent in
3231 * the next round after syncing from the memory region bitmap.
3233 migration_clear_memory_region_dirty_bitmap_range(block, start, npages);
3234 ram_state->migration_dirty_pages -=
3235 bitmap_count_one_with_offset(block->bmap, start, npages);
3236 bitmap_clear(block->bmap, start, npages);
3237 qemu_mutex_unlock(&ram_state->bitmap_mutex);
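/*
 * Hypothetical caller sketch: a free-page-hinting backend (for example
 * virtio-balloon free page hinting) reporting a 2MiB chunk that the guest
 * currently considers free; the helper name and size are illustrative only.
 */
#if 0
static void example_report_free_chunk(void *host_addr)
{
    /* 'host_addr' is a host pointer into guest RAM */
    qemu_guest_free_page_hint(host_addr, 2 * 1024 * 1024);
}
#endif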
3242 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has
3243 * a long-running RCU critical section. When RCU reclaims in the code
3244 * start to become numerous it will be necessary to reduce the
3245 * granularity of these critical sections.
3249 * ram_save_setup: Setup RAM for migration
3251 * Returns zero to indicate success and negative for error
3253 * @f: QEMUFile where to send the data
3254 * @opaque: RAMState pointer
3256 static int ram_save_setup(QEMUFile *f, void *opaque)
3258 RAMState **rsp = opaque;
3262 if (compress_threads_save_setup()) {
3266 /* migration has already setup the bitmap, reuse it. */
3267 if (!migration_in_colo_state()) {
3268 if (ram_init_all(rsp) != 0) {
3269 compress_threads_save_cleanup();
3273 (*rsp)->pss[RAM_CHANNEL_PRECOPY].pss_channel = f;
3275 WITH_RCU_READ_LOCK_GUARD() {
3276 qemu_put_be64(f, ram_bytes_total_with_ignored()
3277 | RAM_SAVE_FLAG_MEM_SIZE);
3279 RAMBLOCK_FOREACH_MIGRATABLE(block) {
3280 qemu_put_byte(f, strlen(block->idstr));
3281 qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
3282 qemu_put_be64(f, block->used_length);
3283 if (migrate_postcopy_ram() && block->page_size !=
3284 qemu_host_page_size) {
3285 qemu_put_be64(f, block->page_size);
3287 if (migrate_ignore_shared()) {
3288 qemu_put_be64(f, block->mr->addr);
3293 ram_control_before_iterate(f, RAM_CONTROL_SETUP);
3294 ram_control_after_iterate(f, RAM_CONTROL_SETUP);
3296 migration_ops = g_malloc0(sizeof(MigrationOps));
3297 migration_ops->ram_save_target_page = ram_save_target_page_legacy;
3298 ret = multifd_send_sync_main(f);
3303 if (!migrate_multifd_flush_after_each_section()) {
3304 qemu_put_be64(f, RAM_SAVE_FLAG_MULTIFD_FLUSH);
3307 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3314 * ram_save_iterate: iterative stage for migration
3316 * Returns zero to indicate success and negative for error
3318 * @f: QEMUFile where to send the data
3319 * @opaque: RAMState pointer
3321 static int ram_save_iterate(QEMUFile *f, void *opaque)
3323 RAMState **temp = opaque;
3324 RAMState *rs = *temp;
3330 if (blk_mig_bulk_active()) {
3331 /* Avoid transferring ram during bulk phase of block migration as
3332 * the bulk phase will usually take a long time and transferring
3333 * ram updates during that time is pointless. */
3338 * We'll hold this lock a little bit long, but it's okay for two reasons.
3339 * Firstly, the only possible other thread to take it is the one that calls
3340 * qemu_guest_free_page_hint(), which should be rare; secondly, see
3341 * MAX_WAIT (if curious, further see commit 4508bd9ed8053ce) below, which
3342 * guarantees that we'll at least release it on a regular basis.
3344 qemu_mutex_lock(&rs->bitmap_mutex);
3345 WITH_RCU_READ_LOCK_GUARD() {
3346 if (ram_list.version != rs->last_version) {
3347 ram_state_reset(rs);
3350 /* Read version before ram_list.blocks */
3353 ram_control_before_iterate(f, RAM_CONTROL_ROUND);
3355 t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
3357 while ((ret = qemu_file_rate_limit(f)) == 0 ||
3358 postcopy_has_request(rs)) {
3361 if (qemu_file_get_error(f)) {
3365 pages = ram_find_and_save_block(rs);
3366 /* no more pages to send */
3373 qemu_file_set_error(f, pages);
3377 rs->target_page_count += pages;
3380 * During postcopy, it is necessary to make sure one whole host
3381 * page is sent in one chunk.
3383 if (migrate_postcopy_ram()) {
3384 flush_compressed_data(rs);
3388 * we want to check in the 1st loop, just in case it was the 1st
3389 * time and we had to sync the dirty bitmap.
3390 * qemu_clock_get_ns() is a bit expensive, so we only check each
3393 if ((i & 63) == 0) {
3394 uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) /
3396 if (t1 > MAX_WAIT) {
3397 trace_ram_save_iterate_big_wait(t1, i);
3404 qemu_mutex_unlock(&rs->bitmap_mutex);
3407 * Must occur before EOS (or any QEMUFile operation)
3408 * because of RDMA protocol.
3410 ram_control_after_iterate(f, RAM_CONTROL_ROUND);
3414 && migration_is_setup_or_active(migrate_get_current()->state)) {
3415 if (migrate_multifd_flush_after_each_section()) {
3416 ret = multifd_send_sync_main(rs->pss[RAM_CHANNEL_PRECOPY].pss_channel);
3422 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3424 ram_transferred_add(8);
3426 ret = qemu_file_get_error(f);
3436 * ram_save_complete: function called to send the remaining amount of ram
3438 * Returns zero to indicate success or negative on error
3440 * Called with iothread lock
3442 * @f: QEMUFile where to send the data
3443 * @opaque: RAMState pointer
3445 static int ram_save_complete(QEMUFile *f, void *opaque)
3447 RAMState **temp = opaque;
3448 RAMState *rs = *temp;
3451 rs->last_stage = !migration_in_colo_state();
3453 WITH_RCU_READ_LOCK_GUARD() {
3454 if (!migration_in_postcopy()) {
3455 migration_bitmap_sync_precopy(rs);
3458 ram_control_before_iterate(f, RAM_CONTROL_FINISH);
3460 /* try transferring iterative blocks of memory */
3462 /* flush all remaining blocks regardless of rate limiting */
3463 qemu_mutex_lock(&rs->bitmap_mutex);
3467 pages = ram_find_and_save_block(rs);
3468 /* no more blocks to send */
3477 qemu_mutex_unlock(&rs->bitmap_mutex);
3479 flush_compressed_data(rs);
3480 ram_control_after_iterate(f, RAM_CONTROL_FINISH);
3487 ret = multifd_send_sync_main(rs->pss[RAM_CHANNEL_PRECOPY].pss_channel);
3492 if (!migrate_multifd_flush_after_each_section()) {
3493 qemu_put_be64(f, RAM_SAVE_FLAG_MULTIFD_FLUSH);
3495 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3501 static void ram_state_pending_estimate(void *opaque, uint64_t *must_precopy,
3502 uint64_t *can_postcopy)
3504 RAMState **temp = opaque;
3505 RAMState *rs = *temp;
3507 uint64_t remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3509 if (migrate_postcopy_ram()) {
3510 /* We can do postcopy, and all the data is postcopiable */
3511 *can_postcopy += remaining_size;
3513 *must_precopy += remaining_size;
3517 static void ram_state_pending_exact(void *opaque, uint64_t *must_precopy,
3518 uint64_t *can_postcopy)
3520 MigrationState *s = migrate_get_current();
3521 RAMState **temp = opaque;
3522 RAMState *rs = *temp;
3524 uint64_t remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3526 if (!migration_in_postcopy() && remaining_size < s->threshold_size) {
3527 qemu_mutex_lock_iothread();
3528 WITH_RCU_READ_LOCK_GUARD() {
3529 migration_bitmap_sync_precopy(rs);
3531 qemu_mutex_unlock_iothread();
3532 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3535 if (migrate_postcopy_ram()) {
3536 /* We can do postcopy, and all the data is postcopiable */
3537 *can_postcopy += remaining_size;
3539 *must_precopy += remaining_size;
3543 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
3545 unsigned int xh_len;
3547 uint8_t *loaded_data;
3549 /* extract RLE header */
3550 xh_flags = qemu_get_byte(f);
3551 xh_len = qemu_get_be16(f);
3553 if (xh_flags != ENCODING_FLAG_XBZRLE) {
3554 error_report("Failed to load XBZRLE page - wrong compression!");
3558 if (xh_len > TARGET_PAGE_SIZE) {
3559 error_report("Failed to load XBZRLE page - len overflow!");
3562 loaded_data = XBZRLE.decoded_buf;
3563 /* load data and decode */
3564 /* it can change loaded_data to point to an internal buffer */
3565 qemu_get_buffer_in_place(f, &loaded_data, xh_len);
3568 if (xbzrle_decode_buffer(loaded_data, xh_len, host,
3569 TARGET_PAGE_SIZE) == -1) {
3570 error_report("Failed to load XBZRLE page - decode error!");
3578 * ram_block_from_stream: read a RAMBlock id from the migration stream
3580 * Must be called from within a rcu critical section.
3582 * Returns a pointer from within the RCU-protected ram_list.
3584 * @mis: the migration incoming state pointer
3585 * @f: QEMUFile where to read the data from
3586 * @flags: Page flags (mostly to see if it's a continuation of previous block)
3587 * @channel: the channel we're using
3589 static inline RAMBlock *ram_block_from_stream(MigrationIncomingState *mis,
3590 QEMUFile *f, int flags,
3593 RAMBlock *block = mis->last_recv_block[channel];
3597 if (flags & RAM_SAVE_FLAG_CONTINUE) {
3599 error_report("Ack, bad migration stream!");
3605 len = qemu_get_byte(f);
3606 qemu_get_buffer(f, (uint8_t *)id, len);
3609 block = qemu_ram_block_by_name(id);
3611 error_report("Can't find block %s", id);
3615 if (ramblock_is_ignored(block)) {
3616 error_report("block %s should not be migrated !", id);
3620 mis->last_recv_block[channel] = block;
3625 static inline void *host_from_ram_block_offset(RAMBlock *block,
3628 if (!offset_in_ramblock(block, offset)) {
3632 return block->host + offset;
3635 static void *host_page_from_ram_block_offset(RAMBlock *block,
3638 /* Note: Explicitly no check against offset_in_ramblock(). */
3639 return (void *)QEMU_ALIGN_DOWN((uintptr_t)(block->host + offset),
3643 static ram_addr_t host_page_offset_from_ram_block_offset(RAMBlock *block,
3646 return ((uintptr_t)block->host + offset) & (block->page_size - 1);
3649 static inline void *colo_cache_from_block_offset(RAMBlock *block,
3650 ram_addr_t offset, bool record_bitmap)
3652 if (!offset_in_ramblock(block, offset)) {
3655 if (!block->colo_cache) {
3656 error_report("%s: colo_cache is NULL in block :%s",
3657 __func__, block->idstr);
3662 * During a COLO checkpoint, we need a bitmap of these migrated pages.
3663 * It helps us decide which pages in the ram cache should be flushed
3664 * into the VM's RAM later.
3666 if (record_bitmap &&
3667 !test_and_set_bit(offset >> TARGET_PAGE_BITS, block->bmap)) {
3668 ram_state->migration_dirty_pages++;
3670 return block->colo_cache + offset;
3674 * ram_handle_compressed: handle the zero page case
3676 * If a page (or a whole RDMA chunk) has been
3677 * determined to be zero, then zap it.
3679 * @host: host address for the zero page
3680 * @ch: what the page is filled from. We only support zero
3681 * @size: size of the zero page
3683 void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
3685 if (ch != 0 || !buffer_is_zero(host, size)) {
3686 memset(host, ch, size);
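/*
 * Illustrative use of the helper above (names hypothetical): zapping one
 * freshly received target page that the stream declared zero-filled; the
 * memset is skipped when the destination page is already all zeroes.
 */
#if 0
    ram_handle_compressed(host, 0, TARGET_PAGE_SIZE);
#endif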
3690 /* return the size after decompression, or negative value on error */
3692 qemu_uncompress_data(z_stream *stream, uint8_t *dest, size_t dest_len,
3693 const uint8_t *source, size_t source_len)
3697 err = inflateReset(stream);
3702 stream->avail_in = source_len;
3703 stream->next_in = (uint8_t *)source;
3704 stream->avail_out = dest_len;
3705 stream->next_out = dest;
3707 err = inflate(stream, Z_NO_FLUSH);
3708 if (err != Z_STREAM_END) {
3712 return stream->total_out;
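/*
 * A minimal zlib sketch of a producer whose output qemu_uncompress_data()
 * above can inflate; it uses only standard zlib calls and is not the file's
 * actual compression path (the real sender reuses a long-lived stream).
 */
#if 0
static ssize_t example_compress_page(uint8_t *dest, size_t dest_len,
                                     const uint8_t *src, size_t src_len)
{
    z_stream stream = { 0 };   /* zalloc/zfree left NULL: use zlib defaults */
    ssize_t out;
    int err;

    if (deflateInit(&stream, Z_DEFAULT_COMPRESSION) != Z_OK) {
        return -1;
    }
    stream.next_in = (uint8_t *)src;
    stream.avail_in = src_len;
    stream.next_out = dest;
    stream.avail_out = dest_len;
    err = deflate(&stream, Z_FINISH);
    out = (err == Z_STREAM_END) ? (ssize_t)stream.total_out : -1;
    deflateEnd(&stream);
    return out;
}
#endif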
3715 static void *do_data_decompress(void *opaque)
3717 DecompressParam *param = opaque;
3718 unsigned long pagesize;
3722 qemu_mutex_lock(¶m->mutex);
3723 while (!param->quit) {
3728 qemu_mutex_unlock(¶m->mutex);
3730 pagesize = TARGET_PAGE_SIZE;
3732 ret = qemu_uncompress_data(¶m->stream, des, pagesize,
3733 param->compbuf, len);
3734 if (ret < 0 && migrate_get_current()->decompress_error_check) {
3735 error_report("decompress data failed");
3736 qemu_file_set_error(decomp_file, ret);
3739 qemu_mutex_lock(&decomp_done_lock);
3741 qemu_cond_signal(&decomp_done_cond);
3742 qemu_mutex_unlock(&decomp_done_lock);
3744 qemu_mutex_lock(¶m->mutex);
3746 qemu_cond_wait(¶m->cond, ¶m->mutex);
3749 qemu_mutex_unlock(¶m->mutex);
3754 static int wait_for_decompress_done(void)
3756 int idx, thread_count;
3758 if (!migrate_compress()) {
3762 thread_count = migrate_decompress_threads();
3763 qemu_mutex_lock(&decomp_done_lock);
3764 for (idx = 0; idx < thread_count; idx++) {
3765 while (!decomp_param[idx].done) {
3766 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3769 qemu_mutex_unlock(&decomp_done_lock);
3770 return qemu_file_get_error(decomp_file);
3773 static void compress_threads_load_cleanup(void)
3775 int i, thread_count;
3777 if (!migrate_compress()) {
3780 thread_count = migrate_decompress_threads();
3781 for (i = 0; i < thread_count; i++) {
3783 * we use it as an indicator of whether the thread has been
3784 * properly initialized or not
3786 if (!decomp_param[i].compbuf) {
3790 qemu_mutex_lock(&decomp_param[i].mutex);
3791 decomp_param[i].quit = true;
3792 qemu_cond_signal(&decomp_param[i].cond);
3793 qemu_mutex_unlock(&decomp_param[i].mutex);
3795 for (i = 0; i < thread_count; i++) {
3796 if (!decomp_param[i].compbuf) {
3800 qemu_thread_join(decompress_threads + i);
3801 qemu_mutex_destroy(&decomp_param[i].mutex);
3802 qemu_cond_destroy(&decomp_param[i].cond);
3803 inflateEnd(&decomp_param[i].stream);
3804 g_free(decomp_param[i].compbuf);
3805 decomp_param[i].compbuf = NULL;
3807 g_free(decompress_threads);
3808 g_free(decomp_param);
3809 decompress_threads = NULL;
3810 decomp_param = NULL;
3814 static int compress_threads_load_setup(QEMUFile *f)
3816 int i, thread_count;
3818 if (!migrate_compress()) {
3822 thread_count = migrate_decompress_threads();
3823 decompress_threads = g_new0(QemuThread, thread_count);
3824 decomp_param = g_new0(DecompressParam, thread_count);
3825 qemu_mutex_init(&decomp_done_lock);
3826 qemu_cond_init(&decomp_done_cond);
3828 for (i = 0; i < thread_count; i++) {
3829 if (inflateInit(&decomp_param[i].stream) != Z_OK) {
3833 decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
3834 qemu_mutex_init(&decomp_param[i].mutex);
3835 qemu_cond_init(&decomp_param[i].cond);
3836 decomp_param[i].done = true;
3837 decomp_param[i].quit = false;
3838 qemu_thread_create(decompress_threads + i, "decompress",
3839 do_data_decompress, decomp_param + i,
3840 QEMU_THREAD_JOINABLE);
3844 compress_threads_load_cleanup();
3848 static void decompress_data_with_multi_threads(QEMUFile *f,
3849 void *host, int len)
3851 int idx, thread_count;
3853 thread_count = migrate_decompress_threads();
3854 QEMU_LOCK_GUARD(&decomp_done_lock);
3856 for (idx = 0; idx < thread_count; idx++) {
3857 if (decomp_param[idx].done) {
3858 decomp_param[idx].done = false;
3859 qemu_mutex_lock(&decomp_param[idx].mutex);
3860 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
3861 decomp_param[idx].des = host;
3862 decomp_param[idx].len = len;
3863 qemu_cond_signal(&decomp_param[idx].cond);
3864 qemu_mutex_unlock(&decomp_param[idx].mutex);
3868 if (idx < thread_count) {
3871 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3876 static void colo_init_ram_state(void)
3878 ram_state_init(&ram_state);
3882 * colo cache: this is for the secondary VM; we cache the whole
3883 * memory of the secondary VM. The global lock must be held
3884 * to call this helper.
3886 int colo_init_ram_cache(void)
3890 WITH_RCU_READ_LOCK_GUARD() {
3891 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3892 block->colo_cache = qemu_anon_ram_alloc(block->used_length,
3893 NULL, false, false);
3894 if (!block->colo_cache) {
3895 error_report("%s: Can't alloc memory for COLO cache of block %s,"
3896 "size 0x" RAM_ADDR_FMT, __func__, block->idstr,
3897 block->used_length);
3898 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3899 if (block->colo_cache) {
3900 qemu_anon_ram_free(block->colo_cache, block->used_length);
3901 block->colo_cache = NULL;
3906 if (!machine_dump_guest_core(current_machine)) {
3907 qemu_madvise(block->colo_cache, block->used_length,
3908 QEMU_MADV_DONTDUMP);
3914 * Record the dirty pages sent by the PVM; we use this dirty bitmap
3915 * to decide which pages in the cache should be flushed into the SVM's RAM. Here
3916 * we use the same name 'ram_bitmap' as for migration.
3918 if (ram_bytes_total()) {
3921 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3922 unsigned long pages = block->max_length >> TARGET_PAGE_BITS;
3923 block->bmap = bitmap_new(pages);
3927 colo_init_ram_state();
3931 /* TODO: duplicated with ram_init_bitmaps */
3932 void colo_incoming_start_dirty_log(void)
3934 RAMBlock *block = NULL;
3935 /* For memory_global_dirty_log_start below. */
3936 qemu_mutex_lock_iothread();
3937 qemu_mutex_lock_ramlist();
3939 memory_global_dirty_log_sync();
3940 WITH_RCU_READ_LOCK_GUARD() {
3941 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3942 ramblock_sync_dirty_bitmap(ram_state, block);
3943 /* Discard this dirty bitmap record */
3944 bitmap_zero(block->bmap, block->max_length >> TARGET_PAGE_BITS);
3946 memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION);
3948 ram_state->migration_dirty_pages = 0;
3949 qemu_mutex_unlock_ramlist();
3950 qemu_mutex_unlock_iothread();
3953 /* The global lock must be held to call this helper */
3954 void colo_release_ram_cache(void)
3958 memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION);
3959 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3960 g_free(block->bmap);
3964 WITH_RCU_READ_LOCK_GUARD() {
3965 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3966 if (block->colo_cache) {
3967 qemu_anon_ram_free(block->colo_cache, block->used_length);
3968 block->colo_cache = NULL;
3972 ram_state_cleanup(&ram_state);
3976 * ram_load_setup: Setup RAM for migration incoming side
3978 * Returns zero to indicate success and negative for error
3980 * @f: QEMUFile where to receive the data
3981 * @opaque: RAMState pointer
3983 static int ram_load_setup(QEMUFile *f, void *opaque)
3985 if (compress_threads_load_setup(f)) {
3989 xbzrle_load_setup();
3990 ramblock_recv_map_init();
3995 static int ram_load_cleanup(void *opaque)
3999 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
4000 qemu_ram_block_writeback(rb);
4003 xbzrle_load_cleanup();
4004 compress_threads_load_cleanup();
4006 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
4007 g_free(rb->receivedmap);
4008 rb->receivedmap = NULL;
4015 * ram_postcopy_incoming_init: allocate postcopy data structures
4017 * Returns 0 for success and negative if there was one error
4019 * @mis: current migration incoming state
4021 * Allocate data structures etc needed by incoming migration with
4022 * postcopy-ram. postcopy-ram's similarly named
4023 * postcopy_ram_incoming_init does the work.
4025 int ram_postcopy_incoming_init(MigrationIncomingState *mis)
4027 return postcopy_ram_incoming_init(mis);
4031 * ram_load_postcopy: load a page in postcopy case
4033 * Returns 0 for success or -errno in case of error
4035 * Called in postcopy mode by ram_load().
4036 * rcu_read_lock is taken prior to this being called.
4038 * @f: QEMUFile to read the data from
4039 * @channel: the channel to use for loading
4041 int ram_load_postcopy(QEMUFile *f, int channel)
4043 int flags = 0, ret = 0;
4044 bool place_needed = false;
4045 bool matches_target_page_size = false;
4046 MigrationIncomingState *mis = migration_incoming_get_current();
4047 PostcopyTmpPage *tmp_page = &mis->postcopy_tmp_pages[channel];
4049 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
4051 void *page_buffer = NULL;
4052 void *place_source = NULL;
4053 RAMBlock *block = NULL;
4057 addr = qemu_get_be64(f);
4060 * If there is a qemu file error, we should stop here, and then "addr"
4063 ret = qemu_file_get_error(f);
4068 flags = addr & ~TARGET_PAGE_MASK;
4069 addr &= TARGET_PAGE_MASK;
4071 trace_ram_load_postcopy_loop(channel, (uint64_t)addr, flags);
4072 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
4073 RAM_SAVE_FLAG_COMPRESS_PAGE)) {
4074 block = ram_block_from_stream(mis, f, flags, channel);
4081 * Relying on used_length is racy and can result in false positives.
4082 * We might place pages beyond used_length in case RAM was shrunk
4083 * while in postcopy, which is fine - trying to place via
4084 * UFFDIO_COPY/UFFDIO_ZEROPAGE will never segfault.
4086 if (!block->host || addr >= block->postcopy_length) {
4087 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
4091 tmp_page->target_pages++;
4092 matches_target_page_size = block->page_size == TARGET_PAGE_SIZE;
4094 * Postcopy requires that we place whole host pages atomically;
4095 * these may be huge pages for RAMBlocks that are backed by
4097 * To make it atomic, the data is read into a temporary page
4098 * that's moved into place later.
4099 * The migration protocol uses, possibly smaller, target pages;
4100 * however the source ensures it always sends all the components
4101 * of a host page in one chunk.
4103 page_buffer = tmp_page->tmp_huge_page +
4104 host_page_offset_from_ram_block_offset(block, addr);
4105 /* If all TP are zero then we can optimise the place */
4106 if (tmp_page->target_pages == 1) {
4107 tmp_page->host_addr =
4108 host_page_from_ram_block_offset(block, addr);
4109 } else if (tmp_page->host_addr !=
4110 host_page_from_ram_block_offset(block, addr)) {
4111 /* not the 1st TP within the HP */
4112 error_report("Non-same host page detected on channel %d: "
4113 "Target host page %p, received host page %p "
4114 "(rb %s offset 0x"RAM_ADDR_FMT" target_pages %d)",
4115 channel, tmp_page->host_addr,
4116 host_page_from_ram_block_offset(block, addr),
4117 block->idstr, addr, tmp_page->target_pages);
4123 * If it's the last part of a host page then we place the host
4126 if (tmp_page->target_pages ==
4127 (block->page_size / TARGET_PAGE_SIZE)) {
4128 place_needed = true;
4130 place_source = tmp_page->tmp_huge_page;
4133 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
4134 case RAM_SAVE_FLAG_ZERO:
4135 ch = qemu_get_byte(f);
4137 * We can skip setting page_buffer when
4138 * this is a zero page and (block->page_size == TARGET_PAGE_SIZE).
4140 if (ch || !matches_target_page_size) {
4141 memset(page_buffer, ch, TARGET_PAGE_SIZE);
4144 tmp_page->all_zero = false;
4148 case RAM_SAVE_FLAG_PAGE:
4149 tmp_page->all_zero = false;
4150 if (!matches_target_page_size) {
4151 /* For huge pages, we always use a temporary buffer */
4152 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
4155 * For small pages that match the target page size, we
4156 * avoid the qemu_file copy. Instead we directly use
4157 * the buffer of QEMUFile to place the page. Note: we
4158 * cannot do any QEMUFile operation before using that
4159 * buffer to make sure the buffer is valid when
4162 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
4166 case RAM_SAVE_FLAG_COMPRESS_PAGE:
4167 tmp_page->all_zero = false;
4168 len = qemu_get_be32(f);
4169 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
4170 error_report("Invalid compressed data length: %d", len);
4174 decompress_data_with_multi_threads(f, page_buffer, len);
4176 case RAM_SAVE_FLAG_MULTIFD_FLUSH:
4177 multifd_recv_sync_main();
4179 case RAM_SAVE_FLAG_EOS:
4181 if (migrate_multifd_flush_after_each_section()) {
4182 multifd_recv_sync_main();
4186 error_report("Unknown combination of migration flags: 0x%x"
4187 " (postcopy mode)", flags);
4192 /* Got the whole host page, wait for decompress before placing. */
4194 ret |= wait_for_decompress_done();
4197 /* Detect any possible file errors */
4198 if (!ret && qemu_file_get_error(f)) {
4199 ret = qemu_file_get_error(f);
4202 if (!ret && place_needed) {
4203 if (tmp_page->all_zero) {
4204 ret = postcopy_place_page_zero(mis, tmp_page->host_addr, block);
4206 ret = postcopy_place_page(mis, tmp_page->host_addr,
4207 place_source, block);
4209 place_needed = false;
4210 postcopy_temp_page_reset(tmp_page);
4217 static bool postcopy_is_running(void)
4219 PostcopyState ps = postcopy_state_get();
4220 return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END;
4224 * Flush the content of the RAM cache into the SVM's memory.
4225 * Only flush the pages that were dirtied by the PVM, the SVM, or both.
4227 void colo_flush_ram_cache(void)
4229 RAMBlock *block = NULL;
4232 unsigned long offset = 0;
4234 memory_global_dirty_log_sync();
4235 WITH_RCU_READ_LOCK_GUARD() {
4236 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
4237 ramblock_sync_dirty_bitmap(ram_state, block);
4241 trace_colo_flush_ram_cache_begin(ram_state->migration_dirty_pages);
4242 WITH_RCU_READ_LOCK_GUARD() {
4243 block = QLIST_FIRST_RCU(&ram_list.blocks);
4246 unsigned long num = 0;
4248 offset = colo_bitmap_find_dirty(ram_state, block, offset, &num);
4249 if (!offset_in_ramblock(block,
4250 ((ram_addr_t)offset) << TARGET_PAGE_BITS)) {
4253 block = QLIST_NEXT_RCU(block, next);
4255 unsigned long i = 0;
4257 for (i = 0; i < num; i++) {
4258 migration_bitmap_clear_dirty(ram_state, block, offset + i);
4260 dst_host = block->host
4261 + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
4262 src_host = block->colo_cache
4263 + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
4264 memcpy(dst_host, src_host, TARGET_PAGE_SIZE * num);
4269 trace_colo_flush_ram_cache_end();
4273 * ram_load_precopy: load pages in precopy case
4275 * Returns 0 for success or -errno in case of error
4277 * Called in precopy mode by ram_load().
4278 * rcu_read_lock is taken prior to this being called.
4280 * @f: QEMUFile where to send the data
4282 static int ram_load_precopy(QEMUFile *f)
4284 MigrationIncomingState *mis = migration_incoming_get_current();
4285 int flags = 0, ret = 0, invalid_flags = 0, len = 0, i = 0;
4286 /* ADVISE is earlier, it shows the source has the postcopy capability on */
4287 bool postcopy_advised = migration_incoming_postcopy_advised();
4288 if (!migrate_compress()) {
4289 invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
4292 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
4293 ram_addr_t addr, total_ram_bytes;
4294 void *host = NULL, *host_bak = NULL;
4298 * Yield periodically to let the main loop run, but an iteration of
4299 * the main loop is expensive, so do it only every so many iterations
4301 if ((i & 32767) == 0 && qemu_in_coroutine()) {
4302 aio_co_schedule(qemu_get_current_aio_context(),
4303 qemu_coroutine_self());
4304 qemu_coroutine_yield();
4308 addr = qemu_get_be64(f);
4309 flags = addr & ~TARGET_PAGE_MASK;
4310 addr &= TARGET_PAGE_MASK;
4312 if (flags & invalid_flags) {
4313 if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
4314 error_report("Received an unexpected compressed page");
4321 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
4322 RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
4323 RAMBlock *block = ram_block_from_stream(mis, f, flags,
4324 RAM_CHANNEL_PRECOPY);
4326 host = host_from_ram_block_offset(block, addr);
4328 * After entering the COLO stage, we should not load pages
4329 * into the SVM's memory directly; we put them into colo_cache first.
4330 * NOTE: We need to keep a copy of the SVM's ram in colo_cache.
4331 * Previously, we copied all this memory in the COLO preparation stage,
4332 * during which we had to stop the VM, which is a time-consuming process.
4333 * Here we optimize it with a trick: back up every page during the
4334 * migration process while COLO is enabled. Although this affects the
4335 * speed of the migration, it clearly reduces the downtime compared to
4336 * backing up all the SVM's memory in the COLO preparation stage.
4338 if (migration_incoming_colo_enabled()) {
4339 if (migration_incoming_in_colo_state()) {
4340 /* In COLO stage, put all pages into cache temporarily */
4341 host = colo_cache_from_block_offset(block, addr, true);
4344 * In the migration stage but before the COLO stage,
4345 * put all pages into both the cache and the SVM's memory.
4347 host_bak = colo_cache_from_block_offset(block, addr, false);
4351 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
4355 if (!migration_incoming_in_colo_state()) {
4356 ramblock_recv_bitmap_set(block, host);
4359 trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
4362 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
4363 case RAM_SAVE_FLAG_MEM_SIZE:
4364 /* Synchronize RAM block list */
4365 total_ram_bytes = addr;
4366 while (!ret && total_ram_bytes) {
4371 len = qemu_get_byte(f);
4372 qemu_get_buffer(f, (uint8_t *)id, len);
4374 length = qemu_get_be64(f);
4376 block = qemu_ram_block_by_name(id);
4377 if (block && !qemu_ram_is_migratable(block)) {
4378 error_report("block %s should not be migrated !", id);
4381 if (length != block->used_length) {
4382 Error *local_err = NULL;
4384 ret = qemu_ram_resize(block, length,
4387 error_report_err(local_err);
4390 /* For postcopy we need to check hugepage sizes match */
4391 if (postcopy_advised && migrate_postcopy_ram() &&
4392 block->page_size != qemu_host_page_size) {
4393 uint64_t remote_page_size = qemu_get_be64(f);
4394 if (remote_page_size != block->page_size) {
4395 error_report("Mismatched RAM page size %s "
4396 "(local) %zd != %" PRId64,
4397 id, block->page_size,
4402 if (migrate_ignore_shared()) {
4403 hwaddr addr = qemu_get_be64(f);
4404 if (ramblock_is_ignored(block) &&
4405 block->mr->addr != addr) {
4406 error_report("Mismatched GPAs for block %s "
4407 "%" PRId64 "!= %" PRId64,
4409 (uint64_t)block->mr->addr);
4413 ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
4416 error_report("Unknown ramblock \"%s\", cannot "
4417 "accept migration", id);
4421 total_ram_bytes -= length;
4425 case RAM_SAVE_FLAG_ZERO:
4426 ch = qemu_get_byte(f);
4427 ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
4430 case RAM_SAVE_FLAG_PAGE:
4431 qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
4434 case RAM_SAVE_FLAG_COMPRESS_PAGE:
4435 len = qemu_get_be32(f);
4436 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
4437 error_report("Invalid compressed data length: %d", len);
4441 decompress_data_with_multi_threads(f, host, len);
4444 case RAM_SAVE_FLAG_XBZRLE:
4445 if (load_xbzrle(f, addr, host) < 0) {
4446 error_report("Failed to decompress XBZRLE page at "
4447 RAM_ADDR_FMT, addr);
4452 case RAM_SAVE_FLAG_MULTIFD_FLUSH:
        multifd_recv_sync_main();
        break;

    case RAM_SAVE_FLAG_EOS:
        if (migrate_multifd_flush_after_each_section()) {
            multifd_recv_sync_main();
        }
        break;
    default:
        if (flags & RAM_SAVE_FLAG_HOOK) {
            ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
        } else {
            error_report("Unknown combination of migration flags: 0x%x",
                         flags);
            ret = -EINVAL;
        }
    }

    if (!ret) {
        ret = qemu_file_get_error(f);
    }
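    /*
     * When COLO is enabled but not yet active, the page was loaded straight
     * into guest RAM ('host'); mirror it into the colo_cache backup
     * ('host_bak') as well.
     */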
    if (!ret && host_bak) {
        memcpy(host_bak, host, TARGET_PAGE_SIZE);
    }

    ret |= wait_for_decompress_done();
static int ram_load(QEMUFile *f, void *opaque, int version_id)
{
    int ret = 0;
    static uint64_t seq_iter;
    /*
     * If the system is running in postcopy mode, page inserts to host
     * memory must be atomic.
     */
    bool postcopy_running = postcopy_is_running();
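    /*
     * Only stream version 4 is supported; it must match the version passed
     * to register_savevm_live() in ram_mig_init().
     */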
    if (version_id != 4) {
        return -EINVAL;
    }

    /*
     * This RCU critical section can be very long running.
     * If RCU reclaims within this code become numerous, it will be
     * necessary to reduce the granularity of this critical section.
     */
    WITH_RCU_READ_LOCK_GUARD() {
        if (postcopy_running) {
            /*
             * Note! Here RAM_CHANNEL_PRECOPY is the precopy channel of
             * postcopy migration; we have another RAM_CHANNEL_POSTCOPY to
             * service fast page faults.
             */
            ret = ram_load_postcopy(f, RAM_CHANNEL_PRECOPY);
        } else {
            ret = ram_load_precopy(f);
        }
    }
    trace_ram_load_complete(ret, seq_iter);
    return ret;
}
static bool ram_has_postcopy(void *opaque)
{
    RAMBlock *rb;

    RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
        if (ramblock_is_pmem(rb)) {
            info_report("Block: %s, host: %p is an nvdimm memory, postcopy "
                        "is not supported now!", rb->idstr, rb->host);
            return false;
        }
    }
    return migrate_postcopy_ram();
}
/* Sync all the dirty bitmaps with the destination VM. */
static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs)
{
    RAMBlock *block;
    QEMUFile *file = s->to_dst_file;
    int ramblock_count = 0;

    trace_ram_dirty_bitmap_sync_start();

    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        qemu_savevm_send_recv_bitmap(file, block->idstr);
        trace_ram_dirty_bitmap_request(block->idstr);
        ramblock_count++;
    }

    trace_ram_dirty_bitmap_sync_wait();

    /* Wait until all the ramblocks' dirty bitmaps are synced */
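    /*
     * The destination replies with one bitmap per block; each reply makes
     * ram_dirty_bitmap_reload() post rp_sem once, so waiting ramblock_count
     * times means every bitmap has been received and applied.
     */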
    while (ramblock_count--) {
        qemu_sem_wait(&s->rp_state.rp_sem);
    }

    trace_ram_dirty_bitmap_sync_complete();
    return 0;
}

static void ram_dirty_bitmap_reload_notify(MigrationState *s)
{
    qemu_sem_post(&s->rp_state.rp_sem);
}

/*
 * Read the received bitmap and revert it as the initial dirty bitmap.
 * This is only used when a postcopy migration is paused and needs to
 * resume from a middle point.
 */
int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block)
{
    int ret = -EINVAL;
    /* from_dst_file is always valid because we're within rp_thread */
    QEMUFile *file = s->rp_state.from_dst_file;
    unsigned long *le_bitmap, nbits = block->used_length >> TARGET_PAGE_BITS;
    uint64_t local_size = DIV_ROUND_UP(nbits, 8);
    uint64_t size, end_mark;

    trace_ram_dirty_bitmap_reload_begin(block->idstr);

    if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
        error_report("%s: incorrect state %s", __func__,
                     MigrationStatus_str(s->state));
        return -EINVAL;
    }

    /*
     * Note: see comments in ramblock_recv_bitmap_send() on why we
     * need the endianness conversion, and the paddings.
     */
    local_size = ROUND_UP(local_size, 8);
    le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
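    /*
     * Wire format (see ramblock_recv_bitmap_send()): a be64 size, then the
     * little-endian bitmap padded to a multiple of 8 bytes, then a be64 end
     * mark (RAMBLOCK_RECV_BITMAP_ENDING).
     */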
    size = qemu_get_be64(file);

    /* The size of the bitmap should match our ramblock */
    if (size != local_size) {
        error_report("%s: ramblock '%s' bitmap size mismatch "
                     "(0x%"PRIx64" != 0x%"PRIx64")", __func__,
                     block->idstr, size, local_size);
        ret = -EINVAL;
        goto out;
    }

    size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size);
    end_mark = qemu_get_be64(file);

    ret = qemu_file_get_error(file);
    if (ret || size != local_size) {
        error_report("%s: read bitmap failed for ramblock '%s': %d"
                     " (size 0x%"PRIx64", got: 0x%"PRIx64")",
                     __func__, block->idstr, ret, local_size, size);
        ret = -EIO;
        goto out;
    }

    if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) {
        error_report("%s: ramblock '%s' end mark incorrect: 0x%"PRIx64,
                     __func__, block->idstr, end_mark);
        ret = -EINVAL;
        goto out;
    }

    /*
     * Endianness conversion. We are during postcopy (though paused).
     * The dirty bitmap won't change, so we can modify it directly.
     */
    bitmap_from_le(block->bmap, le_bitmap, nbits);

    /*
     * What we received is the "received bitmap". Revert it as the initial
     * dirty bitmap for this ramblock.
     */
    bitmap_complement(block->bmap, block->bmap, nbits);

    /* Clear dirty bits of discarded ranges that we don't want to migrate. */
    ramblock_dirty_bitmap_clear_discarded_pages(block);

    /* We'll recalculate migration_dirty_pages in ram_state_resume_prepare(). */
    trace_ram_dirty_bitmap_reload_complete(block->idstr);

    /*
     * We have successfully synced the bitmap for this ramblock. If this is
     * the last one to sync, we need to notify the main send thread.
     */
    ram_dirty_bitmap_reload_notify(s);

    ret = 0;
out:
    g_free(le_bitmap);
    return ret;
}
static int ram_resume_prepare(MigrationState *s, void *opaque)
{
    RAMState *rs = *(RAMState **)opaque;
    int ret;

    ret = ram_dirty_bitmap_sync_all(s, rs);
    if (ret) {
        return ret;
    }
    ram_state_resume_prepare(rs, s->to_dst_file);
    return 0;
}

void postcopy_preempt_shutdown_file(MigrationState *s)
{
    qemu_put_be64(s->postcopy_qemufile_src, RAM_SAVE_FLAG_EOS);
    qemu_fflush(s->postcopy_qemufile_src);
}
static SaveVMHandlers savevm_ram_handlers = {
    .save_setup = ram_save_setup,
    .save_live_iterate = ram_save_iterate,
    .save_live_complete_postcopy = ram_save_complete,
    .save_live_complete_precopy = ram_save_complete,
    .has_postcopy = ram_has_postcopy,
    .state_pending_exact = ram_state_pending_exact,
    .state_pending_estimate = ram_state_pending_estimate,
    .load_state = ram_load,
    .save_cleanup = ram_save_cleanup,
    .load_setup = ram_load_setup,
    .load_cleanup = ram_load_cleanup,
    .resume_prepare = ram_resume_prepare,
};
static void ram_mig_ram_block_resized(RAMBlockNotifier *n, void *host,
                                      size_t old_size, size_t new_size)
{
    PostcopyState ps = postcopy_state_get();
    ram_addr_t offset;
    RAMBlock *rb = qemu_ram_block_from_host(host, false, &offset);
    Error *err = NULL;

    if (ramblock_is_ignored(rb)) {
        return;
    }

    if (!migration_is_idle()) {
        /*
         * Precopy code on the source cannot deal with the size of RAM blocks
         * changing at random points in time; in particular, after the RAM
         * block sizes have been sent in the migration stream, they must no
         * longer change. Abort and indicate a proper reason.
         */
        error_setg(&err, "RAM block '%s' resized during precopy.", rb->idstr);
        migration_cancel(err);
        error_free(err);
    }

    switch (ps) {
    case POSTCOPY_INCOMING_ADVISE:
        /*
         * Update what ram_postcopy_incoming_init()->init_range() did at the
         * time postcopy was advised. Syncing RAM blocks with the source will
         * result in RAM resizes.
         */
        if (old_size < new_size) {
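            /*
             * init_range() discarded the whole block when postcopy was
             * advised; discard the newly added tail too so the entire block
             * stays unpopulated until pages arrive from the source.
             */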
            if (ram_discard_range(rb->idstr, old_size, new_size - old_size)) {
                error_report("RAM block '%s' discard of resized RAM failed",
                             rb->idstr);
            }
        }
        rb->postcopy_length = new_size;
        break;
    case POSTCOPY_INCOMING_NONE:
    case POSTCOPY_INCOMING_RUNNING:
    case POSTCOPY_INCOMING_END:
        /*
         * Once our guest is running, postcopy no longer cares about
         * resizes. When growing, the new memory was not available on the
         * source, so no handler is needed.
         */
        break;
    default:
        error_report("RAM block '%s' resized during postcopy state: %d",
                     rb->idstr, ps);
        exit(-1);
    }
}
static RAMBlockNotifier ram_mig_ram_notifier = {
    .ram_block_resized = ram_mig_ram_block_resized,
};

void ram_mig_init(void)
{
    qemu_mutex_init(&XBZRLE.lock);
    register_savevm_live("ram", 0, 4, &savevm_ram_handlers, &ram_state);
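    /*
     * Watch for RAM block resizes so an in-progress migration can be
     * aborted (see ram_mig_ram_block_resized() above).
     */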
    ram_block_notifier_add(&ram_mig_ram_notifier);
}