4 * Copyright (c) 2003-2008 Fabrice Bellard
5 * Copyright (c) 2011-2015 Red Hat Inc
8 * Juan Quintela <quintela@redhat.com>
10 * Permission is hereby granted, free of charge, to any person obtaining a copy
11 * of this software and associated documentation files (the "Software"), to deal
12 * in the Software without restriction, including without limitation the rights
13 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14 * copies of the Software, and to permit persons to whom the Software is
15 * furnished to do so, subject to the following conditions:
17 * The above copyright notice and this permission notice shall be included in
18 * all copies or substantial portions of the Software.
20 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
23 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
29 #include "qemu/osdep.h"
30 #include "qemu/cutils.h"
31 #include "qemu/bitops.h"
32 #include "qemu/bitmap.h"
33 #include "qemu/madvise.h"
34 #include "qemu/main-loop.h"
35 #include "io/channel-null.h"
38 #include "migration.h"
39 #include "migration/register.h"
40 #include "migration/misc.h"
41 #include "qemu-file.h"
42 #include "postcopy-ram.h"
43 #include "page_cache.h"
44 #include "qemu/error-report.h"
45 #include "qapi/error.h"
46 #include "qapi/qapi-types-migration.h"
47 #include "qapi/qapi-events-migration.h"
48 #include "qapi/qmp/qerror.h"
50 #include "exec/ram_addr.h"
51 #include "exec/target_page.h"
52 #include "qemu/rcu_queue.h"
53 #include "migration/colo.h"
55 #include "sysemu/cpu-throttle.h"
59 #include "sysemu/runstate.h"
61 #include "hw/boards.h" /* for machine_dump_guest_core() */
63 #if defined(__linux__)
64 #include "qemu/userfaultfd.h"
65 #endif /* defined(__linux__) */
67 /***********************************************************/
68 /* ram save/restore */
71 * RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS; it
72 * worked for pages that were filled with the same byte. We switched
73 * it to only search for the zero value, and renamed it to avoid
74 * confusion with RAM_SAVE_FLAG_COMPRESS_PAGE.
77 * RAM_SAVE_FLAG_FULL was obsoleted in 2009; it can be reused now
79 #define RAM_SAVE_FLAG_FULL 0x01
80 #define RAM_SAVE_FLAG_ZERO 0x02
81 #define RAM_SAVE_FLAG_MEM_SIZE 0x04
82 #define RAM_SAVE_FLAG_PAGE 0x08
83 #define RAM_SAVE_FLAG_EOS 0x10
84 #define RAM_SAVE_FLAG_CONTINUE 0x20
85 #define RAM_SAVE_FLAG_XBZRLE 0x40
86 /* 0x80 is reserved in qemu-file.h for RAM_SAVE_FLAG_HOOK */
87 #define RAM_SAVE_FLAG_COMPRESS_PAGE 0x100
88 /* We can't use any flag that is bigger than 0x200 */
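/*
 * These flags are OR'd into the low bits of the page offset before the
 * offset is written to the stream (see save_page_header() below). Since
 * page offsets are target-page aligned, the low bits are free to carry
 * the flags, and the load side separates them from the address again.
 */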
90 int (*xbzrle_encode_buffer_func)(uint8_t *, uint8_t *, int,
91 uint8_t *, int) = xbzrle_encode_buffer;
92 #if defined(CONFIG_AVX512BW_OPT)
93 #include "qemu/cpuid.h"
94 static void __attribute__((constructor)) init_cpu_flag(void)
96 unsigned max = __get_cpuid_max(0, NULL);
99 __cpuid(1, a, b, c, d);
100 /* We must check that AVX is not just available, but usable. */
101 if ((c & bit_OSXSAVE) && (c & bit_AVX) && max >= 7) {
103 __asm("xgetbv" : "=a"(bv), "=d"(d) : "c"(0));
104 __cpuid_count(7, 0, a, b, c, d);
106 * XCR0[7:5] = 111b (OPMASK state, upper 256-bit of ZMM0-ZMM15
107 * and ZMM16-ZMM31 state are enabled by OS)
108 * XCR0[2:1] = 11b (XMM state and YMM state are enabled by OS)
110 if ((bv & 0xe6) == 0xe6 && (b & bit_AVX512BW)) {
111 xbzrle_encode_buffer_func = xbzrle_encode_buffer_avx512;
118 XBZRLECacheStats xbzrle_counters;
120 /* used by the search for pages to send */
121 struct PageSearchStatus {
122 /* The migration channel used for a specific host page */
123 QEMUFile *pss_channel;
124 /* Last block from where we have sent data */
125 RAMBlock *last_sent_block;
126 /* Current block being searched */
128 /* Current page to search from */
130 /* Set once we wrap around */
132 /* Whether we're sending a host page */
133 bool host_page_sending;
134 /* The start/end of current host page. Invalid if host_page_sending==false */
135 unsigned long host_page_start;
136 unsigned long host_page_end;
138 typedef struct PageSearchStatus PageSearchStatus;
140 /* struct contains XBZRLE cache and a static page
141 used by the compression */
143 /* buffer used for XBZRLE encoding */
144 uint8_t *encoded_buf;
145 /* buffer for storing page content */
146 uint8_t *current_buf;
147 /* Cache for XBZRLE, Protected by lock. */
150 /* it will store a page full of zeros */
151 uint8_t *zero_target_page;
152 /* buffer used for XBZRLE decoding */
153 uint8_t *decoded_buf;
156 static void XBZRLE_cache_lock(void)
158 if (migrate_use_xbzrle()) {
159 qemu_mutex_lock(&XBZRLE.lock);
163 static void XBZRLE_cache_unlock(void)
165 if (migrate_use_xbzrle()) {
166 qemu_mutex_unlock(&XBZRLE.lock);
171 * xbzrle_cache_resize: resize the xbzrle cache
173 * This function is called from migrate_params_apply in the main
174 * thread, possibly while a migration is in progress. A running
175 * migration may be using the cache and might finish during this call,
176 * hence changes to the cache are protected by XBZRLE.lock.
178 * Returns 0 for success or -1 for error
180 * @new_size: new cache size
181 * @errp: set *errp if the check failed, with reason
183 int xbzrle_cache_resize(uint64_t new_size, Error **errp)
185 PageCache *new_cache;
188 /* Check for truncation */
189 if (new_size != (size_t)new_size) {
190 error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
191 "exceeding address space");
195 if (new_size == migrate_xbzrle_cache_size()) {
202 if (XBZRLE.cache != NULL) {
203 new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp);
209 cache_fini(XBZRLE.cache);
210 XBZRLE.cache = new_cache;
213 XBZRLE_cache_unlock();
217 static bool postcopy_preempt_active(void)
219 return migrate_postcopy_preempt() && migration_in_postcopy();
222 bool ramblock_is_ignored(RAMBlock *block)
224 return !qemu_ram_is_migratable(block) ||
225 (migrate_ignore_shared() && qemu_ram_is_shared(block));
228 #undef RAMBLOCK_FOREACH
230 int foreach_not_ignored_block(RAMBlockIterFunc func, void *opaque)
235 RCU_READ_LOCK_GUARD();
237 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
238 ret = func(block, opaque);
246 static void ramblock_recv_map_init(void)
250 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
251 assert(!rb->receivedmap);
252 rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits());
256 int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr)
258 return test_bit(ramblock_recv_bitmap_offset(host_addr, rb),
262 bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset)
264 return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap);
267 void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr)
269 set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap);
272 void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr,
275 bitmap_set_atomic(rb->receivedmap,
276 ramblock_recv_bitmap_offset(host_addr, rb),
280 #define RAMBLOCK_RECV_BITMAP_ENDING (0x0123456789abcdefULL)
283 * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes).
285 * Returns the number of bytes sent (>0) on success, or <0 on error.
287 int64_t ramblock_recv_bitmap_send(QEMUFile *file,
288 const char *block_name)
290 RAMBlock *block = qemu_ram_block_by_name(block_name);
291 unsigned long *le_bitmap, nbits;
295 error_report("%s: invalid block name: %s", __func__, block_name);
299 nbits = block->postcopy_length >> TARGET_PAGE_BITS;
302 * Make sure the tmp bitmap buffer is big enough, e.g., on 32bit
303 * machines we may need 4 more bytes for padding (see below
304 * comment). So extend it a bit beforehand.
306 le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
309 * Always use little endian when sending the bitmap. This is
310 * required in case the source and destination VMs are not using the
311 * same endianness. (Note: big endian won't work.)
313 bitmap_to_le(le_bitmap, block->receivedmap, nbits);
315 /* Size of the bitmap, in bytes */
316 size = DIV_ROUND_UP(nbits, 8);
319 * size is always aligned to 8 bytes for 64bit machines, but it
320 * may not be true for 32bit machines. We need this padding to
321 * make sure the migration can survive even between 32bit and 64bit machines.
324 size = ROUND_UP(size, 8);
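/* e.g. nbits == 100 -> size == 13 bytes, padded up to 16 here */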
326 qemu_put_be64(file, size);
327 qemu_put_buffer(file, (const uint8_t *)le_bitmap, size);
329 * Mark as an end, in case the middle part is screwed up due to
330 * some "mysterious" reason.
332 qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING);
337 if (qemu_file_get_error(file)) {
338 return qemu_file_get_error(file);
341 return size + sizeof(size);
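/*
 * For example, a block with 100 trackable pages gives nbits == 100 and a
 * padded bitmap size of 16: the stream then carries an 8-byte size field
 * (16), 16 bytes of little-endian bitmap, and the 8-byte
 * RAMBLOCK_RECV_BITMAP_ENDING marker; size + 8 (24 here) is returned.
 */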
345 * An outstanding page request, on the source, having been received and queued
348 struct RAMSrcPageRequest {
353 QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
356 /* State of RAM for migration */
359 * PageSearchStatus structures for the channels when sending pages.
360 * Protected by the bitmap_mutex.
362 PageSearchStatus pss[RAM_CHANNEL_MAX];
363 /* UFFD file descriptor, used in 'write-tracking' migration */
365 /* total ram size in bytes */
366 uint64_t ram_bytes_total;
367 /* Last block that we have visited searching for dirty pages */
368 RAMBlock *last_seen_block;
369 /* Last dirty target page we have sent */
370 ram_addr_t last_page;
371 /* last ram version we have seen */
372 uint32_t last_version;
373 /* How many times we have had too many dirty pages */
374 int dirty_rate_high_cnt;
375 /* these variables are used for bitmap sync */
376 /* last time we did a full bitmap_sync */
377 int64_t time_last_bitmap_sync;
378 /* bytes transferred at start_time */
379 uint64_t bytes_xfer_prev;
380 /* number of dirty pages since start_time */
381 uint64_t num_dirty_pages_period;
382 /* xbzrle misses since the beginning of the period */
383 uint64_t xbzrle_cache_miss_prev;
384 /* Amount of xbzrle pages since the beginning of the period */
385 uint64_t xbzrle_pages_prev;
386 /* Amount of xbzrle encoded bytes since the beginning of the period */
387 uint64_t xbzrle_bytes_prev;
388 /* Start using XBZRLE (e.g., after the first round). */
390 /* Are we on the last stage of migration */
392 /* compression statistics since the beginning of the period */
393 /* number of times there was no free thread to compress data */
394 uint64_t compress_thread_busy_prev;
395 /* number of bytes after compression */
396 uint64_t compressed_size_prev;
397 /* amount of compressed pages */
398 uint64_t compress_pages_prev;
400 /* total handled target pages at the beginning of period */
401 uint64_t target_page_count_prev;
402 /* total handled target pages since start */
403 uint64_t target_page_count;
404 /* number of dirty bits in the bitmap */
405 uint64_t migration_dirty_pages;
408 * - dirty/clear bitmap
409 * - migration_dirty_pages
412 QemuMutex bitmap_mutex;
413 /* The RAMBlock used in the last src_page_requests */
414 RAMBlock *last_req_rb;
415 /* Queue of outstanding page requests from the destination */
416 QemuMutex src_page_req_mutex;
417 QSIMPLEQ_HEAD(, RAMSrcPageRequest) src_page_requests;
419 typedef struct RAMState RAMState;
421 static RAMState *ram_state;
423 static NotifierWithReturnList precopy_notifier_list;
425 /* Whether postcopy has queued requests? */
426 static bool postcopy_has_request(RAMState *rs)
428 return !QSIMPLEQ_EMPTY_ATOMIC(&rs->src_page_requests);
431 void precopy_infrastructure_init(void)
433 notifier_with_return_list_init(&precopy_notifier_list);
436 void precopy_add_notifier(NotifierWithReturn *n)
438 notifier_with_return_list_add(&precopy_notifier_list, n);
441 void precopy_remove_notifier(NotifierWithReturn *n)
443 notifier_with_return_remove(n);
446 int precopy_notify(PrecopyNotifyReason reason, Error **errp)
448 PrecopyNotifyData pnd;
452 return notifier_with_return_list_notify(&precopy_notifier_list, &pnd);
455 uint64_t ram_bytes_remaining(void)
457 return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) :
461 RAMStats ram_counters;
463 void ram_transferred_add(uint64_t bytes)
465 if (runstate_is_running()) {
466 stat64_add(&ram_counters.precopy_bytes, bytes);
467 } else if (migration_in_postcopy()) {
468 stat64_add(&ram_counters.postcopy_bytes, bytes);
470 ram_counters.downtime_bytes += bytes;
472 stat64_add(&ram_counters.transferred, bytes);
475 struct MigrationOps {
476 int (*ram_save_target_page)(RAMState *rs, PageSearchStatus *pss);
478 typedef struct MigrationOps MigrationOps;
480 MigrationOps *migration_ops;
482 CompressionStats compression_counters;
484 struct CompressParam {
494 /* internally used fields */
498 typedef struct CompressParam CompressParam;
500 struct DecompressParam {
510 typedef struct DecompressParam DecompressParam;
512 static CompressParam *comp_param;
513 static QemuThread *compress_threads;
514 /* comp_done_cond is used to wake up the migration thread when
515 * one of the compression threads has finished the compression.
516 * comp_done_lock is used together with comp_done_cond.
518 static QemuMutex comp_done_lock;
519 static QemuCond comp_done_cond;
521 static QEMUFile *decomp_file;
522 static DecompressParam *decomp_param;
523 static QemuThread *decompress_threads;
524 static QemuMutex decomp_done_lock;
525 static QemuCond decomp_done_cond;
527 static int ram_save_host_page_urgent(PageSearchStatus *pss);
529 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
530 ram_addr_t offset, uint8_t *source_buf);
532 /* NOTE: page is the PFN not real ram_addr_t. */
533 static void pss_init(PageSearchStatus *pss, RAMBlock *rb, ram_addr_t page)
537 pss->complete_round = false;
541 * Check whether two PSSs are actively sending the same page. Return true
542 * if it is, false otherwise.
544 static bool pss_overlap(PageSearchStatus *pss1, PageSearchStatus *pss2)
546 return pss1->host_page_sending && pss2->host_page_sending &&
547 (pss1->host_page_start == pss2->host_page_start);
550 static void *do_data_compress(void *opaque)
552 CompressParam *param = opaque;
557 qemu_mutex_lock(&param->mutex);
558 while (!param->quit) {
560 block = param->block;
561 offset = param->offset;
563 qemu_mutex_unlock(&param->mutex);
565 zero_page = do_compress_ram_page(param->file, &param->stream,
566 block, offset, param->originbuf);
568 qemu_mutex_lock(&comp_done_lock);
570 param->zero_page = zero_page;
571 qemu_cond_signal(&comp_done_cond);
572 qemu_mutex_unlock(&comp_done_lock);
574 qemu_mutex_lock(&param->mutex);
576 qemu_cond_wait(&param->cond, &param->mutex);
579 qemu_mutex_unlock(&param->mutex);
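/*
 * Tear down the compression threads: ask each properly-initialized thread
 * to quit, join it, then release its zlib stream, buffers and dummy
 * QEMUFile, and finally the shared comp_done lock/cond and the arrays.
 */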
584 static void compress_threads_save_cleanup(void)
588 if (!migrate_use_compression() || !comp_param) {
592 thread_count = migrate_compress_threads();
593 for (i = 0; i < thread_count; i++) {
595 * we use it as an indicator which shows if the thread is
596 * properly initialized or not
598 if (!comp_param[i].file) {
602 qemu_mutex_lock(&comp_param[i].mutex);
603 comp_param[i].quit = true;
604 qemu_cond_signal(&comp_param[i].cond);
605 qemu_mutex_unlock(&comp_param[i].mutex);
607 qemu_thread_join(compress_threads + i);
608 qemu_mutex_destroy(&comp_param[i].mutex);
609 qemu_cond_destroy(&comp_param[i].cond);
610 deflateEnd(&comp_param[i].stream);
611 g_free(comp_param[i].originbuf);
612 qemu_fclose(comp_param[i].file);
613 comp_param[i].file = NULL;
615 qemu_mutex_destroy(&comp_done_lock);
616 qemu_cond_destroy(&comp_done_cond);
617 g_free(compress_threads);
619 compress_threads = NULL;
623 static int compress_threads_save_setup(void)
627 if (!migrate_use_compression()) {
630 thread_count = migrate_compress_threads();
631 compress_threads = g_new0(QemuThread, thread_count);
632 comp_param = g_new0(CompressParam, thread_count);
633 qemu_cond_init(&comp_done_cond);
634 qemu_mutex_init(&comp_done_lock);
635 for (i = 0; i < thread_count; i++) {
636 comp_param[i].originbuf = g_try_malloc(TARGET_PAGE_SIZE);
637 if (!comp_param[i].originbuf) {
641 if (deflateInit(&comp_param[i].stream,
642 migrate_compress_level()) != Z_OK) {
643 g_free(comp_param[i].originbuf);
647 /* comp_param[i].file is just used as a dummy buffer to save data,
648 * set its ops to empty.
650 comp_param[i].file = qemu_file_new_output(
651 QIO_CHANNEL(qio_channel_null_new()));
652 comp_param[i].done = true;
653 comp_param[i].quit = false;
654 qemu_mutex_init(&comp_param[i].mutex);
655 qemu_cond_init(&comp_param[i].cond);
656 qemu_thread_create(compress_threads + i, "compress",
657 do_data_compress, comp_param + i,
658 QEMU_THREAD_JOINABLE);
663 compress_threads_save_cleanup();
668 * save_page_header: write page header to wire
670 * If this is the 1st block, it also writes the block identification
672 * Returns the number of bytes written
674 * @pss: current PSS channel status
675 * @block: block that contains the page we want to send
676 * @offset: offset inside the block for the page;
677 * the lower bits contain flags
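*
* On the wire this is a be64 of (offset | flags); when the block changes
* (i.e. RAM_SAVE_FLAG_CONTINUE is not set) it is followed by one length
* byte and the block idstr.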
679 static size_t save_page_header(PageSearchStatus *pss, QEMUFile *f,
680 RAMBlock *block, ram_addr_t offset)
683 bool same_block = (block == pss->last_sent_block);
686 offset |= RAM_SAVE_FLAG_CONTINUE;
688 qemu_put_be64(f, offset);
692 len = strlen(block->idstr);
693 qemu_put_byte(f, len);
694 qemu_put_buffer(f, (uint8_t *)block->idstr, len);
696 pss->last_sent_block = block;
702 * mig_throttle_guest_down: throttle down the guest
704 * Reduce amount of guest cpu execution to hopefully slow down memory
705 * writes. If guest dirty memory rate is reduced below the rate at
706 * which we can transfer pages to the destination then we should be
707 * able to complete migration. Some workloads dirty memory way too
708 * fast and will not effectively converge, even with auto-converge.
710 static void mig_throttle_guest_down(uint64_t bytes_dirty_period,
711 uint64_t bytes_dirty_threshold)
713 MigrationState *s = migrate_get_current();
714 uint64_t pct_initial = s->parameters.cpu_throttle_initial;
715 uint64_t pct_increment = s->parameters.cpu_throttle_increment;
716 bool pct_tailslow = s->parameters.cpu_throttle_tailslow;
717 int pct_max = s->parameters.max_cpu_throttle;
719 uint64_t throttle_now = cpu_throttle_get_percentage();
720 uint64_t cpu_now, cpu_ideal, throttle_inc;
722 /* We have not started throttling yet. Let's start it. */
723 if (!cpu_throttle_active()) {
724 cpu_throttle_set(pct_initial);
726 /* Throttling already on, just increase the rate */
728 throttle_inc = pct_increment;
730 /* Compute the ideal CPU percentage used by the guest, i.e. the
731 * percentage at which the dirty rate would match the dirty rate threshold. */
732 cpu_now = 100 - throttle_now;
733 cpu_ideal = cpu_now * (bytes_dirty_threshold * 1.0 /
735 throttle_inc = MIN(cpu_now - cpu_ideal, pct_increment);
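/*
 * For example: with throttle_now == 20 the guest currently gets 80% CPU.
 * If the dirty threshold is half of what was dirtied this period,
 * cpu_ideal == 40, so throttling grows by min(40, pct_increment).
 */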
737 cpu_throttle_set(MIN(throttle_now + throttle_inc, pct_max));
741 void mig_throttle_counter_reset(void)
743 RAMState *rs = ram_state;
745 rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
746 rs->num_dirty_pages_period = 0;
747 rs->bytes_xfer_prev = stat64_get(&ram_counters.transferred);
751 * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
753 * @rs: current RAM state
754 * @current_addr: address for the zero page
756 * Update the xbzrle cache to reflect a page that's been sent as all 0.
757 * The important thing is that a stale (not-yet-0'd) page be replaced by the new data.
759 * As a bonus, if the page wasn't in the cache it gets added so that
760 * when a small write is made into the 0'd page it gets XBZRLE sent.
762 static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
764 /* We don't care if this fails to allocate a new cache page
765 * as long as it updated an old one */
766 cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
767 ram_counters.dirty_sync_count);
770 #define ENCODING_FLAG_XBZRLE 0x1
773 * save_xbzrle_page: compress and send current page
775 * Returns: 1 means that we wrote the page
776 * 0 means that page is identical to the one already sent
777 * -1 means that xbzrle would be longer than normal
779 * @rs: current RAM state
780 * @pss: current PSS channel
781 * @current_data: pointer to the address of the page contents
782 * @current_addr: addr of the page
783 * @block: block that contains the page we want to send
784 * @offset: offset inside the block for the page
786 static int save_xbzrle_page(RAMState *rs, PageSearchStatus *pss,
787 uint8_t **current_data, ram_addr_t current_addr,
788 RAMBlock *block, ram_addr_t offset)
790 int encoded_len = 0, bytes_xbzrle;
791 uint8_t *prev_cached_page;
792 QEMUFile *file = pss->pss_channel;
794 if (!cache_is_cached(XBZRLE.cache, current_addr,
795 ram_counters.dirty_sync_count)) {
796 xbzrle_counters.cache_miss++;
797 if (!rs->last_stage) {
798 if (cache_insert(XBZRLE.cache, current_addr, *current_data,
799 ram_counters.dirty_sync_count) == -1) {
802 /* update *current_data when the page has been
803 inserted into cache */
804 *current_data = get_cached_data(XBZRLE.cache, current_addr);
811 * Reaching here means the page has hit the xbzrle cache, no matter what
812 * encoding result it is (normal encoding, overflow or skipping the page),
813 * count the page as encoded. This is used to calculate the encoding rate.
815 * Example: 2 pages (8KB) being encoded, first page encoding generates 2KB,
816 * 2nd page turns out to be skipped (i.e. no new bytes written to the
817 * page), the overall encoding rate will be 8KB / 2KB = 4, which has the
818 * skipped page included. In this way, the encoding rate can tell if the
819 * guest page is good for xbzrle encoding.
821 xbzrle_counters.pages++;
822 prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
824 /* save current buffer into memory */
825 memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
827 /* XBZRLE encoding (if there is no overflow) */
828 encoded_len = xbzrle_encode_buffer_func(prev_cached_page, XBZRLE.current_buf,
829 TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
833 * Update the cache contents, so that it corresponds to the data
834 * sent, in all cases except where we skip the page.
836 if (!rs->last_stage && encoded_len != 0) {
837 memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
839 * In the case where we couldn't compress, ensure that the caller
840 * sends the data from the cache, since the guest might have
841 * changed the RAM since we copied it.
843 *current_data = prev_cached_page;
846 if (encoded_len == 0) {
847 trace_save_xbzrle_page_skipping();
849 } else if (encoded_len == -1) {
850 trace_save_xbzrle_page_overflow();
851 xbzrle_counters.overflow++;
852 xbzrle_counters.bytes += TARGET_PAGE_SIZE;
856 /* Send XBZRLE based compressed page */
857 bytes_xbzrle = save_page_header(pss, pss->pss_channel, block,
858 offset | RAM_SAVE_FLAG_XBZRLE);
859 qemu_put_byte(file, ENCODING_FLAG_XBZRLE);
860 qemu_put_be16(file, encoded_len);
861 qemu_put_buffer(file, XBZRLE.encoded_buf, encoded_len);
862 bytes_xbzrle += encoded_len + 1 + 2;
864 * Like compressed_size (please see update_compress_thread_counts),
865 * the xbzrle encoded bytes don't count the 8 byte header with
866 * RAM_SAVE_FLAG_CONTINUE.
868 xbzrle_counters.bytes += bytes_xbzrle - 8;
869 ram_transferred_add(bytes_xbzrle);
875 * pss_find_next_dirty: find the next dirty page of current ramblock
877 * This function updates pss->page to point to the next dirty page index
878 * within the ramblock to migrate, or the end of ramblock when nothing
879 * found. Note that when pss->host_page_sending==true it means we're
880 * in the middle of sending a host page, so we won't look for dirty pages
881 * outside the host page boundary.
883 * @pss: the current page search status
885 static void pss_find_next_dirty(PageSearchStatus *pss)
887 RAMBlock *rb = pss->block;
888 unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
889 unsigned long *bitmap = rb->bmap;
891 if (ramblock_is_ignored(rb)) {
892 /* Points directly to the end, so we know no dirty page */
898 * While sending a host page, only look for dirty pages within the
899 * current host page being sent.
901 if (pss->host_page_sending) {
902 assert(pss->host_page_end);
903 size = MIN(size, pss->host_page_end);
906 pss->page = find_next_bit(bitmap, size, pss->page);
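/* find_next_bit() returns size when no dirty bit is left, which matches
* the "end of ramblock" case described above. */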
909 static void migration_clear_memory_region_dirty_bitmap(RAMBlock *rb,
915 if (!rb->clear_bmap || !clear_bmap_test_and_clear(rb, page)) {
919 shift = rb->clear_bmap_shift;
921 * CLEAR_BITMAP_SHIFT_MIN should always guarantee this... this
922 * can make things easier sometimes since then the start address
923 * of the small chunk will always be aligned to 64 pages, so the
924 * bitmap will always be aligned to unsigned long. We should
925 * even be able to remove this restriction but I'm simply keeping it.
930 size = 1ULL << (TARGET_PAGE_BITS + shift);
931 start = QEMU_ALIGN_DOWN((ram_addr_t)page << TARGET_PAGE_BITS, size);
932 trace_migration_bitmap_clear_dirty(rb->idstr, start, size, page);
933 memory_region_clear_dirty_bitmap(rb->mr, start, size);
937 migration_clear_memory_region_dirty_bitmap_range(RAMBlock *rb,
939 unsigned long npages)
941 unsigned long i, chunk_pages = 1UL << rb->clear_bmap_shift;
942 unsigned long chunk_start = QEMU_ALIGN_DOWN(start, chunk_pages);
943 unsigned long chunk_end = QEMU_ALIGN_UP(start + npages, chunk_pages);
946 * Clear pages from start to start + npages - 1, so the end boundary is exclusive.
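* For example, with clear_bmap_shift == 6 a chunk covers 64 pages:
* clearing pages [70, 200) gives chunk_start == 64 and chunk_end == 256,
* so the chunks starting at pages 64, 128 and 192 are cleared below.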
949 for (i = chunk_start; i < chunk_end; i += chunk_pages) {
950 migration_clear_memory_region_dirty_bitmap(rb, i);
955 * colo_bitmap_find_dirty: find contiguous dirty pages from start
957 * Returns the page offset within the memory region of the start of the contiguous dirty pages
960 * @rs: current RAM state
961 * @rb: RAMBlock where to search for dirty pages
962 * @start: page where we start the search
963 * @num: the number of contiguous dirty pages
966 unsigned long colo_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
967 unsigned long start, unsigned long *num)
969 unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
970 unsigned long *bitmap = rb->bmap;
971 unsigned long first, next;
975 if (ramblock_is_ignored(rb)) {
979 first = find_next_bit(bitmap, size, start);
983 next = find_next_zero_bit(bitmap, size, first + 1);
984 assert(next >= first);
989 static inline bool migration_bitmap_clear_dirty(RAMState *rs,
996 * Clear dirty bitmap if needed. This _must_ be called before we
997 * send any page in the chunk, because we need to make sure
998 * we can capture further page content changes when we sync the dirty
999 * log the next time. So as long as we are going to send any of
1000 * the pages in the chunk we clear the remote dirty bitmap for all.
1001 * Clearing it earlier won't be a problem, but clearing it too late will.
1003 migration_clear_memory_region_dirty_bitmap(rb, page);
1005 ret = test_and_clear_bit(page, rb->bmap);
1007 rs->migration_dirty_pages--;
1013 static void dirty_bitmap_clear_section(MemoryRegionSection *section,
1016 const hwaddr offset = section->offset_within_region;
1017 const hwaddr size = int128_get64(section->size);
1018 const unsigned long start = offset >> TARGET_PAGE_BITS;
1019 const unsigned long npages = size >> TARGET_PAGE_BITS;
1020 RAMBlock *rb = section->mr->ram_block;
1021 uint64_t *cleared_bits = opaque;
1024 * We don't grab ram_state->bitmap_mutex because we expect to run
1025 * only when starting migration or during postcopy recovery where
1026 * we don't have concurrent access.
1028 if (!migration_in_postcopy() && !migrate_background_snapshot()) {
1029 migration_clear_memory_region_dirty_bitmap_range(rb, start, npages);
1031 *cleared_bits += bitmap_count_one_with_offset(rb->bmap, start, npages);
1032 bitmap_clear(rb->bmap, start, npages);
1036 * Exclude all dirty pages from migration that fall into a discarded range as
1037 * managed by a RamDiscardManager responsible for the mapped memory region of
1038 * the RAMBlock. Clear the corresponding bits in the dirty bitmaps.
1040 * Discarded pages ("logically unplugged") have undefined content and must
1041 * not get migrated, because even reading these pages for migration might
1042 * result in undesired behavior.
1044 * Returns the number of cleared bits in the RAMBlock dirty bitmap.
1046 * Note: The result is only stable while migrating (precopy/postcopy).
1048 static uint64_t ramblock_dirty_bitmap_clear_discarded_pages(RAMBlock *rb)
1050 uint64_t cleared_bits = 0;
1052 if (rb->mr && rb->bmap && memory_region_has_ram_discard_manager(rb->mr)) {
1053 RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
1054 MemoryRegionSection section = {
1056 .offset_within_region = 0,
1057 .size = int128_make64(qemu_ram_get_used_length(rb)),
1060 ram_discard_manager_replay_discarded(rdm, &section,
1061 dirty_bitmap_clear_section,
1064 return cleared_bits;
1068 * Check if a host-page aligned page falls into a discarded range as managed by
1069 * a RamDiscardManager responsible for the mapped memory region of the RAMBlock.
1071 * Note: The result is only stable while migrating (precopy/postcopy).
1073 bool ramblock_page_is_discarded(RAMBlock *rb, ram_addr_t start)
1075 if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
1076 RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
1077 MemoryRegionSection section = {
1079 .offset_within_region = start,
1080 .size = int128_make64(qemu_ram_pagesize(rb)),
1083 return !ram_discard_manager_is_populated(rdm, &section);
1088 /* Called with RCU critical section */
1089 static void ramblock_sync_dirty_bitmap(RAMState *rs, RAMBlock *rb)
1091 uint64_t new_dirty_pages =
1092 cpu_physical_memory_sync_dirty_bitmap(rb, 0, rb->used_length);
1094 rs->migration_dirty_pages += new_dirty_pages;
1095 rs->num_dirty_pages_period += new_dirty_pages;
1099 * ram_pagesize_summary: calculate all the pagesizes of a VM
1101 * Returns a summary bitmap of the page sizes of all RAMBlocks
1103 * For VMs with just normal pages this is equivalent to the host page
1104 * size. If it's got some huge pages then it's the OR of all the
1105 * different page sizes.
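*
* For example, a VM with 4 KiB normal pages plus one 2 MiB hugetlb
* RAMBlock yields 0x1000 | 0x200000 == 0x201000.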
1107 uint64_t ram_pagesize_summary(void)
1110 uint64_t summary = 0;
1112 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1113 summary |= block->page_size;
1119 uint64_t ram_get_total_transferred_pages(void)
1121 return stat64_get(&ram_counters.normal) +
1122 stat64_get(&ram_counters.duplicate) +
1123 compression_counters.pages + xbzrle_counters.pages;
1126 static void migration_update_rates(RAMState *rs, int64_t end_time)
1128 uint64_t page_count = rs->target_page_count - rs->target_page_count_prev;
1129 double compressed_size;
1131 /* calculate period counters */
1132 ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000
1133 / (end_time - rs->time_last_bitmap_sync);
1139 if (migrate_use_xbzrle()) {
1140 double encoded_size, unencoded_size;
1142 xbzrle_counters.cache_miss_rate = (double)(xbzrle_counters.cache_miss -
1143 rs->xbzrle_cache_miss_prev) / page_count;
1144 rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
1145 unencoded_size = (xbzrle_counters.pages - rs->xbzrle_pages_prev) *
1147 encoded_size = xbzrle_counters.bytes - rs->xbzrle_bytes_prev;
1148 if (xbzrle_counters.pages == rs->xbzrle_pages_prev || !encoded_size) {
1149 xbzrle_counters.encoding_rate = 0;
1151 xbzrle_counters.encoding_rate = unencoded_size / encoded_size;
1153 rs->xbzrle_pages_prev = xbzrle_counters.pages;
1154 rs->xbzrle_bytes_prev = xbzrle_counters.bytes;
1157 if (migrate_use_compression()) {
1158 compression_counters.busy_rate = (double)(compression_counters.busy -
1159 rs->compress_thread_busy_prev) / page_count;
1160 rs->compress_thread_busy_prev = compression_counters.busy;
1162 compressed_size = compression_counters.compressed_size -
1163 rs->compressed_size_prev;
1164 if (compressed_size) {
1165 double uncompressed_size = (compression_counters.pages -
1166 rs->compress_pages_prev) * TARGET_PAGE_SIZE;
1168 /* Compression-Ratio = Uncompressed-size / Compressed-size */
1169 compression_counters.compression_rate =
1170 uncompressed_size / compressed_size;
1172 rs->compress_pages_prev = compression_counters.pages;
1173 rs->compressed_size_prev = compression_counters.compressed_size;
1178 static void migration_trigger_throttle(RAMState *rs)
1180 MigrationState *s = migrate_get_current();
1181 uint64_t threshold = s->parameters.throttle_trigger_threshold;
1182 uint64_t bytes_xfer_period =
1183 stat64_get(&ram_counters.transferred) - rs->bytes_xfer_prev;
1184 uint64_t bytes_dirty_period = rs->num_dirty_pages_period * TARGET_PAGE_SIZE;
1185 uint64_t bytes_dirty_threshold = bytes_xfer_period * threshold / 100;
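/*
 * e.g. with threshold == 50: if the guest dirtied more bytes this period
 * than half of what was transferred, and that happens twice in a row,
 * throttling is started or increased below.
 */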
1187 /* During block migration the auto-converge logic incorrectly detects
1188 * that ram migration makes no progress. Avoid this by disabling the
1189 * throttling logic during the bulk phase of block migration. */
1190 if (migrate_auto_converge() && !blk_mig_bulk_active()) {
1191 /* The following detection logic can be refined later. For now:
1192 Check to see if the ratio between dirtied bytes and the approx.
1193 amount of bytes that just got transferred since the last time
1194 we were in this routine reaches the threshold. If that happens
1195 twice, start or increase throttling. */
1197 if ((bytes_dirty_period > bytes_dirty_threshold) &&
1198 (++rs->dirty_rate_high_cnt >= 2)) {
1199 trace_migration_throttle();
1200 rs->dirty_rate_high_cnt = 0;
1201 mig_throttle_guest_down(bytes_dirty_period,
1202 bytes_dirty_threshold);
1207 static void migration_bitmap_sync(RAMState *rs)
1212 ram_counters.dirty_sync_count++;
1214 if (!rs->time_last_bitmap_sync) {
1215 rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1218 trace_migration_bitmap_sync_start();
1219 memory_global_dirty_log_sync();
1221 qemu_mutex_lock(&rs->bitmap_mutex);
1222 WITH_RCU_READ_LOCK_GUARD() {
1223 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1224 ramblock_sync_dirty_bitmap(rs, block);
1226 ram_counters.remaining = ram_bytes_remaining();
1228 qemu_mutex_unlock(&rs->bitmap_mutex);
1230 memory_global_after_dirty_log_sync();
1231 trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);
1233 end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1235 /* more than 1 second = 1000 milliseconds */
1236 if (end_time > rs->time_last_bitmap_sync + 1000) {
1237 migration_trigger_throttle(rs);
1239 migration_update_rates(rs, end_time);
1241 rs->target_page_count_prev = rs->target_page_count;
1243 /* reset period counters */
1244 rs->time_last_bitmap_sync = end_time;
1245 rs->num_dirty_pages_period = 0;
1246 rs->bytes_xfer_prev = stat64_get(&ram_counters.transferred);
1248 if (migrate_use_events()) {
1249 qapi_event_send_migration_pass(ram_counters.dirty_sync_count);
1253 static void migration_bitmap_sync_precopy(RAMState *rs)
1255 Error *local_err = NULL;
1258 * The current notifier usage is just an optimization for migration, so we
1259 * don't stop the normal migration process in the error case.
1261 if (precopy_notify(PRECOPY_NOTIFY_BEFORE_BITMAP_SYNC, &local_err)) {
1262 error_report_err(local_err);
1266 migration_bitmap_sync(rs);
1268 if (precopy_notify(PRECOPY_NOTIFY_AFTER_BITMAP_SYNC, &local_err)) {
1269 error_report_err(local_err);
1273 void ram_release_page(const char *rbname, uint64_t offset)
1275 if (!migrate_release_ram() || !migration_in_postcopy()) {
1279 ram_discard_range(rbname, offset, TARGET_PAGE_SIZE);
1283 * save_zero_page_to_file: send the zero page to the file
1285 * Returns the size of data written to the file, 0 means the page is not a zero page
1288 * @pss: current PSS channel
1289 * @block: block that contains the page we want to send
1290 * @offset: offset inside the block for the page
1292 static int save_zero_page_to_file(PageSearchStatus *pss, QEMUFile *file,
1293 RAMBlock *block, ram_addr_t offset)
1295 uint8_t *p = block->host + offset;
1298 if (buffer_is_zero(p, TARGET_PAGE_SIZE)) {
1299 len += save_page_header(pss, file, block, offset | RAM_SAVE_FLAG_ZERO);
1300 qemu_put_byte(file, 0);
1302 ram_release_page(block->idstr, offset);
1308 * save_zero_page: send the zero page to the stream
1310 * Returns the number of pages written.
1312 * @pss: current PSS channel
1313 * @block: block that contains the page we want to send
1314 * @offset: offset inside the block for the page
1316 static int save_zero_page(PageSearchStatus *pss, QEMUFile *f, RAMBlock *block,
1319 int len = save_zero_page_to_file(pss, f, block, offset);
1322 stat64_add(&ram_counters.duplicate, 1);
1323 ram_transferred_add(len);
1330 * @pages: the number of pages written by the control path,
1332 * > 0 - number of pages written
1334 * Return true if the page has been saved, otherwise false is returned.
1336 static bool control_save_page(PageSearchStatus *pss, RAMBlock *block,
1337 ram_addr_t offset, int *pages)
1339 uint64_t bytes_xmit = 0;
1343 ret = ram_control_save_page(pss->pss_channel, block->offset, offset,
1344 TARGET_PAGE_SIZE, &bytes_xmit);
1345 if (ret == RAM_SAVE_CONTROL_NOT_SUPP) {
1350 ram_transferred_add(bytes_xmit);
1354 if (ret == RAM_SAVE_CONTROL_DELAYED) {
1358 if (bytes_xmit > 0) {
1359 stat64_add(&ram_counters.normal, 1);
1360 } else if (bytes_xmit == 0) {
1361 stat64_add(&ram_counters.duplicate, 1);
1368 * directly send the page to the stream
1370 * Returns the number of pages written.
1372 * @pss: current PSS channel
1373 * @block: block that contains the page we want to send
1374 * @offset: offset inside the block for the page
1375 * @buf: the page to be sent
1376 * @async: send the page asynchronously
1378 static int save_normal_page(PageSearchStatus *pss, RAMBlock *block,
1379 ram_addr_t offset, uint8_t *buf, bool async)
1381 QEMUFile *file = pss->pss_channel;
1383 ram_transferred_add(save_page_header(pss, pss->pss_channel, block,
1384 offset | RAM_SAVE_FLAG_PAGE));
1386 qemu_put_buffer_async(file, buf, TARGET_PAGE_SIZE,
1387 migrate_release_ram() &&
1388 migration_in_postcopy());
1390 qemu_put_buffer(file, buf, TARGET_PAGE_SIZE);
1392 ram_transferred_add(TARGET_PAGE_SIZE);
1393 stat64_add(&ram_counters.normal, 1);
1398 * ram_save_page: send the given page to the stream
1400 * Returns the number of pages written.
1402 * >=0 - Number of pages written - this might legally be 0
1403 * if xbzrle noticed the page was the same.
1405 * @rs: current RAM state
1406 * @block: block that contains the page we want to send
1407 * @offset: offset inside the block for the page
1409 static int ram_save_page(RAMState *rs, PageSearchStatus *pss)
1413 bool send_async = true;
1414 RAMBlock *block = pss->block;
1415 ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
1416 ram_addr_t current_addr = block->offset + offset;
1418 p = block->host + offset;
1419 trace_ram_save_page(block->idstr, (uint64_t)offset, p);
1421 XBZRLE_cache_lock();
1422 if (rs->xbzrle_enabled && !migration_in_postcopy()) {
1423 pages = save_xbzrle_page(rs, pss, &p, current_addr,
1425 if (!rs->last_stage) {
1426 /* Can't send this cached data async, since the cache page
1427 * might get updated before it gets to the wire
1433 /* XBZRLE overflow or normal page */
1435 pages = save_normal_page(pss, block, offset, p, send_async);
1438 XBZRLE_cache_unlock();
1443 static int ram_save_multifd_page(QEMUFile *file, RAMBlock *block,
1446 if (multifd_queue_page(file, block, offset) < 0) {
1449 stat64_add(&ram_counters.normal, 1);
1454 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
1455 ram_addr_t offset, uint8_t *source_buf)
1457 RAMState *rs = ram_state;
1458 PageSearchStatus *pss = &rs->pss[RAM_CHANNEL_PRECOPY];
1459 uint8_t *p = block->host + offset;
1462 if (save_zero_page_to_file(pss, f, block, offset)) {
1466 save_page_header(pss, f, block, offset | RAM_SAVE_FLAG_COMPRESS_PAGE);
1469 * copy it to an internal buffer to avoid it being modified by the VM,
1470 * so that we can catch any error during compression and decompression
1473 memcpy(source_buf, p, TARGET_PAGE_SIZE);
1474 ret = qemu_put_compression_data(f, stream, source_buf, TARGET_PAGE_SIZE);
1476 qemu_file_set_error(migrate_get_current()->to_dst_file, ret);
1477 error_report("compressed data failed!");
1483 update_compress_thread_counts(const CompressParam *param, int bytes_xmit)
1485 ram_transferred_add(bytes_xmit);
1487 if (param->zero_page) {
1488 stat64_add(&ram_counters.duplicate, 1);
1492 /* 8 means a header with RAM_SAVE_FLAG_CONTINUE. */
1493 compression_counters.compressed_size += bytes_xmit - 8;
1494 compression_counters.pages++;
1497 static bool save_page_use_compression(RAMState *rs);
1499 static void flush_compressed_data(RAMState *rs)
1501 MigrationState *ms = migrate_get_current();
1502 int idx, len, thread_count;
1504 if (!save_page_use_compression(rs)) {
1507 thread_count = migrate_compress_threads();
1509 qemu_mutex_lock(&comp_done_lock);
1510 for (idx = 0; idx < thread_count; idx++) {
1511 while (!comp_param[idx].done) {
1512 qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1515 qemu_mutex_unlock(&comp_done_lock);
1517 for (idx = 0; idx < thread_count; idx++) {
1518 qemu_mutex_lock(&comp_param[idx].mutex);
1519 if (!comp_param[idx].quit) {
1520 len = qemu_put_qemu_file(ms->to_dst_file, comp_param[idx].file);
1522 * it's safe to fetch zero_page without holding comp_done_lock
1523 * as there is no further request submitted to the thread,
1524 * i.e, the thread should be waiting for a request at this point.
1526 update_compress_thread_counts(&comp_param[idx], len);
1528 qemu_mutex_unlock(&comp_param[idx].mutex);
1532 static inline void set_compress_params(CompressParam *param, RAMBlock *block,
1535 param->block = block;
1536 param->offset = offset;
1539 static int compress_page_with_multi_thread(RAMBlock *block, ram_addr_t offset)
1541 int idx, thread_count, bytes_xmit = -1, pages = -1;
1542 bool wait = migrate_compress_wait_thread();
1543 MigrationState *ms = migrate_get_current();
1545 thread_count = migrate_compress_threads();
1546 qemu_mutex_lock(&comp_done_lock);
1548 for (idx = 0; idx < thread_count; idx++) {
1549 if (comp_param[idx].done) {
1550 comp_param[idx].done = false;
1551 bytes_xmit = qemu_put_qemu_file(ms->to_dst_file,
1552 comp_param[idx].file);
1553 qemu_mutex_lock(&comp_param[idx].mutex);
1554 set_compress_params(&comp_param[idx], block, offset);
1555 qemu_cond_signal(&comp_param[idx].cond);
1556 qemu_mutex_unlock(&comp_param[idx].mutex);
1558 update_compress_thread_counts(&comp_param[idx], bytes_xmit);
1564 * wait for a free thread if the user specifies 'compress-wait-thread',
1565 * otherwise we will post the page out in the main thread as a normal page.
1567 if (pages < 0 && wait) {
1568 qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1571 qemu_mutex_unlock(&comp_done_lock);
1576 #define PAGE_ALL_CLEAN 0
1577 #define PAGE_TRY_AGAIN 1
1578 #define PAGE_DIRTY_FOUND 2
1580 * find_dirty_block: find the next dirty page and update any state
1581 * associated with the search process.
1584 * PAGE_ALL_CLEAN: no dirty page found, give up
1585 * PAGE_TRY_AGAIN: no dirty page found, retry for next block
1586 * PAGE_DIRTY_FOUND: dirty page found
1588 * @rs: current RAM state
1589 * @pss: data about the state of the current dirty page scan
1592 static int find_dirty_block(RAMState *rs, PageSearchStatus *pss)
1594 /* Update pss->page for the next dirty bit in ramblock */
1595 pss_find_next_dirty(pss);
1597 if (pss->complete_round && pss->block == rs->last_seen_block &&
1598 pss->page >= rs->last_page) {
1600 * We've been once around the RAM and haven't found anything.
1603 return PAGE_ALL_CLEAN;
1605 if (!offset_in_ramblock(pss->block,
1606 ((ram_addr_t)pss->page) << TARGET_PAGE_BITS)) {
1607 /* Didn't find anything in this RAM Block */
1609 pss->block = QLIST_NEXT_RCU(pss->block, next);
1612 * If memory migration starts over, we will meet a dirtied page
1613 * which may still exist in the compression threads' ring, so we
1614 * should flush the compressed data to make sure the new page
1615 * is not overwritten by the old one in the destination.
1617 * Also, if xbzrle is on, stop using the data compression at this
1618 * point. In theory, xbzrle can do better than compression.
1620 flush_compressed_data(rs);
1622 /* Hit the end of the list */
1623 pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
1624 /* Flag that we've looped */
1625 pss->complete_round = true;
1626 /* After the first round, enable XBZRLE. */
1627 if (migrate_use_xbzrle()) {
1628 rs->xbzrle_enabled = true;
1631 /* Didn't find anything this time, but try again on the new block */
1632 return PAGE_TRY_AGAIN;
1634 /* We've found something */
1635 return PAGE_DIRTY_FOUND;
1640 * unqueue_page: gets a page off the queue
1642 * Helper for 'get_queued_page' - gets a page off the queue
1644 * Returns the block of the page (or NULL if none available)
1646 * @rs: current RAM state
1647 * @offset: used to return the offset within the RAMBlock
1649 static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
1651 struct RAMSrcPageRequest *entry;
1652 RAMBlock *block = NULL;
1654 if (!postcopy_has_request(rs)) {
1658 QEMU_LOCK_GUARD(&rs->src_page_req_mutex);
1661 * This should _never_ change even after we take the lock, because no one
1662 * should be taking anything off the request list other than us.
1664 assert(postcopy_has_request(rs));
1666 entry = QSIMPLEQ_FIRST(&rs->src_page_requests);
1668 *offset = entry->offset;
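/* Consume one target page from the head request; the request is only
* dropped (and the block unreferenced) once it has been fully served. */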
1670 if (entry->len > TARGET_PAGE_SIZE) {
1671 entry->len -= TARGET_PAGE_SIZE;
1672 entry->offset += TARGET_PAGE_SIZE;
1674 memory_region_unref(block->mr);
1675 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1677 migration_consume_urgent_request();
1683 #if defined(__linux__)
1685 * poll_fault_page: try to get next UFFD write fault page and, if pending fault
1686 * is found, return RAM block pointer and page offset
1688 * Returns pointer to the RAMBlock containing faulting page,
1689 * NULL if no write faults are pending
1691 * @rs: current RAM state
1692 * @offset: page offset from the beginning of the block
1694 static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
1696 struct uffd_msg uffd_msg;
1701 if (!migrate_background_snapshot()) {
1705 res = uffd_read_events(rs->uffdio_fd, &uffd_msg, 1);
1710 page_address = (void *)(uintptr_t) uffd_msg.arg.pagefault.address;
1711 block = qemu_ram_block_from_host(page_address, false, offset);
1712 assert(block && (block->flags & RAM_UF_WRITEPROTECT) != 0);
1717 * ram_save_release_protection: release UFFD write protection after
1718 * a range of pages has been saved
1720 * @rs: current RAM state
1721 * @pss: page-search-status structure
1722 * @start_page: index of the first page in the range relative to pss->block
1724 * Returns 0 on success, negative value in case of an error
1726 static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
1727 unsigned long start_page)
1731 /* Check if page is from UFFD-managed region. */
1732 if (pss->block->flags & RAM_UF_WRITEPROTECT) {
1733 void *page_address = pss->block->host + (start_page << TARGET_PAGE_BITS);
1734 uint64_t run_length = (pss->page - start_page) << TARGET_PAGE_BITS;
1736 /* Flush async buffers before un-protect. */
1737 qemu_fflush(pss->pss_channel);
1738 /* Un-protect memory range. */
1739 res = uffd_change_protection(rs->uffdio_fd, page_address, run_length,
1746 /* ram_write_tracking_available: check if kernel supports required UFFD features
1748 * Returns true if supported, false otherwise
1750 bool ram_write_tracking_available(void)
1752 uint64_t uffd_features;
1755 res = uffd_query_features(&uffd_features);
1757 (uffd_features & UFFD_FEATURE_PAGEFAULT_FLAG_WP) != 0);
1760 /* ram_write_tracking_compatible: check if guest configuration is
1761 * compatible with 'write-tracking'
1763 * Returns true if compatible, false otherwise
1765 bool ram_write_tracking_compatible(void)
1767 const uint64_t uffd_ioctls_mask = BIT(_UFFDIO_WRITEPROTECT);
1772 /* Open UFFD file descriptor */
1773 uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, false);
1778 RCU_READ_LOCK_GUARD();
1780 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1781 uint64_t uffd_ioctls;
1783 /* Nothing to do with read-only and MMIO-writable regions */
1784 if (block->mr->readonly || block->mr->rom_device) {
1787 /* Try to register block memory via UFFD-IO to track writes */
1788 if (uffd_register_memory(uffd_fd, block->host, block->max_length,
1789 UFFDIO_REGISTER_MODE_WP, &uffd_ioctls)) {
1792 if ((uffd_ioctls & uffd_ioctls_mask) != uffd_ioctls_mask) {
1799 uffd_close_fd(uffd_fd);
1803 static inline void populate_read_range(RAMBlock *block, ram_addr_t offset,
1806 const ram_addr_t end = offset + size;
1809 * We read one byte of each page; this will preallocate page tables if
1810 * required and populate the shared zeropage on MAP_PRIVATE anonymous memory
1811 * where no page was populated yet. This might require adaptation when
1812 * supporting other mappings, like shmem.
1814 for (; offset < end; offset += block->page_size) {
1815 char tmp = *((char *)block->host + offset);
1817 /* Don't optimize the read out */
1818 asm volatile("" : "+r" (tmp));
1822 static inline int populate_read_section(MemoryRegionSection *section,
1825 const hwaddr size = int128_get64(section->size);
1826 hwaddr offset = section->offset_within_region;
1827 RAMBlock *block = section->mr->ram_block;
1829 populate_read_range(block, offset, size);
1834 * ram_block_populate_read: preallocate page tables and populate pages in the
1835 * RAM block by reading a byte of each page.
1837 * Since it's solely used for the userfault_fd WP feature, here we just
1838 * hardcode the page size to qemu_real_host_page_size.
1840 * @block: RAM block to populate
1842 static void ram_block_populate_read(RAMBlock *rb)
1845 * Skip populating all pages that fall into a discarded range as managed by
1846 * a RamDiscardManager responsible for the mapped memory region of the
1847 * RAMBlock. Such discarded ("logically unplugged") parts of a RAMBlock
1848 * must not get populated automatically. We don't have to track
1849 * modifications via userfaultfd WP reliably, because these pages will
1850 * not be part of the migration stream either way -- see
1851 * ramblock_dirty_bitmap_exclude_discarded_pages().
1853 * Note: The result is only stable while migrating (precopy/postcopy).
1855 if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
1856 RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
1857 MemoryRegionSection section = {
1859 .offset_within_region = 0,
1860 .size = rb->mr->size,
1863 ram_discard_manager_replay_populated(rdm, &section,
1864 populate_read_section, NULL);
1866 populate_read_range(rb, 0, rb->used_length);
1871 * ram_write_tracking_prepare: prepare for UFFD-WP memory tracking
1873 void ram_write_tracking_prepare(void)
1877 RCU_READ_LOCK_GUARD();
1879 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1880 /* Nothing to do with read-only and MMIO-writable regions */
1881 if (block->mr->readonly || block->mr->rom_device) {
1886 * Populate pages of the RAM block before enabling userfault_fd
1889 * This stage is required since ioctl(UFFDIO_WRITEPROTECT) with
1890 * UFFDIO_WRITEPROTECT_MODE_WP mode setting would silently skip
1891 * pages with pte_none() entries in the page table.
1893 ram_block_populate_read(block);
1897 static inline int uffd_protect_section(MemoryRegionSection *section,
1900 const hwaddr size = int128_get64(section->size);
1901 const hwaddr offset = section->offset_within_region;
1902 RAMBlock *rb = section->mr->ram_block;
1903 int uffd_fd = (uintptr_t)opaque;
1905 return uffd_change_protection(uffd_fd, rb->host + offset, size, true,
1909 static int ram_block_uffd_protect(RAMBlock *rb, int uffd_fd)
1911 assert(rb->flags & RAM_UF_WRITEPROTECT);
1913 /* See ram_block_populate_read() */
1914 if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
1915 RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
1916 MemoryRegionSection section = {
1918 .offset_within_region = 0,
1919 .size = rb->mr->size,
1922 return ram_discard_manager_replay_populated(rdm, &section,
1923 uffd_protect_section,
1924 (void *)(uintptr_t)uffd_fd);
1926 return uffd_change_protection(uffd_fd, rb->host,
1927 rb->used_length, true, false);
1931 * ram_write_tracking_start: start UFFD-WP memory tracking
1933 * Returns 0 for success or negative value in case of error
1935 int ram_write_tracking_start(void)
1938 RAMState *rs = ram_state;
1941 /* Open UFFD file descriptor */
1942 uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, true);
1946 rs->uffdio_fd = uffd_fd;
1948 RCU_READ_LOCK_GUARD();
1950 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1951 /* Nothing to do with read-only and MMIO-writable regions */
1952 if (block->mr->readonly || block->mr->rom_device) {
1956 /* Register block memory with UFFD to track writes */
1957 if (uffd_register_memory(rs->uffdio_fd, block->host,
1958 block->max_length, UFFDIO_REGISTER_MODE_WP, NULL)) {
1961 block->flags |= RAM_UF_WRITEPROTECT;
1962 memory_region_ref(block->mr);
1964 /* Apply UFFD write protection to the block memory range */
1965 if (ram_block_uffd_protect(block, uffd_fd)) {
1969 trace_ram_write_tracking_ramblock_start(block->idstr, block->page_size,
1970 block->host, block->max_length);
1976 error_report("ram_write_tracking_start() failed: restoring initial memory state");
1978 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1979 if ((block->flags & RAM_UF_WRITEPROTECT) == 0) {
1982 uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length);
1983 /* Cleanup flags and remove reference */
1984 block->flags &= ~RAM_UF_WRITEPROTECT;
1985 memory_region_unref(block->mr);
1988 uffd_close_fd(uffd_fd);
1994 * ram_write_tracking_stop: stop UFFD-WP memory tracking and remove protection
1996 void ram_write_tracking_stop(void)
1998 RAMState *rs = ram_state;
2001 RCU_READ_LOCK_GUARD();
2003 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2004 if ((block->flags & RAM_UF_WRITEPROTECT) == 0) {
2007 uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length);
2009 trace_ram_write_tracking_ramblock_stop(block->idstr, block->page_size,
2010 block->host, block->max_length);
2012 /* Cleanup flags and remove reference */
2013 block->flags &= ~RAM_UF_WRITEPROTECT;
2014 memory_region_unref(block->mr);
2017 /* Finally close UFFD file descriptor */
2018 uffd_close_fd(rs->uffdio_fd);
2023 /* No target OS support, stubs just fail or ignore */
2025 static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
2033 static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
2034 unsigned long start_page)
2043 bool ram_write_tracking_available(void)
2048 bool ram_write_tracking_compatible(void)
2054 int ram_write_tracking_start(void)
2060 void ram_write_tracking_stop(void)
2064 #endif /* defined(__linux__) */
2067 * get_queued_page: unqueue a page from the postcopy requests
2069 * Skips pages that are already sent (!dirty)
2071 * Returns true if a queued page is found
2073 * @rs: current RAM state
2074 * @pss: data about the state of the current dirty page scan
2076 static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
2083 block = unqueue_page(rs, &offset);
2085 * We're sending this page, and since it's postcopy nothing else
2086 * will dirty it, and we must make sure it doesn't get sent again
2087 * even if this queue request was received after the background
2088 * search already sent it.
2093 page = offset >> TARGET_PAGE_BITS;
2094 dirty = test_bit(page, block->bmap);
2096 trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
2099 trace_get_queued_page(block->idstr, (uint64_t)offset, page);
2103 } while (block && !dirty);
2107 * Poll write faults too if background snapshot is enabled; that's
2108 * when vCPUs may be blocked by write-protected pages.
2110 block = poll_fault_page(rs, &offset);
2115 * We want the background search to continue from the queued page
2116 * since the guest is likely to want other pages near to the page
2117 * it just requested.
2120 pss->page = offset >> TARGET_PAGE_BITS;
2123 * This unqueued page would break the "one round" check, even if the page is not dirty.
2126 pss->complete_round = false;
2133 * migration_page_queue_free: drop any remaining pages in the ram request queue
2136 * It should be empty at the end anyway, but in error cases there may
2137 * be some left; in case any page is left, we drop it.
2140 static void migration_page_queue_free(RAMState *rs)
2142 struct RAMSrcPageRequest *mspr, *next_mspr;
2143 /* This queue generally should be empty - but in the case of a failed
2144 * migration it might have some droppings left in.
2146 RCU_READ_LOCK_GUARD();
2147 QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
2148 memory_region_unref(mspr->rb->mr);
2149 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
2155 * ram_save_queue_pages: queue the page for transmission
2157 * A request from postcopy destination for example.
2159 * Returns zero on success or negative on error
2161 * @rbname: Name of the RAMBlock of the request. NULL means the
2162 * same as the last one.
2163 * @start: starting address from the start of the RAMBlock
2164 * @len: length (in bytes) to send
2166 int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
2169 RAMState *rs = ram_state;
2171 ram_counters.postcopy_requests++;
2172 RCU_READ_LOCK_GUARD();
2175 /* Reuse last RAMBlock */
2176 ramblock = rs->last_req_rb;
2180 * Shouldn't happen, we can't reuse the last RAMBlock if
2181 * it's the 1st request.
2183 error_report("ram_save_queue_pages no previous block");
2187 ramblock = qemu_ram_block_by_name(rbname);
2190 /* We shouldn't be asked for a non-existent RAMBlock */
2191 error_report("ram_save_queue_pages no block '%s'", rbname);
2194 rs->last_req_rb = ramblock;
2196 trace_ram_save_queue_pages(ramblock->idstr, start, len);
2197 if (!offset_in_ramblock(ramblock, start + len - 1)) {
2198 error_report("%s request overrun start=" RAM_ADDR_FMT " len="
2199 RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
2200 __func__, start, len, ramblock->used_length);
2205 * When postcopy preempt is enabled, we send back the page directly in the
2208 if (postcopy_preempt_active()) {
2209 ram_addr_t page_start = start >> TARGET_PAGE_BITS;
2210 size_t page_size = qemu_ram_pagesize(ramblock);
2211 PageSearchStatus *pss = &ram_state->pss[RAM_CHANNEL_POSTCOPY];
2214 qemu_mutex_lock(&rs->bitmap_mutex);
2216 pss_init(pss, ramblock, page_start);
2218 * Always use the preempt channel, and make sure it's there. It's
2219 * safe to access without lock, because when rp-thread is running
2220 * we should be the only one who operates on the qemufile
2222 pss->pss_channel = migrate_get_current()->postcopy_qemufile_src;
2223 assert(pss->pss_channel);
2226 * It must be one host page or a multiple of the host page size. Just
2227 * assert; if something is wrong we're mostly split-brain anyway.
2229 assert(len % page_size == 0);
2231 if (ram_save_host_page_urgent(pss)) {
2232 error_report("%s: ram_save_host_page_urgent() failed: "
2233 "ramblock=%s, start_addr=0x"RAM_ADDR_FMT,
2234 __func__, ramblock->idstr, start);
2239 * NOTE: after ram_save_host_page_urgent() succeeded, pss->page
2240 * will automatically be moved and point to the next host page
2241 * we're going to send, so no need to update here.
2243 * Normally QEMU never sends >1 host page in requests, so
2244 * logically we don't even need that as the loop should only
2245 * run once, but just to be consistent.
2249 qemu_mutex_unlock(&rs->bitmap_mutex);
2254 struct RAMSrcPageRequest *new_entry =
2255 g_new0(struct RAMSrcPageRequest, 1);
2256 new_entry->rb = ramblock;
2257 new_entry->offset = start;
2258 new_entry->len = len;
2260 memory_region_ref(ramblock->mr);
2261 qemu_mutex_lock(&rs->src_page_req_mutex);
2262 QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
2263 migration_make_urgent_request();
2264 qemu_mutex_unlock(&rs->src_page_req_mutex);
2269 static bool save_page_use_compression(RAMState *rs)
2271 if (!migrate_use_compression()) {
2276 * If xbzrle is enabled (e.g., after the first round of migration), stop
2277 * using the data compression. In theory, xbzrle can do better than
2280 if (rs->xbzrle_enabled) {
2288 * try to compress the page before posting it out, return true if the page
2289 * has been properly handled by compression, otherwise needs other
2290 * paths to handle it
2292 static bool save_compress_page(RAMState *rs, PageSearchStatus *pss,
2293 RAMBlock *block, ram_addr_t offset)
2295 if (!save_page_use_compression(rs)) {
2300 * When starting the process of a new block, the first page of
2301 * the block should be sent out before other pages in the same
2302 * block, and all the pages in the last block should have been sent
2303 * out. Keeping this order is important, because the 'cont' flag
2304 * is used to avoid resending the block name.
2306 * We post the first page as a normal page because compression takes
2307 * a lot of CPU time.
2309 if (block != pss->last_sent_block) {
2310 flush_compressed_data(rs);
2314 if (compress_page_with_multi_thread(block, offset) > 0) {
2318 compression_counters.busy++;
2323 * ram_save_target_page_legacy: save one target page
2325 * Returns the number of pages written
2327 * @rs: current RAM state
2328 * @pss: data about the page we want to send
2330 static int ram_save_target_page_legacy(RAMState *rs, PageSearchStatus *pss)
2332 RAMBlock *block = pss->block;
2333 ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
2336 if (control_save_page(pss, block, offset, &res)) {
2340 if (save_compress_page(rs, pss, block, offset)) {
2344 res = save_zero_page(pss, pss->pss_channel, block, offset);
2346 /* Must let xbzrle know, otherwise a previous (now 0'd) cached
2347 * page would be stale
2349 if (rs->xbzrle_enabled) {
2350 XBZRLE_cache_lock();
2351 xbzrle_cache_zero_page(rs, block->offset + offset);
2352 XBZRLE_cache_unlock();
2358 * Do not use multifd in postcopy as one whole host page should be
2359 * placed. Meanwhile postcopy requires atomic update of pages, so even
2360 * if host page size == guest page size, the destination guest may
2361 * still see partially copied pages at runtime, which is data corruption.
2363 if (migrate_use_multifd() && !migration_in_postcopy()) {
2364 return ram_save_multifd_page(pss->pss_channel, block, offset);
2367 return ram_save_page(rs, pss);
2370 /* Should be called before sending a host page */
2371 static void pss_host_page_prepare(PageSearchStatus *pss)
2373 /* How many guest pages are there in one host page? */
2374 size_t guest_pfns = qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
2376 pss->host_page_sending = true;
2377 if (guest_pfns <= 1) {
2379 * This covers both when guest psize == host psize, or when guest
2380 * has larger psize than the host (guest_pfns==0).
2382 * For the latter, we always send one whole guest page per
2383 * iteration of the host page (example: an Alpha VM on x86 host
2384 * will have guest psize 8K while host psize 4K).
2386 pss->host_page_start = pss->page;
2387 pss->host_page_end = pss->page + 1;
2390 * The host page spans over multiple guest pages, we send them
2391 * within the same host page iteration.
2393 pss->host_page_start = ROUND_DOWN(pss->page, guest_pfns);
2394 pss->host_page_end = ROUND_UP(pss->page + 1, guest_pfns);
2399 * Whether the page pointed by PSS is within the host page being sent.
2400 * Must be called after a previous pss_host_page_prepare().
2402 static bool pss_within_range(PageSearchStatus *pss)
2404 ram_addr_t ram_addr;
2406 assert(pss->host_page_sending);
2408 /* Over host-page boundary? */
2409 if (pss->page >= pss->host_page_end) {
2413 ram_addr = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
2415 return offset_in_ramblock(pss->block, ram_addr);
2418 static void pss_host_page_finish(PageSearchStatus *pss)
2420 pss->host_page_sending = false;
2421 /* This is not needed, but just to reset it */
2422 pss->host_page_start = pss->host_page_end = 0;
2426 * Send an urgent host page specified by `pss'. Need to be called with
2427 * bitmap_mutex held.
2429 * Returns 0 if saving the host page succeeded, negative otherwise.
2431 static int ram_save_host_page_urgent(PageSearchStatus *pss)
2433 bool page_dirty, sent = false;
2434 RAMState *rs = ram_state;
2437 trace_postcopy_preempt_send_host_page(pss->block->idstr, pss->page);
2438 pss_host_page_prepare(pss);
2441 * If precopy is sending the same page, let it be done in precopy, or
2442 * we could send the same page in two channels and none of them will
2443 * receive the whole page.
2445 if (pss_overlap(pss, &ram_state->pss[RAM_CHANNEL_PRECOPY])) {
2446 trace_postcopy_preempt_hit(pss->block->idstr,
2447 pss->page << TARGET_PAGE_BITS);
2452 page_dirty = migration_bitmap_clear_dirty(rs, pss->block, pss->page);
2455 /* Be strict with the return code; it must be 1 (one page saved), anything else is an error */
2456 if (migration_ops->ram_save_target_page(rs, pss) != 1) {
2457 error_report_once("%s: ram_save_target_page failed", __func__);
2463 pss_find_next_dirty(pss);
2464 } while (pss_within_range(pss));
2466 pss_host_page_finish(pss);
2467 /* For urgent requests, flush immediately if sent */
2469 qemu_fflush(pss->pss_channel);
2475 * ram_save_host_page: save a whole host page
2477 * Starting at *offset send pages up to the end of the current host
2478 * page. It's valid for the initial offset to point into the middle of
2479 * a host page in which case the remainder of the hostpage is sent.
2480 * Only dirty target pages are sent. Note that the host page size may
2481 * be a huge page for this block.
2483 * The saving stops at the boundary of the used_length of the block
2484 * if the RAMBlock isn't a multiple of the host page size.
2486 * The caller must hold ram_state.bitmap_mutex when calling this
2487 * function. Note that this function can temporarily release the lock, but
2488 * when the function returns it will make sure the lock is still held.
2490 * Returns the number of pages written or negative on error
2492 * @rs: current RAM state
2493 * @pss: data about the page we want to send
2495 static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss)
2497 bool page_dirty, preempt_active = postcopy_preempt_active();
2498 int tmppages, pages = 0;
2499 size_t pagesize_bits =
2500 qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
2501 unsigned long start_page = pss->page;
2504 if (ramblock_is_ignored(pss->block)) {
2505 error_report("block %s should not be migrated !", pss->block->idstr);
2509 /* Update host page boundary information */
2510 pss_host_page_prepare(pss);
2513 page_dirty = migration_bitmap_clear_dirty(rs, pss->block, pss->page);
2515 /* Check whether the page is dirty and, if it is, send it */
2518 * Properly yield the lock only in postcopy preempt mode
2519 * because both migration thread and rp-return thread can
2520 * operate on the bitmaps.
2522 if (preempt_active) {
2523 qemu_mutex_unlock(&rs->bitmap_mutex);
2525 tmppages = migration_ops->ram_save_target_page(rs, pss);
2526 if (tmppages >= 0) {
2529 * Allow rate limiting to happen in the middle of huge pages if
2530 * something is sent in the current iteration.
2532 if (pagesize_bits > 1 && tmppages > 0) {
2533 migration_rate_limit();
2536 if (preempt_active) {
2537 qemu_mutex_lock(&rs->bitmap_mutex);
2544 pss_host_page_finish(pss);
2548 pss_find_next_dirty(pss);
2549 } while (pss_within_range(pss));
2551 pss_host_page_finish(pss);
2553 res = ram_save_release_protection(rs, pss, start_page);
2554 return (res < 0 ? res : pages);
2558 * ram_find_and_save_block: finds a dirty page and sends it to f
2560 * Called within an RCU critical section.
2562 * Returns the number of pages written where zero means no dirty pages,
2563 * or negative on error
2565 * @rs: current RAM state
2567 * On systems where host-page-size > target-page-size it will send all the
2568 * pages in a host page that are dirty.
2570 static int ram_find_and_save_block(RAMState *rs)
2572 PageSearchStatus *pss = &rs->pss[RAM_CHANNEL_PRECOPY];
2575 /* No dirty page as there is zero RAM */
2576 if (!rs->ram_bytes_total) {
2581 * Always keep last_seen_block/last_page valid during this procedure,
2582 * because find_dirty_block() relies on these values (e.g., we compare
2583 * last_seen_block with pss.block to see whether we searched all the
2584 * ramblocks) to detect the completion of migration. Having a NULL value
2585 * of last_seen_block can conditionally cause the loop below to run forever.
2587 if (!rs->last_seen_block) {
2588 rs->last_seen_block = QLIST_FIRST_RCU(&ram_list.blocks);
2592 pss_init(pss, rs->last_seen_block, rs->last_page);
2595 if (!get_queued_page(rs, pss)) {
2596 /* priority queue empty, so just search for something dirty */
2597 int res = find_dirty_block(rs, pss);
2598 if (res != PAGE_DIRTY_FOUND) {
2599 if (res == PAGE_ALL_CLEAN) {
2601 } else if (res == PAGE_TRY_AGAIN) {
2606 pages = ram_save_host_page(rs, pss);
2612 rs->last_seen_block = pss->block;
2613 rs->last_page = pss->page;
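/*
 * Account for 'size' bytes of page data: bump the duplicate (zero) or
 * normal page counters, record the transferred bytes and credit the
 * QEMUFile for rate limiting purposes.
 */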
2618 void acct_update_position(QEMUFile *f, size_t size, bool zero)
2620 uint64_t pages = size / TARGET_PAGE_SIZE;
2623 stat64_add(&ram_counters.duplicate, pages);
2625 stat64_add(&ram_counters.normal, pages);
2626 ram_transferred_add(size);
2627 qemu_file_credit_transfer(f, size);
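/*
 * Total RAM size helpers: the "_with_ignored" variant counts every
 * migratable block (including ones whose contents are skipped via
 * ignore-shared), while ram_bytes_total() counts only the blocks whose
 * contents are actually migrated.
 */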
2631 static uint64_t ram_bytes_total_with_ignored(void)
2636 RCU_READ_LOCK_GUARD();
2638 RAMBLOCK_FOREACH_MIGRATABLE(block) {
2639 total += block->used_length;
2644 uint64_t ram_bytes_total(void)
2649 RCU_READ_LOCK_GUARD();
2651 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2652 total += block->used_length;
2657 static void xbzrle_load_setup(void)
2659 XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE);
2662 static void xbzrle_load_cleanup(void)
2664 g_free(XBZRLE.decoded_buf);
2665 XBZRLE.decoded_buf = NULL;
2668 static void ram_state_cleanup(RAMState **rsp)
2671 migration_page_queue_free(*rsp);
2672 qemu_mutex_destroy(&(*rsp)->bitmap_mutex);
2673 qemu_mutex_destroy(&(*rsp)->src_page_req_mutex);
2679 static void xbzrle_cleanup(void)
2681 XBZRLE_cache_lock();
2683 cache_fini(XBZRLE.cache);
2684 g_free(XBZRLE.encoded_buf);
2685 g_free(XBZRLE.current_buf);
2686 g_free(XBZRLE.zero_target_page);
2687 XBZRLE.cache = NULL;
2688 XBZRLE.encoded_buf = NULL;
2689 XBZRLE.current_buf = NULL;
2690 XBZRLE.zero_target_page = NULL;
2692 XBZRLE_cache_unlock();
2695 static void ram_save_cleanup(void *opaque)
2697 RAMState **rsp = opaque;
2700 /* We don't use dirty log with background snapshots */
2701 if (!migrate_background_snapshot()) {
2702 /* The caller holds the iothread lock or is in a bottom half, so there is
2703 * no write race against the migration bitmap
2705 if (global_dirty_tracking & GLOBAL_DIRTY_MIGRATION) {
2707 * do not stop dirty log without starting it, since
2708 * memory_global_dirty_log_stop will assert that
2709 * memory_global_dirty_log_start/stop are used in pairs
2711 memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION);
2715 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2716 g_free(block->clear_bmap);
2717 block->clear_bmap = NULL;
2718 g_free(block->bmap);
2723 compress_threads_save_cleanup();
2724 ram_state_cleanup(rsp);
2725 g_free(migration_ops);
2726 migration_ops = NULL;
2729 static void ram_state_reset(RAMState *rs)
2733 for (i = 0; i < RAM_CHANNEL_MAX; i++) {
2734 rs->pss[i].last_sent_block = NULL;
2737 rs->last_seen_block = NULL;
2739 rs->last_version = ram_list.version;
2740 rs->xbzrle_enabled = false;
2743 #define MAX_WAIT 50 /* ms, half buffered_file limit */
2745 /* **** functions for postcopy ***** */
2747 void ram_postcopy_migrated_memory_release(MigrationState *ms)
2749 struct RAMBlock *block;
2751 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2752 unsigned long *bitmap = block->bmap;
2753 unsigned long range = block->used_length >> TARGET_PAGE_BITS;
2754 unsigned long run_start = find_next_zero_bit(bitmap, range, 0);
2756 while (run_start < range) {
2757 unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
2758 ram_discard_range(block->idstr,
2759 ((ram_addr_t)run_start) << TARGET_PAGE_BITS,
2760 ((ram_addr_t)(run_end - run_start))
2761 << TARGET_PAGE_BITS);
2762 run_start = find_next_zero_bit(bitmap, range, run_end + 1);
2768 * postcopy_send_discard_bm_ram: discard a RAMBlock
2770 * Callback from postcopy_each_ram_send_discard for each RAMBlock
2772 * @ms: current migration state
2773 * @block: RAMBlock to discard
2775 static void postcopy_send_discard_bm_ram(MigrationState *ms, RAMBlock *block)
2777 unsigned long end = block->used_length >> TARGET_PAGE_BITS;
2778 unsigned long current;
2779 unsigned long *bitmap = block->bmap;
2781 for (current = 0; current < end; ) {
2782 unsigned long one = find_next_bit(bitmap, end, current);
2783 unsigned long zero, discard_length;
2789 zero = find_next_zero_bit(bitmap, end, one + 1);
2792 discard_length = end - one;
2794 discard_length = zero - one;
2796 postcopy_discard_send_range(ms, one, discard_length);
2797 current = one + discard_length;
2801 static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block);
2804 * postcopy_each_ram_send_discard: discard all RAMBlocks
2806 * Utility for the outgoing postcopy code.
2807 * Calls postcopy_send_discard_bm_ram for each RAMBlock
2808 * passing it bitmap indexes and name.
2809 * (qemu_ram_foreach_block ends up passing unscaled lengths
2810 * which would mean postcopy code would have to deal with target page)
2812 * @ms: current migration state
2814 static void postcopy_each_ram_send_discard(MigrationState *ms)
2816 struct RAMBlock *block;
2818 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2819 postcopy_discard_send_init(ms, block->idstr);
2822 * Deal with TPS != HPS and huge pages. It discards any partially sent
2823 * host-page size chunks and marks any partially dirty host-page size
2824 * chunks as all dirty. In this case the host-page is the host-page
2825 * for the particular RAMBlock, i.e. it might be a huge page.
2827 postcopy_chunk_hostpages_pass(ms, block);
2830 * Postcopy sends chunks of bitmap over the wire, but it
2831 * just needs indexes at this point, which avoids it having
2832 * target page specific code.
2834 postcopy_send_discard_bm_ram(ms, block);
2835 postcopy_discard_send_finish(ms);
2840 * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages
2842 * Helper for postcopy_chunk_hostpages; it's called twice to
2843 * canonicalize the two bitmaps, that are similar, but one is
2846 * Postcopy requires that all target pages in a hostpage are dirty or
2847 * clean, not a mix. This function canonicalizes the bitmaps.
2849 * @ms: current migration state
2850 * @block: block that contains the page we want to canonicalize
2852 static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block)
2854 RAMState *rs = ram_state;
2855 unsigned long *bitmap = block->bmap;
2856 unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
2857 unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
2858 unsigned long run_start;
2860 if (block->page_size == TARGET_PAGE_SIZE) {
2861 /* Easy case - TPS==HPS for a non-huge page RAMBlock */
2865 /* Find a dirty page */
2866 run_start = find_next_bit(bitmap, pages, 0);
2868 while (run_start < pages) {
2871 * If the start of this run of pages is in the middle of a host
2872 * page, then we need to fixup this host page.
2874 if (QEMU_IS_ALIGNED(run_start, host_ratio)) {
2875 /* Find the end of this run */
2876 run_start = find_next_zero_bit(bitmap, pages, run_start + 1);
2878 * If the end isn't at the start of a host page, then the
2879 * run doesn't finish at the end of a host page
2880 * and we need to discard.
2884 if (!QEMU_IS_ALIGNED(run_start, host_ratio)) {
2886 unsigned long fixup_start_addr = QEMU_ALIGN_DOWN(run_start,
2888 run_start = QEMU_ALIGN_UP(run_start, host_ratio);
2890 /* Clean up the bitmap */
2891 for (page = fixup_start_addr;
2892 page < fixup_start_addr + host_ratio; page++) {
2894 * Remark them as dirty, updating the count for any pages
2895 * that weren't previously dirty.
2897 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
2901 /* Find the next dirty page for the next iteration */
2902 run_start = find_next_bit(bitmap, pages, run_start);
2907 * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
2909 * Transmit the set of pages to be discarded after precopy to the target;
2910 * these are pages that:
2911 * a) Have been previously transmitted but are now dirty again
2912 * b) Pages that have never been transmitted; this ensures that
2913 * any pages on the destination that have been mapped by background
2914 * tasks get discarded (transparent huge pages are the specific concern)
2915 * Hopefully this is pretty sparse
2917 * @ms: current migration state
2919 void ram_postcopy_send_discard_bitmap(MigrationState *ms)
2921 RAMState *rs = ram_state;
2923 RCU_READ_LOCK_GUARD();
2925 /* This should be our last sync, the src is now paused */
2926 migration_bitmap_sync(rs);
2928 /* Easiest way to make sure we don't resume in the middle of a host-page */
2929 rs->pss[RAM_CHANNEL_PRECOPY].last_sent_block = NULL;
2930 rs->last_seen_block = NULL;
2933 postcopy_each_ram_send_discard(ms);
2935 trace_ram_postcopy_send_discard_bitmap();
2939 * ram_discard_range: discard dirtied pages at the beginning of postcopy
2941 * Returns zero on success
2943 * @rbname: name of the RAMBlock of the request. NULL means the
2944 * same as the last one.
2945 * @start: byte offset within the RAMBlock where the discard starts
2946 * @length: number of bytes to discard
2948 int ram_discard_range(const char *rbname, uint64_t start, size_t length)
2950 trace_ram_discard_range(rbname, start, length);
2952 RCU_READ_LOCK_GUARD();
2953 RAMBlock *rb = qemu_ram_block_by_name(rbname);
2956 error_report("ram_discard_range: Failed to find block '%s'", rbname);
2961 * On source VM, we don't need to update the received bitmap since
2962 * we don't even have one.
2964 if (rb->receivedmap) {
2965 bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(),
2966 length >> qemu_target_page_bits());
2969 return ram_block_discard_range(rb, start, length);
2973 * For every allocation, we will try not to crash the VM if the
2974 * allocation fails.
2976 static int xbzrle_init(void)
2978 Error *local_err = NULL;
2980 if (!migrate_use_xbzrle()) {
2984 XBZRLE_cache_lock();
2986 XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE);
2987 if (!XBZRLE.zero_target_page) {
2988 error_report("%s: Error allocating zero page", __func__);
2992 XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(),
2993 TARGET_PAGE_SIZE, &local_err);
2994 if (!XBZRLE.cache) {
2995 error_report_err(local_err);
2996 goto free_zero_page;
2999 XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
3000 if (!XBZRLE.encoded_buf) {
3001 error_report("%s: Error allocating encoded_buf", __func__);
3005 XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
3006 if (!XBZRLE.current_buf) {
3007 error_report("%s: Error allocating current_buf", __func__);
3008 goto free_encoded_buf;
3011 /* We are all good */
3012 XBZRLE_cache_unlock();
3016 g_free(XBZRLE.encoded_buf);
3017 XBZRLE.encoded_buf = NULL;
3019 cache_fini(XBZRLE.cache);
3020 XBZRLE.cache = NULL;
3022 g_free(XBZRLE.zero_target_page);
3023 XBZRLE.zero_target_page = NULL;
3025 XBZRLE_cache_unlock();
3029 static int ram_state_init(RAMState **rsp)
3031 *rsp = g_try_new0(RAMState, 1);
3034 error_report("%s: Init ramstate fail", __func__);
3038 qemu_mutex_init(&(*rsp)->bitmap_mutex);
3039 qemu_mutex_init(&(*rsp)->src_page_req_mutex);
3040 QSIMPLEQ_INIT(&(*rsp)->src_page_requests);
3041 (*rsp)->ram_bytes_total = ram_bytes_total();
3044 * Count the total number of pages used by ram blocks not including any
3045 * gaps due to alignment or unplugs.
3046 * This must match the initial values of the dirty bitmap.
3048 (*rsp)->migration_dirty_pages = (*rsp)->ram_bytes_total >> TARGET_PAGE_BITS;
3049 ram_state_reset(*rsp);
3054 static void ram_list_init_bitmaps(void)
3056 MigrationState *ms = migrate_get_current();
3058 unsigned long pages;
3061 /* Skip setting bitmap if there is no RAM */
3062 if (ram_bytes_total()) {
3063 shift = ms->clear_bitmap_shift;
3064 if (shift > CLEAR_BITMAP_SHIFT_MAX) {
3065 error_report("clear_bitmap_shift (%u) too big, using "
3066 "max value (%u)", shift, CLEAR_BITMAP_SHIFT_MAX);
3067 shift = CLEAR_BITMAP_SHIFT_MAX;
3068 } else if (shift < CLEAR_BITMAP_SHIFT_MIN) {
3069 error_report("clear_bitmap_shift (%u) too small, using "
3070 "min value (%u)", shift, CLEAR_BITMAP_SHIFT_MIN);
3071 shift = CLEAR_BITMAP_SHIFT_MIN;
3074 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3075 pages = block->max_length >> TARGET_PAGE_BITS;
3077 * The initial dirty bitmap for migration must be set with all
3078 * ones to make sure we'll migrate every guest RAM page to
3080 * Here we set RAMBlock.bmap all to 1 because when we restart a
3081 * new migration after a failed migration, ram_list.
3082 * dirty_memory[DIRTY_MEMORY_MIGRATION] doesn't include the whole
3085 block->bmap = bitmap_new(pages);
3086 bitmap_set(block->bmap, 0, pages);
3087 block->clear_bmap_shift = shift;
3088 block->clear_bmap = bitmap_new(clear_bmap_size(pages, shift));
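/*
 * Drop pages that have been discarded (e.g. by a RamDiscardManager such
 * as virtio-mem) from the dirty bitmaps and from the dirty page count,
 * so that they are not migrated.
 */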
3093 static void migration_bitmap_clear_discarded_pages(RAMState *rs)
3095 unsigned long pages;
3098 RCU_READ_LOCK_GUARD();
3100 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3101 pages = ramblock_dirty_bitmap_clear_discarded_pages(rb);
3102 rs->migration_dirty_pages -= pages;
3106 static void ram_init_bitmaps(RAMState *rs)
3108 /* For memory_global_dirty_log_start below. */
3109 qemu_mutex_lock_iothread();
3110 qemu_mutex_lock_ramlist();
3112 WITH_RCU_READ_LOCK_GUARD() {
3113 ram_list_init_bitmaps();
3114 /* We don't use dirty log with background snapshots */
3115 if (!migrate_background_snapshot()) {
3116 memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION);
3117 migration_bitmap_sync_precopy(rs);
3120 qemu_mutex_unlock_ramlist();
3121 qemu_mutex_unlock_iothread();
3124 * After an eventual first bitmap sync, fixup the initial bitmap
3125 * containing all 1s to exclude any discarded pages from migration.
3127 migration_bitmap_clear_discarded_pages(rs);
3130 static int ram_init_all(RAMState **rsp)
3132 if (ram_state_init(rsp)) {
3136 if (xbzrle_init()) {
3137 ram_state_cleanup(rsp);
3141 ram_init_bitmaps(*rsp);
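/*
 * Prepare RAMState when resuming a paused postcopy migration: recount the
 * dirty pages from the existing per-block bitmaps (no fresh sync is needed
 * since the source is halted) and point the precopy channel at the new
 * QEMUFile.
 */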
3146 static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out)
3152 * Postcopy is not using xbzrle/compression, so no need for that.
3153 * Also, since the source is already halted, we don't need to care
3154 * about dirty page logging either.
3157 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3158 pages += bitmap_count_one(block->bmap,
3159 block->used_length >> TARGET_PAGE_BITS);
3162 /* This may not be aligned with current bitmaps. Recalculate. */
3163 rs->migration_dirty_pages = pages;
3165 ram_state_reset(rs);
3167 /* Update RAMState cache of output QEMUFile */
3168 rs->pss[RAM_CHANNEL_PRECOPY].pss_channel = out;
3170 trace_ram_state_resume_prepare(pages);
3174 * This function clears bits of the free pages reported by the caller from the
3175 * migration dirty bitmap. @addr is the host address corresponding to the
3176 * start of the contiguous guest free pages, and @len is the total bytes of
3179 void qemu_guest_free_page_hint(void *addr, size_t len)
3183 size_t used_len, start, npages;
3184 MigrationState *s = migrate_get_current();
3186 /* This function is currently expected to be used during live migration */
3187 if (!migration_is_setup_or_active(s->state)) {
3191 for (; len > 0; len -= used_len, addr += used_len) {
3192 block = qemu_ram_block_from_host(addr, false, &offset);
3193 if (unlikely(!block || offset >= block->used_length)) {
3195 * The implementation might not support RAMBlock resize during
3196 * live migration, but it could happen in theory with future
3197 * updates. So we add a check here to capture that case.
3199 error_report_once("%s unexpected error", __func__);
3203 if (len <= block->used_length - offset) {
3206 used_len = block->used_length - offset;
3209 start = offset >> TARGET_PAGE_BITS;
3210 npages = used_len >> TARGET_PAGE_BITS;
3212 qemu_mutex_lock(&ram_state->bitmap_mutex);
3214 * The skipped free pages are equivalent to having been sent, from clear_bmap's
3215 * perspective, so clear the bits from the memory region bitmap which
3216 * are initially set. Otherwise those skipped pages will be sent in
3217 * the next round after syncing from the memory region bitmap.
3219 migration_clear_memory_region_dirty_bitmap_range(block, start, npages);
3220 ram_state->migration_dirty_pages -=
3221 bitmap_count_one_with_offset(block->bmap, start, npages);
3222 bitmap_clear(block->bmap, start, npages);
3223 qemu_mutex_unlock(&ram_state->bitmap_mutex);
3228 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has
3229 * a long-running RCU critical section. When RCU reclaims in the code
3230 * start to become numerous it will be necessary to reduce the
3231 * granularity of these critical sections.
3235 * ram_save_setup: Setup RAM for migration
3237 * Returns zero to indicate success and negative for error
3239 * @f: QEMUFile where to send the data
3240 * @opaque: RAMState pointer
3242 static int ram_save_setup(QEMUFile *f, void *opaque)
3244 RAMState **rsp = opaque;
3248 if (compress_threads_save_setup()) {
3252 /* migration has already setup the bitmap, reuse it. */
3253 if (!migration_in_colo_state()) {
3254 if (ram_init_all(rsp) != 0) {
3255 compress_threads_save_cleanup();
3259 (*rsp)->pss[RAM_CHANNEL_PRECOPY].pss_channel = f;
3261 WITH_RCU_READ_LOCK_GUARD() {
3262 qemu_put_be64(f, ram_bytes_total_with_ignored()
3263 | RAM_SAVE_FLAG_MEM_SIZE);
3265 RAMBLOCK_FOREACH_MIGRATABLE(block) {
3266 qemu_put_byte(f, strlen(block->idstr));
3267 qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
3268 qemu_put_be64(f, block->used_length);
3269 if (migrate_postcopy_ram() && block->page_size !=
3270 qemu_host_page_size) {
3271 qemu_put_be64(f, block->page_size);
3273 if (migrate_ignore_shared()) {
3274 qemu_put_be64(f, block->mr->addr);
3279 ram_control_before_iterate(f, RAM_CONTROL_SETUP);
3280 ram_control_after_iterate(f, RAM_CONTROL_SETUP);
3282 migration_ops = g_malloc0(sizeof(MigrationOps));
3283 migration_ops->ram_save_target_page = ram_save_target_page_legacy;
3284 ret = multifd_send_sync_main(f);
3289 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3296 * ram_save_iterate: iterative stage for migration
3298 * Returns zero to indicate success and negative for error
3300 * @f: QEMUFile where to send the data
3301 * @opaque: RAMState pointer
3303 static int ram_save_iterate(QEMUFile *f, void *opaque)
3305 RAMState **temp = opaque;
3306 RAMState *rs = *temp;
3312 if (blk_mig_bulk_active()) {
3313 /* Avoid transferring ram during bulk phase of block migration as
3314 * the bulk phase will usually take a long time and transferring
3315 * ram updates during that time is pointless. */
3320 * We'll hold this lock for a while, but it's okay for two reasons.
3321 * Firstly, the only other thread that may take it is the one calling
3322 * qemu_guest_free_page_hint(), which should be rare; secondly, see
3323 * MAX_WAIT (if curious, further see commit 4508bd9ed8053ce) below, which
3324 * guarantees that we'll at least release it on a regular basis.
3326 qemu_mutex_lock(&rs->bitmap_mutex);
3327 WITH_RCU_READ_LOCK_GUARD() {
3328 if (ram_list.version != rs->last_version) {
3329 ram_state_reset(rs);
3332 /* Read version before ram_list.blocks */
3335 ram_control_before_iterate(f, RAM_CONTROL_ROUND);
3337 t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
3339 while ((ret = qemu_file_rate_limit(f)) == 0 ||
3340 postcopy_has_request(rs)) {
3343 if (qemu_file_get_error(f)) {
3347 pages = ram_find_and_save_block(rs);
3348 /* no more pages to send */
3355 qemu_file_set_error(f, pages);
3359 rs->target_page_count += pages;
3362 * During postcopy, it is necessary to make sure one whole host
3363 * page is sent in one chunk.
3365 if (migrate_postcopy_ram()) {
3366 flush_compressed_data(rs);
3370 * we want to check in the 1st loop, just in case it was the 1st
3371 * time and we had to sync the dirty bitmap.
3372 * qemu_clock_get_ns() is a bit expensive, so we only check each
3375 if ((i & 63) == 0) {
3376 uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) /
3378 if (t1 > MAX_WAIT) {
3379 trace_ram_save_iterate_big_wait(t1, i);
3386 qemu_mutex_unlock(&rs->bitmap_mutex);
3389 * Must occur before EOS (or any QEMUFile operation)
3390 * because of RDMA protocol.
3392 ram_control_after_iterate(f, RAM_CONTROL_ROUND);
3396 && migration_is_setup_or_active(migrate_get_current()->state)) {
3397 ret = multifd_send_sync_main(rs->pss[RAM_CHANNEL_PRECOPY].pss_channel);
3402 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3404 ram_transferred_add(8);
3406 ret = qemu_file_get_error(f);
3416 * ram_save_complete: function called to send the remaining amount of ram
3418 * Returns zero to indicate success or negative on error
3420 * Called with iothread lock
3422 * @f: QEMUFile where to send the data
3423 * @opaque: RAMState pointer
3425 static int ram_save_complete(QEMUFile *f, void *opaque)
3427 RAMState **temp = opaque;
3428 RAMState *rs = *temp;
3431 rs->last_stage = !migration_in_colo_state();
3433 WITH_RCU_READ_LOCK_GUARD() {
3434 if (!migration_in_postcopy()) {
3435 migration_bitmap_sync_precopy(rs);
3438 ram_control_before_iterate(f, RAM_CONTROL_FINISH);
3440 /* try transferring iterative blocks of memory */
3442 /* flush all remaining blocks regardless of rate limiting */
3443 qemu_mutex_lock(&rs->bitmap_mutex);
3447 pages = ram_find_and_save_block(rs);
3448 /* no more blocks to send */
3457 qemu_mutex_unlock(&rs->bitmap_mutex);
3459 flush_compressed_data(rs);
3460 ram_control_after_iterate(f, RAM_CONTROL_FINISH);
3467 ret = multifd_send_sync_main(rs->pss[RAM_CHANNEL_PRECOPY].pss_channel);
3472 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
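/*
 * Pending-size callbacks: the "estimate" variant reports the current dirty
 * page count without a bitmap sync (cheap but approximate), while the
 * "exact" variant below re-syncs the dirty bitmap first once the remaining
 * size has dropped under the threshold.
 */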
3478 static void ram_state_pending_estimate(void *opaque, uint64_t *must_precopy,
3479 uint64_t *can_postcopy)
3481 RAMState **temp = opaque;
3482 RAMState *rs = *temp;
3484 uint64_t remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3486 if (migrate_postcopy_ram()) {
3487 /* We can do postcopy, and all the data is postcopiable */
3488 *can_postcopy += remaining_size;
3490 *must_precopy += remaining_size;
3494 static void ram_state_pending_exact(void *opaque, uint64_t *must_precopy,
3495 uint64_t *can_postcopy)
3497 MigrationState *s = migrate_get_current();
3498 RAMState **temp = opaque;
3499 RAMState *rs = *temp;
3501 uint64_t remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3503 if (!migration_in_postcopy() && remaining_size < s->threshold_size) {
3504 qemu_mutex_lock_iothread();
3505 WITH_RCU_READ_LOCK_GUARD() {
3506 migration_bitmap_sync_precopy(rs);
3508 qemu_mutex_unlock_iothread();
3509 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3512 if (migrate_postcopy_ram()) {
3513 /* We can do postcopy, and all the data is postcopiable */
3514 *can_postcopy += remaining_size;
3516 *must_precopy += remaining_size;
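/*
 * load_xbzrle: read one XBZRLE-encoded page from the stream and apply the
 * delta on top of the current contents of 'host'.  Returns 0 on success,
 * negative on a malformed or oversized encoding.
 */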
3520 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
3522 unsigned int xh_len;
3524 uint8_t *loaded_data;
3526 /* extract RLE header */
3527 xh_flags = qemu_get_byte(f);
3528 xh_len = qemu_get_be16(f);
3530 if (xh_flags != ENCODING_FLAG_XBZRLE) {
3531 error_report("Failed to load XBZRLE page - wrong compression!");
3535 if (xh_len > TARGET_PAGE_SIZE) {
3536 error_report("Failed to load XBZRLE page - len overflow!");
3539 loaded_data = XBZRLE.decoded_buf;
3540 /* load data and decode */
3541 /* it can change loaded_data to point to an internal buffer */
3542 qemu_get_buffer_in_place(f, &loaded_data, xh_len);
3545 if (xbzrle_decode_buffer(loaded_data, xh_len, host,
3546 TARGET_PAGE_SIZE) == -1) {
3547 error_report("Failed to load XBZRLE page - decode error!");
3555 * ram_block_from_stream: read a RAMBlock id from the migration stream
3557 * Must be called from within a rcu critical section.
3559 * Returns a pointer from within the RCU-protected ram_list.
3561 * @mis: the migration incoming state pointer
3562 * @f: QEMUFile where to read the data from
3563 * @flags: Page flags (mostly to see if it's a continuation of previous block)
3564 * @channel: the channel we're using
3566 static inline RAMBlock *ram_block_from_stream(MigrationIncomingState *mis,
3567 QEMUFile *f, int flags,
3570 RAMBlock *block = mis->last_recv_block[channel];
3574 if (flags & RAM_SAVE_FLAG_CONTINUE) {
3576 error_report("Ack, bad migration stream!");
3582 len = qemu_get_byte(f);
3583 qemu_get_buffer(f, (uint8_t *)id, len);
3586 block = qemu_ram_block_by_name(id);
3588 error_report("Can't find block %s", id);
3592 if (ramblock_is_ignored(block)) {
3593 error_report("block %s should not be migrated !", id);
3597 mis->last_recv_block[channel] = block;
3602 static inline void *host_from_ram_block_offset(RAMBlock *block,
3605 if (!offset_in_ramblock(block, offset)) {
3609 return block->host + offset;
3612 static void *host_page_from_ram_block_offset(RAMBlock *block,
3615 /* Note: Explicitly no check against offset_in_ramblock(). */
3616 return (void *)QEMU_ALIGN_DOWN((uintptr_t)(block->host + offset),
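/* Byte offset of 'offset' within its (possibly huge) host page */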
3620 static ram_addr_t host_page_offset_from_ram_block_offset(RAMBlock *block,
3623 return ((uintptr_t)block->host + offset) & (block->page_size - 1);
3626 static inline void *colo_cache_from_block_offset(RAMBlock *block,
3627 ram_addr_t offset, bool record_bitmap)
3629 if (!offset_in_ramblock(block, offset)) {
3632 if (!block->colo_cache) {
3633 error_report("%s: colo_cache is NULL in block: %s",
3634 __func__, block->idstr);
3639 * During a COLO checkpoint, we need the bitmap of these migrated pages.
3640 * It helps us decide which pages in the ram cache should be flushed
3641 * into the VM's RAM later.
3643 if (record_bitmap &&
3644 !test_and_set_bit(offset >> TARGET_PAGE_BITS, block->bmap)) {
3645 ram_state->migration_dirty_pages++;
3647 return block->colo_cache + offset;
3651 * ram_handle_compressed: handle the zero page case
3653 * If a page (or a whole RDMA chunk) has been
3654 * determined to be zero, then zap it.
3656 * @host: host address for the zero page
3657 * @ch: the byte the page is filled with. We only support zero
3658 * @size: size of the zero page
3660 void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
3662 if (ch != 0 || !buffer_is_zero(host, size)) {
3663 memset(host, ch, size);
3667 /* return the size after decompression, or negative value on error */
3669 qemu_uncompress_data(z_stream *stream, uint8_t *dest, size_t dest_len,
3670 const uint8_t *source, size_t source_len)
3674 err = inflateReset(stream);
3679 stream->avail_in = source_len;
3680 stream->next_in = (uint8_t *)source;
3681 stream->avail_out = dest_len;
3682 stream->next_out = dest;
3684 err = inflate(stream, Z_NO_FLUSH);
3685 if (err != Z_STREAM_END) {
3689 return stream->total_out;
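/*
 * Decompression worker thread: wait for a request, inflate the compressed
 * buffer into the destination page, then mark itself done and signal the
 * waiter under decomp_done_lock.
 */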
3692 static void *do_data_decompress(void *opaque)
3694 DecompressParam *param = opaque;
3695 unsigned long pagesize;
3699 qemu_mutex_lock(¶m->mutex);
3700 while (!param->quit) {
3705 qemu_mutex_unlock(¶m->mutex);
3707 pagesize = TARGET_PAGE_SIZE;
3709 ret = qemu_uncompress_data(¶m->stream, des, pagesize,
3710 param->compbuf, len);
3711 if (ret < 0 && migrate_get_current()->decompress_error_check) {
3712 error_report("decompress data failed");
3713 qemu_file_set_error(decomp_file, ret);
3716 qemu_mutex_lock(&decomp_done_lock);
3718 qemu_cond_signal(&decomp_done_cond);
3719 qemu_mutex_unlock(&decomp_done_lock);
3721 qemu_mutex_lock(¶m->mutex);
3723 qemu_cond_wait(¶m->cond, ¶m->mutex);
3726 qemu_mutex_unlock(¶m->mutex);
3731 static int wait_for_decompress_done(void)
3733 int idx, thread_count;
3735 if (!migrate_use_compression()) {
3739 thread_count = migrate_decompress_threads();
3740 qemu_mutex_lock(&decomp_done_lock);
3741 for (idx = 0; idx < thread_count; idx++) {
3742 while (!decomp_param[idx].done) {
3743 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3746 qemu_mutex_unlock(&decomp_done_lock);
3747 return qemu_file_get_error(decomp_file);
3750 static void compress_threads_load_cleanup(void)
3752 int i, thread_count;
3754 if (!migrate_use_compression()) {
3757 thread_count = migrate_decompress_threads();
3758 for (i = 0; i < thread_count; i++) {
3760 * we use it as an indicator of whether the thread was
3761 * properly initialized or not
3763 if (!decomp_param[i].compbuf) {
3767 qemu_mutex_lock(&decomp_param[i].mutex);
3768 decomp_param[i].quit = true;
3769 qemu_cond_signal(&decomp_param[i].cond);
3770 qemu_mutex_unlock(&decomp_param[i].mutex);
3772 for (i = 0; i < thread_count; i++) {
3773 if (!decomp_param[i].compbuf) {
3777 qemu_thread_join(decompress_threads + i);
3778 qemu_mutex_destroy(&decomp_param[i].mutex);
3779 qemu_cond_destroy(&decomp_param[i].cond);
3780 inflateEnd(&decomp_param[i].stream);
3781 g_free(decomp_param[i].compbuf);
3782 decomp_param[i].compbuf = NULL;
3784 g_free(decompress_threads);
3785 g_free(decomp_param);
3786 decompress_threads = NULL;
3787 decomp_param = NULL;
3791 static int compress_threads_load_setup(QEMUFile *f)
3793 int i, thread_count;
3795 if (!migrate_use_compression()) {
3799 thread_count = migrate_decompress_threads();
3800 decompress_threads = g_new0(QemuThread, thread_count);
3801 decomp_param = g_new0(DecompressParam, thread_count);
3802 qemu_mutex_init(&decomp_done_lock);
3803 qemu_cond_init(&decomp_done_cond);
3805 for (i = 0; i < thread_count; i++) {
3806 if (inflateInit(&decomp_param[i].stream) != Z_OK) {
3810 decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
3811 qemu_mutex_init(&decomp_param[i].mutex);
3812 qemu_cond_init(&decomp_param[i].cond);
3813 decomp_param[i].done = true;
3814 decomp_param[i].quit = false;
3815 qemu_thread_create(decompress_threads + i, "decompress",
3816 do_data_decompress, decomp_param + i,
3817 QEMU_THREAD_JOINABLE);
3821 compress_threads_load_cleanup();
3825 static void decompress_data_with_multi_threads(QEMUFile *f,
3826 void *host, int len)
3828 int idx, thread_count;
3830 thread_count = migrate_decompress_threads();
3831 QEMU_LOCK_GUARD(&decomp_done_lock);
3833 for (idx = 0; idx < thread_count; idx++) {
3834 if (decomp_param[idx].done) {
3835 decomp_param[idx].done = false;
3836 qemu_mutex_lock(&decomp_param[idx].mutex);
3837 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
3838 decomp_param[idx].des = host;
3839 decomp_param[idx].len = len;
3840 qemu_cond_signal(&decomp_param[idx].cond);
3841 qemu_mutex_unlock(&decomp_param[idx].mutex);
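/*
 * If an idle thread was found above (idx < thread_count) the page has
 * been handed off; otherwise wait for a thread to finish and retry.
 */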
3845 if (idx < thread_count) {
3848 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3853 static void colo_init_ram_state(void)
3855 ram_state_init(&ram_state);
3859 * colo cache: this is for the secondary VM; we cache the whole
3860 * memory of the secondary VM. The global lock must be held
3861 * to call this helper.
3863 int colo_init_ram_cache(void)
3867 WITH_RCU_READ_LOCK_GUARD() {
3868 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3869 block->colo_cache = qemu_anon_ram_alloc(block->used_length,
3870 NULL, false, false);
3871 if (!block->colo_cache) {
3872 error_report("%s: Can't alloc memory for COLO cache of block %s, "
3873 "size 0x" RAM_ADDR_FMT, __func__, block->idstr,
3874 block->used_length);
3875 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3876 if (block->colo_cache) {
3877 qemu_anon_ram_free(block->colo_cache, block->used_length);
3878 block->colo_cache = NULL;
3883 if (!machine_dump_guest_core(current_machine)) {
3884 qemu_madvise(block->colo_cache, block->used_length,
3885 QEMU_MADV_DONTDUMP);
3891 * Record the dirty pages that were sent by the PVM; we use this dirty bitmap
3892 * to decide which pages in the cache should be flushed into the SVM's RAM. Here
3893 * we use the same name 'ram_bitmap' as for migration.
3895 if (ram_bytes_total()) {
3898 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3899 unsigned long pages = block->max_length >> TARGET_PAGE_BITS;
3900 block->bmap = bitmap_new(pages);
3904 colo_init_ram_state();
3908 /* TODO: duplicated with ram_init_bitmaps */
3909 void colo_incoming_start_dirty_log(void)
3911 RAMBlock *block = NULL;
3912 /* For memory_global_dirty_log_start below. */
3913 qemu_mutex_lock_iothread();
3914 qemu_mutex_lock_ramlist();
3916 memory_global_dirty_log_sync();
3917 WITH_RCU_READ_LOCK_GUARD() {
3918 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3919 ramblock_sync_dirty_bitmap(ram_state, block);
3920 /* Discard this dirty bitmap record */
3921 bitmap_zero(block->bmap, block->max_length >> TARGET_PAGE_BITS);
3923 memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION);
3925 ram_state->migration_dirty_pages = 0;
3926 qemu_mutex_unlock_ramlist();
3927 qemu_mutex_unlock_iothread();
3930 /* The global lock must be held to call this helper */
3931 void colo_release_ram_cache(void)
3935 memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION);
3936 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3937 g_free(block->bmap);
3941 WITH_RCU_READ_LOCK_GUARD() {
3942 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3943 if (block->colo_cache) {
3944 qemu_anon_ram_free(block->colo_cache, block->used_length);
3945 block->colo_cache = NULL;
3949 ram_state_cleanup(&ram_state);
3953 * ram_load_setup: Setup RAM for migration incoming side
3955 * Returns zero to indicate success and negative for error
3957 * @f: QEMUFile where to receive the data
3958 * @opaque: RAMState pointer
3960 static int ram_load_setup(QEMUFile *f, void *opaque)
3962 if (compress_threads_load_setup(f)) {
3966 xbzrle_load_setup();
3967 ramblock_recv_map_init();
3972 static int ram_load_cleanup(void *opaque)
3976 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3977 qemu_ram_block_writeback(rb);
3980 xbzrle_load_cleanup();
3981 compress_threads_load_cleanup();
3983 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3984 g_free(rb->receivedmap);
3985 rb->receivedmap = NULL;
3992 * ram_postcopy_incoming_init: allocate postcopy data structures
3994 * Returns 0 for success and negative if there was one error
3996 * @mis: current migration incoming state
3998 * Allocate data structures etc needed by incoming migration with
3999 * postcopy-ram. postcopy-ram's similarly named
4000 * postcopy_ram_incoming_init does the work.
4002 int ram_postcopy_incoming_init(MigrationIncomingState *mis)
4004 return postcopy_ram_incoming_init(mis);
4008 * ram_load_postcopy: load a page in postcopy case
4010 * Returns 0 for success or -errno in case of error
4012 * Called in postcopy mode by ram_load().
4013 * rcu_read_lock is taken prior to this being called.
4015 * @f: QEMUFile to receive the data from
4016 * @channel: the channel to use for loading
4018 int ram_load_postcopy(QEMUFile *f, int channel)
4020 int flags = 0, ret = 0;
4021 bool place_needed = false;
4022 bool matches_target_page_size = false;
4023 MigrationIncomingState *mis = migration_incoming_get_current();
4024 PostcopyTmpPage *tmp_page = &mis->postcopy_tmp_pages[channel];
4026 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
4028 void *page_buffer = NULL;
4029 void *place_source = NULL;
4030 RAMBlock *block = NULL;
4034 addr = qemu_get_be64(f);
4037 * If there is a qemu file error, we should stop here, and then "addr"
4040 ret = qemu_file_get_error(f);
4045 flags = addr & ~TARGET_PAGE_MASK;
4046 addr &= TARGET_PAGE_MASK;
4048 trace_ram_load_postcopy_loop(channel, (uint64_t)addr, flags);
4049 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
4050 RAM_SAVE_FLAG_COMPRESS_PAGE)) {
4051 block = ram_block_from_stream(mis, f, flags, channel);
4058 * Relying on used_length is racy and can result in false positives.
4059 * We might place pages beyond used_length in case RAM was shrunk
4060 * while in postcopy, which is fine - trying to place via
4061 * UFFDIO_COPY/UFFDIO_ZEROPAGE will never segfault.
4063 if (!block->host || addr >= block->postcopy_length) {
4064 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
4068 tmp_page->target_pages++;
4069 matches_target_page_size = block->page_size == TARGET_PAGE_SIZE;
4071 * Postcopy requires that we place whole host pages atomically;
4072 * these may be huge pages for RAMBlocks that are backed by
4074 * To make it atomic, the data is read into a temporary page
4075 * that's moved into place later.
4076 * The migration protocol uses, possibly smaller, target pages;
4077 * however, the source ensures it always sends all the components
4078 * of a host page in one chunk.
4080 page_buffer = tmp_page->tmp_huge_page +
4081 host_page_offset_from_ram_block_offset(block, addr);
4082 /* If all TP are zero then we can optimise the place */
4083 if (tmp_page->target_pages == 1) {
4084 tmp_page->host_addr =
4085 host_page_from_ram_block_offset(block, addr);
4086 } else if (tmp_page->host_addr !=
4087 host_page_from_ram_block_offset(block, addr)) {
4088 /* not the 1st TP within the HP */
4089 error_report("Non-same host page detected on channel %d: "
4090 "Target host page %p, received host page %p "
4091 "(rb %s offset 0x"RAM_ADDR_FMT" target_pages %d)",
4092 channel, tmp_page->host_addr,
4093 host_page_from_ram_block_offset(block, addr),
4094 block->idstr, addr, tmp_page->target_pages);
4100 * If it's the last part of a host page then we place the host
4103 if (tmp_page->target_pages ==
4104 (block->page_size / TARGET_PAGE_SIZE)) {
4105 place_needed = true;
4107 place_source = tmp_page->tmp_huge_page;
4110 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
4111 case RAM_SAVE_FLAG_ZERO:
4112 ch = qemu_get_byte(f);
4114 * We can skip setting page_buffer when
4115 * this is a zero page and (block->page_size == TARGET_PAGE_SIZE).
4117 if (ch || !matches_target_page_size) {
4118 memset(page_buffer, ch, TARGET_PAGE_SIZE);
4121 tmp_page->all_zero = false;
4125 case RAM_SAVE_FLAG_PAGE:
4126 tmp_page->all_zero = false;
4127 if (!matches_target_page_size) {
4128 /* For huge pages, we always use temporary buffer */
4129 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
4132 * For small pages that match the target page size, we
4133 * avoid the qemu_file copy. Instead we directly use
4134 * the buffer of QEMUFile to place the page. Note: we
4135 * cannot do any QEMUFile operation before using that
4136 * buffer to make sure the buffer is valid when
4139 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
4143 case RAM_SAVE_FLAG_COMPRESS_PAGE:
4144 tmp_page->all_zero = false;
4145 len = qemu_get_be32(f);
4146 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
4147 error_report("Invalid compressed data length: %d", len);
4151 decompress_data_with_multi_threads(f, page_buffer, len);
4154 case RAM_SAVE_FLAG_EOS:
4156 multifd_recv_sync_main();
4159 error_report("Unknown combination of migration flags: 0x%x"
4160 " (postcopy mode)", flags);
4165 /* Got the whole host page, wait for decompress before placing. */
4167 ret |= wait_for_decompress_done();
4170 /* Detect for any possible file errors */
4171 if (!ret && qemu_file_get_error(f)) {
4172 ret = qemu_file_get_error(f);
4175 if (!ret && place_needed) {
4176 if (tmp_page->all_zero) {
4177 ret = postcopy_place_page_zero(mis, tmp_page->host_addr, block);
4179 ret = postcopy_place_page(mis, tmp_page->host_addr,
4180 place_source, block);
4182 place_needed = false;
4183 postcopy_temp_page_reset(tmp_page);
4190 static bool postcopy_is_running(void)
4192 PostcopyState ps = postcopy_state_get();
4193 return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END;
4197 * Flush content of RAM cache into SVM's memory.
4198 * Only flush the pages that were dirtied by the PVM, the SVM, or both.
4200 void colo_flush_ram_cache(void)
4202 RAMBlock *block = NULL;
4205 unsigned long offset = 0;
4207 memory_global_dirty_log_sync();
4208 WITH_RCU_READ_LOCK_GUARD() {
4209 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
4210 ramblock_sync_dirty_bitmap(ram_state, block);
4214 trace_colo_flush_ram_cache_begin(ram_state->migration_dirty_pages);
4215 WITH_RCU_READ_LOCK_GUARD() {
4216 block = QLIST_FIRST_RCU(&ram_list.blocks);
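/*
 * Walk the dirty bitmap of each block: for every run of dirty pages,
 * clear the bits and copy the data from the colo_cache into the guest's
 * actual RAM.
 */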
4219 unsigned long num = 0;
4221 offset = colo_bitmap_find_dirty(ram_state, block, offset, &num);
4222 if (!offset_in_ramblock(block,
4223 ((ram_addr_t)offset) << TARGET_PAGE_BITS)) {
4226 block = QLIST_NEXT_RCU(block, next);
4228 unsigned long i = 0;
4230 for (i = 0; i < num; i++) {
4231 migration_bitmap_clear_dirty(ram_state, block, offset + i);
4233 dst_host = block->host
4234 + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
4235 src_host = block->colo_cache
4236 + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
4237 memcpy(dst_host, src_host, TARGET_PAGE_SIZE * num);
4242 trace_colo_flush_ram_cache_end();
4246 * ram_load_precopy: load pages in precopy case
4248 * Returns 0 for success or -errno in case of error
4250 * Called in precopy mode by ram_load().
4251 * rcu_read_lock is taken prior to this being called.
4253 * @f: QEMUFile to receive the data from
4255 static int ram_load_precopy(QEMUFile *f)
4257 MigrationIncomingState *mis = migration_incoming_get_current();
4258 int flags = 0, ret = 0, invalid_flags = 0, len = 0, i = 0;
4259 /* ADVISE is earlier, it shows the source has the postcopy capability on */
4260 bool postcopy_advised = migration_incoming_postcopy_advised();
4261 if (!migrate_use_compression()) {
4262 invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
4265 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
4266 ram_addr_t addr, total_ram_bytes;
4267 void *host = NULL, *host_bak = NULL;
4271 * Yield periodically to let the main loop run, but an iteration of
4272 * the main loop is expensive, so only do it every so many iterations
4274 if ((i & 32767) == 0 && qemu_in_coroutine()) {
4275 aio_co_schedule(qemu_get_current_aio_context(),
4276 qemu_coroutine_self());
4277 qemu_coroutine_yield();
4281 addr = qemu_get_be64(f);
4282 flags = addr & ~TARGET_PAGE_MASK;
4283 addr &= TARGET_PAGE_MASK;
4285 if (flags & invalid_flags) {
4286 if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
4287 error_report("Received an unexpected compressed page");
4294 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
4295 RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
4296 RAMBlock *block = ram_block_from_stream(mis, f, flags,
4297 RAM_CHANNEL_PRECOPY);
4299 host = host_from_ram_block_offset(block, addr);
4301 * After entering the COLO stage, we should not load the page
4302 * into the SVM's memory directly; we put it into colo_cache first.
4303 * NOTE: We need to keep a copy of the SVM's ram in colo_cache.
4304 * Previously, we copied all this memory in the COLO preparation stage
4305 * while the VM had to be stopped, which is a time-consuming process.
4306 * Here we optimize it with a trick: back up every page during the
4307 * migration process while COLO is enabled. Though it affects the
4308 * speed of the migration, it obviously reduces the downtime of
4309 * backing up all of the SVM's memory in the COLO preparation stage.
4311 if (migration_incoming_colo_enabled()) {
4312 if (migration_incoming_in_colo_state()) {
4313 /* In COLO stage, put all pages into cache temporarily */
4314 host = colo_cache_from_block_offset(block, addr, true);
4317 * In the migration stage but before the COLO stage,
4318 * put all pages into both the cache and the SVM's memory.
4320 host_bak = colo_cache_from_block_offset(block, addr, false);
4324 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
4328 if (!migration_incoming_in_colo_state()) {
4329 ramblock_recv_bitmap_set(block, host);
4332 trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
4335 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
4336 case RAM_SAVE_FLAG_MEM_SIZE:
4337 /* Synchronize RAM block list */
4338 total_ram_bytes = addr;
4339 while (!ret && total_ram_bytes) {
4344 len = qemu_get_byte(f);
4345 qemu_get_buffer(f, (uint8_t *)id, len);
4347 length = qemu_get_be64(f);
4349 block = qemu_ram_block_by_name(id);
4350 if (block && !qemu_ram_is_migratable(block)) {
4351 error_report("block %s should not be migrated !", id);
4354 if (length != block->used_length) {
4355 Error *local_err = NULL;
4357 ret = qemu_ram_resize(block, length,
4360 error_report_err(local_err);
4363 /* For postcopy we need to check hugepage sizes match */
4364 if (postcopy_advised && migrate_postcopy_ram() &&
4365 block->page_size != qemu_host_page_size) {
4366 uint64_t remote_page_size = qemu_get_be64(f);
4367 if (remote_page_size != block->page_size) {
4368 error_report("Mismatched RAM page size %s "
4369 "(local) %zd != %" PRId64,
4370 id, block->page_size,
4375 if (migrate_ignore_shared()) {
4376 hwaddr addr = qemu_get_be64(f);
4377 if (ramblock_is_ignored(block) &&
4378 block->mr->addr != addr) {
4379 error_report("Mismatched GPAs for block %s "
4380 "%" PRId64 "!= %" PRId64,
4382 (uint64_t)block->mr->addr);
4386 ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
4389 error_report("Unknown ramblock \"%s\", cannot "
4390 "accept migration", id);
4394 total_ram_bytes -= length;
4398 case RAM_SAVE_FLAG_ZERO:
4399 ch = qemu_get_byte(f);
4400 ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
4403 case RAM_SAVE_FLAG_PAGE:
4404 qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
4407 case RAM_SAVE_FLAG_COMPRESS_PAGE:
4408 len = qemu_get_be32(f);
4409 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
4410 error_report("Invalid compressed data length: %d", len);
4414 decompress_data_with_multi_threads(f, host, len);
4417 case RAM_SAVE_FLAG_XBZRLE:
4418 if (load_xbzrle(f, addr, host) < 0) {
4419 error_report("Failed to decompress XBZRLE page at "
4420 RAM_ADDR_FMT, addr);
4425 case RAM_SAVE_FLAG_EOS:
4427 multifd_recv_sync_main();
4430 if (flags & RAM_SAVE_FLAG_HOOK) {
4431 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
4433 error_report("Unknown combination of migration flags: 0x%x",
4439 ret = qemu_file_get_error(f);
4441 if (!ret && host_bak) {
4442 memcpy(host_bak, host, TARGET_PAGE_SIZE);
4446 ret |= wait_for_decompress_done();
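/*
 * ram_load: entry point for loading the "ram" section on the destination.
 * Dispatches to ram_load_postcopy() or ram_load_precopy() depending on
 * whether postcopy is already running.
 */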
4450 static int ram_load(QEMUFile *f, void *opaque, int version_id)
4453 static uint64_t seq_iter;
4455 * If the system is running in postcopy mode, page inserts into host memory must
4458 bool postcopy_running = postcopy_is_running();
4462 if (version_id != 4) {
4467 * This RCU critical section can be very long running.
4468 * When RCU reclaims in the code start to become numerous,
4469 * it will be necessary to reduce the granularity of this
4472 WITH_RCU_READ_LOCK_GUARD() {
4473 if (postcopy_running) {
4475 * Note! Here RAM_CHANNEL_PRECOPY is the precopy channel of
4476 * postcopy migration, we have another RAM_CHANNEL_POSTCOPY to
4477 * service fast page faults.
4479 ret = ram_load_postcopy(f, RAM_CHANNEL_PRECOPY);
4481 ret = ram_load_precopy(f);
4484 trace_ram_load_complete(ret, seq_iter);
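/*
 * Postcopy is only usable if no RAM block is backed by persistent memory;
 * otherwise defer to the migrate_postcopy_ram() capability.
 */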
4489 static bool ram_has_postcopy(void *opaque)
4492 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
4493 if (ramblock_is_pmem(rb)) {
4494 info_report("Block: %s, host: %p is a nvdimm memory, postcopy"
4495 "is not supported now!", rb->idstr, rb->host);
4500 return migrate_postcopy_ram();
4503 /* Sync all the dirty bitmaps with the destination VM. */
4504 static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs)
4507 QEMUFile *file = s->to_dst_file;
4508 int ramblock_count = 0;
4510 trace_ram_dirty_bitmap_sync_start();
4512 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
4513 qemu_savevm_send_recv_bitmap(file, block->idstr);
4514 trace_ram_dirty_bitmap_request(block->idstr);
4518 trace_ram_dirty_bitmap_sync_wait();
4520 /* Wait until all the ramblocks' dirty bitmaps are synced */
4521 while (ramblock_count--) {
4522 qemu_sem_wait(&s->rp_state.rp_sem);
4525 trace_ram_dirty_bitmap_sync_complete();
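/*
 * Called once a ramblock's received bitmap has been reloaded; wakes up the
 * waiter in ram_dirty_bitmap_sync_all().
 */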
4530 static void ram_dirty_bitmap_reload_notify(MigrationState *s)
4532 qemu_sem_post(&s->rp_state.rp_sem);
4536 * Read the received bitmap and invert it to form the initial dirty bitmap.
4537 * This is only used when a postcopy migration is paused and is being
4538 * resumed from a middle point.
4540 int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block)
4543 /* from_dst_file is always valid because we're within rp_thread */
4544 QEMUFile *file = s->rp_state.from_dst_file;
4545 unsigned long *le_bitmap, nbits = block->used_length >> TARGET_PAGE_BITS;
4546 uint64_t local_size = DIV_ROUND_UP(nbits, 8);
4547 uint64_t size, end_mark;
4549 trace_ram_dirty_bitmap_reload_begin(block->idstr);
4551 if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
4552 error_report("%s: incorrect state %s", __func__,
4553 MigrationStatus_str(s->state));
4558 * Note: see the comments in ramblock_recv_bitmap_send() on why we
4559 * need the endianness conversion and the padding.
4561 local_size = ROUND_UP(local_size, 8);
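/* Allocate extra space so the 8-byte-aligned (padded) data always fits. */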
4564 le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
4566 size = qemu_get_be64(file);
4568 /* The size of the bitmap should match our ramblock */
4569 if (size != local_size) {
4570 error_report("%s: ramblock '%s' bitmap size mismatch "
4571 "(0x%"PRIx64" != 0x%"PRIx64")", __func__,
4572 block->idstr, size, local_size);
4577 size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size);
4578 end_mark = qemu_get_be64(file);
4580 ret = qemu_file_get_error(file);
4581 if (ret || size != local_size) {
4582 error_report("%s: read bitmap failed for ramblock '%s': %d"
4583 " (size 0x%"PRIx64", got: 0x%"PRIx64")",
4584 __func__, block->idstr, ret, local_size, size);
4589 if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) {
4590 error_report("%s: ramblock '%s' end mark incorrect: 0x%"PRIx64,
4591 __func__, block->idstr, end_mark);
4597 * Endianness conversion. We are in postcopy (though paused), so
4598 * the dirty bitmap won't change and we can modify it in place.
4600 bitmap_from_le(block->bmap, le_bitmap, nbits);
4603 * What we received is the "received bitmap". Invert it to get the initial
4604 * dirty bitmap for this ramblock: pages not yet received still need to be sent.
4606 bitmap_complement(block->bmap, block->bmap, nbits);
4608 /* Clear dirty bits of discarded ranges that we don't want to migrate. */
4609 ramblock_dirty_bitmap_clear_discarded_pages(block);
4611 /* We'll recalculate migration_dirty_pages in ram_state_resume_prepare(). */
4612 trace_ram_dirty_bitmap_reload_complete(block->idstr);
4615 * We succeeded in syncing the bitmap for the current ramblock. If this
4616 * is the last one to sync, we need to notify the main send thread.
4618 ram_dirty_bitmap_reload_notify(s);
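/*
 * Postcopy recovery: pull the received bitmaps back from the destination
 * and use them to rebuild the dirty bitmap before resuming the send side.
 */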
4626 static int ram_resume_prepare(MigrationState *s, void *opaque)
4628 RAMState *rs = *(RAMState **)opaque;
4631 ret = ram_dirty_bitmap_sync_all(s, rs);
4636 ram_state_resume_prepare(rs, s->to_dst_file);
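/*
 * Push an EOS marker down the postcopy preempt channel and flush it, so the
 * destination knows that channel is complete.
 */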
4641 void postcopy_preempt_shutdown_file(MigrationState *s)
4643 qemu_put_be64(s->postcopy_qemufile_src, RAM_SAVE_FLAG_EOS);
4644 qemu_fflush(s->postcopy_qemufile_src);
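/* Hooks that plug RAM migration into the generic savevm/live-migration core. */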
4647 static SaveVMHandlers savevm_ram_handlers = {
4648 .save_setup = ram_save_setup,
4649 .save_live_iterate = ram_save_iterate,
4650 .save_live_complete_postcopy = ram_save_complete,
4651 .save_live_complete_precopy = ram_save_complete,
4652 .has_postcopy = ram_has_postcopy,
4653 .state_pending_exact = ram_state_pending_exact,
4654 .state_pending_estimate = ram_state_pending_estimate,
4655 .load_state = ram_load,
4656 .save_cleanup = ram_save_cleanup,
4657 .load_setup = ram_load_setup,
4658 .load_cleanup = ram_load_cleanup,
4659 .resume_prepare = ram_resume_prepare,
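/*
 * RAM block resize notifier: resizes are only tolerated while migration is
 * idle or, on the incoming side, in specific postcopy states.
 */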
4662 static void ram_mig_ram_block_resized(RAMBlockNotifier *n, void *host,
4663 size_t old_size, size_t new_size)
4665 PostcopyState ps = postcopy_state_get();
4667 RAMBlock *rb = qemu_ram_block_from_host(host, false, &offset);
4670 if (ramblock_is_ignored(rb)) {
4674 if (!migration_is_idle()) {
4676 * Precopy code on the source cannot deal with the size of RAM blocks
4677 * changing at random points in time - especially after sending the
4678 * RAM block sizes in the migration stream, they must no longer change.
4679 * Abort and indicate a proper reason.
4681 error_setg(&err, "RAM block '%s' resized during precopy.", rb->idstr);
4682 migration_cancel(err);
4687 case POSTCOPY_INCOMING_ADVISE:
4689 * Update what ram_postcopy_incoming_init()->init_range() does at the
4690 * time postcopy was advised. Syncing RAM blocks with the source will
4691 * result in RAM resizes.
4693 if (old_size < new_size) {
4694 if (ram_discard_range(rb->idstr, old_size, new_size - old_size)) {
4695 error_report("RAM block '%s' discard of resized RAM failed",
4699 rb->postcopy_length = new_size;
4701 case POSTCOPY_INCOMING_NONE:
4702 case POSTCOPY_INCOMING_RUNNING:
4703 case POSTCOPY_INCOMING_END:
4705 * Once our guest is running, postcopy no longer cares about
4706 * resizes. When growing, the new memory was not available on the
4707 * source, so no handling is needed.
4711 error_report("RAM block '%s' resized during postcopy state: %d",
4717 static RAMBlockNotifier ram_mig_ram_notifier = {
4718 .ram_block_resized = ram_mig_ram_block_resized,
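/*
 * Called at startup: register the "ram" live-migration handlers (stream
 * version 4) and the RAM block resize notifier.
 */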
4721 void ram_mig_init(void)
4723 qemu_mutex_init(&XBZRLE.lock);
4724 register_savevm_live("ram", 0, 4, &savevm_ram_handlers, &ram_state);
4725 ram_block_notifier_add(&ram_mig_ram_notifier);