 * Copyright (c) 2003-2008 Fabrice Bellard
 * Copyright (c) 2011-2015 Red Hat Inc
 *
 *  Juan Quintela <quintela@redhat.com>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
#include "qemu/osdep.h"
#include "qemu/cutils.h"
#include "qemu/bitops.h"
#include "qemu/bitmap.h"
#include "qemu/madvise.h"
#include "qemu/main-loop.h"
#include "io/channel-null.h"
#include "migration.h"
#include "migration/register.h"
#include "migration/misc.h"
#include "qemu-file.h"
#include "postcopy-ram.h"
#include "page_cache.h"
#include "qemu/error-report.h"
#include "qapi/error.h"
#include "qapi/qapi-types-migration.h"
#include "qapi/qapi-events-migration.h"
#include "qapi/qmp/qerror.h"
#include "exec/ram_addr.h"
#include "exec/target_page.h"
#include "qemu/rcu_queue.h"
#include "migration/colo.h"
#include "sysemu/cpu-throttle.h"
#include "sysemu/runstate.h"
#include "hw/boards.h" /* for machine_dump_guest_core() */

#if defined(__linux__)
#include "qemu/userfaultfd.h"
#endif /* defined(__linux__) */

/***********************************************************/
/* ram save/restore */
/* RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS; it
 * worked for pages that were filled with the same char. We switched
 * it to only search for the zero value, and renamed it to avoid
 * confusion with RAM_SAVE_FLAG_COMPRESS_PAGE.
 */
#define RAM_SAVE_FLAG_FULL     0x01 /* Obsolete, not used anymore */
#define RAM_SAVE_FLAG_ZERO     0x02
#define RAM_SAVE_FLAG_MEM_SIZE 0x04
#define RAM_SAVE_FLAG_PAGE     0x08
#define RAM_SAVE_FLAG_EOS      0x10
#define RAM_SAVE_FLAG_CONTINUE 0x20
#define RAM_SAVE_FLAG_XBZRLE   0x40
/* 0x80 is reserved in migration.h; start with 0x100 next */
#define RAM_SAVE_FLAG_COMPRESS_PAGE    0x100
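
/*
 * Illustrative sketch (not part of the original file): on the wire each
 * page record begins with a be64 word that packs the page-aligned RAM
 * offset together with the RAM_SAVE_FLAG_* bits in the low bits.  A
 * reader would split the two halves roughly like this:
 */
static inline void ram_wire_word_split(uint64_t word, ram_addr_t *offset,
                                       int *flags)
{
    *offset = word & TARGET_PAGE_MASK;  /* page-aligned address part */
    *flags = word & ~TARGET_PAGE_MASK;  /* RAM_SAVE_FLAG_* bits */
}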
XBZRLECacheStats xbzrle_counters;

/* struct contains XBZRLE cache and a static page
   used by the compression */
static struct {
    /* buffer used for XBZRLE encoding */
    /* buffer for storing page content */
    /* Cache for XBZRLE, Protected by lock. */
    /* it will store a page full of zeros */
    uint8_t *zero_target_page;
    /* buffer used for XBZRLE decoding */
    uint8_t *decoded_buf;
static void XBZRLE_cache_lock(void)
{
    if (migrate_use_xbzrle()) {
        qemu_mutex_lock(&XBZRLE.lock);

static void XBZRLE_cache_unlock(void)
{
    if (migrate_use_xbzrle()) {
        qemu_mutex_unlock(&XBZRLE.lock);
 * xbzrle_cache_resize: resize the xbzrle cache
 *
 * This function is called from migrate_params_apply in the main
 * thread, possibly while a migration is in progress. A running
 * migration may be using the cache and might finish during this call,
 * hence changes to the cache are protected by XBZRLE.lock.
 *
 * Returns 0 for success or -1 for error
 *
 * @new_size: new cache size
 * @errp: set *errp if the check failed, with reason
 */
int xbzrle_cache_resize(uint64_t new_size, Error **errp)
{
    PageCache *new_cache;

    /* Check for truncation */
    if (new_size != (size_t)new_size) {
        error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
                   "exceeding address space");

    if (new_size == migrate_xbzrle_cache_size()) {

    if (XBZRLE.cache != NULL) {
        new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp);

        cache_fini(XBZRLE.cache);
        XBZRLE.cache = new_cache;

    XBZRLE_cache_unlock();
    return 0;
}
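
/*
 * Worked example for the truncation check in xbzrle_cache_resize()
 * above: on a 32-bit host a request of 0x100000000 (4 GiB) becomes 0
 * when cast to size_t, so new_size != (size_t)new_size catches it and
 * the resize is rejected.
 */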
static bool postcopy_preempt_active(void)
{
    return migrate_postcopy_preempt() && migration_in_postcopy();

bool ramblock_is_ignored(RAMBlock *block)
{
    return !qemu_ram_is_migratable(block) ||
           (migrate_ignore_shared() && qemu_ram_is_shared(block));

#undef RAMBLOCK_FOREACH

int foreach_not_ignored_block(RAMBlockIterFunc func, void *opaque)

    RCU_READ_LOCK_GUARD();

    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        ret = func(block, opaque);

static void ramblock_recv_map_init(void)

    RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
        assert(!rb->receivedmap);
        rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits());

int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr)
{
    return test_bit(ramblock_recv_bitmap_offset(host_addr, rb),
                    rb->receivedmap);

bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset)
{
    return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap);

void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr)
{
    set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap);

void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr,

    bitmap_set_atomic(rb->receivedmap,
                      ramblock_recv_bitmap_offset(host_addr, rb),

#define RAMBLOCK_RECV_BITMAP_ENDING  (0x0123456789abcdefULL)

 * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes).
 *
 * Returns >0 if success with sent bytes, or <0 if error.
 */
int64_t ramblock_recv_bitmap_send(QEMUFile *file,
                                  const char *block_name)
{
    RAMBlock *block = qemu_ram_block_by_name(block_name);
    unsigned long *le_bitmap, nbits;

        error_report("%s: invalid block name: %s", __func__, block_name);

    nbits = block->postcopy_length >> TARGET_PAGE_BITS;

    /*
     * Make sure the tmp bitmap buffer is big enough, e.g., on 32bit
     * machines we may need 4 more bytes for padding (see the comment
     * below). So extend it a bit beforehand.
     */
    le_bitmap = bitmap_new(nbits + BITS_PER_LONG);

    /*
     * Always use little endian when sending the bitmap. This is
     * required when source and destination VMs are not using the
     * same endianness. (Note: big endian won't work.)
     */
    bitmap_to_le(le_bitmap, block->receivedmap, nbits);

    /* Size of the bitmap, in bytes */
    size = DIV_ROUND_UP(nbits, 8);

    /*
     * size is always aligned to 8 bytes for 64bit machines, but it
     * may not be true for 32bit machines. We need this padding to
     * make sure the migration can survive even between 32bit and
     * 64bit machines. (E.g., nbits = 1000 gives size = 125, rounded
     * up to 128.)
     */
    size = ROUND_UP(size, 8);

    qemu_put_be64(file, size);
    qemu_put_buffer(file, (const uint8_t *)le_bitmap, size);
    /*
     * Mark as an end, in case the middle part is screwed up due to
     * some "mysterious" reason.
     */
    qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING);

    if (qemu_file_get_error(file)) {
        return qemu_file_get_error(file);

    return size + sizeof(size);
}
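
/*
 * Hedged sketch of the receiving side (not part of the original file;
 * modeled on ram_dirty_bitmap_reload(), which consumes this stream on
 * the source during postcopy recovery): after reading size + bitmap,
 * the 8-byte RAMBLOCK_RECV_BITMAP_ENDING marker guards against a
 * corrupted middle part.
 */
static int ramblock_recv_bitmap_check_ending(QEMUFile *file)
{
    uint64_t end_mark = qemu_get_be64(file);

    if (qemu_file_get_error(file)) {
        return qemu_file_get_error(file);
    }

    if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) {
        error_report("%s: ramblock bitmap end mark incorrect: 0x%" PRIx64,
                     __func__, end_mark);
        return -EINVAL;
    }

    return 0;
}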
/*
 * An outstanding page request, on the source, having been received
 * and queued
 */
struct RAMSrcPageRequest {

    QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
};

typedef struct {
    /*
     * Cached ramblock/offset values if preempted. They're only meaningful if
     * preempted==true below.
     */
    RAMBlock *ram_block;
    unsigned long ram_page;
    /*
     * Whether a postcopy preemption just happened. Will be reset after
     * precopy recovered to background migration.
     */
    bool preempted;
} PostcopyPreemptState;

/* State of RAM for migration */
struct RAMState {
    /* QEMUFile used for this migration */
    QEMUFile *f;
    /* UFFD file descriptor, used in 'write-tracking' migration */
    int uffdio_fd;
    /* Last block that we have visited searching for dirty pages */
    RAMBlock *last_seen_block;
    /* Last block from where we have sent data */
    RAMBlock *last_sent_block;
    /* Last dirty target page we have sent */
    ram_addr_t last_page;
    /* last ram version we have seen */
    uint32_t last_version;
    /* How many times we have dirty too many pages */
    int dirty_rate_high_cnt;
    /* these variables are used for bitmap sync */
    /* last time we did a full bitmap_sync */
    int64_t time_last_bitmap_sync;
    /* bytes transferred at start_time */
    uint64_t bytes_xfer_prev;
    /* number of dirty pages since start_time */
    uint64_t num_dirty_pages_period;
    /* xbzrle misses since the beginning of the period */
    uint64_t xbzrle_cache_miss_prev;
    /* Amount of xbzrle pages since the beginning of the period */
    uint64_t xbzrle_pages_prev;
    /* Amount of xbzrle encoded bytes since the beginning of the period */
    uint64_t xbzrle_bytes_prev;
    /* Start using XBZRLE (e.g., after the first round). */
    bool xbzrle_enabled;
    /* Are we on the last stage of migration */
    bool last_stage;
    /* compression statistics since the beginning of the period */
    /* number of times there was no free thread to compress data */
    uint64_t compress_thread_busy_prev;
    /* amount of bytes after compression */
    uint64_t compressed_size_prev;
    /* amount of compressed pages */
    uint64_t compress_pages_prev;

    /* total handled target pages at the beginning of period */
    uint64_t target_page_count_prev;
    /* total handled target pages since start */
    uint64_t target_page_count;
    /* number of dirty bits in the bitmap */
    uint64_t migration_dirty_pages;
    /* Protects modification of the bitmap and migration dirty pages */
    QemuMutex bitmap_mutex;
    /* The RAMBlock used in the last src_page_requests */
    RAMBlock *last_req_rb;
    /* Queue of outstanding page requests from the destination */
    QemuMutex src_page_req_mutex;
    QSIMPLEQ_HEAD(, RAMSrcPageRequest) src_page_requests;

    /* Postcopy preemption information */
    PostcopyPreemptState postcopy_preempt_state;
    /*
     * Current channel we're using on src VM. Only valid if postcopy-preempt
     * is enabled.
     */
    unsigned int postcopy_channel;
};
typedef struct RAMState RAMState;

static RAMState *ram_state;

static NotifierWithReturnList precopy_notifier_list;
static void postcopy_preempt_reset(RAMState *rs)
{
    memset(&rs->postcopy_preempt_state, 0, sizeof(PostcopyPreemptState));
}

/* Whether postcopy has queued requests? */
static bool postcopy_has_request(RAMState *rs)
{
    return !QSIMPLEQ_EMPTY_ATOMIC(&rs->src_page_requests);
}

void precopy_infrastructure_init(void)
{
    notifier_with_return_list_init(&precopy_notifier_list);
}

void precopy_add_notifier(NotifierWithReturn *n)
{
    notifier_with_return_list_add(&precopy_notifier_list, n);
}

void precopy_remove_notifier(NotifierWithReturn *n)
{
    notifier_with_return_remove(n);
}

int precopy_notify(PrecopyNotifyReason reason, Error **errp)
{
    PrecopyNotifyData pnd;

    return notifier_with_return_list_notify(&precopy_notifier_list, &pnd);
}

uint64_t ram_bytes_remaining(void)
{
    return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) :
                       0;
}

MigrationStats ram_counters;

void ram_transferred_add(uint64_t bytes)
{
    if (runstate_is_running()) {
        ram_counters.precopy_bytes += bytes;
    } else if (migration_in_postcopy()) {
        ram_counters.postcopy_bytes += bytes;
    } else {
        ram_counters.downtime_bytes += bytes;
    }
    ram_counters.transferred += bytes;
}
void dirty_sync_missed_zero_copy(void)
{
    ram_counters.dirty_sync_missed_zero_copy++;
}

/* used by the search for pages to send */
struct PageSearchStatus {
    /* Current block being searched */
    /* Current page to search from */
    /* Set once we wrap around */
    /*
     * [POSTCOPY-ONLY] Whether current page is explicitly requested by
     * postcopy. When set, the request is "urgent" because the dest QEMU
     * threads are waiting for us.
     */
    bool postcopy_requested;
    /*
     * [POSTCOPY-ONLY] The target channel to use to send current page.
     *
     * Note: This may _not_ match with the value in postcopy_requested
     * above. Let's imagine the case where the postcopy request is exactly
     * the page that we're sending in progress during precopy. In this case
     * we'll have postcopy_requested set to true but the target channel
     * will be the precopy channel (so that we don't split brain on that
     * specific page since the precopy channel already contains part of
     * that page).
     *
     * Besides that specific use case, postcopy_target_channel should
     * always be equal to postcopy_requested, because by default we send
     * postcopy pages via postcopy preempt channel.
     */
    bool postcopy_target_channel;
};
typedef struct PageSearchStatus PageSearchStatus;
CompressionStats compression_counters;

struct CompressParam {

    /* internally used fields */

};
typedef struct CompressParam CompressParam;

struct DecompressParam {

};
typedef struct DecompressParam DecompressParam;

static CompressParam *comp_param;
static QemuThread *compress_threads;
/* comp_done_cond is used to wake up the migration thread when
 * one of the compression threads has finished the compression.
 * comp_done_lock is used to co-work with comp_done_cond.
 */
static QemuMutex comp_done_lock;
static QemuCond comp_done_cond;

static QEMUFile *decomp_file;
static DecompressParam *decomp_param;
static QemuThread *decompress_threads;
static QemuMutex decomp_done_lock;
static QemuCond decomp_done_cond;

static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
                                 ram_addr_t offset, uint8_t *source_buf);

static void postcopy_preempt_restore(RAMState *rs, PageSearchStatus *pss,
                                     bool postcopy_requested);
static void *do_data_compress(void *opaque)
{
    CompressParam *param = opaque;
    RAMBlock *block;
    ram_addr_t offset;
    bool zero_page;

    qemu_mutex_lock(&param->mutex);
    while (!param->quit) {
        if (param->block) {
            block = param->block;
            offset = param->offset;
            qemu_mutex_unlock(&param->mutex);

            zero_page = do_compress_ram_page(param->file, &param->stream,
                                             block, offset, param->originbuf);

            qemu_mutex_lock(&comp_done_lock);
            param->zero_page = zero_page;
            qemu_cond_signal(&comp_done_cond);
            qemu_mutex_unlock(&comp_done_lock);

            qemu_mutex_lock(&param->mutex);
        } else {
            qemu_cond_wait(&param->cond, &param->mutex);
        }
    }
    qemu_mutex_unlock(&param->mutex);
static void compress_threads_save_cleanup(void)
{
    int i, thread_count;

    if (!migrate_use_compression() || !comp_param) {

    thread_count = migrate_compress_threads();
    for (i = 0; i < thread_count; i++) {
        /*
         * we use it as an indicator which shows if the thread is
         * properly init'd or not
         */
        if (!comp_param[i].file) {

        qemu_mutex_lock(&comp_param[i].mutex);
        comp_param[i].quit = true;
        qemu_cond_signal(&comp_param[i].cond);
        qemu_mutex_unlock(&comp_param[i].mutex);

        qemu_thread_join(compress_threads + i);
        qemu_mutex_destroy(&comp_param[i].mutex);
        qemu_cond_destroy(&comp_param[i].cond);
        deflateEnd(&comp_param[i].stream);
        g_free(comp_param[i].originbuf);
        qemu_fclose(comp_param[i].file);
        comp_param[i].file = NULL;
    }
    qemu_mutex_destroy(&comp_done_lock);
    qemu_cond_destroy(&comp_done_cond);
    g_free(compress_threads);

    compress_threads = NULL;
static int compress_threads_save_setup(void)
{
    int i, thread_count;

    if (!migrate_use_compression()) {
        return 0;
    }
    thread_count = migrate_compress_threads();
    compress_threads = g_new0(QemuThread, thread_count);
    comp_param = g_new0(CompressParam, thread_count);
    qemu_cond_init(&comp_done_cond);
    qemu_mutex_init(&comp_done_lock);
    for (i = 0; i < thread_count; i++) {
        comp_param[i].originbuf = g_try_malloc(TARGET_PAGE_SIZE);
        if (!comp_param[i].originbuf) {
            goto exit;
        }

        if (deflateInit(&comp_param[i].stream,
                        migrate_compress_level()) != Z_OK) {
            g_free(comp_param[i].originbuf);
            goto exit;
        }

        /* comp_param[i].file is just used as a dummy buffer to save data,
         * set its ops to empty.
         */
        comp_param[i].file = qemu_file_new_output(
            QIO_CHANNEL(qio_channel_null_new()));
        comp_param[i].done = true;
        comp_param[i].quit = false;
        qemu_mutex_init(&comp_param[i].mutex);
        qemu_cond_init(&comp_param[i].cond);
        qemu_thread_create(compress_threads + i, "compress",
                           do_data_compress, comp_param + i,
                           QEMU_THREAD_JOINABLE);
    }
    return 0;

exit:
    compress_threads_save_cleanup();
    return -1;
}
/**
 * save_page_header: write page header to wire
 *
 * If this is the 1st block, it also writes the block identification
 *
 * Returns the number of bytes written
 *
 * @f: QEMUFile where to send the data
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 *          in the lower bits, it contains flags
 */
static size_t save_page_header(RAMState *rs, QEMUFile *f, RAMBlock *block,
                               ram_addr_t offset)
{
    size_t size, len;

    if (block == rs->last_sent_block) {
        offset |= RAM_SAVE_FLAG_CONTINUE;
    }
    qemu_put_be64(f, offset);
    size = 8;

    if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
        len = strlen(block->idstr);
        qemu_put_byte(f, len);
        qemu_put_buffer(f, (uint8_t *)block->idstr, len);
        size += 1 + len;
        rs->last_sent_block = block;
    }
    return size;
}
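
/*
 * Hedged sketch of what the load side has to undo (not part of the
 * original file; modeled on the ram_load() path): when
 * RAM_SAVE_FLAG_CONTINUE is absent, a one-byte length plus the block id
 * string follow the be64 offset/flags word.  The 256-byte buffer size
 * is an assumption matching RAMBlock::idstr.
 */
static void ram_wire_read_block_id(QEMUFile *f, char idstr[256])
{
    int len = qemu_get_byte(f);

    qemu_get_buffer(f, (uint8_t *)idstr, len);
    idstr[len] = '\0';
}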
/**
 * mig_throttle_guest_down: throttle down the guest
 *
 * Reduce amount of guest cpu execution to hopefully slow down memory
 * writes. If guest dirty memory rate is reduced below the rate at
 * which we can transfer pages to the destination then we should be
 * able to complete migration. Some workloads dirty memory way too
 * fast and will not effectively converge, even with auto-converge.
 */
static void mig_throttle_guest_down(uint64_t bytes_dirty_period,
                                    uint64_t bytes_dirty_threshold)
{
    MigrationState *s = migrate_get_current();
    uint64_t pct_initial = s->parameters.cpu_throttle_initial;
    uint64_t pct_increment = s->parameters.cpu_throttle_increment;
    bool pct_tailslow = s->parameters.cpu_throttle_tailslow;
    int pct_max = s->parameters.max_cpu_throttle;

    uint64_t throttle_now = cpu_throttle_get_percentage();
    uint64_t cpu_now, cpu_ideal, throttle_inc;

    /* We have not started throttling yet. Let's start it. */
    if (!cpu_throttle_active()) {
        cpu_throttle_set(pct_initial);
    } else {
        /* Throttling already on, just increase the rate */
        if (!pct_tailslow) {
            throttle_inc = pct_increment;
        } else {
            /* Compute the ideal CPU percentage used by Guest, which may
             * make the dirty rate match the dirty rate threshold. */
            cpu_now = 100 - throttle_now;
            cpu_ideal = cpu_now * (bytes_dirty_threshold * 1.0 /
                                   bytes_dirty_period);
            throttle_inc = MIN(cpu_now - cpu_ideal, pct_increment);
        }
        cpu_throttle_set(MIN(throttle_now + throttle_inc, pct_max));
    }
}
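
/*
 * Worked example with illustrative numbers: if the guest is already
 * throttled to 20% (throttle_now = 20, so cpu_now = 80) and it dirtied
 * 1 GB in a period where only 0.5 GB could be transferred, then
 * cpu_ideal = 80 * (0.5 / 1) = 40 and throttle_inc =
 * MIN(80 - 40, pct_increment); the new percentage is still capped at
 * pct_max.
 */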
void mig_throttle_counter_reset(void)
{
    RAMState *rs = ram_state;

    rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
    rs->num_dirty_pages_period = 0;
    rs->bytes_xfer_prev = ram_counters.transferred;
}

/**
 * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
 *
 * @rs: current RAM state
 * @current_addr: address for the zero page
 *
 * Update the xbzrle cache to reflect a page that's been sent as all 0.
 * The important thing is that a stale (not-yet-0'd) page be replaced
 * by the new data.
 * As a bonus, if the page wasn't in the cache it gets added so that
 * when a small write is made into the 0'd page it gets XBZRLE sent.
 */
static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
{
    /* We don't care if this fails to allocate a new cache page
     * as long as it updated an old one */
    cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
                 ram_counters.dirty_sync_count);
}
#define ENCODING_FLAG_XBZRLE 0x1

/**
 * save_xbzrle_page: compress and send current page
 *
 * Returns: 1 means that we wrote the page
 *          0 means that page is identical to the one already sent
 *          -1 means that xbzrle would be longer than normal
 *
 * @rs: current RAM state
 * @current_data: pointer to the address of the page contents
 * @current_addr: addr of the page
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 */
static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
                            ram_addr_t current_addr, RAMBlock *block,

    int encoded_len = 0, bytes_xbzrle;
    uint8_t *prev_cached_page;

    if (!cache_is_cached(XBZRLE.cache, current_addr,
                         ram_counters.dirty_sync_count)) {
        xbzrle_counters.cache_miss++;
        if (!rs->last_stage) {
            if (cache_insert(XBZRLE.cache, current_addr, *current_data,
                             ram_counters.dirty_sync_count) == -1) {

            /* update *current_data when the page has been
               inserted into cache */
            *current_data = get_cached_data(XBZRLE.cache, current_addr);

    /*
     * Reaching here means the page has hit the xbzrle cache, no matter what
     * encoding result it is (normal encoding, overflow or skipping the page),
     * count the page as encoded. This is used to calculate the encoding rate.
     *
     * Example: 2 pages (8KB) being encoded, first page encoding generates 2KB,
     * 2nd page turns out to be skipped (i.e. no new bytes written to the
     * page), the overall encoding rate will be 8KB / 2KB = 4, which has the
     * skipped page included. In this way, the encoding rate can tell if the
     * guest page is good for xbzrle encoding.
     */
    xbzrle_counters.pages++;
    prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);

    /* save current buffer into memory */
    memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);

    /* XBZRLE encoding (if there is no overflow) */
    encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
                                       TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
                                       TARGET_PAGE_SIZE);

    /*
     * Update the cache contents, so that it corresponds to the data
     * sent, in all cases except where we skip the page.
     */
    if (!rs->last_stage && encoded_len != 0) {
        memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
        /*
         * In the case where we couldn't compress, ensure that the caller
         * sends the data from the cache, since the guest might have
         * changed the RAM since we copied it.
         */
        *current_data = prev_cached_page;
    }

    if (encoded_len == 0) {
        trace_save_xbzrle_page_skipping();
    } else if (encoded_len == -1) {
        trace_save_xbzrle_page_overflow();
        xbzrle_counters.overflow++;
        xbzrle_counters.bytes += TARGET_PAGE_SIZE;

    /* Send XBZRLE based compressed page */
    bytes_xbzrle = save_page_header(rs, rs->f, block,
                                    offset | RAM_SAVE_FLAG_XBZRLE);
    qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE);
    qemu_put_be16(rs->f, encoded_len);
    qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len);
    bytes_xbzrle += encoded_len + 1 + 2;
    /*
     * Like compressed_size (please see update_compress_thread_counts),
     * the xbzrle encoded bytes don't count the 8 byte header with
     * RAM_SAVE_FLAG_CONTINUE.
     */
    xbzrle_counters.bytes += bytes_xbzrle - 8;
    ram_transferred_add(bytes_xbzrle);

    return 1;
}
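
/*
 * Hedged sketch of the matching load side (not part of the original
 * file; modeled on load_xbzrle() elsewhere in ram.c): undo the
 * sub-header written above and decode into the live guest page.
 */
static int xbzrle_read_page(QEMUFile *f, void *host)
{
    unsigned int xh_len;
    int xh_flags = qemu_get_byte(f);

    if (xh_flags != ENCODING_FLAG_XBZRLE) {
        error_report("Failed to load XBZRLE page - wrong compression!");
        return -1;
    }

    xh_len = qemu_get_be16(f);
    if (xh_len > TARGET_PAGE_SIZE) {
        error_report("Failed to load XBZRLE page - len overflow!");
        return -1;
    }

    /* refill the decoding buffer, then patch the guest page in place */
    qemu_get_buffer(f, XBZRLE.decoded_buf, xh_len);
    if (xbzrle_decode_buffer(XBZRLE.decoded_buf, xh_len, host,
                             TARGET_PAGE_SIZE) == -1) {
        error_report("Failed to load XBZRLE page - decode error!");
        return -1;
    }

    return 0;
}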
/**
 * migration_bitmap_find_dirty: find the next dirty page from start
 *
 * Returns the page offset within memory region of the start of a dirty page
 *
 * @rs: current RAM state
 * @rb: RAMBlock where to search for dirty pages
 * @start: page where we start the search
 */
static inline
unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
                                          unsigned long start)
{
    unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
    unsigned long *bitmap = rb->bmap;

    if (ramblock_is_ignored(rb)) {

    return find_next_bit(bitmap, size, start);
}

static void migration_clear_memory_region_dirty_bitmap(RAMBlock *rb,

    if (!rb->clear_bmap || !clear_bmap_test_and_clear(rb, page)) {

    shift = rb->clear_bmap_shift;
    /*
     * CLEAR_BITMAP_SHIFT_MIN should always guarantee this... this
     * can make things easier sometimes since then start address
     * of the small chunk will always be 64 pages aligned so the
     * bitmap will always be aligned to unsigned long. We should
     * even be able to remove this restriction but I'm simply
     * keeping it.
     */

    size = 1ULL << (TARGET_PAGE_BITS + shift);
    start = QEMU_ALIGN_DOWN((ram_addr_t)page << TARGET_PAGE_BITS, size);
    trace_migration_bitmap_clear_dirty(rb->idstr, start, size, page);
    memory_region_clear_dirty_bitmap(rb->mr, start, size);
}

static void
migration_clear_memory_region_dirty_bitmap_range(RAMBlock *rb,
                                                 unsigned long start,
                                                 unsigned long npages)
{
    unsigned long i, chunk_pages = 1UL << rb->clear_bmap_shift;
    unsigned long chunk_start = QEMU_ALIGN_DOWN(start, chunk_pages);
    unsigned long chunk_end = QEMU_ALIGN_UP(start + npages, chunk_pages);

    /*
     * Clear pages from start to start + npages - 1, so the end boundary is
     * exclusive.
     */
    for (i = chunk_start; i < chunk_end; i += chunk_pages) {
        migration_clear_memory_region_dirty_bitmap(rb, i);
    }
}

/**
 * colo_bitmap_find_dirty: find contiguous dirty pages from start
 *
 * Returns the page offset within memory region of the start of the contiguous
 * dirty pages
 *
 * @rs: current RAM state
 * @rb: RAMBlock where to search for dirty pages
 * @start: page where we start the search
 * @num: the number of contiguous dirty pages
 */
static inline
unsigned long colo_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
                                     unsigned long start, unsigned long *num)
{
    unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
    unsigned long *bitmap = rb->bmap;
    unsigned long first, next;

    if (ramblock_is_ignored(rb)) {

    first = find_next_bit(bitmap, size, start);

    next = find_next_zero_bit(bitmap, size, first + 1);
    assert(next >= first);
static inline bool migration_bitmap_clear_dirty(RAMState *rs,
                                                RAMBlock *rb,
                                                unsigned long page)
{
    bool ret;

    /*
     * Clear dirty bitmap if needed. This _must_ be called before we
     * send any of the pages in the chunk, because we need to make sure
     * we can capture further page content changes when we sync the dirty
     * log the next time. So as long as we are going to send any of
     * the pages in the chunk we clear the remote dirty bitmap for all.
     * Clearing it earlier won't be a problem, but too late will.
     */
    migration_clear_memory_region_dirty_bitmap(rb, page);

    ret = test_and_clear_bit(page, rb->bmap);
    if (ret) {
        rs->migration_dirty_pages--;
    }

static void dirty_bitmap_clear_section(MemoryRegionSection *section,
                                       void *opaque)
{
    const hwaddr offset = section->offset_within_region;
    const hwaddr size = int128_get64(section->size);
    const unsigned long start = offset >> TARGET_PAGE_BITS;
    const unsigned long npages = size >> TARGET_PAGE_BITS;
    RAMBlock *rb = section->mr->ram_block;
    uint64_t *cleared_bits = opaque;

    /*
     * We don't grab ram_state->bitmap_mutex because we expect to run
     * only when starting migration or during postcopy recovery where
     * we don't have concurrent access.
     */
    if (!migration_in_postcopy() && !migrate_background_snapshot()) {
        migration_clear_memory_region_dirty_bitmap_range(rb, start, npages);
    }
    *cleared_bits += bitmap_count_one_with_offset(rb->bmap, start, npages);
    bitmap_clear(rb->bmap, start, npages);
}

/*
 * Exclude all dirty pages from migration that fall into a discarded range as
 * managed by a RamDiscardManager responsible for the mapped memory region of
 * the RAMBlock. Clear the corresponding bits in the dirty bitmaps.
 *
 * Discarded pages ("logically unplugged") have undefined content and must
 * not get migrated, because even reading these pages for migration might
 * result in undesired behavior.
 *
 * Returns the number of cleared bits in the RAMBlock dirty bitmap.
 *
 * Note: The result is only stable while migrating (precopy/postcopy).
 */
static uint64_t ramblock_dirty_bitmap_clear_discarded_pages(RAMBlock *rb)
{
    uint64_t cleared_bits = 0;

    if (rb->mr && rb->bmap && memory_region_has_ram_discard_manager(rb->mr)) {
        RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
        MemoryRegionSection section = {
            .mr = rb->mr,
            .offset_within_region = 0,
            .size = int128_make64(qemu_ram_get_used_length(rb)),
        };

        ram_discard_manager_replay_discarded(rdm, &section,
                                             dirty_bitmap_clear_section,
                                             &cleared_bits);
    }
    return cleared_bits;
}
/*
 * Check if a host-page aligned page falls into a discarded range as managed by
 * a RamDiscardManager responsible for the mapped memory region of the RAMBlock.
 *
 * Note: The result is only stable while migrating (precopy/postcopy).
 */
bool ramblock_page_is_discarded(RAMBlock *rb, ram_addr_t start)
{
    if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
        RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
        MemoryRegionSection section = {
            .mr = rb->mr,
            .offset_within_region = start,
            .size = int128_make64(qemu_ram_pagesize(rb)),
        };

        return !ram_discard_manager_is_populated(rdm, &section);
    }
    return false;
}

/* Called with RCU critical section */
static void ramblock_sync_dirty_bitmap(RAMState *rs, RAMBlock *rb)
{
    uint64_t new_dirty_pages =
        cpu_physical_memory_sync_dirty_bitmap(rb, 0, rb->used_length);

    rs->migration_dirty_pages += new_dirty_pages;
    rs->num_dirty_pages_period += new_dirty_pages;
}

/**
 * ram_pagesize_summary: calculate all the pagesizes of a VM
 *
 * Returns a summary bitmap of the page sizes of all RAMBlocks
 *
 * For VMs with just normal pages this is equivalent to the host page
 * size. If it's got some huge pages then it's the OR of all the
 * different page sizes.
 */
uint64_t ram_pagesize_summary(void)
{
    RAMBlock *block;
    uint64_t summary = 0;

    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        summary |= block->page_size;
uint64_t ram_get_total_transferred_pages(void)
{
    return ram_counters.normal + ram_counters.duplicate +
           compression_counters.pages + xbzrle_counters.pages;
}

static void migration_update_rates(RAMState *rs, int64_t end_time)
{
    uint64_t page_count = rs->target_page_count - rs->target_page_count_prev;
    double compressed_size;

    /* calculate period counters */
    ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000
        / (end_time - rs->time_last_bitmap_sync);

    if (migrate_use_xbzrle()) {
        double encoded_size, unencoded_size;

        xbzrle_counters.cache_miss_rate = (double)(xbzrle_counters.cache_miss -
            rs->xbzrle_cache_miss_prev) / page_count;
        rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
        unencoded_size = (xbzrle_counters.pages - rs->xbzrle_pages_prev) *
                         TARGET_PAGE_SIZE;
        encoded_size = xbzrle_counters.bytes - rs->xbzrle_bytes_prev;
        if (xbzrle_counters.pages == rs->xbzrle_pages_prev || !encoded_size) {
            xbzrle_counters.encoding_rate = 0;
        } else {
            xbzrle_counters.encoding_rate = unencoded_size / encoded_size;
        }
        rs->xbzrle_pages_prev = xbzrle_counters.pages;
        rs->xbzrle_bytes_prev = xbzrle_counters.bytes;
    }

    if (migrate_use_compression()) {
        compression_counters.busy_rate = (double)(compression_counters.busy -
            rs->compress_thread_busy_prev) / page_count;
        rs->compress_thread_busy_prev = compression_counters.busy;

        compressed_size = compression_counters.compressed_size -
                          rs->compressed_size_prev;
        if (compressed_size) {
            double uncompressed_size = (compression_counters.pages -
                                        rs->compress_pages_prev) * TARGET_PAGE_SIZE;

            /* Compression-Ratio = Uncompressed-size / Compressed-size */
            compression_counters.compression_rate =
                uncompressed_size / compressed_size;

            rs->compress_pages_prev = compression_counters.pages;
            rs->compressed_size_prev = compression_counters.compressed_size;
        }
    }
}
static void migration_trigger_throttle(RAMState *rs)
{
    MigrationState *s = migrate_get_current();
    uint64_t threshold = s->parameters.throttle_trigger_threshold;

    uint64_t bytes_xfer_period = ram_counters.transferred - rs->bytes_xfer_prev;
    uint64_t bytes_dirty_period = rs->num_dirty_pages_period * TARGET_PAGE_SIZE;
    uint64_t bytes_dirty_threshold = bytes_xfer_period * threshold / 100;

    /* During block migration the auto-converge logic incorrectly detects
     * that ram migration makes no progress. Avoid this by disabling the
     * throttling logic during the bulk phase of block migration. */
    if (migrate_auto_converge() && !blk_mig_bulk_active()) {
        /* The following detection logic can be refined later. For now:
           Check to see if the ratio between dirtied bytes and the approx.
           amount of bytes that just got transferred since the last time
           we were in this routine reaches the threshold. If that happens
           twice, start or increase throttling. */

        if ((bytes_dirty_period > bytes_dirty_threshold) &&
            (++rs->dirty_rate_high_cnt >= 2)) {
            trace_migration_throttle();
            rs->dirty_rate_high_cnt = 0;
            mig_throttle_guest_down(bytes_dirty_period,
                                    bytes_dirty_threshold);
        }
    }
}
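
/*
 * Illustrative numbers: with throttle_trigger_threshold = 50 (percent)
 * and 200 MB transferred in the last period, bytes_dirty_threshold is
 * 100 MB; dirtying more than that in two consecutive periods invokes
 * mig_throttle_guest_down() above.
 */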
static void migration_bitmap_sync(RAMState *rs)
{
    RAMBlock *block;
    int64_t end_time;

    ram_counters.dirty_sync_count++;

    if (!rs->time_last_bitmap_sync) {
        rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
    }

    trace_migration_bitmap_sync_start();
    memory_global_dirty_log_sync();

    qemu_mutex_lock(&rs->bitmap_mutex);
    WITH_RCU_READ_LOCK_GUARD() {
        RAMBLOCK_FOREACH_NOT_IGNORED(block) {
            ramblock_sync_dirty_bitmap(rs, block);
        }
        ram_counters.remaining = ram_bytes_remaining();
    }
    qemu_mutex_unlock(&rs->bitmap_mutex);

    memory_global_after_dirty_log_sync();
    trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);

    end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);

    /* more than 1 second = 1000 milliseconds */
    if (end_time > rs->time_last_bitmap_sync + 1000) {
        migration_trigger_throttle(rs);

        migration_update_rates(rs, end_time);

        rs->target_page_count_prev = rs->target_page_count;

        /* reset period counters */
        rs->time_last_bitmap_sync = end_time;
        rs->num_dirty_pages_period = 0;
        rs->bytes_xfer_prev = ram_counters.transferred;
    }
    if (migrate_use_events()) {
        qapi_event_send_migration_pass(ram_counters.dirty_sync_count);
    }
}

static void migration_bitmap_sync_precopy(RAMState *rs)
{
    Error *local_err = NULL;

    /*
     * The current notifier usage is just an optimization to migration, so we
     * don't stop the normal migration process in the error case.
     */
    if (precopy_notify(PRECOPY_NOTIFY_BEFORE_BITMAP_SYNC, &local_err)) {
        error_report_err(local_err);
        local_err = NULL;
    }

    migration_bitmap_sync(rs);

    if (precopy_notify(PRECOPY_NOTIFY_AFTER_BITMAP_SYNC, &local_err)) {
        error_report_err(local_err);
    }
}
void ram_release_page(const char *rbname, uint64_t offset)
{
    if (!migrate_release_ram() || !migration_in_postcopy()) {
        return;
    }

    ram_discard_range(rbname, offset, TARGET_PAGE_SIZE);
}

/**
 * save_zero_page_to_file: send the zero page to the file
 *
 * Returns the size of data written to the file, 0 means the page is not
 * a zero page
 *
 * @rs: current RAM state
 * @file: the file where the data is saved
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 */
static int save_zero_page_to_file(RAMState *rs, QEMUFile *file,
                                  RAMBlock *block, ram_addr_t offset)
{
    uint8_t *p = block->host + offset;
    int len = 0;

    if (buffer_is_zero(p, TARGET_PAGE_SIZE)) {
        len += save_page_header(rs, file, block, offset | RAM_SAVE_FLAG_ZERO);
        qemu_put_byte(file, 0);
        len += 1;
        ram_release_page(block->idstr, offset);
    }
    return len;
}

/**
 * save_zero_page: send the zero page to the stream
 *
 * Returns the number of pages written.
 *
 * @rs: current RAM state
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 */
static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
{
    int len = save_zero_page_to_file(rs, rs->f, block, offset);

    if (len) {
        ram_counters.duplicate++;
        ram_transferred_add(len);
        return 1;
    }
    return -1;
}
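
/*
 * On the wire a zero page is thus just the page header plus a single
 * byte (the fill pattern, always 0 here).  Hedged sketch of the
 * consuming side, modeled on ram_handle_compressed() in the original
 * file: only dirty the page when it isn't already all zeros.
 */
static void ram_zero_page_apply(void *host, uint8_t ch, uint64_t size)
{
    if (ch != 0 || !buffer_is_zero(host, size)) {
        memset(host, ch, size);
    }
}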
/*
 * @pages: the number of pages written by the control path,
 *         < 0 - error
 *         > 0 - number of pages written
 *
 * Return true if the pages have been saved, otherwise false is returned.
 */
static bool control_save_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
                              int *pages)
{
    uint64_t bytes_xmit = 0;
    int ret;

    ret = ram_control_save_page(rs->f, block->offset, offset, TARGET_PAGE_SIZE,
                                &bytes_xmit);
    if (ret == RAM_SAVE_CONTROL_NOT_SUPP) {
        return false;
    }

    if (bytes_xmit) {
        ram_transferred_add(bytes_xmit);
    }

    if (ret == RAM_SAVE_CONTROL_DELAYED) {
        return true;
    }

    if (bytes_xmit > 0) {
        ram_counters.normal++;
    } else if (bytes_xmit == 0) {
        ram_counters.duplicate++;
    }

    return true;
}

/**
 * directly send the page to the stream
 *
 * Returns the number of pages written.
 *
 * @rs: current RAM state
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 * @buf: the page to be sent
 * @async: send the page asynchronously
 */
static int save_normal_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
                            uint8_t *buf, bool async)
{
    ram_transferred_add(save_page_header(rs, rs->f, block,
                                         offset | RAM_SAVE_FLAG_PAGE));
    if (async) {
        qemu_put_buffer_async(rs->f, buf, TARGET_PAGE_SIZE,
                              migrate_release_ram() &&
                              migration_in_postcopy());
    } else {
        qemu_put_buffer(rs->f, buf, TARGET_PAGE_SIZE);
    }
    ram_transferred_add(TARGET_PAGE_SIZE);
    ram_counters.normal++;
    return 1;
}
/**
 * ram_save_page: send the given page to the stream
 *
 * Returns the number of pages written.
 *          < 0 - error
 *          >=0 - Number of pages written - this might legally be 0
 *                if xbzrle noticed the page was the same.
 *
 * @rs: current RAM state
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 */
static int ram_save_page(RAMState *rs, PageSearchStatus *pss)
{
    int pages = -1;
    uint8_t *p;
    bool send_async = true;
    RAMBlock *block = pss->block;
    ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
    ram_addr_t current_addr = block->offset + offset;

    p = block->host + offset;
    trace_ram_save_page(block->idstr, (uint64_t)offset, p);

    XBZRLE_cache_lock();
    if (rs->xbzrle_enabled && !migration_in_postcopy()) {
        pages = save_xbzrle_page(rs, &p, current_addr, block,
                                 offset);
        if (!rs->last_stage) {
            /* Can't send this cached data async, since the cache page
             * might get updated before it gets to the wire
             */
            send_async = false;
        }
    }

    /* XBZRLE overflow or normal page */
    if (pages == -1) {
        pages = save_normal_page(rs, block, offset, p, send_async);
    }

    XBZRLE_cache_unlock();
    return pages;
}

static int ram_save_multifd_page(RAMState *rs, RAMBlock *block,
                                 ram_addr_t offset)
{
    if (multifd_queue_page(rs->f, block, offset) < 0) {
        return -1;
    }
    ram_counters.normal++;
    return 1;
}
static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
                                 ram_addr_t offset, uint8_t *source_buf)
{
    RAMState *rs = ram_state;
    uint8_t *p = block->host + offset;
    int ret;

    if (save_zero_page_to_file(rs, f, block, offset)) {
        return true;
    }

    save_page_header(rs, f, block, offset | RAM_SAVE_FLAG_COMPRESS_PAGE);

    /*
     * copy it to an internal buffer to avoid it being modified by the VM
     * so that we can catch the error during compression and
     * decompression
     */
    memcpy(source_buf, p, TARGET_PAGE_SIZE);
    ret = qemu_put_compression_data(f, stream, source_buf, TARGET_PAGE_SIZE);
    if (ret < 0) {
        qemu_file_set_error(migrate_get_current()->to_dst_file, ret);
        error_report("compressed data failed!");
    }
    return false;
}

static void
update_compress_thread_counts(const CompressParam *param, int bytes_xmit)
{
    ram_transferred_add(bytes_xmit);

    if (param->zero_page) {
        ram_counters.duplicate++;
        return;
    }

    /* 8 means a header with RAM_SAVE_FLAG_CONTINUE. */
    compression_counters.compressed_size += bytes_xmit - 8;
    compression_counters.pages++;
}
static bool save_page_use_compression(RAMState *rs);

static void flush_compressed_data(RAMState *rs)
{
    int idx, len, thread_count;

    if (!save_page_use_compression(rs)) {
        return;
    }
    thread_count = migrate_compress_threads();

    qemu_mutex_lock(&comp_done_lock);
    for (idx = 0; idx < thread_count; idx++) {
        while (!comp_param[idx].done) {
            qemu_cond_wait(&comp_done_cond, &comp_done_lock);
        }
    }
    qemu_mutex_unlock(&comp_done_lock);

    for (idx = 0; idx < thread_count; idx++) {
        qemu_mutex_lock(&comp_param[idx].mutex);
        if (!comp_param[idx].quit) {
            len = qemu_put_qemu_file(rs->f, comp_param[idx].file);
            /*
             * it's safe to fetch zero_page without holding comp_done_lock
             * as there is no further request submitted to the thread,
             * i.e, the thread should be waiting for a request at this point.
             */
            update_compress_thread_counts(&comp_param[idx], len);
        }
        qemu_mutex_unlock(&comp_param[idx].mutex);
    }
}

static inline void set_compress_params(CompressParam *param, RAMBlock *block,
                                       ram_addr_t offset)
{
    param->block = block;
    param->offset = offset;
}

static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block,
                                           ram_addr_t offset)
{
    int idx, thread_count, bytes_xmit = -1, pages = -1;
    bool wait = migrate_compress_wait_thread();

    thread_count = migrate_compress_threads();
    qemu_mutex_lock(&comp_done_lock);
retry:
    for (idx = 0; idx < thread_count; idx++) {
        if (comp_param[idx].done) {
            comp_param[idx].done = false;
            bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file);
            qemu_mutex_lock(&comp_param[idx].mutex);
            set_compress_params(&comp_param[idx], block, offset);
            qemu_cond_signal(&comp_param[idx].cond);
            qemu_mutex_unlock(&comp_param[idx].mutex);

            update_compress_thread_counts(&comp_param[idx], bytes_xmit);
            pages = 1;
            break;
        }
    }

    /*
     * wait for the free thread if the user specifies 'compress-wait-thread',
     * otherwise we will post the page out in the main thread as normal page.
     */
    if (pages < 0 && wait) {
        qemu_cond_wait(&comp_done_cond, &comp_done_lock);
        goto retry;
    }
    qemu_mutex_unlock(&comp_done_lock);

    return pages;
}
/**
 * find_dirty_block: find the next dirty page and update any state
 * associated with the search process.
 *
 * Returns true if a page is found
 *
 * @rs: current RAM state
 * @pss: data about the state of the current dirty page scan
 * @again: set to false if the search has scanned the whole of RAM
 */
static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again)
{
    /*
     * This is not a postcopy requested page, mark it "not urgent", and use
     * precopy channel to send it.
     */
    pss->postcopy_requested = false;
    pss->postcopy_target_channel = RAM_CHANNEL_PRECOPY;

    pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
    if (pss->complete_round && pss->block == rs->last_seen_block &&
        pss->page >= rs->last_page) {
        /*
         * We've been once around the RAM and haven't found anything.
         * Give up.
         */
        *again = false;
        return false;
    }
    if (!offset_in_ramblock(pss->block,
                            ((ram_addr_t)pss->page) << TARGET_PAGE_BITS)) {
        /* Didn't find anything in this RAM Block */
        pss->page = 0;
        pss->block = QLIST_NEXT_RCU(pss->block, next);
        if (!pss->block) {
            /*
             * If memory migration starts over, we will meet a dirtied page
             * which may still exist in the compression threads' ring, so we
             * should flush the compressed data to make sure the new page
             * is not overwritten by the old one in the destination.
             *
             * Also, if xbzrle is on, stop using the data compression at this
             * point. In theory, xbzrle can do better than compression.
             */
            flush_compressed_data(rs);

            /* Hit the end of the list */
            pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
            /* Flag that we've looped */
            pss->complete_round = true;
            /* After the first round, enable XBZRLE. */
            if (migrate_use_xbzrle()) {
                rs->xbzrle_enabled = true;
            }
        }
        /* Didn't find anything this time, but try again on the new block */
        *again = true;
        return false;
    } else {
        /* Can go around again, but... */
        *again = true;
        /* We've found something so probably don't need to */
        return true;
    }
}
/**
 * unqueue_page: gets a page of the queue
 *
 * Helper for 'get_queued_page' - gets a page off the queue
 *
 * Returns the block of the page (or NULL if none available)
 *
 * @rs: current RAM state
 * @offset: used to return the offset within the RAMBlock
 */
static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
{
    struct RAMSrcPageRequest *entry;
    RAMBlock *block = NULL;

    if (!postcopy_has_request(rs)) {
        return NULL;
    }

    QEMU_LOCK_GUARD(&rs->src_page_req_mutex);

    /*
     * This should _never_ change even after we take the lock, because no one
     * should be taking anything off the request list other than us.
     */
    assert(postcopy_has_request(rs));

    entry = QSIMPLEQ_FIRST(&rs->src_page_requests);
    block = entry->rb;
    *offset = entry->offset;

    if (entry->len > TARGET_PAGE_SIZE) {
        entry->len -= TARGET_PAGE_SIZE;
        entry->offset += TARGET_PAGE_SIZE;
    } else {
        memory_region_unref(block->mr);
        QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
        g_free(entry);
        migration_consume_urgent_request();
    }

    return block;
}
#if defined(__linux__)
/**
 * poll_fault_page: try to get next UFFD write fault page and, if pending fault
 *   is found, return RAM block pointer and page offset
 *
 * Returns pointer to the RAMBlock containing faulting page,
 *   NULL if no write faults are pending
 *
 * @rs: current RAM state
 * @offset: page offset from the beginning of the block
 */
static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
{
    struct uffd_msg uffd_msg;
    void *page_address;
    RAMBlock *block;
    int res;

    if (!migrate_background_snapshot()) {
        return NULL;
    }

    res = uffd_read_events(rs->uffdio_fd, &uffd_msg, 1);
    if (res <= 0) {
        return NULL;
    }

    page_address = (void *)(uintptr_t) uffd_msg.arg.pagefault.address;
    block = qemu_ram_block_from_host(page_address, false, offset);
    assert(block && (block->flags & RAM_UF_WRITEPROTECT) != 0);
    return block;
}

/**
 * ram_save_release_protection: release UFFD write protection after
 *   a range of pages has been saved
 *
 * @rs: current RAM state
 * @pss: page-search-status structure
 * @start_page: index of the first page in the range relative to pss->block
 *
 * Returns 0 on success, negative value in case of an error
 */
static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
                                       unsigned long start_page)
{
    int res = 0;

    /* Check if page is from UFFD-managed region. */
    if (pss->block->flags & RAM_UF_WRITEPROTECT) {
        void *page_address = pss->block->host + (start_page << TARGET_PAGE_BITS);
        uint64_t run_length = (pss->page - start_page) << TARGET_PAGE_BITS;

        /* Flush async buffers before un-protect. */
        qemu_fflush(rs->f);
        /* Un-protect memory range. */
        res = uffd_change_protection(rs->uffdio_fd, page_address, run_length,
                                     false, false);
    }

    return res;
}

/* ram_write_tracking_available: check if kernel supports required UFFD features
 *
 * Returns true if supported, false otherwise
 */
bool ram_write_tracking_available(void)
{
    uint64_t uffd_features;
    int res;

    res = uffd_query_features(&uffd_features);
    return (res == 0 &&
            (uffd_features & UFFD_FEATURE_PAGEFAULT_FLAG_WP) != 0);
}

/* ram_write_tracking_compatible: check if guest configuration is
 *   compatible with 'write-tracking' migration
 *
 * Returns true if compatible, false otherwise
 */
bool ram_write_tracking_compatible(void)
{
    const uint64_t uffd_ioctls_mask = BIT(_UFFDIO_WRITEPROTECT);
    int uffd_fd;
    RAMBlock *block;
    bool ret = false;

    /* Open UFFD file descriptor */
    uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, false);
    if (uffd_fd < 0) {
        return false;
    }

    RCU_READ_LOCK_GUARD();

    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        uint64_t uffd_ioctls;

        /* Nothing to do with read-only and MMIO-writable regions */
        if (block->mr->readonly || block->mr->rom_device) {
            continue;
        }

        /* Try to register block memory via UFFD-IO to track writes */
        if (uffd_register_memory(uffd_fd, block->host, block->max_length,
                                 UFFDIO_REGISTER_MODE_WP, &uffd_ioctls)) {
            goto out;
        }
        if ((uffd_ioctls & uffd_ioctls_mask) != uffd_ioctls_mask) {
            goto out;
        }
    }
    ret = true;

out:
    uffd_close_fd(uffd_fd);
    return ret;
}

static inline void populate_read_range(RAMBlock *block, ram_addr_t offset,
                                       ram_addr_t size)
{
    /*
     * We read one byte of each page; this will preallocate page tables if
     * required and populate the shared zeropage on MAP_PRIVATE anonymous memory
     * where no page was populated yet. This might require adaptation when
     * supporting other mappings, like shmem.
     */
    for (; offset < size; offset += block->page_size) {
        char tmp = *((char *)block->host + offset);

        /* Don't optimize the read out */
        asm volatile("" : "+r" (tmp));
    }
}
static inline int populate_read_section(MemoryRegionSection *section,
                                        void *opaque)
{
    const hwaddr size = int128_get64(section->size);
    hwaddr offset = section->offset_within_region;
    RAMBlock *block = section->mr->ram_block;

    populate_read_range(block, offset, size);
    return 0;
}

/*
 * ram_block_populate_read: preallocate page tables and populate pages in the
 *   RAM block by reading a byte of each page.
 *
 * Since it's solely used for userfault_fd WP feature, here we just
 * hardcode page size to qemu_real_host_page_size.
 *
 * @block: RAM block to populate
 */
static void ram_block_populate_read(RAMBlock *rb)
{
    /*
     * Skip populating all pages that fall into a discarded range as managed by
     * a RamDiscardManager responsible for the mapped memory region of the
     * RAMBlock. Such discarded ("logically unplugged") parts of a RAMBlock
     * must not get populated automatically. We don't have to track
     * modifications via userfaultfd WP reliably, because these pages will
     * not be part of the migration stream either way -- see
     * ramblock_dirty_bitmap_exclude_discarded_pages().
     *
     * Note: The result is only stable while migrating (precopy/postcopy).
     */
    if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
        RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
        MemoryRegionSection section = {
            .mr = rb->mr,
            .offset_within_region = 0,
            .size = rb->mr->size,
        };

        ram_discard_manager_replay_populated(rdm, &section,
                                             populate_read_section, NULL);
    } else {
        populate_read_range(rb, 0, rb->used_length);
    }
}
/*
 * ram_write_tracking_prepare: prepare for UFFD-WP memory tracking
 */
void ram_write_tracking_prepare(void)
{
    RAMBlock *block;

    RCU_READ_LOCK_GUARD();

    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        /* Nothing to do with read-only and MMIO-writable regions */
        if (block->mr->readonly || block->mr->rom_device) {
            continue;
        }

        /*
         * Populate pages of the RAM block before enabling userfault_fd
         * write protection.
         *
         * This stage is required since ioctl(UFFDIO_WRITEPROTECT) with
         * UFFDIO_WRITEPROTECT_MODE_WP mode setting would silently skip
         * pages with pte_none() entries in the page table.
         */
        ram_block_populate_read(block);
    }
}

/*
 * ram_write_tracking_start: start UFFD-WP memory tracking
 *
 * Returns 0 for success or negative value in case of error
 */
int ram_write_tracking_start(void)
{
    int uffd_fd;
    RAMState *rs = ram_state;
    RAMBlock *block;

    /* Open UFFD file descriptor */
    uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, true);
    if (uffd_fd < 0) {
        return uffd_fd;
    }
    rs->uffdio_fd = uffd_fd;

    RCU_READ_LOCK_GUARD();

    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        /* Nothing to do with read-only and MMIO-writable regions */
        if (block->mr->readonly || block->mr->rom_device) {
            continue;
        }

        /* Register block memory with UFFD to track writes */
        if (uffd_register_memory(rs->uffdio_fd, block->host,
                                 block->max_length, UFFDIO_REGISTER_MODE_WP, NULL)) {
            goto fail;
        }
        /* Apply UFFD write protection to the block memory range */
        if (uffd_change_protection(rs->uffdio_fd, block->host,
                                   block->max_length, true, false)) {
            goto fail;
        }
        block->flags |= RAM_UF_WRITEPROTECT;
        memory_region_ref(block->mr);

        trace_ram_write_tracking_ramblock_start(block->idstr, block->page_size,
                                                block->host, block->max_length);
    }

    return 0;

fail:
    error_report("ram_write_tracking_start() failed: restoring initial memory state");

    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        if ((block->flags & RAM_UF_WRITEPROTECT) == 0) {
            continue;
        }
        /*
         * In case some memory block failed to be write-protected,
         * remove protection and unregister all RAM blocks that succeeded.
         */
        uffd_change_protection(rs->uffdio_fd, block->host, block->max_length,
                               false, false);
        uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length);
        /* Cleanup flags and remove reference */
        block->flags &= ~RAM_UF_WRITEPROTECT;
        memory_region_unref(block->mr);
    }

    uffd_close_fd(uffd_fd);
    rs->uffdio_fd = -1;
    return -1;
}

/**
 * ram_write_tracking_stop: stop UFFD-WP memory tracking and remove protection
 */
void ram_write_tracking_stop(void)
{
    RAMState *rs = ram_state;
    RAMBlock *block;

    RCU_READ_LOCK_GUARD();

    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        if ((block->flags & RAM_UF_WRITEPROTECT) == 0) {
            continue;
        }
        /* Remove protection and unregister all affected RAM blocks */
        uffd_change_protection(rs->uffdio_fd, block->host, block->max_length,
                               false, false);
        uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length);

        trace_ram_write_tracking_ramblock_stop(block->idstr, block->page_size,
                                               block->host, block->max_length);

        /* Cleanup flags and remove reference */
        block->flags &= ~RAM_UF_WRITEPROTECT;
        memory_region_unref(block->mr);
    }

    /* Finally close UFFD file descriptor */
    uffd_close_fd(rs->uffdio_fd);
}
#else
/* No target OS support, stubs just fail or ignore */

static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
{
    (void) rs;
    (void) offset;

    return NULL;
}

static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
                                       unsigned long start_page)
{
    (void) rs;
    (void) pss;
    (void) start_page;

    return 0;
}

bool ram_write_tracking_available(void)
{
    return false;
}

bool ram_write_tracking_compatible(void)
{
    assert(0);
    return false;
}

int ram_write_tracking_start(void)
{
    assert(0);
    return -1;
}

void ram_write_tracking_stop(void)
{
    assert(0);
}
#endif /* defined(__linux__) */

/*
 * Check whether two addr/offset of the ramblock fall on the same host huge
 * page. Returns true if so, false otherwise.
 */
static bool offset_on_same_huge_page(RAMBlock *rb, uint64_t addr1,
                                     uint64_t addr2)
{
    size_t page_size = qemu_ram_pagesize(rb);

    addr1 = ROUND_DOWN(addr1, page_size);
    addr2 = ROUND_DOWN(addr2, page_size);

    return addr1 == addr2;
}
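
/*
 * Example with a 2 MiB backing page: addr1 = 0x201000 and
 * addr2 = 0x3ff000 both round down to 0x200000, so they sit on the same
 * huge page; 0x400000 would not.
 */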
/*
 * Whether a previous preempted precopy huge page contains current requested
 * page? Returns true if so, false otherwise.
 *
 * This should really happen very rarely, because it means when we were sending
 * during background migration for postcopy we're sending exactly the page that
 * some vcpu got faulted on, on the dest node. When it happens, we probably don't
 * need to do much but drop the request, because we know right after we restore
 * the precopy stream it'll be serviced. It'll slightly affect the order of
 * postcopy requests to be serviced (e.g. it'll be the same as we move current
 * request to the end of the queue) but it shouldn't be a big deal. The most
 * important thing is we can _never_ try to send a partial-sent huge page on the
 * POSTCOPY channel again, otherwise that huge page will get "split brain" on
 * two channels (PRECOPY, POSTCOPY).
 */
static bool postcopy_preempted_contains(RAMState *rs, RAMBlock *block,
                                        ram_addr_t offset)
{
    PostcopyPreemptState *state = &rs->postcopy_preempt_state;

    /* No preemption at all? */
    if (!state->preempted) {
        return false;
    }

    /* Not even the same ramblock? */
    if (state->ram_block != block) {
        return false;
    }

    return offset_on_same_huge_page(block, offset,
                                    state->ram_page << TARGET_PAGE_BITS);
}
/**
 * get_queued_page: unqueue a page from the postcopy requests
 *
 * Skips pages that are already sent (!dirty)
 *
 * Returns true if a queued page is found
 *
 * @rs: current RAM state
 * @pss: data about the state of the current dirty page scan
 */
static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
{
    RAMBlock *block;
    ram_addr_t offset;
    bool dirty;

    do {
        block = unqueue_page(rs, &offset);
        /*
         * We're sending this page, and since it's postcopy nothing else
         * will dirty it, and we must make sure it doesn't get sent again
         * even if this queue request was received after the background
         * search already sent it.
         */
        if (block) {
            page = offset >> TARGET_PAGE_BITS;
            dirty = test_bit(page, block->bmap);
            if (!dirty) {
                trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
                                                page);
            } else {
                trace_get_queued_page(block->idstr, (uint64_t)offset, page);
            }
        }

    } while (block && !dirty);

    if (block) {
        /* See comment above postcopy_preempted_contains() */
        if (postcopy_preempted_contains(rs, block, offset)) {
            trace_postcopy_preempt_hit(block->idstr, offset);
            /*
             * If what we preempted previously was exactly what we're
             * requesting right now, restore the preempted precopy
             * immediately, boosting its priority as it's requested by
             * postcopy.
             */
            postcopy_preempt_restore(rs, pss, true);
            return true;
        }
    } else {
        /*
         * Poll write faults too if background snapshot is enabled; that's
         * when we have vcpus got blocked by the write protected pages.
         */
        block = poll_fault_page(rs, &offset);
    }

    if (block) {
        /*
         * We want the background search to continue from the queued page
         * since the guest is likely to want other pages near to the page
         * it just requested.
         */
        pss->block = block;
        pss->page = offset >> TARGET_PAGE_BITS;

        /*
         * This unqueued page would break the "one round" check, even if
         */
        pss->complete_round = false;
        /* Mark it an urgent request, meanwhile using POSTCOPY channel */
        pss->postcopy_requested = true;
        pss->postcopy_target_channel = RAM_CHANNEL_POSTCOPY;
    }

    return !!block;
}

/**
 * migration_page_queue_free: drop any remaining pages in the ram
 * request queue
 *
 * It should be empty at the end anyway, but in error cases there may
 * be some left. In case there is any page left, we drop it.
 */
static void migration_page_queue_free(RAMState *rs)
{
    struct RAMSrcPageRequest *mspr, *next_mspr;
    /* This queue generally should be empty - but in the case of a failed
     * migration might have some droppings in.
     */
    RCU_READ_LOCK_GUARD();
    QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
        memory_region_unref(mspr->rb->mr);
        QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
        g_free(mspr);
    }
}
2157 * ram_save_queue_pages: queue the page for transmission
2159 * A request from postcopy destination for example.
2161 * Returns zero on success or negative on error
2163 * @rbname: Name of the RAMBlock of the request. NULL means the
2164 * same as the last one.
2165 * @start: starting address from the start of the RAMBlock
2166 * @len: length (in bytes) to send
2168 int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
2171 RAMState *rs = ram_state;
2173 ram_counters.postcopy_requests++;
2174 RCU_READ_LOCK_GUARD();
2177 /* Reuse last RAMBlock */
2178 ramblock = rs->last_req_rb;
2182 * Shouldn't happen, we can't reuse the last RAMBlock if
2183 * it's the 1st request.
2185 error_report("ram_save_queue_pages no previous block");
2189 ramblock = qemu_ram_block_by_name(rbname);
2192 /* We shouldn't be asked for a non-existent RAMBlock */
2193 error_report("ram_save_queue_pages no block '%s'", rbname);
2196 rs->last_req_rb = ramblock;
2198 trace_ram_save_queue_pages(ramblock->idstr, start, len);
2199 if (!offset_in_ramblock(ramblock, start + len - 1)) {
2200 error_report("%s request overrun start=" RAM_ADDR_FMT " len="
2201 RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
2202 __func__, start, len, ramblock->used_length);
2206 struct RAMSrcPageRequest *new_entry =
2207 g_new0(struct RAMSrcPageRequest, 1);
2208 new_entry->rb = ramblock;
2209 new_entry->offset = start;
2210 new_entry->len = len;
2212 memory_region_ref(ramblock->mr);
2213 qemu_mutex_lock(&rs->src_page_req_mutex);
2214 QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
2215 migration_make_urgent_request();
2216 qemu_mutex_unlock(&rs->src_page_req_mutex);
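/*
 * Usage sketch (hedged; the block name is hypothetical): a destination
 * fault on the third target page of "pc.ram" would be queued as below.
 * A follow-up request for the same block may pass rbname == NULL to
 * reuse rs->last_req_rb.
 */
#if 0
if (ram_save_queue_pages("pc.ram", 2 * TARGET_PAGE_SIZE,
                         TARGET_PAGE_SIZE) < 0) {
    /* unknown block, no previous block, or range overruns used_length */
}
#endif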
2221 static bool save_page_use_compression(RAMState *rs)
2223 if (!migrate_use_compression()) {
2228 * If xbzrle is enabled (e.g., after first round of migration), stop
2229 * using the data compression. In theory, xbzrle can do better than compression.
2232 if (rs->xbzrle_enabled) {
2240 * try to compress the page before posting it out, return true if the page
2241 * has been properly handled by compression, otherwise needs other
2242 * paths to handle it
2244 static bool save_compress_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
2246 if (!save_page_use_compression(rs)) {
2251 * When starting the process of a new block, the first page of
2252 * the block should be sent out before other pages in the same
2253 * block, and all the pages in the last block should have been sent
2254 * out, keeping this order is important, because the 'cont' flag
2255 * is used to avoid resending the block name.
2257 * We post the first page as a normal page because compression will take
2258 * much CPU resource.
2260 if (block != rs->last_sent_block) {
2261 flush_compressed_data(rs);
2265 if (compress_page_with_multi_thread(rs, block, offset) > 0) {
2269 compression_counters.busy++;
2274 * ram_save_target_page: save one target page
2276 * Returns the number of pages written
2278 * @rs: current RAM state
2279 * @pss: data about the page we want to send
2281 static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss)
2283 RAMBlock *block = pss->block;
2284 ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
2287 if (control_save_page(rs, block, offset, &res)) {
2291 if (save_compress_page(rs, block, offset)) {
2295 res = save_zero_page(rs, block, offset);
2297 /* Must let xbzrle know, otherwise a previous (now 0'd) cached
2298 * page would be stale
2300 if (rs->xbzrle_enabled) {
2301 XBZRLE_cache_lock();
2302 xbzrle_cache_zero_page(rs, block->offset + offset);
2303 XBZRLE_cache_unlock();
2309 * Do not use multifd in postcopy as one whole host page should be
2310 * placed. Meanwhile postcopy requires atomic update of pages, so even
2311 * if host page size == guest page size, the running dest guest may
2312 * still see partially copied pages, which is data corruption.
2314 if (migrate_use_multifd() && !migration_in_postcopy()) {
2315 return ram_save_multifd_page(rs, block, offset);
2318 return ram_save_page(rs, pss);
2321 static bool postcopy_needs_preempt(RAMState *rs, PageSearchStatus *pss)
2323 MigrationState *ms = migrate_get_current();
2325 /* Eager preempt not enabled? Then never do that. */
2326 if (!migrate_postcopy_preempt()) {
2330 /* If the user explicitly disabled breaking of huge pages, skip */
2331 if (!ms->postcopy_preempt_break_huge) {
2335 /* If the ramblock we're sending uses small pages, never bother. */
2336 if (qemu_ram_pagesize(pss->block) == TARGET_PAGE_SIZE) {
2340 /* Not in postcopy at all? */
2341 if (!migration_in_postcopy()) {
2346 * If we're already handling a postcopy request, don't preempt as this page
2347 * already has the same high priority.
2349 if (pss->postcopy_requested) {
2353 /* If there are postcopy requests, then check them! */
2354 return postcopy_has_request(rs);
2357 /* Preempt precopy: cache the current PSS so it can be restored later */
2358 static void postcopy_do_preempt(RAMState *rs, PageSearchStatus *pss)
2360 PostcopyPreemptState *p_state = &rs->postcopy_preempt_state;
2362 trace_postcopy_preempt_triggered(pss->block->idstr, pss->page);
2365 * Time to preempt precopy. Cache current PSS into preempt state, so that
2366 * after handling the postcopy pages we can recover to it. We need to do
2367 * so because the dest VM will have part of the precopy huge page kept
2368 * over in its tmp huge page cache; better to move on with it when we can.
2370 p_state->ram_block = pss->block;
2371 p_state->ram_page = pss->page;
2372 p_state->preempted = true;
2375 /* Whether we're preempted by a postcopy request during sending a huge page */
2376 static bool postcopy_preempt_triggered(RAMState *rs)
2378 return rs->postcopy_preempt_state.preempted;
2381 static void postcopy_preempt_restore(RAMState *rs, PageSearchStatus *pss,
2382 bool postcopy_requested)
2384 PostcopyPreemptState *state = &rs->postcopy_preempt_state;
2386 assert(state->preempted);
2388 pss->block = state->ram_block;
2389 pss->page = state->ram_page;
2391 /* Whether this is a postcopy request */
2392 pss->postcopy_requested = postcopy_requested;
2394 * When restoring a preempted page, the old data resides in PRECOPY
2395 * slow channel, even if postcopy_requested is set. So always use
2396 * PRECOPY channel here.
2398 pss->postcopy_target_channel = RAM_CHANNEL_PRECOPY;
2400 trace_postcopy_preempt_restored(pss->block->idstr, pss->page);
2402 /* Reset preempt state, most importantly, set preempted==false */
2403 postcopy_preempt_reset(rs);
2406 static void postcopy_preempt_choose_channel(RAMState *rs, PageSearchStatus *pss)
2408 MigrationState *s = migrate_get_current();
2409 unsigned int channel = pss->postcopy_target_channel;
2412 if (channel != rs->postcopy_channel) {
2413 if (channel == RAM_CHANNEL_PRECOPY) {
2414 next = s->to_dst_file;
2416 next = s->postcopy_qemufile_src;
2418 /* Update and cache the current channel */
2420 rs->postcopy_channel = channel;
2423 * If channel switched, reset last_sent_block since the old sent block
2424 * may not be on the same channel.
2426 rs->last_sent_block = NULL;
2428 trace_postcopy_preempt_switch_channel(channel);
2431 trace_postcopy_preempt_send_host_page(pss->block->idstr, pss->page);
2434 /* We need to make sure rs->f always points to the default channel elsewhere */
2435 static void postcopy_preempt_reset_channel(RAMState *rs)
2437 if (postcopy_preempt_active()) {
2438 rs->postcopy_channel = RAM_CHANNEL_PRECOPY;
2439 rs->f = migrate_get_current()->to_dst_file;
2440 trace_postcopy_preempt_reset_channel();
2445 * ram_save_host_page: save a whole host page
2447 * Starting at *offset send pages up to the end of the current host
2448 * page. It's valid for the initial offset to point into the middle of
2449 * a host page in which case the remainder of the hostpage is sent.
2450 * Only dirty target pages are sent. Note that the host page size may
2451 * be a huge page for this block.
2452 * The saving stops at the boundary of the used_length of the block
2453 * if the RAMBlock isn't a multiple of the host page size.
2455 * Returns the number of pages written or negative on error
2457 * @rs: current RAM state
2458 * @pss: data about the page we want to send
2460 static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss)
2462 int tmppages, pages = 0;
2463 size_t pagesize_bits =
2464 qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
2465 unsigned long hostpage_boundary =
2466 QEMU_ALIGN_UP(pss->page + 1, pagesize_bits);
2467 unsigned long start_page = pss->page;
2470 if (ramblock_is_ignored(pss->block)) {
2471 error_report("block %s should not be migrated !", pss->block->idstr);
2475 if (postcopy_preempt_active()) {
2476 postcopy_preempt_choose_channel(rs, pss);
2480 if (postcopy_needs_preempt(rs, pss)) {
2481 postcopy_do_preempt(rs, pss);
2485 /* Check whether the page is dirty and, if so, send it */
2486 if (migration_bitmap_clear_dirty(rs, pss->block, pss->page)) {
2487 tmppages = ram_save_target_page(rs, pss);
2494 * Allow rate limiting to happen in the middle of huge pages if
2495 * something is sent in the current iteration.
2497 if (pagesize_bits > 1 && tmppages > 0) {
2498 migration_rate_limit();
2501 pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
2502 } while ((pss->page < hostpage_boundary) &&
2503 offset_in_ramblock(pss->block,
2504 ((ram_addr_t)pss->page) << TARGET_PAGE_BITS));
2505 /* The offset we leave with is the min boundary of host page and block */
2506 pss->page = MIN(pss->page, hostpage_boundary);
2509 * With postcopy preempt mode, flush the data as soon as possible for
2510 * postcopy requests, because we've already sent a whole huge page, so the
2511 * dst node should already have enough resources to atomically fill in
2512 * the current missing page.
2514 * More importantly, when using a separate postcopy channel, we must do
2515 * an explicit flush or it won't flush until the buffer is full.
2517 if (migrate_postcopy_preempt() && pss->postcopy_requested) {
2521 res = ram_save_release_protection(rs, pss, start_page);
2522 return (res < 0 ? res : pages);
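/*
 * Worked example (hedged): for a 2MiB hugepage block with 4KiB target
 * pages, pagesize_bits == 512. Entering with pss->page == 520 gives
 * hostpage_boundary == QEMU_ALIGN_UP(521, 512) == 1024, so the loop
 * above walks only the dirty remainder of that single host page.
 */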
2526 * ram_find_and_save_block: finds a dirty page and sends it to f
2528 * Called within an RCU critical section.
2530 * Returns the number of pages written where zero means no dirty pages,
2531 * or negative on error
2533 * @rs: current RAM state
2535 * On systems where host-page-size > target-page-size it will send all the
2536 * pages in a host page that are dirty.
2538 static int ram_find_and_save_block(RAMState *rs)
2540 PageSearchStatus pss;
2544 /* No dirty page as there is zero RAM */
2545 if (!ram_bytes_total()) {
2550 * Always keep last_seen_block/last_page valid during this procedure,
2551 * because find_dirty_block() relies on these values (e.g., we compare
2552 * last_seen_block with pss.block to see whether we searched all the
2553 * ramblocks) to detect the completion of migration. Having a NULL value
2554 * of last_seen_block can conditionally cause the loop below to run forever.
2556 if (!rs->last_seen_block) {
2557 rs->last_seen_block = QLIST_FIRST_RCU(&ram_list.blocks);
2561 pss.block = rs->last_seen_block;
2562 pss.page = rs->last_page;
2563 pss.complete_round = false;
2567 found = get_queued_page(rs, &pss);
2571 * Recover previous precopy ramblock/offset if postcopy has
2572 * preempted precopy. Otherwise find the next dirty bit.
2574 if (postcopy_preempt_triggered(rs)) {
2575 postcopy_preempt_restore(rs, &pss, false);
2578 /* priority queue empty, so just search for something dirty */
2579 found = find_dirty_block(rs, &pss, &again);
2584 pages = ram_save_host_page(rs, &pss);
2586 } while (!pages && again);
2588 rs->last_seen_block = pss.block;
2589 rs->last_page = pss.page;
2594 void acct_update_position(QEMUFile *f, size_t size, bool zero)
2596 uint64_t pages = size / TARGET_PAGE_SIZE;
2599 ram_counters.duplicate += pages;
2601 ram_counters.normal += pages;
2602 ram_transferred_add(size);
2603 qemu_file_credit_transfer(f, size);
2607 static uint64_t ram_bytes_total_common(bool count_ignored)
2612 RCU_READ_LOCK_GUARD();
2614 if (count_ignored) {
2615 RAMBLOCK_FOREACH_MIGRATABLE(block) {
2616 total += block->used_length;
2619 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2620 total += block->used_length;
2626 uint64_t ram_bytes_total(void)
2628 return ram_bytes_total_common(false);
2631 static void xbzrle_load_setup(void)
2633 XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE);
2636 static void xbzrle_load_cleanup(void)
2638 g_free(XBZRLE.decoded_buf);
2639 XBZRLE.decoded_buf = NULL;
2642 static void ram_state_cleanup(RAMState **rsp)
2645 migration_page_queue_free(*rsp);
2646 qemu_mutex_destroy(&(*rsp)->bitmap_mutex);
2647 qemu_mutex_destroy(&(*rsp)->src_page_req_mutex);
2653 static void xbzrle_cleanup(void)
2655 XBZRLE_cache_lock();
2657 cache_fini(XBZRLE.cache);
2658 g_free(XBZRLE.encoded_buf);
2659 g_free(XBZRLE.current_buf);
2660 g_free(XBZRLE.zero_target_page);
2661 XBZRLE.cache = NULL;
2662 XBZRLE.encoded_buf = NULL;
2663 XBZRLE.current_buf = NULL;
2664 XBZRLE.zero_target_page = NULL;
2666 XBZRLE_cache_unlock();
2669 static void ram_save_cleanup(void *opaque)
2671 RAMState **rsp = opaque;
2674 /* We don't use dirty log with background snapshots */
2675 if (!migrate_background_snapshot()) {
2676 /* the caller holds the iothread lock or is in a bottom half, so there is
2677 * no write race against the migration bitmap
2679 if (global_dirty_tracking & GLOBAL_DIRTY_MIGRATION) {
2681 * do not stop dirty log without starting it, since
2682 * memory_global_dirty_log_stop will assert that
2683 * memory_global_dirty_log_start/stop are used in pairs
2685 memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION);
2689 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2690 g_free(block->clear_bmap);
2691 block->clear_bmap = NULL;
2692 g_free(block->bmap);
2697 compress_threads_save_cleanup();
2698 ram_state_cleanup(rsp);
2701 static void ram_state_reset(RAMState *rs)
2703 rs->last_seen_block = NULL;
2704 rs->last_sent_block = NULL;
2706 rs->last_version = ram_list.version;
2707 rs->xbzrle_enabled = false;
2708 postcopy_preempt_reset(rs);
2709 rs->postcopy_channel = RAM_CHANNEL_PRECOPY;
2712 #define MAX_WAIT 50 /* ms, half buffered_file limit */
2714 /* **** functions for postcopy ***** */
2716 void ram_postcopy_migrated_memory_release(MigrationState *ms)
2718 struct RAMBlock *block;
2720 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2721 unsigned long *bitmap = block->bmap;
2722 unsigned long range = block->used_length >> TARGET_PAGE_BITS;
2723 unsigned long run_start = find_next_zero_bit(bitmap, range, 0);
2725 while (run_start < range) {
2726 unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
2727 ram_discard_range(block->idstr,
2728 ((ram_addr_t)run_start) << TARGET_PAGE_BITS,
2729 ((ram_addr_t)(run_end - run_start))
2730 << TARGET_PAGE_BITS);
2731 run_start = find_next_zero_bit(bitmap, range, run_end + 1);
2737 * postcopy_send_discard_bm_ram: discard a RAMBlock
2739 * Callback from postcopy_each_ram_send_discard for each RAMBlock
2741 * @ms: current migration state
2742 * @block: RAMBlock to discard
2744 static void postcopy_send_discard_bm_ram(MigrationState *ms, RAMBlock *block)
2746 unsigned long end = block->used_length >> TARGET_PAGE_BITS;
2747 unsigned long current;
2748 unsigned long *bitmap = block->bmap;
2750 for (current = 0; current < end; ) {
2751 unsigned long one = find_next_bit(bitmap, end, current);
2752 unsigned long zero, discard_length;
2758 zero = find_next_zero_bit(bitmap, end, one + 1);
2761 discard_length = end - one;
2763 discard_length = zero - one;
2765 postcopy_discard_send_range(ms, one, discard_length);
2766 current = one + discard_length;
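/*
 * Worked example (hedged): for bitmap bits 0b00111100 (pages 2..5
 * dirty), find_next_bit() yields one == 2 and find_next_zero_bit()
 * yields zero == 6, so a single run (start 2, length 4) is sent via
 * postcopy_discard_send_range() and the scan resumes at page 6.
 */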
2770 static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block);
2773 * postcopy_each_ram_send_discard: discard all RAMBlocks
2775 * Utility for the outgoing postcopy code.
2776 * Calls postcopy_send_discard_bm_ram for each RAMBlock
2777 * passing it bitmap indexes and name.
2778 * (qemu_ram_foreach_block ends up passing unscaled lengths
2779 * which would mean postcopy code would have to deal with target page)
2781 * @ms: current migration state
2783 static void postcopy_each_ram_send_discard(MigrationState *ms)
2785 struct RAMBlock *block;
2787 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2788 postcopy_discard_send_init(ms, block->idstr);
2791 * Deal with TPS != HPS and huge pages. It discards any partially sent
2792 * host-page sized chunks, and marks any partially dirty host-page sized
2793 * chunks as all dirty. In this case the host-page is the host-page
2794 * for the particular RAMBlock, i.e. it might be a huge page.
2796 postcopy_chunk_hostpages_pass(ms, block);
2799 * Postcopy sends chunks of bitmap over the wire, but it
2800 * just needs indexes at this point; this avoids it having
2801 * target-page-specific code.
2803 postcopy_send_discard_bm_ram(ms, block);
2804 postcopy_discard_send_finish(ms);
2809 * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages
2811 * Helper for postcopy_each_ram_send_discard for each RAMBlock.
2815 * Postcopy requires that all target pages in a hostpage are dirty or
2816 * clean, not a mix. This function canonicalizes the bitmaps.
2818 * @ms: current migration state
2819 * @block: block that contains the page we want to canonicalize
2821 static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block)
2823 RAMState *rs = ram_state;
2824 unsigned long *bitmap = block->bmap;
2825 unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
2826 unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
2827 unsigned long run_start;
2829 if (block->page_size == TARGET_PAGE_SIZE) {
2830 /* Easy case - TPS==HPS for a non-huge page RAMBlock */
2834 /* Find a dirty page */
2835 run_start = find_next_bit(bitmap, pages, 0);
2837 while (run_start < pages) {
2840 * If the start of this run of pages is in the middle of a host
2841 * page, then we need to fixup this host page.
2843 if (QEMU_IS_ALIGNED(run_start, host_ratio)) {
2844 /* Find the end of this run */
2845 run_start = find_next_zero_bit(bitmap, pages, run_start + 1);
2847 * If the end isn't at the start of a host page, then the
2848 * run doesn't finish at the end of a host page
2849 * and we need to discard.
2853 if (!QEMU_IS_ALIGNED(run_start, host_ratio)) {
2855 unsigned long fixup_start_addr = QEMU_ALIGN_DOWN(run_start,
2857 run_start = QEMU_ALIGN_UP(run_start, host_ratio);
2859 /* Clean up the bitmap */
2860 for (page = fixup_start_addr;
2861 page < fixup_start_addr + host_ratio; page++) {
2863 * Remark them as dirty, updating the count for any pages
2864 * that weren't previously dirty.
2866 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
2870 /* Find the next dirty page for the next iteration */
2871 run_start = find_next_bit(bitmap, pages, run_start);
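/*
 * Worked example (hedged): with 2MiB host pages and 4KiB target pages,
 * host_ratio == 512. A dirty run starting at page 700 is unaligned, so
 * fixup_start_addr == QEMU_ALIGN_DOWN(700, 512) == 512 and pages
 * 512..1023 are all re-marked dirty; that huge page will then be
 * discarded and resent as one unit.
 */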
2876 * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
2878 * Transmit the set of pages to be discarded after precopy to the target;
2879 * these are pages that:
2880 * a) Have been previously transmitted but are now dirty again
2881 * b) Have never been transmitted; this ensures that
2882 * any pages on the destination that have been mapped by background
2883 * tasks get discarded (transparent huge pages are the specific concern)
2884 * Hopefully this is pretty sparse.
2886 * @ms: current migration state
2888 void ram_postcopy_send_discard_bitmap(MigrationState *ms)
2890 RAMState *rs = ram_state;
2892 RCU_READ_LOCK_GUARD();
2894 /* This should be our last sync, the src is now paused */
2895 migration_bitmap_sync(rs);
2897 /* Easiest way to make sure we don't resume in the middle of a host-page */
2898 rs->last_seen_block = NULL;
2899 rs->last_sent_block = NULL;
2902 postcopy_each_ram_send_discard(ms);
2904 trace_ram_postcopy_send_discard_bitmap();
2908 * ram_discard_range: discard dirtied pages at the beginning of postcopy
2910 * Returns zero on success
2912 * @rbname: name of the RAMBlock of the request. NULL means the
2913 * same as the last one.
2914 * @start: RAMBlock starting page
2915 * @length: RAMBlock size
2917 int ram_discard_range(const char *rbname, uint64_t start, size_t length)
2919 trace_ram_discard_range(rbname, start, length);
2921 RCU_READ_LOCK_GUARD();
2922 RAMBlock *rb = qemu_ram_block_by_name(rbname);
2925 error_report("ram_discard_range: Failed to find block '%s'", rbname);
2930 * On the source VM, we don't need to update the received bitmap since
2931 * we don't even have one.
2933 if (rb->receivedmap) {
2934 bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(),
2935 length >> qemu_target_page_bits());
2938 return ram_block_discard_range(rb, start, length);
2942 * For every allocation, we will try not to crash the VM if the
2943 * allocation fails.
2945 static int xbzrle_init(void)
2947 Error *local_err = NULL;
2949 if (!migrate_use_xbzrle()) {
2953 XBZRLE_cache_lock();
2955 XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE);
2956 if (!XBZRLE.zero_target_page) {
2957 error_report("%s: Error allocating zero page", __func__);
2961 XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(),
2962 TARGET_PAGE_SIZE, &local_err);
2963 if (!XBZRLE.cache) {
2964 error_report_err(local_err);
2965 goto free_zero_page;
2968 XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
2969 if (!XBZRLE.encoded_buf) {
2970 error_report("%s: Error allocating encoded_buf", __func__);
2974 XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
2975 if (!XBZRLE.current_buf) {
2976 error_report("%s: Error allocating current_buf", __func__);
2977 goto free_encoded_buf;
2980 /* We are all good */
2981 XBZRLE_cache_unlock();
2985 g_free(XBZRLE.encoded_buf);
2986 XBZRLE.encoded_buf = NULL;
2988 cache_fini(XBZRLE.cache);
2989 XBZRLE.cache = NULL;
2991 g_free(XBZRLE.zero_target_page);
2992 XBZRLE.zero_target_page = NULL;
2994 XBZRLE_cache_unlock();
2998 static int ram_state_init(RAMState **rsp)
3000 *rsp = g_try_new0(RAMState, 1);
3003 error_report("%s: Init ramstate fail", __func__);
3007 qemu_mutex_init(&(*rsp)->bitmap_mutex);
3008 qemu_mutex_init(&(*rsp)->src_page_req_mutex);
3009 QSIMPLEQ_INIT(&(*rsp)->src_page_requests);
3012 * Count the total number of pages used by ram blocks not including any
3013 * gaps due to alignment or unplugs.
3014 * This must match the initial values of the dirty bitmap.
3016 (*rsp)->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
3017 ram_state_reset(*rsp);
3022 static void ram_list_init_bitmaps(void)
3024 MigrationState *ms = migrate_get_current();
3026 unsigned long pages;
3029 /* Skip setting bitmap if there is no RAM */
3030 if (ram_bytes_total()) {
3031 shift = ms->clear_bitmap_shift;
3032 if (shift > CLEAR_BITMAP_SHIFT_MAX) {
3033 error_report("clear_bitmap_shift (%u) too big, using "
3034 "max value (%u)", shift, CLEAR_BITMAP_SHIFT_MAX);
3035 shift = CLEAR_BITMAP_SHIFT_MAX;
3036 } else if (shift < CLEAR_BITMAP_SHIFT_MIN) {
3037 error_report("clear_bitmap_shift (%u) too small, using "
3038 "min value (%u)", shift, CLEAR_BITMAP_SHIFT_MIN);
3039 shift = CLEAR_BITMAP_SHIFT_MIN;
3042 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3043 pages = block->max_length >> TARGET_PAGE_BITS;
3045 * The initial dirty bitmap for migration must be set with all
3046 * ones to make sure we'll migrate every guest RAM page to the destination.
3048 * Here we set RAMBlock.bmap all to 1 because when we restart a
3049 * new migration after a failed one, ram_list.
3050 * dirty_memory[DIRTY_MEMORY_MIGRATION] doesn't include the whole RAM.
3053 block->bmap = bitmap_new(pages);
3054 bitmap_set(block->bmap, 0, pages);
3055 block->clear_bmap_shift = shift;
3056 block->clear_bmap = bitmap_new(clear_bmap_size(pages, shift));
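/*
 * Worked example (hedged, assuming the usual default shift of 18):
 * with 4KiB target pages, one clear_bmap bit then covers 2^18 pages ==
 * 1GiB of guest RAM, so dirty-log clearing is batched in 1GiB chunks.
 */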
3061 static void migration_bitmap_clear_discarded_pages(RAMState *rs)
3063 unsigned long pages;
3066 RCU_READ_LOCK_GUARD();
3068 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3069 pages = ramblock_dirty_bitmap_clear_discarded_pages(rb);
3070 rs->migration_dirty_pages -= pages;
3074 static void ram_init_bitmaps(RAMState *rs)
3076 /* For memory_global_dirty_log_start below. */
3077 qemu_mutex_lock_iothread();
3078 qemu_mutex_lock_ramlist();
3080 WITH_RCU_READ_LOCK_GUARD() {
3081 ram_list_init_bitmaps();
3082 /* We don't use dirty log with background snapshots */
3083 if (!migrate_background_snapshot()) {
3084 memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION);
3085 migration_bitmap_sync_precopy(rs);
3088 qemu_mutex_unlock_ramlist();
3089 qemu_mutex_unlock_iothread();
3092 * After an eventual first bitmap sync, fixup the initial bitmap
3093 * containing all 1s to exclude any discarded pages from migration.
3095 migration_bitmap_clear_discarded_pages(rs);
3098 static int ram_init_all(RAMState **rsp)
3100 if (ram_state_init(rsp)) {
3104 if (xbzrle_init()) {
3105 ram_state_cleanup(rsp);
3109 ram_init_bitmaps(*rsp);
3114 static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out)
3120 * Postcopy is not using xbzrle/compression, so no need for that.
3121 * Also, since the source is already halted, we don't need to care
3122 * about dirty page logging either.
3125 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3126 pages += bitmap_count_one(block->bmap,
3127 block->used_length >> TARGET_PAGE_BITS);
3130 /* This may not be aligned with current bitmaps. Recalculate. */
3131 rs->migration_dirty_pages = pages;
3133 ram_state_reset(rs);
3135 /* Update RAMState cache of output QEMUFile */
3138 trace_ram_state_resume_prepare(pages);
3142 * This function clears bits of the free pages reported by the caller from the
3143 * migration dirty bitmap. @addr is the host address corresponding to the
3144 * start of the contiguous guest free pages, and @len is the total size in bytes of those pages.
3147 void qemu_guest_free_page_hint(void *addr, size_t len)
3151 size_t used_len, start, npages;
3152 MigrationState *s = migrate_get_current();
3154 /* This function is currently expected to be used during live migration */
3155 if (!migration_is_setup_or_active(s->state)) {
3159 for (; len > 0; len -= used_len, addr += used_len) {
3160 block = qemu_ram_block_from_host(addr, false, &offset);
3161 if (unlikely(!block || offset >= block->used_length)) {
3163 * The implementation might not support RAMBlock resize during
3164 * live migration, but it could happen in theory with future
3165 * updates. So we add a check here to capture that case.
3167 error_report_once("%s unexpected error", __func__);
3171 if (len <= block->used_length - offset) {
3174 used_len = block->used_length - offset;
3177 start = offset >> TARGET_PAGE_BITS;
3178 npages = used_len >> TARGET_PAGE_BITS;
3180 qemu_mutex_lock(&ram_state->bitmap_mutex);
3182 * The skipped free pages are equivalent to having been sent from clear_bmap's
3183 * perspective, so clear the bits from the memory region bitmap which
3184 * are initially set. Otherwise those skipped pages will be sent in
3185 * the next round after syncing from the memory region bitmap.
3187 migration_clear_memory_region_dirty_bitmap_range(block, start, npages);
3188 ram_state->migration_dirty_pages -=
3189 bitmap_count_one_with_offset(block->bmap, start, npages);
3190 bitmap_clear(block->bmap, start, npages);
3191 qemu_mutex_unlock(&ram_state->bitmap_mutex);
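/*
 * Usage sketch (hedged; host_va and the 2MiB size are hypothetical): a
 * free-page-hinting device on the source could report one contiguous
 * free range like this.
 */
#if 0
qemu_guest_free_page_hint(host_va, 2 * 1024 * 1024);
#endif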
3196 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has
3197 * a long-running RCU critical section. When RCU reclaims in the code
3198 * start to become numerous it will be necessary to reduce the
3199 * granularity of these critical sections.
3203 * ram_save_setup: Setup RAM for migration
3205 * Returns zero to indicate success and negative for error
3207 * @f: QEMUFile where to send the data
3208 * @opaque: RAMState pointer
3210 static int ram_save_setup(QEMUFile *f, void *opaque)
3212 RAMState **rsp = opaque;
3216 if (compress_threads_save_setup()) {
3220 /* migration has already setup the bitmap, reuse it. */
3221 if (!migration_in_colo_state()) {
3222 if (ram_init_all(rsp) != 0) {
3223 compress_threads_save_cleanup();
3229 WITH_RCU_READ_LOCK_GUARD() {
3230 qemu_put_be64(f, ram_bytes_total_common(true) | RAM_SAVE_FLAG_MEM_SIZE);
3232 RAMBLOCK_FOREACH_MIGRATABLE(block) {
3233 qemu_put_byte(f, strlen(block->idstr));
3234 qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
3235 qemu_put_be64(f, block->used_length);
3236 if (migrate_postcopy_ram() && block->page_size !=
3237 qemu_host_page_size) {
3238 qemu_put_be64(f, block->page_size);
3240 if (migrate_ignore_shared()) {
3241 qemu_put_be64(f, block->mr->addr);
3246 ram_control_before_iterate(f, RAM_CONTROL_SETUP);
3247 ram_control_after_iterate(f, RAM_CONTROL_SETUP);
3249 ret = multifd_send_sync_main(f);
3254 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3261 * ram_save_iterate: iterative stage for migration
3263 * Returns zero to indicate success and negative for error
3265 * @f: QEMUFile where to send the data
3266 * @opaque: RAMState pointer
3268 static int ram_save_iterate(QEMUFile *f, void *opaque)
3270 RAMState **temp = opaque;
3271 RAMState *rs = *temp;
3277 if (blk_mig_bulk_active()) {
3278 /* Avoid transferring ram during bulk phase of block migration as
3279 * the bulk phase will usually take a long time and transferring
3280 * ram updates during that time is pointless. */
3285 * We'll hold this lock for a little while, but it's okay for two reasons.
3286 * Firstly, the only possible other thread to take it is who calls
3287 * qemu_guest_free_page_hint(), which should be rare; secondly, see
3288 * MAX_WAIT (if curious, further see commit 4508bd9ed8053ce) below, which
3289 * guarantees that we'll release it at least on a regular basis.
3291 qemu_mutex_lock(&rs->bitmap_mutex);
3292 WITH_RCU_READ_LOCK_GUARD() {
3293 if (ram_list.version != rs->last_version) {
3294 ram_state_reset(rs);
3297 /* Read version before ram_list.blocks */
3300 ram_control_before_iterate(f, RAM_CONTROL_ROUND);
3302 t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
3304 while ((ret = qemu_file_rate_limit(f)) == 0 ||
3305 postcopy_has_request(rs)) {
3308 if (qemu_file_get_error(f)) {
3312 pages = ram_find_and_save_block(rs);
3313 /* no more pages to send */
3320 qemu_file_set_error(f, pages);
3324 rs->target_page_count += pages;
3327 * During postcopy, it is necessary to make sure one whole host
3328 * page is sent in one chunk.
3330 if (migrate_postcopy_ram()) {
3331 flush_compressed_data(rs);
3335 * we want to check in the 1st loop, just in case it was the 1st
3336 * time and we had to sync the dirty bitmap.
3337 * qemu_clock_get_ns() is a bit expensive, so we only check once every 64 iterations.
3340 if ((i & 63) == 0) {
3341 uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) /
3343 if (t1 > MAX_WAIT) {
3344 trace_ram_save_iterate_big_wait(t1, i);
3351 qemu_mutex_unlock(&rs->bitmap_mutex);
3353 postcopy_preempt_reset_channel(rs);
3356 * Must occur before EOS (or any QEMUFile operation)
3357 * because of RDMA protocol.
3359 ram_control_after_iterate(f, RAM_CONTROL_ROUND);
3363 && migration_is_setup_or_active(migrate_get_current()->state)) {
3364 ret = multifd_send_sync_main(rs->f);
3369 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3371 ram_transferred_add(8);
3373 ret = qemu_file_get_error(f);
3383 * ram_save_complete: function called to send the remaining amount of ram
3385 * Returns zero to indicate success or negative on error
3387 * Called with iothread lock
3389 * @f: QEMUFile where to send the data
3390 * @opaque: RAMState pointer
3392 static int ram_save_complete(QEMUFile *f, void *opaque)
3394 RAMState **temp = opaque;
3395 RAMState *rs = *temp;
3398 rs->last_stage = !migration_in_colo_state();
3400 WITH_RCU_READ_LOCK_GUARD() {
3401 if (!migration_in_postcopy()) {
3402 migration_bitmap_sync_precopy(rs);
3405 ram_control_before_iterate(f, RAM_CONTROL_FINISH);
3407 /* try transferring iterative blocks of memory */
3409 /* flush all remaining blocks regardless of rate limiting */
3410 qemu_mutex_lock(&rs->bitmap_mutex);
3414 pages = ram_find_and_save_block(rs);
3415 /* no more blocks to send */
3424 qemu_mutex_unlock(&rs->bitmap_mutex);
3426 flush_compressed_data(rs);
3427 ram_control_after_iterate(f, RAM_CONTROL_FINISH);
3434 postcopy_preempt_reset_channel(rs);
3436 ret = multifd_send_sync_main(rs->f);
3441 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3447 static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
3448 uint64_t *res_precopy_only,
3449 uint64_t *res_compatible,
3450 uint64_t *res_postcopy_only)
3452 RAMState **temp = opaque;
3453 RAMState *rs = *temp;
3454 uint64_t remaining_size;
3456 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3458 if (!migration_in_postcopy() &&
3459 remaining_size < max_size) {
3460 qemu_mutex_lock_iothread();
3461 WITH_RCU_READ_LOCK_GUARD() {
3462 migration_bitmap_sync_precopy(rs);
3464 qemu_mutex_unlock_iothread();
3465 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3468 if (migrate_postcopy_ram()) {
3469 /* We can do postcopy, and all the data is postcopiable */
3470 *res_compatible += remaining_size;
3472 *res_precopy_only += remaining_size;
3476 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
3478 unsigned int xh_len;
3480 uint8_t *loaded_data;
3482 /* extract RLE header */
3483 xh_flags = qemu_get_byte(f);
3484 xh_len = qemu_get_be16(f);
3486 if (xh_flags != ENCODING_FLAG_XBZRLE) {
3487 error_report("Failed to load XBZRLE page - wrong compression!");
3491 if (xh_len > TARGET_PAGE_SIZE) {
3492 error_report("Failed to load XBZRLE page - len overflow!");
3495 loaded_data = XBZRLE.decoded_buf;
3496 /* load data and decode */
3497 /* it can change loaded_data to point to an internal buffer */
3498 qemu_get_buffer_in_place(f, &loaded_data, xh_len);
3501 if (xbzrle_decode_buffer(loaded_data, xh_len, host,
3502 TARGET_PAGE_SIZE) == -1) {
3503 error_report("Failed to load XBZRLE page - decode error!");
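/*
 * Wire layout consumed above: one flag byte that must equal
 * ENCODING_FLAG_XBZRLE, a big-endian 16-bit encoded length (at most
 * TARGET_PAGE_SIZE), then that many bytes of XBZRLE-encoded delta,
 * which is applied on top of the stale copy of the page already at
 * @host.
 */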
3511 * ram_block_from_stream: read a RAMBlock id from the migration stream
3513 * Must be called from within an RCU critical section.
3515 * Returns a pointer from within the RCU-protected ram_list.
3517 * @mis: the migration incoming state pointer
3518 * @f: QEMUFile where to read the data from
3519 * @flags: Page flags (mostly to see if it's a continuation of previous block)
3520 * @channel: the channel we're using
3522 static inline RAMBlock *ram_block_from_stream(MigrationIncomingState *mis,
3523 QEMUFile *f, int flags,
3526 RAMBlock *block = mis->last_recv_block[channel];
3530 if (flags & RAM_SAVE_FLAG_CONTINUE) {
3532 error_report("Ack, bad migration stream!");
3538 len = qemu_get_byte(f);
3539 qemu_get_buffer(f, (uint8_t *)id, len);
3542 block = qemu_ram_block_by_name(id);
3544 error_report("Can't find block %s", id);
3548 if (ramblock_is_ignored(block)) {
3549 error_report("block %s should not be migrated !", id);
3553 mis->last_recv_block[channel] = block;
3558 static inline void *host_from_ram_block_offset(RAMBlock *block,
3561 if (!offset_in_ramblock(block, offset)) {
3565 return block->host + offset;
3568 static void *host_page_from_ram_block_offset(RAMBlock *block,
3571 /* Note: Explicitly no check against offset_in_ramblock(). */
3572 return (void *)QEMU_ALIGN_DOWN((uintptr_t)(block->host + offset),
3576 static ram_addr_t host_page_offset_from_ram_block_offset(RAMBlock *block,
3579 return ((uintptr_t)block->host + offset) & (block->page_size - 1);
3582 static inline void *colo_cache_from_block_offset(RAMBlock *block,
3583 ram_addr_t offset, bool record_bitmap)
3585 if (!offset_in_ramblock(block, offset)) {
3588 if (!block->colo_cache) {
3589 error_report("%s: colo_cache is NULL in block :%s",
3590 __func__, block->idstr);
3595 * During a COLO checkpoint, we need a bitmap of these migrated pages.
3596 * It helps us decide which pages in the ram cache should be flushed
3597 * into the VM's RAM later.
3599 if (record_bitmap &&
3600 !test_and_set_bit(offset >> TARGET_PAGE_BITS, block->bmap)) {
3601 ram_state->migration_dirty_pages++;
3603 return block->colo_cache + offset;
3607 * ram_handle_compressed: handle the zero page case
3609 * If a page (or a whole RDMA chunk) has been
3610 * determined to be zero, then zap it.
3612 * @host: host address for the zero page
3613 * @ch: what the page is filled from. We only support zero
3614 * @size: size of the zero page
3616 void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
3618 if (ch != 0 || !buffer_is_zero(host, size)) {
3619 memset(host, ch, size);
3623 /* return the size after decompression, or negative value on error */
3625 qemu_uncompress_data(z_stream *stream, uint8_t *dest, size_t dest_len,
3626 const uint8_t *source, size_t source_len)
3630 err = inflateReset(stream);
3635 stream->avail_in = source_len;
3636 stream->next_in = (uint8_t *)source;
3637 stream->avail_out = dest_len;
3638 stream->next_out = dest;
3640 err = inflate(stream, Z_NO_FLUSH);
3641 if (err != Z_STREAM_END) {
3645 return stream->total_out;
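/*
 * Usage sketch (hedged; the buffers and length are hypothetical): a
 * one-shot inflate of one received compressed page, with the z_stream
 * already initialized via inflateInit() as in
 * compress_threads_load_setup().
 */
#if 0
uint8_t page[TARGET_PAGE_SIZE];
int n = qemu_uncompress_data(&stream, page, sizeof(page),
                             compbuf, compbuf_len);
if (n < 0) {
    /* negative zlib error code from inflateReset()/inflate() */
}
#endif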
3648 static void *do_data_decompress(void *opaque)
3650 DecompressParam *param = opaque;
3651 unsigned long pagesize;
3655 qemu_mutex_lock(¶m->mutex);
3656 while (!param->quit) {
3661 qemu_mutex_unlock(¶m->mutex);
3663 pagesize = TARGET_PAGE_SIZE;
3665 ret = qemu_uncompress_data(¶m->stream, des, pagesize,
3666 param->compbuf, len);
3667 if (ret < 0 && migrate_get_current()->decompress_error_check) {
3668 error_report("decompress data failed");
3669 qemu_file_set_error(decomp_file, ret);
3672 qemu_mutex_lock(&decomp_done_lock);
3674 qemu_cond_signal(&decomp_done_cond);
3675 qemu_mutex_unlock(&decomp_done_lock);
3677 qemu_mutex_lock(¶m->mutex);
3679 qemu_cond_wait(¶m->cond, ¶m->mutex);
3682 qemu_mutex_unlock(¶m->mutex);
3687 static int wait_for_decompress_done(void)
3689 int idx, thread_count;
3691 if (!migrate_use_compression()) {
3695 thread_count = migrate_decompress_threads();
3696 qemu_mutex_lock(&decomp_done_lock);
3697 for (idx = 0; idx < thread_count; idx++) {
3698 while (!decomp_param[idx].done) {
3699 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3702 qemu_mutex_unlock(&decomp_done_lock);
3703 return qemu_file_get_error(decomp_file);
3706 static void compress_threads_load_cleanup(void)
3708 int i, thread_count;
3710 if (!migrate_use_compression()) {
3713 thread_count = migrate_decompress_threads();
3714 for (i = 0; i < thread_count; i++) {
3716 * we use it as an indicator of whether the thread is
3717 * properly initialized or not
3719 if (!decomp_param[i].compbuf) {
3723 qemu_mutex_lock(&decomp_param[i].mutex);
3724 decomp_param[i].quit = true;
3725 qemu_cond_signal(&decomp_param[i].cond);
3726 qemu_mutex_unlock(&decomp_param[i].mutex);
3728 for (i = 0; i < thread_count; i++) {
3729 if (!decomp_param[i].compbuf) {
3733 qemu_thread_join(decompress_threads + i);
3734 qemu_mutex_destroy(&decomp_param[i].mutex);
3735 qemu_cond_destroy(&decomp_param[i].cond);
3736 inflateEnd(&decomp_param[i].stream);
3737 g_free(decomp_param[i].compbuf);
3738 decomp_param[i].compbuf = NULL;
3740 g_free(decompress_threads);
3741 g_free(decomp_param);
3742 decompress_threads = NULL;
3743 decomp_param = NULL;
3747 static int compress_threads_load_setup(QEMUFile *f)
3749 int i, thread_count;
3751 if (!migrate_use_compression()) {
3755 thread_count = migrate_decompress_threads();
3756 decompress_threads = g_new0(QemuThread, thread_count);
3757 decomp_param = g_new0(DecompressParam, thread_count);
3758 qemu_mutex_init(&decomp_done_lock);
3759 qemu_cond_init(&decomp_done_cond);
3761 for (i = 0; i < thread_count; i++) {
3762 if (inflateInit(&decomp_param[i].stream) != Z_OK) {
3766 decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
3767 qemu_mutex_init(&decomp_param[i].mutex);
3768 qemu_cond_init(&decomp_param[i].cond);
3769 decomp_param[i].done = true;
3770 decomp_param[i].quit = false;
3771 qemu_thread_create(decompress_threads + i, "decompress",
3772 do_data_decompress, decomp_param + i,
3773 QEMU_THREAD_JOINABLE);
3777 compress_threads_load_cleanup();
3781 static void decompress_data_with_multi_threads(QEMUFile *f,
3782 void *host, int len)
3784 int idx, thread_count;
3786 thread_count = migrate_decompress_threads();
3787 QEMU_LOCK_GUARD(&decomp_done_lock);
3789 for (idx = 0; idx < thread_count; idx++) {
3790 if (decomp_param[idx].done) {
3791 decomp_param[idx].done = false;
3792 qemu_mutex_lock(&decomp_param[idx].mutex);
3793 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
3794 decomp_param[idx].des = host;
3795 decomp_param[idx].len = len;
3796 qemu_cond_signal(&decomp_param[idx].cond);
3797 qemu_mutex_unlock(&decomp_param[idx].mutex);
3801 if (idx < thread_count) {
3804 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3809 static void colo_init_ram_state(void)
3811 ram_state_init(&ram_state);
3815 * colo cache: this is for the secondary VM; we cache the whole
3816 * memory of the secondary VM. The global lock must be held
3817 * to call this helper.
3819 int colo_init_ram_cache(void)
3823 WITH_RCU_READ_LOCK_GUARD() {
3824 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3825 block->colo_cache = qemu_anon_ram_alloc(block->used_length,
3826 NULL, false, false);
3827 if (!block->colo_cache) {
3828 error_report("%s: Can't alloc memory for COLO cache of block %s,"
3829 "size 0x" RAM_ADDR_FMT, __func__, block->idstr,
3830 block->used_length);
3831 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3832 if (block->colo_cache) {
3833 qemu_anon_ram_free(block->colo_cache, block->used_length);
3834 block->colo_cache = NULL;
3839 if (!machine_dump_guest_core(current_machine)) {
3840 qemu_madvise(block->colo_cache, block->used_length,
3841 QEMU_MADV_DONTDUMP);
3847 * Record the dirty pages that were sent by the PVM; we use this dirty bitmap
3848 * to decide which pages in the cache should be flushed into the SVM's RAM. Here
3849 * we use the same name 'ram_bitmap' as for migration.
3851 if (ram_bytes_total()) {
3854 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3855 unsigned long pages = block->max_length >> TARGET_PAGE_BITS;
3856 block->bmap = bitmap_new(pages);
3860 colo_init_ram_state();
3864 /* TODO: duplicated with ram_init_bitmaps */
3865 void colo_incoming_start_dirty_log(void)
3867 RAMBlock *block = NULL;
3868 /* For memory_global_dirty_log_start below. */
3869 qemu_mutex_lock_iothread();
3870 qemu_mutex_lock_ramlist();
3872 memory_global_dirty_log_sync();
3873 WITH_RCU_READ_LOCK_GUARD() {
3874 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3875 ramblock_sync_dirty_bitmap(ram_state, block);
3876 /* Discard this dirty bitmap record */
3877 bitmap_zero(block->bmap, block->max_length >> TARGET_PAGE_BITS);
3879 memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION);
3881 ram_state->migration_dirty_pages = 0;
3882 qemu_mutex_unlock_ramlist();
3883 qemu_mutex_unlock_iothread();
3886 /* The global lock must be held to call this helper */
3887 void colo_release_ram_cache(void)
3891 memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION);
3892 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3893 g_free(block->bmap);
3897 WITH_RCU_READ_LOCK_GUARD() {
3898 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3899 if (block->colo_cache) {
3900 qemu_anon_ram_free(block->colo_cache, block->used_length);
3901 block->colo_cache = NULL;
3905 ram_state_cleanup(&ram_state);
3909 * ram_load_setup: Setup RAM for migration incoming side
3911 * Returns zero to indicate success and negative for error
3913 * @f: QEMUFile where to receive the data
3914 * @opaque: RAMState pointer
3916 static int ram_load_setup(QEMUFile *f, void *opaque)
3918 if (compress_threads_load_setup(f)) {
3922 xbzrle_load_setup();
3923 ramblock_recv_map_init();
3928 static int ram_load_cleanup(void *opaque)
3932 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3933 qemu_ram_block_writeback(rb);
3936 xbzrle_load_cleanup();
3937 compress_threads_load_cleanup();
3939 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3940 g_free(rb->receivedmap);
3941 rb->receivedmap = NULL;
3948 * ram_postcopy_incoming_init: allocate postcopy data structures
3950 * Returns 0 for success and negative on error
3952 * @mis: current migration incoming state
3954 * Allocate data structures etc needed by incoming migration with
3955 * postcopy-ram. postcopy-ram's similarly named
3956 * postcopy_ram_incoming_init does the work.
3958 int ram_postcopy_incoming_init(MigrationIncomingState *mis)
3960 return postcopy_ram_incoming_init(mis);
3964 * ram_load_postcopy: load a page in postcopy case
3966 * Returns 0 for success or -errno in case of error
3968 * Called in postcopy mode by ram_load().
3969 * rcu_read_lock is taken prior to this being called.
3971 * @f: QEMUFile where to receive the data
3972 * @channel: the channel to use for loading
3974 int ram_load_postcopy(QEMUFile *f, int channel)
3976 int flags = 0, ret = 0;
3977 bool place_needed = false;
3978 bool matches_target_page_size = false;
3979 MigrationIncomingState *mis = migration_incoming_get_current();
3980 PostcopyTmpPage *tmp_page = &mis->postcopy_tmp_pages[channel];
3982 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
3984 void *page_buffer = NULL;
3985 void *place_source = NULL;
3986 RAMBlock *block = NULL;
3990 addr = qemu_get_be64(f);
3993 * If there's a qemu file error, we should stop here; "addr" may then be invalid.
3996 ret = qemu_file_get_error(f);
4001 flags = addr & ~TARGET_PAGE_MASK;
4002 addr &= TARGET_PAGE_MASK;
4004 trace_ram_load_postcopy_loop(channel, (uint64_t)addr, flags);
4005 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
4006 RAM_SAVE_FLAG_COMPRESS_PAGE)) {
4007 block = ram_block_from_stream(mis, f, flags, channel);
4014 * Relying on used_length is racy and can result in false positives.
4015 * We might place pages beyond used_length in case RAM was shrunk
4016 * while in postcopy, which is fine - trying to place via
4017 * UFFDIO_COPY/UFFDIO_ZEROPAGE will never segfault.
4019 if (!block->host || addr >= block->postcopy_length) {
4020 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
4024 tmp_page->target_pages++;
4025 matches_target_page_size = block->page_size == TARGET_PAGE_SIZE;
4027 * Postcopy requires that we place whole host pages atomically;
4028 * these may be huge pages for RAMBlocks that are backed by hugetlbfs.
4030 * To make it atomic, the data is read into a temporary page
4031 * that's moved into place later.
4032 * The migration protocol uses, possibly smaller, target-pages
4033 * however the source ensures it always sends all the components
4034 * of a host page in one chunk.
4036 page_buffer = tmp_page->tmp_huge_page +
4037 host_page_offset_from_ram_block_offset(block, addr);
4038 /* If all target pages are zero then we can optimise the placement */
4039 if (tmp_page->target_pages == 1) {
4040 tmp_page->host_addr =
4041 host_page_from_ram_block_offset(block, addr);
4042 } else if (tmp_page->host_addr !=
4043 host_page_from_ram_block_offset(block, addr)) {
4044 /* not the 1st target page within the host page */
4045 error_report("Non-same host page detected on channel %d: "
4046 "Target host page %p, received host page %p "
4047 "(rb %s offset 0x"RAM_ADDR_FMT" target_pages %d)",
4048 channel, tmp_page->host_addr,
4049 host_page_from_ram_block_offset(block, addr),
4050 block->idstr, addr, tmp_page->target_pages);
4056 * If it's the last part of a host page then we place the host page.
4059 if (tmp_page->target_pages ==
4060 (block->page_size / TARGET_PAGE_SIZE)) {
4061 place_needed = true;
4063 place_source = tmp_page->tmp_huge_page;
4066 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
4067 case RAM_SAVE_FLAG_ZERO:
4068 ch = qemu_get_byte(f);
4070 * We can skip setting page_buffer when
4071 * this is a zero page and (block->page_size == TARGET_PAGE_SIZE).
4073 if (ch || !matches_target_page_size) {
4074 memset(page_buffer, ch, TARGET_PAGE_SIZE);
4077 tmp_page->all_zero = false;
4081 case RAM_SAVE_FLAG_PAGE:
4082 tmp_page->all_zero = false;
4083 if (!matches_target_page_size) {
4084 /* For huge pages, we always use temporary buffer */
4085 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
4088 * For small pages that match the target page size, we
4089 * avoid the qemu_file copy. Instead we directly use
4090 * the buffer of QEMUFile to place the page. Note: we
4091 * cannot do any QEMUFile operation before using that
4092 * buffer, to make sure the buffer stays valid until the page is placed.
4095 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
4099 case RAM_SAVE_FLAG_COMPRESS_PAGE:
4100 tmp_page->all_zero = false;
4101 len = qemu_get_be32(f);
4102 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
4103 error_report("Invalid compressed data length: %d", len);
4107 decompress_data_with_multi_threads(f, page_buffer, len);
4110 case RAM_SAVE_FLAG_EOS:
4112 multifd_recv_sync_main();
4115 error_report("Unknown combination of migration flags: 0x%x"
4116 " (postcopy mode)", flags);
4121 /* Got the whole host page, wait for decompress before placing. */
4123 ret |= wait_for_decompress_done();
4126 /* Detect for any possible file errors */
4127 if (!ret && qemu_file_get_error(f)) {
4128 ret = qemu_file_get_error(f);
4131 if (!ret && place_needed) {
4132 if (tmp_page->all_zero) {
4133 ret = postcopy_place_page_zero(mis, tmp_page->host_addr, block);
4135 ret = postcopy_place_page(mis, tmp_page->host_addr,
4136 place_source, block);
4138 place_needed = false;
4139 postcopy_temp_page_reset(tmp_page);
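/*
 * Worked example (hedged): for a 2MiB hugepage block with 4KiB target
 * pages, block->page_size / TARGET_PAGE_SIZE == 512, so placement is
 * deferred until the 512th target page of the host page has been read
 * into tmp_page->tmp_huge_page; only then is the whole host page
 * placed atomically (zero page or copy).
 */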
4146 static bool postcopy_is_advised(void)
4148 PostcopyState ps = postcopy_state_get();
4149 return ps >= POSTCOPY_INCOMING_ADVISE && ps < POSTCOPY_INCOMING_END;
4152 static bool postcopy_is_running(void)
4154 PostcopyState ps = postcopy_state_get();
4155 return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END;
4159 * Flush content of RAM cache into SVM's memory.
4160 * Only flush the pages that have been dirtied by the PVM, the SVM, or both.
4162 void colo_flush_ram_cache(void)
4164 RAMBlock *block = NULL;
4167 unsigned long offset = 0;
4169 memory_global_dirty_log_sync();
4170 WITH_RCU_READ_LOCK_GUARD() {
4171 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
4172 ramblock_sync_dirty_bitmap(ram_state, block);
4176 trace_colo_flush_ram_cache_begin(ram_state->migration_dirty_pages);
4177 WITH_RCU_READ_LOCK_GUARD() {
4178 block = QLIST_FIRST_RCU(&ram_list.blocks);
4181 unsigned long num = 0;
4183 offset = colo_bitmap_find_dirty(ram_state, block, offset, &num);
4184 if (!offset_in_ramblock(block,
4185 ((ram_addr_t)offset) << TARGET_PAGE_BITS)) {
4188 block = QLIST_NEXT_RCU(block, next);
4190 unsigned long i = 0;
4192 for (i = 0; i < num; i++) {
4193 migration_bitmap_clear_dirty(ram_state, block, offset + i);
4195 dst_host = block->host
4196 + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
4197 src_host = block->colo_cache
4198 + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
4199 memcpy(dst_host, src_host, TARGET_PAGE_SIZE * num);
4204 trace_colo_flush_ram_cache_end();
4208 * ram_load_precopy: load pages in precopy case
4210 * Returns 0 for success or -errno in case of error
4212 * Called in precopy mode by ram_load().
4213 * rcu_read_lock is taken prior to this being called.
4215 * @f: QEMUFile where to receive the data
4217 static int ram_load_precopy(QEMUFile *f)
4219 MigrationIncomingState *mis = migration_incoming_get_current();
4220 int flags = 0, ret = 0, invalid_flags = 0, len = 0, i = 0;
4221 /* ADVISE is earlier, it shows the source has the postcopy capability on */
4222 bool postcopy_advised = postcopy_is_advised();
4223 if (!migrate_use_compression()) {
4224 invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
4227 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
4228 ram_addr_t addr, total_ram_bytes;
4229 void *host = NULL, *host_bak = NULL;
4233 * Yield periodically to let the main loop run, but an iteration of
4234 * the main loop is expensive, so only do it once every 32768 iterations
4236 if ((i & 32767) == 0 && qemu_in_coroutine()) {
4237 aio_co_schedule(qemu_get_current_aio_context(),
4238 qemu_coroutine_self());
4239 qemu_coroutine_yield();
4243 addr = qemu_get_be64(f);
4244 flags = addr & ~TARGET_PAGE_MASK;
4245 addr &= TARGET_PAGE_MASK;
4247 if (flags & invalid_flags) {
4248 if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
4249 error_report("Received an unexpected compressed page");
4256 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
4257 RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
4258 RAMBlock *block = ram_block_from_stream(mis, f, flags,
4259 RAM_CHANNEL_PRECOPY);
4261 host = host_from_ram_block_offset(block, addr);
4263 * After going into the COLO stage, we should not load pages
4264 * into the SVM's memory directly; we put them into colo_cache first.
4265 * NOTE: We need to keep a copy of the SVM's ram in colo_cache.
4266 * Previously, we copied all this memory in the COLO preparation stage,
4267 * during which the VM had to be stopped, a time-consuming process.
4268 * Here we optimize it with a trick: back up every page during the
4269 * migration process while COLO is enabled. Although this affects the
4270 * speed of the migration, it clearly reduces the downtime of
4271 * backing up all the SVM's memory in the COLO preparation stage.
4273 if (migration_incoming_colo_enabled()) {
4274 if (migration_incoming_in_colo_state()) {
4275 /* In COLO stage, put all pages into cache temporarily */
4276 host = colo_cache_from_block_offset(block, addr, true);
4279 * In migration stage but before COLO stage,
4280 * Put all pages into both cache and SVM's memory.
4282 host_bak = colo_cache_from_block_offset(block, addr, false);
4286 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
4290 if (!migration_incoming_in_colo_state()) {
4291 ramblock_recv_bitmap_set(block, host);
4294 trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
4297 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
4298 case RAM_SAVE_FLAG_MEM_SIZE:
4299 /* Synchronize RAM block list */
4300 total_ram_bytes = addr;
4301 while (!ret && total_ram_bytes) {
4306 len = qemu_get_byte(f);
4307 qemu_get_buffer(f, (uint8_t *)id, len);
4309 length = qemu_get_be64(f);
4311 block = qemu_ram_block_by_name(id);
4312 if (block && !qemu_ram_is_migratable(block)) {
4313 error_report("block %s should not be migrated !", id);
4316 if (length != block->used_length) {
4317 Error *local_err = NULL;
4319 ret = qemu_ram_resize(block, length,
4322 error_report_err(local_err);
4325 /* For postcopy we need to check hugepage sizes match */
4326 if (postcopy_advised && migrate_postcopy_ram() &&
4327 block->page_size != qemu_host_page_size) {
4328 uint64_t remote_page_size = qemu_get_be64(f);
4329 if (remote_page_size != block->page_size) {
4330 error_report("Mismatched RAM page size %s "
4331 "(local) %zd != %" PRId64,
4332 id, block->page_size,
4337 if (migrate_ignore_shared()) {
4338 hwaddr addr = qemu_get_be64(f);
4339 if (ramblock_is_ignored(block) &&
4340 block->mr->addr != addr) {
4341 error_report("Mismatched GPAs for block %s "
4342 "%" PRId64 "!= %" PRId64,
4344 (uint64_t)block->mr->addr);
4348 ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
4351 error_report("Unknown ramblock \"%s\", cannot "
4352 "accept migration", id);
4356 total_ram_bytes -= length;
4360 case RAM_SAVE_FLAG_ZERO:
4361 ch = qemu_get_byte(f);
4362 ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
4365 case RAM_SAVE_FLAG_PAGE:
4366 qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
4369 case RAM_SAVE_FLAG_COMPRESS_PAGE:
4370 len = qemu_get_be32(f);
4371 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
4372 error_report("Invalid compressed data length: %d", len);
4376 decompress_data_with_multi_threads(f, host, len);
4379 case RAM_SAVE_FLAG_XBZRLE:
4380 if (load_xbzrle(f, addr, host) < 0) {
4381 error_report("Failed to decompress XBZRLE page at "
4382 RAM_ADDR_FMT, addr);
4387 case RAM_SAVE_FLAG_EOS:
4389 multifd_recv_sync_main();
4392 if (flags & RAM_SAVE_FLAG_HOOK) {
4393 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
4395 error_report("Unknown combination of migration flags: 0x%x",
4401 ret = qemu_file_get_error(f);
4403 if (!ret && host_bak) {
4404 memcpy(host_bak, host, TARGET_PAGE_SIZE);
4408 ret |= wait_for_decompress_done();
4412 static int ram_load(QEMUFile *f, void *opaque, int version_id)
4415 static uint64_t seq_iter;
4417 * If the system is running in postcopy mode, page inserts into host memory must be atomic.
4420 bool postcopy_running = postcopy_is_running();
4424 if (version_id != 4) {
4429 * This RCU critical section can be very long running.
4430 * When RCU reclaims in the code start to become numerous,
4431 * it will be necessary to reduce the granularity of this
4434 WITH_RCU_READ_LOCK_GUARD() {
4435 if (postcopy_running) {
4437 * Note! Here RAM_CHANNEL_PRECOPY is the precopy channel of
4438 * postcopy migration, we have another RAM_CHANNEL_POSTCOPY to
4439 * service fast page faults.
4441 ret = ram_load_postcopy(f, RAM_CHANNEL_PRECOPY);
4443 ret = ram_load_precopy(f);
4446 trace_ram_load_complete(ret, seq_iter);
4451 static bool ram_has_postcopy(void *opaque)
4454 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
4455 if (ramblock_is_pmem(rb)) {
4456 info_report("Block: %s, host: %p is a nvdimm memory, postcopy "
4457 "is not supported now!", rb->idstr, rb->host);
4462 return migrate_postcopy_ram();
4465 /* Sync all the dirty bitmap with destination VM. */
4466 static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs)
4469 QEMUFile *file = s->to_dst_file;
4470 int ramblock_count = 0;
4472 trace_ram_dirty_bitmap_sync_start();
4474 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
4475 qemu_savevm_send_recv_bitmap(file, block->idstr);
4476 trace_ram_dirty_bitmap_request(block->idstr);
4480 trace_ram_dirty_bitmap_sync_wait();
4482 /* Wait until all the ramblocks' dirty bitmaps are synced */
4483 while (ramblock_count--) {
4484 qemu_sem_wait(&s->rp_state.rp_sem);
4487 trace_ram_dirty_bitmap_sync_complete();
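/*
 * Note on the rendezvous above: qemu_savevm_send_recv_bitmap() queues one
 * MIG_CMD_RECV_BITMAP request per ramblock, and each completed reload on
 * the return path posts s->rp_state.rp_sem exactly once (via
 * ram_dirty_bitmap_reload_notify() below), so waiting ramblock_count
 * times pairs every request with its acknowledgement.
 */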
static void ram_dirty_bitmap_reload_notify(MigrationState *s)
{
    qemu_sem_post(&s->rp_state.rp_sem);
}
/*
 * Read the received bitmap, revert it as the initial dirty bitmap.
 * This is only used when the postcopy migration is paused but wants
 * to resume from a middle point.
 */
int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block)
{
    int ret = -EINVAL;
    /* from_dst_file is always valid because we're within rp_thread */
    QEMUFile *file = s->rp_state.from_dst_file;
    unsigned long *le_bitmap, nbits = block->used_length >> TARGET_PAGE_BITS;
    uint64_t local_size = DIV_ROUND_UP(nbits, 8);
    uint64_t size, end_mark;

    trace_ram_dirty_bitmap_reload_begin(block->idstr);

    if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
        error_report("%s: incorrect state %s", __func__,
                     MigrationStatus_str(s->state));
        return -EINVAL;
    }

    /*
     * Note: see comments in ramblock_recv_bitmap_send() on why we
     * need the endianness conversion, and the paddings.
     */
    local_size = ROUND_UP(local_size, 8);

    /* Add paddings */
    le_bitmap = bitmap_new(nbits + BITS_PER_LONG);

    size = qemu_get_be64(file);

    /* The size of the bitmap should match our ramblock */
    if (size != local_size) {
        error_report("%s: ramblock '%s' bitmap size mismatch "
                     "(0x%"PRIx64" != 0x%"PRIx64")", __func__,
                     block->idstr, size, local_size);
        ret = -EINVAL;
        goto out;
    }

    size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size);
    end_mark = qemu_get_be64(file);

    ret = qemu_file_get_error(file);
    if (ret || size != local_size) {
        error_report("%s: read bitmap failed for ramblock '%s': %d"
                     " (size 0x%"PRIx64", got: 0x%"PRIx64")",
                     __func__, block->idstr, ret, local_size, size);
        ret = -EIO;
        goto out;
    }

    if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) {
        error_report("%s: ramblock '%s' end mark incorrect: 0x%"PRIx64,
                     __func__, block->idstr, end_mark);
        ret = -EINVAL;
        goto out;
    }

    /*
     * Endianness conversion. We are in postcopy (though paused).
     * The dirty bitmap won't change. We can directly modify it.
     */
    bitmap_from_le(block->bmap, le_bitmap, nbits);

    /*
     * What we received is the "received bitmap". Invert it to form the
     * initial dirty bitmap for this ramblock.
     */
    bitmap_complement(block->bmap, block->bmap, nbits);

    /* Clear dirty bits of discarded ranges that we don't want to migrate. */
    ramblock_dirty_bitmap_clear_discarded_pages(block);

    /* We'll recalculate migration_dirty_pages in ram_state_resume_prepare(). */
    trace_ram_dirty_bitmap_reload_complete(block->idstr);

    /*
     * We succeeded in syncing the bitmap for the current ramblock. If
     * this is the last one to sync, we need to notify the main send
     * thread.
     */
    ram_dirty_bitmap_reload_notify(s);

    ret = 0;
out:
    g_free(le_bitmap);
    return ret;
}
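/*
 * For reference, the stream parsed above is produced by
 * ramblock_recv_bitmap_send() on the destination; conceptually:
 *
 *     qemu_put_be64(file, local_size);
 *     qemu_put_buffer(file, (uint8_t *)le_bitmap, local_size);
 *     qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING);
 *
 * with the bitmap in little endian and padded up to an 8-byte multiple,
 * which is why local_size is rounded up before the reads.
 */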
static int ram_resume_prepare(MigrationState *s, void *opaque)
{
    RAMState *rs = *(RAMState **)opaque;
    int ret;

    ret = ram_dirty_bitmap_sync_all(s, rs);
    if (ret) {
        return ret;
    }

    ram_state_resume_prepare(rs, s->to_dst_file);

    return 0;
}
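/*
 * Terminate the postcopy preempt channel: write a final
 * RAM_SAVE_FLAG_EOS marker and flush it out, so the destination sees a
 * clean end of stream on that channel.
 */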
void postcopy_preempt_shutdown_file(MigrationState *s)
{
    qemu_put_be64(s->postcopy_qemufile_src, RAM_SAVE_FLAG_EOS);
    qemu_fflush(s->postcopy_qemufile_src);
}
static SaveVMHandlers savevm_ram_handlers = {
    .save_setup = ram_save_setup,
    .save_live_iterate = ram_save_iterate,
    .save_live_complete_postcopy = ram_save_complete,
    .save_live_complete_precopy = ram_save_complete,
    .has_postcopy = ram_has_postcopy,
    .save_live_pending = ram_save_pending,
    .load_state = ram_load,
    .save_cleanup = ram_save_cleanup,
    .load_setup = ram_load_setup,
    .load_cleanup = ram_load_cleanup,
    .resume_prepare = ram_resume_prepare,
};
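/*
 * The "ram" section is registered with version 4 in ram_mig_init() below;
 * ram_load() rejects any other version_id, so the two values must be kept
 * in sync.
 */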
static void ram_mig_ram_block_resized(RAMBlockNotifier *n, void *host,
                                      size_t old_size, size_t new_size)
{
    PostcopyState ps = postcopy_state_get();
    ram_addr_t offset;
    RAMBlock *rb = qemu_ram_block_from_host(host, false, &offset);
    Error *err = NULL;

    if (ramblock_is_ignored(rb)) {
        return;
    }

    if (!migration_is_idle()) {
        /*
         * Precopy code on the source cannot deal with the size of RAM blocks
         * changing at random points in time - especially after sending the
         * RAM block sizes in the migration stream, they must no longer change.
         * Abort and indicate a proper reason.
         */
        error_setg(&err, "RAM block '%s' resized during precopy.", rb->idstr);
        migration_cancel(err);
        error_free(err);
    }

    switch (ps) {
    case POSTCOPY_INCOMING_ADVISE:
        /*
         * Update what ram_postcopy_incoming_init()->init_range() does at the
         * time postcopy was advised. Syncing RAM blocks with the source will
         * result in RAM resizes.
         */
        if (old_size < new_size) {
            if (ram_discard_range(rb->idstr, old_size, new_size - old_size)) {
                error_report("RAM block '%s' discard of resized RAM failed",
                             rb->idstr);
            }
        }
        rb->postcopy_length = new_size;
        break;
    case POSTCOPY_INCOMING_NONE:
    case POSTCOPY_INCOMING_RUNNING:
    case POSTCOPY_INCOMING_END:
        /*
         * Once our guest is running, postcopy no longer cares about
         * resizes. When growing, the new memory was not available on the
         * source; no handler needed.
         */
        break;
    default:
        error_report("RAM block '%s' resized during postcopy state: %d",
                     rb->idstr, ps);
        exit(-1);
    }
}
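/*
 * Worked example for the ADVISE case above: if a block grows from 2 MiB
 * to 4 MiB while postcopy has only been advised, the fresh [2 MiB, 4 MiB)
 * range is discarded so it is guaranteed to read as "not received" and
 * can later be demand-paged from the source, and postcopy_length is
 * bumped to the new size.
 */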
static RAMBlockNotifier ram_mig_ram_notifier = {
    .ram_block_resized = ram_mig_ram_block_resized,
};
void ram_mig_init(void)
{
    qemu_mutex_init(&XBZRLE.lock);
    register_savevm_live("ram", 0, 4, &savevm_ram_handlers, &ram_state);
    ram_block_notifier_add(&ram_mig_ram_notifier);
}