migration/ram.c

   1 /*
   2  * QEMU System Emulator
   3  *
   4  * Copyright (c) 2003-2008 Fabrice Bellard
   5  * Copyright (c) 2011-2015 Red Hat Inc
   6  *
   7  * Authors:
   8  *  Juan Quintela <quintela@redhat.com>
   9  *
  10  * Permission is hereby granted, free of charge, to any person obtaining a copy
  11  * of this software and associated documentation files (the "Software"), to deal
  12  * in the Software without restriction, including without limitation the rights
  13  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  14  * copies of the Software, and to permit persons to whom the Software is
  15  * furnished to do so, subject to the following conditions:
  16  *
  17  * The above copyright notice and this permission notice shall be included in
  18  * all copies or substantial portions of the Software.
  19  *
  20  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  21  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  23  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  24  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  25  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  26  * THE SOFTWARE.
  27  */
  28
  29 #include "qemu/osdep.h"
  30 #include "qemu/cutils.h"
  31 #include "qemu/bitops.h"
  32 #include "qemu/bitmap.h"
  33 #include "qemu/madvise.h"
  34 #include "qemu/main-loop.h"
  35 #include "io/channel-null.h"
  36 #include "xbzrle.h"
  37 #include "ram.h"
  38 #include "migration.h"
  39 #include "migration/register.h"
  40 #include "migration/misc.h"
  41 #include "qemu-file.h"
  42 #include "postcopy-ram.h"
  43 #include "page_cache.h"
  44 #include "qemu/error-report.h"
  45 #include "qapi/error.h"
  46 #include "qapi/qapi-types-migration.h"
  47 #include "qapi/qapi-events-migration.h"
  48 #include "qapi/qmp/qerror.h"
  49 #include "trace.h"
  50 #include "exec/ram_addr.h"
  51 #include "exec/target_page.h"
  52 #include "qemu/rcu_queue.h"
  53 #include "migration/colo.h"
  54 #include "block.h"
  55 #include "sysemu/cpu-throttle.h"
  56 #include "savevm.h"
  57 #include "qemu/iov.h"
  58 #include "multifd.h"
  59 #include "sysemu/runstate.h"
  60 #include "options.h"
  61
  62 #include "hw/boards.h" /* for machine_dump_guest_core() */
  63
  64 #if defined(__linux__)
  65 #include "qemu/userfaultfd.h"
  66 #endif /* defined(__linux__) */
  67
  68 /***********************************************************/
  69 /* ram save/restore */
  70
  71 /*
  72  * RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS, it
  73  * worked for pages that were filled with the same char.  We switched
  74  * it to only search for the zero value.  And to avoid confusion with
  75  * RAM_SAVE_FLAG_COMPRESS_PAGE just rename it.
  76  */
/*
 * Flag bits carried in the low bits of the page offset written by
 * save_page_header().
 *
 * RAM_SAVE_FLAG_FULL was obsoleted in 2009, it can be reused now
 */
#define RAM_SAVE_FLAG_FULL     0x01
#define RAM_SAVE_FLAG_ZERO     0x02
#define RAM_SAVE_FLAG_MEM_SIZE 0x04
#define RAM_SAVE_FLAG_PAGE     0x08
#define RAM_SAVE_FLAG_EOS      0x10
/* Set by save_page_header() when the page is in the last sent block */
#define RAM_SAVE_FLAG_CONTINUE 0x20
/* Set by save_xbzrle_page() on pages sent XBZRLE-encoded */
#define RAM_SAVE_FLAG_XBZRLE   0x40
/* 0x80 is reserved in qemu-file.h for RAM_SAVE_FLAG_HOOK */
#define RAM_SAVE_FLAG_COMPRESS_PAGE    0x100
#define RAM_SAVE_FLAG_MULTIFD_FLUSH    0x200
/* We can't use any flag that is bigger than 0x200 */
  91
/*
 * Encoder called by save_xbzrle_page().  Defaults to the generic
 * xbzrle_encode_buffer; when CONFIG_AVX512BW_OPT is defined,
 * init_cpu_flag() may replace it with the AVX512 implementation.
 */
int (*xbzrle_encode_buffer_func)(uint8_t *, uint8_t *, int,
     uint8_t *, int) = xbzrle_encode_buffer;
  94 #if defined(CONFIG_AVX512BW_OPT)
  95 #include "qemu/cpuid.h"
  96 static void __attribute__((constructor)) init_cpu_flag(void)
  97 {
  98     unsigned max = __get_cpuid_max(0, NULL);
  99     int a, b, c, d;
 100     if (max >= 1) {
 101         __cpuid(1, a, b, c, d);
 102          /* We must check that AVX is not just available, but usable.  */
 103         if ((c & bit_OSXSAVE) && (c & bit_AVX) && max >= 7) {
 104             int bv;
 105             __asm("xgetbv" : "=a"(bv), "=d"(d) : "c"(0));
 106             __cpuid_count(7, 0, a, b, c, d);
 107            /* 0xe6:
 108             *  XCR0[7:5] = 111b (OPMASK state, upper 256-bit of ZMM0-ZMM15
 109             *                    and ZMM16-ZMM31 state are enabled by OS)
 110             *  XCR0[2:1] = 11b (XMM state and YMM state are enabled by OS)
 111             */
 112             if ((bv & 0xe6) == 0xe6 && (b & bit_AVX512BW)) {
 113                 xbzrle_encode_buffer_func = xbzrle_encode_buffer_avx512;
 114             }
 115         }
 116     }
 117 }
 118 #endif
 119
/*
 * XBZRLE statistics; save_xbzrle_page() updates the cache_miss, pages,
 * overflow and bytes members.
 */
XBZRLECacheStats xbzrle_counters;
 121
/*
 * Used by the search for pages to send.  RAMState holds one of these
 * per RAM channel (see RAMState.pss[]).
 */
struct PageSearchStatus {
    /* The migration channel used for a specific host page */
    QEMUFile    *pss_channel;
    /*
     * Last block from where we have sent data.  save_page_header()
     * compares against it to decide whether the block name has to be
     * written again, and updates it afterwards.
     */
    RAMBlock *last_sent_block;
    /* Current block being searched */
    RAMBlock    *block;
    /* Current page to search from (a page index, see pss_init()) */
    unsigned long page;
    /* Set once we wrap around */
    bool         complete_round;
    /* Whether we're sending a host page */
    bool          host_page_sending;
    /* The start/end of current host page.  Invalid if host_page_sending==false */
    unsigned long host_page_start;
    unsigned long host_page_end;
};
typedef struct PageSearchStatus PageSearchStatus;
 141
/*
 * XBZRLE state: the page cache plus the scratch buffers and the static
 * zero page used while encoding/decoding.
 */
static struct {
    /* buffer used for XBZRLE encoding (output of the encoder) */
    uint8_t *encoded_buf;
    /* buffer for storing page content (private copy of the page to encode) */
    uint8_t *current_buf;
    /* Cache for XBZRLE, Protected by lock. */
    PageCache *cache;
    /* Taken by XBZRLE_cache_lock()/XBZRLE_cache_unlock() */
    QemuMutex lock;
    /* it will store a page full of zeros */
    uint8_t *zero_target_page;
    /* buffer used for XBZRLE decoding */
    uint8_t *decoded_buf;
} XBZRLE;
 157
 158 static void XBZRLE_cache_lock(void)
 159 {
 160     if (migrate_xbzrle()) {
 161         qemu_mutex_lock(&XBZRLE.lock);
 162     }
 163 }
 164
 165 static void XBZRLE_cache_unlock(void)
 166 {
 167     if (migrate_xbzrle()) {
 168         qemu_mutex_unlock(&XBZRLE.lock);
 169     }
 170 }
 171
 172 /**
 173  * xbzrle_cache_resize: resize the xbzrle cache
 174  *
 175  * This function is called from migrate_params_apply in main
 176  * thread, possibly while a migration is in progress.  A running
 177  * migration may be using the cache and might finish during this call,
 178  * hence changes to the cache are protected by XBZRLE.lock().
 179  *
 180  * Returns 0 for success or -1 for error
 181  *
 182  * @new_size: new cache size
 183  * @errp: set *errp if the check failed, with reason
 184  */
 185 int xbzrle_cache_resize(uint64_t new_size, Error **errp)
 186 {
 187     PageCache *new_cache;
 188     int64_t ret = 0;
 189
 190     /* Check for truncation */
 191     if (new_size != (size_t)new_size) {
 192         error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
 193                    "exceeding address space");
 194         return -1;
 195     }
 196
 197     if (new_size == migrate_xbzrle_cache_size()) {
 198         /* nothing to do */
 199         return 0;
 200     }
 201
 202     XBZRLE_cache_lock();
 203
 204     if (XBZRLE.cache != NULL) {
 205         new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp);
 206         if (!new_cache) {
 207             ret = -1;
 208             goto out;
 209         }
 210
 211         cache_fini(XBZRLE.cache);
 212         XBZRLE.cache = new_cache;
 213     }
 214 out:
 215     XBZRLE_cache_unlock();
 216     return ret;
 217 }
 218
 219 static bool postcopy_preempt_active(void)
 220 {
 221     return migrate_postcopy_preempt() && migration_in_postcopy();
 222 }
 223
 224 bool ramblock_is_ignored(RAMBlock *block)
 225 {
 226     return !qemu_ram_is_migratable(block) ||
 227            (migrate_ignore_shared() && qemu_ram_is_shared(block));
 228 }
 229
/*
 * NOTE(review): presumably undefined so that code below has to use the
 * filtered iterators such as RAMBLOCK_FOREACH_NOT_IGNORED -- confirm.
 */
#undef RAMBLOCK_FOREACH
 231
 232 int foreach_not_ignored_block(RAMBlockIterFunc func, void *opaque)
 233 {
 234     RAMBlock *block;
 235     int ret = 0;
 236
 237     RCU_READ_LOCK_GUARD();
 238
 239     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
 240         ret = func(block, opaque);
 241         if (ret) {
 242             break;
 243         }
 244     }
 245     return ret;
 246 }
 247
 248 static void ramblock_recv_map_init(void)
 249 {
 250     RAMBlock *rb;
 251
 252     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
 253         assert(!rb->receivedmap);
 254         rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits());
 255     }
 256 }
 257
 258 int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr)
 259 {
 260     return test_bit(ramblock_recv_bitmap_offset(host_addr, rb),
 261                     rb->receivedmap);
 262 }
 263
 264 bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset)
 265 {
 266     return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap);
 267 }
 268
 269 void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr)
 270 {
 271     set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap);
 272 }
 273
 274 void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr,
 275                                     size_t nr)
 276 {
 277     bitmap_set_atomic(rb->receivedmap,
 278                       ramblock_recv_bitmap_offset(host_addr, rb),
 279                       nr);
 280 }
 281
/* End marker written after the bitmap by ramblock_recv_bitmap_send() */
#define  RAMBLOCK_RECV_BITMAP_ENDING  (0x0123456789abcdefULL)
 283
 284 /*
 285  * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes).
 286  *
 287  * Returns >0 if success with sent bytes, or <0 if error.
 288  */
 289 int64_t ramblock_recv_bitmap_send(QEMUFile *file,
 290                                   const char *block_name)
 291 {
 292     RAMBlock *block = qemu_ram_block_by_name(block_name);
 293     unsigned long *le_bitmap, nbits;
 294     uint64_t size;
 295
 296     if (!block) {
 297         error_report("%s: invalid block name: %s", __func__, block_name);
 298         return -1;
 299     }
 300
 301     nbits = block->postcopy_length >> TARGET_PAGE_BITS;
 302
 303     /*
 304      * Make sure the tmp bitmap buffer is big enough, e.g., on 32bit
 305      * machines we may need 4 more bytes for padding (see below
 306      * comment). So extend it a bit before hand.
 307      */
 308     le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
 309
 310     /*
 311      * Always use little endian when sending the bitmap. This is
 312      * required that when source and destination VMs are not using the
 313      * same endianness. (Note: big endian won't work.)
 314      */
 315     bitmap_to_le(le_bitmap, block->receivedmap, nbits);
 316
 317     /* Size of the bitmap, in bytes */
 318     size = DIV_ROUND_UP(nbits, 8);
 319
 320     /*
 321      * size is always aligned to 8 bytes for 64bit machines, but it
 322      * may not be true for 32bit machines. We need this padding to
 323      * make sure the migration can survive even between 32bit and
 324      * 64bit machines.
 325      */
 326     size = ROUND_UP(size, 8);
 327
 328     qemu_put_be64(file, size);
 329     qemu_put_buffer(file, (const uint8_t *)le_bitmap, size);
 330     /*
 331      * Mark as an end, in case the middle part is screwed up due to
 332      * some "mysterious" reason.
 333      */
 334     qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING);
 335     qemu_fflush(file);
 336
 337     g_free(le_bitmap);
 338
 339     if (qemu_file_get_error(file)) {
 340         return qemu_file_get_error(file);
 341     }
 342
 343     return size + sizeof(size);
 344 }
 345
/*
 * An outstanding page request, on the source, having been received
 * and queued (entries live on RAMState.src_page_requests)
 */
struct RAMSrcPageRequest {
    /* Block the request refers to */
    RAMBlock *rb;
    /* Start and length of the requested range; units not visible here */
    hwaddr    offset;
    hwaddr    len;

    /* Queue linkage */
    QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
};
 357
/* State of RAM for migration */
struct RAMState {
    /*
     * PageSearchStatus structures for the channels when sending pages.
     * Protected by the bitmap_mutex.
     */
    PageSearchStatus pss[RAM_CHANNEL_MAX];
    /* UFFD file descriptor, used in 'write-tracking' migration */
    int uffdio_fd;
    /* total ram size in bytes */
    uint64_t ram_bytes_total;
    /* Last block that we have visited searching for dirty pages */
    RAMBlock *last_seen_block;
    /* Last dirty target page we have sent */
    ram_addr_t last_page;
    /* last ram version we have seen */
    uint32_t last_version;
    /* How many times we have seen too many pages being dirtied */
    int dirty_rate_high_cnt;
    /* these variables are used for bitmap sync */
    /* last time we did a full bitmap_sync */
    int64_t time_last_bitmap_sync;
    /* bytes transferred at start_time */
    uint64_t bytes_xfer_prev;
    /* number of dirty pages since start_time */
    uint64_t num_dirty_pages_period;
    /* xbzrle misses since the beginning of the period */
    uint64_t xbzrle_cache_miss_prev;
    /* Amount of xbzrle pages since the beginning of the period */
    uint64_t xbzrle_pages_prev;
    /* Amount of xbzrle encoded bytes since the beginning of the period */
    uint64_t xbzrle_bytes_prev;
    /* Start using XBZRLE (e.g., after the first round). */
    bool xbzrle_enabled;
    /* Are we on the last stage of migration */
    bool last_stage;
    /* compression statistics since the beginning of the period */
    /* number of times no compression thread was free to take a page */
    uint64_t compress_thread_busy_prev;
    /* number of bytes after compression */
    uint64_t compressed_size_prev;
    /* number of compressed pages */
    uint64_t compress_pages_prev;

    /* total handled target pages at the beginning of period */
    uint64_t target_page_count_prev;
    /* total handled target pages since start */
    uint64_t target_page_count;
    /* number of dirty bits in the bitmap */
    uint64_t migration_dirty_pages;
    /*
     * Protects:
     * - dirty/clear bitmap
     * - migration_dirty_pages
     * - pss structures
     */
    QemuMutex bitmap_mutex;
    /* The RAMBlock used in the last src_page_requests */
    RAMBlock *last_req_rb;
    /* Queue of outstanding page requests from the destination */
    QemuMutex src_page_req_mutex;
    QSIMPLEQ_HEAD(, RAMSrcPageRequest) src_page_requests;
};
typedef struct RAMState RAMState;
 422
/* Current RAM migration state; may be NULL (see ram_bytes_remaining()) */
static RAMState *ram_state;

/* Notifiers invoked by precopy_notify() */
static NotifierWithReturnList precopy_notifier_list;
 426
 427 /* Whether postcopy has queued requests? */
 428 static bool postcopy_has_request(RAMState *rs)
 429 {
 430     return !QSIMPLEQ_EMPTY_ATOMIC(&rs->src_page_requests);
 431 }
 432
 433 void precopy_infrastructure_init(void)
 434 {
 435     notifier_with_return_list_init(&precopy_notifier_list);
 436 }
 437
 438 void precopy_add_notifier(NotifierWithReturn *n)
 439 {
 440     notifier_with_return_list_add(&precopy_notifier_list, n);
 441 }
 442
 443 void precopy_remove_notifier(NotifierWithReturn *n)
 444 {
 445     notifier_with_return_remove(n);
 446 }
 447
 448 int precopy_notify(PrecopyNotifyReason reason, Error **errp)
 449 {
 450     PrecopyNotifyData pnd;
 451     pnd.reason = reason;
 452     pnd.errp = errp;
 453
 454     return notifier_with_return_list_notify(&precopy_notifier_list, &pnd);
 455 }
 456
 457 uint64_t ram_bytes_remaining(void)
 458 {
 459     return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) :
 460                        0;
 461 }
 462
/* RAM migration statistics; byte counters are fed by ram_transferred_add() */
RAMStats ram_counters;
 464
 465 void ram_transferred_add(uint64_t bytes)
 466 {
 467     if (runstate_is_running()) {
 468         stat64_add(&ram_counters.precopy_bytes, bytes);
 469     } else if (migration_in_postcopy()) {
 470         stat64_add(&ram_counters.postcopy_bytes, bytes);
 471     } else {
 472         stat64_add(&ram_counters.downtime_bytes, bytes);
 473     }
 474     stat64_add(&ram_counters.transferred, bytes);
 475 }
 476
/* Table of operations used by RAM migration */
struct MigrationOps {
    /* Hook that saves one target page described by @pss */
    int (*ram_save_target_page)(RAMState *rs, PageSearchStatus *pss);
};
typedef struct MigrationOps MigrationOps;
 481
/* Active MigrationOps table; where it is set is not visible in this part */
MigrationOps *migration_ops;

/* Statistics for multi-thread compression */
CompressionStats compression_counters;
 485
/* Per-thread state of one compression worker (see do_data_compress()) */
struct CompressParam {
    /*
     * Set to true by the worker, under comp_done_lock, once a page has
     * been processed; initialised to true at setup.
     */
    bool done;
    /* Set under @mutex to make the worker leave its loop */
    bool quit;
    /* Result of do_compress_ram_page() for the last page processed */
    bool zero_page;
    /* Dummy output buffer backed by a null channel; NULL if not set up */
    QEMUFile *file;
    /* @mutex and @cond protect and signal @block, @offset and @quit */
    QemuMutex mutex;
    QemuCond cond;
    /* Pending request; the worker resets @block to NULL when it takes it */
    RAMBlock *block;
    ram_addr_t offset;

    /* internally used fields */
    /* zlib stream, set up with deflateInit() and torn down with deflateEnd() */
    z_stream stream;
    /* TARGET_PAGE_SIZE buffer handed to do_compress_ram_page() */
    uint8_t *originbuf;
};
typedef struct CompressParam CompressParam;
 501
/*
 * Per-thread decompression state.
 * NOTE(review): the users of this struct are not in this part of the
 * file; presumably the fields mirror CompressParam -- confirm.
 */
struct DecompressParam {
    bool done;
    bool quit;
    QemuMutex mutex;
    QemuCond cond;
    void *des;
    uint8_t *compbuf;
    int len;
    z_stream stream;
};
typedef struct DecompressParam DecompressParam;
 513
/*
 * comp_param and compress_threads are arrays with one entry per
 * compression thread, allocated in compress_threads_save_setup() and
 * released in compress_threads_save_cleanup().
 */
static CompressParam *comp_param;
static QemuThread *compress_threads;
/* comp_done_cond is used to wake up the migration thread when
 * one of the compression threads has finished the compression.
 * comp_done_lock is used to co-work with comp_done_cond.
 */
static QemuMutex comp_done_lock;
static QemuCond comp_done_cond;

/*
 * Decompression side state.
 * NOTE(review): used outside this part of the file; presumably the
 * counterparts of the compression variables above -- confirm.
 */
static QEMUFile *decomp_file;
static DecompressParam *decomp_param;
static QemuThread *decompress_threads;
static QemuMutex decomp_done_lock;
static QemuCond decomp_done_cond;
 528
/* Forward declarations of helpers that are used before being defined */
static int ram_save_host_page_urgent(PageSearchStatus *pss);

/* Called by the compression workers; the result is stored as zero_page */
static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
                                 ram_addr_t offset, uint8_t *source_buf);
 533
 534 /* NOTE: page is the PFN not real ram_addr_t. */
 535 static void pss_init(PageSearchStatus *pss, RAMBlock *rb, ram_addr_t page)
 536 {
 537     pss->block = rb;
 538     pss->page = page;
 539     pss->complete_round = false;
 540 }
 541
 542 /*
 543  * Check whether two PSSs are actively sending the same page.  Return true
 544  * if it is, false otherwise.
 545  */
 546 static bool pss_overlap(PageSearchStatus *pss1, PageSearchStatus *pss2)
 547 {
 548     return pss1->host_page_sending && pss2->host_page_sending &&
 549         (pss1->host_page_start == pss2->host_page_start);
 550 }
 551
 552 static void *do_data_compress(void *opaque)
 553 {
 554     CompressParam *param = opaque;
 555     RAMBlock *block;
 556     ram_addr_t offset;
 557     bool zero_page;
 558
 559     qemu_mutex_lock(&param->mutex);
 560     while (!param->quit) {
 561         if (param->block) {
 562             block = param->block;
 563             offset = param->offset;
 564             param->block = NULL;
 565             qemu_mutex_unlock(&param->mutex);
 566
 567             zero_page = do_compress_ram_page(param->file, &param->stream,
 568                                              block, offset, param->originbuf);
 569
 570             qemu_mutex_lock(&comp_done_lock);
 571             param->done = true;
 572             param->zero_page = zero_page;
 573             qemu_cond_signal(&comp_done_cond);
 574             qemu_mutex_unlock(&comp_done_lock);
 575
 576             qemu_mutex_lock(&param->mutex);
 577         } else {
 578             qemu_cond_wait(&param->cond, &param->mutex);
 579         }
 580     }
 581     qemu_mutex_unlock(&param->mutex);
 582
 583     return NULL;
 584 }
 585
 586 static void compress_threads_save_cleanup(void)
 587 {
 588     int i, thread_count;
 589
 590     if (!migrate_compress() || !comp_param) {
 591         return;
 592     }
 593
 594     thread_count = migrate_compress_threads();
 595     for (i = 0; i < thread_count; i++) {
 596         /*
 597          * we use it as a indicator which shows if the thread is
 598          * properly init'd or not
 599          */
 600         if (!comp_param[i].file) {
 601             break;
 602         }
 603
 604         qemu_mutex_lock(&comp_param[i].mutex);
 605         comp_param[i].quit = true;
 606         qemu_cond_signal(&comp_param[i].cond);
 607         qemu_mutex_unlock(&comp_param[i].mutex);
 608
 609         qemu_thread_join(compress_threads + i);
 610         qemu_mutex_destroy(&comp_param[i].mutex);
 611         qemu_cond_destroy(&comp_param[i].cond);
 612         deflateEnd(&comp_param[i].stream);
 613         g_free(comp_param[i].originbuf);
 614         qemu_fclose(comp_param[i].file);
 615         comp_param[i].file = NULL;
 616     }
 617     qemu_mutex_destroy(&comp_done_lock);
 618     qemu_cond_destroy(&comp_done_cond);
 619     g_free(compress_threads);
 620     g_free(comp_param);
 621     compress_threads = NULL;
 622     comp_param = NULL;
 623 }
 624
 625 static int compress_threads_save_setup(void)
 626 {
 627     int i, thread_count;
 628
 629     if (!migrate_compress()) {
 630         return 0;
 631     }
 632     thread_count = migrate_compress_threads();
 633     compress_threads = g_new0(QemuThread, thread_count);
 634     comp_param = g_new0(CompressParam, thread_count);
 635     qemu_cond_init(&comp_done_cond);
 636     qemu_mutex_init(&comp_done_lock);
 637     for (i = 0; i < thread_count; i++) {
 638         comp_param[i].originbuf = g_try_malloc(TARGET_PAGE_SIZE);
 639         if (!comp_param[i].originbuf) {
 640             goto exit;
 641         }
 642
 643         if (deflateInit(&comp_param[i].stream,
 644                         migrate_compress_level()) != Z_OK) {
 645             g_free(comp_param[i].originbuf);
 646             goto exit;
 647         }
 648
 649         /* comp_param[i].file is just used as a dummy buffer to save data,
 650          * set its ops to empty.
 651          */
 652         comp_param[i].file = qemu_file_new_output(
 653             QIO_CHANNEL(qio_channel_null_new()));
 654         comp_param[i].done = true;
 655         comp_param[i].quit = false;
 656         qemu_mutex_init(&comp_param[i].mutex);
 657         qemu_cond_init(&comp_param[i].cond);
 658         qemu_thread_create(compress_threads + i, "compress",
 659                            do_data_compress, comp_param + i,
 660                            QEMU_THREAD_JOINABLE);
 661     }
 662     return 0;
 663
 664 exit:
 665     compress_threads_save_cleanup();
 666     return -1;
 667 }
 668
 669 /**
 670  * save_page_header: write page header to wire
 671  *
 672  * If this is the 1st block, it also writes the block identification
 673  *
 674  * Returns the number of bytes written
 675  *
 676  * @pss: current PSS channel status
 677  * @block: block that contains the page we want to send
 678  * @offset: offset inside the block for the page
 679  *          in the lower bits, it contains flags
 680  */
 681 static size_t save_page_header(PageSearchStatus *pss, QEMUFile *f,
 682                                RAMBlock *block, ram_addr_t offset)
 683 {
 684     size_t size, len;
 685     bool same_block = (block == pss->last_sent_block);
 686
 687     if (same_block) {
 688         offset |= RAM_SAVE_FLAG_CONTINUE;
 689     }
 690     qemu_put_be64(f, offset);
 691     size = 8;
 692
 693     if (!same_block) {
 694         len = strlen(block->idstr);
 695         qemu_put_byte(f, len);
 696         qemu_put_buffer(f, (uint8_t *)block->idstr, len);
 697         size += 1 + len;
 698         pss->last_sent_block = block;
 699     }
 700     return size;
 701 }
 702
 703 /**
 704  * mig_throttle_guest_down: throttle down the guest
 705  *
 706  * Reduce amount of guest cpu execution to hopefully slow down memory
 707  * writes. If guest dirty memory rate is reduced below the rate at
 708  * which we can transfer pages to the destination then we should be
 709  * able to complete migration. Some workloads dirty memory way too
 710  * fast and will not effectively converge, even with auto-converge.
 711  */
 712 static void mig_throttle_guest_down(uint64_t bytes_dirty_period,
 713                                     uint64_t bytes_dirty_threshold)
 714 {
 715     uint64_t pct_initial = migrate_cpu_throttle_initial();
 716     uint64_t pct_increment = migrate_cpu_throttle_increment();
 717     bool pct_tailslow = migrate_cpu_throttle_tailslow();
 718     int pct_max = migrate_max_cpu_throttle();
 719
 720     uint64_t throttle_now = cpu_throttle_get_percentage();
 721     uint64_t cpu_now, cpu_ideal, throttle_inc;
 722
 723     /* We have not started throttling yet. Let's start it. */
 724     if (!cpu_throttle_active()) {
 725         cpu_throttle_set(pct_initial);
 726     } else {
 727         /* Throttling already on, just increase the rate */
 728         if (!pct_tailslow) {
 729             throttle_inc = pct_increment;
 730         } else {
 731             /* Compute the ideal CPU percentage used by Guest, which may
 732              * make the dirty rate match the dirty rate threshold. */
 733             cpu_now = 100 - throttle_now;
 734             cpu_ideal = cpu_now * (bytes_dirty_threshold * 1.0 /
 735                         bytes_dirty_period);
 736             throttle_inc = MIN(cpu_now - cpu_ideal, pct_increment);
 737         }
 738         cpu_throttle_set(MIN(throttle_now + throttle_inc, pct_max));
 739     }
 740 }
 741
 742 void mig_throttle_counter_reset(void)
 743 {
 744     RAMState *rs = ram_state;
 745
 746     rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
 747     rs->num_dirty_pages_period = 0;
 748     rs->bytes_xfer_prev = stat64_get(&ram_counters.transferred);
 749 }
 750
 751 /**
 752  * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
 753  *
 754  * @rs: current RAM state
 755  * @current_addr: address for the zero page
 756  *
 757  * Update the xbzrle cache to reflect a page that's been sent as all 0.
 758  * The important thing is that a stale (not-yet-0'd) page be replaced
 759  * by the new data.
 760  * As a bonus, if the page wasn't in the cache it gets added so that
 761  * when a small write is made into the 0'd page it gets XBZRLE sent.
 762  */
 763 static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
 764 {
 765     /* We don't care if this fails to allocate a new cache page
 766      * as long as it updated an old one */
 767     cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
 768                  stat64_get(&ram_counters.dirty_sync_count));
 769 }
 770
/* Byte written by save_xbzrle_page() right after the page header */
#define ENCODING_FLAG_XBZRLE 0x1
 772
 773 /**
 774  * save_xbzrle_page: compress and send current page
 775  *
 776  * Returns: 1 means that we wrote the page
 777  *          0 means that page is identical to the one already sent
 778  *          -1 means that xbzrle would be longer than normal
 779  *
 780  * @rs: current RAM state
 781  * @pss: current PSS channel
 782  * @current_data: pointer to the address of the page contents
 783  * @current_addr: addr of the page
 784  * @block: block that contains the page we want to send
 785  * @offset: offset inside the block for the page
 786  */
 787 static int save_xbzrle_page(RAMState *rs, PageSearchStatus *pss,
 788                             uint8_t **current_data, ram_addr_t current_addr,
 789                             RAMBlock *block, ram_addr_t offset)
 790 {
 791     int encoded_len = 0, bytes_xbzrle;
 792     uint8_t *prev_cached_page;
 793     QEMUFile *file = pss->pss_channel;
 794     uint64_t generation = stat64_get(&ram_counters.dirty_sync_count);
 795
 796     if (!cache_is_cached(XBZRLE.cache, current_addr, generation)) {
 797         xbzrle_counters.cache_miss++;
 798         if (!rs->last_stage) {
 799             if (cache_insert(XBZRLE.cache, current_addr, *current_data,
 800                              generation) == -1) {
 801                 return -1;
 802             } else {
 803                 /* update *current_data when the page has been
 804                    inserted into cache */
 805                 *current_data = get_cached_data(XBZRLE.cache, current_addr);
 806             }
 807         }
 808         return -1;
 809     }
 810
 811     /*
 812      * Reaching here means the page has hit the xbzrle cache, no matter what
 813      * encoding result it is (normal encoding, overflow or skipping the page),
 814      * count the page as encoded. This is used to calculate the encoding rate.
 815      *
 816      * Example: 2 pages (8KB) being encoded, first page encoding generates 2KB,
 817      * 2nd page turns out to be skipped (i.e. no new bytes written to the
 818      * page), the overall encoding rate will be 8KB / 2KB = 4, which has the
 819      * skipped page included. In this way, the encoding rate can tell if the
 820      * guest page is good for xbzrle encoding.
 821      */
 822     xbzrle_counters.pages++;
 823     prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
 824
 825     /* save current buffer into memory */
 826     memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
 827
 828     /* XBZRLE encoding (if there is no overflow) */
 829     encoded_len = xbzrle_encode_buffer_func(prev_cached_page, XBZRLE.current_buf,
 830                                             TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
 831                                             TARGET_PAGE_SIZE);
 832
 833     /*
 834      * Update the cache contents, so that it corresponds to the data
 835      * sent, in all cases except where we skip the page.
 836      */
 837     if (!rs->last_stage && encoded_len != 0) {
 838         memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
 839         /*
 840          * In the case where we couldn't compress, ensure that the caller
 841          * sends the data from the cache, since the guest might have
 842          * changed the RAM since we copied it.
 843          */
 844         *current_data = prev_cached_page;
 845     }
 846
 847     if (encoded_len == 0) {
 848         trace_save_xbzrle_page_skipping();
 849         return 0;
 850     } else if (encoded_len == -1) {
 851         trace_save_xbzrle_page_overflow();
 852         xbzrle_counters.overflow++;
 853         xbzrle_counters.bytes += TARGET_PAGE_SIZE;
 854         return -1;
 855     }
 856
 857     /* Send XBZRLE based compressed page */
 858     bytes_xbzrle = save_page_header(pss, pss->pss_channel, block,
 859                                     offset | RAM_SAVE_FLAG_XBZRLE);
 860     qemu_put_byte(file, ENCODING_FLAG_XBZRLE);
 861     qemu_put_be16(file, encoded_len);
 862     qemu_put_buffer(file, XBZRLE.encoded_buf, encoded_len);
 863     bytes_xbzrle += encoded_len + 1 + 2;
 864     /*
 865      * Like compressed_size (please see update_compress_thread_counts),
 866      * the xbzrle encoded bytes don't count the 8 byte header with
 867      * RAM_SAVE_FLAG_CONTINUE.
 868      */
 869     xbzrle_counters.bytes += bytes_xbzrle - 8;
 870     ram_transferred_add(bytes_xbzrle);
 871
 872     return 1;
 873 }
 874
 875 /**
 876  * pss_find_next_dirty: find the next dirty page of current ramblock
 877  *
 878  * This function updates pss->page to point to the next dirty page index
 879  * within the ramblock to migrate, or the end of ramblock when nothing
 880  * found.  Note that when pss->host_page_sending==true it means we're
 881  * during sending a host page, so we won't look for dirty page that is
 882  * outside the host page boundary.
 883  *
 884  * @pss: the current page search status
 885  */
 886 static void pss_find_next_dirty(PageSearchStatus *pss)
 887 {
 888     RAMBlock *rb = pss->block;
 889     unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
 890     unsigned long *bitmap = rb->bmap;
 891
 892     if (ramblock_is_ignored(rb)) {
 893         /* Points directly to the end, so we know no dirty page */
 894         pss->page = size;
 895         return;
 896     }
 897
 898     /*
 899      * If during sending a host page, only look for dirty pages within the
 900      * current host page being send.
 901      */
 902     if (pss->host_page_sending) {
 903         assert(pss->host_page_end);
 904         size = MIN(size, pss->host_page_end);
 905     }
 906
 907     pss->page = find_next_bit(bitmap, size, pss->page);
 908 }
 909
 910 static void migration_clear_memory_region_dirty_bitmap(RAMBlock *rb,
 911                                                        unsigned long page)
 912 {
 913     uint8_t shift;
 914     hwaddr size, start;
 915
 916     if (!rb->clear_bmap || !clear_bmap_test_and_clear(rb, page)) {
 917         return;
 918     }
 919
 920     shift = rb->clear_bmap_shift;
 921     /*
 922      * CLEAR_BITMAP_SHIFT_MIN should always guarantee this... this
 923      * can make things easier sometimes since then start address
 924      * of the small chunk will always be 64 pages aligned so the
 925      * bitmap will always be aligned to unsigned long. We should
 926      * even be able to remove this restriction but I'm simply
 927      * keeping it.
 928      */
 929     assert(shift >= 6);
 930
 931     size = 1ULL << (TARGET_PAGE_BITS + shift);
 932     start = QEMU_ALIGN_DOWN((ram_addr_t)page << TARGET_PAGE_BITS, size);
 933     trace_migration_bitmap_clear_dirty(rb->idstr, start, size, page);
 934     memory_region_clear_dirty_bitmap(rb->mr, start, size);
 935 }
 936
 937 static void
 938 migration_clear_memory_region_dirty_bitmap_range(RAMBlock *rb,
 939                                                  unsigned long start,
 940                                                  unsigned long npages)
 941 {
 942     unsigned long i, chunk_pages = 1UL << rb->clear_bmap_shift;
 943     unsigned long chunk_start = QEMU_ALIGN_DOWN(start, chunk_pages);
 944     unsigned long chunk_end = QEMU_ALIGN_UP(start + npages, chunk_pages);
 945
 946     /*
 947      * Clear pages from start to start + npages - 1, so the end boundary is
 948      * exclusive.
 949      */
 950     for (i = chunk_start; i < chunk_end; i += chunk_pages) {
 951         migration_clear_memory_region_dirty_bitmap(rb, i);
 952     }
 953 }
 954
 955 /*
 956  * colo_bitmap_find_diry:find contiguous dirty pages from start
 957  *
 958  * Returns the page offset within memory region of the start of the contiguout
 959  * dirty page
 960  *
 961  * @rs: current RAM state
 962  * @rb: RAMBlock where to search for dirty pages
 963  * @start: page where we start the search
 964  * @num: the number of contiguous dirty pages
 965  */
 966 static inline
 967 unsigned long colo_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
 968                                      unsigned long start, unsigned long *num)
 969 {
 970     unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
 971     unsigned long *bitmap = rb->bmap;
 972     unsigned long first, next;
 973
 974     *num = 0;
 975
 976     if (ramblock_is_ignored(rb)) {
 977         return size;
 978     }
 979
 980     first = find_next_bit(bitmap, size, start);
 981     if (first >= size) {
 982         return first;
 983     }
 984     next = find_next_zero_bit(bitmap, size, first + 1);
 985     assert(next >= first);
 986     *num = next - first;
 987     return first;
 988 }
 989
 990 static inline bool migration_bitmap_clear_dirty(RAMState *rs,
 991                                                 RAMBlock *rb,
 992                                                 unsigned long page)
 993 {
 994     bool ret;
 995
 996     /*
 997      * Clear dirty bitmap if needed.  This _must_ be called before we
 998      * send any of the page in the chunk because we need to make sure
 999      * we can capture further page content changes when we sync dirty
1000      * log the next time.  So as long as we are going to send any of
1001      * the page in the chunk we clear the remote dirty bitmap for all.
1002      * Clearing it earlier won't be a problem, but too late will.
1003      */
1004     migration_clear_memory_region_dirty_bitmap(rb, page);
1005
1006     ret = test_and_clear_bit(page, rb->bmap);
1007     if (ret) {
1008         rs->migration_dirty_pages--;
1009     }
1010
1011     return ret;
1012 }
1013
1014 static void dirty_bitmap_clear_section(MemoryRegionSection *section,
1015                                        void *opaque)
1016 {
1017     const hwaddr offset = section->offset_within_region;
1018     const hwaddr size = int128_get64(section->size);
1019     const unsigned long start = offset >> TARGET_PAGE_BITS;
1020     const unsigned long npages = size >> TARGET_PAGE_BITS;
1021     RAMBlock *rb = section->mr->ram_block;
1022     uint64_t *cleared_bits = opaque;
1023
1024     /*
1025      * We don't grab ram_state->bitmap_mutex because we expect to run
1026      * only when starting migration or during postcopy recovery where
1027      * we don't have concurrent access.
1028      */
1029     if (!migration_in_postcopy() && !migrate_background_snapshot()) {
1030         migration_clear_memory_region_dirty_bitmap_range(rb, start, npages);
1031     }
1032     *cleared_bits += bitmap_count_one_with_offset(rb->bmap, start, npages);
1033     bitmap_clear(rb->bmap, start, npages);
1034 }
1035
1036 /*
1037  * Exclude all dirty pages from migration that fall into a discarded range as
1038  * managed by a RamDiscardManager responsible for the mapped memory region of
1039  * the RAMBlock. Clear the corresponding bits in the dirty bitmaps.
1040  *
1041  * Discarded pages ("logically unplugged") have undefined content and must
1042  * not get migrated, because even reading these pages for migration might
1043  * result in undesired behavior.
1044  *
1045  * Returns the number of cleared bits in the RAMBlock dirty bitmap.
1046  *
1047  * Note: The result is only stable while migrating (precopy/postcopy).
1048  */
1049 static uint64_t ramblock_dirty_bitmap_clear_discarded_pages(RAMBlock *rb)
1050 {
1051     uint64_t cleared_bits = 0;
1052
1053     if (rb->mr && rb->bmap && memory_region_has_ram_discard_manager(rb->mr)) {
1054         RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
1055         MemoryRegionSection section = {
1056             .mr = rb->mr,
1057             .offset_within_region = 0,
1058             .size = int128_make64(qemu_ram_get_used_length(rb)),
1059         };
1060
1061         ram_discard_manager_replay_discarded(rdm, &section,
1062                                              dirty_bitmap_clear_section,
1063                                              &cleared_bits);
1064     }
1065     return cleared_bits;
1066 }
1067
1068 /*
1069  * Check if a host-page aligned page falls into a discarded range as managed by
1070  * a RamDiscardManager responsible for the mapped memory region of the RAMBlock.
1071  *
1072  * Note: The result is only stable while migrating (precopy/postcopy).
1073  */
1074 bool ramblock_page_is_discarded(RAMBlock *rb, ram_addr_t start)
1075 {
1076     if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
1077         RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
1078         MemoryRegionSection section = {
1079             .mr = rb->mr,
1080             .offset_within_region = start,
1081             .size = int128_make64(qemu_ram_pagesize(rb)),
1082         };
1083
1084         return !ram_discard_manager_is_populated(rdm, &section);
1085     }
1086     return false;
1087 }
1088
1089 /* Called with RCU critical section */
1090 static void ramblock_sync_dirty_bitmap(RAMState *rs, RAMBlock *rb)
1091 {
1092     uint64_t new_dirty_pages =
1093         cpu_physical_memory_sync_dirty_bitmap(rb, 0, rb->used_length);
1094
1095     rs->migration_dirty_pages += new_dirty_pages;
1096     rs->num_dirty_pages_period += new_dirty_pages;
1097 }
1098
1099 /**
1100  * ram_pagesize_summary: calculate all the pagesizes of a VM
1101  *
1102  * Returns a summary bitmap of the page sizes of all RAMBlocks
1103  *
1104  * For VMs with just normal pages this is equivalent to the host page
1105  * size. If it's got some huge pages then it's the OR of all the
1106  * different page sizes.
1107  */
1108 uint64_t ram_pagesize_summary(void)
1109 {
1110     RAMBlock *block;
1111     uint64_t summary = 0;
1112
1113     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1114         summary |= block->page_size;
1115     }
1116
1117     return summary;
1118 }
1119
1120 uint64_t ram_get_total_transferred_pages(void)
1121 {
1122     return stat64_get(&ram_counters.normal_pages) +
1123         stat64_get(&ram_counters.zero_pages) +
1124         compression_counters.pages + xbzrle_counters.pages;
1125 }
1126
1127 static void migration_update_rates(RAMState *rs, int64_t end_time)
1128 {
1129     uint64_t page_count = rs->target_page_count - rs->target_page_count_prev;
1130     double compressed_size;
1131
1132     /* calculate period counters */
1133     ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000
1134                 / (end_time - rs->time_last_bitmap_sync);
1135
1136     if (!page_count) {
1137         return;
1138     }
1139
1140     if (migrate_xbzrle()) {
1141         double encoded_size, unencoded_size;
1142
1143         xbzrle_counters.cache_miss_rate = (double)(xbzrle_counters.cache_miss -
1144             rs->xbzrle_cache_miss_prev) / page_count;
1145         rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
1146         unencoded_size = (xbzrle_counters.pages - rs->xbzrle_pages_prev) *
1147                          TARGET_PAGE_SIZE;
1148         encoded_size = xbzrle_counters.bytes - rs->xbzrle_bytes_prev;
1149         if (xbzrle_counters.pages == rs->xbzrle_pages_prev || !encoded_size) {
1150             xbzrle_counters.encoding_rate = 0;
1151         } else {
1152             xbzrle_counters.encoding_rate = unencoded_size / encoded_size;
1153         }
1154         rs->xbzrle_pages_prev = xbzrle_counters.pages;
1155         rs->xbzrle_bytes_prev = xbzrle_counters.bytes;
1156     }
1157
1158     if (migrate_compress()) {
1159         compression_counters.busy_rate = (double)(compression_counters.busy -
1160             rs->compress_thread_busy_prev) / page_count;
1161         rs->compress_thread_busy_prev = compression_counters.busy;
1162
1163         compressed_size = compression_counters.compressed_size -
1164                           rs->compressed_size_prev;
1165         if (compressed_size) {
1166             double uncompressed_size = (compression_counters.pages -
1167                                     rs->compress_pages_prev) * TARGET_PAGE_SIZE;
1168
1169             /* Compression-Ratio = Uncompressed-size / Compressed-size */
1170             compression_counters.compression_rate =
1171                                         uncompressed_size / compressed_size;
1172
1173             rs->compress_pages_prev = compression_counters.pages;
1174             rs->compressed_size_prev = compression_counters.compressed_size;
1175         }
1176     }
1177 }
1178
1179 static void migration_trigger_throttle(RAMState *rs)
1180 {
1181     uint64_t threshold = migrate_throttle_trigger_threshold();
1182     uint64_t bytes_xfer_period =
1183         stat64_get(&ram_counters.transferred) - rs->bytes_xfer_prev;
1184     uint64_t bytes_dirty_period = rs->num_dirty_pages_period * TARGET_PAGE_SIZE;
1185     uint64_t bytes_dirty_threshold = bytes_xfer_period * threshold / 100;
1186
1187     /* During block migration the auto-converge logic incorrectly detects
1188      * that ram migration makes no progress. Avoid this by disabling the
1189      * throttling logic during the bulk phase of block migration. */
1190     if (migrate_auto_converge() && !blk_mig_bulk_active()) {
1191         /* The following detection logic can be refined later. For now:
1192            Check to see if the ratio between dirtied bytes and the approx.
1193            amount of bytes that just got transferred since the last time
1194            we were in this routine reaches the threshold. If that happens
1195            twice, start or increase throttling. */
1196
1197         if ((bytes_dirty_period > bytes_dirty_threshold) &&
1198             (++rs->dirty_rate_high_cnt >= 2)) {
1199             trace_migration_throttle();
1200             rs->dirty_rate_high_cnt = 0;
1201             mig_throttle_guest_down(bytes_dirty_period,
1202                                     bytes_dirty_threshold);
1203         }
1204     }
1205 }
1206
1207 static void migration_bitmap_sync(RAMState *rs)
1208 {
1209     RAMBlock *block;
1210     int64_t end_time;
1211
1212     stat64_add(&ram_counters.dirty_sync_count, 1);
1213
1214     if (!rs->time_last_bitmap_sync) {
1215         rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1216     }
1217
1218     trace_migration_bitmap_sync_start();
1219     memory_global_dirty_log_sync();
1220
1221     qemu_mutex_lock(&rs->bitmap_mutex);
1222     WITH_RCU_READ_LOCK_GUARD() {
1223         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1224             ramblock_sync_dirty_bitmap(rs, block);
1225         }
1226         ram_counters.remaining = ram_bytes_remaining();
1227     }
1228     qemu_mutex_unlock(&rs->bitmap_mutex);
1229
1230     memory_global_after_dirty_log_sync();
1231     trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);
1232
1233     end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1234
1235     /* more than 1 second = 1000 millisecons */
1236     if (end_time > rs->time_last_bitmap_sync + 1000) {
1237         migration_trigger_throttle(rs);
1238
1239         migration_update_rates(rs, end_time);
1240
1241         rs->target_page_count_prev = rs->target_page_count;
1242
1243         /* reset period counters */
1244         rs->time_last_bitmap_sync = end_time;
1245         rs->num_dirty_pages_period = 0;
1246         rs->bytes_xfer_prev = stat64_get(&ram_counters.transferred);
1247     }
1248     if (migrate_events()) {
1249         uint64_t generation = stat64_get(&ram_counters.dirty_sync_count);
1250         qapi_event_send_migration_pass(generation);
1251     }
1252 }
1253
1254 static void migration_bitmap_sync_precopy(RAMState *rs)
1255 {
1256     Error *local_err = NULL;
1257
1258     /*
1259      * The current notifier usage is just an optimization to migration, so we
1260      * don't stop the normal migration process in the error case.
1261      */
1262     if (precopy_notify(PRECOPY_NOTIFY_BEFORE_BITMAP_SYNC, &local_err)) {
1263         error_report_err(local_err);
1264         local_err = NULL;
1265     }
1266
1267     migration_bitmap_sync(rs);
1268
1269     if (precopy_notify(PRECOPY_NOTIFY_AFTER_BITMAP_SYNC, &local_err)) {
1270         error_report_err(local_err);
1271     }
1272 }
1273
1274 void ram_release_page(const char *rbname, uint64_t offset)
1275 {
1276     if (!migrate_release_ram() || !migration_in_postcopy()) {
1277         return;
1278     }
1279
1280     ram_discard_range(rbname, offset, TARGET_PAGE_SIZE);
1281 }
1282
1283 /**
1284  * save_zero_page_to_file: send the zero page to the file
1285  *
1286  * Returns the size of data written to the file, 0 means the page is not
1287  * a zero page
1288  *
1289  * @pss: current PSS channel
1290  * @block: block that contains the page we want to send
1291  * @offset: offset inside the block for the page
1292  */
1293 static int save_zero_page_to_file(PageSearchStatus *pss, QEMUFile *file,
1294                                   RAMBlock *block, ram_addr_t offset)
1295 {
1296     uint8_t *p = block->host + offset;
1297     int len = 0;
1298
1299     if (buffer_is_zero(p, TARGET_PAGE_SIZE)) {
1300         len += save_page_header(pss, file, block, offset | RAM_SAVE_FLAG_ZERO);
1301         qemu_put_byte(file, 0);
1302         len += 1;
1303         ram_release_page(block->idstr, offset);
1304     }
1305     return len;
1306 }
1307
1308 /**
1309  * save_zero_page: send the zero page to the stream
1310  *
1311  * Returns the number of pages written.
1312  *
1313  * @pss: current PSS channel
1314  * @block: block that contains the page we want to send
1315  * @offset: offset inside the block for the page
1316  */
1317 static int save_zero_page(PageSearchStatus *pss, QEMUFile *f, RAMBlock *block,
1318                           ram_addr_t offset)
1319 {
1320     int len = save_zero_page_to_file(pss, f, block, offset);
1321
1322     if (len) {
1323         stat64_add(&ram_counters.zero_pages, 1);
1324         ram_transferred_add(len);
1325         return 1;
1326     }
1327     return -1;
1328 }
1329
1330 /*
1331  * @pages: the number of pages written by the control path,
1332  *        < 0 - error
1333  *        > 0 - number of pages written
1334  *
1335  * Return true if the pages has been saved, otherwise false is returned.
1336  */
1337 static bool control_save_page(PageSearchStatus *pss, RAMBlock *block,
1338                               ram_addr_t offset, int *pages)
1339 {
1340     uint64_t bytes_xmit = 0;
1341     int ret;
1342
1343     *pages = -1;
1344     ret = ram_control_save_page(pss->pss_channel, block->offset, offset,
1345                                 TARGET_PAGE_SIZE, &bytes_xmit);
1346     if (ret == RAM_SAVE_CONTROL_NOT_SUPP) {
1347         return false;
1348     }
1349
1350     if (bytes_xmit) {
1351         ram_transferred_add(bytes_xmit);
1352         *pages = 1;
1353     }
1354
1355     if (ret == RAM_SAVE_CONTROL_DELAYED) {
1356         return true;
1357     }
1358
1359     if (bytes_xmit > 0) {
1360         stat64_add(&ram_counters.normal_pages, 1);
1361     } else if (bytes_xmit == 0) {
1362         stat64_add(&ram_counters.zero_pages, 1);
1363     }
1364
1365     return true;
1366 }
1367
1368 /*
1369  * directly send the page to the stream
1370  *
1371  * Returns the number of pages written.
1372  *
1373  * @pss: current PSS channel
1374  * @block: block that contains the page we want to send
1375  * @offset: offset inside the block for the page
1376  * @buf: the page to be sent
1377  * @async: send to page asyncly
1378  */
1379 static int save_normal_page(PageSearchStatus *pss, RAMBlock *block,
1380                             ram_addr_t offset, uint8_t *buf, bool async)
1381 {
1382     QEMUFile *file = pss->pss_channel;
1383
1384     ram_transferred_add(save_page_header(pss, pss->pss_channel, block,
1385                                          offset | RAM_SAVE_FLAG_PAGE));
1386     if (async) {
1387         qemu_put_buffer_async(file, buf, TARGET_PAGE_SIZE,
1388                               migrate_release_ram() &&
1389                               migration_in_postcopy());
1390     } else {
1391         qemu_put_buffer(file, buf, TARGET_PAGE_SIZE);
1392     }
1393     ram_transferred_add(TARGET_PAGE_SIZE);
1394     stat64_add(&ram_counters.normal_pages, 1);
1395     return 1;
1396 }
1397
1398 /**
1399  * ram_save_page: send the given page to the stream
1400  *
1401  * Returns the number of pages written.
1402  *          < 0 - error
1403  *          >=0 - Number of pages written - this might legally be 0
1404  *                if xbzrle noticed the page was the same.
1405  *
1406  * @rs: current RAM state
1407  * @block: block that contains the page we want to send
1408  * @offset: offset inside the block for the page
1409  */
1410 static int ram_save_page(RAMState *rs, PageSearchStatus *pss)
1411 {
1412     int pages = -1;
1413     uint8_t *p;
1414     bool send_async = true;
1415     RAMBlock *block = pss->block;
1416     ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
1417     ram_addr_t current_addr = block->offset + offset;
1418
1419     p = block->host + offset;
1420     trace_ram_save_page(block->idstr, (uint64_t)offset, p);
1421
1422     XBZRLE_cache_lock();
1423     if (rs->xbzrle_enabled && !migration_in_postcopy()) {
1424         pages = save_xbzrle_page(rs, pss, &p, current_addr,
1425                                  block, offset);
1426         if (!rs->last_stage) {
1427             /* Can't send this cached data async, since the cache page
1428              * might get updated before it gets to the wire
1429              */
1430             send_async = false;
1431         }
1432     }
1433
1434     /* XBZRLE overflow or normal page */
1435     if (pages == -1) {
1436         pages = save_normal_page(pss, block, offset, p, send_async);
1437     }
1438
1439     XBZRLE_cache_unlock();
1440
1441     return pages;
1442 }
1443
1444 static int ram_save_multifd_page(QEMUFile *file, RAMBlock *block,
1445                                  ram_addr_t offset)
1446 {
1447     if (multifd_queue_page(file, block, offset) < 0) {
1448         return -1;
1449     }
1450     stat64_add(&ram_counters.normal_pages, 1);
1451
1452     return 1;
1453 }
1454
1455 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
1456                                  ram_addr_t offset, uint8_t *source_buf)
1457 {
1458     RAMState *rs = ram_state;
1459     PageSearchStatus *pss = &rs->pss[RAM_CHANNEL_PRECOPY];
1460     uint8_t *p = block->host + offset;
1461     int ret;
1462
1463     if (save_zero_page_to_file(pss, f, block, offset)) {
1464         return true;
1465     }
1466
1467     save_page_header(pss, f, block, offset | RAM_SAVE_FLAG_COMPRESS_PAGE);
1468
1469     /*
1470      * copy it to a internal buffer to avoid it being modified by VM
1471      * so that we can catch up the error during compression and
1472      * decompression
1473      */
1474     memcpy(source_buf, p, TARGET_PAGE_SIZE);
1475     ret = qemu_put_compression_data(f, stream, source_buf, TARGET_PAGE_SIZE);
1476     if (ret < 0) {
1477         qemu_file_set_error(migrate_get_current()->to_dst_file, ret);
1478         error_report("compressed data failed!");
1479     }
1480     return false;
1481 }
1482
1483 static void
1484 update_compress_thread_counts(const CompressParam *param, int bytes_xmit)
1485 {
1486     ram_transferred_add(bytes_xmit);
1487
1488     if (param->zero_page) {
1489         stat64_add(&ram_counters.zero_pages, 1);
1490         return;
1491     }
1492
1493     /* 8 means a header with RAM_SAVE_FLAG_CONTINUE. */
1494     compression_counters.compressed_size += bytes_xmit - 8;
1495     compression_counters.pages++;
1496 }
1497
1498 static bool save_page_use_compression(RAMState *rs);
1499
1500 static void flush_compressed_data(RAMState *rs)
1501 {
1502     MigrationState *ms = migrate_get_current();
1503     int idx, len, thread_count;
1504
1505     if (!save_page_use_compression(rs)) {
1506         return;
1507     }
1508     thread_count = migrate_compress_threads();
1509
1510     qemu_mutex_lock(&comp_done_lock);
1511     for (idx = 0; idx < thread_count; idx++) {
1512         while (!comp_param[idx].done) {
1513             qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1514         }
1515     }
1516     qemu_mutex_unlock(&comp_done_lock);
1517
1518     for (idx = 0; idx < thread_count; idx++) {
1519         qemu_mutex_lock(&comp_param[idx].mutex);
1520         if (!comp_param[idx].quit) {
1521             len = qemu_put_qemu_file(ms->to_dst_file, comp_param[idx].file);
1522             /*
1523              * it's safe to fetch zero_page without holding comp_done_lock
1524              * as there is no further request submitted to the thread,
1525              * i.e, the thread should be waiting for a request at this point.
1526              */
1527             update_compress_thread_counts(&comp_param[idx], len);
1528         }
1529         qemu_mutex_unlock(&comp_param[idx].mutex);
1530     }
1531 }
1532
1533 static inline void set_compress_params(CompressParam *param, RAMBlock *block,
1534                                        ram_addr_t offset)
1535 {
1536     param->block = block;
1537     param->offset = offset;
1538 }
1539
1540 static int compress_page_with_multi_thread(RAMBlock *block, ram_addr_t offset)
1541 {
1542     int idx, thread_count, bytes_xmit = -1, pages = -1;
1543     bool wait = migrate_compress_wait_thread();
1544     MigrationState *ms = migrate_get_current();
1545
1546     thread_count = migrate_compress_threads();
1547     qemu_mutex_lock(&comp_done_lock);
1548 retry:
1549     for (idx = 0; idx < thread_count; idx++) {
1550         if (comp_param[idx].done) {
1551             comp_param[idx].done = false;
1552             bytes_xmit = qemu_put_qemu_file(ms->to_dst_file,
1553                                             comp_param[idx].file);
1554             qemu_mutex_lock(&comp_param[idx].mutex);
1555             set_compress_params(&comp_param[idx], block, offset);
1556             qemu_cond_signal(&comp_param[idx].cond);
1557             qemu_mutex_unlock(&comp_param[idx].mutex);
1558             pages = 1;
1559             update_compress_thread_counts(&comp_param[idx], bytes_xmit);
1560             break;
1561         }
1562     }
1563
1564     /*
1565      * wait for the free thread if the user specifies 'compress-wait-thread',
1566      * otherwise we will post the page out in the main thread as normal page.
1567      */
1568     if (pages < 0 && wait) {
1569         qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1570         goto retry;
1571     }
1572     qemu_mutex_unlock(&comp_done_lock);
1573
1574     return pages;
1575 }
1576
1577 #define PAGE_ALL_CLEAN 0
1578 #define PAGE_TRY_AGAIN 1
1579 #define PAGE_DIRTY_FOUND 2
1580 /**
1581  * find_dirty_block: find the next dirty page and update any state
1582  * associated with the search process.
1583  *
1584  * Returns:
1585  *         <0: An error happened
1586  *         PAGE_ALL_CLEAN: no dirty page found, give up
1587  *         PAGE_TRY_AGAIN: no dirty page found, retry for next block
1588  *         PAGE_DIRTY_FOUND: dirty page found
1589  *
1590  * @rs: current RAM state
1591  * @pss: data about the state of the current dirty page scan
1592  * @again: set to false if the search has scanned the whole of RAM
1593  */
1594 static int find_dirty_block(RAMState *rs, PageSearchStatus *pss)
1595 {
1596     /* Update pss->page for the next dirty bit in ramblock */
1597     pss_find_next_dirty(pss);
1598
1599     if (pss->complete_round && pss->block == rs->last_seen_block &&
1600         pss->page >= rs->last_page) {
1601         /*
1602          * We've been once around the RAM and haven't found anything.
1603          * Give up.
1604          */
1605         return PAGE_ALL_CLEAN;
1606     }
1607     if (!offset_in_ramblock(pss->block,
1608                             ((ram_addr_t)pss->page) << TARGET_PAGE_BITS)) {
1609         /* Didn't find anything in this RAM Block */
1610         pss->page = 0;
1611         pss->block = QLIST_NEXT_RCU(pss->block, next);
1612         if (!pss->block) {
1613             if (!migrate_multifd_flush_after_each_section()) {
1614                 QEMUFile *f = rs->pss[RAM_CHANNEL_PRECOPY].pss_channel;
1615                 int ret = multifd_send_sync_main(f);
1616                 if (ret < 0) {
1617                     return ret;
1618                 }
1619                 qemu_put_be64(f, RAM_SAVE_FLAG_MULTIFD_FLUSH);
1620                 qemu_fflush(f);
1621             }
1622             /*
1623              * If memory migration starts over, we will meet a dirtied page
1624              * which may still exists in compression threads's ring, so we
1625              * should flush the compressed data to make sure the new page
1626              * is not overwritten by the old one in the destination.
1627              *
1628              * Also If xbzrle is on, stop using the data compression at this
1629              * point. In theory, xbzrle can do better than compression.
1630              */
1631             flush_compressed_data(rs);
1632
1633             /* Hit the end of the list */
1634             pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
1635             /* Flag that we've looped */
1636             pss->complete_round = true;
1637             /* After the first round, enable XBZRLE. */
1638             if (migrate_xbzrle()) {
1639                 rs->xbzrle_enabled = true;
1640             }
1641         }
1642         /* Didn't find anything this time, but try again on the new block */
1643         return PAGE_TRY_AGAIN;
1644     } else {
1645         /* We've found something */
1646         return PAGE_DIRTY_FOUND;
1647     }
1648 }
1649
1650 /**
1651  * unqueue_page: gets a page of the queue
1652  *
1653  * Helper for 'get_queued_page' - gets a page off the queue
1654  *
1655  * Returns the block of the page (or NULL if none available)
1656  *
1657  * @rs: current RAM state
1658  * @offset: used to return the offset within the RAMBlock
1659  */
1660 static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
1661 {
1662     struct RAMSrcPageRequest *entry;
1663     RAMBlock *block = NULL;
1664
1665     if (!postcopy_has_request(rs)) {
1666         return NULL;
1667     }
1668
1669     QEMU_LOCK_GUARD(&rs->src_page_req_mutex);
1670
1671     /*
1672      * This should _never_ change even after we take the lock, because no one
1673      * should be taking anything off the request list other than us.
1674      */
1675     assert(postcopy_has_request(rs));
1676
1677     entry = QSIMPLEQ_FIRST(&rs->src_page_requests);
1678     block = entry->rb;
1679     *offset = entry->offset;
1680
1681     if (entry->len > TARGET_PAGE_SIZE) {
1682         entry->len -= TARGET_PAGE_SIZE;
1683         entry->offset += TARGET_PAGE_SIZE;
1684     } else {
1685         memory_region_unref(block->mr);
1686         QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1687         g_free(entry);
1688         migration_consume_urgent_request();
1689     }
1690
1691     return block;
1692 }
1693
1694 #if defined(__linux__)
1695 /**
1696  * poll_fault_page: try to get next UFFD write fault page and, if pending fault
1697  *   is found, return RAM block pointer and page offset
1698  *
1699  * Returns pointer to the RAMBlock containing faulting page,
1700  *   NULL if no write faults are pending
1701  *
1702  * @rs: current RAM state
1703  * @offset: page offset from the beginning of the block
1704  */
1705 static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
1706 {
1707     struct uffd_msg uffd_msg;
1708     void *page_address;
1709     RAMBlock *block;
1710     int res;
1711
1712     if (!migrate_background_snapshot()) {
1713         return NULL;
1714     }
1715
1716     res = uffd_read_events(rs->uffdio_fd, &uffd_msg, 1);
1717     if (res <= 0) {
1718         return NULL;
1719     }
1720
1721     page_address = (void *)(uintptr_t) uffd_msg.arg.pagefault.address;
1722     block = qemu_ram_block_from_host(page_address, false, offset);
1723     assert(block && (block->flags & RAM_UF_WRITEPROTECT) != 0);
1724     return block;
1725 }
1726
1727 /**
1728  * ram_save_release_protection: release UFFD write protection after
1729  *   a range of pages has been saved
1730  *
1731  * @rs: current RAM state
1732  * @pss: page-search-status structure
1733  * @start_page: index of the first page in the range relative to pss->block
1734  *
1735  * Returns 0 on success, negative value in case of an error
1736 */
1737 static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
1738         unsigned long start_page)
1739 {
1740     int res = 0;
1741
1742     /* Check if page is from UFFD-managed region. */
1743     if (pss->block->flags & RAM_UF_WRITEPROTECT) {
1744         void *page_address = pss->block->host + (start_page << TARGET_PAGE_BITS);
1745         uint64_t run_length = (pss->page - start_page) << TARGET_PAGE_BITS;
1746
1747         /* Flush async buffers before un-protect. */
1748         qemu_fflush(pss->pss_channel);
1749         /* Un-protect memory range. */
1750         res = uffd_change_protection(rs->uffdio_fd, page_address, run_length,
1751                 false, false);
1752     }
1753
1754     return res;
1755 }
1756
1757 /* ram_write_tracking_available: check if kernel supports required UFFD features
1758  *
1759  * Returns true if supports, false otherwise
1760  */
1761 bool ram_write_tracking_available(void)
1762 {
1763     uint64_t uffd_features;
1764     int res;
1765
1766     res = uffd_query_features(&uffd_features);
1767     return (res == 0 &&
1768             (uffd_features & UFFD_FEATURE_PAGEFAULT_FLAG_WP) != 0);
1769 }
1770
1771 /* ram_write_tracking_compatible: check if guest configuration is
1772  *   compatible with 'write-tracking'
1773  *
1774  * Returns true if compatible, false otherwise
1775  */
1776 bool ram_write_tracking_compatible(void)
1777 {
1778     const uint64_t uffd_ioctls_mask = BIT(_UFFDIO_WRITEPROTECT);
1779     int uffd_fd;
1780     RAMBlock *block;
1781     bool ret = false;
1782
1783     /* Open UFFD file descriptor */
1784     uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, false);
1785     if (uffd_fd < 0) {
1786         return false;
1787     }
1788
1789     RCU_READ_LOCK_GUARD();
1790
1791     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1792         uint64_t uffd_ioctls;
1793
1794         /* Nothing to do with read-only and MMIO-writable regions */
1795         if (block->mr->readonly || block->mr->rom_device) {
1796             continue;
1797         }
1798         /* Try to register block memory via UFFD-IO to track writes */
1799         if (uffd_register_memory(uffd_fd, block->host, block->max_length,
1800                 UFFDIO_REGISTER_MODE_WP, &uffd_ioctls)) {
1801             goto out;
1802         }
1803         if ((uffd_ioctls & uffd_ioctls_mask) != uffd_ioctls_mask) {
1804             goto out;
1805         }
1806     }
1807     ret = true;
1808
1809 out:
1810     uffd_close_fd(uffd_fd);
1811     return ret;
1812 }
1813
1814 static inline void populate_read_range(RAMBlock *block, ram_addr_t offset,
1815                                        ram_addr_t size)
1816 {
1817     const ram_addr_t end = offset + size;
1818
1819     /*
1820      * We read one byte of each page; this will preallocate page tables if
1821      * required and populate the shared zeropage on MAP_PRIVATE anonymous memory
1822      * where no page was populated yet. This might require adaption when
1823      * supporting other mappings, like shmem.
1824      */
1825     for (; offset < end; offset += block->page_size) {
1826         char tmp = *((char *)block->host + offset);
1827
1828         /* Don't optimize the read out */
1829         asm volatile("" : "+r" (tmp));
1830     }
1831 }
1832
1833 static inline int populate_read_section(MemoryRegionSection *section,
1834                                         void *opaque)
1835 {
1836     const hwaddr size = int128_get64(section->size);
1837     hwaddr offset = section->offset_within_region;
1838     RAMBlock *block = section->mr->ram_block;
1839
1840     populate_read_range(block, offset, size);
1841     return 0;
1842 }
1843
1844 /*
1845  * ram_block_populate_read: preallocate page tables and populate pages in the
1846  *   RAM block by reading a byte of each page.
1847  *
1848  * Since it's solely used for userfault_fd WP feature, here we just
1849  *   hardcode page size to qemu_real_host_page_size.
1850  *
1851  * @block: RAM block to populate
1852  */
1853 static void ram_block_populate_read(RAMBlock *rb)
1854 {
1855     /*
1856      * Skip populating all pages that fall into a discarded range as managed by
1857      * a RamDiscardManager responsible for the mapped memory region of the
1858      * RAMBlock. Such discarded ("logically unplugged") parts of a RAMBlock
1859      * must not get populated automatically. We don't have to track
1860      * modifications via userfaultfd WP reliably, because these pages will
1861      * not be part of the migration stream either way -- see
1862      * ramblock_dirty_bitmap_exclude_discarded_pages().
1863      *
1864      * Note: The result is only stable while migrating (precopy/postcopy).
1865      */
1866     if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
1867         RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
1868         MemoryRegionSection section = {
1869             .mr = rb->mr,
1870             .offset_within_region = 0,
1871             .size = rb->mr->size,
1872         };
1873
1874         ram_discard_manager_replay_populated(rdm, &section,
1875                                              populate_read_section, NULL);
1876     } else {
1877         populate_read_range(rb, 0, rb->used_length);
1878     }
1879 }
1880
1881 /*
1882  * ram_write_tracking_prepare: prepare for UFFD-WP memory tracking
1883  */
1884 void ram_write_tracking_prepare(void)
1885 {
1886     RAMBlock *block;
1887
1888     RCU_READ_LOCK_GUARD();
1889
1890     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1891         /* Nothing to do with read-only and MMIO-writable regions */
1892         if (block->mr->readonly || block->mr->rom_device) {
1893             continue;
1894         }
1895
1896         /*
1897          * Populate pages of the RAM block before enabling userfault_fd
1898          * write protection.
1899          *
1900          * This stage is required since ioctl(UFFDIO_WRITEPROTECT) with
1901          * UFFDIO_WRITEPROTECT_MODE_WP mode setting would silently skip
1902          * pages with pte_none() entries in page table.
1903          */
1904         ram_block_populate_read(block);
1905     }
1906 }
1907
1908 static inline int uffd_protect_section(MemoryRegionSection *section,
1909                                        void *opaque)
1910 {
1911     const hwaddr size = int128_get64(section->size);
1912     const hwaddr offset = section->offset_within_region;
1913     RAMBlock *rb = section->mr->ram_block;
1914     int uffd_fd = (uintptr_t)opaque;
1915
1916     return uffd_change_protection(uffd_fd, rb->host + offset, size, true,
1917                                   false);
1918 }
1919
1920 static int ram_block_uffd_protect(RAMBlock *rb, int uffd_fd)
1921 {
1922     assert(rb->flags & RAM_UF_WRITEPROTECT);
1923
1924     /* See ram_block_populate_read() */
1925     if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
1926         RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
1927         MemoryRegionSection section = {
1928             .mr = rb->mr,
1929             .offset_within_region = 0,
1930             .size = rb->mr->size,
1931         };
1932
1933         return ram_discard_manager_replay_populated(rdm, &section,
1934                                                     uffd_protect_section,
1935                                                     (void *)(uintptr_t)uffd_fd);
1936     }
1937     return uffd_change_protection(uffd_fd, rb->host,
1938                                   rb->used_length, true, false);
1939 }
1940
/*
 * ram_write_tracking_start: start UFFD-WP memory tracking
 *
 * Creates a UFFD file descriptor, stores it in ram_state->uffdio_fd, then
 * registers and write-protects every non-ignored RAM block that is neither
 * read-only nor a ROM device. Each block handled this way gets
 * RAM_UF_WRITEPROTECT set and an extra reference on its memory region.
 *
 * If any block fails, every block already flagged is unregistered, its
 * flag and reference are dropped and the descriptor is closed again.
 *
 * Returns 0 for success or negative value in case of error
 */
int ram_write_tracking_start(void)
{
    int uffd_fd;
    RAMState *rs = ram_state;
    RAMBlock *block;

    /* Open UFFD file descriptor */
    uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, true);
    if (uffd_fd < 0) {
        return uffd_fd;
    }
    rs->uffdio_fd = uffd_fd;

    RCU_READ_LOCK_GUARD();

    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        /* Nothing to do with read-only and MMIO-writable regions */
        if (block->mr->readonly || block->mr->rom_device) {
            continue;
        }

        /* Register block memory with UFFD to track writes */
        if (uffd_register_memory(rs->uffdio_fd, block->host,
                block->max_length, UFFDIO_REGISTER_MODE_WP, NULL)) {
            goto fail;
        }
        /*
         * Flag the block right after registration so that the failure path
         * below also cleans it up if protecting it fails.
         */
        block->flags |= RAM_UF_WRITEPROTECT;
        memory_region_ref(block->mr);

        /* Apply UFFD write protection to the block memory range */
        if (ram_block_uffd_protect(block, uffd_fd)) {
            goto fail;
        }

        trace_ram_write_tracking_ramblock_start(block->idstr, block->page_size,
                block->host, block->max_length);
    }

    return 0;

fail:
    error_report("ram_write_tracking_start() failed: restoring initial memory state");

    /* Undo registration for every block flagged so far */
    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        if ((block->flags & RAM_UF_WRITEPROTECT) == 0) {
            continue;
        }
        uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length);
        /* Cleanup flags and remove reference */
        block->flags &= ~RAM_UF_WRITEPROTECT;
        memory_region_unref(block->mr);
    }

    uffd_close_fd(uffd_fd);
    rs->uffdio_fd = -1;
    return -1;
}
2003
2004 /**
2005  * ram_write_tracking_stop: stop UFFD-WP memory tracking and remove protection
2006  */
2007 void ram_write_tracking_stop(void)
2008 {
2009     RAMState *rs = ram_state;
2010     RAMBlock *block;
2011
2012     RCU_READ_LOCK_GUARD();
2013
2014     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2015         if ((block->flags & RAM_UF_WRITEPROTECT) == 0) {
2016             continue;
2017         }
2018         uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length);
2019
2020         trace_ram_write_tracking_ramblock_stop(block->idstr, block->page_size,
2021                 block->host, block->max_length);
2022
2023         /* Cleanup flags and remove reference */
2024         block->flags &= ~RAM_UF_WRITEPROTECT;
2025         memory_region_unref(block->mr);
2026     }
2027
2028     /* Finally close UFFD file descriptor */
2029     uffd_close_fd(rs->uffdio_fd);
2030     rs->uffdio_fd = -1;
2031 }
2032
2033 #else
2034 /* No target OS support, stubs just fail or ignore */
2035
2036 static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
2037 {
2038     (void) rs;
2039     (void) offset;
2040
2041     return NULL;
2042 }
2043
2044 static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
2045         unsigned long start_page)
2046 {
2047     (void) rs;
2048     (void) pss;
2049     (void) start_page;
2050
2051     return 0;
2052 }
2053
/* Stub for targets without OS support: write tracking is never available. */
bool ram_write_tracking_available(void)
{
    const bool available = false;

    return available;
}
2058
/*
 * Stub for targets without OS support. The unconditional assertion marks a
 * call to this function as a programming error; the stub variant of
 * ram_write_tracking_available() returns false.
 */
bool ram_write_tracking_compatible(void)
{
    assert(0);
    return false;
}
2064
/*
 * Stub for targets without OS support. The unconditional assertion marks a
 * call to this function as a programming error; -1 is the nominal result.
 */
int ram_write_tracking_start(void)
{
    assert(0);
    return -1;
}
2070
/*
 * Stub for targets without OS support. The unconditional assertion marks a
 * call to this function as a programming error.
 */
void ram_write_tracking_stop(void)
{
    assert(0);
}
2075 #endif /* defined(__linux__) */
2076
2077 /**
2078  * get_queued_page: unqueue a page from the postcopy requests
2079  *
2080  * Skips pages that are already sent (!dirty)
2081  *
2082  * Returns true if a queued page is found
2083  *
2084  * @rs: current RAM state
2085  * @pss: data about the state of the current dirty page scan
2086  */
2087 static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
2088 {
2089     RAMBlock  *block;
2090     ram_addr_t offset;
2091     bool dirty;
2092
2093     do {
2094         block = unqueue_page(rs, &offset);
2095         /*
2096          * We're sending this page, and since it's postcopy nothing else
2097          * will dirty it, and we must make sure it doesn't get sent again
2098          * even if this queue request was received after the background
2099          * search already sent it.
2100          */
2101         if (block) {
2102             unsigned long page;
2103
2104             page = offset >> TARGET_PAGE_BITS;
2105             dirty = test_bit(page, block->bmap);
2106             if (!dirty) {
2107                 trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
2108                                                 page);
2109             } else {
2110                 trace_get_queued_page(block->idstr, (uint64_t)offset, page);
2111             }
2112         }
2113
2114     } while (block && !dirty);
2115
2116     if (!block) {
2117         /*
2118          * Poll write faults too if background snapshot is enabled; that's
2119          * when we have vcpus got blocked by the write protected pages.
2120          */
2121         block = poll_fault_page(rs, &offset);
2122     }
2123
2124     if (block) {
2125         /*
2126          * We want the background search to continue from the queued page
2127          * since the guest is likely to want other pages near to the page
2128          * it just requested.
2129          */
2130         pss->block = block;
2131         pss->page = offset >> TARGET_PAGE_BITS;
2132
2133         /*
2134          * This unqueued page would break the "one round" check, even is
2135          * really rare.
2136          */
2137         pss->complete_round = false;
2138     }
2139
2140     return !!block;
2141 }
2142
2143 /**
2144  * migration_page_queue_free: drop any remaining pages in the ram
2145  * request queue
2146  *
2147  * It should be empty at the end anyway, but in error cases there may
2148  * be some left.  in case that there is any page left, we drop it.
2149  *
2150  */
2151 static void migration_page_queue_free(RAMState *rs)
2152 {
2153     struct RAMSrcPageRequest *mspr, *next_mspr;
2154     /* This queue generally should be empty - but in the case of a failed
2155      * migration might have some droppings in.
2156      */
2157     RCU_READ_LOCK_GUARD();
2158     QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
2159         memory_region_unref(mspr->rb->mr);
2160         QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
2161         g_free(mspr);
2162     }
2163 }
2164
2165 /**
2166  * ram_save_queue_pages: queue the page for transmission
2167  *
2168  * A request from postcopy destination for example.
2169  *
2170  * Returns zero on success or negative on error
2171  *
2172  * @rbname: Name of the RAMBLock of the request. NULL means the
2173  *          same that last one.
2174  * @start: starting address from the start of the RAMBlock
2175  * @len: length (in bytes) to send
2176  */
2177 int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
2178 {
2179     RAMBlock *ramblock;
2180     RAMState *rs = ram_state;
2181
2182     stat64_add(&ram_counters.postcopy_requests, 1);
2183     RCU_READ_LOCK_GUARD();
2184
2185     if (!rbname) {
2186         /* Reuse last RAMBlock */
2187         ramblock = rs->last_req_rb;
2188
2189         if (!ramblock) {
2190             /*
2191              * Shouldn't happen, we can't reuse the last RAMBlock if
2192              * it's the 1st request.
2193              */
2194             error_report("ram_save_queue_pages no previous block");
2195             return -1;
2196         }
2197     } else {
2198         ramblock = qemu_ram_block_by_name(rbname);
2199
2200         if (!ramblock) {
2201             /* We shouldn't be asked for a non-existent RAMBlock */
2202             error_report("ram_save_queue_pages no block '%s'", rbname);
2203             return -1;
2204         }
2205         rs->last_req_rb = ramblock;
2206     }
2207     trace_ram_save_queue_pages(ramblock->idstr, start, len);
2208     if (!offset_in_ramblock(ramblock, start + len - 1)) {
2209         error_report("%s request overrun start=" RAM_ADDR_FMT " len="
2210                      RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
2211                      __func__, start, len, ramblock->used_length);
2212         return -1;
2213     }
2214
2215     /*
2216      * When with postcopy preempt, we send back the page directly in the
2217      * rp-return thread.
2218      */
2219     if (postcopy_preempt_active()) {
2220         ram_addr_t page_start = start >> TARGET_PAGE_BITS;
2221         size_t page_size = qemu_ram_pagesize(ramblock);
2222         PageSearchStatus *pss = &ram_state->pss[RAM_CHANNEL_POSTCOPY];
2223         int ret = 0;
2224
2225         qemu_mutex_lock(&rs->bitmap_mutex);
2226
2227         pss_init(pss, ramblock, page_start);
2228         /*
2229          * Always use the preempt channel, and make sure it's there.  It's
2230          * safe to access without lock, because when rp-thread is running
2231          * we should be the only one who operates on the qemufile
2232          */
2233         pss->pss_channel = migrate_get_current()->postcopy_qemufile_src;
2234         assert(pss->pss_channel);
2235
2236         /*
2237          * It must be either one or multiple of host page size.  Just
2238          * assert; if something wrong we're mostly split brain anyway.
2239          */
2240         assert(len % page_size == 0);
2241         while (len) {
2242             if (ram_save_host_page_urgent(pss)) {
2243                 error_report("%s: ram_save_host_page_urgent() failed: "
2244                              "ramblock=%s, start_addr=0x"RAM_ADDR_FMT,
2245                              __func__, ramblock->idstr, start);
2246                 ret = -1;
2247                 break;
2248             }
2249             /*
2250              * NOTE: after ram_save_host_page_urgent() succeeded, pss->page
2251              * will automatically be moved and point to the next host page
2252              * we're going to send, so no need to update here.
2253              *
2254              * Normally QEMU never sends >1 host page in requests, so
2255              * logically we don't even need that as the loop should only
2256              * run once, but just to be consistent.
2257              */
2258             len -= page_size;
2259         };
2260         qemu_mutex_unlock(&rs->bitmap_mutex);
2261
2262         return ret;
2263     }
2264
2265     struct RAMSrcPageRequest *new_entry =
2266         g_new0(struct RAMSrcPageRequest, 1);
2267     new_entry->rb = ramblock;
2268     new_entry->offset = start;
2269     new_entry->len = len;
2270
2271     memory_region_ref(ramblock->mr);
2272     qemu_mutex_lock(&rs->src_page_req_mutex);
2273     QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
2274     migration_make_urgent_request();
2275     qemu_mutex_unlock(&rs->src_page_req_mutex);
2276
2277     return 0;
2278 }
2279
2280 static bool save_page_use_compression(RAMState *rs)
2281 {
2282     if (!migrate_compress()) {
2283         return false;
2284     }
2285
2286     /*
2287      * If xbzrle is enabled (e.g., after first round of migration), stop
2288      * using the data compression. In theory, xbzrle can do better than
2289      * compression.
2290      */
2291     if (rs->xbzrle_enabled) {
2292         return false;
2293     }
2294
2295     return true;
2296 }
2297
2298 /*
2299  * try to compress the page before posting it out, return true if the page
2300  * has been properly handled by compression, otherwise needs other
2301  * paths to handle it
2302  */
2303 static bool save_compress_page(RAMState *rs, PageSearchStatus *pss,
2304                                RAMBlock *block, ram_addr_t offset)
2305 {
2306     if (!save_page_use_compression(rs)) {
2307         return false;
2308     }
2309
2310     /*
2311      * When starting the process of a new block, the first page of
2312      * the block should be sent out before other pages in the same
2313      * block, and all the pages in last block should have been sent
2314      * out, keeping this order is important, because the 'cont' flag
2315      * is used to avoid resending the block name.
2316      *
2317      * We post the fist page as normal page as compression will take
2318      * much CPU resource.
2319      */
2320     if (block != pss->last_sent_block) {
2321         flush_compressed_data(rs);
2322         return false;
2323     }
2324
2325     if (compress_page_with_multi_thread(block, offset) > 0) {
2326         return true;
2327     }
2328
2329     compression_counters.busy++;
2330     return false;
2331 }
2332
2333 /**
2334  * ram_save_target_page_legacy: save one target page
2335  *
2336  * Returns the number of pages written
2337  *
2338  * @rs: current RAM state
2339  * @pss: data about the page we want to send
2340  */
2341 static int ram_save_target_page_legacy(RAMState *rs, PageSearchStatus *pss)
2342 {
2343     RAMBlock *block = pss->block;
2344     ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
2345     int res;
2346
2347     if (control_save_page(pss, block, offset, &res)) {
2348         return res;
2349     }
2350
2351     if (save_compress_page(rs, pss, block, offset)) {
2352         return 1;
2353     }
2354
2355     res = save_zero_page(pss, pss->pss_channel, block, offset);
2356     if (res > 0) {
2357         /* Must let xbzrle know, otherwise a previous (now 0'd) cached
2358          * page would be stale
2359          */
2360         if (rs->xbzrle_enabled) {
2361             XBZRLE_cache_lock();
2362             xbzrle_cache_zero_page(rs, block->offset + offset);
2363             XBZRLE_cache_unlock();
2364         }
2365         return res;
2366     }
2367
2368     /*
2369      * Do not use multifd in postcopy as one whole host page should be
2370      * placed.  Meanwhile postcopy requires atomic update of pages, so even
2371      * if host page size == guest page size the dest guest during run may
2372      * still see partially copied pages which is data corruption.
2373      */
2374     if (migrate_multifd() && !migration_in_postcopy()) {
2375         return ram_save_multifd_page(pss->pss_channel, block, offset);
2376     }
2377
2378     return ram_save_page(rs, pss);
2379 }
2380
2381 /* Should be called before sending a host page */
2382 static void pss_host_page_prepare(PageSearchStatus *pss)
2383 {
2384     /* How many guest pages are there in one host page? */
2385     size_t guest_pfns = qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
2386
2387     pss->host_page_sending = true;
2388     if (guest_pfns <= 1) {
2389         /*
2390          * This covers both when guest psize == host psize, or when guest
2391          * has larger psize than the host (guest_pfns==0).
2392          *
2393          * For the latter, we always send one whole guest page per
2394          * iteration of the host page (example: an Alpha VM on x86 host
2395          * will have guest psize 8K while host psize 4K).
2396          */
2397         pss->host_page_start = pss->page;
2398         pss->host_page_end = pss->page + 1;
2399     } else {
2400         /*
2401          * The host page spans over multiple guest pages, we send them
2402          * within the same host page iteration.
2403          */
2404         pss->host_page_start = ROUND_DOWN(pss->page, guest_pfns);
2405         pss->host_page_end = ROUND_UP(pss->page + 1, guest_pfns);
2406     }
2407 }
2408
2409 /*
2410  * Whether the page pointed by PSS is within the host page being sent.
2411  * Must be called after a previous pss_host_page_prepare().
2412  */
2413 static bool pss_within_range(PageSearchStatus *pss)
2414 {
2415     ram_addr_t ram_addr;
2416
2417     assert(pss->host_page_sending);
2418
2419     /* Over host-page boundary? */
2420     if (pss->page >= pss->host_page_end) {
2421         return false;
2422     }
2423
2424     ram_addr = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
2425
2426     return offset_in_ramblock(pss->block, ram_addr);
2427 }
2428
2429 static void pss_host_page_finish(PageSearchStatus *pss)
2430 {
2431     pss->host_page_sending = false;
2432     /* This is not needed, but just to reset it */
2433     pss->host_page_start = pss->host_page_end = 0;
2434 }
2435
/*
 * Send an urgent host page specified by `pss'.  Need to be called with
 * bitmap_mutex held.
 *
 * Nothing is sent when the host page overlaps the one tracked by the
 * precopy channel's PSS (see pss_overlap()); that case also returns 0.
 *
 * Returns 0 if save host page succeeded, -1 otherwise.
 */
static int ram_save_host_page_urgent(PageSearchStatus *pss)
{
    bool page_dirty, sent = false;
    RAMState *rs = ram_state;
    int ret = 0;

    trace_postcopy_preempt_send_host_page(pss->block->idstr, pss->page);
    pss_host_page_prepare(pss);

    /*
     * If precopy is sending the same page, let it be done in precopy, or
     * we could send the same page in two channels and none of them will
     * receive the whole page.
     */
    if (pss_overlap(pss, &ram_state->pss[RAM_CHANNEL_PRECOPY])) {
        trace_postcopy_preempt_hit(pss->block->idstr,
                                   pss->page << TARGET_PAGE_BITS);
        return 0;
    }

    /* Walk the dirty target pages of this host page */
    do {
        page_dirty = migration_bitmap_clear_dirty(rs, pss->block, pss->page);

        if (page_dirty) {
            /* Be strict to return code; it must be 1, or what else? */
            if (migration_ops->ram_save_target_page(rs, pss) != 1) {
                error_report_once("%s: ram_save_target_page failed", __func__);
                ret = -1;
                goto out;
            }
            sent = true;
        }
        pss_find_next_dirty(pss);
    } while (pss_within_range(pss));
out:
    pss_host_page_finish(pss);
    /* For urgent requests, flush immediately if sent */
    if (sent) {
        qemu_fflush(pss->pss_channel);
    }
    return ret;
}
2484
/**
 * ram_save_host_page: save a whole host page
 *
 * Starting at pss->page send pages up to the end of the current host
 * page. It's valid for the initial page to point into the middle of
 * a host page in which case the remainder of the hostpage is sent.
 * Only dirty target pages are sent. Note that the host page size may
 * be a huge page for this block.
 *
 * The saving stops at the boundary of the used_length of the block
 * if the RAMBlock isn't a multiple of the host page size.
 *
 * The caller must be with ram_state.bitmap_mutex held to call this
 * function.  Note that this function can temporarily release the lock, but
 * when the function is returned it'll make sure the lock is still held.
 *
 * After the pages were sent, UFFD write protection is released for the
 * range between the initial page and the final pss->page, see
 * ram_save_release_protection().
 *
 * Returns the number of pages written or negative on error; 0 is returned
 * without sending anything when the block is an ignored one.
 *
 * @rs: current RAM state
 * @pss: data about the page we want to send
 */
static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss)
{
    bool page_dirty, preempt_active = postcopy_preempt_active();
    int tmppages, pages = 0;
    size_t pagesize_bits =
        qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
    /* Remember where we started; needed to release protection afterwards */
    unsigned long start_page = pss->page;
    int res;

    if (ramblock_is_ignored(pss->block)) {
        error_report("block %s should not be migrated !", pss->block->idstr);
        return 0;
    }

    /* Update host page boundary information */
    pss_host_page_prepare(pss);

    do {
        page_dirty = migration_bitmap_clear_dirty(rs, pss->block, pss->page);

        /* Check the pages is dirty and if it is send it */
        if (page_dirty) {
            /*
             * Properly yield the lock only in postcopy preempt mode
             * because both migration thread and rp-return thread can
             * operate on the bitmaps.
             */
            if (preempt_active) {
                qemu_mutex_unlock(&rs->bitmap_mutex);
            }
            tmppages = migration_ops->ram_save_target_page(rs, pss);
            if (tmppages >= 0) {
                pages += tmppages;
                /*
                 * Allow rate limiting to happen in the middle of huge pages if
                 * something is sent in the current iteration.
                 */
                if (pagesize_bits > 1 && tmppages > 0) {
                    migration_rate_limit();
                }
            }
            /* Re-take the lock before touching the bitmap again */
            if (preempt_active) {
                qemu_mutex_lock(&rs->bitmap_mutex);
            }
        } else {
            tmppages = 0;
        }

        /* A failed send aborts the host page; protection is not released */
        if (tmppages < 0) {
            pss_host_page_finish(pss);
            return tmppages;
        }

        pss_find_next_dirty(pss);
    } while (pss_within_range(pss));

    pss_host_page_finish(pss);

    res = ram_save_release_protection(rs, pss, start_page);
    return (res < 0 ? res : pages);
}
2567
/**
 * ram_find_and_save_block: finds a dirty page and sends it
 *
 * Called within an RCU critical section.
 *
 * Queued (requested) pages are served first; only when none is queued the
 * dirty bitmap is searched. The position reached by the search is stored
 * back into rs->last_seen_block / rs->last_page before returning.
 *
 * Returns the number of pages written where zero means no dirty pages,
 * or negative on error
 *
 * @rs: current RAM state
 *
 * On systems where host-page-size > target-page-size it will send all the
 * pages in a host page that are dirty.
 */
static int ram_find_and_save_block(RAMState *rs)
{
    PageSearchStatus *pss = &rs->pss[RAM_CHANNEL_PRECOPY];
    int pages = 0;

    /* No dirty page as there is zero RAM */
    if (!rs->ram_bytes_total) {
        return pages;
    }

    /*
     * Always keep last_seen_block/last_page valid during this procedure,
     * because find_dirty_block() relies on these values (e.g., we compare
     * last_seen_block with pss.block to see whether we searched all the
     * ramblocks) to detect the completion of migration.  Having NULL value
     * of last_seen_block can conditionally cause below loop to run forever.
     */
    if (!rs->last_seen_block) {
        rs->last_seen_block = QLIST_FIRST_RCU(&ram_list.blocks);
        rs->last_page = 0;
    }

    pss_init(pss, rs->last_seen_block, rs->last_page);

    /* Loop until something was sent, everything is clean, or an error */
    while (true){
        if (!get_queued_page(rs, pss)) {
            /* priority queue empty, so just search for something dirty */
            int res = find_dirty_block(rs, pss);
            if (res != PAGE_DIRTY_FOUND) {
                if (res == PAGE_ALL_CLEAN) {
                    break;
                } else if (res == PAGE_TRY_AGAIN) {
                    continue;
                } else if (res < 0) {
                    pages = res;
                    break;
                }
            }
        }
        pages = ram_save_host_page(rs, pss);
        /* Any non-zero result (pages sent or an error) ends the search */
        if (pages) {
            break;
        }
    }

    rs->last_seen_block = pss->block;
    rs->last_page = pss->page;

    return pages;
}
2631
2632 void acct_update_position(QEMUFile *f, size_t size, bool zero)
2633 {
2634     uint64_t pages = size / TARGET_PAGE_SIZE;
2635
2636     if (zero) {
2637         stat64_add(&ram_counters.zero_pages, pages);
2638     } else {
2639         stat64_add(&ram_counters.normal_pages, pages);
2640         ram_transferred_add(size);
2641         qemu_file_credit_transfer(f, size);
2642     }
2643 }
2644
2645 static uint64_t ram_bytes_total_with_ignored(void)
2646 {
2647     RAMBlock *block;
2648     uint64_t total = 0;
2649
2650     RCU_READ_LOCK_GUARD();
2651
2652     RAMBLOCK_FOREACH_MIGRATABLE(block) {
2653         total += block->used_length;
2654     }
2655     return total;
2656 }
2657
2658 uint64_t ram_bytes_total(void)
2659 {
2660     RAMBlock *block;
2661     uint64_t total = 0;
2662
2663     RCU_READ_LOCK_GUARD();
2664
2665     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2666         total += block->used_length;
2667     }
2668     return total;
2669 }
2670
/*
 * Allocate the XBZRLE decode buffer, one target page in size. It is
 * released again by xbzrle_load_cleanup().
 */
static void xbzrle_load_setup(void)
{
    XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE);
}
2675
2676 static void xbzrle_load_cleanup(void)
2677 {
2678     g_free(XBZRLE.decoded_buf);
2679     XBZRLE.decoded_buf = NULL;
2680 }
2681
2682 static void ram_state_cleanup(RAMState **rsp)
2683 {
2684     if (*rsp) {
2685         migration_page_queue_free(*rsp);
2686         qemu_mutex_destroy(&(*rsp)->bitmap_mutex);
2687         qemu_mutex_destroy(&(*rsp)->src_page_req_mutex);
2688         g_free(*rsp);
2689         *rsp = NULL;
2690     }
2691 }
2692
2693 static void xbzrle_cleanup(void)
2694 {
2695     XBZRLE_cache_lock();
2696     if (XBZRLE.cache) {
2697         cache_fini(XBZRLE.cache);
2698         g_free(XBZRLE.encoded_buf);
2699         g_free(XBZRLE.current_buf);
2700         g_free(XBZRLE.zero_target_page);
2701         XBZRLE.cache = NULL;
2702         XBZRLE.encoded_buf = NULL;
2703         XBZRLE.current_buf = NULL;
2704         XBZRLE.zero_target_page = NULL;
2705     }
2706     XBZRLE_cache_unlock();
2707 }
2708
2709 static void ram_save_cleanup(void *opaque)
2710 {
2711     RAMState **rsp = opaque;
2712     RAMBlock *block;
2713
2714     /* We don't use dirty log with background snapshots */
2715     if (!migrate_background_snapshot()) {
2716         /* caller have hold iothread lock or is in a bh, so there is
2717          * no writing race against the migration bitmap
2718          */
2719         if (global_dirty_tracking & GLOBAL_DIRTY_MIGRATION) {
2720             /*
2721              * do not stop dirty log without starting it, since
2722              * memory_global_dirty_log_stop will assert that
2723              * memory_global_dirty_log_start/stop used in pairs
2724              */
2725             memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION);
2726         }
2727     }
2728
2729     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2730         g_free(block->clear_bmap);
2731         block->clear_bmap = NULL;
2732         g_free(block->bmap);
2733         block->bmap = NULL;
2734     }
2735
2736     xbzrle_cleanup();
2737     compress_threads_save_cleanup();
2738     ram_state_cleanup(rsp);
2739     g_free(migration_ops);
2740     migration_ops = NULL;
2741 }
2742
2743 static void ram_state_reset(RAMState *rs)
2744 {
2745     int i;
2746
2747     for (i = 0; i < RAM_CHANNEL_MAX; i++) {
2748         rs->pss[i].last_sent_block = NULL;
2749     }
2750
2751     rs->last_seen_block = NULL;
2752     rs->last_page = 0;
2753     rs->last_version = ram_list.version;
2754     rs->xbzrle_enabled = false;
2755 }
2756
/*
 * Elapsed-time threshold, in milliseconds, that ram_save_iterate() compares
 * against to decide when to leave its send loop.
 */
#define MAX_WAIT 50 /* ms, half buffered_file limit */
2758
2759 /* **** functions for postcopy ***** */
2760
2761 void ram_postcopy_migrated_memory_release(MigrationState *ms)
2762 {
2763     struct RAMBlock *block;
2764
2765     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2766         unsigned long *bitmap = block->bmap;
2767         unsigned long range = block->used_length >> TARGET_PAGE_BITS;
2768         unsigned long run_start = find_next_zero_bit(bitmap, range, 0);
2769
2770         while (run_start < range) {
2771             unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
2772             ram_discard_range(block->idstr,
2773                               ((ram_addr_t)run_start) << TARGET_PAGE_BITS,
2774                               ((ram_addr_t)(run_end - run_start))
2775                                 << TARGET_PAGE_BITS);
2776             run_start = find_next_zero_bit(bitmap, range, run_end + 1);
2777         }
2778     }
2779 }
2780
2781 /**
2782  * postcopy_send_discard_bm_ram: discard a RAMBlock
2783  *
2784  * Callback from postcopy_each_ram_send_discard for each RAMBlock
2785  *
2786  * @ms: current migration state
2787  * @block: RAMBlock to discard
2788  */
2789 static void postcopy_send_discard_bm_ram(MigrationState *ms, RAMBlock *block)
2790 {
2791     unsigned long end = block->used_length >> TARGET_PAGE_BITS;
2792     unsigned long current;
2793     unsigned long *bitmap = block->bmap;
2794
2795     for (current = 0; current < end; ) {
2796         unsigned long one = find_next_bit(bitmap, end, current);
2797         unsigned long zero, discard_length;
2798
2799         if (one >= end) {
2800             break;
2801         }
2802
2803         zero = find_next_zero_bit(bitmap, end, one + 1);
2804
2805         if (zero >= end) {
2806             discard_length = end - one;
2807         } else {
2808             discard_length = zero - one;
2809         }
2810         postcopy_discard_send_range(ms, one, discard_length);
2811         current = one + discard_length;
2812     }
2813 }
2814
/*
 * Forward declaration: postcopy_each_ram_send_discard() below calls this
 * function before its definition appears in the file.
 */
static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block);
2816
2817 /**
2818  * postcopy_each_ram_send_discard: discard all RAMBlocks
2819  *
2820  * Utility for the outgoing postcopy code.
2821  *   Calls postcopy_send_discard_bm_ram for each RAMBlock
2822  *   passing it bitmap indexes and name.
2823  * (qemu_ram_foreach_block ends up passing unscaled lengths
2824  *  which would mean postcopy code would have to deal with target page)
2825  *
2826  * @ms: current migration state
2827  */
2828 static void postcopy_each_ram_send_discard(MigrationState *ms)
2829 {
2830     struct RAMBlock *block;
2831
2832     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2833         postcopy_discard_send_init(ms, block->idstr);
2834
2835         /*
2836          * Deal with TPS != HPS and huge pages.  It discard any partially sent
2837          * host-page size chunks, mark any partially dirty host-page size
2838          * chunks as all dirty.  In this case the host-page is the host-page
2839          * for the particular RAMBlock, i.e. it might be a huge page.
2840          */
2841         postcopy_chunk_hostpages_pass(ms, block);
2842
2843         /*
2844          * Postcopy sends chunks of bitmap over the wire, but it
2845          * just needs indexes at this point, avoids it having
2846          * target page specific code.
2847          */
2848         postcopy_send_discard_bm_ram(ms, block);
2849         postcopy_discard_send_finish(ms);
2850     }
2851 }
2852
2853 /**
2854  * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages
2855  *
2856  * Helper for postcopy_chunk_hostpages; it's called twice to
2857  * canonicalize the two bitmaps, that are similar, but one is
2858  * inverted.
2859  *
2860  * Postcopy requires that all target pages in a hostpage are dirty or
2861  * clean, not a mix.  This function canonicalizes the bitmaps.
2862  *
2863  * @ms: current migration state
2864  * @block: block that contains the page we want to canonicalize
2865  */
2866 static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block)
2867 {
2868     RAMState *rs = ram_state;
2869     unsigned long *bitmap = block->bmap;
2870     unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
2871     unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
2872     unsigned long run_start;
2873
2874     if (block->page_size == TARGET_PAGE_SIZE) {
2875         /* Easy case - TPS==HPS for a non-huge page RAMBlock */
2876         return;
2877     }
2878
2879     /* Find a dirty page */
2880     run_start = find_next_bit(bitmap, pages, 0);
2881
2882     while (run_start < pages) {
2883
2884         /*
2885          * If the start of this run of pages is in the middle of a host
2886          * page, then we need to fixup this host page.
2887          */
2888         if (QEMU_IS_ALIGNED(run_start, host_ratio)) {
2889             /* Find the end of this run */
2890             run_start = find_next_zero_bit(bitmap, pages, run_start + 1);
2891             /*
2892              * If the end isn't at the start of a host page, then the
2893              * run doesn't finish at the end of a host page
2894              * and we need to discard.
2895              */
2896         }
2897
2898         if (!QEMU_IS_ALIGNED(run_start, host_ratio)) {
2899             unsigned long page;
2900             unsigned long fixup_start_addr = QEMU_ALIGN_DOWN(run_start,
2901                                                              host_ratio);
2902             run_start = QEMU_ALIGN_UP(run_start, host_ratio);
2903
2904             /* Clean up the bitmap */
2905             for (page = fixup_start_addr;
2906                  page < fixup_start_addr + host_ratio; page++) {
2907                 /*
2908                  * Remark them as dirty, updating the count for any pages
2909                  * that weren't previously dirty.
2910                  */
2911                 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
2912             }
2913         }
2914
2915         /* Find the next dirty page for the next iteration */
2916         run_start = find_next_bit(bitmap, pages, run_start);
2917     }
2918 }
2919
2920 /**
2921  * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
2922  *
2923  * Transmit the set of pages to be discarded after precopy to the target
2924  * these are pages that:
2925  *     a) Have been previously transmitted but are now dirty again
2926  *     b) Pages that have never been transmitted, this ensures that
2927  *        any pages on the destination that have been mapped by background
2928  *        tasks get discarded (transparent huge pages is the specific concern)
2929  * Hopefully this is pretty sparse
2930  *
2931  * @ms: current migration state
2932  */
2933 void ram_postcopy_send_discard_bitmap(MigrationState *ms)
2934 {
2935     RAMState *rs = ram_state;
2936
2937     RCU_READ_LOCK_GUARD();
2938
2939     /* This should be our last sync, the src is now paused */
2940     migration_bitmap_sync(rs);
2941
2942     /* Easiest way to make sure we don't resume in the middle of a host-page */
2943     rs->pss[RAM_CHANNEL_PRECOPY].last_sent_block = NULL;
2944     rs->last_seen_block = NULL;
2945     rs->last_page = 0;
2946
2947     postcopy_each_ram_send_discard(ms);
2948
2949     trace_ram_postcopy_send_discard_bitmap();
2950 }
2951
2952 /**
2953  * ram_discard_range: discard dirtied pages at the beginning of postcopy
2954  *
2955  * Returns zero on success
2956  *
2957  * @rbname: name of the RAMBlock of the request. NULL means the
2958  *          same that last one.
2959  * @start: RAMBlock starting page
2960  * @length: RAMBlock size
2961  */
2962 int ram_discard_range(const char *rbname, uint64_t start, size_t length)
2963 {
2964     trace_ram_discard_range(rbname, start, length);
2965
2966     RCU_READ_LOCK_GUARD();
2967     RAMBlock *rb = qemu_ram_block_by_name(rbname);
2968
2969     if (!rb) {
2970         error_report("ram_discard_range: Failed to find block '%s'", rbname);
2971         return -1;
2972     }
2973
2974     /*
2975      * On source VM, we don't need to update the received bitmap since
2976      * we don't even have one.
2977      */
2978     if (rb->receivedmap) {
2979         bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(),
2980                      length >> qemu_target_page_bits());
2981     }
2982
2983     return ram_block_discard_range(rb, start, length);
2984 }
2985
2986 /*
2987  * For every allocation, we will try not to crash the VM if the
2988  * allocation failed.
2989  */
2990 static int xbzrle_init(void)
2991 {
2992     Error *local_err = NULL;
2993
2994     if (!migrate_xbzrle()) {
2995         return 0;
2996     }
2997
2998     XBZRLE_cache_lock();
2999
3000     XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE);
3001     if (!XBZRLE.zero_target_page) {
3002         error_report("%s: Error allocating zero page", __func__);
3003         goto err_out;
3004     }
3005
3006     XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(),
3007                               TARGET_PAGE_SIZE, &local_err);
3008     if (!XBZRLE.cache) {
3009         error_report_err(local_err);
3010         goto free_zero_page;
3011     }
3012
3013     XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
3014     if (!XBZRLE.encoded_buf) {
3015         error_report("%s: Error allocating encoded_buf", __func__);
3016         goto free_cache;
3017     }
3018
3019     XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
3020     if (!XBZRLE.current_buf) {
3021         error_report("%s: Error allocating current_buf", __func__);
3022         goto free_encoded_buf;
3023     }
3024
3025     /* We are all good */
3026     XBZRLE_cache_unlock();
3027     return 0;
3028
3029 free_encoded_buf:
3030     g_free(XBZRLE.encoded_buf);
3031     XBZRLE.encoded_buf = NULL;
3032 free_cache:
3033     cache_fini(XBZRLE.cache);
3034     XBZRLE.cache = NULL;
3035 free_zero_page:
3036     g_free(XBZRLE.zero_target_page);
3037     XBZRLE.zero_target_page = NULL;
3038 err_out:
3039     XBZRLE_cache_unlock();
3040     return -ENOMEM;
3041 }
3042
3043 static int ram_state_init(RAMState **rsp)
3044 {
3045     *rsp = g_try_new0(RAMState, 1);
3046
3047     if (!*rsp) {
3048         error_report("%s: Init ramstate fail", __func__);
3049         return -1;
3050     }
3051
3052     qemu_mutex_init(&(*rsp)->bitmap_mutex);
3053     qemu_mutex_init(&(*rsp)->src_page_req_mutex);
3054     QSIMPLEQ_INIT(&(*rsp)->src_page_requests);
3055     (*rsp)->ram_bytes_total = ram_bytes_total();
3056
3057     /*
3058      * Count the total number of pages used by ram blocks not including any
3059      * gaps due to alignment or unplugs.
3060      * This must match with the initial values of dirty bitmap.
3061      */
3062     (*rsp)->migration_dirty_pages = (*rsp)->ram_bytes_total >> TARGET_PAGE_BITS;
3063     ram_state_reset(*rsp);
3064
3065     return 0;
3066 }
3067
3068 static void ram_list_init_bitmaps(void)
3069 {
3070     MigrationState *ms = migrate_get_current();
3071     RAMBlock *block;
3072     unsigned long pages;
3073     uint8_t shift;
3074
3075     /* Skip setting bitmap if there is no RAM */
3076     if (ram_bytes_total()) {
3077         shift = ms->clear_bitmap_shift;
3078         if (shift > CLEAR_BITMAP_SHIFT_MAX) {
3079             error_report("clear_bitmap_shift (%u) too big, using "
3080                          "max value (%u)", shift, CLEAR_BITMAP_SHIFT_MAX);
3081             shift = CLEAR_BITMAP_SHIFT_MAX;
3082         } else if (shift < CLEAR_BITMAP_SHIFT_MIN) {
3083             error_report("clear_bitmap_shift (%u) too small, using "
3084                          "min value (%u)", shift, CLEAR_BITMAP_SHIFT_MIN);
3085             shift = CLEAR_BITMAP_SHIFT_MIN;
3086         }
3087
3088         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3089             pages = block->max_length >> TARGET_PAGE_BITS;
3090             /*
3091              * The initial dirty bitmap for migration must be set with all
3092              * ones to make sure we'll migrate every guest RAM page to
3093              * destination.
3094              * Here we set RAMBlock.bmap all to 1 because when rebegin a
3095              * new migration after a failed migration, ram_list.
3096              * dirty_memory[DIRTY_MEMORY_MIGRATION] don't include the whole
3097              * guest memory.
3098              */
3099             block->bmap = bitmap_new(pages);
3100             bitmap_set(block->bmap, 0, pages);
3101             block->clear_bmap_shift = shift;
3102             block->clear_bmap = bitmap_new(clear_bmap_size(pages, shift));
3103         }
3104     }
3105 }
3106
3107 static void migration_bitmap_clear_discarded_pages(RAMState *rs)
3108 {
3109     unsigned long pages;
3110     RAMBlock *rb;
3111
3112     RCU_READ_LOCK_GUARD();
3113
3114     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3115             pages = ramblock_dirty_bitmap_clear_discarded_pages(rb);
3116             rs->migration_dirty_pages -= pages;
3117     }
3118 }
3119
3120 static void ram_init_bitmaps(RAMState *rs)
3121 {
3122     /* For memory_global_dirty_log_start below.  */
3123     qemu_mutex_lock_iothread();
3124     qemu_mutex_lock_ramlist();
3125
3126     WITH_RCU_READ_LOCK_GUARD() {
3127         ram_list_init_bitmaps();
3128         /* We don't use dirty log with background snapshots */
3129         if (!migrate_background_snapshot()) {
3130             memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION);
3131             migration_bitmap_sync_precopy(rs);
3132         }
3133     }
3134     qemu_mutex_unlock_ramlist();
3135     qemu_mutex_unlock_iothread();
3136
3137     /*
3138      * After an eventual first bitmap sync, fixup the initial bitmap
3139      * containing all 1s to exclude any discarded pages from migration.
3140      */
3141     migration_bitmap_clear_discarded_pages(rs);
3142 }
3143
3144 static int ram_init_all(RAMState **rsp)
3145 {
3146     if (ram_state_init(rsp)) {
3147         return -1;
3148     }
3149
3150     if (xbzrle_init()) {
3151         ram_state_cleanup(rsp);
3152         return -1;
3153     }
3154
3155     ram_init_bitmaps(*rsp);
3156
3157     return 0;
3158 }
3159
3160 static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out)
3161 {
3162     RAMBlock *block;
3163     uint64_t pages = 0;
3164
3165     /*
3166      * Postcopy is not using xbzrle/compression, so no need for that.
3167      * Also, since source are already halted, we don't need to care
3168      * about dirty page logging as well.
3169      */
3170
3171     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3172         pages += bitmap_count_one(block->bmap,
3173                                   block->used_length >> TARGET_PAGE_BITS);
3174     }
3175
3176     /* This may not be aligned with current bitmaps. Recalculate. */
3177     rs->migration_dirty_pages = pages;
3178
3179     ram_state_reset(rs);
3180
3181     /* Update RAMState cache of output QEMUFile */
3182     rs->pss[RAM_CHANNEL_PRECOPY].pss_channel = out;
3183
3184     trace_ram_state_resume_prepare(pages);
3185 }
3186
3187 /*
3188  * This function clears bits of the free pages reported by the caller from the
3189  * migration dirty bitmap. @addr is the host address corresponding to the
3190  * start of the continuous guest free pages, and @len is the total bytes of
3191  * those pages.
3192  */
3193 void qemu_guest_free_page_hint(void *addr, size_t len)
3194 {
3195     RAMBlock *block;
3196     ram_addr_t offset;
3197     size_t used_len, start, npages;
3198     MigrationState *s = migrate_get_current();
3199
3200     /* This function is currently expected to be used during live migration */
3201     if (!migration_is_setup_or_active(s->state)) {
3202         return;
3203     }
3204
3205     for (; len > 0; len -= used_len, addr += used_len) {
3206         block = qemu_ram_block_from_host(addr, false, &offset);
3207         if (unlikely(!block || offset >= block->used_length)) {
3208             /*
3209              * The implementation might not support RAMBlock resize during
3210              * live migration, but it could happen in theory with future
3211              * updates. So we add a check here to capture that case.
3212              */
3213             error_report_once("%s unexpected error", __func__);
3214             return;
3215         }
3216
3217         if (len <= block->used_length - offset) {
3218             used_len = len;
3219         } else {
3220             used_len = block->used_length - offset;
3221         }
3222
3223         start = offset >> TARGET_PAGE_BITS;
3224         npages = used_len >> TARGET_PAGE_BITS;
3225
3226         qemu_mutex_lock(&ram_state->bitmap_mutex);
3227         /*
3228          * The skipped free pages are equavalent to be sent from clear_bmap's
3229          * perspective, so clear the bits from the memory region bitmap which
3230          * are initially set. Otherwise those skipped pages will be sent in
3231          * the next round after syncing from the memory region bitmap.
3232          */
3233         migration_clear_memory_region_dirty_bitmap_range(block, start, npages);
3234         ram_state->migration_dirty_pages -=
3235                       bitmap_count_one_with_offset(block->bmap, start, npages);
3236         bitmap_clear(block->bmap, start, npages);
3237         qemu_mutex_unlock(&ram_state->bitmap_mutex);
3238     }
3239 }
3240
/*
 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has a
 * long-running RCU critical section.  When RCU reclaims in the code
 * start to become numerous it will be necessary to reduce the
 * granularity of these critical sections.
 */
3247
3248 /**
3249  * ram_save_setup: Setup RAM for migration
3250  *
3251  * Returns zero to indicate success and negative for error
3252  *
3253  * @f: QEMUFile where to send the data
3254  * @opaque: RAMState pointer
3255  */
3256 static int ram_save_setup(QEMUFile *f, void *opaque)
3257 {
3258     RAMState **rsp = opaque;
3259     RAMBlock *block;
3260     int ret;
3261
3262     if (compress_threads_save_setup()) {
3263         return -1;
3264     }
3265
3266     /* migration has already setup the bitmap, reuse it. */
3267     if (!migration_in_colo_state()) {
3268         if (ram_init_all(rsp) != 0) {
3269             compress_threads_save_cleanup();
3270             return -1;
3271         }
3272     }
3273     (*rsp)->pss[RAM_CHANNEL_PRECOPY].pss_channel = f;
3274
3275     WITH_RCU_READ_LOCK_GUARD() {
3276         qemu_put_be64(f, ram_bytes_total_with_ignored()
3277                          | RAM_SAVE_FLAG_MEM_SIZE);
3278
3279         RAMBLOCK_FOREACH_MIGRATABLE(block) {
3280             qemu_put_byte(f, strlen(block->idstr));
3281             qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
3282             qemu_put_be64(f, block->used_length);
3283             if (migrate_postcopy_ram() && block->page_size !=
3284                                           qemu_host_page_size) {
3285                 qemu_put_be64(f, block->page_size);
3286             }
3287             if (migrate_ignore_shared()) {
3288                 qemu_put_be64(f, block->mr->addr);
3289             }
3290         }
3291     }
3292
3293     ram_control_before_iterate(f, RAM_CONTROL_SETUP);
3294     ram_control_after_iterate(f, RAM_CONTROL_SETUP);
3295
3296     migration_ops = g_malloc0(sizeof(MigrationOps));
3297     migration_ops->ram_save_target_page = ram_save_target_page_legacy;
3298     ret = multifd_send_sync_main(f);
3299     if (ret < 0) {
3300         return ret;
3301     }
3302
3303     if (!migrate_multifd_flush_after_each_section()) {
3304         qemu_put_be64(f, RAM_SAVE_FLAG_MULTIFD_FLUSH);
3305     }
3306
3307     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3308     qemu_fflush(f);
3309
3310     return 0;
3311 }
3312
/**
 * ram_save_iterate: iterative stage for migration
 *
 * Sends dirty pages with ram_find_and_save_block() until the rate limit
 * is hit (and no postcopy request is pending), the file reports an error,
 * no page is left, or more than MAX_WAIT milliseconds have elapsed.  An
 * end-of-section marker is written afterwards while the migration is
 * still in the setup or active state.
 *
 * Returns 1 when ram_find_and_save_block() reported that there are no
 * more pages to send, zero when the round ended for any other reason, and
 * negative for error
 *
 * @f: QEMUFile where to send the data
 * @opaque: pointer to the RAMState pointer
 */
static int ram_save_iterate(QEMUFile *f, void *opaque)
{
    RAMState **temp = opaque;
    RAMState *rs = *temp;
    int ret = 0;
    int i;
    int64_t t0;
    int done = 0;

    if (blk_mig_bulk_active()) {
        /* Avoid transferring ram during bulk phase of block migration as
         * the bulk phase will usually take a long time and transferring
         * ram updates during that time is pointless. */
        goto out;
    }

    /*
     * We'll take this lock a little bit long, but it's okay for two reasons.
     * Firstly, the only possible other thread to take it is who calls
     * qemu_guest_free_page_hint(), which should be rare; secondly, see
     * MAX_WAIT (if curious, further see commit 4508bd9ed8053ce) below, which
     * guarantees that we'll at least released it in a regular basis.
     */
    qemu_mutex_lock(&rs->bitmap_mutex);
    WITH_RCU_READ_LOCK_GUARD() {
        /* The RAM list changed since the last round: restart the search */
        if (ram_list.version != rs->last_version) {
            ram_state_reset(rs);
        }

        /* Read version before ram_list.blocks */
        smp_rmb();

        ram_control_before_iterate(f, RAM_CONTROL_ROUND);

        t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
        i = 0;
        /*
         * Keep sending while the rate limit allows it; a pending postcopy
         * request keeps the loop going even when rate limited.
         */
        while ((ret = qemu_file_rate_limit(f)) == 0 ||
               postcopy_has_request(rs)) {
            int pages;

            if (qemu_file_get_error(f)) {
                break;
            }

            pages = ram_find_and_save_block(rs);
            /* no more pages to send */
            if (pages == 0) {
                done = 1;
                break;
            }

            /* Record the failure on the file; it is read back after "out" */
            if (pages < 0) {
                qemu_file_set_error(f, pages);
                break;
            }

            rs->target_page_count += pages;

            /*
             * During postcopy, it is necessary to make sure one whole host
             * page is sent in one chunk.
             */
            if (migrate_postcopy_ram()) {
                flush_compressed_data(rs);
            }

            /*
             * we want to check in the 1st loop, just in case it was the 1st
             * time and we had to sync the dirty bitmap.
             * qemu_clock_get_ns() is a bit expensive, so we only check each
             * some iterations
             */
            if ((i & 63) == 0) {
                /* Elapsed time since t0, converted from ns to ms */
                uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) /
                              1000000;
                if (t1 > MAX_WAIT) {
                    trace_ram_save_iterate_big_wait(t1, i);
                    break;
                }
            }
            i++;
        }
    }
    qemu_mutex_unlock(&rs->bitmap_mutex);

    /*
     * Must occur before EOS (or any QEMUFile operation)
     * because of RDMA protocol.
     */
    ram_control_after_iterate(f, RAM_CONTROL_ROUND);

out:
    /*
     * ret is the last qemu_file_rate_limit() result here (or still 0 when
     * the bulk-phase shortcut was taken), so a positive "rate limited"
     * value also takes this branch.
     */
    if (ret >= 0
        && migration_is_setup_or_active(migrate_get_current()->state)) {
        if (migrate_multifd_flush_after_each_section()) {
            ret = multifd_send_sync_main(rs->pss[RAM_CHANNEL_PRECOPY].pss_channel);
            if (ret < 0) {
                return ret;
            }
        }

        qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
        qemu_fflush(f);
        /* Account for the 8-byte EOS marker written just above */
        ram_transferred_add(8);

        ret = qemu_file_get_error(f);
    }
    if (ret < 0) {
        return ret;
    }

    return done;
}
3434
3435 /**
3436  * ram_save_complete: function called to send the remaining amount of ram
3437  *
3438  * Returns zero to indicate success or negative on error
3439  *
3440  * Called with iothread lock
3441  *
3442  * @f: QEMUFile where to send the data
3443  * @opaque: RAMState pointer
3444  */
3445 static int ram_save_complete(QEMUFile *f, void *opaque)
3446 {
3447     RAMState **temp = opaque;
3448     RAMState *rs = *temp;
3449     int ret = 0;
3450
3451     rs->last_stage = !migration_in_colo_state();
3452
3453     WITH_RCU_READ_LOCK_GUARD() {
3454         if (!migration_in_postcopy()) {
3455             migration_bitmap_sync_precopy(rs);
3456         }
3457
3458         ram_control_before_iterate(f, RAM_CONTROL_FINISH);
3459
3460         /* try transferring iterative blocks of memory */
3461
3462         /* flush all remaining blocks regardless of rate limiting */
3463         qemu_mutex_lock(&rs->bitmap_mutex);
3464         while (true) {
3465             int pages;
3466
3467             pages = ram_find_and_save_block(rs);
3468             /* no more blocks to sent */
3469             if (pages == 0) {
3470                 break;
3471             }
3472             if (pages < 0) {
3473                 ret = pages;
3474                 break;
3475             }
3476         }
3477         qemu_mutex_unlock(&rs->bitmap_mutex);
3478
3479         flush_compressed_data(rs);
3480         ram_control_after_iterate(f, RAM_CONTROL_FINISH);
3481     }
3482
3483     if (ret < 0) {
3484         return ret;
3485     }
3486
3487     ret = multifd_send_sync_main(rs->pss[RAM_CHANNEL_PRECOPY].pss_channel);
3488     if (ret < 0) {
3489         return ret;
3490     }
3491
3492     if (!migrate_multifd_flush_after_each_section()) {
3493         qemu_put_be64(f, RAM_SAVE_FLAG_MULTIFD_FLUSH);
3494     }
3495     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3496     qemu_fflush(f);
3497
3498     return 0;
3499 }
3500
3501 static void ram_state_pending_estimate(void *opaque, uint64_t *must_precopy,
3502                                        uint64_t *can_postcopy)
3503 {
3504     RAMState **temp = opaque;
3505     RAMState *rs = *temp;
3506
3507     uint64_t remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3508
3509     if (migrate_postcopy_ram()) {
3510         /* We can do postcopy, and all the data is postcopiable */
3511         *can_postcopy += remaining_size;
3512     } else {
3513         *must_precopy += remaining_size;
3514     }
3515 }
3516
3517 static void ram_state_pending_exact(void *opaque, uint64_t *must_precopy,
3518                                     uint64_t *can_postcopy)
3519 {
3520     MigrationState *s = migrate_get_current();
3521     RAMState **temp = opaque;
3522     RAMState *rs = *temp;
3523
3524     uint64_t remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3525
3526     if (!migration_in_postcopy() && remaining_size < s->threshold_size) {
3527         qemu_mutex_lock_iothread();
3528         WITH_RCU_READ_LOCK_GUARD() {
3529             migration_bitmap_sync_precopy(rs);
3530         }
3531         qemu_mutex_unlock_iothread();
3532         remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3533     }
3534
3535     if (migrate_postcopy_ram()) {
3536         /* We can do postcopy, and all the data is postcopiable */
3537         *can_postcopy += remaining_size;
3538     } else {
3539         *must_precopy += remaining_size;
3540     }
3541 }
3542
3543 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
3544 {
3545     unsigned int xh_len;
3546     int xh_flags;
3547     uint8_t *loaded_data;
3548
3549     /* extract RLE header */
3550     xh_flags = qemu_get_byte(f);
3551     xh_len = qemu_get_be16(f);
3552
3553     if (xh_flags != ENCODING_FLAG_XBZRLE) {
3554         error_report("Failed to load XBZRLE page - wrong compression!");
3555         return -1;
3556     }
3557
3558     if (xh_len > TARGET_PAGE_SIZE) {
3559         error_report("Failed to load XBZRLE page - len overflow!");
3560         return -1;
3561     }
3562     loaded_data = XBZRLE.decoded_buf;
3563     /* load data and decode */
3564     /* it can change loaded_data to point to an internal buffer */
3565     qemu_get_buffer_in_place(f, &loaded_data, xh_len);
3566
3567     /* decode RLE */
3568     if (xbzrle_decode_buffer(loaded_data, xh_len, host,
3569                              TARGET_PAGE_SIZE) == -1) {
3570         error_report("Failed to load XBZRLE page - decode error!");
3571         return -1;
3572     }
3573
3574     return 0;
3575 }
3576
3577 /**
3578  * ram_block_from_stream: read a RAMBlock id from the migration stream
3579  *
3580  * Must be called from within a rcu critical section.
3581  *
3582  * Returns a pointer from within the RCU-protected ram_list.
3583  *
3584  * @mis: the migration incoming state pointer
3585  * @f: QEMUFile where to read the data from
3586  * @flags: Page flags (mostly to see if it's a continuation of previous block)
3587  * @channel: the channel we're using
3588  */
3589 static inline RAMBlock *ram_block_from_stream(MigrationIncomingState *mis,
3590                                               QEMUFile *f, int flags,
3591                                               int channel)
3592 {
3593     RAMBlock *block = mis->last_recv_block[channel];
3594     char id[256];
3595     uint8_t len;
3596
3597     if (flags & RAM_SAVE_FLAG_CONTINUE) {
3598         if (!block) {
3599             error_report("Ack, bad migration stream!");
3600             return NULL;
3601         }
3602         return block;
3603     }
3604
3605     len = qemu_get_byte(f);
3606     qemu_get_buffer(f, (uint8_t *)id, len);
3607     id[len] = 0;
3608
3609     block = qemu_ram_block_by_name(id);
3610     if (!block) {
3611         error_report("Can't find block %s", id);
3612         return NULL;
3613     }
3614
3615     if (ramblock_is_ignored(block)) {
3616         error_report("block %s should not be migrated !", id);
3617         return NULL;
3618     }
3619
3620     mis->last_recv_block[channel] = block;
3621
3622     return block;
3623 }
3624
3625 static inline void *host_from_ram_block_offset(RAMBlock *block,
3626                                                ram_addr_t offset)
3627 {
3628     if (!offset_in_ramblock(block, offset)) {
3629         return NULL;
3630     }
3631
3632     return block->host + offset;
3633 }
3634
3635 static void *host_page_from_ram_block_offset(RAMBlock *block,
3636                                              ram_addr_t offset)
3637 {
3638     /* Note: Explicitly no check against offset_in_ramblock(). */
3639     return (void *)QEMU_ALIGN_DOWN((uintptr_t)(block->host + offset),
3640                                    block->page_size);
3641 }
3642
3643 static ram_addr_t host_page_offset_from_ram_block_offset(RAMBlock *block,
3644                                                          ram_addr_t offset)
3645 {
3646     return ((uintptr_t)block->host + offset) & (block->page_size - 1);
3647 }
3648
3649 static inline void *colo_cache_from_block_offset(RAMBlock *block,
3650                              ram_addr_t offset, bool record_bitmap)
3651 {
3652     if (!offset_in_ramblock(block, offset)) {
3653         return NULL;
3654     }
3655     if (!block->colo_cache) {
3656         error_report("%s: colo_cache is NULL in block :%s",
3657                      __func__, block->idstr);
3658         return NULL;
3659     }
3660
3661     /*
3662     * During colo checkpoint, we need bitmap of these migrated pages.
3663     * It help us to decide which pages in ram cache should be flushed
3664     * into VM's RAM later.
3665     */
3666     if (record_bitmap &&
3667         !test_and_set_bit(offset >> TARGET_PAGE_BITS, block->bmap)) {
3668         ram_state->migration_dirty_pages++;
3669     }
3670     return block->colo_cache + offset;
3671 }
3672
/**
 * ram_handle_compressed: handle the zero page case
 *
 * If a page (or a whole RDMA chunk) has been
 * determined to be zero, then zap it.
 *
 * The fill is skipped only when @ch is zero and the range already reads
 * back as all zeroes; in every other case the range is overwritten.
 *
 * @host: host address for the zero page
 * @ch: what the page is filled from.  We only support zero
 * @size: size of the zero page
 */
void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
{
    if (ch == 0 && buffer_is_zero(host, size)) {
        /* Nothing to do: the destination already holds zeroes. */
        return;
    }

    memset(host, ch, size);
}
3689
3690 /* return the size after decompression, or negative value on error */
3691 static int
3692 qemu_uncompress_data(z_stream *stream, uint8_t *dest, size_t dest_len,
3693                      const uint8_t *source, size_t source_len)
3694 {
3695     int err;
3696
3697     err = inflateReset(stream);
3698     if (err != Z_OK) {
3699         return -1;
3700     }
3701
3702     stream->avail_in = source_len;
3703     stream->next_in = (uint8_t *)source;
3704     stream->avail_out = dest_len;
3705     stream->next_out = dest;
3706
3707     err = inflate(stream, Z_NO_FLUSH);
3708     if (err != Z_STREAM_END) {
3709         return -1;
3710     }
3711
3712     return stream->total_out;
3713 }
3714
3715 static void *do_data_decompress(void *opaque)
3716 {
3717     DecompressParam *param = opaque;
3718     unsigned long pagesize;
3719     uint8_t *des;
3720     int len, ret;
3721
3722     qemu_mutex_lock(&param->mutex);
3723     while (!param->quit) {
3724         if (param->des) {
3725             des = param->des;
3726             len = param->len;
3727             param->des = 0;
3728             qemu_mutex_unlock(&param->mutex);
3729
3730             pagesize = TARGET_PAGE_SIZE;
3731
3732             ret = qemu_uncompress_data(&param->stream, des, pagesize,
3733                                        param->compbuf, len);
3734             if (ret < 0 && migrate_get_current()->decompress_error_check) {
3735                 error_report("decompress data failed");
3736                 qemu_file_set_error(decomp_file, ret);
3737             }
3738
3739             qemu_mutex_lock(&decomp_done_lock);
3740             param->done = true;
3741             qemu_cond_signal(&decomp_done_cond);
3742             qemu_mutex_unlock(&decomp_done_lock);
3743
3744             qemu_mutex_lock(&param->mutex);
3745         } else {
3746             qemu_cond_wait(&param->cond, &param->mutex);
3747         }
3748     }
3749     qemu_mutex_unlock(&param->mutex);
3750
3751     return NULL;
3752 }
3753
3754 static int wait_for_decompress_done(void)
3755 {
3756     int idx, thread_count;
3757
3758     if (!migrate_compress()) {
3759         return 0;
3760     }
3761
3762     thread_count = migrate_decompress_threads();
3763     qemu_mutex_lock(&decomp_done_lock);
3764     for (idx = 0; idx < thread_count; idx++) {
3765         while (!decomp_param[idx].done) {
3766             qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3767         }
3768     }
3769     qemu_mutex_unlock(&decomp_done_lock);
3770     return qemu_file_get_error(decomp_file);
3771 }
3772
3773 static void compress_threads_load_cleanup(void)
3774 {
3775     int i, thread_count;
3776
3777     if (!migrate_compress()) {
3778         return;
3779     }
3780     thread_count = migrate_decompress_threads();
3781     for (i = 0; i < thread_count; i++) {
3782         /*
3783          * we use it as a indicator which shows if the thread is
3784          * properly init'd or not
3785          */
3786         if (!decomp_param[i].compbuf) {
3787             break;
3788         }
3789
3790         qemu_mutex_lock(&decomp_param[i].mutex);
3791         decomp_param[i].quit = true;
3792         qemu_cond_signal(&decomp_param[i].cond);
3793         qemu_mutex_unlock(&decomp_param[i].mutex);
3794     }
3795     for (i = 0; i < thread_count; i++) {
3796         if (!decomp_param[i].compbuf) {
3797             break;
3798         }
3799
3800         qemu_thread_join(decompress_threads + i);
3801         qemu_mutex_destroy(&decomp_param[i].mutex);
3802         qemu_cond_destroy(&decomp_param[i].cond);
3803         inflateEnd(&decomp_param[i].stream);
3804         g_free(decomp_param[i].compbuf);
3805         decomp_param[i].compbuf = NULL;
3806     }
3807     g_free(decompress_threads);
3808     g_free(decomp_param);
3809     decompress_threads = NULL;
3810     decomp_param = NULL;
3811     decomp_file = NULL;
3812 }
3813
3814 static int compress_threads_load_setup(QEMUFile *f)
3815 {
3816     int i, thread_count;
3817
3818     if (!migrate_compress()) {
3819         return 0;
3820     }
3821
3822     thread_count = migrate_decompress_threads();
3823     decompress_threads = g_new0(QemuThread, thread_count);
3824     decomp_param = g_new0(DecompressParam, thread_count);
3825     qemu_mutex_init(&decomp_done_lock);
3826     qemu_cond_init(&decomp_done_cond);
3827     decomp_file = f;
3828     for (i = 0; i < thread_count; i++) {
3829         if (inflateInit(&decomp_param[i].stream) != Z_OK) {
3830             goto exit;
3831         }
3832
3833         decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
3834         qemu_mutex_init(&decomp_param[i].mutex);
3835         qemu_cond_init(&decomp_param[i].cond);
3836         decomp_param[i].done = true;
3837         decomp_param[i].quit = false;
3838         qemu_thread_create(decompress_threads + i, "decompress",
3839                            do_data_decompress, decomp_param + i,
3840                            QEMU_THREAD_JOINABLE);
3841     }
3842     return 0;
3843 exit:
3844     compress_threads_load_cleanup();
3845     return -1;
3846 }
3847
3848 static void decompress_data_with_multi_threads(QEMUFile *f,
3849                                                void *host, int len)
3850 {
3851     int idx, thread_count;
3852
3853     thread_count = migrate_decompress_threads();
3854     QEMU_LOCK_GUARD(&decomp_done_lock);
3855     while (true) {
3856         for (idx = 0; idx < thread_count; idx++) {
3857             if (decomp_param[idx].done) {
3858                 decomp_param[idx].done = false;
3859                 qemu_mutex_lock(&decomp_param[idx].mutex);
3860                 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
3861                 decomp_param[idx].des = host;
3862                 decomp_param[idx].len = len;
3863                 qemu_cond_signal(&decomp_param[idx].cond);
3864                 qemu_mutex_unlock(&decomp_param[idx].mutex);
3865                 break;
3866             }
3867         }
3868         if (idx < thread_count) {
3869             break;
3870         } else {
3871             qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3872         }
3873     }
3874 }
3875
3876 static void colo_init_ram_state(void)
3877 {
3878     ram_state_init(&ram_state);
3879 }
3880
3881 /*
3882  * colo cache: this is for secondary VM, we cache the whole
3883  * memory of the secondary VM, it is need to hold the global lock
3884  * to call this helper.
3885  */
3886 int colo_init_ram_cache(void)
3887 {
3888     RAMBlock *block;
3889
3890     WITH_RCU_READ_LOCK_GUARD() {
3891         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3892             block->colo_cache = qemu_anon_ram_alloc(block->used_length,
3893                                                     NULL, false, false);
3894             if (!block->colo_cache) {
3895                 error_report("%s: Can't alloc memory for COLO cache of block %s,"
3896                              "size 0x" RAM_ADDR_FMT, __func__, block->idstr,
3897                              block->used_length);
3898                 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3899                     if (block->colo_cache) {
3900                         qemu_anon_ram_free(block->colo_cache, block->used_length);
3901                         block->colo_cache = NULL;
3902                     }
3903                 }
3904                 return -errno;
3905             }
3906             if (!machine_dump_guest_core(current_machine)) {
3907                 qemu_madvise(block->colo_cache, block->used_length,
3908                              QEMU_MADV_DONTDUMP);
3909             }
3910         }
3911     }
3912
3913     /*
3914     * Record the dirty pages that sent by PVM, we use this dirty bitmap together
3915     * with to decide which page in cache should be flushed into SVM's RAM. Here
3916     * we use the same name 'ram_bitmap' as for migration.
3917     */
3918     if (ram_bytes_total()) {
3919         RAMBlock *block;
3920
3921         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3922             unsigned long pages = block->max_length >> TARGET_PAGE_BITS;
3923             block->bmap = bitmap_new(pages);
3924         }
3925     }
3926
3927     colo_init_ram_state();
3928     return 0;
3929 }
3930
3931 /* TODO: duplicated with ram_init_bitmaps */
3932 void colo_incoming_start_dirty_log(void)
3933 {
3934     RAMBlock *block = NULL;
3935     /* For memory_global_dirty_log_start below. */
3936     qemu_mutex_lock_iothread();
3937     qemu_mutex_lock_ramlist();
3938
3939     memory_global_dirty_log_sync();
3940     WITH_RCU_READ_LOCK_GUARD() {
3941         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3942             ramblock_sync_dirty_bitmap(ram_state, block);
3943             /* Discard this dirty bitmap record */
3944             bitmap_zero(block->bmap, block->max_length >> TARGET_PAGE_BITS);
3945         }
3946         memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION);
3947     }
3948     ram_state->migration_dirty_pages = 0;
3949     qemu_mutex_unlock_ramlist();
3950     qemu_mutex_unlock_iothread();
3951 }
3952
3953 /* It is need to hold the global lock to call this helper */
3954 void colo_release_ram_cache(void)
3955 {
3956     RAMBlock *block;
3957
3958     memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION);
3959     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3960         g_free(block->bmap);
3961         block->bmap = NULL;
3962     }
3963
3964     WITH_RCU_READ_LOCK_GUARD() {
3965         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3966             if (block->colo_cache) {
3967                 qemu_anon_ram_free(block->colo_cache, block->used_length);
3968                 block->colo_cache = NULL;
3969             }
3970         }
3971     }
3972     ram_state_cleanup(&ram_state);
3973 }
3974
3975 /**
3976  * ram_load_setup: Setup RAM for migration incoming side
3977  *
3978  * Returns zero to indicate success and negative for error
3979  *
3980  * @f: QEMUFile where to receive the data
3981  * @opaque: RAMState pointer
3982  */
3983 static int ram_load_setup(QEMUFile *f, void *opaque)
3984 {
3985     if (compress_threads_load_setup(f)) {
3986         return -1;
3987     }
3988
3989     xbzrle_load_setup();
3990     ramblock_recv_map_init();
3991
3992     return 0;
3993 }
3994
3995 static int ram_load_cleanup(void *opaque)
3996 {
3997     RAMBlock *rb;
3998
3999     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
4000         qemu_ram_block_writeback(rb);
4001     }
4002
4003     xbzrle_load_cleanup();
4004     compress_threads_load_cleanup();
4005
4006     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
4007         g_free(rb->receivedmap);
4008         rb->receivedmap = NULL;
4009     }
4010
4011     return 0;
4012 }
4013
4014 /**
4015  * ram_postcopy_incoming_init: allocate postcopy data structures
4016  *
4017  * Returns 0 for success and negative if there was one error
4018  *
4019  * @mis: current migration incoming state
4020  *
4021  * Allocate data structures etc needed by incoming migration with
4022  * postcopy-ram. postcopy-ram's similarly names
4023  * postcopy_ram_incoming_init does the work.
4024  */
4025 int ram_postcopy_incoming_init(MigrationIncomingState *mis)
4026 {
4027     return postcopy_ram_incoming_init(mis);
4028 }
4029
4030 /**
4031  * ram_load_postcopy: load a page in postcopy case
4032  *
4033  * Returns 0 for success or -errno in case of error
4034  *
4035  * Called in postcopy mode by ram_load().
4036  * rcu_read_lock is taken prior to this being called.
4037  *
4038  * @f: QEMUFile where to send the data
4039  * @channel: the channel to use for loading
4040  */
4041 int ram_load_postcopy(QEMUFile *f, int channel)
4042 {
4043     int flags = 0, ret = 0;
4044     bool place_needed = false;
4045     bool matches_target_page_size = false;
4046     MigrationIncomingState *mis = migration_incoming_get_current();
4047     PostcopyTmpPage *tmp_page = &mis->postcopy_tmp_pages[channel];
4048
4049     while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
4050         ram_addr_t addr;
4051         void *page_buffer = NULL;
4052         void *place_source = NULL;
4053         RAMBlock *block = NULL;
4054         uint8_t ch;
4055         int len;
4056
4057         addr = qemu_get_be64(f);
4058
4059         /*
4060          * If qemu file error, we should stop here, and then "addr"
4061          * may be invalid
4062          */
4063         ret = qemu_file_get_error(f);
4064         if (ret) {
4065             break;
4066         }
4067
4068         flags = addr & ~TARGET_PAGE_MASK;
4069         addr &= TARGET_PAGE_MASK;
4070
4071         trace_ram_load_postcopy_loop(channel, (uint64_t)addr, flags);
4072         if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
4073                      RAM_SAVE_FLAG_COMPRESS_PAGE)) {
4074             block = ram_block_from_stream(mis, f, flags, channel);
4075             if (!block) {
4076                 ret = -EINVAL;
4077                 break;
4078             }
4079
4080             /*
4081              * Relying on used_length is racy and can result in false positives.
4082              * We might place pages beyond used_length in case RAM was shrunk
4083              * while in postcopy, which is fine - trying to place via
4084              * UFFDIO_COPY/UFFDIO_ZEROPAGE will never segfault.
4085              */
4086             if (!block->host || addr >= block->postcopy_length) {
4087                 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
4088                 ret = -EINVAL;
4089                 break;
4090             }
4091             tmp_page->target_pages++;
4092             matches_target_page_size = block->page_size == TARGET_PAGE_SIZE;
4093             /*
4094              * Postcopy requires that we place whole host pages atomically;
4095              * these may be huge pages for RAMBlocks that are backed by
4096              * hugetlbfs.
4097              * To make it atomic, the data is read into a temporary page
4098              * that's moved into place later.
4099              * The migration protocol uses,  possibly smaller, target-pages
4100              * however the source ensures it always sends all the components
4101              * of a host page in one chunk.
4102              */
4103             page_buffer = tmp_page->tmp_huge_page +
4104                           host_page_offset_from_ram_block_offset(block, addr);
4105             /* If all TP are zero then we can optimise the place */
4106             if (tmp_page->target_pages == 1) {
4107                 tmp_page->host_addr =
4108                     host_page_from_ram_block_offset(block, addr);
4109             } else if (tmp_page->host_addr !=
4110                        host_page_from_ram_block_offset(block, addr)) {
4111                 /* not the 1st TP within the HP */
4112                 error_report("Non-same host page detected on channel %d: "
4113                              "Target host page %p, received host page %p "
4114                              "(rb %s offset 0x"RAM_ADDR_FMT" target_pages %d)",
4115                              channel, tmp_page->host_addr,
4116                              host_page_from_ram_block_offset(block, addr),
4117                              block->idstr, addr, tmp_page->target_pages);
4118                 ret = -EINVAL;
4119                 break;
4120             }
4121
4122             /*
4123              * If it's the last part of a host page then we place the host
4124              * page
4125              */
4126             if (tmp_page->target_pages ==
4127                 (block->page_size / TARGET_PAGE_SIZE)) {
4128                 place_needed = true;
4129             }
4130             place_source = tmp_page->tmp_huge_page;
4131         }
4132
4133         switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
4134         case RAM_SAVE_FLAG_ZERO:
4135             ch = qemu_get_byte(f);
4136             /*
4137              * Can skip to set page_buffer when
4138              * this is a zero page and (block->page_size == TARGET_PAGE_SIZE).
4139              */
4140             if (ch || !matches_target_page_size) {
4141                 memset(page_buffer, ch, TARGET_PAGE_SIZE);
4142             }
4143             if (ch) {
4144                 tmp_page->all_zero = false;
4145             }
4146             break;
4147
4148         case RAM_SAVE_FLAG_PAGE:
4149             tmp_page->all_zero = false;
4150             if (!matches_target_page_size) {
4151                 /* For huge pages, we always use temporary buffer */
4152                 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
4153             } else {
4154                 /*
4155                  * For small pages that matches target page size, we
4156                  * avoid the qemu_file copy.  Instead we directly use
4157                  * the buffer of QEMUFile to place the page.  Note: we
4158                  * cannot do any QEMUFile operation before using that
4159                  * buffer to make sure the buffer is valid when
4160                  * placing the page.
4161                  */
4162                 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
4163                                          TARGET_PAGE_SIZE);
4164             }
4165             break;
4166         case RAM_SAVE_FLAG_COMPRESS_PAGE:
4167             tmp_page->all_zero = false;
4168             len = qemu_get_be32(f);
4169             if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
4170                 error_report("Invalid compressed data length: %d", len);
4171                 ret = -EINVAL;
4172                 break;
4173             }
4174             decompress_data_with_multi_threads(f, page_buffer, len);
4175             break;
4176         case RAM_SAVE_FLAG_MULTIFD_FLUSH:
4177             multifd_recv_sync_main();
4178             break;
4179         case RAM_SAVE_FLAG_EOS:
4180             /* normal exit */
4181             if (migrate_multifd_flush_after_each_section()) {
4182                 multifd_recv_sync_main();
4183             }
4184             break;
4185         default:
4186             error_report("Unknown combination of migration flags: 0x%x"
4187                          " (postcopy mode)", flags);
4188             ret = -EINVAL;
4189             break;
4190         }
4191
4192         /* Got the whole host page, wait for decompress before placing. */
4193         if (place_needed) {
4194             ret |= wait_for_decompress_done();
4195         }
4196
4197         /* Detect for any possible file errors */
4198         if (!ret && qemu_file_get_error(f)) {
4199             ret = qemu_file_get_error(f);
4200         }
4201
4202         if (!ret && place_needed) {
4203             if (tmp_page->all_zero) {
4204                 ret = postcopy_place_page_zero(mis, tmp_page->host_addr, block);
4205             } else {
4206                 ret = postcopy_place_page(mis, tmp_page->host_addr,
4207                                           place_source, block);
4208             }
4209             place_needed = false;
4210             postcopy_temp_page_reset(tmp_page);
4211         }
4212     }
4213
4214     return ret;
4215 }
4216
4217 static bool postcopy_is_running(void)
4218 {
4219     PostcopyState ps = postcopy_state_get();
4220     return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END;
4221 }
4222
4223 /*
4224  * Flush content of RAM cache into SVM's memory.
4225  * Only flush the pages that be dirtied by PVM or SVM or both.
4226  */
4227 void colo_flush_ram_cache(void)
4228 {
4229     RAMBlock *block = NULL;
4230     void *dst_host;
4231     void *src_host;
4232     unsigned long offset = 0;
4233
4234     memory_global_dirty_log_sync();
4235     WITH_RCU_READ_LOCK_GUARD() {
4236         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
4237             ramblock_sync_dirty_bitmap(ram_state, block);
4238         }
4239     }
4240
4241     trace_colo_flush_ram_cache_begin(ram_state->migration_dirty_pages);
4242     WITH_RCU_READ_LOCK_GUARD() {
4243         block = QLIST_FIRST_RCU(&ram_list.blocks);
4244
4245         while (block) {
4246             unsigned long num = 0;
4247
4248             offset = colo_bitmap_find_dirty(ram_state, block, offset, &num);
4249             if (!offset_in_ramblock(block,
4250                                     ((ram_addr_t)offset) << TARGET_PAGE_BITS)) {
4251                 offset = 0;
4252                 num = 0;
4253                 block = QLIST_NEXT_RCU(block, next);
4254             } else {
4255                 unsigned long i = 0;
4256
4257                 for (i = 0; i < num; i++) {
4258                     migration_bitmap_clear_dirty(ram_state, block, offset + i);
4259                 }
4260                 dst_host = block->host
4261                          + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
4262                 src_host = block->colo_cache
4263                          + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
4264                 memcpy(dst_host, src_host, TARGET_PAGE_SIZE * num);
4265                 offset += num;
4266             }
4267         }
4268     }
4269     trace_colo_flush_ram_cache_end();
4270 }
4271
4272 /**
4273  * ram_load_precopy: load pages in precopy case
4274  *
4275  * Returns 0 for success or -errno in case of error
4276  *
4277  * Called in precopy mode by ram_load().
4278  * rcu_read_lock is taken prior to this being called.
4279  *
4280  * @f: QEMUFile where to send the data
4281  */
4282 static int ram_load_precopy(QEMUFile *f)
4283 {
4284     MigrationIncomingState *mis = migration_incoming_get_current();
4285     int flags = 0, ret = 0, invalid_flags = 0, len = 0, i = 0;
4286     /* ADVISE is earlier, it shows the source has the postcopy capability on */
4287     bool postcopy_advised = migration_incoming_postcopy_advised();
4288     if (!migrate_compress()) {
4289         invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
4290     }
4291
4292     while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
4293         ram_addr_t addr, total_ram_bytes;
4294         void *host = NULL, *host_bak = NULL;
4295         uint8_t ch;
4296
4297         /*
4298          * Yield periodically to let main loop run, but an iteration of
4299          * the main loop is expensive, so do it each some iterations
4300          */
4301         if ((i & 32767) == 0 && qemu_in_coroutine()) {
4302             aio_co_schedule(qemu_get_current_aio_context(),
4303                             qemu_coroutine_self());
4304             qemu_coroutine_yield();
4305         }
4306         i++;
4307
4308         addr = qemu_get_be64(f);
4309         flags = addr & ~TARGET_PAGE_MASK;
4310         addr &= TARGET_PAGE_MASK;
4311
4312         if (flags & invalid_flags) {
4313             if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
4314                 error_report("Received an unexpected compressed page");
4315             }
4316
4317             ret = -EINVAL;
4318             break;
4319         }
4320
4321         if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
4322                      RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
4323             RAMBlock *block = ram_block_from_stream(mis, f, flags,
4324                                                     RAM_CHANNEL_PRECOPY);
4325
4326             host = host_from_ram_block_offset(block, addr);
4327             /*
4328              * After going into COLO stage, we should not load the page
4329              * into SVM's memory directly, we put them into colo_cache firstly.
4330              * NOTE: We need to keep a copy of SVM's ram in colo_cache.
4331              * Previously, we copied all these memory in preparing stage of COLO
4332              * while we need to stop VM, which is a time-consuming process.
4333              * Here we optimize it by a trick, back-up every page while in
4334              * migration process while COLO is enabled, though it affects the
4335              * speed of the migration, but it obviously reduce the downtime of
4336              * back-up all SVM'S memory in COLO preparing stage.
4337              */
4338             if (migration_incoming_colo_enabled()) {
4339                 if (migration_incoming_in_colo_state()) {
4340                     /* In COLO stage, put all pages into cache temporarily */
4341                     host = colo_cache_from_block_offset(block, addr, true);
4342                 } else {
4343                    /*
4344                     * In migration stage but before COLO stage,
4345                     * Put all pages into both cache and SVM's memory.
4346                     */
4347                     host_bak = colo_cache_from_block_offset(block, addr, false);
4348                 }
4349             }
4350             if (!host) {
4351                 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
4352                 ret = -EINVAL;
4353                 break;
4354             }
4355             if (!migration_incoming_in_colo_state()) {
4356                 ramblock_recv_bitmap_set(block, host);
4357             }
4358
4359             trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
4360         }
4361
4362         switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
4363         case RAM_SAVE_FLAG_MEM_SIZE:
4364             /* Synchronize RAM block list */
4365             total_ram_bytes = addr;
4366             while (!ret && total_ram_bytes) {
4367                 RAMBlock *block;
4368                 char id[256];
4369                 ram_addr_t length;
4370
4371                 len = qemu_get_byte(f);
4372                 qemu_get_buffer(f, (uint8_t *)id, len);
4373                 id[len] = 0;
4374                 length = qemu_get_be64(f);
4375
4376                 block = qemu_ram_block_by_name(id);
4377                 if (block && !qemu_ram_is_migratable(block)) {
4378                     error_report("block %s should not be migrated !", id);
4379                     ret = -EINVAL;
4380                 } else if (block) {
4381                     if (length != block->used_length) {
4382                         Error *local_err = NULL;
4383
4384                         ret = qemu_ram_resize(block, length,
4385                                               &local_err);
4386                         if (local_err) {
4387                             error_report_err(local_err);
4388                         }
4389                     }
4390                     /* For postcopy we need to check hugepage sizes match */
4391                     if (postcopy_advised && migrate_postcopy_ram() &&
4392                         block->page_size != qemu_host_page_size) {
4393                         uint64_t remote_page_size = qemu_get_be64(f);
4394                         if (remote_page_size != block->page_size) {
4395                             error_report("Mismatched RAM page size %s "
4396                                          "(local) %zd != %" PRId64,
4397                                          id, block->page_size,
4398                                          remote_page_size);
4399                             ret = -EINVAL;
4400                         }
4401                     }
4402                     if (migrate_ignore_shared()) {
4403                         hwaddr addr = qemu_get_be64(f);
4404                         if (ramblock_is_ignored(block) &&
4405                             block->mr->addr != addr) {
4406                             error_report("Mismatched GPAs for block %s "
4407                                          "%" PRId64 "!= %" PRId64,
4408                                          id, (uint64_t)addr,
4409                                          (uint64_t)block->mr->addr);
4410                             ret = -EINVAL;
4411                         }
4412                     }
4413                     ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
4414                                           block->idstr);
4415                 } else {
4416                     error_report("Unknown ramblock \"%s\", cannot "
4417                                  "accept migration", id);
4418                     ret = -EINVAL;
4419                 }
4420
4421                 total_ram_bytes -= length;
4422             }
4423             break;
4424
4425         case RAM_SAVE_FLAG_ZERO:
4426             ch = qemu_get_byte(f);
4427             ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
4428             break;
4429
4430         case RAM_SAVE_FLAG_PAGE:
4431             qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
4432             break;
4433
4434         case RAM_SAVE_FLAG_COMPRESS_PAGE:
4435             len = qemu_get_be32(f);
4436             if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
4437                 error_report("Invalid compressed data length: %d", len);
4438                 ret = -EINVAL;
4439                 break;
4440             }
4441             decompress_data_with_multi_threads(f, host, len);
4442             break;
4443
4444         case RAM_SAVE_FLAG_XBZRLE:
4445             if (load_xbzrle(f, addr, host) < 0) {
4446                 error_report("Failed to decompress XBZRLE page at "
4447                              RAM_ADDR_FMT, addr);
4448                 ret = -EINVAL;
4449                 break;
4450             }
4451             break;
4452         case RAM_SAVE_FLAG_MULTIFD_FLUSH:
4453             multifd_recv_sync_main();
4454             break;
4455         case RAM_SAVE_FLAG_EOS:
4456             /* normal exit */
4457             if (migrate_multifd_flush_after_each_section()) {
4458                 multifd_recv_sync_main();
4459             }
4460             break;
4461         default:
4462             if (flags & RAM_SAVE_FLAG_HOOK) {
4463                 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
4464             } else {
4465                 error_report("Unknown combination of migration flags: 0x%x",
4466                              flags);
4467                 ret = -EINVAL;
4468             }
4469         }
4470         if (!ret) {
4471             ret = qemu_file_get_error(f);
4472         }
4473         if (!ret && host_bak) {
4474             memcpy(host_bak, host, TARGET_PAGE_SIZE);
4475         }
4476     }
4477
4478     ret |= wait_for_decompress_done();
4479     return ret;
4480 }
4481
4482 static int ram_load(QEMUFile *f, void *opaque, int version_id)
4483 {
4484     int ret = 0;
4485     static uint64_t seq_iter;
4486     /*
4487      * If system is running in postcopy mode, page inserts to host memory must
4488      * be atomic
4489      */
4490     bool postcopy_running = postcopy_is_running();
4491
4492     seq_iter++;
4493
4494     if (version_id != 4) {
4495         return -EINVAL;
4496     }
4497
4498     /*
4499      * This RCU critical section can be very long running.
4500      * When RCU reclaims in the code start to become numerous,
4501      * it will be necessary to reduce the granularity of this
4502      * critical section.
4503      */
4504     WITH_RCU_READ_LOCK_GUARD() {
4505         if (postcopy_running) {
4506             /*
4507              * Note!  Here RAM_CHANNEL_PRECOPY is the precopy channel of
4508              * postcopy migration, we have another RAM_CHANNEL_POSTCOPY to
4509              * service fast page faults.
4510              */
4511             ret = ram_load_postcopy(f, RAM_CHANNEL_PRECOPY);
4512         } else {
4513             ret = ram_load_precopy(f);
4514         }
4515     }
4516     trace_ram_load_complete(ret, seq_iter);
4517
4518     return ret;
4519 }
4520
4521 static bool ram_has_postcopy(void *opaque)
4522 {
4523     RAMBlock *rb;
4524     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
4525         if (ramblock_is_pmem(rb)) {
4526             info_report("Block: %s, host: %p is a nvdimm memory, postcopy"
4527                          "is not supported now!", rb->idstr, rb->host);
4528             return false;
4529         }
4530     }
4531
4532     return migrate_postcopy_ram();
4533 }
4534
4535 /* Sync all the dirty bitmap with destination VM.  */
4536 static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs)
4537 {
4538     RAMBlock *block;
4539     QEMUFile *file = s->to_dst_file;
4540     int ramblock_count = 0;
4541
4542     trace_ram_dirty_bitmap_sync_start();
4543
4544     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
4545         qemu_savevm_send_recv_bitmap(file, block->idstr);
4546         trace_ram_dirty_bitmap_request(block->idstr);
4547         ramblock_count++;
4548     }
4549
4550     trace_ram_dirty_bitmap_sync_wait();
4551
4552     /* Wait until all the ramblocks' dirty bitmap synced */
4553     while (ramblock_count--) {
4554         qemu_sem_wait(&s->rp_state.rp_sem);
4555     }
4556
4557     trace_ram_dirty_bitmap_sync_complete();
4558
4559     return 0;
4560 }
4561
4562 static void ram_dirty_bitmap_reload_notify(MigrationState *s)
4563 {
4564     qemu_sem_post(&s->rp_state.rp_sem);
4565 }
4566
4567 /*
4568  * Read the received bitmap, revert it as the initial dirty bitmap.
4569  * This is only used when the postcopy migration is paused but wants
4570  * to resume from a middle point.
4571  */
4572 int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block)
4573 {
4574     int ret = -EINVAL;
4575     /* from_dst_file is always valid because we're within rp_thread */
4576     QEMUFile *file = s->rp_state.from_dst_file;
4577     unsigned long *le_bitmap, nbits = block->used_length >> TARGET_PAGE_BITS;
4578     uint64_t local_size = DIV_ROUND_UP(nbits, 8);
4579     uint64_t size, end_mark;
4580
4581     trace_ram_dirty_bitmap_reload_begin(block->idstr);
4582
4583     if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
4584         error_report("%s: incorrect state %s", __func__,
4585                      MigrationStatus_str(s->state));
4586         return -EINVAL;
4587     }
4588
4589     /*
4590      * Note: see comments in ramblock_recv_bitmap_send() on why we
4591      * need the endianness conversion, and the paddings.
4592      */
4593     local_size = ROUND_UP(local_size, 8);
4594
4595     /* Add paddings */
4596     le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
4597
4598     size = qemu_get_be64(file);
4599
4600     /* The size of the bitmap should match with our ramblock */
4601     if (size != local_size) {
4602         error_report("%s: ramblock '%s' bitmap size mismatch "
4603                      "(0x%"PRIx64" != 0x%"PRIx64")", __func__,
4604                      block->idstr, size, local_size);
4605         ret = -EINVAL;
4606         goto out;
4607     }
4608
4609     size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size);
4610     end_mark = qemu_get_be64(file);
4611
4612     ret = qemu_file_get_error(file);
4613     if (ret || size != local_size) {
4614         error_report("%s: read bitmap failed for ramblock '%s': %d"
4615                      " (size 0x%"PRIx64", got: 0x%"PRIx64")",
4616                      __func__, block->idstr, ret, local_size, size);
4617         ret = -EIO;
4618         goto out;
4619     }
4620
4621     if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) {
4622         error_report("%s: ramblock '%s' end mark incorrect: 0x%"PRIx64,
4623                      __func__, block->idstr, end_mark);
4624         ret = -EINVAL;
4625         goto out;
4626     }
4627
4628     /*
4629      * Endianness conversion. We are during postcopy (though paused).
4630      * The dirty bitmap won't change. We can directly modify it.
4631      */
4632     bitmap_from_le(block->bmap, le_bitmap, nbits);
4633
4634     /*
4635      * What we received is "received bitmap". Revert it as the initial
4636      * dirty bitmap for this ramblock.
4637      */
4638     bitmap_complement(block->bmap, block->bmap, nbits);
4639
4640     /* Clear dirty bits of discarded ranges that we don't want to migrate. */
4641     ramblock_dirty_bitmap_clear_discarded_pages(block);
4642
4643     /* We'll recalculate migration_dirty_pages in ram_state_resume_prepare(). */
4644     trace_ram_dirty_bitmap_reload_complete(block->idstr);
4645
4646     /*
4647      * We succeeded to sync bitmap for current ramblock. If this is
4648      * the last one to sync, we need to notify the main send thread.
4649      */
4650     ram_dirty_bitmap_reload_notify(s);
4651
4652     ret = 0;
4653 out:
4654     g_free(le_bitmap);
4655     return ret;
4656 }
4657
4658 static int ram_resume_prepare(MigrationState *s, void *opaque)
4659 {
4660     RAMState *rs = *(RAMState **)opaque;
4661     int ret;
4662
4663     ret = ram_dirty_bitmap_sync_all(s, rs);
4664     if (ret) {
4665         return ret;
4666     }
4667
4668     ram_state_resume_prepare(rs, s->to_dst_file);
4669
4670     return 0;
4671 }
4672
4673 void postcopy_preempt_shutdown_file(MigrationState *s)
4674 {
4675     qemu_put_be64(s->postcopy_qemufile_src, RAM_SAVE_FLAG_EOS);
4676     qemu_fflush(s->postcopy_qemufile_src);
4677 }
4678
4679 static SaveVMHandlers savevm_ram_handlers = {
4680     .save_setup = ram_save_setup,
4681     .save_live_iterate = ram_save_iterate,
4682     .save_live_complete_postcopy = ram_save_complete,
4683     .save_live_complete_precopy = ram_save_complete,
4684     .has_postcopy = ram_has_postcopy,
4685     .state_pending_exact = ram_state_pending_exact,
4686     .state_pending_estimate = ram_state_pending_estimate,
4687     .load_state = ram_load,
4688     .save_cleanup = ram_save_cleanup,
4689     .load_setup = ram_load_setup,
4690     .load_cleanup = ram_load_cleanup,
4691     .resume_prepare = ram_resume_prepare,
4692 };
4693
4694 static void ram_mig_ram_block_resized(RAMBlockNotifier *n, void *host,
4695                                       size_t old_size, size_t new_size)
4696 {
4697     PostcopyState ps = postcopy_state_get();
4698     ram_addr_t offset;
4699     RAMBlock *rb = qemu_ram_block_from_host(host, false, &offset);
4700     Error *err = NULL;
4701
4702     if (ramblock_is_ignored(rb)) {
4703         return;
4704     }
4705
4706     if (!migration_is_idle()) {
4707         /*
4708          * Precopy code on the source cannot deal with the size of RAM blocks
4709          * changing at random points in time - especially after sending the
4710          * RAM block sizes in the migration stream, they must no longer change.
4711          * Abort and indicate a proper reason.
4712          */
4713         error_setg(&err, "RAM block '%s' resized during precopy.", rb->idstr);
4714         migration_cancel(err);
4715         error_free(err);
4716     }
4717
4718     switch (ps) {
4719     case POSTCOPY_INCOMING_ADVISE:
4720         /*
4721          * Update what ram_postcopy_incoming_init()->init_range() does at the
4722          * time postcopy was advised. Syncing RAM blocks with the source will
4723          * result in RAM resizes.
4724          */
4725         if (old_size < new_size) {
4726             if (ram_discard_range(rb->idstr, old_size, new_size - old_size)) {
4727                 error_report("RAM block '%s' discard of resized RAM failed",
4728                              rb->idstr);
4729             }
4730         }
4731         rb->postcopy_length = new_size;
4732         break;
4733     case POSTCOPY_INCOMING_NONE:
4734     case POSTCOPY_INCOMING_RUNNING:
4735     case POSTCOPY_INCOMING_END:
4736         /*
4737          * Once our guest is running, postcopy does no longer care about
4738          * resizes. When growing, the new memory was not available on the
4739          * source, no handler needed.
4740          */
4741         break;
4742     default:
4743         error_report("RAM block '%s' resized during postcopy state: %d",
4744                      rb->idstr, ps);
4745         exit(-1);
4746     }
4747 }
4748
/*
 * RAM block notifier used by migration; only the resize callback is set.
 * It is added to the notifier list in ram_mig_init().
 */
static RAMBlockNotifier ram_mig_ram_notifier = {
    .ram_block_resized = ram_mig_ram_block_resized,
};
4752
/*
 * One-time setup of RAM migration: initialise the XBZRLE lock, register
 * the "ram" live section (instance id 0, version 4) with
 * savevm_ram_handlers and &ram_state as opaque, and add the RAM block
 * resize notifier.
 */
void ram_mig_init(void)
{
    qemu_mutex_init(&XBZRLE.lock);
    /* The version registered here is the one ram_load() accepts. */
    register_savevm_live("ram", 0, 4, &savevm_ram_handlers, &ram_state);
    ram_block_notifier_add(&ram_mig_ram_notifier);
}