migration: Export tls-[creds|hostname|authz] params to cmdline too
[qmiga/qemu.git] / migration / migration.c
1 /*
2  * QEMU live migration
3  *
4  * Copyright IBM, Corp. 2008
5  *
6  * Authors:
7  *  Anthony Liguori   <aliguori@us.ibm.com>
8  *
9  * This work is licensed under the terms of the GNU GPL, version 2.  See
10  * the COPYING file in the top-level directory.
11  *
12  * Contributions after 2012-01-13 are licensed under the terms of the
13  * GNU GPL, version 2 or (at your option) any later version.
14  */
15
16 #include "qemu/osdep.h"
17 #include "qemu/cutils.h"
18 #include "qemu/error-report.h"
19 #include "qemu/main-loop.h"
20 #include "migration/blocker.h"
21 #include "exec.h"
22 #include "fd.h"
23 #include "socket.h"
24 #include "sysemu/runstate.h"
25 #include "sysemu/sysemu.h"
26 #include "sysemu/cpu-throttle.h"
27 #include "rdma.h"
28 #include "ram.h"
29 #include "migration/global_state.h"
30 #include "migration/misc.h"
31 #include "migration.h"
32 #include "savevm.h"
33 #include "qemu-file.h"
34 #include "migration/vmstate.h"
35 #include "block/block.h"
36 #include "qapi/error.h"
37 #include "qapi/clone-visitor.h"
38 #include "qapi/qapi-visit-migration.h"
39 #include "qapi/qapi-visit-sockets.h"
40 #include "qapi/qapi-commands-migration.h"
41 #include "qapi/qapi-events-migration.h"
42 #include "qapi/qmp/qerror.h"
43 #include "qapi/qmp/qnull.h"
44 #include "qemu/rcu.h"
45 #include "block.h"
46 #include "postcopy-ram.h"
47 #include "qemu/thread.h"
48 #include "trace.h"
49 #include "exec/target_page.h"
50 #include "io/channel-buffer.h"
51 #include "io/channel-tls.h"
52 #include "migration/colo.h"
53 #include "hw/boards.h"
54 #include "hw/qdev-properties.h"
55 #include "hw/qdev-properties-system.h"
56 #include "monitor/monitor.h"
57 #include "net/announce.h"
58 #include "qemu/queue.h"
59 #include "multifd.h"
60 #include "qemu/yank.h"
61 #include "sysemu/cpus.h"
62 #include "yank_functions.h"
63 #include "sysemu/qtest.h"
64
65 #define MAX_THROTTLE  (128 << 20)      /* Migration transfer speed throttling */
66
67 /* Amount of time to allocate to each "chunk" of bandwidth-throttled
68  * data. */
69 #define BUFFER_DELAY     100
70 #define XFER_LIMIT_RATIO (1000 / BUFFER_DELAY)
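
/*
 * A worked sketch of how these two values interact (illustration only):
 * max_bandwidth is configured in bytes/second, while throttling operates
 * in BUFFER_DELAY-millisecond chunks, so each chunk may carry at most
 * max_bandwidth / XFER_LIMIT_RATIO bytes.
 */
#if 0 /* illustrative sketch, not compiled */
static uint64_t bytes_per_chunk(uint64_t max_bandwidth)
{
    /* 1000 ms/s divided by 100 ms/chunk = 10 chunks/s; e.g. a
     * 32 MiB/s limit allows (32 << 20) / 10 bytes per chunk. */
    return max_bandwidth / XFER_LIMIT_RATIO;
}
#endif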
71
72 /* Time in milliseconds we are allowed to stop the source,
73  * for sending the last part */
74 #define DEFAULT_MIGRATE_SET_DOWNTIME 300
75
76 /* Maximum migrate downtime set to 2000 seconds */
77 #define MAX_MIGRATE_DOWNTIME_SECONDS 2000
78 #define MAX_MIGRATE_DOWNTIME (MAX_MIGRATE_DOWNTIME_SECONDS * 1000)
79
80 /* Default compression thread count */
81 #define DEFAULT_MIGRATE_COMPRESS_THREAD_COUNT 8
82 /* Default decompression thread count; decompression is usually at
83  * least 4 times as fast as compression. */
84 #define DEFAULT_MIGRATE_DECOMPRESS_THREAD_COUNT 2
85 /* 0: no compression, 1: best speed, ... 9: best compression ratio */
86 #define DEFAULT_MIGRATE_COMPRESS_LEVEL 1
87 /* Define default autoconverge cpu throttle migration parameters */
88 #define DEFAULT_MIGRATE_THROTTLE_TRIGGER_THRESHOLD 50
89 #define DEFAULT_MIGRATE_CPU_THROTTLE_INITIAL 20
90 #define DEFAULT_MIGRATE_CPU_THROTTLE_INCREMENT 10
91 #define DEFAULT_MIGRATE_MAX_CPU_THROTTLE 99
92
93 /* Migration XBZRLE default cache size */
94 #define DEFAULT_MIGRATE_XBZRLE_CACHE_SIZE (64 * 1024 * 1024)
95
96 /* The delay time (in ms) between two COLO checkpoints */
97 #define DEFAULT_MIGRATE_X_CHECKPOINT_DELAY (200 * 100)
98 #define DEFAULT_MIGRATE_MULTIFD_CHANNELS 2
99 #define DEFAULT_MIGRATE_MULTIFD_COMPRESSION MULTIFD_COMPRESSION_NONE
100 /* 0: no compression, 1: best speed, ... 9: best compression ratio */
101 #define DEFAULT_MIGRATE_MULTIFD_ZLIB_LEVEL 1
102 /* 0: no compression, 1: best speed, ... 20: best compression ratio */
103 #define DEFAULT_MIGRATE_MULTIFD_ZSTD_LEVEL 1
104
105 /* Background transfer rate for postcopy; 0 means unlimited.  Note
106  * that page requests can still exceed this limit.
107  */
108 #define DEFAULT_MIGRATE_MAX_POSTCOPY_BANDWIDTH 0
109
110 /*
111  * Parameters for self_announce_delay giving a stream of RARP/ARP
112  * packets after migration.
113  */
114 #define DEFAULT_MIGRATE_ANNOUNCE_INITIAL  50
115 #define DEFAULT_MIGRATE_ANNOUNCE_MAX     550
116 #define DEFAULT_MIGRATE_ANNOUNCE_ROUNDS    5
117 #define DEFAULT_MIGRATE_ANNOUNCE_STEP    100
118
119 static NotifierList migration_state_notifiers =
120     NOTIFIER_LIST_INITIALIZER(migration_state_notifiers);
121
122 /* Messages sent on the return path from destination to source */
123 enum mig_rp_message_type {
124     MIG_RP_MSG_INVALID = 0,  /* Must be 0 */
125     MIG_RP_MSG_SHUT,         /* sibling will not send any more RP messages */
126     MIG_RP_MSG_PONG,         /* Response to a PING; data (seq: be32 ) */
127
128     MIG_RP_MSG_REQ_PAGES_ID, /* data (start: be64, len: be32, id: string) */
129     MIG_RP_MSG_REQ_PAGES,    /* data (start: be64, len: be32) */
130     MIG_RP_MSG_RECV_BITMAP,  /* send recved_bitmap back to source */
131     MIG_RP_MSG_RESUME_ACK,   /* tell source that we are ready to resume */
132
133     MIG_RP_MSG_MAX
134 };
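
/*
 * A sketch of how these messages are framed on the wire, following
 * migrate_send_rp_message() below (assumed struct form; the
 * authoritative encoding is the qemu_put_be16()/qemu_put_buffer()
 * calls themselves):
 */
#if 0 /* illustrative sketch, not compiled */
struct rp_message_frame {
    uint16_t type;   /* one of mig_rp_message_type, big-endian */
    uint16_t len;    /* payload length in bytes, big-endian */
    uint8_t data[];  /* payload, e.g. a be32 sequence number for PONG */
};
#endif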
135
136 /* Migration capabilities set */
137 struct MigrateCapsSet {
138     int size;                       /* Capability set size */
139     MigrationCapability caps[];     /* Flexible array of capabilities */
140 };
141 typedef struct MigrateCapsSet MigrateCapsSet;
142
143 /* Define and initialize MigrateCapsSet */
144 #define INITIALIZE_MIGRATE_CAPS_SET(_name, ...)   \
145     MigrateCapsSet _name = {    \
146         .size = sizeof((int []) { __VA_ARGS__ }) / sizeof(int), \
147         .caps = { __VA_ARGS__ } \
148     }
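
/*
 * A sketch of what the macro above expands to; CAP_A/CAP_B are
 * placeholder capability names. The sizeof over a compound literal
 * counts the arguments, so .size always matches the initializer list.
 */
#if 0 /* illustrative expansion, not compiled */
/* INITIALIZE_MIGRATE_CAPS_SET(example, CAP_A, CAP_B) becomes: */
MigrateCapsSet example = {
    .size = sizeof((int []) { CAP_A, CAP_B }) / sizeof(int), /* == 2 */
    .caps = { CAP_A, CAP_B }
};
#endif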
149
150 /* Background-snapshot compatibility check list */
151 static const
152 INITIALIZE_MIGRATE_CAPS_SET(check_caps_background_snapshot,
153     MIGRATION_CAPABILITY_POSTCOPY_RAM,
154     MIGRATION_CAPABILITY_DIRTY_BITMAPS,
155     MIGRATION_CAPABILITY_POSTCOPY_BLOCKTIME,
156     MIGRATION_CAPABILITY_LATE_BLOCK_ACTIVATE,
157     MIGRATION_CAPABILITY_RETURN_PATH,
158     MIGRATION_CAPABILITY_MULTIFD,
159     MIGRATION_CAPABILITY_PAUSE_BEFORE_SWITCHOVER,
160     MIGRATION_CAPABILITY_AUTO_CONVERGE,
161     MIGRATION_CAPABILITY_RELEASE_RAM,
162     MIGRATION_CAPABILITY_RDMA_PIN_ALL,
163     MIGRATION_CAPABILITY_COMPRESS,
164     MIGRATION_CAPABILITY_XBZRLE,
165     MIGRATION_CAPABILITY_X_COLO,
166     MIGRATION_CAPABILITY_VALIDATE_UUID,
167     MIGRATION_CAPABILITY_ZERO_COPY_SEND);
168
169 /* When we add fault tolerance, we could have several
170    migrations at once.  For now we don't need to add
171    dynamic creation of migration objects. */
172
173 static MigrationState *current_migration;
174 static MigrationIncomingState *current_incoming;
175
176 static GSList *migration_blockers;
177
178 static bool migration_object_check(MigrationState *ms, Error **errp);
179 static int migration_maybe_pause(MigrationState *s,
180                                  int *current_active_state,
181                                  int new_state);
182 static void migrate_fd_cancel(MigrationState *s);
183
184 static bool migrate_allow_multi_channels = true;
185
186 void migrate_protocol_allow_multi_channels(bool allow)
187 {
188     migrate_allow_multi_channels = allow;
189 }
190
191 bool migrate_multi_channels_is_allowed(void)
192 {
193     return migrate_allow_multi_channels;
194 }
195
196 static gint page_request_addr_cmp(gconstpointer ap, gconstpointer bp)
197 {
198     uintptr_t a = (uintptr_t) ap, b = (uintptr_t) bp;
199
200     return (a > b) - (a < b);
201 }
202
203 void migration_object_init(void)
204 {
205     /* This can only be called once. */
206     assert(!current_migration);
207     current_migration = MIGRATION_OBJ(object_new(TYPE_MIGRATION));
208
209     /*
210      * Initialize the incoming migration object as well, whether or
211      * not we'll use it.
212      */
213     assert(!current_incoming);
214     current_incoming = g_new0(MigrationIncomingState, 1);
215     current_incoming->state = MIGRATION_STATUS_NONE;
216     current_incoming->postcopy_remote_fds =
217         g_array_new(FALSE, TRUE, sizeof(struct PostCopyFD));
218     qemu_mutex_init(&current_incoming->rp_mutex);
219     qemu_mutex_init(&current_incoming->postcopy_prio_thread_mutex);
220     qemu_event_init(&current_incoming->main_thread_load_event, false);
221     qemu_sem_init(&current_incoming->postcopy_pause_sem_dst, 0);
222     qemu_sem_init(&current_incoming->postcopy_pause_sem_fault, 0);
223     qemu_sem_init(&current_incoming->postcopy_pause_sem_fast_load, 0);
224     qemu_mutex_init(&current_incoming->page_request_mutex);
225     current_incoming->page_requested = g_tree_new(page_request_addr_cmp);
226
227     migration_object_check(current_migration, &error_fatal);
228
229     blk_mig_init();
230     ram_mig_init();
231     dirty_bitmap_mig_init();
232 }
233
234 void migration_cancel(const Error *error)
235 {
236     if (error) {
237         migrate_set_error(current_migration, error);
238     }
239     migrate_fd_cancel(current_migration);
240 }
241
242 void migration_shutdown(void)
243 {
244     /*
245      * When the QEMU main thread exits, the COLO thread may be
246      * waiting on a semaphore, so wake it up before migration
247      * shutdown.
248      */
249     colo_shutdown();
250     /*
251      * Cancel the current migration - that will (eventually)
252      * stop the migration using this structure
253      */
254     migration_cancel(NULL);
255     object_unref(OBJECT(current_migration));
256
257     /*
258      * Cancel outgoing migration of dirty bitmaps. It should
259      * at least unref used block nodes.
260      */
261     dirty_bitmap_mig_cancel_outgoing();
262
263     /*
264      * Cancel incoming migration of dirty bitmaps. Dirty bitmaps
265      * are non-critical data, and their loss is never considered
266      * serious.
267      */
268     dirty_bitmap_mig_cancel_incoming();
269 }
270
271 /* For outgoing */
272 MigrationState *migrate_get_current(void)
273 {
274     /* This can only be called after the object is created. */
275     assert(current_migration);
276     return current_migration;
277 }
278
279 MigrationIncomingState *migration_incoming_get_current(void)
280 {
281     assert(current_incoming);
282     return current_incoming;
283 }
284
285 void migration_incoming_transport_cleanup(MigrationIncomingState *mis)
286 {
287     if (mis->socket_address_list) {
288         qapi_free_SocketAddressList(mis->socket_address_list);
289         mis->socket_address_list = NULL;
290     }
291
292     if (mis->transport_cleanup) {
293         mis->transport_cleanup(mis->transport_data);
294         mis->transport_data = mis->transport_cleanup = NULL;
295     }
296 }
297
298 void migration_incoming_state_destroy(void)
299 {
300     struct MigrationIncomingState *mis = migration_incoming_get_current();
301
302     if (mis->to_src_file) {
303         /* Tell source that we are done */
304         migrate_send_rp_shut(mis, qemu_file_get_error(mis->from_src_file) != 0);
305         qemu_fclose(mis->to_src_file);
306         mis->to_src_file = NULL;
307     }
308
309     if (mis->from_src_file) {
310         migration_ioc_unregister_yank_from_file(mis->from_src_file);
311         qemu_fclose(mis->from_src_file);
312         mis->from_src_file = NULL;
313     }
314     if (mis->postcopy_remote_fds) {
315         g_array_free(mis->postcopy_remote_fds, TRUE);
316         mis->postcopy_remote_fds = NULL;
317     }
318
319     migration_incoming_transport_cleanup(mis);
320     qemu_event_reset(&mis->main_thread_load_event);
321
322     if (mis->page_requested) {
323         g_tree_destroy(mis->page_requested);
324         mis->page_requested = NULL;
325     }
326
327     if (mis->postcopy_qemufile_dst) {
328         migration_ioc_unregister_yank_from_file(mis->postcopy_qemufile_dst);
329         qemu_fclose(mis->postcopy_qemufile_dst);
330         mis->postcopy_qemufile_dst = NULL;
331     }
332
333     yank_unregister_instance(MIGRATION_YANK_INSTANCE);
334 }
335
336 static void migrate_generate_event(int new_state)
337 {
338     if (migrate_use_events()) {
339         qapi_event_send_migration(new_state);
340     }
341 }
342
343 static bool migrate_late_block_activate(void)
344 {
345     MigrationState *s;
346
347     s = migrate_get_current();
348
349     return s->enabled_capabilities[
350         MIGRATION_CAPABILITY_LATE_BLOCK_ACTIVATE];
351 }
352
353 /*
354  * Send a message on the return channel back to the source
355  * of the migration.
356  */
357 static int migrate_send_rp_message(MigrationIncomingState *mis,
358                                    enum mig_rp_message_type message_type,
359                                    uint16_t len, void *data)
360 {
361     int ret = 0;
362
363     trace_migrate_send_rp_message((int)message_type, len);
364     QEMU_LOCK_GUARD(&mis->rp_mutex);
365
366     /*
367      * It's possible that the file handle got lost due to network
368      * failures.
369      */
370     if (!mis->to_src_file) {
371         ret = -EIO;
372         return ret;
373     }
374
375     qemu_put_be16(mis->to_src_file, (unsigned int)message_type);
376     qemu_put_be16(mis->to_src_file, len);
377     qemu_put_buffer(mis->to_src_file, data, len);
378     qemu_fflush(mis->to_src_file);
379
380     /* It's possible that the QEMU file hit an error during the send */
381     ret = qemu_file_get_error(mis->to_src_file);
382
383     return ret;
384 }
385
386 /* Request one page from the source VM at the given start address.
387  *   rb: the RAMBlock to request the page in
388  *   start: address offset within the RB
389  * The request length is the RAMBlock's page size.
390  */
391 int migrate_send_rp_message_req_pages(MigrationIncomingState *mis,
392                                       RAMBlock *rb, ram_addr_t start)
393 {
394     uint8_t bufc[12 + 1 + 255]; /* start (8), len (4), rbname len (1) + rbname (up to 255) */
395     size_t msglen = 12; /* start + len */
396     size_t len = qemu_ram_pagesize(rb);
397     enum mig_rp_message_type msg_type;
398     const char *rbname;
399     int rbname_len;
400
401     *(uint64_t *)bufc = cpu_to_be64((uint64_t)start);
402     *(uint32_t *)(bufc + 8) = cpu_to_be32((uint32_t)len);
403
404     /*
405      * We track the last RAMBlock that we sent a page request for.  Note
406      * that we don't need locking because this function will only be
407      * called within the postcopy ram fault thread.
408      */
409     if (rb != mis->last_rb) {
410         mis->last_rb = rb;
411
412         rbname = qemu_ram_get_idstr(rb);
413         rbname_len = strlen(rbname);
414
415         assert(rbname_len < 256);
416
417         bufc[msglen++] = rbname_len;
418         memcpy(bufc + msglen, rbname, rbname_len);
419         msglen += rbname_len;
420         msg_type = MIG_RP_MSG_REQ_PAGES_ID;
421     } else {
422         msg_type = MIG_RP_MSG_REQ_PAGES;
423     }
424
425     return migrate_send_rp_message(mis, msg_type, msglen, bufc);
426 }
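
/*
 * A sketch of the request payload built above (assumed layout, read off
 * the writes into bufc; the wire format has no padding):
 */
#if 0 /* illustrative sketch, not compiled */
struct rp_req_pages_payload {
    uint64_t start;      /* be64: offset within the RAMBlock */
    uint32_t len;        /* be32: the RAMBlock's page size */
    /* The two fields below are only present for MIG_RP_MSG_REQ_PAGES_ID,
     * i.e. when the RAMBlock differs from the previous request: */
    uint8_t rbname_len;  /* strlen(rbname), < 256 */
    char rbname[];       /* idstr bytes, not NUL-terminated */
};
#endif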
427
428 int migrate_send_rp_req_pages(MigrationIncomingState *mis,
429                               RAMBlock *rb, ram_addr_t start, uint64_t haddr)
430 {
431     void *aligned = (void *)(uintptr_t)ROUND_DOWN(haddr, qemu_ram_pagesize(rb));
432     bool received = false;
433
434     WITH_QEMU_LOCK_GUARD(&mis->page_request_mutex) {
435         received = ramblock_recv_bitmap_test_byte_offset(rb, start);
436         if (!received && !g_tree_lookup(mis->page_requested, aligned)) {
437             /*
438              * The page has not been received, and it's not yet in the page
439              * request list.  Queue it.  Set the element's value to 1, so that
440              * things like g_tree_lookup() will return TRUE (1) when found.
441              */
442             g_tree_insert(mis->page_requested, aligned, (gpointer)1);
443             mis->page_requested_count++;
444             trace_postcopy_page_req_add(aligned, mis->page_requested_count);
445         }
446     }
447
448     /*
449      * If the page is there, skip sending the message.  We don't even need
450      * the lock because once the page has arrived, it'll be there forever.
451      */
452     if (received) {
453         return 0;
454     }
455
456     return migrate_send_rp_message_req_pages(mis, rb, start);
457 }
458
459 static bool migration_colo_enabled;
460 bool migration_incoming_colo_enabled(void)
461 {
462     return migration_colo_enabled;
463 }
464
465 void migration_incoming_disable_colo(void)
466 {
467     ram_block_discard_disable(false);
468     migration_colo_enabled = false;
469 }
470
471 int migration_incoming_enable_colo(void)
472 {
473     if (ram_block_discard_disable(true)) {
474         error_report("COLO: cannot disable RAM discard");
475         return -EBUSY;
476     }
477     migration_colo_enabled = true;
478     return 0;
479 }
480
481 void migrate_add_address(SocketAddress *address)
482 {
483     MigrationIncomingState *mis = migration_incoming_get_current();
484
485     QAPI_LIST_PREPEND(mis->socket_address_list,
486                       QAPI_CLONE(SocketAddress, address));
487 }
488
489 static void qemu_start_incoming_migration(const char *uri, Error **errp)
490 {
491     const char *p = NULL;
492
493     migrate_protocol_allow_multi_channels(false); /* reset it anyway */
494     qapi_event_send_migration(MIGRATION_STATUS_SETUP);
495     if (strstart(uri, "tcp:", &p) ||
496         strstart(uri, "unix:", NULL) ||
497         strstart(uri, "vsock:", NULL)) {
498         migrate_protocol_allow_multi_channels(true);
499         socket_start_incoming_migration(p ? p : uri, errp);
500 #ifdef CONFIG_RDMA
501     } else if (strstart(uri, "rdma:", &p)) {
502         rdma_start_incoming_migration(p, errp);
503 #endif
504     } else if (strstart(uri, "exec:", &p)) {
505         exec_start_incoming_migration(p, errp);
506     } else if (strstart(uri, "fd:", &p)) {
507         fd_start_incoming_migration(p, errp);
508     } else {
509         error_setg(errp, "unknown migration protocol: %s", uri);
510     }
511 }
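
/*
 * Example -incoming URIs accepted above (illustrative values):
 *
 *   tcp:0.0.0.0:4444         listen on a TCP host:port
 *   unix:/tmp/migrate.sock   listen on a UNIX socket
 *   vsock:3:4444             listen on a vsock cid:port
 *   rdma:192.168.1.2:4444    RDMA transport (only with CONFIG_RDMA)
 *   exec:gzip -cd state.gz   read the stream from a command's stdout
 *   fd:42                    read from an already-open file descriptor
 *
 * Note that "tcp:" is stripped before the call, while "unix:" and
 * "vsock:" pass the full URI (p stays NULL), since the socket code
 * re-parses the scheme itself.
 */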
512
513 static void process_incoming_migration_bh(void *opaque)
514 {
515     Error *local_err = NULL;
516     MigrationIncomingState *mis = opaque;
517
518     /* If the late_block_activate capability is set:
519      * Only fire up the block code now if we're going to restart the
520      * VM, else 'cont' will do it.
521      * This causes file locking to happen, so we don't want it to happen
522      * unless we really are starting the VM.
523      */
524     if (!migrate_late_block_activate() ||
525          (autostart && (!global_state_received() ||
526             global_state_get_runstate() == RUN_STATE_RUNNING))) {
527         /* Make sure all file formats throw away their mutable metadata.
528          * If we get an error here, just don't restart the VM yet. */
529         bdrv_activate_all(&local_err);
530         if (local_err) {
531             error_report_err(local_err);
532             local_err = NULL;
533             autostart = false;
534         }
535     }
536
537     /*
538      * This must happen after all error conditions are dealt with and
539      * we're sure the VM is going to be running on this host.
540      */
541     qemu_announce_self(&mis->announce_timer, migrate_announce_params());
542
543     if (multifd_load_cleanup(&local_err) != 0) {
544         error_report_err(local_err);
545         autostart = false;
546     }
547     /* If the global state section was not received, or we are in the
548        running state, we need to obey autostart. Any other state is set
549        with runstate_set(). */
550
551     dirty_bitmap_mig_before_vm_start();
552
553     if (!global_state_received() ||
554         global_state_get_runstate() == RUN_STATE_RUNNING) {
555         if (autostart) {
556             vm_start();
557         } else {
558             runstate_set(RUN_STATE_PAUSED);
559         }
560     } else if (migration_incoming_colo_enabled()) {
561         migration_incoming_disable_colo();
562         vm_start();
563     } else {
564         runstate_set(global_state_get_runstate());
565     }
566     /*
567      * This must happen after any state changes, since as soon as an
568      * external observer sees this event they might start to prod at the
569      * VM, assuming it's ready to use.
570      */
571     migrate_set_state(&mis->state, MIGRATION_STATUS_ACTIVE,
572                       MIGRATION_STATUS_COMPLETED);
573     qemu_bh_delete(mis->bh);
574     migration_incoming_state_destroy();
575 }
576
577 static void process_incoming_migration_co(void *opaque)
578 {
579     MigrationIncomingState *mis = migration_incoming_get_current();
580     PostcopyState ps;
581     int ret;
582     Error *local_err = NULL;
583
584     assert(mis->from_src_file);
585     mis->migration_incoming_co = qemu_coroutine_self();
586     mis->largest_page_size = qemu_ram_pagesize_largest();
587     postcopy_state_set(POSTCOPY_INCOMING_NONE);
588     migrate_set_state(&mis->state, MIGRATION_STATUS_NONE,
589                       MIGRATION_STATUS_ACTIVE);
590     ret = qemu_loadvm_state(mis->from_src_file);
591
592     ps = postcopy_state_get();
593     trace_process_incoming_migration_co_end(ret, ps);
594     if (ps != POSTCOPY_INCOMING_NONE) {
595         if (ps == POSTCOPY_INCOMING_ADVISE) {
596             /*
597              * Where a migration had postcopy enabled (and thus went to advise)
598              * but managed to complete within the precopy period, we can use
599              * the normal exit.
600              */
601             postcopy_ram_incoming_cleanup(mis);
602         } else if (ret >= 0) {
603             /*
604              * Postcopy was started, cleanup should happen at the end of the
605              * postcopy thread.
606              */
607             trace_process_incoming_migration_co_postcopy_end_main();
608             return;
609         }
610         /* Else, something went wrong; just fall through to the normal exit */
611     }
612
613     /* We now have the COLO info, and know whether we are in COLO mode */
614     if (!ret && migration_incoming_colo_enabled()) {
615         /* Make sure all file formats throw away their mutable metadata */
616         bdrv_activate_all(&local_err);
617         if (local_err) {
618             error_report_err(local_err);
619             goto fail;
620         }
621
622         qemu_thread_create(&mis->colo_incoming_thread, "COLO incoming",
623              colo_process_incoming_thread, mis, QEMU_THREAD_JOINABLE);
624         mis->have_colo_incoming_thread = true;
625         qemu_coroutine_yield();
626
627         qemu_mutex_unlock_iothread();
628         /* Wait for the COLO incoming thread to exit before freeing resources */
629         qemu_thread_join(&mis->colo_incoming_thread);
630         qemu_mutex_lock_iothread();
631         /* We hold the global iothread lock, so it is safe here */
632         colo_release_ram_cache();
633     }
634
635     if (ret < 0) {
636         error_report("load of migration failed: %s", strerror(-ret));
637         goto fail;
638     }
639     mis->bh = qemu_bh_new(process_incoming_migration_bh, mis);
640     qemu_bh_schedule(mis->bh);
641     mis->migration_incoming_co = NULL;
642     return;
643 fail:
644     local_err = NULL;
645     migrate_set_state(&mis->state, MIGRATION_STATUS_ACTIVE,
646                       MIGRATION_STATUS_FAILED);
647     qemu_fclose(mis->from_src_file);
648     if (multifd_load_cleanup(&local_err) != 0) {
649         error_report_err(local_err);
650     }
651     exit(EXIT_FAILURE);
652 }
653
654 /**
655  * migration_incoming_setup: Setup incoming migration
656  * @f: file for main migration channel
657  * @errp: where to put errors
658  *
659  * Returns: %true on success, %false on error.
660  */
661 static bool migration_incoming_setup(QEMUFile *f, Error **errp)
662 {
663     MigrationIncomingState *mis = migration_incoming_get_current();
664
665     if (multifd_load_setup(errp) != 0) {
666         return false;
667     }
668
669     if (!mis->from_src_file) {
670         mis->from_src_file = f;
671     }
672     qemu_file_set_blocking(f, false);
673     return true;
674 }
675
676 void migration_incoming_process(void)
677 {
678     Coroutine *co = qemu_coroutine_create(process_incoming_migration_co, NULL);
679     qemu_coroutine_enter(co);
680 }
681
682 /* Returns true if recovered from a paused migration, otherwise false */
683 static bool postcopy_try_recover(void)
684 {
685     MigrationIncomingState *mis = migration_incoming_get_current();
686
687     if (mis->state == MIGRATION_STATUS_POSTCOPY_PAUSED) {
688         /* Resumed from a paused postcopy migration */
689
690         /* This should be set already in migration_incoming_setup() */
691         assert(mis->from_src_file);
692         /* Postcopy has a standalone thread to do the VM load */
693         qemu_file_set_blocking(mis->from_src_file, true);
694
695         /* Re-configure the return path */
696         mis->to_src_file = qemu_file_get_return_path(mis->from_src_file);
697
698         migrate_set_state(&mis->state, MIGRATION_STATUS_POSTCOPY_PAUSED,
699                           MIGRATION_STATUS_POSTCOPY_RECOVER);
700
701         /*
702          * Here, we only wake up the main loading thread (while the
703          * other threads will still be waiting), so that we can receive
704          * commands from the source now, and answer them if needed. The
705          * other threads will be woken up afterwards, once we are sure
706          * that the source is ready to reply to page requests.
707          */
708         qemu_sem_post(&mis->postcopy_pause_sem_dst);
709         return true;
710     }
711
712     return false;
713 }
714
715 void migration_fd_process_incoming(QEMUFile *f, Error **errp)
716 {
717     if (!migration_incoming_setup(f, errp)) {
718         return;
719     }
720     if (postcopy_try_recover()) {
721         return;
722     }
723     migration_incoming_process();
724 }
725
726 static bool migration_needs_multiple_sockets(void)
727 {
728     return migrate_use_multifd() || migrate_postcopy_preempt();
729 }
730
731 void migration_ioc_process_incoming(QIOChannel *ioc, Error **errp)
732 {
733     MigrationIncomingState *mis = migration_incoming_get_current();
734     Error *local_err = NULL;
735     bool start_migration;
736     QEMUFile *f;
737
738     if (!mis->from_src_file) {
739         /* The first connection (multifd may have multiple) */
740         f = qemu_file_new_input(ioc);
741
742         if (!migration_incoming_setup(f, errp)) {
743             return;
744         }
745
746         /*
747          * A common migration needs only one channel, so we can start
748          * right away.  Some features need more than one channel; for those we wait.
749          */
750         start_migration = !migration_needs_multiple_sockets();
751     } else {
752         /* Multiple connections */
753         assert(migration_needs_multiple_sockets());
754         if (migrate_use_multifd()) {
755             start_migration = multifd_recv_new_channel(ioc, &local_err);
756         } else {
757             assert(migrate_postcopy_preempt());
758             f = qemu_file_new_input(ioc);
759             start_migration = postcopy_preempt_new_channel(mis, f);
760         }
761         if (local_err) {
762             error_propagate(errp, local_err);
763             return;
764         }
765     }
766
767     if (start_migration) {
768         /* If it's a recovery, we're done */
769         if (postcopy_try_recover()) {
770             return;
771         }
772         migration_incoming_process();
773     }
774 }
775
776 /**
777  * @migration_has_all_channels: We have received all channels that we need
778  *
779  * Returns true when we have connections to all the channels that
780  * we need for migration.
781  */
782 bool migration_has_all_channels(void)
783 {
784     MigrationIncomingState *mis = migration_incoming_get_current();
785
786     if (!mis->from_src_file) {
787         return false;
788     }
789
790     if (migrate_use_multifd()) {
791         return multifd_recv_all_channels_created();
792     }
793
794     if (migrate_postcopy_preempt()) {
795         return mis->postcopy_qemufile_dst != NULL;
796     }
797
798     return true;
799 }
800
801 /*
802  * Send a 'SHUT' message on the return channel with the given value
803  * to indicate that we've finished with the RP.  A non-zero value
804  * indicates an error.
805  */
806 void migrate_send_rp_shut(MigrationIncomingState *mis,
807                           uint32_t value)
808 {
809     uint32_t buf;
810
811     buf = cpu_to_be32(value);
812     migrate_send_rp_message(mis, MIG_RP_MSG_SHUT, sizeof(buf), &buf);
813 }
814
815 /*
816  * Send a 'PONG' message on the return channel with the given value
817  * (normally in response to a 'PING')
818  */
819 void migrate_send_rp_pong(MigrationIncomingState *mis,
820                           uint32_t value)
821 {
822     uint32_t buf;
823
824     buf = cpu_to_be32(value);
825     migrate_send_rp_message(mis, MIG_RP_MSG_PONG, sizeof(buf), &buf);
826 }
827
828 void migrate_send_rp_recv_bitmap(MigrationIncomingState *mis,
829                                  char *block_name)
830 {
831     char buf[512];
832     int len;
833     int64_t res;
834
835     /*
836      * First, we send the header part. It contains only the length of
837      * the idstr, and the idstr itself.
838      */
839     len = strlen(block_name);
840     buf[0] = len;
841     memcpy(buf + 1, block_name, len);
842
843     if (mis->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
844         error_report("%s: MSG_RP_RECV_BITMAP only used for recovery",
845                      __func__);
846         return;
847     }
848
849     migrate_send_rp_message(mis, MIG_RP_MSG_RECV_BITMAP, len + 1, buf);
850
851     /*
852      * Next, we dump the received bitmap to the stream.
853      *
854      * TODO: currently we are safe since we are the only one using the
855      * to_src_file handle (the fault thread is still paused), so it's
856      * OK even without taking the mutex. However, the best approach is
857      * to take the lock before sending the message header, and release
858      * the lock after sending the bitmap.
859      */
860     qemu_mutex_lock(&mis->rp_mutex);
861     res = ramblock_recv_bitmap_send(mis->to_src_file, block_name);
862     qemu_mutex_unlock(&mis->rp_mutex);
863
864     trace_migrate_send_rp_recv_bitmap(block_name, res);
865 }
866
867 void migrate_send_rp_resume_ack(MigrationIncomingState *mis, uint32_t value)
868 {
869     uint32_t buf;
870
871     buf = cpu_to_be32(value);
872     migrate_send_rp_message(mis, MIG_RP_MSG_RESUME_ACK, sizeof(buf), &buf);
873 }
874
875 MigrationCapabilityStatusList *qmp_query_migrate_capabilities(Error **errp)
876 {
877     MigrationCapabilityStatusList *head = NULL, **tail = &head;
878     MigrationCapabilityStatus *caps;
879     MigrationState *s = migrate_get_current();
880     int i;
881
882     for (i = 0; i < MIGRATION_CAPABILITY__MAX; i++) {
883 #ifndef CONFIG_LIVE_BLOCK_MIGRATION
884         if (i == MIGRATION_CAPABILITY_BLOCK) {
885             continue;
886         }
887 #endif
888         caps = g_malloc0(sizeof(*caps));
889         caps->capability = i;
890         caps->state = s->enabled_capabilities[i];
891         QAPI_LIST_APPEND(tail, caps);
892     }
893
894     return head;
895 }
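
/*
 * A sketch of the QMP exchange this implements (abridged example
 * output; the states reported depend on configuration):
 *
 *   -> { "execute": "query-migrate-capabilities" }
 *   <- { "return": [ { "capability": "xbzrle", "state": false },
 *                    { "capability": "postcopy-ram", "state": false },
 *                    ... ] }
 */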
896
897 MigrationParameters *qmp_query_migrate_parameters(Error **errp)
898 {
899     MigrationParameters *params;
900     MigrationState *s = migrate_get_current();
901
902     /* TODO use QAPI_CLONE() instead of duplicating it inline */
903     params = g_malloc0(sizeof(*params));
904     params->has_compress_level = true;
905     params->compress_level = s->parameters.compress_level;
906     params->has_compress_threads = true;
907     params->compress_threads = s->parameters.compress_threads;
908     params->has_compress_wait_thread = true;
909     params->compress_wait_thread = s->parameters.compress_wait_thread;
910     params->has_decompress_threads = true;
911     params->decompress_threads = s->parameters.decompress_threads;
912     params->has_throttle_trigger_threshold = true;
913     params->throttle_trigger_threshold = s->parameters.throttle_trigger_threshold;
914     params->has_cpu_throttle_initial = true;
915     params->cpu_throttle_initial = s->parameters.cpu_throttle_initial;
916     params->has_cpu_throttle_increment = true;
917     params->cpu_throttle_increment = s->parameters.cpu_throttle_increment;
918     params->has_cpu_throttle_tailslow = true;
919     params->cpu_throttle_tailslow = s->parameters.cpu_throttle_tailslow;
920     params->has_tls_creds = true;
921     params->tls_creds = g_strdup(s->parameters.tls_creds);
922     params->has_tls_hostname = true;
923     params->tls_hostname = g_strdup(s->parameters.tls_hostname);
924     params->has_tls_authz = true;
925     params->tls_authz = g_strdup(s->parameters.tls_authz ?
926                                  s->parameters.tls_authz : "");
927     params->has_max_bandwidth = true;
928     params->max_bandwidth = s->parameters.max_bandwidth;
929     params->has_downtime_limit = true;
930     params->downtime_limit = s->parameters.downtime_limit;
931     params->has_x_checkpoint_delay = true;
932     params->x_checkpoint_delay = s->parameters.x_checkpoint_delay;
933     params->has_block_incremental = true;
934     params->block_incremental = s->parameters.block_incremental;
935     params->has_multifd_channels = true;
936     params->multifd_channels = s->parameters.multifd_channels;
937     params->has_multifd_compression = true;
938     params->multifd_compression = s->parameters.multifd_compression;
939     params->has_multifd_zlib_level = true;
940     params->multifd_zlib_level = s->parameters.multifd_zlib_level;
941     params->has_multifd_zstd_level = true;
942     params->multifd_zstd_level = s->parameters.multifd_zstd_level;
943     params->has_xbzrle_cache_size = true;
944     params->xbzrle_cache_size = s->parameters.xbzrle_cache_size;
945     params->has_max_postcopy_bandwidth = true;
946     params->max_postcopy_bandwidth = s->parameters.max_postcopy_bandwidth;
947     params->has_max_cpu_throttle = true;
948     params->max_cpu_throttle = s->parameters.max_cpu_throttle;
949     params->has_announce_initial = true;
950     params->announce_initial = s->parameters.announce_initial;
951     params->has_announce_max = true;
952     params->announce_max = s->parameters.announce_max;
953     params->has_announce_rounds = true;
954     params->announce_rounds = s->parameters.announce_rounds;
955     params->has_announce_step = true;
956     params->announce_step = s->parameters.announce_step;
957
958     if (s->parameters.has_block_bitmap_mapping) {
959         params->has_block_bitmap_mapping = true;
960         params->block_bitmap_mapping =
961             QAPI_CLONE(BitmapMigrationNodeAliasList,
962                        s->parameters.block_bitmap_mapping);
963     }
964
965     return params;
966 }
967
968 AnnounceParameters *migrate_announce_params(void)
969 {
970     static AnnounceParameters ap;
971
972     MigrationState *s = migrate_get_current();
973
974     ap.initial = s->parameters.announce_initial;
975     ap.max = s->parameters.announce_max;
976     ap.rounds = s->parameters.announce_rounds;
977     ap.step = s->parameters.announce_step;
978
979     return &ap;
980 }
981
982 /*
983  * Return true if we're already in the middle of a migration
984  * (i.e. any of the active or setup states)
985  */
986 bool migration_is_setup_or_active(int state)
987 {
988     switch (state) {
989     case MIGRATION_STATUS_ACTIVE:
990     case MIGRATION_STATUS_POSTCOPY_ACTIVE:
991     case MIGRATION_STATUS_POSTCOPY_PAUSED:
992     case MIGRATION_STATUS_POSTCOPY_RECOVER:
993     case MIGRATION_STATUS_SETUP:
994     case MIGRATION_STATUS_PRE_SWITCHOVER:
995     case MIGRATION_STATUS_DEVICE:
996     case MIGRATION_STATUS_WAIT_UNPLUG:
997     case MIGRATION_STATUS_COLO:
998         return true;
999
1000     default:
1001         return false;
1002
1003     }
1004 }
1005
1006 bool migration_is_running(int state)
1007 {
1008     switch (state) {
1009     case MIGRATION_STATUS_ACTIVE:
1010     case MIGRATION_STATUS_POSTCOPY_ACTIVE:
1011     case MIGRATION_STATUS_POSTCOPY_PAUSED:
1012     case MIGRATION_STATUS_POSTCOPY_RECOVER:
1013     case MIGRATION_STATUS_SETUP:
1014     case MIGRATION_STATUS_PRE_SWITCHOVER:
1015     case MIGRATION_STATUS_DEVICE:
1016     case MIGRATION_STATUS_WAIT_UNPLUG:
1017     case MIGRATION_STATUS_CANCELLING:
1018         return true;
1019
1020     default:
1021         return false;
1022
1023     }
1024 }
1025
1026 static void populate_time_info(MigrationInfo *info, MigrationState *s)
1027 {
1028     info->has_status = true;
1029     info->has_setup_time = true;
1030     info->setup_time = s->setup_time;
1031     if (s->state == MIGRATION_STATUS_COMPLETED) {
1032         info->has_total_time = true;
1033         info->total_time = s->total_time;
1034         info->has_downtime = true;
1035         info->downtime = s->downtime;
1036     } else {
1037         info->has_total_time = true;
1038         info->total_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME) -
1039                            s->start_time;
1040         info->has_expected_downtime = true;
1041         info->expected_downtime = s->expected_downtime;
1042     }
1043 }
1044
1045 static void populate_ram_info(MigrationInfo *info, MigrationState *s)
1046 {
1047     size_t page_size = qemu_target_page_size();
1048
1049     info->has_ram = true;
1050     info->ram = g_malloc0(sizeof(*info->ram));
1051     info->ram->transferred = ram_counters.transferred;
1052     info->ram->total = ram_bytes_total();
1053     info->ram->duplicate = ram_counters.duplicate;
1054     /* legacy value.  It is not used anymore */
1055     info->ram->skipped = 0;
1056     info->ram->normal = ram_counters.normal;
1057     info->ram->normal_bytes = ram_counters.normal * page_size;
1058     info->ram->mbps = s->mbps;
1059     info->ram->dirty_sync_count = ram_counters.dirty_sync_count;
1060     info->ram->postcopy_requests = ram_counters.postcopy_requests;
1061     info->ram->page_size = page_size;
1062     info->ram->multifd_bytes = ram_counters.multifd_bytes;
1063     info->ram->pages_per_second = s->pages_per_second;
1064     info->ram->precopy_bytes = ram_counters.precopy_bytes;
1065     info->ram->downtime_bytes = ram_counters.downtime_bytes;
1066     info->ram->postcopy_bytes = ram_counters.postcopy_bytes;
1067
1068     if (migrate_use_xbzrle()) {
1069         info->has_xbzrle_cache = true;
1070         info->xbzrle_cache = g_malloc0(sizeof(*info->xbzrle_cache));
1071         info->xbzrle_cache->cache_size = migrate_xbzrle_cache_size();
1072         info->xbzrle_cache->bytes = xbzrle_counters.bytes;
1073         info->xbzrle_cache->pages = xbzrle_counters.pages;
1074         info->xbzrle_cache->cache_miss = xbzrle_counters.cache_miss;
1075         info->xbzrle_cache->cache_miss_rate = xbzrle_counters.cache_miss_rate;
1076         info->xbzrle_cache->encoding_rate = xbzrle_counters.encoding_rate;
1077         info->xbzrle_cache->overflow = xbzrle_counters.overflow;
1078     }
1079
1080     if (migrate_use_compression()) {
1081         info->has_compression = true;
1082         info->compression = g_malloc0(sizeof(*info->compression));
1083         info->compression->pages = compression_counters.pages;
1084         info->compression->busy = compression_counters.busy;
1085         info->compression->busy_rate = compression_counters.busy_rate;
1086         info->compression->compressed_size =
1087                                     compression_counters.compressed_size;
1088         info->compression->compression_rate =
1089                                     compression_counters.compression_rate;
1090     }
1091
1092     if (cpu_throttle_active()) {
1093         info->has_cpu_throttle_percentage = true;
1094         info->cpu_throttle_percentage = cpu_throttle_get_percentage();
1095     }
1096
1097     if (s->state != MIGRATION_STATUS_COMPLETED) {
1098         info->ram->remaining = ram_bytes_remaining();
1099         info->ram->dirty_pages_rate = ram_counters.dirty_pages_rate;
1100     }
1101 }
1102
1103 static void populate_disk_info(MigrationInfo *info)
1104 {
1105     if (blk_mig_active()) {
1106         info->has_disk = true;
1107         info->disk = g_malloc0(sizeof(*info->disk));
1108         info->disk->transferred = blk_mig_bytes_transferred();
1109         info->disk->remaining = blk_mig_bytes_remaining();
1110         info->disk->total = blk_mig_bytes_total();
1111     }
1112 }
1113
1114 static void fill_source_migration_info(MigrationInfo *info)
1115 {
1116     MigrationState *s = migrate_get_current();
1117     int state = qatomic_read(&s->state);
1118     GSList *cur_blocker = migration_blockers;
1119
1120     info->blocked_reasons = NULL;
1121
1122     /*
1123      * There are two types of reasons a migration might be blocked:
1124      * a) devices marked in VMState as non-migratable, and
1125      * b) explicit migration blockers.
1126      * We need to add both of them here.
1127      */
1128     qemu_savevm_non_migratable_list(&info->blocked_reasons);
1129
1130     while (cur_blocker) {
1131         QAPI_LIST_PREPEND(info->blocked_reasons,
1132                           g_strdup(error_get_pretty(cur_blocker->data)));
1133         cur_blocker = g_slist_next(cur_blocker);
1134     }
1135     info->has_blocked_reasons = info->blocked_reasons != NULL;
1136
1137     switch (state) {
1138     case MIGRATION_STATUS_NONE:
1139         /* no migration has happened ever */
1140         /* do not overwrite destination migration status */
1141         return;
1142     case MIGRATION_STATUS_SETUP:
1143         info->has_status = true;
1144         info->has_total_time = false;
1145         break;
1146     case MIGRATION_STATUS_ACTIVE:
1147     case MIGRATION_STATUS_CANCELLING:
1148     case MIGRATION_STATUS_POSTCOPY_ACTIVE:
1149     case MIGRATION_STATUS_PRE_SWITCHOVER:
1150     case MIGRATION_STATUS_DEVICE:
1151     case MIGRATION_STATUS_POSTCOPY_PAUSED:
1152     case MIGRATION_STATUS_POSTCOPY_RECOVER:
1153         /* TODO add some postcopy stats */
1154         populate_time_info(info, s);
1155         populate_ram_info(info, s);
1156         populate_disk_info(info);
1157         populate_vfio_info(info);
1158         break;
1159     case MIGRATION_STATUS_COLO:
1160         info->has_status = true;
1161         /* TODO: display COLO specific information (checkpoint info etc.) */
1162         break;
1163     case MIGRATION_STATUS_COMPLETED:
1164         populate_time_info(info, s);
1165         populate_ram_info(info, s);
1166         populate_vfio_info(info);
1167         break;
1168     case MIGRATION_STATUS_FAILED:
1169         info->has_status = true;
1170         if (s->error) {
1171             info->has_error_desc = true;
1172             info->error_desc = g_strdup(error_get_pretty(s->error));
1173         }
1174         break;
1175     case MIGRATION_STATUS_CANCELLED:
1176         info->has_status = true;
1177         break;
1178     case MIGRATION_STATUS_WAIT_UNPLUG:
1179         info->has_status = true;
1180         break;
1181     }
1182     info->status = state;
1183 }
1184
1185 typedef enum WriteTrackingSupport {
1186     WT_SUPPORT_UNKNOWN = 0,
1187     WT_SUPPORT_ABSENT,
1188     WT_SUPPORT_AVAILABLE,
1189     WT_SUPPORT_COMPATIBLE
1190 } WriteTrackingSupport;
1191
1192 static
1193 WriteTrackingSupport migrate_query_write_tracking(void)
1194 {
1195     /* Check if kernel supports required UFFD features */
1196     if (!ram_write_tracking_available()) {
1197         return WT_SUPPORT_ABSENT;
1198     }
1199     /*
1200      * Check if current memory configuration is
1201      * compatible with required UFFD features.
1202      */
1203     if (!ram_write_tracking_compatible()) {
1204         return WT_SUPPORT_AVAILABLE;
1205     }
1206
1207     return WT_SUPPORT_COMPATIBLE;
1208 }
1209
1210 /**
1211  * @migrate_caps_check - check capability validity
1212  *
1213  * @cap_list: old capability list, array of bool
1214  * @params: new capabilities to be applied soon
1215  * @errp: set *errp if the check failed, with reason
1216  *
1217  * Returns true if check passed, otherwise false.
1218  */
1219 static bool migrate_caps_check(bool *cap_list,
1220                                MigrationCapabilityStatusList *params,
1221                                Error **errp)
1222 {
1223     MigrationCapabilityStatusList *cap;
1224     bool old_postcopy_cap;
1225     MigrationIncomingState *mis = migration_incoming_get_current();
1226
1227     old_postcopy_cap = cap_list[MIGRATION_CAPABILITY_POSTCOPY_RAM];
1228
1229     for (cap = params; cap; cap = cap->next) {
1230         cap_list[cap->value->capability] = cap->value->state;
1231     }
1232
1233 #ifndef CONFIG_LIVE_BLOCK_MIGRATION
1234     if (cap_list[MIGRATION_CAPABILITY_BLOCK]) {
1235         error_setg(errp, "QEMU compiled without old-style (blk/-b, inc/-i) "
1236                    "block migration");
1237         error_append_hint(errp, "Use drive_mirror+NBD instead.\n");
1238         return false;
1239     }
1240 #endif
1241
1242 #ifndef CONFIG_REPLICATION
1243     if (cap_list[MIGRATION_CAPABILITY_X_COLO]) {
1244         error_setg(errp, "QEMU compiled without replication module"
1245                    " can't enable COLO");
1246         error_append_hint(errp, "Please enable replication before COLO.\n");
1247         return false;
1248     }
1249 #endif
1250
1251     if (cap_list[MIGRATION_CAPABILITY_POSTCOPY_RAM]) {
1252         /* This check is reasonably expensive, so do it only when the
1253          * capability is first set; also, it's only the destination that
1254          * needs special support.
1255          */
1256         if (!old_postcopy_cap && runstate_check(RUN_STATE_INMIGRATE) &&
1257             !postcopy_ram_supported_by_host(mis)) {
1258             /* postcopy_ram_supported_by_host will have emitted a more
1259              * detailed message
1260              */
1261             error_setg(errp, "Postcopy is not supported");
1262             return false;
1263         }
1264
1265         if (cap_list[MIGRATION_CAPABILITY_X_IGNORE_SHARED]) {
1266             error_setg(errp, "Postcopy is not compatible with ignore-shared");
1267             return false;
1268         }
1269     }
1270
1271     if (cap_list[MIGRATION_CAPABILITY_BACKGROUND_SNAPSHOT]) {
1272         WriteTrackingSupport wt_support;
1273         int idx;
1274         /*
1275          * Check if 'background-snapshot' capability is supported by
1276          * host kernel and compatible with guest memory configuration.
1277          */
1278         wt_support = migrate_query_write_tracking();
1279         if (wt_support < WT_SUPPORT_AVAILABLE) {
1280             error_setg(errp, "Background-snapshot is not supported by host kernel");
1281             return false;
1282         }
1283         if (wt_support < WT_SUPPORT_COMPATIBLE) {
1284             error_setg(errp, "Background-snapshot is not compatible "
1285                     "with guest memory configuration");
1286             return false;
1287         }
1288
1289         /*
1290          * Check if there are any migration capabilities
1291          * incompatible with 'background-snapshot'.
1292          */
1293         for (idx = 0; idx < check_caps_background_snapshot.size; idx++) {
1294             int incomp_cap = check_caps_background_snapshot.caps[idx];
1295             if (cap_list[incomp_cap]) {
1296                 error_setg(errp,
1297                         "Background-snapshot is not compatible with %s",
1298                         MigrationCapability_str(incomp_cap));
1299                 return false;
1300             }
1301         }
1302     }
1303
1304 #ifdef CONFIG_LINUX
1305     if (cap_list[MIGRATION_CAPABILITY_ZERO_COPY_SEND] &&
1306         (!cap_list[MIGRATION_CAPABILITY_MULTIFD] ||
1307          migrate_use_compression() ||
1308          migrate_use_tls())) {
1309         error_setg(errp,
1310                    "Zero copy only available for non-compressed non-TLS multifd migration");
1311         return false;
1312     }
1313 #else
1314     if (cap_list[MIGRATION_CAPABILITY_ZERO_COPY_SEND]) {
1315         error_setg(errp,
1316                    "Zero copy currently only available on Linux");
1317         return false;
1318     }
1319 #endif
1320
1321
1322     /* incoming side only */
1323     if (runstate_check(RUN_STATE_INMIGRATE) &&
1324         !migrate_multi_channels_is_allowed() &&
1325         cap_list[MIGRATION_CAPABILITY_MULTIFD]) {
1326         error_setg(errp, "multifd is not supported by current protocol");
1327         return false;
1328     }
1329
1330     if (cap_list[MIGRATION_CAPABILITY_POSTCOPY_PREEMPT]) {
1331         if (!cap_list[MIGRATION_CAPABILITY_POSTCOPY_RAM]) {
1332             error_setg(errp, "Postcopy preempt requires postcopy-ram");
1333             return false;
1334         }
1335     }
1336
1337     return true;
1338 }
1339
1340 static void fill_destination_migration_info(MigrationInfo *info)
1341 {
1342     MigrationIncomingState *mis = migration_incoming_get_current();
1343
1344     if (mis->socket_address_list) {
1345         info->has_socket_address = true;
1346         info->socket_address =
1347             QAPI_CLONE(SocketAddressList, mis->socket_address_list);
1348     }
1349
1350     switch (mis->state) {
1351     case MIGRATION_STATUS_NONE:
1352         return;
1353     case MIGRATION_STATUS_SETUP:
1354     case MIGRATION_STATUS_CANCELLING:
1355     case MIGRATION_STATUS_CANCELLED:
1356     case MIGRATION_STATUS_ACTIVE:
1357     case MIGRATION_STATUS_POSTCOPY_ACTIVE:
1358     case MIGRATION_STATUS_POSTCOPY_PAUSED:
1359     case MIGRATION_STATUS_POSTCOPY_RECOVER:
1360     case MIGRATION_STATUS_FAILED:
1361     case MIGRATION_STATUS_COLO:
1362         info->has_status = true;
1363         break;
1364     case MIGRATION_STATUS_COMPLETED:
1365         info->has_status = true;
1366         fill_destination_postcopy_migration_info(info);
1367         break;
1368     }
1369     info->status = mis->state;
1370 }
1371
1372 MigrationInfo *qmp_query_migrate(Error **errp)
1373 {
1374     MigrationInfo *info = g_malloc0(sizeof(*info));
1375
1376     fill_destination_migration_info(info);
1377     fill_source_migration_info(info);
1378
1379     return info;
1380 }
1381
1382 void qmp_migrate_set_capabilities(MigrationCapabilityStatusList *params,
1383                                   Error **errp)
1384 {
1385     MigrationState *s = migrate_get_current();
1386     MigrationCapabilityStatusList *cap;
1387     bool cap_list[MIGRATION_CAPABILITY__MAX];
1388
1389     if (migration_is_running(s->state)) {
1390         error_setg(errp, QERR_MIGRATION_ACTIVE);
1391         return;
1392     }
1393
1394     memcpy(cap_list, s->enabled_capabilities, sizeof(cap_list));
1395     if (!migrate_caps_check(cap_list, params, errp)) {
1396         return;
1397     }
1398
1399     for (cap = params; cap; cap = cap->next) {
1400         s->enabled_capabilities[cap->value->capability] = cap->value->state;
1401     }
1402 }
1403
1404 /*
1405  * Check whether the parameters are valid. Errors will be put into errp
1406  * (if provided). Returns true if valid, otherwise false.
1407  */
1408 static bool migrate_params_check(MigrationParameters *params, Error **errp)
1409 {
1410     if (params->has_compress_level &&
1411         (params->compress_level > 9)) {
1412         error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "compress_level",
1413                    "a value between 0 and 9");
1414         return false;
1415     }
1416
1417     if (params->has_compress_threads && (params->compress_threads < 1)) {
1418         error_setg(errp, QERR_INVALID_PARAMETER_VALUE,
1419                    "compress_threads",
1420                    "a value between 1 and 255");
1421         return false;
1422     }
1423
1424     if (params->has_decompress_threads && (params->decompress_threads < 1)) {
1425         error_setg(errp, QERR_INVALID_PARAMETER_VALUE,
1426                    "decompress_threads",
1427                    "a value between 1 and 255");
1428         return false;
1429     }
1430
1431     if (params->has_throttle_trigger_threshold &&
1432         (params->throttle_trigger_threshold < 1 ||
1433          params->throttle_trigger_threshold > 100)) {
1434         error_setg(errp, QERR_INVALID_PARAMETER_VALUE,
1435                    "throttle_trigger_threshold",
1436                    "an integer in the range of 1 to 100");
1437         return false;
1438     }
1439
1440     if (params->has_cpu_throttle_initial &&
1441         (params->cpu_throttle_initial < 1 ||
1442          params->cpu_throttle_initial > 99)) {
1443         error_setg(errp, QERR_INVALID_PARAMETER_VALUE,
1444                    "cpu_throttle_initial",
1445                    "an integer in the range of 1 to 99");
1446         return false;
1447     }
1448
1449     if (params->has_cpu_throttle_increment &&
1450         (params->cpu_throttle_increment < 1 ||
1451          params->cpu_throttle_increment > 99)) {
1452         error_setg(errp, QERR_INVALID_PARAMETER_VALUE,
1453                    "cpu_throttle_increment",
1454                    "an integer in the range of 1 to 99");
1455         return false;
1456     }
1457
1458     if (params->has_max_bandwidth && (params->max_bandwidth > SIZE_MAX)) {
1459         error_setg(errp, QERR_INVALID_PARAMETER_VALUE,
1460                    "max_bandwidth",
1461                    "an integer in the range of 0 to "stringify(SIZE_MAX)
1462                    " bytes/second");
1463         return false;
1464     }
1465
1466     if (params->has_downtime_limit &&
1467         (params->downtime_limit > MAX_MIGRATE_DOWNTIME)) {
1468         error_setg(errp, QERR_INVALID_PARAMETER_VALUE,
1469                    "downtime_limit",
1470                    "an integer in the range of 0 to "
1471                     stringify(MAX_MIGRATE_DOWNTIME)" ms");
1472         return false;
1473     }
1474
1475     /* x_checkpoint_delay is now always positive */
1476
1477     if (params->has_multifd_channels && (params->multifd_channels < 1)) {
1478         error_setg(errp, QERR_INVALID_PARAMETER_VALUE,
1479                    "multifd_channels",
1480                    "a value between 1 and 255");
1481         return false;
1482     }
1483
1484     if (params->has_multifd_zlib_level &&
1485         (params->multifd_zlib_level > 9)) {
1486         error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "multifd_zlib_level",
1487                    "a value between 0 and 9");
1488         return false;
1489     }
1490
1491     if (params->has_multifd_zstd_level &&
1492         (params->multifd_zstd_level > 20)) {
1493         error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "multifd_zstd_level",
1494                    "a value between 0 and 20");
1495         return false;
1496     }
1497
1498     if (params->has_xbzrle_cache_size &&
1499         (params->xbzrle_cache_size < qemu_target_page_size() ||
1500          !is_power_of_2(params->xbzrle_cache_size))) {
1501         error_setg(errp, QERR_INVALID_PARAMETER_VALUE,
1502                    "xbzrle_cache_size",
1503                    "a power of two no less than the target page size");
1504         return false;
1505     }
1506
1507     if (params->has_max_cpu_throttle &&
1508         (params->max_cpu_throttle < params->cpu_throttle_initial ||
1509          params->max_cpu_throttle > 99)) {
1510         error_setg(errp, QERR_INVALID_PARAMETER_VALUE,
1511                    "max_cpu_throttle",
1512                    "an integer in the range of cpu_throttle_initial to 99");
1513         return false;
1514     }
1515
1516     if (params->has_announce_initial &&
1517         params->announce_initial > 100000) {
1518         error_setg(errp, QERR_INVALID_PARAMETER_VALUE,
1519                    "announce_initial",
1520                    "a value between 0 and 100000");
1521         return false;
1522     }
1523     if (params->has_announce_max &&
1524         params->announce_max > 100000) {
1525         error_setg(errp, QERR_INVALID_PARAMETER_VALUE,
1526                    "announce_max",
1527                    "a value between 0 and 100000");
1528         return false;
1529     }
1530     if (params->has_announce_rounds &&
1531         params->announce_rounds > 1000) {
1532         error_setg(errp, QERR_INVALID_PARAMETER_VALUE,
1533                    "announce_rounds",
1534                    "a value between 0 and 1000");
1535         return false;
1536     }
1537     if (params->has_announce_step &&
1538         (params->announce_step < 1 ||
1539         params->announce_step > 10000)) {
1540         error_setg(errp, QERR_INVALID_PARAMETER_VALUE,
1541                    "announce_step",
1542                    "a value between 1 and 10000");
1543         return false;
1544     }
1545
1546     if (params->has_block_bitmap_mapping &&
1547         !check_dirty_bitmap_mig_alias_map(params->block_bitmap_mapping, errp)) {
1548         error_prepend(errp, "Invalid mapping given for block-bitmap-mapping: ");
1549         return false;
1550     }
1551     return true;
1552 }
1553
1554 static void migrate_params_test_apply(MigrateSetParameters *params,
1555                                       MigrationParameters *dest)
1556 {
1557     *dest = migrate_get_current()->parameters;
1558
1559     /* TODO use QAPI_CLONE() instead of duplicating it inline */
1560
1561     if (params->has_compress_level) {
1562         dest->compress_level = params->compress_level;
1563     }
1564
1565     if (params->has_compress_threads) {
1566         dest->compress_threads = params->compress_threads;
1567     }
1568
1569     if (params->has_compress_wait_thread) {
1570         dest->compress_wait_thread = params->compress_wait_thread;
1571     }
1572
1573     if (params->has_decompress_threads) {
1574         dest->decompress_threads = params->decompress_threads;
1575     }
1576
1577     if (params->has_throttle_trigger_threshold) {
1578         dest->throttle_trigger_threshold = params->throttle_trigger_threshold;
1579     }
1580
1581     if (params->has_cpu_throttle_initial) {
1582         dest->cpu_throttle_initial = params->cpu_throttle_initial;
1583     }
1584
1585     if (params->has_cpu_throttle_increment) {
1586         dest->cpu_throttle_increment = params->cpu_throttle_increment;
1587     }
1588
1589     if (params->has_cpu_throttle_tailslow) {
1590         dest->cpu_throttle_tailslow = params->cpu_throttle_tailslow;
1591     }
1592
1593     if (params->has_tls_creds) {
1594         assert(params->tls_creds->type == QTYPE_QSTRING);
1595         dest->tls_creds = params->tls_creds->u.s;
1596     }
1597
1598     if (params->has_tls_hostname) {
1599         assert(params->tls_hostname->type == QTYPE_QSTRING);
1600         dest->tls_hostname = params->tls_hostname->u.s;
1601     }
1602
1603     if (params->has_max_bandwidth) {
1604         dest->max_bandwidth = params->max_bandwidth;
1605     }
1606
1607     if (params->has_downtime_limit) {
1608         dest->downtime_limit = params->downtime_limit;
1609     }
1610
1611     if (params->has_x_checkpoint_delay) {
1612         dest->x_checkpoint_delay = params->x_checkpoint_delay;
1613     }
1614
1615     if (params->has_block_incremental) {
1616         dest->block_incremental = params->block_incremental;
1617     }
1618     if (params->has_multifd_channels) {
1619         dest->multifd_channels = params->multifd_channels;
1620     }
1621     if (params->has_multifd_compression) {
1622         dest->multifd_compression = params->multifd_compression;
1623     }
1624     if (params->has_xbzrle_cache_size) {
1625         dest->xbzrle_cache_size = params->xbzrle_cache_size;
1626     }
1627     if (params->has_max_postcopy_bandwidth) {
1628         dest->max_postcopy_bandwidth = params->max_postcopy_bandwidth;
1629     }
1630     if (params->has_max_cpu_throttle) {
1631         dest->max_cpu_throttle = params->max_cpu_throttle;
1632     }
1633     if (params->has_announce_initial) {
1634         dest->announce_initial = params->announce_initial;
1635     }
1636     if (params->has_announce_max) {
1637         dest->announce_max = params->announce_max;
1638     }
1639     if (params->has_announce_rounds) {
1640         dest->announce_rounds = params->announce_rounds;
1641     }
1642     if (params->has_announce_step) {
1643         dest->announce_step = params->announce_step;
1644     }
1645
1646     if (params->has_block_bitmap_mapping) {
1647         dest->has_block_bitmap_mapping = true;
1648         dest->block_bitmap_mapping = params->block_bitmap_mapping;
1649     }
1650 }
1651
1652 static void migrate_params_apply(MigrateSetParameters *params, Error **errp)
1653 {
1654     MigrationState *s = migrate_get_current();
1655
1656     /* TODO use QAPI_CLONE() instead of duplicating it inline */
1657
1658     if (params->has_compress_level) {
1659         s->parameters.compress_level = params->compress_level;
1660     }
1661
1662     if (params->has_compress_threads) {
1663         s->parameters.compress_threads = params->compress_threads;
1664     }
1665
1666     if (params->has_compress_wait_thread) {
1667         s->parameters.compress_wait_thread = params->compress_wait_thread;
1668     }
1669
1670     if (params->has_decompress_threads) {
1671         s->parameters.decompress_threads = params->decompress_threads;
1672     }
1673
1674     if (params->has_throttle_trigger_threshold) {
1675         s->parameters.throttle_trigger_threshold = params->throttle_trigger_threshold;
1676     }
1677
1678     if (params->has_cpu_throttle_initial) {
1679         s->parameters.cpu_throttle_initial = params->cpu_throttle_initial;
1680     }
1681
1682     if (params->has_cpu_throttle_increment) {
1683         s->parameters.cpu_throttle_increment = params->cpu_throttle_increment;
1684     }
1685
1686     if (params->has_cpu_throttle_tailslow) {
1687         s->parameters.cpu_throttle_tailslow = params->cpu_throttle_tailslow;
1688     }
1689
1690     if (params->has_tls_creds) {
1691         g_free(s->parameters.tls_creds);
1692         assert(params->tls_creds->type == QTYPE_QSTRING);
1693         s->parameters.tls_creds = g_strdup(params->tls_creds->u.s);
1694     }
1695
1696     if (params->has_tls_hostname) {
1697         g_free(s->parameters.tls_hostname);
1698         assert(params->tls_hostname->type == QTYPE_QSTRING);
1699         s->parameters.tls_hostname = g_strdup(params->tls_hostname->u.s);
1700     }
1701
1702     if (params->has_tls_authz) {
1703         g_free(s->parameters.tls_authz);
1704         assert(params->tls_authz->type == QTYPE_QSTRING);
1705         s->parameters.tls_authz = g_strdup(params->tls_authz->u.s);
1706     }
1707
1708     if (params->has_max_bandwidth) {
1709         s->parameters.max_bandwidth = params->max_bandwidth;
1710         if (s->to_dst_file && !migration_in_postcopy()) {
1711             qemu_file_set_rate_limit(s->to_dst_file,
1712                                 s->parameters.max_bandwidth / XFER_LIMIT_RATIO);
1713         }
1714     }
1715
1716     if (params->has_downtime_limit) {
1717         s->parameters.downtime_limit = params->downtime_limit;
1718     }
1719
1720     if (params->has_x_checkpoint_delay) {
1721         s->parameters.x_checkpoint_delay = params->x_checkpoint_delay;
1722         if (migration_in_colo_state()) {
1723             colo_checkpoint_notify(s);
1724         }
1725     }
1726
1727     if (params->has_block_incremental) {
1728         s->parameters.block_incremental = params->block_incremental;
1729     }
1730     if (params->has_multifd_channels) {
1731         s->parameters.multifd_channels = params->multifd_channels;
1732     }
1733     if (params->has_multifd_compression) {
1734         s->parameters.multifd_compression = params->multifd_compression;
1735     }
1736     if (params->has_xbzrle_cache_size) {
1737         s->parameters.xbzrle_cache_size = params->xbzrle_cache_size;
1738         xbzrle_cache_resize(params->xbzrle_cache_size, errp);
1739     }
1740     if (params->has_max_postcopy_bandwidth) {
1741         s->parameters.max_postcopy_bandwidth = params->max_postcopy_bandwidth;
1742         if (s->to_dst_file && migration_in_postcopy()) {
1743             qemu_file_set_rate_limit(s->to_dst_file,
1744                     s->parameters.max_postcopy_bandwidth / XFER_LIMIT_RATIO);
1745         }
1746     }
1747     if (params->has_max_cpu_throttle) {
1748         s->parameters.max_cpu_throttle = params->max_cpu_throttle;
1749     }
1750     if (params->has_announce_initial) {
1751         s->parameters.announce_initial = params->announce_initial;
1752     }
1753     if (params->has_announce_max) {
1754         s->parameters.announce_max = params->announce_max;
1755     }
1756     if (params->has_announce_rounds) {
1757         s->parameters.announce_rounds = params->announce_rounds;
1758     }
1759     if (params->has_announce_step) {
1760         s->parameters.announce_step = params->announce_step;
1761     }
1762
1763     if (params->has_block_bitmap_mapping) {
1764         qapi_free_BitmapMigrationNodeAliasList(
1765             s->parameters.block_bitmap_mapping);
1766
1767         s->parameters.has_block_bitmap_mapping = true;
1768         s->parameters.block_bitmap_mapping =
1769             QAPI_CLONE(BitmapMigrationNodeAliasList,
1770                        params->block_bitmap_mapping);
1771     }
1772 }
1773
1774 void qmp_migrate_set_parameters(MigrateSetParameters *params, Error **errp)
1775 {
1776     MigrationParameters tmp;
1777
1778     /* TODO Rewrite "" to null instead */
1779     if (params->has_tls_creds
1780         && params->tls_creds->type == QTYPE_QNULL) {
1781         qobject_unref(params->tls_creds->u.n);
1782         params->tls_creds->type = QTYPE_QSTRING;
1783         params->tls_creds->u.s = g_strdup("");
1784     }
1785     /* TODO Rewrite "" to null instead */
1786     if (params->has_tls_hostname
1787         && params->tls_hostname->type == QTYPE_QNULL) {
1788         qobject_unref(params->tls_hostname->u.n);
1789         params->tls_hostname->type = QTYPE_QSTRING;
1790         params->tls_hostname->u.s = g_strdup("");
1791     }
1792
1793     migrate_params_test_apply(params, &tmp);
1794
1795     if (!migrate_params_check(&tmp, errp)) {
1796         /* Invalid parameter */
1797         return;
1798     }
1799
1800     migrate_params_apply(params, errp);
1801 }
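
/*
 * Illustrative sketch (not part of the original file, and not called
 * anywhere): a C caller sets only the fields it cares about and relies on
 * the has_* flags, e.g. to cap bandwidth at 100 MiB/s and allow up to
 * 500 ms of downtime.  The values are arbitrary examples.
 */
G_GNUC_UNUSED static void example_set_parameters(Error **errp)
{
    MigrateSetParameters p = {
        .has_max_bandwidth = true,
        .max_bandwidth = 100 * 1024 * 1024,   /* bytes/second */
        .has_downtime_limit = true,
        .downtime_limit = 500,                /* milliseconds */
    };

    /* Validation runs against a copy, so nothing is applied on error */
    qmp_migrate_set_parameters(&p, errp);
}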
1802
1803
1804 void qmp_migrate_start_postcopy(Error **errp)
1805 {
1806     MigrationState *s = migrate_get_current();
1807
1808     if (!migrate_postcopy()) {
1809         error_setg(errp, "Enable postcopy with migrate_set_capability before"
1810                          " the start of migration");
1811         return;
1812     }
1813
1814     if (s->state == MIGRATION_STATUS_NONE) {
1815         error_setg(errp, "Postcopy must be started after migration has been"
1816                          " started");
1817         return;
1818     }
1819     /*
1820      * We don't error if migration has already finished, since that
1821      * would be racy with issuing this command.
1822      */
1823     qatomic_set(&s->start_postcopy, true);
1824 }
1825
1826 /* shared migration helpers */
1827
1828 void migrate_set_state(int *state, int old_state, int new_state)
1829 {
1830     assert(new_state < MIGRATION_STATUS__MAX);
1831     if (qatomic_cmpxchg(state, old_state, new_state) == old_state) {
1832         trace_migrate_set_state(MigrationStatus_str(new_state));
1833         migrate_generate_event(new_state);
1834     }
1835 }
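
/*
 * Illustrative sketch (not part of the original file): because the helper
 * above uses a compare-and-swap, only the caller that still observes the
 * expected old state performs the transition and emits the event; a racing
 * repeat with a stale old state is a harmless no-op.
 */
G_GNUC_UNUSED static void example_set_state_race(MigrationState *s)
{
    /* Succeeds (and emits one event) only if we are still in SETUP */
    migrate_set_state(&s->state, MIGRATION_STATUS_SETUP,
                      MIGRATION_STATUS_ACTIVE);
    /* The old state no longer matches, so this call does nothing */
    migrate_set_state(&s->state, MIGRATION_STATUS_SETUP,
                      MIGRATION_STATUS_ACTIVE);
}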
1836
1837 static MigrationCapabilityStatus *migrate_cap_add(MigrationCapability index,
1838                                                   bool state)
1839 {
1840     MigrationCapabilityStatus *cap;
1841
1842     cap = g_new0(MigrationCapabilityStatus, 1);
1843     cap->capability = index;
1844     cap->state = state;
1845
1846     return cap;
1847 }
1848
1849 void migrate_set_block_enabled(bool value, Error **errp)
1850 {
1851     MigrationCapabilityStatusList *cap = NULL;
1852
1853     QAPI_LIST_PREPEND(cap, migrate_cap_add(MIGRATION_CAPABILITY_BLOCK, value));
1854     qmp_migrate_set_capabilities(cap, errp);
1855     qapi_free_MigrationCapabilityStatusList(cap);
1856 }
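
/*
 * Illustrative sketch (not part of the original file): the same
 * list-building pattern as above works for any capability, e.g. events:
 */
G_GNUC_UNUSED static void example_enable_events(Error **errp)
{
    MigrationCapabilityStatusList *cap = NULL;

    QAPI_LIST_PREPEND(cap, migrate_cap_add(MIGRATION_CAPABILITY_EVENTS, true));
    qmp_migrate_set_capabilities(cap, errp);
    qapi_free_MigrationCapabilityStatusList(cap);
}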
1857
1858 static void migrate_set_block_incremental(MigrationState *s, bool value)
1859 {
1860     s->parameters.block_incremental = value;
1861 }
1862
1863 static void block_cleanup_parameters(MigrationState *s)
1864 {
1865     if (s->must_remove_block_options) {
1866         /* setting to false can never fail */
1867         migrate_set_block_enabled(false, &error_abort);
1868         migrate_set_block_incremental(s, false);
1869         s->must_remove_block_options = false;
1870     }
1871 }
1872
1873 static void migrate_fd_cleanup(MigrationState *s)
1874 {
1875     qemu_bh_delete(s->cleanup_bh);
1876     s->cleanup_bh = NULL;
1877
1878     g_free(s->hostname);
1879     s->hostname = NULL;
1880
1881     qemu_savevm_state_cleanup();
1882
1883     if (s->to_dst_file) {
1884         QEMUFile *tmp;
1885
1886         trace_migrate_fd_cleanup();
1887         qemu_mutex_unlock_iothread();
1888         if (s->migration_thread_running) {
1889             qemu_thread_join(&s->thread);
1890             s->migration_thread_running = false;
1891         }
1892         qemu_mutex_lock_iothread();
1893
1894         multifd_save_cleanup();
1895         qemu_mutex_lock(&s->qemu_file_lock);
1896         tmp = s->to_dst_file;
1897         s->to_dst_file = NULL;
1898         qemu_mutex_unlock(&s->qemu_file_lock);
1899         /*
1900          * Close the file handle without the lock to make sure the
1901          * critical section won't block for long.
1902          */
1903         migration_ioc_unregister_yank_from_file(tmp);
1904         qemu_fclose(tmp);
1905     }
1906
1907     if (s->postcopy_qemufile_src) {
1908         migration_ioc_unregister_yank_from_file(s->postcopy_qemufile_src);
1909         qemu_fclose(s->postcopy_qemufile_src);
1910         s->postcopy_qemufile_src = NULL;
1911     }
1912
1913     assert(!migration_is_active(s));
1914
1915     if (s->state == MIGRATION_STATUS_CANCELLING) {
1916         migrate_set_state(&s->state, MIGRATION_STATUS_CANCELLING,
1917                           MIGRATION_STATUS_CANCELLED);
1918     }
1919
1920     if (s->error) {
1921         /* It is used by 'info migrate'.  We can't free it */
1922         error_report_err(error_copy(s->error));
1923     }
1924     notifier_list_notify(&migration_state_notifiers, s);
1925     block_cleanup_parameters(s);
1926     yank_unregister_instance(MIGRATION_YANK_INSTANCE);
1927 }
1928
1929 static void migrate_fd_cleanup_schedule(MigrationState *s)
1930 {
1931     /*
1932      * Ref the state for the bh, because the bh may run when
1933      * there are no other refs left.
1934      */
1935     object_ref(OBJECT(s));
1936     qemu_bh_schedule(s->cleanup_bh);
1937 }
1938
1939 static void migrate_fd_cleanup_bh(void *opaque)
1940 {
1941     MigrationState *s = opaque;
1942     migrate_fd_cleanup(s);
1943     object_unref(OBJECT(s));
1944 }
1945
1946 void migrate_set_error(MigrationState *s, const Error *error)
1947 {
1948     QEMU_LOCK_GUARD(&s->error_mutex);
1949     if (!s->error) {
1950         s->error = error_copy(error);
1951     }
1952 }
1953
1954 static void migrate_error_free(MigrationState *s)
1955 {
1956     QEMU_LOCK_GUARD(&s->error_mutex);
1957     if (s->error) {
1958         error_free(s->error);
1959         s->error = NULL;
1960     }
1961 }
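
/*
 * Illustrative sketch (not part of the original file): migrate_set_error()
 * keeps only the first error and takes a copy, so callers may report
 * failures unconditionally and still own their Error object.
 */
G_GNUC_UNUSED static void example_record_error(MigrationState *s)
{
    Error *err = NULL;

    error_setg(&err, "example failure");
    migrate_set_error(s, err);  /* copied into s->error if none is set yet */
    error_free(err);            /* the original is still ours to free */
}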
1962
1963 void migrate_fd_error(MigrationState *s, const Error *error)
1964 {
1965     trace_migrate_fd_error(error_get_pretty(error));
1966     assert(s->to_dst_file == NULL);
1967     migrate_set_state(&s->state, MIGRATION_STATUS_SETUP,
1968                       MIGRATION_STATUS_FAILED);
1969     migrate_set_error(s, error);
1970 }
1971
1972 static void migrate_fd_cancel(MigrationState *s)
1973 {
1974     int old_state;
1975     QEMUFile *f = migrate_get_current()->to_dst_file;
1976     trace_migrate_fd_cancel();
1977
1978     WITH_QEMU_LOCK_GUARD(&s->qemu_file_lock) {
1979         if (s->rp_state.from_dst_file) {
1980             /* shut down the rp socket, causing the rp thread to exit */
1981             qemu_file_shutdown(s->rp_state.from_dst_file);
1982         }
1983     }
1984
1985     do {
1986         old_state = s->state;
1987         if (!migration_is_running(old_state)) {
1988             break;
1989         }
1990         /* If the migration is paused, kick it out of the pause */
1991         if (old_state == MIGRATION_STATUS_PRE_SWITCHOVER) {
1992             qemu_sem_post(&s->pause_sem);
1993         }
1994         migrate_set_state(&s->state, old_state, MIGRATION_STATUS_CANCELLING);
1995     } while (s->state != MIGRATION_STATUS_CANCELLING);
1996
1997     /*
1998      * If we're unlucky the migration code might be stuck somewhere in a
1999      * send/write while the network has failed and is waiting to timeout;
2000      * if we've got shutdown(2) available then we can force it to quit.
2001      * The outgoing qemu file gets closed in migrate_fd_cleanup that is
2002      * called in a bh, so there is no race against this cancel.
2003      */
2004     if (s->state == MIGRATION_STATUS_CANCELLING && f) {
2005         qemu_file_shutdown(f);
2006     }
2007     if (s->state == MIGRATION_STATUS_CANCELLING && s->block_inactive) {
2008         Error *local_err = NULL;
2009
2010         bdrv_activate_all(&local_err);
2011         if (local_err) {
2012             error_report_err(local_err);
2013         } else {
2014             s->block_inactive = false;
2015         }
2016     }
2017 }
2018
2019 void add_migration_state_change_notifier(Notifier *notify)
2020 {
2021     notifier_list_add(&migration_state_notifiers, notify);
2022 }
2023
2024 void remove_migration_state_change_notifier(Notifier *notify)
2025 {
2026     notifier_remove(notify);
2027 }
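
/*
 * Illustrative sketch (not part of the original file): a subsystem can
 * watch migration state changes by registering a Notifier; the callback
 * receives the MigrationState as its data pointer.  Registration would be
 * add_migration_state_change_notifier(&some_notifier).
 */
G_GNUC_UNUSED static void example_migration_state_cb(Notifier *notifier,
                                                     void *data)
{
    MigrationState *s = data;

    if (migration_has_finished(s)) {
        /* react to a completed migration here */
    }
}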
2028
2029 bool migration_in_setup(MigrationState *s)
2030 {
2031     return s->state == MIGRATION_STATUS_SETUP;
2032 }
2033
2034 bool migration_has_finished(MigrationState *s)
2035 {
2036     return s->state == MIGRATION_STATUS_COMPLETED;
2037 }
2038
2039 bool migration_has_failed(MigrationState *s)
2040 {
2041     return (s->state == MIGRATION_STATUS_CANCELLED ||
2042             s->state == MIGRATION_STATUS_FAILED);
2043 }
2044
2045 bool migration_in_postcopy(void)
2046 {
2047     MigrationState *s = migrate_get_current();
2048
2049     switch (s->state) {
2050     case MIGRATION_STATUS_POSTCOPY_ACTIVE:
2051     case MIGRATION_STATUS_POSTCOPY_PAUSED:
2052     case MIGRATION_STATUS_POSTCOPY_RECOVER:
2053         return true;
2054     default:
2055         return false;
2056     }
2057 }
2058
2059 bool migration_in_postcopy_after_devices(MigrationState *s)
2060 {
2061     return migration_in_postcopy() && s->postcopy_after_devices;
2062 }
2063
2064 bool migration_in_incoming_postcopy(void)
2065 {
2066     PostcopyState ps = postcopy_state_get();
2067
2068     return ps >= POSTCOPY_INCOMING_DISCARD && ps < POSTCOPY_INCOMING_END;
2069 }
2070
2071 bool migration_in_bg_snapshot(void)
2072 {
2073     MigrationState *s = migrate_get_current();
2074
2075     return migrate_background_snapshot() &&
2076             migration_is_setup_or_active(s->state);
2077 }
2078
2079 bool migration_is_idle(void)
2080 {
2081     MigrationState *s = current_migration;
2082
2083     if (!s) {
2084         return true;
2085     }
2086
2087     switch (s->state) {
2088     case MIGRATION_STATUS_NONE:
2089     case MIGRATION_STATUS_CANCELLED:
2090     case MIGRATION_STATUS_COMPLETED:
2091     case MIGRATION_STATUS_FAILED:
2092         return true;
2093     case MIGRATION_STATUS_SETUP:
2094     case MIGRATION_STATUS_CANCELLING:
2095     case MIGRATION_STATUS_ACTIVE:
2096     case MIGRATION_STATUS_POSTCOPY_ACTIVE:
2097     case MIGRATION_STATUS_COLO:
2098     case MIGRATION_STATUS_PRE_SWITCHOVER:
2099     case MIGRATION_STATUS_DEVICE:
2100     case MIGRATION_STATUS_WAIT_UNPLUG:
2101         return false;
2102     case MIGRATION_STATUS__MAX:
2103         g_assert_not_reached();
2104     }
2105
2106     return false;
2107 }
2108
2109 bool migration_is_active(MigrationState *s)
2110 {
2111     return (s->state == MIGRATION_STATUS_ACTIVE ||
2112             s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE);
2113 }
2114
2115 void migrate_init(MigrationState *s)
2116 {
2117     /*
2118      * Reinitialise all migration state, except
2119      * parameters/capabilities that the user set, and
2120      * locks.
2121      */
2122     s->cleanup_bh = 0;
2123     s->vm_start_bh = 0;
2124     s->to_dst_file = NULL;
2125     s->state = MIGRATION_STATUS_NONE;
2126     s->rp_state.from_dst_file = NULL;
2127     s->rp_state.error = false;
2128     s->mbps = 0.0;
2129     s->pages_per_second = 0.0;
2130     s->downtime = 0;
2131     s->expected_downtime = 0;
2132     s->setup_time = 0;
2133     s->start_postcopy = false;
2134     s->postcopy_after_devices = false;
2135     s->migration_thread_running = false;
2136     error_free(s->error);
2137     s->error = NULL;
2138     s->hostname = NULL;
2139
2140     migrate_set_state(&s->state, MIGRATION_STATUS_NONE, MIGRATION_STATUS_SETUP);
2141
2142     s->start_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
2143     s->total_time = 0;
2144     s->vm_was_running = false;
2145     s->iteration_initial_bytes = 0;
2146     s->threshold_size = 0;
2147 }
2148
2149 int migrate_add_blocker_internal(Error *reason, Error **errp)
2150 {
2151     /* Snapshots are similar to migrations, so check RUN_STATE_SAVE_VM too. */
2152     if (runstate_check(RUN_STATE_SAVE_VM) || !migration_is_idle()) {
2153         error_propagate_prepend(errp, error_copy(reason),
2154                                 "disallowing migration blocker "
2155                                 "(migration/snapshot in progress) for: ");
2156         return -EBUSY;
2157     }
2158
2159     migration_blockers = g_slist_prepend(migration_blockers, reason);
2160     return 0;
2161 }
2162
2163 int migrate_add_blocker(Error *reason, Error **errp)
2164 {
2165     if (only_migratable) {
2166         error_propagate_prepend(errp, error_copy(reason),
2167                                 "disallowing migration blocker "
2168                                 "(--only-migratable) for: ");
2169         return -EACCES;
2170     }
2171
2172     return migrate_add_blocker_internal(reason, errp);
2173 }
2174
2175 void migrate_del_blocker(Error *reason)
2176 {
2177     migration_blockers = g_slist_remove(migration_blockers, reason);
2178 }
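
/*
 * Illustrative sketch (not part of the original file): a device that cannot
 * be migrated typically registers a blocker at realize time and deletes it
 * at unrealize; on failure the caller still owns the Error reason.
 */
G_GNUC_UNUSED static void example_add_blocker(Error **errp)
{
    static Error *example_blocker;  /* hypothetical blocker reason */

    error_setg(&example_blocker, "example device does not support migration");
    if (migrate_add_blocker(example_blocker, errp) < 0) {
        error_free(example_blocker);
        example_blocker = NULL;
        return;
    }
    /* ... later, on unrealize: migrate_del_blocker(example_blocker); */
}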
2179
2180 void qmp_migrate_incoming(const char *uri, Error **errp)
2181 {
2182     Error *local_err = NULL;
2183     static bool once = true;
2184
2185     if (!once) {
2186         error_setg(errp, "The incoming migration has already been started");
2187         return;
2188     }
2189     if (!runstate_check(RUN_STATE_INMIGRATE)) {
2190         error_setg(errp, "'-incoming' was not specified on the command line");
2191         return;
2192     }
2193
2194     if (!yank_register_instance(MIGRATION_YANK_INSTANCE, errp)) {
2195         return;
2196     }
2197
2198     qemu_start_incoming_migration(uri, &local_err);
2199
2200     if (local_err) {
2201         yank_unregister_instance(MIGRATION_YANK_INSTANCE);
2202         error_propagate(errp, local_err);
2203         return;
2204     }
2205
2206     once = false;
2207 }
2208
2209 void qmp_migrate_recover(const char *uri, Error **errp)
2210 {
2211     MigrationIncomingState *mis = migration_incoming_get_current();
2212
2213     /*
2214      * Don't even bother to use ERRP_GUARD() as it _must_ always be set by
2215      * callers (no one should ignore a recover failure); if one does, it
2216      * is a programming error.
2217      */
2218     assert(errp);
2219
2220     if (mis->state != MIGRATION_STATUS_POSTCOPY_PAUSED) {
2221         error_setg(errp, "Migrate recover can only be run "
2222                    "when postcopy is paused");
2223         return;
2224     }
2225
2226     /* If there's an existing transport, release it */
2227     migration_incoming_transport_cleanup(mis);
2228
2229     /*
2230      * Note that this call will never start a real migration; it will
2231      * only re-establish the migration stream and poke the existing
2232      * migration to continue using the newly established channel.
2233      */
2234     qemu_start_incoming_migration(uri, errp);
2235 }
2236
2237 void qmp_migrate_pause(Error **errp)
2238 {
2239     MigrationState *ms = migrate_get_current();
2240     MigrationIncomingState *mis = migration_incoming_get_current();
2241     int ret;
2242
2243     if (ms->state == MIGRATION_STATUS_POSTCOPY_ACTIVE) {
2244         /* Source side, during postcopy */
2245         qemu_mutex_lock(&ms->qemu_file_lock);
2246         ret = qemu_file_shutdown(ms->to_dst_file);
2247         qemu_mutex_unlock(&ms->qemu_file_lock);
2248         if (ret) {
2249             error_setg(errp, "Failed to pause source migration");
2250         }
2251         return;
2252     }
2253
2254     if (mis->state == MIGRATION_STATUS_POSTCOPY_ACTIVE) {
2255         ret = qemu_file_shutdown(mis->from_src_file);
2256         if (ret) {
2257             error_setg(errp, "Failed to pause destination migration");
2258         }
2259         return;
2260     }
2261
2262     error_setg(errp, "migrate-pause is currently only supported "
2263                "during postcopy-active state");
2264 }
2265
2266 bool migration_is_blocked(Error **errp)
2267 {
2268     if (qemu_savevm_state_blocked(errp)) {
2269         return true;
2270     }
2271
2272     if (migration_blockers) {
2273         error_propagate(errp, error_copy(migration_blockers->data));
2274         return true;
2275     }
2276
2277     return false;
2278 }
2279
2280 /* Returns true if we should continue the migration, or false on error */
2281 static bool migrate_prepare(MigrationState *s, bool blk, bool blk_inc,
2282                             bool resume, Error **errp)
2283 {
2284     Error *local_err = NULL;
2285
2286     if (resume) {
2287         if (s->state != MIGRATION_STATUS_POSTCOPY_PAUSED) {
2288             error_setg(errp, "Cannot resume if there is no "
2289                        "paused migration");
2290             return false;
2291         }
2292
2293         /*
2294          * Postcopy recovery won't work well with the release-ram
2295          * capability, since release-ram drops the page buffer as soon
2296          * as the page is put into the send buffer.  So if a network
2297          * failure happens, any page buffers that have not yet reached
2298          * the destination VM but have already been sent from the
2299          * source VM will be lost forever.  Refuse to let the client
2300          * resume such a postcopy migration.
2301          * Luckily release-ram was designed to only be used when src
2302          * and destination VMs are on the same host, so it should be
2303          * fine.
2304          */
2305         if (migrate_release_ram()) {
2306             error_setg(errp, "Postcopy recovery cannot work "
2307                        "when release-ram capability is set");
2308             return false;
2309         }
2310
2311         /* This is a resume, skip init status */
2312         return true;
2313     }
2314
2315     if (migration_is_running(s->state)) {
2316         error_setg(errp, QERR_MIGRATION_ACTIVE);
2317         return false;
2318     }
2319
2320     if (runstate_check(RUN_STATE_INMIGRATE)) {
2321         error_setg(errp, "Guest is waiting for an incoming migration");
2322         return false;
2323     }
2324
2325     if (runstate_check(RUN_STATE_POSTMIGRATE)) {
2326         error_setg(errp, "Can't migrate the vm that was paused due to "
2327                    "previous migration");
2328         return false;
2329     }
2330
2331     if (migration_is_blocked(errp)) {
2332         return false;
2333     }
2334
2335     if (blk || blk_inc) {
2336         if (migrate_colo_enabled()) {
2337             error_setg(errp, "No disk migration is required in COLO mode");
2338             return false;
2339         }
2340         if (migrate_use_block() || migrate_use_block_incremental()) {
2341             error_setg(errp, "Command options are incompatible with "
2342                        "current migration capabilities");
2343             return false;
2344         }
2345         migrate_set_block_enabled(true, &local_err);
2346         if (local_err) {
2347             error_propagate(errp, local_err);
2348             return false;
2349         }
2350         s->must_remove_block_options = true;
2351     }
2352
2353     if (blk_inc) {
2354         migrate_set_block_incremental(s, true);
2355     }
2356
2357     migrate_init(s);
2358     /*
2359      * set ram_counters and compression_counters to zero for a
2360      * new migration
2361      */
2362     memset(&ram_counters, 0, sizeof(ram_counters));
2363     memset(&compression_counters, 0, sizeof(compression_counters));
2364
2365     return true;
2366 }
2367
2368 void qmp_migrate(const char *uri, bool has_blk, bool blk,
2369                  bool has_inc, bool inc, bool has_detach, bool detach,
2370                  bool has_resume, bool resume, Error **errp)
2371 {
2372     Error *local_err = NULL;
2373     MigrationState *s = migrate_get_current();
2374     const char *p = NULL;
2375
2376     if (!migrate_prepare(s, has_blk && blk, has_inc && inc,
2377                          has_resume && resume, errp)) {
2378         /* Error detected, put into errp */
2379         return;
2380     }
2381
2382     if (!(has_resume && resume)) {
2383         if (!yank_register_instance(MIGRATION_YANK_INSTANCE, errp)) {
2384             return;
2385         }
2386     }
2387
2388     migrate_protocol_allow_multi_channels(false);
2389     if (strstart(uri, "tcp:", &p) ||
2390         strstart(uri, "unix:", NULL) ||
2391         strstart(uri, "vsock:", NULL)) {
2392         migrate_protocol_allow_multi_channels(true);
2393         socket_start_outgoing_migration(s, p ? p : uri, &local_err);
2394 #ifdef CONFIG_RDMA
2395     } else if (strstart(uri, "rdma:", &p)) {
2396         rdma_start_outgoing_migration(s, p, &local_err);
2397 #endif
2398     } else if (strstart(uri, "exec:", &p)) {
2399         exec_start_outgoing_migration(s, p, &local_err);
2400     } else if (strstart(uri, "fd:", &p)) {
2401         fd_start_outgoing_migration(s, p, &local_err);
2402     } else {
2403         if (!(has_resume && resume)) {
2404             yank_unregister_instance(MIGRATION_YANK_INSTANCE);
2405         }
2406         error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "uri",
2407                    "a valid migration protocol");
2408         migrate_set_state(&s->state, MIGRATION_STATUS_SETUP,
2409                           MIGRATION_STATUS_FAILED);
2410         block_cleanup_parameters(s);
2411         return;
2412     }
2413
2414     if (local_err) {
2415         if (!(has_resume && resume)) {
2416             yank_unregister_instance(MIGRATION_YANK_INSTANCE);
2417         }
2418         migrate_fd_error(s, local_err);
2419         error_propagate(errp, local_err);
2420         return;
2421     }
2422 }
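
/*
 * Illustrative sketch (not part of the original file): the URI dispatch
 * above relies on strstart() from qemu/cutils.h, which matches a prefix
 * and optionally returns a pointer to the remainder.
 */
G_GNUC_UNUSED static void example_uri_parse(void)
{
    const char *p = NULL;

    if (strstart("tcp:host:4444", "tcp:", &p)) {
        /* p now points at "host:4444", which the socket code parses */
    }
}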
2423
2424 void qmp_migrate_cancel(Error **errp)
2425 {
2426     migration_cancel(NULL);
2427 }
2428
2429 void qmp_migrate_continue(MigrationStatus state, Error **errp)
2430 {
2431     MigrationState *s = migrate_get_current();
2432     if (s->state != state) {
2433         error_setg(errp, "Migration not in expected state: %s",
2434                    MigrationStatus_str(s->state));
2435         return;
2436     }
2437     qemu_sem_post(&s->pause_sem);
2438 }
2439
2440 bool migrate_release_ram(void)
2441 {
2442     MigrationState *s;
2443
2444     s = migrate_get_current();
2445
2446     return s->enabled_capabilities[MIGRATION_CAPABILITY_RELEASE_RAM];
2447 }
2448
2449 bool migrate_postcopy_ram(void)
2450 {
2451     MigrationState *s;
2452
2453     s = migrate_get_current();
2454
2455     return s->enabled_capabilities[MIGRATION_CAPABILITY_POSTCOPY_RAM];
2456 }
2457
2458 bool migrate_postcopy(void)
2459 {
2460     return migrate_postcopy_ram() || migrate_dirty_bitmaps();
2461 }
2462
2463 bool migrate_auto_converge(void)
2464 {
2465     MigrationState *s;
2466
2467     s = migrate_get_current();
2468
2469     return s->enabled_capabilities[MIGRATION_CAPABILITY_AUTO_CONVERGE];
2470 }
2471
2472 bool migrate_zero_blocks(void)
2473 {
2474     MigrationState *s;
2475
2476     s = migrate_get_current();
2477
2478     return s->enabled_capabilities[MIGRATION_CAPABILITY_ZERO_BLOCKS];
2479 }
2480
2481 bool migrate_postcopy_blocktime(void)
2482 {
2483     MigrationState *s;
2484
2485     s = migrate_get_current();
2486
2487     return s->enabled_capabilities[MIGRATION_CAPABILITY_POSTCOPY_BLOCKTIME];
2488 }
2489
2490 bool migrate_use_compression(void)
2491 {
2492     MigrationState *s;
2493
2494     s = migrate_get_current();
2495
2496     return s->enabled_capabilities[MIGRATION_CAPABILITY_COMPRESS];
2497 }
2498
2499 int migrate_compress_level(void)
2500 {
2501     MigrationState *s;
2502
2503     s = migrate_get_current();
2504
2505     return s->parameters.compress_level;
2506 }
2507
2508 int migrate_compress_threads(void)
2509 {
2510     MigrationState *s;
2511
2512     s = migrate_get_current();
2513
2514     return s->parameters.compress_threads;
2515 }
2516
2517 int migrate_compress_wait_thread(void)
2518 {
2519     MigrationState *s;
2520
2521     s = migrate_get_current();
2522
2523     return s->parameters.compress_wait_thread;
2524 }
2525
2526 int migrate_decompress_threads(void)
2527 {
2528     MigrationState *s;
2529
2530     s = migrate_get_current();
2531
2532     return s->parameters.decompress_threads;
2533 }
2534
2535 bool migrate_dirty_bitmaps(void)
2536 {
2537     MigrationState *s;
2538
2539     s = migrate_get_current();
2540
2541     return s->enabled_capabilities[MIGRATION_CAPABILITY_DIRTY_BITMAPS];
2542 }
2543
2544 bool migrate_ignore_shared(void)
2545 {
2546     MigrationState *s;
2547
2548     s = migrate_get_current();
2549
2550     return s->enabled_capabilities[MIGRATION_CAPABILITY_X_IGNORE_SHARED];
2551 }
2552
2553 bool migrate_validate_uuid(void)
2554 {
2555     MigrationState *s;
2556
2557     s = migrate_get_current();
2558
2559     return s->enabled_capabilities[MIGRATION_CAPABILITY_VALIDATE_UUID];
2560 }
2561
2562 bool migrate_use_events(void)
2563 {
2564     MigrationState *s;
2565
2566     s = migrate_get_current();
2567
2568     return s->enabled_capabilities[MIGRATION_CAPABILITY_EVENTS];
2569 }
2570
2571 bool migrate_use_multifd(void)
2572 {
2573     MigrationState *s;
2574
2575     s = migrate_get_current();
2576
2577     return s->enabled_capabilities[MIGRATION_CAPABILITY_MULTIFD];
2578 }
2579
2580 bool migrate_pause_before_switchover(void)
2581 {
2582     MigrationState *s;
2583
2584     s = migrate_get_current();
2585
2586     return s->enabled_capabilities[
2587         MIGRATION_CAPABILITY_PAUSE_BEFORE_SWITCHOVER];
2588 }
2589
2590 int migrate_multifd_channels(void)
2591 {
2592     MigrationState *s;
2593
2594     s = migrate_get_current();
2595
2596     return s->parameters.multifd_channels;
2597 }
2598
2599 MultiFDCompression migrate_multifd_compression(void)
2600 {
2601     MigrationState *s;
2602
2603     s = migrate_get_current();
2604
2605     return s->parameters.multifd_compression;
2606 }
2607
2608 int migrate_multifd_zlib_level(void)
2609 {
2610     MigrationState *s;
2611
2612     s = migrate_get_current();
2613
2614     return s->parameters.multifd_zlib_level;
2615 }
2616
2617 int migrate_multifd_zstd_level(void)
2618 {
2619     MigrationState *s;
2620
2621     s = migrate_get_current();
2622
2623     return s->parameters.multifd_zstd_level;
2624 }
2625
2626 #ifdef CONFIG_LINUX
2627 bool migrate_use_zero_copy_send(void)
2628 {
2629     MigrationState *s;
2630
2631     s = migrate_get_current();
2632
2633     return s->enabled_capabilities[MIGRATION_CAPABILITY_ZERO_COPY_SEND];
2634 }
2635 #endif
2636
2637 int migrate_use_tls(void)
2638 {
2639     MigrationState *s;
2640
2641     s = migrate_get_current();
2642
2643     return s->parameters.tls_creds && *s->parameters.tls_creds;
2644 }
2645
2646 int migrate_use_xbzrle(void)
2647 {
2648     MigrationState *s;
2649
2650     s = migrate_get_current();
2651
2652     return s->enabled_capabilities[MIGRATION_CAPABILITY_XBZRLE];
2653 }
2654
2655 uint64_t migrate_xbzrle_cache_size(void)
2656 {
2657     MigrationState *s;
2658
2659     s = migrate_get_current();
2660
2661     return s->parameters.xbzrle_cache_size;
2662 }
2663
2664 static int64_t migrate_max_postcopy_bandwidth(void)
2665 {
2666     MigrationState *s;
2667
2668     s = migrate_get_current();
2669
2670     return s->parameters.max_postcopy_bandwidth;
2671 }
2672
2673 bool migrate_use_block(void)
2674 {
2675     MigrationState *s;
2676
2677     s = migrate_get_current();
2678
2679     return s->enabled_capabilities[MIGRATION_CAPABILITY_BLOCK];
2680 }
2681
2682 bool migrate_use_return_path(void)
2683 {
2684     MigrationState *s;
2685
2686     s = migrate_get_current();
2687
2688     return s->enabled_capabilities[MIGRATION_CAPABILITY_RETURN_PATH];
2689 }
2690
2691 bool migrate_use_block_incremental(void)
2692 {
2693     MigrationState *s;
2694
2695     s = migrate_get_current();
2696
2697     return s->parameters.block_incremental;
2698 }
2699
2700 bool migrate_background_snapshot(void)
2701 {
2702     MigrationState *s;
2703
2704     s = migrate_get_current();
2705
2706     return s->enabled_capabilities[MIGRATION_CAPABILITY_BACKGROUND_SNAPSHOT];
2707 }
2708
2709 bool migrate_postcopy_preempt(void)
2710 {
2711     MigrationState *s;
2712
2713     s = migrate_get_current();
2714
2715     return s->enabled_capabilities[MIGRATION_CAPABILITY_POSTCOPY_PREEMPT];
2716 }
2717
2718 /* migration thread support */
2719 /*
2720  * Something bad happened to the RP stream, mark an error
2721  * The caller shall print or trace something to indicate why
2722  */
2723 static void mark_source_rp_bad(MigrationState *s)
2724 {
2725     s->rp_state.error = true;
2726 }
2727
2728 static struct rp_cmd_args {
2729     ssize_t     len; /* -1 = variable */
2730     const char *name;
2731 } rp_cmd_args[] = {
2732     [MIG_RP_MSG_INVALID]        = { .len = -1, .name = "INVALID" },
2733     [MIG_RP_MSG_SHUT]           = { .len =  4, .name = "SHUT" },
2734     [MIG_RP_MSG_PONG]           = { .len =  4, .name = "PONG" },
2735     [MIG_RP_MSG_REQ_PAGES]      = { .len = 12, .name = "REQ_PAGES" },
2736     [MIG_RP_MSG_REQ_PAGES_ID]   = { .len = -1, .name = "REQ_PAGES_ID" },
2737     [MIG_RP_MSG_RECV_BITMAP]    = { .len = -1, .name = "RECV_BITMAP" },
2738     [MIG_RP_MSG_RESUME_ACK]     = { .len =  4, .name = "RESUME_ACK" },
2739     [MIG_RP_MSG_MAX]            = { .len = -1, .name = "MAX" },
2740 };
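
/*
 * Illustrative sketch (not part of the original file): every return-path
 * message is a big-endian 16-bit type, a 16-bit length, then the payload.
 * A SHUT payload, for example, is one be32 that is zero for a clean
 * shutdown and non-zero on error:
 */
G_GNUC_UNUSED static uint32_t example_decode_shut(const uint8_t *payload)
{
    return ldl_be_p(payload);
}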
2741
2742 /*
2743  * Process a request for pages received on the return path.
2744  * We're allowed to send more than requested (e.g. to round to our page size)
2745  * and we don't need to send pages that have already been sent.
2746  */
2747 static void migrate_handle_rp_req_pages(MigrationState *ms, const char *rbname,
2748                                         ram_addr_t start, size_t len)
2749 {
2750     long our_host_ps = qemu_real_host_page_size();
2751
2752     trace_migrate_handle_rp_req_pages(rbname, start, len);
2753
2754     /*
2755      * Since we currently insist on matching page sizes, just sanity check
2756      * we're being asked for whole host pages.
2757      */
2758     if (!QEMU_IS_ALIGNED(start, our_host_ps) ||
2759         !QEMU_IS_ALIGNED(len, our_host_ps)) {
2760         error_report("%s: Misaligned page request, start: " RAM_ADDR_FMT
2761                      " len: %zd", __func__, start, len);
2762         mark_source_rp_bad(ms);
2763         return;
2764     }
2765
2766     if (ram_save_queue_pages(rbname, start, len)) {
2767         mark_source_rp_bad(ms);
2768     }
2769 }
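
/*
 * Illustrative sketch (not part of the original file): with a 4 KiB host
 * page size, start=0x1000/len=0x2000 passes the alignment check above,
 * while start=0x1800 would be rejected as misaligned.
 */
G_GNUC_UNUSED static bool example_is_whole_pages(ram_addr_t start, size_t len)
{
    long ps = qemu_real_host_page_size();

    return QEMU_IS_ALIGNED(start, ps) && QEMU_IS_ALIGNED(len, ps);
}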
2770
2771 /* Return true to retry, false to quit */
2772 static bool postcopy_pause_return_path_thread(MigrationState *s)
2773 {
2774     trace_postcopy_pause_return_path();
2775
2776     qemu_sem_wait(&s->postcopy_pause_rp_sem);
2777
2778     trace_postcopy_pause_return_path_continued();
2779
2780     return true;
2781 }
2782
2783 static int migrate_handle_rp_recv_bitmap(MigrationState *s, char *block_name)
2784 {
2785     RAMBlock *block = qemu_ram_block_by_name(block_name);
2786
2787     if (!block) {
2788         error_report("%s: invalid block name '%s'", __func__, block_name);
2789         return -EINVAL;
2790     }
2791
2792     /* Fetch the received bitmap and refresh the dirty bitmap */
2793     return ram_dirty_bitmap_reload(s, block);
2794 }
2795
2796 static int migrate_handle_rp_resume_ack(MigrationState *s, uint32_t value)
2797 {
2798     trace_source_return_path_thread_resume_ack(value);
2799
2800     if (value != MIGRATION_RESUME_ACK_VALUE) {
2801         error_report("%s: illegal resume_ack value %"PRIu32,
2802                      __func__, value);
2803         return -1;
2804     }
2805
2806     /* Now both sides are active. */
2807     migrate_set_state(&s->state, MIGRATION_STATUS_POSTCOPY_RECOVER,
2808                       MIGRATION_STATUS_POSTCOPY_ACTIVE);
2809
2810     /* Notify send thread that time to continue send pages */
2811     qemu_sem_post(&s->rp_state.rp_sem);
2812
2813     return 0;
2814 }
2815
2816 /* Release ms->rp_state.from_dst_file in a safe way */
2817 static void migration_release_from_dst_file(MigrationState *ms)
2818 {
2819     QEMUFile *file;
2820
2821     WITH_QEMU_LOCK_GUARD(&ms->qemu_file_lock) {
2822         /*
2823          * Reset the from_dst_file pointer first before releasing it, as we
2824          * can't block within the lock section.
2825          */
2826         file = ms->rp_state.from_dst_file;
2827         ms->rp_state.from_dst_file = NULL;
2828     }
2829
2830     qemu_fclose(file);
2831 }
2832
2833 /*
2834  * Handles messages sent on the return path towards the source VM.
2835  */
2837 static void *source_return_path_thread(void *opaque)
2838 {
2839     MigrationState *ms = opaque;
2840     QEMUFile *rp = ms->rp_state.from_dst_file;
2841     uint16_t header_len, header_type;
2842     uint8_t buf[512];
2843     uint32_t tmp32, sibling_error;
2844     ram_addr_t start = 0; /* =0 to silence warning */
2845     size_t len = 0, expected_len;
2846     int res;
2847
2848     trace_source_return_path_thread_entry();
2849     rcu_register_thread();
2850
2851 retry:
2852     while (!ms->rp_state.error && !qemu_file_get_error(rp) &&
2853            migration_is_setup_or_active(ms->state)) {
2854         trace_source_return_path_thread_loop_top();
2855         header_type = qemu_get_be16(rp);
2856         header_len = qemu_get_be16(rp);
2857
2858         if (qemu_file_get_error(rp)) {
2859             mark_source_rp_bad(ms);
2860             goto out;
2861         }
2862
2863         if (header_type >= MIG_RP_MSG_MAX ||
2864             header_type == MIG_RP_MSG_INVALID) {
2865             error_report("RP: Received invalid message 0x%04x length 0x%04x",
2866                          header_type, header_len);
2867             mark_source_rp_bad(ms);
2868             goto out;
2869         }
2870
2871         if ((rp_cmd_args[header_type].len != -1 &&
2872             header_len != rp_cmd_args[header_type].len) ||
2873             header_len > sizeof(buf)) {
2874             error_report("RP: Received '%s' message (0x%04x) with "
2875                          "incorrect length %d expecting %zu",
2876                          rp_cmd_args[header_type].name, header_type, header_len,
2877                          (size_t)rp_cmd_args[header_type].len);
2878             mark_source_rp_bad(ms);
2879             goto out;
2880         }
2881
2882         /* We know we've got a valid header by this point */
2883         res = qemu_get_buffer(rp, buf, header_len);
2884         if (res != header_len) {
2885             error_report("RP: Failed reading data for message 0x%04x"
2886                          " read %d expected %d",
2887                          header_type, res, header_len);
2888             mark_source_rp_bad(ms);
2889             goto out;
2890         }
2891
2892         /* OK, we have the message and the data */
2893         switch (header_type) {
2894         case MIG_RP_MSG_SHUT:
2895             sibling_error = ldl_be_p(buf);
2896             trace_source_return_path_thread_shut(sibling_error);
2897             if (sibling_error) {
2898                 error_report("RP: Sibling indicated error %d", sibling_error);
2899                 mark_source_rp_bad(ms);
2900             }
2901             /*
2902              * We'll let the main thread deal with closing the RP;
2903              * we could do a shutdown(2) on it, but we're the only user
2904              * anyway, so there's nothing gained.
2905              */
2906             goto out;
2907
2908         case MIG_RP_MSG_PONG:
2909             tmp32 = ldl_be_p(buf);
2910             trace_source_return_path_thread_pong(tmp32);
2911             break;
2912
2913         case MIG_RP_MSG_REQ_PAGES:
2914             start = ldq_be_p(buf);
2915             len = ldl_be_p(buf + 8);
2916             migrate_handle_rp_req_pages(ms, NULL, start, len);
2917             break;
2918
2919         case MIG_RP_MSG_REQ_PAGES_ID:
2920             expected_len = 12 + 1; /* header + termination */
2921
2922             if (header_len >= expected_len) {
2923                 start = ldq_be_p(buf);
2924                 len = ldl_be_p(buf + 8);
2925                 /* Now we expect an idstr */
2926                 tmp32 = buf[12]; /* Length of the following idstr */
2927                 buf[13 + tmp32] = '\0';
2928                 expected_len += tmp32;
2929             }
2930             if (header_len != expected_len) {
2931                 error_report("RP: Req_Page_id with length %d expecting %zd",
2932                              header_len, expected_len);
2933                 mark_source_rp_bad(ms);
2934                 goto out;
2935             }
2936             migrate_handle_rp_req_pages(ms, (char *)&buf[13], start, len);
2937             break;
2938
2939         case MIG_RP_MSG_RECV_BITMAP:
2940             if (header_len < 1) {
2941                 error_report("%s: missing block name", __func__);
2942                 mark_source_rp_bad(ms);
2943                 goto out;
2944             }
2945             /* Format: len (1B) + idstr (<255B). This ends the idstr. */
2946             buf[buf[0] + 1] = '\0';
2947             if (migrate_handle_rp_recv_bitmap(ms, (char *)(buf + 1))) {
2948                 mark_source_rp_bad(ms);
2949                 goto out;
2950             }
2951             break;
2952
2953         case MIG_RP_MSG_RESUME_ACK:
2954             tmp32 = ldl_be_p(buf);
2955             if (migrate_handle_rp_resume_ack(ms, tmp32)) {
2956                 mark_source_rp_bad(ms);
2957                 goto out;
2958             }
2959             break;
2960
2961         default:
2962             break;
2963         }
2964     }
2965
2966 out:
2967     res = qemu_file_get_error(rp);
2968     if (res) {
2969         if (migration_in_postcopy()) {
2970             /*
2971              * Maybe there is something we can do: it looks like a
2972              * network down issue, and we pause for a recovery.
2973              */
2974             migration_release_from_dst_file(ms);
2975             rp = NULL;
2976             if (postcopy_pause_return_path_thread(ms)) {
2977                 /*
2978                  * Reload rp, reset the rest.  Referencing it is safe since
2979                  * it's reset only by us above, or when migration completes
2980                  */
2981                 rp = ms->rp_state.from_dst_file;
2982                 ms->rp_state.error = false;
2983                 goto retry;
2984             }
2985         }
2986
2987         trace_source_return_path_thread_bad_end();
2988         mark_source_rp_bad(ms);
2989     }
2990
2991     trace_source_return_path_thread_end();
2992     migration_release_from_dst_file(ms);
2993     rcu_unregister_thread();
2994     return NULL;
2995 }
2996
2997 static int open_return_path_on_source(MigrationState *ms,
2998                                       bool create_thread)
2999 {
3000     ms->rp_state.from_dst_file = qemu_file_get_return_path(ms->to_dst_file);
3001     if (!ms->rp_state.from_dst_file) {
3002         return -1;
3003     }
3004
3005     trace_open_return_path_on_source();
3006
3007     if (!create_thread) {
3008         /* We're done */
3009         return 0;
3010     }
3011
3012     qemu_thread_create(&ms->rp_state.rp_thread, "return path",
3013                        source_return_path_thread, ms, QEMU_THREAD_JOINABLE);
3014     ms->rp_state.rp_thread_created = true;
3015
3016     trace_open_return_path_on_source_continue();
3017
3018     return 0;
3019 }
3020
3021 /* Returns 0 if the RP was ok, otherwise there was an error on the RP */
3022 static int await_return_path_close_on_source(MigrationState *ms)
3023 {
3024     /*
3025      * If this is a normal exit then the destination will send a SHUT and the
3026      * rp_thread will exit; however, if there's an error we need to
3027      * cause it to exit.
3028      */
3029     if (qemu_file_get_error(ms->to_dst_file) && ms->rp_state.from_dst_file) {
3030         /*
3031          * shutdown(2), if we have it, will cause it to unblock if it's stuck
3032          * waiting for the destination.
3033          */
3034         qemu_file_shutdown(ms->rp_state.from_dst_file);
3035         mark_source_rp_bad(ms);
3036     }
3037     trace_await_return_path_close_on_source_joining();
3038     qemu_thread_join(&ms->rp_state.rp_thread);
3039     ms->rp_state.rp_thread_created = false;
3040     trace_await_return_path_close_on_source_close();
3041     return ms->rp_state.error;
3042 }
3043
3044 /*
3045  * Switch from normal iteration to postcopy
3046  * Returns non-0 on error
3047  */
3048 static int postcopy_start(MigrationState *ms)
3049 {
3050     int ret;
3051     QIOChannelBuffer *bioc;
3052     QEMUFile *fb;
3053     int64_t time_at_stop = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
3054     int64_t bandwidth = migrate_max_postcopy_bandwidth();
3055     bool restart_block = false;
3056     int cur_state = MIGRATION_STATUS_ACTIVE;
3057
3058     if (postcopy_preempt_wait_channel(ms)) {
3059         migrate_set_state(&ms->state, ms->state, MIGRATION_STATUS_FAILED);
3060         return -1;
3061     }
3062
3063     if (!migrate_pause_before_switchover()) {
3064         migrate_set_state(&ms->state, MIGRATION_STATUS_ACTIVE,
3065                           MIGRATION_STATUS_POSTCOPY_ACTIVE);
3066     }
3067
3068     trace_postcopy_start();
3069     qemu_mutex_lock_iothread();
3070     trace_postcopy_start_set_run();
3071
3072     qemu_system_wakeup_request(QEMU_WAKEUP_REASON_OTHER, NULL);
3073     global_state_store();
3074     ret = vm_stop_force_state(RUN_STATE_FINISH_MIGRATE);
3075     if (ret < 0) {
3076         goto fail;
3077     }
3078
3079     ret = migration_maybe_pause(ms, &cur_state,
3080                                 MIGRATION_STATUS_POSTCOPY_ACTIVE);
3081     if (ret < 0) {
3082         goto fail;
3083     }
3084
3085     ret = bdrv_inactivate_all();
3086     if (ret < 0) {
3087         goto fail;
3088     }
3089     restart_block = true;
3090
3091     /*
3092      * Cause any non-postcopiable, but iterative devices to
3093      * send out their final data.
3094      */
3095     qemu_savevm_state_complete_precopy(ms->to_dst_file, true, false);
3096
3097     /*
3098      * In the FINISH_MIGRATE run state, with the io-lock held, everything
3099      * should be quiet, but we've potentially still got dirty pages, so we
3100      * need to tell the destination to throw away any pages it has already
3101      * received that are dirty.
3102      */
3103     if (migrate_postcopy_ram()) {
3104         ram_postcopy_send_discard_bitmap(ms);
3105     }
3106
3107     /*
3108      * send rest of state - note things that are doing postcopy
3109      * will notice we're in POSTCOPY_ACTIVE and not actually
3110      * wrap their state up here
3111      */
3112     /* 0 max-postcopy-bandwidth means unlimited */
3113     if (!bandwidth) {
3114         qemu_file_set_rate_limit(ms->to_dst_file, INT64_MAX);
3115     } else {
3116         qemu_file_set_rate_limit(ms->to_dst_file, bandwidth / XFER_LIMIT_RATIO);
3117     }
3118     if (migrate_postcopy_ram()) {
3119         /* Ping just for debugging, helps line traces up */
3120         qemu_savevm_send_ping(ms->to_dst_file, 2);
3121     }
3122
3123     /*
3124      * While loading the device state we may trigger page transfer
3125      * requests and the fd must be free to process those, and thus
3126      * the destination must read the whole device state off the fd before
3127      * it starts processing it.  Unfortunately the ad-hoc migration format
3128      * doesn't allow the destination to know the size to read without fully
3129      * parsing it through each devices load-state code (especially the open
3130      * coded devices that use get/put).
3131      * So we wrap the device state up in a package with a length at the start;
3132      * to do this we use a memory buffer to hold the whole of the device state.
3133      */
3134     bioc = qio_channel_buffer_new(4096);
3135     qio_channel_set_name(QIO_CHANNEL(bioc), "migration-postcopy-buffer");
3136     fb = qemu_file_new_output(QIO_CHANNEL(bioc));
3137     object_unref(OBJECT(bioc));
3138
3139     /*
3140      * Make sure the receiver can get incoming pages before we send the rest
3141      * of the state
3142      */
3143     qemu_savevm_send_postcopy_listen(fb);
3144
3145     qemu_savevm_state_complete_precopy(fb, false, false);
3146     if (migrate_postcopy_ram()) {
3147         qemu_savevm_send_ping(fb, 3);
3148     }
3149
3150     qemu_savevm_send_postcopy_run(fb);
3151
3152     /* <><> end of stuff going into the package */
3153
3154     /* Last point of recovery; as soon as we send the package the destination
3155      * can open devices and potentially start running.
3156      * Let's just check again that we've not got any errors.
3157      */
3158     ret = qemu_file_get_error(ms->to_dst_file);
3159     if (ret) {
3160         error_report("postcopy_start: Migration stream errored (pre package)");
3161         goto fail_closefb;
3162     }
3163
3164     restart_block = false;
3165
3166     /* Now send that blob */
3167     if (qemu_savevm_send_packaged(ms->to_dst_file, bioc->data, bioc->usage)) {
3168         goto fail_closefb;
3169     }
3170     qemu_fclose(fb);
3171
3172     /* Send a notification to give a chance for anything that needs to
3173      * happen at the transition to postcopy and after the device state;
3174      * in particular spice needs to trigger a transition now.
3175      */
3176     ms->postcopy_after_devices = true;
3177     notifier_list_notify(&migration_state_notifiers, ms);
3178
3179     ms->downtime = qemu_clock_get_ms(QEMU_CLOCK_REALTIME) - time_at_stop;
3180
3181     qemu_mutex_unlock_iothread();
3182
3183     if (migrate_postcopy_ram()) {
3184         /*
3185          * Although this ping is just for debug, it could potentially be
3186          * used for getting a better measurement of downtime at the source.
3187          */
3188         qemu_savevm_send_ping(ms->to_dst_file, 4);
3189     }
3190
3191     if (migrate_release_ram()) {
3192         ram_postcopy_migrated_memory_release(ms);
3193     }
3194
3195     ret = qemu_file_get_error(ms->to_dst_file);
3196     if (ret) {
3197         error_report("postcopy_start: Migration stream errored");
3198         migrate_set_state(&ms->state, MIGRATION_STATUS_POSTCOPY_ACTIVE,
3199                           MIGRATION_STATUS_FAILED);
3200     }
3201
3202     trace_postcopy_preempt_enabled(migrate_postcopy_preempt());
3203
3204     return ret;
3205
3206 fail_closefb:
3207     qemu_fclose(fb);
3208 fail:
3209     migrate_set_state(&ms->state, MIGRATION_STATUS_POSTCOPY_ACTIVE,
3210                       MIGRATION_STATUS_FAILED);
3211     if (restart_block) {
3212         /* A failure happened early enough that we know the destination hasn't
3213          * accessed block devices, so we're safe to recover.
3214          */
3215         Error *local_err = NULL;
3216
3217         bdrv_activate_all(&local_err);
3218         if (local_err) {
3219             error_report_err(local_err);
3220         }
3221     }
3222     qemu_mutex_unlock_iothread();
3223     return -1;
3224 }
3225
3226 /**
3227  * migration_maybe_pause: Pause if required to by
3228  * migrate_pause_before_switchover called with the iothread locked
3229  * Returns: 0 on success
3230  */
3231 static int migration_maybe_pause(MigrationState *s,
3232                                  int *current_active_state,
3233                                  int new_state)
3234 {
3235     if (!migrate_pause_before_switchover()) {
3236         return 0;
3237     }
3238
3239     /* Since leaving this state is not atomic with posting the semaphore,
3240      * it's possible that someone could have issued multiple migrate_continue
3241      * and the semaphore is incorrectly positive at this point;
3242      * the docs say it's undefined to reinit a semaphore that's already
3243      * init'd, so use timedwait to eat up any existing posts.
3244      */
3245     while (qemu_sem_timedwait(&s->pause_sem, 1) == 0) {
3246         /* This block intentionally left blank */
3247     }
3248
3249     /*
3250      * If the migration is cancelled when it is in the completion phase,
3251      * the migration state is set to MIGRATION_STATUS_CANCELLING.
3252      * In that case we must not wait on the semaphore, otherwise we
3253      * would block forever on 'pause_sem'.
3254      */
3255     if (s->state != MIGRATION_STATUS_CANCELLING) {
3256         qemu_mutex_unlock_iothread();
3257         migrate_set_state(&s->state, *current_active_state,
3258                           MIGRATION_STATUS_PRE_SWITCHOVER);
3259         qemu_sem_wait(&s->pause_sem);
3260         migrate_set_state(&s->state, MIGRATION_STATUS_PRE_SWITCHOVER,
3261                           new_state);
3262         *current_active_state = new_state;
3263         qemu_mutex_lock_iothread();
3264     }
3265
3266     return s->state == new_state ? 0 : -EINVAL;
3267 }
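
/*
 * Illustrative QMP flow for the pause-before-switchover logic above (a
 * sketch, not part of the build; shapes follow the QAPI schema):
 *
 *   {"execute": "migrate-set-capabilities", "arguments":
 *     {"capabilities": [{"capability": "pause-before-switchover",
 *                        "state": true}]}}
 *   ... migration runs until it reports status "pre-switchover" ...
 *   {"execute": "migrate-continue", "arguments": {"state": "pre-switchover"}}
 *
 * Each migrate-continue posts pause_sem; stale posts from repeated
 * commands are what the timedwait loop above drains.
 */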
3268
3269 /**
3270  * migration_completion: Used by migration_thread when there's not much left.
3271  *   The caller 'breaks' the loop when this returns.
3272  *
3273  * @s: Current migration state
3274  */
3275 static void migration_completion(MigrationState *s)
3276 {
3277     int ret;
3278     int current_active_state = s->state;
3279
3280     if (s->state == MIGRATION_STATUS_ACTIVE) {
3281         qemu_mutex_lock_iothread();
3282         s->downtime_start = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
3283         qemu_system_wakeup_request(QEMU_WAKEUP_REASON_OTHER, NULL);
3284         s->vm_was_running = runstate_is_running();
3285         ret = global_state_store();
3286
3287         if (!ret) {
3288             bool inactivate = !migrate_colo_enabled();
3289             ret = vm_stop_force_state(RUN_STATE_FINISH_MIGRATE);
3290             trace_migration_completion_vm_stop(ret);
3291             if (ret >= 0) {
3292                 ret = migration_maybe_pause(s, &current_active_state,
3293                                             MIGRATION_STATUS_DEVICE);
3294             }
3295             if (ret >= 0) {
3296                 qemu_file_set_rate_limit(s->to_dst_file, INT64_MAX);
3297                 ret = qemu_savevm_state_complete_precopy(s->to_dst_file, false,
3298                                                          inactivate);
3299             }
3300             if (inactivate && ret >= 0) {
3301                 s->block_inactive = true;
3302             }
3303         }
3304         qemu_mutex_unlock_iothread();
3305
3306         if (ret < 0) {
3307             goto fail;
3308         }
3309     } else if (s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE) {
3310         trace_migration_completion_postcopy_end();
3311
3312         qemu_mutex_lock_iothread();
3313         qemu_savevm_state_complete_postcopy(s->to_dst_file);
3314         qemu_mutex_unlock_iothread();
3315
3316         /* Shutdown the postcopy fast path thread */
3317         if (migrate_postcopy_preempt()) {
3318             postcopy_preempt_shutdown_file(s);
3319         }
3320
3321         trace_migration_completion_postcopy_end_after_complete();
3322     } else {
3323         goto fail;
3324     }
3325
3326     /*
3327      * If rp was opened we must clean up the thread before
3328      * cleaning everything else up (since if there are no failures
3329      * it will wait for the destination to send its status in
3330      * a SHUT command).
3331      */
3332     if (s->rp_state.rp_thread_created) {
3333         int rp_error;
3334         trace_migration_return_path_end_before();
3335         rp_error = await_return_path_close_on_source(s);
3336         trace_migration_return_path_end_after(rp_error);
3337         if (rp_error) {
3338             goto fail_invalidate;
3339         }
3340     }
3341
3342     if (qemu_file_get_error(s->to_dst_file)) {
3343         trace_migration_completion_file_err();
3344         goto fail_invalidate;
3345     }
3346
3347     if (migrate_colo_enabled() && s->state == MIGRATION_STATUS_ACTIVE) {
3348         /* COLO does not support postcopy */
3349         migrate_set_state(&s->state, MIGRATION_STATUS_ACTIVE,
3350                           MIGRATION_STATUS_COLO);
3351     } else {
3352         migrate_set_state(&s->state, current_active_state,
3353                           MIGRATION_STATUS_COMPLETED);
3354     }
3355
3356     return;
3357
3358 fail_invalidate:
3359     /* If not doing postcopy, vm_start() will be called: let's regain
3360      * control on images.
3361      */
3362     if (s->state == MIGRATION_STATUS_ACTIVE ||
3363         s->state == MIGRATION_STATUS_DEVICE) {
3364         Error *local_err = NULL;
3365
3366         qemu_mutex_lock_iothread();
3367         bdrv_activate_all(&local_err);
3368         if (local_err) {
3369             error_report_err(local_err);
3370         } else {
3371             s->block_inactive = false;
3372         }
3373         qemu_mutex_unlock_iothread();
3374     }
3375
3376 fail:
3377     migrate_set_state(&s->state, current_active_state,
3378                       MIGRATION_STATUS_FAILED);
3379 }
3380
3381 /**
3382  * bg_migration_completion: Used by bg_migration_thread after all the
3383  *   RAM has been saved. The caller 'breaks' the loop when this returns.
3384  *
3385  * @s: Current migration state
3386  */
3387 static void bg_migration_completion(MigrationState *s)
3388 {
3389     int current_active_state = s->state;
3390
3391     /*
3392      * Stop tracking RAM writes - un-protect memory, un-register UFFD
3393      * memory ranges, flush kernel wait queues and wake up threads
3394      * waiting for write fault to be resolved.
3395      */
3396     ram_write_tracking_stop();
3397
3398     if (s->state == MIGRATION_STATUS_ACTIVE) {
3399         /*
3400          * By this moment we have RAM content saved into the migration stream.
3401          * The next step is to flush the non-RAM content (device state)
3402          * right after the ram content. The device state has been stored into
3403          * the temporary buffer before RAM saving started.
3404          */
3405         qemu_put_buffer(s->to_dst_file, s->bioc->data, s->bioc->usage);
3406         qemu_fflush(s->to_dst_file);
3407     } else if (s->state == MIGRATION_STATUS_CANCELLING) {
3408         goto fail;
3409     }
3410
3411     if (qemu_file_get_error(s->to_dst_file)) {
3412         trace_migration_completion_file_err();
3413         goto fail;
3414     }
3415
3416     migrate_set_state(&s->state, current_active_state,
3417                       MIGRATION_STATUS_COMPLETED);
3418     return;
3419
3420 fail:
3421     migrate_set_state(&s->state, current_active_state,
3422                       MIGRATION_STATUS_FAILED);
3423 }
3424
3425 bool migrate_colo_enabled(void)
3426 {
3427     MigrationState *s = migrate_get_current();
3428     return s->enabled_capabilities[MIGRATION_CAPABILITY_X_COLO];
3429 }
3430
3431 typedef enum MigThrError {
3432     /* No error detected */
3433     MIG_THR_ERR_NONE = 0,
3434     /* Detected error, but resumed successfully */
3435     MIG_THR_ERR_RECOVERED = 1,
3436     /* Detected fatal error, need to exit */
3437     MIG_THR_ERR_FATAL = 2,
3438 } MigThrError;
3439
3440 static int postcopy_resume_handshake(MigrationState *s)
3441 {
3442     qemu_savevm_send_postcopy_resume(s->to_dst_file);
3443
3444     while (s->state == MIGRATION_STATUS_POSTCOPY_RECOVER) {
3445         qemu_sem_wait(&s->rp_state.rp_sem);
3446     }
3447
3448     if (s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE) {
3449         return 0;
3450     }
3451
3452     return -1;
3453 }
3454
3455 /* Return zero if success, or <0 for error */
3456 static int postcopy_do_resume(MigrationState *s)
3457 {
3458     int ret;
3459
3460     /*
3461      * Call all the resume_prepare() hooks, so that modules can be
3462      * ready for the migration resume.
3463      */
3464     ret = qemu_savevm_state_resume_prepare(s);
3465     if (ret) {
3466         error_report("%s: resume_prepare() failure detected: %d",
3467                      __func__, ret);
3468         return ret;
3469     }
3470
3471     /*
3472      * Last handshake with destination on the resume (destination will
3473      * switch to postcopy-active afterwards)
3474      */
3475     ret = postcopy_resume_handshake(s);
3476     if (ret) {
3477         error_report("%s: handshake failed: %d", __func__, ret);
3478         return ret;
3479     }
3480
3481     return 0;
3482 }
3483
3484 /*
3485  * We don't return until we are in a safe state to continue current
3486  * postcopy migration.  Returns MIG_THR_ERR_RECOVERED if recovered, or
3487  * MIG_THR_ERR_FATAL if an unrecoverable failure happened.
3488  */
3489 static MigThrError postcopy_pause(MigrationState *s)
3490 {
3491     assert(s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE);
3492
3493     while (true) {
3494         QEMUFile *file;
3495
3496         /*
3497          * The current channel is possibly broken. Release it.  Note that this
3498          * is guaranteed even without the lock because to_dst_file should only
3499          * be modified by the migration thread.  That also guarantees that the
3500          * unregister of yank is safe too without the lock.  It should be safe
3501          * even to be within the qemu_file_lock, but we didn't do that to avoid
3502          * taking another mutex (yank_lock) within qemu_file_lock.  TL;DR: we
3503          * make the qemu_file_lock critical section as small as possible.
3504          */
3505         assert(s->to_dst_file);
3506         migration_ioc_unregister_yank_from_file(s->to_dst_file);
3507         qemu_mutex_lock(&s->qemu_file_lock);
3508         file = s->to_dst_file;
3509         s->to_dst_file = NULL;
3510         qemu_mutex_unlock(&s->qemu_file_lock);
3511
3512         qemu_file_shutdown(file);
3513         qemu_fclose(file);
3514
3515         /*
3516          * Do the same to the postcopy fast path socket too, if there is
3517          * one.  No locking is needed because there is no racer as long as
3518          * we do this before setting the status to paused.
3519          */
3520         if (s->postcopy_qemufile_src) {
3521             migration_ioc_unregister_yank_from_file(s->postcopy_qemufile_src);
3522             qemu_file_shutdown(s->postcopy_qemufile_src);
3523             qemu_fclose(s->postcopy_qemufile_src);
3524             s->postcopy_qemufile_src = NULL;
3525         }
3526
3527         migrate_set_state(&s->state, s->state,
3528                           MIGRATION_STATUS_POSTCOPY_PAUSED);
3529
3530         error_report("Detected IO failure for postcopy. "
3531                      "Migration paused.");
3532
3533         /*
3534          * We wait until things are fixed up.  Then someone will set the
3535          * status back for us.
3536          */
3537         while (s->state == MIGRATION_STATUS_POSTCOPY_PAUSED) {
3538             qemu_sem_wait(&s->postcopy_pause_sem);
3539         }
3540
3541         if (s->state == MIGRATION_STATUS_POSTCOPY_RECOVER) {
3542             /* Woken up by a recover procedure. Give it a shot */
3543
3544             if (postcopy_preempt_wait_channel(s)) {
3545                 /*
3546                  * Preempt enabled, and new channel create failed; loop
3547                  * back to wait for another recovery.
3548                  */
3549                 continue;
3550             }
3551
3552             /*
3553              * Firstly, let's wake up the return path now, with a new
3554              * return path channel.
3555              */
3556             qemu_sem_post(&s->postcopy_pause_rp_sem);
3557
3558             /* Do the resume logic */
3559             if (postcopy_do_resume(s) == 0) {
3560                 /* Let's continue! */
3561                 trace_postcopy_pause_continued();
3562                 return MIG_THR_ERR_RECOVERED;
3563             } else {
3564                 /*
3565                  * Something wrong happened during the recovery, let's
3566                  * pause again. Pause is always better than throwing
3567                  * data away.
3568                  */
3569                 continue;
3570             }
3571         } else {
3572             /* This is not right... Time to quit. */
3573             return MIG_THR_ERR_FATAL;
3574         }
3575     }
3576 }
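
/*
 * Illustrative recovery flow for the pause loop above (a sketch; URIs are
 * placeholders): once the source sits in postcopy-paused, the destination
 * is given a fresh listening address with
 *   {"execute": "migrate-recover", "arguments": {"uri": "tcp:0:5556"}}
 * and the source is reconnected with
 *   {"execute": "migrate", "arguments": {"uri": "tcp:DST:5556",
 *                                        "resume": true}}
 * which reaches migrate_fd_connect() below and posts postcopy_pause_sem.
 */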
3577
3578 static MigThrError migration_detect_error(MigrationState *s)
3579 {
3580     int ret;
3581     int state = s->state;
3582     Error *local_error = NULL;
3583
3584     if (state == MIGRATION_STATUS_CANCELLING ||
3585         state == MIGRATION_STATUS_CANCELLED) {
3586         /* End the migration, but don't set the state to failed */
3587         return MIG_THR_ERR_FATAL;
3588     }
3589
3590     /*
3591      * Try to detect any file errors.  Note that postcopy_qemufile_src will
3592      * be NULL when postcopy preempt is not enabled.
3593      */
3594     ret = qemu_file_get_error_obj_any(s->to_dst_file,
3595                                       s->postcopy_qemufile_src,
3596                                       &local_error);
3597     if (!ret) {
3598         /* Everything is fine */
3599         assert(!local_error);
3600         return MIG_THR_ERR_NONE;
3601     }
3602
3603     if (local_error) {
3604         migrate_set_error(s, local_error);
3605         error_free(local_error);
3606     }
3607
3608     if (state == MIGRATION_STATUS_POSTCOPY_ACTIVE && ret) {
3609         /*
3610          * For postcopy, we allow the network to be down for a
3611          * while. After that, it can be continued by a
3612          * recovery phase.
3613          */
3614         return postcopy_pause(s);
3615     } else {
3616         /*
3617          * For precopy (or postcopy with an error outside IO), we fail
3618          * immediately.
3619          */
3620         migrate_set_state(&s->state, state, MIGRATION_STATUS_FAILED);
3621         trace_migration_thread_file_err();
3622
3623         /* Time to stop the migration, now. */
3624         return MIG_THR_ERR_FATAL;
3625     }
3626 }
3627
3628 /* How many bytes have we transferred since the beginning of the migration */
3629 static uint64_t migration_total_bytes(MigrationState *s)
3630 {
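    /*
     * Multifd traffic flows over its own channels, so it is accounted in
     * ram_counters.multifd_bytes and added to the main stream's count here.
     */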
3631     return qemu_file_total_transferred(s->to_dst_file) +
3632         ram_counters.multifd_bytes;
3633 }
3634
3635 static void migration_calculate_complete(MigrationState *s)
3636 {
3637     uint64_t bytes = migration_total_bytes(s);
3638     int64_t end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
3639     int64_t transfer_time;
3640
3641     s->total_time = end_time - s->start_time;
3642     if (!s->downtime) {
3643         /*
3644          * It's still not set, so this is a precopy migration.  For
3645          * postcopy, downtime is calculated during postcopy_start().
3646          */
3647         s->downtime = end_time - s->downtime_start;
3648     }
3649
3650     transfer_time = s->total_time - s->setup_time;
3651     if (transfer_time) {
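        /*
         * bytes * 8 / transfer_time (in ms) gives bits per millisecond;
         * dividing by a further 1000 yields Mbit/s.  Worked example with
         * illustrative numbers: 1 GiB in 10 s is ~859 Mbit/s.
         */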
3652         s->mbps = ((double) bytes * 8.0) / transfer_time / 1000;
3653     }
3654 }
3655
3656 static void update_iteration_initial_status(MigrationState *s)
3657 {
3658     /*
3659      * Update these three fields at the same time so that mismatched info
3660      * does not lead to a wrong speed calculation.
3661      */
3662     s->iteration_start_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
3663     s->iteration_initial_bytes = migration_total_bytes(s);
3664     s->iteration_initial_pages = ram_get_total_transferred_pages();
3665 }
3666
3667 static void migration_update_counters(MigrationState *s,
3668                                       int64_t current_time)
3669 {
3670     uint64_t transferred, transferred_pages, time_spent;
3671     uint64_t current_bytes; /* bytes transferred since the beginning */
3672     double bandwidth;
3673
3674     if (current_time < s->iteration_start_time + BUFFER_DELAY) {
3675         return;
3676     }
3677
3678     current_bytes = migration_total_bytes(s);
3679     transferred = current_bytes - s->iteration_initial_bytes;
3680     time_spent = current_time - s->iteration_start_time;
3681     bandwidth = (double)transferred / time_spent;
3682     s->threshold_size = bandwidth * s->parameters.downtime_limit;
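    /*
     * Worked example for the computation above (illustrative numbers):
     * 50 MiB transferred over a 100 ms window gives ~0.5 MiB/ms; with the
     * default downtime_limit of 300 ms that makes threshold_size ~150 MiB,
     * i.e. switchover becomes possible once the estimated remaining data
     * fits in the allowed downtime.
     */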
3683
3684     s->mbps = (((double) transferred * 8.0) /
3685                ((double) time_spent / 1000.0)) / 1000.0 / 1000.0;
3686
3687     transferred_pages = ram_get_total_transferred_pages() -
3688                             s->iteration_initial_pages;
3689     s->pages_per_second = (double) transferred_pages /
3690                              (((double) time_spent / 1000.0));
3691
3692     /*
3693      * If we haven't sent anything, we don't want to
3694      * recalculate; 10000 bytes is a small enough number for our purposes.
3695      */
3696     if (ram_counters.dirty_pages_rate && transferred > 10000) {
3697         s->expected_downtime = ram_counters.remaining / bandwidth;
3698     }
3699
3700     qemu_file_reset_rate_limit(s->to_dst_file);
3701
3702     update_iteration_initial_status(s);
3703
3704     trace_migrate_transferred(transferred, time_spent,
3705                               bandwidth, s->threshold_size);
3706 }
3707
3708 /* Migration thread iteration status */
3709 typedef enum {
3710     MIG_ITERATE_RESUME,         /* Resume current iteration */
3711     MIG_ITERATE_SKIP,           /* Skip current iteration */
3712     MIG_ITERATE_BREAK,          /* Break the loop */
3713 } MigIterateState;
3714
3715 /*
3716  * Returns MIG_ITERATE_RESUME to continue with the next iteration,
3717  * MIG_ITERATE_SKIP to skip it, or MIG_ITERATE_BREAK to break the loop.
3718  */
3719 static MigIterateState migration_iteration_run(MigrationState *s)
3720 {
3721     uint64_t pending_size, pend_pre, pend_compat, pend_post;
3722     bool in_postcopy = s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE;
3723
3724     qemu_savevm_state_pending(s->to_dst_file, s->threshold_size, &pend_pre,
3725                               &pend_compat, &pend_post);
3726     pending_size = pend_pre + pend_compat + pend_post;
3727
3728     trace_migrate_pending(pending_size, s->threshold_size,
3729                           pend_pre, pend_compat, pend_post);
3730
3731     if (pending_size && pending_size >= s->threshold_size) {
3732         /* Still a significant amount to transfer */
3733         if (!in_postcopy && pend_pre <= s->threshold_size &&
3734             qatomic_read(&s->start_postcopy)) {
3735             if (postcopy_start(s)) {
3736                 error_report("%s: postcopy failed to start", __func__);
3737             }
3738             return MIG_ITERATE_SKIP;
3739         }
3740         /* Just another iteration step */
3741         qemu_savevm_state_iterate(s->to_dst_file, in_postcopy);
3742     } else {
3743         trace_migration_thread_low_pending(pending_size);
3744         migration_completion(s);
3745         return MIG_ITERATE_BREAK;
3746     }
3747
3748     return MIG_ITERATE_RESUME;
3749 }
3750
3751 static void migration_iteration_finish(MigrationState *s)
3752 {
3753     /* If we enabled cpu throttling for auto-converge, turn it off. */
3754     cpu_throttle_stop();
3755
3756     qemu_mutex_lock_iothread();
3757     switch (s->state) {
3758     case MIGRATION_STATUS_COMPLETED:
3759         migration_calculate_complete(s);
3760         runstate_set(RUN_STATE_POSTMIGRATE);
3761         break;
3762     case MIGRATION_STATUS_COLO:
3763         if (!migrate_colo_enabled()) {
3764             error_report("%s: critical error: calling COLO code without "
3765                          "COLO enabled", __func__);
3766         }
3767         migrate_start_colo_process(s);
3768         s->vm_was_running = true;
3769         /* Fallthrough */
3770     case MIGRATION_STATUS_FAILED:
3771     case MIGRATION_STATUS_CANCELLED:
3772     case MIGRATION_STATUS_CANCELLING:
3773         if (s->vm_was_running) {
3774             if (!runstate_check(RUN_STATE_SHUTDOWN)) {
3775                 vm_start();
3776             }
3777         } else {
3778             if (runstate_check(RUN_STATE_FINISH_MIGRATE)) {
3779                 runstate_set(RUN_STATE_POSTMIGRATE);
3780             }
3781         }
3782         break;
3783
3784     default:
3785         /* Should not reach here, but if so, forgive the VM. */
3786         error_report("%s: Unknown ending state %d", __func__, s->state);
3787         break;
3788     }
3789     migrate_fd_cleanup_schedule(s);
3790     qemu_mutex_unlock_iothread();
3791 }
3792
3793 static void bg_migration_iteration_finish(MigrationState *s)
3794 {
3795     qemu_mutex_lock_iothread();
3796     switch (s->state) {
3797     case MIGRATION_STATUS_COMPLETED:
3798         migration_calculate_complete(s);
3799         break;
3800
3801     case MIGRATION_STATUS_ACTIVE:
3802     case MIGRATION_STATUS_FAILED:
3803     case MIGRATION_STATUS_CANCELLED:
3804     case MIGRATION_STATUS_CANCELLING:
3805         break;
3806
3807     default:
3808         /* Should not reach here, but if so, forgive the VM. */
3809         error_report("%s: Unknown ending state %d", __func__, s->state);
3810         break;
3811     }
3812
3813     migrate_fd_cleanup_schedule(s);
3814     qemu_mutex_unlock_iothread();
3815 }
3816
3817 /*
3818  * Returns MIG_ITERATE_RESUME to continue with the next iteration, or
3819  * MIG_ITERATE_BREAK to break the loop once saving has completed.
3820  */
3821 static MigIterateState bg_migration_iteration_run(MigrationState *s)
3822 {
3823     int res;
3824
3825     res = qemu_savevm_state_iterate(s->to_dst_file, false);
3826     if (res > 0) {
3827         bg_migration_completion(s);
3828         return MIG_ITERATE_BREAK;
3829     }
3830
3831     return MIG_ITERATE_RESUME;
3832 }
3833
3834 void migration_make_urgent_request(void)
3835 {
3836     qemu_sem_post(&migrate_get_current()->rate_limit_sem);
3837 }
3838
3839 void migration_consume_urgent_request(void)
3840 {
3841     qemu_sem_wait(&migrate_get_current()->rate_limit_sem);
3842 }
3843
3844 /* Returns true if the rate limiting was broken by an urgent request */
3845 bool migration_rate_limit(void)
3846 {
3847     int64_t now = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
3848     MigrationState *s = migrate_get_current();
3849
3850     bool urgent = false;
3851     migration_update_counters(s, now);
3852     if (qemu_file_rate_limit(s->to_dst_file)) {
3853
3854         if (qemu_file_get_error(s->to_dst_file)) {
3855             return false;
3856         }
3857         /*
3858          * Wait for a delay to do rate limiting OR
3859          * something urgent to post the semaphore.
3860          */
3861         int ms = s->iteration_start_time + BUFFER_DELAY - now;
3862         trace_migration_rate_limit_pre(ms);
3863         if (qemu_sem_timedwait(&s->rate_limit_sem, ms) == 0) {
3864             /*
3865              * We were woken by one or more urgent things but
3866              * the timedwait will have consumed one of them.
3867              * The service routine for the urgent wake will dec
3868              * the semaphore itself for each item it consumes,
3869              * so add this one we just eat back.
3870              */
3871             qemu_sem_post(&s->rate_limit_sem);
3872             urgent = true;
3873         }
3874         trace_migration_rate_limit_post(urgent);
3875     }
3876     return urgent;
3877 }
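
/*
 * A note on the window arithmetic used above: BUFFER_DELAY is 100 ms, so
 * XFER_LIMIT_RATIO is 10 and the limit handed to qemu_file_set_rate_limit()
 * in migrate_fd_connect() is max_bandwidth / 10, i.e. the byte budget of
 * one 100 ms window.  E.g. a 125 MB/s cap allows ~12.5 MB per window.
 */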
3878
3879 /*
3880  * If failover devices are present, wait until they are completely
3881  * unplugged.
3882  */
3883
3884 static void qemu_savevm_wait_unplug(MigrationState *s, int old_state,
3885                                     int new_state)
3886 {
3887     if (qemu_savevm_state_guest_unplug_pending()) {
3888         migrate_set_state(&s->state, old_state, MIGRATION_STATUS_WAIT_UNPLUG);
3889
3890         while (s->state == MIGRATION_STATUS_WAIT_UNPLUG &&
3891                qemu_savevm_state_guest_unplug_pending()) {
3892             qemu_sem_timedwait(&s->wait_unplug_sem, 250);
3893         }
3894         if (s->state != MIGRATION_STATUS_WAIT_UNPLUG) {
3895             int timeout = 120; /* 30 seconds */
3896             /*
3897              * The migration has been cancelled, but as we have
3898              * started an unplug we must wait for it to finish in
3899              * order to be able to plug the card back.
3900              */
3901             while (timeout-- && qemu_savevm_state_guest_unplug_pending()) {
3902                 qemu_sem_timedwait(&s->wait_unplug_sem, 250);
3903             }
3904             if (qemu_savevm_state_guest_unplug_pending() &&
3905                 !qtest_enabled()) {
3906                 warn_report("migration: partially unplugged device on "
3907                             "failure");
3908             }
3909         }
3910
3911         migrate_set_state(&s->state, MIGRATION_STATUS_WAIT_UNPLUG, new_state);
3912     } else {
3913         migrate_set_state(&s->state, old_state, new_state);
3914     }
3915 }
3916
3917 /*
3918  * Master migration thread on the source VM.
3919  * It drives the migration and pumps the data down the outgoing channel.
3920  */
3921 static void *migration_thread(void *opaque)
3922 {
3923     MigrationState *s = opaque;
3924     int64_t setup_start = qemu_clock_get_ms(QEMU_CLOCK_HOST);
3925     MigThrError thr_error;
3926     bool urgent = false;
3927
3928     rcu_register_thread();
3929
3930     object_ref(OBJECT(s));
3931     update_iteration_initial_status(s);
3932
3933     qemu_savevm_state_header(s->to_dst_file);
3934
3935     /*
3936      * If we opened the return path, we need to make sure dst has it
3937      * opened as well.
3938      */
3939     if (s->rp_state.rp_thread_created) {
3940         /* Now tell the dest that it should open its end so it can reply */
3941         qemu_savevm_send_open_return_path(s->to_dst_file);
3942
3943         /* And do a ping that will make stuff easier to debug */
3944         qemu_savevm_send_ping(s->to_dst_file, 1);
3945     }
3946
3947     if (migrate_postcopy()) {
3948         /*
3949          * Tell the destination that we *might* want to do postcopy later;
3950          * if the other end can't do postcopy it should fail now, nice and
3951          * early.
3952          */
3953         qemu_savevm_send_postcopy_advise(s->to_dst_file);
3954     }
3955
3956     if (migrate_colo_enabled()) {
3957         /* Notify migration destination that we enable COLO */
3958         qemu_savevm_send_colo_enable(s->to_dst_file);
3959     }
3960
3961     qemu_savevm_state_setup(s->to_dst_file);
3962
3963     qemu_savevm_wait_unplug(s, MIGRATION_STATUS_SETUP,
3964                                MIGRATION_STATUS_ACTIVE);
3965
3966     s->setup_time = qemu_clock_get_ms(QEMU_CLOCK_HOST) - setup_start;
3967
3968     trace_migration_thread_setup_complete();
3969
3970     while (migration_is_active(s)) {
3971         if (urgent || !qemu_file_rate_limit(s->to_dst_file)) {
3972             MigIterateState iter_state = migration_iteration_run(s);
3973             if (iter_state == MIG_ITERATE_SKIP) {
3974                 continue;
3975             } else if (iter_state == MIG_ITERATE_BREAK) {
3976                 break;
3977             }
3978         }
3979
3980         /*
3981          * Try to detect any kind of failures, and see whether we
3982          * should stop the migration now.
3983          */
3984         thr_error = migration_detect_error(s);
3985         if (thr_error == MIG_THR_ERR_FATAL) {
3986             /* Stop migration */
3987             break;
3988         } else if (thr_error == MIG_THR_ERR_RECOVERED) {
3989             /*
3990              * Just recovered from an error (e.g. a network failure); reset
3991              * all the local variables. This is important to avoid breaking
3992              * the transferred_bytes and bandwidth calculations.
3993              */
3994             update_iteration_initial_status(s);
3995         }
3996
3997         urgent = migration_rate_limit();
3998     }
3999
4000     trace_migration_thread_after_loop();
4001     migration_iteration_finish(s);
4002     object_unref(OBJECT(s));
4003     rcu_unregister_thread();
4004     return NULL;
4005 }
4006
4007 static void bg_migration_vm_start_bh(void *opaque)
4008 {
4009     MigrationState *s = opaque;
4010
4011     qemu_bh_delete(s->vm_start_bh);
4012     s->vm_start_bh = NULL;
4013
4014     vm_start();
4015     s->downtime = qemu_clock_get_ms(QEMU_CLOCK_REALTIME) - s->downtime_start;
4016 }
4017
4018 /**
4019  * Background snapshot thread, based on live migration code.
4020  * This is an alternative implementation of the live migration mechanism,
4021  * introduced specifically to support background snapshots.
4022  *
4023  * It takes advantage of the userfault_fd write protection mechanism
4024  * introduced in the v5.7 kernel. Compared to the existing dirty page
4025  * logging migration, much less stream traffic is produced, resulting in
4026  * smaller snapshot images, simply because no duplicate pages can get
4027  * into the stream.
4028  *
4029  * Another key point is that the generated vmstate stream reflects the
4030  * machine state 'frozen' at the start of snapshot creation, whereas with
4031  * dirty page logging the saved snapshot is the VM state at the end.
4032  */
4033 static void *bg_migration_thread(void *opaque)
4034 {
4035     MigrationState *s = opaque;
4036     int64_t setup_start;
4037     MigThrError thr_error;
4038     QEMUFile *fb;
4039     bool early_fail = true;
4040
4041     rcu_register_thread();
4042     object_ref(OBJECT(s));
4043
4044     qemu_file_set_rate_limit(s->to_dst_file, INT64_MAX);
4045
4046     setup_start = qemu_clock_get_ms(QEMU_CLOCK_HOST);
4047     /*
4048      * We want to save vmstate for the moment when migration has been
4049      * initiated but also we want to save RAM content while VM is running.
4050      * The RAM content should appear first in the vmstate. So, we first
4051      * stash the non-RAM part of the vmstate to the temporary buffer,
4052      * then write RAM part of the vmstate to the migration stream
4053      * with vCPUs running and, finally, write stashed non-RAM part of
4054      * the vmstate from the buffer to the migration stream.
4055      */
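    /* Resulting stream layout (sketch): [header][RAM pages][device state] */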
4056     s->bioc = qio_channel_buffer_new(512 * 1024);
4057     qio_channel_set_name(QIO_CHANNEL(s->bioc), "vmstate-buffer");
4058     fb = qemu_file_new_output(QIO_CHANNEL(s->bioc));
4059     object_unref(OBJECT(s->bioc));
4060
4061     update_iteration_initial_status(s);
4062
4063     /*
4064      * Prepare for tracking memory writes with UFFD-WP - populate
4065      * RAM pages before protecting.
4066      */
4067 #ifdef __linux__
4068     ram_write_tracking_prepare();
4069 #endif
4070
4071     qemu_savevm_state_header(s->to_dst_file);
4072     qemu_savevm_state_setup(s->to_dst_file);
4073
4074     qemu_savevm_wait_unplug(s, MIGRATION_STATUS_SETUP,
4075                                MIGRATION_STATUS_ACTIVE);
4076
4077     s->setup_time = qemu_clock_get_ms(QEMU_CLOCK_HOST) - setup_start;
4078
4079     trace_migration_thread_setup_complete();
4080     s->downtime_start = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
4081
4082     qemu_mutex_lock_iothread();
4083
4084     /*
4085      * If the VM is currently in a suspended state, then, to make a valid
4086      * runstate transition in vm_stop_force_state(), we need to wake it up.
4087      */
4088     qemu_system_wakeup_request(QEMU_WAKEUP_REASON_OTHER, NULL);
4089     s->vm_was_running = runstate_is_running();
4090
4091     if (global_state_store()) {
4092         goto fail;
4093     }
4094     /* Forcibly stop VM before saving state of vCPUs and devices */
4095     if (vm_stop_force_state(RUN_STATE_PAUSED)) {
4096         goto fail;
4097     }
4098     /*
4099      * Put vCPUs in sync with shadow context structures, then
4100      * save their state to channel-buffer along with devices.
4101      */
4102     cpu_synchronize_all_states();
4103     if (qemu_savevm_state_complete_precopy_non_iterable(fb, false, false)) {
4104         goto fail;
4105     }
4106     /*
4107      * Since we are going to get non-iterable state data directly
4108      * from s->bioc->data, an explicit flush is needed here.
4109      */
4110     qemu_fflush(fb);
4111
4112     /* Now initialize UFFD context and start tracking RAM writes */
4113     if (ram_write_tracking_start()) {
4114         goto fail;
4115     }
4116     early_fail = false;
4117
4118     /*
4119      * Start VM from BH handler to avoid write-fault lock here.
4120      * UFFD-WP protection for the whole RAM is already enabled so
4121      * calling VM state change notifiers from vm_start() would initiate
4122      * writes to virtio VQs memory which is in write-protected region.
4123      */
4124     s->vm_start_bh = qemu_bh_new(bg_migration_vm_start_bh, s);
4125     qemu_bh_schedule(s->vm_start_bh);
4126
4127     qemu_mutex_unlock_iothread();
4128
4129     while (migration_is_active(s)) {
4130         MigIterateState iter_state = bg_migration_iteration_run(s);
4131         if (iter_state == MIG_ITERATE_SKIP) {
4132             continue;
4133         } else if (iter_state == MIG_ITERATE_BREAK) {
4134             break;
4135         }
4136
4137         /*
4138          * Try to detect any kind of failures, and see whether we
4139          * should stop the migration now.
4140          */
4141         thr_error = migration_detect_error(s);
4142         if (thr_error == MIG_THR_ERR_FATAL) {
4143             /* Stop migration */
4144             break;
4145         }
4146
4147         migration_update_counters(s, qemu_clock_get_ms(QEMU_CLOCK_REALTIME));
4148     }
4149
4150     trace_migration_thread_after_loop();
4151
4152 fail:
4153     if (early_fail) {
4154         migrate_set_state(&s->state, MIGRATION_STATUS_ACTIVE,
4155                 MIGRATION_STATUS_FAILED);
4156         qemu_mutex_unlock_iothread();
4157     }
4158
4159     bg_migration_iteration_finish(s);
4160
4161     qemu_fclose(fb);
4162     object_unref(OBJECT(s));
4163     rcu_unregister_thread();
4164
4165     return NULL;
4166 }
4167
4168 void migrate_fd_connect(MigrationState *s, Error *error_in)
4169 {
4170     Error *local_err = NULL;
4171     int64_t rate_limit;
4172     bool resume = s->state == MIGRATION_STATUS_POSTCOPY_PAUSED;
4173
4174     /*
4175      * If there's a previous error, free it and prepare for another one.
4176      * Meanwhile if migration completes successfully, there won't be an
4177      * error dumped when calling migrate_fd_cleanup().
4178      */
4179     migrate_error_free(s);
4180
4181     s->expected_downtime = s->parameters.downtime_limit;
4182     if (resume) {
4183         assert(s->cleanup_bh);
4184     } else {
4185         assert(!s->cleanup_bh);
4186         s->cleanup_bh = qemu_bh_new(migrate_fd_cleanup_bh, s);
4187     }
4188     if (error_in) {
4189         migrate_fd_error(s, error_in);
4190         if (resume) {
4191             /*
4192              * Don't do cleanup for resume if the channel is invalid; only dump
4193              * the error.  We wait for another channel connect from the user.
4194              * The error_report still gives the HMP user a hint on what failed.
4195              * It's normally done in migrate_fd_cleanup(), but call it here
4196              * explicitly.
4197              */
4198             error_report_err(error_copy(s->error));
4199         } else {
4200             migrate_fd_cleanup(s);
4201         }
4202         return;
4203     }
4204
4205     if (resume) {
4206         /* This is a resumed migration */
4207         rate_limit = s->parameters.max_postcopy_bandwidth /
4208             XFER_LIMIT_RATIO;
4209     } else {
4210         /* This is a fresh new migration */
4211         rate_limit = s->parameters.max_bandwidth / XFER_LIMIT_RATIO;
4212
4213         /* Notify before starting migration thread */
4214         notifier_list_notify(&migration_state_notifiers, s);
4215     }
4216
4217     qemu_file_set_rate_limit(s->to_dst_file, rate_limit);
4218     qemu_file_set_blocking(s->to_dst_file, true);
4219
4220     /*
4221      * Open the return path. For postcopy, it is used exclusively. For
4222      * precopy, QEMU uses the return path only if the user specified
4223      * the "return-path" capability.
4224      */
4225     if (migrate_postcopy_ram() || migrate_use_return_path()) {
4226         if (open_return_path_on_source(s, !resume)) {
4227             error_report("Unable to open return-path for postcopy");
4228             migrate_set_state(&s->state, s->state, MIGRATION_STATUS_FAILED);
4229             migrate_fd_cleanup(s);
4230             return;
4231         }
4232     }
4233
4234     /* This needs to be done before resuming a postcopy */
4235     if (postcopy_preempt_setup(s, &local_err)) {
4236         error_report_err(local_err);
4237         migrate_set_state(&s->state, MIGRATION_STATUS_SETUP,
4238                           MIGRATION_STATUS_FAILED);
4239         migrate_fd_cleanup(s);
4240         return;
4241     }
4242
4243     if (resume) {
4244         /* Wakeup the main migration thread to do the recovery */
4245         migrate_set_state(&s->state, MIGRATION_STATUS_POSTCOPY_PAUSED,
4246                           MIGRATION_STATUS_POSTCOPY_RECOVER);
4247         qemu_sem_post(&s->postcopy_pause_sem);
4248         return;
4249     }
4250
4251     if (multifd_save_setup(&local_err) != 0) {
4252         error_report_err(local_err);
4253         migrate_set_state(&s->state, MIGRATION_STATUS_SETUP,
4254                           MIGRATION_STATUS_FAILED);
4255         migrate_fd_cleanup(s);
4256         return;
4257     }
4258
4259     if (migrate_background_snapshot()) {
4260         qemu_thread_create(&s->thread, "bg_snapshot",
4261                 bg_migration_thread, s, QEMU_THREAD_JOINABLE);
4262     } else {
4263         qemu_thread_create(&s->thread, "live_migration",
4264                 migration_thread, s, QEMU_THREAD_JOINABLE);
4265     }
4266     s->migration_thread_running = true;
4267 }
4268
4269 void migration_global_dump(Monitor *mon)
4270 {
4271     MigrationState *ms = migrate_get_current();
4272
4273     monitor_printf(mon, "globals:\n");
4274     monitor_printf(mon, "store-global-state: %s\n",
4275                    ms->store_global_state ? "on" : "off");
4276     monitor_printf(mon, "only-migratable: %s\n",
4277                    only_migratable ? "on" : "off");
4278     monitor_printf(mon, "send-configuration: %s\n",
4279                    ms->send_configuration ? "on" : "off");
4280     monitor_printf(mon, "send-section-footer: %s\n",
4281                    ms->send_section_footer ? "on" : "off");
4282     monitor_printf(mon, "decompress-error-check: %s\n",
4283                    ms->decompress_error_check ? "on" : "off");
4284     monitor_printf(mon, "clear-bitmap-shift: %u\n",
4285                    ms->clear_bitmap_shift);
4286 }
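
/*
 * These globals correspond to the properties defined below; they can be
 * tweaked on the command line, e.g. (illustrative):
 *   -global migration.send-configuration=off
 */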
4287
4288 #define DEFINE_PROP_MIG_CAP(name, x)             \
4289     DEFINE_PROP_BOOL(name, MigrationState, enabled_capabilities[x], false)
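
/*
 * For example, DEFINE_PROP_MIG_CAP("x-colo", MIGRATION_CAPABILITY_X_COLO)
 * expands to a bool property backed by
 * enabled_capabilities[MIGRATION_CAPABILITY_X_COLO], defaulting to false.
 */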
4290
4291 static Property migration_properties[] = {
4292     DEFINE_PROP_BOOL("store-global-state", MigrationState,
4293                      store_global_state, true),
4294     DEFINE_PROP_BOOL("send-configuration", MigrationState,
4295                      send_configuration, true),
4296     DEFINE_PROP_BOOL("send-section-footer", MigrationState,
4297                      send_section_footer, true),
4298     DEFINE_PROP_BOOL("decompress-error-check", MigrationState,
4299                       decompress_error_check, true),
4300     DEFINE_PROP_UINT8("x-clear-bitmap-shift", MigrationState,
4301                       clear_bitmap_shift, CLEAR_BITMAP_SHIFT_DEFAULT),
4302
4303     /* Migration parameters */
4304     DEFINE_PROP_UINT8("x-compress-level", MigrationState,
4305                       parameters.compress_level,
4306                       DEFAULT_MIGRATE_COMPRESS_LEVEL),
4307     DEFINE_PROP_UINT8("x-compress-threads", MigrationState,
4308                       parameters.compress_threads,
4309                       DEFAULT_MIGRATE_COMPRESS_THREAD_COUNT),
4310     DEFINE_PROP_BOOL("x-compress-wait-thread", MigrationState,
4311                       parameters.compress_wait_thread, true),
4312     DEFINE_PROP_UINT8("x-decompress-threads", MigrationState,
4313                       parameters.decompress_threads,
4314                       DEFAULT_MIGRATE_DECOMPRESS_THREAD_COUNT),
4315     DEFINE_PROP_UINT8("x-throttle-trigger-threshold", MigrationState,
4316                       parameters.throttle_trigger_threshold,
4317                       DEFAULT_MIGRATE_THROTTLE_TRIGGER_THRESHOLD),
4318     DEFINE_PROP_UINT8("x-cpu-throttle-initial", MigrationState,
4319                       parameters.cpu_throttle_initial,
4320                       DEFAULT_MIGRATE_CPU_THROTTLE_INITIAL),
4321     DEFINE_PROP_UINT8("x-cpu-throttle-increment", MigrationState,
4322                       parameters.cpu_throttle_increment,
4323                       DEFAULT_MIGRATE_CPU_THROTTLE_INCREMENT),
4324     DEFINE_PROP_BOOL("x-cpu-throttle-tailslow", MigrationState,
4325                       parameters.cpu_throttle_tailslow, false),
4326     DEFINE_PROP_SIZE("x-max-bandwidth", MigrationState,
4327                       parameters.max_bandwidth, MAX_THROTTLE),
4328     DEFINE_PROP_UINT64("x-downtime-limit", MigrationState,
4329                       parameters.downtime_limit,
4330                       DEFAULT_MIGRATE_SET_DOWNTIME),
4331     DEFINE_PROP_UINT32("x-checkpoint-delay", MigrationState,
4332                       parameters.x_checkpoint_delay,
4333                       DEFAULT_MIGRATE_X_CHECKPOINT_DELAY),
4334     DEFINE_PROP_UINT8("multifd-channels", MigrationState,
4335                       parameters.multifd_channels,
4336                       DEFAULT_MIGRATE_MULTIFD_CHANNELS),
4337     DEFINE_PROP_MULTIFD_COMPRESSION("multifd-compression", MigrationState,
4338                       parameters.multifd_compression,
4339                       DEFAULT_MIGRATE_MULTIFD_COMPRESSION),
4340     DEFINE_PROP_UINT8("multifd-zlib-level", MigrationState,
4341                       parameters.multifd_zlib_level,
4342                       DEFAULT_MIGRATE_MULTIFD_ZLIB_LEVEL),
4343     DEFINE_PROP_UINT8("multifd-zstd-level", MigrationState,
4344                       parameters.multifd_zstd_level,
4345                       DEFAULT_MIGRATE_MULTIFD_ZSTD_LEVEL),
4346     DEFINE_PROP_SIZE("xbzrle-cache-size", MigrationState,
4347                       parameters.xbzrle_cache_size,
4348                       DEFAULT_MIGRATE_XBZRLE_CACHE_SIZE),
4349     DEFINE_PROP_SIZE("max-postcopy-bandwidth", MigrationState,
4350                       parameters.max_postcopy_bandwidth,
4351                       DEFAULT_MIGRATE_MAX_POSTCOPY_BANDWIDTH),
4352     DEFINE_PROP_UINT8("max-cpu-throttle", MigrationState,
4353                       parameters.max_cpu_throttle,
4354                       DEFAULT_MIGRATE_MAX_CPU_THROTTLE),
4355     DEFINE_PROP_SIZE("announce-initial", MigrationState,
4356                       parameters.announce_initial,
4357                       DEFAULT_MIGRATE_ANNOUNCE_INITIAL),
4358     DEFINE_PROP_SIZE("announce-max", MigrationState,
4359                       parameters.announce_max,
4360                       DEFAULT_MIGRATE_ANNOUNCE_MAX),
4361     DEFINE_PROP_SIZE("announce-rounds", MigrationState,
4362                       parameters.announce_rounds,
4363                       DEFAULT_MIGRATE_ANNOUNCE_ROUNDS),
4364     DEFINE_PROP_SIZE("announce-step", MigrationState,
4365                       parameters.announce_step,
4366                       DEFAULT_MIGRATE_ANNOUNCE_STEP),
4367     DEFINE_PROP_BOOL("x-postcopy-preempt-break-huge", MigrationState,
4368                       postcopy_preempt_break_huge, true),
4369     DEFINE_PROP_STRING("tls-creds", MigrationState, parameters.tls_creds),
4370     DEFINE_PROP_STRING("tls-hostname", MigrationState, parameters.tls_hostname),
4371     DEFINE_PROP_STRING("tls-authz", MigrationState, parameters.tls_authz),
4372
4373     /* Migration capabilities */
4374     DEFINE_PROP_MIG_CAP("x-xbzrle", MIGRATION_CAPABILITY_XBZRLE),
4375     DEFINE_PROP_MIG_CAP("x-rdma-pin-all", MIGRATION_CAPABILITY_RDMA_PIN_ALL),
4376     DEFINE_PROP_MIG_CAP("x-auto-converge", MIGRATION_CAPABILITY_AUTO_CONVERGE),
4377     DEFINE_PROP_MIG_CAP("x-zero-blocks", MIGRATION_CAPABILITY_ZERO_BLOCKS),
4378     DEFINE_PROP_MIG_CAP("x-compress", MIGRATION_CAPABILITY_COMPRESS),
4379     DEFINE_PROP_MIG_CAP("x-events", MIGRATION_CAPABILITY_EVENTS),
4380     DEFINE_PROP_MIG_CAP("x-postcopy-ram", MIGRATION_CAPABILITY_POSTCOPY_RAM),
4381     DEFINE_PROP_MIG_CAP("x-postcopy-preempt",
4382                         MIGRATION_CAPABILITY_POSTCOPY_PREEMPT),
4383     DEFINE_PROP_MIG_CAP("x-colo", MIGRATION_CAPABILITY_X_COLO),
4384     DEFINE_PROP_MIG_CAP("x-release-ram", MIGRATION_CAPABILITY_RELEASE_RAM),
4385     DEFINE_PROP_MIG_CAP("x-block", MIGRATION_CAPABILITY_BLOCK),
4386     DEFINE_PROP_MIG_CAP("x-return-path", MIGRATION_CAPABILITY_RETURN_PATH),
4387     DEFINE_PROP_MIG_CAP("x-multifd", MIGRATION_CAPABILITY_MULTIFD),
4388     DEFINE_PROP_MIG_CAP("x-background-snapshot",
4389             MIGRATION_CAPABILITY_BACKGROUND_SNAPSHOT),
4390 #ifdef CONFIG_LINUX
4391     DEFINE_PROP_MIG_CAP("x-zero-copy-send",
4392             MIGRATION_CAPABILITY_ZERO_COPY_SEND),
4393 #endif
4394
4395     DEFINE_PROP_END_OF_LIST(),
4396 };
4397
4398 static void migration_class_init(ObjectClass *klass, void *data)
4399 {
4400     DeviceClass *dc = DEVICE_CLASS(klass);
4401
4402     dc->user_creatable = false;
4403     device_class_set_props(dc, migration_properties);
4404 }
4405
4406 static void migration_instance_finalize(Object *obj)
4407 {
4408     MigrationState *ms = MIGRATION_OBJ(obj);
4409
4410     qemu_mutex_destroy(&ms->error_mutex);
4411     qemu_mutex_destroy(&ms->qemu_file_lock);
4412     qemu_sem_destroy(&ms->wait_unplug_sem);
4413     qemu_sem_destroy(&ms->rate_limit_sem);
4414     qemu_sem_destroy(&ms->pause_sem);
4415     qemu_sem_destroy(&ms->postcopy_pause_sem);
4416     qemu_sem_destroy(&ms->postcopy_pause_rp_sem);
4417     qemu_sem_destroy(&ms->rp_state.rp_sem);
4418     qemu_sem_destroy(&ms->postcopy_qemufile_src_sem);
4419     error_free(ms->error);
4420 }
4421
4422 static void migration_instance_init(Object *obj)
4423 {
4424     MigrationState *ms = MIGRATION_OBJ(obj);
4425     MigrationParameters *params = &ms->parameters;
4426
4427     ms->state = MIGRATION_STATUS_NONE;
4428     ms->mbps = -1;
4429     ms->pages_per_second = -1;
4430     qemu_sem_init(&ms->pause_sem, 0);
4431     qemu_mutex_init(&ms->error_mutex);
4432
4433     params->tls_hostname = g_strdup("");
4434     params->tls_creds = g_strdup("");
4435
4436     /* Set has_* up only for parameter checks */
4437     params->has_compress_level = true;
4438     params->has_compress_threads = true;
4439     params->has_decompress_threads = true;
4440     params->has_throttle_trigger_threshold = true;
4441     params->has_cpu_throttle_initial = true;
4442     params->has_cpu_throttle_increment = true;
4443     params->has_cpu_throttle_tailslow = true;
4444     params->has_max_bandwidth = true;
4445     params->has_downtime_limit = true;
4446     params->has_x_checkpoint_delay = true;
4447     params->has_block_incremental = true;
4448     params->has_multifd_channels = true;
4449     params->has_multifd_compression = true;
4450     params->has_multifd_zlib_level = true;
4451     params->has_multifd_zstd_level = true;
4452     params->has_xbzrle_cache_size = true;
4453     params->has_max_postcopy_bandwidth = true;
4454     params->has_max_cpu_throttle = true;
4455     params->has_announce_initial = true;
4456     params->has_announce_max = true;
4457     params->has_announce_rounds = true;
4458     params->has_announce_step = true;
4459
4460     qemu_sem_init(&ms->postcopy_pause_sem, 0);
4461     qemu_sem_init(&ms->postcopy_pause_rp_sem, 0);
4462     qemu_sem_init(&ms->rp_state.rp_sem, 0);
4463     qemu_sem_init(&ms->rate_limit_sem, 0);
4464     qemu_sem_init(&ms->wait_unplug_sem, 0);
4465     qemu_sem_init(&ms->postcopy_qemufile_src_sem, 0);
4466     qemu_mutex_init(&ms->qemu_file_lock);
4467 }
4468
4469 /*
4470  * Return true if the check passes, false otherwise. The error will be
4471  * put inside errp if provided.
4472  */
4473 static bool migration_object_check(MigrationState *ms, Error **errp)
4474 {
4475     MigrationCapabilityStatusList *head = NULL;
4476     /* Assuming all off */
4477     bool cap_list[MIGRATION_CAPABILITY__MAX] = { 0 }, ret;
4478     int i;
4479
4480     if (!migrate_params_check(&ms->parameters, errp)) {
4481         return false;
4482     }
4483
4484     for (i = 0; i < MIGRATION_CAPABILITY__MAX; i++) {
4485         if (ms->enabled_capabilities[i]) {
4486             QAPI_LIST_PREPEND(head, migrate_cap_add(i, true));
4487         }
4488     }
4489
4490     ret = migrate_caps_check(cap_list, head, errp);
4491
4492     /* It works with head == NULL */
4493     qapi_free_MigrationCapabilityStatusList(head);
4494
4495     return ret;
4496 }
4497
4498 static const TypeInfo migration_type = {
4499     .name = TYPE_MIGRATION,
4500     /*
4501      * NOTE: TYPE_MIGRATION is not really a device, as the object is
4502      * not created using qdev_new(), it is not attached to the qdev
4503      * device tree, and it is never realized.
4504      *
4505      * TODO: Make this TYPE_OBJECT once QOM provides something like
4506      * TYPE_DEVICE's "-global" properties.
4507      */
4508     .parent = TYPE_DEVICE,
4509     .class_init = migration_class_init,
4510     .class_size = sizeof(MigrationClass),
4511     .instance_size = sizeof(MigrationState),
4512     .instance_init = migration_instance_init,
4513     .instance_finalize = migration_instance_finalize,
4514 };
4515
4516 static void register_migration_types(void)
4517 {
4518     type_register_static(&migration_type);
4519 }
4520
4521 type_init(register_migration_types);