[qmiga/qemu.git] / cpus.c
1 /*
2  * QEMU System Emulator
3  *
4  * Copyright (c) 2003-2008 Fabrice Bellard
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to deal
8  * in the Software without restriction, including without limitation the rights
9  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10  * copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in
14  * all copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22  * THE SOFTWARE.
23  */
24
25 #include "qemu/osdep.h"
26 #include "qemu/config-file.h"
27 #include "cpu.h"
28 #include "monitor/monitor.h"
29 #include "qapi/error.h"
30 #include "qapi/qapi-commands-misc.h"
31 #include "qapi/qapi-events-run-state.h"
32 #include "qapi/qmp/qerror.h"
33 #include "qemu/error-report.h"
34 #include "sysemu/sysemu.h"
35 #include "sysemu/block-backend.h"
36 #include "exec/gdbstub.h"
37 #include "sysemu/dma.h"
38 #include "sysemu/hw_accel.h"
39 #include "sysemu/kvm.h"
40 #include "sysemu/hax.h"
41 #include "sysemu/hvf.h"
42 #include "sysemu/whpx.h"
43 #include "exec/exec-all.h"
44
45 #include "qemu/thread.h"
46 #include "sysemu/cpus.h"
47 #include "sysemu/qtest.h"
48 #include "qemu/main-loop.h"
49 #include "qemu/option.h"
50 #include "qemu/bitmap.h"
51 #include "qemu/seqlock.h"
52 #include "tcg.h"
53 #include "hw/nmi.h"
54 #include "sysemu/replay.h"
55 #include "hw/boards.h"
56
57 #ifdef CONFIG_LINUX
58
59 #include <sys/prctl.h>
60
61 #ifndef PR_MCE_KILL
62 #define PR_MCE_KILL 33
63 #endif
64
65 #ifndef PR_MCE_KILL_SET
66 #define PR_MCE_KILL_SET 1
67 #endif
68
69 #ifndef PR_MCE_KILL_EARLY
70 #define PR_MCE_KILL_EARLY 1
71 #endif
72
73 #endif /* CONFIG_LINUX */
74
75 int64_t max_delay;
76 int64_t max_advance;
77
78 /* vcpu throttling controls */
79 static QEMUTimer *throttle_timer;
80 static unsigned int throttle_percentage;
81
82 #define CPU_THROTTLE_PCT_MIN 1
83 #define CPU_THROTTLE_PCT_MAX 99
84 #define CPU_THROTTLE_TIMESLICE_NS 10000000
85
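/* A vCPU counts as stopped when it has been individually stopped or the
 * whole VM is not in the "running" runstate.
 */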
86 bool cpu_is_stopped(CPUState *cpu)
87 {
88     return cpu->stopped || !runstate_is_running();
89 }
90
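/* A vCPU thread is idle only if it has no pending stop request or queued
 * work, and is either fully stopped or halted with nothing to do (a halted
 * vCPU still counts as busy when the hypervisor handles HLT in-kernel, as
 * KVM can).
 */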
91 static bool cpu_thread_is_idle(CPUState *cpu)
92 {
93     if (cpu->stop || cpu->queued_work_first) {
94         return false;
95     }
96     if (cpu_is_stopped(cpu)) {
97         return true;
98     }
99     if (!cpu->halted || cpu_has_work(cpu) ||
100         kvm_halt_in_kernel()) {
101         return false;
102     }
103     return true;
104 }
105
106 static bool all_cpu_threads_idle(void)
107 {
108     CPUState *cpu;
109
110     CPU_FOREACH(cpu) {
111         if (!cpu_thread_is_idle(cpu)) {
112             return false;
113         }
114     }
115     return true;
116 }
117
118 /***********************************************************/
119 /* guest cycle counter */
120
121 /* Protected by TimersState seqlock */
122
123 static bool icount_sleep = true;
124 /* Arbitrarily pick 1MIPS as the minimum allowable speed.  */
125 #define MAX_ICOUNT_SHIFT 10
126
127 typedef struct TimersState {
128     /* Protected by BQL.  */
129     int64_t cpu_ticks_prev;
130     int64_t cpu_ticks_offset;
131
132     /* Protect fields that can be read outside the BQL and written
133      * from multiple threads.
134      */
135     QemuSeqLock vm_clock_seqlock;
136     QemuSpin vm_clock_lock;
137
138     int16_t cpu_ticks_enabled;
139
140     /* Conversion factor from emulated instructions to virtual clock ticks.  */
141     int16_t icount_time_shift;
142
143     /* Compensate for varying guest execution speed.  */
144     int64_t qemu_icount_bias;
145
146     int64_t vm_clock_warp_start;
147     int64_t cpu_clock_offset;
148
149     /* Only written by TCG thread */
150     int64_t qemu_icount;
151
152     /* for adjusting icount */
153     QEMUTimer *icount_rt_timer;
154     QEMUTimer *icount_vm_timer;
155     QEMUTimer *icount_warp_timer;
156 } TimersState;
157
158 static TimersState timers_state;
159 bool mttcg_enabled;
160
161 /*
162  * We default to false if we know other options have been enabled
163  * which are currently incompatible with MTTCG. Otherwise when each
164  * guest (target) has been updated to support:
165  *   - atomic instructions
166  *   - memory ordering primitives (barriers)
167  * they can set the appropriate CONFIG flags in ${target}-softmmu.mak
168  *
169  * Once a guest architecture has been converted to the new primitives
170  * there are two remaining limitations to check.
171  *
172  * - The guest can't be oversized (e.g. 64 bit guest on 32 bit host)
173  * - The host must have a memory order at least as strong as the guest's
174  *
175  * It may be possible in future to support strong guests on weak hosts
176  * but that will require tagging all load/stores in a guest with their
177  * implicit memory order requirements which would likely slow things
178  * down a lot.
179  */
180
181 static bool check_tcg_memory_orders_compatible(void)
182 {
183 #if defined(TCG_GUEST_DEFAULT_MO) && defined(TCG_TARGET_DEFAULT_MO)
184     return (TCG_GUEST_DEFAULT_MO & ~TCG_TARGET_DEFAULT_MO) == 0;
185 #else
186     return false;
187 #endif
188 }
189
190 static bool default_mttcg_enabled(void)
191 {
192     if (use_icount || TCG_OVERSIZED_GUEST) {
193         return false;
194     } else {
195 #ifdef TARGET_SUPPORTS_MTTCG
196         return check_tcg_memory_orders_compatible();
197 #else
198         return false;
199 #endif
200     }
201 }
202
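/* Apply the TCG "thread" suboption ("single" or "multi").  "multi" is
 * rejected when the guest is oversized or icount is in use; with no explicit
 * setting the default depends on the guest's MTTCG support and on
 * memory-ordering compatibility between guest and host.
 */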
203 void qemu_tcg_configure(QemuOpts *opts, Error **errp)
204 {
205     const char *t = qemu_opt_get(opts, "thread");
206     if (t) {
207         if (strcmp(t, "multi") == 0) {
208             if (TCG_OVERSIZED_GUEST) {
209                 error_setg(errp, "No MTTCG when guest word size > host's");
210             } else if (use_icount) {
211                 error_setg(errp, "No MTTCG when icount is enabled");
212             } else {
213 #ifndef TARGET_SUPPORTS_MTTCG
214                 warn_report("Guest not yet converted to MTTCG - "
215                             "you may get unexpected results");
216 #endif
217                 if (!check_tcg_memory_orders_compatible()) {
218                     warn_report("Guest expects a stronger memory ordering "
219                                 "than the host provides");
220                     error_printf("This may cause strange/hard to debug errors\n");
221                 }
222                 mttcg_enabled = true;
223             }
224         } else if (strcmp(t, "single") == 0) {
225             mttcg_enabled = false;
226         } else {
227             error_setg(errp, "Invalid 'thread' setting %s", t);
228         }
229     } else {
230         mttcg_enabled = default_mttcg_enabled();
231     }
232 }
233
234 /* The current number of executed instructions is based on what we
235  * originally budgeted minus the current state of the decrementing
236  * icount counters (icount_extra and icount_decr.u16.low).
237  */
238 static int64_t cpu_get_icount_executed(CPUState *cpu)
239 {
240     return cpu->icount_budget - (cpu->icount_decr.u16.low + cpu->icount_extra);
241 }
242
243 /*
244  * Update the global shared timer_state.qemu_icount to take into
245  * account executed instructions. This is done by the TCG vCPU
246  * thread so the main-loop can see time has moved forward.
247  */
248 static void cpu_update_icount_locked(CPUState *cpu)
249 {
250     int64_t executed = cpu_get_icount_executed(cpu);
251     cpu->icount_budget -= executed;
252
253     atomic_set_i64(&timers_state.qemu_icount,
254                    timers_state.qemu_icount + executed);
255 }
256
257 /*
258  * Update the global shared timer_state.qemu_icount to take into
259  * account executed instructions. This is done by the TCG vCPU
260  * thread so the main-loop can see time has moved forward.
261  */
262 void cpu_update_icount(CPUState *cpu)
263 {
264     seqlock_write_lock(&timers_state.vm_clock_seqlock,
265                        &timers_state.vm_clock_lock);
266     cpu_update_icount_locked(cpu);
267     seqlock_write_unlock(&timers_state.vm_clock_seqlock,
268                          &timers_state.vm_clock_lock);
269 }
270
271 static int64_t cpu_get_icount_raw_locked(void)
272 {
273     CPUState *cpu = current_cpu;
274
275     if (cpu && cpu->running) {
276         if (!cpu->can_do_io) {
277             error_report("Bad icount read");
278             exit(1);
279         }
280         /* Take into account what has run */
281         cpu_update_icount_locked(cpu);
282     }
283     /* The read is protected by the seqlock, but needs atomic64 to avoid UB */
284     return atomic_read_i64(&timers_state.qemu_icount);
285 }
286
287 static int64_t cpu_get_icount_locked(void)
288 {
289     int64_t icount = cpu_get_icount_raw_locked();
290     return atomic_read_i64(&timers_state.qemu_icount_bias) +
291         cpu_icount_to_ns(icount);
292 }
293
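/* Lock-free readers use the canonical seqlock pattern so they never see a
 * torn TimersState even when a writer races with them:
 *
 *     do {
 *         start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
 *         ...read the protected fields...
 *     } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
 */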
294 int64_t cpu_get_icount_raw(void)
295 {
296     int64_t icount;
297     unsigned start;
298
299     do {
300         start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
301         icount = cpu_get_icount_raw_locked();
302     } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
303
304     return icount;
305 }
306
307 /* Return the virtual CPU time, based on the instruction counter.  */
308 int64_t cpu_get_icount(void)
309 {
310     int64_t icount;
311     unsigned start;
312
313     do {
314         start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
315         icount = cpu_get_icount_locked();
316     } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
317
318     return icount;
319 }
320
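/* With icount, each executed guest instruction accounts for
 * 2^icount_time_shift nanoseconds of virtual time.  For example the initial
 * shift of 3 used by "shift=auto" maps one instruction to 8 ns, i.e. a
 * nominal 125 MIPS.
 */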
321 int64_t cpu_icount_to_ns(int64_t icount)
322 {
323     return icount << atomic_read(&timers_state.icount_time_shift);
324 }
325
326 static int64_t cpu_get_ticks_locked(void)
327 {
328     int64_t ticks = timers_state.cpu_ticks_offset;
329     if (timers_state.cpu_ticks_enabled) {
330         ticks += cpu_get_host_ticks();
331     }
332
333     if (timers_state.cpu_ticks_prev > ticks) {
334         /* Non-increasing ticks may happen if the host uses software suspend.  */
335         timers_state.cpu_ticks_offset += timers_state.cpu_ticks_prev - ticks;
336         ticks = timers_state.cpu_ticks_prev;
337     }
338
339     timers_state.cpu_ticks_prev = ticks;
340     return ticks;
341 }
342
343 /* return the time elapsed in VM between vm_start and vm_stop.  Unless
344  * icount is active, cpu_get_ticks() uses units of the host CPU cycle
345  * counter.
346  */
347 int64_t cpu_get_ticks(void)
348 {
349     int64_t ticks;
350
351     if (use_icount) {
352         return cpu_get_icount();
353     }
354
355     qemu_spin_lock(&timers_state.vm_clock_lock);
356     ticks = cpu_get_ticks_locked();
357     qemu_spin_unlock(&timers_state.vm_clock_lock);
358     return ticks;
359 }
360
361 static int64_t cpu_get_clock_locked(void)
362 {
363     int64_t time;
364
365     time = timers_state.cpu_clock_offset;
366     if (timers_state.cpu_ticks_enabled) {
367         time += get_clock();
368     }
369
370     return time;
371 }
372
373 /* Return the monotonic time elapsed in VM, i.e.,
374  * the time between vm_start and vm_stop
375  */
376 int64_t cpu_get_clock(void)
377 {
378     int64_t ti;
379     unsigned start;
380
381     do {
382         start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
383         ti = cpu_get_clock_locked();
384     } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
385
386     return ti;
387 }
388
389 /* enable cpu_get_ticks()
390  * Caller must hold BQL which serves as mutex for vm_clock_seqlock.
391  */
392 void cpu_enable_ticks(void)
393 {
394     seqlock_write_lock(&timers_state.vm_clock_seqlock,
395                        &timers_state.vm_clock_lock);
396     if (!timers_state.cpu_ticks_enabled) {
397         timers_state.cpu_ticks_offset -= cpu_get_host_ticks();
398         timers_state.cpu_clock_offset -= get_clock();
399         timers_state.cpu_ticks_enabled = 1;
400     }
401     seqlock_write_unlock(&timers_state.vm_clock_seqlock,
402                        &timers_state.vm_clock_lock);
403 }
404
405 /* disable cpu_get_ticks() : the clock is stopped. You must not call
406  * cpu_get_ticks() after that.
407  * Caller must hold BQL which serves as mutex for vm_clock_seqlock.
408  */
409 void cpu_disable_ticks(void)
410 {
411     seqlock_write_lock(&timers_state.vm_clock_seqlock,
412                        &timers_state.vm_clock_lock);
413     if (timers_state.cpu_ticks_enabled) {
414         timers_state.cpu_ticks_offset += cpu_get_host_ticks();
415         timers_state.cpu_clock_offset = cpu_get_clock_locked();
416         timers_state.cpu_ticks_enabled = 0;
417     }
418     seqlock_write_unlock(&timers_state.vm_clock_seqlock,
419                          &timers_state.vm_clock_lock);
420 }
421
422 /* Correlation between real and virtual time is always going to be
423    fairly approximate, so ignore small variation.
424    When the guest is idle real and virtual time will be aligned in
425    the IO wait loop.  */
426 #define ICOUNT_WOBBLE (NANOSECONDS_PER_SECOND / 10)
427
428 static void icount_adjust(void)
429 {
430     int64_t cur_time;
431     int64_t cur_icount;
432     int64_t delta;
433
434     /* Protected by TimersState mutex.  */
435     static int64_t last_delta;
436
437     /* If the VM is not running, then do nothing.  */
438     if (!runstate_is_running()) {
439         return;
440     }
441
442     seqlock_write_lock(&timers_state.vm_clock_seqlock,
443                        &timers_state.vm_clock_lock);
444     cur_time = cpu_get_clock_locked();
445     cur_icount = cpu_get_icount_locked();
446
447     delta = cur_icount - cur_time;
448     /* FIXME: This is a very crude algorithm, somewhat prone to oscillation.  */
449     if (delta > 0
450         && last_delta + ICOUNT_WOBBLE < delta * 2
451         && timers_state.icount_time_shift > 0) {
452         /* The guest is getting too far ahead.  Slow time down.  */
453         atomic_set(&timers_state.icount_time_shift,
454                    timers_state.icount_time_shift - 1);
455     }
456     if (delta < 0
457         && last_delta - ICOUNT_WOBBLE > delta * 2
458         && timers_state.icount_time_shift < MAX_ICOUNT_SHIFT) {
459         /* The guest is getting too far behind.  Speed time up.  */
460         atomic_set(&timers_state.icount_time_shift,
461                    timers_state.icount_time_shift + 1);
462     }
463     last_delta = delta;
464     atomic_set_i64(&timers_state.qemu_icount_bias,
465                    cur_icount - (timers_state.qemu_icount
466                                  << timers_state.icount_time_shift));
467     seqlock_write_unlock(&timers_state.vm_clock_seqlock,
468                          &timers_state.vm_clock_lock);
469 }
470
471 static void icount_adjust_rt(void *opaque)
472 {
473     timer_mod(timers_state.icount_rt_timer,
474               qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT) + 1000);
475     icount_adjust();
476 }
477
478 static void icount_adjust_vm(void *opaque)
479 {
480     timer_mod(timers_state.icount_vm_timer,
481                    qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
482                    NANOSECONDS_PER_SECOND / 10);
483     icount_adjust();
484 }
485
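/* Convert a virtual-time interval in nanoseconds into an instruction count,
 * i.e. divide by 2^icount_time_shift, rounding up.
 */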
486 static int64_t qemu_icount_round(int64_t count)
487 {
488     int shift = atomic_read(&timers_state.icount_time_shift);
489     return (count + (1 << shift) - 1) >> shift;
490 }
491
492 static void icount_warp_rt(void)
493 {
494     unsigned seq;
495     int64_t warp_start;
496
497     /* The icount_warp_timer is rescheduled soon after vm_clock_warp_start
498      * changes from -1 to another value, so the race here is okay.
499      */
500     do {
501         seq = seqlock_read_begin(&timers_state.vm_clock_seqlock);
502         warp_start = timers_state.vm_clock_warp_start;
503     } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, seq));
504
505     if (warp_start == -1) {
506         return;
507     }
508
509     seqlock_write_lock(&timers_state.vm_clock_seqlock,
510                        &timers_state.vm_clock_lock);
511     if (runstate_is_running()) {
512         int64_t clock = REPLAY_CLOCK_LOCKED(REPLAY_CLOCK_VIRTUAL_RT,
513                                             cpu_get_clock_locked());
514         int64_t warp_delta;
515
516         warp_delta = clock - timers_state.vm_clock_warp_start;
517         if (use_icount == 2) {
518             /*
519              * In adaptive mode, do not let QEMU_CLOCK_VIRTUAL run too
520              * far ahead of real time.
521              */
522             int64_t cur_icount = cpu_get_icount_locked();
523             int64_t delta = clock - cur_icount;
524             warp_delta = MIN(warp_delta, delta);
525         }
526         atomic_set_i64(&timers_state.qemu_icount_bias,
527                        timers_state.qemu_icount_bias + warp_delta);
528     }
529     timers_state.vm_clock_warp_start = -1;
530     seqlock_write_unlock(&timers_state.vm_clock_seqlock,
531                        &timers_state.vm_clock_lock);
532
533     if (qemu_clock_expired(QEMU_CLOCK_VIRTUAL)) {
534         qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
535     }
536 }
537
538 static void icount_timer_cb(void *opaque)
539 {
540     /* No need for a checkpoint because the timer already synchronizes
541      * with CHECKPOINT_CLOCK_VIRTUAL_RT.
542      */
543     icount_warp_rt();
544 }
545
546 void qtest_clock_warp(int64_t dest)
547 {
548     int64_t clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
549     AioContext *aio_context;
550     assert(qtest_enabled());
551     aio_context = qemu_get_aio_context();
552     while (clock < dest) {
553         int64_t deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
554         int64_t warp = qemu_soonest_timeout(dest - clock, deadline);
555
556         seqlock_write_lock(&timers_state.vm_clock_seqlock,
557                            &timers_state.vm_clock_lock);
558         atomic_set_i64(&timers_state.qemu_icount_bias,
559                        timers_state.qemu_icount_bias + warp);
560         seqlock_write_unlock(&timers_state.vm_clock_seqlock,
561                              &timers_state.vm_clock_lock);
562
563         qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL);
564         timerlist_run_timers(aio_context->tlg.tl[QEMU_CLOCK_VIRTUAL]);
565         clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
566     }
567     qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
568 }
569
570 void qemu_start_warp_timer(void)
571 {
572     int64_t clock;
573     int64_t deadline;
574
575     if (!use_icount) {
576         return;
577     }
578
579     /* Nothing to do if the VM is stopped: QEMU_CLOCK_VIRTUAL timers
580      * do not fire, so computing the deadline does not make sense.
581      */
582     if (!runstate_is_running()) {
583         return;
584     }
585
586     if (replay_mode != REPLAY_MODE_PLAY) {
587         if (!all_cpu_threads_idle()) {
588             return;
589         }
590
591         if (qtest_enabled()) {
592             /* When testing, qtest commands advance icount.  */
593             return;
594         }
595
596         replay_checkpoint(CHECKPOINT_CLOCK_WARP_START);
597     } else {
598         /* warp clock deterministically in record/replay mode */
599         if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP_START)) {
600             /* vCPU is sleeping and warp can't be started.
601                It is probably a race condition: the notification sent
602                to the vCPU was processed in advance and the vCPU went to sleep.
603                Therefore we have to wake it up to do something. */
604             if (replay_has_checkpoint()) {
605                 qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
606             }
607             return;
608         }
609     }
610
611     /* We want to use the earliest deadline from ALL vm_clocks */
612     clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT);
613     deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
614     if (deadline < 0) {
615         static bool notified;
616         if (!icount_sleep && !notified) {
617             warn_report("icount sleep disabled and no active timers");
618             notified = true;
619         }
620         return;
621     }
622
623     if (deadline > 0) {
624         /*
625          * Ensure QEMU_CLOCK_VIRTUAL proceeds even when the virtual CPU goes to
626          * sleep.  Otherwise, the CPU might be waiting for a future timer
627          * interrupt to wake it up, but the interrupt never comes because
628          * the vCPU isn't running any insns and thus doesn't advance the
629          * QEMU_CLOCK_VIRTUAL.
630          */
631         if (!icount_sleep) {
632             /*
633              * We never let VCPUs sleep in no sleep icount mode.
634              * If there is a pending QEMU_CLOCK_VIRTUAL timer we just advance
635              * to the next QEMU_CLOCK_VIRTUAL event and notify it.
636              * It is useful when we want a deterministic execution time,
637              * isolated from host latencies.
638              */
639             seqlock_write_lock(&timers_state.vm_clock_seqlock,
640                                &timers_state.vm_clock_lock);
641             atomic_set_i64(&timers_state.qemu_icount_bias,
642                            timers_state.qemu_icount_bias + deadline);
643             seqlock_write_unlock(&timers_state.vm_clock_seqlock,
644                                  &timers_state.vm_clock_lock);
645             qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
646         } else {
647             /*
648              * We do stop VCPUs and only advance QEMU_CLOCK_VIRTUAL after some
649              * "real" time (related to the time left until the next event) has
650              * passed. The QEMU_CLOCK_VIRTUAL_RT clock will do this.
651              * This prevents the warps from being visible externally; for
652              * example, you will not send network packets continuously instead
653              * of every 100ms.
654              */
655             seqlock_write_lock(&timers_state.vm_clock_seqlock,
656                                &timers_state.vm_clock_lock);
657             if (timers_state.vm_clock_warp_start == -1
658                 || timers_state.vm_clock_warp_start > clock) {
659                 timers_state.vm_clock_warp_start = clock;
660             }
661             seqlock_write_unlock(&timers_state.vm_clock_seqlock,
662                                  &timers_state.vm_clock_lock);
663             timer_mod_anticipate(timers_state.icount_warp_timer,
664                                  clock + deadline);
665         }
666     } else if (deadline == 0) {
667         qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
668     }
669 }
670
671 static void qemu_account_warp_timer(void)
672 {
673     if (!use_icount || !icount_sleep) {
674         return;
675     }
676
677     /* Nothing to do if the VM is stopped: QEMU_CLOCK_VIRTUAL timers
678      * do not fire, so computing the deadline does not make sense.
679      */
680     if (!runstate_is_running()) {
681         return;
682     }
683
684     /* warp clock deterministically in record/replay mode */
685     if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP_ACCOUNT)) {
686         return;
687     }
688
689     timer_del(timers_state.icount_warp_timer);
690     icount_warp_rt();
691 }
692
693 static bool icount_state_needed(void *opaque)
694 {
695     return use_icount;
696 }
697
698 static bool warp_timer_state_needed(void *opaque)
699 {
700     TimersState *s = opaque;
701     return s->icount_warp_timer != NULL;
702 }
703
704 static bool adjust_timers_state_needed(void *opaque)
705 {
706     TimersState *s = opaque;
707     return s->icount_rt_timer != NULL;
708 }
709
710 /*
711  * Subsection for warp timer migration is optional, because it may not be created
712  */
713 static const VMStateDescription icount_vmstate_warp_timer = {
714     .name = "timer/icount/warp_timer",
715     .version_id = 1,
716     .minimum_version_id = 1,
717     .needed = warp_timer_state_needed,
718     .fields = (VMStateField[]) {
719         VMSTATE_INT64(vm_clock_warp_start, TimersState),
720         VMSTATE_TIMER_PTR(icount_warp_timer, TimersState),
721         VMSTATE_END_OF_LIST()
722     }
723 };
724
725 static const VMStateDescription icount_vmstate_adjust_timers = {
726     .name = "timer/icount/timers",
727     .version_id = 1,
728     .minimum_version_id = 1,
729     .needed = adjust_timers_state_needed,
730     .fields = (VMStateField[]) {
731         VMSTATE_TIMER_PTR(icount_rt_timer, TimersState),
732         VMSTATE_TIMER_PTR(icount_vm_timer, TimersState),
733         VMSTATE_END_OF_LIST()
734     }
735 };
736
737 /*
738  * This is a subsection for icount migration.
739  */
740 static const VMStateDescription icount_vmstate_timers = {
741     .name = "timer/icount",
742     .version_id = 1,
743     .minimum_version_id = 1,
744     .needed = icount_state_needed,
745     .fields = (VMStateField[]) {
746         VMSTATE_INT64(qemu_icount_bias, TimersState),
747         VMSTATE_INT64(qemu_icount, TimersState),
748         VMSTATE_END_OF_LIST()
749     },
750     .subsections = (const VMStateDescription*[]) {
751         &icount_vmstate_warp_timer,
752         &icount_vmstate_adjust_timers,
753         NULL
754     }
755 };
756
757 static const VMStateDescription vmstate_timers = {
758     .name = "timer",
759     .version_id = 2,
760     .minimum_version_id = 1,
761     .fields = (VMStateField[]) {
762         VMSTATE_INT64(cpu_ticks_offset, TimersState),
763         VMSTATE_UNUSED(8),
764         VMSTATE_INT64_V(cpu_clock_offset, TimersState, 2),
765         VMSTATE_END_OF_LIST()
766     },
767     .subsections = (const VMStateDescription*[]) {
768         &icount_vmstate_timers,
769         NULL
770     }
771 };
772
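/* Throttling: the timer below fires every CPU_THROTTLE_TIMESLICE_NS / (1 - pct)
 * and queues cpu_throttle_thread on every vCPU.  Each vCPU then sleeps for
 * throttle_ratio * CPU_THROTTLE_TIMESLICE_NS, where throttle_ratio is
 * pct / (1 - pct), so the sleeping fraction of each period works out to pct.
 * E.g. at 50% the period is 20 ms and the sleep is 10 ms.
 */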
773 static void cpu_throttle_thread(CPUState *cpu, run_on_cpu_data opaque)
774 {
775     double pct;
776     double throttle_ratio;
777     long sleeptime_ns;
778
779     if (!cpu_throttle_get_percentage()) {
780         return;
781     }
782
783     pct = (double)cpu_throttle_get_percentage()/100;
784     throttle_ratio = pct / (1 - pct);
785     sleeptime_ns = (long)(throttle_ratio * CPU_THROTTLE_TIMESLICE_NS);
786
787     qemu_mutex_unlock_iothread();
788     g_usleep(sleeptime_ns / 1000); /* Convert ns to us for usleep call */
789     qemu_mutex_lock_iothread();
790     atomic_set(&cpu->throttle_thread_scheduled, 0);
791 }
792
793 static void cpu_throttle_timer_tick(void *opaque)
794 {
795     CPUState *cpu;
796     double pct;
797
798     /* Stop the timer if needed */
799     if (!cpu_throttle_get_percentage()) {
800         return;
801     }
802     CPU_FOREACH(cpu) {
803         if (!atomic_xchg(&cpu->throttle_thread_scheduled, 1)) {
804             async_run_on_cpu(cpu, cpu_throttle_thread,
805                              RUN_ON_CPU_NULL);
806         }
807     }
808
809     pct = (double)cpu_throttle_get_percentage()/100;
810     timer_mod(throttle_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT) +
811                                    CPU_THROTTLE_TIMESLICE_NS / (1-pct));
812 }
813
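/* Request a new throttle percentage.  The value is clamped to the
 * [CPU_THROTTLE_PCT_MIN, CPU_THROTTLE_PCT_MAX] range and the periodic
 * throttle timer is (re)armed; use cpu_throttle_stop() to turn throttling
 * off again.
 */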
814 void cpu_throttle_set(int new_throttle_pct)
815 {
816     /* Ensure throttle percentage is within valid range */
817     new_throttle_pct = MIN(new_throttle_pct, CPU_THROTTLE_PCT_MAX);
818     new_throttle_pct = MAX(new_throttle_pct, CPU_THROTTLE_PCT_MIN);
819
820     atomic_set(&throttle_percentage, new_throttle_pct);
821
822     timer_mod(throttle_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT) +
823                                        CPU_THROTTLE_TIMESLICE_NS);
824 }
825
826 void cpu_throttle_stop(void)
827 {
828     atomic_set(&throttle_percentage, 0);
829 }
830
831 bool cpu_throttle_active(void)
832 {
833     return (cpu_throttle_get_percentage() != 0);
834 }
835
836 int cpu_throttle_get_percentage(void)
837 {
838     return atomic_read(&throttle_percentage);
839 }
840
841 void cpu_ticks_init(void)
842 {
843     seqlock_init(&timers_state.vm_clock_seqlock);
844     qemu_spin_init(&timers_state.vm_clock_lock);
845     vmstate_register(NULL, 0, &vmstate_timers, &timers_state);
846     throttle_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL_RT,
847                                            cpu_throttle_timer_tick, NULL);
848 }
849
850 void configure_icount(QemuOpts *opts, Error **errp)
851 {
852     const char *option;
853     char *rem_str = NULL;
854
855     option = qemu_opt_get(opts, "shift");
856     if (!option) {
857         if (qemu_opt_get(opts, "align") != NULL) {
858             error_setg(errp, "Please specify shift option when using align");
859         }
860         return;
861     }
862
863     icount_sleep = qemu_opt_get_bool(opts, "sleep", true);
864     if (icount_sleep) {
865         timers_state.icount_warp_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL_RT,
866                                          icount_timer_cb, NULL);
867     }
868
869     icount_align_option = qemu_opt_get_bool(opts, "align", false);
870
871     if (icount_align_option && !icount_sleep) {
872         error_setg(errp, "align=on and sleep=off are incompatible");
873     }
874     if (strcmp(option, "auto") != 0) {
875         errno = 0;
876         timers_state.icount_time_shift = strtol(option, &rem_str, 0);
877         if (errno != 0 || *rem_str != '\0' || !strlen(option)) {
878             error_setg(errp, "icount: Invalid shift value");
879         }
880         use_icount = 1;
881         return;
882     } else if (icount_align_option) {
883         error_setg(errp, "shift=auto and align=on are incompatible");
884     } else if (!icount_sleep) {
885         error_setg(errp, "shift=auto and sleep=off are incompatible");
886     }
887
888     use_icount = 2;
889
890     /* 125MIPS seems a reasonable initial guess at the guest speed.
891        It will be corrected fairly quickly anyway.  */
892     timers_state.icount_time_shift = 3;
893
894     /* Have both realtime and virtual time triggers for speed adjustment.
895        The realtime trigger catches emulated time passing too slowly,
896        the virtual time trigger catches emulated time passing too fast.
897        Realtime triggers occur even when idle, so use them less frequently
898        than VM triggers.  */
899     timers_state.vm_clock_warp_start = -1;
900     timers_state.icount_rt_timer = timer_new_ms(QEMU_CLOCK_VIRTUAL_RT,
901                                    icount_adjust_rt, NULL);
902     timer_mod(timers_state.icount_rt_timer,
903                    qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT) + 1000);
904     timers_state.icount_vm_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
905                                         icount_adjust_vm, NULL);
906     timer_mod(timers_state.icount_vm_timer,
907                    qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
908                    NANOSECONDS_PER_SECOND / 10);
909 }
910
911 /***********************************************************/
912 /* TCG vCPU kick timer
913  *
914  * The kick timer is responsible for moving single threaded vCPU
915  * emulation on to the next vCPU. If more than one vCPU is running, a
916  * timer event will force a cpu->exit so the next vCPU can get
917  * scheduled.
918  *
919  * The timer is removed if all vCPUs are idle and restarted again once
920  * any of them stops being idle.
921  */
922
923 static QEMUTimer *tcg_kick_vcpu_timer;
924 static CPUState *tcg_current_rr_cpu;
925
926 #define TCG_KICK_PERIOD (NANOSECONDS_PER_SECOND / 10)
927
928 static inline int64_t qemu_tcg_next_kick(void)
929 {
930     return qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + TCG_KICK_PERIOD;
931 }
932
933 /* Kick the currently round-robin scheduled vCPU */
934 static void qemu_cpu_kick_rr_cpu(void)
935 {
936     CPUState *cpu;
937     do {
938         cpu = atomic_mb_read(&tcg_current_rr_cpu);
939         if (cpu) {
940             cpu_exit(cpu);
941         }
942     } while (cpu != atomic_mb_read(&tcg_current_rr_cpu));
943 }
944
945 static void do_nothing(CPUState *cpu, run_on_cpu_data unused)
946 {
947 }
948
949 void qemu_timer_notify_cb(void *opaque, QEMUClockType type)
950 {
951     if (!use_icount || type != QEMU_CLOCK_VIRTUAL) {
952         qemu_notify_event();
953         return;
954     }
955
956     if (qemu_in_vcpu_thread()) {
957         /* A CPU is currently running; kick it back out to the
958          * tcg_cpu_exec() loop so it will recalculate its
959          * icount deadline immediately.
960          */
961         qemu_cpu_kick(current_cpu);
962     } else if (first_cpu) {
963         /* qemu_cpu_kick is not enough to kick a halted CPU out of
964          * qemu_tcg_wait_io_event.  async_run_on_cpu, instead,
965          * causes cpu_thread_is_idle to return false.  This way,
966          * handle_icount_deadline can run.
967          * If we have no CPUs at all for some reason, we don't
968          * need to do anything.
969          */
970         async_run_on_cpu(first_cpu, do_nothing, RUN_ON_CPU_NULL);
971     }
972 }
973
974 static void kick_tcg_thread(void *opaque)
975 {
976     timer_mod(tcg_kick_vcpu_timer, qemu_tcg_next_kick());
977     qemu_cpu_kick_rr_cpu();
978 }
979
980 static void start_tcg_kick_timer(void)
981 {
982     assert(!mttcg_enabled);
983     if (!tcg_kick_vcpu_timer && CPU_NEXT(first_cpu)) {
984         tcg_kick_vcpu_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
985                                            kick_tcg_thread, NULL);
986     }
987     if (tcg_kick_vcpu_timer && !timer_pending(tcg_kick_vcpu_timer)) {
988         timer_mod(tcg_kick_vcpu_timer, qemu_tcg_next_kick());
989     }
990 }
991
992 static void stop_tcg_kick_timer(void)
993 {
994     assert(!mttcg_enabled);
995     if (tcg_kick_vcpu_timer && timer_pending(tcg_kick_vcpu_timer)) {
996         timer_del(tcg_kick_vcpu_timer);
997     }
998 }
999
1000 /***********************************************************/
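/* Fatal emulation error: print the message and the register state of every
 * vCPU to stderr, then abort.
 */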
1001 void hw_error(const char *fmt, ...)
1002 {
1003     va_list ap;
1004     CPUState *cpu;
1005
1006     va_start(ap, fmt);
1007     fprintf(stderr, "qemu: hardware error: ");
1008     vfprintf(stderr, fmt, ap);
1009     fprintf(stderr, "\n");
1010     CPU_FOREACH(cpu) {
1011         fprintf(stderr, "CPU #%d:\n", cpu->cpu_index);
1012         cpu_dump_state(cpu, stderr, fprintf, CPU_DUMP_FPU);
1013     }
1014     va_end(ap);
1015     abort();
1016 }
1017
1018 void cpu_synchronize_all_states(void)
1019 {
1020     CPUState *cpu;
1021
1022     CPU_FOREACH(cpu) {
1023         cpu_synchronize_state(cpu);
1024         /* TODO: move to cpu_synchronize_state() */
1025         if (hvf_enabled()) {
1026             hvf_cpu_synchronize_state(cpu);
1027         }
1028     }
1029 }
1030
1031 void cpu_synchronize_all_post_reset(void)
1032 {
1033     CPUState *cpu;
1034
1035     CPU_FOREACH(cpu) {
1036         cpu_synchronize_post_reset(cpu);
1037         /* TODO: move to cpu_synchronize_post_reset() */
1038         if (hvf_enabled()) {
1039             hvf_cpu_synchronize_post_reset(cpu);
1040         }
1041     }
1042 }
1043
1044 void cpu_synchronize_all_post_init(void)
1045 {
1046     CPUState *cpu;
1047
1048     CPU_FOREACH(cpu) {
1049         cpu_synchronize_post_init(cpu);
1050         /* TODO: move to cpu_synchronize_post_init() */
1051         if (hvf_enabled()) {
1052             hvf_cpu_synchronize_post_init(cpu);
1053         }
1054     }
1055 }
1056
1057 void cpu_synchronize_all_pre_loadvm(void)
1058 {
1059     CPUState *cpu;
1060
1061     CPU_FOREACH(cpu) {
1062         cpu_synchronize_pre_loadvm(cpu);
1063     }
1064 }
1065
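/* Stop the VM: freeze the tick counters, pause all vCPUs, switch the
 * runstate and (optionally) emit the QMP STOP event, then drain and flush
 * all block devices.  Returns the result of bdrv_flush_all().
 */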
1066 static int do_vm_stop(RunState state, bool send_stop)
1067 {
1068     int ret = 0;
1069
1070     if (runstate_is_running()) {
1071         cpu_disable_ticks();
1072         pause_all_vcpus();
1073         runstate_set(state);
1074         vm_state_notify(0, state);
1075         if (send_stop) {
1076             qapi_event_send_stop();
1077         }
1078     }
1079
1080     bdrv_drain_all();
1081     replay_disable_events();
1082     ret = bdrv_flush_all();
1083
1084     return ret;
1085 }
1086
1087 /* Special vm_stop() variant for terminating the process.  Historically clients
1088  * did not expect a QMP STOP event and so we need to retain compatibility.
1089  */
1090 int vm_shutdown(void)
1091 {
1092     return do_vm_stop(RUN_STATE_SHUTDOWN, false);
1093 }
1094
1095 static bool cpu_can_run(CPUState *cpu)
1096 {
1097     if (cpu->stop) {
1098         return false;
1099     }
1100     if (cpu_is_stopped(cpu)) {
1101         return false;
1102     }
1103     return true;
1104 }
1105
1106 static void cpu_handle_guest_debug(CPUState *cpu)
1107 {
1108     gdb_set_stop_cpu(cpu);
1109     qemu_system_debug_request();
1110     cpu->stopped = true;
1111 }
1112
1113 #ifdef CONFIG_LINUX
1114 static void sigbus_reraise(void)
1115 {
1116     sigset_t set;
1117     struct sigaction action;
1118
1119     memset(&action, 0, sizeof(action));
1120     action.sa_handler = SIG_DFL;
1121     if (!sigaction(SIGBUS, &action, NULL)) {
1122         raise(SIGBUS);
1123         sigemptyset(&set);
1124         sigaddset(&set, SIGBUS);
1125         pthread_sigmask(SIG_UNBLOCK, &set, NULL);
1126     }
1127     perror("Failed to re-raise SIGBUS!\n");
1128     abort();
1129 }
1130
1131 static void sigbus_handler(int n, siginfo_t *siginfo, void *ctx)
1132 {
1133     if (siginfo->si_code != BUS_MCEERR_AO && siginfo->si_code != BUS_MCEERR_AR) {
1134         sigbus_reraise();
1135     }
1136
1137     if (current_cpu) {
1138         /* Called asynchronously in VCPU thread.  */
1139         if (kvm_on_sigbus_vcpu(current_cpu, siginfo->si_code, siginfo->si_addr)) {
1140             sigbus_reraise();
1141         }
1142     } else {
1143         /* Called synchronously (via signalfd) in main thread.  */
1144         if (kvm_on_sigbus(siginfo->si_code, siginfo->si_addr)) {
1145             sigbus_reraise();
1146         }
1147     }
1148 }
1149
1150 static void qemu_init_sigbus(void)
1151 {
1152     struct sigaction action;
1153
1154     memset(&action, 0, sizeof(action));
1155     action.sa_flags = SA_SIGINFO;
1156     action.sa_sigaction = sigbus_handler;
1157     sigaction(SIGBUS, &action, NULL);
1158
1159     prctl(PR_MCE_KILL, PR_MCE_KILL_SET, PR_MCE_KILL_EARLY, 0, 0);
1160 }
1161 #else /* !CONFIG_LINUX */
1162 static void qemu_init_sigbus(void)
1163 {
1164 }
1165 #endif /* !CONFIG_LINUX */
1166
1167 static QemuMutex qemu_global_mutex;
1168
1169 static QemuThread io_thread;
1170
1171 /* cpu creation */
1172 static QemuCond qemu_cpu_cond;
1173 /* system init */
1174 static QemuCond qemu_pause_cond;
1175
1176 void qemu_init_cpu_loop(void)
1177 {
1178     qemu_init_sigbus();
1179     qemu_cond_init(&qemu_cpu_cond);
1180     qemu_cond_init(&qemu_pause_cond);
1181     qemu_mutex_init(&qemu_global_mutex);
1182
1183     qemu_thread_get_self(&io_thread);
1184 }
1185
1186 void run_on_cpu(CPUState *cpu, run_on_cpu_func func, run_on_cpu_data data)
1187 {
1188     do_run_on_cpu(cpu, func, data, &qemu_global_mutex);
1189 }
1190
1191 static void qemu_kvm_destroy_vcpu(CPUState *cpu)
1192 {
1193     if (kvm_destroy_vcpu(cpu) < 0) {
1194         error_report("kvm_destroy_vcpu failed");
1195         exit(EXIT_FAILURE);
1196     }
1197 }
1198
1199 static void qemu_tcg_destroy_vcpu(CPUState *cpu)
1200 {
1201 }
1202
1203 static void qemu_cpu_stop(CPUState *cpu, bool exit)
1204 {
1205     g_assert(qemu_cpu_is_self(cpu));
1206     cpu->stop = false;
1207     cpu->stopped = true;
1208     if (exit) {
1209         cpu_exit(cpu);
1210     }
1211     qemu_cond_broadcast(&qemu_pause_cond);
1212 }
1213
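/* Common tail of every vCPU wait loop: acknowledge a pending kick, honour a
 * stop request and run any work queued via run_on_cpu()/async_run_on_cpu().
 */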
1214 static void qemu_wait_io_event_common(CPUState *cpu)
1215 {
1216     atomic_mb_set(&cpu->thread_kicked, false);
1217     if (cpu->stop) {
1218         qemu_cpu_stop(cpu, false);
1219     }
1220     process_queued_cpu_work(cpu);
1221 }
1222
1223 static void qemu_tcg_rr_wait_io_event(void)
1224 {
1225     CPUState *cpu;
1226
1227     while (all_cpu_threads_idle()) {
1228         stop_tcg_kick_timer();
1229         qemu_cond_wait(first_cpu->halt_cond, &qemu_global_mutex);
1230     }
1231
1232     start_tcg_kick_timer();
1233
1234     CPU_FOREACH(cpu) {
1235         qemu_wait_io_event_common(cpu);
1236     }
1237 }
1238
1239 static void qemu_wait_io_event(CPUState *cpu)
1240 {
1241     while (cpu_thread_is_idle(cpu)) {
1242         qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
1243     }
1244
1245 #ifdef _WIN32
1246     /* Eat dummy APC queued by qemu_cpu_kick_thread.  */
1247     if (!tcg_enabled()) {
1248         SleepEx(0, TRUE);
1249     }
1250 #endif
1251     qemu_wait_io_event_common(cpu);
1252 }
1253
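/* KVM vCPU thread: one host thread per vCPU.  It creates the in-kernel vCPU,
 * signals creation, then loops between kvm_cpu_exec() and the I/O-event wait
 * until the vCPU is unplugged.
 */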
1254 static void *qemu_kvm_cpu_thread_fn(void *arg)
1255 {
1256     CPUState *cpu = arg;
1257     int r;
1258
1259     rcu_register_thread();
1260
1261     qemu_mutex_lock_iothread();
1262     qemu_thread_get_self(cpu->thread);
1263     cpu->thread_id = qemu_get_thread_id();
1264     cpu->can_do_io = 1;
1265     current_cpu = cpu;
1266
1267     r = kvm_init_vcpu(cpu);
1268     if (r < 0) {
1269         error_report("kvm_init_vcpu failed: %s", strerror(-r));
1270         exit(1);
1271     }
1272
1273     kvm_init_cpu_signals(cpu);
1274
1275     /* signal CPU creation */
1276     cpu->created = true;
1277     qemu_cond_signal(&qemu_cpu_cond);
1278
1279     do {
1280         if (cpu_can_run(cpu)) {
1281             r = kvm_cpu_exec(cpu);
1282             if (r == EXCP_DEBUG) {
1283                 cpu_handle_guest_debug(cpu);
1284             }
1285         }
1286         qemu_wait_io_event(cpu);
1287     } while (!cpu->unplug || cpu_can_run(cpu));
1288
1289     qemu_kvm_destroy_vcpu(cpu);
1290     cpu->created = false;
1291     qemu_cond_signal(&qemu_cpu_cond);
1292     qemu_mutex_unlock_iothread();
1293     rcu_unregister_thread();
1294     return NULL;
1295 }
1296
1297 static void *qemu_dummy_cpu_thread_fn(void *arg)
1298 {
1299 #ifdef _WIN32
1300     error_report("qtest is not supported under Windows");
1301     exit(1);
1302 #else
1303     CPUState *cpu = arg;
1304     sigset_t waitset;
1305     int r;
1306
1307     rcu_register_thread();
1308
1309     qemu_mutex_lock_iothread();
1310     qemu_thread_get_self(cpu->thread);
1311     cpu->thread_id = qemu_get_thread_id();
1312     cpu->can_do_io = 1;
1313     current_cpu = cpu;
1314
1315     sigemptyset(&waitset);
1316     sigaddset(&waitset, SIG_IPI);
1317
1318     /* signal CPU creation */
1319     cpu->created = true;
1320     qemu_cond_signal(&qemu_cpu_cond);
1321
1322     do {
1323         qemu_mutex_unlock_iothread();
1324         do {
1325             int sig;
1326             r = sigwait(&waitset, &sig);
1327         } while (r == -1 && (errno == EAGAIN || errno == EINTR));
1328         if (r == -1) {
1329             perror("sigwait");
1330             exit(1);
1331         }
1332         qemu_mutex_lock_iothread();
1333         qemu_wait_io_event(cpu);
1334     } while (!cpu->unplug);
1335
1336     qemu_mutex_unlock_iothread();
1337     rcu_unregister_thread();
1338     return NULL;
1339 #endif
1340 }
1341
1342 static int64_t tcg_get_icount_limit(void)
1343 {
1344     int64_t deadline;
1345
1346     if (replay_mode != REPLAY_MODE_PLAY) {
1347         deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
1348
1349         /* Maintain prior (possibly buggy) behaviour where if no deadline
1350          * was set (as there is no QEMU_CLOCK_VIRTUAL timer) or it is more than
1351          * INT32_MAX nanoseconds ahead, we still use INT32_MAX
1352          * nanoseconds.
1353          */
1354         if ((deadline < 0) || (deadline > INT32_MAX)) {
1355             deadline = INT32_MAX;
1356         }
1357
1358         return qemu_icount_round(deadline);
1359     } else {
1360         return replay_get_instructions();
1361     }
1362 }
1363
1364 static void handle_icount_deadline(void)
1365 {
1366     assert(qemu_in_vcpu_thread());
1367     if (use_icount) {
1368         int64_t deadline =
1369             qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
1370
1371         if (deadline == 0) {
1372             /* Wake up other AioContexts.  */
1373             qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
1374             qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL);
1375         }
1376     }
1377 }
1378
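/* Install the instruction budget for the next TCG execution window: the low
 * 16 bits go into the fast-path decrementer (icount_decr.u16.low) and the
 * remainder is kept in icount_extra to be refilled later.
 */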
1379 static void prepare_icount_for_run(CPUState *cpu)
1380 {
1381     if (use_icount) {
1382         int insns_left;
1383
1384         /* These should always be cleared by process_icount_data after
1385          * each vCPU execution. However, u16.high can be raised
1386          * asynchronously by cpu_exit/cpu_interrupt/tcg_handle_interrupt.
1387          */
1388         g_assert(cpu->icount_decr.u16.low == 0);
1389         g_assert(cpu->icount_extra == 0);
1390
1391         cpu->icount_budget = tcg_get_icount_limit();
1392         insns_left = MIN(0xffff, cpu->icount_budget);
1393         cpu->icount_decr.u16.low = insns_left;
1394         cpu->icount_extra = cpu->icount_budget - insns_left;
1395
1396         replay_mutex_lock();
1397     }
1398 }
1399
1400 static void process_icount_data(CPUState *cpu)
1401 {
1402     if (use_icount) {
1403         /* Account for executed instructions */
1404         cpu_update_icount(cpu);
1405
1406         /* Reset the counters */
1407         cpu->icount_decr.u16.low = 0;
1408         cpu->icount_extra = 0;
1409         cpu->icount_budget = 0;
1410
1411         replay_account_executed_instructions();
1412
1413         replay_mutex_unlock();
1414     }
1415 }
1416
1417
1418 static int tcg_cpu_exec(CPUState *cpu)
1419 {
1420     int ret;
1421 #ifdef CONFIG_PROFILER
1422     int64_t ti;
1423 #endif
1424
1425     assert(tcg_enabled());
1426 #ifdef CONFIG_PROFILER
1427     ti = profile_getclock();
1428 #endif
1429     cpu_exec_start(cpu);
1430     ret = cpu_exec(cpu);
1431     cpu_exec_end(cpu);
1432 #ifdef CONFIG_PROFILER
1433     atomic_set(&tcg_ctx->prof.cpu_exec_time,
1434                tcg_ctx->prof.cpu_exec_time + profile_getclock() - ti);
1435 #endif
1436     return ret;
1437 }
1438
1439 /* Destroy any remaining vCPUs which have been unplugged and have
1440  * finished running
1441  */
1442 static void deal_with_unplugged_cpus(void)
1443 {
1444     CPUState *cpu;
1445
1446     CPU_FOREACH(cpu) {
1447         if (cpu->unplug && !cpu_can_run(cpu)) {
1448             qemu_tcg_destroy_vcpu(cpu);
1449             cpu->created = false;
1450             qemu_cond_signal(&qemu_cpu_cond);
1451             break;
1452         }
1453     }
1454 }
1455
1456 /* Single-threaded TCG
1457  *
1458  * In the single-threaded case each vCPU is simulated in turn. If
1459  * there is more than a single vCPU we create a simple timer to kick
1460  * the vCPU and ensure we don't get stuck in a tight loop in one vCPU.
1461  * This is done explicitly rather than relying on side-effects
1462  * elsewhere.
1463  */
1464
1465 static void *qemu_tcg_rr_cpu_thread_fn(void *arg)
1466 {
1467     CPUState *cpu = arg;
1468
1469     assert(tcg_enabled());
1470     rcu_register_thread();
1471     tcg_register_thread();
1472
1473     qemu_mutex_lock_iothread();
1474     qemu_thread_get_self(cpu->thread);
1475
1476     cpu->thread_id = qemu_get_thread_id();
1477     cpu->created = true;
1478     cpu->can_do_io = 1;
1479     qemu_cond_signal(&qemu_cpu_cond);
1480
1481     /* wait for initial kick-off after machine start */
1482     while (first_cpu->stopped) {
1483         qemu_cond_wait(first_cpu->halt_cond, &qemu_global_mutex);
1484
1485         /* process any pending work */
1486         CPU_FOREACH(cpu) {
1487             current_cpu = cpu;
1488             qemu_wait_io_event_common(cpu);
1489         }
1490     }
1491
1492     start_tcg_kick_timer();
1493
1494     cpu = first_cpu;
1495
1496     /* process any pending work */
1497     cpu->exit_request = 1;
1498
1499     while (1) {
1500         qemu_mutex_unlock_iothread();
1501         replay_mutex_lock();
1502         qemu_mutex_lock_iothread();
1503         /* Account partial waits to QEMU_CLOCK_VIRTUAL.  */
1504         qemu_account_warp_timer();
1505
1506         /* Run the timers here.  This is much more efficient than
1507          * waking up the I/O thread and waiting for completion.
1508          */
1509         handle_icount_deadline();
1510
1511         replay_mutex_unlock();
1512
1513         if (!cpu) {
1514             cpu = first_cpu;
1515         }
1516
1517         while (cpu && !cpu->queued_work_first && !cpu->exit_request) {
1518
1519             atomic_mb_set(&tcg_current_rr_cpu, cpu);
1520             current_cpu = cpu;
1521
1522             qemu_clock_enable(QEMU_CLOCK_VIRTUAL,
1523                               (cpu->singlestep_enabled & SSTEP_NOTIMER) == 0);
1524
1525             if (cpu_can_run(cpu)) {
1526                 int r;
1527
1528                 qemu_mutex_unlock_iothread();
1529                 prepare_icount_for_run(cpu);
1530
1531                 r = tcg_cpu_exec(cpu);
1532
1533                 process_icount_data(cpu);
1534                 qemu_mutex_lock_iothread();
1535
1536                 if (r == EXCP_DEBUG) {
1537                     cpu_handle_guest_debug(cpu);
1538                     break;
1539                 } else if (r == EXCP_ATOMIC) {
1540                     qemu_mutex_unlock_iothread();
1541                     cpu_exec_step_atomic(cpu);
1542                     qemu_mutex_lock_iothread();
1543                     break;
1544                 }
1545             } else if (cpu->stop) {
1546                 if (cpu->unplug) {
1547                     cpu = CPU_NEXT(cpu);
1548                 }
1549                 break;
1550             }
1551
1552             cpu = CPU_NEXT(cpu);
1553         } /* while (cpu && !cpu->exit_request).. */
1554
1555         /* Does not need atomic_mb_set because a spurious wakeup is okay.  */
1556         atomic_set(&tcg_current_rr_cpu, NULL);
1557
1558         if (cpu && cpu->exit_request) {
1559             atomic_mb_set(&cpu->exit_request, 0);
1560         }
1561
1562         if (use_icount && all_cpu_threads_idle()) {
1563             /*
1564              * When all cpus are sleeping (e.g in WFI), to avoid a deadlock
1565              * in the main_loop, wake it up in order to start the warp timer.
1566              */
1567             qemu_notify_event();
1568         }
1569
1570         qemu_tcg_rr_wait_io_event();
1571         deal_with_unplugged_cpus();
1572     }
1573
1574     rcu_unregister_thread();
1575     return NULL;
1576 }
1577
1578 static void *qemu_hax_cpu_thread_fn(void *arg)
1579 {
1580     CPUState *cpu = arg;
1581     int r;
1582
1583     rcu_register_thread();
1584     qemu_mutex_lock_iothread();
1585     qemu_thread_get_self(cpu->thread);
1586
1587     cpu->thread_id = qemu_get_thread_id();
1588     cpu->created = true;
1589     cpu->halted = 0;
1590     current_cpu = cpu;
1591
1592     hax_init_vcpu(cpu);
1593     qemu_cond_signal(&qemu_cpu_cond);
1594
1595     do {
1596         if (cpu_can_run(cpu)) {
1597             r = hax_smp_cpu_exec(cpu);
1598             if (r == EXCP_DEBUG) {
1599                 cpu_handle_guest_debug(cpu);
1600             }
1601         }
1602
1603         qemu_wait_io_event(cpu);
1604     } while (!cpu->unplug || cpu_can_run(cpu));
1605     rcu_unregister_thread();
1606     return NULL;
1607 }
1608
1609 /* The HVF-specific vCPU thread function. This one should only run when the host
1610  * CPU supports the VMX "unrestricted guest" feature. */
1611 static void *qemu_hvf_cpu_thread_fn(void *arg)
1612 {
1613     CPUState *cpu = arg;
1614
1615     int r;
1616
1617     assert(hvf_enabled());
1618
1619     rcu_register_thread();
1620
1621     qemu_mutex_lock_iothread();
1622     qemu_thread_get_self(cpu->thread);
1623
1624     cpu->thread_id = qemu_get_thread_id();
1625     cpu->can_do_io = 1;
1626     current_cpu = cpu;
1627
1628     hvf_init_vcpu(cpu);
1629
1630     /* signal CPU creation */
1631     cpu->created = true;
1632     qemu_cond_signal(&qemu_cpu_cond);
1633
1634     do {
1635         if (cpu_can_run(cpu)) {
1636             r = hvf_vcpu_exec(cpu);
1637             if (r == EXCP_DEBUG) {
1638                 cpu_handle_guest_debug(cpu);
1639             }
1640         }
1641         qemu_wait_io_event(cpu);
1642     } while (!cpu->unplug || cpu_can_run(cpu));
1643
1644     hvf_vcpu_destroy(cpu);
1645     cpu->created = false;
1646     qemu_cond_signal(&qemu_cpu_cond);
1647     qemu_mutex_unlock_iothread();
1648     rcu_unregister_thread();
1649     return NULL;
1650 }
1651
1652 static void *qemu_whpx_cpu_thread_fn(void *arg)
1653 {
1654     CPUState *cpu = arg;
1655     int r;
1656
1657     rcu_register_thread();
1658
1659     qemu_mutex_lock_iothread();
1660     qemu_thread_get_self(cpu->thread);
1661     cpu->thread_id = qemu_get_thread_id();
1662     current_cpu = cpu;
1663
1664     r = whpx_init_vcpu(cpu);
1665     if (r < 0) {
1666         fprintf(stderr, "whpx_init_vcpu failed: %s\n", strerror(-r));
1667         exit(1);
1668     }
1669
1670     /* signal CPU creation */
1671     cpu->created = true;
1672     qemu_cond_signal(&qemu_cpu_cond);
1673
1674     do {
1675         if (cpu_can_run(cpu)) {
1676             r = whpx_vcpu_exec(cpu);
1677             if (r == EXCP_DEBUG) {
1678                 cpu_handle_guest_debug(cpu);
1679             }
1680         }
1681         while (cpu_thread_is_idle(cpu)) {
1682             qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
1683         }
1684         qemu_wait_io_event_common(cpu);
1685     } while (!cpu->unplug || cpu_can_run(cpu));
1686
1687     whpx_destroy_vcpu(cpu);
1688     cpu->created = false;
1689     qemu_cond_signal(&qemu_cpu_cond);
1690     qemu_mutex_unlock_iothread();
1691     rcu_unregister_thread();
1692     return NULL;
1693 }
1694
1695 #ifdef _WIN32
1696 static void CALLBACK dummy_apc_func(ULONG_PTR unused)
1697 {
1698 }
1699 #endif
1700
1701 /* Multi-threaded TCG
1702  *
1703  * In the multi-threaded case each vCPU has its own thread. The TLS
1704  * variable current_cpu can be used deep in the code to find the
1705  * current CPUState for a given thread.
1706  */
1707
1708 static void *qemu_tcg_cpu_thread_fn(void *arg)
1709 {
1710     CPUState *cpu = arg;
1711
1712     assert(tcg_enabled());
1713     g_assert(!use_icount);
1714
1715     rcu_register_thread();
1716     tcg_register_thread();
1717
1718     qemu_mutex_lock_iothread();
1719     qemu_thread_get_self(cpu->thread);
1720
1721     cpu->thread_id = qemu_get_thread_id();
1722     cpu->created = true;
1723     cpu->can_do_io = 1;
1724     current_cpu = cpu;
1725     qemu_cond_signal(&qemu_cpu_cond);
1726
1727     /* process any pending work */
1728     cpu->exit_request = 1;
1729
1730     do {
1731         if (cpu_can_run(cpu)) {
1732             int r;
1733             qemu_mutex_unlock_iothread();
1734             r = tcg_cpu_exec(cpu);
1735             qemu_mutex_lock_iothread();
1736             switch (r) {
1737             case EXCP_DEBUG:
1738                 cpu_handle_guest_debug(cpu);
1739                 break;
1740             case EXCP_HALTED:
1741                 /* During start-up the vCPU is reset and the thread is
1742                  * kicked several times. If we don't ensure we go back
1743                  * to sleep in the halted state we won't cleanly
1744                  * start up when the vCPU is enabled.
1745                  *
1746                  * cpu->halted should ensure we sleep in wait_io_event
1747                  */
1748                 g_assert(cpu->halted);
1749                 break;
1750             case EXCP_ATOMIC:
1751                 qemu_mutex_unlock_iothread();
1752                 cpu_exec_step_atomic(cpu);
1753                 qemu_mutex_lock_iothread();
1754             default:
1755                 /* Ignore everything else? */
1756                 break;
1757             }
1758         }
1759
1760         atomic_mb_set(&cpu->exit_request, 0);
1761         qemu_wait_io_event(cpu);
1762     } while (!cpu->unplug || cpu_can_run(cpu));
1763
1764     qemu_tcg_destroy_vcpu(cpu);
1765     cpu->created = false;
1766     qemu_cond_signal(&qemu_cpu_cond);
1767     qemu_mutex_unlock_iothread();
1768     rcu_unregister_thread();
1769     return NULL;
1770 }
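/*
 * Illustrative sketch (not part of this file): the current_cpu TLS variable
 * set in the thread functions above lets code deep in the call stack recover
 * the vCPU that is executing it without passing a CPUState through every
 * interface, e.g.:
 *
 *     static void hypothetical_mmio_write(hwaddr addr, uint64_t val)
 *     {
 *         CPUState *cs = current_cpu;   // thread-local; NULL outside vCPU threads
 *
 *         if (cs) {
 *             qemu_log("write by cpu %d\n", cs->cpu_index);
 *         }
 *     }
 *
 * hypothetical_mmio_write is invented purely for illustration.
 */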
1771
1772 static void qemu_cpu_kick_thread(CPUState *cpu)
1773 {
1774 #ifndef _WIN32
1775     int err;
1776
1777     if (cpu->thread_kicked) {
1778         return;
1779     }
1780     cpu->thread_kicked = true;
1781     err = pthread_kill(cpu->thread->thread, SIG_IPI);
1782     if (err && err != ESRCH) {
1783         fprintf(stderr, "qemu:%s: %s\n", __func__, strerror(err));
1784         exit(1);
1785     }
1786 #else /* _WIN32 */
1787     if (!qemu_cpu_is_self(cpu)) {
1788         if (whpx_enabled()) {
1789             whpx_vcpu_kick(cpu);
1790         } else if (!QueueUserAPC(dummy_apc_func, cpu->hThread, 0)) {
1791             fprintf(stderr, "%s: QueueUserAPC failed with error %lu\n",
1792                     __func__, GetLastError());
1793             exit(1);
1794         }
1795     }
1796 #endif
1797 }
1798
1799 void qemu_cpu_kick(CPUState *cpu)
1800 {
1801     qemu_cond_broadcast(cpu->halt_cond);
1802     if (tcg_enabled()) {
1803         cpu_exit(cpu);
1804         /* NOP unless doing single-thread RR */
1805         qemu_cpu_kick_rr_cpu();
1806     } else {
1807         if (hax_enabled()) {
1808             /*
1809              * FIXME: race condition with the exit_request check in
1810              * hax_vcpu_hax_exec
1811              */
1812             cpu->exit_request = 1;
1813         }
1814         qemu_cpu_kick_thread(cpu);
1815     }
1816 }
1817
1818 void qemu_cpu_kick_self(void)
1819 {
1820     assert(current_cpu);
1821     qemu_cpu_kick_thread(current_cpu);
1822 }
1823
1824 bool qemu_cpu_is_self(CPUState *cpu)
1825 {
1826     return qemu_thread_is_self(cpu->thread);
1827 }
1828
1829 bool qemu_in_vcpu_thread(void)
1830 {
1831     return current_cpu && qemu_cpu_is_self(current_cpu);
1832 }
1833
1834 static __thread bool iothread_locked = false;
1835
1836 bool qemu_mutex_iothread_locked(void)
1837 {
1838     return iothread_locked;
1839 }
1840
1841 /*
1842  * The BQL is taken from so many places that it is worth profiling the
1843  * callers directly, instead of funneling them all through a single function.
1844  */
1845 void qemu_mutex_lock_iothread_impl(const char *file, int line)
1846 {
1847     QemuMutexLockFunc bql_lock = atomic_read(&qemu_bql_mutex_lock_func);
1848
1849     g_assert(!qemu_mutex_iothread_locked());
1850     bql_lock(&qemu_global_mutex, file, line);
1851     iothread_locked = true;
1852 }
1853
1854 void qemu_mutex_unlock_iothread(void)
1855 {
1856     g_assert(qemu_mutex_iothread_locked());
1857     iothread_locked = false;
1858     qemu_mutex_unlock(&qemu_global_mutex);
1859 }
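/*
 * Illustrative usage sketch (not part of this file): code that can be reached
 * both with and without the BQL held typically checks
 * qemu_mutex_iothread_locked() so the lock is taken only when needed and is
 * never taken recursively:
 *
 *     bool locked = qemu_mutex_iothread_locked();
 *
 *     if (!locked) {
 *         qemu_mutex_lock_iothread();
 *     }
 *     // ... access state protected by the BQL ...
 *     if (!locked) {
 *         qemu_mutex_unlock_iothread();
 *     }
 */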
1860
1861 static bool all_vcpus_paused(void)
1862 {
1863     CPUState *cpu;
1864
1865     CPU_FOREACH(cpu) {
1866         if (!cpu->stopped) {
1867             return false;
1868         }
1869     }
1870
1871     return true;
1872 }
1873
1874 void pause_all_vcpus(void)
1875 {
1876     CPUState *cpu;
1877
1878     qemu_clock_enable(QEMU_CLOCK_VIRTUAL, false);
1879     CPU_FOREACH(cpu) {
1880         if (qemu_cpu_is_self(cpu)) {
1881             qemu_cpu_stop(cpu, true);
1882         } else {
1883             cpu->stop = true;
1884             qemu_cpu_kick(cpu);
1885         }
1886     }
1887
1888     /* We need to drop the replay_lock so any vCPU threads woken up
1889      * can finish their replay tasks
1890      */
1891     replay_mutex_unlock();
1892
1893     while (!all_vcpus_paused()) {
1894         qemu_cond_wait(&qemu_pause_cond, &qemu_global_mutex);
1895         CPU_FOREACH(cpu) {
1896             qemu_cpu_kick(cpu);
1897         }
1898     }
1899
1900     qemu_mutex_unlock_iothread();
1901     replay_mutex_lock();
1902     qemu_mutex_lock_iothread();
1903 }
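/*
 * Illustrative sketch (not part of this file): callers that must change
 * global state without racing against running guest CPUs typically bracket
 * the update with pause_all_vcpus()/resume_all_vcpus(), called with the BQL
 * held:
 *
 *     pause_all_vcpus();
 *     // ... perform the update the vCPUs must not observe mid-change ...
 *     resume_all_vcpus();
 */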
1904
1905 void cpu_resume(CPUState *cpu)
1906 {
1907     cpu->stop = false;
1908     cpu->stopped = false;
1909     qemu_cpu_kick(cpu);
1910 }
1911
1912 void resume_all_vcpus(void)
1913 {
1914     CPUState *cpu;
1915
1916     qemu_clock_enable(QEMU_CLOCK_VIRTUAL, true);
1917     CPU_FOREACH(cpu) {
1918         cpu_resume(cpu);
1919     }
1920 }
1921
1922 void cpu_remove_sync(CPUState *cpu)
1923 {
1924     cpu->stop = true;
1925     cpu->unplug = true;
1926     qemu_cpu_kick(cpu);
1927     qemu_mutex_unlock_iothread();
1928     qemu_thread_join(cpu->thread);
1929     qemu_mutex_lock_iothread();
1930 }
1931
1932 /* Size of the temporary buffers used to form a vCPU thread name */
1933 #define VCPU_THREAD_NAME_SIZE 16
1934
1935 static void qemu_tcg_init_vcpu(CPUState *cpu)
1936 {
1937     char thread_name[VCPU_THREAD_NAME_SIZE];
1938     static QemuCond *single_tcg_halt_cond;
1939     static QemuThread *single_tcg_cpu_thread;
1940     static int tcg_region_inited;
1941
1942     assert(tcg_enabled());
1943     /*
1944      * Initialize TCG regions--once. Now is a good time, because:
1945      * (1) TCG's init context, prologue and target globals have been set up.
1946      * (2) qemu_tcg_mttcg_enabled() works now (TCG init code runs before the
1947      *     -accel flag is processed, so the check doesn't work then).
1948      */
1949     if (!tcg_region_inited) {
1950         tcg_region_inited = 1;
1951         tcg_region_init();
1952     }
1953
1954     if (qemu_tcg_mttcg_enabled() || !single_tcg_cpu_thread) {
1955         cpu->thread = g_malloc0(sizeof(QemuThread));
1956         cpu->halt_cond = g_malloc0(sizeof(QemuCond));
1957         qemu_cond_init(cpu->halt_cond);
1958
1959         if (qemu_tcg_mttcg_enabled()) {
1960             /* create a thread per vCPU with TCG (MTTCG) */
1961             parallel_cpus = true;
1962             snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/TCG",
1963                  cpu->cpu_index);
1964
1965             qemu_thread_create(cpu->thread, thread_name, qemu_tcg_cpu_thread_fn,
1966                                cpu, QEMU_THREAD_JOINABLE);
1967
1968         } else {
1969             /* share a single thread for all cpus with TCG */
1970             snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "ALL CPUs/TCG");
1971             qemu_thread_create(cpu->thread, thread_name,
1972                                qemu_tcg_rr_cpu_thread_fn,
1973                                cpu, QEMU_THREAD_JOINABLE);
1974
1975             single_tcg_halt_cond = cpu->halt_cond;
1976             single_tcg_cpu_thread = cpu->thread;
1977         }
1978 #ifdef _WIN32
1979         cpu->hThread = qemu_thread_get_handle(cpu->thread);
1980 #endif
1981     } else {
1982         /* For non-MTTCG cases we share the thread */
1983         cpu->thread = single_tcg_cpu_thread;
1984         cpu->halt_cond = single_tcg_halt_cond;
1985         cpu->thread_id = first_cpu->thread_id;
1986         cpu->can_do_io = 1;
1987         cpu->created = true;
1988     }
1989 }
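/*
 * Note (illustrative, not part of this file): which branch above is taken is
 * decided by qemu_tcg_mttcg_enabled(), i.e. by the TCG accelerator's "thread"
 * property.  As an example of the command-line spelling:
 *
 *     qemu-system-x86_64 -accel tcg,thread=multi    # one thread per vCPU (MTTCG)
 *     qemu-system-x86_64 -accel tcg,thread=single   # one shared round-robin thread
 */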
1990
1991 static void qemu_hax_start_vcpu(CPUState *cpu)
1992 {
1993     char thread_name[VCPU_THREAD_NAME_SIZE];
1994
1995     cpu->thread = g_malloc0(sizeof(QemuThread));
1996     cpu->halt_cond = g_malloc0(sizeof(QemuCond));
1997     qemu_cond_init(cpu->halt_cond);
1998
1999     snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/HAX",
2000              cpu->cpu_index);
2001     qemu_thread_create(cpu->thread, thread_name, qemu_hax_cpu_thread_fn,
2002                        cpu, QEMU_THREAD_JOINABLE);
2003 #ifdef _WIN32
2004     cpu->hThread = qemu_thread_get_handle(cpu->thread);
2005 #endif
2006 }
2007
2008 static void qemu_kvm_start_vcpu(CPUState *cpu)
2009 {
2010     char thread_name[VCPU_THREAD_NAME_SIZE];
2011
2012     cpu->thread = g_malloc0(sizeof(QemuThread));
2013     cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2014     qemu_cond_init(cpu->halt_cond);
2015     snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/KVM",
2016              cpu->cpu_index);
2017     qemu_thread_create(cpu->thread, thread_name, qemu_kvm_cpu_thread_fn,
2018                        cpu, QEMU_THREAD_JOINABLE);
2019 }
2020
2021 static void qemu_hvf_start_vcpu(CPUState *cpu)
2022 {
2023     char thread_name[VCPU_THREAD_NAME_SIZE];
2024
2025     /* HVF currently does not support TCG, and only runs in
2026      * unrestricted-guest mode. */
2027     assert(hvf_enabled());
2028
2029     cpu->thread = g_malloc0(sizeof(QemuThread));
2030     cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2031     qemu_cond_init(cpu->halt_cond);
2032
2033     snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/HVF",
2034              cpu->cpu_index);
2035     qemu_thread_create(cpu->thread, thread_name, qemu_hvf_cpu_thread_fn,
2036                        cpu, QEMU_THREAD_JOINABLE);
2037 }
2038
2039 static void qemu_whpx_start_vcpu(CPUState *cpu)
2040 {
2041     char thread_name[VCPU_THREAD_NAME_SIZE];
2042
2043     cpu->thread = g_malloc0(sizeof(QemuThread));
2044     cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2045     qemu_cond_init(cpu->halt_cond);
2046     snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/WHPX",
2047              cpu->cpu_index);
2048     qemu_thread_create(cpu->thread, thread_name, qemu_whpx_cpu_thread_fn,
2049                        cpu, QEMU_THREAD_JOINABLE);
2050 #ifdef _WIN32
2051     cpu->hThread = qemu_thread_get_handle(cpu->thread);
2052 #endif
2053 }
2054
2055 static void qemu_dummy_start_vcpu(CPUState *cpu)
2056 {
2057     char thread_name[VCPU_THREAD_NAME_SIZE];
2058
2059     cpu->thread = g_malloc0(sizeof(QemuThread));
2060     cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2061     qemu_cond_init(cpu->halt_cond);
2062     snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/DUMMY",
2063              cpu->cpu_index);
2064     qemu_thread_create(cpu->thread, thread_name, qemu_dummy_cpu_thread_fn, cpu,
2065                        QEMU_THREAD_JOINABLE);
2066 }
2067
2068 void qemu_init_vcpu(CPUState *cpu)
2069 {
2070     cpu->nr_cores = smp_cores;
2071     cpu->nr_threads = smp_threads;
2072     cpu->stopped = true;
2073
2074     if (!cpu->as) {
2075         /* If the target cpu hasn't set up any address spaces itself,
2076          * give it the default one.
2077          */
2078         cpu->num_ases = 1;
2079         cpu_address_space_init(cpu, 0, "cpu-memory", cpu->memory);
2080     }
2081
2082     if (kvm_enabled()) {
2083         qemu_kvm_start_vcpu(cpu);
2084     } else if (hax_enabled()) {
2085         qemu_hax_start_vcpu(cpu);
2086     } else if (hvf_enabled()) {
2087         qemu_hvf_start_vcpu(cpu);
2088     } else if (tcg_enabled()) {
2089         qemu_tcg_init_vcpu(cpu);
2090     } else if (whpx_enabled()) {
2091         qemu_whpx_start_vcpu(cpu);
2092     } else {
2093         qemu_dummy_start_vcpu(cpu);
2094     }
2095
2096     while (!cpu->created) {
2097         qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
2098     }
2099 }
2100
2101 void cpu_stop_current(void)
2102 {
2103     if (current_cpu) {
2104         current_cpu->stop = true;
2105         cpu_exit(current_cpu);
2106     }
2107 }
2108
2109 int vm_stop(RunState state)
2110 {
2111     if (qemu_in_vcpu_thread()) {
2112         qemu_system_vmstop_request_prepare();
2113         qemu_system_vmstop_request(state);
2114         /*
2115          * FIXME: should not return to device code in case
2116          * vm_stop() has been requested.
2117          */
2118         cpu_stop_current();
2119         return 0;
2120     }
2121
2122     return do_vm_stop(state, true);
2123 }
2124
2125 /**
2126  * Prepare for (re)starting the VM.
2127  * Returns -1 if the vCPUs are not to be restarted (e.g. if they are already
2128  * running or in case of an error condition), 0 otherwise.
2129  */
2130 int vm_prepare_start(void)
2131 {
2132     RunState requested;
2133
2134     qemu_vmstop_requested(&requested);
2135     if (runstate_is_running() && requested == RUN_STATE__MAX) {
2136         return -1;
2137     }
2138
2139     /* Ensure that a STOP/RESUME pair of events is emitted if a
2140      * vmstop request was pending.  The BLOCK_IO_ERROR event, for
2141      * example, is documented as always being followed by the STOP
2142      * event.
2143      */
2144     if (runstate_is_running()) {
2145         qapi_event_send_stop();
2146         qapi_event_send_resume();
2147         return -1;
2148     }
2149
2150     /* We are sending this now, but the CPUs will be resumed shortly afterwards */
2151     qapi_event_send_resume();
2152
2153     replay_enable_events();
2154     cpu_enable_ticks();
2155     runstate_set(RUN_STATE_RUNNING);
2156     vm_state_notify(1, RUN_STATE_RUNNING);
2157     return 0;
2158 }
2159
2160 void vm_start(void)
2161 {
2162     if (!vm_prepare_start()) {
2163         resume_all_vcpus();
2164     }
2165 }
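/*
 * Illustrative note (not part of this file): the monitor's "stop" and "cont"
 * commands are, roughly, thin wrappers around the two functions above:
 *
 *     void qmp_stop(Error **errp) { ... vm_stop(RUN_STATE_PAUSED); ... }
 *     void qmp_cont(Error **errp) { ... vm_start(); ... }
 *
 * The elided parts deal with migration and block-device corner cases; see the
 * actual implementations elsewhere in the tree.
 */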
2166
2167 /* does a state transition even if the VM is already stopped;
2168    the current state is forgotten forever */
2169 int vm_stop_force_state(RunState state)
2170 {
2171     if (runstate_is_running()) {
2172         return vm_stop(state);
2173     } else {
2174         runstate_set(state);
2175
2176         bdrv_drain_all();
2177         /* Make sure to return an error if the flush in a previous vm_stop()
2178          * failed. */
2179         return bdrv_flush_all();
2180     }
2181 }
2182
2183 void list_cpus(FILE *f, fprintf_function cpu_fprintf, const char *optarg)
2184 {
2185     /* XXX: implement xxx_cpu_list for targets that still lack it */
2186 #if defined(cpu_list)
2187     cpu_list(f, cpu_fprintf);
2188 #endif
2189 }
2190
2191 CpuInfoList *qmp_query_cpus(Error **errp)
2192 {
2193     MachineState *ms = MACHINE(qdev_get_machine());
2194     MachineClass *mc = MACHINE_GET_CLASS(ms);
2195     CpuInfoList *head = NULL, *cur_item = NULL;
2196     CPUState *cpu;
2197
2198     CPU_FOREACH(cpu) {
2199         CpuInfoList *info;
2200 #if defined(TARGET_I386)
2201         X86CPU *x86_cpu = X86_CPU(cpu);
2202         CPUX86State *env = &x86_cpu->env;
2203 #elif defined(TARGET_PPC)
2204         PowerPCCPU *ppc_cpu = POWERPC_CPU(cpu);
2205         CPUPPCState *env = &ppc_cpu->env;
2206 #elif defined(TARGET_SPARC)
2207         SPARCCPU *sparc_cpu = SPARC_CPU(cpu);
2208         CPUSPARCState *env = &sparc_cpu->env;
2209 #elif defined(TARGET_RISCV)
2210         RISCVCPU *riscv_cpu = RISCV_CPU(cpu);
2211         CPURISCVState *env = &riscv_cpu->env;
2212 #elif defined(TARGET_MIPS)
2213         MIPSCPU *mips_cpu = MIPS_CPU(cpu);
2214         CPUMIPSState *env = &mips_cpu->env;
2215 #elif defined(TARGET_TRICORE)
2216         TriCoreCPU *tricore_cpu = TRICORE_CPU(cpu);
2217         CPUTriCoreState *env = &tricore_cpu->env;
2218 #elif defined(TARGET_S390X)
2219         S390CPU *s390_cpu = S390_CPU(cpu);
2220         CPUS390XState *env = &s390_cpu->env;
2221 #endif
2222
2223         cpu_synchronize_state(cpu);
2224
2225         info = g_malloc0(sizeof(*info));
2226         info->value = g_malloc0(sizeof(*info->value));
2227         info->value->CPU = cpu->cpu_index;
2228         info->value->current = (cpu == first_cpu);
2229         info->value->halted = cpu->halted;
2230         info->value->qom_path = object_get_canonical_path(OBJECT(cpu));
2231         info->value->thread_id = cpu->thread_id;
2232 #if defined(TARGET_I386)
2233         info->value->arch = CPU_INFO_ARCH_X86;
2234         info->value->u.x86.pc = env->eip + env->segs[R_CS].base;
2235 #elif defined(TARGET_PPC)
2236         info->value->arch = CPU_INFO_ARCH_PPC;
2237         info->value->u.ppc.nip = env->nip;
2238 #elif defined(TARGET_SPARC)
2239         info->value->arch = CPU_INFO_ARCH_SPARC;
2240         info->value->u.q_sparc.pc = env->pc;
2241         info->value->u.q_sparc.npc = env->npc;
2242 #elif defined(TARGET_MIPS)
2243         info->value->arch = CPU_INFO_ARCH_MIPS;
2244         info->value->u.q_mips.PC = env->active_tc.PC;
2245 #elif defined(TARGET_TRICORE)
2246         info->value->arch = CPU_INFO_ARCH_TRICORE;
2247         info->value->u.tricore.PC = env->PC;
2248 #elif defined(TARGET_S390X)
2249         info->value->arch = CPU_INFO_ARCH_S390;
2250         info->value->u.s390.cpu_state = env->cpu_state;
2251 #elif defined(TARGET_RISCV)
2252         info->value->arch = CPU_INFO_ARCH_RISCV;
2253         info->value->u.riscv.pc = env->pc;
2254 #else
2255         info->value->arch = CPU_INFO_ARCH_OTHER;
2256 #endif
2257         info->value->has_props = !!mc->cpu_index_to_instance_props;
2258         if (info->value->has_props) {
2259             CpuInstanceProperties *props;
2260             props = g_malloc0(sizeof(*props));
2261             *props = mc->cpu_index_to_instance_props(ms, cpu->cpu_index);
2262             info->value->props = props;
2263         }
2264
2265         /* XXX: waiting for the qapi to support GSList */
2266         if (!cur_item) {
2267             head = cur_item = info;
2268         } else {
2269             cur_item->next = info;
2270             cur_item = info;
2271         }
2272     }
2273
2274     return head;
2275 }
2276
2277 static CpuInfoArch sysemu_target_to_cpuinfo_arch(SysEmuTarget target)
2278 {
2279     /*
2280      * The @SysEmuTarget -> @CpuInfoArch mapping below is based on the
2281      * TARGET_ARCH -> TARGET_BASE_ARCH mapping in the "configure" script.
2282      */
2283     switch (target) {
2284     case SYS_EMU_TARGET_I386:
2285     case SYS_EMU_TARGET_X86_64:
2286         return CPU_INFO_ARCH_X86;
2287
2288     case SYS_EMU_TARGET_PPC:
2289     case SYS_EMU_TARGET_PPC64:
2290         return CPU_INFO_ARCH_PPC;
2291
2292     case SYS_EMU_TARGET_SPARC:
2293     case SYS_EMU_TARGET_SPARC64:
2294         return CPU_INFO_ARCH_SPARC;
2295
2296     case SYS_EMU_TARGET_MIPS:
2297     case SYS_EMU_TARGET_MIPSEL:
2298     case SYS_EMU_TARGET_MIPS64:
2299     case SYS_EMU_TARGET_MIPS64EL:
2300         return CPU_INFO_ARCH_MIPS;
2301
2302     case SYS_EMU_TARGET_TRICORE:
2303         return CPU_INFO_ARCH_TRICORE;
2304
2305     case SYS_EMU_TARGET_S390X:
2306         return CPU_INFO_ARCH_S390;
2307
2308     case SYS_EMU_TARGET_RISCV32:
2309     case SYS_EMU_TARGET_RISCV64:
2310         return CPU_INFO_ARCH_RISCV;
2311
2312     default:
2313         return CPU_INFO_ARCH_OTHER;
2314     }
2315 }
2316
2317 static void cpustate_to_cpuinfo_s390(CpuInfoS390 *info, const CPUState *cpu)
2318 {
2319 #ifdef TARGET_S390X
2320     S390CPU *s390_cpu = S390_CPU(cpu);
2321     CPUS390XState *env = &s390_cpu->env;
2322
2323     info->cpu_state = env->cpu_state;
2324 #else
2325     abort();
2326 #endif
2327 }
2328
2329 /*
2330  * fast means: we NEVER interrupt vCPU threads to retrieve
2331  * information from KVM.
2332  */
2333 CpuInfoFastList *qmp_query_cpus_fast(Error **errp)
2334 {
2335     MachineState *ms = MACHINE(qdev_get_machine());
2336     MachineClass *mc = MACHINE_GET_CLASS(ms);
2337     CpuInfoFastList *head = NULL, *cur_item = NULL;
2338     SysEmuTarget target = qapi_enum_parse(&SysEmuTarget_lookup, TARGET_NAME,
2339                                           -1, &error_abort);
2340     CPUState *cpu;
2341
2342     CPU_FOREACH(cpu) {
2343         CpuInfoFastList *info = g_malloc0(sizeof(*info));
2344         info->value = g_malloc0(sizeof(*info->value));
2345
2346         info->value->cpu_index = cpu->cpu_index;
2347         info->value->qom_path = object_get_canonical_path(OBJECT(cpu));
2348         info->value->thread_id = cpu->thread_id;
2349
2350         info->value->has_props = !!mc->cpu_index_to_instance_props;
2351         if (info->value->has_props) {
2352             CpuInstanceProperties *props;
2353             props = g_malloc0(sizeof(*props));
2354             *props = mc->cpu_index_to_instance_props(ms, cpu->cpu_index);
2355             info->value->props = props;
2356         }
2357
2358         info->value->arch = sysemu_target_to_cpuinfo_arch(target);
2359         info->value->target = target;
2360         if (target == SYS_EMU_TARGET_S390X) {
2361             cpustate_to_cpuinfo_s390(&info->value->u.s390x, cpu);
2362         }
2363
2364         if (!cur_item) {
2365             head = cur_item = info;
2366         } else {
2367             cur_item->next = info;
2368             cur_item = info;
2369         }
2370     }
2371
2372     return head;
2373 }
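/*
 * Illustrative QMP exchange (concrete values invented for the example):
 *
 *     -> { "execute": "query-cpus-fast" }
 *     <- { "return": [
 *            { "cpu-index": 0,
 *              "qom-path": "/machine/unattached/device[0]",
 *              "thread-id": 25627,
 *              "arch": "x86",
 *              "target": "x86_64",
 *              "props": { "core-id": 0, "socket-id": 0, "thread-id": 0 } }
 *          ] }
 *
 * The member names mirror the fields filled in above; "props" is only present
 * when the machine class implements cpu_index_to_instance_props().
 */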
2374
2375 void qmp_memsave(int64_t addr, int64_t size, const char *filename,
2376                  bool has_cpu, int64_t cpu_index, Error **errp)
2377 {
2378     FILE *f;
2379     uint32_t l;
2380     CPUState *cpu;
2381     uint8_t buf[1024];
2382     int64_t orig_addr = addr, orig_size = size;
2383
2384     if (!has_cpu) {
2385         cpu_index = 0;
2386     }
2387
2388     cpu = qemu_get_cpu(cpu_index);
2389     if (cpu == NULL) {
2390         error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cpu-index",
2391                    "a CPU number");
2392         return;
2393     }
2394
2395     f = fopen(filename, "wb");
2396     if (!f) {
2397         error_setg_file_open(errp, errno, filename);
2398         return;
2399     }
2400
2401     while (size != 0) {
2402         l = sizeof(buf);
2403         if (l > size)
2404             l = size;
2405         if (cpu_memory_rw_debug(cpu, addr, buf, l, 0) != 0) {
2406             error_setg(errp, "Invalid addr 0x%016" PRIx64 "/size %" PRId64
2407                              " specified", orig_addr, orig_size);
2408             goto exit;
2409         }
2410         if (fwrite(buf, 1, l, f) != l) {
2411             error_setg(errp, QERR_IO_ERROR);
2412             goto exit;
2413         }
2414         addr += l;
2415         size -= l;
2416     }
2417
2418 exit:
2419     fclose(f);
2420 }
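/*
 * Illustrative QMP usage (concrete values invented for the example):
 *
 *     -> { "execute": "memsave",
 *          "arguments": { "val": 4096,
 *                         "size": 1024,
 *                         "filename": "/tmp/virtual-mem-dump" } }
 *     <- { "return": {} }
 *
 * "val" is the starting guest virtual address ("addr" in the C signature
 * above); the optional "cpu-index" argument selects whose address space is
 * read and, as the code shows, defaults to CPU 0.
 */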
2421
2422 void qmp_pmemsave(int64_t addr, int64_t size, const char *filename,
2423                   Error **errp)
2424 {
2425     FILE *f;
2426     uint32_t l;
2427     uint8_t buf[1024];
2428
2429     f = fopen(filename, "wb");
2430     if (!f) {
2431         error_setg_file_open(errp, errno, filename);
2432         return;
2433     }
2434
2435     while (size != 0) {
2436         l = sizeof(buf);
2437         if (l > size)
2438             l = size;
2439         cpu_physical_memory_read(addr, buf, l);
2440         if (fwrite(buf, 1, l, f) != l) {
2441             error_setg(errp, QERR_IO_ERROR);
2442             goto exit;
2443         }
2444         addr += l;
2445         size -= l;
2446     }
2447
2448 exit:
2449     fclose(f);
2450 }
2451
2452 void qmp_inject_nmi(Error **errp)
2453 {
2454     nmi_monitor_handle(monitor_get_cpu_index(), errp);
2455 }
2456
2457 void dump_drift_info(FILE *f, fprintf_function cpu_fprintf)
2458 {
2459     if (!use_icount) {
2460         return;
2461     }
2462
2463     cpu_fprintf(f, "Host - Guest clock  %"PRIi64" ms\n",
2464                 (cpu_get_clock() - cpu_get_icount())/SCALE_MS);
2465     if (icount_align_option) {
2466         cpu_fprintf(f, "Max guest delay     %"PRIi64" ms\n", -max_delay/SCALE_MS);
2467         cpu_fprintf(f, "Max guest advance   %"PRIi64" ms\n", max_advance/SCALE_MS);
2468     } else {
2469         cpu_fprintf(f, "Max guest delay     NA\n");
2470         cpu_fprintf(f, "Max guest advance   NA\n");
2471     }
2472 }