[qmiga/qemu.git] / cpus.c
1 /*
2  * QEMU System Emulator
3  *
4  * Copyright (c) 2003-2008 Fabrice Bellard
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to deal
8  * in the Software without restriction, including without limitation the rights
9  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10  * copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in
14  * all copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22  * THE SOFTWARE.
23  */
24
25 #include "qemu/osdep.h"
26 #include "qemu/config-file.h"
27 #include "cpu.h"
28 #include "monitor/monitor.h"
29 #include "qapi/error.h"
30 #include "qapi/qapi-commands-misc.h"
31 #include "qapi/qapi-events-run-state.h"
32 #include "qapi/qmp/qerror.h"
33 #include "qemu/error-report.h"
34 #include "sysemu/sysemu.h"
35 #include "sysemu/block-backend.h"
36 #include "exec/gdbstub.h"
37 #include "sysemu/dma.h"
38 #include "sysemu/hw_accel.h"
39 #include "sysemu/kvm.h"
40 #include "sysemu/hax.h"
41 #include "sysemu/hvf.h"
42 #include "sysemu/whpx.h"
43 #include "exec/exec-all.h"
44
45 #include "qemu/thread.h"
46 #include "sysemu/cpus.h"
47 #include "sysemu/qtest.h"
48 #include "qemu/main-loop.h"
49 #include "qemu/option.h"
50 #include "qemu/bitmap.h"
51 #include "qemu/seqlock.h"
52 #include "tcg.h"
53 #include "hw/nmi.h"
54 #include "sysemu/replay.h"
55 #include "hw/boards.h"
56
57 #ifdef CONFIG_LINUX
58
59 #include <sys/prctl.h>
60
61 #ifndef PR_MCE_KILL
62 #define PR_MCE_KILL 33
63 #endif
64
65 #ifndef PR_MCE_KILL_SET
66 #define PR_MCE_KILL_SET 1
67 #endif
68
69 #ifndef PR_MCE_KILL_EARLY
70 #define PR_MCE_KILL_EARLY 1
71 #endif
72
73 #endif /* CONFIG_LINUX */
74
75 int64_t max_delay;
76 int64_t max_advance;
77
78 /* vcpu throttling controls */
79 static QEMUTimer *throttle_timer;
80 static unsigned int throttle_percentage;
81
82 #define CPU_THROTTLE_PCT_MIN 1
83 #define CPU_THROTTLE_PCT_MAX 99
84 #define CPU_THROTTLE_TIMESLICE_NS 10000000
85
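/* A vCPU counts as stopped when it has been individually stopped or the
 * whole VM is not in the "running" runstate.
 */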
86 bool cpu_is_stopped(CPUState *cpu)
87 {
88     return cpu->stopped || !runstate_is_running();
89 }
90
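/* A vCPU thread is idle only if it has no pending stop request or queued
 * work, and is either fully stopped or halted with nothing to do (a halted
 * vCPU still counts as busy when the hypervisor handles HLT in-kernel, as
 * KVM can).
 */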
91 static bool cpu_thread_is_idle(CPUState *cpu)
92 {
93     if (cpu->stop || cpu->queued_work_first) {
94         return false;
95     }
96     if (cpu_is_stopped(cpu)) {
97         return true;
98     }
99     if (!cpu->halted || cpu_has_work(cpu) ||
100         kvm_halt_in_kernel()) {
101         return false;
102     }
103     return true;
104 }
105
106 static bool all_cpu_threads_idle(void)
107 {
108     CPUState *cpu;
109
110     CPU_FOREACH(cpu) {
111         if (!cpu_thread_is_idle(cpu)) {
112             return false;
113         }
114     }
115     return true;
116 }
117
118 /***********************************************************/
119 /* guest cycle counter */
120
121 /* Protected by TimersState seqlock */
122
123 static bool icount_sleep = true;
124 /* Arbitrarily pick 1MIPS as the minimum allowable speed.  */
125 #define MAX_ICOUNT_SHIFT 10
126
127 typedef struct TimersState {
128     /* Protected by BQL.  */
129     int64_t cpu_ticks_prev;
130     int64_t cpu_ticks_offset;
131
132     /* Protect fields that can be read outside the BQL and written
133      * from multiple threads.
134      */
135     QemuSeqLock vm_clock_seqlock;
136     QemuSpin vm_clock_lock;
137
138     int16_t cpu_ticks_enabled;
139
140     /* Conversion factor from emulated instructions to virtual clock ticks.  */
141     int16_t icount_time_shift;
142
143     /* Compensate for varying guest execution speed.  */
144     int64_t qemu_icount_bias;
145
146     int64_t vm_clock_warp_start;
147     int64_t cpu_clock_offset;
148
149     /* Only written by TCG thread */
150     int64_t qemu_icount;
151
152     /* for adjusting icount */
153     QEMUTimer *icount_rt_timer;
154     QEMUTimer *icount_vm_timer;
155     QEMUTimer *icount_warp_timer;
156 } TimersState;
157
158 static TimersState timers_state;
159 bool mttcg_enabled;
160
161 /*
162  * We default to false if we know other options have been enabled
163  * which are currently incompatible with MTTCG. Otherwise when each
164  * guest (target) has been updated to support:
165  *   - atomic instructions
166  *   - memory ordering primitives (barriers)
167  * they can set the appropriate CONFIG flags in ${target}-softmmu.mak
168  *
169  * Once a guest architecture has been converted to the new primitives
170  * there are two remaining limitations to check.
171  *
172  * - The guest can't be oversized (e.g. 64 bit guest on 32 bit host)
173  * - The host must have a memory order at least as strong as the guest's
174  *
175  * It may be possible in future to support strong guests on weak hosts
176  * but that will require tagging all load/stores in a guest with their
177  * implicit memory order requirements which would likely slow things
178  * down a lot.
179  */
180
181 static bool check_tcg_memory_orders_compatible(void)
182 {
183 #if defined(TCG_GUEST_DEFAULT_MO) && defined(TCG_TARGET_DEFAULT_MO)
184     return (TCG_GUEST_DEFAULT_MO & ~TCG_TARGET_DEFAULT_MO) == 0;
185 #else
186     return false;
187 #endif
188 }
189
190 static bool default_mttcg_enabled(void)
191 {
192     if (use_icount || TCG_OVERSIZED_GUEST) {
193         return false;
194     } else {
195 #ifdef TARGET_SUPPORTS_MTTCG
196         return check_tcg_memory_orders_compatible();
197 #else
198         return false;
199 #endif
200     }
201 }
202
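/* Apply the TCG "thread" suboption ("single" or "multi").  "multi" is
 * rejected when the guest is oversized or icount is in use; with no explicit
 * setting the default depends on the guest's MTTCG support and on
 * memory-ordering compatibility between guest and host.
 */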
203 void qemu_tcg_configure(QemuOpts *opts, Error **errp)
204 {
205     const char *t = qemu_opt_get(opts, "thread");
206     if (t) {
207         if (strcmp(t, "multi") == 0) {
208             if (TCG_OVERSIZED_GUEST) {
209                 error_setg(errp, "No MTTCG when guest word size > host's");
210             } else if (use_icount) {
211                 error_setg(errp, "No MTTCG when icount is enabled");
212             } else {
213 #ifndef TARGET_SUPPORTS_MTTCG
214                 warn_report("Guest not yet converted to MTTCG - "
215                             "you may get unexpected results");
216 #endif
217                 if (!check_tcg_memory_orders_compatible()) {
218                     warn_report("Guest expects a stronger memory ordering "
219                                 "than the host provides");
220                     error_printf("This may cause strange/hard to debug errors\n");
221                 }
222                 mttcg_enabled = true;
223             }
224         } else if (strcmp(t, "single") == 0) {
225             mttcg_enabled = false;
226         } else {
227             error_setg(errp, "Invalid 'thread' setting %s", t);
228         }
229     } else {
230         mttcg_enabled = default_mttcg_enabled();
231     }
232 }
233
234 /* The current number of executed instructions is based on what we
235  * originally budgeted minus the current state of the decrementing
236  * icount counters (icount_extra and icount_decr.u16.low).
237  */
238 static int64_t cpu_get_icount_executed(CPUState *cpu)
239 {
240     return cpu->icount_budget - (cpu->icount_decr.u16.low + cpu->icount_extra);
241 }
242
243 /*
244  * Update the global shared timer_state.qemu_icount to take into
245  * account executed instructions. This is done by the TCG vCPU
246  * thread so the main-loop can see time has moved forward.
247  */
248 static void cpu_update_icount_locked(CPUState *cpu)
249 {
250     int64_t executed = cpu_get_icount_executed(cpu);
251     cpu->icount_budget -= executed;
252
253     atomic_set_i64(&timers_state.qemu_icount,
254                    timers_state.qemu_icount + executed);
255 }
256
257 /*
258  * Update the global shared timer_state.qemu_icount to take into
259  * account executed instructions. This is done by the TCG vCPU
260  * thread so the main-loop can see time has moved forward.
261  */
262 void cpu_update_icount(CPUState *cpu)
263 {
264     seqlock_write_lock(&timers_state.vm_clock_seqlock,
265                        &timers_state.vm_clock_lock);
266     cpu_update_icount_locked(cpu);
267     seqlock_write_unlock(&timers_state.vm_clock_seqlock,
268                          &timers_state.vm_clock_lock);
269 }
270
271 static int64_t cpu_get_icount_raw_locked(void)
272 {
273     CPUState *cpu = current_cpu;
274
275     if (cpu && cpu->running) {
276         if (!cpu->can_do_io) {
277             error_report("Bad icount read");
278             exit(1);
279         }
280         /* Take into account what has run */
281         cpu_update_icount_locked(cpu);
282     }
283     /* The read is protected by the seqlock, but needs atomic64 to avoid UB */
284     return atomic_read_i64(&timers_state.qemu_icount);
285 }
286
287 static int64_t cpu_get_icount_locked(void)
288 {
289     int64_t icount = cpu_get_icount_raw_locked();
290     return atomic_read_i64(&timers_state.qemu_icount_bias) +
291         cpu_icount_to_ns(icount);
292 }
293
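/* Lock-free readers use the canonical seqlock pattern so they never see a
 * torn TimersState even when a writer races with them:
 *
 *     do {
 *         start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
 *         ...read the protected fields...
 *     } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
 */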
294 int64_t cpu_get_icount_raw(void)
295 {
296     int64_t icount;
297     unsigned start;
298
299     do {
300         start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
301         icount = cpu_get_icount_raw_locked();
302     } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
303
304     return icount;
305 }
306
307 /* Return the virtual CPU time, based on the instruction counter.  */
308 int64_t cpu_get_icount(void)
309 {
310     int64_t icount;
311     unsigned start;
312
313     do {
314         start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
315         icount = cpu_get_icount_locked();
316     } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
317
318     return icount;
319 }
320
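/* With icount, each executed guest instruction accounts for
 * 2^icount_time_shift nanoseconds of virtual time.  For example the initial
 * shift of 3 used by "shift=auto" maps one instruction to 8 ns, i.e. a
 * nominal 125 MIPS.
 */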
321 int64_t cpu_icount_to_ns(int64_t icount)
322 {
323     return icount << atomic_read(&timers_state.icount_time_shift);
324 }
325
326 static int64_t cpu_get_ticks_locked(void)
327 {
328     int64_t ticks = timers_state.cpu_ticks_offset;
329     if (timers_state.cpu_ticks_enabled) {
330         ticks += cpu_get_host_ticks();
331     }
332
333     if (timers_state.cpu_ticks_prev > ticks) {
334         /* Non-increasing ticks may happen if the host uses software suspend.  */
335         timers_state.cpu_ticks_offset += timers_state.cpu_ticks_prev - ticks;
336         ticks = timers_state.cpu_ticks_prev;
337     }
338
339     timers_state.cpu_ticks_prev = ticks;
340     return ticks;
341 }
342
343 /* return the time elapsed in VM between vm_start and vm_stop.  Unless
344  * icount is active, cpu_get_ticks() uses units of the host CPU cycle
345  * counter.
346  */
347 int64_t cpu_get_ticks(void)
348 {
349     int64_t ticks;
350
351     if (use_icount) {
352         return cpu_get_icount();
353     }
354
355     qemu_spin_lock(&timers_state.vm_clock_lock);
356     ticks = cpu_get_ticks_locked();
357     qemu_spin_unlock(&timers_state.vm_clock_lock);
358     return ticks;
359 }
360
361 static int64_t cpu_get_clock_locked(void)
362 {
363     int64_t time;
364
365     time = timers_state.cpu_clock_offset;
366     if (timers_state.cpu_ticks_enabled) {
367         time += get_clock();
368     }
369
370     return time;
371 }
372
373 /* Return the monotonic time elapsed in VM, i.e.,
374  * the time between vm_start and vm_stop
375  */
376 int64_t cpu_get_clock(void)
377 {
378     int64_t ti;
379     unsigned start;
380
381     do {
382         start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
383         ti = cpu_get_clock_locked();
384     } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
385
386     return ti;
387 }
388
389 /* enable cpu_get_ticks()
390  * Caller must hold BQL which serves as mutex for vm_clock_seqlock.
391  */
392 void cpu_enable_ticks(void)
393 {
394     seqlock_write_lock(&timers_state.vm_clock_seqlock,
395                        &timers_state.vm_clock_lock);
396     if (!timers_state.cpu_ticks_enabled) {
397         timers_state.cpu_ticks_offset -= cpu_get_host_ticks();
398         timers_state.cpu_clock_offset -= get_clock();
399         timers_state.cpu_ticks_enabled = 1;
400     }
401     seqlock_write_unlock(&timers_state.vm_clock_seqlock,
402                        &timers_state.vm_clock_lock);
403 }
404
405 /* disable cpu_get_ticks() : the clock is stopped. You must not call
406  * cpu_get_ticks() after that.
407  * Caller must hold BQL which serves as mutex for vm_clock_seqlock.
408  */
409 void cpu_disable_ticks(void)
410 {
411     seqlock_write_lock(&timers_state.vm_clock_seqlock,
412                        &timers_state.vm_clock_lock);
413     if (timers_state.cpu_ticks_enabled) {
414         timers_state.cpu_ticks_offset += cpu_get_host_ticks();
415         timers_state.cpu_clock_offset = cpu_get_clock_locked();
416         timers_state.cpu_ticks_enabled = 0;
417     }
418     seqlock_write_unlock(&timers_state.vm_clock_seqlock,
419                          &timers_state.vm_clock_lock);
420 }
421
422 /* Correlation between real and virtual time is always going to be
423    fairly approximate, so ignore small variation.
424    When the guest is idle real and virtual time will be aligned in
425    the IO wait loop.  */
426 #define ICOUNT_WOBBLE (NANOSECONDS_PER_SECOND / 10)
427
428 static void icount_adjust(void)
429 {
430     int64_t cur_time;
431     int64_t cur_icount;
432     int64_t delta;
433
434     /* Protected by TimersState mutex.  */
435     static int64_t last_delta;
436
437     /* If the VM is not running, then do nothing.  */
438     if (!runstate_is_running()) {
439         return;
440     }
441
442     seqlock_write_lock(&timers_state.vm_clock_seqlock,
443                        &timers_state.vm_clock_lock);
444     cur_time = cpu_get_clock_locked();
445     cur_icount = cpu_get_icount_locked();
446
447     delta = cur_icount - cur_time;
448     /* FIXME: This is a very crude algorithm, somewhat prone to oscillation.  */
449     if (delta > 0
450         && last_delta + ICOUNT_WOBBLE < delta * 2
451         && timers_state.icount_time_shift > 0) {
452         /* The guest is getting too far ahead.  Slow time down.  */
453         atomic_set(&timers_state.icount_time_shift,
454                    timers_state.icount_time_shift - 1);
455     }
456     if (delta < 0
457         && last_delta - ICOUNT_WOBBLE > delta * 2
458         && timers_state.icount_time_shift < MAX_ICOUNT_SHIFT) {
459         /* The guest is getting too far behind.  Speed time up.  */
460         atomic_set(&timers_state.icount_time_shift,
461                    timers_state.icount_time_shift + 1);
462     }
463     last_delta = delta;
464     atomic_set_i64(&timers_state.qemu_icount_bias,
465                    cur_icount - (timers_state.qemu_icount
466                                  << timers_state.icount_time_shift));
467     seqlock_write_unlock(&timers_state.vm_clock_seqlock,
468                          &timers_state.vm_clock_lock);
469 }
470
471 static void icount_adjust_rt(void *opaque)
472 {
473     timer_mod(timers_state.icount_rt_timer,
474               qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT) + 1000);
475     icount_adjust();
476 }
477
478 static void icount_adjust_vm(void *opaque)
479 {
480     timer_mod(timers_state.icount_vm_timer,
481                    qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
482                    NANOSECONDS_PER_SECOND / 10);
483     icount_adjust();
484 }
485
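/* Convert a virtual-time interval in nanoseconds into an instruction count,
 * i.e. divide by 2^icount_time_shift, rounding up.
 */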
486 static int64_t qemu_icount_round(int64_t count)
487 {
488     int shift = atomic_read(&timers_state.icount_time_shift);
489     return (count + (1 << shift) - 1) >> shift;
490 }
491
492 static void icount_warp_rt(void)
493 {
494     unsigned seq;
495     int64_t warp_start;
496
497     /* The icount_warp_timer is rescheduled soon after vm_clock_warp_start
498      * changes from -1 to another value, so the race here is okay.
499      */
500     do {
501         seq = seqlock_read_begin(&timers_state.vm_clock_seqlock);
502         warp_start = timers_state.vm_clock_warp_start;
503     } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, seq));
504
505     if (warp_start == -1) {
506         return;
507     }
508
509     seqlock_write_lock(&timers_state.vm_clock_seqlock,
510                        &timers_state.vm_clock_lock);
511     if (runstate_is_running()) {
512         int64_t clock = REPLAY_CLOCK_LOCKED(REPLAY_CLOCK_VIRTUAL_RT,
513                                             cpu_get_clock_locked());
514         int64_t warp_delta;
515
516         warp_delta = clock - timers_state.vm_clock_warp_start;
517         if (use_icount == 2) {
518             /*
519              * In adaptive mode, do not let QEMU_CLOCK_VIRTUAL run too
520              * far ahead of real time.
521              */
522             int64_t cur_icount = cpu_get_icount_locked();
523             int64_t delta = clock - cur_icount;
524             warp_delta = MIN(warp_delta, delta);
525         }
526         atomic_set_i64(&timers_state.qemu_icount_bias,
527                        timers_state.qemu_icount_bias + warp_delta);
528     }
529     timers_state.vm_clock_warp_start = -1;
530     seqlock_write_unlock(&timers_state.vm_clock_seqlock,
531                        &timers_state.vm_clock_lock);
532
533     if (qemu_clock_expired(QEMU_CLOCK_VIRTUAL)) {
534         qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
535     }
536 }
537
538 static void icount_timer_cb(void *opaque)
539 {
540     /* No need for a checkpoint because the timer already synchronizes
541      * with CHECKPOINT_CLOCK_VIRTUAL_RT.
542      */
543     icount_warp_rt();
544 }
545
546 void qtest_clock_warp(int64_t dest)
547 {
548     int64_t clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
549     AioContext *aio_context;
550     assert(qtest_enabled());
551     aio_context = qemu_get_aio_context();
552     while (clock < dest) {
553         int64_t deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
554         int64_t warp = qemu_soonest_timeout(dest - clock, deadline);
555
556         seqlock_write_lock(&timers_state.vm_clock_seqlock,
557                            &timers_state.vm_clock_lock);
558         atomic_set_i64(&timers_state.qemu_icount_bias,
559                        timers_state.qemu_icount_bias + warp);
560         seqlock_write_unlock(&timers_state.vm_clock_seqlock,
561                              &timers_state.vm_clock_lock);
562
563         qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL);
564         timerlist_run_timers(aio_context->tlg.tl[QEMU_CLOCK_VIRTUAL]);
565         clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
566     }
567     qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
568 }
569
570 void qemu_start_warp_timer(void)
571 {
572     int64_t clock;
573     int64_t deadline;
574
575     if (!use_icount) {
576         return;
577     }
578
579     /* Nothing to do if the VM is stopped: QEMU_CLOCK_VIRTUAL timers
580      * do not fire, so computing the deadline does not make sense.
581      */
582     if (!runstate_is_running()) {
583         return;
584     }
585
586     if (replay_mode != REPLAY_MODE_PLAY) {
587         if (!all_cpu_threads_idle()) {
588             return;
589         }
590
591         if (qtest_enabled()) {
592             /* When testing, qtest commands advance icount.  */
593             return;
594         }
595
596         replay_checkpoint(CHECKPOINT_CLOCK_WARP_START);
597     } else {
598         /* warp clock deterministically in record/replay mode */
599         if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP_START)) {
600             /* vCPU is sleeping and warp can't be started.
601                It is probably a race condition: the notification sent
602                to the vCPU was processed in advance and the vCPU went to sleep.
603                Therefore we have to wake it up to do something. */
604             if (replay_has_checkpoint()) {
605                 qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
606             }
607             return;
608         }
609     }
610
611     /* We want to use the earliest deadline from ALL vm_clocks */
612     clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT);
613     deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
614     if (deadline < 0) {
615         static bool notified;
616         if (!icount_sleep && !notified) {
617             warn_report("icount sleep disabled and no active timers");
618             notified = true;
619         }
620         return;
621     }
622
623     if (deadline > 0) {
624         /*
625          * Ensure QEMU_CLOCK_VIRTUAL proceeds even when the virtual CPU goes to
626          * sleep.  Otherwise, the CPU might be waiting for a future timer
627          * interrupt to wake it up, but the interrupt never comes because
628          * the vCPU isn't running any insns and thus doesn't advance the
629          * QEMU_CLOCK_VIRTUAL.
630          */
631         if (!icount_sleep) {
632             /*
633              * We never let VCPUs sleep in no sleep icount mode.
634              * If there is a pending QEMU_CLOCK_VIRTUAL timer we just advance
635              * to the next QEMU_CLOCK_VIRTUAL event and notify it.
636              * It is useful when we want a deterministic execution time,
637              * isolated from host latencies.
638              */
639             seqlock_write_lock(&timers_state.vm_clock_seqlock,
640                                &timers_state.vm_clock_lock);
641             atomic_set_i64(&timers_state.qemu_icount_bias,
642                            timers_state.qemu_icount_bias + deadline);
643             seqlock_write_unlock(&timers_state.vm_clock_seqlock,
644                                  &timers_state.vm_clock_lock);
645             qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
646         } else {
647             /*
648              * We do stop VCPUs and only advance QEMU_CLOCK_VIRTUAL after some
649              * "real" time (related to the time left until the next event) has
650              * passed. The QEMU_CLOCK_VIRTUAL_RT clock will do this.
651              * This prevents the warps from being visible externally; for
652              * example, you will not send network packets continuously instead
653              * of every 100ms.
654              */
655             seqlock_write_lock(&timers_state.vm_clock_seqlock,
656                                &timers_state.vm_clock_lock);
657             if (timers_state.vm_clock_warp_start == -1
658                 || timers_state.vm_clock_warp_start > clock) {
659                 timers_state.vm_clock_warp_start = clock;
660             }
661             seqlock_write_unlock(&timers_state.vm_clock_seqlock,
662                                  &timers_state.vm_clock_lock);
663             timer_mod_anticipate(timers_state.icount_warp_timer,
664                                  clock + deadline);
665         }
666     } else if (deadline == 0) {
667         qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
668     }
669 }
670
671 static void qemu_account_warp_timer(void)
672 {
673     if (!use_icount || !icount_sleep) {
674         return;
675     }
676
677     /* Nothing to do if the VM is stopped: QEMU_CLOCK_VIRTUAL timers
678      * do not fire, so computing the deadline does not make sense.
679      */
680     if (!runstate_is_running()) {
681         return;
682     }
683
684     /* warp clock deterministically in record/replay mode */
685     if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP_ACCOUNT)) {
686         return;
687     }
688
689     timer_del(timers_state.icount_warp_timer);
690     icount_warp_rt();
691 }
692
693 static bool icount_state_needed(void *opaque)
694 {
695     return use_icount;
696 }
697
698 static bool warp_timer_state_needed(void *opaque)
699 {
700     TimersState *s = opaque;
701     return s->icount_warp_timer != NULL;
702 }
703
704 static bool adjust_timers_state_needed(void *opaque)
705 {
706     TimersState *s = opaque;
707     return s->icount_rt_timer != NULL;
708 }
709
710 /*
711  * Subsection for warp timer migration is optional, because it may not be created
712  */
713 static const VMStateDescription icount_vmstate_warp_timer = {
714     .name = "timer/icount/warp_timer",
715     .version_id = 1,
716     .minimum_version_id = 1,
717     .needed = warp_timer_state_needed,
718     .fields = (VMStateField[]) {
719         VMSTATE_INT64(vm_clock_warp_start, TimersState),
720         VMSTATE_TIMER_PTR(icount_warp_timer, TimersState),
721         VMSTATE_END_OF_LIST()
722     }
723 };
724
725 static const VMStateDescription icount_vmstate_adjust_timers = {
726     .name = "timer/icount/timers",
727     .version_id = 1,
728     .minimum_version_id = 1,
729     .needed = adjust_timers_state_needed,
730     .fields = (VMStateField[]) {
731         VMSTATE_TIMER_PTR(icount_rt_timer, TimersState),
732         VMSTATE_TIMER_PTR(icount_vm_timer, TimersState),
733         VMSTATE_END_OF_LIST()
734     }
735 };
736
737 /*
738  * This is a subsection for icount migration.
739  */
740 static const VMStateDescription icount_vmstate_timers = {
741     .name = "timer/icount",
742     .version_id = 1,
743     .minimum_version_id = 1,
744     .needed = icount_state_needed,
745     .fields = (VMStateField[]) {
746         VMSTATE_INT64(qemu_icount_bias, TimersState),
747         VMSTATE_INT64(qemu_icount, TimersState),
748         VMSTATE_END_OF_LIST()
749     },
750     .subsections = (const VMStateDescription*[]) {
751         &icount_vmstate_warp_timer,
752         &icount_vmstate_adjust_timers,
753         NULL
754     }
755 };
756
757 static const VMStateDescription vmstate_timers = {
758     .name = "timer",
759     .version_id = 2,
760     .minimum_version_id = 1,
761     .fields = (VMStateField[]) {
762         VMSTATE_INT64(cpu_ticks_offset, TimersState),
763         VMSTATE_UNUSED(8),
764         VMSTATE_INT64_V(cpu_clock_offset, TimersState, 2),
765         VMSTATE_END_OF_LIST()
766     },
767     .subsections = (const VMStateDescription*[]) {
768         &icount_vmstate_timers,
769         NULL
770     }
771 };
772
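/* Throttling: the timer below fires every CPU_THROTTLE_TIMESLICE_NS / (1 - pct)
 * and queues cpu_throttle_thread on every vCPU.  Each vCPU then sleeps for
 * throttle_ratio * CPU_THROTTLE_TIMESLICE_NS, where throttle_ratio is
 * pct / (1 - pct), so the sleeping fraction of each period works out to pct.
 * E.g. at 50% the period is 20 ms and the sleep is 10 ms.
 */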
773 static void cpu_throttle_thread(CPUState *cpu, run_on_cpu_data opaque)
774 {
775     double pct;
776     double throttle_ratio;
777     long sleeptime_ns;
778
779     if (!cpu_throttle_get_percentage()) {
780         return;
781     }
782
783     pct = (double)cpu_throttle_get_percentage()/100;
784     throttle_ratio = pct / (1 - pct);
785     sleeptime_ns = (long)(throttle_ratio * CPU_THROTTLE_TIMESLICE_NS);
786
787     qemu_mutex_unlock_iothread();
788     g_usleep(sleeptime_ns / 1000); /* Convert ns to us for usleep call */
789     qemu_mutex_lock_iothread();
790     atomic_set(&cpu->throttle_thread_scheduled, 0);
791 }
792
793 static void cpu_throttle_timer_tick(void *opaque)
794 {
795     CPUState *cpu;
796     double pct;
797
798     /* Stop the timer if needed */
799     if (!cpu_throttle_get_percentage()) {
800         return;
801     }
802     CPU_FOREACH(cpu) {
803         if (!atomic_xchg(&cpu->throttle_thread_scheduled, 1)) {
804             async_run_on_cpu(cpu, cpu_throttle_thread,
805                              RUN_ON_CPU_NULL);
806         }
807     }
808
809     pct = (double)cpu_throttle_get_percentage()/100;
810     timer_mod(throttle_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT) +
811                                    CPU_THROTTLE_TIMESLICE_NS / (1-pct));
812 }
813
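/* Request a new throttle percentage.  The value is clamped to the
 * [CPU_THROTTLE_PCT_MIN, CPU_THROTTLE_PCT_MAX] range and the periodic
 * throttle timer is (re)armed; use cpu_throttle_stop() to turn throttling
 * off again.
 */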
814 void cpu_throttle_set(int new_throttle_pct)
815 {
816     /* Ensure throttle percentage is within valid range */
817     new_throttle_pct = MIN(new_throttle_pct, CPU_THROTTLE_PCT_MAX);
818     new_throttle_pct = MAX(new_throttle_pct, CPU_THROTTLE_PCT_MIN);
819
820     atomic_set(&throttle_percentage, new_throttle_pct);
821
822     timer_mod(throttle_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT) +
823                                        CPU_THROTTLE_TIMESLICE_NS);
824 }
825
826 void cpu_throttle_stop(void)
827 {
828     atomic_set(&throttle_percentage, 0);
829 }
830
831 bool cpu_throttle_active(void)
832 {
833     return (cpu_throttle_get_percentage() != 0);
834 }
835
836 int cpu_throttle_get_percentage(void)
837 {
838     return atomic_read(&throttle_percentage);
839 }
840
841 void cpu_ticks_init(void)
842 {
843     seqlock_init(&timers_state.vm_clock_seqlock);
844     qemu_spin_init(&timers_state.vm_clock_lock);
845     vmstate_register(NULL, 0, &vmstate_timers, &timers_state);
846     throttle_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL_RT,
847                                            cpu_throttle_timer_tick, NULL);
848 }
849
850 void configure_icount(QemuOpts *opts, Error **errp)
851 {
852     const char *option;
853     char *rem_str = NULL;
854
855     option = qemu_opt_get(opts, "shift");
856     if (!option) {
857         if (qemu_opt_get(opts, "align") != NULL) {
858             error_setg(errp, "Please specify shift option when using align");
859         }
860         return;
861     }
862
863     icount_sleep = qemu_opt_get_bool(opts, "sleep", true);
864     if (icount_sleep) {
865         timers_state.icount_warp_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL_RT,
866                                          icount_timer_cb, NULL);
867     }
868
869     icount_align_option = qemu_opt_get_bool(opts, "align", false);
870
871     if (icount_align_option && !icount_sleep) {
872         error_setg(errp, "align=on and sleep=off are incompatible");
873     }
874     if (strcmp(option, "auto") != 0) {
875         errno = 0;
876         timers_state.icount_time_shift = strtol(option, &rem_str, 0);
877         if (errno != 0 || *rem_str != '\0' || !strlen(option)) {
878             error_setg(errp, "icount: Invalid shift value");
879         }
880         use_icount = 1;
881         return;
882     } else if (icount_align_option) {
883         error_setg(errp, "shift=auto and align=on are incompatible");
884     } else if (!icount_sleep) {
885         error_setg(errp, "shift=auto and sleep=off are incompatible");
886     }
887
888     use_icount = 2;
889
890     /* 125MIPS seems a reasonable initial guess at the guest speed.
891        It will be corrected fairly quickly anyway.  */
892     timers_state.icount_time_shift = 3;
893
894     /* Have both realtime and virtual time triggers for speed adjustment.
895        The realtime trigger catches emulated time passing too slowly,
896        the virtual time trigger catches emulated time passing too fast.
897        Realtime triggers occur even when idle, so use them less frequently
898        than VM triggers.  */
899     timers_state.vm_clock_warp_start = -1;
900     timers_state.icount_rt_timer = timer_new_ms(QEMU_CLOCK_VIRTUAL_RT,
901                                    icount_adjust_rt, NULL);
902     timer_mod(timers_state.icount_rt_timer,
903                    qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT) + 1000);
904     timers_state.icount_vm_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
905                                         icount_adjust_vm, NULL);
906     timer_mod(timers_state.icount_vm_timer,
907                    qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
908                    NANOSECONDS_PER_SECOND / 10);
909 }
910
911 /***********************************************************/
912 /* TCG vCPU kick timer
913  *
914  * The kick timer is responsible for moving single threaded vCPU
915  * emulation on to the next vCPU. If more than one vCPU is running, a
916  * timer event will force a cpu->exit so the next vCPU can get
917  * scheduled.
918  *
919  * The timer is removed if all vCPUs are idle and restarted again once
920  * any of them stops being idle.
921  */
922
923 static QEMUTimer *tcg_kick_vcpu_timer;
924 static CPUState *tcg_current_rr_cpu;
925
926 #define TCG_KICK_PERIOD (NANOSECONDS_PER_SECOND / 10)
927
928 static inline int64_t qemu_tcg_next_kick(void)
929 {
930     return qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + TCG_KICK_PERIOD;
931 }
932
933 /* Kick the currently round-robin scheduled vCPU */
934 static void qemu_cpu_kick_rr_cpu(void)
935 {
936     CPUState *cpu;
937     do {
938         cpu = atomic_mb_read(&tcg_current_rr_cpu);
939         if (cpu) {
940             cpu_exit(cpu);
941         }
942     } while (cpu != atomic_mb_read(&tcg_current_rr_cpu));
943 }
944
945 static void do_nothing(CPUState *cpu, run_on_cpu_data unused)
946 {
947 }
948
949 void qemu_timer_notify_cb(void *opaque, QEMUClockType type)
950 {
951     if (!use_icount || type != QEMU_CLOCK_VIRTUAL) {
952         qemu_notify_event();
953         return;
954     }
955
956     if (qemu_in_vcpu_thread()) {
957         /* A CPU is currently running; kick it back out to the
958          * tcg_cpu_exec() loop so it will recalculate its
959          * icount deadline immediately.
960          */
961         qemu_cpu_kick(current_cpu);
962     } else if (first_cpu) {
963         /* qemu_cpu_kick is not enough to kick a halted CPU out of
964          * qemu_tcg_wait_io_event.  async_run_on_cpu, instead,
965          * causes cpu_thread_is_idle to return false.  This way,
966          * handle_icount_deadline can run.
967          * If we have no CPUs at all for some reason, we don't
968          * need to do anything.
969          */
970         async_run_on_cpu(first_cpu, do_nothing, RUN_ON_CPU_NULL);
971     }
972 }
973
974 static void kick_tcg_thread(void *opaque)
975 {
976     timer_mod(tcg_kick_vcpu_timer, qemu_tcg_next_kick());
977     qemu_cpu_kick_rr_cpu();
978 }
979
980 static void start_tcg_kick_timer(void)
981 {
982     assert(!mttcg_enabled);
983     if (!tcg_kick_vcpu_timer && CPU_NEXT(first_cpu)) {
984         tcg_kick_vcpu_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
985                                            kick_tcg_thread, NULL);
986     }
987     if (tcg_kick_vcpu_timer && !timer_pending(tcg_kick_vcpu_timer)) {
988         timer_mod(tcg_kick_vcpu_timer, qemu_tcg_next_kick());
989     }
990 }
991
992 static void stop_tcg_kick_timer(void)
993 {
994     assert(!mttcg_enabled);
995     if (tcg_kick_vcpu_timer && timer_pending(tcg_kick_vcpu_timer)) {
996         timer_del(tcg_kick_vcpu_timer);
997     }
998 }
999
1000 /***********************************************************/
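/* Fatal emulation error: print the message and the register state of every
 * vCPU to stderr, then abort.
 */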
1001 void hw_error(const char *fmt, ...)
1002 {
1003     va_list ap;
1004     CPUState *cpu;
1005
1006     va_start(ap, fmt);
1007     fprintf(stderr, "qemu: hardware error: ");
1008     vfprintf(stderr, fmt, ap);
1009     fprintf(stderr, "\n");
1010     CPU_FOREACH(cpu) {
1011         fprintf(stderr, "CPU #%d:\n", cpu->cpu_index);
1012         cpu_dump_state(cpu, stderr, fprintf, CPU_DUMP_FPU);
1013     }
1014     va_end(ap);
1015     abort();
1016 }
1017
1018 void cpu_synchronize_all_states(void)
1019 {
1020     CPUState *cpu;
1021
1022     CPU_FOREACH(cpu) {
1023         cpu_synchronize_state(cpu);
1024         /* TODO: move to cpu_synchronize_state() */
1025         if (hvf_enabled()) {
1026             hvf_cpu_synchronize_state(cpu);
1027         }
1028     }
1029 }
1030
1031 void cpu_synchronize_all_post_reset(void)
1032 {
1033     CPUState *cpu;
1034
1035     CPU_FOREACH(cpu) {
1036         cpu_synchronize_post_reset(cpu);
1037         /* TODO: move to cpu_synchronize_post_reset() */
1038         if (hvf_enabled()) {
1039             hvf_cpu_synchronize_post_reset(cpu);
1040         }
1041     }
1042 }
1043
1044 void cpu_synchronize_all_post_init(void)
1045 {
1046     CPUState *cpu;
1047
1048     CPU_FOREACH(cpu) {
1049         cpu_synchronize_post_init(cpu);
1050         /* TODO: move to cpu_synchronize_post_init() */
1051         if (hvf_enabled()) {
1052             hvf_cpu_synchronize_post_init(cpu);
1053         }
1054     }
1055 }
1056
1057 void cpu_synchronize_all_pre_loadvm(void)
1058 {
1059     CPUState *cpu;
1060
1061     CPU_FOREACH(cpu) {
1062         cpu_synchronize_pre_loadvm(cpu);
1063     }
1064 }
1065
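/* Stop the VM: freeze the tick counters, pause all vCPUs, switch the
 * runstate and (optionally) emit the QMP STOP event, then drain and flush
 * all block devices.  Returns the result of bdrv_flush_all().
 */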
1066 static int do_vm_stop(RunState state, bool send_stop)
1067 {
1068     int ret = 0;
1069
1070     if (runstate_is_running()) {
1071         cpu_disable_ticks();
1072         pause_all_vcpus();
1073         runstate_set(state);
1074         vm_state_notify(0, state);
1075         if (send_stop) {
1076             qapi_event_send_stop();
1077         }
1078     }
1079
1080     bdrv_drain_all();
1081     replay_disable_events();
1082     ret = bdrv_flush_all();
1083
1084     return ret;
1085 }
1086
1087 /* Special vm_stop() variant for terminating the process.  Historically clients
1088  * did not expect a QMP STOP event and so we need to retain compatibility.
1089  */
1090 int vm_shutdown(void)
1091 {
1092     return do_vm_stop(RUN_STATE_SHUTDOWN, false);
1093 }
1094
1095 static bool cpu_can_run(CPUState *cpu)
1096 {
1097     if (cpu->stop) {
1098         return false;
1099     }
1100     if (cpu_is_stopped(cpu)) {
1101         return false;
1102     }
1103     return true;
1104 }
1105
1106 static void cpu_handle_guest_debug(CPUState *cpu)
1107 {
1108     gdb_set_stop_cpu(cpu);
1109     qemu_system_debug_request();
1110     cpu->stopped = true;
1111 }
1112
1113 #ifdef CONFIG_LINUX
1114 static void sigbus_reraise(void)
1115 {
1116     sigset_t set;
1117     struct sigaction action;
1118
1119     memset(&action, 0, sizeof(action));
1120     action.sa_handler = SIG_DFL;
1121     if (!sigaction(SIGBUS, &action, NULL)) {
1122         raise(SIGBUS);
1123         sigemptyset(&set);
1124         sigaddset(&set, SIGBUS);
1125         pthread_sigmask(SIG_UNBLOCK, &set, NULL);
1126     }
1127     perror("Failed to re-raise SIGBUS!\n");
1128     abort();
1129 }
1130
1131 static void sigbus_handler(int n, siginfo_t *siginfo, void *ctx)
1132 {
1133     if (siginfo->si_code != BUS_MCEERR_AO && siginfo->si_code != BUS_MCEERR_AR) {
1134         sigbus_reraise();
1135     }
1136
1137     if (current_cpu) {
1138         /* Called asynchronously in VCPU thread.  */
1139         if (kvm_on_sigbus_vcpu(current_cpu, siginfo->si_code, siginfo->si_addr)) {
1140             sigbus_reraise();
1141         }
1142     } else {
1143         /* Called synchronously (via signalfd) in main thread.  */
1144         if (kvm_on_sigbus(siginfo->si_code, siginfo->si_addr)) {
1145             sigbus_reraise();
1146         }
1147     }
1148 }
1149
1150 static void qemu_init_sigbus(void)
1151 {
1152     struct sigaction action;
1153
1154     memset(&action, 0, sizeof(action));
1155     action.sa_flags = SA_SIGINFO;
1156     action.sa_sigaction = sigbus_handler;
1157     sigaction(SIGBUS, &action, NULL);
1158
1159     prctl(PR_MCE_KILL, PR_MCE_KILL_SET, PR_MCE_KILL_EARLY, 0, 0);
1160 }
1161 #else /* !CONFIG_LINUX */
1162 static void qemu_init_sigbus(void)
1163 {
1164 }
1165 #endif /* !CONFIG_LINUX */
1166
1167 static QemuMutex qemu_global_mutex;
1168
1169 static QemuThread io_thread;
1170
1171 /* cpu creation */
1172 static QemuCond qemu_cpu_cond;
1173 /* system init */
1174 static QemuCond qemu_pause_cond;
1175
1176 void qemu_init_cpu_loop(void)
1177 {
1178     qemu_init_sigbus();
1179     qemu_cond_init(&qemu_cpu_cond);
1180     qemu_cond_init(&qemu_pause_cond);
1181     qemu_mutex_init(&qemu_global_mutex);
1182
1183     qemu_thread_get_self(&io_thread);
1184 }
1185
1186 void run_on_cpu(CPUState *cpu, run_on_cpu_func func, run_on_cpu_data data)
1187 {
1188     do_run_on_cpu(cpu, func, data, &qemu_global_mutex);
1189 }
1190
1191 static void qemu_kvm_destroy_vcpu(CPUState *cpu)
1192 {
1193     if (kvm_destroy_vcpu(cpu) < 0) {
1194         error_report("kvm_destroy_vcpu failed");
1195         exit(EXIT_FAILURE);
1196     }
1197 }
1198
1199 static void qemu_tcg_destroy_vcpu(CPUState *cpu)
1200 {
1201 }
1202
1203 static void qemu_cpu_stop(CPUState *cpu, bool exit)
1204 {
1205     g_assert(qemu_cpu_is_self(cpu));
1206     cpu->stop = false;
1207     cpu->stopped = true;
1208     if (exit) {
1209         cpu_exit(cpu);
1210     }
1211     qemu_cond_broadcast(&qemu_pause_cond);
1212 }
1213
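/* Common tail of every vCPU wait loop: acknowledge a pending kick, honour a
 * stop request and run any work queued via run_on_cpu()/async_run_on_cpu().
 */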
1214 static void qemu_wait_io_event_common(CPUState *cpu)
1215 {
1216     atomic_mb_set(&cpu->thread_kicked, false);
1217     if (cpu->stop) {
1218         qemu_cpu_stop(cpu, false);
1219     }
1220     process_queued_cpu_work(cpu);
1221 }
1222
1223 static void qemu_tcg_rr_wait_io_event(void)
1224 {
1225     CPUState *cpu;
1226
1227     while (all_cpu_threads_idle()) {
1228         stop_tcg_kick_timer();
1229         qemu_cond_wait(first_cpu->halt_cond, &qemu_global_mutex);
1230     }
1231
1232     start_tcg_kick_timer();
1233
1234     CPU_FOREACH(cpu) {
1235         qemu_wait_io_event_common(cpu);
1236     }
1237 }
1238
1239 static void qemu_wait_io_event(CPUState *cpu)
1240 {
1241     while (cpu_thread_is_idle(cpu)) {
1242         qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
1243     }
1244
1245 #ifdef _WIN32
1246     /* Eat dummy APC queued by qemu_cpu_kick_thread.  */
1247     if (!tcg_enabled()) {
1248         SleepEx(0, TRUE);
1249     }
1250 #endif
1251     qemu_wait_io_event_common(cpu);
1252 }
1253
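/* KVM vCPU thread: one host thread per vCPU.  It creates the in-kernel vCPU,
 * signals creation, then loops between kvm_cpu_exec() and the I/O-event wait
 * until the vCPU is unplugged.
 */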
1254 static void *qemu_kvm_cpu_thread_fn(void *arg)
1255 {
1256     CPUState *cpu = arg;
1257     int r;
1258
1259     rcu_register_thread();
1260
1261     qemu_mutex_lock_iothread();
1262     qemu_thread_get_self(cpu->thread);
1263     cpu->thread_id = qemu_get_thread_id();
1264     cpu->can_do_io = 1;
1265     current_cpu = cpu;
1266
1267     r = kvm_init_vcpu(cpu);
1268     if (r < 0) {
1269         error_report("kvm_init_vcpu failed: %s", strerror(-r));
1270         exit(1);
1271     }
1272
1273     kvm_init_cpu_signals(cpu);
1274
1275     /* signal CPU creation */
1276     cpu->created = true;
1277     qemu_cond_signal(&qemu_cpu_cond);
1278
1279     do {
1280         if (cpu_can_run(cpu)) {
1281             r = kvm_cpu_exec(cpu);
1282             if (r == EXCP_DEBUG) {
1283                 cpu_handle_guest_debug(cpu);
1284             }
1285         }
1286         qemu_wait_io_event(cpu);
1287     } while (!cpu->unplug || cpu_can_run(cpu));
1288
1289     qemu_kvm_destroy_vcpu(cpu);
1290     cpu->created = false;
1291     qemu_cond_signal(&qemu_cpu_cond);
1292     qemu_mutex_unlock_iothread();
1293     rcu_unregister_thread();
1294     return NULL;
1295 }
1296
1297 static void *qemu_dummy_cpu_thread_fn(void *arg)
1298 {
1299 #ifdef _WIN32
1300     error_report("qtest is not supported under Windows");
1301     exit(1);
1302 #else
1303     CPUState *cpu = arg;
1304     sigset_t waitset;
1305     int r;
1306
1307     rcu_register_thread();
1308
1309     qemu_mutex_lock_iothread();
1310     qemu_thread_get_self(cpu->thread);
1311     cpu->thread_id = qemu_get_thread_id();
1312     cpu->can_do_io = 1;
1313     current_cpu = cpu;
1314
1315     sigemptyset(&waitset);
1316     sigaddset(&waitset, SIG_IPI);
1317
1318     /* signal CPU creation */
1319     cpu->created = true;
1320     qemu_cond_signal(&qemu_cpu_cond);
1321
1322     do {
1323         qemu_mutex_unlock_iothread();
1324         do {
1325             int sig;
1326             r = sigwait(&waitset, &sig);
1327         } while (r == -1 && (errno == EAGAIN || errno == EINTR));
1328         if (r == -1) {
1329             perror("sigwait");
1330             exit(1);
1331         }
1332         qemu_mutex_lock_iothread();
1333         qemu_wait_io_event(cpu);
1334     } while (!cpu->unplug);
1335
1336     qemu_mutex_unlock_iothread();
1337     rcu_unregister_thread();
1338     return NULL;
1339 #endif
1340 }
1341
1342 static int64_t tcg_get_icount_limit(void)
1343 {
1344     int64_t deadline;
1345
1346     if (replay_mode != REPLAY_MODE_PLAY) {
1347         deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
1348
1349         /* Maintain prior (possibly buggy) behaviour where if no deadline
1350          * was set (as there is no QEMU_CLOCK_VIRTUAL timer) or it is more than
1351          * INT32_MAX nanoseconds ahead, we still use INT32_MAX
1352          * nanoseconds.
1353          */
1354         if ((deadline < 0) || (deadline > INT32_MAX)) {
1355             deadline = INT32_MAX;
1356         }
1357
1358         return qemu_icount_round(deadline);
1359     } else {
1360         return replay_get_instructions();
1361     }
1362 }
1363
1364 static void handle_icount_deadline(void)
1365 {
1366     assert(qemu_in_vcpu_thread());
1367     if (use_icount) {
1368         int64_t deadline =
1369             qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
1370
1371         if (deadline == 0) {
1372             /* Wake up other AioContexts.  */
1373             qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
1374             qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL);
1375         }
1376     }
1377 }
1378
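/* Install the instruction budget for the next TCG execution window: the low
 * 16 bits go into the fast-path decrementer (icount_decr.u16.low) and the
 * remainder is kept in icount_extra to be refilled later.
 */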
1379 static void prepare_icount_for_run(CPUState *cpu)
1380 {
1381     if (use_icount) {
1382         int insns_left;
1383
1384         /* These should always be cleared by process_icount_data after
1385          * each vCPU execution. However, u16.high can be raised
1386          * asynchronously by cpu_exit/cpu_interrupt/tcg_handle_interrupt.
1387          */
1388         g_assert(cpu->icount_decr.u16.low == 0);
1389         g_assert(cpu->icount_extra == 0);
1390
1391         cpu->icount_budget = tcg_get_icount_limit();
1392         insns_left = MIN(0xffff, cpu->icount_budget);
1393         cpu->icount_decr.u16.low = insns_left;
1394         cpu->icount_extra = cpu->icount_budget - insns_left;
1395
1396         replay_mutex_lock();
1397     }
1398 }
1399
1400 static void process_icount_data(CPUState *cpu)
1401 {
1402     if (use_icount) {
1403         /* Account for executed instructions */
1404         cpu_update_icount(cpu);
1405
1406         /* Reset the counters */
1407         cpu->icount_decr.u16.low = 0;
1408         cpu->icount_extra = 0;
1409         cpu->icount_budget = 0;
1410
1411         replay_account_executed_instructions();
1412
1413         replay_mutex_unlock();
1414     }
1415 }
1416
1417
1418 static int tcg_cpu_exec(CPUState *cpu)
1419 {
1420     int ret;
1421 #ifdef CONFIG_PROFILER
1422     int64_t ti;
1423 #endif
1424
1425     assert(tcg_enabled());
1426 #ifdef CONFIG_PROFILER
1427     ti = profile_getclock();
1428 #endif
1429     cpu_exec_start(cpu);
1430     ret = cpu_exec(cpu);
1431     cpu_exec_end(cpu);
1432 #ifdef CONFIG_PROFILER
1433     atomic_set(&tcg_ctx->prof.cpu_exec_time,
1434                tcg_ctx->prof.cpu_exec_time + profile_getclock() - ti);
1435 #endif
1436     return ret;
1437 }
1438
1439 /* Destroy any remaining vCPUs which have been unplugged and have
1440  * finished running
1441  */
1442 static void deal_with_unplugged_cpus(void)
1443 {
1444     CPUState *cpu;
1445
1446     CPU_FOREACH(cpu) {
1447         if (cpu->unplug && !cpu_can_run(cpu)) {
1448             qemu_tcg_destroy_vcpu(cpu);
1449             cpu->created = false;
1450             qemu_cond_signal(&qemu_cpu_cond);
1451             break;
1452         }
1453     }
1454 }
1455
1456 /* Single-threaded TCG
1457  *
1458  * In the single-threaded case each vCPU is simulated in turn. If
1459  * there is more than a single vCPU we create a simple timer to kick
1460  * the vCPU and ensure we don't get stuck in a tight loop in one vCPU.
1461  * This is done explicitly rather than relying on side-effects
1462  * elsewhere.
1463  */
1464
1465 static void *qemu_tcg_rr_cpu_thread_fn(void *arg)
1466 {
1467     CPUState *cpu = arg;
1468
1469     assert(tcg_enabled());
1470     rcu_register_thread();
1471     tcg_register_thread();
1472
1473     qemu_mutex_lock_iothread();
1474     qemu_thread_get_self(cpu->thread);
1475
1476     cpu->thread_id = qemu_get_thread_id();
1477     cpu->created = true;
1478     cpu->can_do_io = 1;
1479     qemu_cond_signal(&qemu_cpu_cond);
1480
1481     /* wait for initial kick-off after machine start */
1482     while (first_cpu->stopped) {
1483         qemu_cond_wait(first_cpu->halt_cond, &qemu_global_mutex);
1484
1485         /* process any pending work */
1486         CPU_FOREACH(cpu) {
1487             current_cpu = cpu;
1488             qemu_wait_io_event_common(cpu);
1489         }
1490     }
1491
1492     start_tcg_kick_timer();
1493
1494     cpu = first_cpu;
1495
1496     /* process any pending work */
1497     cpu->exit_request = 1;
1498
1499     while (1) {
1500         qemu_mutex_unlock_iothread();
1501         replay_mutex_lock();
1502         qemu_mutex_lock_iothread();
1503         /* Account partial waits to QEMU_CLOCK_VIRTUAL.  */
1504         qemu_account_warp_timer();
1505
1506         /* Run the timers here.  This is much more efficient than
1507          * waking up the I/O thread and waiting for completion.
1508          */
1509         handle_icount_deadline();
1510
1511         replay_mutex_unlock();
1512
1513         if (!cpu) {
1514             cpu = first_cpu;
1515         }
1516
1517         while (cpu && !cpu->queued_work_first && !cpu->exit_request) {
1518
1519             atomic_mb_set(&tcg_current_rr_cpu, cpu);
1520             current_cpu = cpu;
1521
1522             qemu_clock_enable(QEMU_CLOCK_VIRTUAL,
1523                               (cpu->singlestep_enabled & SSTEP_NOTIMER) == 0);
1524
1525             if (cpu_can_run(cpu)) {
1526                 int r;
1527
1528                 qemu_mutex_unlock_iothread();
1529                 prepare_icount_for_run(cpu);
1530
1531                 r = tcg_cpu_exec(cpu);
1532
1533                 process_icount_data(cpu);
1534                 qemu_mutex_lock_iothread();
1535
1536                 if (r == EXCP_DEBUG) {
1537                     cpu_handle_guest_debug(cpu);
1538                     break;
1539                 } else if (r == EXCP_ATOMIC) {
1540                     qemu_mutex_unlock_iothread();
1541                     cpu_exec_step_atomic(cpu);
1542                     qemu_mutex_lock_iothread();
1543                     break;
1544                 }
1545             } else if (cpu->stop) {
1546                 if (cpu->unplug) {
1547                     cpu = CPU_NEXT(cpu);
1548                 }
1549                 break;
1550             }
1551
1552             cpu = CPU_NEXT(cpu);
1553         } /* while (cpu && !cpu->exit_request).. */
1554
1555         /* Does not need atomic_mb_set because a spurious wakeup is okay.  */
1556         atomic_set(&tcg_current_rr_cpu, NULL);
1557
1558         if (cpu && cpu->exit_request) {
1559             atomic_mb_set(&cpu->exit_request, 0);
1560         }
1561
1562         if (use_icount && all_cpu_threads_idle()) {
1563             /*
1564              * When all cpus are sleeping (e.g in WFI), to avoid a deadlock
1565              * in the main_loop, wake it up in order to start the warp timer.
1566              */
1567             qemu_notify_event();
1568         }
1569
1570         qemu_tcg_rr_wait_io_event();
1571         deal_with_unplugged_cpus();
1572     }
1573
1574     rcu_unregister_thread();
1575     return NULL;
1576 }
1577
1578 static void *qemu_hax_cpu_thread_fn(void *arg)
1579 {
1580     CPUState *cpu = arg;
1581     int r;
1582
1583     rcu_register_thread();
1584     qemu_mutex_lock_iothread();
1585     qemu_thread_get_self(cpu->thread);
1586
1587     cpu->thread_id = qemu_get_thread_id();
1588     cpu->created = true;
1589     cpu->halted = 0;
1590     current_cpu = cpu;
1591
1592     hax_init_vcpu(cpu);
1593     qemu_cond_signal(&qemu_cpu_cond);
1594
1595     do {
1596         if (cpu_can_run(cpu)) {
1597             r = hax_smp_cpu_exec(cpu);
1598             if (r == EXCP_DEBUG) {
1599                 cpu_handle_guest_debug(cpu);
1600             }
1601         }
1602
1603         qemu_wait_io_event(cpu);
1604     } while (!cpu->unplug || cpu_can_run(cpu));
1605     rcu_unregister_thread();
1606     return NULL;
1607 }
1608
1609 /* The HVF-specific vCPU thread function. This one should only run when the host
1610  * CPU supports the VMX "unrestricted guest" feature. */
1611 static void *qemu_hvf_cpu_thread_fn(void *arg)
1612 {
1613     CPUState *cpu = arg;
1614
1615     int r;
1616
1617     assert(hvf_enabled());
1618
1619     rcu_register_thread();
1620
1621     qemu_mutex_lock_iothread();
1622     qemu_thread_get_self(cpu->thread);
1623
1624     cpu->thread_id = qemu_get_thread_id();
1625     cpu->can_do_io = 1;
1626     current_cpu = cpu;
1627
1628     hvf_init_vcpu(cpu);
1629
1630     /* signal CPU creation */
1631     cpu->created = true;
1632     qemu_cond_signal(&qemu_cpu_cond);
1633
1634     do {
1635         if (cpu_can_run(cpu)) {
1636             r = hvf_vcpu_exec(cpu);
1637             if (r == EXCP_DEBUG) {
1638                 cpu_handle_guest_debug(cpu);
1639             }
1640         }
1641         qemu_wait_io_event(cpu);
1642     } while (!cpu->unplug || cpu_can_run(cpu));
1643
1644     hvf_vcpu_destroy(cpu);
1645     cpu->created = false;
1646     qemu_cond_signal(&qemu_cpu_cond);
1647     qemu_mutex_unlock_iothread();
1648     rcu_unregister_thread();
1649     return NULL;
1650 }
1651
1652 static void *qemu_whpx_cpu_thread_fn(void *arg)
1653 {
1654     CPUState *cpu = arg;
1655     int r;
1656
1657     rcu_register_thread();
1658
1659     qemu_mutex_lock_iothread();
1660     qemu_thread_get_self(cpu->thread);
1661     cpu->thread_id = qemu_get_thread_id();
1662     current_cpu = cpu;
1663
1664     r = whpx_init_vcpu(cpu);
1665     if (r < 0) {
1666         fprintf(stderr, "whpx_init_vcpu failed: %s\n", strerror(-r));
1667         exit(1);
1668     }
1669
1670     /* signal CPU creation */
1671     cpu->created = true;
1672     qemu_cond_signal(&qemu_cpu_cond);
1673
1674     do {
1675         if (cpu_can_run(cpu)) {
1676             r = whpx_vcpu_exec(cpu);
1677             if (r == EXCP_DEBUG) {
1678                 cpu_handle_guest_debug(cpu);
1679             }
1680         }
1681         while (cpu_thread_is_idle(cpu)) {
1682             qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
1683         }
1684         qemu_wait_io_event_common(cpu);
1685     } while (!cpu->unplug || cpu_can_run(cpu));
1686
1687     whpx_destroy_vcpu(cpu);
1688     cpu->created = false;
1689     qemu_cond_signal(&qemu_cpu_cond);
1690     qemu_mutex_unlock_iothread();
1691     rcu_unregister_thread();
1692     return NULL;
1693 }
1694
1695 #ifdef _WIN32
1696 static void CALLBACK dummy_apc_func(ULONG_PTR unused)
1697 {
1698 }
1699 #endif
1700
1701 /* Multi-threaded TCG
1702  *
1703  * In the multi-threaded case each vCPU has its own thread. The TLS
1704  * variable current_cpu can be used deep in the code to find the
1705  * current CPUState for a given thread.
1706  */
1707
1708 static void *qemu_tcg_cpu_thread_fn(void *arg)
1709 {
1710     CPUState *cpu = arg;
1711
1712     assert(tcg_enabled());
1713     g_assert(!use_icount);
1714
1715     rcu_register_thread();
1716     tcg_register_thread();
1717
1718     qemu_mutex_lock_iothread();
1719     qemu_thread_get_self(cpu->thread);
1720
1721     cpu->thread_id = qemu_get_thread_id();
1722     cpu->created = true;
1723     cpu->can_do_io = 1;
1724     current_cpu = cpu;
1725     qemu_cond_signal(&qemu_cpu_cond);
1726
1727     /* process any pending work */
1728     cpu->exit_request = 1;
1729
1730     do {
1731         if (cpu_can_run(cpu)) {
1732             int r;
1733             qemu_mutex_unlock_iothread();
1734             r = tcg_cpu_exec(cpu);
1735             qemu_mutex_lock_iothread();
1736             switch (r) {
1737             case EXCP_DEBUG:
1738                 cpu_handle_guest_debug(cpu);
1739                 break;
1740             case EXCP_HALTED:
1741                 /* During start-up the vCPU is reset and the thread is
1742                  * kicked several times. If we don't ensure we go back
1743                  * to sleep in the halted state we won't cleanly
1744                  * start up when the vCPU is enabled.
1745                  *
1746                  * cpu->halted should ensure we sleep in wait_io_event
1747                  */
1748                 g_assert(cpu->halted);
1749                 break;
1750             case EXCP_ATOMIC:
1751                 qemu_mutex_unlock_iothread();
1752                 cpu_exec_step_atomic(cpu);
1753                 qemu_mutex_lock_iothread();
1754             default:
1755                 /* Ignore everything else? */
1756                 break;
1757             }
1758         }
1759
1760         atomic_mb_set(&cpu->exit_request, 0);
1761         qemu_wait_io_event(cpu);
1762     } while (!cpu->unplug || cpu_can_run(cpu));
1763
1764     qemu_tcg_destroy_vcpu(cpu);
1765     cpu->created = false;
1766     qemu_cond_signal(&qemu_cpu_cond);
1767     qemu_mutex_unlock_iothread();
1768     rcu_unregister_thread();
1769     return NULL;
1770 }
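/*
 * Illustrative sketch (not part of this file): the current_cpu TLS variable
 * set in the thread functions above lets code deep in the call stack recover
 * the vCPU that is executing it without passing a CPUState through every
 * interface, e.g.:
 *
 *     static void hypothetical_mmio_write(hwaddr addr, uint64_t val)
 *     {
 *         CPUState *cs = current_cpu;   // thread-local; NULL outside vCPU threads
 *
 *         if (cs) {
 *             qemu_log("write by cpu %d\n", cs->cpu_index);
 *         }
 *     }
 *
 * hypothetical_mmio_write is invented purely for illustration.
 */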
1771
1772 static void qemu_cpu_kick_thread(CPUState *cpu)
1773 {
1774 #ifndef _WIN32
1775     int err;
1776
1777     if (cpu->thread_kicked) {
1778         return;
1779     }
1780     cpu->thread_kicked = true;
1781     err = pthread_kill(cpu->thread->thread, SIG_IPI);
1782     if (err && err != ESRCH) {
1783         fprintf(stderr, "qemu:%s: %s\n", __func__, strerror(err));
1784         exit(1);
1785     }
1786 #else /* _WIN32 */
1787     if (!qemu_cpu_is_self(cpu)) {
1788         if (whpx_enabled()) {
1789             whpx_vcpu_kick(cpu);
1790         } else if (!QueueUserAPC(dummy_apc_func, cpu->hThread, 0)) {
1791             fprintf(stderr, "%s: QueueUserAPC failed with error %lu\n",
1792                     __func__, GetLastError());
1793             exit(1);
1794         }
1795     }
1796 #endif
1797 }
1798
1799 void qemu_cpu_kick(CPUState *cpu)
1800 {
1801     qemu_cond_broadcast(cpu->halt_cond);
1802     if (tcg_enabled()) {
1803         cpu_exit(cpu);
1804         /* NOP unless doing single-thread RR */
1805         qemu_cpu_kick_rr_cpu();
1806     } else {
1807         if (hax_enabled()) {
1808             /*
1809              * FIXME: race condition with the exit_request check in
1810              * hax_vcpu_hax_exec
1811              */
1812             cpu->exit_request = 1;
1813         }
1814         qemu_cpu_kick_thread(cpu);
1815     }
1816 }
1817
1818 void qemu_cpu_kick_self(void)
1819 {
1820     assert(current_cpu);
1821     qemu_cpu_kick_thread(current_cpu);
1822 }
1823
1824 bool qemu_cpu_is_self(CPUState *cpu)
1825 {
1826     return qemu_thread_is_self(cpu->thread);
1827 }
1828
1829 bool qemu_in_vcpu_thread(void)
1830 {
1831     return current_cpu && qemu_cpu_is_self(current_cpu);
1832 }
1833
1834 static __thread bool iothread_locked = false;
1835
1836 bool qemu_mutex_iothread_locked(void)
1837 {
1838     return iothread_locked;
1839 }
1840
1841 /*
1842  * The BQL is taken from so many places that it is worth profiling the
1843  * callers directly, instead of funneling them all through a single function.
1844  */
1845 void qemu_mutex_lock_iothread_impl(const char *file, int line)
1846 {
1847     QemuMutexLockFunc bql_lock = atomic_read(&qemu_bql_mutex_lock_func);
1848
1849     g_assert(!qemu_mutex_iothread_locked());
1850     bql_lock(&qemu_global_mutex, file, line);
1851     iothread_locked = true;
1852 }
1853
1854 void qemu_mutex_unlock_iothread(void)
1855 {
1856     g_assert(qemu_mutex_iothread_locked());
1857     iothread_locked = false;
1858     qemu_mutex_unlock(&qemu_global_mutex);
1859 }
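/*
 * Illustrative usage sketch (not part of this file): code that can be reached
 * both with and without the BQL held typically checks
 * qemu_mutex_iothread_locked() so the lock is taken only when needed and is
 * never taken recursively:
 *
 *     bool locked = qemu_mutex_iothread_locked();
 *
 *     if (!locked) {
 *         qemu_mutex_lock_iothread();
 *     }
 *     // ... access state protected by the BQL ...
 *     if (!locked) {
 *         qemu_mutex_unlock_iothread();
 *     }
 */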
1860
1861 static bool all_vcpus_paused(void)
1862 {
1863     CPUState *cpu;
1864
1865     CPU_FOREACH(cpu) {
1866         if (!cpu->stopped) {
1867             return false;
1868         }
1869     }
1870
1871     return true;
1872 }
1873
1874 void pause_all_vcpus(void)
1875 {
1876     CPUState *cpu;
1877
1878     qemu_clock_enable(QEMU_CLOCK_VIRTUAL, false);
1879     CPU_FOREACH(cpu) {
1880         if (qemu_cpu_is_self(cpu)) {
1881             qemu_cpu_stop(cpu, true);
1882         } else {
1883             cpu->stop = true;
1884             qemu_cpu_kick(cpu);
1885         }
1886     }
1887
1888     /* We need to drop the replay_lock so any vCPU threads woken up
1889      * can finish their replay tasks
1890      */
1891     replay_mutex_unlock();
1892
1893     while (!all_vcpus_paused()) {
1894         qemu_cond_wait(&qemu_pause_cond, &qemu_global_mutex);
1895         CPU_FOREACH(cpu) {
1896             qemu_cpu_kick(cpu);
1897         }
1898     }
1899
1900     qemu_mutex_unlock_iothread();
1901     replay_mutex_lock();
1902     qemu_mutex_lock_iothread();
1903 }
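/*
 * Illustrative sketch (not part of this file): callers that must change
 * global state without racing against running guest CPUs typically bracket
 * the update with pause_all_vcpus()/resume_all_vcpus(), called with the BQL
 * held:
 *
 *     pause_all_vcpus();
 *     // ... perform the update the vCPUs must not observe mid-change ...
 *     resume_all_vcpus();
 */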
1904
1905 void cpu_resume(CPUState *cpu)
1906 {
1907     cpu->stop = false;
1908     cpu->stopped = false;
1909     qemu_cpu_kick(cpu);
1910 }
1911
1912 void resume_all_vcpus(void)
1913 {
1914     CPUState *cpu;
1915
1916     qemu_clock_enable(QEMU_CLOCK_VIRTUAL, true);
1917     CPU_FOREACH(cpu) {
1918         cpu_resume(cpu);
1919     }
1920 }
1921
1922 void cpu_remove_sync(CPUState *cpu)
1923 {
1924     cpu->stop = true;
1925     cpu->unplug = true;
1926     qemu_cpu_kick(cpu);
1927     qemu_mutex_unlock_iothread();
1928     qemu_thread_join(cpu->thread);
1929     qemu_mutex_lock_iothread();
1930 }
1931
1932 /* Size of the temporary buffers used to form a vCPU thread name */
1933 #define VCPU_THREAD_NAME_SIZE 16
1934
1935 static void qemu_tcg_init_vcpu(CPUState *cpu)
1936 {
1937     char thread_name[VCPU_THREAD_NAME_SIZE];
1938     static QemuCond *single_tcg_halt_cond;
1939     static QemuThread *single_tcg_cpu_thread;
1940     static int tcg_region_inited;
1941
1942     assert(tcg_enabled());
1943     /*
1944      * Initialize TCG regions--once. Now is a good time, because:
1945      * (1) TCG's init context, prologue and target globals have been set up.
1946      * (2) qemu_tcg_mttcg_enabled() works now (TCG init code runs before the
1947      *     -accel flag is processed, so the check doesn't work then).
1948      */
1949     if (!tcg_region_inited) {
1950         tcg_region_inited = 1;
1951         tcg_region_init();
1952     }
1953
1954     if (qemu_tcg_mttcg_enabled() || !single_tcg_cpu_thread) {
1955         cpu->thread = g_malloc0(sizeof(QemuThread));
1956         cpu->halt_cond = g_malloc0(sizeof(QemuCond));
1957         qemu_cond_init(cpu->halt_cond);
1958
1959         if (qemu_tcg_mttcg_enabled()) {
1960             /* create a thread per vCPU with TCG (MTTCG) */
1961             parallel_cpus = true;
1962             snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/TCG",
1963                  cpu->cpu_index);
1964
1965             qemu_thread_create(cpu->thread, thread_name, qemu_tcg_cpu_thread_fn,
1966                                cpu, QEMU_THREAD_JOINABLE);
1967
1968         } else {
1969             /* share a single thread for all cpus with TCG */
1970             snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "ALL CPUs/TCG");
1971             qemu_thread_create(cpu->thread, thread_name,
1972                                qemu_tcg_rr_cpu_thread_fn,
1973                                cpu, QEMU_THREAD_JOINABLE);
1974
1975             single_tcg_halt_cond = cpu->halt_cond;
1976             single_tcg_cpu_thread = cpu->thread;
1977         }
1978 #ifdef _WIN32
1979         cpu->hThread = qemu_thread_get_handle(cpu->thread);
1980 #endif
1981     } else {
1982         /* For non-MTTCG cases we share the thread */
1983         cpu->thread = single_tcg_cpu_thread;
1984         cpu->halt_cond = single_tcg_halt_cond;
1985         cpu->thread_id = first_cpu->thread_id;
1986         cpu->can_do_io = 1;
1987         cpu->created = true;
1988     }
1989 }
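/*
 * Note (illustrative, not part of this file): which branch above is taken is
 * decided by qemu_tcg_mttcg_enabled(), i.e. by the TCG accelerator's "thread"
 * property.  As an example of the command-line spelling:
 *
 *     qemu-system-x86_64 -accel tcg,thread=multi    # one thread per vCPU (MTTCG)
 *     qemu-system-x86_64 -accel tcg,thread=single   # one shared round-robin thread
 */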
1990
1991 static void qemu_hax_start_vcpu(CPUState *cpu)
1992 {
1993     char thread_name[VCPU_THREAD_NAME_SIZE];
1994
1995     cpu->thread = g_malloc0(sizeof(QemuThread));
1996     cpu->halt_cond = g_malloc0(sizeof(QemuCond));
1997     qemu_cond_init(cpu->halt_cond);
1998
1999     snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/HAX",
2000              cpu->cpu_index);
2001     qemu_thread_create(cpu->thread, thread_name, qemu_hax_cpu_thread_fn,
2002                        cpu, QEMU_THREAD_JOINABLE);
2003 #ifdef _WIN32
2004     cpu->hThread = qemu_thread_get_handle(cpu->thread);
2005 #endif
2006 }
2007
2008 static void qemu_kvm_start_vcpu(CPUState *cpu)
2009 {
2010     char thread_name[VCPU_THREAD_NAME_SIZE];
2011
2012     cpu->thread = g_malloc0(sizeof(QemuThread));
2013     cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2014     qemu_cond_init(cpu->halt_cond);
2015     snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/KVM",
2016              cpu->cpu_index);
2017     qemu_thread_create(cpu->thread, thread_name, qemu_kvm_cpu_thread_fn,
2018                        cpu, QEMU_THREAD_JOINABLE);
2019 }
2020
2021 static void qemu_hvf_start_vcpu(CPUState *cpu)
2022 {
2023     char thread_name[VCPU_THREAD_NAME_SIZE];
2024
2025     /* HVF currently does not support TCG, and only runs in
2026      * unrestricted-guest mode. */
2027     assert(hvf_enabled());
2028
2029     cpu->thread = g_malloc0(sizeof(QemuThread));
2030     cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2031     qemu_cond_init(cpu->halt_cond);
2032
2033     snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/HVF",
2034              cpu->cpu_index);
2035     qemu_thread_create(cpu->thread, thread_name, qemu_hvf_cpu_thread_fn,
2036                        cpu, QEMU_THREAD_JOINABLE);
2037 }
2038
2039 static void qemu_whpx_start_vcpu(CPUState *cpu)
2040 {
2041     char thread_name[VCPU_THREAD_NAME_SIZE];
2042
2043     cpu->thread = g_malloc0(sizeof(QemuThread));
2044     cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2045     qemu_cond_init(cpu->halt_cond);
2046     snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/WHPX",
2047              cpu->cpu_index);
2048     qemu_thread_create(cpu->thread, thread_name, qemu_whpx_cpu_thread_fn,
2049                        cpu, QEMU_THREAD_JOINABLE);
2050 #ifdef _WIN32
2051     cpu->hThread = qemu_thread_get_handle(cpu->thread);
2052 #endif
2053 }
2054
2055 static void qemu_dummy_start_vcpu(CPUState *cpu)
2056 {
2057     char thread_name[VCPU_THREAD_NAME_SIZE];
2058
2059     cpu->thread = g_malloc0(sizeof(QemuThread));
2060     cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2061     qemu_cond_init(cpu->halt_cond);
2062     snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/DUMMY",
2063              cpu->cpu_index);
2064     qemu_thread_create(cpu->thread, thread_name, qemu_dummy_cpu_thread_fn, cpu,
2065                        QEMU_THREAD_JOINABLE);
2066 }
2067
2068 void qemu_init_vcpu(CPUState *cpu)
2069 {
2070     cpu->nr_cores = smp_cores;
2071     cpu->nr_threads = smp_threads;
2072     cpu->stopped = true;
2073
2074     if (!cpu->as) {
2075         /* If the target cpu hasn't set up any address spaces itself,
2076          * give it the default one.
2077          */
2078         cpu->num_ases = 1;
2079         cpu_address_space_init(cpu, 0, "cpu-memory", cpu->memory);
2080     }
2081
2082     if (kvm_enabled()) {
2083         qemu_kvm_start_vcpu(cpu);
2084     } else if (hax_enabled()) {
2085         qemu_hax_start_vcpu(cpu);
2086     } else if (hvf_enabled()) {
2087         qemu_hvf_start_vcpu(cpu);
2088     } else if (tcg_enabled()) {
2089         qemu_tcg_init_vcpu(cpu);
2090     } else if (whpx_enabled()) {
2091         qemu_whpx_start_vcpu(cpu);
2092     } else {
2093         qemu_dummy_start_vcpu(cpu);
2094     }
2095
2096     while (!cpu->created) {
2097         qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
2098     }
2099 }
2100
2101 void cpu_stop_current(void)
2102 {
2103     if (current_cpu) {
2104         current_cpu->stop = true;
2105         cpu_exit(current_cpu);
2106     }
2107 }
2108
2109 int vm_stop(RunState state)
2110 {
2111     if (qemu_in_vcpu_thread()) {
2112         qemu_system_vmstop_request_prepare();
2113         qemu_system_vmstop_request(state);
2114         /*
2115          * FIXME: should not return to device code in case
2116          * vm_stop() has been requested.
2117          */
2118         cpu_stop_current();
2119         return 0;
2120     }
2121
2122     return do_vm_stop(state, true);
2123 }
2124
2125 /**
2126  * Prepare for (re)starting the VM.
2127  * Returns -1 if the vCPUs are not to be restarted (e.g. if they are already
2128  * running or in case of an error condition), 0 otherwise.
2129  */
2130 int vm_prepare_start(void)
2131 {
2132     RunState requested;
2133
2134     qemu_vmstop_requested(&requested);
2135     if (runstate_is_running() && requested == RUN_STATE__MAX) {
2136         return -1;
2137     }
2138
2139     /* Ensure that a STOP/RESUME pair of events is emitted if a
2140      * vmstop request was pending.  The BLOCK_IO_ERROR event, for
2141      * example, is documented as always being followed by the STOP
2142      * event.
2143      */
2144     if (runstate_is_running()) {
2145         qapi_event_send_stop();
2146         qapi_event_send_resume();
2147         return -1;
2148     }
2149
2150     /* We are sending this now, but the CPUs will be resumed shortly afterwards */
2151     qapi_event_send_resume();
2152
2153     replay_enable_events();
2154     cpu_enable_ticks();
2155     runstate_set(RUN_STATE_RUNNING);
2156     vm_state_notify(1, RUN_STATE_RUNNING);
2157     return 0;
2158 }
2159
2160 void vm_start(void)
2161 {
2162     if (!vm_prepare_start()) {
2163         resume_all_vcpus();
2164     }
2165 }
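/*
 * Illustrative note (not part of this file): the monitor's "stop" and "cont"
 * commands are, roughly, thin wrappers around the two functions above:
 *
 *     void qmp_stop(Error **errp) { ... vm_stop(RUN_STATE_PAUSED); ... }
 *     void qmp_cont(Error **errp) { ... vm_start(); ... }
 *
 * The elided parts deal with migration and block-device corner cases; see the
 * actual implementations elsewhere in the tree.
 */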
2166
2167 /* does a state transition even if the VM is already stopped;
2168    the current state is forgotten forever */
2169 int vm_stop_force_state(RunState state)
2170 {
2171     if (runstate_is_running()) {
2172         return vm_stop(state);
2173     } else {
2174         runstate_set(state);
2175
2176         bdrv_drain_all();
2177         /* Make sure to return an error if the flush in a previous vm_stop()
2178          * failed. */
2179         return bdrv_flush_all();
2180     }
2181 }
2182
2183 void list_cpus(FILE *f, fprintf_function cpu_fprintf, const char *optarg)
2184 {
2185     /* XXX: implement xxx_cpu_list for targets that still lack it */
2186 #if defined(cpu_list)
2187     cpu_list(f, cpu_fprintf);
2188 #endif
2189 }
2190
2191 CpuInfoList *qmp_query_cpus(Error **errp)
2192 {
2193     MachineState *ms = MACHINE(qdev_get_machine());
2194     MachineClass *mc = MACHINE_GET_CLASS(ms);
2195     CpuInfoList *head = NULL, *cur_item = NULL;
2196     CPUState *cpu;
2197
2198     CPU_FOREACH(cpu) {
2199         CpuInfoList *info;
2200 #if defined(TARGET_I386)
2201         X86CPU *x86_cpu = X86_CPU(cpu);
2202         CPUX86State *env = &x86_cpu->env;
2203 #elif defined(TARGET_PPC)
2204         PowerPCCPU *ppc_cpu = POWERPC_CPU(cpu);
2205         CPUPPCState *env = &ppc_cpu->env;
2206 #elif defined(TARGET_SPARC)
2207         SPARCCPU *sparc_cpu = SPARC_CPU(cpu);
2208         CPUSPARCState *env = &sparc_cpu->env;
2209 #elif defined(TARGET_RISCV)
2210         RISCVCPU *riscv_cpu = RISCV_CPU(cpu);
2211         CPURISCVState *env = &riscv_cpu->env;
2212 #elif defined(TARGET_MIPS)
2213         MIPSCPU *mips_cpu = MIPS_CPU(cpu);
2214         CPUMIPSState *env = &mips_cpu->env;
2215 #elif defined(TARGET_TRICORE)
2216         TriCoreCPU *tricore_cpu = TRICORE_CPU(cpu);
2217         CPUTriCoreState *env = &tricore_cpu->env;
2218 #elif defined(TARGET_S390X)
2219         S390CPU *s390_cpu = S390_CPU(cpu);
2220         CPUS390XState *env = &s390_cpu->env;
2221 #endif
2222
2223         cpu_synchronize_state(cpu);
2224
2225         info = g_malloc0(sizeof(*info));
2226         info->value = g_malloc0(sizeof(*info->value));
2227         info->value->CPU = cpu->cpu_index;
2228         info->value->current = (cpu == first_cpu);
2229         info->value->halted = cpu->halted;
2230         info->value->qom_path = object_get_canonical_path(OBJECT(cpu));
2231         info->value->thread_id = cpu->thread_id;
2232 #if defined(TARGET_I386)
2233         info->value->arch = CPU_INFO_ARCH_X86;
2234         info->value->u.x86.pc = env->eip + env->segs[R_CS].base;
2235 #elif defined(TARGET_PPC)
2236         info->value->arch = CPU_INFO_ARCH_PPC;
2237         info->value->u.ppc.nip = env->nip;
2238 #elif defined(TARGET_SPARC)
2239         info->value->arch = CPU_INFO_ARCH_SPARC;
2240         info->value->u.q_sparc.pc = env->pc;
2241         info->value->u.q_sparc.npc = env->npc;
2242 #elif defined(TARGET_MIPS)
2243         info->value->arch = CPU_INFO_ARCH_MIPS;
2244         info->value->u.q_mips.PC = env->active_tc.PC;
2245 #elif defined(TARGET_TRICORE)
2246         info->value->arch = CPU_INFO_ARCH_TRICORE;
2247         info->value->u.tricore.PC = env->PC;
2248 #elif defined(TARGET_S390X)
2249         info->value->arch = CPU_INFO_ARCH_S390;
2250         info->value->u.s390.cpu_state = env->cpu_state;
2251 #elif defined(TARGET_RISCV)
2252         info->value->arch = CPU_INFO_ARCH_RISCV;
2253         info->value->u.riscv.pc = env->pc;
2254 #else
2255         info->value->arch = CPU_INFO_ARCH_OTHER;
2256 #endif
2257         info->value->has_props = !!mc->cpu_index_to_instance_props;
2258         if (info->value->has_props) {
2259             CpuInstanceProperties *props;
2260             props = g_malloc0(sizeof(*props));
2261             *props = mc->cpu_index_to_instance_props(ms, cpu->cpu_index);
2262             info->value->props = props;
2263         }
2264
2265         /* XXX: waiting for the qapi to support GSList */
2266         if (!cur_item) {
2267             head = cur_item = info;
2268         } else {
2269             cur_item->next = info;
2270             cur_item = info;
2271         }
2272     }
2273
2274     return head;
2275 }
2276
2277 static CpuInfoArch sysemu_target_to_cpuinfo_arch(SysEmuTarget target)
2278 {
2279     /*
2280      * The @SysEmuTarget -> @CpuInfoArch mapping below is based on the
2281      * TARGET_ARCH -> TARGET_BASE_ARCH mapping in the "configure" script.
2282      */
2283     switch (target) {
2284     case SYS_EMU_TARGET_I386:
2285     case SYS_EMU_TARGET_X86_64:
2286         return CPU_INFO_ARCH_X86;
2287
2288     case SYS_EMU_TARGET_PPC:
2289     case SYS_EMU_TARGET_PPC64:
2290         return CPU_INFO_ARCH_PPC;
2291
2292     case SYS_EMU_TARGET_SPARC:
2293     case SYS_EMU_TARGET_SPARC64:
2294         return CPU_INFO_ARCH_SPARC;
2295
2296     case SYS_EMU_TARGET_MIPS:
2297     case SYS_EMU_TARGET_MIPSEL:
2298     case SYS_EMU_TARGET_MIPS64:
2299     case SYS_EMU_TARGET_MIPS64EL:
2300         return CPU_INFO_ARCH_MIPS;
2301
2302     case SYS_EMU_TARGET_TRICORE:
2303         return CPU_INFO_ARCH_TRICORE;
2304
2305     case SYS_EMU_TARGET_S390X:
2306         return CPU_INFO_ARCH_S390;
2307
2308     case SYS_EMU_TARGET_RISCV32:
2309     case SYS_EMU_TARGET_RISCV64:
2310         return CPU_INFO_ARCH_RISCV;
2311
2312     default:
2313         return CPU_INFO_ARCH_OTHER;
2314     }
2315 }
2316
2317 static void cpustate_to_cpuinfo_s390(CpuInfoS390 *info, const CPUState *cpu)
2318 {
2319 #ifdef TARGET_S390X
2320     S390CPU *s390_cpu = S390_CPU(cpu);
2321     CPUS390XState *env = &s390_cpu->env;
2322
2323     info->cpu_state = env->cpu_state;
2324 #else
2325     abort();
2326 #endif
2327 }
2328
2329 /*
2330  * fast means: we NEVER interrupt vCPU threads to retrieve
2331  * information from KVM.
2332  */
2333 CpuInfoFastList *qmp_query_cpus_fast(Error **errp)
2334 {
2335     MachineState *ms = MACHINE(qdev_get_machine());
2336     MachineClass *mc = MACHINE_GET_CLASS(ms);
2337     CpuInfoFastList *head = NULL, *cur_item = NULL;
2338     SysEmuTarget target = qapi_enum_parse(&SysEmuTarget_lookup, TARGET_NAME,
2339                                           -1, &error_abort);
2340     CPUState *cpu;
2341
2342     CPU_FOREACH(cpu) {
2343         CpuInfoFastList *info = g_malloc0(sizeof(*info));
2344         info->value = g_malloc0(sizeof(*info->value));
2345
2346         info->value->cpu_index = cpu->cpu_index;
2347         info->value->qom_path = object_get_canonical_path(OBJECT(cpu));
2348         info->value->thread_id = cpu->thread_id;
2349
2350         info->value->has_props = !!mc->cpu_index_to_instance_props;
2351         if (info->value->has_props) {
2352             CpuInstanceProperties *props;
2353             props = g_malloc0(sizeof(*props));
2354             *props = mc->cpu_index_to_instance_props(ms, cpu->cpu_index);
2355             info->value->props = props;
2356         }
2357
2358         info->value->arch = sysemu_target_to_cpuinfo_arch(target);
2359         info->value->target = target;
2360         if (target == SYS_EMU_TARGET_S390X) {
2361             cpustate_to_cpuinfo_s390(&info->value->u.s390x, cpu);
2362         }
2363
2364         if (!cur_item) {
2365             head = cur_item = info;
2366         } else {
2367             cur_item->next = info;
2368             cur_item = info;
2369         }
2370     }
2371
2372     return head;
2373 }
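/*
 * Illustrative QMP exchange (concrete values invented for the example):
 *
 *     -> { "execute": "query-cpus-fast" }
 *     <- { "return": [
 *            { "cpu-index": 0,
 *              "qom-path": "/machine/unattached/device[0]",
 *              "thread-id": 25627,
 *              "arch": "x86",
 *              "target": "x86_64",
 *              "props": { "core-id": 0, "socket-id": 0, "thread-id": 0 } }
 *          ] }
 *
 * The member names mirror the fields filled in above; "props" is only present
 * when the machine class implements cpu_index_to_instance_props().
 */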
2374
2375 void qmp_memsave(int64_t addr, int64_t size, const char *filename,
2376                  bool has_cpu, int64_t cpu_index, Error **errp)
2377 {
2378     FILE *f;
2379     uint32_t l;
2380     CPUState *cpu;
2381     uint8_t buf[1024];
2382     int64_t orig_addr = addr, orig_size = size;
2383
2384     if (!has_cpu) {
2385         cpu_index = 0;
2386     }
2387
2388     cpu = qemu_get_cpu(cpu_index);
2389     if (cpu == NULL) {
2390         error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cpu-index",
2391                    "a CPU number");
2392         return;
2393     }
2394
2395     f = fopen(filename, "wb");
2396     if (!f) {
2397         error_setg_file_open(errp, errno, filename);
2398         return;
2399     }
2400
2401     while (size != 0) {
2402         l = sizeof(buf);
2403         if (l > size)
2404             l = size;
2405         if (cpu_memory_rw_debug(cpu, addr, buf, l, 0) != 0) {
2406             error_setg(errp, "Invalid addr 0x%016" PRIx64 "/size %" PRId64
2407                              " specified", orig_addr, orig_size);
2408             goto exit;
2409         }
2410         if (fwrite(buf, 1, l, f) != l) {
2411             error_setg(errp, QERR_IO_ERROR);
2412             goto exit;
2413         }
2414         addr += l;
2415         size -= l;
2416     }
2417
2418 exit:
2419     fclose(f);
2420 }
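/*
 * Illustrative QMP usage (concrete values invented for the example):
 *
 *     -> { "execute": "memsave",
 *          "arguments": { "val": 4096,
 *                         "size": 1024,
 *                         "filename": "/tmp/virtual-mem-dump" } }
 *     <- { "return": {} }
 *
 * "val" is the starting guest virtual address ("addr" in the C signature
 * above); the optional "cpu-index" argument selects whose address space is
 * read and, as the code shows, defaults to CPU 0.
 */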
2421
2422 void qmp_pmemsave(int64_t addr, int64_t size, const char *filename,
2423                   Error **errp)
2424 {
2425     FILE *f;
2426     uint32_t l;
2427     uint8_t buf[1024];
2428
2429     f = fopen(filename, "wb");
2430     if (!f) {
2431         error_setg_file_open(errp, errno, filename);
2432         return;
2433     }
2434
2435     while (size != 0) {
2436         l = sizeof(buf);
2437         if (l > size)
2438             l = size;
2439         cpu_physical_memory_read(addr, buf, l);
2440         if (fwrite(buf, 1, l, f) != l) {
2441             error_setg(errp, QERR_IO_ERROR);
2442             goto exit;
2443         }
2444         addr += l;
2445         size -= l;
2446     }
2447
2448 exit:
2449     fclose(f);
2450 }
2451
2452 void qmp_inject_nmi(Error **errp)
2453 {
2454     nmi_monitor_handle(monitor_get_cpu_index(), errp);
2455 }
2456
2457 void dump_drift_info(FILE *f, fprintf_function cpu_fprintf)
2458 {
2459     if (!use_icount) {
2460         return;
2461     }
2462
2463     cpu_fprintf(f, "Host - Guest clock  %"PRIi64" ms\n",
2464                 (cpu_get_clock() - cpu_get_icount())/SCALE_MS);
2465     if (icount_align_option) {
2466         cpu_fprintf(f, "Max guest delay     %"PRIi64" ms\n", -max_delay/SCALE_MS);
2467         cpu_fprintf(f, "Max guest advance   %"PRIi64" ms\n", max_advance/SCALE_MS);
2468     } else {
2469         cpu_fprintf(f, "Max guest delay     NA\n");
2470         cpu_fprintf(f, "Max guest advance   NA\n");
2471     }
2472 }