KVM: x86/mmu: Apply max PA check for MMIO sptes to 32-bit KVM
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 7ffc224..21fb707 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -53,6 +53,7 @@
 #include <linux/pvclock_gtod.h>
 #include <linux/kvm_irqfd.h>
 #include <linux/irqbypass.h>
+#include <linux/nospec.h>
 #include <trace/events/kvm.h>
 
 #define CREATE_TRACE_POINTS
@@ -199,7 +200,18 @@ static void kvm_on_user_return(struct user_return_notifier *urn)
        struct kvm_shared_msrs *locals
                = container_of(urn, struct kvm_shared_msrs, urn);
        struct kvm_shared_msr_values *values;
+       unsigned long flags;
 
+       /*
+        * Disabling irqs at this point since the following code could be
+        * interrupted and executed through kvm_arch_hardware_disable()
+        */
+       local_irq_save(flags);
+       if (locals->registered) {
+               locals->registered = false;
+               user_return_notifier_unregister(urn);
+       }
+       local_irq_restore(flags);
        for (slot = 0; slot < shared_msrs_global.nr; ++slot) {
                values = &locals->values[slot];
                if (values->host != values->curr) {
@@ -207,8 +219,6 @@ static void kvm_on_user_return(struct user_return_notifier *urn)
                        values->curr = values->host;
                }
        }
-       locals->registered = false;
-       user_return_notifier_unregister(urn);
 }
 
 static void shared_msr_update(unsigned slot, u32 msr)
@@ -251,13 +261,14 @@ int kvm_set_shared_msr(unsigned slot, u64 value, u64 mask)
        struct kvm_shared_msrs *smsr = per_cpu_ptr(shared_msrs, cpu);
        int err;
 
-       if (((value ^ smsr->values[slot].curr) & mask) == 0)
+       value = (value & mask) | (smsr->values[slot].host & ~mask);
+       if (value == smsr->values[slot].curr)
                return 0;
-       smsr->values[slot].curr = value;
        err = wrmsrl_safe(shared_msrs_global.msrs[slot], value);
        if (err)
                return 1;
 
+       smsr->values[slot].curr = value;
        if (!smsr->registered) {
                smsr->urn.on_user_return = kvm_on_user_return;
                user_return_notifier_register(&smsr->urn);
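
The rework above also changes the contract of kvm_set_shared_msr(): bits outside `mask` are forced back to the host value instead of being written through, and `curr` is only updated once the WRMSR has succeeded, so a faulting write can no longer leave the cache claiming a value the MSR never took. A minimal userspace sketch of why the composition matters (all values are made up):

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
            uint64_t host  = 0xffff0000;  /* value the MSR must hold in host context */
            uint64_t guest = 0x0000ff01;  /* value requested for guest context */
            uint64_t mask  = 0x000000ff;  /* only these bits may differ from host */

            /* Old behaviour: the full guest value reached the MSR,
             * clobbering host bits outside the mask. */
            uint64_t old_write = guest;

            /* New behaviour: guest bits are merged into the host value. */
            uint64_t new_write = (guest & mask) | (host & ~mask);

            printf("old: %#010llx (host bits lost)\n", (unsigned long long)old_write);
            printf("new: %#010llx (host bits kept)\n", (unsigned long long)new_write);
            return 0;
    }
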
@@ -514,8 +525,14 @@ static int kvm_read_nested_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn,
                                       data, offset, len, access);
 }
 
+static inline u64 pdptr_rsvd_bits(struct kvm_vcpu *vcpu)
+{
+       return rsvd_bits(cpuid_maxphyaddr(vcpu), 63) | rsvd_bits(5, 8) |
+              rsvd_bits(1, 2);
+}
+
 /*
- * Load the pae pdptrs.  Return true is they are all valid.
+ * Load the pae pdptrs.  Return 1 if they are all valid, 0 otherwise.
  */
 int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3)
 {
@@ -534,8 +551,7 @@ int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3)
        }
        for (i = 0; i < ARRAY_SIZE(pdpte); ++i) {
                if (is_present_gpte(pdpte[i]) &&
-                   (pdpte[i] &
-                    vcpu->arch.mmu.guest_rsvd_check.rsvd_bits_mask[0][2])) {
+                   (pdpte[i] & pdptr_rsvd_bits(vcpu))) {
                        ret = 0;
                        goto out;
                }
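
pdptr_rsvd_bits() hard-codes the architectural PDPTE reserved bits (everything above MAXPHYADDR, bits 8:5 and 2:1) instead of borrowing the MMU's rsvd_bits_mask, which may not describe the current mode at this point. A small sketch of the arithmetic, assuming rsvd_bits() builds an inclusive bit-range mask as KVM's helper does:

    #include <stdint.h>
    #include <stdio.h>

    /* Inclusive [s, e] bit-range mask, mirroring KVM's rsvd_bits(). */
    static uint64_t rsvd_bits(int s, int e)
    {
            return ((1ULL << (e - s + 1)) - 1) << s;
    }

    int main(void)
    {
            int maxphyaddr = 36;    /* hypothetical guest MAXPHYADDR */
            uint64_t mask = rsvd_bits(maxphyaddr, 63) |
                            rsvd_bits(5, 8) | rsvd_bits(1, 2);

            printf("PDPTE reserved mask: %#018llx\n", (unsigned long long)mask);
            return 0;
    }
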
@@ -561,7 +577,7 @@ static bool pdptrs_changed(struct kvm_vcpu *vcpu)
        gfn_t gfn;
        int r;
 
-       if (is_long_mode(vcpu) || !is_pae(vcpu))
+       if (is_long_mode(vcpu) || !is_pae(vcpu) || !is_paging(vcpu))
                return false;
 
        if (!test_bit(VCPU_EXREG_PDPTR,
@@ -697,7 +713,6 @@ static int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
                if ((xcr0 & XFEATURE_MASK_AVX512) != XFEATURE_MASK_AVX512)
                        return 1;
        }
-       kvm_put_guest_xcr0(vcpu);
        vcpu->arch.xcr0 = xcr0;
 
        if ((xcr0 ^ old_xcr0) & XFEATURE_MASK_EXTEND)
@@ -751,7 +766,8 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
                        return 1;
 
                /* PCID can not be enabled when cr3[11:0]!=000H or EFER.LMA=0 */
-               if ((kvm_read_cr3(vcpu) & X86_CR3_PCID_MASK) || !is_long_mode(vcpu))
+               if ((kvm_read_cr3(vcpu) & X86_CR3_PCID_ASID_MASK) ||
+                   !is_long_mode(vcpu))
                        return 1;
        }
 
@@ -858,9 +874,11 @@ static u64 kvm_dr6_fixed(struct kvm_vcpu *vcpu)
 
 static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
 {
+       size_t size = ARRAY_SIZE(vcpu->arch.db);
+
        switch (dr) {
        case 0 ... 3:
-               vcpu->arch.db[dr] = val;
+               vcpu->arch.db[array_index_nospec(dr, size)] = val;
                if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP))
                        vcpu->arch.eff_db[dr] = val;
                break;
@@ -897,9 +915,11 @@ EXPORT_SYMBOL_GPL(kvm_set_dr);
 
 int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val)
 {
+       size_t size = ARRAY_SIZE(vcpu->arch.db);
+
        switch (dr) {
        case 0 ... 3:
-               *val = vcpu->arch.db[dr];
+               *val = vcpu->arch.db[array_index_nospec(dr, size)];
                break;
        case 4:
                /* fall through */
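
Both DR accessors now clamp `dr` with array_index_nospec() so that a mispredicted bounds check cannot be abused to access memory beyond db[] under speculation (Spectre v1). A simplified userspace rendition of the generic mask construction, assuming 64-bit values and an arithmetic right shift on signed integers:

    #include <stdint.h>
    #include <stdio.h>

    /* All-ones when index < size, 0 otherwise, computed without a
     * conditional branch that the CPU could mispredict. */
    static uint64_t index_mask_nospec(uint64_t index, uint64_t size)
    {
            return ~(int64_t)(index | (size - 1 - index)) >> 63;
    }

    int main(void)
    {
            uint64_t db[4] = { 11, 22, 33, 44 };
            uint64_t dr = 7;        /* out-of-range, attacker-influenced index */

            /* Out-of-range indices collapse to 0 instead of reading past db[]. */
            printf("db[%llu] = %llu\n",
                   (unsigned long long)(dr & index_mask_nospec(dr, 4)),
                   (unsigned long long)db[dr & index_mask_nospec(dr, 4)]);
            return 0;
    }
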
@@ -951,7 +971,8 @@ static u32 msrs_to_save[] = {
        MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
 #endif
        MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA,
-       MSR_IA32_FEATURE_CONTROL, MSR_IA32_BNDCFGS
+       MSR_IA32_FEATURE_CONTROL, MSR_IA32_BNDCFGS, MSR_TSC_AUX,
+       MSR_IA32_SPEC_CTRL, MSR_IA32_ARCH_CAPABILITIES
 };
 
 static unsigned num_msrs_to_save;
@@ -975,15 +996,55 @@ static u32 emulated_msrs[] = {
        MSR_IA32_MCG_STATUS,
        MSR_IA32_MCG_CTL,
        MSR_IA32_SMBASE,
+       MSR_AMD64_VIRT_SPEC_CTRL,
 };
 
 static unsigned num_emulated_msrs;
 
-bool kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer)
+u64 kvm_get_arch_capabilities(void)
 {
-       if (efer & efer_reserved_bits)
-               return false;
+       u64 data;
+
+       rdmsrl_safe(MSR_IA32_ARCH_CAPABILITIES, &data);
+
+       if (!boot_cpu_has_bug(X86_BUG_CPU_MELTDOWN))
+               data |= ARCH_CAP_RDCL_NO;
+       if (!boot_cpu_has_bug(X86_BUG_SPEC_STORE_BYPASS))
+               data |= ARCH_CAP_SSB_NO;
+       if (!boot_cpu_has_bug(X86_BUG_MDS))
+               data |= ARCH_CAP_MDS_NO;
+
+       /*
+        * On TAA affected systems, export MDS_NO=0 when:
+        *      - TSX is enabled on the host, i.e. X86_FEATURE_RTM=1.
+        *      - Updated microcode is present. This is detected by
+        *        the presence of ARCH_CAP_TSX_CTRL_MSR and ensures
+        *        that VERW clears CPU buffers.
+        *
+        * When MDS_NO=0 is exported, guests deploy clear CPU buffer
+        * mitigation and don't complain:
+        *
+        *      "Vulnerable: Clear CPU buffers attempted, no microcode"
+        *
+        * If TSX is disabled on the system, guests are also mitigated against
+        * TAA and clear CPU buffer mitigation is not required for guests.
+        */
+       if (!boot_cpu_has(X86_FEATURE_RTM))
+               data &= ~ARCH_CAP_TAA_NO;
+       else if (!boot_cpu_has_bug(X86_BUG_TAA))
+               data |= ARCH_CAP_TAA_NO;
+       else if (data & ARCH_CAP_TSX_CTRL_MSR)
+               data &= ~ARCH_CAP_MDS_NO;
 
+       /* KVM does not emulate MSR_IA32_TSX_CTRL.  */
+       data &= ~ARCH_CAP_TSX_CTRL_MSR;
+       return data;
+}
+
+EXPORT_SYMBOL_GPL(kvm_get_arch_capabilities);
+
+static bool __kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer)
+{
        if (efer & EFER_FFXSR) {
                struct kvm_cpuid_entry2 *feat;
 
@@ -1001,19 +1062,33 @@ bool kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer)
        }
 
        return true;
+}
+
+bool kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer)
+{
+       if (efer & efer_reserved_bits)
+               return false;
+
+       return __kvm_valid_efer(vcpu, efer);
 }
 EXPORT_SYMBOL_GPL(kvm_valid_efer);
 
-static int set_efer(struct kvm_vcpu *vcpu, u64 efer)
+static int set_efer(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 {
        u64 old_efer = vcpu->arch.efer;
+       u64 efer = msr_info->data;
 
-       if (!kvm_valid_efer(vcpu, efer))
+       if (efer & efer_reserved_bits)
                return 1;
 
-       if (is_paging(vcpu)
-           && (vcpu->arch.efer & EFER_LME) != (efer & EFER_LME))
-               return 1;
+       if (!msr_info->host_initiated) {
+               if (!__kvm_valid_efer(vcpu, efer))
+                       return 1;
+
+               if (is_paging(vcpu) &&
+                   (vcpu->arch.efer & EFER_LME) != (efer & EFER_LME))
+                       return 1;
+       }
 
        efer &= ~EFER_LMA;
        efer |= vcpu->arch.efer & EFER_LMA;
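
kvm_get_arch_capabilities() synthesizes the value a guest should see from the host's bug state rather than passing the raw MSR through. The branch structure is easier to follow in isolation; below is a compilable sketch with the same decision logic (bit positions as in msr-index.h of this era; the host inputs in main() are invented):

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    #define ARCH_CAP_RDCL_NO        (1ULL << 0)
    #define ARCH_CAP_SSB_NO         (1ULL << 4)
    #define ARCH_CAP_MDS_NO         (1ULL << 5)
    #define ARCH_CAP_TSX_CTRL_MSR   (1ULL << 7)
    #define ARCH_CAP_TAA_NO         (1ULL << 8)

    static uint64_t synth_caps(uint64_t host, bool bug_meltdown, bool bug_ssb,
                               bool bug_mds, bool bug_taa, bool has_rtm)
    {
            uint64_t data = host;

            if (!bug_meltdown)
                    data |= ARCH_CAP_RDCL_NO;
            if (!bug_ssb)
                    data |= ARCH_CAP_SSB_NO;
            if (!bug_mds)
                    data |= ARCH_CAP_MDS_NO;

            if (!has_rtm)                   /* TSX off: TAA not reachable */
                    data &= ~ARCH_CAP_TAA_NO;
            else if (!bug_taa)              /* TSX on, host unaffected */
                    data |= ARCH_CAP_TAA_NO;
            else if (data & ARCH_CAP_TSX_CTRL_MSR)
                    data &= ~ARCH_CAP_MDS_NO;  /* make the guest deploy VERW */

            data &= ~ARCH_CAP_TSX_CTRL_MSR;    /* KVM does not emulate TSX_CTRL */
            return data;
    }

    int main(void)
    {
            /* Hypothetical host: TAA-affected, TSX on, updated microcode. */
            uint64_t caps = synth_caps(ARCH_CAP_TSX_CTRL_MSR | ARCH_CAP_MDS_NO,
                                       true, true, false, true, true);
            printf("guest ARCH_CAPABILITIES: %#llx\n", (unsigned long long)caps);
            return 0;
    }
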
@@ -1271,7 +1346,7 @@ static int set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale)
                        vcpu->arch.tsc_always_catchup = 1;
                        return 0;
                } else {
-                       WARN(1, "user requested TSC rate below hardware speed\n");
+                       pr_warn_ratelimited("user requested TSC rate below hardware speed\n");
                        return -1;
                }
        }
@@ -1281,8 +1356,8 @@ static int set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale)
                                user_tsc_khz, tsc_khz);
 
        if (ratio == 0 || ratio >= kvm_max_tsc_scaling_ratio) {
-               WARN_ONCE(1, "Invalid TSC scaling ratio - virtual-tsc-khz=%u\n",
-                         user_tsc_khz);
+               pr_warn_ratelimited("Invalid TSC scaling ratio - virtual-tsc-khz=%u\n",
+                                   user_tsc_khz);
                return -1;
        }
 
@@ -1804,6 +1879,9 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
         */
        BUILD_BUG_ON(offsetof(struct pvclock_vcpu_time_info, version) != 0);
 
+       if (guest_hv_clock.version & 1)
+               ++guest_hv_clock.version;  /* first time write, random junk */
+
        vcpu->hv_clock.version = guest_hv_clock.version + 1;
        kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
                                &vcpu->hv_clock,
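
The added version check keeps the pvclock protocol intact on the first update after boot or migration, when the guest-visible copy may hold junk with the low bit already set: the version must be odd only while the structure is being rewritten and even whenever it is stable. A sketch of the guest-side reader this protects, with an illustrative (shortened) struct layout:

    #include <stdint.h>

    struct timeinfo {
            volatile uint32_t version;      /* odd while an update is in flight */
            uint64_t system_time;
    };

    static uint64_t read_stable(struct timeinfo *ti)
    {
            uint32_t v;
            uint64_t t;

            do {
                    v = ti->version;
                    __sync_synchronize();   /* read the payload after the version */
                    t = ti->system_time;
                    __sync_synchronize();
            } while ((v & 1) || v != ti->version);  /* retry across updates */

            return t;
    }

    int main(void)
    {
            struct timeinfo ti = { .version = 2, .system_time = 123456789 };

            return read_stable(&ti) != 123456789;
    }
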
@@ -1916,7 +1994,10 @@ static int set_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 data)
        default:
                if (msr >= MSR_IA32_MC0_CTL &&
                    msr < MSR_IA32_MCx_CTL(bank_num)) {
-                       u32 offset = msr - MSR_IA32_MC0_CTL;
+                       u32 offset = array_index_nospec(
+                               msr - MSR_IA32_MC0_CTL,
+                               MSR_IA32_MCx_CTL(bank_num) - MSR_IA32_MC0_CTL);
+
                        /* only 0 or all 1s can be written to IA32_MCi_CTL
                         * some Linux kernels though clear bit 10 in bank 4 to
                         * workaround a BIOS/GART TBL issue on AMD K8s, ignore
@@ -2040,8 +2121,13 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
        case MSR_AMD64_BU_CFG2:
                break;
 
+       case MSR_IA32_ARCH_CAPABILITIES:
+               if (!msr_info->host_initiated)
+                       return 1;
+               vcpu->arch.arch_capabilities = data;
+               break;
        case MSR_EFER:
-               return set_efer(vcpu, data);
+               return set_efer(vcpu, msr_info);
        case MSR_K7_HWCR:
                data &= ~(u64)0x40;     /* ignore flush filter disable */
                data &= ~(u64)0x100;    /* ignore ignne emulation enable */
@@ -2272,7 +2358,10 @@ static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
        default:
                if (msr >= MSR_IA32_MC0_CTL &&
                    msr < MSR_IA32_MCx_CTL(bank_num)) {
-                       u32 offset = msr - MSR_IA32_MC0_CTL;
+                       u32 offset = array_index_nospec(
+                               msr - MSR_IA32_MC0_CTL,
+                               MSR_IA32_MCx_CTL(bank_num) - MSR_IA32_MC0_CTL);
+
                        data = vcpu->arch.mce_banks[offset];
                        break;
                }
@@ -2314,6 +2403,12 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
        case MSR_IA32_UCODE_REV:
                msr_info->data = 0x100000000ULL;
                break;
+       case MSR_IA32_ARCH_CAPABILITIES:
+               if (!msr_info->host_initiated &&
+                   !guest_cpuid_has_arch_capabilities(vcpu))
+                       return 1;
+               msr_info->data = vcpu->arch.arch_capabilities;
+               break;
        case MSR_MTRRcap:
        case 0x200 ... 0x2ff:
                return kvm_mtrr_get_msr(vcpu, msr_info->index, &msr_info->data);
@@ -2571,7 +2666,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
                 * fringe case that is not enabled except via specific settings
                 * of the module parameters.
                 */
-               r = kvm_x86_ops->cpu_has_high_real_mode_segbase();
+               r = kvm_x86_ops->has_emulated_msr(MSR_IA32_SMBASE);
                break;
        case KVM_CAP_COALESCED_MMIO:
                r = KVM_COALESCED_MMIO_PAGE_OFFSET;
@@ -2743,6 +2838,12 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
        kvm_x86_ops->vcpu_put(vcpu);
        kvm_put_guest_fpu(vcpu);
        vcpu->arch.last_host_tsc = rdtsc();
+       /*
+        * If userspace has set any breakpoints or watchpoints, dr6 is restored
+        * on every vmexit, but if not, we might have a stale dr6 from the
+        * guest. do_debug expects dr6 to be cleared after it runs, do the same.
+        */
+       set_debugreg(0, 6);
 }
 
 static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
@@ -2941,6 +3042,8 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
        memset(&events->reserved, 0, sizeof(events->reserved));
 }
 
+static void kvm_set_hflags(struct kvm_vcpu *vcpu, unsigned emul_flags);
+
 static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
                                              struct kvm_vcpu_events *events)
 {
@@ -2950,6 +3053,16 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
                              | KVM_VCPUEVENT_VALID_SMM))
                return -EINVAL;
 
+       if (events->exception.injected &&
+           (events->exception.nr > 31 || events->exception.nr == NMI_VECTOR))
+               return -EINVAL;
+
+       /* INITs are latched while in SMM */
+       if (events->flags & KVM_VCPUEVENT_VALID_SMM &&
+           (events->smi.smm || events->smi.pending) &&
+           vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED)
+               return -EINVAL;
+
        process_nmi(vcpu);
        vcpu->arch.exception.pending = events->exception.injected;
        vcpu->arch.exception.nr = events->exception.nr;
@@ -2973,10 +3086,13 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
                vcpu->arch.apic->sipi_vector = events->sipi_vector;
 
        if (events->flags & KVM_VCPUEVENT_VALID_SMM) {
+               u32 hflags = vcpu->arch.hflags;
                if (events->smi.smm)
-                       vcpu->arch.hflags |= HF_SMM_MASK;
+                       hflags |= HF_SMM_MASK;
                else
-                       vcpu->arch.hflags &= ~HF_SMM_MASK;
+                       hflags &= ~HF_SMM_MASK;
+               kvm_set_hflags(vcpu, hflags);
+
                vcpu->arch.smi_pending = events->smi.pending;
                if (events->smi.smm_inside_nmi)
                        vcpu->arch.hflags |= HF_SMM_INSIDE_NMI_MASK;
@@ -3014,6 +3130,11 @@ static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu,
        if (dbgregs->flags)
                return -EINVAL;
 
+       if (dbgregs->dr6 & ~0xffffffffull)
+               return -EINVAL;
+       if (dbgregs->dr7 & ~0xffffffffull)
+               return -EINVAL;
+
        memcpy(vcpu->arch.db, dbgregs->db, sizeof(vcpu->arch.db));
        kvm_update_dr0123(vcpu);
        vcpu->arch.dr6 = dbgregs->dr6;
@@ -3039,6 +3160,7 @@ static void fill_xsave(u8 *dest, struct kvm_vcpu *vcpu)
        memcpy(dest, xsave, XSAVE_HDR_OFFSET);
 
        /* Set XSTATE_BV */
+       xstate_bv &= vcpu->arch.guest_supported_xcr0 | XFEATURE_MASK_FPSSE;
        *(u64 *)(dest + XSAVE_HDR_OFFSET) = xstate_bv;
 
        /*
@@ -3115,11 +3237,14 @@ static void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu,
        }
 }
 
+#define XSAVE_MXCSR_OFFSET 24
+
 static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu,
                                        struct kvm_xsave *guest_xsave)
 {
        u64 xstate_bv =
                *(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)];
+       u32 mxcsr = *(u32 *)&guest_xsave->region[XSAVE_MXCSR_OFFSET / sizeof(u32)];
 
        if (cpu_has_xsave) {
                /*
@@ -3127,11 +3252,13 @@ static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu,
                 * CPUID leaf 0xD, index 0, EDX:EAX.  This is for compatibility
                 * with old userspace.
                 */
-               if (xstate_bv & ~kvm_supported_xcr0())
+               if (xstate_bv & ~kvm_supported_xcr0() ||
+                       mxcsr & ~mxcsr_feature_mask)
                        return -EINVAL;
                load_xsave(vcpu, (u8 *)guest_xsave->region);
        } else {
-               if (xstate_bv & ~XFEATURE_MASK_FPSSE)
+               if (xstate_bv & ~XFEATURE_MASK_FPSSE ||
+                       mxcsr & ~mxcsr_feature_mask)
                        return -EINVAL;
                memcpy(&vcpu->arch.guest_fpu.state.fxsave,
                        guest_xsave->region, sizeof(struct fxregs_state));
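
The new check rejects XSAVE images whose MXCSR sets bits outside mxcsr_feature_mask, since restoring such a value would raise #GP in the host. MXCSR sits at byte offset 24 of the XSAVE area, per the macro above. A userspace sketch of the validation (0xffbf is a typical feature mask for CPUs without DAZ, not read from hardware here):

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    #define XSAVE_MXCSR_OFFSET 24

    static int mxcsr_ok(const uint8_t *xsave, uint32_t feature_mask)
    {
            uint32_t mxcsr;

            memcpy(&mxcsr, xsave + XSAVE_MXCSR_OFFSET, sizeof(mxcsr));
            return (mxcsr & ~feature_mask) == 0;
    }

    int main(void)
    {
            uint8_t region[576] = { 0 };

            region[XSAVE_MXCSR_OFFSET] = 0x80;      /* IM exception mask: fine */
            printf("valid image:   %d\n", mxcsr_ok(region, 0xffbf));

            region[XSAVE_MXCSR_OFFSET + 2] = 0x01;  /* bit 16 is reserved */
            printf("invalid image: %d\n", mxcsr_ok(region, 0xffbf));
            return 0;
    }
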
@@ -3313,6 +3440,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
        };
        case KVM_SET_VAPIC_ADDR: {
                struct kvm_vapic_addr va;
+               int idx;
 
                r = -EINVAL;
                if (!lapic_in_kernel(vcpu))
@@ -3320,7 +3448,9 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
                r = -EFAULT;
                if (copy_from_user(&va, argp, sizeof va))
                        goto out;
+               idx = srcu_read_lock(&vcpu->kvm->srcu);
                r = kvm_lapic_set_vapic_addr(vcpu, va.vapic_addr);
+               srcu_read_unlock(&vcpu->kvm->srcu, idx);
                break;
        }
        case KVM_X86_SETUP_MCE: {
@@ -3606,7 +3736,8 @@ static int kvm_vm_ioctl_set_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps)
               sizeof(kvm->arch.vpit->pit_state.channels));
        kvm->arch.vpit->pit_state.flags = ps->flags;
        for (i = 0; i < 3; i++)
-               kvm_pit_load_count(kvm, i, kvm->arch.vpit->pit_state.channels[i].count, start);
+               kvm_pit_load_count(kvm, i, kvm->arch.vpit->pit_state.channels[i].count,
+                                  start && i == 0);
        mutex_unlock(&kvm->arch.vpit->pit_state.lock);
        return 0;
 }
@@ -3806,7 +3937,7 @@ long kvm_arch_vm_ioctl(struct file *filp,
                                   sizeof(struct kvm_pit_config)))
                        goto out;
        create_pit:
-               mutex_lock(&kvm->slots_lock);
+               mutex_lock(&kvm->lock);
                r = -EEXIST;
                if (kvm->arch.vpit)
                        goto create_pit_unlock;
@@ -3815,7 +3946,7 @@ long kvm_arch_vm_ioctl(struct file *filp,
                if (kvm->arch.vpit)
                        r = 0;
        create_pit_unlock:
-               mutex_unlock(&kvm->slots_lock);
+               mutex_unlock(&kvm->lock);
                break;
        case KVM_GET_IRQCHIP: {
                /* 0: PIC master, 1: PIC slave, 2: IOAPIC */
@@ -3929,13 +4060,14 @@ long kvm_arch_vm_ioctl(struct file *filp,
                mutex_unlock(&kvm->lock);
                break;
        case KVM_XEN_HVM_CONFIG: {
+               struct kvm_xen_hvm_config xhc;
                r = -EFAULT;
-               if (copy_from_user(&kvm->arch.xen_hvm_config, argp,
-                                  sizeof(struct kvm_xen_hvm_config)))
+               if (copy_from_user(&xhc, argp, sizeof(xhc)))
                        goto out;
                r = -EINVAL;
-               if (kvm->arch.xen_hvm_config.flags)
+               if (xhc.flags)
                        goto out;
+               memcpy(&kvm->arch.xen_hvm_config, &xhc, sizeof(xhc));
                r = 0;
                break;
        }
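
The KVM_XEN_HVM_CONFIG change is a classic double-fetch fix: the ioctl used to copy_from_user() straight into kvm->arch and validate afterwards, so a racing writer could land half-validated state in the live structure. The snapshot-validate-commit shape, with illustrative types:

    #include <errno.h>
    #include <string.h>

    struct xen_cfg { unsigned int flags; unsigned long blob; };

    static int set_cfg(struct xen_cfg *live, const struct xen_cfg *user_buf)
    {
            struct xen_cfg xhc;

            memcpy(&xhc, user_buf, sizeof(xhc)); /* snapshot (copy_from_user) */
            if (xhc.flags)                       /* validate the snapshot */
                    return -EINVAL;
            memcpy(live, &xhc, sizeof(xhc));     /* publish only valid data */
            return 0;
    }

    int main(void)
    {
            struct xen_cfg live = { 0 };
            struct xen_cfg req = { .flags = 1 };

            return set_cfg(&live, &req) == -EINVAL ? 0 : 1;
    }
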
@@ -4005,16 +4137,17 @@ static void kvm_init_msr_list(void)
 
                /*
                 * Even MSRs that are valid in the host may not be exposed
-                * to the guests in some cases.  We could work around this
-                * in VMX with the generic MSR save/load machinery, but it
-                * is not really worthwhile since it will really only
-                * happen with nested virtualization.
+                * to the guests in some cases.
                 */
                switch (msrs_to_save[i]) {
                case MSR_IA32_BNDCFGS:
                        if (!kvm_x86_ops->mpx_supported())
                                continue;
                        break;
+               case MSR_TSC_AUX:
+                       if (!kvm_x86_ops->rdtscp_supported())
+                               continue;
+                       break;
                default:
                        break;
                }
@@ -4026,14 +4159,8 @@ static void kvm_init_msr_list(void)
        num_msrs_to_save = j;
 
        for (i = j = 0; i < ARRAY_SIZE(emulated_msrs); i++) {
-               switch (emulated_msrs[i]) {
-               case MSR_IA32_SMBASE:
-                       if (!kvm_x86_ops->cpu_has_high_real_mode_segbase())
-                               continue;
-                       break;
-               default:
-                       break;
-               }
+               if (!kvm_x86_ops->has_emulated_msr(emulated_msrs[i]))
+                       continue;
 
                if (j < i)
                        emulated_msrs[j] = emulated_msrs[i];
@@ -4075,7 +4202,7 @@ static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v)
                                         addr, n, v))
                    && kvm_io_bus_read(vcpu, KVM_MMIO_BUS, addr, n, v))
                        break;
-               trace_kvm_mmio(KVM_TRACE_MMIO_READ, n, addr, *(u64 *)v);
+               trace_kvm_mmio(KVM_TRACE_MMIO_READ, n, addr, v);
                handled += n;
                addr += n;
                len -= n;
@@ -4199,24 +4326,35 @@ static int kvm_fetch_guest_virt(struct x86_emulate_ctxt *ctxt,
        return X86EMUL_CONTINUE;
 }
 
-int kvm_read_guest_virt(struct x86_emulate_ctxt *ctxt,
+int kvm_read_guest_virt(struct kvm_vcpu *vcpu,
                               gva_t addr, void *val, unsigned int bytes,
                               struct x86_exception *exception)
 {
-       struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
        u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
 
+       /*
+        * FIXME: this should call handle_emulation_failure if X86EMUL_IO_NEEDED
+        * is returned, but our callers are not ready for that and they blindly
+        * call kvm_inject_page_fault.  Ensure that they at least do not leak
+        * uninitialized kernel stack memory into cr2 and error code.
+        */
+       memset(exception, 0, sizeof(*exception));
        return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access,
                                          exception);
 }
 EXPORT_SYMBOL_GPL(kvm_read_guest_virt);
 
-static int kvm_read_guest_virt_system(struct x86_emulate_ctxt *ctxt,
-                                     gva_t addr, void *val, unsigned int bytes,
-                                     struct x86_exception *exception)
+static int emulator_read_std(struct x86_emulate_ctxt *ctxt,
+                            gva_t addr, void *val, unsigned int bytes,
+                            struct x86_exception *exception, bool system)
 {
        struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
-       return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, 0, exception);
+       u32 access = 0;
+
+       if (!system && kvm_x86_ops->get_cpl(vcpu) == 3)
+               access |= PFERR_USER_MASK;
+
+       return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access, exception);
 }
 
 static int kvm_read_guest_phys_system(struct x86_emulate_ctxt *ctxt,
@@ -4228,18 +4366,16 @@ static int kvm_read_guest_phys_system(struct x86_emulate_ctxt *ctxt,
        return r < 0 ? X86EMUL_IO_NEEDED : X86EMUL_CONTINUE;
 }
 
-int kvm_write_guest_virt_system(struct x86_emulate_ctxt *ctxt,
-                                      gva_t addr, void *val,
-                                      unsigned int bytes,
-                                      struct x86_exception *exception)
+static int kvm_write_guest_virt_helper(gva_t addr, void *val, unsigned int bytes,
+                                     struct kvm_vcpu *vcpu, u32 access,
+                                     struct x86_exception *exception)
 {
-       struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
        void *data = val;
        int r = X86EMUL_CONTINUE;
 
        while (bytes) {
                gpa_t gpa =  vcpu->arch.walk_mmu->gva_to_gpa(vcpu, addr,
-                                                            PFERR_WRITE_MASK,
+                                                            access,
                                                             exception);
                unsigned offset = addr & (PAGE_SIZE-1);
                unsigned towrite = min(bytes, (unsigned)PAGE_SIZE - offset);
@@ -4260,6 +4396,34 @@ int kvm_write_guest_virt_system(struct x86_emulate_ctxt *ctxt,
 out:
        return r;
 }
+
+static int emulator_write_std(struct x86_emulate_ctxt *ctxt, gva_t addr, void *val,
+                             unsigned int bytes, struct x86_exception *exception,
+                             bool system)
+{
+       struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
+       u32 access = PFERR_WRITE_MASK;
+
+       if (!system && kvm_x86_ops->get_cpl(vcpu) == 3)
+               access |= PFERR_USER_MASK;
+
+       /*
+        * FIXME: this should call handle_emulation_failure if X86EMUL_IO_NEEDED
+        * is returned, but our callers are not ready for that and they blindly
+        * call kvm_inject_page_fault.  Ensure that they at least do not leak
+        * uninitialized kernel stack memory into cr2 and error code.
+        */
+       memset(exception, 0, sizeof(*exception));
+       return kvm_write_guest_virt_helper(addr, val, bytes, vcpu,
+                                          access, exception);
+}
+
+int kvm_write_guest_virt_system(struct kvm_vcpu *vcpu, gva_t addr, void *val,
+                               unsigned int bytes, struct x86_exception *exception)
+{
+       return kvm_write_guest_virt_helper(addr, val, bytes, vcpu,
+                                          PFERR_WRITE_MASK, exception);
+}
 EXPORT_SYMBOL_GPL(kvm_write_guest_virt_system);
 
 static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva,
@@ -4323,7 +4487,7 @@ static int read_prepare(struct kvm_vcpu *vcpu, void *val, int bytes)
 {
        if (vcpu->mmio_read_completed) {
                trace_kvm_mmio(KVM_TRACE_MMIO_READ, bytes,
-                              vcpu->mmio_fragments[0].gpa, *(u64 *)val);
+                              vcpu->mmio_fragments[0].gpa, val);
                vcpu->mmio_read_completed = 0;
                return 1;
        }
@@ -4345,14 +4509,14 @@ static int write_emulate(struct kvm_vcpu *vcpu, gpa_t gpa,
 
 static int write_mmio(struct kvm_vcpu *vcpu, gpa_t gpa, int bytes, void *val)
 {
-       trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, *(u64 *)val);
+       trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, val);
        return vcpu_mmio_write(vcpu, gpa, bytes, val);
 }
 
 static int read_exit_mmio(struct kvm_vcpu *vcpu, gpa_t gpa,
                          void *val, int bytes)
 {
-       trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, bytes, gpa, 0);
+       trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, bytes, gpa, NULL);
        return X86EMUL_IO_NEEDED;
 }
 
@@ -4573,16 +4737,20 @@ emul_write:
 
 static int kernel_pio(struct kvm_vcpu *vcpu, void *pd)
 {
-       /* TODO: String I/O for in kernel device */
-       int r;
+       int r = 0, i;
 
-       if (vcpu->arch.pio.in)
-               r = kvm_io_bus_read(vcpu, KVM_PIO_BUS, vcpu->arch.pio.port,
-                                   vcpu->arch.pio.size, pd);
-       else
-               r = kvm_io_bus_write(vcpu, KVM_PIO_BUS,
-                                    vcpu->arch.pio.port, vcpu->arch.pio.size,
-                                    pd);
+       for (i = 0; i < vcpu->arch.pio.count; i++) {
+               if (vcpu->arch.pio.in)
+                       r = kvm_io_bus_read(vcpu, KVM_PIO_BUS, vcpu->arch.pio.port,
+                                           vcpu->arch.pio.size, pd);
+               else
+                       r = kvm_io_bus_write(vcpu, KVM_PIO_BUS,
+                                            vcpu->arch.pio.port, vcpu->arch.pio.size,
+                                            pd);
+               if (r)
+                       break;
+               pd += vcpu->arch.pio.size;
+       }
        return r;
 }
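
kernel_pio() previously forwarded only the first element of a string PIO operation to the in-kernel device model; the loop now replays the transfer once per element, advancing the data pointer by the port size each round and stopping at the first error. Its shape, reduced to a userspace sketch with a caller-supplied transfer callback:

    #include <stdio.h>

    static int forward_pio(void *buf, int size, int count,
                           int (*xfer)(void *chunk, int size))
    {
            int i, r = 0;

            for (i = 0; i < count; i++) {
                    r = xfer(buf, size);
                    if (r)          /* stop on the first device error */
                            break;
                    buf = (char *)buf + size;
            }
            return r;
    }

    static int dump_chunk(void *chunk, int size)
    {
            printf("chunk of %d bytes at %p\n", size, chunk);
            return 0;
    }

    int main(void)
    {
            char data[8] = { 0 };

            return forward_pio(data, 2, 4, dump_chunk); /* four 2-byte elements */
    }
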
 
@@ -4620,6 +4788,8 @@ static int emulator_pio_in_emulated(struct x86_emulate_ctxt *ctxt,
        if (vcpu->arch.pio.count)
                goto data_avail;
 
+       memset(vcpu->arch.pio_data, 0, size * count);
+
        ret = emulator_pio_in_out(vcpu, size, port, val, count, true);
        if (ret) {
 data_avail:
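
The memset added before emulator_pio_in_out() closes a potential information leak: on a failed or partially satisfied IN, the shared pio_data page would otherwise still carry bytes from an earlier exit, which the guest could read back. The pattern in general form, as a sketch:

    #include <string.h>

    /* Zero the whole destination before an I/O that may fill it only
     * partially, so the unfilled tail cannot expose stale data. */
    static void read_port_data(void *dst, size_t size, size_t count,
                               int (*do_in)(void *dst, size_t len))
    {
            memset(dst, 0, size * count);
            do_in(dst, size * count);       /* may fail or fill only part */
    }
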
@@ -4803,6 +4973,8 @@ static bool emulator_get_segment(struct x86_emulate_ctxt *ctxt, u16 *selector,
 
        if (var.unusable) {
                memset(desc, 0, sizeof(*desc));
+               if (base3)
+                       *base3 = 0;
                return false;
        }
 
@@ -4958,11 +5130,21 @@ static void emulator_set_nmi_mask(struct x86_emulate_ctxt *ctxt, bool masked)
        kvm_x86_ops->set_nmi_mask(emul_to_vcpu(ctxt), masked);
 }
 
+static unsigned emulator_get_hflags(struct x86_emulate_ctxt *ctxt)
+{
+       return emul_to_vcpu(ctxt)->arch.hflags;
+}
+
+static void emulator_set_hflags(struct x86_emulate_ctxt *ctxt, unsigned emul_flags)
+{
+       kvm_set_hflags(emul_to_vcpu(ctxt), emul_flags);
+}
+
 static const struct x86_emulate_ops emulate_ops = {
        .read_gpr            = emulator_read_gpr,
        .write_gpr           = emulator_write_gpr,
-       .read_std            = kvm_read_guest_virt_system,
-       .write_std           = kvm_write_guest_virt_system,
+       .read_std            = emulator_read_std,
+       .write_std           = emulator_write_std,
        .read_phys           = kvm_read_guest_phys_system,
        .fetch               = kvm_fetch_guest_virt,
        .read_emulated       = emulator_read_emulated,
@@ -4997,6 +5179,8 @@ static const struct x86_emulate_ops emulate_ops = {
        .intercept           = emulator_intercept,
        .get_cpuid           = emulator_get_cpuid,
        .set_nmi_mask        = emulator_set_nmi_mask,
+       .get_hflags          = emulator_get_hflags,
+       .set_hflags          = emulator_set_hflags,
 };
 
 static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask)
@@ -5040,6 +5224,8 @@ static void init_emulate_ctxt(struct kvm_vcpu *vcpu)
        kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
 
        ctxt->eflags = kvm_get_rflags(vcpu);
+       ctxt->tf = (ctxt->eflags & X86_EFLAGS_TF) != 0;
+
        ctxt->eip = kvm_rip_read(vcpu);
        ctxt->mode = (!is_protmode(vcpu))               ? X86EMUL_MODE_REAL :
                     (ctxt->eflags & X86_EFLAGS_VM)     ? X86EMUL_MODE_VM86 :
@@ -5049,7 +5235,6 @@ static void init_emulate_ctxt(struct kvm_vcpu *vcpu)
        BUILD_BUG_ON(HF_GUEST_MASK != X86EMUL_GUEST_MASK);
        BUILD_BUG_ON(HF_SMM_MASK != X86EMUL_SMM_MASK);
        BUILD_BUG_ON(HF_SMM_INSIDE_NMI_MASK != X86EMUL_SMM_INSIDE_NMI_MASK);
-       ctxt->emul_flags = vcpu->arch.hflags;
 
        init_decode_cache(ctxt);
        vcpu->arch.emulate_regs_need_sync_from_vcpu = false;
@@ -5093,7 +5278,7 @@ static int handle_emulation_failure(struct kvm_vcpu *vcpu)
                vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
                vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
                vcpu->run->internal.ndata = 0;
-               r = EMULATE_FAIL;
+               r = EMULATE_USER_EXIT;
        }
        kvm_queue_exception(vcpu, UD_VECTOR);
 
@@ -5261,37 +5446,26 @@ static int kvm_vcpu_check_hw_bp(unsigned long addr, u32 type, u32 dr7,
        return dr6;
 }
 
-static void kvm_vcpu_check_singlestep(struct kvm_vcpu *vcpu, unsigned long rflags, int *r)
+static void kvm_vcpu_do_singlestep(struct kvm_vcpu *vcpu, int *r)
 {
        struct kvm_run *kvm_run = vcpu->run;
 
-       /*
-        * rflags is the old, "raw" value of the flags.  The new value has
-        * not been saved yet.
-        *
-        * This is correct even for TF set by the guest, because "the
-        * processor will not generate this exception after the instruction
-        * that sets the TF flag".
-        */
-       if (unlikely(rflags & X86_EFLAGS_TF)) {
-               if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) {
-                       kvm_run->debug.arch.dr6 = DR6_BS | DR6_FIXED_1 |
-                                                 DR6_RTM;
-                       kvm_run->debug.arch.pc = vcpu->arch.singlestep_rip;
-                       kvm_run->debug.arch.exception = DB_VECTOR;
-                       kvm_run->exit_reason = KVM_EXIT_DEBUG;
-                       *r = EMULATE_USER_EXIT;
-               } else {
-                       vcpu->arch.emulate_ctxt.eflags &= ~X86_EFLAGS_TF;
-                       /*
-                        * "Certain debug exceptions may clear bit 0-3.  The
-                        * remaining contents of the DR6 register are never
-                        * cleared by the processor".
-                        */
-                       vcpu->arch.dr6 &= ~15;
-                       vcpu->arch.dr6 |= DR6_BS | DR6_RTM;
-                       kvm_queue_exception(vcpu, DB_VECTOR);
-               }
+       if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) {
+               kvm_run->debug.arch.dr6 = DR6_BS | DR6_FIXED_1 | DR6_RTM;
+               kvm_run->debug.arch.pc = vcpu->arch.singlestep_rip;
+               kvm_run->debug.arch.exception = DB_VECTOR;
+               kvm_run->exit_reason = KVM_EXIT_DEBUG;
+               *r = EMULATE_USER_EXIT;
+       } else {
+               vcpu->arch.emulate_ctxt.eflags &= ~X86_EFLAGS_TF;
+               /*
+                * "Certain debug exceptions may clear bit 0-3.  The
+                * remaining contents of the DR6 register are never
+                * cleared by the processor".
+                */
+               vcpu->arch.dr6 &= ~15;
+               vcpu->arch.dr6 |= DR6_BS | DR6_RTM;
+               kvm_queue_exception(vcpu, DB_VECTOR);
        }
 }
 
@@ -5361,7 +5535,8 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu,
                 * handle watchpoints yet, those would be handled in
                 * the emulate_ops.
                 */
-               if (kvm_vcpu_check_breakpoint(vcpu, &r))
+               if (!(emulation_type & EMULTYPE_SKIP) &&
+                   kvm_vcpu_check_breakpoint(vcpu, &r))
                        return r;
 
                ctxt->interruptibility = 0;
@@ -5381,6 +5556,16 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu,
                        if (reexecute_instruction(vcpu, cr2, write_fault_to_spt,
                                                emulation_type))
                                return EMULATE_DONE;
+                       if (ctxt->have_exception) {
+                               /*
+                                * #UD should result in just EMULATION_FAILED, and trap-like
+                                * exception should not be encountered during decode.
+                                */
+                               WARN_ON_ONCE(ctxt->exception.vector == UD_VECTOR ||
+                                            exception_type(ctxt->exception.vector) == EXCPT_TRAP);
+                               inject_emulated_exception(vcpu);
+                               return EMULATE_DONE;
+                       }
                        if (emulation_type & EMULTYPE_SKIP)
                                return EMULATE_FAIL;
                        return handle_emulation_failure(vcpu);
@@ -5445,14 +5630,13 @@ restart:
                unsigned long rflags = kvm_x86_ops->get_rflags(vcpu);
                toggle_interruptibility(vcpu, ctxt->interruptibility);
                vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
-               if (vcpu->arch.hflags != ctxt->emul_flags)
-                       kvm_set_hflags(vcpu, ctxt->emul_flags);
-               kvm_rip_write(vcpu, ctxt->eip);
-               if (r == EMULATE_DONE)
-                       kvm_vcpu_check_singlestep(vcpu, rflags, &r);
                if (!ctxt->have_exception ||
-                   exception_type(ctxt->exception.vector) == EXCPT_TRAP)
+                   exception_type(ctxt->exception.vector) == EXCPT_TRAP) {
+                       kvm_rip_write(vcpu, ctxt->eip);
+                       if (r == EMULATE_DONE && ctxt->tf)
+                               kvm_vcpu_do_singlestep(vcpu, &r);
                        __kvm_set_rflags(vcpu, ctxt->eflags);
+               }
 
                /*
                 * For STI, interrupts are shadowed; so KVM_REQ_EVENT will
@@ -5701,14 +5885,12 @@ static void kvm_set_mmio_spte_mask(void)
        /* Set the present bit. */
        mask |= 1ull;
 
-#ifdef CONFIG_X86_64
        /*
         * If reserved bit is not supported, clear the present bit to disable
         * mmio page fault.
         */
        if (maxphyaddr == 52)
                mask &= ~1ull;
-#endif
 
        kvm_mmu_set_mmio_spte_mask(mask);
 }
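
This hunk is the one behind the commit subject: removing the #ifdef makes the MAXPHYADDR == 52 special case apply to 32-bit builds as well. When no reserved physical-address bit is left to tag MMIO sptes, the present bit is cleared and MMIO caching is disabled, instead of generating an spte the hardware would treat as valid. Reduced to its arithmetic (illustrative; the real function also folds in reserved-bit selection not shown in the hunk):

    #include <stdint.h>
    #include <stdio.h>

    static uint64_t mmio_spte_mask(int maxphyaddr)
    {
            uint64_t mask = 0;

            /* Set the present bit, as the function above does. */
            mask |= 1ULL;

            /* No reserved bit above the PA range: clear present and fall
             * back to slow MMIO faults, now on 32-bit builds too. */
            if (maxphyaddr == 52)
                    mask &= ~1ULL;

            return mask;
    }

    int main(void)
    {
            printf("maxphyaddr 46 -> %#llx\n", (unsigned long long)mmio_spte_mask(46));
            printf("maxphyaddr 52 -> %#llx\n", (unsigned long long)mmio_spte_mask(52));
            return 0;
    }
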
@@ -5819,6 +6001,7 @@ out:
 
 void kvm_arch_exit(void)
 {
+       kvm_lapic_exit();
        perf_unregister_guest_info_callbacks(&kvm_guest_cbs);
 
        if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
@@ -5932,7 +6115,8 @@ static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt)
 
        kvm_x86_ops->patch_hypercall(vcpu, instruction);
 
-       return emulator_write_emulated(ctxt, rip, instruction, 3, NULL);
+       return emulator_write_emulated(ctxt, rip, instruction, 3,
+               &ctxt->exception);
 }
 
 static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu)
@@ -6021,12 +6205,10 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool req_int_win)
        }
 
        /* try to inject new event if pending */
-       if (vcpu->arch.nmi_pending) {
-               if (kvm_x86_ops->nmi_allowed(vcpu)) {
-                       --vcpu->arch.nmi_pending;
-                       vcpu->arch.nmi_injected = true;
-                       kvm_x86_ops->set_nmi(vcpu);
-               }
+       if (vcpu->arch.nmi_pending && kvm_x86_ops->nmi_allowed(vcpu)) {
+               --vcpu->arch.nmi_pending;
+               vcpu->arch.nmi_injected = true;
+               kvm_x86_ops->set_nmi(vcpu);
        } else if (kvm_cpu_has_injectable_intr(vcpu)) {
                /*
                 * Because interrupts can be injected asynchronously, we are
@@ -6311,7 +6493,8 @@ static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu)
                kvm_scan_ioapic_routes(vcpu, vcpu->arch.eoi_exit_bitmap);
        else {
                kvm_x86_ops->sync_pir_to_irr(vcpu);
-               kvm_ioapic_scan_entry(vcpu, vcpu->arch.eoi_exit_bitmap);
+               if (ioapic_in_kernel(vcpu->kvm))
+                       kvm_ioapic_scan_entry(vcpu, vcpu->arch.eoi_exit_bitmap);
        }
        kvm_x86_ops->load_eoi_exitmap(vcpu);
 }
@@ -6395,6 +6578,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
                }
                if (kvm_check_request(KVM_REQ_TRIPLE_FAULT, vcpu)) {
                        vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN;
+                       vcpu->mmio_needed = 0;
                        r = 0;
                        goto out;
                }
@@ -6471,10 +6655,12 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
                if (inject_pending_event(vcpu, req_int_win) != 0)
                        req_immediate_exit = true;
                /* enable NMI/IRQ window open exits if needed */
-               else if (vcpu->arch.nmi_pending)
-                       kvm_x86_ops->enable_nmi_window(vcpu);
-               else if (kvm_cpu_has_injectable_intr(vcpu) || req_int_win)
-                       kvm_x86_ops->enable_irq_window(vcpu);
+               else {
+                       if (vcpu->arch.nmi_pending)
+                               kvm_x86_ops->enable_nmi_window(vcpu);
+                       if (kvm_cpu_has_injectable_intr(vcpu) || req_int_win)
+                               kvm_x86_ops->enable_irq_window(vcpu);
+               }
 
                if (kvm_lapic_enabled(vcpu)) {
                        update_cr8_intercept(vcpu);
@@ -6492,8 +6678,6 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
        kvm_x86_ops->prepare_guest_switch(vcpu);
        if (vcpu->fpu_active)
                kvm_load_guest_fpu(vcpu);
-       kvm_load_guest_xcr0(vcpu);
-
        vcpu->mode = IN_GUEST_MODE;
 
        srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
@@ -6516,6 +6700,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
                goto cancel_injection;
        }
 
+       kvm_load_guest_xcr0(vcpu);
+
        if (req_immediate_exit)
                smp_send_reschedule(vcpu->cpu);
 
@@ -6542,12 +6728,12 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
         * KVM_DEBUGREG_WONT_EXIT again.
         */
        if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)) {
-               int i;
-
                WARN_ON(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP);
                kvm_x86_ops->sync_dirty_debug_regs(vcpu);
-               for (i = 0; i < KVM_NR_DB_REGS; i++)
-                       vcpu->arch.eff_db[i] = vcpu->arch.db[i];
+               kvm_update_dr0123(vcpu);
+               kvm_update_dr6(vcpu);
+               kvm_update_dr7(vcpu);
+               vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_RELOAD;
        }
 
        /*
@@ -6565,6 +6751,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
        vcpu->mode = OUTSIDE_GUEST_MODE;
        smp_wmb();
 
+       kvm_put_guest_xcr0(vcpu);
+
        /* Interrupt is enabled by handle_external_intr() */
        kvm_x86_ops->handle_external_intr(vcpu);
 
@@ -6888,7 +7076,7 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
 #endif
 
        kvm_rip_write(vcpu, regs->rip);
-       kvm_set_rflags(vcpu, regs->rflags);
+       kvm_set_rflags(vcpu, regs->rflags | X86_EFLAGS_FIXED);
 
        vcpu->arch.exception.pending = false;
 
@@ -6966,6 +7154,12 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
            mp_state->mp_state != KVM_MP_STATE_RUNNABLE)
                return -EINVAL;
 
+       /* INITs are latched while in SMM */
+       if ((is_smm(vcpu) || vcpu->arch.smi_pending) &&
+           (mp_state->mp_state == KVM_MP_STATE_SIPI_RECEIVED ||
+            mp_state->mp_state == KVM_MP_STATE_INIT_RECEIVED))
+               return -EINVAL;
+
        if (mp_state->mp_state == KVM_MP_STATE_SIPI_RECEIVED) {
                vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED;
                set_bit(KVM_APIC_SIPI, &vcpu->arch.apic->pending_events);
@@ -7037,7 +7231,7 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
                kvm_update_cpuid(vcpu);
 
        idx = srcu_read_lock(&vcpu->kvm->srcu);
-       if (!is_long_mode(vcpu) && is_pae(vcpu)) {
+       if (!is_long_mode(vcpu) && is_pae(vcpu) && is_paging(vcpu)) {
                load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu));
                mmu_reset_needed = 1;
        }
@@ -7212,7 +7406,6 @@ void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
         * and assume host would use all available bits.
         * Guest xcr0 would be loaded later.
         */
-       kvm_put_guest_xcr0(vcpu);
        vcpu->guest_fpu_loaded = 1;
        __kernel_fpu_begin();
        __copy_kernel_to_fpregs(&vcpu->arch.guest_fpu.state);
@@ -7221,8 +7414,6 @@ void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
 
 void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
 {
-       kvm_put_guest_xcr0(vcpu);
-
        if (!vcpu->guest_fpu_loaded) {
                vcpu->fpu_counter = 0;
                return;
@@ -7232,25 +7423,17 @@ void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
        copy_fpregs_to_fpstate(&vcpu->arch.guest_fpu);
        __kernel_fpu_end();
        ++vcpu->stat.fpu_reload;
-       /*
-        * If using eager FPU mode, or if the guest is a frequent user
-        * of the FPU, just leave the FPU active for next time.
-        * Every 255 times fpu_counter rolls over to 0; a guest that uses
-        * the FPU in bursts will revert to loading it on demand.
-        */
-       if (!vcpu->arch.eager_fpu) {
-               if (++vcpu->fpu_counter < 5)
-                       kvm_make_request(KVM_REQ_DEACTIVATE_FPU, vcpu);
-       }
        trace_kvm_fpu(0);
 }
 
 void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
 {
+       void *wbinvd_dirty_mask = vcpu->arch.wbinvd_dirty_mask;
+
        kvmclock_reset(vcpu);
 
-       free_cpumask_var(vcpu->arch.wbinvd_dirty_mask);
        kvm_x86_ops->vcpu_free(vcpu);
+       free_cpumask_var(wbinvd_dirty_mask);
 }
 
 struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm,
@@ -7272,6 +7455,7 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
 {
        int r;
 
+       vcpu->arch.arch_capabilities = kvm_get_arch_capabilities();
        kvm_vcpu_mtrr_init(vcpu);
        r = vcpu_load(vcpu);
        if (r)
@@ -7312,7 +7496,7 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
        kvm_mmu_unload(vcpu);
        vcpu_put(vcpu);
 
-       kvm_x86_ops->vcpu_free(vcpu);
+       kvm_arch_vcpu_free(vcpu);
 }
 
 void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
@@ -7700,7 +7884,7 @@ int __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size)
 
        slot = id_to_memslot(slots, id);
        if (size) {
-               if (WARN_ON(slot->npages))
+               if (slot->npages)
                        return -EEXIST;
 
                /*
@@ -8145,6 +8329,13 @@ static int apf_put_user(struct kvm_vcpu *vcpu, u32 val)
                                      sizeof(val));
 }
 
+static int apf_get_user(struct kvm_vcpu *vcpu, u32 *val)
+{
+
+       return kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.apf.data, val,
+                                     sizeof(u32));
+}
+
 void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu,
                                     struct kvm_async_pf *work)
 {
@@ -8171,21 +8362,32 @@ void kvm_arch_async_page_present(struct kvm_vcpu *vcpu,
                                 struct kvm_async_pf *work)
 {
        struct x86_exception fault;
+       u32 val;
 
-       trace_kvm_async_pf_ready(work->arch.token, work->gva);
        if (work->wakeup_all)
                work->arch.token = ~0; /* broadcast wakeup */
        else
                kvm_del_async_pf_gfn(vcpu, work->arch.gfn);
+       trace_kvm_async_pf_ready(work->arch.token, work->gva);
 
-       if ((vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED) &&
-           !apf_put_user(vcpu, KVM_PV_REASON_PAGE_READY)) {
-               fault.vector = PF_VECTOR;
-               fault.error_code_valid = true;
-               fault.error_code = 0;
-               fault.nested_page_fault = false;
-               fault.address = work->arch.token;
-               kvm_inject_page_fault(vcpu, &fault);
+       if (vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED &&
+           !apf_get_user(vcpu, &val)) {
+               if (val == KVM_PV_REASON_PAGE_NOT_PRESENT &&
+                   vcpu->arch.exception.pending &&
+                   vcpu->arch.exception.nr == PF_VECTOR &&
+                   !apf_put_user(vcpu, 0)) {
+                       vcpu->arch.exception.pending = false;
+                       vcpu->arch.exception.nr = 0;
+                       vcpu->arch.exception.has_error_code = false;
+                       vcpu->arch.exception.error_code = 0;
+               } else if (!apf_put_user(vcpu, KVM_PV_REASON_PAGE_READY)) {
+                       fault.vector = PF_VECTOR;
+                       fault.error_code_valid = true;
+                       fault.error_code = 0;
+                       fault.nested_page_fault = false;
+                       fault.address = work->arch.token;
+                       kvm_inject_page_fault(vcpu, &fault);
+               }
        }
        vcpu->arch.apf.halted = false;
        vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
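
The reworked "page ready" path first reads the shared reason word back through apf_get_user(): if the guest never consumed the earlier "page not present" event and that #PF injection is still pending, both are cancelled rather than queueing a second fault on top, which could escalate into a double fault. The decision logic as a standalone sketch (reason values mirror KVM_PV_REASON_*; the real code also verifies the pending vector is PF_VECTOR):

    #include <stdio.h>

    enum { REASON_NONE, REASON_PAGE_NOT_PRESENT, REASON_PAGE_READY };

    struct apf {
            int shared_reason;      /* word shared with the guest */
            int pf_pending;         /* a #PF injection is still queued */
    };

    /* Returns 1 if a "page ready" fault should be injected. */
    static int page_ready(struct apf *s)
    {
            if (s->shared_reason == REASON_PAGE_NOT_PRESENT && s->pf_pending) {
                    s->shared_reason = REASON_NONE; /* apf_put_user(vcpu, 0) */
                    s->pf_pending = 0;              /* drop the stale #PF */
                    return 0;
            }
            s->shared_reason = REASON_PAGE_READY;
            return 1;
    }

    int main(void)
    {
            struct apf s = { REASON_PAGE_NOT_PRESENT, 1 };

            printf("unconsumed not-present: inject=%d\n", page_ready(&s));
            printf("normal completion:      inject=%d\n", page_ready(&s));
            return 0;
    }
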
@@ -8196,8 +8398,7 @@ bool kvm_arch_can_inject_async_page_present(struct kvm_vcpu *vcpu)
        if (!(vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED))
                return true;
        else
-               return !kvm_event_needs_reinjection(vcpu) &&
-                       kvm_x86_ops->interrupt_allowed(vcpu);
+               return kvm_can_do_async_pf(vcpu);
 }
 
 void kvm_arch_start_assignment(struct kvm *kvm)