Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 829d03f..76b4803 100644
@@ -118,6 +118,7 @@ static void enter_smm(struct kvm_vcpu *vcpu);
 static void __kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags);
 static void store_regs(struct kvm_vcpu *vcpu);
 static int sync_regs(struct kvm_vcpu *vcpu);
+static int kvm_vcpu_do_singlestep(struct kvm_vcpu *vcpu);
 
 static int __set_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2);
 static void __get_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2);
@@ -210,7 +211,7 @@ static struct kvm_user_return_msrs __percpu *user_return_msrs;
 #define KVM_SUPPORTED_XCR0     (XFEATURE_MASK_FP | XFEATURE_MASK_SSE \
                                | XFEATURE_MASK_YMM | XFEATURE_MASK_BNDREGS \
                                | XFEATURE_MASK_BNDCSR | XFEATURE_MASK_AVX512 \
-                               | XFEATURE_MASK_PKRU)
+                               | XFEATURE_MASK_PKRU | XFEATURE_MASK_XTILE)
 
 u64 __read_mostly host_efer;
 EXPORT_SYMBOL_GPL(host_efer);
@@ -710,6 +711,17 @@ int kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err)
 }
 EXPORT_SYMBOL_GPL(kvm_complete_insn_gp);
 
+static int complete_emulated_insn_gp(struct kvm_vcpu *vcpu, int err)
+{
+       if (err) {
+               kvm_inject_gp(vcpu, 0);
+               return 1;
+       }
+
+       return kvm_emulate_instruction(vcpu, EMULTYPE_NO_DECODE | EMULTYPE_SKIP |
+                                      EMULTYPE_COMPLETE_USER_EXIT);
+}
+
 void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault)
 {
        ++vcpu->stat.pf_guest;
@@ -798,8 +810,9 @@ static inline u64 pdptr_rsvd_bits(struct kvm_vcpu *vcpu)
 /*
  * Load the pae pdptrs.  Return 1 if they are all valid, 0 otherwise.
  */
-int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3)
+int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)
 {
+       struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
        gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT;
        gpa_t real_gpa;
        int i;
@@ -810,8 +823,8 @@ int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3)
         * If the MMU is nested, CR3 holds an L2 GPA and needs to be translated
         * to an L1 GPA.
         */
-       real_gpa = mmu->translate_gpa(vcpu, gfn_to_gpa(pdpt_gfn),
-                                     PFERR_USER_MASK | PFERR_WRITE_MASK, NULL);
+       real_gpa = kvm_translate_gpa(vcpu, mmu, gfn_to_gpa(pdpt_gfn),
+                                    PFERR_USER_MASK | PFERR_WRITE_MASK, NULL);
        if (real_gpa == UNMAPPED_GVA)
                return 0;
 
@@ -828,8 +841,16 @@ int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3)
                }
        }
 
+       /*
+        * Marking VCPU_EXREG_PDPTR dirty doesn't work for !tdp_enabled.
+        * Shadow page roots need to be reconstructed instead.
+        */
+       if (!tdp_enabled && memcmp(mmu->pdptrs, pdpte, sizeof(mmu->pdptrs)))
+               kvm_mmu_free_roots(vcpu, mmu, KVM_MMU_ROOT_CURRENT);
+
        memcpy(mmu->pdptrs, pdpte, sizeof(mmu->pdptrs));
        kvm_register_mark_dirty(vcpu, VCPU_EXREG_PDPTR);
+       kvm_make_request(KVM_REQ_LOAD_MMU_PGD, vcpu);
        vcpu->arch.pdptrs_from_userspace = false;
 
        return 1;
@@ -856,7 +877,6 @@ EXPORT_SYMBOL_GPL(kvm_post_set_cr0);
 int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
 {
        unsigned long old_cr0 = kvm_read_cr0(vcpu);
-       unsigned long pdptr_bits = X86_CR0_CD | X86_CR0_NW | X86_CR0_PG;
 
        cr0 |= X86_CR0_ET;
 
@@ -886,8 +906,8 @@ int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
        }
 #endif
        if (!(vcpu->arch.efer & EFER_LME) && (cr0 & X86_CR0_PG) &&
-           is_pae(vcpu) && ((cr0 ^ old_cr0) & pdptr_bits) &&
-           !load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu)))
+           is_pae(vcpu) && ((cr0 ^ old_cr0) & X86_CR0_PDPTR_BITS) &&
+           !load_pdptrs(vcpu, kvm_read_cr3(vcpu)))
                return 1;
 
        if (!(cr0 & X86_CR0_PG) &&
@@ -990,6 +1010,11 @@ static int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
                if ((xcr0 & XFEATURE_MASK_AVX512) != XFEATURE_MASK_AVX512)
                        return 1;
        }
+
+       if ((xcr0 & XFEATURE_MASK_XTILE) &&
+           ((xcr0 & XFEATURE_MASK_XTILE) != XFEATURE_MASK_XTILE))
+               return 1;
+
        vcpu->arch.xcr0 = xcr0;
 
        if ((xcr0 ^ old_xcr0) & XFEATURE_MASK_EXTEND)
@@ -1051,8 +1076,6 @@ EXPORT_SYMBOL_GPL(kvm_post_set_cr4);
 int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
 {
        unsigned long old_cr4 = kvm_read_cr4(vcpu);
-       unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE |
-                                  X86_CR4_SMEP;
 
        if (!kvm_is_valid_cr4(vcpu, cr4))
                return 1;
@@ -1063,9 +1086,8 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
                if ((cr4 ^ old_cr4) & X86_CR4_LA57)
                        return 1;
        } else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE)
-                  && ((cr4 ^ old_cr4) & pdptr_bits)
-                  && !load_pdptrs(vcpu, vcpu->arch.walk_mmu,
-                                  kvm_read_cr3(vcpu)))
+                  && ((cr4 ^ old_cr4) & X86_CR4_PDPTR_BITS)
+                  && !load_pdptrs(vcpu, kvm_read_cr3(vcpu)))
                return 1;
 
        if ((cr4 & X86_CR4_PCIDE) && !(old_cr4 & X86_CR4_PCIDE)) {
@@ -1154,14 +1176,15 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
        if (kvm_vcpu_is_illegal_gpa(vcpu, cr3))
                return 1;
 
-       if (is_pae_paging(vcpu) && !load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3))
+       if (is_pae_paging(vcpu) && !load_pdptrs(vcpu, cr3))
                return 1;
 
        if (cr3 != kvm_read_cr3(vcpu))
                kvm_mmu_new_pgd(vcpu, cr3);
 
        vcpu->arch.cr3 = cr3;
-       kvm_register_mark_available(vcpu, VCPU_EXREG_CR3);
+       kvm_register_mark_dirty(vcpu, VCPU_EXREG_CR3);
+       /* Do not call post_set_cr3, we do not get here for confidential guests.  */
 
 handle_tlb_flush:
        /*
@@ -1359,6 +1382,7 @@ static const u32 msrs_to_save_all[] = {
        MSR_F15H_PERF_CTL3, MSR_F15H_PERF_CTL4, MSR_F15H_PERF_CTL5,
        MSR_F15H_PERF_CTR0, MSR_F15H_PERF_CTR1, MSR_F15H_PERF_CTR2,
        MSR_F15H_PERF_CTR3, MSR_F15H_PERF_CTR4, MSR_F15H_PERF_CTR5,
+       MSR_IA32_XFD, MSR_IA32_XFD_ERR,
 };
 
 static u32 msrs_to_save[ARRAY_SIZE(msrs_to_save_all)];
@@ -1815,22 +1839,36 @@ int kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data)
 }
 EXPORT_SYMBOL_GPL(kvm_set_msr);
 
-static int complete_emulated_rdmsr(struct kvm_vcpu *vcpu)
+static void complete_userspace_rdmsr(struct kvm_vcpu *vcpu)
 {
-       int err = vcpu->run->msr.error;
-       if (!err) {
+       if (!vcpu->run->msr.error) {
                kvm_rax_write(vcpu, (u32)vcpu->run->msr.data);
                kvm_rdx_write(vcpu, vcpu->run->msr.data >> 32);
        }
+}
 
-       return static_call(kvm_x86_complete_emulated_msr)(vcpu, err);
+static int complete_emulated_msr_access(struct kvm_vcpu *vcpu)
+{
+       return complete_emulated_insn_gp(vcpu, vcpu->run->msr.error);
 }
 
-static int complete_emulated_wrmsr(struct kvm_vcpu *vcpu)
+static int complete_emulated_rdmsr(struct kvm_vcpu *vcpu)
+{
+       complete_userspace_rdmsr(vcpu);
+       return complete_emulated_msr_access(vcpu);
+}
+
+static int complete_fast_msr_access(struct kvm_vcpu *vcpu)
 {
        return static_call(kvm_x86_complete_emulated_msr)(vcpu, vcpu->run->msr.error);
 }
 
+static int complete_fast_rdmsr(struct kvm_vcpu *vcpu)
+{
+       complete_userspace_rdmsr(vcpu);
+       return complete_fast_msr_access(vcpu);
+}
+
 static u64 kvm_msr_reason(int r)
 {
        switch (r) {
@@ -1865,18 +1903,6 @@ static int kvm_msr_user_space(struct kvm_vcpu *vcpu, u32 index,
        return 1;
 }
 
-static int kvm_get_msr_user_space(struct kvm_vcpu *vcpu, u32 index, int r)
-{
-       return kvm_msr_user_space(vcpu, index, KVM_EXIT_X86_RDMSR, 0,
-                                  complete_emulated_rdmsr, r);
-}
-
-static int kvm_set_msr_user_space(struct kvm_vcpu *vcpu, u32 index, u64 data, int r)
-{
-       return kvm_msr_user_space(vcpu, index, KVM_EXIT_X86_WRMSR, data,
-                                  complete_emulated_wrmsr, r);
-}
-
 int kvm_emulate_rdmsr(struct kvm_vcpu *vcpu)
 {
        u32 ecx = kvm_rcx_read(vcpu);
@@ -1885,18 +1911,16 @@ int kvm_emulate_rdmsr(struct kvm_vcpu *vcpu)
 
        r = kvm_get_msr(vcpu, ecx, &data);
 
-       /* MSR read failed? See if we should ask user space */
-       if (r && kvm_get_msr_user_space(vcpu, ecx, r)) {
-               /* Bounce to user space */
-               return 0;
-       }
-
        if (!r) {
                trace_kvm_msr_read(ecx, data);
 
                kvm_rax_write(vcpu, data & -1u);
                kvm_rdx_write(vcpu, (data >> 32) & -1u);
        } else {
+               /* MSR read failed? See if we should ask user space */
+               if (kvm_msr_user_space(vcpu, ecx, KVM_EXIT_X86_RDMSR, 0,
+                                      complete_fast_rdmsr, r))
+                       return 0;
                trace_kvm_msr_read_ex(ecx);
        }
 
@@ -1912,19 +1936,18 @@ int kvm_emulate_wrmsr(struct kvm_vcpu *vcpu)
 
        r = kvm_set_msr(vcpu, ecx, data);
 
-       /* MSR write failed? See if we should ask user space */
-       if (r && kvm_set_msr_user_space(vcpu, ecx, data, r))
-               /* Bounce to user space */
-               return 0;
-
-       /* Signal all other negative errors to userspace */
-       if (r < 0)
-               return r;
-
-       if (!r)
+       if (!r) {
                trace_kvm_msr_write(ecx, data);
-       else
+       } else {
+               /* MSR write failed? See if we should ask user space */
+               if (kvm_msr_user_space(vcpu, ecx, KVM_EXIT_X86_WRMSR, data,
+                                      complete_fast_msr_access, r))
+                       return 0;
+               /* Signal all other negative errors to userspace */
+               if (r < 0)
+                       return r;
                trace_kvm_msr_write_ex(ecx, data);
+       }
 
        return static_call(kvm_x86_complete_emulated_msr)(vcpu, r);
 }
@@ -2119,7 +2142,7 @@ static s64 get_kvmclock_base_ns(void)
 }
 #endif
 
-void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock, int sec_hi_ofs)
+static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock, int sec_hi_ofs)
 {
        int version;
        int r;
@@ -2817,7 +2840,7 @@ static void kvm_end_pvclock_update(struct kvm *kvm)
 {
        struct kvm_arch *ka = &kvm->arch;
        struct kvm_vcpu *vcpu;
-       int i;
+       unsigned long i;
 
        write_seqcount_end(&ka->pvclock_sc);
        raw_spin_unlock_irq(&ka->tsc_write_lock);
@@ -3066,7 +3089,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
 
 static void kvmclock_update_fn(struct work_struct *work)
 {
-       int i;
+       unsigned long i;
        struct delayed_work *dwork = to_delayed_work(work);
        struct kvm_arch *ka = container_of(dwork, struct kvm_arch,
                                           kvmclock_update_work);
@@ -3669,6 +3692,30 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
                        return 1;
                vcpu->arch.msr_misc_features_enables = data;
                break;
+#ifdef CONFIG_X86_64
+       case MSR_IA32_XFD:
+               if (!msr_info->host_initiated &&
+                   !guest_cpuid_has(vcpu, X86_FEATURE_XFD))
+                       return 1;
+
+               if (data & ~(XFEATURE_MASK_USER_DYNAMIC &
+                            vcpu->arch.guest_supported_xcr0))
+                       return 1;
+
+               fpu_update_guest_xfd(&vcpu->arch.guest_fpu, data);
+               break;
+       case MSR_IA32_XFD_ERR:
+               if (!msr_info->host_initiated &&
+                   !guest_cpuid_has(vcpu, X86_FEATURE_XFD))
+                       return 1;
+
+               if (data & ~(XFEATURE_MASK_USER_DYNAMIC &
+                            vcpu->arch.guest_supported_xcr0))
+                       return 1;
+
+               vcpu->arch.guest_fpu.xfd_err = data;
+               break;
+#endif
        default:
                if (kvm_pmu_is_valid_msr(vcpu, msr))
                        return kvm_pmu_set_msr(vcpu, msr_info);
@@ -3989,6 +4036,22 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
        case MSR_K7_HWCR:
                msr_info->data = vcpu->arch.msr_hwcr;
                break;
+#ifdef CONFIG_X86_64
+       case MSR_IA32_XFD:
+               if (!msr_info->host_initiated &&
+                   !guest_cpuid_has(vcpu, X86_FEATURE_XFD))
+                       return 1;
+
+               msr_info->data = vcpu->arch.guest_fpu.fpstate->xfd;
+               break;
+       case MSR_IA32_XFD_ERR:
+               if (!msr_info->host_initiated &&
+                   !guest_cpuid_has(vcpu, X86_FEATURE_XFD))
+                       return 1;
+
+               msr_info->data = vcpu->arch.guest_fpu.xfd_err;
+               break;
+#endif
        default:
                if (kvm_pmu_is_valid_msr(vcpu, msr_info->index))
                        return kvm_pmu_get_msr(vcpu, msr_info);
@@ -4172,7 +4235,8 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
        case KVM_CAP_XEN_HVM:
                r = KVM_XEN_HVM_CONFIG_HYPERCALL_MSR |
                    KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL |
-                   KVM_XEN_HVM_CONFIG_SHARED_INFO;
+                   KVM_XEN_HVM_CONFIG_SHARED_INFO |
+                   KVM_XEN_HVM_CONFIG_EVTCHN_2LEVEL;
                if (sched_info_on())
                        r |= KVM_XEN_HVM_CONFIG_RUNSTATE;
                break;
@@ -4250,6 +4314,14 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
                else
                        r = 0;
                break;
+       case KVM_CAP_XSAVE2: {
+               u64 guest_perm = xstate_get_guest_group_perm();
+
+               r = xstate_required_size(supported_xcr0 & guest_perm, false);
+               if (r < sizeof(struct kvm_xsave))
+                       r = sizeof(struct kvm_xsave);
+               break;
+       }
        default:
                break;
        }
@@ -4853,6 +4925,16 @@ static void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu,
                                       vcpu->arch.pkru);
 }
 
+static void kvm_vcpu_ioctl_x86_get_xsave2(struct kvm_vcpu *vcpu,
+                                         u8 *state, unsigned int size)
+{
+       if (fpstate_is_confidential(&vcpu->arch.guest_fpu))
+               return;
+
+       fpu_copy_guest_fpstate_to_uabi(&vcpu->arch.guest_fpu,
+                                      state, size, vcpu->arch.pkru);
+}
+
 static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu,
                                        struct kvm_xsave *guest_xsave)
 {
@@ -5306,6 +5388,10 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
                break;
        }
        case KVM_GET_XSAVE: {
+               r = -EINVAL;
+               if (vcpu->arch.guest_fpu.uabi_size > sizeof(struct kvm_xsave))
+                       break;
+
                u.xsave = kzalloc(sizeof(struct kvm_xsave), GFP_KERNEL_ACCOUNT);
                r = -ENOMEM;
                if (!u.xsave)
@@ -5320,7 +5406,9 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
                break;
        }
        case KVM_SET_XSAVE: {
-               u.xsave = memdup_user(argp, sizeof(*u.xsave));
+               int size = vcpu->arch.guest_fpu.uabi_size;
+
+               u.xsave = memdup_user(argp, size);
                if (IS_ERR(u.xsave)) {
                        r = PTR_ERR(u.xsave);
                        goto out_nofree;
@@ -5329,6 +5417,25 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
                r = kvm_vcpu_ioctl_x86_set_xsave(vcpu, u.xsave);
                break;
        }
+
+       case KVM_GET_XSAVE2: {
+               int size = vcpu->arch.guest_fpu.uabi_size;
+
+               u.xsave = kzalloc(size, GFP_KERNEL_ACCOUNT);
+               r = -ENOMEM;
+               if (!u.xsave)
+                       break;
+
+               kvm_vcpu_ioctl_x86_get_xsave2(vcpu, u.buffer, size);
+
+               r = -EFAULT;
+               if (copy_to_user(argp, u.xsave, size))
+                       break;
+
+               r = 0;
+               break;
+       }
+
        case KVM_GET_XCRS: {
                u.xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL_ACCOUNT);
                r = -ENOMEM;
@@ -5693,7 +5800,7 @@ void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot)
         * VM-Exit.
         */
        struct kvm_vcpu *vcpu;
-       int i;
+       unsigned long i;
 
        kvm_for_each_vcpu(i, vcpu, kvm)
                kvm_vcpu_kick(vcpu);
@@ -5962,7 +6069,8 @@ static int kvm_vm_ioctl_set_msr_filter(struct kvm *kvm, void __user *argp)
 static int kvm_arch_suspend_notifier(struct kvm *kvm)
 {
        struct kvm_vcpu *vcpu;
-       int i, ret = 0;
+       unsigned long i;
+       int ret = 0;
 
        mutex_lock(&kvm->lock);
        kvm_for_each_vcpu(i, vcpu, kvm) {
@@ -6422,6 +6530,11 @@ static void kvm_init_msr_list(void)
                            min(INTEL_PMC_MAX_GENERIC, x86_pmu.num_counters_gp))
                                continue;
                        break;
+               case MSR_IA32_XFD:
+               case MSR_IA32_XFD_ERR:
+                       if (!kvm_cpu_cap_has(X86_FEATURE_XFD))
+                               continue;
+                       break;
                default:
                        break;
                }
@@ -6505,13 +6618,14 @@ void kvm_get_segment(struct kvm_vcpu *vcpu,
 gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access,
                           struct x86_exception *exception)
 {
+       struct kvm_mmu *mmu = vcpu->arch.mmu;
        gpa_t t_gpa;
 
        BUG_ON(!mmu_is_nested(vcpu));
 
        /* NPT walks are always user-walks */
        access |= PFERR_USER_MASK;
-       t_gpa  = vcpu->arch.mmu->gva_to_gpa(vcpu, gpa, access, exception);
+       t_gpa  = mmu->gva_to_gpa(vcpu, mmu, gpa, access, exception);
 
        return t_gpa;
 }
@@ -6519,25 +6633,31 @@ gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access,
 gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva,
                              struct x86_exception *exception)
 {
+       struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
+
        u32 access = (static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0;
-       return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
+       return mmu->gva_to_gpa(vcpu, mmu, gva, access, exception);
 }
 EXPORT_SYMBOL_GPL(kvm_mmu_gva_to_gpa_read);
 
  gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva,
                                struct x86_exception *exception)
 {
+       struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
+
        u32 access = (static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0;
        access |= PFERR_FETCH_MASK;
-       return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
+       return mmu->gva_to_gpa(vcpu, mmu, gva, access, exception);
 }
 
 gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva,
                               struct x86_exception *exception)
 {
+       struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
+
        u32 access = (static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0;
        access |= PFERR_WRITE_MASK;
-       return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
+       return mmu->gva_to_gpa(vcpu, mmu, gva, access, exception);
 }
 EXPORT_SYMBOL_GPL(kvm_mmu_gva_to_gpa_write);
 
@@ -6545,19 +6665,21 @@ EXPORT_SYMBOL_GPL(kvm_mmu_gva_to_gpa_write);
 gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva,
                                struct x86_exception *exception)
 {
-       return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, 0, exception);
+       struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
+
+       return mmu->gva_to_gpa(vcpu, mmu, gva, 0, exception);
 }
 
 static int kvm_read_guest_virt_helper(gva_t addr, void *val, unsigned int bytes,
                                      struct kvm_vcpu *vcpu, u32 access,
                                      struct x86_exception *exception)
 {
+       struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
        void *data = val;
        int r = X86EMUL_CONTINUE;
 
        while (bytes) {
-               gpa_t gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, addr, access,
-                                                           exception);
+               gpa_t gpa = mmu->gva_to_gpa(vcpu, mmu, addr, access, exception);
                unsigned offset = addr & (PAGE_SIZE-1);
                unsigned toread = min(bytes, (unsigned)PAGE_SIZE - offset);
                int ret;
@@ -6585,13 +6707,14 @@ static int kvm_fetch_guest_virt(struct x86_emulate_ctxt *ctxt,
                                struct x86_exception *exception)
 {
        struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
+       struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
        u32 access = (static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0;
        unsigned offset;
        int ret;
 
        /* Inline kvm_read_guest_virt_helper for speed.  */
-       gpa_t gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, addr, access|PFERR_FETCH_MASK,
-                                                   exception);
+       gpa_t gpa = mmu->gva_to_gpa(vcpu, mmu, addr, access|PFERR_FETCH_MASK,
+                                   exception);
        if (unlikely(gpa == UNMAPPED_GVA))
                return X86EMUL_PROPAGATE_FAULT;
 
@@ -6650,13 +6773,12 @@ static int kvm_write_guest_virt_helper(gva_t addr, void *val, unsigned int bytes
                                      struct kvm_vcpu *vcpu, u32 access,
                                      struct x86_exception *exception)
 {
+       struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
        void *data = val;
        int r = X86EMUL_CONTINUE;
 
        while (bytes) {
-               gpa_t gpa =  vcpu->arch.walk_mmu->gva_to_gpa(vcpu, addr,
-                                                            access,
-                                                            exception);
+               gpa_t gpa = mmu->gva_to_gpa(vcpu, mmu, addr, access, exception);
                unsigned offset = addr & (PAGE_SIZE-1);
                unsigned towrite = min(bytes, (unsigned)PAGE_SIZE - offset);
                int ret;
@@ -6743,6 +6865,7 @@ static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva,
                                gpa_t *gpa, struct x86_exception *exception,
                                bool write)
 {
+       struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
        u32 access = ((static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0)
                | (write ? PFERR_WRITE_MASK : 0);
 
@@ -6760,7 +6883,7 @@ static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva,
                return 1;
        }
 
-       *gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
+       *gpa = mmu->gva_to_gpa(vcpu, mmu, gva, access, exception);
 
        if (*gpa == UNMAPPED_GVA)
                return -1;
@@ -7394,7 +7517,8 @@ static int emulator_get_msr(struct x86_emulate_ctxt *ctxt,
 
        r = kvm_get_msr(vcpu, msr_index, pdata);
 
-       if (r && kvm_get_msr_user_space(vcpu, msr_index, r)) {
+       if (r && kvm_msr_user_space(vcpu, msr_index, KVM_EXIT_X86_RDMSR, 0,
+                                   complete_emulated_rdmsr, r)) {
                /* Bounce to user space */
                return X86EMUL_IO_NEEDED;
        }
@@ -7410,7 +7534,8 @@ static int emulator_set_msr(struct x86_emulate_ctxt *ctxt,
 
        r = kvm_set_msr(vcpu, msr_index, data);
 
-       if (r && kvm_set_msr_user_space(vcpu, msr_index, data, r)) {
+       if (r && kvm_msr_user_space(vcpu, msr_index, KVM_EXIT_X86_WRMSR, data,
+                                   complete_emulated_msr_access, r)) {
                /* Bounce to user space */
                return X86EMUL_IO_NEEDED;
        }
@@ -7961,6 +8086,8 @@ int kvm_skip_emulated_instruction(struct kvm_vcpu *vcpu)
        if (unlikely(!r))
                return 0;
 
+       kvm_pmu_trigger_event(vcpu, PERF_COUNT_HW_INSTRUCTIONS);
+
        /*
         * rflags is the old, "raw" value of the flags.  The new value has
         * not been saved yet.
@@ -8128,12 +8255,23 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
        }
 
        /*
-        * Note, EMULTYPE_SKIP is intended for use *only* by vendor callbacks
-        * for kvm_skip_emulated_instruction().  The caller is responsible for
-        * updating interruptibility state and injecting single-step #DBs.
+        * EMULTYPE_SKIP without EMULTYPE_COMPLETE_USER_EXIT is intended for
+        * use *only* by vendor callbacks for kvm_skip_emulated_instruction().
+        * The caller is responsible for updating interruptibility state and
+        * injecting single-step #DBs.
         */
        if (emulation_type & EMULTYPE_SKIP) {
-               kvm_rip_write(vcpu, ctxt->_eip);
+               if (ctxt->mode != X86EMUL_MODE_PROT64)
+                       ctxt->eip = (u32)ctxt->_eip;
+               else
+                       ctxt->eip = ctxt->_eip;
+
+               if (emulation_type & EMULTYPE_COMPLETE_USER_EXIT) {
+                       r = 1;
+                       goto writeback;
+               }
+
+               kvm_rip_write(vcpu, ctxt->eip);
                if (ctxt->eflags & X86_EFLAGS_RF)
                        kvm_set_rflags(vcpu, ctxt->eflags & ~X86_EFLAGS_RF);
                return 1;
@@ -8197,17 +8335,24 @@ restart:
                        writeback = false;
                r = 0;
                vcpu->arch.complete_userspace_io = complete_emulated_mmio;
+       } else if (vcpu->arch.complete_userspace_io) {
+               writeback = false;
+               r = 0;
        } else if (r == EMULATION_RESTART)
                goto restart;
        else
                r = 1;
 
+writeback:
        if (writeback) {
                unsigned long rflags = static_call(kvm_x86_get_rflags)(vcpu);
                toggle_interruptibility(vcpu, ctxt->interruptibility);
                vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
                if (!ctxt->have_exception ||
                    exception_type(ctxt->exception.vector) == EXCPT_TRAP) {
+                       kvm_pmu_trigger_event(vcpu, PERF_COUNT_HW_INSTRUCTIONS);
+                       if (ctxt->is_branch)
+                               kvm_pmu_trigger_event(vcpu, PERF_COUNT_HW_BRANCH_INSTRUCTIONS);
                        kvm_rip_write(vcpu, ctxt->eip);
                        if (r && (ctxt->tf || (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)))
                                r = kvm_vcpu_do_singlestep(vcpu);
@@ -8394,7 +8539,8 @@ static void __kvmclock_cpufreq_notifier(struct cpufreq_freqs *freq, int cpu)
 {
        struct kvm *kvm;
        struct kvm_vcpu *vcpu;
-       int i, send_ipi = 0;
+       int send_ipi = 0;
+       unsigned long i;
 
        /*
         * We allow guests to temporarily run on slowing clocks,
@@ -8523,9 +8669,8 @@ static void kvm_timer_init(void)
 static void pvclock_gtod_update_fn(struct work_struct *work)
 {
        struct kvm *kvm;
-
        struct kvm_vcpu *vcpu;
-       int i;
+       unsigned long i;
 
        mutex_lock(&kvm_lock);
        list_for_each_entry(kvm, &vm_list, vm_list)
@@ -8683,8 +8828,15 @@ void kvm_arch_exit(void)
 #endif
 }
 
-static int __kvm_vcpu_halt(struct kvm_vcpu *vcpu, int state, int reason)
+static int __kvm_emulate_halt(struct kvm_vcpu *vcpu, int state, int reason)
 {
+       /*
+        * The vCPU has halted, e.g. executed HLT.  Update the run state if the
+        * local APIC is in-kernel, the run loop will detect the non-runnable
+        * state and halt the vCPU.  Exit to userspace if the local APIC is
+        * managed by userspace, in which case userspace is responsible for
+        * handling wake events.
+        */
        ++vcpu->stat.halt_exits;
        if (lapic_in_kernel(vcpu)) {
                vcpu->arch.mp_state = state;
@@ -8695,11 +8847,11 @@ static int __kvm_vcpu_halt(struct kvm_vcpu *vcpu, int state, int reason)
        }
 }
 
-int kvm_vcpu_halt(struct kvm_vcpu *vcpu)
+int kvm_emulate_halt_noskip(struct kvm_vcpu *vcpu)
 {
-       return __kvm_vcpu_halt(vcpu, KVM_MP_STATE_HALTED, KVM_EXIT_HLT);
+       return __kvm_emulate_halt(vcpu, KVM_MP_STATE_HALTED, KVM_EXIT_HLT);
 }
-EXPORT_SYMBOL_GPL(kvm_vcpu_halt);
+EXPORT_SYMBOL_GPL(kvm_emulate_halt_noskip);
 
 int kvm_emulate_halt(struct kvm_vcpu *vcpu)
 {
@@ -8708,7 +8860,7 @@ int kvm_emulate_halt(struct kvm_vcpu *vcpu)
         * TODO: we might be squashing a GUESTDBG_SINGLESTEP-triggered
         * KVM_EXIT_DEBUG here.
         */
-       return kvm_vcpu_halt(vcpu) && ret;
+       return kvm_emulate_halt_noskip(vcpu) && ret;
 }
 EXPORT_SYMBOL_GPL(kvm_emulate_halt);
 
@@ -8716,7 +8868,8 @@ int kvm_emulate_ap_reset_hold(struct kvm_vcpu *vcpu)
 {
        int ret = kvm_skip_emulated_instruction(vcpu);
 
-       return __kvm_vcpu_halt(vcpu, KVM_MP_STATE_AP_RESET_HOLD, KVM_EXIT_AP_RESET_HOLD) && ret;
+       return __kvm_emulate_halt(vcpu, KVM_MP_STATE_AP_RESET_HOLD,
+                                       KVM_EXIT_AP_RESET_HOLD) && ret;
 }
 EXPORT_SYMBOL_GPL(kvm_emulate_ap_reset_hold);
 
@@ -9819,6 +9972,9 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
        if (test_thread_flag(TIF_NEED_FPU_LOAD))
                switch_fpu_return();
 
+       if (vcpu->arch.guest_fpu.xfd_err)
+               wrmsrl(MSR_IA32_XFD_ERR, vcpu->arch.guest_fpu.xfd_err);
+
        if (unlikely(vcpu->arch.switch_db_regs)) {
                set_debugreg(0, 7);
                set_debugreg(vcpu->arch.eff_db[0], 0);
@@ -9880,8 +10036,19 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
        vcpu->mode = OUTSIDE_GUEST_MODE;
        smp_wmb();
 
+       /*
+        * Sync xfd before calling handle_exit_irqoff() which may
+        * rely on the fact that guest_fpu::xfd is up-to-date (e.g.
+        * in #NM irqoff handler).
+        */
+       if (vcpu->arch.xfd_no_write_intercept)
+               fpu_sync_guest_vmexit_xfd_state();
+
        static_call(kvm_x86_handle_exit_irqoff)(vcpu);
 
+       if (vcpu->arch.guest_fpu.xfd_err)
+               wrmsrl(MSR_IA32_XFD_ERR, 0);
+
        /*
         * Consume any pending interrupts, including the possible source of
         * VM-Exit on SVM and any ticks that occur between VM-Exit and now.
@@ -9949,7 +10116,10 @@ static inline int vcpu_block(struct kvm *kvm, struct kvm_vcpu *vcpu)
        if (!kvm_arch_vcpu_runnable(vcpu) &&
            (!kvm_x86_ops.pre_block || static_call(kvm_x86_pre_block)(vcpu) == 0)) {
                srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
-               kvm_vcpu_block(vcpu);
+               if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED)
+                       kvm_vcpu_halt(vcpu);
+               else
+                       kvm_vcpu_block(vcpu);
                vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
 
                if (kvm_x86_ops.post_block)
@@ -10509,7 +10679,8 @@ static int __set_sregs_common(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs,
        vcpu->arch.cr2 = sregs->cr2;
        *mmu_reset_needed |= kvm_read_cr3(vcpu) != sregs->cr3;
        vcpu->arch.cr3 = sregs->cr3;
-       kvm_register_mark_available(vcpu, VCPU_EXREG_CR3);
+       kvm_register_mark_dirty(vcpu, VCPU_EXREG_CR3);
+       static_call_cond(kvm_x86_post_set_cr3)(vcpu, sregs->cr3);
 
        kvm_set_cr8(vcpu, sregs->cr8);
 
@@ -10526,7 +10697,7 @@ static int __set_sregs_common(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs,
        if (update_pdptrs) {
                idx = srcu_read_lock(&vcpu->kvm->srcu);
                if (is_pae_paging(vcpu)) {
-                       load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu));
+                       load_pdptrs(vcpu, kvm_read_cr3(vcpu));
                        *mmu_reset_needed = 1;
                }
                srcu_read_unlock(&vcpu->kvm->srcu, idx);
@@ -10624,7 +10795,7 @@ static void kvm_arch_vcpu_guestdbg_update_apicv_inhibit(struct kvm *kvm)
 {
        bool inhibit = false;
        struct kvm_vcpu *vcpu;
-       int i;
+       unsigned long i;
 
        down_write(&kvm->arch.apicv_update_lock);
 
@@ -11112,7 +11283,7 @@ int kvm_arch_hardware_enable(void)
 {
        struct kvm *kvm;
        struct kvm_vcpu *vcpu;
-       int i;
+       unsigned long i;
        int ret;
        u64 local_tsc;
        u64 max_tsc = 0;
@@ -11369,7 +11540,7 @@ static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu)
 
 static void kvm_free_vcpus(struct kvm *kvm)
 {
-       unsigned int i;
+       unsigned long i;
        struct kvm_vcpu *vcpu;
 
        /*
@@ -11379,15 +11550,8 @@ static void kvm_free_vcpus(struct kvm *kvm)
                kvm_clear_async_pf_completion_queue(vcpu);
                kvm_unload_vcpu_mmu(vcpu);
        }
-       kvm_for_each_vcpu(i, vcpu, kvm)
-               kvm_vcpu_destroy(vcpu);
-
-       mutex_lock(&kvm->lock);
-       for (i = 0; i < atomic_read(&kvm->online_vcpus); i++)
-               kvm->vcpus[i] = NULL;
 
-       atomic_set(&kvm->online_vcpus, 0);
-       mutex_unlock(&kvm->lock);
+       kvm_destroy_vcpus(kvm);
 }
 
 void kvm_arch_sync_events(struct kvm *kvm)
@@ -11555,9 +11719,9 @@ int memslot_rmap_alloc(struct kvm_memory_slot *slot, unsigned long npages)
 }
 
 static int kvm_alloc_memslot_metadata(struct kvm *kvm,
-                                     struct kvm_memory_slot *slot,
-                                     unsigned long npages)
+                                     struct kvm_memory_slot *slot)
 {
+       unsigned long npages = slot->npages;
        int i, r;
 
        /*
@@ -11622,7 +11786,7 @@ out_free:
 void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen)
 {
        struct kvm_vcpu *vcpu;
-       int i;
+       unsigned long i;
 
        /*
         * memslots->generation has been incremented.
@@ -11636,13 +11800,18 @@ void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen)
 }
 
 int kvm_arch_prepare_memory_region(struct kvm *kvm,
-                               struct kvm_memory_slot *memslot,
-                               const struct kvm_userspace_memory_region *mem,
-                               enum kvm_mr_change change)
+                                  const struct kvm_memory_slot *old,
+                                  struct kvm_memory_slot *new,
+                                  enum kvm_mr_change change)
 {
        if (change == KVM_MR_CREATE || change == KVM_MR_MOVE)
-               return kvm_alloc_memslot_metadata(kvm, memslot,
-                                                 mem->memory_size >> PAGE_SHIFT);
+               return kvm_alloc_memslot_metadata(kvm, new);
+
+       if (change == KVM_MR_FLAGS_ONLY)
+               memcpy(&new->arch, &old->arch, sizeof(old->arch));
+       else if (WARN_ON_ONCE(change != KVM_MR_DELETE))
+               return -EIO;
+
        return 0;
 }
 
@@ -11666,13 +11835,15 @@ static void kvm_mmu_slot_apply_flags(struct kvm *kvm,
                                     const struct kvm_memory_slot *new,
                                     enum kvm_mr_change change)
 {
-       bool log_dirty_pages = new->flags & KVM_MEM_LOG_DIRTY_PAGES;
+       u32 old_flags = old ? old->flags : 0;
+       u32 new_flags = new ? new->flags : 0;
+       bool log_dirty_pages = new_flags & KVM_MEM_LOG_DIRTY_PAGES;
 
        /*
         * Update CPU dirty logging if dirty logging is being toggled.  This
         * applies to all operations.
         */
-       if ((old->flags ^ new->flags) & KVM_MEM_LOG_DIRTY_PAGES)
+       if ((old_flags ^ new_flags) & KVM_MEM_LOG_DIRTY_PAGES)
                kvm_mmu_update_cpu_dirty_logging(kvm, log_dirty_pages);
 
        /*
@@ -11690,7 +11861,7 @@ static void kvm_mmu_slot_apply_flags(struct kvm *kvm,
         * MOVE/DELETE: The old mappings will already have been cleaned up by
         *              kvm_arch_flush_shadow_memslot().
         */
-       if ((change != KVM_MR_FLAGS_ONLY) || (new->flags & KVM_MEM_READONLY))
+       if ((change != KVM_MR_FLAGS_ONLY) || (new_flags & KVM_MEM_READONLY))
                return;
 
        /*
@@ -11698,7 +11869,7 @@ static void kvm_mmu_slot_apply_flags(struct kvm *kvm,
         * other flag is LOG_DIRTY_PAGES, i.e. something is wrong if dirty
         * logging isn't being toggled on or off.
         */
-       if (WARN_ON_ONCE(!((old->flags ^ new->flags) & KVM_MEM_LOG_DIRTY_PAGES)))
+       if (WARN_ON_ONCE(!((old_flags ^ new_flags) & KVM_MEM_LOG_DIRTY_PAGES)))
                return;
 
        if (!log_dirty_pages) {
@@ -11734,14 +11905,18 @@ static void kvm_mmu_slot_apply_flags(struct kvm *kvm,
 }
 
 void kvm_arch_commit_memory_region(struct kvm *kvm,
-                               const struct kvm_userspace_memory_region *mem,
                                struct kvm_memory_slot *old,
                                const struct kvm_memory_slot *new,
                                enum kvm_mr_change change)
 {
-       if (!kvm->arch.n_requested_mmu_pages)
-               kvm_mmu_change_mmu_pages(kvm,
-                               kvm_mmu_calculate_default_mmu_pages(kvm));
+       if (!kvm->arch.n_requested_mmu_pages &&
+           (change == KVM_MR_CREATE || change == KVM_MR_DELETE)) {
+               unsigned long nr_mmu_pages;
+
+               nr_mmu_pages = kvm->nr_memslot_pages / KVM_MEMSLOT_PAGES_TO_MMU_PAGES_RATIO;
+               nr_mmu_pages = max(nr_mmu_pages, KVM_MIN_ALLOC_MMU_PAGES);
+               kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages);
+       }
 
        kvm_mmu_slot_apply_flags(kvm, old, new, change);
 
@@ -12256,12 +12431,13 @@ EXPORT_SYMBOL_GPL(kvm_spec_ctrl_test_value);
 
 void kvm_fixup_and_inject_pf_error(struct kvm_vcpu *vcpu, gva_t gva, u16 error_code)
 {
+       struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
        struct x86_exception fault;
        u32 access = error_code &
                (PFERR_WRITE_MASK | PFERR_FETCH_MASK | PFERR_USER_MASK);
 
        if (!(error_code & PFERR_PRESENT_MASK) ||
-           vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, &fault) != UNMAPPED_GVA) {
+           mmu->gva_to_gpa(vcpu, mmu, gva, access, &fault) != UNMAPPED_GVA) {
                /*
                 * If vcpu->arch.walk_mmu->gva_to_gpa succeeded, the page
                 * tables probably do not match the TLB.  Just proceed
@@ -12598,6 +12774,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_unaccelerated_access);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_incomplete_ipi);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_ga_log);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_apicv_update_request);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_apicv_accept_irq);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_enter);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_exit);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_msr_protocol_enter);