
KVM: x86/mmu: Apply max PA check for MMIO sptes to 32-bit KVM
[sagit-ice-cold/kernel_xiaomi_msm8998.git] arch/x86/kvm/x86.c
index d830a0d..21fb707 100644
@@ -53,6 +53,7 @@
 #include <linux/pvclock_gtod.h>
 #include <linux/kvm_irqfd.h>
 #include <linux/irqbypass.h>
+#include <linux/nospec.h>
 #include <trace/events/kvm.h>
 
 #define CREATE_TRACE_POINTS
@@ -260,13 +261,14 @@ int kvm_set_shared_msr(unsigned slot, u64 value, u64 mask)
        struct kvm_shared_msrs *smsr = per_cpu_ptr(shared_msrs, cpu);
        int err;
 
-       if (((value ^ smsr->values[slot].curr) & mask) == 0)
+       value = (value & mask) | (smsr->values[slot].host & ~mask);
+       if (value == smsr->values[slot].curr)
                return 0;
-       smsr->values[slot].curr = value;
        err = wrmsrl_safe(shared_msrs_global.msrs[slot], value);
        if (err)
                return 1;
 
+       smsr->values[slot].curr = value;
        if (!smsr->registered) {
                smsr->urn.on_user_return = kvm_on_user_return;
                user_return_notifier_register(&smsr->urn);
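
The rework above has two effects: bits outside the caller's mask are pinned to the host value, and the cached ->curr is only updated once the WRMSR is known to have succeeded, so a failed write can no longer desynchronize the cache from hardware. A minimal standalone model of that ordering (hypothetical names, not kernel code):

    #include <stdint.h>

    struct shared_msr_value {
            uint64_t host;  /* value restored on return to userspace */
            uint64_t curr;  /* last value successfully written to the MSR */
    };

    static int update_shared_msr(struct shared_msr_value *v, uint64_t value,
                                 uint64_t mask, int (*wrmsr)(uint64_t))
    {
            /* bits outside 'mask' always track the host value */
            value = (value & mask) | (v->host & ~mask);
            if (value == v->curr)
                    return 0;               /* hardware already holds it */
            if (wrmsr(value))
                    return 1;               /* failed write: keep old curr */
            v->curr = value;                /* commit only after success */
            return 0;
    }
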
@@ -523,8 +525,14 @@ static int kvm_read_nested_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn,
                                       data, offset, len, access);
 }
 
+static inline u64 pdptr_rsvd_bits(struct kvm_vcpu *vcpu)
+{
+       return rsvd_bits(cpuid_maxphyaddr(vcpu), 63) | rsvd_bits(5, 8) |
+              rsvd_bits(1, 2);
+}
+
 /*
- * Load the pae pdptrs.  Return true is they are all valid.
+ * Load the pae pdptrs.  Return 1 if they are all valid, 0 otherwise.
  */
 int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3)
 {
@@ -543,8 +551,7 @@ int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3)
        }
        for (i = 0; i < ARRAY_SIZE(pdpte); ++i) {
                if (is_present_gpte(pdpte[i]) &&
-                   (pdpte[i] &
-                    vcpu->arch.mmu.guest_rsvd_check.rsvd_bits_mask[0][2])) {
+                   (pdpte[i] & pdptr_rsvd_bits(vcpu))) {
                        ret = 0;
                        goto out;
                }
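
pdptr_rsvd_bits() hardcodes the PAE PDPTE reserved bits from the SDM rather than reaching into the MMU's role-dependent rsvd_bits_mask tables, which may not yet describe PAE paging at the point the PDPTEs are (re)loaded. For reference, a sketch of the rsvd_bits() helper (matching its definition in arch/x86/kvm/mmu.h, where u64 is the kernel's 64-bit type), with a worked example:

    static inline u64 rsvd_bits(int s, int e)
    {
            /* mask of bits s..e, inclusive */
            return ((1ULL << (e - s + 1)) - 1) << s;
    }

    /*
     * With cpuid_maxphyaddr(vcpu) == 40, pdptr_rsvd_bits() yields:
     *   rsvd_bits(40, 63) = 0xffffff0000000000   physical-address bits
     *   rsvd_bits(5, 8)   = 0x00000000000001e0   PDPTE bits 8:5
     *   rsvd_bits(1, 2)   = 0x0000000000000006   PDPTE bits 2:1
     * A present PDPTE with any of these bits set makes load_pdptrs()
     * return 0 (invalid).
     */
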
@@ -570,7 +577,7 @@ static bool pdptrs_changed(struct kvm_vcpu *vcpu)
        gfn_t gfn;
        int r;
 
-       if (is_long_mode(vcpu) || !is_pae(vcpu))
+       if (is_long_mode(vcpu) || !is_pae(vcpu) || !is_paging(vcpu))
                return false;
 
        if (!test_bit(VCPU_EXREG_PDPTR,
@@ -867,9 +874,11 @@ static u64 kvm_dr6_fixed(struct kvm_vcpu *vcpu)
 
 static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
 {
+       size_t size = ARRAY_SIZE(vcpu->arch.db);
+
        switch (dr) {
        case 0 ... 3:
-               vcpu->arch.db[dr] = val;
+               vcpu->arch.db[array_index_nospec(dr, size)] = val;
                if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP))
                        vcpu->arch.eff_db[dr] = val;
                break;
@@ -906,9 +915,11 @@ EXPORT_SYMBOL_GPL(kvm_set_dr);
 
 int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val)
 {
+       size_t size = ARRAY_SIZE(vcpu->arch.db);
+
        switch (dr) {
        case 0 ... 3:
-               *val = vcpu->arch.db[dr];
+               *val = vcpu->arch.db[array_index_nospec(dr, size)];
                break;
        case 4:
                /* fall through */
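
Both DR accessors above (and the MCE bank offsets in the set_msr_mce()/get_msr_mce() hunks below) clamp the index with array_index_nospec(), so a bounds check the CPU mispredicts cannot be used to speculatively index out of the array. The generic-C fallback in <linux/nospec.h> is roughly the following; architectures may override it with branchless asm, and the real version also hides the index from the optimizer:

    /* ~0UL when index < size, 0 otherwise -- computed without a branch */
    static inline unsigned long array_index_mask_nospec(unsigned long index,
                                                        unsigned long size)
    {
            return ~(long)(index | (size - 1UL - index)) >> (BITS_PER_LONG - 1);
    }

    #define array_index_nospec(index, size) \
            ((typeof(index))((index) & array_index_mask_nospec((index), (size))))
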
@@ -990,11 +1001,50 @@ static u32 emulated_msrs[] = {
 
 static unsigned num_emulated_msrs;
 
-bool kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer)
+u64 kvm_get_arch_capabilities(void)
 {
-       if (efer & efer_reserved_bits)
-               return false;
+       u64 data;
+
+       rdmsrl_safe(MSR_IA32_ARCH_CAPABILITIES, &data);
 
+       if (!boot_cpu_has_bug(X86_BUG_CPU_MELTDOWN))
+               data |= ARCH_CAP_RDCL_NO;
+       if (!boot_cpu_has_bug(X86_BUG_SPEC_STORE_BYPASS))
+               data |= ARCH_CAP_SSB_NO;
+       if (!boot_cpu_has_bug(X86_BUG_MDS))
+               data |= ARCH_CAP_MDS_NO;
+
+       /*
+        * On TAA affected systems, export MDS_NO=0 when:
+        *      - TSX is enabled on the host, i.e. X86_FEATURE_RTM=1.
+        *      - Updated microcode is present. This is detected by
+        *        the presence of ARCH_CAP_TSX_CTRL_MSR and ensures
+        *        that VERW clears CPU buffers.
+        *
+        * When MDS_NO=0 is exported, guests deploy clear CPU buffer
+        * mitigation and don't complain:
+        *
+        *      "Vulnerable: Clear CPU buffers attempted, no microcode"
+        *
+        * If TSX is disabled on the system, guests are also mitigated against
+        * TAA and clear CPU buffer mitigation is not required for guests.
+        */
+       if (!boot_cpu_has(X86_FEATURE_RTM))
+               data &= ~ARCH_CAP_TAA_NO;
+       else if (!boot_cpu_has_bug(X86_BUG_TAA))
+               data |= ARCH_CAP_TAA_NO;
+       else if (data & ARCH_CAP_TSX_CTRL_MSR)
+               data &= ~ARCH_CAP_MDS_NO;
+
+       /* KVM does not emulate MSR_IA32_TSX_CTRL.  */
+       data &= ~ARCH_CAP_TSX_CTRL_MSR;
+       return data;
+}
+EXPORT_SYMBOL_GPL(kvm_get_arch_capabilities);
+
+static bool __kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer)
+{
        if (efer & EFER_FFXSR) {
                struct kvm_cpuid_entry2 *feat;
 
@@ -1012,19 +1062,33 @@ bool kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer)
        }
 
        return true;
+}
+
+bool kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer)
+{
+       if (efer & efer_reserved_bits)
+               return false;
+
+       return __kvm_valid_efer(vcpu, efer);
 }
 EXPORT_SYMBOL_GPL(kvm_valid_efer);
 
-static int set_efer(struct kvm_vcpu *vcpu, u64 efer)
+static int set_efer(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 {
        u64 old_efer = vcpu->arch.efer;
+       u64 efer = msr_info->data;
 
-       if (!kvm_valid_efer(vcpu, efer))
+       if (efer & efer_reserved_bits)
                return 1;
 
-       if (is_paging(vcpu)
-           && (vcpu->arch.efer & EFER_LME) != (efer & EFER_LME))
-               return 1;
+       if (!msr_info->host_initiated) {
+               if (!__kvm_valid_efer(vcpu, efer))
+                       return 1;
+
+               if (is_paging(vcpu) &&
+                   (vcpu->arch.efer & EFER_LME) != (efer & EFER_LME))
+                       return 1;
+       }
 
        efer &= ~EFER_LMA;
        efer |= vcpu->arch.efer & EFER_LMA;
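
The host_initiated split matters for migration: on restore, userspace replays saved MSRs through KVM_SET_MSRS, possibly before guest CPUID is configured and while paging state is still being rebuilt, so those writes must bypass the guest-visible validity checks. A hedged userspace sketch of that path (MSR/EFER constants defined locally for illustration; error handling elided):

    #include <string.h>
    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    #define MSR_EFER  0xc0000080
    #define EFER_SCE  (1ULL << 0)
    #define EFER_LME  (1ULL << 8)
    #define EFER_LMA  (1ULL << 10)

    static void restore_efer(int vcpu_fd)
    {
            struct {
                    struct kvm_msrs hdr;
                    struct kvm_msr_entry entry;
            } msrs;

            memset(&msrs, 0, sizeof(msrs));
            msrs.hdr.nmsrs = 1;
            msrs.entry.index = MSR_EFER;
            msrs.entry.data = EFER_SCE | EFER_LME | EFER_LMA;  /* saved value */

            /* ioctl writes are host-initiated and skip the guest checks */
            ioctl(vcpu_fd, KVM_SET_MSRS, &msrs);
    }
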
@@ -1282,7 +1346,7 @@ static int set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale)
                        vcpu->arch.tsc_always_catchup = 1;
                        return 0;
                } else {
-                       WARN(1, "user requested TSC rate below hardware speed\n");
+                       pr_warn_ratelimited("user requested TSC rate below hardware speed\n");
                        return -1;
                }
        }
@@ -1292,8 +1356,8 @@ static int set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale)
                                user_tsc_khz, tsc_khz);
 
        if (ratio == 0 || ratio >= kvm_max_tsc_scaling_ratio) {
-               WARN_ONCE(1, "Invalid TSC scaling ratio - virtual-tsc-khz=%u\n",
-                         user_tsc_khz);
+               pr_warn_ratelimited("Invalid TSC scaling ratio - virtual-tsc-khz=%u\n",
+                                   user_tsc_khz);
                return -1;
        }
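
Both warnings are userspace-triggerable via KVM_SET_TSC_KHZ, so they are demoted from WARN (which taints the kernel and can panic under panic_on_warn) to ratelimited pr_warn. For context, the ratio being validated is a fixed-point scale, assuming the computation just above the check matches upstream:

    /*
     * ratio = mul_u64_u32_div(1ULL << kvm_tsc_scaling_ratio_frac_bits,
     *                         user_tsc_khz, tsc_khz);
     *
     * Worked example with frac_bits = 48 (VMX), a 3 GHz host and a
     * guest requesting 1.5 GHz:
     *
     *     ratio = (1 << 48) * 1500000 / 3000000 = 1 << 47    (i.e. 0.5)
     *
     * ratio == 0 means the requested rate underflowed the fixed-point
     * format; ratio >= kvm_max_tsc_scaling_ratio means it exceeds what
     * the hardware scaler can express.
     */
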
 
@@ -1930,7 +1994,10 @@ static int set_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 data)
        default:
                if (msr >= MSR_IA32_MC0_CTL &&
                    msr < MSR_IA32_MCx_CTL(bank_num)) {
-                       u32 offset = msr - MSR_IA32_MC0_CTL;
+                       u32 offset = array_index_nospec(
+                               msr - MSR_IA32_MC0_CTL,
+                               MSR_IA32_MCx_CTL(bank_num) - MSR_IA32_MC0_CTL);
+
                        /* only 0 or all 1s can be written to IA32_MCi_CTL
                         * some Linux kernels though clear bit 10 in bank 4 to
                         * workaround a BIOS/GART TBL issue on AMD K8s, ignore
@@ -2054,8 +2121,13 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
        case MSR_AMD64_BU_CFG2:
                break;
 
+       case MSR_IA32_ARCH_CAPABILITIES:
+               if (!msr_info->host_initiated)
+                       return 1;
+               vcpu->arch.arch_capabilities = data;
+               break;
        case MSR_EFER:
-               return set_efer(vcpu, data);
+               return set_efer(vcpu, msr_info);
        case MSR_K7_HWCR:
                data &= ~(u64)0x40;     /* ignore flush filter disable */
                data &= ~(u64)0x100;    /* ignore ignne emulation enable */
@@ -2286,7 +2358,10 @@ static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
        default:
                if (msr >= MSR_IA32_MC0_CTL &&
                    msr < MSR_IA32_MCx_CTL(bank_num)) {
-                       u32 offset = msr - MSR_IA32_MC0_CTL;
+                       u32 offset = array_index_nospec(
+                               msr - MSR_IA32_MC0_CTL,
+                               MSR_IA32_MCx_CTL(bank_num) - MSR_IA32_MC0_CTL);
+
                        data = vcpu->arch.mce_banks[offset];
                        break;
                }
@@ -2328,6 +2403,12 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
        case MSR_IA32_UCODE_REV:
                msr_info->data = 0x100000000ULL;
                break;
+       case MSR_IA32_ARCH_CAPABILITIES:
+               if (!msr_info->host_initiated &&
+                   !guest_cpuid_has_arch_capabilities(vcpu))
+                       return 1;
+               msr_info->data = vcpu->arch.arch_capabilities;
+               break;
        case MSR_MTRRcap:
        case 0x200 ... 0x2ff:
                return kvm_mtrr_get_msr(vcpu, msr_info->index, &msr_info->data);
@@ -3856,7 +3937,7 @@ long kvm_arch_vm_ioctl(struct file *filp,
                                   sizeof(struct kvm_pit_config)))
                        goto out;
        create_pit:
-               mutex_lock(&kvm->slots_lock);
+               mutex_lock(&kvm->lock);
                r = -EEXIST;
                if (kvm->arch.vpit)
                        goto create_pit_unlock;
@@ -3865,7 +3946,7 @@ long kvm_arch_vm_ioctl(struct file *filp,
                if (kvm->arch.vpit)
                        r = 0;
        create_pit_unlock:
-               mutex_unlock(&kvm->slots_lock);
+               mutex_unlock(&kvm->lock);
                break;
        case KVM_GET_IRQCHIP: {
                /* 0: PIC master, 1: PIC slave, 2: IOAPIC */
@@ -4326,6 +4407,13 @@ static int emulator_write_std(struct x86_emulate_ctxt *ctxt, gva_t addr, void *v
        if (!system && kvm_x86_ops->get_cpl(vcpu) == 3)
                access |= PFERR_USER_MASK;
 
+       /*
+        * FIXME: this should call handle_emulation_failure if X86EMUL_IO_NEEDED
+        * is returned, but our callers are not ready for that and they blindly
+        * call kvm_inject_page_fault.  Ensure that they at least do not leak
+        * uninitialized kernel stack memory into cr2 and error code.
+        */
+       memset(exception, 0, sizeof(*exception));
        return kvm_write_guest_virt_helper(addr, val, bytes, vcpu,
                                           access, exception);
 }
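
What the memset guards against (illustrative): the exception struct is caller-allocated and only fully written on the fault path. Without zeroing, a caller along these lines

    struct x86_exception e;                            /* uninitialized stack */
    r = emulator_write_std(ctxt, addr, val, bytes, &e, false);
    if (r == X86EMUL_PROPAGATE_FAULT)
            kvm_inject_page_fault(vcpu, &e);           /* garbage -> guest CR2 */

could forward stack garbage into the guest's CR2 and error code, exactly as the FIXME notes.
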
@@ -5468,8 +5556,16 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu,
                        if (reexecute_instruction(vcpu, cr2, write_fault_to_spt,
                                                emulation_type))
                                return EMULATE_DONE;
-                       if (ctxt->have_exception && inject_emulated_exception(vcpu))
+                       if (ctxt->have_exception) {
+                               /*
+                                * #UD should result in just EMULATION_FAILED, and trap-like
+                                * exception should not be encountered during decode.
+                                */
+                               WARN_ON_ONCE(ctxt->exception.vector == UD_VECTOR ||
+                                            exception_type(ctxt->exception.vector) == EXCPT_TRAP);
+                               inject_emulated_exception(vcpu);
                                return EMULATE_DONE;
+                       }
                        if (emulation_type & EMULTYPE_SKIP)
                                return EMULATE_FAIL;
                        return handle_emulation_failure(vcpu);
@@ -5534,12 +5630,13 @@ restart:
                unsigned long rflags = kvm_x86_ops->get_rflags(vcpu);
                toggle_interruptibility(vcpu, ctxt->interruptibility);
                vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
-               kvm_rip_write(vcpu, ctxt->eip);
-               if (r == EMULATE_DONE && ctxt->tf)
-                       kvm_vcpu_do_singlestep(vcpu, &r);
                if (!ctxt->have_exception ||
-                   exception_type(ctxt->exception.vector) == EXCPT_TRAP)
+                   exception_type(ctxt->exception.vector) == EXCPT_TRAP) {
+                       kvm_rip_write(vcpu, ctxt->eip);
+                       if (r == EMULATE_DONE && ctxt->tf)
+                               kvm_vcpu_do_singlestep(vcpu, &r);
                        __kvm_set_rflags(vcpu, ctxt->eflags);
+               }
 
                /*
                 * For STI, interrupts are shadowed; so KVM_REQ_EVENT will
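
The reordering above keys off exception_type(): faults must leave RIP pointing at the faulting instruction, while traps are delivered after the instruction completes, so only the no-exception and trap cases may commit RIP and RFLAGS. A rough model of that split (mirrors exception_type() in this file; hypothetical helper name):

    static int is_trap_like(int vector)
    {
            switch (vector) {
            case 1:  /* #DB: treated as trap, watchpoints handled elsewhere */
            case 3:  /* #BP */
            case 4:  /* #OF */
                    return 1;
            default: /* #DE, #GP, #PF, ... are fault-like */
                    return 0;
            }
    }
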
@@ -5788,14 +5885,12 @@ static void kvm_set_mmio_spte_mask(void)
        /* Set the present bit. */
        mask |= 1ull;
 
-#ifdef CONFIG_X86_64
        /*
         * If reserved bit is not supported, clear the present bit to disable
         * mmio page fault.
         */
        if (maxphyaddr == 52)
                mask &= ~1ull;
-#endif
 
        kvm_mmu_set_mmio_spte_mask(mask);
 }
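
This is the change named in the subject line: dropping the #ifdef makes the maxphyaddr == 52 escape hatch apply to 32-bit builds too, since PAE paging also uses 52-bit physical addresses. Worked arithmetic, assuming the function computes mask = rsvd_bits(maxphyaddr, 51) | 1ull as the context above suggests:

    /*
     * maxphyaddr == 40: mask = bits 51:40 | present
     *     MMIO sptes set genuinely-reserved PA bits, so guest accesses
     *     fault with PFEC.RSVD = 1 and are steered to MMIO emulation.
     *
     * maxphyaddr == 52: rsvd_bits(52, 51) == 0, so mask == 1 (present only)
     *     No reserved bit is left to abuse; keeping the present bit would
     *     make MMIO sptes look like valid translations, so the present
     *     bit is cleared and the fast MMIO fault path is disabled.
     *     Previously 32-bit KVM skipped this check entirely.
     */
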
@@ -6398,7 +6493,8 @@ static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu)
                kvm_scan_ioapic_routes(vcpu, vcpu->arch.eoi_exit_bitmap);
        else {
                kvm_x86_ops->sync_pir_to_irr(vcpu);
-               kvm_ioapic_scan_entry(vcpu, vcpu->arch.eoi_exit_bitmap);
+               if (ioapic_in_kernel(vcpu->kvm))
+                       kvm_ioapic_scan_entry(vcpu, vcpu->arch.eoi_exit_bitmap);
        }
        kvm_x86_ops->load_eoi_exitmap(vcpu);
 }
@@ -7135,7 +7231,7 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
                kvm_update_cpuid(vcpu);
 
        idx = srcu_read_lock(&vcpu->kvm->srcu);
-       if (!is_long_mode(vcpu) && is_pae(vcpu)) {
+       if (!is_long_mode(vcpu) && is_pae(vcpu) && is_paging(vcpu)) {
                load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu));
                mmu_reset_needed = 1;
        }
@@ -7359,6 +7455,7 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
 {
        int r;
 
+       vcpu->arch.arch_capabilities = kvm_get_arch_capabilities();
        kvm_vcpu_mtrr_init(vcpu);
        r = vcpu_load(vcpu);
        if (r)
@@ -7399,7 +7496,7 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
        kvm_mmu_unload(vcpu);
        vcpu_put(vcpu);
 
-       kvm_x86_ops->vcpu_free(vcpu);
+       kvm_arch_vcpu_free(vcpu);
 }
 
 void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
@@ -7787,7 +7884,7 @@ int __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size)
 
        slot = id_to_memslot(slots, id);
        if (size) {
-               if (WARN_ON(slot->npages))
+               if (slot->npages)
                        return -EEXIST;
 
                /*