OSDN Git Service

KVM: x86: Add emulation for MSR_IA32_MCx_CTL2 MSRs.
author Jue Wang <juew@google.com>
Fri, 10 Jun 2022 17:11:32 +0000 (10:11 -0700)
committer Paolo Bonzini <pbonzini@redhat.com>
Fri, 24 Jun 2022 08:52:03 +0000 (04:52 -0400)
This patch adds the emulation of IA32_MCi_CTL2 registers to KVM. A
separate mci_ctl2_banks array is used to keep the existing mce_banks
register layout intact.

In Machine Check Architecture, in addition to MCG_CMCI_P, bit 30 of
the per-bank register IA32_MCi_CTL2 controls whether Corrected Machine
Check error reporting is enabled.

Signed-off-by: Jue Wang <juew@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Message-Id: <20220610171134.772566-7-juew@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
arch/x86/include/asm/kvm_host.h
arch/x86/kvm/x86.c

index 665667d..88a3026 100644 (file)
@@ -826,6 +826,7 @@ struct kvm_vcpu_arch {
        u64 mcg_ctl;
        u64 mcg_ext_ctl;
        u64 *mce_banks;
+       u64 *mci_ctl2_banks;
 
        /* Cache MMIO info */
        u64 mmio_gva;
index 3c6eb68..f743e4e 100644 (file)
@@ -3191,6 +3191,16 @@ static void kvmclock_sync_fn(struct work_struct *work)
                                        KVMCLOCK_SYNC_PERIOD);
 }
 
+/* These helpers are safe iff @msr is known to be an MCx bank MSR. */
+static bool is_mci_control_msr(u32 msr)
+{
+       return (msr & 3) == 0;
+}
+static bool is_mci_status_msr(u32 msr)
+{
+       return (msr & 3) == 1;
+}
+
 /*
  * On AMD, HWCR[McStatusWrEn] controls whether setting MCi_STATUS results in #GP.
  */
@@ -3209,6 +3219,7 @@ static int set_msr_mce(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
        unsigned bank_num = mcg_cap & 0xff;
        u32 msr = msr_info->index;
        u64 data = msr_info->data;
+       u32 offset, last_msr;
 
        switch (msr) {
        case MSR_IA32_MCG_STATUS:
@@ -3222,35 +3233,53 @@ static int set_msr_mce(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
                        return 1;
                vcpu->arch.mcg_ctl = data;
                break;
-       default:
-               if (msr >= MSR_IA32_MC0_CTL &&
-                   msr < MSR_IA32_MCx_CTL(bank_num)) {
-                       u32 offset = array_index_nospec(
-                               msr - MSR_IA32_MC0_CTL,
-                               MSR_IA32_MCx_CTL(bank_num) - MSR_IA32_MC0_CTL);
-
-                       /* only 0 or all 1s can be written to IA32_MCi_CTL
-                        * some Linux kernels though clear bit 10 in bank 4 to
-                        * workaround a BIOS/GART TBL issue on AMD K8s, ignore
-                        * this to avoid an uncatched #GP in the guest.
+       case MSR_IA32_MC0_CTL2 ... MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) - 1:
+               last_msr = MSR_IA32_MCx_CTL2(bank_num) - 1;
+               if (msr > last_msr)
+                       return 1;
+
+               if (!(mcg_cap & MCG_CMCI_P) && (data || !msr_info->host_initiated))
+                       return 1;
+               /* An attempt to write a 1 to a reserved bit raises #GP */
+               if (data & ~(MCI_CTL2_CMCI_EN | MCI_CTL2_CMCI_THRESHOLD_MASK))
+                       return 1;
+               offset = array_index_nospec(msr - MSR_IA32_MC0_CTL2,
+                                           last_msr + 1 - MSR_IA32_MC0_CTL2);
+               vcpu->arch.mci_ctl2_banks[offset] = data;
+               break;
+       case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1:
+               last_msr = MSR_IA32_MCx_CTL(bank_num) - 1;
+               if (msr > last_msr)
+                       return 1;
+
+               /*
+                * Only 0 or all 1s can be written to IA32_MCi_CTL, all other
+                * values are architecturally undefined.  But, some Linux
+                * kernels clear bit 10 in bank 4 to workaround a BIOS/GART TLB
+                * issue on AMD K8s, allow bit 10 to be clear when setting all
+                * other bits in order to avoid an uncaught #GP in the guest.
                         *
                         * UNIXWARE clears bit 0 of MC1_CTL to ignore
                         * correctable, single-bit ECC data errors.
-                        */
-                       if ((offset & 0x3) == 0 &&
-                           data != 0 && (data | (1 << 10) | 1) != ~(u64)0)
-                               return -1;
-
-                       /* MCi_STATUS */
-                       if (!msr_info->host_initiated &&
-                           (offset & 0x3) == 1 && data != 0) {
-                               if (!can_set_mci_status(vcpu))
-                                       return -1;
-                       }
+                */
+               if (is_mci_control_msr(msr) &&
+                   data != 0 && (data | (1 << 10) | 1) != ~(u64)0)
+                       return 1;
 
-                       vcpu->arch.mce_banks[offset] = data;
-                       break;
-               }
+               /*
+                * All CPUs allow writing 0 to MCi_STATUS MSRs to clear the MSR.
+                * AMD-based CPUs allow non-zero values, but if and only if
+                * HWCR[McStatusWrEn] is set.
+                */
+               if (!msr_info->host_initiated && is_mci_status_msr(msr) &&
+                   data != 0 && !can_set_mci_status(vcpu))
+                       return 1;
+
+               offset = array_index_nospec(msr - MSR_IA32_MC0_CTL,
+                                           last_msr + 1 - MSR_IA32_MC0_CTL);
+               vcpu->arch.mce_banks[offset] = data;
+               break;
+       default:
                return 1;
        }
        return 0;
@@ -3534,7 +3563,8 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
                        return 1;
                }
                break;
-       case 0x200 ... 0x2ff:
+       case 0x200 ... MSR_IA32_MC0_CTL2 - 1:
+       case MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) ... 0x2ff:
                return kvm_mtrr_set_msr(vcpu, msr, data);
        case MSR_IA32_APICBASE:
                return kvm_set_apic_base(vcpu, msr_info);
@@ -3704,6 +3734,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
        case MSR_IA32_MCG_CTL:
        case MSR_IA32_MCG_STATUS:
        case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1:
+       case MSR_IA32_MC0_CTL2 ... MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) - 1:
                return set_msr_mce(vcpu, msr_info);
 
        case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR3:
@@ -3819,6 +3850,7 @@ static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host)
        u64 data;
        u64 mcg_cap = vcpu->arch.mcg_cap;
        unsigned bank_num = mcg_cap & 0xff;
+       u32 offset, last_msr;
 
        switch (msr) {
        case MSR_IA32_P5_MC_ADDR:
@@ -3836,16 +3868,27 @@ static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host)
        case MSR_IA32_MCG_STATUS:
                data = vcpu->arch.mcg_status;
                break;
-       default:
-               if (msr >= MSR_IA32_MC0_CTL &&
-                   msr < MSR_IA32_MCx_CTL(bank_num)) {
-                       u32 offset = array_index_nospec(
-                               msr - MSR_IA32_MC0_CTL,
-                               MSR_IA32_MCx_CTL(bank_num) - MSR_IA32_MC0_CTL);
+       case MSR_IA32_MC0_CTL2 ... MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) - 1:
+               last_msr = MSR_IA32_MCx_CTL2(bank_num) - 1;
+               if (msr > last_msr)
+                       return 1;
 
-                       data = vcpu->arch.mce_banks[offset];
-                       break;
-               }
+               if (!(mcg_cap & MCG_CMCI_P) && !host)
+                       return 1;
+               offset = array_index_nospec(msr - MSR_IA32_MC0_CTL2,
+                                           last_msr + 1 - MSR_IA32_MC0_CTL2);
+               data = vcpu->arch.mci_ctl2_banks[offset];
+               break;
+       case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1:
+               last_msr = MSR_IA32_MCx_CTL(bank_num) - 1;
+               if (msr > last_msr)
+                       return 1;
+
+               offset = array_index_nospec(msr - MSR_IA32_MC0_CTL,
+                                           last_msr + 1 - MSR_IA32_MC0_CTL);
+               data = vcpu->arch.mce_banks[offset];
+               break;
+       default:
                return 1;
        }
        *pdata = data;
@@ -3949,7 +3992,8 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
                break;
        }
        case MSR_MTRRcap:
-       case 0x200 ... 0x2ff:
+       case 0x200 ... MSR_IA32_MC0_CTL2 - 1:
+       case MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) ... 0x2ff:
                return kvm_mtrr_get_msr(vcpu, msr_info->index, &msr_info->data);
        case 0xcd: /* fsb frequency */
                msr_info->data = 3;
@@ -4065,6 +4109,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
        case MSR_IA32_MCG_CTL:
        case MSR_IA32_MCG_STATUS:
        case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1:
+       case MSR_IA32_MC0_CTL2 ... MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) - 1:
                return get_msr_mce(vcpu, msr_info->index, &msr_info->data,
                                   msr_info->host_initiated);
        case MSR_IA32_XSS:
@@ -4842,9 +4887,12 @@ static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu,
        /* Init IA32_MCG_CTL to all 1s */
        if (mcg_cap & MCG_CTL_P)
                vcpu->arch.mcg_ctl = ~(u64)0;
-       /* Init IA32_MCi_CTL to all 1s */
-       for (bank = 0; bank < bank_num; bank++)
+       /* Init IA32_MCi_CTL to all 1s, IA32_MCi_CTL2 to all 0s */
+       for (bank = 0; bank < bank_num; bank++) {
                vcpu->arch.mce_banks[bank*4] = ~(u64)0;
+               if (mcg_cap & MCG_CMCI_P)
+                       vcpu->arch.mci_ctl2_banks[bank] = 0;
+       }
        vcpu->arch.apic->nr_lvt_entries =
                KVM_APIC_MAX_NR_LVT_ENTRIES - !(mcg_cap & MCG_CMCI_P);
 
@@ -11449,7 +11497,9 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
 
        vcpu->arch.mce_banks = kcalloc(KVM_MAX_MCE_BANKS * 4, sizeof(u64),
                                       GFP_KERNEL_ACCOUNT);
-       if (!vcpu->arch.mce_banks)
+       vcpu->arch.mci_ctl2_banks = kcalloc(KVM_MAX_MCE_BANKS, sizeof(u64),
+                                           GFP_KERNEL_ACCOUNT);
+       if (!vcpu->arch.mce_banks || !vcpu->arch.mci_ctl2_banks)
                goto fail_free_pio_data;
        vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS;
 
@@ -11503,6 +11553,7 @@ free_wbinvd_dirty_mask:
        free_cpumask_var(vcpu->arch.wbinvd_dirty_mask);
 fail_free_mce_banks:
        kfree(vcpu->arch.mce_banks);
+       kfree(vcpu->arch.mci_ctl2_banks);
 fail_free_pio_data:
        free_page((unsigned long)vcpu->arch.pio_data);
 fail_free_lapic:
@@ -11548,6 +11599,7 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
        kvm_hv_vcpu_uninit(vcpu);
        kvm_pmu_destroy(vcpu);
        kfree(vcpu->arch.mce_banks);
+       kfree(vcpu->arch.mci_ctl2_banks);
        kvm_free_lapic(vcpu);
        idx = srcu_read_lock(&vcpu->kvm->srcu);
        kvm_mmu_destroy(vcpu);