Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index b483a8b..72e3943 100644
 #include <asm/idtentry.h>
 #include <asm/io.h>
 #include <asm/irq_remapping.h>
-#include <asm/kexec.h>
+#include <asm/reboot.h>
 #include <asm/perf_event.h>
 #include <asm/mmu_context.h>
 #include <asm/mshyperv.h>
 #include <asm/mwait.h>
 #include <asm/spec-ctrl.h>
-#include <asm/virtext.h>
 #include <asm/vmx.h>
 
 #include "capabilities.h"
@@ -237,9 +236,6 @@ static const struct {
 #define L1D_CACHE_ORDER 4
 static void *vmx_l1d_flush_pages;
 
-/* Control for disabling CPU Fill buffer clear */
-static bool __read_mostly vmx_fb_clear_ctrl_available;
-
 static int vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf)
 {
        struct page *page;
@@ -255,14 +251,9 @@ static int vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf)
                return 0;
        }
 
-       if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES)) {
-               u64 msr;
-
-               rdmsrl(MSR_IA32_ARCH_CAPABILITIES, msr);
-               if (msr & ARCH_CAP_SKIP_VMENTRY_L1DFLUSH) {
-                       l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NOT_REQUIRED;
-                       return 0;
-               }
+       if (host_arch_capabilities & ARCH_CAP_SKIP_VMENTRY_L1DFLUSH) {
+               l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NOT_REQUIRED;
+               return 0;
        }
 
        /* If set to auto use the default l1tf mitigation method */
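
The open-coded RDMSR is gone because MSR_IA32_ARCH_CAPABILITIES is now
snapshotted once at module init into host_arch_capabilities.  A sketch of
the presumed companion x86.c change (the variable reads as 0 when the CPU
lacks X86_FEATURE_ARCH_CAPABILITIES, preserving the old behavior):

	u64 __read_mostly host_arch_capabilities;

	if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES))
		rdmsrl(MSR_IA32_ARCH_CAPABILITIES, host_arch_capabilities);
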
@@ -366,22 +357,9 @@ static int vmentry_l1d_flush_set(const char *s, const struct kernel_param *kp)
 static int vmentry_l1d_flush_get(char *s, const struct kernel_param *kp)
 {
        if (WARN_ON_ONCE(l1tf_vmx_mitigation >= ARRAY_SIZE(vmentry_l1d_param)))
-               return sprintf(s, "???\n");
+               return sysfs_emit(s, "???\n");
 
-       return sprintf(s, "%s\n", vmentry_l1d_param[l1tf_vmx_mitigation].option);
-}
-
-static void vmx_setup_fb_clear_ctrl(void)
-{
-       u64 msr;
-
-       if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES) &&
-           !boot_cpu_has_bug(X86_BUG_MDS) &&
-           !boot_cpu_has_bug(X86_BUG_TAA)) {
-               rdmsrl(MSR_IA32_ARCH_CAPABILITIES, msr);
-               if (msr & ARCH_CAP_FB_CLEAR_CTRL)
-                       vmx_fb_clear_ctrl_available = true;
-       }
+       return sysfs_emit(s, "%s\n", vmentry_l1d_param[l1tf_vmx_mitigation].option);
 }
 
 static __always_inline void vmx_disable_fb_clear(struct vcpu_vmx *vmx)
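
sysfs_emit() is the preferred replacement for sprintf() in sysfs/module
show functions: it takes the same format arguments but clamps output to
the PAGE_SIZE sysfs buffer instead of trusting the caller:

	int sysfs_emit(char *buf, const char *fmt, ...);
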
@@ -409,7 +387,9 @@ static __always_inline void vmx_enable_fb_clear(struct vcpu_vmx *vmx)
 
 static void vmx_update_fb_clear_dis(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx)
 {
-       vmx->disable_fb_clear = vmx_fb_clear_ctrl_available;
+       vmx->disable_fb_clear = (host_arch_capabilities & ARCH_CAP_FB_CLEAR_CTRL) &&
+                               !boot_cpu_has_bug(X86_BUG_MDS) &&
+                               !boot_cpu_has_bug(X86_BUG_TAA);
 
        /*
         * If guest will not execute VERW, there is no need to set FB_CLEAR_DIS
@@ -754,17 +734,51 @@ static int vmx_set_guest_uret_msr(struct vcpu_vmx *vmx,
        return ret;
 }
 
-#ifdef CONFIG_KEXEC_CORE
-static void crash_vmclear_local_loaded_vmcss(void)
+/*
+ * Disable VMX and clear CR4.VMXE (even if VMXOFF faults)
+ *
+ * Note, VMXOFF causes a #UD if the CPU is !post-VMXON, but it's impossible to
+ * atomically track post-VMXON state, e.g. this may be called in NMI context.
+ * Eat all faults: aside from the !post-VMXON #UD, faults on VMXOFF are mode
+ * related, i.e. a fault is guaranteed to be due to the !post-VMXON check
+ * unless the CPU is magically in RM, VM86, compat mode, or at CPL>0.
+ */
+static int kvm_cpu_vmxoff(void)
+{
+       asm_volatile_goto("1: vmxoff\n\t"
+                         _ASM_EXTABLE(1b, %l[fault])
+                         ::: "cc", "memory" : fault);
+
+       cr4_clear_bits(X86_CR4_VMXE);
+       return 0;
+
+fault:
+       cr4_clear_bits(X86_CR4_VMXE);
+       return -EIO;
+}
+
+static void vmx_emergency_disable(void)
 {
        int cpu = raw_smp_processor_id();
        struct loaded_vmcs *v;
 
+       kvm_rebooting = true;
+
+       /*
+        * Note, CR4.VMXE can be _cleared_ in NMI context, but it can only be
+        * set in task context.  If this races with VMX being disabled by an
+        * NMI, VMCLEAR and VMXOFF may #UD, but KVM will eat those faults
+        * because kvm_rebooting is set.
+        */
+       if (!(__read_cr4() & X86_CR4_VMXE))
+               return;
+
        list_for_each_entry(v, &per_cpu(loaded_vmcss_on_cpu, cpu),
                            loaded_vmcss_on_cpu_link)
                vmcs_clear(v->vmcs);
+
+       kvm_cpu_vmxoff();
 }
-#endif /* CONFIG_KEXEC_CORE */
 
 static void __loaded_vmcs_clear(void *arg)
 {
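
Ordering matters in vmx_emergency_disable(): kvm_rebooting is set before
any VMX instruction is touched, because KVM's fault fixups for VMCLEAR and
VMXOFF funnel into kvm_spurious_fault(), which (roughly, per x86.c) only
tolerates a fault while a reboot is in progress:

	asmlinkage __visible noinstr void kvm_spurious_fault(void)
	{
		/* Fault while not rebooting.  We want the trace. */
		BUG_ON(!kvm_rebooting);
	}

The CR4.VMXE check works as a cheap "possibly post-VMXON" test, since VMXE
must be set before VMXON and is cleared by kvm_cpu_vmxoff() on both paths.
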
@@ -1899,25 +1913,14 @@ u64 vmx_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu)
        return kvm_caps.default_tsc_scaling_ratio;
 }
 
-static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
+static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu)
 {
-       vmcs_write64(TSC_OFFSET, offset);
+       vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset);
 }
 
-static void vmx_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 multiplier)
+static void vmx_write_tsc_multiplier(struct kvm_vcpu *vcpu)
 {
-       vmcs_write64(TSC_MULTIPLIER, multiplier);
-}
-
-/*
- * nested_vmx_allowed() checks whether a guest should be allowed to use VMX
- * instructions and MSRs (i.e., nested VMX). Nested VMX is disabled for
- * all guests if the "nested" module option is off, and can also be disabled
- * for a single guest by disabling its VMX cpuid bit.
- */
-bool nested_vmx_allowed(struct kvm_vcpu *vcpu)
-{
-       return nested && guest_cpuid_has(vcpu, X86_FEATURE_VMX);
+       vmcs_write64(TSC_MULTIPLIER, vcpu->arch.tsc_scaling_ratio);
 }
 
 /*
@@ -2047,7 +2050,7 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
                        [msr_info->index - MSR_IA32_SGXLEPUBKEYHASH0];
                break;
        case KVM_FIRST_EMULATED_VMX_MSR ... KVM_LAST_EMULATED_VMX_MSR:
-               if (!nested_vmx_allowed(vcpu))
+               if (!guest_can_use(vcpu, X86_FEATURE_VMX))
                        return 1;
                if (vmx_get_vmx_msr(&vmx->nested.msrs, msr_info->index,
                                    &msr_info->data))
@@ -2355,7 +2358,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
        case KVM_FIRST_EMULATED_VMX_MSR ... KVM_LAST_EMULATED_VMX_MSR:
                if (!msr_info->host_initiated)
                        return 1; /* they are read-only */
-               if (!nested_vmx_allowed(vcpu))
+               if (!guest_can_use(vcpu, X86_FEATURE_VMX))
                        return 1;
                return vmx_set_vmx_msr(vcpu, msr_index, data);
        case MSR_IA32_RTIT_CTL:
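
guest_can_use() replaces nested_vmx_allowed() throughout.  A sketch of the
helper, assuming the governed-features framework introduced earlier in
this series: a governed feature is a per-vCPU bit precomputed as "exposed
in guest CPUID AND actually usable", so hot paths avoid CPUID lookups:

	static __always_inline bool guest_can_use(struct kvm_vcpu *vcpu,
						  unsigned int x86_feature)
	{
		BUILD_BUG_ON(!kvm_is_governed_feature(x86_feature));
		return test_bit(kvm_governed_feature_index(x86_feature),
				vcpu->arch.governed_features.enabled);
	}
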
@@ -2729,11 +2732,11 @@ static int setup_vmcs_config(struct vmcs_config *vmcs_conf,
        return 0;
 }
 
-static bool kvm_is_vmx_supported(void)
+static bool __kvm_is_vmx_supported(void)
 {
-       int cpu = raw_smp_processor_id();
+       int cpu = smp_processor_id();
 
-       if (!cpu_has_vmx()) {
+       if (!(cpuid_ecx(1) & feature_bit(VMX))) {
                pr_err("VMX not supported by CPU %d\n", cpu);
                return false;
        }
@@ -2747,13 +2750,24 @@ static bool kvm_is_vmx_supported(void)
        return true;
 }
 
+static bool kvm_is_vmx_supported(void)
+{
+       bool supported;
+
+       migrate_disable();
+       supported = __kvm_is_vmx_supported();
+       migrate_enable();
+
+       return supported;
+}
+
 static int vmx_check_processor_compat(void)
 {
        int cpu = raw_smp_processor_id();
        struct vmcs_config vmcs_conf;
        struct vmx_capability vmx_cap;
 
-       if (!kvm_is_vmx_supported())
+       if (!__kvm_is_vmx_supported())
                return -EIO;
 
        if (setup_vmcs_config(&vmcs_conf, &vmx_cap) < 0) {
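
Splitting out __kvm_is_vmx_supported() lets vmx_check_processor_compat(),
which already runs pinned on each CPU during bringup, skip the
migrate_disable()/migrate_enable() pair; using smp_processor_id() instead
of raw_smp_processor_id() also means CONFIG_DEBUG_PREEMPT will flag any
future caller that is not pinned to a CPU.
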
@@ -2833,7 +2847,7 @@ static void vmx_hardware_disable(void)
 {
        vmclear_local_loaded_vmcss();
 
-       if (cpu_vmxoff())
+       if (kvm_cpu_vmxoff())
                kvm_spurious_fault();
 
        hv_reset_evmcs();
@@ -3071,13 +3085,6 @@ static void enter_rmode(struct kvm_vcpu *vcpu)
 
        vmx->rmode.vm86_active = 1;
 
-       /*
-        * Very old userspace does not call KVM_SET_TSS_ADDR before entering
-        * vcpu. Warn the user that an update is overdue.
-        */
-       if (!kvm_vmx->tss_addr)
-               pr_warn_once("KVM_SET_TSS_ADDR needs to be called before running vCPU\n");
-
        vmx_segment_cache_clear(vmx);
 
        vmcs_writel(GUEST_TR_BASE, kvm_vmx->tss_addr);
@@ -3350,7 +3357,7 @@ void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
        vmx->emulation_required = vmx_emulation_required(vcpu);
 }
 
-static int vmx_get_max_tdp_level(void)
+static int vmx_get_max_ept_level(void)
 {
        if (cpu_has_vmx_ept_5levels())
                return 5;
@@ -4553,16 +4560,19 @@ vmx_adjust_secondary_exec_control(struct vcpu_vmx *vmx, u32 *exec_control,
  * based on a single guest CPUID bit, with a dedicated feature bit.  This also
  * verifies that the control is actually supported by KVM and hardware.
  */
-#define vmx_adjust_sec_exec_control(vmx, exec_control, name, feat_name, ctrl_name, exiting) \
-({                                                                      \
-       bool __enabled;                                                  \
-                                                                        \
-       if (cpu_has_vmx_##name()) {                                      \
-               __enabled = guest_cpuid_has(&(vmx)->vcpu,                \
-                                           X86_FEATURE_##feat_name);    \
-               vmx_adjust_secondary_exec_control(vmx, exec_control,     \
-                       SECONDARY_EXEC_##ctrl_name, __enabled, exiting); \
-       }                                                                \
+#define vmx_adjust_sec_exec_control(vmx, exec_control, name, feat_name, ctrl_name, exiting)    \
+({                                                                                             \
+       struct kvm_vcpu *__vcpu = &(vmx)->vcpu;                                                 \
+       bool __enabled;                                                                         \
+                                                                                               \
+       if (cpu_has_vmx_##name()) {                                                             \
+               if (kvm_is_governed_feature(X86_FEATURE_##feat_name))                           \
+                       __enabled = guest_can_use(__vcpu, X86_FEATURE_##feat_name);             \
+               else                                                                            \
+                       __enabled = guest_cpuid_has(__vcpu, X86_FEATURE_##feat_name);           \
+               vmx_adjust_secondary_exec_control(vmx, exec_control, SECONDARY_EXEC_##ctrl_name,\
+                                                 __enabled, exiting);                          \
+       }                                                                                       \
 })
 
 /* More macro magic for ENABLE_/opt-in versus _EXITING/opt-out controls. */
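
The macro can pick between guest_can_use() and guest_cpuid_has() with no
runtime cost: X86_FEATURE_##feat_name is a compile-time constant, so the
kvm_is_governed_feature() branch folds away.  A sketch of that predicate,
same caveat as above:

	static __always_inline bool kvm_is_governed_feature(unsigned int x86_feature)
	{
		return kvm_governed_feature_index(x86_feature) >= 0;
	}
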
@@ -4622,19 +4632,7 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx)
        if (!enable_pml || !atomic_read(&vcpu->kvm->nr_memslots_dirty_logging))
                exec_control &= ~SECONDARY_EXEC_ENABLE_PML;
 
-       if (cpu_has_vmx_xsaves()) {
-               /* Exposing XSAVES only when XSAVE is exposed */
-               bool xsaves_enabled =
-                       boot_cpu_has(X86_FEATURE_XSAVE) &&
-                       guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) &&
-                       guest_cpuid_has(vcpu, X86_FEATURE_XSAVES);
-
-               vcpu->arch.xsaves_enabled = xsaves_enabled;
-
-               vmx_adjust_secondary_exec_control(vmx, &exec_control,
-                                                 SECONDARY_EXEC_XSAVES,
-                                                 xsaves_enabled, false);
-       }
+       vmx_adjust_sec_exec_feature(vmx, &exec_control, xsaves, XSAVES);
 
        /*
         * RDPID is also gated by ENABLE_RDTSCP, turn on the control if either
@@ -4653,6 +4651,7 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx)
                                                  SECONDARY_EXEC_ENABLE_RDTSCP,
                                                  rdpid_or_rdtscp_enabled, false);
        }
+
        vmx_adjust_sec_exec_feature(vmx, &exec_control, invpcid, INVPCID);
 
        vmx_adjust_sec_exec_exiting(vmx, &exec_control, rdrand, RDRAND);
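
The open-coded XSAVES block collapses into the common
vmx_adjust_sec_exec_feature() macro because XSAVES is now a governed
feature: the "XSAVE must also be exposed" dependency is evaluated once in
vmx_vcpu_after_set_cpuid() (see the hunk below) rather than every time the
secondary execution controls are recomputed.
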
@@ -6796,8 +6795,10 @@ static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu)
        vmcs_write64(APIC_ACCESS_ADDR, pfn_to_hpa(pfn));
        read_unlock(&vcpu->kvm->mmu_lock);
 
-       vmx_flush_tlb_current(vcpu);
-
+       /*
+        * No need for a manual TLB flush at this point; KVM has already done a
+        * flush if there were SPTEs pointing at the previous page.
+        */
 out:
        /*
         * Do not pin apic access page in memory, the MMU notifier
@@ -7243,13 +7244,20 @@ static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu,
                                   flags);
 
        vcpu->arch.cr2 = native_read_cr2();
+       vcpu->arch.regs_avail &= ~VMX_REGS_LAZY_LOAD_SET;
+
+       vmx->idt_vectoring_info = 0;
 
        vmx_enable_fb_clear(vmx);
 
-       if (unlikely(vmx->fail))
+       if (unlikely(vmx->fail)) {
                vmx->exit_reason.full = 0xdead;
-       else
-               vmx->exit_reason.full = vmcs_read32(VM_EXIT_REASON);
+               goto out;
+       }
+
+       vmx->exit_reason.full = vmcs_read32(VM_EXIT_REASON);
+       if (likely(!vmx->exit_reason.failed_vmentry))
+               vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
 
        if ((u16)vmx->exit_reason.basic == EXIT_REASON_EXCEPTION_NMI &&
            is_nmi(vmx_get_intr_info(vcpu))) {
@@ -7258,6 +7266,7 @@ static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu,
                kvm_after_interrupt(vcpu);
        }
 
+out:
        guest_state_exit_irqoff();
 }
 
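
Consolidating the VM_EXIT_REASON and IDT_VECTORING_INFO_FIELD reads into
vmx_vcpu_enter_exit() keeps all VMCS accesses for a VM-Exit in one place,
immediately after the exit; the new "out" label skips them entirely when
VM-Enter failed (vmx->fail), in which case the fields are not meaningful.
The two hunks below remove the now-redundant logic from vmx_vcpu_run().
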
@@ -7379,8 +7388,6 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu)
        loadsegment(es, __USER_DS);
 #endif
 
-       vcpu->arch.regs_avail &= ~VMX_REGS_LAZY_LOAD_SET;
-
        pt_guest_exit(vmx);
 
        kvm_load_host_xsave_state(vcpu);
@@ -7397,17 +7404,12 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu)
                vmx->nested.nested_run_pending = 0;
        }
 
-       vmx->idt_vectoring_info = 0;
-
        if (unlikely(vmx->fail))
                return EXIT_FASTPATH_NONE;
 
        if (unlikely((u16)vmx->exit_reason.basic == EXIT_REASON_MCE_DURING_VMENTRY))
                kvm_machine_check();
 
-       if (likely(!vmx->exit_reason.failed_vmentry))
-               vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
-
        trace_kvm_exit(vcpu, KVM_ISA_VMX);
 
        if (unlikely(vmx->exit_reason.failed_vmentry))
@@ -7751,8 +7753,16 @@ static void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
 {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
 
-       /* xsaves_enabled is recomputed in vmx_compute_secondary_exec_control(). */
-       vcpu->arch.xsaves_enabled = false;
+       /*
+        * XSAVES is effectively enabled if and only if XSAVE is also exposed
+        * to the guest.  XSAVES depends on CR4.OSXSAVE, and CR4.OSXSAVE can be
+        * set if and only if XSAVE is supported.
+        */
+       if (boot_cpu_has(X86_FEATURE_XSAVE) &&
+           guest_cpuid_has(vcpu, X86_FEATURE_XSAVE))
+               kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_XSAVES);
+
+       kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_VMX);
 
        vmx_setup_uret_msrs(vmx);
 
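
Here the governed bits are computed: XSAVES is set only if XSAVE is both
host-supported and guest-visible, and X86_FEATURE_VMX needs no explicit
"nested" check because (assuming the framework's helper checks KVM's own
capability as well as guest CPUID) kvm_cpu_cap only advertises VMX when
the nested module param is on.  A sketch of the presumed helper:

	static __always_inline void
	kvm_governed_feature_check_and_set(struct kvm_vcpu *vcpu,
					   unsigned int x86_feature)
	{
		if (kvm_cpu_cap_has(x86_feature) && guest_cpuid_has(vcpu, x86_feature))
			kvm_governed_feature_set(vcpu, x86_feature);
	}
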
@@ -7760,7 +7770,7 @@ static void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
                vmcs_set_secondary_exec_control(vmx,
                                                vmx_secondary_exec_control(vmx));
 
-       if (nested_vmx_allowed(vcpu))
+       if (guest_can_use(vcpu, X86_FEATURE_VMX))
                vmx->msr_ia32_feature_control_valid_bits |=
                        FEAT_CTL_VMX_ENABLED_INSIDE_SMX |
                        FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX;
@@ -7769,7 +7779,7 @@ static void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
                        ~(FEAT_CTL_VMX_ENABLED_INSIDE_SMX |
                          FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX);
 
-       if (nested_vmx_allowed(vcpu))
+       if (guest_can_use(vcpu, X86_FEATURE_VMX))
                nested_vmx_cr_fixed1_bits_update(vcpu);
 
        if (boot_cpu_has(X86_FEATURE_INTEL_PT) &&
@@ -8526,7 +8536,7 @@ static __init int hardware_setup(void)
         */
        vmx_setup_me_spte_mask();
 
-       kvm_configure_mmu(enable_ept, 0, vmx_get_max_tdp_level(),
+       kvm_configure_mmu(enable_ept, 0, vmx_get_max_ept_level(),
                          ept_caps_to_lpage_level(vmx_capability.ept));
 
        /*
@@ -8622,10 +8632,8 @@ static void __vmx_exit(void)
 {
        allow_smaller_maxphyaddr = false;
 
-#ifdef CONFIG_KEXEC_CORE
-       RCU_INIT_POINTER(crash_vmclear_loaded_vmcss, NULL);
-       synchronize_rcu();
-#endif
+       cpu_emergency_unregister_virt_callback(vmx_emergency_disable);
+
        vmx_cleanup_l1d_flush();
 }
 
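
The RCU-managed crash_vmclear_loaded_vmcss pointer and the explicit
synchronize_rcu() disappear; cpu_emergency_unregister_virt_callback()
presumably provides the same guarantee internally, i.e. it does not return
while an emergency-reboot path may still be running the old callback.
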
@@ -8666,18 +8674,14 @@ static int __init vmx_init(void)
        if (r)
                goto err_l1d_flush;
 
-       vmx_setup_fb_clear_ctrl();
-
        for_each_possible_cpu(cpu) {
                INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu));
 
                pi_init_cpu(cpu);
        }
 
-#ifdef CONFIG_KEXEC_CORE
-       rcu_assign_pointer(crash_vmclear_loaded_vmcss,
-                          crash_vmclear_local_loaded_vmcss);
-#endif
+       cpu_emergency_register_virt_callback(vmx_emergency_disable);
+
        vmx_check_vmcs12_offsets();
 
        /*