Merge branch 'kvm-insert-lfence' into kvm-master
author Paolo Bonzini <pbonzini@redhat.com>
Thu, 11 Jan 2018 17:20:48 +0000 (18:20 +0100)
committer Paolo Bonzini <pbonzini@redhat.com>
Thu, 11 Jan 2018 17:20:48 +0000 (18:20 +0100)
Topic branch for CVE-2017-5753, avoiding conflicts in the next merge window.
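For context: the change to vmcs_field_to_offset() in the diff below inserts an LFENCE between the bounds check on the VMCS field index and the dependent table load, so a mispredicted branch cannot speculatively read beyond the array with a guest-controlled index (CVE-2017-5753, Spectre variant 1). The following is a minimal stand-alone sketch of that pattern, not kernel code: the table contents and size are invented for illustration, and it relies on GNU C inline asm on x86. The "generic mechanism" mentioned in the FIXME later landed in mainline as array_index_nospec().

#include <stddef.h>
#include <stdio.h>

/* Hypothetical lookup table standing in for vmcs_field_to_offset_table. */
static const short table[16] = { 1, 2, 3, 4 };

static short lookup(size_t idx)
{
	if (idx >= sizeof(table) / sizeof(table[0]))
		return -1;
	/*
	 * Speculation barrier: the load below must not execute until the
	 * bounds check above has resolved.  This mirrors the asm("lfence")
	 * added in the vmcs_field_to_offset() hunk below.
	 */
	asm volatile("lfence" ::: "memory");
	return table[idx];
}

int main(void)
{
	printf("%d\n", lookup(2));	/* in range: prints 3 */
	printf("%d\n", lookup(99));	/* out of range: prints -1 */
	return 0;
}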

arch/x86/kvm/vmx.c

diff --combined arch/x86/kvm/vmx.c
@@@ -70,9 -70,6 +70,9 @@@ MODULE_DEVICE_TABLE(x86cpu, vmx_cpu_id)
  static bool __read_mostly enable_vpid = 1;
  module_param_named(vpid, enable_vpid, bool, 0444);
  
 +static bool __read_mostly enable_vnmi = 1;
 +module_param_named(vnmi, enable_vnmi, bool, S_IRUGO);
 +
  static bool __read_mostly flexpriority_enabled = 1;
  module_param_named(flexpriority, flexpriority_enabled, bool, S_IRUGO);
  
@@@ -205,10 -202,6 +205,10 @@@ struct loaded_vmcs 
        bool nmi_known_unmasked;
        unsigned long vmcs_host_cr3;    /* May not match real cr3 */
        unsigned long vmcs_host_cr4;    /* May not match real cr4 */
 +      /* Support for vnmi-less CPUs */
 +      int soft_vnmi_blocked;
 +      ktime_t entry_time;
 +      s64 vnmi_blocked_time;
        struct list_head loaded_vmcss_on_cpu_link;
  };
  
@@@ -493,14 -486,6 +493,14 @@@ struct nested_vmx 
        u64 nested_vmx_cr4_fixed1;
        u64 nested_vmx_vmcs_enum;
        u64 nested_vmx_vmfunc_controls;
 +
 +      /* SMM related state */
 +      struct {
 +              /* in VMX operation on SMM entry? */
 +              bool vmxon;
 +              /* in guest mode on SMM entry? */
 +              bool guest_mode;
 +      } smm;
  };
  
  #define POSTED_INTR_ON  0
@@@ -899,8 -884,16 +899,16 @@@ static inline short vmcs_field_to_offse
  {
        BUILD_BUG_ON(ARRAY_SIZE(vmcs_field_to_offset_table) > SHRT_MAX);
  
-       if (field >= ARRAY_SIZE(vmcs_field_to_offset_table) ||
-           vmcs_field_to_offset_table[field] == 0)
+       if (field >= ARRAY_SIZE(vmcs_field_to_offset_table))
+               return -ENOENT;
+       /*
+        * FIXME: Mitigation for CVE-2017-5753.  To be replaced with a
+        * generic mechanism.
+        */
+       asm("lfence");
+       if (vmcs_field_to_offset_table[field] == 0)
                return -ENOENT;
  
        return vmcs_field_to_offset_table[field];
@@@ -915,13 -908,16 +923,13 @@@ static bool nested_ept_ad_enabled(struc
  static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu);
  static u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa);
  static bool vmx_xsaves_supported(void);
 -static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr);
  static void vmx_set_segment(struct kvm_vcpu *vcpu,
                            struct kvm_segment *var, int seg);
  static void vmx_get_segment(struct kvm_vcpu *vcpu,
                            struct kvm_segment *var, int seg);
  static bool guest_state_valid(struct kvm_vcpu *vcpu);
  static u32 vmx_segment_access_rights(struct kvm_segment *var);
 -static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx);
  static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx);
 -static int alloc_identity_pagetable(struct kvm *kvm);
  static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu);
  static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked);
  static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12,
@@@ -1298,11 -1294,6 +1306,11 @@@ static inline bool cpu_has_vmx_invpcid(
                SECONDARY_EXEC_ENABLE_INVPCID;
  }
  
 +static inline bool cpu_has_virtual_nmis(void)
 +{
 +      return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS;
 +}
 +
  static inline bool cpu_has_vmx_wbinvd_exit(void)
  {
        return vmcs_config.cpu_based_2nd_exec_ctrl &
@@@ -1360,6 -1351,11 +1368,6 @@@ static inline bool nested_cpu_has2(stru
                (vmcs12->secondary_vm_exec_control & bit);
  }
  
 -static inline bool nested_cpu_has_virtual_nmis(struct vmcs12 *vmcs12)
 -{
 -      return vmcs12->pin_based_vm_exec_control & PIN_BASED_VIRTUAL_NMIS;
 -}
 -
  static inline bool nested_cpu_has_preemption_timer(struct vmcs12 *vmcs12)
  {
        return vmcs12->pin_based_vm_exec_control &
@@@ -1610,15 -1606,18 +1618,15 @@@ static inline void vpid_sync_context(in
  
  static inline void ept_sync_global(void)
  {
 -      if (cpu_has_vmx_invept_global())
 -              __invept(VMX_EPT_EXTENT_GLOBAL, 0, 0);
 +      __invept(VMX_EPT_EXTENT_GLOBAL, 0, 0);
  }
  
  static inline void ept_sync_context(u64 eptp)
  {
 -      if (enable_ept) {
 -              if (cpu_has_vmx_invept_context())
 -                      __invept(VMX_EPT_EXTENT_CONTEXT, eptp, 0);
 -              else
 -                      ept_sync_global();
 -      }
 +      if (cpu_has_vmx_invept_context())
 +              __invept(VMX_EPT_EXTENT_CONTEXT, eptp, 0);
 +      else
 +              ept_sync_global();
  }
  
  static __always_inline void vmcs_check16(unsigned long field)
@@@ -2300,7 -2299,7 +2308,7 @@@ static void vmx_vcpu_load(struct kvm_vc
                 * processors.  See 22.2.4.
                 */
                vmcs_writel(HOST_TR_BASE,
 -                          (unsigned long)this_cpu_ptr(&cpu_tss));
 +                          (unsigned long)&get_cpu_entry_area(cpu)->tss.x86_tss);
                vmcs_writel(HOST_GDTR_BASE, (unsigned long)gdt);   /* 22.2.4 */
  
                /*
@@@ -2840,7 -2839,8 +2848,7 @@@ static void nested_vmx_setup_ctls_msrs(
                                SECONDARY_EXEC_ENABLE_PML;
                        vmx->nested.nested_vmx_ept_caps |= VMX_EPT_AD_BIT;
                }
 -      } else
 -              vmx->nested.nested_vmx_ept_caps = 0;
 +      }
  
        if (cpu_has_vmx_vmfunc()) {
                vmx->nested.nested_vmx_secondary_ctls_high |=
                 * Advertise EPTP switching unconditionally
                 * since we emulate it
                 */
 -              vmx->nested.nested_vmx_vmfunc_controls =
 -                      VMX_VMFUNC_EPTP_SWITCHING;
 +              if (enable_ept)
 +                      vmx->nested.nested_vmx_vmfunc_controls =
 +                              VMX_VMFUNC_EPTP_SWITCHING;
        }
  
        /*
                        SECONDARY_EXEC_ENABLE_VPID;
                vmx->nested.nested_vmx_vpid_caps = VMX_VPID_INVVPID_BIT |
                        VMX_VPID_EXTENT_SUPPORTED_MASK;
 -      } else
 -              vmx->nested.nested_vmx_vpid_caps = 0;
 +      }
  
        if (enable_unrestricted_guest)
                vmx->nested.nested_vmx_secondary_ctls_high |=
@@@ -3552,8 -3552,7 +3560,8 @@@ static int hardware_enable(void
                wrmsrl(MSR_IA32_FEATURE_CONTROL, old | test_bits);
        }
        kvm_cpu_vmxon(phys_addr);
 -      ept_sync_global();
 +      if (enable_ept)
 +              ept_sync_global();
  
        return 0;
  }
@@@ -3666,8 -3665,8 +3674,8 @@@ static __init int setup_vmcs_config(str
                        SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
                        SECONDARY_EXEC_SHADOW_VMCS |
                        SECONDARY_EXEC_XSAVES |
 -                      SECONDARY_EXEC_RDSEED |
 -                      SECONDARY_EXEC_RDRAND |
 +                      SECONDARY_EXEC_RDSEED_EXITING |
 +                      SECONDARY_EXEC_RDRAND_EXITING |
                        SECONDARY_EXEC_ENABLE_PML |
                        SECONDARY_EXEC_TSC_SCALING |
                        SECONDARY_EXEC_ENABLE_VMFUNC;
                                SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
                                SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
  
 +      rdmsr_safe(MSR_IA32_VMX_EPT_VPID_CAP,
 +              &vmx_capability.ept, &vmx_capability.vpid);
 +
        if (_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) {
                /* CR3 accesses and invlpg don't need to cause VM Exits when EPT
                   enabled */
                _cpu_based_exec_control &= ~(CPU_BASED_CR3_LOAD_EXITING |
                                             CPU_BASED_CR3_STORE_EXITING |
                                             CPU_BASED_INVLPG_EXITING);
 -              rdmsr(MSR_IA32_VMX_EPT_VPID_CAP,
 -                    vmx_capability.ept, vmx_capability.vpid);
 +      } else if (vmx_capability.ept) {
 +              vmx_capability.ept = 0;
 +              pr_warn_once("EPT CAP should not exist if not support "
 +                              "1-setting enable EPT VM-execution control\n");
 +      }
 +      if (!(_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_VPID) &&
 +              vmx_capability.vpid) {
 +              vmx_capability.vpid = 0;
 +              pr_warn_once("VPID CAP should not exist if not support "
 +                              "1-setting enable VPID VM-execution control\n");
        }
  
        min = VM_EXIT_SAVE_DEBUG_CONTROLS | VM_EXIT_ACK_INTR_ON_EXIT;
                                &_vmexit_control) < 0)
                return -EIO;
  
 -      min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING |
 -              PIN_BASED_VIRTUAL_NMIS;
 -      opt = PIN_BASED_POSTED_INTR | PIN_BASED_VMX_PREEMPTION_TIMER;
 +      min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING;
 +      opt = PIN_BASED_VIRTUAL_NMIS | PIN_BASED_POSTED_INTR |
 +               PIN_BASED_VMX_PREEMPTION_TIMER;
        if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS,
                                &_pin_based_exec_control) < 0)
                return -EIO;
@@@ -4801,18 -4789,18 +4809,18 @@@ static int init_rmode_identity_map(stru
        kvm_pfn_t identity_map_pfn;
        u32 tmp;
  
 -      if (!enable_ept)
 -              return 0;
 -
        /* Protect kvm->arch.ept_identity_pagetable_done. */
        mutex_lock(&kvm->slots_lock);
  
        if (likely(kvm->arch.ept_identity_pagetable_done))
                goto out2;
  
 +      if (!kvm->arch.ept_identity_map_addr)
 +              kvm->arch.ept_identity_map_addr = VMX_EPT_IDENTITY_PAGETABLE_ADDR;
        identity_map_pfn = kvm->arch.ept_identity_map_addr >> PAGE_SHIFT;
  
 -      r = alloc_identity_pagetable(kvm);
 +      r = __x86_set_memory_region(kvm, IDENTITY_PAGETABLE_PRIVATE_MEMSLOT,
 +                                  kvm->arch.ept_identity_map_addr, PAGE_SIZE);
        if (r < 0)
                goto out2;
  
@@@ -4884,6 -4872,20 +4892,6 @@@ out
        return r;
  }
  
 -static int alloc_identity_pagetable(struct kvm *kvm)
 -{
 -      /* Called with kvm->slots_lock held. */
 -
 -      int r = 0;
 -
 -      BUG_ON(kvm->arch.ept_identity_pagetable_done);
 -
 -      r = __x86_set_memory_region(kvm, IDENTITY_PAGETABLE_PRIVATE_MEMSLOT,
 -                                  kvm->arch.ept_identity_map_addr, PAGE_SIZE);
 -
 -      return r;
 -}
 -
  static int allocate_vpid(void)
  {
        int vpid;
@@@ -5239,10 -5241,6 +5247,10 @@@ static u32 vmx_pin_based_exec_ctrl(stru
  
        if (!kvm_vcpu_apicv_active(&vmx->vcpu))
                pin_based_exec_ctrl &= ~PIN_BASED_POSTED_INTR;
 +
 +      if (!enable_vnmi)
 +              pin_based_exec_ctrl &= ~PIN_BASED_VIRTUAL_NMIS;
 +
        /* Enable the preemption timer dynamically */
        pin_based_exec_ctrl &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
        return pin_based_exec_ctrl;
@@@ -5292,13 -5290,13 +5300,13 @@@ static u32 vmx_exec_control(struct vcpu
  static bool vmx_rdrand_supported(void)
  {
        return vmcs_config.cpu_based_2nd_exec_ctrl &
 -              SECONDARY_EXEC_RDRAND;
 +              SECONDARY_EXEC_RDRAND_EXITING;
  }
  
  static bool vmx_rdseed_supported(void)
  {
        return vmcs_config.cpu_based_2nd_exec_ctrl &
 -              SECONDARY_EXEC_RDSEED;
 +              SECONDARY_EXEC_RDSEED_EXITING;
  }
  
  static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx)
        if (vmx_rdrand_supported()) {
                bool rdrand_enabled = guest_cpuid_has(vcpu, X86_FEATURE_RDRAND);
                if (rdrand_enabled)
 -                      exec_control &= ~SECONDARY_EXEC_RDRAND;
 +                      exec_control &= ~SECONDARY_EXEC_RDRAND_EXITING;
  
                if (nested) {
                        if (rdrand_enabled)
                                vmx->nested.nested_vmx_secondary_ctls_high |=
 -                                      SECONDARY_EXEC_RDRAND;
 +                                      SECONDARY_EXEC_RDRAND_EXITING;
                        else
                                vmx->nested.nested_vmx_secondary_ctls_high &=
 -                                      ~SECONDARY_EXEC_RDRAND;
 +                                      ~SECONDARY_EXEC_RDRAND_EXITING;
                }
        }
  
        if (vmx_rdseed_supported()) {
                bool rdseed_enabled = guest_cpuid_has(vcpu, X86_FEATURE_RDSEED);
                if (rdseed_enabled)
 -                      exec_control &= ~SECONDARY_EXEC_RDSEED;
 +                      exec_control &= ~SECONDARY_EXEC_RDSEED_EXITING;
  
                if (nested) {
                        if (rdseed_enabled)
                                vmx->nested.nested_vmx_secondary_ctls_high |=
 -                                      SECONDARY_EXEC_RDSEED;
 +                                      SECONDARY_EXEC_RDSEED_EXITING;
                        else
                                vmx->nested.nested_vmx_secondary_ctls_high &=
 -                                      ~SECONDARY_EXEC_RDSEED;
 +                                      ~SECONDARY_EXEC_RDSEED_EXITING;
                }
        }
  
@@@ -5436,7 -5434,7 +5444,7 @@@ static void ept_set_mmio_spte_mask(void
  /*
   * Sets up the vmcs for emulated real mode.
   */
 -static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
 +static void vmx_vcpu_setup(struct vcpu_vmx *vmx)
  {
  #ifdef CONFIG_X86_64
        unsigned long a;
                vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg));
                vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
        }
 -
 -      return 0;
  }
  
  static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
                vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
        }
  
 -      vmcs_writel(GUEST_RFLAGS, 0x02);
 +      kvm_set_rflags(vcpu, X86_EFLAGS_FIXED);
        kvm_rip_write(vcpu, 0xfff0);
  
        vmcs_writel(GUEST_GDTR_BASE, 0);
        vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
        vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0);
        vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, 0);
 +      if (kvm_mpx_supported())
 +              vmcs_write64(GUEST_BNDCFGS, 0);
  
        setup_msrs(vmx);
  
@@@ -5677,8 -5675,7 +5685,8 @@@ static void enable_irq_window(struct kv
  
  static void enable_nmi_window(struct kvm_vcpu *vcpu)
  {
 -      if (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI) {
 +      if (!enable_vnmi ||
 +          vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI) {
                enable_irq_window(vcpu);
                return;
        }
@@@ -5718,19 -5715,6 +5726,19 @@@ static void vmx_inject_nmi(struct kvm_v
  {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
  
 +      if (!enable_vnmi) {
 +              /*
 +               * Tracking the NMI-blocked state in software is built upon
 +               * finding the next open IRQ window. This, in turn, depends on
 +               * well-behaving guests: They have to keep IRQs disabled at
 +               * least as long as the NMI handler runs. Otherwise we may
 +               * cause NMI nesting, maybe breaking the guest. But as this is
 +               * highly unlikely, we can live with the residual risk.
 +               */
 +              vmx->loaded_vmcs->soft_vnmi_blocked = 1;
 +              vmx->loaded_vmcs->vnmi_blocked_time = 0;
 +      }
 +
        ++vcpu->stat.nmi_injections;
        vmx->loaded_vmcs->nmi_known_unmasked = false;
  
@@@ -5749,8 -5733,6 +5757,8 @@@ static bool vmx_get_nmi_mask(struct kvm
        struct vcpu_vmx *vmx = to_vmx(vcpu);
        bool masked;
  
 +      if (!enable_vnmi)
 +              return vmx->loaded_vmcs->soft_vnmi_blocked;
        if (vmx->loaded_vmcs->nmi_known_unmasked)
                return false;
        masked = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_NMI;
@@@ -5762,20 -5744,13 +5770,20 @@@ static void vmx_set_nmi_mask(struct kvm
  {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
  
 -      vmx->loaded_vmcs->nmi_known_unmasked = !masked;
 -      if (masked)
 -              vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
 -                            GUEST_INTR_STATE_NMI);
 -      else
 -              vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO,
 -                              GUEST_INTR_STATE_NMI);
 +      if (!enable_vnmi) {
 +              if (vmx->loaded_vmcs->soft_vnmi_blocked != masked) {
 +                      vmx->loaded_vmcs->soft_vnmi_blocked = masked;
 +                      vmx->loaded_vmcs->vnmi_blocked_time = 0;
 +              }
 +      } else {
 +              vmx->loaded_vmcs->nmi_known_unmasked = !masked;
 +              if (masked)
 +                      vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
 +                                    GUEST_INTR_STATE_NMI);
 +              else
 +                      vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO,
 +                                      GUEST_INTR_STATE_NMI);
 +      }
  }
  
  static int vmx_nmi_allowed(struct kvm_vcpu *vcpu)
        if (to_vmx(vcpu)->nested.nested_run_pending)
                return 0;
  
 +      if (!enable_vnmi &&
 +          to_vmx(vcpu)->loaded_vmcs->soft_vnmi_blocked)
 +              return 0;
 +
        return  !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
                  (GUEST_INTR_STATE_MOV_SS | GUEST_INTR_STATE_STI
                   | GUEST_INTR_STATE_NMI));
@@@ -5915,9 -5886,11 +5923,9 @@@ static int handle_exception(struct kvm_
                return 1;  /* already handled by vmx_vcpu_run() */
  
        if (is_invalid_opcode(intr_info)) {
 -              if (is_guest_mode(vcpu)) {
 -                      kvm_queue_exception(vcpu, UD_VECTOR);
 -                      return 1;
 -              }
                er = emulate_instruction(vcpu, EMULTYPE_TRAP_UD);
 +              if (er == EMULATE_USER_EXIT)
 +                      return 0;
                if (er != EMULATE_DONE)
                        kvm_queue_exception(vcpu, UD_VECTOR);
                return 1;
                cr2 = vmcs_readl(EXIT_QUALIFICATION);
                /* EPT won't cause page fault directly */
                WARN_ON_ONCE(!vcpu->arch.apf.host_apf_reason && enable_ept);
 -              return kvm_handle_page_fault(vcpu, error_code, cr2, NULL, 0,
 -                              true);
 +              return kvm_handle_page_fault(vcpu, error_code, cr2, NULL, 0);
        }
  
        ex_no = intr_info & INTR_INFO_VECTOR_MASK;
@@@ -6512,7 -6486,6 +6520,7 @@@ static int handle_ept_violation(struct 
         * AAK134, BY25.
         */
        if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) &&
 +                      enable_vnmi &&
                        (exit_qualification & INTR_INFO_UNBLOCK_NMI))
                vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, GUEST_INTR_STATE_NMI);
  
@@@ -6572,7 -6545,6 +6580,7 @@@ static int handle_ept_misconfig(struct 
  
  static int handle_nmi_window(struct kvm_vcpu *vcpu)
  {
 +      WARN_ON_ONCE(!enable_vnmi);
        vmcs_clear_bits(CPU_BASED_VM_EXEC_CONTROL,
                        CPU_BASED_VIRTUAL_NMI_PENDING);
        ++vcpu->stat.nmi_window_exits;
@@@ -6600,7 -6572,7 +6608,7 @@@ static int handle_invalid_guest_state(s
                if (kvm_test_request(KVM_REQ_EVENT, vcpu))
                        return 1;
  
 -              err = emulate_instruction(vcpu, EMULTYPE_NO_REEXECUTE);
 +              err = emulate_instruction(vcpu, 0);
  
                if (err == EMULATE_USER_EXIT) {
                        ++vcpu->stat.mmio_exits;
@@@ -6748,10 -6720,16 +6756,10 @@@ static __init int hardware_setup(void
                        goto out;
        }
  
 -      vmx_io_bitmap_b = (unsigned long *)__get_free_page(GFP_KERNEL);
        memset(vmx_vmread_bitmap, 0xff, PAGE_SIZE);
        memset(vmx_vmwrite_bitmap, 0xff, PAGE_SIZE);
  
 -      /*
 -       * Allow direct access to the PC debug port (it is often used for I/O
 -       * delays, but the vmexits simply slow things down).
 -       */
        memset(vmx_io_bitmap_a, 0xff, PAGE_SIZE);
 -      clear_bit(0x80, vmx_io_bitmap_a);
  
        memset(vmx_io_bitmap_b, 0xff, PAGE_SIZE);
  
  
        if (!cpu_has_vmx_ept() ||
            !cpu_has_vmx_ept_4levels() ||
 -          !cpu_has_vmx_ept_mt_wb()) {
 +          !cpu_has_vmx_ept_mt_wb() ||
 +          !cpu_has_vmx_invept_global())
                enable_ept = 0;
 -              enable_unrestricted_guest = 0;
 -              enable_ept_ad_bits = 0;
 -      }
  
        if (!cpu_has_vmx_ept_ad_bits() || !enable_ept)
                enable_ept_ad_bits = 0;
  
 -      if (!cpu_has_vmx_unrestricted_guest())
 +      if (!cpu_has_vmx_unrestricted_guest() || !enable_ept)
                enable_unrestricted_guest = 0;
  
        if (!cpu_has_vmx_flexpriority())
                flexpriority_enabled = 0;
  
 +      if (!cpu_has_virtual_nmis())
 +              enable_vnmi = 0;
 +
        /*
         * set_apic_access_page_addr() is used to reload apic access
         * page upon invalidation.  No need to do anything if not
        if (enable_ept && !cpu_has_vmx_ept_2m_page())
                kvm_disable_largepages();
  
 -      if (!cpu_has_vmx_ple())
 +      if (!cpu_has_vmx_ple()) {
                ple_gap = 0;
 +              ple_window = 0;
 +              ple_window_grow = 0;
 +              ple_window_max = 0;
 +              ple_window_shrink = 0;
 +      }
  
        if (!cpu_has_vmx_apicv()) {
                enable_apicv = 0;
@@@ -6997,7 -6969,7 +7005,7 @@@ static struct loaded_vmcs *nested_get_c
        }
  
        /* Create a new VMCS */
 -      item = kmalloc(sizeof(struct vmcs02_list), GFP_KERNEL);
 +      item = kzalloc(sizeof(struct vmcs02_list), GFP_KERNEL);
        if (!item)
                return NULL;
        item->vmcs02.vmcs = alloc_vmcs();
@@@ -7406,11 -7378,10 +7414,11 @@@ static inline void nested_release_vmcs1
   */
  static void free_nested(struct vcpu_vmx *vmx)
  {
 -      if (!vmx->nested.vmxon)
 +      if (!vmx->nested.vmxon && !vmx->nested.smm.vmxon)
                return;
  
        vmx->nested.vmxon = false;
 +      vmx->nested.smm.vmxon = false;
        free_vpid(vmx->nested.vpid02);
        vmx->nested.posted_intr_nv = -1;
        vmx->nested.current_vmptr = -1ull;
@@@ -8015,7 -7986,6 +8023,7 @@@ static int handle_pml_full(struct kvm_v
         * "blocked by NMI" bit has to be set before next VM entry.
         */
        if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) &&
 +                      enable_vnmi &&
                        (exit_qualification & INTR_INFO_UNBLOCK_NMI))
                vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
                                GUEST_INTR_STATE_NMI);
@@@ -8453,9 -8423,9 +8461,9 @@@ static bool nested_vmx_exit_reflected(s
        case EXIT_REASON_RDPMC:
                return nested_cpu_has(vmcs12, CPU_BASED_RDPMC_EXITING);
        case EXIT_REASON_RDRAND:
 -              return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDRAND);
 +              return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDRAND_EXITING);
        case EXIT_REASON_RDSEED:
 -              return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDSEED);
 +              return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDSEED_EXITING);
        case EXIT_REASON_RDTSC: case EXIT_REASON_RDTSCP:
                return nested_cpu_has(vmcs12, CPU_BASED_RDTSC_EXITING);
        case EXIT_REASON_VMCALL: case EXIT_REASON_VMCLEAR:
@@@ -8860,25 -8830,6 +8868,25 @@@ static int vmx_handle_exit(struct kvm_v
                return 0;
        }
  
 +      if (unlikely(!enable_vnmi &&
 +                   vmx->loaded_vmcs->soft_vnmi_blocked)) {
 +              if (vmx_interrupt_allowed(vcpu)) {
 +                      vmx->loaded_vmcs->soft_vnmi_blocked = 0;
 +              } else if (vmx->loaded_vmcs->vnmi_blocked_time > 1000000000LL &&
 +                         vcpu->arch.nmi_pending) {
 +                      /*
 +                       * This CPU don't support us in finding the end of an
 +                       * NMI-blocked window if the guest runs with IRQs
 +                       * disabled. So we pull the trigger after 1 s of
 +                       * futile waiting, but inform the user about this.
 +                       */
 +                      printk(KERN_WARNING "%s: Breaking out of NMI-blocked "
 +                             "state on VCPU %d after 1 s timeout\n",
 +                             __func__, vcpu->vcpu_id);
 +                      vmx->loaded_vmcs->soft_vnmi_blocked = 0;
 +              }
 +      }
 +
        if (exit_reason < kvm_vmx_max_exit_handlers
            && kvm_vmx_exit_handlers[exit_reason])
                return kvm_vmx_exit_handlers[exit_reason](vcpu);
@@@ -9161,38 -9112,33 +9169,38 @@@ static void vmx_recover_nmi_blocking(st
  
        idtv_info_valid = vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK;
  
 -      if (vmx->loaded_vmcs->nmi_known_unmasked)
 -              return;
 -      /*
 -       * Can't use vmx->exit_intr_info since we're not sure what
 -       * the exit reason is.
 -       */
 -      exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
 -      unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0;
 -      vector = exit_intr_info & INTR_INFO_VECTOR_MASK;
 -      /*
 -       * SDM 3: 27.7.1.2 (September 2008)
 -       * Re-set bit "block by NMI" before VM entry if vmexit caused by
 -       * a guest IRET fault.
 -       * SDM 3: 23.2.2 (September 2008)
 -       * Bit 12 is undefined in any of the following cases:
 -       *  If the VM exit sets the valid bit in the IDT-vectoring
 -       *   information field.
 -       *  If the VM exit is due to a double fault.
 -       */
 -      if ((exit_intr_info & INTR_INFO_VALID_MASK) && unblock_nmi &&
 -          vector != DF_VECTOR && !idtv_info_valid)
 -              vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
 -                            GUEST_INTR_STATE_NMI);
 -      else
 -              vmx->loaded_vmcs->nmi_known_unmasked =
 -                      !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO)
 -                        & GUEST_INTR_STATE_NMI);
 +      if (enable_vnmi) {
 +              if (vmx->loaded_vmcs->nmi_known_unmasked)
 +                      return;
 +              /*
 +               * Can't use vmx->exit_intr_info since we're not sure what
 +               * the exit reason is.
 +               */
 +              exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
 +              unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0;
 +              vector = exit_intr_info & INTR_INFO_VECTOR_MASK;
 +              /*
 +               * SDM 3: 27.7.1.2 (September 2008)
 +               * Re-set bit "block by NMI" before VM entry if vmexit caused by
 +               * a guest IRET fault.
 +               * SDM 3: 23.2.2 (September 2008)
 +               * Bit 12 is undefined in any of the following cases:
 +               *  If the VM exit sets the valid bit in the IDT-vectoring
 +               *   information field.
 +               *  If the VM exit is due to a double fault.
 +               */
 +              if ((exit_intr_info & INTR_INFO_VALID_MASK) && unblock_nmi &&
 +                  vector != DF_VECTOR && !idtv_info_valid)
 +                      vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
 +                                    GUEST_INTR_STATE_NMI);
 +              else
 +                      vmx->loaded_vmcs->nmi_known_unmasked =
 +                              !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO)
 +                                & GUEST_INTR_STATE_NMI);
 +      } else if (unlikely(vmx->loaded_vmcs->soft_vnmi_blocked))
 +              vmx->loaded_vmcs->vnmi_blocked_time +=
 +                      ktime_to_ns(ktime_sub(ktime_get(),
 +                                            vmx->loaded_vmcs->entry_time));
  }
  
  static void __vmx_complete_interrupts(struct kvm_vcpu *vcpu,
@@@ -9309,11 -9255,6 +9317,11 @@@ static void __noclone vmx_vcpu_run(stru
        struct vcpu_vmx *vmx = to_vmx(vcpu);
        unsigned long debugctlmsr, cr3, cr4;
  
 +      /* Record the guest's net vcpu time for enforced NMI injections. */
 +      if (unlikely(!enable_vnmi &&
 +                   vmx->loaded_vmcs->soft_vnmi_blocked))
 +              vmx->loaded_vmcs->entry_time = ktime_get();
 +
        /* Don't enter VMX if guest state is invalid, let the exit handler
           start emulation until we arrive back to a valid state */
        if (vmx->emulation_required)
                /* Save guest registers, load host registers, keep flags */
                "mov %0, %c[wordsize](%%" _ASM_SP ") \n\t"
                "pop %0 \n\t"
 +              "setbe %c[fail](%0)\n\t"
                "mov %%" _ASM_AX ", %c[rax](%0) \n\t"
                "mov %%" _ASM_BX ", %c[rbx](%0) \n\t"
                __ASM_SIZE(pop) " %c[rcx](%0) \n\t"
                "mov %%r13, %c[r13](%0) \n\t"
                "mov %%r14, %c[r14](%0) \n\t"
                "mov %%r15, %c[r15](%0) \n\t"
 +              "xor %%r8d,  %%r8d \n\t"
 +              "xor %%r9d,  %%r9d \n\t"
 +              "xor %%r10d, %%r10d \n\t"
 +              "xor %%r11d, %%r11d \n\t"
 +              "xor %%r12d, %%r12d \n\t"
 +              "xor %%r13d, %%r13d \n\t"
 +              "xor %%r14d, %%r14d \n\t"
 +              "xor %%r15d, %%r15d \n\t"
  #endif
                "mov %%cr2, %%" _ASM_AX "   \n\t"
                "mov %%" _ASM_AX ", %c[cr2](%0) \n\t"
  
 +              "xor %%eax, %%eax \n\t"
 +              "xor %%ebx, %%ebx \n\t"
 +              "xor %%esi, %%esi \n\t"
 +              "xor %%edi, %%edi \n\t"
                "pop  %%" _ASM_BP "; pop  %%" _ASM_DX " \n\t"
 -              "setbe %c[fail](%0) \n\t"
                ".pushsection .rodata \n\t"
                ".global vmx_return \n\t"
                "vmx_return: " _ASM_PTR " 2b \n\t"
@@@ -9554,6 -9483,7 +9562,6 @@@ static void vmx_switch_vmcs(struct kvm_
        vmx->loaded_vmcs = vmcs;
        vmx_vcpu_put(vcpu);
        vmx_vcpu_load(vcpu, cpu);
 -      vcpu->cpu = cpu;
        put_cpu();
  }
  
@@@ -9634,9 -9564,11 +9642,9 @@@ static struct kvm_vcpu *vmx_create_vcpu
        cpu = get_cpu();
        vmx_vcpu_load(&vmx->vcpu, cpu);
        vmx->vcpu.cpu = cpu;
 -      err = vmx_vcpu_setup(vmx);
 +      vmx_vcpu_setup(vmx);
        vmx_vcpu_put(&vmx->vcpu);
        put_cpu();
 -      if (err)
 -              goto free_vmcs;
        if (cpu_need_virtualize_apic_accesses(&vmx->vcpu)) {
                err = alloc_apic_access_page(kvm);
                if (err)
        }
  
        if (enable_ept) {
 -              if (!kvm->arch.ept_identity_map_addr)
 -                      kvm->arch.ept_identity_map_addr =
 -                              VMX_EPT_IDENTITY_PAGETABLE_ADDR;
                err = init_rmode_identity_map(kvm);
                if (err)
                        goto free_vmcs;
@@@ -9805,7 -9740,8 +9813,7 @@@ static void nested_vmx_cr_fixed1_bits_u
        cr4_fixed1_update(X86_CR4_SMEP,       ebx, bit(X86_FEATURE_SMEP));
        cr4_fixed1_update(X86_CR4_SMAP,       ebx, bit(X86_FEATURE_SMAP));
        cr4_fixed1_update(X86_CR4_PKE,        ecx, bit(X86_FEATURE_PKU));
 -      /* TODO: Use X86_CR4_UMIP and X86_FEATURE_UMIP macros */
 -      cr4_fixed1_update(bit(11),            ecx, bit(2));
 +      cr4_fixed1_update(X86_CR4_UMIP,       ecx, bit(X86_FEATURE_UMIP));
  
  #undef cr4_fixed1_update
  }
@@@ -10879,11 -10815,6 +10887,11 @@@ static int check_vmentry_postreqs(struc
                        return 1;
        }
  
 +      if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS) &&
 +              (is_noncanonical_address(vmcs12->guest_bndcfgs & PAGE_MASK, vcpu) ||
 +              (vmcs12->guest_bndcfgs & MSR_IA32_BNDCFGS_RSVD)))
 +                      return 1;
 +
        return 0;
  }
  
@@@ -11108,12 -11039,13 +11116,12 @@@ static int vmx_check_nested_events(stru
  {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
        unsigned long exit_qual;
 -
 -      if (kvm_event_needs_reinjection(vcpu))
 -              return -EBUSY;
 +      bool block_nested_events =
 +          vmx->nested.nested_run_pending || kvm_event_needs_reinjection(vcpu);
  
        if (vcpu->arch.exception.pending &&
                nested_vmx_check_exception(vcpu, &exit_qual)) {
 -              if (vmx->nested.nested_run_pending)
 +              if (block_nested_events)
                        return -EBUSY;
                nested_vmx_inject_exception_vmexit(vcpu, exit_qual);
                vcpu->arch.exception.pending = false;
  
        if (nested_cpu_has_preemption_timer(get_vmcs12(vcpu)) &&
            vmx->nested.preemption_timer_expired) {
 -              if (vmx->nested.nested_run_pending)
 +              if (block_nested_events)
                        return -EBUSY;
                nested_vmx_vmexit(vcpu, EXIT_REASON_PREEMPTION_TIMER, 0, 0);
                return 0;
        }
  
        if (vcpu->arch.nmi_pending && nested_exit_on_nmi(vcpu)) {
 -              if (vmx->nested.nested_run_pending)
 +              if (block_nested_events)
                        return -EBUSY;
                nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI,
                                  NMI_VECTOR | INTR_TYPE_NMI_INTR |
  
        if ((kvm_cpu_has_interrupt(vcpu) || external_intr) &&
            nested_exit_on_intr(vcpu)) {
 -              if (vmx->nested.nested_run_pending)
 +              if (block_nested_events)
                        return -EBUSY;
                nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, 0, 0);
                return 0;
@@@ -11332,24 -11264,6 +11340,24 @@@ static void prepare_vmcs12(struct kvm_v
        kvm_clear_interrupt_queue(vcpu);
  }
  
 +static void load_vmcs12_mmu_host_state(struct kvm_vcpu *vcpu,
 +                      struct vmcs12 *vmcs12)
 +{
 +      u32 entry_failure_code;
 +
 +      nested_ept_uninit_mmu_context(vcpu);
 +
 +      /*
 +       * Only PDPTE load can fail as the value of cr3 was checked on entry and
 +       * couldn't have changed.
 +       */
 +      if (nested_vmx_load_cr3(vcpu, vmcs12->host_cr3, false, &entry_failure_code))
 +              nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_PDPTE_FAIL);
 +
 +      if (!enable_ept)
 +              vcpu->arch.walk_mmu->inject_page_fault = kvm_inject_page_fault;
 +}
 +
  /*
   * A part of what we need to when the nested L2 guest exits and we want to
   * run its L1 parent, is to reset L1's guest state to the host state specified
@@@ -11363,6 -11277,7 +11371,6 @@@ static void load_vmcs12_host_state(stru
                                   struct vmcs12 *vmcs12)
  {
        struct kvm_segment seg;
 -      u32 entry_failure_code;
  
        if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER)
                vcpu->arch.efer = vmcs12->host_ia32_efer;
        vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK);
        vmx_set_cr4(vcpu, vmcs12->host_cr4);
  
 -      nested_ept_uninit_mmu_context(vcpu);
 -
 -      /*
 -       * Only PDPTE load can fail as the value of cr3 was checked on entry and
 -       * couldn't have changed.
 -       */
 -      if (nested_vmx_load_cr3(vcpu, vmcs12->host_cr3, false, &entry_failure_code))
 -              nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_PDPTE_FAIL);
 -
 -      if (!enable_ept)
 -              vcpu->arch.walk_mmu->inject_page_fault = kvm_inject_page_fault;
 +      load_vmcs12_mmu_host_state(vcpu, vmcs12);
  
        if (enable_vpid) {
                /*
        vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->host_ia32_sysenter_eip);
        vmcs_writel(GUEST_IDTR_BASE, vmcs12->host_idtr_base);
        vmcs_writel(GUEST_GDTR_BASE, vmcs12->host_gdtr_base);
 +      vmcs_write32(GUEST_IDTR_LIMIT, 0xFFFF);
 +      vmcs_write32(GUEST_GDTR_LIMIT, 0xFFFF);
  
        /* If not VM_EXIT_CLEAR_BNDCFGS, the L2 value propagates to L1.  */
        if (vmcs12->vm_exit_controls & VM_EXIT_CLEAR_BNDCFGS)
@@@ -11506,11 -11429,8 +11514,11 @@@ static void nested_vmx_vmexit(struct kv
        leave_guest_mode(vcpu);
  
        if (likely(!vmx->fail)) {
 -              prepare_vmcs12(vcpu, vmcs12, exit_reason, exit_intr_info,
 -                             exit_qualification);
 +              if (exit_reason == -1)
 +                      sync_vmcs12(vcpu, vmcs12);
 +              else
 +                      prepare_vmcs12(vcpu, vmcs12, exit_reason, exit_intr_info,
 +                                     exit_qualification);
  
                if (nested_vmx_store_msr(vcpu, vmcs12->vm_exit_msr_store_addr,
                                         vmcs12->vm_exit_msr_store_count))
         */
        kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);
  
 -      if (enable_shadow_vmcs)
 +      if (enable_shadow_vmcs && exit_reason != -1)
                vmx->nested.sync_shadow_vmcs = true;
  
        /* in case we halted in L2 */
                                INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR;
                }
  
 -              trace_kvm_nested_vmexit_inject(vmcs12->vm_exit_reason,
 -                                             vmcs12->exit_qualification,
 -                                             vmcs12->idt_vectoring_info_field,
 -                                             vmcs12->vm_exit_intr_info,
 -                                             vmcs12->vm_exit_intr_error_code,
 -                                             KVM_ISA_VMX);
 +              if (exit_reason != -1)
 +                      trace_kvm_nested_vmexit_inject(vmcs12->vm_exit_reason,
 +                                                     vmcs12->exit_qualification,
 +                                                     vmcs12->idt_vectoring_info_field,
 +                                                     vmcs12->vm_exit_intr_info,
 +                                                     vmcs12->vm_exit_intr_error_code,
 +                                                     KVM_ISA_VMX);
  
                load_vmcs12_host_state(vcpu, vmcs12);
  
         * accordingly.
         */
        nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
 +
 +      load_vmcs12_mmu_host_state(vcpu, vmcs12);
 +
        /*
         * The emulated instruction was already skipped in
         * nested_vmx_run, but the updated RIP was never
@@@ -12030,54 -11946,6 +12038,54 @@@ static void vmx_setup_mce(struct kvm_vc
                        ~FEATURE_CONTROL_LMCE;
  }
  
 +static int vmx_smi_allowed(struct kvm_vcpu *vcpu)
 +{
 +      /* we need a nested vmexit to enter SMM, postpone if run is pending */
 +      if (to_vmx(vcpu)->nested.nested_run_pending)
 +              return 0;
 +      return 1;
 +}
 +
 +static int vmx_pre_enter_smm(struct kvm_vcpu *vcpu, char *smstate)
 +{
 +      struct vcpu_vmx *vmx = to_vmx(vcpu);
 +
 +      vmx->nested.smm.guest_mode = is_guest_mode(vcpu);
 +      if (vmx->nested.smm.guest_mode)
 +              nested_vmx_vmexit(vcpu, -1, 0, 0);
 +
 +      vmx->nested.smm.vmxon = vmx->nested.vmxon;
 +      vmx->nested.vmxon = false;
 +      return 0;
 +}
 +
 +static int vmx_pre_leave_smm(struct kvm_vcpu *vcpu, u64 smbase)
 +{
 +      struct vcpu_vmx *vmx = to_vmx(vcpu);
 +      int ret;
 +
 +      if (vmx->nested.smm.vmxon) {
 +              vmx->nested.vmxon = true;
 +              vmx->nested.smm.vmxon = false;
 +      }
 +
 +      if (vmx->nested.smm.guest_mode) {
 +              vcpu->arch.hflags &= ~HF_SMM_MASK;
 +              ret = enter_vmx_non_root_mode(vcpu, false);
 +              vcpu->arch.hflags |= HF_SMM_MASK;
 +              if (ret)
 +                      return ret;
 +
 +              vmx->nested.smm.guest_mode = false;
 +      }
 +      return 0;
 +}
 +
 +static int enable_smi_window(struct kvm_vcpu *vcpu)
 +{
 +      return 0;
 +}
 +
  static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
        .cpu_has_kvm_support = cpu_has_kvm_support,
        .disabled_by_bios = vmx_disabled_by_bios,
  #endif
  
        .setup_mce = vmx_setup_mce,
 +
 +      .smi_allowed = vmx_smi_allowed,
 +      .pre_enter_smm = vmx_pre_enter_smm,
 +      .pre_leave_smm = vmx_pre_leave_smm,
 +      .enable_smi_window = enable_smi_window,
  };
  
  static int __init vmx_init(void)