Merge tag 'kvm-arm-for-5.3' of git://git.kernel.org/pub/scm/linux/kernel/git/kvmarm...

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 98f6e4f..15d2c06 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -140,9 +140,6 @@ module_param(dbg, bool, 0644);
 
 #include <trace/events/kvm.h>
 
-#define CREATE_TRACE_POINTS
-#include "mmutrace.h"
-
 #define SPTE_HOST_WRITEABLE    (1ULL << PT_FIRST_AVAIL_BITS_SHIFT)
 #define SPTE_MMU_WRITEABLE     (1ULL << (PT_FIRST_AVAIL_BITS_SHIFT + 1))
 
@@ -259,11 +256,20 @@ static const u64 shadow_nonpresent_or_rsvd_mask_len = 5;
  */
 static u64 __read_mostly shadow_nonpresent_or_rsvd_lower_gfn_mask;
 
+/*
+ * The number of non-reserved physical address bits irrespective of features
+ * that repurpose legal bits, e.g. MKTME.
+ */
+static u8 __read_mostly shadow_phys_bits;
 
 static void mmu_spte_set(u64 *sptep, u64 spte);
+static bool is_executable_pte(u64 spte);
 static union kvm_mmu_page_role
 kvm_mmu_calc_root_page_role(struct kvm_vcpu *vcpu);
 
+#define CREATE_TRACE_POINTS
+#include "mmutrace.h"
+
 
 static inline bool kvm_available_flush_tlb_with_range(void)
 {
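
Note: the CREATE_TRACE_POINTS block dropped in the previous hunk is re-added here, below the new forward declaration of is_executable_pte(), presumably so that the tracepoints in "mmutrace.h" can use that helper; defining CREATE_TRACE_POINTS before including a trace header is what turns the header's tracepoint declarations into definitions in this one translation unit.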
@@ -468,6 +474,21 @@ void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
 }
 EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes);
 
+static u8 kvm_get_shadow_phys_bits(void)
+{
+       /*
+        * boot_cpu_data.x86_phys_bits is reduced when MKTME is detected
+        * in CPU detection code, but MKTME treats those reduced bits as
+        * 'keyID', so they are not reserved bits.  Therefore, for MKTME we
+        * should still return the physical address bits reported by CPUID.
+        */
+       if (!boot_cpu_has(X86_FEATURE_TME) ||
+           WARN_ON_ONCE(boot_cpu_data.extended_cpuid_level < 0x80000008))
+               return boot_cpu_data.x86_phys_bits;
+
+       return cpuid_eax(0x80000008) & 0xff;
+}
+
 static void kvm_mmu_reset_all_pte_masks(void)
 {
        u8 low_phys_bits;
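
For reference, the "physical address bits reported by CPUID" that the new helper falls back to is leaf 0x80000008, EAX bits 7:0 (MAXPHYADDR), which is what cpuid_eax(0x80000008) & 0xff extracts. A minimal userspace sketch, not part of the patch, that reads the same value via the compiler's <cpuid.h> instead of the kernel's cpuid_eax():

    #include <cpuid.h>
    #include <stdio.h>

    int main(void)
    {
            unsigned int eax, ebx, ecx, edx;

            /* CPUID leaf 0x80000008: EAX bits 7:0 = physical address width. */
            if (__get_cpuid(0x80000008, &eax, &ebx, &ecx, &edx))
                    printf("MAXPHYADDR: %u bits\n", eax & 0xff);
            else
                    printf("CPUID leaf 0x80000008 not supported\n");
            return 0;
    }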
@@ -481,6 +502,8 @@ static void kvm_mmu_reset_all_pte_masks(void)
        shadow_present_mask = 0;
        shadow_acc_track_mask = 0;
 
+       shadow_phys_bits = kvm_get_shadow_phys_bits();
+
        /*
         * If the CPU has 46 or less physical address bits, then set an
         * appropriate mask to guard against L1TF attacks. Otherwise, it is
@@ -1073,10 +1096,16 @@ static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index)
 
 static void kvm_mmu_page_set_gfn(struct kvm_mmu_page *sp, int index, gfn_t gfn)
 {
-       if (sp->role.direct)
-               BUG_ON(gfn != kvm_mmu_page_get_gfn(sp, index));
-       else
+       if (!sp->role.direct) {
                sp->gfns[index] = gfn;
+               return;
+       }
+
+       if (WARN_ON(gfn != kvm_mmu_page_get_gfn(sp, index)))
+               pr_err_ratelimited("gfn mismatch under direct page %llx "
+                                  "(expected %llx, got %llx)\n",
+                                  sp->gfn,
+                                  kvm_mmu_page_get_gfn(sp, index), gfn);
 }
 
 /*
@@ -3055,10 +3084,7 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access,
                ret = RET_PF_EMULATE;
 
        pgprintk("%s: setting spte %llx\n", __func__, *sptep);
-       pgprintk("instantiating %s PTE (%s) at %llx (%llx) addr %p\n",
-                is_large_pte(*sptep)? "2MB" : "4kB",
-                *sptep & PT_WRITABLE_MASK ? "RW" : "R", gfn,
-                *sptep, sptep);
+       trace_kvm_mmu_set_spte(level, gfn, sptep);
        if (!was_rmapped && is_large_pte(*sptep))
                ++vcpu->kvm->stat.lpages;
 
@@ -3070,8 +3096,6 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access,
                }
        }
 
-       kvm_release_pfn_clean(pfn);
-
        return ret;
 }
 
@@ -3106,9 +3130,11 @@ static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu,
        if (ret <= 0)
                return -1;
 
-       for (i = 0; i < ret; i++, gfn++, start++)
+       for (i = 0; i < ret; i++, gfn++, start++) {
                mmu_set_spte(vcpu, start, access, 0, sp->role.level, gfn,
                             page_to_pfn(pages[i]), true, true);
+               put_page(pages[i]);
+       }
 
        return 0;
 }
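
Taken together, the last two hunks move pfn reference ownership out of mmu_set_spte(): with the kvm_release_pfn_clean() call removed there, each caller now balances its own reference. direct_pte_prefetch_many() above drops the references it took with put_page() per prefetched page, and the page-fault handlers below are restructured so that both the success and the retry paths fall through to the kvm_release_pfn_clean() after the out_unlock label.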
@@ -3156,40 +3182,40 @@ static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep)
        __direct_pte_prefetch(vcpu, sp, sptep);
 }
 
-static int __direct_map(struct kvm_vcpu *vcpu, int write, int map_writable,
-                       int level, gfn_t gfn, kvm_pfn_t pfn, bool prefault)
+static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, int write,
+                       int map_writable, int level, kvm_pfn_t pfn,
+                       bool prefault)
 {
-       struct kvm_shadow_walk_iterator iterator;
+       struct kvm_shadow_walk_iterator it;
        struct kvm_mmu_page *sp;
-       int emulate = 0;
-       gfn_t pseudo_gfn;
+       int ret;
+       gfn_t gfn = gpa >> PAGE_SHIFT;
+       gfn_t base_gfn = gfn;
 
        if (!VALID_PAGE(vcpu->arch.mmu->root_hpa))
-               return 0;
+               return RET_PF_RETRY;
 
-       for_each_shadow_entry(vcpu, (u64)gfn << PAGE_SHIFT, iterator) {
-               if (iterator.level == level) {
-                       emulate = mmu_set_spte(vcpu, iterator.sptep, ACC_ALL,
-                                              write, level, gfn, pfn, prefault,
-                                              map_writable);
-                       direct_pte_prefetch(vcpu, iterator.sptep);
-                       ++vcpu->stat.pf_fixed;
+       trace_kvm_mmu_spte_requested(gpa, level, pfn);
+       for_each_shadow_entry(vcpu, gpa, it) {
+               base_gfn = gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1);
+               if (it.level == level)
                        break;
-               }
 
-               drop_large_spte(vcpu, iterator.sptep);
-               if (!is_shadow_present_pte(*iterator.sptep)) {
-                       u64 base_addr = iterator.addr;
+               drop_large_spte(vcpu, it.sptep);
+               if (!is_shadow_present_pte(*it.sptep)) {
+                       sp = kvm_mmu_get_page(vcpu, base_gfn, it.addr,
+                                             it.level - 1, true, ACC_ALL);
 
-                       base_addr &= PT64_LVL_ADDR_MASK(iterator.level);
-                       pseudo_gfn = base_addr >> PAGE_SHIFT;
-                       sp = kvm_mmu_get_page(vcpu, pseudo_gfn, iterator.addr,
-                                             iterator.level - 1, 1, ACC_ALL);
-
-                       link_shadow_page(vcpu, iterator.sptep, sp);
+                       link_shadow_page(vcpu, it.sptep, sp);
                }
        }
-       return emulate;
+
+       ret = mmu_set_spte(vcpu, it.sptep, ACC_ALL,
+                          write, level, base_gfn, pfn, prefault,
+                          map_writable);
+       direct_pte_prefetch(vcpu, it.sptep);
+       ++vcpu->stat.pf_fixed;
+       return ret;
 }
 
 static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct *tsk)
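
The reworked walk computes base_gfn = gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1) at each level, i.e. the first guest frame of the (huge) page that the leaf SPTE will map. A standalone sketch with a hypothetical gfn, where PAGES_PER_HPAGE() is a local stand-in for the kernel's KVM_PAGES_PER_HPAGE() assuming 4KiB base pages and 9 bits per page-table level:

    #include <stdint.h>
    #include <stdio.h>

    /* Stand-in for KVM_PAGES_PER_HPAGE(): 4KiB base pages, 9 bits per level. */
    #define PAGES_PER_HPAGE(level)  (1ULL << (((level) - 1) * 9))

    int main(void)
    {
            uint64_t gfn = 0x12345;         /* hypothetical guest frame number */
            int level = 2;                  /* 2MiB mapping */
            uint64_t base_gfn = gfn & ~(PAGES_PER_HPAGE(level) - 1);

            /* 0x12345 & ~0x1ff == 0x12200, the first frame of the 2MiB region. */
            printf("gfn %#llx -> base_gfn %#llx\n",
                   (unsigned long long)gfn, (unsigned long long)base_gfn);
            return 0;
    }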
@@ -3216,11 +3242,10 @@ static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn)
 }
 
 static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu,
-                                       gfn_t *gfnp, kvm_pfn_t *pfnp,
+                                       gfn_t gfn, kvm_pfn_t *pfnp,
                                        int *levelp)
 {
        kvm_pfn_t pfn = *pfnp;
-       gfn_t gfn = *gfnp;
        int level = *levelp;
 
        /*
@@ -3247,8 +3272,6 @@ static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu,
                mask = KVM_PAGES_PER_HPAGE(level) - 1;
                VM_BUG_ON((gfn & mask) != (pfn & mask));
                if (pfn & mask) {
-                       gfn &= ~mask;
-                       *gfnp = gfn;
                        kvm_release_pfn_clean(pfn);
                        pfn &= ~mask;
                        kvm_get_pfn(pfn);
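
Since the gfn is now passed by value, only the pfn is aligned here; that is sufficient because the VM_BUG_ON() above guarantees gfn and pfn share the same offset inside the huge page, and __direct_map() recomputes the aligned base_gfn from the gpa itself. A small sketch of that invariant with hypothetical numbers:

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
            uint64_t mask = 512 - 1;                /* 2MiB level, 4KiB base pages */
            uint64_t gfn = 0x12345, pfn = 0xabd45;  /* hypothetical, same low bits */

            assert((gfn & mask) == (pfn & mask));   /* the VM_BUG_ON() condition */
            printf("base_gfn %#llx, aligned pfn %#llx\n",
                   (unsigned long long)(gfn & ~mask),
                   (unsigned long long)(pfn & ~mask));
            return 0;
    }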
@@ -3505,22 +3528,19 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code,
        if (handle_abnormal_pfn(vcpu, v, gfn, pfn, ACC_ALL, &r))
                return r;
 
+       r = RET_PF_RETRY;
        spin_lock(&vcpu->kvm->mmu_lock);
        if (mmu_notifier_retry(vcpu->kvm, mmu_seq))
                goto out_unlock;
        if (make_mmu_pages_available(vcpu) < 0)
                goto out_unlock;
        if (likely(!force_pt_level))
-               transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level);
-       r = __direct_map(vcpu, write, map_writable, level, gfn, pfn, prefault);
-       spin_unlock(&vcpu->kvm->mmu_lock);
-
-       return r;
-
+               transparent_hugepage_adjust(vcpu, gfn, &pfn, &level);
+       r = __direct_map(vcpu, v, write, map_writable, level, pfn, prefault);
 out_unlock:
        spin_unlock(&vcpu->kvm->mmu_lock);
        kvm_release_pfn_clean(pfn);
-       return RET_PF_RETRY;
+       return r;
 }
 
 static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa,
@@ -4015,19 +4035,6 @@ static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn)
        return kvm_setup_async_pf(vcpu, gva, kvm_vcpu_gfn_to_hva(vcpu, gfn), &arch);
 }
 
-bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu)
-{
-       if (unlikely(!lapic_in_kernel(vcpu) ||
-                    kvm_event_needs_reinjection(vcpu) ||
-                    vcpu->arch.exception.pending))
-               return false;
-
-       if (!vcpu->arch.apf.delivery_as_pf_vmexit && is_guest_mode(vcpu))
-               return false;
-
-       return kvm_x86_ops->interrupt_allowed(vcpu);
-}
-
 static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
                         gva_t gva, kvm_pfn_t *pfn, bool write, bool *writable)
 {
@@ -4147,22 +4154,19 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
        if (handle_abnormal_pfn(vcpu, 0, gfn, pfn, ACC_ALL, &r))
                return r;
 
+       r = RET_PF_RETRY;
        spin_lock(&vcpu->kvm->mmu_lock);
        if (mmu_notifier_retry(vcpu->kvm, mmu_seq))
                goto out_unlock;
        if (make_mmu_pages_available(vcpu) < 0)
                goto out_unlock;
        if (likely(!force_pt_level))
-               transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level);
-       r = __direct_map(vcpu, write, map_writable, level, gfn, pfn, prefault);
-       spin_unlock(&vcpu->kvm->mmu_lock);
-
-       return r;
-
+               transparent_hugepage_adjust(vcpu, gfn, &pfn, &level);
+       r = __direct_map(vcpu, gpa, write, map_writable, level, pfn, prefault);
 out_unlock:
        spin_unlock(&vcpu->kvm->mmu_lock);
        kvm_release_pfn_clean(pfn);
-       return RET_PF_RETRY;
+       return r;
 }
 
 static void nonpaging_init_context(struct kvm_vcpu *vcpu,
@@ -4494,7 +4498,7 @@ reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context)
         */
        shadow_zero_check = &context->shadow_zero_check;
        __reset_rsvds_bits_mask(vcpu, shadow_zero_check,
-                               boot_cpu_data.x86_phys_bits,
+                               shadow_phys_bits,
                                context->shadow_root_level, uses_nx,
                                guest_cpuid_has(vcpu, X86_FEATURE_GBPAGES),
                                is_pse(vcpu), true);
@@ -4531,13 +4535,13 @@ reset_tdp_shadow_zero_bits_mask(struct kvm_vcpu *vcpu,
 
        if (boot_cpu_is_amd())
                __reset_rsvds_bits_mask(vcpu, shadow_zero_check,
-                                       boot_cpu_data.x86_phys_bits,
+                                       shadow_phys_bits,
                                        context->shadow_root_level, false,
                                        boot_cpu_has(X86_FEATURE_GBPAGES),
                                        true, true);
        else
                __reset_rsvds_bits_mask_ept(shadow_zero_check,
-                                           boot_cpu_data.x86_phys_bits,
+                                           shadow_phys_bits,
                                            false);
 
        if (!shadow_me_mask)
@@ -4558,7 +4562,7 @@ reset_ept_shadow_zero_bits_mask(struct kvm_vcpu *vcpu,
                                struct kvm_mmu *context, bool execonly)
 {
        __reset_rsvds_bits_mask_ept(&context->shadow_zero_check,
-                                   boot_cpu_data.x86_phys_bits, execonly);
+                                   shadow_phys_bits, execonly);
 }
 
 #define BYTE_MASK(access) \
@@ -5935,7 +5939,7 @@ mmu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
        int nr_to_scan = sc->nr_to_scan;
        unsigned long freed = 0;
 
-       spin_lock(&kvm_lock);
+       mutex_lock(&kvm_lock);
 
        list_for_each_entry(kvm, &vm_list, vm_list) {
                int idx;
@@ -5977,7 +5981,7 @@ mmu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
                break;
        }
 
-       spin_unlock(&kvm_lock);
+       mutex_unlock(&kvm_lock);
        return freed;
 }
 
@@ -5999,6 +6003,34 @@ static void mmu_destroy_caches(void)
        kmem_cache_destroy(mmu_page_header_cache);
 }
 
+static void kvm_set_mmio_spte_mask(void)
+{
+       u64 mask;
+
+       /*
+        * Set the reserved bits and the present bit of a paging-structure
+        * entry to generate a page fault with PFER.RSV = 1.
+        */
+
+       /*
+        * Mask the uppermost physical address bit, which would be reserved as
+        * long as the supported physical address width is less than 52.
+        */
+       mask = 1ull << 51;
+
+       /* Set the present bit. */
+       mask |= 1ull;
+
+       /*
+        * If the reserved bit is not supported, clear the present bit to
+        * disable MMIO page faults.
+        */
+       if (IS_ENABLED(CONFIG_X86_64) && shadow_phys_bits == 52)
+               mask &= ~1ull;
+
+       kvm_mmu_set_mmio_spte_mask(mask, mask);
+}
+
 int kvm_mmu_module_init(void)
 {
        int ret = -ENOMEM;
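
Concretely, the mask built above is (1ULL << 51) | 1 = 0x8000000000000001 (reserved bit 51 plus the present bit); on a machine with a 52-bit MAXPHYADDR, bit 51 is no longer reserved, so the present bit is cleared and reserved-bit MMIO caching is effectively disabled. A standalone sketch of the computation (the CONFIG_X86_64 check is omitted, and shadow_phys_bits is taken as a parameter):

    #include <stdint.h>
    #include <stdio.h>

    static uint64_t mmio_spte_mask(uint8_t shadow_phys_bits)
    {
            uint64_t mask = (1ULL << 51) | 1ULL;    /* reserved bit 51 + present */

            if (shadow_phys_bits == 52)             /* bit 51 is a legal PA bit */
                    mask &= ~1ULL;                  /* drop present: no MMIO caching */
            return mask;
    }

    int main(void)
    {
            printf("%#llx\n", (unsigned long long)mmio_spte_mask(46)); /* 0x8000000000000001 */
            printf("%#llx\n", (unsigned long long)mmio_spte_mask(52)); /* 0x8000000000000000 */
            return 0;
    }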
@@ -6015,6 +6047,8 @@ int kvm_mmu_module_init(void)
 
        kvm_mmu_reset_all_pte_masks();
 
+       kvm_set_mmio_spte_mask();
+
        pte_list_desc_cache = kmem_cache_create("pte_list_desc",
                                            sizeof(struct pte_list_desc),
                                            0, SLAB_ACCOUNT, NULL);