Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm

[tomoyo/tomoyo-test1.git] / arch / x86 / kvm / mmu / mmu.c
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c

index ec169f5..e1d011c 100644 (file)
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -25,6 +25,7 @@
  #include "kvm_cache_regs.h"
  #include "smm.h"
  #include "kvm_emulate.h"
+#include "page_track.h"
  #include "cpuid.h"
  #include "spte.h"
  
@@ -53,7 +54,7 @@
  #include <asm/io.h>
  #include <asm/set_memory.h>
  #include <asm/vmx.h>
-#include <asm/kvm_page_track.h>
+
  #include "trace.h"
  
  extern bool itlb_multihit_kvm_mitigation;
@@ -115,11 +116,6 @@ static int max_huge_page_level __read_mostly;
  static int tdp_root_level __read_mostly;
  static int max_tdp_level __read_mostly;
  
-#ifdef MMU_DEBUG
-bool dbg = 0;
-module_param(dbg, bool, 0644);
-#endif
-
  #define PTE_PREFETCH_NUM               8
  
  #include <trace/events/kvm.h>
@@ -278,16 +274,12 @@ static inline bool kvm_available_flush_remote_tlbs_range(void)
         return kvm_x86_ops.flush_remote_tlbs_range;
  }
  
-void kvm_flush_remote_tlbs_range(struct kvm *kvm, gfn_t start_gfn,
-                                gfn_t nr_pages)
+int kvm_arch_flush_remote_tlbs_range(struct kvm *kvm, gfn_t gfn, u64 nr_pages)
  {
-       int ret = -EOPNOTSUPP;
+       if (!kvm_x86_ops.flush_remote_tlbs_range)
+               return -EOPNOTSUPP;
  
-       if (kvm_x86_ops.flush_remote_tlbs_range)
-               ret = static_call(kvm_x86_flush_remote_tlbs_range)(kvm, start_gfn,
-                                                                  nr_pages);
-       if (ret)
-               kvm_flush_remote_tlbs(kvm);
+       return static_call(kvm_x86_flush_remote_tlbs_range)(kvm, gfn, nr_pages);
  }
  
  static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index);
@@ -490,7 +482,7 @@ retry:
   */
  static void mmu_spte_set(u64 *sptep, u64 new_spte)
  {
-       WARN_ON(is_shadow_present_pte(*sptep));
+       WARN_ON_ONCE(is_shadow_present_pte(*sptep));
         __set_spte(sptep, new_spte);
  }
  
@@ -502,7 +494,7 @@ static u64 mmu_spte_update_no_track(u64 *sptep, u64 new_spte)
  {
         u64 old_spte = *sptep;
  
-       WARN_ON(!is_shadow_present_pte(new_spte));
+       WARN_ON_ONCE(!is_shadow_present_pte(new_spte));
         check_spte_writable_invariants(new_spte);
  
         if (!is_shadow_present_pte(old_spte)) {
@@ -515,7 +507,7 @@ static u64 mmu_spte_update_no_track(u64 *sptep, u64 new_spte)
         else
                 old_spte = __update_clear_spte_slow(sptep, new_spte);
  
-       WARN_ON(spte_to_pfn(old_spte) != spte_to_pfn(new_spte));
+       WARN_ON_ONCE(spte_to_pfn(old_spte) != spte_to_pfn(new_spte));
  
         return old_spte;
  }
@@ -597,7 +589,7 @@ static u64 mmu_spte_clear_track_bits(struct kvm *kvm, u64 *sptep)
          * by a refcounted page, the refcount is elevated.
          */
         page = kvm_pfn_to_refcounted_page(pfn);
-       WARN_ON(page && !page_count(page));
+       WARN_ON_ONCE(page && !page_count(page));
  
         if (is_accessed_spte(old_spte))
                 kvm_set_pfn_accessed(pfn);
@@ -812,7 +804,7 @@ static void update_gfn_disallow_lpage_count(const struct kvm_memory_slot *slot,
         for (i = PG_LEVEL_2M; i <= KVM_MAX_HUGEPAGE_LEVEL; ++i) {
                 linfo = lpage_info_slot(gfn, slot, i);
                 linfo->disallow_lpage += count;
-               WARN_ON(linfo->disallow_lpage < 0);
+               WARN_ON_ONCE(linfo->disallow_lpage < 0);
         }
  }
  
@@ -839,8 +831,7 @@ static void account_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
  
         /* the non-leaf shadow pages are keeping readonly. */
         if (sp->role.level > PG_LEVEL_4K)
-               return kvm_slot_page_track_add_page(kvm, slot, gfn,
-                                                   KVM_PAGE_TRACK_WRITE);
+               return __kvm_write_track_add_gfn(kvm, slot, gfn);
  
         kvm_mmu_gfn_disallow_lpage(slot, gfn);
  
@@ -886,8 +877,7 @@ static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
         slots = kvm_memslots_for_spte_role(kvm, sp->role);
         slot = __gfn_to_memslot(slots, gfn);
         if (sp->role.level > PG_LEVEL_4K)
-               return kvm_slot_page_track_remove_page(kvm, slot, gfn,
-                                                      KVM_PAGE_TRACK_WRITE);
+               return __kvm_write_track_remove_gfn(kvm, slot, gfn);
  
         kvm_mmu_gfn_allow_lpage(slot, gfn);
  }
@@ -941,10 +931,8 @@ static int pte_list_add(struct kvm_mmu_memory_cache *cache, u64 *spte,
         int count = 0;
  
         if (!rmap_head->val) {
-               rmap_printk("%p %llx 0->1\n", spte, *spte);
                 rmap_head->val = (unsigned long)spte;
         } else if (!(rmap_head->val & 1)) {
-               rmap_printk("%p %llx 1->many\n", spte, *spte);
                 desc = kvm_mmu_memory_cache_alloc(cache);
                 desc->sptes[0] = (u64 *)rmap_head->val;
                 desc->sptes[1] = spte;
@@ -953,7 +941,6 @@ static int pte_list_add(struct kvm_mmu_memory_cache *cache, u64 *spte,
                 rmap_head->val = (unsigned long)desc | 1;
                 ++count;
         } else {
-               rmap_printk("%p %llx many->many\n", spte, *spte);
                 desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
                 count = desc->tail_count + desc->spte_count;
  
@@ -973,7 +960,8 @@ static int pte_list_add(struct kvm_mmu_memory_cache *cache, u64 *spte,
         return count;
  }
  
-static void pte_list_desc_remove_entry(struct kvm_rmap_head *rmap_head,
+static void pte_list_desc_remove_entry(struct kvm *kvm,
+                                      struct kvm_rmap_head *rmap_head,
                                        struct pte_list_desc *desc, int i)
  {
         struct pte_list_desc *head_desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
@@ -984,7 +972,7 @@ static void pte_list_desc_remove_entry(struct kvm_rmap_head *rmap_head,
          * when adding an entry and the previous head is full, and heads are
          * removed (this flow) when they become empty.
          */
-       BUG_ON(j < 0);
+       KVM_BUG_ON_DATA_CORRUPTION(j < 0, kvm);
  
         /*
          * Replace the to-be-freed SPTE with the last valid entry from the head
@@ -1009,35 +997,34 @@ static void pte_list_desc_remove_entry(struct kvm_rmap_head *rmap_head,
         mmu_free_pte_list_desc(head_desc);
  }
  
-static void pte_list_remove(u64 *spte, struct kvm_rmap_head *rmap_head)
+static void pte_list_remove(struct kvm *kvm, u64 *spte,
+                           struct kvm_rmap_head *rmap_head)
  {
         struct pte_list_desc *desc;
         int i;
  
-       if (!rmap_head->val) {
-               pr_err("%s: %p 0->BUG\n", __func__, spte);
-               BUG();
-       } else if (!(rmap_head->val & 1)) {
-               rmap_printk("%p 1->0\n", spte);
-               if ((u64 *)rmap_head->val != spte) {
-                       pr_err("%s:  %p 1->BUG\n", __func__, spte);
-                       BUG();
-               }
+       if (KVM_BUG_ON_DATA_CORRUPTION(!rmap_head->val, kvm))
+               return;
+
+       if (!(rmap_head->val & 1)) {
+               if (KVM_BUG_ON_DATA_CORRUPTION((u64 *)rmap_head->val != spte, kvm))
+                       return;
+
                 rmap_head->val = 0;
         } else {
-               rmap_printk("%p many->many\n", spte);
                 desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
                 while (desc) {
                         for (i = 0; i < desc->spte_count; ++i) {
                                 if (desc->sptes[i] == spte) {
-                                       pte_list_desc_remove_entry(rmap_head, desc, i);
+                                       pte_list_desc_remove_entry(kvm, rmap_head,
+                                                                  desc, i);
                                         return;
                                 }
                         }
                         desc = desc->more;
                 }
-               pr_err("%s: %p many->many\n", __func__, spte);
-               BUG();
+
+               KVM_BUG_ON_DATA_CORRUPTION(true, kvm);
         }
  }
  
@@ -1045,7 +1032,7 @@ static void kvm_zap_one_rmap_spte(struct kvm *kvm,
                                   struct kvm_rmap_head *rmap_head, u64 *sptep)
  {
         mmu_spte_clear_track_bits(kvm, sptep);
-       pte_list_remove(sptep, rmap_head);
+       pte_list_remove(kvm, sptep, rmap_head);
  }
  
  /* Return true if at least one SPTE was zapped, false otherwise */
@@ -1120,7 +1107,7 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
         slot = __gfn_to_memslot(slots, gfn);
         rmap_head = gfn_to_rmap(gfn, sp->role.level, slot);
  
-       pte_list_remove(spte, rmap_head);
+       pte_list_remove(kvm, spte, rmap_head);
  }
  
  /*
@@ -1212,7 +1199,7 @@ static void drop_large_spte(struct kvm *kvm, u64 *sptep, bool flush)
         struct kvm_mmu_page *sp;
  
         sp = sptep_to_sp(sptep);
-       WARN_ON(sp->role.level == PG_LEVEL_4K);
+       WARN_ON_ONCE(sp->role.level == PG_LEVEL_4K);
  
         drop_spte(kvm, sptep);
  
@@ -1241,8 +1228,6 @@ static bool spte_write_protect(u64 *sptep, bool pt_protect)
             !(pt_protect && is_mmu_writable_spte(spte)))
                 return false;
  
-       rmap_printk("spte %p %llx\n", sptep, *sptep);
-
         if (pt_protect)
                 spte &= ~shadow_mmu_writable_mask;
         spte = spte & ~PT_WRITABLE_MASK;
@@ -1267,9 +1252,7 @@ static bool spte_clear_dirty(u64 *sptep)
  {
         u64 spte = *sptep;
  
-       rmap_printk("spte %p %llx\n", sptep, *sptep);
-
-       MMU_WARN_ON(!spte_ad_enabled(spte));
+       KVM_MMU_WARN_ON(!spte_ad_enabled(spte));
         spte &= ~shadow_dirty_mask;
         return mmu_spte_update(sptep, spte);
  }
@@ -1475,14 +1458,11 @@ static bool kvm_set_pte_rmap(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
         u64 new_spte;
         kvm_pfn_t new_pfn;
  
-       WARN_ON(pte_huge(pte));
+       WARN_ON_ONCE(pte_huge(pte));
         new_pfn = pte_pfn(pte);
  
  restart:
         for_each_rmap_spte(rmap_head, &iter, sptep) {
-               rmap_printk("spte %p %llx gfn %llx (%d)\n",
-                           sptep, *sptep, gfn, level);
-
                 need_flush = true;
  
                 if (pte_write(pte)) {
@@ -1588,7 +1568,7 @@ static __always_inline bool kvm_handle_gfn_range(struct kvm *kvm,
         for_each_slot_rmap_range(range->slot, PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL,
                                  range->start, range->end - 1, &iterator)
                 ret |= handler(kvm, iterator.rmap, range->slot, iterator.gfn,
-                              iterator.level, range->pte);
+                              iterator.level, range->arg.pte);
  
         return ret;
  }
@@ -1710,21 +1690,19 @@ bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
         return young;
  }
  
-#ifdef MMU_DEBUG
-static int is_empty_shadow_page(u64 *spt)
+static void kvm_mmu_check_sptes_at_free(struct kvm_mmu_page *sp)
  {
-       u64 *pos;
-       u64 *end;
+#ifdef CONFIG_KVM_PROVE_MMU
+       int i;
  
-       for (pos = spt, end = pos + SPTE_ENT_PER_PAGE; pos != end; pos++)
-               if (is_shadow_present_pte(*pos)) {
-                       printk(KERN_ERR "%s: %p %llx\n", __func__,
-                              pos, *pos);
-                       return 0;
-               }
-       return 1;
-}
+       for (i = 0; i < SPTE_ENT_PER_PAGE; i++) {
+               if (KVM_MMU_WARN_ON(is_shadow_present_pte(sp->spt[i])))
+                       pr_err_ratelimited("SPTE %llx (@ %p) for gfn %llx shadow-present at free",
+                                          sp->spt[i], &sp->spt[i],
+                                          kvm_mmu_page_get_gfn(sp, i));
+       }
  #endif
+}
  
  /*
   * This value is the sum of all of the kvm instances's
@@ -1752,7 +1730,8 @@ static void kvm_unaccount_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp)
  
  static void kvm_mmu_free_shadow_page(struct kvm_mmu_page *sp)
  {
-       MMU_WARN_ON(!is_empty_shadow_page(sp->spt));
+       kvm_mmu_check_sptes_at_free(sp);
+
         hlist_del(&sp->hash_link);
         list_del(&sp->link);
         free_page((unsigned long)sp->spt);
@@ -1775,16 +1754,16 @@ static void mmu_page_add_parent_pte(struct kvm_mmu_memory_cache *cache,
         pte_list_add(cache, parent_pte, &sp->parent_ptes);
  }
  
-static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp,
+static void mmu_page_remove_parent_pte(struct kvm *kvm, struct kvm_mmu_page *sp,
                                        u64 *parent_pte)
  {
-       pte_list_remove(parent_pte, &sp->parent_ptes);
+       pte_list_remove(kvm, parent_pte, &sp->parent_ptes);
  }
  
-static void drop_parent_pte(struct kvm_mmu_page *sp,
+static void drop_parent_pte(struct kvm *kvm, struct kvm_mmu_page *sp,
                             u64 *parent_pte)
  {
-       mmu_page_remove_parent_pte(sp, parent_pte);
+       mmu_page_remove_parent_pte(kvm, sp, parent_pte);
         mmu_spte_clear_no_track(parent_pte);
  }
  
@@ -1840,7 +1819,7 @@ static int mmu_pages_add(struct kvm_mmu_pages *pvec, struct kvm_mmu_page *sp,
  static inline void clear_unsync_child_bit(struct kvm_mmu_page *sp, int idx)
  {
         --sp->unsync_children;
-       WARN_ON((int)sp->unsync_children < 0);
+       WARN_ON_ONCE((int)sp->unsync_children < 0);
         __clear_bit(idx, sp->unsync_child_bitmap);
  }
  
@@ -1898,7 +1877,7 @@ static int mmu_unsync_walk(struct kvm_mmu_page *sp,
  
  static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp)
  {
-       WARN_ON(!sp->unsync);
+       WARN_ON_ONCE(!sp->unsync);
         trace_kvm_mmu_sync_page(sp);
         sp->unsync = 0;
         --kvm->stat.mmu_unsync;
@@ -2073,11 +2052,11 @@ static int mmu_pages_first(struct kvm_mmu_pages *pvec,
         if (pvec->nr == 0)
                 return 0;
  
-       WARN_ON(pvec->page[0].idx != INVALID_INDEX);
+       WARN_ON_ONCE(pvec->page[0].idx != INVALID_INDEX);
  
         sp = pvec->page[0].sp;
         level = sp->role.level;
-       WARN_ON(level == PG_LEVEL_4K);
+       WARN_ON_ONCE(level == PG_LEVEL_4K);
  
         parents->parent[level-2] = sp;
  
@@ -2099,7 +2078,7 @@ static void mmu_pages_clear_parents(struct mmu_page_path *parents)
                 if (!sp)
                         return;
  
-               WARN_ON(idx == INVALID_INDEX);
+               WARN_ON_ONCE(idx == INVALID_INDEX);
                 clear_unsync_child_bit(sp, idx);
                 level++;
         } while (!sp->unsync_children);
@@ -2220,7 +2199,7 @@ static struct kvm_mmu_page *kvm_mmu_find_shadow_page(struct kvm *kvm,
                         if (ret < 0)
                                 break;
  
-                       WARN_ON(!list_empty(&invalid_list));
+                       WARN_ON_ONCE(!list_empty(&invalid_list));
                         if (ret > 0)
                                 kvm_flush_remote_tlbs(kvm);
                 }
@@ -2499,7 +2478,7 @@ static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep,
                 if (child->role.access == direct_access)
                         return;
  
-               drop_parent_pte(child, sptep);
+               drop_parent_pte(vcpu->kvm, child, sptep);
                 kvm_flush_remote_tlbs_sptep(vcpu->kvm, sptep);
         }
  }
@@ -2517,7 +2496,7 @@ static int mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp,
                         drop_spte(kvm, spte);
                 } else {
                         child = spte_to_child_sp(pte);
-                       drop_parent_pte(child, spte);
+                       drop_parent_pte(kvm, child, spte);
  
                         /*
                          * Recursively zap nested TDP SPs, parentless SPs are
@@ -2548,13 +2527,13 @@ static int kvm_mmu_page_unlink_children(struct kvm *kvm,
         return zapped;
  }
  
-static void kvm_mmu_unlink_parents(struct kvm_mmu_page *sp)
+static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp)
  {
         u64 *sptep;
         struct rmap_iterator iter;
  
         while ((sptep = rmap_get_first(&sp->parent_ptes, &iter)))
-               drop_parent_pte(sp, sptep);
+               drop_parent_pte(kvm, sp, sptep);
  }
  
  static int mmu_zap_unsync_children(struct kvm *kvm,
@@ -2593,7 +2572,7 @@ static bool __kvm_mmu_prepare_zap_page(struct kvm *kvm,
         ++kvm->stat.mmu_shadow_zapped;
         *nr_zapped = mmu_zap_unsync_children(kvm, sp, invalid_list);
         *nr_zapped += kvm_mmu_page_unlink_children(kvm, sp, invalid_list);
-       kvm_mmu_unlink_parents(sp);
+       kvm_mmu_unlink_parents(kvm, sp);
  
         /* Zapping children means active_mmu_pages has become unstable. */
         list_unstable = *nr_zapped;
@@ -2675,7 +2654,7 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm,
         kvm_flush_remote_tlbs(kvm);
  
         list_for_each_entry_safe(sp, nsp, invalid_list, link) {
-               WARN_ON(!sp->role.invalid || sp->root_count);
+               WARN_ON_ONCE(!sp->role.invalid || sp->root_count);
                 kvm_mmu_free_shadow_page(sp);
         }
  }
@@ -2775,12 +2754,9 @@ int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
         LIST_HEAD(invalid_list);
         int r;
  
-       pgprintk("%s: looking for gfn %llx\n", __func__, gfn);
         r = 0;
         write_lock(&kvm->mmu_lock);
         for_each_gfn_valid_sp_with_gptes(kvm, sp, gfn) {
-               pgprintk("%s: gfn %llx role %x\n", __func__, gfn,
-                        sp->role.word);
                 r = 1;
                 kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
         }
@@ -2831,7 +2807,7 @@ int mmu_try_to_unsync_pages(struct kvm *kvm, const struct kvm_memory_slot *slot,
          * track machinery is used to write-protect upper-level shadow pages,
          * i.e. this guards the role.level == 4K assertion below!
          */
-       if (kvm_slot_page_track_is_active(kvm, slot, gfn, KVM_PAGE_TRACK_WRITE))
+       if (kvm_gfn_is_write_tracked(kvm, slot, gfn))
                 return -EPERM;
  
         /*
@@ -2873,7 +2849,7 @@ int mmu_try_to_unsync_pages(struct kvm *kvm, const struct kvm_memory_slot *slot,
                                 continue;
                 }
  
-               WARN_ON(sp->role.level != PG_LEVEL_4K);
+               WARN_ON_ONCE(sp->role.level != PG_LEVEL_4K);
                 kvm_unsync_page(kvm, sp);
         }
         if (locked)
@@ -2938,9 +2914,6 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot,
         bool prefetch = !fault || fault->prefetch;
         bool write_fault = fault && fault->write;
  
-       pgprintk("%s: spte %llx write_fault %d gfn %llx\n", __func__,
-                *sptep, write_fault, gfn);
-
         if (unlikely(is_noslot_pfn(pfn))) {
                 vcpu->stat.pf_mmio_spte_created++;
                 mark_mmio_spte(vcpu, sptep, gfn, pte_access);
@@ -2957,11 +2930,9 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot,
                         u64 pte = *sptep;
  
                         child = spte_to_child_sp(pte);
-                       drop_parent_pte(child, sptep);
+                       drop_parent_pte(vcpu->kvm, child, sptep);
                         flush = true;
                 } else if (pfn != spte_to_pfn(*sptep)) {
-                       pgprintk("hfn old %llx new %llx\n",
-                                spte_to_pfn(*sptep), pfn);
                         drop_spte(vcpu->kvm, sptep);
                         flush = true;
                 } else
@@ -2986,8 +2957,6 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot,
         if (flush)
                 kvm_flush_remote_tlbs_gfn(vcpu->kvm, gfn, level);
  
-       pgprintk("%s: setting spte %llx\n", __func__, *sptep);
-
         if (!was_rmapped) {
                 WARN_ON_ONCE(ret == RET_PF_SPURIOUS);
                 rmap_add(vcpu, slot, sptep, gfn, pte_access);
@@ -3033,7 +3002,7 @@ static void __direct_pte_prefetch(struct kvm_vcpu *vcpu,
         u64 *spte, *start = NULL;
         int i;
  
-       WARN_ON(!sp->role.direct);
+       WARN_ON_ONCE(!sp->role.direct);
  
         i = spte_index(sptep) & ~(PTE_PREFETCH_NUM - 1);
         spte = sp->spt + i;
@@ -3574,12 +3543,8 @@ static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa,
         if (!VALID_PAGE(*root_hpa))
                 return;
  
-       /*
-        * The "root" may be a special root, e.g. a PAE entry, treat it as a
-        * SPTE to ensure any non-PA bits are dropped.
-        */
-       sp = spte_to_child_sp(*root_hpa);
-       if (WARN_ON(!sp))
+       sp = root_to_sp(*root_hpa);
+       if (WARN_ON_ONCE(!sp))
                 return;
  
         if (is_tdp_mmu_page(sp))
@@ -3624,7 +3589,9 @@ void kvm_mmu_free_roots(struct kvm *kvm, struct kvm_mmu *mmu,
                                            &invalid_list);
  
         if (free_active_root) {
-               if (to_shadow_page(mmu->root.hpa)) {
+               if (kvm_mmu_is_dummy_root(mmu->root.hpa)) {
+                       /* Nothing to cleanup for dummy roots. */
+               } else if (root_to_sp(mmu->root.hpa)) {
                         mmu_free_root_page(kvm, &mmu->root.hpa, &invalid_list);
                 } else if (mmu->pae_root) {
                         for (i = 0; i < 4; ++i) {
@@ -3648,6 +3615,7 @@ EXPORT_SYMBOL_GPL(kvm_mmu_free_roots);
  void kvm_mmu_free_guest_mode_roots(struct kvm *kvm, struct kvm_mmu *mmu)
  {
         unsigned long roots_to_free = 0;
+       struct kvm_mmu_page *sp;
         hpa_t root_hpa;
         int i;
  
@@ -3662,8 +3630,8 @@ void kvm_mmu_free_guest_mode_roots(struct kvm *kvm, struct kvm_mmu *mmu)
                 if (!VALID_PAGE(root_hpa))
                         continue;
  
-               if (!to_shadow_page(root_hpa) ||
-                       to_shadow_page(root_hpa)->role.guest_mode)
+               sp = root_to_sp(root_hpa);
+               if (!sp || sp->role.guest_mode)
                         roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i);
         }
  
@@ -3671,19 +3639,6 @@ void kvm_mmu_free_guest_mode_roots(struct kvm *kvm, struct kvm_mmu *mmu)
  }
  EXPORT_SYMBOL_GPL(kvm_mmu_free_guest_mode_roots);
  
-
-static int mmu_check_root(struct kvm_vcpu *vcpu, gfn_t root_gfn)
-{
-       int ret = 0;
-
-       if (!kvm_vcpu_is_visible_gfn(vcpu, root_gfn)) {
-               kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
-               ret = 1;
-       }
-
-       return ret;
-}
-
  static hpa_t mmu_alloc_root(struct kvm_vcpu *vcpu, gfn_t gfn, int quadrant,
                             u8 level)
  {
@@ -3821,8 +3776,10 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
         root_pgd = kvm_mmu_get_guest_pgd(vcpu, mmu);
         root_gfn = root_pgd >> PAGE_SHIFT;
  
-       if (mmu_check_root(vcpu, root_gfn))
-               return 1;
+       if (!kvm_vcpu_is_visible_gfn(vcpu, root_gfn)) {
+               mmu->root.hpa = kvm_mmu_get_dummy_root();
+               return 0;
+       }
  
         /*
          * On SVM, reading PDPTRs might access guest memory, which might fault
@@ -3834,8 +3791,8 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
                         if (!(pdptrs[i] & PT_PRESENT_MASK))
                                 continue;
  
-                       if (mmu_check_root(vcpu, pdptrs[i] >> PAGE_SHIFT))
-                               return 1;
+                       if (!kvm_vcpu_is_visible_gfn(vcpu, pdptrs[i] >> PAGE_SHIFT))
+                               pdptrs[i] = 0;
                 }
         }
  
@@ -4002,7 +3959,7 @@ static bool is_unsync_root(hpa_t root)
  {
         struct kvm_mmu_page *sp;
  
-       if (!VALID_PAGE(root))
+       if (!VALID_PAGE(root) || kvm_mmu_is_dummy_root(root))
                 return false;
  
         /*
@@ -4018,7 +3975,7 @@ static bool is_unsync_root(hpa_t root)
          * requirement isn't satisfied.
          */
         smp_rmb();
-       sp = to_shadow_page(root);
+       sp = root_to_sp(root);
  
         /*
          * PAE roots (somewhat arbitrarily) aren't backed by shadow pages, the
@@ -4048,11 +4005,12 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
  
         if (vcpu->arch.mmu->cpu_role.base.level >= PT64_ROOT_4LEVEL) {
                 hpa_t root = vcpu->arch.mmu->root.hpa;
-               sp = to_shadow_page(root);
  
                 if (!is_unsync_root(root))
                         return;
  
+               sp = root_to_sp(root);
+
                 write_lock(&vcpu->kvm->mmu_lock);
                 mmu_sync_children(vcpu, sp, true);
                 write_unlock(&vcpu->kvm->mmu_lock);
@@ -4194,7 +4152,7 @@ static int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, bool direct)
                 return RET_PF_EMULATE;
  
         reserved = get_mmio_spte(vcpu, addr, &spte);
-       if (WARN_ON(reserved))
+       if (WARN_ON_ONCE(reserved))
                 return -EINVAL;
  
         if (is_mmio_spte(spte)) {
@@ -4232,7 +4190,7 @@ static bool page_fault_handle_page_track(struct kvm_vcpu *vcpu,
          * guest is writing the page which is write tracked which can
          * not be fixed by page fault handler.
          */
-       if (kvm_slot_page_track_is_active(vcpu->kvm, fault->slot, fault->gfn, KVM_PAGE_TRACK_WRITE))
+       if (kvm_gfn_is_write_tracked(vcpu->kvm, fault->slot, fault->gfn))
                 return true;
  
         return false;
@@ -4382,7 +4340,7 @@ static int kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
  static bool is_page_fault_stale(struct kvm_vcpu *vcpu,
                                 struct kvm_page_fault *fault)
  {
-       struct kvm_mmu_page *sp = to_shadow_page(vcpu->arch.mmu->root.hpa);
+       struct kvm_mmu_page *sp = root_to_sp(vcpu->arch.mmu->root.hpa);
  
         /* Special roots, e.g. pae_root, are not backed by shadow pages. */
         if (sp && is_obsolete_sp(vcpu->kvm, sp))
@@ -4407,6 +4365,10 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
  {
         int r;
  
+       /* Dummy roots are used only for shadowing bad guest roots. */
+       if (WARN_ON_ONCE(kvm_mmu_is_dummy_root(vcpu->arch.mmu->root.hpa)))
+               return RET_PF_RETRY;
+
         if (page_fault_handle_page_track(vcpu, fault))
                 return RET_PF_EMULATE;
  
@@ -4443,8 +4405,6 @@ out_unlock:
  static int nonpaging_page_fault(struct kvm_vcpu *vcpu,
                                 struct kvm_page_fault *fault)
  {
-       pgprintk("%s: gva %lx error %x\n", __func__, fault->addr, fault->error_code);
-
         /* This path builds a PAE pagetable, we can map 2mb pages at maximum. */
         fault->max_level = PG_LEVEL_2M;
         return direct_page_fault(vcpu, fault);
@@ -4562,9 +4522,19 @@ static void nonpaging_init_context(struct kvm_mmu *context)
  static inline bool is_root_usable(struct kvm_mmu_root_info *root, gpa_t pgd,
                                   union kvm_mmu_page_role role)
  {
-       return (role.direct || pgd == root->pgd) &&
-              VALID_PAGE(root->hpa) &&
-              role.word == to_shadow_page(root->hpa)->role.word;
+       struct kvm_mmu_page *sp;
+
+       if (!VALID_PAGE(root->hpa))
+               return false;
+
+       if (!role.direct && pgd != root->pgd)
+               return false;
+
+       sp = root_to_sp(root->hpa);
+       if (WARN_ON_ONCE(!sp))
+               return false;
+
+       return role.word == sp->role.word;
  }
  
  /*
@@ -4634,11 +4604,10 @@ static bool fast_pgd_switch(struct kvm *kvm, struct kvm_mmu *mmu,
                             gpa_t new_pgd, union kvm_mmu_page_role new_role)
  {
         /*
-        * For now, limit the caching to 64-bit hosts+VMs in order to avoid
-        * having to deal with PDPTEs. We may add support for 32-bit hosts/VMs
-        * later if necessary.
+        * Limit reuse to 64-bit hosts+VMs without "special" roots in order to
+        * avoid having to deal with PDPTEs and other complexities.
          */
-       if (VALID_PAGE(mmu->root.hpa) && !to_shadow_page(mmu->root.hpa))
+       if (VALID_PAGE(mmu->root.hpa) && !root_to_sp(mmu->root.hpa))
                 kvm_mmu_free_roots(kvm, mmu, KVM_MMU_ROOT_CURRENT);
  
         if (VALID_PAGE(mmu->root.hpa))
@@ -4684,9 +4653,12 @@ void kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd)
          * If this is a direct root page, it doesn't have a write flooding
          * count. Otherwise, clear the write flooding count.
          */
-       if (!new_role.direct)
-               __clear_sp_write_flooding_count(
-                               to_shadow_page(vcpu->arch.mmu->root.hpa));
+       if (!new_role.direct) {
+               struct kvm_mmu_page *sp = root_to_sp(vcpu->arch.mmu->root.hpa);
+
+               if (!WARN_ON_ONCE(!sp))
+                       __clear_sp_write_flooding_count(sp);
+       }
  }
  EXPORT_SYMBOL_GPL(kvm_mmu_new_pgd);
  
@@ -4808,28 +4780,13 @@ static void __reset_rsvds_bits_mask(struct rsvd_bits_validate *rsvd_check,
         }
  }
  
-static bool guest_can_use_gbpages(struct kvm_vcpu *vcpu)
-{
-       /*
-        * If TDP is enabled, let the guest use GBPAGES if they're supported in
-        * hardware.  The hardware page walker doesn't let KVM disable GBPAGES,
-        * i.e. won't treat them as reserved, and KVM doesn't redo the GVA->GPA
-        * walk for performance and complexity reasons.  Not to mention KVM
-        * _can't_ solve the problem because GVA->GPA walks aren't visible to
-        * KVM once a TDP translation is installed.  Mimic hardware behavior so
-        * that KVM's is at least consistent, i.e. doesn't randomly inject #PF.
-        */
-       return tdp_enabled ? boot_cpu_has(X86_FEATURE_GBPAGES) :
-                            guest_cpuid_has(vcpu, X86_FEATURE_GBPAGES);
-}
-
  static void reset_guest_rsvds_bits_mask(struct kvm_vcpu *vcpu,
                                         struct kvm_mmu *context)
  {
         __reset_rsvds_bits_mask(&context->guest_rsvd_check,
                                 vcpu->arch.reserved_gpa_bits,
                                 context->cpu_role.base.level, is_efer_nx(context),
-                               guest_can_use_gbpages(vcpu),
+                               guest_can_use(vcpu, X86_FEATURE_GBPAGES),
                                 is_cr4_pse(context),
                                 guest_cpuid_is_amd_or_hygon(vcpu));
  }
@@ -4906,7 +4863,8 @@ static void reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu,
         __reset_rsvds_bits_mask(shadow_zero_check, reserved_hpa_bits(),
                                 context->root_role.level,
                                 context->root_role.efer_nx,
-                               guest_can_use_gbpages(vcpu), is_pse, is_amd);
+                               guest_can_use(vcpu, X86_FEATURE_GBPAGES),
+                               is_pse, is_amd);
  
         if (!shadow_me_mask)
                 return;
@@ -5467,8 +5425,8 @@ void kvm_mmu_after_set_cpuid(struct kvm_vcpu *vcpu)
          * physical address properties) in a single VM would require tracking
          * all relevant CPUID information in kvm_mmu_page_role. That is very
          * undesirable as it would increase the memory requirements for
-        * gfn_track (see struct kvm_mmu_page_role comments).  For now that
-        * problem is swept under the rug; KVM's CPUID API is horrific and
+        * gfn_write_track (see struct kvm_mmu_page_role comments).  For now
+        * that problem is swept under the rug; KVM's CPUID API is horrific and
          * it's all but impossible to solve it without introducing a new API.
          */
         vcpu->arch.root_mmu.root_role.word = 0;
@@ -5531,9 +5489,9 @@ void kvm_mmu_unload(struct kvm_vcpu *vcpu)
         struct kvm *kvm = vcpu->kvm;
  
         kvm_mmu_free_roots(kvm, &vcpu->arch.root_mmu, KVM_MMU_ROOTS_ALL);
-       WARN_ON(VALID_PAGE(vcpu->arch.root_mmu.root.hpa));
+       WARN_ON_ONCE(VALID_PAGE(vcpu->arch.root_mmu.root.hpa));
         kvm_mmu_free_roots(kvm, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL);
-       WARN_ON(VALID_PAGE(vcpu->arch.guest_mmu.root.hpa));
+       WARN_ON_ONCE(VALID_PAGE(vcpu->arch.guest_mmu.root.hpa));
         vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY);
  }
  
@@ -5546,16 +5504,21 @@ static bool is_obsolete_root(struct kvm *kvm, hpa_t root_hpa)
  
         /*
          * When freeing obsolete roots, treat roots as obsolete if they don't
-        * have an associated shadow page.  This does mean KVM will get false
+        * have an associated shadow page, as it's impossible to determine if
+        * such roots are fresh or stale.  This does mean KVM will get false
          * positives and free roots that don't strictly need to be freed, but
          * such false positives are relatively rare:
          *
-        *  (a) only PAE paging and nested NPT has roots without shadow pages
+        *  (a) only PAE paging and nested NPT have roots without shadow pages
+        *      (or any shadow paging flavor with a dummy root, see note below)
          *  (b) remote reloads due to a memslot update obsoletes _all_ roots
          *  (c) KVM doesn't track previous roots for PAE paging, and the guest
          *      is unlikely to zap an in-use PGD.
+        *
+        * Note!  Dummy roots are unique in that they are obsoleted by memslot
+        * _creation_!  See also FNAME(fetch).
          */
-       sp = to_shadow_page(root_hpa);
+       sp = root_to_sp(root_hpa);
         return !sp || is_obsolete_sp(kvm, sp);
  }
  
@@ -5634,9 +5597,6 @@ static bool detect_write_misaligned(struct kvm_mmu_page *sp, gpa_t gpa,
  {
         unsigned offset, pte_size, misaligned;
  
-       pgprintk("misaligned: gpa %llx bytes %d role %x\n",
-                gpa, bytes, sp->role.word);
-
         offset = offset_in_page(gpa);
         pte_size = sp->role.has_4_byte_gpte ? 4 : 8;
  
@@ -5684,9 +5644,8 @@ static u64 *get_written_sptes(struct kvm_mmu_page *sp, gpa_t gpa, int *nspte)
         return spte;
  }
  
-static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
-                             const u8 *new, int bytes,
-                             struct kvm_page_track_notifier_node *node)
+void kvm_mmu_track_write(struct kvm_vcpu *vcpu, gpa_t gpa, const u8 *new,
+                        int bytes)
  {
         gfn_t gfn = gpa >> PAGE_SHIFT;
         struct kvm_mmu_page *sp;
@@ -5702,8 +5661,6 @@ static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
         if (!READ_ONCE(vcpu->kvm->arch.indirect_shadow_pages))
                 return;
  
-       pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes);
-
         write_lock(&vcpu->kvm->mmu_lock);
  
         gentry = mmu_pte_write_fetch_gpte(vcpu, &gpa, &bytes);
@@ -5742,7 +5699,18 @@ int noinline kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 err
         int r, emulation_type = EMULTYPE_PF;
         bool direct = vcpu->arch.mmu->root_role.direct;
  
-       if (WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root.hpa)))
+       /*
+        * IMPLICIT_ACCESS is a KVM-defined flag used to correctly perform SMAP
+        * checks when emulating instructions that trigger implicit accesses.
+        * WARN if hardware generates a fault with an error code that collides
+        * with the KVM-defined value.  Clear the flag and continue on, i.e.
+        * don't terminate the VM, as KVM can't possibly be relying on a flag
+        * that KVM doesn't know about.
+        */
+       if (WARN_ON_ONCE(error_code & PFERR_IMPLICIT_ACCESS))
+               error_code &= ~PFERR_IMPLICIT_ACCESS;
+
+       if (WARN_ON_ONCE(!VALID_PAGE(vcpu->arch.mmu->root.hpa)))
                 return RET_PF_RETRY;
  
         r = RET_PF_INVALID;
@@ -6099,7 +6067,7 @@ restart:
                  * pages.  Skip the bogus page, otherwise we'll get stuck in an
                  * infinite loop if the page gets put back on the list (again).
                  */
-               if (WARN_ON(sp->role.invalid))
+               if (WARN_ON_ONCE(sp->role.invalid))
                         continue;
  
                 /*
@@ -6199,16 +6167,8 @@ static bool kvm_has_zapped_obsolete_pages(struct kvm *kvm)
         return unlikely(!list_empty_careful(&kvm->arch.zapped_obsolete_pages));
  }
  
-static void kvm_mmu_invalidate_zap_pages_in_memslot(struct kvm *kvm,
-                       struct kvm_memory_slot *slot,
-                       struct kvm_page_track_notifier_node *node)
-{
-       kvm_mmu_zap_all_fast(kvm);
-}
-
  int kvm_mmu_init_vm(struct kvm *kvm)
  {
-       struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker;
         int r;
  
         INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
@@ -6222,10 +6182,6 @@ int kvm_mmu_init_vm(struct kvm *kvm)
                         return r;
         }
  
-       node->track_write = kvm_mmu_pte_write;
-       node->track_flush_slot = kvm_mmu_invalidate_zap_pages_in_memslot;
-       kvm_page_track_register_notifier(kvm, node);
-
         kvm->arch.split_page_header_cache.kmem_cache = mmu_page_header_cache;
         kvm->arch.split_page_header_cache.gfp_zero = __GFP_ZERO;
  
@@ -6246,10 +6202,6 @@ static void mmu_free_vm_memory_caches(struct kvm *kvm)
  
  void kvm_mmu_uninit_vm(struct kvm *kvm)
  {
-       struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker;
-
-       kvm_page_track_unregister_notifier(kvm, node);
-
         if (tdp_mmu_enabled)
                 kvm_mmu_uninit_tdp_mmu(kvm);
  
@@ -6670,7 +6622,7 @@ static void kvm_rmap_zap_collapsible_sptes(struct kvm *kvm,
          */
         if (walk_slot_rmaps(kvm, slot, kvm_mmu_zap_collapsible_spte,
                             PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL - 1, true))
-               kvm_arch_flush_remote_tlbs_memslot(kvm, slot);
+               kvm_flush_remote_tlbs_memslot(kvm, slot);
  }
  
  void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
@@ -6689,20 +6641,6 @@ void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
         }
  }
  
-void kvm_arch_flush_remote_tlbs_memslot(struct kvm *kvm,
-                                       const struct kvm_memory_slot *memslot)
-{
-       /*
-        * All current use cases for flushing the TLBs for a specific memslot
-        * related to dirty logging, and many do the TLB flush out of mmu_lock.
-        * The interaction between the various operations on memslot must be
-        * serialized by slots_locks to ensure the TLB flush from one operation
-        * is observed by any other operation on the same memslot.
-        */
-       lockdep_assert_held(&kvm->slots_lock);
-       kvm_flush_remote_tlbs_range(kvm, memslot->base_gfn, memslot->npages);
-}
-
  void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm,
                                    const struct kvm_memory_slot *memslot)
  {
@@ -6732,7 +6670,7 @@ void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm,
          */
  }
  
-void kvm_mmu_zap_all(struct kvm *kvm)
+static void kvm_mmu_zap_all(struct kvm *kvm)
  {
         struct kvm_mmu_page *sp, *node;
         LIST_HEAD(invalid_list);
@@ -6741,7 +6679,7 @@ void kvm_mmu_zap_all(struct kvm *kvm)
         write_lock(&kvm->mmu_lock);
  restart:
         list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) {
-               if (WARN_ON(sp->role.invalid))
+               if (WARN_ON_ONCE(sp->role.invalid))
                         continue;
                 if (__kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list, &ign))
                         goto restart;
@@ -6757,9 +6695,20 @@ restart:
         write_unlock(&kvm->mmu_lock);
  }
  
+void kvm_arch_flush_shadow_all(struct kvm *kvm)
+{
+       kvm_mmu_zap_all(kvm);
+}
+
+void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
+                                  struct kvm_memory_slot *slot)
+{
+       kvm_mmu_zap_all_fast(kvm);
+}
+
  void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen)
  {
-       WARN_ON(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS);
+       WARN_ON_ONCE(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS);
  
         gen &= MMIO_SPTE_GEN_MASK;
  
@@ -6862,7 +6811,7 @@ static void mmu_destroy_caches(void)
  static int get_nx_huge_pages(char *buffer, const struct kernel_param *kp)
  {
         if (nx_hugepage_mitigation_hard_disabled)
-               return sprintf(buffer, "never\n");
+               return sysfs_emit(buffer, "never\n");
  
         return param_get_bool(buffer, kp);
  }