KVM: x86/mmu: Split huge pages mapped by the TDP MMU during KVM_CLEAR_DIRTY_LOG

author David Matlack <dmatlack@google.com>

Wed, 19 Jan 2022 23:07:37 +0000 (23:07 +0000)

committer Paolo Bonzini <pbonzini@redhat.com>

Thu, 10 Feb 2022 18:50:43 +0000 (13:50 -0500)
author David Matlack <dmatlack@google.com>
Wed, 19 Jan 2022 23:07:37 +0000 (23:07 +0000)
committer Paolo Bonzini <pbonzini@redhat.com>
Thu, 10 Feb 2022 18:50:43 +0000 (13:50 -0500)
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt

index 5d80a0f..2a9746f 100644 (file)
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -2356,7 +2356,9 @@
                         KVM_DIRTY_LOG_INITIALLY_SET is enabled or disabled. If
                         disabled, all huge pages in a memslot will be eagerly
                         split when dirty logging is enabled on that memslot. If
-                       enabled, huge pages will not be eagerly split.
+                       enabled, eager page splitting will be performed during
+                       the KVM_CLEAR_DIRTY_LOG ioctl, and only for the pages being
+                       cleared.
  
                         Eager page splitting currently only supports splitting
                         huge pages mapped by the TDP MMU.
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h

index 8bfb069..10815b6 100644 (file)
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1590,6 +1590,10 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
  void kvm_mmu_slot_try_split_huge_pages(struct kvm *kvm,
                                        const struct kvm_memory_slot *memslot,
                                        int target_level);
+void kvm_mmu_try_split_huge_pages(struct kvm *kvm,
+                                 const struct kvm_memory_slot *memslot,
+                                 u64 start, u64 end,
+                                 int target_level);
  void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
                                    const struct kvm_memory_slot *memslot);
  void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm,
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c

index 308c8b2..296f872 100644 (file)
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -1358,6 +1358,9 @@ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
                 gfn_t start = slot->base_gfn + gfn_offset + __ffs(mask);
                 gfn_t end = slot->base_gfn + gfn_offset + __fls(mask);
  
+               if (READ_ONCE(eager_page_split))
+                       kvm_mmu_try_split_huge_pages(kvm, slot, start, end, PG_LEVEL_4K);
+
                 kvm_mmu_slot_gfn_write_protect(kvm, slot, start, PG_LEVEL_2M);
  
                 /* Cross two large pages? */
@@ -5830,16 +5833,32 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
                 kvm_arch_flush_remote_tlbs_memslot(kvm, memslot);
  }
  
+/* Must be called with the mmu_lock held in write-mode. */
+void kvm_mmu_try_split_huge_pages(struct kvm *kvm,
+                                  const struct kvm_memory_slot *memslot,
+                                  u64 start, u64 end,
+                                  int target_level)
+{
+       if (is_tdp_mmu_enabled(kvm))
+               kvm_tdp_mmu_try_split_huge_pages(kvm, memslot, start, end,
+                                                target_level, false);
+
+       /*
+        * A TLB flush is unnecessary at this point for the same reasons as in
+        * kvm_mmu_slot_try_split_huge_pages().
+        */
+}
+
  void kvm_mmu_slot_try_split_huge_pages(struct kvm *kvm,
-                                      const struct kvm_memory_slot *memslot,
-                                      int target_level)
+                                       const struct kvm_memory_slot *memslot,
+                                       int target_level)
  {
         u64 start = memslot->base_gfn;
         u64 end = start + memslot->npages;
  
         if (is_tdp_mmu_enabled(kvm)) {
                 read_lock(&kvm->mmu_lock);
-               kvm_tdp_mmu_try_split_huge_pages(kvm, memslot, start, end, target_level);
+               kvm_tdp_mmu_try_split_huge_pages(kvm, memslot, start, end, target_level, true);
                 read_unlock(&kvm->mmu_lock);
         }
  
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c

index 6dfd6db..dae2ceb 100644 (file)
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -963,27 +963,33 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu,
  }
  
  /*
- * tdp_mmu_link_sp_atomic - Atomically replace the given spte with an spte
- * pointing to the provided page table.
+ * tdp_mmu_link_sp - Replace the given spte with an spte pointing to the
+ * provided page table.
   *
   * @kvm: kvm instance
   * @iter: a tdp_iter instance currently on the SPTE that should be set
   * @sp: The new TDP page table to install.
   * @account_nx: True if this page table is being installed to split a
   *              non-executable huge page.
+ * @shared: This operation is running under the MMU lock in read mode.
   *
   * Returns: 0 if the new page table was installed. Non-0 if the page table
   *          could not be installed (e.g. the atomic compare-exchange failed).
   */
-static int tdp_mmu_link_sp_atomic(struct kvm *kvm, struct tdp_iter *iter,
-                                 struct kvm_mmu_page *sp, bool account_nx)
+static int tdp_mmu_link_sp(struct kvm *kvm, struct tdp_iter *iter,
+                          struct kvm_mmu_page *sp, bool account_nx,
+                          bool shared)
  {
         u64 spte = make_nonleaf_spte(sp->spt, !shadow_accessed_mask);
-       int ret;
+       int ret = 0;
  
-       ret = tdp_mmu_set_spte_atomic(kvm, iter, spte);
-       if (ret)
-               return ret;
+       if (shared) {
+               ret = tdp_mmu_set_spte_atomic(kvm, iter, spte);
+               if (ret)
+                       return ret;
+       } else {
+               tdp_mmu_set_spte(kvm, iter, spte);
+       }
  
         spin_lock(&kvm->arch.tdp_mmu_pages_lock);
         list_add(&sp->link, &kvm->arch.tdp_mmu_pages);
@@ -1051,7 +1057,7 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
                         sp = tdp_mmu_alloc_sp(vcpu);
                         tdp_mmu_init_child_sp(sp, &iter);
  
-                       if (tdp_mmu_link_sp_atomic(vcpu->kvm, &iter, sp, account_nx)) {
+                       if (tdp_mmu_link_sp(vcpu->kvm, &iter, sp, account_nx, true)) {
                                 tdp_mmu_free_sp(sp);
                                 break;
                         }
@@ -1277,12 +1283,11 @@ static struct kvm_mmu_page *__tdp_mmu_alloc_sp_for_split(gfp_t gfp)
  }
  
  static struct kvm_mmu_page *tdp_mmu_alloc_sp_for_split(struct kvm *kvm,
-                                                      struct tdp_iter *iter)
+                                                      struct tdp_iter *iter,
+                                                      bool shared)
  {
         struct kvm_mmu_page *sp;
  
-       lockdep_assert_held_read(&kvm->mmu_lock);
-
         /*
          * Since we are allocating while under the MMU lock we have to be
          * careful about GFP flags. Use GFP_NOWAIT to avoid blocking on direct
@@ -1297,20 +1302,27 @@ static struct kvm_mmu_page *tdp_mmu_alloc_sp_for_split(struct kvm *kvm,
                 return sp;
  
         rcu_read_unlock();
-       read_unlock(&kvm->mmu_lock);
+
+       if (shared)
+               read_unlock(&kvm->mmu_lock);
+       else
+               write_unlock(&kvm->mmu_lock);
  
         iter->yielded = true;
         sp = __tdp_mmu_alloc_sp_for_split(GFP_KERNEL_ACCOUNT);
  
-       read_lock(&kvm->mmu_lock);
+       if (shared)
+               read_lock(&kvm->mmu_lock);
+       else
+               write_lock(&kvm->mmu_lock);
+
         rcu_read_lock();
  
         return sp;
  }
  
-static int tdp_mmu_split_huge_page_atomic(struct kvm *kvm,
-                                         struct tdp_iter *iter,
-                                         struct kvm_mmu_page *sp)
+static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter,
+                                  struct kvm_mmu_page *sp, bool shared)
  {
         const u64 huge_spte = iter->old_spte;
         const int level = iter->level;
@@ -1333,7 +1345,7 @@ static int tdp_mmu_split_huge_page_atomic(struct kvm *kvm,
          * correctness standpoint since the translation will be the same either
          * way.
          */
-       ret = tdp_mmu_link_sp_atomic(kvm, iter, sp, false);
+       ret = tdp_mmu_link_sp(kvm, iter, sp, false, shared);
         if (ret)
                 return ret;
  
@@ -1350,7 +1362,7 @@ static int tdp_mmu_split_huge_page_atomic(struct kvm *kvm,
  static int tdp_mmu_split_huge_pages_root(struct kvm *kvm,
                                          struct kvm_mmu_page *root,
                                          gfn_t start, gfn_t end,
-                                        int target_level)
+                                        int target_level, bool shared)
  {
         struct kvm_mmu_page *sp = NULL;
         struct tdp_iter iter;
@@ -1371,14 +1383,14 @@ static int tdp_mmu_split_huge_pages_root(struct kvm *kvm,
          */
         for_each_tdp_pte_min_level(iter, root, target_level + 1, start, end) {
  retry:
-               if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
+               if (tdp_mmu_iter_cond_resched(kvm, &iter, false, shared))
                         continue;
  
                 if (!is_shadow_present_pte(iter.old_spte) || !is_large_pte(iter.old_spte))
                         continue;
  
                 if (!sp) {
-                       sp = tdp_mmu_alloc_sp_for_split(kvm, &iter);
+                       sp = tdp_mmu_alloc_sp_for_split(kvm, &iter, shared);
                         if (!sp) {
                                 ret = -ENOMEM;
                                 break;
@@ -1388,7 +1400,7 @@ retry:
                                 continue;
                 }
  
-               if (tdp_mmu_split_huge_page_atomic(kvm, &iter, sp))
+               if (tdp_mmu_split_huge_page(kvm, &iter, sp, shared))
                         goto retry;
  
                 sp = NULL;
@@ -1408,23 +1420,24 @@ retry:
         return ret;
  }
  
+
  /*
   * Try to split all huge pages mapped by the TDP MMU down to the target level.
   */
  void kvm_tdp_mmu_try_split_huge_pages(struct kvm *kvm,
                                       const struct kvm_memory_slot *slot,
                                       gfn_t start, gfn_t end,
-                                     int target_level)
+                                     int target_level, bool shared)
  {
         struct kvm_mmu_page *root;
         int r = 0;
  
-       lockdep_assert_held_read(&kvm->mmu_lock);
+       kvm_lockdep_assert_mmu_lock_held(kvm, shared);
  
-       for_each_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true) {
-               r = tdp_mmu_split_huge_pages_root(kvm, root, start, end, target_level);
+       for_each_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, shared) {
+               r = tdp_mmu_split_huge_pages_root(kvm, root, start, end, target_level, shared);
                 if (r) {
-                       kvm_tdp_mmu_put_root(kvm, root, true);
+                       kvm_tdp_mmu_put_root(kvm, root, shared);
                         break;
                 }
         }
diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h

index fdb3a88..3f98778 100644 (file)
--- a/arch/x86/kvm/mmu/tdp_mmu.h
+++ b/arch/x86/kvm/mmu/tdp_mmu.h
@@ -70,7 +70,7 @@ bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
  void kvm_tdp_mmu_try_split_huge_pages(struct kvm *kvm,
                                       const struct kvm_memory_slot *slot,
                                       gfn_t start, gfn_t end,
-                                     int target_level);
+                                     int target_level, bool shared);
  
  static inline void kvm_tdp_mmu_walk_lockless_begin(void)
  {
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c

index ffef31f..803b2e4 100644 (file)
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -192,7 +192,7 @@ bool __read_mostly enable_pmu = true;
  EXPORT_SYMBOL_GPL(enable_pmu);
  module_param(enable_pmu, bool, 0444);
  
-static bool __read_mostly eager_page_split = true;
+bool __read_mostly eager_page_split = true;
  module_param(eager_page_split, bool, 0644);
  
  /*
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h

index 767ec7f..aa86aba 100644 (file)
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -307,6 +307,8 @@ extern int pi_inject_timer;
  
  extern bool report_ignored_msrs;
  
+extern bool eager_page_split;
+
  static inline u64 nsec_to_cycles(struct kvm_vcpu *vcpu, u64 nsec)
  {
         return pvclock_scale_delta(nsec, vcpu->arch.virtual_tsc_mult,
author	David Matlack <dmatlack@google.com>
	Wed, 19 Jan 2022 23:07:37 +0000 (23:07 +0000)
committer	Paolo Bonzini <pbonzini@redhat.com>
	Thu, 10 Feb 2022 18:50:43 +0000 (13:50 -0500)
Documentation/admin-guide/kernel-parameters.txt		patch \| blob \| history
arch/x86/include/asm/kvm_host.h		patch \| blob \| history
arch/x86/kvm/mmu/mmu.c		patch \| blob \| history
arch/x86/kvm/mmu/tdp_mmu.c		patch \| blob \| history
arch/x86/kvm/mmu/tdp_mmu.h		patch \| blob \| history
arch/x86/kvm/x86.c		patch \| blob \| history
arch/x86/kvm/x86.h		patch \| blob \| history