OSDN Git Service

KVM: x86/mmu: Split huge pages mapped by the TDP MMU during KVM_CLEAR_DIRTY_LOG
author David Matlack <dmatlack@google.com>
Wed, 19 Jan 2022 23:07:37 +0000 (23:07 +0000)
committer Paolo Bonzini <pbonzini@redhat.com>
Thu, 10 Feb 2022 18:50:43 +0000 (13:50 -0500)
When using KVM_DIRTY_LOG_INITIALLY_SET, huge pages are not
write-protected when dirty logging is enabled on the memslot. Instead
they are write-protected once userspace invokes KVM_CLEAR_DIRTY_LOG for
the first time and only for the specific sub-region being cleared.

Enhance KVM_CLEAR_DIRTY_LOG to also try to split huge pages prior to
write-protecting to avoid causing write-protection faults on vCPU
threads. This also allows userspace to smear the cost of huge page
splitting across multiple ioctls, rather than splitting the entire
memslot as is the case when initially-all-set is not used.

Signed-off-by: David Matlack <dmatlack@google.com>
Message-Id: <20220119230739.2234394-17-dmatlack@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Documentation/admin-guide/kernel-parameters.txt
arch/x86/include/asm/kvm_host.h
arch/x86/kvm/mmu/mmu.c
arch/x86/kvm/mmu/tdp_mmu.c
arch/x86/kvm/mmu/tdp_mmu.h
arch/x86/kvm/x86.c
arch/x86/kvm/x86.h

index 5d80a0f..2a9746f 100644 (file)
                        KVM_DIRTY_LOG_INITIALLY_SET is enabled or disabled. If
                        disabled, all huge pages in a memslot will be eagerly
                        split when dirty logging is enabled on that memslot. If
-                       enabled, huge pages will not be eagerly split.
+                       enabled, eager page splitting will be performed during
+                       the KVM_CLEAR_DIRTY_LOG ioctl, and only for the pages
+                       being cleared.
 
                        Eager page splitting currently only supports splitting
                        huge pages mapped by the TDP MMU.
index 8bfb069..10815b6 100644 (file)
@@ -1590,6 +1590,10 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
 void kvm_mmu_slot_try_split_huge_pages(struct kvm *kvm,
                                       const struct kvm_memory_slot *memslot,
                                       int target_level);
+void kvm_mmu_try_split_huge_pages(struct kvm *kvm,
+                                 const struct kvm_memory_slot *memslot,
+                                 u64 start, u64 end,
+                                 int target_level);
 void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
                                   const struct kvm_memory_slot *memslot);
 void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm,
index 308c8b2..296f872 100644 (file)
@@ -1358,6 +1358,9 @@ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
                gfn_t start = slot->base_gfn + gfn_offset + __ffs(mask);
                gfn_t end = slot->base_gfn + gfn_offset + __fls(mask);
 
+               if (READ_ONCE(eager_page_split))
+                       kvm_mmu_try_split_huge_pages(kvm, slot, start, end, PG_LEVEL_4K);
+
                kvm_mmu_slot_gfn_write_protect(kvm, slot, start, PG_LEVEL_2M);
 
                /* Cross two large pages? */
@@ -5830,16 +5833,32 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
                kvm_arch_flush_remote_tlbs_memslot(kvm, memslot);
 }
 
+/* Must be called with the mmu_lock held in write-mode. */
+void kvm_mmu_try_split_huge_pages(struct kvm *kvm,
+                                  const struct kvm_memory_slot *memslot,
+                                  u64 start, u64 end,
+                                  int target_level)
+{
+       if (is_tdp_mmu_enabled(kvm))
+               kvm_tdp_mmu_try_split_huge_pages(kvm, memslot, start, end,
+                                                target_level, false);
+
+       /*
+        * A TLB flush is unnecessary at this point for the same reasons as in
+        * kvm_mmu_slot_try_split_huge_pages().
+        */
+}
+
 void kvm_mmu_slot_try_split_huge_pages(struct kvm *kvm,
-                                      const struct kvm_memory_slot *memslot,
-                                      int target_level)
+                                       const struct kvm_memory_slot *memslot,
+                                       int target_level)
 {
        u64 start = memslot->base_gfn;
        u64 end = start + memslot->npages;
 
        if (is_tdp_mmu_enabled(kvm)) {
                read_lock(&kvm->mmu_lock);
-               kvm_tdp_mmu_try_split_huge_pages(kvm, memslot, start, end, target_level);
+               kvm_tdp_mmu_try_split_huge_pages(kvm, memslot, start, end, target_level, true);
                read_unlock(&kvm->mmu_lock);
        }
 
index 6dfd6db..dae2ceb 100644 (file)
@@ -963,27 +963,33 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu,
 }
 
 /*
- * tdp_mmu_link_sp_atomic - Atomically replace the given spte with an spte
- * pointing to the provided page table.
+ * tdp_mmu_link_sp - Replace the given spte with an spte pointing to the
+ * provided page table.
  *
  * @kvm: kvm instance
  * @iter: a tdp_iter instance currently on the SPTE that should be set
  * @sp: The new TDP page table to install.
  * @account_nx: True if this page table is being installed to split a
  *              non-executable huge page.
+ * @shared: This operation is running under the MMU lock in read mode.
  *
  * Returns: 0 if the new page table was installed. Non-0 if the page table
  *          could not be installed (e.g. the atomic compare-exchange failed).
  */
-static int tdp_mmu_link_sp_atomic(struct kvm *kvm, struct tdp_iter *iter,
-                                 struct kvm_mmu_page *sp, bool account_nx)
+static int tdp_mmu_link_sp(struct kvm *kvm, struct tdp_iter *iter,
+                          struct kvm_mmu_page *sp, bool account_nx,
+                          bool shared)
 {
        u64 spte = make_nonleaf_spte(sp->spt, !shadow_accessed_mask);
-       int ret;
+       int ret = 0;
 
-       ret = tdp_mmu_set_spte_atomic(kvm, iter, spte);
-       if (ret)
-               return ret;
+       if (shared) {
+               ret = tdp_mmu_set_spte_atomic(kvm, iter, spte);
+               if (ret)
+                       return ret;
+       } else {
+               tdp_mmu_set_spte(kvm, iter, spte);
+       }
 
        spin_lock(&kvm->arch.tdp_mmu_pages_lock);
        list_add(&sp->link, &kvm->arch.tdp_mmu_pages);
@@ -1051,7 +1057,7 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
                        sp = tdp_mmu_alloc_sp(vcpu);
                        tdp_mmu_init_child_sp(sp, &iter);
 
-                       if (tdp_mmu_link_sp_atomic(vcpu->kvm, &iter, sp, account_nx)) {
+                       if (tdp_mmu_link_sp(vcpu->kvm, &iter, sp, account_nx, true)) {
                                tdp_mmu_free_sp(sp);
                                break;
                        }
@@ -1277,12 +1283,11 @@ static struct kvm_mmu_page *__tdp_mmu_alloc_sp_for_split(gfp_t gfp)
 }
 
 static struct kvm_mmu_page *tdp_mmu_alloc_sp_for_split(struct kvm *kvm,
-                                                      struct tdp_iter *iter)
+                                                      struct tdp_iter *iter,
+                                                      bool shared)
 {
        struct kvm_mmu_page *sp;
 
-       lockdep_assert_held_read(&kvm->mmu_lock);
-
        /*
         * Since we are allocating while under the MMU lock we have to be
         * careful about GFP flags. Use GFP_NOWAIT to avoid blocking on direct
@@ -1297,20 +1302,27 @@ static struct kvm_mmu_page *tdp_mmu_alloc_sp_for_split(struct kvm *kvm,
                return sp;
 
        rcu_read_unlock();
-       read_unlock(&kvm->mmu_lock);
+
+       if (shared)
+               read_unlock(&kvm->mmu_lock);
+       else
+               write_unlock(&kvm->mmu_lock);
 
        iter->yielded = true;
        sp = __tdp_mmu_alloc_sp_for_split(GFP_KERNEL_ACCOUNT);
 
-       read_lock(&kvm->mmu_lock);
+       if (shared)
+               read_lock(&kvm->mmu_lock);
+       else
+               write_lock(&kvm->mmu_lock);
+
        rcu_read_lock();
 
        return sp;
 }
 
-static int tdp_mmu_split_huge_page_atomic(struct kvm *kvm,
-                                         struct tdp_iter *iter,
-                                         struct kvm_mmu_page *sp)
+static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter,
+                                  struct kvm_mmu_page *sp, bool shared)
 {
        const u64 huge_spte = iter->old_spte;
        const int level = iter->level;
@@ -1333,7 +1345,7 @@ static int tdp_mmu_split_huge_page_atomic(struct kvm *kvm,
         * correctness standpoint since the translation will be the same either
         * way.
         */
-       ret = tdp_mmu_link_sp_atomic(kvm, iter, sp, false);
+       ret = tdp_mmu_link_sp(kvm, iter, sp, false, shared);
        if (ret)
                return ret;
 
@@ -1350,7 +1362,7 @@ static int tdp_mmu_split_huge_page_atomic(struct kvm *kvm,
 static int tdp_mmu_split_huge_pages_root(struct kvm *kvm,
                                         struct kvm_mmu_page *root,
                                         gfn_t start, gfn_t end,
-                                        int target_level)
+                                        int target_level, bool shared)
 {
        struct kvm_mmu_page *sp = NULL;
        struct tdp_iter iter;
@@ -1371,14 +1383,14 @@ static int tdp_mmu_split_huge_pages_root(struct kvm *kvm,
         */
        for_each_tdp_pte_min_level(iter, root, target_level + 1, start, end) {
 retry:
-               if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
+               if (tdp_mmu_iter_cond_resched(kvm, &iter, false, shared))
                        continue;
 
                if (!is_shadow_present_pte(iter.old_spte) || !is_large_pte(iter.old_spte))
                        continue;
 
                if (!sp) {
-                       sp = tdp_mmu_alloc_sp_for_split(kvm, &iter);
+                       sp = tdp_mmu_alloc_sp_for_split(kvm, &iter, shared);
                        if (!sp) {
                                ret = -ENOMEM;
                                break;
@@ -1388,7 +1400,7 @@ retry:
                                continue;
                }
 
-               if (tdp_mmu_split_huge_page_atomic(kvm, &iter, sp))
+               if (tdp_mmu_split_huge_page(kvm, &iter, sp, shared))
                        goto retry;
 
                sp = NULL;
@@ -1408,23 +1420,24 @@ retry:
        return ret;
 }
 
+
 /*
  * Try to split all huge pages mapped by the TDP MMU down to the target level.
  */
 void kvm_tdp_mmu_try_split_huge_pages(struct kvm *kvm,
                                      const struct kvm_memory_slot *slot,
                                      gfn_t start, gfn_t end,
-                                     int target_level)
+                                     int target_level, bool shared)
 {
        struct kvm_mmu_page *root;
        int r = 0;
 
-       lockdep_assert_held_read(&kvm->mmu_lock);
+       kvm_lockdep_assert_mmu_lock_held(kvm, shared);
 
-       for_each_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true) {
-               r = tdp_mmu_split_huge_pages_root(kvm, root, start, end, target_level);
+       for_each_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, shared) {
+               r = tdp_mmu_split_huge_pages_root(kvm, root, start, end, target_level, shared);
                if (r) {
-                       kvm_tdp_mmu_put_root(kvm, root, true);
+                       kvm_tdp_mmu_put_root(kvm, root, shared);
                        break;
                }
        }
index fdb3a88..3f98778 100644 (file)
@@ -70,7 +70,7 @@ bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
 void kvm_tdp_mmu_try_split_huge_pages(struct kvm *kvm,
                                      const struct kvm_memory_slot *slot,
                                      gfn_t start, gfn_t end,
-                                     int target_level);
+                                     int target_level, bool shared);
 
 static inline void kvm_tdp_mmu_walk_lockless_begin(void)
 {
index ffef31f..803b2e4 100644 (file)
@@ -192,7 +192,7 @@ bool __read_mostly enable_pmu = true;
 EXPORT_SYMBOL_GPL(enable_pmu);
 module_param(enable_pmu, bool, 0444);
 
-static bool __read_mostly eager_page_split = true;
+bool __read_mostly eager_page_split = true;
 module_param(eager_page_split, bool, 0644);
 
 /*
index 767ec7f..aa86aba 100644 (file)
@@ -307,6 +307,8 @@ extern int pi_inject_timer;
 
 extern bool report_ignored_msrs;
 
+extern bool eager_page_split;
+
 static inline u64 nsec_to_cycles(struct kvm_vcpu *vcpu, u64 nsec)
 {
        return pvclock_scale_delta(nsec, vcpu->arch.virtual_tsc_mult,