
KVM: x86/mmu: Allow zap gfn range to operate under the mmu read lock
author     Ben Gardon <bgardon@google.com>      Thu, 1 Apr 2021 23:37:32 +0000 (16:37 -0700)
committer  Paolo Bonzini <pbonzini@redhat.com>  Mon, 19 Apr 2021 13:06:04 +0000 (09:06 -0400)

To reduce lock contention and interference with page fault handlers,
allow the TDP MMU function to zap a GFN range to operate under the MMU
read lock.

Signed-off-by: Ben Gardon <bgardon@google.com>
Message-Id: <20210401233736.638171-10-bgardon@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
arch/x86/kvm/mmu/mmu.c
arch/x86/kvm/mmu/tdp_mmu.c
arch/x86/kvm/mmu/tdp_mmu.h

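The core of the change is that the GFN-range zap path now carries a "shared" flag saying whether the caller holds mmu_lock for read or for write, and asserts that expectation via the new kvm_lockdep_assert_mmu_lock_held() helper. The following is a minimal userspace sketch of that pattern using a pthread rwlock; it is an analogy only, not kernel code, and every name in it is illustrative rather than a KVM API.

#include <assert.h>
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_rwlock_t mmu_lock = PTHREAD_RWLOCK_INITIALIZER;

/* 0 = not held, 1 = held for read, 2 = held for write (current thread). */
static _Thread_local int mmu_lock_mode;

static void mmu_read_lock(void)    { pthread_rwlock_rdlock(&mmu_lock); mmu_lock_mode = 1; }
static void mmu_read_unlock(void)  { mmu_lock_mode = 0; pthread_rwlock_unlock(&mmu_lock); }
static void mmu_write_lock(void)   { pthread_rwlock_wrlock(&mmu_lock); mmu_lock_mode = 2; }
static void mmu_write_unlock(void) { mmu_lock_mode = 0; pthread_rwlock_unlock(&mmu_lock); }

/* Stand-in for kvm_lockdep_assert_mmu_lock_held(kvm, shared). */
static void assert_mmu_lock_held(bool shared)
{
	assert(mmu_lock_mode == (shared ? 1 : 2));
}

/* Stand-in for zap_gfn_range(..., bool shared): one body, either lock mode. */
static void zap_range(unsigned long start, unsigned long end, bool shared)
{
	assert_mmu_lock_held(shared);
	printf("zapping [%#lx, %#lx) under the %s lock\n",
	       start, end, shared ? "read" : "write");
}

int main(void)
{
	/* After this patch, the TDP MMU pass of kvm_zap_gfn_range() runs like this. */
	mmu_read_lock();
	zap_range(0x1000, 0x2000, true);
	mmu_read_unlock();

	/* Paths such as kvm_tdp_mmu_zap_all() still hold the lock for write. */
	mmu_write_lock();
	zap_range(0x1000, 0x2000, false);
	mmu_write_unlock();
	return 0;
}
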
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 1ea1191..d29aded 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -3121,7 +3121,7 @@ static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa,
        sp = to_shadow_page(*root_hpa & PT64_BASE_ADDR_MASK);
 
        if (is_tdp_mmu_page(sp))
-               kvm_tdp_mmu_put_root(kvm, sp);
+               kvm_tdp_mmu_put_root(kvm, sp, false);
        else if (!--sp->root_count && sp->role.invalid)
                kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
 
@@ -5496,16 +5496,24 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
                }
        }
 
-       if (is_tdp_mmu_enabled(kvm)) {
-               for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++)
-                       flush = kvm_tdp_mmu_zap_gfn_range(kvm, i, gfn_start,
-                                                         gfn_end, flush);
-       }
-
        if (flush)
                kvm_flush_remote_tlbs_with_address(kvm, gfn_start, gfn_end);
 
        write_unlock(&kvm->mmu_lock);
+
+       if (is_tdp_mmu_enabled(kvm)) {
+               flush = false;
+
+               read_lock(&kvm->mmu_lock);
+               for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++)
+                       flush = kvm_tdp_mmu_zap_gfn_range(kvm, i, gfn_start,
+                                                         gfn_end, flush, true);
+               if (flush)
+                       kvm_flush_remote_tlbs_with_address(kvm, gfn_start,
+                                                          gfn_end);
+
+               read_unlock(&kvm->mmu_lock);
+       }
 }
 
 static bool slot_rmap_write_protect(struct kvm *kvm,
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index ef65fd9..2fb8103 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -27,6 +27,15 @@ void kvm_mmu_init_tdp_mmu(struct kvm *kvm)
        INIT_LIST_HEAD(&kvm->arch.tdp_mmu_pages);
 }
 
+static __always_inline void kvm_lockdep_assert_mmu_lock_held(struct kvm *kvm,
+                                                            bool shared)
+{
+       if (shared)
+               lockdep_assert_held_read(&kvm->mmu_lock);
+       else
+               lockdep_assert_held_write(&kvm->mmu_lock);
+}
+
 void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
 {
        if (!kvm->arch.tdp_mmu_enabled)
@@ -42,7 +51,8 @@ void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
 }
 
 static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
-                         gfn_t start, gfn_t end, bool can_yield, bool flush);
+                         gfn_t start, gfn_t end, bool can_yield, bool flush,
+                         bool shared);
 
 static void tdp_mmu_free_sp(struct kvm_mmu_page *sp)
 {
@@ -66,11 +76,12 @@ static void tdp_mmu_free_sp_rcu_callback(struct rcu_head *head)
        tdp_mmu_free_sp(sp);
 }
 
-void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root)
+void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root,
+                         bool shared)
 {
        gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT);
 
-       lockdep_assert_held_write(&kvm->mmu_lock);
+       kvm_lockdep_assert_mmu_lock_held(kvm, shared);
 
        if (!refcount_dec_and_test(&root->tdp_mmu_root_count))
                return;
@@ -81,7 +92,7 @@ void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root)
        list_del_rcu(&root->link);
        spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
 
-       zap_gfn_range(kvm, root, 0, max_gfn, false, false);
+       zap_gfn_range(kvm, root, 0, max_gfn, false, false, shared);
 
        call_rcu(&root->rcu_head, tdp_mmu_free_sp_rcu_callback);
 }
@@ -94,12 +105,11 @@ void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root)
  * function will return NULL.
  */
 static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
-                                             struct kvm_mmu_page *prev_root)
+                                             struct kvm_mmu_page *prev_root,
+                                             bool shared)
 {
        struct kvm_mmu_page *next_root;
 
-       lockdep_assert_held_write(&kvm->mmu_lock);
-
        rcu_read_lock();
 
        if (prev_root)
@@ -117,7 +127,7 @@ static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
        rcu_read_unlock();
 
        if (prev_root)
-               kvm_tdp_mmu_put_root(kvm, prev_root);
+               kvm_tdp_mmu_put_root(kvm, prev_root, shared);
 
        return next_root;
 }
@@ -127,12 +137,16 @@ static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
  * This makes it safe to release the MMU lock and yield within the loop, but
  * if exiting the loop early, the caller must drop the reference to the most
  * recent root. (Unless keeping a live reference is desirable.)
+ *
+ * If shared is set, this function is operating under the MMU lock in read
+ * mode. In the unlikely event that this thread must free a root, the lock
+ * will be temporarily dropped and reacquired in write mode.
  */
-#define for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id)           \
-       for (_root = tdp_mmu_next_root(_kvm, NULL);             \
-            _root;                                             \
-            _root = tdp_mmu_next_root(_kvm, _root))            \
-               if (kvm_mmu_page_as_id(_root) != _as_id) {      \
+#define for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared) \
+       for (_root = tdp_mmu_next_root(_kvm, NULL, _shared);            \
+            _root;                                                     \
+            _root = tdp_mmu_next_root(_kvm, _root, _shared))           \
+               if (kvm_mmu_page_as_id(_root) != _as_id) {              \
                } else
 
 #define for_each_tdp_mmu_root(_kvm, _root, _as_id)                             \
@@ -636,7 +650,8 @@ static inline void tdp_mmu_set_spte_no_dirty_log(struct kvm *kvm,
  * Return false if a yield was not needed.
  */
 static inline bool tdp_mmu_iter_cond_resched(struct kvm *kvm,
-                                            struct tdp_iter *iter, bool flush)
+                                            struct tdp_iter *iter, bool flush,
+                                            bool shared)
 {
        /* Ensure forward progress has been made before yielding. */
        if (iter->next_last_level_gfn == iter->yielded_gfn)
@@ -648,7 +663,11 @@ static inline bool tdp_mmu_iter_cond_resched(struct kvm *kvm,
                if (flush)
                        kvm_flush_remote_tlbs(kvm);
 
-               cond_resched_rwlock_write(&kvm->mmu_lock);
+               if (shared)
+                       cond_resched_rwlock_read(&kvm->mmu_lock);
+               else
+                       cond_resched_rwlock_write(&kvm->mmu_lock);
+
                rcu_read_lock();
 
                WARN_ON(iter->gfn > iter->next_last_level_gfn);
@@ -666,24 +685,32 @@ static inline bool tdp_mmu_iter_cond_resched(struct kvm *kvm,
  * non-root pages mapping GFNs strictly within that range. Returns true if
  * SPTEs have been cleared and a TLB flush is needed before releasing the
  * MMU lock.
+ *
  * If can_yield is true, will release the MMU lock and reschedule if the
  * scheduler needs the CPU or there is contention on the MMU lock. If this
  * function cannot yield, it will not release the MMU lock or reschedule and
  * the caller must ensure it does not supply too large a GFN range, or the
- * operation can cause a soft lockup.  Note, in some use cases a flush may be
- * required by prior actions.  Ensure the pending flush is performed prior to
- * yielding.
+ * operation can cause a soft lockup.
+ *
+ * If shared is true, this thread holds the MMU lock in read mode and must
+ * account for the possibility that other threads are modifying the paging
+ * structures concurrently. If shared is false, this thread should hold the
+ * MMU lock in write mode.
  */
 static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
-                         gfn_t start, gfn_t end, bool can_yield, bool flush)
+                         gfn_t start, gfn_t end, bool can_yield, bool flush,
+                         bool shared)
 {
        struct tdp_iter iter;
 
+       kvm_lockdep_assert_mmu_lock_held(kvm, shared);
+
        rcu_read_lock();
 
        tdp_root_for_each_pte(iter, root, start, end) {
+retry:
                if (can_yield &&
-                   tdp_mmu_iter_cond_resched(kvm, &iter, flush)) {
+                   tdp_mmu_iter_cond_resched(kvm, &iter, flush, shared)) {
                        flush = false;
                        continue;
                }
@@ -701,8 +728,17 @@ static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
                    !is_last_spte(iter.old_spte, iter.level))
                        continue;
 
-               tdp_mmu_set_spte(kvm, &iter, 0);
-               flush = true;
+               if (!shared) {
+                       tdp_mmu_set_spte(kvm, &iter, 0);
+                       flush = true;
+               } else if (!tdp_mmu_zap_spte_atomic(kvm, &iter)) {
+                       /*
+                        * The iter must explicitly re-read the SPTE because
+                        * the atomic cmpxchg failed.
+                        */
+                       iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep));
+                       goto retry;
+               }
        }
 
        rcu_read_unlock();
@@ -714,14 +750,21 @@ static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
  * non-root pages mapping GFNs strictly within that range. Returns true if
  * SPTEs have been cleared and a TLB flush is needed before releasing the
  * MMU lock.
+ *
+ * If shared is true, this thread holds the MMU lock in read mode and must
+ * account for the possibility that other threads are modifying the paging
+ * structures concurrently. If shared is false, this thread should hold the
+ * MMU lock in write mode.
  */
 bool __kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, int as_id, gfn_t start,
-                                gfn_t end, bool can_yield, bool flush)
+                                gfn_t end, bool can_yield, bool flush,
+                                bool shared)
 {
        struct kvm_mmu_page *root;
 
-       for_each_tdp_mmu_root_yield_safe(kvm, root, as_id)
-               flush = zap_gfn_range(kvm, root, start, end, can_yield, flush);
+       for_each_tdp_mmu_root_yield_safe(kvm, root, as_id, shared)
+               flush = zap_gfn_range(kvm, root, start, end, can_yield, flush,
+                                     shared);
 
        return flush;
 }
@@ -733,7 +776,8 @@ void kvm_tdp_mmu_zap_all(struct kvm *kvm)
        int i;
 
        for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++)
-               flush = kvm_tdp_mmu_zap_gfn_range(kvm, i, 0, max_gfn, flush);
+               flush = kvm_tdp_mmu_zap_gfn_range(kvm, i, 0, max_gfn,
+                                                 flush, false);
 
        if (flush)
                kvm_flush_remote_tlbs(kvm);
@@ -892,7 +936,7 @@ bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range,
 
        for_each_tdp_mmu_root(kvm, root, range->slot->as_id)
                flush |= zap_gfn_range(kvm, root, range->start, range->end,
-                                      range->may_block, flush);
+                                      range->may_block, flush, false);
 
        return flush;
 }
@@ -1038,7 +1082,7 @@ static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
 
        for_each_tdp_pte_min_level(iter, root->spt, root->role.level,
                                   min_level, start, end) {
-               if (tdp_mmu_iter_cond_resched(kvm, &iter, false))
+               if (tdp_mmu_iter_cond_resched(kvm, &iter, false, false))
                        continue;
 
                if (!is_shadow_present_pte(iter.old_spte) ||
@@ -1067,7 +1111,7 @@ bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm, struct kvm_memory_slot *slot,
        struct kvm_mmu_page *root;
        bool spte_set = false;
 
-       for_each_tdp_mmu_root_yield_safe(kvm, root, slot->as_id)
+       for_each_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, false)
                spte_set |= wrprot_gfn_range(kvm, root, slot->base_gfn,
                             slot->base_gfn + slot->npages, min_level);
 
@@ -1091,7 +1135,7 @@ static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
        rcu_read_lock();
 
        tdp_root_for_each_leaf_pte(iter, root, start, end) {
-               if (tdp_mmu_iter_cond_resched(kvm, &iter, false))
+               if (tdp_mmu_iter_cond_resched(kvm, &iter, false, false))
                        continue;
 
                if (spte_ad_need_write_protect(iter.old_spte)) {
@@ -1126,7 +1170,7 @@ bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm, struct kvm_memory_slot *slot)
        struct kvm_mmu_page *root;
        bool spte_set = false;
 
-       for_each_tdp_mmu_root_yield_safe(kvm, root, slot->as_id)
+       for_each_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, false)
                spte_set |= clear_dirty_gfn_range(kvm, root, slot->base_gfn,
                                slot->base_gfn + slot->npages);
 
@@ -1213,7 +1257,7 @@ static bool zap_collapsible_spte_range(struct kvm *kvm,
        rcu_read_lock();
 
        tdp_root_for_each_pte(iter, root, start, end) {
-               if (tdp_mmu_iter_cond_resched(kvm, &iter, flush)) {
+               if (tdp_mmu_iter_cond_resched(kvm, &iter, flush, false)) {
                        flush = false;
                        continue;
                }
@@ -1248,7 +1292,7 @@ bool kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
 {
        struct kvm_mmu_page *root;
 
-       for_each_tdp_mmu_root_yield_safe(kvm, root, slot->as_id)
+       for_each_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, false)
                flush = zap_collapsible_spte_range(kvm, root, slot, flush);
 
        return flush;
diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h
index 25268c4..2e1913b 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.h
+++ b/arch/x86/kvm/mmu/tdp_mmu.h
@@ -13,14 +13,18 @@ __must_check static inline bool kvm_tdp_mmu_get_root(struct kvm *kvm,
        return refcount_inc_not_zero(&root->tdp_mmu_root_count);
 }
 
-void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root);
+void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root,
+                         bool shared);
 
 bool __kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, int as_id, gfn_t start,
-                                gfn_t end, bool can_yield, bool flush);
+                                gfn_t end, bool can_yield, bool flush,
+                                bool shared);
 static inline bool kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, int as_id,
-                                            gfn_t start, gfn_t end, bool flush)
+                                            gfn_t start, gfn_t end, bool flush,
+                                            bool shared)
 {
-       return __kvm_tdp_mmu_zap_gfn_range(kvm, as_id, start, end, true, flush);
+       return __kvm_tdp_mmu_zap_gfn_range(kvm, as_id, start, end, true, flush,
+                                          shared);
 }
 static inline bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
 {
@@ -37,7 +41,7 @@ static inline bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
         */
        lockdep_assert_held_write(&kvm->mmu_lock);
        return __kvm_tdp_mmu_zap_gfn_range(kvm, kvm_mmu_page_as_id(sp),
-                                          sp->gfn, end, false, false);
+                                          sp->gfn, end, false, false, false);
 }
 void kvm_tdp_mmu_zap_all(struct kvm *kvm);
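
For reference, the zap_gfn_range() hunk above clears SPTEs with tdp_mmu_zap_spte_atomic() when shared is true, and retries after re-reading the SPTE if the compare-and-exchange loses a race with a concurrent modifier. The following is a minimal userspace sketch of that retry shape using C11 atomics; it is illustrative only, uses no KVM names or semantics, and with a single thread the retry path never actually fires.

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

/* A single fake page-table entry; in the kernel this would be an SPTE. */
static _Atomic uint64_t entry = 0xdeadbeefULL;

/*
 * Stand-in for tdp_mmu_zap_spte_atomic(): clear the entry only if it still
 * holds the value the caller last observed.  Returns 0 when another thread
 * changed it first and the caller must re-read and retry.
 */
static int zap_entry_atomic(uint64_t old)
{
	uint64_t expected = old;

	return atomic_compare_exchange_strong(&entry, &expected, 0);
}

int main(void)
{
	uint64_t old = atomic_load(&entry);

retry:
	if (!zap_entry_atomic(old)) {
		/* Mirror the patch: re-read the entry after a failed cmpxchg. */
		old = atomic_load(&entry);
		goto retry;
	}

	printf("entry cleared, previous value %#llx\n", (unsigned long long)old);
	return 0;
}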