
mm: backport speculative page fault
author Park Ju Hyung <qkrwngud825@gmail.com>
Thu, 20 Jun 2019 18:53:36 +0000 (03:53 +0900)
committer 0ranko0P <ranko0p@outlook.com>
Wed, 4 Dec 2019 17:17:29 +0000 (01:17 +0800)
Signed-off-by: Park Ju Hyung <qkrwngud825@gmail.com>
28 files changed:
arch/arm64/Kconfig
arch/arm64/mm/fault.c
fs/exec.c
fs/proc/task_mmu.c
include/linux/mm.h
include/linux/mm_types.h
include/linux/pagemap.h
include/linux/rmap.h
include/linux/swap.h
include/linux/vm_event_item.h
include/trace/events/pagefault.h [new file with mode: 0644]
include/uapi/linux/perf_event.h
kernel/fork.c
mm/Kconfig
mm/filemap.c
mm/init-mm.c
mm/internal.h
mm/madvise.c
mm/memory.c
mm/migrate.c
mm/mlock.c
mm/mmap.c
mm/mprotect.c
mm/mremap.c
mm/nommu.c
mm/rmap.c
mm/swap.c
mm/vmstat.c

diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 83f04b1..2861902 100644
@@ -106,6 +106,7 @@ config ARM64
        select HAVE_CONTEXT_TRACKING
        select HAVE_ARM_SMCCC
        select THREAD_INFO_IN_TASK
+       select ARCH_SUPPORTS_SPECULATIVE_PAGE_FAULT
        help
          ARM 64-bit (AArch64) Linux support.
 
diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
index 4970252..66e6e5f 100644
@@ -343,6 +343,16 @@ static int __kprobes do_page_fault(unsigned long addr, unsigned int esr,
        }
 
        /*
+        * let's try a speculative page fault without grabbing the
+        * mmap_sem.
+        */
+       fault = handle_speculative_fault(mm, addr, mm_flags);
+       if (fault != VM_FAULT_RETRY) {
+               perf_sw_event(PERF_COUNT_SW_SPF, 1, regs, addr);
+               goto done;
+       }
+
+       /*
         * As per x86, we may deadlock here. However, since the kernel only
         * validly references user space from well defined areas of the code,
         * we can bug out early if this is from code which shouldn't.
@@ -407,6 +417,8 @@ retry:
 
        up_read(&mm->mmap_sem);
 
+done:
+
        /*
         * Handle the "normal" case first - VM_FAULT_MAJOR / VM_FAULT_MINOR
         */
diff --git a/fs/exec.c b/fs/exec.c
index 23215a7..83b58dd 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -303,7 +303,7 @@ static int __bprm_mm_init(struct linux_binprm *bprm)
        vma->vm_start = vma->vm_end - PAGE_SIZE;
        vma->vm_flags = VM_SOFTDIRTY | VM_STACK_FLAGS | VM_STACK_INCOMPLETE_SETUP;
        vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
-       INIT_LIST_HEAD(&vma->anon_vma_chain);
+       INIT_VMA(vma);
 
        err = insert_vm_struct(mm, vma);
        if (err)
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index c3faa39..3204874 100644
@@ -1103,8 +1103,11 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
                                        goto out_mm;
                                }
                                for (vma = mm->mmap; vma; vma = vma->vm_next) {
-                                       vma->vm_flags &= ~VM_SOFTDIRTY;
+                                       vm_write_begin(vma);
+                                       WRITE_ONCE(vma->vm_flags,
+                                                  vma->vm_flags & ~VM_SOFTDIRTY);
                                        vma_set_page_prot(vma);
+                                       vm_write_end(vma);
                                }
                                downgrade_write(&mm->mmap_sem);
                                break;
@@ -1508,7 +1511,7 @@ const struct file_operations proc_pagemap_operations = {
 #endif /* CONFIG_PROC_PAGE_MONITOR */
 
 #ifdef CONFIG_PROCESS_RECLAIM
-static int reclaim_pte_range(pmd_t *pmd, unsigned long addr,
+int reclaim_pte_range(pmd_t *pmd, unsigned long addr,
                                unsigned long end, struct mm_walk *walk)
 {
        struct reclaim_param *rp = walk->private;
@@ -1535,6 +1538,9 @@ cont:
                if (!page)
                        continue;
 
+               if (page_mapcount(page) != 1)
+                       continue;
+
                if (isolate_lru_page(page))
                        continue;
 
@@ -1557,7 +1563,7 @@ cont:
                goto cont;
 
        cond_resched();
-       return 0;
+       return (rp->nr_to_reclaim == 0) ? -EPIPE : 0;
 }
 
 enum reclaim_type {
@@ -1628,6 +1634,7 @@ static ssize_t reclaim_write(struct file *file, const char __user *buf,
        unsigned long start = 0;
        unsigned long end = 0;
        struct reclaim_param rp;
+       int ret;
 
        memset(buffer, 0, sizeof(buffer));
        if (count > sizeof(buffer) - 1)
@@ -1689,7 +1696,7 @@ static ssize_t reclaim_write(struct file *file, const char __user *buf,
        reclaim_walk.mm = mm;
        reclaim_walk.pmd_entry = reclaim_pte_range;
 
-       rp.nr_to_reclaim = ~0;
+       rp.nr_to_reclaim = INT_MAX;
        rp.nr_reclaimed = 0;
        reclaim_walk.private = &rp;
 
@@ -1703,9 +1710,11 @@ static ssize_t reclaim_write(struct file *file, const char __user *buf,
                                continue;
 
                        rp.vma = vma;
-                       walk_page_range(max(vma->vm_start, start),
+                       ret = walk_page_range(max(vma->vm_start, start),
                                        min(vma->vm_end, end),
                                        &reclaim_walk);
+                       if (ret)
+                               break;
                        vma = vma->vm_next;
                }
        } else {
@@ -1720,8 +1729,10 @@ static ssize_t reclaim_write(struct file *file, const char __user *buf,
                                continue;
 
                        rp.vma = vma;
-                       walk_page_range(vma->vm_start, vma->vm_end,
+                       ret = walk_page_range(vma->vm_start, vma->vm_end,
                                &reclaim_walk);
+                       if (ret)
+                               break;
                }
        }
 
diff --git a/include/linux/mm.h b/include/linux/mm.h
index ed48a15..4ea0553 100644
@@ -235,6 +235,7 @@ extern pgprot_t protection_map[16];
 #define FAULT_FLAG_KILLABLE    0x10    /* The fault task is in SIGKILL killable region */
 #define FAULT_FLAG_TRIED       0x20    /* Second try */
 #define FAULT_FLAG_USER                0x40    /* The fault originated in userspace */
+#define FAULT_FLAG_SPECULATIVE 0x200   /* Speculative fault, not holding mmap_sem */
 
 /*
  * vm_fault is filled by the the pagefault handler and passed to the vma's
@@ -247,6 +248,29 @@ extern pgprot_t protection_map[16];
  * pgoff should be used in favour of virtual_address, if possible.
  */
 struct vm_fault {
+       struct vm_area_struct *vma;     /* Target VMA */
+       pmd_t *pmd;                     /* Pointer to pmd entry matching
+                                        * the 'address' */
+       pud_t *pud;                     /* Pointer to pud entry matching
+                                        * the 'address'
+                                        */
+       unsigned long address;          /* Faulting virtual address */
+       spinlock_t *ptl;                /* Page table lock.
+                                        * Protects pte page table if 'pte'
+                                        * is not NULL, otherwise pmd.
+                                        */
+       pte_t orig_pte;                 /* Value of PTE at the time of fault */
+       /*
+        * These entries are required when handling a speculative page fault.
+        * This way the page handling is done using consistent field values.
+        */
+       unsigned long vma_flags;
+       pgprot_t vma_page_prot;
+#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
+       unsigned int sequence;
+       pmd_t orig_pmd;                 /* Value of PMD at the time of fault */
+#endif
+
        unsigned int flags;             /* FAULT_FLAG_xxx flags */
        gfp_t gfp_mask;                 /* gfp mask to be used for allocations */
        pgoff_t pgoff;                  /* Logical page offset based on vma */
@@ -588,15 +612,15 @@ static inline void set_compound_order(struct page *page, unsigned int order)
  * pte_mkwrite.  But get_user_pages can cause write faults for mappings
  * that do not have writing enabled, when used by access_process_vm.
  */
-static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
+static inline pte_t maybe_mkwrite(pte_t pte, unsigned long vma_flags)
 {
-       if (likely(vma->vm_flags & VM_WRITE))
+       if (likely(vma_flags & VM_WRITE))
                pte = pte_mkwrite(pte);
        return pte;
 }
 
 void do_set_pte(struct vm_area_struct *vma, unsigned long address,
-               struct page *page, pte_t *pte, bool write, bool anon);
+               struct page *page, pte_t *pte, bool write, bool anon, struct vm_fault *vmf2);
 #endif
 
 /*
@@ -1052,6 +1076,7 @@ static inline void clear_page_pfmemalloc(struct page *page)
 #define VM_FAULT_LOCKED        0x0200  /* ->fault locked the returned page */
 #define VM_FAULT_RETRY 0x0400  /* ->fault blocked, must retry */
 #define VM_FAULT_FALLBACK 0x0800       /* huge page fault failed, fall back to small */
+#define VM_FAULT_PTNOTSAME 0x4000      /* Page table entries have changed */
 
 #define VM_FAULT_HWPOISON_LARGE_MASK 0xf000 /* encodes hpage index for large hwpoison */
 
@@ -1103,8 +1128,23 @@ struct zap_details {
        pgoff_t last_index;                     /* Highest page->index to unmap */
 };
 
-struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
-               pte_t pte);
+static inline void INIT_VMA(struct vm_area_struct *vma)
+{
+       INIT_LIST_HEAD(&vma->anon_vma_chain);
+#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
+       seqcount_init(&vma->vm_sequence);
+       atomic_set(&vma->vm_ref_count, 1);
+#endif
+}
+
+struct page *__vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
+                               pte_t pte, unsigned long vma_flags);
+static inline struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
+                               pte_t pte)
+{
+       return __vm_normal_page(vma, addr, pte, vma->vm_flags);
+}
+
 struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr,
                                pmd_t pmd);
 
@@ -1206,6 +1246,31 @@ int invalidate_inode_page(struct page *page);
 #ifdef CONFIG_MMU
 extern int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
                        unsigned long address, unsigned int flags);
+
+#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
+extern int __handle_speculative_fault(struct mm_struct *mm,
+                                     unsigned long address,
+                                     unsigned int flags);
+static inline int handle_speculative_fault(struct mm_struct *mm,
+                                          unsigned long address,
+                                          unsigned int flags)
+{
+       /*
+        * Try speculative page fault for multithreaded user space tasks only.
+        */
+       if (!(flags & FAULT_FLAG_USER) || atomic_read(&mm->mm_users) == 1)
+               return VM_FAULT_RETRY;
+       return __handle_speculative_fault(mm, address, flags);
+}
+#else
+static inline int handle_speculative_fault(struct mm_struct *mm,
+                                          unsigned long address,
+                                          unsigned int flags)
+{
+       return VM_FAULT_RETRY;
+}
+#endif /* CONFIG_SPECULATIVE_PAGE_FAULT */
+
 extern int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
                            unsigned long address, unsigned int fault_flags);
 #else
@@ -1227,6 +1292,47 @@ static inline int fixup_user_fault(struct task_struct *tsk,
 }
 #endif
 
+#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
+static inline void vm_write_begin(struct vm_area_struct *vma)
+{
+       write_seqcount_begin(&vma->vm_sequence);
+}
+static inline void vm_write_begin_nested(struct vm_area_struct *vma,
+                                        int subclass)
+{
+       write_seqcount_begin_nested(&vma->vm_sequence, subclass);
+}
+static inline void vm_write_end(struct vm_area_struct *vma)
+{
+       write_seqcount_end(&vma->vm_sequence);
+}
+static inline void vm_raw_write_begin(struct vm_area_struct *vma)
+{
+       raw_write_seqcount_begin(&vma->vm_sequence);
+}
+static inline void vm_raw_write_end(struct vm_area_struct *vma)
+{
+       raw_write_seqcount_end(&vma->vm_sequence);
+}
+#else
+static inline void vm_write_begin(struct vm_area_struct *vma)
+{
+}
+static inline void vm_write_begin_nested(struct vm_area_struct *vma,
+                                        int subclass)
+{
+}
+static inline void vm_write_end(struct vm_area_struct *vma)
+{
+}
+static inline void vm_raw_write_begin(struct vm_area_struct *vma)
+{
+}
+static inline void vm_raw_write_end(struct vm_area_struct *vma)
+{
+}
+#endif /* CONFIG_SPECULATIVE_PAGE_FAULT */
+
 extern int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write);
 extern int access_remote_vm(struct mm_struct *mm, unsigned long addr,
                void *buf, int len, unsigned int gup_flags);
@@ -1873,12 +1979,26 @@ void anon_vma_interval_tree_verify(struct anon_vma_chain *node);
 
 /* mmap.c */
 extern int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin);
-extern int vma_adjust(struct vm_area_struct *vma, unsigned long start,
-       unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert);
-extern struct vm_area_struct *vma_merge(struct mm_struct *,
+extern int __vma_adjust(struct vm_area_struct *vma, unsigned long start,
+       unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert, bool keep_locked);
+static inline int vma_adjust(struct vm_area_struct *vma, unsigned long start,
+       unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert)
+{
+       return __vma_adjust(vma, start, end, pgoff, insert, false);
+}
+extern struct vm_area_struct *__vma_merge(struct mm_struct *,
        struct vm_area_struct *prev, unsigned long addr, unsigned long end,
        unsigned long vm_flags, struct anon_vma *, struct file *, pgoff_t,
-       struct mempolicy *, struct vm_userfaultfd_ctx, const char __user *);
+       struct mempolicy *, struct vm_userfaultfd_ctx, const char __user *, bool keep_locked);
+static inline struct vm_area_struct *vma_merge(struct mm_struct *mm,
+               struct vm_area_struct *prev, unsigned long addr, unsigned long end,
+               unsigned long vm_flags, struct anon_vma *anon, struct file *file, pgoff_t off,
+               struct mempolicy *pol, struct vm_userfaultfd_ctx ctx, const char __user *anon_name)
+{
+       return __vma_merge(mm, prev, addr, end, vm_flags, anon, file,
+               off, pol, ctx, anon_name, false);
+}
+
 extern struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *);
 extern int split_vma(struct mm_struct *,
        struct vm_area_struct *, unsigned long addr, int new_below);
@@ -2391,6 +2511,9 @@ struct reclaim_param {
 };
 extern struct reclaim_param reclaim_task_anon(struct task_struct *task,
                int nr_to_reclaim);
+extern int reclaim_pte_range(pmd_t *pmd, unsigned long addr,
+                               unsigned long end, struct mm_walk *walk);
+extern unsigned long reclaim_global(unsigned long nr_to_reclaim);
 #endif
 
 #endif /* __KERNEL__ */
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 29c17fa..d7f4e01 100644
@@ -367,6 +367,10 @@ struct vm_area_struct {
        struct mempolicy *vm_policy;    /* NUMA policy for the VMA */
 #endif
        struct vm_userfaultfd_ctx vm_userfaultfd_ctx;
+#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
+       seqcount_t vm_sequence;
+       atomic_t vm_ref_count;          /* see vma_get(), vma_put() */
+#endif
 };
 
 struct core_thread {
@@ -404,6 +408,9 @@ struct kioctx_table;
 struct mm_struct {
        struct vm_area_struct *mmap;            /* list of VMAs */
        struct rb_root mm_rb;
+#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
+       rwlock_t mm_rb_lock;
+#endif
        u64 vmacache_seqnum;                   /* per-thread vmacache */
 #ifdef CONFIG_MMU
        unsigned long (*get_unmapped_area) (struct file *filp,
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index d2f4a73..ef0f002 100644
@@ -430,8 +430,10 @@ static inline pgoff_t linear_page_index(struct vm_area_struct *vma,
        pgoff_t pgoff;
        if (unlikely(is_vm_hugetlb_page(vma)))
                return linear_hugepage_index(vma, address);
-       pgoff = (address - vma->vm_start) >> PAGE_SHIFT;
-       pgoff += vma->vm_pgoff;
+
+       pgoff = (address - READ_ONCE(vma->vm_start)) >> PAGE_SHIFT;
+       pgoff += READ_ONCE(vma->vm_pgoff);
+
        return pgoff >> (PAGE_CACHE_SHIFT - PAGE_SHIFT);
 }
 
diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index e72b857..9eb8f9d 100644
@@ -159,7 +159,14 @@ void page_move_anon_rmap(struct page *, struct vm_area_struct *, unsigned long);
 void page_add_anon_rmap(struct page *, struct vm_area_struct *, unsigned long);
 void do_page_add_anon_rmap(struct page *, struct vm_area_struct *,
                           unsigned long, int);
-void page_add_new_anon_rmap(struct page *, struct vm_area_struct *, unsigned long);
+void __page_add_new_anon_rmap(struct page *, struct vm_area_struct *, unsigned long);
+static inline void page_add_new_anon_rmap(struct page *page,
+                                       struct vm_area_struct *vma, unsigned long address)
+{
+       VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma);
+       __page_add_new_anon_rmap(page, vma, address);
+}
+
 void page_add_file_rmap(struct page *);
 void page_remove_rmap(struct page *);
 
diff --git a/include/linux/swap.h b/include/linux/swap.h
index f1007d5..d3f84dc 100644
@@ -318,8 +318,13 @@ extern void swap_setup(void);
 
 extern void add_page_to_unevictable_list(struct page *page);
 
-extern void lru_cache_add_active_or_unevictable(struct page *page,
-                                               struct vm_area_struct *vma);
+extern void __lru_cache_add_active_or_unevictable(struct page *page,
+                                               unsigned long vma_flags);
+static inline void lru_cache_add_active_or_unevictable(struct page *page,
+                                               struct vm_area_struct *vma)
+{
+       return __lru_cache_add_active_or_unevictable(page, vma->vm_flags);
+}
 
 /* linux/mm/vmscan.c */
 extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h
index d31afe5..25d09ff 100644
@@ -90,6 +90,9 @@ enum vm_event_item { PGPGIN, PGPGOUT, PGPGOUTCLEAN, PSWPIN, PSWPOUT,
                VMACACHE_FIND_CALLS,
                VMACACHE_FIND_HITS,
 #endif
+#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
+               SPECULATIVE_PGFAULT,
+#endif
                NR_VM_EVENT_ITEMS
 };
 
diff --git a/include/trace/events/pagefault.h b/include/trace/events/pagefault.h
new file mode 100644
index 0000000..d9438f3
--- /dev/null
+++ b/include/trace/events/pagefault.h
@@ -0,0 +1,80 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM pagefault
+
+#if !defined(_TRACE_PAGEFAULT_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_PAGEFAULT_H
+
+#include <linux/tracepoint.h>
+#include <linux/mm.h>
+
+DECLARE_EVENT_CLASS(spf,
+
+       TP_PROTO(unsigned long caller,
+                struct vm_area_struct *vma, unsigned long address),
+
+       TP_ARGS(caller, vma, address),
+
+       TP_STRUCT__entry(
+               __field(unsigned long, caller)
+               __field(unsigned long, vm_start)
+               __field(unsigned long, vm_end)
+               __field(unsigned long, address)
+       ),
+
+       TP_fast_assign(
+               __entry->caller         = caller;
+               __entry->vm_start       = vma->vm_start;
+               __entry->vm_end         = vma->vm_end;
+               __entry->address        = address;
+       ),
+
+       TP_printk("ip:%lx vma:%lx-%lx address:%lx",
+                 __entry->caller, __entry->vm_start, __entry->vm_end,
+                 __entry->address)
+);
+
+DEFINE_EVENT(spf, spf_vma_changed,
+
+       TP_PROTO(unsigned long caller,
+                struct vm_area_struct *vma, unsigned long address),
+
+       TP_ARGS(caller, vma, address)
+);
+
+DEFINE_EVENT(spf, spf_vma_noanon,
+
+       TP_PROTO(unsigned long caller,
+                struct vm_area_struct *vma, unsigned long address),
+
+       TP_ARGS(caller, vma, address)
+);
+
+DEFINE_EVENT(spf, spf_vma_notsup,
+
+       TP_PROTO(unsigned long caller,
+                struct vm_area_struct *vma, unsigned long address),
+
+       TP_ARGS(caller, vma, address)
+);
+
+DEFINE_EVENT(spf, spf_vma_access,
+
+       TP_PROTO(unsigned long caller,
+                struct vm_area_struct *vma, unsigned long address),
+
+       TP_ARGS(caller, vma, address)
+);
+
+DEFINE_EVENT(spf, spf_pmd_changed,
+
+       TP_PROTO(unsigned long caller,
+                struct vm_area_struct *vma, unsigned long address),
+
+       TP_ARGS(caller, vma, address)
+);
+
+#endif /* _TRACE_PAGEFAULT_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index 686da16..80c9564 100644
@@ -111,6 +111,7 @@ enum perf_sw_ids {
        PERF_COUNT_SW_EMULATION_FAULTS          = 8,
        PERF_COUNT_SW_DUMMY                     = 9,
        PERF_COUNT_SW_BPF_OUTPUT                = 10,
+       PERF_COUNT_SW_SPF                       = 11,
 
        PERF_COUNT_SW_MAX,                      /* non-ABI */
 };
diff --git a/kernel/fork.c b/kernel/fork.c
index 5d3cab4..6a9c619 100644
@@ -468,7 +468,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
                if (!tmp)
                        goto fail_nomem;
                *tmp = *mpnt;
-               INIT_LIST_HEAD(&tmp->anon_vma_chain);
+               INIT_VMA(tmp);
                retval = vma_dup_policy(mpnt, tmp);
                if (retval)
                        goto fail_nomem_policy;
@@ -610,6 +610,9 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
        mm->mmap = NULL;
        mm->mm_rb = RB_ROOT;
        mm->vmacache_seqnum = 0;
+#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
+       rwlock_init(&mm->mm_rb_lock);
+#endif
        atomic_set(&mm->mm_users, 1);
        atomic_set(&mm->mm_count, 1);
        init_rwsem(&mm->mmap_sem);
diff --git a/mm/Kconfig b/mm/Kconfig
index 274a315..80050a3 100644
@@ -751,3 +751,24 @@ config VM_MAX_READAHEAD
         This sets the VM_MAX_READAHEAD value to allow the readahead window
         to grow to a maximum size of configured. This will benefit sequential
         read throughput and thus early boot performance.
+config ARCH_SUPPORTS_SPECULATIVE_PAGE_FAULT
+       def_bool n
+
+config SPECULATIVE_PAGE_FAULT
+       bool "Speculative page faults"
+       default y
+       depends on ARCH_SUPPORTS_SPECULATIVE_PAGE_FAULT
+       depends on MMU && SMP
+       help
+         Try to handle user space page faults without holding the mmap_sem.
+
+         This should allow better concurrency for massively threaded processes,
+         since the page fault handler will not wait for another thread's memory
+         layout change to complete, assuming that this change is done in
+         another part of the process's address space. This type of page fault
+         is called a speculative page fault.
+
+         If a concurrent modification is detected, or if the underlying PMD or
+         PTE tables are not yet allocated, the speculative page fault fails
+         and a classic page fault is then tried.
diff --git a/mm/filemap.c b/mm/filemap.c
index 6f3c539..fe72693 100644
@@ -2147,7 +2147,7 @@ repeat:
                if (file->f_ra.mmap_miss > 0)
                        file->f_ra.mmap_miss--;
                addr = address + (page->index - vmf->pgoff) * PAGE_SIZE;
-               do_set_pte(vma, addr, page, pte, false, false);
+               do_set_pte(vma, addr, page, pte, false, false, vmf);
                unlock_page(page);
                goto next;
 unlock:
diff --git a/mm/init-mm.c b/mm/init-mm.c
index 975e49f..f94d003 100644
@@ -16,6 +16,9 @@
 
 struct mm_struct init_mm = {
        .mm_rb          = RB_ROOT,
+#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
+       .mm_rb_lock = __RW_LOCK_UNLOCKED(init_mm.mm_rb_lock),
+#endif
        .pgd            = swapper_pg_dir,
        .mm_users       = ATOMIC_INIT(2),
        .mm_count       = ATOMIC_INIT(1),
diff --git a/mm/internal.h b/mm/internal.h
index c3533af..88f6ac5 100644
 /* Do not use these with a slab allocator */
 #define GFP_SLAB_BUG_MASK (__GFP_DMA32|__GFP_HIGHMEM|~__GFP_BITS_MASK)
 
+#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
+extern struct vm_area_struct *get_vma(struct mm_struct *mm,
+                                     unsigned long addr);
+extern void put_vma(struct vm_area_struct *vma);
+
+static inline bool vma_has_changed(struct vm_fault *vmf)
+{
+       int ret = RB_EMPTY_NODE(&vmf->vma->vm_rb);
+       unsigned int seq = READ_ONCE(vmf->vma->vm_sequence.sequence);
+
+       /*
+        * Matches both the wmb in write_seqlock_{begin,end}() and
+        * the wmb in vma_rb_erase().
+        */
+       smp_rmb();
+
+       return ret || seq != vmf->sequence;
+}
+#endif /* CONFIG_SPECULATIVE_PAGE_FAULT */
+
 void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma,
                unsigned long floor, unsigned long ceiling);
 
diff --git a/mm/madvise.c b/mm/madvise.c
index d1d09bd..17191ea 100644
@@ -128,7 +128,9 @@ success:
        /*
         * vm_flags is protected by the mmap_sem held in write mode.
         */
-       vma->vm_flags = new_flags;
+       vm_write_begin(vma);
+       WRITE_ONCE(vma->vm_flags, new_flags);
+       vm_write_end(vma);
 
 out:
        if (error == -ENOMEM)
diff --git a/mm/memory.c b/mm/memory.c
index 5dfc9fa..7f9c0c9 100644
@@ -72,6 +72,9 @@
 
 #include "internal.h"
 
+#define CREATE_TRACE_POINTS
+#include <trace/events/pagefault.h>
+
 #if defined(LAST_CPUPID_NOT_IN_PAGE_FLAGS) && !defined(CONFIG_COMPILE_TEST)
 #warning Unfortunate NUMA and NUMA Balancing config, growing page-frame for last_cpupid.
 #endif
@@ -528,7 +531,9 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
                 * Hide vma from rmap and truncate_pagecache before freeing
                 * pgtables
                 */
+               vm_write_begin(vma);
                unlink_anon_vmas(vma);
+               vm_write_end(vma);
                unlink_file_vma(vma);
 
                if (is_vm_hugetlb_page(vma)) {
@@ -542,7 +547,9 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
                               && !is_vm_hugetlb_page(next)) {
                                vma = next;
                                next = vma->vm_next;
+                               vm_write_begin(vma);
                                unlink_anon_vmas(vma);
+                               vm_write_end(vma);
                                unlink_file_vma(vma);
                        }
                        free_pgd_range(tlb, addr, vma->vm_end,
@@ -678,7 +685,7 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
                dump_page(page, "bad pte");
        printk(KERN_ALERT
                "addr:%p vm_flags:%08lx anon_vma:%p mapping:%p index:%lx\n",
-               (void *)addr, vma->vm_flags, vma->anon_vma, mapping, index);
+               (void *)addr, READ_ONCE(vma->vm_flags), vma->anon_vma, mapping, index);
        /*
         * Choose text because data symbols depend on CONFIG_KALLSYMS_ALL=y
         */
@@ -692,7 +699,8 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
 }
 
 /*
- * vm_normal_page -- This function gets the "struct page" associated with a pte.
+ * __vm_normal_page -- This function gets the "struct page" associated with
+ * a pte.
  *
  * "Special" mappings do not wish to be associated with a "struct page" (either
  * it doesn't exist, or it exists but they don't want to touch it). In this
@@ -738,8 +746,8 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
 #else
 # define HAVE_PTE_SPECIAL 0
 #endif
-struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
-                               pte_t pte)
+struct page *__vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
+                               pte_t pte, unsigned long vma_flags)
 {
        unsigned long pfn = pte_pfn(pte);
 
@@ -748,7 +756,7 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
                        goto check_pfn;
                if (vma->vm_ops && vma->vm_ops->find_special_page)
                        return vma->vm_ops->find_special_page(vma, addr);
-               if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))
+               if (vma_flags & (VM_PFNMAP | VM_MIXEDMAP))
                        return NULL;
                if (!is_zero_pfn(pfn))
                        print_bad_pte(vma, addr, pte, NULL);
@@ -757,8 +765,8 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
 
        /* !HAVE_PTE_SPECIAL case follows: */
 
-       if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
-               if (vma->vm_flags & VM_MIXEDMAP) {
+       if (unlikely(vma_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
+               if (vma_flags & VM_MIXEDMAP) {
                        if (!pfn_valid(pfn))
                                return NULL;
                        goto out;
@@ -767,7 +775,7 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
                        off = (addr - vma->vm_start) >> PAGE_SHIFT;
                        if (pfn == vma->vm_pgoff + off)
                                return NULL;
-                       if (!is_cow_mapping(vma->vm_flags))
+                       if (!is_cow_mapping(vma_flags))
                                return NULL;
                }
        }
@@ -925,6 +933,7 @@ static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
        spinlock_t *src_ptl, *dst_ptl;
        int progress = 0;
        int rss[NR_MM_COUNTERS];
+       unsigned long orig_addr = addr;
        swp_entry_t entry = (swp_entry_t){0};
 
 again:
@@ -962,6 +971,14 @@ again:
                progress += 8;
        } while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end);
 
+       /*
+        * Prevent the page fault handler from copying the page while stale
+        * TLB entries are not yet flushed.
+        */
+       if (IS_ENABLED(CONFIG_SPECULATIVE_PAGE_FAULT) &&
+           is_cow_mapping(vma->vm_flags))
+               flush_tlb_range(vma, orig_addr, end);
+
        arch_leave_lazy_mmu_mode();
        spin_unlock(src_ptl);
        pte_unmap(orig_src_pte);
@@ -1289,6 +1306,7 @@ static void unmap_page_range(struct mmu_gather *tlb,
                details = NULL;
 
        BUG_ON(addr >= end);
+       vm_write_begin(vma);
        tlb_start_vma(tlb, vma);
        pgd = pgd_offset(vma->vm_mm, addr);
        do {
@@ -1298,6 +1316,7 @@ static void unmap_page_range(struct mmu_gather *tlb,
                next = zap_pud_range(tlb, vma, pgd, addr, next, details);
        } while (pgd++, addr = next, addr != end);
        tlb_end_vma(tlb, vma);
+       vm_write_end(vma);
 }
 
 
@@ -1969,6 +1988,143 @@ int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
 }
 EXPORT_SYMBOL_GPL(apply_to_page_range);
 
+#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
+static bool pte_spinlock(struct vm_fault *vmf)
+{
+       bool ret = false;
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+       pmd_t pmdval;
+#endif
+
+       /* Check if vma is still valid */
+       if (!(vmf->flags & FAULT_FLAG_SPECULATIVE)) {
+               vmf->ptl = pte_lockptr(vmf->vma->vm_mm, vmf->pmd);
+               spin_lock(vmf->ptl);
+               return true;
+       }
+
+again:
+       local_irq_disable();
+       if (vma_has_changed(vmf)) {
+               trace_spf_vma_changed(_RET_IP_, vmf->vma, vmf->address);
+               goto out;
+       }
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+       /*
+        * We check if the pmd value is still the same to ensure that there
+        * is no huge page collapse operation in progress behind our back.
+        */
+       pmdval = READ_ONCE(*vmf->pmd);
+       if (!pmd_same(pmdval, vmf->orig_pmd)) {
+               trace_spf_pmd_changed(_RET_IP_, vmf->vma, vmf->address);
+               goto out;
+       }
+#endif
+
+       vmf->ptl = pte_lockptr(vmf->vma->vm_mm, vmf->pmd);
+       if (unlikely(!spin_trylock(vmf->ptl))) {
+               local_irq_enable();
+               goto again;
+       }
+
+       if (vma_has_changed(vmf)) {
+               spin_unlock(vmf->ptl);
+               trace_spf_vma_changed(_RET_IP_, vmf->vma, vmf->address);
+               goto out;
+       }
+
+       ret = true;
+out:
+       local_irq_enable();
+       return ret;
+}
+
+static bool pte_map_lock(struct vm_fault *vmf)
+{
+       bool ret = false;
+       pte_t *pte;
+       spinlock_t *ptl;
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+       pmd_t pmdval;
+#endif
+
+       if (!(vmf->flags & FAULT_FLAG_SPECULATIVE)) {
+               vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd,
+                                              vmf->address, &vmf->ptl);
+               return true;
+       }
+
+       /*
+        * The first vma_has_changed() guarantees the page-tables are still
+        * valid, having IRQs disabled ensures they stay around, hence the
+        * second vma_has_changed() to make sure they are still valid once
+        * we've got the lock. After that a concurrent zap_pte_range() will
+        * block on the PTL and thus we're safe.
+        */
+again:
+       local_irq_disable();
+       if (vma_has_changed(vmf)) {
+               trace_spf_vma_changed(_RET_IP_, vmf->vma, vmf->address);
+               goto out;
+       }
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+       /*
+        * We check if the pmd value is still the same to ensure that there
+        * is no huge page collapse operation in progress behind our back.
+        */
+       pmdval = READ_ONCE(*vmf->pmd);
+       if (!pmd_same(pmdval, vmf->orig_pmd)) {
+               trace_spf_pmd_changed(_RET_IP_, vmf->vma, vmf->address);
+               goto out;
+       }
+#endif
+
+       /*
+        * Same as pte_offset_map_lock() except that we call
+        * spin_trylock() in place of spin_lock() to avoid a race with the
+        * unmap path, which may hold the lock and wait for this CPU to
+        * invalidate the TLB while this CPU has IRQs disabled.
+        * Since we are in a speculative path, accept that it could fail.
+        */
+       ptl = pte_lockptr(vmf->vma->vm_mm, vmf->pmd);
+       pte = pte_offset_map(vmf->pmd, vmf->address);
+       if (unlikely(!spin_trylock(ptl))) {
+               pte_unmap(pte);
+               local_irq_enable();
+               goto again;
+       }
+
+       if (vma_has_changed(vmf)) {
+               pte_unmap_unlock(pte, ptl);
+               trace_spf_vma_changed(_RET_IP_, vmf->vma, vmf->address);
+               goto out;
+       }
+
+       vmf->pte = pte;
+       vmf->ptl = ptl;
+       ret = true;
+out:
+       local_irq_enable();
+       return ret;
+}
+#else
+static inline bool pte_spinlock(struct vm_fault *vmf2)
+{
+       vmf2->ptl = pte_lockptr(vmf2->vma->vm_mm, vmf2->pmd);
+       spin_lock(vmf2->ptl);
+       return true;
+}
+
+static inline bool pte_map_lock(struct vm_fault *vmf2)
+{
+       vmf2->pte = pte_offset_map_lock(vmf2->vma->vm_mm, vmf2->pmd,
+                                       vmf2->address, &vmf2->ptl);
+       return true;
+}
+#endif
+
 /*
  * handle_pte_fault chooses page fault handler according to an entry which was
  * read non-atomically.  Before making any commitment, on those architectures
@@ -1976,21 +2132,28 @@ EXPORT_SYMBOL_GPL(apply_to_page_range);
  * parts, do_swap_page must check under lock before unmapping the pte and
  * proceeding (but do_wp_page is only called after already making such a check;
  * and do_anonymous_page can safely check later on).
+ *
+ * pte_unmap_same() returns:
+ * 0                   if the PTEs are the same
+ * VM_FAULT_PTNOTSAME  if the PTEs are different
+ * VM_FAULT_RETRY      if the VMA has changed behind our back during
+ *                     speculative page fault handling.
  */
-static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd,
-                               pte_t *page_table, pte_t orig_pte)
+static inline int pte_unmap_same(struct vm_fault *vmf2)
 {
-       int same = 1;
+       int ret = 0;
 #if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT)
        if (sizeof(pte_t) > sizeof(unsigned long)) {
-               spinlock_t *ptl = pte_lockptr(mm, pmd);
-               spin_lock(ptl);
-               same = pte_same(*page_table, orig_pte);
-               spin_unlock(ptl);
+               if (pte_spinlock(vmf2)) {
+                       if (!pte_same(*vmf2->pte, vmf2->orig_pte))
+                               ret = VM_FAULT_PTNOTSAME;
+                       spin_unlock(vmf2->ptl);
+               } else
+                       ret = VM_FAULT_RETRY;
        }
 #endif
-       pte_unmap(page_table);
-       return same;
+       pte_unmap(vmf2->pte);
+       return ret;
 }
 
 static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va, struct vm_area_struct *vma)
@@ -2081,7 +2244,7 @@ static inline int wp_page_reuse(struct mm_struct *mm,
                        struct vm_area_struct *vma, unsigned long address,
                        pte_t *page_table, spinlock_t *ptl, pte_t orig_pte,
                        struct page *page, int page_mkwrite,
-                       int dirty_shared)
+                       int dirty_shared, struct vm_fault *vmf2)
        __releases(ptl)
 {
        pte_t entry;
@@ -2095,7 +2258,7 @@ static inline int wp_page_reuse(struct mm_struct *mm,
 
        flush_cache_page(vma, address, pte_pfn(orig_pte));
        entry = pte_mkyoung(orig_pte);
-       entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+       entry = maybe_mkwrite(pte_mkdirty(entry), vmf2->vma_flags);
        if (ptep_set_access_flags(vma, address, page_table, entry, 1))
                update_mmu_cache(vma, address, page_table);
        pte_unmap_unlock(page_table, ptl);
@@ -2146,7 +2309,7 @@ static inline int wp_page_reuse(struct mm_struct *mm,
  */
 static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma,
                        unsigned long address, pte_t *page_table, pmd_t *pmd,
-                       pte_t orig_pte, struct page *old_page)
+                       pte_t orig_pte, struct page *old_page, struct vm_fault *vmf2)
 {
        struct page *new_page = NULL;
        spinlock_t *ptl = NULL;
@@ -2155,23 +2318,24 @@ static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma,
        const unsigned long mmun_start = address & PAGE_MASK;   /* For mmu_notifiers */
        const unsigned long mmun_end = mmun_start + PAGE_SIZE;  /* For mmu_notifiers */
        struct mem_cgroup *memcg;
+       int ret = VM_FAULT_OOM;
 
        if (unlikely(anon_vma_prepare(vma)))
-               goto oom;
+               goto out;
 
        if (is_zero_pfn(pte_pfn(orig_pte))) {
                new_page = alloc_zeroed_user_highpage_movable(vma, address);
                if (!new_page)
-                       goto oom;
+                       goto out;
        } else {
                new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
                if (!new_page)
-                       goto oom;
+                       goto out;
                cow_user_page(new_page, old_page, address, vma);
        }
 
        if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg))
-               goto oom_free_new;
+               goto out_free_new;
 
        __SetPageUptodate(new_page);
 
@@ -2180,7 +2344,12 @@ static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma,
        /*
         * Re-check the pte - we dropped the lock
         */
-       page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
+       if (!pte_map_lock(vmf2)) {
+               ret = VM_FAULT_RETRY;
+               goto out_uncharge;
+       }
+       page_table = vmf2->pte;
+       ptl = vmf2->ptl;
        if (likely(pte_same(*page_table, orig_pte))) {
                if (old_page) {
                        if (!PageAnon(old_page)) {
@@ -2191,8 +2360,8 @@ static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma,
                        inc_mm_counter_fast(mm, MM_ANONPAGES);
                }
                flush_cache_page(vma, address, pte_pfn(orig_pte));
-               entry = mk_pte(new_page, vma->vm_page_prot);
-               entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+               entry = mk_pte(new_page, vmf2->vma_page_prot);
+               entry = maybe_mkwrite(pte_mkdirty(entry), vmf2->vma_flags);
                /*
                 * Clear the pte entry and flush it first, before updating the
                 * pte with the new entry. This will avoid a race condition
@@ -2200,9 +2369,9 @@ static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma,
                 * thread doing COW.
                 */
                ptep_clear_flush_notify(vma, address, page_table);
-               page_add_new_anon_rmap(new_page, vma, address);
+               __page_add_new_anon_rmap(new_page, vma, address);
                mem_cgroup_commit_charge(new_page, memcg, false);
-               lru_cache_add_active_or_unevictable(new_page, vma);
+               __lru_cache_add_active_or_unevictable(new_page, vmf2->vma_flags);
                /*
                 * We call the notify macro here because, when using secondary
                 * mmu page tables (such as kvm shadow page tables), we want the
@@ -2253,7 +2422,7 @@ static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma,
                 * Don't let another task, with possibly unlocked vma,
                 * keep the mlocked page.
                 */
-               if (page_copied && (vma->vm_flags & VM_LOCKED)) {
+               if (page_copied && (vmf2->vma_flags & VM_LOCKED)) {
                        lock_page(old_page);    /* LRU manipulation */
                        munlock_vma_page(old_page);
                        unlock_page(old_page);
@@ -2261,12 +2430,14 @@ static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma,
                page_cache_release(old_page);
        }
        return page_copied ? VM_FAULT_WRITE : 0;
-oom_free_new:
+out_uncharge:
+       mem_cgroup_cancel_charge(new_page, memcg);
+out_free_new:
        page_cache_release(new_page);
-oom:
+out:
        if (old_page)
                page_cache_release(old_page);
-       return VM_FAULT_OOM;
+       return ret;
 }
 
 /*
@@ -2276,7 +2447,7 @@ oom:
 static int wp_pfn_shared(struct mm_struct *mm,
                        struct vm_area_struct *vma, unsigned long address,
                        pte_t *page_table, spinlock_t *ptl, pte_t orig_pte,
-                       pmd_t *pmd)
+                       pmd_t *pmd, struct vm_fault *vmf2)
 {
        if (vma->vm_ops && vma->vm_ops->pfn_mkwrite) {
                struct vm_fault vmf = {
@@ -2291,7 +2462,10 @@ static int wp_pfn_shared(struct mm_struct *mm,
                ret = vma->vm_ops->pfn_mkwrite(vma, &vmf);
                if (ret & VM_FAULT_ERROR)
                        return ret;
-               page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
+               if (!pte_map_lock(vmf2))
+                       return VM_FAULT_RETRY;
+               page_table = vmf2->pte;
+               ptl = vmf2->ptl;
                /*
                 * We might have raced with another page fault while we
                 * released the pte_offset_map_lock.
@@ -2302,13 +2476,13 @@ static int wp_pfn_shared(struct mm_struct *mm,
                }
        }
        return wp_page_reuse(mm, vma, address, page_table, ptl, orig_pte,
-                            NULL, 0, 0);
+                            NULL, 0, 0, vmf2);
 }
 
 static int wp_page_shared(struct mm_struct *mm, struct vm_area_struct *vma,
                          unsigned long address, pte_t *page_table,
                          pmd_t *pmd, spinlock_t *ptl, pte_t orig_pte,
-                         struct page *old_page)
+                         struct page *old_page, struct vm_fault *vmf2)
        __releases(ptl)
 {
        int page_mkwrite = 0;
@@ -2336,8 +2510,11 @@ static int wp_page_shared(struct mm_struct *mm, struct vm_area_struct *vma,
                 * they did, we just return, as we can count on the
                 * MMU to tell us if they didn't also make it writable.
                 */
-               page_table = pte_offset_map_lock(mm, pmd, address,
-                                                &ptl);
+               if (!pte_map_lock(vmf2))
+                       return VM_FAULT_RETRY;
+               page_table = vmf2->pte;
+               ptl = vmf2->ptl;
+
                if (!pte_same(*page_table, orig_pte)) {
                        unlock_page(old_page);
                        pte_unmap_unlock(page_table, ptl);
@@ -2348,7 +2525,7 @@ static int wp_page_shared(struct mm_struct *mm, struct vm_area_struct *vma,
        }
 
        return wp_page_reuse(mm, vma, address, page_table, ptl,
-                            orig_pte, old_page, page_mkwrite, 1);
+                            orig_pte, old_page, page_mkwrite, 1, vmf2);
 }
 
 /*
@@ -2371,12 +2548,12 @@ static int wp_page_shared(struct mm_struct *mm, struct vm_area_struct *vma,
  */
 static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
                unsigned long address, pte_t *page_table, pmd_t *pmd,
-               spinlock_t *ptl, pte_t orig_pte)
+               spinlock_t *ptl, pte_t orig_pte, struct vm_fault *vmf2)
        __releases(ptl)
 {
        struct page *old_page;
 
-       old_page = vm_normal_page(vma, address, orig_pte);
+       old_page = __vm_normal_page(vma, address, orig_pte, vmf2->vma_flags);
        if (!old_page) {
                /*
                 * VM_MIXEDMAP !pfn_valid() case, or VM_SOFTDIRTY clear on a
@@ -2385,14 +2562,14 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
                 * We should not cow pages in a shared writeable mapping.
                 * Just mark the pages writable and/or call ops->pfn_mkwrite.
                 */
-               if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
+               if ((vmf2->vma_flags & (VM_WRITE|VM_SHARED)) ==
                                     (VM_WRITE|VM_SHARED))
                        return wp_pfn_shared(mm, vma, address, page_table, ptl,
-                                            orig_pte, pmd);
+                                            orig_pte, pmd, vmf2);
 
                pte_unmap_unlock(page_table, ptl);
                return wp_page_copy(mm, vma, address, page_table, pmd,
-                                   orig_pte, old_page);
+                                   orig_pte, old_page, vmf2);
        }
 
        /*
@@ -2404,8 +2581,13 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
                        page_cache_get(old_page);
                        pte_unmap_unlock(page_table, ptl);
                        lock_page(old_page);
-                       page_table = pte_offset_map_lock(mm, pmd, address,
-                                                        &ptl);
+                       if (!pte_map_lock(vmf2)) {
+                               unlock_page(old_page);
+                               put_page(old_page);
+                               return VM_FAULT_RETRY;
+                       }
+                       page_table = vmf2->pte;
+                       ptl = vmf2->ptl;
                        if (!pte_same(*page_table, orig_pte)) {
                                unlock_page(old_page);
                                pte_unmap_unlock(page_table, ptl);
@@ -2423,13 +2605,13 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
                        page_move_anon_rmap(old_page, vma, address);
                        unlock_page(old_page);
                        return wp_page_reuse(mm, vma, address, page_table, ptl,
-                                            orig_pte, old_page, 0, 0);
+                                            orig_pte, old_page, 0, 0, vmf2);
                }
                unlock_page(old_page);
-       } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
+       } else if (unlikely((vmf2->vma_flags & (VM_WRITE|VM_SHARED)) ==
                                        (VM_WRITE|VM_SHARED))) {
                return wp_page_shared(mm, vma, address, page_table, pmd,
-                                     ptl, orig_pte, old_page);
+                                     ptl, orig_pte, old_page, vmf2);
        }
 
        /*
@@ -2439,7 +2621,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 
        pte_unmap_unlock(page_table, ptl);
        return wp_page_copy(mm, vma, address, page_table, pmd,
-                           orig_pte, old_page);
+                           orig_pte, old_page, vmf2);
 }
 
 static void unmap_mapping_range_vma(struct vm_area_struct *vma,
@@ -2532,7 +2714,7 @@ EXPORT_SYMBOL(unmap_mapping_range);
  */
 static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
                unsigned long address, pte_t *page_table, pmd_t *pmd,
-               unsigned int flags, pte_t orig_pte)
+               unsigned int flags, pte_t orig_pte, struct vm_fault *vmf2)
 {
        spinlock_t *ptl;
        struct page *page, *swapcache;
@@ -2541,10 +2723,18 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
        pte_t pte;
        int locked;
        int exclusive = 0;
-       int ret = 0;
+       int ret;
 
-       if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
+       ret = pte_unmap_same(vmf2);
+       if (ret) {
+               /*
+                * If the PTEs are different, the page has already been
+                * handled by another CPU, so we return 0.
+                */
+               if (ret == VM_FAULT_PTNOTSAME)
+                       ret = 0;
                goto out;
+       }
 
        entry = pte_to_swp_entry(orig_pte);
        if (unlikely(non_swap_entry(entry))) {
@@ -2565,10 +2755,17 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
                                        GFP_HIGHUSER_MOVABLE, vma, address);
                if (!page) {
                        /*
-                        * Back out if somebody else faulted in this pte
-                        * while we released the pte lock.
-                        */
-                       page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
+                        * Back out if the VMA has changed behind our back
+                        * during a speculative page fault or if somebody else
+                        * faulted in this pte while we released the pte lock.
+                        */
+                       if (!pte_map_lock(vmf2)) {
+                               delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
+                               ret = VM_FAULT_RETRY;
+                               goto out;
+                       }
+                       page_table = vmf2->pte;
+                       ptl = vmf2->ptl;
                        if (likely(pte_same(*page_table, orig_pte)))
                                ret = VM_FAULT_OOM;
                        delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
@@ -2621,9 +2818,15 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
        }
 
        /*
-        * Back out if somebody else already faulted in this pte.
+        * Back out if the VMA has changed behind our back during a speculative
+        * page fault or if somebody else already faulted in this pte.
         */
-       page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
+       if (!pte_map_lock(vmf2)) {
+               ret = VM_FAULT_RETRY;
+               goto out_cancel_cgroup;
+       }
+       page_table = vmf2->pte;
+       ptl = vmf2->ptl;
        if (unlikely(!pte_same(*page_table, orig_pte)))
                goto out_nomap;
 
@@ -2644,9 +2847,9 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
 
        inc_mm_counter_fast(mm, MM_ANONPAGES);
        dec_mm_counter_fast(mm, MM_SWAPENTS);
-       pte = mk_pte(page, vma->vm_page_prot);
+       pte = mk_pte(page, vmf2->vma_page_prot);
        if ((flags & FAULT_FLAG_WRITE) && reuse_swap_page(page)) {
-               pte = maybe_mkwrite(pte_mkdirty(pte), vma);
+               pte = maybe_mkwrite(pte_mkdirty(pte), vmf2->vma_flags);
                flags &= ~FAULT_FLAG_WRITE;
                ret |= VM_FAULT_WRITE;
                exclusive = 1;
@@ -2659,14 +2862,14 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
                do_page_add_anon_rmap(page, vma, address, exclusive);
                mem_cgroup_commit_charge(page, memcg, true);
        } else { /* ksm created a completely new copy */
-               page_add_new_anon_rmap(page, vma, address);
+               __page_add_new_anon_rmap(page, vma, address);
                mem_cgroup_commit_charge(page, memcg, false);
-               lru_cache_add_active_or_unevictable(page, vma);
+               __lru_cache_add_active_or_unevictable(page, vmf2->vma_flags);
        }
 
        swap_free(entry);
        if ((PageSwapCache(page) && vm_swap_full(page_swap_info(page))) ||
-               (vma->vm_flags & VM_LOCKED) || PageMlocked(page))
+               (vmf2->vma_flags & VM_LOCKED) || PageMlocked(page))
                try_to_free_swap(page);
        unlock_page(page);
        if (page != swapcache) {
@@ -2683,7 +2886,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
        }
 
        if (flags & FAULT_FLAG_WRITE) {
-               ret |= do_wp_page(mm, vma, address, page_table, pmd, ptl, pte);
+               ret |= do_wp_page(mm, vma, address, page_table, pmd, ptl, pte, vmf2);
                if (ret & VM_FAULT_ERROR)
                        ret &= VM_FAULT_ERROR;
                goto out;
@@ -2696,8 +2899,9 @@ unlock:
 out:
        return ret;
 out_nomap:
-       mem_cgroup_cancel_charge(page, memcg);
        pte_unmap_unlock(page_table, ptl);
+out_cancel_cgroup:
+       mem_cgroup_cancel_charge(page, memcg);
 out_page:
        unlock_page(page);
 out_release:
@@ -2716,26 +2920,40 @@ out_release:
  */
 static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
                unsigned long address, pte_t *page_table, pmd_t *pmd,
-               unsigned int flags)
+               unsigned int flags, struct vm_fault *vmf2)
 {
        struct mem_cgroup *memcg;
        struct page *page;
        spinlock_t *ptl;
        pte_t entry;
+       int ret = 0;
 
        pte_unmap(page_table);
 
        /* File mapping without ->vm_ops ? */
-       if (vma->vm_flags & VM_SHARED)
+       if (vmf2->vma_flags & VM_SHARED)
                return VM_FAULT_SIGBUS;
 
        /* Use the zero-page for reads */
        if (!(flags & FAULT_FLAG_WRITE) && !mm_forbids_zeropage(mm)) {
                entry = pte_mkspecial(pfn_pte(my_zero_pfn(address),
-                                               vma->vm_page_prot));
-               page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
+                                               vmf2->vma_page_prot));
+               if (!pte_map_lock(vmf2))
+                       return VM_FAULT_RETRY;
+               page_table = vmf2->pte;
+               ptl = vmf2->ptl;
                if (!pte_none(*page_table))
                        goto unlock;
+
+               /*
+                * Don't call into userfaultfd during the speculative path.
+                * We already checked that the VMA is not managed through
+                * userfaultfd, but it may have been set up behind our back
+                * once we hold the pte lock. In such a case we can ignore it
+                * this time.
+                */
+               if (vmf2->flags & FAULT_FLAG_SPECULATIVE)
+                       goto setpte;
+
                /* Deliver the page fault to userland, check inside PT lock */
                if (userfaultfd_missing(vma)) {
                        pte_unmap_unlock(page_table, ptl);
@@ -2762,16 +2980,22 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
         */
        __SetPageUptodate(page);
 
-       entry = mk_pte(page, vma->vm_page_prot);
-       if (vma->vm_flags & VM_WRITE)
+       entry = mk_pte(page, vmf2->vma_page_prot);
+       if (vmf2->vma_flags & VM_WRITE)
                entry = pte_mkwrite(pte_mkdirty(entry));
 
-       page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
-       if (!pte_none(*page_table))
+       if (!pte_map_lock(vmf2)) {
+               ret = VM_FAULT_RETRY;
                goto release;
+       }
+       page_table = vmf2->pte;
+       ptl = vmf2->ptl;
+
+       if (!pte_none(*page_table))
+               goto unlock_and_release;
 
        /* Deliver the page fault to userland, check inside PT lock */
-       if (userfaultfd_missing(vma)) {
+       if (!(vmf2->flags & FAULT_FLAG_SPECULATIVE) && userfaultfd_missing(vma)) {
                pte_unmap_unlock(page_table, ptl);
                mem_cgroup_cancel_charge(page, memcg);
                page_cache_release(page);
@@ -2780,9 +3004,9 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
        }
 
        inc_mm_counter_fast(mm, MM_ANONPAGES);
-       page_add_new_anon_rmap(page, vma, address);
+       __page_add_new_anon_rmap(page, vma, address);
        mem_cgroup_commit_charge(page, memcg, false);
-       lru_cache_add_active_or_unevictable(page, vma);
+       __lru_cache_add_active_or_unevictable(page, vmf2->vma_flags);
 setpte:
        set_pte_at(mm, address, page_table, entry);
 
@@ -2790,11 +3014,13 @@ setpte:
        update_mmu_cache(vma, address, page_table);
 unlock:
        pte_unmap_unlock(page_table, ptl);
-       return 0;
+       return ret;
+unlock_and_release:
+       pte_unmap_unlock(page_table, ptl);
 release:
        mem_cgroup_cancel_charge(page, memcg);
        page_cache_release(page);
-       goto unlock;
+       return ret;
 oom_free_page:
        page_cache_release(page);
 oom:
@@ -2859,17 +3085,17 @@ static int __do_fault(struct vm_area_struct *vma, unsigned long address,
  * vm_ops->map_pages.
  */
 void do_set_pte(struct vm_area_struct *vma, unsigned long address,
-               struct page *page, pte_t *pte, bool write, bool anon)
+               struct page *page, pte_t *pte, bool write, bool anon,  struct vm_fault *vmf2)
 {
        pte_t entry;
 
        flush_icache_page(vma, page);
-       entry = mk_pte(page, vma->vm_page_prot);
+       entry = mk_pte(page, vmf2->vma_page_prot);
        if (write)
-               entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+               entry = maybe_mkwrite(pte_mkdirty(entry), vmf2->vma_flags);
        if (anon) {
                inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
-               page_add_new_anon_rmap(page, vma, address);
+               __page_add_new_anon_rmap(page, vma, address);
        } else {
                inc_mm_counter_fast(vma->vm_mm, MM_FILEPAGES);
                page_add_file_rmap(page);
@@ -2990,7 +3216,7 @@ static void do_fault_around(struct vm_area_struct *vma, unsigned long address,
 
 static int do_read_fault(struct mm_struct *mm, struct vm_area_struct *vma,
                unsigned long address, pmd_t *pmd,
-               pgoff_t pgoff, unsigned int flags, pte_t orig_pte)
+               pgoff_t pgoff, unsigned int flags, pte_t orig_pte, struct vm_fault *vmf2)
 {
        struct page *fault_page;
        spinlock_t *ptl;
@@ -3003,7 +3229,10 @@ static int do_read_fault(struct mm_struct *mm, struct vm_area_struct *vma,
         * something).
         */
        if (vma->vm_ops->map_pages && fault_around_bytes >> PAGE_SHIFT > 1) {
-               pte = pte_offset_map_lock(mm, pmd, address, &ptl);
+               if (!pte_map_lock(vmf2))
+                       return VM_FAULT_RETRY;
+               pte = vmf2->pte;
+               ptl = vmf2->ptl;
                do_fault_around(vma, address, pte, pgoff, flags);
                if (!pte_same(*pte, orig_pte))
                        goto unlock_out;
@@ -3014,14 +3243,17 @@ static int do_read_fault(struct mm_struct *mm, struct vm_area_struct *vma,
        if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
                return ret;
 
-       pte = pte_offset_map_lock(mm, pmd, address, &ptl);
+       if (!pte_map_lock(vmf2))
+               return VM_FAULT_RETRY;
+       pte = vmf2->pte;
+       ptl = vmf2->ptl;
        if (unlikely(!pte_same(*pte, orig_pte))) {
                pte_unmap_unlock(pte, ptl);
                unlock_page(fault_page);
                page_cache_release(fault_page);
                return ret;
        }
-       do_set_pte(vma, address, fault_page, pte, false, false);
+       do_set_pte(vma, address, fault_page, pte, false, false, vmf2);
        unlock_page(fault_page);
 unlock_out:
        pte_unmap_unlock(pte, ptl);
@@ -3030,7 +3262,7 @@ unlock_out:
 
 static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma,
                unsigned long address, pmd_t *pmd,
-               pgoff_t pgoff, unsigned int flags, pte_t orig_pte)
+               pgoff_t pgoff, unsigned int flags, pte_t orig_pte, struct vm_fault *vmf2)
 {
        struct page *fault_page, *new_page;
        struct mem_cgroup *memcg;
@@ -3058,7 +3290,10 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma,
                copy_user_highpage(new_page, fault_page, address, vma);
        __SetPageUptodate(new_page);
 
-       pte = pte_offset_map_lock(mm, pmd, address, &ptl);
+       if (!pte_map_lock(vmf2))
+               return VM_FAULT_RETRY;
+       pte = vmf2->pte;
+       ptl = vmf2->ptl;
        if (unlikely(!pte_same(*pte, orig_pte))) {
                pte_unmap_unlock(pte, ptl);
                if (fault_page) {
@@ -3073,9 +3308,9 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma,
                }
                goto uncharge_out;
        }
-       do_set_pte(vma, address, new_page, pte, true, true);
+       do_set_pte(vma, address, new_page, pte, true, true, vmf2);
        mem_cgroup_commit_charge(new_page, memcg, false);
-       lru_cache_add_active_or_unevictable(new_page, vma);
+       __lru_cache_add_active_or_unevictable(new_page, vmf2->vma_flags);
        pte_unmap_unlock(pte, ptl);
        if (fault_page) {
                unlock_page(fault_page);
@@ -3096,7 +3331,7 @@ uncharge_out:
 
 static int do_shared_fault(struct mm_struct *mm, struct vm_area_struct *vma,
                unsigned long address, pmd_t *pmd,
-               pgoff_t pgoff, unsigned int flags, pte_t orig_pte)
+               pgoff_t pgoff, unsigned int flags, pte_t orig_pte, struct vm_fault *vmf2)
 {
        struct page *fault_page;
        struct address_space *mapping;
@@ -3123,14 +3358,17 @@ static int do_shared_fault(struct mm_struct *mm, struct vm_area_struct *vma,
                }
        }
 
-       pte = pte_offset_map_lock(mm, pmd, address, &ptl);
+       if (!pte_map_lock(vmf2))
+               return VM_FAULT_RETRY;
+       pte = vmf2->pte;
+       ptl = vmf2->ptl;
        if (unlikely(!pte_same(*pte, orig_pte))) {
                pte_unmap_unlock(pte, ptl);
                unlock_page(fault_page);
                page_cache_release(fault_page);
                return ret;
        }
-       do_set_pte(vma, address, fault_page, pte, true, false);
+       do_set_pte(vma, address, fault_page, pte, true, false, vmf2);
        pte_unmap_unlock(pte, ptl);
 
        if (set_page_dirty(fault_page))
@@ -3165,7 +3403,7 @@ static int do_shared_fault(struct mm_struct *mm, struct vm_area_struct *vma,
  */
 static int do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
                unsigned long address, pte_t *page_table, pmd_t *pmd,
-               unsigned int flags, pte_t orig_pte)
+               unsigned int flags, pte_t orig_pte, struct vm_fault *vmf2)
 {
        pgoff_t pgoff = (((address & PAGE_MASK)
                        - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
@@ -3176,11 +3414,11 @@ static int do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
                return VM_FAULT_SIGBUS;
        if (!(flags & FAULT_FLAG_WRITE))
                return do_read_fault(mm, vma, address, pmd, pgoff, flags,
-                               orig_pte);
-       if (!(vma->vm_flags & VM_SHARED))
+                               orig_pte, vmf2);
+       if (!(vmf2->vma_flags & VM_SHARED))
                return do_cow_fault(mm, vma, address, pmd, pgoff, flags,
-                               orig_pte);
-       return do_shared_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
+                               orig_pte, vmf2);
+       return do_shared_fault(mm, vma, address, pmd, pgoff, flags, orig_pte, vmf2);
 }
 
 static int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
@@ -3199,7 +3437,7 @@ static int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
 }
 
 static int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
-                  unsigned long addr, pte_t pte, pte_t *ptep, pmd_t *pmd)
+                  unsigned long addr, pte_t pte, pte_t *ptep, pmd_t *pmd, struct vm_fault *vmf2)
 {
        struct page *page = NULL;
        spinlock_t *ptl;
@@ -3222,8 +3460,9 @@ static int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
        * page table entry is not accessible, so there would be no
        * concurrent hardware modifications to the PTE.
        */
-       ptl = pte_lockptr(mm, pmd);
-       spin_lock(ptl);
+       if (!pte_spinlock(vmf2))
+               return VM_FAULT_RETRY;
+       ptl = vmf2->ptl;
        if (unlikely(!pte_same(*ptep, pte))) {
                pte_unmap_unlock(ptep, ptl);
                goto out;
@@ -3237,7 +3476,7 @@ static int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
        set_pte_at(mm, addr, ptep, pte);
        update_mmu_cache(vma, addr, ptep);
 
-       page = vm_normal_page(vma, addr, pte);
+       page = __vm_normal_page(vma, addr, pte, vmf2->vma_flags);
        if (!page) {
                pte_unmap_unlock(ptep, ptl);
                return 0;
@@ -3323,7 +3562,8 @@ static int wp_huge_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
  */
 static int handle_pte_fault(struct mm_struct *mm,
                     struct vm_area_struct *vma, unsigned long address,
-                    pte_t *pte, pmd_t *pmd, unsigned int flags)
+                    pte_t *pte, pmd_t *pmd, unsigned int flags,
+                    struct vm_fault *vmf2)
 {
        pte_t entry;
        spinlock_t *ptl;
@@ -3342,26 +3582,30 @@ static int handle_pte_fault(struct mm_struct *mm,
                if (pte_none(entry)) {
                        if (vma_is_anonymous(vma))
                                return do_anonymous_page(mm, vma, address,
-                                                        pte, pmd, flags);
+                                                        pte, pmd, flags, vmf2);
+                       else if (vmf2->flags & FAULT_FLAG_SPECULATIVE)
+                               return VM_FAULT_RETRY;
                        else
                                return do_fault(mm, vma, address, pte, pmd,
-                                               flags, entry);
+                                               flags, entry, vmf2);
                }
                return do_swap_page(mm, vma, address,
-                                       pte, pmd, flags, entry);
+                                       pte, pmd, flags, entry, vmf2);
        }
 
        if (pte_protnone(entry))
-               return do_numa_page(mm, vma, address, entry, pte, pmd);
+               return do_numa_page(mm, vma, address, entry, pte, pmd, vmf2);
+
+       if (!pte_spinlock(vmf2))
+               return VM_FAULT_RETRY;
+       ptl = vmf2->ptl;
 
-       ptl = pte_lockptr(mm, pmd);
-       spin_lock(ptl);
        if (unlikely(!pte_same(*pte, entry)))
                goto unlock;
        if (flags & FAULT_FLAG_WRITE) {
                if (!pte_write(entry))
                        return do_wp_page(mm, vma, address,
-                                       pte, pmd, ptl, entry);
+                                       pte, pmd, ptl, entry, vmf2);
                entry = pte_mkdirty(entry);
        }
        entry = pte_mkyoung(entry);
@@ -3396,6 +3640,14 @@ static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
        pmd_t *pmd;
        pte_t *pte;
 
+       struct vm_fault vmf2 = {
+               .vma = vma,
+               .address = address & PAGE_MASK,
+               .flags = flags,
+               .vma_flags = vma->vm_flags,
+               .vma_page_prot = vma->vm_page_prot,
+       };
+
        if (unlikely(is_vm_hugetlb_page(vma)))
                return hugetlb_fault(mm, vma, address, flags);
 
@@ -3403,9 +3655,14 @@ static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
        pud = pud_alloc(mm, pgd, address);
        if (!pud)
                return VM_FAULT_OOM;
-       pmd = pmd_alloc(mm, pud, address);
+       vmf2.pmd = pmd = pmd_alloc(mm, pud, address);
        if (!pmd)
                return VM_FAULT_OOM;
+
+#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
+       vmf2.sequence = raw_read_seqcount(&vma->vm_sequence);
+#endif
+
        if (pmd_none(*pmd) && transparent_hugepage_enabled(vma)) {
                int ret = create_huge_pmd(mm, vma, address, pmd, flags);
                if (!(ret & VM_FAULT_FALLBACK))
@@ -3470,10 +3727,232 @@ static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
         * read mode and khugepaged takes it in write mode. So now it's
         * safe to run pte_offset_map().
         */
-       pte = pte_offset_map(pmd, address);
+       vmf2.pte = pte = pte_offset_map(pmd, address);
+       vmf2.orig_pte = *vmf2.pte;
+
+       return handle_pte_fault(mm, vma, address, pte, pmd, flags, &vmf2);
+}
+
+#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
+/*
+ * Tries to handle the page fault in a speculative way, without grabbing the
+ * mmap_sem.
+ */
+int __handle_speculative_fault(struct mm_struct *mm, unsigned long address,
+                              unsigned int flags)
+{
+       struct vm_fault vmf = {
+               .address = address,
+       };
+       pgd_t *pgd, pgdval;
+/*     p4d_t *p4d, p4dval; */
+       pud_t pudval;
+       int seq, ret = VM_FAULT_RETRY;
+       struct vm_area_struct *vma;
+#ifdef CONFIG_NUMA
+       struct mempolicy *pol;
+#endif
+
+       /* Clear flags that may lead to releasing the mmap_sem to retry */
+       flags &= ~(FAULT_FLAG_ALLOW_RETRY|FAULT_FLAG_KILLABLE);
+       flags |= FAULT_FLAG_SPECULATIVE;
+
+       vma = get_vma(mm, address);
+       if (!vma)
+               return ret;
+
+       seq = raw_read_seqcount(&vma->vm_sequence); /* rmb <-> seqlock, vma_rb_erase() */
+       if (seq & 1) {
+               trace_spf_vma_changed(_RET_IP_, vma, address);
+               goto out_put;
+       }
+
+       /*
+        * Can't call the vm_ops services as we don't know what they
+        * would do with the VMA.
+        * This includes huge pages from hugetlbfs.
+        */
+       if (vma->vm_ops) {
+               trace_spf_vma_notsup(_RET_IP_, vma, address);
+               goto out_put;
+       }
+
+       /*
+        * __anon_vma_prepare() requires the mmap_sem to be held
+        * because vm_next and vm_prev must be safe. This can't be guaranteed
+        * in the speculative path.
+        */
+       if (unlikely(!vma->anon_vma)) {
+               trace_spf_vma_notsup(_RET_IP_, vma, address);
+               goto out_put;
+       }
+
+       vmf.vma_flags = READ_ONCE(vma->vm_flags);
+       vmf.vma_page_prot = READ_ONCE(vma->vm_page_prot);
+
+       /* Can't call userland page fault handler in the speculative path */
+       if (unlikely(vmf.vma_flags & VM_UFFD_MISSING)) {
+               trace_spf_vma_notsup(_RET_IP_, vma, address);
+               goto out_put;
+       }
+
+       if (vmf.vma_flags & VM_GROWSDOWN || vmf.vma_flags & VM_GROWSUP) {
+               /*
+                * This could be detected by checking the address against the
+                * VMA's boundaries, but we want to trace it as not supported
+                * instead of changed.
+                */
+               trace_spf_vma_notsup(_RET_IP_, vma, address);
+               goto out_put;
+       }
+
+       if (address < READ_ONCE(vma->vm_start)
+           || READ_ONCE(vma->vm_end) <= address) {
+               trace_spf_vma_changed(_RET_IP_, vma, address);
+               goto out_put;
+       }
+/*
+       if (!arch_vma_access_permitted(vma, flags & FAULT_FLAG_WRITE,
+                                      flags & FAULT_FLAG_INSTRUCTION,
+                                      flags & FAULT_FLAG_REMOTE)) {
+               trace_spf_vma_access(_RET_IP_, vma, address);
+               ret = VM_FAULT_SIGSEGV;
+               goto out_put;
+       }
+*/
+       /* This check is required to ensure the VMA has write access set */
+       if (flags & FAULT_FLAG_WRITE) {
+               if (unlikely(!(vmf.vma_flags & VM_WRITE))) {
+                       trace_spf_vma_access(_RET_IP_, vma, address);
+                       ret = VM_FAULT_SIGSEGV;
+                       goto out_put;
+               }
+       } else if (unlikely(!(vmf.vma_flags & (VM_READ|VM_EXEC|VM_WRITE)))) {
+               trace_spf_vma_access(_RET_IP_, vma, address);
+               ret = VM_FAULT_SIGSEGV;
+               goto out_put;
+       }
+
+#ifdef CONFIG_NUMA
+       /*
+        * MPOL_INTERLEAVE implies additional checks in
+        * mpol_misplaced() which are not compatible with the
+        * speculative page fault processing.
+        */
+       pol = __get_vma_policy(vma, address);
+       if (!pol)
+               pol = get_task_policy(current);
+       if (pol && pol->mode == MPOL_INTERLEAVE) {
+               trace_spf_vma_notsup(_RET_IP_, vma, address);
+               goto out_put;
+       }
+#endif
 
-       return handle_pte_fault(mm, vma, address, pte, pmd, flags);
+       /*
+        * Do a speculative lookup of the PTE entry.
+        */
+       local_irq_disable();
+       pgd = pgd_offset(mm, address);
+       pgdval = READ_ONCE(*pgd);
+       if (pgd_none(pgdval) || unlikely(pgd_bad(pgdval)))
+               goto out_walk;
+/*
+       p4d = p4d_offset(pgd, address);
+       p4dval = READ_ONCE(*p4d);
+       if (p4d_none(p4dval) || unlikely(p4d_bad(p4dval)))
+               goto out_walk;
+*/
+       vmf.pud = pud_offset(pgd, address);
+       pudval = READ_ONCE(*vmf.pud);
+       if (pud_none(pudval) || unlikely(pud_bad(pudval)))
+               goto out_walk;
+
+       /* Huge pages at PUD level are not supported. */
+/*
+       if (unlikely(pud_trans_huge(pudval)))
+               goto out_walk;
+*/
+       vmf.pmd = pmd_offset(vmf.pud, address);
+       vmf.orig_pmd = READ_ONCE(*vmf.pmd);
+       /*
+        * pmd_none could mean that a hugepage collapse is in progress
+        * behind our back, as collapse_huge_page() marks it before
+        * invalidating the pte (which is done once the IPI is caught
+        * by all CPUs and we have interrupts disabled).
+        * For this reason we cannot handle THP in a speculative way since
+        * we can't safely identify an in-progress collapse operation done
+        * behind our back on that PMD.
+        * Regarding the order of the following checks, see comment in
+        * pmd_devmap_trans_unstable()
+        */
+       if (/* unlikely(pmd_devmap(vmf.orig_pmd) || */
+                    pmd_none(vmf.orig_pmd) || pmd_trans_huge(vmf.orig_pmd)  /* ||
+                    is_swap_pmd(vmf.orig_pmd))*/)
+               goto out_walk;
+
+       /*
+        * The above does not allocate/instantiate page-tables because doing so
+        * would lead to the possibility of instantiating page-tables after
+        * free_pgtables() -- and consequently leaking them.
+        *
+        * The result is that we take at least one !speculative fault per PMD
+        * in order to instantiate it.
+        */
+
+       vmf.pte = pte_offset_map(vmf.pmd, address);
+       vmf.orig_pte = READ_ONCE(*vmf.pte);
+       barrier(); /* See comment in handle_pte_fault() */
+       /*
+       if (pte_none(vmf.orig_pte)) {
+               pte_unmap(vmf.pte);
+               vmf.pte = NULL;
+       }
+       */
+       vmf.vma = vma;
+       vmf.pgoff = linear_page_index(vma, address);
+/*     vmf.gfp_mask = __get_fault_gfp_mask(vma); */
+       vmf.sequence = seq;
+       vmf.flags = flags;
+
+       local_irq_enable();
+
+       /*
+        * We need to re-validate the VMA after checking the bounds, otherwise
+        * we might have a false positive on the bounds.
+        */
+       if (read_seqcount_retry(&vma->vm_sequence, seq)) {
+               trace_spf_vma_changed(_RET_IP_, vma, address);
+               goto out_put;
+       }
+
+       mem_cgroup_oom_enable();
+       ret = handle_pte_fault(mm, vmf.vma, vmf.address, vmf.pte,
+                               vmf.pmd, vmf.flags, &vmf);
+       mem_cgroup_oom_disable();
+
+       put_vma(vma);
+
+       if (ret != VM_FAULT_RETRY)
+               count_vm_event(SPECULATIVE_PGFAULT);
+
+       /*
+        * The task may have entered a memcg OOM situation but
+        * if the allocation error was handled gracefully (no
+        * VM_FAULT_OOM), there is no need to kill anything.
+        * Just clean up the OOM state peacefully.
+        */
+       if (task_in_memcg_oom(current) && !(ret & VM_FAULT_OOM))
+               mem_cgroup_oom_synchronize(false);
+       return ret;
+
+out_walk:
+       trace_spf_vma_notsup(_RET_IP_, vma, address);
+       local_irq_enable();
+out_put:
+       put_vma(vma);
+       return ret;
 }
+#endif /* CONFIG_SPECULATIVE_PAGE_FAULT */
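The speculative paths above lean on pte_spinlock() and pte_map_lock(), which this commit adds earlier in mm/memory.c, outside this excerpt. A minimal sketch of the idea, assuming the vm_fault fields (vma, pmd, address, flags, sequence) initialised by the callers shown here: take the PTE lock without blocking, re-check the VMA's vm_sequence before trusting the mapping, and let the caller return VM_FAULT_RETRY otherwise. This is an illustration of the pattern, not the patch's literal helper.

    /*
     * Illustrative sketch only -- not the literal helper from this commit.
     * Returns true with vmf->pte mapped and vmf->ptl held; returns false so
     * the caller falls back to the classic, mmap_sem-protected fault path.
     */
    static bool pte_map_lock_sketch(struct vm_fault *vmf)
    {
            if (!(vmf->flags & FAULT_FLAG_SPECULATIVE)) {
                    /* Classic path: behaves like pte_offset_map_lock(). */
                    vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd,
                                                   vmf->address, &vmf->ptl);
                    return true;
            }

            /* Speculative path: never sleep, and re-validate the VMA. */
            vmf->ptl = pte_lockptr(vmf->vma->vm_mm, vmf->pmd);
            if (!spin_trylock(vmf->ptl))
                    return false;
            if (read_seqcount_retry(&vmf->vma->vm_sequence, vmf->sequence)) {
                    spin_unlock(vmf->ptl);
                    return false;
            }
            vmf->pte = pte_offset_map(vmf->pmd, vmf->address);
            return true;
    }

pte_spinlock() follows the same shape minus the pte_offset_map() step; every call site above treats a false return as "retry with the mmap_sem held".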
 
 /*
  * By the time we get here, we already hold the mm semaphore
index d8d8c2a..118b074 100644 (file)
@@ -241,7 +241,7 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma,
 
        /* Recheck VMA as permissions can change since migration started  */
        if (is_write_migration_entry(entry))
-               pte = maybe_mkwrite(pte, vma);
+               pte = maybe_mkwrite(pte, vma->vm_flags);
 
 #ifdef CONFIG_HUGETLB_PAGE
        if (PageHuge(new)) {
index 966dbdc..68ddb11 100644 (file)
@@ -423,7 +423,9 @@ static unsigned long __munlock_pagevec_fill(struct pagevec *pvec,
 void munlock_vma_pages_range(struct vm_area_struct *vma,
                             unsigned long start, unsigned long end)
 {
-       vma->vm_flags &= VM_LOCKED_CLEAR_MASK;
+       vm_write_begin(vma);
+       WRITE_ONCE(vma->vm_flags, vma->vm_flags & VM_LOCKED_CLEAR_MASK);
+       vm_write_end(vma);
 
        while (start < end) {
                struct page *page = NULL;
@@ -549,9 +551,11 @@ success:
         * set VM_LOCKED, populate_vma_page_range will bring it back.
         */
 
-       if (lock)
-               vma->vm_flags = newflags;
-       else
+       if (lock) {
+               vm_write_begin(vma);
+               WRITE_ONCE(vma->vm_flags, newflags);
+               vm_write_end(vma);
+       } else
                munlock_vma_pages_range(vma, start, end);
 
 out:
index 3bb666c..37871a7 100644 (file)
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -288,6 +288,27 @@ void unlink_file_vma(struct vm_area_struct *vma)
        }
 }
 
+static void __free_vma(struct vm_area_struct *vma)
+{
+       if (vma->vm_file)
+               fput(vma->vm_file);
+       mpol_put(vma_policy(vma));
+       kmem_cache_free(vm_area_cachep, vma);
+}
+
+#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
+void put_vma(struct vm_area_struct *vma)
+{
+       if (atomic_dec_and_test(&vma->vm_ref_count))
+               __free_vma(vma);
+}
+#else
+static inline void put_vma(struct vm_area_struct *vma)
+{
+       return __free_vma(vma);
+}
+#endif
+
 /*
  * Close a vm structure and free it, returning the next.
  */
@@ -298,10 +319,7 @@ static struct vm_area_struct *remove_vma(struct vm_area_struct *vma)
        might_sleep();
        if (vma->vm_ops && vma->vm_ops->close)
                vma->vm_ops->close(vma);
-       if (vma->vm_file)
-               fput(vma->vm_file);
-       mpol_put(vma_policy(vma));
-       kmem_cache_free(vm_area_cachep, vma);
+       put_vma(vma);
        return next;
 }
 
@@ -515,6 +533,14 @@ static void validate_mm(struct mm_struct *mm)
 #define validate_mm(mm) do { } while (0)
 #endif
 
+#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
+#define mm_rb_write_lock(mm)   write_lock(&(mm)->mm_rb_lock)
+#define mm_rb_write_unlock(mm) write_unlock(&(mm)->mm_rb_lock)
+#else
+#define mm_rb_write_lock(mm)   do { } while (0)
+#define mm_rb_write_unlock(mm) do { } while (0)
+#endif /* CONFIG_SPECULATIVE_PAGE_FAULT */
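These macros assume an mm_rb_lock rwlock that serialises rbtree writers against the lock-less readers in get_vma() below. The field itself comes from the include/linux/mm_types.h, mm/init-mm.c and kernel/fork.c parts of this commit, which are not shown here; roughly, the assumed additions look like this sketch.

    /* Hedged sketch of the fields assumed by this file; not the literal hunks. */
    struct mm_struct {
            struct rb_root mm_rb;
    #ifdef CONFIG_SPECULATIVE_PAGE_FAULT
            rwlock_t mm_rb_lock;    /* writers vs. get_vma() readers */
    #endif
            /* ... */
    };

    /* ...initialised wherever an mm is set up, e.g. in mm_init(): */
    #ifdef CONFIG_SPECULATIVE_PAGE_FAULT
            rwlock_init(&mm->mm_rb_lock);
    #endif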
+
 RB_DECLARE_CALLBACKS(static, vma_gap_callbacks, struct vm_area_struct, vm_rb,
                     unsigned long, rb_subtree_gap, vma_compute_subtree_gap)
 
@@ -533,16 +559,20 @@ static void vma_gap_update(struct vm_area_struct *vma)
 }
 
 static inline void vma_rb_insert(struct vm_area_struct *vma,
-                                struct rb_root *root)
+                                struct mm_struct *mm)
 {
+       struct rb_root *root = &mm->mm_rb;
+
        /* All rb_subtree_gap values must be consistent prior to insertion */
        validate_mm_rb(root, NULL);
 
        rb_insert_augmented(&vma->vm_rb, root, &vma_gap_callbacks);
 }
 
-static void vma_rb_erase(struct vm_area_struct *vma, struct rb_root *root)
+static void vma_rb_erase(struct vm_area_struct *vma, struct mm_struct *mm)
 {
+       struct rb_root *root = &mm->mm_rb;
+
        /*
         * All rb_subtree_gap values must be consistent prior to erase,
         * with the possible exception of the vma being erased.
@@ -554,7 +584,15 @@ static void vma_rb_erase(struct vm_area_struct *vma, struct rb_root *root)
         * so make sure we instantiate it only once with our desired
         * augmented rbtree callbacks.
         */
+       mm_rb_write_lock(mm);
        rb_erase_augmented(&vma->vm_rb, root, &vma_gap_callbacks);
+       mm_rb_write_unlock(mm); /* wmb */
+
+       /*
+        * Ensure the removal is complete before clearing the node.
+        * Matched by vma_has_changed()/handle_speculative_fault().
+        */
+       RB_CLEAR_NODE(&vma->vm_rb);
 }
 
 /*
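vma_has_changed(), referenced in the comment above, is defined elsewhere in this commit (include/linux/mm.h is in the changed-file list). A plausible sketch of the pairing, assuming the vm_fault fields used throughout this series: the cleared rbtree node marks a VMA that has been erased, and the sequence count catches any concurrent __vma_adjust().

    /* Illustrative sketch; the real helper ships in a hunk not shown here. */
    static inline bool vma_has_changed(struct vm_fault *vmf)
    {
            int ret = RB_EMPTY_NODE(&vmf->vma->vm_rb);
            unsigned int seq = READ_ONCE(vmf->vma->vm_sequence.sequence);

            /*
             * Pairs with the ordering in vma_rb_erase(): the cleared node
             * must be observed before the sequence count is trusted.
             */
            smp_rmb();

            return ret || seq != vmf->sequence;
    }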
@@ -669,10 +707,12 @@ void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma,
         * immediately update the gap to the correct value. Finally we
         * rebalance the rbtree after all augmented values have been set.
         */
+       mm_rb_write_lock(mm);
        rb_link_node(&vma->vm_rb, rb_parent, rb_link);
        vma->rb_subtree_gap = 0;
        vma_gap_update(vma);
-       vma_rb_insert(vma, &mm->mm_rb);
+       vma_rb_insert(vma, mm);
+       mm_rb_write_unlock(mm);
 }
 
 static void __vma_link_file(struct vm_area_struct *vma)
@@ -746,7 +786,7 @@ __vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma,
 {
        struct vm_area_struct *next;
 
-       vma_rb_erase(vma, &mm->mm_rb);
+       vma_rb_erase(vma, mm);
        prev->vm_next = next = vma->vm_next;
        if (next)
                next->vm_prev = prev;
@@ -762,8 +802,9 @@ __vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma,
  * are necessary.  The "insert" vma (if any) is to be inserted
  * before we drop the necessary locks.
  */
-int vma_adjust(struct vm_area_struct *vma, unsigned long start,
-       unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert)
+int __vma_adjust(struct vm_area_struct *vma, unsigned long start,
+       unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert,
+       bool keep_locked)
 {
        struct mm_struct *mm = vma->vm_mm;
        struct vm_area_struct *next = vma->vm_next;
@@ -776,6 +817,30 @@ int vma_adjust(struct vm_area_struct *vma, unsigned long start,
        long adjust_next = 0;
        int remove_next = 0;
 
+       /*
+        * Why use the vm_raw_write*() functions here? To avoid lockdep's
+        * warning.
+        *
+        * Lockdep is complaining about a theoretical lock dependency
+        * involving 3 locks:
+        *       mapping->i_mmap_rwsem --> vma->vm_sequence --> fs_reclaim
+        *
+        * Here are the major paths leading to this dependency:
+        *      1. __vma_adjust()           mmap_sem     -> vm_sequence -> i_mmap_rwsem
+        *      2. move_vma()               mmap_sem     -> vm_sequence -> fs_reclaim
+        *      3. __alloc_pages_nodemask() fs_reclaim   -> i_mmap_rwsem
+        *      4. unmap_mapping_range()    i_mmap_rwsem -> vm_sequence
+        *
+        * So there is no way to solve this easily, especially because in
+        * unmap_mapping_range() the i_mmap_rwsem is grabbed while the
+        * impacted VMAs are not yet known.
+        * However, the way vm_sequence is used guarantees that we will
+        * never block on it, since we just check its value and never wait
+        * for it to move, see vma_has_changed() and handle_speculative_fault().
+        */
+       vm_raw_write_begin(vma);
+       if (next)
+               vm_raw_write_begin(next);
+
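The lockdep discussion above only makes sense if vm_write_begin()/vm_write_end() and the vm_raw_write_*() variants are thin wrappers around the VMA's seqcount, with the raw forms skipping lockdep's acquire/release tracking. They are defined in include/linux/mm.h by this commit (not shown here); a hedged sketch:

    /* Sketch of the assumed wrappers around vma->vm_sequence. */
    static inline void vm_write_begin(struct vm_area_struct *vma)
    {
            write_seqcount_begin(&vma->vm_sequence);
    }

    static inline void vm_write_end(struct vm_area_struct *vma)
    {
            write_seqcount_end(&vma->vm_sequence);
    }

    /* Raw variants: same write-side bump, no lockdep dependency recorded. */
    static inline void vm_raw_write_begin(struct vm_area_struct *vma)
    {
            raw_write_seqcount_begin(&vma->vm_sequence);
    }

    static inline void vm_raw_write_end(struct vm_area_struct *vma)
    {
            raw_write_seqcount_end(&vma->vm_sequence);
    }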
        if (next && !insert) {
                struct vm_area_struct *exporter = NULL;
 
@@ -817,8 +882,12 @@ again:                     remove_next = 1 + (end > next->vm_end);
 
                        importer->anon_vma = exporter->anon_vma;
                        error = anon_vma_clone(importer, exporter);
-                       if (error)
+                       if (error) {
+                               if (next && next != vma)
+                                       vm_raw_write_end(next);
+                               vm_raw_write_end(vma);
                                return error;
+                       }
                }
        }
 
@@ -864,17 +933,18 @@ again:                    remove_next = 1 + (end > next->vm_end);
        }
 
        if (start != vma->vm_start) {
-               vma->vm_start = start;
+               WRITE_ONCE(vma->vm_start, start);
                start_changed = true;
        }
        if (end != vma->vm_end) {
-               vma->vm_end = end;
+               WRITE_ONCE(vma->vm_end, end);
                end_changed = true;
        }
-       vma->vm_pgoff = pgoff;
+       WRITE_ONCE(vma->vm_pgoff, pgoff);
        if (adjust_next) {
-               next->vm_start += adjust_next << PAGE_SHIFT;
-               next->vm_pgoff += adjust_next;
+               WRITE_ONCE(next->vm_start,
+                          next->vm_start + (adjust_next << PAGE_SHIFT));
+               WRITE_ONCE(next->vm_pgoff, next->vm_pgoff + adjust_next);
        }
 
        if (root) {
@@ -929,19 +999,21 @@ again:                    remove_next = 1 + (end > next->vm_end);
        if (remove_next) {
                if (file) {
                        uprobe_munmap(next, next->vm_start, next->vm_end);
-                       fput(file);
                }
                if (next->anon_vma)
                        anon_vma_merge(vma, next);
                mm->map_count--;
-               mpol_put(vma_policy(next));
-               kmem_cache_free(vm_area_cachep, next);
+               vm_raw_write_end(next);
+               put_vma(next);
                /*
                 * In mprotect's case 6 (see comments on vma_merge),
                 * we must remove another next too. It would clutter
                 * up the code too much to do both in one go.
                 */
                next = vma->vm_next;
+               if (next)
+                       vm_raw_write_begin(next);
+
                if (remove_next == 2)
                        goto again;
                else if (next)
@@ -952,6 +1024,12 @@ again:                    remove_next = 1 + (end > next->vm_end);
        if (insert && file)
                uprobe_mmap(insert);
 
+       if (next && next != vma)
+               vm_raw_write_end(next);
+
+       if (!keep_locked)
+               vm_raw_write_end(vma);
+
        validate_mm(mm);
 
        return 0;
@@ -1080,13 +1158,13 @@ can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags,
  * Odd one out? Case 8, because it extends NNNN but needs flags of XXXX:
  * mprotect_fixup updates vm_flags & vm_page_prot on successful return.
  */
-struct vm_area_struct *vma_merge(struct mm_struct *mm,
+struct vm_area_struct *__vma_merge(struct mm_struct *mm,
                        struct vm_area_struct *prev, unsigned long addr,
                        unsigned long end, unsigned long vm_flags,
                        struct anon_vma *anon_vma, struct file *file,
                        pgoff_t pgoff, struct mempolicy *policy,
                        struct vm_userfaultfd_ctx vm_userfaultfd_ctx,
-                       const char __user *anon_name)
+                       const char __user *anon_name, bool keep_locked)
 {
        pgoff_t pglen = (end - addr) >> PAGE_SHIFT;
        struct vm_area_struct *area, *next;
@@ -1129,11 +1207,11 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
                                is_mergeable_anon_vma(prev->anon_vma,
                                                      next->anon_vma, NULL)) {
                                                        /* cases 1, 6 */
-                       err = vma_adjust(prev, prev->vm_start,
-                               next->vm_end, prev->vm_pgoff, NULL);
+                       err = __vma_adjust(prev, prev->vm_start,
+                               next->vm_end, prev->vm_pgoff, NULL, keep_locked);
                } else                                  /* cases 2, 5, 7 */
-                       err = vma_adjust(prev, prev->vm_start,
-                               end, prev->vm_pgoff, NULL);
+                       err = __vma_adjust(prev, prev->vm_start,
+                               end, prev->vm_pgoff, NULL, keep_locked);
                if (err)
                        return NULL;
                khugepaged_enter_vma_merge(prev, vm_flags);
@@ -1150,11 +1228,11 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
                                             vm_userfaultfd_ctx,
                                             anon_name)) {
                if (prev && addr < prev->vm_end)        /* case 4 */
-                       err = vma_adjust(prev, prev->vm_start,
-                               addr, prev->vm_pgoff, NULL);
+                       err = __vma_adjust(prev, prev->vm_start,
+                               addr, prev->vm_pgoff, NULL, keep_locked);
                else                                    /* cases 3, 8 */
-                       err = vma_adjust(area, addr, next->vm_end,
-                               next->vm_pgoff - pglen, NULL);
+                       err = __vma_adjust(area, addr, next->vm_end,
+                               next->vm_pgoff - pglen, NULL, keep_locked);
                if (err)
                        return NULL;
                khugepaged_enter_vma_merge(area, vm_flags);
@@ -1690,7 +1768,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
        vma->vm_flags = vm_flags;
        vma->vm_page_prot = vm_get_page_prot(vm_flags);
        vma->vm_pgoff = pgoff;
-       INIT_LIST_HEAD(&vma->anon_vma_chain);
+       INIT_VMA(vma);
 
        if (file) {
                if (vm_flags & VM_DENYWRITE) {
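INIT_VMA() replaces the bare INIT_LIST_HEAD(&vma->anon_vma_chain) in every VMA constructor touched by this commit. Its definition lands in include/linux/mm.h (listed in the commit, not shown in this excerpt); presumably it also initialises the per-VMA sequence count and reference count the speculative handler depends on, along these lines:

    /* Hedged sketch of the assumed helper; not the literal include/linux/mm.h hunk. */
    #ifdef CONFIG_SPECULATIVE_PAGE_FAULT
    #define INIT_VMA(vma)                                           \
            do {                                                    \
                    INIT_LIST_HEAD(&(vma)->anon_vma_chain);         \
                    seqcount_init(&(vma)->vm_sequence);             \
                    atomic_set(&(vma)->vm_ref_count, 1);            \
            } while (0)
    #else
    #define INIT_VMA(vma)   INIT_LIST_HEAD(&(vma)->anon_vma_chain)
    #endif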
@@ -1743,13 +1821,15 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
 out:
        perf_event_mmap(vma);
 
+       vm_write_begin(vma);
        vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT);
        if (vm_flags & VM_LOCKED) {
                if (!((vm_flags & VM_SPECIAL) || is_vm_hugetlb_page(vma) ||
                                        vma == get_gate_vma(current->mm)))
                        mm->locked_vm += (len >> PAGE_SHIFT);
                else
-                       vma->vm_flags &= VM_LOCKED_CLEAR_MASK;
+                       WRITE_ONCE(vma->vm_flags,
+                               vma->vm_flags & VM_LOCKED_CLEAR_MASK);
        }
 
        if (file)
@@ -1762,9 +1842,10 @@ out:
         * then new mapped in-place (which must be aimed as
         * a completely new data area).
         */
-       vma->vm_flags |= VM_SOFTDIRTY;
+       WRITE_ONCE(vma->vm_flags, vma->vm_flags | VM_SOFTDIRTY);
 
        vma_set_page_prot(vma);
+       vm_write_end(vma);
 
        return addr;
 
@@ -2126,15 +2207,10 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
 EXPORT_SYMBOL(get_unmapped_area);
 
 /* Look up the first VMA which satisfies  addr < vm_end,  NULL if none. */
-struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
+struct vm_area_struct *__find_vma(struct mm_struct *mm, unsigned long addr)
 {
        struct rb_node *rb_node;
-       struct vm_area_struct *vma;
-
-       /* Check the cache first. */
-       vma = vmacache_find(mm, addr);
-       if (likely(vma))
-               return vma;
+       struct vm_area_struct *vma = NULL;
 
        rb_node = mm->mm_rb.rb_node;
 
@@ -2152,13 +2228,41 @@ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
                        rb_node = rb_node->rb_right;
        }
 
+       return vma;
+}
+
+struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
+{
+       struct vm_area_struct *vma;
+
+       /* Check the cache first. */
+       vma = vmacache_find(mm, addr);
+       if (likely(vma))
+               return vma;
+
+       vma = __find_vma(mm, addr);
+
        if (vma)
                vmacache_update(addr, vma);
        return vma;
 }
-
 EXPORT_SYMBOL(find_vma);
 
+#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
+struct vm_area_struct *get_vma(struct mm_struct *mm, unsigned long addr)
+{
+       struct vm_area_struct *vma = NULL;
+
+       read_lock(&mm->mm_rb_lock);
+       vma = __find_vma(mm, addr);
+       if (vma)
+               atomic_inc(&vma->vm_ref_count);
+       read_unlock(&mm->mm_rb_lock);
+
+       return vma;
+}
+#endif
+
 /*
  * Same as find_vma, but also return a pointer to the previous VMA in *pprev.
  */
@@ -2389,8 +2493,8 @@ int expand_downwards(struct vm_area_struct *vma,
                                vm_stat_account(mm, vma->vm_flags,
                                                vma->vm_file, grow);
                                anon_vma_interval_tree_pre_update_vma(vma);
-                               vma->vm_start = address;
-                               vma->vm_pgoff -= grow;
+                               WRITE_ONCE(vma->vm_start, address);
+                               WRITE_ONCE(vma->vm_pgoff, vma->vm_pgoff - grow);
                                anon_vma_interval_tree_post_update_vma(vma);
                                vma_gap_update(vma);
                                spin_unlock(&mm->page_table_lock);
@@ -2536,7 +2640,7 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,
        insertion_point = (prev ? &prev->vm_next : &mm->mmap);
        vma->vm_prev = NULL;
        do {
-               vma_rb_erase(vma, &mm->mm_rb);
+               vma_rb_erase(vma, mm);
                mm->map_count--;
                tail_vma = vma;
                vma = vma->vm_next;
@@ -2574,7 +2678,7 @@ static int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
        /* most fields are the same, copy all, and then fixup */
        *new = *vma;
 
-       INIT_LIST_HEAD(&new->anon_vma_chain);
+       INIT_VMA(new);
 
        if (new_below)
                new->vm_end = addr;
@@ -2908,7 +3012,7 @@ static unsigned long do_brk(unsigned long addr, unsigned long len)
                return -ENOMEM;
        }
 
-       INIT_LIST_HEAD(&vma->anon_vma_chain);
+       INIT_VMA(vma);
        vma->vm_mm = mm;
        vma->vm_start = addr;
        vma->vm_end = addr + len;
@@ -3058,9 +3162,21 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
 
        if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent))
                return NULL;    /* should never get here */
-       new_vma = vma_merge(mm, prev, addr, addr + len, vma->vm_flags,
+
+       /* There are 3 cases to manage here:
+        *     AAAA            AAAA              AAAA              AAAA
+        * PPPP....      PPPP......NNNN      PPPP....NNNN      PP........NN
+        * PPPPPPPP(A)   PPPP..NNNNNNNN(B)   PPPPPPPPPPPP(1)       NULL
+        *                                   PPPPPPPPNNNN(2)
+        *                                   PPPPNNNNNNNN(3)
+        *
+        * new_vma == prev in case A,1,2
+        * new_vma == next in case B,3
+        */
+       new_vma = __vma_merge(mm, prev, addr, addr + len, vma->vm_flags,
                            vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma),
-                           vma->vm_userfaultfd_ctx, vma_get_anon_name(vma));
+                           vma->vm_userfaultfd_ctx, vma_get_anon_name(vma), true);
+
        if (new_vma) {
                /*
                 * Source vma may have been merged into new_vma
@@ -3093,13 +3209,24 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
                new_vma->vm_pgoff = pgoff;
                if (vma_dup_policy(vma, new_vma))
                        goto out_free_vma;
-               INIT_LIST_HEAD(&new_vma->anon_vma_chain);
+               INIT_VMA(new_vma);
                if (anon_vma_clone(new_vma, vma))
                        goto out_free_mempol;
                if (new_vma->vm_file)
                        get_file(new_vma->vm_file);
                if (new_vma->vm_ops && new_vma->vm_ops->open)
                        new_vma->vm_ops->open(new_vma);
+
+               /*
+                * As the VMA is linked right now, it may be hit by the
+                * speculative page fault handler. But we don't want it to
+                * start mapping pages in this area until the caller has
+                * potentially moved the ptes from the moved VMA. To prevent
+                * that we protect it right now, and let the caller unprotect
+                * it once the move is done.
+                */
+               vm_raw_write_begin(new_vma);
+
                vma_link(mm, new_vma, prev, rb_link, rb_parent);
                *need_rmap_locks = false;
        }
@@ -3193,7 +3320,7 @@ static struct vm_area_struct *__install_special_mapping(
        if (unlikely(vma == NULL))
                return ERR_PTR(-ENOMEM);
 
-       INIT_LIST_HEAD(&vma->anon_vma_chain);
+       INIT_VMA(vma);
        vma->vm_mm = mm;
        vma->vm_start = addr;
        vma->vm_end = addr + len;
index fcd678c..39260d0 100644 (file)
@@ -368,12 +368,14 @@ success:
         * vm_flags and vm_page_prot are protected by the mmap_sem
         * held in write mode.
         */
-       vma->vm_flags = newflags;
+       vm_write_begin(vma);
+       WRITE_ONCE(vma->vm_flags, newflags);
        dirty_accountable = vma_wants_writenotify(vma);
        vma_set_page_prot(vma);
 
        change_protection(vma, start, end, vma->vm_page_prot,
                          dirty_accountable, 0);
+       vm_write_end(vma);
 
        /*
         * Private VM_LOCKED VMA becoming writable: trigger COW to avoid major
index 450b306..a0ab573 100644 (file)
@@ -285,6 +285,14 @@ static unsigned long move_vma(struct vm_area_struct *vma,
        if (!new_vma)
                return -ENOMEM;
 
+       /* new_vma is returned protected by copy_vma, to prevent a speculative
+        * page fault from being handled in the destination area before we move
+        * the ptes. Now, we must also protect the source VMA since we don't
+        * want pages to be mapped behind our back while we are copying the PTEs.
+        */
+       if (vma != new_vma)
+               vm_raw_write_begin(vma);
+
        moved_len = move_page_tables(vma, old_addr, new_vma, new_addr, old_len,
                                     need_rmap_locks);
        if (moved_len < old_len) {
@@ -301,6 +309,9 @@ static unsigned long move_vma(struct vm_area_struct *vma,
                 */
                move_page_tables(new_vma, new_addr, vma, old_addr, moved_len,
                                 true);
+               if (vma != new_vma)
+                       vm_raw_write_end(vma);
+
                vma = new_vma;
                old_len = new_len;
                old_addr = new_addr;
@@ -308,7 +319,11 @@ static unsigned long move_vma(struct vm_area_struct *vma,
        } else {
                arch_remap(mm, old_addr, old_addr + old_len,
                           new_addr, new_addr + new_len);
+
+               if (vma != new_vma)
+                       vm_raw_write_end(vma);
        }
+       vm_raw_write_end(new_vma);
 
        /* Conceal VM_ACCOUNT so old reservation is not undone */
        if (vm_flags & VM_ACCOUNT) {
index ca1884c..e86cc3a 100644 (file)
@@ -1270,7 +1270,7 @@ unsigned long do_mmap(struct file *file,
        region->vm_flags = vm_flags;
        region->vm_pgoff = pgoff;
 
-       INIT_LIST_HEAD(&vma->anon_vma_chain);
+       INIT_VMA(vma);
        vma->vm_flags = vm_flags;
        vma->vm_pgoff = pgoff;
 
index 59e1c26..81f201d 100644 (file)
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1208,10 +1208,9 @@ void do_page_add_anon_rmap(struct page *page,
  * This means the inc-and-test can be bypassed.
  * Page does not have to be locked.
  */
-void page_add_new_anon_rmap(struct page *page,
+void __page_add_new_anon_rmap(struct page *page,
        struct vm_area_struct *vma, unsigned long address)
 {
-       VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma);
        SetPageSwapBacked(page);
        atomic_set(&page->_mapcount, 0); /* increment count (starts at -1) */
        if (PageTransHuge(page))
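The rename drops the VM_BUG_ON_VMA() bounds check from the shared helper, since on the speculative path vma->vm_start/vm_end may be changing under us. Callers that do hold the mmap_sem presumably keep the old name as a checking wrapper (the include/linux/rmap.h hunk is in this commit's file list but not shown), roughly:

    /* Hedged sketch of the assumed wrapper for mmap_sem-protected callers. */
    static inline void page_add_new_anon_rmap(struct page *page,
                    struct vm_area_struct *vma, unsigned long address)
    {
            VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma);
            __page_add_new_anon_rmap(page, vma, address);
    }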
index ee61c48..d414381 100644 (file)
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -708,12 +708,12 @@ void add_page_to_unevictable_list(struct page *page)
  * directly back onto it's zone's unevictable list, it does NOT use a
  * per cpu pagevec.
  */
-void lru_cache_add_active_or_unevictable(struct page *page,
-                                        struct vm_area_struct *vma)
+void __lru_cache_add_active_or_unevictable(struct page *page,
+                                        unsigned long vma_flags)
 {
        VM_BUG_ON_PAGE(PageLRU(page), page);
 
-       if (likely((vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) != VM_LOCKED)) {
+       if (likely((vma_flags & (VM_LOCKED | VM_SPECIAL)) != VM_LOCKED)) {
                SetPageActive(page);
                lru_cache_add(page);
                return;
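Same pattern as the rmap change above: the helper now takes a snapshot of the VMA flags so the speculative path can pass the vmf->vma_flags it sampled, and the original signature is presumably kept as a wrapper in include/linux/swap.h (listed in this commit, not shown):

    /* Hedged sketch of the assumed include/linux/swap.h wrapper. */
    static inline void lru_cache_add_active_or_unevictable(struct page *page,
                                            struct vm_area_struct *vma)
    {
            __lru_cache_add_active_or_unevictable(page, vma->vm_flags);
    }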
index aa3015e..3c55085 100644 (file)
@@ -871,6 +871,9 @@ const char * const vmstat_text[] = {
        "vmacache_find_calls",
        "vmacache_find_hits",
 #endif
+#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
+       "speculative_pgfault",
+#endif
 #endif /* CONFIG_VM_EVENTS_COUNTERS */
 };
 #endif /* CONFIG_PROC_FS || CONFIG_SYSFS || CONFIG_NUMA */