
mm: backport speculative page fault
author Park Ju Hyung <qkrwngud825@gmail.com>
Thu, 20 Jun 2019 18:53:36 +0000 (03:53 +0900)
committer 0ranko0P <ranko0p@outlook.com>
Wed, 4 Dec 2019 17:17:29 +0000 (01:17 +0800)
Signed-off-by: Park Ju Hyung <qkrwngud825@gmail.com>
28 files changed:
arch/arm64/Kconfig
arch/arm64/mm/fault.c
fs/exec.c
fs/proc/task_mmu.c
include/linux/mm.h
include/linux/mm_types.h
include/linux/pagemap.h
include/linux/rmap.h
include/linux/swap.h
include/linux/vm_event_item.h
include/trace/events/pagefault.h [new file with mode: 0644]
include/uapi/linux/perf_event.h
kernel/fork.c
mm/Kconfig
mm/filemap.c
mm/init-mm.c
mm/internal.h
mm/madvise.c
mm/memory.c
mm/migrate.c
mm/mlock.c
mm/mmap.c
mm/mprotect.c
mm/mremap.c
mm/nommu.c
mm/rmap.c
mm/swap.c
mm/vmstat.c

diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 83f04b1..2861902 100644
@@ -106,6 +106,7 @@ config ARM64
        select HAVE_CONTEXT_TRACKING
        select HAVE_ARM_SMCCC
        select THREAD_INFO_IN_TASK
+       select ARCH_SUPPORTS_SPECULATIVE_PAGE_FAULT
        help
          ARM 64-bit (AArch64) Linux support.
 
diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
index 4970252..66e6e5f 100644
@@ -343,6 +343,16 @@ static int __kprobes do_page_fault(unsigned long addr, unsigned int esr,
        }
 
        /*
+        * let's try a speculative page fault without grabbing the
+        * mmap_sem.
+        */
+       fault = handle_speculative_fault(mm, addr, mm_flags);
+       if (fault != VM_FAULT_RETRY) {
+               perf_sw_event(PERF_COUNT_SW_SPF, 1, regs, addr);
+               goto done;
+       }
+
+       /*
         * As per x86, we may deadlock here. However, since the kernel only
         * validly references user space from well defined areas of the code,
         * we can bug out early if this is from code which shouldn't.
@@ -407,6 +417,8 @@ retry:
 
        up_read(&mm->mmap_sem);
 
+done:
+
        /*
         * Handle the "normal" case first - VM_FAULT_MAJOR / VM_FAULT_MINOR
         */
diff --git a/fs/exec.c b/fs/exec.c
index 23215a7..83b58dd 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -303,7 +303,7 @@ static int __bprm_mm_init(struct linux_binprm *bprm)
        vma->vm_start = vma->vm_end - PAGE_SIZE;
        vma->vm_flags = VM_SOFTDIRTY | VM_STACK_FLAGS | VM_STACK_INCOMPLETE_SETUP;
        vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
-       INIT_LIST_HEAD(&vma->anon_vma_chain);
+       INIT_VMA(vma);
 
        err = insert_vm_struct(mm, vma);
        if (err)
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index c3faa39..3204874 100644
@@ -1103,8 +1103,11 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
                                        goto out_mm;
                                }
                                for (vma = mm->mmap; vma; vma = vma->vm_next) {
-                                       vma->vm_flags &= ~VM_SOFTDIRTY;
+                                       vm_write_begin(vma);
+                                       WRITE_ONCE(vma->vm_flags,
+                                                  vma->vm_flags & ~VM_SOFTDIRTY);
                                        vma_set_page_prot(vma);
+                                       vm_write_end(vma);
                                }
                                downgrade_write(&mm->mmap_sem);
                                break;
@@ -1508,7 +1511,7 @@ const struct file_operations proc_pagemap_operations = {
 #endif /* CONFIG_PROC_PAGE_MONITOR */
 
 #ifdef CONFIG_PROCESS_RECLAIM
-static int reclaim_pte_range(pmd_t *pmd, unsigned long addr,
+int reclaim_pte_range(pmd_t *pmd, unsigned long addr,
                                unsigned long end, struct mm_walk *walk)
 {
        struct reclaim_param *rp = walk->private;
@@ -1535,6 +1538,9 @@ cont:
                if (!page)
                        continue;
 
+               if (page_mapcount(page) != 1)
+                       continue;
+
                if (isolate_lru_page(page))
                        continue;
 
@@ -1557,7 +1563,7 @@ cont:
                goto cont;
 
        cond_resched();
-       return 0;
+       return (rp->nr_to_reclaim == 0) ? -EPIPE : 0;
 }
 
 enum reclaim_type {
@@ -1628,6 +1634,7 @@ static ssize_t reclaim_write(struct file *file, const char __user *buf,
        unsigned long start = 0;
        unsigned long end = 0;
        struct reclaim_param rp;
+       int ret;
 
        memset(buffer, 0, sizeof(buffer));
        if (count > sizeof(buffer) - 1)
@@ -1689,7 +1696,7 @@ static ssize_t reclaim_write(struct file *file, const char __user *buf,
        reclaim_walk.mm = mm;
        reclaim_walk.pmd_entry = reclaim_pte_range;
 
-       rp.nr_to_reclaim = ~0;
+       rp.nr_to_reclaim = INT_MAX;
        rp.nr_reclaimed = 0;
        reclaim_walk.private = &rp;
 
@@ -1703,9 +1710,11 @@ static ssize_t reclaim_write(struct file *file, const char __user *buf,
                                continue;
 
                        rp.vma = vma;
-                       walk_page_range(max(vma->vm_start, start),
+                       ret = walk_page_range(max(vma->vm_start, start),
                                        min(vma->vm_end, end),
                                        &reclaim_walk);
+                       if (ret)
+                               break;
                        vma = vma->vm_next;
                }
        } else {
@@ -1720,8 +1729,10 @@ static ssize_t reclaim_write(struct file *file, const char __user *buf,
                                continue;
 
                        rp.vma = vma;
-                       walk_page_range(vma->vm_start, vma->vm_end,
+                       ret = walk_page_range(vma->vm_start, vma->vm_end,
                                &reclaim_walk);
+                       if (ret)
+                               break;
                }
        }
 
diff --git a/include/linux/mm.h b/include/linux/mm.h
index ed48a15..4ea0553 100644
@@ -235,6 +235,7 @@ extern pgprot_t protection_map[16];
 #define FAULT_FLAG_KILLABLE    0x10    /* The fault task is in SIGKILL killable region */
 #define FAULT_FLAG_TRIED       0x20    /* Second try */
 #define FAULT_FLAG_USER                0x40    /* The fault originated in userspace */
+#define FAULT_FLAG_SPECULATIVE 0x200   /* Speculative fault, not holding mmap_sem */
 
 /*
  * vm_fault is filled by the the pagefault handler and passed to the vma's
@@ -247,6 +248,29 @@ extern pgprot_t protection_map[16];
  * pgoff should be used in favour of virtual_address, if possible.
  */
 struct vm_fault {
+       struct vm_area_struct *vma;     /* Target VMA */
+       pmd_t *pmd;                     /* Pointer to pmd entry matching
+                                        * the 'address' */
+       pud_t *pud;                     /* Pointer to pud entry matching
+                                        * the 'address'
+                                        */
+       unsigned long address;          /* Faulting virtual address */
+       spinlock_t *ptl;                /* Page table lock.
+                                        * Protects pte page table if 'pte'
+                                        * is not NULL, otherwise pmd.
+                                        */
+       pte_t orig_pte;                 /* Value of PTE at the time of fault */
+       /*
+        * These entries are required when handling a speculative page fault.
+        * This way the page handling is done using consistent field values.
+        */
+       unsigned long vma_flags;
+       pgprot_t vma_page_prot;
+#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
+       unsigned int sequence;
+       pmd_t orig_pmd;                 /* Value of PMD at the time of fault */
+#endif
+
        unsigned int flags;             /* FAULT_FLAG_xxx flags */
        gfp_t gfp_mask;                 /* gfp mask to be used for allocations */
        pgoff_t pgoff;                  /* Logical page offset based on vma */
@@ -588,15 +612,15 @@ static inline void set_compound_order(struct page *page, unsigned int order)
  * pte_mkwrite.  But get_user_pages can cause write faults for mappings
  * that do not have writing enabled, when used by access_process_vm.
  */
-static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
+static inline pte_t maybe_mkwrite(pte_t pte, unsigned long vma_flags)
 {
-       if (likely(vma->vm_flags & VM_WRITE))
+       if (likely(vma_flags & VM_WRITE))
                pte = pte_mkwrite(pte);
        return pte;
 }
 
 void do_set_pte(struct vm_area_struct *vma, unsigned long address,
-               struct page *page, pte_t *pte, bool write, bool anon);
+               struct page *page, pte_t *pte, bool write, bool anon, struct vm_fault *vmf2);
 #endif
 
 /*
@@ -1052,6 +1076,7 @@ static inline void clear_page_pfmemalloc(struct page *page)
 #define VM_FAULT_LOCKED        0x0200  /* ->fault locked the returned page */
 #define VM_FAULT_RETRY 0x0400  /* ->fault blocked, must retry */
 #define VM_FAULT_FALLBACK 0x0800       /* huge page fault failed, fall back to small */
+#define VM_FAULT_PTNOTSAME 0x4000      /* Page table entries have changed */
 
 #define VM_FAULT_HWPOISON_LARGE_MASK 0xf000 /* encodes hpage index for large hwpoison */
 
@@ -1103,8 +1128,23 @@ struct zap_details {
        pgoff_t last_index;                     /* Highest page->index to unmap */
 };
 
-struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
-               pte_t pte);
+static inline void INIT_VMA(struct vm_area_struct *vma)
+{
+       INIT_LIST_HEAD(&vma->anon_vma_chain);
+#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
+       seqcount_init(&vma->vm_sequence);
+       atomic_set(&vma->vm_ref_count, 1);
+#endif
+}
+
+struct page *__vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
+                               pte_t pte, unsigned long vma_flags);
+static inline struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
+                               pte_t pte)
+{
+       return __vm_normal_page(vma, addr, pte, vma->vm_flags);
+}
+
 struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr,
                                pmd_t pmd);
 
@@ -1206,6 +1246,31 @@ int invalidate_inode_page(struct page *page);
 #ifdef CONFIG_MMU
 extern int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
                        unsigned long address, unsigned int flags);
+
+#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
+extern int __handle_speculative_fault(struct mm_struct *mm,
+                                     unsigned long address,
+                                     unsigned int flags);
+static inline int handle_speculative_fault(struct mm_struct *mm,
+                                          unsigned long address,
+                                          unsigned int flags)
+{
+       /*
+        * Try speculative page fault for multithreaded user space tasks only.
+        */
+       if (!(flags & FAULT_FLAG_USER) || atomic_read(&mm->mm_users) == 1)
+               return VM_FAULT_RETRY;
+       return __handle_speculative_fault(mm, address, flags);
+}
+#else
+static inline int handle_speculative_fault(struct mm_struct *mm,
+                                          unsigned long address,
+                                          unsigned int flags)
+{
+       return VM_FAULT_RETRY;
+}
+#endif /* CONFIG_SPECULATIVE_PAGE_FAULT */
+
 extern int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
                            unsigned long address, unsigned int fault_flags);
 #else
@@ -1227,6 +1292,47 @@ static inline int fixup_user_fault(struct task_struct *tsk,
 }
 #endif
 
+#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
+static inline void vm_write_begin(struct vm_area_struct *vma)
+{
+       write_seqcount_begin(&vma->vm_sequence);
+}
+static inline void vm_write_begin_nested(struct vm_area_struct *vma,
+                                        int subclass)
+{
+       write_seqcount_begin_nested(&vma->vm_sequence, subclass);
+}
+static inline void vm_write_end(struct vm_area_struct *vma)
+{
+       write_seqcount_end(&vma->vm_sequence);
+}
+static inline void vm_raw_write_begin(struct vm_area_struct *vma)
+{
+       raw_write_seqcount_begin(&vma->vm_sequence);
+}
+static inline void vm_raw_write_end(struct vm_area_struct *vma)
+{
+       raw_write_seqcount_end(&vma->vm_sequence);
+}
+#else
+static inline void vm_write_begin(struct vm_area_struct *vma)
+{
+}
+static inline void vm_write_begin_nested(struct vm_area_struct *vma,
+                                        int subclass)
+{
+}
+static inline void vm_write_end(struct vm_area_struct *vma)
+{
+}
+static inline void vm_raw_write_begin(struct vm_area_struct *vma)
+{
+}
+static inline void vm_raw_write_end(struct vm_area_struct *vma)
+{
+}
+#endif /* CONFIG_SPECULATIVE_PAGE_FAULT */
+
 extern int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write);
 extern int access_remote_vm(struct mm_struct *mm, unsigned long addr,
                void *buf, int len, unsigned int gup_flags);
@@ -1873,12 +1979,26 @@ void anon_vma_interval_tree_verify(struct anon_vma_chain *node);
 
 /* mmap.c */
 extern int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin);
-extern int vma_adjust(struct vm_area_struct *vma, unsigned long start,
-       unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert);
-extern struct vm_area_struct *vma_merge(struct mm_struct *,
+extern int __vma_adjust(struct vm_area_struct *vma, unsigned long start,
+       unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert, bool keep_locked);
+static inline int vma_adjust(struct vm_area_struct *vma, unsigned long start,
+       unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert)
+{
+       return __vma_adjust(vma, start, end, pgoff, insert, false);
+}
+extern struct vm_area_struct *__vma_merge(struct mm_struct *,
        struct vm_area_struct *prev, unsigned long addr, unsigned long end,
        unsigned long vm_flags, struct anon_vma *, struct file *, pgoff_t,
-       struct mempolicy *, struct vm_userfaultfd_ctx, const char __user *);
+       struct mempolicy *, struct vm_userfaultfd_ctx, const char __user *, bool keep_locked);
+static inline struct vm_area_struct *vma_merge(struct mm_struct *mm,
+               struct vm_area_struct *prev, unsigned long addr, unsigned long end,
+               unsigned long vm_flags, struct anon_vma *anon, struct file *file, pgoff_t off,
+               struct mempolicy *pol, struct vm_userfaultfd_ctx ctx, const char __user *anon_name)
+{
+       return __vma_merge(mm, prev, addr, end, vm_flags, anon, file,
+               off, pol, ctx, anon_name, false);
+}
+
 extern struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *);
 extern int split_vma(struct mm_struct *,
        struct vm_area_struct *, unsigned long addr, int new_below);
@@ -2391,6 +2511,9 @@ struct reclaim_param {
 };
 extern struct reclaim_param reclaim_task_anon(struct task_struct *task,
                int nr_to_reclaim);
+extern int reclaim_pte_range(pmd_t *pmd, unsigned long addr,
+                               unsigned long end, struct mm_walk *walk);
+extern unsigned long reclaim_global(unsigned long nr_to_reclaim);
 #endif
 
 #endif /* __KERNEL__ */
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 29c17fa..d7f4e01 100644
@@ -367,6 +367,10 @@ struct vm_area_struct {
        struct mempolicy *vm_policy;    /* NUMA policy for the VMA */
 #endif
        struct vm_userfaultfd_ctx vm_userfaultfd_ctx;
+#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
+       seqcount_t vm_sequence;
+       atomic_t vm_ref_count;          /* see vma_get(), vma_put() */
+#endif
 };
 
 struct core_thread {
@@ -404,6 +408,9 @@ struct kioctx_table;
 struct mm_struct {
        struct vm_area_struct *mmap;            /* list of VMAs */
        struct rb_root mm_rb;
+#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
+       rwlock_t mm_rb_lock;
+#endif
        u64 vmacache_seqnum;                   /* per-thread vmacache */
 #ifdef CONFIG_MMU
        unsigned long (*get_unmapped_area) (struct file *filp,
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index d2f4a73..ef0f002 100644
@@ -430,8 +430,10 @@ static inline pgoff_t linear_page_index(struct vm_area_struct *vma,
        pgoff_t pgoff;
        if (unlikely(is_vm_hugetlb_page(vma)))
                return linear_hugepage_index(vma, address);
-       pgoff = (address - vma->vm_start) >> PAGE_SHIFT;
-       pgoff += vma->vm_pgoff;
+
+       pgoff = (address - READ_ONCE(vma->vm_start)) >> PAGE_SHIFT;
+       pgoff += READ_ONCE(vma->vm_pgoff);
+
        return pgoff >> (PAGE_CACHE_SHIFT - PAGE_SHIFT);
 }
 
diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index e72b857..9eb8f9d 100644
@@ -159,7 +159,14 @@ void page_move_anon_rmap(struct page *, struct vm_area_struct *, unsigned long);
 void page_add_anon_rmap(struct page *, struct vm_area_struct *, unsigned long);
 void do_page_add_anon_rmap(struct page *, struct vm_area_struct *,
                           unsigned long, int);
-void page_add_new_anon_rmap(struct page *, struct vm_area_struct *, unsigned long);
+void __page_add_new_anon_rmap(struct page *, struct vm_area_struct *, unsigned long);
+static inline void page_add_new_anon_rmap(struct page *page,
+                                       struct vm_area_struct *vma, unsigned long address)
+{
+       VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma);
+       __page_add_new_anon_rmap(page, vma, address);
+}
+
 void page_add_file_rmap(struct page *);
 void page_remove_rmap(struct page *);
 
diff --git a/include/linux/swap.h b/include/linux/swap.h
index f1007d5..d3f84dc 100644
@@ -318,8 +318,13 @@ extern void swap_setup(void);
 
 extern void add_page_to_unevictable_list(struct page *page);
 
-extern void lru_cache_add_active_or_unevictable(struct page *page,
-                                               struct vm_area_struct *vma);
+extern void __lru_cache_add_active_or_unevictable(struct page *page,
+                                               unsigned long vma_flags);
+static inline void lru_cache_add_active_or_unevictable(struct page *page,
+                                               struct vm_area_struct *vma)
+{
+       return __lru_cache_add_active_or_unevictable(page, vma->vm_flags);
+}
 
 /* linux/mm/vmscan.c */
 extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h
index d31afe5..25d09ff 100644
@@ -90,6 +90,9 @@ enum vm_event_item { PGPGIN, PGPGOUT, PGPGOUTCLEAN, PSWPIN, PSWPOUT,
                VMACACHE_FIND_CALLS,
                VMACACHE_FIND_HITS,
 #endif
+#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
+               SPECULATIVE_PGFAULT,
+#endif
                NR_VM_EVENT_ITEMS
 };
 
diff --git a/include/trace/events/pagefault.h b/include/trace/events/pagefault.h
new file mode 100644
index 0000000..d9438f3
--- /dev/null
+++ b/include/trace/events/pagefault.h
@@ -0,0 +1,80 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM pagefault
+
+#if !defined(_TRACE_PAGEFAULT_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_PAGEFAULT_H
+
+#include <linux/tracepoint.h>
+#include <linux/mm.h>
+
+DECLARE_EVENT_CLASS(spf,
+
+       TP_PROTO(unsigned long caller,
+                struct vm_area_struct *vma, unsigned long address),
+
+       TP_ARGS(caller, vma, address),
+
+       TP_STRUCT__entry(
+               __field(unsigned long, caller)
+               __field(unsigned long, vm_start)
+               __field(unsigned long, vm_end)
+               __field(unsigned long, address)
+       ),
+
+       TP_fast_assign(
+               __entry->caller         = caller;
+               __entry->vm_start       = vma->vm_start;
+               __entry->vm_end         = vma->vm_end;
+               __entry->address        = address;
+       ),
+
+       TP_printk("ip:%lx vma:%lx-%lx address:%lx",
+                 __entry->caller, __entry->vm_start, __entry->vm_end,
+                 __entry->address)
+);
+
+DEFINE_EVENT(spf, spf_vma_changed,
+
+       TP_PROTO(unsigned long caller,
+                struct vm_area_struct *vma, unsigned long address),
+
+       TP_ARGS(caller, vma, address)
+);
+
+DEFINE_EVENT(spf, spf_vma_noanon,
+
+       TP_PROTO(unsigned long caller,
+                struct vm_area_struct *vma, unsigned long address),
+
+       TP_ARGS(caller, vma, address)
+);
+
+DEFINE_EVENT(spf, spf_vma_notsup,
+
+       TP_PROTO(unsigned long caller,
+                struct vm_area_struct *vma, unsigned long address),
+
+       TP_ARGS(caller, vma, address)
+);
+
+DEFINE_EVENT(spf, spf_vma_access,
+
+       TP_PROTO(unsigned long caller,
+                struct vm_area_struct *vma, unsigned long address),
+
+       TP_ARGS(caller, vma, address)
+);
+
+DEFINE_EVENT(spf, spf_pmd_changed,
+
+       TP_PROTO(unsigned long caller,
+                struct vm_area_struct *vma, unsigned long address),
+
+       TP_ARGS(caller, vma, address)
+);
+
+#endif /* _TRACE_PAGEFAULT_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index 686da16..80c9564 100644
@@ -111,6 +111,7 @@ enum perf_sw_ids {
        PERF_COUNT_SW_EMULATION_FAULTS          = 8,
        PERF_COUNT_SW_DUMMY                     = 9,
        PERF_COUNT_SW_BPF_OUTPUT                = 10,
+       PERF_COUNT_SW_SPF                       = 11,
 
        PERF_COUNT_SW_MAX,                      /* non-ABI */
 };
diff --git a/kernel/fork.c b/kernel/fork.c
index 5d3cab4..6a9c619 100644
@@ -468,7 +468,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
                if (!tmp)
                        goto fail_nomem;
                *tmp = *mpnt;
-               INIT_LIST_HEAD(&tmp->anon_vma_chain);
+               INIT_VMA(tmp);
                retval = vma_dup_policy(mpnt, tmp);
                if (retval)
                        goto fail_nomem_policy;
@@ -610,6 +610,9 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
        mm->mmap = NULL;
        mm->mm_rb = RB_ROOT;
        mm->vmacache_seqnum = 0;
+#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
+       rwlock_init(&mm->mm_rb_lock);
+#endif
        atomic_set(&mm->mm_users, 1);
        atomic_set(&mm->mm_count, 1);
        init_rwsem(&mm->mmap_sem);
diff --git a/mm/Kconfig b/mm/Kconfig
index 274a315..80050a3 100644
@@ -751,3 +751,24 @@ config VM_MAX_READAHEAD
         This sets the VM_MAX_READAHEAD value to allow the readahead window
         to grow to a maximum size of configured. This will benefit sequential
         read throughput and thus early boot performance.
+config ARCH_SUPPORTS_SPECULATIVE_PAGE_FAULT
+       def_bool n
+
+config SPECULATIVE_PAGE_FAULT
+       bool "Speculative page faults"
+       default y
+       depends on ARCH_SUPPORTS_SPECULATIVE_PAGE_FAULT
+       depends on MMU && SMP
+       help
+         Try to handle user space page faults without holding the mmap_sem.
+
+         This should allow better concurrency for massively threaded processes,
+         since the page fault handler will not wait for another thread's memory
+         layout change to complete, assuming that this change is done in
+         another part of the process's address space. This type of page fault
+         is called a speculative page fault.
+
+         If a concurrent modification is detected, or if the underlying PMD or
+         PTE tables are not yet allocated, the speculative page fault fails
+         and a classic page fault is then tried.
diff --git a/mm/filemap.c b/mm/filemap.c
index 6f3c539..fe72693 100644
@@ -2147,7 +2147,7 @@ repeat:
                if (file->f_ra.mmap_miss > 0)
                        file->f_ra.mmap_miss--;
                addr = address + (page->index - vmf->pgoff) * PAGE_SIZE;
-               do_set_pte(vma, addr, page, pte, false, false);
+               do_set_pte(vma, addr, page, pte, false, false, vmf);
                unlock_page(page);
                goto next;
 unlock:
diff --git a/mm/init-mm.c b/mm/init-mm.c
index 975e49f..f94d003 100644
@@ -16,6 +16,9 @@
 
 struct mm_struct init_mm = {
        .mm_rb          = RB_ROOT,
+#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
+       .mm_rb_lock = __RW_LOCK_UNLOCKED(init_mm.mm_rb_lock),
+#endif
        .pgd            = swapper_pg_dir,
        .mm_users       = ATOMIC_INIT(2),
        .mm_count       = ATOMIC_INIT(1),
diff --git a/mm/internal.h b/mm/internal.h
index c3533af..88f6ac5 100644
 /* Do not use these with a slab allocator */
 #define GFP_SLAB_BUG_MASK (__GFP_DMA32|__GFP_HIGHMEM|~__GFP_BITS_MASK)
 
+#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
+extern struct vm_area_struct *get_vma(struct mm_struct *mm,
+                                     unsigned long addr);
+extern void put_vma(struct vm_area_struct *vma);
+
+static inline bool vma_has_changed(struct vm_fault *vmf)
+{
+       int ret = RB_EMPTY_NODE(&vmf->vma->vm_rb);
+       unsigned int seq = READ_ONCE(vmf->vma->vm_sequence.sequence);
+
+       /*
+        * Matches both the wmb in write_seqlock_{begin,end}() and
+        * the wmb in vma_rb_erase().
+        */
+       smp_rmb();
+
+       return ret || seq != vmf->sequence;
+}
+#endif /* CONFIG_SPECULATIVE_PAGE_FAULT */
+
 void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma,
                unsigned long floor, unsigned long ceiling);
 
diff --git a/mm/madvise.c b/mm/madvise.c
index d1d09bd..17191ea 100644
@@ -128,7 +128,9 @@ success:
        /*
         * vm_flags is protected by the mmap_sem held in write mode.
         */
-       vma->vm_flags = new_flags;
+       vm_write_begin(vma);
+       WRITE_ONCE(vma->vm_flags, new_flags);
+       vm_write_end(vma);
 
 out:
        if (error == -ENOMEM)
diff --git a/mm/memory.c b/mm/memory.c
index 5dfc9fa..7f9c0c9 100644
@@ -72,6 +72,9 @@
 
 #include "internal.h"
 
+#define CREATE_TRACE_POINTS
+#include <trace/events/pagefault.h>
+
 #if defined(LAST_CPUPID_NOT_IN_PAGE_FLAGS) && !defined(CONFIG_COMPILE_TEST)
 #warning Unfortunate NUMA and NUMA Balancing config, growing page-frame for last_cpupid.
 #endif
@@ -528,7 +531,9 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
                 * Hide vma from rmap and truncate_pagecache before freeing
                 * pgtables
                 */
+               vm_write_begin(vma);
                unlink_anon_vmas(vma);
+               vm_write_end(vma);
                unlink_file_vma(vma);
 
                if (is_vm_hugetlb_page(vma)) {
@@ -542,7 +547,9 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
                               && !is_vm_hugetlb_page(next)) {
                                vma = next;
                                next = vma->vm_next;
+                               vm_write_begin(vma);
                                unlink_anon_vmas(vma);
+                               vm_write_end(vma);
                                unlink_file_vma(vma);
                        }
                        free_pgd_range(tlb, addr, vma->vm_end,
@@ -678,7 +685,7 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
                dump_page(page, "bad pte");
        printk(KERN_ALERT
                "addr:%p vm_flags:%08lx anon_vma:%p mapping:%p index:%lx\n",
-               (void *)addr, vma->vm_flags, vma->anon_vma, mapping, index);
+               (void *)addr, READ_ONCE(vma->vm_flags), vma->anon_vma, mapping, index);
        /*
         * Choose text because data symbols depend on CONFIG_KALLSYMS_ALL=y
         */
@@ -692,7 +699,8 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
 }
 
 /*
- * vm_normal_page -- This function gets the "struct page" associated with a pte.
+ * __vm_normal_page -- This function gets the "struct page" associated with
+ * a pte.
  *
  * "Special" mappings do not wish to be associated with a "struct page" (either
  * it doesn't exist, or it exists but they don't want to touch it). In this
@@ -738,8 +746,8 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
 #else
 # define HAVE_PTE_SPECIAL 0
 #endif
-struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
-                               pte_t pte)
+struct page *__vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
+                               pte_t pte, unsigned long vma_flags)
 {
        unsigned long pfn = pte_pfn(pte);
 
@@ -748,7 +756,7 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
                        goto check_pfn;
                if (vma->vm_ops && vma->vm_ops->find_special_page)
                        return vma->vm_ops->find_special_page(vma, addr);
-               if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))
+               if (vma_flags & (VM_PFNMAP | VM_MIXEDMAP))
                        return NULL;
                if (!is_zero_pfn(pfn))
                        print_bad_pte(vma, addr, pte, NULL);
@@ -757,8 +765,8 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
 
        /* !HAVE_PTE_SPECIAL case follows: */
 
-       if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
-               if (vma->vm_flags & VM_MIXEDMAP) {
+       if (unlikely(vma_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
+               if (vma_flags & VM_MIXEDMAP) {
                        if (!pfn_valid(pfn))
                                return NULL;
                        goto out;
@@ -767,7 +775,7 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
                        off = (addr - vma->vm_start) >> PAGE_SHIFT;
                        if (pfn == vma->vm_pgoff + off)
                                return NULL;
-                       if (!is_cow_mapping(vma->vm_flags))
+                       if (!is_cow_mapping(vma_flags))
                                return NULL;
                }
        }
@@ -925,6 +933,7 @@ static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
        spinlock_t *src_ptl, *dst_ptl;
        int progress = 0;
        int rss[NR_MM_COUNTERS];
+       unsigned long orig_addr = addr;
        swp_entry_t entry = (swp_entry_t){0};
 
 again:
@@ -962,6 +971,14 @@ again:
                progress += 8;
        } while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end);
 
+       /*
+        * Prevent the page fault handler from copying the page while stale
+        * TLB entries are not yet flushed.
+        */
+       if (IS_ENABLED(CONFIG_SPECULATIVE_PAGE_FAULT) &&
+           is_cow_mapping(vma->vm_flags))
+               flush_tlb_range(vma, orig_addr, end);
+
        arch_leave_lazy_mmu_mode();
        spin_unlock(src_ptl);
        pte_unmap(orig_src_pte);
@@ -1289,6 +1306,7 @@ static void unmap_page_range(struct mmu_gather *tlb,
                details = NULL;
 
        BUG_ON(addr >= end);
+       vm_write_begin(vma);
        tlb_start_vma(tlb, vma);
        pgd = pgd_offset(vma->vm_mm, addr);
        do {
@@ -1298,6 +1316,7 @@ static void unmap_page_range(struct mmu_gather *tlb,
                next = zap_pud_range(tlb, vma, pgd, addr, next, details);
        } while (pgd++, addr = next, addr != end);
        tlb_end_vma(tlb, vma);
+       vm_write_end(vma);
 }
 
 
@@ -1969,6 +1988,143 @@ int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
 }
 EXPORT_SYMBOL_GPL(apply_to_page_range);
 
+#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
+static bool pte_spinlock(struct vm_fault *vmf)
+{
+       bool ret = false;
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+       pmd_t pmdval;
+#endif
+
+       /* Check if vma is still valid */
+       if (!(vmf->flags & FAULT_FLAG_SPECULATIVE)) {
+               vmf->ptl = pte_lockptr(vmf->vma->vm_mm, vmf->pmd);
+               spin_lock(vmf->ptl);
+               return true;
+       }
+
+again:
+       local_irq_disable();
+       if (vma_has_changed(vmf)) {
+               trace_spf_vma_changed(_RET_IP_, vmf->vma, vmf->address);
+               goto out;
+       }
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+       /*
+        * We check if the pmd value is still the same to ensure that there
+        * is no huge page collapse operation in progress behind our back.
+        */
+       pmdval = READ_ONCE(*vmf->pmd);
+       if (!pmd_same(pmdval, vmf->orig_pmd)) {
+               trace_spf_pmd_changed(_RET_IP_, vmf->vma, vmf->address);
+               goto out;
+       }
+#endif
+
+       vmf->ptl = pte_lockptr(vmf->vma->vm_mm, vmf->pmd);
+       if (unlikely(!spin_trylock(vmf->ptl))) {
+               local_irq_enable();
+               goto again;
+       }
+
+       if (vma_has_changed(vmf)) {
+               spin_unlock(vmf->ptl);
+               trace_spf_vma_changed(_RET_IP_, vmf->vma, vmf->address);
+               goto out;
+       }
+
+       ret = true;
+out:
+       local_irq_enable();
+       return ret;
+}
+
+static bool pte_map_lock(struct vm_fault *vmf)
+{
+       bool ret = false;
+       pte_t *pte;
+       spinlock_t *ptl;
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+       pmd_t pmdval;
+#endif
+
+       if (!(vmf->flags & FAULT_FLAG_SPECULATIVE)) {
+               vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd,
+                                              vmf->address, &vmf->ptl);
+               return true;
+       }
+
+       /*
+        * The first vma_has_changed() guarantees the page-tables are still
+        * valid, having IRQs disabled ensures they stay around, hence the
+        * second vma_has_changed() to make sure they are still valid once
+        * we've got the lock. After that a concurrent zap_pte_range() will
+        * block on the PTL and thus we're safe.
+        */
+again:
+       local_irq_disable();
+       if (vma_has_changed(vmf)) {
+               trace_spf_vma_changed(_RET_IP_, vmf->vma, vmf->address);
+               goto out;
+       }
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+       /*
+        * We check if the pmd value is still the same to ensure that there
+        * is no huge page collapse operation in progress behind our back.
+        */
+       pmdval = READ_ONCE(*vmf->pmd);
+       if (!pmd_same(pmdval, vmf->orig_pmd)) {
+               trace_spf_pmd_changed(_RET_IP_, vmf->vma, vmf->address);
+               goto out;
+       }
+#endif
+
+       /*
+        * Same as pte_offset_map_lock() except that we call
+        * spin_trylock() in place of spin_lock() to avoid a race with the
+        * unmap path, which may hold the lock and wait for this CPU to
+        * invalidate the TLB while this CPU has IRQs disabled.
+        * Since we are in a speculative path, accept that it could fail.
+        */
+       ptl = pte_lockptr(vmf->vma->vm_mm, vmf->pmd);
+       pte = pte_offset_map(vmf->pmd, vmf->address);
+       if (unlikely(!spin_trylock(ptl))) {
+               pte_unmap(pte);
+               local_irq_enable();
+               goto again;
+       }
+
+       if (vma_has_changed(vmf)) {
+               pte_unmap_unlock(pte, ptl);
+               trace_spf_vma_changed(_RET_IP_, vmf->vma, vmf->address);
+               goto out;
+       }
+
+       vmf->pte = pte;
+       vmf->ptl = ptl;
+       ret = true;
+out:
+       local_irq_enable();
+       return ret;
+}
+#else
+static inline bool pte_spinlock(struct vm_fault *vmf2)
+{
+       vmf2->ptl = pte_lockptr(vmf2->vma->vm_mm, vmf2->pmd);
+       spin_lock(vmf2->ptl);
+       return true;
+}
+
+static inline bool pte_map_lock(struct vm_fault *vmf2)
+{
+       vmf2->pte = pte_offset_map_lock(vmf2->vma->vm_mm, vmf2->pmd,
+                                       vmf2->address, &vmf2->ptl);
+       return true;
+}
+#endif
+
 /*
  * handle_pte_fault chooses page fault handler according to an entry which was
  * read non-atomically.  Before making any commitment, on those architectures
@@ -1976,21 +2132,28 @@ EXPORT_SYMBOL_GPL(apply_to_page_range);
  * parts, do_swap_page must check under lock before unmapping the pte and
  * proceeding (but do_wp_page is only called after already making such a check;
  * and do_anonymous_page can safely check later on).
+ *
+ * pte_unmap_same() returns:
+ * 0                   if the PTEs are the same
+ * VM_FAULT_PTNOTSAME  if the PTEs are different
+ * VM_FAULT_RETRY      if the VMA has changed behind our back during
+ *                     speculative page fault handling.
  */
-static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd,
-                               pte_t *page_table, pte_t orig_pte)
+static inline int pte_unmap_same(struct vm_fault *vmf2)
 {
-       int same = 1;
+       int ret = 0;
 #if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT)
        if (sizeof(pte_t) > sizeof(unsigned long)) {
-               spinlock_t *ptl = pte_lockptr(mm, pmd);
-               spin_lock(ptl);
-               same = pte_same(*page_table, orig_pte);
-               spin_unlock(ptl);
+               if (pte_spinlock(vmf2)) {
+                       if (!pte_same(*vmf2->pte, vmf2->orig_pte))
+                               ret = VM_FAULT_PTNOTSAME;
+                       spin_unlock(vmf2->ptl);
+               } else
+                       ret = VM_FAULT_RETRY;
        }
 #endif
-       pte_unmap(page_table);
-       return same;
+       pte_unmap(vmf2->pte);
+       return ret;
 }
 
 static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va, struct vm_area_struct *vma)
@@ -2081,7 +2244,7 @@ static inline int wp_page_reuse(struct mm_struct *mm,
                        struct vm_area_struct *vma, unsigned long address,
                        pte_t *page_table, spinlock_t *ptl, pte_t orig_pte,
                        struct page *page, int page_mkwrite,
-                       int dirty_shared)
+                       int dirty_shared, struct vm_fault *vmf2)
        __releases(ptl)
 {
        pte_t entry;
@@ -2095,7 +2258,7 @@ static inline int wp_page_reuse(struct mm_struct *mm,
 
        flush_cache_page(vma, address, pte_pfn(orig_pte));
        entry = pte_mkyoung(orig_pte);
-       entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+       entry = maybe_mkwrite(pte_mkdirty(entry), vmf2->vma_flags);
        if (ptep_set_access_flags(vma, address, page_table, entry, 1))
                update_mmu_cache(vma, address, page_table);
        pte_unmap_unlock(page_table, ptl);
@@ -2146,7 +2309,7 @@ static inline int wp_page_reuse(struct mm_struct *mm,
  */
 static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma,
                        unsigned long address, pte_t *page_table, pmd_t *pmd,
-                       pte_t orig_pte, struct page *old_page)
+                       pte_t orig_pte, struct page *old_page, struct vm_fault *vmf2)
 {
        struct page *new_page = NULL;
        spinlock_t *ptl = NULL;
@@ -2155,23 +2318,24 @@ static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma,
        const unsigned long mmun_start = address & PAGE_MASK;   /* For mmu_notifiers */
        const unsigned long mmun_end = mmun_start + PAGE_SIZE;  /* For mmu_notifiers */
        struct mem_cgroup *memcg;
+       int ret = VM_FAULT_OOM;
 
        if (unlikely(anon_vma_prepare(vma)))
-               goto oom;
+               goto out;
 
        if (is_zero_pfn(pte_pfn(orig_pte))) {
                new_page = alloc_zeroed_user_highpage_movable(vma, address);
                if (!new_page)
-                       goto oom;
+                       goto out;
        } else {
                new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
                if (!new_page)
-                       goto oom;
+                       goto out;
                cow_user_page(new_page, old_page, address, vma);
        }
 
        if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg))
-               goto oom_free_new;
+               goto out_free_new;
 
        __SetPageUptodate(new_page);
 
@@ -2180,7 +2344,12 @@ static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma,
        /*
         * Re-check the pte - we dropped the lock
         */
-       page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
+       if (!pte_map_lock(vmf2)) {
+               ret = VM_FAULT_RETRY;
+               goto out_uncharge;
+       }
+       page_table = vmf2->pte;
+       ptl = vmf2->ptl;
        if (likely(pte_same(*page_table, orig_pte))) {
                if (old_page) {
                        if (!PageAnon(old_page)) {
@@ -2191,8 +2360,8 @@ static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma,
                        inc_mm_counter_fast(mm, MM_ANONPAGES);
                }
                flush_cache_page(vma, address, pte_pfn(orig_pte));
-               entry = mk_pte(new_page, vma->vm_page_prot);
-               entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+               entry = mk_pte(new_page, vmf2->vma_page_prot);
+               entry = maybe_mkwrite(pte_mkdirty(entry), vmf2->vma_flags);
                /*
                 * Clear the pte entry and flush it first, before updating the
                 * pte with the new entry. This will avoid a race condition
@@ -2200,9 +2369,9 @@ static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma,
                 * thread doing COW.
                 */
                ptep_clear_flush_notify(vma, address, page_table);
-               page_add_new_anon_rmap(new_page, vma, address);
+               __page_add_new_anon_rmap(new_page, vma, address);
                mem_cgroup_commit_charge(new_page, memcg, false);
-               lru_cache_add_active_or_unevictable(new_page, vma);
+               __lru_cache_add_active_or_unevictable(new_page, vmf2->vma_flags);
                /*
                 * We call the notify macro here because, when using secondary
                 * mmu page tables (such as kvm shadow page tables), we want the
@@ -2253,7 +2422,7 @@ static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma,
                 * Don't let another task, with possibly unlocked vma,
                 * keep the mlocked page.
                 */
-               if (page_copied && (vma->vm_flags & VM_LOCKED)) {
+               if (page_copied && (vmf2->vma_flags & VM_LOCKED)) {
                        lock_page(old_page);    /* LRU manipulation */
                        munlock_vma_page(old_page);
                        unlock_page(old_page);
@@ -2261,12 +2430,14 @@ static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma,
                page_cache_release(old_page);
        }
        return page_copied ? VM_FAULT_WRITE : 0;
-oom_free_new:
+out_uncharge:
+       mem_cgroup_cancel_charge(new_page, memcg);
+out_free_new:
        page_cache_release(new_page);
-oom:
+out:
        if (old_page)
                page_cache_release(old_page);
-       return VM_FAULT_OOM;
+       return ret;
 }
 
 /*
@@ -2276,7 +2447,7 @@ oom:
 static int wp_pfn_shared(struct mm_struct *mm,
                        struct vm_area_struct *vma, unsigned long address,
                        pte_t *page_table, spinlock_t *ptl, pte_t orig_pte,
-                       pmd_t *pmd)
+                       pmd_t *pmd, struct vm_fault *vmf2)
 {
        if (vma->vm_ops && vma->vm_ops->pfn_mkwrite) {
                struct vm_fault vmf = {
@@ -2291,7 +2462,10 @@ static int wp_pfn_shared(struct mm_struct *mm,
                ret = vma->vm_ops->pfn_mkwrite(vma, &vmf);
                if (ret & VM_FAULT_ERROR)
                        return ret;
-               page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
+               if (!pte_map_lock(vmf2))
+                       return VM_FAULT_RETRY;
+               page_table = vmf2->pte;
+               ptl = vmf2->ptl;
                /*
                 * We might have raced with another page fault while we
                 * released the pte_offset_map_lock.
@@ -2302,13 +2476,13 @@ static int wp_pfn_shared(struct mm_struct *mm,
                }
        }
        return wp_page_reuse(mm, vma, address, page_table, ptl, orig_pte,
-                            NULL, 0, 0);
+                            NULL, 0, 0, vmf2);
 }
 
 static int wp_page_shared(struct mm_struct *mm, struct vm_area_struct *vma,
                          unsigned long address, pte_t *page_table,
                          pmd_t *pmd, spinlock_t *ptl, pte_t orig_pte,
-                         struct page *old_page)
+                         struct page *old_page, struct vm_fault *vmf2)
        __releases(ptl)
 {
        int page_mkwrite = 0;
@@ -2336,8 +2510,11 @@ static int wp_page_shared(struct mm_struct *mm, struct vm_area_struct *vma,
                 * they did, we just return, as we can count on the
                 * MMU to tell us if they didn't also make it writable.
                 */
-               page_table = pte_offset_map_lock(mm, pmd, address,
-                                                &ptl);
+               if (!pte_map_lock(vmf2))
+                       return VM_FAULT_RETRY;
+               page_table = vmf2->pte;
+               ptl = vmf2->ptl;
+
                if (!pte_same(*page_table, orig_pte)) {
                        unlock_page(old_page);
                        pte_unmap_unlock(page_table, ptl);
@@ -2348,7 +2525,7 @@ static int wp_page_shared(struct mm_struct *mm, struct vm_area_struct *vma,
        }
 
        return wp_page_reuse(mm, vma, address, page_table, ptl,
-                            orig_pte, old_page, page_mkwrite, 1);
+                            orig_pte, old_page, page_mkwrite, 1, vmf2);
 }
 
 /*
@@ -2371,12 +2548,12 @@ static int wp_page_shared(struct mm_struct *mm, struct vm_area_struct *vma,
  */
 static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
                unsigned long address, pte_t *page_table, pmd_t *pmd,
-               spinlock_t *ptl, pte_t orig_pte)
+               spinlock_t *ptl, pte_t orig_pte, struct vm_fault *vmf2)
        __releases(ptl)
 {
        struct page *old_page;
 
-       old_page = vm_normal_page(vma, address, orig_pte);
+       old_page = __vm_normal_page(vma, address, orig_pte, vmf2->vma_flags);
        if (!old_page) {
                /*
                 * VM_MIXEDMAP !pfn_valid() case, or VM_SOFTDIRTY clear on a
@@ -2385,14 +2562,14 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
                 * We should not cow pages in a shared writeable mapping.
                 * Just mark the pages writable and/or call ops->pfn_mkwrite.
                 */
-               if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
+               if ((vmf2->vma_flags & (VM_WRITE|VM_SHARED)) ==
                                     (VM_WRITE|VM_SHARED))
                        return wp_pfn_shared(mm, vma, address, page_table, ptl,
-                                            orig_pte, pmd);
+                                            orig_pte, pmd, vmf2);
 
                pte_unmap_unlock(page_table, ptl);
                return wp_page_copy(mm, vma, address, page_table, pmd,
-                                   orig_pte, old_page);
+                                   orig_pte, old_page, vmf2);
        }
 
        /*
@@ -2404,8 +2581,13 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
                        page_cache_get(old_page);
                        pte_unmap_unlock(page_table, ptl);
                        lock_page(old_page);
-                       page_table = pte_offset_map_lock(mm, pmd, address,
-                                                        &ptl);
+                       if (!pte_map_lock(vmf2)) {
+                               unlock_page(old_page);
+                               put_page(old_page);
+                               return VM_FAULT_RETRY;
+                       }
+                       page_table = vmf2->pte;
+                       ptl = vmf2->ptl;
                        if (!pte_same(*page_table, orig_pte)) {
                                unlock_page(old_page);
                                pte_unmap_unlock(page_table, ptl);
@@ -2423,13 +2605,13 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
                        page_move_anon_rmap(old_page, vma, address);
                        unlock_page(old_page);
                        return wp_page_reuse(mm, vma, address, page_table, ptl,
-                                            orig_pte, old_page, 0, 0);
+                                            orig_pte, old_page, 0, 0, vmf2);
                }
                unlock_page(old_page);
-       } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
+       } else if (unlikely((vmf2->vma_flags & (VM_WRITE|VM_SHARED)) ==
                                        (VM_WRITE|VM_SHARED))) {
                return wp_page_shared(mm, vma, address, page_table, pmd,
-                                     ptl, orig_pte, old_page);
+                                     ptl, orig_pte, old_page, vmf2);
        }
 
        /*
@@ -2439,7 +2621,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 
        pte_unmap_unlock(page_table, ptl);
        return wp_page_copy(mm, vma, address, page_table, pmd,
-                           orig_pte, old_page);
+                           orig_pte, old_page, vmf2);
 }
 
 static void unmap_mapping_range_vma(struct vm_area_struct *vma,
@@ -2532,7 +2714,7 @@ EXPORT_SYMBOL(unmap_mapping_range);
  */
 static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
                unsigned long address, pte_t *page_table, pmd_t *pmd,
-               unsigned int flags, pte_t orig_pte)
+               unsigned int flags, pte_t orig_pte, struct vm_fault *vmf2)
 {
        spinlock_t *ptl;
        struct page *page, *swapcache;
@@ -2541,10 +2723,18 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
        pte_t pte;
        int locked;
        int exclusive = 0;
-       int ret = 0;
+       int ret;
 
-       if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
+       ret = pte_unmap_same(vmf2);
+       if (ret) {
+               /*
+                * If the PTEs are different, the page has already been
+                * handled by another CPU, so we return 0.
+                */
+               if (ret == VM_FAULT_PTNOTSAME)
+                       ret = 0;
                goto out;
+       }
 
        entry = pte_to_swp_entry(orig_pte);
        if (unlikely(non_swap_entry(entry))) {
@@ -2565,10 +2755,17 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
                                        GFP_HIGHUSER_MOVABLE, vma, address);
                if (!page) {
                        /*
-                        * Back out if somebody else faulted in this pte
-                        * while we released the pte lock.
-                        */
-                       page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
+                        * Back out if the VMA has changed behind our back
+                        * during a speculative page fault or if somebody else
+                        * faulted in this pte while we released the pte lock.
+                        */
+                       if (!pte_map_lock(vmf2)) {
+                               delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
+                               ret = VM_FAULT_RETRY;
+                               goto out;
+                       }
+                       page_table = vmf2->pte;
+                       ptl = vmf2->ptl;
                        if (likely(pte_same(*page_table, orig_pte)))
                                ret = VM_FAULT_OOM;
                        delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
@@ -2621,9 +2818,15 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
        }
 
        /*
-        * Back out if somebody else already faulted in this pte.
+        * Back out if the VMA has changed behind our back during a speculative
+        * page fault or if somebody else already faulted in this pte.
         */
-       page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
+       if (!pte_map_lock(vmf2)) {
+               ret = VM_FAULT_RETRY;
+               goto out_cancel_cgroup;
+       }
+       page_table = vmf2->pte;
+       ptl = vmf2->ptl;
        if (unlikely(!pte_same(*page_table, orig_pte)))
                goto out_nomap;
 
@@ -2644,9 +2847,9 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
 
        inc_mm_counter_fast(mm, MM_ANONPAGES);
        dec_mm_counter_fast(mm, MM_SWAPENTS);
-       pte = mk_pte(page, vma->vm_page_prot);
+       pte = mk_pte(page, vmf2->vma_page_prot);
        if ((flags & FAULT_FLAG_WRITE) && reuse_swap_page(page)) {
-               pte = maybe_mkwrite(pte_mkdirty(pte), vma);
+               pte = maybe_mkwrite(pte_mkdirty(pte), vmf2->vma_flags);
                flags &= ~FAULT_FLAG_WRITE;
                ret |= VM_FAULT_WRITE;
                exclusive = 1;
@@ -2659,14 +2862,14 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
                do_page_add_anon_rmap(page, vma, address, exclusive);
                mem_cgroup_commit_charge(page, memcg, true);
        } else { /* ksm created a completely new copy */
-               page_add_new_anon_rmap(page, vma, address);
+               __page_add_new_anon_rmap(page, vma, address);
                mem_cgroup_commit_charge(page, memcg, false);
-               lru_cache_add_active_or_unevictable(page, vma);
+               __lru_cache_add_active_or_unevictable(page, vmf2->vma_flags);
        }
 
        swap_free(entry);
        if ((PageSwapCache(page) && vm_swap_full(page_swap_info(page))) ||
-               (vma->vm_flags & VM_LOCKED) || PageMlocked(page))
+               (vmf2->vma_flags & VM_LOCKED) || PageMlocked(page))
                try_to_free_swap(page);
        unlock_page(page);
        if (page != swapcache) {
@@ -2683,7 +2886,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
        }
 
        if (flags & FAULT_FLAG_WRITE) {
-               ret |= do_wp_page(mm, vma, address, page_table, pmd, ptl, pte);
+               ret |= do_wp_page(mm, vma, address, page_table, pmd, ptl, pte, vmf2);
                if (ret & VM_FAULT_ERROR)
                        ret &= VM_FAULT_ERROR;
                goto out;
@@ -2696,8 +2899,9 @@ unlock:
 out:
        return ret;
 out_nomap:
-       mem_cgroup_cancel_charge(page, memcg);
        pte_unmap_unlock(page_table, ptl);
+out_cancel_cgroup:
+       mem_cgroup_cancel_charge(page, memcg);
 out_page:
        unlock_page(page);
 out_release:
@@ -2716,26 +2920,40 @@ out_release:
  */
 static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
                unsigned long address, pte_t *page_table, pmd_t *pmd,
-               unsigned int flags)
+               unsigned int flags, struct vm_fault *vmf2)
 {
        struct mem_cgroup *memcg;
        struct page *page;
        spinlock_t *ptl;
        pte_t entry;
+       int ret = 0;
 
        pte_unmap(page_table);
 
        /* File mapping without ->vm_ops ? */
-       if (vma->vm_flags & VM_SHARED)
+       if (vmf2->vma_flags & VM_SHARED)
                return VM_FAULT_SIGBUS;
 
        /* Use the zero-page for reads */
        if (!(flags & FAULT_FLAG_WRITE) && !mm_forbids_zeropage(mm)) {
                entry = pte_mkspecial(pfn_pte(my_zero_pfn(address),
-                                               vma->vm_page_prot));
-               page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
+                                               vmf2->vma_page_prot));
+               if (!pte_map_lock(vmf2))
+                       return VM_FAULT_RETRY;
+               page_table = vmf2->pte;
+               ptl = vmf2->ptl;
                if (!pte_none(*page_table))
                        goto unlock;
+
+               /*
+                * Don't call into userfaultfd during the speculative path.
+                * We already checked that the VMA is not managed through
+                * userfaultfd, but it may have been set up behind our back
+                * once we hold the pte lock. In such a case we can ignore it
+                * this time.
+                */
+               if (vmf2->flags & FAULT_FLAG_SPECULATIVE)
+                       goto setpte;
+
                /* Deliver the page fault to userland, check inside PT lock */
                if (userfaultfd_missing(vma)) {
                        pte_unmap_unlock(page_table, ptl);
@@ -2762,16 +2980,22 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
         */
        __SetPageUptodate(page);
 
-       entry = mk_pte(page, vma->vm_page_prot);
-       if (vma->vm_flags & VM_WRITE)
+       entry = mk_pte(page, vmf2->vma_page_prot);
+       if (vmf2->vma_flags & VM_WRITE)
                entry = pte_mkwrite(pte_mkdirty(entry));
 
-       page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
-       if (!pte_none(*page_table))
+       if (!pte_map_lock(vmf2)) {
+               ret = VM_FAULT_RETRY;
                goto release;
+       }
+       page_table = vmf2->pte;
+       ptl = vmf2->ptl;
+
+       if (!pte_none(*page_table))
+               goto unlock_and_release;
 
        /* Deliver the page fault to userland, check inside PT lock */
-       if (userfaultfd_missing(vma)) {
+       if (!(vmf2->flags & FAULT_FLAG_SPECULATIVE) && userfaultfd_missing(vma)) {
                pte_unmap_unlock(page_table, ptl);
                mem_cgroup_cancel_charge(page, memcg);
                page_cache_release(page);
@@ -2780,9 +3004,9 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
        }
 
        inc_mm_counter_fast(mm, MM_ANONPAGES);
-       page_add_new_anon_rmap(page, vma, address);
+       __page_add_new_anon_rmap(page, vma, address);
        mem_cgroup_commit_charge(page, memcg, false);
-       lru_cache_add_active_or_unevictable(page, vma);
+       __lru_cache_add_active_or_unevictable(page, vmf2->vma_flags);
 setpte:
        set_pte_at(mm, address, page_table, entry);
 
@@ -2790,11 +3014,13 @@ setpte:
        update_mmu_cache(vma, address, page_table);
 unlock:
        pte_unmap_unlock(page_table, ptl);
-       return 0;
+       return ret;
+unlock_and_release:
+       pte_unmap_unlock(page_table, ptl);
 release:
        mem_cgroup_cancel_charge(page, memcg);
        page_cache_release(page);
-       goto unlock;
+       return ret;
 oom_free_page:
        page_cache_release(page);
 oom:
@@ -2859,17 +3085,17 @@ static int __do_fault(struct vm_area_struct *vma, unsigned long address,
  * vm_ops->map_pages.
  */
 void do_set_pte(struct vm_area_struct *vma, unsigned long address,
-               struct page *page, pte_t *pte, bool write, bool anon)
+               struct page *page, pte_t *pte, bool write, bool anon,  struct vm_fault *vmf2)
 {
        pte_t entry;
 
        flush_icache_page(vma, page);
-       entry = mk_pte(page, vma->vm_page_prot);
+       entry = mk_pte(page, vmf2->vma_page_prot);
        if (write)
-               entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+               entry = maybe_mkwrite(pte_mkdirty(entry), vmf2->vma_flags);
        if (anon) {
                inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
-               page_add_new_anon_rmap(page, vma, address);
+               __page_add_new_anon_rmap(page, vma, address);
        } else {
                inc_mm_counter_fast(vma->vm_mm, MM_FILEPAGES);
                page_add_file_rmap(page);
@@ -2990,7 +3216,7 @@ static void do_fault_around(struct vm_area_struct *vma, unsigned long address,
 
 static int do_read_fault(struct mm_struct *mm, struct vm_area_struct *vma,
                unsigned long address, pmd_t *pmd,
-               pgoff_t pgoff, unsigned int flags, pte_t orig_pte)
+               pgoff_t pgoff, unsigned int flags, pte_t orig_pte, struct vm_fault *vmf2)
 {
        struct page *fault_page;
        spinlock_t *ptl;
@@ -3003,7 +3229,10 @@ static int do_read_fault(struct mm_struct *mm, struct vm_area_struct *vma,
         * something).
         */
        if (vma->vm_ops->map_pages && fault_around_bytes >> PAGE_SHIFT > 1) {
-               pte = pte_offset_map_lock(mm, pmd, address, &ptl);
+               if (!pte_map_lock(vmf2))
+                       return VM_FAULT_RETRY;
+               pte = vmf2->pte;
+               ptl = vmf2->ptl;
                do_fault_around(vma, address, pte, pgoff, flags);
                if (!pte_same(*pte, orig_pte))
                        goto unlock_out;
@@ -3014,14 +3243,17 @@ static int do_read_fault(struct mm_struct *mm, struct vm_area_struct *vma,
        if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
                return ret;
 
-       pte = pte_offset_map_lock(mm, pmd, address, &ptl);
+       if (!pte_map_lock(vmf2))
+               return VM_FAULT_RETRY;
+       pte = vmf2->pte;
+       ptl = vmf2->ptl;
        if (unlikely(!pte_same(*pte, orig_pte))) {
                pte_unmap_unlock(pte, ptl);
                unlock_page(fault_page);
                page_cache_release(fault_page);
                return ret;
        }
-       do_set_pte(vma, address, fault_page, pte, false, false);
+       do_set_pte(vma, address, fault_page, pte, false, false, vmf2);
        unlock_page(fault_page);
 unlock_out:
        pte_unmap_unlock(pte, ptl);
@@ -3030,7 +3262,7 @@ unlock_out:
 
 static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma,
                unsigned long address, pmd_t *pmd,
-               pgoff_t pgoff, unsigned int flags, pte_t orig_pte)
+               pgoff_t pgoff, unsigned int flags, pte_t orig_pte, struct vm_fault *vmf2)
 {
        struct page *fault_page, *new_page;
        struct mem_cgroup *memcg;
@@ -3058,7 +3290,10 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma,
                copy_user_highpage(new_page, fault_page, address, vma);
        __SetPageUptodate(new_page);
 
-       pte = pte_offset_map_lock(mm, pmd, address, &ptl);
+       if (!pte_map_lock(vmf2))
+               return VM_FAULT_RETRY;
+       pte = vmf2->pte;
+       ptl = vmf2->ptl;
        if (unlikely(!pte_same(*pte, orig_pte))) {
                pte_unmap_unlock(pte, ptl);
                if (fault_page) {
@@ -3073,9 +3308,9 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma,
                }
                goto uncharge_out;
        }
-       do_set_pte(vma, address, new_page, pte, true, true);
+       do_set_pte(vma, address, new_page, pte, true, true, vmf2);
        mem_cgroup_commit_charge(new_page, memcg, false);
-       lru_cache_add_active_or_unevictable(new_page, vma);
+       __lru_cache_add_active_or_unevictable(new_page, vmf2->vma_flags);
        pte_unmap_unlock(pte, ptl);
        if (fault_page) {
                unlock_page(fault_page);
@@ -3096,7 +3331,7 @@ uncharge_out:
 
 static int do_shared_fault(struct mm_struct *mm, struct vm_area_struct *vma,
                unsigned long address, pmd_t *pmd,
-               pgoff_t pgoff, unsigned int flags, pte_t orig_pte)
+               pgoff_t pgoff, unsigned int flags, pte_t orig_pte, struct vm_fault *vmf2)
 {
        struct page *fault_page;
        struct address_space *mapping;
@@ -3123,14 +3358,17 @@ static int do_shared_fault(struct mm_struct *mm, struct vm_area_struct *vma,
                }
        }
 
-       pte = pte_offset_map_lock(mm, pmd, address, &ptl);
+       if (!pte_map_lock(vmf2))
+               return VM_FAULT_RETRY;
+       pte = vmf2->pte;
+       ptl = vmf2->ptl;
        if (unlikely(!pte_same(*pte, orig_pte))) {
                pte_unmap_unlock(pte, ptl);
                unlock_page(fault_page);
                page_cache_release(fault_page);
                return ret;
        }
-       do_set_pte(vma, address, fault_page, pte, true, false);
+       do_set_pte(vma, address, fault_page, pte, true, false, vmf2);
        pte_unmap_unlock(pte, ptl);
 
        if (set_page_dirty(fault_page))
@@ -3165,7 +3403,7 @@ static int do_shared_fault(struct mm_struct *mm, struct vm_area_struct *vma,
  */
 static int do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
                unsigned long address, pte_t *page_table, pmd_t *pmd,
-               unsigned int flags, pte_t orig_pte)
+               unsigned int flags, pte_t orig_pte, struct vm_fault *vmf2)
 {
        pgoff_t pgoff = (((address & PAGE_MASK)
                        - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
@@ -3176,11 +3414,11 @@ static int do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
                return VM_FAULT_SIGBUS;
        if (!(flags & FAULT_FLAG_WRITE))
                return do_read_fault(mm, vma, address, pmd, pgoff, flags,
-                               orig_pte);
-       if (!(vma->vm_flags & VM_SHARED))
+                               orig_pte, vmf2);
+       if (!(vmf2->vma_flags & VM_SHARED))
                return do_cow_fault(mm, vma, address, pmd, pgoff, flags,
-                               orig_pte);
-       return do_shared_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
+                               orig_pte, vmf2);
+       return do_shared_fault(mm, vma, address, pmd, pgoff, flags, orig_pte, vmf2);
 }
 
 static int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
@@ -3199,7 +3437,7 @@ static int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
 }
 
 static int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
-                  unsigned long addr, pte_t pte, pte_t *ptep, pmd_t *pmd)
+                  unsigned long addr, pte_t pte, pte_t *ptep, pmd_t *pmd, struct vm_fault *vmf2)
 {
        struct page *page = NULL;
        spinlock_t *ptl;
@@ -3222,8 +3460,9 @@ static int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
        * page table entry is not accessible, so there would be no
        * concurrent hardware modifications to the PTE.
        */
-       ptl = pte_lockptr(mm, pmd);
-       spin_lock(ptl);
+       if (!pte_spinlock(vmf2))
+               return VM_FAULT_RETRY;
+       ptl = vmf2->ptl;
        if (unlikely(!pte_same(*ptep, pte))) {
                pte_unmap_unlock(ptep, ptl);
                goto out;
@@ -3237,7 +3476,7 @@ static int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
        set_pte_at(mm, addr, ptep, pte);
        update_mmu_cache(vma, addr, ptep);
 
-       page = vm_normal_page(vma, addr, pte);
+       page = __vm_normal_page(vma, addr, pte, vmf2->vma_flags);
        if (!page) {
                pte_unmap_unlock(ptep, ptl);
                return 0;
@@ -3323,7 +3562,8 @@ static int wp_huge_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
  */
 static int handle_pte_fault(struct mm_struct *mm,
                     struct vm_area_struct *vma, unsigned long address,
-                    pte_t *pte, pmd_t *pmd, unsigned int flags)
+                    pte_t *pte, pmd_t *pmd, unsigned int flags,
+                    struct vm_fault *vmf2)
 {
        pte_t entry;
        spinlock_t *ptl;
@@ -3342,26 +3582,30 @@ static int handle_pte_fault(struct mm_struct *mm,
                if (pte_none(entry)) {
                        if (vma_is_anonymous(vma))
                                return do_anonymous_page(mm, vma, address,
-                                                        pte, pmd, flags);
+                                                        pte, pmd, flags, vmf2);
+                       else if (vmf2->flags & FAULT_FLAG_SPECULATIVE)
+                               return VM_FAULT_RETRY;
                        else
                                return do_fault(mm, vma, address, pte, pmd,
-                                               flags, entry);
+                                               flags, entry, vmf2);
                }
                return do_swap_page(mm, vma, address,
-                                       pte, pmd, flags, entry);
+                                       pte, pmd, flags, entry, vmf2);
        }
 
        if (pte_protnone(entry))
-               return do_numa_page(mm, vma, address, entry, pte, pmd);
+               return do_numa_page(mm, vma, address, entry, pte, pmd, vmf2);
+
+       if (!pte_spinlock(vmf2))
+               return VM_FAULT_RETRY;
+       ptl = vmf2->ptl;
 
-       ptl = pte_lockptr(mm, pmd);
-       spin_lock(ptl);
        if (unlikely(!pte_same(*pte, entry)))
                goto unlock;
        if (flags & FAULT_FLAG_WRITE) {
                if (!pte_write(entry))
                        return do_wp_page(mm, vma, address,
-                                       pte, pmd, ptl, entry);
+                                       pte, pmd, ptl, entry, vmf2);
                entry = pte_mkdirty(entry);
        }
        entry = pte_mkyoung(entry);
@@ -3396,6 +3640,14 @@ static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
        pmd_t *pmd;
        pte_t *pte;
 
+       struct vm_fault vmf2 = {
+               .vma = vma,
+               .address = address & PAGE_MASK,
+               .flags = flags,
+               .vma_flags = vma->vm_flags,
+               .vma_page_prot = vma->vm_page_prot,
+       };
+
        if (unlikely(is_vm_hugetlb_page(vma)))
                return hugetlb_fault(mm, vma, address, flags);
 
@@ -3403,9 +3655,14 @@ static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
        pud = pud_alloc(mm, pgd, address);
        if (!pud)
                return VM_FAULT_OOM;
-       pmd = pmd_alloc(mm, pud, address);
+       vmf2.pmd = pmd = pmd_alloc(mm, pud, address);
        if (!pmd)
                return VM_FAULT_OOM;
+
+#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
+       vmf2.sequence = raw_read_seqcount(&vma->vm_sequence);
+#endif
+
        if (pmd_none(*pmd) && transparent_hugepage_enabled(vma)) {
                int ret = create_huge_pmd(mm, vma, address, pmd, flags);
                if (!(ret & VM_FAULT_FALLBACK))
@@ -3470,10 +3727,232 @@ static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
         * read mode and khugepaged takes it in write mode. So now it's
         * safe to run pte_offset_map().
         */
-       pte = pte_offset_map(pmd, address);
+       vmf2.pte = pte = pte_offset_map(pmd, address);
+       vmf2.orig_pte = *vmf2.pte;
+
+       return handle_pte_fault(mm, vma, address, pte, pmd, flags, &vmf2);
+}
+
+#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
+/*
+ * Tries to handle the page fault in a speculative way, without grabbing the
+ * mmap_sem.
+ */
+int __handle_speculative_fault(struct mm_struct *mm, unsigned long address,
+                              unsigned int flags)
+{
+       struct vm_fault vmf = {
+               .address = address,
+       };
+       pgd_t *pgd, pgdval;
+/*     p4d_t *p4d, p4dval; */
+       pud_t pudval;
+       int seq, ret = VM_FAULT_RETRY;
+       struct vm_area_struct *vma;
+#ifdef CONFIG_NUMA
+       struct mempolicy *pol;
+#endif
+
+       /* Clear flags that may lead to releasing the mmap_sem to retry */
+       flags &= ~(FAULT_FLAG_ALLOW_RETRY|FAULT_FLAG_KILLABLE);
+       flags |= FAULT_FLAG_SPECULATIVE;
+
+       vma = get_vma(mm, address);
+       if (!vma)
+               return ret;
+
+       seq = raw_read_seqcount(&vma->vm_sequence); /* rmb <-> seqlock, vma_rb_erase() */
+       if (seq & 1) {
+               trace_spf_vma_changed(_RET_IP_, vma, address);
+               goto out_put;
+       }
+
+       /*
+        * Can't call the vm_ops services as we don't know what they
+        * would do with the VMA.
+        * This includes huge pages from hugetlbfs.
+        */
+       if (vma->vm_ops) {
+               trace_spf_vma_notsup(_RET_IP_, vma, address);
+               goto out_put;
+       }
+
+       /*
+        * __anon_vma_prepare() requires the mmap_sem to be held
+        * because vm_next and vm_prev must be safe. This can't be guaranteed
+        * in the speculative path.
+        */
+       if (unlikely(!vma->anon_vma)) {
+               trace_spf_vma_notsup(_RET_IP_, vma, address);
+               goto out_put;
+       }
+
+       vmf.vma_flags = READ_ONCE(vma->vm_flags);
+       vmf.vma_page_prot = READ_ONCE(vma->vm_page_prot);
+
+       /* Can't call userland page fault handler in the speculative path */
+       if (unlikely(vmf.vma_flags & VM_UFFD_MISSING)) {
+               trace_spf_vma_notsup(_RET_IP_, vma, address);
+               goto out_put;
+       }
+
+       if (vmf.vma_flags & VM_GROWSDOWN || vmf.vma_flags & VM_GROWSUP) {
+               /*
+                * This could be detected by checking the address against the
+                * VMA's boundaries, but we want to trace it as not supported
+                * instead of changed.
+                */
+               trace_spf_vma_notsup(_RET_IP_, vma, address);
+               goto out_put;
+       }
+
+       if (address < READ_ONCE(vma->vm_start)
+           || READ_ONCE(vma->vm_end) <= address) {
+               trace_spf_vma_changed(_RET_IP_, vma, address);
+               goto out_put;
+       }
+/*
+       if (!arch_vma_access_permitted(vma, flags & FAULT_FLAG_WRITE,
+                                      flags & FAULT_FLAG_INSTRUCTION,
+                                      flags & FAULT_FLAG_REMOTE)) {
+               trace_spf_vma_access(_RET_IP_, vma, address);
+               ret = VM_FAULT_SIGSEGV;
+               goto out_put;
+       }
+*/
+       /* This check is required to ensure the VMA has write access set */
+       if (flags & FAULT_FLAG_WRITE) {
+               if (unlikely(!(vmf.vma_flags & VM_WRITE))) {
+                       trace_spf_vma_access(_RET_IP_, vma, address);
+                       ret = VM_FAULT_SIGSEGV;
+                       goto out_put;
+               }
+       } else if (unlikely(!(vmf.vma_flags & (VM_READ|VM_EXEC|VM_WRITE)))) {
+               trace_spf_vma_access(_RET_IP_, vma, address);
+               ret = VM_FAULT_SIGSEGV;
+               goto out_put;
+       }
+
+#ifdef CONFIG_NUMA
+       /*
+        * MPOL_INTERLEAVE implies additional checks in
+        * mpol_misplaced() which are not compatible with the
+        * speculative page fault processing.
+        */
+       pol = __get_vma_policy(vma, address);
+       if (!pol)
+               pol = get_task_policy(current);
+       if (pol && pol->mode == MPOL_INTERLEAVE) {
+               trace_spf_vma_notsup(_RET_IP_, vma, address);
+               goto out_put;
+       }
+#endif
 
-       return handle_pte_fault(mm, vma, address, pte, pmd, flags);
+       /*
+        * Do a speculative lookup of the PTE entry.
+        */
+       local_irq_disable();
+       pgd = pgd_offset(mm, address);
+       pgdval = READ_ONCE(*pgd);
+       if (pgd_none(pgdval) || unlikely(pgd_bad(pgdval)))
+               goto out_walk;
+/*
+       p4d = p4d_offset(pgd, address);
+       p4dval = READ_ONCE(*p4d);
+       if (p4d_none(p4dval) || unlikely(p4d_bad(p4dval)))
+               goto out_walk;
+*/
+       vmf.pud = pud_offset(pgd, address);
+       pudval = READ_ONCE(*vmf.pud);
+       if (pud_none(pudval) || unlikely(pud_bad(pudval)))
+               goto out_walk;
+
+       /* Huge pages at PUD level are not supported. */
+/*
+       if (unlikely(pud_trans_huge(pudval)))
+               goto out_walk;
+*/
+       vmf.pmd = pmd_offset(vmf.pud, address);
+       vmf.orig_pmd = READ_ONCE(*vmf.pmd);
+       /*
+        * pmd_none could mean that a hugepage collapse is in progress
+        * behind our back, as collapse_huge_page() marks it before
+        * invalidating the pte (which is done once the IPI is caught
+        * by all CPUs and we have interrupts disabled).
+        * For this reason we cannot handle THP in a speculative way since
+        * we can't safely identify an in-progress collapse operation done
+        * behind our back on that PMD.
+        * Regarding the order of the following checks, see comment in
+        * pmd_devmap_trans_unstable()
+        */
+       if (/* unlikely(pmd_devmap(vmf.orig_pmd) || */
+                    pmd_none(vmf.orig_pmd) || pmd_trans_huge(vmf.orig_pmd)  /* ||
+                    is_swap_pmd(vmf.orig_pmd))*/)
+               goto out_walk;
+
+       /*
+        * The above does not allocate/instantiate page-tables because doing so
+        * would lead to the possibility of instantiating page-tables after
+        * free_pgtables() -- and consequently leaking them.
+        *
+        * The result is that we take at least one !speculative fault per PMD
+        * in order to instantiate it.
+        */
+
+       vmf.pte = pte_offset_map(vmf.pmd, address);
+       vmf.orig_pte = READ_ONCE(*vmf.pte);
+       barrier(); /* See comment in handle_pte_fault() */
+       /*
+       if (pte_none(vmf.orig_pte)) {
+               pte_unmap(vmf.pte);
+               vmf.pte = NULL;
+       }
+       */
+       vmf.vma = vma;
+       vmf.pgoff = linear_page_index(vma, address);
+/*     vmf.gfp_mask = __get_fault_gfp_mask(vma); */
+       vmf.sequence = seq;
+       vmf.flags = flags;
+
+       local_irq_enable();
+
+       /*
+        * We need to re-validate the VMA after checking the bounds, otherwise
+        * we might have a false positive on the bounds.
+        */
+       if (read_seqcount_retry(&vma->vm_sequence, seq)) {
+               trace_spf_vma_changed(_RET_IP_, vma, address);
+               goto out_put;
+       }
+
+       mem_cgroup_oom_enable();
+       ret = handle_pte_fault(mm, vmf.vma, vmf.address, vmf.pte,
+                               vmf.pmd, vmf.flags, &vmf);
+       mem_cgroup_oom_disable();
+
+       put_vma(vma);
+
+       if (ret != VM_FAULT_RETRY)
+               count_vm_event(SPECULATIVE_PGFAULT);
+
+       /*
+        * The task may have entered a memcg OOM situation but
+        * if the allocation error was handled gracefully (no
+        * VM_FAULT_OOM), there is no need to kill anything.
+        * Just clean up the OOM state peacefully.
+        */
+       if (task_in_memcg_oom(current) && !(ret & VM_FAULT_OOM))
+               mem_cgroup_oom_synchronize(false);
+       return ret;
+
+out_walk:
+       trace_spf_vma_notsup(_RET_IP_, vma, address);
+       local_irq_enable();
+out_put:
+       put_vma(vma);
+       return ret;
 }
+#endif /* CONFIG_SPECULATIVE_PAGE_FAULT */
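The speculative paths above lean on pte_spinlock() and pte_map_lock(), which this commit adds earlier in mm/memory.c, outside this excerpt. A minimal sketch of the idea, assuming the vm_fault fields (vma, pmd, address, flags, sequence) initialised by the callers shown here: take the PTE lock without blocking, re-check the VMA's vm_sequence before trusting the mapping, and let the caller return VM_FAULT_RETRY otherwise. This is an illustration of the pattern, not the patch's literal helper.

    /*
     * Illustrative sketch only -- not the literal helper from this commit.
     * Returns true with vmf->pte mapped and vmf->ptl held; returns false so
     * the caller falls back to the classic, mmap_sem-protected fault path.
     */
    static bool pte_map_lock_sketch(struct vm_fault *vmf)
    {
            if (!(vmf->flags & FAULT_FLAG_SPECULATIVE)) {
                    /* Classic path: behaves like pte_offset_map_lock(). */
                    vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd,
                                                   vmf->address, &vmf->ptl);
                    return true;
            }

            /* Speculative path: never sleep, and re-validate the VMA. */
            vmf->ptl = pte_lockptr(vmf->vma->vm_mm, vmf->pmd);
            if (!spin_trylock(vmf->ptl))
                    return false;
            if (read_seqcount_retry(&vmf->vma->vm_sequence, vmf->sequence)) {
                    spin_unlock(vmf->ptl);
                    return false;
            }
            vmf->pte = pte_offset_map(vmf->pmd, vmf->address);
            return true;
    }

pte_spinlock() follows the same shape minus the pte_offset_map() step; every call site above treats a false return as "retry with the mmap_sem held".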
 
 /*
  * By the time we get here, we already hold the mm semaphore
index d8d8c2a..118b074 100644 (file)
@@ -241,7 +241,7 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma,
 
        /* Recheck VMA as permissions can change since migration started  */
        if (is_write_migration_entry(entry))
-               pte = maybe_mkwrite(pte, vma);
+               pte = maybe_mkwrite(pte, vma->vm_flags);
 
 #ifdef CONFIG_HUGETLB_PAGE
        if (PageHuge(new)) {
index 966dbdc..68ddb11 100644 (file)
@@ -423,7 +423,9 @@ static unsigned long __munlock_pagevec_fill(struct pagevec *pvec,
 void munlock_vma_pages_range(struct vm_area_struct *vma,
                             unsigned long start, unsigned long end)
 {
-       vma->vm_flags &= VM_LOCKED_CLEAR_MASK;
+       vm_write_begin(vma);
+       WRITE_ONCE(vma->vm_flags, vma->vm_flags & VM_LOCKED_CLEAR_MASK);
+       vm_write_end(vma);
 
        while (start < end) {
                struct page *page = NULL;
@@ -549,9 +551,11 @@ success:
         * set VM_LOCKED, populate_vma_page_range will bring it back.
         */
 
-       if (lock)
-               vma->vm_flags = newflags;
-       else
+       if (lock) {
+               vm_write_begin(vma);
+               WRITE_ONCE(vma->vm_flags, newflags);
+               vm_write_end(vma);
+       } else
                munlock_vma_pages_range(vma, start, end);
 
 out:
index 3bb666c..37871a7 100644 (file)
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -288,6 +288,27 @@ void unlink_file_vma(struct vm_area_struct *vma)
        }
 }
 
+static void __free_vma(struct vm_area_struct *vma)
+{
+       if (vma->vm_file)
+               fput(vma->vm_file);
+       mpol_put(vma_policy(vma));
+       kmem_cache_free(vm_area_cachep, vma);
+}
+
+#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
+void put_vma(struct vm_area_struct *vma)
+{
+       if (atomic_dec_and_test(&vma->vm_ref_count))
+               __free_vma(vma);
+}
+#else
+static inline void put_vma(struct vm_area_struct *vma)
+{
+       return __free_vma(vma);
+}
+#endif
+
 /*
  * Close a vm structure and free it, returning the next.
  */
@@ -298,10 +319,7 @@ static struct vm_area_struct *remove_vma(struct vm_area_struct *vma)
        might_sleep();
        if (vma->vm_ops && vma->vm_ops->close)
                vma->vm_ops->close(vma);
-       if (vma->vm_file)
-               fput(vma->vm_file);
-       mpol_put(vma_policy(vma));
-       kmem_cache_free(vm_area_cachep, vma);
+       put_vma(vma);
        return next;
 }
 
@@ -515,6 +533,14 @@ static void validate_mm(struct mm_struct *mm)
 #define validate_mm(mm) do { } while (0)
 #endif
 
+#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
+#define mm_rb_write_lock(mm)   write_lock(&(mm)->mm_rb_lock)
+#define mm_rb_write_unlock(mm) write_unlock(&(mm)->mm_rb_lock)
+#else
+#define mm_rb_write_lock(mm)   do { } while (0)
+#define mm_rb_write_unlock(mm) do { } while (0)
+#endif /* CONFIG_SPECULATIVE_PAGE_FAULT */
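These macros assume an mm_rb_lock rwlock that serialises rbtree writers against the lock-less readers in get_vma() below. The field itself comes from the include/linux/mm_types.h, mm/init-mm.c and kernel/fork.c parts of this commit, which are not shown here; roughly, the assumed additions look like this sketch.

    /* Hedged sketch of the fields assumed by this file; not the literal hunks. */
    struct mm_struct {
            struct rb_root mm_rb;
    #ifdef CONFIG_SPECULATIVE_PAGE_FAULT
            rwlock_t mm_rb_lock;    /* writers vs. get_vma() readers */
    #endif
            /* ... */
    };

    /* ...initialised wherever an mm is set up, e.g. in mm_init(): */
    #ifdef CONFIG_SPECULATIVE_PAGE_FAULT
            rwlock_init(&mm->mm_rb_lock);
    #endif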
+
 RB_DECLARE_CALLBACKS(static, vma_gap_callbacks, struct vm_area_struct, vm_rb,
                     unsigned long, rb_subtree_gap, vma_compute_subtree_gap)
 
@@ -533,16 +559,20 @@ static void vma_gap_update(struct vm_area_struct *vma)
 }
 
 static inline void vma_rb_insert(struct vm_area_struct *vma,
-                                struct rb_root *root)
+                                struct mm_struct *mm)
 {
+       struct rb_root *root = &mm->mm_rb;
+
        /* All rb_subtree_gap values must be consistent prior to insertion */
        validate_mm_rb(root, NULL);
 
        rb_insert_augmented(&vma->vm_rb, root, &vma_gap_callbacks);
 }
 
-static void vma_rb_erase(struct vm_area_struct *vma, struct rb_root *root)
+static void vma_rb_erase(struct vm_area_struct *vma, struct mm_struct *mm)
 {
+       struct rb_root *root = &mm->mm_rb;
+
        /*
         * All rb_subtree_gap values must be consistent prior to erase,
         * with the possible exception of the vma being erased.
@@ -554,7 +584,15 @@ static void vma_rb_erase(struct vm_area_struct *vma, struct rb_root *root)
         * so make sure we instantiate it only once with our desired
         * augmented rbtree callbacks.
         */
+       mm_rb_write_lock(mm);
        rb_erase_augmented(&vma->vm_rb, root, &vma_gap_callbacks);
+       mm_rb_write_unlock(mm); /* wmb */
+
+       /*
+        * Ensure the removal is complete before clearing the node.
+        * Matched by vma_has_changed()/handle_speculative_fault().
+        */
+       RB_CLEAR_NODE(&vma->vm_rb);
 }
 
 /*
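vma_has_changed(), referenced in the comment above, is defined elsewhere in this commit (include/linux/mm.h is in the changed-file list). A plausible sketch of the pairing, assuming the vm_fault fields used throughout this series: the cleared rbtree node marks a VMA that has been erased, and the sequence count catches any concurrent __vma_adjust().

    /* Illustrative sketch; the real helper ships in a hunk not shown here. */
    static inline bool vma_has_changed(struct vm_fault *vmf)
    {
            int ret = RB_EMPTY_NODE(&vmf->vma->vm_rb);
            unsigned int seq = READ_ONCE(vmf->vma->vm_sequence.sequence);

            /*
             * Pairs with the ordering in vma_rb_erase(): the cleared node
             * must be observed before the sequence count is trusted.
             */
            smp_rmb();

            return ret || seq != vmf->sequence;
    }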
@@ -669,10 +707,12 @@ void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma,
         * immediately update the gap to the correct value. Finally we
         * rebalance the rbtree after all augmented values have been set.
         */
+       mm_rb_write_lock(mm);
        rb_link_node(&vma->vm_rb, rb_parent, rb_link);
        vma->rb_subtree_gap = 0;
        vma_gap_update(vma);
-       vma_rb_insert(vma, &mm->mm_rb);
+       vma_rb_insert(vma, mm);
+       mm_rb_write_unlock(mm);
 }
 
 static void __vma_link_file(struct vm_area_struct *vma)
@@ -746,7 +786,7 @@ __vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma,
 {
        struct vm_area_struct *next;
 
-       vma_rb_erase(vma, &mm->mm_rb);
+       vma_rb_erase(vma, mm);
        prev->vm_next = next = vma->vm_next;
        if (next)
                next->vm_prev = prev;
@@ -762,8 +802,9 @@ __vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma,
  * are necessary.  The "insert" vma (if any) is to be inserted
  * before we drop the necessary locks.
  */
-int vma_adjust(struct vm_area_struct *vma, unsigned long start,
-       unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert)
+int __vma_adjust(struct vm_area_struct *vma, unsigned long start,
+       unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert,
+       bool keep_locked)
 {
        struct mm_struct *mm = vma->vm_mm;
        struct vm_area_struct *next = vma->vm_next;
@@ -776,6 +817,30 @@ int vma_adjust(struct vm_area_struct *vma, unsigned long start,
        long adjust_next = 0;
        int remove_next = 0;
 
+       /*
+        * Why use the vm_raw_write*() functions here? To avoid lockdep's
+        * warning.
+        *
+        * Lockdep is complaining about a theoretical lock dependency
+        * involving 3 locks:
+        *       mapping->i_mmap_rwsem --> vma->vm_sequence --> fs_reclaim
+        *
+        * Here are the major paths leading to this dependency:
+        *      1. __vma_adjust()           mmap_sem     -> vm_sequence -> i_mmap_rwsem
+        *      2. move_vma()               mmap_sem     -> vm_sequence -> fs_reclaim
+        *      3. __alloc_pages_nodemask() fs_reclaim   -> i_mmap_rwsem
+        *      4. unmap_mapping_range()    i_mmap_rwsem -> vm_sequence
+        *
+        * So there is no way to solve this easily, especially because in
+        * unmap_mapping_range() the i_mmap_rwsem is grabbed while the
+        * impacted VMAs are not yet known.
+        * However, the way vm_sequence is used guarantees that we will
+        * never block on it, since we just check its value and never wait
+        * for it to move, see vma_has_changed() and handle_speculative_fault().
+        */
+       vm_raw_write_begin(vma);
+       if (next)
+               vm_raw_write_begin(next);
+
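The lockdep discussion above only makes sense if vm_write_begin()/vm_write_end() and the vm_raw_write_*() variants are thin wrappers around the VMA's seqcount, with the raw forms skipping lockdep's acquire/release tracking. They are defined in include/linux/mm.h by this commit (not shown here); a hedged sketch:

    /* Sketch of the assumed wrappers around vma->vm_sequence. */
    static inline void vm_write_begin(struct vm_area_struct *vma)
    {
            write_seqcount_begin(&vma->vm_sequence);
    }

    static inline void vm_write_end(struct vm_area_struct *vma)
    {
            write_seqcount_end(&vma->vm_sequence);
    }

    /* Raw variants: same write-side bump, no lockdep dependency recorded. */
    static inline void vm_raw_write_begin(struct vm_area_struct *vma)
    {
            raw_write_seqcount_begin(&vma->vm_sequence);
    }

    static inline void vm_raw_write_end(struct vm_area_struct *vma)
    {
            raw_write_seqcount_end(&vma->vm_sequence);
    }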
        if (next && !insert) {
                struct vm_area_struct *exporter = NULL;
 
@@ -817,8 +882,12 @@ again:                     remove_next = 1 + (end > next->vm_end);
 
                        importer->anon_vma = exporter->anon_vma;
                        error = anon_vma_clone(importer, exporter);
-                       if (error)
+                       if (error) {
+                               if (next && next != vma)
+                                       vm_raw_write_end(next);
+                               vm_raw_write_end(vma);
                                return error;
+                       }
                }
        }
 
@@ -864,17 +933,18 @@ again:                    remove_next = 1 + (end > next->vm_end);
        }
 
        if (start != vma->vm_start) {
-               vma->vm_start = start;
+               WRITE_ONCE(vma->vm_start, start);
                start_changed = true;
        }
        if (end != vma->vm_end) {
-               vma->vm_end = end;
+               WRITE_ONCE(vma->vm_end, end);
                end_changed = true;
        }
-       vma->vm_pgoff = pgoff;
+       WRITE_ONCE(vma->vm_pgoff, pgoff);
        if (adjust_next) {
-               next->vm_start += adjust_next << PAGE_SHIFT;
-               next->vm_pgoff += adjust_next;
+               WRITE_ONCE(next->vm_start,
+                          next->vm_start + (adjust_next << PAGE_SHIFT));
+               WRITE_ONCE(next->vm_pgoff, next->vm_pgoff + adjust_next);
        }
 
        if (root) {
@@ -929,19 +999,21 @@ again:                    remove_next = 1 + (end > next->vm_end);
        if (remove_next) {
                if (file) {
                        uprobe_munmap(next, next->vm_start, next->vm_end);
-                       fput(file);
                }
                if (next->anon_vma)
                        anon_vma_merge(vma, next);
                mm->map_count--;
-               mpol_put(vma_policy(next));
-               kmem_cache_free(vm_area_cachep, next);
+               vm_raw_write_end(next);
+               put_vma(next);
                /*
                 * In mprotect's case 6 (see comments on vma_merge),
                 * we must remove another next too. It would clutter
                 * up the code too much to do both in one go.
                 */
                next = vma->vm_next;
+               if (next)
+                       vm_raw_write_begin(next);
+
                if (remove_next == 2)
                        goto again;
                else if (next)
@@ -952,6 +1024,12 @@ again:                    remove_next = 1 + (end > next->vm_end);
        if (insert && file)
                uprobe_mmap(insert);
 
+       if (next && next != vma)
+               vm_raw_write_end(next);
+
+       if (!keep_locked)
+               vm_raw_write_end(vma);
+
        validate_mm(mm);
 
        return 0;
@@ -1080,13 +1158,13 @@ can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags,
  * Odd one out? Case 8, because it extends NNNN but needs flags of XXXX:
  * mprotect_fixup updates vm_flags & vm_page_prot on successful return.
  */
-struct vm_area_struct *vma_merge(struct mm_struct *mm,
+struct vm_area_struct *__vma_merge(struct mm_struct *mm,
                        struct vm_area_struct *prev, unsigned long addr,
                        unsigned long end, unsigned long vm_flags,
                        struct anon_vma *anon_vma, struct file *file,
                        pgoff_t pgoff, struct mempolicy *policy,
                        struct vm_userfaultfd_ctx vm_userfaultfd_ctx,
-                       const char __user *anon_name)
+                       const char __user *anon_name, bool keep_locked)
 {
        pgoff_t pglen = (end - addr) >> PAGE_SHIFT;
        struct vm_area_struct *area, *next;
@@ -1129,11 +1207,11 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
                                is_mergeable_anon_vma(prev->anon_vma,
                                                      next->anon_vma, NULL)) {
                                                        /* cases 1, 6 */
-                       err = vma_adjust(prev, prev->vm_start,
-                               next->vm_end, prev->vm_pgoff, NULL);
+                       err = __vma_adjust(prev, prev->vm_start,
+                               next->vm_end, prev->vm_pgoff, NULL, keep_locked);
                } else                                  /* cases 2, 5, 7 */
-                       err = vma_adjust(prev, prev->vm_start,
-                               end, prev->vm_pgoff, NULL);
+                       err = __vma_adjust(prev, prev->vm_start,
+                               end, prev->vm_pgoff, NULL, keep_locked);
                if (err)
                        return NULL;
                khugepaged_enter_vma_merge(prev, vm_flags);
@@ -1150,11 +1228,11 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
                                             vm_userfaultfd_ctx,
                                             anon_name)) {
                if (prev && addr < prev->vm_end)        /* case 4 */
-                       err = vma_adjust(prev, prev->vm_start,
-                               addr, prev->vm_pgoff, NULL);
+                       err = __vma_adjust(prev, prev->vm_start,
+                               addr, prev->vm_pgoff, NULL, keep_locked);
                else                                    /* cases 3, 8 */
-                       err = vma_adjust(area, addr, next->vm_end,
-                               next->vm_pgoff - pglen, NULL);
+                       err = __vma_adjust(area, addr, next->vm_end,
+                               next->vm_pgoff - pglen, NULL, keep_locked);
                if (err)
                        return NULL;
                khugepaged_enter_vma_merge(area, vm_flags);
@@ -1690,7 +1768,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
        vma->vm_flags = vm_flags;
        vma->vm_page_prot = vm_get_page_prot(vm_flags);
        vma->vm_pgoff = pgoff;
-       INIT_LIST_HEAD(&vma->anon_vma_chain);
+       INIT_VMA(vma);
 
        if (file) {
                if (vm_flags & VM_DENYWRITE) {
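INIT_VMA() replaces the bare INIT_LIST_HEAD(&vma->anon_vma_chain) in every VMA constructor touched by this commit. Its definition lands in include/linux/mm.h (listed in the commit, not shown in this excerpt); presumably it also initialises the per-VMA sequence count and reference count the speculative handler depends on, along these lines:

    /* Hedged sketch of the assumed helper; not the literal include/linux/mm.h hunk. */
    #ifdef CONFIG_SPECULATIVE_PAGE_FAULT
    #define INIT_VMA(vma)                                           \
            do {                                                    \
                    INIT_LIST_HEAD(&(vma)->anon_vma_chain);         \
                    seqcount_init(&(vma)->vm_sequence);             \
                    atomic_set(&(vma)->vm_ref_count, 1);            \
            } while (0)
    #else
    #define INIT_VMA(vma)   INIT_LIST_HEAD(&(vma)->anon_vma_chain)
    #endif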
@@ -1743,13 +1821,15 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
 out:
        perf_event_mmap(vma);
 
+       vm_write_begin(vma);
        vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT);
        if (vm_flags & VM_LOCKED) {
                if (!((vm_flags & VM_SPECIAL) || is_vm_hugetlb_page(vma) ||
                                        vma == get_gate_vma(current->mm)))
                        mm->locked_vm += (len >> PAGE_SHIFT);
                else
-                       vma->vm_flags &= VM_LOCKED_CLEAR_MASK;
+                       WRITE_ONCE(vma->vm_flags,
+                               vma->vm_flags & VM_LOCKED_CLEAR_MASK);
        }
 
        if (file)
@@ -1762,9 +1842,10 @@ out:
         * then new mapped in-place (which must be aimed as
         * a completely new data area).
         */
-       vma->vm_flags |= VM_SOFTDIRTY;
+       WRITE_ONCE(vma->vm_flags, vma->vm_flags | VM_SOFTDIRTY);
 
        vma_set_page_prot(vma);
+       vm_write_end(vma);
 
        return addr;
 
@@ -2126,15 +2207,10 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
 EXPORT_SYMBOL(get_unmapped_area);
 
 /* Look up the first VMA which satisfies  addr < vm_end,  NULL if none. */
-struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
+struct vm_area_struct *__find_vma(struct mm_struct *mm, unsigned long addr)
 {
        struct rb_node *rb_node;
-       struct vm_area_struct *vma;
-
-       /* Check the cache first. */
-       vma = vmacache_find(mm, addr);
-       if (likely(vma))
-               return vma;
+       struct vm_area_struct *vma = NULL;
 
        rb_node = mm->mm_rb.rb_node;
 
@@ -2152,13 +2228,41 @@ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
                        rb_node = rb_node->rb_right;
        }
 
+       return vma;
+}
+
+struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
+{
+       struct vm_area_struct *vma;
+
+       /* Check the cache first. */
+       vma = vmacache_find(mm, addr);
+       if (likely(vma))
+               return vma;
+
+       vma = __find_vma(mm, addr);
+
        if (vma)
                vmacache_update(addr, vma);
        return vma;
 }
-
 EXPORT_SYMBOL(find_vma);
 
+#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
+struct vm_area_struct *get_vma(struct mm_struct *mm, unsigned long addr)
+{
+       struct vm_area_struct *vma = NULL;
+
+       read_lock(&mm->mm_rb_lock);
+       vma = __find_vma(mm, addr);
+       if (vma)
+               atomic_inc(&vma->vm_ref_count);
+       read_unlock(&mm->mm_rb_lock);
+
+       return vma;
+}
+#endif
+
 /*
  * Same as find_vma, but also return a pointer to the previous VMA in *pprev.
  */
@@ -2389,8 +2493,8 @@ int expand_downwards(struct vm_area_struct *vma,
                                vm_stat_account(mm, vma->vm_flags,
                                                vma->vm_file, grow);
                                anon_vma_interval_tree_pre_update_vma(vma);
-                               vma->vm_start = address;
-                               vma->vm_pgoff -= grow;
+                               WRITE_ONCE(vma->vm_start, address);
+                               WRITE_ONCE(vma->vm_pgoff, vma->vm_pgoff - grow);
                                anon_vma_interval_tree_post_update_vma(vma);
                                vma_gap_update(vma);
                                spin_unlock(&mm->page_table_lock);
@@ -2536,7 +2640,7 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,
        insertion_point = (prev ? &prev->vm_next : &mm->mmap);
        vma->vm_prev = NULL;
        do {
-               vma_rb_erase(vma, &mm->mm_rb);
+               vma_rb_erase(vma, mm);
                mm->map_count--;
                tail_vma = vma;
                vma = vma->vm_next;
@@ -2574,7 +2678,7 @@ static int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
        /* most fields are the same, copy all, and then fixup */
        *new = *vma;
 
-       INIT_LIST_HEAD(&new->anon_vma_chain);
+       INIT_VMA(new);
 
        if (new_below)
                new->vm_end = addr;
@@ -2908,7 +3012,7 @@ static unsigned long do_brk(unsigned long addr, unsigned long len)
                return -ENOMEM;
        }
 
-       INIT_LIST_HEAD(&vma->anon_vma_chain);
+       INIT_VMA(vma);
        vma->vm_mm = mm;
        vma->vm_start = addr;
        vma->vm_end = addr + len;
@@ -3058,9 +3162,21 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
 
        if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent))
                return NULL;    /* should never get here */
-       new_vma = vma_merge(mm, prev, addr, addr + len, vma->vm_flags,
+
+       /* There are 3 cases to manage here:
+        *     AAAA            AAAA              AAAA              AAAA
+        * PPPP....      PPPP......NNNN      PPPP....NNNN      PP........NN
+        * PPPPPPPP(A)   PPPP..NNNNNNNN(B)   PPPPPPPPPPPP(1)       NULL
+        *                                   PPPPPPPPNNNN(2)
+        *                                   PPPPNNNNNNNN(3)
+        *
+        * new_vma == prev in case A,1,2
+        * new_vma == next in case B,3
+        */
+       new_vma = __vma_merge(mm, prev, addr, addr + len, vma->vm_flags,
                            vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma),
-                           vma->vm_userfaultfd_ctx, vma_get_anon_name(vma));
+                           vma->vm_userfaultfd_ctx, vma_get_anon_name(vma), true);
+
        if (new_vma) {
                /*
                 * Source vma may have been merged into new_vma
@@ -3093,13 +3209,24 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
                new_vma->vm_pgoff = pgoff;
                if (vma_dup_policy(vma, new_vma))
                        goto out_free_vma;
-               INIT_LIST_HEAD(&new_vma->anon_vma_chain);
+               INIT_VMA(new_vma);
                if (anon_vma_clone(new_vma, vma))
                        goto out_free_mempol;
                if (new_vma->vm_file)
                        get_file(new_vma->vm_file);
                if (new_vma->vm_ops && new_vma->vm_ops->open)
                        new_vma->vm_ops->open(new_vma);
+
+               /*
+                * As the VMA is linked right now, it may be hit by the
+                * speculative page fault handler. But we don't want it to
+                * start mapping pages in this area until the caller has
+                * potentially moved the ptes from the moved VMA. To prevent
+                * that we protect it right now, and let the caller unprotect
+                * it once the move is done.
+                */
+               vm_raw_write_begin(new_vma);
+
                vma_link(mm, new_vma, prev, rb_link, rb_parent);
                *need_rmap_locks = false;
        }
@@ -3193,7 +3320,7 @@ static struct vm_area_struct *__install_special_mapping(
        if (unlikely(vma == NULL))
                return ERR_PTR(-ENOMEM);
 
-       INIT_LIST_HEAD(&vma->anon_vma_chain);
+       INIT_VMA(vma);
        vma->vm_mm = mm;
        vma->vm_start = addr;
        vma->vm_end = addr + len;
index fcd678c..39260d0 100644 (file)
@@ -368,12 +368,14 @@ success:
         * vm_flags and vm_page_prot are protected by the mmap_sem
         * held in write mode.
         */
-       vma->vm_flags = newflags;
+       vm_write_begin(vma);
+       WRITE_ONCE(vma->vm_flags, newflags);
        dirty_accountable = vma_wants_writenotify(vma);
        vma_set_page_prot(vma);
 
        change_protection(vma, start, end, vma->vm_page_prot,
                          dirty_accountable, 0);
+       vm_write_end(vma);
 
        /*
         * Private VM_LOCKED VMA becoming writable: trigger COW to avoid major
index 450b306..a0ab573 100644 (file)
@@ -285,6 +285,14 @@ static unsigned long move_vma(struct vm_area_struct *vma,
        if (!new_vma)
                return -ENOMEM;
 
+       /* new_vma is returned protected by copy_vma, to prevent a speculative
+        * page fault from being handled in the destination area before we move
+        * the ptes. Now, we must also protect the source VMA since we don't
+        * want pages to be mapped behind our back while we are copying the PTEs.
+        */
+       if (vma != new_vma)
+               vm_raw_write_begin(vma);
+
        moved_len = move_page_tables(vma, old_addr, new_vma, new_addr, old_len,
                                     need_rmap_locks);
        if (moved_len < old_len) {
@@ -301,6 +309,9 @@ static unsigned long move_vma(struct vm_area_struct *vma,
                 */
                move_page_tables(new_vma, new_addr, vma, old_addr, moved_len,
                                 true);
+               if (vma != new_vma)
+                       vm_raw_write_end(vma);
+
                vma = new_vma;
                old_len = new_len;
                old_addr = new_addr;
@@ -308,7 +319,11 @@ static unsigned long move_vma(struct vm_area_struct *vma,
        } else {
                arch_remap(mm, old_addr, old_addr + old_len,
                           new_addr, new_addr + new_len);
+
+               if (vma != new_vma)
+                       vm_raw_write_end(vma);
        }
+       vm_raw_write_end(new_vma);
 
        /* Conceal VM_ACCOUNT so old reservation is not undone */
        if (vm_flags & VM_ACCOUNT) {
index ca1884c..e86cc3a 100644 (file)
@@ -1270,7 +1270,7 @@ unsigned long do_mmap(struct file *file,
        region->vm_flags = vm_flags;
        region->vm_pgoff = pgoff;
 
-       INIT_LIST_HEAD(&vma->anon_vma_chain);
+       INIT_VMA(vma);
        vma->vm_flags = vm_flags;
        vma->vm_pgoff = pgoff;
 
index 59e1c26..81f201d 100644 (file)
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1208,10 +1208,9 @@ void do_page_add_anon_rmap(struct page *page,
  * This means the inc-and-test can be bypassed.
  * Page does not have to be locked.
  */
-void page_add_new_anon_rmap(struct page *page,
+void __page_add_new_anon_rmap(struct page *page,
        struct vm_area_struct *vma, unsigned long address)
 {
-       VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma);
        SetPageSwapBacked(page);
        atomic_set(&page->_mapcount, 0); /* increment count (starts at -1) */
        if (PageTransHuge(page))
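The rename drops the VM_BUG_ON_VMA() bounds check from the shared helper, since on the speculative path vma->vm_start/vm_end may be changing under us. Callers that do hold the mmap_sem presumably keep the old name as a checking wrapper (the include/linux/rmap.h hunk is in this commit's file list but not shown), roughly:

    /* Hedged sketch of the assumed wrapper for mmap_sem-protected callers. */
    static inline void page_add_new_anon_rmap(struct page *page,
                    struct vm_area_struct *vma, unsigned long address)
    {
            VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma);
            __page_add_new_anon_rmap(page, vma, address);
    }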
index ee61c48..d414381 100644 (file)
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -708,12 +708,12 @@ void add_page_to_unevictable_list(struct page *page)
  * directly back onto it's zone's unevictable list, it does NOT use a
  * per cpu pagevec.
  */
-void lru_cache_add_active_or_unevictable(struct page *page,
-                                        struct vm_area_struct *vma)
+void __lru_cache_add_active_or_unevictable(struct page *page,
+                                        unsigned long vma_flags)
 {
        VM_BUG_ON_PAGE(PageLRU(page), page);
 
-       if (likely((vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) != VM_LOCKED)) {
+       if (likely((vma_flags & (VM_LOCKED | VM_SPECIAL)) != VM_LOCKED)) {
                SetPageActive(page);
                lru_cache_add(page);
                return;
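Same pattern as the rmap change above: the helper now takes a snapshot of the VMA flags so the speculative path can pass the vmf->vma_flags it sampled, and the original signature is presumably kept as a wrapper in include/linux/swap.h (listed in this commit, not shown):

    /* Hedged sketch of the assumed include/linux/swap.h wrapper. */
    static inline void lru_cache_add_active_or_unevictable(struct page *page,
                                            struct vm_area_struct *vma)
    {
            __lru_cache_add_active_or_unevictable(page, vma->vm_flags);
    }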
index aa3015e..3c55085 100644 (file)
@@ -871,6 +871,9 @@ const char * const vmstat_text[] = {
        "vmacache_find_calls",
        "vmacache_find_hits",
 #endif
+#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
+       "speculative_pgfault",
+#endif
 #endif /* CONFIG_VM_EVENTS_COUNTERS */
 };
 #endif /* CONFIG_PROC_FS || CONFIG_SYSFS || CONFIG_NUMA */