#include "internal.h"
+#define CREATE_TRACE_POINTS
+#include <trace/events/pagefault.h>
+
#if defined(LAST_CPUPID_NOT_IN_PAGE_FLAGS) && !defined(CONFIG_COMPILE_TEST)
#warning Unfortunate NUMA and NUMA Balancing config, growing page-frame for last_cpupid.
#endif
* Hide vma from rmap and truncate_pagecache before freeing
* pgtables
*/
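+ /*
+ * Bump the VMA sequence count around unlink_anon_vmas() so that a
+ * racing speculative page fault sees the VMA as changed and backs off.
+ */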
+ vm_write_begin(vma);
unlink_anon_vmas(vma);
+ vm_write_end(vma);
unlink_file_vma(vma);
if (is_vm_hugetlb_page(vma)) {
&& !is_vm_hugetlb_page(next)) {
vma = next;
next = vma->vm_next;
+ vm_write_begin(vma);
unlink_anon_vmas(vma);
+ vm_write_end(vma);
unlink_file_vma(vma);
}
free_pgd_range(tlb, addr, vma->vm_end,
dump_page(page, "bad pte");
printk(KERN_ALERT
"addr:%p vm_flags:%08lx anon_vma:%p mapping:%p index:%lx\n",
- (void *)addr, vma->vm_flags, vma->anon_vma, mapping, index);
+ (void *)addr, READ_ONCE(vma->vm_flags), vma->anon_vma, mapping, index);
/*
* Choose text because data symbols depend on CONFIG_KALLSYMS_ALL=y
*/
}
/*
- * vm_normal_page -- This function gets the "struct page" associated with a pte.
+ * __vm_normal_page -- This function gets the "struct page" associated with
+ * a pte.
*
* "Special" mappings do not wish to be associated with a "struct page" (either
* it doesn't exist, or it exists but they don't want to touch it). In this
#else
# define HAVE_PTE_SPECIAL 0
#endif
-struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
- pte_t pte)
+struct page *__vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
+ pte_t pte, unsigned long vma_flags)
{
unsigned long pfn = pte_pfn(pte);
goto check_pfn;
if (vma->vm_ops && vma->vm_ops->find_special_page)
return vma->vm_ops->find_special_page(vma, addr);
- if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))
+ if (vma_flags & (VM_PFNMAP | VM_MIXEDMAP))
return NULL;
if (!is_zero_pfn(pfn))
print_bad_pte(vma, addr, pte, NULL);
/* !HAVE_PTE_SPECIAL case follows: */
- if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
- if (vma->vm_flags & VM_MIXEDMAP) {
+ if (unlikely(vma_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
+ if (vma_flags & VM_MIXEDMAP) {
if (!pfn_valid(pfn))
return NULL;
goto out;
off = (addr - vma->vm_start) >> PAGE_SHIFT;
if (pfn == vma->vm_pgoff + off)
return NULL;
- if (!is_cow_mapping(vma->vm_flags))
+ if (!is_cow_mapping(vma_flags))
return NULL;
}
}
spinlock_t *src_ptl, *dst_ptl;
int progress = 0;
int rss[NR_MM_COUNTERS];
+ unsigned long orig_addr = addr;
swp_entry_t entry = (swp_entry_t){0};
again:
progress += 8;
} while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end);
+ /*
+ * Prevent the page fault handler from copying the page while stale
+ * TLB entries have not yet been flushed.
+ */
+ if (IS_ENABLED(CONFIG_SPECULATIVE_PAGE_FAULT) &&
+ is_cow_mapping(vma->vm_flags))
+ flush_tlb_range(vma, orig_addr, end);
+
arch_leave_lazy_mmu_mode();
spin_unlock(src_ptl);
pte_unmap(orig_src_pte);
details = NULL;
BUG_ON(addr >= end);
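+ /*
+ * Bump the VMA sequence count for the duration of the unmap so that
+ * a racing speculative page fault notices the change and retries.
+ */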
+ vm_write_begin(vma);
tlb_start_vma(tlb, vma);
pgd = pgd_offset(vma->vm_mm, addr);
do {
next = zap_pud_range(tlb, vma, pgd, addr, next, details);
} while (pgd++, addr = next, addr != end);
tlb_end_vma(tlb, vma);
+ vm_write_end(vma);
}
}
EXPORT_SYMBOL_GPL(apply_to_page_range);
+#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
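+/*
+ * pte_spinlock() and pte_map_lock() return false only on the speculative
+ * path, when the VMA or the PMD has changed under us; callers then bail out
+ * with VM_FAULT_RETRY so the fault is retried while holding the mmap_sem.
+ */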
+static bool pte_spinlock(struct vm_fault *vmf)
+{
+ bool ret = false;
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ pmd_t pmdval;
+#endif
+
+ /* Check if vma is still valid */
+ if (!(vmf->flags & FAULT_FLAG_SPECULATIVE)) {
+ vmf->ptl = pte_lockptr(vmf->vma->vm_mm, vmf->pmd);
+ spin_lock(vmf->ptl);
+ return true;
+ }
+
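+ /*
+ * Speculative path: interrupts are disabled so that the page tables
+ * cannot be freed under us while we check the VMA and take the PTL
+ * (see the longer explanation in pte_map_lock()).
+ */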
+again:
+ local_irq_disable();
+ if (vma_has_changed(vmf)) {
+ trace_spf_vma_changed(_RET_IP_, vmf->vma, vmf->address);
+ goto out;
+ }
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ /*
+ * We check if the pmd value is still the same to ensure that there
+ * is not a huge collapse operation in progress behind our back.
+ */
+ pmdval = READ_ONCE(*vmf->pmd);
+ if (!pmd_same(pmdval, vmf->orig_pmd)) {
+ trace_spf_pmd_changed(_RET_IP_, vmf->vma, vmf->address);
+ goto out;
+ }
+#endif
+
+ vmf->ptl = pte_lockptr(vmf->vma->vm_mm, vmf->pmd);
+ if (unlikely(!spin_trylock(vmf->ptl))) {
+ local_irq_enable();
+ goto again;
+ }
+
+ if (vma_has_changed(vmf)) {
+ spin_unlock(vmf->ptl);
+ trace_spf_vma_changed(_RET_IP_, vmf->vma, vmf->address);
+ goto out;
+ }
+
+ ret = true;
+out:
+ local_irq_enable();
+ return ret;
+}
+
+static bool pte_map_lock(struct vm_fault *vmf)
+{
+ bool ret = false;
+ pte_t *pte;
+ spinlock_t *ptl;
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ pmd_t pmdval;
+#endif
+
+ if (!(vmf->flags & FAULT_FLAG_SPECULATIVE)) {
+ vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd,
+ vmf->address, &vmf->ptl);
+ return true;
+ }
+
+ /*
+ * The first vma_has_changed() guarantees the page-tables are still
+ * valid, having IRQs disabled ensures they stay around, hence the
+ * second vma_has_changed() to make sure they are still valid once
+ * we've got the lock. After that a concurrent zap_pte_range() will
+ * block on the PTL and thus we're safe.
+ */
+again:
+ local_irq_disable();
+ if (vma_has_changed(vmf)) {
+ trace_spf_vma_changed(_RET_IP_, vmf->vma, vmf->address);
+ goto out;
+ }
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ /*
+ * We check if the pmd value is still the same to ensure that there
+ * is not a huge collapse operation in progress behind our back.
+ */
+ pmdval = READ_ONCE(*vmf->pmd);
+ if (!pmd_same(pmdval, vmf->orig_pmd)) {
+ trace_spf_pmd_changed(_RET_IP_, vmf->vma, vmf->address);
+ goto out;
+ }
+#endif
+
+ /*
+ * Same as pte_offset_map_lock() except that we call
+ * spin_trylock() in place of spin_lock() to avoid a race with the
+ * unmap path, which may hold the lock while waiting for this CPU
+ * to invalidate its TLB although this CPU has interrupts disabled.
+ * Since we are on the speculative path, accept that it may fail.
+ */
+ ptl = pte_lockptr(vmf->vma->vm_mm, vmf->pmd);
+ pte = pte_offset_map(vmf->pmd, vmf->address);
+ if (unlikely(!spin_trylock(ptl))) {
+ pte_unmap(pte);
+ local_irq_enable();
+ goto again;
+ }
+
+ if (vma_has_changed(vmf)) {
+ pte_unmap_unlock(pte, ptl);
+ trace_spf_vma_changed(_RET_IP_, vmf->vma, vmf->address);
+ goto out;
+ }
+
+ vmf->pte = pte;
+ vmf->ptl = ptl;
+ ret = true;
+out:
+ local_irq_enable();
+ return ret;
+}
+#else
+static inline bool pte_spinlock(struct vm_fault *vmf2)
+{
+ vmf2->ptl = pte_lockptr(vmf2->vma->vm_mm, vmf2->pmd);
+ spin_lock(vmf2->ptl);
+ return true;
+}
+
+static inline bool pte_map_lock(struct vm_fault *vmf2)
+{
+ vmf2->pte = pte_offset_map_lock(vmf2->vma->vm_mm, vmf2->pmd,
+ vmf2->address, &vmf2->ptl);
+ return true;
+}
+#endif
+
/*
* handle_pte_fault chooses page fault handler according to an entry which was
* read non-atomically. Before making any commitment, on those architectures
* parts, do_swap_page must check under lock before unmapping the pte and
* proceeding (but do_wp_page is only called after already making such a check;
* and do_anonymous_page can safely check later on).
+ *
+ * pte_unmap_same() returns:
+ * 0 if the PTEs are the same
+ * VM_FAULT_PTNOTSAME if the PTEs are different
+ * VM_FAULT_RETRY if the VMA has changed behind our back during
+ * speculative page fault handling.
*/
-static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd,
- pte_t *page_table, pte_t orig_pte)
+static inline int pte_unmap_same(struct vm_fault *vmf2)
{
- int same = 1;
+ int ret = 0;
#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT)
if (sizeof(pte_t) > sizeof(unsigned long)) {
- spinlock_t *ptl = pte_lockptr(mm, pmd);
- spin_lock(ptl);
- same = pte_same(*page_table, orig_pte);
- spin_unlock(ptl);
+ if (pte_spinlock(vmf2)) {
+ if (!pte_same(*vmf2->pte, vmf2->orig_pte))
+ ret = VM_FAULT_PTNOTSAME;
+ spin_unlock(vmf2->ptl);
+ } else
+ ret = VM_FAULT_RETRY;
}
#endif
- pte_unmap(page_table);
- return same;
+ pte_unmap(vmf2->pte);
+ return ret;
}
static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va, struct vm_area_struct *vma)
struct vm_area_struct *vma, unsigned long address,
pte_t *page_table, spinlock_t *ptl, pte_t orig_pte,
struct page *page, int page_mkwrite,
- int dirty_shared)
+ int dirty_shared, struct vm_fault *vmf2)
__releases(ptl)
{
pte_t entry;
flush_cache_page(vma, address, pte_pfn(orig_pte));
entry = pte_mkyoung(orig_pte);
- entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+ entry = maybe_mkwrite(pte_mkdirty(entry), vmf2->vma_flags);
if (ptep_set_access_flags(vma, address, page_table, entry, 1))
update_mmu_cache(vma, address, page_table);
pte_unmap_unlock(page_table, ptl);
*/
static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, pte_t *page_table, pmd_t *pmd,
- pte_t orig_pte, struct page *old_page)
+ pte_t orig_pte, struct page *old_page, struct vm_fault *vmf2)
{
struct page *new_page = NULL;
spinlock_t *ptl = NULL;
const unsigned long mmun_start = address & PAGE_MASK; /* For mmu_notifiers */
const unsigned long mmun_end = mmun_start + PAGE_SIZE; /* For mmu_notifiers */
struct mem_cgroup *memcg;
+ int ret = VM_FAULT_OOM;
if (unlikely(anon_vma_prepare(vma)))
- goto oom;
+ goto out;
if (is_zero_pfn(pte_pfn(orig_pte))) {
new_page = alloc_zeroed_user_highpage_movable(vma, address);
if (!new_page)
- goto oom;
+ goto out;
} else {
new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
if (!new_page)
- goto oom;
+ goto out;
cow_user_page(new_page, old_page, address, vma);
}
if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg))
- goto oom_free_new;
+ goto out_free_new;
__SetPageUptodate(new_page);
/*
* Re-check the pte - we dropped the lock
*/
- page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
+ if (!pte_map_lock(vmf2)) {
+ ret = VM_FAULT_RETRY;
+ goto out_uncharge;
+ }
+ page_table = vmf2->pte;
+ ptl = vmf2->ptl;
if (likely(pte_same(*page_table, orig_pte))) {
if (old_page) {
if (!PageAnon(old_page)) {
inc_mm_counter_fast(mm, MM_ANONPAGES);
}
flush_cache_page(vma, address, pte_pfn(orig_pte));
- entry = mk_pte(new_page, vma->vm_page_prot);
- entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+ entry = mk_pte(new_page, vmf2->vma_page_prot);
+ entry = maybe_mkwrite(pte_mkdirty(entry), vmf2->vma_flags);
/*
* Clear the pte entry and flush it first, before updating the
* pte with the new entry. This will avoid a race condition
* thread doing COW.
*/
ptep_clear_flush_notify(vma, address, page_table);
- page_add_new_anon_rmap(new_page, vma, address);
+ __page_add_new_anon_rmap(new_page, vma, address);
mem_cgroup_commit_charge(new_page, memcg, false);
- lru_cache_add_active_or_unevictable(new_page, vma);
+ __lru_cache_add_active_or_unevictable(new_page, vmf2->vma_flags);
/*
* We call the notify macro here because, when using secondary
* mmu page tables (such as kvm shadow page tables), we want the
* Don't let another task, with possibly unlocked vma,
* keep the mlocked page.
*/
- if (page_copied && (vma->vm_flags & VM_LOCKED)) {
+ if (page_copied && (vmf2->vma_flags & VM_LOCKED)) {
lock_page(old_page); /* LRU manipulation */
munlock_vma_page(old_page);
unlock_page(old_page);
page_cache_release(old_page);
}
return page_copied ? VM_FAULT_WRITE : 0;
-oom_free_new:
+out_uncharge:
+ mem_cgroup_cancel_charge(new_page, memcg);
+out_free_new:
page_cache_release(new_page);
-oom:
+out:
if (old_page)
page_cache_release(old_page);
- return VM_FAULT_OOM;
+ return ret;
}
/*
static int wp_pfn_shared(struct mm_struct *mm,
struct vm_area_struct *vma, unsigned long address,
pte_t *page_table, spinlock_t *ptl, pte_t orig_pte,
- pmd_t *pmd)
+ pmd_t *pmd, struct vm_fault *vmf2)
{
if (vma->vm_ops && vma->vm_ops->pfn_mkwrite) {
struct vm_fault vmf = {
ret = vma->vm_ops->pfn_mkwrite(vma, &vmf);
if (ret & VM_FAULT_ERROR)
return ret;
- page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
+ if (!pte_map_lock(vmf2))
+ return VM_FAULT_RETRY;
+ page_table = vmf2->pte;
+ ptl = vmf2->ptl;
/*
* We might have raced with another page fault while we
* released the pte_offset_map_lock.
}
}
return wp_page_reuse(mm, vma, address, page_table, ptl, orig_pte,
- NULL, 0, 0);
+ NULL, 0, 0, vmf2);
}
static int wp_page_shared(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, pte_t *page_table,
pmd_t *pmd, spinlock_t *ptl, pte_t orig_pte,
- struct page *old_page)
+ struct page *old_page, struct vm_fault *vmf2)
__releases(ptl)
{
int page_mkwrite = 0;
* they did, we just return, as we can count on the
* MMU to tell us if they didn't also make it writable.
*/
- page_table = pte_offset_map_lock(mm, pmd, address,
- &ptl);
+ if (!pte_map_lock(vmf2))
+ return VM_FAULT_RETRY;
+ page_table = vmf2->pte;
+ ptl = vmf2->ptl;
+
if (!pte_same(*page_table, orig_pte)) {
unlock_page(old_page);
pte_unmap_unlock(page_table, ptl);
}
return wp_page_reuse(mm, vma, address, page_table, ptl,
- orig_pte, old_page, page_mkwrite, 1);
+ orig_pte, old_page, page_mkwrite, 1, vmf2);
}
/*
*/
static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, pte_t *page_table, pmd_t *pmd,
- spinlock_t *ptl, pte_t orig_pte)
+ spinlock_t *ptl, pte_t orig_pte, struct vm_fault *vmf2)
__releases(ptl)
{
struct page *old_page;
- old_page = vm_normal_page(vma, address, orig_pte);
+ old_page = __vm_normal_page(vma, address, orig_pte, vmf2->vma_flags);
if (!old_page) {
/*
* VM_MIXEDMAP !pfn_valid() case, or VM_SOFTDIRTY clear on a
* We should not cow pages in a shared writeable mapping.
* Just mark the pages writable and/or call ops->pfn_mkwrite.
*/
- if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
+ if ((vmf2->vma_flags & (VM_WRITE|VM_SHARED)) ==
(VM_WRITE|VM_SHARED))
return wp_pfn_shared(mm, vma, address, page_table, ptl,
- orig_pte, pmd);
+ orig_pte, pmd, vmf2);
pte_unmap_unlock(page_table, ptl);
return wp_page_copy(mm, vma, address, page_table, pmd,
- orig_pte, old_page);
+ orig_pte, old_page, vmf2);
}
/*
page_cache_get(old_page);
pte_unmap_unlock(page_table, ptl);
lock_page(old_page);
- page_table = pte_offset_map_lock(mm, pmd, address,
- &ptl);
+ if (!pte_map_lock(vmf2)) {
+ unlock_page(old_page);
+ put_page(old_page);
+ return VM_FAULT_RETRY;
+ }
+ page_table = vmf2->pte;
+ ptl = vmf2->ptl;
if (!pte_same(*page_table, orig_pte)) {
unlock_page(old_page);
pte_unmap_unlock(page_table, ptl);
page_move_anon_rmap(old_page, vma, address);
unlock_page(old_page);
return wp_page_reuse(mm, vma, address, page_table, ptl,
- orig_pte, old_page, 0, 0);
+ orig_pte, old_page, 0, 0, vmf2);
}
unlock_page(old_page);
- } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
+ } else if (unlikely((vmf2->vma_flags & (VM_WRITE|VM_SHARED)) ==
(VM_WRITE|VM_SHARED))) {
return wp_page_shared(mm, vma, address, page_table, pmd,
- ptl, orig_pte, old_page);
+ ptl, orig_pte, old_page, vmf2);
}
/*
pte_unmap_unlock(page_table, ptl);
return wp_page_copy(mm, vma, address, page_table, pmd,
- orig_pte, old_page);
+ orig_pte, old_page, vmf2);
}
static void unmap_mapping_range_vma(struct vm_area_struct *vma,
*/
static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, pte_t *page_table, pmd_t *pmd,
- unsigned int flags, pte_t orig_pte)
+ unsigned int flags, pte_t orig_pte, struct vm_fault *vmf2)
{
spinlock_t *ptl;
struct page *page, *swapcache;
pte_t pte;
int locked;
int exclusive = 0;
- int ret = 0;
+ int ret;
- if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
+ ret = pte_unmap_same(vmf2);
+ if (ret) {
+ /*
+ * If the PTEs are different, meaning that the page has
+ * already been processed by another CPU, we return 0.
+ */
+ if (ret == VM_FAULT_PTNOTSAME)
+ ret = 0;
goto out;
+ }
entry = pte_to_swp_entry(orig_pte);
if (unlikely(non_swap_entry(entry))) {
GFP_HIGHUSER_MOVABLE, vma, address);
if (!page) {
/*
- * Back out if somebody else faulted in this pte
- * while we released the pte lock.
- */
- page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
+ * Back out if the VMA has changed behind our back during
+ * a speculative page fault or if somebody else
+ * faulted in this pte while we released the pte lock.
+ */
+ if (!pte_map_lock(vmf2)) {
+ delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
+ ret = VM_FAULT_RETRY;
+ goto out;
+ }
+ page_table = vmf2->pte;
+ ptl = vmf2->ptl;
if (likely(pte_same(*page_table, orig_pte)))
ret = VM_FAULT_OOM;
delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
}
/*
- * Back out if somebody else already faulted in this pte.
+ * Back out if the VMA has changed behind our back during a speculative
+ * page fault or if somebody else already faulted in this pte.
*/
- page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
+ if (!pte_map_lock(vmf2)) {
+ ret = VM_FAULT_RETRY;
+ goto out_cancel_cgroup;
+ }
+ page_table = vmf2->pte;
+ ptl = vmf2->ptl;
if (unlikely(!pte_same(*page_table, orig_pte)))
goto out_nomap;
inc_mm_counter_fast(mm, MM_ANONPAGES);
dec_mm_counter_fast(mm, MM_SWAPENTS);
- pte = mk_pte(page, vma->vm_page_prot);
+ pte = mk_pte(page, vmf2->vma_page_prot);
if ((flags & FAULT_FLAG_WRITE) && reuse_swap_page(page)) {
- pte = maybe_mkwrite(pte_mkdirty(pte), vma);
+ pte = maybe_mkwrite(pte_mkdirty(pte), vmf2->vma_flags);
flags &= ~FAULT_FLAG_WRITE;
ret |= VM_FAULT_WRITE;
exclusive = 1;
do_page_add_anon_rmap(page, vma, address, exclusive);
mem_cgroup_commit_charge(page, memcg, true);
} else { /* ksm created a completely new copy */
- page_add_new_anon_rmap(page, vma, address);
+ __page_add_new_anon_rmap(page, vma, address);
mem_cgroup_commit_charge(page, memcg, false);
- lru_cache_add_active_or_unevictable(page, vma);
+ __lru_cache_add_active_or_unevictable(page, vmf2->vma_flags);
}
swap_free(entry);
if ((PageSwapCache(page) && vm_swap_full(page_swap_info(page))) ||
- (vma->vm_flags & VM_LOCKED) || PageMlocked(page))
+ (vmf2->vma_flags & VM_LOCKED) || PageMlocked(page))
try_to_free_swap(page);
unlock_page(page);
if (page != swapcache) {
}
if (flags & FAULT_FLAG_WRITE) {
- ret |= do_wp_page(mm, vma, address, page_table, pmd, ptl, pte);
+ ret |= do_wp_page(mm, vma, address, page_table, pmd, ptl, pte, vmf2);
if (ret & VM_FAULT_ERROR)
ret &= VM_FAULT_ERROR;
goto out;
out:
return ret;
out_nomap:
- mem_cgroup_cancel_charge(page, memcg);
pte_unmap_unlock(page_table, ptl);
+out_cancel_cgroup:
+ mem_cgroup_cancel_charge(page, memcg);
out_page:
unlock_page(page);
out_release:
*/
static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, pte_t *page_table, pmd_t *pmd,
- unsigned int flags)
+ unsigned int flags, struct vm_fault *vmf2)
{
struct mem_cgroup *memcg;
struct page *page;
spinlock_t *ptl;
pte_t entry;
+ int ret = 0;
pte_unmap(page_table);
/* File mapping without ->vm_ops ? */
- if (vma->vm_flags & VM_SHARED)
+ if (vmf2->vma_flags & VM_SHARED)
return VM_FAULT_SIGBUS;
/* Use the zero-page for reads */
if (!(flags & FAULT_FLAG_WRITE) && !mm_forbids_zeropage(mm)) {
entry = pte_mkspecial(pfn_pte(my_zero_pfn(address),
- vma->vm_page_prot));
- page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
+ vmf2->vma_page_prot));
+ if (!pte_map_lock(vmf2))
+ return VM_FAULT_RETRY;
+ page_table = vmf2->pte;
+ ptl = vmf2->ptl;
if (!pte_none(*page_table))
goto unlock;
+
+ /*
+ * Don't call userfaultfd during the speculative path.
+ * We already checked that the VMA is not managed through
+ * userfaultfd, but it may have been set behind our back after we
+ * locked the pte. In such a case we can ignore it this time.
+ */
+ if (vmf2->flags & FAULT_FLAG_SPECULATIVE)
+ goto setpte;
+
/* Deliver the page fault to userland, check inside PT lock */
if (userfaultfd_missing(vma)) {
pte_unmap_unlock(page_table, ptl);
*/
__SetPageUptodate(page);
- entry = mk_pte(page, vma->vm_page_prot);
- if (vma->vm_flags & VM_WRITE)
+ entry = mk_pte(page, vmf2->vma_page_prot);
+ if (vmf2->vma_flags & VM_WRITE)
entry = pte_mkwrite(pte_mkdirty(entry));
- page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
- if (!pte_none(*page_table))
+ if (!pte_map_lock(vmf2)) {
+ ret = VM_FAULT_RETRY;
goto release;
+ }
+ page_table = vmf2->pte;
+ ptl = vmf2->ptl;
+
+ if (!pte_none(*page_table))
+ goto unlock_and_release;
/* Deliver the page fault to userland, check inside PT lock */
- if (userfaultfd_missing(vma)) {
+ if (!(vmf2->flags & FAULT_FLAG_SPECULATIVE) && userfaultfd_missing(vma)) {
pte_unmap_unlock(page_table, ptl);
mem_cgroup_cancel_charge(page, memcg);
page_cache_release(page);
}
inc_mm_counter_fast(mm, MM_ANONPAGES);
- page_add_new_anon_rmap(page, vma, address);
+ __page_add_new_anon_rmap(page, vma, address);
mem_cgroup_commit_charge(page, memcg, false);
- lru_cache_add_active_or_unevictable(page, vma);
+ __lru_cache_add_active_or_unevictable(page, vmf2->vma_flags);
setpte:
set_pte_at(mm, address, page_table, entry);
update_mmu_cache(vma, address, page_table);
unlock:
pte_unmap_unlock(page_table, ptl);
- return 0;
+ return ret;
+unlock_and_release:
+ pte_unmap_unlock(page_table, ptl);
release:
mem_cgroup_cancel_charge(page, memcg);
page_cache_release(page);
- goto unlock;
+ return ret;
oom_free_page:
page_cache_release(page);
oom:
* vm_ops->map_pages.
*/
void do_set_pte(struct vm_area_struct *vma, unsigned long address,
- struct page *page, pte_t *pte, bool write, bool anon)
+ struct page *page, pte_t *pte, bool write, bool anon, struct vm_fault *vmf2)
{
pte_t entry;
flush_icache_page(vma, page);
- entry = mk_pte(page, vma->vm_page_prot);
+ entry = mk_pte(page, vmf2->vma_page_prot);
if (write)
- entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+ entry = maybe_mkwrite(pte_mkdirty(entry), vmf2->vma_flags);
if (anon) {
inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
- page_add_new_anon_rmap(page, vma, address);
+ __page_add_new_anon_rmap(page, vma, address);
} else {
inc_mm_counter_fast(vma->vm_mm, MM_FILEPAGES);
page_add_file_rmap(page);
static int do_read_fault(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, pmd_t *pmd,
- pgoff_t pgoff, unsigned int flags, pte_t orig_pte)
+ pgoff_t pgoff, unsigned int flags, pte_t orig_pte, struct vm_fault *vmf2)
{
struct page *fault_page;
spinlock_t *ptl;
* something).
*/
if (vma->vm_ops->map_pages && fault_around_bytes >> PAGE_SHIFT > 1) {
- pte = pte_offset_map_lock(mm, pmd, address, &ptl);
+ if (!pte_map_lock(vmf2))
+ return VM_FAULT_RETRY;
+ pte = vmf2->pte;
+ ptl = vmf2->ptl;
do_fault_around(vma, address, pte, pgoff, flags);
if (!pte_same(*pte, orig_pte))
goto unlock_out;
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
return ret;
- pte = pte_offset_map_lock(mm, pmd, address, &ptl);
+ if (!pte_map_lock(vmf2))
+ return VM_FAULT_RETRY;
+ pte = vmf2->pte;
+ ptl = vmf2->ptl;
if (unlikely(!pte_same(*pte, orig_pte))) {
pte_unmap_unlock(pte, ptl);
unlock_page(fault_page);
page_cache_release(fault_page);
return ret;
}
- do_set_pte(vma, address, fault_page, pte, false, false);
+ do_set_pte(vma, address, fault_page, pte, false, false, vmf2);
unlock_page(fault_page);
unlock_out:
pte_unmap_unlock(pte, ptl);
static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, pmd_t *pmd,
- pgoff_t pgoff, unsigned int flags, pte_t orig_pte)
+ pgoff_t pgoff, unsigned int flags, pte_t orig_pte, struct vm_fault *vmf2)
{
struct page *fault_page, *new_page;
struct mem_cgroup *memcg;
copy_user_highpage(new_page, fault_page, address, vma);
__SetPageUptodate(new_page);
- pte = pte_offset_map_lock(mm, pmd, address, &ptl);
+ if (!pte_map_lock(vmf2))
+ return VM_FAULT_RETRY;
+ pte = vmf2->pte;
+ ptl = vmf2->ptl;
if (unlikely(!pte_same(*pte, orig_pte))) {
pte_unmap_unlock(pte, ptl);
if (fault_page) {
}
goto uncharge_out;
}
- do_set_pte(vma, address, new_page, pte, true, true);
+ do_set_pte(vma, address, new_page, pte, true, true, vmf2);
mem_cgroup_commit_charge(new_page, memcg, false);
- lru_cache_add_active_or_unevictable(new_page, vma);
+ __lru_cache_add_active_or_unevictable(new_page, vmf2->vma_flags);
pte_unmap_unlock(pte, ptl);
if (fault_page) {
unlock_page(fault_page);
static int do_shared_fault(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, pmd_t *pmd,
- pgoff_t pgoff, unsigned int flags, pte_t orig_pte)
+ pgoff_t pgoff, unsigned int flags, pte_t orig_pte, struct vm_fault *vmf2)
{
struct page *fault_page;
struct address_space *mapping;
}
}
- pte = pte_offset_map_lock(mm, pmd, address, &ptl);
+ if (!pte_map_lock(vmf2))
+ return VM_FAULT_RETRY;
+ pte = vmf2->pte;
+ ptl = vmf2->ptl;
if (unlikely(!pte_same(*pte, orig_pte))) {
pte_unmap_unlock(pte, ptl);
unlock_page(fault_page);
page_cache_release(fault_page);
return ret;
}
- do_set_pte(vma, address, fault_page, pte, true, false);
+ do_set_pte(vma, address, fault_page, pte, true, false, vmf2);
pte_unmap_unlock(pte, ptl);
if (set_page_dirty(fault_page))
*/
static int do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, pte_t *page_table, pmd_t *pmd,
- unsigned int flags, pte_t orig_pte)
+ unsigned int flags, pte_t orig_pte, struct vm_fault *vmf2)
{
pgoff_t pgoff = (((address & PAGE_MASK)
- vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
return VM_FAULT_SIGBUS;
if (!(flags & FAULT_FLAG_WRITE))
return do_read_fault(mm, vma, address, pmd, pgoff, flags,
- orig_pte);
- if (!(vma->vm_flags & VM_SHARED))
+ orig_pte, vmf2);
+ if (!(vmf2->vma_flags & VM_SHARED))
return do_cow_fault(mm, vma, address, pmd, pgoff, flags,
- orig_pte);
- return do_shared_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
+ orig_pte, vmf2);
+ return do_shared_fault(mm, vma, address, pmd, pgoff, flags, orig_pte, vmf2);
}
static int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
}
static int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
- unsigned long addr, pte_t pte, pte_t *ptep, pmd_t *pmd)
+ unsigned long addr, pte_t pte, pte_t *ptep, pmd_t *pmd, struct vm_fault *vmf2)
{
struct page *page = NULL;
spinlock_t *ptl;
* page table entry is not accessible, so there would be no
* concurrent hardware modifications to the PTE.
*/
- ptl = pte_lockptr(mm, pmd);
- spin_lock(ptl);
+ if (!pte_spinlock(vmf2))
+ return VM_FAULT_RETRY;
+ ptl = vmf2->ptl;
if (unlikely(!pte_same(*ptep, pte))) {
pte_unmap_unlock(ptep, ptl);
goto out;
set_pte_at(mm, addr, ptep, pte);
update_mmu_cache(vma, addr, ptep);
- page = vm_normal_page(vma, addr, pte);
+ page = __vm_normal_page(vma, addr, pte, vmf2->vma_flags);
if (!page) {
pte_unmap_unlock(ptep, ptl);
return 0;
*/
static int handle_pte_fault(struct mm_struct *mm,
struct vm_area_struct *vma, unsigned long address,
- pte_t *pte, pmd_t *pmd, unsigned int flags)
+ pte_t *pte, pmd_t *pmd, unsigned int flags,
+ struct vm_fault *vmf2)
{
pte_t entry;
spinlock_t *ptl;
if (pte_none(entry)) {
if (vma_is_anonymous(vma))
return do_anonymous_page(mm, vma, address,
- pte, pmd, flags);
+ pte, pmd, flags, vmf2);
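+ /*
+ * File-backed faults need ->vm_ops, which cannot be used on
+ * the speculative path, so ask the caller to retry with the
+ * mmap_sem held.
+ */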
+ else if (vmf2->flags & FAULT_FLAG_SPECULATIVE)
+ return VM_FAULT_RETRY;
else
return do_fault(mm, vma, address, pte, pmd,
- flags, entry);
+ flags, entry, vmf2);
}
return do_swap_page(mm, vma, address,
- pte, pmd, flags, entry);
+ pte, pmd, flags, entry, vmf2);
}
if (pte_protnone(entry))
- return do_numa_page(mm, vma, address, entry, pte, pmd);
+ return do_numa_page(mm, vma, address, entry, pte, pmd, vmf2);
+
+ if (!pte_spinlock(vmf2))
+ return VM_FAULT_RETRY;
+ ptl = vmf2->ptl;
- ptl = pte_lockptr(mm, pmd);
- spin_lock(ptl);
if (unlikely(!pte_same(*pte, entry)))
goto unlock;
if (flags & FAULT_FLAG_WRITE) {
if (!pte_write(entry))
return do_wp_page(mm, vma, address,
- pte, pmd, ptl, entry);
+ pte, pmd, ptl, entry, vmf2);
entry = pte_mkdirty(entry);
}
entry = pte_mkyoung(entry);
pmd_t *pmd;
pte_t *pte;
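+ /*
+ * vmf2 snapshots the VMA flags and page protection so the fault
+ * helpers below operate on the same values whether they are reached
+ * from here or from the speculative path.
+ */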
+ struct vm_fault vmf2 = {
+ .vma = vma,
+ .address = address & PAGE_MASK,
+ .flags = flags,
+ .vma_flags = vma->vm_flags,
+ .vma_page_prot = vma->vm_page_prot,
+ };
+
if (unlikely(is_vm_hugetlb_page(vma)))
return hugetlb_fault(mm, vma, address, flags);
pud = pud_alloc(mm, pgd, address);
if (!pud)
return VM_FAULT_OOM;
- pmd = pmd_alloc(mm, pud, address);
+ vmf2.pmd = pmd = pmd_alloc(mm, pud, address);
if (!pmd)
return VM_FAULT_OOM;
+
+#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
+ vmf2.sequence = raw_read_seqcount(&vma->vm_sequence);
+#endif
+
if (pmd_none(*pmd) && transparent_hugepage_enabled(vma)) {
int ret = create_huge_pmd(mm, vma, address, pmd, flags);
if (!(ret & VM_FAULT_FALLBACK))
* read mode and khugepaged takes it in write mode. So now it's
* safe to run pte_offset_map().
*/
- pte = pte_offset_map(pmd, address);
+ vmf2.pte = pte = pte_offset_map(pmd, address);
+ vmf2.orig_pte = *vmf2.pte;
+
+ return handle_pte_fault(mm, vma, address, pte, pmd, flags, &vmf2);
+}
+
+#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
+/*
+ * Tries to handle the page fault in a speculative way, without grabbing the
+ * mmap_sem.
+ */
+int __handle_speculative_fault(struct mm_struct *mm, unsigned long address,
+ unsigned int flags)
+{
+ struct vm_fault vmf = {
+ .address = address,
+ };
+ pgd_t *pgd, pgdval;
+/* p4d_t *p4d, p4dval; */
+ pud_t pudval;
+ int seq, ret = VM_FAULT_RETRY;
+ struct vm_area_struct *vma;
+#ifdef CONFIG_NUMA
+ struct mempolicy *pol;
+#endif
+
+ /* Clear flags that may lead to release the mmap_sem to retry */
+ flags &= ~(FAULT_FLAG_ALLOW_RETRY|FAULT_FLAG_KILLABLE);
+ flags |= FAULT_FLAG_SPECULATIVE;
+
+ vma = get_vma(mm, address);
+ if (!vma)
+ return ret;
+
+ seq = raw_read_seqcount(&vma->vm_sequence); /* rmb <-> seqlock, vma_rb_erase() */
+ if (seq & 1) {
+ trace_spf_vma_changed(_RET_IP_, vma, address);
+ goto out_put;
+ }
+
+ /*
+ * Can't call vm_ops services as we don't know what they would do
+ * with the VMA.
+ * This includes huge pages from hugetlbfs.
+ */
+ if (vma->vm_ops) {
+ trace_spf_vma_notsup(_RET_IP_, vma, address);
+ goto out_put;
+ }
+
+ /*
+ * __anon_vma_prepare() requires the mmap_sem to be held
+ * because vm_next and vm_prev must be safe. This can't be guaranteed
+ * in the speculative path.
+ */
+ if (unlikely(!vma->anon_vma)) {
+ trace_spf_vma_notsup(_RET_IP_, vma, address);
+ goto out_put;
+ }
+
+ vmf.vma_flags = READ_ONCE(vma->vm_flags);
+ vmf.vma_page_prot = READ_ONCE(vma->vm_page_prot);
+
+ /* Can't call userland page fault handler in the speculative path */
+ if (unlikely(vmf.vma_flags & VM_UFFD_MISSING)) {
+ trace_spf_vma_notsup(_RET_IP_, vma, address);
+ goto out_put;
+ }
+
+ if (vmf.vma_flags & VM_GROWSDOWN || vmf.vma_flags & VM_GROWSUP) {
+ /*
+ * This could be detected by checking the address against the
+ * VMA's boundaries, but we want to trace it as not supported
+ * instead of changed.
+ */
+ trace_spf_vma_notsup(_RET_IP_, vma, address);
+ goto out_put;
+ }
+
+ if (address < READ_ONCE(vma->vm_start)
+ || READ_ONCE(vma->vm_end) <= address) {
+ trace_spf_vma_changed(_RET_IP_, vma, address);
+ goto out_put;
+ }
+/*
+ if (!arch_vma_access_permitted(vma, flags & FAULT_FLAG_WRITE,
+ flags & FAULT_FLAG_INSTRUCTION,
+ flags & FAULT_FLAG_REMOTE)) {
+ trace_spf_vma_access(_RET_IP_, vma, address);
+ ret = VM_FAULT_SIGSEGV;
+ goto out_put;
+ }
+*/
+ /* This check is required to verify that the VMA has write access set */
+ if (flags & FAULT_FLAG_WRITE) {
+ if (unlikely(!(vmf.vma_flags & VM_WRITE))) {
+ trace_spf_vma_access(_RET_IP_, vma, address);
+ ret = VM_FAULT_SIGSEGV;
+ goto out_put;
+ }
+ } else if (unlikely(!(vmf.vma_flags & (VM_READ|VM_EXEC|VM_WRITE)))) {
+ trace_spf_vma_access(_RET_IP_, vma, address);
+ ret = VM_FAULT_SIGSEGV;
+ goto out_put;
+ }
+
+#ifdef CONFIG_NUMA
+ /*
+ * MPOL_INTERLEAVE implies additional checks in
+ * mpol_misplaced() which are not compatible with the
+ * speculative page fault processing.
+ */
+ pol = __get_vma_policy(vma, address);
+ if (!pol)
+ pol = get_task_policy(current);
+ if (pol && pol->mode == MPOL_INTERLEAVE) {
+ trace_spf_vma_notsup(_RET_IP_, vma, address);
+ goto out_put;
+ }
+#endif
- return handle_pte_fault(mm, vma, address, pte, pmd, flags);
+ /*
+ * Do a speculative lookup of the PTE entry.
+ */
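+ /*
+ * The walk below relies on the same guarantee as the fast GUP path:
+ * with interrupts disabled, the page-table pages cannot be freed
+ * under us while we read them.
+ */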
+ local_irq_disable();
+ pgd = pgd_offset(mm, address);
+ pgdval = READ_ONCE(*pgd);
+ if (pgd_none(pgdval) || unlikely(pgd_bad(pgdval)))
+ goto out_walk;
+/*
+ p4d = p4d_offset(pgd, address);
+ p4dval = READ_ONCE(*p4d);
+ if (p4d_none(p4dval) || unlikely(p4d_bad(p4dval)))
+ goto out_walk;
+*/
+ vmf.pud = pud_offset(pgd, address);
+ pudval = READ_ONCE(*vmf.pud);
+ if (pud_none(pudval) || unlikely(pud_bad(pudval)))
+ goto out_walk;
+
+ /* Huge pages at PUD level are not supported. */
+/*
+ if (unlikely(pud_trans_huge(pudval)))
+ goto out_walk;
+*/
+ vmf.pmd = pmd_offset(vmf.pud, address);
+ vmf.orig_pmd = READ_ONCE(*vmf.pmd);
+ /*
+ * pmd_none could mean that a hugepage collapse is in progress
+ * behind our back as collapse_huge_page() marks it before
+ * invalidating the pte (which is done once the IPI has been caught
+ * by all CPUs, while we have interrupts disabled).
+ * For this reason we cannot handle THP in a speculative way since we
+ * can't safely identify an in-progress collapse operation done behind
+ * our back on that PMD.
+ * Regarding the order of the following checks, see comment in
+ * pmd_devmap_trans_unstable()
+ */
+ if (/* unlikely(pmd_devmap(vmf.orig_pmd) || */
+ pmd_none(vmf.orig_pmd) || pmd_trans_huge(vmf.orig_pmd) /* ||
+ is_swap_pmd(vmf.orig_pmd))*/)
+ goto out_walk;
+
+ /*
+ * The above does not allocate/instantiate page-tables because doing so
+ * would lead to the possibility of instantiating page-tables after
+ * free_pgtables() -- and consequently leaking them.
+ *
+ * The result is that we take at least one !speculative fault per PMD
+ * in order to instantiate it.
+ */
+
+ vmf.pte = pte_offset_map(vmf.pmd, address);
+ vmf.orig_pte = READ_ONCE(*vmf.pte);
+ barrier(); /* See comment in handle_pte_fault() */
+ /*
+ if (pte_none(vmf.orig_pte)) {
+ pte_unmap(vmf.pte);
+ vmf.pte = NULL;
+ }
+ */
+ vmf.vma = vma;
+ vmf.pgoff = linear_page_index(vma, address);
+/* vmf.gfp_mask = __get_fault_gfp_mask(vma); */
+ vmf.sequence = seq;
+ vmf.flags = flags;
+
+ local_irq_enable();
+
+ /*
+ * We need to re-validate the VMA after checking the bounds, otherwise
+ * we might have a false positive on the bounds.
+ */
+ if (read_seqcount_retry(&vma->vm_sequence, seq)) {
+ trace_spf_vma_changed(_RET_IP_, vma, address);
+ goto out_put;
+ }
+
+ mem_cgroup_oom_enable();
+ ret = handle_pte_fault(mm, vmf.vma, vmf.address, vmf.pte,
+ vmf.pmd, vmf.flags, &vmf);
+ mem_cgroup_oom_disable();
+
+ put_vma(vma);
+
+ if (ret != VM_FAULT_RETRY)
+ count_vm_event(SPECULATIVE_PGFAULT);
+
+ /*
+ * The task may have entered a memcg OOM situation but
+ * if the allocation error was handled gracefully (no
+ * VM_FAULT_OOM), there is no need to kill anything.
+ * Just clean up the OOM state peacefully.
+ */
+ if (task_in_memcg_oom(current) && !(ret & VM_FAULT_OOM))
+ mem_cgroup_oom_synchronize(false);
+ return ret;
+
+out_walk:
+ trace_spf_vma_notsup(_RET_IP_, vma, address);
+ local_irq_enable();
+out_put:
+ put_vma(vma);
+ return ret;
}
+#endif /* CONFIG_SPECULATIVE_PAGE_FAULT */
/*
* By the time we get here, we already hold the mm semaphore
}
}
+static void __free_vma(struct vm_area_struct *vma)
+{
+ if (vma->vm_file)
+ fput(vma->vm_file);
+ mpol_put(vma_policy(vma));
+ kmem_cache_free(vm_area_cachep, vma);
+}
+
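+/*
+ * With speculative page faults a VMA may still be used by a faulting thread
+ * after it has been unlinked from the tree, so it is freed only once the
+ * last reference taken through get_vma() is dropped.
+ */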
+#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
+void put_vma(struct vm_area_struct *vma)
+{
+ if (atomic_dec_and_test(&vma->vm_ref_count))
+ __free_vma(vma);
+}
+#else
+static inline void put_vma(struct vm_area_struct *vma)
+{
+ return __free_vma(vma);
+}
+#endif
+
/*
* Close a vm structure and free it, returning the next.
*/
might_sleep();
if (vma->vm_ops && vma->vm_ops->close)
vma->vm_ops->close(vma);
- if (vma->vm_file)
- fput(vma->vm_file);
- mpol_put(vma_policy(vma));
- kmem_cache_free(vm_area_cachep, vma);
+ put_vma(vma);
return next;
}
#define validate_mm(mm) do { } while (0)
#endif
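+/*
+ * With CONFIG_SPECULATIVE_PAGE_FAULT the mm rbtree is additionally protected
+ * by mm->mm_rb_lock so that get_vma() can look up a VMA without holding the
+ * mmap_sem.
+ */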
+#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
+#define mm_rb_write_lock(mm) write_lock(&(mm)->mm_rb_lock)
+#define mm_rb_write_unlock(mm) write_unlock(&(mm)->mm_rb_lock)
+#else
+#define mm_rb_write_lock(mm) do { } while (0)
+#define mm_rb_write_unlock(mm) do { } while (0)
+#endif /* CONFIG_SPECULATIVE_PAGE_FAULT */
+
RB_DECLARE_CALLBACKS(static, vma_gap_callbacks, struct vm_area_struct, vm_rb,
unsigned long, rb_subtree_gap, vma_compute_subtree_gap)
}
static inline void vma_rb_insert(struct vm_area_struct *vma,
- struct rb_root *root)
+ struct mm_struct *mm)
{
+ struct rb_root *root = &mm->mm_rb;
+
/* All rb_subtree_gap values must be consistent prior to insertion */
validate_mm_rb(root, NULL);
rb_insert_augmented(&vma->vm_rb, root, &vma_gap_callbacks);
}
-static void vma_rb_erase(struct vm_area_struct *vma, struct rb_root *root)
+static void vma_rb_erase(struct vm_area_struct *vma, struct mm_struct *mm)
{
+ struct rb_root *root = &mm->mm_rb;
+
/*
* All rb_subtree_gap values must be consistent prior to erase,
* with the possible exception of the vma being erased.
* so make sure we instantiate it only once with our desired
* augmented rbtree callbacks.
*/
+ mm_rb_write_lock(mm);
rb_erase_augmented(&vma->vm_rb, root, &vma_gap_callbacks);
+ mm_rb_write_unlock(mm); /* wmb */
+
+ /*
+ * Ensure the removal is complete before clearing the node.
+ * Matched by vma_has_changed()/handle_speculative_fault().
+ */
+ RB_CLEAR_NODE(&vma->vm_rb);
}
/*
* immediately update the gap to the correct value. Finally we
* rebalance the rbtree after all augmented values have been set.
*/
+ mm_rb_write_lock(mm);
rb_link_node(&vma->vm_rb, rb_parent, rb_link);
vma->rb_subtree_gap = 0;
vma_gap_update(vma);
- vma_rb_insert(vma, &mm->mm_rb);
+ vma_rb_insert(vma, mm);
+ mm_rb_write_unlock(mm);
}
static void __vma_link_file(struct vm_area_struct *vma)
{
struct vm_area_struct *next;
- vma_rb_erase(vma, &mm->mm_rb);
+ vma_rb_erase(vma, mm);
prev->vm_next = next = vma->vm_next;
if (next)
next->vm_prev = prev;
* are necessary. The "insert" vma (if any) is to be inserted
* before we drop the necessary locks.
*/
-int vma_adjust(struct vm_area_struct *vma, unsigned long start,
- unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert)
+int __vma_adjust(struct vm_area_struct *vma, unsigned long start,
+ unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert,
+ bool keep_locked)
{
struct mm_struct *mm = vma->vm_mm;
struct vm_area_struct *next = vma->vm_next;
long adjust_next = 0;
int remove_next = 0;
+ /*
+ * Why use the vm_raw_write*() functions here to avoid lockdep's warning?
+ *
+ * Lockdep is complaining about a theoretical lock dependency involving
+ * 3 locks:
+ * mapping->i_mmap_rwsem --> vma->vm_sequence --> fs_reclaim
+ *
+ * Here are the major paths leading to this dependency:
+ * 1. __vma_adjust() mmap_sem -> vm_sequence -> i_mmap_rwsem
+ * 2. move_vma() mmap_sem -> vm_sequence -> fs_reclaim
+ * 3. __alloc_pages_nodemask() fs_reclaim -> i_mmap_rwsem
+ * 4. unmap_mapping_range() i_mmap_rwsem -> vm_sequence
+ *
+ * So there is no way to solve this easily, especially because in
+ * unmap_mapping_range() the i_mmap_rwsem is grabbed while the impacted
+ * VMAs are not yet known.
+ * However, the way the vm_sequence is used guarantees that we will
+ * never block on it since we just check for its value and never wait
+ * for it to move, see vma_has_changed() and handle_speculative_fault().
+ */
+ vm_raw_write_begin(vma);
+ if (next)
+ vm_raw_write_begin(next);
+
if (next && !insert) {
struct vm_area_struct *exporter = NULL;
importer->anon_vma = exporter->anon_vma;
error = anon_vma_clone(importer, exporter);
- if (error)
+ if (error) {
+ if (next && next != vma)
+ vm_raw_write_end(next);
+ vm_raw_write_end(vma);
return error;
+ }
}
}
}
if (start != vma->vm_start) {
- vma->vm_start = start;
+ WRITE_ONCE(vma->vm_start, start);
start_changed = true;
}
if (end != vma->vm_end) {
- vma->vm_end = end;
+ WRITE_ONCE(vma->vm_end, end);
end_changed = true;
}
- vma->vm_pgoff = pgoff;
+ WRITE_ONCE(vma->vm_pgoff, pgoff);
if (adjust_next) {
- next->vm_start += adjust_next << PAGE_SHIFT;
- next->vm_pgoff += adjust_next;
+ WRITE_ONCE(next->vm_start,
+ next->vm_start + (adjust_next << PAGE_SHIFT));
+ WRITE_ONCE(next->vm_pgoff, next->vm_pgoff + adjust_next);
}
if (root) {
if (remove_next) {
if (file) {
uprobe_munmap(next, next->vm_start, next->vm_end);
- fput(file);
}
if (next->anon_vma)
anon_vma_merge(vma, next);
mm->map_count--;
- mpol_put(vma_policy(next));
- kmem_cache_free(vm_area_cachep, next);
+ vm_raw_write_end(next);
+ put_vma(next);
/*
* In mprotect's case 6 (see comments on vma_merge),
* we must remove another next too. It would clutter
* up the code too much to do both in one go.
*/
next = vma->vm_next;
+ if (next)
+ vm_raw_write_begin(next);
+
if (remove_next == 2)
goto again;
else if (next)
if (insert && file)
uprobe_mmap(insert);
+ if (next && next != vma)
+ vm_raw_write_end(next);
+
+ if (!keep_locked)
+ vm_raw_write_end(vma);
+
validate_mm(mm);
return 0;
* Odd one out? Case 8, because it extends NNNN but needs flags of XXXX:
* mprotect_fixup updates vm_flags & vm_page_prot on successful return.
*/
-struct vm_area_struct *vma_merge(struct mm_struct *mm,
+struct vm_area_struct *__vma_merge(struct mm_struct *mm,
struct vm_area_struct *prev, unsigned long addr,
unsigned long end, unsigned long vm_flags,
struct anon_vma *anon_vma, struct file *file,
pgoff_t pgoff, struct mempolicy *policy,
struct vm_userfaultfd_ctx vm_userfaultfd_ctx,
- const char __user *anon_name)
+ const char __user *anon_name, bool keep_locked)
{
pgoff_t pglen = (end - addr) >> PAGE_SHIFT;
struct vm_area_struct *area, *next;
is_mergeable_anon_vma(prev->anon_vma,
next->anon_vma, NULL)) {
/* cases 1, 6 */
- err = vma_adjust(prev, prev->vm_start,
- next->vm_end, prev->vm_pgoff, NULL);
+ err = __vma_adjust(prev, prev->vm_start,
+ next->vm_end, prev->vm_pgoff, NULL, keep_locked);
} else /* cases 2, 5, 7 */
- err = vma_adjust(prev, prev->vm_start,
- end, prev->vm_pgoff, NULL);
+ err = __vma_adjust(prev, prev->vm_start,
+ end, prev->vm_pgoff, NULL, keep_locked);
if (err)
return NULL;
khugepaged_enter_vma_merge(prev, vm_flags);
vm_userfaultfd_ctx,
anon_name)) {
if (prev && addr < prev->vm_end) /* case 4 */
- err = vma_adjust(prev, prev->vm_start,
- addr, prev->vm_pgoff, NULL);
+ err = __vma_adjust(prev, prev->vm_start,
+ addr, prev->vm_pgoff, NULL, keep_locked);
else /* cases 3, 8 */
- err = vma_adjust(area, addr, next->vm_end,
- next->vm_pgoff - pglen, NULL);
+ err = __vma_adjust(area, addr, next->vm_end,
+ next->vm_pgoff - pglen, NULL, keep_locked);
if (err)
return NULL;
khugepaged_enter_vma_merge(area, vm_flags);
vma->vm_flags = vm_flags;
vma->vm_page_prot = vm_get_page_prot(vm_flags);
vma->vm_pgoff = pgoff;
- INIT_LIST_HEAD(&vma->anon_vma_chain);
+ INIT_VMA(vma);
if (file) {
if (vm_flags & VM_DENYWRITE) {
out:
perf_event_mmap(vma);
+ vm_write_begin(vma);
vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT);
if (vm_flags & VM_LOCKED) {
if (!((vm_flags & VM_SPECIAL) || is_vm_hugetlb_page(vma) ||
vma == get_gate_vma(current->mm)))
mm->locked_vm += (len >> PAGE_SHIFT);
else
- vma->vm_flags &= VM_LOCKED_CLEAR_MASK;
+ WRITE_ONCE(vma->vm_flags,
+ vma->vm_flags & VM_LOCKED_CLEAR_MASK);
}
if (file)
* then new mapped in-place (which must be aimed as
* a completely new data area).
*/
- vma->vm_flags |= VM_SOFTDIRTY;
+ WRITE_ONCE(vma->vm_flags, vma->vm_flags | VM_SOFTDIRTY);
vma_set_page_prot(vma);
+ vm_write_end(vma);
return addr;
EXPORT_SYMBOL(get_unmapped_area);
/* Look up the first VMA which satisfies addr < vm_end, NULL if none. */
-struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
+struct vm_area_struct *__find_vma(struct mm_struct *mm, unsigned long addr)
{
struct rb_node *rb_node;
- struct vm_area_struct *vma;
-
- /* Check the cache first. */
- vma = vmacache_find(mm, addr);
- if (likely(vma))
- return vma;
+ struct vm_area_struct *vma = NULL;
rb_node = mm->mm_rb.rb_node;
rb_node = rb_node->rb_right;
}
+ return vma;
+}
+
+struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
+{
+ struct vm_area_struct *vma;
+
+ /* Check the cache first. */
+ vma = vmacache_find(mm, addr);
+ if (likely(vma))
+ return vma;
+
+ vma = __find_vma(mm, addr);
+
if (vma)
vmacache_update(addr, vma);
return vma;
}
-
EXPORT_SYMBOL(find_vma);
+#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
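+/*
+ * Look up a VMA under mm_rb_lock and take a reference on it. The caller must
+ * release it with put_vma() once done, see __handle_speculative_fault().
+ */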
+struct vm_area_struct *get_vma(struct mm_struct *mm, unsigned long addr)
+{
+ struct vm_area_struct *vma = NULL;
+
+ read_lock(&mm->mm_rb_lock);
+ vma = __find_vma(mm, addr);
+ if (vma)
+ atomic_inc(&vma->vm_ref_count);
+ read_unlock(&mm->mm_rb_lock);
+
+ return vma;
+}
+#endif
+
/*
* Same as find_vma, but also return a pointer to the previous VMA in *pprev.
*/
vm_stat_account(mm, vma->vm_flags,
vma->vm_file, grow);
anon_vma_interval_tree_pre_update_vma(vma);
- vma->vm_start = address;
- vma->vm_pgoff -= grow;
+ WRITE_ONCE(vma->vm_start, address);
+ WRITE_ONCE(vma->vm_pgoff, vma->vm_pgoff - grow);
anon_vma_interval_tree_post_update_vma(vma);
vma_gap_update(vma);
spin_unlock(&mm->page_table_lock);
insertion_point = (prev ? &prev->vm_next : &mm->mmap);
vma->vm_prev = NULL;
do {
- vma_rb_erase(vma, &mm->mm_rb);
+ vma_rb_erase(vma, mm);
mm->map_count--;
tail_vma = vma;
vma = vma->vm_next;
/* most fields are the same, copy all, and then fixup */
*new = *vma;
- INIT_LIST_HEAD(&new->anon_vma_chain);
+ INIT_VMA(new);
if (new_below)
new->vm_end = addr;
return -ENOMEM;
}
- INIT_LIST_HEAD(&vma->anon_vma_chain);
+ INIT_VMA(vma);
vma->vm_mm = mm;
vma->vm_start = addr;
vma->vm_end = addr + len;
if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent))
return NULL; /* should never get here */
- new_vma = vma_merge(mm, prev, addr, addr + len, vma->vm_flags,
+
+ /* There are 3 cases to manage here:
+ * AAAA AAAA AAAA AAAA
+ * PPPP.... PPPP......NNNN PPPP....NNNN PP........NN
+ * PPPPPPPP(A) PPPP..NNNNNNNN(B) PPPPPPPPPPPP(1) NULL
+ * PPPPPPPPNNNN(2)
+ * PPPPNNNNNNNN(3)
+ *
+ * new_vma == prev in case A,1,2
+ * new_vma == next in case B,3
+ */
+ new_vma = __vma_merge(mm, prev, addr, addr + len, vma->vm_flags,
vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma),
- vma->vm_userfaultfd_ctx, vma_get_anon_name(vma));
+ vma->vm_userfaultfd_ctx, vma_get_anon_name(vma), true);
+
if (new_vma) {
/*
* Source vma may have been merged into new_vma
new_vma->vm_pgoff = pgoff;
if (vma_dup_policy(vma, new_vma))
goto out_free_vma;
- INIT_LIST_HEAD(&new_vma->anon_vma_chain);
+ INIT_VMA(new_vma);
if (anon_vma_clone(new_vma, vma))
goto out_free_mempol;
if (new_vma->vm_file)
get_file(new_vma->vm_file);
if (new_vma->vm_ops && new_vma->vm_ops->open)
new_vma->vm_ops->open(new_vma);
+
+ /*
+ * As the VMA is linked right now, it may be hit by the
+ * speculative page fault handler. But we don't want it to
+ * start mapping pages in this area until the caller has
+ * potentially moved the pte from the moved VMA. To prevent
+ * that we protect it right now, and let the caller unprotect
+ * it once the move is done.
+ */
+ vm_raw_write_begin(new_vma);
+
vma_link(mm, new_vma, prev, rb_link, rb_parent);
*need_rmap_locks = false;
}
if (unlikely(vma == NULL))
return ERR_PTR(-ENOMEM);
- INIT_LIST_HEAD(&vma->anon_vma_chain);
+ INIT_VMA(vma);
vma->vm_mm = mm;
vma->vm_start = addr;
vma->vm_end = addr + len;