diff --git a/mm/memory.c b/mm/memory.c
index e18c57b..47248dc 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -75,7 +75,7 @@
 
 #include "internal.h"
 
-#ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS
+#if defined(LAST_CPUPID_NOT_IN_PAGE_FLAGS) && !defined(CONFIG_COMPILE_TEST)
 #warning Unfortunate NUMA and NUMA Balancing config, growing page-frame for last_cpupid.
 #endif
 
@@ -373,15 +373,6 @@ void tlb_remove_table(struct mmu_gather *tlb, void *table)
 {
        struct mmu_table_batch **batch = &tlb->batch;
 
-       /*
-        * When there's less then two users of this mm there cannot be a
-        * concurrent page-table walk.
-        */
-       if (atomic_read(&tlb->mm->mm_users) < 2) {
-               __tlb_remove_table(table);
-               return;
-       }
-
        if (*batch == NULL) {
                *batch = (struct mmu_table_batch *)__get_free_page(GFP_NOWAIT | __GFP_NOWARN);
                if (*batch == NULL) {
@@ -1124,6 +1115,7 @@ again:
        init_rss_vec(rss);
        start_pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
        pte = start_pte;
+       flush_tlb_batched_pending(mm);
        arch_enter_lazy_mmu_mode();
        do {
                pte_t ptent = *pte;
@@ -1640,6 +1632,9 @@ int vm_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr,
        if (track_pfn_insert(vma, &pgprot, __pfn_to_pfn_t(pfn, PFN_DEV)))
                return -EINVAL;
 
+       if (!pfn_modify_allowed(pfn, pgprot))
+               return -EACCES;
+
        ret = insert_pfn(vma, addr, __pfn_to_pfn_t(pfn, PFN_DEV), pgprot);
 
        return ret;
@@ -1658,6 +1653,9 @@ int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
        if (track_pfn_insert(vma, &pgprot, pfn))
                return -EINVAL;
 
+       if (!pfn_modify_allowed(pfn_t_to_pfn(pfn), pgprot))
+               return -EACCES;
+
        /*
         * If we don't have pte special, then we have to use the pfn_valid()
         * based VM_MIXEDMAP scheme (see vm_normal_page), and thus we *must*
@@ -1691,6 +1689,7 @@ static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd,
 {
        pte_t *pte;
        spinlock_t *ptl;
+       int err = 0;
 
        pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
        if (!pte)
@@ -1698,12 +1697,16 @@ static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd,
        arch_enter_lazy_mmu_mode();
        do {
                BUG_ON(!pte_none(*pte));
+               if (!pfn_modify_allowed(pfn, prot)) {
+                       err = -EACCES;
+                       break;
+               }
                set_pte_at(mm, addr, pte, pte_mkspecial(pfn_pte(pfn, prot)));
                pfn++;
        } while (pte++, addr += PAGE_SIZE, addr != end);
        arch_leave_lazy_mmu_mode();
        pte_unmap_unlock(pte - 1, ptl);
-       return 0;
+       return err;
 }
 
 static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud,
@@ -1712,6 +1715,7 @@ static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud,
 {
        pmd_t *pmd;
        unsigned long next;
+       int err;
 
        pfn -= addr >> PAGE_SHIFT;
        pmd = pmd_alloc(mm, pud, addr);
@@ -1720,9 +1724,10 @@ static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud,
        VM_BUG_ON(pmd_trans_huge(*pmd));
        do {
                next = pmd_addr_end(addr, end);
-               if (remap_pte_range(mm, pmd, addr, next,
-                               pfn + (addr >> PAGE_SHIFT), prot))
-                       return -ENOMEM;
+               err = remap_pte_range(mm, pmd, addr, next,
+                               pfn + (addr >> PAGE_SHIFT), prot);
+               if (err)
+                       return err;
        } while (pmd++, addr = next, addr != end);
        return 0;
 }
@@ -1733,6 +1738,7 @@ static inline int remap_pud_range(struct mm_struct *mm, pgd_t *pgd,
 {
        pud_t *pud;
        unsigned long next;
+       int err;
 
        pfn -= addr >> PAGE_SHIFT;
        pud = pud_alloc(mm, pgd, addr);
@@ -1740,9 +1746,10 @@ static inline int remap_pud_range(struct mm_struct *mm, pgd_t *pgd,
                return -ENOMEM;
        do {
                next = pud_addr_end(addr, end);
-               if (remap_pmd_range(mm, pud, addr, next,
-                               pfn + (addr >> PAGE_SHIFT), prot))
-                       return -ENOMEM;
+               err = remap_pmd_range(mm, pud, addr, next,
+                               pfn + (addr >> PAGE_SHIFT), prot);
+               if (err)
+                       return err;
        } while (pud++, addr = next, addr != end);
        return 0;
 }
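
Taken together, the remap_pte_range()/remap_pmd_range()/remap_pud_range() hunks above stop collapsing every leaf failure into -ENOMEM and instead pass the leaf's own error code (now possibly -EACCES from pfn_modify_allowed()) up the walk. A minimal userspace sketch of that propagation pattern; the names (leaf_allowed, map_leaf, map_range) are hypothetical and not kernel APIs:

#include <errno.h>
#include <stdio.h>

/* Hypothetical leaf check standing in for pfn_modify_allowed(). */
static int leaf_allowed(unsigned long pfn) { return pfn != 3; }

/* Leaf level: report the real reason for the failure. */
static int map_leaf(unsigned long pfn)
{
	if (!leaf_allowed(pfn))
		return -EACCES;		/* not -ENOMEM: keep the cause intact */
	return 0;
}

/* Upper level: stop at the first error and hand it up unchanged. */
static int map_range(unsigned long start, unsigned long end)
{
	for (unsigned long pfn = start; pfn != end; pfn++) {
		int err = map_leaf(pfn);
		if (err)
			return err;	/* was: return -ENOMEM on any failure */
	}
	return 0;
}

int main(void)
{
	printf("map_range(0, 8) = %d\n", map_range(0, 8));	/* -EACCES */
	return 0;
}
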
@@ -2699,40 +2706,6 @@ out_release:
 }
 
 /*
- * This is like a special single-page "expand_{down|up}wards()",
- * except we must first make sure that 'address{-|+}PAGE_SIZE'
- * doesn't hit another vma.
- */
-static inline int check_stack_guard_page(struct vm_area_struct *vma, unsigned long address)
-{
-       address &= PAGE_MASK;
-       if ((vma->vm_flags & VM_GROWSDOWN) && address == vma->vm_start) {
-               struct vm_area_struct *prev = vma->vm_prev;
-
-               /*
-                * Is there a mapping abutting this one below?
-                *
-                * That's only ok if it's the same stack mapping
-                * that has gotten split..
-                */
-               if (prev && prev->vm_end == address)
-                       return prev->vm_flags & VM_GROWSDOWN ? 0 : -ENOMEM;
-
-               return expand_downwards(vma, address - PAGE_SIZE);
-       }
-       if ((vma->vm_flags & VM_GROWSUP) && address + PAGE_SIZE == vma->vm_end) {
-               struct vm_area_struct *next = vma->vm_next;
-
-               /* As VM_GROWSDOWN but s/below/above/ */
-               if (next && next->vm_start == address + PAGE_SIZE)
-                       return next->vm_flags & VM_GROWSUP ? 0 : -ENOMEM;
-
-               return expand_upwards(vma, address + PAGE_SIZE);
-       }
-       return 0;
-}
-
-/*
  * We enter with non-exclusive mmap_sem (to exclude vma changes,
  * but allow concurrent faults), and pte mapped but not yet locked.
  * We return with mmap_sem still held, but pte unmapped and unlocked.
@@ -2748,10 +2721,6 @@ static int do_anonymous_page(struct fault_env *fe)
        if (vma->vm_flags & VM_SHARED)
                return VM_FAULT_SIGBUS;
 
-       /* Check if we need to add a guard page to the stack */
-       if (check_stack_guard_page(vma, fe->address) < 0)
-               return VM_FAULT_SIGSEGV;
-
        /*
         * Use pte_alloc() instead of pte_alloc_map().  We can't run
         * pte_offset_map() on pmds where a huge pmd might be created
@@ -2854,6 +2823,28 @@ static int __do_fault(struct fault_env *fe, pgoff_t pgoff,
        struct vm_fault vmf;
        int ret;
 
+       /*
+        * Preallocate pte before we take page_lock because this might lead to
+        * deadlocks for memcg reclaim which waits for pages under writeback:
+        *                              lock_page(A)
+        *                              SetPageWriteback(A)
+        *                              unlock_page(A)
+        * lock_page(B)
+        *                              lock_page(B)
+        * pte_alloc_pne
+        *   shrink_page_list
+        *     wait_on_page_writeback(A)
+        *                              SetPageWriteback(B)
+        *                              unlock_page(B)
+        *                              # flush A, B to clear the writeback
+        */
+       if (pmd_none(*fe->pmd) && !fe->prealloc_pte) {
+               fe->prealloc_pte = pte_alloc_one(vma->vm_mm, fe->address);
+               if (!fe->prealloc_pte)
+                       return VM_FAULT_OOM;
+               smp_wmb(); /* See comment in __pte_alloc() */
+       }
+
        vmf.virtual_address = (void __user *)(fe->address & PAGE_MASK);
        vmf.pgoff = pgoff;
        vmf.flags = fe->flags;
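
The comment above boils down to a common ordering rule: do allocations that may block on reclaim before taking the page lock they could end up waiting behind. A hedged userspace sketch of the same "preallocate, then take the lock, then consume or discard" ordering; the names (fault_with_prealloc, page_lock) are illustrative, not the kernel API:

#include <errno.h>
#include <pthread.h>
#include <stdlib.h>

static pthread_mutex_t page_lock = PTHREAD_MUTEX_INITIALIZER;

/* Allocate while no lock is held, so the allocator may block (or trigger
 * reclaim) without anyone waiting on page_lock behind us. */
static int fault_with_prealloc(void **slot)
{
	void *prealloc = malloc(4096);	/* stand-in for pte_alloc_one() */
	if (!prealloc)
		return -ENOMEM;

	pthread_mutex_lock(&page_lock);
	if (*slot == NULL) {
		*slot = prealloc;	/* consume the preallocation */
		prealloc = NULL;
	}
	pthread_mutex_unlock(&page_lock);

	free(prealloc);			/* unused: give it back */
	return 0;
}

int main(void)
{
	void *slot = NULL;
	int ret = fault_with_prealloc(&slot);
	free(slot);
	return ret ? 1 : 0;
}
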
@@ -2885,6 +2876,17 @@ static int __do_fault(struct fault_env *fe, pgoff_t pgoff,
        return ret;
 }
 
+/*
+ * The ordering of these checks is important for pmds with _PAGE_DEVMAP set.
+ * If we check pmd_trans_unstable() first we will trip the bad_pmd() check
+ * inside of pmd_none_or_trans_huge_or_clear_bad(). This will end up correctly
+ * returning 1 but not before it spams dmesg with the pmd_clear_bad() output.
+ */
+static int pmd_devmap_trans_unstable(pmd_t *pmd)
+{
+       return pmd_devmap(*pmd) || pmd_trans_unstable(pmd);
+}
+
 static int pte_alloc_one_map(struct fault_env *fe)
 {
        struct vm_area_struct *vma = fe->vma;
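
pmd_devmap_trans_unstable() encodes an ordering constraint: the quiet pmd_devmap() test must run first so a devmap pmd never reaches the check that would log pmd_clear_bad() noise. A small illustrative sketch of the same short-circuit idea in plain C, using made-up predicates (is_special, is_unstable_noisy) rather than the kernel's:

#include <stdbool.h>
#include <stdio.h>

/* Quiet check: recognizes the special case without side effects. */
static bool is_special(int v) { return v < 0; }

/* Noisy check: warns on anything it does not understand, including the
 * special case, before reporting "unstable". */
static bool is_unstable_noisy(int v)
{
	if (v < 0 || v > 100) {
		fprintf(stderr, "bad value %d\n", v);	/* cf. pmd_clear_bad() spam */
		return true;
	}
	return v > 50;
}

/* Order matters: test the quiet predicate first so the special case never
 * reaches the noisy one, mirroring pmd_devmap() || pmd_trans_unstable(). */
static bool special_or_unstable(int v)
{
	return is_special(v) || is_unstable_noisy(v);
}

int main(void)
{
	printf("%d\n", special_or_unstable(-1));	/* 1, and no warning */
	return 0;
}
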
@@ -2908,18 +2910,27 @@ static int pte_alloc_one_map(struct fault_env *fe)
 map_pte:
        /*
         * If a huge pmd materialized under us just retry later.  Use
-        * pmd_trans_unstable() instead of pmd_trans_huge() to ensure the pmd
-        * didn't become pmd_trans_huge under us and then back to pmd_none, as
-        * a result of MADV_DONTNEED running immediately after a huge pmd fault
-        * in a different thread of this mm, in turn leading to a misleading
-        * pmd_trans_huge() retval.  All we have to ensure is that it is a
-        * regular pmd that we can walk with pte_offset_map() and we can do that
-        * through an atomic read in C, which is what pmd_trans_unstable()
-        * provides.
+        * pmd_trans_unstable() via pmd_devmap_trans_unstable() instead of
+        * pmd_trans_huge() to ensure the pmd didn't become pmd_trans_huge
+        * under us and then back to pmd_none, as a result of MADV_DONTNEED
+        * running immediately after a huge pmd fault in a different thread of
+        * this mm, in turn leading to a misleading pmd_trans_huge() retval.
+        * All we have to ensure is that it is a regular pmd that we can walk
+        * with pte_offset_map() and we can do that through an atomic read in
+        * C, which is what pmd_trans_unstable() provides.
         */
-       if (pmd_trans_unstable(fe->pmd) || pmd_devmap(*fe->pmd))
+       if (pmd_devmap_trans_unstable(fe->pmd))
                return VM_FAULT_NOPAGE;
 
+       /*
+        * At this point we know that our vmf->pmd points to a page of ptes
+        * and it cannot become pmd_none(), pmd_devmap() or pmd_trans_huge()
+        * for the duration of the fault.  If a racing MADV_DONTNEED runs and
+        * we zap the ptes pointed to by our vmf->pmd, the vmf->ptl will still
+        * be valid and we will re-check to make sure the vmf->pte isn't
+        * pte_none() under vmf->ptl protection when we return to
+        * alloc_set_pte().
+        */
        fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address,
                        &fe->ptl);
        return 0;
@@ -3318,15 +3329,24 @@ static int do_fault(struct fault_env *fe)
 {
        struct vm_area_struct *vma = fe->vma;
        pgoff_t pgoff = linear_page_index(vma, fe->address);
+       int ret;
 
        /* The VMA was not fully populated on mmap() or missing VM_DONTEXPAND */
        if (!vma->vm_ops->fault)
-               return VM_FAULT_SIGBUS;
-       if (!(fe->flags & FAULT_FLAG_WRITE))
-               return do_read_fault(fe, pgoff);
-       if (!(vma->vm_flags & VM_SHARED))
-               return do_cow_fault(fe, pgoff);
-       return do_shared_fault(fe, pgoff);
+               ret = VM_FAULT_SIGBUS;
+       else if (!(fe->flags & FAULT_FLAG_WRITE))
+               ret = do_read_fault(fe, pgoff);
+       else if (!(vma->vm_flags & VM_SHARED))
+               ret = do_cow_fault(fe, pgoff);
+       else
+               ret = do_shared_fault(fe, pgoff);
+
+       /* preallocated pagetable is unused: free it */
+       if (fe->prealloc_pte) {
+               pte_free(vma->vm_mm, fe->prealloc_pte);
+               fe->prealloc_pte = 0;
+       }
+       return ret;
 }
 
 static int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
@@ -3493,7 +3513,7 @@ static int handle_pte_fault(struct fault_env *fe)
                fe->pte = NULL;
        } else {
                /* See comment in pte_alloc_one_map() */
-               if (pmd_trans_unstable(fe->pmd) || pmd_devmap(*fe->pmd))
+               if (pmd_devmap_trans_unstable(fe->pmd))
                        return 0;
                /*
                 * A regular pmd is established and it can't morph into a huge
@@ -3633,6 +3653,11 @@ int handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
        /* do counter updates before entering really critical section. */
        check_sync_rss_stat(current);
 
+       if (!arch_vma_access_permitted(vma, flags & FAULT_FLAG_WRITE,
+                                           flags & FAULT_FLAG_INSTRUCTION,
+                                           flags & FAULT_FLAG_REMOTE))
+               return VM_FAULT_SIGSEGV;
+
        /*
         * Enable the memcg OOM handling for faults triggered in user
         * space.  Kernel faults are handled more gracefully.
@@ -3640,11 +3665,6 @@ int handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
        if (flags & FAULT_FLAG_USER)
                mem_cgroup_oom_enable();
 
-       if (!arch_vma_access_permitted(vma, flags & FAULT_FLAG_WRITE,
-                                           flags & FAULT_FLAG_INSTRUCTION,
-                                           flags & FAULT_FLAG_REMOTE))
-               return VM_FAULT_SIGSEGV;
-
        if (unlikely(is_vm_hugetlb_page(vma)))
                ret = hugetlb_fault(vma->vm_mm, vma, address, flags);
        else
@@ -3672,8 +3692,18 @@ int handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
         * further.
         */
        if (unlikely((current->flags & PF_KTHREAD) && !(ret & VM_FAULT_ERROR)
-                               && test_bit(MMF_UNSTABLE, &vma->vm_mm->flags)))
+                               && test_bit(MMF_UNSTABLE, &vma->vm_mm->flags))) {
+
+               /*
+                * We are going to enforce SIGBUS but the PF path might have
+                * dropped the mmap_sem already so take it again so that
+                * we do not break expectations of all arch specific PF paths
+                * and g-u-p
+                */
+               if (ret & VM_FAULT_RETRY)
+                       down_read(&vma->vm_mm->mmap_sem);
                ret = VM_FAULT_SIGBUS;
+       }
 
        return ret;
 }
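
The mmap_sem re-take above preserves the locking contract: a non-RETRY return means the caller's read lock is still held, so forcing VM_FAULT_SIGBUS after the fault path already dropped the lock requires taking it again first. A rough userspace sketch of that contract with a pthread rwlock; the names (fault_path, handle_fault, mmap_lock) are invented for illustration:

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_rwlock_t mmap_lock = PTHREAD_RWLOCK_INITIALIZER;

/* Stand-in for the fault path: may drop the read lock and report that it
 * did, the way VM_FAULT_RETRY reports a dropped mmap_sem. */
static bool fault_path(bool need_retry)
{
	if (need_retry)
		pthread_rwlock_unlock(&mmap_lock);
	return need_retry;
}

/* Contract: the caller took the read lock and unlocks it after we return,
 * unless we report "retry".  Overriding the outcome with an error after the
 * lock was dropped means re-taking it so the caller's unlock still balances. */
static int handle_fault(bool need_retry, bool force_error)
{
	bool dropped = fault_path(need_retry);

	if (force_error) {
		if (dropped)
			pthread_rwlock_rdlock(&mmap_lock);
		return -1;			/* cf. VM_FAULT_SIGBUS */
	}
	return dropped ? 1 : 0;			/* cf. VM_FAULT_RETRY */
}

int main(void)
{
	pthread_rwlock_rdlock(&mmap_lock);
	int ret = handle_fault(true, true);
	pthread_rwlock_unlock(&mmap_lock);	/* still balanced */
	printf("ret=%d\n", ret);
	return 0;
}
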
@@ -3853,6 +3883,9 @@ int generic_access_phys(struct vm_area_struct *vma, unsigned long addr,
                return -EINVAL;
 
        maddr = ioremap_prot(phys_addr, PAGE_ALIGN(len + offset), prot);
+       if (!maddr)
+               return -ENOMEM;
+
        if (write)
                memcpy_toio(maddr + offset, buf, len);
        else
@@ -3868,7 +3901,7 @@ EXPORT_SYMBOL_GPL(generic_access_phys);
  * Access another process' address space as given in mm.  If non-NULL, use the
  * given task for page fault accounting.
  */
-static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm,
+int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm,
                unsigned long addr, void *buf, int len, unsigned int gup_flags)
 {
        struct vm_area_struct *vma;