diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 3cae1dc..f32f73f 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -30,6 +30,7 @@
 #include <linux/userfaultfd_k.h>
 #include <linux/page_idle.h>
 #include <linux/shmem_fs.h>
+#include <linux/page_owner.h>
 
 #include <asm/tlb.h>
 #include <asm/pgalloc.h>
@@ -542,7 +543,8 @@ static int __do_huge_pmd_anonymous_page(struct fault_env *fe, struct page *page,
 
        VM_BUG_ON_PAGE(!PageCompound(page), page);
 
-       if (mem_cgroup_try_charge(page, vma->vm_mm, gfp, &memcg, true)) {
+       if (mem_cgroup_try_charge(page, vma->vm_mm, gfp | __GFP_NORETRY, &memcg,
+                                 true)) {
                put_page(page);
                count_vm_event(THP_FAULT_FALLBACK);
                return VM_FAULT_FALLBACK;
@@ -1060,7 +1062,7 @@ alloc:
        }
 
        if (unlikely(mem_cgroup_try_charge(new_page, vma->vm_mm,
-                                       huge_gfp, &memcg, true))) {
+                               huge_gfp | __GFP_NORETRY, &memcg, true))) {
                put_page(new_page);
                split_huge_pmd(vma, fe->pmd, fe->address);
                if (page)
@@ -1258,12 +1260,12 @@ int do_huge_pmd_numa_page(struct fault_env *fe, pmd_t pmd)
 
        /* Migration could have started since the pmd_trans_migrating check */
        if (!page_locked) {
+               page_nid = -1;
                if (!get_page_unless_zero(page))
                        goto out_unlock;
                spin_unlock(fe->ptl);
                wait_on_page_locked(page);
                put_page(page);
-               page_nid = -1;
                goto out;
        }
 
@@ -1444,7 +1446,7 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
 
 bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
                  unsigned long new_addr, unsigned long old_end,
-                 pmd_t *old_pmd, pmd_t *new_pmd, bool *need_flush)
+                 pmd_t *old_pmd, pmd_t *new_pmd)
 {
        spinlock_t *old_ptl, *new_ptl;
        pmd_t pmd;
@@ -1475,7 +1477,7 @@ bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
                if (new_ptl != old_ptl)
                        spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
                pmd = pmdp_huge_get_and_clear(mm, old_addr, old_pmd);
-               if (pmd_present(pmd) && pmd_dirty(pmd))
+               if (pmd_present(pmd))
                        force_flush = true;
                VM_BUG_ON(!pmd_none(*new_pmd));
 
@@ -1486,12 +1488,10 @@ bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
                        pgtable_trans_huge_deposit(mm, new_pmd, pgtable);
                }
                set_pmd_at(mm, new_addr, new_pmd, pmd_mksoft_dirty(pmd));
-               if (new_ptl != old_ptl)
-                       spin_unlock(new_ptl);
                if (force_flush)
                        flush_tlb_range(vma, old_addr, old_addr + PMD_SIZE);
-               else
-                       *need_flush = true;
+               if (new_ptl != old_ptl)
+                       spin_unlock(new_ptl);
                spin_unlock(old_ptl);
                return true;
        }
@@ -1509,37 +1509,69 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
 {
        struct mm_struct *mm = vma->vm_mm;
        spinlock_t *ptl;
-       int ret = 0;
+       pmd_t entry;
+       bool preserve_write;
+       int ret;
 
        ptl = __pmd_trans_huge_lock(pmd, vma);
-       if (ptl) {
-               pmd_t entry;
-               bool preserve_write = prot_numa && pmd_write(*pmd);
-               ret = 1;
+       if (!ptl)
+               return 0;
 
-               /*
-                * Avoid trapping faults against the zero page. The read-only
-                * data is likely to be read-cached on the local CPU and
-                * local/remote hits to the zero page are not interesting.
-                */
-               if (prot_numa && is_huge_zero_pmd(*pmd)) {
-                       spin_unlock(ptl);
-                       return ret;
-               }
+       preserve_write = prot_numa && pmd_write(*pmd);
+       ret = 1;
 
-               if (!prot_numa || !pmd_protnone(*pmd)) {
-                       entry = pmdp_huge_get_and_clear_notify(mm, addr, pmd);
-                       entry = pmd_modify(entry, newprot);
-                       if (preserve_write)
-                               entry = pmd_mkwrite(entry);
-                       ret = HPAGE_PMD_NR;
-                       set_pmd_at(mm, addr, pmd, entry);
-                       BUG_ON(vma_is_anonymous(vma) && !preserve_write &&
-                                       pmd_write(entry));
-               }
-               spin_unlock(ptl);
-       }
+       /*
+        * Avoid trapping faults against the zero page. The read-only
+        * data is likely to be read-cached on the local CPU and
+        * local/remote hits to the zero page are not interesting.
+        */
+       if (prot_numa && is_huge_zero_pmd(*pmd))
+               goto unlock;
+
+       if (prot_numa && pmd_protnone(*pmd))
+               goto unlock;
 
+       /*
+        * In the prot_numa case, we run under down_read(mmap_sem). It's
+        * critical not to clear the pmd even transiently, to avoid racing
+        * with MADV_DONTNEED, which also runs under down_read(mmap_sem):
+        *
+        *      CPU0:                           CPU1:
+        *                              change_huge_pmd(prot_numa=1)
+        *                               pmdp_huge_get_and_clear_notify()
+        * madvise_dontneed()
+        *  zap_pmd_range()
+        *   pmd_trans_huge(*pmd) == 0 (without ptl)
+        *   // skip the pmd
+        *                               set_pmd_at();
+        *                               // pmd is re-established
+        *
+        * The race makes MADV_DONTNEED miss the huge pmd and not clear it,
+        * which may break userspace.
+        *
+        * pmdp_invalidate() is required to make sure we don't miss
+        * dirty/young flags set by hardware.
+        */
+       entry = *pmd;
+       pmdp_invalidate(vma, addr, pmd);
+
+       /*
+        * Recover the dirty/young flags.  This relies on pmdp_invalidate()
+        * not corrupting them.
+        */
+       if (pmd_dirty(*pmd))
+               entry = pmd_mkdirty(entry);
+       if (pmd_young(*pmd))
+               entry = pmd_mkyoung(entry);
+
+       entry = pmd_modify(entry, newprot);
+       if (preserve_write)
+               entry = pmd_mkwrite(entry);
+       ret = HPAGE_PMD_NR;
+       set_pmd_at(mm, addr, pmd, entry);
+       BUG_ON(vma_is_anonymous(vma) && !preserve_write && pmd_write(entry));
+unlock:
+       spin_unlock(ptl);
        return ret;
 }
 
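The race diagrammed in the comment above is about preserving the userspace contract of madvise(MADV_DONTNEED): once the call returns, reads from a private anonymous range must observe zero-fill pages, never data from a pmd that was transiently cleared and then re-established. A minimal userspace sketch of that contract, assuming x86-64 with a 2MB PMD size (illustrative only, not part of the patch):

#include <assert.h>
#include <string.h>
#include <sys/mman.h>

int main(void)
{
        size_t len = 2UL << 20;         /* one PMD-sized (2MB) region */
        char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

        assert(p != MAP_FAILED);
        madvise(p, len, MADV_HUGEPAGE); /* hint: back the range with a THP */
        memset(p, 0xaa, len);           /* fault in and dirty the pages */

        assert(madvise(p, len, MADV_DONTNEED) == 0);
        /*
         * The guarantee the fix preserves: after MADV_DONTNEED returns,
         * the range reads back as zeroes, even if another thread raced
         * with a NUMA-balancing change_huge_pmd() on the same pmd.
         */
        assert(p[0] == 0 && p[len - 1] == 0);
        munmap(p, len);
        return 0;
}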
@@ -1609,6 +1641,8 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
                if (vma_is_dax(vma))
                        return;
                page = pmd_page(_pmd);
+               if (!PageDirty(page) && pmd_dirty(_pmd))
+                       set_page_dirty(page);
                if (!PageReferenced(page) && pmd_young(_pmd))
                        SetPageReferenced(page);
                page_remove_rmap(page, true);
@@ -1806,7 +1840,7 @@ void vma_adjust_trans_huge(struct vm_area_struct *vma,
        }
 }
 
-static void freeze_page(struct page *page)
+static void unmap_page(struct page *page)
 {
        enum ttu_flags ttu_flags = TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS |
                TTU_RMAP_LOCKED;
@@ -1829,7 +1863,7 @@ static void freeze_page(struct page *page)
        VM_BUG_ON_PAGE(ret, page + i - 1);
 }
 
-static void unfreeze_page(struct page *page)
+static void remap_page(struct page *page)
 {
        int i;
 
@@ -1843,26 +1877,13 @@ static void __split_huge_page_tail(struct page *head, int tail,
        struct page *page_tail = head + tail;
 
        VM_BUG_ON_PAGE(atomic_read(&page_tail->_mapcount) != -1, page_tail);
-       VM_BUG_ON_PAGE(page_ref_count(page_tail) != 0, page_tail);
 
        /*
-        * tail_page->_refcount is zero and not changing from under us. But
-        * get_page_unless_zero() may be running from under us on the
-        * tail_page. If we used atomic_set() below instead of atomic_inc() or
-        * atomic_add(), we would then run atomic_set() concurrently with
-        * get_page_unless_zero(), and atomic_set() is implemented in C not
-        * using locked ops. spin_unlock on x86 sometime uses locked ops
-        * because of PPro errata 66, 92, so unless somebody can guarantee
-        * atomic_set() here would be safe on all archs (and not only on x86),
-        * it's safer to use atomic_inc()/atomic_add().
+        * Clone page flags before unfreezing refcount.
+        *
+        * After a successful get_page_unless_zero() a flags change may
+        * follow, for example lock_page() setting PG_waiters.
         */
-       if (PageAnon(head)) {
-               page_ref_inc(page_tail);
-       } else {
-               /* Additional pin to radix tree */
-               page_ref_add(page_tail, 2);
-       }
-
        page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
        page_tail->flags |= (head->flags &
                        ((1L << PG_referenced) |
@@ -1874,36 +1895,42 @@ static void __split_huge_page_tail(struct page *head, int tail,
                         (1L << PG_unevictable) |
                         (1L << PG_dirty)));
 
-       /*
-        * After clearing PageTail the gup refcount can be released.
-        * Page flags also must be visible before we make the page non-compound.
-        */
+       /* ->mapping in first tail page is compound_mapcount */
+       VM_BUG_ON_PAGE(tail > 2 && page_tail->mapping != TAIL_MAPPING,
+                       page_tail);
+       page_tail->mapping = head->mapping;
+       page_tail->index = head->index + tail;
+
+       /* Page flags must be visible before we make the page non-compound. */
        smp_wmb();
 
+       /*
+        * Clear PageTail before unfreezing page refcount.
+        *
+        * After a successful get_page_unless_zero() a put_page() may
+        * follow, which needs a correct compound_head().
+        */
        clear_compound_head(page_tail);
 
+       /* Finally unfreeze refcount. Additional reference from page cache. */
+       page_ref_unfreeze(page_tail, 1 + (!PageAnon(head) ||
+                                         PageSwapCache(head)));
+
        if (page_is_young(head))
                set_page_young(page_tail);
        if (page_is_idle(head))
                set_page_idle(page_tail);
 
-       /* ->mapping in first tail page is compound_mapcount */
-       VM_BUG_ON_PAGE(tail > 2 && page_tail->mapping != TAIL_MAPPING,
-                       page_tail);
-       page_tail->mapping = head->mapping;
-
-       page_tail->index = head->index + tail;
        page_cpupid_xchg_last(page_tail, page_cpupid_last(head));
        lru_add_page_tail(head, page_tail, lruvec, list);
 }
 
 static void __split_huge_page(struct page *page, struct list_head *list,
-               unsigned long flags)
+               pgoff_t end, unsigned long flags)
 {
        struct page *head = compound_head(page);
        struct zone *zone = page_zone(head);
        struct lruvec *lruvec;
-       pgoff_t end = -1;
        int i;
 
        lruvec = mem_cgroup_page_lruvec(head, zone->zone_pgdat);
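The reordering in __split_huge_page_tail() above follows a general publication pattern: while the refcount is frozen at zero, get_page_unless_zero() cannot pin the tail page, so its flags, ->mapping and ->index can be rewritten safely; the release store in page_ref_unfreeze() then publishes them to any reader that takes a reference afterwards. A stand-alone C11 sketch of the pattern (names here are illustrative, not kernel API):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct obj {
        unsigned long flags;    /* fields to publish */
        atomic_int refcount;    /* 0 means frozen: no new references */
};

/* Analogue of get_page_unless_zero(): pin only unfrozen objects. */
static bool get_unless_zero(struct obj *o)
{
        int ref = atomic_load_explicit(&o->refcount, memory_order_relaxed);

        while (ref > 0)
                if (atomic_compare_exchange_weak_explicit(&o->refcount,
                                &ref, ref + 1,
                                memory_order_acquire, memory_order_relaxed))
                        return true;
        return false;   /* frozen: caller must not touch *o */
}

/* Analogue of page_ref_unfreeze(): publish fields, then open the gate. */
static void unfreeze(struct obj *o, int count)
{
        /* Release: every field written while frozen is visible to any
         * reader whose get_unless_zero() succeeds after this store. */
        atomic_store_explicit(&o->refcount, count, memory_order_release);
}

int main(void)
{
        struct obj o = { .flags = 0, .refcount = 0 };

        o.flags = 0xabcUL;      /* safe: frozen, nobody can pin the object */
        unfreeze(&o, 1);
        printf("pinned=%d flags=%#lx\n", get_unless_zero(&o), o.flags);
        return 0;
}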
@@ -1911,9 +1938,6 @@ static void __split_huge_page(struct page *page, struct list_head *list,
        /* complete memcg works before add pages to LRU */
        mem_cgroup_split_huge_fixup(head);
 
-       if (!PageAnon(page))
-               end = DIV_ROUND_UP(i_size_read(head->mapping->host), PAGE_SIZE);
-
        for (i = HPAGE_PMD_NR - 1; i >= 1; i--) {
                __split_huge_page_tail(head, i, lruvec, list);
                /* Some pages can be beyond i_size: drop them from page cache */
@@ -1927,6 +1951,9 @@ static void __split_huge_page(struct page *page, struct list_head *list,
        }
 
        ClearPageCompound(head);
+
+       split_page_owner(head, HPAGE_PMD_ORDER);
+
        /* See comment in __split_huge_page_tail() */
        if (PageAnon(head)) {
                page_ref_inc(head);
@@ -1938,7 +1965,7 @@ static void __split_huge_page(struct page *page, struct list_head *list,
 
        spin_unlock_irqrestore(zone_lru_lock(page_zone(head)), flags);
 
-       unfreeze_page(head);
+       remap_page(head);
 
        for (i = 0; i < HPAGE_PMD_NR; i++) {
                struct page *subpage = head + i;
@@ -2066,6 +2093,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
        int count, mapcount, extra_pins, ret;
        bool mlocked;
        unsigned long flags;
+       pgoff_t end;
 
        VM_BUG_ON_PAGE(is_huge_zero_page(page), page);
        VM_BUG_ON_PAGE(!PageLocked(page), page);
@@ -2087,6 +2115,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
                        goto out;
                }
                extra_pins = 0;
+               end = -1;
                mapping = NULL;
                anon_vma_lock_write(anon_vma);
        } else {
@@ -2102,10 +2131,19 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
                extra_pins = HPAGE_PMD_NR;
                anon_vma = NULL;
                i_mmap_lock_read(mapping);
+
+               /*
+                * __split_huge_page() may need to trim off pages beyond EOF:
+                * but on 32-bit, i_size_read() takes an irq-unsafe seqlock,
+                * which cannot be nested inside the page tree lock. So note
+                * end now: i_size itself may be changed at any moment, but
+                * the head page lock is good enough to serialize the trimming.
+                */
+               end = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE);
        }
 
        /*
-        * Racy check if we can split the page, before freeze_page() will
+        * Racy check if we can split the page, before unmap_page() will
         * split PMDs
         */
        if (total_mapcount(head) != page_count(head) - extra_pins - 1) {
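The reason end must be computed before taking the tree lock: on 32-bit, i_size_read() is a seqlock read, and a seqlock reader loops until it observes a stable (even) sequence count. If the reader runs where the writer can never make progress, for example nested inside an irq-unsafe lock the writer needs, it spins forever, which is exactly what lockdep flags. A simplified userspace seqlock sketch, ignoring the kernel's exact barrier choices (illustrative only):

#include <stdatomic.h>
#include <stdio.h>

static atomic_uint seq; /* odd while a writer is mid-update */
static long long isize; /* 64-bit value a 32-bit CPU can't load atomically */

static void seq_write(long long v)
{
        atomic_fetch_add_explicit(&seq, 1, memory_order_relaxed); /* odd */
        atomic_thread_fence(memory_order_release);
        isize = v;
        atomic_thread_fence(memory_order_release);
        atomic_fetch_add_explicit(&seq, 1, memory_order_relaxed); /* even */
}

static long long seq_read(void)
{
        unsigned int s;
        long long v;

        do {
                /* Spin while a write is in flight. If the writer is
                 * blocked behind this reader, this never terminates:
                 * the deadlock the comment above is avoiding. */
                while ((s = atomic_load_explicit(&seq,
                                        memory_order_acquire)) & 1)
                        ;
                v = isize;
                atomic_thread_fence(memory_order_acquire);
        } while (atomic_load_explicit(&seq, memory_order_relaxed) != s);

        return v;
}

int main(void)
{
        seq_write(5LL << 30);                   /* i_size = 5GB */
        printf("i_size=%lld\n", seq_read());
        return 0;
}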
@@ -2114,7 +2152,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
        }
 
        mlocked = PageMlocked(page);
-       freeze_page(head);
+       unmap_page(head);
        VM_BUG_ON_PAGE(compound_mapcount(head), head);
 
        /* Make sure the page is not on per-CPU pagevec as it takes pin */
@@ -2151,7 +2189,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
                if (mapping)
                        __dec_node_page_state(page, NR_SHMEM_THPS);
                spin_unlock(&pgdata->split_queue_lock);
-               __split_huge_page(page, list, flags);
+               __split_huge_page(page, list, end, flags);
                ret = 0;
        } else {
                if (IS_ENABLED(CONFIG_DEBUG_VM) && mapcount) {
@@ -2166,7 +2204,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
 fail:          if (mapping)
                        spin_unlock(&mapping->tree_lock);
                spin_unlock_irqrestore(zone_lru_lock(page_zone(head)), flags);
-               unfreeze_page(head);
+               remap_page(head);
                ret = -EBUSY;
        }
 
@@ -2247,11 +2285,13 @@ static unsigned long deferred_split_scan(struct shrinker *shrink,
 
        list_for_each_safe(pos, next, &list) {
                page = list_entry((void *)pos, struct page, mapping);
-               lock_page(page);
+               if (!trylock_page(page))
+                       goto next;
                /* split_huge_page() removes page from list on success */
                if (!split_huge_page(page))
                        split++;
                unlock_page(page);
+next:
                put_page(page);
        }