diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 3cae1dc..f32f73f 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -30,6 +30,7 @@
 #include <linux/userfaultfd_k.h>
 #include <linux/page_idle.h>
 #include <linux/shmem_fs.h>
+#include <linux/page_owner.h>
 
 #include <asm/tlb.h>
 #include <asm/pgalloc.h>
@@ -542,7 +543,8 @@ static int __do_huge_pmd_anonymous_page(struct fault_env *fe, struct page *page,
 
        VM_BUG_ON_PAGE(!PageCompound(page), page);
 
-       if (mem_cgroup_try_charge(page, vma->vm_mm, gfp, &memcg, true)) {
+       if (mem_cgroup_try_charge(page, vma->vm_mm, gfp | __GFP_NORETRY, &memcg,
+                                 true)) {
                put_page(page);
                count_vm_event(THP_FAULT_FALLBACK);
                return VM_FAULT_FALLBACK;
@@ -1060,7 +1062,7 @@ alloc:
        }
 
        if (unlikely(mem_cgroup_try_charge(new_page, vma->vm_mm,
-                                       huge_gfp, &memcg, true))) {
+                               huge_gfp | __GFP_NORETRY, &memcg, true))) {
                put_page(new_page);
                split_huge_pmd(vma, fe->pmd, fe->address);
                if (page)
@@ -1258,12 +1260,12 @@ int do_huge_pmd_numa_page(struct fault_env *fe, pmd_t pmd)
 
        /* Migration could have started since the pmd_trans_migrating check */
        if (!page_locked) {
+               page_nid = -1;
                if (!get_page_unless_zero(page))
                        goto out_unlock;
                spin_unlock(fe->ptl);
                wait_on_page_locked(page);
                put_page(page);
-               page_nid = -1;
                goto out;
        }
 
@@ -1444,7 +1446,7 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
 
 bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
                  unsigned long new_addr, unsigned long old_end,
-                 pmd_t *old_pmd, pmd_t *new_pmd, bool *need_flush)
+                 pmd_t *old_pmd, pmd_t *new_pmd)
 {
        spinlock_t *old_ptl, *new_ptl;
        pmd_t pmd;
@@ -1475,7 +1477,7 @@ bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
                if (new_ptl != old_ptl)
                        spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
                pmd = pmdp_huge_get_and_clear(mm, old_addr, old_pmd);
-               if (pmd_present(pmd) && pmd_dirty(pmd))
+               if (pmd_present(pmd))
                        force_flush = true;
                VM_BUG_ON(!pmd_none(*new_pmd));
 
@@ -1486,12 +1488,10 @@ bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
                        pgtable_trans_huge_deposit(mm, new_pmd, pgtable);
                }
                set_pmd_at(mm, new_addr, new_pmd, pmd_mksoft_dirty(pmd));
-               if (new_ptl != old_ptl)
-                       spin_unlock(new_ptl);
                if (force_flush)
                        flush_tlb_range(vma, old_addr, old_addr + PMD_SIZE);
-               else
-                       *need_flush = true;
+               if (new_ptl != old_ptl)
+                       spin_unlock(new_ptl);
                spin_unlock(old_ptl);
                return true;
        }
@@ -1509,37 +1509,69 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
 {
        struct mm_struct *mm = vma->vm_mm;
        spinlock_t *ptl;
-       int ret = 0;
+       pmd_t entry;
+       bool preserve_write;
+       int ret;
 
        ptl = __pmd_trans_huge_lock(pmd, vma);
-       if (ptl) {
-               pmd_t entry;
-               bool preserve_write = prot_numa && pmd_write(*pmd);
-               ret = 1;
+       if (!ptl)
+               return 0;
 
-               /*
-                * Avoid trapping faults against the zero page. The read-only
-                * data is likely to be read-cached on the local CPU and
-                * local/remote hits to the zero page are not interesting.
-                */
-               if (prot_numa && is_huge_zero_pmd(*pmd)) {
-                       spin_unlock(ptl);
-                       return ret;
-               }
+       preserve_write = prot_numa && pmd_write(*pmd);
+       ret = 1;
 
-               if (!prot_numa || !pmd_protnone(*pmd)) {
-                       entry = pmdp_huge_get_and_clear_notify(mm, addr, pmd);
-                       entry = pmd_modify(entry, newprot);
-                       if (preserve_write)
-                               entry = pmd_mkwrite(entry);
-                       ret = HPAGE_PMD_NR;
-                       set_pmd_at(mm, addr, pmd, entry);
-                       BUG_ON(vma_is_anonymous(vma) && !preserve_write &&
-                                       pmd_write(entry));
-               }
-               spin_unlock(ptl);
-       }
+       /*
+        * Avoid trapping faults against the zero page. The read-only
+        * data is likely to be read-cached on the local CPU and
+        * local/remote hits to the zero page are not interesting.
+        */
+       if (prot_numa && is_huge_zero_pmd(*pmd))
+               goto unlock;
+
+       if (prot_numa && pmd_protnone(*pmd))
+               goto unlock;
 
+       /*
+        * In the prot_numa case, we run under down_read(mmap_sem). It's
+        * critical not to clear the pmd even transiently, to avoid racing
+        * with MADV_DONTNEED, which also runs under down_read(mmap_sem):
+        *
+        *      CPU0:                           CPU1:
+        *                              change_huge_pmd(prot_numa=1)
+        *                               pmdp_huge_get_and_clear_notify()
+        * madvise_dontneed()
+        *  zap_pmd_range()
+        *   pmd_trans_huge(*pmd) == 0 (without ptl)
+        *   // skip the pmd
+        *                               set_pmd_at();
+        *                               // pmd is re-established
+        *
+        * The race makes MADV_DONTNEED miss the huge pmd and not clear it,
+        * which may break userspace.
+        *
+        * pmdp_invalidate() is required to make sure we don't miss
+        * dirty/young flags set by hardware.
+        */
+       entry = *pmd;
+       pmdp_invalidate(vma, addr, pmd);
+
+       /*
+        * Recover the dirty/young flags.  This relies on pmdp_invalidate()
+        * not corrupting them.
+        */
+       if (pmd_dirty(*pmd))
+               entry = pmd_mkdirty(entry);
+       if (pmd_young(*pmd))
+               entry = pmd_mkyoung(entry);
+
+       entry = pmd_modify(entry, newprot);
+       if (preserve_write)
+               entry = pmd_mkwrite(entry);
+       ret = HPAGE_PMD_NR;
+       set_pmd_at(mm, addr, pmd, entry);
+       BUG_ON(vma_is_anonymous(vma) && !preserve_write && pmd_write(entry));
+unlock:
+       spin_unlock(ptl);
        return ret;
 }
 
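The race diagrammed in the comment above is about preserving the userspace contract of madvise(MADV_DONTNEED): once the call returns, reads from a private anonymous range must observe zero-fill pages, never data from a pmd that was transiently cleared and then re-established. A minimal userspace sketch of that contract, assuming x86-64 with a 2MB PMD size (illustrative only, not part of the patch):

#include <assert.h>
#include <string.h>
#include <sys/mman.h>

int main(void)
{
        size_t len = 2UL << 20;         /* one PMD-sized (2MB) region */
        char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

        assert(p != MAP_FAILED);
        madvise(p, len, MADV_HUGEPAGE); /* hint: back the range with a THP */
        memset(p, 0xaa, len);           /* fault in and dirty the pages */

        assert(madvise(p, len, MADV_DONTNEED) == 0);
        /*
         * The guarantee the fix preserves: after MADV_DONTNEED returns,
         * the range reads back as zeroes, even if another thread raced
         * with a NUMA-balancing change_huge_pmd() on the same pmd.
         */
        assert(p[0] == 0 && p[len - 1] == 0);
        munmap(p, len);
        return 0;
}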
@@ -1609,6 +1641,8 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
                if (vma_is_dax(vma))
                        return;
                page = pmd_page(_pmd);
+               if (!PageDirty(page) && pmd_dirty(_pmd))
+                       set_page_dirty(page);
                if (!PageReferenced(page) && pmd_young(_pmd))
                        SetPageReferenced(page);
                page_remove_rmap(page, true);
@@ -1806,7 +1840,7 @@ void vma_adjust_trans_huge(struct vm_area_struct *vma,
        }
 }
 
-static void freeze_page(struct page *page)
+static void unmap_page(struct page *page)
 {
        enum ttu_flags ttu_flags = TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS |
                TTU_RMAP_LOCKED;
@@ -1829,7 +1863,7 @@ static void freeze_page(struct page *page)
        VM_BUG_ON_PAGE(ret, page + i - 1);
 }
 
-static void unfreeze_page(struct page *page)
+static void remap_page(struct page *page)
 {
        int i;
 
@@ -1843,26 +1877,13 @@ static void __split_huge_page_tail(struct page *head, int tail,
        struct page *page_tail = head + tail;
 
        VM_BUG_ON_PAGE(atomic_read(&page_tail->_mapcount) != -1, page_tail);
-       VM_BUG_ON_PAGE(page_ref_count(page_tail) != 0, page_tail);
 
        /*
-        * tail_page->_refcount is zero and not changing from under us. But
-        * get_page_unless_zero() may be running from under us on the
-        * tail_page. If we used atomic_set() below instead of atomic_inc() or
-        * atomic_add(), we would then run atomic_set() concurrently with
-        * get_page_unless_zero(), and atomic_set() is implemented in C not
-        * using locked ops. spin_unlock on x86 sometime uses locked ops
-        * because of PPro errata 66, 92, so unless somebody can guarantee
-        * atomic_set() here would be safe on all archs (and not only on x86),
-        * it's safer to use atomic_inc()/atomic_add().
+        * Clone page flags before unfreezing refcount.
+        *
+        * After a successful get_page_unless_zero() a flags change may
+        * follow, for example lock_page() setting PG_waiters.
         */
-       if (PageAnon(head)) {
-               page_ref_inc(page_tail);
-       } else {
-               /* Additional pin to radix tree */
-               page_ref_add(page_tail, 2);
-       }
-
        page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
        page_tail->flags |= (head->flags &
                        ((1L << PG_referenced) |
@@ -1874,36 +1895,42 @@ static void __split_huge_page_tail(struct page *head, int tail,
                         (1L << PG_unevictable) |
                         (1L << PG_dirty)));
 
-       /*
-        * After clearing PageTail the gup refcount can be released.
-        * Page flags also must be visible before we make the page non-compound.
-        */
+       /* ->mapping in first tail page is compound_mapcount */
+       VM_BUG_ON_PAGE(tail > 2 && page_tail->mapping != TAIL_MAPPING,
+                       page_tail);
+       page_tail->mapping = head->mapping;
+       page_tail->index = head->index + tail;
+
+       /* Page flags must be visible before we make the page non-compound. */
        smp_wmb();
 
+       /*
+        * Clear PageTail before unfreezing page refcount.
+        *
+        * After a successful get_page_unless_zero() a put_page() may
+        * follow, which needs a correct compound_head().
+        */
        clear_compound_head(page_tail);
 
+       /* Finally unfreeze refcount. Additional reference from page cache. */
+       page_ref_unfreeze(page_tail, 1 + (!PageAnon(head) ||
+                                         PageSwapCache(head)));
+
        if (page_is_young(head))
                set_page_young(page_tail);
        if (page_is_idle(head))
                set_page_idle(page_tail);
 
-       /* ->mapping in first tail page is compound_mapcount */
-       VM_BUG_ON_PAGE(tail > 2 && page_tail->mapping != TAIL_MAPPING,
-                       page_tail);
-       page_tail->mapping = head->mapping;
-
-       page_tail->index = head->index + tail;
        page_cpupid_xchg_last(page_tail, page_cpupid_last(head));
        lru_add_page_tail(head, page_tail, lruvec, list);
 }
 
 static void __split_huge_page(struct page *page, struct list_head *list,
-               unsigned long flags)
+               pgoff_t end, unsigned long flags)
 {
        struct page *head = compound_head(page);
        struct zone *zone = page_zone(head);
        struct lruvec *lruvec;
-       pgoff_t end = -1;
        int i;
 
        lruvec = mem_cgroup_page_lruvec(head, zone->zone_pgdat);
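The reordering in __split_huge_page_tail() above follows a general publication pattern: while the refcount is frozen at zero, get_page_unless_zero() cannot pin the tail page, so its flags, ->mapping and ->index can be rewritten safely; the release store in page_ref_unfreeze() then publishes them to any reader that takes a reference afterwards. A stand-alone C11 sketch of the pattern (names here are illustrative, not kernel API):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct obj {
        unsigned long flags;    /* fields to publish */
        atomic_int refcount;    /* 0 means frozen: no new references */
};

/* Analogue of get_page_unless_zero(): pin only unfrozen objects. */
static bool get_unless_zero(struct obj *o)
{
        int ref = atomic_load_explicit(&o->refcount, memory_order_relaxed);

        while (ref > 0)
                if (atomic_compare_exchange_weak_explicit(&o->refcount,
                                &ref, ref + 1,
                                memory_order_acquire, memory_order_relaxed))
                        return true;
        return false;   /* frozen: caller must not touch *o */
}

/* Analogue of page_ref_unfreeze(): publish fields, then open the gate. */
static void unfreeze(struct obj *o, int count)
{
        /* Release: every field written while frozen is visible to any
         * reader whose get_unless_zero() succeeds after this store. */
        atomic_store_explicit(&o->refcount, count, memory_order_release);
}

int main(void)
{
        struct obj o = { .flags = 0, .refcount = 0 };

        o.flags = 0xabcUL;      /* safe: frozen, nobody can pin the object */
        unfreeze(&o, 1);
        printf("pinned=%d flags=%#lx\n", get_unless_zero(&o), o.flags);
        return 0;
}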
@@ -1911,9 +1938,6 @@ static void __split_huge_page(struct page *page, struct list_head *list,
        /* complete memcg works before add pages to LRU */
        mem_cgroup_split_huge_fixup(head);
 
-       if (!PageAnon(page))
-               end = DIV_ROUND_UP(i_size_read(head->mapping->host), PAGE_SIZE);
-
        for (i = HPAGE_PMD_NR - 1; i >= 1; i--) {
                __split_huge_page_tail(head, i, lruvec, list);
                /* Some pages can be beyond i_size: drop them from page cache */
@@ -1927,6 +1951,9 @@ static void __split_huge_page(struct page *page, struct list_head *list,
        }
 
        ClearPageCompound(head);
+
+       split_page_owner(head, HPAGE_PMD_ORDER);
+
        /* See comment in __split_huge_page_tail() */
        if (PageAnon(head)) {
                page_ref_inc(head);
@@ -1938,7 +1965,7 @@ static void __split_huge_page(struct page *page, struct list_head *list,
 
        spin_unlock_irqrestore(zone_lru_lock(page_zone(head)), flags);
 
-       unfreeze_page(head);
+       remap_page(head);
 
        for (i = 0; i < HPAGE_PMD_NR; i++) {
                struct page *subpage = head + i;
@@ -2066,6 +2093,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
        int count, mapcount, extra_pins, ret;
        bool mlocked;
        unsigned long flags;
+       pgoff_t end;
 
        VM_BUG_ON_PAGE(is_huge_zero_page(page), page);
        VM_BUG_ON_PAGE(!PageLocked(page), page);
@@ -2087,6 +2115,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
                        goto out;
                }
                extra_pins = 0;
+               end = -1;
                mapping = NULL;
                anon_vma_lock_write(anon_vma);
        } else {
@@ -2102,10 +2131,19 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
                extra_pins = HPAGE_PMD_NR;
                anon_vma = NULL;
                i_mmap_lock_read(mapping);
+
+               /*
+                * __split_huge_page() may need to trim off pages beyond EOF:
+                * but on 32-bit, i_size_read() takes an irq-unsafe seqlock,
+                * which cannot be nested inside the page tree lock. So note
+                * end now: i_size itself may be changed at any moment, but
+                * the head page lock is good enough to serialize the trimming.
+                */
+               end = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE);
        }
 
        /*
-        * Racy check if we can split the page, before freeze_page() will
+        * Racy check if we can split the page, before unmap_page() will
         * split PMDs
         */
        if (total_mapcount(head) != page_count(head) - extra_pins - 1) {
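The reason end must be computed before taking the tree lock: on 32-bit, i_size_read() is a seqlock read, and a seqlock reader loops until it observes a stable (even) sequence count. If the reader runs where the writer can never make progress, for example nested inside an irq-unsafe lock the writer needs, it spins forever, which is exactly what lockdep flags. A simplified userspace seqlock sketch, ignoring the kernel's exact barrier choices (illustrative only):

#include <stdatomic.h>
#include <stdio.h>

static atomic_uint seq; /* odd while a writer is mid-update */
static long long isize; /* 64-bit value a 32-bit CPU can't load atomically */

static void seq_write(long long v)
{
        atomic_fetch_add_explicit(&seq, 1, memory_order_relaxed); /* odd */
        atomic_thread_fence(memory_order_release);
        isize = v;
        atomic_thread_fence(memory_order_release);
        atomic_fetch_add_explicit(&seq, 1, memory_order_relaxed); /* even */
}

static long long seq_read(void)
{
        unsigned int s;
        long long v;

        do {
                /* Spin while a write is in flight. If the writer is
                 * blocked behind this reader, this never terminates:
                 * the deadlock the comment above is avoiding. */
                while ((s = atomic_load_explicit(&seq,
                                        memory_order_acquire)) & 1)
                        ;
                v = isize;
                atomic_thread_fence(memory_order_acquire);
        } while (atomic_load_explicit(&seq, memory_order_relaxed) != s);

        return v;
}

int main(void)
{
        seq_write(5LL << 30);                   /* i_size = 5GB */
        printf("i_size=%lld\n", seq_read());
        return 0;
}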
@@ -2114,7 +2152,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
        }
 
        mlocked = PageMlocked(page);
-       freeze_page(head);
+       unmap_page(head);
        VM_BUG_ON_PAGE(compound_mapcount(head), head);
 
        /* Make sure the page is not on per-CPU pagevec as it takes pin */
@@ -2151,7 +2189,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
                if (mapping)
                        __dec_node_page_state(page, NR_SHMEM_THPS);
                spin_unlock(&pgdata->split_queue_lock);
-               __split_huge_page(page, list, flags);
+               __split_huge_page(page, list, end, flags);
                ret = 0;
        } else {
                if (IS_ENABLED(CONFIG_DEBUG_VM) && mapcount) {
@@ -2166,7 +2204,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
 fail:          if (mapping)
                        spin_unlock(&mapping->tree_lock);
                spin_unlock_irqrestore(zone_lru_lock(page_zone(head)), flags);
-               unfreeze_page(head);
+               remap_page(head);
                ret = -EBUSY;
        }
 
@@ -2247,11 +2285,13 @@ static unsigned long deferred_split_scan(struct shrinker *shrink,
 
        list_for_each_safe(pos, next, &list) {
                page = list_entry((void *)pos, struct page, mapping);
-               lock_page(page);
+               if (!trylock_page(page))
+                       goto next;
                /* split_huge_page() removes page from list on success */
                if (!split_huge_page(page))
                        split++;
                unlock_page(page);
+next:
                put_page(page);
        }