hugetlbfs: fix kernel BUG at fs/hugetlbfs/inode.c:444!

[sagit-ice-cold/kernel_xiaomi_msm8998.git] / mm / huge_memory.c
diff --git a/mm/huge_memory.c b/mm/huge_memory.c

index 62fe06b..465786c 100644 (file)
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1269,6 +1269,16 @@ out_unlock:
         return ret;
  }
  
+/*
+ * FOLL_FORCE can write to even unwritable pmds, but only after we've gone
+ * through a COW cycle (FOLL_COW is set) and the pmd is dirty.
+ */
+static inline bool can_follow_write_pmd(pmd_t pmd, unsigned int flags)
+{
+       return pmd_write(pmd) ||
+              ((flags & FOLL_FORCE) && (flags & FOLL_COW) && pmd_dirty(pmd));
+}
+
  struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
                                    unsigned long addr,
                                    pmd_t *pmd,
@@ -1279,7 +1289,7 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
  
         assert_spin_locked(pmd_lockptr(mm, pmd));
  
-       if (flags & FOLL_WRITE && !pmd_write(*pmd))
+       if (flags & FOLL_WRITE && !can_follow_write_pmd(*pmd, flags))
                 goto out;
  
         /* Avoid dumping huge zero page */
@@ -1294,17 +1304,11 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
         VM_BUG_ON_PAGE(!PageHead(page), page);
         if (flags & FOLL_TOUCH) {
                 pmd_t _pmd;
-               /*
-                * We should set the dirty bit only for FOLL_WRITE but
-                * for now the dirty bit in the pmd is meaningless.
-                * And if the dirty bit will become meaningful and
-                * we'll only set it with FOLL_WRITE, an atomic
-                * set_bit will be required on the pmd to set the
-                * young bit, instead of the current set_pmd_at.
-                */
-               _pmd = pmd_mkyoung(pmd_mkdirty(*pmd));
+               _pmd = pmd_mkyoung(*pmd);
+               if (flags & FOLL_WRITE)
+                       _pmd = pmd_mkdirty(_pmd);
                 if (pmdp_set_access_flags(vma, addr & HPAGE_PMD_MASK,
-                                         pmd, _pmd,  1))
+                                         pmd, _pmd, flags & FOLL_WRITE))
                         update_mmu_cache_pmd(vma, addr, pmd);
         }
         if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
@@ -1353,8 +1357,11 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
          */
         if (unlikely(pmd_trans_migrating(*pmdp))) {
                 page = pmd_page(*pmdp);
+               if (!get_page_unless_zero(page))
+                       goto out_unlock;
                 spin_unlock(ptl);
                 wait_on_page_locked(page);
+               put_page(page);
                 goto out;
         }
  
@@ -1386,9 +1393,12 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
  
         /* Migration could have started since the pmd_trans_migrating check */
         if (!page_locked) {
+               page_nid = -1;
+               if (!get_page_unless_zero(page))
+                       goto out_unlock;
                 spin_unlock(ptl);
                 wait_on_page_locked(page);
-               page_nid = -1;
+               put_page(page);
                 goto out;
         }
  
@@ -1501,7 +1511,7 @@ int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma,
         spinlock_t *old_ptl, *new_ptl;
         int ret = 0;
         pmd_t pmd;
-
+       bool force_flush = false;
         struct mm_struct *mm = vma->vm_mm;
  
         if ((old_addr & ~HPAGE_PMD_MASK) ||
@@ -1529,6 +1539,8 @@ int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma,
                 if (new_ptl != old_ptl)
                         spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
                 pmd = pmdp_huge_get_and_clear(mm, old_addr, old_pmd);
+               if (pmd_present(pmd))
+                       force_flush = true;
                 VM_BUG_ON(!pmd_none(*new_pmd));
  
                 if (pmd_move_must_withdraw(new_ptl, old_ptl)) {
@@ -1537,6 +1549,8 @@ int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma,
                         pgtable_trans_huge_deposit(mm, new_pmd, pgtable);
                 }
                 set_pmd_at(mm, new_addr, new_pmd, pmd_mksoft_dirty(pmd));
+               if (force_flush)
+                       flush_tlb_range(vma, old_addr, old_addr + PMD_SIZE);
                 if (new_ptl != old_ptl)
                         spin_unlock(new_ptl);
                 spin_unlock(old_ptl);
@@ -1556,35 +1570,69 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
  {
         struct mm_struct *mm = vma->vm_mm;
         spinlock_t *ptl;
+       pmd_t entry;
+       bool preserve_write;
+
         int ret = 0;
  
-       if (__pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
-               pmd_t entry;
-               bool preserve_write = prot_numa && pmd_write(*pmd);
-               ret = 1;
+       if (__pmd_trans_huge_lock(pmd, vma, &ptl) != 1)
+               return 0;
  
-               /*
-                * Avoid trapping faults against the zero page. The read-only
-                * data is likely to be read-cached on the local CPU and
-                * local/remote hits to the zero page are not interesting.
-                */
-               if (prot_numa && is_huge_zero_pmd(*pmd)) {
-                       spin_unlock(ptl);
-                       return ret;
-               }
+       preserve_write = prot_numa && pmd_write(*pmd);
+       ret = 1;
  
-               if (!prot_numa || !pmd_protnone(*pmd)) {
-                       entry = pmdp_huge_get_and_clear_notify(mm, addr, pmd);
-                       entry = pmd_modify(entry, newprot);
-                       if (preserve_write)
-                               entry = pmd_mkwrite(entry);
-                       ret = HPAGE_PMD_NR;
-                       set_pmd_at(mm, addr, pmd, entry);
-                       BUG_ON(!preserve_write && pmd_write(entry));
-               }
-               spin_unlock(ptl);
-       }
+       /*
+        * Avoid trapping faults against the zero page. The read-only
+        * data is likely to be read-cached on the local CPU and
+        * local/remote hits to the zero page are not interesting.
+        */
+       if (prot_numa && is_huge_zero_pmd(*pmd))
+               goto unlock;
  
+       if (prot_numa && pmd_protnone(*pmd))
+               goto unlock;
+
+       /*
+        * In the prot_numa case we are under down_read(mmap_sem). It's critical
+        * not to clear the pmd, even temporarily, to avoid a race with
+        * MADV_DONTNEED, which is also under down_read(mmap_sem):
+        *
+        *      CPU0:                           CPU1:
+        *                              change_huge_pmd(prot_numa=1)
+        *                               pmdp_huge_get_and_clear_notify()
+        * madvise_dontneed()
+        *  zap_pmd_range()
+        *   pmd_trans_huge(*pmd) == 0 (without ptl)
+        *   // skip the pmd
+        *                               set_pmd_at();
+        *                               // pmd is re-established
+        *
+        * The race makes MADV_DONTNEED miss the huge pmd and fail to clear it,
+        * which may break userspace.
+        *
+        * pmdp_invalidate() is required to make sure we don't miss
+        * dirty/young flags set by hardware.
+        */
+       entry = *pmd;
+       pmdp_invalidate(vma, addr, pmd);
+
+       /*
+        * Recover dirty/young flags.  It relies on pmdp_invalidate to not
+        * corrupt them.
+        */
+       if (pmd_dirty(*pmd))
+               entry = pmd_mkdirty(entry);
+       if (pmd_young(*pmd))
+               entry = pmd_mkyoung(entry);
+
+       entry = pmd_modify(entry, newprot);
+       if (preserve_write)
+               entry = pmd_mkwrite(entry);
+       ret = HPAGE_PMD_NR;
+       set_pmd_at(mm, addr, pmd, entry);
+       BUG_ON(!preserve_write && pmd_write(entry));
+unlock:
+       spin_unlock(ptl);
         return ret;
  }
  
@@ -2134,10 +2182,9 @@ int khugepaged_enter_vma_merge(struct vm_area_struct *vma,
                  * page fault if needed.
                  */
                 return 0;
-       if (vma->vm_ops)
+       if (vma->vm_ops || (vm_flags & VM_NO_THP))
                 /* khugepaged not yet working on file or special mappings */
                 return 0;
-       VM_BUG_ON_VMA(vm_flags & VM_NO_THP, vma);
         hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
         hend = vma->vm_end & HPAGE_PMD_MASK;
         if (hstart < hend)
@@ -2498,8 +2545,7 @@ static bool hugepage_vma_check(struct vm_area_struct *vma)
                 return false;
         if (is_vma_temporary_stack(vma))
                 return false;
-       VM_BUG_ON_VMA(vma->vm_flags & VM_NO_THP, vma);
-       return true;
+       return !(vma->vm_flags & VM_NO_THP);
  }
  
  static void collapse_huge_page(struct mm_struct *mm,