
hugetlbfs: fix kernel BUG at fs/hugetlbfs/inode.c:444!
[sagit-ice-cold/kernel_xiaomi_msm8998.git]

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 62fe06b..465786c 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1269,6 +1269,16 @@ out_unlock:
        return ret;
 }
 
+/*
+ * FOLL_FORCE can write to even unwritable pmd's, but only
+ * after we've gone through a COW cycle and they are dirty.
+ */
+static inline bool can_follow_write_pmd(pmd_t pmd, unsigned int flags)
+{
+       return pmd_write(pmd) ||
+              ((flags & FOLL_FORCE) && (flags & FOLL_COW) && pmd_dirty(pmd));
+}
+
 struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
                                   unsigned long addr,
                                   pmd_t *pmd,
@@ -1279,7 +1289,7 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
 
        assert_spin_locked(pmd_lockptr(mm, pmd));
 
-       if (flags & FOLL_WRITE && !pmd_write(*pmd))
+       if (flags & FOLL_WRITE && !can_follow_write_pmd(*pmd, flags))
                goto out;
 
        /* Avoid dumping huge zero page */
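Note: the pmd-level helper mirrors the pte-level check that the FOLL_FORCE/FOLL_COW fix added to mm/gup.c; for comparison (upstream code, not part of this hunk), that counterpart is essentially:

/*
 * FOLL_FORCE can write to even unwritable pte's, but only
 * after we've gone through a COW cycle and they are dirty.
 */
static inline bool can_follow_write_pte(pte_t pte, unsigned int flags)
{
	return pte_write(pte) ||
	       ((flags & FOLL_FORCE) && (flags & FOLL_COW) && pte_dirty(pte));
}

FOLL_COW itself is set by the fault path in mm/gup.c once a write fault has actually broken COW, so a forced write is only honoured after a genuine COW cycle has left the entry dirty.
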
@@ -1294,17 +1304,11 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
        VM_BUG_ON_PAGE(!PageHead(page), page);
        if (flags & FOLL_TOUCH) {
                pmd_t _pmd;
-               /*
-                * We should set the dirty bit only for FOLL_WRITE but
-                * for now the dirty bit in the pmd is meaningless.
-                * And if the dirty bit will become meaningful and
-                * we'll only set it with FOLL_WRITE, an atomic
-                * set_bit will be required on the pmd to set the
-                * young bit, instead of the current set_pmd_at.
-                */
-               _pmd = pmd_mkyoung(pmd_mkdirty(*pmd));
+               _pmd = pmd_mkyoung(*pmd);
+               if (flags & FOLL_WRITE)
+                       _pmd = pmd_mkdirty(_pmd);
                if (pmdp_set_access_flags(vma, addr & HPAGE_PMD_MASK,
-                                         pmd, _pmd,  1))
+                                         pmd, _pmd, flags & FOLL_WRITE))
                        update_mmu_cache_pmd(vma, addr, pmd);
        }
        if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
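Note: this pairs with can_follow_write_pmd() above. The dirty bit is now treated as proof that a COW write really happened, so FOLL_TOUCH must stop dirtying the pmd for read-only lookups, and the dirty argument of pmdp_set_access_flags() now reflects whether the caller intends to write. With the hunk applied, the FOLL_TOUCH block reads:

	if (flags & FOLL_TOUCH) {
		pmd_t _pmd;
		_pmd = pmd_mkyoung(*pmd);
		if (flags & FOLL_WRITE)
			_pmd = pmd_mkdirty(_pmd);
		if (pmdp_set_access_flags(vma, addr & HPAGE_PMD_MASK,
					  pmd, _pmd, flags & FOLL_WRITE))
			update_mmu_cache_pmd(vma, addr, pmd);
	}
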
@@ -1353,8 +1357,11 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
         */
        if (unlikely(pmd_trans_migrating(*pmdp))) {
                page = pmd_page(*pmdp);
+               if (!get_page_unless_zero(page))
+                       goto out_unlock;
                spin_unlock(ptl);
                wait_on_page_locked(page);
+               put_page(page);
                goto out;
        }
 
@@ -1386,9 +1393,12 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
 
        /* Migration could have started since the pmd_trans_migrating check */
        if (!page_locked) {
+               page_nid = -1;
+               if (!get_page_unless_zero(page))
+                       goto out_unlock;
                spin_unlock(ptl);
                wait_on_page_locked(page);
-               page_nid = -1;
+               put_page(page);
                goto out;
        }
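
Note: both do_huge_pmd_numa_page() hunks apply the same fix. Once the page table lock is dropped, nothing pins the THP, so it can be freed (and the struct page reused) while this task sleeps in wait_on_page_locked(). Taking a speculative reference first, and bailing out if the refcount has already reached zero, makes the wait safe. Condensed, the pattern used above is:

	page = pmd_page(*pmdp);
	if (!get_page_unless_zero(page))	/* page is already being freed */
		goto out_unlock;		/* retry the fault instead of waiting */
	spin_unlock(ptl);
	wait_on_page_locked(page);		/* migration path holds the page lock */
	put_page(page);
	goto out;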
 
@@ -1501,7 +1511,7 @@ int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma,
        spinlock_t *old_ptl, *new_ptl;
        int ret = 0;
        pmd_t pmd;
-
+       bool force_flush = false;
        struct mm_struct *mm = vma->vm_mm;
 
        if ((old_addr & ~HPAGE_PMD_MASK) ||
@@ -1529,6 +1539,8 @@ int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma,
                if (new_ptl != old_ptl)
                        spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
                pmd = pmdp_huge_get_and_clear(mm, old_addr, old_pmd);
+               if (pmd_present(pmd))
+                       force_flush = true;
                VM_BUG_ON(!pmd_none(*new_pmd));
 
                if (pmd_move_must_withdraw(new_ptl, old_ptl)) {
@@ -1537,6 +1549,8 @@ int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma,
                        pgtable_trans_huge_deposit(mm, new_pmd, pgtable);
                }
                set_pmd_at(mm, new_addr, new_pmd, pmd_mksoft_dirty(pmd));
+               if (force_flush)
+                       flush_tlb_range(vma, old_addr, old_addr + PMD_SIZE);
                if (new_ptl != old_ptl)
                        spin_unlock(new_ptl);
                spin_unlock(old_ptl);
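
Note: in move_huge_pmd(), pmdp_huge_get_and_clear() removes the mapping from the page tables but not from other CPUs' TLBs, so without a flush a racing thread could keep writing through a stale huge TLB entry at old_addr after the pmd has been re-established at new_addr. Flushing before the page-table locks are released closes that window; the resulting sequence (condensed from the two hunks above) is:

	pmd = pmdp_huge_get_and_clear(mm, old_addr, old_pmd);
	if (pmd_present(pmd))
		force_flush = true;	/* stale TLB entries may still map old_addr */
	VM_BUG_ON(!pmd_none(*new_pmd));
	/* ... pgtable withdraw/deposit as above ... */
	set_pmd_at(mm, new_addr, new_pmd, pmd_mksoft_dirty(pmd));
	if (force_flush)
		flush_tlb_range(vma, old_addr, old_addr + PMD_SIZE);
	if (new_ptl != old_ptl)
		spin_unlock(new_ptl);
	spin_unlock(old_ptl);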
@@ -1556,35 +1570,69 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
 {
        struct mm_struct *mm = vma->vm_mm;
        spinlock_t *ptl;
+       pmd_t entry;
+       bool preserve_write;
+
        int ret = 0;
 
-       if (__pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
-               pmd_t entry;
-               bool preserve_write = prot_numa && pmd_write(*pmd);
-               ret = 1;
+       if (__pmd_trans_huge_lock(pmd, vma, &ptl) != 1)
+               return 0;
 
-               /*
-                * Avoid trapping faults against the zero page. The read-only
-                * data is likely to be read-cached on the local CPU and
-                * local/remote hits to the zero page are not interesting.
-                */
-               if (prot_numa && is_huge_zero_pmd(*pmd)) {
-                       spin_unlock(ptl);
-                       return ret;
-               }
+       preserve_write = prot_numa && pmd_write(*pmd);
+       ret = 1;
 
-               if (!prot_numa || !pmd_protnone(*pmd)) {
-                       entry = pmdp_huge_get_and_clear_notify(mm, addr, pmd);
-                       entry = pmd_modify(entry, newprot);
-                       if (preserve_write)
-                               entry = pmd_mkwrite(entry);
-                       ret = HPAGE_PMD_NR;
-                       set_pmd_at(mm, addr, pmd, entry);
-                       BUG_ON(!preserve_write && pmd_write(entry));
-               }
-               spin_unlock(ptl);
-       }
+       /*
+        * Avoid trapping faults against the zero page. The read-only
+        * data is likely to be read-cached on the local CPU and
+        * local/remote hits to the zero page are not interesting.
+        */
+       if (prot_numa && is_huge_zero_pmd(*pmd))
+               goto unlock;
 
+       if (prot_numa && pmd_protnone(*pmd))
+               goto unlock;
+
+       /*
+        * In case prot_numa, we are under down_read(mmap_sem). It's critical
+        * to not clear pmd intermittently to avoid race with MADV_DONTNEED
+        * which is also under down_read(mmap_sem):
+        *
+        *      CPU0:                           CPU1:
+        *                              change_huge_pmd(prot_numa=1)
+        *                               pmdp_huge_get_and_clear_notify()
+        * madvise_dontneed()
+        *  zap_pmd_range()
+        *   pmd_trans_huge(*pmd) == 0 (without ptl)
+        *   // skip the pmd
+        *                               set_pmd_at();
+        *                               // pmd is re-established
+        *
+        * The race makes MADV_DONTNEED miss the huge pmd and not clear it,
+        * which may break userspace.
+        *
+        * pmdp_invalidate() is required to make sure we don't miss
+        * dirty/young flags set by hardware.
+        */
+       entry = *pmd;
+       pmdp_invalidate(vma, addr, pmd);
+
+       /*
+        * Recover dirty/young flags.  It relies on pmdp_invalidate to not
+        * corrupt them.
+        */
+       if (pmd_dirty(*pmd))
+               entry = pmd_mkdirty(entry);
+       if (pmd_young(*pmd))
+               entry = pmd_mkyoung(entry);
+
+       entry = pmd_modify(entry, newprot);
+       if (preserve_write)
+               entry = pmd_mkwrite(entry);
+       ret = HPAGE_PMD_NR;
+       set_pmd_at(mm, addr, pmd, entry);
+       BUG_ON(!preserve_write && pmd_write(entry));
+unlock:
+       spin_unlock(ptl);
        return ret;
 }
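
Note: the guarantee the race breaks is that an anonymous range reads back as zeroes after a successful MADV_DONTNEED. As a purely illustrative sketch (hypothetical userspace code, not derived from this patch), the broken invariant is the one checked here:

#include <assert.h>
#include <sys/mman.h>

/*
 * Illustrative only: if zap_pmd_range() races with change_huge_pmd() as
 * described above and skips the huge pmd, the old data stays mapped and
 * this assertion fails even though madvise() reported success.
 */
static void check_dontneed(char *buf, size_t huge_size)
{
	madvise(buf, huge_size, MADV_DONTNEED);
	assert(buf[0] == 0);	/* anonymous memory must read as zero here */
}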
 
@@ -2134,10 +2182,9 @@ int khugepaged_enter_vma_merge(struct vm_area_struct *vma,
                 * page fault if needed.
                 */
                return 0;
-       if (vma->vm_ops)
+       if (vma->vm_ops || (vm_flags & VM_NO_THP))
                /* khugepaged not yet working on file or special mappings */
                return 0;
-       VM_BUG_ON_VMA(vm_flags & VM_NO_THP, vma);
        hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
        hend = vma->vm_end & HPAGE_PMD_MASK;
        if (hstart < hend)
@@ -2498,8 +2545,7 @@ static bool hugepage_vma_check(struct vm_area_struct *vma)
                return false;
        if (is_vma_temporary_stack(vma))
                return false;
-       VM_BUG_ON_VMA(vma->vm_flags & VM_NO_THP, vma);
-       return true;
+       return !(vma->vm_flags & VM_NO_THP);
 }
 
 static void collapse_huge_page(struct mm_struct *mm,