OSDN Git Service

hugetlbfs: fix kernel BUG at fs/hugetlbfs/inode.c:444!
[sagit-ice-cold/kernel_xiaomi_msm8998.git] / mm / hugetlb.c
index 4fe4340..b09988d 100644 (file)
@@ -372,8 +372,10 @@ retry_locked:
                spin_unlock(&resv->lock);
 
                trg = kmalloc(sizeof(*trg), GFP_KERNEL);
-               if (!trg)
+               if (!trg) {
+                       kfree(nrg);
                        return -ENOMEM;
+               }
 
                spin_lock(&resv->lock);
                list_add(&trg->link, &resv->region_cache);
@@ -483,8 +485,16 @@ static long region_del(struct resv_map *resv, long f, long t)
 retry:
        spin_lock(&resv->lock);
        list_for_each_entry_safe(rg, trg, head, link) {
-               if (rg->to <= f)
+               /*
+                * Skip regions before the range to be deleted.  file_region
+                * ranges are normally of the form [from, to).  However, there
+                * may be a "placeholder" entry in the map which is of the form
+                * (from, to) with from == to.  Check for placeholder entries
+                * at the beginning of the range to be deleted.
+                */
+               if (rg->to <= f && (rg->to != rg->from || rg->to != f))
                        continue;
+
                if (rg->from >= t)
                        break;
 
@@ -1406,12 +1416,13 @@ static void dissolve_free_huge_page(struct page *page)
 {
        spin_lock(&hugetlb_lock);
        if (PageHuge(page) && !page_count(page)) {
-               struct hstate *h = page_hstate(page);
-               int nid = page_to_nid(page);
-               list_del(&page->lru);
+               struct page *head = compound_head(page);
+               struct hstate *h = page_hstate(head);
+               int nid = page_to_nid(head);
+               list_del(&head->lru);
                h->free_huge_pages--;
                h->free_huge_pages_node[nid]--;
-               update_and_free_page(h, page);
+               update_and_free_page(h, head);
        }
        spin_unlock(&hugetlb_lock);
 }
@@ -1419,7 +1430,8 @@ static void dissolve_free_huge_page(struct page *page)
 /*
  * Dissolve free hugepages in a given pfn range. Used by memory hotplug to
  * make specified memory blocks removable from the system.
- * Note that start_pfn should aligned with (minimum) hugepage size.
+ * Note that this will dissolve a free gigantic hugepage completely, if any
+ * part of it lies within the given range.
  */
 void dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn)
 {
@@ -1428,7 +1440,6 @@ void dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn)
        if (!hugepages_supported())
                return;
 
-       VM_BUG_ON(!IS_ALIGNED(start_pfn, 1 << minimum_order));
        for (pfn = start_pfn; pfn < end_pfn; pfn += 1 << minimum_order)
                dissolve_free_huge_page(pfn_to_page(pfn));
 }
@@ -1712,23 +1723,32 @@ free:
 }
 
 /*
- * When releasing a hugetlb pool reservation, any surplus pages that were
- * allocated to satisfy the reservation must be explicitly freed if they were
- * never used.
- * Called with hugetlb_lock held.
+ * This routine has two main purposes:
+ * 1) Decrement the reservation count (resv_huge_pages) by the value passed
+ *    in unused_resv_pages.  This corresponds to the prior adjustments made
+ *    to the associated reservation map.
+ * 2) Free any unused surplus pages that may have been allocated to satisfy
+ *    the reservation.  As many as unused_resv_pages may be freed.
+ *
+ * Called with hugetlb_lock held.  However, the lock could be dropped (and
+ * reacquired) during calls to cond_resched_lock.  Whenever dropping the lock,
+ * we must make sure nobody else can claim pages we are in the process of
+ * freeing.  Do this by ensuring resv_huge_page always is greater than the
+ * number of huge pages we plan to free when dropping the lock.
  */
 static void return_unused_surplus_pages(struct hstate *h,
                                        unsigned long unused_resv_pages)
 {
        unsigned long nr_pages;
 
-       /* Uncommit the reservation */
-       h->resv_huge_pages -= unused_resv_pages;
-
        /* Cannot return gigantic pages currently */
        if (hstate_is_gigantic(h))
-               return;
+               goto out;
 
+       /*
+        * Part (or even all) of the reservation could have been backed
+        * by pre-allocated pages. Only free surplus pages.
+        */
        nr_pages = min(unused_resv_pages, h->surplus_huge_pages);
 
        /*
@@ -1738,12 +1758,22 @@ static void return_unused_surplus_pages(struct hstate *h,
         * when the nodes with surplus pages have no free pages.
         * free_pool_huge_page() will balance the the freed pages across the
         * on-line nodes with memory and will handle the hstate accounting.
+        *
+        * Note that we decrement resv_huge_pages as we free the pages.  If
+        * we drop the lock, resv_huge_pages will still be sufficiently large
+        * to cover subsequent pages we may free.
         */
        while (nr_pages--) {
+               h->resv_huge_pages--;
+               unused_resv_pages--;
                if (!free_pool_huge_page(h, &node_states[N_MEMORY], 1))
-                       break;
+                       goto out;
                cond_resched_lock(&hugetlb_lock);
        }
+
+out:
+       /* Fully uncommit the reservation */
+       h->resv_huge_pages -= unused_resv_pages;
 }
 
 
@@ -2008,6 +2038,7 @@ static void __init gather_bootmem_prealloc(void)
                 */
                if (hstate_is_gigantic(h))
                        adjust_managed_page_count(page, 1 << h->order);
+               cond_resched();
        }
 }
 
@@ -2160,6 +2191,10 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count,
                 * and reducing the surplus.
                 */
                spin_unlock(&hugetlb_lock);
+
+               /* yield cpu to avoid soft lockup */
+               cond_resched();
+
                if (hstate_is_gigantic(h))
                        ret = alloc_fresh_gigantic_page(h, nodes_allowed);
                else
@@ -3068,7 +3103,7 @@ static int is_hugetlb_entry_hwpoisoned(pte_t pte)
 int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
                            struct vm_area_struct *vma)
 {
-       pte_t *src_pte, *dst_pte, entry;
+       pte_t *src_pte, *dst_pte, entry, dst_entry;
        struct page *ptepage;
        unsigned long addr;
        int cow;
@@ -3096,15 +3131,30 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
                        break;
                }
 
-               /* If the pagetables are shared don't copy or take references */
-               if (dst_pte == src_pte)
+               /*
+                * If the pagetables are shared don't copy or take references.
+                * dst_pte == src_pte is the common case of src/dest sharing.
+                *
+                * However, src could have 'unshared' and dst shares with
+                * another vma.  If dst_pte !none, this implies sharing.
+                * Check here before taking page table lock, and once again
+                * after taking the lock below.
+                */
+               dst_entry = huge_ptep_get(dst_pte);
+               if ((dst_pte == src_pte) || !huge_pte_none(dst_entry))
                        continue;
 
                dst_ptl = huge_pte_lock(h, dst, dst_pte);
                src_ptl = huge_pte_lockptr(h, src, src_pte);
                spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
                entry = huge_ptep_get(src_pte);
-               if (huge_pte_none(entry)) { /* skip none entry */
+               dst_entry = huge_ptep_get(dst_pte);
+               if (huge_pte_none(entry) || !huge_pte_none(dst_entry)) {
+                       /*
+                        * Skip if src entry none.  Also, skip in the
+                        * unlikely case dst entry !none as this implies
+                        * sharing with another vma.
+                        */
                        ;
                } else if (unlikely(is_hugetlb_entry_migration(entry) ||
                                    is_hugetlb_entry_hwpoisoned(entry))) {
@@ -3502,6 +3552,12 @@ int huge_add_to_page_cache(struct page *page, struct address_space *mapping,
                return err;
        ClearPagePrivate(page);
 
+       /*
+        * set page dirty so that it will not be removed from cache/file
+        * by non-hugetlbfs specific code paths.
+        */
+       set_page_dirty(page);
+
        spin_lock(&inode->i_lock);
        inode->i_blocks += blocks_per_huge_page(h);
        spin_unlock(&inode->i_lock);
@@ -3696,12 +3752,12 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
                } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
                        return VM_FAULT_HWPOISON_LARGE |
                                VM_FAULT_SET_HINDEX(hstate_index(h));
+       } else {
+               ptep = huge_pte_alloc(mm, address, huge_page_size(h));
+               if (!ptep)
+                       return VM_FAULT_OOM;
        }
 
-       ptep = huge_pte_alloc(mm, address, huge_page_size(h));
-       if (!ptep)
-               return VM_FAULT_OOM;
-
        mapping = vma->vm_file->f_mapping;
        idx = vma_hugecache_offset(h, vma, address);
 
@@ -4199,7 +4255,6 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
                if (saddr) {
                        spte = huge_pte_offset(svma->vm_mm, saddr);
                        if (spte) {
-                               mm_inc_nr_pmds(mm);
                                get_page(virt_to_page(spte));
                                break;
                        }
@@ -4214,9 +4269,9 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
        if (pud_none(*pud)) {
                pud_populate(mm, pud,
                                (pmd_t *)((unsigned long)spte & PAGE_MASK));
+               mm_inc_nr_pmds(mm);
        } else {
                put_page(virt_to_page(spte));
-               mm_inc_nr_pmds(mm);
        }
        spin_unlock(ptl);
 out:
@@ -4329,6 +4384,7 @@ follow_huge_pmd(struct mm_struct *mm, unsigned long address,
 {
        struct page *page = NULL;
        spinlock_t *ptl;
+       pte_t pte;
 retry:
        ptl = pmd_lockptr(mm, pmd);
        spin_lock(ptl);
@@ -4338,12 +4394,13 @@ retry:
         */
        if (!pmd_huge(*pmd))
                goto out;
-       if (pmd_present(*pmd)) {
+       pte = huge_ptep_get((pte_t *)pmd);
+       if (pte_present(pte)) {
                page = pmd_page(*pmd) + ((address & ~PMD_MASK) >> PAGE_SHIFT);
                if (flags & FOLL_GET)
                        get_page(page);
        } else {
-               if (is_hugetlb_entry_migration(huge_ptep_get((pte_t *)pmd))) {
+               if (is_hugetlb_entry_migration(pte)) {
                        spin_unlock(ptl);
                        __migration_entry_wait(mm, (pte_t *)pmd, ptl);
                        goto retry;