X-Git-Url: http://git.osdn.net/view?a=blobdiff_plain;f=mm%2Fhugetlb.c;h=f1a45f5077fe45b91441ee099d6696f3dc9cea60;hb=57d813863143eb18769ac685d300487625f7e75a;hp=827bb02a43a4e425393c296db89e7a49d6adf02b;hpb=94521b2fd27b5ef43573f9f425a48839fa80ebaa;p=sagit-ice-cold%2Fkernel_xiaomi_msm8998.git diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 827bb02a43a4..f1a45f5077fe 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -372,8 +372,10 @@ retry_locked: spin_unlock(&resv->lock); trg = kmalloc(sizeof(*trg), GFP_KERNEL); - if (!trg) + if (!trg) { + kfree(nrg); return -ENOMEM; + } spin_lock(&resv->lock); list_add(&trg->link, &resv->region_cache); @@ -483,8 +485,16 @@ static long region_del(struct resv_map *resv, long f, long t) retry: spin_lock(&resv->lock); list_for_each_entry_safe(rg, trg, head, link) { - if (rg->to <= f) + /* + * Skip regions before the range to be deleted. file_region + * ranges are normally of the form [from, to). However, there + * may be a "placeholder" entry in the map which is of the form + * (from, to) with from == to. Check for placeholder entries + * at the beginning of the range to be deleted. + */ + if (rg->to <= f && (rg->to != rg->from || rg->to != f)) continue; + if (rg->from >= t) break; @@ -1406,12 +1416,13 @@ static void dissolve_free_huge_page(struct page *page) { spin_lock(&hugetlb_lock); if (PageHuge(page) && !page_count(page)) { - struct hstate *h = page_hstate(page); - int nid = page_to_nid(page); - list_del(&page->lru); + struct page *head = compound_head(page); + struct hstate *h = page_hstate(head); + int nid = page_to_nid(head); + list_del(&head->lru); h->free_huge_pages--; h->free_huge_pages_node[nid]--; - update_and_free_page(h, page); + update_and_free_page(h, head); } spin_unlock(&hugetlb_lock); } @@ -1419,7 +1430,8 @@ static void dissolve_free_huge_page(struct page *page) /* * Dissolve free hugepages in a given pfn range. Used by memory hotplug to * make specified memory blocks removable from the system. - * Note that start_pfn should aligned with (minimum) hugepage size. + * Note that this will dissolve a free gigantic hugepage completely, if any + * part of it lies within the given range. */ void dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn) { @@ -1428,7 +1440,6 @@ void dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn) if (!hugepages_supported()) return; - VM_BUG_ON(!IS_ALIGNED(start_pfn, 1 << minimum_order)); for (pfn = start_pfn; pfn < end_pfn; pfn += 1 << minimum_order) dissolve_free_huge_page(pfn_to_page(pfn)); } @@ -1712,23 +1723,32 @@ free: } /* - * When releasing a hugetlb pool reservation, any surplus pages that were - * allocated to satisfy the reservation must be explicitly freed if they were - * never used. - * Called with hugetlb_lock held. + * This routine has two main purposes: + * 1) Decrement the reservation count (resv_huge_pages) by the value passed + * in unused_resv_pages. This corresponds to the prior adjustments made + * to the associated reservation map. + * 2) Free any unused surplus pages that may have been allocated to satisfy + * the reservation. As many as unused_resv_pages may be freed. + * + * Called with hugetlb_lock held. However, the lock could be dropped (and + * reacquired) during calls to cond_resched_lock. Whenever dropping the lock, + * we must make sure nobody else can claim pages we are in the process of + * freeing. Do this by ensuring resv_huge_page always is greater than the + * number of huge pages we plan to free when dropping the lock. */ static void return_unused_surplus_pages(struct hstate *h, unsigned long unused_resv_pages) { unsigned long nr_pages; - /* Uncommit the reservation */ - h->resv_huge_pages -= unused_resv_pages; - /* Cannot return gigantic pages currently */ if (hstate_is_gigantic(h)) - return; + goto out; + /* + * Part (or even all) of the reservation could have been backed + * by pre-allocated pages. Only free surplus pages. + */ nr_pages = min(unused_resv_pages, h->surplus_huge_pages); /* @@ -1738,12 +1758,22 @@ static void return_unused_surplus_pages(struct hstate *h, * when the nodes with surplus pages have no free pages. * free_pool_huge_page() will balance the the freed pages across the * on-line nodes with memory and will handle the hstate accounting. + * + * Note that we decrement resv_huge_pages as we free the pages. If + * we drop the lock, resv_huge_pages will still be sufficiently large + * to cover subsequent pages we may free. */ while (nr_pages--) { + h->resv_huge_pages--; + unused_resv_pages--; if (!free_pool_huge_page(h, &node_states[N_MEMORY], 1)) - break; + goto out; cond_resched_lock(&hugetlb_lock); } + +out: + /* Fully uncommit the reservation */ + h->resv_huge_pages -= unused_resv_pages; } @@ -1886,7 +1916,10 @@ struct page *alloc_huge_page(struct vm_area_struct *vma, page = __alloc_buddy_huge_page_with_mpol(h, vma, addr); if (!page) goto out_uncharge_cgroup; - + if (!avoid_reserve && vma_has_reserves(vma, gbl_chg)) { + SetPagePrivate(page); + h->resv_huge_pages--; + } spin_lock(&hugetlb_lock); list_move(&page->lru, &h->hugepage_activelist); /* Fall through */ @@ -2005,6 +2038,7 @@ static void __init gather_bootmem_prealloc(void) */ if (hstate_is_gigantic(h)) adjust_managed_page_count(page, 1 << h->order); + cond_resched(); } } @@ -2157,6 +2191,10 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count, * and reducing the surplus. */ spin_unlock(&hugetlb_lock); + + /* yield cpu to avoid soft lockup */ + cond_resched(); + if (hstate_is_gigantic(h)) ret = alloc_fresh_gigantic_page(h, nodes_allowed); else @@ -3065,7 +3103,7 @@ static int is_hugetlb_entry_hwpoisoned(pte_t pte) int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, struct vm_area_struct *vma) { - pte_t *src_pte, *dst_pte, entry; + pte_t *src_pte, *dst_pte, entry, dst_entry; struct page *ptepage; unsigned long addr; int cow; @@ -3093,15 +3131,30 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, break; } - /* If the pagetables are shared don't copy or take references */ - if (dst_pte == src_pte) + /* + * If the pagetables are shared don't copy or take references. + * dst_pte == src_pte is the common case of src/dest sharing. + * + * However, src could have 'unshared' and dst shares with + * another vma. If dst_pte !none, this implies sharing. + * Check here before taking page table lock, and once again + * after taking the lock below. + */ + dst_entry = huge_ptep_get(dst_pte); + if ((dst_pte == src_pte) || !huge_pte_none(dst_entry)) continue; dst_ptl = huge_pte_lock(h, dst, dst_pte); src_ptl = huge_pte_lockptr(h, src, src_pte); spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); entry = huge_ptep_get(src_pte); - if (huge_pte_none(entry)) { /* skip none entry */ + dst_entry = huge_ptep_get(dst_pte); + if (huge_pte_none(entry) || !huge_pte_none(dst_entry)) { + /* + * Skip if src entry none. Also, skip in the + * unlikely case dst entry !none as this implies + * sharing with another vma. + */ ; } else if (unlikely(is_hugetlb_entry_migration(entry) || is_hugetlb_entry_hwpoisoned(entry))) { @@ -3499,6 +3552,12 @@ int huge_add_to_page_cache(struct page *page, struct address_space *mapping, return err; ClearPagePrivate(page); + /* + * set page dirty so that it will not be removed from cache/file + * by non-hugetlbfs specific code paths. + */ + set_page_dirty(page); + spin_lock(&inode->i_lock); inode->i_blocks += blocks_per_huge_page(h); spin_unlock(&inode->i_lock); @@ -3693,12 +3752,12 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry))) return VM_FAULT_HWPOISON_LARGE | VM_FAULT_SET_HINDEX(hstate_index(h)); + } else { + ptep = huge_pte_alloc(mm, address, huge_page_size(h)); + if (!ptep) + return VM_FAULT_OOM; } - ptep = huge_pte_alloc(mm, address, huge_page_size(h)); - if (!ptep) - return VM_FAULT_OOM; - mapping = vma->vm_file->f_mapping; idx = vma_hugecache_offset(h, vma, address); @@ -3994,6 +4053,14 @@ int hugetlb_reserve_pages(struct inode *inode, struct resv_map *resv_map; long gbl_reserve; + /* This should never happen */ + if (from > to) { +#ifdef CONFIG_DEBUG_VM + WARN(1, "%s called with a negative range\n", __func__); +#endif + return -EINVAL; + } + /* * Only apply hugepage reservation if asked. At fault time, an * attempt will be made for VM_NORESERVE to allocate a page @@ -4083,7 +4150,9 @@ int hugetlb_reserve_pages(struct inode *inode, return 0; out_err: if (!vma || vma->vm_flags & VM_MAYSHARE) - region_abort(resv_map, from, to); + /* Don't call region_abort if region_chg failed */ + if (chg >= 0) + region_abort(resv_map, from, to); if (vma && is_vma_resv_set(vma, HPAGE_RESV_OWNER)) kref_put(&resv_map->refs, resv_map_release); return ret; @@ -4157,13 +4226,41 @@ static bool vma_shareable(struct vm_area_struct *vma, unsigned long addr) /* * check on proper vm_flags and page table alignment */ - if (vma->vm_flags & VM_MAYSHARE && - vma->vm_start <= base && end <= vma->vm_end) + if (vma->vm_flags & VM_MAYSHARE && range_in_vma(vma, base, end)) return true; return false; } /* + * Determine if start,end range within vma could be mapped by shared pmd. + * If yes, adjust start and end to cover range associated with possible + * shared pmd mappings. + */ +void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma, + unsigned long *start, unsigned long *end) +{ + unsigned long check_addr = *start; + + if (!(vma->vm_flags & VM_MAYSHARE)) + return; + + for (check_addr = *start; check_addr < *end; check_addr += PUD_SIZE) { + unsigned long a_start = check_addr & PUD_MASK; + unsigned long a_end = a_start + PUD_SIZE; + + /* + * If sharing is possible, adjust start/end if necessary. + */ + if (range_in_vma(vma, a_start, a_end)) { + if (a_start < *start) + *start = a_start; + if (a_end > *end) + *end = a_end; + } + } +} + +/* * Search for a shareable pmd page for hugetlb. In any case calls pmd_alloc() * and returns the corresponding pte. While this is not necessary for the * !shared pmd case because we can allocate the pmd later as well, it makes the @@ -4196,7 +4293,6 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) if (saddr) { spte = huge_pte_offset(svma->vm_mm, saddr); if (spte) { - mm_inc_nr_pmds(mm); get_page(virt_to_page(spte)); break; } @@ -4211,9 +4307,9 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) if (pud_none(*pud)) { pud_populate(mm, pud, (pmd_t *)((unsigned long)spte & PAGE_MASK)); + mm_inc_nr_pmds(mm); } else { put_page(virt_to_page(spte)); - mm_inc_nr_pmds(mm); } spin_unlock(ptl); out: @@ -4260,6 +4356,11 @@ int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep) { return 0; } + +void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma, + unsigned long *start, unsigned long *end) +{ +} #define want_pmd_share() (0) #endif /* CONFIG_ARCH_WANT_HUGE_PMD_SHARE */ @@ -4326,6 +4427,7 @@ follow_huge_pmd(struct mm_struct *mm, unsigned long address, { struct page *page = NULL; spinlock_t *ptl; + pte_t pte; retry: ptl = pmd_lockptr(mm, pmd); spin_lock(ptl); @@ -4335,12 +4437,13 @@ retry: */ if (!pmd_huge(*pmd)) goto out; - if (pmd_present(*pmd)) { + pte = huge_ptep_get((pte_t *)pmd); + if (pte_present(pte)) { page = pmd_page(*pmd) + ((address & ~PMD_MASK) >> PAGE_SHIFT); if (flags & FOLL_GET) get_page(page); } else { - if (is_hugetlb_entry_migration(huge_ptep_get((pte_t *)pmd))) { + if (is_hugetlb_entry_migration(pte)) { spin_unlock(ptl); __migration_entry_wait(mm, (pte_t *)pmd, ptl); goto retry;