diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 6de9440..13a6421 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -284,8 +284,37 @@ EXPORT_SYMBOL(nr_online_nodes);
 int page_group_by_mobility_disabled __read_mostly;
 
 #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
+
+/*
+ * Determine how many pages need to be initialized during early boot
+ * (non-deferred initialization).
+ * The value of first_deferred_pfn will be set later, once non-deferred pages
+ * are initialized, but for now set it to ULONG_MAX.
+ */
 static inline void reset_deferred_meminit(pg_data_t *pgdat)
 {
+       phys_addr_t start_addr, end_addr;
+       unsigned long max_pgcnt;
+       unsigned long reserved;
+
+       /*
+        * Initialise at least 2G of a node but also take into account that
+        * two large system hashes can take up 1GB for 0.25TB/node.
+        */
+       max_pgcnt = max(2UL << (30 - PAGE_SHIFT),
+                       (pgdat->node_spanned_pages >> 8));
+
+       /*
+        * Compensate for all the memblock reservations (e.g. crash kernel)
+        * from the initial estimation to make sure we will initialize enough
+        * memory to boot.
+        */
+       start_addr = PFN_PHYS(pgdat->node_start_pfn);
+       end_addr = PFN_PHYS(pgdat->node_start_pfn + max_pgcnt);
+       reserved = memblock_reserved_memory_within(start_addr, end_addr);
+       max_pgcnt += PHYS_PFN(reserved);
+
+       pgdat->static_init_pgcnt = min(max_pgcnt, pgdat->node_spanned_pages);
        pgdat->first_deferred_pfn = ULONG_MAX;
 }
 
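The sizing above is easiest to follow with concrete numbers. A minimal userspace
sketch, assuming 4 KiB pages and a made-up 64 GiB node with 256 MiB of memblock
reservations (the real code gets that figure from memblock_reserved_memory_within()):

    #include <stdio.h>

    #define PAGE_SHIFT 12UL                 /* assume 4 KiB pages */

    int main(void)
    {
        unsigned long node_spanned_pages = 64UL << (30 - PAGE_SHIFT);  /* 64 GiB node */
        unsigned long reserved_pages     = 256UL << (20 - PAGE_SHIFT); /* 256 MiB reserved */
        unsigned long max_pgcnt;

        /* at least 2 GiB, or 1/256th of the node, whichever is larger */
        max_pgcnt = 2UL << (30 - PAGE_SHIFT);
        if ((node_spanned_pages >> 8) > max_pgcnt)
            max_pgcnt = node_spanned_pages >> 8;

        /* compensate for memblock reservations (e.g. crashkernel) in that range */
        max_pgcnt += reserved_pages;

        /* but never more than the node actually spans */
        if (max_pgcnt > node_spanned_pages)
            max_pgcnt = node_spanned_pages;

        printf("static_init_pgcnt = %lu pages (%lu MiB)\n",
               max_pgcnt, max_pgcnt >> (20 - PAGE_SHIFT));
        return 0;
    }

The update_defer_init() hunk below then simply compares *nr_initialised against
this precomputed pgdat->static_init_pgcnt instead of redoing the max() on every call.
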
@@ -308,20 +337,11 @@ static inline bool update_defer_init(pg_data_t *pgdat,
                                unsigned long pfn, unsigned long zone_end,
                                unsigned long *nr_initialised)
 {
-       unsigned long max_initialise;
-
        /* Always populate low zones for address-constrained allocations */
        if (zone_end < pgdat_end_pfn(pgdat))
                return true;
-       /*
-        * Initialise at least 2G of a node but also take into account that
-        * two large system hashes that can take up 1GB for 0.25TB/node.
-        */
-       max_initialise = max(2UL << (30 - PAGE_SHIFT),
-               (pgdat->node_spanned_pages >> 8));
-
        (*nr_initialised)++;
-       if ((*nr_initialised > max_initialise) &&
+       if ((*nr_initialised > pgdat->static_init_pgcnt) &&
            (pfn & (PAGES_PER_SECTION - 1)) == 0) {
                pgdat->first_deferred_pfn = pfn;
                return false;
@@ -1576,6 +1596,10 @@ void __init page_alloc_init_late(void)
        /* Reinit limits that are based on free pages after the kernel is up */
        files_maxfiles_init();
 #endif
+#ifdef CONFIG_ARCH_DISCARD_MEMBLOCK
+       /* Discard memblock private memory */
+       memblock_discard();
+#endif
 
        for_each_populated_zone(zone)
                set_zone_contiguous(zone);
@@ -1864,14 +1888,14 @@ int move_freepages(struct zone *zone,
 #endif
 
        for (page = start_page; page <= end_page;) {
-               /* Make sure we are not inadvertently changing nodes */
-               VM_BUG_ON_PAGE(page_to_nid(page) != zone_to_nid(zone), page);
-
                if (!pfn_valid_within(page_to_pfn(page))) {
                        page++;
                        continue;
                }
 
+               /* Make sure we are not inadvertently changing nodes */
+               VM_BUG_ON_PAGE(page_to_nid(page) != zone_to_nid(zone), page);
+
                if (!PageBuddy(page)) {
                        page++;
                        continue;
@@ -2085,13 +2109,25 @@ static void unreserve_highatomic_pageblock(const struct alloc_context *ac)
                                continue;
 
                        /*
-                        * It should never happen but changes to locking could
-                        * inadvertently allow a per-cpu drain to add pages
-                        * to MIGRATE_HIGHATOMIC while unreserving so be safe
-                        * and watch for underflows.
+                        * In the page freeing path the migratetype change is
+                        * racy, so we can encounter several free pages in a
+                        * pageblock in this loop even though we have already
+                        * changed the pageblock type from highatomic to
+                        * ac->migratetype. So only adjust the count once.
                         */
-                       zone->nr_reserved_highatomic -= min(pageblock_nr_pages,
-                               zone->nr_reserved_highatomic);
+                       if (get_pageblock_migratetype(page) ==
+                                                       MIGRATE_HIGHATOMIC) {
+                               /*
+                                * It should never happen but changes to
+                                * locking could inadvertently allow a per-cpu
+                                * drain to add pages to MIGRATE_HIGHATOMIC
+                                * while unreserving so be safe and watch for
+                                * underflows.
+                                */
+                               zone->nr_reserved_highatomic -= min(
+                                               pageblock_nr_pages,
+                                               zone->nr_reserved_highatomic);
+                       }
 
                        /*
                         * Convert to ac->migratetype and avoid the normal
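
Pages can still sit on the MIGRATE_HIGHATOMIC free list after their pageblock has
been retyped (freeing is racy with respect to the migratetype), which is why the
reserve is only charged back while the block is still marked highatomic. A small
userspace model with a made-up pageblock size:

    #include <stdio.h>

    #define PAGEBLOCK_NR_PAGES 512UL    /* made-up pageblock_nr_pages */

    int main(void)
    {
        unsigned long nr_reserved_highatomic = PAGEBLOCK_NR_PAGES; /* one block reserved */
        int still_highatomic[] = { 1, 0, 0 };  /* stale list entries after the retype */

        for (int i = 0; i < 3; i++) {
            if (still_highatomic[i]) {
                /* watch for underflow, as the kernel comment asks */
                unsigned long dec = PAGEBLOCK_NR_PAGES < nr_reserved_highatomic ?
                                    PAGEBLOCK_NR_PAGES : nr_reserved_highatomic;
                nr_reserved_highatomic -= dec;
            }
            /* the block is converted to ac->migratetype either way */
        }
        printf("nr_reserved_highatomic = %lu (charged back exactly once)\n",
               nr_reserved_highatomic);
        return 0;
    }
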
@@ -2192,7 +2228,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
                        unsigned long count, struct list_head *list,
                        int migratetype, bool cold)
 {
-       int i;
+       int i, alloced = 0;
 
        spin_lock(&zone->lock);
        for (i = 0; i < count; ++i) {
@@ -2217,13 +2253,21 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
                else
                        list_add_tail(&page->lru, list);
                list = &page->lru;
+               alloced++;
                if (is_migrate_cma(get_pcppage_migratetype(page)))
                        __mod_zone_page_state(zone, NR_FREE_CMA_PAGES,
                                              -(1 << order));
        }
+
+       /*
+        * i pages were removed from the buddy list even if some leak due
+        * to check_pcp_refill failing so adjust NR_FREE_PAGES based
+        * on i. Do not confuse with 'alloced' which is the number of
+        * pages added to the pcp list.
+        */
        __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order));
        spin_unlock(&zone->lock);
-       return i;
+       return alloced;
 }
 
 #ifdef CONFIG_NUMA
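
The 'i'/'alloced' split keeps two different books straight: NR_FREE_PAGES must
reflect every page that left the buddy list, while the caller only cares about
pages that actually reached the pcp list. A toy userspace model, with a
check_pcp_refill() failure simulated by hand:

    #include <stdio.h>

    int main(void)
    {
        int count = 8, order = 0;
        int i, alloced = 0;

        for (i = 0; i < count; i++) {
            /* page = __rmqueue(...): the page is now off the buddy list */
            if (i == 3)     /* pretend check_pcp_refill() rejects this one */
                continue;
            alloced++;      /* page really added to the pcp list */
        }

        printf("NR_FREE_PAGES -= %d, caller sees %d pages\n", i << order, alloced);
        return 0;
    }
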
@@ -2548,30 +2592,23 @@ int __isolate_free_page(struct page *page, unsigned int order)
  * Update NUMA hit/miss statistics
  *
  * Must be called with interrupts disabled.
- *
- * When __GFP_OTHER_NODE is set assume the node of the preferred
- * zone is the local node. This is useful for daemons who allocate
- * memory on behalf of other processes.
  */
 static inline void zone_statistics(struct zone *preferred_zone, struct zone *z,
                                                                gfp_t flags)
 {
 #ifdef CONFIG_NUMA
-       int local_nid = numa_node_id();
        enum zone_stat_item local_stat = NUMA_LOCAL;
 
-       if (unlikely(flags & __GFP_OTHER_NODE)) {
+       if (z->node != numa_node_id())
                local_stat = NUMA_OTHER;
-               local_nid = preferred_zone->node;
-       }
 
-       if (z->node == local_nid) {
+       if (z->node == preferred_zone->node)
                __inc_zone_state(z, NUMA_HIT);
-               __inc_zone_state(z, local_stat);
-       } else {
+       else {
                __inc_zone_state(z, NUMA_MISS);
                __inc_zone_state(preferred_zone, NUMA_FOREIGN);
        }
+       __inc_zone_state(z, local_stat);
 #endif
 }
 
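With __GFP_OTHER_NODE gone, the accounting above depends only on three nodes:
where the page came from (z), where the caller wanted it (preferred_zone) and
where the caller runs (numa_node_id()). A small userspace model of which vmstat
counters get bumped; account() is purely illustrative, not a kernel helper:

    #include <stdio.h>

    static void account(int z, int preferred, int local)
    {
        if (z == preferred)
            printf("NUMA_HIT on node %d", z);
        else
            printf("NUMA_MISS on node %d, NUMA_FOREIGN on node %d", z, preferred);
        printf(", %s\n", z == local ? "NUMA_LOCAL" : "NUMA_OTHER");
    }

    int main(void)
    {
        account(0, 0, 0);   /* satisfied from the preferred, local node */
        account(1, 0, 0);   /* fell back to a remote node */
        return 0;
    }
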
@@ -2784,9 +2821,6 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
                if (!area->nr_free)
                        continue;
 
-               if (alloc_harder)
-                       return true;
-
                for (mt = 0; mt < MIGRATE_PCPTYPES; mt++) {
                        if (!list_empty(&area->free_list[mt]))
                                return true;
@@ -2798,6 +2832,9 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
                        return true;
                }
 #endif
+               if (alloc_harder &&
+                       !list_empty(&area->free_list[MIGRATE_HIGHATOMIC]))
+                       return true;
        }
        return false;
 }
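
The removed early "return true" trusted area->nr_free alone, but for an
alloc_harder request those free pages could all be on lists it cannot use (in the
kernel the CMA list only counts when ALLOC_CMA is set). The replacement only
trusts the MIGRATE_HIGHATOMIC list, which alloc_harder requests may dip into. A
rough userspace model of the walk, with a simplified migratetype layout:

    #include <stdbool.h>
    #include <stdio.h>

    enum { UNMOVABLE, MOVABLE, RECLAIMABLE, NR_PCPTYPES,
           HIGHATOMIC = NR_PCPTYPES, CMA, NR_MT };

    static bool watermark_ok(const int nr_free[NR_MT], bool alloc_harder, bool alloc_cma)
    {
        for (int mt = 0; mt < NR_PCPTYPES; mt++)
            if (nr_free[mt])
                return true;
        if (alloc_cma && nr_free[CMA])
            return true;
        return alloc_harder && nr_free[HIGHATOMIC];
    }

    int main(void)
    {
        int only_cma[NR_MT] = { [CMA] = 32 };   /* every free page is CMA */

        /* an unmovable alloc_harder request cannot use them */
        printf("usable: %s\n", watermark_ok(only_cma, true, false) ? "yes" : "no");
        return 0;
    }
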
@@ -2850,7 +2887,7 @@ bool zone_watermark_ok_safe(struct zone *z, unsigned int order,
 #ifdef CONFIG_NUMA
 static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
 {
-       return node_distance(zone_to_nid(local_zone), zone_to_nid(zone)) <
+       return node_distance(zone_to_nid(local_zone), zone_to_nid(zone)) <=
                                RECLAIM_DISTANCE;
 }
 #else  /* CONFIG_NUMA */
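
Switching to "<=" makes nodes whose distance is exactly RECLAIM_DISTANCE eligible
for node reclaim; presumably this matters because some SLIT tables report exactly
that value for remote nodes. The boundary case, with the usual default of 30:

    #include <stdio.h>

    #define RECLAIM_DISTANCE 30     /* the common default */

    static int allows_reclaim_old(int d) { return d <  RECLAIM_DISTANCE; }
    static int allows_reclaim_new(int d) { return d <= RECLAIM_DISTANCE; }

    int main(void)
    {
        for (int d = 10; d <= 40; d += 10)
            printf("node_distance %2d: old=%d new=%d\n",
                   d, allows_reclaim_old(d), allows_reclaim_new(d));
        return 0;
    }
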
@@ -3117,6 +3154,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
                enum compact_priority prio, enum compact_result *compact_result)
 {
        struct page *page;
+       unsigned int noreclaim_flag = current->flags & PF_MEMALLOC;
 
        if (!order)
                return NULL;
@@ -3124,7 +3162,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
        current->flags |= PF_MEMALLOC;
        *compact_result = try_to_compact_pages(gfp_mask, order, alloc_flags, ac,
                                                                        prio);
-       current->flags &= ~PF_MEMALLOC;
+       current->flags = (current->flags & ~PF_MEMALLOC) | noreclaim_flag;
 
        if (*compact_result <= COMPACT_INACTIVE)
                return NULL;
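
The saved noreclaim_flag matters because __alloc_pages_direct_compact() can be
entered by a task that already runs with PF_MEMALLOC set; clearing the bit
unconditionally on the way out would silently drop the caller's state. A minimal
userspace sketch of the save/restore idiom (flag value as in the kernel headers):

    #include <stdio.h>

    #define PF_MEMALLOC 0x00000800

    int main(void)
    {
        unsigned int flags = PF_MEMALLOC;   /* caller already in a memalloc section */
        unsigned int noreclaim_flag = flags & PF_MEMALLOC;

        flags |= PF_MEMALLOC;               /* enter compaction */
        /* ... try_to_compact_pages() would run here ... */
        flags = (flags & ~PF_MEMALLOC) | noreclaim_flag;    /* restore, don't clear */

        printf("caller's PF_MEMALLOC survived: %s\n",
               (flags & PF_MEMALLOC) ? "yes" : "no");
        return 0;
    }
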
@@ -3384,12 +3422,6 @@ bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)
 }
 
 /*
- * Maximum number of reclaim retries without any progress before OOM killer
- * is consider as the only way to move forward.
- */
-#define MAX_RECLAIM_RETRIES 16
-
-/*
  * Checks whether it makes sense to retry the reclaim to make a forward progress
  * for the given allocation request.
  * The reclaim feedback represented by did_some_progress (any progress during
@@ -3494,12 +3526,11 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
        struct page *page = NULL;
        unsigned int alloc_flags;
        unsigned long did_some_progress;
-       enum compact_priority compact_priority = DEF_COMPACT_PRIORITY;
+       enum compact_priority compact_priority;
        enum compact_result compact_result;
-       int compaction_retries = 0;
-       int no_progress_loops = 0;
-       unsigned long alloc_start = jiffies;
-       unsigned int stall_timeout = 10 * HZ;
+       int compaction_retries;
+       int no_progress_loops;
+       unsigned int cpuset_mems_cookie;
 
        /*
         * In the slowpath, we sanity check order to avoid ever trying to
@@ -3520,6 +3551,23 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
                                (__GFP_ATOMIC|__GFP_DIRECT_RECLAIM)))
                gfp_mask &= ~__GFP_ATOMIC;
 
+retry_cpuset:
+       compaction_retries = 0;
+       no_progress_loops = 0;
+       compact_priority = DEF_COMPACT_PRIORITY;
+       cpuset_mems_cookie = read_mems_allowed_begin();
+       /*
+        * We need to recalculate the starting point for the zonelist iterator
+        * because we might have used different nodemask in the fast path, or
+        * there was a cpuset modification and we are retrying - otherwise we
+        * could end up iterating over non-eligible zones endlessly.
+        */
+       ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
+                                       ac->high_zoneidx, ac->nodemask);
+       if (!ac->preferred_zoneref->zone)
+               goto nopage;
+
+
        /*
         * The fast path uses conservative alloc_flags to succeed only until
         * kswapd needs to be woken up, and to avoid the cost of setting up
@@ -3592,7 +3640,6 @@ retry:
         * orientated.
         */
        if (!(alloc_flags & ALLOC_CPUSET) || (alloc_flags & ALLOC_NO_WATERMARKS)) {
-               ac->zonelist = node_zonelist(numa_node_id(), gfp_mask);
                ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
                                        ac->high_zoneidx, ac->nodemask);
        }
@@ -3655,14 +3702,6 @@ retry:
        if (order > PAGE_ALLOC_COSTLY_ORDER && !(gfp_mask & __GFP_REPEAT))
                goto nopage;
 
-       /* Make sure we know about allocations which stall for too long */
-       if (time_after(jiffies, alloc_start + stall_timeout)) {
-               warn_alloc(gfp_mask,
-                       "page allocation stalls for %ums, order:%u",
-                       jiffies_to_msecs(jiffies-alloc_start), order);
-               stall_timeout += 10 * HZ;
-       }
-
        if (should_reclaim_retry(gfp_mask, order, ac, alloc_flags,
                                 did_some_progress > 0, &no_progress_loops))
                goto retry;
@@ -3679,6 +3718,13 @@ retry:
                                &compaction_retries))
                goto retry;
 
+       /*
+        * It's possible we raced with cpuset update so the OOM would be
+        * premature (see below the nopage: label for full explanation).
+        */
+       if (read_mems_allowed_retry(cpuset_mems_cookie))
+               goto retry_cpuset;
+
        /* Reclaim has failed us, start killing things */
        page = __alloc_pages_may_oom(gfp_mask, order, ac, &did_some_progress);
        if (page)
@@ -3691,6 +3737,16 @@ retry:
        }
 
 nopage:
+       /*
+        * When updating a task's mems_allowed or mempolicy nodemask, it is
+        * possible to race with parallel threads in such a way that our
+        * allocation can fail while the mask is being updated. If we are about
+        * to fail, check if the cpuset changed during allocation and if so,
+        * retry.
+        */
+       if (read_mems_allowed_retry(cpuset_mems_cookie))
+               goto retry_cpuset;
+
        warn_alloc(gfp_mask,
                        "page allocation failure: order:%u", order);
 got_pg:
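
Both read_mems_allowed_retry() checks rely on the same seqcount-style protocol:
read_mems_allowed_begin() snapshots a sequence counter that cpuset/mempolicy
updates bump, and a failed allocation is retried only if that counter moved
underneath us. A userspace model of the pattern; the counter and helpers below
are stand-ins for the real cpuset API:

    #include <stdio.h>

    static unsigned int mems_seq;       /* bumped by "cpuset updates" */

    static unsigned int mems_allowed_begin(void) { return mems_seq; }
    static int mems_allowed_retry(unsigned int cookie) { return cookie != mems_seq; }

    int main(void)
    {
        int attempts = 0, page = 0;
        unsigned int cookie;

        do {
            cookie = mems_allowed_begin();
            attempts++;
            if (attempts == 1)
                mems_seq++;         /* a concurrent nodemask update races with us */
            page = (attempts > 1);  /* ...and the second pass succeeds */
        } while (!page && mems_allowed_retry(cookie));

        printf("allocation succeeded after %d attempt(s)\n", attempts);
        return 0;
    }
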
@@ -3705,7 +3761,6 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
                        struct zonelist *zonelist, nodemask_t *nodemask)
 {
        struct page *page;
-       unsigned int cpuset_mems_cookie;
        unsigned int alloc_flags = ALLOC_WMARK_LOW;
        gfp_t alloc_mask = gfp_mask; /* The gfp_t that was actually used for allocation */
        struct alloc_context ac = {
@@ -3742,9 +3797,6 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
        if (IS_ENABLED(CONFIG_CMA) && ac.migratetype == MIGRATE_MOVABLE)
                alloc_flags |= ALLOC_CMA;
 
-retry_cpuset:
-       cpuset_mems_cookie = read_mems_allowed_begin();
-
        /* Dirty zone balancing only done in the fast path */
        ac.spread_dirty_pages = (gfp_mask & __GFP_WRITE);
 
@@ -3755,8 +3807,13 @@ retry_cpuset:
         */
        ac.preferred_zoneref = first_zones_zonelist(ac.zonelist,
                                        ac.high_zoneidx, ac.nodemask);
-       if (!ac.preferred_zoneref) {
+       if (!ac.preferred_zoneref->zone) {
                page = NULL;
+               /*
+                * This might be due to race with cpuset_current_mems_allowed
+                * update, so make sure we retry with original nodemask in the
+                * slow path.
+                */
                goto no_zone;
        }
 
@@ -3765,6 +3822,7 @@ retry_cpuset:
        if (likely(page))
                goto out;
 
+no_zone:
        /*
         * Runtime PM, block IO and its error handling path can deadlock
         * because I/O on the device might not complete.
@@ -3776,21 +3834,10 @@ retry_cpuset:
         * Restore the original nodemask if it was potentially replaced with
         * &cpuset_current_mems_allowed to optimize the fast-path attempt.
         */
-       if (cpusets_enabled())
+       if (unlikely(ac.nodemask != nodemask))
                ac.nodemask = nodemask;
-       page = __alloc_pages_slowpath(alloc_mask, order, &ac);
 
-no_zone:
-       /*
-        * When updating a task's mems_allowed, it is possible to race with
-        * parallel threads in such a way that an allocation can fail while
-        * the mask is being updated. If a page allocation is about to fail,
-        * check if the cpuset changed during allocation and if so, retry.
-        */
-       if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) {
-               alloc_mask = gfp_mask;
-               goto retry_cpuset;
-       }
+       page = __alloc_pages_slowpath(alloc_mask, order, &ac);
 
 out:
        if (memcg_kmem_enabled() && (gfp_mask & __GFP_ACCOUNT) && page &&
@@ -3908,11 +3955,11 @@ refill:
                /* Even if we own the page, we do not use atomic_set().
                 * This would break get_page_unless_zero() users.
                 */
-               page_ref_add(page, size - 1);
+               page_ref_add(page, size);
 
                /* reset page count bias and offset to start of new frag */
                nc->pfmemalloc = page_is_pfmemalloc(page);
-               nc->pagecnt_bias = size;
+               nc->pagecnt_bias = size + 1;
                nc->offset = size;
        }
 
@@ -3928,10 +3975,10 @@ refill:
                size = nc->size;
 #endif
                /* OK, page count is 0, we can safely set it */
-               set_page_count(page, size);
+               set_page_count(page, size + 1);
 
                /* reset page count bias and offset to start of new frag */
-               nc->pagecnt_bias = size;
+               nc->pagecnt_bias = size + 1;
                offset = size - fragsz;
        }
 
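Both page-fragment hunks move the cache from holding "size" page references to
"size + 1". Assuming no fragment is smaller than one byte, at most "size"
fragments can be handed out, and the extra reference is the one the cache keeps
for itself so the page cannot be freed behind its back once every user has
dropped theirs. A userspace model of the accounting with a made-up tiny page:

    #include <stdio.h>

    int main(void)
    {
        int size = 4;                   /* made-up tiny page size */
        int page_refs = 1 + size;       /* alloc ref + page_ref_add(page, size) */
        int pagecnt_bias = size + 1;

        for (int i = 0; i < size; i++)  /* hand out size one-byte fragments */
            pagecnt_bias--;
        for (int i = 0; i < size; i++)  /* every user eventually frees its fragment */
            page_refs--;

        printf("refs=%d bias=%d -> cache still owns the page: %s\n",
               page_refs, pagecnt_bias,
               (page_refs == pagecnt_bias && page_refs > 0) ? "yes" : "no");
        return 0;
    }

With the old page_ref_add(page, size - 1) / pagecnt_bias = size pairing, the same
sequence ends with a page refcount of zero while the cache still points at the page.
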
@@ -4311,17 +4358,18 @@ void show_free_areas(unsigned int filter)
                        K(node_page_state(pgdat, NR_FILE_MAPPED)),
                        K(node_page_state(pgdat, NR_FILE_DIRTY)),
                        K(node_page_state(pgdat, NR_WRITEBACK)),
+                       K(node_page_state(pgdat, NR_SHMEM)),
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
                        K(node_page_state(pgdat, NR_SHMEM_THPS) * HPAGE_PMD_NR),
                        K(node_page_state(pgdat, NR_SHMEM_PMDMAPPED)
                                        * HPAGE_PMD_NR),
                        K(node_page_state(pgdat, NR_ANON_THPS) * HPAGE_PMD_NR),
 #endif
-                       K(node_page_state(pgdat, NR_SHMEM)),
                        K(node_page_state(pgdat, NR_WRITEBACK_TEMP)),
                        K(node_page_state(pgdat, NR_UNSTABLE_NFS)),
                        node_page_state(pgdat, NR_PAGES_SCANNED),
-                       !pgdat_reclaimable(pgdat) ? "yes" : "no");
+                       pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES ?
+                               "yes" : "no");
        }
 
        for_each_populated_zone(zone) {
@@ -5443,13 +5491,15 @@ static unsigned long __meminit zone_spanned_pages_in_node(int nid,
                                        unsigned long *zone_end_pfn,
                                        unsigned long *ignored)
 {
+       unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type];
+       unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type];
        /* When hotadd a new node from cpu_up(), the node should be empty */
        if (!node_start_pfn && !node_end_pfn)
                return 0;
 
        /* Get the start and end of the zone */
-       *zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type];
-       *zone_end_pfn = arch_zone_highest_possible_pfn[zone_type];
+       *zone_start_pfn = clamp(node_start_pfn, zone_low, zone_high);
+       *zone_end_pfn = clamp(node_end_pfn, zone_low, zone_high);
        adjust_zone_range_for_zone_movable(nid, zone_type,
                                node_start_pfn, node_end_pfn,
                                zone_start_pfn, zone_end_pfn);
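
The clamp keeps the zone span inside the node being initialised instead of using
the arch-wide zone limits directly, which on a multi-node system can start before
or extend past the node. A quick userspace check with invented pfn ranges:

    #include <stdio.h>

    static unsigned long clamp_ul(unsigned long v, unsigned long lo, unsigned long hi)
    {
        return v < lo ? lo : v > hi ? hi : v;
    }

    int main(void)
    {
        unsigned long node_start_pfn = 0x100000, node_end_pfn = 0x200000; /* node 1 */
        unsigned long zone_low = 0, zone_high = 0x180000;   /* arch limits for the zone */

        printf("zone spans [%#lx, %#lx) inside node [%#lx, %#lx)\n",
               clamp_ul(node_start_pfn, zone_low, zone_high),
               clamp_ul(node_end_pfn, zone_low, zone_high),
               node_start_pfn, node_end_pfn);
        return 0;
    }
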
@@ -5876,7 +5926,6 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
        /* pg_data_t should be reset to zero when it's allocated */
        WARN_ON(pgdat->nr_zones || pgdat->kswapd_classzone_idx);
 
-       reset_deferred_meminit(pgdat);
        pgdat->node_id = nid;
        pgdat->node_start_pfn = node_start_pfn;
        pgdat->per_cpu_nodestats = NULL;
@@ -5898,6 +5947,7 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
                (unsigned long)pgdat->node_mem_map);
 #endif
 
+       reset_deferred_meminit(pgdat);
        free_area_init_core(pgdat);
 }
 
@@ -6399,8 +6449,8 @@ unsigned long free_reserved_area(void *start, void *end, int poison, char *s)
        }
 
        if (pages && s)
-               pr_info("Freeing %s memory: %ldK (%p - %p)\n",
-                       s, pages << (PAGE_SHIFT - 10), start, end);
+               pr_info("Freeing %s memory: %ldK\n",
+                       s, pages << (PAGE_SHIFT - 10));
 
        return pages;
 }
@@ -7238,11 +7288,18 @@ int alloc_contig_range(unsigned long start, unsigned long end,
 
        /*
         * In case of -EBUSY, we'd like to know which page causes problem.
-        * So, just fall through. We will check it in test_pages_isolated().
+        * So, just fall through. test_pages_isolated() has a tracepoint
+        * which will report the busy page.
+        *
+        * It is possible that busy pages could become available before
+        * the call to test_pages_isolated(), and the range will actually
+        * be allocated.  So, if we fall through, be sure to clear ret so
+        * that -EBUSY is not accidentally used or returned to the caller.
         */
        ret = __alloc_contig_migrate_range(&cc, start, end);
        if (ret && ret != -EBUSY)
                goto done;
+       ret = 0;
 
        /*
         * Pages from [start, end) are within a MAX_ORDER_NR_PAGES
@@ -7289,7 +7346,7 @@ int alloc_contig_range(unsigned long start, unsigned long end,
 
        /* Make sure the range is really isolated. */
        if (test_pages_isolated(outer_start, end, false)) {
-               pr_info("%s: [%lx, %lx) PFNs busy\n",
+               pr_info_ratelimited("%s: [%lx, %lx) PFNs busy\n",
                        __func__, outer_start, end);
                ret = -EBUSY;
                goto done;