diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 6de9440..13a6421 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -284,8 +284,37 @@ EXPORT_SYMBOL(nr_online_nodes);
 int page_group_by_mobility_disabled __read_mostly;
 
 #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
+
+/*
+ * Determine how many pages need to be initialized during early boot
+ * (non-deferred initialization).
+ * The value of first_deferred_pfn will be set later, once non-deferred pages
+ * are initialized, but for now set it to ULONG_MAX.
+ */
 static inline void reset_deferred_meminit(pg_data_t *pgdat)
 {
+       phys_addr_t start_addr, end_addr;
+       unsigned long max_pgcnt;
+       unsigned long reserved;
+
+       /*
+        * Initialise at least 2G of a node but also take into account that
+        * two large system hashes can take up 1GB for 0.25TB/node.
+        */
+       max_pgcnt = max(2UL << (30 - PAGE_SHIFT),
+                       (pgdat->node_spanned_pages >> 8));
+
+       /*
+        * Compensate for all the memblock reservations (e.g. crash kernel)
+        * from the initial estimation to make sure we will initialize enough
+        * memory to boot.
+        */
+       start_addr = PFN_PHYS(pgdat->node_start_pfn);
+       end_addr = PFN_PHYS(pgdat->node_start_pfn + max_pgcnt);
+       reserved = memblock_reserved_memory_within(start_addr, end_addr);
+       max_pgcnt += PHYS_PFN(reserved);
+
+       pgdat->static_init_pgcnt = min(max_pgcnt, pgdat->node_spanned_pages);
        pgdat->first_deferred_pfn = ULONG_MAX;
 }
 
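The sizing above is easiest to follow with concrete numbers. A minimal userspace
sketch, assuming 4 KiB pages and a made-up 64 GiB node with 256 MiB of memblock
reservations (the real code gets that figure from memblock_reserved_memory_within()):

    #include <stdio.h>

    #define PAGE_SHIFT 12UL                 /* assume 4 KiB pages */

    int main(void)
    {
        unsigned long node_spanned_pages = 64UL << (30 - PAGE_SHIFT);  /* 64 GiB node */
        unsigned long reserved_pages     = 256UL << (20 - PAGE_SHIFT); /* 256 MiB reserved */
        unsigned long max_pgcnt;

        /* at least 2 GiB, or 1/256th of the node, whichever is larger */
        max_pgcnt = 2UL << (30 - PAGE_SHIFT);
        if ((node_spanned_pages >> 8) > max_pgcnt)
            max_pgcnt = node_spanned_pages >> 8;

        /* compensate for memblock reservations (e.g. crashkernel) in that range */
        max_pgcnt += reserved_pages;

        /* but never more than the node actually spans */
        if (max_pgcnt > node_spanned_pages)
            max_pgcnt = node_spanned_pages;

        printf("static_init_pgcnt = %lu pages (%lu MiB)\n",
               max_pgcnt, max_pgcnt >> (20 - PAGE_SHIFT));
        return 0;
    }

The update_defer_init() hunk below then simply compares *nr_initialised against
this precomputed pgdat->static_init_pgcnt instead of redoing the max() on every call.
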
@@ -308,20 +337,11 @@ static inline bool update_defer_init(pg_data_t *pgdat,
                                unsigned long pfn, unsigned long zone_end,
                                unsigned long *nr_initialised)
 {
-       unsigned long max_initialise;
-
        /* Always populate low zones for address-constrained allocations */
        if (zone_end < pgdat_end_pfn(pgdat))
                return true;
-       /*
-        * Initialise at least 2G of a node but also take into account that
-        * two large system hashes that can take up 1GB for 0.25TB/node.
-        */
-       max_initialise = max(2UL << (30 - PAGE_SHIFT),
-               (pgdat->node_spanned_pages >> 8));
-
        (*nr_initialised)++;
-       if ((*nr_initialised > max_initialise) &&
+       if ((*nr_initialised > pgdat->static_init_pgcnt) &&
            (pfn & (PAGES_PER_SECTION - 1)) == 0) {
                pgdat->first_deferred_pfn = pfn;
                return false;
@@ -1576,6 +1596,10 @@ void __init page_alloc_init_late(void)
        /* Reinit limits that are based on free pages after the kernel is up */
        files_maxfiles_init();
 #endif
+#ifdef CONFIG_ARCH_DISCARD_MEMBLOCK
+       /* Discard memblock private memory */
+       memblock_discard();
+#endif
 
        for_each_populated_zone(zone)
                set_zone_contiguous(zone);
@@ -1864,14 +1888,14 @@ int move_freepages(struct zone *zone,
 #endif
 
        for (page = start_page; page <= end_page;) {
-               /* Make sure we are not inadvertently changing nodes */
-               VM_BUG_ON_PAGE(page_to_nid(page) != zone_to_nid(zone), page);
-
                if (!pfn_valid_within(page_to_pfn(page))) {
                        page++;
                        continue;
                }
 
+               /* Make sure we are not inadvertently changing nodes */
+               VM_BUG_ON_PAGE(page_to_nid(page) != zone_to_nid(zone), page);
+
                if (!PageBuddy(page)) {
                        page++;
                        continue;
@@ -2085,13 +2109,25 @@ static void unreserve_highatomic_pageblock(const struct alloc_context *ac)
                                continue;
 
                        /*
-                        * It should never happen but changes to locking could
-                        * inadvertently allow a per-cpu drain to add pages
-                        * to MIGRATE_HIGHATOMIC while unreserving so be safe
-                        * and watch for underflows.
+                        * In the page freeing path the migratetype change is
+                        * racy, so we can encounter several free pages in a
+                        * pageblock in this loop even though we have already
+                        * changed the pageblock type from highatomic to
+                        * ac->migratetype. So only adjust the count once.
                         */
-                       zone->nr_reserved_highatomic -= min(pageblock_nr_pages,
-                               zone->nr_reserved_highatomic);
+                       if (get_pageblock_migratetype(page) ==
+                                                       MIGRATE_HIGHATOMIC) {
+                               /*
+                                * It should never happen but changes to
+                                * locking could inadvertently allow a per-cpu
+                                * drain to add pages to MIGRATE_HIGHATOMIC
+                                * while unreserving so be safe and watch for
+                                * underflows.
+                                */
+                               zone->nr_reserved_highatomic -= min(
+                                               pageblock_nr_pages,
+                                               zone->nr_reserved_highatomic);
+                       }
 
                        /*
                         * Convert to ac->migratetype and avoid the normal
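
Pages can still sit on the MIGRATE_HIGHATOMIC free list after their pageblock has
been retyped (freeing is racy with respect to the migratetype), which is why the
reserve is only charged back while the block is still marked highatomic. A small
userspace model with a made-up pageblock size:

    #include <stdio.h>

    #define PAGEBLOCK_NR_PAGES 512UL    /* made-up pageblock_nr_pages */

    int main(void)
    {
        unsigned long nr_reserved_highatomic = PAGEBLOCK_NR_PAGES; /* one block reserved */
        int still_highatomic[] = { 1, 0, 0 };  /* stale list entries after the retype */

        for (int i = 0; i < 3; i++) {
            if (still_highatomic[i]) {
                /* watch for underflow, as the kernel comment asks */
                unsigned long dec = PAGEBLOCK_NR_PAGES < nr_reserved_highatomic ?
                                    PAGEBLOCK_NR_PAGES : nr_reserved_highatomic;
                nr_reserved_highatomic -= dec;
            }
            /* the block is converted to ac->migratetype either way */
        }
        printf("nr_reserved_highatomic = %lu (charged back exactly once)\n",
               nr_reserved_highatomic);
        return 0;
    }
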
@@ -2192,7 +2228,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
                        unsigned long count, struct list_head *list,
                        int migratetype, bool cold)
 {
-       int i;
+       int i, alloced = 0;
 
        spin_lock(&zone->lock);
        for (i = 0; i < count; ++i) {
@@ -2217,13 +2253,21 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
                else
                        list_add_tail(&page->lru, list);
                list = &page->lru;
+               alloced++;
                if (is_migrate_cma(get_pcppage_migratetype(page)))
                        __mod_zone_page_state(zone, NR_FREE_CMA_PAGES,
                                              -(1 << order));
        }
+
+       /*
+        * i pages were removed from the buddy list even if some leak due
+        * to check_pcp_refill failing so adjust NR_FREE_PAGES based
+        * on i. Do not confuse with 'alloced' which is the number of
+        * pages added to the pcp list.
+        */
        __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order));
        spin_unlock(&zone->lock);
-       return i;
+       return alloced;
 }
 
 #ifdef CONFIG_NUMA
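
The 'i'/'alloced' split keeps two different books straight: NR_FREE_PAGES must
reflect every page that left the buddy list, while the caller only cares about
pages that actually reached the pcp list. A toy userspace model, with a
check_pcp_refill() failure simulated by hand:

    #include <stdio.h>

    int main(void)
    {
        int count = 8, order = 0;
        int i, alloced = 0;

        for (i = 0; i < count; i++) {
            /* page = __rmqueue(...): the page is now off the buddy list */
            if (i == 3)     /* pretend check_pcp_refill() rejects this one */
                continue;
            alloced++;      /* page really added to the pcp list */
        }

        printf("NR_FREE_PAGES -= %d, caller sees %d pages\n", i << order, alloced);
        return 0;
    }
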
@@ -2548,30 +2592,23 @@ int __isolate_free_page(struct page *page, unsigned int order)
  * Update NUMA hit/miss statistics
  *
  * Must be called with interrupts disabled.
- *
- * When __GFP_OTHER_NODE is set assume the node of the preferred
- * zone is the local node. This is useful for daemons who allocate
- * memory on behalf of other processes.
  */
 static inline void zone_statistics(struct zone *preferred_zone, struct zone *z,
                                                                gfp_t flags)
 {
 #ifdef CONFIG_NUMA
-       int local_nid = numa_node_id();
        enum zone_stat_item local_stat = NUMA_LOCAL;
 
-       if (unlikely(flags & __GFP_OTHER_NODE)) {
+       if (z->node != numa_node_id())
                local_stat = NUMA_OTHER;
-               local_nid = preferred_zone->node;
-       }
 
-       if (z->node == local_nid) {
+       if (z->node == preferred_zone->node)
                __inc_zone_state(z, NUMA_HIT);
-               __inc_zone_state(z, local_stat);
-       } else {
+       else {
                __inc_zone_state(z, NUMA_MISS);
                __inc_zone_state(preferred_zone, NUMA_FOREIGN);
        }
+       __inc_zone_state(z, local_stat);
 #endif
 }
 
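With __GFP_OTHER_NODE gone, the accounting above depends only on three nodes:
where the page came from (z), where the caller wanted it (preferred_zone) and
where the caller runs (numa_node_id()). A small userspace model of which vmstat
counters get bumped; account() is purely illustrative, not a kernel helper:

    #include <stdio.h>

    static void account(int z, int preferred, int local)
    {
        if (z == preferred)
            printf("NUMA_HIT on node %d", z);
        else
            printf("NUMA_MISS on node %d, NUMA_FOREIGN on node %d", z, preferred);
        printf(", %s\n", z == local ? "NUMA_LOCAL" : "NUMA_OTHER");
    }

    int main(void)
    {
        account(0, 0, 0);   /* satisfied from the preferred, local node */
        account(1, 0, 0);   /* fell back to a remote node */
        return 0;
    }
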
@@ -2784,9 +2821,6 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
                if (!area->nr_free)
                        continue;
 
-               if (alloc_harder)
-                       return true;
-
                for (mt = 0; mt < MIGRATE_PCPTYPES; mt++) {
                        if (!list_empty(&area->free_list[mt]))
                                return true;
@@ -2798,6 +2832,9 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
                        return true;
                }
 #endif
+               if (alloc_harder &&
+                       !list_empty(&area->free_list[MIGRATE_HIGHATOMIC]))
+                       return true;
        }
        return false;
 }
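
The removed early "return true" trusted area->nr_free alone, but for an
alloc_harder request those free pages could all be on lists it cannot use (in the
kernel the CMA list only counts when ALLOC_CMA is set). The replacement only
trusts the MIGRATE_HIGHATOMIC list, which alloc_harder requests may dip into. A
rough userspace model of the walk, with a simplified migratetype layout:

    #include <stdbool.h>
    #include <stdio.h>

    enum { UNMOVABLE, MOVABLE, RECLAIMABLE, NR_PCPTYPES,
           HIGHATOMIC = NR_PCPTYPES, CMA, NR_MT };

    static bool watermark_ok(const int nr_free[NR_MT], bool alloc_harder, bool alloc_cma)
    {
        for (int mt = 0; mt < NR_PCPTYPES; mt++)
            if (nr_free[mt])
                return true;
        if (alloc_cma && nr_free[CMA])
            return true;
        return alloc_harder && nr_free[HIGHATOMIC];
    }

    int main(void)
    {
        int only_cma[NR_MT] = { [CMA] = 32 };   /* every free page is CMA */

        /* an unmovable alloc_harder request cannot use them */
        printf("usable: %s\n", watermark_ok(only_cma, true, false) ? "yes" : "no");
        return 0;
    }
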
@@ -2850,7 +2887,7 @@ bool zone_watermark_ok_safe(struct zone *z, unsigned int order,
 #ifdef CONFIG_NUMA
 static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
 {
-       return node_distance(zone_to_nid(local_zone), zone_to_nid(zone)) <
+       return node_distance(zone_to_nid(local_zone), zone_to_nid(zone)) <=
                                RECLAIM_DISTANCE;
 }
 #else  /* CONFIG_NUMA */
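
Switching to "<=" makes nodes whose distance is exactly RECLAIM_DISTANCE eligible
for node reclaim; presumably this matters because some SLIT tables report exactly
that value for remote nodes. The boundary case, with the usual default of 30:

    #include <stdio.h>

    #define RECLAIM_DISTANCE 30     /* the common default */

    static int allows_reclaim_old(int d) { return d <  RECLAIM_DISTANCE; }
    static int allows_reclaim_new(int d) { return d <= RECLAIM_DISTANCE; }

    int main(void)
    {
        for (int d = 10; d <= 40; d += 10)
            printf("node_distance %2d: old=%d new=%d\n",
                   d, allows_reclaim_old(d), allows_reclaim_new(d));
        return 0;
    }
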
@@ -3117,6 +3154,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
                enum compact_priority prio, enum compact_result *compact_result)
 {
        struct page *page;
+       unsigned int noreclaim_flag = current->flags & PF_MEMALLOC;
 
        if (!order)
                return NULL;
@@ -3124,7 +3162,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
        current->flags |= PF_MEMALLOC;
        *compact_result = try_to_compact_pages(gfp_mask, order, alloc_flags, ac,
                                                                        prio);
-       current->flags &= ~PF_MEMALLOC;
+       current->flags = (current->flags & ~PF_MEMALLOC) | noreclaim_flag;
 
        if (*compact_result <= COMPACT_INACTIVE)
                return NULL;
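
The saved noreclaim_flag matters because __alloc_pages_direct_compact() can be
entered by a task that already runs with PF_MEMALLOC set; clearing the bit
unconditionally on the way out would silently drop the caller's state. A minimal
userspace sketch of the save/restore idiom (flag value as in the kernel headers):

    #include <stdio.h>

    #define PF_MEMALLOC 0x00000800

    int main(void)
    {
        unsigned int flags = PF_MEMALLOC;   /* caller already in a memalloc section */
        unsigned int noreclaim_flag = flags & PF_MEMALLOC;

        flags |= PF_MEMALLOC;               /* enter compaction */
        /* ... try_to_compact_pages() would run here ... */
        flags = (flags & ~PF_MEMALLOC) | noreclaim_flag;    /* restore, don't clear */

        printf("caller's PF_MEMALLOC survived: %s\n",
               (flags & PF_MEMALLOC) ? "yes" : "no");
        return 0;
    }
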
@@ -3384,12 +3422,6 @@ bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)
 }
 
 /*
- * Maximum number of reclaim retries without any progress before OOM killer
- * is consider as the only way to move forward.
- */
-#define MAX_RECLAIM_RETRIES 16
-
-/*
  * Checks whether it makes sense to retry the reclaim to make a forward progress
  * for the given allocation request.
  * The reclaim feedback represented by did_some_progress (any progress during
@@ -3494,12 +3526,11 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
        struct page *page = NULL;
        unsigned int alloc_flags;
        unsigned long did_some_progress;
-       enum compact_priority compact_priority = DEF_COMPACT_PRIORITY;
+       enum compact_priority compact_priority;
        enum compact_result compact_result;
-       int compaction_retries = 0;
-       int no_progress_loops = 0;
-       unsigned long alloc_start = jiffies;
-       unsigned int stall_timeout = 10 * HZ;
+       int compaction_retries;
+       int no_progress_loops;
+       unsigned int cpuset_mems_cookie;
 
        /*
         * In the slowpath, we sanity check order to avoid ever trying to
@@ -3520,6 +3551,23 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
                                (__GFP_ATOMIC|__GFP_DIRECT_RECLAIM)))
                gfp_mask &= ~__GFP_ATOMIC;
 
+retry_cpuset:
+       compaction_retries = 0;
+       no_progress_loops = 0;
+       compact_priority = DEF_COMPACT_PRIORITY;
+       cpuset_mems_cookie = read_mems_allowed_begin();
+       /*
+        * We need to recalculate the starting point for the zonelist iterator
+        * because we might have used different nodemask in the fast path, or
+        * there was a cpuset modification and we are retrying - otherwise we
+        * could end up iterating over non-eligible zones endlessly.
+        */
+       ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
+                                       ac->high_zoneidx, ac->nodemask);
+       if (!ac->preferred_zoneref->zone)
+               goto nopage;
+
+
        /*
         * The fast path uses conservative alloc_flags to succeed only until
         * kswapd needs to be woken up, and to avoid the cost of setting up
@@ -3592,7 +3640,6 @@ retry:
         * orientated.
         */
        if (!(alloc_flags & ALLOC_CPUSET) || (alloc_flags & ALLOC_NO_WATERMARKS)) {
-               ac->zonelist = node_zonelist(numa_node_id(), gfp_mask);
                ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
                                        ac->high_zoneidx, ac->nodemask);
        }
@@ -3655,14 +3702,6 @@ retry:
        if (order > PAGE_ALLOC_COSTLY_ORDER && !(gfp_mask & __GFP_REPEAT))
                goto nopage;
 
-       /* Make sure we know about allocations which stall for too long */
-       if (time_after(jiffies, alloc_start + stall_timeout)) {
-               warn_alloc(gfp_mask,
-                       "page allocation stalls for %ums, order:%u",
-                       jiffies_to_msecs(jiffies-alloc_start), order);
-               stall_timeout += 10 * HZ;
-       }
-
        if (should_reclaim_retry(gfp_mask, order, ac, alloc_flags,
                                 did_some_progress > 0, &no_progress_loops))
                goto retry;
@@ -3679,6 +3718,13 @@ retry:
                                &compaction_retries))
                goto retry;
 
+       /*
+        * It's possible we raced with cpuset update so the OOM would be
+        * premature (see below the nopage: label for full explanation).
+        */
+       if (read_mems_allowed_retry(cpuset_mems_cookie))
+               goto retry_cpuset;
+
        /* Reclaim has failed us, start killing things */
        page = __alloc_pages_may_oom(gfp_mask, order, ac, &did_some_progress);
        if (page)
@@ -3691,6 +3737,16 @@ retry:
        }
 
 nopage:
+       /*
+        * When updating a task's mems_allowed or mempolicy nodemask, it is
+        * possible to race with parallel threads in such a way that our
+        * allocation can fail while the mask is being updated. If we are about
+        * to fail, check if the cpuset changed during allocation and if so,
+        * retry.
+        */
+       if (read_mems_allowed_retry(cpuset_mems_cookie))
+               goto retry_cpuset;
+
        warn_alloc(gfp_mask,
                        "page allocation failure: order:%u", order);
 got_pg:
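
Both read_mems_allowed_retry() checks rely on the same seqcount-style protocol:
read_mems_allowed_begin() snapshots a sequence counter that cpuset/mempolicy
updates bump, and a failed allocation is retried only if that counter moved
underneath us. A userspace model of the pattern; the counter and helpers below
are stand-ins for the real cpuset API:

    #include <stdio.h>

    static unsigned int mems_seq;       /* bumped by "cpuset updates" */

    static unsigned int mems_allowed_begin(void) { return mems_seq; }
    static int mems_allowed_retry(unsigned int cookie) { return cookie != mems_seq; }

    int main(void)
    {
        int attempts = 0, page = 0;
        unsigned int cookie;

        do {
            cookie = mems_allowed_begin();
            attempts++;
            if (attempts == 1)
                mems_seq++;         /* a concurrent nodemask update races with us */
            page = (attempts > 1);  /* ...and the second pass succeeds */
        } while (!page && mems_allowed_retry(cookie));

        printf("allocation succeeded after %d attempt(s)\n", attempts);
        return 0;
    }
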
@@ -3705,7 +3761,6 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
                        struct zonelist *zonelist, nodemask_t *nodemask)
 {
        struct page *page;
-       unsigned int cpuset_mems_cookie;
        unsigned int alloc_flags = ALLOC_WMARK_LOW;
        gfp_t alloc_mask = gfp_mask; /* The gfp_t that was actually used for allocation */
        struct alloc_context ac = {
@@ -3742,9 +3797,6 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
        if (IS_ENABLED(CONFIG_CMA) && ac.migratetype == MIGRATE_MOVABLE)
                alloc_flags |= ALLOC_CMA;
 
-retry_cpuset:
-       cpuset_mems_cookie = read_mems_allowed_begin();
-
        /* Dirty zone balancing only done in the fast path */
        ac.spread_dirty_pages = (gfp_mask & __GFP_WRITE);
 
@@ -3755,8 +3807,13 @@ retry_cpuset:
         */
        ac.preferred_zoneref = first_zones_zonelist(ac.zonelist,
                                        ac.high_zoneidx, ac.nodemask);
-       if (!ac.preferred_zoneref) {
+       if (!ac.preferred_zoneref->zone) {
                page = NULL;
+               /*
+                * This might be due to race with cpuset_current_mems_allowed
+                * update, so make sure we retry with original nodemask in the
+                * slow path.
+                */
                goto no_zone;
        }
 
@@ -3765,6 +3822,7 @@ retry_cpuset:
        if (likely(page))
                goto out;
 
+no_zone:
        /*
         * Runtime PM, block IO and its error handling path can deadlock
         * because I/O on the device might not complete.
@@ -3776,21 +3834,10 @@ retry_cpuset:
         * Restore the original nodemask if it was potentially replaced with
         * &cpuset_current_mems_allowed to optimize the fast-path attempt.
         */
-       if (cpusets_enabled())
+       if (unlikely(ac.nodemask != nodemask))
                ac.nodemask = nodemask;
-       page = __alloc_pages_slowpath(alloc_mask, order, &ac);
 
-no_zone:
-       /*
-        * When updating a task's mems_allowed, it is possible to race with
-        * parallel threads in such a way that an allocation can fail while
-        * the mask is being updated. If a page allocation is about to fail,
-        * check if the cpuset changed during allocation and if so, retry.
-        */
-       if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) {
-               alloc_mask = gfp_mask;
-               goto retry_cpuset;
-       }
+       page = __alloc_pages_slowpath(alloc_mask, order, &ac);
 
 out:
        if (memcg_kmem_enabled() && (gfp_mask & __GFP_ACCOUNT) && page &&
@@ -3908,11 +3955,11 @@ refill:
                /* Even if we own the page, we do not use atomic_set().
                 * This would break get_page_unless_zero() users.
                 */
-               page_ref_add(page, size - 1);
+               page_ref_add(page, size);
 
                /* reset page count bias and offset to start of new frag */
                nc->pfmemalloc = page_is_pfmemalloc(page);
-               nc->pagecnt_bias = size;
+               nc->pagecnt_bias = size + 1;
                nc->offset = size;
        }
 
@@ -3928,10 +3975,10 @@ refill:
                size = nc->size;
 #endif
                /* OK, page count is 0, we can safely set it */
-               set_page_count(page, size);
+               set_page_count(page, size + 1);
 
                /* reset page count bias and offset to start of new frag */
-               nc->pagecnt_bias = size;
+               nc->pagecnt_bias = size + 1;
                offset = size - fragsz;
        }
 
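Both page-fragment hunks move the cache from holding "size" page references to
"size + 1". Assuming no fragment is smaller than one byte, at most "size"
fragments can be handed out, and the extra reference is the one the cache keeps
for itself so the page cannot be freed behind its back once every user has
dropped theirs. A userspace model of the accounting with a made-up tiny page:

    #include <stdio.h>

    int main(void)
    {
        int size = 4;                   /* made-up tiny page size */
        int page_refs = 1 + size;       /* alloc ref + page_ref_add(page, size) */
        int pagecnt_bias = size + 1;

        for (int i = 0; i < size; i++)  /* hand out size one-byte fragments */
            pagecnt_bias--;
        for (int i = 0; i < size; i++)  /* every user eventually frees its fragment */
            page_refs--;

        printf("refs=%d bias=%d -> cache still owns the page: %s\n",
               page_refs, pagecnt_bias,
               (page_refs == pagecnt_bias && page_refs > 0) ? "yes" : "no");
        return 0;
    }

With the old page_ref_add(page, size - 1) / pagecnt_bias = size pairing, the same
sequence ends with a page refcount of zero while the cache still points at the page.
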
@@ -4311,17 +4358,18 @@ void show_free_areas(unsigned int filter)
                        K(node_page_state(pgdat, NR_FILE_MAPPED)),
                        K(node_page_state(pgdat, NR_FILE_DIRTY)),
                        K(node_page_state(pgdat, NR_WRITEBACK)),
+                       K(node_page_state(pgdat, NR_SHMEM)),
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
                        K(node_page_state(pgdat, NR_SHMEM_THPS) * HPAGE_PMD_NR),
                        K(node_page_state(pgdat, NR_SHMEM_PMDMAPPED)
                                        * HPAGE_PMD_NR),
                        K(node_page_state(pgdat, NR_ANON_THPS) * HPAGE_PMD_NR),
 #endif
-                       K(node_page_state(pgdat, NR_SHMEM)),
                        K(node_page_state(pgdat, NR_WRITEBACK_TEMP)),
                        K(node_page_state(pgdat, NR_UNSTABLE_NFS)),
                        node_page_state(pgdat, NR_PAGES_SCANNED),
-                       !pgdat_reclaimable(pgdat) ? "yes" : "no");
+                       pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES ?
+                               "yes" : "no");
        }
 
        for_each_populated_zone(zone) {
@@ -5443,13 +5491,15 @@ static unsigned long __meminit zone_spanned_pages_in_node(int nid,
                                        unsigned long *zone_end_pfn,
                                        unsigned long *ignored)
 {
+       unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type];
+       unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type];
        /* When hotadd a new node from cpu_up(), the node should be empty */
        if (!node_start_pfn && !node_end_pfn)
                return 0;
 
        /* Get the start and end of the zone */
-       *zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type];
-       *zone_end_pfn = arch_zone_highest_possible_pfn[zone_type];
+       *zone_start_pfn = clamp(node_start_pfn, zone_low, zone_high);
+       *zone_end_pfn = clamp(node_end_pfn, zone_low, zone_high);
        adjust_zone_range_for_zone_movable(nid, zone_type,
                                node_start_pfn, node_end_pfn,
                                zone_start_pfn, zone_end_pfn);
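
The clamp keeps the zone span inside the node being initialised instead of using
the arch-wide zone limits directly, which on a multi-node system can start before
or extend past the node. A quick userspace check with invented pfn ranges:

    #include <stdio.h>

    static unsigned long clamp_ul(unsigned long v, unsigned long lo, unsigned long hi)
    {
        return v < lo ? lo : v > hi ? hi : v;
    }

    int main(void)
    {
        unsigned long node_start_pfn = 0x100000, node_end_pfn = 0x200000; /* node 1 */
        unsigned long zone_low = 0, zone_high = 0x180000;   /* arch limits for the zone */

        printf("zone spans [%#lx, %#lx) inside node [%#lx, %#lx)\n",
               clamp_ul(node_start_pfn, zone_low, zone_high),
               clamp_ul(node_end_pfn, zone_low, zone_high),
               node_start_pfn, node_end_pfn);
        return 0;
    }
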
@@ -5876,7 +5926,6 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
        /* pg_data_t should be reset to zero when it's allocated */
        WARN_ON(pgdat->nr_zones || pgdat->kswapd_classzone_idx);
 
-       reset_deferred_meminit(pgdat);
        pgdat->node_id = nid;
        pgdat->node_start_pfn = node_start_pfn;
        pgdat->per_cpu_nodestats = NULL;
@@ -5898,6 +5947,7 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
                (unsigned long)pgdat->node_mem_map);
 #endif
 
+       reset_deferred_meminit(pgdat);
        free_area_init_core(pgdat);
 }
 
@@ -6399,8 +6449,8 @@ unsigned long free_reserved_area(void *start, void *end, int poison, char *s)
        }
 
        if (pages && s)
-               pr_info("Freeing %s memory: %ldK (%p - %p)\n",
-                       s, pages << (PAGE_SHIFT - 10), start, end);
+               pr_info("Freeing %s memory: %ldK\n",
+                       s, pages << (PAGE_SHIFT - 10));
 
        return pages;
 }
@@ -7238,11 +7288,18 @@ int alloc_contig_range(unsigned long start, unsigned long end,
 
        /*
         * In case of -EBUSY, we'd like to know which page causes problem.
-        * So, just fall through. We will check it in test_pages_isolated().
+        * So, just fall through. test_pages_isolated() has a tracepoint
+        * which will report the busy page.
+        *
+        * It is possible that busy pages could become available before
+        * the call to test_pages_isolated(), and the range will actually
+        * be allocated.  So, if we fall through, be sure to clear ret so
+        * that -EBUSY is not accidentally used or returned to the caller.
         */
        ret = __alloc_contig_migrate_range(&cc, start, end);
        if (ret && ret != -EBUSY)
                goto done;
+       ret = 0;
 
        /*
         * Pages from [start, end) are within a MAX_ORDER_NR_PAGES
@@ -7289,7 +7346,7 @@ int alloc_contig_range(unsigned long start, unsigned long end,
 
        /* Make sure the range is really isolated. */
        if (test_pages_isolated(outer_start, end, false)) {
-               pr_info("%s: [%lx, %lx) PFNs busy\n",
+               pr_info_ratelimited("%s: [%lx, %lx) PFNs busy\n",
                        __func__, outer_start, end);
                ret = -EBUSY;
                goto done;