MIPS: VDSO: Prevent use of smp_processor_id()

[android-x86/kernel.git] / mm / vmscan.c
diff --git a/mm/vmscan.c b/mm/vmscan.c

index 76fda22..4e5846b 100644 (file)
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -234,12 +234,39 @@ bool pgdat_reclaimable(struct pglist_data *pgdat)
                 pgdat_reclaimable_pages(pgdat) * 6;
  }
  
-unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru)
+/**
+ * lruvec_lru_size - Returns the number of pages on the given LRU list.
+ * @lruvec: lru vector
+ * @lru: lru to use
+ * @zone_idx: zones to consider (use MAX_NR_ZONES for the whole LRU list)
+ */
+unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru, int zone_idx)
  {
+       unsigned long lru_size;
+       int zid;
+
         if (!mem_cgroup_disabled())
-               return mem_cgroup_get_lru_size(lruvec, lru);
+               lru_size = mem_cgroup_get_lru_size(lruvec, lru);
+       else
+               lru_size = node_page_state(lruvec_pgdat(lruvec), NR_LRU_BASE + lru);
+
+       for (zid = zone_idx + 1; zid < MAX_NR_ZONES; zid++) {
+               struct zone *zone = &lruvec_pgdat(lruvec)->node_zones[zid];
+               unsigned long size;
+
+               if (!managed_zone(zone))
+                       continue;
+
+               if (!mem_cgroup_disabled())
+                       size = mem_cgroup_get_zone_lru_size(lruvec, lru, zid);
+               else
+                       size = zone_page_state(&lruvec_pgdat(lruvec)->node_zones[zid],
+                                      NR_ZONE_LRU_BASE + lru);
+               lru_size -= min(size, lru_size);
+       }
+
+       return lru_size;
  
-       return node_page_state(lruvec_pgdat(lruvec), NR_LRU_BASE + lru);
  }
  
  /*
@@ -268,10 +295,13 @@ EXPORT_SYMBOL(register_shrinker);
   */
  void unregister_shrinker(struct shrinker *shrinker)
  {
+       if (!shrinker->nr_deferred)
+               return;
         down_write(&shrinker_rwsem);
         list_del(&shrinker->list);
         up_write(&shrinker_rwsem);
         kfree(shrinker->nr_deferred);
+       shrinker->nr_deferred = NULL;
  }
  EXPORT_SYMBOL(unregister_shrinker);
  
@@ -291,6 +321,7 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
         int nid = shrinkctl->nid;
         long batch_size = shrinker->batch ? shrinker->batch
                                           : SHRINK_BATCH;
+       long scanned = 0, next_deferred;
  
         freeable = shrinker->count_objects(shrinker, shrinkctl);
         if (freeable == 0)
@@ -312,7 +343,9 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
                 pr_err("shrink_slab: %pF negative objects to delete nr=%ld\n",
                        shrinker->scan_objects, total_scan);
                 total_scan = freeable;
-       }
+               next_deferred = nr;
+       } else
+               next_deferred = total_scan;
  
         /*
          * We need to avoid excessive windup on filesystem shrinkers
@@ -369,17 +402,22 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
  
                 count_vm_events(SLABS_SCANNED, nr_to_scan);
                 total_scan -= nr_to_scan;
+               scanned += nr_to_scan;
  
                 cond_resched();
         }
  
+       if (next_deferred >= scanned)
+               next_deferred -= scanned;
+       else
+               next_deferred = 0;
         /*
          * move the unused scan count back into the shrinker in a
          * manner that handles concurrent updates. If we exhausted the
          * scan, there is no need to do an update.
          */
-       if (total_scan > 0)
-               new_nr = atomic_long_add_return(total_scan,
+       if (next_deferred > 0)
+               new_nr = atomic_long_add_return(next_deferred,
                                                 &shrinker->nr_deferred[nid]);
         else
                 new_nr = atomic_long_read(&shrinker->nr_deferred[nid]);
@@ -1336,6 +1374,7 @@ int __isolate_lru_page(struct page *page, isolate_mode_t mode)
  
                 if (PageDirty(page)) {
                         struct address_space *mapping;
+                       bool migrate_dirty;
  
                         /* ISOLATE_CLEAN means only clean pages */
                         if (mode & ISOLATE_CLEAN)
@@ -1344,10 +1383,19 @@ int __isolate_lru_page(struct page *page, isolate_mode_t mode)
                         /*
                          * Only pages without mappings or that have a
                          * ->migratepage callback are possible to migrate
-                        * without blocking
+                        * without blocking. However, we can be racing with
+                        * truncation so it's necessary to lock the page
+                        * to stabilise the mapping as truncation holds
+                        * the page lock until after the page is removed
+                        * from the page cache.
                          */
+                       if (!trylock_page(page))
+                               return ret;
+
                         mapping = page_mapping(page);
-                       if (mapping && !mapping->a_ops->migratepage)
+                       migrate_dirty = !mapping || mapping->a_ops->migratepage;
+                       unlock_page(page);
+                       if (!migrate_dirty)
                                 return ret;
                 }
         }
@@ -1374,8 +1422,7 @@ int __isolate_lru_page(struct page *page, isolate_mode_t mode)
   * be complete before mem_cgroup_update_lru_size due to a santity check.
   */
  static __always_inline void update_lru_sizes(struct lruvec *lruvec,
-                       enum lru_list lru, unsigned long *nr_zone_taken,
-                       unsigned long nr_taken)
+                       enum lru_list lru, unsigned long *nr_zone_taken)
  {
         int zid;
  
@@ -1384,11 +1431,11 @@ static __always_inline void update_lru_sizes(struct lruvec *lruvec,
                         continue;
  
                 __update_lru_size(lruvec, lru, zid, -nr_zone_taken[zid]);
-       }
-
  #ifdef CONFIG_MEMCG
-       mem_cgroup_update_lru_size(lruvec, lru, -nr_taken);
+               mem_cgroup_update_lru_size(lruvec, lru, zid, -nr_zone_taken[zid]);
  #endif
+       }
+
  }
  
  /*
@@ -1493,7 +1540,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
         *nr_scanned = scan;
         trace_mm_vmscan_lru_isolate(sc->reclaim_idx, sc->order, nr_to_scan, scan,
                                     nr_taken, mode, is_file_lru(lru));
-       update_lru_sizes(lruvec, lru, nr_zone_taken, nr_taken);
+       update_lru_sizes(lruvec, lru, nr_zone_taken);
         return nr_taken;
  }
  
@@ -2011,11 +2058,10 @@ static bool inactive_list_is_low(struct lruvec *lruvec, bool file,
                                                 struct scan_control *sc)
  {
         unsigned long inactive_ratio;
-       unsigned long inactive;
-       unsigned long active;
+       unsigned long inactive, active;
+       enum lru_list inactive_lru = file * LRU_FILE;
+       enum lru_list active_lru = file * LRU_FILE + LRU_ACTIVE;
         unsigned long gb;
-       struct pglist_data *pgdat = lruvec_pgdat(lruvec);
-       int zid;
  
         /*
          * If we don't have swap space, anonymous page deactivation
@@ -2024,29 +2070,8 @@ static bool inactive_list_is_low(struct lruvec *lruvec, bool file,
         if (!file && !total_swap_pages)
                 return false;
  
-       inactive = lruvec_lru_size(lruvec, file * LRU_FILE);
-       active = lruvec_lru_size(lruvec, file * LRU_FILE + LRU_ACTIVE);
-
-       /*
-        * For zone-constrained allocations, it is necessary to check if
-        * deactivations are required for lowmem to be reclaimed. This
-        * calculates the inactive/active pages available in eligible zones.
-        */
-       for (zid = sc->reclaim_idx + 1; zid < MAX_NR_ZONES; zid++) {
-               struct zone *zone = &pgdat->node_zones[zid];
-               unsigned long inactive_zone, active_zone;
-
-               if (!managed_zone(zone))
-                       continue;
-
-               inactive_zone = zone_page_state(zone,
-                               NR_ZONE_LRU_BASE + (file * LRU_FILE));
-               active_zone = zone_page_state(zone,
-                               NR_ZONE_LRU_BASE + (file * LRU_FILE) + LRU_ACTIVE);
-
-               inactive -= min(inactive, inactive_zone);
-               active -= min(active, active_zone);
-       }
+       inactive = lruvec_lru_size(lruvec, inactive_lru, sc->reclaim_idx);
+       active = lruvec_lru_size(lruvec, active_lru, sc->reclaim_idx);
  
         gb = (inactive + active) >> (30 - PAGE_SHIFT);
         if (gb)
@@ -2193,7 +2218,7 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,
          * system is under heavy pressure.
          */
         if (!inactive_list_is_low(lruvec, true, sc) &&
-           lruvec_lru_size(lruvec, LRU_INACTIVE_FILE) >> sc->priority) {
+           lruvec_lru_size(lruvec, LRU_INACTIVE_FILE, sc->reclaim_idx) >> sc->priority) {
                 scan_balance = SCAN_FILE;
                 goto out;
         }
@@ -2219,10 +2244,10 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,
          * anon in [0], file in [1]
          */
  
-       anon  = lruvec_lru_size(lruvec, LRU_ACTIVE_ANON) +
-               lruvec_lru_size(lruvec, LRU_INACTIVE_ANON);
-       file  = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE) +
-               lruvec_lru_size(lruvec, LRU_INACTIVE_FILE);
+       anon  = lruvec_lru_size(lruvec, LRU_ACTIVE_ANON, MAX_NR_ZONES) +
+               lruvec_lru_size(lruvec, LRU_INACTIVE_ANON, MAX_NR_ZONES);
+       file  = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE, MAX_NR_ZONES) +
+               lruvec_lru_size(lruvec, LRU_INACTIVE_FILE, MAX_NR_ZONES);
  
         spin_lock_irq(&pgdat->lru_lock);
         if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) {
@@ -2260,7 +2285,7 @@ out:
                         unsigned long size;
                         unsigned long scan;
  
-                       size = lruvec_lru_size(lruvec, lru);
+                       size = lruvec_lru_size(lruvec, lru, sc->reclaim_idx);
                         scan = size >> sc->priority;
  
                         if (!scan && pass && force_scan)
@@ -2354,6 +2379,8 @@ static void shrink_node_memcg(struct pglist_data *pgdat, struct mem_cgroup *memc
                         }
                 }
  
+               cond_resched();
+
                 if (nr_reclaimed < nr_to_reclaim || scan_adjusted)
                         continue;
  
@@ -2592,6 +2619,15 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
         } while (should_continue_reclaim(pgdat, sc->nr_reclaimed - nr_reclaimed,
                                          sc->nr_scanned - nr_scanned, sc));
  
+       /*
+        * Kswapd gives up on balancing particular nodes after too
+        * many failures to reclaim anything from them and goes to
+        * sleep. On reclaim progress, reset the failure counter. A
+        * successful direct reclaim run will revive a dormant kswapd.
+        */
+       if (reclaimable)
+               pgdat->kswapd_failures = 0;
+
         return reclaimable;
  }
  
@@ -2666,10 +2702,6 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
                                                  GFP_KERNEL | __GFP_HARDWALL))
                                 continue;
  
-                       if (sc->priority != DEF_PRIORITY &&
-                           !pgdat_reclaimable(zone->zone_pgdat))
-                               continue;       /* Let kswapd poll it */
-
                         /*
                          * If we already have plenty of memory free for
                          * compaction in this zone, don't free any more.
@@ -2806,7 +2838,7 @@ retry:
         return 0;
  }
  
-static bool pfmemalloc_watermark_ok(pg_data_t *pgdat)
+static bool allow_direct_reclaim(pg_data_t *pgdat)
  {
         struct zone *zone;
         unsigned long pfmemalloc_reserve = 0;
@@ -2814,10 +2846,15 @@ static bool pfmemalloc_watermark_ok(pg_data_t *pgdat)
         int i;
         bool wmark_ok;
  
+       if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES)
+               return true;
+
         for (i = 0; i <= ZONE_NORMAL; i++) {
                 zone = &pgdat->node_zones[i];
-               if (!managed_zone(zone) ||
-                   pgdat_reclaimable_pages(pgdat) == 0)
+               if (!managed_zone(zone))
+                       continue;
+
+               if (!zone_reclaimable_pages(zone))
                         continue;
  
                 pfmemalloc_reserve += min_wmark_pages(zone);
@@ -2894,7 +2931,7 @@ static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
  
                 /* Throttle based on the first usable node */
                 pgdat = zone->zone_pgdat;
-               if (pfmemalloc_watermark_ok(pgdat))
+               if (allow_direct_reclaim(pgdat))
                         goto out;
                 break;
         }
@@ -2916,14 +2953,14 @@ static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
          */
         if (!(gfp_mask & __GFP_FS)) {
                 wait_event_interruptible_timeout(pgdat->pfmemalloc_wait,
-                       pfmemalloc_watermark_ok(pgdat), HZ);
+                       allow_direct_reclaim(pgdat), HZ);
  
                 goto check_pending;
         }
  
         /* Throttle until kswapd wakes the process */
         wait_event_killable(zone->zone_pgdat->pfmemalloc_wait,
-               pfmemalloc_watermark_ok(pgdat));
+               allow_direct_reclaim(pgdat));
  
  check_pending:
         if (fatal_signal_pending(current))
@@ -2939,7 +2976,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
         unsigned long nr_reclaimed;
         struct scan_control sc = {
                 .nr_to_reclaim = SWAP_CLUSTER_MAX,
-               .gfp_mask = (gfp_mask = memalloc_noio_flags(gfp_mask)),
+               .gfp_mask = memalloc_noio_flags(gfp_mask),
                 .reclaim_idx = gfp_zone(gfp_mask),
                 .order = order,
                 .nodemask = nodemask,
@@ -2954,12 +2991,12 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
          * 1 is returned so that the page allocator does not OOM kill at this
          * point.
          */
-       if (throttle_direct_reclaim(gfp_mask, zonelist, nodemask))
+       if (throttle_direct_reclaim(sc.gfp_mask, zonelist, nodemask))
                 return 1;
  
         trace_mm_vmscan_direct_reclaim_begin(order,
                                 sc.may_writepage,
-                               gfp_mask,
+                               sc.gfp_mask,
                                 sc.reclaim_idx);
  
         nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
@@ -3086,6 +3123,7 @@ static bool zone_balanced(struct zone *zone, int order, int classzone_idx)
          */
         clear_bit(PGDAT_CONGESTED, &zone->zone_pgdat->flags);
         clear_bit(PGDAT_DIRTY, &zone->zone_pgdat->flags);
+       clear_bit(PGDAT_WRITEBACK, &zone->zone_pgdat->flags);
  
         return true;
  }
@@ -3102,7 +3140,7 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, int classzone_idx)
  
         /*
          * The throttled processes are normally woken up in balance_pgdat() as
-        * soon as pfmemalloc_watermark_ok() is true. But there is a potential
+        * soon as allow_direct_reclaim() is true. But there is a potential
          * race between when kswapd checks the watermarks and a process gets
          * throttled. There is also a potential race if processes get
          * throttled, kswapd wakes, a large process exits thereby balancing the
@@ -3116,6 +3154,10 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, int classzone_idx)
         if (waitqueue_active(&pgdat->pfmemalloc_wait))
                 wake_up_all(&pgdat->pfmemalloc_wait);
  
+       /* Hopeless node, leave it to direct reclaim */
+       if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES)
+               return true;
+
         for (i = 0; i <= classzone_idx; i++) {
                 struct zone *zone = pgdat->node_zones + i;
  
@@ -3202,9 +3244,9 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
         count_vm_event(PAGEOUTRUN);
  
         do {
+               unsigned long nr_reclaimed = sc.nr_reclaimed;
                 bool raise_priority = true;
  
-               sc.nr_reclaimed = 0;
                 sc.reclaim_idx = classzone_idx;
  
                 /*
@@ -3259,7 +3301,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
                  * If we're getting trouble reclaiming, start doing writepage
                  * even in laptop mode.
                  */
-               if (sc.priority < DEF_PRIORITY - 2 || !pgdat_reclaimable(pgdat))
+               if (sc.priority < DEF_PRIORITY - 2)
                         sc.may_writepage = 1;
  
                 /* Call soft limit reclaim before calling shrink_node. */
@@ -3283,7 +3325,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
                  * able to safely make forward progress. Wake them
                  */
                 if (waitqueue_active(&pgdat->pfmemalloc_wait) &&
-                               pfmemalloc_watermark_ok(pgdat))
+                               allow_direct_reclaim(pgdat))
                         wake_up_all(&pgdat->pfmemalloc_wait);
  
                 /* Check if kswapd should be suspending */
@@ -3294,10 +3336,14 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
                  * Raise priority if scanning rate is too low or there was no
                  * progress in reclaiming pages
                  */
-               if (raise_priority || !sc.nr_reclaimed)
+               nr_reclaimed = sc.nr_reclaimed - nr_reclaimed;
+               if (raise_priority || !nr_reclaimed)
                         sc.priority--;
         } while (sc.priority >= 1);
  
+       if (!sc.nr_reclaimed)
+               pgdat->kswapd_failures++;
+
  out:
         /*
          * Return the order kswapd stopped reclaiming at as
@@ -3497,6 +3543,10 @@ void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx)
         if (!waitqueue_active(&pgdat->kswapd_wait))
                 return;
  
+       /* Hopeless node, leave it to direct reclaim */
+       if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES)
+               return;
+
         /* Only wake kswapd if all zones are unbalanced */
         for (z = 0; z <= classzone_idx; z++) {
                 zone = pgdat->node_zones + z;
@@ -3710,16 +3760,15 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in
         const unsigned long nr_pages = 1 << order;
         struct task_struct *p = current;
         struct reclaim_state reclaim_state;
-       int classzone_idx = gfp_zone(gfp_mask);
         struct scan_control sc = {
                 .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
-               .gfp_mask = (gfp_mask = memalloc_noio_flags(gfp_mask)),
+               .gfp_mask = memalloc_noio_flags(gfp_mask),
                 .order = order,
                 .priority = NODE_RECLAIM_PRIORITY,
                 .may_writepage = !!(node_reclaim_mode & RECLAIM_WRITE),
                 .may_unmap = !!(node_reclaim_mode & RECLAIM_UNMAP),
                 .may_swap = 1,
-               .reclaim_idx = classzone_idx,
+               .reclaim_idx = gfp_zone(gfp_mask),
         };
  
         cond_resched();
@@ -3729,7 +3778,7 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in
          * and RECLAIM_UNMAP.
          */
         p->flags |= PF_MEMALLOC | PF_SWAPWRITE;
-       lockdep_set_current_reclaim_state(gfp_mask);
+       lockdep_set_current_reclaim_state(sc.gfp_mask);
         reclaim_state.reclaimed_slab = 0;
         p->reclaim_state = &reclaim_state;
  
@@ -3767,9 +3816,6 @@ int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order)
             sum_zone_node_page_state(pgdat->node_id, NR_SLAB_RECLAIMABLE) <= pgdat->min_slab_pages)
                 return NODE_RECLAIM_FULL;
  
-       if (!pgdat_reclaimable(pgdat))
-               return NODE_RECLAIM_FULL;
-
         /*
          * Do not scan if the allocation should not be delayed.
          */
@@ -3812,7 +3858,13 @@ int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order)
   */
  int page_evictable(struct page *page)
  {
-       return !mapping_unevictable(page_mapping(page)) && !PageMlocked(page);
+       int ret;
+
+       /* Prevent address_space of inode and swap cache from being freed */
+       rcu_read_lock();
+       ret = !mapping_unevictable(page_mapping(page)) && !PageMlocked(page);
+       rcu_read_unlock();
+       return ret;
  }
  
  #ifdef CONFIG_SHMEM