android-x86/kernel.git: mm/vmscan.c
index 76fda22..4e5846b 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -234,12 +234,39 @@ bool pgdat_reclaimable(struct pglist_data *pgdat)
                pgdat_reclaimable_pages(pgdat) * 6;
 }
 
-unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru)
+/**
+ * lruvec_lru_size -  Returns the number of pages on the given LRU list.
+ * @lruvec: lru vector
+ * @lru: lru to use
+ * @zone_idx: zones to consider (use MAX_NR_ZONES for the whole LRU list)
+ */
+unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru, int zone_idx)
 {
+       unsigned long lru_size;
+       int zid;
+
        if (!mem_cgroup_disabled())
-               return mem_cgroup_get_lru_size(lruvec, lru);
+               lru_size = mem_cgroup_get_lru_size(lruvec, lru);
+       else
+               lru_size = node_page_state(lruvec_pgdat(lruvec), NR_LRU_BASE + lru);
+
+       for (zid = zone_idx + 1; zid < MAX_NR_ZONES; zid++) {
+               struct zone *zone = &lruvec_pgdat(lruvec)->node_zones[zid];
+               unsigned long size;
+
+               if (!managed_zone(zone))
+                       continue;
+
+               if (!mem_cgroup_disabled())
+                       size = mem_cgroup_get_zone_lru_size(lruvec, lru, zid);
+               else
+                       size = zone_page_state(&lruvec_pgdat(lruvec)->node_zones[zid],
+                                      NR_ZONE_LRU_BASE + lru);
+               lru_size -= min(size, lru_size);
+       }
+
+       return lru_size;
 
-       return node_page_state(lruvec_pgdat(lruvec), NR_LRU_BASE + lru);
 }
 
 /*
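
The new zone_idx argument makes lruvec_lru_size() report only the pages a zone-constrained caller can actually use: it starts from the whole-LRU count and subtracts the contribution of every zone above zone_idx. Below is a minimal userspace model of that subtraction (the per-zone counts and the model_* names are invented for illustration); passing MAX_NR_ZONES keeps the whole list, a lower index excludes the higher zones.

#include <stdio.h>

#define MAX_NR_ZONES 4                  /* assumed zone count for the demo */

/* made-up per-zone LRU page counts: DMA, DMA32, Normal, Movable */
static unsigned long zone_lru[MAX_NR_ZONES] = { 100, 300, 5000, 0 };

static unsigned long model_lru_size(int zone_idx)
{
        unsigned long lru_size = 0;
        unsigned long size;
        int zid;

        for (zid = 0; zid < MAX_NR_ZONES; zid++)
                lru_size += zone_lru[zid];      /* whole-LRU count */

        /* drop pages sitting in zones the caller cannot allocate from */
        for (zid = zone_idx + 1; zid < MAX_NR_ZONES; zid++) {
                size = zone_lru[zid];
                lru_size -= (size < lru_size) ? size : lru_size;
        }
        return lru_size;
}

int main(void)
{
        printf("whole LRU:            %lu\n", model_lru_size(MAX_NR_ZONES));
        printf("eligible up to DMA32: %lu\n", model_lru_size(1));
        return 0;
}
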
@@ -268,10 +295,13 @@ EXPORT_SYMBOL(register_shrinker);
  */
 void unregister_shrinker(struct shrinker *shrinker)
 {
+       if (!shrinker->nr_deferred)
+               return;
        down_write(&shrinker_rwsem);
        list_del(&shrinker->list);
        up_write(&shrinker_rwsem);
        kfree(shrinker->nr_deferred);
+       shrinker->nr_deferred = NULL;
 }
 EXPORT_SYMBOL(unregister_shrinker);
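
With the NULL check and the reset of nr_deferred, unregister_shrinker() becomes safe to call for a shrinker that was never successfully registered, and a second call turns into a no-op. A toy userspace model of that idempotent-teardown pattern (struct toy_shrinker and the toy_* helpers are inventions for the demo, not kernel symbols):

#include <stdio.h>
#include <stdlib.h>

/* toy stand-in for struct shrinker: only the field the check cares about */
struct toy_shrinker {
        long *nr_deferred;
};

static int toy_register(struct toy_shrinker *s)
{
        s->nr_deferred = calloc(1, sizeof(*s->nr_deferred));
        return s->nr_deferred ? 0 : -1;
}

static void toy_unregister(struct toy_shrinker *s)
{
        if (!s->nr_deferred)            /* never registered, or already torn down */
                return;
        free(s->nr_deferred);
        s->nr_deferred = NULL;          /* make a second call a no-op */
}

int main(void)
{
        struct toy_shrinker s = { NULL };

        toy_unregister(&s);             /* safe before registration */
        if (toy_register(&s))
                return 1;
        toy_unregister(&s);
        toy_unregister(&s);             /* safe to call twice */
        puts("idempotent teardown ok");
        return 0;
}
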
 
@@ -291,6 +321,7 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
        int nid = shrinkctl->nid;
        long batch_size = shrinker->batch ? shrinker->batch
                                          : SHRINK_BATCH;
+       long scanned = 0, next_deferred;
 
        freeable = shrinker->count_objects(shrinker, shrinkctl);
        if (freeable == 0)
@@ -312,7 +343,9 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
                pr_err("shrink_slab: %pF negative objects to delete nr=%ld\n",
                       shrinker->scan_objects, total_scan);
                total_scan = freeable;
-       }
+               next_deferred = nr;
+       } else
+               next_deferred = total_scan;
 
        /*
         * We need to avoid excessive windup on filesystem shrinkers
@@ -369,17 +402,22 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
 
                count_vm_events(SLABS_SCANNED, nr_to_scan);
                total_scan -= nr_to_scan;
+               scanned += nr_to_scan;
 
                cond_resched();
        }
 
+       if (next_deferred >= scanned)
+               next_deferred -= scanned;
+       else
+               next_deferred = 0;
        /*
         * move the unused scan count back into the shrinker in a
         * manner that handles concurrent updates. If we exhausted the
         * scan, there is no need to do an update.
         */
-       if (total_scan > 0)
-               new_nr = atomic_long_add_return(total_scan,
+       if (next_deferred > 0)
+               new_nr = atomic_long_add_return(next_deferred,
                                                &shrinker->nr_deferred[nid]);
        else
                new_nr = atomic_long_read(&shrinker->nr_deferred[nid]);
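
next_deferred is the portion of the requested work that was not actually scanned; it is what gets pushed back into shrinker->nr_deferred[nid], instead of the raw total_scan. A simplified userspace model of that bookkeeping (it ignores the windup clamping and the batch loop; model_shrink and this nr_deferred are stand-ins, not the kernel's symbols):

#include <stdio.h>

static long nr_deferred;        /* models shrinker->nr_deferred[nid] */

static void model_shrink(long total_scan, long scanned)
{
        long next_deferred = total_scan;

        if (next_deferred >= scanned)
                next_deferred -= scanned;
        else
                next_deferred = 0;

        if (next_deferred > 0)
                nr_deferred += next_deferred;   /* carry leftover work forward */

        printf("scanned %ld of %ld, deferred now %ld\n",
               scanned, total_scan, nr_deferred);
}

int main(void)
{
        model_shrink(128, 128);         /* fully scanned: nothing carried over */
        model_shrink(512, 128);         /* stopped early: 384 objects deferred */
        return 0;
}
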
@@ -1336,6 +1374,7 @@ int __isolate_lru_page(struct page *page, isolate_mode_t mode)
 
                if (PageDirty(page)) {
                        struct address_space *mapping;
+                       bool migrate_dirty;
 
                        /* ISOLATE_CLEAN means only clean pages */
                        if (mode & ISOLATE_CLEAN)
@@ -1344,10 +1383,19 @@ int __isolate_lru_page(struct page *page, isolate_mode_t mode)
                        /*
                         * Only pages without mappings or that have a
                         * ->migratepage callback are possible to migrate
-                        * without blocking
+                        * without blocking. However, we can be racing with
+                        * truncation so it's necessary to lock the page
+                        * to stabilise the mapping as truncation holds
+                        * the page lock until after the page is removed
+                        * from the page cache.
                         */
+                       if (!trylock_page(page))
+                               return ret;
+
                        mapping = page_mapping(page);
-                       if (mapping && !mapping->a_ops->migratepage)
+                       migrate_dirty = !mapping || mapping->a_ops->migratepage;
+                       unlock_page(page);
+                       if (!migrate_dirty)
                                return ret;
                }
        }
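
The page lock is what keeps page->mapping stable against truncation, so the decision whether a dirty page is migratable is now made under trylock. A tiny pthread-based sketch of the same "trylock, read under the lock, decide" pattern (struct object, can_migrate_dirty and the NULL-mapping shortcut are all inventions for the demo; the real check also accepts mappings whose a_ops provide ->migratepage):

#include <pthread.h>
#include <stdbool.h>
#include <stddef.h>

struct object {
        pthread_mutex_t lock;
        void *mapping;          /* cleared by a concurrent "truncation" path */
};

/* Reader: only trust ->mapping while holding the lock, as the patch does. */
static bool can_migrate_dirty(struct object *obj)
{
        bool ok;

        if (pthread_mutex_trylock(&obj->lock))
                return false;   /* contended: give up, like trylock_page() */
        ok = (obj->mapping == NULL);    /* stands in for the ->migratepage check */
        pthread_mutex_unlock(&obj->lock);
        return ok;
}

int main(void)
{
        struct object obj = { PTHREAD_MUTEX_INITIALIZER, NULL };

        return can_migrate_dirty(&obj) ? 0 : 1;
}
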
@@ -1374,8 +1422,7 @@ int __isolate_lru_page(struct page *page, isolate_mode_t mode)
  * be complete before mem_cgroup_update_lru_size due to a sanity check.
  */
 static __always_inline void update_lru_sizes(struct lruvec *lruvec,
-                       enum lru_list lru, unsigned long *nr_zone_taken,
-                       unsigned long nr_taken)
+                       enum lru_list lru, unsigned long *nr_zone_taken)
 {
        int zid;
 
@@ -1384,11 +1431,11 @@ static __always_inline void update_lru_sizes(struct lruvec *lruvec,
                        continue;
 
                __update_lru_size(lruvec, lru, zid, -nr_zone_taken[zid]);
-       }
-
 #ifdef CONFIG_MEMCG
-       mem_cgroup_update_lru_size(lruvec, lru, -nr_taken);
+               mem_cgroup_update_lru_size(lruvec, lru, zid, -nr_zone_taken[zid]);
 #endif
+       }
+
 }
 
 /*
@@ -1493,7 +1540,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
        *nr_scanned = scan;
        trace_mm_vmscan_lru_isolate(sc->reclaim_idx, sc->order, nr_to_scan, scan,
                                    nr_taken, mode, is_file_lru(lru));
-       update_lru_sizes(lruvec, lru, nr_zone_taken, nr_taken);
+       update_lru_sizes(lruvec, lru, nr_zone_taken);
        return nr_taken;
 }
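
update_lru_sizes() now applies the same per-zone deltas to both the node-level counters and, when CONFIG_MEMCG is enabled, the memcg counters, instead of adjusting the memcg side with one node-wide total. A userspace model of why that keeps the two views consistent (the arrays and model_update_lru_sizes are invented for illustration):

#include <stdio.h>

#define MAX_NR_ZONES 4

static long node_lru[MAX_NR_ZONES]  = { 10, 20, 30, 0 };
static long memcg_lru[MAX_NR_ZONES] = { 10, 20, 30, 0 };

static void model_update_lru_sizes(const unsigned long *nr_zone_taken)
{
        int zid;

        for (zid = 0; zid < MAX_NR_ZONES; zid++) {
                if (!nr_zone_taken[zid])
                        continue;
                node_lru[zid]  -= nr_zone_taken[zid];
                memcg_lru[zid] -= nr_zone_taken[zid];   /* same delta, same zone */
        }
}

int main(void)
{
        unsigned long taken[MAX_NR_ZONES] = { 0, 5, 12, 0 };
        int zid;

        model_update_lru_sizes(taken);
        for (zid = 0; zid < MAX_NR_ZONES; zid++)
                printf("zid %d: node %ld memcg %ld\n",
                       zid, node_lru[zid], memcg_lru[zid]);
        return 0;
}
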
 
@@ -2011,11 +2058,10 @@ static bool inactive_list_is_low(struct lruvec *lruvec, bool file,
                                                struct scan_control *sc)
 {
        unsigned long inactive_ratio;
-       unsigned long inactive;
-       unsigned long active;
+       unsigned long inactive, active;
+       enum lru_list inactive_lru = file * LRU_FILE;
+       enum lru_list active_lru = file * LRU_FILE + LRU_ACTIVE;
        unsigned long gb;
-       struct pglist_data *pgdat = lruvec_pgdat(lruvec);
-       int zid;
 
        /*
         * If we don't have swap space, anonymous page deactivation
@@ -2024,29 +2070,8 @@ static bool inactive_list_is_low(struct lruvec *lruvec, bool file,
        if (!file && !total_swap_pages)
                return false;
 
-       inactive = lruvec_lru_size(lruvec, file * LRU_FILE);
-       active = lruvec_lru_size(lruvec, file * LRU_FILE + LRU_ACTIVE);
-
-       /*
-        * For zone-constrained allocations, it is necessary to check if
-        * deactivations are required for lowmem to be reclaimed. This
-        * calculates the inactive/active pages available in eligible zones.
-        */
-       for (zid = sc->reclaim_idx + 1; zid < MAX_NR_ZONES; zid++) {
-               struct zone *zone = &pgdat->node_zones[zid];
-               unsigned long inactive_zone, active_zone;
-
-               if (!managed_zone(zone))
-                       continue;
-
-               inactive_zone = zone_page_state(zone,
-                               NR_ZONE_LRU_BASE + (file * LRU_FILE));
-               active_zone = zone_page_state(zone,
-                               NR_ZONE_LRU_BASE + (file * LRU_FILE) + LRU_ACTIVE);
-
-               inactive -= min(inactive, inactive_zone);
-               active -= min(active, active_zone);
-       }
+       inactive = lruvec_lru_size(lruvec, inactive_lru, sc->reclaim_idx);
+       active = lruvec_lru_size(lruvec, active_lru, sc->reclaim_idx);
 
        gb = (inactive + active) >> (30 - PAGE_SHIFT);
        if (gb)
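
The rest of inactive_list_is_low(), the ratio derived from gb, is outside this hunk; the snippet below only sanity-checks the pages-to-GiB shift, assuming 4 KiB pages (PAGE_SHIFT == 12) and made-up LRU counts.

#include <stdio.h>

#define PAGE_SHIFT 12   /* assumption for this example */

int main(void)
{
        unsigned long inactive = 150000, active = 200000;       /* made-up counts */
        unsigned long gb = (inactive + active) >> (30 - PAGE_SHIFT);

        printf("%lu eligible LRU pages is about %lu GiB\n",
               inactive + active, gb);
        return 0;
}
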
@@ -2193,7 +2218,7 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,
         * system is under heavy pressure.
         */
        if (!inactive_list_is_low(lruvec, true, sc) &&
-           lruvec_lru_size(lruvec, LRU_INACTIVE_FILE) >> sc->priority) {
+           lruvec_lru_size(lruvec, LRU_INACTIVE_FILE, sc->reclaim_idx) >> sc->priority) {
                scan_balance = SCAN_FILE;
                goto out;
        }
@@ -2219,10 +2244,10 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,
         * anon in [0], file in [1]
         */
 
-       anon  = lruvec_lru_size(lruvec, LRU_ACTIVE_ANON) +
-               lruvec_lru_size(lruvec, LRU_INACTIVE_ANON);
-       file  = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE) +
-               lruvec_lru_size(lruvec, LRU_INACTIVE_FILE);
+       anon  = lruvec_lru_size(lruvec, LRU_ACTIVE_ANON, MAX_NR_ZONES) +
+               lruvec_lru_size(lruvec, LRU_INACTIVE_ANON, MAX_NR_ZONES);
+       file  = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE, MAX_NR_ZONES) +
+               lruvec_lru_size(lruvec, LRU_INACTIVE_FILE, MAX_NR_ZONES);
 
        spin_lock_irq(&pgdat->lru_lock);
        if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) {
@@ -2260,7 +2285,7 @@ out:
                        unsigned long size;
                        unsigned long scan;
 
-                       size = lruvec_lru_size(lruvec, lru);
+                       size = lruvec_lru_size(lruvec, lru, sc->reclaim_idx);
                        scan = size >> sc->priority;
 
                        if (!scan && pass && force_scan)
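
size >> sc->priority is the per-pass scan window: DEF_PRIORITY is 12 in this kernel, so the first pass looks at 1/4096 of the eligible LRU and every lower priority doubles that. A throwaway demo of the progression (the 1M-page LRU size is invented):

#include <stdio.h>

#define DEF_PRIORITY 12 /* matches the kernel's default */

int main(void)
{
        unsigned long size = 1 << 20;   /* assumed: 1M eligible pages */
        int priority;

        for (priority = DEF_PRIORITY; priority >= 0; priority--)
                printf("priority %2d -> scan %lu pages\n",
                       priority, size >> priority);
        return 0;
}
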
@@ -2354,6 +2379,8 @@ static void shrink_node_memcg(struct pglist_data *pgdat, struct mem_cgroup *memc
                        }
                }
 
+               cond_resched();
+
                if (nr_reclaimed < nr_to_reclaim || scan_adjusted)
                        continue;
 
@@ -2592,6 +2619,15 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
        } while (should_continue_reclaim(pgdat, sc->nr_reclaimed - nr_reclaimed,
                                         sc->nr_scanned - nr_scanned, sc));
 
+       /*
+        * Kswapd gives up on balancing particular nodes after too
+        * many failures to reclaim anything from them and goes to
+        * sleep. On reclaim progress, reset the failure counter. A
+        * successful direct reclaim run will revive a dormant kswapd.
+        */
+       if (reclaimable)
+               pgdat->kswapd_failures = 0;
+
        return reclaimable;
 }
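
Together with the later hunks in allow_direct_reclaim(), prepare_kswapd_sleep(), balance_pgdat() and wakeup_kswapd(), this implements a per-node back-off: after MAX_RECLAIM_RETRIES fruitless balancing runs (16 in this kernel) the node is treated as hopeless and left to direct reclaim, and any reclaim progress resets the counter. A userspace model of the counter's life cycle (the helper names are stand-ins, not kernel symbols):

#include <stdbool.h>
#include <stdio.h>

#define MAX_RECLAIM_RETRIES 16  /* matches the kernel constant */

static int kswapd_failures;

static bool kswapd_should_run(void)
{
        return kswapd_failures < MAX_RECLAIM_RETRIES;
}

static void balance_attempt(unsigned long nr_reclaimed)
{
        if (nr_reclaimed)
                kswapd_failures = 0;    /* progress revives the node */
        else
                kswapd_failures++;
}

int main(void)
{
        int i;

        for (i = 0; i < MAX_RECLAIM_RETRIES; i++)
                balance_attempt(0);
        printf("wake kswapd? %s\n", kswapd_should_run() ? "yes" : "no");
        balance_attempt(512);           /* a successful direct reclaim run */
        printf("wake kswapd? %s\n", kswapd_should_run() ? "yes" : "no");
        return 0;
}
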
 
@@ -2666,10 +2702,6 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
                                                 GFP_KERNEL | __GFP_HARDWALL))
                                continue;
 
-                       if (sc->priority != DEF_PRIORITY &&
-                           !pgdat_reclaimable(zone->zone_pgdat))
-                               continue;       /* Let kswapd poll it */
-
                        /*
                         * If we already have plenty of memory free for
                         * compaction in this zone, don't free any more.
@@ -2806,7 +2838,7 @@ retry:
        return 0;
 }
 
-static bool pfmemalloc_watermark_ok(pg_data_t *pgdat)
+static bool allow_direct_reclaim(pg_data_t *pgdat)
 {
        struct zone *zone;
        unsigned long pfmemalloc_reserve = 0;
@@ -2814,10 +2846,15 @@ static bool pfmemalloc_watermark_ok(pg_data_t *pgdat)
        int i;
        bool wmark_ok;
 
+       if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES)
+               return true;
+
        for (i = 0; i <= ZONE_NORMAL; i++) {
                zone = &pgdat->node_zones[i];
-               if (!managed_zone(zone) ||
-                   pgdat_reclaimable_pages(pgdat) == 0)
+               if (!managed_zone(zone))
+                       continue;
+
+               if (!zone_reclaimable_pages(zone))
                        continue;
 
                pfmemalloc_reserve += min_wmark_pages(zone);
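
allow_direct_reclaim() now bails out early for a node kswapd has given up on, and it skips zones with nothing reclaimable when summing the min watermarks. The model below mirrors that shape; the final free-pages-versus-half-the-reserve comparison is not part of this hunk and is only an assumption here, as are all the numbers and the ZONE_NORMAL index.

#include <stdbool.h>
#include <stdio.h>

#define ZONE_NORMAL 2                   /* assumed index for the demo */
#define MAX_RECLAIM_RETRIES 16

/* assumed per-zone data for the demo */
static unsigned long min_wmark[]   = { 128, 1024, 4096 };
static unsigned long reclaimable[] = { 0, 50000, 200000 };

static bool model_allow_direct_reclaim(unsigned long free_pages,
                                       int kswapd_failures)
{
        unsigned long pfmemalloc_reserve = 0;
        int i;

        /* A hopeless node never throttles direct reclaimers. */
        if (kswapd_failures >= MAX_RECLAIM_RETRIES)
                return true;

        for (i = 0; i <= ZONE_NORMAL; i++) {
                if (!reclaimable[i])    /* skip zones with empty LRUs */
                        continue;
                pfmemalloc_reserve += min_wmark[i];
        }

        /*
         * Assumption: the unshown remainder of the function compares the
         * free pages in those zones against a share of this reserve.
         */
        return free_pages > pfmemalloc_reserve / 2;
}

int main(void)
{
        printf("throttle bypass: %d\n", model_allow_direct_reclaim(1000, 0));
        printf("hopeless node:   %d\n",
               model_allow_direct_reclaim(1000, MAX_RECLAIM_RETRIES));
        return 0;
}
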
@@ -2894,7 +2931,7 @@ static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
 
                /* Throttle based on the first usable node */
                pgdat = zone->zone_pgdat;
-               if (pfmemalloc_watermark_ok(pgdat))
+               if (allow_direct_reclaim(pgdat))
                        goto out;
                break;
        }
@@ -2916,14 +2953,14 @@ static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
         */
        if (!(gfp_mask & __GFP_FS)) {
                wait_event_interruptible_timeout(pgdat->pfmemalloc_wait,
-                       pfmemalloc_watermark_ok(pgdat), HZ);
+                       allow_direct_reclaim(pgdat), HZ);
 
                goto check_pending;
        }
 
        /* Throttle until kswapd wakes the process */
        wait_event_killable(zone->zone_pgdat->pfmemalloc_wait,
-               pfmemalloc_watermark_ok(pgdat));
+               allow_direct_reclaim(pgdat));
 
 check_pending:
        if (fatal_signal_pending(current))
@@ -2939,7 +2976,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
        unsigned long nr_reclaimed;
        struct scan_control sc = {
                .nr_to_reclaim = SWAP_CLUSTER_MAX,
-               .gfp_mask = (gfp_mask = memalloc_noio_flags(gfp_mask)),
+               .gfp_mask = memalloc_noio_flags(gfp_mask),
                .reclaim_idx = gfp_zone(gfp_mask),
                .order = order,
                .nodemask = nodemask,
@@ -2954,12 +2991,12 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
         * 1 is returned so that the page allocator does not OOM kill at this
         * point.
         */
-       if (throttle_direct_reclaim(gfp_mask, zonelist, nodemask))
+       if (throttle_direct_reclaim(sc.gfp_mask, zonelist, nodemask))
                return 1;
 
        trace_mm_vmscan_direct_reclaim_begin(order,
                                sc.may_writepage,
-                               gfp_mask,
+                               sc.gfp_mask,
                                sc.reclaim_idx);
 
        nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
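
The scan_control now carries the noio-adjusted mask, and the throttle check and tracepoint read sc.gfp_mask rather than the raw caller mask; the old initializer also wrote the converted value back into the caller-visible gfp_mask as a side effect. A tiny sketch of the "convert once, then only read the converted copy" idea (the flag bits are made up for the demo and demo_noio_flags models memalloc_noio_flags very loosely):

#include <stdio.h>

/* made-up flag bits for the demo */
#define DEMO_GFP_IO 0x1u
#define DEMO_GFP_FS 0x2u

/* loose stand-in for memalloc_noio_flags(): strip I/O-related bits */
static unsigned int demo_noio_flags(unsigned int gfp_mask)
{
        return gfp_mask & ~(DEMO_GFP_IO | DEMO_GFP_FS);
}

int main(void)
{
        unsigned int gfp_mask = DEMO_GFP_IO | DEMO_GFP_FS;      /* caller's mask */
        unsigned int sc_gfp_mask = demo_noio_flags(gfp_mask);   /* convert once */

        /* every later decision reads the converted copy, never the raw mask */
        printf("caller 0x%x, reclaim uses 0x%x\n", gfp_mask, sc_gfp_mask);
        return 0;
}
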
@@ -3086,6 +3123,7 @@ static bool zone_balanced(struct zone *zone, int order, int classzone_idx)
         */
        clear_bit(PGDAT_CONGESTED, &zone->zone_pgdat->flags);
        clear_bit(PGDAT_DIRTY, &zone->zone_pgdat->flags);
+       clear_bit(PGDAT_WRITEBACK, &zone->zone_pgdat->flags);
 
        return true;
 }
@@ -3102,7 +3140,7 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, int classzone_idx)
 
        /*
         * The throttled processes are normally woken up in balance_pgdat() as
-        * soon as pfmemalloc_watermark_ok() is true. But there is a potential
+        * soon as allow_direct_reclaim() is true. But there is a potential
         * race between when kswapd checks the watermarks and a process gets
         * throttled. There is also a potential race if processes get
         * throttled, kswapd wakes, a large process exits thereby balancing the
@@ -3116,6 +3154,10 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, int classzone_idx)
        if (waitqueue_active(&pgdat->pfmemalloc_wait))
                wake_up_all(&pgdat->pfmemalloc_wait);
 
+       /* Hopeless node, leave it to direct reclaim */
+       if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES)
+               return true;
+
        for (i = 0; i <= classzone_idx; i++) {
                struct zone *zone = pgdat->node_zones + i;
 
@@ -3202,9 +3244,9 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
        count_vm_event(PAGEOUTRUN);
 
        do {
+               unsigned long nr_reclaimed = sc.nr_reclaimed;
                bool raise_priority = true;
 
-               sc.nr_reclaimed = 0;
                sc.reclaim_idx = classzone_idx;
 
                /*
@@ -3259,7 +3301,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
                 * If we're getting trouble reclaiming, start doing writepage
                 * even in laptop mode.
                 */
-               if (sc.priority < DEF_PRIORITY - 2 || !pgdat_reclaimable(pgdat))
+               if (sc.priority < DEF_PRIORITY - 2)
                        sc.may_writepage = 1;
 
                /* Call soft limit reclaim before calling shrink_node. */
@@ -3283,7 +3325,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
                 * able to safely make forward progress. Wake them
                 */
                if (waitqueue_active(&pgdat->pfmemalloc_wait) &&
-                               pfmemalloc_watermark_ok(pgdat))
+                               allow_direct_reclaim(pgdat))
                        wake_up_all(&pgdat->pfmemalloc_wait);
 
                /* Check if kswapd should be suspending */
@@ -3294,10 +3336,14 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
                 * Raise priority if scanning rate is too low or there was no
                 * progress in reclaiming pages
                 */
-               if (raise_priority || !sc.nr_reclaimed)
+               nr_reclaimed = sc.nr_reclaimed - nr_reclaimed;
+               if (raise_priority || !nr_reclaimed)
                        sc.priority--;
        } while (sc.priority >= 1);
 
+       if (!sc.nr_reclaimed)
+               pgdat->kswapd_failures++;
+
 out:
        /*
         * Return the order kswapd stopped reclaiming at as
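
sc.nr_reclaimed is no longer zeroed on each priority iteration: every pass snapshots it and raises the priority based on its own delta, while the accumulated total at the end decides whether the whole run counts as a kswapd failure. A small model of that accounting (the per-pass reclaim numbers are invented):

#include <stdio.h>

int main(void)
{
        /* assumed per-priority reclaim results for one balancing run */
        unsigned long per_pass[] = { 0, 0, 32, 128 };
        unsigned long sc_nr_reclaimed = 0;
        unsigned int kswapd_failures = 0;
        unsigned int i;

        for (i = 0; i < sizeof(per_pass) / sizeof(per_pass[0]); i++) {
                unsigned long nr_reclaimed = sc_nr_reclaimed;   /* snapshot */

                sc_nr_reclaimed += per_pass[i];                 /* work done this pass */
                nr_reclaimed = sc_nr_reclaimed - nr_reclaimed;  /* this pass's delta */
                printf("pass %u reclaimed %lu (total %lu)\n",
                       i, nr_reclaimed, sc_nr_reclaimed);
        }

        if (!sc_nr_reclaimed)
                kswapd_failures++;
        printf("kswapd_failures now %u\n", kswapd_failures);
        return 0;
}
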
@@ -3497,6 +3543,10 @@ void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx)
        if (!waitqueue_active(&pgdat->kswapd_wait))
                return;
 
+       /* Hopeless node, leave it to direct reclaim */
+       if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES)
+               return;
+
        /* Only wake kswapd if all zones are unbalanced */
        for (z = 0; z <= classzone_idx; z++) {
                zone = pgdat->node_zones + z;
@@ -3710,16 +3760,15 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in
        const unsigned long nr_pages = 1 << order;
        struct task_struct *p = current;
        struct reclaim_state reclaim_state;
-       int classzone_idx = gfp_zone(gfp_mask);
        struct scan_control sc = {
                .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
-               .gfp_mask = (gfp_mask = memalloc_noio_flags(gfp_mask)),
+               .gfp_mask = memalloc_noio_flags(gfp_mask),
                .order = order,
                .priority = NODE_RECLAIM_PRIORITY,
                .may_writepage = !!(node_reclaim_mode & RECLAIM_WRITE),
                .may_unmap = !!(node_reclaim_mode & RECLAIM_UNMAP),
                .may_swap = 1,
-               .reclaim_idx = classzone_idx,
+               .reclaim_idx = gfp_zone(gfp_mask),
        };
 
        cond_resched();
@@ -3729,7 +3778,7 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in
         * and RECLAIM_UNMAP.
         */
        p->flags |= PF_MEMALLOC | PF_SWAPWRITE;
-       lockdep_set_current_reclaim_state(gfp_mask);
+       lockdep_set_current_reclaim_state(sc.gfp_mask);
        reclaim_state.reclaimed_slab = 0;
        p->reclaim_state = &reclaim_state;
 
@@ -3767,9 +3816,6 @@ int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order)
            sum_zone_node_page_state(pgdat->node_id, NR_SLAB_RECLAIMABLE) <= pgdat->min_slab_pages)
                return NODE_RECLAIM_FULL;
 
-       if (!pgdat_reclaimable(pgdat))
-               return NODE_RECLAIM_FULL;
-
        /*
         * Do not scan if the allocation should not be delayed.
         */
@@ -3812,7 +3858,13 @@ int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order)
  */
 int page_evictable(struct page *page)
 {
-       return !mapping_unevictable(page_mapping(page)) && !PageMlocked(page);
+       int ret;
+
+       /* Prevent address_space of inode and swap cache from being freed */
+       rcu_read_lock();
+       ret = !mapping_unevictable(page_mapping(page)) && !PageMlocked(page);
+       rcu_read_unlock();
+       return ret;
 }
 
 #ifdef CONFIG_SHMEM