+#ifdef CONFIG_MMU
+/*
+ * OOM Reaper kernel thread which tries to reap the memory used by the OOM
+ * victim (if that is possible) to help the OOM killer move on.
+ */
+static struct task_struct *oom_reaper_th;
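+/* The mm currently queued for reaping, if any; acts as a single-slot queue */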
+static struct mm_struct *mm_to_reap;
+static DECLARE_WAIT_QUEUE_HEAD(oom_reaper_wait);
+
+static bool __oom_reap_vmas(struct mm_struct *mm)
+{
+ struct mmu_gather tlb;
+ struct vm_area_struct *vma;
+ struct zap_details details = {.check_swap_entries = true,
+ .ignore_dirty = true};
+ bool ret = true;
+
+ /* We might have raced with the exit path */
+ if (!atomic_inc_not_zero(&mm->mm_users))
+ return true;
+
+ if (!down_read_trylock(&mm->mmap_sem)) {
+ ret = false;
+ goto out;
+ }
+
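+ /* Walk all VMAs and unmap what is safe to drop, batching the TLB flushes */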
+ tlb_gather_mmu(&tlb, mm, 0, -1);
+ for (vma = mm->mmap; vma; vma = vma->vm_next) {
+ if (is_vm_hugetlb_page(vma))
+ continue;
+
+ /*
+ * mlocked VMAs require explicit munlocking before unmap.
+ * Let's keep it simple here and skip such VMAs.
+ */
+ if (vma->vm_flags & VM_LOCKED)
+ continue;
+
+ /*
+ * Only anonymous pages have a good chance of being dropped
+ * without additional steps, which we cannot afford as we
+ * are OOM already.
+ *
+ * We do not even care about fs-backed pages because all of
+ * the reclaimable ones have already been reclaimed, and we
+ * do not want to block exit_mmap by keeping the mm refcount
+ * elevated without a good reason.
+ */
+ if (vma_is_anonymous(vma) || !(vma->vm_flags & VM_SHARED))
+ unmap_page_range(&tlb, vma, vma->vm_start, vma->vm_end,
+ &details);
+ }
+ tlb_finish_mmu(&tlb, 0, -1);
+ up_read(&mm->mmap_sem);
+out:
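+ /* Drop the mm_users reference taken above */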
+ mmput(mm);
+ return ret;
+}
+
+static void oom_reap_vmas(struct mm_struct *mm)
+{
+ int attempts = 0;
+
+ /* Retry the down_read_trylock(mmap_sem) a few times */
+ while (attempts++ < 10 && !__oom_reap_vmas(mm))
+ schedule_timeout_idle(HZ/10);
+
+ /* Drop the reference taken by wake_oom_reaper */
+ mmdrop(mm);
+}
+
+static int oom_reaper(void *unused)
+{
+ while (true) {
+ struct mm_struct *mm;
+
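+ /* Sleep, freezer-friendly, until an mm is queued for reaping */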
+ wait_event_freezable(oom_reaper_wait,
+ (mm = READ_ONCE(mm_to_reap)));
+ oom_reap_vmas(mm);
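+ /* Clear the slot so that the next OOM victim can be queued */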
+ WRITE_ONCE(mm_to_reap, NULL);
+ }
+
+ return 0;
+}
+
+static void wake_oom_reaper(struct mm_struct *mm)
+{
+ struct mm_struct *old_mm;
+
+ if (!oom_reaper_th)
+ return;
+
+ /*
+ * Pin the given mm. Use mm_count instead of mm_users because
+ * we do not want to delay the address space teardown.
+ */
+ atomic_inc(&mm->mm_count);
+
+ /*
+ * Make sure that only a single mm is ever queued for the reaper;
+ * reaping more than one at a time is not necessary, and as the
+ * operation can be disruptive it is better kept to the bare minimum.
+ */
+ old_mm = cmpxchg(&mm_to_reap, NULL, mm);
+ if (!old_mm)
+ wake_up(&oom_reaper_wait);
+ else
+ mmdrop(mm);
+}
+
+static int __init oom_init(void)
+{
+ oom_reaper_th = kthread_run(oom_reaper, NULL, "oom_reaper");
+ if (IS_ERR(oom_reaper_th)) {
+ pr_err("Unable to start OOM reaper %ld. Continuing regardless\n",
+ PTR_ERR(oom_reaper_th));
+ oom_reaper_th = NULL;
+ }
+ return 0;
+}
+subsys_initcall(oom_init)
+#else
+static void wake_oom_reaper(struct mm_struct *mm)
+{
+}
+#endif
+