Merge branch 'for-next/bti-user' into for-next/bti

[tomoyo/tomoyo-test1.git] / include / linux / mm.h
diff --git a/include/linux/mm.h b/include/linux/mm.h

index 9e5fce1..b61ca54 100644 (file)
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -27,6 +27,7 @@
  #include <linux/memremap.h>
  #include <linux/overflow.h>
  #include <linux/sizes.h>
+#include <linux/sched.h>
  
  struct mempolicy;
  struct anon_vma;
@@ -345,6 +346,20 @@ extern unsigned int kobjsize(const void *objp);
  /* Bits set in the VMA until the stack is in its final location */
  #define VM_STACK_INCOMPLETE_SETUP      (VM_RAND_READ | VM_SEQ_READ)
  
+/*
+ * VM_EXEC when the current task's personality has READ_IMPLIES_EXEC set,
+ * 0 otherwise.  Dereferences current, so this is evaluated at run time.
+ */
+#define TASK_EXEC ((current->personality & READ_IMPLIES_EXEC) ? VM_EXEC : 0)
+
+/*
+ * Common data flag combinations.  All three set VM_READ | VM_WRITE and
+ * VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC; they differ only in VM_EXEC:
+ *   VM_DATA_FLAGS_TSK_EXEC - VM_EXEC follows TASK_EXEC
+ *   VM_DATA_FLAGS_NON_EXEC - VM_EXEC is not set
+ *   VM_DATA_FLAGS_EXEC     - VM_EXEC is always set
+ */
+#define VM_DATA_FLAGS_TSK_EXEC (VM_READ | VM_WRITE | TASK_EXEC | \
+                                VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC)
+#define VM_DATA_FLAGS_NON_EXEC (VM_READ | VM_WRITE | VM_MAYREAD | \
+                                VM_MAYWRITE | VM_MAYEXEC)
+#define VM_DATA_FLAGS_EXEC     (VM_READ | VM_WRITE | VM_EXEC | \
+                                VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC)
+
+#ifndef VM_DATA_DEFAULT_FLAGS          /* arch can override this */
+#define VM_DATA_DEFAULT_FLAGS  VM_DATA_FLAGS_EXEC
+#endif
+
  #ifndef VM_STACK_DEFAULT_FLAGS         /* arch can override this */
  #define VM_STACK_DEFAULT_FLAGS VM_DATA_DEFAULT_FLAGS
  #endif
@@ -357,12 +372,18 @@ extern unsigned int kobjsize(const void *objp);
  
  #define VM_STACK_FLAGS (VM_STACK | VM_STACK_DEFAULT_FLAGS | VM_ACCOUNT)
  
+/* VMA basic access permission flags */
+#define VM_ACCESS_FLAGS (VM_READ | VM_WRITE | VM_EXEC)
+
+
  /*
   * Special vmas that are non-mergable, non-mlock()able.
- * Note: mm/huge_memory.c VM_NO_THP depends on this definition.
   */
  #define VM_SPECIAL (VM_IO | VM_DONTEXPAND | VM_PFNMAP | VM_MIXEDMAP)
  
+/* This mask prevents VMA from being scanned by khugepaged */
+#define VM_NO_KHUGEPAGED (VM_SPECIAL | VM_HUGETLB)
+
  /* This mask defines which mm->def_flags a process can inherit from its parent */
  #define VM_INIT_DEF_MASK       VM_NOHUGEPAGE
  
@@ -381,15 +402,75 @@ extern unsigned int kobjsize(const void *objp);
   */
  extern pgprot_t protection_map[16];
  
-#define FAULT_FLAG_WRITE       0x01    /* Fault was a write access */
-#define FAULT_FLAG_MKWRITE     0x02    /* Fault was mkwrite of existing pte */
-#define FAULT_FLAG_ALLOW_RETRY 0x04    /* Retry fault if blocking */
-#define FAULT_FLAG_RETRY_NOWAIT        0x08    /* Don't drop mmap_sem and wait when retrying */
-#define FAULT_FLAG_KILLABLE    0x10    /* The fault task is in SIGKILL killable region */
-#define FAULT_FLAG_TRIED       0x20    /* Second try */
-#define FAULT_FLAG_USER                0x40    /* The fault originated in userspace */
-#define FAULT_FLAG_REMOTE      0x80    /* faulting for non current tsk/mm */
-#define FAULT_FLAG_INSTRUCTION  0x100  /* The fault was during an instruction fetch */
+/**
+ * Fault flag definitions.
+ *
+ * @FAULT_FLAG_WRITE: Fault was a write fault.
+ * @FAULT_FLAG_MKWRITE: Fault was mkwrite of existing PTE.
+ * @FAULT_FLAG_ALLOW_RETRY: Allow to retry the fault if blocked.
+ * @FAULT_FLAG_RETRY_NOWAIT: Don't drop mmap_sem and wait when retrying.
+ * @FAULT_FLAG_KILLABLE: The fault task is in SIGKILL killable region.
+ * @FAULT_FLAG_TRIED: The fault has been tried once.
+ * @FAULT_FLAG_USER: The fault originated in userspace.
+ * @FAULT_FLAG_REMOTE: The fault is not for current task/mm.
+ * @FAULT_FLAG_INSTRUCTION: The fault was during an instruction fetch.
+ * @FAULT_FLAG_INTERRUPTIBLE: The fault can be interrupted by non-fatal signals.
+ *
+ * About @FAULT_FLAG_ALLOW_RETRY and @FAULT_FLAG_TRIED: we can specify
+ * whether we would allow page faults to retry by specifying these two
+ * fault flags correctly.  Currently there can be three legal combinations:
+ *
+ * (a) ALLOW_RETRY and !TRIED:  this means the page fault allows retry, and
+ *                              this is the first try
+ *
+ * (b) ALLOW_RETRY and TRIED:   this means the page fault allows retry, and
+ *                              we've already tried at least once
+ *
+ * (c) !ALLOW_RETRY and !TRIED: this means the page fault does not allow retry
+ *
+ * The unlisted combination (!ALLOW_RETRY && TRIED) is illegal and should never
+ * be used.  Note that page faults can be allowed to retry multiple times,
+ * in which case we'll have an initial fault with flags (a) then later on
+ * continuous faults with flags (b).  We should always try to detect pending
+ * signals before a retry to make sure the continuous page faults can still be
+ * interrupted if necessary.
+ */
+#define FAULT_FLAG_WRITE                       0x01
+#define FAULT_FLAG_MKWRITE                     0x02
+#define FAULT_FLAG_ALLOW_RETRY                 0x04
+#define FAULT_FLAG_RETRY_NOWAIT                        0x08
+#define FAULT_FLAG_KILLABLE                    0x10
+#define FAULT_FLAG_TRIED                       0x20
+#define FAULT_FLAG_USER                                0x40
+#define FAULT_FLAG_REMOTE                      0x80
+#define FAULT_FLAG_INSTRUCTION                 0x100
+#define FAULT_FLAG_INTERRUPTIBLE               0x200
+
+/*
+ * The default fault flags that should be used by most of the
+ * arch-specific page fault handlers.
+ */
+#define FAULT_FLAG_DEFAULT  (FAULT_FLAG_ALLOW_RETRY | \
+                            FAULT_FLAG_KILLABLE | \
+                            FAULT_FLAG_INTERRUPTIBLE)
+
+/**
+ * fault_flag_allow_retry_first - check ALLOW_RETRY the first time
+ * @flags: Fault flags.
+ *
+ * This is mostly used for places where we want to try to avoid taking
+ * the mmap_sem for too long a time when waiting for another condition
+ * to change, in which case we can be polite and release the
+ * mmap_sem in the first round to avoid potential starvation of other
+ * processes that would also want the mmap_sem.
+ *
+ * Return: true if the page fault allows retry and this is the first
+ * attempt of the fault handling (FAULT_FLAG_ALLOW_RETRY is set and
+ * FAULT_FLAG_TRIED is clear); false otherwise.
+ */
+static inline bool fault_flag_allow_retry_first(unsigned int flags)
+{
+       return (flags & FAULT_FLAG_ALLOW_RETRY) &&
+           (!(flags & FAULT_FLAG_TRIED));
+}
  
  #define FAULT_FLAG_TRACE \
         { FAULT_FLAG_WRITE,             "WRITE" }, \
@@ -400,7 +481,8 @@ extern pgprot_t protection_map[16];
         { FAULT_FLAG_TRIED,             "TRIED" }, \
         { FAULT_FLAG_USER,              "USER" }, \
         { FAULT_FLAG_REMOTE,            "REMOTE" }, \
-       { FAULT_FLAG_INSTRUCTION,       "INSTRUCTION" }
+       { FAULT_FLAG_INSTRUCTION,       "INSTRUCTION" }, \
+       { FAULT_FLAG_INTERRUPTIBLE,     "INTERRUPTIBLE" }
  
  /*
   * vm_fault is filled by the pagefault handler and passed to the vma's
@@ -544,6 +626,36 @@ static inline bool vma_is_anonymous(struct vm_area_struct *vma)
         return !vma->vm_ops;
  }
  
+/*
+ * Report whether @vma looks like a stack that is still being set up:
+ * it must have VM_GROWSDOWN or VM_GROWSUP set, and every bit of
+ * VM_STACK_INCOMPLETE_SETUP (set until the stack is in its final
+ * location) must still be present in vm_flags.
+ */
+static inline bool vma_is_temporary_stack(struct vm_area_struct *vma)
+{
+       int maybe_stack = vma->vm_flags & (VM_GROWSDOWN | VM_GROWSUP);
+
+       if (!maybe_stack)
+               return false;
+
+       if ((vma->vm_flags & VM_STACK_INCOMPLETE_SETUP) ==
+                                               VM_STACK_INCOMPLETE_SETUP)
+               return true;
+
+       return false;
+}
+
+/*
+ * Return true when @vma does not belong to the current task's mm:
+ * either current has no mm at all, or current->mm differs from
+ * vma->vm_mm.  Return false otherwise.
+ */
+static inline bool vma_is_foreign(struct vm_area_struct *vma)
+{
+       if (!current->mm)
+               return true;
+
+       if (current->mm != vma->vm_mm)
+               return true;
+
+       return false;
+}
+
+/*
+ * Return true if @vma has at least one of the VM_ACCESS_FLAGS bits
+ * set in vm_flags, false if none of them is set.
+ */
+static inline bool vma_is_accessible(struct vm_area_struct *vma)
+{
+       return vma->vm_flags & VM_ACCESS_FLAGS;
+}
+
  #ifdef CONFIG_SHMEM
  /*
   * The vma_is_shmem is not inline because it is used only by slow
@@ -773,6 +885,24 @@ static inline unsigned int compound_order(struct page *page)
         return page[1].compound_order;
  }
  
+static inline bool hpage_pincount_available(struct page *page)
+{
+       /*
+        * Can the page->hpage_pinned_refcount field be used? That field is in
+        * the 3rd page of the compound page, so the smallest (2-page) compound
+        * pages cannot support it.
+        *
+        * @page is first resolved with compound_head(); the result is true
+        * only for a compound page whose compound_order() is greater than 1.
+        */
+       page = compound_head(page);
+       return PageCompound(page) && compound_order(page) > 1;
+}
+
+/*
+ * Return the pin count read atomically from compound_pincount_ptr() of
+ * the head page that @page resolves to.  Only meaningful when
+ * hpage_pincount_available(@page) is true; that precondition is
+ * asserted with VM_BUG_ON_PAGE().
+ */
+static inline int compound_pincount(struct page *page)
+{
+       VM_BUG_ON_PAGE(!hpage_pincount_available(page), page);
+       page = compound_head(page);
+       return atomic_read(compound_pincount_ptr(page));
+}
+
  static inline void set_compound_order(struct page *page, unsigned int order)
  {
         page[1].compound_order = order;
@@ -1004,6 +1134,8 @@ static inline void get_page(struct page *page)
         page_ref_inc(page);
  }
  
+bool __must_check try_grab_page(struct page *page, unsigned int flags);
+
  static inline __must_check bool try_get_page(struct page *page)
  {
         page = compound_head(page);
@@ -1032,29 +1164,87 @@ static inline void put_page(struct page *page)
                 __put_page(page);
  }
  
-/**
- * unpin_user_page() - release a gup-pinned page
- * @page:            pointer to page to be released
+/*
+ * GUP_PIN_COUNTING_BIAS, and the associated functions that use it, overload
+ * the page's refcount so that two separate items are tracked: the original page
+ * reference count, and also a new count of how many pin_user_pages() calls were
+ * made against the page. ("gup-pinned" is another term for the latter).
+ *
+ * With this scheme, pin_user_pages() becomes special: such pages are marked as
+ * distinct from normal pages. As such, the unpin_user_page() call (and its
+ * variants) must be used in order to release gup-pinned pages.
   *
- * Pages that were pinned via pin_user_pages*() must be released via either
- * unpin_user_page(), or one of the unpin_user_pages*() routines. This is so
- * that eventually such pages can be separately tracked and uniquely handled. In
- * particular, interactions with RDMA and filesystems need special handling.
+ * Choice of value:
   *
- * unpin_user_page() and put_page() are not interchangeable, despite this early
- * implementation that makes them look the same. unpin_user_page() calls must
- * be perfectly matched up with pin*() calls.
+ * By making GUP_PIN_COUNTING_BIAS a power of two, debugging of page reference
+ * counts with respect to pin_user_pages() and unpin_user_page() becomes
+ * simpler, due to the fact that adding an even power of two to the page
+ * refcount has the effect of using only the upper N bits, for the code that
+ * counts up using the bias value. This means that the lower bits are left for
+ * the exclusive use of the original code that increments and decrements by one
+ * (or at least, by much smaller values than the bias value).
+ *
+ * Of course, once the lower bits overflow into the upper bits (and this is
+ * OK, because subtraction recovers the original values), then visual inspection
+ * no longer suffices to directly view the separate counts. However, for normal
+ * applications that don't have huge page reference counts, this won't be an
+ * issue.
+ *
+ * Locking: the lockless algorithm described in page_cache_get_speculative()
+ * and page_cache_gup_pin_speculative() provides safe operation for
+ * get_user_pages and page_mkclean and other calls that race to set up page
+ * table entries.
   */
-static inline void unpin_user_page(struct page *page)
-{
-       put_page(page);
-}
+#define GUP_PIN_COUNTING_BIAS (1U << 10)
  
+void unpin_user_page(struct page *page);
  void unpin_user_pages_dirty_lock(struct page **pages, unsigned long npages,
                                  bool make_dirty);
-
  void unpin_user_pages(struct page **pages, unsigned long npages);
  
+/**
+ * page_maybe_dma_pinned() - report if a page is pinned for DMA.
+ * @page:      pointer to page to be queried.
+ *
+ * This function checks if a page has been pinned via a call to
+ * pin_user_pages*().
+ *
+ * For non-huge pages, the return value is partially fuzzy: false is not fuzzy,
+ * because it means "definitely not pinned for DMA", but true means "probably
+ * pinned for DMA, but possibly a false positive due to having at least
+ * GUP_PIN_COUNTING_BIAS worth of normal page references".
+ *
+ * False positives are OK, because: a) it's unlikely for a page to get that many
+ * refcounts, and b) all the callers of this routine are expected to be able to
+ * deal gracefully with a false positive.
+ *
+ * For huge pages, the result will be exactly correct. That's because we have
+ * more tracking data available: the 3rd struct page in the compound page is
+ * used to track the pincount (instead of using the GUP_PIN_COUNTING_BIAS
+ * scheme).
+ *
+ * For more information, please see Documentation/vm/pin_user_pages.rst.
+ *
+ * Return: True, if it is likely that the page has been "dma-pinned".
+ *         False, if the page is definitely not dma-pinned.
+ */
+static inline bool page_maybe_dma_pinned(struct page *page)
+{
+       if (hpage_pincount_available(page))
+               return compound_pincount(page) > 0;
+
+       /*
+        * page_ref_count() is signed. If that refcount overflows, then
+        * page_ref_count() returns a negative value, and callers will avoid
+        * further incrementing the refcount.
+        *
+        * Here, for that overflow case, use the signed bit to count a little
+        * bit higher via unsigned math, and thus still get an accurate result.
+        */
+       return ((unsigned int)page_ref_count(compound_head(page))) >=
+               GUP_PIN_COUNTING_BIAS;
+}
+
  #if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP)
  #define SECTION_IN_PAGE_FLAGS
  #endif
@@ -1602,9 +1792,26 @@ extern unsigned long move_page_tables(struct vm_area_struct *vma,
                 unsigned long old_addr, struct vm_area_struct *new_vma,
                 unsigned long new_addr, unsigned long len,
                 bool need_rmap_locks);
+
+/*
+ * Flags used by change_protection().  For now we make it a bitmap so
+ * that we can pass in multiple flags just like parameters.  However
+ * for now all the callers only use one of the flags at the same
+ * time.
+ */
+/* Whether we should allow dirty bit accounting */
+#define  MM_CP_DIRTY_ACCT                  (1UL << 0)
+/* Whether this protection change is for NUMA hints */
+#define  MM_CP_PROT_NUMA                   (1UL << 1)
+/* Whether this change is for write protecting */
+#define  MM_CP_UFFD_WP                     (1UL << 2) /* do wp */
+#define  MM_CP_UFFD_WP_RESOLVE             (1UL << 3) /* Resolve wp */
+#define  MM_CP_UFFD_WP_ALL                 (MM_CP_UFFD_WP | \
+                                           MM_CP_UFFD_WP_RESOLVE)
+
  extern unsigned long change_protection(struct vm_area_struct *vma, unsigned long start,
                               unsigned long end, pgprot_t newprot,
-                             int dirty_accountable, int prot_numa);
+                             unsigned long cp_flags);
  extern int mprotect_fixup(struct vm_area_struct *vma,
                           struct vm_area_struct **pprev, unsigned long start,
                           unsigned long end, unsigned long newflags);
@@ -1723,6 +1930,18 @@ static inline void sync_mm_rss(struct mm_struct *mm)
  }
  #endif
  
+#ifndef CONFIG_ARCH_HAS_PTE_SPECIAL
+/*
+ * Fallbacks used when CONFIG_ARCH_HAS_PTE_SPECIAL is not defined:
+ * pte_special() always returns 0, and pte_mkspecial() returns its
+ * argument unchanged.
+ */
+static inline int pte_special(pte_t pte)
+{
+       return 0;
+}
+
+static inline pte_t pte_mkspecial(pte_t pte)
+{
+       return pte;
+}
+#endif
+
  #ifndef CONFIG_ARCH_HAS_PTE_DEVMAP
  static inline int pte_devmap(pte_t pte)
  {
@@ -2367,26 +2586,7 @@ struct vm_unmapped_area_info {
         unsigned long align_offset;
  };
  
-extern unsigned long unmapped_area(struct vm_unmapped_area_info *info);
-extern unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info);
-
-/*
- * Search for an unmapped address range.
- *
- * We are looking for a range that:
- * - does not intersect with any VMA;
- * - is contained within the [low_limit, high_limit) interval;
- * - is at least the desired size.
- * - satisfies (begin_addr & align_mask) == (align_offset & align_mask)
- */
-static inline unsigned long
-vm_unmapped_area(struct vm_unmapped_area_info *info)
-{
-       if (info->flags & VM_UNMAPPED_AREA_TOPDOWN)
-               return unmapped_area_topdown(info);
-       else
-               return unmapped_area(info);
-}
+extern unsigned long vm_unmapped_area(struct vm_unmapped_area_info *info);
  
  /* truncate.c */
  extern void truncate_inode_pages(struct address_space *, loff_t);
@@ -2522,6 +2722,8 @@ struct vm_area_struct *find_extend_vma(struct mm_struct *, unsigned long addr);
  int remap_pfn_range(struct vm_area_struct *, unsigned long addr,
                         unsigned long pfn, unsigned long size, pgprot_t);
  int vm_insert_page(struct vm_area_struct *, unsigned long addr, struct page *);
+int vm_insert_pages(struct vm_area_struct *vma, unsigned long addr,
+                       struct page **pages, unsigned long *num);
  int vm_map_pages(struct vm_area_struct *vma, struct page **pages,
                                 unsigned long num);
  int vm_map_pages_zero(struct vm_area_struct *vma, struct page **pages,
@@ -2718,6 +2920,10 @@ static inline bool debug_pagealloc_enabled_static(void)
  #if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_ARCH_HAS_SET_DIRECT_MAP)
  extern void __kernel_map_pages(struct page *page, int numpages, int enable);
  
+/*
+ * When called in DEBUG_PAGEALLOC context, the call should most likely be
+ * guarded by debug_pagealloc_enabled() or debug_pagealloc_enabled_static()
+ */
  static inline void
  kernel_map_pages(struct page *page, int numpages, int enable)
  {
@@ -2866,6 +3072,23 @@ extern long copy_huge_page_from_user(struct page *dst_page,
                                 const void __user *usr_src,
                                 unsigned int pages_per_huge_page,
                                 bool allow_pagefault);
+
+/**
+ * vma_is_special_huge - Are transhuge page-table entries considered special?
+ * @vma: Pointer to the struct vm_area_struct to consider
+ *
+ * Whether transhuge page-table entries are considered "special" following
+ * the definition in vm_normal_page().
+ *
+ * Return: true if transhuge page-table entries should be considered special,
+ * i.e. vma_is_dax() holds for @vma, or @vma has a non-NULL vm_file and
+ * VM_PFNMAP or VM_MIXEDMAP set in vm_flags; false otherwise.
+ */
+static inline bool vma_is_special_huge(const struct vm_area_struct *vma)
+{
+       return vma_is_dax(vma) || (vma->vm_file &&
+                                  (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP)));
+}
+
  #endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */
  
  #ifdef CONFIG_DEBUG_PAGEALLOC