diff --git a/mm/readahead.c b/mm/readahead.c
index 2fe72cd..3c9a8dd 100644
@@ -22,6 +22,7 @@
 #include <linux/mm_inline.h>
 #include <linux/blk-cgroup.h>
 #include <linux/fadvise.h>
+#include <linux/sched/mm.h>
 
 #include "internal.h"
 
@@ -113,94 +114,126 @@ int read_cache_pages(struct address_space *mapping, struct list_head *pages,
 
 EXPORT_SYMBOL(read_cache_pages);
 
-static int read_pages(struct address_space *mapping, struct file *filp,
-               struct list_head *pages, unsigned int nr_pages, gfp_t gfp)
+static void read_pages(struct readahead_control *rac, struct list_head *pages,
+               bool skip_page)
 {
+       const struct address_space_operations *aops = rac->mapping->a_ops;
+       struct page *page;
        struct blk_plug plug;
-       unsigned page_idx;
-       int ret;
+
+       if (!readahead_count(rac))
+               goto out;
 
        blk_start_plug(&plug);
 
-       if (mapping->a_ops->readpages) {
-               ret = mapping->a_ops->readpages(filp, mapping, pages, nr_pages);
+       if (aops->readahead) {
+               aops->readahead(rac);
+               /* Clean up the remaining pages */
+               while ((page = readahead_page(rac))) {
+                       unlock_page(page);
+                       put_page(page);
+               }
+       } else if (aops->readpages) {
+               aops->readpages(rac->file, rac->mapping, pages,
+                               readahead_count(rac));
                /* Clean up the remaining pages */
                put_pages_list(pages);
-               goto out;
-       }
-
-       for (page_idx = 0; page_idx < nr_pages; page_idx++) {
-               struct page *page = lru_to_page(pages);
-               list_del(&page->lru);
-               if (!add_to_page_cache_lru(page, mapping, page->index, gfp))
-                       mapping->a_ops->readpage(filp, page);
-               put_page(page);
+               rac->_index += rac->_nr_pages;
+               rac->_nr_pages = 0;
+       } else {
+               while ((page = readahead_page(rac))) {
+                       aops->readpage(rac->file, page);
+                       put_page(page);
+               }
        }
-       ret = 0;
 
-out:
        blk_finish_plug(&plug);
 
-       return ret;
+       BUG_ON(!list_empty(pages));
+       BUG_ON(readahead_count(rac));
+
+out:
+       if (skip_page)
+               rac->_index++;
 }
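
A minimal sketch of a filesystem-side ->readahead implementation that the new dispatch above would call, assuming a hypothetical helper myfs_read_page_async() that queues the read and leaves unlocking to the I/O completion path (myfs_readpage is likewise assumed); any pages the method does not consume stay in the readahead_control and are cleaned up by read_pages() itself:

#include <linux/fs.h>
#include <linux/pagemap.h>

/*
 * Hypothetical ->readahead: every page handed out by readahead_page()
 * is locked and already in the page cache; we drop our reference once
 * the read has been queued.
 */
static void myfs_readahead(struct readahead_control *rac)
{
	struct page *page;

	while ((page = readahead_page(rac))) {
		myfs_read_page_async(page);	/* assumed helper */
		put_page(page);
	}
}

static const struct address_space_operations myfs_aops = {
	.readpage	= myfs_readpage,	/* assumed elsewhere */
	.readahead	= myfs_readahead,
};
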
 
-/*
- * __do_page_cache_readahead() actually reads a chunk of disk.  It allocates
- * the pages first, then submits them for I/O. This avoids the very bad
- * behaviour which would occur if page allocations are causing VM writeback.
- * We really don't want to intermingle reads and writes like that.
+/**
+ * page_cache_readahead_unbounded - Start unchecked readahead.
+ * @mapping: File address space.
+ * @file: This instance of the open file; used for authentication.
+ * @index: First page index to read.
+ * @nr_to_read: The number of pages to read.
+ * @lookahead_size: Where to start the next readahead.
  *
- * Returns the number of pages requested, or the maximum amount of I/O allowed.
+ * This function is for filesystems to call when they want to start
+ * readahead beyond a file's stated i_size.  This is almost certainly
+ * not the function you want to call.  Use page_cache_async_readahead()
+ * or page_cache_sync_readahead() instead.
+ *
+ * Context: File is referenced by caller.  Mutexes may be held by caller.
+ * May sleep, but will not reenter filesystem to reclaim memory.
  */
-unsigned int __do_page_cache_readahead(struct address_space *mapping,
-               struct file *filp, pgoff_t offset, unsigned long nr_to_read,
+void page_cache_readahead_unbounded(struct address_space *mapping,
+               struct file *file, pgoff_t index, unsigned long nr_to_read,
                unsigned long lookahead_size)
 {
-       struct inode *inode = mapping->host;
-       struct page *page;
-       unsigned long end_index;        /* The last page we want to read */
        LIST_HEAD(page_pool);
-       int page_idx;
-       unsigned int nr_pages = 0;
-       loff_t isize = i_size_read(inode);
        gfp_t gfp_mask = readahead_gfp_mask(mapping);
+       struct readahead_control rac = {
+               .mapping = mapping,
+               .file = file,
+               ._index = index,
+       };
+       unsigned long i;
 
-       if (isize == 0)
-               goto out;
-
-       end_index = ((isize - 1) >> PAGE_SHIFT);
+       /*
+        * Partway through the readahead operation, we will have added
+        * locked pages to the page cache, but will not yet have submitted
+        * them for I/O.  Adding another page may need to allocate memory,
+        * which can trigger memory reclaim.  Telling the VM we're in
+        * the middle of a filesystem operation will cause it to not
+        * touch file-backed pages, preventing a deadlock.  Most (all?)
+        * filesystems already specify __GFP_NOFS in their mapping's
+        * gfp_mask, but let's be explicit here.
+        */
+       unsigned int nofs = memalloc_nofs_save();
 
        /*
         * Preallocate as many pages as we will need.
         */
-       for (page_idx = 0; page_idx < nr_to_read; page_idx++) {
-               pgoff_t page_offset = offset + page_idx;
+       for (i = 0; i < nr_to_read; i++) {
+               struct page *page = xa_load(&mapping->i_pages, index + i);
 
-               if (page_offset > end_index)
-                       break;
+               BUG_ON(index + i != rac._index + rac._nr_pages);
 
-               page = xa_load(&mapping->i_pages, page_offset);
                if (page && !xa_is_value(page)) {
                        /*
-                        * Page already present?  Kick off the current batch of
-                        * contiguous pages before continuing with the next
-                        * batch.
+                        * Page already present?  Kick off the current batch
+                        * of contiguous pages before continuing with the
+                        * next batch.  This page may be the one we would
+                        * have intended to mark as Readahead, but we don't
+                        * have a stable reference to this page, and it's
+                        * not worth getting one just for that.
                         */
-                       if (nr_pages)
-                               read_pages(mapping, filp, &page_pool, nr_pages,
-                                               gfp_mask);
-                       nr_pages = 0;
+                       read_pages(&rac, &page_pool, true);
                        continue;
                }
 
                page = __page_cache_alloc(gfp_mask);
                if (!page)
                        break;
-               page->index = page_offset;
-               list_add(&page->lru, &page_pool);
-               if (page_idx == nr_to_read - lookahead_size)
+               if (mapping->a_ops->readpages) {
+                       page->index = index + i;
+                       list_add(&page->lru, &page_pool);
+               } else if (add_to_page_cache_lru(page, mapping, index + i,
+                                       gfp_mask) < 0) {
+                       put_page(page);
+                       read_pages(&rac, &page_pool, true);
+                       continue;
+               }
+               if (i == nr_to_read - lookahead_size)
                        SetPageReadahead(page);
-               nr_pages++;
+               rac._nr_pages++;
        }
 
        /*
@@ -208,26 +241,53 @@ unsigned int __do_page_cache_readahead(struct address_space *mapping,
         * uptodate then the caller will launch readpage again, and
         * will then handle the error.
         */
-       if (nr_pages)
-               read_pages(mapping, filp, &page_pool, nr_pages, gfp_mask);
-       BUG_ON(!list_empty(&page_pool));
-out:
-       return nr_pages;
+       read_pages(&rac, &page_pool, false);
+       memalloc_nofs_restore(nofs);
+}
+EXPORT_SYMBOL_GPL(page_cache_readahead_unbounded);
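
The export is aimed at filesystems that need to read pages lying beyond the file's reported i_size (metadata stored past EOF, for instance). A hedged sketch of such a caller, where myfs_prefetch_tail() and its parameters are purely illustrative:

/*
 * Hypothetical caller: prefetch @nr pages starting at @index with no
 * EOF clamping, so the caller must know that data really exists there.
 * Ordinary reads should go through page_cache_sync_readahead() instead.
 */
static void myfs_prefetch_tail(struct file *file, pgoff_t index,
			       unsigned long nr)
{
	page_cache_readahead_unbounded(file->f_mapping, file, index, nr, 0);
}
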
+
+/*
+ * __do_page_cache_readahead() actually reads a chunk of disk.  It allocates
+ * the pages first, then submits them for I/O. This avoids the very bad
+ * behaviour which would occur if page allocations are causing VM writeback.
+ * We really don't want to intermingle reads and writes like that.
+ */
+void __do_page_cache_readahead(struct address_space *mapping,
+               struct file *file, pgoff_t index, unsigned long nr_to_read,
+               unsigned long lookahead_size)
+{
+       struct inode *inode = mapping->host;
+       loff_t isize = i_size_read(inode);
+       pgoff_t end_index;      /* The last page we want to read */
+
+       if (isize == 0)
+               return;
+
+       end_index = (isize - 1) >> PAGE_SHIFT;
+       if (index > end_index)
+               return;
+       /* Don't read past the page containing the last byte of the file */
+       if (nr_to_read > end_index - index)
+               nr_to_read = end_index - index + 1;
+
+       page_cache_readahead_unbounded(mapping, file, index, nr_to_read,
+                       lookahead_size);
 }
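
To make the EOF clamp concrete: with 4 KiB pages and i_size = 10,000 bytes, end_index is (10000 - 1) >> 12 = 2, so a request for 32 pages at index 0 is trimmed to nr_to_read = 2 - 0 + 1 = 3, and a request starting at index 3 or beyond returns without reading anything.
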
 
 /*
  * Chunk the readahead into 2 megabyte units, so that we don't pin too much
  * memory at once.
  */
-int force_page_cache_readahead(struct address_space *mapping, struct file *filp,
-                              pgoff_t offset, unsigned long nr_to_read)
+void force_page_cache_readahead(struct address_space *mapping,
+               struct file *filp, pgoff_t index, unsigned long nr_to_read)
 {
        struct backing_dev_info *bdi = inode_to_bdi(mapping->host);
        struct file_ra_state *ra = &filp->f_ra;
        unsigned long max_pages;
 
-       if (unlikely(!mapping->a_ops->readpage && !mapping->a_ops->readpages))
-               return -EINVAL;
+       if (unlikely(!mapping->a_ops->readpage && !mapping->a_ops->readpages &&
+                       !mapping->a_ops->readahead))
+               return;
 
        /*
         * If the request exceeds the readahead window, allow the read to
@@ -240,12 +300,11 @@ int force_page_cache_readahead(struct address_space *mapping, struct file *filp,
 
                if (this_chunk > nr_to_read)
                        this_chunk = nr_to_read;
-               __do_page_cache_readahead(mapping, filp, offset, this_chunk, 0);
+               __do_page_cache_readahead(mapping, filp, index, this_chunk, 0);
 
-               offset += this_chunk;
+               index += this_chunk;
                nr_to_read -= this_chunk;
        }
-       return 0;
 }
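
As a worked example of the chunking (assuming the request is not first trimmed by the max_pages clamp above the loop): with 4 KiB pages the 2 MiB unit is 512 pages, so a forced readahead of 1200 pages is issued as three __do_page_cache_readahead() calls of 512, 512 and 176 pages, each with a lookahead_size of 0.
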
 
 /*
@@ -324,21 +383,21 @@ static unsigned long get_next_ra_size(struct file_ra_state *ra,
  */
 
 /*
- * Count contiguously cached pages from @offset-1 to @offset-@max,
+ * Count contiguously cached pages from @index-1 to @index-@max,
  * this count is a conservative estimation of
  *     - length of the sequential read sequence, or
  *     - thrashing threshold in memory tight systems
  */
 static pgoff_t count_history_pages(struct address_space *mapping,
-                                  pgoff_t offset, unsigned long max)
+                                  pgoff_t index, unsigned long max)
 {
        pgoff_t head;
 
        rcu_read_lock();
-       head = page_cache_prev_miss(mapping, offset - 1, max);
+       head = page_cache_prev_miss(mapping, index - 1, max);
        rcu_read_unlock();
 
-       return offset - 1 - head;
+       return index - 1 - head;
 }
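
For example, if @index is 100 and pages 70-99 are cached while page 69 is not (and @max is large enough to cover the scan), page_cache_prev_miss() reports the gap at 69 and count_history_pages() returns 100 - 1 - 69 = 30: a run of 30 contiguously cached pages immediately before the current read.
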
 
 /*
@@ -346,13 +405,13 @@ static pgoff_t count_history_pages(struct address_space *mapping,
  */
 static int try_context_readahead(struct address_space *mapping,
                                 struct file_ra_state *ra,
-                                pgoff_t offset,
+                                pgoff_t index,
                                 unsigned long req_size,
                                 unsigned long max)
 {
        pgoff_t size;
 
-       size = count_history_pages(mapping, offset, max);
+       size = count_history_pages(mapping, index, max);
 
        /*
         * not enough history pages:
@@ -365,10 +424,10 @@ static int try_context_readahead(struct address_space *mapping,
         * starts from beginning of file:
         * it is a strong indication of long-run stream (or whole-file-read)
         */
-       if (size >= offset)
+       if (size >= index)
                size *= 2;
 
-       ra->start = offset;
+       ra->start = index;
        ra->size = min(size + req_size, max);
        ra->async_size = 1;
 
@@ -378,16 +437,15 @@ static int try_context_readahead(struct address_space *mapping,
 /*
  * A minimal readahead algorithm for trivial sequential/random reads.
  */
-static unsigned long
-ondemand_readahead(struct address_space *mapping,
-                  struct file_ra_state *ra, struct file *filp,
-                  bool hit_readahead_marker, pgoff_t offset,
-                  unsigned long req_size)
+static void ondemand_readahead(struct address_space *mapping,
+               struct file_ra_state *ra, struct file *filp,
+               bool hit_readahead_marker, pgoff_t index,
+               unsigned long req_size)
 {
        struct backing_dev_info *bdi = inode_to_bdi(mapping->host);
        unsigned long max_pages = ra->ra_pages;
        unsigned long add_pages;
-       pgoff_t prev_offset;
+       pgoff_t prev_index;
 
        /*
         * If the request exceeds the readahead window, allow the read to
@@ -399,15 +457,15 @@ ondemand_readahead(struct address_space *mapping,
        /*
         * start of file
         */
-       if (!offset)
+       if (!index)
                goto initial_readahead;
 
        /*
-        * It's the expected callback offset, assume sequential access.
+        * It's the expected callback index, assume sequential access.
         * Ramp up sizes, and push forward the readahead window.
         */
-       if ((offset == (ra->start + ra->size - ra->async_size) ||
-            offset == (ra->start + ra->size))) {
+       if ((index == (ra->start + ra->size - ra->async_size) ||
+            index == (ra->start + ra->size))) {
                ra->start += ra->size;
                ra->size = get_next_ra_size(ra, max_pages);
                ra->async_size = ra->size;
@@ -424,14 +482,14 @@ ondemand_readahead(struct address_space *mapping,
                pgoff_t start;
 
                rcu_read_lock();
-               start = page_cache_next_miss(mapping, offset + 1, max_pages);
+               start = page_cache_next_miss(mapping, index + 1, max_pages);
                rcu_read_unlock();
 
-               if (!start || start - offset > max_pages)
-                       return 0;
+               if (!start || start - index > max_pages)
+                       return;
 
                ra->start = start;
-               ra->size = start - offset;      /* old async_size */
+               ra->size = start - index;       /* old async_size */
                ra->size += req_size;
                ra->size = get_next_ra_size(ra, max_pages);
                ra->async_size = ra->size;
@@ -446,28 +504,29 @@ ondemand_readahead(struct address_space *mapping,
 
        /*
         * sequential cache miss
-        * trivial case: (offset - prev_offset) == 1
-        * unaligned reads: (offset - prev_offset) == 0
+        * trivial case: (index - prev_index) == 1
+        * unaligned reads: (index - prev_index) == 0
         */
-       prev_offset = (unsigned long long)ra->prev_pos >> PAGE_SHIFT;
-       if (offset - prev_offset <= 1UL)
+       prev_index = (unsigned long long)ra->prev_pos >> PAGE_SHIFT;
+       if (index - prev_index <= 1UL)
                goto initial_readahead;
 
        /*
         * Query the page cache and look for the traces(cached history pages)
         * that a sequential stream would leave behind.
         */
-       if (try_context_readahead(mapping, ra, offset, req_size, max_pages))
+       if (try_context_readahead(mapping, ra, index, req_size, max_pages))
                goto readit;
 
        /*
         * standalone, small random read
         * Read as is, and do not pollute the readahead state.
         */
-       return __do_page_cache_readahead(mapping, filp, offset, req_size, 0);
+       __do_page_cache_readahead(mapping, filp, index, req_size, 0);
+       return;
 
 initial_readahead:
-       ra->start = offset;
+       ra->start = index;
        ra->size = get_init_ra_size(req_size, max_pages);
        ra->async_size = ra->size > req_size ? ra->size - req_size : ra->size;
 
@@ -478,7 +537,7 @@ readit:
         * the resulted next readahead window into the current one.
         * Take care of maximum IO pages as above.
         */
-       if (offset == ra->start && ra->size == ra->async_size) {
+       if (index == ra->start && ra->size == ra->async_size) {
                add_pages = get_next_ra_size(ra, max_pages);
                if (ra->size + add_pages <= max_pages) {
                        ra->async_size = add_pages;
@@ -489,7 +548,7 @@ readit:
                }
        }
 
-       return ra_submit(ra, mapping, filp);
+       ra_submit(ra, mapping, filp);
 }
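
Tracing the branches above for a typical workload: the first read of a file (index 0) starts a fresh window; a read at the expected next index ramps the window up via get_next_ra_size(); hitting the readahead marker on an already-cached page rebuilds the window from the next hole reported by page_cache_next_miss(); a sequential cache miss adjacent to prev_index restarts the window; a detectable run of cached history pages sizes the window from that context; anything else is treated as a standalone random read and serviced by a single __do_page_cache_readahead() call without touching the readahead state.
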
 
 /**
@@ -497,9 +556,8 @@ readit:
  * @mapping: address_space which holds the pagecache and I/O vectors
  * @ra: file_ra_state which holds the readahead state
  * @filp: passed on to ->readpage() and ->readpages()
- * @offset: start offset into @mapping, in pagecache page-sized units
- * @req_size: hint: total size of the read which the caller is performing in
- *            pagecache pages
+ * @index: Index of first page to be read.
+ * @req_count: Total number of pages being read by the caller.
  *
  * page_cache_sync_readahead() should be called when a cache miss happened:
  * it will submit the read.  The readahead logic may decide to piggyback more
@@ -508,7 +566,7 @@ readit:
  */
 void page_cache_sync_readahead(struct address_space *mapping,
                               struct file_ra_state *ra, struct file *filp,
-                              pgoff_t offset, unsigned long req_size)
+                              pgoff_t index, unsigned long req_count)
 {
        /* no read-ahead */
        if (!ra->ra_pages)
@@ -519,12 +577,12 @@ void page_cache_sync_readahead(struct address_space *mapping,
 
        /* be dumb */
        if (filp && (filp->f_mode & FMODE_RANDOM)) {
-               force_page_cache_readahead(mapping, filp, offset, req_size);
+               force_page_cache_readahead(mapping, filp, index, req_count);
                return;
        }
 
        /* do read-ahead */
-       ondemand_readahead(mapping, ra, filp, false, offset, req_size);
+       ondemand_readahead(mapping, ra, filp, false, index, req_count);
 }
 EXPORT_SYMBOL_GPL(page_cache_sync_readahead);
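
A minimal sketch of a caller on a buffered read path, modelled on how a cache miss is typically handled; myfs_get_cached_page() and its parameters are illustrative assumptions:

#include <linux/fs.h>
#include <linux/pagemap.h>

/*
 * Look up @index in the page cache; on a miss, let readahead pull the
 * page (and, if it sees fit, more pages after it) in before retrying.
 */
static struct page *myfs_get_cached_page(struct file *filp, pgoff_t index,
					 unsigned long req_count)
{
	struct address_space *mapping = filp->f_mapping;
	struct page *page;

	page = find_get_page(mapping, index);
	if (!page) {
		page_cache_sync_readahead(mapping, &filp->f_ra, filp,
					  index, req_count);
		page = find_get_page(mapping, index);
	}
	return page;	/* may still be NULL: readahead is best-effort */
}
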
 
@@ -533,21 +591,20 @@ EXPORT_SYMBOL_GPL(page_cache_sync_readahead);
  * @mapping: address_space which holds the pagecache and I/O vectors
  * @ra: file_ra_state which holds the readahead state
  * @filp: passed on to ->readpage() and ->readpages()
- * @page: the page at @offset which has the PG_readahead flag set
- * @offset: start offset into @mapping, in pagecache page-sized units
- * @req_size: hint: total size of the read which the caller is performing in
- *            pagecache pages
+ * @page: The page at @index which triggered the readahead call.
+ * @index: Index of first page to be read.
+ * @req_count: Total number of pages being read by the caller.
  *
  * page_cache_async_readahead() should be called when a page is used which
- * has the PG_readahead flag; this is a marker to suggest that the application
+ * is marked as PageReadahead; this is a marker to suggest that the application
  * has used up enough of the readahead window that we should start pulling in
  * more pages.
  */
 void
 page_cache_async_readahead(struct address_space *mapping,
                           struct file_ra_state *ra, struct file *filp,
-                          struct page *page, pgoff_t offset,
-                          unsigned long req_size)
+                          struct page *page, pgoff_t index,
+                          unsigned long req_count)
 {
        /* no read-ahead */
        if (!ra->ra_pages)
@@ -571,7 +628,7 @@ page_cache_async_readahead(struct address_space *mapping,
                return;
 
        /* do read-ahead */
-       ondemand_readahead(mapping, ra, filp, true, offset, req_size);
+       ondemand_readahead(mapping, ra, filp, true, index, req_count);
 }
 EXPORT_SYMBOL_GPL(page_cache_async_readahead);
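
And the matching asynchronous trigger: a sketch of what a caller does when it finds a page already cached but carrying the readahead marker; myfs_note_page_used() and req_count are illustrative assumptions:

#include <linux/page-flags.h>
#include <linux/pagemap.h>

/*
 * The PG_readahead marker means the application is consuming the
 * readahead window, so start pulling in the next batch now.
 */
static void myfs_note_page_used(struct file *filp, struct page *page,
				pgoff_t index, unsigned long req_count)
{
	if (PageReadahead(page))
		page_cache_async_readahead(filp->f_mapping, &filp->f_ra,
					   filp, page, index, req_count);
}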