diff --git a/mm/readahead.c b/mm/readahead.c
index 2fe72cd..3c9a8dd 100644
@@ -22,6 +22,7 @@
 #include <linux/mm_inline.h>
 #include <linux/blk-cgroup.h>
 #include <linux/fadvise.h>
+#include <linux/sched/mm.h>
 
 #include "internal.h"
 
@@ -113,94 +114,126 @@ int read_cache_pages(struct address_space *mapping, struct list_head *pages,
 
 EXPORT_SYMBOL(read_cache_pages);
 
-static int read_pages(struct address_space *mapping, struct file *filp,
-               struct list_head *pages, unsigned int nr_pages, gfp_t gfp)
+static void read_pages(struct readahead_control *rac, struct list_head *pages,
+               bool skip_page)
 {
+       const struct address_space_operations *aops = rac->mapping->a_ops;
+       struct page *page;
        struct blk_plug plug;
-       unsigned page_idx;
-       int ret;
+
+       if (!readahead_count(rac))
+               goto out;
 
        blk_start_plug(&plug);
 
-       if (mapping->a_ops->readpages) {
-               ret = mapping->a_ops->readpages(filp, mapping, pages, nr_pages);
+       if (aops->readahead) {
+               aops->readahead(rac);
+               /* Clean up the remaining pages */
+               while ((page = readahead_page(rac))) {
+                       unlock_page(page);
+                       put_page(page);
+               }
+       } else if (aops->readpages) {
+               aops->readpages(rac->file, rac->mapping, pages,
+                               readahead_count(rac));
                /* Clean up the remaining pages */
                put_pages_list(pages);
-               goto out;
-       }
-
-       for (page_idx = 0; page_idx < nr_pages; page_idx++) {
-               struct page *page = lru_to_page(pages);
-               list_del(&page->lru);
-               if (!add_to_page_cache_lru(page, mapping, page->index, gfp))
-                       mapping->a_ops->readpage(filp, page);
-               put_page(page);
+               rac->_index += rac->_nr_pages;
+               rac->_nr_pages = 0;
+       } else {
+               while ((page = readahead_page(rac))) {
+                       aops->readpage(rac->file, page);
+                       put_page(page);
+               }
        }
-       ret = 0;
 
-out:
        blk_finish_plug(&plug);
 
-       return ret;
+       BUG_ON(!list_empty(pages));
+       BUG_ON(readahead_count(rac));
+
+out:
+       if (skip_page)
+               rac->_index++;
 }
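
A minimal sketch of a filesystem-side ->readahead implementation that the new dispatch above would call, assuming a hypothetical helper myfs_read_page_async() that queues the read and leaves unlocking to the I/O completion path (myfs_readpage is likewise assumed); any pages the method does not consume stay in the readahead_control and are cleaned up by read_pages() itself:

#include <linux/fs.h>
#include <linux/pagemap.h>

/*
 * Hypothetical ->readahead: every page handed out by readahead_page()
 * is locked and already in the page cache; we drop our reference once
 * the read has been queued.
 */
static void myfs_readahead(struct readahead_control *rac)
{
	struct page *page;

	while ((page = readahead_page(rac))) {
		myfs_read_page_async(page);	/* assumed helper */
		put_page(page);
	}
}

static const struct address_space_operations myfs_aops = {
	.readpage	= myfs_readpage,	/* assumed elsewhere */
	.readahead	= myfs_readahead,
};
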
 
-/*
- * __do_page_cache_readahead() actually reads a chunk of disk.  It allocates
- * the pages first, then submits them for I/O. This avoids the very bad
- * behaviour which would occur if page allocations are causing VM writeback.
- * We really don't want to intermingle reads and writes like that.
+/**
+ * page_cache_readahead_unbounded - Start unchecked readahead.
+ * @mapping: File address space.
+ * @file: This instance of the open file; used for authentication.
+ * @index: First page index to read.
+ * @nr_to_read: The number of pages to read.
+ * @lookahead_size: Where to start the next readahead.
  *
- * Returns the number of pages requested, or the maximum amount of I/O allowed.
+ * This function is for filesystems to call when they want to start
+ * readahead beyond a file's stated i_size.  This is almost certainly
+ * not the function you want to call.  Use page_cache_async_readahead()
+ * or page_cache_sync_readahead() instead.
+ *
+ * Context: File is referenced by caller.  Mutexes may be held by caller.
+ * May sleep, but will not reenter filesystem to reclaim memory.
  */
-unsigned int __do_page_cache_readahead(struct address_space *mapping,
-               struct file *filp, pgoff_t offset, unsigned long nr_to_read,
+void page_cache_readahead_unbounded(struct address_space *mapping,
+               struct file *file, pgoff_t index, unsigned long nr_to_read,
                unsigned long lookahead_size)
 {
-       struct inode *inode = mapping->host;
-       struct page *page;
-       unsigned long end_index;        /* The last page we want to read */
        LIST_HEAD(page_pool);
-       int page_idx;
-       unsigned int nr_pages = 0;
-       loff_t isize = i_size_read(inode);
        gfp_t gfp_mask = readahead_gfp_mask(mapping);
+       struct readahead_control rac = {
+               .mapping = mapping,
+               .file = file,
+               ._index = index,
+       };
+       unsigned long i;
 
-       if (isize == 0)
-               goto out;
-
-       end_index = ((isize - 1) >> PAGE_SHIFT);
+       /*
+        * Partway through the readahead operation, we will have added
+        * locked pages to the page cache, but will not yet have submitted
+        * them for I/O.  Adding another page may need to allocate memory,
+        * which can trigger memory reclaim.  Telling the VM we're in
+        * the middle of a filesystem operation will cause it to not
+        * touch file-backed pages, preventing a deadlock.  Most (all?)
+        * filesystems already specify __GFP_NOFS in their mapping's
+        * gfp_mask, but let's be explicit here.
+        */
+       unsigned int nofs = memalloc_nofs_save();
 
        /*
         * Preallocate as many pages as we will need.
         */
-       for (page_idx = 0; page_idx < nr_to_read; page_idx++) {
-               pgoff_t page_offset = offset + page_idx;
+       for (i = 0; i < nr_to_read; i++) {
+               struct page *page = xa_load(&mapping->i_pages, index + i);
 
-               if (page_offset > end_index)
-                       break;
+               BUG_ON(index + i != rac._index + rac._nr_pages);
 
-               page = xa_load(&mapping->i_pages, page_offset);
                if (page && !xa_is_value(page)) {
                        /*
-                        * Page already present?  Kick off the current batch of
-                        * contiguous pages before continuing with the next
-                        * batch.
+                        * Page already present?  Kick off the current batch
+                        * of contiguous pages before continuing with the
+                        * next batch.  This page may be the one we would
+                        * have intended to mark as Readahead, but we don't
+                        * have a stable reference to this page, and it's
+                        * not worth getting one just for that.
                         */
-                       if (nr_pages)
-                               read_pages(mapping, filp, &page_pool, nr_pages,
-                                               gfp_mask);
-                       nr_pages = 0;
+                       read_pages(&rac, &page_pool, true);
                        continue;
                }
 
                page = __page_cache_alloc(gfp_mask);
                if (!page)
                        break;
-               page->index = page_offset;
-               list_add(&page->lru, &page_pool);
-               if (page_idx == nr_to_read - lookahead_size)
+               if (mapping->a_ops->readpages) {
+                       page->index = index + i;
+                       list_add(&page->lru, &page_pool);
+               } else if (add_to_page_cache_lru(page, mapping, index + i,
+                                       gfp_mask) < 0) {
+                       put_page(page);
+                       read_pages(&rac, &page_pool, true);
+                       continue;
+               }
+               if (i == nr_to_read - lookahead_size)
                        SetPageReadahead(page);
-               nr_pages++;
+               rac._nr_pages++;
        }
 
        /*
@@ -208,26 +241,53 @@ unsigned int __do_page_cache_readahead(struct address_space *mapping,
         * uptodate then the caller will launch readpage again, and
         * will then handle the error.
         */
-       if (nr_pages)
-               read_pages(mapping, filp, &page_pool, nr_pages, gfp_mask);
-       BUG_ON(!list_empty(&page_pool));
-out:
-       return nr_pages;
+       read_pages(&rac, &page_pool, false);
+       memalloc_nofs_restore(nofs);
+}
+EXPORT_SYMBOL_GPL(page_cache_readahead_unbounded);
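
The export is aimed at filesystems that need to read pages lying beyond the file's reported i_size (metadata stored past EOF, for instance). A hedged sketch of such a caller, where myfs_prefetch_tail() and its parameters are purely illustrative:

/*
 * Hypothetical caller: prefetch @nr pages starting at @index with no
 * EOF clamping, so the caller must know that data really exists there.
 * Ordinary reads should go through page_cache_sync_readahead() instead.
 */
static void myfs_prefetch_tail(struct file *file, pgoff_t index,
			       unsigned long nr)
{
	page_cache_readahead_unbounded(file->f_mapping, file, index, nr, 0);
}
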
+
+/*
+ * __do_page_cache_readahead() actually reads a chunk of disk.  It allocates
+ * the pages first, then submits them for I/O. This avoids the very bad
+ * behaviour which would occur if page allocations are causing VM writeback.
+ * We really don't want to intermingle reads and writes like that.
+ */
+void __do_page_cache_readahead(struct address_space *mapping,
+               struct file *file, pgoff_t index, unsigned long nr_to_read,
+               unsigned long lookahead_size)
+{
+       struct inode *inode = mapping->host;
+       loff_t isize = i_size_read(inode);
+       pgoff_t end_index;      /* The last page we want to read */
+
+       if (isize == 0)
+               return;
+
+       end_index = (isize - 1) >> PAGE_SHIFT;
+       if (index > end_index)
+               return;
+       /* Don't read past the page containing the last byte of the file */
+       if (nr_to_read > end_index - index)
+               nr_to_read = end_index - index + 1;
+
+       page_cache_readahead_unbounded(mapping, file, index, nr_to_read,
+                       lookahead_size);
 }
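
To make the EOF clamp concrete: with 4 KiB pages and i_size = 10,000 bytes, end_index is (10000 - 1) >> 12 = 2, so a request for 32 pages at index 0 is trimmed to nr_to_read = 2 - 0 + 1 = 3, and a request starting at index 3 or beyond returns without reading anything.
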
 
 /*
  * Chunk the readahead into 2 megabyte units, so that we don't pin too much
  * memory at once.
  */
-int force_page_cache_readahead(struct address_space *mapping, struct file *filp,
-                              pgoff_t offset, unsigned long nr_to_read)
+void force_page_cache_readahead(struct address_space *mapping,
+               struct file *filp, pgoff_t index, unsigned long nr_to_read)
 {
        struct backing_dev_info *bdi = inode_to_bdi(mapping->host);
        struct file_ra_state *ra = &filp->f_ra;
        unsigned long max_pages;
 
-       if (unlikely(!mapping->a_ops->readpage && !mapping->a_ops->readpages))
-               return -EINVAL;
+       if (unlikely(!mapping->a_ops->readpage && !mapping->a_ops->readpages &&
+                       !mapping->a_ops->readahead))
+               return;
 
        /*
         * If the request exceeds the readahead window, allow the read to
@@ -240,12 +300,11 @@ int force_page_cache_readahead(struct address_space *mapping, struct file *filp,
 
                if (this_chunk > nr_to_read)
                        this_chunk = nr_to_read;
-               __do_page_cache_readahead(mapping, filp, offset, this_chunk, 0);
+               __do_page_cache_readahead(mapping, filp, index, this_chunk, 0);
 
-               offset += this_chunk;
+               index += this_chunk;
                nr_to_read -= this_chunk;
        }
-       return 0;
 }
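
As a worked example of the chunking (assuming the request is not first trimmed by the max_pages clamp above the loop): with 4 KiB pages the 2 MiB unit is 512 pages, so a forced readahead of 1200 pages is issued as three __do_page_cache_readahead() calls of 512, 512 and 176 pages, each with a lookahead_size of 0.
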
 
 /*
@@ -324,21 +383,21 @@ static unsigned long get_next_ra_size(struct file_ra_state *ra,
  */
 
 /*
- * Count contiguously cached pages from @offset-1 to @offset-@max,
+ * Count contiguously cached pages from @index-1 to @index-@max,
  * this count is a conservative estimation of
  *     - length of the sequential read sequence, or
  *     - thrashing threshold in memory tight systems
  */
 static pgoff_t count_history_pages(struct address_space *mapping,
-                                  pgoff_t offset, unsigned long max)
+                                  pgoff_t index, unsigned long max)
 {
        pgoff_t head;
 
        rcu_read_lock();
-       head = page_cache_prev_miss(mapping, offset - 1, max);
+       head = page_cache_prev_miss(mapping, index - 1, max);
        rcu_read_unlock();
 
-       return offset - 1 - head;
+       return index - 1 - head;
 }
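
For example, if @index is 100 and pages 70-99 are cached while page 69 is not (and @max is large enough to cover the scan), page_cache_prev_miss() reports the gap at 69 and count_history_pages() returns 100 - 1 - 69 = 30: a run of 30 contiguously cached pages immediately before the current read.
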
 
 /*
@@ -346,13 +405,13 @@ static pgoff_t count_history_pages(struct address_space *mapping,
  */
 static int try_context_readahead(struct address_space *mapping,
                                 struct file_ra_state *ra,
-                                pgoff_t offset,
+                                pgoff_t index,
                                 unsigned long req_size,
                                 unsigned long max)
 {
        pgoff_t size;
 
-       size = count_history_pages(mapping, offset, max);
+       size = count_history_pages(mapping, index, max);
 
        /*
         * not enough history pages:
@@ -365,10 +424,10 @@ static int try_context_readahead(struct address_space *mapping,
         * starts from beginning of file:
         * it is a strong indication of long-run stream (or whole-file-read)
         */
-       if (size >= offset)
+       if (size >= index)
                size *= 2;
 
-       ra->start = offset;
+       ra->start = index;
        ra->size = min(size + req_size, max);
        ra->async_size = 1;
 
@@ -378,16 +437,15 @@ static int try_context_readahead(struct address_space *mapping,
 /*
  * A minimal readahead algorithm for trivial sequential/random reads.
  */
-static unsigned long
-ondemand_readahead(struct address_space *mapping,
-                  struct file_ra_state *ra, struct file *filp,
-                  bool hit_readahead_marker, pgoff_t offset,
-                  unsigned long req_size)
+static void ondemand_readahead(struct address_space *mapping,
+               struct file_ra_state *ra, struct file *filp,
+               bool hit_readahead_marker, pgoff_t index,
+               unsigned long req_size)
 {
        struct backing_dev_info *bdi = inode_to_bdi(mapping->host);
        unsigned long max_pages = ra->ra_pages;
        unsigned long add_pages;
-       pgoff_t prev_offset;
+       pgoff_t prev_index;
 
        /*
         * If the request exceeds the readahead window, allow the read to
@@ -399,15 +457,15 @@ ondemand_readahead(struct address_space *mapping,
        /*
         * start of file
         */
-       if (!offset)
+       if (!index)
                goto initial_readahead;
 
        /*
-        * It's the expected callback offset, assume sequential access.
+        * It's the expected callback index, assume sequential access.
         * Ramp up sizes, and push forward the readahead window.
         */
-       if ((offset == (ra->start + ra->size - ra->async_size) ||
-            offset == (ra->start + ra->size))) {
+       if ((index == (ra->start + ra->size - ra->async_size) ||
+            index == (ra->start + ra->size))) {
                ra->start += ra->size;
                ra->size = get_next_ra_size(ra, max_pages);
                ra->async_size = ra->size;
@@ -424,14 +482,14 @@ ondemand_readahead(struct address_space *mapping,
                pgoff_t start;
 
                rcu_read_lock();
-               start = page_cache_next_miss(mapping, offset + 1, max_pages);
+               start = page_cache_next_miss(mapping, index + 1, max_pages);
                rcu_read_unlock();
 
-               if (!start || start - offset > max_pages)
-                       return 0;
+               if (!start || start - index > max_pages)
+                       return;
 
                ra->start = start;
-               ra->size = start - offset;      /* old async_size */
+               ra->size = start - index;       /* old async_size */
                ra->size += req_size;
                ra->size = get_next_ra_size(ra, max_pages);
                ra->async_size = ra->size;
@@ -446,28 +504,29 @@ ondemand_readahead(struct address_space *mapping,
 
        /*
         * sequential cache miss
-        * trivial case: (offset - prev_offset) == 1
-        * unaligned reads: (offset - prev_offset) == 0
+        * trivial case: (index - prev_index) == 1
+        * unaligned reads: (index - prev_index) == 0
         */
-       prev_offset = (unsigned long long)ra->prev_pos >> PAGE_SHIFT;
-       if (offset - prev_offset <= 1UL)
+       prev_index = (unsigned long long)ra->prev_pos >> PAGE_SHIFT;
+       if (index - prev_index <= 1UL)
                goto initial_readahead;
 
        /*
         * Query the page cache and look for the traces(cached history pages)
         * that a sequential stream would leave behind.
         */
-       if (try_context_readahead(mapping, ra, offset, req_size, max_pages))
+       if (try_context_readahead(mapping, ra, index, req_size, max_pages))
                goto readit;
 
        /*
         * standalone, small random read
         * Read as is, and do not pollute the readahead state.
         */
-       return __do_page_cache_readahead(mapping, filp, offset, req_size, 0);
+       __do_page_cache_readahead(mapping, filp, index, req_size, 0);
+       return;
 
 initial_readahead:
-       ra->start = offset;
+       ra->start = index;
        ra->size = get_init_ra_size(req_size, max_pages);
        ra->async_size = ra->size > req_size ? ra->size - req_size : ra->size;
 
@@ -478,7 +537,7 @@ readit:
         * the resulted next readahead window into the current one.
         * Take care of maximum IO pages as above.
         */
-       if (offset == ra->start && ra->size == ra->async_size) {
+       if (index == ra->start && ra->size == ra->async_size) {
                add_pages = get_next_ra_size(ra, max_pages);
                if (ra->size + add_pages <= max_pages) {
                        ra->async_size = add_pages;
@@ -489,7 +548,7 @@ readit:
                }
        }
 
-       return ra_submit(ra, mapping, filp);
+       ra_submit(ra, mapping, filp);
 }
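
Tracing the branches above for a typical workload: the first read of a file (index 0) starts a fresh window; a read at the expected next index ramps the window up via get_next_ra_size(); hitting the readahead marker on an already-cached page rebuilds the window from the next hole reported by page_cache_next_miss(); a sequential cache miss adjacent to prev_index restarts the window; a detectable run of cached history pages sizes the window from that context; anything else is treated as a standalone random read and serviced by a single __do_page_cache_readahead() call without touching the readahead state.
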
 
 /**
@@ -497,9 +556,8 @@ readit:
  * @mapping: address_space which holds the pagecache and I/O vectors
  * @ra: file_ra_state which holds the readahead state
  * @filp: passed on to ->readpage() and ->readpages()
- * @offset: start offset into @mapping, in pagecache page-sized units
- * @req_size: hint: total size of the read which the caller is performing in
- *            pagecache pages
+ * @index: Index of first page to be read.
+ * @req_count: Total number of pages being read by the caller.
  *
  * page_cache_sync_readahead() should be called when a cache miss happened:
  * it will submit the read.  The readahead logic may decide to piggyback more
@@ -508,7 +566,7 @@ readit:
  */
 void page_cache_sync_readahead(struct address_space *mapping,
                               struct file_ra_state *ra, struct file *filp,
-                              pgoff_t offset, unsigned long req_size)
+                              pgoff_t index, unsigned long req_count)
 {
        /* no read-ahead */
        if (!ra->ra_pages)
@@ -519,12 +577,12 @@ void page_cache_sync_readahead(struct address_space *mapping,
 
        /* be dumb */
        if (filp && (filp->f_mode & FMODE_RANDOM)) {
-               force_page_cache_readahead(mapping, filp, offset, req_size);
+               force_page_cache_readahead(mapping, filp, index, req_count);
                return;
        }
 
        /* do read-ahead */
-       ondemand_readahead(mapping, ra, filp, false, offset, req_size);
+       ondemand_readahead(mapping, ra, filp, false, index, req_count);
 }
 EXPORT_SYMBOL_GPL(page_cache_sync_readahead);
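
A minimal sketch of a caller on a buffered read path, modelled on how a cache miss is typically handled; myfs_get_cached_page() and its parameters are illustrative assumptions:

#include <linux/fs.h>
#include <linux/pagemap.h>

/*
 * Look up @index in the page cache; on a miss, let readahead pull the
 * page (and, if it sees fit, more pages after it) in before retrying.
 */
static struct page *myfs_get_cached_page(struct file *filp, pgoff_t index,
					 unsigned long req_count)
{
	struct address_space *mapping = filp->f_mapping;
	struct page *page;

	page = find_get_page(mapping, index);
	if (!page) {
		page_cache_sync_readahead(mapping, &filp->f_ra, filp,
					  index, req_count);
		page = find_get_page(mapping, index);
	}
	return page;	/* may still be NULL: readahead is best-effort */
}
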
 
@@ -533,21 +591,20 @@ EXPORT_SYMBOL_GPL(page_cache_sync_readahead);
  * @mapping: address_space which holds the pagecache and I/O vectors
  * @ra: file_ra_state which holds the readahead state
  * @filp: passed on to ->readpage() and ->readpages()
- * @page: the page at @offset which has the PG_readahead flag set
- * @offset: start offset into @mapping, in pagecache page-sized units
- * @req_size: hint: total size of the read which the caller is performing in
- *            pagecache pages
+ * @page: The page at @index which triggered the readahead call.
+ * @index: Index of first page to be read.
+ * @req_count: Total number of pages being read by the caller.
  *
  * page_cache_async_readahead() should be called when a page is used which
- * has the PG_readahead flag; this is a marker to suggest that the application
+ * is marked as PageReadahead; this is a marker to suggest that the application
  * has used up enough of the readahead window that we should start pulling in
  * more pages.
  */
 void
 page_cache_async_readahead(struct address_space *mapping,
                           struct file_ra_state *ra, struct file *filp,
-                          struct page *page, pgoff_t offset,
-                          unsigned long req_size)
+                          struct page *page, pgoff_t index,
+                          unsigned long req_count)
 {
        /* no read-ahead */
        if (!ra->ra_pages)
@@ -571,7 +628,7 @@ page_cache_async_readahead(struct address_space *mapping,
                return;
 
        /* do read-ahead */
-       ondemand_readahead(mapping, ra, filp, true, offset, req_size);
+       ondemand_readahead(mapping, ra, filp, true, index, req_count);
 }
 EXPORT_SYMBOL_GPL(page_cache_async_readahead);
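
And the matching asynchronous trigger: a sketch of what a caller does when it finds a page already cached but carrying the readahead marker; myfs_note_page_used() and req_count are illustrative assumptions:

#include <linux/page-flags.h>
#include <linux/pagemap.h>

/*
 * The PG_readahead marker means the application is consuming the
 * readahead window, so start pulling in the next batch now.
 */
static void myfs_note_page_used(struct file *filp, struct page *page,
				pgoff_t index, unsigned long req_count)
{
	if (PageReadahead(page))
		page_cache_async_readahead(filp->f_mapping, &filp->f_ra,
					   filp, page, index, req_count);
}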