diff --git a/mm/filemap.c b/mm/filemap.c
index a3b4021..d78f577 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1587,6 +1587,9 @@ EXPORT_SYMBOL(find_lock_entry);
  *   @gfp_mask and added to the page cache and the VM's LRU
  *   list. The page is returned locked and with an increased
  *   refcount.
+ * - FGP_FOR_MMAP: Similar to FGP_CREAT, only we want to allow the caller to do
+ *   its own locking dance if the page is already in cache, or unlock the page
+ *   before returning if we had to add the page to pagecache.
  *
  * If FGP_LOCK or FGP_CREAT are specified then the function may sleep even
  * if the GFP flags specified for FGP_CREAT are atomic.
@@ -1641,7 +1644,7 @@ no_page:
                if (!page)
                        return NULL;
 
-               if (WARN_ON_ONCE(!(fgp_flags & FGP_LOCK)))
+               if (WARN_ON_ONCE(!(fgp_flags & (FGP_LOCK | FGP_FOR_MMAP))))
                        fgp_flags |= FGP_LOCK;
 
                /* Init accessed so avoid atomic mark_page_accessed later */
@@ -1655,6 +1658,13 @@ no_page:
                        if (err == -EEXIST)
                                goto repeat;
                }
+
+               /*
+                * add_to_page_cache_lru locks the page, and for mmap we expect
+                * an unlocked page.
+                */
+               if (page && (fgp_flags & FGP_FOR_MMAP))
+                       unlock_page(page);
        }
 
        return page;
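
Taken together, the hunks above define the FGP_FOR_MMAP contract: the page is
created if necessary, but is always returned unlocked.  For reference, the
intended caller pattern, condensed from the filemap_fault() hunks later in
this same patch (error handling trimmed), looks like this:

	page = pagecache_get_page(mapping, offset,
				  FGP_CREAT | FGP_FOR_MMAP, vmf->gfp_mask);
	if (!page) {
		if (fpin)			/* mmap_sem already dropped */
			goto out_retry;
		return vmf_error(-ENOMEM);	/* mmap_sem still held */
	}
	/* The page comes back unlocked even if we just added it, so the
	 * fault path chooses how to lock it via lock_page_maybe_drop_mmap(). */
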
@@ -2379,64 +2389,98 @@ out:
 EXPORT_SYMBOL(generic_file_read_iter);
 
 #ifdef CONFIG_MMU
-/**
- * page_cache_read - adds requested page to the page cache if not already there
- * @file:      file to read
- * @offset:    page index
- * @gfp_mask:  memory allocation flags
- *
- * This adds the requested page to the page cache if it isn't already there,
- * and schedules an I/O to read in its contents from disk.
- *
- * Return: %0 on success, negative error code otherwise.
- */
-static int page_cache_read(struct file *file, pgoff_t offset, gfp_t gfp_mask)
+#define MMAP_LOTSAMISS  (100)
+static struct file *maybe_unlock_mmap_for_io(struct vm_fault *vmf,
+                                            struct file *fpin)
 {
-       struct address_space *mapping = file->f_mapping;
-       struct page *page;
-       int ret;
+       int flags = vmf->flags;
 
-       do {
-               page = __page_cache_alloc(gfp_mask);
-               if (!page)
-                       return -ENOMEM;
+       if (fpin)
+               return fpin;
 
-               ret = add_to_page_cache_lru(page, mapping, offset, gfp_mask);
-               if (ret == 0)
-                       ret = mapping->a_ops->readpage(file, page);
-               else if (ret == -EEXIST)
-                       ret = 0; /* losing race to add is OK */
+       /*
+        * FAULT_FLAG_RETRY_NOWAIT means we don't want to wait on page locks
+        * or anything, so we only pin the file and drop the mmap_sem if
+        * FAULT_FLAG_ALLOW_RETRY is set and FAULT_FLAG_RETRY_NOWAIT is not.
+        */
+       if ((flags & (FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT)) ==
+           FAULT_FLAG_ALLOW_RETRY) {
+               fpin = get_file(vmf->vma->vm_file);
+               up_read(&vmf->vma->vm_mm->mmap_sem);
+       }
+       return fpin;
+}
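
A minimal userspace model of the flag check above (the flag values are
hypothetical, chosen only for illustration): the file is pinned and the
mmap_sem dropped only when a retry is allowed and the caller is willing to
wait for it.

	#include <stdbool.h>

	#define FAULT_FLAG_ALLOW_RETRY	0x04	/* hypothetical values */
	#define FAULT_FLAG_RETRY_NOWAIT	0x08

	static bool should_pin_and_drop(unsigned int flags)
	{
		/* True only when ALLOW_RETRY is set and RETRY_NOWAIT is not. */
		return (flags & (FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT))
			== FAULT_FLAG_ALLOW_RETRY;
	}
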
 
-               put_page(page);
+/*
+ * lock_page_maybe_drop_mmap - lock the page, possibly dropping the mmap_sem
+ * @vmf - the vm_fault for this fault.
+ * @page - the page to lock.
+ * @fpin - the pointer to the file we may pin (or is already pinned).
+ *
+ * This works similarly to lock_page_or_retry in that it can drop the mmap_sem.
+ * It differs in that it returns 1 with the page actually locked, or 0 if it
+ * couldn't lock the page.  If we did have to drop the mmap_sem then fpin will
+ * point to the pinned file and needs to be fput()'ed at a later point.
+ */
+static int lock_page_maybe_drop_mmap(struct vm_fault *vmf, struct page *page,
+                                    struct file **fpin)
+{
+       if (trylock_page(page))
+               return 1;
 
-       } while (ret == AOP_TRUNCATED_PAGE);
+       /*
+        * NOTE! This will make us return with VM_FAULT_RETRY, but with
+        * the mmap_sem still held. That's how FAULT_FLAG_RETRY_NOWAIT
+        * is supposed to work. We have way too many special cases...
+        */
+       if (vmf->flags & FAULT_FLAG_RETRY_NOWAIT)
+               return 0;
 
-       return ret;
+       *fpin = maybe_unlock_mmap_for_io(vmf, *fpin);
+       if (vmf->flags & FAULT_FLAG_KILLABLE) {
+               if (__lock_page_killable(page)) {
+                       /*
+                        * We didn't have the right flags to drop the mmap_sem,
+                        * but all fault_handlers only check for fatal signals
+                        * if we return VM_FAULT_RETRY, so we need to drop the
+                        * mmap_sem here and return 0 if we don't have a fpin.
+                        */
+                       if (*fpin == NULL)
+                               up_read(&vmf->vma->vm_mm->mmap_sem);
+                       return 0;
+               }
+       } else
+               __lock_page(page);
+       return 1;
 }
 
-#define MMAP_LOTSAMISS  (100)
 
 /*
- * Synchronous readahead happens when we don't even find
- * a page in the page cache at all.
+ * Synchronous readahead happens when we don't even find a page in the page
+ * cache at all.  We don't want to perform IO under the mmap sem, so if we have
+ * to drop the mmap sem we return the file that was pinned so that the IO can
+ * happen with the sem released.  If we didn't pin a file then we return NULL.
+ * The file that is returned needs to be fput()'ed when we're done with it.
  */
-static void do_sync_mmap_readahead(struct vm_area_struct *vma,
-                                  struct file_ra_state *ra,
-                                  struct file *file,
-                                  pgoff_t offset)
+static struct file *do_sync_mmap_readahead(struct vm_fault *vmf)
 {
+       struct file *file = vmf->vma->vm_file;
+       struct file_ra_state *ra = &file->f_ra;
        struct address_space *mapping = file->f_mapping;
+       struct file *fpin = NULL;
+       pgoff_t offset = vmf->pgoff;
 
        /* If we don't want any read-ahead, don't bother */
-       if (vma->vm_flags & VM_RAND_READ)
-               return;
+       if (vmf->vma->vm_flags & VM_RAND_READ)
+               return fpin;
        if (!ra->ra_pages)
-               return;
+               return fpin;
 
-       if (vma->vm_flags & VM_SEQ_READ) {
+       if (vmf->vma->vm_flags & VM_SEQ_READ) {
+               fpin = maybe_unlock_mmap_for_io(vmf, fpin);
                page_cache_sync_readahead(mapping, ra, file, offset,
                                          ra->ra_pages);
-               return;
+               return fpin;
        }
 
        /* Avoid banging the cache line if not needed */
@@ -2448,37 +2492,44 @@ static void do_sync_mmap_readahead(struct vm_area_struct *vma,
         * stop bothering with read-ahead. It will only hurt.
         */
        if (ra->mmap_miss > MMAP_LOTSAMISS)
-               return;
+               return fpin;
 
        /*
         * mmap read-around
         */
+       fpin = maybe_unlock_mmap_for_io(vmf, fpin);
        ra->start = max_t(long, 0, offset - ra->ra_pages / 2);
        ra->size = ra->ra_pages;
        ra->async_size = ra->ra_pages / 4;
        ra_submit(ra, mapping, file);
+       return fpin;
 }
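
A worked example of the read-around window set up above, assuming the common
default of ra->ra_pages = 32 (128KiB with 4KiB pages) and a fault at page
offset 100 (the numbers are hypothetical):

	/* offset = 100, ra_pages = 32:
	 *	ra->start      = max(0, 100 - 32/2) = 84
	 *	ra->size       = 32
	 *	ra->async_size = 32/4    = 8
	 * Pages 84..115 are read, and the readahead marker is placed
	 * async_size pages from the end of the window so that a fault
	 * there kicks off the next, asynchronous, batch.
	 */
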
 
 /*
  * Asynchronous readahead happens when we find the page and PG_readahead,
- * so we want to possibly extend the readahead further..
+ * so we want to possibly extend the readahead further.  We return the file that
+ * was pinned if we have to drop the mmap_sem in order to do IO.
  */
-static void do_async_mmap_readahead(struct vm_area_struct *vma,
-                                   struct file_ra_state *ra,
-                                   struct file *file,
-                                   struct page *page,
-                                   pgoff_t offset)
+static struct file *do_async_mmap_readahead(struct vm_fault *vmf,
+                                           struct page *page)
 {
+       struct file *file = vmf->vma->vm_file;
+       struct file_ra_state *ra = &file->f_ra;
        struct address_space *mapping = file->f_mapping;
+       struct file *fpin = NULL;
+       pgoff_t offset = vmf->pgoff;
 
        /* If we don't want any read-ahead, don't bother */
-       if (vma->vm_flags & VM_RAND_READ)
-               return;
+       if (vmf->vma->vm_flags & VM_RAND_READ)
+               return fpin;
        if (ra->mmap_miss > 0)
                ra->mmap_miss--;
-       if (PageReadahead(page))
+       if (PageReadahead(page)) {
+               fpin = maybe_unlock_mmap_for_io(vmf, fpin);
                page_cache_async_readahead(mapping, ra, file,
                                           page, offset, ra->ra_pages);
+       }
+       return fpin;
 }
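
Both readahead helpers follow the same ownership rule: a non-NULL return
means the mmap_sem has been dropped and the pinned file must be released once
the fault unwinds.  Condensed from the filemap_fault() hunks below:

	fpin = do_async_mmap_readahead(vmf, page);	/* may drop mmap_sem */
	/* ... fault handling ... */
out_retry:
	if (page)
		put_page(page);
	if (fpin)
		fput(fpin);
	return ret | VM_FAULT_RETRY;
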
 
 /**
@@ -2510,6 +2561,7 @@ vm_fault_t filemap_fault(struct vm_fault *vmf)
 {
        int error;
        struct file *file = vmf->vma->vm_file;
+       struct file *fpin = NULL;
        struct address_space *mapping = file->f_mapping;
        struct file_ra_state *ra = &file->f_ra;
        struct inode *inode = mapping->host;
@@ -2531,23 +2583,26 @@ vm_fault_t filemap_fault(struct vm_fault *vmf)
                 * We found the page, so try async readahead before
                 * waiting for the lock.
                 */
-               do_async_mmap_readahead(vmf->vma, ra, file, page, offset);
+               fpin = do_async_mmap_readahead(vmf, page);
        } else if (!page) {
                /* No page in the page cache at all */
-               do_sync_mmap_readahead(vmf->vma, ra, file, offset);
                count_vm_event(PGMAJFAULT);
                count_memcg_event_mm(vmf->vma->vm_mm, PGMAJFAULT);
                ret = VM_FAULT_MAJOR;
+               fpin = do_sync_mmap_readahead(vmf);
 retry_find:
-               page = find_get_page(mapping, offset);
-               if (!page)
-                       goto no_cached_page;
+               page = pagecache_get_page(mapping, offset,
+                                         FGP_CREAT|FGP_FOR_MMAP,
+                                         vmf->gfp_mask);
+               if (!page) {
+                       if (fpin)
+                               goto out_retry;
+                       return vmf_error(-ENOMEM);
+               }
        }
 
-       if (!lock_page_or_retry(page, vmf->vma->vm_mm, vmf->flags)) {
-               put_page(page);
-               return ret | VM_FAULT_RETRY;
-       }
+       if (!lock_page_maybe_drop_mmap(vmf, page, &fpin))
+               goto out_retry;
 
        /* Did it get truncated? */
        if (unlikely(page->mapping != mapping)) {
@@ -2565,6 +2620,16 @@ retry_find:
                goto page_not_uptodate;
 
        /*
+        * We've made it this far, and if we had to drop our mmap_sem along the
+        * way, now is the time to return to the upper layer and have it
+        * re-find the vma and redo the fault.
+        */
+       if (fpin) {
+               unlock_page(page);
+               goto out_retry;
+       }
+
+       /*
         * Found the page and have a reference on it.
         * We must recheck i_size under page lock.
         */
@@ -2578,28 +2643,6 @@ retry_find:
        vmf->page = page;
        return ret | VM_FAULT_LOCKED;
 
-no_cached_page:
-       /*
-        * We're only likely to ever get here if MADV_RANDOM is in
-        * effect.
-        */
-       error = page_cache_read(file, offset, vmf->gfp_mask);
-
-       /*
-        * The page we want has now been added to the page cache.
-        * In the unlikely event that someone removed it in the
-        * meantime, we'll just come back here and read it again.
-        */
-       if (error >= 0)
-               goto retry_find;
-
-       /*
-        * An error return from page_cache_read can result if the
-        * system is low on memory, or a problem occurs while trying
-        * to schedule I/O.
-        */
-       return vmf_error(error);
-
 page_not_uptodate:
        /*
         * Umm, take care of errors if the page isn't up-to-date.
@@ -2608,12 +2651,15 @@ page_not_uptodate:
         * and we need to check for errors.
         */
        ClearPageError(page);
+       fpin = maybe_unlock_mmap_for_io(vmf, fpin);
        error = mapping->a_ops->readpage(file, page);
        if (!error) {
                wait_on_page_locked(page);
                if (!PageUptodate(page))
                        error = -EIO;
        }
+       if (fpin)
+               goto out_retry;
        put_page(page);
 
        if (!error || error == AOP_TRUNCATED_PAGE)
@@ -2622,6 +2668,18 @@ page_not_uptodate:
        /* Things didn't work out. Return zero to tell the mm layer so. */
        shrink_readahead_size_eio(file, ra);
        return VM_FAULT_SIGBUS;
+
+out_retry:
+       /*
+        * We dropped the mmap_sem, we need to return to the fault handler to
+        * re-find the vma and come back and find our hopefully still populated
+        * page.
+        */
+       if (page)
+               put_page(page);
+       if (fpin)
+               fput(fpin);
+       return ret | VM_FAULT_RETRY;
 }
 EXPORT_SYMBOL(filemap_fault);
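
For context, the VM_FAULT_RETRY contract that all of the above relies on is
enforced by the arch fault handlers.  A simplified model, not any particular
architecture's code (signal handling and error paths omitted):

	retry:
		down_read(&mm->mmap_sem);
		vma = find_vma(mm, address);
		fault = handle_mm_fault(vma, address, flags);
		if (fault & VM_FAULT_RETRY) {
			/* filemap_fault() released mmap_sem for us via
			 * maybe_unlock_mmap_for_io(); retry exactly once. */
			flags &= ~FAULT_FLAG_ALLOW_RETRY;
			flags |= FAULT_FLAG_TRIED;
			goto retry;
		}
		up_read(&mm->mmap_sem);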