OSDN Git Service

NFS: Fix bool initialization/comparison
[sagit-ice-cold/kernel_xiaomi_msm8998.git] / mm / filemap.c
index 327910c..f217120 100644 (file)
  *   ->tasklist_lock            (memory_failure, collect_procs_ao)
  */
 
+static int page_cache_tree_insert(struct address_space *mapping,
+                                 struct page *page, void **shadowp)
+{
+       struct radix_tree_node *node;
+       void **slot;
+       int error;
+
+       error = __radix_tree_create(&mapping->page_tree, page->index,
+                                   &node, &slot);
+       if (error)
+               return error;
+       if (*slot) {
+               void *p;
+
+               p = radix_tree_deref_slot_protected(slot, &mapping->tree_lock);
+               if (!radix_tree_exceptional_entry(p))
+                       return -EEXIST;
+               if (shadowp)
+                       *shadowp = p;
+               mapping->nrshadows--;
+               if (node)
+                       workingset_node_shadows_dec(node);
+       }
+       radix_tree_replace_slot(slot, page);
+       mapping->nrpages++;
+       if (node) {
+               workingset_node_pages_inc(node);
+               /*
+                * Don't track node that contains actual pages.
+                *
+                * Avoid acquiring the list_lru lock if already
+                * untracked.  The list_empty() test is safe as
+                * node->private_list is protected by
+                * mapping->tree_lock.
+                */
+               if (!list_empty(&node->private_list))
+                       list_lru_del(&workingset_shadow_nodes,
+                                    &node->private_list);
+       }
+       return 0;
+}
+
 static void page_cache_tree_delete(struct address_space *mapping,
                                   struct page *page, void *shadow)
 {
@@ -122,6 +164,14 @@ static void page_cache_tree_delete(struct address_space *mapping,
 
        __radix_tree_lookup(&mapping->page_tree, page->index, &node, &slot);
 
+       if (!node) {
+               /*
+                * We need a node to properly account shadow
+                * entries. Don't plant any without. XXX
+                */
+               shadow = NULL;
+       }
+
        if (shadow) {
                mapping->nrshadows++;
                /*
@@ -290,7 +340,8 @@ int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
                .range_end = end,
        };
 
-       if (!mapping_cap_writeback_dirty(mapping))
+       if (!mapping_cap_writeback_dirty(mapping) ||
+           !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
                return 0;
 
        wbc_attach_fdatawrite_inode(&wbc, mapping->host);
@@ -331,23 +382,14 @@ int filemap_flush(struct address_space *mapping)
 }
 EXPORT_SYMBOL(filemap_flush);
 
-/**
- * filemap_fdatawait_range - wait for writeback to complete
- * @mapping:           address space structure to wait for
- * @start_byte:                offset in bytes where the range starts
- * @end_byte:          offset in bytes where the range ends (inclusive)
- *
- * Walk the list of under-writeback pages of the given address space
- * in the given range and wait for all of them.
- */
-int filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte,
-                           loff_t end_byte)
+static int __filemap_fdatawait_range(struct address_space *mapping,
+                                    loff_t start_byte, loff_t end_byte)
 {
        pgoff_t index = start_byte >> PAGE_CACHE_SHIFT;
        pgoff_t end = end_byte >> PAGE_CACHE_SHIFT;
        struct pagevec pvec;
        int nr_pages;
-       int ret2, ret = 0;
+       int ret = 0;
 
        if (end_byte < start_byte)
                goto out;
@@ -374,6 +416,29 @@ int filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte,
                cond_resched();
        }
 out:
+       return ret;
+}
+
+/**
+ * filemap_fdatawait_range - wait for writeback to complete
+ * @mapping:           address space structure to wait for
+ * @start_byte:                offset in bytes where the range starts
+ * @end_byte:          offset in bytes where the range ends (inclusive)
+ *
+ * Walk the list of under-writeback pages of the given address space
+ * in the given range and wait for all of them.  Check error status of
+ * the address space and return it.
+ *
+ * Since the error status of the address space is cleared by this function,
+ * callers are responsible for checking the return value and handling and/or
+ * reporting the error.
+ */
+int filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte,
+                           loff_t end_byte)
+{
+       int ret, ret2;
+
+       ret = __filemap_fdatawait_range(mapping, start_byte, end_byte);
        ret2 = filemap_check_errors(mapping);
        if (!ret)
                ret = ret2;
@@ -383,11 +448,38 @@ out:
 EXPORT_SYMBOL(filemap_fdatawait_range);
 
 /**
+ * filemap_fdatawait_keep_errors - wait for writeback without clearing errors
+ * @mapping: address space structure to wait for
+ *
+ * Walk the list of under-writeback pages of the given address space
+ * and wait for all of them.  Unlike filemap_fdatawait(), this function
+ * does not clear error status of the address space.
+ *
+ * Use this function if callers don't handle errors themselves.  Expected
+ * call sites are system-wide / filesystem-wide data flushers: e.g. sync(2),
+ * fsfreeze(8)
+ */
+void filemap_fdatawait_keep_errors(struct address_space *mapping)
+{
+       loff_t i_size = i_size_read(mapping->host);
+
+       if (i_size == 0)
+               return;
+
+       __filemap_fdatawait_range(mapping, 0, i_size - 1);
+}
+
+/**
  * filemap_fdatawait - wait for all under-writeback pages to complete
  * @mapping: address space structure to wait for
  *
  * Walk the list of under-writeback pages of the given address space
- * and wait for all of them.
+ * and wait for all of them.  Check error status of the address space
+ * and return it.
+ *
+ * Since the error status of the address space is cleared by this function,
+ * callers are responsible for checking the return value and handling and/or
+ * reporting the error.
  */
 int filemap_fdatawait(struct address_space *mapping)
 {
@@ -480,7 +572,7 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask)
        VM_BUG_ON_PAGE(!PageLocked(new), new);
        VM_BUG_ON_PAGE(new->mapping, new);
 
-       error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM);
+       error = radix_tree_preload(gfp_mask & GFP_RECLAIM_MASK);
        if (!error) {
                struct address_space *mapping = old->mapping;
                void (*freepage)(struct page *);
@@ -497,9 +589,8 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask)
                memcg = mem_cgroup_begin_page_stat(old);
                spin_lock_irqsave(&mapping->tree_lock, flags);
                __delete_from_page_cache(old, NULL, memcg);
-               error = radix_tree_insert(&mapping->page_tree, offset, new);
+               error = page_cache_tree_insert(mapping, new, NULL);
                BUG_ON(error);
-               mapping->nrpages++;
 
                /*
                 * hugetlb pages do not participate in page cache accounting.
@@ -510,7 +601,7 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask)
                        __inc_zone_page_state(new, NR_SHMEM);
                spin_unlock_irqrestore(&mapping->tree_lock, flags);
                mem_cgroup_end_page_stat(memcg);
-               mem_cgroup_migrate(old, new, true);
+               mem_cgroup_replace_page(old, new);
                radix_tree_preload_end();
                if (freepage)
                        freepage(old);
@@ -521,48 +612,6 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask)
 }
 EXPORT_SYMBOL_GPL(replace_page_cache_page);
 
-static int page_cache_tree_insert(struct address_space *mapping,
-                                 struct page *page, void **shadowp)
-{
-       struct radix_tree_node *node;
-       void **slot;
-       int error;
-
-       error = __radix_tree_create(&mapping->page_tree, page->index,
-                                   &node, &slot);
-       if (error)
-               return error;
-       if (*slot) {
-               void *p;
-
-               p = radix_tree_deref_slot_protected(slot, &mapping->tree_lock);
-               if (!radix_tree_exceptional_entry(p))
-                       return -EEXIST;
-               if (shadowp)
-                       *shadowp = p;
-               mapping->nrshadows--;
-               if (node)
-                       workingset_node_shadows_dec(node);
-       }
-       radix_tree_replace_slot(slot, page);
-       mapping->nrpages++;
-       if (node) {
-               workingset_node_pages_inc(node);
-               /*
-                * Don't track node that contains actual pages.
-                *
-                * Avoid acquiring the list_lru lock if already
-                * untracked.  The list_empty() test is safe as
-                * node->private_list is protected by
-                * mapping->tree_lock.
-                */
-               if (!list_empty(&node->private_list))
-                       list_lru_del(&workingset_shadow_nodes,
-                                    &node->private_list);
-       }
-       return 0;
-}
-
 static int __add_to_page_cache_locked(struct page *page,
                                      struct address_space *mapping,
                                      pgoff_t offset, gfp_t gfp_mask,
@@ -582,7 +631,7 @@ static int __add_to_page_cache_locked(struct page *page,
                        return error;
        }
 
-       error = radix_tree_maybe_preload(gfp_mask & ~__GFP_HIGHMEM);
+       error = radix_tree_maybe_preload(gfp_mask & GFP_RECLAIM_MASK);
        if (error) {
                if (!huge)
                        mem_cgroup_cancel_charge(page, memcg);
@@ -817,9 +866,12 @@ void page_endio(struct page *page, int rw, int err)
                unlock_page(page);
        } else { /* rw == WRITE */
                if (err) {
+                       struct address_space *mapping;
+
                        SetPageError(page);
-                       if (page->mapping)
-                               mapping_set_error(page->mapping, err);
+                       mapping = page_mapping(page);
+                       if (mapping)
+                               mapping_set_error(mapping, err);
                }
                end_page_writeback(page);
        }
@@ -1141,8 +1193,7 @@ no_page:
                if (fgp_flags & FGP_ACCESSED)
                        __SetPageReferenced(page);
 
-               err = add_to_page_cache_lru(page, mapping, offset,
-                               gfp_mask & GFP_RECLAIM_MASK);
+               err = add_to_page_cache_lru(page, mapping, offset, gfp_mask);
                if (unlikely(err)) {
                        page_cache_release(page);
                        page = NULL;
@@ -1511,6 +1562,11 @@ static ssize_t do_generic_file_read(struct file *filp, loff_t *ppos,
 
                cond_resched();
 find_page:
+               if (fatal_signal_pending(current)) {
+                       error = -EINTR;
+                       goto out;
+               }
+
                page = find_get_page(mapping, index);
                if (!page) {
                        page_cache_sync_readahead(mapping,
@@ -1526,6 +1582,15 @@ find_page:
                                        index, last_index - index);
                }
                if (!PageUptodate(page)) {
+                       /*
+                        * See comment in do_read_cache_page on why
+                        * wait_on_page_locked is used to avoid unnecessarily
+                        * serialisations and why it's safe.
+                        */
+                       wait_on_page_locked_killable(page);
+                       if (PageUptodate(page))
+                               goto page_ok;
+
                        if (inode->i_blkbits == PAGE_CACHE_SHIFT ||
                                        !mapping->a_ops->is_partially_uptodate)
                                goto page_not_up_to_date;
@@ -1681,7 +1746,7 @@ no_cached_page:
                        goto out;
                }
                error = add_to_page_cache_lru(page, mapping, index,
-                                       GFP_KERNEL & mapping_gfp_mask(mapping));
+                               mapping_gfp_constraint(mapping, GFP_KERNEL));
                if (error) {
                        page_cache_release(page);
                        if (error == -EEXIST) {
@@ -1771,19 +1836,18 @@ EXPORT_SYMBOL(generic_file_read_iter);
  * This adds the requested page to the page cache if it isn't already there,
  * and schedules an I/O to read in its contents from disk.
  */
-static int page_cache_read(struct file *file, pgoff_t offset)
+static int page_cache_read(struct file *file, pgoff_t offset, gfp_t gfp_mask)
 {
        struct address_space *mapping = file->f_mapping;
        struct page *page;
        int ret;
 
        do {
-               page = page_cache_alloc_cold(mapping);
+               page = __page_cache_alloc(gfp_mask|__GFP_COLD);
                if (!page)
                        return -ENOMEM;
 
-               ret = add_to_page_cache_lru(page, mapping, offset,
-                               GFP_KERNEL & mapping_gfp_mask(mapping));
+               ret = add_to_page_cache_lru(page, mapping, offset, gfp_mask);
                if (ret == 0)
                        ret = mapping->a_ops->readpage(file, page);
                else if (ret == -EEXIST)
@@ -1807,7 +1871,6 @@ static void do_sync_mmap_readahead(struct vm_area_struct *vma,
                                   struct file *file,
                                   pgoff_t offset)
 {
-       unsigned long ra_pages;
        struct address_space *mapping = file->f_mapping;
 
        /* If we don't want any read-ahead, don't bother */
@@ -1836,10 +1899,9 @@ static void do_sync_mmap_readahead(struct vm_area_struct *vma,
        /*
         * mmap read-around
         */
-       ra_pages = max_sane_readahead(ra->ra_pages);
-       ra->start = max_t(long, 0, offset - ra_pages / 2);
-       ra->size = ra_pages;
-       ra->async_size = ra_pages / 4;
+       ra->start = max_t(long, 0, offset - ra->ra_pages / 2);
+       ra->size = ra->ra_pages;
+       ra->async_size = ra->ra_pages / 4;
        ra_submit(ra, mapping, file);
 }
 
@@ -1966,7 +2028,7 @@ no_cached_page:
         * We're only likely to ever get here if MADV_RANDOM is in
         * effect.
         */
-       error = page_cache_read(file, offset);
+       error = page_cache_read(file, offset, vmf->gfp_mask);
 
        /*
         * The page we want has now been added to the page cache.
@@ -2163,7 +2225,7 @@ static struct page *wait_on_page_read(struct page *page)
        return page;
 }
 
-static struct page *__read_cache_page(struct address_space *mapping,
+static struct page *do_read_cache_page(struct address_space *mapping,
                                pgoff_t index,
                                int (*filler)(void *, struct page *),
                                void *data,
@@ -2185,53 +2247,74 @@ repeat:
                        /* Presumably ENOMEM for radix tree node */
                        return ERR_PTR(err);
                }
+
+filler:
                err = filler(data, page);
                if (err < 0) {
                        page_cache_release(page);
-                       page = ERR_PTR(err);
-               } else {
-                       page = wait_on_page_read(page);
+                       return ERR_PTR(err);
                }
-       }
-       return page;
-}
 
-static struct page *do_read_cache_page(struct address_space *mapping,
-                               pgoff_t index,
-                               int (*filler)(void *, struct page *),
-                               void *data,
-                               gfp_t gfp)
-
-{
-       struct page *page;
-       int err;
+               page = wait_on_page_read(page);
+               if (IS_ERR(page))
+                       return page;
+               goto out;
+       }
+       if (PageUptodate(page))
+               goto out;
 
-retry:
-       page = __read_cache_page(mapping, index, filler, data, gfp);
-       if (IS_ERR(page))
-               return page;
+       /*
+        * Page is not up to date and may be locked due one of the following
+        * case a: Page is being filled and the page lock is held
+        * case b: Read/write error clearing the page uptodate status
+        * case c: Truncation in progress (page locked)
+        * case d: Reclaim in progress
+        *
+        * Case a, the page will be up to date when the page is unlocked.
+        *    There is no need to serialise on the page lock here as the page
+        *    is pinned so the lock gives no additional protection. Even if the
+        *    the page is truncated, the data is still valid if PageUptodate as
+        *    it's a race vs truncate race.
+        * Case b, the page will not be up to date
+        * Case c, the page may be truncated but in itself, the data may still
+        *    be valid after IO completes as it's a read vs truncate race. The
+        *    operation must restart if the page is not uptodate on unlock but
+        *    otherwise serialising on page lock to stabilise the mapping gives
+        *    no additional guarantees to the caller as the page lock is
+        *    released before return.
+        * Case d, similar to truncation. If reclaim holds the page lock, it
+        *    will be a race with remove_mapping that determines if the mapping
+        *    is valid on unlock but otherwise the data is valid and there is
+        *    no need to serialise with page lock.
+        *
+        * As the page lock gives no additional guarantee, we optimistically
+        * wait on the page to be unlocked and check if it's up to date and
+        * use the page if it is. Otherwise, the page lock is required to
+        * distinguish between the different cases. The motivation is that we
+        * avoid spurious serialisations and wakeups when multiple processes
+        * wait on the same page for IO to complete.
+        */
+       wait_on_page_locked(page);
        if (PageUptodate(page))
                goto out;
 
+       /* Distinguish between all the cases under the safety of the lock */
        lock_page(page);
+
+       /* Case c or d, restart the operation */
        if (!page->mapping) {
                unlock_page(page);
                page_cache_release(page);
-               goto retry;
+               goto repeat;
        }
+
+       /* Someone else locked and filled the page in a very small window */
        if (PageUptodate(page)) {
                unlock_page(page);
                goto out;
        }
-       err = filler(data, page);
-       if (err < 0) {
-               page_cache_release(page);
-               return ERR_PTR(err);
-       } else {
-               page = wait_on_page_read(page);
-               if (IS_ERR(page))
-                       return page;
-       }
+       goto filler;
+
 out:
        mark_page_accessed(page);
        return page;
@@ -2674,7 +2757,7 @@ EXPORT_SYMBOL(generic_file_write_iter);
  * page is known to the local caching routines.
  *
  * The @gfp_mask argument specifies whether I/O may be performed to release
- * this page (__GFP_IO), and whether the call may block (__GFP_WAIT & __GFP_FS).
+ * this page (__GFP_IO), and whether the call may block (__GFP_RECLAIM & __GFP_FS).
  *
  */
 int try_to_release_page(struct page *page, gfp_t gfp_mask)