From: Linus Torvalds
Date: Wed, 3 Jun 2020 02:59:25 +0000 (-0700)
Subject: Merge tag 'for-5.8-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux
X-Git-Tag: v5.8-rc1~182
X-Git-Url: http://git.osdn.net/view?p=tomoyo%2Ftomoyo-test1.git;a=commitdiff_plain;h=f3cdc8ae116e27d84e1f33c7a2995960cebb73ac;hp=-c

Merge tag 'for-5.8-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux

Pull btrfs updates from David Sterba:
 "Highlights:

   - speed up dead root detection during orphan cleanup, e.g. when
     there are many deleted subvolumes waiting to be cleaned, the trees
     are now looked up in a radix tree instead of an O(N^2) search

   - snapshot creation with an inherited qgroup will mark the qgroup
     inconsistent, requires a rescan

   - send will emit file capabilities after chown; this produces a
     stream that does not need postprocessing to set the capabilities
     again

   - direct io ported to the iomap infrastructure, cleaned up and
     simplified code, notably removing the last use of struct
     buffer_head in btrfs code

  Core changes:

   - factor out backreference iteration, to be used by ordinary
     backreferences and relocation code

   - improved global block reserve utilization
      * better logic to serialize requests
      * increased maximum available for unlink
      * improved handling on large pages (64K)

   - direct io cleanups and fixes
      * simplify layering, where cloned bios were unnecessarily created
        for some cases
      * error handling fixes (submit, endio)
      * remove repair worker thread, used to avoid deadlocks during
        repair

   - refactored block group reading code, preparatory work for a new
     type of block group storage that should improve mount time on
     large filesystems

  Cleanups:

   - cleaned up (and slightly sped up) set/get helpers for metadata
     data structure members

   - root bit REF_COWS got renamed to SHAREABLE to reflect that the
     blocks of the tree get shared either among subvolumes or with the
     relocation trees

  Fixes:

   - when subvolume deletion fails due to ENOSPC, the filesystem is not
     turned read-only

   - device scan deals with devices from other filesystems that changed
     ownership due to overwrite (mkfs)

   - fix a race between scrub and block group removal/allocation

   - fix a long-standing bug of a runaway balance operation printing
     the same line to the syslog, caused by a stale status bit on a
     reloc tree that prevented progress

   - fix corrupt log due to concurrent fsync of inodes with shared
     extents

   - fix space underflow for NODATACOW and buffered writes when it for
     some reason needs to fall back to COW mode"

* tag 'for-5.8-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux: (133 commits)
  btrfs: fix space_info bytes_may_use underflow during space cache writeout
  btrfs: fix space_info bytes_may_use underflow after nocow buffered write
  btrfs: fix wrong file range cleanup after an error filling dealloc range
  btrfs: remove redundant local variable in read_block_for_search
  btrfs: open code key_search
  btrfs: split btrfs_direct_IO to read and write part
  btrfs: remove BTRFS_INODE_READDIO_NEED_LOCK
  fs: remove dio_end_io()
  btrfs: switch to iomap_dio_rw() for dio
  iomap: remove lockdep_assert_held()
  iomap: add a filesystem hook for direct I/O bio submission
  fs: export generic_file_buffered_read()
  btrfs: turn space cache writeout failure messages into debug messages
  btrfs: include error on messages about failure to write space/inode caches
  btrfs: remove useless 'fail_unlock' label from btrfs_csum_file_blocks()
  btrfs: do not ignore error from btrfs_next_leaf() when inserting checksums
  btrfs: make checksum item extension more efficient
  btrfs: fix corrupt log due to concurrent fsync of inodes with shared extents
  btrfs: unexport btrfs_compress_set_level()
  btrfs: simplify iget helpers
  ...
---

f3cdc8ae116e27d84e1f33c7a2995960cebb73ac
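The checksum hunks in disk-io.c and inode.c below collapse btrfs's open-coded
init/update/final sequence into a single crypto_shash_digest() call. A minimal
sketch of the equivalence, assuming the kernel shash API (the helper name
csum_buffer is illustrative, not part of the patch):

	#include <crypto/hash.h>

	static int csum_buffer(struct crypto_shash *tfm, const u8 *data,
			       unsigned int len, u8 *out)
	{
		SHASH_DESC_ON_STACK(shash, tfm);

		shash->tfm = tfm;
		/*
		 * One-shot digest: for a single contiguous buffer this is
		 * equivalent to crypto_shash_init(), crypto_shash_update()
		 * and crypto_shash_final() called in sequence.
		 */
		return crypto_shash_digest(shash, data, len, out);
	}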
diff --combined fs/btrfs/disk-io.c
index 7278789ff8a7,f8ec2d8606fd..7c6f0bbb54a5
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@@ -358,16 -358,14 +358,14 @@@ static int btrfs_check_super_csum(struc
 	SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);

 	shash->tfm = fs_info->csum_shash;
- 	crypto_shash_init(shash);

 	/*
 	 * The super_block structure does not span the whole
 	 * BTRFS_SUPER_INFO_SIZE range, we expect that the unused space is
 	 * filled with zeros and is included in the checksum.
 	 */
- 	crypto_shash_update(shash, raw_disk_sb + BTRFS_CSUM_SIZE,
- 			    BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE);
- 	crypto_shash_final(shash, result);
+ 	crypto_shash_digest(shash, raw_disk_sb + BTRFS_CSUM_SIZE,
+ 			    BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE, result);

 	if (memcmp(disk_sb->csum, result, btrfs_super_csum_size(disk_sb)))
 		return 1;
@@@ -709,9 -707,7 +707,7 @@@ static void end_workqueue_bio(struct bi
 		else
 			wq = fs_info->endio_write_workers;
 	} else {
- 		if (unlikely(end_io_wq->metadata == BTRFS_WQ_ENDIO_DIO_REPAIR))
- 			wq = fs_info->endio_repair_workers;
- 		else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56)
+ 		if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56)
 			wq = fs_info->endio_raid56_workers;
 		else if (end_io_wq->metadata)
 			wq = fs_info->endio_meta_workers;
@@@ -980,7 -976,9 +976,7 @@@ static void btree_invalidatepage(struc
 		btrfs_warn(BTRFS_I(page->mapping->host)->root->fs_info,
 			   "page private not zero on page %llu",
 			   (unsigned long long)page_offset(page));
- 		ClearPagePrivate(page);
- 		set_page_private(page, 0);
- 		put_page(page);
+ 		detach_page_private(page);
 	}
 }
@@@ -1135,9 -1133,12 +1131,12 @@@ static void __setup_root(struct btrfs_r
 	root->log_transid = 0;
 	root->log_transid_committed = -1;
 	root->last_log_commit = 0;
- 	if (!dummy)
+ 	if (!dummy) {
 		extent_io_tree_init(fs_info, &root->dirty_log_pages,
 				    IO_TREE_ROOT_DIRTY_LOG_PAGES, NULL);
+ 		extent_io_tree_init(fs_info, &root->log_csum_range,
+ 				    IO_TREE_LOG_CSUM_RANGE, NULL);
+ 	}

 	memset(&root->root_key, 0, sizeof(root->root_key));
 	memset(&root->root_item, 0, sizeof(root->root_item));
@@@ -1275,12 -1276,13 +1274,13 @@@ static struct btrfs_root *alloc_log_tre
 	root->root_key.offset = BTRFS_TREE_LOG_OBJECTID;

 	/*
- 	 * DON'T set REF_COWS for log trees
+ 	 * DON'T set SHAREABLE bit for log trees.
 	 *
- 	 * log trees do not get reference counted because they go away
- 	 * before a real commit is actually done.  They do store pointers
- 	 * to file data extents, and those reference counts still get
- 	 * updated (along with back refs to the log tree).
+ 	 * Log trees are not exposed to user space thus can't be snapshotted,
+ 	 * and they go away before a real commit is actually done.
+ 	 *
+ 	 * They do store pointers to file data extents, and those reference
+ 	 * counts still get updated (along with back refs to the log tree).
 	 */

 	leaf = btrfs_alloc_tree_block(trans, root, 0, BTRFS_TREE_LOG_OBJECTID,
@@@ -1418,8 -1420,9 +1418,9 @@@ static int btrfs_init_fs_root(struct bt
 	if (ret)
 		goto fail;

- 	if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
- 		set_bit(BTRFS_ROOT_REF_COWS, &root->state);
+ 	if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID &&
+ 	    root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID) {
+ 		set_bit(BTRFS_ROOT_SHAREABLE, &root->state);
 		btrfs_check_and_init_root_item(&root->root_item);
 	}
@@@ -1524,6 -1527,7 +1525,7 @@@ void btrfs_free_fs_info(struct btrfs_fs
 	btrfs_put_root(fs_info->uuid_root);
 	btrfs_put_root(fs_info->free_space_root);
 	btrfs_put_root(fs_info->fs_root);
+ 	btrfs_put_root(fs_info->data_reloc_root);
 	btrfs_check_leaked_roots(fs_info);
 	btrfs_extent_buffer_leak_debug_check(fs_info);
 	kfree(fs_info->super_copy);
@@@ -1533,35 -1537,34 +1535,34 @@@
 struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info,
- 				     struct btrfs_key *location,
- 				     bool check_ref)
+ 				     u64 objectid, bool check_ref)
 {
 	struct btrfs_root *root;
 	struct btrfs_path *path;
 	struct btrfs_key key;
 	int ret;

- 	if (location->objectid == BTRFS_ROOT_TREE_OBJECTID)
+ 	if (objectid == BTRFS_ROOT_TREE_OBJECTID)
 		return btrfs_grab_root(fs_info->tree_root);
- 	if (location->objectid == BTRFS_EXTENT_TREE_OBJECTID)
+ 	if (objectid == BTRFS_EXTENT_TREE_OBJECTID)
 		return btrfs_grab_root(fs_info->extent_root);
- 	if (location->objectid == BTRFS_CHUNK_TREE_OBJECTID)
+ 	if (objectid == BTRFS_CHUNK_TREE_OBJECTID)
 		return btrfs_grab_root(fs_info->chunk_root);
- 	if (location->objectid == BTRFS_DEV_TREE_OBJECTID)
+ 	if (objectid == BTRFS_DEV_TREE_OBJECTID)
 		return btrfs_grab_root(fs_info->dev_root);
- 	if (location->objectid == BTRFS_CSUM_TREE_OBJECTID)
+ 	if (objectid == BTRFS_CSUM_TREE_OBJECTID)
 		return btrfs_grab_root(fs_info->csum_root);
- 	if (location->objectid == BTRFS_QUOTA_TREE_OBJECTID)
+ 	if (objectid == BTRFS_QUOTA_TREE_OBJECTID)
 		return btrfs_grab_root(fs_info->quota_root) ?
 			fs_info->quota_root : ERR_PTR(-ENOENT);
- 	if (location->objectid == BTRFS_UUID_TREE_OBJECTID)
+ 	if (objectid == BTRFS_UUID_TREE_OBJECTID)
 		return btrfs_grab_root(fs_info->uuid_root) ?
 			fs_info->uuid_root : ERR_PTR(-ENOENT);
- 	if (location->objectid == BTRFS_FREE_SPACE_TREE_OBJECTID)
+ 	if (objectid == BTRFS_FREE_SPACE_TREE_OBJECTID)
 		return btrfs_grab_root(fs_info->free_space_root) ?
 			fs_info->free_space_root : ERR_PTR(-ENOENT);
 again:
- 	root = btrfs_lookup_fs_root(fs_info, location->objectid);
+ 	root = btrfs_lookup_fs_root(fs_info, objectid);
 	if (root) {
 		if (check_ref && btrfs_root_refs(&root->root_item) == 0) {
 			btrfs_put_root(root);
@@@ -1570,7 -1573,10 +1571,10 @@@
 		return root;
 	}

- 	root = btrfs_read_tree_root(fs_info->tree_root, location);
+ 	key.objectid = objectid;
+ 	key.type = BTRFS_ROOT_ITEM_KEY;
+ 	key.offset = (u64)-1;
+ 	root = btrfs_read_tree_root(fs_info->tree_root, &key);
 	if (IS_ERR(root))
 		return root;
@@@ -1590,7 -1596,7 +1594,7 @@@
 	}
 	key.objectid = BTRFS_ORPHAN_OBJECTID;
 	key.type = BTRFS_ORPHAN_ITEM_KEY;
- 	key.offset = location->objectid;
+ 	key.offset = objectid;

 	ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
 	btrfs_free_path(path);
@@@ -1940,7 -1946,6 +1944,6 @@@ static void btrfs_stop_all_workers(stru
 	btrfs_destroy_workqueue(fs_info->workers);
 	btrfs_destroy_workqueue(fs_info->endio_workers);
 	btrfs_destroy_workqueue(fs_info->endio_raid56_workers);
- 	btrfs_destroy_workqueue(fs_info->endio_repair_workers);
 	btrfs_destroy_workqueue(fs_info->rmw_workers);
 	btrfs_destroy_workqueue(fs_info->endio_write_workers);
 	btrfs_destroy_workqueue(fs_info->endio_freespace_worker);
@@@ -1981,6 -1986,7 +1984,7 @@@ static void free_root_pointers(struct b
 	free_root_extent_buffers(info->quota_root);
 	free_root_extent_buffers(info->uuid_root);
 	free_root_extent_buffers(info->fs_root);
+ 	free_root_extent_buffers(info->data_reloc_root);
 	if (free_chunk_root)
 		free_root_extent_buffers(info->chunk_root);
 	free_root_extent_buffers(info->free_space_root);
@@@ -1993,6 -1999,7 +1997,7 @@@ void btrfs_put_root(struct btrfs_root *
 	if (refcount_dec_and_test(&root->refs)) {
 		WARN_ON(!RB_EMPTY_ROOT(&root->inode_tree));
+ 		WARN_ON(test_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state));
 		if (root->anon_dev)
 			free_anon_bdev(root->anon_dev);
 		btrfs_drew_lock_destroy(&root->snapshot_lock);
@@@ -2143,8 -2150,6 +2148,6 @@@ static int btrfs_init_workqueues(struc
 	fs_info->endio_raid56_workers =
 		btrfs_alloc_workqueue(fs_info, "endio-raid56", flags,
 				      max_active, 4);
- 	fs_info->endio_repair_workers =
- 		btrfs_alloc_workqueue(fs_info, "endio-repair", flags, 1, 0);
 	fs_info->rmw_workers =
 		btrfs_alloc_workqueue(fs_info, "rmw", flags, max_active, 2);
 	fs_info->endio_write_workers =
@@@ -2168,7 -2173,6 +2171,6 @@@
 	      fs_info->flush_workers &&
 	      fs_info->endio_workers && fs_info->endio_meta_workers &&
 	      fs_info->endio_meta_write_workers &&
- 	      fs_info->endio_repair_workers &&
 	      fs_info->endio_write_workers && fs_info->endio_raid56_workers &&
 	      fs_info->endio_freespace_worker && fs_info->rmw_workers &&
 	      fs_info->caching_workers && fs_info->readahead_workers &&
@@@ -2290,6 -2294,19 +2292,19 @@@ static int btrfs_read_roots(struct btrf
 	set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
 	fs_info->csum_root = root;

+ 	/*
+ 	 * This tree can share blocks with some other fs tree during relocation
+ 	 * and we need a proper setup by btrfs_get_fs_root
+ 	 */
+ 	root = btrfs_get_fs_root(tree_root->fs_info,
+ 				 BTRFS_DATA_RELOC_TREE_OBJECTID, true);
+ 	if (IS_ERR(root)) {
+ 		ret = PTR_ERR(root);
+ 		goto out;
+ 	}
+ 	set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
+ 	fs_info->data_reloc_root = root;
+ 
 	location.objectid = BTRFS_QUOTA_TREE_OBJECTID;
 	root = btrfs_read_tree_root(tree_root, &location);
 	if (!IS_ERR(root)) {
@@@ -2827,7 -2844,6 +2842,6 @@@ int __cold open_ctree(struct super_bloc
 	u64 generation;
 	u64 features;
 	u16 csum_type;
- 	struct btrfs_key location;
 	struct btrfs_super_block *disk_super;
 	struct btrfs_fs_info *fs_info = btrfs_sb(sb);
 	struct btrfs_root *tree_root;
@@@ -3241,11 -3257,7 +3255,7 @@@
 		}
 	}

- 	location.objectid = BTRFS_FS_TREE_OBJECTID;
- 	location.type = BTRFS_ROOT_ITEM_KEY;
- 	location.offset = 0;
- 
- 	fs_info->fs_root = btrfs_get_fs_root(fs_info, &location, true);
+ 	fs_info->fs_root = btrfs_get_fs_root(fs_info, BTRFS_FS_TREE_OBJECTID, true);
 	if (IS_ERR(fs_info->fs_root)) {
 		err = PTR_ERR(fs_info->fs_root);
 		btrfs_warn(fs_info, "failed to read fs tree: %d", err);
@@@ -3508,10 -3520,9 +3518,9 @@@ static int write_dev_supers(struct btrf
 		btrfs_set_super_bytenr(sb, bytenr);

- 		crypto_shash_init(shash);
- 		crypto_shash_update(shash, (const char *)sb + BTRFS_CSUM_SIZE,
- 				    BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE);
- 		crypto_shash_final(shash, sb->csum);
+ 		crypto_shash_digest(shash, (const char *)sb + BTRFS_CSUM_SIZE,
+ 				    BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE,
+ 				    sb->csum);

 		page = find_or_create_page(mapping, bytenr >> PAGE_SHIFT,
 					   GFP_NOFS);
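The btree_invalidatepage() hunk above, and several extent_io.c hunks below,
replace the open-coded page-private reference dance with the new
attach_page_private()/detach_page_private() helpers. Roughly what they do,
sketched from their include/linux/pagemap.h definitions (not copied verbatim):

	static inline void attach_page_private(struct page *page, void *data)
	{
		get_page(page);		/* hold a ref for the private pointer */
		set_page_private(page, (unsigned long)data);
		SetPagePrivate(page);
	}

	static inline void *detach_page_private(struct page *page)
	{
		void *data = (void *)page_private(page);

		if (!PagePrivate(page))
			return NULL;
		ClearPagePrivate(page);
		set_page_private(page, 0);
		put_page(page);		/* drop the ref taken at attach time */

		return data;
	}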
diff --combined fs/btrfs/extent_io.c
index e12eb32d9e17,c59e07360083..68c96057ad2d
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@@ -2333,7 -2333,7 +2333,7 @@@ int repair_io_failure(struct btrfs_fs_i
 	return 0;
 }

- int btrfs_repair_eb_io_failure(struct extent_buffer *eb, int mirror_num)
+ int btrfs_repair_eb_io_failure(const struct extent_buffer *eb, int mirror_num)
 {
 	struct btrfs_fs_info *fs_info = eb->fs_info;
 	u64 start = eb->start;
@@@ -2537,8 -2537,9 +2537,9 @@@ int btrfs_get_io_failure_record(struct
 	return 0;
 }

- bool btrfs_check_repairable(struct inode *inode, unsigned failed_bio_pages,
- 			    struct io_failure_record *failrec, int failed_mirror)
+ static bool btrfs_check_repairable(struct inode *inode, bool needs_validation,
+ 				   struct io_failure_record *failrec,
+ 				   int failed_mirror)
 {
 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
 	int num_copies;
@@@ -2561,7 -2562,7 +2562,7 @@@
 	 * a) deliver good data to the caller
 	 * b) correct the bad sectors on disk
 	 */
- 	if (failed_bio_pages > 1) {
+ 	if (needs_validation) {
 		/*
 		 * to fulfill b), we need to know the exact failing sectors, as
 		 * we don't want to rewrite any more than the failed ones. thus,
@@@ -2600,94 -2601,115 +2601,115 @@@
 	return true;
 }

- struct bio *btrfs_create_repair_bio(struct inode *inode, struct bio *failed_bio,
- 				    struct io_failure_record *failrec,
- 				    struct page *page, int pg_offset, int icsum,
- 				    bio_end_io_t *endio_func, void *data)
+ static bool btrfs_io_needs_validation(struct inode *inode, struct bio *bio)
 {
- 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- 	struct bio *bio;
- 	struct btrfs_io_bio *btrfs_failed_bio;
- 	struct btrfs_io_bio *btrfs_bio;
+ 	u64 len = 0;
+ 	const u32 blocksize = inode->i_sb->s_blocksize;

- 	bio = btrfs_io_bio_alloc(1);
- 	bio->bi_end_io = endio_func;
- 	bio->bi_iter.bi_sector = failrec->logical >> 9;
- 	bio->bi_iter.bi_size = 0;
- 	bio->bi_private = data;
+ 	/*
+ 	 * If bi_status is BLK_STS_OK, then this was a checksum error, not an
+ 	 * I/O error. In this case, we already know exactly which sector was
+ 	 * bad, so we don't need to validate.
+ 	 */
+ 	if (bio->bi_status == BLK_STS_OK)
+ 		return false;

- 	btrfs_failed_bio = btrfs_io_bio(failed_bio);
- 	if (btrfs_failed_bio->csum) {
- 		u16 csum_size = btrfs_super_csum_size(fs_info->super_copy);
+ 	/*
+ 	 * We need to validate each sector individually if the failed I/O was
+ 	 * for multiple sectors.
+ 	 *
+ 	 * There are a few possible bios that can end up here:
+ 	 * 1. A buffered read bio, which is not cloned.
+ 	 * 2. A direct I/O read bio, which is cloned.
+ 	 * 3. A (buffered or direct) repair bio, which is not cloned.
+ 	 *
+ 	 * For cloned bios (case 2), we can get the size from
+ 	 * btrfs_io_bio->iter; for non-cloned bios (cases 1 and 3), we can get
+ 	 * it from the bvecs.
+ 	 */
+ 	if (bio_flagged(bio, BIO_CLONED)) {
+ 		if (btrfs_io_bio(bio)->iter.bi_size > blocksize)
+ 			return true;
+ 	} else {
+ 		struct bio_vec *bvec;
+ 		int i;

- 		btrfs_bio = btrfs_io_bio(bio);
- 		btrfs_bio->csum = btrfs_bio->csum_inline;
- 		icsum *= csum_size;
- 		memcpy(btrfs_bio->csum, btrfs_failed_bio->csum + icsum,
- 		       csum_size);
+ 		bio_for_each_bvec_all(bvec, bio, i) {
+ 			len += bvec->bv_len;
+ 			if (len > blocksize)
+ 				return true;
+ 		}
 	}
- 
- 	bio_add_page(bio, page, failrec->len, pg_offset);
- 
- 	return bio;
+ 	return false;
 }

- /*
-  * This is a generic handler for readpage errors. If other copies exist, read
-  * those and write back good data to the failed position. Does not investigate
-  * in remapping the failed extent elsewhere, hoping the device will be smart
-  * enough to do this as needed
-  */
- static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset,
- 			      struct page *page, u64 start, u64 end,
- 			      int failed_mirror)
+ blk_status_t btrfs_submit_read_repair(struct inode *inode,
+ 				      struct bio *failed_bio, u64 phy_offset,
+ 				      struct page *page, unsigned int pgoff,
+ 				      u64 start, u64 end, int failed_mirror,
+ 				      submit_bio_hook_t *submit_bio_hook)
 {
 	struct io_failure_record *failrec;
- 	struct inode *inode = page->mapping->host;
+ 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
 	struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
 	struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
- 	struct bio *bio;
- 	int read_mode = 0;
+ 	struct btrfs_io_bio *failed_io_bio = btrfs_io_bio(failed_bio);
+ 	const int icsum = phy_offset >> inode->i_sb->s_blocksize_bits;
+ 	bool need_validation;
+ 	struct bio *repair_bio;
+ 	struct btrfs_io_bio *repair_io_bio;
 	blk_status_t status;
 	int ret;
- 	unsigned failed_bio_pages = failed_bio->bi_iter.bi_size >> PAGE_SHIFT;
+ 
+ 	btrfs_debug(fs_info,
+ 		   "repair read error: read error at %llu", start);

 	BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE);

 	ret = btrfs_get_io_failure_record(inode, start, end, &failrec);
 	if (ret)
- 		return ret;
+ 		return errno_to_blk_status(ret);
+ 
+ 	need_validation = btrfs_io_needs_validation(inode, failed_bio);

- 	if (!btrfs_check_repairable(inode, failed_bio_pages, failrec,
+ 	if (!btrfs_check_repairable(inode, need_validation, failrec,
 				    failed_mirror)) {
 		free_io_failure(failure_tree, tree, failrec);
- 		return -EIO;
+ 		return BLK_STS_IOERR;
 	}

- 	if (failed_bio_pages > 1)
- 		read_mode |= REQ_FAILFAST_DEV;
+ 	repair_bio = btrfs_io_bio_alloc(1);
+ 	repair_io_bio = btrfs_io_bio(repair_bio);
+ 	repair_bio->bi_opf = REQ_OP_READ;
+ 	if (need_validation)
+ 		repair_bio->bi_opf |= REQ_FAILFAST_DEV;
+ 	repair_bio->bi_end_io = failed_bio->bi_end_io;
+ 	repair_bio->bi_iter.bi_sector = failrec->logical >> 9;
+ 	repair_bio->bi_private = failed_bio->bi_private;

- 	phy_offset >>= inode->i_sb->s_blocksize_bits;
- 	bio = btrfs_create_repair_bio(inode, failed_bio, failrec, page,
- 				      start - page_offset(page),
- 				      (int)phy_offset, failed_bio->bi_end_io,
- 				      NULL);
- 	bio->bi_opf = REQ_OP_READ | read_mode;
+ 	if (failed_io_bio->csum) {
+ 		const u16 csum_size = btrfs_super_csum_size(fs_info->super_copy);
+ 
+ 		repair_io_bio->csum = repair_io_bio->csum_inline;
+ 		memcpy(repair_io_bio->csum,
+ 		       failed_io_bio->csum + csum_size * icsum, csum_size);
+ 	}
+ 
+ 	bio_add_page(repair_bio, page, failrec->len, pgoff);
+ 	repair_io_bio->logical = failrec->start;
+ 	repair_io_bio->iter = repair_bio->bi_iter;

 	btrfs_debug(btrfs_sb(inode->i_sb),
- 		    "Repair Read Error: submitting new read[%#x] to this_mirror=%d, in_validation=%d",
- 		    read_mode, failrec->this_mirror, failrec->in_validation);
+ 		    "repair read error: submitting new read to mirror %d, in_validation=%d",
+ 		    failrec->this_mirror, failrec->in_validation);

- 	status = tree->ops->submit_bio_hook(tree->private_data, bio, failrec->this_mirror,
- 					    failrec->bio_flags);
+ 	status = submit_bio_hook(inode, repair_bio, failrec->this_mirror,
+ 				 failrec->bio_flags);
 	if (status) {
 		free_io_failure(failure_tree, tree, failrec);
- 		bio_put(bio);
- 		ret = blk_status_to_errno(status);
+ 		bio_put(repair_bio);
 	}
- 
- 	return ret;
+ 	return status;
 }

 /* lots and lots of room for performance fixes in the end_bio funcs */
@@@ -2859,9 -2881,10 +2881,10 @@@ static void end_bio_extent_readpage(str
 			 * If it can't handle the error it will return -EIO and
 			 * we remain responsible for that page.
 			 */
- 			ret = bio_readpage_error(bio, offset, page, start, end,
- 						 mirror);
- 			if (ret == 0) {
+ 			if (!btrfs_submit_read_repair(inode, bio, offset, page,
+ 						      start - page_offset(page),
+ 						      start, end, mirror,
+ 						      tree->ops->submit_bio_hook)) {
 				uptodate = !bio->bi_status;
 				offset += len;
 				continue;
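The new btrfs_io_needs_validation() above decides whether a failed bio spans
more than one sector; for the non-cloned case it walks the bvecs. In isolation
that walk looks like this (standalone sketch using the same
bio_for_each_bvec_all iterator as the hunk; the function name is illustrative):

	static bool bio_spans_multiple_blocks(struct bio *bio, u32 blocksize)
	{
		struct bio_vec *bvec;
		u64 len = 0;
		int i;

		/* Accumulate segment lengths; bail out past one block. */
		bio_for_each_bvec_all(bvec, bio, i) {
			len += bvec->bv_len;
			if (len > blocksize)
				return true;
		}
		return false;
	}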
@@@ -3076,16 -3099,22 +3099,16 @@@ static int submit_extent_page(unsigned
 static void attach_extent_buffer_page(struct extent_buffer *eb,
 				      struct page *page)
 {
- 	if (!PagePrivate(page)) {
- 		SetPagePrivate(page);
- 		get_page(page);
- 		set_page_private(page, (unsigned long)eb);
- 	} else {
+ 	if (!PagePrivate(page))
+ 		attach_page_private(page, eb);
+ 	else
 		WARN_ON(page->private != (unsigned long)eb);
- 	}
 }

 void set_page_extent_mapped(struct page *page)
 {
- 	if (!PagePrivate(page)) {
- 		SetPagePrivate(page);
- 		get_page(page);
- 		set_page_private(page, EXTENT_PAGE_PRIVATE);
- 	}
+ 	if (!PagePrivate(page))
+ 		attach_page_private(page, (void *)EXTENT_PAGE_PRIVATE);
 }

 static struct extent_map *
@@@ -4361,32 -4390,51 +4384,32 @@@ int extent_writepages(struct address_sp
 	return ret;
 }

-int extent_readpages(struct address_space *mapping, struct list_head *pages,
-		     unsigned nr_pages)
+void extent_readahead(struct readahead_control *rac)
 {
 	struct bio *bio = NULL;
 	unsigned long bio_flags = 0;
 	struct page *pagepool[16];
 	struct extent_map *em_cached = NULL;
-	int nr = 0;
 	u64 prev_em_start = (u64)-1;
+	int nr;

-	while (!list_empty(pages)) {
-		u64 contig_end = 0;
-
-		for (nr = 0; nr < ARRAY_SIZE(pagepool) && !list_empty(pages);) {
-			struct page *page = lru_to_page(pages);
-
-			prefetchw(&page->flags);
-			list_del(&page->lru);
-			if (add_to_page_cache_lru(page, mapping, page->index,
-						readahead_gfp_mask(mapping))) {
-				put_page(page);
-				break;
-			}
-
-			pagepool[nr++] = page;
-			contig_end = page_offset(page) + PAGE_SIZE - 1;
-		}
-
-		if (nr) {
-			u64 contig_start = page_offset(pagepool[0]);
+	while ((nr = readahead_page_batch(rac, pagepool))) {
+		u64 contig_start = page_offset(pagepool[0]);
+		u64 contig_end = page_offset(pagepool[nr - 1]) + PAGE_SIZE - 1;

-			ASSERT(contig_start + nr * PAGE_SIZE - 1 == contig_end);
+		ASSERT(contig_start + nr * PAGE_SIZE - 1 == contig_end);

-			contiguous_readpages(pagepool, nr, contig_start,
-				     contig_end, &em_cached, &bio, &bio_flags,
-				     &prev_em_start);
-		}
+		contiguous_readpages(pagepool, nr, contig_start, contig_end,
+				&em_cached, &bio, &bio_flags, &prev_em_start);
 	}

 	if (em_cached)
 		free_extent_map(em_cached);

-	if (bio)
-		return submit_one_bio(bio, 0, bio_flags);
-	return 0;
+	if (bio) {
+		if (submit_one_bio(bio, 0, bio_flags))
+			return;
+	}
 }

 /*
@@@ -4862,7 -4910,7 +4885,7 @@@ static void __free_extent_buffer(struc
 	kmem_cache_free(extent_buffer_cache, eb);
 }

- int extent_buffer_under_io(struct extent_buffer *eb)
+ int extent_buffer_under_io(const struct extent_buffer *eb)
 {
 	return (atomic_read(&eb->io_pages) ||
 		test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags) ||
@@@ -4904,7 -4952,10 +4927,7 @@@ static void btrfs_release_extent_buffer
 		 * We need to make sure we haven't be attached
 		 * to a new eb.
 		 */
- 		ClearPagePrivate(page);
- 		set_page_private(page, 0);
- 		/* One for the page private */
- 		put_page(page);
+ 		detach_page_private(page);
 	}

 	if (mapped)
@@@ -4967,7 -5018,7 +4990,7 @@@ __alloc_extent_buffer(struct btrfs_fs_i
 	return eb;
 }

- struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src)
+ struct extent_buffer *btrfs_clone_extent_buffer(const struct extent_buffer *src)
 {
 	int i;
 	struct page *p;
@@@ -5373,7 -5424,7 +5396,7 @@@ void free_extent_buffer_stale(struct ex
 	release_extent_buffer(eb);
 }

- void clear_extent_buffer_dirty(struct extent_buffer *eb)
+ void clear_extent_buffer_dirty(const struct extent_buffer *eb)
 {
 	int i;
 	int num_pages;
@@@ -5571,8 -5622,7 +5594,7 @@@ void read_extent_buffer(const struct ex
 	struct page *page;
 	char *kaddr;
 	char *dst = (char *)dstv;
- 	size_t start_offset = offset_in_page(eb->start);
- 	unsigned long i = (start_offset + start) >> PAGE_SHIFT;
+ 	unsigned long i = start >> PAGE_SHIFT;

 	if (start + len > eb->len) {
 		WARN(1, KERN_ERR "btrfs bad mapping eb start %llu len %lu, wanted %lu %lu\n",
@@@ -5581,7 -5631,7 +5603,7 @@@
 		return;
 	}

- 	offset = offset_in_page(start_offset + start);
+ 	offset = offset_in_page(start);

 	while (len > 0) {
 		page = eb->pages[i];
@@@ -5606,14 -5656,13 +5628,13 @@@ int read_extent_buffer_to_user(const st
 	struct page *page;
 	char *kaddr;
 	char __user *dst = (char __user *)dstv;
- 	size_t start_offset = offset_in_page(eb->start);
- 	unsigned long i = (start_offset + start) >> PAGE_SHIFT;
+ 	unsigned long i = start >> PAGE_SHIFT;
 	int ret = 0;

 	WARN_ON(start > eb->len);
 	WARN_ON(start + len > eb->start + eb->len);

- 	offset = offset_in_page(start_offset + start);
+ 	offset = offset_in_page(start);

 	while (len > 0) {
 		page = eb->pages[i];
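extent_readahead() above is the new readahead entry point: instead of draining
an LRU list page by page, the readahead_control API hands the filesystem
batches of pages that are already locked and contiguous in the file. The usage
pattern, reduced to a skeleton (sketch; example_readahead is an illustrative
name, not part of the patch):

	void example_readahead(struct readahead_control *rac)
	{
		struct page *pagepool[16];
		int nr;

		while ((nr = readahead_page_batch(rac, pagepool))) {
			/* Pages in a batch are file-contiguous. */
			u64 start = page_offset(pagepool[0]);
			u64 end = page_offset(pagepool[nr - 1]) + PAGE_SIZE - 1;

			/* ... build and submit reads for [start, end] ... */
		}
	}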
@@@ -5634,48 -5683,6 +5655,6 @@@
 	return ret;
 }

- /*
-  * return 0 if the item is found within a page.
-  * return 1 if the item spans two pages.
-  * return -EINVAL otherwise.
-  */
- int map_private_extent_buffer(const struct extent_buffer *eb,
- 			      unsigned long start, unsigned long min_len,
- 			      char **map, unsigned long *map_start,
- 			      unsigned long *map_len)
- {
- 	size_t offset;
- 	char *kaddr;
- 	struct page *p;
- 	size_t start_offset = offset_in_page(eb->start);
- 	unsigned long i = (start_offset + start) >> PAGE_SHIFT;
- 	unsigned long end_i = (start_offset + start + min_len - 1) >>
- 		PAGE_SHIFT;
- 
- 	if (start + min_len > eb->len) {
- 		WARN(1, KERN_ERR "btrfs bad mapping eb start %llu len %lu, wanted %lu %lu\n",
- 		     eb->start, eb->len, start, min_len);
- 		return -EINVAL;
- 	}
- 
- 	if (i != end_i)
- 		return 1;
- 
- 	if (i == 0) {
- 		offset = start_offset;
- 		*map_start = 0;
- 	} else {
- 		offset = 0;
- 		*map_start = ((u64)i << PAGE_SHIFT) - start_offset;
- 	}
- 
- 	p = eb->pages[i];
- 	kaddr = page_address(p);
- 	*map = kaddr + offset;
- 	*map_len = PAGE_SIZE - offset;
- 	return 0;
- }
- 
 int memcmp_extent_buffer(const struct extent_buffer *eb, const void *ptrv,
 			 unsigned long start, unsigned long len)
 {
@@@ -5684,14 -5691,13 +5663,13 @@@
 	struct page *page;
 	char *kaddr;
 	char *ptr = (char *)ptrv;
- 	size_t start_offset = offset_in_page(eb->start);
- 	unsigned long i = (start_offset + start) >> PAGE_SHIFT;
+ 	unsigned long i = start >> PAGE_SHIFT;
 	int ret = 0;

 	WARN_ON(start > eb->len);
 	WARN_ON(start + len > eb->start + eb->len);

- 	offset = offset_in_page(start_offset + start);
+ 	offset = offset_in_page(start);

 	while (len > 0) {
 		page = eb->pages[i];
@@@ -5711,7 -5717,7 +5689,7 @@@
 	return ret;
 }

- void write_extent_buffer_chunk_tree_uuid(struct extent_buffer *eb,
+ void write_extent_buffer_chunk_tree_uuid(const struct extent_buffer *eb,
 		const void *srcv)
 {
 	char *kaddr;
@@@ -5722,7 -5728,7 +5700,7 @@@
 			BTRFS_FSID_SIZE);
 }

- void write_extent_buffer_fsid(struct extent_buffer *eb, const void *srcv)
+ void write_extent_buffer_fsid(const struct extent_buffer *eb, const void *srcv)
 {
 	char *kaddr;
@@@ -5732,7 -5738,7 +5710,7 @@@
 			BTRFS_FSID_SIZE);
 }

- void write_extent_buffer(struct extent_buffer *eb, const void *srcv,
+ void write_extent_buffer(const struct extent_buffer *eb, const void *srcv,
 			 unsigned long start, unsigned long len)
 {
 	size_t cur;
@@@ -5740,13 -5746,12 +5718,12 @@@
 	struct page *page;
 	char *kaddr;
 	char *src = (char *)srcv;
- 	size_t start_offset = offset_in_page(eb->start);
- 	unsigned long i = (start_offset + start) >> PAGE_SHIFT;
+ 	unsigned long i = start >> PAGE_SHIFT;

 	WARN_ON(start > eb->len);
 	WARN_ON(start + len > eb->start + eb->len);

- 	offset = offset_in_page(start_offset + start);
+ 	offset = offset_in_page(start);

 	while (len > 0) {
 		page = eb->pages[i];
@@@ -5763,20 -5768,19 +5740,19 @@@
 	}
 }

- void memzero_extent_buffer(struct extent_buffer *eb, unsigned long start,
+ void memzero_extent_buffer(const struct extent_buffer *eb, unsigned long start,
 		unsigned long len)
 {
 	size_t cur;
 	size_t offset;
 	struct page *page;
 	char *kaddr;
- 	size_t start_offset = offset_in_page(eb->start);
- 	unsigned long i = (start_offset + start) >> PAGE_SHIFT;
+ 	unsigned long i = start >> PAGE_SHIFT;

 	WARN_ON(start > eb->len);
 	WARN_ON(start + len > eb->start + eb->len);

- 	offset = offset_in_page(start_offset + start);
+ 	offset = offset_in_page(start);

 	while (len > 0) {
 		page = eb->pages[i];
@@@ -5792,8 -5796,8 +5768,8 @@@
 	}
 }

- void copy_extent_buffer_full(struct extent_buffer *dst,
- 			     struct extent_buffer *src)
+ void copy_extent_buffer_full(const struct extent_buffer *dst,
+ 			     const struct extent_buffer *src)
 {
 	int i;
 	int num_pages;
@@@ -5806,7 -5810,8 +5782,8 @@@
 				page_address(src->pages[i]));
 }

- void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src,
+ void copy_extent_buffer(const struct extent_buffer *dst,
+ 			const struct extent_buffer *src,
 			unsigned long dst_offset, unsigned long src_offset,
 			unsigned long len)
 {
@@@ -5815,12 -5820,11 +5792,11 @@@
 	size_t offset;
 	struct page *page;
 	char *kaddr;
- 	size_t start_offset = offset_in_page(dst->start);
- 	unsigned long i = (start_offset + dst_offset) >> PAGE_SHIFT;
+ 	unsigned long i = dst_offset >> PAGE_SHIFT;

 	WARN_ON(src->len != dst_len);

- 	offset = offset_in_page(start_offset + dst_offset);
+ 	offset = offset_in_page(dst_offset);

 	while (len > 0) {
 		page = dst->pages[i];
@@@ -5851,12 -5855,11 +5827,11 @@@
  * This helper hides the ugliness of finding the byte in an extent buffer which
  * contains a given bit.
  */
- static inline void eb_bitmap_offset(struct extent_buffer *eb,
+ static inline void eb_bitmap_offset(const struct extent_buffer *eb,
 				    unsigned long start, unsigned long nr,
 				    unsigned long *page_index,
 				    size_t *page_offset)
 {
- 	size_t start_offset = offset_in_page(eb->start);
 	size_t byte_offset = BIT_BYTE(nr);
 	size_t offset;
@@@ -5865,7 -5868,7 +5840,7 @@@
 	 * the bitmap item in the extent buffer + the offset of the byte in the
 	 * bitmap item.
 	 */
- 	offset = start_offset + start + byte_offset;
+ 	offset = start + byte_offset;

 	*page_index = offset >> PAGE_SHIFT;
 	*page_offset = offset_in_page(offset);
@@@ -5877,7 -5880,7 +5852,7 @@@
  * @start: offset of the bitmap item in the extent buffer
  * @nr: bit number to test
  */
- int extent_buffer_test_bit(struct extent_buffer *eb, unsigned long start,
+ int extent_buffer_test_bit(const struct extent_buffer *eb, unsigned long start,
 			   unsigned long nr)
 {
 	u8 *kaddr;
@@@ -5899,7 -5902,7 +5874,7 @@@
  * @pos: bit number of the first bit
  * @len: number of bits to set
  */
- void extent_buffer_bitmap_set(struct extent_buffer *eb, unsigned long start,
+ void extent_buffer_bitmap_set(const struct extent_buffer *eb, unsigned long start,
 			      unsigned long pos, unsigned long len)
 {
 	u8 *kaddr;
@@@ -5941,8 -5944,9 +5916,9 @@@
  * @pos: bit number of the first bit
  * @len: number of bits to clear
  */
- void extent_buffer_bitmap_clear(struct extent_buffer *eb, unsigned long start,
- 				unsigned long pos, unsigned long len)
+ void extent_buffer_bitmap_clear(const struct extent_buffer *eb,
+ 				unsigned long start, unsigned long pos,
+ 				unsigned long len)
 {
 	u8 *kaddr;
 	struct page *page;
@@@ -6003,14 -6007,14 +5979,14 @@@ static void copy_pages(struct page *dst
 	memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len);
 }

- void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
- 			  unsigned long src_offset, unsigned long len)
+ void memcpy_extent_buffer(const struct extent_buffer *dst,
+ 			  unsigned long dst_offset, unsigned long src_offset,
+ 			  unsigned long len)
 {
 	struct btrfs_fs_info *fs_info = dst->fs_info;
 	size_t cur;
 	size_t dst_off_in_page;
 	size_t src_off_in_page;
- 	size_t start_offset = offset_in_page(dst->start);
 	unsigned long dst_i;
 	unsigned long src_i;
@@@ -6028,11 -6032,11 +6004,11 @@@
 	}

 	while (len > 0) {
- 		dst_off_in_page = offset_in_page(start_offset + dst_offset);
- 		src_off_in_page = offset_in_page(start_offset + src_offset);
+ 		dst_off_in_page = offset_in_page(dst_offset);
+ 		src_off_in_page = offset_in_page(src_offset);

- 		dst_i = (start_offset + dst_offset) >> PAGE_SHIFT;
- 		src_i = (start_offset + src_offset) >> PAGE_SHIFT;
+ 		dst_i = dst_offset >> PAGE_SHIFT;
+ 		src_i = src_offset >> PAGE_SHIFT;

 		cur = min(len, (unsigned long)(PAGE_SIZE -
 					       src_off_in_page));
@@@ -6048,8 -6052,9 +6024,9 @@@
 	}
 }

- void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
- 			   unsigned long src_offset, unsigned long len)
+ void memmove_extent_buffer(const struct extent_buffer *dst,
+ 			   unsigned long dst_offset, unsigned long src_offset,
+ 			   unsigned long len)
 {
 	struct btrfs_fs_info *fs_info = dst->fs_info;
 	size_t cur;
@@@ -6057,7 -6062,6 +6034,6 @@@
 	size_t src_off_in_page;
 	unsigned long dst_end = dst_offset + len - 1;
 	unsigned long src_end = src_offset + len - 1;
- 	size_t start_offset = offset_in_page(dst->start);
 	unsigned long dst_i;
 	unsigned long src_i;
@@@ -6078,11 -6082,11 +6054,11 @@@
 		return;
 	}
 	while (len > 0) {
- 		dst_i = (start_offset + dst_end) >> PAGE_SHIFT;
- 		src_i = (start_offset + src_end) >> PAGE_SHIFT;
+ 		dst_i = dst_end >> PAGE_SHIFT;
+ 		src_i = src_end >> PAGE_SHIFT;

- 		dst_off_in_page = offset_in_page(start_offset + dst_end);
- 		src_off_in_page = offset_in_page(start_offset + src_end);
+ 		dst_off_in_page = offset_in_page(dst_end);
+ 		src_off_in_page = offset_in_page(src_end);

 		cur = min_t(unsigned long, len, src_off_in_page + 1);
 		cur = min(cur, dst_off_in_page + 1);
diff --combined fs/btrfs/extent_io.h
index 25594e09fdcd,9a10681b12bf..602bf3af9fb4
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@@ -66,6 -66,10 +66,10 @@@ struct btrfs_io_bio
 struct io_failure_record;
 struct extent_io_tree;

+ typedef blk_status_t (submit_bio_hook_t)(struct inode *inode, struct bio *bio,
+ 					 int mirror_num,
+ 					 unsigned long bio_flags);
+ 
 typedef blk_status_t (extent_submit_bio_start_t)(void *private_data,
 		struct bio *bio, u64 bio_offset);
@@@ -74,8 -78,7 +78,7 @@@ struct extent_io_ops
 	 * The following callbacks must be always defined, the function
 	 * pointer will be called unconditionally.
 	 */
- 	blk_status_t (*submit_bio_hook)(struct inode *inode, struct bio *bio,
- 					int mirror_num, unsigned long bio_flags);
+ 	submit_bio_hook_t *submit_bio_hook;
 	int (*readpage_end_io_hook)(struct btrfs_io_bio *io_bio, u64 phy_offset,
 				    struct page *page, u64 start, u64 end,
 				    int mirror);
@@@ -198,7 -201,8 +201,7 @@@ int extent_writepages(struct address_sp
		      struct writeback_control *wbc);
 int btree_write_cache_pages(struct address_space *mapping,
 			    struct writeback_control *wbc);
-int extent_readpages(struct address_space *mapping, struct list_head *pages,
-		     unsigned nr_pages);
+void extent_readahead(struct readahead_control *rac);
 int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 		  __u64 start, __u64 len);
 void set_page_extent_mapped(struct page *page);
@@@ -209,7 -213,7 +212,7 @@@ struct extent_buffer *__alloc_dummy_ext
 		u64 start, unsigned long len);
 struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
 						u64 start);
- struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src);
+ struct extent_buffer *btrfs_clone_extent_buffer(const struct extent_buffer *src);
 struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info,
 					 u64 start);
 void free_extent_buffer(struct extent_buffer *eb);
@@@ -227,7 -231,7 +230,7 @@@ static inline int num_extent_pages(cons
 		(eb->start >> PAGE_SHIFT);
 }

- static inline int extent_buffer_uptodate(struct extent_buffer *eb)
+ static inline int extent_buffer_uptodate(const struct extent_buffer *eb)
 {
 	return test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
 }
@@@ -240,37 -244,37 +243,37 @@@ void read_extent_buffer(const struct ex
 int read_extent_buffer_to_user(const struct extent_buffer *eb,
 			       void __user *dst, unsigned long start,
 			       unsigned long len);
- void write_extent_buffer_fsid(struct extent_buffer *eb, const void *src);
- void write_extent_buffer_chunk_tree_uuid(struct extent_buffer *eb,
+ void write_extent_buffer_fsid(const struct extent_buffer *eb, const void *src);
+ void write_extent_buffer_chunk_tree_uuid(const struct extent_buffer *eb,
 					 const void *src);
- void write_extent_buffer(struct extent_buffer *eb, const void *src,
+ void write_extent_buffer(const struct extent_buffer *eb, const void *src,
 			 unsigned long start, unsigned long len);
- void copy_extent_buffer_full(struct extent_buffer *dst,
- 			     struct extent_buffer *src);
- void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src,
+ void copy_extent_buffer_full(const struct extent_buffer *dst,
+ 			     const struct extent_buffer *src);
+ void copy_extent_buffer(const struct extent_buffer *dst,
+ 			const struct extent_buffer *src,
 			unsigned long dst_offset, unsigned long src_offset,
 			unsigned long len);
- void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
- 			  unsigned long src_offset, unsigned long len);
- void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
- 			   unsigned long src_offset, unsigned long len);
- void memzero_extent_buffer(struct extent_buffer *eb, unsigned long start,
+ void memcpy_extent_buffer(const struct extent_buffer *dst,
+ 			  unsigned long dst_offset, unsigned long src_offset,
+ 			  unsigned long len);
+ void memmove_extent_buffer(const struct extent_buffer *dst,
+ 			   unsigned long dst_offset, unsigned long src_offset,
+ 			   unsigned long len);
+ void memzero_extent_buffer(const struct extent_buffer *eb, unsigned long start,
 			   unsigned long len);
- int extent_buffer_test_bit(struct extent_buffer *eb, unsigned long start,
+ int extent_buffer_test_bit(const struct extent_buffer *eb, unsigned long start,
 			   unsigned long pos);
- void extent_buffer_bitmap_set(struct extent_buffer *eb, unsigned long start,
+ void extent_buffer_bitmap_set(const struct extent_buffer *eb, unsigned long start,
 			      unsigned long pos, unsigned long len);
- void extent_buffer_bitmap_clear(struct extent_buffer *eb, unsigned long start,
- 				unsigned long pos, unsigned long len);
- void clear_extent_buffer_dirty(struct extent_buffer *eb);
+ void extent_buffer_bitmap_clear(const struct extent_buffer *eb,
+ 				unsigned long start, unsigned long pos,
+ 				unsigned long len);
+ void clear_extent_buffer_dirty(const struct extent_buffer *eb);
 bool set_extent_buffer_dirty(struct extent_buffer *eb);
 void set_extent_buffer_uptodate(struct extent_buffer *eb);
 void clear_extent_buffer_uptodate(struct extent_buffer *eb);
- int extent_buffer_under_io(struct extent_buffer *eb);
- int map_private_extent_buffer(const struct extent_buffer *eb,
- 			      unsigned long offset, unsigned long min_len,
- 			      char **map, unsigned long *map_start,
- 			      unsigned long *map_len);
+ int extent_buffer_under_io(const struct extent_buffer *eb);
 void extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end);
 void extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end);
 void extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end,
@@@ -289,7 -293,7 +292,7 @@@ int repair_io_failure(struct btrfs_fs_i
 		      u64 length, u64 logical, struct page *page,
 		      unsigned int pg_offset, int mirror_num);
 void end_extent_writepage(struct page *page, int err, u64 start, u64 end);
- int btrfs_repair_eb_io_failure(struct extent_buffer *eb, int mirror_num);
+ int btrfs_repair_eb_io_failure(const struct extent_buffer *eb, int mirror_num);

 /*
  * When IO fails, either with EIO or csum verification fails, we
@@@ -311,12 -315,12 +314,12 @@@ struct io_failure_record
 };

- bool btrfs_check_repairable(struct inode *inode, unsigned failed_bio_pages,
- 			    struct io_failure_record *failrec, int fail_mirror);
- struct bio *btrfs_create_repair_bio(struct inode *inode, struct bio *failed_bio,
- 				    struct io_failure_record *failrec,
- 				    struct page *page, int pg_offset, int icsum,
- 				    bio_end_io_t *endio_func, void *data);
+ blk_status_t btrfs_submit_read_repair(struct inode *inode,
+ 				      struct bio *failed_bio, u64 phy_offset,
+ 				      struct page *page, unsigned int pgoff,
+ 				      u64 start, u64 end, int failed_mirror,
+ 				      submit_bio_hook_t *submit_bio_hook);
+ 
 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
 bool find_lock_delalloc_range(struct inode *inode,
 			      struct page *locked_page, u64 *start,
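A recurring simplification in the extent_io.c hunks above is the removal of
start_offset = offset_in_page(eb->start): in-buffer offsets now map directly
to a page index plus an intra-page offset, which is valid on the assumption
(made by this series) that an extent buffer's start is page aligned. The
before/after of the index math, as a sketch (locate_byte is illustrative):

	static void locate_byte(const struct extent_buffer *eb,
				unsigned long start,
				unsigned long *page_index, size_t *page_offset)
	{
		/*
		 * Before, offset_in_page(eb->start) had to be folded in:
		 *   i = (offset_in_page(eb->start) + start) >> PAGE_SHIFT;
		 * After, with eb->start page aligned:
		 */
		*page_index = start >> PAGE_SHIFT;
		*page_offset = offset_in_page(start);
	}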
diff --combined fs/btrfs/inode.c
index 8b3489f229c7,1242d0aa108d..768c8be4c765
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@@ -5,7 -5,6 +5,6 @@@
 #include
 #include
- #include <linux/buffer_head.h>
 #include
 #include
 #include
@@@ -49,17 -48,18 +48,18 @@@
 #include "qgroup.h"
 #include "delalloc-space.h"
 #include "block-group.h"
+ #include "space-info.h"

 struct btrfs_iget_args {
- 	struct btrfs_key *location;
+ 	u64 ino;
 	struct btrfs_root *root;
 };

 struct btrfs_dio_data {
 	u64 reserve;
- 	u64 unsubmitted_oe_range_start;
- 	u64 unsubmitted_oe_range_end;
- 	int overwrite;
+ 	loff_t length;
+ 	ssize_t submitted;
+ 	struct extent_changeset *data_reserved;
 };

 static const struct inode_operations btrfs_dir_inode_operations;
@@@ -1142,7 -1142,7 +1142,7 @@@ out_unlock
 	 */
 	if (extent_reserved) {
 		extent_clear_unlock_delalloc(inode, start,
- 					     start + cur_alloc_size,
+ 					     start + cur_alloc_size - 1,
 					     locked_page,
 					     clear_bits,
 					     page_ops);
@@@ -1355,6 -1355,66 +1355,66 @@@ static noinline int csum_exist_in_range
 	return 1;
 }

+ static int fallback_to_cow(struct inode *inode, struct page *locked_page,
+ 			   const u64 start, const u64 end,
+ 			   int *page_started, unsigned long *nr_written)
+ {
+ 	const bool is_space_ino = btrfs_is_free_space_inode(BTRFS_I(inode));
+ 	const u64 range_bytes = end + 1 - start;
+ 	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+ 	u64 range_start = start;
+ 	u64 count;
+ 
+ 	/*
+ 	 * If EXTENT_NORESERVE is set it means that when the buffered write was
+ 	 * made we had not enough available data space and therefore we did not
+ 	 * reserve data space for it, since we though we could do NOCOW for the
+ 	 * respective file range (either there is prealloc extent or the inode
+ 	 * has the NOCOW bit set).
+ 	 *
+ 	 * However when we need to fallback to COW mode (because for example the
+ 	 * block group for the corresponding extent was turned to RO mode by a
+ 	 * scrub or relocation) we need to do the following:
+ 	 *
+ 	 * 1) We increment the bytes_may_use counter of the data space info.
+ 	 *    If COW succeeds, it allocates a new data extent and after doing
+ 	 *    that it decrements the space info's bytes_may_use counter and
+ 	 *    increments its bytes_reserved counter by the same amount (we do
+ 	 *    this at btrfs_add_reserved_bytes()). So we need to increment the
+ 	 *    bytes_may_use counter to compensate (when space is reserved at
+ 	 *    buffered write time, the bytes_may_use counter is incremented);
+ 	 *
+ 	 * 2) We clear the EXTENT_NORESERVE bit from the range. We do this so
+ 	 *    that if the COW path fails for any reason, it decrements (through
+ 	 *    extent_clear_unlock_delalloc()) the bytes_may_use counter of the
+ 	 *    data space info, which we incremented in the step above.
+ 	 *
+ 	 * If we need to fallback to cow and the inode corresponds to a free
+ 	 * space cache inode, we must also increment bytes_may_use of the data
+ 	 * space_info for the same reason. Space caches always get a prealloc
+ 	 * extent for them, however scrub or balance may have set the block
+ 	 * group that contains that extent to RO mode.
+ 	 */
+ 	count = count_range_bits(io_tree, &range_start, end, range_bytes,
+ 				 EXTENT_NORESERVE, 0);
+ 	if (count > 0 || is_space_ino) {
+ 		const u64 bytes = is_space_ino ? range_bytes : count;
+ 		struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
+ 		struct btrfs_space_info *sinfo = fs_info->data_sinfo;
+ 
+ 		spin_lock(&sinfo->lock);
+ 		btrfs_space_info_update_bytes_may_use(fs_info, sinfo, bytes);
+ 		spin_unlock(&sinfo->lock);
+ 
+ 		if (count > 0)
+ 			clear_extent_bit(io_tree, start, end, EXTENT_NORESERVE,
+ 					 0, 0, NULL);
+ 	}
+ 
+ 	return cow_file_range(inode, locked_page, start, end, page_started,
+ 			      nr_written, 1);
+ }
+ 
 /*
  * when nowcow writeback call back. This checks for snapshots or COW copies
  * of the extents that exist in the file, and COWs the file as required.
@@@ -1602,9 -1662,9 +1662,9 @@@ out_check
 			 * NOCOW, following one which needs to be COW'ed
 			 */
 			if (cow_start != (u64)-1) {
- 				ret = cow_file_range(inode, locked_page,
- 						     cow_start, found_key.offset - 1,
- 						     page_started, nr_written, 1);
+ 				ret = fallback_to_cow(inode, locked_page, cow_start,
+ 						      found_key.offset - 1,
+ 						      page_started, nr_written);
 				if (ret) {
 					if (nocow)
 						btrfs_dec_nocow_writers(fs_info,
@@@ -1693,8 -1753,8 +1753,8 @@@

 	if (cow_start != (u64)-1) {
 		cur_offset = end;
- 		ret = cow_file_range(inode, locked_page, cow_start, end,
- 				     page_started, nr_written, 1);
+ 		ret = fallback_to_cow(inode, locked_page, cow_start, end,
+ 				      page_started, nr_written);
 		if (ret)
 			goto error;
 	}
@@@ -2726,10 -2786,9 +2786,9 @@@ void btrfs_writepage_endio_finish_order
 	btrfs_queue_work(wq, &ordered_extent->work);
 }

- static int __readpage_endio_check(struct inode *inode,
- 				  struct btrfs_io_bio *io_bio,
- 				  int icsum, struct page *page,
- 				  int pgoff, u64 start, size_t len)
+ static int check_data_csum(struct inode *inode, struct btrfs_io_bio *io_bio,
+ 			   int icsum, struct page *page, int pgoff, u64 start,
+ 			   size_t len)
 {
 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
 	SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
@@@ -2743,9 -2802,7 +2802,7 @@@
 	kaddr = kmap_atomic(page);
 	shash->tfm = fs_info->csum_shash;

- 	crypto_shash_init(shash);
- 	crypto_shash_update(shash, kaddr + pgoff, len);
- 	crypto_shash_final(shash, csum);
+ 	crypto_shash_digest(shash, kaddr + pgoff, len, csum);

 	if (memcmp(csum, csum_expected, csum_size))
 		goto zeroit;
@@@ -2790,8 -2847,8 +2847,8 @@@ static int btrfs_readpage_end_io_hook(s
 	}

 	phy_offset >>= inode->i_sb->s_blocksize_bits;
- 	return __readpage_endio_check(inode, io_bio, phy_offset, page, offset,
- 				      start, (size_t)(end - start + 1));
+ 	return check_data_csum(inode, io_bio, phy_offset, page, offset, start,
+ 			       (size_t)(end - start + 1));
 }

 /*
@@@ -2981,7 -3038,7 +3038,7 @@@ int btrfs_orphan_cleanup(struct btrfs_r
 		found_key.objectid = found_key.offset;
 		found_key.type = BTRFS_INODE_ITEM_KEY;
 		found_key.offset = 0;
- 		inode = btrfs_iget(fs_info->sb, &found_key, root);
+ 		inode = btrfs_iget(fs_info->sb, last_objectid, root);
 		ret = PTR_ERR_OR_ZERO(inode);
 		if (ret && ret != -ENOENT)
 			goto out;
@@@ -3000,18 -3057,16 +3057,16 @@@
 		 * orphan must not get deleted.
 		 * find_dead_roots already ran before us, so if this
 		 * is a snapshot deletion, we should find the root
- 		 * in the dead_roots list
+ 		 * in the fs_roots radix tree.
 		 */
- 		spin_lock(&fs_info->trans_lock);
- 		list_for_each_entry(dead_root, &fs_info->dead_roots,
- 				    root_list) {
- 			if (dead_root->root_key.objectid ==
- 			    found_key.objectid) {
- 				is_dead_root = 1;
- 				break;
- 			}
- 		}
- 		spin_unlock(&fs_info->trans_lock);
+ 
+ 		spin_lock(&fs_info->fs_roots_radix_lock);
+ 		dead_root = radix_tree_lookup(&fs_info->fs_roots_radix,
+ 					      (unsigned long)found_key.objectid);
+ 		if (dead_root && btrfs_root_refs(&dead_root->root_item) == 0)
+ 			is_dead_root = 1;
+ 		spin_unlock(&fs_info->fs_roots_radix_lock);
+ 
 		if (is_dead_root) {
 			/* prevent this orphan from being found again */
 			key.offset = found_key.objectid - 1;
@@@ -3357,43 -3412,40 +3412,40 @@@ static void fill_inode_item(struct btrf
 	btrfs_init_map_token(&token, leaf);

- 	btrfs_set_token_inode_uid(leaf, item, i_uid_read(inode), &token);
- 	btrfs_set_token_inode_gid(leaf, item, i_gid_read(inode), &token);
- 	btrfs_set_token_inode_size(leaf, item, BTRFS_I(inode)->disk_i_size,
- 				   &token);
- 	btrfs_set_token_inode_mode(leaf, item, inode->i_mode, &token);
- 	btrfs_set_token_inode_nlink(leaf, item, inode->i_nlink, &token);
- 
- 	btrfs_set_token_timespec_sec(leaf, &item->atime,
- 				     inode->i_atime.tv_sec, &token);
- 	btrfs_set_token_timespec_nsec(leaf, &item->atime,
- 				      inode->i_atime.tv_nsec, &token);
- 
- 	btrfs_set_token_timespec_sec(leaf, &item->mtime,
- 				     inode->i_mtime.tv_sec, &token);
- 	btrfs_set_token_timespec_nsec(leaf, &item->mtime,
- 				      inode->i_mtime.tv_nsec, &token);
- 
- 	btrfs_set_token_timespec_sec(leaf, &item->ctime,
- 				     inode->i_ctime.tv_sec, &token);
- 	btrfs_set_token_timespec_nsec(leaf, &item->ctime,
- 				      inode->i_ctime.tv_nsec, &token);
- 
- 	btrfs_set_token_timespec_sec(leaf, &item->otime,
- 				     BTRFS_I(inode)->i_otime.tv_sec, &token);
- 	btrfs_set_token_timespec_nsec(leaf, &item->otime,
- 				      BTRFS_I(inode)->i_otime.tv_nsec, &token);
- 
- 	btrfs_set_token_inode_nbytes(leaf, item, inode_get_bytes(inode),
- 				     &token);
- 	btrfs_set_token_inode_generation(leaf, item, BTRFS_I(inode)->generation,
- 					 &token);
- 	btrfs_set_token_inode_sequence(leaf, item, inode_peek_iversion(inode),
- 				       &token);
- 	btrfs_set_token_inode_transid(leaf, item, trans->transid, &token);
- 	btrfs_set_token_inode_rdev(leaf, item, inode->i_rdev, &token);
- 	btrfs_set_token_inode_flags(leaf, item, BTRFS_I(inode)->flags, &token);
- 	btrfs_set_token_inode_block_group(leaf, item, 0, &token);
+ 	btrfs_set_token_inode_uid(&token, item, i_uid_read(inode));
+ 	btrfs_set_token_inode_gid(&token, item, i_gid_read(inode));
+ 	btrfs_set_token_inode_size(&token, item, BTRFS_I(inode)->disk_i_size);
+ 	btrfs_set_token_inode_mode(&token, item, inode->i_mode);
+ 	btrfs_set_token_inode_nlink(&token, item, inode->i_nlink);
+ 
+ 	btrfs_set_token_timespec_sec(&token, &item->atime,
+ 				     inode->i_atime.tv_sec);
+ 	btrfs_set_token_timespec_nsec(&token, &item->atime,
+ 				      inode->i_atime.tv_nsec);
+ 
+ 	btrfs_set_token_timespec_sec(&token, &item->mtime,
+ 				     inode->i_mtime.tv_sec);
+ 	btrfs_set_token_timespec_nsec(&token, &item->mtime,
+ 				      inode->i_mtime.tv_nsec);
+ 
+ 	btrfs_set_token_timespec_sec(&token, &item->ctime,
+ 				     inode->i_ctime.tv_sec);
+ 	btrfs_set_token_timespec_nsec(&token, &item->ctime,
+ 				      inode->i_ctime.tv_nsec);
+ 
+ 	btrfs_set_token_timespec_sec(&token, &item->otime,
+ 				     BTRFS_I(inode)->i_otime.tv_sec);
+ 	btrfs_set_token_timespec_nsec(&token, &item->otime,
+ 				      BTRFS_I(inode)->i_otime.tv_nsec);
+ 
+ 	btrfs_set_token_inode_nbytes(&token, item, inode_get_bytes(inode));
+ 	btrfs_set_token_inode_generation(&token, item,
+ 					 BTRFS_I(inode)->generation);
+ 	btrfs_set_token_inode_sequence(&token, item, inode_peek_iversion(inode));
+ 	btrfs_set_token_inode_transid(&token, item, trans->transid);
+ 	btrfs_set_token_inode_rdev(&token, item, inode->i_rdev);
+ 	btrfs_set_token_inode_flags(&token, item, BTRFS_I(inode)->flags);
+ 	btrfs_set_token_inode_block_group(&token, item, 0);
 }

 /*
@@@ -3618,7 -3670,7 +3670,7 @@@ static struct btrfs_trans_handle *__unl
 	 * 1 for the inode ref
 	 * 1 for the inode
 	 */
- 	return btrfs_start_transaction_fallback_global_rsv(root, 5, 5);
+ 	return btrfs_start_transaction_fallback_global_rsv(root, 5);
 }

 static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
@@@ -4108,11 -4160,12 +4160,12 @@@ int btrfs_truncate_inode_items(struct b
 	BUG_ON(new_size > 0 && min_type != BTRFS_EXTENT_DATA_KEY);

 	/*
- 	 * for non-free space inodes and ref cows, we want to back off from
- 	 * time to time
+ 	 * For non-free space inodes and non-shareable roots, we want to back
+ 	 * off from time to time.  This means all inodes in subvolume roots,
+ 	 * reloc roots, and data reloc roots.
 	 */
 	if (!btrfs_is_free_space_inode(BTRFS_I(inode)) &&
- 	    test_bit(BTRFS_ROOT_REF_COWS, &root->state))
+ 	    test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
 		be_nice = true;

 	path = btrfs_alloc_path();
@@@ -4120,20 -4173,19 +4173,19 @@@
 		return -ENOMEM;
 	path->reada = READA_BACK;

- 	if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
+ 	if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
 		lock_extent_bits(&BTRFS_I(inode)->io_tree, lock_start,
 				 (u64)-1, &cached_state);

- 	/*
- 	 * We want to drop from the next block forward in case this new size is
- 	 * not block aligned since we will be keeping the last block of the
- 	 * extent just the way it is.
- 	 */
- 	if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) ||
- 	    root == fs_info->tree_root)
+ 		/*
+ 		 * We want to drop from the next block forward in case this
+ 		 * new size is not block aligned since we will be keeping the
+ 		 * last block of the extent just the way it is.
+ 		 */
 		btrfs_drop_extent_cache(BTRFS_I(inode), ALIGN(new_size,
 					fs_info->sectorsize),
 					(u64)-1, 0);
+ 	}

 	/*
 	 * This function is also used to drop the items in the log tree before
@@@ -4241,7 -4293,7 +4293,7 @@@ search_again
 					extent_num_bytes);
 				num_dec = (orig_num_bytes -
 					   extent_num_bytes);
- 				if (test_bit(BTRFS_ROOT_REF_COWS,
+ 				if (test_bit(BTRFS_ROOT_SHAREABLE,
 					     &root->state) &&
 				    extent_start != 0)
 					inode_sub_bytes(inode, num_dec);
@@@ -4257,7 -4309,7 +4309,7 @@@
 				num_dec = btrfs_file_extent_num_bytes(leaf, fi);
 				if (extent_start != 0) {
 					found_extent = 1;
- 					if (test_bit(BTRFS_ROOT_REF_COWS,
+ 					if (test_bit(BTRFS_ROOT_SHAREABLE,
 						     &root->state))
 						inode_sub_bytes(inode, num_dec);
 				}
@@@ -4293,7 -4345,7 +4345,7 @@@
 					clear_len = fs_info->sectorsize;
 			}

- 			if (test_bit(BTRFS_ROOT_REF_COWS, &root->state))
+ 			if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
 				inode_sub_bytes(inode, item_end + 1 - new_size);
 		}
 delete:
@@@ -4334,8 -4386,7 +4386,7 @@@
 		should_throttle = false;

 		if (found_extent &&
- 		    (test_bit(BTRFS_ROOT_REF_COWS, &root->state) ||
- 		     root == fs_info->tree_root)) {
+ 		    root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
 			struct btrfs_ref ref = { 0 };

 			bytes_deleted += extent_num_bytes;
@@@ -4759,10 -4810,7 +4810,7 @@@ static int btrfs_setsize(struct inode *
 		truncate_setsize(inode, newsize);

- 		/* Disable nonlocked read DIO to avoid the endless truncate */
- 		btrfs_inode_block_unlocked_dio(BTRFS_I(inode));
 		inode_dio_wait(inode);
- 		btrfs_inode_resume_unlocked_dio(BTRFS_I(inode));

 		ret = btrfs_truncate(inode, newsize == oldsize);
 		if (ret && inode->i_nlink) {
@@@ -4856,8 -4904,8 +4904,8 @@@ static void evict_inode_truncate_pages(
 	/*
 	 * Keep looping until we have no more ranges in the io tree.
- 	 * We can have ongoing bios started by readpages (called from readahead)
- 	 * that have their endio callback (extent_io.c:end_bio_extent_readpage)
+ 	 * We can have ongoing bios started by readahead that have
+ 	 * their endio callback (extent_io.c:end_bio_extent_readpage)
 	 * still in progress (unlocked the pages in the bio but did not yet
 	 * unlocked the ranges in the io tree). Therefore this means some
 	 * ranges can still be locked and eviction started because before
@@@ -5154,7 -5202,7 +5202,7 @@@ static int fixup_tree_root_location(str
 	btrfs_release_path(path);

- 	new_root = btrfs_get_fs_root(fs_info, location, true);
+ 	new_root = btrfs_get_fs_root(fs_info, location->objectid, true);
 	if (IS_ERR(new_root)) {
 		err = PTR_ERR(new_root);
 		goto out;
@@@ -5232,9 -5280,11 +5280,11 @@@ static void inode_tree_del(struct inod
 static int btrfs_init_locked_inode(struct inode *inode, void *p)
 {
 	struct btrfs_iget_args *args = p;
- 	inode->i_ino = args->location->objectid;
- 	memcpy(&BTRFS_I(inode)->location, args->location,
- 	       sizeof(*args->location));
+ 
+ 	inode->i_ino = args->ino;
+ 	BTRFS_I(inode)->location.objectid = args->ino;
+ 	BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY;
+ 	BTRFS_I(inode)->location.offset = 0;
 	BTRFS_I(inode)->root = btrfs_grab_root(args->root);
 	BUG_ON(args->root && !BTRFS_I(inode)->root);
 	return 0;
@@@ -5243,19 -5293,19 +5293,19 @@@
 static int btrfs_find_actor(struct inode *inode, void *opaque)
 {
 	struct btrfs_iget_args *args = opaque;
- 	return args->location->objectid == BTRFS_I(inode)->location.objectid &&
+ 
+ 	return args->ino == BTRFS_I(inode)->location.objectid &&
 		args->root == BTRFS_I(inode)->root;
 }

- static struct inode *btrfs_iget_locked(struct super_block *s,
- 				       struct btrfs_key *location,
+ static struct inode *btrfs_iget_locked(struct super_block *s, u64 ino,
 				       struct btrfs_root *root)
 {
 	struct inode *inode;
 	struct btrfs_iget_args args;
- 	unsigned long hashval = btrfs_inode_hash(location->objectid, root);
+ 	unsigned long hashval = btrfs_inode_hash(ino, root);

- 	args.location = location;
+ 	args.ino = ino;
 	args.root = root;

 	inode = iget5_locked(s, hashval, btrfs_find_actor,
@@@ -5265,17 -5315,17 +5315,17 @@@
 }

 /*
-  * Get an inode object given its location and corresponding root.
+  * Get an inode object given its inode number and corresponding root.
  * Path can be preallocated to prevent recursing back to iget through
  * allocator. NULL is also valid but may require an additional allocation
  * later.
  */
- struct inode *btrfs_iget_path(struct super_block *s, struct btrfs_key *location,
+ struct inode *btrfs_iget_path(struct super_block *s, u64 ino,
 			      struct btrfs_root *root, struct btrfs_path *path)
 {
 	struct inode *inode;

- 	inode = btrfs_iget_locked(s, location, root);
+ 	inode = btrfs_iget_locked(s, ino, root);
 	if (!inode)
 		return ERR_PTR(-ENOMEM);
@@@ -5302,10 -5352,9 +5352,9 @@@
 	return inode;
 }

- struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
- 			 struct btrfs_root *root)
+ struct inode *btrfs_iget(struct super_block *s, u64 ino, struct btrfs_root *root)
 {
- 	return btrfs_iget_path(s, location, root, NULL);
+ 	return btrfs_iget_path(s, ino, root, NULL);
 }

 static struct inode *new_simple_dir(struct super_block *s,
@@@ -5374,7 -5423,7 +5423,7 @@@ struct inode *btrfs_lookup_dentry(struc
 		return ERR_PTR(ret);

 	if (location.type == BTRFS_INODE_ITEM_KEY) {
- 		inode = btrfs_iget(dir->i_sb, &location, root);
+ 		inode = btrfs_iget(dir->i_sb, location.objectid, root);
 		if (IS_ERR(inode))
 			return inode;
@@@ -5398,7 -5447,7 +5447,7 @@@
 		else
 			inode = new_simple_dir(dir->i_sb, &location, sub_root);
 	} else {
- 		inode = btrfs_iget(dir->i_sb, &location, sub_root);
+ 		inode = btrfs_iget(dir->i_sb, location.objectid, sub_root);
 	}
 	if (root != sub_root)
 		btrfs_put_root(sub_root);
@@@ -5779,7 -5828,8 +5828,8 @@@ int btrfs_set_inode_index(struct btrfs_
 static int btrfs_insert_inode_locked(struct inode *inode)
 {
 	struct btrfs_iget_args args;
- 	args.location = &BTRFS_I(inode)->location;
+ 
+ 	args.ino = BTRFS_I(inode)->location.objectid;
 	args.root = BTRFS_I(inode)->root;

 	return insert_inode_locked4(inode,
@@@ -6991,7 -7041,7 +7041,7 @@@ out
 }

 static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
- 			      struct extent_state **cached_state, int writing)
+ 			      struct extent_state **cached_state, bool writing)
 {
 	struct btrfs_ordered_extent *ordered;
 	int ret = 0;
@@@ -7050,11 -7100,11 +7100,11 @@@
 			 * for it to complete) and then invalidate the pages for
 			 * this range (through invalidate_inode_pages2_range()),
 			 * but that can lead us to a deadlock with a concurrent
- 			 * call to readpages() (a buffered read or a defrag call
+ 			 * call to readahead (a buffered read or a defrag call
 			 * triggered a readahead) on a page lock due to an
 			 * ordered dio extent we created before but did not have
 			 * yet a corresponding bio submitted (whence it can not
- 			 * complete), which makes readpages() wait for that
+ 			 * complete), which makes readahead wait for that
 			 * ordered extent to complete while holding a lock on
 			 * that page.
 			 */
@@@ -7129,30 -7179,7 +7179,7 @@@ static struct extent_map *create_io_em(
 }

- static int btrfs_get_blocks_direct_read(struct extent_map *em,
- 					struct buffer_head *bh_result,
- 					struct inode *inode,
- 					u64 start, u64 len)
- {
- 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- 
- 	if (em->block_start == EXTENT_MAP_HOLE ||
- 	    test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
- 		return -ENOENT;
- 
- 	len = min(len, em->len - (start - em->start));
- 
- 	bh_result->b_blocknr = (em->block_start + (start - em->start)) >>
- 		inode->i_blkbits;
- 	bh_result->b_size = len;
- 	bh_result->b_bdev = fs_info->fs_devices->latest_bdev;
- 	set_buffer_mapped(bh_result);
- 
- 	return 0;
- }
- 
 static int btrfs_get_blocks_direct_write(struct extent_map **map,
- 					 struct buffer_head *bh_result,
 					 struct inode *inode,
 					 struct btrfs_dio_data *dio_data,
 					 u64 start, u64 len)
@@@ -7214,7 -7241,6 +7241,6 @@@
 	}

 	/* this will cow the extent */
- 	len = bh_result->b_size;
 	free_extent_map(em);
 	*map = em = btrfs_new_extent_direct(inode, start, len);
 	if (IS_ERR(em)) {
@@@ -7225,64 -7251,73 +7251,73 @@@
 	len = min(len, em->len - (start - em->start));

 skip_cow:
- 	bh_result->b_blocknr = (em->block_start + (start - em->start)) >>
- 		inode->i_blkbits;
- 	bh_result->b_size = len;
- 	bh_result->b_bdev = fs_info->fs_devices->latest_bdev;
- 	set_buffer_mapped(bh_result);
- 
- 	if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
- 		set_buffer_new(bh_result);
- 
 	/*
 	 * Need to update the i_size under the extent lock so buffered
 	 * readers will get the updated i_size when we unlock.
 	 */
- 	if (!dio_data->overwrite && start + len > i_size_read(inode))
+ 	if (start + len > i_size_read(inode))
 		i_size_write(inode, start + len);

- 	WARN_ON(dio_data->reserve < len);
 	dio_data->reserve -= len;
- 	dio_data->unsubmitted_oe_range_end = start + len;
- 	current->journal_info = dio_data;
 out:
 	return ret;
 }

- static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
- 				   struct buffer_head *bh_result, int create)
+ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start,
+ 		loff_t length, unsigned flags, struct iomap *iomap,
+ 		struct iomap *srcmap)
 {
 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
 	struct extent_map *em;
 	struct extent_state *cached_state = NULL;
 	struct btrfs_dio_data *dio_data = NULL;
- 	u64 start = iblock << inode->i_blkbits;
 	u64 lockstart, lockend;
- 	u64 len = bh_result->b_size;
+ 	const bool write = !!(flags & IOMAP_WRITE);
 	int ret = 0;
+ 	u64 len = length;
+ 	bool unlock_extents = false;

- 	if (!create)
+ 	if (!write)
 		len = min_t(u64, len, fs_info->sectorsize);

 	lockstart = start;
 	lockend = start + len - 1;

- 	if (current->journal_info) {
- 		/*
- 		 * Need to pull our outstanding extents and set journal_info to NULL so
- 		 * that anything that needs to check if there's a transaction doesn't get
- 		 * confused.
- 		 */
- 		dio_data = current->journal_info;
- 		current->journal_info = NULL;
+ 	/*
+ 	 * The generic stuff only does filemap_write_and_wait_range, which
+ 	 * isn't enough if we've written compressed pages to this area, so we
+ 	 * need to flush the dirty pages again to make absolutely sure that any
+ 	 * outstanding dirty pages are on disk.
+ 	 */
+ 	if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
+ 		     &BTRFS_I(inode)->runtime_flags))
+ 		ret = filemap_fdatawrite_range(inode->i_mapping, start,
+ 					       start + length - 1);
+ 
+ 	dio_data = kzalloc(sizeof(*dio_data), GFP_NOFS);
+ 	if (!dio_data)
+ 		return -ENOMEM;
+ 
+ 	dio_data->length = length;
+ 	if (write) {
+ 		dio_data->reserve = round_up(length, fs_info->sectorsize);
+ 		ret = btrfs_delalloc_reserve_space(inode,
+ 				&dio_data->data_reserved,
+ 				start, dio_data->reserve);
+ 		if (ret) {
+ 			extent_changeset_free(dio_data->data_reserved);
+ 			kfree(dio_data);
+ 			return ret;
+ 		}
 	}
+ 	iomap->private = dio_data;
+ 
 	/*
 	 * If this errors out it's because we couldn't invalidate pagecache for
 	 * this range and we need to fallback to buffered.
 	 */
- 	if (lock_extent_direct(inode, lockstart, lockend, &cached_state,
- 			       create)) {
+ 	if (lock_extent_direct(inode, lockstart, lockend, &cached_state, write)) {
 		ret = -ENOTBLK;
 		goto err;
 	}
@@@ -7314,36 -7349,48 +7349,48 @@@
 		goto unlock_err;
 	}

- 	if (create) {
- 		ret = btrfs_get_blocks_direct_write(&em, bh_result, inode,
- 						    dio_data, start, len);
+ 	len = min(len, em->len - (start - em->start));
+ 	if (write) {
+ 		ret = btrfs_get_blocks_direct_write(&em, inode, dio_data,
+ 						    start, len);
 		if (ret < 0)
 			goto unlock_err;
- 
- 		unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart,
- 				     lockend, &cached_state);
+ 		unlock_extents = true;
+ 		/* Recalc len in case the new em is smaller than requested */
+ 		len = min(len, em->len - (start - em->start));
 	} else {
- 		ret = btrfs_get_blocks_direct_read(em, bh_result, inode,
- 						   start, len);
- 		/* Can be negative only if we read from a hole */
- 		if (ret < 0) {
- 			ret = 0;
- 			free_extent_map(em);
- 			goto unlock_err;
- 		}
 		/*
 		 * We need to unlock only the end area that we aren't using.
 		 * The rest is going to be unlocked by the endio routine.
 		 */
- 		lockstart = start + bh_result->b_size;
- 		if (lockstart < lockend) {
- 			unlock_extent_cached(&BTRFS_I(inode)->io_tree,
- 					     lockstart, lockend, &cached_state);
- 		} else {
- 			free_extent_state(cached_state);
- 		}
+ 		lockstart = start + len;
+ 		if (lockstart < lockend)
+ 			unlock_extents = true;
 	}

+ 	if (unlock_extents)
+ 		unlock_extent_cached(&BTRFS_I(inode)->io_tree,
+ 				     lockstart, lockend, &cached_state);
+ 	else
+ 		free_extent_state(cached_state);
+ 
+ 	/*
+ 	 * Translate extent map information to iomap.
+ 	 * We trim the extents (and move the addr) even though iomap code does
+ 	 * that, since we have locked only the parts we are performing I/O in.
+ */ + if ((em->block_start == EXTENT_MAP_HOLE) || + (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) && !write)) { + iomap->addr = IOMAP_NULL_ADDR; + iomap->type = IOMAP_HOLE; + } else { + iomap->addr = em->block_start + (start - em->start); + iomap->type = IOMAP_MAPPED; + } + iomap->offset = start; + iomap->bdev = fs_info->fs_devices->latest_bdev; + iomap->length = len; + free_extent_map(em); return 0; @@@ -7352,370 -7399,152 +7399,152 @@@ unlock_err unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend, &cached_state); err: - if (dio_data) - current->journal_info = dio_data; + if (dio_data) { + btrfs_delalloc_release_space(inode, dio_data->data_reserved, + start, dio_data->reserve, true); + btrfs_delalloc_release_extents(BTRFS_I(inode), dio_data->reserve); + extent_changeset_free(dio_data->data_reserved); + kfree(dio_data); + } return ret; } - static inline blk_status_t submit_dio_repair_bio(struct inode *inode, - struct bio *bio, - int mirror_num) + static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length, + ssize_t written, unsigned flags, struct iomap *iomap) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - blk_status_t ret; + int ret = 0; + struct btrfs_dio_data *dio_data = iomap->private; + size_t submitted = dio_data->submitted; + const bool write = !!(flags & IOMAP_WRITE); - BUG_ON(bio_op(bio) == REQ_OP_WRITE); + if (!write && (iomap->type == IOMAP_HOLE)) { + /* If reading from a hole, unlock and return */ + unlock_extent(&BTRFS_I(inode)->io_tree, pos, pos + length - 1); + goto out; + } - ret = btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DIO_REPAIR); - if (ret) - return ret; + if (submitted < length) { + pos += submitted; + length -= submitted; + if (write) + __endio_write_update_ordered(inode, pos, length, false); + else + unlock_extent(&BTRFS_I(inode)->io_tree, pos, + pos + length - 1); + ret = -ENOTBLK; + } - ret = btrfs_map_bio(fs_info, bio, mirror_num); + if (write) { + if (dio_data->reserve) + btrfs_delalloc_release_space(inode, + dio_data->data_reserved, pos, + dio_data->reserve, true); + btrfs_delalloc_release_extents(BTRFS_I(inode), dio_data->length); + extent_changeset_free(dio_data->data_reserved); + } + out: + kfree(dio_data); + iomap->private = NULL; return ret; } - static int btrfs_check_dio_repairable(struct inode *inode, - struct bio *failed_bio, - struct io_failure_record *failrec, - int failed_mirror) + static void btrfs_dio_private_put(struct btrfs_dio_private *dip) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - int num_copies; - - num_copies = btrfs_num_copies(fs_info, failrec->logical, failrec->len); - if (num_copies == 1) { - /* - * we only have a single copy of the data, so don't bother with - * all the retry and error correction code that follows. no - * matter what the error is, it is very likely to persist. - */ - btrfs_debug(fs_info, - "Check DIO Repairable: cannot repair, num_copies=%d, next_mirror %d, failed_mirror %d", - num_copies, failrec->this_mirror, failed_mirror); - return 0; - } - - failrec->failed_mirror = failed_mirror; - failrec->this_mirror++; - if (failrec->this_mirror == failed_mirror) - failrec->this_mirror++; + /* + * This implies a barrier so that stores to dio_bio->bi_status before + * this and loads of dio_bio->bi_status after this are fully ordered. 
+ */ + if (!refcount_dec_and_test(&dip->refs)) + return; - if (failrec->this_mirror > num_copies) { - btrfs_debug(fs_info, - "Check DIO Repairable: (fail) num_copies=%d, next_mirror %d, failed_mirror %d", - num_copies, failrec->this_mirror, failed_mirror); - return 0; + if (bio_op(dip->dio_bio) == REQ_OP_WRITE) { + __endio_write_update_ordered(dip->inode, dip->logical_offset, + dip->bytes, + !dip->dio_bio->bi_status); + } else { + unlock_extent(&BTRFS_I(dip->inode)->io_tree, + dip->logical_offset, + dip->logical_offset + dip->bytes - 1); } - return 1; + bio_endio(dip->dio_bio); + kfree(dip); } - static blk_status_t dio_read_error(struct inode *inode, struct bio *failed_bio, - struct page *page, unsigned int pgoff, - u64 start, u64 end, int failed_mirror, - bio_end_io_t *repair_endio, void *repair_arg) + static blk_status_t submit_dio_repair_bio(struct inode *inode, struct bio *bio, + int mirror_num, + unsigned long bio_flags) { - struct io_failure_record *failrec; - struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; - struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree; - struct bio *bio; - int isector; - unsigned int read_mode = 0; - int segs; - int ret; - blk_status_t status; - struct bio_vec bvec; + struct btrfs_dio_private *dip = bio->bi_private; + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + blk_status_t ret; - BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE); + BUG_ON(bio_op(bio) == REQ_OP_WRITE); - ret = btrfs_get_io_failure_record(inode, start, end, &failrec); + ret = btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DATA); if (ret) - return errno_to_blk_status(ret); - - ret = btrfs_check_dio_repairable(inode, failed_bio, failrec, - failed_mirror); - if (!ret) { - free_io_failure(failure_tree, io_tree, failrec); - return BLK_STS_IOERR; - } - - segs = bio_segments(failed_bio); - bio_get_first_bvec(failed_bio, &bvec); - if (segs > 1 || - (bvec.bv_len > btrfs_inode_sectorsize(inode))) - read_mode |= REQ_FAILFAST_DEV; - - isector = start - btrfs_io_bio(failed_bio)->logical; - isector >>= inode->i_sb->s_blocksize_bits; - bio = btrfs_create_repair_bio(inode, failed_bio, failrec, page, - pgoff, isector, repair_endio, repair_arg); - bio->bi_opf = REQ_OP_READ | read_mode; - - btrfs_debug(BTRFS_I(inode)->root->fs_info, - "repair DIO read error: submitting new dio read[%#x] to this_mirror=%d, in_validation=%d", - read_mode, failrec->this_mirror, failrec->in_validation); - - status = submit_dio_repair_bio(inode, bio, failrec->this_mirror); - if (status) { - free_io_failure(failure_tree, io_tree, failrec); - bio_put(bio); - } - - return status; - } - - struct btrfs_retry_complete { - struct completion done; - struct inode *inode; - u64 start; - int uptodate; - }; + return ret; - static void btrfs_retry_endio_nocsum(struct bio *bio) - { - struct btrfs_retry_complete *done = bio->bi_private; - struct inode *inode = done->inode; - struct bio_vec *bvec; - struct extent_io_tree *io_tree, *failure_tree; - struct bvec_iter_all iter_all; - - if (bio->bi_status) - goto end; - - ASSERT(bio->bi_vcnt == 1); - io_tree = &BTRFS_I(inode)->io_tree; - failure_tree = &BTRFS_I(inode)->io_failure_tree; - ASSERT(bio_first_bvec_all(bio)->bv_len == btrfs_inode_sectorsize(inode)); - - done->uptodate = 1; - ASSERT(!bio_flagged(bio, BIO_CLONED)); - bio_for_each_segment_all(bvec, bio, iter_all) - clean_io_failure(BTRFS_I(inode)->root->fs_info, failure_tree, - io_tree, done->start, bvec->bv_page, - btrfs_ino(BTRFS_I(inode)), 0); - end: - complete(&done->done); - bio_put(bio); + 
refcount_inc(&dip->refs); + ret = btrfs_map_bio(fs_info, bio, mirror_num); + if (ret) + refcount_dec(&dip->refs); + return ret; } - static blk_status_t __btrfs_correct_data_nocsum(struct inode *inode, - struct btrfs_io_bio *io_bio) + static blk_status_t btrfs_check_read_dio_bio(struct inode *inode, + struct btrfs_io_bio *io_bio, + const bool uptodate) { - struct btrfs_fs_info *fs_info; + struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; + const u32 sectorsize = fs_info->sectorsize; + struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree; + struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + const bool csum = !(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM); struct bio_vec bvec; struct bvec_iter iter; - struct btrfs_retry_complete done; - u64 start; - unsigned int pgoff; - u32 sectorsize; - int nr_sectors; - blk_status_t ret; + u64 start = io_bio->logical; + int icsum = 0; blk_status_t err = BLK_STS_OK; - fs_info = BTRFS_I(inode)->root->fs_info; - sectorsize = fs_info->sectorsize; - - start = io_bio->logical; - done.inode = inode; - io_bio->bio.bi_iter = io_bio->iter; + __bio_for_each_segment(bvec, &io_bio->bio, iter, io_bio->iter) { + unsigned int i, nr_sectors, pgoff; - bio_for_each_segment(bvec, &io_bio->bio, iter) { nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info, bvec.bv_len); pgoff = bvec.bv_offset; - - next_block_or_try_again: - done.uptodate = 0; - done.start = start; - init_completion(&done.done); - - ret = dio_read_error(inode, &io_bio->bio, bvec.bv_page, - pgoff, start, start + sectorsize - 1, - io_bio->mirror_num, - btrfs_retry_endio_nocsum, &done); - if (ret) { - err = ret; - goto next; - } - - wait_for_completion_io(&done.done); - - if (!done.uptodate) { - /* We might have another mirror, so try again */ - goto next_block_or_try_again; - } - - next: - start += sectorsize; - - nr_sectors--; - if (nr_sectors) { - pgoff += sectorsize; + for (i = 0; i < nr_sectors; i++) { ASSERT(pgoff < PAGE_SIZE); - goto next_block_or_try_again; - } - } - - return err; - } - - static void btrfs_retry_endio(struct bio *bio) - { - struct btrfs_retry_complete *done = bio->bi_private; - struct btrfs_io_bio *io_bio = btrfs_io_bio(bio); - struct extent_io_tree *io_tree, *failure_tree; - struct inode *inode = done->inode; - struct bio_vec *bvec; - int uptodate; - int ret; - int i = 0; - struct bvec_iter_all iter_all; - - if (bio->bi_status) - goto end; - - uptodate = 1; - - ASSERT(bio->bi_vcnt == 1); - ASSERT(bio_first_bvec_all(bio)->bv_len == btrfs_inode_sectorsize(done->inode)); - - io_tree = &BTRFS_I(inode)->io_tree; - failure_tree = &BTRFS_I(inode)->io_failure_tree; - - ASSERT(!bio_flagged(bio, BIO_CLONED)); - bio_for_each_segment_all(bvec, bio, iter_all) { - ret = __readpage_endio_check(inode, io_bio, i, bvec->bv_page, - bvec->bv_offset, done->start, - bvec->bv_len); - if (!ret) - clean_io_failure(BTRFS_I(inode)->root->fs_info, - failure_tree, io_tree, done->start, - bvec->bv_page, - btrfs_ino(BTRFS_I(inode)), - bvec->bv_offset); - else - uptodate = 0; - i++; - } - - done->uptodate = uptodate; - end: - complete(&done->done); - bio_put(bio); - } - - static blk_status_t __btrfs_subio_endio_read(struct inode *inode, - struct btrfs_io_bio *io_bio, blk_status_t err) - { - struct btrfs_fs_info *fs_info; - struct bio_vec bvec; - struct bvec_iter iter; - struct btrfs_retry_complete done; - u64 start; - u64 offset = 0; - u32 sectorsize; - int nr_sectors; - unsigned int pgoff; - int csum_pos; - bool uptodate = (err == 0); - int ret; - blk_status_t status; - - fs_info = 
BTRFS_I(inode)->root->fs_info; - sectorsize = fs_info->sectorsize; - - err = BLK_STS_OK; - start = io_bio->logical; - done.inode = inode; - io_bio->bio.bi_iter = io_bio->iter; - - bio_for_each_segment(bvec, &io_bio->bio, iter) { - nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info, bvec.bv_len); - - pgoff = bvec.bv_offset; - next_block: - if (uptodate) { - csum_pos = BTRFS_BYTES_TO_BLKS(fs_info, offset); - ret = __readpage_endio_check(inode, io_bio, csum_pos, - bvec.bv_page, pgoff, start, sectorsize); - if (likely(!ret)) - goto next; - } - try_again: - done.uptodate = 0; - done.start = start; - init_completion(&done.done); - - status = dio_read_error(inode, &io_bio->bio, bvec.bv_page, - pgoff, start, start + sectorsize - 1, - io_bio->mirror_num, btrfs_retry_endio, - &done); - if (status) { - err = status; - goto next; - } - - wait_for_completion_io(&done.done); - - if (!done.uptodate) { - /* We might have another mirror, so try again */ - goto try_again; - } - next: - offset += sectorsize; - start += sectorsize; - - ASSERT(nr_sectors); - - nr_sectors--; - if (nr_sectors) { + if (uptodate && + (!csum || !check_data_csum(inode, io_bio, icsum, + bvec.bv_page, pgoff, + start, sectorsize))) { + clean_io_failure(fs_info, failure_tree, io_tree, + start, bvec.bv_page, + btrfs_ino(BTRFS_I(inode)), + pgoff); + } else { + blk_status_t status; + + status = btrfs_submit_read_repair(inode, + &io_bio->bio, + start - io_bio->logical, + bvec.bv_page, pgoff, + start, + start + sectorsize - 1, + io_bio->mirror_num, + submit_dio_repair_bio); + if (status) + err = status; + } + start += sectorsize; + icsum++; pgoff += sectorsize; - ASSERT(pgoff < PAGE_SIZE); - goto next_block; } } - return err; } - static blk_status_t btrfs_subio_endio_read(struct inode *inode, - struct btrfs_io_bio *io_bio, blk_status_t err) - { - bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; - - if (skip_csum) { - if (unlikely(err)) - return __btrfs_correct_data_nocsum(inode, io_bio); - else - return BLK_STS_OK; - } else { - return __btrfs_subio_endio_read(inode, io_bio, err); - } - } - - static void btrfs_endio_direct_read(struct bio *bio) - { - struct btrfs_dio_private *dip = bio->bi_private; - struct inode *inode = dip->inode; - struct bio *dio_bio; - struct btrfs_io_bio *io_bio = btrfs_io_bio(bio); - blk_status_t err = bio->bi_status; - - if (dip->flags & BTRFS_DIO_ORIG_BIO_SUBMITTED) - err = btrfs_subio_endio_read(inode, io_bio, err); - - unlock_extent(&BTRFS_I(inode)->io_tree, dip->logical_offset, - dip->logical_offset + dip->bytes - 1); - dio_bio = dip->dio_bio; - - kfree(dip); - - dio_bio->bi_status = err; - dio_end_io(dio_bio); - btrfs_io_bio_free_csum(io_bio); - bio_put(bio); - } - static void __endio_write_update_ordered(struct inode *inode, const u64 offset, const u64 bytes, const bool uptodate) @@@ -7759,21 -7588,6 +7588,6 @@@ } } - static void btrfs_endio_direct_write(struct bio *bio) - { - struct btrfs_dio_private *dip = bio->bi_private; - struct bio *dio_bio = dip->dio_bio; - - __endio_write_update_ordered(dip->inode, dip->logical_offset, - dip->bytes, !bio->bi_status); - - kfree(dip); - - dio_bio->bi_status = bio->bi_status; - dio_end_io(dio_bio); - bio_put(bio); - } - static blk_status_t btrfs_submit_bio_start_direct_io(void *private_data, struct bio *bio, u64 offset) { @@@ -7797,64 -7611,16 +7611,16 @@@ static void btrfs_end_dio_bio(struct bi (unsigned long long)bio->bi_iter.bi_sector, bio->bi_iter.bi_size, err); - if (dip->subio_endio) - err = dip->subio_endio(dip->inode, btrfs_io_bio(bio), err); - - if (err) { - 
/* - * We want to perceive the errors flag being set before - * decrementing the reference count. We don't need a barrier - * since atomic operations with a return value are fully - * ordered as per atomic_t.txt - */ - dip->errors = 1; + if (bio_op(bio) == REQ_OP_READ) { + err = btrfs_check_read_dio_bio(dip->inode, btrfs_io_bio(bio), + !err); } - /* if there are more bios still pending for this dio, just exit */ - if (!atomic_dec_and_test(&dip->pending_bios)) - goto out; + if (err) + dip->dio_bio->bi_status = err; - if (dip->errors) { - bio_io_error(dip->orig_bio); - } else { - dip->dio_bio->bi_status = BLK_STS_OK; - bio_endio(dip->orig_bio); - } - out: bio_put(bio); - } - - static inline blk_status_t btrfs_lookup_and_bind_dio_csum(struct inode *inode, - struct btrfs_dio_private *dip, - struct bio *bio, - u64 file_offset) - { - struct btrfs_io_bio *io_bio = btrfs_io_bio(bio); - struct btrfs_io_bio *orig_io_bio = btrfs_io_bio(dip->orig_bio); - u16 csum_size; - blk_status_t ret; - - /* - * We load all the csum data we need when we submit - * the first bio to reduce the csum tree search and - * contention. - */ - if (dip->logical_offset == file_offset) { - ret = btrfs_lookup_bio_sums(inode, dip->orig_bio, file_offset, - NULL); - if (ret) - return ret; - } - - if (bio == dip->orig_bio) - return 0; - - file_offset -= dip->logical_offset; - file_offset >>= inode->i_sb->s_blocksize_bits; - csum_size = btrfs_super_csum_size(btrfs_sb(inode->i_sb)->super_copy); - io_bio->csum = orig_io_bio->csum + csum_size * file_offset; - - return 0; + btrfs_dio_private_put(dip); } static inline blk_status_t btrfs_submit_dio_bio(struct bio *bio, @@@ -7892,10 -7658,12 +7658,12 @@@ if (ret) goto err; } else { - ret = btrfs_lookup_and_bind_dio_csum(inode, dip, bio, - file_offset); - if (ret) - goto err; + u64 csum_offset; + + csum_offset = file_offset - dip->logical_offset; + csum_offset >>= inode->i_sb->s_blocksize_bits; + csum_offset *= btrfs_super_csum_size(fs_info->super_copy); + btrfs_io_bio(bio)->csum = dip->csums + csum_offset; } map: ret = btrfs_map_bio(fs_info, bio, 0); @@@ -7903,14 -7671,53 +7671,53 @@@ err return ret; } - static int btrfs_submit_direct_hook(struct btrfs_dio_private *dip) + /* + * If this succeeds, the btrfs_dio_private is responsible for cleaning up locked + * or ordered extents whether or not we submit any bios. 
+ */ + static struct btrfs_dio_private *btrfs_create_dio_private(struct bio *dio_bio, + struct inode *inode, + loff_t file_offset) { - struct inode *inode = dip->inode; + const bool write = (bio_op(dio_bio) == REQ_OP_WRITE); + const bool csum = !(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM); + size_t dip_size; + struct btrfs_dio_private *dip; + + dip_size = sizeof(*dip); + if (!write && csum) { + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + const u16 csum_size = btrfs_super_csum_size(fs_info->super_copy); + size_t nblocks; + + nblocks = dio_bio->bi_iter.bi_size >> inode->i_sb->s_blocksize_bits; + dip_size += csum_size * nblocks; + } + + dip = kzalloc(dip_size, GFP_NOFS); + if (!dip) + return NULL; + + dip->inode = inode; + dip->logical_offset = file_offset; + dip->bytes = dio_bio->bi_iter.bi_size; + dip->disk_bytenr = (u64)dio_bio->bi_iter.bi_sector << 9; + dip->dio_bio = dio_bio; + refcount_set(&dip->refs, 1); + return dip; + } + + static blk_qc_t btrfs_submit_direct(struct inode *inode, struct iomap *iomap, + struct bio *dio_bio, loff_t file_offset) + { + const bool write = (bio_op(dio_bio) == REQ_OP_WRITE); + const bool csum = !(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM); struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + const bool raid56 = (btrfs_data_alloc_profile(fs_info) & + BTRFS_BLOCK_GROUP_RAID56_MASK); + struct btrfs_dio_private *dip; struct bio *bio; - struct bio *orig_bio = dip->orig_bio; - u64 start_sector = orig_bio->bi_iter.bi_sector; - u64 file_offset = dip->logical_offset; + u64 start_sector; int async_submit = 0; u64 submit_len; int clone_offset = 0; @@@ -7918,330 -7725,108 +7725,108 @@@ int ret; blk_status_t status; struct btrfs_io_geometry geom; + struct btrfs_dio_data *dio_data = iomap->private; - submit_len = orig_bio->bi_iter.bi_size; - ret = btrfs_get_io_geometry(fs_info, btrfs_op(orig_bio), - start_sector << 9, submit_len, &geom); - if (ret) - return -EIO; + dip = btrfs_create_dio_private(dio_bio, inode, file_offset); + if (!dip) { + if (!write) { + unlock_extent(&BTRFS_I(inode)->io_tree, file_offset, + file_offset + dio_bio->bi_iter.bi_size - 1); + } + dio_bio->bi_status = BLK_STS_RESOURCE; + bio_endio(dio_bio); + return BLK_QC_T_NONE; + } - if (geom.len >= submit_len) { - bio = orig_bio; - dip->flags |= BTRFS_DIO_ORIG_BIO_SUBMITTED; - goto submit; + if (!write && csum) { + /* + * Load the csums up front to reduce csum tree searches and + * contention when submitting bios. + */ + status = btrfs_lookup_bio_sums(inode, dio_bio, file_offset, + dip->csums); + if (status != BLK_STS_OK) + goto out_err; } - /* async crcs make it difficult to collect full stripe writes. */ - if (btrfs_data_alloc_profile(fs_info) & BTRFS_BLOCK_GROUP_RAID56_MASK) - async_submit = 0; - else - async_submit = 1; + start_sector = dio_bio->bi_iter.bi_sector; + submit_len = dio_bio->bi_iter.bi_size; - /* bio split */ - ASSERT(geom.len <= INT_MAX); - atomic_inc(&dip->pending_bios); do { + ret = btrfs_get_io_geometry(fs_info, btrfs_op(dio_bio), + start_sector << 9, submit_len, + &geom); + if (ret) { + status = errno_to_blk_status(ret); + goto out_err; + } + ASSERT(geom.len <= INT_MAX); + clone_len = min_t(int, submit_len, geom.len); /* * This will never fail as it's passing GPF_NOFS and * the allocation is backed by btrfs_bioset. 
*/ - bio = btrfs_bio_clone_partial(orig_bio, clone_offset, - clone_len); + bio = btrfs_bio_clone_partial(dio_bio, clone_offset, clone_len); bio->bi_private = dip; bio->bi_end_io = btrfs_end_dio_bio; btrfs_io_bio(bio)->logical = file_offset; ASSERT(submit_len >= clone_len); submit_len -= clone_len; - if (submit_len == 0) - break; /* * Increase the count before we submit the bio so we know * the end IO handler won't happen before we increase the * count. Otherwise, the dip might get freed before we're * done setting it up. + * + * We transfer the initial reference to the last bio, so we + * don't need to increment the reference count for the last one. */ - atomic_inc(&dip->pending_bios); + if (submit_len > 0) { + refcount_inc(&dip->refs); + /* + * If we are submitting more than one bio, submit them + * all asynchronously. The exception is RAID 5 or 6, as + * asynchronous checksums make it difficult to collect + * full stripe writes. + */ + if (!raid56) + async_submit = 1; + } status = btrfs_submit_dio_bio(bio, inode, file_offset, async_submit); if (status) { bio_put(bio); - atomic_dec(&dip->pending_bios); + if (submit_len > 0) + refcount_dec(&dip->refs); goto out_err; } + dio_data->submitted += clone_len; clone_offset += clone_len; start_sector += clone_len >> 9; file_offset += clone_len; - - ret = btrfs_get_io_geometry(fs_info, btrfs_op(orig_bio), - start_sector << 9, submit_len, &geom); - if (ret) - goto out_err; } while (submit_len > 0); + return BLK_QC_T_NONE; - submit: - status = btrfs_submit_dio_bio(bio, inode, file_offset, async_submit); - if (!status) - return 0; - - bio_put(bio); out_err: - dip->errors = 1; - /* - * Before atomic variable goto zero, we must make sure dip->errors is - * perceived to be set. This ordering is ensured by the fact that an - * atomic operations with a return value are fully ordered as per - * atomic_t.txt - */ - if (atomic_dec_and_test(&dip->pending_bios)) - bio_io_error(dip->orig_bio); - - /* bio_end_io() will handle error, so we needn't return it */ - return 0; - } - - static void btrfs_submit_direct(struct bio *dio_bio, struct inode *inode, - loff_t file_offset) - { - struct btrfs_dio_private *dip = NULL; - struct bio *bio = NULL; - struct btrfs_io_bio *io_bio; - bool write = (bio_op(dio_bio) == REQ_OP_WRITE); - int ret = 0; - - bio = btrfs_bio_clone(dio_bio); - - dip = kzalloc(sizeof(*dip), GFP_NOFS); - if (!dip) { - ret = -ENOMEM; - goto free_ordered; - } - - dip->private = dio_bio->bi_private; - dip->inode = inode; - dip->logical_offset = file_offset; - dip->bytes = dio_bio->bi_iter.bi_size; - dip->disk_bytenr = (u64)dio_bio->bi_iter.bi_sector << 9; - bio->bi_private = dip; - dip->orig_bio = bio; - dip->dio_bio = dio_bio; - atomic_set(&dip->pending_bios, 0); - io_bio = btrfs_io_bio(bio); - io_bio->logical = file_offset; - - if (write) { - bio->bi_end_io = btrfs_endio_direct_write; - } else { - bio->bi_end_io = btrfs_endio_direct_read; - dip->subio_endio = btrfs_subio_endio_read; - } - - /* - * Reset the range for unsubmitted ordered extents (to a 0 length range) - * even if we fail to submit a bio, because in such case we do the - * corresponding error handling below and it must not be done a second - * time by btrfs_direct_IO(). 
- */ - if (write) { - struct btrfs_dio_data *dio_data = current->journal_info; - - dio_data->unsubmitted_oe_range_end = dip->logical_offset + - dip->bytes; - dio_data->unsubmitted_oe_range_start = - dio_data->unsubmitted_oe_range_end; - } - - ret = btrfs_submit_direct_hook(dip); - if (!ret) - return; - - btrfs_io_bio_free_csum(io_bio); - - free_ordered: - /* - * If we arrived here it means either we failed to submit the dip - * or we either failed to clone the dio_bio or failed to allocate the - * dip. If we cloned the dio_bio and allocated the dip, we can just - * call bio_endio against our io_bio so that we get proper resource - * cleanup if we fail to submit the dip, otherwise, we must do the - * same as btrfs_endio_direct_[write|read] because we can't call these - * callbacks - they require an allocated dip and a clone of dio_bio. - */ - if (bio && dip) { - bio_io_error(bio); - /* - * The end io callbacks free our dip, do the final put on bio - * and all the cleanup and final put for dio_bio (through - * dio_end_io()). - */ - dip = NULL; - bio = NULL; - } else { - if (write) - __endio_write_update_ordered(inode, - file_offset, - dio_bio->bi_iter.bi_size, - false); - else - unlock_extent(&BTRFS_I(inode)->io_tree, file_offset, - file_offset + dio_bio->bi_iter.bi_size - 1); - - dio_bio->bi_status = BLK_STS_IOERR; - /* - * Releases and cleans up our dio_bio, no need to bio_put() - * nor bio_endio()/bio_io_error() against dio_bio. - */ - dio_end_io(dio_bio); - } - if (bio) - bio_put(bio); - kfree(dip); + dip->dio_bio->bi_status = status; + btrfs_dio_private_put(dip); + return BLK_QC_T_NONE; } - static ssize_t check_direct_IO(struct btrfs_fs_info *fs_info, - const struct iov_iter *iter, loff_t offset) - { - int seg; - int i; - unsigned int blocksize_mask = fs_info->sectorsize - 1; - ssize_t retval = -EINVAL; - - if (offset & blocksize_mask) - goto out; - - if (iov_iter_alignment(iter) & blocksize_mask) - goto out; - - /* If this is a write we don't need to check anymore */ - if (iov_iter_rw(iter) != READ || !iter_is_iovec(iter)) - return 0; - /* - * Check to make sure we don't have duplicate iov_base's in this - * iovec, if so return EINVAL, otherwise we'll get csum errors - * when reading back. - */ - for (seg = 0; seg < iter->nr_segs; seg++) { - for (i = seg + 1; i < iter->nr_segs; i++) { - if (iter->iov[seg].iov_base == iter->iov[i].iov_base) - goto out; - } - } - retval = 0; - out: - return retval; - } - - static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) - { - struct file *file = iocb->ki_filp; - struct inode *inode = file->f_mapping->host; - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - struct btrfs_dio_data dio_data = { 0 }; - struct extent_changeset *data_reserved = NULL; - loff_t offset = iocb->ki_pos; - size_t count = 0; - int flags = 0; - bool wakeup = true; - bool relock = false; - ssize_t ret; - - if (check_direct_IO(fs_info, iter, offset)) - return 0; - - inode_dio_begin(inode); - - /* - * The generic stuff only does filemap_write_and_wait_range, which - * isn't enough if we've written compressed pages to this area, so - * we need to flush the dirty pages again to make absolutely sure - * that any outstanding dirty pages are on disk. 
- */ - count = iov_iter_count(iter); - if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, - &BTRFS_I(inode)->runtime_flags)) - filemap_fdatawrite_range(inode->i_mapping, offset, - offset + count - 1); - - if (iov_iter_rw(iter) == WRITE) { - /* - * If the write DIO is beyond the EOF, we need update - * the isize, but it is protected by i_mutex. So we can - * not unlock the i_mutex at this case. - */ - if (offset + count <= inode->i_size) { - dio_data.overwrite = 1; - inode_unlock(inode); - relock = true; - } else if (iocb->ki_flags & IOCB_NOWAIT) { - ret = -EAGAIN; - goto out; - } - ret = btrfs_delalloc_reserve_space(inode, &data_reserved, - offset, count); - if (ret) - goto out; - - /* - * We need to know how many extents we reserved so that we can - * do the accounting properly if we go over the number we - * originally calculated. Abuse current->journal_info for this. - */ - dio_data.reserve = round_up(count, - fs_info->sectorsize); - dio_data.unsubmitted_oe_range_start = (u64)offset; - dio_data.unsubmitted_oe_range_end = (u64)offset; - current->journal_info = &dio_data; - down_read(&BTRFS_I(inode)->dio_sem); - } else if (test_bit(BTRFS_INODE_READDIO_NEED_LOCK, - &BTRFS_I(inode)->runtime_flags)) { - inode_dio_end(inode); - flags = DIO_LOCKING | DIO_SKIP_HOLES; - wakeup = false; - } - - ret = __blockdev_direct_IO(iocb, inode, - fs_info->fs_devices->latest_bdev, - iter, btrfs_get_blocks_direct, NULL, - btrfs_submit_direct, flags); - if (iov_iter_rw(iter) == WRITE) { - up_read(&BTRFS_I(inode)->dio_sem); - current->journal_info = NULL; - if (ret < 0 && ret != -EIOCBQUEUED) { - if (dio_data.reserve) - btrfs_delalloc_release_space(inode, data_reserved, - offset, dio_data.reserve, true); - /* - * On error we might have left some ordered extents - * without submitting corresponding bios for them, so - * cleanup them up to avoid other tasks getting them - * and waiting for them to complete forever. 
- */ - if (dio_data.unsubmitted_oe_range_start < - dio_data.unsubmitted_oe_range_end) - __endio_write_update_ordered(inode, - dio_data.unsubmitted_oe_range_start, - dio_data.unsubmitted_oe_range_end - - dio_data.unsubmitted_oe_range_start, - false); - } else if (ret >= 0 && (size_t)ret < count) - btrfs_delalloc_release_space(inode, data_reserved, - offset, count - (size_t)ret, true); - btrfs_delalloc_release_extents(BTRFS_I(inode), count); - } - out: - if (wakeup) - inode_dio_end(inode); - if (relock) - inode_lock(inode); + const struct iomap_ops btrfs_dio_iomap_ops = { + .iomap_begin = btrfs_dio_iomap_begin, + .iomap_end = btrfs_dio_iomap_end, + }; - extent_changeset_free(data_reserved); - return ret; - } + const struct iomap_dio_ops btrfs_dops = { + .submit_io = btrfs_submit_direct, + }; #define BTRFS_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC) @@@ -8293,16 -7878,21 +7878,16 @@@ static int btrfs_writepages(struct addr return extent_writepages(mapping, wbc); } -static int -btrfs_readpages(struct file *file, struct address_space *mapping, - struct list_head *pages, unsigned nr_pages) +static void btrfs_readahead(struct readahead_control *rac) { - return extent_readpages(mapping, pages, nr_pages); + extent_readahead(rac); } static int __btrfs_releasepage(struct page *page, gfp_t gfp_flags) { int ret = try_release_extent_mapping(page, gfp_flags); - if (ret == 1) { - ClearPagePrivate(page); - set_page_private(page, 0); - put_page(page); - } + if (ret == 1) + detach_page_private(page); return ret; } @@@ -8324,8 -7914,14 +7909,8 @@@ static int btrfs_migratepage(struct add if (ret != MIGRATEPAGE_SUCCESS) return ret; - if (page_has_private(page)) { - ClearPagePrivate(page); - get_page(newpage); - set_page_private(newpage, page_private(page)); - set_page_private(page, 0); - put_page(page); - SetPagePrivate(newpage); - } + if (page_has_private(page)) + attach_page_private(newpage, detach_page_private(page)); if (PagePrivate2(page)) { ClearPagePrivate2(page); @@@ -8447,7 -8043,11 +8032,7 @@@ again } ClearPageChecked(page); - if (PagePrivate(page)) { - ClearPagePrivate(page); - set_page_private(page, 0); - put_page(page); - } + detach_page_private(page); } /* @@@ -10538,8 -10138,8 +10123,8 @@@ static const struct address_space_opera .readpage = btrfs_readpage, .writepage = btrfs_writepage, .writepages = btrfs_writepages, - .readpages = btrfs_readpages, + .readahead = btrfs_readahead, - .direct_IO = btrfs_direct_IO, + .direct_IO = noop_direct_IO, .invalidatepage = btrfs_invalidatepage, .releasepage = btrfs_releasepage, #ifdef CONFIG_MIGRATION diff --combined fs/btrfs/send.c index 6a92ecf9eaa2,0f37660b14b2..d9813a5b075a --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@@ -23,6 -23,7 +23,7 @@@ #include "btrfs_inode.h" #include "transaction.h" #include "compression.h" + #include "xattr.h" /* * Maximum number of references an extent can have in order for us to attempt to @@@ -4545,6 -4546,10 +4546,10 @@@ static int __process_new_xattr(int num struct fs_path *p; struct posix_acl_xattr_header dummy_acl; + /* Capabilities are emitted by finish_inode_if_needed */ + if (!strncmp(name, XATTR_NAME_CAPS, name_len)) + return 0; + p = fs_path_alloc(); if (!p) return -ENOMEM; @@@ -4801,17 -4806,12 +4806,12 @@@ static ssize_t fill_read_buf(struct sen struct inode *inode; struct page *page; char *addr; - struct btrfs_key key; pgoff_t index = offset >> PAGE_SHIFT; pgoff_t last_index; unsigned pg_offset = offset_in_page(offset); ssize_t ret = 0; - key.objectid = sctx->cur_ino; - key.type = BTRFS_INODE_ITEM_KEY; - key.offset = 0; - - 
inode = btrfs_iget(fs_info->sb, &key, root); + inode = btrfs_iget(fs_info->sb, sctx->cur_ino, root); if (IS_ERR(inode)) return PTR_ERR(inode); @@@ -5107,6 -5107,64 +5107,64 @@@ static int send_extent_data(struct send return 0; } + /* + * Search for a capability xattr related to sctx->cur_ino. If the capability is + * found, call send_set_xattr function to emit it. + * + * Return 0 if there isn't a capability, or when the capability was emitted + * successfully, or < 0 if an error occurred. + */ + static int send_capabilities(struct send_ctx *sctx) + { + struct fs_path *fspath = NULL; + struct btrfs_path *path; + struct btrfs_dir_item *di; + struct extent_buffer *leaf; + unsigned long data_ptr; + char *buf = NULL; + int buf_len; + int ret = 0; + + path = alloc_path_for_send(); + if (!path) + return -ENOMEM; + + di = btrfs_lookup_xattr(NULL, sctx->send_root, path, sctx->cur_ino, + XATTR_NAME_CAPS, strlen(XATTR_NAME_CAPS), 0); + if (!di) { + /* There is no xattr for this inode */ + goto out; + } else if (IS_ERR(di)) { + ret = PTR_ERR(di); + goto out; + } + + leaf = path->nodes[0]; + buf_len = btrfs_dir_data_len(leaf, di); + + fspath = fs_path_alloc(); + buf = kmalloc(buf_len, GFP_KERNEL); + if (!fspath || !buf) { + ret = -ENOMEM; + goto out; + } + + ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, fspath); + if (ret < 0) + goto out; + + data_ptr = (unsigned long)(di + 1) + btrfs_dir_name_len(leaf, di); + read_extent_buffer(leaf, buf, data_ptr, buf_len); + + ret = send_set_xattr(sctx, fspath, XATTR_NAME_CAPS, + strlen(XATTR_NAME_CAPS), buf, buf_len); + out: + kfree(buf); + fs_path_free(fspath); + btrfs_free_path(path); + return ret; + } + static int clone_range(struct send_ctx *sctx, struct clone_root *clone_root, const u64 disk_byte, @@@ -5972,6 -6030,10 +6030,10 @@@ static int finish_inode_if_needed(struc goto out; } + ret = send_capabilities(sctx); + if (ret < 0) + goto out; + /* * If other directory inodes depended on our current directory * inode's move/rename, now do their move/rename operations. 
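The fill_read_buf() change above is the shape of every btrfs_iget() conversion in this series: the three-field key always carried BTRFS_INODE_ITEM_KEY and offset 0, so only the objectid was meaningful. A minimal before/after sketch of the call-site pattern (all names taken from the hunks in this diff):

	/* Before: a full btrfs_key built only to carry the inode number. */
	struct btrfs_key key;

	key.objectid = sctx->cur_ino;
	key.type = BTRFS_INODE_ITEM_KEY;
	key.offset = 0;
	inode = btrfs_iget(fs_info->sb, &key, root);

	/* After: type and offset were constants, so the key is dropped and
	 * the u64 inode number is passed directly; btrfs_init_locked_inode()
	 * fills in the constant type and offset itself.
	 */
	inode = btrfs_iget(fs_info->sb, sctx->cur_ino, root);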
@@@ -7021,7 -7083,6 +7083,6 @@@ long btrfs_ioctl_send(struct file *mnt_ struct btrfs_root *send_root = BTRFS_I(file_inode(mnt_file))->root; struct btrfs_fs_info *fs_info = send_root->fs_info; struct btrfs_root *clone_root; - struct btrfs_key key; struct send_ctx *sctx = NULL; u32 i; u64 *clone_sources_tmp = NULL; @@@ -7065,6 -7126,13 +7126,6 @@@ goto out; } - if (!access_ok(arg->clone_sources, - sizeof(*arg->clone_sources) * - arg->clone_sources_count)) { - ret = -EFAULT; - goto out; - } - if (arg->flags & ~BTRFS_SEND_FLAG_MASK) { ret = -EINVAL; goto out; @@@ -7143,11 -7211,8 +7204,8 @@@ } for (i = 0; i < arg->clone_sources_count; i++) { - key.objectid = clone_sources_tmp[i]; - key.type = BTRFS_ROOT_ITEM_KEY; - key.offset = (u64)-1; - - clone_root = btrfs_get_fs_root(fs_info, &key, true); + clone_root = btrfs_get_fs_root(fs_info, + clone_sources_tmp[i], true); if (IS_ERR(clone_root)) { ret = PTR_ERR(clone_root); goto out; @@@ -7178,11 -7243,8 +7236,8 @@@ } if (arg->parent_root) { - key.objectid = arg->parent_root; - key.type = BTRFS_ROOT_ITEM_KEY; - key.offset = (u64)-1; - - sctx->parent_root = btrfs_get_fs_root(fs_info, &key, true); + sctx->parent_root = btrfs_get_fs_root(fs_info, arg->parent_root, + true); if (IS_ERR(sctx->parent_root)) { ret = PTR_ERR(sctx->parent_root); goto out; diff --combined fs/direct-io.c index 6d5370eac2a8,c44d60f375bc..1543b5af400e --- a/fs/direct-io.c +++ b/fs/direct-io.c @@@ -386,25 -386,6 +386,6 @@@ static void dio_bio_end_io(struct bio * spin_unlock_irqrestore(&dio->bio_lock, flags); } - /** - * dio_end_io - handle the end io action for the given bio - * @bio: The direct io bio thats being completed - * - * This is meant to be called by any filesystem that uses their own dio_submit_t - * so that the DIO specific endio actions are dealt with after the filesystem - * has done it's completion work. 
- */ - void dio_end_io(struct bio *bio) - { - struct dio *dio = bio->bi_private; - - if (dio->is_async) - dio_bio_end_aio(bio); - else - dio_bio_end_io(bio); - } - EXPORT_SYMBOL_GPL(dio_end_io); - static inline void dio_bio_alloc(struct dio *dio, struct dio_submit *sdio, struct block_device *bdev, @@@ -500,7 -481,7 +481,7 @@@ static struct bio *dio_await_one(struc spin_unlock_irqrestore(&dio->bio_lock, flags); if (!(dio->iocb->ki_flags & IOCB_HIPRI) || !blk_poll(dio->bio_disk->queue, dio->bio_cookie, true)) - io_schedule(); + blk_io_schedule(); /* wake up sets us TASK_RUNNING */ spin_lock_irqsave(&dio->bio_lock, flags); dio->waiter = NULL; diff --combined fs/iomap/direct-io.c index fd3bd06fabb6,e4addfc58107..ec7b78e6feca --- a/fs/iomap/direct-io.c +++ b/fs/iomap/direct-io.c @@@ -59,7 -59,7 +59,7 @@@ int iomap_dio_iopoll(struct kiocb *kioc EXPORT_SYMBOL_GPL(iomap_dio_iopoll); static void iomap_dio_submit_bio(struct iomap_dio *dio, struct iomap *iomap, - struct bio *bio) + struct bio *bio, loff_t pos) { atomic_inc(&dio->ref); @@@ -67,7 -67,12 +67,12 @@@ bio_set_polled(bio, dio->iocb); dio->submit.last_queue = bdev_get_queue(iomap->bdev); - dio->submit.cookie = submit_bio(bio); + if (dio->dops && dio->dops->submit_io) + dio->submit.cookie = dio->dops->submit_io( + file_inode(dio->iocb->ki_filp), + iomap, bio, pos); + else + dio->submit.cookie = submit_bio(bio); } static ssize_t iomap_dio_complete(struct iomap_dio *dio) @@@ -191,7 -196,7 +196,7 @@@ iomap_dio_zero(struct iomap_dio *dio, s get_page(page); __bio_add_page(bio, page, len, 0); bio_set_op_attrs(bio, REQ_OP_WRITE, flags); - iomap_dio_submit_bio(dio, iomap, bio); + iomap_dio_submit_bio(dio, iomap, bio, pos); } static loff_t @@@ -299,11 -304,11 +304,11 @@@ iomap_dio_bio_actor(struct inode *inode } dio->size += n; - pos += n; copied += n; nr_pages = iov_iter_npages(dio->submit.iter, BIO_MAX_PAGES); - iomap_dio_submit_bio(dio, iomap, bio); + iomap_dio_submit_bio(dio, iomap, bio, pos); + pos += n; } while (nr_pages); /* @@@ -411,8 -416,6 +416,6 @@@ iomap_dio_rw(struct kiocb *iocb, struc struct blk_plug plug; struct iomap_dio *dio; - lockdep_assert_held(&inode->i_rwsem); - if (!count) return 0; @@@ -561,7 -564,7 +564,7 @@@ !dio->submit.last_queue || !blk_poll(dio->submit.last_queue, dio->submit.cookie, true)) - io_schedule(); + blk_io_schedule(); } __set_current_state(TASK_RUNNING); } diff --combined include/linux/bio.h index 683ff5fd8871,8e23f51ccfa4..91676d4b2dfe --- a/include/linux/bio.h +++ b/include/linux/bio.h @@@ -70,7 -70,7 +70,7 @@@ static inline bool bio_has_data(struct return false; } -static inline bool bio_no_advance_iter(struct bio *bio) +static inline bool bio_no_advance_iter(const struct bio *bio) { return bio_op(bio) == REQ_OP_DISCARD || bio_op(bio) == REQ_OP_SECURE_ERASE || @@@ -138,8 -138,8 +138,8 @@@ static inline bool bio_next_segment(con #define bio_for_each_segment_all(bvl, bio, iter) \ for (bvl = bvec_init_iter_all(&iter); bio_next_segment((bio), &iter); ) -static inline void bio_advance_iter(struct bio *bio, struct bvec_iter *iter, - unsigned bytes) +static inline void bio_advance_iter(const struct bio *bio, + struct bvec_iter *iter, unsigned int bytes) { iter->bi_sector += bytes >> 9; @@@ -169,6 -169,14 +169,14 @@@ #define bio_for_each_bvec(bvl, bio, iter) \ __bio_for_each_bvec(bvl, bio, iter, (bio)->bi_iter) + /* + * Iterate over all multi-page bvecs. Drivers shouldn't use this version for the + * same reasons as bio_for_each_segment_all(). 
+ */ + #define bio_for_each_bvec_all(bvl, bio, i) \ + for (i = 0, bvl = bio_first_bvec_all(bio); \ + i < (bio)->bi_vcnt; i++, bvl++) \ + #define bio_iter_last(bvec, iter) ((iter).bi_size == (bvec).bv_len) static inline unsigned bio_segments(struct bio *bio) @@@ -417,7 -425,6 +425,7 @@@ static inline void bio_io_error(struct static inline void bio_wouldblock_error(struct bio *bio) { + bio_set_flag(bio, BIO_QUIET); bio->bi_status = BLK_STS_AGAIN; bio_endio(bio); } @@@ -445,6 -452,12 +453,6 @@@ void bio_release_pages(struct bio *bio extern void bio_set_pages_dirty(struct bio *bio); extern void bio_check_pages_dirty(struct bio *bio); -void generic_start_io_acct(struct request_queue *q, int op, - unsigned long sectors, struct hd_struct *part); -void generic_end_io_acct(struct request_queue *q, int op, - struct hd_struct *part, - unsigned long start_time); - extern void bio_copy_data_iter(struct bio *dst, struct bvec_iter *dst_iter, struct bio *src, struct bvec_iter *src_iter); extern void bio_copy_data(struct bio *dst, struct bio *src); diff --combined include/linux/fs.h index f3e167ffbb74,e84623d5e173..cffc3619eed5 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@@ -292,7 -292,6 +292,7 @@@ enum positive_aop_returns struct page; struct address_space; struct writeback_control; +struct readahead_control; /* * Write life time hint values. @@@ -376,7 -375,6 +376,7 @@@ struct address_space_operations */ int (*readpages)(struct file *filp, struct address_space *mapping, struct list_head *pages, unsigned nr_pages); + void (*readahead)(struct readahead_control *); int (*write_begin)(struct file *, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, @@@ -978,7 -976,6 +978,7 @@@ struct file #endif /* #ifdef CONFIG_EPOLL */ struct address_space *f_mapping; errseq_t f_wb_err; + errseq_t f_sb_err; /* for syncfs */ } __randomize_layout __attribute__((aligned(4))); /* lest something weird decides that 2 is OK */ @@@ -1523,9 -1520,6 +1523,9 @@@ struct super_block /* Being remounted read-only */ int s_readonly_remount; + /* per-sb errseq_t for reporting writeback errors via syncfs */ + errseq_t s_wb_err; + /* AIO completions deferred from interrupt context */ struct workqueue_struct *s_dio_done_wq; struct hlist_head s_pins; @@@ -1727,11 -1721,7 +1727,11 @@@ extern int vfs_link(struct dentry *, st extern int vfs_rmdir(struct inode *, struct dentry *); extern int vfs_unlink(struct inode *, struct dentry *, struct inode **); extern int vfs_rename(struct inode *, struct dentry *, struct inode *, struct dentry *, struct inode **, unsigned int); -extern int vfs_whiteout(struct inode *, struct dentry *); + +static inline int vfs_whiteout(struct inode *dir, struct dentry *dentry) +{ + return vfs_mknod(dir, dentry, S_IFCHR | WHITEOUT_MODE, WHITEOUT_DEV); +} extern struct dentry *vfs_tmpfile(struct dentry *dentry, umode_t mode, int open_flag); @@@ -2166,8 -2156,6 +2166,8 @@@ static inline void kiocb_clone(struct k * * I_CREATING New object's inode in the middle of setting up. * + * I_DONTCACHE Evict inode as soon as it is not used anymore. + * * Q: What is the difference between I_WILL_FREE and I_FREEING? 
*/ #define I_DIRTY_SYNC (1 << 0) @@@ -2190,7 -2178,6 +2190,7 @@@ #define I_WB_SWITCH (1 << 13) #define I_OVL_INUSE (1 << 14) #define I_CREATING (1 << 15) +#define I_DONTCACHE (1 << 16) #define I_DIRTY_INODE (I_DIRTY_SYNC | I_DIRTY_DATASYNC) #define I_DIRTY (I_DIRTY_INODE | I_DIRTY_PAGES) @@@ -2594,6 -2581,7 +2594,6 @@@ extern struct kmem_cache *names_cachep #ifdef CONFIG_BLOCK extern int register_blkdev(unsigned int, const char *); extern void unregister_blkdev(unsigned int, const char *); -extern void bdev_unhash_inode(dev_t dev); extern struct block_device *bdget(dev_t); extern struct block_device *bdgrab(struct block_device *bdev); extern void bd_set_size(struct block_device *, loff_t size); @@@ -2649,6 -2637,7 +2649,6 @@@ extern int sync_filesystem(struct super extern const struct file_operations def_blk_fops; extern const struct file_operations def_chr_fops; #ifdef CONFIG_BLOCK -extern int ioctl_by_bdev(struct block_device *, unsigned, unsigned long); extern int blkdev_ioctl(struct block_device *, fmode_t, unsigned, unsigned long); extern long compat_blkdev_ioctl(struct file *, unsigned, unsigned long); extern int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder); @@@ -2734,6 -2723,7 +2734,6 @@@ extern bool is_bad_inode(struct inode * extern int revalidate_disk(struct gendisk *); extern int check_disk_change(struct block_device *); extern int __invalidate_device(struct block_device *, bool); -extern int invalidate_partition(struct gendisk *, int); #endif unsigned long invalidate_mapping_pages(struct address_space *mapping, pgoff_t start, pgoff_t end); @@@ -2837,18 -2827,6 +2837,18 @@@ static inline errseq_t filemap_sample_w return errseq_sample(&mapping->wb_err); } +/** + * file_sample_sb_err - sample the current errseq_t to test for later errors + * @mapping: mapping to be sampled + * + * Grab the most current superblock-level errseq_t value for the given + * struct file. 
+ */ +static inline errseq_t file_sample_sb_err(struct file *file) +{ + return errseq_sample(&file->f_path.dentry->d_sb->s_wb_err); +} + static inline int filemap_nr_thps(struct address_space *mapping) { #ifdef CONFIG_READ_ONLY_THP_FOR_FS @@@ -3071,10 -3049,8 +3071,10 @@@ extern int inode_needs_sync(struct inod extern int generic_delete_inode(struct inode *inode); static inline int generic_drop_inode(struct inode *inode) { - return !inode->i_nlink || inode_unhashed(inode); + return !inode->i_nlink || inode_unhashed(inode) || + (inode->i_state & I_DONTCACHE); } +extern void d_mark_dontcache(struct inode *inode); extern struct inode *ilookup5_nowait(struct super_block *sb, unsigned long hashval, int (*test)(struct inode *, void *), @@@ -3148,6 -3124,8 +3148,8 @@@ extern int generic_file_rw_checks(struc extern int generic_copy_file_checks(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, size_t *count, unsigned int flags); + extern ssize_t generic_file_buffered_read(struct kiocb *iocb, + struct iov_iter *to, ssize_t already_read); extern ssize_t generic_file_read_iter(struct kiocb *, struct iov_iter *); extern ssize_t __generic_file_write_iter(struct kiocb *, struct iov_iter *); extern ssize_t generic_file_write_iter(struct kiocb *, struct iov_iter *); @@@ -3209,8 -3187,6 +3211,6 @@@ enum DIO_SKIP_HOLES = 0x02, }; - void dio_end_io(struct bio *bio); - ssize_t __blockdev_direct_IO(struct kiocb *iocb, struct inode *inode, struct block_device *bdev, struct iov_iter *iter, get_block_t get_block, @@@ -3418,6 -3394,11 +3418,6 @@@ extern void setattr_copy(struct inode * extern int file_update_time(struct file *file); -static inline bool io_is_direct(struct file *filp) -{ - return (filp->f_flags & O_DIRECT) || IS_DAX(filp->f_mapping->host); -} - static inline bool vma_is_dax(const struct vm_area_struct *vma) { return vma->vm_file && IS_DAX(vma->vm_file->f_mapping->host); @@@ -3442,7 -3423,7 +3442,7 @@@ static inline int iocb_flags(struct fil int res = 0; if (file->f_flags & O_APPEND) res |= IOCB_APPEND; - if (io_is_direct(file)) + if (file->f_flags & O_DIRECT) res |= IOCB_DIRECT; if ((file->f_flags & O_DSYNC) || IS_SYNC(file->f_mapping->host)) res |= IOCB_DSYNC; diff --combined include/linux/iomap.h index bc20bd04c2a2,5b4875344874..a5c219c29b10 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@@ -155,7 -155,8 +155,7 @@@ loff_t iomap_apply(struct inode *inode ssize_t iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *from, const struct iomap_ops *ops); int iomap_readpage(struct page *page, const struct iomap_ops *ops); -int iomap_readpages(struct address_space *mapping, struct list_head *pages, - unsigned nr_pages, const struct iomap_ops *ops); +void iomap_readahead(struct readahead_control *, const struct iomap_ops *ops); int iomap_set_page_dirty(struct page *page); int iomap_is_partially_uptodate(struct page *page, unsigned long from, unsigned long count); @@@ -251,6 -252,8 +251,8 @@@ int iomap_writepages(struct address_spa struct iomap_dio_ops { int (*end_io)(struct kiocb *iocb, ssize_t size, int error, unsigned flags); + blk_qc_t (*submit_io)(struct inode *inode, struct iomap *iomap, + struct bio *bio, loff_t file_offset); }; ssize_t iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, diff --combined mm/filemap.c index fe079e9219d1,ad82672a9941..3430280df607 --- a/mm/filemap.c +++ b/mm/filemap.c @@@ -1991,7 -1991,7 +1991,7 @@@ static void shrink_readahead_size_eio(s * * total number of bytes copied, including those the were already @written * * 
negative error code if nothing was copied
  */
- static ssize_t generic_file_buffered_read(struct kiocb *iocb,
+ ssize_t generic_file_buffered_read(struct kiocb *iocb,
  		struct iov_iter *iter, ssize_t written)
  {
  	struct file *filp = iocb->ki_filp;
@@@ -2243,6 -2243,7 +2243,7 @@@ out
  	file_accessed(filp);
  	return written ? written : error;
  }
+ EXPORT_SYMBOL_GPL(generic_file_buffered_read);
  
  /**
   * generic_file_read_iter - generic filesystem read routine
@@@ -2566,6 -2567,7 +2567,6 @@@ page_not_uptodate
  	if (!error || error == AOP_TRUNCATED_PAGE)
  		goto retry_find;
  
 -	/* Things didn't work out. Return zero to tell the mm layer so. */
  	shrink_readahead_size_eio(ra);
  	return VM_FAULT_SIGBUS;
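With ->direct_IO now set to noop_direct_IO, the read and write paths are expected to drive O_DIRECT through iomap_dio_rw() themselves, using the two ops tables defined above. The call site is outside this diff; the sketch below shows how a direct read would plug in, assuming the five-argument iomap_dio_rw() signature of this kernel (the function name and locking here are illustrative, not quoted from the patch):

	static ssize_t btrfs_direct_read_sketch(struct kiocb *iocb,
						struct iov_iter *to)
	{
		struct inode *inode = file_inode(iocb->ki_filp);
		ssize_t ret;

		inode_lock_shared(inode);
		/*
		 * btrfs_dio_iomap_begin() locks and maps one extent range per
		 * iteration; btrfs_dops.submit_io routes each bio through
		 * btrfs_submit_direct() for checksums and mirror handling.
		 */
		ret = iomap_dio_rw(iocb, to, &btrfs_dio_iomap_ops, &btrfs_dops,
				   is_sync_kiocb(iocb));
		inode_unlock_shared(inode);
		return ret;
	}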
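The releasepage, invalidatepage and migratepage hunks above all collapse the same open-coded sequence into the new mm helpers. What the helpers fold together, as a sketch rather than the mm implementation verbatim:

	/* detach_page_private(page) stands in for: */
	void *private = (void *)page_private(page);

	ClearPagePrivate(page);
	set_page_private(page, 0);
	put_page(page);

	/* and attach_page_private(newpage, private) for the mirror image: */
	get_page(newpage);
	set_page_private(newpage, (unsigned long)private);
	SetPagePrivate(newpage);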
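The fs.h hunks add a per-superblock errseq_t (s_wb_err) and a per-file sample of it (f_sb_err) so that writeback errors can be reported once per opener through syncfs(). The consumer side is not part of this diff; a sketch of the intended flow, assuming the standard errseq API:

	/* At open time, sample the superblock's current error sequence. */
	file->f_sb_err = file_sample_sb_err(file);

	/*
	 * At syncfs() time, report an error only if s_wb_err has advanced
	 * since this file sampled it, and mark it as seen for this opener.
	 */
	error = errseq_check_and_advance(&file->f_path.dentry->d_sb->s_wb_err,
					 &file->f_sb_err);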
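bio_for_each_bvec_all(), added in the bio.h hunk above, walks the multi-page bvecs of a fully built bio by index instead of by iterator, mirroring bio_for_each_segment_all() and carrying the same restriction for drivers. A minimal usage sketch (the loop body is illustrative):

	struct bio_vec *bvec;
	int i;

	/* Visit each multi-page segment once, front to back. */
	bio_for_each_bvec_all(bvec, bio, i)
		pr_debug("bvec %d: page %p len %u offset %u\n",
			 i, bvec->bv_page, bvec->bv_len, bvec->bv_offset);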