From: Linus Torvalds
Date: Wed, 3 Jun 2020 02:59:25 +0000 (-0700)
Subject: Merge tag 'for-5.8-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux
X-Git-Tag: v5.8-rc1~182
X-Git-Url: http://git.osdn.net/view?p=tomoyo%2Ftomoyo-test1.git;a=commitdiff_plain;h=f3cdc8ae116e27d84e1f33c7a2995960cebb73ac;hp=-c

Merge tag 'for-5.8-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux

Pull btrfs updates from David Sterba:
 "Highlights:

   - speed up dead root detection during orphan cleanup, e.g. when
     there are many deleted subvolumes waiting to be cleaned, the trees
     are now looked up in a radix tree instead of an O(N^2) search

   - snapshot creation with an inherited qgroup will mark the qgroup
     inconsistent, requires a rescan

   - send will emit file capabilities after chown; this produces a
     stream that does not need postprocessing to set the capabilities
     again

   - direct io ported to the iomap infrastructure, cleaned up and
     simplified code, notably removing the last use of struct
     buffer_head in btrfs code

  Core changes:

   - factor out backreference iteration, to be used by ordinary
     backreferences and relocation code

   - improved global block reserve utilization
      * better logic to serialize requests
      * increased maximum available for unlink
      * improved handling on large pages (64K)

   - direct io cleanups and fixes
      * simplify layering, where cloned bios were unnecessarily created
        for some cases
      * error handling fixes (submit, endio)
      * remove repair worker thread, used to avoid deadlocks during
        repair

   - refactored block group reading code, preparatory work for a new
     type of block group storage that should improve mount time on
     large filesystems

  Cleanups:

   - cleaned up (and slightly sped up) set/get helpers for metadata
     data structure members

   - root bit REF_COWS got renamed to SHAREABLE to reflect that the
     blocks of the tree get shared either among subvolumes or with the
     relocation trees

  Fixes:

   - when subvolume deletion fails due to ENOSPC, the filesystem is not
     turned read-only

   - device scan deals with devices from other filesystems that changed
     ownership due to overwrite (mkfs)

   - fix a race between scrub and block group removal/allocation

   - fix a long-standing bug of a runaway balance operation printing
     the same line to the syslog, caused by a stale status bit on a
     reloc tree that prevented progress

   - fix corrupt log due to concurrent fsync of inodes with shared
     extents

   - fix space underflow for NODATACOW and buffered writes when it for
     some reason needs to fall back to COW mode"

* tag 'for-5.8-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux: (133 commits)
  btrfs: fix space_info bytes_may_use underflow during space cache writeout
  btrfs: fix space_info bytes_may_use underflow after nocow buffered write
  btrfs: fix wrong file range cleanup after an error filling dealloc range
  btrfs: remove redundant local variable in read_block_for_search
  btrfs: open code key_search
  btrfs: split btrfs_direct_IO to read and write part
  btrfs: remove BTRFS_INODE_READDIO_NEED_LOCK
  fs: remove dio_end_io()
  btrfs: switch to iomap_dio_rw() for dio
  iomap: remove lockdep_assert_held()
  iomap: add a filesystem hook for direct I/O bio submission
  fs: export generic_file_buffered_read()
  btrfs: turn space cache writeout failure messages into debug messages
  btrfs: include error on messages about failure to write space/inode caches
  btrfs: remove useless 'fail_unlock' label from btrfs_csum_file_blocks()
  btrfs: do not ignore error from btrfs_next_leaf() when inserting checksums
  btrfs: make checksum item extension more efficient
  btrfs: fix corrupt log due to concurrent fsync of inodes with shared extents
  btrfs: unexport btrfs_compress_set_level()
  btrfs: simplify iget helpers
  ...
---

f3cdc8ae116e27d84e1f33c7a2995960cebb73ac
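The checksum hunks in disk-io.c and inode.c below collapse btrfs's open-coded
init/update/final sequence into a single crypto_shash_digest() call. A minimal
sketch of the equivalence, assuming the kernel shash API (the helper name
csum_buffer is illustrative, not part of the patch):

	#include <crypto/hash.h>

	static int csum_buffer(struct crypto_shash *tfm, const u8 *data,
			       unsigned int len, u8 *out)
	{
		SHASH_DESC_ON_STACK(shash, tfm);

		shash->tfm = tfm;
		/*
		 * One-shot digest: for a single contiguous buffer this is
		 * equivalent to crypto_shash_init(), crypto_shash_update()
		 * and crypto_shash_final() called in sequence.
		 */
		return crypto_shash_digest(shash, data, len, out);
	}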
diff --combined fs/btrfs/disk-io.c
index 7278789ff8a7,f8ec2d8606fd..7c6f0bbb54a5
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@@ -358,16 -358,14 +358,14 @@@ static int btrfs_check_super_csum(struc
 	SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);

 	shash->tfm = fs_info->csum_shash;
- 	crypto_shash_init(shash);

 	/*
 	 * The super_block structure does not span the whole
 	 * BTRFS_SUPER_INFO_SIZE range, we expect that the unused space is
 	 * filled with zeros and is included in the checksum.
 	 */
- 	crypto_shash_update(shash, raw_disk_sb + BTRFS_CSUM_SIZE,
- 			    BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE);
- 	crypto_shash_final(shash, result);
+ 	crypto_shash_digest(shash, raw_disk_sb + BTRFS_CSUM_SIZE,
+ 			    BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE, result);

 	if (memcmp(disk_sb->csum, result, btrfs_super_csum_size(disk_sb)))
 		return 1;
@@@ -709,9 -707,7 +707,7 @@@ static void end_workqueue_bio(struct bi
 		else
 			wq = fs_info->endio_write_workers;
 	} else {
- 		if (unlikely(end_io_wq->metadata == BTRFS_WQ_ENDIO_DIO_REPAIR))
- 			wq = fs_info->endio_repair_workers;
- 		else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56)
+ 		if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56)
 			wq = fs_info->endio_raid56_workers;
 		else if (end_io_wq->metadata)
 			wq = fs_info->endio_meta_workers;
@@@ -980,7 -976,9 +976,7 @@@ static void btree_invalidatepage(struc
 		btrfs_warn(BTRFS_I(page->mapping->host)->root->fs_info,
 			   "page private not zero on page %llu",
 			   (unsigned long long)page_offset(page));
- 		ClearPagePrivate(page);
- 		set_page_private(page, 0);
- 		put_page(page);
+ 		detach_page_private(page);
 	}
 }
@@@ -1135,9 -1133,12 +1131,12 @@@ static void __setup_root(struct btrfs_r
 	root->log_transid = 0;
 	root->log_transid_committed = -1;
 	root->last_log_commit = 0;
- 	if (!dummy)
+ 	if (!dummy) {
 		extent_io_tree_init(fs_info, &root->dirty_log_pages,
 				    IO_TREE_ROOT_DIRTY_LOG_PAGES, NULL);
+ 		extent_io_tree_init(fs_info, &root->log_csum_range,
+ 				    IO_TREE_LOG_CSUM_RANGE, NULL);
+ 	}

 	memset(&root->root_key, 0, sizeof(root->root_key));
 	memset(&root->root_item, 0, sizeof(root->root_item));
@@@ -1275,12 -1276,13 +1274,13 @@@ static struct btrfs_root *alloc_log_tre
 	root->root_key.offset = BTRFS_TREE_LOG_OBJECTID;

 	/*
- 	 * DON'T set REF_COWS for log trees
+ 	 * DON'T set SHAREABLE bit for log trees.
 	 *
- 	 * log trees do not get reference counted because they go away
- 	 * before a real commit is actually done.  They do store pointers
- 	 * to file data extents, and those reference counts still get
- 	 * updated (along with back refs to the log tree).
+ 	 * Log trees are not exposed to user space thus can't be snapshotted,
+ 	 * and they go away before a real commit is actually done.
+ 	 *
+ 	 * They do store pointers to file data extents, and those reference
+ 	 * counts still get updated (along with back refs to the log tree).
 	 */

 	leaf = btrfs_alloc_tree_block(trans, root, 0, BTRFS_TREE_LOG_OBJECTID,
@@@ -1418,8 -1420,9 +1418,9 @@@ static int btrfs_init_fs_root(struct bt
 	if (ret)
 		goto fail;

- 	if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
- 		set_bit(BTRFS_ROOT_REF_COWS, &root->state);
+ 	if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID &&
+ 	    root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID) {
+ 		set_bit(BTRFS_ROOT_SHAREABLE, &root->state);
 		btrfs_check_and_init_root_item(&root->root_item);
 	}
@@@ -1524,6 -1527,7 +1525,7 @@@ void btrfs_free_fs_info(struct btrfs_fs
 	btrfs_put_root(fs_info->uuid_root);
 	btrfs_put_root(fs_info->free_space_root);
 	btrfs_put_root(fs_info->fs_root);
+ 	btrfs_put_root(fs_info->data_reloc_root);
 	btrfs_check_leaked_roots(fs_info);
 	btrfs_extent_buffer_leak_debug_check(fs_info);
 	kfree(fs_info->super_copy);
@@@ -1533,35 -1537,34 +1535,34 @@@
 struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info,
- 				     struct btrfs_key *location,
- 				     bool check_ref)
+ 				     u64 objectid, bool check_ref)
 {
 	struct btrfs_root *root;
 	struct btrfs_path *path;
 	struct btrfs_key key;
 	int ret;

- 	if (location->objectid == BTRFS_ROOT_TREE_OBJECTID)
+ 	if (objectid == BTRFS_ROOT_TREE_OBJECTID)
 		return btrfs_grab_root(fs_info->tree_root);
- 	if (location->objectid == BTRFS_EXTENT_TREE_OBJECTID)
+ 	if (objectid == BTRFS_EXTENT_TREE_OBJECTID)
 		return btrfs_grab_root(fs_info->extent_root);
- 	if (location->objectid == BTRFS_CHUNK_TREE_OBJECTID)
+ 	if (objectid == BTRFS_CHUNK_TREE_OBJECTID)
 		return btrfs_grab_root(fs_info->chunk_root);
- 	if (location->objectid == BTRFS_DEV_TREE_OBJECTID)
+ 	if (objectid == BTRFS_DEV_TREE_OBJECTID)
 		return btrfs_grab_root(fs_info->dev_root);
- 	if (location->objectid == BTRFS_CSUM_TREE_OBJECTID)
+ 	if (objectid == BTRFS_CSUM_TREE_OBJECTID)
 		return btrfs_grab_root(fs_info->csum_root);
- 	if (location->objectid == BTRFS_QUOTA_TREE_OBJECTID)
+ 	if (objectid == BTRFS_QUOTA_TREE_OBJECTID)
 		return btrfs_grab_root(fs_info->quota_root) ?
 			fs_info->quota_root : ERR_PTR(-ENOENT);
- 	if (location->objectid == BTRFS_UUID_TREE_OBJECTID)
+ 	if (objectid == BTRFS_UUID_TREE_OBJECTID)
 		return btrfs_grab_root(fs_info->uuid_root) ?
 			fs_info->uuid_root : ERR_PTR(-ENOENT);
- 	if (location->objectid == BTRFS_FREE_SPACE_TREE_OBJECTID)
+ 	if (objectid == BTRFS_FREE_SPACE_TREE_OBJECTID)
 		return btrfs_grab_root(fs_info->free_space_root) ?
 			fs_info->free_space_root : ERR_PTR(-ENOENT);
 again:
- 	root = btrfs_lookup_fs_root(fs_info, location->objectid);
+ 	root = btrfs_lookup_fs_root(fs_info, objectid);
 	if (root) {
 		if (check_ref && btrfs_root_refs(&root->root_item) == 0) {
 			btrfs_put_root(root);
@@@ -1570,7 -1573,10 +1571,10 @@@
 		return root;
 	}

- 	root = btrfs_read_tree_root(fs_info->tree_root, location);
+ 	key.objectid = objectid;
+ 	key.type = BTRFS_ROOT_ITEM_KEY;
+ 	key.offset = (u64)-1;
+ 	root = btrfs_read_tree_root(fs_info->tree_root, &key);
 	if (IS_ERR(root))
 		return root;
@@@ -1590,7 -1596,7 +1594,7 @@@
 	}
 	key.objectid = BTRFS_ORPHAN_OBJECTID;
 	key.type = BTRFS_ORPHAN_ITEM_KEY;
- 	key.offset = location->objectid;
+ 	key.offset = objectid;

 	ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
 	btrfs_free_path(path);
@@@ -1940,7 -1946,6 +1944,6 @@@ static void btrfs_stop_all_workers(stru
 	btrfs_destroy_workqueue(fs_info->workers);
 	btrfs_destroy_workqueue(fs_info->endio_workers);
 	btrfs_destroy_workqueue(fs_info->endio_raid56_workers);
- 	btrfs_destroy_workqueue(fs_info->endio_repair_workers);
 	btrfs_destroy_workqueue(fs_info->rmw_workers);
 	btrfs_destroy_workqueue(fs_info->endio_write_workers);
 	btrfs_destroy_workqueue(fs_info->endio_freespace_worker);
@@@ -1981,6 -1986,7 +1984,7 @@@ static void free_root_pointers(struct b
 	free_root_extent_buffers(info->quota_root);
 	free_root_extent_buffers(info->uuid_root);
 	free_root_extent_buffers(info->fs_root);
+ 	free_root_extent_buffers(info->data_reloc_root);
 	if (free_chunk_root)
 		free_root_extent_buffers(info->chunk_root);
 	free_root_extent_buffers(info->free_space_root);
@@@ -1993,6 -1999,7 +1997,7 @@@ void btrfs_put_root(struct btrfs_root *
 	if (refcount_dec_and_test(&root->refs)) {
 		WARN_ON(!RB_EMPTY_ROOT(&root->inode_tree));
+ 		WARN_ON(test_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state));
 		if (root->anon_dev)
 			free_anon_bdev(root->anon_dev);
 		btrfs_drew_lock_destroy(&root->snapshot_lock);
@@@ -2143,8 -2150,6 +2148,6 @@@ static int btrfs_init_workqueues(struc
 	fs_info->endio_raid56_workers =
 		btrfs_alloc_workqueue(fs_info, "endio-raid56", flags,
 				      max_active, 4);
- 	fs_info->endio_repair_workers =
- 		btrfs_alloc_workqueue(fs_info, "endio-repair", flags, 1, 0);
 	fs_info->rmw_workers =
 		btrfs_alloc_workqueue(fs_info, "rmw", flags, max_active, 2);
 	fs_info->endio_write_workers =
@@@ -2168,7 -2173,6 +2171,6 @@@
 	      fs_info->flush_workers &&
 	      fs_info->endio_workers && fs_info->endio_meta_workers &&
 	      fs_info->endio_meta_write_workers &&
- 	      fs_info->endio_repair_workers &&
 	      fs_info->endio_write_workers && fs_info->endio_raid56_workers &&
 	      fs_info->endio_freespace_worker && fs_info->rmw_workers &&
 	      fs_info->caching_workers && fs_info->readahead_workers &&
@@@ -2290,6 -2294,19 +2292,19 @@@ static int btrfs_read_roots(struct btrf
 	set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
 	fs_info->csum_root = root;

+ 	/*
+ 	 * This tree can share blocks with some other fs tree during relocation
+ 	 * and we need a proper setup by btrfs_get_fs_root
+ 	 */
+ 	root = btrfs_get_fs_root(tree_root->fs_info,
+ 				 BTRFS_DATA_RELOC_TREE_OBJECTID, true);
+ 	if (IS_ERR(root)) {
+ 		ret = PTR_ERR(root);
+ 		goto out;
+ 	}
+ 	set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
+ 	fs_info->data_reloc_root = root;
+ 
 	location.objectid = BTRFS_QUOTA_TREE_OBJECTID;
 	root = btrfs_read_tree_root(tree_root, &location);
 	if (!IS_ERR(root)) {
@@@ -2827,7 -2844,6 +2842,6 @@@ int __cold open_ctree(struct super_bloc
 	u64 generation;
 	u64 features;
 	u16 csum_type;
- 	struct btrfs_key location;
 	struct btrfs_super_block *disk_super;
 	struct btrfs_fs_info *fs_info = btrfs_sb(sb);
 	struct btrfs_root *tree_root;
@@@ -3241,11 -3257,7 +3255,7 @@@
 		}
 	}

- 	location.objectid = BTRFS_FS_TREE_OBJECTID;
- 	location.type = BTRFS_ROOT_ITEM_KEY;
- 	location.offset = 0;
- 
- 	fs_info->fs_root = btrfs_get_fs_root(fs_info, &location, true);
+ 	fs_info->fs_root = btrfs_get_fs_root(fs_info, BTRFS_FS_TREE_OBJECTID, true);
 	if (IS_ERR(fs_info->fs_root)) {
 		err = PTR_ERR(fs_info->fs_root);
 		btrfs_warn(fs_info, "failed to read fs tree: %d", err);
@@@ -3508,10 -3520,9 +3518,9 @@@ static int write_dev_supers(struct btrf
 		btrfs_set_super_bytenr(sb, bytenr);

- 		crypto_shash_init(shash);
- 		crypto_shash_update(shash, (const char *)sb + BTRFS_CSUM_SIZE,
- 				    BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE);
- 		crypto_shash_final(shash, sb->csum);
+ 		crypto_shash_digest(shash, (const char *)sb + BTRFS_CSUM_SIZE,
+ 				    BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE,
+ 				    sb->csum);

 		page = find_or_create_page(mapping, bytenr >> PAGE_SHIFT,
 					   GFP_NOFS);
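The btree_invalidatepage() hunk above, and several extent_io.c hunks below,
replace the open-coded page-private reference dance with the new
attach_page_private()/detach_page_private() helpers. Roughly what they do,
sketched from their include/linux/pagemap.h definitions (not copied verbatim):

	static inline void attach_page_private(struct page *page, void *data)
	{
		get_page(page);		/* hold a ref for the private pointer */
		set_page_private(page, (unsigned long)data);
		SetPagePrivate(page);
	}

	static inline void *detach_page_private(struct page *page)
	{
		void *data = (void *)page_private(page);

		if (!PagePrivate(page))
			return NULL;
		ClearPagePrivate(page);
		set_page_private(page, 0);
		put_page(page);		/* drop the ref taken at attach time */

		return data;
	}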
diff --combined fs/btrfs/extent_io.c
index e12eb32d9e17,c59e07360083..68c96057ad2d
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@@ -2333,7 -2333,7 +2333,7 @@@ int repair_io_failure(struct btrfs_fs_i
 	return 0;
 }

- int btrfs_repair_eb_io_failure(struct extent_buffer *eb, int mirror_num)
+ int btrfs_repair_eb_io_failure(const struct extent_buffer *eb, int mirror_num)
 {
 	struct btrfs_fs_info *fs_info = eb->fs_info;
 	u64 start = eb->start;
@@@ -2537,8 -2537,9 +2537,9 @@@ int btrfs_get_io_failure_record(struct
 	return 0;
 }

- bool btrfs_check_repairable(struct inode *inode, unsigned failed_bio_pages,
- 			    struct io_failure_record *failrec, int failed_mirror)
+ static bool btrfs_check_repairable(struct inode *inode, bool needs_validation,
+ 				   struct io_failure_record *failrec,
+ 				   int failed_mirror)
 {
 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
 	int num_copies;
@@@ -2561,7 -2562,7 +2562,7 @@@
 	 * a) deliver good data to the caller
 	 * b) correct the bad sectors on disk
 	 */
- 	if (failed_bio_pages > 1) {
+ 	if (needs_validation) {
 		/*
 		 * to fulfill b), we need to know the exact failing sectors, as
 		 * we don't want to rewrite any more than the failed ones. thus,
@@@ -2600,94 -2601,115 +2601,115 @@@
 	return true;
 }

- struct bio *btrfs_create_repair_bio(struct inode *inode, struct bio *failed_bio,
- 				    struct io_failure_record *failrec,
- 				    struct page *page, int pg_offset, int icsum,
- 				    bio_end_io_t *endio_func, void *data)
+ static bool btrfs_io_needs_validation(struct inode *inode, struct bio *bio)
 {
- 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- 	struct bio *bio;
- 	struct btrfs_io_bio *btrfs_failed_bio;
- 	struct btrfs_io_bio *btrfs_bio;
+ 	u64 len = 0;
+ 	const u32 blocksize = inode->i_sb->s_blocksize;

- 	bio = btrfs_io_bio_alloc(1);
- 	bio->bi_end_io = endio_func;
- 	bio->bi_iter.bi_sector = failrec->logical >> 9;
- 	bio->bi_iter.bi_size = 0;
- 	bio->bi_private = data;
+ 	/*
+ 	 * If bi_status is BLK_STS_OK, then this was a checksum error, not an
+ 	 * I/O error. In this case, we already know exactly which sector was
+ 	 * bad, so we don't need to validate.
+ 	 */
+ 	if (bio->bi_status == BLK_STS_OK)
+ 		return false;

- 	btrfs_failed_bio = btrfs_io_bio(failed_bio);
- 	if (btrfs_failed_bio->csum) {
- 		u16 csum_size = btrfs_super_csum_size(fs_info->super_copy);
+ 	/*
+ 	 * We need to validate each sector individually if the failed I/O was
+ 	 * for multiple sectors.
+ 	 *
+ 	 * There are a few possible bios that can end up here:
+ 	 * 1. A buffered read bio, which is not cloned.
+ 	 * 2. A direct I/O read bio, which is cloned.
+ 	 * 3. A (buffered or direct) repair bio, which is not cloned.
+ 	 *
+ 	 * For cloned bios (case 2), we can get the size from
+ 	 * btrfs_io_bio->iter; for non-cloned bios (cases 1 and 3), we can get
+ 	 * it from the bvecs.
+ 	 */
+ 	if (bio_flagged(bio, BIO_CLONED)) {
+ 		if (btrfs_io_bio(bio)->iter.bi_size > blocksize)
+ 			return true;
+ 	} else {
+ 		struct bio_vec *bvec;
+ 		int i;

- 		btrfs_bio = btrfs_io_bio(bio);
- 		btrfs_bio->csum = btrfs_bio->csum_inline;
- 		icsum *= csum_size;
- 		memcpy(btrfs_bio->csum, btrfs_failed_bio->csum + icsum,
- 		       csum_size);
+ 		bio_for_each_bvec_all(bvec, bio, i) {
+ 			len += bvec->bv_len;
+ 			if (len > blocksize)
+ 				return true;
+ 		}
 	}
- 
- 	bio_add_page(bio, page, failrec->len, pg_offset);
- 
- 	return bio;
+ 	return false;
 }

- /*
-  * This is a generic handler for readpage errors. If other copies exist, read
-  * those and write back good data to the failed position. Does not investigate
-  * in remapping the failed extent elsewhere, hoping the device will be smart
-  * enough to do this as needed
-  */
- static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset,
- 			      struct page *page, u64 start, u64 end,
- 			      int failed_mirror)
+ blk_status_t btrfs_submit_read_repair(struct inode *inode,
+ 				      struct bio *failed_bio, u64 phy_offset,
+ 				      struct page *page, unsigned int pgoff,
+ 				      u64 start, u64 end, int failed_mirror,
+ 				      submit_bio_hook_t *submit_bio_hook)
 {
 	struct io_failure_record *failrec;
- 	struct inode *inode = page->mapping->host;
+ 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
 	struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
 	struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
- 	struct bio *bio;
- 	int read_mode = 0;
+ 	struct btrfs_io_bio *failed_io_bio = btrfs_io_bio(failed_bio);
+ 	const int icsum = phy_offset >> inode->i_sb->s_blocksize_bits;
+ 	bool need_validation;
+ 	struct bio *repair_bio;
+ 	struct btrfs_io_bio *repair_io_bio;
 	blk_status_t status;
 	int ret;
- 	unsigned failed_bio_pages = failed_bio->bi_iter.bi_size >> PAGE_SHIFT;
+ 
+ 	btrfs_debug(fs_info,
+ 		   "repair read error: read error at %llu", start);

 	BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE);

 	ret = btrfs_get_io_failure_record(inode, start, end, &failrec);
 	if (ret)
- 		return ret;
+ 		return errno_to_blk_status(ret);
+ 
+ 	need_validation = btrfs_io_needs_validation(inode, failed_bio);

- 	if (!btrfs_check_repairable(inode, failed_bio_pages, failrec,
+ 	if (!btrfs_check_repairable(inode, need_validation, failrec,
 				    failed_mirror)) {
 		free_io_failure(failure_tree, tree, failrec);
- 		return -EIO;
+ 		return BLK_STS_IOERR;
 	}

- 	if (failed_bio_pages > 1)
- 		read_mode |= REQ_FAILFAST_DEV;
+ 	repair_bio = btrfs_io_bio_alloc(1);
+ 	repair_io_bio = btrfs_io_bio(repair_bio);
+ 	repair_bio->bi_opf = REQ_OP_READ;
+ 	if (need_validation)
+ 		repair_bio->bi_opf |= REQ_FAILFAST_DEV;
+ 	repair_bio->bi_end_io = failed_bio->bi_end_io;
+ 	repair_bio->bi_iter.bi_sector = failrec->logical >> 9;
+ 	repair_bio->bi_private = failed_bio->bi_private;

- 	phy_offset >>= inode->i_sb->s_blocksize_bits;
- 	bio = btrfs_create_repair_bio(inode, failed_bio, failrec, page,
- 				      start - page_offset(page),
- 				      (int)phy_offset, failed_bio->bi_end_io,
- 				      NULL);
- 	bio->bi_opf = REQ_OP_READ | read_mode;
+ 	if (failed_io_bio->csum) {
+ 		const u16 csum_size = btrfs_super_csum_size(fs_info->super_copy);
+ 
+ 		repair_io_bio->csum = repair_io_bio->csum_inline;
+ 		memcpy(repair_io_bio->csum,
+ 		       failed_io_bio->csum + csum_size * icsum, csum_size);
+ 	}
+ 
+ 	bio_add_page(repair_bio, page, failrec->len, pgoff);
+ 	repair_io_bio->logical = failrec->start;
+ 	repair_io_bio->iter = repair_bio->bi_iter;

 	btrfs_debug(btrfs_sb(inode->i_sb),
- 		    "Repair Read Error: submitting new read[%#x] to this_mirror=%d, in_validation=%d",
- 		    read_mode, failrec->this_mirror, failrec->in_validation);
+ 		    "repair read error: submitting new read to mirror %d, in_validation=%d",
+ 		    failrec->this_mirror, failrec->in_validation);

- 	status = tree->ops->submit_bio_hook(tree->private_data, bio, failrec->this_mirror,
- 					    failrec->bio_flags);
+ 	status = submit_bio_hook(inode, repair_bio, failrec->this_mirror,
+ 				 failrec->bio_flags);
 	if (status) {
 		free_io_failure(failure_tree, tree, failrec);
- 		bio_put(bio);
- 		ret = blk_status_to_errno(status);
+ 		bio_put(repair_bio);
 	}
- 
- 	return ret;
+ 	return status;
 }

 /* lots and lots of room for performance fixes in the end_bio funcs */
@@@ -2859,9 -2881,10 +2881,10 @@@ static void end_bio_extent_readpage(str
 			 * If it can't handle the error it will return -EIO and
 			 * we remain responsible for that page.
 			 */
- 			ret = bio_readpage_error(bio, offset, page, start, end,
- 						 mirror);
- 			if (ret == 0) {
+ 			if (!btrfs_submit_read_repair(inode, bio, offset, page,
+ 						      start - page_offset(page),
+ 						      start, end, mirror,
+ 						      tree->ops->submit_bio_hook)) {
 				uptodate = !bio->bi_status;
 				offset += len;
 				continue;
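The new btrfs_io_needs_validation() above decides whether a failed bio spans
more than one sector; for the non-cloned case it walks the bvecs. In isolation
that walk looks like this (standalone sketch using the same
bio_for_each_bvec_all iterator as the hunk; the function name is illustrative):

	static bool bio_spans_multiple_blocks(struct bio *bio, u32 blocksize)
	{
		struct bio_vec *bvec;
		u64 len = 0;
		int i;

		/* Accumulate segment lengths; bail out past one block. */
		bio_for_each_bvec_all(bvec, bio, i) {
			len += bvec->bv_len;
			if (len > blocksize)
				return true;
		}
		return false;
	}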
@@@ -3076,16 -3099,22 +3099,16 @@@ static int submit_extent_page(unsigned
 static void attach_extent_buffer_page(struct extent_buffer *eb,
 				      struct page *page)
 {
- 	if (!PagePrivate(page)) {
- 		SetPagePrivate(page);
- 		get_page(page);
- 		set_page_private(page, (unsigned long)eb);
- 	} else {
+ 	if (!PagePrivate(page))
+ 		attach_page_private(page, eb);
+ 	else
 		WARN_ON(page->private != (unsigned long)eb);
- 	}
 }

 void set_page_extent_mapped(struct page *page)
 {
- 	if (!PagePrivate(page)) {
- 		SetPagePrivate(page);
- 		get_page(page);
- 		set_page_private(page, EXTENT_PAGE_PRIVATE);
- 	}
+ 	if (!PagePrivate(page))
+ 		attach_page_private(page, (void *)EXTENT_PAGE_PRIVATE);
 }

 static struct extent_map *
@@@ -4361,32 -4390,51 +4384,32 @@@ int extent_writepages(struct address_sp
 	return ret;
 }

-int extent_readpages(struct address_space *mapping, struct list_head *pages,
-		     unsigned nr_pages)
+void extent_readahead(struct readahead_control *rac)
 {
 	struct bio *bio = NULL;
 	unsigned long bio_flags = 0;
 	struct page *pagepool[16];
 	struct extent_map *em_cached = NULL;
-	int nr = 0;
 	u64 prev_em_start = (u64)-1;
+	int nr;

-	while (!list_empty(pages)) {
-		u64 contig_end = 0;
-
-		for (nr = 0; nr < ARRAY_SIZE(pagepool) && !list_empty(pages);) {
-			struct page *page = lru_to_page(pages);
-
-			prefetchw(&page->flags);
-			list_del(&page->lru);
-			if (add_to_page_cache_lru(page, mapping, page->index,
-						readahead_gfp_mask(mapping))) {
-				put_page(page);
-				break;
-			}
-
-			pagepool[nr++] = page;
-			contig_end = page_offset(page) + PAGE_SIZE - 1;
-		}
-
-		if (nr) {
-			u64 contig_start = page_offset(pagepool[0]);
+	while ((nr = readahead_page_batch(rac, pagepool))) {
+		u64 contig_start = page_offset(pagepool[0]);
+		u64 contig_end = page_offset(pagepool[nr - 1]) + PAGE_SIZE - 1;

-			ASSERT(contig_start + nr * PAGE_SIZE - 1 == contig_end);
+		ASSERT(contig_start + nr * PAGE_SIZE - 1 == contig_end);

-			contiguous_readpages(pagepool, nr, contig_start,
-				     contig_end, &em_cached, &bio, &bio_flags,
-				     &prev_em_start);
-		}
+		contiguous_readpages(pagepool, nr, contig_start, contig_end,
+				&em_cached, &bio, &bio_flags, &prev_em_start);
 	}

 	if (em_cached)
 		free_extent_map(em_cached);

-	if (bio)
-		return submit_one_bio(bio, 0, bio_flags);
-	return 0;
+	if (bio) {
+		if (submit_one_bio(bio, 0, bio_flags))
+			return;
+	}
 }

 /*
@@@ -4862,7 -4910,7 +4885,7 @@@ static void __free_extent_buffer(struc
 	kmem_cache_free(extent_buffer_cache, eb);
 }

- int extent_buffer_under_io(struct extent_buffer *eb)
+ int extent_buffer_under_io(const struct extent_buffer *eb)
 {
 	return (atomic_read(&eb->io_pages) ||
 		test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags) ||
@@@ -4904,7 -4952,10 +4927,7 @@@ static void btrfs_release_extent_buffer
 		 * We need to make sure we haven't be attached
 		 * to a new eb.
 		 */
- 		ClearPagePrivate(page);
- 		set_page_private(page, 0);
- 		/* One for the page private */
- 		put_page(page);
+ 		detach_page_private(page);
 	}

 	if (mapped)
@@@ -4967,7 -5018,7 +4990,7 @@@ __alloc_extent_buffer(struct btrfs_fs_i
 	return eb;
 }

- struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src)
+ struct extent_buffer *btrfs_clone_extent_buffer(const struct extent_buffer *src)
 {
 	int i;
 	struct page *p;
@@@ -5373,7 -5424,7 +5396,7 @@@ void free_extent_buffer_stale(struct ex
 	release_extent_buffer(eb);
 }

- void clear_extent_buffer_dirty(struct extent_buffer *eb)
+ void clear_extent_buffer_dirty(const struct extent_buffer *eb)
 {
 	int i;
 	int num_pages;
@@@ -5571,8 -5622,7 +5594,7 @@@ void read_extent_buffer(const struct ex
 	struct page *page;
 	char *kaddr;
 	char *dst = (char *)dstv;
- 	size_t start_offset = offset_in_page(eb->start);
- 	unsigned long i = (start_offset + start) >> PAGE_SHIFT;
+ 	unsigned long i = start >> PAGE_SHIFT;

 	if (start + len > eb->len) {
 		WARN(1, KERN_ERR "btrfs bad mapping eb start %llu len %lu, wanted %lu %lu\n",
@@@ -5581,7 -5631,7 +5603,7 @@@
 		return;
 	}

- 	offset = offset_in_page(start_offset + start);
+ 	offset = offset_in_page(start);

 	while (len > 0) {
 		page = eb->pages[i];
@@@ -5606,14 -5656,13 +5628,13 @@@ int read_extent_buffer_to_user(const st
 	struct page *page;
 	char *kaddr;
 	char __user *dst = (char __user *)dstv;
- 	size_t start_offset = offset_in_page(eb->start);
- 	unsigned long i = (start_offset + start) >> PAGE_SHIFT;
+ 	unsigned long i = start >> PAGE_SHIFT;
 	int ret = 0;

 	WARN_ON(start > eb->len);
 	WARN_ON(start + len > eb->start + eb->len);

- 	offset = offset_in_page(start_offset + start);
+ 	offset = offset_in_page(start);

 	while (len > 0) {
 		page = eb->pages[i];
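extent_readahead() above is the new readahead entry point: instead of draining
an LRU list page by page, the readahead_control API hands the filesystem
batches of pages that are already locked and contiguous in the file. The usage
pattern, reduced to a skeleton (sketch; example_readahead is an illustrative
name, not part of the patch):

	void example_readahead(struct readahead_control *rac)
	{
		struct page *pagepool[16];
		int nr;

		while ((nr = readahead_page_batch(rac, pagepool))) {
			/* Pages in a batch are file-contiguous. */
			u64 start = page_offset(pagepool[0]);
			u64 end = page_offset(pagepool[nr - 1]) + PAGE_SIZE - 1;

			/* ... build and submit reads for [start, end] ... */
		}
	}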
@@@ -5634,48 -5683,6 +5655,6 @@@
 	return ret;
 }

- /*
-  * return 0 if the item is found within a page.
-  * return 1 if the item spans two pages.
-  * return -EINVAL otherwise.
-  */
- int map_private_extent_buffer(const struct extent_buffer *eb,
- 			      unsigned long start, unsigned long min_len,
- 			      char **map, unsigned long *map_start,
- 			      unsigned long *map_len)
- {
- 	size_t offset;
- 	char *kaddr;
- 	struct page *p;
- 	size_t start_offset = offset_in_page(eb->start);
- 	unsigned long i = (start_offset + start) >> PAGE_SHIFT;
- 	unsigned long end_i = (start_offset + start + min_len - 1) >>
- 		PAGE_SHIFT;
- 
- 	if (start + min_len > eb->len) {
- 		WARN(1, KERN_ERR "btrfs bad mapping eb start %llu len %lu, wanted %lu %lu\n",
- 		     eb->start, eb->len, start, min_len);
- 		return -EINVAL;
- 	}
- 
- 	if (i != end_i)
- 		return 1;
- 
- 	if (i == 0) {
- 		offset = start_offset;
- 		*map_start = 0;
- 	} else {
- 		offset = 0;
- 		*map_start = ((u64)i << PAGE_SHIFT) - start_offset;
- 	}
- 
- 	p = eb->pages[i];
- 	kaddr = page_address(p);
- 	*map = kaddr + offset;
- 	*map_len = PAGE_SIZE - offset;
- 	return 0;
- }
- 
 int memcmp_extent_buffer(const struct extent_buffer *eb, const void *ptrv,
 			 unsigned long start, unsigned long len)
 {
@@@ -5684,14 -5691,13 +5663,13 @@@
 	struct page *page;
 	char *kaddr;
 	char *ptr = (char *)ptrv;
- 	size_t start_offset = offset_in_page(eb->start);
- 	unsigned long i = (start_offset + start) >> PAGE_SHIFT;
+ 	unsigned long i = start >> PAGE_SHIFT;
 	int ret = 0;

 	WARN_ON(start > eb->len);
 	WARN_ON(start + len > eb->start + eb->len);

- 	offset = offset_in_page(start_offset + start);
+ 	offset = offset_in_page(start);

 	while (len > 0) {
 		page = eb->pages[i];
@@@ -5711,7 -5717,7 +5689,7 @@@
 	return ret;
 }

- void write_extent_buffer_chunk_tree_uuid(struct extent_buffer *eb,
+ void write_extent_buffer_chunk_tree_uuid(const struct extent_buffer *eb,
 		const void *srcv)
 {
 	char *kaddr;
@@@ -5722,7 -5728,7 +5700,7 @@@
 			BTRFS_FSID_SIZE);
 }

- void write_extent_buffer_fsid(struct extent_buffer *eb, const void *srcv)
+ void write_extent_buffer_fsid(const struct extent_buffer *eb, const void *srcv)
 {
 	char *kaddr;
@@@ -5732,7 -5738,7 +5710,7 @@@
 			BTRFS_FSID_SIZE);
 }

- void write_extent_buffer(struct extent_buffer *eb, const void *srcv,
+ void write_extent_buffer(const struct extent_buffer *eb, const void *srcv,
 			 unsigned long start, unsigned long len)
 {
 	size_t cur;
@@@ -5740,13 -5746,12 +5718,12 @@@
 	struct page *page;
 	char *kaddr;
 	char *src = (char *)srcv;
- 	size_t start_offset = offset_in_page(eb->start);
- 	unsigned long i = (start_offset + start) >> PAGE_SHIFT;
+ 	unsigned long i = start >> PAGE_SHIFT;

 	WARN_ON(start > eb->len);
 	WARN_ON(start + len > eb->start + eb->len);

- 	offset = offset_in_page(start_offset + start);
+ 	offset = offset_in_page(start);

 	while (len > 0) {
 		page = eb->pages[i];
@@@ -5763,20 -5768,19 +5740,19 @@@
 	}
 }

- void memzero_extent_buffer(struct extent_buffer *eb, unsigned long start,
+ void memzero_extent_buffer(const struct extent_buffer *eb, unsigned long start,
 		unsigned long len)
 {
 	size_t cur;
 	size_t offset;
 	struct page *page;
 	char *kaddr;
- 	size_t start_offset = offset_in_page(eb->start);
- 	unsigned long i = (start_offset + start) >> PAGE_SHIFT;
+ 	unsigned long i = start >> PAGE_SHIFT;

 	WARN_ON(start > eb->len);
 	WARN_ON(start + len > eb->start + eb->len);

- 	offset = offset_in_page(start_offset + start);
+ 	offset = offset_in_page(start);

 	while (len > 0) {
 		page = eb->pages[i];
@@@ -5792,8 -5796,8 +5768,8 @@@
 	}
 }

- void copy_extent_buffer_full(struct extent_buffer *dst,
- 			     struct extent_buffer *src)
+ void copy_extent_buffer_full(const struct extent_buffer *dst,
+ 			     const struct extent_buffer *src)
 {
 	int i;
 	int num_pages;
@@@ -5806,7 -5810,8 +5782,8 @@@
 				page_address(src->pages[i]));
 }

- void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src,
+ void copy_extent_buffer(const struct extent_buffer *dst,
+ 			const struct extent_buffer *src,
 			unsigned long dst_offset, unsigned long src_offset,
 			unsigned long len)
 {
@@@ -5815,12 -5820,11 +5792,11 @@@
 	size_t offset;
 	struct page *page;
 	char *kaddr;
- 	size_t start_offset = offset_in_page(dst->start);
- 	unsigned long i = (start_offset + dst_offset) >> PAGE_SHIFT;
+ 	unsigned long i = dst_offset >> PAGE_SHIFT;

 	WARN_ON(src->len != dst_len);

- 	offset = offset_in_page(start_offset + dst_offset);
+ 	offset = offset_in_page(dst_offset);

 	while (len > 0) {
 		page = dst->pages[i];
@@@ -5851,12 -5855,11 +5827,11 @@@
  * This helper hides the ugliness of finding the byte in an extent buffer which
  * contains a given bit.
  */
- static inline void eb_bitmap_offset(struct extent_buffer *eb,
+ static inline void eb_bitmap_offset(const struct extent_buffer *eb,
 				    unsigned long start, unsigned long nr,
 				    unsigned long *page_index,
 				    size_t *page_offset)
 {
- 	size_t start_offset = offset_in_page(eb->start);
 	size_t byte_offset = BIT_BYTE(nr);
 	size_t offset;
@@@ -5865,7 -5868,7 +5840,7 @@@
 	 * the bitmap item in the extent buffer + the offset of the byte in the
 	 * bitmap item.
 	 */
- 	offset = start_offset + start + byte_offset;
+ 	offset = start + byte_offset;

 	*page_index = offset >> PAGE_SHIFT;
 	*page_offset = offset_in_page(offset);
@@@ -5877,7 -5880,7 +5852,7 @@@
  * @start: offset of the bitmap item in the extent buffer
  * @nr: bit number to test
  */
- int extent_buffer_test_bit(struct extent_buffer *eb, unsigned long start,
+ int extent_buffer_test_bit(const struct extent_buffer *eb, unsigned long start,
 			   unsigned long nr)
 {
 	u8 *kaddr;
@@@ -5899,7 -5902,7 +5874,7 @@@
  * @pos: bit number of the first bit
  * @len: number of bits to set
  */
- void extent_buffer_bitmap_set(struct extent_buffer *eb, unsigned long start,
+ void extent_buffer_bitmap_set(const struct extent_buffer *eb, unsigned long start,
 			      unsigned long pos, unsigned long len)
 {
 	u8 *kaddr;
@@@ -5941,8 -5944,9 +5916,9 @@@
  * @pos: bit number of the first bit
  * @len: number of bits to clear
  */
- void extent_buffer_bitmap_clear(struct extent_buffer *eb, unsigned long start,
- 				unsigned long pos, unsigned long len)
+ void extent_buffer_bitmap_clear(const struct extent_buffer *eb,
+ 				unsigned long start, unsigned long pos,
+ 				unsigned long len)
 {
 	u8 *kaddr;
 	struct page *page;
@@@ -6003,14 -6007,14 +5979,14 @@@ static void copy_pages(struct page *dst
 	memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len);
 }

- void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
- 			  unsigned long src_offset, unsigned long len)
+ void memcpy_extent_buffer(const struct extent_buffer *dst,
+ 			  unsigned long dst_offset, unsigned long src_offset,
+ 			  unsigned long len)
 {
 	struct btrfs_fs_info *fs_info = dst->fs_info;
 	size_t cur;
 	size_t dst_off_in_page;
 	size_t src_off_in_page;
- 	size_t start_offset = offset_in_page(dst->start);
 	unsigned long dst_i;
 	unsigned long src_i;
@@@ -6028,11 -6032,11 +6004,11 @@@
 	}

 	while (len > 0) {
- 		dst_off_in_page = offset_in_page(start_offset + dst_offset);
- 		src_off_in_page = offset_in_page(start_offset + src_offset);
+ 		dst_off_in_page = offset_in_page(dst_offset);
+ 		src_off_in_page = offset_in_page(src_offset);

- 		dst_i = (start_offset + dst_offset) >> PAGE_SHIFT;
- 		src_i = (start_offset + src_offset) >> PAGE_SHIFT;
+ 		dst_i = dst_offset >> PAGE_SHIFT;
+ 		src_i = src_offset >> PAGE_SHIFT;

 		cur = min(len, (unsigned long)(PAGE_SIZE -
 					       src_off_in_page));
@@@ -6048,8 -6052,9 +6024,9 @@@
 	}
 }

- void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
- 			   unsigned long src_offset, unsigned long len)
+ void memmove_extent_buffer(const struct extent_buffer *dst,
+ 			   unsigned long dst_offset, unsigned long src_offset,
+ 			   unsigned long len)
 {
 	struct btrfs_fs_info *fs_info = dst->fs_info;
 	size_t cur;
@@@ -6057,7 -6062,6 +6034,6 @@@
 	size_t src_off_in_page;
 	unsigned long dst_end = dst_offset + len - 1;
 	unsigned long src_end = src_offset + len - 1;
- 	size_t start_offset = offset_in_page(dst->start);
 	unsigned long dst_i;
 	unsigned long src_i;
@@@ -6078,11 -6082,11 +6054,11 @@@
 		return;
 	}
 	while (len > 0) {
- 		dst_i = (start_offset + dst_end) >> PAGE_SHIFT;
- 		src_i = (start_offset + src_end) >> PAGE_SHIFT;
+ 		dst_i = dst_end >> PAGE_SHIFT;
+ 		src_i = src_end >> PAGE_SHIFT;

- 		dst_off_in_page = offset_in_page(start_offset + dst_end);
- 		src_off_in_page = offset_in_page(start_offset + src_end);
+ 		dst_off_in_page = offset_in_page(dst_end);
+ 		src_off_in_page = offset_in_page(src_end);

 		cur = min_t(unsigned long, len, src_off_in_page + 1);
 		cur = min(cur, dst_off_in_page + 1);
diff --combined fs/btrfs/extent_io.h
index 25594e09fdcd,9a10681b12bf..602bf3af9fb4
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@@ -66,6 -66,10 +66,10 @@@ struct btrfs_io_bio
 struct io_failure_record;
 struct extent_io_tree;

+ typedef blk_status_t (submit_bio_hook_t)(struct inode *inode, struct bio *bio,
+ 					 int mirror_num,
+ 					 unsigned long bio_flags);
+ 
 typedef blk_status_t (extent_submit_bio_start_t)(void *private_data,
 		struct bio *bio, u64 bio_offset);
@@@ -74,8 -78,7 +78,7 @@@ struct extent_io_ops
 	 * The following callbacks must be always defined, the function
 	 * pointer will be called unconditionally.
 	 */
- 	blk_status_t (*submit_bio_hook)(struct inode *inode, struct bio *bio,
- 					int mirror_num, unsigned long bio_flags);
+ 	submit_bio_hook_t *submit_bio_hook;
 	int (*readpage_end_io_hook)(struct btrfs_io_bio *io_bio, u64 phy_offset,
 				    struct page *page, u64 start, u64 end,
 				    int mirror);
@@@ -198,7 -201,8 +201,7 @@@ int extent_writepages(struct address_sp
		      struct writeback_control *wbc);
 int btree_write_cache_pages(struct address_space *mapping,
 			    struct writeback_control *wbc);
-int extent_readpages(struct address_space *mapping, struct list_head *pages,
-		     unsigned nr_pages);
+void extent_readahead(struct readahead_control *rac);
 int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 		  __u64 start, __u64 len);
 void set_page_extent_mapped(struct page *page);
@@@ -209,7 -213,7 +212,7 @@@ struct extent_buffer *__alloc_dummy_ext
 		u64 start, unsigned long len);
 struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
 						u64 start);
- struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src);
+ struct extent_buffer *btrfs_clone_extent_buffer(const struct extent_buffer *src);
 struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info,
 					 u64 start);
 void free_extent_buffer(struct extent_buffer *eb);
@@@ -227,7 -231,7 +230,7 @@@ static inline int num_extent_pages(cons
 		(eb->start >> PAGE_SHIFT);
 }

- static inline int extent_buffer_uptodate(struct extent_buffer *eb)
+ static inline int extent_buffer_uptodate(const struct extent_buffer *eb)
 {
 	return test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
 }
@@@ -240,37 -244,37 +243,37 @@@ void read_extent_buffer(const struct ex
 int read_extent_buffer_to_user(const struct extent_buffer *eb,
 			       void __user *dst, unsigned long start,
 			       unsigned long len);
- void write_extent_buffer_fsid(struct extent_buffer *eb, const void *src);
- void write_extent_buffer_chunk_tree_uuid(struct extent_buffer *eb,
+ void write_extent_buffer_fsid(const struct extent_buffer *eb, const void *src);
+ void write_extent_buffer_chunk_tree_uuid(const struct extent_buffer *eb,
 					 const void *src);
- void write_extent_buffer(struct extent_buffer *eb, const void *src,
+ void write_extent_buffer(const struct extent_buffer *eb, const void *src,
 			 unsigned long start, unsigned long len);
- void copy_extent_buffer_full(struct extent_buffer *dst,
- 			     struct extent_buffer *src);
- void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src,
+ void copy_extent_buffer_full(const struct extent_buffer *dst,
+ 			     const struct extent_buffer *src);
+ void copy_extent_buffer(const struct extent_buffer *dst,
+ 			const struct extent_buffer *src,
 			unsigned long dst_offset, unsigned long src_offset,
 			unsigned long len);
- void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
- 			  unsigned long src_offset, unsigned long len);
- void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
- 			   unsigned long src_offset, unsigned long len);
- void memzero_extent_buffer(struct extent_buffer *eb, unsigned long start,
+ void memcpy_extent_buffer(const struct extent_buffer *dst,
+ 			  unsigned long dst_offset, unsigned long src_offset,
+ 			  unsigned long len);
+ void memmove_extent_buffer(const struct extent_buffer *dst,
+ 			   unsigned long dst_offset, unsigned long src_offset,
+ 			   unsigned long len);
+ void memzero_extent_buffer(const struct extent_buffer *eb, unsigned long start,
 			   unsigned long len);
- int extent_buffer_test_bit(struct extent_buffer *eb, unsigned long start,
+ int extent_buffer_test_bit(const struct extent_buffer *eb, unsigned long start,
 			   unsigned long pos);
- void extent_buffer_bitmap_set(struct extent_buffer *eb, unsigned long start,
+ void extent_buffer_bitmap_set(const struct extent_buffer *eb, unsigned long start,
 			      unsigned long pos, unsigned long len);
- void extent_buffer_bitmap_clear(struct extent_buffer *eb, unsigned long start,
- 				unsigned long pos, unsigned long len);
- void clear_extent_buffer_dirty(struct extent_buffer *eb);
+ void extent_buffer_bitmap_clear(const struct extent_buffer *eb,
+ 				unsigned long start, unsigned long pos,
+ 				unsigned long len);
+ void clear_extent_buffer_dirty(const struct extent_buffer *eb);
 bool set_extent_buffer_dirty(struct extent_buffer *eb);
 void set_extent_buffer_uptodate(struct extent_buffer *eb);
 void clear_extent_buffer_uptodate(struct extent_buffer *eb);
- int extent_buffer_under_io(struct extent_buffer *eb);
- int map_private_extent_buffer(const struct extent_buffer *eb,
- 			      unsigned long offset, unsigned long min_len,
- 			      char **map, unsigned long *map_start,
- 			      unsigned long *map_len);
+ int extent_buffer_under_io(const struct extent_buffer *eb);
 void extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end);
 void extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end);
 void extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end,
@@@ -289,7 -293,7 +292,7 @@@ int repair_io_failure(struct btrfs_fs_i
 		      u64 length, u64 logical, struct page *page,
 		      unsigned int pg_offset, int mirror_num);
 void end_extent_writepage(struct page *page, int err, u64 start, u64 end);
- int btrfs_repair_eb_io_failure(struct extent_buffer *eb, int mirror_num);
+ int btrfs_repair_eb_io_failure(const struct extent_buffer *eb, int mirror_num);

 /*
  * When IO fails, either with EIO or csum verification fails, we
@@@ -311,12 -315,12 +314,12 @@@ struct io_failure_record
 };

- bool btrfs_check_repairable(struct inode *inode, unsigned failed_bio_pages,
- 			    struct io_failure_record *failrec, int fail_mirror);
- struct bio *btrfs_create_repair_bio(struct inode *inode, struct bio *failed_bio,
- 				    struct io_failure_record *failrec,
- 				    struct page *page, int pg_offset, int icsum,
- 				    bio_end_io_t *endio_func, void *data);
+ blk_status_t btrfs_submit_read_repair(struct inode *inode,
+ 				      struct bio *failed_bio, u64 phy_offset,
+ 				      struct page *page, unsigned int pgoff,
+ 				      u64 start, u64 end, int failed_mirror,
+ 				      submit_bio_hook_t *submit_bio_hook);
+ 
 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
 bool find_lock_delalloc_range(struct inode *inode,
 			      struct page *locked_page, u64 *start,
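A recurring simplification in the extent_io.c hunks above is the removal of
start_offset = offset_in_page(eb->start): in-buffer offsets now map directly
to a page index plus an intra-page offset, which is valid on the assumption
(made by this series) that an extent buffer's start is page aligned. The
before/after of the index math, as a sketch (locate_byte is illustrative):

	static void locate_byte(const struct extent_buffer *eb,
				unsigned long start,
				unsigned long *page_index, size_t *page_offset)
	{
		/*
		 * Before, offset_in_page(eb->start) had to be folded in:
		 *   i = (offset_in_page(eb->start) + start) >> PAGE_SHIFT;
		 * After, with eb->start page aligned:
		 */
		*page_index = start >> PAGE_SHIFT;
		*page_offset = offset_in_page(start);
	}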
diff --combined fs/btrfs/inode.c
index 8b3489f229c7,1242d0aa108d..768c8be4c765
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@@ -5,7 -5,6 +5,6 @@@
 #include
 #include
- #include <linux/buffer_head.h>
 #include
 #include
 #include
@@@ -49,17 -48,18 +48,18 @@@
 #include "qgroup.h"
 #include "delalloc-space.h"
 #include "block-group.h"
+ #include "space-info.h"

 struct btrfs_iget_args {
- 	struct btrfs_key *location;
+ 	u64 ino;
 	struct btrfs_root *root;
 };

 struct btrfs_dio_data {
 	u64 reserve;
- 	u64 unsubmitted_oe_range_start;
- 	u64 unsubmitted_oe_range_end;
- 	int overwrite;
+ 	loff_t length;
+ 	ssize_t submitted;
+ 	struct extent_changeset *data_reserved;
 };

 static const struct inode_operations btrfs_dir_inode_operations;
@@@ -1142,7 -1142,7 +1142,7 @@@ out_unlock
 	 */
 	if (extent_reserved) {
 		extent_clear_unlock_delalloc(inode, start,
- 					     start + cur_alloc_size,
+ 					     start + cur_alloc_size - 1,
 					     locked_page,
 					     clear_bits,
 					     page_ops);
@@@ -1355,6 -1355,66 +1355,66 @@@ static noinline int csum_exist_in_range
 	return 1;
 }

+ static int fallback_to_cow(struct inode *inode, struct page *locked_page,
+ 			   const u64 start, const u64 end,
+ 			   int *page_started, unsigned long *nr_written)
+ {
+ 	const bool is_space_ino = btrfs_is_free_space_inode(BTRFS_I(inode));
+ 	const u64 range_bytes = end + 1 - start;
+ 	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+ 	u64 range_start = start;
+ 	u64 count;
+ 
+ 	/*
+ 	 * If EXTENT_NORESERVE is set it means that when the buffered write was
+ 	 * made we had not enough available data space and therefore we did not
+ 	 * reserve data space for it, since we though we could do NOCOW for the
+ 	 * respective file range (either there is prealloc extent or the inode
+ 	 * has the NOCOW bit set).
+ 	 *
+ 	 * However when we need to fallback to COW mode (because for example the
+ 	 * block group for the corresponding extent was turned to RO mode by a
+ 	 * scrub or relocation) we need to do the following:
+ 	 *
+ 	 * 1) We increment the bytes_may_use counter of the data space info.
+ 	 *    If COW succeeds, it allocates a new data extent and after doing
+ 	 *    that it decrements the space info's bytes_may_use counter and
+ 	 *    increments its bytes_reserved counter by the same amount (we do
+ 	 *    this at btrfs_add_reserved_bytes()). So we need to increment the
+ 	 *    bytes_may_use counter to compensate (when space is reserved at
+ 	 *    buffered write time, the bytes_may_use counter is incremented);
+ 	 *
+ 	 * 2) We clear the EXTENT_NORESERVE bit from the range. We do this so
+ 	 *    that if the COW path fails for any reason, it decrements (through
+ 	 *    extent_clear_unlock_delalloc()) the bytes_may_use counter of the
+ 	 *    data space info, which we incremented in the step above.
+ 	 *
+ 	 * If we need to fallback to cow and the inode corresponds to a free
+ 	 * space cache inode, we must also increment bytes_may_use of the data
+ 	 * space_info for the same reason. Space caches always get a prealloc
+ 	 * extent for them, however scrub or balance may have set the block
+ 	 * group that contains that extent to RO mode.
+ 	 */
+ 	count = count_range_bits(io_tree, &range_start, end, range_bytes,
+ 				 EXTENT_NORESERVE, 0);
+ 	if (count > 0 || is_space_ino) {
+ 		const u64 bytes = is_space_ino ? range_bytes : count;
+ 		struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
+ 		struct btrfs_space_info *sinfo = fs_info->data_sinfo;
+ 
+ 		spin_lock(&sinfo->lock);
+ 		btrfs_space_info_update_bytes_may_use(fs_info, sinfo, bytes);
+ 		spin_unlock(&sinfo->lock);
+ 
+ 		if (count > 0)
+ 			clear_extent_bit(io_tree, start, end, EXTENT_NORESERVE,
+ 					 0, 0, NULL);
+ 	}
+ 
+ 	return cow_file_range(inode, locked_page, start, end, page_started,
+ 			      nr_written, 1);
+ }
+ 
 /*
  * when nowcow writeback call back. This checks for snapshots or COW copies
  * of the extents that exist in the file, and COWs the file as required.
@@@ -1602,9 -1662,9 +1662,9 @@@ out_check
 			 * NOCOW, following one which needs to be COW'ed
 			 */
 			if (cow_start != (u64)-1) {
- 				ret = cow_file_range(inode, locked_page,
- 						     cow_start, found_key.offset - 1,
- 						     page_started, nr_written, 1);
+ 				ret = fallback_to_cow(inode, locked_page, cow_start,
+ 						      found_key.offset - 1,
+ 						      page_started, nr_written);
 				if (ret) {
 					if (nocow)
 						btrfs_dec_nocow_writers(fs_info,
@@@ -1693,8 -1753,8 +1753,8 @@@

 	if (cow_start != (u64)-1) {
 		cur_offset = end;
- 		ret = cow_file_range(inode, locked_page, cow_start, end,
- 				     page_started, nr_written, 1);
+ 		ret = fallback_to_cow(inode, locked_page, cow_start, end,
+ 				      page_started, nr_written);
 		if (ret)
 			goto error;
 	}
@@@ -2726,10 -2786,9 +2786,9 @@@ void btrfs_writepage_endio_finish_order
 	btrfs_queue_work(wq, &ordered_extent->work);
 }

- static int __readpage_endio_check(struct inode *inode,
- 				  struct btrfs_io_bio *io_bio,
- 				  int icsum, struct page *page,
- 				  int pgoff, u64 start, size_t len)
+ static int check_data_csum(struct inode *inode, struct btrfs_io_bio *io_bio,
+ 			   int icsum, struct page *page, int pgoff, u64 start,
+ 			   size_t len)
 {
 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
 	SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
@@@ -2743,9 -2802,7 +2802,7 @@@
 	kaddr = kmap_atomic(page);
 	shash->tfm = fs_info->csum_shash;

- 	crypto_shash_init(shash);
- 	crypto_shash_update(shash, kaddr + pgoff, len);
- 	crypto_shash_final(shash, csum);
+ 	crypto_shash_digest(shash, kaddr + pgoff, len, csum);

 	if (memcmp(csum, csum_expected, csum_size))
 		goto zeroit;
@@@ -2790,8 -2847,8 +2847,8 @@@ static int btrfs_readpage_end_io_hook(s
 	}

 	phy_offset >>= inode->i_sb->s_blocksize_bits;
- 	return __readpage_endio_check(inode, io_bio, phy_offset, page, offset,
- 				      start, (size_t)(end - start + 1));
+ 	return check_data_csum(inode, io_bio, phy_offset, page, offset, start,
+ 			       (size_t)(end - start + 1));
 }

 /*
@@@ -2981,7 -3038,7 +3038,7 @@@ int btrfs_orphan_cleanup(struct btrfs_r
 		found_key.objectid = found_key.offset;
 		found_key.type = BTRFS_INODE_ITEM_KEY;
 		found_key.offset = 0;
- 		inode = btrfs_iget(fs_info->sb, &found_key, root);
+ 		inode = btrfs_iget(fs_info->sb, last_objectid, root);
 		ret = PTR_ERR_OR_ZERO(inode);
 		if (ret && ret != -ENOENT)
 			goto out;
@@@ -3000,18 -3057,16 +3057,16 @@@
 		 * orphan must not get deleted.
 		 * find_dead_roots already ran before us, so if this
 		 * is a snapshot deletion, we should find the root
- 		 * in the dead_roots list
+ 		 * in the fs_roots radix tree.
 		 */
- 		spin_lock(&fs_info->trans_lock);
- 		list_for_each_entry(dead_root, &fs_info->dead_roots,
- 				    root_list) {
- 			if (dead_root->root_key.objectid ==
- 			    found_key.objectid) {
- 				is_dead_root = 1;
- 				break;
- 			}
- 		}
- 		spin_unlock(&fs_info->trans_lock);
+ 
+ 		spin_lock(&fs_info->fs_roots_radix_lock);
+ 		dead_root = radix_tree_lookup(&fs_info->fs_roots_radix,
+ 					      (unsigned long)found_key.objectid);
+ 		if (dead_root && btrfs_root_refs(&dead_root->root_item) == 0)
+ 			is_dead_root = 1;
+ 		spin_unlock(&fs_info->fs_roots_radix_lock);
+ 
 		if (is_dead_root) {
 			/* prevent this orphan from being found again */
 			key.offset = found_key.objectid - 1;
@@@ -3357,43 -3412,40 +3412,40 @@@ static void fill_inode_item(struct btrf
 	btrfs_init_map_token(&token, leaf);

- 	btrfs_set_token_inode_uid(leaf, item, i_uid_read(inode), &token);
- 	btrfs_set_token_inode_gid(leaf, item, i_gid_read(inode), &token);
- 	btrfs_set_token_inode_size(leaf, item, BTRFS_I(inode)->disk_i_size,
- 				   &token);
- 	btrfs_set_token_inode_mode(leaf, item, inode->i_mode, &token);
- 	btrfs_set_token_inode_nlink(leaf, item, inode->i_nlink, &token);
- 
- 	btrfs_set_token_timespec_sec(leaf, &item->atime,
- 				     inode->i_atime.tv_sec, &token);
- 	btrfs_set_token_timespec_nsec(leaf, &item->atime,
- 				      inode->i_atime.tv_nsec, &token);
- 
- 	btrfs_set_token_timespec_sec(leaf, &item->mtime,
- 				     inode->i_mtime.tv_sec, &token);
- 	btrfs_set_token_timespec_nsec(leaf, &item->mtime,
- 				      inode->i_mtime.tv_nsec, &token);
- 
- 	btrfs_set_token_timespec_sec(leaf, &item->ctime,
- 				     inode->i_ctime.tv_sec, &token);
- 	btrfs_set_token_timespec_nsec(leaf, &item->ctime,
- 				      inode->i_ctime.tv_nsec, &token);
- 
- 	btrfs_set_token_timespec_sec(leaf, &item->otime,
- 				     BTRFS_I(inode)->i_otime.tv_sec, &token);
- 	btrfs_set_token_timespec_nsec(leaf, &item->otime,
- 				      BTRFS_I(inode)->i_otime.tv_nsec, &token);
- 
- 	btrfs_set_token_inode_nbytes(leaf, item, inode_get_bytes(inode),
- 				     &token);
- 	btrfs_set_token_inode_generation(leaf, item, BTRFS_I(inode)->generation,
- 					 &token);
- 	btrfs_set_token_inode_sequence(leaf, item, inode_peek_iversion(inode),
- 				       &token);
- 	btrfs_set_token_inode_transid(leaf, item, trans->transid, &token);
- 	btrfs_set_token_inode_rdev(leaf, item, inode->i_rdev, &token);
- 	btrfs_set_token_inode_flags(leaf, item, BTRFS_I(inode)->flags, &token);
- 	btrfs_set_token_inode_block_group(leaf, item, 0, &token);
+ 	btrfs_set_token_inode_uid(&token, item, i_uid_read(inode));
+ 	btrfs_set_token_inode_gid(&token, item, i_gid_read(inode));
+ 	btrfs_set_token_inode_size(&token, item, BTRFS_I(inode)->disk_i_size);
+ 	btrfs_set_token_inode_mode(&token, item, inode->i_mode);
+ 	btrfs_set_token_inode_nlink(&token, item, inode->i_nlink);
+ 
+ 	btrfs_set_token_timespec_sec(&token, &item->atime,
+ 				     inode->i_atime.tv_sec);
+ 	btrfs_set_token_timespec_nsec(&token, &item->atime,
+ 				      inode->i_atime.tv_nsec);
+ 
+ 	btrfs_set_token_timespec_sec(&token, &item->mtime,
+ 				     inode->i_mtime.tv_sec);
+ 	btrfs_set_token_timespec_nsec(&token, &item->mtime,
+ 				      inode->i_mtime.tv_nsec);
+ 
+ 	btrfs_set_token_timespec_sec(&token, &item->ctime,
+ 				     inode->i_ctime.tv_sec);
+ 	btrfs_set_token_timespec_nsec(&token, &item->ctime,
+ 				      inode->i_ctime.tv_nsec);
+ 
+ 	btrfs_set_token_timespec_sec(&token, &item->otime,
+ 				     BTRFS_I(inode)->i_otime.tv_sec);
+ 	btrfs_set_token_timespec_nsec(&token, &item->otime,
+ 				      BTRFS_I(inode)->i_otime.tv_nsec);
+ 
+ 	btrfs_set_token_inode_nbytes(&token, item, inode_get_bytes(inode));
+ 	btrfs_set_token_inode_generation(&token, item,
+ 					 BTRFS_I(inode)->generation);
+ 	btrfs_set_token_inode_sequence(&token, item, inode_peek_iversion(inode));
+ 	btrfs_set_token_inode_transid(&token, item, trans->transid);
+ 	btrfs_set_token_inode_rdev(&token, item, inode->i_rdev);
+ 	btrfs_set_token_inode_flags(&token, item, BTRFS_I(inode)->flags);
+ 	btrfs_set_token_inode_block_group(&token, item, 0);
 }

 /*
@@@ -3618,7 -3670,7 +3670,7 @@@ static struct btrfs_trans_handle *__unl
 	 * 1 for the inode ref
 	 * 1 for the inode
 	 */
- 	return btrfs_start_transaction_fallback_global_rsv(root, 5, 5);
+ 	return btrfs_start_transaction_fallback_global_rsv(root, 5);
 }

 static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
@@@ -4108,11 -4160,12 +4160,12 @@@ int btrfs_truncate_inode_items(struct b
 	BUG_ON(new_size > 0 && min_type != BTRFS_EXTENT_DATA_KEY);

 	/*
- 	 * for non-free space inodes and ref cows, we want to back off from
- 	 * time to time
+ 	 * For non-free space inodes and non-shareable roots, we want to back
+ 	 * off from time to time.  This means all inodes in subvolume roots,
+ 	 * reloc roots, and data reloc roots.
 	 */
 	if (!btrfs_is_free_space_inode(BTRFS_I(inode)) &&
- 	    test_bit(BTRFS_ROOT_REF_COWS, &root->state))
+ 	    test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
 		be_nice = true;

 	path = btrfs_alloc_path();
@@@ -4120,20 -4173,19 +4173,19 @@@
 		return -ENOMEM;
 	path->reada = READA_BACK;

- 	if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
+ 	if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
 		lock_extent_bits(&BTRFS_I(inode)->io_tree, lock_start,
 				 (u64)-1, &cached_state);

- 	/*
- 	 * We want to drop from the next block forward in case this new size is
- 	 * not block aligned since we will be keeping the last block of the
- 	 * extent just the way it is.
- 	 */
- 	if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) ||
- 	    root == fs_info->tree_root)
+ 		/*
+ 		 * We want to drop from the next block forward in case this
+ 		 * new size is not block aligned since we will be keeping the
+ 		 * last block of the extent just the way it is.
+ 		 */
 		btrfs_drop_extent_cache(BTRFS_I(inode), ALIGN(new_size,
 					fs_info->sectorsize),
 					(u64)-1, 0);
+ 	}

 	/*
 	 * This function is also used to drop the items in the log tree before
@@@ -4241,7 -4293,7 +4293,7 @@@ search_again
 					extent_num_bytes);
 				num_dec = (orig_num_bytes -
 					   extent_num_bytes);
- 				if (test_bit(BTRFS_ROOT_REF_COWS,
+ 				if (test_bit(BTRFS_ROOT_SHAREABLE,
 					     &root->state) &&
 				    extent_start != 0)
 					inode_sub_bytes(inode, num_dec);
@@@ -4257,7 -4309,7 +4309,7 @@@
 				num_dec = btrfs_file_extent_num_bytes(leaf, fi);
 				if (extent_start != 0) {
 					found_extent = 1;
- 					if (test_bit(BTRFS_ROOT_REF_COWS,
+ 					if (test_bit(BTRFS_ROOT_SHAREABLE,
 						     &root->state))
 						inode_sub_bytes(inode, num_dec);
 				}
@@@ -4293,7 -4345,7 +4345,7 @@@
 					clear_len = fs_info->sectorsize;
 			}

- 			if (test_bit(BTRFS_ROOT_REF_COWS, &root->state))
+ 			if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
 				inode_sub_bytes(inode, item_end + 1 - new_size);
 		}
 delete:
@@@ -4334,8 -4386,7 +4386,7 @@@
 		should_throttle = false;

 		if (found_extent &&
- 		    (test_bit(BTRFS_ROOT_REF_COWS, &root->state) ||
- 		     root == fs_info->tree_root)) {
+ 		    root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
 			struct btrfs_ref ref = { 0 };

 			bytes_deleted += extent_num_bytes;
@@@ -4759,10 -4810,7 +4810,7 @@@ static int btrfs_setsize(struct inode *
 		truncate_setsize(inode, newsize);

- 		/* Disable nonlocked read DIO to avoid the endless truncate */
- 		btrfs_inode_block_unlocked_dio(BTRFS_I(inode));
 		inode_dio_wait(inode);
- 		btrfs_inode_resume_unlocked_dio(BTRFS_I(inode));

 		ret = btrfs_truncate(inode, newsize == oldsize);
 		if (ret && inode->i_nlink) {
@@@ -4856,8 -4904,8 +4904,8 @@@ static void evict_inode_truncate_pages(
 	/*
 	 * Keep looping until we have no more ranges in the io tree.
- 	 * We can have ongoing bios started by readpages (called from readahead)
- 	 * that have their endio callback (extent_io.c:end_bio_extent_readpage)
+ 	 * We can have ongoing bios started by readahead that have
+ 	 * their endio callback (extent_io.c:end_bio_extent_readpage)
 	 * still in progress (unlocked the pages in the bio but did not yet
 	 * unlocked the ranges in the io tree). Therefore this means some
 	 * ranges can still be locked and eviction started because before
@@@ -5154,7 -5202,7 +5202,7 @@@ static int fixup_tree_root_location(str
 	btrfs_release_path(path);

- 	new_root = btrfs_get_fs_root(fs_info, location, true);
+ 	new_root = btrfs_get_fs_root(fs_info, location->objectid, true);
 	if (IS_ERR(new_root)) {
 		err = PTR_ERR(new_root);
 		goto out;
@@@ -5232,9 -5280,11 +5280,11 @@@ static void inode_tree_del(struct inod
 static int btrfs_init_locked_inode(struct inode *inode, void *p)
 {
 	struct btrfs_iget_args *args = p;
- 	inode->i_ino = args->location->objectid;
- 	memcpy(&BTRFS_I(inode)->location, args->location,
- 	       sizeof(*args->location));
+ 
+ 	inode->i_ino = args->ino;
+ 	BTRFS_I(inode)->location.objectid = args->ino;
+ 	BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY;
+ 	BTRFS_I(inode)->location.offset = 0;
 	BTRFS_I(inode)->root = btrfs_grab_root(args->root);
 	BUG_ON(args->root && !BTRFS_I(inode)->root);
 	return 0;
@@@ -5243,19 -5293,19 +5293,19 @@@
 static int btrfs_find_actor(struct inode *inode, void *opaque)
 {
 	struct btrfs_iget_args *args = opaque;
- 	return args->location->objectid == BTRFS_I(inode)->location.objectid &&
+ 
+ 	return args->ino == BTRFS_I(inode)->location.objectid &&
 		args->root == BTRFS_I(inode)->root;
 }

- static struct inode *btrfs_iget_locked(struct super_block *s,
- 				       struct btrfs_key *location,
+ static struct inode *btrfs_iget_locked(struct super_block *s, u64 ino,
 				       struct btrfs_root *root)
 {
 	struct inode *inode;
 	struct btrfs_iget_args args;
- 	unsigned long hashval = btrfs_inode_hash(location->objectid, root);
+ 	unsigned long hashval = btrfs_inode_hash(ino, root);

- 	args.location = location;
+ 	args.ino = ino;
 	args.root = root;

 	inode = iget5_locked(s, hashval, btrfs_find_actor,
@@@ -5265,17 -5315,17 +5315,17 @@@
 }

 /*
-  * Get an inode object given its location and corresponding root.
+  * Get an inode object given its inode number and corresponding root.
  * Path can be preallocated to prevent recursing back to iget through
  * allocator. NULL is also valid but may require an additional allocation
  * later.
  */
- struct inode *btrfs_iget_path(struct super_block *s, struct btrfs_key *location,
+ struct inode *btrfs_iget_path(struct super_block *s, u64 ino,
 			      struct btrfs_root *root, struct btrfs_path *path)
 {
 	struct inode *inode;

- 	inode = btrfs_iget_locked(s, location, root);
+ 	inode = btrfs_iget_locked(s, ino, root);
 	if (!inode)
 		return ERR_PTR(-ENOMEM);
@@@ -5302,10 -5352,9 +5352,9 @@@
 	return inode;
 }

- struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
- 			 struct btrfs_root *root)
+ struct inode *btrfs_iget(struct super_block *s, u64 ino, struct btrfs_root *root)
 {
- 	return btrfs_iget_path(s, location, root, NULL);
+ 	return btrfs_iget_path(s, ino, root, NULL);
 }

 static struct inode *new_simple_dir(struct super_block *s,
@@@ -5374,7 -5423,7 +5423,7 @@@ struct inode *btrfs_lookup_dentry(struc
 		return ERR_PTR(ret);

 	if (location.type == BTRFS_INODE_ITEM_KEY) {
- 		inode = btrfs_iget(dir->i_sb, &location, root);
+ 		inode = btrfs_iget(dir->i_sb, location.objectid, root);
 		if (IS_ERR(inode))
 			return inode;
@@@ -5398,7 -5447,7 +5447,7 @@@
 		else
 			inode = new_simple_dir(dir->i_sb, &location, sub_root);
 	} else {
- 		inode = btrfs_iget(dir->i_sb, &location, sub_root);
+ 		inode = btrfs_iget(dir->i_sb, location.objectid, sub_root);
 	}
 	if (root != sub_root)
 		btrfs_put_root(sub_root);
@@@ -5779,7 -5828,8 +5828,8 @@@ int btrfs_set_inode_index(struct btrfs_
 static int btrfs_insert_inode_locked(struct inode *inode)
 {
 	struct btrfs_iget_args args;
- 	args.location = &BTRFS_I(inode)->location;
+ 
+ 	args.ino = BTRFS_I(inode)->location.objectid;
 	args.root = BTRFS_I(inode)->root;

 	return insert_inode_locked4(inode,
@@@ -6991,7 -7041,7 +7041,7 @@@ out
 }

 static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
- 			      struct extent_state **cached_state, int writing)
+ 			      struct extent_state **cached_state, bool writing)
 {
 	struct btrfs_ordered_extent *ordered;
 	int ret = 0;
@@@ -7050,11 -7100,11 +7100,11 @@@
 			 * for it to complete) and then invalidate the pages for
 			 * this range (through invalidate_inode_pages2_range()),
 			 * but that can lead us to a deadlock with a concurrent
- 			 * call to readpages() (a buffered read or a defrag call
+ 			 * call to readahead (a buffered read or a defrag call
 			 * triggered a readahead) on a page lock due to an
 			 * ordered dio extent we created before but did not have
 			 * yet a corresponding bio submitted (whence it can not
- 			 * complete), which makes readpages() wait for that
+ 			 * complete), which makes readahead wait for that
 			 * ordered extent to complete while holding a lock on
 			 * that page.
 			 */
@@@ -7129,30 -7179,7 +7179,7 @@@ static struct extent_map *create_io_em(
 }

- static int btrfs_get_blocks_direct_read(struct extent_map *em,
- 					struct buffer_head *bh_result,
- 					struct inode *inode,
- 					u64 start, u64 len)
- {
- 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- 
- 	if (em->block_start == EXTENT_MAP_HOLE ||
- 	    test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
- 		return -ENOENT;
- 
- 	len = min(len, em->len - (start - em->start));
- 
- 	bh_result->b_blocknr = (em->block_start + (start - em->start)) >>
- 		inode->i_blkbits;
- 	bh_result->b_size = len;
- 	bh_result->b_bdev = fs_info->fs_devices->latest_bdev;
- 	set_buffer_mapped(bh_result);
- 
- 	return 0;
- }
- 
 static int btrfs_get_blocks_direct_write(struct extent_map **map,
- 					 struct buffer_head *bh_result,
 					 struct inode *inode,
 					 struct btrfs_dio_data *dio_data,
 					 u64 start, u64 len)
@@@ -7214,7 -7241,6 +7241,6 @@@
 	}

 	/* this will cow the extent */
- 	len = bh_result->b_size;
 	free_extent_map(em);
 	*map = em = btrfs_new_extent_direct(inode, start, len);
 	if (IS_ERR(em)) {
@@@ -7225,64 -7251,73 +7251,73 @@@
 	len = min(len, em->len - (start - em->start));

 skip_cow:
- 	bh_result->b_blocknr = (em->block_start + (start - em->start)) >>
- 		inode->i_blkbits;
- 	bh_result->b_size = len;
- 	bh_result->b_bdev = fs_info->fs_devices->latest_bdev;
- 	set_buffer_mapped(bh_result);
- 
- 	if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
- 		set_buffer_new(bh_result);
- 
 	/*
 	 * Need to update the i_size under the extent lock so buffered
 	 * readers will get the updated i_size when we unlock.
 	 */
- 	if (!dio_data->overwrite && start + len > i_size_read(inode))
+ 	if (start + len > i_size_read(inode))
 		i_size_write(inode, start + len);

- 	WARN_ON(dio_data->reserve < len);
 	dio_data->reserve -= len;
- 	dio_data->unsubmitted_oe_range_end = start + len;
- 	current->journal_info = dio_data;
 out:
 	return ret;
 }

- static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
- 				   struct buffer_head *bh_result, int create)
+ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start,
+ 		loff_t length, unsigned flags, struct iomap *iomap,
+ 		struct iomap *srcmap)
 {
 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
 	struct extent_map *em;
 	struct extent_state *cached_state = NULL;
 	struct btrfs_dio_data *dio_data = NULL;
- 	u64 start = iblock << inode->i_blkbits;
 	u64 lockstart, lockend;
- 	u64 len = bh_result->b_size;
+ 	const bool write = !!(flags & IOMAP_WRITE);
 	int ret = 0;
+ 	u64 len = length;
+ 	bool unlock_extents = false;

- 	if (!create)
+ 	if (!write)
 		len = min_t(u64, len, fs_info->sectorsize);

 	lockstart = start;
 	lockend = start + len - 1;

- 	if (current->journal_info) {
- 		/*
- 		 * Need to pull our outstanding extents and set journal_info to NULL so
- 		 * that anything that needs to check if there's a transaction doesn't get
- 		 * confused.
- 		 */
- 		dio_data = current->journal_info;
- 		current->journal_info = NULL;
+ 	/*
+ 	 * The generic stuff only does filemap_write_and_wait_range, which
+ 	 * isn't enough if we've written compressed pages to this area, so we
+ 	 * need to flush the dirty pages again to make absolutely sure that any
+ 	 * outstanding dirty pages are on disk.
+ 	 */
+ 	if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
+ 		     &BTRFS_I(inode)->runtime_flags))
+ 		ret = filemap_fdatawrite_range(inode->i_mapping, start,
+ 					       start + length - 1);
+ 
+ 	dio_data = kzalloc(sizeof(*dio_data), GFP_NOFS);
+ 	if (!dio_data)
+ 		return -ENOMEM;
+ 
+ 	dio_data->length = length;
+ 	if (write) {
+ 		dio_data->reserve = round_up(length, fs_info->sectorsize);
+ 		ret = btrfs_delalloc_reserve_space(inode,
+ 				&dio_data->data_reserved,
+ 				start, dio_data->reserve);
+ 		if (ret) {
+ 			extent_changeset_free(dio_data->data_reserved);
+ 			kfree(dio_data);
+ 			return ret;
+ 		}
 	}
+ 	iomap->private = dio_data;
+ 
 	/*
 	 * If this errors out it's because we couldn't invalidate pagecache for
 	 * this range and we need to fallback to buffered.
 	 */
- 	if (lock_extent_direct(inode, lockstart, lockend, &cached_state,
- 			       create)) {
+ 	if (lock_extent_direct(inode, lockstart, lockend, &cached_state, write)) {
 		ret = -ENOTBLK;
 		goto err;
 	}
@@@ -7314,36 -7349,48 +7349,48 @@@
 		goto unlock_err;
 	}

- 	if (create) {
- 		ret = btrfs_get_blocks_direct_write(&em, bh_result, inode,
- 						    dio_data, start, len);
+ 	len = min(len, em->len - (start - em->start));
+ 	if (write) {
+ 		ret = btrfs_get_blocks_direct_write(&em, inode, dio_data,
+ 						    start, len);
 		if (ret < 0)
 			goto unlock_err;
- 
- 		unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart,
- 				     lockend, &cached_state);
+ 		unlock_extents = true;
+ 		/* Recalc len in case the new em is smaller than requested */
+ 		len = min(len, em->len - (start - em->start));
 	} else {
- 		ret = btrfs_get_blocks_direct_read(em, bh_result, inode,
- 						   start, len);
- 		/* Can be negative only if we read from a hole */
- 		if (ret < 0) {
- 			ret = 0;
- 			free_extent_map(em);
- 			goto unlock_err;
- 		}
 		/*
 		 * We need to unlock only the end area that we aren't using.
 		 * The rest is going to be unlocked by the endio routine.
 		 */
- 		lockstart = start + bh_result->b_size;
- 		if (lockstart < lockend) {
- 			unlock_extent_cached(&BTRFS_I(inode)->io_tree,
- 					     lockstart, lockend, &cached_state);
- 		} else {
- 			free_extent_state(cached_state);
- 		}
+ 		lockstart = start + len;
+ 		if (lockstart < lockend)
+ 			unlock_extents = true;
 	}

+ 	if (unlock_extents)
+ 		unlock_extent_cached(&BTRFS_I(inode)->io_tree,
+ 				     lockstart, lockend, &cached_state);
+ 	else
+ 		free_extent_state(cached_state);
+ 
+ 	/*
+ 	 * Translate extent map information to iomap.
+ 	 * We trim the extents (and move the addr) even though iomap code does
+ 	 * that, since we have locked only the parts we are performing I/O in.
+ */ + if ((em->block_start == EXTENT_MAP_HOLE) || + (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) && !write)) { + iomap->addr = IOMAP_NULL_ADDR; + iomap->type = IOMAP_HOLE; + } else { + iomap->addr = em->block_start + (start - em->start); + iomap->type = IOMAP_MAPPED; + } + iomap->offset = start; + iomap->bdev = fs_info->fs_devices->latest_bdev; + iomap->length = len; + free_extent_map(em); return 0; @@@ -7352,370 -7399,152 +7399,152 @@@ unlock_err unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend, &cached_state); err: - if (dio_data) - current->journal_info = dio_data; + if (dio_data) { + btrfs_delalloc_release_space(inode, dio_data->data_reserved, + start, dio_data->reserve, true); + btrfs_delalloc_release_extents(BTRFS_I(inode), dio_data->reserve); + extent_changeset_free(dio_data->data_reserved); + kfree(dio_data); + } return ret; } - static inline blk_status_t submit_dio_repair_bio(struct inode *inode, - struct bio *bio, - int mirror_num) + static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length, + ssize_t written, unsigned flags, struct iomap *iomap) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - blk_status_t ret; + int ret = 0; + struct btrfs_dio_data *dio_data = iomap->private; + size_t submitted = dio_data->submitted; + const bool write = !!(flags & IOMAP_WRITE); - BUG_ON(bio_op(bio) == REQ_OP_WRITE); + if (!write && (iomap->type == IOMAP_HOLE)) { + /* If reading from a hole, unlock and return */ + unlock_extent(&BTRFS_I(inode)->io_tree, pos, pos + length - 1); + goto out; + } - ret = btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DIO_REPAIR); - if (ret) - return ret; + if (submitted < length) { + pos += submitted; + length -= submitted; + if (write) + __endio_write_update_ordered(inode, pos, length, false); + else + unlock_extent(&BTRFS_I(inode)->io_tree, pos, + pos + length - 1); + ret = -ENOTBLK; + } - ret = btrfs_map_bio(fs_info, bio, mirror_num); + if (write) { + if (dio_data->reserve) + btrfs_delalloc_release_space(inode, + dio_data->data_reserved, pos, + dio_data->reserve, true); + btrfs_delalloc_release_extents(BTRFS_I(inode), dio_data->length); + extent_changeset_free(dio_data->data_reserved); + } + out: + kfree(dio_data); + iomap->private = NULL; return ret; } - static int btrfs_check_dio_repairable(struct inode *inode, - struct bio *failed_bio, - struct io_failure_record *failrec, - int failed_mirror) + static void btrfs_dio_private_put(struct btrfs_dio_private *dip) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - int num_copies; - - num_copies = btrfs_num_copies(fs_info, failrec->logical, failrec->len); - if (num_copies == 1) { - /* - * we only have a single copy of the data, so don't bother with - * all the retry and error correction code that follows. no - * matter what the error is, it is very likely to persist. - */ - btrfs_debug(fs_info, - "Check DIO Repairable: cannot repair, num_copies=%d, next_mirror %d, failed_mirror %d", - num_copies, failrec->this_mirror, failed_mirror); - return 0; - } - - failrec->failed_mirror = failed_mirror; - failrec->this_mirror++; - if (failrec->this_mirror == failed_mirror) - failrec->this_mirror++; + /* + * This implies a barrier so that stores to dio_bio->bi_status before + * this and loads of dio_bio->bi_status after this are fully ordered. 
+ */ + if (!refcount_dec_and_test(&dip->refs)) + return; - if (failrec->this_mirror > num_copies) { - btrfs_debug(fs_info, - "Check DIO Repairable: (fail) num_copies=%d, next_mirror %d, failed_mirror %d", - num_copies, failrec->this_mirror, failed_mirror); - return 0; + if (bio_op(dip->dio_bio) == REQ_OP_WRITE) { + __endio_write_update_ordered(dip->inode, dip->logical_offset, + dip->bytes, + !dip->dio_bio->bi_status); + } else { + unlock_extent(&BTRFS_I(dip->inode)->io_tree, + dip->logical_offset, + dip->logical_offset + dip->bytes - 1); } - return 1; + bio_endio(dip->dio_bio); + kfree(dip); } - static blk_status_t dio_read_error(struct inode *inode, struct bio *failed_bio, - struct page *page, unsigned int pgoff, - u64 start, u64 end, int failed_mirror, - bio_end_io_t *repair_endio, void *repair_arg) + static blk_status_t submit_dio_repair_bio(struct inode *inode, struct bio *bio, + int mirror_num, + unsigned long bio_flags) { - struct io_failure_record *failrec; - struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; - struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree; - struct bio *bio; - int isector; - unsigned int read_mode = 0; - int segs; - int ret; - blk_status_t status; - struct bio_vec bvec; + struct btrfs_dio_private *dip = bio->bi_private; + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + blk_status_t ret; - BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE); + BUG_ON(bio_op(bio) == REQ_OP_WRITE); - ret = btrfs_get_io_failure_record(inode, start, end, &failrec); + ret = btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DATA); if (ret) - return errno_to_blk_status(ret); - - ret = btrfs_check_dio_repairable(inode, failed_bio, failrec, - failed_mirror); - if (!ret) { - free_io_failure(failure_tree, io_tree, failrec); - return BLK_STS_IOERR; - } - - segs = bio_segments(failed_bio); - bio_get_first_bvec(failed_bio, &bvec); - if (segs > 1 || - (bvec.bv_len > btrfs_inode_sectorsize(inode))) - read_mode |= REQ_FAILFAST_DEV; - - isector = start - btrfs_io_bio(failed_bio)->logical; - isector >>= inode->i_sb->s_blocksize_bits; - bio = btrfs_create_repair_bio(inode, failed_bio, failrec, page, - pgoff, isector, repair_endio, repair_arg); - bio->bi_opf = REQ_OP_READ | read_mode; - - btrfs_debug(BTRFS_I(inode)->root->fs_info, - "repair DIO read error: submitting new dio read[%#x] to this_mirror=%d, in_validation=%d", - read_mode, failrec->this_mirror, failrec->in_validation); - - status = submit_dio_repair_bio(inode, bio, failrec->this_mirror); - if (status) { - free_io_failure(failure_tree, io_tree, failrec); - bio_put(bio); - } - - return status; - } - - struct btrfs_retry_complete { - struct completion done; - struct inode *inode; - u64 start; - int uptodate; - }; + return ret; - static void btrfs_retry_endio_nocsum(struct bio *bio) - { - struct btrfs_retry_complete *done = bio->bi_private; - struct inode *inode = done->inode; - struct bio_vec *bvec; - struct extent_io_tree *io_tree, *failure_tree; - struct bvec_iter_all iter_all; - - if (bio->bi_status) - goto end; - - ASSERT(bio->bi_vcnt == 1); - io_tree = &BTRFS_I(inode)->io_tree; - failure_tree = &BTRFS_I(inode)->io_failure_tree; - ASSERT(bio_first_bvec_all(bio)->bv_len == btrfs_inode_sectorsize(inode)); - - done->uptodate = 1; - ASSERT(!bio_flagged(bio, BIO_CLONED)); - bio_for_each_segment_all(bvec, bio, iter_all) - clean_io_failure(BTRFS_I(inode)->root->fs_info, failure_tree, - io_tree, done->start, bvec->bv_page, - btrfs_ino(BTRFS_I(inode)), 0); - end: - complete(&done->done); - bio_put(bio); + 
refcount_inc(&dip->refs); + ret = btrfs_map_bio(fs_info, bio, mirror_num); + if (ret) + refcount_dec(&dip->refs); + return ret; } - static blk_status_t __btrfs_correct_data_nocsum(struct inode *inode, - struct btrfs_io_bio *io_bio) + static blk_status_t btrfs_check_read_dio_bio(struct inode *inode, + struct btrfs_io_bio *io_bio, + const bool uptodate) { - struct btrfs_fs_info *fs_info; + struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; + const u32 sectorsize = fs_info->sectorsize; + struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree; + struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + const bool csum = !(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM); struct bio_vec bvec; struct bvec_iter iter; - struct btrfs_retry_complete done; - u64 start; - unsigned int pgoff; - u32 sectorsize; - int nr_sectors; - blk_status_t ret; + u64 start = io_bio->logical; + int icsum = 0; blk_status_t err = BLK_STS_OK; - fs_info = BTRFS_I(inode)->root->fs_info; - sectorsize = fs_info->sectorsize; - - start = io_bio->logical; - done.inode = inode; - io_bio->bio.bi_iter = io_bio->iter; + __bio_for_each_segment(bvec, &io_bio->bio, iter, io_bio->iter) { + unsigned int i, nr_sectors, pgoff; - bio_for_each_segment(bvec, &io_bio->bio, iter) { nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info, bvec.bv_len); pgoff = bvec.bv_offset; - - next_block_or_try_again: - done.uptodate = 0; - done.start = start; - init_completion(&done.done); - - ret = dio_read_error(inode, &io_bio->bio, bvec.bv_page, - pgoff, start, start + sectorsize - 1, - io_bio->mirror_num, - btrfs_retry_endio_nocsum, &done); - if (ret) { - err = ret; - goto next; - } - - wait_for_completion_io(&done.done); - - if (!done.uptodate) { - /* We might have another mirror, so try again */ - goto next_block_or_try_again; - } - - next: - start += sectorsize; - - nr_sectors--; - if (nr_sectors) { - pgoff += sectorsize; + for (i = 0; i < nr_sectors; i++) { ASSERT(pgoff < PAGE_SIZE); - goto next_block_or_try_again; - } - } - - return err; - } - - static void btrfs_retry_endio(struct bio *bio) - { - struct btrfs_retry_complete *done = bio->bi_private; - struct btrfs_io_bio *io_bio = btrfs_io_bio(bio); - struct extent_io_tree *io_tree, *failure_tree; - struct inode *inode = done->inode; - struct bio_vec *bvec; - int uptodate; - int ret; - int i = 0; - struct bvec_iter_all iter_all; - - if (bio->bi_status) - goto end; - - uptodate = 1; - - ASSERT(bio->bi_vcnt == 1); - ASSERT(bio_first_bvec_all(bio)->bv_len == btrfs_inode_sectorsize(done->inode)); - - io_tree = &BTRFS_I(inode)->io_tree; - failure_tree = &BTRFS_I(inode)->io_failure_tree; - - ASSERT(!bio_flagged(bio, BIO_CLONED)); - bio_for_each_segment_all(bvec, bio, iter_all) { - ret = __readpage_endio_check(inode, io_bio, i, bvec->bv_page, - bvec->bv_offset, done->start, - bvec->bv_len); - if (!ret) - clean_io_failure(BTRFS_I(inode)->root->fs_info, - failure_tree, io_tree, done->start, - bvec->bv_page, - btrfs_ino(BTRFS_I(inode)), - bvec->bv_offset); - else - uptodate = 0; - i++; - } - - done->uptodate = uptodate; - end: - complete(&done->done); - bio_put(bio); - } - - static blk_status_t __btrfs_subio_endio_read(struct inode *inode, - struct btrfs_io_bio *io_bio, blk_status_t err) - { - struct btrfs_fs_info *fs_info; - struct bio_vec bvec; - struct bvec_iter iter; - struct btrfs_retry_complete done; - u64 start; - u64 offset = 0; - u32 sectorsize; - int nr_sectors; - unsigned int pgoff; - int csum_pos; - bool uptodate = (err == 0); - int ret; - blk_status_t status; - - fs_info = 
BTRFS_I(inode)->root->fs_info; - sectorsize = fs_info->sectorsize; - - err = BLK_STS_OK; - start = io_bio->logical; - done.inode = inode; - io_bio->bio.bi_iter = io_bio->iter; - - bio_for_each_segment(bvec, &io_bio->bio, iter) { - nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info, bvec.bv_len); - - pgoff = bvec.bv_offset; - next_block: - if (uptodate) { - csum_pos = BTRFS_BYTES_TO_BLKS(fs_info, offset); - ret = __readpage_endio_check(inode, io_bio, csum_pos, - bvec.bv_page, pgoff, start, sectorsize); - if (likely(!ret)) - goto next; - } - try_again: - done.uptodate = 0; - done.start = start; - init_completion(&done.done); - - status = dio_read_error(inode, &io_bio->bio, bvec.bv_page, - pgoff, start, start + sectorsize - 1, - io_bio->mirror_num, btrfs_retry_endio, - &done); - if (status) { - err = status; - goto next; - } - - wait_for_completion_io(&done.done); - - if (!done.uptodate) { - /* We might have another mirror, so try again */ - goto try_again; - } - next: - offset += sectorsize; - start += sectorsize; - - ASSERT(nr_sectors); - - nr_sectors--; - if (nr_sectors) { + if (uptodate && + (!csum || !check_data_csum(inode, io_bio, icsum, + bvec.bv_page, pgoff, + start, sectorsize))) { + clean_io_failure(fs_info, failure_tree, io_tree, + start, bvec.bv_page, + btrfs_ino(BTRFS_I(inode)), + pgoff); + } else { + blk_status_t status; + + status = btrfs_submit_read_repair(inode, + &io_bio->bio, + start - io_bio->logical, + bvec.bv_page, pgoff, + start, + start + sectorsize - 1, + io_bio->mirror_num, + submit_dio_repair_bio); + if (status) + err = status; + } + start += sectorsize; + icsum++; pgoff += sectorsize; - ASSERT(pgoff < PAGE_SIZE); - goto next_block; } } - return err; } - static blk_status_t btrfs_subio_endio_read(struct inode *inode, - struct btrfs_io_bio *io_bio, blk_status_t err) - { - bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; - - if (skip_csum) { - if (unlikely(err)) - return __btrfs_correct_data_nocsum(inode, io_bio); - else - return BLK_STS_OK; - } else { - return __btrfs_subio_endio_read(inode, io_bio, err); - } - } - - static void btrfs_endio_direct_read(struct bio *bio) - { - struct btrfs_dio_private *dip = bio->bi_private; - struct inode *inode = dip->inode; - struct bio *dio_bio; - struct btrfs_io_bio *io_bio = btrfs_io_bio(bio); - blk_status_t err = bio->bi_status; - - if (dip->flags & BTRFS_DIO_ORIG_BIO_SUBMITTED) - err = btrfs_subio_endio_read(inode, io_bio, err); - - unlock_extent(&BTRFS_I(inode)->io_tree, dip->logical_offset, - dip->logical_offset + dip->bytes - 1); - dio_bio = dip->dio_bio; - - kfree(dip); - - dio_bio->bi_status = err; - dio_end_io(dio_bio); - btrfs_io_bio_free_csum(io_bio); - bio_put(bio); - } - static void __endio_write_update_ordered(struct inode *inode, const u64 offset, const u64 bytes, const bool uptodate) @@@ -7759,21 -7588,6 +7588,6 @@@ } } - static void btrfs_endio_direct_write(struct bio *bio) - { - struct btrfs_dio_private *dip = bio->bi_private; - struct bio *dio_bio = dip->dio_bio; - - __endio_write_update_ordered(dip->inode, dip->logical_offset, - dip->bytes, !bio->bi_status); - - kfree(dip); - - dio_bio->bi_status = bio->bi_status; - dio_end_io(dio_bio); - bio_put(bio); - } - static blk_status_t btrfs_submit_bio_start_direct_io(void *private_data, struct bio *bio, u64 offset) { @@@ -7797,64 -7611,16 +7611,16 @@@ static void btrfs_end_dio_bio(struct bi (unsigned long long)bio->bi_iter.bi_sector, bio->bi_iter.bi_size, err); - if (dip->subio_endio) - err = dip->subio_endio(dip->inode, btrfs_io_bio(bio), err); - - if (err) { - 
/* - * We want to perceive the errors flag being set before - * decrementing the reference count. We don't need a barrier - * since atomic operations with a return value are fully - * ordered as per atomic_t.txt - */ - dip->errors = 1; + if (bio_op(bio) == REQ_OP_READ) { + err = btrfs_check_read_dio_bio(dip->inode, btrfs_io_bio(bio), + !err); } - /* if there are more bios still pending for this dio, just exit */ - if (!atomic_dec_and_test(&dip->pending_bios)) - goto out; + if (err) + dip->dio_bio->bi_status = err; - if (dip->errors) { - bio_io_error(dip->orig_bio); - } else { - dip->dio_bio->bi_status = BLK_STS_OK; - bio_endio(dip->orig_bio); - } - out: bio_put(bio); - } - - static inline blk_status_t btrfs_lookup_and_bind_dio_csum(struct inode *inode, - struct btrfs_dio_private *dip, - struct bio *bio, - u64 file_offset) - { - struct btrfs_io_bio *io_bio = btrfs_io_bio(bio); - struct btrfs_io_bio *orig_io_bio = btrfs_io_bio(dip->orig_bio); - u16 csum_size; - blk_status_t ret; - - /* - * We load all the csum data we need when we submit - * the first bio to reduce the csum tree search and - * contention. - */ - if (dip->logical_offset == file_offset) { - ret = btrfs_lookup_bio_sums(inode, dip->orig_bio, file_offset, - NULL); - if (ret) - return ret; - } - - if (bio == dip->orig_bio) - return 0; - - file_offset -= dip->logical_offset; - file_offset >>= inode->i_sb->s_blocksize_bits; - csum_size = btrfs_super_csum_size(btrfs_sb(inode->i_sb)->super_copy); - io_bio->csum = orig_io_bio->csum + csum_size * file_offset; - - return 0; + btrfs_dio_private_put(dip); } static inline blk_status_t btrfs_submit_dio_bio(struct bio *bio, @@@ -7892,10 -7658,12 +7658,12 @@@ if (ret) goto err; } else { - ret = btrfs_lookup_and_bind_dio_csum(inode, dip, bio, - file_offset); - if (ret) - goto err; + u64 csum_offset; + + csum_offset = file_offset - dip->logical_offset; + csum_offset >>= inode->i_sb->s_blocksize_bits; + csum_offset *= btrfs_super_csum_size(fs_info->super_copy); + btrfs_io_bio(bio)->csum = dip->csums + csum_offset; } map: ret = btrfs_map_bio(fs_info, bio, 0); @@@ -7903,14 -7671,53 +7671,53 @@@ err return ret; } - static int btrfs_submit_direct_hook(struct btrfs_dio_private *dip) + /* + * If this succeeds, the btrfs_dio_private is responsible for cleaning up locked + * or ordered extents whether or not we submit any bios. 
+ */ + static struct btrfs_dio_private *btrfs_create_dio_private(struct bio *dio_bio, + struct inode *inode, + loff_t file_offset) { - struct inode *inode = dip->inode; + const bool write = (bio_op(dio_bio) == REQ_OP_WRITE); + const bool csum = !(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM); + size_t dip_size; + struct btrfs_dio_private *dip; + + dip_size = sizeof(*dip); + if (!write && csum) { + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + const u16 csum_size = btrfs_super_csum_size(fs_info->super_copy); + size_t nblocks; + + nblocks = dio_bio->bi_iter.bi_size >> inode->i_sb->s_blocksize_bits; + dip_size += csum_size * nblocks; + } + + dip = kzalloc(dip_size, GFP_NOFS); + if (!dip) + return NULL; + + dip->inode = inode; + dip->logical_offset = file_offset; + dip->bytes = dio_bio->bi_iter.bi_size; + dip->disk_bytenr = (u64)dio_bio->bi_iter.bi_sector << 9; + dip->dio_bio = dio_bio; + refcount_set(&dip->refs, 1); + return dip; + } + + static blk_qc_t btrfs_submit_direct(struct inode *inode, struct iomap *iomap, + struct bio *dio_bio, loff_t file_offset) + { + const bool write = (bio_op(dio_bio) == REQ_OP_WRITE); + const bool csum = !(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM); struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + const bool raid56 = (btrfs_data_alloc_profile(fs_info) & + BTRFS_BLOCK_GROUP_RAID56_MASK); + struct btrfs_dio_private *dip; struct bio *bio; - struct bio *orig_bio = dip->orig_bio; - u64 start_sector = orig_bio->bi_iter.bi_sector; - u64 file_offset = dip->logical_offset; + u64 start_sector; int async_submit = 0; u64 submit_len; int clone_offset = 0; @@@ -7918,330 -7725,108 +7725,108 @@@ int ret; blk_status_t status; struct btrfs_io_geometry geom; + struct btrfs_dio_data *dio_data = iomap->private; - submit_len = orig_bio->bi_iter.bi_size; - ret = btrfs_get_io_geometry(fs_info, btrfs_op(orig_bio), - start_sector << 9, submit_len, &geom); - if (ret) - return -EIO; + dip = btrfs_create_dio_private(dio_bio, inode, file_offset); + if (!dip) { + if (!write) { + unlock_extent(&BTRFS_I(inode)->io_tree, file_offset, + file_offset + dio_bio->bi_iter.bi_size - 1); + } + dio_bio->bi_status = BLK_STS_RESOURCE; + bio_endio(dio_bio); + return BLK_QC_T_NONE; + } - if (geom.len >= submit_len) { - bio = orig_bio; - dip->flags |= BTRFS_DIO_ORIG_BIO_SUBMITTED; - goto submit; + if (!write && csum) { + /* + * Load the csums up front to reduce csum tree searches and + * contention when submitting bios. + */ + status = btrfs_lookup_bio_sums(inode, dio_bio, file_offset, + dip->csums); + if (status != BLK_STS_OK) + goto out_err; } - /* async crcs make it difficult to collect full stripe writes. */ - if (btrfs_data_alloc_profile(fs_info) & BTRFS_BLOCK_GROUP_RAID56_MASK) - async_submit = 0; - else - async_submit = 1; + start_sector = dio_bio->bi_iter.bi_sector; + submit_len = dio_bio->bi_iter.bi_size; - /* bio split */ - ASSERT(geom.len <= INT_MAX); - atomic_inc(&dip->pending_bios); do { + ret = btrfs_get_io_geometry(fs_info, btrfs_op(dio_bio), + start_sector << 9, submit_len, + &geom); + if (ret) { + status = errno_to_blk_status(ret); + goto out_err; + } + ASSERT(geom.len <= INT_MAX); + clone_len = min_t(int, submit_len, geom.len); /* * This will never fail as it's passing GPF_NOFS and * the allocation is backed by btrfs_bioset. 
*/ - bio = btrfs_bio_clone_partial(orig_bio, clone_offset, - clone_len); + bio = btrfs_bio_clone_partial(dio_bio, clone_offset, clone_len); bio->bi_private = dip; bio->bi_end_io = btrfs_end_dio_bio; btrfs_io_bio(bio)->logical = file_offset; ASSERT(submit_len >= clone_len); submit_len -= clone_len; - if (submit_len == 0) - break; /* * Increase the count before we submit the bio so we know * the end IO handler won't happen before we increase the * count. Otherwise, the dip might get freed before we're * done setting it up. + * + * We transfer the initial reference to the last bio, so we + * don't need to increment the reference count for the last one. */ - atomic_inc(&dip->pending_bios); + if (submit_len > 0) { + refcount_inc(&dip->refs); + /* + * If we are submitting more than one bio, submit them + * all asynchronously. The exception is RAID 5 or 6, as + * asynchronous checksums make it difficult to collect + * full stripe writes. + */ + if (!raid56) + async_submit = 1; + } status = btrfs_submit_dio_bio(bio, inode, file_offset, async_submit); if (status) { bio_put(bio); - atomic_dec(&dip->pending_bios); + if (submit_len > 0) + refcount_dec(&dip->refs); goto out_err; } + dio_data->submitted += clone_len; clone_offset += clone_len; start_sector += clone_len >> 9; file_offset += clone_len; - - ret = btrfs_get_io_geometry(fs_info, btrfs_op(orig_bio), - start_sector << 9, submit_len, &geom); - if (ret) - goto out_err; } while (submit_len > 0); + return BLK_QC_T_NONE; - submit: - status = btrfs_submit_dio_bio(bio, inode, file_offset, async_submit); - if (!status) - return 0; - - bio_put(bio); out_err: - dip->errors = 1; - /* - * Before atomic variable goto zero, we must make sure dip->errors is - * perceived to be set. This ordering is ensured by the fact that an - * atomic operations with a return value are fully ordered as per - * atomic_t.txt - */ - if (atomic_dec_and_test(&dip->pending_bios)) - bio_io_error(dip->orig_bio); - - /* bio_end_io() will handle error, so we needn't return it */ - return 0; - } - - static void btrfs_submit_direct(struct bio *dio_bio, struct inode *inode, - loff_t file_offset) - { - struct btrfs_dio_private *dip = NULL; - struct bio *bio = NULL; - struct btrfs_io_bio *io_bio; - bool write = (bio_op(dio_bio) == REQ_OP_WRITE); - int ret = 0; - - bio = btrfs_bio_clone(dio_bio); - - dip = kzalloc(sizeof(*dip), GFP_NOFS); - if (!dip) { - ret = -ENOMEM; - goto free_ordered; - } - - dip->private = dio_bio->bi_private; - dip->inode = inode; - dip->logical_offset = file_offset; - dip->bytes = dio_bio->bi_iter.bi_size; - dip->disk_bytenr = (u64)dio_bio->bi_iter.bi_sector << 9; - bio->bi_private = dip; - dip->orig_bio = bio; - dip->dio_bio = dio_bio; - atomic_set(&dip->pending_bios, 0); - io_bio = btrfs_io_bio(bio); - io_bio->logical = file_offset; - - if (write) { - bio->bi_end_io = btrfs_endio_direct_write; - } else { - bio->bi_end_io = btrfs_endio_direct_read; - dip->subio_endio = btrfs_subio_endio_read; - } - - /* - * Reset the range for unsubmitted ordered extents (to a 0 length range) - * even if we fail to submit a bio, because in such case we do the - * corresponding error handling below and it must not be done a second - * time by btrfs_direct_IO(). 
- */ - if (write) { - struct btrfs_dio_data *dio_data = current->journal_info; - - dio_data->unsubmitted_oe_range_end = dip->logical_offset + - dip->bytes; - dio_data->unsubmitted_oe_range_start = - dio_data->unsubmitted_oe_range_end; - } - - ret = btrfs_submit_direct_hook(dip); - if (!ret) - return; - - btrfs_io_bio_free_csum(io_bio); - - free_ordered: - /* - * If we arrived here it means either we failed to submit the dip - * or we either failed to clone the dio_bio or failed to allocate the - * dip. If we cloned the dio_bio and allocated the dip, we can just - * call bio_endio against our io_bio so that we get proper resource - * cleanup if we fail to submit the dip, otherwise, we must do the - * same as btrfs_endio_direct_[write|read] because we can't call these - * callbacks - they require an allocated dip and a clone of dio_bio. - */ - if (bio && dip) { - bio_io_error(bio); - /* - * The end io callbacks free our dip, do the final put on bio - * and all the cleanup and final put for dio_bio (through - * dio_end_io()). - */ - dip = NULL; - bio = NULL; - } else { - if (write) - __endio_write_update_ordered(inode, - file_offset, - dio_bio->bi_iter.bi_size, - false); - else - unlock_extent(&BTRFS_I(inode)->io_tree, file_offset, - file_offset + dio_bio->bi_iter.bi_size - 1); - - dio_bio->bi_status = BLK_STS_IOERR; - /* - * Releases and cleans up our dio_bio, no need to bio_put() - * nor bio_endio()/bio_io_error() against dio_bio. - */ - dio_end_io(dio_bio); - } - if (bio) - bio_put(bio); - kfree(dip); + dip->dio_bio->bi_status = status; + btrfs_dio_private_put(dip); + return BLK_QC_T_NONE; } - static ssize_t check_direct_IO(struct btrfs_fs_info *fs_info, - const struct iov_iter *iter, loff_t offset) - { - int seg; - int i; - unsigned int blocksize_mask = fs_info->sectorsize - 1; - ssize_t retval = -EINVAL; - - if (offset & blocksize_mask) - goto out; - - if (iov_iter_alignment(iter) & blocksize_mask) - goto out; - - /* If this is a write we don't need to check anymore */ - if (iov_iter_rw(iter) != READ || !iter_is_iovec(iter)) - return 0; - /* - * Check to make sure we don't have duplicate iov_base's in this - * iovec, if so return EINVAL, otherwise we'll get csum errors - * when reading back. - */ - for (seg = 0; seg < iter->nr_segs; seg++) { - for (i = seg + 1; i < iter->nr_segs; i++) { - if (iter->iov[seg].iov_base == iter->iov[i].iov_base) - goto out; - } - } - retval = 0; - out: - return retval; - } - - static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) - { - struct file *file = iocb->ki_filp; - struct inode *inode = file->f_mapping->host; - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - struct btrfs_dio_data dio_data = { 0 }; - struct extent_changeset *data_reserved = NULL; - loff_t offset = iocb->ki_pos; - size_t count = 0; - int flags = 0; - bool wakeup = true; - bool relock = false; - ssize_t ret; - - if (check_direct_IO(fs_info, iter, offset)) - return 0; - - inode_dio_begin(inode); - - /* - * The generic stuff only does filemap_write_and_wait_range, which - * isn't enough if we've written compressed pages to this area, so - * we need to flush the dirty pages again to make absolutely sure - * that any outstanding dirty pages are on disk. 
- */ - count = iov_iter_count(iter); - if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, - &BTRFS_I(inode)->runtime_flags)) - filemap_fdatawrite_range(inode->i_mapping, offset, - offset + count - 1); - - if (iov_iter_rw(iter) == WRITE) { - /* - * If the write DIO is beyond the EOF, we need update - * the isize, but it is protected by i_mutex. So we can - * not unlock the i_mutex at this case. - */ - if (offset + count <= inode->i_size) { - dio_data.overwrite = 1; - inode_unlock(inode); - relock = true; - } else if (iocb->ki_flags & IOCB_NOWAIT) { - ret = -EAGAIN; - goto out; - } - ret = btrfs_delalloc_reserve_space(inode, &data_reserved, - offset, count); - if (ret) - goto out; - - /* - * We need to know how many extents we reserved so that we can - * do the accounting properly if we go over the number we - * originally calculated. Abuse current->journal_info for this. - */ - dio_data.reserve = round_up(count, - fs_info->sectorsize); - dio_data.unsubmitted_oe_range_start = (u64)offset; - dio_data.unsubmitted_oe_range_end = (u64)offset; - current->journal_info = &dio_data; - down_read(&BTRFS_I(inode)->dio_sem); - } else if (test_bit(BTRFS_INODE_READDIO_NEED_LOCK, - &BTRFS_I(inode)->runtime_flags)) { - inode_dio_end(inode); - flags = DIO_LOCKING | DIO_SKIP_HOLES; - wakeup = false; - } - - ret = __blockdev_direct_IO(iocb, inode, - fs_info->fs_devices->latest_bdev, - iter, btrfs_get_blocks_direct, NULL, - btrfs_submit_direct, flags); - if (iov_iter_rw(iter) == WRITE) { - up_read(&BTRFS_I(inode)->dio_sem); - current->journal_info = NULL; - if (ret < 0 && ret != -EIOCBQUEUED) { - if (dio_data.reserve) - btrfs_delalloc_release_space(inode, data_reserved, - offset, dio_data.reserve, true); - /* - * On error we might have left some ordered extents - * without submitting corresponding bios for them, so - * cleanup them up to avoid other tasks getting them - * and waiting for them to complete forever. 
- */ - if (dio_data.unsubmitted_oe_range_start < - dio_data.unsubmitted_oe_range_end) - __endio_write_update_ordered(inode, - dio_data.unsubmitted_oe_range_start, - dio_data.unsubmitted_oe_range_end - - dio_data.unsubmitted_oe_range_start, - false); - } else if (ret >= 0 && (size_t)ret < count) - btrfs_delalloc_release_space(inode, data_reserved, - offset, count - (size_t)ret, true); - btrfs_delalloc_release_extents(BTRFS_I(inode), count); - } - out: - if (wakeup) - inode_dio_end(inode); - if (relock) - inode_lock(inode); + const struct iomap_ops btrfs_dio_iomap_ops = { + .iomap_begin = btrfs_dio_iomap_begin, + .iomap_end = btrfs_dio_iomap_end, + }; - extent_changeset_free(data_reserved); - return ret; - } + const struct iomap_dio_ops btrfs_dops = { + .submit_io = btrfs_submit_direct, + }; #define BTRFS_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC) @@@ -8293,16 -7878,21 +7878,16 @@@ static int btrfs_writepages(struct addr return extent_writepages(mapping, wbc); } -static int -btrfs_readpages(struct file *file, struct address_space *mapping, - struct list_head *pages, unsigned nr_pages) +static void btrfs_readahead(struct readahead_control *rac) { - return extent_readpages(mapping, pages, nr_pages); + extent_readahead(rac); } static int __btrfs_releasepage(struct page *page, gfp_t gfp_flags) { int ret = try_release_extent_mapping(page, gfp_flags); - if (ret == 1) { - ClearPagePrivate(page); - set_page_private(page, 0); - put_page(page); - } + if (ret == 1) + detach_page_private(page); return ret; } @@@ -8324,8 -7914,14 +7909,8 @@@ static int btrfs_migratepage(struct add if (ret != MIGRATEPAGE_SUCCESS) return ret; - if (page_has_private(page)) { - ClearPagePrivate(page); - get_page(newpage); - set_page_private(newpage, page_private(page)); - set_page_private(page, 0); - put_page(page); - SetPagePrivate(newpage); - } + if (page_has_private(page)) + attach_page_private(newpage, detach_page_private(page)); if (PagePrivate2(page)) { ClearPagePrivate2(page); @@@ -8447,7 -8043,11 +8032,7 @@@ again } ClearPageChecked(page); - if (PagePrivate(page)) { - ClearPagePrivate(page); - set_page_private(page, 0); - put_page(page); - } + detach_page_private(page); } /* @@@ -10538,8 -10138,8 +10123,8 @@@ static const struct address_space_opera .readpage = btrfs_readpage, .writepage = btrfs_writepage, .writepages = btrfs_writepages, - .readpages = btrfs_readpages, + .readahead = btrfs_readahead, - .direct_IO = btrfs_direct_IO, + .direct_IO = noop_direct_IO, .invalidatepage = btrfs_invalidatepage, .releasepage = btrfs_releasepage, #ifdef CONFIG_MIGRATION diff --combined fs/btrfs/send.c index 6a92ecf9eaa2,0f37660b14b2..d9813a5b075a --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@@ -23,6 -23,7 +23,7 @@@ #include "btrfs_inode.h" #include "transaction.h" #include "compression.h" + #include "xattr.h" /* * Maximum number of references an extent can have in order for us to attempt to @@@ -4545,6 -4546,10 +4546,10 @@@ static int __process_new_xattr(int num struct fs_path *p; struct posix_acl_xattr_header dummy_acl; + /* Capabilities are emitted by finish_inode_if_needed */ + if (!strncmp(name, XATTR_NAME_CAPS, name_len)) + return 0; + p = fs_path_alloc(); if (!p) return -ENOMEM; @@@ -4801,17 -4806,12 +4806,12 @@@ static ssize_t fill_read_buf(struct sen struct inode *inode; struct page *page; char *addr; - struct btrfs_key key; pgoff_t index = offset >> PAGE_SHIFT; pgoff_t last_index; unsigned pg_offset = offset_in_page(offset); ssize_t ret = 0; - key.objectid = sctx->cur_ino; - key.type = BTRFS_INODE_ITEM_KEY; - key.offset = 0; - - 
inode = btrfs_iget(fs_info->sb, &key, root); + inode = btrfs_iget(fs_info->sb, sctx->cur_ino, root); if (IS_ERR(inode)) return PTR_ERR(inode); @@@ -5107,6 -5107,64 +5107,64 @@@ static int send_extent_data(struct send return 0; } + /* + * Search for a capability xattr related to sctx->cur_ino. If the capability is + * found, call send_set_xattr function to emit it. + * + * Return 0 if there isn't a capability, or when the capability was emitted + * successfully, or < 0 if an error occurred. + */ + static int send_capabilities(struct send_ctx *sctx) + { + struct fs_path *fspath = NULL; + struct btrfs_path *path; + struct btrfs_dir_item *di; + struct extent_buffer *leaf; + unsigned long data_ptr; + char *buf = NULL; + int buf_len; + int ret = 0; + + path = alloc_path_for_send(); + if (!path) + return -ENOMEM; + + di = btrfs_lookup_xattr(NULL, sctx->send_root, path, sctx->cur_ino, + XATTR_NAME_CAPS, strlen(XATTR_NAME_CAPS), 0); + if (!di) { + /* There is no xattr for this inode */ + goto out; + } else if (IS_ERR(di)) { + ret = PTR_ERR(di); + goto out; + } + + leaf = path->nodes[0]; + buf_len = btrfs_dir_data_len(leaf, di); + + fspath = fs_path_alloc(); + buf = kmalloc(buf_len, GFP_KERNEL); + if (!fspath || !buf) { + ret = -ENOMEM; + goto out; + } + + ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, fspath); + if (ret < 0) + goto out; + + data_ptr = (unsigned long)(di + 1) + btrfs_dir_name_len(leaf, di); + read_extent_buffer(leaf, buf, data_ptr, buf_len); + + ret = send_set_xattr(sctx, fspath, XATTR_NAME_CAPS, + strlen(XATTR_NAME_CAPS), buf, buf_len); + out: + kfree(buf); + fs_path_free(fspath); + btrfs_free_path(path); + return ret; + } + static int clone_range(struct send_ctx *sctx, struct clone_root *clone_root, const u64 disk_byte, @@@ -5972,6 -6030,10 +6030,10 @@@ static int finish_inode_if_needed(struc goto out; } + ret = send_capabilities(sctx); + if (ret < 0) + goto out; + /* * If other directory inodes depended on our current directory * inode's move/rename, now do their move/rename operations. 
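The fill_read_buf() change above is the shape of every btrfs_iget() conversion in this series: the three-field key always carried BTRFS_INODE_ITEM_KEY and offset 0, so only the objectid was meaningful. A minimal before/after sketch of the call-site pattern (all names taken from the hunks in this diff):

	/* Before: a full btrfs_key built only to carry the inode number. */
	struct btrfs_key key;

	key.objectid = sctx->cur_ino;
	key.type = BTRFS_INODE_ITEM_KEY;
	key.offset = 0;
	inode = btrfs_iget(fs_info->sb, &key, root);

	/* After: type and offset were constants, so the key is dropped and
	 * the u64 inode number is passed directly; btrfs_init_locked_inode()
	 * fills in the constant type and offset itself.
	 */
	inode = btrfs_iget(fs_info->sb, sctx->cur_ino, root);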
@@@ -7021,7 -7083,6 +7083,6 @@@ long btrfs_ioctl_send(struct file *mnt_ struct btrfs_root *send_root = BTRFS_I(file_inode(mnt_file))->root; struct btrfs_fs_info *fs_info = send_root->fs_info; struct btrfs_root *clone_root; - struct btrfs_key key; struct send_ctx *sctx = NULL; u32 i; u64 *clone_sources_tmp = NULL; @@@ -7065,6 -7126,13 +7126,6 @@@ goto out; } - if (!access_ok(arg->clone_sources, - sizeof(*arg->clone_sources) * - arg->clone_sources_count)) { - ret = -EFAULT; - goto out; - } - if (arg->flags & ~BTRFS_SEND_FLAG_MASK) { ret = -EINVAL; goto out; @@@ -7143,11 -7211,8 +7204,8 @@@ } for (i = 0; i < arg->clone_sources_count; i++) { - key.objectid = clone_sources_tmp[i]; - key.type = BTRFS_ROOT_ITEM_KEY; - key.offset = (u64)-1; - - clone_root = btrfs_get_fs_root(fs_info, &key, true); + clone_root = btrfs_get_fs_root(fs_info, + clone_sources_tmp[i], true); if (IS_ERR(clone_root)) { ret = PTR_ERR(clone_root); goto out; @@@ -7178,11 -7243,8 +7236,8 @@@ } if (arg->parent_root) { - key.objectid = arg->parent_root; - key.type = BTRFS_ROOT_ITEM_KEY; - key.offset = (u64)-1; - - sctx->parent_root = btrfs_get_fs_root(fs_info, &key, true); + sctx->parent_root = btrfs_get_fs_root(fs_info, arg->parent_root, + true); if (IS_ERR(sctx->parent_root)) { ret = PTR_ERR(sctx->parent_root); goto out; diff --combined fs/direct-io.c index 6d5370eac2a8,c44d60f375bc..1543b5af400e --- a/fs/direct-io.c +++ b/fs/direct-io.c @@@ -386,25 -386,6 +386,6 @@@ static void dio_bio_end_io(struct bio * spin_unlock_irqrestore(&dio->bio_lock, flags); } - /** - * dio_end_io - handle the end io action for the given bio - * @bio: The direct io bio thats being completed - * - * This is meant to be called by any filesystem that uses their own dio_submit_t - * so that the DIO specific endio actions are dealt with after the filesystem - * has done it's completion work. 
- */ - void dio_end_io(struct bio *bio) - { - struct dio *dio = bio->bi_private; - - if (dio->is_async) - dio_bio_end_aio(bio); - else - dio_bio_end_io(bio); - } - EXPORT_SYMBOL_GPL(dio_end_io); - static inline void dio_bio_alloc(struct dio *dio, struct dio_submit *sdio, struct block_device *bdev, @@@ -500,7 -481,7 +481,7 @@@ static struct bio *dio_await_one(struc spin_unlock_irqrestore(&dio->bio_lock, flags); if (!(dio->iocb->ki_flags & IOCB_HIPRI) || !blk_poll(dio->bio_disk->queue, dio->bio_cookie, true)) - io_schedule(); + blk_io_schedule(); /* wake up sets us TASK_RUNNING */ spin_lock_irqsave(&dio->bio_lock, flags); dio->waiter = NULL; diff --combined fs/iomap/direct-io.c index fd3bd06fabb6,e4addfc58107..ec7b78e6feca --- a/fs/iomap/direct-io.c +++ b/fs/iomap/direct-io.c @@@ -59,7 -59,7 +59,7 @@@ int iomap_dio_iopoll(struct kiocb *kioc EXPORT_SYMBOL_GPL(iomap_dio_iopoll); static void iomap_dio_submit_bio(struct iomap_dio *dio, struct iomap *iomap, - struct bio *bio) + struct bio *bio, loff_t pos) { atomic_inc(&dio->ref); @@@ -67,7 -67,12 +67,12 @@@ bio_set_polled(bio, dio->iocb); dio->submit.last_queue = bdev_get_queue(iomap->bdev); - dio->submit.cookie = submit_bio(bio); + if (dio->dops && dio->dops->submit_io) + dio->submit.cookie = dio->dops->submit_io( + file_inode(dio->iocb->ki_filp), + iomap, bio, pos); + else + dio->submit.cookie = submit_bio(bio); } static ssize_t iomap_dio_complete(struct iomap_dio *dio) @@@ -191,7 -196,7 +196,7 @@@ iomap_dio_zero(struct iomap_dio *dio, s get_page(page); __bio_add_page(bio, page, len, 0); bio_set_op_attrs(bio, REQ_OP_WRITE, flags); - iomap_dio_submit_bio(dio, iomap, bio); + iomap_dio_submit_bio(dio, iomap, bio, pos); } static loff_t @@@ -299,11 -304,11 +304,11 @@@ iomap_dio_bio_actor(struct inode *inode } dio->size += n; - pos += n; copied += n; nr_pages = iov_iter_npages(dio->submit.iter, BIO_MAX_PAGES); - iomap_dio_submit_bio(dio, iomap, bio); + iomap_dio_submit_bio(dio, iomap, bio, pos); + pos += n; } while (nr_pages); /* @@@ -411,8 -416,6 +416,6 @@@ iomap_dio_rw(struct kiocb *iocb, struc struct blk_plug plug; struct iomap_dio *dio; - lockdep_assert_held(&inode->i_rwsem); - if (!count) return 0; @@@ -561,7 -564,7 +564,7 @@@ !dio->submit.last_queue || !blk_poll(dio->submit.last_queue, dio->submit.cookie, true)) - io_schedule(); + blk_io_schedule(); } __set_current_state(TASK_RUNNING); } diff --combined include/linux/bio.h index 683ff5fd8871,8e23f51ccfa4..91676d4b2dfe --- a/include/linux/bio.h +++ b/include/linux/bio.h @@@ -70,7 -70,7 +70,7 @@@ static inline bool bio_has_data(struct return false; } -static inline bool bio_no_advance_iter(struct bio *bio) +static inline bool bio_no_advance_iter(const struct bio *bio) { return bio_op(bio) == REQ_OP_DISCARD || bio_op(bio) == REQ_OP_SECURE_ERASE || @@@ -138,8 -138,8 +138,8 @@@ static inline bool bio_next_segment(con #define bio_for_each_segment_all(bvl, bio, iter) \ for (bvl = bvec_init_iter_all(&iter); bio_next_segment((bio), &iter); ) -static inline void bio_advance_iter(struct bio *bio, struct bvec_iter *iter, - unsigned bytes) +static inline void bio_advance_iter(const struct bio *bio, + struct bvec_iter *iter, unsigned int bytes) { iter->bi_sector += bytes >> 9; @@@ -169,6 -169,14 +169,14 @@@ #define bio_for_each_bvec(bvl, bio, iter) \ __bio_for_each_bvec(bvl, bio, iter, (bio)->bi_iter) + /* + * Iterate over all multi-page bvecs. Drivers shouldn't use this version for the + * same reasons as bio_for_each_segment_all(). 
+ */ + #define bio_for_each_bvec_all(bvl, bio, i) \ + for (i = 0, bvl = bio_first_bvec_all(bio); \ + i < (bio)->bi_vcnt; i++, bvl++) \ + #define bio_iter_last(bvec, iter) ((iter).bi_size == (bvec).bv_len) static inline unsigned bio_segments(struct bio *bio) @@@ -417,7 -425,6 +425,7 @@@ static inline void bio_io_error(struct static inline void bio_wouldblock_error(struct bio *bio) { + bio_set_flag(bio, BIO_QUIET); bio->bi_status = BLK_STS_AGAIN; bio_endio(bio); } @@@ -445,6 -452,12 +453,6 @@@ void bio_release_pages(struct bio *bio extern void bio_set_pages_dirty(struct bio *bio); extern void bio_check_pages_dirty(struct bio *bio); -void generic_start_io_acct(struct request_queue *q, int op, - unsigned long sectors, struct hd_struct *part); -void generic_end_io_acct(struct request_queue *q, int op, - struct hd_struct *part, - unsigned long start_time); - extern void bio_copy_data_iter(struct bio *dst, struct bvec_iter *dst_iter, struct bio *src, struct bvec_iter *src_iter); extern void bio_copy_data(struct bio *dst, struct bio *src); diff --combined include/linux/fs.h index f3e167ffbb74,e84623d5e173..cffc3619eed5 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@@ -292,7 -292,6 +292,7 @@@ enum positive_aop_returns struct page; struct address_space; struct writeback_control; +struct readahead_control; /* * Write life time hint values. @@@ -376,7 -375,6 +376,7 @@@ struct address_space_operations */ int (*readpages)(struct file *filp, struct address_space *mapping, struct list_head *pages, unsigned nr_pages); + void (*readahead)(struct readahead_control *); int (*write_begin)(struct file *, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, @@@ -978,7 -976,6 +978,7 @@@ struct file #endif /* #ifdef CONFIG_EPOLL */ struct address_space *f_mapping; errseq_t f_wb_err; + errseq_t f_sb_err; /* for syncfs */ } __randomize_layout __attribute__((aligned(4))); /* lest something weird decides that 2 is OK */ @@@ -1523,9 -1520,6 +1523,9 @@@ struct super_block /* Being remounted read-only */ int s_readonly_remount; + /* per-sb errseq_t for reporting writeback errors via syncfs */ + errseq_t s_wb_err; + /* AIO completions deferred from interrupt context */ struct workqueue_struct *s_dio_done_wq; struct hlist_head s_pins; @@@ -1727,11 -1721,7 +1727,11 @@@ extern int vfs_link(struct dentry *, st extern int vfs_rmdir(struct inode *, struct dentry *); extern int vfs_unlink(struct inode *, struct dentry *, struct inode **); extern int vfs_rename(struct inode *, struct dentry *, struct inode *, struct dentry *, struct inode **, unsigned int); -extern int vfs_whiteout(struct inode *, struct dentry *); + +static inline int vfs_whiteout(struct inode *dir, struct dentry *dentry) +{ + return vfs_mknod(dir, dentry, S_IFCHR | WHITEOUT_MODE, WHITEOUT_DEV); +} extern struct dentry *vfs_tmpfile(struct dentry *dentry, umode_t mode, int open_flag); @@@ -2166,8 -2156,6 +2166,8 @@@ static inline void kiocb_clone(struct k * * I_CREATING New object's inode in the middle of setting up. * + * I_DONTCACHE Evict inode as soon as it is not used anymore. + * * Q: What is the difference between I_WILL_FREE and I_FREEING? 
*/ #define I_DIRTY_SYNC (1 << 0) @@@ -2190,7 -2178,6 +2190,7 @@@ #define I_WB_SWITCH (1 << 13) #define I_OVL_INUSE (1 << 14) #define I_CREATING (1 << 15) +#define I_DONTCACHE (1 << 16) #define I_DIRTY_INODE (I_DIRTY_SYNC | I_DIRTY_DATASYNC) #define I_DIRTY (I_DIRTY_INODE | I_DIRTY_PAGES) @@@ -2594,6 -2581,7 +2594,6 @@@ extern struct kmem_cache *names_cachep #ifdef CONFIG_BLOCK extern int register_blkdev(unsigned int, const char *); extern void unregister_blkdev(unsigned int, const char *); -extern void bdev_unhash_inode(dev_t dev); extern struct block_device *bdget(dev_t); extern struct block_device *bdgrab(struct block_device *bdev); extern void bd_set_size(struct block_device *, loff_t size); @@@ -2649,6 -2637,7 +2649,6 @@@ extern int sync_filesystem(struct super extern const struct file_operations def_blk_fops; extern const struct file_operations def_chr_fops; #ifdef CONFIG_BLOCK -extern int ioctl_by_bdev(struct block_device *, unsigned, unsigned long); extern int blkdev_ioctl(struct block_device *, fmode_t, unsigned, unsigned long); extern long compat_blkdev_ioctl(struct file *, unsigned, unsigned long); extern int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder); @@@ -2734,6 -2723,7 +2734,6 @@@ extern bool is_bad_inode(struct inode * extern int revalidate_disk(struct gendisk *); extern int check_disk_change(struct block_device *); extern int __invalidate_device(struct block_device *, bool); -extern int invalidate_partition(struct gendisk *, int); #endif unsigned long invalidate_mapping_pages(struct address_space *mapping, pgoff_t start, pgoff_t end); @@@ -2837,18 -2827,6 +2837,18 @@@ static inline errseq_t filemap_sample_w return errseq_sample(&mapping->wb_err); } +/** + * file_sample_sb_err - sample the current errseq_t to test for later errors + * @mapping: mapping to be sampled + * + * Grab the most current superblock-level errseq_t value for the given + * struct file. 
+ */ +static inline errseq_t file_sample_sb_err(struct file *file) +{ + return errseq_sample(&file->f_path.dentry->d_sb->s_wb_err); +} + static inline int filemap_nr_thps(struct address_space *mapping) { #ifdef CONFIG_READ_ONLY_THP_FOR_FS @@@ -3071,10 -3049,8 +3071,10 @@@ extern int inode_needs_sync(struct inod extern int generic_delete_inode(struct inode *inode); static inline int generic_drop_inode(struct inode *inode) { - return !inode->i_nlink || inode_unhashed(inode); + return !inode->i_nlink || inode_unhashed(inode) || + (inode->i_state & I_DONTCACHE); } +extern void d_mark_dontcache(struct inode *inode); extern struct inode *ilookup5_nowait(struct super_block *sb, unsigned long hashval, int (*test)(struct inode *, void *), @@@ -3148,6 -3124,8 +3148,8 @@@ extern int generic_file_rw_checks(struc extern int generic_copy_file_checks(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, size_t *count, unsigned int flags); + extern ssize_t generic_file_buffered_read(struct kiocb *iocb, + struct iov_iter *to, ssize_t already_read); extern ssize_t generic_file_read_iter(struct kiocb *, struct iov_iter *); extern ssize_t __generic_file_write_iter(struct kiocb *, struct iov_iter *); extern ssize_t generic_file_write_iter(struct kiocb *, struct iov_iter *); @@@ -3209,8 -3187,6 +3211,6 @@@ enum DIO_SKIP_HOLES = 0x02, }; - void dio_end_io(struct bio *bio); - ssize_t __blockdev_direct_IO(struct kiocb *iocb, struct inode *inode, struct block_device *bdev, struct iov_iter *iter, get_block_t get_block, @@@ -3418,6 -3394,11 +3418,6 @@@ extern void setattr_copy(struct inode * extern int file_update_time(struct file *file); -static inline bool io_is_direct(struct file *filp) -{ - return (filp->f_flags & O_DIRECT) || IS_DAX(filp->f_mapping->host); -} - static inline bool vma_is_dax(const struct vm_area_struct *vma) { return vma->vm_file && IS_DAX(vma->vm_file->f_mapping->host); @@@ -3442,7 -3423,7 +3442,7 @@@ static inline int iocb_flags(struct fil int res = 0; if (file->f_flags & O_APPEND) res |= IOCB_APPEND; - if (io_is_direct(file)) + if (file->f_flags & O_DIRECT) res |= IOCB_DIRECT; if ((file->f_flags & O_DSYNC) || IS_SYNC(file->f_mapping->host)) res |= IOCB_DSYNC; diff --combined include/linux/iomap.h index bc20bd04c2a2,5b4875344874..a5c219c29b10 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@@ -155,7 -155,8 +155,7 @@@ loff_t iomap_apply(struct inode *inode ssize_t iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *from, const struct iomap_ops *ops); int iomap_readpage(struct page *page, const struct iomap_ops *ops); -int iomap_readpages(struct address_space *mapping, struct list_head *pages, - unsigned nr_pages, const struct iomap_ops *ops); +void iomap_readahead(struct readahead_control *, const struct iomap_ops *ops); int iomap_set_page_dirty(struct page *page); int iomap_is_partially_uptodate(struct page *page, unsigned long from, unsigned long count); @@@ -251,6 -252,8 +251,8 @@@ int iomap_writepages(struct address_spa struct iomap_dio_ops { int (*end_io)(struct kiocb *iocb, ssize_t size, int error, unsigned flags); + blk_qc_t (*submit_io)(struct inode *inode, struct iomap *iomap, + struct bio *bio, loff_t file_offset); }; ssize_t iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, diff --combined mm/filemap.c index fe079e9219d1,ad82672a9941..3430280df607 --- a/mm/filemap.c +++ b/mm/filemap.c @@@ -1991,7 -1991,7 +1991,7 @@@ static void shrink_readahead_size_eio(s * * total number of bytes copied, including those the were already @written * * 
negative error code if nothing was copied
  */
- static ssize_t generic_file_buffered_read(struct kiocb *iocb,
+ ssize_t generic_file_buffered_read(struct kiocb *iocb,
  		struct iov_iter *iter, ssize_t written)
  {
  	struct file *filp = iocb->ki_filp;
@@@ -2243,6 -2243,7 +2243,7 @@@ out
  	file_accessed(filp);
  	return written ? written : error;
  }
+ EXPORT_SYMBOL_GPL(generic_file_buffered_read);
  
  /**
   * generic_file_read_iter - generic filesystem read routine
@@@ -2566,6 -2567,7 +2567,6 @@@ page_not_uptodate
  	if (!error || error == AOP_TRUNCATED_PAGE)
  		goto retry_find;
  
 -	/* Things didn't work out. Return zero to tell the mm layer so. */
  	shrink_readahead_size_eio(ra);
  	return VM_FAULT_SIGBUS;
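With ->direct_IO now set to noop_direct_IO, the read and write paths are expected to drive O_DIRECT through iomap_dio_rw() themselves, using the two ops tables defined above. The call site is outside this diff; the sketch below shows how a direct read would plug in, assuming the five-argument iomap_dio_rw() signature of this kernel (the function name and locking here are illustrative, not quoted from the patch):

	static ssize_t btrfs_direct_read_sketch(struct kiocb *iocb,
						struct iov_iter *to)
	{
		struct inode *inode = file_inode(iocb->ki_filp);
		ssize_t ret;

		inode_lock_shared(inode);
		/*
		 * btrfs_dio_iomap_begin() locks and maps one extent range per
		 * iteration; btrfs_dops.submit_io routes each bio through
		 * btrfs_submit_direct() for checksums and mirror handling.
		 */
		ret = iomap_dio_rw(iocb, to, &btrfs_dio_iomap_ops, &btrfs_dops,
				   is_sync_kiocb(iocb));
		inode_unlock_shared(inode);
		return ret;
	}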
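The releasepage, invalidatepage and migratepage hunks above all collapse the same open-coded sequence into the new mm helpers. What the helpers fold together, as a sketch rather than the mm implementation verbatim:

	/* detach_page_private(page) stands in for: */
	void *private = (void *)page_private(page);

	ClearPagePrivate(page);
	set_page_private(page, 0);
	put_page(page);

	/* and attach_page_private(newpage, private) for the mirror image: */
	get_page(newpage);
	set_page_private(newpage, (unsigned long)private);
	SetPagePrivate(newpage);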
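The fs.h hunks add a per-superblock errseq_t (s_wb_err) and a per-file sample of it (f_sb_err) so that writeback errors can be reported once per opener through syncfs(). The consumer side is not part of this diff; a sketch of the intended flow, assuming the standard errseq API:

	/* At open time, sample the superblock's current error sequence. */
	file->f_sb_err = file_sample_sb_err(file);

	/*
	 * At syncfs() time, report an error only if s_wb_err has advanced
	 * since this file sampled it, and mark it as seen for this opener.
	 */
	error = errseq_check_and_advance(&file->f_path.dentry->d_sb->s_wb_err,
					 &file->f_sb_err);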
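bio_for_each_bvec_all(), added in the bio.h hunk above, walks the multi-page bvecs of a fully built bio by index instead of by iterator, mirroring bio_for_each_segment_all() and carrying the same restriction for drivers. A minimal usage sketch (the loop body is illustrative):

	struct bio_vec *bvec;
	int i;

	/* Visit each multi-page segment once, front to back. */
	bio_for_each_bvec_all(bvec, bio, i)
		pr_debug("bvec %d: page %p len %u offset %u\n",
			 i, bvec->bv_page, bvec->bv_len, bvec->bv_offset);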