diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index a9778a9..b46ab34 100644
@@ -146,35 +146,11 @@ static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree,
        return ret;
 }
 
-/*
- * Add an ordered extent to the per-inode tree.
- *
- * @inode:           Inode that this extent is for.
- * @file_offset:     Logical offset in file where the extent starts.
- * @num_bytes:       Logical length of extent in file.
- * @ram_bytes:       Full length of unencoded data.
- * @disk_bytenr:     Offset of extent on disk.
- * @disk_num_bytes:  Size of extent on disk.
- * @offset:          Offset into unencoded data where file data starts.
- * @flags:           Flags specifying type of extent (1 << BTRFS_ORDERED_*).
- * @compress_type:   Compression algorithm used for data.
- *
- * Most of these parameters correspond to &struct btrfs_file_extent_item. The
- * tree is given a single reference on the ordered extent that was inserted, and
- * the returned pointer is given a second reference.
- *
- * Return: the new ordered extent or error pointer.
- */
-struct btrfs_ordered_extent *btrfs_alloc_ordered_extent(
-                       struct btrfs_inode *inode, u64 file_offset,
-                       u64 num_bytes, u64 ram_bytes, u64 disk_bytenr,
-                       u64 disk_num_bytes, u64 offset, unsigned long flags,
-                       int compress_type)
+static struct btrfs_ordered_extent *alloc_ordered_extent(
+                       struct btrfs_inode *inode, u64 file_offset, u64 num_bytes,
+                       u64 ram_bytes, u64 disk_bytenr, u64 disk_num_bytes,
+                       u64 offset, unsigned long flags, int compress_type)
 {
-       struct btrfs_root *root = inode->root;
-       struct btrfs_fs_info *fs_info = root->fs_info;
-       struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree;
-       struct rb_node *node;
        struct btrfs_ordered_extent *entry;
        int ret;
 
@@ -184,7 +160,6 @@ struct btrfs_ordered_extent *btrfs_alloc_ordered_extent(
                ret = btrfs_qgroup_free_data(inode, NULL, file_offset, num_bytes);
                if (ret < 0)
                        return ERR_PTR(ret);
-               ret = 0;
        } else {
                /*
                 * The ordered extent has reserved qgroup space, release now
@@ -209,15 +184,7 @@ struct btrfs_ordered_extent *btrfs_alloc_ordered_extent(
        entry->compress_type = compress_type;
        entry->truncated_len = (u64)-1;
        entry->qgroup_rsv = ret;
-       entry->physical = (u64)-1;
-
-       ASSERT((flags & ~BTRFS_ORDERED_TYPE_FLAGS) == 0);
        entry->flags = flags;
-
-       percpu_counter_add_batch(&fs_info->ordered_bytes, num_bytes,
-                                fs_info->delalloc_batch);
-
-       /* one ref for the tree */
        refcount_set(&entry->refs, 1);
        init_waitqueue_head(&entry->wait);
        INIT_LIST_HEAD(&entry->list);
@@ -226,15 +193,40 @@ struct btrfs_ordered_extent *btrfs_alloc_ordered_extent(
        INIT_LIST_HEAD(&entry->work_list);
        init_completion(&entry->completion);
 
+       /*
+        * We don't need the count_max_extents here, we can assume that all of
+        * that work has been done at higher layers, so this is truly the
+        * smallest the extent is going to get.
+        */
+       spin_lock(&inode->lock);
+       btrfs_mod_outstanding_extents(inode, 1);
+       spin_unlock(&inode->lock);
+
+       return entry;
+}
+
+static void insert_ordered_extent(struct btrfs_ordered_extent *entry)
+{
+       struct btrfs_inode *inode = BTRFS_I(entry->inode);
+       struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree;
+       struct btrfs_root *root = inode->root;
+       struct btrfs_fs_info *fs_info = root->fs_info;
+       struct rb_node *node;
+
        trace_btrfs_ordered_extent_add(inode, entry);
 
+       percpu_counter_add_batch(&fs_info->ordered_bytes, entry->num_bytes,
+                                fs_info->delalloc_batch);
+
+       /* One ref for the tree. */
+       refcount_inc(&entry->refs);
+
        spin_lock_irq(&tree->lock);
-       node = tree_insert(&tree->tree, file_offset,
-                          &entry->rb_node);
+       node = tree_insert(&tree->tree, entry->file_offset, &entry->rb_node);
        if (node)
                btrfs_panic(fs_info, -EEXIST,
                                "inconsistency in ordered tree at offset %llu",
-                               file_offset);
+                               entry->file_offset);
        spin_unlock_irq(&tree->lock);
 
        spin_lock(&root->ordered_extent_lock);
@@ -248,43 +240,43 @@ struct btrfs_ordered_extent *btrfs_alloc_ordered_extent(
                spin_unlock(&fs_info->ordered_root_lock);
        }
        spin_unlock(&root->ordered_extent_lock);
-
-       /*
-        * We don't need the count_max_extents here, we can assume that all of
-        * that work has been done at higher layers, so this is truly the
-        * smallest the extent is going to get.
-        */
-       spin_lock(&inode->lock);
-       btrfs_mod_outstanding_extents(inode, 1);
-       spin_unlock(&inode->lock);
-
-       /* One ref for the returned entry to match semantics of lookup. */
-       refcount_inc(&entry->refs);
-
-       return entry;
 }
 
 /*
- * Add a new btrfs_ordered_extent for the range, but drop the reference instead
- * of returning it to the caller.
+ * Add an ordered extent to the per-inode tree.
+ *
+ * @inode:           Inode that this extent is for.
+ * @file_offset:     Logical offset in file where the extent starts.
+ * @num_bytes:       Logical length of extent in file.
+ * @ram_bytes:       Full length of unencoded data.
+ * @disk_bytenr:     Offset of extent on disk.
+ * @disk_num_bytes:  Size of extent on disk.
+ * @offset:          Offset into unencoded data where file data starts.
+ * @flags:           Flags specifying type of extent (1 << BTRFS_ORDERED_*).
+ * @compress_type:   Compression algorithm used for data.
+ *
+ * Most of these parameters correspond to &struct btrfs_file_extent_item. The
+ * tree is given a single reference on the ordered extent that was inserted, and
+ * the returned pointer is given a second reference.
+ *
+ * Return: the new ordered extent or error pointer.
  */
-int btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset,
-                            u64 num_bytes, u64 ram_bytes, u64 disk_bytenr,
-                            u64 disk_num_bytes, u64 offset, unsigned long flags,
-                            int compress_type)
+struct btrfs_ordered_extent *btrfs_alloc_ordered_extent(
+                       struct btrfs_inode *inode, u64 file_offset,
+                       u64 num_bytes, u64 ram_bytes, u64 disk_bytenr,
+                       u64 disk_num_bytes, u64 offset, unsigned long flags,
+                       int compress_type)
 {
-       struct btrfs_ordered_extent *ordered;
-
-       ordered = btrfs_alloc_ordered_extent(inode, file_offset, num_bytes,
-                                            ram_bytes, disk_bytenr,
-                                            disk_num_bytes, offset, flags,
-                                            compress_type);
+       struct btrfs_ordered_extent *entry;
 
-       if (IS_ERR(ordered))
-               return PTR_ERR(ordered);
-       btrfs_put_ordered_extent(ordered);
+       ASSERT((flags & ~BTRFS_ORDERED_TYPE_FLAGS) == 0);
 
-       return 0;
+       entry = alloc_ordered_extent(inode, file_offset, num_bytes, ram_bytes,
+                                    disk_bytenr, disk_num_bytes, offset, flags,
+                                    compress_type);
+       if (!IS_ERR(entry))
+               insert_ordered_extent(entry);
+       return entry;
 }
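
The kernel-doc comment above describes a two-reference contract: alloc_ordered_extent() hands back an entry holding one reference for the caller, and insert_ordered_extent() takes a second reference on behalf of the tree. A minimal user-space sketch of that contract, with hypothetical stand-in types rather than the kernel structures or API:

#include <assert.h>
#include <stdio.h>
#include <stdlib.h>

struct fake_ordered_extent {
	unsigned int refs;
	unsigned long long file_offset;
};

static struct fake_ordered_extent *fake_alloc(unsigned long long file_offset)
{
	struct fake_ordered_extent *e = calloc(1, sizeof(*e));

	if (!e)
		return NULL;
	e->file_offset = file_offset;
	e->refs = 1;		/* reference owned by the caller */
	return e;
}

static void fake_insert(struct fake_ordered_extent *e)
{
	e->refs++;		/* extra reference owned by the tree */
}

static void fake_put(struct fake_ordered_extent *e)
{
	assert(e->refs > 0);
	if (--e->refs == 0)
		free(e);
}

int main(void)
{
	struct fake_ordered_extent *e = fake_alloc(4096);

	fake_insert(e);		/* the tree now holds its own reference */
	fake_put(e);		/* caller drops its reference early... */
	fake_put(e);		/* ...tree removal drops the last one and frees */
	printf("done\n");
	return 0;
}

The effect of the split in the patch is that the reference taken for the tree now lives next to the tree insertion in insert_ordered_extent(), while the allocation path only sets up the caller's reference.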
 
 /*
@@ -311,6 +303,90 @@ static void finish_ordered_fn(struct btrfs_work *work)
        btrfs_finish_ordered_io(ordered_extent);
 }
 
+static bool can_finish_ordered_extent(struct btrfs_ordered_extent *ordered,
+                                     struct page *page, u64 file_offset,
+                                     u64 len, bool uptodate)
+{
+       struct btrfs_inode *inode = BTRFS_I(ordered->inode);
+       struct btrfs_fs_info *fs_info = inode->root->fs_info;
+
+       lockdep_assert_held(&inode->ordered_tree.lock);
+
+       if (page) {
+               ASSERT(page->mapping);
+               ASSERT(page_offset(page) <= file_offset);
+               ASSERT(file_offset + len <= page_offset(page) + PAGE_SIZE);
+
+               /*
+                * Ordered (Private2) bit indicates whether we still have
+                * pending io unfinished for the ordered extent.
+                *
+                * If there's no such bit, we need to skip to next range.
+                */
+               if (!btrfs_page_test_ordered(fs_info, page, file_offset, len))
+                       return false;
+               btrfs_page_clear_ordered(fs_info, page, file_offset, len);
+       }
+
+       /* Now we're fine to update the accounting. */
+       if (WARN_ON_ONCE(len > ordered->bytes_left)) {
+               btrfs_crit(fs_info,
+"bad ordered extent accounting, root=%llu ino=%llu OE offset=%llu OE len=%llu to_dec=%llu left=%llu",
+                          inode->root->root_key.objectid, btrfs_ino(inode),
+                          ordered->file_offset, ordered->num_bytes,
+                          len, ordered->bytes_left);
+               ordered->bytes_left = 0;
+       } else {
+               ordered->bytes_left -= len;
+       }
+
+       if (!uptodate)
+               set_bit(BTRFS_ORDERED_IOERR, &ordered->flags);
+
+       if (ordered->bytes_left)
+               return false;
+
+       /*
+        * All the IO of the ordered extent is finished, we need to queue
+        * the finish_func to be executed.
+        */
+       set_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags);
+       cond_wake_up(&ordered->wait);
+       refcount_inc(&ordered->refs);
+       trace_btrfs_ordered_extent_mark_finished(inode, ordered);
+       return true;
+}
+
+static void btrfs_queue_ordered_fn(struct btrfs_ordered_extent *ordered)
+{
+       struct btrfs_inode *inode = BTRFS_I(ordered->inode);
+       struct btrfs_fs_info *fs_info = inode->root->fs_info;
+       struct btrfs_workqueue *wq = btrfs_is_free_space_inode(inode) ?
+               fs_info->endio_freespace_worker : fs_info->endio_write_workers;
+
+       btrfs_init_work(&ordered->work, finish_ordered_fn, NULL, NULL);
+       btrfs_queue_work(wq, &ordered->work);
+}
+
+bool btrfs_finish_ordered_extent(struct btrfs_ordered_extent *ordered,
+                                struct page *page, u64 file_offset, u64 len,
+                                bool uptodate)
+{
+       struct btrfs_inode *inode = BTRFS_I(ordered->inode);
+       unsigned long flags;
+       bool ret;
+
+       trace_btrfs_finish_ordered_extent(inode, file_offset, len, uptodate);
+
+       spin_lock_irqsave(&inode->ordered_tree.lock, flags);
+       ret = can_finish_ordered_extent(ordered, page, file_offset, len, uptodate);
+       spin_unlock_irqrestore(&inode->ordered_tree.lock, flags);
+
+       if (ret)
+               btrfs_queue_ordered_fn(ordered);
+       return ret;
+}
+
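btrfs_finish_ordered_extent() above separates the decision from the action: the byte accounting and the "is everything done" check happen under the ordered tree lock in can_finish_ordered_extent(), and the completion work is queued only after the lock is dropped. A simplified user-space model of that pattern, with a plain mutex standing in for the irq-safe spinlock and every name hypothetical:

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct fake_ordered {
	pthread_mutex_t lock;
	unsigned long long bytes_left;
	bool io_done;
};

/* Returns true exactly once: when the last byte range completes. */
static bool fake_can_finish(struct fake_ordered *o, unsigned long long len)
{
	bool finish = false;

	pthread_mutex_lock(&o->lock);
	if (len > o->bytes_left)
		o->bytes_left = 0;	/* mirrors the WARN + clamp in the real accounting */
	else
		o->bytes_left -= len;
	if (o->bytes_left == 0 && !o->io_done) {
		o->io_done = true;
		finish = true;
	}
	pthread_mutex_unlock(&o->lock);
	return finish;
}

static void fake_queue_completion(struct fake_ordered *o)
{
	/* Stand-in for queueing the completion work item. */
	printf("ordered extent complete\n");
}

int main(void)
{
	struct fake_ordered o = { .bytes_left = 8192 };

	pthread_mutex_init(&o.lock, NULL);
	if (fake_can_finish(&o, 4096))
		fake_queue_completion(&o);
	if (fake_can_finish(&o, 4096))
		fake_queue_completion(&o);	/* fires here, bytes_left hit zero */
	pthread_mutex_destroy(&o.lock);
	return 0;
}

Keeping the queueing outside the critical section mirrors how btrfs_mark_ordered_io_finished() below drops and retakes the tree lock around btrfs_queue_ordered_fn().
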
 /*
  * Mark all ordered extents io inside the specified range finished.
  *
@@ -329,21 +405,14 @@ void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode,
                                    u64 num_bytes, bool uptodate)
 {
        struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree;
-       struct btrfs_fs_info *fs_info = inode->root->fs_info;
-       struct btrfs_workqueue *wq;
        struct rb_node *node;
        struct btrfs_ordered_extent *entry = NULL;
        unsigned long flags;
        u64 cur = file_offset;
 
-       if (btrfs_is_free_space_inode(inode))
-               wq = fs_info->endio_freespace_worker;
-       else
-               wq = fs_info->endio_write_workers;
-
-       if (page)
-               ASSERT(page->mapping && page_offset(page) <= file_offset &&
-                      file_offset + num_bytes <= page_offset(page) + PAGE_SIZE);
+       trace_btrfs_writepage_end_io_hook(inode, file_offset,
+                                         file_offset + num_bytes - 1,
+                                         uptodate);
 
        spin_lock_irqsave(&tree->lock, flags);
        while (cur < file_offset + num_bytes) {
@@ -397,50 +466,9 @@ void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode,
                ASSERT(end + 1 - cur < U32_MAX);
                len = end + 1 - cur;
 
-               if (page) {
-                       /*
-                        * Ordered (Private2) bit indicates whether we still
-                        * have pending io unfinished for the ordered extent.
-                        *
-                        * If there's no such bit, we need to skip to next range.
-                        */
-                       if (!btrfs_page_test_ordered(fs_info, page, cur, len)) {
-                               cur += len;
-                               continue;
-                       }
-                       btrfs_page_clear_ordered(fs_info, page, cur, len);
-               }
-
-               /* Now we're fine to update the accounting */
-               if (unlikely(len > entry->bytes_left)) {
-                       WARN_ON(1);
-                       btrfs_crit(fs_info,
-"bad ordered extent accounting, root=%llu ino=%llu OE offset=%llu OE len=%llu to_dec=%u left=%llu",
-                                  inode->root->root_key.objectid,
-                                  btrfs_ino(inode),
-                                  entry->file_offset,
-                                  entry->num_bytes,
-                                  len, entry->bytes_left);
-                       entry->bytes_left = 0;
-               } else {
-                       entry->bytes_left -= len;
-               }
-
-               if (!uptodate)
-                       set_bit(BTRFS_ORDERED_IOERR, &entry->flags);
-
-               /*
-                * All the IO of the ordered extent is finished, we need to queue
-                * the finish_func to be executed.
-                */
-               if (entry->bytes_left == 0) {
-                       set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
-                       cond_wake_up(&entry->wait);
-                       refcount_inc(&entry->refs);
-                       trace_btrfs_ordered_extent_mark_finished(inode, entry);
+               if (can_finish_ordered_extent(entry, page, cur, len, uptodate)) {
                        spin_unlock_irqrestore(&tree->lock, flags);
-                       btrfs_init_work(&entry->work, finish_ordered_fn, NULL, NULL);
-                       btrfs_queue_work(wq, &entry->work);
+                       btrfs_queue_ordered_fn(entry);
                        spin_lock_irqsave(&tree->lock, flags);
                }
                cur += len;
@@ -564,7 +592,7 @@ void btrfs_remove_ordered_extent(struct btrfs_inode *btrfs_inode,
        freespace_inode = btrfs_is_free_space_inode(btrfs_inode);
 
        btrfs_lockdep_acquire(fs_info, btrfs_trans_pending_ordered);
-       /* This is paired with btrfs_add_ordered_extent. */
+       /* This is paired with btrfs_alloc_ordered_extent. */
        spin_lock(&btrfs_inode->lock);
        btrfs_mod_outstanding_extents(btrfs_inode, -1);
        spin_unlock(&btrfs_inode->lock);
@@ -712,11 +740,9 @@ void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr,
                             const u64 range_start, const u64 range_len)
 {
        struct btrfs_root *root;
-       struct list_head splice;
+       LIST_HEAD(splice);
        u64 done;
 
-       INIT_LIST_HEAD(&splice);
-
        mutex_lock(&fs_info->ordered_operations_mutex);
        spin_lock(&fs_info->ordered_root_lock);
        list_splice_init(&fs_info->ordered_roots, &splice);
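
The change above is an initialization idiom only: LIST_HEAD() declares the on-stack list head already pointing at itself, replacing a separate INIT_LIST_HEAD() call with identical behavior. A small user-space re-creation of the two equivalent forms, using hypothetical look-alikes of the kernel macros rather than the kernel's own headers:

#include <assert.h>

struct list_node {
	struct list_node *next, *prev;
};

#define FAKE_LIST_HEAD(name) struct list_node name = { &(name), &(name) }

static void fake_init_list_head(struct list_node *head)
{
	head->next = head;
	head->prev = head;
}

int main(void)
{
	/* Form 1: declare and initialize in one step. */
	FAKE_LIST_HEAD(splice_a);

	/* Form 2: declare, then initialize explicitly. */
	struct list_node splice_b;

	fake_init_list_head(&splice_b);

	assert(splice_a.next == &splice_a && splice_a.prev == &splice_a);
	assert(splice_b.next == &splice_b && splice_b.prev == &splice_b);
	return 0;
}
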
@@ -1117,17 +1143,22 @@ bool btrfs_try_lock_ordered_range(struct btrfs_inode *inode, u64 start, u64 end,
 }
 
 /* Split out a new ordered extent for this first @len bytes of @ordered. */
-int btrfs_split_ordered_extent(struct btrfs_ordered_extent *ordered, u64 len)
+struct btrfs_ordered_extent *btrfs_split_ordered_extent(
+                       struct btrfs_ordered_extent *ordered, u64 len)
 {
-       struct inode *inode = ordered->inode;
-       struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree;
-       struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+       struct btrfs_inode *inode = BTRFS_I(ordered->inode);
+       struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree;
+       struct btrfs_root *root = inode->root;
+       struct btrfs_fs_info *fs_info = root->fs_info;
        u64 file_offset = ordered->file_offset;
        u64 disk_bytenr = ordered->disk_bytenr;
-       unsigned long flags = ordered->flags & BTRFS_ORDERED_TYPE_FLAGS;
+       unsigned long flags = ordered->flags;
+       struct btrfs_ordered_sum *sum, *tmpsum;
+       struct btrfs_ordered_extent *new;
        struct rb_node *node;
+       u64 offset = 0;
 
-       trace_btrfs_ordered_extent_split(BTRFS_I(inode), ordered);
+       trace_btrfs_ordered_extent_split(inode, ordered);
 
        ASSERT(!(flags & (1U << BTRFS_ORDERED_COMPRESSED)));
 
@@ -1136,18 +1167,27 @@ int btrfs_split_ordered_extent(struct btrfs_ordered_extent *ordered, u64 len)
         * reduce the original extent to a zero length either.
         */
        if (WARN_ON_ONCE(len >= ordered->num_bytes))
-               return -EINVAL;
-       /* We cannot split once ordered extent is past end_bio. */
-       if (WARN_ON_ONCE(ordered->bytes_left != ordered->disk_num_bytes))
-               return -EINVAL;
+               return ERR_PTR(-EINVAL);
+       /* We cannot split partially completed ordered extents. */
+       if (ordered->bytes_left) {
+               ASSERT(!(flags & ~BTRFS_ORDERED_TYPE_FLAGS));
+               if (WARN_ON_ONCE(ordered->bytes_left != ordered->disk_num_bytes))
+                       return ERR_PTR(-EINVAL);
+       }
        /* We cannot split a compressed ordered extent. */
        if (WARN_ON_ONCE(ordered->disk_num_bytes != ordered->num_bytes))
-               return -EINVAL;
-       /* Checksum list should be empty. */
-       if (WARN_ON_ONCE(!list_empty(&ordered->list)))
-               return -EINVAL;
+               return ERR_PTR(-EINVAL);
 
-       spin_lock_irq(&tree->lock);
+       new = alloc_ordered_extent(inode, file_offset, len, len, disk_bytenr,
+                                  len, 0, flags, ordered->compress_type);
+       if (IS_ERR(new))
+               return new;
+
+       /* One ref for the tree. */
+       refcount_inc(&new->refs);
+
+       spin_lock_irq(&root->ordered_extent_lock);
+       spin_lock(&tree->lock);
        /* Remove from tree once */
        node = &ordered->rb_node;
        rb_erase(node, &tree->tree);
@@ -1159,26 +1199,48 @@ int btrfs_split_ordered_extent(struct btrfs_ordered_extent *ordered, u64 len)
        ordered->disk_bytenr += len;
        ordered->num_bytes -= len;
        ordered->disk_num_bytes -= len;
-       ordered->bytes_left -= len;
+
+       if (test_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags)) {
+               ASSERT(ordered->bytes_left == 0);
+               new->bytes_left = 0;
+       } else {
+               ordered->bytes_left -= len;
+       }
+
+       if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags)) {
+               if (ordered->truncated_len > len) {
+                       ordered->truncated_len -= len;
+               } else {
+                       new->truncated_len = ordered->truncated_len;
+                       ordered->truncated_len = 0;
+               }
+       }
+
+       list_for_each_entry_safe(sum, tmpsum, &ordered->list, list) {
+               if (offset == len)
+                       break;
+               list_move_tail(&sum->list, &new->list);
+               offset += sum->len;
+       }
 
        /* Re-insert the node */
        node = tree_insert(&tree->tree, ordered->file_offset, &ordered->rb_node);
        if (node)
                btrfs_panic(fs_info, -EEXIST,
                        "zoned: inconsistency in ordered tree at offset %llu",
-                           ordered->file_offset);
+                       ordered->file_offset);
 
-       spin_unlock_irq(&tree->lock);
-
-       /*
-        * The splitting extent is already counted and will be added again in
-        * btrfs_add_ordered_extent(). Subtract len to avoid double counting.
-        */
-       percpu_counter_add_batch(&fs_info->ordered_bytes, -len, fs_info->delalloc_batch);
+       node = tree_insert(&tree->tree, new->file_offset, &new->rb_node);
+       if (node)
+               btrfs_panic(fs_info, -EEXIST,
+                       "zoned: inconsistency in ordered tree at offset %llu",
+                       new->file_offset);
+       spin_unlock(&tree->lock);
 
-       return btrfs_add_ordered_extent(BTRFS_I(inode), file_offset, len, len,
-                                       disk_bytenr, len, 0, flags,
-                                       ordered->compress_type);
+       list_add_tail(&new->root_extent_list, &root->ordered_extents);
+       root->nr_ordered_extents++;
+       spin_unlock_irq(&root->ordered_extent_lock);
+       return new;
 }
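
The new split path also divides the attached checksum items: the list_for_each_entry_safe() loop moves sums from the front of the original extent to the new one until the accumulated length reaches the split point. A user-space model of that handover, using a hypothetical singly linked sum list; like the real loop, it assumes the split offset lands exactly on a sum boundary:

#include <assert.h>
#include <stdio.h>

struct fake_sum {
	unsigned long long len;
	struct fake_sum *next;
};

/* Detach leading sums until @len bytes are covered; return that sublist. */
static struct fake_sum *split_sums(struct fake_sum **orig, unsigned long long len)
{
	struct fake_sum *head = *orig;
	struct fake_sum **tail = orig;
	unsigned long long offset = 0;

	while (*tail && offset < len) {
		offset += (*tail)->len;
		tail = &(*tail)->next;
	}
	assert(offset == len);	/* split point must fall on a sum boundary */

	*orig = *tail;		/* original keeps the remaining sums */
	*tail = NULL;		/* terminate the detached front part */
	return head;
}

int main(void)
{
	struct fake_sum c = { 4096, NULL };
	struct fake_sum b = { 4096, &c };
	struct fake_sum a = { 4096, &b };
	struct fake_sum *orig = &a;
	struct fake_sum *front = split_sums(&orig, 8192);

	printf("front covers %llu+%llu bytes, original keeps %llu bytes\n",
	       front->len, front->next->len, orig->len);
	return 0;
}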
 
 int __init ordered_data_init(void)