OSDN Git Service

fc70be66ee792cc00c94b93f6674c580bd5d663b
[sagit-ice-cold/kernel_xiaomi_msm8998.git] / fs / btrfs / disk-io.c
1 /*
2  * Copyright (C) 2007 Oracle.  All rights reserved.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU General Public
6  * License v2 as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope that it will be useful,
9  * but WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
11  * General Public License for more details.
12  *
13  * You should have received a copy of the GNU General Public
14  * License along with this program; if not, write to the
15  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16  * Boston, MA 021110-1307, USA.
17  */
18
19 #include <linux/fs.h>
20 #include <linux/blkdev.h>
21 #include <linux/scatterlist.h>
22 #include <linux/swap.h>
23 #include <linux/radix-tree.h>
24 #include <linux/writeback.h>
25 #include <linux/buffer_head.h>
26 #include <linux/workqueue.h>
27 #include <linux/kthread.h>
28 #include <linux/freezer.h>
29 #include <linux/slab.h>
30 #include <linux/migrate.h>
31 #include <linux/ratelimit.h>
32 #include <linux/uuid.h>
33 #include <linux/semaphore.h>
34 #include <asm/unaligned.h>
35 #include "ctree.h"
36 #include "disk-io.h"
37 #include "hash.h"
38 #include "transaction.h"
39 #include "btrfs_inode.h"
40 #include "volumes.h"
41 #include "print-tree.h"
42 #include "locking.h"
43 #include "tree-log.h"
44 #include "free-space-cache.h"
45 #include "inode-map.h"
46 #include "check-integrity.h"
47 #include "rcu-string.h"
48 #include "dev-replace.h"
49 #include "raid56.h"
50 #include "sysfs.h"
51 #include "qgroup.h"
52
53 #ifdef CONFIG_X86
54 #include <asm/cpufeature.h>
55 #endif
56
57 static const struct extent_io_ops btree_extent_io_ops;
58 static void end_workqueue_fn(struct btrfs_work *work);
59 static void free_fs_root(struct btrfs_root *root);
60 static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
61                                     int read_only);
62 static void btrfs_destroy_ordered_extents(struct btrfs_root *root);
63 static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
64                                       struct btrfs_root *root);
65 static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root);
66 static int btrfs_destroy_marked_extents(struct btrfs_root *root,
67                                         struct extent_io_tree *dirty_pages,
68                                         int mark);
69 static int btrfs_destroy_pinned_extent(struct btrfs_root *root,
70                                        struct extent_io_tree *pinned_extents);
71 static int btrfs_cleanup_transaction(struct btrfs_root *root);
72 static void btrfs_error_commit_super(struct btrfs_root *root);
73
74 /*
75  * btrfs_end_io_wq structs are used to do processing in task context when an IO
76  * is complete.  This is used during reads to verify checksums, and it is used
77  * by writes to insert metadata for new file extents after IO is complete.
78  */
79 struct btrfs_end_io_wq {
80         struct bio *bio;
81         bio_end_io_t *end_io;
82         void *private;
83         struct btrfs_fs_info *info;
84         int error;
85         enum btrfs_wq_endio_type metadata;
86         struct list_head list;
87         struct btrfs_work work;
88 };
89
90 static struct kmem_cache *btrfs_end_io_wq_cache;
91
92 int __init btrfs_end_io_wq_init(void)
93 {
94         btrfs_end_io_wq_cache = kmem_cache_create("btrfs_end_io_wq",
95                                         sizeof(struct btrfs_end_io_wq),
96                                         0,
97                                         SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
98                                         NULL);
99         if (!btrfs_end_io_wq_cache)
100                 return -ENOMEM;
101         return 0;
102 }
103
104 void btrfs_end_io_wq_exit(void)
105 {
106         if (btrfs_end_io_wq_cache)
107                 kmem_cache_destroy(btrfs_end_io_wq_cache);
108 }
109
110 /*
111  * async submit bios are used to offload expensive checksumming
112  * onto the worker threads.  They checksum file and metadata bios
113  * just before they are sent down the IO stack.
114  */
115 struct async_submit_bio {
116         struct inode *inode;
117         struct bio *bio;
118         struct list_head list;
119         extent_submit_bio_hook_t *submit_bio_start;
120         extent_submit_bio_hook_t *submit_bio_done;
121         int rw;
122         int mirror_num;
123         unsigned long bio_flags;
124         /*
125          * bio_offset is optional, can be used if the pages in the bio
126          * can't tell us where in the file the bio should go
127          */
128         u64 bio_offset;
129         struct btrfs_work work;
130         int error;
131 };
132
133 /*
134  * Lockdep class keys for extent_buffer->lock's in this root.  For a given
135  * eb, the lockdep key is determined by the btrfs_root it belongs to and
136  * the level the eb occupies in the tree.
137  *
138  * Different roots are used for different purposes and may nest inside each
139  * other and they require separate keysets.  As lockdep keys should be
140  * static, assign keysets according to the purpose of the root as indicated
141  * by btrfs_root->objectid.  This ensures that all special purpose roots
142  * have separate keysets.
143  *
144  * Lock-nesting across peer nodes is always done with the immediate parent
145  * node locked thus preventing deadlock.  As lockdep doesn't know this, use
146  * subclass to avoid triggering lockdep warning in such cases.
147  *
148  * The key is set by the readpage_end_io_hook after the buffer has passed
149  * csum validation but before the pages are unlocked.  It is also set by
150  * btrfs_init_new_buffer on freshly allocated blocks.
151  *
152  * We also add a check to make sure the highest level of the tree is the
153  * same as our lockdep setup here.  If BTRFS_MAX_LEVEL changes, this code
154  * needs update as well.
155  */
156 #ifdef CONFIG_DEBUG_LOCK_ALLOC
157 # if BTRFS_MAX_LEVEL != 8
158 #  error
159 # endif
160
161 static struct btrfs_lockdep_keyset {
162         u64                     id;             /* root objectid */
163         const char              *name_stem;     /* lock name stem */
164         char                    names[BTRFS_MAX_LEVEL + 1][20];
165         struct lock_class_key   keys[BTRFS_MAX_LEVEL + 1];
166 } btrfs_lockdep_keysets[] = {
167         { .id = BTRFS_ROOT_TREE_OBJECTID,       .name_stem = "root"     },
168         { .id = BTRFS_EXTENT_TREE_OBJECTID,     .name_stem = "extent"   },
169         { .id = BTRFS_CHUNK_TREE_OBJECTID,      .name_stem = "chunk"    },
170         { .id = BTRFS_DEV_TREE_OBJECTID,        .name_stem = "dev"      },
171         { .id = BTRFS_FS_TREE_OBJECTID,         .name_stem = "fs"       },
172         { .id = BTRFS_CSUM_TREE_OBJECTID,       .name_stem = "csum"     },
173         { .id = BTRFS_QUOTA_TREE_OBJECTID,      .name_stem = "quota"    },
174         { .id = BTRFS_TREE_LOG_OBJECTID,        .name_stem = "log"      },
175         { .id = BTRFS_TREE_RELOC_OBJECTID,      .name_stem = "treloc"   },
176         { .id = BTRFS_DATA_RELOC_TREE_OBJECTID, .name_stem = "dreloc"   },
177         { .id = BTRFS_UUID_TREE_OBJECTID,       .name_stem = "uuid"     },
178         { .id = 0,                              .name_stem = "tree"     },
179 };
180
181 void __init btrfs_init_lockdep(void)
182 {
183         int i, j;
184
185         /* initialize lockdep class names */
186         for (i = 0; i < ARRAY_SIZE(btrfs_lockdep_keysets); i++) {
187                 struct btrfs_lockdep_keyset *ks = &btrfs_lockdep_keysets[i];
188
189                 for (j = 0; j < ARRAY_SIZE(ks->names); j++)
190                         snprintf(ks->names[j], sizeof(ks->names[j]),
191                                  "btrfs-%s-%02d", ks->name_stem, j);
192         }
193 }
194
195 void btrfs_set_buffer_lockdep_class(u64 objectid, struct extent_buffer *eb,
196                                     int level)
197 {
198         struct btrfs_lockdep_keyset *ks;
199
200         BUG_ON(level >= ARRAY_SIZE(ks->keys));
201
202         /* find the matching keyset, id 0 is the default entry */
203         for (ks = btrfs_lockdep_keysets; ks->id; ks++)
204                 if (ks->id == objectid)
205                         break;
206
207         lockdep_set_class_and_name(&eb->lock,
208                                    &ks->keys[level], ks->names[level]);
209 }
210
211 #endif
212
213 /*
214  * extents on the btree inode are pretty simple, there's one extent
215  * that covers the entire device
216  */
217 static struct extent_map *btree_get_extent(struct inode *inode,
218                 struct page *page, size_t pg_offset, u64 start, u64 len,
219                 int create)
220 {
221         struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
222         struct extent_map *em;
223         int ret;
224
225         read_lock(&em_tree->lock);
226         em = lookup_extent_mapping(em_tree, start, len);
227         if (em) {
228                 em->bdev =
229                         BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
230                 read_unlock(&em_tree->lock);
231                 goto out;
232         }
233         read_unlock(&em_tree->lock);
234
235         em = alloc_extent_map();
236         if (!em) {
237                 em = ERR_PTR(-ENOMEM);
238                 goto out;
239         }
240         em->start = 0;
241         em->len = (u64)-1;
242         em->block_len = (u64)-1;
243         em->block_start = 0;
244         em->bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
245
246         write_lock(&em_tree->lock);
247         ret = add_extent_mapping(em_tree, em, 0);
248         if (ret == -EEXIST) {
249                 free_extent_map(em);
250                 em = lookup_extent_mapping(em_tree, start, len);
251                 if (!em)
252                         em = ERR_PTR(-EIO);
253         } else if (ret) {
254                 free_extent_map(em);
255                 em = ERR_PTR(ret);
256         }
257         write_unlock(&em_tree->lock);
258
259 out:
260         return em;
261 }
262
263 u32 btrfs_csum_data(char *data, u32 seed, size_t len)
264 {
265         return btrfs_crc32c(seed, data, len);
266 }
267
268 void btrfs_csum_final(u32 crc, char *result)
269 {
270         put_unaligned_le32(~crc, result);
271 }
272
273 /*
274  * compute the csum for a btree block, and either verify it or write it
275  * into the csum field of the block.
276  */
277 static int csum_tree_block(struct btrfs_fs_info *fs_info,
278                            struct extent_buffer *buf,
279                            int verify)
280 {
281         u16 csum_size = btrfs_super_csum_size(fs_info->super_copy);
282         char *result = NULL;
283         unsigned long len;
284         unsigned long cur_len;
285         unsigned long offset = BTRFS_CSUM_SIZE;
286         char *kaddr;
287         unsigned long map_start;
288         unsigned long map_len;
289         int err;
290         u32 crc = ~(u32)0;
291         unsigned long inline_result;
292
293         len = buf->len - offset;
294         while (len > 0) {
295                 err = map_private_extent_buffer(buf, offset, 32,
296                                         &kaddr, &map_start, &map_len);
297                 if (err)
298                         return 1;
299                 cur_len = min(len, map_len - (offset - map_start));
300                 crc = btrfs_csum_data(kaddr + offset - map_start,
301                                       crc, cur_len);
302                 len -= cur_len;
303                 offset += cur_len;
304         }
305         if (csum_size > sizeof(inline_result)) {
306                 result = kzalloc(csum_size, GFP_NOFS);
307                 if (!result)
308                         return 1;
309         } else {
310                 result = (char *)&inline_result;
311         }
312
313         btrfs_csum_final(crc, result);
314
315         if (verify) {
316                 if (memcmp_extent_buffer(buf, result, 0, csum_size)) {
317                         u32 val;
318                         u32 found = 0;
319                         memcpy(&found, result, csum_size);
320
321                         read_extent_buffer(buf, &val, 0, csum_size);
322                         btrfs_warn_rl(fs_info,
323                                 "%s checksum verify failed on %llu wanted %X found %X "
324                                 "level %d",
325                                 fs_info->sb->s_id, buf->start,
326                                 val, found, btrfs_header_level(buf));
327                         if (result != (char *)&inline_result)
328                                 kfree(result);
329                         return 1;
330                 }
331         } else {
332                 write_extent_buffer(buf, result, 0, csum_size);
333         }
334         if (result != (char *)&inline_result)
335                 kfree(result);
336         return 0;
337 }
338
339 /*
340  * we can't consider a given block up to date unless the transid of the
341  * block matches the transid in the parent node's pointer.  This is how we
342  * detect blocks that either didn't get written at all or got written
343  * in the wrong place.
344  */
345 static int verify_parent_transid(struct extent_io_tree *io_tree,
346                                  struct extent_buffer *eb, u64 parent_transid,
347                                  int atomic)
348 {
349         struct extent_state *cached_state = NULL;
350         int ret;
351         bool need_lock = (current->journal_info == BTRFS_SEND_TRANS_STUB);
352
353         if (!parent_transid || btrfs_header_generation(eb) == parent_transid)
354                 return 0;
355
356         if (atomic)
357                 return -EAGAIN;
358
359         if (need_lock) {
360                 btrfs_tree_read_lock(eb);
361                 btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
362         }
363
364         lock_extent_bits(io_tree, eb->start, eb->start + eb->len - 1,
365                          0, &cached_state);
366         if (extent_buffer_uptodate(eb) &&
367             btrfs_header_generation(eb) == parent_transid) {
368                 ret = 0;
369                 goto out;
370         }
371         btrfs_err_rl(eb->fs_info,
372                 "parent transid verify failed on %llu wanted %llu found %llu",
373                         eb->start,
374                         parent_transid, btrfs_header_generation(eb));
375         ret = 1;
376
377         /*
378          * Things reading via commit roots that don't have normal protection,
379          * like send, can have a really old block in cache that may point at a
380          * block that has been free'd and re-allocated.  So don't clear uptodate
381          * if we find an eb that is under IO (dirty/writeback) because we could
382          * end up reading in the stale data and then writing it back out and
383          * making everybody very sad.
384          */
385         if (!extent_buffer_under_io(eb))
386                 clear_extent_buffer_uptodate(eb);
387 out:
388         unlock_extent_cached(io_tree, eb->start, eb->start + eb->len - 1,
389                              &cached_state, GFP_NOFS);
390         if (need_lock)
391                 btrfs_tree_read_unlock_blocking(eb);
392         return ret;
393 }
394
395 /*
396  * Return 0 if the superblock checksum type matches the checksum value of that
397  * algorithm. Pass the raw disk superblock data.
398  */
399 static int btrfs_check_super_csum(char *raw_disk_sb)
400 {
401         struct btrfs_super_block *disk_sb =
402                 (struct btrfs_super_block *)raw_disk_sb;
403         u16 csum_type = btrfs_super_csum_type(disk_sb);
404         int ret = 0;
405
406         if (csum_type == BTRFS_CSUM_TYPE_CRC32) {
407                 u32 crc = ~(u32)0;
408                 const int csum_size = sizeof(crc);
409                 char result[csum_size];
410
411                 /*
412                  * The super_block structure does not span the whole
413                  * BTRFS_SUPER_INFO_SIZE range, we expect that the unused space
414                  * is filled with zeros and is included in the checkum.
415                  */
416                 crc = btrfs_csum_data(raw_disk_sb + BTRFS_CSUM_SIZE,
417                                 crc, BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE);
418                 btrfs_csum_final(crc, result);
419
420                 if (memcmp(raw_disk_sb, result, csum_size))
421                         ret = 1;
422         }
423
424         if (csum_type >= ARRAY_SIZE(btrfs_csum_sizes)) {
425                 printk(KERN_ERR "BTRFS: unsupported checksum algorithm %u\n",
426                                 csum_type);
427                 ret = 1;
428         }
429
430         return ret;
431 }
432
433 /*
434  * helper to read a given tree block, doing retries as required when
435  * the checksums don't match and we have alternate mirrors to try.
436  */
437 static int btree_read_extent_buffer_pages(struct btrfs_root *root,
438                                           struct extent_buffer *eb,
439                                           u64 start, u64 parent_transid)
440 {
441         struct extent_io_tree *io_tree;
442         int failed = 0;
443         int ret;
444         int num_copies = 0;
445         int mirror_num = 0;
446         int failed_mirror = 0;
447
448         io_tree = &BTRFS_I(root->fs_info->btree_inode)->io_tree;
449         while (1) {
450                 clear_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags);
451                 ret = read_extent_buffer_pages(io_tree, eb, start,
452                                                WAIT_COMPLETE,
453                                                btree_get_extent, mirror_num);
454                 if (!ret) {
455                         if (!verify_parent_transid(io_tree, eb,
456                                                    parent_transid, 0))
457                                 break;
458                         else
459                                 ret = -EIO;
460                 }
461
462                 num_copies = btrfs_num_copies(root->fs_info,
463                                               eb->start, eb->len);
464                 if (num_copies == 1)
465                         break;
466
467                 if (!failed_mirror) {
468                         failed = 1;
469                         failed_mirror = eb->read_mirror;
470                 }
471
472                 mirror_num++;
473                 if (mirror_num == failed_mirror)
474                         mirror_num++;
475
476                 if (mirror_num > num_copies)
477                         break;
478         }
479
480         if (failed && !ret && failed_mirror)
481                 repair_eb_io_failure(root, eb, failed_mirror);
482
483         return ret;
484 }
485
486 /*
487  * checksum a dirty tree block before IO.  This has extra checks to make sure
488  * we only fill in the checksum field in the first page of a multi-page block
489  */
490
491 static int csum_dirty_buffer(struct btrfs_fs_info *fs_info, struct page *page)
492 {
493         u64 start = page_offset(page);
494         u64 found_start;
495         struct extent_buffer *eb;
496
497         eb = (struct extent_buffer *)page->private;
498         if (page != eb->pages[0])
499                 return 0;
500         found_start = btrfs_header_bytenr(eb);
501         if (WARN_ON(found_start != start || !PageUptodate(page)))
502                 return 0;
503         csum_tree_block(fs_info, eb, 0);
504         return 0;
505 }
506
507 static int check_tree_block_fsid(struct btrfs_fs_info *fs_info,
508                                  struct extent_buffer *eb)
509 {
510         struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
511         u8 fsid[BTRFS_UUID_SIZE];
512         int ret = 1;
513
514         read_extent_buffer(eb, fsid, btrfs_header_fsid(), BTRFS_FSID_SIZE);
515         while (fs_devices) {
516                 if (!memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE)) {
517                         ret = 0;
518                         break;
519                 }
520                 fs_devices = fs_devices->seed;
521         }
522         return ret;
523 }
524
/*
 * Log a critical "corrupt leaf/node" message for @eb.  Leaf vs. node is
 * chosen from the header level of @eb; @reason is a string describing the
 * problem and @slot the offending slot.  Arguments are parenthesized so
 * that arbitrary expressions can be passed safely.
 */
#define CORRUPT(reason, eb, root, slot)				\
	btrfs_crit((root)->fs_info, "corrupt %s, %s: block=%llu,"	\
		   " root=%llu, slot=%d",				\
		   btrfs_header_level(eb) == 0 ? "leaf" : "node",	\
		   (reason), btrfs_header_bytenr(eb),			\
		   (root)->objectid, (slot))
530
531 static noinline int check_leaf(struct btrfs_root *root,
532                                struct extent_buffer *leaf)
533 {
534         struct btrfs_key key;
535         struct btrfs_key leaf_key;
536         u32 nritems = btrfs_header_nritems(leaf);
537         int slot;
538
539         /*
540          * Extent buffers from a relocation tree have a owner field that
541          * corresponds to the subvolume tree they are based on. So just from an
542          * extent buffer alone we can not find out what is the id of the
543          * corresponding subvolume tree, so we can not figure out if the extent
544          * buffer corresponds to the root of the relocation tree or not. So skip
545          * this check for relocation trees.
546          */
547         if (nritems == 0 && !btrfs_header_flag(leaf, BTRFS_HEADER_FLAG_RELOC)) {
548                 struct btrfs_root *check_root;
549
550                 key.objectid = btrfs_header_owner(leaf);
551                 key.type = BTRFS_ROOT_ITEM_KEY;
552                 key.offset = (u64)-1;
553
554                 check_root = btrfs_get_fs_root(root->fs_info, &key, false);
555                 /*
556                  * The only reason we also check NULL here is that during
557                  * open_ctree() some roots has not yet been set up.
558                  */
559                 if (!IS_ERR_OR_NULL(check_root)) {
560                         struct extent_buffer *eb;
561
562                         eb = btrfs_root_node(check_root);
563                         /* if leaf is the root, then it's fine */
564                         if (leaf != eb) {
565                                 CORRUPT("non-root leaf's nritems is 0",
566                                         leaf, check_root, 0);
567                                 free_extent_buffer(eb);
568                                 return -EIO;
569                         }
570                         free_extent_buffer(eb);
571                 }
572                 return 0;
573         }
574
575         if (nritems == 0)
576                 return 0;
577
578         /* Check the 0 item */
579         if (btrfs_item_offset_nr(leaf, 0) + btrfs_item_size_nr(leaf, 0) !=
580             BTRFS_LEAF_DATA_SIZE(root)) {
581                 CORRUPT("invalid item offset size pair", leaf, root, 0);
582                 return -EIO;
583         }
584
585         /*
586          * Check to make sure each items keys are in the correct order and their
587          * offsets make sense.  We only have to loop through nritems-1 because
588          * we check the current slot against the next slot, which verifies the
589          * next slot's offset+size makes sense and that the current's slot
590          * offset is correct.
591          */
592         for (slot = 0; slot < nritems - 1; slot++) {
593                 btrfs_item_key_to_cpu(leaf, &leaf_key, slot);
594                 btrfs_item_key_to_cpu(leaf, &key, slot + 1);
595
596                 /* Make sure the keys are in the right order */
597                 if (btrfs_comp_cpu_keys(&leaf_key, &key) >= 0) {
598                         CORRUPT("bad key order", leaf, root, slot);
599                         return -EIO;
600                 }
601
602                 /*
603                  * Make sure the offset and ends are right, remember that the
604                  * item data starts at the end of the leaf and grows towards the
605                  * front.
606                  */
607                 if (btrfs_item_offset_nr(leaf, slot) !=
608                         btrfs_item_end_nr(leaf, slot + 1)) {
609                         CORRUPT("slot offset bad", leaf, root, slot);
610                         return -EIO;
611                 }
612
613                 /*
614                  * Check to make sure that we don't point outside of the leaf,
615                  * just incase all the items are consistent to eachother, but
616                  * all point outside of the leaf.
617                  */
618                 if (btrfs_item_end_nr(leaf, slot) >
619                     BTRFS_LEAF_DATA_SIZE(root)) {
620                         CORRUPT("slot end outside of leaf", leaf, root, slot);
621                         return -EIO;
622                 }
623         }
624
625         return 0;
626 }
627
628 static int check_node(struct btrfs_root *root, struct extent_buffer *node)
629 {
630         unsigned long nr = btrfs_header_nritems(node);
631         struct btrfs_key key, next_key;
632         int slot;
633         u64 bytenr;
634         int ret = 0;
635
636         if (nr == 0 || nr > BTRFS_NODEPTRS_PER_BLOCK(root)) {
637                 btrfs_crit(root->fs_info,
638                            "corrupt node: block %llu root %llu nritems %lu",
639                            node->start, root->objectid, nr);
640                 return -EIO;
641         }
642
643         for (slot = 0; slot < nr - 1; slot++) {
644                 bytenr = btrfs_node_blockptr(node, slot);
645                 btrfs_node_key_to_cpu(node, &key, slot);
646                 btrfs_node_key_to_cpu(node, &next_key, slot + 1);
647
648                 if (!bytenr) {
649                         CORRUPT("invalid item slot", node, root, slot);
650                         ret = -EIO;
651                         goto out;
652                 }
653
654                 if (btrfs_comp_cpu_keys(&key, &next_key) >= 0) {
655                         CORRUPT("bad key order", node, root, slot);
656                         ret = -EIO;
657                         goto out;
658                 }
659         }
660 out:
661         return ret;
662 }
663
664 static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
665                                       u64 phy_offset, struct page *page,
666                                       u64 start, u64 end, int mirror)
667 {
668         u64 found_start;
669         int found_level;
670         struct extent_buffer *eb;
671         struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
672         int ret = 0;
673         int reads_done;
674
675         if (!page->private)
676                 goto out;
677
678         eb = (struct extent_buffer *)page->private;
679
680         /* the pending IO might have been the only thing that kept this buffer
681          * in memory.  Make sure we have a ref for all this other checks
682          */
683         extent_buffer_get(eb);
684
685         reads_done = atomic_dec_and_test(&eb->io_pages);
686         if (!reads_done)
687                 goto err;
688
689         eb->read_mirror = mirror;
690         if (test_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags)) {
691                 ret = -EIO;
692                 goto err;
693         }
694
695         found_start = btrfs_header_bytenr(eb);
696         if (found_start != eb->start) {
697                 btrfs_err_rl(eb->fs_info, "bad tree block start %llu %llu",
698                                found_start, eb->start);
699                 ret = -EIO;
700                 goto err;
701         }
702         if (check_tree_block_fsid(root->fs_info, eb)) {
703                 btrfs_err_rl(eb->fs_info, "bad fsid on block %llu",
704                                eb->start);
705                 ret = -EIO;
706                 goto err;
707         }
708         found_level = btrfs_header_level(eb);
709         if (found_level >= BTRFS_MAX_LEVEL) {
710                 btrfs_err(root->fs_info, "bad tree block level %d",
711                            (int)btrfs_header_level(eb));
712                 ret = -EIO;
713                 goto err;
714         }
715
716         btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb),
717                                        eb, found_level);
718
719         ret = csum_tree_block(root->fs_info, eb, 1);
720         if (ret) {
721                 ret = -EIO;
722                 goto err;
723         }
724
725         /*
726          * If this is a leaf block and it is corrupt, set the corrupt bit so
727          * that we don't try and read the other copies of this block, just
728          * return -EIO.
729          */
730         if (found_level == 0 && check_leaf(root, eb)) {
731                 set_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags);
732                 ret = -EIO;
733         }
734
735         if (found_level > 0 && check_node(root, eb))
736                 ret = -EIO;
737
738         if (!ret)
739                 set_extent_buffer_uptodate(eb);
740 err:
741         if (reads_done &&
742             test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags))
743                 btree_readahead_hook(root, eb, eb->start, ret);
744
745         if (ret) {
746                 /*
747                  * our io error hook is going to dec the io pages
748                  * again, we have to make sure it has something
749                  * to decrement
750                  */
751                 atomic_inc(&eb->io_pages);
752                 clear_extent_buffer_uptodate(eb);
753         }
754         free_extent_buffer(eb);
755 out:
756         return ret;
757 }
758
759 static int btree_io_failed_hook(struct page *page, int failed_mirror)
760 {
761         struct extent_buffer *eb;
762         struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
763
764         eb = (struct extent_buffer *)page->private;
765         set_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags);
766         eb->read_mirror = failed_mirror;
767         atomic_dec(&eb->io_pages);
768         if (test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags))
769                 btree_readahead_hook(root, eb, eb->start, -EIO);
770         return -EIO;    /* we fixed nothing */
771 }
772
773 static void end_workqueue_bio(struct bio *bio)
774 {
775         struct btrfs_end_io_wq *end_io_wq = bio->bi_private;
776         struct btrfs_fs_info *fs_info;
777         struct btrfs_workqueue *wq;
778         btrfs_work_func_t func;
779
780         fs_info = end_io_wq->info;
781         end_io_wq->error = bio->bi_error;
782
783         if (bio->bi_rw & REQ_WRITE) {
784                 if (end_io_wq->metadata == BTRFS_WQ_ENDIO_METADATA) {
785                         wq = fs_info->endio_meta_write_workers;
786                         func = btrfs_endio_meta_write_helper;
787                 } else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_FREE_SPACE) {
788                         wq = fs_info->endio_freespace_worker;
789                         func = btrfs_freespace_write_helper;
790                 } else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56) {
791                         wq = fs_info->endio_raid56_workers;
792                         func = btrfs_endio_raid56_helper;
793                 } else {
794                         wq = fs_info->endio_write_workers;
795                         func = btrfs_endio_write_helper;
796                 }
797         } else {
798                 if (unlikely(end_io_wq->metadata ==
799                              BTRFS_WQ_ENDIO_DIO_REPAIR)) {
800                         wq = fs_info->endio_repair_workers;
801                         func = btrfs_endio_repair_helper;
802                 } else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56) {
803                         wq = fs_info->endio_raid56_workers;
804                         func = btrfs_endio_raid56_helper;
805                 } else if (end_io_wq->metadata) {
806                         wq = fs_info->endio_meta_workers;
807                         func = btrfs_endio_meta_helper;
808                 } else {
809                         wq = fs_info->endio_workers;
810                         func = btrfs_endio_helper;
811                 }
812         }
813
814         btrfs_init_work(&end_io_wq->work, func, end_workqueue_fn, NULL, NULL);
815         btrfs_queue_work(wq, &end_io_wq->work);
816 }
817
818 int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
819                         enum btrfs_wq_endio_type metadata)
820 {
821         struct btrfs_end_io_wq *end_io_wq;
822
823         end_io_wq = kmem_cache_alloc(btrfs_end_io_wq_cache, GFP_NOFS);
824         if (!end_io_wq)
825                 return -ENOMEM;
826
827         end_io_wq->private = bio->bi_private;
828         end_io_wq->end_io = bio->bi_end_io;
829         end_io_wq->info = info;
830         end_io_wq->error = 0;
831         end_io_wq->bio = bio;
832         end_io_wq->metadata = metadata;
833
834         bio->bi_private = end_io_wq;
835         bio->bi_end_io = end_workqueue_bio;
836         return 0;
837 }
838
839 unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info)
840 {
841         unsigned long limit = min_t(unsigned long,
842                                     info->thread_pool_size,
843                                     info->fs_devices->open_devices);
844         return 256 * limit;
845 }
846
847 static void run_one_async_start(struct btrfs_work *work)
848 {
849         struct async_submit_bio *async;
850         int ret;
851
852         async = container_of(work, struct  async_submit_bio, work);
853         ret = async->submit_bio_start(async->inode, async->rw, async->bio,
854                                       async->mirror_num, async->bio_flags,
855                                       async->bio_offset);
856         if (ret)
857                 async->error = ret;
858 }
859
860 static void run_one_async_done(struct btrfs_work *work)
861 {
862         struct btrfs_fs_info *fs_info;
863         struct async_submit_bio *async;
864         int limit;
865
866         async = container_of(work, struct  async_submit_bio, work);
867         fs_info = BTRFS_I(async->inode)->root->fs_info;
868
869         limit = btrfs_async_submit_limit(fs_info);
870         limit = limit * 2 / 3;
871
872         /*
873          * atomic_dec_return implies a barrier for waitqueue_active
874          */
875         if (atomic_dec_return(&fs_info->nr_async_submits) < limit &&
876             waitqueue_active(&fs_info->async_submit_wait))
877                 wake_up(&fs_info->async_submit_wait);
878
879         /* If an error occured we just want to clean up the bio and move on */
880         if (async->error) {
881                 async->bio->bi_error = async->error;
882                 bio_endio(async->bio);
883                 return;
884         }
885
886         async->submit_bio_done(async->inode, async->rw, async->bio,
887                                async->mirror_num, async->bio_flags,
888                                async->bio_offset);
889 }
890
891 static void run_one_async_free(struct btrfs_work *work)
892 {
893         struct async_submit_bio *async;
894
895         async = container_of(work, struct  async_submit_bio, work);
896         kfree(async);
897 }
898
899 int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
900                         int rw, struct bio *bio, int mirror_num,
901                         unsigned long bio_flags,
902                         u64 bio_offset,
903                         extent_submit_bio_hook_t *submit_bio_start,
904                         extent_submit_bio_hook_t *submit_bio_done)
905 {
906         struct async_submit_bio *async;
907
908         async = kmalloc(sizeof(*async), GFP_NOFS);
909         if (!async)
910                 return -ENOMEM;
911
912         async->inode = inode;
913         async->rw = rw;
914         async->bio = bio;
915         async->mirror_num = mirror_num;
916         async->submit_bio_start = submit_bio_start;
917         async->submit_bio_done = submit_bio_done;
918
919         btrfs_init_work(&async->work, btrfs_worker_helper, run_one_async_start,
920                         run_one_async_done, run_one_async_free);
921
922         async->bio_flags = bio_flags;
923         async->bio_offset = bio_offset;
924
925         async->error = 0;
926
927         atomic_inc(&fs_info->nr_async_submits);
928
929         if (rw & REQ_SYNC)
930                 btrfs_set_work_high_priority(&async->work);
931
932         btrfs_queue_work(fs_info->workers, &async->work);
933
934         while (atomic_read(&fs_info->async_submit_draining) &&
935               atomic_read(&fs_info->nr_async_submits)) {
936                 wait_event(fs_info->async_submit_wait,
937                            (atomic_read(&fs_info->nr_async_submits) == 0));
938         }
939
940         return 0;
941 }
942
943 static int btree_csum_one_bio(struct bio *bio)
944 {
945         struct bio_vec *bvec;
946         struct btrfs_root *root;
947         int i, ret = 0;
948
949         bio_for_each_segment_all(bvec, bio, i) {
950                 root = BTRFS_I(bvec->bv_page->mapping->host)->root;
951                 ret = csum_dirty_buffer(root->fs_info, bvec->bv_page);
952                 if (ret)
953                         break;
954         }
955
956         return ret;
957 }
958
959 static int __btree_submit_bio_start(struct inode *inode, int rw,
960                                     struct bio *bio, int mirror_num,
961                                     unsigned long bio_flags,
962                                     u64 bio_offset)
963 {
964         /*
965          * when we're called for a write, we're already in the async
966          * submission context.  Just jump into btrfs_map_bio
967          */
968         return btree_csum_one_bio(bio);
969 }
970
971 static int __btree_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
972                                  int mirror_num, unsigned long bio_flags,
973                                  u64 bio_offset)
974 {
975         int ret;
976
977         /*
978          * when we're called for a write, we're already in the async
979          * submission context.  Just jump into btrfs_map_bio
980          */
981         ret = btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, mirror_num, 1);
982         if (ret) {
983                 bio->bi_error = ret;
984                 bio_endio(bio);
985         }
986         return ret;
987 }
988
989 static int check_async_write(struct inode *inode, unsigned long bio_flags)
990 {
991         if (bio_flags & EXTENT_BIO_TREE_LOG)
992                 return 0;
993 #ifdef CONFIG_X86
994         if (static_cpu_has(X86_FEATURE_XMM4_2))
995                 return 0;
996 #endif
997         return 1;
998 }
999
1000 static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
1001                                  int mirror_num, unsigned long bio_flags,
1002                                  u64 bio_offset)
1003 {
1004         int async = check_async_write(inode, bio_flags);
1005         int ret;
1006
1007         if (!(rw & REQ_WRITE)) {
1008                 /*
1009                  * called for a read, do the setup so that checksum validation
1010                  * can happen in the async kernel threads
1011                  */
1012                 ret = btrfs_bio_wq_end_io(BTRFS_I(inode)->root->fs_info,
1013                                           bio, BTRFS_WQ_ENDIO_METADATA);
1014                 if (ret)
1015                         goto out_w_error;
1016                 ret = btrfs_map_bio(BTRFS_I(inode)->root, rw, bio,
1017                                     mirror_num, 0);
1018         } else if (!async) {
1019                 ret = btree_csum_one_bio(bio);
1020                 if (ret)
1021                         goto out_w_error;
1022                 ret = btrfs_map_bio(BTRFS_I(inode)->root, rw, bio,
1023                                     mirror_num, 0);
1024         } else {
1025                 /*
1026                  * kthread helpers are used to submit writes so that
1027                  * checksumming can happen in parallel across all CPUs
1028                  */
1029                 ret = btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
1030                                           inode, rw, bio, mirror_num, 0,
1031                                           bio_offset,
1032                                           __btree_submit_bio_start,
1033                                           __btree_submit_bio_done);
1034         }
1035
1036         if (ret)
1037                 goto out_w_error;
1038         return 0;
1039
1040 out_w_error:
1041         bio->bi_error = ret;
1042         bio_endio(bio);
1043         return ret;
1044 }
1045
#ifdef CONFIG_MIGRATION
/*
 * Page migration callback for btree pages.  Returns -EAGAIN for dirty pages
 * and for pages whose private data cannot be released; otherwise returns
 * the result of migrate_page().
 */
static int btree_migratepage(struct address_space *mapping,
			struct page *newpage, struct page *page,
			enum migrate_mode mode)
{
	int busy;

	/*
	 * we can't safely write a btree page from here,
	 * we haven't done the locking hook
	 */
	if (PageDirty(page))
		return -EAGAIN;

	/*
	 * Buffers may be managed in a filesystem specific way.
	 * We must have no buffers or drop them.
	 */
	busy = page_has_private(page) &&
	       !try_to_release_page(page, GFP_KERNEL);
	if (busy)
		return -EAGAIN;

	return migrate_page(mapping, newpage, page, mode);
}
#endif
1067
1068
1069 static int btree_writepages(struct address_space *mapping,
1070                             struct writeback_control *wbc)
1071 {
1072         struct btrfs_fs_info *fs_info;
1073         int ret;
1074
1075         if (wbc->sync_mode == WB_SYNC_NONE) {
1076
1077                 if (wbc->for_kupdate)
1078                         return 0;
1079
1080                 fs_info = BTRFS_I(mapping->host)->root->fs_info;
1081                 /* this is a bit racy, but that's ok */
1082                 ret = __percpu_counter_compare(&fs_info->dirty_metadata_bytes,
1083                                              BTRFS_DIRTY_METADATA_THRESH,
1084                                              fs_info->dirty_metadata_batch);
1085                 if (ret < 0)
1086                         return 0;
1087         }
1088         return btree_write_cache_pages(mapping, wbc);
1089 }
1090
1091 static int btree_readpage(struct file *file, struct page *page)
1092 {
1093         struct extent_io_tree *tree;
1094         tree = &BTRFS_I(page->mapping->host)->io_tree;
1095         return extent_read_full_page(tree, page, btree_get_extent, 0);
1096 }
1097
1098 static int btree_releasepage(struct page *page, gfp_t gfp_flags)
1099 {
1100         if (PageWriteback(page) || PageDirty(page))
1101                 return 0;
1102
1103         return try_release_extent_buffer(page);
1104 }
1105
1106 static void btree_invalidatepage(struct page *page, unsigned int offset,
1107                                  unsigned int length)
1108 {
1109         struct extent_io_tree *tree;
1110         tree = &BTRFS_I(page->mapping->host)->io_tree;
1111         extent_invalidatepage(tree, page, offset);
1112         btree_releasepage(page, GFP_NOFS);
1113         if (PagePrivate(page)) {
1114                 btrfs_warn(BTRFS_I(page->mapping->host)->root->fs_info,
1115                            "page private not zero on page %llu",
1116                            (unsigned long long)page_offset(page));
1117                 ClearPagePrivate(page);
1118                 set_page_private(page, 0);
1119                 page_cache_release(page);
1120         }
1121 }
1122
/*
 * set_page_dirty callback for the btree mapping.  With DEBUG defined it
 * first checks that the page carries a dirty, referenced and locked extent
 * buffer; in all builds it then calls __set_page_dirty_nobuffers().
 */
static int btree_set_page_dirty(struct page *page)
{
#ifdef DEBUG
	struct extent_buffer *eb;

	/* the page must have an extent buffer attached ... */
	BUG_ON(!PagePrivate(page));
	eb = (struct extent_buffer *)page->private;
	BUG_ON(!eb);

	/* ... which is already dirty, still referenced and locked */
	BUG_ON(!test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
	BUG_ON(!atomic_read(&eb->refs));
	btrfs_assert_tree_locked(eb);
#endif
	return __set_page_dirty_nobuffers(page);
}
1137
1138 static const struct address_space_operations btree_aops = {
1139         .readpage       = btree_readpage,
1140         .writepages     = btree_writepages,
1141         .releasepage    = btree_releasepage,
1142         .invalidatepage = btree_invalidatepage,
1143 #ifdef CONFIG_MIGRATION
1144         .migratepage    = btree_migratepage,
1145 #endif
1146         .set_page_dirty = btree_set_page_dirty,
1147 };
1148
1149 void readahead_tree_block(struct btrfs_root *root, u64 bytenr)
1150 {
1151         struct extent_buffer *buf = NULL;
1152         struct inode *btree_inode = root->fs_info->btree_inode;
1153
1154         buf = btrfs_find_create_tree_block(root, bytenr);
1155         if (!buf)
1156                 return;
1157         read_extent_buffer_pages(&BTRFS_I(btree_inode)->io_tree,
1158                                  buf, 0, WAIT_NONE, btree_get_extent, 0);
1159         free_extent_buffer(buf);
1160 }
1161
1162 int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr,
1163                          int mirror_num, struct extent_buffer **eb)
1164 {
1165         struct extent_buffer *buf = NULL;
1166         struct inode *btree_inode = root->fs_info->btree_inode;
1167         struct extent_io_tree *io_tree = &BTRFS_I(btree_inode)->io_tree;
1168         int ret;
1169
1170         buf = btrfs_find_create_tree_block(root, bytenr);
1171         if (!buf)
1172                 return 0;
1173
1174         set_bit(EXTENT_BUFFER_READAHEAD, &buf->bflags);
1175
1176         ret = read_extent_buffer_pages(io_tree, buf, 0, WAIT_PAGE_LOCK,
1177                                        btree_get_extent, mirror_num);
1178         if (ret) {
1179                 free_extent_buffer(buf);
1180                 return ret;
1181         }
1182
1183         if (test_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags)) {
1184                 free_extent_buffer(buf);
1185                 return -EIO;
1186         } else if (extent_buffer_uptodate(buf)) {
1187                 *eb = buf;
1188         } else {
1189                 free_extent_buffer(buf);
1190         }
1191         return 0;
1192 }
1193
1194 struct extent_buffer *btrfs_find_tree_block(struct btrfs_fs_info *fs_info,
1195                                             u64 bytenr)
1196 {
1197         return find_extent_buffer(fs_info, bytenr);
1198 }
1199
1200 struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root,
1201                                                  u64 bytenr)
1202 {
1203         if (btrfs_test_is_dummy_root(root))
1204                 return alloc_test_extent_buffer(root->fs_info, bytenr);
1205         return alloc_extent_buffer(root->fs_info, bytenr);
1206 }
1207
1208
1209 int btrfs_write_tree_block(struct extent_buffer *buf)
1210 {
1211         return filemap_fdatawrite_range(buf->pages[0]->mapping, buf->start,
1212                                         buf->start + buf->len - 1);
1213 }
1214
1215 int btrfs_wait_tree_block_writeback(struct extent_buffer *buf)
1216 {
1217         return filemap_fdatawait_range(buf->pages[0]->mapping,
1218                                        buf->start, buf->start + buf->len - 1);
1219 }
1220
1221 struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
1222                                       u64 parent_transid)
1223 {
1224         struct extent_buffer *buf = NULL;
1225         int ret;
1226
1227         buf = btrfs_find_create_tree_block(root, bytenr);
1228         if (!buf)
1229                 return ERR_PTR(-ENOMEM);
1230
1231         ret = btree_read_extent_buffer_pages(root, buf, 0, parent_transid);
1232         if (ret) {
1233                 free_extent_buffer(buf);
1234                 return ERR_PTR(ret);
1235         }
1236         return buf;
1237
1238 }
1239
1240 void clean_tree_block(struct btrfs_trans_handle *trans,
1241                       struct btrfs_fs_info *fs_info,
1242                       struct extent_buffer *buf)
1243 {
1244         if (btrfs_header_generation(buf) ==
1245             fs_info->running_transaction->transid) {
1246                 btrfs_assert_tree_locked(buf);
1247
1248                 if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)) {
1249                         __percpu_counter_add(&fs_info->dirty_metadata_bytes,
1250                                              -buf->len,
1251                                              fs_info->dirty_metadata_batch);
1252                         /* ugh, clear_extent_buffer_dirty needs to lock the page */
1253                         btrfs_set_lock_blocking(buf);
1254                         clear_extent_buffer_dirty(buf);
1255                 }
1256         }
1257 }
1258
1259 static struct btrfs_subvolume_writers *btrfs_alloc_subvolume_writers(void)
1260 {
1261         struct btrfs_subvolume_writers *writers;
1262         int ret;
1263
1264         writers = kmalloc(sizeof(*writers), GFP_NOFS);
1265         if (!writers)
1266                 return ERR_PTR(-ENOMEM);
1267
1268         ret = percpu_counter_init(&writers->counter, 0, GFP_NOFS);
1269         if (ret < 0) {
1270                 kfree(writers);
1271                 return ERR_PTR(ret);
1272         }
1273
1274         init_waitqueue_head(&writers->wait);
1275         return writers;
1276 }
1277
1278 static void
1279 btrfs_free_subvolume_writers(struct btrfs_subvolume_writers *writers)
1280 {
1281         percpu_counter_destroy(&writers->counter);
1282         kfree(writers);
1283 }
1284
1285 static void __setup_root(u32 nodesize, u32 sectorsize, u32 stripesize,
1286                          struct btrfs_root *root, struct btrfs_fs_info *fs_info,
1287                          u64 objectid)
1288 {
1289         root->node = NULL;
1290         root->commit_root = NULL;
1291         root->sectorsize = sectorsize;
1292         root->nodesize = nodesize;
1293         root->stripesize = stripesize;
1294         root->state = 0;
1295         root->orphan_cleanup_state = 0;
1296
1297         root->objectid = objectid;
1298         root->last_trans = 0;
1299         root->highest_objectid = 0;
1300         root->nr_delalloc_inodes = 0;
1301         root->nr_ordered_extents = 0;
1302         root->name = NULL;
1303         root->inode_tree = RB_ROOT;
1304         INIT_RADIX_TREE(&root->delayed_nodes_tree, GFP_ATOMIC);
1305         root->block_rsv = NULL;
1306         root->orphan_block_rsv = NULL;
1307
1308         INIT_LIST_HEAD(&root->dirty_list);
1309         INIT_LIST_HEAD(&root->root_list);
1310         INIT_LIST_HEAD(&root->delalloc_inodes);
1311         INIT_LIST_HEAD(&root->delalloc_root);
1312         INIT_LIST_HEAD(&root->ordered_extents);
1313         INIT_LIST_HEAD(&root->ordered_root);
1314         INIT_LIST_HEAD(&root->logged_list[0]);
1315         INIT_LIST_HEAD(&root->logged_list[1]);
1316         spin_lock_init(&root->orphan_lock);
1317         spin_lock_init(&root->inode_lock);
1318         spin_lock_init(&root->delalloc_lock);
1319         spin_lock_init(&root->ordered_extent_lock);
1320         spin_lock_init(&root->accounting_lock);
1321         spin_lock_init(&root->log_extents_lock[0]);
1322         spin_lock_init(&root->log_extents_lock[1]);
1323         mutex_init(&root->objectid_mutex);
1324         mutex_init(&root->log_mutex);
1325         mutex_init(&root->ordered_extent_mutex);
1326         mutex_init(&root->delalloc_mutex);
1327         init_waitqueue_head(&root->log_writer_wait);
1328         init_waitqueue_head(&root->log_commit_wait[0]);
1329         init_waitqueue_head(&root->log_commit_wait[1]);
1330         INIT_LIST_HEAD(&root->log_ctxs[0]);
1331         INIT_LIST_HEAD(&root->log_ctxs[1]);
1332         atomic_set(&root->log_commit[0], 0);
1333         atomic_set(&root->log_commit[1], 0);
1334         atomic_set(&root->log_writers, 0);
1335         atomic_set(&root->log_batch, 0);
1336         atomic_set(&root->orphan_inodes, 0);
1337         atomic_set(&root->refs, 1);
1338         atomic_set(&root->will_be_snapshoted, 0);
1339         atomic_set(&root->qgroup_meta_rsv, 0);
1340         root->log_transid = 0;
1341         root->log_transid_committed = -1;
1342         root->last_log_commit = 0;
1343         if (fs_info)
1344                 extent_io_tree_init(&root->dirty_log_pages,
1345                                      fs_info->btree_inode->i_mapping);
1346
1347         memset(&root->root_key, 0, sizeof(root->root_key));
1348         memset(&root->root_item, 0, sizeof(root->root_item));
1349         memset(&root->defrag_progress, 0, sizeof(root->defrag_progress));
1350         if (fs_info)
1351                 root->defrag_trans_start = fs_info->generation;
1352         else
1353                 root->defrag_trans_start = 0;
1354         root->root_key.objectid = objectid;
1355         root->anon_dev = 0;
1356
1357         spin_lock_init(&root->root_item_lock);
1358 }
1359
1360 static struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info)
1361 {
1362         struct btrfs_root *root = kzalloc(sizeof(*root), GFP_NOFS);
1363         if (root)
1364                 root->fs_info = fs_info;
1365         return root;
1366 }
1367
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
/*
 * Should only be used by the testing infrastructure.
 *
 * Builds a root with no fs_info, 4096-byte node/sector/stripe sizes and
 * objectid 1, flagged BTRFS_ROOT_DUMMY_ROOT.  Returns ERR_PTR(-ENOMEM) if
 * the allocation fails.
 */
struct btrfs_root *btrfs_alloc_dummy_root(void)
{
	struct btrfs_root *dummy = btrfs_alloc_root(NULL);

	if (!dummy)
		return ERR_PTR(-ENOMEM);

	__setup_root(4096, 4096, 4096, dummy, NULL, 1);
	set_bit(BTRFS_ROOT_DUMMY_ROOT, &dummy->state);
	dummy->alloc_bytenr = 0;

	return dummy;
}
#endif
1384
1385 struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
1386                                      struct btrfs_fs_info *fs_info,
1387                                      u64 objectid)
1388 {
1389         struct extent_buffer *leaf;
1390         struct btrfs_root *tree_root = fs_info->tree_root;
1391         struct btrfs_root *root;
1392         struct btrfs_key key;
1393         int ret = 0;
1394         uuid_le uuid;
1395
1396         root = btrfs_alloc_root(fs_info);
1397         if (!root)
1398                 return ERR_PTR(-ENOMEM);
1399
1400         __setup_root(tree_root->nodesize, tree_root->sectorsize,
1401                 tree_root->stripesize, root, fs_info, objectid);
1402         root->root_key.objectid = objectid;
1403         root->root_key.type = BTRFS_ROOT_ITEM_KEY;
1404         root->root_key.offset = 0;
1405
1406         leaf = btrfs_alloc_tree_block(trans, root, 0, objectid, NULL, 0, 0, 0);
1407         if (IS_ERR(leaf)) {
1408                 ret = PTR_ERR(leaf);
1409                 leaf = NULL;
1410                 goto fail;
1411         }
1412
1413         memset_extent_buffer(leaf, 0, 0, sizeof(struct btrfs_header));
1414         btrfs_set_header_bytenr(leaf, leaf->start);
1415         btrfs_set_header_generation(leaf, trans->transid);
1416         btrfs_set_header_backref_rev(leaf, BTRFS_MIXED_BACKREF_REV);
1417         btrfs_set_header_owner(leaf, objectid);
1418         root->node = leaf;
1419
1420         write_extent_buffer(leaf, fs_info->fsid, btrfs_header_fsid(),
1421                             BTRFS_FSID_SIZE);
1422         write_extent_buffer(leaf, fs_info->chunk_tree_uuid,
1423                             btrfs_header_chunk_tree_uuid(leaf),
1424                             BTRFS_UUID_SIZE);
1425         btrfs_mark_buffer_dirty(leaf);
1426
1427         root->commit_root = btrfs_root_node(root);
1428         set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
1429
1430         root->root_item.flags = 0;
1431         root->root_item.byte_limit = 0;
1432         btrfs_set_root_bytenr(&root->root_item, leaf->start);
1433         btrfs_set_root_generation(&root->root_item, trans->transid);
1434         btrfs_set_root_level(&root->root_item, 0);
1435         btrfs_set_root_refs(&root->root_item, 1);
1436         btrfs_set_root_used(&root->root_item, leaf->len);
1437         btrfs_set_root_last_snapshot(&root->root_item, 0);
1438         btrfs_set_root_dirid(&root->root_item, 0);
1439         uuid_le_gen(&uuid);
1440         memcpy(root->root_item.uuid, uuid.b, BTRFS_UUID_SIZE);
1441         root->root_item.drop_level = 0;
1442
1443         key.objectid = objectid;
1444         key.type = BTRFS_ROOT_ITEM_KEY;
1445         key.offset = 0;
1446         ret = btrfs_insert_root(trans, tree_root, &key, &root->root_item);
1447         if (ret)
1448                 goto fail;
1449
1450         btrfs_tree_unlock(leaf);
1451
1452         return root;
1453
1454 fail:
1455         if (leaf) {
1456                 btrfs_tree_unlock(leaf);
1457                 free_extent_buffer(root->commit_root);
1458                 free_extent_buffer(leaf);
1459         }
1460         kfree(root);
1461
1462         return ERR_PTR(ret);
1463 }
1464
1465 static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans,
1466                                          struct btrfs_fs_info *fs_info)
1467 {
1468         struct btrfs_root *root;
1469         struct btrfs_root *tree_root = fs_info->tree_root;
1470         struct extent_buffer *leaf;
1471
1472         root = btrfs_alloc_root(fs_info);
1473         if (!root)
1474                 return ERR_PTR(-ENOMEM);
1475
1476         __setup_root(tree_root->nodesize, tree_root->sectorsize,
1477                      tree_root->stripesize, root, fs_info,
1478                      BTRFS_TREE_LOG_OBJECTID);
1479
1480         root->root_key.objectid = BTRFS_TREE_LOG_OBJECTID;
1481         root->root_key.type = BTRFS_ROOT_ITEM_KEY;
1482         root->root_key.offset = BTRFS_TREE_LOG_OBJECTID;
1483
1484         /*
1485          * DON'T set REF_COWS for log trees
1486          *
1487          * log trees do not get reference counted because they go away
1488          * before a real commit is actually done.  They do store pointers
1489          * to file data extents, and those reference counts still get
1490          * updated (along with back refs to the log tree).
1491          */
1492
1493         leaf = btrfs_alloc_tree_block(trans, root, 0, BTRFS_TREE_LOG_OBJECTID,
1494                         NULL, 0, 0, 0);
1495         if (IS_ERR(leaf)) {
1496                 kfree(root);
1497                 return ERR_CAST(leaf);
1498         }
1499
1500         memset_extent_buffer(leaf, 0, 0, sizeof(struct btrfs_header));
1501         btrfs_set_header_bytenr(leaf, leaf->start);
1502         btrfs_set_header_generation(leaf, trans->transid);
1503         btrfs_set_header_backref_rev(leaf, BTRFS_MIXED_BACKREF_REV);
1504         btrfs_set_header_owner(leaf, BTRFS_TREE_LOG_OBJECTID);
1505         root->node = leaf;
1506
1507         write_extent_buffer(root->node, root->fs_info->fsid,
1508                             btrfs_header_fsid(), BTRFS_FSID_SIZE);
1509         btrfs_mark_buffer_dirty(root->node);
1510         btrfs_tree_unlock(root->node);
1511         return root;
1512 }
1513
1514 int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
1515                              struct btrfs_fs_info *fs_info)
1516 {
1517         struct btrfs_root *log_root;
1518
1519         log_root = alloc_log_tree(trans, fs_info);
1520         if (IS_ERR(log_root))
1521                 return PTR_ERR(log_root);
1522         WARN_ON(fs_info->log_root_tree);
1523         fs_info->log_root_tree = log_root;
1524         return 0;
1525 }
1526
1527 int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
1528                        struct btrfs_root *root)
1529 {
1530         struct btrfs_root *log_root;
1531         struct btrfs_inode_item *inode_item;
1532
1533         log_root = alloc_log_tree(trans, root->fs_info);
1534         if (IS_ERR(log_root))
1535                 return PTR_ERR(log_root);
1536
1537         log_root->last_trans = trans->transid;
1538         log_root->root_key.offset = root->root_key.objectid;
1539
1540         inode_item = &log_root->root_item.inode;
1541         btrfs_set_stack_inode_generation(inode_item, 1);
1542         btrfs_set_stack_inode_size(inode_item, 3);
1543         btrfs_set_stack_inode_nlink(inode_item, 1);
1544         btrfs_set_stack_inode_nbytes(inode_item, root->nodesize);
1545         btrfs_set_stack_inode_mode(inode_item, S_IFDIR | 0755);
1546
1547         btrfs_set_root_node(&log_root->root_item, log_root->node);
1548
1549         WARN_ON(root->log_root);
1550         root->log_root = log_root;
1551         root->log_transid = 0;
1552         root->log_transid_committed = -1;
1553         root->last_log_commit = 0;
1554         return 0;
1555 }
1556
1557 static struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root,
1558                                                struct btrfs_key *key)
1559 {
1560         struct btrfs_root *root;
1561         struct btrfs_fs_info *fs_info = tree_root->fs_info;
1562         struct btrfs_path *path;
1563         u64 generation;
1564         int ret;
1565
1566         path = btrfs_alloc_path();
1567         if (!path)
1568                 return ERR_PTR(-ENOMEM);
1569
1570         root = btrfs_alloc_root(fs_info);
1571         if (!root) {
1572                 ret = -ENOMEM;
1573                 goto alloc_fail;
1574         }
1575
1576         __setup_root(tree_root->nodesize, tree_root->sectorsize,
1577                 tree_root->stripesize, root, fs_info, key->objectid);
1578
1579         ret = btrfs_find_root(tree_root, key, path,
1580                               &root->root_item, &root->root_key);
1581         if (ret) {
1582                 if (ret > 0)
1583                         ret = -ENOENT;
1584                 goto find_fail;
1585         }
1586
1587         generation = btrfs_root_generation(&root->root_item);
1588         root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item),
1589                                      generation);
1590         if (IS_ERR(root->node)) {
1591                 ret = PTR_ERR(root->node);
1592                 goto find_fail;
1593         } else if (!btrfs_buffer_uptodate(root->node, generation, 0)) {
1594                 ret = -EIO;
1595                 free_extent_buffer(root->node);
1596                 goto find_fail;
1597         }
1598         root->commit_root = btrfs_root_node(root);
1599 out:
1600         btrfs_free_path(path);
1601         return root;
1602
1603 find_fail:
1604         kfree(root);
1605 alloc_fail:
1606         root = ERR_PTR(ret);
1607         goto out;
1608 }
1609
1610 struct btrfs_root *btrfs_read_fs_root(struct btrfs_root *tree_root,
1611                                       struct btrfs_key *location)
1612 {
1613         struct btrfs_root *root;
1614
1615         root = btrfs_read_tree_root(tree_root, location);
1616         if (IS_ERR(root))
1617                 return root;
1618
1619         if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
1620                 set_bit(BTRFS_ROOT_REF_COWS, &root->state);
1621                 btrfs_check_and_init_root_item(&root->root_item);
1622         }
1623
1624         return root;
1625 }
1626
1627 int btrfs_init_fs_root(struct btrfs_root *root)
1628 {
1629         int ret;
1630         struct btrfs_subvolume_writers *writers;
1631
1632         root->free_ino_ctl = kzalloc(sizeof(*root->free_ino_ctl), GFP_NOFS);
1633         root->free_ino_pinned = kzalloc(sizeof(*root->free_ino_pinned),
1634                                         GFP_NOFS);
1635         if (!root->free_ino_pinned || !root->free_ino_ctl) {
1636                 ret = -ENOMEM;
1637                 goto fail;
1638         }
1639
1640         writers = btrfs_alloc_subvolume_writers();
1641         if (IS_ERR(writers)) {
1642                 ret = PTR_ERR(writers);
1643                 goto fail;
1644         }
1645         root->subv_writers = writers;
1646
1647         btrfs_init_free_ino_ctl(root);
1648         spin_lock_init(&root->ino_cache_lock);
1649         init_waitqueue_head(&root->ino_cache_wait);
1650
1651         ret = get_anon_bdev(&root->anon_dev);
1652         if (ret)
1653                 goto free_writers;
1654
1655         mutex_lock(&root->objectid_mutex);
1656         ret = btrfs_find_highest_objectid(root,
1657                                         &root->highest_objectid);
1658         if (ret) {
1659                 mutex_unlock(&root->objectid_mutex);
1660                 goto free_root_dev;
1661         }
1662
1663         ASSERT(root->highest_objectid <= BTRFS_LAST_FREE_OBJECTID);
1664
1665         mutex_unlock(&root->objectid_mutex);
1666
1667         return 0;
1668
1669 free_root_dev:
1670         free_anon_bdev(root->anon_dev);
1671 free_writers:
1672         btrfs_free_subvolume_writers(root->subv_writers);
1673 fail:
1674         kfree(root->free_ino_ctl);
1675         kfree(root->free_ino_pinned);
1676         return ret;
1677 }
1678
1679 struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info,
1680                                         u64 root_id)
1681 {
1682         struct btrfs_root *root;
1683
1684         spin_lock(&fs_info->fs_roots_radix_lock);
1685         root = radix_tree_lookup(&fs_info->fs_roots_radix,
1686                                  (unsigned long)root_id);
1687         spin_unlock(&fs_info->fs_roots_radix_lock);
1688         return root;
1689 }
1690
1691 int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info,
1692                          struct btrfs_root *root)
1693 {
1694         int ret;
1695
1696         ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
1697         if (ret)
1698                 return ret;
1699
1700         spin_lock(&fs_info->fs_roots_radix_lock);
1701         ret = radix_tree_insert(&fs_info->fs_roots_radix,
1702                                 (unsigned long)root->root_key.objectid,
1703                                 root);
1704         if (ret == 0)
1705                 set_bit(BTRFS_ROOT_IN_RADIX, &root->state);
1706         spin_unlock(&fs_info->fs_roots_radix_lock);
1707         radix_tree_preload_end();
1708
1709         return ret;
1710 }
1711
1712 struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info,
1713                                      struct btrfs_key *location,
1714                                      bool check_ref)
1715 {
1716         struct btrfs_root *root;
1717         struct btrfs_path *path;
1718         struct btrfs_key key;
1719         int ret;
1720
1721         if (location->objectid == BTRFS_ROOT_TREE_OBJECTID)
1722                 return fs_info->tree_root;
1723         if (location->objectid == BTRFS_EXTENT_TREE_OBJECTID)
1724                 return fs_info->extent_root;
1725         if (location->objectid == BTRFS_CHUNK_TREE_OBJECTID)
1726                 return fs_info->chunk_root;
1727         if (location->objectid == BTRFS_DEV_TREE_OBJECTID)
1728                 return fs_info->dev_root;
1729         if (location->objectid == BTRFS_CSUM_TREE_OBJECTID)
1730                 return fs_info->csum_root;
1731         if (location->objectid == BTRFS_QUOTA_TREE_OBJECTID)
1732                 return fs_info->quota_root ? fs_info->quota_root :
1733                                              ERR_PTR(-ENOENT);
1734         if (location->objectid == BTRFS_UUID_TREE_OBJECTID)
1735                 return fs_info->uuid_root ? fs_info->uuid_root :
1736                                             ERR_PTR(-ENOENT);
1737 again:
1738         root = btrfs_lookup_fs_root(fs_info, location->objectid);
1739         if (root) {
1740                 if (check_ref && btrfs_root_refs(&root->root_item) == 0)
1741                         return ERR_PTR(-ENOENT);
1742                 return root;
1743         }
1744
1745         root = btrfs_read_fs_root(fs_info->tree_root, location);
1746         if (IS_ERR(root))
1747                 return root;
1748
1749         if (check_ref && btrfs_root_refs(&root->root_item) == 0) {
1750                 ret = -ENOENT;
1751                 goto fail;
1752         }
1753
1754         ret = btrfs_init_fs_root(root);
1755         if (ret)
1756                 goto fail;
1757
1758         path = btrfs_alloc_path();
1759         if (!path) {
1760                 ret = -ENOMEM;
1761                 goto fail;
1762         }
1763         key.objectid = BTRFS_ORPHAN_OBJECTID;
1764         key.type = BTRFS_ORPHAN_ITEM_KEY;
1765         key.offset = location->objectid;
1766
1767         ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
1768         btrfs_free_path(path);
1769         if (ret < 0)
1770                 goto fail;
1771         if (ret == 0)
1772                 set_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state);
1773
1774         ret = btrfs_insert_fs_root(fs_info, root);
1775         if (ret) {
1776                 if (ret == -EEXIST) {
1777                         free_fs_root(root);
1778                         goto again;
1779                 }
1780                 goto fail;
1781         }
1782         return root;
1783 fail:
1784         free_fs_root(root);
1785         return ERR_PTR(ret);
1786 }
1787
1788 static int btrfs_congested_fn(void *congested_data, int bdi_bits)
1789 {
1790         struct btrfs_fs_info *info = (struct btrfs_fs_info *)congested_data;
1791         int ret = 0;
1792         struct btrfs_device *device;
1793         struct backing_dev_info *bdi;
1794
1795         rcu_read_lock();
1796         list_for_each_entry_rcu(device, &info->fs_devices->devices, dev_list) {
1797                 if (!device->bdev)
1798                         continue;
1799                 bdi = blk_get_backing_dev_info(device->bdev);
1800                 if (bdi_congested(bdi, bdi_bits)) {
1801                         ret = 1;
1802                         break;
1803                 }
1804         }
1805         rcu_read_unlock();
1806         return ret;
1807 }
1808
1809 static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi)
1810 {
1811         int err;
1812
1813         err = bdi_setup_and_register(bdi, "btrfs");
1814         if (err)
1815                 return err;
1816
1817         bdi->ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_CACHE_SIZE;
1818         bdi->congested_fn       = btrfs_congested_fn;
1819         bdi->congested_data     = info;
1820         bdi->capabilities |= BDI_CAP_CGROUP_WRITEBACK;
1821         return 0;
1822 }
1823
1824 /*
1825  * called by the kthread helper functions to finally call the bio end_io
1826  * functions.  This is where read checksum verification actually happens
1827  */
1828 static void end_workqueue_fn(struct btrfs_work *work)
1829 {
1830         struct bio *bio;
1831         struct btrfs_end_io_wq *end_io_wq;
1832
1833         end_io_wq = container_of(work, struct btrfs_end_io_wq, work);
1834         bio = end_io_wq->bio;
1835
1836         bio->bi_error = end_io_wq->error;
1837         bio->bi_private = end_io_wq->private;
1838         bio->bi_end_io = end_io_wq->end_io;
1839         kmem_cache_free(btrfs_end_io_wq_cache, end_io_wq);
1840         bio_endio(bio);
1841 }
1842
1843 static int cleaner_kthread(void *arg)
1844 {
1845         struct btrfs_root *root = arg;
1846         int again;
1847         struct btrfs_trans_handle *trans;
1848
1849         do {
1850                 again = 0;
1851
1852                 /* Make the cleaner go to sleep early. */
1853                 if (btrfs_need_cleaner_sleep(root))
1854                         goto sleep;
1855
1856                 if (!mutex_trylock(&root->fs_info->cleaner_mutex))
1857                         goto sleep;
1858
1859                 /*
1860                  * Avoid the problem that we change the status of the fs
1861                  * during the above check and trylock.
1862                  */
1863                 if (btrfs_need_cleaner_sleep(root)) {
1864                         mutex_unlock(&root->fs_info->cleaner_mutex);
1865                         goto sleep;
1866                 }
1867
1868                 mutex_lock(&root->fs_info->cleaner_delayed_iput_mutex);
1869                 btrfs_run_delayed_iputs(root);
1870                 mutex_unlock(&root->fs_info->cleaner_delayed_iput_mutex);
1871
1872                 again = btrfs_clean_one_deleted_snapshot(root);
1873                 mutex_unlock(&root->fs_info->cleaner_mutex);
1874
1875                 /*
1876                  * The defragger has dealt with the R/O remount and umount,
1877                  * needn't do anything special here.
1878                  */
1879                 btrfs_run_defrag_inodes(root->fs_info);
1880
1881                 /*
1882                  * Acquires fs_info->delete_unused_bgs_mutex to avoid racing
1883                  * with relocation (btrfs_relocate_chunk) and relocation
1884                  * acquires fs_info->cleaner_mutex (btrfs_relocate_block_group)
1885                  * after acquiring fs_info->delete_unused_bgs_mutex. So we
1886                  * can't hold, nor need to, fs_info->cleaner_mutex when deleting
1887                  * unused block groups.
1888                  */
1889                 btrfs_delete_unused_bgs(root->fs_info);
1890 sleep:
1891                 if (!try_to_freeze() && !again) {
1892                         set_current_state(TASK_INTERRUPTIBLE);
1893                         if (!kthread_should_stop())
1894                                 schedule();
1895                         __set_current_state(TASK_RUNNING);
1896                 }
1897         } while (!kthread_should_stop());
1898
1899         /*
1900          * Transaction kthread is stopped before us and wakes us up.
1901          * However we might have started a new transaction and COWed some
1902          * tree blocks when deleting unused block groups for example. So
1903          * make sure we commit the transaction we started to have a clean
1904          * shutdown when evicting the btree inode - if it has dirty pages
1905          * when we do the final iput() on it, eviction will trigger a
1906          * writeback for it which will fail with null pointer dereferences
1907          * since work queues and other resources were already released and
1908          * destroyed by the time the iput/eviction/writeback is made.
1909          */
1910         trans = btrfs_attach_transaction(root);
1911         if (IS_ERR(trans)) {
1912                 if (PTR_ERR(trans) != -ENOENT)
1913                         btrfs_err(root->fs_info,
1914                                   "cleaner transaction attach returned %ld",
1915                                   PTR_ERR(trans));
1916         } else {
1917                 int ret;
1918
1919                 ret = btrfs_commit_transaction(trans, root);
1920                 if (ret)
1921                         btrfs_err(root->fs_info,
1922                                   "cleaner open transaction commit returned %d",
1923                                   ret);
1924         }
1925
1926         return 0;
1927 }
1928
1929 static int transaction_kthread(void *arg)
1930 {
1931         struct btrfs_root *root = arg;
1932         struct btrfs_trans_handle *trans;
1933         struct btrfs_transaction *cur;
1934         u64 transid;
1935         unsigned long now;
1936         unsigned long delay;
1937         bool cannot_commit;
1938
1939         do {
1940                 cannot_commit = false;
1941                 delay = HZ * root->fs_info->commit_interval;
1942                 mutex_lock(&root->fs_info->transaction_kthread_mutex);
1943
1944                 spin_lock(&root->fs_info->trans_lock);
1945                 cur = root->fs_info->running_transaction;
1946                 if (!cur) {
1947                         spin_unlock(&root->fs_info->trans_lock);
1948                         goto sleep;
1949                 }
1950
1951                 now = get_seconds();
1952                 if (cur->state < TRANS_STATE_BLOCKED &&
1953                     (now < cur->start_time ||
1954                      now - cur->start_time < root->fs_info->commit_interval)) {
1955                         spin_unlock(&root->fs_info->trans_lock);
1956                         delay = HZ * 5;
1957                         goto sleep;
1958                 }
1959                 transid = cur->transid;
1960                 spin_unlock(&root->fs_info->trans_lock);
1961
1962                 /* If the file system is aborted, this will always fail. */
1963                 trans = btrfs_attach_transaction(root);
1964                 if (IS_ERR(trans)) {
1965                         if (PTR_ERR(trans) != -ENOENT)
1966                                 cannot_commit = true;
1967                         goto sleep;
1968                 }
1969                 if (transid == trans->transid) {
1970                         btrfs_commit_transaction(trans, root);
1971                 } else {
1972                         btrfs_end_transaction(trans, root);
1973                 }
1974 sleep:
1975                 wake_up_process(root->fs_info->cleaner_kthread);
1976                 mutex_unlock(&root->fs_info->transaction_kthread_mutex);
1977
1978                 if (unlikely(test_bit(BTRFS_FS_STATE_ERROR,
1979                                       &root->fs_info->fs_state)))
1980                         btrfs_cleanup_transaction(root);
1981                 if (!try_to_freeze()) {
1982                         set_current_state(TASK_INTERRUPTIBLE);
1983                         if (!kthread_should_stop() &&
1984                             (!btrfs_transaction_blocked(root->fs_info) ||
1985                              cannot_commit))
1986                                 schedule_timeout(delay);
1987                         __set_current_state(TASK_RUNNING);
1988                 }
1989         } while (!kthread_should_stop());
1990         return 0;
1991 }
1992
1993 /*
1994  * this will find the highest generation in the array of
1995  * root backups.  The index of the highest array is returned,
1996  * or -1 if we can't find anything.
1997  *
1998  * We check to make sure the array is valid by comparing the
1999  * generation of the latest  root in the array with the generation
2000  * in the super block.  If they don't match we pitch it.
2001  */
2002 static int find_newest_super_backup(struct btrfs_fs_info *info, u64 newest_gen)
2003 {
2004         u64 cur;
2005         int newest_index = -1;
2006         struct btrfs_root_backup *root_backup;
2007         int i;
2008
2009         for (i = 0; i < BTRFS_NUM_BACKUP_ROOTS; i++) {
2010                 root_backup = info->super_copy->super_roots + i;
2011                 cur = btrfs_backup_tree_root_gen(root_backup);
2012                 if (cur == newest_gen)
2013                         newest_index = i;
2014         }
2015
2016         /* check to see if we actually wrapped around */
2017         if (newest_index == BTRFS_NUM_BACKUP_ROOTS - 1) {
2018                 root_backup = info->super_copy->super_roots;
2019                 cur = btrfs_backup_tree_root_gen(root_backup);
2020                 if (cur == newest_gen)
2021                         newest_index = 0;
2022         }
2023         return newest_index;
2024 }
2025
2026
2027 /*
2028  * find the oldest backup so we know where to store new entries
2029  * in the backup array.  This will set the backup_root_index
2030  * field in the fs_info struct
2031  */
2032 static void find_oldest_super_backup(struct btrfs_fs_info *info,
2033                                      u64 newest_gen)
2034 {
2035         int newest_index = -1;
2036
2037         newest_index = find_newest_super_backup(info, newest_gen);
2038         /* if there was garbage in there, just move along */
2039         if (newest_index == -1) {
2040                 info->backup_root_index = 0;
2041         } else {
2042                 info->backup_root_index = (newest_index + 1) % BTRFS_NUM_BACKUP_ROOTS;
2043         }
2044 }
2045
2046 /*
2047  * copy all the root pointers into the super backup array.
2048  * this will bump the backup pointer by one when it is
2049  * done
2050  */
2051 static void backup_super_roots(struct btrfs_fs_info *info)
2052 {
2053         int next_backup;
2054         struct btrfs_root_backup *root_backup;
2055         int last_backup;
2056
2057         next_backup = info->backup_root_index;
2058         last_backup = (next_backup + BTRFS_NUM_BACKUP_ROOTS - 1) %
2059                 BTRFS_NUM_BACKUP_ROOTS;
2060
2061         /*
2062          * just overwrite the last backup if we're at the same generation
2063          * this happens only at umount
2064          */
2065         root_backup = info->super_for_commit->super_roots + last_backup;
2066         if (btrfs_backup_tree_root_gen(root_backup) ==
2067             btrfs_header_generation(info->tree_root->node))
2068                 next_backup = last_backup;
2069
2070         root_backup = info->super_for_commit->super_roots + next_backup;
2071
2072         /*
2073          * make sure all of our padding and empty slots get zero filled
2074          * regardless of which ones we use today
2075          */
2076         memset(root_backup, 0, sizeof(*root_backup));
2077
2078         info->backup_root_index = (next_backup + 1) % BTRFS_NUM_BACKUP_ROOTS;
2079
2080         btrfs_set_backup_tree_root(root_backup, info->tree_root->node->start);
2081         btrfs_set_backup_tree_root_gen(root_backup,
2082                                btrfs_header_generation(info->tree_root->node));
2083
2084         btrfs_set_backup_tree_root_level(root_backup,
2085                                btrfs_header_level(info->tree_root->node));
2086
2087         btrfs_set_backup_chunk_root(root_backup, info->chunk_root->node->start);
2088         btrfs_set_backup_chunk_root_gen(root_backup,
2089                                btrfs_header_generation(info->chunk_root->node));
2090         btrfs_set_backup_chunk_root_level(root_backup,
2091                                btrfs_header_level(info->chunk_root->node));
2092
2093         btrfs_set_backup_extent_root(root_backup, info->extent_root->node->start);
2094         btrfs_set_backup_extent_root_gen(root_backup,
2095                                btrfs_header_generation(info->extent_root->node));
2096         btrfs_set_backup_extent_root_level(root_backup,
2097                                btrfs_header_level(info->extent_root->node));
2098
2099         /*
2100          * we might commit during log recovery, which happens before we set
2101          * the fs_root.  Make sure it is valid before we fill it in.
2102          */
2103         if (info->fs_root && info->fs_root->node) {
2104                 btrfs_set_backup_fs_root(root_backup,
2105                                          info->fs_root->node->start);
2106                 btrfs_set_backup_fs_root_gen(root_backup,
2107                                btrfs_header_generation(info->fs_root->node));
2108                 btrfs_set_backup_fs_root_level(root_backup,
2109                                btrfs_header_level(info->fs_root->node));
2110         }
2111
2112         btrfs_set_backup_dev_root(root_backup, info->dev_root->node->start);
2113         btrfs_set_backup_dev_root_gen(root_backup,
2114                                btrfs_header_generation(info->dev_root->node));
2115         btrfs_set_backup_dev_root_level(root_backup,
2116                                        btrfs_header_level(info->dev_root->node));
2117
2118         btrfs_set_backup_csum_root(root_backup, info->csum_root->node->start);
2119         btrfs_set_backup_csum_root_gen(root_backup,
2120                                btrfs_header_generation(info->csum_root->node));
2121         btrfs_set_backup_csum_root_level(root_backup,
2122                                btrfs_header_level(info->csum_root->node));
2123
2124         btrfs_set_backup_total_bytes(root_backup,
2125                              btrfs_super_total_bytes(info->super_copy));
2126         btrfs_set_backup_bytes_used(root_backup,
2127                              btrfs_super_bytes_used(info->super_copy));
2128         btrfs_set_backup_num_devices(root_backup,
2129                              btrfs_super_num_devices(info->super_copy));
2130
2131         /*
2132          * if we don't copy this out to the super_copy, it won't get remembered
2133          * for the next commit
2134          */
2135         memcpy(&info->super_copy->super_roots,
2136                &info->super_for_commit->super_roots,
2137                sizeof(*root_backup) * BTRFS_NUM_BACKUP_ROOTS);
2138 }
2139
2140 /*
2141  * this copies info out of the root backup array and back into
2142  * the in-memory super block.  It is meant to help iterate through
2143  * the array, so you send it the number of backups you've already
2144  * tried and the last backup index you used.
2145  *
2146  * this returns -1 when it has tried all the backups
2147  */
2148 static noinline int next_root_backup(struct btrfs_fs_info *info,
2149                                      struct btrfs_super_block *super,
2150                                      int *num_backups_tried, int *backup_index)
2151 {
2152         struct btrfs_root_backup *root_backup;
2153         int newest = *backup_index;
2154
2155         if (*num_backups_tried == 0) {
2156                 u64 gen = btrfs_super_generation(super);
2157
2158                 newest = find_newest_super_backup(info, gen);
2159                 if (newest == -1)
2160                         return -1;
2161
2162                 *backup_index = newest;
2163                 *num_backups_tried = 1;
2164         } else if (*num_backups_tried == BTRFS_NUM_BACKUP_ROOTS) {
2165                 /* we've tried all the backups, all done */
2166                 return -1;
2167         } else {
2168                 /* jump to the next oldest backup */
2169                 newest = (*backup_index + BTRFS_NUM_BACKUP_ROOTS - 1) %
2170                         BTRFS_NUM_BACKUP_ROOTS;
2171                 *backup_index = newest;
2172                 *num_backups_tried += 1;
2173         }
2174         root_backup = super->super_roots + newest;
2175
2176         btrfs_set_super_generation(super,
2177                                    btrfs_backup_tree_root_gen(root_backup));
2178         btrfs_set_super_root(super, btrfs_backup_tree_root(root_backup));
2179         btrfs_set_super_root_level(super,
2180                                    btrfs_backup_tree_root_level(root_backup));
2181         btrfs_set_super_bytes_used(super, btrfs_backup_bytes_used(root_backup));
2182
2183         /*
2184          * fixme: the total bytes and num_devices need to match or we should
2185          * need a fsck
2186          */
2187         btrfs_set_super_total_bytes(super, btrfs_backup_total_bytes(root_backup));
2188         btrfs_set_super_num_devices(super, btrfs_backup_num_devices(root_backup));
2189         return 0;
2190 }
2191
/*
 * Helper to clean up workers: calls btrfs_destroy_workqueue() on each of
 * the workqueue pointers kept in @fs_info, in the order listed below.
 *
 * NOTE(review): the order may be significant if work on one queue can
 * enqueue onto another - confirm before rearranging.
 */
static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info)
{
	btrfs_destroy_workqueue(fs_info->fixup_workers);
	btrfs_destroy_workqueue(fs_info->delalloc_workers);
	btrfs_destroy_workqueue(fs_info->workers);
	btrfs_destroy_workqueue(fs_info->endio_workers);
	btrfs_destroy_workqueue(fs_info->endio_meta_workers);
	btrfs_destroy_workqueue(fs_info->endio_raid56_workers);
	btrfs_destroy_workqueue(fs_info->endio_repair_workers);
	btrfs_destroy_workqueue(fs_info->rmw_workers);
	btrfs_destroy_workqueue(fs_info->endio_meta_write_workers);
	btrfs_destroy_workqueue(fs_info->endio_write_workers);
	btrfs_destroy_workqueue(fs_info->endio_freespace_worker);
	btrfs_destroy_workqueue(fs_info->submit_workers);
	btrfs_destroy_workqueue(fs_info->delayed_workers);
	btrfs_destroy_workqueue(fs_info->caching_workers);
	btrfs_destroy_workqueue(fs_info->readahead_workers);
	btrfs_destroy_workqueue(fs_info->flush_workers);
	btrfs_destroy_workqueue(fs_info->qgroup_rescan_workers);
	btrfs_destroy_workqueue(fs_info->extent_workers);
}
2214
2215 static void free_root_extent_buffers(struct btrfs_root *root)
2216 {
2217         if (root) {
2218                 free_extent_buffer(root->node);
2219                 free_extent_buffer(root->commit_root);
2220                 root->node = NULL;
2221                 root->commit_root = NULL;
2222         }
2223 }
2224
2225 /* helper to cleanup tree roots */
2226 static void free_root_pointers(struct btrfs_fs_info *info, int chunk_root)
2227 {
2228         free_root_extent_buffers(info->tree_root);
2229
2230         free_root_extent_buffers(info->dev_root);
2231         free_root_extent_buffers(info->extent_root);
2232         free_root_extent_buffers(info->csum_root);
2233         free_root_extent_buffers(info->quota_root);
2234         free_root_extent_buffers(info->uuid_root);
2235         if (chunk_root)
2236                 free_root_extent_buffers(info->chunk_root);
2237 }
2238
2239 void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info)
2240 {
2241         int ret;
2242         struct btrfs_root *gang[8];
2243         int i;
2244
2245         while (!list_empty(&fs_info->dead_roots)) {
2246                 gang[0] = list_entry(fs_info->dead_roots.next,
2247                                      struct btrfs_root, root_list);
2248                 list_del(&gang[0]->root_list);
2249
2250                 if (test_bit(BTRFS_ROOT_IN_RADIX, &gang[0]->state)) {
2251                         btrfs_drop_and_free_fs_root(fs_info, gang[0]);
2252                 } else {
2253                         free_extent_buffer(gang[0]->node);
2254                         free_extent_buffer(gang[0]->commit_root);
2255                         btrfs_put_fs_root(gang[0]);
2256                 }
2257         }
2258
2259         while (1) {
2260                 ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix,
2261                                              (void **)gang, 0,
2262                                              ARRAY_SIZE(gang));
2263                 if (!ret)
2264                         break;
2265                 for (i = 0; i < ret; i++)
2266                         btrfs_drop_and_free_fs_root(fs_info, gang[i]);
2267         }
2268
2269         if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) {
2270                 btrfs_free_log_root_tree(NULL, fs_info);
2271                 btrfs_destroy_pinned_extent(fs_info->tree_root,
2272                                             fs_info->pinned_extents);
2273         }
2274 }
2275
2276 static void btrfs_init_scrub(struct btrfs_fs_info *fs_info)
2277 {
2278         mutex_init(&fs_info->scrub_lock);
2279         atomic_set(&fs_info->scrubs_running, 0);
2280         atomic_set(&fs_info->scrub_pause_req, 0);
2281         atomic_set(&fs_info->scrubs_paused, 0);
2282         atomic_set(&fs_info->scrub_cancel_req, 0);
2283         init_waitqueue_head(&fs_info->scrub_pause_wait);
2284         fs_info->scrub_workers_refcnt = 0;
2285 }
2286
2287 static void btrfs_init_balance(struct btrfs_fs_info *fs_info)
2288 {
2289         spin_lock_init(&fs_info->balance_lock);
2290         mutex_init(&fs_info->balance_mutex);
2291         atomic_set(&fs_info->balance_running, 0);
2292         atomic_set(&fs_info->balance_pause_req, 0);
2293         atomic_set(&fs_info->balance_cancel_req, 0);
2294         fs_info->balance_ctl = NULL;
2295         init_waitqueue_head(&fs_info->balance_wait_q);
2296 }
2297
2298 static void btrfs_init_btree_inode(struct btrfs_fs_info *fs_info,
2299                                    struct btrfs_root *tree_root)
2300 {
2301         fs_info->btree_inode->i_ino = BTRFS_BTREE_INODE_OBJECTID;
2302         set_nlink(fs_info->btree_inode, 1);
2303         /*
2304          * we set the i_size on the btree inode to the max possible int.
2305          * the real end of the address space is determined by all of
2306          * the devices in the system
2307          */
2308         fs_info->btree_inode->i_size = OFFSET_MAX;
2309         fs_info->btree_inode->i_mapping->a_ops = &btree_aops;
2310
2311         RB_CLEAR_NODE(&BTRFS_I(fs_info->btree_inode)->rb_node);
2312         extent_io_tree_init(&BTRFS_I(fs_info->btree_inode)->io_tree,
2313                              fs_info->btree_inode->i_mapping);
2314         BTRFS_I(fs_info->btree_inode)->io_tree.track_uptodate = 0;
2315         extent_map_tree_init(&BTRFS_I(fs_info->btree_inode)->extent_tree);
2316
2317         BTRFS_I(fs_info->btree_inode)->io_tree.ops = &btree_extent_io_ops;
2318
2319         BTRFS_I(fs_info->btree_inode)->root = tree_root;
2320         memset(&BTRFS_I(fs_info->btree_inode)->location, 0,
2321                sizeof(struct btrfs_key));
2322         set_bit(BTRFS_INODE_DUMMY,
2323                 &BTRFS_I(fs_info->btree_inode)->runtime_flags);
2324         btrfs_insert_inode_hash(fs_info->btree_inode);
2325 }
2326
2327 static void btrfs_init_dev_replace_locks(struct btrfs_fs_info *fs_info)
2328 {
2329         fs_info->dev_replace.lock_owner = 0;
2330         atomic_set(&fs_info->dev_replace.nesting_level, 0);
2331         mutex_init(&fs_info->dev_replace.lock_finishing_cancel_unmount);
2332         mutex_init(&fs_info->dev_replace.lock_management_lock);
2333         mutex_init(&fs_info->dev_replace.lock);
2334         init_waitqueue_head(&fs_info->replace_wait);
2335 }
2336
2337 static void btrfs_init_qgroup(struct btrfs_fs_info *fs_info)
2338 {
2339         spin_lock_init(&fs_info->qgroup_lock);
2340         mutex_init(&fs_info->qgroup_ioctl_lock);
2341         fs_info->qgroup_tree = RB_ROOT;
2342         fs_info->qgroup_op_tree = RB_ROOT;
2343         INIT_LIST_HEAD(&fs_info->dirty_qgroups);
2344         fs_info->qgroup_seq = 1;
2345         fs_info->quota_enabled = 0;
2346         fs_info->pending_quota_state = 0;
2347         fs_info->qgroup_ulist = NULL;
2348         fs_info->qgroup_rescan_running = false;
2349         mutex_init(&fs_info->qgroup_rescan_lock);
2350 }
2351
2352 static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info,
2353                 struct btrfs_fs_devices *fs_devices)
2354 {
2355         int max_active = fs_info->thread_pool_size;
2356         unsigned int flags = WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_UNBOUND;
2357
2358         fs_info->workers =
2359                 btrfs_alloc_workqueue("worker", flags | WQ_HIGHPRI,
2360                                       max_active, 16);
2361
2362         fs_info->delalloc_workers =
2363                 btrfs_alloc_workqueue("delalloc", flags, max_active, 2);
2364
2365         fs_info->flush_workers =
2366                 btrfs_alloc_workqueue("flush_delalloc", flags, max_active, 0);
2367
2368         fs_info->caching_workers =
2369                 btrfs_alloc_workqueue("cache", flags, max_active, 0);
2370
2371         /*
2372          * a higher idle thresh on the submit workers makes it much more
2373          * likely that bios will be send down in a sane order to the
2374          * devices
2375          */
2376         fs_info->submit_workers =
2377                 btrfs_alloc_workqueue("submit", flags,
2378                                       min_t(u64, fs_devices->num_devices,
2379                                             max_active), 64);
2380
2381         fs_info->fixup_workers =
2382                 btrfs_alloc_workqueue("fixup", flags, 1, 0);
2383
2384         /*
2385          * endios are largely parallel and should have a very
2386          * low idle thresh
2387          */
2388         fs_info->endio_workers =
2389                 btrfs_alloc_workqueue("endio", flags, max_active, 4);
2390         fs_info->endio_meta_workers =
2391                 btrfs_alloc_workqueue("endio-meta", flags, max_active, 4);
2392         fs_info->endio_meta_write_workers =
2393                 btrfs_alloc_workqueue("endio-meta-write", flags, max_active, 2);
2394         fs_info->endio_raid56_workers =
2395                 btrfs_alloc_workqueue("endio-raid56", flags, max_active, 4);
2396         fs_info->endio_repair_workers =
2397                 btrfs_alloc_workqueue("endio-repair", flags, 1, 0);
2398         fs_info->rmw_workers =
2399                 btrfs_alloc_workqueue("rmw", flags, max_active, 2);
2400         fs_info->endio_write_workers =
2401                 btrfs_alloc_workqueue("endio-write", flags, max_active, 2);
2402         fs_info->endio_freespace_worker =
2403                 btrfs_alloc_workqueue("freespace-write", flags, max_active, 0);
2404         fs_info->delayed_workers =
2405                 btrfs_alloc_workqueue("delayed-meta", flags, max_active, 0);
2406         fs_info->readahead_workers =
2407                 btrfs_alloc_workqueue("readahead", flags, max_active, 2);
2408         fs_info->qgroup_rescan_workers =
2409                 btrfs_alloc_workqueue("qgroup-rescan", flags, 1, 0);
2410         fs_info->extent_workers =
2411                 btrfs_alloc_workqueue("extent-refs", flags,
2412                                       min_t(u64, fs_devices->num_devices,
2413                                             max_active), 8);
2414
2415         if (!(fs_info->workers && fs_info->delalloc_workers &&
2416               fs_info->submit_workers && fs_info->flush_workers &&
2417               fs_info->endio_workers && fs_info->endio_meta_workers &&
2418               fs_info->endio_meta_write_workers &&
2419               fs_info->endio_repair_workers &&
2420               fs_info->endio_write_workers && fs_info->endio_raid56_workers &&
2421               fs_info->endio_freespace_worker && fs_info->rmw_workers &&
2422               fs_info->caching_workers && fs_info->readahead_workers &&
2423               fs_info->fixup_workers && fs_info->delayed_workers &&
2424               fs_info->extent_workers &&
2425               fs_info->qgroup_rescan_workers)) {
2426                 return -ENOMEM;
2427         }
2428
2429         return 0;
2430 }
2431
2432 static int btrfs_replay_log(struct btrfs_fs_info *fs_info,
2433                             struct btrfs_fs_devices *fs_devices)
2434 {
2435         int ret;
2436         struct btrfs_root *tree_root = fs_info->tree_root;
2437         struct btrfs_root *log_tree_root;
2438         struct btrfs_super_block *disk_super = fs_info->super_copy;
2439         u64 bytenr = btrfs_super_log_root(disk_super);
2440
2441         if (fs_devices->rw_devices == 0) {
2442                 btrfs_warn(fs_info, "log replay required on RO media");
2443                 return -EIO;
2444         }
2445
2446         log_tree_root = btrfs_alloc_root(fs_info);
2447         if (!log_tree_root)
2448                 return -ENOMEM;
2449
2450         __setup_root(tree_root->nodesize, tree_root->sectorsize,
2451                         tree_root->stripesize, log_tree_root, fs_info,
2452                         BTRFS_TREE_LOG_OBJECTID);
2453
2454         log_tree_root->node = read_tree_block(tree_root, bytenr,
2455                         fs_info->generation + 1);
2456         if (IS_ERR(log_tree_root->node)) {
2457                 btrfs_warn(fs_info, "failed to read log tree");
2458                 ret = PTR_ERR(log_tree_root->node);
2459                 kfree(log_tree_root);
2460                 return ret;
2461         } else if (!extent_buffer_uptodate(log_tree_root->node)) {
2462                 btrfs_err(fs_info, "failed to read log tree");
2463                 free_extent_buffer(log_tree_root->node);
2464                 kfree(log_tree_root);
2465                 return -EIO;
2466         }
2467         /* returns with log_tree_root freed on success */
2468         ret = btrfs_recover_log_trees(log_tree_root);
2469         if (ret) {
2470                 btrfs_std_error(tree_root->fs_info, ret,
2471                             "Failed to recover log tree");
2472                 free_extent_buffer(log_tree_root->node);
2473                 kfree(log_tree_root);
2474                 return ret;
2475         }
2476
2477         if (fs_info->sb->s_flags & MS_RDONLY) {
2478                 ret = btrfs_commit_super(tree_root);
2479                 if (ret)
2480                         return ret;
2481         }
2482
2483         return 0;
2484 }
2485
2486 static int btrfs_read_roots(struct btrfs_fs_info *fs_info,
2487                             struct btrfs_root *tree_root)
2488 {
2489         struct btrfs_root *root;
2490         struct btrfs_key location;
2491         int ret;
2492
2493         location.objectid = BTRFS_EXTENT_TREE_OBJECTID;
2494         location.type = BTRFS_ROOT_ITEM_KEY;
2495         location.offset = 0;
2496
2497         root = btrfs_read_tree_root(tree_root, &location);
2498         if (IS_ERR(root))
2499                 return PTR_ERR(root);
2500         set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
2501         fs_info->extent_root = root;
2502
2503         location.objectid = BTRFS_DEV_TREE_OBJECTID;
2504         root = btrfs_read_tree_root(tree_root, &location);
2505         if (IS_ERR(root))
2506                 return PTR_ERR(root);
2507         set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
2508         fs_info->dev_root = root;
2509         btrfs_init_devices_late(fs_info);
2510
2511         location.objectid = BTRFS_CSUM_TREE_OBJECTID;
2512         root = btrfs_read_tree_root(tree_root, &location);
2513         if (IS_ERR(root))
2514                 return PTR_ERR(root);
2515         set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
2516         fs_info->csum_root = root;
2517
2518         location.objectid = BTRFS_QUOTA_TREE_OBJECTID;
2519         root = btrfs_read_tree_root(tree_root, &location);
2520         if (!IS_ERR(root)) {
2521                 set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
2522                 fs_info->quota_enabled = 1;
2523                 fs_info->pending_quota_state = 1;
2524                 fs_info->quota_root = root;
2525         }
2526
2527         location.objectid = BTRFS_UUID_TREE_OBJECTID;
2528         root = btrfs_read_tree_root(tree_root, &location);
2529         if (IS_ERR(root)) {
2530                 ret = PTR_ERR(root);
2531                 if (ret != -ENOENT)
2532                         return ret;
2533         } else {
2534                 set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
2535                 fs_info->uuid_root = root;
2536         }
2537
2538         return 0;
2539 }
2540
2541 int open_ctree(struct super_block *sb,
2542                struct btrfs_fs_devices *fs_devices,
2543                char *options)
2544 {
2545         u32 sectorsize;
2546         u32 nodesize;
2547         u32 stripesize;
2548         u64 generation;
2549         u64 features;
2550         struct btrfs_key location;
2551         struct buffer_head *bh;
2552         struct btrfs_super_block *disk_super;
2553         struct btrfs_fs_info *fs_info = btrfs_sb(sb);
2554         struct btrfs_root *tree_root;
2555         struct btrfs_root *chunk_root;
2556         int ret;
2557         int err = -EINVAL;
2558         int num_backups_tried = 0;
2559         int backup_index = 0;
2560         int max_active;
2561
2562         tree_root = fs_info->tree_root = btrfs_alloc_root(fs_info);
2563         chunk_root = fs_info->chunk_root = btrfs_alloc_root(fs_info);
2564         if (!tree_root || !chunk_root) {
2565                 err = -ENOMEM;
2566                 goto fail;
2567         }
2568
2569         ret = init_srcu_struct(&fs_info->subvol_srcu);
2570         if (ret) {
2571                 err = ret;
2572                 goto fail;
2573         }
2574
2575         ret = setup_bdi(fs_info, &fs_info->bdi);
2576         if (ret) {
2577                 err = ret;
2578                 goto fail_srcu;
2579         }
2580
2581         ret = percpu_counter_init(&fs_info->dirty_metadata_bytes, 0, GFP_KERNEL);
2582         if (ret) {
2583                 err = ret;
2584                 goto fail_bdi;
2585         }
2586         fs_info->dirty_metadata_batch = PAGE_CACHE_SIZE *
2587                                         (1 + ilog2(nr_cpu_ids));
2588
2589         ret = percpu_counter_init(&fs_info->delalloc_bytes, 0, GFP_KERNEL);
2590         if (ret) {
2591                 err = ret;
2592                 goto fail_dirty_metadata_bytes;
2593         }
2594
2595         ret = percpu_counter_init(&fs_info->bio_counter, 0, GFP_KERNEL);
2596         if (ret) {
2597                 err = ret;
2598                 goto fail_delalloc_bytes;
2599         }
2600
2601         fs_info->btree_inode = new_inode(sb);
2602         if (!fs_info->btree_inode) {
2603                 err = -ENOMEM;
2604                 goto fail_bio_counter;
2605         }
2606
2607         mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS);
2608
2609         INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC);
2610         INIT_RADIX_TREE(&fs_info->buffer_radix, GFP_ATOMIC);
2611         INIT_LIST_HEAD(&fs_info->trans_list);
2612         INIT_LIST_HEAD(&fs_info->dead_roots);
2613         INIT_LIST_HEAD(&fs_info->delayed_iputs);
2614         INIT_LIST_HEAD(&fs_info->delalloc_roots);
2615         INIT_LIST_HEAD(&fs_info->caching_block_groups);
2616         spin_lock_init(&fs_info->delalloc_root_lock);
2617         spin_lock_init(&fs_info->trans_lock);
2618         spin_lock_init(&fs_info->fs_roots_radix_lock);
2619         spin_lock_init(&fs_info->delayed_iput_lock);
2620         spin_lock_init(&fs_info->defrag_inodes_lock);
2621         spin_lock_init(&fs_info->free_chunk_lock);
2622         spin_lock_init(&fs_info->tree_mod_seq_lock);
2623         spin_lock_init(&fs_info->super_lock);
2624         spin_lock_init(&fs_info->qgroup_op_lock);
2625         spin_lock_init(&fs_info->buffer_lock);
2626         spin_lock_init(&fs_info->unused_bgs_lock);
2627         rwlock_init(&fs_info->tree_mod_log_lock);
2628         mutex_init(&fs_info->unused_bg_unpin_mutex);
2629         mutex_init(&fs_info->delete_unused_bgs_mutex);
2630         mutex_init(&fs_info->reloc_mutex);
2631         mutex_init(&fs_info->delalloc_root_mutex);
2632         mutex_init(&fs_info->cleaner_delayed_iput_mutex);
2633         seqlock_init(&fs_info->profiles_lock);
2634
2635         INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots);
2636         INIT_LIST_HEAD(&fs_info->space_info);
2637         INIT_LIST_HEAD(&fs_info->tree_mod_seq_list);
2638         INIT_LIST_HEAD(&fs_info->unused_bgs);
2639         btrfs_mapping_init(&fs_info->mapping_tree);
2640         btrfs_init_block_rsv(&fs_info->global_block_rsv,
2641                              BTRFS_BLOCK_RSV_GLOBAL);
2642         btrfs_init_block_rsv(&fs_info->delalloc_block_rsv,
2643                              BTRFS_BLOCK_RSV_DELALLOC);
2644         btrfs_init_block_rsv(&fs_info->trans_block_rsv, BTRFS_BLOCK_RSV_TRANS);
2645         btrfs_init_block_rsv(&fs_info->chunk_block_rsv, BTRFS_BLOCK_RSV_CHUNK);
2646         btrfs_init_block_rsv(&fs_info->empty_block_rsv, BTRFS_BLOCK_RSV_EMPTY);
2647         btrfs_init_block_rsv(&fs_info->delayed_block_rsv,
2648                              BTRFS_BLOCK_RSV_DELOPS);
2649         atomic_set(&fs_info->nr_async_submits, 0);
2650         atomic_set(&fs_info->async_delalloc_pages, 0);
2651         atomic_set(&fs_info->async_submit_draining, 0);
2652         atomic_set(&fs_info->nr_async_bios, 0);
2653         atomic_set(&fs_info->defrag_running, 0);
2654         atomic_set(&fs_info->qgroup_op_seq, 0);
2655         atomic64_set(&fs_info->tree_mod_seq, 0);
2656         fs_info->sb = sb;
2657         fs_info->max_inline = BTRFS_DEFAULT_MAX_INLINE;
2658         fs_info->metadata_ratio = 0;
2659         fs_info->defrag_inodes = RB_ROOT;
2660         fs_info->free_chunk_space = 0;
2661         fs_info->tree_mod_log = RB_ROOT;
2662         fs_info->commit_interval = BTRFS_DEFAULT_COMMIT_INTERVAL;
2663         fs_info->avg_delayed_ref_runtime = NSEC_PER_SEC >> 6; /* div by 64 */
2664         /* readahead state */
2665         INIT_RADIX_TREE(&fs_info->reada_tree, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
2666         spin_lock_init(&fs_info->reada_lock);
2667
2668         fs_info->thread_pool_size = min_t(unsigned long,
2669                                           num_online_cpus() + 2, 8);
2670
2671         INIT_LIST_HEAD(&fs_info->ordered_roots);
2672         spin_lock_init(&fs_info->ordered_root_lock);
2673         fs_info->delayed_root = kmalloc(sizeof(struct btrfs_delayed_root),
2674                                         GFP_NOFS);
2675         if (!fs_info->delayed_root) {
2676                 err = -ENOMEM;
2677                 goto fail_iput;
2678         }
2679         btrfs_init_delayed_root(fs_info->delayed_root);
2680
2681         btrfs_init_scrub(fs_info);
2682 #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
2683         fs_info->check_integrity_print_mask = 0;
2684 #endif
2685         btrfs_init_balance(fs_info);
2686         btrfs_init_async_reclaim_work(&fs_info->async_reclaim_work);
2687
2688         sb->s_blocksize = 4096;
2689         sb->s_blocksize_bits = blksize_bits(4096);
2690         sb->s_bdi = &fs_info->bdi;
2691
2692         btrfs_init_btree_inode(fs_info, tree_root);
2693
2694         spin_lock_init(&fs_info->block_group_cache_lock);
2695         fs_info->block_group_cache_tree = RB_ROOT;
2696         fs_info->first_logical_byte = (u64)-1;
2697
2698         extent_io_tree_init(&fs_info->freed_extents[0],
2699                              fs_info->btree_inode->i_mapping);
2700         extent_io_tree_init(&fs_info->freed_extents[1],
2701                              fs_info->btree_inode->i_mapping);
2702         fs_info->pinned_extents = &fs_info->freed_extents[0];
2703         fs_info->do_barriers = 1;
2704
2705
2706         mutex_init(&fs_info->ordered_operations_mutex);
2707         mutex_init(&fs_info->tree_log_mutex);
2708         mutex_init(&fs_info->chunk_mutex);
2709         mutex_init(&fs_info->transaction_kthread_mutex);
2710         mutex_init(&fs_info->cleaner_mutex);
2711         mutex_init(&fs_info->volume_mutex);
2712         mutex_init(&fs_info->ro_block_group_mutex);
2713         init_rwsem(&fs_info->commit_root_sem);
2714         init_rwsem(&fs_info->cleanup_work_sem);
2715         init_rwsem(&fs_info->subvol_sem);
2716         sema_init(&fs_info->uuid_tree_rescan_sem, 1);
2717
2718         btrfs_init_dev_replace_locks(fs_info);
2719         btrfs_init_qgroup(fs_info);
2720
2721         btrfs_init_free_cluster(&fs_info->meta_alloc_cluster);
2722         btrfs_init_free_cluster(&fs_info->data_alloc_cluster);
2723
2724         init_waitqueue_head(&fs_info->transaction_throttle);
2725         init_waitqueue_head(&fs_info->transaction_wait);
2726         init_waitqueue_head(&fs_info->transaction_blocked_wait);
2727         init_waitqueue_head(&fs_info->async_submit_wait);
2728
2729         INIT_LIST_HEAD(&fs_info->pinned_chunks);
2730
2731         ret = btrfs_alloc_stripe_hash_table(fs_info);
2732         if (ret) {
2733                 err = ret;
2734                 goto fail_alloc;
2735         }
2736
2737         __setup_root(4096, 4096, 4096, tree_root,
2738                      fs_info, BTRFS_ROOT_TREE_OBJECTID);
2739
2740         invalidate_bdev(fs_devices->latest_bdev);
2741
2742         /*
2743          * Read super block and check the signature bytes only
2744          */
2745         bh = btrfs_read_dev_super(fs_devices->latest_bdev);
2746         if (IS_ERR(bh)) {
2747                 err = PTR_ERR(bh);
2748                 goto fail_alloc;
2749         }
2750
2751         /*
2752          * We want to check superblock checksum, the type is stored inside.
2753          * Pass the whole disk block of size BTRFS_SUPER_INFO_SIZE (4k).
2754          */
2755         if (btrfs_check_super_csum(bh->b_data)) {
2756                 printk(KERN_ERR "BTRFS: superblock checksum mismatch\n");
2757                 err = -EINVAL;
2758                 brelse(bh);
2759                 goto fail_alloc;
2760         }
2761
2762         /*
2763          * super_copy is zeroed at allocation time and we never touch the
2764          * following bytes up to INFO_SIZE, the checksum is calculated from
2765          * the whole block of INFO_SIZE
2766          */
2767         memcpy(fs_info->super_copy, bh->b_data, sizeof(*fs_info->super_copy));
2768         memcpy(fs_info->super_for_commit, fs_info->super_copy,
2769                sizeof(*fs_info->super_for_commit));
2770         brelse(bh);
2771
2772         memcpy(fs_info->fsid, fs_info->super_copy->fsid, BTRFS_FSID_SIZE);
2773
2774         ret = btrfs_check_super_valid(fs_info, sb->s_flags & MS_RDONLY);
2775         if (ret) {
2776                 printk(KERN_ERR "BTRFS: superblock contains fatal errors\n");
2777                 err = -EINVAL;
2778                 goto fail_alloc;
2779         }
2780
2781         disk_super = fs_info->super_copy;
2782         if (!btrfs_super_root(disk_super))
2783                 goto fail_alloc;
2784
2785         /* check FS state, whether FS is broken. */
2786         if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_ERROR)
2787                 set_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state);
2788
2789         /*
2790          * run through our array of backup supers and setup
2791          * our ring pointer to the oldest one
2792          */
2793         generation = btrfs_super_generation(disk_super);
2794         find_oldest_super_backup(fs_info, generation);
2795
2796         /*
2797          * In the long term, we'll store the compression type in the super
2798          * block, and it'll be used for per file compression control.
2799          */
2800         fs_info->compress_type = BTRFS_COMPRESS_ZLIB;
2801
2802         ret = btrfs_parse_options(tree_root, options);
2803         if (ret) {
2804                 err = ret;
2805                 goto fail_alloc;
2806         }
2807
2808         features = btrfs_super_incompat_flags(disk_super) &
2809                 ~BTRFS_FEATURE_INCOMPAT_SUPP;
2810         if (features) {
2811                 printk(KERN_ERR "BTRFS: couldn't mount because of "
2812                        "unsupported optional features (%Lx).\n",
2813                        features);
2814                 err = -EINVAL;
2815                 goto fail_alloc;
2816         }
2817
2818         /*
2819          * Leafsize and nodesize were always equal, this is only a sanity check.
2820          */
2821         if (le32_to_cpu(disk_super->__unused_leafsize) !=
2822             btrfs_super_nodesize(disk_super)) {
2823                 printk(KERN_ERR "BTRFS: couldn't mount because metadata "
2824                        "blocksizes don't match.  node %d leaf %d\n",
2825                        btrfs_super_nodesize(disk_super),
2826                        le32_to_cpu(disk_super->__unused_leafsize));
2827                 err = -EINVAL;
2828                 goto fail_alloc;
2829         }
2830         if (btrfs_super_nodesize(disk_super) > BTRFS_MAX_METADATA_BLOCKSIZE) {
2831                 printk(KERN_ERR "BTRFS: couldn't mount because metadata "
2832                        "blocksize (%d) was too large\n",
2833                        btrfs_super_nodesize(disk_super));
2834                 err = -EINVAL;
2835                 goto fail_alloc;
2836         }
2837
2838         features = btrfs_super_incompat_flags(disk_super);
2839         features |= BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF;
2840         if (tree_root->fs_info->compress_type == BTRFS_COMPRESS_LZO)
2841                 features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO;
2842
2843         if (features & BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA)
2844                 printk(KERN_INFO "BTRFS: has skinny extents\n");
2845
2846         /*
2847          * flag our filesystem as having big metadata blocks if
2848          * they are bigger than the page size
2849          */
2850         if (btrfs_super_nodesize(disk_super) > PAGE_CACHE_SIZE) {
2851                 if (!(features & BTRFS_FEATURE_INCOMPAT_BIG_METADATA))
2852                         printk(KERN_INFO "BTRFS: flagging fs with big metadata feature\n");
2853                 features |= BTRFS_FEATURE_INCOMPAT_BIG_METADATA;
2854         }
2855
2856         nodesize = btrfs_super_nodesize(disk_super);
2857         sectorsize = btrfs_super_sectorsize(disk_super);
2858         stripesize = btrfs_super_stripesize(disk_super);
2859         fs_info->dirty_metadata_batch = nodesize * (1 + ilog2(nr_cpu_ids));
2860         fs_info->delalloc_batch = sectorsize * 512 * (1 + ilog2(nr_cpu_ids));
2861
2862         /*
2863          * mixed block groups end up with duplicate but slightly offset
2864          * extent buffers for the same range.  It leads to corruptions
2865          */
2866         if ((features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) &&
2867             (sectorsize != nodesize)) {
2868                 printk(KERN_ERR "BTRFS: unequal leaf/node/sector sizes "
2869                                 "are not allowed for mixed block groups on %s\n",
2870                                 sb->s_id);
2871                 goto fail_alloc;
2872         }
2873
2874         /*
2875          * Needn't use the lock because there is no other task which will
2876          * update the flag.
2877          */
2878         btrfs_set_super_incompat_flags(disk_super, features);
2879
2880         features = btrfs_super_compat_ro_flags(disk_super) &
2881                 ~BTRFS_FEATURE_COMPAT_RO_SUPP;
2882         if (!(sb->s_flags & MS_RDONLY) && features) {
2883                 printk(KERN_ERR "BTRFS: couldn't mount RDWR because of "
2884                        "unsupported option features (%Lx).\n",
2885                        features);
2886                 err = -EINVAL;
2887                 goto fail_alloc;
2888         }
2889
2890         max_active = fs_info->thread_pool_size;
2891
2892         ret = btrfs_init_workqueues(fs_info, fs_devices);
2893         if (ret) {
2894                 err = ret;
2895                 goto fail_sb_buffer;
2896         }
2897
2898         fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super);
2899         fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages,
2900                                     4 * 1024 * 1024 / PAGE_CACHE_SIZE);
2901
2902         tree_root->nodesize = nodesize;
2903         tree_root->sectorsize = sectorsize;
2904         tree_root->stripesize = stripesize;
2905
2906         sb->s_blocksize = sectorsize;
2907         sb->s_blocksize_bits = blksize_bits(sectorsize);
2908
2909         if (btrfs_super_magic(disk_super) != BTRFS_MAGIC) {
2910                 printk(KERN_ERR "BTRFS: valid FS not found on %s\n", sb->s_id);
2911                 goto fail_sb_buffer;
2912         }
2913
2914         if (sectorsize != PAGE_SIZE) {
2915                 printk(KERN_ERR "BTRFS: incompatible sector size (%lu) "
2916                        "found on %s\n", (unsigned long)sectorsize, sb->s_id);
2917                 goto fail_sb_buffer;
2918         }
2919
2920         mutex_lock(&fs_info->chunk_mutex);
2921         ret = btrfs_read_sys_array(tree_root);
2922         mutex_unlock(&fs_info->chunk_mutex);
2923         if (ret) {
2924                 printk(KERN_ERR "BTRFS: failed to read the system "
2925                        "array on %s\n", sb->s_id);
2926                 goto fail_sb_buffer;
2927         }
2928
2929         generation = btrfs_super_chunk_root_generation(disk_super);
2930
2931         __setup_root(nodesize, sectorsize, stripesize, chunk_root,
2932                      fs_info, BTRFS_CHUNK_TREE_OBJECTID);
2933
2934         chunk_root->node = read_tree_block(chunk_root,
2935                                            btrfs_super_chunk_root(disk_super),
2936                                            generation);
2937         if (IS_ERR(chunk_root->node) ||
2938             !extent_buffer_uptodate(chunk_root->node)) {
2939                 printk(KERN_ERR "BTRFS: failed to read chunk root on %s\n",
2940                        sb->s_id);
2941                 if (!IS_ERR(chunk_root->node))
2942                         free_extent_buffer(chunk_root->node);
2943                 chunk_root->node = NULL;
2944                 goto fail_tree_roots;
2945         }
2946         btrfs_set_root_node(&chunk_root->root_item, chunk_root->node);
2947         chunk_root->commit_root = btrfs_root_node(chunk_root);
2948
2949         read_extent_buffer(chunk_root->node, fs_info->chunk_tree_uuid,
2950            btrfs_header_chunk_tree_uuid(chunk_root->node), BTRFS_UUID_SIZE);
2951
2952         ret = btrfs_read_chunk_tree(chunk_root);
2953         if (ret) {
2954                 printk(KERN_ERR "BTRFS: failed to read chunk tree on %s\n",
2955                        sb->s_id);
2956                 goto fail_tree_roots;
2957         }
2958
2959         /*
2960          * keep the device that is marked to be the target device for the
2961          * dev_replace procedure
2962          */
2963         btrfs_close_extra_devices(fs_devices, 0);
2964
2965         if (!fs_devices->latest_bdev) {
2966                 printk(KERN_ERR "BTRFS: failed to read devices on %s\n",
2967                        sb->s_id);
2968                 goto fail_tree_roots;
2969         }
2970
2971 retry_root_backup:
2972         generation = btrfs_super_generation(disk_super);
2973
2974         tree_root->node = read_tree_block(tree_root,
2975                                           btrfs_super_root(disk_super),
2976                                           generation);
2977         if (IS_ERR(tree_root->node) ||
2978             !extent_buffer_uptodate(tree_root->node)) {
2979                 printk(KERN_WARNING "BTRFS: failed to read tree root on %s\n",
2980                        sb->s_id);
2981                 if (!IS_ERR(tree_root->node))
2982                         free_extent_buffer(tree_root->node);
2983                 tree_root->node = NULL;
2984                 goto recovery_tree_root;
2985         }
2986
2987         btrfs_set_root_node(&tree_root->root_item, tree_root->node);
2988         tree_root->commit_root = btrfs_root_node(tree_root);
2989         btrfs_set_root_refs(&tree_root->root_item, 1);
2990
2991         mutex_lock(&tree_root->objectid_mutex);
2992         ret = btrfs_find_highest_objectid(tree_root,
2993                                         &tree_root->highest_objectid);
2994         if (ret) {
2995                 mutex_unlock(&tree_root->objectid_mutex);
2996                 goto recovery_tree_root;
2997         }
2998
2999         ASSERT(tree_root->highest_objectid <= BTRFS_LAST_FREE_OBJECTID);
3000
3001         mutex_unlock(&tree_root->objectid_mutex);
3002
3003         ret = btrfs_read_roots(fs_info, tree_root);
3004         if (ret)
3005                 goto recovery_tree_root;
3006
3007         fs_info->generation = generation;
3008         fs_info->last_trans_committed = generation;
3009
3010         ret = btrfs_recover_balance(fs_info);
3011         if (ret) {
3012                 printk(KERN_ERR "BTRFS: failed to recover balance\n");
3013                 goto fail_block_groups;
3014         }
3015
3016         ret = btrfs_init_dev_stats(fs_info);
3017         if (ret) {
3018                 printk(KERN_ERR "BTRFS: failed to init dev_stats: %d\n",
3019                        ret);
3020                 goto fail_block_groups;
3021         }
3022
3023         ret = btrfs_init_dev_replace(fs_info);
3024         if (ret) {
3025                 pr_err("BTRFS: failed to init dev_replace: %d\n", ret);
3026                 goto fail_block_groups;
3027         }
3028
3029         btrfs_close_extra_devices(fs_devices, 1);
3030
3031         ret = btrfs_sysfs_add_fsid(fs_devices, NULL);
3032         if (ret) {
3033                 pr_err("BTRFS: failed to init sysfs fsid interface: %d\n", ret);
3034                 goto fail_block_groups;
3035         }
3036
3037         ret = btrfs_sysfs_add_device(fs_devices);
3038         if (ret) {
3039                 pr_err("BTRFS: failed to init sysfs device interface: %d\n", ret);
3040                 goto fail_fsdev_sysfs;
3041         }
3042
3043         ret = btrfs_sysfs_add_mounted(fs_info);
3044         if (ret) {
3045                 pr_err("BTRFS: failed to init sysfs interface: %d\n", ret);
3046                 goto fail_fsdev_sysfs;
3047         }
3048
3049         ret = btrfs_init_space_info(fs_info);
3050         if (ret) {
3051                 printk(KERN_ERR "BTRFS: Failed to initial space info: %d\n", ret);
3052                 goto fail_sysfs;
3053         }
3054
3055         ret = btrfs_read_block_groups(fs_info->extent_root);
3056         if (ret) {
3057                 printk(KERN_ERR "BTRFS: Failed to read block groups: %d\n", ret);
3058                 goto fail_sysfs;
3059         }
3060         fs_info->num_tolerated_disk_barrier_failures =
3061                 btrfs_calc_num_tolerated_disk_barrier_failures(fs_info);
3062         if (fs_info->fs_devices->missing_devices >
3063              fs_info->num_tolerated_disk_barrier_failures &&
3064             !(sb->s_flags & MS_RDONLY)) {
3065                 pr_warn("BTRFS: missing devices(%llu) exceeds the limit(%d), writeable mount is not allowed\n",
3066                         fs_info->fs_devices->missing_devices,
3067                         fs_info->num_tolerated_disk_barrier_failures);
3068                 goto fail_sysfs;
3069         }
3070
3071         fs_info->cleaner_kthread = kthread_run(cleaner_kthread, tree_root,
3072                                                "btrfs-cleaner");
3073         if (IS_ERR(fs_info->cleaner_kthread))
3074                 goto fail_sysfs;
3075
3076         fs_info->transaction_kthread = kthread_run(transaction_kthread,
3077                                                    tree_root,
3078                                                    "btrfs-transaction");
3079         if (IS_ERR(fs_info->transaction_kthread))
3080                 goto fail_cleaner;
3081
3082         if (!btrfs_test_opt(tree_root, SSD) &&
3083             !btrfs_test_opt(tree_root, NOSSD) &&
3084             !fs_info->fs_devices->rotating) {
3085                 printk(KERN_INFO "BTRFS: detected SSD devices, enabling SSD "
3086                        "mode\n");
3087                 btrfs_set_opt(fs_info->mount_opt, SSD);
3088         }
3089
3090         /*
3091          * Mount does not set all options immediately, we can do it now and do
3092          * not have to wait for transaction commit
3093          */
3094         btrfs_apply_pending_changes(fs_info);
3095
3096 #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
3097         if (btrfs_test_opt(tree_root, CHECK_INTEGRITY)) {
3098                 ret = btrfsic_mount(tree_root, fs_devices,
3099                                     btrfs_test_opt(tree_root,
3100                                         CHECK_INTEGRITY_INCLUDING_EXTENT_DATA) ?
3101                                     1 : 0,
3102                                     fs_info->check_integrity_print_mask);
3103                 if (ret)
3104                         printk(KERN_WARNING "BTRFS: failed to initialize"
3105                                " integrity check module %s\n", sb->s_id);
3106         }
3107 #endif
3108         ret = btrfs_read_qgroup_config(fs_info);
3109         if (ret)
3110                 goto fail_trans_kthread;
3111
3112         /* do not make disk changes in broken FS */
3113         if (btrfs_super_log_root(disk_super) != 0) {
3114                 ret = btrfs_replay_log(fs_info, fs_devices);
3115                 if (ret) {
3116                         err = ret;
3117                         goto fail_qgroup;
3118                 }
3119         }
3120
3121         ret = btrfs_find_orphan_roots(tree_root);
3122         if (ret)
3123                 goto fail_qgroup;
3124
3125         if (!(sb->s_flags & MS_RDONLY)) {
3126                 ret = btrfs_cleanup_fs_roots(fs_info);
3127                 if (ret)
3128                         goto fail_qgroup;
3129
3130                 mutex_lock(&fs_info->cleaner_mutex);
3131                 ret = btrfs_recover_relocation(tree_root);
3132                 mutex_unlock(&fs_info->cleaner_mutex);
3133                 if (ret < 0) {
3134                         printk(KERN_WARNING
3135                                "BTRFS: failed to recover relocation\n");
3136                         err = -EINVAL;
3137                         goto fail_qgroup;
3138                 }
3139         }
3140
3141         location.objectid = BTRFS_FS_TREE_OBJECTID;
3142         location.type = BTRFS_ROOT_ITEM_KEY;
3143         location.offset = 0;
3144
3145         fs_info->fs_root = btrfs_read_fs_root_no_name(fs_info, &location);
3146         if (IS_ERR(fs_info->fs_root)) {
3147                 err = PTR_ERR(fs_info->fs_root);
3148                 goto fail_qgroup;
3149         }
3150
3151         if (sb->s_flags & MS_RDONLY)
3152                 return 0;
3153
3154         down_read(&fs_info->cleanup_work_sem);
3155         if ((ret = btrfs_orphan_cleanup(fs_info->fs_root)) ||
3156             (ret = btrfs_orphan_cleanup(fs_info->tree_root))) {
3157                 up_read(&fs_info->cleanup_work_sem);
3158                 close_ctree(tree_root);
3159                 return ret;
3160         }
3161         up_read(&fs_info->cleanup_work_sem);
3162
3163         ret = btrfs_resume_balance_async(fs_info);
3164         if (ret) {
3165                 printk(KERN_WARNING "BTRFS: failed to resume balance\n");
3166                 close_ctree(tree_root);
3167                 return ret;
3168         }
3169
3170         ret = btrfs_resume_dev_replace_async(fs_info);
3171         if (ret) {
3172                 pr_warn("BTRFS: failed to resume dev_replace\n");
3173                 close_ctree(tree_root);
3174                 return ret;
3175         }
3176
3177         btrfs_qgroup_rescan_resume(fs_info);
3178
3179         if (!fs_info->uuid_root) {
3180                 pr_info("BTRFS: creating UUID tree\n");
3181                 ret = btrfs_create_uuid_tree(fs_info);
3182                 if (ret) {
3183                         pr_warn("BTRFS: failed to create the UUID tree %d\n",
3184                                 ret);
3185                         close_ctree(tree_root);
3186                         return ret;
3187                 }
3188         } else if (btrfs_test_opt(tree_root, RESCAN_UUID_TREE) ||
3189                    fs_info->generation !=
3190                                 btrfs_super_uuid_tree_generation(disk_super)) {
3191                 pr_info("BTRFS: checking UUID tree\n");
3192                 ret = btrfs_check_uuid_tree(fs_info);
3193                 if (ret) {
3194                         pr_warn("BTRFS: failed to check the UUID tree %d\n",
3195                                 ret);
3196                         close_ctree(tree_root);
3197                         return ret;
3198                 }
3199         } else {
3200                 fs_info->update_uuid_tree_gen = 1;
3201         }
3202
3203         fs_info->open = 1;
3204
3205         return 0;
3206
3207 fail_qgroup:
3208         btrfs_free_qgroup_config(fs_info);
3209 fail_trans_kthread:
3210         kthread_stop(fs_info->transaction_kthread);
3211         btrfs_cleanup_transaction(fs_info->tree_root);
3212         btrfs_free_fs_roots(fs_info);
3213 fail_cleaner:
3214         kthread_stop(fs_info->cleaner_kthread);
3215
3216         /*
3217          * make sure we're done with the btree inode before we stop our
3218          * kthreads
3219          */
3220         filemap_write_and_wait(fs_info->btree_inode->i_mapping);
3221
3222 fail_sysfs:
3223         btrfs_sysfs_remove_mounted(fs_info);
3224
3225 fail_fsdev_sysfs:
3226         btrfs_sysfs_remove_fsid(fs_info->fs_devices);
3227
3228 fail_block_groups:
3229         btrfs_put_block_group_cache(fs_info);
3230         btrfs_free_block_groups(fs_info);
3231
3232 fail_tree_roots:
3233         free_root_pointers(fs_info, 1);
3234         invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
3235
3236 fail_sb_buffer:
3237         btrfs_stop_all_workers(fs_info);
3238 fail_alloc:
3239 fail_iput:
3240         btrfs_mapping_tree_free(&fs_info->mapping_tree);
3241
3242         iput(fs_info->btree_inode);
3243 fail_bio_counter:
3244         percpu_counter_destroy(&fs_info->bio_counter);
3245 fail_delalloc_bytes:
3246         percpu_counter_destroy(&fs_info->delalloc_bytes);
3247 fail_dirty_metadata_bytes:
3248         percpu_counter_destroy(&fs_info->dirty_metadata_bytes);
3249 fail_bdi:
3250         bdi_destroy(&fs_info->bdi);
3251 fail_srcu:
3252         cleanup_srcu_struct(&fs_info->subvol_srcu);
3253 fail:
3254         btrfs_free_stripe_hash_table(fs_info);
3255         btrfs_close_devices(fs_info->fs_devices);
3256         return err;
3257
3258 recovery_tree_root:
3259         if (!btrfs_test_opt(tree_root, RECOVERY))
3260                 goto fail_tree_roots;
3261
3262         free_root_pointers(fs_info, 0);
3263
3264         /* don't use the log in recovery mode, it won't be valid */
3265         btrfs_set_super_log_root(disk_super, 0);
3266
3267         /* we can't trust the free space cache either */
3268         btrfs_set_opt(fs_info->mount_opt, CLEAR_CACHE);
3269
3270         ret = next_root_backup(fs_info, fs_info->super_copy,
3271                                &num_backups_tried, &backup_index);
3272         if (ret == -1)
3273                 goto fail_block_groups;
3274         goto retry_root_backup;
3275 }
3276
3277 static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate)
3278 {
3279         if (uptodate) {
3280                 set_buffer_uptodate(bh);
3281         } else {
3282                 struct btrfs_device *device = (struct btrfs_device *)
3283                         bh->b_private;
3284
3285                 btrfs_warn_rl_in_rcu(device->dev_root->fs_info,
3286                                 "lost page write due to IO error on %s",
3287                                           rcu_str_deref(device->name));
3288                 /* note, we dont' set_buffer_write_io_error because we have
3289                  * our own ways of dealing with the IO errors
3290                  */
3291                 clear_buffer_uptodate(bh);
3292                 btrfs_dev_stat_inc_and_print(device, BTRFS_DEV_STAT_WRITE_ERRS);
3293         }
3294         unlock_buffer(bh);
3295         put_bh(bh);
3296 }
3297
3298 int btrfs_read_dev_one_super(struct block_device *bdev, int copy_num,
3299                         struct buffer_head **bh_ret)
3300 {
3301         struct buffer_head *bh;
3302         struct btrfs_super_block *super;
3303         u64 bytenr;
3304
3305         bytenr = btrfs_sb_offset(copy_num);
3306         if (bytenr + BTRFS_SUPER_INFO_SIZE >= i_size_read(bdev->bd_inode))
3307                 return -EINVAL;
3308
3309         bh = __bread(bdev, bytenr / 4096, BTRFS_SUPER_INFO_SIZE);
3310         /*
3311          * If we fail to read from the underlying devices, as of now
3312          * the best option we have is to mark it EIO.
3313          */
3314         if (!bh)
3315                 return -EIO;
3316
3317         super = (struct btrfs_super_block *)bh->b_data;
3318         if (btrfs_super_bytenr(super) != bytenr ||
3319                     btrfs_super_magic(super) != BTRFS_MAGIC) {
3320                 brelse(bh);
3321                 return -EINVAL;
3322         }
3323
3324         *bh_ret = bh;
3325         return 0;
3326 }
3327
3328
3329 struct buffer_head *btrfs_read_dev_super(struct block_device *bdev)
3330 {
3331         struct buffer_head *bh;
3332         struct buffer_head *latest = NULL;
3333         struct btrfs_super_block *super;
3334         int i;
3335         u64 transid = 0;
3336         int ret = -EINVAL;
3337
3338         /* we would like to check all the supers, but that would make
3339          * a btrfs mount succeed after a mkfs from a different FS.
3340          * So, we need to add a special mount option to scan for
3341          * later supers, using BTRFS_SUPER_MIRROR_MAX instead
3342          */
3343         for (i = 0; i < 1; i++) {
3344                 ret = btrfs_read_dev_one_super(bdev, i, &bh);
3345                 if (ret)
3346                         continue;
3347
3348                 super = (struct btrfs_super_block *)bh->b_data;
3349
3350                 if (!latest || btrfs_super_generation(super) > transid) {
3351                         brelse(latest);
3352                         latest = bh;
3353                         transid = btrfs_super_generation(super);
3354                 } else {
3355                         brelse(bh);
3356                 }
3357         }
3358
3359         if (!latest)
3360                 return ERR_PTR(ret);
3361
3362         return latest;
3363 }
3364
3365 /*
3366  * this should be called twice, once with wait == 0 and
3367  * once with wait == 1.  When wait == 0 is done, all the buffer heads
3368  * we write are pinned.
3369  *
3370  * They are released when wait == 1 is done.
3371  * max_mirrors must be the same for both runs, and it indicates how
3372  * many supers on this one device should be written.
3373  *
3374  * max_mirrors == 0 means to write them all.
3375  */
3376 static int write_dev_supers(struct btrfs_device *device,
3377                             struct btrfs_super_block *sb,
3378                             int do_barriers, int wait, int max_mirrors)
3379 {
3380         struct buffer_head *bh;
3381         int i;
3382         int ret;
3383         int errors = 0;
3384         u32 crc;
3385         u64 bytenr;
3386
3387         if (max_mirrors == 0)
3388                 max_mirrors = BTRFS_SUPER_MIRROR_MAX;
3389
3390         for (i = 0; i < max_mirrors; i++) {
3391                 bytenr = btrfs_sb_offset(i);
3392                 if (bytenr + BTRFS_SUPER_INFO_SIZE >=
3393                     device->commit_total_bytes)
3394                         break;
3395
3396                 if (wait) {
3397                         bh = __find_get_block(device->bdev, bytenr / 4096,
3398                                               BTRFS_SUPER_INFO_SIZE);
3399                         if (!bh) {
3400                                 errors++;
3401                                 continue;
3402                         }
3403                         wait_on_buffer(bh);
3404                         if (!buffer_uptodate(bh))
3405                                 errors++;
3406
3407                         /* drop our reference */
3408                         brelse(bh);
3409
3410                         /* drop the reference from the wait == 0 run */
3411                         brelse(bh);
3412                         continue;
3413                 } else {
3414                         btrfs_set_super_bytenr(sb, bytenr);
3415
3416                         crc = ~(u32)0;
3417                         crc = btrfs_csum_data((char *)sb +
3418                                               BTRFS_CSUM_SIZE, crc,
3419                                               BTRFS_SUPER_INFO_SIZE -
3420                                               BTRFS_CSUM_SIZE);
3421                         btrfs_csum_final(crc, sb->csum);
3422
3423                         /*
3424                          * one reference for us, and we leave it for the
3425                          * caller
3426                          */
3427                         bh = __getblk(device->bdev, bytenr / 4096,
3428                                       BTRFS_SUPER_INFO_SIZE);
3429                         if (!bh) {
3430                                 btrfs_err(device->dev_root->fs_info,
3431                                     "couldn't get super buffer head for bytenr %llu",
3432                                     bytenr);
3433                                 errors++;
3434                                 continue;
3435                         }
3436
3437                         memcpy(bh->b_data, sb, BTRFS_SUPER_INFO_SIZE);
3438
3439                         /* one reference for submit_bh */
3440                         get_bh(bh);
3441
3442                         set_buffer_uptodate(bh);
3443                         lock_buffer(bh);
3444                         bh->b_end_io = btrfs_end_buffer_write_sync;
3445                         bh->b_private = device;
3446                 }
3447
3448                 /*
3449                  * we fua the first super.  The others we allow
3450                  * to go down lazy.
3451                  */
3452                 if (i == 0)
3453                         ret = btrfsic_submit_bh(WRITE_FUA, bh);
3454                 else
3455                         ret = btrfsic_submit_bh(WRITE_SYNC, bh);
3456                 if (ret)
3457                         errors++;
3458         }
3459         return errors < i ? 0 : -1;
3460 }
3461
3462 /*
3463  * endio for the write_dev_flush, this will wake anyone waiting
3464  * for the barrier when it is done
3465  */
3466 static void btrfs_end_empty_barrier(struct bio *bio)
3467 {
3468         if (bio->bi_private)
3469                 complete(bio->bi_private);
3470         bio_put(bio);
3471 }
3472
3473 /*
3474  * trigger flushes for one the devices.  If you pass wait == 0, the flushes are
3475  * sent down.  With wait == 1, it waits for the previous flush.
3476  *
3477  * any device where the flush fails with eopnotsupp are flagged as not-barrier
3478  * capable
3479  */
3480 static int write_dev_flush(struct btrfs_device *device, int wait)
3481 {
3482         struct bio *bio;
3483         int ret = 0;
3484
3485         if (device->nobarriers)
3486                 return 0;
3487
3488         if (wait) {
3489                 bio = device->flush_bio;
3490                 if (!bio)
3491                         return 0;
3492
3493                 wait_for_completion(&device->flush_wait);
3494
3495                 if (bio->bi_error) {
3496                         ret = bio->bi_error;
3497                         btrfs_dev_stat_inc_and_print(device,
3498                                 BTRFS_DEV_STAT_FLUSH_ERRS);
3499                 }
3500
3501                 /* drop the reference from the wait == 0 run */
3502                 bio_put(bio);
3503                 device->flush_bio = NULL;
3504
3505                 return ret;
3506         }
3507
3508         /*
3509          * one reference for us, and we leave it for the
3510          * caller
3511          */
3512         device->flush_bio = NULL;
3513         bio = btrfs_io_bio_alloc(GFP_NOFS, 0);
3514         if (!bio)
3515                 return -ENOMEM;
3516
3517         bio->bi_end_io = btrfs_end_empty_barrier;
3518         bio->bi_bdev = device->bdev;
3519         init_completion(&device->flush_wait);
3520         bio->bi_private = &device->flush_wait;
3521         device->flush_bio = bio;
3522
3523         bio_get(bio);
3524         btrfsic_submit_bio(WRITE_FLUSH, bio);
3525
3526         return 0;
3527 }
3528
3529 /*
3530  * send an empty flush down to each device in parallel,
3531  * then wait for them
3532  */
3533 static int barrier_all_devices(struct btrfs_fs_info *info)
3534 {
3535         struct list_head *head;
3536         struct btrfs_device *dev;
3537         int errors_send = 0;
3538         int errors_wait = 0;
3539         int ret;
3540
3541         /* send down all the barriers */
3542         head = &info->fs_devices->devices;
3543         list_for_each_entry_rcu(dev, head, dev_list) {
3544                 if (dev->missing)
3545                         continue;
3546                 if (!dev->bdev) {
3547                         errors_send++;
3548                         continue;
3549                 }
3550                 if (!dev->in_fs_metadata || !dev->writeable)
3551                         continue;
3552
3553                 ret = write_dev_flush(dev, 0);
3554                 if (ret)
3555                         errors_send++;
3556         }
3557
3558         /* wait for all the barriers */
3559         list_for_each_entry_rcu(dev, head, dev_list) {
3560                 if (dev->missing)
3561                         continue;
3562                 if (!dev->bdev) {
3563                         errors_wait++;
3564                         continue;
3565                 }
3566                 if (!dev->in_fs_metadata || !dev->writeable)
3567                         continue;
3568
3569                 ret = write_dev_flush(dev, 1);
3570                 if (ret)
3571                         errors_wait++;
3572         }
3573         if (errors_send > info->num_tolerated_disk_barrier_failures ||
3574             errors_wait > info->num_tolerated_disk_barrier_failures)
3575                 return -EIO;
3576         return 0;
3577 }
3578
3579 int btrfs_get_num_tolerated_disk_barrier_failures(u64 flags)
3580 {
3581         int raid_type;
3582         int min_tolerated = INT_MAX;
3583
3584         if ((flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0 ||
3585             (flags & BTRFS_AVAIL_ALLOC_BIT_SINGLE))
3586                 min_tolerated = min(min_tolerated,
3587                                     btrfs_raid_array[BTRFS_RAID_SINGLE].
3588                                     tolerated_failures);
3589
3590         for (raid_type = 0; raid_type < BTRFS_NR_RAID_TYPES; raid_type++) {
3591                 if (raid_type == BTRFS_RAID_SINGLE)
3592                         continue;
3593                 if (!(flags & btrfs_raid_group[raid_type]))
3594                         continue;
3595                 min_tolerated = min(min_tolerated,
3596                                     btrfs_raid_array[raid_type].
3597                                     tolerated_failures);
3598         }
3599
3600         if (min_tolerated == INT_MAX) {
3601                 pr_warn("BTRFS: unknown raid flag: %llu\n", flags);
3602                 min_tolerated = 0;
3603         }
3604
3605         return min_tolerated;
3606 }
3607
3608 int btrfs_calc_num_tolerated_disk_barrier_failures(
3609         struct btrfs_fs_info *fs_info)
3610 {
3611         struct btrfs_ioctl_space_info space;
3612         struct btrfs_space_info *sinfo;
3613         u64 types[] = {BTRFS_BLOCK_GROUP_DATA,
3614                        BTRFS_BLOCK_GROUP_SYSTEM,
3615                        BTRFS_BLOCK_GROUP_METADATA,
3616                        BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA};
3617         int i;
3618         int c;
3619         int num_tolerated_disk_barrier_failures =
3620                 (int)fs_info->fs_devices->num_devices;
3621
3622         for (i = 0; i < ARRAY_SIZE(types); i++) {
3623                 struct btrfs_space_info *tmp;
3624
3625                 sinfo = NULL;
3626                 rcu_read_lock();
3627                 list_for_each_entry_rcu(tmp, &fs_info->space_info, list) {
3628                         if (tmp->flags == types[i]) {
3629                                 sinfo = tmp;
3630                                 break;
3631                         }
3632                 }
3633                 rcu_read_unlock();
3634
3635                 if (!sinfo)
3636                         continue;
3637
3638                 down_read(&sinfo->groups_sem);
3639                 for (c = 0; c < BTRFS_NR_RAID_TYPES; c++) {
3640                         u64 flags;
3641
3642                         if (list_empty(&sinfo->block_groups[c]))
3643                                 continue;
3644
3645                         btrfs_get_block_group_info(&sinfo->block_groups[c],
3646                                                    &space);
3647                         if (space.total_bytes == 0 || space.used_bytes == 0)
3648                                 continue;
3649                         flags = space.flags;
3650
3651                         num_tolerated_disk_barrier_failures = min(
3652                                 num_tolerated_disk_barrier_failures,
3653                                 btrfs_get_num_tolerated_disk_barrier_failures(
3654                                         flags));
3655                 }
3656                 up_read(&sinfo->groups_sem);
3657         }
3658
3659         return num_tolerated_disk_barrier_failures;
3660 }
3661
3662 static int write_all_supers(struct btrfs_root *root, int max_mirrors)
3663 {
3664         struct list_head *head;
3665         struct btrfs_device *dev;
3666         struct btrfs_super_block *sb;
3667         struct btrfs_dev_item *dev_item;
3668         int ret;
3669         int do_barriers;
3670         int max_errors;
3671         int total_errors = 0;
3672         u64 flags;
3673
3674         do_barriers = !btrfs_test_opt(root, NOBARRIER);
3675         backup_super_roots(root->fs_info);
3676
3677         sb = root->fs_info->super_for_commit;
3678         dev_item = &sb->dev_item;
3679
3680         mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
3681         head = &root->fs_info->fs_devices->devices;
3682         max_errors = btrfs_super_num_devices(root->fs_info->super_copy) - 1;
3683
3684         if (do_barriers) {
3685                 ret = barrier_all_devices(root->fs_info);
3686                 if (ret) {
3687                         mutex_unlock(
3688                                 &root->fs_info->fs_devices->device_list_mutex);
3689                         btrfs_std_error(root->fs_info, ret,
3690                                     "errors while submitting device barriers.");
3691                         return ret;
3692                 }
3693         }
3694
3695         list_for_each_entry_rcu(dev, head, dev_list) {
3696                 if (!dev->bdev) {
3697                         total_errors++;
3698                         continue;
3699                 }
3700                 if (!dev->in_fs_metadata || !dev->writeable)
3701                         continue;
3702
3703                 btrfs_set_stack_device_generation(dev_item, 0);
3704                 btrfs_set_stack_device_type(dev_item, dev->type);
3705                 btrfs_set_stack_device_id(dev_item, dev->devid);
3706                 btrfs_set_stack_device_total_bytes(dev_item,
3707                                                    dev->commit_total_bytes);
3708                 btrfs_set_stack_device_bytes_used(dev_item,
3709                                                   dev->commit_bytes_used);
3710                 btrfs_set_stack_device_io_align(dev_item, dev->io_align);
3711                 btrfs_set_stack_device_io_width(dev_item, dev->io_width);
3712                 btrfs_set_stack_device_sector_size(dev_item, dev->sector_size);
3713                 memcpy(dev_item->uuid, dev->uuid, BTRFS_UUID_SIZE);
3714                 memcpy(dev_item->fsid, dev->fs_devices->fsid, BTRFS_UUID_SIZE);
3715
3716                 flags = btrfs_super_flags(sb);
3717                 btrfs_set_super_flags(sb, flags | BTRFS_HEADER_FLAG_WRITTEN);
3718
3719                 ret = write_dev_supers(dev, sb, do_barriers, 0, max_mirrors);
3720                 if (ret)
3721                         total_errors++;
3722         }
3723         if (total_errors > max_errors) {
3724                 btrfs_err(root->fs_info, "%d errors while writing supers",
3725                        total_errors);
3726                 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
3727
3728                 /* FUA is masked off if unsupported and can't be the reason */
3729                 btrfs_std_error(root->fs_info, -EIO,
3730                             "%d errors while writing supers", total_errors);
3731                 return -EIO;
3732         }
3733
3734         total_errors = 0;
3735         list_for_each_entry_rcu(dev, head, dev_list) {
3736                 if (!dev->bdev)
3737                         continue;
3738                 if (!dev->in_fs_metadata || !dev->writeable)
3739                         continue;
3740
3741                 ret = write_dev_supers(dev, sb, do_barriers, 1, max_mirrors);
3742                 if (ret)
3743                         total_errors++;
3744         }
3745         mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
3746         if (total_errors > max_errors) {
3747                 btrfs_std_error(root->fs_info, -EIO,
3748                             "%d errors while writing supers", total_errors);
3749                 return -EIO;
3750         }
3751         return 0;
3752 }
3753
/*
 * Write the super blocks of the filesystem @root belongs to.  This is a
 * thin wrapper around write_all_supers(); @trans is not used here.
 */
int write_ctree_super(struct btrfs_trans_handle *trans,
		      struct btrfs_root *root, int max_mirrors)
{
	int ret = write_all_supers(root, max_mirrors);

	return ret;
}
3759
3760 /* Drop a fs root from the radix tree and free it. */
3761 void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info,
3762                                   struct btrfs_root *root)
3763 {
3764         spin_lock(&fs_info->fs_roots_radix_lock);
3765         radix_tree_delete(&fs_info->fs_roots_radix,
3766                           (unsigned long)root->root_key.objectid);
3767         spin_unlock(&fs_info->fs_roots_radix_lock);
3768
3769         if (btrfs_root_refs(&root->root_item) == 0)
3770                 synchronize_srcu(&fs_info->subvol_srcu);
3771
3772         if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
3773                 btrfs_free_log(NULL, root);
3774
3775         if (root->free_ino_pinned)
3776                 __btrfs_remove_free_space_cache(root->free_ino_pinned);
3777         if (root->free_ino_ctl)
3778                 __btrfs_remove_free_space_cache(root->free_ino_ctl);
3779         free_fs_root(root);
3780 }
3781
3782 static void free_fs_root(struct btrfs_root *root)
3783 {
3784         iput(root->ino_cache_inode);
3785         WARN_ON(!RB_EMPTY_ROOT(&root->inode_tree));
3786         btrfs_free_block_rsv(root, root->orphan_block_rsv);
3787         root->orphan_block_rsv = NULL;
3788         if (root->anon_dev)
3789                 free_anon_bdev(root->anon_dev);
3790         if (root->subv_writers)
3791                 btrfs_free_subvolume_writers(root->subv_writers);
3792         free_extent_buffer(root->node);
3793         free_extent_buffer(root->commit_root);
3794         kfree(root->free_ino_ctl);
3795         kfree(root->free_ino_pinned);
3796         kfree(root->name);
3797         btrfs_put_fs_root(root);
3798 }
3799
/*
 * Non-static entry point for free_fs_root(): releases the resources
 * attached to @root and drops a reference on it.
 */
void btrfs_free_fs_root(struct btrfs_root *root)
{
	free_fs_root(root);
}
3804
3805 int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info)
3806 {
3807         u64 root_objectid = 0;
3808         struct btrfs_root *gang[8];
3809         int i = 0;
3810         int err = 0;
3811         unsigned int ret = 0;
3812         int index;
3813
3814         while (1) {
3815                 index = srcu_read_lock(&fs_info->subvol_srcu);
3816                 ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix,
3817                                              (void **)gang, root_objectid,
3818                                              ARRAY_SIZE(gang));
3819                 if (!ret) {
3820                         srcu_read_unlock(&fs_info->subvol_srcu, index);
3821                         break;
3822                 }
3823                 root_objectid = gang[ret - 1]->root_key.objectid + 1;
3824
3825                 for (i = 0; i < ret; i++) {
3826                         /* Avoid to grab roots in dead_roots */
3827                         if (btrfs_root_refs(&gang[i]->root_item) == 0) {
3828                                 gang[i] = NULL;
3829                                 continue;
3830                         }
3831                         /* grab all the search result for later use */
3832                         gang[i] = btrfs_grab_fs_root(gang[i]);
3833                 }
3834                 srcu_read_unlock(&fs_info->subvol_srcu, index);
3835
3836                 for (i = 0; i < ret; i++) {
3837                         if (!gang[i])
3838                                 continue;
3839                         root_objectid = gang[i]->root_key.objectid;
3840                         err = btrfs_orphan_cleanup(gang[i]);
3841                         if (err)
3842                                 break;
3843                         btrfs_put_fs_root(gang[i]);
3844                 }
3845                 root_objectid++;
3846         }
3847
3848         /* release the uncleaned roots due to error */
3849         for (; i < ret; i++) {
3850                 if (gang[i])
3851                         btrfs_put_fs_root(gang[i]);
3852         }
3853         return err;
3854 }
3855
3856 int btrfs_commit_super(struct btrfs_root *root)
3857 {
3858         struct btrfs_trans_handle *trans;
3859
3860         mutex_lock(&root->fs_info->cleaner_mutex);
3861         btrfs_run_delayed_iputs(root);
3862         mutex_unlock(&root->fs_info->cleaner_mutex);
3863         wake_up_process(root->fs_info->cleaner_kthread);
3864
3865         /* wait until ongoing cleanup work done */
3866         down_write(&root->fs_info->cleanup_work_sem);
3867         up_write(&root->fs_info->cleanup_work_sem);
3868
3869         trans = btrfs_join_transaction(root);
3870         if (IS_ERR(trans))
3871                 return PTR_ERR(trans);
3872         return btrfs_commit_transaction(trans, root);
3873 }
3874
3875 void close_ctree(struct btrfs_root *root)
3876 {
3877         struct btrfs_fs_info *fs_info = root->fs_info;
3878         int ret;
3879
3880         fs_info->closing = 1;
3881         smp_mb();
3882
3883         /* wait for the qgroup rescan worker to stop */
3884         btrfs_qgroup_wait_for_completion(fs_info, false);
3885
3886         /* wait for the uuid_scan task to finish */
3887         down(&fs_info->uuid_tree_rescan_sem);
3888         /* avoid complains from lockdep et al., set sem back to initial state */
3889         up(&fs_info->uuid_tree_rescan_sem);
3890
3891         /* pause restriper - we want to resume on mount */
3892         btrfs_pause_balance(fs_info);
3893
3894         btrfs_dev_replace_suspend_for_unmount(fs_info);
3895
3896         btrfs_scrub_cancel(fs_info);
3897
3898         /* wait for any defraggers to finish */
3899         wait_event(fs_info->transaction_wait,
3900                    (atomic_read(&fs_info->defrag_running) == 0));
3901
3902         /* clear out the rbtree of defraggable inodes */
3903         btrfs_cleanup_defrag_inodes(fs_info);
3904
3905         cancel_work_sync(&fs_info->async_reclaim_work);
3906
3907         if (!(fs_info->sb->s_flags & MS_RDONLY)) {
3908                 /*
3909                  * If the cleaner thread is stopped and there are
3910                  * block groups queued for removal, the deletion will be
3911                  * skipped when we quit the cleaner thread.
3912                  */
3913                 btrfs_delete_unused_bgs(root->fs_info);
3914
3915                 ret = btrfs_commit_super(root);
3916                 if (ret)
3917                         btrfs_err(fs_info, "commit super ret %d", ret);
3918         }
3919
3920         if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
3921                 btrfs_error_commit_super(root);
3922
3923         kthread_stop(fs_info->transaction_kthread);
3924         kthread_stop(fs_info->cleaner_kthread);
3925
3926         fs_info->closing = 2;
3927         smp_mb();
3928
3929         btrfs_free_qgroup_config(fs_info);
3930
3931         if (percpu_counter_sum(&fs_info->delalloc_bytes)) {
3932                 btrfs_info(fs_info, "at unmount delalloc count %lld",
3933                        percpu_counter_sum(&fs_info->delalloc_bytes));
3934         }
3935
3936         btrfs_sysfs_remove_mounted(fs_info);
3937         btrfs_sysfs_remove_fsid(fs_info->fs_devices);
3938
3939         btrfs_free_fs_roots(fs_info);
3940
3941         btrfs_put_block_group_cache(fs_info);
3942
3943         btrfs_free_block_groups(fs_info);
3944
3945         /*
3946          * we must make sure there is not any read request to
3947          * submit after we stopping all workers.
3948          */
3949         invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
3950         btrfs_stop_all_workers(fs_info);
3951
3952         fs_info->open = 0;
3953         free_root_pointers(fs_info, 1);
3954
3955         iput(fs_info->btree_inode);
3956
3957 #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
3958         if (btrfs_test_opt(root, CHECK_INTEGRITY))
3959                 btrfsic_unmount(root, fs_info->fs_devices);
3960 #endif
3961
3962         btrfs_close_devices(fs_info->fs_devices);
3963         btrfs_mapping_tree_free(&fs_info->mapping_tree);
3964
3965         percpu_counter_destroy(&fs_info->dirty_metadata_bytes);
3966         percpu_counter_destroy(&fs_info->delalloc_bytes);
3967         percpu_counter_destroy(&fs_info->bio_counter);
3968         bdi_destroy(&fs_info->bdi);
3969         cleanup_srcu_struct(&fs_info->subvol_srcu);
3970
3971         btrfs_free_stripe_hash_table(fs_info);
3972
3973         __btrfs_free_block_rsv(root->orphan_block_rsv);
3974         root->orphan_block_rsv = NULL;
3975
3976         lock_chunks(root);
3977         while (!list_empty(&fs_info->pinned_chunks)) {
3978                 struct extent_map *em;
3979
3980                 em = list_first_entry(&fs_info->pinned_chunks,
3981                                       struct extent_map, list);
3982                 list_del_init(&em->list);
3983                 free_extent_map(em);
3984         }
3985         unlock_chunks(root);
3986 }
3987
3988 int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid,
3989                           int atomic)
3990 {
3991         int ret;
3992         struct inode *btree_inode = buf->pages[0]->mapping->host;
3993
3994         ret = extent_buffer_uptodate(buf);
3995         if (!ret)
3996                 return ret;
3997
3998         ret = verify_parent_transid(&BTRFS_I(btree_inode)->io_tree, buf,
3999                                     parent_transid, atomic);
4000         if (ret == -EAGAIN)
4001                 return ret;
4002         return !ret;
4003 }
4004
/*
 * Thin wrapper: hand @buf to set_extent_buffer_uptodate() and pass its
 * return value back unchanged.
 */
int btrfs_set_buffer_uptodate(struct extent_buffer *buf)
{
	int ret = set_extent_buffer_uptodate(buf);

	return ret;
}
4009
4010 void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
4011 {
4012         struct btrfs_root *root;
4013         u64 transid = btrfs_header_generation(buf);
4014         int was_dirty;
4015
4016 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
4017         /*
4018          * This is a fast path so only do this check if we have sanity tests
4019          * enabled.  Normal people shouldn't be marking dummy buffers as dirty
4020          * outside of the sanity tests.
4021          */
4022         if (unlikely(test_bit(EXTENT_BUFFER_DUMMY, &buf->bflags)))
4023                 return;
4024 #endif
4025         root = BTRFS_I(buf->pages[0]->mapping->host)->root;
4026         btrfs_assert_tree_locked(buf);
4027         if (transid != root->fs_info->generation)
4028                 WARN(1, KERN_CRIT "btrfs transid mismatch buffer %llu, "
4029                        "found %llu running %llu\n",
4030                         buf->start, transid, root->fs_info->generation);
4031         was_dirty = set_extent_buffer_dirty(buf);
4032         if (!was_dirty)
4033                 __percpu_counter_add(&root->fs_info->dirty_metadata_bytes,
4034                                      buf->len,
4035                                      root->fs_info->dirty_metadata_batch);
4036 #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
4037         if (btrfs_header_level(buf) == 0 && check_leaf(root, buf)) {
4038                 btrfs_print_leaf(root, buf);
4039                 ASSERT(0);
4040         }
4041 #endif
4042 }
4043
4044 static void __btrfs_btree_balance_dirty(struct btrfs_root *root,
4045                                         int flush_delayed)
4046 {
4047         /*
4048          * looks as though older kernels can get into trouble with
4049          * this code, they end up stuck in balance_dirty_pages forever
4050          */
4051         int ret;
4052
4053         if (current->flags & PF_MEMALLOC)
4054                 return;
4055
4056         if (flush_delayed)
4057                 btrfs_balance_delayed_items(root);
4058
4059         ret = __percpu_counter_compare(&root->fs_info->dirty_metadata_bytes,
4060                                      BTRFS_DIRTY_METADATA_THRESH,
4061                                      root->fs_info->dirty_metadata_batch);
4062         if (ret > 0) {
4063                 balance_dirty_pages_ratelimited(
4064                                    root->fs_info->btree_inode->i_mapping);
4065         }
4066         return;
4067 }
4068
/* Balance dirty btree metadata for @root, balancing delayed items first. */
void btrfs_btree_balance_dirty(struct btrfs_root *root)
{
	const int flush_delayed = 1;

	__btrfs_btree_balance_dirty(root, flush_delayed);
}
4073
/* Balance dirty btree metadata for @root without balancing delayed items. */
void btrfs_btree_balance_dirty_nodelay(struct btrfs_root *root)
{
	const int flush_delayed = 0;

	__btrfs_btree_balance_dirty(root, flush_delayed);
}
4078
4079 int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid)
4080 {
4081         struct btrfs_root *root = BTRFS_I(buf->pages[0]->mapping->host)->root;
4082         return btree_read_extent_buffer_pages(root, buf, 0, parent_transid);
4083 }
4084
4085 static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
4086                               int read_only)
4087 {
4088         struct btrfs_super_block *sb = fs_info->super_copy;
4089         int ret = 0;
4090
4091         if (btrfs_super_root_level(sb) >= BTRFS_MAX_LEVEL) {
4092                 printk(KERN_ERR "BTRFS: tree_root level too big: %d >= %d\n",
4093                                 btrfs_super_root_level(sb), BTRFS_MAX_LEVEL);
4094                 ret = -EINVAL;
4095         }
4096         if (btrfs_super_chunk_root_level(sb) >= BTRFS_MAX_LEVEL) {
4097                 printk(KERN_ERR "BTRFS: chunk_root level too big: %d >= %d\n",
4098                                 btrfs_super_chunk_root_level(sb), BTRFS_MAX_LEVEL);
4099                 ret = -EINVAL;
4100         }
4101         if (btrfs_super_log_root_level(sb) >= BTRFS_MAX_LEVEL) {
4102                 printk(KERN_ERR "BTRFS: log_root level too big: %d >= %d\n",
4103                                 btrfs_super_log_root_level(sb), BTRFS_MAX_LEVEL);
4104                 ret = -EINVAL;
4105         }
4106
4107         /*
4108          * The common minimum, we don't know if we can trust the nodesize/sectorsize
4109          * items yet, they'll be verified later. Issue just a warning.
4110          */
4111         if (!IS_ALIGNED(btrfs_super_root(sb), 4096))
4112                 printk(KERN_WARNING "BTRFS: tree_root block unaligned: %llu\n",
4113                                 btrfs_super_root(sb));
4114         if (!IS_ALIGNED(btrfs_super_chunk_root(sb), 4096))
4115                 printk(KERN_WARNING "BTRFS: chunk_root block unaligned: %llu\n",
4116                                 btrfs_super_chunk_root(sb));
4117         if (!IS_ALIGNED(btrfs_super_log_root(sb), 4096))
4118                 printk(KERN_WARNING "BTRFS: log_root block unaligned: %llu\n",
4119                                 btrfs_super_log_root(sb));
4120
4121         /*
4122          * Check the lower bound, the alignment and other constraints are
4123          * checked later.
4124          */
4125         if (btrfs_super_nodesize(sb) < 4096) {
4126                 printk(KERN_ERR "BTRFS: nodesize too small: %u < 4096\n",
4127                                 btrfs_super_nodesize(sb));
4128                 ret = -EINVAL;
4129         }
4130         if (btrfs_super_sectorsize(sb) < 4096) {
4131                 printk(KERN_ERR "BTRFS: sectorsize too small: %u < 4096\n",
4132                                 btrfs_super_sectorsize(sb));
4133                 ret = -EINVAL;
4134         }
4135
4136         if (memcmp(fs_info->fsid, sb->dev_item.fsid, BTRFS_UUID_SIZE) != 0) {
4137                 printk(KERN_ERR "BTRFS: dev_item UUID does not match fsid: %pU != %pU\n",
4138                                 fs_info->fsid, sb->dev_item.fsid);
4139                 ret = -EINVAL;
4140         }
4141
4142         /*
4143          * Hint to catch really bogus numbers, bitflips or so, more exact checks are
4144          * done later
4145          */
4146         if (btrfs_super_num_devices(sb) > (1UL << 31))
4147                 printk(KERN_WARNING "BTRFS: suspicious number of devices: %llu\n",
4148                                 btrfs_super_num_devices(sb));
4149         if (btrfs_super_num_devices(sb) == 0) {
4150                 printk(KERN_ERR "BTRFS: number of devices is 0\n");
4151                 ret = -EINVAL;
4152         }
4153
4154         if (btrfs_super_bytenr(sb) != BTRFS_SUPER_INFO_OFFSET) {
4155                 printk(KERN_ERR "BTRFS: super offset mismatch %llu != %u\n",
4156                                 btrfs_super_bytenr(sb), BTRFS_SUPER_INFO_OFFSET);
4157                 ret = -EINVAL;
4158         }
4159
4160         /*
4161          * Obvious sys_chunk_array corruptions, it must hold at least one key
4162          * and one chunk
4163          */
4164         if (btrfs_super_sys_array_size(sb) > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) {
4165                 printk(KERN_ERR "BTRFS: system chunk array too big %u > %u\n",
4166                                 btrfs_super_sys_array_size(sb),
4167                                 BTRFS_SYSTEM_CHUNK_ARRAY_SIZE);
4168                 ret = -EINVAL;
4169         }
4170         if (btrfs_super_sys_array_size(sb) < sizeof(struct btrfs_disk_key)
4171                         + sizeof(struct btrfs_chunk)) {
4172                 printk(KERN_ERR "BTRFS: system chunk array too small %u < %zu\n",
4173                                 btrfs_super_sys_array_size(sb),
4174                                 sizeof(struct btrfs_disk_key)
4175                                 + sizeof(struct btrfs_chunk));
4176                 ret = -EINVAL;
4177         }
4178
4179         /*
4180          * The generation is a global counter, we'll trust it more than the others
4181          * but it's still possible that it's the one that's wrong.
4182          */
4183         if (btrfs_super_generation(sb) < btrfs_super_chunk_root_generation(sb))
4184                 printk(KERN_WARNING
4185                         "BTRFS: suspicious: generation < chunk_root_generation: %llu < %llu\n",
4186                         btrfs_super_generation(sb), btrfs_super_chunk_root_generation(sb));
4187         if (btrfs_super_generation(sb) < btrfs_super_cache_generation(sb)
4188             && btrfs_super_cache_generation(sb) != (u64)-1)
4189                 printk(KERN_WARNING
4190                         "BTRFS: suspicious: generation < cache_generation: %llu < %llu\n",
4191                         btrfs_super_generation(sb), btrfs_super_cache_generation(sb));
4192
4193         return ret;
4194 }
4195
4196 static void btrfs_error_commit_super(struct btrfs_root *root)
4197 {
4198         mutex_lock(&root->fs_info->cleaner_mutex);
4199         btrfs_run_delayed_iputs(root);
4200         mutex_unlock(&root->fs_info->cleaner_mutex);
4201
4202         down_write(&root->fs_info->cleanup_work_sem);
4203         up_write(&root->fs_info->cleanup_work_sem);
4204
4205         /* cleanup FS via transaction */
4206         btrfs_cleanup_transaction(root);
4207 }
4208
4209 static void btrfs_destroy_ordered_extents(struct btrfs_root *root)
4210 {
4211         struct btrfs_ordered_extent *ordered;
4212
4213         spin_lock(&root->ordered_extent_lock);
4214         /*
4215          * This will just short circuit the ordered completion stuff which will
4216          * make sure the ordered extent gets properly cleaned up.
4217          */
4218         list_for_each_entry(ordered, &root->ordered_extents,
4219                             root_extent_list)
4220                 set_bit(BTRFS_ORDERED_IOERR, &ordered->flags);
4221         spin_unlock(&root->ordered_extent_lock);
4222 }
4223
4224 static void btrfs_destroy_all_ordered_extents(struct btrfs_fs_info *fs_info)
4225 {
4226         struct btrfs_root *root;
4227         struct list_head splice;
4228
4229         INIT_LIST_HEAD(&splice);
4230
4231         spin_lock(&fs_info->ordered_root_lock);
4232         list_splice_init(&fs_info->ordered_roots, &splice);
4233         while (!list_empty(&splice)) {
4234                 root = list_first_entry(&splice, struct btrfs_root,
4235                                         ordered_root);
4236                 list_move_tail(&root->ordered_root,
4237                                &fs_info->ordered_roots);
4238
4239                 spin_unlock(&fs_info->ordered_root_lock);
4240                 btrfs_destroy_ordered_extents(root);
4241
4242                 cond_resched();
4243                 spin_lock(&fs_info->ordered_root_lock);
4244         }
4245         spin_unlock(&fs_info->ordered_root_lock);
4246 }
4247
4248 static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
4249                                       struct btrfs_root *root)
4250 {
4251         struct rb_node *node;
4252         struct btrfs_delayed_ref_root *delayed_refs;
4253         struct btrfs_delayed_ref_node *ref;
4254         int ret = 0;
4255
4256         delayed_refs = &trans->delayed_refs;
4257
4258         spin_lock(&delayed_refs->lock);
4259         if (atomic_read(&delayed_refs->num_entries) == 0) {
4260                 spin_unlock(&delayed_refs->lock);
4261                 btrfs_info(root->fs_info, "delayed_refs has NO entry");
4262                 return ret;
4263         }
4264
4265         while ((node = rb_first(&delayed_refs->href_root)) != NULL) {
4266                 struct btrfs_delayed_ref_head *head;
4267                 struct btrfs_delayed_ref_node *tmp;
4268                 bool pin_bytes = false;
4269
4270                 head = rb_entry(node, struct btrfs_delayed_ref_head,
4271                                 href_node);
4272                 if (!mutex_trylock(&head->mutex)) {
4273                         atomic_inc(&head->node.refs);
4274                         spin_unlock(&delayed_refs->lock);
4275
4276                         mutex_lock(&head->mutex);
4277                         mutex_unlock(&head->mutex);
4278                         btrfs_put_delayed_ref(&head->node);
4279                         spin_lock(&delayed_refs->lock);
4280                         continue;
4281                 }
4282                 spin_lock(&head->lock);
4283                 list_for_each_entry_safe_reverse(ref, tmp, &head->ref_list,
4284                                                  list) {
4285                         ref->in_tree = 0;
4286                         list_del(&ref->list);
4287                         atomic_dec(&delayed_refs->num_entries);
4288                         btrfs_put_delayed_ref(ref);
4289                 }
4290                 if (head->must_insert_reserved)
4291                         pin_bytes = true;
4292                 btrfs_free_delayed_extent_op(head->extent_op);
4293                 delayed_refs->num_heads--;
4294                 if (head->processing == 0)
4295                         delayed_refs->num_heads_ready--;
4296                 atomic_dec(&delayed_refs->num_entries);
4297                 head->node.in_tree = 0;
4298                 rb_erase(&head->href_node, &delayed_refs->href_root);
4299                 spin_unlock(&head->lock);
4300                 spin_unlock(&delayed_refs->lock);
4301                 mutex_unlock(&head->mutex);
4302
4303                 if (pin_bytes)
4304                         btrfs_pin_extent(root, head->node.bytenr,
4305                                          head->node.num_bytes, 1);
4306                 btrfs_put_delayed_ref(&head->node);
4307                 cond_resched();
4308                 spin_lock(&delayed_refs->lock);
4309         }
4310
4311         spin_unlock(&delayed_refs->lock);
4312
4313         return ret;
4314 }
4315
4316 static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root)
4317 {
4318         struct btrfs_inode *btrfs_inode;
4319         struct list_head splice;
4320
4321         INIT_LIST_HEAD(&splice);
4322
4323         spin_lock(&root->delalloc_lock);
4324         list_splice_init(&root->delalloc_inodes, &splice);
4325
4326         while (!list_empty(&splice)) {
4327                 btrfs_inode = list_first_entry(&splice, struct btrfs_inode,
4328                                                delalloc_inodes);
4329
4330                 list_del_init(&btrfs_inode->delalloc_inodes);
4331                 clear_bit(BTRFS_INODE_IN_DELALLOC_LIST,
4332                           &btrfs_inode->runtime_flags);
4333                 spin_unlock(&root->delalloc_lock);
4334
4335                 btrfs_invalidate_inodes(btrfs_inode->root);
4336
4337                 spin_lock(&root->delalloc_lock);
4338         }
4339
4340         spin_unlock(&root->delalloc_lock);
4341 }
4342
4343 static void btrfs_destroy_all_delalloc_inodes(struct btrfs_fs_info *fs_info)
4344 {
4345         struct btrfs_root *root;
4346         struct list_head splice;
4347
4348         INIT_LIST_HEAD(&splice);
4349
4350         spin_lock(&fs_info->delalloc_root_lock);
4351         list_splice_init(&fs_info->delalloc_roots, &splice);
4352         while (!list_empty(&splice)) {
4353                 root = list_first_entry(&splice, struct btrfs_root,
4354                                          delalloc_root);
4355                 list_del_init(&root->delalloc_root);
4356                 root = btrfs_grab_fs_root(root);
4357                 BUG_ON(!root);
4358                 spin_unlock(&fs_info->delalloc_root_lock);
4359
4360                 btrfs_destroy_delalloc_inodes(root);
4361                 btrfs_put_fs_root(root);
4362
4363                 spin_lock(&fs_info->delalloc_root_lock);
4364         }
4365         spin_unlock(&fs_info->delalloc_root_lock);
4366 }
4367
4368 static int btrfs_destroy_marked_extents(struct btrfs_root *root,
4369                                         struct extent_io_tree *dirty_pages,
4370                                         int mark)
4371 {
4372         int ret;
4373         struct extent_buffer *eb;
4374         u64 start = 0;
4375         u64 end;
4376
4377         while (1) {
4378                 ret = find_first_extent_bit(dirty_pages, start, &start, &end,
4379                                             mark, NULL);
4380                 if (ret)
4381                         break;
4382
4383                 clear_extent_bits(dirty_pages, start, end, mark, GFP_NOFS);
4384                 while (start <= end) {
4385                         eb = btrfs_find_tree_block(root->fs_info, start);
4386                         start += root->nodesize;
4387                         if (!eb)
4388                                 continue;
4389                         wait_on_extent_buffer_writeback(eb);
4390
4391                         if (test_and_clear_bit(EXTENT_BUFFER_DIRTY,
4392                                                &eb->bflags))
4393                                 clear_extent_buffer_dirty(eb);
4394                         free_extent_buffer_stale(eb);
4395                 }
4396         }
4397
4398         return ret;
4399 }
4400
4401 static int btrfs_destroy_pinned_extent(struct btrfs_root *root,
4402                                        struct extent_io_tree *pinned_extents)
4403 {
4404         struct btrfs_fs_info *fs_info = root->fs_info;
4405         struct extent_io_tree *unpin;
4406         u64 start;
4407         u64 end;
4408         int ret;
4409         bool loop = true;
4410
4411         unpin = pinned_extents;
4412 again:
4413         while (1) {
4414                 /*
4415                  * The btrfs_finish_extent_commit() may get the same range as
4416                  * ours between find_first_extent_bit and clear_extent_dirty.
4417                  * Hence, hold the unused_bg_unpin_mutex to avoid double unpin
4418                  * the same extent range.
4419                  */
4420                 mutex_lock(&fs_info->unused_bg_unpin_mutex);
4421                 ret = find_first_extent_bit(unpin, 0, &start, &end,
4422                                             EXTENT_DIRTY, NULL);
4423                 if (ret) {
4424                         mutex_unlock(&fs_info->unused_bg_unpin_mutex);
4425                         break;
4426                 }
4427
4428                 clear_extent_dirty(unpin, start, end, GFP_NOFS);
4429                 btrfs_error_unpin_extent_range(root, start, end);
4430                 mutex_unlock(&fs_info->unused_bg_unpin_mutex);
4431                 cond_resched();
4432         }
4433
4434         if (loop) {
4435                 if (unpin == &fs_info->freed_extents[0])
4436                         unpin = &fs_info->freed_extents[1];
4437                 else
4438                         unpin = &fs_info->freed_extents[0];
4439                 loop = false;
4440                 goto again;
4441         }
4442
4443         return 0;
4444 }
4445
4446 void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans,
4447                                    struct btrfs_root *root)
4448 {
4449         btrfs_destroy_delayed_refs(cur_trans, root);
4450
4451         cur_trans->state = TRANS_STATE_COMMIT_START;
4452         wake_up(&root->fs_info->transaction_blocked_wait);
4453
4454         cur_trans->state = TRANS_STATE_UNBLOCKED;
4455         wake_up(&root->fs_info->transaction_wait);
4456
4457         btrfs_destroy_delayed_inodes(root);
4458         btrfs_assert_delayed_root_empty(root);
4459
4460         btrfs_destroy_marked_extents(root, &cur_trans->dirty_pages,
4461                                      EXTENT_DIRTY);
4462         btrfs_destroy_pinned_extent(root,
4463                                     root->fs_info->pinned_extents);
4464
4465         cur_trans->state =TRANS_STATE_COMPLETED;
4466         wake_up(&cur_trans->commit_wait);
4467
4468         /*
4469         memset(cur_trans, 0, sizeof(*cur_trans));
4470         kmem_cache_free(btrfs_transaction_cachep, cur_trans);
4471         */
4472 }
4473
4474 static int btrfs_cleanup_transaction(struct btrfs_root *root)
4475 {
4476         struct btrfs_transaction *t;
4477
4478         mutex_lock(&root->fs_info->transaction_kthread_mutex);
4479
4480         spin_lock(&root->fs_info->trans_lock);
4481         while (!list_empty(&root->fs_info->trans_list)) {
4482                 t = list_first_entry(&root->fs_info->trans_list,
4483                                      struct btrfs_transaction, list);
4484                 if (t->state >= TRANS_STATE_COMMIT_START) {
4485                         atomic_inc(&t->use_count);
4486                         spin_unlock(&root->fs_info->trans_lock);
4487                         btrfs_wait_for_commit(root, t->transid);
4488                         btrfs_put_transaction(t);
4489                         spin_lock(&root->fs_info->trans_lock);
4490                         continue;
4491                 }
4492                 if (t == root->fs_info->running_transaction) {
4493                         t->state = TRANS_STATE_COMMIT_DOING;
4494                         spin_unlock(&root->fs_info->trans_lock);
4495                         /*
4496                          * We wait for 0 num_writers since we don't hold a trans
4497                          * handle open currently for this transaction.
4498                          */
4499                         wait_event(t->writer_wait,
4500                                    atomic_read(&t->num_writers) == 0);
4501                 } else {
4502                         spin_unlock(&root->fs_info->trans_lock);
4503                 }
4504                 btrfs_cleanup_one_transaction(t, root);
4505
4506                 spin_lock(&root->fs_info->trans_lock);
4507                 if (t == root->fs_info->running_transaction)
4508                         root->fs_info->running_transaction = NULL;
4509                 list_del_init(&t->list);
4510                 spin_unlock(&root->fs_info->trans_lock);
4511
4512                 btrfs_put_transaction(t);
4513                 trace_btrfs_transaction_commit(root);
4514                 spin_lock(&root->fs_info->trans_lock);
4515         }
4516         spin_unlock(&root->fs_info->trans_lock);
4517         btrfs_destroy_all_ordered_extents(root->fs_info);
4518         btrfs_destroy_delayed_inodes(root);
4519         btrfs_assert_delayed_root_empty(root);
4520         btrfs_destroy_pinned_extent(root, root->fs_info->pinned_extents);
4521         btrfs_destroy_all_delalloc_inodes(root->fs_info);
4522         mutex_unlock(&root->fs_info->transaction_kthread_mutex);
4523
4524         return 0;
4525 }
4526
4527 static const struct extent_io_ops btree_extent_io_ops = {
4528         .readpage_end_io_hook = btree_readpage_end_io_hook,
4529         .readpage_io_failed_hook = btree_io_failed_hook,
4530         .submit_bio_hook = btree_submit_bio_hook,
4531         /* note we're sharing with inode.c for the merge bio hook */
4532         .merge_bio_hook = btrfs_merge_bio_hook,
4533 };