// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 */

#include <linux/sched.h>
#include <linux/bio.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/ratelimit.h>
#include <linux/kthread.h>
#include <linux/raid/pq.h>
#include <linux/semaphore.h>
#include <linux/uuid.h>
#include <linux/list_sort.h>
#include "misc.h"
#include "ctree.h"
#include "extent_map.h"
#include "disk-io.h"
#include "transaction.h"
#include "print-tree.h"
#include "volumes.h"
#include "raid56.h"
#include "async-thread.h"
#include "check-integrity.h"
#include "rcu-string.h"
#include "dev-replace.h"
#include "sysfs.h"
#include "tree-checker.h"
#include "space-info.h"
#include "block-group.h"
#include "discard.h"

const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
        [BTRFS_RAID_RAID10] = {
                .sub_stripes    = 2,
                .dev_stripes    = 1,
                .devs_max       = 0,    /* 0 == as many as possible */
                .devs_min       = 4,
                .tolerated_failures = 1,
                .devs_increment = 2,
                .ncopies        = 2,
                .nparity        = 0,
                .raid_name      = "raid10",
                .bg_flag        = BTRFS_BLOCK_GROUP_RAID10,
                .mindev_error   = BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET,
        },
        [BTRFS_RAID_RAID1] = {
                .sub_stripes    = 1,
                .dev_stripes    = 1,
                .devs_max       = 2,
                .devs_min       = 2,
                .tolerated_failures = 1,
                .devs_increment = 2,
                .ncopies        = 2,
                .nparity        = 0,
                .raid_name      = "raid1",
                .bg_flag        = BTRFS_BLOCK_GROUP_RAID1,
                .mindev_error   = BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET,
        },
        [BTRFS_RAID_RAID1C3] = {
                .sub_stripes    = 1,
                .dev_stripes    = 1,
                .devs_max       = 3,
                .devs_min       = 3,
                .tolerated_failures = 2,
                .devs_increment = 3,
                .ncopies        = 3,
                .nparity        = 0,
                .raid_name      = "raid1c3",
                .bg_flag        = BTRFS_BLOCK_GROUP_RAID1C3,
                .mindev_error   = BTRFS_ERROR_DEV_RAID1C3_MIN_NOT_MET,
        },
        [BTRFS_RAID_RAID1C4] = {
                .sub_stripes    = 1,
                .dev_stripes    = 1,
                .devs_max       = 4,
                .devs_min       = 4,
                .tolerated_failures = 3,
                .devs_increment = 4,
                .ncopies        = 4,
                .nparity        = 0,
                .raid_name      = "raid1c4",
                .bg_flag        = BTRFS_BLOCK_GROUP_RAID1C4,
                .mindev_error   = BTRFS_ERROR_DEV_RAID1C4_MIN_NOT_MET,
        },
        [BTRFS_RAID_DUP] = {
                .sub_stripes    = 1,
                .dev_stripes    = 2,
                .devs_max       = 1,
                .devs_min       = 1,
                .tolerated_failures = 0,
                .devs_increment = 1,
                .ncopies        = 2,
                .nparity        = 0,
                .raid_name      = "dup",
                .bg_flag        = BTRFS_BLOCK_GROUP_DUP,
                .mindev_error   = 0,
        },
        [BTRFS_RAID_RAID0] = {
                .sub_stripes    = 1,
                .dev_stripes    = 1,
                .devs_max       = 0,
                .devs_min       = 2,
                .tolerated_failures = 0,
                .devs_increment = 1,
                .ncopies        = 1,
                .nparity        = 0,
                .raid_name      = "raid0",
                .bg_flag        = BTRFS_BLOCK_GROUP_RAID0,
                .mindev_error   = 0,
        },
        [BTRFS_RAID_SINGLE] = {
                .sub_stripes    = 1,
                .dev_stripes    = 1,
                .devs_max       = 1,
                .devs_min       = 1,
                .tolerated_failures = 0,
                .devs_increment = 1,
                .ncopies        = 1,
                .nparity        = 0,
                .raid_name      = "single",
                .bg_flag        = 0,
                .mindev_error   = 0,
        },
        [BTRFS_RAID_RAID5] = {
                .sub_stripes    = 1,
                .dev_stripes    = 1,
                .devs_max       = 0,
                .devs_min       = 2,
                .tolerated_failures = 1,
                .devs_increment = 1,
                .ncopies        = 1,
                .nparity        = 1,
                .raid_name      = "raid5",
                .bg_flag        = BTRFS_BLOCK_GROUP_RAID5,
                .mindev_error   = BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET,
        },
        [BTRFS_RAID_RAID6] = {
                .sub_stripes    = 1,
                .dev_stripes    = 1,
                .devs_max       = 0,
                .devs_min       = 3,
                .tolerated_failures = 2,
                .devs_increment = 1,
                .ncopies        = 1,
                .nparity        = 2,
                .raid_name      = "raid6",
                .bg_flag        = BTRFS_BLOCK_GROUP_RAID6,
                .mindev_error   = BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET,
        },
};
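
/*
 * Editor's note, not part of the original file: the table above is meant to
 * be indexed by a BTRFS_RAID_* value, so a hypothetical caller could derive
 * the redundancy properties of a profile like this:
 *
 *        const struct btrfs_raid_attr *attr = &btrfs_raid_array[BTRFS_RAID_RAID6];
 *
 *        // raid6 keeps nparity == 2 parity stripes per horizontal stripe,
 *        // so it survives up to attr->tolerated_failures == 2 missing
 *        // devices and needs at least attr->devs_min == 3 devices.
 */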

const char *btrfs_bg_type_to_raid_name(u64 flags)
{
        const int index = btrfs_bg_flags_to_raid_index(flags);

        if (index >= BTRFS_NR_RAID_TYPES)
                return NULL;

        return btrfs_raid_array[index].raid_name;
}

/*
 * Fill @buf with a textual description of @bg_flags, no more than @size_buf
 * bytes including the terminating null byte.
 */
void btrfs_describe_block_groups(u64 bg_flags, char *buf, u32 size_buf)
{
        int i;
        int ret;
        char *bp = buf;
        u64 flags = bg_flags;
        u32 size_bp = size_buf;

        if (!flags) {
                strcpy(bp, "NONE");
                return;
        }

#define DESCRIBE_FLAG(flag, desc)                                       \
        do {                                                            \
                if (flags & (flag)) {                                   \
                        ret = snprintf(bp, size_bp, "%s|", (desc));     \
                        if (ret < 0 || ret >= size_bp)                  \
                                goto out_overflow;                      \
                        size_bp -= ret;                                 \
                        bp += ret;                                      \
                        flags &= ~(flag);                               \
                }                                                       \
        } while (0)

        DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_DATA, "data");
        DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_SYSTEM, "system");
        DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_METADATA, "metadata");

        DESCRIBE_FLAG(BTRFS_AVAIL_ALLOC_BIT_SINGLE, "single");
        for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
                DESCRIBE_FLAG(btrfs_raid_array[i].bg_flag,
                              btrfs_raid_array[i].raid_name);
#undef DESCRIBE_FLAG

        if (flags) {
                ret = snprintf(bp, size_bp, "0x%llx|", flags);
                size_bp -= ret;
        }

        if (size_bp < size_buf)
                buf[size_buf - size_bp - 1] = '\0'; /* remove last | */

        /*
         * The text is trimmed, it's up to the caller to provide a sufficiently
         * large buffer
         */
out_overflow:;
}
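
/*
 * Example (editor's sketch, not in the original source): for
 * bg_flags == (BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_RAID1) the
 * function above fills @buf with "data|raid1"; any residual unknown bits
 * would be appended as a hex value, e.g. "data|0x800000":
 *
 *        char buf[32];
 *
 *        btrfs_describe_block_groups(BTRFS_BLOCK_GROUP_DATA |
 *                                    BTRFS_BLOCK_GROUP_RAID1, buf, sizeof(buf));
 */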

static int init_first_rw_device(struct btrfs_trans_handle *trans);
static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info);
static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev);
static void btrfs_dev_stat_print_on_load(struct btrfs_device *device);
static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
                             enum btrfs_map_op op,
                             u64 logical, u64 *length,
                             struct btrfs_bio **bbio_ret,
                             int mirror_num, int need_raid_map);

/*
 * Device locking
 * ==============
 *
 * There are several mutexes that protect manipulation of devices and low-level
 * structures like chunks but not block groups, extents or files
 *
 * uuid_mutex (global lock)
 * ------------------------
 * protects the fs_uuids list that tracks all per-fs fs_devices, resulting from
 * the SCAN_DEV ioctl registration or from mount either implicitly (the first
 * device) or requested by the device= mount option
 *
 * the mutex can be very coarse and can cover long-running operations
 *
 * protects: updates to fs_devices counters like missing devices, rw devices,
 * seeding, structure cloning, opening/closing devices at mount/umount time
 *
 * global::fs_devs - add, remove, updates to the global list
 *
 * does not protect: manipulation of the fs_devices::devices list!
 *
 * btrfs_device::name - renames (write side), read is RCU
 *
 * fs_devices::device_list_mutex (per-fs, with RCU)
 * ------------------------------------------------
 * protects updates to fs_devices::devices, ie. adding and deleting
 *
 * simple list traversal with read-only actions can be done with RCU protection
 *
 * may be used to exclude some operations from running concurrently without any
 * modifications to the list (see write_all_supers)
 *
 * balance_mutex
 * -------------
 * protects balance structures (status, state) and context accessed from
 * several places (internally, ioctl)
 *
 * chunk_mutex
 * -----------
 * protects chunks, adding or removing during allocation, trim or when a new
 * device is added/removed. Additionally it also protects post_commit_list of
 * individual devices, since they can be added to the transaction's
 * post_commit_list only with chunk_mutex held.
 *
 * cleaner_mutex
 * -------------
 * a big lock that is held by the cleaner thread and prevents running subvolume
 * cleaning together with relocation or delayed iputs
 *
 *
 * Lock nesting
 * ============
 *
 * uuid_mutex
 *   volume_mutex
 *     device_list_mutex
 *       chunk_mutex
 *     balance_mutex
 *
 *
 * Exclusive operations, BTRFS_FS_EXCL_OP
 * ======================================
 *
 * Maintains the exclusivity of the following operations that apply to the
 * whole filesystem and cannot run in parallel.
 *
 * - Balance (*)
 * - Device add
 * - Device remove
 * - Device replace (*)
 * - Resize
 *
 * The device operations (as above) can be in one of the following states:
 *
 * - Running state
 * - Paused state
 * - Completed state
 *
 * Only device operations marked with (*) can go into the Paused state for the
 * following reasons:
 *
 * - ioctl (only Balance can be Paused through ioctl)
 * - filesystem remounted as read-only
 * - filesystem unmounted and mounted as read-only
 * - system power-cycle and filesystem mounted as read-only
 * - filesystem or device errors leading to forced read-only
 *
 * BTRFS_FS_EXCL_OP flag is set and cleared using atomic operations.
 * During the course of Paused state, the BTRFS_FS_EXCL_OP remains set.
 * A device operation in Paused or Running state can be canceled or resumed
 * either by ioctl (Balance only) or when remounted as read-write.
 * BTRFS_FS_EXCL_OP flag is cleared when the device operation is canceled or
 * completed.
 */
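
/*
 * A minimal nesting sketch (editor's illustration, not from the original
 * file), following the lock order documented above: uuid_mutex outermost,
 * then device_list_mutex, then chunk_mutex.
 *
 *        mutex_lock(&uuid_mutex);
 *        mutex_lock(&fs_devices->device_list_mutex);
 *        mutex_lock(&fs_info->chunk_mutex);
 *        // ... manipulate devices and chunks ...
 *        mutex_unlock(&fs_info->chunk_mutex);
 *        mutex_unlock(&fs_devices->device_list_mutex);
 *        mutex_unlock(&uuid_mutex);
 */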

DEFINE_MUTEX(uuid_mutex);
static LIST_HEAD(fs_uuids);
struct list_head * __attribute_const__ btrfs_get_fs_uuids(void)
{
        return &fs_uuids;
}

/*
 * alloc_fs_devices - allocate struct btrfs_fs_devices
 * @fsid:               if not NULL, copy the UUID to fs_devices::fsid
 * @metadata_fsid:      if not NULL, copy the UUID to fs_devices::metadata_fsid
 *
 * Return a pointer to a new struct btrfs_fs_devices on success, or ERR_PTR().
 * The returned struct is not linked onto any lists and can be destroyed with
 * kfree() right away.
 */
static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid,
                                                 const u8 *metadata_fsid)
{
        struct btrfs_fs_devices *fs_devs;

        fs_devs = kzalloc(sizeof(*fs_devs), GFP_KERNEL);
        if (!fs_devs)
                return ERR_PTR(-ENOMEM);

        mutex_init(&fs_devs->device_list_mutex);

        INIT_LIST_HEAD(&fs_devs->devices);
        INIT_LIST_HEAD(&fs_devs->alloc_list);
        INIT_LIST_HEAD(&fs_devs->fs_list);
        if (fsid)
                memcpy(fs_devs->fsid, fsid, BTRFS_FSID_SIZE);

        if (metadata_fsid)
                memcpy(fs_devs->metadata_uuid, metadata_fsid, BTRFS_FSID_SIZE);
        else if (fsid)
                memcpy(fs_devs->metadata_uuid, fsid, BTRFS_FSID_SIZE);

        return fs_devs;
}
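
/*
 * Usage sketch (editor's note, not part of the original file): callers are
 * expected to check the ERR_PTR-style return value, e.g.:
 *
 *        fs_devices = alloc_fs_devices(disk_super->fsid, NULL);
 *        if (IS_ERR(fs_devices))
 *                return ERR_CAST(fs_devices);
 *        // not linked anywhere yet; on an early error path a plain
 *        // kfree(fs_devices) would still be valid, per the comment above.
 */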

void btrfs_free_device(struct btrfs_device *device)
{
        WARN_ON(!list_empty(&device->post_commit_list));
        rcu_string_free(device->name);
        extent_io_tree_release(&device->alloc_state);
        bio_put(device->flush_bio);
        kfree(device);
}

static void free_fs_devices(struct btrfs_fs_devices *fs_devices)
{
        struct btrfs_device *device;
        WARN_ON(fs_devices->opened);
        while (!list_empty(&fs_devices->devices)) {
                device = list_entry(fs_devices->devices.next,
                                    struct btrfs_device, dev_list);
                list_del(&device->dev_list);
                btrfs_free_device(device);
        }
        kfree(fs_devices);
}

void __exit btrfs_cleanup_fs_uuids(void)
{
        struct btrfs_fs_devices *fs_devices;

        while (!list_empty(&fs_uuids)) {
                fs_devices = list_entry(fs_uuids.next,
                                        struct btrfs_fs_devices, fs_list);
                list_del(&fs_devices->fs_list);
                free_fs_devices(fs_devices);
        }
}

/*
 * Returns a pointer to a new btrfs_device on success; ERR_PTR() on error.
 * The returned struct is not linked onto any lists and must be destroyed using
 * btrfs_free_device.
 */
static struct btrfs_device *__alloc_device(void)
{
        struct btrfs_device *dev;

        dev = kzalloc(sizeof(*dev), GFP_KERNEL);
        if (!dev)
                return ERR_PTR(-ENOMEM);

        /*
         * Preallocate a bio that's always going to be used for flushing device
         * barriers and matches the device lifespan
         */
        dev->flush_bio = bio_alloc_bioset(GFP_KERNEL, 0, NULL);
        if (!dev->flush_bio) {
                kfree(dev);
                return ERR_PTR(-ENOMEM);
        }

        INIT_LIST_HEAD(&dev->dev_list);
        INIT_LIST_HEAD(&dev->dev_alloc_list);
        INIT_LIST_HEAD(&dev->post_commit_list);

        atomic_set(&dev->reada_in_flight, 0);
        atomic_set(&dev->dev_stats_ccnt, 0);
        btrfs_device_data_ordered_init(dev);
        INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
        INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
        extent_io_tree_init(NULL, &dev->alloc_state, 0, NULL);

        return dev;
}

static noinline struct btrfs_fs_devices *find_fsid(
                const u8 *fsid, const u8 *metadata_fsid)
{
        struct btrfs_fs_devices *fs_devices;

        ASSERT(fsid);

        /* Handle non-split brain cases */
        list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
                if (metadata_fsid) {
                        if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0
                            && memcmp(metadata_fsid, fs_devices->metadata_uuid,
                                      BTRFS_FSID_SIZE) == 0)
                                return fs_devices;
                } else {
                        if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0)
                                return fs_devices;
                }
        }
        return NULL;
}

static struct btrfs_fs_devices *find_fsid_with_metadata_uuid(
                                struct btrfs_super_block *disk_super)
{
        struct btrfs_fs_devices *fs_devices;

        /*
         * Handle the case of a scanned device that has completed its fsid
         * change but belongs to a fs_devices that was created by first
         * scanning a device which hadn't changed its fsid/metadata_uuid at
         * all but had the CHANGING_FSID_V2 flag set.
         */
        list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
                if (fs_devices->fsid_change &&
                    memcmp(disk_super->metadata_uuid, fs_devices->fsid,
                           BTRFS_FSID_SIZE) == 0 &&
                    memcmp(fs_devices->fsid, fs_devices->metadata_uuid,
                           BTRFS_FSID_SIZE) == 0) {
                        return fs_devices;
                }
        }
        /*
         * Handle the case of a scanned device that has completed its fsid
         * change but belongs to a fs_devices that was created by a device
         * that has an outdated pair of fsid/metadata_uuid and the
         * CHANGING_FSID_V2 flag set.
         */
        list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
                if (fs_devices->fsid_change &&
                    memcmp(fs_devices->metadata_uuid,
                           fs_devices->fsid, BTRFS_FSID_SIZE) != 0 &&
                    memcmp(disk_super->metadata_uuid, fs_devices->metadata_uuid,
                           BTRFS_FSID_SIZE) == 0) {
                        return fs_devices;
                }
        }

        return find_fsid(disk_super->fsid, disk_super->metadata_uuid);
}
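
/*
 * Worked scenario (editor's note, an assumed example): say an fs started
 * with fsid == metadata_uuid == A and a metadata_uuid change (e.g. via
 * btrfstune -m) moved it to fsid B / metadata_uuid A. A device scanned
 * before the change completed registers fs_devices with
 * fsid == metadata_uuid == A and fsid_change set; the first loop above
 * matches a fully updated disk (fsid B, metadata_uuid A) against that
 * stale entry.
 */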

static int
btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder,
                      int flush, struct block_device **bdev,
                      struct btrfs_super_block **disk_super)
{
        int ret;

        *bdev = blkdev_get_by_path(device_path, flags, holder);

        if (IS_ERR(*bdev)) {
                ret = PTR_ERR(*bdev);
                goto error;
        }

        if (flush)
                filemap_write_and_wait((*bdev)->bd_inode->i_mapping);
        ret = set_blocksize(*bdev, BTRFS_BDEV_BLOCKSIZE);
        if (ret) {
                blkdev_put(*bdev, flags);
                goto error;
        }
        invalidate_bdev(*bdev);
        *disk_super = btrfs_read_dev_super(*bdev);
        if (IS_ERR(*disk_super)) {
                ret = PTR_ERR(*disk_super);
                blkdev_put(*bdev, flags);
                goto error;
        }

        return 0;

error:
        *bdev = NULL;
        return ret;
}
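
/*
 * Usage sketch (editor's illustration): a typical caller opens the device,
 * consumes the superblock and releases both on its own, e.g.:
 *
 *        ret = btrfs_get_bdev_and_sb(path, FMODE_READ | FMODE_EXCL, holder,
 *                                    0, &bdev, &disk_super);
 *        if (ret)
 *                return ret;
 *        // ... inspect disk_super ...
 *        btrfs_release_disk_super(disk_super);
 *        blkdev_put(bdev, FMODE_READ | FMODE_EXCL);
 */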

static bool device_path_matched(const char *path, struct btrfs_device *device)
{
        int found;

        rcu_read_lock();
        found = strcmp(rcu_str_deref(device->name), path);
        rcu_read_unlock();

        return found == 0;
}

/*
 *  Search and remove all stale devices (devices which are not mounted).
 *  When both inputs are NULL, it will search and release all stale devices.
 *  path:        Optional. When provided, it will release all unmounted devices
 *               matching this path only.
 *  skip_device: Optional. Will skip this device when searching for the stale
 *               devices.
 *  Return:      0 for success or if @path is NULL.
 *               -EBUSY if @path is a mounted device.
 *               -ENOENT if @path does not match any device in the list.
 */
static int btrfs_free_stale_devices(const char *path,
                                     struct btrfs_device *skip_device)
{
        struct btrfs_fs_devices *fs_devices, *tmp_fs_devices;
        struct btrfs_device *device, *tmp_device;
        int ret = 0;

        if (path)
                ret = -ENOENT;

        list_for_each_entry_safe(fs_devices, tmp_fs_devices, &fs_uuids, fs_list) {
                mutex_lock(&fs_devices->device_list_mutex);
                list_for_each_entry_safe(device, tmp_device,
                                         &fs_devices->devices, dev_list) {
                        if (skip_device && skip_device == device)
                                continue;
                        if (path && !device->name)
                                continue;
                        if (path && !device_path_matched(path, device))
                                continue;
                        if (fs_devices->opened) {
                                /* for an already deleted device return 0 */
                                if (path && ret != 0)
                                        ret = -EBUSY;
                                break;
                        }

                        /* delete the stale device */
                        fs_devices->num_devices--;
                        list_del(&device->dev_list);
                        btrfs_free_device(device);

                        ret = 0;
                        if (fs_devices->num_devices == 0)
                                break;
                }
                mutex_unlock(&fs_devices->device_list_mutex);

                if (fs_devices->num_devices == 0) {
                        btrfs_sysfs_remove_fsid(fs_devices);
                        list_del(&fs_devices->fs_list);
                        free_fs_devices(fs_devices);
                }
        }

        return ret;
}
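
/*
 * Usage sketch (editor's note): the two call patterns for the helper above,
 * both visible later in this file, are "forget one path" and "prune
 * everything that now aliases a freshly added device":
 *
 *        // from btrfs_forget_devices(): drop the unmounted device at @path
 *        ret = btrfs_free_stale_devices(path, NULL);
 *
 *        // from btrfs_scan_one_device(): after a new device was registered
 *        btrfs_free_stale_devices(path, device);
 */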

static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
                        struct btrfs_device *device, fmode_t flags,
                        void *holder)
{
        struct request_queue *q;
        struct block_device *bdev;
        struct btrfs_super_block *disk_super;
        u64 devid;
        int ret;

        if (device->bdev)
                return -EINVAL;
        if (!device->name)
                return -EINVAL;

        ret = btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1,
                                    &bdev, &disk_super);
        if (ret)
                return ret;

        devid = btrfs_stack_device_id(&disk_super->dev_item);
        if (devid != device->devid)
                goto error_free_page;

        if (memcmp(device->uuid, disk_super->dev_item.uuid, BTRFS_UUID_SIZE))
                goto error_free_page;

        device->generation = btrfs_super_generation(disk_super);

        if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) {
                if (btrfs_super_incompat_flags(disk_super) &
                    BTRFS_FEATURE_INCOMPAT_METADATA_UUID) {
                        pr_err(
                "BTRFS: Invalid seeding and uuid-changed device detected\n");
                        goto error_free_page;
                }

                clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
                fs_devices->seeding = true;
        } else {
                if (bdev_read_only(bdev))
                        clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
                else
                        set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
        }

        q = bdev_get_queue(bdev);
        if (!blk_queue_nonrot(q))
                fs_devices->rotating = true;

        device->bdev = bdev;
        clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
        device->mode = flags;

        fs_devices->open_devices++;
        if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
            device->devid != BTRFS_DEV_REPLACE_DEVID) {
                fs_devices->rw_devices++;
                list_add_tail(&device->dev_alloc_list, &fs_devices->alloc_list);
        }
        btrfs_release_disk_super(disk_super);

        return 0;

error_free_page:
        btrfs_release_disk_super(disk_super);
        blkdev_put(bdev, flags);

        return -EINVAL;
}

/*
 * Handle a scanned device having its CHANGING_FSID_V2 flag set and the
 * fs_devices being created with a disk that has already completed its fsid
 * change. Such a disk can belong to an fs which has its FSID changed or to
 * one which doesn't. Handle both cases here.
 */
static struct btrfs_fs_devices *find_fsid_inprogress(
                                        struct btrfs_super_block *disk_super)
{
        struct btrfs_fs_devices *fs_devices;

        list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
                if (memcmp(fs_devices->metadata_uuid, fs_devices->fsid,
                           BTRFS_FSID_SIZE) != 0 &&
                    memcmp(fs_devices->metadata_uuid, disk_super->fsid,
                           BTRFS_FSID_SIZE) == 0 && !fs_devices->fsid_change) {
                        return fs_devices;
                }
        }

        return find_fsid(disk_super->fsid, NULL);
}

static struct btrfs_fs_devices *find_fsid_changed(
                                        struct btrfs_super_block *disk_super)
{
        struct btrfs_fs_devices *fs_devices;

        /*
         * Handles the case where the scanned device is part of an fs that had
         * multiple successful changes of FSID but currently the device didn't
         * observe it. Meaning our fsid will be different from theirs. We need
         * to handle two subcases:
         *  1 - The fs still continues to have different METADATA/FSID uuids.
         *  2 - The fs is switched back to its original FSID (METADATA/FSID
         *  are equal).
         */
        list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
                /* Changed UUIDs */
                if (memcmp(fs_devices->metadata_uuid, fs_devices->fsid,
                           BTRFS_FSID_SIZE) != 0 &&
                    memcmp(fs_devices->metadata_uuid, disk_super->metadata_uuid,
                           BTRFS_FSID_SIZE) == 0 &&
                    memcmp(fs_devices->fsid, disk_super->fsid,
                           BTRFS_FSID_SIZE) != 0)
                        return fs_devices;

                /* Unchanged UUIDs */
                if (memcmp(fs_devices->metadata_uuid, fs_devices->fsid,
                           BTRFS_FSID_SIZE) == 0 &&
                    memcmp(fs_devices->fsid, disk_super->metadata_uuid,
                           BTRFS_FSID_SIZE) == 0)
                        return fs_devices;
        }

        return NULL;
}

static struct btrfs_fs_devices *find_fsid_reverted_metadata(
                                struct btrfs_super_block *disk_super)
{
        struct btrfs_fs_devices *fs_devices;

        /*
         * Handle the case where the scanned device is part of an fs whose last
         * metadata UUID change reverted it to the original FSID. At the same
         * time fs_devices was first created by another constituent device
         * which didn't fully observe the operation. This results in a
         * btrfs_fs_devices created with metadata/fsid different AND
         * btrfs_fs_devices::fsid_change set AND the metadata_uuid of the
         * fs_devices equal to the FSID of the disk.
         */
        list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
                if (memcmp(fs_devices->fsid, fs_devices->metadata_uuid,
                           BTRFS_FSID_SIZE) != 0 &&
                    memcmp(fs_devices->metadata_uuid, disk_super->fsid,
                           BTRFS_FSID_SIZE) == 0 &&
                    fs_devices->fsid_change)
                        return fs_devices;
        }

        return NULL;
}

/*
 * Add new device to list of registered devices
 *
 * Returns:
 * device pointer which was just added or updated when successful
 * error pointer when failed
 */
static noinline struct btrfs_device *device_list_add(const char *path,
                           struct btrfs_super_block *disk_super,
                           bool *new_device_added)
{
        struct btrfs_device *device;
        struct btrfs_fs_devices *fs_devices = NULL;
        struct rcu_string *name;
        u64 found_transid = btrfs_super_generation(disk_super);
        u64 devid = btrfs_stack_device_id(&disk_super->dev_item);
        bool has_metadata_uuid = (btrfs_super_incompat_flags(disk_super) &
                BTRFS_FEATURE_INCOMPAT_METADATA_UUID);
        bool fsid_change_in_progress = (btrfs_super_flags(disk_super) &
                                        BTRFS_SUPER_FLAG_CHANGING_FSID_V2);

        if (fsid_change_in_progress) {
                if (!has_metadata_uuid)
                        fs_devices = find_fsid_inprogress(disk_super);
                else
                        fs_devices = find_fsid_changed(disk_super);
        } else if (has_metadata_uuid) {
                fs_devices = find_fsid_with_metadata_uuid(disk_super);
        } else {
                fs_devices = find_fsid_reverted_metadata(disk_super);
                if (!fs_devices)
                        fs_devices = find_fsid(disk_super->fsid, NULL);
        }

        if (!fs_devices) {
                if (has_metadata_uuid)
                        fs_devices = alloc_fs_devices(disk_super->fsid,
                                                      disk_super->metadata_uuid);
                else
                        fs_devices = alloc_fs_devices(disk_super->fsid, NULL);

                if (IS_ERR(fs_devices))
                        return ERR_CAST(fs_devices);

                fs_devices->fsid_change = fsid_change_in_progress;

                mutex_lock(&fs_devices->device_list_mutex);
                list_add(&fs_devices->fs_list, &fs_uuids);

                device = NULL;
        } else {
                mutex_lock(&fs_devices->device_list_mutex);
                device = btrfs_find_device(fs_devices, devid,
                                disk_super->dev_item.uuid, NULL, false);

                /*
                 * If this disk has been pulled into an fs devices created by
                 * a device which had the CHANGING_FSID_V2 flag then replace the
                 * metadata_uuid/fsid values of the fs_devices.
                 */
                if (fs_devices->fsid_change &&
                    found_transid > fs_devices->latest_generation) {
                        memcpy(fs_devices->fsid, disk_super->fsid,
                                        BTRFS_FSID_SIZE);

                        if (has_metadata_uuid)
                                memcpy(fs_devices->metadata_uuid,
                                       disk_super->metadata_uuid,
                                       BTRFS_FSID_SIZE);
                        else
                                memcpy(fs_devices->metadata_uuid,
                                       disk_super->fsid, BTRFS_FSID_SIZE);

                        fs_devices->fsid_change = false;
                }
        }

        if (!device) {
                if (fs_devices->opened) {
                        mutex_unlock(&fs_devices->device_list_mutex);
                        return ERR_PTR(-EBUSY);
                }

                device = btrfs_alloc_device(NULL, &devid,
                                            disk_super->dev_item.uuid);
                if (IS_ERR(device)) {
                        mutex_unlock(&fs_devices->device_list_mutex);
                        /* we can safely leave the fs_devices entry around */
                        return device;
                }

                name = rcu_string_strdup(path, GFP_NOFS);
                if (!name) {
                        btrfs_free_device(device);
                        mutex_unlock(&fs_devices->device_list_mutex);
                        return ERR_PTR(-ENOMEM);
                }
                rcu_assign_pointer(device->name, name);

                list_add_rcu(&device->dev_list, &fs_devices->devices);
                fs_devices->num_devices++;

                device->fs_devices = fs_devices;
                *new_device_added = true;

                if (disk_super->label[0])
                        pr_info(
        "BTRFS: device label %s devid %llu transid %llu %s scanned by %s (%d)\n",
                                disk_super->label, devid, found_transid, path,
                                current->comm, task_pid_nr(current));
                else
                        pr_info(
        "BTRFS: device fsid %pU devid %llu transid %llu %s scanned by %s (%d)\n",
                                disk_super->fsid, devid, found_transid, path,
                                current->comm, task_pid_nr(current));

        } else if (!device->name || strcmp(device->name->str, path)) {
                /*
                 * When the FS is already mounted:
                 * 1. If you are here and if the device->name is NULL, that
                 *    means this device was missing at the time of FS mount.
                 * 2. If you are here and if the device->name is different
                 *    from 'path', that means either
                 *      a. The same device disappeared and reappeared with a
                 *         different name, or
                 *      b. The missing-disk-which-was-replaced has
                 *         reappeared now.
                 *
                 * We must allow 1 and 2a above. But 2b would be spurious
                 * and unintentional.
                 *
                 * Further, in case of 1 and 2a above, the disk at 'path'
                 * would have missed some transactions while it was away, and
                 * in case of 2a the stale bdev has to be updated as well.
                 * 2b must not be allowed at any time.
                 */

                /*
                 * For now, we do allow update to btrfs_fs_device through the
                 * btrfs dev scan cli after FS has been mounted.  We're still
                 * tracking a problem where systems fail mount by subvolume id
                 * when we reject replacement on a mounted FS.
                 */
                if (!fs_devices->opened && found_transid < device->generation) {
                        /*
                         * That is if the FS is _not_ mounted and if you
                         * are here, that means there is more than one
                         * disk with the same uuid and devid. We keep the one
                         * with the larger generation number or the last-in if
                         * generations are equal.
                         */
                        mutex_unlock(&fs_devices->device_list_mutex);
                        return ERR_PTR(-EEXIST);
                }

                /*
                 * We are going to replace the device path for a given devid,
                 * make sure it's the same device if the device is mounted
                 */
                if (device->bdev) {
                        struct block_device *path_bdev;

                        path_bdev = lookup_bdev(path);
                        if (IS_ERR(path_bdev)) {
                                mutex_unlock(&fs_devices->device_list_mutex);
                                return ERR_CAST(path_bdev);
                        }

                        if (device->bdev != path_bdev) {
                                bdput(path_bdev);
                                mutex_unlock(&fs_devices->device_list_mutex);
                                btrfs_warn_in_rcu(device->fs_info,
                        "duplicate device fsid:devid for %pU:%llu old:%s new:%s",
                                        disk_super->fsid, devid,
                                        rcu_str_deref(device->name), path);
                                return ERR_PTR(-EEXIST);
                        }
                        bdput(path_bdev);
                        btrfs_info_in_rcu(device->fs_info,
                                "device fsid %pU devid %llu moved old:%s new:%s",
                                disk_super->fsid, devid,
                                rcu_str_deref(device->name), path);
                }

                name = rcu_string_strdup(path, GFP_NOFS);
                if (!name) {
                        mutex_unlock(&fs_devices->device_list_mutex);
                        return ERR_PTR(-ENOMEM);
                }
                rcu_string_free(device->name);
                rcu_assign_pointer(device->name, name);
                if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) {
                        fs_devices->missing_devices--;
                        clear_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
                }
        }

        /*
         * Unmount does not free the btrfs_device struct but would zero
         * generation along with most of the other members. So just update
         * it back. We need it to pick the disk with the largest generation
         * (as above).
         */
        if (!fs_devices->opened) {
                device->generation = found_transid;
                fs_devices->latest_generation = max_t(u64, found_transid,
                                                fs_devices->latest_generation);
        }

        fs_devices->total_devices = btrfs_super_num_devices(disk_super);

        mutex_unlock(&fs_devices->device_list_mutex);
        return device;
}

static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
{
        struct btrfs_fs_devices *fs_devices;
        struct btrfs_device *device;
        struct btrfs_device *orig_dev;
        int ret = 0;

        fs_devices = alloc_fs_devices(orig->fsid, NULL);
        if (IS_ERR(fs_devices))
                return fs_devices;

        mutex_lock(&orig->device_list_mutex);
        fs_devices->total_devices = orig->total_devices;

        list_for_each_entry(orig_dev, &orig->devices, dev_list) {
                struct rcu_string *name;

                device = btrfs_alloc_device(NULL, &orig_dev->devid,
                                            orig_dev->uuid);
                if (IS_ERR(device)) {
                        ret = PTR_ERR(device);
                        goto error;
                }

                /*
                 * This is ok to do without rcu read locked because we hold the
                 * uuid mutex so nothing we touch in here is going to disappear.
                 */
                if (orig_dev->name) {
                        name = rcu_string_strdup(orig_dev->name->str,
                                        GFP_KERNEL);
                        if (!name) {
                                btrfs_free_device(device);
                                ret = -ENOMEM;
                                goto error;
                        }
                        rcu_assign_pointer(device->name, name);
                }

                list_add(&device->dev_list, &fs_devices->devices);
                device->fs_devices = fs_devices;
                fs_devices->num_devices++;
        }
        mutex_unlock(&orig->device_list_mutex);
        return fs_devices;
error:
        mutex_unlock(&orig->device_list_mutex);
        free_fs_devices(fs_devices);
        return ERR_PTR(ret);
}

/*
 * After we have read the system tree and know the devids belonging to this
 * filesystem, remove the devices that do not belong there.
 */
void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, int step)
{
        struct btrfs_device *device, *next;
        struct btrfs_device *latest_dev = NULL;

        mutex_lock(&uuid_mutex);
again:
        /* This is the initialized path, it is safe to release the devices. */
        list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) {
                if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
                                                        &device->dev_state)) {
                        if (!test_bit(BTRFS_DEV_STATE_REPLACE_TGT,
                             &device->dev_state) &&
                             (!latest_dev ||
                              device->generation > latest_dev->generation)) {
                                latest_dev = device;
                        }
                        continue;
                }

                if (device->devid == BTRFS_DEV_REPLACE_DEVID) {
                        /*
                         * In the first step, keep the device which has
                         * the correct fsid and the devid that is used
                         * for the dev_replace procedure.
                         * In the second step, the dev_replace state is
                         * read from the device tree and it is known
                         * whether the procedure is really active or
                         * not, which means whether this device is
                         * used or whether it should be removed.
                         */
                        if (step == 0 || test_bit(BTRFS_DEV_STATE_REPLACE_TGT,
                                                  &device->dev_state)) {
                                continue;
                        }
                }
                if (device->bdev) {
                        blkdev_put(device->bdev, device->mode);
                        device->bdev = NULL;
                        fs_devices->open_devices--;
                }
                if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
                        list_del_init(&device->dev_alloc_list);
                        clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
                        if (!test_bit(BTRFS_DEV_STATE_REPLACE_TGT,
                                      &device->dev_state))
                                fs_devices->rw_devices--;
                }
                list_del_init(&device->dev_list);
                fs_devices->num_devices--;
                btrfs_free_device(device);
        }

        if (fs_devices->seed) {
                fs_devices = fs_devices->seed;
                goto again;
        }

        fs_devices->latest_bdev = latest_dev->bdev;

        mutex_unlock(&uuid_mutex);
}

static void btrfs_close_bdev(struct btrfs_device *device)
{
        if (!device->bdev)
                return;

        if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
                sync_blockdev(device->bdev);
                invalidate_bdev(device->bdev);
        }

        blkdev_put(device->bdev, device->mode);
}

static void btrfs_close_one_device(struct btrfs_device *device)
{
        struct btrfs_fs_devices *fs_devices = device->fs_devices;

        if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
            device->devid != BTRFS_DEV_REPLACE_DEVID) {
                list_del_init(&device->dev_alloc_list);
                fs_devices->rw_devices--;
        }

        if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
                fs_devices->missing_devices--;

        btrfs_close_bdev(device);
        if (device->bdev) {
                fs_devices->open_devices--;
                device->bdev = NULL;
        }
        clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);

        device->fs_info = NULL;
        atomic_set(&device->dev_stats_ccnt, 0);
        extent_io_tree_release(&device->alloc_state);

        /* Verify the device is back in a pristine state */
        ASSERT(!test_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state));
        ASSERT(!test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state));
        ASSERT(list_empty(&device->dev_alloc_list));
        ASSERT(list_empty(&device->post_commit_list));
        ASSERT(atomic_read(&device->reada_in_flight) == 0);
}

static int close_fs_devices(struct btrfs_fs_devices *fs_devices)
{
        struct btrfs_device *device, *tmp;

        if (--fs_devices->opened > 0)
                return 0;

        mutex_lock(&fs_devices->device_list_mutex);
        list_for_each_entry_safe(device, tmp, &fs_devices->devices, dev_list) {
                btrfs_close_one_device(device);
        }
        mutex_unlock(&fs_devices->device_list_mutex);

        WARN_ON(fs_devices->open_devices);
        WARN_ON(fs_devices->rw_devices);
        fs_devices->opened = 0;
        fs_devices->seeding = false;

        return 0;
}

int btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
{
        struct btrfs_fs_devices *seed_devices = NULL;
        int ret;

        mutex_lock(&uuid_mutex);
        ret = close_fs_devices(fs_devices);
        if (!fs_devices->opened) {
                seed_devices = fs_devices->seed;
                fs_devices->seed = NULL;
        }
        mutex_unlock(&uuid_mutex);

        while (seed_devices) {
                fs_devices = seed_devices;
                seed_devices = fs_devices->seed;
                close_fs_devices(fs_devices);
                free_fs_devices(fs_devices);
        }
        return ret;
}

static int open_fs_devices(struct btrfs_fs_devices *fs_devices,
                                fmode_t flags, void *holder)
{
        struct btrfs_device *device;
        struct btrfs_device *latest_dev = NULL;
        int ret = 0;

        flags |= FMODE_EXCL;

        list_for_each_entry(device, &fs_devices->devices, dev_list) {
                /* Just open everything we can; ignore failures here */
                if (btrfs_open_one_device(fs_devices, device, flags, holder))
                        continue;

                if (!latest_dev ||
                    device->generation > latest_dev->generation)
                        latest_dev = device;
        }
        if (fs_devices->open_devices == 0) {
                ret = -EINVAL;
                goto out;
        }
        fs_devices->opened = 1;
        fs_devices->latest_bdev = latest_dev->bdev;
        fs_devices->total_rw_bytes = 0;
        fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_REGULAR;
out:
        return ret;
}

static int devid_cmp(void *priv, struct list_head *a, struct list_head *b)
{
        struct btrfs_device *dev1, *dev2;

        dev1 = list_entry(a, struct btrfs_device, dev_list);
        dev2 = list_entry(b, struct btrfs_device, dev_list);

        if (dev1->devid < dev2->devid)
                return -1;
        else if (dev1->devid > dev2->devid)
                return 1;
        return 0;
}

int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
                       fmode_t flags, void *holder)
{
        int ret;

        lockdep_assert_held(&uuid_mutex);

        mutex_lock(&fs_devices->device_list_mutex);
        if (fs_devices->opened) {
                fs_devices->opened++;
                ret = 0;
        } else {
                list_sort(NULL, &fs_devices->devices, devid_cmp);
                ret = open_fs_devices(fs_devices, flags, holder);
        }
        mutex_unlock(&fs_devices->device_list_mutex);

        return ret;
}

void btrfs_release_disk_super(struct btrfs_super_block *super)
{
        struct page *page = virt_to_page(super);

        put_page(page);
}

static int btrfs_read_disk_super(struct block_device *bdev, u64 bytenr,
                                 struct page **page,
                                 struct btrfs_super_block **disk_super)
{
        void *p;
        pgoff_t index;

        /* make sure our super fits in the device */
        if (bytenr + PAGE_SIZE >= i_size_read(bdev->bd_inode))
                return 1;

        /* make sure our super fits in the page */
        if (sizeof(**disk_super) > PAGE_SIZE)
                return 1;

        /* make sure our super doesn't straddle pages on disk */
        index = bytenr >> PAGE_SHIFT;
        if ((bytenr + sizeof(**disk_super) - 1) >> PAGE_SHIFT != index)
                return 1;

        /* pull in the page with our super */
        *page = read_cache_page_gfp(bdev->bd_inode->i_mapping,
                                   index, GFP_KERNEL);

        if (IS_ERR(*page))
                return 1;

        p = page_address(*page);

        /* align our pointer to the offset of the super block */
        *disk_super = p + offset_in_page(bytenr);

        if (btrfs_super_bytenr(*disk_super) != bytenr ||
            btrfs_super_magic(*disk_super) != BTRFS_MAGIC) {
                btrfs_release_disk_super(p);
                return 1;
        }

        if ((*disk_super)->label[0] &&
                (*disk_super)->label[BTRFS_LABEL_SIZE - 1])
                (*disk_super)->label[BTRFS_LABEL_SIZE - 1] = '\0';

        return 0;
}
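
/*
 * Worked example (editor's note): for the primary superblock at
 * bytenr == 65536 with 4K pages, index == 65536 >> 12 == 16 and
 * offset_in_page(65536) == 0, so the 4096-byte struct btrfs_super_block
 * lies entirely within page 16 and the straddle check above passes.
 */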

int btrfs_forget_devices(const char *path)
{
        int ret;

        mutex_lock(&uuid_mutex);
        ret = btrfs_free_stale_devices(strlen(path) ? path : NULL, NULL);
        mutex_unlock(&uuid_mutex);

        return ret;
}

/*
 * Look for a btrfs signature on a device. This may be called out of the mount
 * path and we are not allowed to call set_blocksize during the scan. The
 * superblock is read via the pagecache.
 */
struct btrfs_device *btrfs_scan_one_device(const char *path, fmode_t flags,
                                           void *holder)
{
        struct btrfs_super_block *disk_super;
        bool new_device_added = false;
        struct btrfs_device *device = NULL;
        struct block_device *bdev;
        struct page *page;
        u64 bytenr;

        lockdep_assert_held(&uuid_mutex);

        /*
         * we would like to check all the supers, but that would make
         * a btrfs mount succeed after a mkfs from a different FS.
         * So, we need to add a special mount option to scan for
         * later supers, using BTRFS_SUPER_MIRROR_MAX instead
         */
        bytenr = btrfs_sb_offset(0);
        flags |= FMODE_EXCL;

        bdev = blkdev_get_by_path(path, flags, holder);
        if (IS_ERR(bdev))
                return ERR_CAST(bdev);

        if (btrfs_read_disk_super(bdev, bytenr, &page, &disk_super)) {
                device = ERR_PTR(-EINVAL);
                goto error_bdev_put;
        }

        device = device_list_add(path, disk_super, &new_device_added);
        if (!IS_ERR(device)) {
                if (new_device_added)
                        btrfs_free_stale_devices(path, device);
        }

        btrfs_release_disk_super(disk_super);

error_bdev_put:
        blkdev_put(bdev, flags);

        return device;
}
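
/*
 * Usage sketch (editor's illustration): the device scan ioctl calls this
 * helper under uuid_mutex, roughly:
 *
 *        mutex_lock(&uuid_mutex);
 *        device = btrfs_scan_one_device(path, FMODE_READ, holder);
 *        mutex_unlock(&uuid_mutex);
 *        if (IS_ERR(device))
 *                return PTR_ERR(device);
 */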
1358
1359 /*
1360  * Try to find a chunk that intersects [start, start + len] range and when one
1361  * such is found, record the end of it in *start
1362  */
1363 static bool contains_pending_extent(struct btrfs_device *device, u64 *start,
1364                                     u64 len)
1365 {
1366         u64 physical_start, physical_end;
1367
1368         lockdep_assert_held(&device->fs_info->chunk_mutex);
1369
1370         if (!find_first_extent_bit(&device->alloc_state, *start,
1371                                    &physical_start, &physical_end,
1372                                    CHUNK_ALLOCATED, NULL)) {
1373
1374                 if (in_range(physical_start, *start, len) ||
1375                     in_range(*start, physical_start,
1376                              physical_end - physical_start)) {
1377                         *start = physical_end + 1;
1378                         return true;
1379                 }
1380         }
1381         return false;
1382 }
1383
1384 static u64 dev_extent_search_start(struct btrfs_device *device, u64 start)
1385 {
1386         switch (device->fs_devices->chunk_alloc_policy) {
1387         case BTRFS_CHUNK_ALLOC_REGULAR:
1388                 /*
1389                  * We don't want to overwrite the superblock on the drive nor
1390                  * any area used by the boot loader (grub for example), so we
1391                  * make sure to start at an offset of at least 1MB.
1392                  */
1393                 return max_t(u64, start, SZ_1M);
1394         default:
1395                 BUG();
1396         }
1397 }
1398
1399 /**
1400  * dev_extent_hole_check - check if specified hole is suitable for allocation
1401  * @device:     the device which has the hole
1402  * @hole_start: starting position of the hole
1403  * @hole_size:  the size of the hole
1404  * @num_bytes:  the size of the free space that we need
1405  *
1406  * This function may modify @hole_start and @hole_size to reflect the suitable
1407  * position for allocation; returns true if the hole was updated, false if not.
1408  */
1409 static bool dev_extent_hole_check(struct btrfs_device *device, u64 *hole_start,
1410                                   u64 *hole_size, u64 num_bytes)
1411 {
1412         bool changed = false;
1413         u64 hole_end = *hole_start + *hole_size;
1414
1415         /*
1416          * Check before we set max_hole_start, otherwise we could end up
1417          * sending back this offset anyway.
1418          */
1419         if (contains_pending_extent(device, hole_start, *hole_size)) {
1420                 if (hole_end >= *hole_start)
1421                         *hole_size = hole_end - *hole_start;
1422                 else
1423                         *hole_size = 0;
1424                 changed = true;
1425         }
1426
1427         switch (device->fs_devices->chunk_alloc_policy) {
1428         case BTRFS_CHUNK_ALLOC_REGULAR:
1429                 /* No extra check */
1430                 break;
1431         default:
1432                 BUG();
1433         }
1434
1435         return changed;
1436 }
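
/*
 * Worked example (hypothetical numbers): with a hole of
 * [*hole_start = 16M, *hole_size = 16M) and a CHUNK_ALLOCATED extent
 * recorded at 20M..24M, contains_pending_extent() moves *hole_start just
 * past the pending extent and this function shrinks *hole_size to
 * hole_end - *hole_start (or 0 if nothing is left), returning true so the
 * caller knows the hole was adjusted and retries from the new position.
 */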
1437
1438 /*
1439  * find_free_dev_extent_start - find free space in the specified device
1440  * @device:       the device on which we search for free space
1441  * @num_bytes:    the size of the free space that we need
1442  * @search_start: the position from which to begin the search
1443  * @start:        store the start of the free space
1444  * @len:          the size of the free space that we find, or the size
1445  *                of the max free space if we don't find suitable free space
1446  *
1447  * This uses a pretty simple search, the expectation is that it is
1448  * called very infrequently and that a given device has a small number
1449  * of extents.
1450  *
1451  * @start is used to store the start of the free space if we find it. But if
1452  * we don't find suitable free space, it will be used to store the start
1453  * position of the max free space.
1454  *
1455  * @len is used to store the size of the free space that we find.
1456  * But if we don't find suitable free space, it is used to store the size of
1457  * the max free space.
1458  *
1459  * NOTE: This function will search the *commit* root of the device tree, and
1460  * does an extra check to ensure dev extents are not double allocated.
1461  * This makes the function safe for allocating dev extents but it may not
1462  * report correct usable device space, as a device extent freed in the
1463  * current transaction is not reported as available.
1464  */
1465 static int find_free_dev_extent_start(struct btrfs_device *device,
1466                                 u64 num_bytes, u64 search_start, u64 *start,
1467                                 u64 *len)
1468 {
1469         struct btrfs_fs_info *fs_info = device->fs_info;
1470         struct btrfs_root *root = fs_info->dev_root;
1471         struct btrfs_key key;
1472         struct btrfs_dev_extent *dev_extent;
1473         struct btrfs_path *path;
1474         u64 hole_size;
1475         u64 max_hole_start;
1476         u64 max_hole_size;
1477         u64 extent_end;
1478         u64 search_end = device->total_bytes;
1479         int ret;
1480         int slot;
1481         struct extent_buffer *l;
1482
1483         search_start = dev_extent_search_start(device, search_start);
1484
1485         path = btrfs_alloc_path();
1486         if (!path)
1487                 return -ENOMEM;
1488
1489         max_hole_start = search_start;
1490         max_hole_size = 0;
1491
1492 again:
1493         if (search_start >= search_end ||
1494             test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
1495                 ret = -ENOSPC;
1496                 goto out;
1497         }
1498
1499         path->reada = READA_FORWARD;
1500         path->search_commit_root = 1;
1501         path->skip_locking = 1;
1502
1503         key.objectid = device->devid;
1504         key.offset = search_start;
1505         key.type = BTRFS_DEV_EXTENT_KEY;
1506
1507         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1508         if (ret < 0)
1509                 goto out;
1510         if (ret > 0) {
1511                 ret = btrfs_previous_item(root, path, key.objectid, key.type);
1512                 if (ret < 0)
1513                         goto out;
1514         }
1515
1516         while (1) {
1517                 l = path->nodes[0];
1518                 slot = path->slots[0];
1519                 if (slot >= btrfs_header_nritems(l)) {
1520                         ret = btrfs_next_leaf(root, path);
1521                         if (ret == 0)
1522                                 continue;
1523                         if (ret < 0)
1524                                 goto out;
1525
1526                         break;
1527                 }
1528                 btrfs_item_key_to_cpu(l, &key, slot);
1529
1530                 if (key.objectid < device->devid)
1531                         goto next;
1532
1533                 if (key.objectid > device->devid)
1534                         break;
1535
1536                 if (key.type != BTRFS_DEV_EXTENT_KEY)
1537                         goto next;
1538
1539                 if (key.offset > search_start) {
1540                         hole_size = key.offset - search_start;
1541                         dev_extent_hole_check(device, &search_start, &hole_size,
1542                                               num_bytes);
1543
1544                         if (hole_size > max_hole_size) {
1545                                 max_hole_start = search_start;
1546                                 max_hole_size = hole_size;
1547                         }
1548
1549                         /*
1550                          * If this free space is greater than what we need,
1551                          * it must be the max free space that we have found
1552                          * until now, so max_hole_start must point to the start
1553                          * of this free space and the length of this free space
1554                          * is stored in max_hole_size. Thus, we return
1555                          * max_hole_start and max_hole_size and go back to the
1556                          * caller.
1557                          */
1558                         if (hole_size >= num_bytes) {
1559                                 ret = 0;
1560                                 goto out;
1561                         }
1562                 }
1563
1564                 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
1565                 extent_end = key.offset + btrfs_dev_extent_length(l,
1566                                                                   dev_extent);
1567                 if (extent_end > search_start)
1568                         search_start = extent_end;
1569 next:
1570                 path->slots[0]++;
1571                 cond_resched();
1572         }
1573
1574         /*
1575          * At this point, search_start should be the end of
1576          * allocated dev extents, and when shrinking the device,
1577          * search_end may be smaller than search_start.
1578          */
1579         if (search_end > search_start) {
1580                 hole_size = search_end - search_start;
1581                 if (dev_extent_hole_check(device, &search_start, &hole_size,
1582                                           num_bytes)) {
1583                         btrfs_release_path(path);
1584                         goto again;
1585                 }
1586
1587                 if (hole_size > max_hole_size) {
1588                         max_hole_start = search_start;
1589                         max_hole_size = hole_size;
1590                 }
1591         }
1592
1593         /* See above. */
1594         if (max_hole_size < num_bytes)
1595                 ret = -ENOSPC;
1596         else
1597                 ret = 0;
1598
1599 out:
1600         btrfs_free_path(path);
1601         *start = max_hole_start;
1602         if (len)
1603                 *len = max_hole_size;
1604         return ret;
1605 }
1606
1607 int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes,
1608                          u64 *start, u64 *len)
1609 {
1610         /* FIXME use last free of some kind */
1611         return find_free_dev_extent_start(device, num_bytes, 0, start, len);
1612 }
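
/*
 * Usage sketch (illustrative; stripe_size and the variable names are
 * assumptions): the chunk allocator asks each candidate device for a hole
 * big enough for one stripe:
 *
 *	u64 dev_offset, max_avail;
 *
 *	ret = find_free_dev_extent(device, stripe_size, &dev_offset,
 *				   &max_avail);
 *
 * On -ENOSPC, max_avail still holds the largest hole found, which lets the
 * caller rank or skip devices that are too full.
 */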
1613
1614 static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
1615                           struct btrfs_device *device,
1616                           u64 start, u64 *dev_extent_len)
1617 {
1618         struct btrfs_fs_info *fs_info = device->fs_info;
1619         struct btrfs_root *root = fs_info->dev_root;
1620         int ret;
1621         struct btrfs_path *path;
1622         struct btrfs_key key;
1623         struct btrfs_key found_key;
1624         struct extent_buffer *leaf = NULL;
1625         struct btrfs_dev_extent *extent = NULL;
1626
1627         path = btrfs_alloc_path();
1628         if (!path)
1629                 return -ENOMEM;
1630
1631         key.objectid = device->devid;
1632         key.offset = start;
1633         key.type = BTRFS_DEV_EXTENT_KEY;
1634 again:
1635         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1636         if (ret > 0) {
1637                 ret = btrfs_previous_item(root, path, key.objectid,
1638                                           BTRFS_DEV_EXTENT_KEY);
1639                 if (ret)
1640                         goto out;
1641                 leaf = path->nodes[0];
1642                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
1643                 extent = btrfs_item_ptr(leaf, path->slots[0],
1644                                         struct btrfs_dev_extent);
1645                 BUG_ON(found_key.offset > start || found_key.offset +
1646                        btrfs_dev_extent_length(leaf, extent) < start);
1647                 key = found_key;
1648                 btrfs_release_path(path);
1649                 goto again;
1650         } else if (ret == 0) {
1651                 leaf = path->nodes[0];
1652                 extent = btrfs_item_ptr(leaf, path->slots[0],
1653                                         struct btrfs_dev_extent);
1654         } else {
1655                 btrfs_handle_fs_error(fs_info, ret, "Slot search failed");
1656                 goto out;
1657         }
1658
1659         *dev_extent_len = btrfs_dev_extent_length(leaf, extent);
1660
1661         ret = btrfs_del_item(trans, root, path);
1662         if (ret) {
1663                 btrfs_handle_fs_error(fs_info, ret,
1664                                       "Failed to remove dev extent item");
1665         } else {
1666                 set_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags);
1667         }
1668 out:
1669         btrfs_free_path(path);
1670         return ret;
1671 }
1672
1673 static int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
1674                                   struct btrfs_device *device,
1675                                   u64 chunk_offset, u64 start, u64 num_bytes)
1676 {
1677         int ret;
1678         struct btrfs_path *path;
1679         struct btrfs_fs_info *fs_info = device->fs_info;
1680         struct btrfs_root *root = fs_info->dev_root;
1681         struct btrfs_dev_extent *extent;
1682         struct extent_buffer *leaf;
1683         struct btrfs_key key;
1684
1685         WARN_ON(!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state));
1686         WARN_ON(test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state));
1687         path = btrfs_alloc_path();
1688         if (!path)
1689                 return -ENOMEM;
1690
1691         key.objectid = device->devid;
1692         key.offset = start;
1693         key.type = BTRFS_DEV_EXTENT_KEY;
1694         ret = btrfs_insert_empty_item(trans, root, path, &key,
1695                                       sizeof(*extent));
1696         if (ret)
1697                 goto out;
1698
1699         leaf = path->nodes[0];
1700         extent = btrfs_item_ptr(leaf, path->slots[0],
1701                                 struct btrfs_dev_extent);
1702         btrfs_set_dev_extent_chunk_tree(leaf, extent,
1703                                         BTRFS_CHUNK_TREE_OBJECTID);
1704         btrfs_set_dev_extent_chunk_objectid(leaf, extent,
1705                                             BTRFS_FIRST_CHUNK_TREE_OBJECTID);
1706         btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset);
1707
1708         btrfs_set_dev_extent_length(leaf, extent, num_bytes);
1709         btrfs_mark_buffer_dirty(leaf);
1710 out:
1711         btrfs_free_path(path);
1712         return ret;
1713 }
1714
1715 static u64 find_next_chunk(struct btrfs_fs_info *fs_info)
1716 {
1717         struct extent_map_tree *em_tree;
1718         struct extent_map *em;
1719         struct rb_node *n;
1720         u64 ret = 0;
1721
1722         em_tree = &fs_info->mapping_tree;
1723         read_lock(&em_tree->lock);
1724         n = rb_last(&em_tree->map.rb_root);
1725         if (n) {
1726                 em = rb_entry(n, struct extent_map, rb_node);
1727                 ret = em->start + em->len;
1728         }
1729         read_unlock(&em_tree->lock);
1730
1731         return ret;
1732 }
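
/*
 * Worked example (hypothetical numbers): if the rightmost mapping in
 * fs_info->mapping_tree starts at 1G and is 256M long, the next chunk is
 * placed at 1G + 256M; with an empty tree the first chunk starts at 0.
 */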
1733
1734 static noinline int find_next_devid(struct btrfs_fs_info *fs_info,
1735                                     u64 *devid_ret)
1736 {
1737         int ret;
1738         struct btrfs_key key;
1739         struct btrfs_key found_key;
1740         struct btrfs_path *path;
1741
1742         path = btrfs_alloc_path();
1743         if (!path)
1744                 return -ENOMEM;
1745
1746         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1747         key.type = BTRFS_DEV_ITEM_KEY;
1748         key.offset = (u64)-1;
1749
1750         ret = btrfs_search_slot(NULL, fs_info->chunk_root, &key, path, 0, 0);
1751         if (ret < 0)
1752                 goto error;
1753
1754         if (ret == 0) {
1755                 /* Corruption */
1756                 btrfs_err(fs_info, "corrupted chunk tree devid -1 matched");
1757                 ret = -EUCLEAN;
1758                 goto error;
1759         }
1760
1761         ret = btrfs_previous_item(fs_info->chunk_root, path,
1762                                   BTRFS_DEV_ITEMS_OBJECTID,
1763                                   BTRFS_DEV_ITEM_KEY);
1764         if (ret) {
1765                 *devid_ret = 1;
1766         } else {
1767                 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
1768                                       path->slots[0]);
1769                 *devid_ret = found_key.offset + 1;
1770         }
1771         ret = 0;
1772 error:
1773         btrfs_free_path(path);
1774         return ret;
1775 }
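
/*
 * Worked example (hypothetical numbers): the search key of (DEV_ITEMS,
 * DEV_ITEM, -1) lands just past the last device item, so
 * btrfs_previous_item() steps back onto it. With device items for devids
 * 1, 2 and 5, the last key offset is 5 and *devid_ret becomes 6; with no
 * device items at all, *devid_ret defaults to 1.
 */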
1776
1777 /*
1778  * the device information is stored in the chunk root
1779  * the btrfs_device struct should be fully filled in
1780  */
1781 static int btrfs_add_dev_item(struct btrfs_trans_handle *trans,
1782                             struct btrfs_device *device)
1783 {
1784         int ret;
1785         struct btrfs_path *path;
1786         struct btrfs_dev_item *dev_item;
1787         struct extent_buffer *leaf;
1788         struct btrfs_key key;
1789         unsigned long ptr;
1790
1791         path = btrfs_alloc_path();
1792         if (!path)
1793                 return -ENOMEM;
1794
1795         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1796         key.type = BTRFS_DEV_ITEM_KEY;
1797         key.offset = device->devid;
1798
1799         ret = btrfs_insert_empty_item(trans, trans->fs_info->chunk_root, path,
1800                                       &key, sizeof(*dev_item));
1801         if (ret)
1802                 goto out;
1803
1804         leaf = path->nodes[0];
1805         dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);
1806
1807         btrfs_set_device_id(leaf, dev_item, device->devid);
1808         btrfs_set_device_generation(leaf, dev_item, 0);
1809         btrfs_set_device_type(leaf, dev_item, device->type);
1810         btrfs_set_device_io_align(leaf, dev_item, device->io_align);
1811         btrfs_set_device_io_width(leaf, dev_item, device->io_width);
1812         btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
1813         btrfs_set_device_total_bytes(leaf, dev_item,
1814                                      btrfs_device_get_disk_total_bytes(device));
1815         btrfs_set_device_bytes_used(leaf, dev_item,
1816                                     btrfs_device_get_bytes_used(device));
1817         btrfs_set_device_group(leaf, dev_item, 0);
1818         btrfs_set_device_seek_speed(leaf, dev_item, 0);
1819         btrfs_set_device_bandwidth(leaf, dev_item, 0);
1820         btrfs_set_device_start_offset(leaf, dev_item, 0);
1821
1822         ptr = btrfs_device_uuid(dev_item);
1823         write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
1824         ptr = btrfs_device_fsid(dev_item);
1825         write_extent_buffer(leaf, trans->fs_info->fs_devices->metadata_uuid,
1826                             ptr, BTRFS_FSID_SIZE);
1827         btrfs_mark_buffer_dirty(leaf);
1828
1829         ret = 0;
1830 out:
1831         btrfs_free_path(path);
1832         return ret;
1833 }
1834
1835 /*
1836  * Function to update ctime/mtime for a given device path.
1837  * Mainly used for ctime/mtime based probe like libblkid.
1838  */
1839 static void update_dev_time(const char *path_name)
1840 {
1841         struct file *filp;
1842
1843         filp = filp_open(path_name, O_RDWR, 0);
1844         if (IS_ERR(filp))
1845                 return;
1846         file_update_time(filp);
1847         filp_close(filp, NULL);
1848 }
1849
1850 static int btrfs_rm_dev_item(struct btrfs_device *device)
1851 {
1852         struct btrfs_root *root = device->fs_info->chunk_root;
1853         int ret;
1854         struct btrfs_path *path;
1855         struct btrfs_key key;
1856         struct btrfs_trans_handle *trans;
1857
1858         path = btrfs_alloc_path();
1859         if (!path)
1860                 return -ENOMEM;
1861
1862         trans = btrfs_start_transaction(root, 0);
1863         if (IS_ERR(trans)) {
1864                 btrfs_free_path(path);
1865                 return PTR_ERR(trans);
1866         }
1867         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1868         key.type = BTRFS_DEV_ITEM_KEY;
1869         key.offset = device->devid;
1870
1871         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1872         if (ret) {
1873                 if (ret > 0)
1874                         ret = -ENOENT;
1875                 btrfs_abort_transaction(trans, ret);
1876                 btrfs_end_transaction(trans);
1877                 goto out;
1878         }
1879
1880         ret = btrfs_del_item(trans, root, path);
1881         if (ret) {
1882                 btrfs_abort_transaction(trans, ret);
1883                 btrfs_end_transaction(trans);
1884         }
1885
1886 out:
1887         btrfs_free_path(path);
1888         if (!ret)
1889                 ret = btrfs_commit_transaction(trans);
1890         return ret;
1891 }
1892
1893 /*
1894  * Verify that @num_devices satisfies the RAID profile constraints in the whole
1895  * filesystem. It's up to the caller to adjust that number for, e.g., device
1896  * replace.
1897  */
1898 static int btrfs_check_raid_min_devices(struct btrfs_fs_info *fs_info,
1899                 u64 num_devices)
1900 {
1901         u64 all_avail;
1902         unsigned seq;
1903         int i;
1904
1905         do {
1906                 seq = read_seqbegin(&fs_info->profiles_lock);
1907
1908                 all_avail = fs_info->avail_data_alloc_bits |
1909                             fs_info->avail_system_alloc_bits |
1910                             fs_info->avail_metadata_alloc_bits;
1911         } while (read_seqretry(&fs_info->profiles_lock, seq));
1912
1913         for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
1914                 if (!(all_avail & btrfs_raid_array[i].bg_flag))
1915                         continue;
1916
1917                 if (num_devices < btrfs_raid_array[i].devs_min) {
1918                         int ret = btrfs_raid_array[i].mindev_error;
1919
1920                         if (ret)
1921                                 return ret;
1922                 }
1923         }
1924
1925         return 0;
1926 }
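
/*
 * Worked example (illustrative): if any allocated profile's entry in
 * btrfs_raid_array has devs_min = 4, then trying to drop below four devices
 * (e.g. btrfs_rm_device() passing num_devices - 1) returns that profile's
 * mindev_error instead of 0.
 */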
1927
1928 static struct btrfs_device *btrfs_find_next_active_device(
1929                 struct btrfs_fs_devices *fs_devs, struct btrfs_device *device)
1930 {
1931         struct btrfs_device *next_device;
1932
1933         list_for_each_entry(next_device, &fs_devs->devices, dev_list) {
1934                 if (next_device != device &&
1935                     !test_bit(BTRFS_DEV_STATE_MISSING, &next_device->dev_state)
1936                     && next_device->bdev)
1937                         return next_device;
1938         }
1939
1940         return NULL;
1941 }
1942
1943 /*
1944  * Helper function to check if the given device is part of s_bdev / latest_bdev
1945  * and replace it with the provided or the next active device. In the context
1946  * where this function is called, there should always be another device (or
1947  * this_dev) which is active.
1948  */
1949 void __cold btrfs_assign_next_active_device(struct btrfs_device *device,
1950                                      struct btrfs_device *this_dev)
1951 {
1952         struct btrfs_fs_info *fs_info = device->fs_info;
1953         struct btrfs_device *next_device;
1954
1955         if (this_dev)
1956                 next_device = this_dev;
1957         else
1958                 next_device = btrfs_find_next_active_device(fs_info->fs_devices,
1959                                                                 device);
1960         ASSERT(next_device);
1961
1962         if (fs_info->sb->s_bdev &&
1963             (fs_info->sb->s_bdev == device->bdev))
1964                 fs_info->sb->s_bdev = next_device->bdev;
1965
1966         if (fs_info->fs_devices->latest_bdev == device->bdev)
1967                 fs_info->fs_devices->latest_bdev = next_device->bdev;
1968 }
1969
1970 /*
1971  * Return btrfs_fs_devices::num_devices excluding the device that's being
1972  * currently replaced.
1973  */
1974 static u64 btrfs_num_devices(struct btrfs_fs_info *fs_info)
1975 {
1976         u64 num_devices = fs_info->fs_devices->num_devices;
1977
1978         down_read(&fs_info->dev_replace.rwsem);
1979         if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) {
1980                 ASSERT(num_devices > 1);
1981                 num_devices--;
1982         }
1983         up_read(&fs_info->dev_replace.rwsem);
1984
1985         return num_devices;
1986 }
1987
1988 static void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info,
1989                                       struct block_device *bdev,
1990                                       const char *device_path)
1991 {
1992         struct btrfs_super_block *disk_super;
1993         int copy_num;
1994
1995         if (!bdev)
1996                 return;
1997
1998         for (copy_num = 0; copy_num < BTRFS_SUPER_MIRROR_MAX; copy_num++) {
1999                 struct page *page;
2000                 int ret;
2001
2002                 disk_super = btrfs_read_dev_one_super(bdev, copy_num);
2003                 if (IS_ERR(disk_super))
2004                         continue;
2005
2006                 memset(&disk_super->magic, 0, sizeof(disk_super->magic));
2007
2008                 page = virt_to_page(disk_super);
2009                 set_page_dirty(page);
2010                 lock_page(page);
2011                 /* write_one_page() unlocks the page */
2012                 ret = write_one_page(page);
2013                 if (ret)
2014                         btrfs_warn(fs_info,
2015                                 "error clearing superblock number %d (%d)",
2016                                 copy_num, ret);
2017                 btrfs_release_disk_super(disk_super);
2018
2019         }
2020
2021         /* Notify udev that device has changed */
2022         btrfs_kobject_uevent(bdev, KOBJ_CHANGE);
2023
2024         /* Update ctime/mtime for device path for libblkid */
2025         update_dev_time(device_path);
2026 }
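
/*
 * Note (illustrative): only the magic bytes are zeroed, in each superblock
 * copy present on the device (the mirrors live at 64K, 64M and 256G when
 * the device is large enough). That is enough for probes like libblkid,
 * and for the kernel's own scan, to stop recognizing the device as btrfs,
 * while leaving the rest of each superblock intact for debugging.
 */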
2027
2028 int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
2029                 u64 devid)
2030 {
2031         struct btrfs_device *device;
2032         struct btrfs_fs_devices *cur_devices;
2033         struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
2034         u64 num_devices;
2035         int ret = 0;
2036
2037         mutex_lock(&uuid_mutex);
2038
2039         num_devices = btrfs_num_devices(fs_info);
2040
2041         ret = btrfs_check_raid_min_devices(fs_info, num_devices - 1);
2042         if (ret)
2043                 goto out;
2044
2045         device = btrfs_find_device_by_devspec(fs_info, devid, device_path);
2046
2047         if (IS_ERR(device)) {
2048                 if (PTR_ERR(device) == -ENOENT &&
2049                     device_path && strcmp(device_path, "missing") == 0)
2050                         ret = BTRFS_ERROR_DEV_MISSING_NOT_FOUND;
2051                 else
2052                         ret = PTR_ERR(device);
2053                 goto out;
2054         }
2055
2056         if (btrfs_pinned_by_swapfile(fs_info, device)) {
2057                 btrfs_warn_in_rcu(fs_info,
2058                   "cannot remove device %s (devid %llu) due to active swapfile",
2059                                   rcu_str_deref(device->name), device->devid);
2060                 ret = -ETXTBSY;
2061                 goto out;
2062         }
2063
2064         if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
2065                 ret = BTRFS_ERROR_DEV_TGT_REPLACE;
2066                 goto out;
2067         }
2068
2069         if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
2070             fs_info->fs_devices->rw_devices == 1) {
2071                 ret = BTRFS_ERROR_DEV_ONLY_WRITABLE;
2072                 goto out;
2073         }
2074
2075         if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
2076                 mutex_lock(&fs_info->chunk_mutex);
2077                 list_del_init(&device->dev_alloc_list);
2078                 device->fs_devices->rw_devices--;
2079                 mutex_unlock(&fs_info->chunk_mutex);
2080         }
2081
2082         mutex_unlock(&uuid_mutex);
2083         ret = btrfs_shrink_device(device, 0);
2084         mutex_lock(&uuid_mutex);
2085         if (ret)
2086                 goto error_undo;
2087
2088         /*
2089          * TODO: the superblock still includes this device in its num_devices
2090          * counter although write_all_supers() is not locked out. This
2091          * could give a filesystem state which requires a degraded mount.
2092          */
2093         ret = btrfs_rm_dev_item(device);
2094         if (ret)
2095                 goto error_undo;
2096
2097         clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
2098         btrfs_scrub_cancel_dev(device);
2099
2100         /*
2101          * the device list mutex makes sure that we don't change
2102          * the device list while someone else is writing out all
2103          * the device supers. Whoever is writing all supers, should
2104          * lock the device list mutex before getting the number of
2105          * devices in the super block (super_copy). Conversely,
2106          * whoever updates the number of devices in the super block
2107          * (super_copy) should hold the device list mutex.
2108          */
2109
2110         /*
2111          * In normal cases cur_devices == fs_devices. But when deleting
2112          * a seed device, cur_devices should point to the seed's own
2113          * fs_devices, listed under fs_devices->seed.
2114          */
2115         cur_devices = device->fs_devices;
2116         mutex_lock(&fs_devices->device_list_mutex);
2117         list_del_rcu(&device->dev_list);
2118
2119         cur_devices->num_devices--;
2120         cur_devices->total_devices--;
2121         /* Update total_devices of the parent fs_devices if it's seed */
2122         if (cur_devices != fs_devices)
2123                 fs_devices->total_devices--;
2124
2125         if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
2126                 cur_devices->missing_devices--;
2127
2128         btrfs_assign_next_active_device(device, NULL);
2129
2130         if (device->bdev) {
2131                 cur_devices->open_devices--;
2132                 /* remove sysfs entry */
2133                 btrfs_sysfs_remove_devices_dir(fs_devices, device);
2134         }
2135
2136         num_devices = btrfs_super_num_devices(fs_info->super_copy) - 1;
2137         btrfs_set_super_num_devices(fs_info->super_copy, num_devices);
2138         mutex_unlock(&fs_devices->device_list_mutex);
2139
2140         /*
2141          * at this point, the device is zero sized and detached from
2142          * the devices list.  All that's left is to zero out the old
2143          * supers and free the device.
2144          */
2145         if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
2146                 btrfs_scratch_superblocks(fs_info, device->bdev,
2147                                           device->name->str);
2148
2149         btrfs_close_bdev(device);
2150         synchronize_rcu();
2151         btrfs_free_device(device);
2152
2153         if (cur_devices->open_devices == 0) {
2154                 while (fs_devices) {
2155                         if (fs_devices->seed == cur_devices) {
2156                                 fs_devices->seed = cur_devices->seed;
2157                                 break;
2158                         }
2159                         fs_devices = fs_devices->seed;
2160                 }
2161                 cur_devices->seed = NULL;
2162                 close_fs_devices(cur_devices);
2163                 free_fs_devices(cur_devices);
2164         }
2165
2166 out:
2167         mutex_unlock(&uuid_mutex);
2168         return ret;
2169
2170 error_undo:
2171         if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
2172                 mutex_lock(&fs_info->chunk_mutex);
2173                 list_add(&device->dev_alloc_list,
2174                          &fs_devices->alloc_list);
2175                 device->fs_devices->rw_devices++;
2176                 mutex_unlock(&fs_info->chunk_mutex);
2177         }
2178         goto out;
2179 }
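
/*
 * Usage sketch (illustrative): callers resolve the user's device spec and
 * hand it over in one of three forms:
 *
 *	ret = btrfs_rm_device(fs_info, "/dev/sdc", 0);	// by path
 *	ret = btrfs_rm_device(fs_info, "missing", 0);	// first missing device
 *	ret = btrfs_rm_device(fs_info, NULL, devid);	// by device id
 */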
2180
2181 void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_device *srcdev)
2182 {
2183         struct btrfs_fs_devices *fs_devices;
2184
2185         lockdep_assert_held(&srcdev->fs_info->fs_devices->device_list_mutex);
2186
2187         /*
2188          * In the case of an fs with no seed, srcdev->fs_devices will point
2189          * to the fs_devices of fs_info. However, when the dev being replaced
2190          * is a seed dev it will point to the seed's local fs_devices. In
2191          * short, srcdev will have its correct fs_devices in both cases.
2192          */
2193         fs_devices = srcdev->fs_devices;
2194
2195         list_del_rcu(&srcdev->dev_list);
2196         list_del(&srcdev->dev_alloc_list);
2197         fs_devices->num_devices--;
2198         if (test_bit(BTRFS_DEV_STATE_MISSING, &srcdev->dev_state))
2199                 fs_devices->missing_devices--;
2200
2201         if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &srcdev->dev_state))
2202                 fs_devices->rw_devices--;
2203
2204         if (srcdev->bdev)
2205                 fs_devices->open_devices--;
2206 }
2207
2208 void btrfs_rm_dev_replace_free_srcdev(struct btrfs_device *srcdev)
2209 {
2210         struct btrfs_fs_info *fs_info = srcdev->fs_info;
2211         struct btrfs_fs_devices *fs_devices = srcdev->fs_devices;
2212
2213         if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &srcdev->dev_state)) {
2214                 /* zero out the old super if it is writable */
2215                 btrfs_scratch_superblocks(fs_info, srcdev->bdev,
2216                                           srcdev->name->str);
2217         }
2218
2219         btrfs_close_bdev(srcdev);
2220         synchronize_rcu();
2221         btrfs_free_device(srcdev);
2222
2223         /* If there are no devices left we'd rather delete the fs_devices */
2224         if (!fs_devices->num_devices) {
2225                 struct btrfs_fs_devices *tmp_fs_devices;
2226
2227                 /*
2228                  * On a mounted FS, num_devices can't be zero unless it's a
2229                  * seed. In case of a seed device being replaced, the replace
2230                  * target is added to the sprout FS, so there will be no
2231                  * devices left under the seed FS.
2232                  */
2233                 ASSERT(fs_devices->seeding);
2234
2235                 tmp_fs_devices = fs_info->fs_devices;
2236                 while (tmp_fs_devices) {
2237                         if (tmp_fs_devices->seed == fs_devices) {
2238                                 tmp_fs_devices->seed = fs_devices->seed;
2239                                 break;
2240                         }
2241                         tmp_fs_devices = tmp_fs_devices->seed;
2242                 }
2243                 fs_devices->seed = NULL;
2244                 close_fs_devices(fs_devices);
2245                 free_fs_devices(fs_devices);
2246         }
2247 }
2248
2249 void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev)
2250 {
2251         struct btrfs_fs_devices *fs_devices = tgtdev->fs_info->fs_devices;
2252
2253         mutex_lock(&fs_devices->device_list_mutex);
2254
2255         btrfs_sysfs_remove_devices_dir(fs_devices, tgtdev);
2256
2257         if (tgtdev->bdev)
2258                 fs_devices->open_devices--;
2259
2260         fs_devices->num_devices--;
2261
2262         btrfs_assign_next_active_device(tgtdev, NULL);
2263
2264         list_del_rcu(&tgtdev->dev_list);
2265
2266         mutex_unlock(&fs_devices->device_list_mutex);
2267
2268         /*
2269          * The update_dev_time() within btrfs_scratch_superblocks()
2270          * may lead to a call to btrfs_show_devname() which will try
2271          * to hold device_list_mutex. Here this device is already out
2272          * of the device list, so we don't have to hold the
2273          * device_list_mutex lock.
2274          */
2275         btrfs_scratch_superblocks(tgtdev->fs_info, tgtdev->bdev,
2276                                   tgtdev->name->str);
2277
2278         btrfs_close_bdev(tgtdev);
2279         synchronize_rcu();
2280         btrfs_free_device(tgtdev);
2281 }
2282
2283 static struct btrfs_device *btrfs_find_device_by_path(
2284                 struct btrfs_fs_info *fs_info, const char *device_path)
2285 {
2286         int ret = 0;
2287         struct btrfs_super_block *disk_super;
2288         u64 devid;
2289         u8 *dev_uuid;
2290         struct block_device *bdev;
2291         struct btrfs_device *device;
2292
2293         ret = btrfs_get_bdev_and_sb(device_path, FMODE_READ,
2294                                     fs_info->bdev_holder, 0, &bdev, &disk_super);
2295         if (ret)
2296                 return ERR_PTR(ret);
2297
2298         devid = btrfs_stack_device_id(&disk_super->dev_item);
2299         dev_uuid = disk_super->dev_item.uuid;
2300         if (btrfs_fs_incompat(fs_info, METADATA_UUID))
2301                 device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid,
2302                                            disk_super->metadata_uuid, true);
2303         else
2304                 device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid,
2305                                            disk_super->fsid, true);
2306
2307         btrfs_release_disk_super(disk_super);
2308         if (!device)
2309                 device = ERR_PTR(-ENOENT);
2310         blkdev_put(bdev, FMODE_READ);
2311         return device;
2312 }
2313
2314 /*
2315  * Lookup a device given by device id, or the path if the id is 0.
2316  */
2317 struct btrfs_device *btrfs_find_device_by_devspec(
2318                 struct btrfs_fs_info *fs_info, u64 devid,
2319                 const char *device_path)
2320 {
2321         struct btrfs_device *device;
2322
2323         if (devid) {
2324                 device = btrfs_find_device(fs_info->fs_devices, devid, NULL,
2325                                            NULL, true);
2326                 if (!device)
2327                         return ERR_PTR(-ENOENT);
2328                 return device;
2329         }
2330
2331         if (!device_path || !device_path[0])
2332                 return ERR_PTR(-EINVAL);
2333
2334         if (strcmp(device_path, "missing") == 0) {
2335                 /* Find first missing device */
2336                 list_for_each_entry(device, &fs_info->fs_devices->devices,
2337                                     dev_list) {
2338                         if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
2339                                      &device->dev_state) && !device->bdev)
2340                                 return device;
2341                 }
2342                 return ERR_PTR(-ENOENT);
2343         }
2344
2345         return btrfs_find_device_by_path(fs_info, device_path);
2346 }
2347
2348 /*
2349  * Does all the dirty work required for changing the file system's UUID.
2350  */
2351 static int btrfs_prepare_sprout(struct btrfs_fs_info *fs_info)
2352 {
2353         struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
2354         struct btrfs_fs_devices *old_devices;
2355         struct btrfs_fs_devices *seed_devices;
2356         struct btrfs_super_block *disk_super = fs_info->super_copy;
2357         struct btrfs_device *device;
2358         u64 super_flags;
2359
2360         lockdep_assert_held(&uuid_mutex);
2361         if (!fs_devices->seeding)
2362                 return -EINVAL;
2363
2364         seed_devices = alloc_fs_devices(NULL, NULL);
2365         if (IS_ERR(seed_devices))
2366                 return PTR_ERR(seed_devices);
2367
2368         old_devices = clone_fs_devices(fs_devices);
2369         if (IS_ERR(old_devices)) {
2370                 kfree(seed_devices);
2371                 return PTR_ERR(old_devices);
2372         }
2373
2374         list_add(&old_devices->fs_list, &fs_uuids);
2375
2376         memcpy(seed_devices, fs_devices, sizeof(*seed_devices));
2377         seed_devices->opened = 1;
2378         INIT_LIST_HEAD(&seed_devices->devices);
2379         INIT_LIST_HEAD(&seed_devices->alloc_list);
2380         mutex_init(&seed_devices->device_list_mutex);
2381
2382         mutex_lock(&fs_devices->device_list_mutex);
2383         list_splice_init_rcu(&fs_devices->devices, &seed_devices->devices,
2384                               synchronize_rcu);
2385         list_for_each_entry(device, &seed_devices->devices, dev_list)
2386                 device->fs_devices = seed_devices;
2387
2388         mutex_lock(&fs_info->chunk_mutex);
2389         list_splice_init(&fs_devices->alloc_list, &seed_devices->alloc_list);
2390         mutex_unlock(&fs_info->chunk_mutex);
2391
2392         fs_devices->seeding = false;
2393         fs_devices->num_devices = 0;
2394         fs_devices->open_devices = 0;
2395         fs_devices->missing_devices = 0;
2396         fs_devices->rotating = false;
2397         fs_devices->seed = seed_devices;
2398
2399         generate_random_uuid(fs_devices->fsid);
2400         memcpy(fs_devices->metadata_uuid, fs_devices->fsid, BTRFS_FSID_SIZE);
2401         memcpy(disk_super->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
2402         mutex_unlock(&fs_devices->device_list_mutex);
2403
2404         super_flags = btrfs_super_flags(disk_super) &
2405                       ~BTRFS_SUPER_FLAG_SEEDING;
2406         btrfs_set_super_flags(disk_super, super_flags);
2407
2408         return 0;
2409 }
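
/*
 * Background note (illustrative): sprouting happens when a writable device
 * is added to a mounted, read-only seed filesystem. The old devices move to
 * a new "seed" fs_devices, the mounted filesystem gets a fresh fsid, and
 * the SEEDING flag is dropped from the superblock, so further writes land
 * only on the new device(s). From userspace this is typically:
 *
 *	mount /dev/seed /mnt		# seed device, forced read-only
 *	btrfs device add /dev/new /mnt
 *	mount -o remount,rw /mnt
 */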
2410
2411 /*
2412  * Store the expected generation for seed devices in device items.
2413  */
2414 static int btrfs_finish_sprout(struct btrfs_trans_handle *trans)
2415 {
2416         struct btrfs_fs_info *fs_info = trans->fs_info;
2417         struct btrfs_root *root = fs_info->chunk_root;
2418         struct btrfs_path *path;
2419         struct extent_buffer *leaf;
2420         struct btrfs_dev_item *dev_item;
2421         struct btrfs_device *device;
2422         struct btrfs_key key;
2423         u8 fs_uuid[BTRFS_FSID_SIZE];
2424         u8 dev_uuid[BTRFS_UUID_SIZE];
2425         u64 devid;
2426         int ret;
2427
2428         path = btrfs_alloc_path();
2429         if (!path)
2430                 return -ENOMEM;
2431
2432         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
2433         key.offset = 0;
2434         key.type = BTRFS_DEV_ITEM_KEY;
2435
2436         while (1) {
2437                 ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
2438                 if (ret < 0)
2439                         goto error;
2440
2441                 leaf = path->nodes[0];
2442 next_slot:
2443                 if (path->slots[0] >= btrfs_header_nritems(leaf)) {
2444                         ret = btrfs_next_leaf(root, path);
2445                         if (ret > 0)
2446                                 break;
2447                         if (ret < 0)
2448                                 goto error;
2449                         leaf = path->nodes[0];
2450                         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
2451                         btrfs_release_path(path);
2452                         continue;
2453                 }
2454
2455                 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
2456                 if (key.objectid != BTRFS_DEV_ITEMS_OBJECTID ||
2457                     key.type != BTRFS_DEV_ITEM_KEY)
2458                         break;
2459
2460                 dev_item = btrfs_item_ptr(leaf, path->slots[0],
2461                                           struct btrfs_dev_item);
2462                 devid = btrfs_device_id(leaf, dev_item);
2463                 read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item),
2464                                    BTRFS_UUID_SIZE);
2465                 read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item),
2466                                    BTRFS_FSID_SIZE);
2467                 device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid,
2468                                            fs_uuid, true);
2469                 BUG_ON(!device); /* Logic error */
2470
2471                 if (device->fs_devices->seeding) {
2472                         btrfs_set_device_generation(leaf, dev_item,
2473                                                     device->generation);
2474                         btrfs_mark_buffer_dirty(leaf);
2475                 }
2476
2477                 path->slots[0]++;
2478                 goto next_slot;
2479         }
2480         ret = 0;
2481 error:
2482         btrfs_free_path(path);
2483         return ret;
2484 }
2485
2486 int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path)
2487 {
2488         struct btrfs_root *root = fs_info->dev_root;
2489         struct request_queue *q;
2490         struct btrfs_trans_handle *trans;
2491         struct btrfs_device *device;
2492         struct block_device *bdev;
2493         struct super_block *sb = fs_info->sb;
2494         struct rcu_string *name;
2495         struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
2496         u64 orig_super_total_bytes;
2497         u64 orig_super_num_devices;
2498         int seeding_dev = 0;
2499         int ret = 0;
2500         bool unlocked = false;
2501
2502         if (sb_rdonly(sb) && !fs_devices->seeding)
2503                 return -EROFS;
2504
2505         bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL,
2506                                   fs_info->bdev_holder);
2507         if (IS_ERR(bdev))
2508                 return PTR_ERR(bdev);
2509
2510         if (fs_devices->seeding) {
2511                 seeding_dev = 1;
2512                 down_write(&sb->s_umount);
2513                 mutex_lock(&uuid_mutex);
2514         }
2515
2516         filemap_write_and_wait(bdev->bd_inode->i_mapping);
2517
2518         mutex_lock(&fs_devices->device_list_mutex);
2519         list_for_each_entry(device, &fs_devices->devices, dev_list) {
2520                 if (device->bdev == bdev) {
2521                         ret = -EEXIST;
2522                         mutex_unlock(
2523                                 &fs_devices->device_list_mutex);
2524                         goto error;
2525                 }
2526         }
2527         mutex_unlock(&fs_devices->device_list_mutex);
2528
2529         device = btrfs_alloc_device(fs_info, NULL, NULL);
2530         if (IS_ERR(device)) {
2531                 /* we can safely leave the fs_devices entry around */
2532                 ret = PTR_ERR(device);
2533                 goto error;
2534         }
2535
2536         name = rcu_string_strdup(device_path, GFP_KERNEL);
2537         if (!name) {
2538                 ret = -ENOMEM;
2539                 goto error_free_device;
2540         }
2541         rcu_assign_pointer(device->name, name);
2542
2543         trans = btrfs_start_transaction(root, 0);
2544         if (IS_ERR(trans)) {
2545                 ret = PTR_ERR(trans);
2546                 goto error_free_device;
2547         }
2548
2549         q = bdev_get_queue(bdev);
2550         set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
2551         device->generation = trans->transid;
2552         device->io_width = fs_info->sectorsize;
2553         device->io_align = fs_info->sectorsize;
2554         device->sector_size = fs_info->sectorsize;
2555         device->total_bytes = round_down(i_size_read(bdev->bd_inode),
2556                                          fs_info->sectorsize);
2557         device->disk_total_bytes = device->total_bytes;
2558         device->commit_total_bytes = device->total_bytes;
2559         device->fs_info = fs_info;
2560         device->bdev = bdev;
2561         set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
2562         clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
2563         device->mode = FMODE_EXCL;
2564         device->dev_stats_valid = 1;
2565         set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE);
2566
2567         if (seeding_dev) {
2568                 sb->s_flags &= ~SB_RDONLY;
2569                 ret = btrfs_prepare_sprout(fs_info);
2570                 if (ret) {
2571                         btrfs_abort_transaction(trans, ret);
2572                         goto error_trans;
2573                 }
2574         }
2575
2576         device->fs_devices = fs_devices;
2577
2578         mutex_lock(&fs_devices->device_list_mutex);
2579         mutex_lock(&fs_info->chunk_mutex);
2580         list_add_rcu(&device->dev_list, &fs_devices->devices);
2581         list_add(&device->dev_alloc_list, &fs_devices->alloc_list);
2582         fs_devices->num_devices++;
2583         fs_devices->open_devices++;
2584         fs_devices->rw_devices++;
2585         fs_devices->total_devices++;
2586         fs_devices->total_rw_bytes += device->total_bytes;
2587
2588         atomic64_add(device->total_bytes, &fs_info->free_chunk_space);
2589
2590         if (!blk_queue_nonrot(q))
2591                 fs_devices->rotating = true;
2592
2593         orig_super_total_bytes = btrfs_super_total_bytes(fs_info->super_copy);
2594         btrfs_set_super_total_bytes(fs_info->super_copy,
2595                 round_down(orig_super_total_bytes + device->total_bytes,
2596                            fs_info->sectorsize));
2597
2598         orig_super_num_devices = btrfs_super_num_devices(fs_info->super_copy);
2599         btrfs_set_super_num_devices(fs_info->super_copy,
2600                                     orig_super_num_devices + 1);
2601
2602         /* add sysfs device entry */
2603         btrfs_sysfs_add_devices_dir(fs_devices, device);
2604
2605         /*
2606          * we've got more storage, clear any full flags on the space
2607          * infos
2608          */
2609         btrfs_clear_space_info_full(fs_info);
2610
2611         mutex_unlock(&fs_info->chunk_mutex);
2612         mutex_unlock(&fs_devices->device_list_mutex);
2613
2614         if (seeding_dev) {
2615                 mutex_lock(&fs_info->chunk_mutex);
2616                 ret = init_first_rw_device(trans);
2617                 mutex_unlock(&fs_info->chunk_mutex);
2618                 if (ret) {
2619                         btrfs_abort_transaction(trans, ret);
2620                         goto error_sysfs;
2621                 }
2622         }
2623
2624         ret = btrfs_add_dev_item(trans, device);
2625         if (ret) {
2626                 btrfs_abort_transaction(trans, ret);
2627                 goto error_sysfs;
2628         }
2629
2630         if (seeding_dev) {
2631                 ret = btrfs_finish_sprout(trans);
2632                 if (ret) {
2633                         btrfs_abort_transaction(trans, ret);
2634                         goto error_sysfs;
2635                 }
2636
2637                 btrfs_sysfs_update_sprout_fsid(fs_devices,
2638                                 fs_info->fs_devices->fsid);
2639         }
2640
2641         ret = btrfs_commit_transaction(trans);
2642
2643         if (seeding_dev) {
2644                 mutex_unlock(&uuid_mutex);
2645                 up_write(&sb->s_umount);
2646                 unlocked = true;
2647
2648                 if (ret) /* transaction commit */
2649                         return ret;
2650
2651                 ret = btrfs_relocate_sys_chunks(fs_info);
2652                 if (ret < 0)
2653                         btrfs_handle_fs_error(fs_info, ret,
2654                                     "Failed to relocate sys chunks after device initialization. This can be fixed using the \"btrfs balance\" command.");
2655                 trans = btrfs_attach_transaction(root);
2656                 if (IS_ERR(trans)) {
2657                         if (PTR_ERR(trans) == -ENOENT)
2658                                 return 0;
2659                         ret = PTR_ERR(trans);
2660                         trans = NULL;
2661                         goto error_sysfs;
2662                 }
2663                 ret = btrfs_commit_transaction(trans);
2664         }
2665
2666         /* Update ctime/mtime for libblkid */
2667         update_dev_time(device_path);
2668         return ret;
2669
2670 error_sysfs:
2671         btrfs_sysfs_remove_devices_dir(fs_devices, device);
2672         mutex_lock(&fs_info->fs_devices->device_list_mutex);
2673         mutex_lock(&fs_info->chunk_mutex);
2674         list_del_rcu(&device->dev_list);
2675         list_del(&device->dev_alloc_list);
2676         fs_info->fs_devices->num_devices--;
2677         fs_info->fs_devices->open_devices--;
2678         fs_info->fs_devices->rw_devices--;
2679         fs_info->fs_devices->total_devices--;
2680         fs_info->fs_devices->total_rw_bytes -= device->total_bytes;
2681         atomic64_sub(device->total_bytes, &fs_info->free_chunk_space);
2682         btrfs_set_super_total_bytes(fs_info->super_copy,
2683                                     orig_super_total_bytes);
2684         btrfs_set_super_num_devices(fs_info->super_copy,
2685                                     orig_super_num_devices);
2686         mutex_unlock(&fs_info->chunk_mutex);
2687         mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2688 error_trans:
2689         if (seeding_dev)
2690                 sb->s_flags |= SB_RDONLY;
2691         if (trans)
2692                 btrfs_end_transaction(trans);
2693 error_free_device:
2694         btrfs_free_device(device);
2695 error:
2696         blkdev_put(bdev, FMODE_EXCL);
2697         if (seeding_dev && !unlocked) {
2698                 mutex_unlock(&uuid_mutex);
2699                 up_write(&sb->s_umount);
2700         }
2701         return ret;
2702 }
2703
2704 static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
2705                                         struct btrfs_device *device)
2706 {
2707         int ret;
2708         struct btrfs_path *path;
2709         struct btrfs_root *root = device->fs_info->chunk_root;
2710         struct btrfs_dev_item *dev_item;
2711         struct extent_buffer *leaf;
2712         struct btrfs_key key;
2713
2714         path = btrfs_alloc_path();
2715         if (!path)
2716                 return -ENOMEM;
2717
2718         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
2719         key.type = BTRFS_DEV_ITEM_KEY;
2720         key.offset = device->devid;
2721
2722         ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
2723         if (ret < 0)
2724                 goto out;
2725
2726         if (ret > 0) {
2727                 ret = -ENOENT;
2728                 goto out;
2729         }
2730
2731         leaf = path->nodes[0];
2732         dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);
2733
2734         btrfs_set_device_id(leaf, dev_item, device->devid);
2735         btrfs_set_device_type(leaf, dev_item, device->type);
2736         btrfs_set_device_io_align(leaf, dev_item, device->io_align);
2737         btrfs_set_device_io_width(leaf, dev_item, device->io_width);
2738         btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
2739         btrfs_set_device_total_bytes(leaf, dev_item,
2740                                      btrfs_device_get_disk_total_bytes(device));
2741         btrfs_set_device_bytes_used(leaf, dev_item,
2742                                     btrfs_device_get_bytes_used(device));
2743         btrfs_mark_buffer_dirty(leaf);
2744
2745 out:
2746         btrfs_free_path(path);
2747         return ret;
2748 }
2749
2750 int btrfs_grow_device(struct btrfs_trans_handle *trans,
2751                       struct btrfs_device *device, u64 new_size)
2752 {
2753         struct btrfs_fs_info *fs_info = device->fs_info;
2754         struct btrfs_super_block *super_copy = fs_info->super_copy;
2755         u64 old_total;
2756         u64 diff;
2757
2758         if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
2759                 return -EACCES;
2760
2761         new_size = round_down(new_size, fs_info->sectorsize);
2762
2763         mutex_lock(&fs_info->chunk_mutex);
2764         old_total = btrfs_super_total_bytes(super_copy);
2765         diff = round_down(new_size - device->total_bytes, fs_info->sectorsize);
2766
2767         if (new_size <= device->total_bytes ||
2768             test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
2769                 mutex_unlock(&fs_info->chunk_mutex);
2770                 return -EINVAL;
2771         }
2772
2773         btrfs_set_super_total_bytes(super_copy,
2774                         round_down(old_total + diff, fs_info->sectorsize));
2775         device->fs_devices->total_rw_bytes += diff;
2776
2777         btrfs_device_set_total_bytes(device, new_size);
2778         btrfs_device_set_disk_total_bytes(device, new_size);
2779         btrfs_clear_space_info_full(device->fs_info);
2780         if (list_empty(&device->post_commit_list))
2781                 list_add_tail(&device->post_commit_list,
2782                               &trans->transaction->dev_update_list);
2783         mutex_unlock(&fs_info->chunk_mutex);
2784
2785         return btrfs_update_device(trans, device);
2786 }
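
/*
 * Worked example (hypothetical numbers, 4K sectorsize): growing a device
 * from total_bytes = 10G to new_size = 12G + 100 first rounds new_size down
 * to 12G, computes diff = 2G, and then bumps the superblock total and
 * total_rw_bytes by the same rounded amount, keeping all of the accounting
 * sector-aligned.
 */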
2787
2788 static int btrfs_free_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
2789 {
2790         struct btrfs_fs_info *fs_info = trans->fs_info;
2791         struct btrfs_root *root = fs_info->chunk_root;
2792         int ret;
2793         struct btrfs_path *path;
2794         struct btrfs_key key;
2795
2796         path = btrfs_alloc_path();
2797         if (!path)
2798                 return -ENOMEM;
2799
2800         key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
2801         key.offset = chunk_offset;
2802         key.type = BTRFS_CHUNK_ITEM_KEY;
2803
2804         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
2805         if (ret < 0)
2806                 goto out;
2807         else if (ret > 0) { /* Logic error or corruption */
2808                 btrfs_handle_fs_error(fs_info, -ENOENT,
2809                                       "Failed lookup while freeing chunk.");
2810                 ret = -ENOENT;
2811                 goto out;
2812         }
2813
2814         ret = btrfs_del_item(trans, root, path);
2815         if (ret < 0)
2816                 btrfs_handle_fs_error(fs_info, ret,
2817                                       "Failed to delete chunk item.");
2818 out:
2819         btrfs_free_path(path);
2820         return ret;
2821 }
2822
2823 static int btrfs_del_sys_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
2824 {
2825         struct btrfs_super_block *super_copy = fs_info->super_copy;
2826         struct btrfs_disk_key *disk_key;
2827         struct btrfs_chunk *chunk;
2828         u8 *ptr;
2829         int ret = 0;
2830         u32 num_stripes;
2831         u32 array_size;
2832         u32 len = 0;
2833         u32 cur;
2834         struct btrfs_key key;
2835
2836         mutex_lock(&fs_info->chunk_mutex);
2837         array_size = btrfs_super_sys_array_size(super_copy);
2838
2839         ptr = super_copy->sys_chunk_array;
2840         cur = 0;
2841
2842         while (cur < array_size) {
2843                 disk_key = (struct btrfs_disk_key *)ptr;
2844                 btrfs_disk_key_to_cpu(&key, disk_key);
2845
2846                 len = sizeof(*disk_key);
2847
2848                 if (key.type == BTRFS_CHUNK_ITEM_KEY) {
2849                         chunk = (struct btrfs_chunk *)(ptr + len);
2850                         num_stripes = btrfs_stack_chunk_num_stripes(chunk);
2851                         len += btrfs_chunk_item_size(num_stripes);
2852                 } else {
2853                         ret = -EIO;
2854                         break;
2855                 }
2856                 if (key.objectid == BTRFS_FIRST_CHUNK_TREE_OBJECTID &&
2857                     key.offset == chunk_offset) {
2858                         memmove(ptr, ptr + len, array_size - (cur + len));
2859                         array_size -= len;
2860                         btrfs_set_super_sys_array_size(super_copy, array_size);
2861                 } else {
2862                         ptr += len;
2863                         cur += len;
2864                 }
2865         }
2866         mutex_unlock(&fs_info->chunk_mutex);
2867         return ret;
2868 }
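
/*
 * For illustration, the superblock sys_chunk_array walked above is a
 * packed byte sequence of (disk key, chunk item) pairs, where each chunk
 * item's size depends on its stripe count:
 *
 *	[btrfs_disk_key][btrfs_chunk + stripes][btrfs_disk_key][...]
 *
 * Deleting an entry memmove()s the tail over it and shrinks the array
 * size recorded in the superblock copy.
 */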
2869
2870 /*
2871  * btrfs_get_chunk_map() - Find the mapping containing the given logical extent.
2872  * @logical: Logical block offset in bytes.
2873  * @length: Length of extent in bytes.
2874  *
2875  * Return: Chunk mapping or ERR_PTR.
2876  */
2877 struct extent_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info,
2878                                        u64 logical, u64 length)
2879 {
2880         struct extent_map_tree *em_tree;
2881         struct extent_map *em;
2882
2883         em_tree = &fs_info->mapping_tree;
2884         read_lock(&em_tree->lock);
2885         em = lookup_extent_mapping(em_tree, logical, length);
2886         read_unlock(&em_tree->lock);
2887
2888         if (!em) {
2889                 btrfs_crit(fs_info, "unable to find logical %llu length %llu",
2890                            logical, length);
2891                 return ERR_PTR(-EINVAL);
2892         }
2893
2894         if (em->start > logical || em->start + em->len < logical) {
2895                 btrfs_crit(fs_info,
2896                            "found a bad mapping, wanted %llu-%llu, found %llu-%llu",
2897                            logical, logical + length, em->start, em->start + em->len);
2898                 free_extent_map(em);
2899                 return ERR_PTR(-EINVAL);
2900         }
2901
2902         /* callers are responsible for dropping em's ref. */
2903         return em;
2904 }
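
/*
 * A minimal caller sketch (hypothetical, for illustration only):
 *
 *	struct extent_map *em;
 *
 *	em = btrfs_get_chunk_map(fs_info, logical, len);
 *	if (IS_ERR(em))
 *		return PTR_ERR(em);
 *	map = em->map_lookup;
 *	...
 *	free_extent_map(em);	(drop the ref taken by the lookup)
 */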
2905
2906 int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
2907 {
2908         struct btrfs_fs_info *fs_info = trans->fs_info;
2909         struct extent_map *em;
2910         struct map_lookup *map;
2911         u64 dev_extent_len = 0;
2912         int i, ret = 0;
2913         struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
2914
2915         em = btrfs_get_chunk_map(fs_info, chunk_offset, 1);
2916         if (IS_ERR(em)) {
2917                 /*
2918                  * This is a logic error, but we don't want to just rely on the
2919                  * user having built with ASSERT enabled, so if ASSERT doesn't
2920                  * do anything we still error out.
2921                  */
2922                 ASSERT(0);
2923                 return PTR_ERR(em);
2924         }
2925         map = em->map_lookup;
2926         mutex_lock(&fs_info->chunk_mutex);
2927         check_system_chunk(trans, map->type);
2928         mutex_unlock(&fs_info->chunk_mutex);
2929
2930         /*
2931          * Take the device list mutex to prevent races with the final phase of
2932          * a device replace operation that replaces the device object associated
2933          * with map stripes (dev-replace.c:btrfs_dev_replace_finishing()).
2934          */
2935         mutex_lock(&fs_devices->device_list_mutex);
2936         for (i = 0; i < map->num_stripes; i++) {
2937                 struct btrfs_device *device = map->stripes[i].dev;
2938                 ret = btrfs_free_dev_extent(trans, device,
2939                                             map->stripes[i].physical,
2940                                             &dev_extent_len);
2941                 if (ret) {
2942                         mutex_unlock(&fs_devices->device_list_mutex);
2943                         btrfs_abort_transaction(trans, ret);
2944                         goto out;
2945                 }
2946
2947                 if (device->bytes_used > 0) {
2948                         mutex_lock(&fs_info->chunk_mutex);
2949                         btrfs_device_set_bytes_used(device,
2950                                         device->bytes_used - dev_extent_len);
2951                         atomic64_add(dev_extent_len, &fs_info->free_chunk_space);
2952                         btrfs_clear_space_info_full(fs_info);
2953                         mutex_unlock(&fs_info->chunk_mutex);
2954                 }
2955
2956                 ret = btrfs_update_device(trans, device);
2957                 if (ret) {
2958                         mutex_unlock(&fs_devices->device_list_mutex);
2959                         btrfs_abort_transaction(trans, ret);
2960                         goto out;
2961                 }
2962         }
2963         mutex_unlock(&fs_devices->device_list_mutex);
2964
2965         ret = btrfs_free_chunk(trans, chunk_offset);
2966         if (ret) {
2967                 btrfs_abort_transaction(trans, ret);
2968                 goto out;
2969         }
2970
2971         trace_btrfs_chunk_free(fs_info, map, chunk_offset, em->len);
2972
2973         if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
2974                 ret = btrfs_del_sys_chunk(fs_info, chunk_offset);
2975                 if (ret) {
2976                         btrfs_abort_transaction(trans, ret);
2977                         goto out;
2978                 }
2979         }
2980
2981         ret = btrfs_remove_block_group(trans, chunk_offset, em);
2982         if (ret) {
2983                 btrfs_abort_transaction(trans, ret);
2984                 goto out;
2985         }
2986
2987 out:
2988         /* once for us */
2989         free_extent_map(em);
2990         return ret;
2991 }
2992
2993 static int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
2994 {
2995         struct btrfs_root *root = fs_info->chunk_root;
2996         struct btrfs_trans_handle *trans;
2997         struct btrfs_block_group *block_group;
2998         int ret;
2999
3000         /*
3001          * Prevent races with automatic removal of unused block groups.
3002          * After we relocate and before we remove the chunk with offset
3003          * chunk_offset, automatic removal of the block group can kick in,
3004          * resulting in a failure when calling btrfs_remove_chunk() below.
3005          *
3006          * Make sure to acquire this mutex before doing a tree search (dev
3007          * or chunk trees) to find chunks. Otherwise the cleaner kthread might
3008          * call btrfs_remove_chunk() (through btrfs_delete_unused_bgs()) after
3009          * we release the path used to search the chunk/dev tree and before
3010          * the current task acquires this mutex and calls us.
3011          */
3012         lockdep_assert_held(&fs_info->delete_unused_bgs_mutex);
3013
3014         /* step one, relocate all the extents inside this chunk */
3015         btrfs_scrub_pause(fs_info);
3016         ret = btrfs_relocate_block_group(fs_info, chunk_offset);
3017         btrfs_scrub_continue(fs_info);
3018         if (ret)
3019                 return ret;
3020
3021         block_group = btrfs_lookup_block_group(fs_info, chunk_offset);
3022         if (!block_group)
3023                 return -ENOENT;
3024         btrfs_discard_cancel_work(&fs_info->discard_ctl, block_group);
3025         btrfs_put_block_group(block_group);
3026
3027         trans = btrfs_start_trans_remove_block_group(root->fs_info,
3028                                                      chunk_offset);
3029         if (IS_ERR(trans)) {
3030                 ret = PTR_ERR(trans);
3031                 btrfs_handle_fs_error(root->fs_info, ret, NULL);
3032                 return ret;
3033         }
3034
3035         /*
3036          * step two, delete the device extents and the
3037          * chunk tree entries
3038          */
3039         ret = btrfs_remove_chunk(trans, chunk_offset);
3040         btrfs_end_transaction(trans);
3041         return ret;
3042 }
3043
3044 static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info)
3045 {
3046         struct btrfs_root *chunk_root = fs_info->chunk_root;
3047         struct btrfs_path *path;
3048         struct extent_buffer *leaf;
3049         struct btrfs_chunk *chunk;
3050         struct btrfs_key key;
3051         struct btrfs_key found_key;
3052         u64 chunk_type;
3053         bool retried = false;
3054         int failed = 0;
3055         int ret;
3056
3057         path = btrfs_alloc_path();
3058         if (!path)
3059                 return -ENOMEM;
3060
3061 again:
3062         key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
3063         key.offset = (u64)-1;
3064         key.type = BTRFS_CHUNK_ITEM_KEY;
3065
3066         while (1) {
3067                 mutex_lock(&fs_info->delete_unused_bgs_mutex);
3068                 ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
3069                 if (ret < 0) {
3070                         mutex_unlock(&fs_info->delete_unused_bgs_mutex);
3071                         goto error;
3072                 }
3073                 BUG_ON(ret == 0); /* Corruption */
3074
3075                 ret = btrfs_previous_item(chunk_root, path, key.objectid,
3076                                           key.type);
3077                 if (ret)
3078                         mutex_unlock(&fs_info->delete_unused_bgs_mutex);
3079                 if (ret < 0)
3080                         goto error;
3081                 if (ret > 0)
3082                         break;
3083
3084                 leaf = path->nodes[0];
3085                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
3086
3087                 chunk = btrfs_item_ptr(leaf, path->slots[0],
3088                                        struct btrfs_chunk);
3089                 chunk_type = btrfs_chunk_type(leaf, chunk);
3090                 btrfs_release_path(path);
3091
3092                 if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) {
3093                         ret = btrfs_relocate_chunk(fs_info, found_key.offset);
3094                         if (ret == -ENOSPC)
3095                                 failed++;
3096                         else
3097                                 BUG_ON(ret);
3098                 }
3099                 mutex_unlock(&fs_info->delete_unused_bgs_mutex);
3100
3101                 if (found_key.offset == 0)
3102                         break;
3103                 key.offset = found_key.offset - 1;
3104         }
3105         ret = 0;
3106         if (failed && !retried) {
3107                 failed = 0;
3108                 retried = true;
3109                 goto again;
3110         } else if (WARN_ON(failed && retried)) {
3111                 ret = -ENOSPC;
3112         }
3113 error:
3114         btrfs_free_path(path);
3115         return ret;
3116 }
3117
3118 /*
3119  * return 1 : a data chunk was allocated successfully,
3120  * return <0: an error occurred while allocating a data chunk,
3121  * return 0 : no data chunk allocation was needed.
3122  */
3123 static int btrfs_may_alloc_data_chunk(struct btrfs_fs_info *fs_info,
3124                                       u64 chunk_offset)
3125 {
3126         struct btrfs_block_group *cache;
3127         u64 bytes_used;
3128         u64 chunk_type;
3129
3130         cache = btrfs_lookup_block_group(fs_info, chunk_offset);
3131         ASSERT(cache);
3132         chunk_type = cache->flags;
3133         btrfs_put_block_group(cache);
3134
3135         if (!(chunk_type & BTRFS_BLOCK_GROUP_DATA))
3136                 return 0;
3137
3138         spin_lock(&fs_info->data_sinfo->lock);
3139         bytes_used = fs_info->data_sinfo->bytes_used;
3140         spin_unlock(&fs_info->data_sinfo->lock);
3141
3142         if (!bytes_used) {
3143                 struct btrfs_trans_handle *trans;
3144                 int ret;
3145
3146                 trans = btrfs_join_transaction(fs_info->tree_root);
3147                 if (IS_ERR(trans))
3148                         return PTR_ERR(trans);
3149
3150                 ret = btrfs_force_chunk_alloc(trans, BTRFS_BLOCK_GROUP_DATA);
3151                 btrfs_end_transaction(trans);
3152                 if (ret < 0)
3153                         return ret;
3154                 return 1;
3155         }
3156
3157         return 0;
3158 }
3159
3160 static int insert_balance_item(struct btrfs_fs_info *fs_info,
3161                                struct btrfs_balance_control *bctl)
3162 {
3163         struct btrfs_root *root = fs_info->tree_root;
3164         struct btrfs_trans_handle *trans;
3165         struct btrfs_balance_item *item;
3166         struct btrfs_disk_balance_args disk_bargs;
3167         struct btrfs_path *path;
3168         struct extent_buffer *leaf;
3169         struct btrfs_key key;
3170         int ret, err;
3171
3172         path = btrfs_alloc_path();
3173         if (!path)
3174                 return -ENOMEM;
3175
3176         trans = btrfs_start_transaction(root, 0);
3177         if (IS_ERR(trans)) {
3178                 btrfs_free_path(path);
3179                 return PTR_ERR(trans);
3180         }
3181
3182         key.objectid = BTRFS_BALANCE_OBJECTID;
3183         key.type = BTRFS_TEMPORARY_ITEM_KEY;
3184         key.offset = 0;
3185
3186         ret = btrfs_insert_empty_item(trans, root, path, &key,
3187                                       sizeof(*item));
3188         if (ret)
3189                 goto out;
3190
3191         leaf = path->nodes[0];
3192         item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item);
3193
3194         memzero_extent_buffer(leaf, (unsigned long)item, sizeof(*item));
3195
3196         btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->data);
3197         btrfs_set_balance_data(leaf, item, &disk_bargs);
3198         btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->meta);
3199         btrfs_set_balance_meta(leaf, item, &disk_bargs);
3200         btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->sys);
3201         btrfs_set_balance_sys(leaf, item, &disk_bargs);
3202
3203         btrfs_set_balance_flags(leaf, item, bctl->flags);
3204
3205         btrfs_mark_buffer_dirty(leaf);
3206 out:
3207         btrfs_free_path(path);
3208         err = btrfs_commit_transaction(trans);
3209         if (err && !ret)
3210                 ret = err;
3211         return ret;
3212 }
3213
3214 static int del_balance_item(struct btrfs_fs_info *fs_info)
3215 {
3216         struct btrfs_root *root = fs_info->tree_root;
3217         struct btrfs_trans_handle *trans;
3218         struct btrfs_path *path;
3219         struct btrfs_key key;
3220         int ret, err;
3221
3222         path = btrfs_alloc_path();
3223         if (!path)
3224                 return -ENOMEM;
3225
3226         trans = btrfs_start_transaction(root, 0);
3227         if (IS_ERR(trans)) {
3228                 btrfs_free_path(path);
3229                 return PTR_ERR(trans);
3230         }
3231
3232         key.objectid = BTRFS_BALANCE_OBJECTID;
3233         key.type = BTRFS_TEMPORARY_ITEM_KEY;
3234         key.offset = 0;
3235
3236         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
3237         if (ret < 0)
3238                 goto out;
3239         if (ret > 0) {
3240                 ret = -ENOENT;
3241                 goto out;
3242         }
3243
3244         ret = btrfs_del_item(trans, root, path);
3245 out:
3246         btrfs_free_path(path);
3247         err = btrfs_commit_transaction(trans);
3248         if (err && !ret)
3249                 ret = err;
3250         return ret;
3251 }
3252
3253 /*
3254  * This is a heuristic used to reduce the number of chunks balanced on
3255  * resume after balance was interrupted.
3256  */
3257 static void update_balance_args(struct btrfs_balance_control *bctl)
3258 {
3259         /*
3260          * Turn on soft mode for chunk types that were being converted.
3261          */
3262         if (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)
3263                 bctl->data.flags |= BTRFS_BALANCE_ARGS_SOFT;
3264         if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)
3265                 bctl->sys.flags |= BTRFS_BALANCE_ARGS_SOFT;
3266         if (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)
3267                 bctl->meta.flags |= BTRFS_BALANCE_ARGS_SOFT;
3268
3269         /*
3270          * Turn on the usage filter if it is not already used.  The idea is
3271          * that chunks that we have already balanced should be
3272          * reasonably full.  Don't do it for chunks that are being
3273          * converted - that will keep us from relocating unconverted
3274          * (albeit full) chunks.
3275          */
3276         if (!(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE) &&
3277             !(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
3278             !(bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
3279                 bctl->data.flags |= BTRFS_BALANCE_ARGS_USAGE;
3280                 bctl->data.usage = 90;
3281         }
3282         if (!(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE) &&
3283             !(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
3284             !(bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
3285                 bctl->sys.flags |= BTRFS_BALANCE_ARGS_USAGE;
3286                 bctl->sys.usage = 90;
3287         }
3288         if (!(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE) &&
3289             !(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
3290             !(bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
3291                 bctl->meta.flags |= BTRFS_BALANCE_ARGS_USAGE;
3292                 bctl->meta.usage = 90;
3293         }
3294 }
3295
3296 /*
3297  * Clear the balance status in fs_info and delete the balance item from disk.
3298  */
3299 static void reset_balance_state(struct btrfs_fs_info *fs_info)
3300 {
3301         struct btrfs_balance_control *bctl = fs_info->balance_ctl;
3302         int ret;
3303
3304         BUG_ON(!fs_info->balance_ctl);
3305
3306         spin_lock(&fs_info->balance_lock);
3307         fs_info->balance_ctl = NULL;
3308         spin_unlock(&fs_info->balance_lock);
3309
3310         kfree(bctl);
3311         ret = del_balance_item(fs_info);
3312         if (ret)
3313                 btrfs_handle_fs_error(fs_info, ret, NULL);
3314 }
3315
3316 /*
3317  * Balance filters.  Return 1 if chunk should be filtered out
3318  * (should not be balanced).
3319  */
3320 static int chunk_profiles_filter(u64 chunk_type,
3321                                  struct btrfs_balance_args *bargs)
3322 {
3323         chunk_type = chunk_to_extended(chunk_type) &
3324                                 BTRFS_EXTENDED_PROFILE_MASK;
3325
3326         if (bargs->profiles & chunk_type)
3327                 return 0;
3328
3329         return 1;
3330 }
3331
3332 static int chunk_usage_range_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset,
3333                               struct btrfs_balance_args *bargs)
3334 {
3335         struct btrfs_block_group *cache;
3336         u64 chunk_used;
3337         u64 user_thresh_min;
3338         u64 user_thresh_max;
3339         int ret = 1;
3340
3341         cache = btrfs_lookup_block_group(fs_info, chunk_offset);
3342         chunk_used = cache->used;
3343
3344         if (bargs->usage_min == 0)
3345                 user_thresh_min = 0;
3346         else
3347                 user_thresh_min = div_factor_fine(cache->length,
3348                                                   bargs->usage_min);
3349
3350         if (bargs->usage_max == 0)
3351                 user_thresh_max = 1;
3352         else if (bargs->usage_max > 100)
3353                 user_thresh_max = cache->length;
3354         else
3355                 user_thresh_max = div_factor_fine(cache->length,
3356                                                   bargs->usage_max);
3357
3358         if (user_thresh_min <= chunk_used && chunk_used < user_thresh_max)
3359                 ret = 0;
3360
3361         btrfs_put_block_group(cache);
3362         return ret;
3363 }
3364
3365 static int chunk_usage_filter(struct btrfs_fs_info *fs_info,
3366                 u64 chunk_offset, struct btrfs_balance_args *bargs)
3367 {
3368         struct btrfs_block_group *cache;
3369         u64 chunk_used, user_thresh;
3370         int ret = 1;
3371
3372         cache = btrfs_lookup_block_group(fs_info, chunk_offset);
3373         chunk_used = cache->used;
3374
3375         if (bargs->usage_min == 0)
3376                 user_thresh = 1;
3377         else if (bargs->usage > 100)
3378                 user_thresh = cache->length;
3379         else
3380                 user_thresh = div_factor_fine(cache->length, bargs->usage);
3381
3382         if (chunk_used < user_thresh)
3383                 ret = 0;
3384
3385         btrfs_put_block_group(cache);
3386         return ret;
3387 }
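
/*
 * Worked example, assuming div_factor_fine(len, pct) == len * pct / 100:
 * with usage=90 on a 1GiB chunk the threshold is ~922MiB, so a chunk
 * using less than that is balanced (the filter returns 0) and a fuller
 * chunk is skipped (it returns 1).
 */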
3388
3389 static int chunk_devid_filter(struct extent_buffer *leaf,
3390                               struct btrfs_chunk *chunk,
3391                               struct btrfs_balance_args *bargs)
3392 {
3393         struct btrfs_stripe *stripe;
3394         int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
3395         int i;
3396
3397         for (i = 0; i < num_stripes; i++) {
3398                 stripe = btrfs_stripe_nr(chunk, i);
3399                 if (btrfs_stripe_devid(leaf, stripe) == bargs->devid)
3400                         return 0;
3401         }
3402
3403         return 1;
3404 }
3405
3406 static u64 calc_data_stripes(u64 type, int num_stripes)
3407 {
3408         const int index = btrfs_bg_flags_to_raid_index(type);
3409         const int ncopies = btrfs_raid_array[index].ncopies;
3410         const int nparity = btrfs_raid_array[index].nparity;
3411
3412         if (nparity)
3413                 return num_stripes - nparity;
3414         else
3415                 return num_stripes / ncopies;
3416 }
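
/*
 * Worked examples, using the btrfs_raid_array values: RAID6 (nparity 2)
 * with 6 stripes has 6 - 2 = 4 data stripes; RAID10 (ncopies 2) with 4
 * stripes has 4 / 2 = 2; SINGLE (ncopies 1) keeps every stripe as data.
 */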
3417
3418 /* [pstart, pend) */
3419 static int chunk_drange_filter(struct extent_buffer *leaf,
3420                                struct btrfs_chunk *chunk,
3421                                struct btrfs_balance_args *bargs)
3422 {
3423         struct btrfs_stripe *stripe;
3424         int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
3425         u64 stripe_offset;
3426         u64 stripe_length;
3427         u64 type;
3428         int factor;
3429         int i;
3430
3431         if (!(bargs->flags & BTRFS_BALANCE_ARGS_DEVID))
3432                 return 0;
3433
3434         type = btrfs_chunk_type(leaf, chunk);
3435         factor = calc_data_stripes(type, num_stripes);
3436
3437         for (i = 0; i < num_stripes; i++) {
3438                 stripe = btrfs_stripe_nr(chunk, i);
3439                 if (btrfs_stripe_devid(leaf, stripe) != bargs->devid)
3440                         continue;
3441
3442                 stripe_offset = btrfs_stripe_offset(leaf, stripe);
3443                 stripe_length = btrfs_chunk_length(leaf, chunk);
3444                 stripe_length = div_u64(stripe_length, factor);
3445
3446                 if (stripe_offset < bargs->pend &&
3447                     stripe_offset + stripe_length > bargs->pstart)
3448                         return 0;
3449         }
3450
3451         return 1;
3452 }
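
/*
 * Example: a 2GiB RAID10 chunk with four stripes has two data stripes
 * (see calc_data_stripes() above), so each device extent is
 * 2GiB / 2 = 1GiB long; the test above checks that extent,
 * [stripe_offset, stripe_offset + 1GiB), for overlap with
 * [pstart, pend).
 */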
3453
3454 /* [vstart, vend) */
3455 static int chunk_vrange_filter(struct extent_buffer *leaf,
3456                                struct btrfs_chunk *chunk,
3457                                u64 chunk_offset,
3458                                struct btrfs_balance_args *bargs)
3459 {
3460         if (chunk_offset < bargs->vend &&
3461             chunk_offset + btrfs_chunk_length(leaf, chunk) > bargs->vstart)
3462                 /* at least part of the chunk is inside this vrange */
3463                 return 0;
3464
3465         return 1;
3466 }
3467
3468 static int chunk_stripes_range_filter(struct extent_buffer *leaf,
3469                                struct btrfs_chunk *chunk,
3470                                struct btrfs_balance_args *bargs)
3471 {
3472         int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
3473
3474         if (bargs->stripes_min <= num_stripes
3475                         && num_stripes <= bargs->stripes_max)
3476                 return 0;
3477
3478         return 1;
3479 }
3480
3481 static int chunk_soft_convert_filter(u64 chunk_type,
3482                                      struct btrfs_balance_args *bargs)
3483 {
3484         if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT))
3485                 return 0;
3486
3487         chunk_type = chunk_to_extended(chunk_type) &
3488                                 BTRFS_EXTENDED_PROFILE_MASK;
3489
3490         if (bargs->target == chunk_type)
3491                 return 1;
3492
3493         return 0;
3494 }
3495
3496 static int should_balance_chunk(struct extent_buffer *leaf,
3497                                 struct btrfs_chunk *chunk, u64 chunk_offset)
3498 {
3499         struct btrfs_fs_info *fs_info = leaf->fs_info;
3500         struct btrfs_balance_control *bctl = fs_info->balance_ctl;
3501         struct btrfs_balance_args *bargs = NULL;
3502         u64 chunk_type = btrfs_chunk_type(leaf, chunk);
3503
3504         /* type filter */
3505         if (!((chunk_type & BTRFS_BLOCK_GROUP_TYPE_MASK) &
3506               (bctl->flags & BTRFS_BALANCE_TYPE_MASK))) {
3507                 return 0;
3508         }
3509
3510         if (chunk_type & BTRFS_BLOCK_GROUP_DATA)
3511                 bargs = &bctl->data;
3512         else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM)
3513                 bargs = &bctl->sys;
3514         else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA)
3515                 bargs = &bctl->meta;
3516
3517         /* profiles filter */
3518         if ((bargs->flags & BTRFS_BALANCE_ARGS_PROFILES) &&
3519             chunk_profiles_filter(chunk_type, bargs)) {
3520                 return 0;
3521         }
3522
3523         /* usage filter */
3524         if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE) &&
3525             chunk_usage_filter(fs_info, chunk_offset, bargs)) {
3526                 return 0;
3527         } else if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
3528             chunk_usage_range_filter(fs_info, chunk_offset, bargs)) {
3529                 return 0;
3530         }
3531
3532         /* devid filter */
3533         if ((bargs->flags & BTRFS_BALANCE_ARGS_DEVID) &&
3534             chunk_devid_filter(leaf, chunk, bargs)) {
3535                 return 0;
3536         }
3537
3538         /* drange filter, makes sense only with devid filter */
3539         if ((bargs->flags & BTRFS_BALANCE_ARGS_DRANGE) &&
3540             chunk_drange_filter(leaf, chunk, bargs)) {
3541                 return 0;
3542         }
3543
3544         /* vrange filter */
3545         if ((bargs->flags & BTRFS_BALANCE_ARGS_VRANGE) &&
3546             chunk_vrange_filter(leaf, chunk, chunk_offset, bargs)) {
3547                 return 0;
3548         }
3549
3550         /* stripes filter */
3551         if ((bargs->flags & BTRFS_BALANCE_ARGS_STRIPES_RANGE) &&
3552             chunk_stripes_range_filter(leaf, chunk, bargs)) {
3553                 return 0;
3554         }
3555
3556         /* soft profile changing mode */
3557         if ((bargs->flags & BTRFS_BALANCE_ARGS_SOFT) &&
3558             chunk_soft_convert_filter(chunk_type, bargs)) {
3559                 return 0;
3560         }
3561
3562         /*
3563          * limited by count, must be the last filter
3564          */
3565         if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT)) {
3566                 if (bargs->limit == 0)
3567                         return 0;
3568                 else
3569                         bargs->limit--;
3570         } else if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT_RANGE)) {
3571                 /*
3572                  * Same logic as the 'limit' filter; the minimum cannot be
3573                  * determined here because we do not have the global information
3574                  * about the count of all chunks that satisfy the filters.
3575                  */
3576                 if (bargs->limit_max == 0)
3577                         return 0;
3578                 else
3579                         bargs->limit_max--;
3580         }
3581
3582         return 1;
3583 }
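
/*
 * The filters above are effectively ANDed: a chunk is relocated only if
 * every filter enabled in bargs->flags accepts it. Combining, say, the
 * devid and drange filters therefore relocates only chunks that have a
 * stripe on the given device whose device extent overlaps the given
 * physical byte range.
 */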
3584
3585 static int __btrfs_balance(struct btrfs_fs_info *fs_info)
3586 {
3587         struct btrfs_balance_control *bctl = fs_info->balance_ctl;
3588         struct btrfs_root *chunk_root = fs_info->chunk_root;
3589         u64 chunk_type;
3590         struct btrfs_chunk *chunk;
3591         struct btrfs_path *path = NULL;
3592         struct btrfs_key key;
3593         struct btrfs_key found_key;
3594         struct extent_buffer *leaf;
3595         int slot;
3596         int ret;
3597         int enospc_errors = 0;
3598         bool counting = true;
3599         /* The single value limit and min/max limits share bytes in the balance args union */
3600         u64 limit_data = bctl->data.limit;
3601         u64 limit_meta = bctl->meta.limit;
3602         u64 limit_sys = bctl->sys.limit;
3603         u32 count_data = 0;
3604         u32 count_meta = 0;
3605         u32 count_sys = 0;
3606         int chunk_reserved = 0;
3607
3608         path = btrfs_alloc_path();
3609         if (!path) {
3610                 ret = -ENOMEM;
3611                 goto error;
3612         }
3613
3614         /* zero out stat counters */
3615         spin_lock(&fs_info->balance_lock);
3616         memset(&bctl->stat, 0, sizeof(bctl->stat));
3617         spin_unlock(&fs_info->balance_lock);
3618 again:
3619         if (!counting) {
3620                 /*
3621                  * The single value limit and min/max limits share bytes in the
3622                  * balance args union; restore the values saved before counting.
3623                  */
3624                 bctl->data.limit = limit_data;
3625                 bctl->meta.limit = limit_meta;
3626                 bctl->sys.limit = limit_sys;
3627         }
3628         key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
3629         key.offset = (u64)-1;
3630         key.type = BTRFS_CHUNK_ITEM_KEY;
3631
3632         while (1) {
3633                 if ((!counting && atomic_read(&fs_info->balance_pause_req)) ||
3634                     atomic_read(&fs_info->balance_cancel_req)) {
3635                         ret = -ECANCELED;
3636                         goto error;
3637                 }
3638
3639                 mutex_lock(&fs_info->delete_unused_bgs_mutex);
3640                 ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
3641                 if (ret < 0) {
3642                         mutex_unlock(&fs_info->delete_unused_bgs_mutex);
3643                         goto error;
3644                 }
3645
3646                 /*
3647                  * This shouldn't happen; it means the previous relocation
3648                  * failed.
3649                  */
3650                 if (ret == 0)
3651                         BUG(); /* FIXME break ? */
3652
3653                 ret = btrfs_previous_item(chunk_root, path, 0,
3654                                           BTRFS_CHUNK_ITEM_KEY);
3655                 if (ret) {
3656                         mutex_unlock(&fs_info->delete_unused_bgs_mutex);
3657                         ret = 0;
3658                         break;
3659                 }
3660
3661                 leaf = path->nodes[0];
3662                 slot = path->slots[0];
3663                 btrfs_item_key_to_cpu(leaf, &found_key, slot);
3664
3665                 if (found_key.objectid != key.objectid) {
3666                         mutex_unlock(&fs_info->delete_unused_bgs_mutex);
3667                         break;
3668                 }
3669
3670                 chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
3671                 chunk_type = btrfs_chunk_type(leaf, chunk);
3672
3673                 if (!counting) {
3674                         spin_lock(&fs_info->balance_lock);
3675                         bctl->stat.considered++;
3676                         spin_unlock(&fs_info->balance_lock);
3677                 }
3678
3679                 ret = should_balance_chunk(leaf, chunk, found_key.offset);
3680
3681                 btrfs_release_path(path);
3682                 if (!ret) {
3683                         mutex_unlock(&fs_info->delete_unused_bgs_mutex);
3684                         goto loop;
3685                 }
3686
3687                 if (counting) {
3688                         mutex_unlock(&fs_info->delete_unused_bgs_mutex);
3689                         spin_lock(&fs_info->balance_lock);
3690                         bctl->stat.expected++;
3691                         spin_unlock(&fs_info->balance_lock);
3692
3693                         if (chunk_type & BTRFS_BLOCK_GROUP_DATA)
3694                                 count_data++;
3695                         else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM)
3696                                 count_sys++;
3697                         else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA)
3698                                 count_meta++;
3699
3700                         goto loop;
3701                 }
3702
3703                 /*
3704                  * Apply the limit_min filter; there is no need to check whether
3705                  * the LIMITS filter is used, since limit_min is 0 by default.
3706                  */
3707                 if (((chunk_type & BTRFS_BLOCK_GROUP_DATA) &&
3708                                         count_data < bctl->data.limit_min)
3709                                 || ((chunk_type & BTRFS_BLOCK_GROUP_METADATA) &&
3710                                         count_meta < bctl->meta.limit_min)
3711                                 || ((chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) &&
3712                                         count_sys < bctl->sys.limit_min)) {
3713                         mutex_unlock(&fs_info->delete_unused_bgs_mutex);
3714                         goto loop;
3715                 }
3716
3717                 if (!chunk_reserved) {
3718                         /*
3719                          * We may be relocating the only data chunk we have,
3720                          * which could potentially end up losing the data
3721                          * raid profile, so let's allocate an empty one in
3722                          * advance.
3723                          */
3724                         ret = btrfs_may_alloc_data_chunk(fs_info,
3725                                                          found_key.offset);
3726                         if (ret < 0) {
3727                                 mutex_unlock(&fs_info->delete_unused_bgs_mutex);
3728                                 goto error;
3729                         } else if (ret == 1) {
3730                                 chunk_reserved = 1;
3731                         }
3732                 }
3733
3734                 ret = btrfs_relocate_chunk(fs_info, found_key.offset);
3735                 mutex_unlock(&fs_info->delete_unused_bgs_mutex);
3736                 if (ret == -ENOSPC) {
3737                         enospc_errors++;
3738                 } else if (ret == -ETXTBSY) {
3739                         btrfs_info(fs_info,
3740            "skipping relocation of block group %llu due to active swapfile",
3741                                    found_key.offset);
3742                         ret = 0;
3743                 } else if (ret) {
3744                         goto error;
3745                 } else {
3746                         spin_lock(&fs_info->balance_lock);
3747                         bctl->stat.completed++;
3748                         spin_unlock(&fs_info->balance_lock);
3749                 }
3750 loop:
3751                 if (found_key.offset == 0)
3752                         break;
3753                 key.offset = found_key.offset - 1;
3754         }
3755
3756         if (counting) {
3757                 btrfs_release_path(path);
3758                 counting = false;
3759                 goto again;
3760         }
3761 error:
3762         btrfs_free_path(path);
3763         if (enospc_errors) {
3764                 btrfs_info(fs_info, "%d enospc errors during balance",
3765                            enospc_errors);
3766                 if (!ret)
3767                         ret = -ENOSPC;
3768         }
3769
3770         return ret;
3771 }
3772
3773 /**
3774  * alloc_profile_is_valid - see if a given profile is valid and reduced
3775  * @flags: profile to validate
3776  * @extended: if true @flags is treated as an extended profile
3777  */
3778 static int alloc_profile_is_valid(u64 flags, int extended)
3779 {
3780         u64 mask = (extended ? BTRFS_EXTENDED_PROFILE_MASK :
3781                                BTRFS_BLOCK_GROUP_PROFILE_MASK);
3782
3783         flags &= ~BTRFS_BLOCK_GROUP_TYPE_MASK;
3784
3785         /* 1) check that all other bits are zeroed */
3786         if (flags & ~mask)
3787                 return 0;
3788
3789         /* 2) see if profile is reduced */
3790         if (flags == 0)
3791                 return !extended; /* "0" is valid for usual profiles */
3792
3793         return has_single_bit_set(flags);
3794 }
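
/*
 * Examples: BTRFS_BLOCK_GROUP_RAID1 alone is valid and reduced;
 * RAID1 | RAID10 fails the single-bit check; flags of 0 are valid only
 * for non-extended profiles, where 0 stands for SINGLE.
 */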
3795
3796 static inline int balance_need_close(struct btrfs_fs_info *fs_info)
3797 {
3798         /* cancel requested || normal exit path */
3799         return atomic_read(&fs_info->balance_cancel_req) ||
3800                 (atomic_read(&fs_info->balance_pause_req) == 0 &&
3801                  atomic_read(&fs_info->balance_cancel_req) == 0);
3802 }
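
/*
 * Put differently: balance state is torn down unless a pause (and only
 * a pause) was requested; a paused balance keeps fs_info->balance_ctl
 * so it can be resumed later.
 */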
3803
3804 /*
3805  * Validate target profile against allowed profiles and return true if it's OK.
3806  * Otherwise print the error message and return false.
3807  */
3808 static inline int validate_convert_profile(struct btrfs_fs_info *fs_info,
3809                 const struct btrfs_balance_args *bargs,
3810                 u64 allowed, const char *type)
3811 {
3812         if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT))
3813                 return true;
3814
3815         /* Profile is valid and does not have bits outside of the allowed set */
3816         if (alloc_profile_is_valid(bargs->target, 1) &&
3817             (bargs->target & ~allowed) == 0)
3818                 return true;
3819
3820         btrfs_err(fs_info, "balance: invalid convert %s profile %s",
3821                         type, btrfs_bg_type_to_raid_name(bargs->target));
3822         return false;
3823 }
3824
3825 /*
3826  * Fill @buf with textual description of balance filter flags @bargs, up to
3827  * @size_buf including the terminating null. The output may be trimmed if it
3828  * does not fit into the provided buffer.
3829  */
3830 static void describe_balance_args(struct btrfs_balance_args *bargs, char *buf,
3831                                  u32 size_buf)
3832 {
3833         int ret;
3834         u32 size_bp = size_buf;
3835         char *bp = buf;
3836         u64 flags = bargs->flags;
3837         char tmp_buf[128] = {'\0'};
3838
3839         if (!flags)
3840                 return;
3841
3842 #define CHECK_APPEND_NOARG(a)                                           \
3843         do {                                                            \
3844                 ret = snprintf(bp, size_bp, (a));                       \
3845                 if (ret < 0 || ret >= size_bp)                          \
3846                         goto out_overflow;                              \
3847                 size_bp -= ret;                                         \
3848                 bp += ret;                                              \
3849         } while (0)
3850
3851 #define CHECK_APPEND_1ARG(a, v1)                                        \
3852         do {                                                            \
3853                 ret = snprintf(bp, size_bp, (a), (v1));                 \
3854                 if (ret < 0 || ret >= size_bp)                          \
3855                         goto out_overflow;                              \
3856                 size_bp -= ret;                                         \
3857                 bp += ret;                                              \
3858         } while (0)
3859
3860 #define CHECK_APPEND_2ARG(a, v1, v2)                                    \
3861         do {                                                            \
3862                 ret = snprintf(bp, size_bp, (a), (v1), (v2));           \
3863                 if (ret < 0 || ret >= size_bp)                          \
3864                         goto out_overflow;                              \
3865                 size_bp -= ret;                                         \
3866                 bp += ret;                                              \
3867         } while (0)
3868
3869         if (flags & BTRFS_BALANCE_ARGS_CONVERT)
3870                 CHECK_APPEND_1ARG("convert=%s,",
3871                                   btrfs_bg_type_to_raid_name(bargs->target));
3872
3873         if (flags & BTRFS_BALANCE_ARGS_SOFT)
3874                 CHECK_APPEND_NOARG("soft,");
3875
3876         if (flags & BTRFS_BALANCE_ARGS_PROFILES) {
3877                 btrfs_describe_block_groups(bargs->profiles, tmp_buf,
3878                                             sizeof(tmp_buf));
3879                 CHECK_APPEND_1ARG("profiles=%s,", tmp_buf);
3880         }
3881
3882         if (flags & BTRFS_BALANCE_ARGS_USAGE)
3883                 CHECK_APPEND_1ARG("usage=%llu,", bargs->usage);
3884
3885         if (flags & BTRFS_BALANCE_ARGS_USAGE_RANGE)
3886                 CHECK_APPEND_2ARG("usage=%u..%u,",
3887                                   bargs->usage_min, bargs->usage_max);
3888
3889         if (flags & BTRFS_BALANCE_ARGS_DEVID)
3890                 CHECK_APPEND_1ARG("devid=%llu,", bargs->devid);
3891
3892         if (flags & BTRFS_BALANCE_ARGS_DRANGE)
3893                 CHECK_APPEND_2ARG("drange=%llu..%llu,",
3894                                   bargs->pstart, bargs->pend);
3895
3896         if (flags & BTRFS_BALANCE_ARGS_VRANGE)
3897                 CHECK_APPEND_2ARG("vrange=%llu..%llu,",
3898                                   bargs->vstart, bargs->vend);
3899
3900         if (flags & BTRFS_BALANCE_ARGS_LIMIT)
3901                 CHECK_APPEND_1ARG("limit=%llu,", bargs->limit);
3902
3903         if (flags & BTRFS_BALANCE_ARGS_LIMIT_RANGE)
3904                 CHECK_APPEND_2ARG("limit=%u..%u,",
3905                                 bargs->limit_min, bargs->limit_max);
3906
3907         if (flags & BTRFS_BALANCE_ARGS_STRIPES_RANGE)
3908                 CHECK_APPEND_2ARG("stripes=%u..%u,",
3909                                   bargs->stripes_min, bargs->stripes_max);
3910
3911 #undef CHECK_APPEND_2ARG
3912 #undef CHECK_APPEND_1ARG
3913 #undef CHECK_APPEND_NOARG
3914
3915 out_overflow:
3916
3917         if (size_bp < size_buf)
3918                 buf[size_buf - size_bp - 1] = '\0'; /* remove last , */
3919         else
3920                 buf[0] = '\0';
3921 }
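
/*
 * Example output: for a convert to raid1 with the soft and usage
 * filters set, the code above builds "convert=raid1,soft,usage=90"
 * (with the trailing comma stripped at out_overflow).
 */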
3922
3923 static void describe_balance_start_or_resume(struct btrfs_fs_info *fs_info)
3924 {
3925         u32 size_buf = 1024;
3926         char tmp_buf[192] = {'\0'};
3927         char *buf;
3928         char *bp;
3929         u32 size_bp = size_buf;
3930         int ret;
3931         struct btrfs_balance_control *bctl = fs_info->balance_ctl;
3932
3933         buf = kzalloc(size_buf, GFP_KERNEL);
3934         if (!buf)
3935                 return;
3936
3937         bp = buf;
3938
3939 #define CHECK_APPEND_1ARG(a, v1)                                        \
3940         do {                                                            \
3941                 ret = snprintf(bp, size_bp, (a), (v1));                 \
3942                 if (ret < 0 || ret >= size_bp)                          \
3943                         goto out_overflow;                              \
3944                 size_bp -= ret;                                         \
3945                 bp += ret;                                              \
3946         } while (0)
3947
3948         if (bctl->flags & BTRFS_BALANCE_FORCE)
3949                 CHECK_APPEND_1ARG("%s", "-f ");
3950
3951         if (bctl->flags & BTRFS_BALANCE_DATA) {
3952                 describe_balance_args(&bctl->data, tmp_buf, sizeof(tmp_buf));
3953                 CHECK_APPEND_1ARG("-d%s ", tmp_buf);
3954         }
3955
3956         if (bctl->flags & BTRFS_BALANCE_METADATA) {
3957                 describe_balance_args(&bctl->meta, tmp_buf, sizeof(tmp_buf));
3958                 CHECK_APPEND_1ARG("-m%s ", tmp_buf);
3959         }
3960
3961         if (bctl->flags & BTRFS_BALANCE_SYSTEM) {
3962                 describe_balance_args(&bctl->sys, tmp_buf, sizeof(tmp_buf));
3963                 CHECK_APPEND_1ARG("-s%s ", tmp_buf);
3964         }
3965
3966 #undef CHECK_APPEND_1ARG
3967
3968 out_overflow:
3969
3970         if (size_bp < size_buf)
3971                 buf[size_buf - size_bp - 1] = '\0'; /* remove last " " */
3972         btrfs_info(fs_info, "balance: %s %s",
3973                    (bctl->flags & BTRFS_BALANCE_RESUME) ?
3974                    "resume" : "start", buf);
3975
3976         kfree(buf);
3977 }
3978
3979 /*
3980  * Should be called with the balance mutex held.
3981  */
3982 int btrfs_balance(struct btrfs_fs_info *fs_info,
3983                   struct btrfs_balance_control *bctl,
3984                   struct btrfs_ioctl_balance_args *bargs)
3985 {
3986         u64 meta_target, data_target;
3987         u64 allowed;
3988         int mixed = 0;
3989         int ret;
3990         u64 num_devices;
3991         unsigned seq;
3992         bool reducing_redundancy;
3993         int i;
3994
3995         if (btrfs_fs_closing(fs_info) ||
3996             atomic_read(&fs_info->balance_pause_req) ||
3997             btrfs_should_cancel_balance(fs_info)) {
3998                 ret = -EINVAL;
3999                 goto out;
4000         }
4001
4002         allowed = btrfs_super_incompat_flags(fs_info->super_copy);
4003         if (allowed & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
4004                 mixed = 1;
4005
4006         /*
4007          * In case of mixed groups both data and meta should be picked,
4008          * and identical options should be given for both of them.
4009          */
4010         allowed = BTRFS_BALANCE_DATA | BTRFS_BALANCE_METADATA;
4011         if (mixed && (bctl->flags & allowed)) {
4012                 if (!(bctl->flags & BTRFS_BALANCE_DATA) ||
4013                     !(bctl->flags & BTRFS_BALANCE_METADATA) ||
4014                     memcmp(&bctl->data, &bctl->meta, sizeof(bctl->data))) {
4015                         btrfs_err(fs_info,
4016           "balance: mixed groups data and metadata options must be the same");
4017                         ret = -EINVAL;
4018                         goto out;
4019                 }
4020         }
4021
4022         /*
4023          * rw_devices will not change at the moment, device add/delete/replace
4024          * are excluded by EXCL_OP
4025          */
4026         num_devices = fs_info->fs_devices->rw_devices;
4027
4028         /*
4029          * SINGLE profile on-disk has no profile bit, but in-memory we have a
4030          * special bit for it, to make it easier to distinguish.  Thus we need
4031          * to set it manually, or balance would refuse the profile.
4032          */
4033         allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE;
4034         for (i = 0; i < ARRAY_SIZE(btrfs_raid_array); i++)
4035                 if (num_devices >= btrfs_raid_array[i].devs_min)
4036                         allowed |= btrfs_raid_array[i].bg_flag;
4037
4038         if (!validate_convert_profile(fs_info, &bctl->data, allowed, "data") ||
4039             !validate_convert_profile(fs_info, &bctl->meta, allowed, "metadata") ||
4040             !validate_convert_profile(fs_info, &bctl->sys,  allowed, "system")) {
4041                 ret = -EINVAL;
4042                 goto out;
4043         }
4044
4045         /*
4046          * Allow reducing metadata or system integrity only if force is set,
4047          * for profiles with redundancy (copies, parity)
4048          */
4049         allowed = 0;
4050         for (i = 0; i < ARRAY_SIZE(btrfs_raid_array); i++) {
4051                 if (btrfs_raid_array[i].ncopies >= 2 ||
4052                     btrfs_raid_array[i].tolerated_failures >= 1)
4053                         allowed |= btrfs_raid_array[i].bg_flag;
4054         }
4055         do {
4056                 seq = read_seqbegin(&fs_info->profiles_lock);
4057
4058                 if (((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
4059                      (fs_info->avail_system_alloc_bits & allowed) &&
4060                      !(bctl->sys.target & allowed)) ||
4061                     ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
4062                      (fs_info->avail_metadata_alloc_bits & allowed) &&
4063                      !(bctl->meta.target & allowed)))
4064                         reducing_redundancy = true;
4065                 else
4066                         reducing_redundancy = false;
4067
4068                 /* if we're not converting, the target field is uninitialized */
4069                 meta_target = (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) ?
4070                         bctl->meta.target : fs_info->avail_metadata_alloc_bits;
4071                 data_target = (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) ?
4072                         bctl->data.target : fs_info->avail_data_alloc_bits;
4073         } while (read_seqretry(&fs_info->profiles_lock, seq));
4074
4075         if (reducing_redundancy) {
4076                 if (bctl->flags & BTRFS_BALANCE_FORCE) {
4077                         btrfs_info(fs_info,
4078                            "balance: force reducing metadata redundancy");
4079                 } else {
4080                         btrfs_err(fs_info,
4081         "balance: reduces metadata redundancy, use --force if you want this");
4082                         ret = -EINVAL;
4083                         goto out;
4084                 }
4085         }
4086
4087         if (btrfs_get_num_tolerated_disk_barrier_failures(meta_target) <
4088                 btrfs_get_num_tolerated_disk_barrier_failures(data_target)) {
4089                 btrfs_warn(fs_info,
4090         "balance: metadata profile %s has lower redundancy than data profile %s",
4091                                 btrfs_bg_type_to_raid_name(meta_target),
4092                                 btrfs_bg_type_to_raid_name(data_target));
4093         }
4094
4095         if (fs_info->send_in_progress) {
4096                 btrfs_warn_rl(fs_info,
4097 "cannot run balance while send operations are in progress (%d in progress)",
4098                               fs_info->send_in_progress);
4099                 ret = -EAGAIN;
4100                 goto out;
4101         }
4102
4103         ret = insert_balance_item(fs_info, bctl);
4104         if (ret && ret != -EEXIST)
4105                 goto out;
4106
4107         if (!(bctl->flags & BTRFS_BALANCE_RESUME)) {
4108                 BUG_ON(ret == -EEXIST);
4109                 BUG_ON(fs_info->balance_ctl);
4110                 spin_lock(&fs_info->balance_lock);
4111                 fs_info->balance_ctl = bctl;
4112                 spin_unlock(&fs_info->balance_lock);
4113         } else {
4114                 BUG_ON(ret != -EEXIST);
4115                 spin_lock(&fs_info->balance_lock);
4116                 update_balance_args(bctl);
4117                 spin_unlock(&fs_info->balance_lock);
4118         }
4119
4120         ASSERT(!test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
4121         set_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags);
4122         describe_balance_start_or_resume(fs_info);
4123         mutex_unlock(&fs_info->balance_mutex);
4124
4125         ret = __btrfs_balance(fs_info);
4126
4127         mutex_lock(&fs_info->balance_mutex);
4128         if (ret == -ECANCELED && atomic_read(&fs_info->balance_pause_req))
4129                 btrfs_info(fs_info, "balance: paused");
4130         else if (ret == -ECANCELED && atomic_read(&fs_info->balance_cancel_req))
4131                 btrfs_info(fs_info, "balance: canceled");
4132         else
4133                 btrfs_info(fs_info, "balance: ended with status: %d", ret);
4134
4135         clear_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags);
4136
4137         if (bargs) {
4138                 memset(bargs, 0, sizeof(*bargs));
4139                 btrfs_update_ioctl_balance_args(fs_info, bargs);
4140         }
4141
4142         if ((ret && ret != -ECANCELED && ret != -ENOSPC) ||
4143             balance_need_close(fs_info)) {
4144                 reset_balance_state(fs_info);
4145                 clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
4146         }
4147
4148         wake_up(&fs_info->balance_wait_q);
4149
4150         return ret;
4151 out:
4152         if (bctl->flags & BTRFS_BALANCE_RESUME)
4153                 reset_balance_state(fs_info);
4154         else
4155                 kfree(bctl);
4156         clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
4157
4158         return ret;
4159 }
4160
4161 static int balance_kthread(void *data)
4162 {
4163         struct btrfs_fs_info *fs_info = data;
4164         int ret = 0;
4165
4166         mutex_lock(&fs_info->balance_mutex);
4167         if (fs_info->balance_ctl)
4168                 ret = btrfs_balance(fs_info, fs_info->balance_ctl, NULL);
4169         mutex_unlock(&fs_info->balance_mutex);
4170
4171         return ret;
4172 }
4173
4174 int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info)
4175 {
4176         struct task_struct *tsk;
4177
4178         mutex_lock(&fs_info->balance_mutex);
4179         if (!fs_info->balance_ctl) {
4180                 mutex_unlock(&fs_info->balance_mutex);
4181                 return 0;
4182         }
4183         mutex_unlock(&fs_info->balance_mutex);
4184
4185         if (btrfs_test_opt(fs_info, SKIP_BALANCE)) {
4186                 btrfs_info(fs_info, "balance: resume skipped");
4187                 return 0;
4188         }
4189
4190         /*
4191          * A ro->rw remount sequence should continue with the paused balance
4192          * regardless of who paused it (the system or the user), so set
4193          * the resume flag.
4194          */
4195         spin_lock(&fs_info->balance_lock);
4196         fs_info->balance_ctl->flags |= BTRFS_BALANCE_RESUME;
4197         spin_unlock(&fs_info->balance_lock);
4198
4199         tsk = kthread_run(balance_kthread, fs_info, "btrfs-balance");
4200         return PTR_ERR_OR_ZERO(tsk);
4201 }
4202
4203 int btrfs_recover_balance(struct btrfs_fs_info *fs_info)
4204 {
4205         struct btrfs_balance_control *bctl;
4206         struct btrfs_balance_item *item;
4207         struct btrfs_disk_balance_args disk_bargs;
4208         struct btrfs_path *path;
4209         struct extent_buffer *leaf;
4210         struct btrfs_key key;
4211         int ret;
4212
4213         path = btrfs_alloc_path();
4214         if (!path)
4215                 return -ENOMEM;
4216
4217         key.objectid = BTRFS_BALANCE_OBJECTID;
4218         key.type = BTRFS_TEMPORARY_ITEM_KEY;
4219         key.offset = 0;
4220
4221         ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
4222         if (ret < 0)
4223                 goto out;
4224         if (ret > 0) { /* ret = -ENOENT; */
4225                 ret = 0;
4226                 goto out;
4227         }
4228
4229         bctl = kzalloc(sizeof(*bctl), GFP_NOFS);
4230         if (!bctl) {
4231                 ret = -ENOMEM;
4232                 goto out;
4233         }
4234
4235         leaf = path->nodes[0];
4236         item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item);
4237
4238         bctl->flags = btrfs_balance_flags(leaf, item);
4239         bctl->flags |= BTRFS_BALANCE_RESUME;
4240
4241         btrfs_balance_data(leaf, item, &disk_bargs);
4242         btrfs_disk_balance_args_to_cpu(&bctl->data, &disk_bargs);
4243         btrfs_balance_meta(leaf, item, &disk_bargs);
4244         btrfs_disk_balance_args_to_cpu(&bctl->meta, &disk_bargs);
4245         btrfs_balance_sys(leaf, item, &disk_bargs);
4246         btrfs_disk_balance_args_to_cpu(&bctl->sys, &disk_bargs);
4247
4248         /*
4249          * This should never happen, as the paused balance state is recovered
4250          * during mount with no chance for other exclusive ops to collide.
4251          *
4252          * This gives the exclusive op status to balance and keeps it in the
4253          * paused state until user intervention (cancel or umount). If the
4254          * ownership cannot be assigned, show a message but do not fail. The
4255          * balance is in a paused state and must have fs_info::balance_ctl
4256          * properly set up.
4257          */
4258         if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags))
4259                 btrfs_warn(fs_info,
4260         "balance: cannot set exclusive op status, resume manually");
4261
4262         mutex_lock(&fs_info->balance_mutex);
4263         BUG_ON(fs_info->balance_ctl);
4264         spin_lock(&fs_info->balance_lock);
4265         fs_info->balance_ctl = bctl;
4266         spin_unlock(&fs_info->balance_lock);
4267         mutex_unlock(&fs_info->balance_mutex);
4268 out:
4269         btrfs_free_path(path);
4270         return ret;
4271 }
4272
4273 int btrfs_pause_balance(struct btrfs_fs_info *fs_info)
4274 {
4275         int ret = 0;
4276
4277         mutex_lock(&fs_info->balance_mutex);
4278         if (!fs_info->balance_ctl) {
4279                 mutex_unlock(&fs_info->balance_mutex);
4280                 return -ENOTCONN;
4281         }
4282
4283         if (test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) {
4284                 atomic_inc(&fs_info->balance_pause_req);
4285                 mutex_unlock(&fs_info->balance_mutex);
4286
4287                 wait_event(fs_info->balance_wait_q,
4288                            !test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
4289
4290                 mutex_lock(&fs_info->balance_mutex);
4291                 /* We are fine with balance_ctl being ripped out from under us */
4292                 BUG_ON(test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
4293                 atomic_dec(&fs_info->balance_pause_req);
4294         } else {
4295                 ret = -ENOTCONN;
4296         }
4297
4298         mutex_unlock(&fs_info->balance_mutex);
4299         return ret;
4300 }
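
/*
 * Editor's sketch (not part of the original source): the pause handshake
 * above, assuming the balance worker is mid-run, is roughly:
 *
 *   btrfs_pause_balance()                 btrfs_balance() worker
 *   ---------------------                 ----------------------
 *   atomic_inc(&balance_pause_req)
 *   wait_event(balance_wait_q, ...)  -->  __btrfs_balance() notices the
 *                                         request, returns -ECANCELED
 *                                         clear_bit(BTRFS_FS_BALANCE_RUNNING)
 *                                         wake_up(&balance_wait_q)
 *   sees BALANCE_RUNNING cleared
 *   atomic_dec(&balance_pause_req)
 */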
4301
4302 int btrfs_cancel_balance(struct btrfs_fs_info *fs_info)
4303 {
4304         mutex_lock(&fs_info->balance_mutex);
4305         if (!fs_info->balance_ctl) {
4306                 mutex_unlock(&fs_info->balance_mutex);
4307                 return -ENOTCONN;
4308         }
4309
4310         /*
4311          * A paused balance with the item stored on disk can be resumed at
4312          * mount time if the mount is read-write. Otherwise it's still paused
4313          * and we must not allow cancelling as it deletes the item.
4314          */
4315         if (sb_rdonly(fs_info->sb)) {
4316                 mutex_unlock(&fs_info->balance_mutex);
4317                 return -EROFS;
4318         }
4319
4320         atomic_inc(&fs_info->balance_cancel_req);
4321         /*
4322          * If balance is running, just wait and return; the balance item is
4323          * deleted in btrfs_balance() in that case.
4324          */
4325         if (test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) {
4326                 mutex_unlock(&fs_info->balance_mutex);
4327                 wait_event(fs_info->balance_wait_q,
4328                            !test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
4329                 mutex_lock(&fs_info->balance_mutex);
4330         } else {
4331                 mutex_unlock(&fs_info->balance_mutex);
4332                 /*
4333                  * The lock was released to allow other waiters to
4334                  * continue; reexamine the status afterwards.
4335                  */
4336                 mutex_lock(&fs_info->balance_mutex);
4337
4338                 if (fs_info->balance_ctl) {
4339                         reset_balance_state(fs_info);
4340                         clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
4341                         btrfs_info(fs_info, "balance: canceled");
4342                 }
4343         }
4344
4345         BUG_ON(fs_info->balance_ctl ||
4346                 test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
4347         atomic_dec(&fs_info->balance_cancel_req);
4348         mutex_unlock(&fs_info->balance_mutex);
4349         return 0;
4350 }
4351
4352 int btrfs_uuid_scan_kthread(void *data)
4353 {
4354         struct btrfs_fs_info *fs_info = data;
4355         struct btrfs_root *root = fs_info->tree_root;
4356         struct btrfs_key key;
4357         struct btrfs_path *path = NULL;
4358         int ret = 0;
4359         struct extent_buffer *eb;
4360         int slot;
4361         struct btrfs_root_item root_item;
4362         u32 item_size;
4363         struct btrfs_trans_handle *trans = NULL;
4364         bool closing = false;
4365
4366         path = btrfs_alloc_path();
4367         if (!path) {
4368                 ret = -ENOMEM;
4369                 goto out;
4370         }
4371
4372         key.objectid = 0;
4373         key.type = BTRFS_ROOT_ITEM_KEY;
4374         key.offset = 0;
4375
4376         while (1) {
4377                 if (btrfs_fs_closing(fs_info)) {
4378                         closing = true;
4379                         break;
4380                 }
4381                 ret = btrfs_search_forward(root, &key, path,
4382                                 BTRFS_OLDEST_GENERATION);
4383                 if (ret) {
4384                         if (ret > 0)
4385                                 ret = 0;
4386                         break;
4387                 }
4388
4389                 if (key.type != BTRFS_ROOT_ITEM_KEY ||
4390                     (key.objectid < BTRFS_FIRST_FREE_OBJECTID &&
4391                      key.objectid != BTRFS_FS_TREE_OBJECTID) ||
4392                     key.objectid > BTRFS_LAST_FREE_OBJECTID)
4393                         goto skip;
4394
4395                 eb = path->nodes[0];
4396                 slot = path->slots[0];
4397                 item_size = btrfs_item_size_nr(eb, slot);
4398                 if (item_size < sizeof(root_item))
4399                         goto skip;
4400
4401                 read_extent_buffer(eb, &root_item,
4402                                    btrfs_item_ptr_offset(eb, slot),
4403                                    (int)sizeof(root_item));
4404                 if (btrfs_root_refs(&root_item) == 0)
4405                         goto skip;
4406
4407                 if (!btrfs_is_empty_uuid(root_item.uuid) ||
4408                     !btrfs_is_empty_uuid(root_item.received_uuid)) {
4409                         if (trans)
4410                                 goto update_tree;
4411
4412                         btrfs_release_path(path);
4413                         /*
4414                          * 1 - subvol uuid item
4415                          * 1 - received_subvol uuid item
4416                          */
4417                         trans = btrfs_start_transaction(fs_info->uuid_root, 2);
4418                         if (IS_ERR(trans)) {
4419                                 ret = PTR_ERR(trans);
4420                                 break;
4421                         }
4422                         continue;
4423                 } else {
4424                         goto skip;
4425                 }
4426 update_tree:
4427                 if (!btrfs_is_empty_uuid(root_item.uuid)) {
4428                         ret = btrfs_uuid_tree_add(trans, root_item.uuid,
4429                                                   BTRFS_UUID_KEY_SUBVOL,
4430                                                   key.objectid);
4431                         if (ret < 0) {
4432                                 btrfs_warn(fs_info, "uuid_tree_add failed %d",
4433                                         ret);
4434                                 break;
4435                         }
4436                 }
4437
4438                 if (!btrfs_is_empty_uuid(root_item.received_uuid)) {
4439                         ret = btrfs_uuid_tree_add(trans,
4440                                                   root_item.received_uuid,
4441                                                  BTRFS_UUID_KEY_RECEIVED_SUBVOL,
4442                                                   key.objectid);
4443                         if (ret < 0) {
4444                                 btrfs_warn(fs_info, "uuid_tree_add failed %d",
4445                                         ret);
4446                                 break;
4447                         }
4448                 }
4449
4450 skip:
4451                 if (trans) {
4452                         ret = btrfs_end_transaction(trans);
4453                         trans = NULL;
4454                         if (ret)
4455                                 break;
4456                 }
4457
4458                 btrfs_release_path(path);
4459                 if (key.offset < (u64)-1) {
4460                         key.offset++;
4461                 } else if (key.type < BTRFS_ROOT_ITEM_KEY) {
4462                         key.offset = 0;
4463                         key.type = BTRFS_ROOT_ITEM_KEY;
4464                 } else if (key.objectid < (u64)-1) {
4465                         key.offset = 0;
4466                         key.type = BTRFS_ROOT_ITEM_KEY;
4467                         key.objectid++;
4468                 } else {
4469                         break;
4470                 }
4471                 cond_resched();
4472         }
4473
4474 out:
4475         btrfs_free_path(path);
4476         if (trans && !IS_ERR(trans))
4477                 btrfs_end_transaction(trans);
4478         if (ret)
4479                 btrfs_warn(fs_info, "btrfs_uuid_scan_kthread failed %d", ret);
4480         else if (!closing)
4481                 set_bit(BTRFS_FS_UPDATE_UUID_TREE_GEN, &fs_info->flags);
4482         up(&fs_info->uuid_tree_rescan_sem);
4483         return 0;
4484 }
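
/*
 * Editor's sketch (illustrative, not in the original source): the key
 * advancement at the bottom of the scan loop above walks the key space in
 * (objectid, type, offset) order, like incrementing a multi-digit number:
 *
 *   (N, ROOT_ITEM, 5)              ->  (N, ROOT_ITEM, 6)       offset++
 *   (N, ROOT_ITEM, (u64)-1)        ->  (N + 1, ROOT_ITEM, 0)   objectid++
 *   ((u64)-1, ROOT_ITEM, (u64)-1)  ->  loop exits, scan complete
 *
 * The middle branch (key.type < BTRFS_ROOT_ITEM_KEY) only triggers when
 * btrfs_search_forward() left the cursor on a smaller key type; the scan
 * then restarts at ROOT_ITEM_KEY for the same objectid.
 */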
4485
4486 int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info)
4487 {
4488         struct btrfs_trans_handle *trans;
4489         struct btrfs_root *tree_root = fs_info->tree_root;
4490         struct btrfs_root *uuid_root;
4491         struct task_struct *task;
4492         int ret;
4493
4494         /*
4495          * 1 - root node
4496          * 1 - root item
4497          */
4498         trans = btrfs_start_transaction(tree_root, 2);
4499         if (IS_ERR(trans))
4500                 return PTR_ERR(trans);
4501
4502         uuid_root = btrfs_create_tree(trans, BTRFS_UUID_TREE_OBJECTID);
4503         if (IS_ERR(uuid_root)) {
4504                 ret = PTR_ERR(uuid_root);
4505                 btrfs_abort_transaction(trans, ret);
4506                 btrfs_end_transaction(trans);
4507                 return ret;
4508         }
4509
4510         fs_info->uuid_root = uuid_root;
4511
4512         ret = btrfs_commit_transaction(trans);
4513         if (ret)
4514                 return ret;
4515
4516         down(&fs_info->uuid_tree_rescan_sem);
4517         task = kthread_run(btrfs_uuid_scan_kthread, fs_info, "btrfs-uuid");
4518         if (IS_ERR(task)) {
4519                 /* fs_info->update_uuid_tree_gen remains 0 in all error cases */
4520                 btrfs_warn(fs_info, "failed to start uuid_scan task");
4521                 up(&fs_info->uuid_tree_rescan_sem);
4522                 return PTR_ERR(task);
4523         }
4524
4525         return 0;
4526 }
4527
4528 /*
4529  * Shrinking a device means finding all of the device extents past
4530  * the new size, and then following the back refs to the chunks.
4531  * The chunk relocation code actually frees the device extents.
4532  */
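/*
 * Editor's note (illustrative, not in the original source): this path is
 * reached from the resize ioctl when the requested size is smaller than
 * the current one, e.g. from userspace (assuming devid 1 is mounted at
 * /mnt):
 *
 *   btrfs filesystem resize 1:-10G /mnt
 */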
4533 int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
4534 {
4535         struct btrfs_fs_info *fs_info = device->fs_info;
4536         struct btrfs_root *root = fs_info->dev_root;
4537         struct btrfs_trans_handle *trans;
4538         struct btrfs_dev_extent *dev_extent = NULL;
4539         struct btrfs_path *path;
4540         u64 length;
4541         u64 chunk_offset;
4542         int ret;
4543         int slot;
4544         int failed = 0;
4545         bool retried = false;
4546         struct extent_buffer *l;
4547         struct btrfs_key key;
4548         struct btrfs_super_block *super_copy = fs_info->super_copy;
4549         u64 old_total = btrfs_super_total_bytes(super_copy);
4550         u64 old_size = btrfs_device_get_total_bytes(device);
4551         u64 diff;
4552         u64 start;
4553
4554         new_size = round_down(new_size, fs_info->sectorsize);
4555         start = new_size;
4556         diff = round_down(old_size - new_size, fs_info->sectorsize);
4557
4558         if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state))
4559                 return -EINVAL;
4560
4561         path = btrfs_alloc_path();
4562         if (!path)
4563                 return -ENOMEM;
4564
4565         path->reada = READA_BACK;
4566
4567         trans = btrfs_start_transaction(root, 0);
4568         if (IS_ERR(trans)) {
4569                 btrfs_free_path(path);
4570                 return PTR_ERR(trans);
4571         }
4572
4573         mutex_lock(&fs_info->chunk_mutex);
4574
4575         btrfs_device_set_total_bytes(device, new_size);
4576         if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
4577                 device->fs_devices->total_rw_bytes -= diff;
4578                 atomic64_sub(diff, &fs_info->free_chunk_space);
4579         }
4580
4581         /*
4582          * Once the device's size has been set to the new size, ensure all
4583          * in-memory chunks are synced to disk so that the loop below sees them
4584          * and relocates them accordingly.
4585          */
4586         if (contains_pending_extent(device, &start, diff)) {
4587                 mutex_unlock(&fs_info->chunk_mutex);
4588                 ret = btrfs_commit_transaction(trans);
4589                 if (ret)
4590                         goto done;
4591         } else {
4592                 mutex_unlock(&fs_info->chunk_mutex);
4593                 btrfs_end_transaction(trans);
4594         }
4595
4596 again:
4597         key.objectid = device->devid;
4598         key.offset = (u64)-1;
4599         key.type = BTRFS_DEV_EXTENT_KEY;
4600
4601         do {
4602                 mutex_lock(&fs_info->delete_unused_bgs_mutex);
4603                 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
4604                 if (ret < 0) {
4605                         mutex_unlock(&fs_info->delete_unused_bgs_mutex);
4606                         goto done;
4607                 }
4608
4609                 ret = btrfs_previous_item(root, path, 0, key.type);
4610                 if (ret)
4611                         mutex_unlock(&fs_info->delete_unused_bgs_mutex);
4612                 if (ret < 0)
4613                         goto done;
4614                 if (ret) {
4615                         ret = 0;
4616                         btrfs_release_path(path);
4617                         break;
4618                 }
4619
4620                 l = path->nodes[0];
4621                 slot = path->slots[0];
4622                 btrfs_item_key_to_cpu(l, &key, path->slots[0]);
4623
4624                 if (key.objectid != device->devid) {
4625                         mutex_unlock(&fs_info->delete_unused_bgs_mutex);
4626                         btrfs_release_path(path);
4627                         break;
4628                 }
4629
4630                 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
4631                 length = btrfs_dev_extent_length(l, dev_extent);
4632
4633                 if (key.offset + length <= new_size) {
4634                         mutex_unlock(&fs_info->delete_unused_bgs_mutex);
4635                         btrfs_release_path(path);
4636                         break;
4637                 }
4638
4639                 chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
4640                 btrfs_release_path(path);
4641
4642                 /*
4643                  * We may be relocating the only data chunk we have,
4644                  * which could potentially end up losing the data's
4645                  * raid profile, so let's allocate an empty one in
4646                  * advance.
4647                  */
4648                 ret = btrfs_may_alloc_data_chunk(fs_info, chunk_offset);
4649                 if (ret < 0) {
4650                         mutex_unlock(&fs_info->delete_unused_bgs_mutex);
4651                         goto done;
4652                 }
4653
4654                 ret = btrfs_relocate_chunk(fs_info, chunk_offset);
4655                 mutex_unlock(&fs_info->delete_unused_bgs_mutex);
4656                 if (ret == -ENOSPC) {
4657                         failed++;
4658                 } else if (ret) {
4659                         if (ret == -ETXTBSY) {
4660                                 btrfs_warn(fs_info,
4661                    "could not shrink block group %llu due to active swapfile",
4662                                            chunk_offset);
4663                         }
4664                         goto done;
4665                 }
4666         } while (key.offset-- > 0);
4667
4668         if (failed && !retried) {
4669                 failed = 0;
4670                 retried = true;
4671                 goto again;
4672         } else if (failed && retried) {
4673                 ret = -ENOSPC;
4674                 goto done;
4675         }
4676
4677         /* Shrinking succeeded, else we would be at "done". */
4678         trans = btrfs_start_transaction(root, 0);
4679         if (IS_ERR(trans)) {
4680                 ret = PTR_ERR(trans);
4681                 goto done;
4682         }
4683
4684         mutex_lock(&fs_info->chunk_mutex);
4685         btrfs_device_set_disk_total_bytes(device, new_size);
4686         if (list_empty(&device->post_commit_list))
4687                 list_add_tail(&device->post_commit_list,
4688                               &trans->transaction->dev_update_list);
4689
4690         WARN_ON(diff > old_total);
4691         btrfs_set_super_total_bytes(super_copy,
4692                         round_down(old_total - diff, fs_info->sectorsize));
4693         mutex_unlock(&fs_info->chunk_mutex);
4694
4695         /* Now btrfs_update_device() will change the on-disk size. */
4696         ret = btrfs_update_device(trans, device);
4697         if (ret < 0) {
4698                 btrfs_abort_transaction(trans, ret);
4699                 btrfs_end_transaction(trans);
4700         } else {
4701                 ret = btrfs_commit_transaction(trans);
4702         }
4703 done:
4704         btrfs_free_path(path);
4705         if (ret) {
4706                 mutex_lock(&fs_info->chunk_mutex);
4707                 btrfs_device_set_total_bytes(device, old_size);
4708                 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
4709                         device->fs_devices->total_rw_bytes += diff;
4710                 atomic64_add(diff, &fs_info->free_chunk_space);
4711                 mutex_unlock(&fs_info->chunk_mutex);
4712         }
4713         return ret;
4714 }
4715
4716 static int btrfs_add_system_chunk(struct btrfs_fs_info *fs_info,
4717                            struct btrfs_key *key,
4718                            struct btrfs_chunk *chunk, int item_size)
4719 {
4720         struct btrfs_super_block *super_copy = fs_info->super_copy;
4721         struct btrfs_disk_key disk_key;
4722         u32 array_size;
4723         u8 *ptr;
4724
4725         mutex_lock(&fs_info->chunk_mutex);
4726         array_size = btrfs_super_sys_array_size(super_copy);
4727         if (array_size + item_size + sizeof(disk_key)
4728                         > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) {
4729                 mutex_unlock(&fs_info->chunk_mutex);
4730                 return -EFBIG;
4731         }
4732
4733         ptr = super_copy->sys_chunk_array + array_size;
4734         btrfs_cpu_key_to_disk(&disk_key, key);
4735         memcpy(ptr, &disk_key, sizeof(disk_key));
4736         ptr += sizeof(disk_key);
4737         memcpy(ptr, chunk, item_size);
4738         item_size += sizeof(disk_key);
4739         btrfs_set_super_sys_array_size(super_copy, array_size + item_size);
4740         mutex_unlock(&fs_info->chunk_mutex);
4741
4742         return 0;
4743 }
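
/*
 * Editor's sketch (not in the original source): sys_chunk_array in the
 * superblock is a packed sequence of (disk key, chunk item) pairs, and
 * btrfs_add_system_chunk() appends one pair at the current array_size:
 *
 *   | disk_key | chunk item | disk_key | chunk item | ... unused ... |
 *   0                                  array_size
 *
 * The append fails with -EFBIG once the new pair would push the array
 * past BTRFS_SYSTEM_CHUNK_ARRAY_SIZE.
 */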
4744
4745 /*
4746  * sort the devices in descending order by max_avail, total_avail
4747  */
4748 static int btrfs_cmp_device_info(const void *a, const void *b)
4749 {
4750         const struct btrfs_device_info *di_a = a;
4751         const struct btrfs_device_info *di_b = b;
4752
4753         if (di_a->max_avail > di_b->max_avail)
4754                 return -1;
4755         if (di_a->max_avail < di_b->max_avail)
4756                 return 1;
4757         if (di_a->total_avail > di_b->total_avail)
4758                 return -1;
4759         if (di_a->total_avail < di_b->total_avail)
4760                 return 1;
4761         return 0;
4762 }
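
/*
 * Editor's example (illustrative values): after sort() with this
 * comparator, devices_info could be ordered as
 *
 *   { max_avail = 4G, total_avail = 8G }
 *   { max_avail = 2G, total_avail = 9G }
 *   { max_avail = 2G, total_avail = 5G }
 *
 * i.e. largest contiguous hole first, total free space breaking ties.
 */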
4763
4764 static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type)
4765 {
4766         if (!(type & BTRFS_BLOCK_GROUP_RAID56_MASK))
4767                 return;
4768
4769         btrfs_set_fs_incompat(info, RAID56);
4770 }
4771
4772 static void check_raid1c34_incompat_flag(struct btrfs_fs_info *info, u64 type)
4773 {
4774         if (!(type & (BTRFS_BLOCK_GROUP_RAID1C3 | BTRFS_BLOCK_GROUP_RAID1C4)))
4775                 return;
4776
4777         btrfs_set_fs_incompat(info, RAID1C34);
4778 }
4779
4780 /*
4781  * Structure used internally by btrfs_alloc_chunk() and its helpers.
4782  * Wraps the needed parameters.
4783  */
4784 struct alloc_chunk_ctl {
4785         u64 start;
4786         u64 type;
4787         /* Total number of stripes to allocate */
4788         int num_stripes;
4789         /* sub_stripes info for map */
4790         int sub_stripes;
4791         /* Stripes per device */
4792         int dev_stripes;
4793         /* Maximum number of devices to use */
4794         int devs_max;
4795         /* Minimum number of devices to use */
4796         int devs_min;
4797         /* ndevs has to be a multiple of this */
4798         int devs_increment;
4799         /* Number of copies */
4800         int ncopies;
4801         /* Number of stripes worth of bytes to store parity information */
4802         int nparity;
4803         u64 max_stripe_size;
4804         u64 max_chunk_size;
4805         u64 dev_extent_min;
4806         u64 stripe_size;
4807         u64 chunk_size;
4808         int ndevs;
4809 };
4810
4811 static void init_alloc_chunk_ctl_policy_regular(
4812                                 struct btrfs_fs_devices *fs_devices,
4813                                 struct alloc_chunk_ctl *ctl)
4814 {
4815         u64 type = ctl->type;
4816
4817         if (type & BTRFS_BLOCK_GROUP_DATA) {
4818                 ctl->max_stripe_size = SZ_1G;
4819                 ctl->max_chunk_size = BTRFS_MAX_DATA_CHUNK_SIZE;
4820         } else if (type & BTRFS_BLOCK_GROUP_METADATA) {
4821                 /* For larger filesystems, use larger metadata chunks */
4822                 if (fs_devices->total_rw_bytes > 50ULL * SZ_1G)
4823                         ctl->max_stripe_size = SZ_1G;
4824                 else
4825                         ctl->max_stripe_size = SZ_256M;
4826                 ctl->max_chunk_size = ctl->max_stripe_size;
4827         } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
4828                 ctl->max_stripe_size = SZ_32M;
4829                 ctl->max_chunk_size = 2 * ctl->max_stripe_size;
4830                 ctl->devs_max = min_t(int, ctl->devs_max,
4831                                       BTRFS_MAX_DEVS_SYS_CHUNK);
4832         } else {
4833                 BUG();
4834         }
4835
4836         /* We don't want a chunk larger than 10% of writable space */
4837         ctl->max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1),
4838                                   ctl->max_chunk_size);
4839         ctl->dev_extent_min = BTRFS_STRIPE_LEN * ctl->dev_stripes;
4840 }
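
/*
 * Editor's worked example (illustrative numbers): on a filesystem with
 * 100 GiB of writable space, the caps above become:
 *
 *   data:     max_chunk_size = min(BTRFS_MAX_DATA_CHUNK_SIZE,
 *                                  100 GiB / 10) = 10 GiB
 *   metadata: max_stripe_size = 1 GiB (total_rw_bytes > 50 GiB),
 *             max_chunk_size  = min(1 GiB, 10 GiB) = 1 GiB
 */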
4841
4842 static void init_alloc_chunk_ctl(struct btrfs_fs_devices *fs_devices,
4843                                  struct alloc_chunk_ctl *ctl)
4844 {
4845         int index = btrfs_bg_flags_to_raid_index(ctl->type);
4846
4847         ctl->sub_stripes = btrfs_raid_array[index].sub_stripes;
4848         ctl->dev_stripes = btrfs_raid_array[index].dev_stripes;
4849         ctl->devs_max = btrfs_raid_array[index].devs_max;
4850         if (!ctl->devs_max)
4851                 ctl->devs_max = BTRFS_MAX_DEVS(fs_devices->fs_info);
4852         ctl->devs_min = btrfs_raid_array[index].devs_min;
4853         ctl->devs_increment = btrfs_raid_array[index].devs_increment;
4854         ctl->ncopies = btrfs_raid_array[index].ncopies;
4855         ctl->nparity = btrfs_raid_array[index].nparity;
4856         ctl->ndevs = 0;
4857
4858         switch (fs_devices->chunk_alloc_policy) {
4859         case BTRFS_CHUNK_ALLOC_REGULAR:
4860                 init_alloc_chunk_ctl_policy_regular(fs_devices, ctl);
4861                 break;
4862         default:
4863                 BUG();
4864         }
4865 }
4866
4867 static int gather_device_info(struct btrfs_fs_devices *fs_devices,
4868                               struct alloc_chunk_ctl *ctl,
4869                               struct btrfs_device_info *devices_info)
4870 {
4871         struct btrfs_fs_info *info = fs_devices->fs_info;
4872         struct btrfs_device *device;
4873         u64 total_avail;
4874         u64 dev_extent_want = ctl->max_stripe_size * ctl->dev_stripes;
4875         int ret;
4876         int ndevs = 0;
4877         u64 max_avail;
4878         u64 dev_offset;
4879
4880         /*
4881          * in the first pass through the devices list, we gather information
4882          * about the available holes on each device.
4883          */
4884         list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
4885                 if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
4886                         WARN(1, KERN_ERR
4887                                "BTRFS: read-only device in alloc_list\n");
4888                         continue;
4889                 }
4890
4891                 if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
4892                                         &device->dev_state) ||
4893                     test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state))
4894                         continue;
4895
4896                 if (device->total_bytes > device->bytes_used)
4897                         total_avail = device->total_bytes - device->bytes_used;
4898                 else
4899                         total_avail = 0;
4900
4901                 /* If there is no space on this device, skip it. */
4902                 if (total_avail < ctl->dev_extent_min)
4903                         continue;
4904
4905                 ret = find_free_dev_extent(device, dev_extent_want, &dev_offset,
4906                                            &max_avail);
4907                 if (ret && ret != -ENOSPC)
4908                         return ret;
4909
4910                 if (ret == 0)
4911                         max_avail = dev_extent_want;
4912
4913                 if (max_avail < ctl->dev_extent_min) {
4914                         if (btrfs_test_opt(info, ENOSPC_DEBUG))
4915                                 btrfs_debug(info,
4916                         "%s: devid %llu has no free space, have=%llu want=%llu",
4917                                             __func__, device->devid, max_avail,
4918                                             ctl->dev_extent_min);
4919                         continue;
4920                 }
4921
4922                 if (ndevs == fs_devices->rw_devices) {
4923                         WARN(1, "%s: found more than %llu devices\n",
4924                              __func__, fs_devices->rw_devices);
4925                         break;
4926                 }
4927                 devices_info[ndevs].dev_offset = dev_offset;
4928                 devices_info[ndevs].max_avail = max_avail;
4929                 devices_info[ndevs].total_avail = total_avail;
4930                 devices_info[ndevs].dev = device;
4931                 ++ndevs;
4932         }
4933         ctl->ndevs = ndevs;
4934
4935         /*
4936          * now sort the devices by hole size / available space
4937          */
4938         sort(devices_info, ndevs, sizeof(struct btrfs_device_info),
4939              btrfs_cmp_device_info, NULL);
4940
4941         return 0;
4942 }
4943
4944 static int decide_stripe_size_regular(struct alloc_chunk_ctl *ctl,
4945                                       struct btrfs_device_info *devices_info)
4946 {
4947         /* Number of stripes that count for block group size */
4948         int data_stripes;
4949
4950         /*
4951          * The primary goal is to maximize the number of stripes, so use as
4952          * many devices as possible, even if the stripes are not maximum sized.
4953          *
4954          * The DUP profile stores more than one stripe per device, and
4955          * max_avail is the total size, so we have to adjust.
4956          */
4957         ctl->stripe_size = div_u64(devices_info[ctl->ndevs - 1].max_avail,
4958                                    ctl->dev_stripes);
4959         ctl->num_stripes = ctl->ndevs * ctl->dev_stripes;
4960
4961         /* This will have to be fixed for RAID1 and RAID10 over more drives */
4962         data_stripes = (ctl->num_stripes - ctl->nparity) / ctl->ncopies;
4963
4964         /*
4965          * Use the number of data stripes to figure out how big this chunk is
4966          * really going to be in terms of logical address space, and compare
4967          * that answer with the max chunk size. If it's higher, we try to
4968          * reduce stripe_size.
4969          */
4970         if (ctl->stripe_size * data_stripes > ctl->max_chunk_size) {
4971                 /*
4972                  * Reduce stripe_size, round it up to a 16MB boundary again and
4973                  * then use it, unless it ends up being even bigger than the
4974                  * previous value we had already.
4975                  */
4976                 ctl->stripe_size = min(round_up(div_u64(ctl->max_chunk_size,
4977                                                         data_stripes), SZ_16M),
4978                                        ctl->stripe_size);
4979         }
4980
4981         /* Align to BTRFS_STRIPE_LEN */
4982         ctl->stripe_size = round_down(ctl->stripe_size, BTRFS_STRIPE_LEN);
4983         ctl->chunk_size = ctl->stripe_size * data_stripes;
4984
4985         return 0;
4986 }
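
/*
 * Editor's worked example (illustrative numbers): RAID10 over 4 devices,
 * with 1 GiB of max_avail on the smallest usable device:
 *
 *   stripe_size  = 1 GiB / dev_stripes(1)        = 1 GiB
 *   num_stripes  = ndevs(4) * dev_stripes(1)     = 4
 *   data_stripes = (4 - nparity(0)) / ncopies(2) = 2
 *   chunk_size   = 1 GiB * 2                     = 2 GiB
 *
 * The chunk consumes 1 GiB on each of the 4 devices and exposes 2 GiB of
 * logical address space.
 */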
4987
4988 static int decide_stripe_size(struct btrfs_fs_devices *fs_devices,
4989                               struct alloc_chunk_ctl *ctl,
4990                               struct btrfs_device_info *devices_info)
4991 {
4992         struct btrfs_fs_info *info = fs_devices->fs_info;
4993
4994         /*
4995          * Round down to the number of usable devices; devs_increment can be
4996          * any number, so we can't use round_down(), which requires a power
4997          * of 2, while rounddown() is safe.
4998          */
4999         ctl->ndevs = rounddown(ctl->ndevs, ctl->devs_increment);
5000
5001         if (ctl->ndevs < ctl->devs_min) {
5002                 if (btrfs_test_opt(info, ENOSPC_DEBUG)) {
5003                         btrfs_debug(info,
5004         "%s: not enough devices with free space: have=%d minimum required=%d",
5005                                     __func__, ctl->ndevs, ctl->devs_min);
5006                 }
5007                 return -ENOSPC;
5008         }
5009
5010         ctl->ndevs = min(ctl->ndevs, ctl->devs_max);
5011
5012         switch (fs_devices->chunk_alloc_policy) {
5013         case BTRFS_CHUNK_ALLOC_REGULAR:
5014                 return decide_stripe_size_regular(ctl, devices_info);
5015         default:
5016                 BUG();
5017         }
5018 }
5019
5020 static int create_chunk(struct btrfs_trans_handle *trans,
5021                         struct alloc_chunk_ctl *ctl,
5022                         struct btrfs_device_info *devices_info)
5023 {
5024         struct btrfs_fs_info *info = trans->fs_info;
5025         struct map_lookup *map = NULL;
5026         struct extent_map_tree *em_tree;
5027         struct extent_map *em;
5028         u64 start = ctl->start;
5029         u64 type = ctl->type;
5030         int ret;
5031         int i;
5032         int j;
5033
5034         map = kmalloc(map_lookup_size(ctl->num_stripes), GFP_NOFS);
5035         if (!map)
5036                 return -ENOMEM;
5037         map->num_stripes = ctl->num_stripes;
5038
5039         for (i = 0; i < ctl->ndevs; ++i) {
5040                 for (j = 0; j < ctl->dev_stripes; ++j) {
5041                         int s = i * ctl->dev_stripes + j;
5042                         map->stripes[s].dev = devices_info[i].dev;
5043                         map->stripes[s].physical = devices_info[i].dev_offset +
5044                                                    j * ctl->stripe_size;
5045                 }
5046         }
5047         map->stripe_len = BTRFS_STRIPE_LEN;
5048         map->io_align = BTRFS_STRIPE_LEN;
5049         map->io_width = BTRFS_STRIPE_LEN;
5050         map->type = type;
5051         map->sub_stripes = ctl->sub_stripes;
5052
5053         trace_btrfs_chunk_alloc(info, map, start, ctl->chunk_size);
5054
5055         em = alloc_extent_map();
5056         if (!em) {
5057                 kfree(map);
5058                 return -ENOMEM;
5059         }
5060         set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags);
5061         em->map_lookup = map;
5062         em->start = start;
5063         em->len = ctl->chunk_size;
5064         em->block_start = 0;
5065         em->block_len = em->len;
5066         em->orig_block_len = ctl->stripe_size;
5067
5068         em_tree = &info->mapping_tree;
5069         write_lock(&em_tree->lock);
5070         ret = add_extent_mapping(em_tree, em, 0);
5071         if (ret) {
5072                 write_unlock(&em_tree->lock);
5073                 free_extent_map(em);
5074                 return ret;
5075         }
5076         write_unlock(&em_tree->lock);
5077
5078         ret = btrfs_make_block_group(trans, 0, type, start, ctl->chunk_size);
5079         if (ret)
5080                 goto error_del_extent;
5081
5082         for (i = 0; i < map->num_stripes; i++) {
5083                 struct btrfs_device *dev = map->stripes[i].dev;
5084
5085                 btrfs_device_set_bytes_used(dev,
5086                                             dev->bytes_used + ctl->stripe_size);
5087                 if (list_empty(&dev->post_commit_list))
5088                         list_add_tail(&dev->post_commit_list,
5089                                       &trans->transaction->dev_update_list);
5090         }
5091
5092         atomic64_sub(ctl->stripe_size * map->num_stripes,
5093                      &info->free_chunk_space);
5094
5095         free_extent_map(em);
5096         check_raid56_incompat_flag(info, type);
5097         check_raid1c34_incompat_flag(info, type);
5098
5099         return 0;
5100
5101 error_del_extent:
5102         write_lock(&em_tree->lock);
5103         remove_extent_mapping(em_tree, em);
5104         write_unlock(&em_tree->lock);
5105
5106         /* One for our allocation */
5107         free_extent_map(em);
5108         /* One for the tree reference */
5109         free_extent_map(em);
5110
5111         return ret;
5112 }
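
/*
 * Editor's sketch (not in the original source): for DUP (ndevs = 1,
 * dev_stripes = 2) the placement loop in create_chunk() yields two
 * stripes on the same device, one stripe_size apart:
 *
 *   map->stripes[0] = { dev = D, physical = dev_offset }
 *   map->stripes[1] = { dev = D, physical = dev_offset + stripe_size }
 */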
5113
5114 int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, u64 type)
5115 {
5116         struct btrfs_fs_info *info = trans->fs_info;
5117         struct btrfs_fs_devices *fs_devices = info->fs_devices;
5118         struct btrfs_device_info *devices_info = NULL;
5119         struct alloc_chunk_ctl ctl;
5120         int ret;
5121
5122         lockdep_assert_held(&info->chunk_mutex);
5123
5124         if (!alloc_profile_is_valid(type, 0)) {
5125                 ASSERT(0);
5126                 return -EINVAL;
5127         }
5128
5129         if (list_empty(&fs_devices->alloc_list)) {
5130                 if (btrfs_test_opt(info, ENOSPC_DEBUG))
5131                         btrfs_debug(info, "%s: no writable device", __func__);
5132                 return -ENOSPC;
5133         }
5134
5135         if (!(type & BTRFS_BLOCK_GROUP_TYPE_MASK)) {
5136                 btrfs_err(info, "invalid chunk type 0x%llx requested", type);
5137                 ASSERT(0);
5138                 return -EINVAL;
5139         }
5140
5141         ctl.start = find_next_chunk(info);
5142         ctl.type = type;
5143         init_alloc_chunk_ctl(fs_devices, &ctl);
5144
5145         devices_info = kcalloc(fs_devices->rw_devices, sizeof(*devices_info),
5146                                GFP_NOFS);
5147         if (!devices_info)
5148                 return -ENOMEM;
5149
5150         ret = gather_device_info(fs_devices, &ctl, devices_info);
5151         if (ret < 0)
5152                 goto out;
5153
5154         ret = decide_stripe_size(fs_devices, &ctl, devices_info);
5155         if (ret < 0)
5156                 goto out;
5157
5158         ret = create_chunk(trans, &ctl, devices_info);
5159
5160 out:
5161         kfree(devices_info);
5162         return ret;
5163 }
5164
5165 /*
5166  * Chunk allocation falls into two parts. The first part does the work
5167  * that makes the newly allocated chunk usable, but does not do any
5168  * operation that modifies the chunk tree. The second part does the work
5169  * that requires modifying the chunk tree. This division is important for
5170  * the bootstrap process of adding storage to a seed btrfs.
5171  */
5172 int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
5173                              u64 chunk_offset, u64 chunk_size)
5174 {
5175         struct btrfs_fs_info *fs_info = trans->fs_info;
5176         struct btrfs_root *extent_root = fs_info->extent_root;
5177         struct btrfs_root *chunk_root = fs_info->chunk_root;
5178         struct btrfs_key key;
5179         struct btrfs_device *device;
5180         struct btrfs_chunk *chunk;
5181         struct btrfs_stripe *stripe;
5182         struct extent_map *em;
5183         struct map_lookup *map;
5184         size_t item_size;
5185         u64 dev_offset;
5186         u64 stripe_size;
5187         int i = 0;
5188         int ret = 0;
5189
5190         em = btrfs_get_chunk_map(fs_info, chunk_offset, chunk_size);
5191         if (IS_ERR(em))
5192                 return PTR_ERR(em);
5193
5194         map = em->map_lookup;
5195         item_size = btrfs_chunk_item_size(map->num_stripes);
5196         stripe_size = em->orig_block_len;
5197
5198         chunk = kzalloc(item_size, GFP_NOFS);
5199         if (!chunk) {
5200                 ret = -ENOMEM;
5201                 goto out;
5202         }
5203
5204         /*
5205          * Take the device list mutex to prevent races with the final phase of
5206          * a device replace operation that replaces the device object associated
5207          * with the map's stripes, because the device object's id can change
5208          * at any time during that final phase of the device replace operation
5209          * (dev-replace.c:btrfs_dev_replace_finishing()).
5210          */
5211         mutex_lock(&fs_info->fs_devices->device_list_mutex);
5212         for (i = 0; i < map->num_stripes; i++) {
5213                 device = map->stripes[i].dev;
5214                 dev_offset = map->stripes[i].physical;
5215
5216                 ret = btrfs_update_device(trans, device);
5217                 if (ret)
5218                         break;
5219                 ret = btrfs_alloc_dev_extent(trans, device, chunk_offset,
5220                                              dev_offset, stripe_size);
5221                 if (ret)
5222                         break;
5223         }
5224         if (ret) {
5225                 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
5226                 goto out;
5227         }
5228
5229         stripe = &chunk->stripe;
5230         for (i = 0; i < map->num_stripes; i++) {
5231                 device = map->stripes[i].dev;
5232                 dev_offset = map->stripes[i].physical;
5233
5234                 btrfs_set_stack_stripe_devid(stripe, device->devid);
5235                 btrfs_set_stack_stripe_offset(stripe, dev_offset);
5236                 memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE);
5237                 stripe++;
5238         }
5239         mutex_unlock(&fs_info->fs_devices->device_list_mutex);
5240
5241         btrfs_set_stack_chunk_length(chunk, chunk_size);
5242         btrfs_set_stack_chunk_owner(chunk, extent_root->root_key.objectid);
5243         btrfs_set_stack_chunk_stripe_len(chunk, map->stripe_len);
5244         btrfs_set_stack_chunk_type(chunk, map->type);
5245         btrfs_set_stack_chunk_num_stripes(chunk, map->num_stripes);
5246         btrfs_set_stack_chunk_io_align(chunk, map->stripe_len);
5247         btrfs_set_stack_chunk_io_width(chunk, map->stripe_len);
5248         btrfs_set_stack_chunk_sector_size(chunk, fs_info->sectorsize);
5249         btrfs_set_stack_chunk_sub_stripes(chunk, map->sub_stripes);
5250
5251         key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
5252         key.type = BTRFS_CHUNK_ITEM_KEY;
5253         key.offset = chunk_offset;
5254
5255         ret = btrfs_insert_item(trans, chunk_root, &key, chunk, item_size);
5256         if (ret == 0 && map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
5257                 /*
5258                  * TODO: Cleanup of inserted chunk root in case of
5259                  * failure.
5260                  */
5261                 ret = btrfs_add_system_chunk(fs_info, &key, chunk, item_size);
5262         }
5263
5264 out:
5265         kfree(chunk);
5266         free_extent_map(em);
5267         return ret;
5268 }
5269
5270 static noinline int init_first_rw_device(struct btrfs_trans_handle *trans)
5271 {
5272         struct btrfs_fs_info *fs_info = trans->fs_info;
5273         u64 alloc_profile;
5274         int ret;
5275
5276         alloc_profile = btrfs_metadata_alloc_profile(fs_info);
5277         ret = btrfs_alloc_chunk(trans, alloc_profile);
5278         if (ret)
5279                 return ret;
5280
5281         alloc_profile = btrfs_system_alloc_profile(fs_info);
5282         ret = btrfs_alloc_chunk(trans, alloc_profile);
5283         return ret;
5284 }
5285
5286 static inline int btrfs_chunk_max_errors(struct map_lookup *map)
5287 {
5288         const int index = btrfs_bg_flags_to_raid_index(map->type);
5289
5290         return btrfs_raid_array[index].tolerated_failures;
5291 }
5292
5293 int btrfs_chunk_readonly(struct btrfs_fs_info *fs_info, u64 chunk_offset)
5294 {
5295         struct extent_map *em;
5296         struct map_lookup *map;
5297         int readonly = 0;
5298         int miss_ndevs = 0;
5299         int i;
5300
5301         em = btrfs_get_chunk_map(fs_info, chunk_offset, 1);
5302         if (IS_ERR(em))
5303                 return 1;
5304
5305         map = em->map_lookup;
5306         for (i = 0; i < map->num_stripes; i++) {
5307                 if (test_bit(BTRFS_DEV_STATE_MISSING,
5308                                         &map->stripes[i].dev->dev_state)) {
5309                         miss_ndevs++;
5310                         continue;
5311                 }
5312                 if (!test_bit(BTRFS_DEV_STATE_WRITEABLE,
5313                                         &map->stripes[i].dev->dev_state)) {
5314                         readonly = 1;
5315                         goto end;
5316                 }
5317         }
5318
5319         /*
5320          * If the number of missing devices is larger than max errors, we
5321          * cannot write data into that chunk successfully, so mark it
5322          * read-only.
5323          */
5324         if (miss_ndevs > btrfs_chunk_max_errors(map))
5325                 readonly = 1;
5326 end:
5327         free_extent_map(em);
5328         return readonly;
5329 }
5330
5331 void btrfs_mapping_tree_free(struct extent_map_tree *tree)
5332 {
5333         struct extent_map *em;
5334
5335         while (1) {
5336                 write_lock(&tree->lock);
5337                 em = lookup_extent_mapping(tree, 0, (u64)-1);
5338                 if (em)
5339                         remove_extent_mapping(tree, em);
5340                 write_unlock(&tree->lock);
5341                 if (!em)
5342                         break;
5343                 /* once for us */
5344                 free_extent_map(em);
5345                 /* once for the tree */
5346                 free_extent_map(em);
5347         }
5348 }
5349
5350 int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
5351 {
5352         struct extent_map *em;
5353         struct map_lookup *map;
5354         int ret;
5355
5356         em = btrfs_get_chunk_map(fs_info, logical, len);
5357         if (IS_ERR(em))
5358                 /*
5359                  * We could return errors for these cases, but that could get
5360                  * ugly and we'd probably do the same thing anyway, i.e. do
5361                  * nothing else and exit. So return 1 so the callers don't
5362                  * try to use other copies.
5363                  */
5364                 return 1;
5365
5366         map = em->map_lookup;
5367         if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1_MASK))
5368                 ret = map->num_stripes;
5369         else if (map->type & BTRFS_BLOCK_GROUP_RAID10)
5370                 ret = map->sub_stripes;
5371         else if (map->type & BTRFS_BLOCK_GROUP_RAID5)
5372                 ret = 2;
5373         else if (map->type & BTRFS_BLOCK_GROUP_RAID6)
5374                 /*
5375                  * There could be two corrupted data stripes, we need
5376                  * to loop retry in order to rebuild the correct data.
5377                  *
5378                  * Fail a stripe at a time on every retry except the
5379                  * stripe under reconstruction.
5380                  */
5381                 ret = map->num_stripes;
5382         else
5383                 ret = 1;
5384         free_extent_map(em);
5385
5386         down_read(&fs_info->dev_replace.rwsem);
5387         if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace) &&
5388             fs_info->dev_replace.tgtdev)
5389                 ret++;
5390         up_read(&fs_info->dev_replace.rwsem);
5391
5392         return ret;
5393 }
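
/*
 * Editor's summary (derived from btrfs_num_copies() above): readable
 * copies per profile are roughly
 *
 *   single/raid0: 1        dup/raid1/raid1c3/raid1c4: num_stripes
 *   raid10: sub_stripes    raid5: 2        raid6: num_stripes (retries)
 *
 * plus one extra when a device replace with a target device is ongoing.
 */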
5394
5395 unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info,
5396                                     u64 logical)
5397 {
5398         struct extent_map *em;
5399         struct map_lookup *map;
5400         unsigned long len = fs_info->sectorsize;
5401
5402         em = btrfs_get_chunk_map(fs_info, logical, len);
5403
5404         if (!WARN_ON(IS_ERR(em))) {
5405                 map = em->map_lookup;
5406                 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
5407                         len = map->stripe_len * nr_data_stripes(map);
5408                 free_extent_map(em);
5409         }
5410         return len;
5411 }
5412
5413 int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
5414 {
5415         struct extent_map *em;
5416         struct map_lookup *map;
5417         int ret = 0;
5418
5419         em = btrfs_get_chunk_map(fs_info, logical, len);
5420
5421         if (!WARN_ON(IS_ERR(em))) {
5422                 map = em->map_lookup;
5423                 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
5424                         ret = 1;
5425                 free_extent_map(em);
5426         }
5427         return ret;
5428 }
5429
5430 static int find_live_mirror(struct btrfs_fs_info *fs_info,
5431                             struct map_lookup *map, int first,
5432                             int dev_replace_is_ongoing)
5433 {
5434         int i;
5435         int num_stripes;
5436         int preferred_mirror;
5437         int tolerance;
5438         struct btrfs_device *srcdev;
5439
5440         ASSERT((map->type &
5441                  (BTRFS_BLOCK_GROUP_RAID1_MASK | BTRFS_BLOCK_GROUP_RAID10)));
5442
5443         if (map->type & BTRFS_BLOCK_GROUP_RAID10)
5444                 num_stripes = map->sub_stripes;
5445         else
5446                 num_stripes = map->num_stripes;
5447
5448         preferred_mirror = first + current->pid % num_stripes;
5449
5450         if (dev_replace_is_ongoing &&
5451             fs_info->dev_replace.cont_reading_from_srcdev_mode ==
5452              BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID)
5453                 srcdev = fs_info->dev_replace.srcdev;
5454         else
5455                 srcdev = NULL;
5456
5457         /*
5458          * Try to avoid the drive that is the source drive for a
5459          * dev-replace procedure; only choose it if no other non-missing
5460          * mirror is available.
5461          */
5462         for (tolerance = 0; tolerance < 2; tolerance++) {
5463                 if (map->stripes[preferred_mirror].dev->bdev &&
5464                     (tolerance || map->stripes[preferred_mirror].dev != srcdev))
5465                         return preferred_mirror;
5466                 for (i = first; i < first + num_stripes; i++) {
5467                         if (map->stripes[i].dev->bdev &&
5468                             (tolerance || map->stripes[i].dev != srcdev))
5469                                 return i;
5470                 }
5471         }
5472
5473         /* We couldn't find one that doesn't fail. Just return something
5474          * and the I/O error handling code will clean up eventually.
5475          */
5476         return preferred_mirror;
5477 }
5478
5479 /* Bubble-sort the stripe set to put the parity/syndrome stripes last */
5480 static void sort_parity_stripes(struct btrfs_bio *bbio, int num_stripes)
5481 {
5482         int i;
5483         int again = 1;
5484
5485         while (again) {
5486                 again = 0;
5487                 for (i = 0; i < num_stripes - 1; i++) {
5488                         /* Swap if parity is on a smaller index */
5489                         if (bbio->raid_map[i] > bbio->raid_map[i + 1]) {
5490                                 swap(bbio->stripes[i], bbio->stripes[i + 1]);
5491                                 swap(bbio->raid_map[i], bbio->raid_map[i + 1]);
5492                                 again = 1;
5493                         }
5494                 }
5495         }
5496 }
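
/*
 * Editor's example (not in the original source): raid_map stores logical
 * stripe addresses for data and the markers BTRFS_RAID5_P_STRIPE /
 * BTRFS_RAID6_Q_STRIPE (huge u64 values) for parity, so the ascending
 * bubble sort above moves parity to the end, keeping stripes[] in
 * lockstep:
 *
 *   before: raid_map = { P, 0, 64K }
 *   after:  raid_map = { 0, 64K, P }
 */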
5497
5498 static struct btrfs_bio *alloc_btrfs_bio(int total_stripes, int real_stripes)
5499 {
5500         struct btrfs_bio *bbio = kzalloc(
5501                  /* the size of the btrfs_bio */
5502                 sizeof(struct btrfs_bio) +
5503                 /* plus the variable array for the stripes */
5504                 sizeof(struct btrfs_bio_stripe) * (total_stripes) +
5505                 /* plus the variable array for the tgt dev */
5506                 sizeof(int) * (real_stripes) +
5507                 /*
5508                  * plus the raid_map, which includes both the tgt dev
5509                  * and the stripes
5510                  */
5511                 sizeof(u64) * (total_stripes),
5512                 GFP_NOFS|__GFP_NOFAIL);
5513
5514         atomic_set(&bbio->error, 0);
5515         refcount_set(&bbio->refs, 1);
5516
5517         return bbio;
5518 }
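
/*
 * Editor's sketch (not in the original source): the single kzalloc()
 * above packs four regions back to back:
 *
 *   | struct btrfs_bio | stripes[total] | tgtdev_map[real] | raid_map[total] |
 *
 * Callers locate tgtdev_map and raid_map by pointer arithmetic past the
 * flexible stripes array, so the sizes here must match those accesses.
 */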
5519
5520 void btrfs_get_bbio(struct btrfs_bio *bbio)
5521 {
5522         WARN_ON(!refcount_read(&bbio->refs));
5523         refcount_inc(&bbio->refs);
5524 }
5525
5526 void btrfs_put_bbio(struct btrfs_bio *bbio)
5527 {
5528         if (!bbio)
5529                 return;
5530         if (refcount_dec_and_test(&bbio->refs))
5531                 kfree(bbio);
5532 }
5533
5534 /*
5535  * Can REQ_OP_DISCARD be sent with other REQ ops like REQ_OP_WRITE?
5536  * Note that discard won't be sent to the target device of a device
5537  * replace operation.
5538  */
5539 static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info,
5540                                          u64 logical, u64 *length_ret,
5541                                          struct btrfs_bio **bbio_ret)
5542 {
5543         struct extent_map *em;
5544         struct map_lookup *map;
5545         struct btrfs_bio *bbio;
5546         u64 length = *length_ret;
5547         u64 offset;
5548         u64 stripe_nr;
5549         u64 stripe_nr_end;
5550         u64 stripe_end_offset;
5551         u64 stripe_cnt;
5552         u64 stripe_len;
5553         u64 stripe_offset;
5554         u64 num_stripes;
5555         u32 stripe_index;
5556         u32 factor = 0;
5557         u32 sub_stripes = 0;
5558         u64 stripes_per_dev = 0;
5559         u32 remaining_stripes = 0;
5560         u32 last_stripe = 0;
5561         int ret = 0;
5562         int i;
5563
5564         /* discard always returns a bbio */
5565         ASSERT(bbio_ret);
5566
5567         em = btrfs_get_chunk_map(fs_info, logical, length);
5568         if (IS_ERR(em))
5569                 return PTR_ERR(em);
5570
5571         map = em->map_lookup;
5572         /* we don't discard raid56 yet */
5573         if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
5574                 ret = -EOPNOTSUPP;
5575                 goto out;
5576         }
5577
5578         offset = logical - em->start;
5579         length = min_t(u64, em->start + em->len - logical, length);
5580         *length_ret = length;
5581
5582         stripe_len = map->stripe_len;
5583         /*
5584          * stripe_nr counts the total number of stripes we have to stride
5585          * to get to this block
5586          */
5587         stripe_nr = div64_u64(offset, stripe_len);
5588
5589         /* stripe_offset is the offset of this block in its stripe */
5590         stripe_offset = offset - stripe_nr * stripe_len;
5591
5592         stripe_nr_end = round_up(offset + length, map->stripe_len);
5593         stripe_nr_end = div64_u64(stripe_nr_end, map->stripe_len);
5594         stripe_cnt = stripe_nr_end - stripe_nr;
5595         stripe_end_offset = stripe_nr_end * map->stripe_len -
5596                             (offset + length);
5597         /*
5598          * After this calculation, stripe_nr is the number of stripes on
5599          * this device we have to walk to find the data, and stripe_index
5600          * is the index of our device in the stripe array.
5601          */
5602         num_stripes = 1;
5603         stripe_index = 0;
5604         if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
5605                          BTRFS_BLOCK_GROUP_RAID10)) {
5606                 if (map->type & BTRFS_BLOCK_GROUP_RAID0)
5607                         sub_stripes = 1;
5608                 else
5609                         sub_stripes = map->sub_stripes;
5610
5611                 factor = map->num_stripes / sub_stripes;
5612                 num_stripes = min_t(u64, map->num_stripes,
5613                                     sub_stripes * stripe_cnt);
5614                 stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index);
5615                 stripe_index *= sub_stripes;
5616                 stripes_per_dev = div_u64_rem(stripe_cnt, factor,
5617                                               &remaining_stripes);
5618                 div_u64_rem(stripe_nr_end - 1, factor, &last_stripe);
5619                 last_stripe *= sub_stripes;
5620         } else if (map->type & (BTRFS_BLOCK_GROUP_RAID1_MASK |
5621                                 BTRFS_BLOCK_GROUP_DUP)) {
5622                 num_stripes = map->num_stripes;
5623         } else {
5624                 stripe_nr = div_u64_rem(stripe_nr, map->num_stripes,
5625                                         &stripe_index);
5626         }
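        /*
         * Example, assuming a hypothetical RAID10 chunk with num_stripes = 4
         * and sub_stripes = 2 (factor = 2), and the values from the sketch
         * above (stripe_nr = 1, stripe_cnt = 4, stripe_nr_end = 5):
         *   num_stripes     = min(4, 2 * 4)       = 4
         *   stripe_nr       = 1 / 2               = 0
         *   stripe_index    = (1 % 2) * 2         = 2
         *   stripes_per_dev = 4 / 2               = 2, remaining_stripes = 0
         *   last_stripe     = ((5 - 1) % 2) * 2   = 0
         * so each device discards two full stripes, adjusted below for the
         * partial first and last stripe.
         */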
5627
5628         bbio = alloc_btrfs_bio(num_stripes, 0);
5629         if (!bbio) {
5630                 ret = -ENOMEM;
5631                 goto out;
5632         }
5633
5634         for (i = 0; i < num_stripes; i++) {
5635                 bbio->stripes[i].physical =
5636                         map->stripes[stripe_index].physical +
5637                         stripe_offset + stripe_nr * map->stripe_len;
5638                 bbio->stripes[i].dev = map->stripes[stripe_index].dev;
5639
5640                 if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
5641                                  BTRFS_BLOCK_GROUP_RAID10)) {
5642                         bbio->stripes[i].length = stripes_per_dev *
5643                                 map->stripe_len;
5644
5645                         if (i / sub_stripes < remaining_stripes)
5646                                 bbio->stripes[i].length +=
5647                                         map->stripe_len;
5648
5649                         /*
5650                          * Special for the first stripe and
5651                          * the last stripe:
5652                          *
5653                          * |-------|...|-------|
5654                          *     |----------|
5655                          *    off     end_off
5656                          */
5657                         if (i < sub_stripes)
5658                                 bbio->stripes[i].length -=
5659                                         stripe_offset;
5660
5661                         if (stripe_index >= last_stripe &&
5662                             stripe_index <= (last_stripe +
5663                                              sub_stripes - 1))
5664                                 bbio->stripes[i].length -=
5665                                         stripe_end_offset;
5666
5667                         if (i == sub_stripes - 1)
5668                                 stripe_offset = 0;
5669                 } else {
5670                         bbio->stripes[i].length = length;
5671                 }
5672
5673                 stripe_index++;
5674                 if (stripe_index == map->num_stripes) {
5675                         stripe_index = 0;
5676                         stripe_nr++;
5677                 }
5678         }
5679
5680         *bbio_ret = bbio;
5681         bbio->map_type = map->type;
5682         bbio->num_stripes = num_stripes;
5683 out:
5684         free_extent_map(em);
5685         return ret;
5686 }
5687
5688 /*
5689  * In the dev-replace case, for the repair case (the only case where the
5690  * mirror is selected explicitly when calling btrfs_map_block), blocks left
5691  * of the left cursor can also be read from the target drive.
5692  *
5693  * For BTRFS_MAP_GET_READ_MIRRORS, the target drive is added as the last one
5694  * to the array of stripes.
5695  * For READ, it also needs to be supported using the same mirror number.
5696  *
5697  * If the requested block is not left of the left cursor, EIO is returned.
5698  * This can happen because btrfs_num_copies() returns one more in the
5699  * dev-replace case.
5700  */
5701 static int get_extra_mirror_from_replace(struct btrfs_fs_info *fs_info,
5702                                          u64 logical, u64 length,
5703                                          u64 srcdev_devid, int *mirror_num,
5704                                          u64 *physical)
5705 {
5706         struct btrfs_bio *bbio = NULL;
5707         int num_stripes;
5708         int index_srcdev = 0;
5709         int found = 0;
5710         u64 physical_of_found = 0;
5711         int i;
5712         int ret = 0;
5713
5714         ret = __btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS,
5715                                 logical, &length, &bbio, 0, 0);
5716         if (ret) {
5717                 ASSERT(bbio == NULL);
5718                 return ret;
5719         }
5720
5721         num_stripes = bbio->num_stripes;
5722         if (*mirror_num > num_stripes) {
5723                 /*
5724                  * BTRFS_MAP_GET_READ_MIRRORS does not contain this mirror,
5725                  * that means that the requested area is not left of the left
5726                  * cursor
5727                  */
5728                 btrfs_put_bbio(bbio);
5729                 return -EIO;
5730         }
5731
5732         /*
5733          * Process the rest of the function using the mirror_num of the
5734          * source drive, so look it up first.  At the end, patch the device
5735          * pointer to point at the target drive.
5736          */
5737         for (i = 0; i < num_stripes; i++) {
5738                 if (bbio->stripes[i].dev->devid != srcdev_devid)
5739                         continue;
5740
5741                 /*
5742                  * In case of DUP, in order to keep it simple, only add the
5743                  * mirror with the lowest physical address
5744                  */
5745                 if (found &&
5746                     physical_of_found <= bbio->stripes[i].physical)
5747                         continue;
5748
5749                 index_srcdev = i;
5750                 found = 1;
5751                 physical_of_found = bbio->stripes[i].physical;
5752         }
5753
5754         btrfs_put_bbio(bbio);
5755
5756         ASSERT(found);
5757         if (!found)
5758                 return -EIO;
5759
5760         *mirror_num = index_srcdev + 1;
5761         *physical = physical_of_found;
5762         return ret;
5763 }
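/*
 * Illustration (hypothetical values): with a replace of devid 1 ongoing, a
 * RAID1 BTRFS_MAP_GET_READ_MIRRORS mapping left of the cursor returns three
 * stripes { devid 1 (srcdev) at 1M, devid 2 at 5M, tgtdev at 1M }.  For a
 * repair read with *mirror_num == 3 the loop above finds srcdev at index 0,
 * so *mirror_num becomes 1 and *physical becomes 1M; __btrfs_map_block()
 * then patches stripe 0 to the target device at that same physical offset.
 */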
5764
5765 static void handle_ops_on_dev_replace(enum btrfs_map_op op,
5766                                       struct btrfs_bio **bbio_ret,
5767                                       struct btrfs_dev_replace *dev_replace,
5768                                       int *num_stripes_ret, int *max_errors_ret)
5769 {
5770         struct btrfs_bio *bbio = *bbio_ret;
5771         u64 srcdev_devid = dev_replace->srcdev->devid;
5772         int tgtdev_indexes = 0;
5773         int num_stripes = *num_stripes_ret;
5774         int max_errors = *max_errors_ret;
5775         int i;
5776
5777         if (op == BTRFS_MAP_WRITE) {
5778                 int index_where_to_add;
5779
5780                 /*
5781                  * duplicate the write operations while the dev replace
5782                  * procedure is running. Since the copying of the old disk to
5783                  * the new disk takes place at run time while the filesystem is
5784                  * mounted writable, the regular write operations to the old
5785                  * disk have to be duplicated to go to the new disk as well.
5786                  *
5787                  * Note that device->missing is handled by the caller, and that
5788                  * the write to the old disk is already set up in the stripes
5789                  * array.
5790                  */
5791                 index_where_to_add = num_stripes;
5792                 for (i = 0; i < num_stripes; i++) {
5793                         if (bbio->stripes[i].dev->devid == srcdev_devid) {
5794                                 /* write to new disk, too */
5795                                 struct btrfs_bio_stripe *new =
5796                                         bbio->stripes + index_where_to_add;
5797                                 struct btrfs_bio_stripe *old =
5798                                         bbio->stripes + i;
5799
5800                                 new->physical = old->physical;
5801                                 new->length = old->length;
5802                                 new->dev = dev_replace->tgtdev;
5803                                 bbio->tgtdev_map[i] = index_where_to_add;
5804                                 index_where_to_add++;
5805                                 max_errors++;
5806                                 tgtdev_indexes++;
5807                         }
5808                 }
5809                 num_stripes = index_where_to_add;
5810         } else if (op == BTRFS_MAP_GET_READ_MIRRORS) {
5811                 int index_srcdev = 0;
5812                 int found = 0;
5813                 u64 physical_of_found = 0;
5814
5815                 /*
5816                  * During the dev-replace procedure, the target drive can also
5817                  * be used to read data in case it is needed to repair a corrupt
5818                  * block elsewhere. This is possible if the requested area is
5819                  * left of the left cursor. In this area, the target drive is a
5820                  * full copy of the source drive.
5821                  */
5822                 for (i = 0; i < num_stripes; i++) {
5823                         if (bbio->stripes[i].dev->devid == srcdev_devid) {
5824                                 /*
5825                                  * In case of DUP, in order to keep it simple,
5826                                  * only add the mirror with the lowest physical
5827                                  * address
5828                                  */
5829                                 if (found &&
5830                                     physical_of_found <=
5831                                      bbio->stripes[i].physical)
5832                                         continue;
5833                                 index_srcdev = i;
5834                                 found = 1;
5835                                 physical_of_found = bbio->stripes[i].physical;
5836                         }
5837                 }
5838                 if (found) {
5839                         struct btrfs_bio_stripe *tgtdev_stripe =
5840                                 bbio->stripes + num_stripes;
5841
5842                         tgtdev_stripe->physical = physical_of_found;
5843                         tgtdev_stripe->length =
5844                                 bbio->stripes[index_srcdev].length;
5845                         tgtdev_stripe->dev = dev_replace->tgtdev;
5846                         bbio->tgtdev_map[index_srcdev] = num_stripes;
5847
5848                         tgtdev_indexes++;
5849                         num_stripes++;
5850                 }
5851         }
5852
5853         *num_stripes_ret = num_stripes;
5854         *max_errors_ret = max_errors;
5855         bbio->num_tgtdevs = tgtdev_indexes;
5856         *bbio_ret = bbio;
5857 }
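/*
 * Sketch of the WRITE duplication above, with hypothetical values: a RAID1
 * write mapped to stripes { devid 1 (srcdev), devid 2 } gets a third stripe
 * appended that points at dev_replace->tgtdev with the same physical and
 * length as stripe 0, tgtdev_map[0] = 2, num_stripes goes 2 -> 3 and
 * max_errors is bumped by one so a failure on the copy target alone does
 * not fail the write.
 */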
5858
5859 static bool need_full_stripe(enum btrfs_map_op op)
5860 {
5861         return (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS);
5862 }
5863
5864 /*
5865  * btrfs_get_io_geometry - calculates the geometry of a particular (address, len)
5866  *                     tuple. This information is used to calculate how big a
5867  *                     particular bio can get before it straddles a stripe.
5868  *
5869  * @fs_info - the filesystem
5870  * @logical - address that we want to figure out the geometry of
5871  * @len     - the length of IO we are going to perform, starting at @logical
5872  * @op      - type of operation - write or read
5873  * @io_geom - pointer used to return values
5874  *
5875  * Returns < 0 in case a chunk for the given logical address cannot be found
5876  * (usually shouldn't happen unless @logical is corrupted), 0 otherwise.
5877  */
5878 int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
5879                         u64 logical, u64 len, struct btrfs_io_geometry *io_geom)
5880 {
5881         struct extent_map *em;
5882         struct map_lookup *map;
5883         u64 offset;
5884         u64 stripe_offset;
5885         u64 stripe_nr;
5886         u64 stripe_len;
5887         u64 raid56_full_stripe_start = (u64)-1;
5888         int data_stripes;
5889         int ret = 0;
5890
5891         ASSERT(op != BTRFS_MAP_DISCARD);
5892
5893         em = btrfs_get_chunk_map(fs_info, logical, len);
5894         if (IS_ERR(em))
5895                 return PTR_ERR(em);
5896
5897         map = em->map_lookup;
5898         /* Offset of this logical address in the chunk */
5899         offset = logical - em->start;
5900         /* Len of a stripe in a chunk */
5901         stripe_len = map->stripe_len;
5902         /* Stripe this block falls in */
5903         stripe_nr = div64_u64(offset, stripe_len);
5904         /* Offset of stripe in the chunk */
5905         stripe_offset = stripe_nr * stripe_len;
5906         if (offset < stripe_offset) {
5907                 btrfs_crit(fs_info,
5908 "stripe math has gone wrong, stripe_offset=%llu offset=%llu start=%llu logical=%llu stripe_len=%llu",
5909                         stripe_offset, offset, em->start, logical, stripe_len);
5910                 ret = -EINVAL;
5911                 goto out;
5912         }
5913
5914         /* stripe_offset is the offset of this block in its stripe */
5915         stripe_offset = offset - stripe_offset;
5916         data_stripes = nr_data_stripes(map);
5917
5918         if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
5919                 u64 max_len = stripe_len - stripe_offset;
5920
5921                 /*
5922                  * In case of raid56, we need to know the stripe aligned start
5923                  */
5924                 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
5925                         unsigned long full_stripe_len = stripe_len * data_stripes;
5926                         raid56_full_stripe_start = offset;
5927
5928                         /*
5929                          * Allow a write of a full stripe, but make sure we
5930                          * don't allow straddling of stripes
5931                          */
5932                         raid56_full_stripe_start = div64_u64(raid56_full_stripe_start,
5933                                         full_stripe_len);
5934                         raid56_full_stripe_start *= full_stripe_len;
5935
5936                         /*
5937                          * For writes to RAID[56], allow a full stripeset across
5938                          * all disks. For other RAID types and for RAID[56]
5939                          * reads, just allow a single stripe (on a single disk).
5940                          */
5941                         if (op == BTRFS_MAP_WRITE) {
5942                                 max_len = stripe_len * data_stripes -
5943                                           (offset - raid56_full_stripe_start);
5944                         }
5945                 }
5946                 len = min_t(u64, em->len - offset, max_len);
5947         } else {
5948                 len = em->len - offset;
5949         }
5950
5951         io_geom->len = len;
5952         io_geom->offset = offset;
5953         io_geom->stripe_len = stripe_len;
5954         io_geom->stripe_nr = stripe_nr;
5955         io_geom->stripe_offset = stripe_offset;
5956         io_geom->raid56_stripe_offset = raid56_full_stripe_start;
5957
5958 out:
5959         /* once for us */
5960         free_extent_map(em);
5961         return ret;
5962 }
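/*
 * Worked example with hypothetical values: for a non-RAID56 profile with
 * stripe_len = 64K and offset = 130K into the chunk, stripe_nr = 2,
 * stripe_offset = 2K and len is capped at 64K - 2K = 62K so the bio never
 * straddles a stripe boundary.  For a RAID5 write with two data stripes the
 * cap is instead computed from the 128K full stripe:
 * raid56_full_stripe_start = 128K and max_len = 128K - (130K - 128K) = 126K.
 */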
5963
5964 static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
5965                              enum btrfs_map_op op,
5966                              u64 logical, u64 *length,
5967                              struct btrfs_bio **bbio_ret,
5968                              int mirror_num, int need_raid_map)
5969 {
5970         struct extent_map *em;
5971         struct map_lookup *map;
5972         u64 stripe_offset;
5973         u64 stripe_nr;
5974         u64 stripe_len;
5975         u32 stripe_index;
5976         int data_stripes;
5977         int i;
5978         int ret = 0;
5979         int num_stripes;
5980         int max_errors = 0;
5981         int tgtdev_indexes = 0;
5982         struct btrfs_bio *bbio = NULL;
5983         struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
5984         int dev_replace_is_ongoing = 0;
5985         int num_alloc_stripes;
5986         int patch_the_first_stripe_for_dev_replace = 0;
5987         u64 physical_to_patch_in_first_stripe = 0;
5988         u64 raid56_full_stripe_start = (u64)-1;
5989         struct btrfs_io_geometry geom;
5990
5991         ASSERT(bbio_ret);
5992         ASSERT(op != BTRFS_MAP_DISCARD);
5993
5994         ret = btrfs_get_io_geometry(fs_info, op, logical, *length, &geom);
5995         if (ret < 0)
5996                 return ret;
5997
5998         em = btrfs_get_chunk_map(fs_info, logical, *length);
5999         ASSERT(!IS_ERR(em));
6000         map = em->map_lookup;
6001
6002         *length = geom.len;
6003         stripe_len = geom.stripe_len;
6004         stripe_nr = geom.stripe_nr;
6005         stripe_offset = geom.stripe_offset;
6006         raid56_full_stripe_start = geom.raid56_stripe_offset;
6007         data_stripes = nr_data_stripes(map);
6008
6009         down_read(&dev_replace->rwsem);
6010         dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace);
6011         /*
6012          * Hold the semaphore for read during the whole operation, write is
6013          * requested at commit time but must wait.
6014          */
6015         if (!dev_replace_is_ongoing)
6016                 up_read(&dev_replace->rwsem);
6017
6018         if (dev_replace_is_ongoing && mirror_num == map->num_stripes + 1 &&
6019             !need_full_stripe(op) && dev_replace->tgtdev != NULL) {
6020                 ret = get_extra_mirror_from_replace(fs_info, logical, *length,
6021                                                     dev_replace->srcdev->devid,
6022                                                     &mirror_num,
6023                                             &physical_to_patch_in_first_stripe);
6024                 if (ret)
6025                         goto out;
6026                 else
6027                         patch_the_first_stripe_for_dev_replace = 1;
6028         } else if (mirror_num > map->num_stripes) {
6029                 mirror_num = 0;
6030         }
6031
6032         num_stripes = 1;
6033         stripe_index = 0;
6034         if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
6035                 stripe_nr = div_u64_rem(stripe_nr, map->num_stripes,
6036                                 &stripe_index);
6037                 if (!need_full_stripe(op))
6038                         mirror_num = 1;
6039         } else if (map->type & BTRFS_BLOCK_GROUP_RAID1_MASK) {
6040                 if (need_full_stripe(op))
6041                         num_stripes = map->num_stripes;
6042                 else if (mirror_num)
6043                         stripe_index = mirror_num - 1;
6044                 else {
6045                         stripe_index = find_live_mirror(fs_info, map, 0,
6046                                             dev_replace_is_ongoing);
6047                         mirror_num = stripe_index + 1;
6048                 }
6049
6050         } else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
6051                 if (need_full_stripe(op)) {
6052                         num_stripes = map->num_stripes;
6053                 } else if (mirror_num) {
6054                         stripe_index = mirror_num - 1;
6055                 } else {
6056                         mirror_num = 1;
6057                 }
6058
6059         } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
6060                 u32 factor = map->num_stripes / map->sub_stripes;
6061
6062                 stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index);
6063                 stripe_index *= map->sub_stripes;
6064
6065                 if (need_full_stripe(op))
6066                         num_stripes = map->sub_stripes;
6067                 else if (mirror_num)
6068                         stripe_index += mirror_num - 1;
6069                 else {
6070                         int old_stripe_index = stripe_index;
6071                         stripe_index = find_live_mirror(fs_info, map,
6072                                               stripe_index,
6073                                               dev_replace_is_ongoing);
6074                         mirror_num = stripe_index - old_stripe_index + 1;
6075                 }
6076
6077         } else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
6078                 if (need_raid_map && (need_full_stripe(op) || mirror_num > 1)) {
6079                         /* push stripe_nr back to the start of the full stripe */
6080                         stripe_nr = div64_u64(raid56_full_stripe_start,
6081                                         stripe_len * data_stripes);
6082
6083                         /* RAID[56] write or recovery. Return all stripes */
6084                         num_stripes = map->num_stripes;
6085                         max_errors = nr_parity_stripes(map);
6086
6087                         *length = map->stripe_len;
6088                         stripe_index = 0;
6089                         stripe_offset = 0;
6090                 } else {
6091                         /*
6092                          * Mirror #0 or #1 means the original data block.
6093                          * Mirror #2 is RAID5 parity block.
6094                          * Mirror #3 is RAID6 Q block.
6095                          */
6096                         stripe_nr = div_u64_rem(stripe_nr,
6097                                         data_stripes, &stripe_index);
6098                         if (mirror_num > 1)
6099                                 stripe_index = data_stripes + mirror_num - 2;
6100
6101                         /* We distribute the parity blocks across stripes */
6102                         div_u64_rem(stripe_nr + stripe_index, map->num_stripes,
6103                                         &stripe_index);
6104                         if (!need_full_stripe(op) && mirror_num <= 1)
6105                                 mirror_num = 1;
6106                 }
6107         } else {
6108                 /*
6109                  * after this, stripe_nr is the number of stripes on this
6110                  * device we have to walk to find the data, and stripe_index is
6111                  * the number of our device in the stripe array
6112                  */
6113                 stripe_nr = div_u64_rem(stripe_nr, map->num_stripes,
6114                                 &stripe_index);
6115                 mirror_num = stripe_index + 1;
6116         }
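        /*
         * At this point, as an illustration: on RAID1 a read with
         * mirror_num = 2 selected stripe_index = 1; on RAID5 with two data
         * stripes a repair read with mirror_num = 2 selected the rotated
         * parity stripe; and mirror_num == map->num_stripes + 1 was handled
         * earlier by the dev-replace branch.
         */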
6117         if (stripe_index >= map->num_stripes) {
6118                 btrfs_crit(fs_info,
6119                            "stripe index math went horribly wrong, got stripe_index=%u, num_stripes=%u",
6120                            stripe_index, map->num_stripes);
6121                 ret = -EINVAL;
6122                 goto out;
6123         }
6124
6125         num_alloc_stripes = num_stripes;
6126         if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL) {
6127                 if (op == BTRFS_MAP_WRITE)
6128                         num_alloc_stripes <<= 1;
6129                 if (op == BTRFS_MAP_GET_READ_MIRRORS)
6130                         num_alloc_stripes++;
6131                 tgtdev_indexes = num_stripes;
6132         }
6133
6134         bbio = alloc_btrfs_bio(num_alloc_stripes, tgtdev_indexes);
6135         if (!bbio) {
6136                 ret = -ENOMEM;
6137                 goto out;
6138         }
6139         if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL)
6140                 bbio->tgtdev_map = (int *)(bbio->stripes + num_alloc_stripes);
6141
6142         /* build raid_map */
6143         if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK && need_raid_map &&
6144             (need_full_stripe(op) || mirror_num > 1)) {
6145                 u64 tmp;
6146                 unsigned rot;
6147
6148                 bbio->raid_map = (u64 *)((void *)bbio->stripes +
6149                                  sizeof(struct btrfs_bio_stripe) *
6150                                  num_alloc_stripes +
6151                                  sizeof(int) * tgtdev_indexes);
6152
6153                 /* Work out the disk rotation on this stripe-set */
6154                 div_u64_rem(stripe_nr, num_stripes, &rot);
6155
6156                 /* Fill in the logical address of each stripe */
6157                 tmp = stripe_nr * data_stripes;
6158                 for (i = 0; i < data_stripes; i++)
6159                         bbio->raid_map[(i+rot) % num_stripes] =
6160                                 em->start + (tmp + i) * map->stripe_len;
6161
6162                 bbio->raid_map[(i+rot) % num_stripes] = RAID5_P_STRIPE;
6163                 if (map->type & BTRFS_BLOCK_GROUP_RAID6)
6164                         bbio->raid_map[(i+rot+1) % num_stripes] =
6165                                 RAID6_Q_STRIPE;
6166         }
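        /*
         * Rotation example with hypothetical values: RAID5, num_stripes = 3,
         * data_stripes = 2, stripe_nr = 1 (in full-stripe units) gives
         * rot = 1, so raid_map[1] and raid_map[2] get the two data stripe
         * logical addresses and raid_map[0] = RAID5_P_STRIPE: the parity
         * slot moves by one for each successive full stripe.
         */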
6167
6168
6169         for (i = 0; i < num_stripes; i++) {
6170                 bbio->stripes[i].physical =
6171                         map->stripes[stripe_index].physical +
6172                         stripe_offset +
6173                         stripe_nr * map->stripe_len;
6174                 bbio->stripes[i].dev =
6175                         map->stripes[stripe_index].dev;
6176                 stripe_index++;
6177         }
6178
6179         if (need_full_stripe(op))
6180                 max_errors = btrfs_chunk_max_errors(map);
6181
6182         if (bbio->raid_map)
6183                 sort_parity_stripes(bbio, num_stripes);
6184
6185         if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL &&
6186             need_full_stripe(op)) {
6187                 handle_ops_on_dev_replace(op, &bbio, dev_replace, &num_stripes,
6188                                           &max_errors);
6189         }
6190
6191         *bbio_ret = bbio;
6192         bbio->map_type = map->type;
6193         bbio->num_stripes = num_stripes;
6194         bbio->max_errors = max_errors;
6195         bbio->mirror_num = mirror_num;
6196
6197         /*
6198          * This is the case where a read && dev_replace_is_ongoing &&
6199          * mirror_num == num_stripes + 1 && the dev-replace target drive is
6200          * available as a mirror.
6201          */
6202         if (patch_the_first_stripe_for_dev_replace && num_stripes > 0) {
6203                 WARN_ON(num_stripes > 1);
6204                 bbio->stripes[0].dev = dev_replace->tgtdev;
6205                 bbio->stripes[0].physical = physical_to_patch_in_first_stripe;
6206                 bbio->mirror_num = map->num_stripes + 1;
6207         }
6208 out:
6209         if (dev_replace_is_ongoing) {
6210                 lockdep_assert_held(&dev_replace->rwsem);
6211                 /* Unlock and let waiting writers proceed */
6212                 up_read(&dev_replace->rwsem);
6213         }
6214         free_extent_map(em);
6215         return ret;
6216 }
6217
6218 int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
6219                       u64 logical, u64 *length,
6220                       struct btrfs_bio **bbio_ret, int mirror_num)
6221 {
6222         if (op == BTRFS_MAP_DISCARD)
6223                 return __btrfs_map_block_for_discard(fs_info, logical,
6224                                                      length, bbio_ret);
6225
6226         return __btrfs_map_block(fs_info, op, logical, length, bbio_ret,
6227                                  mirror_num, 0);
6228 }
6229
6230 /* For Scrub/replace */
6231 int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
6232                      u64 logical, u64 *length,
6233                      struct btrfs_bio **bbio_ret)
6234 {
6235         return __btrfs_map_block(fs_info, op, logical, length, bbio_ret, 0, 1);
6236 }
6237
6238 static inline void btrfs_end_bbio(struct btrfs_bio *bbio, struct bio *bio)
6239 {
6240         bio->bi_private = bbio->private;
6241         bio->bi_end_io = bbio->end_io;
6242         bio_endio(bio);
6243
6244         btrfs_put_bbio(bbio);
6245 }
6246
6247 static void btrfs_end_bio(struct bio *bio)
6248 {
6249         struct btrfs_bio *bbio = bio->bi_private;
6250         int is_orig_bio = 0;
6251
6252         if (bio->bi_status) {
6253                 atomic_inc(&bbio->error);
6254                 if (bio->bi_status == BLK_STS_IOERR ||
6255                     bio->bi_status == BLK_STS_TARGET) {
6256                         unsigned int stripe_index =
6257                                 btrfs_io_bio(bio)->stripe_index;
6258                         struct btrfs_device *dev;
6259
6260                         BUG_ON(stripe_index >= bbio->num_stripes);
6261                         dev = bbio->stripes[stripe_index].dev;
6262                         if (dev->bdev) {
6263                                 if (bio_op(bio) == REQ_OP_WRITE)
6264                                         btrfs_dev_stat_inc_and_print(dev,
6265                                                 BTRFS_DEV_STAT_WRITE_ERRS);
6266                                 else if (!(bio->bi_opf & REQ_RAHEAD))
6267                                         btrfs_dev_stat_inc_and_print(dev,
6268                                                 BTRFS_DEV_STAT_READ_ERRS);
6269                                 if (bio->bi_opf & REQ_PREFLUSH)
6270                                         btrfs_dev_stat_inc_and_print(dev,
6271                                                 BTRFS_DEV_STAT_FLUSH_ERRS);
6272                         }
6273                 }
6274         }
6275
6276         if (bio == bbio->orig_bio)
6277                 is_orig_bio = 1;
6278
6279         btrfs_bio_counter_dec(bbio->fs_info);
6280
6281         if (atomic_dec_and_test(&bbio->stripes_pending)) {
6282                 if (!is_orig_bio) {
6283                         bio_put(bio);
6284                         bio = bbio->orig_bio;
6285                 }
6286
6287                 btrfs_io_bio(bio)->mirror_num = bbio->mirror_num;
6288                 /* only send an error to the higher layers if it is
6289                  * beyond the tolerance of the btrfs bio
6290                  */
6291                 if (atomic_read(&bbio->error) > bbio->max_errors) {
6292                         bio->bi_status = BLK_STS_IOERR;
6293                 } else {
6294                         /*
6295                          * this bio is actually up to date, we didn't
6296                          * go over the max number of errors
6297                          */
6298                         bio->bi_status = BLK_STS_OK;
6299                 }
6300
6301                 btrfs_end_bbio(bbio, bio);
6302         } else if (!is_orig_bio) {
6303                 bio_put(bio);
6304         }
6305 }
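/*
 * Tolerance example with hypothetical values: a RAID1 write has
 * max_errors = 1, so if exactly one of the two stripe bios fails the
 * original bio still completes with BLK_STS_OK; a second failure exceeds
 * max_errors and the write is reported as BLK_STS_IOERR.
 */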
6306
6307 static void submit_stripe_bio(struct btrfs_bio *bbio, struct bio *bio,
6308                               u64 physical, int dev_nr)
6309 {
6310         struct btrfs_device *dev = bbio->stripes[dev_nr].dev;
6311         struct btrfs_fs_info *fs_info = bbio->fs_info;
6312
6313         bio->bi_private = bbio;
6314         btrfs_io_bio(bio)->stripe_index = dev_nr;
6315         bio->bi_end_io = btrfs_end_bio;
6316         bio->bi_iter.bi_sector = physical >> 9;
6317         btrfs_debug_in_rcu(fs_info,
6318         "btrfs_map_bio: rw %d 0x%x, sector=%llu, dev=%lu (%s id %llu), size=%u",
6319                 bio_op(bio), bio->bi_opf, (u64)bio->bi_iter.bi_sector,
6320                 (unsigned long)dev->bdev->bd_dev, rcu_str_deref(dev->name),
6321                 dev->devid, bio->bi_iter.bi_size);
6322         bio_set_dev(bio, dev->bdev);
6323
6324         btrfs_bio_counter_inc_noblocked(fs_info);
6325
6326         btrfsic_submit_bio(bio);
6327 }
6328
6329 static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical)
6330 {
6331         atomic_inc(&bbio->error);
6332         if (atomic_dec_and_test(&bbio->stripes_pending)) {
6333                 /* Should be the original bio. */
6334                 WARN_ON(bio != bbio->orig_bio);
6335
6336                 btrfs_io_bio(bio)->mirror_num = bbio->mirror_num;
6337                 bio->bi_iter.bi_sector = logical >> 9;
6338                 if (atomic_read(&bbio->error) > bbio->max_errors)
6339                         bio->bi_status = BLK_STS_IOERR;
6340                 else
6341                         bio->bi_status = BLK_STS_OK;
6342                 btrfs_end_bbio(bbio, bio);
6343         }
6344 }
6345
6346 blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
6347                            int mirror_num)
6348 {
6349         struct btrfs_device *dev;
6350         struct bio *first_bio = bio;
6351         u64 logical = (u64)bio->bi_iter.bi_sector << 9;
6352         u64 length = 0;
6353         u64 map_length;
6354         int ret;
6355         int dev_nr;
6356         int total_devs;
6357         struct btrfs_bio *bbio = NULL;
6358
6359         length = bio->bi_iter.bi_size;
6360         map_length = length;
6361
6362         btrfs_bio_counter_inc_blocked(fs_info);
6363         ret = __btrfs_map_block(fs_info, btrfs_op(bio), logical,
6364                                 &map_length, &bbio, mirror_num, 1);
6365         if (ret) {
6366                 btrfs_bio_counter_dec(fs_info);
6367                 return errno_to_blk_status(ret);
6368         }
6369
6370         total_devs = bbio->num_stripes;
6371         bbio->orig_bio = first_bio;
6372         bbio->private = first_bio->bi_private;
6373         bbio->end_io = first_bio->bi_end_io;
6374         bbio->fs_info = fs_info;
6375         atomic_set(&bbio->stripes_pending, bbio->num_stripes);
6376
6377         if ((bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) &&
6378             ((bio_op(bio) == REQ_OP_WRITE) || (mirror_num > 1))) {
6379                 /* In this case, map_length has been set to the length of
6380                  * a single stripe, not the whole write. */
6381                 if (bio_op(bio) == REQ_OP_WRITE) {
6382                         ret = raid56_parity_write(fs_info, bio, bbio,
6383                                                   map_length);
6384                 } else {
6385                         ret = raid56_parity_recover(fs_info, bio, bbio,
6386                                                     map_length, mirror_num, 1);
6387                 }
6388
6389                 btrfs_bio_counter_dec(fs_info);
6390                 return errno_to_blk_status(ret);
6391         }
6392
6393         if (map_length < length) {
6394                 btrfs_crit(fs_info,
6395                            "mapping failed logical %llu bio len %llu len %llu",
6396                            logical, length, map_length);
6397                 BUG();
6398         }
6399
6400         for (dev_nr = 0; dev_nr < total_devs; dev_nr++) {
6401                 dev = bbio->stripes[dev_nr].dev;
6402                 if (!dev || !dev->bdev || test_bit(BTRFS_DEV_STATE_MISSING,
6403                                                    &dev->dev_state) ||
6404                     (bio_op(first_bio) == REQ_OP_WRITE &&
6405                     !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))) {
6406                         bbio_error(bbio, first_bio, logical);
6407                         continue;
6408                 }
6409
6410                 if (dev_nr < total_devs - 1)
6411                         bio = btrfs_bio_clone(first_bio);
6412                 else
6413                         bio = first_bio;
6414
6415                 submit_stripe_bio(bbio, bio, bbio->stripes[dev_nr].physical,
6416                                   dev_nr);
6417         }
6418         btrfs_bio_counter_dec(fs_info);
6419         return BLK_STS_OK;
6420 }
6421
6422 /*
6423  * Find a device specified by @devid or @uuid in the list of @fs_devices, or
6424  * return NULL.
6425  *
6426  * If devid and uuid are both specified, the match must be exact, otherwise
6427  * only devid is used.
6428  *
6429  * If @seed is true, traverse through the seed devices.
6430  */
6431 struct btrfs_device *btrfs_find_device(struct btrfs_fs_devices *fs_devices,
6432                                        u64 devid, u8 *uuid, u8 *fsid,
6433                                        bool seed)
6434 {
6435         struct btrfs_device *device;
6436
6437         while (fs_devices) {
6438                 if (!fsid ||
6439                     !memcmp(fs_devices->metadata_uuid, fsid, BTRFS_FSID_SIZE)) {
6440                         list_for_each_entry(device, &fs_devices->devices,
6441                                             dev_list) {
6442                                 if (device->devid == devid &&
6443                                     (!uuid || memcmp(device->uuid, uuid,
6444                                                      BTRFS_UUID_SIZE) == 0))
6445                                         return device;
6446                         }
6447                 }
6448                 if (seed)
6449                         fs_devices = fs_devices->seed;
6450                 else
6451                         return NULL;
6452         }
6453         return NULL;
6454 }
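/*
 * Typical use, as in read_one_chunk() below: look up the device backing a
 * chunk stripe by devid and uuid, traversing seed devices too:
 *
 *	dev = btrfs_find_device(fs_info->fs_devices, devid, uuid, NULL, true);
 */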
6455
6456 static struct btrfs_device *add_missing_dev(struct btrfs_fs_devices *fs_devices,
6457                                             u64 devid, u8 *dev_uuid)
6458 {
6459         struct btrfs_device *device;
6460
6461         device = btrfs_alloc_device(NULL, &devid, dev_uuid);
6462         if (IS_ERR(device))
6463                 return device;
6464
6465         list_add(&device->dev_list, &fs_devices->devices);
6466         device->fs_devices = fs_devices;
6467         fs_devices->num_devices++;
6468
6469         set_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
6470         fs_devices->missing_devices++;
6471
6472         return device;
6473 }
6474
6475 /**
6476  * btrfs_alloc_device - allocate struct btrfs_device
6477  * @fs_info:    used only for generating a new devid, can be NULL if
6478  *              devid is provided (i.e. @devid != NULL).
6479  * @devid:      a pointer to devid for this device.  If NULL a new devid
6480  *              is generated.
6481  * @uuid:       a pointer to UUID for this device.  If NULL a new UUID
6482  *              is generated.
6483  *
6484  * Return: a pointer to a new &struct btrfs_device on success; ERR_PTR()
6485  * on error.  Returned struct is not linked onto any lists and must be
6486  * destroyed with btrfs_free_device.
6487  */
6488 struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,
6489                                         const u64 *devid,
6490                                         const u8 *uuid)
6491 {
6492         struct btrfs_device *dev;
6493         u64 tmp;
6494
6495         if (WARN_ON(!devid && !fs_info))
6496                 return ERR_PTR(-EINVAL);
6497
6498         dev = __alloc_device();
6499         if (IS_ERR(dev))
6500                 return dev;
6501
6502         if (devid)
6503                 tmp = *devid;
6504         else {
6505                 int ret;
6506
6507                 ret = find_next_devid(fs_info, &tmp);
6508                 if (ret) {
6509                         btrfs_free_device(dev);
6510                         return ERR_PTR(ret);
6511                 }
6512         }
6513         dev->devid = tmp;
6514
6515         if (uuid)
6516                 memcpy(dev->uuid, uuid, BTRFS_UUID_SIZE);
6517         else
6518                 generate_random_uuid(dev->uuid);
6519
6520         return dev;
6521 }
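/*
 * Typical use, as in add_missing_dev() above: allocate a stub device for a
 * known devid/uuid without consulting fs_info:
 *
 *	device = btrfs_alloc_device(NULL, &devid, dev_uuid);
 *	if (IS_ERR(device))
 *		return device;
 */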
6522
6523 static void btrfs_report_missing_device(struct btrfs_fs_info *fs_info,
6524                                         u64 devid, u8 *uuid, bool error)
6525 {
6526         if (error)
6527                 btrfs_err_rl(fs_info, "devid %llu uuid %pU is missing",
6528                               devid, uuid);
6529         else
6530                 btrfs_warn_rl(fs_info, "devid %llu uuid %pU is missing",
6531                               devid, uuid);
6532 }
6533
6534 static u64 calc_stripe_length(u64 type, u64 chunk_len, int num_stripes)
6535 {
6536         int index = btrfs_bg_flags_to_raid_index(type);
6537         int ncopies = btrfs_raid_array[index].ncopies;
6538         const int nparity = btrfs_raid_array[index].nparity;
6539         int data_stripes;
6540
6541         if (nparity)
6542                 data_stripes = num_stripes - nparity;
6543         else
6544                 data_stripes = num_stripes / ncopies;
6545
6546         return div_u64(chunk_len, data_stripes);
6547 }
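/*
 * Examples with hypothetical values: a 3G RAID6 chunk with num_stripes = 5
 * has nparity = 2, so data_stripes = 3 and the per-device stripe length is
 * 1G; a RAID1 chunk (ncopies = 2, num_stripes = 2) has data_stripes = 1, so
 * the stripe length equals the chunk length.
 */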
6548
6549 static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf,
6550                           struct btrfs_chunk *chunk)
6551 {
6552         struct btrfs_fs_info *fs_info = leaf->fs_info;
6553         struct extent_map_tree *map_tree = &fs_info->mapping_tree;
6554         struct map_lookup *map;
6555         struct extent_map *em;
6556         u64 logical;
6557         u64 length;
6558         u64 devid;
6559         u8 uuid[BTRFS_UUID_SIZE];
6560         int num_stripes;
6561         int ret;
6562         int i;
6563
6564         logical = key->offset;
6565         length = btrfs_chunk_length(leaf, chunk);
6566         num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
6567
6568         /*
6569          * Only need to verify chunk item if we're reading from sys chunk array,
6570          * as chunk item in tree block is already verified by tree-checker.
6571          */
6572         if (leaf->start == BTRFS_SUPER_INFO_OFFSET) {
6573                 ret = btrfs_check_chunk_valid(leaf, chunk, logical);
6574                 if (ret)
6575                         return ret;
6576         }
6577
6578         read_lock(&map_tree->lock);
6579         em = lookup_extent_mapping(map_tree, logical, 1);
6580         read_unlock(&map_tree->lock);
6581
6582         /* already mapped? */
6583         if (em && em->start <= logical && em->start + em->len > logical) {
6584                 free_extent_map(em);
6585                 return 0;
6586         } else if (em) {
6587                 free_extent_map(em);
6588         }
6589
6590         em = alloc_extent_map();
6591         if (!em)
6592                 return -ENOMEM;
6593         map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
6594         if (!map) {
6595                 free_extent_map(em);
6596                 return -ENOMEM;
6597         }
6598
6599         set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags);
6600         em->map_lookup = map;
6601         em->start = logical;
6602         em->len = length;
6603         em->orig_start = 0;
6604         em->block_start = 0;
6605         em->block_len = em->len;
6606
6607         map->num_stripes = num_stripes;
6608         map->io_width = btrfs_chunk_io_width(leaf, chunk);
6609         map->io_align = btrfs_chunk_io_align(leaf, chunk);
6610         map->stripe_len = btrfs_chunk_stripe_len(leaf, chunk);
6611         map->type = btrfs_chunk_type(leaf, chunk);
6612         map->sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk);
6613         map->verified_stripes = 0;
6614         em->orig_block_len = calc_stripe_length(map->type, em->len,
6615                                                 map->num_stripes);
6616         for (i = 0; i < num_stripes; i++) {
6617                 map->stripes[i].physical =
6618                         btrfs_stripe_offset_nr(leaf, chunk, i);
6619                 devid = btrfs_stripe_devid_nr(leaf, chunk, i);
6620                 read_extent_buffer(leaf, uuid, (unsigned long)
6621                                    btrfs_stripe_dev_uuid_nr(chunk, i),
6622                                    BTRFS_UUID_SIZE);
6623                 map->stripes[i].dev = btrfs_find_device(fs_info->fs_devices,
6624                                                         devid, uuid, NULL, true);
6625                 if (!map->stripes[i].dev &&
6626                     !btrfs_test_opt(fs_info, DEGRADED)) {
6627                         free_extent_map(em);
6628                         btrfs_report_missing_device(fs_info, devid, uuid, true);
6629                         return -ENOENT;
6630                 }
6631                 if (!map->stripes[i].dev) {
6632                         map->stripes[i].dev =
6633                                 add_missing_dev(fs_info->fs_devices, devid,
6634                                                 uuid);
6635                         if (IS_ERR(map->stripes[i].dev)) {
6636                                 free_extent_map(em);
6637                                 btrfs_err(fs_info,
6638                                         "failed to init missing dev %llu: %ld",
6639                                         devid, PTR_ERR(map->stripes[i].dev));
6640                                 return PTR_ERR(map->stripes[i].dev);
6641                         }
6642                         btrfs_report_missing_device(fs_info, devid, uuid, false);
6643                 }
6644                 set_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
6645                                 &(map->stripes[i].dev->dev_state));
6646
6647         }
6648
6649         write_lock(&map_tree->lock);
6650         ret = add_extent_mapping(map_tree, em, 0);
6651         write_unlock(&map_tree->lock);
6652         if (ret < 0) {
6653                 btrfs_err(fs_info,
6654                           "failed to add chunk map, start=%llu len=%llu: %d",
6655                           em->start, em->len, ret);
6656         }
6657         free_extent_map(em);
6658
6659         return ret;
6660 }
6661
6662 static void fill_device_from_item(struct extent_buffer *leaf,
6663                                  struct btrfs_dev_item *dev_item,
6664                                  struct btrfs_device *device)
6665 {
6666         unsigned long ptr;
6667
6668         device->devid = btrfs_device_id(leaf, dev_item);
6669         device->disk_total_bytes = btrfs_device_total_bytes(leaf, dev_item);
6670         device->total_bytes = device->disk_total_bytes;
6671         device->commit_total_bytes = device->disk_total_bytes;
6672         device->bytes_used = btrfs_device_bytes_used(leaf, dev_item);
6673         device->commit_bytes_used = device->bytes_used;
6674         device->type = btrfs_device_type(leaf, dev_item);
6675         device->io_align = btrfs_device_io_align(leaf, dev_item);
6676         device->io_width = btrfs_device_io_width(leaf, dev_item);
6677         device->sector_size = btrfs_device_sector_size(leaf, dev_item);
6678         WARN_ON(device->devid == BTRFS_DEV_REPLACE_DEVID);
6679         clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
6680
6681         ptr = btrfs_device_uuid(dev_item);
6682         read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
6683 }
6684
6685 static struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info *fs_info,
6686                                                   u8 *fsid)
6687 {
6688         struct btrfs_fs_devices *fs_devices;
6689         int ret;
6690
6691         lockdep_assert_held(&uuid_mutex);
6692         ASSERT(fsid);
6693
6694         fs_devices = fs_info->fs_devices->seed;
6695         while (fs_devices) {
6696                 if (!memcmp(fs_devices->fsid, fsid, BTRFS_FSID_SIZE))
6697                         return fs_devices;
6698
6699                 fs_devices = fs_devices->seed;
6700         }
6701
6702         fs_devices = find_fsid(fsid, NULL);
6703         if (!fs_devices) {
6704                 if (!btrfs_test_opt(fs_info, DEGRADED))
6705                         return ERR_PTR(-ENOENT);
6706
6707                 fs_devices = alloc_fs_devices(fsid, NULL);
6708                 if (IS_ERR(fs_devices))
6709                         return fs_devices;
6710
6711                 fs_devices->seeding = true;
6712                 fs_devices->opened = 1;
6713                 return fs_devices;
6714         }
6715
6716         fs_devices = clone_fs_devices(fs_devices);
6717         if (IS_ERR(fs_devices))
6718                 return fs_devices;
6719
6720         ret = open_fs_devices(fs_devices, FMODE_READ, fs_info->bdev_holder);
6721         if (ret) {
6722                 free_fs_devices(fs_devices);
6723                 fs_devices = ERR_PTR(ret);
6724                 goto out;
6725         }
6726
6727         if (!fs_devices->seeding) {
6728                 close_fs_devices(fs_devices);
6729                 free_fs_devices(fs_devices);
6730                 fs_devices = ERR_PTR(-EINVAL);
6731                 goto out;
6732         }
6733
6734         fs_devices->seed = fs_info->fs_devices->seed;
6735         fs_info->fs_devices->seed = fs_devices;
6736 out:
6737         return fs_devices;
6738 }
6739
6740 static int read_one_dev(struct extent_buffer *leaf,
6741                         struct btrfs_dev_item *dev_item)
6742 {
6743         struct btrfs_fs_info *fs_info = leaf->fs_info;
6744         struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
6745         struct btrfs_device *device;
6746         u64 devid;
6747         int ret;
6748         u8 fs_uuid[BTRFS_FSID_SIZE];
6749         u8 dev_uuid[BTRFS_UUID_SIZE];
6750
6751         devid = btrfs_device_id(leaf, dev_item);
6752         read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item),
6753                            BTRFS_UUID_SIZE);
6754         read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item),
6755                            BTRFS_FSID_SIZE);
6756
6757         if (memcmp(fs_uuid, fs_devices->metadata_uuid, BTRFS_FSID_SIZE)) {
6758                 fs_devices = open_seed_devices(fs_info, fs_uuid);
6759                 if (IS_ERR(fs_devices))
6760                         return PTR_ERR(fs_devices);
6761         }
6762
6763         device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid,
6764                                    fs_uuid, true);
6765         if (!device) {
6766                 if (!btrfs_test_opt(fs_info, DEGRADED)) {
6767                         btrfs_report_missing_device(fs_info, devid,
6768                                                         dev_uuid, true);
6769                         return -ENOENT;
6770                 }
6771
6772                 device = add_missing_dev(fs_devices, devid, dev_uuid);
6773                 if (IS_ERR(device)) {
6774                         btrfs_err(fs_info,
6775                                 "failed to add missing dev %llu: %ld",
6776                                 devid, PTR_ERR(device));
6777                         return PTR_ERR(device);
6778                 }
6779                 btrfs_report_missing_device(fs_info, devid, dev_uuid, false);
6780         } else {
6781                 if (!device->bdev) {
6782                         if (!btrfs_test_opt(fs_info, DEGRADED)) {
6783                                 btrfs_report_missing_device(fs_info,
6784                                                 devid, dev_uuid, true);
6785                                 return -ENOENT;
6786                         }
6787                         btrfs_report_missing_device(fs_info, devid,
6788                                                         dev_uuid, false);
6789                 }
6790
6791                 if (!device->bdev &&
6792                     !test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) {
6793                         /*
6794                          * This happens when a device that was properly set
6795                          * up in the device info lists suddenly goes bad.
6796                          * device->bdev is NULL, and so we have to set the
6797                          * BTRFS_DEV_STATE_MISSING bit here.
6798                          */
6799                         device->fs_devices->missing_devices++;
6800                         set_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
6801                 }
6802
6803                 /* Move the device to its own fs_devices */
6804                 if (device->fs_devices != fs_devices) {
6805                         ASSERT(test_bit(BTRFS_DEV_STATE_MISSING,
6806                                                         &device->dev_state));
6807
6808                         list_move(&device->dev_list, &fs_devices->devices);
6809                         device->fs_devices->num_devices--;
6810                         fs_devices->num_devices++;
6811
6812                         device->fs_devices->missing_devices--;
6813                         fs_devices->missing_devices++;
6814
6815                         device->fs_devices = fs_devices;
6816                 }
6817         }
6818
6819         if (device->fs_devices != fs_info->fs_devices) {
6820                 BUG_ON(test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state));
6821                 if (device->generation !=
6822                     btrfs_device_generation(leaf, dev_item))
6823                         return -EINVAL;
6824         }
6825
6826         fill_device_from_item(leaf, dev_item, device);
6827         set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
6828         if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
6829            !test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
6830                 device->fs_devices->total_rw_bytes += device->total_bytes;
6831                 atomic64_add(device->total_bytes - device->bytes_used,
6832                                 &fs_info->free_chunk_space);
6833         }
6834         ret = 0;
6835         return ret;
6836 }
6837
6838 int btrfs_read_sys_array(struct btrfs_fs_info *fs_info)
6839 {
6840         struct btrfs_root *root = fs_info->tree_root;
6841         struct btrfs_super_block *super_copy = fs_info->super_copy;
6842         struct extent_buffer *sb;
6843         struct btrfs_disk_key *disk_key;
6844         struct btrfs_chunk *chunk;
6845         u8 *array_ptr;
6846         unsigned long sb_array_offset;
6847         int ret = 0;
6848         u32 num_stripes;
6849         u32 array_size;
6850         u32 len = 0;
6851         u32 cur_offset;
6852         u64 type;
6853         struct btrfs_key key;
6854
6855         ASSERT(BTRFS_SUPER_INFO_SIZE <= fs_info->nodesize);
6856         /*
6857          * This will create an extent buffer of nodesize; the superblock size
6858          * is fixed to BTRFS_SUPER_INFO_SIZE. If nodesize > sb size, this will
6859          * overallocate, but that's fine as only the first page is used.
6860          */
6861         sb = btrfs_find_create_tree_block(fs_info, BTRFS_SUPER_INFO_OFFSET);
6862         if (IS_ERR(sb))
6863                 return PTR_ERR(sb);
6864         set_extent_buffer_uptodate(sb);
6865         btrfs_set_buffer_lockdep_class(root->root_key.objectid, sb, 0);
6866         /*
6867          * The sb extent buffer is artificial and just used to read the system array.
6868          * The set_extent_buffer_uptodate() call does not properly mark all its
6869          * pages up-to-date when the page is larger: the extent does not cover
6870          * the whole page and consequently check_page_uptodate does not find
6871          * all the page's extents up-to-date (the hole beyond sb), so
6872          * write_extent_buffer then triggers a WARN_ON.
6873          *
6874          * Regular short extents go through the mark_extent_buffer_dirty/
6875          * writeback cycle, but sb spans only this function. Add an explicit
6876          * SetPageUptodate call to silence the warning e.g. on PowerPC 64.
6877          */
6878         if (PAGE_SIZE > BTRFS_SUPER_INFO_SIZE)
6879                 SetPageUptodate(sb->pages[0]);
6880
6881         write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE);
6882         array_size = btrfs_super_sys_array_size(super_copy);
6883
6884         array_ptr = super_copy->sys_chunk_array;
6885         sb_array_offset = offsetof(struct btrfs_super_block, sys_chunk_array);
6886         cur_offset = 0;
6887
6888         while (cur_offset < array_size) {
6889                 disk_key = (struct btrfs_disk_key *)array_ptr;
6890                 len = sizeof(*disk_key);
6891                 if (cur_offset + len > array_size)
6892                         goto out_short_read;
6893
6894                 btrfs_disk_key_to_cpu(&key, disk_key);
6895
6896                 array_ptr += len;
6897                 sb_array_offset += len;
6898                 cur_offset += len;
6899
6900                 if (key.type != BTRFS_CHUNK_ITEM_KEY) {
6901                         btrfs_err(fs_info,
6902                             "unexpected item type %u in sys_array at offset %u",
6903                                   (u32)key.type, cur_offset);
6904                         ret = -EIO;
6905                         break;
6906                 }
6907
6908                 chunk = (struct btrfs_chunk *)sb_array_offset;
6909                 /*
6910                  * At least one btrfs_chunk with one stripe must be present;
6911                  * the exact stripe count check comes afterwards.
6912                  */
                len = btrfs_chunk_item_size(1);
                if (cur_offset + len > array_size)
                        goto out_short_read;

                num_stripes = btrfs_chunk_num_stripes(sb, chunk);
                if (!num_stripes) {
                        btrfs_err(fs_info,
                        "invalid number of stripes %u in sys_array at offset %u",
                                  num_stripes, cur_offset);
                        ret = -EIO;
                        break;
                }

                type = btrfs_chunk_type(sb, chunk);
                if ((type & BTRFS_BLOCK_GROUP_SYSTEM) == 0) {
                        btrfs_err(fs_info,
                        "invalid chunk type %llu in sys_array at offset %u",
                                  type, cur_offset);
                        ret = -EIO;
                        break;
                }

                len = btrfs_chunk_item_size(num_stripes);
                if (cur_offset + len > array_size)
                        goto out_short_read;

                ret = read_one_chunk(&key, sb, chunk);
                if (ret)
                        break;

                array_ptr += len;
                sb_array_offset += len;
                cur_offset += len;
        }
        clear_extent_buffer_uptodate(sb);
        free_extent_buffer_stale(sb);
        return ret;

out_short_read:
        btrfs_err(fs_info, "sys_array too short to read %u bytes at offset %u",
                        len, cur_offset);
        clear_extent_buffer_uptodate(sb);
        free_extent_buffer_stale(sb);
        return -EIO;
}

/*
 * Check if all chunks in the fs are OK for a read-write degraded mount
 *
 * If @failing_dev is specified, it's accounted as missing.
 *
 * Return true if all chunks meet the minimal RW mount requirements.
 * Return false if any chunk doesn't meet the minimal RW mount requirements.
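 *
 * For example, a RAID1 chunk tolerates a single missing device, so a
 * filesystem consisting only of RAID1 chunks stays writable with one
 * device missing but not with two.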
 */
bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info,
                                        struct btrfs_device *failing_dev)
{
        struct extent_map_tree *map_tree = &fs_info->mapping_tree;
        struct extent_map *em;
        u64 next_start = 0;
        bool ret = true;

        read_lock(&map_tree->lock);
        em = lookup_extent_mapping(map_tree, 0, (u64)-1);
        read_unlock(&map_tree->lock);
        /* No chunk at all? Return false anyway */
        if (!em) {
                ret = false;
                goto out;
        }
        while (em) {
                struct map_lookup *map;
                int missing = 0;
                int max_tolerated;
                int i;

                map = em->map_lookup;
                max_tolerated =
                        btrfs_get_num_tolerated_disk_barrier_failures(
                                        map->type);
                for (i = 0; i < map->num_stripes; i++) {
                        struct btrfs_device *dev = map->stripes[i].dev;

                        if (!dev || !dev->bdev ||
                            test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) ||
                            dev->last_flush_error)
                                missing++;
                        else if (failing_dev && failing_dev == dev)
                                missing++;
                }
                if (missing > max_tolerated) {
                        if (!failing_dev)
                                btrfs_warn(fs_info,
        "chunk %llu missing %d devices, max tolerance is %d for writable mount",
                                   em->start, missing, max_tolerated);
                        free_extent_map(em);
                        ret = false;
                        goto out;
                }
                next_start = extent_map_end(em);
                free_extent_map(em);

                read_lock(&map_tree->lock);
                em = lookup_extent_mapping(map_tree, next_start,
                                           (u64)(-1) - next_start);
                read_unlock(&map_tree->lock);
        }
out:
        return ret;
}

int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info)
{
        struct btrfs_root *root = fs_info->chunk_root;
        struct btrfs_path *path;
        struct extent_buffer *leaf;
        struct btrfs_key key;
        struct btrfs_key found_key;
        int ret;
        int slot;
        u64 total_dev = 0;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        /*
         * uuid_mutex is needed only when we are mounting a sprout FS.
         */
        mutex_lock(&uuid_mutex);
        mutex_lock(&fs_info->chunk_mutex);

        /*
         * Read all device items, and then all the chunk items. All
         * device items are found before any chunk item (their object id
         * is smaller than the lowest possible object id for a chunk
         * item - BTRFS_FIRST_CHUNK_TREE_OBJECTID).
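         * (BTRFS_DEV_ITEMS_OBJECTID is 1 while BTRFS_FIRST_CHUNK_TREE_OBJECTID
         * is 256, so the key ordering of the tree guarantees this.)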
         */
        key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
        key.offset = 0;
        key.type = 0;
        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
        if (ret < 0)
                goto error;
        while (1) {
                leaf = path->nodes[0];
                slot = path->slots[0];
                if (slot >= btrfs_header_nritems(leaf)) {
                        ret = btrfs_next_leaf(root, path);
                        if (ret == 0)
                                continue;
                        if (ret < 0)
                                goto error;
                        break;
                }
                btrfs_item_key_to_cpu(leaf, &found_key, slot);
                if (found_key.type == BTRFS_DEV_ITEM_KEY) {
                        struct btrfs_dev_item *dev_item;
                        dev_item = btrfs_item_ptr(leaf, slot,
                                                  struct btrfs_dev_item);
                        ret = read_one_dev(leaf, dev_item);
                        if (ret)
                                goto error;
                        total_dev++;
                } else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) {
                        struct btrfs_chunk *chunk;
                        chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
                        ret = read_one_chunk(&found_key, leaf, chunk);
                        if (ret)
                                goto error;
                }
                path->slots[0]++;
        }

        /*
         * After loading the chunk tree we have all device information, so
         * do another round of validation checks.
         */
        if (total_dev != fs_info->fs_devices->total_devices) {
                btrfs_err(fs_info,
           "super_num_devices %llu mismatch with num_devices %llu found here",
                          btrfs_super_num_devices(fs_info->super_copy),
                          total_dev);
                ret = -EINVAL;
                goto error;
        }
        if (btrfs_super_total_bytes(fs_info->super_copy) <
            fs_info->fs_devices->total_rw_bytes) {
                btrfs_err(fs_info,
        "super_total_bytes %llu mismatch with fs_devices total_rw_bytes %llu",
                          btrfs_super_total_bytes(fs_info->super_copy),
                          fs_info->fs_devices->total_rw_bytes);
                ret = -EINVAL;
                goto error;
        }
        ret = 0;
error:
        mutex_unlock(&fs_info->chunk_mutex);
        mutex_unlock(&uuid_mutex);

        btrfs_free_path(path);
        return ret;
}

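/*
 * Make every device on the main and seed device lists point back at this
 * fs_info.
 */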
void btrfs_init_devices_late(struct btrfs_fs_info *fs_info)
{
        struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
        struct btrfs_device *device;

        while (fs_devices) {
                mutex_lock(&fs_devices->device_list_mutex);
                list_for_each_entry(device, &fs_devices->devices, dev_list)
                        device->fs_info = fs_info;
                mutex_unlock(&fs_devices->device_list_mutex);

                fs_devices = fs_devices->seed;
        }
}

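/*
 * Accessors for one u64 counter in the values[] array of an on-disk
 * btrfs_dev_stats_item.  The 'ptr' obtained from btrfs_item_ptr() is
 * really a byte offset into the extent buffer, so the offset of
 * values[index] works out to:
 *
 *   (unsigned long)ptr + offsetof(struct btrfs_dev_stats_item, values)
 *                      + index * sizeof(u64)
 */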
static u64 btrfs_dev_stats_value(const struct extent_buffer *eb,
                                 const struct btrfs_dev_stats_item *ptr,
                                 int index)
{
        u64 val;

        read_extent_buffer(eb, &val,
                           offsetof(struct btrfs_dev_stats_item, values) +
                            ((unsigned long)ptr) + (index * sizeof(u64)),
                           sizeof(val));
        return val;
}

static void btrfs_set_dev_stats_value(struct extent_buffer *eb,
                                      struct btrfs_dev_stats_item *ptr,
                                      int index, u64 val)
{
        write_extent_buffer(eb, &val,
                            offsetof(struct btrfs_dev_stats_item, values) +
                             ((unsigned long)ptr) + (index * sizeof(u64)),
                            sizeof(val));
}

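/*
 * Load the persisted per-device error counters from the device tree; a
 * device without a dev_stats item simply starts with all counters at
 * zero.
 */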
int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info)
{
        struct btrfs_key key;
        struct btrfs_root *dev_root = fs_info->dev_root;
        struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
        struct extent_buffer *eb;
        int slot;
        int ret = 0;
        struct btrfs_device *device;
        struct btrfs_path *path = NULL;
        int i;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        mutex_lock(&fs_devices->device_list_mutex);
        list_for_each_entry(device, &fs_devices->devices, dev_list) {
                int item_size;
                struct btrfs_dev_stats_item *ptr;

                key.objectid = BTRFS_DEV_STATS_OBJECTID;
                key.type = BTRFS_PERSISTENT_ITEM_KEY;
                key.offset = device->devid;
                ret = btrfs_search_slot(NULL, dev_root, &key, path, 0, 0);
                if (ret) {
                        for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
                                btrfs_dev_stat_set(device, i, 0);
                        device->dev_stats_valid = 1;
                        btrfs_release_path(path);
                        continue;
                }
                slot = path->slots[0];
                eb = path->nodes[0];
                item_size = btrfs_item_size_nr(eb, slot);

                ptr = btrfs_item_ptr(eb, slot,
                                     struct btrfs_dev_stats_item);

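                /*
                 * The item may be smaller than expected, e.g. when it was
                 * written before additional counters existed; treat any
                 * missing trailing values as zero.
                 */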
                for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) {
                        if (item_size >= (1 + i) * sizeof(__le64))
                                btrfs_dev_stat_set(device, i,
                                        btrfs_dev_stats_value(eb, ptr, i));
                        else
                                btrfs_dev_stat_set(device, i, 0);
                }

                device->dev_stats_valid = 1;
                btrfs_dev_stat_print_on_load(device);
                btrfs_release_path(path);
        }
        mutex_unlock(&fs_devices->device_list_mutex);

        btrfs_free_path(path);
        return ret < 0 ? ret : 0;
}

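/*
 * Write the in-memory error counters of @device into its dev_stats item
 * in the device tree.  An existing item that is too small is deleted and
 * re-inserted at full size; a missing item is inserted from scratch.
 */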
static int update_dev_stat_item(struct btrfs_trans_handle *trans,
                                struct btrfs_device *device)
{
        struct btrfs_fs_info *fs_info = trans->fs_info;
        struct btrfs_root *dev_root = fs_info->dev_root;
        struct btrfs_path *path;
        struct btrfs_key key;
        struct extent_buffer *eb;
        struct btrfs_dev_stats_item *ptr;
        int ret;
        int i;

        key.objectid = BTRFS_DEV_STATS_OBJECTID;
        key.type = BTRFS_PERSISTENT_ITEM_KEY;
        key.offset = device->devid;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;
        ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1);
        if (ret < 0) {
                btrfs_warn_in_rcu(fs_info,
                        "error %d while searching for dev_stats item for device %s",
                              ret, rcu_str_deref(device->name));
                goto out;
        }

        if (ret == 0 &&
            btrfs_item_size_nr(path->nodes[0], path->slots[0]) < sizeof(*ptr)) {
                /* need to delete old one and insert a new one */
                ret = btrfs_del_item(trans, dev_root, path);
                if (ret != 0) {
                        btrfs_warn_in_rcu(fs_info,
                                "delete too small dev_stats item for device %s failed %d",
                                      rcu_str_deref(device->name), ret);
                        goto out;
                }
                ret = 1;
        }

        if (ret == 1) {
                /* need to insert a new item */
                btrfs_release_path(path);
                ret = btrfs_insert_empty_item(trans, dev_root, path,
                                              &key, sizeof(*ptr));
                if (ret < 0) {
                        btrfs_warn_in_rcu(fs_info,
                                "insert dev_stats item for device %s failed %d",
                                rcu_str_deref(device->name), ret);
                        goto out;
                }
        }

        eb = path->nodes[0];
        ptr = btrfs_item_ptr(eb, path->slots[0], struct btrfs_dev_stats_item);
        for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
                btrfs_set_dev_stats_value(eb, ptr, i,
                                          btrfs_dev_stat_read(device, i));
        btrfs_mark_buffer_dirty(eb);

out:
        btrfs_free_path(path);
        return ret;
}

/*
 * Called from commit_transaction.  Writes all changed device stats to disk.
 */
int btrfs_run_dev_stats(struct btrfs_trans_handle *trans)
{
        struct btrfs_fs_info *fs_info = trans->fs_info;
        struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
        struct btrfs_device *device;
        int stats_cnt;
        int ret = 0;

        mutex_lock(&fs_devices->device_list_mutex);
        list_for_each_entry(device, &fs_devices->devices, dev_list) {
                stats_cnt = atomic_read(&device->dev_stats_ccnt);
                if (!device->dev_stats_valid || stats_cnt == 0)
                        continue;

                /*
                 * There is a LOAD-LOAD control dependency between the value of
                 * dev_stats_ccnt and updating the on-disk values, which
                 * requires reading the in-memory counters.  Such control
                 * dependencies require explicit read memory barriers.
                 *
                 * This memory barrier pairs with smp_mb__before_atomic in
                 * btrfs_dev_stat_inc/btrfs_dev_stat_set and with the full
                 * barrier implied by atomic_xchg in
                 * btrfs_dev_stat_read_and_reset.
                 */
                smp_rmb();

                ret = update_dev_stat_item(trans, device);
                if (!ret)
                        atomic_sub(stats_cnt, &device->dev_stats_ccnt);
        }
        mutex_unlock(&fs_devices->device_list_mutex);

        return ret;
}

void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index)
{
        btrfs_dev_stat_inc(dev, index);
        btrfs_dev_stat_print_on_error(dev);
}

static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev)
{
        if (!dev->dev_stats_valid)
                return;
        btrfs_err_rl_in_rcu(dev->fs_info,
                "bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u",
                           rcu_str_deref(dev->name),
                           btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
                           btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
                           btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS),
                           btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS),
                           btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS));
}

static void btrfs_dev_stat_print_on_load(struct btrfs_device *dev)
{
        int i;

        for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
                if (btrfs_dev_stat_read(dev, i) != 0)
                        break;
        if (i == BTRFS_DEV_STAT_VALUES_MAX)
                return; /* all values == 0, suppress message */

        btrfs_info_in_rcu(dev->fs_info,
                "bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u",
               rcu_str_deref(dev->name),
               btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
               btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
               btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS),
               btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS),
               btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS));
}

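/*
 * Back end of the "get device stats" ioctl: copy the counters of the
 * requested devid into @stats, optionally zeroing them as they are read
 * when BTRFS_DEV_STATS_RESET is set in @stats->flags.
 */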
int btrfs_get_dev_stats(struct btrfs_fs_info *fs_info,
                        struct btrfs_ioctl_get_dev_stats *stats)
{
        struct btrfs_device *dev;
        struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
        int i;

        mutex_lock(&fs_devices->device_list_mutex);
        dev = btrfs_find_device(fs_info->fs_devices, stats->devid, NULL, NULL,
                                true);
        mutex_unlock(&fs_devices->device_list_mutex);

        if (!dev) {
                btrfs_warn(fs_info, "get dev_stats failed, device not found");
                return -ENODEV;
        } else if (!dev->dev_stats_valid) {
                btrfs_warn(fs_info, "get dev_stats failed, not yet valid");
                return -ENODEV;
        } else if (stats->flags & BTRFS_DEV_STATS_RESET) {
                for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) {
                        if (stats->nr_items > i)
                                stats->values[i] =
                                        btrfs_dev_stat_read_and_reset(dev, i);
                        else
                                btrfs_dev_stat_set(dev, i, 0);
                }
                btrfs_info(fs_info, "device stats zeroed by %s (%d)",
                           current->comm, task_pid_nr(current));
        } else {
                for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
                        if (stats->nr_items > i)
                                stats->values[i] = btrfs_dev_stat_read(dev, i);
        }
        if (stats->nr_items > BTRFS_DEV_STAT_VALUES_MAX)
                stats->nr_items = BTRFS_DEV_STAT_VALUES_MAX;
        return 0;
}

/*
 * Update the size and bytes used for each device where it changed.  This is
 * delayed since we would otherwise get errors while writing out the
 * superblocks.
 *
 * Must be invoked during transaction commit.
 */
void btrfs_commit_device_sizes(struct btrfs_transaction *trans)
{
        struct btrfs_device *curr, *next;

        ASSERT(trans->state == TRANS_STATE_COMMIT_DOING);

        if (list_empty(&trans->dev_update_list))
                return;

        /*
         * We don't need the device_list_mutex here.  This list is owned by the
         * transaction and the transaction must complete before the device is
         * released.
         */
        mutex_lock(&trans->fs_info->chunk_mutex);
        list_for_each_entry_safe(curr, next, &trans->dev_update_list,
                                 post_commit_list) {
                list_del_init(&curr->post_commit_list);
                curr->commit_total_bytes = curr->disk_total_bytes;
                curr->commit_bytes_used = curr->bytes_used;
        }
        mutex_unlock(&trans->fs_info->chunk_mutex);
}

void btrfs_set_fs_info_ptr(struct btrfs_fs_info *fs_info)
{
        struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;

        while (fs_devices) {
                fs_devices->fs_info = fs_info;
                fs_devices = fs_devices->seed;
        }
}

void btrfs_reset_fs_info_ptr(struct btrfs_fs_info *fs_info)
{
        struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;

        while (fs_devices) {
                fs_devices->fs_info = NULL;
                fs_devices = fs_devices->seed;
        }
}

/*
 * Multiplicity factor for simple profiles: DUP, RAID1-like and RAID10.
 */
int btrfs_bg_type_to_factor(u64 flags)
{
        const int index = btrfs_bg_flags_to_raid_index(flags);

        return btrfs_raid_array[index].ncopies;
}

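/*
 * Cross-check one dev extent against the chunk mapping it points back to:
 * the chunk must exist, the dev extent length must match the calculated
 * stripe length, exactly one stripe of the chunk must sit at this
 * devid/physical offset, and the extent must not run past the end of the
 * device.
 */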
static int verify_one_dev_extent(struct btrfs_fs_info *fs_info,
                                 u64 chunk_offset, u64 devid,
                                 u64 physical_offset, u64 physical_len)
{
        struct extent_map_tree *em_tree = &fs_info->mapping_tree;
        struct extent_map *em;
        struct map_lookup *map;
        struct btrfs_device *dev;
        u64 stripe_len;
        bool found = false;
        int ret = 0;
        int i;

        read_lock(&em_tree->lock);
        em = lookup_extent_mapping(em_tree, chunk_offset, 1);
        read_unlock(&em_tree->lock);

        if (!em) {
                btrfs_err(fs_info,
"dev extent physical offset %llu on devid %llu doesn't have corresponding chunk",
                          physical_offset, devid);
                ret = -EUCLEAN;
                goto out;
        }

        map = em->map_lookup;
        stripe_len = calc_stripe_length(map->type, em->len, map->num_stripes);
        if (physical_len != stripe_len) {
                btrfs_err(fs_info,
"dev extent physical offset %llu on devid %llu length doesn't match chunk %llu, have %llu expect %llu",
                          physical_offset, devid, em->start, physical_len,
                          stripe_len);
                ret = -EUCLEAN;
                goto out;
        }

        for (i = 0; i < map->num_stripes; i++) {
                if (map->stripes[i].dev->devid == devid &&
                    map->stripes[i].physical == physical_offset) {
                        found = true;
                        if (map->verified_stripes >= map->num_stripes) {
                                btrfs_err(fs_info,
                                "too many dev extents for chunk %llu found",
                                          em->start);
                                ret = -EUCLEAN;
                                goto out;
                        }
                        map->verified_stripes++;
                        break;
                }
        }
        if (!found) {
                btrfs_err(fs_info,
        "dev extent physical offset %llu devid %llu has no corresponding chunk",
                        physical_offset, devid);
                ret = -EUCLEAN;
        }

        /* Make sure no dev extent is beyond device boundary */
        dev = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL, true);
        if (!dev) {
                btrfs_err(fs_info, "failed to find devid %llu", devid);
                ret = -EUCLEAN;
                goto out;
        }

        /* It's possible this device is a dummy for a seed device */
        if (dev->disk_total_bytes == 0) {
                dev = btrfs_find_device(fs_info->fs_devices->seed, devid, NULL,
                                        NULL, false);
                if (!dev) {
                        btrfs_err(fs_info, "failed to find seed devid %llu",
                                  devid);
                        ret = -EUCLEAN;
                        goto out;
                }
        }

        if (physical_offset + physical_len > dev->disk_total_bytes) {
                btrfs_err(fs_info,
"dev extent devid %llu physical offset %llu len %llu is beyond device boundary %llu",
                          devid, physical_offset, physical_len,
                          dev->disk_total_bytes);
                ret = -EUCLEAN;
                goto out;
        }
out:
        free_extent_map(em);
        return ret;
}

static int verify_chunk_dev_extent_mapping(struct btrfs_fs_info *fs_info)
{
        struct extent_map_tree *em_tree = &fs_info->mapping_tree;
        struct extent_map *em;
        struct rb_node *node;
        int ret = 0;

        read_lock(&em_tree->lock);
        for (node = rb_first_cached(&em_tree->map); node; node = rb_next(node)) {
                em = rb_entry(node, struct extent_map, rb_node);
                if (em->map_lookup->num_stripes !=
                    em->map_lookup->verified_stripes) {
                        btrfs_err(fs_info,
                        "chunk %llu has missing dev extent, have %d expect %d",
                                  em->start, em->map_lookup->verified_stripes,
                                  em->map_lookup->num_stripes);
                        ret = -EUCLEAN;
                        goto out;
                }
        }
out:
        read_unlock(&em_tree->lock);
        return ret;
}

/*
 * Ensure that all dev extents are mapped to the correct chunk, otherwise
 * later chunk allocation/free would cause unexpected behavior.
 *
 * NOTE: This will iterate through the whole device tree, which should be
 * about the same size as the chunk tree.  This slightly increases mount
 * time.
 */
int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info)
{
        struct btrfs_path *path;
        struct btrfs_root *root = fs_info->dev_root;
        struct btrfs_key key;
        u64 prev_devid = 0;
        u64 prev_dev_ext_end = 0;
        int ret = 0;

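        /* Start from the smallest possible key: devid 1, offset 0 */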
        key.objectid = 1;
        key.type = BTRFS_DEV_EXTENT_KEY;
        key.offset = 0;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        path->reada = READA_FORWARD;
        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
        if (ret < 0)
                goto out;

        if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
                ret = btrfs_next_item(root, path);
                if (ret < 0)
                        goto out;
                /* No dev extents at all? Not good */
                if (ret > 0) {
                        ret = -EUCLEAN;
                        goto out;
                }
        }
        while (1) {
                struct extent_buffer *leaf = path->nodes[0];
                struct btrfs_dev_extent *dext;
                int slot = path->slots[0];
                u64 chunk_offset;
                u64 physical_offset;
                u64 physical_len;
                u64 devid;

                btrfs_item_key_to_cpu(leaf, &key, slot);
                if (key.type != BTRFS_DEV_EXTENT_KEY)
                        break;
                devid = key.objectid;
                physical_offset = key.offset;

                dext = btrfs_item_ptr(leaf, slot, struct btrfs_dev_extent);
                chunk_offset = btrfs_dev_extent_chunk_offset(leaf, dext);
                physical_len = btrfs_dev_extent_length(leaf, dext);

                /* Check if this dev extent overlaps with the previous one */
                if (devid == prev_devid && physical_offset < prev_dev_ext_end) {
                        btrfs_err(fs_info,
"dev extent devid %llu physical offset %llu overlap with previous dev extent end %llu",
                                  devid, physical_offset, prev_dev_ext_end);
                        ret = -EUCLEAN;
                        goto out;
                }

                ret = verify_one_dev_extent(fs_info, chunk_offset, devid,
                                            physical_offset, physical_len);
                if (ret < 0)
                        goto out;
                prev_devid = devid;
                prev_dev_ext_end = physical_offset + physical_len;

                ret = btrfs_next_item(root, path);
                if (ret < 0)
                        goto out;
                if (ret > 0) {
                        ret = 0;
                        break;
                }
        }

        /* Ensure all chunks have corresponding dev extents */
        ret = verify_chunk_dev_extent_mapping(fs_info);
out:
        btrfs_free_path(path);
        return ret;
}

/*
 * Check whether the given block group or device is pinned by any inode being
 * used as a swapfile.
 */
bool btrfs_pinned_by_swapfile(struct btrfs_fs_info *fs_info, void *ptr)
{
        struct btrfs_swapfile_pin *sp;
        struct rb_node *node;

        spin_lock(&fs_info->swapfile_pins_lock);
        node = fs_info->swapfile_pins.rb_node;
        while (node) {
                sp = rb_entry(node, struct btrfs_swapfile_pin, node);
                if (ptr < sp->ptr)
                        node = node->rb_left;
                else if (ptr > sp->ptr)
                        node = node->rb_right;
                else
                        break;
        }
        spin_unlock(&fs_info->swapfile_pins_lock);
        return node != NULL;
}