[uclinux-h8/linux.git] fs/btrfs/volumes.c
/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */
#include <linux/sched.h>
#include <linux/bio.h>
#include <linux/slab.h>
#include <linux/buffer_head.h>
#include <linux/blkdev.h>
#include <linux/random.h>
#include <linux/iocontext.h>
#include <linux/capability.h>
#include <linux/ratelimit.h>
#include <linux/kthread.h>
#include <asm/div64.h>
#include "compat.h"
#include "ctree.h"
#include "extent_map.h"
#include "disk-io.h"
#include "transaction.h"
#include "print-tree.h"
#include "volumes.h"
#include "async-thread.h"
#include "check-integrity.h"
#include "rcu-string.h"

static int init_first_rw_device(struct btrfs_trans_handle *trans,
                                struct btrfs_root *root,
                                struct btrfs_device *device);
static int btrfs_relocate_sys_chunks(struct btrfs_root *root);
static void __btrfs_reset_dev_stats(struct btrfs_device *dev);
static void btrfs_dev_stat_print_on_load(struct btrfs_device *device);

static DEFINE_MUTEX(uuid_mutex);
static LIST_HEAD(fs_uuids);

static void lock_chunks(struct btrfs_root *root)
{
        mutex_lock(&root->fs_info->chunk_mutex);
}

static void unlock_chunks(struct btrfs_root *root)
{
        mutex_unlock(&root->fs_info->chunk_mutex);
}

static void free_fs_devices(struct btrfs_fs_devices *fs_devices)
{
        struct btrfs_device *device;
        WARN_ON(fs_devices->opened);
        while (!list_empty(&fs_devices->devices)) {
                device = list_entry(fs_devices->devices.next,
                                    struct btrfs_device, dev_list);
                list_del(&device->dev_list);
                rcu_string_free(device->name);
                kfree(device);
        }
        kfree(fs_devices);
}

void btrfs_cleanup_fs_uuids(void)
{
        struct btrfs_fs_devices *fs_devices;

        while (!list_empty(&fs_uuids)) {
                fs_devices = list_entry(fs_uuids.next,
                                        struct btrfs_fs_devices, list);
                list_del(&fs_devices->list);
                free_fs_devices(fs_devices);
        }
}

static noinline struct btrfs_device *__find_device(struct list_head *head,
                                                   u64 devid, u8 *uuid)
{
        struct btrfs_device *dev;

        list_for_each_entry(dev, head, dev_list) {
                if (dev->devid == devid &&
                    (!uuid || !memcmp(dev->uuid, uuid, BTRFS_UUID_SIZE))) {
                        return dev;
                }
        }
        return NULL;
}

static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid)
{
        struct btrfs_fs_devices *fs_devices;

        list_for_each_entry(fs_devices, &fs_uuids, list) {
                if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0)
                        return fs_devices;
        }
        return NULL;
}

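/*
 * splice a chain of bios back onto the front of the pending list,
 * ahead of anything that was queued while we were processing
 */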
static void requeue_list(struct btrfs_pending_bios *pending_bios,
                        struct bio *head, struct bio *tail)
{

        struct bio *old_head;

        old_head = pending_bios->head;
        pending_bios->head = head;
        if (pending_bios->tail)
                tail->bi_next = old_head;
        else
                pending_bios->tail = tail;
}

/*
 * we try to collect pending bios for a device so we don't get a large
 * number of procs sending bios down to the same device.  This greatly
 * improves the scheduler's ability to collect and merge the bios.
 *
 * But, it also turns into a long list of bios to process and that is sure
 * to eventually make the worker thread block.  The solution here is to
 * make some progress and then put this work struct back at the end of
 * the list if the block device is congested.  This way, multiple devices
 * can make progress from a single worker thread.
 */
static noinline void run_scheduled_bios(struct btrfs_device *device)
{
        struct bio *pending;
        struct backing_dev_info *bdi;
        struct btrfs_fs_info *fs_info;
        struct btrfs_pending_bios *pending_bios;
        struct bio *tail;
        struct bio *cur;
        int again = 0;
        unsigned long num_run;
        unsigned long batch_run = 0;
        unsigned long limit;
        unsigned long last_waited = 0;
        int force_reg = 0;
        int sync_pending = 0;
        struct blk_plug plug;

        /*
         * this function runs all the bios we've collected for
         * a particular device.  We don't want to wander off to
         * another device without first sending all of these down.
         * So, set up a plug here and finish it off before we return
         */
        blk_start_plug(&plug);

        bdi = blk_get_backing_dev_info(device->bdev);
        fs_info = device->dev_root->fs_info;
        limit = btrfs_async_submit_limit(fs_info);
        limit = limit * 2 / 3;

loop:
        spin_lock(&device->io_lock);

loop_lock:
        num_run = 0;

        /* take all the bios off the list at once and process them
         * later on (without the lock held).  But, remember the
         * tail and other pointers so the bios can be properly reinserted
         * into the list if we hit congestion
         */
        if (!force_reg && device->pending_sync_bios.head) {
                pending_bios = &device->pending_sync_bios;
                force_reg = 1;
        } else {
                pending_bios = &device->pending_bios;
                force_reg = 0;
        }

        pending = pending_bios->head;
        tail = pending_bios->tail;
        WARN_ON(pending && !tail);

        /*
         * if pending was null this time around, no bios need processing
         * at all and we can stop.  Otherwise it'll loop back up again
         * and do an additional check so no bios are missed.
         *
         * device->running_pending is used to synchronize with the
         * schedule_bio code.
         */
        if (device->pending_sync_bios.head == NULL &&
            device->pending_bios.head == NULL) {
                again = 0;
                device->running_pending = 0;
        } else {
                again = 1;
                device->running_pending = 1;
        }

        pending_bios->head = NULL;
        pending_bios->tail = NULL;

        spin_unlock(&device->io_lock);

        while (pending) {

                rmb();
                /* we want to work on both lists, but do more bios on the
                 * sync list than the regular list
                 */
                if ((num_run > 32 &&
                    pending_bios != &device->pending_sync_bios &&
                    device->pending_sync_bios.head) ||
                   (num_run > 64 && pending_bios == &device->pending_sync_bios &&
                    device->pending_bios.head)) {
                        spin_lock(&device->io_lock);
                        requeue_list(pending_bios, pending, tail);
                        goto loop_lock;
                }

                cur = pending;
                pending = pending->bi_next;
                cur->bi_next = NULL;
                atomic_dec(&fs_info->nr_async_bios);

                if (atomic_read(&fs_info->nr_async_bios) < limit &&
                    waitqueue_active(&fs_info->async_submit_wait))
                        wake_up(&fs_info->async_submit_wait);

                BUG_ON(atomic_read(&cur->bi_cnt) == 0);

                /*
                 * if we're doing the sync list, record that our
                 * plug has some sync requests on it
                 *
                 * If we're doing the regular list and there are
                 * sync requests sitting around, unplug before
                 * we add more
                 */
                if (pending_bios == &device->pending_sync_bios) {
                        sync_pending = 1;
                } else if (sync_pending) {
                        blk_finish_plug(&plug);
                        blk_start_plug(&plug);
                        sync_pending = 0;
                }

                btrfsic_submit_bio(cur->bi_rw, cur);
                num_run++;
                batch_run++;
                if (need_resched())
                        cond_resched();

                /*
                 * we made progress, there is more work to do and the bdi
                 * is now congested.  Back off and let other work structs
                 * run instead
                 */
                if (pending && bdi_write_congested(bdi) && batch_run > 8 &&
                    fs_info->fs_devices->open_devices > 1) {
                        struct io_context *ioc;

                        ioc = current->io_context;

                        /*
                         * the main goal here is that we don't want to
                         * block if we're going to be able to submit
                         * more requests without blocking.
                         *
                         * This code does two great things, it pokes into
                         * the elevator code from a filesystem _and_
                         * it makes assumptions about how batching works.
                         */
                        if (ioc && ioc->nr_batch_requests > 0 &&
                            time_before(jiffies, ioc->last_waited + HZ/50UL) &&
                            (last_waited == 0 ||
                             ioc->last_waited == last_waited)) {
                                /*
                                 * we want to go through our batch of
                                 * requests and stop.  So, we copy out
                                 * the ioc->last_waited time and test
                                 * against it before looping
                                 */
                                last_waited = ioc->last_waited;
                                if (need_resched())
                                        cond_resched();
                                continue;
                        }
                        spin_lock(&device->io_lock);
                        requeue_list(pending_bios, pending, tail);
                        device->running_pending = 1;

                        spin_unlock(&device->io_lock);
                        btrfs_requeue_work(&device->work);
                        goto done;
                }
                /* unplug every 64 requests just for good measure */
                if (batch_run % 64 == 0) {
                        blk_finish_plug(&plug);
                        blk_start_plug(&plug);
                        sync_pending = 0;
                }
        }

        cond_resched();
        if (again)
                goto loop;

        spin_lock(&device->io_lock);
        if (device->pending_bios.head || device->pending_sync_bios.head)
                goto loop_lock;
        spin_unlock(&device->io_lock);

done:
        blk_finish_plug(&plug);
}

static void pending_bios_fn(struct btrfs_work *work)
{
        struct btrfs_device *device;

        device = container_of(work, struct btrfs_device, work);
        run_scheduled_bios(device);
}

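/*
 * add a newly scanned device to the in-memory list of known devices.
 * A btrfs_fs_devices entry is created the first time an fsid is seen;
 * rescanning a known device just updates its recorded path if needed.
 */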
static noinline int device_list_add(const char *path,
                           struct btrfs_super_block *disk_super,
                           u64 devid, struct btrfs_fs_devices **fs_devices_ret)
{
        struct btrfs_device *device;
        struct btrfs_fs_devices *fs_devices;
        struct rcu_string *name;
        u64 found_transid = btrfs_super_generation(disk_super);

        fs_devices = find_fsid(disk_super->fsid);
        if (!fs_devices) {
                fs_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS);
                if (!fs_devices)
                        return -ENOMEM;
                INIT_LIST_HEAD(&fs_devices->devices);
                INIT_LIST_HEAD(&fs_devices->alloc_list);
                list_add(&fs_devices->list, &fs_uuids);
                memcpy(fs_devices->fsid, disk_super->fsid, BTRFS_FSID_SIZE);
                fs_devices->latest_devid = devid;
                fs_devices->latest_trans = found_transid;
                mutex_init(&fs_devices->device_list_mutex);
                device = NULL;
        } else {
                device = __find_device(&fs_devices->devices, devid,
                                       disk_super->dev_item.uuid);
        }
        if (!device) {
                if (fs_devices->opened)
                        return -EBUSY;

                device = kzalloc(sizeof(*device), GFP_NOFS);
                if (!device) {
                        /* we can safely leave the fs_devices entry around */
                        return -ENOMEM;
                }
                device->devid = devid;
                device->dev_stats_valid = 0;
                device->work.func = pending_bios_fn;
                memcpy(device->uuid, disk_super->dev_item.uuid,
                       BTRFS_UUID_SIZE);
                spin_lock_init(&device->io_lock);

                name = rcu_string_strdup(path, GFP_NOFS);
                if (!name) {
                        kfree(device);
                        return -ENOMEM;
                }
                rcu_assign_pointer(device->name, name);
                INIT_LIST_HEAD(&device->dev_alloc_list);

                /* init readahead state */
                spin_lock_init(&device->reada_lock);
                device->reada_curr_zone = NULL;
                atomic_set(&device->reada_in_flight, 0);
                device->reada_next = 0;
                INIT_RADIX_TREE(&device->reada_zones, GFP_NOFS & ~__GFP_WAIT);
                INIT_RADIX_TREE(&device->reada_extents, GFP_NOFS & ~__GFP_WAIT);

                mutex_lock(&fs_devices->device_list_mutex);
                list_add_rcu(&device->dev_list, &fs_devices->devices);
                mutex_unlock(&fs_devices->device_list_mutex);

                device->fs_devices = fs_devices;
                fs_devices->num_devices++;
        } else if (!device->name || strcmp(device->name->str, path)) {
                name = rcu_string_strdup(path, GFP_NOFS);
                if (!name)
                        return -ENOMEM;
                rcu_string_free(device->name);
                rcu_assign_pointer(device->name, name);
                if (device->missing) {
                        fs_devices->missing_devices--;
                        device->missing = 0;
                }
        }

        if (found_transid > fs_devices->latest_trans) {
                fs_devices->latest_devid = devid;
                fs_devices->latest_trans = found_transid;
        }
        *fs_devices_ret = fs_devices;
        return 0;
}

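/*
 * duplicate an fs_devices structure and all of its devices; the seeding
 * code uses this to keep a private copy of the scanned device list
 */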
static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
{
        struct btrfs_fs_devices *fs_devices;
        struct btrfs_device *device;
        struct btrfs_device *orig_dev;

        fs_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS);
        if (!fs_devices)
                return ERR_PTR(-ENOMEM);

        INIT_LIST_HEAD(&fs_devices->devices);
        INIT_LIST_HEAD(&fs_devices->alloc_list);
        INIT_LIST_HEAD(&fs_devices->list);
        mutex_init(&fs_devices->device_list_mutex);
        fs_devices->latest_devid = orig->latest_devid;
        fs_devices->latest_trans = orig->latest_trans;
        fs_devices->total_devices = orig->total_devices;
        memcpy(fs_devices->fsid, orig->fsid, sizeof(fs_devices->fsid));

        /* We hold the volume lock, so it is safe to walk the device list. */
        list_for_each_entry(orig_dev, &orig->devices, dev_list) {
                struct rcu_string *name;

                device = kzalloc(sizeof(*device), GFP_NOFS);
                if (!device)
                        goto error;

                /*
                 * This is ok to do without the rcu read lock held because
                 * we hold the uuid mutex, so nothing we touch in here is
                 * going to disappear.
                 */
                name = rcu_string_strdup(orig_dev->name->str, GFP_NOFS);
                if (!name) {
                        kfree(device);
                        goto error;
                }
                rcu_assign_pointer(device->name, name);

                device->devid = orig_dev->devid;
                device->work.func = pending_bios_fn;
                memcpy(device->uuid, orig_dev->uuid, sizeof(device->uuid));
                spin_lock_init(&device->io_lock);
                INIT_LIST_HEAD(&device->dev_list);
                INIT_LIST_HEAD(&device->dev_alloc_list);

                list_add(&device->dev_list, &fs_devices->devices);
                device->fs_devices = fs_devices;
                fs_devices->num_devices++;
        }
        return fs_devices;
error:
        free_fs_devices(fs_devices);
        return ERR_PTR(-ENOMEM);
}

void btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices)
{
        struct btrfs_device *device, *next;

        struct block_device *latest_bdev = NULL;
        u64 latest_devid = 0;
        u64 latest_transid = 0;

        mutex_lock(&uuid_mutex);
again:
        /* This is the initialized path, it is safe to release the devices. */
        list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) {
                if (device->in_fs_metadata) {
                        if (!latest_transid ||
                            device->generation > latest_transid) {
                                latest_devid = device->devid;
                                latest_transid = device->generation;
                                latest_bdev = device->bdev;
                        }
                        continue;
                }

                if (device->bdev) {
                        blkdev_put(device->bdev, device->mode);
                        device->bdev = NULL;
                        fs_devices->open_devices--;
                }
                if (device->writeable) {
                        list_del_init(&device->dev_alloc_list);
                        device->writeable = 0;
                        fs_devices->rw_devices--;
                }
                list_del_init(&device->dev_list);
                fs_devices->num_devices--;
                rcu_string_free(device->name);
                kfree(device);
        }

        if (fs_devices->seed) {
                fs_devices = fs_devices->seed;
                goto again;
        }

        fs_devices->latest_bdev = latest_bdev;
        fs_devices->latest_devid = latest_devid;
        fs_devices->latest_trans = latest_transid;

        mutex_unlock(&uuid_mutex);
}

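/*
 * devices are freed in two stages: free_device() runs as the RCU callback
 * and only schedules __free_device(), so that blkdev_put() is called from
 * process context instead of from RCU callback context
 */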
static void __free_device(struct work_struct *work)
{
        struct btrfs_device *device;

        device = container_of(work, struct btrfs_device, rcu_work);

        if (device->bdev)
                blkdev_put(device->bdev, device->mode);

        rcu_string_free(device->name);
        kfree(device);
}

static void free_device(struct rcu_head *head)
{
        struct btrfs_device *device;

        device = container_of(head, struct btrfs_device, rcu);

        INIT_WORK(&device->rcu_work, __free_device);
        schedule_work(&device->rcu_work);
}

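/*
 * drop one open reference; on the last close, replace each device with a
 * closed copy under RCU and free the original via call_rcu()
 */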
static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
{
        struct btrfs_device *device;

        if (--fs_devices->opened > 0)
                return 0;

        mutex_lock(&fs_devices->device_list_mutex);
        list_for_each_entry(device, &fs_devices->devices, dev_list) {
                struct btrfs_device *new_device;
                struct rcu_string *name;

                if (device->bdev)
                        fs_devices->open_devices--;

                if (device->writeable) {
                        list_del_init(&device->dev_alloc_list);
                        fs_devices->rw_devices--;
                }

                if (device->can_discard)
                        fs_devices->num_can_discard--;

                new_device = kmalloc(sizeof(*new_device), GFP_NOFS);
                BUG_ON(!new_device); /* -ENOMEM */
                memcpy(new_device, device, sizeof(*new_device));

                /* Safe because we are under uuid_mutex */
                name = rcu_string_strdup(device->name->str, GFP_NOFS);
                BUG_ON(device->name && !name); /* -ENOMEM */
                rcu_assign_pointer(new_device->name, name);
                new_device->bdev = NULL;
                new_device->writeable = 0;
                new_device->in_fs_metadata = 0;
                new_device->can_discard = 0;
                list_replace_rcu(&device->dev_list, &new_device->dev_list);

                call_rcu(&device->rcu, free_device);
        }
        mutex_unlock(&fs_devices->device_list_mutex);

        WARN_ON(fs_devices->open_devices);
        WARN_ON(fs_devices->rw_devices);
        fs_devices->opened = 0;
        fs_devices->seeding = 0;

        return 0;
}

int btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
{
        struct btrfs_fs_devices *seed_devices = NULL;
        int ret;

        mutex_lock(&uuid_mutex);
        ret = __btrfs_close_devices(fs_devices);
        if (!fs_devices->opened) {
                seed_devices = fs_devices->seed;
                fs_devices->seed = NULL;
        }
        mutex_unlock(&uuid_mutex);

        while (seed_devices) {
                fs_devices = seed_devices;
                seed_devices = fs_devices->seed;
                __btrfs_close_devices(fs_devices);
                free_fs_devices(fs_devices);
        }
        return ret;
}

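/*
 * open every device on the list, verify its super block against the
 * in-memory device, and track the bdev with the highest generation
 */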
static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
                                fmode_t flags, void *holder)
{
        struct request_queue *q;
        struct block_device *bdev;
        struct list_head *head = &fs_devices->devices;
        struct btrfs_device *device;
        struct block_device *latest_bdev = NULL;
        struct buffer_head *bh;
        struct btrfs_super_block *disk_super;
        u64 latest_devid = 0;
        u64 latest_transid = 0;
        u64 devid;
        int seeding = 1;
        int ret = 0;

        flags |= FMODE_EXCL;

        list_for_each_entry(device, head, dev_list) {
                if (device->bdev)
                        continue;
                if (!device->name)
                        continue;

                bdev = blkdev_get_by_path(device->name->str, flags, holder);
                if (IS_ERR(bdev)) {
                        printk(KERN_INFO "open %s failed\n", device->name->str);
                        goto error;
                }
                filemap_write_and_wait(bdev->bd_inode->i_mapping);
                invalidate_bdev(bdev);
                set_blocksize(bdev, 4096);

                bh = btrfs_read_dev_super(bdev);
                if (!bh)
                        goto error_close;

                disk_super = (struct btrfs_super_block *)bh->b_data;
                devid = btrfs_stack_device_id(&disk_super->dev_item);
                if (devid != device->devid)
                        goto error_brelse;

                if (memcmp(device->uuid, disk_super->dev_item.uuid,
                           BTRFS_UUID_SIZE))
                        goto error_brelse;

                device->generation = btrfs_super_generation(disk_super);
                if (!latest_transid || device->generation > latest_transid) {
                        latest_devid = devid;
                        latest_transid = device->generation;
                        latest_bdev = bdev;
                }

                if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) {
                        device->writeable = 0;
                } else {
                        device->writeable = !bdev_read_only(bdev);
                        seeding = 0;
                }

                q = bdev_get_queue(bdev);
                if (blk_queue_discard(q)) {
                        device->can_discard = 1;
                        fs_devices->num_can_discard++;
                }

                device->bdev = bdev;
                device->in_fs_metadata = 0;
                device->mode = flags;

                if (!blk_queue_nonrot(bdev_get_queue(bdev)))
                        fs_devices->rotating = 1;

                fs_devices->open_devices++;
                if (device->writeable) {
                        fs_devices->rw_devices++;
                        list_add(&device->dev_alloc_list,
                                 &fs_devices->alloc_list);
                }
                brelse(bh);
                continue;

error_brelse:
                brelse(bh);
error_close:
                blkdev_put(bdev, flags);
error:
                continue;
        }
        if (fs_devices->open_devices == 0) {
                ret = -EINVAL;
                goto out;
        }
        fs_devices->seeding = seeding;
        fs_devices->opened = 1;
        fs_devices->latest_bdev = latest_bdev;
        fs_devices->latest_devid = latest_devid;
        fs_devices->latest_trans = latest_transid;
        fs_devices->total_rw_bytes = 0;
out:
        return ret;
}

int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
                       fmode_t flags, void *holder)
{
        int ret;

        mutex_lock(&uuid_mutex);
        if (fs_devices->opened) {
                fs_devices->opened++;
                ret = 0;
        } else {
                ret = __btrfs_open_devices(fs_devices, flags, holder);
        }
        mutex_unlock(&uuid_mutex);
        return ret;
}

int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
                          struct btrfs_fs_devices **fs_devices_ret)
{
        struct btrfs_super_block *disk_super;
        struct block_device *bdev;
        struct buffer_head *bh;
        int ret;
        u64 devid;
        u64 transid;
        u64 total_devices;

        flags |= FMODE_EXCL;
        bdev = blkdev_get_by_path(path, flags, holder);

        if (IS_ERR(bdev)) {
                ret = PTR_ERR(bdev);
                goto error;
        }

        mutex_lock(&uuid_mutex);
        ret = set_blocksize(bdev, 4096);
        if (ret)
                goto error_close;
        bh = btrfs_read_dev_super(bdev);
        if (!bh) {
                ret = -EINVAL;
                goto error_close;
        }
        disk_super = (struct btrfs_super_block *)bh->b_data;
        devid = btrfs_stack_device_id(&disk_super->dev_item);
        transid = btrfs_super_generation(disk_super);
        total_devices = btrfs_super_num_devices(disk_super);
        if (disk_super->label[0])
                printk(KERN_INFO "device label %s ", disk_super->label);
        else
                printk(KERN_INFO "device fsid %pU ", disk_super->fsid);
        printk(KERN_CONT "devid %llu transid %llu %s\n",
               (unsigned long long)devid, (unsigned long long)transid, path);
        ret = device_list_add(path, disk_super, devid, fs_devices_ret);
        if (!ret && fs_devices_ret)
                (*fs_devices_ret)->total_devices = total_devices;
        brelse(bh);
error_close:
        mutex_unlock(&uuid_mutex);
        blkdev_put(bdev, flags);
error:
        return ret;
}

/* helper to account the used device space in the range */
int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start,
                                   u64 end, u64 *length)
{
        struct btrfs_key key;
        struct btrfs_root *root = device->dev_root;
        struct btrfs_dev_extent *dev_extent;
        struct btrfs_path *path;
        u64 extent_end;
        int ret;
        int slot;
        struct extent_buffer *l;

        *length = 0;

        if (start >= device->total_bytes)
                return 0;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;
        path->reada = 2;

        key.objectid = device->devid;
        key.offset = start;
        key.type = BTRFS_DEV_EXTENT_KEY;

        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
        if (ret < 0)
                goto out;
        if (ret > 0) {
                ret = btrfs_previous_item(root, path, key.objectid, key.type);
                if (ret < 0)
                        goto out;
        }

        while (1) {
                l = path->nodes[0];
                slot = path->slots[0];
                if (slot >= btrfs_header_nritems(l)) {
                        ret = btrfs_next_leaf(root, path);
                        if (ret == 0)
                                continue;
                        if (ret < 0)
                                goto out;

                        break;
                }
                btrfs_item_key_to_cpu(l, &key, slot);

                if (key.objectid < device->devid)
                        goto next;

                if (key.objectid > device->devid)
                        break;

                if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY)
                        goto next;

                dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
                extent_end = key.offset + btrfs_dev_extent_length(l,
                                                                  dev_extent);
                if (key.offset <= start && extent_end > end) {
                        *length = end - start + 1;
                        break;
                } else if (key.offset <= start && extent_end > start)
                        *length += extent_end - start;
                else if (key.offset > start && extent_end <= end)
                        *length += extent_end - key.offset;
                else if (key.offset > start && key.offset <= end) {
                        *length += end - key.offset + 1;
                        break;
                } else if (key.offset > end)
                        break;

next:
                path->slots[0]++;
        }
        ret = 0;
out:
        btrfs_free_path(path);
        return ret;
}

/*
 * find_free_dev_extent - find free space in the specified device
 * @device:     the device in which we search for the free space
 * @num_bytes:  the size of the free space that we need
 * @start:      store the start of the free space
 * @len:        the size of the free space that we find, or the size of the
 *              largest free space if we don't find anything suitable
 *
 * this uses a pretty simple search, the expectation is that it is
 * called very infrequently and that a given device has a small number
 * of extents
 *
 * @start is used to store the start of the free space if we find it. But if
 * we don't find suitable free space, it will be used to store the start
 * position of the largest free space instead.
 *
 * @len is used to store the size of the free space that we find.
 * But if we don't find suitable free space, it is used to store the size of
 * the largest free space instead.
 */
int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes,
                         u64 *start, u64 *len)
{
        struct btrfs_key key;
        struct btrfs_root *root = device->dev_root;
        struct btrfs_dev_extent *dev_extent;
        struct btrfs_path *path;
        u64 hole_size;
        u64 max_hole_start;
        u64 max_hole_size;
        u64 extent_end;
        u64 search_start;
        u64 search_end = device->total_bytes;
        int ret;
        int slot;
        struct extent_buffer *l;

        /* FIXME use last free of some kind */

        /* we don't want to overwrite the superblock on the drive,
         * so we make sure to start at an offset of at least 1MB
         */
        search_start = max(root->fs_info->alloc_start, 1024ull * 1024);

        max_hole_start = search_start;
        max_hole_size = 0;
        hole_size = 0;

        if (search_start >= search_end) {
                ret = -ENOSPC;
                goto error;
        }

        path = btrfs_alloc_path();
        if (!path) {
                ret = -ENOMEM;
                goto error;
        }
        path->reada = 2;

        key.objectid = device->devid;
        key.offset = search_start;
        key.type = BTRFS_DEV_EXTENT_KEY;

        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
        if (ret < 0)
                goto out;
        if (ret > 0) {
                ret = btrfs_previous_item(root, path, key.objectid, key.type);
                if (ret < 0)
                        goto out;
        }

        while (1) {
                l = path->nodes[0];
                slot = path->slots[0];
                if (slot >= btrfs_header_nritems(l)) {
                        ret = btrfs_next_leaf(root, path);
                        if (ret == 0)
                                continue;
                        if (ret < 0)
                                goto out;

                        break;
                }
                btrfs_item_key_to_cpu(l, &key, slot);

                if (key.objectid < device->devid)
                        goto next;

                if (key.objectid > device->devid)
                        break;

                if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY)
                        goto next;

                if (key.offset > search_start) {
                        hole_size = key.offset - search_start;

                        if (hole_size > max_hole_size) {
                                max_hole_start = search_start;
                                max_hole_size = hole_size;
                        }

                        /*
                         * If this free space is greater than what we need,
                         * it must be the largest free space that we have
                         * found so far, so max_hole_start must point to the
                         * start of this free space and the length of this
                         * free space is stored in max_hole_size. Thus, we
                         * return max_hole_start and max_hole_size and go
                         * back to the caller.
                         */
                        if (hole_size >= num_bytes) {
                                ret = 0;
                                goto out;
                        }
                }

                dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
                extent_end = key.offset + btrfs_dev_extent_length(l,
                                                                  dev_extent);
                if (extent_end > search_start)
                        search_start = extent_end;
next:
                path->slots[0]++;
                cond_resched();
        }

        /*
         * At this point, search_start should be the end of
         * allocated dev extents, and when shrinking the device,
         * search_end may be smaller than search_start.
         */
        if (search_end > search_start)
                hole_size = search_end - search_start;

        if (hole_size > max_hole_size) {
                max_hole_start = search_start;
                max_hole_size = hole_size;
        }

        /* See above. */
        if (hole_size < num_bytes)
                ret = -ENOSPC;
        else
                ret = 0;

out:
        btrfs_free_path(path);
error:
        *start = max_hole_start;
        if (len)
                *len = max_hole_size;
        return ret;
}

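/*
 * remove the dev extent item that covers @start on @device and return
 * the freed bytes to the free_chunk_space counter
 */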
static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
                          struct btrfs_device *device,
                          u64 start)
{
        int ret;
        struct btrfs_path *path;
        struct btrfs_root *root = device->dev_root;
        struct btrfs_key key;
        struct btrfs_key found_key;
        struct extent_buffer *leaf = NULL;
        struct btrfs_dev_extent *extent = NULL;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        key.objectid = device->devid;
        key.offset = start;
        key.type = BTRFS_DEV_EXTENT_KEY;
again:
        ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
        if (ret > 0) {
                ret = btrfs_previous_item(root, path, key.objectid,
                                          BTRFS_DEV_EXTENT_KEY);
                if (ret)
                        goto out;
                leaf = path->nodes[0];
                btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
                extent = btrfs_item_ptr(leaf, path->slots[0],
                                        struct btrfs_dev_extent);
                BUG_ON(found_key.offset > start || found_key.offset +
                       btrfs_dev_extent_length(leaf, extent) < start);
                key = found_key;
                btrfs_release_path(path);
                goto again;
        } else if (ret == 0) {
                leaf = path->nodes[0];
                extent = btrfs_item_ptr(leaf, path->slots[0],
                                        struct btrfs_dev_extent);
        } else {
                btrfs_error(root->fs_info, ret, "Slot search failed");
                goto out;
        }

        if (device->bytes_used > 0) {
                u64 len = btrfs_dev_extent_length(leaf, extent);
                device->bytes_used -= len;
                spin_lock(&root->fs_info->free_chunk_lock);
                root->fs_info->free_chunk_space += len;
                spin_unlock(&root->fs_info->free_chunk_lock);
        }
        ret = btrfs_del_item(trans, root, path);
        if (ret) {
                btrfs_error(root->fs_info, ret,
                            "Failed to remove dev extent item");
        }
out:
        btrfs_free_path(path);
        return ret;
}

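/*
 * insert a dev extent item mapping [@start, @start + @num_bytes) on
 * @device to the chunk described by the chunk_* arguments
 */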
int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
                           struct btrfs_device *device,
                           u64 chunk_tree, u64 chunk_objectid,
                           u64 chunk_offset, u64 start, u64 num_bytes)
{
        int ret;
        struct btrfs_path *path;
        struct btrfs_root *root = device->dev_root;
        struct btrfs_dev_extent *extent;
        struct extent_buffer *leaf;
        struct btrfs_key key;

        WARN_ON(!device->in_fs_metadata);
        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        key.objectid = device->devid;
        key.offset = start;
        key.type = BTRFS_DEV_EXTENT_KEY;
        ret = btrfs_insert_empty_item(trans, root, path, &key,
                                      sizeof(*extent));
        if (ret)
                goto out;

        leaf = path->nodes[0];
        extent = btrfs_item_ptr(leaf, path->slots[0],
                                struct btrfs_dev_extent);
        btrfs_set_dev_extent_chunk_tree(leaf, extent, chunk_tree);
        btrfs_set_dev_extent_chunk_objectid(leaf, extent, chunk_objectid);
        btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset);

        write_extent_buffer(leaf, root->fs_info->chunk_tree_uuid,
                    (unsigned long)btrfs_dev_extent_chunk_tree_uuid(extent),
                    BTRFS_UUID_SIZE);

        btrfs_set_dev_extent_length(leaf, extent, num_bytes);
        btrfs_mark_buffer_dirty(leaf);
out:
        btrfs_free_path(path);
        return ret;
}

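/*
 * find the logical offset just past the last allocated chunk for
 * @objectid, which is where the next chunk can be placed
 */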
static noinline int find_next_chunk(struct btrfs_root *root,
                                    u64 objectid, u64 *offset)
{
        struct btrfs_path *path;
        int ret;
        struct btrfs_key key;
        struct btrfs_chunk *chunk;
        struct btrfs_key found_key;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        key.objectid = objectid;
        key.offset = (u64)-1;
        key.type = BTRFS_CHUNK_ITEM_KEY;

        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
        if (ret < 0)
                goto error;

        BUG_ON(ret == 0); /* Corruption */

        ret = btrfs_previous_item(root, path, 0, BTRFS_CHUNK_ITEM_KEY);
        if (ret) {
                *offset = 0;
        } else {
                btrfs_item_key_to_cpu(path->nodes[0], &found_key,
                                      path->slots[0]);
                if (found_key.objectid != objectid)
                        *offset = 0;
                else {
                        chunk = btrfs_item_ptr(path->nodes[0], path->slots[0],
                                               struct btrfs_chunk);
                        *offset = found_key.offset +
                                btrfs_chunk_length(path->nodes[0], chunk);
                }
        }
        ret = 0;
error:
        btrfs_free_path(path);
        return ret;
}

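/* find the lowest free devid: one past the highest devid in the tree */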
static noinline int find_next_devid(struct btrfs_root *root, u64 *objectid)
{
        int ret;
        struct btrfs_key key;
        struct btrfs_key found_key;
        struct btrfs_path *path;

        root = root->fs_info->chunk_root;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
        key.type = BTRFS_DEV_ITEM_KEY;
        key.offset = (u64)-1;

        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
        if (ret < 0)
                goto error;

        BUG_ON(ret == 0); /* Corruption */

        ret = btrfs_previous_item(root, path, BTRFS_DEV_ITEMS_OBJECTID,
                                  BTRFS_DEV_ITEM_KEY);
        if (ret) {
                *objectid = 1;
        } else {
                btrfs_item_key_to_cpu(path->nodes[0], &found_key,
                                      path->slots[0]);
                *objectid = found_key.offset + 1;
        }
        ret = 0;
error:
        btrfs_free_path(path);
        return ret;
}

/*
 * the device information is stored in the chunk root;
 * the btrfs_device struct should be fully filled in before calling this
 */
int btrfs_add_device(struct btrfs_trans_handle *trans,
                     struct btrfs_root *root,
                     struct btrfs_device *device)
{
        int ret;
        struct btrfs_path *path;
        struct btrfs_dev_item *dev_item;
        struct extent_buffer *leaf;
        struct btrfs_key key;
        unsigned long ptr;

        root = root->fs_info->chunk_root;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
        key.type = BTRFS_DEV_ITEM_KEY;
        key.offset = device->devid;

        ret = btrfs_insert_empty_item(trans, root, path, &key,
                                      sizeof(*dev_item));
        if (ret)
                goto out;

        leaf = path->nodes[0];
        dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);

        btrfs_set_device_id(leaf, dev_item, device->devid);
        btrfs_set_device_generation(leaf, dev_item, 0);
        btrfs_set_device_type(leaf, dev_item, device->type);
        btrfs_set_device_io_align(leaf, dev_item, device->io_align);
        btrfs_set_device_io_width(leaf, dev_item, device->io_width);
        btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
        btrfs_set_device_total_bytes(leaf, dev_item, device->total_bytes);
        btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used);
        btrfs_set_device_group(leaf, dev_item, 0);
        btrfs_set_device_seek_speed(leaf, dev_item, 0);
        btrfs_set_device_bandwidth(leaf, dev_item, 0);
        btrfs_set_device_start_offset(leaf, dev_item, 0);

        ptr = (unsigned long)btrfs_device_uuid(dev_item);
        write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
        ptr = (unsigned long)btrfs_device_fsid(dev_item);
        write_extent_buffer(leaf, root->fs_info->fsid, ptr, BTRFS_UUID_SIZE);
        btrfs_mark_buffer_dirty(leaf);

        ret = 0;
out:
        btrfs_free_path(path);
        return ret;
}

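/* delete the dev item for @device from the chunk tree in its own transaction */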
static int btrfs_rm_dev_item(struct btrfs_root *root,
                             struct btrfs_device *device)
{
        int ret;
        struct btrfs_path *path;
        struct btrfs_key key;
        struct btrfs_trans_handle *trans;

        root = root->fs_info->chunk_root;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        trans = btrfs_start_transaction(root, 0);
        if (IS_ERR(trans)) {
                btrfs_free_path(path);
                return PTR_ERR(trans);
        }
        key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
        key.type = BTRFS_DEV_ITEM_KEY;
        key.offset = device->devid;
        lock_chunks(root);

        ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
        if (ret < 0)
                goto out;

        if (ret > 0) {
                ret = -ENOENT;
                goto out;
        }

        ret = btrfs_del_item(trans, root, path);
        if (ret)
                goto out;
out:
        btrfs_free_path(path);
        unlock_chunks(root);
        btrfs_commit_transaction(trans, root);
        return ret;
}

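/*
 * remove a device from the filesystem: check that the redundancy
 * constraints still hold, migrate all of its extents away, delete its
 * items and finally wipe the super block magic so the disk is no longer
 * detected as part of the filesystem
 */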
1311 int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1312 {
1313         struct btrfs_device *device;
1314         struct btrfs_device *next_device;
1315         struct block_device *bdev;
1316         struct buffer_head *bh = NULL;
1317         struct btrfs_super_block *disk_super;
1318         struct btrfs_fs_devices *cur_devices;
1319         u64 all_avail;
1320         u64 devid;
1321         u64 num_devices;
1322         u8 *dev_uuid;
1323         int ret = 0;
1324         bool clear_super = false;
1325
1326         mutex_lock(&uuid_mutex);
1327
1328         all_avail = root->fs_info->avail_data_alloc_bits |
1329                 root->fs_info->avail_system_alloc_bits |
1330                 root->fs_info->avail_metadata_alloc_bits;
1331
1332         if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) &&
1333             root->fs_info->fs_devices->num_devices <= 4) {
1334                 printk(KERN_ERR "btrfs: unable to go below four devices "
1335                        "on raid10\n");
1336                 ret = -EINVAL;
1337                 goto out;
1338         }
1339
1340         if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) &&
1341             root->fs_info->fs_devices->num_devices <= 2) {
1342                 printk(KERN_ERR "btrfs: unable to go below two "
1343                        "devices on raid1\n");
1344                 ret = -EINVAL;
1345                 goto out;
1346         }
1347
1348         if (strcmp(device_path, "missing") == 0) {
1349                 struct list_head *devices;
1350                 struct btrfs_device *tmp;
1351
1352                 device = NULL;
1353                 devices = &root->fs_info->fs_devices->devices;
1354                 /*
1355                  * It is safe to read the devices since the volume_mutex
1356                  * is held.
1357                  */
1358                 list_for_each_entry(tmp, devices, dev_list) {
1359                         if (tmp->in_fs_metadata && !tmp->bdev) {
1360                                 device = tmp;
1361                                 break;
1362                         }
1363                 }
1364                 bdev = NULL;
1365                 bh = NULL;
1366                 disk_super = NULL;
1367                 if (!device) {
1368                         printk(KERN_ERR "btrfs: no missing devices found to "
1369                                "remove\n");
1370                         goto out;
1371                 }
1372         } else {
1373                 bdev = blkdev_get_by_path(device_path, FMODE_READ | FMODE_EXCL,
1374                                           root->fs_info->bdev_holder);
1375                 if (IS_ERR(bdev)) {
1376                         ret = PTR_ERR(bdev);
1377                         goto out;
1378                 }
1379
1380                 set_blocksize(bdev, 4096);
1381                 invalidate_bdev(bdev);
1382                 bh = btrfs_read_dev_super(bdev);
1383                 if (!bh) {
1384                         ret = -EINVAL;
1385                         goto error_close;
1386                 }
1387                 disk_super = (struct btrfs_super_block *)bh->b_data;
1388                 devid = btrfs_stack_device_id(&disk_super->dev_item);
1389                 dev_uuid = disk_super->dev_item.uuid;
1390                 device = btrfs_find_device(root, devid, dev_uuid,
1391                                            disk_super->fsid);
1392                 if (!device) {
1393                         ret = -ENOENT;
1394                         goto error_brelse;
1395                 }
1396         }
1397
1398         if (device->writeable && root->fs_info->fs_devices->rw_devices == 1) {
1399                 printk(KERN_ERR "btrfs: unable to remove the only writeable "
1400                        "device\n");
1401                 ret = -EINVAL;
1402                 goto error_brelse;
1403         }
1404
1405         if (device->writeable) {
1406                 lock_chunks(root);
1407                 list_del_init(&device->dev_alloc_list);
1408                 unlock_chunks(root);
1409                 root->fs_info->fs_devices->rw_devices--;
1410                 clear_super = true;
1411         }
1412
1413         ret = btrfs_shrink_device(device, 0);
1414         if (ret)
1415                 goto error_undo;
1416
1417         ret = btrfs_rm_dev_item(root->fs_info->chunk_root, device);
1418         if (ret)
1419                 goto error_undo;
1420
1421         spin_lock(&root->fs_info->free_chunk_lock);
1422         root->fs_info->free_chunk_space = device->total_bytes -
1423                 device->bytes_used;
1424         spin_unlock(&root->fs_info->free_chunk_lock);
1425
1426         device->in_fs_metadata = 0;
1427         btrfs_scrub_cancel_dev(root, device);
1428
1429         /*
1430          * the device list mutex makes sure that we don't change
1431          * the device list while someone else is writing out all
1432          * the device supers.
1433          */
1434
1435         cur_devices = device->fs_devices;
1436         mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
1437         list_del_rcu(&device->dev_list);
1438
1439         device->fs_devices->num_devices--;
1440         device->fs_devices->total_devices--;
1441
1442         if (device->missing)
1443                 root->fs_info->fs_devices->missing_devices--;
1444
1445         next_device = list_entry(root->fs_info->fs_devices->devices.next,
1446                                  struct btrfs_device, dev_list);
1447         if (device->bdev == root->fs_info->sb->s_bdev)
1448                 root->fs_info->sb->s_bdev = next_device->bdev;
1449         if (device->bdev == root->fs_info->fs_devices->latest_bdev)
1450                 root->fs_info->fs_devices->latest_bdev = next_device->bdev;
1451
1452         if (device->bdev)
1453                 device->fs_devices->open_devices--;
1454
1455         call_rcu(&device->rcu, free_device);
1456         mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
1457
1458         num_devices = btrfs_super_num_devices(root->fs_info->super_copy) - 1;
1459         btrfs_set_super_num_devices(root->fs_info->super_copy, num_devices);
1460
1461         if (cur_devices->open_devices == 0) {
1462                 struct btrfs_fs_devices *fs_devices;
1463                 fs_devices = root->fs_info->fs_devices;
1464                 while (fs_devices) {
1465                         if (fs_devices->seed == cur_devices)
1466                                 break;
1467                         fs_devices = fs_devices->seed;
1468                 }
1469                 fs_devices->seed = cur_devices->seed;
1470                 cur_devices->seed = NULL;
1471                 lock_chunks(root);
1472                 __btrfs_close_devices(cur_devices);
1473                 unlock_chunks(root);
1474                 free_fs_devices(cur_devices);
1475         }
1476
1477         /*
1478          * at this point, the device is zero sized.  We want to
1479          * remove it from the devices list and zero out the old super
1480          */
1481         if (clear_super) {
1482                 /* make sure this device isn't detected as part of
1483                  * the FS anymore
1484                  */
1485                 memset(&disk_super->magic, 0, sizeof(disk_super->magic));
1486                 set_buffer_dirty(bh);
1487                 sync_dirty_buffer(bh);
1488         }
1489
1490         ret = 0;
1491
1492 error_brelse:
1493         brelse(bh);
1494 error_close:
1495         if (bdev)
1496                 blkdev_put(bdev, FMODE_READ | FMODE_EXCL);
1497 out:
1498         mutex_unlock(&uuid_mutex);
1499         return ret;
1500 error_undo:
1501         if (device->writeable) {
1502                 lock_chunks(root);
1503                 list_add(&device->dev_alloc_list,
1504                          &root->fs_info->fs_devices->alloc_list);
1505                 unlock_chunks(root);
1506                 root->fs_info->fs_devices->rw_devices++;
1507         }
1508         goto error_brelse;
1509 }
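
/*
 * Illustrative usage: device removal is driven from userspace via the
 * BTRFS_IOC_RM_DEV ioctl, roughly
 *
 *     btrfs device delete /dev/sdb /mnt
 *
 * which ends up in btrfs_rm_device() with device_path = "/dev/sdb"
 * (the device name here is an example only).
 */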
1510
1511 /*
1512  * does all the dirty work required for changing the file system's UUID.
1513  */
1514 static int btrfs_prepare_sprout(struct btrfs_root *root)
1515 {
1516         struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
1517         struct btrfs_fs_devices *old_devices;
1518         struct btrfs_fs_devices *seed_devices;
1519         struct btrfs_super_block *disk_super = root->fs_info->super_copy;
1520         struct btrfs_device *device;
1521         u64 super_flags;
1522
1523         BUG_ON(!mutex_is_locked(&uuid_mutex));
1524         if (!fs_devices->seeding)
1525                 return -EINVAL;
1526
1527         seed_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS);
1528         if (!seed_devices)
1529                 return -ENOMEM;
1530
1531         old_devices = clone_fs_devices(fs_devices);
1532         if (IS_ERR(old_devices)) {
1533                 kfree(seed_devices);
1534                 return PTR_ERR(old_devices);
1535         }
1536
1537         list_add(&old_devices->list, &fs_uuids);
1538
1539         memcpy(seed_devices, fs_devices, sizeof(*seed_devices));
1540         seed_devices->opened = 1;
1541         INIT_LIST_HEAD(&seed_devices->devices);
1542         INIT_LIST_HEAD(&seed_devices->alloc_list);
1543         mutex_init(&seed_devices->device_list_mutex);
1544
1545         mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
1546         list_splice_init_rcu(&fs_devices->devices, &seed_devices->devices,
1547                               synchronize_rcu);
1548         mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
1549
1550         list_splice_init(&fs_devices->alloc_list, &seed_devices->alloc_list);
1551         list_for_each_entry(device, &seed_devices->devices, dev_list) {
1552                 device->fs_devices = seed_devices;
1553         }
1554
1555         fs_devices->seeding = 0;
1556         fs_devices->num_devices = 0;
1557         fs_devices->open_devices = 0;
1558         fs_devices->total_devices = 0;
1559         fs_devices->seed = seed_devices;
1560
1561         generate_random_uuid(fs_devices->fsid);
1562         memcpy(root->fs_info->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
1563         memcpy(disk_super->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
1564         super_flags = btrfs_super_flags(disk_super) &
1565                       ~BTRFS_SUPER_FLAG_SEEDING;
1566         btrfs_set_super_flags(disk_super, super_flags);
1567
1568         return 0;
1569 }
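
/*
 * Illustrative seeding workflow (assuming the standard userspace tools;
 * device names are examples only):
 *
 *     btrfstune -S 1 /dev/sdb          # mark /dev/sdb as a seed device
 *     mount /dev/sdb /mnt              # a seed device mounts read-only
 *     btrfs device add /dev/sdc /mnt   # sprout: btrfs_prepare_sprout()
 *
 * After the sprout the writable fs gets a fresh fsid while the read-only
 * seed keeps its own, which is why the old devices are moved to a
 * separate btrfs_fs_devices above.
 */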
1570
1571 /*
1572  * store the expected generation for seed devices in device items.
1573  */
1574 static int btrfs_finish_sprout(struct btrfs_trans_handle *trans,
1575                                struct btrfs_root *root)
1576 {
1577         struct btrfs_path *path;
1578         struct extent_buffer *leaf;
1579         struct btrfs_dev_item *dev_item;
1580         struct btrfs_device *device;
1581         struct btrfs_key key;
1582         u8 fs_uuid[BTRFS_UUID_SIZE];
1583         u8 dev_uuid[BTRFS_UUID_SIZE];
1584         u64 devid;
1585         int ret;
1586
1587         path = btrfs_alloc_path();
1588         if (!path)
1589                 return -ENOMEM;
1590
1591         root = root->fs_info->chunk_root;
1592         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1593         key.offset = 0;
1594         key.type = BTRFS_DEV_ITEM_KEY;
1595
1596         while (1) {
1597                 ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
1598                 if (ret < 0)
1599                         goto error;
1600
1601                 leaf = path->nodes[0];
1602 next_slot:
1603                 if (path->slots[0] >= btrfs_header_nritems(leaf)) {
1604                         ret = btrfs_next_leaf(root, path);
1605                         if (ret > 0)
1606                                 break;
1607                         if (ret < 0)
1608                                 goto error;
1609                         leaf = path->nodes[0];
1610                         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1611                         btrfs_release_path(path);
1612                         continue;
1613                 }
1614
1615                 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1616                 if (key.objectid != BTRFS_DEV_ITEMS_OBJECTID ||
1617                     key.type != BTRFS_DEV_ITEM_KEY)
1618                         break;
1619
1620                 dev_item = btrfs_item_ptr(leaf, path->slots[0],
1621                                           struct btrfs_dev_item);
1622                 devid = btrfs_device_id(leaf, dev_item);
1623                 read_extent_buffer(leaf, dev_uuid,
1624                                    (unsigned long)btrfs_device_uuid(dev_item),
1625                                    BTRFS_UUID_SIZE);
1626                 read_extent_buffer(leaf, fs_uuid,
1627                                    (unsigned long)btrfs_device_fsid(dev_item),
1628                                    BTRFS_UUID_SIZE);
1629                 device = btrfs_find_device(root, devid, dev_uuid, fs_uuid);
1630                 BUG_ON(!device); /* Logic error */
1631
1632                 if (device->fs_devices->seeding) {
1633                         btrfs_set_device_generation(leaf, dev_item,
1634                                                     device->generation);
1635                         btrfs_mark_buffer_dirty(leaf);
1636                 }
1637
1638                 path->slots[0]++;
1639                 goto next_slot;
1640         }
1641         ret = 0;
1642 error:
1643         btrfs_free_path(path);
1644         return ret;
1645 }
1646
1647 int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
1648 {
1649         struct request_queue *q;
1650         struct btrfs_trans_handle *trans;
1651         struct btrfs_device *device;
1652         struct block_device *bdev;
1653         struct list_head *devices;
1654         struct super_block *sb = root->fs_info->sb;
1655         struct rcu_string *name;
1656         u64 total_bytes;
1657         int seeding_dev = 0;
1658         int ret = 0;
1659
1660         if ((sb->s_flags & MS_RDONLY) && !root->fs_info->fs_devices->seeding)
1661                 return -EROFS;
1662
1663         bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL,
1664                                   root->fs_info->bdev_holder);
1665         if (IS_ERR(bdev))
1666                 return PTR_ERR(bdev);
1667
1668         if (root->fs_info->fs_devices->seeding) {
1669                 seeding_dev = 1;
1670                 down_write(&sb->s_umount);
1671                 mutex_lock(&uuid_mutex);
1672         }
1673
1674         filemap_write_and_wait(bdev->bd_inode->i_mapping);
1675
1676         devices = &root->fs_info->fs_devices->devices;
1677         /*
1678          * we have the volume lock, so we don't need the extra
1679          * device list mutex while reading the list here.
1680          */
1681         list_for_each_entry(device, devices, dev_list) {
1682                 if (device->bdev == bdev) {
1683                         ret = -EEXIST;
1684                         goto error;
1685                 }
1686         }
1687
1688         device = kzalloc(sizeof(*device), GFP_NOFS);
1689         if (!device) {
1690                 /* we can safely leave the fs_devices entry around */
1691                 ret = -ENOMEM;
1692                 goto error;
1693         }
1694
1695         name = rcu_string_strdup(device_path, GFP_NOFS);
1696         if (!name) {
1697                 kfree(device);
1698                 ret = -ENOMEM;
1699                 goto error;
1700         }
1701         rcu_assign_pointer(device->name, name);
1702
1703         ret = find_next_devid(root, &device->devid);
1704         if (ret) {
1705                 rcu_string_free(device->name);
1706                 kfree(device);
1707                 goto error;
1708         }
1709
1710         trans = btrfs_start_transaction(root, 0);
1711         if (IS_ERR(trans)) {
1712                 rcu_string_free(device->name);
1713                 kfree(device);
1714                 ret = PTR_ERR(trans);
1715                 goto error;
1716         }
1717
1718         lock_chunks(root);
1719
1720         q = bdev_get_queue(bdev);
1721         if (blk_queue_discard(q))
1722                 device->can_discard = 1;
1723         device->writeable = 1;
1724         device->work.func = pending_bios_fn;
1725         generate_random_uuid(device->uuid);
1726         spin_lock_init(&device->io_lock);
1727         device->generation = trans->transid;
1728         device->io_width = root->sectorsize;
1729         device->io_align = root->sectorsize;
1730         device->sector_size = root->sectorsize;
1731         device->total_bytes = i_size_read(bdev->bd_inode);
1732         device->disk_total_bytes = device->total_bytes;
1733         device->dev_root = root->fs_info->dev_root;
1734         device->bdev = bdev;
1735         device->in_fs_metadata = 1;
1736         device->mode = FMODE_EXCL;
1737         set_blocksize(device->bdev, 4096);
1738
1739         if (seeding_dev) {
1740                 sb->s_flags &= ~MS_RDONLY;
1741                 ret = btrfs_prepare_sprout(root);
1742                 BUG_ON(ret); /* -ENOMEM */
1743         }
1744
1745         device->fs_devices = root->fs_info->fs_devices;
1746
1747         /*
1748          * we don't want write_supers to jump in here with our device
1749          * half set up
1750          */
1751         mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
1752         list_add_rcu(&device->dev_list, &root->fs_info->fs_devices->devices);
1753         list_add(&device->dev_alloc_list,
1754                  &root->fs_info->fs_devices->alloc_list);
1755         root->fs_info->fs_devices->num_devices++;
1756         root->fs_info->fs_devices->open_devices++;
1757         root->fs_info->fs_devices->rw_devices++;
1758         root->fs_info->fs_devices->total_devices++;
1759         if (device->can_discard)
1760                 root->fs_info->fs_devices->num_can_discard++;
1761         root->fs_info->fs_devices->total_rw_bytes += device->total_bytes;
1762
1763         spin_lock(&root->fs_info->free_chunk_lock);
1764         root->fs_info->free_chunk_space += device->total_bytes;
1765         spin_unlock(&root->fs_info->free_chunk_lock);
1766
1767         if (!blk_queue_nonrot(bdev_get_queue(bdev)))
1768                 root->fs_info->fs_devices->rotating = 1;
1769
1770         total_bytes = btrfs_super_total_bytes(root->fs_info->super_copy);
1771         btrfs_set_super_total_bytes(root->fs_info->super_copy,
1772                                     total_bytes + device->total_bytes);
1773
1774         total_bytes = btrfs_super_num_devices(root->fs_info->super_copy);
1775         btrfs_set_super_num_devices(root->fs_info->super_copy,
1776                                     total_bytes + 1);
1777         mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
1778
1779         if (seeding_dev) {
1780                 ret = init_first_rw_device(trans, root, device);
1781                 if (ret)
1782                         goto error_trans;
1783                 ret = btrfs_finish_sprout(trans, root);
1784                 if (ret)
1785                         goto error_trans;
1786         } else {
1787                 ret = btrfs_add_device(trans, root, device);
1788                 if (ret)
1789                         goto error_trans;
1790         }
1791
1792         /*
1793          * we've got more storage, clear any full flags on the space
1794          * infos
1795          */
1796         btrfs_clear_space_info_full(root->fs_info);
1797
1798         unlock_chunks(root);
1799         ret = btrfs_commit_transaction(trans, root);
1800
1801         if (seeding_dev) {
1802                 mutex_unlock(&uuid_mutex);
1803                 up_write(&sb->s_umount);
1804
1805                 if (ret) /* transaction commit */
1806                         return ret;
1807
1808                 ret = btrfs_relocate_sys_chunks(root);
1809                 if (ret < 0)
1810                         btrfs_error(root->fs_info, ret,
1811                                     "Failed to relocate sys chunks after "
1812                                     "device initialization. This can be fixed "
1813                                     "using the \"btrfs balance\" command.");
1814         }
1815
1816         return ret;
1817
1818 error_trans:
1819         unlock_chunks(root);
1820         btrfs_abort_transaction(trans, root, ret);
1821         btrfs_end_transaction(trans, root);
1822         rcu_string_free(device->name);
1823         kfree(device);
1824 error:
1825         blkdev_put(bdev, FMODE_EXCL);
1826         if (seeding_dev) {
1827                 mutex_unlock(&uuid_mutex);
1828                 up_write(&sb->s_umount);
1829         }
1830         return ret;
1831 }
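
/*
 * Illustrative usage: this is the backend of the BTRFS_IOC_ADD_DEV ioctl,
 * i.e. "btrfs device add /dev/sdc /mnt" (device name is an example).  The
 * new device is usable for new allocations immediately, but existing
 * chunks are only spread onto it by a subsequent "btrfs balance".
 */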
1832
1833 static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
1834                                         struct btrfs_device *device)
1835 {
1836         int ret;
1837         struct btrfs_path *path;
1838         struct btrfs_root *root;
1839         struct btrfs_dev_item *dev_item;
1840         struct extent_buffer *leaf;
1841         struct btrfs_key key;
1842
1843         root = device->dev_root->fs_info->chunk_root;
1844
1845         path = btrfs_alloc_path();
1846         if (!path)
1847                 return -ENOMEM;
1848
1849         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1850         key.type = BTRFS_DEV_ITEM_KEY;
1851         key.offset = device->devid;
1852
1853         ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
1854         if (ret < 0)
1855                 goto out;
1856
1857         if (ret > 0) {
1858                 ret = -ENOENT;
1859                 goto out;
1860         }
1861
1862         leaf = path->nodes[0];
1863         dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);
1864
1865         btrfs_set_device_id(leaf, dev_item, device->devid);
1866         btrfs_set_device_type(leaf, dev_item, device->type);
1867         btrfs_set_device_io_align(leaf, dev_item, device->io_align);
1868         btrfs_set_device_io_width(leaf, dev_item, device->io_width);
1869         btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
1870         btrfs_set_device_total_bytes(leaf, dev_item, device->disk_total_bytes);
1871         btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used);
1872         btrfs_mark_buffer_dirty(leaf);
1873
1874 out:
1875         btrfs_free_path(path);
1876         return ret;
1877 }
1878
1879 static int __btrfs_grow_device(struct btrfs_trans_handle *trans,
1880                       struct btrfs_device *device, u64 new_size)
1881 {
1882         struct btrfs_super_block *super_copy =
1883                 device->dev_root->fs_info->super_copy;
1884         u64 old_total = btrfs_super_total_bytes(super_copy);
1885         u64 diff = new_size - device->total_bytes;
1886
1887         if (!device->writeable)
1888                 return -EACCES;
1889         if (new_size <= device->total_bytes)
1890                 return -EINVAL;
1891
1892         btrfs_set_super_total_bytes(super_copy, old_total + diff);
1893         device->fs_devices->total_rw_bytes += diff;
1894
1895         device->total_bytes = new_size;
1896         device->disk_total_bytes = new_size;
1897         btrfs_clear_space_info_full(device->dev_root->fs_info);
1898
1899         return btrfs_update_device(trans, device);
1900 }
1901
1902 int btrfs_grow_device(struct btrfs_trans_handle *trans,
1903                       struct btrfs_device *device, u64 new_size)
1904 {
1905         int ret;
1906         lock_chunks(device->dev_root);
1907         ret = __btrfs_grow_device(trans, device, new_size);
1908         unlock_chunks(device->dev_root);
1909         return ret;
1910 }
1911
1912 static int btrfs_free_chunk(struct btrfs_trans_handle *trans,
1913                             struct btrfs_root *root,
1914                             u64 chunk_tree, u64 chunk_objectid,
1915                             u64 chunk_offset)
1916 {
1917         int ret;
1918         struct btrfs_path *path;
1919         struct btrfs_key key;
1920
1921         root = root->fs_info->chunk_root;
1922         path = btrfs_alloc_path();
1923         if (!path)
1924                 return -ENOMEM;
1925
1926         key.objectid = chunk_objectid;
1927         key.offset = chunk_offset;
1928         key.type = BTRFS_CHUNK_ITEM_KEY;
1929
1930         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1931         if (ret < 0)
1932                 goto out;
1933         else if (ret > 0) { /* Logic error or corruption */
1934                 btrfs_error(root->fs_info, -ENOENT,
1935                             "Failed lookup while freeing chunk.");
1936                 ret = -ENOENT;
1937                 goto out;
1938         }
1939
1940         ret = btrfs_del_item(trans, root, path);
1941         if (ret < 0)
1942                 btrfs_error(root->fs_info, ret,
1943                             "Failed to delete chunk item.");
1944 out:
1945         btrfs_free_path(path);
1946         return ret;
1947 }
1948
1949 static int btrfs_del_sys_chunk(struct btrfs_root *root, u64 chunk_objectid,
1950                                u64 chunk_offset)
1951 {
1952         struct btrfs_super_block *super_copy = root->fs_info->super_copy;
1953         struct btrfs_disk_key *disk_key;
1954         struct btrfs_chunk *chunk;
1955         u8 *ptr;
1956         int ret = 0;
1957         u32 num_stripes;
1958         u32 array_size;
1959         u32 len = 0;
1960         u32 cur;
1961         struct btrfs_key key;
1962
1963         array_size = btrfs_super_sys_array_size(super_copy);
1964
1965         ptr = super_copy->sys_chunk_array;
1966         cur = 0;
1967
1968         while (cur < array_size) {
1969                 disk_key = (struct btrfs_disk_key *)ptr;
1970                 btrfs_disk_key_to_cpu(&key, disk_key);
1971
1972                 len = sizeof(*disk_key);
1973
1974                 if (key.type == BTRFS_CHUNK_ITEM_KEY) {
1975                         chunk = (struct btrfs_chunk *)(ptr + len);
1976                         num_stripes = btrfs_stack_chunk_num_stripes(chunk);
1977                         len += btrfs_chunk_item_size(num_stripes);
1978                 } else {
1979                         ret = -EIO;
1980                         break;
1981                 }
1982                 if (key.objectid == chunk_objectid &&
1983                     key.offset == chunk_offset) {
1984                         memmove(ptr, ptr + len, array_size - (cur + len));
1985                         array_size -= len;
1986                         btrfs_set_super_sys_array_size(super_copy, array_size);
1987                 } else {
1988                         ptr += len;
1989                         cur += len;
1990                 }
1991         }
1992         return ret;
1993 }
1994
1995 static int btrfs_relocate_chunk(struct btrfs_root *root,
1996                          u64 chunk_tree, u64 chunk_objectid,
1997                          u64 chunk_offset)
1998 {
1999         struct extent_map_tree *em_tree;
2000         struct btrfs_root *extent_root;
2001         struct btrfs_trans_handle *trans;
2002         struct extent_map *em;
2003         struct map_lookup *map;
2004         int ret;
2005         int i;
2006
2007         root = root->fs_info->chunk_root;
2008         extent_root = root->fs_info->extent_root;
2009         em_tree = &root->fs_info->mapping_tree.map_tree;
2010
2011         ret = btrfs_can_relocate(extent_root, chunk_offset);
2012         if (ret)
2013                 return -ENOSPC;
2014
2015         /* step one, relocate all the extents inside this chunk */
2016         ret = btrfs_relocate_block_group(extent_root, chunk_offset);
2017         if (ret)
2018                 return ret;
2019
2020         trans = btrfs_start_transaction(root, 0);
2021         BUG_ON(IS_ERR(trans));
2022
2023         lock_chunks(root);
2024
2025         /*
2026          * step two, delete the device extents and the
2027          * chunk tree entries
2028          */
2029         read_lock(&em_tree->lock);
2030         em = lookup_extent_mapping(em_tree, chunk_offset, 1);
2031         read_unlock(&em_tree->lock);
2032
2033         BUG_ON(!em || em->start > chunk_offset ||
2034                em->start + em->len < chunk_offset);
2035         map = (struct map_lookup *)em->bdev;
2036
2037         for (i = 0; i < map->num_stripes; i++) {
2038                 ret = btrfs_free_dev_extent(trans, map->stripes[i].dev,
2039                                             map->stripes[i].physical);
2040                 BUG_ON(ret);
2041
2042                 if (map->stripes[i].dev) {
2043                         ret = btrfs_update_device(trans, map->stripes[i].dev);
2044                         BUG_ON(ret);
2045                 }
2046         }
2047         ret = btrfs_free_chunk(trans, root, chunk_tree, chunk_objectid,
2048                                chunk_offset);
2049
2050         BUG_ON(ret);
2051
2052         trace_btrfs_chunk_free(root, map, chunk_offset, em->len);
2053
2054         if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
2055                 ret = btrfs_del_sys_chunk(root, chunk_objectid, chunk_offset);
2056                 BUG_ON(ret);
2057         }
2058
2059         ret = btrfs_remove_block_group(trans, extent_root, chunk_offset);
2060         BUG_ON(ret);
2061
2062         write_lock(&em_tree->lock);
2063         remove_extent_mapping(em_tree, em);
2064         write_unlock(&em_tree->lock);
2065
2066         kfree(map);
2067         em->bdev = NULL;
2068
2069         /* once for the tree */
2070         free_extent_map(em);
2071         /* once for us */
2072         free_extent_map(em);
2073
2074         unlock_chunks(root);
2075         btrfs_end_transaction(trans, root);
2076         return 0;
2077 }
2078
2079 static int btrfs_relocate_sys_chunks(struct btrfs_root *root)
2080 {
2081         struct btrfs_root *chunk_root = root->fs_info->chunk_root;
2082         struct btrfs_path *path;
2083         struct extent_buffer *leaf;
2084         struct btrfs_chunk *chunk;
2085         struct btrfs_key key;
2086         struct btrfs_key found_key;
2087         u64 chunk_tree = chunk_root->root_key.objectid;
2088         u64 chunk_type;
2089         bool retried = false;
2090         int failed = 0;
2091         int ret;
2092
2093         path = btrfs_alloc_path();
2094         if (!path)
2095                 return -ENOMEM;
2096
2097 again:
2098         key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
2099         key.offset = (u64)-1;
2100         key.type = BTRFS_CHUNK_ITEM_KEY;
2101
2102         while (1) {
2103                 ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
2104                 if (ret < 0)
2105                         goto error;
2106                 BUG_ON(ret == 0); /* Corruption */
2107
2108                 ret = btrfs_previous_item(chunk_root, path, key.objectid,
2109                                           key.type);
2110                 if (ret < 0)
2111                         goto error;
2112                 if (ret > 0)
2113                         break;
2114
2115                 leaf = path->nodes[0];
2116                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
2117
2118                 chunk = btrfs_item_ptr(leaf, path->slots[0],
2119                                        struct btrfs_chunk);
2120                 chunk_type = btrfs_chunk_type(leaf, chunk);
2121                 btrfs_release_path(path);
2122
2123                 if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) {
2124                         ret = btrfs_relocate_chunk(chunk_root, chunk_tree,
2125                                                    found_key.objectid,
2126                                                    found_key.offset);
2127                         if (ret == -ENOSPC)
2128                                 failed++;
2129                         else if (ret)
2130                                 BUG();
2131                 }
2132
2133                 if (found_key.offset == 0)
2134                         break;
2135                 key.offset = found_key.offset - 1;
2136         }
2137         ret = 0;
2138         if (failed && !retried) {
2139                 failed = 0;
2140                 retried = true;
2141                 goto again;
2142         } else if (failed && retried) {
2143                 WARN_ON(1);
2144                 ret = -ENOSPC;
2145         }
2146 error:
2147         btrfs_free_path(path);
2148         return ret;
2149 }
2150
2151 static int insert_balance_item(struct btrfs_root *root,
2152                                struct btrfs_balance_control *bctl)
2153 {
2154         struct btrfs_trans_handle *trans;
2155         struct btrfs_balance_item *item;
2156         struct btrfs_disk_balance_args disk_bargs;
2157         struct btrfs_path *path;
2158         struct extent_buffer *leaf;
2159         struct btrfs_key key;
2160         int ret, err;
2161
2162         path = btrfs_alloc_path();
2163         if (!path)
2164                 return -ENOMEM;
2165
2166         trans = btrfs_start_transaction(root, 0);
2167         if (IS_ERR(trans)) {
2168                 btrfs_free_path(path);
2169                 return PTR_ERR(trans);
2170         }
2171
2172         key.objectid = BTRFS_BALANCE_OBJECTID;
2173         key.type = BTRFS_BALANCE_ITEM_KEY;
2174         key.offset = 0;
2175
2176         ret = btrfs_insert_empty_item(trans, root, path, &key,
2177                                       sizeof(*item));
2178         if (ret)
2179                 goto out;
2180
2181         leaf = path->nodes[0];
2182         item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item);
2183
2184         memset_extent_buffer(leaf, 0, (unsigned long)item, sizeof(*item));
2185
2186         btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->data);
2187         btrfs_set_balance_data(leaf, item, &disk_bargs);
2188         btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->meta);
2189         btrfs_set_balance_meta(leaf, item, &disk_bargs);
2190         btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->sys);
2191         btrfs_set_balance_sys(leaf, item, &disk_bargs);
2192
2193         btrfs_set_balance_flags(leaf, item, bctl->flags);
2194
2195         btrfs_mark_buffer_dirty(leaf);
2196 out:
2197         btrfs_free_path(path);
2198         err = btrfs_commit_transaction(trans, root);
2199         if (err && !ret)
2200                 ret = err;
2201         return ret;
2202 }
2203
2204 static int del_balance_item(struct btrfs_root *root)
2205 {
2206         struct btrfs_trans_handle *trans;
2207         struct btrfs_path *path;
2208         struct btrfs_key key;
2209         int ret, err;
2210
2211         path = btrfs_alloc_path();
2212         if (!path)
2213                 return -ENOMEM;
2214
2215         trans = btrfs_start_transaction(root, 0);
2216         if (IS_ERR(trans)) {
2217                 btrfs_free_path(path);
2218                 return PTR_ERR(trans);
2219         }
2220
2221         key.objectid = BTRFS_BALANCE_OBJECTID;
2222         key.type = BTRFS_BALANCE_ITEM_KEY;
2223         key.offset = 0;
2224
2225         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
2226         if (ret < 0)
2227                 goto out;
2228         if (ret > 0) {
2229                 ret = -ENOENT;
2230                 goto out;
2231         }
2232
2233         ret = btrfs_del_item(trans, root, path);
2234 out:
2235         btrfs_free_path(path);
2236         err = btrfs_commit_transaction(trans, root);
2237         if (err && !ret)
2238                 ret = err;
2239         return ret;
2240 }
2241
2242 /*
2243  * This is a heuristic used to reduce the number of chunks balanced on
2244  * resume after balance was interrupted.
2245  */
2246 static void update_balance_args(struct btrfs_balance_control *bctl)
2247 {
2248         /*
2249          * Turn on soft mode for chunk types that were being converted.
2250          */
2251         if (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)
2252                 bctl->data.flags |= BTRFS_BALANCE_ARGS_SOFT;
2253         if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)
2254                 bctl->sys.flags |= BTRFS_BALANCE_ARGS_SOFT;
2255         if (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)
2256                 bctl->meta.flags |= BTRFS_BALANCE_ARGS_SOFT;
2257
2258         /*
2259          * Turn on the usage filter if it is not already in use.  The idea is
2260          * that chunks that we have already balanced should be
2261          * reasonably full.  Don't do it for chunks that are being
2262          * converted - that will keep us from relocating unconverted
2263          * (albeit full) chunks.
2264          */
2265         if (!(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE) &&
2266             !(bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
2267                 bctl->data.flags |= BTRFS_BALANCE_ARGS_USAGE;
2268                 bctl->data.usage = 90;
2269         }
2270         if (!(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE) &&
2271             !(bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
2272                 bctl->sys.flags |= BTRFS_BALANCE_ARGS_USAGE;
2273                 bctl->sys.usage = 90;
2274         }
2275         if (!(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE) &&
2276             !(bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
2277                 bctl->meta.flags |= BTRFS_BALANCE_ARGS_USAGE;
2278                 bctl->meta.usage = 90;
2279         }
2280 }
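
/*
 * Worked example (illustrative): if "btrfs balance start -dconvert=raid1"
 * is interrupted, the resumed data args behave like convert=raid1,soft,
 * so chunks that were already converted to raid1 are skipped by
 * chunk_soft_convert_filter() below instead of being relocated again.
 */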
2281
2282 /*
2283  * Should be called with both balance and volume mutexes held to
2284  * serialize other volume operations (add_dev/rm_dev/resize) with
2285  * restriper.  Same goes for unset_balance_control.
2286  */
2287 static void set_balance_control(struct btrfs_balance_control *bctl)
2288 {
2289         struct btrfs_fs_info *fs_info = bctl->fs_info;
2290
2291         BUG_ON(fs_info->balance_ctl);
2292
2293         spin_lock(&fs_info->balance_lock);
2294         fs_info->balance_ctl = bctl;
2295         spin_unlock(&fs_info->balance_lock);
2296 }
2297
2298 static void unset_balance_control(struct btrfs_fs_info *fs_info)
2299 {
2300         struct btrfs_balance_control *bctl = fs_info->balance_ctl;
2301
2302         BUG_ON(!fs_info->balance_ctl);
2303
2304         spin_lock(&fs_info->balance_lock);
2305         fs_info->balance_ctl = NULL;
2306         spin_unlock(&fs_info->balance_lock);
2307
2308         kfree(bctl);
2309 }
2310
2311 /*
2312  * Balance filters.  Return 1 if chunk should be filtered out
2313  * (should not be balanced).
2314  */
2315 static int chunk_profiles_filter(u64 chunk_type,
2316                                  struct btrfs_balance_args *bargs)
2317 {
2318         chunk_type = chunk_to_extended(chunk_type) &
2319                                 BTRFS_EXTENDED_PROFILE_MASK;
2320
2321         if (bargs->profiles & chunk_type)
2322                 return 0;
2323
2324         return 1;
2325 }
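
/*
 * Worked example (illustrative): for a raid1 data chunk, chunk_to_extended()
 * reduces chunk_type to the extended raid1 profile bit; with a filter such
 * as "profiles=raid0|raid10" that bit is not set in bargs->profiles, so the
 * chunk is filtered out (return 1).
 */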
2326
2327 static u64 div_factor_fine(u64 num, int factor)
2328 {
2329         if (factor <= 0)
2330                 return 0;
2331         if (factor >= 100)
2332                 return num;
2333
2334         num *= factor;
2335         do_div(num, 100);
2336         return num;
2337 }
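
/* e.g. div_factor_fine(1000, 30) == 300; factor is a percentage. */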
2338
2339 static int chunk_usage_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset,
2340                               struct btrfs_balance_args *bargs)
2341 {
2342         struct btrfs_block_group_cache *cache;
2343         u64 chunk_used, user_thresh;
2344         int ret = 1;
2345
2346         cache = btrfs_lookup_block_group(fs_info, chunk_offset);
2347         chunk_used = btrfs_block_group_used(&cache->item);
2348
2349         user_thresh = div_factor_fine(cache->key.offset, bargs->usage);
2350         if (chunk_used < user_thresh)
2351                 ret = 0;
2352
2353         btrfs_put_block_group(cache);
2354         return ret;
2355 }
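
/*
 * Worked example (illustrative): for a 1 GiB chunk with 200 MiB used and
 * "usage=50", user_thresh is 512 MiB; since 200 MiB < 512 MiB the filter
 * returns 0 and the chunk is balanced.  A chunk at or above half full
 * would be filtered out (return 1).
 */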
2356
2357 static int chunk_devid_filter(struct extent_buffer *leaf,
2358                               struct btrfs_chunk *chunk,
2359                               struct btrfs_balance_args *bargs)
2360 {
2361         struct btrfs_stripe *stripe;
2362         int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
2363         int i;
2364
2365         for (i = 0; i < num_stripes; i++) {
2366                 stripe = btrfs_stripe_nr(chunk, i);
2367                 if (btrfs_stripe_devid(leaf, stripe) == bargs->devid)
2368                         return 0;
2369         }
2370
2371         return 1;
2372 }
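
/* e.g. with "devid=2" only chunks that have at least one stripe on the
 * device with devid 2 pass the filter (return 0). */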
2373
2374 /* [pstart, pend) */
2375 static int chunk_drange_filter(struct extent_buffer *leaf,
2376                                struct btrfs_chunk *chunk,
2377                                u64 chunk_offset,
2378                                struct btrfs_balance_args *bargs)
2379 {
2380         struct btrfs_stripe *stripe;
2381         int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
2382         u64 stripe_offset;
2383         u64 stripe_length;
2384         int factor;
2385         int i;
2386
2387         if (!(bargs->flags & BTRFS_BALANCE_ARGS_DEVID))
2388                 return 0;
2389
2390         if (btrfs_chunk_type(leaf, chunk) & (BTRFS_BLOCK_GROUP_DUP |
2391              BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10))
2392                 factor = 2;
2393         else
2394                 factor = 1;
2395         factor = num_stripes / factor;
2396
2397         for (i = 0; i < num_stripes; i++) {
2398                 stripe = btrfs_stripe_nr(chunk, i);
2399                 if (btrfs_stripe_devid(leaf, stripe) != bargs->devid)
2400                         continue;
2401
2402                 stripe_offset = btrfs_stripe_offset(leaf, stripe);
2403                 stripe_length = btrfs_chunk_length(leaf, chunk);
2404                 do_div(stripe_length, factor);
2405
2406                 if (stripe_offset < bargs->pend &&
2407                     stripe_offset + stripe_length > bargs->pstart)
2408                         return 0;
2409         }
2410
2411         return 1;
2412 }
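
/*
 * Worked example (illustrative): a raid1 chunk has num_stripes == 2 and
 * factor == 2, so factor becomes 1 and stripe_length is the full chunk
 * length (each mirror holds a complete copy).  A 3-stripe raid0 chunk
 * yields factor == 3, so each stripe covers a third of the chunk.  A
 * stripe passes when [stripe_offset, stripe_offset + stripe_length)
 * overlaps [pstart, pend).
 */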
2413
2414 /* [vstart, vend) */
2415 static int chunk_vrange_filter(struct extent_buffer *leaf,
2416                                struct btrfs_chunk *chunk,
2417                                u64 chunk_offset,
2418                                struct btrfs_balance_args *bargs)
2419 {
2420         if (chunk_offset < bargs->vend &&
2421             chunk_offset + btrfs_chunk_length(leaf, chunk) > bargs->vstart)
2422                 /* at least part of the chunk is inside this vrange */
2423                 return 0;
2424
2425         return 1;
2426 }
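
/* e.g. with vstart == 0 and vend == 16M, a chunk whose logical address
 * starts at 16M does not overlap the range and is filtered out. */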
2427
2428 static int chunk_soft_convert_filter(u64 chunk_type,
2429                                      struct btrfs_balance_args *bargs)
2430 {
2431         if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT))
2432                 return 0;
2433
2434         chunk_type = chunk_to_extended(chunk_type) &
2435                                 BTRFS_EXTENDED_PROFILE_MASK;
2436
2437         if (bargs->target == chunk_type)
2438                 return 1;
2439
2440         return 0;
2441 }
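
/* e.g. during "convert=raid1,soft", a chunk that is already raid1 matches
 * the target and is filtered out (return 1) rather than relocated again. */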
2442
2443 static int should_balance_chunk(struct btrfs_root *root,
2444                                 struct extent_buffer *leaf,
2445                                 struct btrfs_chunk *chunk, u64 chunk_offset)
2446 {
2447         struct btrfs_balance_control *bctl = root->fs_info->balance_ctl;
2448         struct btrfs_balance_args *bargs = NULL;
2449         u64 chunk_type = btrfs_chunk_type(leaf, chunk);
2450
2451         /* type filter */
2452         if (!((chunk_type & BTRFS_BLOCK_GROUP_TYPE_MASK) &
2453               (bctl->flags & BTRFS_BALANCE_TYPE_MASK))) {
2454                 return 0;
2455         }
2456
2457         if (chunk_type & BTRFS_BLOCK_GROUP_DATA)
2458                 bargs = &bctl->data;
2459         else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM)
2460                 bargs = &bctl->sys;
2461         else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA)
2462                 bargs = &bctl->meta;
2463
2464         /* profiles filter */
2465         if ((bargs->flags & BTRFS_BALANCE_ARGS_PROFILES) &&
2466             chunk_profiles_filter(chunk_type, bargs)) {
2467                 return 0;
2468         }
2469
2470         /* usage filter */
2471         if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE) &&
2472             chunk_usage_filter(bctl->fs_info, chunk_offset, bargs)) {
2473                 return 0;
2474         }
2475
2476         /* devid filter */
2477         if ((bargs->flags & BTRFS_BALANCE_ARGS_DEVID) &&
2478             chunk_devid_filter(leaf, chunk, bargs)) {
2479                 return 0;
2480         }
2481
2482         /* drange filter, makes sense only with devid filter */
2483         if ((bargs->flags & BTRFS_BALANCE_ARGS_DRANGE) &&
2484             chunk_drange_filter(leaf, chunk, chunk_offset, bargs)) {
2485                 return 0;
2486         }
2487
2488         /* vrange filter */
2489         if ((bargs->flags & BTRFS_BALANCE_ARGS_VRANGE) &&
2490             chunk_vrange_filter(leaf, chunk, chunk_offset, bargs)) {
2491                 return 0;
2492         }
2493
2494         /* soft profile changing mode */
2495         if ((bargs->flags & BTRFS_BALANCE_ARGS_SOFT) &&
2496             chunk_soft_convert_filter(chunk_type, bargs)) {
2497                 return 0;
2498         }
2499
2500         return 1;
2501 }
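
/*
 * Note (illustrative): the filters AND together, so e.g. "usage=50,devid=2"
 * relocates only chunks that are both under half full and have a stripe on
 * devid 2.
 */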
2502
2503 static u64 div_factor(u64 num, int factor)
2504 {
2505         if (factor == 10)
2506                 return num;
2507         num *= factor;
2508         do_div(num, 10);
2509         return num;
2510 }
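
/* e.g. div_factor(old_size, 1) == old_size / 10, i.e. 10% of the device;
 * the caller below additionally caps this at 1 MiB. */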
2511
2512 static int __btrfs_balance(struct btrfs_fs_info *fs_info)
2513 {
2514         struct btrfs_balance_control *bctl = fs_info->balance_ctl;
2515         struct btrfs_root *chunk_root = fs_info->chunk_root;
2516         struct btrfs_root *dev_root = fs_info->dev_root;
2517         struct list_head *devices;
2518         struct btrfs_device *device;
2519         u64 old_size;
2520         u64 size_to_free;
2521         struct btrfs_chunk *chunk;
2522         struct btrfs_path *path;
2523         struct btrfs_key key;
2524         struct btrfs_key found_key;
2525         struct btrfs_trans_handle *trans;
2526         struct extent_buffer *leaf;
2527         int slot;
2528         int ret;
2529         int enospc_errors = 0;
2530         bool counting = true;
2531
2532         /* step one make some room on all the devices */
2533         devices = &fs_info->fs_devices->devices;
2534         list_for_each_entry(device, devices, dev_list) {
2535                 old_size = device->total_bytes;
2536                 size_to_free = div_factor(old_size, 1);
2537                 size_to_free = min(size_to_free, (u64)1 * 1024 * 1024);
2538                 if (!device->writeable ||
2539                     device->total_bytes - device->bytes_used > size_to_free)
2540                         continue;
2541
2542                 ret = btrfs_shrink_device(device, old_size - size_to_free);
2543                 if (ret == -ENOSPC)
2544                         break;
2545                 BUG_ON(ret);
2546
2547                 trans = btrfs_start_transaction(dev_root, 0);
2548                 BUG_ON(IS_ERR(trans));
2549
2550                 ret = btrfs_grow_device(trans, device, old_size);
2551                 BUG_ON(ret);
2552
2553                 btrfs_end_transaction(trans, dev_root);
2554         }
2555
2556         /* step two, relocate all the chunks */
2557         path = btrfs_alloc_path();
2558         if (!path) {
2559                 ret = -ENOMEM;
2560                 goto error;
2561         }
2562
2563         /* zero out stat counters */
2564         spin_lock(&fs_info->balance_lock);
2565         memset(&bctl->stat, 0, sizeof(bctl->stat));
2566         spin_unlock(&fs_info->balance_lock);
2567 again:
2568         key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
2569         key.offset = (u64)-1;
2570         key.type = BTRFS_CHUNK_ITEM_KEY;
2571
2572         while (1) {
2573                 if ((!counting && atomic_read(&fs_info->balance_pause_req)) ||
2574                     atomic_read(&fs_info->balance_cancel_req)) {
2575                         ret = -ECANCELED;
2576                         goto error;
2577                 }
2578
2579                 ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
2580                 if (ret < 0)
2581                         goto error;
2582
2583                 /*
2584                  * this shouldn't happen, it means the last relocate
2585                  * failed
2586                  */
2587                 if (ret == 0)
2588                         BUG(); /* FIXME break ? */
2589
2590                 ret = btrfs_previous_item(chunk_root, path, 0,
2591                                           BTRFS_CHUNK_ITEM_KEY);
2592                 if (ret) {
2593                         ret = 0;
2594                         break;
2595                 }
2596
2597                 leaf = path->nodes[0];
2598                 slot = path->slots[0];
2599                 btrfs_item_key_to_cpu(leaf, &found_key, slot);
2600
2601                 if (found_key.objectid != key.objectid)
2602                         break;
2603
2604                 /* chunk zero is special */
2605                 if (found_key.offset == 0)
2606                         break;
2607
2608                 chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
2609
2610                 if (!counting) {
2611                         spin_lock(&fs_info->balance_lock);
2612                         bctl->stat.considered++;
2613                         spin_unlock(&fs_info->balance_lock);
2614                 }
2615
2616                 ret = should_balance_chunk(chunk_root, leaf, chunk,
2617                                            found_key.offset);
2618                 btrfs_release_path(path);
2619                 if (!ret)
2620                         goto loop;
2621
2622                 if (counting) {
2623                         spin_lock(&fs_info->balance_lock);
2624                         bctl->stat.expected++;
2625                         spin_unlock(&fs_info->balance_lock);
2626                         goto loop;
2627                 }
2628
2629                 ret = btrfs_relocate_chunk(chunk_root,
2630                                            chunk_root->root_key.objectid,
2631                                            found_key.objectid,
2632                                            found_key.offset);
2633                 if (ret && ret != -ENOSPC)
2634                         goto error;
2635                 if (ret == -ENOSPC) {
2636                         enospc_errors++;
2637                 } else {
2638                         spin_lock(&fs_info->balance_lock);
2639                         bctl->stat.completed++;
2640                         spin_unlock(&fs_info->balance_lock);
2641                 }
2642 loop:
2643                 key.offset = found_key.offset - 1;
2644         }
2645
2646         if (counting) {
2647                 btrfs_release_path(path);
2648                 counting = false;
2649                 goto again;
2650         }
2651 error:
2652         btrfs_free_path(path);
2653         if (enospc_errors) {
2654                 printk(KERN_INFO "btrfs: %d enospc errors during balance\n",
2655                        enospc_errors);
2656                 if (!ret)
2657                         ret = -ENOSPC;
2658         }
2659
2660         return ret;
2661 }
2662
2663 /**
2664  * alloc_profile_is_valid - see if a given profile is valid and reduced
2665  * @flags: profile to validate
2666  * @extended: if true @flags is treated as an extended profile
2667  */
2668 static int alloc_profile_is_valid(u64 flags, int extended)
2669 {
2670         u64 mask = (extended ? BTRFS_EXTENDED_PROFILE_MASK :
2671                                BTRFS_BLOCK_GROUP_PROFILE_MASK);
2672
2673         flags &= ~BTRFS_BLOCK_GROUP_TYPE_MASK;
2674
2675         /* 1) check that all other bits are zeroed */
2676         if (flags & ~mask)
2677                 return 0;
2678
2679         /* 2) see if profile is reduced */
2680         if (flags == 0)
2681                 return !extended; /* "0" is valid for usual profiles */
2682
2683         /* true if exactly one bit set */
2684         return (flags & (flags - 1)) == 0;
2685 }
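
/*
 * Examples (illustrative): RAID1 alone is valid; RAID0 | RAID1 has two
 * profile bits set, is not reduced and is rejected; flags == 0 is valid
 * only in the non-extended form, where it means the single profile.
 */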
2686
2687 static inline int balance_need_close(struct btrfs_fs_info *fs_info)
2688 {
2689         /* cancel requested || normal exit path */
2690         return atomic_read(&fs_info->balance_cancel_req) ||
2691                 (atomic_read(&fs_info->balance_pause_req) == 0 &&
2692                  atomic_read(&fs_info->balance_cancel_req) == 0);
2693 }
2694
2695 static void __cancel_balance(struct btrfs_fs_info *fs_info)
2696 {
2697         int ret;
2698
2699         unset_balance_control(fs_info);
2700         ret = del_balance_item(fs_info->tree_root);
2701         BUG_ON(ret);
2702 }
2703
2704 void update_ioctl_balance_args(struct btrfs_fs_info *fs_info, int lock,
2705                                struct btrfs_ioctl_balance_args *bargs);
2706
2707 /*
2708  * Should be called with both balance and volume mutexes held
2709  */
2710 int btrfs_balance(struct btrfs_balance_control *bctl,
2711                   struct btrfs_ioctl_balance_args *bargs)
2712 {
2713         struct btrfs_fs_info *fs_info = bctl->fs_info;
2714         u64 allowed;
2715         int mixed = 0;
2716         int ret;
2717
2718         if (btrfs_fs_closing(fs_info) ||
2719             atomic_read(&fs_info->balance_pause_req) ||
2720             atomic_read(&fs_info->balance_cancel_req)) {
2721                 ret = -EINVAL;
2722                 goto out;
2723         }
2724
2725         allowed = btrfs_super_incompat_flags(fs_info->super_copy);
2726         if (allowed & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
2727                 mixed = 1;
2728
2729         /*
2730          * In case of mixed groups both data and meta should be picked,
2731          * and identical options should be given for both of them.
2732          */
2733         allowed = BTRFS_BALANCE_DATA | BTRFS_BALANCE_METADATA;
2734         if (mixed && (bctl->flags & allowed)) {
2735                 if (!(bctl->flags & BTRFS_BALANCE_DATA) ||
2736                     !(bctl->flags & BTRFS_BALANCE_METADATA) ||
2737                     memcmp(&bctl->data, &bctl->meta, sizeof(bctl->data))) {
2738                         printk(KERN_ERR "btrfs: with mixed groups data and "
2739                                "metadata balance options must be the same\n");
2740                         ret = -EINVAL;
2741                         goto out;
2742                 }
2743         }
2744
2745         allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE;
2746         if (fs_info->fs_devices->num_devices == 1)
2747                 allowed |= BTRFS_BLOCK_GROUP_DUP;
2748         else if (fs_info->fs_devices->num_devices < 4)
2749                 allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1);
2750         else
2751                 allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 |
2752                                 BTRFS_BLOCK_GROUP_RAID10);
2753
2754         if ((bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
2755             (!alloc_profile_is_valid(bctl->data.target, 1) ||
2756              (bctl->data.target & ~allowed))) {
2757                 printk(KERN_ERR "btrfs: unable to start balance with target "
2758                        "data profile %llu\n",
2759                        (unsigned long long)bctl->data.target);
2760                 ret = -EINVAL;
2761                 goto out;
2762         }
2763         if ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
2764             (!alloc_profile_is_valid(bctl->meta.target, 1) ||
2765              (bctl->meta.target & ~allowed))) {
2766                 printk(KERN_ERR "btrfs: unable to start balance with target "
2767                        "metadata profile %llu\n",
2768                        (unsigned long long)bctl->meta.target);
2769                 ret = -EINVAL;
2770                 goto out;
2771         }
2772         if ((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
2773             (!alloc_profile_is_valid(bctl->sys.target, 1) ||
2774              (bctl->sys.target & ~allowed))) {
2775                 printk(KERN_ERR "btrfs: unable to start balance with target "
2776                        "system profile %llu\n",
2777                        (unsigned long long)bctl->sys.target);
2778                 ret = -EINVAL;
2779                 goto out;
2780         }
2781
2782         /* allow dup'ed data chunks only in mixed mode */
2783         if (!mixed && (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
2784             (bctl->data.target & BTRFS_BLOCK_GROUP_DUP)) {
2785                 printk(KERN_ERR "btrfs: dup for data is not allowed\n");
2786                 ret = -EINVAL;
2787                 goto out;
2788         }
2789
2790         /* allow reducing meta or sys integrity only if force is set */
2791         allowed = BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 |
2792                         BTRFS_BLOCK_GROUP_RAID10;
2793         if (((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
2794              (fs_info->avail_system_alloc_bits & allowed) &&
2795              !(bctl->sys.target & allowed)) ||
2796             ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
2797              (fs_info->avail_metadata_alloc_bits & allowed) &&
2798              !(bctl->meta.target & allowed))) {
2799                 if (bctl->flags & BTRFS_BALANCE_FORCE) {
2800                         printk(KERN_INFO "btrfs: force reducing metadata "
2801                                "integrity\n");
2802                 } else {
2803                         printk(KERN_ERR "btrfs: balance will reduce metadata "
2804                                "integrity, use force if you want this\n");
2805                         ret = -EINVAL;
2806                         goto out;
2807                 }
2808         }
2809
2810         ret = insert_balance_item(fs_info->tree_root, bctl);
2811         if (ret && ret != -EEXIST)
2812                 goto out;
2813
2814         if (!(bctl->flags & BTRFS_BALANCE_RESUME)) {
2815                 BUG_ON(ret == -EEXIST);
2816                 set_balance_control(bctl);
2817         } else {
2818                 BUG_ON(ret != -EEXIST);
2819                 spin_lock(&fs_info->balance_lock);
2820                 update_balance_args(bctl);
2821                 spin_unlock(&fs_info->balance_lock);
2822         }
2823
2824         atomic_inc(&fs_info->balance_running);
2825         mutex_unlock(&fs_info->balance_mutex);
2826
2827         ret = __btrfs_balance(fs_info);
2828
2829         mutex_lock(&fs_info->balance_mutex);
2830         atomic_dec(&fs_info->balance_running);
2831
2832         if (bargs) {
2833                 memset(bargs, 0, sizeof(*bargs));
2834                 update_ioctl_balance_args(fs_info, 0, bargs);
2835         }
2836
2837         if ((ret && ret != -ECANCELED && ret != -ENOSPC) ||
2838             balance_need_close(fs_info)) {
2839                 __cancel_balance(fs_info);
2840         }
2841
2842         wake_up(&fs_info->balance_wait_q);
2843
2844         return ret;
2845 out:
2846         if (bctl->flags & BTRFS_BALANCE_RESUME)
2847                 __cancel_balance(fs_info);
2848         else
2849                 kfree(bctl);
2850         return ret;
2851 }
2852
2853 static int balance_kthread(void *data)
2854 {
2855         struct btrfs_fs_info *fs_info = data;
2856         int ret = 0;
2857
2858         mutex_lock(&fs_info->volume_mutex);
2859         mutex_lock(&fs_info->balance_mutex);
2860
2861         if (fs_info->balance_ctl) {
2862                 printk(KERN_INFO "btrfs: continuing balance\n");
2863                 ret = btrfs_balance(fs_info->balance_ctl, NULL);
2864         }
2865
2866         mutex_unlock(&fs_info->balance_mutex);
2867         mutex_unlock(&fs_info->volume_mutex);
2868
2869         return ret;
2870 }
2871
2872 int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info)
2873 {
2874         struct task_struct *tsk;
2875
2876         spin_lock(&fs_info->balance_lock);
2877         if (!fs_info->balance_ctl) {
2878                 spin_unlock(&fs_info->balance_lock);
2879                 return 0;
2880         }
2881         spin_unlock(&fs_info->balance_lock);
2882
2883         if (btrfs_test_opt(fs_info->tree_root, SKIP_BALANCE)) {
2884                 printk(KERN_INFO "btrfs: force skipping balance\n");
2885                 return 0;
2886         }
2887
2888         tsk = kthread_run(balance_kthread, fs_info, "btrfs-balance");
2889         if (IS_ERR(tsk))
2890                 return PTR_ERR(tsk);
2891
2892         return 0;
2893 }
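
/*
 * For illustration: mounting with "-o skip_balance" takes the early return
 * above, leaving the interrupted balance paused until the user resumes it
 * with "btrfs balance resume <mount>".
 */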
2894
2895 int btrfs_recover_balance(struct btrfs_fs_info *fs_info)
2896 {
2897         struct btrfs_balance_control *bctl;
2898         struct btrfs_balance_item *item;
2899         struct btrfs_disk_balance_args disk_bargs;
2900         struct btrfs_path *path;
2901         struct extent_buffer *leaf;
2902         struct btrfs_key key;
2903         int ret;
2904
2905         path = btrfs_alloc_path();
2906         if (!path)
2907                 return -ENOMEM;
2908
2909         key.objectid = BTRFS_BALANCE_OBJECTID;
2910         key.type = BTRFS_BALANCE_ITEM_KEY;
2911         key.offset = 0;
2912
2913         ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
2914         if (ret < 0)
2915                 goto out;
2916         if (ret > 0) { /* ret = -ENOENT; */
2917                 ret = 0;
2918                 goto out;
2919         }
2920
2921         bctl = kzalloc(sizeof(*bctl), GFP_NOFS);
2922         if (!bctl) {
2923                 ret = -ENOMEM;
2924                 goto out;
2925         }
2926
2927         leaf = path->nodes[0];
2928         item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item);
2929
2930         bctl->fs_info = fs_info;
2931         bctl->flags = btrfs_balance_flags(leaf, item);
2932         bctl->flags |= BTRFS_BALANCE_RESUME;
2933
2934         btrfs_balance_data(leaf, item, &disk_bargs);
2935         btrfs_disk_balance_args_to_cpu(&bctl->data, &disk_bargs);
2936         btrfs_balance_meta(leaf, item, &disk_bargs);
2937         btrfs_disk_balance_args_to_cpu(&bctl->meta, &disk_bargs);
2938         btrfs_balance_sys(leaf, item, &disk_bargs);
2939         btrfs_disk_balance_args_to_cpu(&bctl->sys, &disk_bargs);
2940
2941         mutex_lock(&fs_info->volume_mutex);
2942         mutex_lock(&fs_info->balance_mutex);
2943
2944         set_balance_control(bctl);
2945
2946         mutex_unlock(&fs_info->balance_mutex);
2947         mutex_unlock(&fs_info->volume_mutex);
2948 out:
2949         btrfs_free_path(path);
2950         return ret;
2951 }
2952
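/*
 * Pause a running balance: bump balance_pause_req and wait until the
 * balance thread drops balance_running to zero.  Returns -ENOTCONN if
 * no balance is in progress.  The balance item stays on disk, so the
 * operation can be resumed later.
 */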
2953 int btrfs_pause_balance(struct btrfs_fs_info *fs_info)
2954 {
2955         int ret = 0;
2956
2957         mutex_lock(&fs_info->balance_mutex);
2958         if (!fs_info->balance_ctl) {
2959                 mutex_unlock(&fs_info->balance_mutex);
2960                 return -ENOTCONN;
2961         }
2962
2963         if (atomic_read(&fs_info->balance_running)) {
2964                 atomic_inc(&fs_info->balance_pause_req);
2965                 mutex_unlock(&fs_info->balance_mutex);
2966
2967                 wait_event(fs_info->balance_wait_q,
2968                            atomic_read(&fs_info->balance_running) == 0);
2969
2970                 mutex_lock(&fs_info->balance_mutex);
2971                 /* it is fine if balance_ctl was torn down while we waited */
2972                 BUG_ON(atomic_read(&fs_info->balance_running));
2973                 atomic_dec(&fs_info->balance_pause_req);
2974         } else {
2975                 ret = -ENOTCONN;
2976         }
2977
2978         mutex_unlock(&fs_info->balance_mutex);
2979         return ret;
2980 }
2981
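/*
 * Cancel a paused or running balance.  If the balance thread is running
 * it notices balance_cancel_req and deletes the balance item itself;
 * otherwise we take volume_mutex and tear the state down directly via
 * __cancel_balance().
 */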
2982 int btrfs_cancel_balance(struct btrfs_fs_info *fs_info)
2983 {
2984         mutex_lock(&fs_info->balance_mutex);
2985         if (!fs_info->balance_ctl) {
2986                 mutex_unlock(&fs_info->balance_mutex);
2987                 return -ENOTCONN;
2988         }
2989
2990         atomic_inc(&fs_info->balance_cancel_req);
2991         /*
2992          * if a balance is running, just wait for it to finish; in that
2993          * case the balance item is deleted inside btrfs_balance()
2994          */
2995         if (atomic_read(&fs_info->balance_running)) {
2996                 mutex_unlock(&fs_info->balance_mutex);
2997                 wait_event(fs_info->balance_wait_q,
2998                            atomic_read(&fs_info->balance_running) == 0);
2999                 mutex_lock(&fs_info->balance_mutex);
3000         } else {
3001                 /* __cancel_balance needs volume_mutex */
3002                 mutex_unlock(&fs_info->balance_mutex);
3003                 mutex_lock(&fs_info->volume_mutex);
3004                 mutex_lock(&fs_info->balance_mutex);
3005
3006                 if (fs_info->balance_ctl)
3007                         __cancel_balance(fs_info);
3008
3009                 mutex_unlock(&fs_info->volume_mutex);
3010         }
3011
3012         BUG_ON(fs_info->balance_ctl || atomic_read(&fs_info->balance_running));
3013         atomic_dec(&fs_info->balance_cancel_req);
3014         mutex_unlock(&fs_info->balance_mutex);
3015         return 0;
3016 }
3017
3018 /*
3019  * shrinking a device means finding all of the device extents past
3020  * the new size, and then following the back refs to the chunks.
3021  * The chunk relocation code actually frees the device extents.
3022  */
3023 int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
3024 {
3025         struct btrfs_trans_handle *trans;
3026         struct btrfs_root *root = device->dev_root;
3027         struct btrfs_dev_extent *dev_extent = NULL;
3028         struct btrfs_path *path;
3029         u64 length;
3030         u64 chunk_tree;
3031         u64 chunk_objectid;
3032         u64 chunk_offset;
3033         int ret;
3034         int slot;
3035         int failed = 0;
3036         bool retried = false;
3037         struct extent_buffer *l;
3038         struct btrfs_key key;
3039         struct btrfs_super_block *super_copy = root->fs_info->super_copy;
3040         u64 old_total = btrfs_super_total_bytes(super_copy);
3041         u64 old_size = device->total_bytes;
3042         u64 diff = device->total_bytes - new_size;
3043
3044         if (new_size >= device->total_bytes)
3045                 return -EINVAL;
3046
3047         path = btrfs_alloc_path();
3048         if (!path)
3049                 return -ENOMEM;
3050
3051         path->reada = 2;
3052
3053         lock_chunks(root);
3054
3055         device->total_bytes = new_size;
3056         if (device->writeable) {
3057                 device->fs_devices->total_rw_bytes -= diff;
3058                 spin_lock(&root->fs_info->free_chunk_lock);
3059                 root->fs_info->free_chunk_space -= diff;
3060                 spin_unlock(&root->fs_info->free_chunk_lock);
3061         }
3062         unlock_chunks(root);
3063
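        /*
         * Walk the device's extents from the end of the device backwards
         * and relocate every chunk that lies past the new size.  A chunk
         * may fail to relocate with -ENOSPC on the first pass because
         * relocation itself needs free space; in that case the whole scan
         * is retried once, after earlier relocations have freed room.
         */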
3064 again:
3065         key.objectid = device->devid;
3066         key.offset = (u64)-1;
3067         key.type = BTRFS_DEV_EXTENT_KEY;
3068
3069         do {
3070                 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3071                 if (ret < 0)
3072                         goto done;
3073
3074                 ret = btrfs_previous_item(root, path, 0, key.type);
3075                 if (ret < 0)
3076                         goto done;
3077                 if (ret) {
3078                         ret = 0;
3079                         btrfs_release_path(path);
3080                         break;
3081                 }
3082
3083                 l = path->nodes[0];
3084                 slot = path->slots[0];
3085                 btrfs_item_key_to_cpu(l, &key, path->slots[0]);
3086
3087                 if (key.objectid != device->devid) {
3088                         btrfs_release_path(path);
3089                         break;
3090                 }
3091
3092                 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
3093                 length = btrfs_dev_extent_length(l, dev_extent);
3094
3095                 if (key.offset + length <= new_size) {
3096                         btrfs_release_path(path);
3097                         break;
3098                 }
3099
3100                 chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent);
3101                 chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent);
3102                 chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
3103                 btrfs_release_path(path);
3104
3105                 ret = btrfs_relocate_chunk(root, chunk_tree, chunk_objectid,
3106                                            chunk_offset);
3107                 if (ret && ret != -ENOSPC)
3108                         goto done;
3109                 if (ret == -ENOSPC)
3110                         failed++;
3111         } while (key.offset-- > 0);
3112
3113         if (failed && !retried) {
3114                 failed = 0;
3115                 retried = true;
3116                 goto again;
3117         } else if (failed && retried) {
3118                 ret = -ENOSPC;
3119                 lock_chunks(root);
3120
3121                 device->total_bytes = old_size;
3122                 if (device->writeable)
3123                         device->fs_devices->total_rw_bytes += diff;
3124                 spin_lock(&root->fs_info->free_chunk_lock);
3125                 root->fs_info->free_chunk_space += diff;
3126                 spin_unlock(&root->fs_info->free_chunk_lock);
3127                 unlock_chunks(root);
3128                 goto done;
3129         }
3130
3131         /* Shrinking succeeded, else we would be at "done". */
3132         trans = btrfs_start_transaction(root, 0);
3133         if (IS_ERR(trans)) {
3134                 ret = PTR_ERR(trans);
3135                 goto done;
3136         }
3137
3138         lock_chunks(root);
3139
3140         device->disk_total_bytes = new_size;
3141         /* Now btrfs_update_device() will change the on-disk size. */
3142         ret = btrfs_update_device(trans, device);
3143         if (ret) {
3144                 unlock_chunks(root);
3145                 btrfs_end_transaction(trans, root);
3146                 goto done;
3147         }
3148         WARN_ON(diff > old_total);
3149         btrfs_set_super_total_bytes(super_copy, old_total - diff);
3150         unlock_chunks(root);
3151         btrfs_end_transaction(trans, root);
3152 done:
3153         btrfs_free_path(path);
3154         return ret;
3155 }
3156
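/*
 * Append a copy of a SYSTEM chunk item (key + chunk) to the superblock's
 * sys_chunk_array.  System chunks must be findable before the chunk tree
 * itself can be read, which is why they are duplicated in the super
 * block; -EFBIG is returned when the fixed-size array is full.
 */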
3157 static int btrfs_add_system_chunk(struct btrfs_root *root,
3158                            struct btrfs_key *key,
3159                            struct btrfs_chunk *chunk, int item_size)
3160 {
3161         struct btrfs_super_block *super_copy = root->fs_info->super_copy;
3162         struct btrfs_disk_key disk_key;
3163         u32 array_size;
3164         u8 *ptr;
3165
3166         array_size = btrfs_super_sys_array_size(super_copy);
3167         if (array_size + item_size > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE)
3168                 return -EFBIG;
3169
3170         ptr = super_copy->sys_chunk_array + array_size;
3171         btrfs_cpu_key_to_disk(&disk_key, key);
3172         memcpy(ptr, &disk_key, sizeof(disk_key));
3173         ptr += sizeof(disk_key);
3174         memcpy(ptr, chunk, item_size);
3175         item_size += sizeof(disk_key);
3176         btrfs_set_super_sys_array_size(super_copy, array_size + item_size);
3177         return 0;
3178 }
3179
3180 /*
3181  * sort the devices in descending order by max_avail, total_avail
3182  */
3183 static int btrfs_cmp_device_info(const void *a, const void *b)
3184 {
3185         const struct btrfs_device_info *di_a = a;
3186         const struct btrfs_device_info *di_b = b;
3187
3188         if (di_a->max_avail > di_b->max_avail)
3189                 return -1;
3190         if (di_a->max_avail < di_b->max_avail)
3191                 return 1;
3192         if (di_a->total_avail > di_b->total_avail)
3193                 return -1;
3194         if (di_a->total_avail < di_b->total_avail)
3195                 return 1;
3196         return 0;
3197 }
3198
3199 static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
3200                                struct btrfs_root *extent_root,
3201                                struct map_lookup **map_ret,
3202                                u64 *num_bytes_out, u64 *stripe_size_out,
3203                                u64 start, u64 type)
3204 {
3205         struct btrfs_fs_info *info = extent_root->fs_info;
3206         struct btrfs_fs_devices *fs_devices = info->fs_devices;
3207         struct list_head *cur;
3208         struct map_lookup *map = NULL;
3209         struct extent_map_tree *em_tree;
3210         struct extent_map *em;
3211         struct btrfs_device_info *devices_info = NULL;
3212         u64 total_avail;
3213         int num_stripes;        /* total number of stripes to allocate */
3214         int sub_stripes;        /* sub_stripes info for map */
3215         int dev_stripes;        /* stripes per dev */
3216         int devs_max;           /* max devs to use */
3217         int devs_min;           /* min devs needed */
3218         int devs_increment;     /* ndevs has to be a multiple of this */
3219         int ncopies;            /* how many copies the data has */
3220         int ret;
3221         u64 max_stripe_size;
3222         u64 max_chunk_size;
3223         u64 stripe_size;
3224         u64 num_bytes;
3225         int ndevs;
3226         int i;
3227         int j;
3228
3229         BUG_ON(!alloc_profile_is_valid(type, 0));
3230
3231         if (list_empty(&fs_devices->alloc_list))
3232                 return -ENOSPC;
3233
3234         sub_stripes = 1;
3235         dev_stripes = 1;
3236         devs_increment = 1;
3237         ncopies = 1;
3238         devs_max = 0;   /* 0 == as many as possible */
3239         devs_min = 1;
3240
3241         /*
3242          * define the properties of each RAID type.
3243          * FIXME: move this to a global table and use it in all RAID
3244          * calculation code
3245          */
3246         if (type & (BTRFS_BLOCK_GROUP_DUP)) {
3247                 dev_stripes = 2;
3248                 ncopies = 2;
3249                 devs_max = 1;
3250         } else if (type & (BTRFS_BLOCK_GROUP_RAID0)) {
3251                 devs_min = 2;
3252         } else if (type & (BTRFS_BLOCK_GROUP_RAID1)) {
3253                 devs_increment = 2;
3254                 ncopies = 2;
3255                 devs_max = 2;
3256                 devs_min = 2;
3257         } else if (type & (BTRFS_BLOCK_GROUP_RAID10)) {
3258                 sub_stripes = 2;
3259                 devs_increment = 2;
3260                 ncopies = 2;
3261                 devs_min = 4;
3262         } else {
3263                 devs_max = 1;
3264         }
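        /*
         * The net effect of the table above, e.g.: DUP places both copies
         * on one device (dev_stripes = 2, devs_max = 1); RAID1 needs
         * exactly two devices, each holding a full copy; RAID10 needs at
         * least four devices, mirrored in pairs (sub_stripes = 2).
         */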
3265
3266         if (type & BTRFS_BLOCK_GROUP_DATA) {
3267                 max_stripe_size = 1024 * 1024 * 1024;
3268                 max_chunk_size = 10 * max_stripe_size;
3269         } else if (type & BTRFS_BLOCK_GROUP_METADATA) {
3270                 /* for larger filesystems, use larger metadata chunks */
3271                 if (fs_devices->total_rw_bytes > 50ULL * 1024 * 1024 * 1024)
3272                         max_stripe_size = 1024 * 1024 * 1024;
3273                 else
3274                         max_stripe_size = 256 * 1024 * 1024;
3275                 max_chunk_size = max_stripe_size;
3276         } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
3277                 max_stripe_size = 32 * 1024 * 1024;
3278                 max_chunk_size = 2 * max_stripe_size;
3279         } else {
3280                 printk(KERN_ERR "btrfs: invalid chunk type 0x%llx requested\n",
3281                        type);
3282                 BUG_ON(1);
3283         }
3284
3285         /* we don't want a chunk larger than 10% of writeable space */
3286         max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1),
3287                              max_chunk_size);
3288
3289         devices_info = kzalloc(sizeof(*devices_info) * fs_devices->rw_devices,
3290                                GFP_NOFS);
3291         if (!devices_info)
3292                 return -ENOMEM;
3293
3294         cur = fs_devices->alloc_list.next;
3295
3296         /*
3297          * in the first pass through the devices list, we gather information
3298          * about the available holes on each device.
3299          */
3300         ndevs = 0;
3301         while (cur != &fs_devices->alloc_list) {
3302                 struct btrfs_device *device;
3303                 u64 max_avail;
3304                 u64 dev_offset;
3305
3306                 device = list_entry(cur, struct btrfs_device, dev_alloc_list);
3307
3308                 cur = cur->next;
3309
3310                 if (!device->writeable) {
3311                         printk(KERN_ERR
3312                                "btrfs: read-only device in alloc_list\n");
3313                         WARN_ON(1);
3314                         continue;
3315                 }
3316
3317                 if (!device->in_fs_metadata)
3318                         continue;
3319
3320                 if (device->total_bytes > device->bytes_used)
3321                         total_avail = device->total_bytes - device->bytes_used;
3322                 else
3323                         total_avail = 0;
3324
3325                 /* If there is no space on this device, skip it. */
3326                 if (total_avail == 0)
3327                         continue;
3328
3329                 ret = find_free_dev_extent(device,
3330                                            max_stripe_size * dev_stripes,
3331                                            &dev_offset, &max_avail);
3332                 if (ret && ret != -ENOSPC)
3333                         goto error;
3334
3335                 if (ret == 0)
3336                         max_avail = max_stripe_size * dev_stripes;
3337
3338                 if (max_avail < BTRFS_STRIPE_LEN * dev_stripes)
3339                         continue;
3340
3341                 devices_info[ndevs].dev_offset = dev_offset;
3342                 devices_info[ndevs].max_avail = max_avail;
3343                 devices_info[ndevs].total_avail = total_avail;
3344                 devices_info[ndevs].dev = device;
3345                 ++ndevs;
3346         }
3347
3348         /*
3349          * now sort the devices by hole size / available space
3350          */
3351         sort(devices_info, ndevs, sizeof(struct btrfs_device_info),
3352              btrfs_cmp_device_info, NULL);
3353
3354         /* round down to number of usable stripes */
3355         ndevs -= ndevs % devs_increment;
3356
3357         if (ndevs < devs_increment * sub_stripes || ndevs < devs_min) {
3358                 ret = -ENOSPC;
3359                 goto error;
3360         }
3361
3362         if (devs_max && ndevs > devs_max)
3363                 ndevs = devs_max;
3364         /*
3365          * the primary goal is to maximize the number of stripes, so use as many
3366          * devices as possible, even if the stripes are not maximum sized.
3367          */
3368         stripe_size = devices_info[ndevs-1].max_avail;
3369         num_stripes = ndevs * dev_stripes;
3370
3371         if (stripe_size * ndevs > max_chunk_size * ncopies) {
3372                 stripe_size = max_chunk_size * ncopies;
3373                 do_div(stripe_size, ndevs);
3374         }
3375
3376         do_div(stripe_size, dev_stripes);
3377
3378         /* align to BTRFS_STRIPE_LEN */
3379         do_div(stripe_size, BTRFS_STRIPE_LEN);
3380         stripe_size *= BTRFS_STRIPE_LEN;
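        /*
         * Worked example (illustrative numbers): a DATA chunk in RAID0
         * over three devices whose smallest usable hole is 800 MiB gives
         * stripe_size = 800 MiB (under the 10 GiB max_chunk_size cap),
         * rounded down to a multiple of BTRFS_STRIPE_LEN, and
         * num_stripes = 3, so the chunk maps 3 * 800 MiB of logical space.
         */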
3381
3382         map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
3383         if (!map) {
3384                 ret = -ENOMEM;
3385                 goto error;
3386         }
3387         map->num_stripes = num_stripes;
3388
3389         for (i = 0; i < ndevs; ++i) {
3390                 for (j = 0; j < dev_stripes; ++j) {
3391                         int s = i * dev_stripes + j;
3392                         map->stripes[s].dev = devices_info[i].dev;
3393                         map->stripes[s].physical = devices_info[i].dev_offset +
3394                                                    j * stripe_size;
3395                 }
3396         }
3397         map->sector_size = extent_root->sectorsize;
3398         map->stripe_len = BTRFS_STRIPE_LEN;
3399         map->io_align = BTRFS_STRIPE_LEN;
3400         map->io_width = BTRFS_STRIPE_LEN;
3401         map->type = type;
3402         map->sub_stripes = sub_stripes;
3403
3404         *map_ret = map;
3405         num_bytes = stripe_size * (num_stripes / ncopies);
3406
3407         *stripe_size_out = stripe_size;
3408         *num_bytes_out = num_bytes;
3409
3410         trace_btrfs_chunk_alloc(info->chunk_root, map, start, num_bytes);
3411
3412         em = alloc_extent_map();
3413         if (!em) {
3414                 ret = -ENOMEM;
3415                 goto error;
3416         }
3417         em->bdev = (struct block_device *)map;
3418         em->start = start;
3419         em->len = num_bytes;
3420         em->block_start = 0;
3421         em->block_len = em->len;
3422
3423         em_tree = &extent_root->fs_info->mapping_tree.map_tree;
3424         write_lock(&em_tree->lock);
3425         ret = add_extent_mapping(em_tree, em);
3426         write_unlock(&em_tree->lock);
3427         free_extent_map(em);
3428         if (ret)
3429                 goto error;
3430
3431         ret = btrfs_make_block_group(trans, extent_root, 0, type,
3432                                      BTRFS_FIRST_CHUNK_TREE_OBJECTID,
3433                                      start, num_bytes);
3434         if (ret)
3435                 goto error;
3436
3437         for (i = 0; i < map->num_stripes; ++i) {
3438                 struct btrfs_device *device;
3439                 u64 dev_offset;
3440
3441                 device = map->stripes[i].dev;
3442                 dev_offset = map->stripes[i].physical;
3443
3444                 ret = btrfs_alloc_dev_extent(trans, device,
3445                                 info->chunk_root->root_key.objectid,
3446                                 BTRFS_FIRST_CHUNK_TREE_OBJECTID,
3447                                 start, dev_offset, stripe_size);
3448                 if (ret) {
3449                         btrfs_abort_transaction(trans, extent_root, ret);
3450                         goto error;
3451                 }
3452         }
3453
3454         kfree(devices_info);
3455         return 0;
3456
3457 error:
3458         kfree(map);
3459         kfree(devices_info);
3460         return ret;
3461 }
3462
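/*
 * Second phase of chunk allocation: charge stripe_size to each device's
 * bytes_used, build the on-disk btrfs_chunk item from the map_lookup and
 * insert it into the chunk tree; SYSTEM chunks are additionally mirrored
 * into the superblock's sys_chunk_array.
 */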
3463 static int __finish_chunk_alloc(struct btrfs_trans_handle *trans,
3464                                 struct btrfs_root *extent_root,
3465                                 struct map_lookup *map, u64 chunk_offset,
3466                                 u64 chunk_size, u64 stripe_size)
3467 {
3468         u64 dev_offset;
3469         struct btrfs_key key;
3470         struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root;
3471         struct btrfs_device *device;
3472         struct btrfs_chunk *chunk;
3473         struct btrfs_stripe *stripe;
3474         size_t item_size = btrfs_chunk_item_size(map->num_stripes);
3475         int index = 0;
3476         int ret;
3477
3478         chunk = kzalloc(item_size, GFP_NOFS);
3479         if (!chunk)
3480                 return -ENOMEM;
3481
3482         index = 0;
3483         while (index < map->num_stripes) {
3484                 device = map->stripes[index].dev;
3485                 device->bytes_used += stripe_size;
3486                 ret = btrfs_update_device(trans, device);
3487                 if (ret)
3488                         goto out_free;
3489                 index++;
3490         }
3491
3492         spin_lock(&extent_root->fs_info->free_chunk_lock);
3493         extent_root->fs_info->free_chunk_space -= (stripe_size *
3494                                                    map->num_stripes);
3495         spin_unlock(&extent_root->fs_info->free_chunk_lock);
3496
3497         index = 0;
3498         stripe = &chunk->stripe;
3499         while (index < map->num_stripes) {
3500                 device = map->stripes[index].dev;
3501                 dev_offset = map->stripes[index].physical;
3502
3503                 btrfs_set_stack_stripe_devid(stripe, device->devid);
3504                 btrfs_set_stack_stripe_offset(stripe, dev_offset);
3505                 memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE);
3506                 stripe++;
3507                 index++;
3508         }
3509
3510         btrfs_set_stack_chunk_length(chunk, chunk_size);
3511         btrfs_set_stack_chunk_owner(chunk, extent_root->root_key.objectid);
3512         btrfs_set_stack_chunk_stripe_len(chunk, map->stripe_len);
3513         btrfs_set_stack_chunk_type(chunk, map->type);
3514         btrfs_set_stack_chunk_num_stripes(chunk, map->num_stripes);
3515         btrfs_set_stack_chunk_io_align(chunk, map->stripe_len);
3516         btrfs_set_stack_chunk_io_width(chunk, map->stripe_len);
3517         btrfs_set_stack_chunk_sector_size(chunk, extent_root->sectorsize);
3518         btrfs_set_stack_chunk_sub_stripes(chunk, map->sub_stripes);
3519
3520         key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
3521         key.type = BTRFS_CHUNK_ITEM_KEY;
3522         key.offset = chunk_offset;
3523
3524         ret = btrfs_insert_item(trans, chunk_root, &key, chunk, item_size);
3525
3526         if (ret == 0 && map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
3527                 /*
3528                  * TODO: Cleanup of inserted chunk root in case of
3529                  * failure.
3530                  */
3531                 ret = btrfs_add_system_chunk(chunk_root, &key, chunk,
3532                                              item_size);
3533         }
3534
3535 out_free:
3536         kfree(chunk);
3537         return ret;
3538 }
3539
3540 /*
3541  * Chunk allocation falls into two parts. The first part does the work
3542  * that makes the newly allocated chunk usable, without any operation
3543  * that modifies the chunk tree. The second part does the work that
3544  * requires modifying the chunk tree. This division is important for
3545  * the bootstrap process of adding storage to a seed btrfs.
3546  */
3547 int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
3548                       struct btrfs_root *extent_root, u64 type)
3549 {
3550         u64 chunk_offset;
3551         u64 chunk_size;
3552         u64 stripe_size;
3553         struct map_lookup *map;
3554         struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root;
3555         int ret;
3556
3557         ret = find_next_chunk(chunk_root, BTRFS_FIRST_CHUNK_TREE_OBJECTID,
3558                               &chunk_offset);
3559         if (ret)
3560                 return ret;
3561
3562         ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size,
3563                                   &stripe_size, chunk_offset, type);
3564         if (ret)
3565                 return ret;
3566
3567         ret = __finish_chunk_alloc(trans, extent_root, map, chunk_offset,
3568                                    chunk_size, stripe_size);
3569         if (ret)
3570                 return ret;
3571         return 0;
3572 }
3573
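/*
 * Bootstrap the first writeable device of a (seed) filesystem: allocate
 * a metadata chunk and a system chunk back to back, add the device item,
 * and only then run the chunk-tree-modifying __finish_chunk_alloc()
 * steps, since those need both block groups to exist (see below).
 */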
3574 static noinline int init_first_rw_device(struct btrfs_trans_handle *trans,
3575                                          struct btrfs_root *root,
3576                                          struct btrfs_device *device)
3577 {
3578         u64 chunk_offset;
3579         u64 sys_chunk_offset;
3580         u64 chunk_size;
3581         u64 sys_chunk_size;
3582         u64 stripe_size;
3583         u64 sys_stripe_size;
3584         u64 alloc_profile;
3585         struct map_lookup *map;
3586         struct map_lookup *sys_map;
3587         struct btrfs_fs_info *fs_info = root->fs_info;
3588         struct btrfs_root *extent_root = fs_info->extent_root;
3589         int ret;
3590
3591         ret = find_next_chunk(fs_info->chunk_root,
3592                               BTRFS_FIRST_CHUNK_TREE_OBJECTID, &chunk_offset);
3593         if (ret)
3594                 return ret;
3595
3596         alloc_profile = BTRFS_BLOCK_GROUP_METADATA |
3597                                 fs_info->avail_metadata_alloc_bits;
3598         alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile);
3599
3600         ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size,
3601                                   &stripe_size, chunk_offset, alloc_profile);
3602         if (ret)
3603                 return ret;
3604
3605         sys_chunk_offset = chunk_offset + chunk_size;
3606
3607         alloc_profile = BTRFS_BLOCK_GROUP_SYSTEM |
3608                                 fs_info->avail_system_alloc_bits;
3609         alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile);
3610
3611         ret = __btrfs_alloc_chunk(trans, extent_root, &sys_map,
3612                                   &sys_chunk_size, &sys_stripe_size,
3613                                   sys_chunk_offset, alloc_profile);
3614         if (ret)
3615                 goto abort;
3616
3617         ret = btrfs_add_device(trans, fs_info->chunk_root, device);
3618         if (ret)
3619                 goto abort;
3620
3621         /*
3622          * Modifying the chunk tree requires allocating new blocks from
3623          * both the system and the metadata block groups, so we can only
3624          * perform operations that modify the chunk tree after both
3625          * block groups have been created.
3626          */
3627         ret = __finish_chunk_alloc(trans, extent_root, map, chunk_offset,
3628                                    chunk_size, stripe_size);
3629         if (ret)
3630                 goto abort;
3631
3632         ret = __finish_chunk_alloc(trans, extent_root, sys_map,
3633                                    sys_chunk_offset, sys_chunk_size,
3634                                    sys_stripe_size);
3635         if (ret)
3636                 goto abort;
3637
3638         return 0;
3639
3640 abort:
3641         btrfs_abort_transaction(trans, root, ret);
3642         return ret;
3643 }
3644
3645 int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset)
3646 {
3647         struct extent_map *em;
3648         struct map_lookup *map;
3649         struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree;
3650         int readonly = 0;
3651         int i;
3652
3653         read_lock(&map_tree->map_tree.lock);
3654         em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1);
3655         read_unlock(&map_tree->map_tree.lock);
3656         if (!em)
3657                 return 1;
3658
3659         if (btrfs_test_opt(root, DEGRADED)) {
3660                 free_extent_map(em);
3661                 return 0;
3662         }
3663
3664         map = (struct map_lookup *)em->bdev;
3665         for (i = 0; i < map->num_stripes; i++) {
3666                 if (!map->stripes[i].dev->writeable) {
3667                         readonly = 1;
3668                         break;
3669                 }
3670         }
3671         free_extent_map(em);
3672         return readonly;
3673 }
3674
3675 void btrfs_mapping_init(struct btrfs_mapping_tree *tree)
3676 {
3677         extent_map_tree_init(&tree->map_tree);
3678 }
3679
3680 void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree)
3681 {
3682         struct extent_map *em;
3683
3684         while (1) {
3685                 write_lock(&tree->map_tree.lock);
3686                 em = lookup_extent_mapping(&tree->map_tree, 0, (u64)-1);
3687                 if (em)
3688                         remove_extent_mapping(&tree->map_tree, em);
3689                 write_unlock(&tree->map_tree.lock);
3690                 if (!em)
3691                         break;
3692                 kfree(em->bdev);
3693                 /* once for us */
3694                 free_extent_map(em);
3695                 /* once for the tree */
3696                 free_extent_map(em);
3697         }
3698 }
3699
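/*
 * Number of redundant copies of the data at the given logical range:
 * num_stripes for DUP/RAID1, sub_stripes for RAID10, and 1 for
 * everything else (single, RAID0).
 */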
3700 int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len)
3701 {
3702         struct extent_map *em;
3703         struct map_lookup *map;
3704         struct extent_map_tree *em_tree = &map_tree->map_tree;
3705         int ret;
3706
3707         read_lock(&em_tree->lock);
3708         em = lookup_extent_mapping(em_tree, logical, len);
3709         read_unlock(&em_tree->lock);
3710         BUG_ON(!em);
3711
3712         BUG_ON(em->start > logical || em->start + em->len < logical);
3713         map = (struct map_lookup *)em->bdev;
3714         if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1))
3715                 ret = map->num_stripes;
3716         else if (map->type & BTRFS_BLOCK_GROUP_RAID10)
3717                 ret = map->sub_stripes;
3718         else
3719                 ret = 1;
3720         free_extent_map(em);
3721         return ret;
3722 }
3723
3724 static int find_live_mirror(struct map_lookup *map, int first, int num,
3725                             int optimal)
3726 {
3727         int i;
3728         if (map->stripes[optimal].dev->bdev)
3729                 return optimal;
3730         for (i = first; i < first + num; i++) {
3731                 if (map->stripes[i].dev->bdev)
3732                         return i;
3733         }
3734         /* we couldn't find one that doesn't fail.  Just return something
3735          * and the io error handling code will clean up eventually
3736          */
3737         return optimal;
3738 }
3739
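/*
 * Core logical-to-physical mapping.  For striped profiles *length is
 * trimmed so a single bio never crosses a stripe boundary; when bbio_ret
 * is given, a btrfs_bio is filled with one (device, physical) pair per
 * stripe that the request touches, and mirror_num selects a particular
 * copy on mirrored profiles.
 */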
3740 static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
3741                              u64 logical, u64 *length,
3742                              struct btrfs_bio **bbio_ret,
3743                              int mirror_num)
3744 {
3745         struct extent_map *em;
3746         struct map_lookup *map;
3747         struct extent_map_tree *em_tree = &map_tree->map_tree;
3748         u64 offset;
3749         u64 stripe_offset;
3750         u64 stripe_end_offset;
3751         u64 stripe_nr;
3752         u64 stripe_nr_orig;
3753         u64 stripe_nr_end;
3754         int stripe_index;
3755         int i;
3756         int ret = 0;
3757         int num_stripes;
3758         int max_errors = 0;
3759         struct btrfs_bio *bbio = NULL;
3760
3761         read_lock(&em_tree->lock);
3762         em = lookup_extent_mapping(em_tree, logical, *length);
3763         read_unlock(&em_tree->lock);
3764
3765         if (!em) {
3766                 printk(KERN_CRIT "unable to find logical %llu len %llu\n",
3767                        (unsigned long long)logical,
3768                        (unsigned long long)*length);
3769                 BUG();
3770         }
3771
3772         BUG_ON(em->start > logical || em->start + em->len < logical);
3773         map = (struct map_lookup *)em->bdev;
3774         offset = logical - em->start;
3775
3776         if (mirror_num > map->num_stripes)
3777                 mirror_num = 0;
3778
3779         stripe_nr = offset;
3780         /*
3781          * stripe_nr counts the total number of stripes we have to stride
3782          * to get to this block
3783          */
3784         do_div(stripe_nr, map->stripe_len);
3785
3786         stripe_offset = stripe_nr * map->stripe_len;
3787         BUG_ON(offset < stripe_offset);
3788
3789         /* stripe_offset is the offset of this block in its stripe */
3790         stripe_offset = offset - stripe_offset;
3791
3792         if (rw & REQ_DISCARD)
3793                 *length = min_t(u64, em->len - offset, *length);
3794         else if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
3795                 /* we limit the length of each bio to what fits in a stripe */
3796                 *length = min_t(u64, em->len - offset,
3797                                 map->stripe_len - stripe_offset);
3798         } else {
3799                 *length = em->len - offset;
3800         }
3801
3802         if (!bbio_ret)
3803                 goto out;
3804
3805         num_stripes = 1;
3806         stripe_index = 0;
3807         stripe_nr_orig = stripe_nr;
3808         stripe_nr_end = (offset + *length + map->stripe_len - 1) &
3809                         (~(map->stripe_len - 1));
3810         do_div(stripe_nr_end, map->stripe_len);
3811         stripe_end_offset = stripe_nr_end * map->stripe_len -
3812                             (offset + *length);
3813         if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
3814                 if (rw & REQ_DISCARD)
3815                         num_stripes = min_t(u64, map->num_stripes,
3816                                             stripe_nr_end - stripe_nr_orig);
3817                 stripe_index = do_div(stripe_nr, map->num_stripes);
3818         } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
3819                 if (rw & (REQ_WRITE | REQ_DISCARD))
3820                         num_stripes = map->num_stripes;
3821                 else if (mirror_num)
3822                         stripe_index = mirror_num - 1;
3823                 else {
3824                         stripe_index = find_live_mirror(map, 0,
3825                                             map->num_stripes,
3826                                             current->pid % map->num_stripes);
3827                         mirror_num = stripe_index + 1;
3828                 }
3829
3830         } else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
3831                 if (rw & (REQ_WRITE | REQ_DISCARD)) {
3832                         num_stripes = map->num_stripes;
3833                 } else if (mirror_num) {
3834                         stripe_index = mirror_num - 1;
3835                 } else {
3836                         mirror_num = 1;
3837                 }
3838
3839         } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
3840                 int factor = map->num_stripes / map->sub_stripes;
3841
3842                 stripe_index = do_div(stripe_nr, factor);
3843                 stripe_index *= map->sub_stripes;
3844
3845                 if (rw & REQ_WRITE)
3846                         num_stripes = map->sub_stripes;
3847                 else if (rw & REQ_DISCARD)
3848                         num_stripes = min_t(u64, map->sub_stripes *
3849                                             (stripe_nr_end - stripe_nr_orig),
3850                                             map->num_stripes);
3851                 else if (mirror_num)
3852                         stripe_index += mirror_num - 1;
3853                 else {
3854                         int old_stripe_index = stripe_index;
3855                         stripe_index = find_live_mirror(map, stripe_index,
3856                                               map->sub_stripes, stripe_index +
3857                                               current->pid % map->sub_stripes);
3858                         mirror_num = stripe_index - old_stripe_index + 1;
3859                 }
3860         } else {
3861                 /*
3862                  * after this do_div call, stripe_nr is the number of stripes
3863                  * on this device we have to walk to find the data, and
3864                  * stripe_index is the number of our device in the stripe array
3865                  */
3866                 stripe_index = do_div(stripe_nr, map->num_stripes);
3867                 mirror_num = stripe_index + 1;
3868         }
3869         BUG_ON(stripe_index >= map->num_stripes);
3870
3871         bbio = kzalloc(btrfs_bio_size(num_stripes), GFP_NOFS);
3872         if (!bbio) {
3873                 ret = -ENOMEM;
3874                 goto out;
3875         }
3876         atomic_set(&bbio->error, 0);
3877
3878         if (rw & REQ_DISCARD) {
3879                 int factor = 0;
3880                 int sub_stripes = 0;
3881                 u64 stripes_per_dev = 0;
3882                 u32 remaining_stripes = 0;
3883                 u32 last_stripe = 0;
3884
3885                 if (map->type &
3886                     (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10)) {
3887                         if (map->type & BTRFS_BLOCK_GROUP_RAID0)
3888                                 sub_stripes = 1;
3889                         else
3890                                 sub_stripes = map->sub_stripes;
3891
3892                         factor = map->num_stripes / sub_stripes;
3893                         stripes_per_dev = div_u64_rem(stripe_nr_end -
3894                                                       stripe_nr_orig,
3895                                                       factor,
3896                                                       &remaining_stripes);
3897                         div_u64_rem(stripe_nr_end - 1, factor, &last_stripe);
3898                         last_stripe *= sub_stripes;
3899                 }
3900
3901                 for (i = 0; i < num_stripes; i++) {
3902                         bbio->stripes[i].physical =
3903                                 map->stripes[stripe_index].physical +
3904                                 stripe_offset + stripe_nr * map->stripe_len;
3905                         bbio->stripes[i].dev = map->stripes[stripe_index].dev;
3906
3907                         if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
3908                                          BTRFS_BLOCK_GROUP_RAID10)) {
3909                                 bbio->stripes[i].length = stripes_per_dev *
3910                                                           map->stripe_len;
3911
3912                                 if (i / sub_stripes < remaining_stripes)
3913                                         bbio->stripes[i].length +=
3914                                                 map->stripe_len;
3915
3916                                 /*
3917                                  * Special for the first stripe and
3918                                  * the last stripe:
3919                                  *
3920                                  * |-------|...|-------|
3921                                  *     |----------|
3922                                  *    off     end_off
3923                                  */
3924                                 if (i < sub_stripes)
3925                                         bbio->stripes[i].length -=
3926                                                 stripe_offset;
3927
3928                                 if (stripe_index >= last_stripe &&
3929                                     stripe_index <= (last_stripe +
3930                                                      sub_stripes - 1))
3931                                         bbio->stripes[i].length -=
3932                                                 stripe_end_offset;
3933
3934                                 if (i == sub_stripes - 1)
3935                                         stripe_offset = 0;
3936                         } else
3937                                 bbio->stripes[i].length = *length;
3938
3939                         stripe_index++;
3940                         if (stripe_index == map->num_stripes) {
3941                                 /* This could only happen for RAID0/10 */
3942                                 stripe_index = 0;
3943                                 stripe_nr++;
3944                         }
3945                 }
3946         } else {
3947                 for (i = 0; i < num_stripes; i++) {
3948                         bbio->stripes[i].physical =
3949                                 map->stripes[stripe_index].physical +
3950                                 stripe_offset +
3951                                 stripe_nr * map->stripe_len;
3952                         bbio->stripes[i].dev =
3953                                 map->stripes[stripe_index].dev;
3954                         stripe_index++;
3955                 }
3956         }
3957
3958         if (rw & REQ_WRITE) {
3959                 if (map->type & (BTRFS_BLOCK_GROUP_RAID1 |
3960                                  BTRFS_BLOCK_GROUP_RAID10 |
3961                                  BTRFS_BLOCK_GROUP_DUP)) {
3962                         max_errors = 1;
3963                 }
3964         }
3965
3966         *bbio_ret = bbio;
3967         bbio->num_stripes = num_stripes;
3968         bbio->max_errors = max_errors;
3969         bbio->mirror_num = mirror_num;
3970 out:
3971         free_extent_map(em);
3972         return ret;
3973 }
3974
3975 int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
3976                       u64 logical, u64 *length,
3977                       struct btrfs_bio **bbio_ret, int mirror_num)
3978 {
3979         return __btrfs_map_block(map_tree, rw, logical, length, bbio_ret,
3980                                  mirror_num);
3981 }
3982
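/*
 * Reverse mapping: given a physical byte offset on a device, collect the
 * logical addresses inside the chunk at chunk_start that map to it.
 * Duplicate logical addresses from mirrored stripes are filtered out of
 * the returned array.
 */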
3983 int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
3984                      u64 chunk_start, u64 physical, u64 devid,
3985                      u64 **logical, int *naddrs, int *stripe_len)
3986 {
3987         struct extent_map_tree *em_tree = &map_tree->map_tree;
3988         struct extent_map *em;
3989         struct map_lookup *map;
3990         u64 *buf;
3991         u64 bytenr;
3992         u64 length;
3993         u64 stripe_nr;
3994         int i, j, nr = 0;
3995
3996         read_lock(&em_tree->lock);
3997         em = lookup_extent_mapping(em_tree, chunk_start, 1);
3998         read_unlock(&em_tree->lock);
3999
4000         BUG_ON(!em || em->start != chunk_start);
4001         map = (struct map_lookup *)em->bdev;
4002
4003         length = em->len;
4004         if (map->type & BTRFS_BLOCK_GROUP_RAID10)
4005                 do_div(length, map->num_stripes / map->sub_stripes);
4006         else if (map->type & BTRFS_BLOCK_GROUP_RAID0)
4007                 do_div(length, map->num_stripes);
4008
4009         buf = kzalloc(sizeof(u64) * map->num_stripes, GFP_NOFS);
4010         BUG_ON(!buf); /* -ENOMEM */
4011
4012         for (i = 0; i < map->num_stripes; i++) {
4013                 if (devid && map->stripes[i].dev->devid != devid)
4014                         continue;
4015                 if (map->stripes[i].physical > physical ||
4016                     map->stripes[i].physical + length <= physical)
4017                         continue;
4018
4019                 stripe_nr = physical - map->stripes[i].physical;
4020                 do_div(stripe_nr, map->stripe_len);
4021
4022                 if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
4023                         stripe_nr = stripe_nr * map->num_stripes + i;
4024                         do_div(stripe_nr, map->sub_stripes);
4025                 } else if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
4026                         stripe_nr = stripe_nr * map->num_stripes + i;
4027                 }
4028                 bytenr = chunk_start + stripe_nr * map->stripe_len;
4029                 WARN_ON(nr >= map->num_stripes);
4030                 for (j = 0; j < nr; j++) {
4031                         if (buf[j] == bytenr)
4032                                 break;
4033                 }
4034                 if (j == nr) {
4035                         WARN_ON(nr >= map->num_stripes);
4036                         buf[nr++] = bytenr;
4037                 }
4038         }
4039
4040         *logical = buf;
4041         *naddrs = nr;
4042         *stripe_len = map->stripe_len;
4043
4044         free_extent_map(em);
4045         return 0;
4046 }
4047
4048 static void *merge_stripe_index_into_bio_private(void *bi_private,
4049                                                  unsigned int stripe_index)
4050 {
4051         /*
4052          * with single, dup, RAID0, RAID1 and RAID10, stripe_index is
4053          * at most 1.
4054          * The alternative solution (instead of stealing bits from the
4055          * pointer) would be to allocate an intermediate structure
4056          * that contains the old private pointer plus the stripe_index.
4057          */
4058         BUG_ON((((uintptr_t)bi_private) & 3) != 0);
4059         BUG_ON(stripe_index > 3);
4060         return (void *)(((uintptr_t)bi_private) | stripe_index);
4061 }
4062
4063 static struct btrfs_bio *extract_bbio_from_bio_private(void *bi_private)
4064 {
4065         return (struct btrfs_bio *)(((uintptr_t)bi_private) & ~((uintptr_t)3));
4066 }
4067
4068 static unsigned int extract_stripe_index_from_bio_private(void *bi_private)
4069 {
4070         return (unsigned int)((uintptr_t)bi_private) & 3;
4071 }
4072
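/*
 * Per-stripe bio completion: on error, bump the matching device's I/O
 * error counters and the bbio error count.  When the last stripe bio
 * completes, the original bio is ended with success as long as no more
 * than bbio->max_errors stripes failed.
 */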
4073 static void btrfs_end_bio(struct bio *bio, int err)
4074 {
4075         struct btrfs_bio *bbio = extract_bbio_from_bio_private(bio->bi_private);
4076         int is_orig_bio = 0;
4077
4078         if (err) {
4079                 atomic_inc(&bbio->error);
4080                 if (err == -EIO || err == -EREMOTEIO) {
4081                         unsigned int stripe_index =
4082                                 extract_stripe_index_from_bio_private(
4083                                         bio->bi_private);
4084                         struct btrfs_device *dev;
4085
4086                         BUG_ON(stripe_index >= bbio->num_stripes);
4087                         dev = bbio->stripes[stripe_index].dev;
4088                         if (dev->bdev) {
4089                                 if (bio->bi_rw & WRITE)
4090                                         btrfs_dev_stat_inc(dev,
4091                                                 BTRFS_DEV_STAT_WRITE_ERRS);
4092                                 else
4093                                         btrfs_dev_stat_inc(dev,
4094                                                 BTRFS_DEV_STAT_READ_ERRS);
4095                                 if ((bio->bi_rw & WRITE_FLUSH) == WRITE_FLUSH)
4096                                         btrfs_dev_stat_inc(dev,
4097                                                 BTRFS_DEV_STAT_FLUSH_ERRS);
4098                                 btrfs_dev_stat_print_on_error(dev);
4099                         }
4100                 }
4101         }
4102
4103         if (bio == bbio->orig_bio)
4104                 is_orig_bio = 1;
4105
4106         if (atomic_dec_and_test(&bbio->stripes_pending)) {
4107                 if (!is_orig_bio) {
4108                         bio_put(bio);
4109                         bio = bbio->orig_bio;
4110                 }
4111                 bio->bi_private = bbio->private;
4112                 bio->bi_end_io = bbio->end_io;
4113                 bio->bi_bdev = (struct block_device *)
4114                                         (unsigned long)bbio->mirror_num;
4115                 /* only send an error to the higher layers if it is
4116                  * beyond the tolerance of the multi-bio
4117                  */
4118                 if (atomic_read(&bbio->error) > bbio->max_errors) {
4119                         err = -EIO;
4120                 } else {
4121                         /*
4122                          * this bio is actually up to date, we didn't
4123                          * go over the max number of errors
4124                          */
4125                         set_bit(BIO_UPTODATE, &bio->bi_flags);
4126                         err = 0;
4127                 }
4128                 kfree(bbio);
4129
4130                 bio_endio(bio, err);
4131         } else if (!is_orig_bio) {
4132                 bio_put(bio);
4133         }
4134 }
4135
4136 struct async_sched {
4137         struct bio *bio;
4138         int rw;
4139         struct btrfs_fs_info *info;
4140         struct btrfs_work work;
4141 };
4142
4143 /*
4144  * see run_scheduled_bios for a description of why bios are collected for
4145  * async submit.
4146  *
4147  * This will add one bio to the pending list for a device and make sure
4148  * the work struct is scheduled.
4149  */
4150 static noinline void schedule_bio(struct btrfs_root *root,
4151                                  struct btrfs_device *device,
4152                                  int rw, struct bio *bio)
4153 {
4154         int should_queue = 1;
4155         struct btrfs_pending_bios *pending_bios;
4156
4157         /* don't bother with additional async steps for reads, right now */
4158         if (!(rw & REQ_WRITE)) {
4159                 bio_get(bio);
4160                 btrfsic_submit_bio(rw, bio);
4161                 bio_put(bio);
4162                 return;
4163         }
4164
4165         /*
4166          * nr_async_bios allows us to reliably return congestion to the
4167          * higher layers.  Otherwise, the async bio makes it appear we have
4168          * made progress against dirty pages when we've really just put it
4169          * on a queue for later
4170          */
4171         atomic_inc(&root->fs_info->nr_async_bios);
4172         WARN_ON(bio->bi_next);
4173         bio->bi_next = NULL;
4174         bio->bi_rw |= rw;
4175
4176         spin_lock(&device->io_lock);
4177         if (bio->bi_rw & REQ_SYNC)
4178                 pending_bios = &device->pending_sync_bios;
4179         else
4180                 pending_bios = &device->pending_bios;
4181
4182         if (pending_bios->tail)
4183                 pending_bios->tail->bi_next = bio;
4184
4185         pending_bios->tail = bio;
4186         if (!pending_bios->head)
4187                 pending_bios->head = bio;
4188         if (device->running_pending)
4189                 should_queue = 0;
4190
4191         spin_unlock(&device->io_lock);
4192
4193         if (should_queue)
4194                 btrfs_queue_worker(&root->fs_info->submit_workers,
4195                                    &device->work);
4196 }
4197
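/*
 * Map a bio and submit it to the devices it touches: the bio is cloned
 * once per additional stripe, each clone's bi_private is tagged with its
 * stripe index for btrfs_end_bio(), and the clones are submitted either
 * directly or through the async submit_workers.
 */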
4198 int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
4199                   int mirror_num, int async_submit)
4200 {
4201         struct btrfs_mapping_tree *map_tree;
4202         struct btrfs_device *dev;
4203         struct bio *first_bio = bio;
4204         u64 logical = (u64)bio->bi_sector << 9;
4205         u64 length = 0;
4206         u64 map_length;
4207         int ret;
4208         int dev_nr = 0;
4209         int total_devs = 1;
4210         struct btrfs_bio *bbio = NULL;
4211
4212         length = bio->bi_size;
4213         map_tree = &root->fs_info->mapping_tree;
4214         map_length = length;
4215
4216         ret = btrfs_map_block(map_tree, rw, logical, &map_length, &bbio,
4217                               mirror_num);
4218         if (ret) /* -ENOMEM */
4219                 return ret;
4220
4221         total_devs = bbio->num_stripes;
4222         if (map_length < length) {
4223                 printk(KERN_CRIT "mapping failed logical %llu bio len %llu "
4224                        "len %llu\n", (unsigned long long)logical,
4225                        (unsigned long long)length,
4226                        (unsigned long long)map_length);
4227                 BUG();
4228         }
4229
4230         bbio->orig_bio = first_bio;
4231         bbio->private = first_bio->bi_private;
4232         bbio->end_io = first_bio->bi_end_io;
4233         atomic_set(&bbio->stripes_pending, bbio->num_stripes);
4234
4235         while (dev_nr < total_devs) {
4236                 if (dev_nr < total_devs - 1) {
4237                         bio = bio_clone(first_bio, GFP_NOFS);
4238                         BUG_ON(!bio); /* -ENOMEM */
4239                 } else {
4240                         bio = first_bio;
4241                 }
4242                 bio->bi_private = bbio;
4243                 bio->bi_private = merge_stripe_index_into_bio_private(
4244                                 bio->bi_private, (unsigned int)dev_nr);
4245                 bio->bi_end_io = btrfs_end_bio;
4246                 bio->bi_sector = bbio->stripes[dev_nr].physical >> 9;
4247                 dev = bbio->stripes[dev_nr].dev;
4248                 if (dev && dev->bdev && (rw != WRITE || dev->writeable)) {
4249 #ifdef DEBUG
4250                         struct rcu_string *name;
4251
4252                         rcu_read_lock();
4253                         name = rcu_dereference(dev->name);
4254                         pr_debug("btrfs_map_bio: rw %d, sector=%llu, dev=%lu "
4255                                  "(%s id %llu), size=%u\n", rw,
4256                                  (u64)bio->bi_sector, (u_long)dev->bdev->bd_dev,
4257                                  name->str, dev->devid, bio->bi_size);
4258                         rcu_read_unlock();
4259 #endif
4260                         bio->bi_bdev = dev->bdev;
4261                         if (async_submit)
4262                                 schedule_bio(root, dev, rw, bio);
4263                         else
4264                                 btrfsic_submit_bio(rw, bio);
4265                 } else {
4266                         bio->bi_bdev = root->fs_info->fs_devices->latest_bdev;
4267                         bio->bi_sector = logical >> 9;
4268                         bio_endio(bio, -EIO);
4269                 }
4270                 dev_nr++;
4271         }
4272         return 0;
4273 }
4274
4275 struct btrfs_device *btrfs_find_device(struct btrfs_root *root, u64 devid,
4276                                        u8 *uuid, u8 *fsid)
4277 {
4278         struct btrfs_device *device;
4279         struct btrfs_fs_devices *cur_devices;
4280
4281         cur_devices = root->fs_info->fs_devices;
4282         while (cur_devices) {
4283                 if (!fsid ||
4284                     !memcmp(cur_devices->fsid, fsid, BTRFS_UUID_SIZE)) {
4285                         device = __find_device(&cur_devices->devices,
4286                                                devid, uuid);
4287                         if (device)
4288                                 return device;
4289                 }
4290                 cur_devices = cur_devices->seed;
4291         }
4292         return NULL;
4293 }
4294
4295 static struct btrfs_device *add_missing_dev(struct btrfs_root *root,
4296                                             u64 devid, u8 *dev_uuid)
4297 {
4298         struct btrfs_device *device;
4299         struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
4300
4301         device = kzalloc(sizeof(*device), GFP_NOFS);
4302         if (!device)
4303                 return NULL;
4304         list_add(&device->dev_list,
4305                  &fs_devices->devices);
4306         device->dev_root = root->fs_info->dev_root;
4307         device->devid = devid;
4308         device->work.func = pending_bios_fn;
4309         device->fs_devices = fs_devices;
4310         device->missing = 1;
4311         fs_devices->num_devices++;
4312         fs_devices->missing_devices++;
4313         spin_lock_init(&device->io_lock);
4314         INIT_LIST_HEAD(&device->dev_alloc_list);
4315         memcpy(device->uuid, dev_uuid, BTRFS_UUID_SIZE);
4316         return device;
4317 }
4318
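/*
 * Turn an on-disk chunk item into an in-memory map_lookup and insert it
 * into the mapping tree.  Stripes that point at devices we cannot find
 * are fatal (-EIO) unless the filesystem is mounted degraded, in which
 * case a placeholder "missing" device is created.
 */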
4319 static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
4320                           struct extent_buffer *leaf,
4321                           struct btrfs_chunk *chunk)
4322 {
4323         struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree;
4324         struct map_lookup *map;
4325         struct extent_map *em;
4326         u64 logical;
4327         u64 length;
4328         u64 devid;
4329         u8 uuid[BTRFS_UUID_SIZE];
4330         int num_stripes;
4331         int ret;
4332         int i;
4333
4334         logical = key->offset;
4335         length = btrfs_chunk_length(leaf, chunk);
4336
4337         read_lock(&map_tree->map_tree.lock);
4338         em = lookup_extent_mapping(&map_tree->map_tree, logical, 1);
4339         read_unlock(&map_tree->map_tree.lock);
4340
4341         /* already mapped? */
4342         if (em && em->start <= logical && em->start + em->len > logical) {
4343                 free_extent_map(em);
4344                 return 0;
4345         } else if (em) {
4346                 free_extent_map(em);
4347         }
4348
4349         em = alloc_extent_map();
4350         if (!em)
4351                 return -ENOMEM;
4352         num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
4353         map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
4354         if (!map) {
4355                 free_extent_map(em);
4356                 return -ENOMEM;
4357         }
4358
4359         em->bdev = (struct block_device *)map;
4360         em->start = logical;
4361         em->len = length;
4362         em->block_start = 0;
4363         em->block_len = em->len;
4364
4365         map->num_stripes = num_stripes;
4366         map->io_width = btrfs_chunk_io_width(leaf, chunk);
4367         map->io_align = btrfs_chunk_io_align(leaf, chunk);
4368         map->sector_size = btrfs_chunk_sector_size(leaf, chunk);
4369         map->stripe_len = btrfs_chunk_stripe_len(leaf, chunk);
4370         map->type = btrfs_chunk_type(leaf, chunk);
4371         map->sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk);
4372         for (i = 0; i < num_stripes; i++) {
4373                 map->stripes[i].physical =
4374                         btrfs_stripe_offset_nr(leaf, chunk, i);
4375                 devid = btrfs_stripe_devid_nr(leaf, chunk, i);
4376                 read_extent_buffer(leaf, uuid, (unsigned long)
4377                                    btrfs_stripe_dev_uuid_nr(chunk, i),
4378                                    BTRFS_UUID_SIZE);
4379                 map->stripes[i].dev = btrfs_find_device(root, devid, uuid,
4380                                                         NULL);
4381                 if (!map->stripes[i].dev && !btrfs_test_opt(root, DEGRADED)) {
4382                         kfree(map);
4383                         free_extent_map(em);
4384                         return -EIO;
4385                 }
4386                 if (!map->stripes[i].dev) {
4387                         map->stripes[i].dev =
4388                                 add_missing_dev(root, devid, uuid);
4389                         if (!map->stripes[i].dev) {
4390                                 kfree(map);
4391                                 free_extent_map(em);
4392                                 return -EIO;
4393                         }
4394                 }
4395                 map->stripes[i].dev->in_fs_metadata = 1;
4396         }
4397
4398         write_lock(&map_tree->map_tree.lock);
4399         ret = add_extent_mapping(&map_tree->map_tree, em);
4400         write_unlock(&map_tree->map_tree.lock);
4401         BUG_ON(ret); /* Tree corruption */
4402         free_extent_map(em);
4403
4404         return 0;
4405 }
4406
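/*
 * Copy the on-disk fields of a dev item into the in-memory
 * btrfs_device structure.
 */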
4407 static void fill_device_from_item(struct extent_buffer *leaf,
4408                                  struct btrfs_dev_item *dev_item,
4409                                  struct btrfs_device *device)
4410 {
4411         unsigned long ptr;
4412
4413         device->devid = btrfs_device_id(leaf, dev_item);
4414         device->disk_total_bytes = btrfs_device_total_bytes(leaf, dev_item);
4415         device->total_bytes = device->disk_total_bytes;
4416         device->bytes_used = btrfs_device_bytes_used(leaf, dev_item);
4417         device->type = btrfs_device_type(leaf, dev_item);
4418         device->io_align = btrfs_device_io_align(leaf, dev_item);
4419         device->io_width = btrfs_device_io_width(leaf, dev_item);
4420         device->sector_size = btrfs_device_sector_size(leaf, dev_item);
4421
4422         ptr = (unsigned long)btrfs_device_uuid(dev_item);
4423         read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
4424 }
4425
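/*
 * Make sure the seed filesystem with the given fsid is open and chained
 * into the ->seed list of the current fs_devices.  Returns -ENOENT if
 * the seed devices were never scanned and -EINVAL if the devices found
 * are not marked as seeding.
 */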
4426 static int open_seed_devices(struct btrfs_root *root, u8 *fsid)
4427 {
4428         struct btrfs_fs_devices *fs_devices;
4429         int ret;
4430
4431         BUG_ON(!mutex_is_locked(&uuid_mutex));
4432
4433         fs_devices = root->fs_info->fs_devices->seed;
4434         while (fs_devices) {
4435                 if (!memcmp(fs_devices->fsid, fsid, BTRFS_UUID_SIZE)) {
4436                         ret = 0;
4437                         goto out;
4438                 }
4439                 fs_devices = fs_devices->seed;
4440         }
4441
4442         fs_devices = find_fsid(fsid);
4443         if (!fs_devices) {
4444                 ret = -ENOENT;
4445                 goto out;
4446         }
4447
4448         fs_devices = clone_fs_devices(fs_devices);
4449         if (IS_ERR(fs_devices)) {
4450                 ret = PTR_ERR(fs_devices);
4451                 goto out;
4452         }
4453
4454         ret = __btrfs_open_devices(fs_devices, FMODE_READ,
4455                                    root->fs_info->bdev_holder);
4456         if (ret) {
4457                 free_fs_devices(fs_devices);
4458                 goto out;
4459         }
4460
4461         if (!fs_devices->seeding) {
4462                 __btrfs_close_devices(fs_devices);
4463                 free_fs_devices(fs_devices);
4464                 ret = -EINVAL;
4465                 goto out;
4466         }
4467
4468         fs_devices->seed = root->fs_info->fs_devices->seed;
4469         root->fs_info->fs_devices->seed = fs_devices;
4470 out:
4471         return ret;
4472 }
4473
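/*
 * Read one dev item from the chunk tree and update (or create) the
 * matching in-memory device.  Devices belonging to a different fsid are
 * resolved through their seed filesystem; devices that cannot be found
 * are only tolerated on degraded mounts.
 */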
4474 static int read_one_dev(struct btrfs_root *root,
4475                         struct extent_buffer *leaf,
4476                         struct btrfs_dev_item *dev_item)
4477 {
4478         struct btrfs_device *device;
4479         u64 devid;
4480         int ret;
4481         u8 fs_uuid[BTRFS_UUID_SIZE];
4482         u8 dev_uuid[BTRFS_UUID_SIZE];
4483
4484         devid = btrfs_device_id(leaf, dev_item);
4485         read_extent_buffer(leaf, dev_uuid,
4486                            (unsigned long)btrfs_device_uuid(dev_item),
4487                            BTRFS_UUID_SIZE);
4488         read_extent_buffer(leaf, fs_uuid,
4489                            (unsigned long)btrfs_device_fsid(dev_item),
4490                            BTRFS_UUID_SIZE);
4491
4492         if (memcmp(fs_uuid, root->fs_info->fsid, BTRFS_UUID_SIZE)) {
4493                 ret = open_seed_devices(root, fs_uuid);
4494                 if (ret && !btrfs_test_opt(root, DEGRADED))
4495                         return ret;
4496         }
4497
4498         device = btrfs_find_device(root, devid, dev_uuid, fs_uuid);
4499         if (!device || !device->bdev) {
4500                 if (!btrfs_test_opt(root, DEGRADED))
4501                         return -EIO;
4502
4503                 if (!device) {
4504                         printk(KERN_WARNING "btrfs: devid %llu missing\n",
4505                                (unsigned long long)devid);
4506                         device = add_missing_dev(root, devid, dev_uuid);
4507                         if (!device)
4508                                 return -ENOMEM;
4509                 } else if (!device->missing) {
4510                         /*
4511                          * This happens when a device that was properly set
4512                          * up in the device info lists suddenly goes bad.
4513                          * device->bdev is NULL, and so we have to set
4514                          * device->missing to one here.
4515                          */
4516                         root->fs_info->fs_devices->missing_devices++;
4517                         device->missing = 1;
4518                 }
4519         }
4520
4521         if (device->fs_devices != root->fs_info->fs_devices) {
4522                 BUG_ON(device->writeable);
4523                 if (device->generation !=
4524                     btrfs_device_generation(leaf, dev_item))
4525                         return -EINVAL;
4526         }
4527
4528         fill_device_from_item(leaf, dev_item, device);
4529         device->dev_root = root->fs_info->dev_root;
4530         device->in_fs_metadata = 1;
4531         if (device->writeable) {
4532                 device->fs_devices->total_rw_bytes += device->total_bytes;
4533                 spin_lock(&root->fs_info->free_chunk_lock);
4534                 root->fs_info->free_chunk_space += device->total_bytes -
4535                         device->bytes_used;
4536                 spin_unlock(&root->fs_info->free_chunk_lock);
4537         }
4538         ret = 0;
4539         return ret;
4540 }
4541
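/*
 * Parse the sys_chunk_array embedded in the super block.  It holds the
 * (disk key, chunk item) pairs for the system chunks that must be
 * mapped before the chunk tree itself can be read.
 */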
4542 int btrfs_read_sys_array(struct btrfs_root *root)
4543 {
4544         struct btrfs_super_block *super_copy = root->fs_info->super_copy;
4545         struct extent_buffer *sb;
4546         struct btrfs_disk_key *disk_key;
4547         struct btrfs_chunk *chunk;
4548         u8 *ptr;
4549         unsigned long sb_ptr;
4550         int ret = 0;
4551         u32 num_stripes;
4552         u32 array_size;
4553         u32 len = 0;
4554         u32 cur;
4555         struct btrfs_key key;
4556
4557         sb = btrfs_find_create_tree_block(root, BTRFS_SUPER_INFO_OFFSET,
4558                                           BTRFS_SUPER_INFO_SIZE);
4559         if (!sb)
4560                 return -ENOMEM;
4561         btrfs_set_buffer_uptodate(sb);
4562         btrfs_set_buffer_lockdep_class(root->root_key.objectid, sb, 0);
4563         /*
4564          * The sb extent buffer is artificial and is only used to read the
4565          * system array.  btrfs_set_buffer_uptodate() does not properly mark
4566          * all of its pages up-to-date when the page size is larger: the
4567          * extent does not cover the whole page, so check_page_uptodate does
4568          * not find all the page's extents up-to-date (the hole beyond sb)
4569          * and write_extent_buffer then triggers a WARN_ON.
4570          *
4571          * Regular short extents go through the mark_extent_buffer_dirty/
4572          * writeback cycle, but sb only spans this function.  Add an explicit
4573          * SetPageUptodate call to silence the warning, e.g. on PowerPC 64.
4574          */
4575         if (PAGE_CACHE_SIZE > BTRFS_SUPER_INFO_SIZE)
4576                 SetPageUptodate(sb->pages[0]);
4577
4578         write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE);
4579         array_size = btrfs_super_sys_array_size(super_copy);
4580
4581         ptr = super_copy->sys_chunk_array;
4582         sb_ptr = offsetof(struct btrfs_super_block, sys_chunk_array);
4583         cur = 0;
4584
4585         while (cur < array_size) {
4586                 disk_key = (struct btrfs_disk_key *)ptr;
4587                 btrfs_disk_key_to_cpu(&key, disk_key);
4588
4589                 len = sizeof(*disk_key);
4590                 ptr += len;
4591                 sb_ptr += len;
4592                 cur += len;
4592
4593                 if (key.type == BTRFS_CHUNK_ITEM_KEY) {
4594                         chunk = (struct btrfs_chunk *)sb_ptr;
4595                         ret = read_one_chunk(root, &key, sb, chunk);
4596                         if (ret)
4597                                 break;
4598                         num_stripes = btrfs_chunk_num_stripes(sb, chunk);
4599                         len = btrfs_chunk_item_size(num_stripes);
4600                 } else {
4601                         ret = -EIO;
4602                         break;
4603                 }
4604                 ptr += len;
4605                 sb_ptr += len;
4606                 cur += len;
4607         }
4608         free_extent_buffer(sb);
4609         return ret;
4610 }
4611
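/*
 * Map a logical address to the device that backs the given mirror.
 * Returns NULL if the block mapping fails.
 */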
4612 struct btrfs_device *btrfs_find_device_for_logical(struct btrfs_root *root,
4613                                                    u64 logical, int mirror_num)
4614 {
4615         struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree;
4616         int ret;
4617         u64 map_length = 0;
4618         struct btrfs_bio *bbio = NULL;
4619         struct btrfs_device *device;
4620
4621         BUG_ON(mirror_num == 0);
4622         ret = btrfs_map_block(map_tree, WRITE, logical, &map_length, &bbio,
4623                               mirror_num);
4624         if (ret) {
4625                 BUG_ON(bbio != NULL);
4626                 return NULL;
4627         }
4628         BUG_ON(mirror_num != bbio->mirror_num);
4629         device = bbio->stripes[mirror_num - 1].dev;
4630         kfree(bbio);
4631         return device;
4632 }
4633
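/*
 * Read all device items and chunk items from the chunk tree so that the
 * logical->physical mapping is complete before the other trees are read.
 */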
4634 int btrfs_read_chunk_tree(struct btrfs_root *root)
4635 {
4636         struct btrfs_path *path;
4637         struct extent_buffer *leaf;
4638         struct btrfs_key key;
4639         struct btrfs_key found_key;
4640         int ret;
4641         int slot;
4642
4643         root = root->fs_info->chunk_root;
4644
4645         path = btrfs_alloc_path();
4646         if (!path)
4647                 return -ENOMEM;
4648
4649         mutex_lock(&uuid_mutex);
4650         lock_chunks(root);
4651
4652         /* first we search for all of the device items, and then we
4653          * read in all of the chunk items.  This way we can create chunk
4654          * mappings that reference all of the devices that are found.
4655          */
4656         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
4657         key.offset = 0;
4658         key.type = 0;
4659 again:
4660         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
4661         if (ret < 0)
4662                 goto error;
4663         while (1) {
4664                 leaf = path->nodes[0];
4665                 slot = path->slots[0];
4666                 if (slot >= btrfs_header_nritems(leaf)) {
4667                         ret = btrfs_next_leaf(root, path);
4668                         if (ret == 0)
4669                                 continue;
4670                         if (ret < 0)
4671                                 goto error;
4672                         break;
4673                 }
4674                 btrfs_item_key_to_cpu(leaf, &found_key, slot);
4675                 if (key.objectid == BTRFS_DEV_ITEMS_OBJECTID) {
4676                         if (found_key.objectid != BTRFS_DEV_ITEMS_OBJECTID)
4677                                 break;
4678                         if (found_key.type == BTRFS_DEV_ITEM_KEY) {
4679                                 struct btrfs_dev_item *dev_item;
4680                                 dev_item = btrfs_item_ptr(leaf, slot,
4681                                                   struct btrfs_dev_item);
4682                                 ret = read_one_dev(root, leaf, dev_item);
4683                                 if (ret)
4684                                         goto error;
4685                         }
4686                 } else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) {
4687                         struct btrfs_chunk *chunk;
4688                         chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
4689                         ret = read_one_chunk(root, &found_key, leaf, chunk);
4690                         if (ret)
4691                                 goto error;
4692                 }
4693                 path->slots[0]++;
4694         }
4695         if (key.objectid == BTRFS_DEV_ITEMS_OBJECTID) {
4696                 key.objectid = 0;
4697                 btrfs_release_path(path);
4698                 goto again;
4699         }
4700         ret = 0;
4701 error:
4702         unlock_chunks(root);
4703         mutex_unlock(&uuid_mutex);
4704
4705         btrfs_free_path(path);
4706         return ret;
4707 }
4708
4709 static void __btrfs_reset_dev_stats(struct btrfs_device *dev)
4710 {
4711         int i;
4712
4713         for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
4714                 btrfs_dev_stat_reset(dev, i);
4715 }
4716
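/*
 * Load the persistent dev_stats items for all devices at mount time.
 * Devices without a dev_stats item on disk start with zeroed counters.
 */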
4717 int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info)
4718 {
4719         struct btrfs_key key;
4720         struct btrfs_key found_key;
4721         struct btrfs_root *dev_root = fs_info->dev_root;
4722         struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
4723         struct extent_buffer *eb;
4724         int slot;
4725         int ret = 0;
4726         struct btrfs_device *device;
4727         struct btrfs_path *path = NULL;
4728         int i;
4729
4730         path = btrfs_alloc_path();
4731         if (!path) {
4732                 ret = -ENOMEM;
4733                 goto out;
4734         }
4735
4736         mutex_lock(&fs_devices->device_list_mutex);
4737         list_for_each_entry(device, &fs_devices->devices, dev_list) {
4738                 int item_size;
4739                 struct btrfs_dev_stats_item *ptr;
4740
4741                 key.objectid = 0;
4742                 key.type = BTRFS_DEV_STATS_KEY;
4743                 key.offset = device->devid;
4744                 ret = btrfs_search_slot(NULL, dev_root, &key, path, 0, 0);
4745                 if (ret) {
4746                         __btrfs_reset_dev_stats(device);
4747                         device->dev_stats_valid = 1;
4748                         btrfs_release_path(path);
4749                         continue;
4750                 }
4751                 slot = path->slots[0];
4752                 eb = path->nodes[0];
4753                 btrfs_item_key_to_cpu(eb, &found_key, slot);
4754                 item_size = btrfs_item_size_nr(eb, slot);
4755
4756                 ptr = btrfs_item_ptr(eb, slot,
4757                                      struct btrfs_dev_stats_item);
4758
4759                 for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) {
4760                         if (item_size >= (1 + i) * sizeof(__le64))
4761                                 btrfs_dev_stat_set(device, i,
4762                                         btrfs_dev_stats_value(eb, ptr, i));
4763                         else
4764                                 btrfs_dev_stat_reset(device, i);
4765                 }
4766
4767                 device->dev_stats_valid = 1;
4768                 btrfs_dev_stat_print_on_load(device);
4769                 btrfs_release_path(path);
4770         }
4771         mutex_unlock(&fs_devices->device_list_mutex);
4772
4773 out:
4774         btrfs_free_path(path);
4775         return ret < 0 ? ret : 0;
4776 }
4777
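/*
 * Write the in-memory stat counters of one device back into its
 * dev_stats item, replacing an existing item that is too small for the
 * current number of counters.
 */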
4778 static int update_dev_stat_item(struct btrfs_trans_handle *trans,
4779                                 struct btrfs_root *dev_root,
4780                                 struct btrfs_device *device)
4781 {
4782         struct btrfs_path *path;
4783         struct btrfs_key key;
4784         struct extent_buffer *eb;
4785         struct btrfs_dev_stats_item *ptr;
4786         int ret;
4787         int i;
4788
4789         key.objectid = 0;
4790         key.type = BTRFS_DEV_STATS_KEY;
4791         key.offset = device->devid;
4792
4793         path = btrfs_alloc_path();
4794         if (!path)
4795                 return -ENOMEM;
4795         ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1);
4796         if (ret < 0) {
4797                 printk_in_rcu(KERN_WARNING "btrfs: error %d while searching for dev_stats item for device %s!\n",
4798                               ret, rcu_str_deref(device->name));
4799                 goto out;
4800         }
4801
4802         if (ret == 0 &&
4803             btrfs_item_size_nr(path->nodes[0], path->slots[0]) < sizeof(*ptr)) {
4804                 /* need to delete old one and insert a new one */
4805                 ret = btrfs_del_item(trans, dev_root, path);
4806                 if (ret != 0) {
4807                         printk_in_rcu(KERN_WARNING "btrfs: delete too small dev_stats item for device %s failed %d!\n",
4808                                       rcu_str_deref(device->name), ret);
4809                         goto out;
4810                 }
4811                 ret = 1;
4812         }
4813
4814         if (ret == 1) {
4815                 /* need to insert a new item */
4816                 btrfs_release_path(path);
4817                 ret = btrfs_insert_empty_item(trans, dev_root, path,
4818                                               &key, sizeof(*ptr));
4819                 if (ret < 0) {
4820                         printk_in_rcu(KERN_WARNING "btrfs: insert dev_stats item for device %s failed %d!\n",
4821                                       rcu_str_deref(device->name), ret);
4822                         goto out;
4823                 }
4824         }
4825
4826         eb = path->nodes[0];
4827         ptr = btrfs_item_ptr(eb, path->slots[0], struct btrfs_dev_stats_item);
4828         for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
4829                 btrfs_set_dev_stats_value(eb, ptr, i,
4830                                           btrfs_dev_stat_read(device, i));
4831         btrfs_mark_buffer_dirty(eb);
4832
4833 out:
4834         btrfs_free_path(path);
4835         return ret;
4836 }
4837
4838 /*
4839  * Called from commit_transaction.  Writes all changed device stats to disk.
4840  */
4841 int btrfs_run_dev_stats(struct btrfs_trans_handle *trans,
4842                         struct btrfs_fs_info *fs_info)
4843 {
4844         struct btrfs_root *dev_root = fs_info->dev_root;
4845         struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
4846         struct btrfs_device *device;
4847         int ret = 0;
4848
4849         mutex_lock(&fs_devices->device_list_mutex);
4850         list_for_each_entry(device, &fs_devices->devices, dev_list) {
4851                 if (!device->dev_stats_valid || !device->dev_stats_dirty)
4852                         continue;
4853
4854                 ret = update_dev_stat_item(trans, dev_root, device);
4855                 if (!ret)
4856                         device->dev_stats_dirty = 0;
4857         }
4858         mutex_unlock(&fs_devices->device_list_mutex);
4859
4860         return ret;
4861 }
4862
4863 void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index)
4864 {
4865         btrfs_dev_stat_inc(dev, index);
4866         btrfs_dev_stat_print_on_error(dev);
4867 }
4868
4869 void btrfs_dev_stat_print_on_error(struct btrfs_device *dev)
4870 {
4871         if (!dev->dev_stats_valid)
4872                 return;
4873         printk_ratelimited_in_rcu(KERN_ERR
4874                            "btrfs: bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u\n",
4875                            rcu_str_deref(dev->name),
4876                            btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
4877                            btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
4878                            btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS),
4879                            btrfs_dev_stat_read(dev,
4880                                                BTRFS_DEV_STAT_CORRUPTION_ERRS),
4881                            btrfs_dev_stat_read(dev,
4882                                                BTRFS_DEV_STAT_GENERATION_ERRS));
4883 }
4884
4885 static void btrfs_dev_stat_print_on_load(struct btrfs_device *dev)
4886 {
4887         int i;
4888
4889         for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
4890                 if (btrfs_dev_stat_read(dev, i) != 0)
4891                         break;
4892         if (i == BTRFS_DEV_STAT_VALUES_MAX)
4893                 return; /* all values == 0, suppress message */
4894
4895         printk_in_rcu(KERN_INFO "btrfs: bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u\n",
4896                rcu_str_deref(dev->name),
4897                btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
4898                btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
4899                btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS),
4900                btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS),
4901                btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS));
4902 }
4903
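/*
 * Copy the stat counters of one device into a userspace ioctl buffer,
 * optionally resetting them (BTRFS_DEV_STATS_RESET).
 */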
4904 int btrfs_get_dev_stats(struct btrfs_root *root,
4905                         struct btrfs_ioctl_get_dev_stats *stats)
4906 {
4907         struct btrfs_device *dev;
4908         struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
4909         int i;
4910
4911         mutex_lock(&fs_devices->device_list_mutex);
4912         dev = btrfs_find_device(root, stats->devid, NULL, NULL);
4913         mutex_unlock(&fs_devices->device_list_mutex);
4914
4915         if (!dev) {
4916                 printk(KERN_WARNING
4917                        "btrfs: get dev_stats failed, device not found\n");
4918                 return -ENODEV;
4919         } else if (!dev->dev_stats_valid) {
4920                 printk(KERN_WARNING
4921                        "btrfs: get dev_stats failed, not yet valid\n");
4922                 return -ENODEV;
4923         } else if (stats->flags & BTRFS_DEV_STATS_RESET) {
4924                 for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) {
4925                         if (stats->nr_items > i)
4926                                 stats->values[i] =
4927                                         btrfs_dev_stat_read_and_reset(dev, i);
4928                         else
4929                                 btrfs_dev_stat_reset(dev, i);
4930                 }
4931         } else {
4932                 for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
4933                         if (stats->nr_items > i)
4934                                 stats->values[i] = btrfs_dev_stat_read(dev, i);
4935         }
4936         if (stats->nr_items > BTRFS_DEV_STAT_VALUES_MAX)
4937                 stats->nr_items = BTRFS_DEV_STAT_VALUES_MAX;
4938         return 0;
4939 }