
fs/btrfs/extent-tree.c (sagit-ice-cold/kernel_xiaomi_msm8998.git)
1 /*
2  * Copyright (C) 2007 Oracle.  All rights reserved.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU General Public
6  * License v2 as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope that it will be useful,
9  * but WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
11  * General Public License for more details.
12  *
13  * You should have received a copy of the GNU General Public
14  * License along with this program; if not, write to the
15  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16  * Boston, MA 02111-1307, USA.
17  */
18 #include <linux/sched.h>
19 #include <linux/pagemap.h>
20 #include <linux/writeback.h>
21 #include <linux/blkdev.h>
22 #include <linux/sort.h>
23 #include <linux/rcupdate.h>
24 #include <linux/kthread.h>
25 #include <linux/slab.h>
26 #include <linux/ratelimit.h>
27 #include <linux/percpu_counter.h>
28 #include "hash.h"
29 #include "tree-log.h"
30 #include "disk-io.h"
31 #include "print-tree.h"
32 #include "volumes.h"
33 #include "raid56.h"
34 #include "locking.h"
35 #include "free-space-cache.h"
36 #include "math.h"
37 #include "sysfs.h"
38 #include "qgroup.h"
39
40 #undef SCRAMBLE_DELAYED_REFS
41
42 /*
43  * control flags for do_chunk_alloc's force field
44  * CHUNK_ALLOC_NO_FORCE means to only allocate a chunk
45  * if we really need one.
46  *
47  * CHUNK_ALLOC_LIMITED means to only try to allocate one
48  * if we have very few chunks already allocated.  This is
49  * used as part of the clustering code to help make sure
50  * we have a good pool of storage to cluster in, without
51  * filling the FS with empty chunks.
52  *
53  * CHUNK_ALLOC_FORCE means it must try to allocate one.
54  *
55  */
56 enum {
57         CHUNK_ALLOC_NO_FORCE = 0,
58         CHUNK_ALLOC_LIMITED = 1,
59         CHUNK_ALLOC_FORCE = 2,
60 };
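/*
 * Editor's sketch (not part of this file): how a caller is expected to pick
 * the force value.  A path that must have a new chunk forces the allocation,
 * while opportunistic callers pass CHUNK_ALLOC_NO_FORCE:
 *
 *	ret = do_chunk_alloc(trans, extent_root, alloc_flags, CHUNK_ALLOC_FORCE);
 *	if (ret < 0 && ret != -ENOSPC)
 *		goto abort;
 *
 * alloc_flags here stands in for a block group type mask (data/metadata/
 * system); the exact call sites and error handling in this file differ.
 */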
61
62 /*
63  * Control how reservations are dealt with.
64  *
65  * RESERVE_FREE - freeing a reservation.
66  * RESERVE_ALLOC - allocating space and we need to update bytes_may_use for
67  *   ENOSPC accounting
68  * RESERVE_ALLOC_NO_ACCOUNT - allocating space and we should not update
69  *   bytes_may_use as the ENOSPC accounting is done elsewhere
70  */
71 enum {
72         RESERVE_FREE = 0,
73         RESERVE_ALLOC = 1,
74         RESERVE_ALLOC_NO_ACCOUNT = 2,
75 };
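/*
 * Editor's sketch (not part of this file): these values are passed as the
 * "reserve" argument of btrfs_update_reserved_bytes() declared below.  An
 * allocation whose ENOSPC accounting was already done elsewhere would use:
 *
 *	ret = btrfs_update_reserved_bytes(cache, num_bytes,
 *					  RESERVE_ALLOC_NO_ACCOUNT, delalloc);
 *
 * and the matching release later on:
 *
 *	btrfs_update_reserved_bytes(cache, num_bytes, RESERVE_FREE, delalloc);
 */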
76
77 static int update_block_group(struct btrfs_trans_handle *trans,
78                               struct btrfs_root *root, u64 bytenr,
79                               u64 num_bytes, int alloc);
80 static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
81                                 struct btrfs_root *root,
82                                 struct btrfs_delayed_ref_node *node, u64 parent,
83                                 u64 root_objectid, u64 owner_objectid,
84                                 u64 owner_offset, int refs_to_drop,
85                                 struct btrfs_delayed_extent_op *extra_op);
86 static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
87                                     struct extent_buffer *leaf,
88                                     struct btrfs_extent_item *ei);
89 static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
90                                       struct btrfs_root *root,
91                                       u64 parent, u64 root_objectid,
92                                       u64 flags, u64 owner, u64 offset,
93                                       struct btrfs_key *ins, int ref_mod);
94 static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
95                                      struct btrfs_root *root,
96                                      u64 parent, u64 root_objectid,
97                                      u64 flags, struct btrfs_disk_key *key,
98                                      int level, struct btrfs_key *ins);
99 static int do_chunk_alloc(struct btrfs_trans_handle *trans,
100                           struct btrfs_root *extent_root, u64 flags,
101                           int force);
102 static int find_next_key(struct btrfs_path *path, int level,
103                          struct btrfs_key *key);
104 static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
105                             int dump_block_groups);
106 static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
107                                        u64 num_bytes, int reserve,
108                                        int delalloc);
109 static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv,
110                                u64 num_bytes);
111 int btrfs_pin_extent(struct btrfs_root *root,
112                      u64 bytenr, u64 num_bytes, int reserved);
113
114 static noinline int
115 block_group_cache_done(struct btrfs_block_group_cache *cache)
116 {
117         smp_mb();
118         return cache->cached == BTRFS_CACHE_FINISHED ||
119                 cache->cached == BTRFS_CACHE_ERROR;
120 }
121
122 static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits)
123 {
124         return (cache->flags & bits) == bits;
125 }
126
127 void btrfs_get_block_group(struct btrfs_block_group_cache *cache)
128 {
129         atomic_inc(&cache->count);
130 }
131
132 void btrfs_put_block_group(struct btrfs_block_group_cache *cache)
133 {
134         if (atomic_dec_and_test(&cache->count)) {
135                 WARN_ON(cache->pinned > 0);
136                 WARN_ON(cache->reserved > 0);
137                 kfree(cache->free_space_ctl);
138                 kfree(cache);
139         }
140 }
141
142 /*
143  * this adds the block group to the fs_info rb tree for the block group
144  * cache
145  */
146 static int btrfs_add_block_group_cache(struct btrfs_fs_info *info,
147                                 struct btrfs_block_group_cache *block_group)
148 {
149         struct rb_node **p;
150         struct rb_node *parent = NULL;
151         struct btrfs_block_group_cache *cache;
152
153         spin_lock(&info->block_group_cache_lock);
154         p = &info->block_group_cache_tree.rb_node;
155
156         while (*p) {
157                 parent = *p;
158                 cache = rb_entry(parent, struct btrfs_block_group_cache,
159                                  cache_node);
160                 if (block_group->key.objectid < cache->key.objectid) {
161                         p = &(*p)->rb_left;
162                 } else if (block_group->key.objectid > cache->key.objectid) {
163                         p = &(*p)->rb_right;
164                 } else {
165                         spin_unlock(&info->block_group_cache_lock);
166                         return -EEXIST;
167                 }
168         }
169
170         rb_link_node(&block_group->cache_node, parent, p);
171         rb_insert_color(&block_group->cache_node,
172                         &info->block_group_cache_tree);
173
174         if (info->first_logical_byte > block_group->key.objectid)
175                 info->first_logical_byte = block_group->key.objectid;
176
177         spin_unlock(&info->block_group_cache_lock);
178
179         return 0;
180 }
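/*
 * Usage sketch (hypothetical caller, not from this file): -EEXIST from the
 * insert above means a block group with the same objectid is already cached,
 * which callers generally treat as a fatal inconsistency:
 *
 *	ret = btrfs_add_block_group_cache(root->fs_info, cache);
 *	if (ret) {
 *		btrfs_put_block_group(cache);
 *		return ret;
 *	}
 */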
181
182 /*
183  * This will return the block group at or after bytenr if contains is 0, else
184  * it will return the block group that contains the bytenr
185  */
186 static struct btrfs_block_group_cache *
187 block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr,
188                               int contains)
189 {
190         struct btrfs_block_group_cache *cache, *ret = NULL;
191         struct rb_node *n;
192         u64 end, start;
193
194         spin_lock(&info->block_group_cache_lock);
195         n = info->block_group_cache_tree.rb_node;
196
197         while (n) {
198                 cache = rb_entry(n, struct btrfs_block_group_cache,
199                                  cache_node);
200                 end = cache->key.objectid + cache->key.offset - 1;
201                 start = cache->key.objectid;
202
203                 if (bytenr < start) {
204                         if (!contains && (!ret || start < ret->key.objectid))
205                                 ret = cache;
206                         n = n->rb_left;
207                 } else if (bytenr > start) {
208                         if (contains && bytenr <= end) {
209                                 ret = cache;
210                                 break;
211                         }
212                         n = n->rb_right;
213                 } else {
214                         ret = cache;
215                         break;
216                 }
217         }
218         if (ret) {
219                 btrfs_get_block_group(ret);
220                 if (bytenr == 0 && info->first_logical_byte > ret->key.objectid)
221                         info->first_logical_byte = ret->key.objectid;
222         }
223         spin_unlock(&info->block_group_cache_lock);
224
225         return ret;
226 }
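/*
 * Editor's note on reference counting: the search above takes a reference on
 * the returned group via btrfs_get_block_group(), so callers of the lookup
 * helpers built on top of it must drop that reference when done, e.g.:
 *
 *	cache = btrfs_lookup_block_group(fs_info, bytenr);
 *	if (!cache)
 *		return -ENOENT;
 *	... use cache ...
 *	btrfs_put_block_group(cache);
 *
 * (illustrative only; the error value chosen by real callers varies.)
 */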
227
228 static int add_excluded_extent(struct btrfs_root *root,
229                                u64 start, u64 num_bytes)
230 {
231         u64 end = start + num_bytes - 1;
232         set_extent_bits(&root->fs_info->freed_extents[0],
233                         start, end, EXTENT_UPTODATE, GFP_NOFS);
234         set_extent_bits(&root->fs_info->freed_extents[1],
235                         start, end, EXTENT_UPTODATE, GFP_NOFS);
236         return 0;
237 }
238
239 static void free_excluded_extents(struct btrfs_root *root,
240                                   struct btrfs_block_group_cache *cache)
241 {
242         u64 start, end;
243
244         start = cache->key.objectid;
245         end = start + cache->key.offset - 1;
246
247         clear_extent_bits(&root->fs_info->freed_extents[0],
248                           start, end, EXTENT_UPTODATE, GFP_NOFS);
249         clear_extent_bits(&root->fs_info->freed_extents[1],
250                           start, end, EXTENT_UPTODATE, GFP_NOFS);
251 }
252
253 static int exclude_super_stripes(struct btrfs_root *root,
254                                  struct btrfs_block_group_cache *cache)
255 {
256         u64 bytenr;
257         u64 *logical;
258         int stripe_len;
259         int i, nr, ret;
260
261         if (cache->key.objectid < BTRFS_SUPER_INFO_OFFSET) {
262                 stripe_len = BTRFS_SUPER_INFO_OFFSET - cache->key.objectid;
263                 cache->bytes_super += stripe_len;
264                 ret = add_excluded_extent(root, cache->key.objectid,
265                                           stripe_len);
266                 if (ret)
267                         return ret;
268         }
269
270         for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
271                 bytenr = btrfs_sb_offset(i);
272                 ret = btrfs_rmap_block(&root->fs_info->mapping_tree,
273                                        cache->key.objectid, bytenr,
274                                        0, &logical, &nr, &stripe_len);
275                 if (ret)
276                         return ret;
277
278                 while (nr--) {
279                         u64 start, len;
280
281                         if (logical[nr] > cache->key.objectid +
282                             cache->key.offset)
283                                 continue;
284
285                         if (logical[nr] + stripe_len <= cache->key.objectid)
286                                 continue;
287
288                         start = logical[nr];
289                         if (start < cache->key.objectid) {
290                                 start = cache->key.objectid;
291                                 len = (logical[nr] + stripe_len) - start;
292                         } else {
293                                 len = min_t(u64, stripe_len,
294                                             cache->key.objectid +
295                                             cache->key.offset - start);
296                         }
297
298                         cache->bytes_super += len;
299                         ret = add_excluded_extent(root, start, len);
300                         if (ret) {
301                                 kfree(logical);
302                                 return ret;
303                         }
304                 }
305
306                 kfree(logical);
307         }
308         return 0;
309 }
310
311 static struct btrfs_caching_control *
312 get_caching_control(struct btrfs_block_group_cache *cache)
313 {
314         struct btrfs_caching_control *ctl;
315
316         spin_lock(&cache->lock);
317         if (!cache->caching_ctl) {
318                 spin_unlock(&cache->lock);
319                 return NULL;
320         }
321
322         ctl = cache->caching_ctl;
323         atomic_inc(&ctl->count);
324         spin_unlock(&cache->lock);
325         return ctl;
326 }
327
328 static void put_caching_control(struct btrfs_caching_control *ctl)
329 {
330         if (atomic_dec_and_test(&ctl->count))
331                 kfree(ctl);
332 }
333
334 #ifdef CONFIG_BTRFS_DEBUG
335 static void fragment_free_space(struct btrfs_root *root,
336                                 struct btrfs_block_group_cache *block_group)
337 {
338         u64 start = block_group->key.objectid;
339         u64 len = block_group->key.offset;
340         u64 chunk = block_group->flags & BTRFS_BLOCK_GROUP_METADATA ?
341                 root->nodesize : root->sectorsize;
342         u64 step = chunk << 1;
343
344         while (len > chunk) {
345                 btrfs_remove_free_space(block_group, start, chunk);
346                 start += step;
347                 if (len < step)
348                         len = 0;
349                 else
350                         len -= step;
351         }
352 }
353 #endif
354
355 /*
356  * this is only called by cache_block_group; since we could have freed extents,
357  * we need to check the pinned_extents for any extents that can't be used yet
358  * because their free space will be released as soon as the transaction commits.
359  */
360 static u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
361                               struct btrfs_fs_info *info, u64 start, u64 end)
362 {
363         u64 extent_start, extent_end, size, total_added = 0;
364         int ret;
365
366         while (start < end) {
367                 ret = find_first_extent_bit(info->pinned_extents, start,
368                                             &extent_start, &extent_end,
369                                             EXTENT_DIRTY | EXTENT_UPTODATE,
370                                             NULL);
371                 if (ret)
372                         break;
373
374                 if (extent_start <= start) {
375                         start = extent_end + 1;
376                 } else if (extent_start > start && extent_start < end) {
377                         size = extent_start - start;
378                         total_added += size;
379                         ret = btrfs_add_free_space(block_group, start,
380                                                    size);
381                         BUG_ON(ret); /* -ENOMEM or logic error */
382                         start = extent_end + 1;
383                 } else {
384                         break;
385                 }
386         }
387
388         if (start < end) {
389                 size = end - start;
390                 total_added += size;
391                 ret = btrfs_add_free_space(block_group, start, size);
392                 BUG_ON(ret); /* -ENOMEM or logic error */
393         }
394
395         return total_added;
396 }
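/*
 * Worked example (editor's illustration): for a block group spanning
 * [1G, 1G + 256M) with one pinned extent at [1G + 64M, 1G + 65M), the loop
 * above adds [1G, 1G + 64M) as free space, skips the pinned megabyte, and the
 * tail after the loop adds [1G + 65M, 1G + 256M).  The return value is the
 * total number of bytes handed to the free space cache (255M here).
 */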
397
398 static noinline void caching_thread(struct btrfs_work *work)
399 {
400         struct btrfs_block_group_cache *block_group;
401         struct btrfs_fs_info *fs_info;
402         struct btrfs_caching_control *caching_ctl;
403         struct btrfs_root *extent_root;
404         struct btrfs_path *path;
405         struct extent_buffer *leaf;
406         struct btrfs_key key;
407         u64 total_found = 0;
408         u64 last = 0;
409         u32 nritems;
410         int ret = -ENOMEM;
411         bool wakeup = true;
412
413         caching_ctl = container_of(work, struct btrfs_caching_control, work);
414         block_group = caching_ctl->block_group;
415         fs_info = block_group->fs_info;
416         extent_root = fs_info->extent_root;
417
418         path = btrfs_alloc_path();
419         if (!path)
420                 goto out;
421
422         last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);
423
424 #ifdef CONFIG_BTRFS_DEBUG
425         /*
426          * If we're fragmenting we don't want to make anybody think we can
427          * allocate from this block group until we've had a chance to fragment
428          * the free space.
429          */
430         if (btrfs_should_fragment_free_space(extent_root, block_group))
431                 wakeup = false;
432 #endif
433         /*
434          * We don't want to deadlock with somebody trying to allocate a new
435          * extent for the extent root while also trying to search the extent
436          * root to add free space.  So we skip locking and search the commit
437  * root, since it's read-only.
438          */
439         path->skip_locking = 1;
440         path->search_commit_root = 1;
441         path->reada = 1;
442
443         key.objectid = last;
444         key.offset = 0;
445         key.type = BTRFS_EXTENT_ITEM_KEY;
446 again:
447         mutex_lock(&caching_ctl->mutex);
448         /* need to make sure the commit_root doesn't disappear */
449         down_read(&fs_info->commit_root_sem);
450
451 next:
452         ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
453         if (ret < 0)
454                 goto err;
455
456         leaf = path->nodes[0];
457         nritems = btrfs_header_nritems(leaf);
458
459         while (1) {
460                 if (btrfs_fs_closing(fs_info) > 1) {
461                         last = (u64)-1;
462                         break;
463                 }
464
465                 if (path->slots[0] < nritems) {
466                         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
467                 } else {
468                         ret = find_next_key(path, 0, &key);
469                         if (ret)
470                                 break;
471
472                         if (need_resched() ||
473                             rwsem_is_contended(&fs_info->commit_root_sem)) {
474                                 if (wakeup)
475                                         caching_ctl->progress = last;
476                                 btrfs_release_path(path);
477                                 up_read(&fs_info->commit_root_sem);
478                                 mutex_unlock(&caching_ctl->mutex);
479                                 cond_resched();
480                                 goto again;
481                         }
482
483                         ret = btrfs_next_leaf(extent_root, path);
484                         if (ret < 0)
485                                 goto err;
486                         if (ret)
487                                 break;
488                         leaf = path->nodes[0];
489                         nritems = btrfs_header_nritems(leaf);
490                         continue;
491                 }
492
493                 if (key.objectid < last) {
494                         key.objectid = last;
495                         key.offset = 0;
496                         key.type = BTRFS_EXTENT_ITEM_KEY;
497
498                         if (wakeup)
499                                 caching_ctl->progress = last;
500                         btrfs_release_path(path);
501                         goto next;
502                 }
503
504                 if (key.objectid < block_group->key.objectid) {
505                         path->slots[0]++;
506                         continue;
507                 }
508
509                 if (key.objectid >= block_group->key.objectid +
510                     block_group->key.offset)
511                         break;
512
513                 if (key.type == BTRFS_EXTENT_ITEM_KEY ||
514                     key.type == BTRFS_METADATA_ITEM_KEY) {
515                         total_found += add_new_free_space(block_group,
516                                                           fs_info, last,
517                                                           key.objectid);
518                         if (key.type == BTRFS_METADATA_ITEM_KEY)
519                                 last = key.objectid +
520                                         fs_info->tree_root->nodesize;
521                         else
522                                 last = key.objectid + key.offset;
523
524                         if (total_found > (1024 * 1024 * 2)) {
525                                 total_found = 0;
526                                 if (wakeup)
527                                         wake_up(&caching_ctl->wait);
528                         }
529                 }
530                 path->slots[0]++;
531         }
532         ret = 0;
533
534         total_found += add_new_free_space(block_group, fs_info, last,
535                                           block_group->key.objectid +
536                                           block_group->key.offset);
537         spin_lock(&block_group->lock);
538         block_group->caching_ctl = NULL;
539         block_group->cached = BTRFS_CACHE_FINISHED;
540         spin_unlock(&block_group->lock);
541
542 #ifdef CONFIG_BTRFS_DEBUG
543         if (btrfs_should_fragment_free_space(extent_root, block_group)) {
544                 u64 bytes_used;
545
546                 spin_lock(&block_group->space_info->lock);
547                 spin_lock(&block_group->lock);
548                 bytes_used = block_group->key.offset -
549                         btrfs_block_group_used(&block_group->item);
550                 block_group->space_info->bytes_used += bytes_used >> 1;
551                 spin_unlock(&block_group->lock);
552                 spin_unlock(&block_group->space_info->lock);
553                 fragment_free_space(extent_root, block_group);
554         }
555 #endif
556
557         caching_ctl->progress = (u64)-1;
558 err:
559         btrfs_free_path(path);
560         up_read(&fs_info->commit_root_sem);
561
562         free_excluded_extents(extent_root, block_group);
563
564         mutex_unlock(&caching_ctl->mutex);
565 out:
566         if (ret) {
567                 spin_lock(&block_group->lock);
568                 block_group->caching_ctl = NULL;
569                 block_group->cached = BTRFS_CACHE_ERROR;
570                 spin_unlock(&block_group->lock);
571         }
572         wake_up(&caching_ctl->wait);
573
574         put_caching_control(caching_ctl);
575         btrfs_put_block_group(block_group);
576 }
577
578 static int cache_block_group(struct btrfs_block_group_cache *cache,
579                              int load_cache_only)
580 {
581         DEFINE_WAIT(wait);
582         struct btrfs_fs_info *fs_info = cache->fs_info;
583         struct btrfs_caching_control *caching_ctl;
584         int ret = 0;
585
586         caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS);
587         if (!caching_ctl)
588                 return -ENOMEM;
589
590         INIT_LIST_HEAD(&caching_ctl->list);
591         mutex_init(&caching_ctl->mutex);
592         init_waitqueue_head(&caching_ctl->wait);
593         caching_ctl->block_group = cache;
594         caching_ctl->progress = cache->key.objectid;
595         atomic_set(&caching_ctl->count, 1);
596         btrfs_init_work(&caching_ctl->work, btrfs_cache_helper,
597                         caching_thread, NULL, NULL);
598
599         spin_lock(&cache->lock);
600         /*
601          * This should be a rare occasion, but this could happen I think in the
602          * case where one thread starts to load the space cache info, and then
603          * some other thread starts a transaction commit which tries to do an
604          * allocation while the other thread is still loading the space cache
605          * info.  The previous loop should have kept us from choosing this block
606          * group, but if we've moved to the state where we will wait on caching
607          * block groups we need to first check if we're doing a fast load here,
608          * so we can wait for it to finish, otherwise we could end up allocating
609          * from a block group whose cache gets evicted for one reason or
610          * another.
611          */
612         while (cache->cached == BTRFS_CACHE_FAST) {
613                 struct btrfs_caching_control *ctl;
614
615                 ctl = cache->caching_ctl;
616                 atomic_inc(&ctl->count);
617                 prepare_to_wait(&ctl->wait, &wait, TASK_UNINTERRUPTIBLE);
618                 spin_unlock(&cache->lock);
619
620                 schedule();
621
622                 finish_wait(&ctl->wait, &wait);
623                 put_caching_control(ctl);
624                 spin_lock(&cache->lock);
625         }
626
627         if (cache->cached != BTRFS_CACHE_NO) {
628                 spin_unlock(&cache->lock);
629                 kfree(caching_ctl);
630                 return 0;
631         }
632         WARN_ON(cache->caching_ctl);
633         cache->caching_ctl = caching_ctl;
634         cache->cached = BTRFS_CACHE_FAST;
635         spin_unlock(&cache->lock);
636
637         if (fs_info->mount_opt & BTRFS_MOUNT_SPACE_CACHE) {
638                 mutex_lock(&caching_ctl->mutex);
639                 ret = load_free_space_cache(fs_info, cache);
640
641                 spin_lock(&cache->lock);
642                 if (ret == 1) {
643                         cache->caching_ctl = NULL;
644                         cache->cached = BTRFS_CACHE_FINISHED;
645                         cache->last_byte_to_unpin = (u64)-1;
646                         caching_ctl->progress = (u64)-1;
647                 } else {
648                         if (load_cache_only) {
649                                 cache->caching_ctl = NULL;
650                                 cache->cached = BTRFS_CACHE_NO;
651                         } else {
652                                 cache->cached = BTRFS_CACHE_STARTED;
653                                 cache->has_caching_ctl = 1;
654                         }
655                 }
656                 spin_unlock(&cache->lock);
657 #ifdef CONFIG_BTRFS_DEBUG
658                 if (ret == 1 &&
659                     btrfs_should_fragment_free_space(fs_info->extent_root,
660                                                      cache)) {
661                         u64 bytes_used;
662
663                         spin_lock(&cache->space_info->lock);
664                         spin_lock(&cache->lock);
665                         bytes_used = cache->key.offset -
666                                 btrfs_block_group_used(&cache->item);
667                         cache->space_info->bytes_used += bytes_used >> 1;
668                         spin_unlock(&cache->lock);
669                         spin_unlock(&cache->space_info->lock);
670                         fragment_free_space(fs_info->extent_root, cache);
671                 }
672 #endif
673                 mutex_unlock(&caching_ctl->mutex);
674
675                 wake_up(&caching_ctl->wait);
676                 if (ret == 1) {
677                         put_caching_control(caching_ctl);
678                         free_excluded_extents(fs_info->extent_root, cache);
679                         return 0;
680                 }
681         } else {
682                 /*
683                  * We are not going to do the fast caching, set cached to the
684                  * appropriate value and wake up any waiters.
685                  */
686                 spin_lock(&cache->lock);
687                 if (load_cache_only) {
688                         cache->caching_ctl = NULL;
689                         cache->cached = BTRFS_CACHE_NO;
690                 } else {
691                         cache->cached = BTRFS_CACHE_STARTED;
692                         cache->has_caching_ctl = 1;
693                 }
694                 spin_unlock(&cache->lock);
695                 wake_up(&caching_ctl->wait);
696         }
697
698         if (load_cache_only) {
699                 put_caching_control(caching_ctl);
700                 return 0;
701         }
702
703         down_write(&fs_info->commit_root_sem);
704         atomic_inc(&caching_ctl->count);
705         list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups);
706         up_write(&fs_info->commit_root_sem);
707
708         btrfs_get_block_group(cache);
709
710         btrfs_queue_work(fs_info->caching_workers, &caching_ctl->work);
711
712         return ret;
713 }
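/*
 * Usage note (editor's sketch): load_cache_only == 1 only attempts the fast
 * on-disk free-space-cache load and never queues caching_thread(); callers
 * that merely need to pin or exclude a range use that mode:
 *
 *	ret = cache_block_group(block_group, 1);
 *
 * The allocator passes 0 so that, when the fast load fails, a full extent
 * tree scan is started in the background and progress can be waited on via
 * caching_ctl->wait.
 */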
714
715 /*
716  * return the block group that starts at or after bytenr
717  */
718 static struct btrfs_block_group_cache *
719 btrfs_lookup_first_block_group(struct btrfs_fs_info *info, u64 bytenr)
720 {
721         struct btrfs_block_group_cache *cache;
722
723         cache = block_group_cache_tree_search(info, bytenr, 0);
724
725         return cache;
726 }
727
728 /*
729  * return the block group that contains the given bytenr
730  */
731 struct btrfs_block_group_cache *btrfs_lookup_block_group(
732                                                  struct btrfs_fs_info *info,
733                                                  u64 bytenr)
734 {
735         struct btrfs_block_group_cache *cache;
736
737         cache = block_group_cache_tree_search(info, bytenr, 1);
738
739         return cache;
740 }
741
742 static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info,
743                                                   u64 flags)
744 {
745         struct list_head *head = &info->space_info;
746         struct btrfs_space_info *found;
747
748         flags &= BTRFS_BLOCK_GROUP_TYPE_MASK;
749
750         rcu_read_lock();
751         list_for_each_entry_rcu(found, head, list) {
752                 if (found->flags & flags) {
753                         rcu_read_unlock();
754                         return found;
755                 }
756         }
757         rcu_read_unlock();
758         return NULL;
759 }
760
761 /*
762  * after adding space to the filesystem, we need to clear the full flags
763  * on all the space infos.
764  */
765 void btrfs_clear_space_info_full(struct btrfs_fs_info *info)
766 {
767         struct list_head *head = &info->space_info;
768         struct btrfs_space_info *found;
769
770         rcu_read_lock();
771         list_for_each_entry_rcu(found, head, list)
772                 found->full = 0;
773         rcu_read_unlock();
774 }
775
776 /* simple helper to search for an existing data extent at a given offset */
777 int btrfs_lookup_data_extent(struct btrfs_root *root, u64 start, u64 len)
778 {
779         int ret;
780         struct btrfs_key key;
781         struct btrfs_path *path;
782
783         path = btrfs_alloc_path();
784         if (!path)
785                 return -ENOMEM;
786
787         key.objectid = start;
788         key.offset = len;
789         key.type = BTRFS_EXTENT_ITEM_KEY;
790         ret = btrfs_search_slot(NULL, root->fs_info->extent_root, &key, path,
791                                 0, 0);
792         btrfs_free_path(path);
793         return ret;
794 }
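/*
 * Return convention (editor's note): like btrfs_search_slot(), this helper
 * returns 0 when an EXTENT_ITEM with the exact (start, len) key exists,
 * a positive value when it does not, and a negative errno on failure:
 *
 *	ret = btrfs_lookup_data_extent(root, bytenr, num_bytes);
 *	if (ret < 0)
 *		return ret;
 *	if (ret > 0)
 *		... no extent item at (bytenr, num_bytes) ...
 */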
795
796 /*
797  * helper function to lookup reference count and flags of a tree block.
798  *
799  * the head node for a delayed ref is used to store the sum of all the
800  * reference count modifications queued up in the rbtree. The head
801  * node may also store the extent flags to set. This way you can check
802  * to see what the reference count and extent flags would be once all of
803  * the delayed refs are processed.
804  */
805 int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
806                              struct btrfs_root *root, u64 bytenr,
807                              u64 offset, int metadata, u64 *refs, u64 *flags)
808 {
809         struct btrfs_delayed_ref_head *head;
810         struct btrfs_delayed_ref_root *delayed_refs;
811         struct btrfs_path *path;
812         struct btrfs_extent_item *ei;
813         struct extent_buffer *leaf;
814         struct btrfs_key key;
815         u32 item_size;
816         u64 num_refs;
817         u64 extent_flags;
818         int ret;
819
820         /*
821          * If we don't have skinny metadata, don't bother doing anything
822          * different
823          */
824         if (metadata && !btrfs_fs_incompat(root->fs_info, SKINNY_METADATA)) {
825                 offset = root->nodesize;
826                 metadata = 0;
827         }
828
829         path = btrfs_alloc_path();
830         if (!path)
831                 return -ENOMEM;
832
833         if (!trans) {
834                 path->skip_locking = 1;
835                 path->search_commit_root = 1;
836         }
837
838 search_again:
839         key.objectid = bytenr;
840         key.offset = offset;
841         if (metadata)
842                 key.type = BTRFS_METADATA_ITEM_KEY;
843         else
844                 key.type = BTRFS_EXTENT_ITEM_KEY;
845
846         ret = btrfs_search_slot(trans, root->fs_info->extent_root,
847                                 &key, path, 0, 0);
848         if (ret < 0)
849                 goto out_free;
850
851         if (ret > 0 && metadata && key.type == BTRFS_METADATA_ITEM_KEY) {
852                 if (path->slots[0]) {
853                         path->slots[0]--;
854                         btrfs_item_key_to_cpu(path->nodes[0], &key,
855                                               path->slots[0]);
856                         if (key.objectid == bytenr &&
857                             key.type == BTRFS_EXTENT_ITEM_KEY &&
858                             key.offset == root->nodesize)
859                                 ret = 0;
860                 }
861         }
862
863         if (ret == 0) {
864                 leaf = path->nodes[0];
865                 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
866                 if (item_size >= sizeof(*ei)) {
867                         ei = btrfs_item_ptr(leaf, path->slots[0],
868                                             struct btrfs_extent_item);
869                         num_refs = btrfs_extent_refs(leaf, ei);
870                         extent_flags = btrfs_extent_flags(leaf, ei);
871                 } else {
872 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
873                         struct btrfs_extent_item_v0 *ei0;
874                         BUG_ON(item_size != sizeof(*ei0));
875                         ei0 = btrfs_item_ptr(leaf, path->slots[0],
876                                              struct btrfs_extent_item_v0);
877                         num_refs = btrfs_extent_refs_v0(leaf, ei0);
878                         /* FIXME: this isn't correct for data */
879                         extent_flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
880 #else
881                         BUG();
882 #endif
883                 }
884                 BUG_ON(num_refs == 0);
885         } else {
886                 num_refs = 0;
887                 extent_flags = 0;
888                 ret = 0;
889         }
890
891         if (!trans)
892                 goto out;
893
894         delayed_refs = &trans->transaction->delayed_refs;
895         spin_lock(&delayed_refs->lock);
896         head = btrfs_find_delayed_ref_head(trans, bytenr);
897         if (head) {
898                 if (!mutex_trylock(&head->mutex)) {
899                         atomic_inc(&head->node.refs);
900                         spin_unlock(&delayed_refs->lock);
901
902                         btrfs_release_path(path);
903
904                         /*
905                          * Mutex was contended, block until it's released and try
906                          * again
907                          */
908                         mutex_lock(&head->mutex);
909                         mutex_unlock(&head->mutex);
910                         btrfs_put_delayed_ref(&head->node);
911                         goto search_again;
912                 }
913                 spin_lock(&head->lock);
914                 if (head->extent_op && head->extent_op->update_flags)
915                         extent_flags |= head->extent_op->flags_to_set;
916                 else
917                         BUG_ON(num_refs == 0);
918
919                 num_refs += head->node.ref_mod;
920                 spin_unlock(&head->lock);
921                 mutex_unlock(&head->mutex);
922         }
923         spin_unlock(&delayed_refs->lock);
924 out:
925         WARN_ON(num_refs == 0);
926         if (refs)
927                 *refs = num_refs;
928         if (flags)
929                 *flags = extent_flags;
930 out_free:
931         btrfs_free_path(path);
932         return ret;
933 }
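/*
 * Usage sketch (hypothetical caller, not from this file): deciding whether a
 * tree block is shared before COWing it might look like:
 *
 *	u64 refs, flags;
 *
 *	ret = btrfs_lookup_extent_info(trans, root, buf->start,
 *				       btrfs_header_level(buf), 1,
 *				       &refs, &flags);
 *	if (ret)
 *		return ret;
 *	if (refs > 1 || (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
 *		... shared block, references must be updated on COW ...
 *
 * With skinny metadata the "offset" argument is the block level; without it,
 * the function internally falls back to (bytenr, nodesize) as seen above.
 */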
934
935 /*
936  * Back reference rules.  Back refs have three main goals:
937  *
938  * 1) differentiate between all holders of references to an extent so that
939  *    when a reference is dropped we can make sure it was a valid reference
940  *    before freeing the extent.
941  *
942  * 2) Provide enough information to quickly find the holders of an extent
943  *    if we notice a given block is corrupted or bad.
944  *
945  * 3) Make it easy to migrate blocks for FS shrinking or storage pool
946  *    maintenance.  This is actually the same as #2, but with a slightly
947  *    different use case.
948  *
949  * There are two kinds of back refs. The implicit back refs is optimized
950  * for pointers in non-shared tree blocks. For a given pointer in a block,
951  * back refs of this kind provide information about the block's owner tree
952  * and the pointer's key. This information allows us to find the block by
953  * b-tree searching. The full back refs is for pointers in tree blocks not
954  * referenced by their owner trees. The location of tree block is recorded
955  * in the back refs. Actually the full back refs is generic, and can be
956  * used in all cases the implicit back refs is used. The major shortcoming
957  * of the full back refs is its overhead. Every time a tree block gets
958  * COWed, we have to update back refs entry for all pointers in it.
959  *
960  * For a newly allocated tree block, we use implicit back refs for
961  * pointers in it. This means most tree related operations only involve
962  * implicit back refs. For a tree block created in an old transaction, the
963  * only way to drop a reference to it is to COW it. So we can detect the
964  * event that tree block loses its owner tree's reference and do the
965  * back refs conversion.
966  *
967  * When a tree block is COW'd through a tree, there are four cases:
968  *
969  * The reference count of the block is one and the tree is the block's
970  * owner tree. Nothing to do in this case.
971  *
972  * The reference count of the block is one and the tree is not the
973  * block's owner tree. In this case, full back refs is used for pointers
974  * in the block. Remove these full back refs, add implicit back refs for
975  * every pointer in the new block.
976  *
977  * The reference count of the block is greater than one and the tree is
978  * the block's owner tree. In this case, implicit back refs is used for
979  * pointers in the block. Add full back refs for every pointer in the
980  * block, increase lower level extents' reference counts. The original
981  * implicit back refs are carried over to the new block.
982  *
983  * The reference count of the block is greater than one and the tree is
984  * not the block's owner tree. Add implicit back refs for every pointer in
985  * the new block, increase lower level extents' reference count.
986  *
987  * Back Reference Key composing:
988  *
989  * The key objectid corresponds to the first byte in the extent,
990  * The key type is used to differentiate between types of back refs.
991  * There are different meanings of the key offset for different types
992  * of back refs.
993  *
994  * File extents can be referenced by:
995  *
996  * - multiple snapshots, subvolumes, or different generations in one subvol
997  * - different files inside a single subvolume
998  * - different offsets inside a file (bookend extents in file.c)
999  *
1000  * The extent ref structure for the implicit back refs has fields for:
1001  *
1002  * - Objectid of the subvolume root
1003  * - objectid of the file holding the reference
1004  * - original offset in the file
1005  * - how many bookend extents
1006  *
1007  * The key offset for the implicit back refs is hash of the first
1008  * three fields.
1009  *
1010  * The extent ref structure for the full back refs has a field for:
1011  *
1012  * - number of pointers in the tree leaf
1013  *
1014  * The key offset for the full back refs is the first byte of
1015  * the tree leaf.
1016  *
1017  * When a file extent is allocated, the implicit back refs are used
1018  * and the fields are filled in:
1019  *
1020  *     (root_key.objectid, inode objectid, offset in file, 1)
1021  *
1022  * When a file extent is removed by file truncation, we find the
1023  * corresponding implicit back refs and check the following fields:
1024  *
1025  *     (btrfs_header_owner(leaf), inode objectid, offset in file)
1026  *
1027  * Btree extents can be referenced by:
1028  *
1029  * - Different subvolumes
1030  *
1031  * Both the implicit back refs and the full back refs for tree blocks
1032  * only consist of a key. The key offset for the implicit back refs is
1033  * objectid of block's owner tree. The key offset for the full back refs
1034  * is the first byte of parent block.
1035  *
1036  * When implicit back refs are used, information about the lowest key and
1037  * level of the tree block is required. This information is stored in
1038  * the tree block info structure.
1039  */
1040
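/*
 * Worked example (editor's sketch): a data extent referenced by subvolume
 * root 5, inode 257, file offset 0 gets an implicit back ref item whose key
 * is roughly:
 *
 *	key.objectid = extent bytenr;
 *	key.type     = BTRFS_EXTENT_DATA_REF_KEY;
 *	key.offset   = hash_extent_data_ref(5, 257, 0);
 *
 * whereas a full/shared back ref uses BTRFS_SHARED_DATA_REF_KEY with the
 * parent block's bytenr as the key offset.  Both forms are handled by
 * lookup_extent_data_ref() and insert_extent_data_ref() below.
 */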
1041 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1042 static int convert_extent_item_v0(struct btrfs_trans_handle *trans,
1043                                   struct btrfs_root *root,
1044                                   struct btrfs_path *path,
1045                                   u64 owner, u32 extra_size)
1046 {
1047         struct btrfs_extent_item *item;
1048         struct btrfs_extent_item_v0 *ei0;
1049         struct btrfs_extent_ref_v0 *ref0;
1050         struct btrfs_tree_block_info *bi;
1051         struct extent_buffer *leaf;
1052         struct btrfs_key key;
1053         struct btrfs_key found_key;
1054         u32 new_size = sizeof(*item);
1055         u64 refs;
1056         int ret;
1057
1058         leaf = path->nodes[0];
1059         BUG_ON(btrfs_item_size_nr(leaf, path->slots[0]) != sizeof(*ei0));
1060
1061         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1062         ei0 = btrfs_item_ptr(leaf, path->slots[0],
1063                              struct btrfs_extent_item_v0);
1064         refs = btrfs_extent_refs_v0(leaf, ei0);
1065
1066         if (owner == (u64)-1) {
1067                 while (1) {
1068                         if (path->slots[0] >= btrfs_header_nritems(leaf)) {
1069                                 ret = btrfs_next_leaf(root, path);
1070                                 if (ret < 0)
1071                                         return ret;
1072                                 BUG_ON(ret > 0); /* Corruption */
1073                                 leaf = path->nodes[0];
1074                         }
1075                         btrfs_item_key_to_cpu(leaf, &found_key,
1076                                               path->slots[0]);
1077                         BUG_ON(key.objectid != found_key.objectid);
1078                         if (found_key.type != BTRFS_EXTENT_REF_V0_KEY) {
1079                                 path->slots[0]++;
1080                                 continue;
1081                         }
1082                         ref0 = btrfs_item_ptr(leaf, path->slots[0],
1083                                               struct btrfs_extent_ref_v0);
1084                         owner = btrfs_ref_objectid_v0(leaf, ref0);
1085                         break;
1086                 }
1087         }
1088         btrfs_release_path(path);
1089
1090         if (owner < BTRFS_FIRST_FREE_OBJECTID)
1091                 new_size += sizeof(*bi);
1092
1093         new_size -= sizeof(*ei0);
1094         ret = btrfs_search_slot(trans, root, &key, path,
1095                                 new_size + extra_size, 1);
1096         if (ret < 0)
1097                 return ret;
1098         BUG_ON(ret); /* Corruption */
1099
1100         btrfs_extend_item(root, path, new_size);
1101
1102         leaf = path->nodes[0];
1103         item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1104         btrfs_set_extent_refs(leaf, item, refs);
1105         /* FIXME: get real generation */
1106         btrfs_set_extent_generation(leaf, item, 0);
1107         if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1108                 btrfs_set_extent_flags(leaf, item,
1109                                        BTRFS_EXTENT_FLAG_TREE_BLOCK |
1110                                        BTRFS_BLOCK_FLAG_FULL_BACKREF);
1111                 bi = (struct btrfs_tree_block_info *)(item + 1);
1112                 /* FIXME: get first key of the block */
1113                 memset_extent_buffer(leaf, 0, (unsigned long)bi, sizeof(*bi));
1114                 btrfs_set_tree_block_level(leaf, bi, (int)owner);
1115         } else {
1116                 btrfs_set_extent_flags(leaf, item, BTRFS_EXTENT_FLAG_DATA);
1117         }
1118         btrfs_mark_buffer_dirty(leaf);
1119         return 0;
1120 }
1121 #endif
1122
1123 static u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset)
1124 {
1125         u32 high_crc = ~(u32)0;
1126         u32 low_crc = ~(u32)0;
1127         __le64 lenum;
1128
1129         lenum = cpu_to_le64(root_objectid);
1130         high_crc = btrfs_crc32c(high_crc, &lenum, sizeof(lenum));
1131         lenum = cpu_to_le64(owner);
1132         low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum));
1133         lenum = cpu_to_le64(offset);
1134         low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum));
1135
1136         return ((u64)high_crc << 31) ^ (u64)low_crc;
1137 }
1138
1139 static u64 hash_extent_data_ref_item(struct extent_buffer *leaf,
1140                                      struct btrfs_extent_data_ref *ref)
1141 {
1142         return hash_extent_data_ref(btrfs_extent_data_ref_root(leaf, ref),
1143                                     btrfs_extent_data_ref_objectid(leaf, ref),
1144                                     btrfs_extent_data_ref_offset(leaf, ref));
1145 }
1146
1147 static int match_extent_data_ref(struct extent_buffer *leaf,
1148                                  struct btrfs_extent_data_ref *ref,
1149                                  u64 root_objectid, u64 owner, u64 offset)
1150 {
1151         if (btrfs_extent_data_ref_root(leaf, ref) != root_objectid ||
1152             btrfs_extent_data_ref_objectid(leaf, ref) != owner ||
1153             btrfs_extent_data_ref_offset(leaf, ref) != offset)
1154                 return 0;
1155         return 1;
1156 }
1157
1158 static noinline int lookup_extent_data_ref(struct btrfs_trans_handle *trans,
1159                                            struct btrfs_root *root,
1160                                            struct btrfs_path *path,
1161                                            u64 bytenr, u64 parent,
1162                                            u64 root_objectid,
1163                                            u64 owner, u64 offset)
1164 {
1165         struct btrfs_key key;
1166         struct btrfs_extent_data_ref *ref;
1167         struct extent_buffer *leaf;
1168         u32 nritems;
1169         int ret;
1170         int recow;
1171         int err = -ENOENT;
1172
1173         key.objectid = bytenr;
1174         if (parent) {
1175                 key.type = BTRFS_SHARED_DATA_REF_KEY;
1176                 key.offset = parent;
1177         } else {
1178                 key.type = BTRFS_EXTENT_DATA_REF_KEY;
1179                 key.offset = hash_extent_data_ref(root_objectid,
1180                                                   owner, offset);
1181         }
1182 again:
1183         recow = 0;
1184         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1185         if (ret < 0) {
1186                 err = ret;
1187                 goto fail;
1188         }
1189
1190         if (parent) {
1191                 if (!ret)
1192                         return 0;
1193 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1194                 key.type = BTRFS_EXTENT_REF_V0_KEY;
1195                 btrfs_release_path(path);
1196                 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1197                 if (ret < 0) {
1198                         err = ret;
1199                         goto fail;
1200                 }
1201                 if (!ret)
1202                         return 0;
1203 #endif
1204                 goto fail;
1205         }
1206
1207         leaf = path->nodes[0];
1208         nritems = btrfs_header_nritems(leaf);
1209         while (1) {
1210                 if (path->slots[0] >= nritems) {
1211                         ret = btrfs_next_leaf(root, path);
1212                         if (ret < 0)
1213                                 err = ret;
1214                         if (ret)
1215                                 goto fail;
1216
1217                         leaf = path->nodes[0];
1218                         nritems = btrfs_header_nritems(leaf);
1219                         recow = 1;
1220                 }
1221
1222                 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1223                 if (key.objectid != bytenr ||
1224                     key.type != BTRFS_EXTENT_DATA_REF_KEY)
1225                         goto fail;
1226
1227                 ref = btrfs_item_ptr(leaf, path->slots[0],
1228                                      struct btrfs_extent_data_ref);
1229
1230                 if (match_extent_data_ref(leaf, ref, root_objectid,
1231                                           owner, offset)) {
1232                         if (recow) {
1233                                 btrfs_release_path(path);
1234                                 goto again;
1235                         }
1236                         err = 0;
1237                         break;
1238                 }
1239                 path->slots[0]++;
1240         }
1241 fail:
1242         return err;
1243 }
1244
1245 static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans,
1246                                            struct btrfs_root *root,
1247                                            struct btrfs_path *path,
1248                                            u64 bytenr, u64 parent,
1249                                            u64 root_objectid, u64 owner,
1250                                            u64 offset, int refs_to_add)
1251 {
1252         struct btrfs_key key;
1253         struct extent_buffer *leaf;
1254         u32 size;
1255         u32 num_refs;
1256         int ret;
1257
1258         key.objectid = bytenr;
1259         if (parent) {
1260                 key.type = BTRFS_SHARED_DATA_REF_KEY;
1261                 key.offset = parent;
1262                 size = sizeof(struct btrfs_shared_data_ref);
1263         } else {
1264                 key.type = BTRFS_EXTENT_DATA_REF_KEY;
1265                 key.offset = hash_extent_data_ref(root_objectid,
1266                                                   owner, offset);
1267                 size = sizeof(struct btrfs_extent_data_ref);
1268         }
1269
1270         ret = btrfs_insert_empty_item(trans, root, path, &key, size);
1271         if (ret && ret != -EEXIST)
1272                 goto fail;
1273
1274         leaf = path->nodes[0];
1275         if (parent) {
1276                 struct btrfs_shared_data_ref *ref;
1277                 ref = btrfs_item_ptr(leaf, path->slots[0],
1278                                      struct btrfs_shared_data_ref);
1279                 if (ret == 0) {
1280                         btrfs_set_shared_data_ref_count(leaf, ref, refs_to_add);
1281                 } else {
1282                         num_refs = btrfs_shared_data_ref_count(leaf, ref);
1283                         num_refs += refs_to_add;
1284                         btrfs_set_shared_data_ref_count(leaf, ref, num_refs);
1285                 }
1286         } else {
1287                 struct btrfs_extent_data_ref *ref;
1288                 while (ret == -EEXIST) {
1289                         ref = btrfs_item_ptr(leaf, path->slots[0],
1290                                              struct btrfs_extent_data_ref);
1291                         if (match_extent_data_ref(leaf, ref, root_objectid,
1292                                                   owner, offset))
1293                                 break;
1294                         btrfs_release_path(path);
1295                         key.offset++;
1296                         ret = btrfs_insert_empty_item(trans, root, path, &key,
1297                                                       size);
1298                         if (ret && ret != -EEXIST)
1299                                 goto fail;
1300
1301                         leaf = path->nodes[0];
1302                 }
1303                 ref = btrfs_item_ptr(leaf, path->slots[0],
1304                                      struct btrfs_extent_data_ref);
1305                 if (ret == 0) {
1306                         btrfs_set_extent_data_ref_root(leaf, ref,
1307                                                        root_objectid);
1308                         btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
1309                         btrfs_set_extent_data_ref_offset(leaf, ref, offset);
1310                         btrfs_set_extent_data_ref_count(leaf, ref, refs_to_add);
1311                 } else {
1312                         num_refs = btrfs_extent_data_ref_count(leaf, ref);
1313                         num_refs += refs_to_add;
1314                         btrfs_set_extent_data_ref_count(leaf, ref, num_refs);
1315                 }
1316         }
1317         btrfs_mark_buffer_dirty(leaf);
1318         ret = 0;
1319 fail:
1320         btrfs_release_path(path);
1321         return ret;
1322 }
1323
1324 static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans,
1325                                            struct btrfs_root *root,
1326                                            struct btrfs_path *path,
1327                                            int refs_to_drop, int *last_ref)
1328 {
1329         struct btrfs_key key;
1330         struct btrfs_extent_data_ref *ref1 = NULL;
1331         struct btrfs_shared_data_ref *ref2 = NULL;
1332         struct extent_buffer *leaf;
1333         u32 num_refs = 0;
1334         int ret = 0;
1335
1336         leaf = path->nodes[0];
1337         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1338
1339         if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
1340                 ref1 = btrfs_item_ptr(leaf, path->slots[0],
1341                                       struct btrfs_extent_data_ref);
1342                 num_refs = btrfs_extent_data_ref_count(leaf, ref1);
1343         } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
1344                 ref2 = btrfs_item_ptr(leaf, path->slots[0],
1345                                       struct btrfs_shared_data_ref);
1346                 num_refs = btrfs_shared_data_ref_count(leaf, ref2);
1347 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1348         } else if (key.type == BTRFS_EXTENT_REF_V0_KEY) {
1349                 struct btrfs_extent_ref_v0 *ref0;
1350                 ref0 = btrfs_item_ptr(leaf, path->slots[0],
1351                                       struct btrfs_extent_ref_v0);
1352                 num_refs = btrfs_ref_count_v0(leaf, ref0);
1353 #endif
1354         } else {
1355                 BUG();
1356         }
1357
1358         BUG_ON(num_refs < refs_to_drop);
1359         num_refs -= refs_to_drop;
1360
1361         if (num_refs == 0) {
1362                 ret = btrfs_del_item(trans, root, path);
1363                 *last_ref = 1;
1364         } else {
1365                 if (key.type == BTRFS_EXTENT_DATA_REF_KEY)
1366                         btrfs_set_extent_data_ref_count(leaf, ref1, num_refs);
1367                 else if (key.type == BTRFS_SHARED_DATA_REF_KEY)
1368                         btrfs_set_shared_data_ref_count(leaf, ref2, num_refs);
1369 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1370                 else {
1371                         struct btrfs_extent_ref_v0 *ref0;
1372                         ref0 = btrfs_item_ptr(leaf, path->slots[0],
1373                                         struct btrfs_extent_ref_v0);
1374                         btrfs_set_ref_count_v0(leaf, ref0, num_refs);
1375                 }
1376 #endif
1377                 btrfs_mark_buffer_dirty(leaf);
1378         }
1379         return ret;
1380 }
1381
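/*
 * Return the reference count stored in a data back ref, either an
 * inline ref (iref != NULL) or a standalone EXTENT_DATA_REF /
 * SHARED_DATA_REF item at the current path position.
 */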
1382 static noinline u32 extent_data_ref_count(struct btrfs_path *path,
1383                                           struct btrfs_extent_inline_ref *iref)
1384 {
1385         struct btrfs_key key;
1386         struct extent_buffer *leaf;
1387         struct btrfs_extent_data_ref *ref1;
1388         struct btrfs_shared_data_ref *ref2;
1389         u32 num_refs = 0;
1390
1391         leaf = path->nodes[0];
1392         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1393         if (iref) {
1394                 if (btrfs_extent_inline_ref_type(leaf, iref) ==
1395                     BTRFS_EXTENT_DATA_REF_KEY) {
1396                         ref1 = (struct btrfs_extent_data_ref *)(&iref->offset);
1397                         num_refs = btrfs_extent_data_ref_count(leaf, ref1);
1398                 } else {
1399                         ref2 = (struct btrfs_shared_data_ref *)(iref + 1);
1400                         num_refs = btrfs_shared_data_ref_count(leaf, ref2);
1401                 }
1402         } else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
1403                 ref1 = btrfs_item_ptr(leaf, path->slots[0],
1404                                       struct btrfs_extent_data_ref);
1405                 num_refs = btrfs_extent_data_ref_count(leaf, ref1);
1406         } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
1407                 ref2 = btrfs_item_ptr(leaf, path->slots[0],
1408                                       struct btrfs_shared_data_ref);
1409                 num_refs = btrfs_shared_data_ref_count(leaf, ref2);
1410 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1411         } else if (key.type == BTRFS_EXTENT_REF_V0_KEY) {
1412                 struct btrfs_extent_ref_v0 *ref0;
1413                 ref0 = btrfs_item_ptr(leaf, path->slots[0],
1414                                       struct btrfs_extent_ref_v0);
1415                 num_refs = btrfs_ref_count_v0(leaf, ref0);
1416 #endif
1417         } else {
1418                 WARN_ON(1);
1419         }
1420         return num_refs;
1421 }
1422
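/*
 * Look up the full back ref item for a tree block: a SHARED_BLOCK_REF
 * keyed by parent when parent is set, otherwise a TREE_BLOCK_REF keyed
 * by root_objectid.  Returns 0 if the item was found, -ENOENT if not.
 */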
1423 static noinline int lookup_tree_block_ref(struct btrfs_trans_handle *trans,
1424                                           struct btrfs_root *root,
1425                                           struct btrfs_path *path,
1426                                           u64 bytenr, u64 parent,
1427                                           u64 root_objectid)
1428 {
1429         struct btrfs_key key;
1430         int ret;
1431
1432         key.objectid = bytenr;
1433         if (parent) {
1434                 key.type = BTRFS_SHARED_BLOCK_REF_KEY;
1435                 key.offset = parent;
1436         } else {
1437                 key.type = BTRFS_TREE_BLOCK_REF_KEY;
1438                 key.offset = root_objectid;
1439         }
1440
1441         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1442         if (ret > 0)
1443                 ret = -ENOENT;
1444 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1445         if (ret == -ENOENT && parent) {
1446                 btrfs_release_path(path);
1447                 key.type = BTRFS_EXTENT_REF_V0_KEY;
1448                 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1449                 if (ret > 0)
1450                         ret = -ENOENT;
1451         }
1452 #endif
1453         return ret;
1454 }
1455
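/*
 * Insert a full back ref item for a tree block.  The item body is
 * empty; the key alone (bytenr plus parent or root_objectid) carries
 * all the information, using the same layout as lookup_tree_block_ref()
 * above.
 */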
1456 static noinline int insert_tree_block_ref(struct btrfs_trans_handle *trans,
1457                                           struct btrfs_root *root,
1458                                           struct btrfs_path *path,
1459                                           u64 bytenr, u64 parent,
1460                                           u64 root_objectid)
1461 {
1462         struct btrfs_key key;
1463         int ret;
1464
1465         key.objectid = bytenr;
1466         if (parent) {
1467                 key.type = BTRFS_SHARED_BLOCK_REF_KEY;
1468                 key.offset = parent;
1469         } else {
1470                 key.type = BTRFS_TREE_BLOCK_REF_KEY;
1471                 key.offset = root_objectid;
1472         }
1473
1474         ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
1475         btrfs_release_path(path);
1476         return ret;
1477 }
1478
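/*
 * Map (parent, owner) to the matching back ref key type:
 *
 *   tree block (owner < BTRFS_FIRST_FREE_OBJECTID):
 *     parent != 0  ->  BTRFS_SHARED_BLOCK_REF_KEY
 *     parent == 0  ->  BTRFS_TREE_BLOCK_REF_KEY
 *
 *   data extent (owner >= BTRFS_FIRST_FREE_OBJECTID):
 *     parent != 0  ->  BTRFS_SHARED_DATA_REF_KEY
 *     parent == 0  ->  BTRFS_EXTENT_DATA_REF_KEY
 */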
1479 static inline int extent_ref_type(u64 parent, u64 owner)
1480 {
1481         int type;
1482         if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1483                 if (parent > 0)
1484                         type = BTRFS_SHARED_BLOCK_REF_KEY;
1485                 else
1486                         type = BTRFS_TREE_BLOCK_REF_KEY;
1487         } else {
1488                 if (parent > 0)
1489                         type = BTRFS_SHARED_DATA_REF_KEY;
1490                 else
1491                         type = BTRFS_EXTENT_DATA_REF_KEY;
1492         }
1493         return type;
1494 }
1495
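/*
 * Starting at the given level, walk up the path looking for a node with
 * a key after the current slot; copy that key into *key and return 0,
 * or return 1 if the path already points at the last key in the tree.
 */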
1496 static int find_next_key(struct btrfs_path *path, int level,
1497                          struct btrfs_key *key)
1499 {
1500         for (; level < BTRFS_MAX_LEVEL; level++) {
1501                 if (!path->nodes[level])
1502                         break;
1503                 if (path->slots[level] + 1 >=
1504                     btrfs_header_nritems(path->nodes[level]))
1505                         continue;
1506                 if (level == 0)
1507                         btrfs_item_key_to_cpu(path->nodes[level], key,
1508                                               path->slots[level] + 1);
1509                 else
1510                         btrfs_node_key_to_cpu(path->nodes[level], key,
1511                                               path->slots[level] + 1);
1512                 return 0;
1513         }
1514         return 1;
1515 }
1516
1517 /*
1518  * look for inline back ref. if back ref is found, *ref_ret is set
1519  * to the address of inline back ref, and 0 is returned.
1520  *
1521  * if back ref isn't found, *ref_ret is set to the address where it
1522  * should be inserted, and -ENOENT is returned.
1523  *
1524  * if insert is true and there are too many inline back refs, the path
1525  * points to the extent item, and -EAGAIN is returned.
1526  *
1527  * NOTE: inline back refs are ordered in the same way that back ref
1528  *       items in the tree are ordered.
1529  */
1530 static noinline_for_stack
1531 int lookup_inline_extent_backref(struct btrfs_trans_handle *trans,
1532                                  struct btrfs_root *root,
1533                                  struct btrfs_path *path,
1534                                  struct btrfs_extent_inline_ref **ref_ret,
1535                                  u64 bytenr, u64 num_bytes,
1536                                  u64 parent, u64 root_objectid,
1537                                  u64 owner, u64 offset, int insert)
1538 {
1539         struct btrfs_key key;
1540         struct extent_buffer *leaf;
1541         struct btrfs_extent_item *ei;
1542         struct btrfs_extent_inline_ref *iref;
1543         u64 flags;
1544         u64 item_size;
1545         unsigned long ptr;
1546         unsigned long end;
1547         int extra_size;
1548         int type;
1549         int want;
1550         int ret;
1551         int err = 0;
1552         bool skinny_metadata = btrfs_fs_incompat(root->fs_info,
1553                                                  SKINNY_METADATA);
1554
1555         key.objectid = bytenr;
1556         key.type = BTRFS_EXTENT_ITEM_KEY;
1557         key.offset = num_bytes;
1558
1559         want = extent_ref_type(parent, owner);
1560         if (insert) {
1561                 extra_size = btrfs_extent_inline_ref_size(want);
1562                 path->keep_locks = 1;
1563         } else
1564                 extra_size = -1;
1565
1566         /*
1567          * Owner is our parent level, so we can just add one to get the level
1568          * for the block we are interested in.
1569          */
1570         if (skinny_metadata && owner < BTRFS_FIRST_FREE_OBJECTID) {
1571                 key.type = BTRFS_METADATA_ITEM_KEY;
1572                 key.offset = owner;
1573         }
1574
1575 again:
1576         ret = btrfs_search_slot(trans, root, &key, path, extra_size, 1);
1577         if (ret < 0) {
1578                 err = ret;
1579                 goto out;
1580         }
1581
1582         /*
1583          * We may be a newly converted file system which still has the old fat
1584          * extent entries for metadata, so try and see if we have one of those.
1585          */
1586         if (ret > 0 && skinny_metadata) {
1587                 skinny_metadata = false;
1588                 if (path->slots[0]) {
1589                         path->slots[0]--;
1590                         btrfs_item_key_to_cpu(path->nodes[0], &key,
1591                                               path->slots[0]);
1592                         if (key.objectid == bytenr &&
1593                             key.type == BTRFS_EXTENT_ITEM_KEY &&
1594                             key.offset == num_bytes)
1595                                 ret = 0;
1596                 }
1597                 if (ret) {
1598                         key.objectid = bytenr;
1599                         key.type = BTRFS_EXTENT_ITEM_KEY;
1600                         key.offset = num_bytes;
1601                         btrfs_release_path(path);
1602                         goto again;
1603                 }
1604         }
1605
1606         if (ret && !insert) {
1607                 err = -ENOENT;
1608                 goto out;
1609         } else if (WARN_ON(ret)) {
1610                 err = -EIO;
1611                 goto out;
1612         }
1613
1614         leaf = path->nodes[0];
1615         item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1616 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1617         if (item_size < sizeof(*ei)) {
1618                 if (!insert) {
1619                         err = -ENOENT;
1620                         goto out;
1621                 }
1622                 ret = convert_extent_item_v0(trans, root, path, owner,
1623                                              extra_size);
1624                 if (ret < 0) {
1625                         err = ret;
1626                         goto out;
1627                 }
1628                 leaf = path->nodes[0];
1629                 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1630         }
1631 #endif
1632         BUG_ON(item_size < sizeof(*ei));
1633
1634         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1635         flags = btrfs_extent_flags(leaf, ei);
1636
1637         ptr = (unsigned long)(ei + 1);
1638         end = (unsigned long)ei + item_size;
1639
1640         if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK && !skinny_metadata) {
1641                 ptr += sizeof(struct btrfs_tree_block_info);
1642                 BUG_ON(ptr > end);
1643         }
1644
1645         err = -ENOENT;
1646         while (1) {
1647                 if (ptr >= end) {
1648                         WARN_ON(ptr > end);
1649                         break;
1650                 }
1651                 iref = (struct btrfs_extent_inline_ref *)ptr;
1652                 type = btrfs_extent_inline_ref_type(leaf, iref);
1653                 if (want < type)
1654                         break;
1655                 if (want > type) {
1656                         ptr += btrfs_extent_inline_ref_size(type);
1657                         continue;
1658                 }
1659
1660                 if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1661                         struct btrfs_extent_data_ref *dref;
1662                         dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1663                         if (match_extent_data_ref(leaf, dref, root_objectid,
1664                                                   owner, offset)) {
1665                                 err = 0;
1666                                 break;
1667                         }
1668                         if (hash_extent_data_ref_item(leaf, dref) <
1669                             hash_extent_data_ref(root_objectid, owner, offset))
1670                                 break;
1671                 } else {
1672                         u64 ref_offset;
1673                         ref_offset = btrfs_extent_inline_ref_offset(leaf, iref);
1674                         if (parent > 0) {
1675                                 if (parent == ref_offset) {
1676                                         err = 0;
1677                                         break;
1678                                 }
1679                                 if (ref_offset < parent)
1680                                         break;
1681                         } else {
1682                                 if (root_objectid == ref_offset) {
1683                                         err = 0;
1684                                         break;
1685                                 }
1686                                 if (ref_offset < root_objectid)
1687                                         break;
1688                         }
1689                 }
1690                 ptr += btrfs_extent_inline_ref_size(type);
1691         }
1692         if (err == -ENOENT && insert) {
1693                 if (item_size + extra_size >=
1694                     BTRFS_MAX_EXTENT_ITEM_SIZE(root)) {
1695                         err = -EAGAIN;
1696                         goto out;
1697                 }
1698                 /*
1699                  * To add a new inline back ref, we have to make sure
1700                  * there is no corresponding back ref item.
1701                  * For simplicity, we just do not add a new inline back
1702                  * ref if there is any kind of item for this block.
1703                  */
1704                 if (find_next_key(path, 0, &key) == 0 &&
1705                     key.objectid == bytenr &&
1706                     key.type < BTRFS_BLOCK_GROUP_ITEM_KEY) {
1707                         err = -EAGAIN;
1708                         goto out;
1709                 }
1710         }
1711         *ref_ret = (struct btrfs_extent_inline_ref *)ptr;
1712 out:
1713         if (insert) {
1714                 path->keep_locks = 0;
1715                 btrfs_unlock_up_safe(path, 1);
1716         }
1717         return err;
1718 }
1719
1720 /*
1721  * helper to add a new inline back ref
1722  */
1723 static noinline_for_stack
1724 void setup_inline_extent_backref(struct btrfs_root *root,
1725                                  struct btrfs_path *path,
1726                                  struct btrfs_extent_inline_ref *iref,
1727                                  u64 parent, u64 root_objectid,
1728                                  u64 owner, u64 offset, int refs_to_add,
1729                                  struct btrfs_delayed_extent_op *extent_op)
1730 {
1731         struct extent_buffer *leaf;
1732         struct btrfs_extent_item *ei;
1733         unsigned long ptr;
1734         unsigned long end;
1735         unsigned long item_offset;
1736         u64 refs;
1737         int size;
1738         int type;
1739
1740         leaf = path->nodes[0];
1741         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1742         item_offset = (unsigned long)iref - (unsigned long)ei;
1743
1744         type = extent_ref_type(parent, owner);
1745         size = btrfs_extent_inline_ref_size(type);
1746
1747         btrfs_extend_item(root, path, size);
1748
1749         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1750         refs = btrfs_extent_refs(leaf, ei);
1751         refs += refs_to_add;
1752         btrfs_set_extent_refs(leaf, ei, refs);
1753         if (extent_op)
1754                 __run_delayed_extent_op(extent_op, leaf, ei);
1755
1756         ptr = (unsigned long)ei + item_offset;
1757         end = (unsigned long)ei + btrfs_item_size_nr(leaf, path->slots[0]);
1758         if (ptr < end - size)
1759                 memmove_extent_buffer(leaf, ptr + size, ptr,
1760                                       end - size - ptr);
1761
1762         iref = (struct btrfs_extent_inline_ref *)ptr;
1763         btrfs_set_extent_inline_ref_type(leaf, iref, type);
1764         if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1765                 struct btrfs_extent_data_ref *dref;
1766                 dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1767                 btrfs_set_extent_data_ref_root(leaf, dref, root_objectid);
1768                 btrfs_set_extent_data_ref_objectid(leaf, dref, owner);
1769                 btrfs_set_extent_data_ref_offset(leaf, dref, offset);
1770                 btrfs_set_extent_data_ref_count(leaf, dref, refs_to_add);
1771         } else if (type == BTRFS_SHARED_DATA_REF_KEY) {
1772                 struct btrfs_shared_data_ref *sref;
1773                 sref = (struct btrfs_shared_data_ref *)(iref + 1);
1774                 btrfs_set_shared_data_ref_count(leaf, sref, refs_to_add);
1775                 btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
1776         } else if (type == BTRFS_SHARED_BLOCK_REF_KEY) {
1777                 btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
1778         } else {
1779                 btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);
1780         }
1781         btrfs_mark_buffer_dirty(leaf);
1782 }
1783
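/*
 * Find any back ref for the given extent.  The inline ref is tried
 * first; on -ENOENT the path is released, *ref_ret is cleared and the
 * matching full back ref item (tree block or data) is looked up
 * instead.
 */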
1784 static int lookup_extent_backref(struct btrfs_trans_handle *trans,
1785                                  struct btrfs_root *root,
1786                                  struct btrfs_path *path,
1787                                  struct btrfs_extent_inline_ref **ref_ret,
1788                                  u64 bytenr, u64 num_bytes, u64 parent,
1789                                  u64 root_objectid, u64 owner, u64 offset)
1790 {
1791         int ret;
1792
1793         ret = lookup_inline_extent_backref(trans, root, path, ref_ret,
1794                                            bytenr, num_bytes, parent,
1795                                            root_objectid, owner, offset, 0);
1796         if (ret != -ENOENT)
1797                 return ret;
1798
1799         btrfs_release_path(path);
1800         *ref_ret = NULL;
1801
1802         if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1803                 ret = lookup_tree_block_ref(trans, root, path, bytenr, parent,
1804                                             root_objectid);
1805         } else {
1806                 ret = lookup_extent_data_ref(trans, root, path, bytenr, parent,
1807                                              root_objectid, owner, offset);
1808         }
1809         return ret;
1810 }
1811
1812 /*
1813  * helper to update/remove inline back ref
1814  */
1815 static noinline_for_stack
1816 void update_inline_extent_backref(struct btrfs_root *root,
1817                                   struct btrfs_path *path,
1818                                   struct btrfs_extent_inline_ref *iref,
1819                                   int refs_to_mod,
1820                                   struct btrfs_delayed_extent_op *extent_op,
1821                                   int *last_ref)
1822 {
1823         struct extent_buffer *leaf;
1824         struct btrfs_extent_item *ei;
1825         struct btrfs_extent_data_ref *dref = NULL;
1826         struct btrfs_shared_data_ref *sref = NULL;
1827         unsigned long ptr;
1828         unsigned long end;
1829         u32 item_size;
1830         int size;
1831         int type;
1832         u64 refs;
1833
1834         leaf = path->nodes[0];
1835         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1836         refs = btrfs_extent_refs(leaf, ei);
1837         WARN_ON(refs_to_mod < 0 && refs + refs_to_mod <= 0);
1838         refs += refs_to_mod;
1839         btrfs_set_extent_refs(leaf, ei, refs);
1840         if (extent_op)
1841                 __run_delayed_extent_op(extent_op, leaf, ei);
1842
1843         type = btrfs_extent_inline_ref_type(leaf, iref);
1844
1845         if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1846                 dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1847                 refs = btrfs_extent_data_ref_count(leaf, dref);
1848         } else if (type == BTRFS_SHARED_DATA_REF_KEY) {
1849                 sref = (struct btrfs_shared_data_ref *)(iref + 1);
1850                 refs = btrfs_shared_data_ref_count(leaf, sref);
1851         } else {
1852                 refs = 1;
1853                 BUG_ON(refs_to_mod != -1);
1854         }
1855
1856         BUG_ON(refs_to_mod < 0 && refs < -refs_to_mod);
1857         refs += refs_to_mod;
1858
1859         if (refs > 0) {
1860                 if (type == BTRFS_EXTENT_DATA_REF_KEY)
1861                         btrfs_set_extent_data_ref_count(leaf, dref, refs);
1862                 else
1863                         btrfs_set_shared_data_ref_count(leaf, sref, refs);
1864         } else {
1865                 *last_ref = 1;
1866                 size = btrfs_extent_inline_ref_size(type);
1867                 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1868                 ptr = (unsigned long)iref;
1869                 end = (unsigned long)ei + item_size;
1870                 if (ptr + size < end)
1871                         memmove_extent_buffer(leaf, ptr, ptr + size,
1872                                               end - ptr - size);
1873                 item_size -= size;
1874                 btrfs_truncate_item(root, path, item_size, 1);
1875         }
1876         btrfs_mark_buffer_dirty(leaf);
1877 }
1878
1879 static noinline_for_stack
1880 int insert_inline_extent_backref(struct btrfs_trans_handle *trans,
1881                                  struct btrfs_root *root,
1882                                  struct btrfs_path *path,
1883                                  u64 bytenr, u64 num_bytes, u64 parent,
1884                                  u64 root_objectid, u64 owner,
1885                                  u64 offset, int refs_to_add,
1886                                  struct btrfs_delayed_extent_op *extent_op)
1887 {
1888         struct btrfs_extent_inline_ref *iref;
1889         int ret;
1890
1891         ret = lookup_inline_extent_backref(trans, root, path, &iref,
1892                                            bytenr, num_bytes, parent,
1893                                            root_objectid, owner, offset, 1);
1894         if (ret == 0) {
1895                 BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID);
1896                 update_inline_extent_backref(root, path, iref,
1897                                              refs_to_add, extent_op, NULL);
1898         } else if (ret == -ENOENT) {
1899                 setup_inline_extent_backref(root, path, iref, parent,
1900                                             root_objectid, owner, offset,
1901                                             refs_to_add, extent_op);
1902                 ret = 0;
1903         }
1904         return ret;
1905 }
1906
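/*
 * Insert a full (non-inline) back ref item, dispatching on the owner:
 * tree blocks always add exactly one reference, data extents may add
 * several at once.
 */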
1907 static int insert_extent_backref(struct btrfs_trans_handle *trans,
1908                                  struct btrfs_root *root,
1909                                  struct btrfs_path *path,
1910                                  u64 bytenr, u64 parent, u64 root_objectid,
1911                                  u64 owner, u64 offset, int refs_to_add)
1912 {
1913         int ret;
1914         if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1915                 BUG_ON(refs_to_add != 1);
1916                 ret = insert_tree_block_ref(trans, root, path, bytenr,
1917                                             parent, root_objectid);
1918         } else {
1919                 ret = insert_extent_data_ref(trans, root, path, bytenr,
1920                                              parent, root_objectid,
1921                                              owner, offset, refs_to_add);
1922         }
1923         return ret;
1924 }
1925
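/*
 * Remove refs_to_drop references from a back ref.  Inline refs are
 * updated or removed in place, full data ref items are shrunk or
 * deleted, and full tree block ref items are simply deleted.  *last_ref
 * is set once the last reference tracked by this back ref goes away.
 */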
1926 static int remove_extent_backref(struct btrfs_trans_handle *trans,
1927                                  struct btrfs_root *root,
1928                                  struct btrfs_path *path,
1929                                  struct btrfs_extent_inline_ref *iref,
1930                                  int refs_to_drop, int is_data, int *last_ref)
1931 {
1932         int ret = 0;
1933
1934         BUG_ON(!is_data && refs_to_drop != 1);
1935         if (iref) {
1936                 update_inline_extent_backref(root, path, iref,
1937                                              -refs_to_drop, NULL, last_ref);
1938         } else if (is_data) {
1939                 ret = remove_extent_data_ref(trans, root, path, refs_to_drop,
1940                                              last_ref);
1941         } else {
1942                 *last_ref = 1;
1943                 ret = btrfs_del_item(trans, root, path);
1944         }
1945         return ret;
1946 }
1947
1948 #define in_range(b, first, len)        ((b) >= (first) && (b) < (first) + (len))
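/*
 * Issue a discard for the range [start, start + len), trimmed to
 * 512-byte sector alignment and split so that the fixed superblock
 * mirror locations returned by btrfs_sb_offset() are never discarded.
 * The number of bytes actually discarded is returned in
 * *discarded_bytes.
 */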
1949 static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len,
1950                                u64 *discarded_bytes)
1951 {
1952         int j, ret = 0;
1953         u64 bytes_left, end;
1954         u64 aligned_start = ALIGN(start, 1 << 9);
1955
1956         if (WARN_ON(start != aligned_start)) {
1957                 len -= aligned_start - start;
1958                 len = round_down(len, 1 << 9);
1959                 start = aligned_start;
1960         }
1961
1962         *discarded_bytes = 0;
1963
1964         if (!len)
1965                 return 0;
1966
1967         end = start + len;
1968         bytes_left = len;
1969
1970         /* Skip any superblocks on this device. */
1971         for (j = 0; j < BTRFS_SUPER_MIRROR_MAX; j++) {
1972                 u64 sb_start = btrfs_sb_offset(j);
1973                 u64 sb_end = sb_start + BTRFS_SUPER_INFO_SIZE;
1974                 u64 size = sb_start - start;
1975
1976                 if (!in_range(sb_start, start, bytes_left) &&
1977                     !in_range(sb_end, start, bytes_left) &&
1978                     !in_range(start, sb_start, BTRFS_SUPER_INFO_SIZE))
1979                         continue;
1980
1981                 /*
1982                  * Superblock spans beginning of range.  Adjust start and
1983                  * try again.
1984                  */
1985                 if (sb_start <= start) {
1986                         start += sb_end - start;
1987                         if (start > end) {
1988                                 bytes_left = 0;
1989                                 break;
1990                         }
1991                         bytes_left = end - start;
1992                         continue;
1993                 }
1994
1995                 if (size) {
1996                         ret = blkdev_issue_discard(bdev, start >> 9, size >> 9,
1997                                                    GFP_NOFS, 0);
1998                         if (!ret)
1999                                 *discarded_bytes += size;
2000                         else if (ret != -EOPNOTSUPP)
2001                                 return ret;
2002                 }
2003
2004                 start = sb_end;
2005                 if (start > end) {
2006                         bytes_left = 0;
2007                         break;
2008                 }
2009                 bytes_left = end - start;
2010         }
2011
2012         if (bytes_left) {
2013                 ret = blkdev_issue_discard(bdev, start >> 9, bytes_left >> 9,
2014                                            GFP_NOFS, 0);
2015                 if (!ret)
2016                         *discarded_bytes += bytes_left;
2017         }
2018         return ret;
2019 }
2020
2021 int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
2022                          u64 num_bytes, u64 *actual_bytes)
2023 {
2024         int ret;
2025         u64 discarded_bytes = 0;
2026         struct btrfs_bio *bbio = NULL;
2027
2029         /* Tell the block device(s) that the sectors can be discarded */
2030         ret = btrfs_map_block(root->fs_info, REQ_DISCARD,
2031                               bytenr, &num_bytes, &bbio, 0);
2032         /* Error condition is -ENOMEM */
2033         if (!ret) {
2034                 struct btrfs_bio_stripe *stripe = bbio->stripes;
2035                 int i;
2036
2038                 for (i = 0; i < bbio->num_stripes; i++, stripe++) {
2039                         u64 bytes;
2040                         if (!stripe->dev->can_discard)
2041                                 continue;
2042
2043                         ret = btrfs_issue_discard(stripe->dev->bdev,
2044                                                   stripe->physical,
2045                                                   stripe->length,
2046                                                   &bytes);
2047                         if (!ret)
2048                                 discarded_bytes += bytes;
2049                         else if (ret != -EOPNOTSUPP)
2050                                 break; /* Logic errors or -ENOMEM, or -EIO but I don't know how that could happen JDM */
2051
2052                         /*
2053                          * Just in case we get back EOPNOTSUPP for some reason,
2054                          * ignore the return value so we don't screw up
2055                          * people calling discard_extent.
2056                          */
2057                         ret = 0;
2058                 }
2059                 btrfs_put_bbio(bbio);
2060         }
2061
2062         if (actual_bytes)
2063                 *actual_bytes = discarded_bytes;
2064
2066         if (ret == -EOPNOTSUPP)
2067                 ret = 0;
2068         return ret;
2069 }
2070
2071 /* Can return -ENOMEM */
2072 int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
2073                          struct btrfs_root *root,
2074                          u64 bytenr, u64 num_bytes, u64 parent,
2075                          u64 root_objectid, u64 owner, u64 offset)
2076 {
2077         int ret;
2078         struct btrfs_fs_info *fs_info = root->fs_info;
2079
2080         BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID &&
2081                root_objectid == BTRFS_TREE_LOG_OBJECTID);
2082
2083         if (owner < BTRFS_FIRST_FREE_OBJECTID) {
2084                 ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr,
2085                                         num_bytes,
2086                                         parent, root_objectid, (int)owner,
2087                                         BTRFS_ADD_DELAYED_REF, NULL);
2088         } else {
2089                 ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr,
2090                                         num_bytes, parent, root_objectid,
2091                                         owner, offset, 0,
2092                                         BTRFS_ADD_DELAYED_REF, NULL);
2093         }
2094         return ret;
2095 }
2096
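/*
 * Run a delayed ref that adds references to an existing extent: bump
 * the ref count in the extent item and add an inline back ref when
 * there is room, or fall back to a separate back ref item when the
 * extent item is full (-EAGAIN from insert_inline_extent_backref()).
 */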
2097 static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
2098                                   struct btrfs_root *root,
2099                                   struct btrfs_delayed_ref_node *node,
2100                                   u64 parent, u64 root_objectid,
2101                                   u64 owner, u64 offset, int refs_to_add,
2102                                   struct btrfs_delayed_extent_op *extent_op)
2103 {
2104         struct btrfs_fs_info *fs_info = root->fs_info;
2105         struct btrfs_path *path;
2106         struct extent_buffer *leaf;
2107         struct btrfs_extent_item *item;
2108         struct btrfs_key key;
2109         u64 bytenr = node->bytenr;
2110         u64 num_bytes = node->num_bytes;
2111         u64 refs;
2112         int ret;
2113
2114         path = btrfs_alloc_path();
2115         if (!path)
2116                 return -ENOMEM;
2117
2118         path->reada = 1;
2119         path->leave_spinning = 1;
2120         /* this will set up the path even if it fails to insert the back ref */
2121         ret = insert_inline_extent_backref(trans, fs_info->extent_root, path,
2122                                            bytenr, num_bytes, parent,
2123                                            root_objectid, owner, offset,
2124                                            refs_to_add, extent_op);
2125         if ((ret < 0 && ret != -EAGAIN) || !ret)
2126                 goto out;
2127
2128         /*
2129          * Ok we had -EAGAIN which means we didn't have space to insert an
2130          * inline extent ref, so just update the reference count and add a
2131          * normal backref.
2132          */
2133         leaf = path->nodes[0];
2134         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
2135         item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
2136         refs = btrfs_extent_refs(leaf, item);
2137         btrfs_set_extent_refs(leaf, item, refs + refs_to_add);
2138         if (extent_op)
2139                 __run_delayed_extent_op(extent_op, leaf, item);
2140
2141         btrfs_mark_buffer_dirty(leaf);
2142         btrfs_release_path(path);
2143
2144         path->reada = 1;
2145         path->leave_spinning = 1;
2146         /* now insert the actual backref */
2147         ret = insert_extent_backref(trans, root->fs_info->extent_root,
2148                                     path, bytenr, parent, root_objectid,
2149                                     owner, offset, refs_to_add);
2150         if (ret)
2151                 btrfs_abort_transaction(trans, root, ret);
2152 out:
2153         btrfs_free_path(path);
2154         return ret;
2155 }
2156
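/*
 * Process a single delayed data ref: insert the extent item for a newly
 * reserved extent, add a reference to an existing extent, or drop a
 * reference, depending on the delayed ref action.
 */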
2157 static int run_delayed_data_ref(struct btrfs_trans_handle *trans,
2158                                 struct btrfs_root *root,
2159                                 struct btrfs_delayed_ref_node *node,
2160                                 struct btrfs_delayed_extent_op *extent_op,
2161                                 int insert_reserved)
2162 {
2163         int ret = 0;
2164         struct btrfs_delayed_data_ref *ref;
2165         struct btrfs_key ins;
2166         u64 parent = 0;
2167         u64 ref_root = 0;
2168         u64 flags = 0;
2169
2170         ins.objectid = node->bytenr;
2171         ins.offset = node->num_bytes;
2172         ins.type = BTRFS_EXTENT_ITEM_KEY;
2173
2174         ref = btrfs_delayed_node_to_data_ref(node);
2175         trace_run_delayed_data_ref(node, ref, node->action);
2176
2177         if (node->type == BTRFS_SHARED_DATA_REF_KEY)
2178                 parent = ref->parent;
2179         ref_root = ref->root;
2180
2181         if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
2182                 if (extent_op)
2183                         flags |= extent_op->flags_to_set;
2184                 ret = alloc_reserved_file_extent(trans, root,
2185                                                  parent, ref_root, flags,
2186                                                  ref->objectid, ref->offset,
2187                                                  &ins, node->ref_mod);
2188         } else if (node->action == BTRFS_ADD_DELAYED_REF) {
2189                 ret = __btrfs_inc_extent_ref(trans, root, node, parent,
2190                                              ref_root, ref->objectid,
2191                                              ref->offset, node->ref_mod,
2192                                              extent_op);
2193         } else if (node->action == BTRFS_DROP_DELAYED_REF) {
2194                 ret = __btrfs_free_extent(trans, root, node, parent,
2195                                           ref_root, ref->objectid,
2196                                           ref->offset, node->ref_mod,
2197                                           extent_op);
2198         } else {
2199                 BUG();
2200         }
2201         return ret;
2202 }
2203
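/*
 * Apply a delayed extent op to an extent item in place: update the
 * extent flags and/or the first key stored in the tree block info.
 */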
2204 static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
2205                                     struct extent_buffer *leaf,
2206                                     struct btrfs_extent_item *ei)
2207 {
2208         u64 flags = btrfs_extent_flags(leaf, ei);
2209         if (extent_op->update_flags) {
2210                 flags |= extent_op->flags_to_set;
2211                 btrfs_set_extent_flags(leaf, ei, flags);
2212         }
2213
2214         if (extent_op->update_key) {
2215                 struct btrfs_tree_block_info *bi;
2216                 BUG_ON(!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK));
2217                 bi = (struct btrfs_tree_block_info *)(ei + 1);
2218                 btrfs_set_tree_block_key(leaf, bi, &extent_op->key);
2219         }
2220 }
2221
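/*
 * Locate the extent item for a delayed extent op, either the skinny
 * METADATA_ITEM or the old style EXTENT_ITEM, and apply the op with
 * __run_delayed_extent_op().
 */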
2222 static int run_delayed_extent_op(struct btrfs_trans_handle *trans,
2223                                  struct btrfs_root *root,
2224                                  struct btrfs_delayed_ref_node *node,
2225                                  struct btrfs_delayed_extent_op *extent_op)
2226 {
2227         struct btrfs_key key;
2228         struct btrfs_path *path;
2229         struct btrfs_extent_item *ei;
2230         struct extent_buffer *leaf;
2231         u32 item_size;
2232         int ret;
2233         int err = 0;
2234         int metadata = !extent_op->is_data;
2235
2236         if (trans->aborted)
2237                 return 0;
2238
2239         if (metadata && !btrfs_fs_incompat(root->fs_info, SKINNY_METADATA))
2240                 metadata = 0;
2241
2242         path = btrfs_alloc_path();
2243         if (!path)
2244                 return -ENOMEM;
2245
2246         key.objectid = node->bytenr;
2247
2248         if (metadata) {
2249                 key.type = BTRFS_METADATA_ITEM_KEY;
2250                 key.offset = extent_op->level;
2251         } else {
2252                 key.type = BTRFS_EXTENT_ITEM_KEY;
2253                 key.offset = node->num_bytes;
2254         }
2255
2256 again:
2257         path->reada = 1;
2258         path->leave_spinning = 1;
2259         ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key,
2260                                 path, 0, 1);
2261         if (ret < 0) {
2262                 err = ret;
2263                 goto out;
2264         }
2265         if (ret > 0) {
2266                 if (metadata) {
2267                         if (path->slots[0] > 0) {
2268                                 path->slots[0]--;
2269                                 btrfs_item_key_to_cpu(path->nodes[0], &key,
2270                                                       path->slots[0]);
2271                                 if (key.objectid == node->bytenr &&
2272                                     key.type == BTRFS_EXTENT_ITEM_KEY &&
2273                                     key.offset == node->num_bytes)
2274                                         ret = 0;
2275                         }
2276                         if (ret > 0) {
2277                                 btrfs_release_path(path);
2278                                 metadata = 0;
2279
2280                                 key.objectid = node->bytenr;
2281                                 key.offset = node->num_bytes;
2282                                 key.type = BTRFS_EXTENT_ITEM_KEY;
2283                                 goto again;
2284                         }
2285                 } else {
2286                         err = -EIO;
2287                         goto out;
2288                 }
2289         }
2290
2291         leaf = path->nodes[0];
2292         item_size = btrfs_item_size_nr(leaf, path->slots[0]);
2293 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
2294         if (item_size < sizeof(*ei)) {
2295                 ret = convert_extent_item_v0(trans, root->fs_info->extent_root,
2296                                              path, (u64)-1, 0);
2297                 if (ret < 0) {
2298                         err = ret;
2299                         goto out;
2300                 }
2301                 leaf = path->nodes[0];
2302                 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
2303         }
2304 #endif
2305         BUG_ON(item_size < sizeof(*ei));
2306         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
2307         __run_delayed_extent_op(extent_op, leaf, ei);
2308
2309         btrfs_mark_buffer_dirty(leaf);
2310 out:
2311         btrfs_free_path(path);
2312         return err;
2313 }
2314
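/*
 * Process a single delayed tree block ref, mirroring
 * run_delayed_data_ref() above but for metadata extents.
 */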
2315 static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
2316                                 struct btrfs_root *root,
2317                                 struct btrfs_delayed_ref_node *node,
2318                                 struct btrfs_delayed_extent_op *extent_op,
2319                                 int insert_reserved)
2320 {
2321         int ret = 0;
2322         struct btrfs_delayed_tree_ref *ref;
2323         struct btrfs_key ins;
2324         u64 parent = 0;
2325         u64 ref_root = 0;
2326         bool skinny_metadata = btrfs_fs_incompat(root->fs_info,
2327                                                  SKINNY_METADATA);
2328
2329         ref = btrfs_delayed_node_to_tree_ref(node);
2330         trace_run_delayed_tree_ref(node, ref, node->action);
2331
2332         if (node->type == BTRFS_SHARED_BLOCK_REF_KEY)
2333                 parent = ref->parent;
2334         ref_root = ref->root;
2335
2336         ins.objectid = node->bytenr;
2337         if (skinny_metadata) {
2338                 ins.offset = ref->level;
2339                 ins.type = BTRFS_METADATA_ITEM_KEY;
2340         } else {
2341                 ins.offset = node->num_bytes;
2342                 ins.type = BTRFS_EXTENT_ITEM_KEY;
2343         }
2344
2345         BUG_ON(node->ref_mod != 1);
2346         if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
2347                 BUG_ON(!extent_op || !extent_op->update_flags);
2348                 ret = alloc_reserved_tree_block(trans, root,
2349                                                 parent, ref_root,
2350                                                 extent_op->flags_to_set,
2351                                                 &extent_op->key,
2352                                                 ref->level, &ins);
2353         } else if (node->action == BTRFS_ADD_DELAYED_REF) {
2354                 ret = __btrfs_inc_extent_ref(trans, root, node,
2355                                              parent, ref_root,
2356                                              ref->level, 0, 1,
2357                                              extent_op);
2358         } else if (node->action == BTRFS_DROP_DELAYED_REF) {
2359                 ret = __btrfs_free_extent(trans, root, node,
2360                                           parent, ref_root,
2361                                           ref->level, 0, 1, extent_op);
2362         } else {
2363                 BUG();
2364         }
2365         return ret;
2366 }
2367
2368 /* helper function to actually process a single delayed ref entry */
2369 static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
2370                                struct btrfs_root *root,
2371                                struct btrfs_delayed_ref_node *node,
2372                                struct btrfs_delayed_extent_op *extent_op,
2373                                int insert_reserved)
2374 {
2375         int ret = 0;
2376
2377         if (trans->aborted) {
2378                 if (insert_reserved)
2379                         btrfs_pin_extent(root, node->bytenr,
2380                                          node->num_bytes, 1);
2381                 return 0;
2382         }
2383
2384         if (btrfs_delayed_ref_is_head(node)) {
2385                 struct btrfs_delayed_ref_head *head;
2386                 /*
2387                  * we've hit the end of the chain and we were supposed
2388                  * to insert this extent into the tree.  But, it got
2389                  * deleted before we ever needed to insert it, so all
2390                  * we have to do is clean up the accounting
2391                  */
2392                 BUG_ON(extent_op);
2393                 head = btrfs_delayed_node_to_head(node);
2394                 trace_run_delayed_ref_head(node, head, node->action);
2395
2396                 if (insert_reserved) {
2397                         btrfs_pin_extent(root, node->bytenr,
2398                                          node->num_bytes, 1);
2399                         if (head->is_data) {
2400                                 ret = btrfs_del_csums(trans, root,
2401                                                       node->bytenr,
2402                                                       node->num_bytes);
2403                         }
2404                 }
2405
2406                 /* Also free its reserved qgroup space */
2407                 btrfs_qgroup_free_delayed_ref(root->fs_info,
2408                                               head->qgroup_ref_root,
2409                                               head->qgroup_reserved);
2410                 return ret;
2411         }
2412
2413         if (node->type == BTRFS_TREE_BLOCK_REF_KEY ||
2414             node->type == BTRFS_SHARED_BLOCK_REF_KEY)
2415                 ret = run_delayed_tree_ref(trans, root, node, extent_op,
2416                                            insert_reserved);
2417         else if (node->type == BTRFS_EXTENT_DATA_REF_KEY ||
2418                  node->type == BTRFS_SHARED_DATA_REF_KEY)
2419                 ret = run_delayed_data_ref(trans, root, node, extent_op,
2420                                            insert_reserved);
2421         else
2422                 BUG();
2423         return ret;
2424 }
2425
2426 static inline struct btrfs_delayed_ref_node *
2427 select_delayed_ref(struct btrfs_delayed_ref_head *head)
2428 {
2429         struct btrfs_delayed_ref_node *ref;
2430
2431         if (list_empty(&head->ref_list))
2432                 return NULL;
2433
2434         /*
2435          * Select a delayed ref of type BTRFS_ADD_DELAYED_REF first.
2436          * This is to prevent a ref count from going down to zero, which deletes
2437          * the extent item from the extent tree, when there still are references
2438          * to add, which would fail because they would not find the extent item.
2439          */
2440         list_for_each_entry(ref, &head->ref_list, list) {
2441                 if (ref->action == BTRFS_ADD_DELAYED_REF)
2442                         return ref;
2443         }
2444
2445         return list_entry(head->ref_list.next, struct btrfs_delayed_ref_node,
2446                           list);
2447 }
2448
2449 /*
2450  * Returns 0 on success or if called with an already aborted transaction.
2451  * Returns -ENOMEM or -EIO on failure and will abort the transaction.
2452  */
2453 static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
2454                                              struct btrfs_root *root,
2455                                              unsigned long nr)
2456 {
2457         struct btrfs_delayed_ref_root *delayed_refs;
2458         struct btrfs_delayed_ref_node *ref;
2459         struct btrfs_delayed_ref_head *locked_ref = NULL;
2460         struct btrfs_delayed_extent_op *extent_op;
2461         struct btrfs_fs_info *fs_info = root->fs_info;
2462         ktime_t start = ktime_get();
2463         int ret;
2464         unsigned long count = 0;
2465         unsigned long actual_count = 0;
2466         int must_insert_reserved = 0;
2467
2468         delayed_refs = &trans->transaction->delayed_refs;
2469         while (1) {
2470                 if (!locked_ref) {
2471                         if (count >= nr)
2472                                 break;
2473
2474                         spin_lock(&delayed_refs->lock);
2475                         locked_ref = btrfs_select_ref_head(trans);
2476                         if (!locked_ref) {
2477                                 spin_unlock(&delayed_refs->lock);
2478                                 break;
2479                         }
2480
2481                         /* grab the lock that says we are going to process
2482                          * all the refs for this head */
2483                         ret = btrfs_delayed_ref_lock(trans, locked_ref);
2484                         spin_unlock(&delayed_refs->lock);
2485                         /*
2486                          * we may have dropped the spin lock to get the head
2487                          * mutex lock, and that might have given someone else
2488                          * time to free the head.  If that's true, it has been
2489                          * removed from our list and we can move on.
2490                          */
2491                         if (ret == -EAGAIN) {
2492                                 locked_ref = NULL;
2493                                 count++;
2494                                 continue;
2495                         }
2496                 }
2497
2498                 /*
2499                  * We need to try and merge add/drops of the same ref since we
2500                  * can run into issues with relocate dropping the implicit ref
2501                  * and then it being added back again before the drop can
2502                  * finish.  If we merged anything we need to re-loop so we can
2503                  * get a good ref.
2504                  * Or we can get node references of the same type that weren't
2505                  * merged when created due to bumps in the tree mod seq, and
2506                  * we need to merge them to prevent adding an inline extent
2507                  * backref before dropping it (triggering a BUG_ON at
2508                  * insert_inline_extent_backref()).
2509                  */
2510                 spin_lock(&locked_ref->lock);
2511                 btrfs_merge_delayed_refs(trans, fs_info, delayed_refs,
2512                                          locked_ref);
2513
2514                 /*
2515                  * locked_ref is the head node, so we have to go one
2516                  * node back for any delayed ref updates
2517                  */
2518                 ref = select_delayed_ref(locked_ref);
2519
2520                 if (ref && ref->seq &&
2521                     btrfs_check_delayed_seq(fs_info, delayed_refs, ref->seq)) {
2522                         spin_unlock(&locked_ref->lock);
2523                         spin_lock(&delayed_refs->lock);
2524                         locked_ref->processing = 0;
2525                         delayed_refs->num_heads_ready++;
2526                         spin_unlock(&delayed_refs->lock);
2527                         btrfs_delayed_ref_unlock(locked_ref);
2528                         locked_ref = NULL;
2529                         cond_resched();
2530                         count++;
2531                         continue;
2532                 }
2533
2534                 /*
2535                  * record the must insert reserved flag before we
2536                  * drop the spin lock.
2537                  */
2538                 must_insert_reserved = locked_ref->must_insert_reserved;
2539                 locked_ref->must_insert_reserved = 0;
2540
2541                 extent_op = locked_ref->extent_op;
2542                 locked_ref->extent_op = NULL;
2543
2544                 if (!ref) {
2547                         /* All delayed refs have been processed, go ahead
2548                          * and send the head node to run_one_delayed_ref,
2549                          * so that any accounting fixes can happen.
2550                          */
2551                         ref = &locked_ref->node;
2552
2553                         if (extent_op && must_insert_reserved) {
2554                                 btrfs_free_delayed_extent_op(extent_op);
2555                                 extent_op = NULL;
2556                         }
2557
2558                         if (extent_op) {
2559                                 spin_unlock(&locked_ref->lock);
2560                                 ret = run_delayed_extent_op(trans, root,
2561                                                             ref, extent_op);
2562                                 btrfs_free_delayed_extent_op(extent_op);
2563
2564                                 if (ret) {
2565                                         /*
2566                                          * Need to reset must_insert_reserved if
2567                                          * there was an error so the abort stuff
2568                                          * can clean up the reserved space
2569                                          * properly.
2570                                          */
2571                                         if (must_insert_reserved)
2572                                                 locked_ref->must_insert_reserved = 1;
2573                                         spin_lock(&delayed_refs->lock);
2574                                         locked_ref->processing = 0;
2575                                         delayed_refs->num_heads_ready++;
2576                                         spin_unlock(&delayed_refs->lock);
2577                                         btrfs_debug(fs_info, "run_delayed_extent_op returned %d", ret);
2578                                         btrfs_delayed_ref_unlock(locked_ref);
2579                                         return ret;
2580                                 }
2581                                 continue;
2582                         }
2583
2584                         /*
2585                          * Need to drop our head ref lock and re-acquire the
2586                          * delayed ref lock and then re-check to make sure
2587                          * nobody got added.
2588                          */
2589                         spin_unlock(&locked_ref->lock);
2590                         spin_lock(&delayed_refs->lock);
2591                         spin_lock(&locked_ref->lock);
2592                         if (!list_empty(&locked_ref->ref_list) ||
2593                             locked_ref->extent_op) {
2594                                 spin_unlock(&locked_ref->lock);
2595                                 spin_unlock(&delayed_refs->lock);
2596                                 continue;
2597                         }
2598                         ref->in_tree = 0;
2599                         delayed_refs->num_heads--;
2600                         rb_erase(&locked_ref->href_node,
2601                                  &delayed_refs->href_root);
2602                         spin_unlock(&delayed_refs->lock);
2603                 } else {
2604                         actual_count++;
2605                         ref->in_tree = 0;
2606                         list_del(&ref->list);
2607                 }
2608                 atomic_dec(&delayed_refs->num_entries);
2609
2610                 if (!btrfs_delayed_ref_is_head(ref)) {
2611                         /*
2612                          * when we play the delayed ref, also correct the
2613                          * ref_mod on head
2614                          */
2615                         switch (ref->action) {
2616                         case BTRFS_ADD_DELAYED_REF:
2617                         case BTRFS_ADD_DELAYED_EXTENT:
2618                                 locked_ref->node.ref_mod -= ref->ref_mod;
2619                                 break;
2620                         case BTRFS_DROP_DELAYED_REF:
2621                                 locked_ref->node.ref_mod += ref->ref_mod;
2622                                 break;
2623                         default:
2624                                 WARN_ON(1);
2625                         }
2626                 }
2627                 spin_unlock(&locked_ref->lock);
2628
2629                 ret = run_one_delayed_ref(trans, root, ref, extent_op,
2630                                           must_insert_reserved);
2631
2632                 btrfs_free_delayed_extent_op(extent_op);
2633                 if (ret) {
2634                         locked_ref->processing = 0;
2635                         btrfs_delayed_ref_unlock(locked_ref);
2636                         btrfs_put_delayed_ref(ref);
2637                         btrfs_debug(fs_info, "run_one_delayed_ref returned %d", ret);
2638                         return ret;
2639                 }
2640
2641                 /*
2642                  * If this node is a head, that means all the refs in this head
2643                  * have been dealt with, and we will pick the next head to deal
2644                  * with, so we must unlock the head and drop it from the cluster
2645                  * list before we release it.
2646                  */
2647                 if (btrfs_delayed_ref_is_head(ref)) {
2648                         if (locked_ref->is_data &&
2649                             locked_ref->total_ref_mod < 0) {
2650                                 spin_lock(&delayed_refs->lock);
2651                                 delayed_refs->pending_csums -= ref->num_bytes;
2652                                 spin_unlock(&delayed_refs->lock);
2653                         }
2654                         btrfs_delayed_ref_unlock(locked_ref);
2655                         locked_ref = NULL;
2656                 }
2657                 btrfs_put_delayed_ref(ref);
2658                 count++;
2659                 cond_resched();
2660         }
2661
2662         /*
2663          * We don't want to include ref heads since we can have empty ref heads
2664          * and those will drastically skew our runtime down since we just do
2665          * accounting, no actual extent tree updates.
2666          */
2667         if (actual_count > 0) {
2668                 u64 runtime = ktime_to_ns(ktime_sub(ktime_get(), start));
2669                 u64 avg;
2670
2671                 /*
2672                  * We weigh the current average higher than our current runtime
2673                  * to avoid large swings in the average.
2674                  */
2675                 spin_lock(&delayed_refs->lock);
2676                 avg = fs_info->avg_delayed_ref_runtime * 3 + runtime;
2677                 fs_info->avg_delayed_ref_runtime = avg >> 2;    /* div by 4 */
2678                 spin_unlock(&delayed_refs->lock);
2679         }
2680         return 0;
2681 }
2682
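/*
 * The average update just above is a simple exponential moving average with
 * a 3/4 weight on the old value:
 *
 *      avg_new = (3 * avg_old + runtime) / 4
 *
 * Worked example (illustrative numbers only): with avg_old = 100us and a
 * run that took 500us, avg_new = (300us + 500us) / 4 = 200us, so a single
 * slow run only moves the estimate a quarter of the way toward the new
 * sample.
 */
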
2683 #ifdef SCRAMBLE_DELAYED_REFS
2684 /*
2685  * Normally delayed refs get processed in ascending bytenr order. This
2686  * correlates in most cases to the order added. To expose dependencies on this
2687  * order, we start to process the tree in the middle instead of the beginning
2688  */
2689 static u64 find_middle(struct rb_root *root)
2690 {
2691         struct rb_node *n = root->rb_node;
2692         struct btrfs_delayed_ref_node *entry;
2693         int alt = 1;
2694         u64 middle;
2695         u64 first = 0, last = 0;
2696
2697         n = rb_first(root);
2698         if (n) {
2699                 entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
2700                 first = entry->bytenr;
2701         }
2702         n = rb_last(root);
2703         if (n) {
2704                 entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
2705                 last = entry->bytenr;
2706         }
2707         n = root->rb_node;
2708
2709         while (n) {
2710                 entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
2711                 WARN_ON(!entry->in_tree);
2712
2713                 middle = entry->bytenr;
2714
2715                 if (alt)
2716                         n = n->rb_left;
2717                 else
2718                         n = n->rb_right;
2719
2720                 alt = 1 - alt;
2721         }
2722         return middle;
2723 }
2724 #endif
2725
2726 static inline u64 heads_to_leaves(struct btrfs_root *root, u64 heads)
2727 {
2728         u64 num_bytes;
2729
2730         num_bytes = heads * (sizeof(struct btrfs_extent_item) +
2731                              sizeof(struct btrfs_extent_inline_ref));
2732         if (!btrfs_fs_incompat(root->fs_info, SKINNY_METADATA))
2733                 num_bytes += heads * sizeof(struct btrfs_tree_block_info);
2734
2735         /*
2736          * We don't ever fill up leaves all the way so multiply by 2 just to be
2737          * closer to what we're really going to want to use.
2738          */
2739         return div_u64(num_bytes, BTRFS_LEAF_DATA_SIZE(root));
2740 }
2741
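/*
 * Rough worked example for heads_to_leaves() above (sizes are approximate
 * and depend on the node size and on-disk format): with skinny metadata
 * each head is charged sizeof(extent item) + sizeof(inline ref), a few tens
 * of bytes, so a ~16KiB leaf covers several hundred heads worth of bytes;
 * the caller in btrfs_check_space_for_delayed_refs() then doubles the
 * resulting byte count (num_bytes <<= 1) because leaves are rarely packed
 * full.
 */
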
2742 /*
2743  * Takes the number of bytes to be checksummed and figures out how many leaves it
2744  * would require to store the csums for that many bytes.
2745  */
2746 u64 btrfs_csum_bytes_to_leaves(struct btrfs_root *root, u64 csum_bytes)
2747 {
2748         u64 csum_size;
2749         u64 num_csums_per_leaf;
2750         u64 num_csums;
2751
2752         csum_size = BTRFS_LEAF_DATA_SIZE(root) - sizeof(struct btrfs_item);
2753         num_csums_per_leaf = div64_u64(csum_size,
2754                         (u64)btrfs_super_csum_size(root->fs_info->super_copy));
2755         num_csums = div64_u64(csum_bytes, root->sectorsize);
2756         num_csums += num_csums_per_leaf - 1;
2757         num_csums = div64_u64(num_csums, num_csums_per_leaf);
2758         return num_csums;
2759 }
2760
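/*
 * Illustrative example (assuming 4KiB sectors, 4-byte crc32c checksums and
 * roughly 16KiB of usable leaf space): one leaf holds about 16KiB / 4 ~= 4k
 * checksums, 1GiB of data needs 1GiB / 4KiB = 256k checksums, so the
 * round-up division above works out to roughly 64 leaves.
 */
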
2761 int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans,
2762                                        struct btrfs_root *root)
2763 {
2764         struct btrfs_block_rsv *global_rsv;
2765         u64 num_heads = trans->transaction->delayed_refs.num_heads_ready;
2766         u64 csum_bytes = trans->transaction->delayed_refs.pending_csums;
2767         u64 num_dirty_bgs = trans->transaction->num_dirty_bgs;
2768         u64 num_bytes, num_dirty_bgs_bytes;
2769         int ret = 0;
2770
2771         num_bytes = btrfs_calc_trans_metadata_size(root, 1);
2772         num_heads = heads_to_leaves(root, num_heads);
2773         if (num_heads > 1)
2774                 num_bytes += (num_heads - 1) * root->nodesize;
2775         num_bytes <<= 1;
2776         num_bytes += btrfs_csum_bytes_to_leaves(root, csum_bytes) * root->nodesize;
2777         num_dirty_bgs_bytes = btrfs_calc_trans_metadata_size(root,
2778                                                              num_dirty_bgs);
2779         global_rsv = &root->fs_info->global_block_rsv;
2780
2781         /*
2782          * If we can't allocate any more chunks, let's make sure we have _lots_ of
2783          * wiggle room since running delayed refs can create more delayed refs.
2784          */
2785         if (global_rsv->space_info->full) {
2786                 num_dirty_bgs_bytes <<= 1;
2787                 num_bytes <<= 1;
2788         }
2789
2790         spin_lock(&global_rsv->lock);
2791         if (global_rsv->reserved <= num_bytes + num_dirty_bgs_bytes)
2792                 ret = 1;
2793         spin_unlock(&global_rsv->lock);
2794         return ret;
2795 }
2796
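/*
 * Putting the pieces above together, the check is roughly
 *
 *      reserved <= 2 * (trans_unit + (head_leaves - 1) * nodesize)
 *                  + csum_leaves * nodesize
 *                  + trans_unit * num_dirty_bgs
 *
 * where trans_unit is btrfs_calc_trans_metadata_size(root, 1).  When the
 * metadata space_info is already full, both the delayed ref estimate and the
 * dirty block group estimate are doubled once more, since running delayed
 * refs can itself queue more delayed refs.  Returning 1 means the global
 * reserve might not absorb that pessimistic estimate.
 */
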
2797 int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans,
2798                                        struct btrfs_root *root)
2799 {
2800         struct btrfs_fs_info *fs_info = root->fs_info;
2801         u64 num_entries =
2802                 atomic_read(&trans->transaction->delayed_refs.num_entries);
2803         u64 avg_runtime;
2804         u64 val;
2805
2806         smp_mb();
2807         avg_runtime = fs_info->avg_delayed_ref_runtime;
2808         val = num_entries * avg_runtime;
2809         if (num_entries * avg_runtime >= NSEC_PER_SEC)
2810                 return 1;
2811         if (val >= NSEC_PER_SEC / 2)
2812                 return 2;
2813
2814         return btrfs_check_space_for_delayed_refs(trans, root);
2815 }
2816
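/*
 * Example with illustrative numbers: at an average of ~20us per delayed ref
 * and ~60k queued entries the estimated backlog is ~1.2s, so this returns 1;
 * at ~30k entries (~0.6s) it returns 2; below half a second of estimated
 * work it falls back to the global reserve check above.
 */
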
2817 struct async_delayed_refs {
2818         struct btrfs_root *root;
2819         int count;
2820         int error;
2821         int sync;
2822         struct completion wait;
2823         struct btrfs_work work;
2824 };
2825
2826 static void delayed_ref_async_start(struct btrfs_work *work)
2827 {
2828         struct async_delayed_refs *async;
2829         struct btrfs_trans_handle *trans;
2830         int ret;
2831
2832         async = container_of(work, struct async_delayed_refs, work);
2833
2834         trans = btrfs_join_transaction(async->root);
2835         if (IS_ERR(trans)) {
2836                 async->error = PTR_ERR(trans);
2837                 goto done;
2838         }
2839
2840         /*
2841          * trans->sync means that when we call end_transaction, we won't
2842          * wait on delayed refs
2843          */
2844         trans->sync = true;
2845         ret = btrfs_run_delayed_refs(trans, async->root, async->count);
2846         if (ret)
2847                 async->error = ret;
2848
2849         ret = btrfs_end_transaction(trans, async->root);
2850         if (ret && !async->error)
2851                 async->error = ret;
2852 done:
2853         if (async->sync)
2854                 complete(&async->wait);
2855         else
2856                 kfree(async);
2857 }
2858
2859 int btrfs_async_run_delayed_refs(struct btrfs_root *root,
2860                                  unsigned long count, int wait)
2861 {
2862         struct async_delayed_refs *async;
2863         int ret;
2864
2865         async = kmalloc(sizeof(*async), GFP_NOFS);
2866         if (!async)
2867                 return -ENOMEM;
2868
2869         async->root = root->fs_info->tree_root;
2870         async->count = count;
2871         async->error = 0;
2872         if (wait)
2873                 async->sync = 1;
2874         else
2875                 async->sync = 0;
2876         init_completion(&async->wait);
2877
2878         btrfs_init_work(&async->work, btrfs_extent_refs_helper,
2879                         delayed_ref_async_start, NULL, NULL);
2880
2881         btrfs_queue_work(root->fs_info->extent_workers, &async->work);
2882
2883         if (wait) {
2884                 wait_for_completion(&async->wait);
2885                 ret = async->error;
2886                 kfree(async);
2887                 return ret;
2888         }
2889         return 0;
2890 }
2891
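/*
 * Usage sketch for btrfs_async_run_delayed_refs() (hypothetical caller, nr
 * being however many refs it wants run): a caller that needs the work done
 * before returning passes wait == 1 and gets the worker's error code back,
 * while a fire-and-forget caller passes wait == 0 and the async_delayed_refs
 * struct frees itself once delayed_ref_async_start() completes.
 *
 *      err = btrfs_async_run_delayed_refs(root, nr, 1);  (synchronous)
 *      btrfs_async_run_delayed_refs(root, nr, 0);        (fire and forget)
 */
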
2892 /*
2893  * this starts processing the delayed reference count updates and
2894  * extent insertions we have queued up so far.  count can be
2895  * 0, which means to process everything in the tree at the start
2896  * of the run (but not newly added entries), or it can be some target
2897  * number you'd like to process.
2898  *
2899  * Returns 0 on success or if called with an aborted transaction
2900  * Returns <0 on error and aborts the transaction
2901  */
2902 int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
2903                            struct btrfs_root *root, unsigned long count)
2904 {
2905         struct rb_node *node;
2906         struct btrfs_delayed_ref_root *delayed_refs;
2907         struct btrfs_delayed_ref_head *head;
2908         int ret;
2909         int run_all = count == (unsigned long)-1;
2910         bool can_flush_pending_bgs = trans->can_flush_pending_bgs;
2911
2912         /* We'll clean this up in btrfs_cleanup_transaction */
2913         if (trans->aborted)
2914                 return 0;
2915
2916         if (root == root->fs_info->extent_root)
2917                 root = root->fs_info->tree_root;
2918
2919         delayed_refs = &trans->transaction->delayed_refs;
2920         if (count == 0)
2921                 count = atomic_read(&delayed_refs->num_entries) * 2;
2922
2923 again:
2924 #ifdef SCRAMBLE_DELAYED_REFS
2925         delayed_refs->run_delayed_start = find_middle(&delayed_refs->root);
2926 #endif
2927         trans->can_flush_pending_bgs = false;
2928         ret = __btrfs_run_delayed_refs(trans, root, count);
2929         if (ret < 0) {
2930                 btrfs_abort_transaction(trans, root, ret);
2931                 return ret;
2932         }
2933
2934         if (run_all) {
2935                 if (!list_empty(&trans->new_bgs))
2936                         btrfs_create_pending_block_groups(trans, root);
2937
2938                 spin_lock(&delayed_refs->lock);
2939                 node = rb_first(&delayed_refs->href_root);
2940                 if (!node) {
2941                         spin_unlock(&delayed_refs->lock);
2942                         goto out;
2943                 }
2944                 count = (unsigned long)-1;
2945
2946                 while (node) {
2947                         head = rb_entry(node, struct btrfs_delayed_ref_head,
2948                                         href_node);
2949                         if (btrfs_delayed_ref_is_head(&head->node)) {
2950                                 struct btrfs_delayed_ref_node *ref;
2951
2952                                 ref = &head->node;
2953                                 atomic_inc(&ref->refs);
2954
2955                                 spin_unlock(&delayed_refs->lock);
2956                                 /*
2957                                  * Mutex was contended, block until it's
2958                                  * released and try again
2959                                  */
2960                                 mutex_lock(&head->mutex);
2961                                 mutex_unlock(&head->mutex);
2962
2963                                 btrfs_put_delayed_ref(ref);
2964                                 cond_resched();
2965                                 goto again;
2966                         } else {
2967                                 WARN_ON(1);
2968                         }
2969                         node = rb_next(node);
2970                 }
2971                 spin_unlock(&delayed_refs->lock);
2972                 cond_resched();
2973                 goto again;
2974         }
2975 out:
2976         assert_qgroups_uptodate(trans);
2977         trans->can_flush_pending_bgs = can_flush_pending_bgs;
2978         return 0;
2979 }
2980
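/*
 * Count semantics in practice: count == 0 is expanded to twice the number of
 * currently queued entries, i.e. "flush roughly what is queued right now",
 * while count == (unsigned long)-1 sets run_all and keeps looping until the
 * href_root rbtree is drained, blocking on any contended head mutex along
 * the way.
 */
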
2981 int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
2982                                 struct btrfs_root *root,
2983                                 u64 bytenr, u64 num_bytes, u64 flags,
2984                                 int level, int is_data)
2985 {
2986         struct btrfs_delayed_extent_op *extent_op;
2987         int ret;
2988
2989         extent_op = btrfs_alloc_delayed_extent_op();
2990         if (!extent_op)
2991                 return -ENOMEM;
2992
2993         extent_op->flags_to_set = flags;
2994         extent_op->update_flags = 1;
2995         extent_op->update_key = 0;
2996         extent_op->is_data = is_data ? 1 : 0;
2997         extent_op->level = level;
2998
2999         ret = btrfs_add_delayed_extent_op(root->fs_info, trans, bytenr,
3000                                           num_bytes, extent_op);
3001         if (ret)
3002                 btrfs_free_delayed_extent_op(extent_op);
3003         return ret;
3004 }
3005
3006 static noinline int check_delayed_ref(struct btrfs_trans_handle *trans,
3007                                       struct btrfs_root *root,
3008                                       struct btrfs_path *path,
3009                                       u64 objectid, u64 offset, u64 bytenr)
3010 {
3011         struct btrfs_delayed_ref_head *head;
3012         struct btrfs_delayed_ref_node *ref;
3013         struct btrfs_delayed_data_ref *data_ref;
3014         struct btrfs_delayed_ref_root *delayed_refs;
3015         int ret = 0;
3016
3017         delayed_refs = &trans->transaction->delayed_refs;
3018         spin_lock(&delayed_refs->lock);
3019         head = btrfs_find_delayed_ref_head(trans, bytenr);
3020         if (!head) {
3021                 spin_unlock(&delayed_refs->lock);
3022                 return 0;
3023         }
3024
3025         if (!mutex_trylock(&head->mutex)) {
3026                 atomic_inc(&head->node.refs);
3027                 spin_unlock(&delayed_refs->lock);
3028
3029                 btrfs_release_path(path);
3030
3031                 /*
3032                  * Mutex was contended, block until it's released and let
3033                  * caller try again
3034                  */
3035                 mutex_lock(&head->mutex);
3036                 mutex_unlock(&head->mutex);
3037                 btrfs_put_delayed_ref(&head->node);
3038                 return -EAGAIN;
3039         }
3040         spin_unlock(&delayed_refs->lock);
3041
3042         spin_lock(&head->lock);
3043         list_for_each_entry(ref, &head->ref_list, list) {
3044                 /* If it's a shared ref we know a cross reference exists */
3045                 if (ref->type != BTRFS_EXTENT_DATA_REF_KEY) {
3046                         ret = 1;
3047                         break;
3048                 }
3049
3050                 data_ref = btrfs_delayed_node_to_data_ref(ref);
3051
3052                 /*
3053                  * If our ref doesn't match the one we're currently looking at
3054                  * then we have a cross reference.
3055                  */
3056                 if (data_ref->root != root->root_key.objectid ||
3057                     data_ref->objectid != objectid ||
3058                     data_ref->offset != offset) {
3059                         ret = 1;
3060                         break;
3061                 }
3062         }
3063         spin_unlock(&head->lock);
3064         mutex_unlock(&head->mutex);
3065         return ret;
3066 }
3067
3068 static noinline int check_committed_ref(struct btrfs_trans_handle *trans,
3069                                         struct btrfs_root *root,
3070                                         struct btrfs_path *path,
3071                                         u64 objectid, u64 offset, u64 bytenr)
3072 {
3073         struct btrfs_root *extent_root = root->fs_info->extent_root;
3074         struct extent_buffer *leaf;
3075         struct btrfs_extent_data_ref *ref;
3076         struct btrfs_extent_inline_ref *iref;
3077         struct btrfs_extent_item *ei;
3078         struct btrfs_key key;
3079         u32 item_size;
3080         int ret;
3081
3082         key.objectid = bytenr;
3083         key.offset = (u64)-1;
3084         key.type = BTRFS_EXTENT_ITEM_KEY;
3085
3086         ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
3087         if (ret < 0)
3088                 goto out;
3089         BUG_ON(ret == 0); /* Corruption */
3090
3091         ret = -ENOENT;
3092         if (path->slots[0] == 0)
3093                 goto out;
3094
3095         path->slots[0]--;
3096         leaf = path->nodes[0];
3097         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
3098
3099         if (key.objectid != bytenr || key.type != BTRFS_EXTENT_ITEM_KEY)
3100                 goto out;
3101
3102         ret = 1;
3103         item_size = btrfs_item_size_nr(leaf, path->slots[0]);
3104 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
3105         if (item_size < sizeof(*ei)) {
3106                 WARN_ON(item_size != sizeof(struct btrfs_extent_item_v0));
3107                 goto out;
3108         }
3109 #endif
3110         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
3111
3112         if (item_size != sizeof(*ei) +
3113             btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY))
3114                 goto out;
3115
3116         if (btrfs_extent_generation(leaf, ei) <=
3117             btrfs_root_last_snapshot(&root->root_item))
3118                 goto out;
3119
3120         iref = (struct btrfs_extent_inline_ref *)(ei + 1);
3121         if (btrfs_extent_inline_ref_type(leaf, iref) !=
3122             BTRFS_EXTENT_DATA_REF_KEY)
3123                 goto out;
3124
3125         ref = (struct btrfs_extent_data_ref *)(&iref->offset);
3126         if (btrfs_extent_refs(leaf, ei) !=
3127             btrfs_extent_data_ref_count(leaf, ref) ||
3128             btrfs_extent_data_ref_root(leaf, ref) !=
3129             root->root_key.objectid ||
3130             btrfs_extent_data_ref_objectid(leaf, ref) != objectid ||
3131             btrfs_extent_data_ref_offset(leaf, ref) != offset)
3132                 goto out;
3133
3134         ret = 0;
3135 out:
3136         return ret;
3137 }
3138
3139 int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
3140                           struct btrfs_root *root,
3141                           u64 objectid, u64 offset, u64 bytenr)
3142 {
3143         struct btrfs_path *path;
3144         int ret;
3145         int ret2;
3146
3147         path = btrfs_alloc_path();
3148         if (!path)
3149                 return -ENOENT;
3150
3151         do {
3152                 ret = check_committed_ref(trans, root, path, objectid,
3153                                           offset, bytenr);
3154                 if (ret && ret != -ENOENT)
3155                         goto out;
3156
3157                 ret2 = check_delayed_ref(trans, root, path, objectid,
3158                                          offset, bytenr);
3159         } while (ret2 == -EAGAIN);
3160
3161         if (ret2 && ret2 != -ENOENT) {
3162                 ret = ret2;
3163                 goto out;
3164         }
3165
3166         if (ret != -ENOENT || ret2 != -ENOENT)
3167                 ret = 0;
3168 out:
3169         btrfs_free_path(path);
3170         if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
3171                 WARN_ON(ret > 0);
3172         return ret;
3173 }
3174
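/*
 * Note on the combined result above: both helpers return 1 as soon as they
 * see anything other than a single data ref matching this root, objectid and
 * offset, so a nonzero return from btrfs_cross_ref_exist() means "the extent
 * may be shared (or sole ownership could not be proven)"; only the fully
 * verified single-owner case comes back as 0.
 */
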
3175 static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
3176                            struct btrfs_root *root,
3177                            struct extent_buffer *buf,
3178                            int full_backref, int inc)
3179 {
3180         u64 bytenr;
3181         u64 num_bytes;
3182         u64 parent;
3183         u64 ref_root;
3184         u32 nritems;
3185         struct btrfs_key key;
3186         struct btrfs_file_extent_item *fi;
3187         int i;
3188         int level;
3189         int ret = 0;
3190         int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *,
3191                             u64, u64, u64, u64, u64, u64);
3192
3193
3194         if (btrfs_test_is_dummy_root(root))
3195                 return 0;
3196
3197         ref_root = btrfs_header_owner(buf);
3198         nritems = btrfs_header_nritems(buf);
3199         level = btrfs_header_level(buf);
3200
3201         if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state) && level == 0)
3202                 return 0;
3203
3204         if (inc)
3205                 process_func = btrfs_inc_extent_ref;
3206         else
3207                 process_func = btrfs_free_extent;
3208
3209         if (full_backref)
3210                 parent = buf->start;
3211         else
3212                 parent = 0;
3213
3214         for (i = 0; i < nritems; i++) {
3215                 if (level == 0) {
3216                         btrfs_item_key_to_cpu(buf, &key, i);
3217                         if (key.type != BTRFS_EXTENT_DATA_KEY)
3218                                 continue;
3219                         fi = btrfs_item_ptr(buf, i,
3220                                             struct btrfs_file_extent_item);
3221                         if (btrfs_file_extent_type(buf, fi) ==
3222                             BTRFS_FILE_EXTENT_INLINE)
3223                                 continue;
3224                         bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
3225                         if (bytenr == 0)
3226                                 continue;
3227
3228                         num_bytes = btrfs_file_extent_disk_num_bytes(buf, fi);
3229                         key.offset -= btrfs_file_extent_offset(buf, fi);
3230                         ret = process_func(trans, root, bytenr, num_bytes,
3231                                            parent, ref_root, key.objectid,
3232                                            key.offset);
3233                         if (ret)
3234                                 goto fail;
3235                 } else {
3236                         bytenr = btrfs_node_blockptr(buf, i);
3237                         num_bytes = root->nodesize;
3238                         ret = process_func(trans, root, bytenr, num_bytes,
3239                                            parent, ref_root, level - 1, 0);
3240                         if (ret)
3241                                 goto fail;
3242                 }
3243         }
3244         return 0;
3245 fail:
3246         return ret;
3247 }
3248
3249 int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
3250                   struct extent_buffer *buf, int full_backref)
3251 {
3252         return __btrfs_mod_ref(trans, root, buf, full_backref, 1);
3253 }
3254
3255 int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
3256                   struct extent_buffer *buf, int full_backref)
3257 {
3258         return __btrfs_mod_ref(trans, root, buf, full_backref, 0);
3259 }
3260
3261 static int write_one_cache_group(struct btrfs_trans_handle *trans,
3262                                  struct btrfs_root *root,
3263                                  struct btrfs_path *path,
3264                                  struct btrfs_block_group_cache *cache)
3265 {
3266         int ret;
3267         struct btrfs_root *extent_root = root->fs_info->extent_root;
3268         unsigned long bi;
3269         struct extent_buffer *leaf;
3270
3271         ret = btrfs_search_slot(trans, extent_root, &cache->key, path, 0, 1);
3272         if (ret) {
3273                 if (ret > 0)
3274                         ret = -ENOENT;
3275                 goto fail;
3276         }
3277
3278         leaf = path->nodes[0];
3279         bi = btrfs_item_ptr_offset(leaf, path->slots[0]);
3280         write_extent_buffer(leaf, &cache->item, bi, sizeof(cache->item));
3281         btrfs_mark_buffer_dirty(leaf);
3282 fail:
3283         btrfs_release_path(path);
3284         return ret;
3285
3286 }
3287
3288 static struct btrfs_block_group_cache *
3289 next_block_group(struct btrfs_root *root,
3290                  struct btrfs_block_group_cache *cache)
3291 {
3292         struct rb_node *node;
3293
3294         spin_lock(&root->fs_info->block_group_cache_lock);
3295
3296         /* If our block group was removed, we need a full search. */
3297         if (RB_EMPTY_NODE(&cache->cache_node)) {
3298                 const u64 next_bytenr = cache->key.objectid + cache->key.offset;
3299
3300                 spin_unlock(&root->fs_info->block_group_cache_lock);
3301                 btrfs_put_block_group(cache);
3302                 cache = btrfs_lookup_first_block_group(root->fs_info,
3303                                                        next_bytenr);
3304                 return cache;
3305         }
3306         node = rb_next(&cache->cache_node);
3307         btrfs_put_block_group(cache);
3308         if (node) {
3309                 cache = rb_entry(node, struct btrfs_block_group_cache,
3310                                  cache_node);
3311                 btrfs_get_block_group(cache);
3312         } else
3313                 cache = NULL;
3314         spin_unlock(&root->fs_info->block_group_cache_lock);
3315         return cache;
3316 }
3317
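/*
 * Usage sketch (hypothetical caller): next_block_group() drops the caller's
 * reference on @cache and returns the next group with its own reference held
 * (or NULL at the end), so walking every block group looks roughly like:
 *
 *      cache = btrfs_lookup_first_block_group(root->fs_info, 0);
 *      while (cache) {
 *              ... use cache ...
 *              cache = next_block_group(root, cache);
 *      }
 */
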
3318 static int cache_save_setup(struct btrfs_block_group_cache *block_group,
3319                             struct btrfs_trans_handle *trans,
3320                             struct btrfs_path *path)
3321 {
3322         struct btrfs_root *root = block_group->fs_info->tree_root;
3323         struct inode *inode = NULL;
3324         u64 alloc_hint = 0;
3325         int dcs = BTRFS_DC_ERROR;
3326         u64 num_pages = 0;
3327         int retries = 0;
3328         int ret = 0;
3329
3330         /*
3331          * If this block group is smaller than 100 megs, don't bother caching the
3332          * block group.
3333          */
3334         if (block_group->key.offset < (100 * 1024 * 1024)) {
3335                 spin_lock(&block_group->lock);
3336                 block_group->disk_cache_state = BTRFS_DC_WRITTEN;
3337                 spin_unlock(&block_group->lock);
3338                 return 0;
3339         }
3340
3341         if (trans->aborted)
3342                 return 0;
3343 again:
3344         inode = lookup_free_space_inode(root, block_group, path);
3345         if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) {
3346                 ret = PTR_ERR(inode);
3347                 btrfs_release_path(path);
3348                 goto out;
3349         }
3350
3351         if (IS_ERR(inode)) {
3352                 BUG_ON(retries);
3353                 retries++;
3354
3355                 if (block_group->ro)
3356                         goto out_free;
3357
3358                 ret = create_free_space_inode(root, trans, block_group, path);
3359                 if (ret)
3360                         goto out_free;
3361                 goto again;
3362         }
3363
3364         /*
3365          * We want to set the generation to 0, that way if anything goes wrong
3366          * from here on out we know not to trust this cache when we load up next
3367          * time.
3368          */
3369         BTRFS_I(inode)->generation = 0;
3370         ret = btrfs_update_inode(trans, root, inode);
3371         if (ret) {
3372                 /*
3373                  * So theoretically we could recover from this, simply set the
3374                  * super cache generation to 0 so we know to invalidate the
3375                  * cache, but then we'd have to keep track of the block groups
3376                  * that fail this way so we know we _have_ to reset this cache
3377                  * before the next commit or risk reading stale cache.  So to
3378                  * limit our exposure to horrible edge cases, let's just abort the
3379                  * transaction; this only happens in really bad situations
3380                  * anyway.
3381                  */
3382                 btrfs_abort_transaction(trans, root, ret);
3383                 goto out_put;
3384         }
3385         WARN_ON(ret);
3386
3387         /* We've already setup this transaction, go ahead and exit */
3388         if (block_group->cache_generation == trans->transid &&
3389             i_size_read(inode)) {
3390                 dcs = BTRFS_DC_SETUP;
3391                 goto out_put;
3392         }
3393
3394         if (i_size_read(inode) > 0) {
3395                 ret = btrfs_check_trunc_cache_free_space(root,
3396                                         &root->fs_info->global_block_rsv);
3397                 if (ret)
3398                         goto out_put;
3399
3400                 ret = btrfs_truncate_free_space_cache(root, trans, NULL, inode);
3401                 if (ret)
3402                         goto out_put;
3403         }
3404
3405         spin_lock(&block_group->lock);
3406         if (block_group->cached != BTRFS_CACHE_FINISHED ||
3407             !btrfs_test_opt(root, SPACE_CACHE)) {
3408                 /*
3409                  * don't bother trying to write stuff out _if_
3410                  * a) we're not cached,
3411                  * b) we're mounted with the nospace_cache option.
3412                  */
3413                 dcs = BTRFS_DC_WRITTEN;
3414                 spin_unlock(&block_group->lock);
3415                 goto out_put;
3416         }
3417         spin_unlock(&block_group->lock);
3418
3419         /*
3420          * We hit an ENOSPC when setting up the cache in this transaction; just
3421          * skip doing the setup, we've already cleared the cache so we're safe.
3422          */
3423         if (test_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags)) {
3424                 ret = -ENOSPC;
3425                 goto out_put;
3426         }
3427
3428         /*
3429          * Try to preallocate enough space based on how big the block group is.
3430          * Keep in mind this has to include any pinned space which could end up
3431          * taking up quite a bit since it's not folded into the other space
3432          * cache.
3433          */
3434         num_pages = div_u64(block_group->key.offset, 256 * 1024 * 1024);
3435         if (!num_pages)
3436                 num_pages = 1;
3437
3438         num_pages *= 16;
3439         num_pages *= PAGE_CACHE_SIZE;
3440
3441         ret = btrfs_check_data_free_space(inode, 0, num_pages);
3442         if (ret)
3443                 goto out_put;
3444
3445         ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, num_pages,
3446                                               num_pages, num_pages,
3447                                               &alloc_hint);
3448         /*
3449          * Our cache requires contiguous chunks so that we don't modify a bunch
3450          * of metadata or split extents when writing the cache out, which means
3451          * we can enospc if we are heavily fragmented in addition to just normal
3452          * out of space conditions.  So if we hit this just skip setting up any
3453          * other block groups for this transaction, maybe we'll unpin enough
3454          * space the next time around.
3455          */
3456         if (!ret)
3457                 dcs = BTRFS_DC_SETUP;
3458         else if (ret == -ENOSPC)
3459                 set_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags);
3460         btrfs_free_reserved_data_space(inode, 0, num_pages);
3461
3462 out_put:
3463         iput(inode);
3464 out_free:
3465         btrfs_release_path(path);
3466 out:
3467         spin_lock(&block_group->lock);
3468         if (!ret && dcs == BTRFS_DC_SETUP)
3469                 block_group->cache_generation = trans->transid;
3470         block_group->disk_cache_state = dcs;
3471         spin_unlock(&block_group->lock);
3472
3473         return ret;
3474 }
3475
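/*
 * Sizing example for the preallocation above (illustrative numbers): a 1GiB
 * block group gives div_u64(1GiB, 256MiB) = 4, times 16 is 64 pages, so with
 * 4KiB pages about 256KiB of space cache gets preallocated; anything smaller
 * than 256MiB (but above the 100 meg cutoff at the top) still gets the
 * one-unit minimum of 16 pages, i.e. 64KiB.
 */
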
3476 int btrfs_setup_space_cache(struct btrfs_trans_handle *trans,
3477                             struct btrfs_root *root)
3478 {
3479         struct btrfs_block_group_cache *cache, *tmp;
3480         struct btrfs_transaction *cur_trans = trans->transaction;
3481         struct btrfs_path *path;
3482
3483         if (list_empty(&cur_trans->dirty_bgs) ||
3484             !btrfs_test_opt(root, SPACE_CACHE))
3485                 return 0;
3486
3487         path = btrfs_alloc_path();
3488         if (!path)
3489                 return -ENOMEM;
3490
3491         /* Could add new block groups, use _safe just in case */
3492         list_for_each_entry_safe(cache, tmp, &cur_trans->dirty_bgs,
3493                                  dirty_list) {
3494                 if (cache->disk_cache_state == BTRFS_DC_CLEAR)
3495                         cache_save_setup(cache, trans, path);
3496         }
3497
3498         btrfs_free_path(path);
3499         return 0;
3500 }
3501
3502 /*
3503  * transaction commit does final block group cache writeback during a
3504  * critical section where nothing is allowed to change the FS.  This is
3505  * required in order for the cache to actually match the block group,
3506  * but can introduce a lot of latency into the commit.
3507  *
3508  * So, btrfs_start_dirty_block_groups is here to kick off block group
3509  * cache IO.  There's a chance we'll have to redo some of it if the
3510  * block group changes again during the commit, but it greatly reduces
3511  * the commit latency by getting rid of the easy block groups while
3512  * we're still allowing others to join the commit.
3513  */
3514 int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans,
3515                                    struct btrfs_root *root)
3516 {
3517         struct btrfs_block_group_cache *cache;
3518         struct btrfs_transaction *cur_trans = trans->transaction;
3519         int ret = 0;
3520         int should_put;
3521         struct btrfs_path *path = NULL;
3522         LIST_HEAD(dirty);
3523         struct list_head *io = &cur_trans->io_bgs;
3524         int num_started = 0;
3525         int loops = 0;
3526
3527         spin_lock(&cur_trans->dirty_bgs_lock);
3528         if (list_empty(&cur_trans->dirty_bgs)) {
3529                 spin_unlock(&cur_trans->dirty_bgs_lock);
3530                 return 0;
3531         }
3532         list_splice_init(&cur_trans->dirty_bgs, &dirty);
3533         spin_unlock(&cur_trans->dirty_bgs_lock);
3534
3535 again:
3536         /*
3537          * make sure all the block groups on our dirty list actually
3538          * exist
3539          */
3540         btrfs_create_pending_block_groups(trans, root);
3541
3542         if (!path) {
3543                 path = btrfs_alloc_path();
3544                 if (!path)
3545                         return -ENOMEM;
3546         }
3547
3548         /*
3549          * cache_write_mutex is here only to save us from balance or automatic
3550          * removal of empty block groups deleting this block group while we are
3551          * writing out the cache
3552          */
3553         mutex_lock(&trans->transaction->cache_write_mutex);
3554         while (!list_empty(&dirty)) {
3555                 cache = list_first_entry(&dirty,
3556                                          struct btrfs_block_group_cache,
3557                                          dirty_list);
3558                 /*
3559                  * this can happen if something re-dirties a block
3560                  * group that is already under IO.  Just wait for it to
3561                  * finish and then do it all again
3562                  */
3563                 if (!list_empty(&cache->io_list)) {
3564                         list_del_init(&cache->io_list);
3565                         btrfs_wait_cache_io(root, trans, cache,
3566                                             &cache->io_ctl, path,
3567                                             cache->key.objectid);
3568                         btrfs_put_block_group(cache);
3569                 }
3570
3571
3572                 /*
3573                  * btrfs_wait_cache_io uses the cache->dirty_list to decide
3574                  * if it should update the cache_state.  Don't delete
3575                  * until after we wait.
3576                  *
3577                  * Since we're not running in the commit critical section
3578                  * we need the dirty_bgs_lock to protect from update_block_group
3579                  */
3580                 spin_lock(&cur_trans->dirty_bgs_lock);
3581                 list_del_init(&cache->dirty_list);
3582                 spin_unlock(&cur_trans->dirty_bgs_lock);
3583
3584                 should_put = 1;
3585
3586                 cache_save_setup(cache, trans, path);
3587
3588                 if (cache->disk_cache_state == BTRFS_DC_SETUP) {
3589                         cache->io_ctl.inode = NULL;
3590                         ret = btrfs_write_out_cache(root, trans, cache, path);
3591                         if (ret == 0 && cache->io_ctl.inode) {
3592                                 num_started++;
3593                                 should_put = 0;
3594
3595                                 /*
3596                                  * the cache_write_mutex is protecting
3597                                  * the io_list
3598                                  */
3599                                 list_add_tail(&cache->io_list, io);
3600                         } else {
3601                                 /*
3602                                  * if we failed to write the cache, the
3603                                  * generation will be bad and life goes on
3604                                  */
3605                                 ret = 0;
3606                         }
3607                 }
3608                 if (!ret) {
3609                         ret = write_one_cache_group(trans, root, path, cache);
3610                         /*
3611                          * Our block group might still be attached to the list
3612                          * of new block groups in the transaction handle of some
3613                          * other task (struct btrfs_trans_handle->new_bgs). This
3614                          * means its block group item isn't yet in the extent
3615                          * tree. If this happens ignore the error, as we will
3616                          * try again later in the critical section of the
3617                          * transaction commit.
3618                          */
3619                         if (ret == -ENOENT) {
3620                                 ret = 0;
3621                                 spin_lock(&cur_trans->dirty_bgs_lock);
3622                                 if (list_empty(&cache->dirty_list)) {
3623                                         list_add_tail(&cache->dirty_list,
3624                                                       &cur_trans->dirty_bgs);
3625                                         btrfs_get_block_group(cache);
3626                                 }
3627                                 spin_unlock(&cur_trans->dirty_bgs_lock);
3628                         } else if (ret) {
3629                                 btrfs_abort_transaction(trans, root, ret);
3630                         }
3631                 }
3632
3633                 /* if it's not on the io list, we need to put the block group */
3634                 if (should_put)
3635                         btrfs_put_block_group(cache);
3636
3637                 if (ret)
3638                         break;
3639
3640                 /*
3641                  * Avoid blocking other tasks for too long. It might even save
3642                  * us from writing caches for block groups that are going to be
3643                  * removed.
3644                  */
3645                 mutex_unlock(&trans->transaction->cache_write_mutex);
3646                 mutex_lock(&trans->transaction->cache_write_mutex);
3647         }
3648         mutex_unlock(&trans->transaction->cache_write_mutex);
3649
3650         /*
3651          * go through delayed refs for all the stuff we've just kicked off
3652          * and then loop back (just once)
3653          */
3654         ret = btrfs_run_delayed_refs(trans, root, 0);
3655         if (!ret && loops == 0) {
3656                 loops++;
3657                 spin_lock(&cur_trans->dirty_bgs_lock);
3658                 list_splice_init(&cur_trans->dirty_bgs, &dirty);
3659                 /*
3660                  * dirty_bgs_lock protects us from concurrent block group
3661                  * deletes too (not just cache_write_mutex).
3662                  */
3663                 if (!list_empty(&dirty)) {
3664                         spin_unlock(&cur_trans->dirty_bgs_lock);
3665                         goto again;
3666                 }
3667                 spin_unlock(&cur_trans->dirty_bgs_lock);
3668         }
3669
3670         btrfs_free_path(path);
3671         return ret;
3672 }
3673
3674 int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
3675                                    struct btrfs_root *root)
3676 {
3677         struct btrfs_block_group_cache *cache;
3678         struct btrfs_transaction *cur_trans = trans->transaction;
3679         int ret = 0;
3680         int should_put;
3681         struct btrfs_path *path;
3682         struct list_head *io = &cur_trans->io_bgs;
3683         int num_started = 0;
3684
3685         path = btrfs_alloc_path();
3686         if (!path)
3687                 return -ENOMEM;
3688
3689         /*
3690          * We don't need the lock here since we are protected by the transaction
3691          * commit.  We want to do the cache_save_setup first and then run the
3692          * delayed refs to make sure we have the best chance at doing this all
3693          * in one shot.
3694          */
3695         while (!list_empty(&cur_trans->dirty_bgs)) {
3696                 cache = list_first_entry(&cur_trans->dirty_bgs,
3697                                          struct btrfs_block_group_cache,
3698                                          dirty_list);
3699
3700                 /*
3701                  * this can happen if cache_save_setup re-dirties a block
3702                  * group that is already under IO.  Just wait for it to
3703                  * finish and then do it all again
3704                  */
3705                 if (!list_empty(&cache->io_list)) {
3706                         list_del_init(&cache->io_list);
3707                         btrfs_wait_cache_io(root, trans, cache,
3708                                             &cache->io_ctl, path,
3709                                             cache->key.objectid);
3710                         btrfs_put_block_group(cache);
3711                 }
3712
3713                 /*
3714                  * don't remove from the dirty list until after we've waited
3715                  * on any pending IO
3716                  */
3717                 list_del_init(&cache->dirty_list);
3718                 should_put = 1;
3719
3720                 cache_save_setup(cache, trans, path);
3721
3722                 if (!ret)
3723                         ret = btrfs_run_delayed_refs(trans, root, (unsigned long) -1);
3724
3725                 if (!ret && cache->disk_cache_state == BTRFS_DC_SETUP) {
3726                         cache->io_ctl.inode = NULL;
3727                         ret = btrfs_write_out_cache(root, trans, cache, path);
3728                         if (ret == 0 && cache->io_ctl.inode) {
3729                                 num_started++;
3730                                 should_put = 0;
3731                                 list_add_tail(&cache->io_list, io);
3732                         } else {
3733                                 /*
3734                                  * if we failed to write the cache, the
3735                                  * generation will be bad and life goes on
3736                                  */
3737                                 ret = 0;
3738                         }
3739                 }
3740                 if (!ret) {
3741                         ret = write_one_cache_group(trans, root, path, cache);
3742                         if (ret)
3743                                 btrfs_abort_transaction(trans, root, ret);
3744                 }
3745
3746                 /* if it's not on the io list, we need to put the block group */
3747                 if (should_put)
3748                         btrfs_put_block_group(cache);
3749         }
3750
3751         while (!list_empty(io)) {
3752                 cache = list_first_entry(io, struct btrfs_block_group_cache,
3753                                          io_list);
3754                 list_del_init(&cache->io_list);
3755                 btrfs_wait_cache_io(root, trans, cache,
3756                                     &cache->io_ctl, path, cache->key.objectid);
3757                 btrfs_put_block_group(cache);
3758         }
3759
3760         btrfs_free_path(path);
3761         return ret;
3762 }
3763
3764 int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr)
3765 {
3766         struct btrfs_block_group_cache *block_group;
3767         int readonly = 0;
3768
3769         block_group = btrfs_lookup_block_group(root->fs_info, bytenr);
3770         if (!block_group || block_group->ro)
3771                 readonly = 1;
3772         if (block_group)
3773                 btrfs_put_block_group(block_group);
3774         return readonly;
3775 }
3776
3777 static const char *alloc_name(u64 flags)
3778 {
3779         switch (flags) {
3780         case BTRFS_BLOCK_GROUP_METADATA|BTRFS_BLOCK_GROUP_DATA:
3781                 return "mixed";
3782         case BTRFS_BLOCK_GROUP_METADATA:
3783                 return "metadata";
3784         case BTRFS_BLOCK_GROUP_DATA:
3785                 return "data";
3786         case BTRFS_BLOCK_GROUP_SYSTEM:
3787                 return "system";
3788         default:
3789                 WARN_ON(1);
3790                 return "invalid-combination";
3791         }
3792 }
3793
3794 static int update_space_info(struct btrfs_fs_info *info, u64 flags,
3795                              u64 total_bytes, u64 bytes_used,
3796                              struct btrfs_space_info **space_info)
3797 {
3798         struct btrfs_space_info *found;
3799         int i;
3800         int factor;
3801         int ret;
3802
3803         if (flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 |
3804                      BTRFS_BLOCK_GROUP_RAID10))
3805                 factor = 2;
3806         else
3807                 factor = 1;
3808
3809         found = __find_space_info(info, flags);
3810         if (found) {
3811                 spin_lock(&found->lock);
3812                 found->total_bytes += total_bytes;
3813                 found->disk_total += total_bytes * factor;
3814                 found->bytes_used += bytes_used;
3815                 found->disk_used += bytes_used * factor;
3816                 if (total_bytes > 0)
3817                         found->full = 0;
3818                 spin_unlock(&found->lock);
3819                 *space_info = found;
3820                 return 0;
3821         }
3822         found = kzalloc(sizeof(*found), GFP_NOFS);
3823         if (!found)
3824                 return -ENOMEM;
3825
3826         ret = percpu_counter_init(&found->total_bytes_pinned, 0, GFP_KERNEL);
3827         if (ret) {
3828                 kfree(found);
3829                 return ret;
3830         }
3831
3832         for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
3833                 INIT_LIST_HEAD(&found->block_groups[i]);
3834         init_rwsem(&found->groups_sem);
3835         spin_lock_init(&found->lock);
3836         found->flags = flags & BTRFS_BLOCK_GROUP_TYPE_MASK;
3837         found->total_bytes = total_bytes;
3838         found->disk_total = total_bytes * factor;
3839         found->bytes_used = bytes_used;
3840         found->disk_used = bytes_used * factor;
3841         found->bytes_pinned = 0;
3842         found->bytes_reserved = 0;
3843         found->bytes_readonly = 0;
3844         found->bytes_may_use = 0;
3845         found->full = 0;
3846         found->max_extent_size = 0;
3847         found->force_alloc = CHUNK_ALLOC_NO_FORCE;
3848         found->chunk_alloc = 0;
3849         found->flush = 0;
3850         init_waitqueue_head(&found->wait);
3851         INIT_LIST_HEAD(&found->ro_bgs);
3852
3853         ret = kobject_init_and_add(&found->kobj, &space_info_ktype,
3854                                     info->space_info_kobj, "%s",
3855                                     alloc_name(found->flags));
3856         if (ret) {
3857                 percpu_counter_destroy(&found->total_bytes_pinned);
3858                 kfree(found);
3859                 return ret;
3860         }
3861
3862         *space_info = found;
3863         list_add_rcu(&found->list, &info->space_info);
3864         if (flags & BTRFS_BLOCK_GROUP_DATA)
3865                 info->data_sinfo = found;
3866
3867         return ret;
3868 }
3869
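/*
 * The factor above models profiles that keep two copies: adding, say, a
 * 1GiB RAID1 block group bumps total_bytes by 1GiB of usable space but
 * disk_total by 2GiB of raw disk, while a single or RAID0 group uses
 * factor 1 and both counters move by the same amount.
 */
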
3870 static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
3871 {
3872         u64 extra_flags = chunk_to_extended(flags) &
3873                                 BTRFS_EXTENDED_PROFILE_MASK;
3874
3875         write_seqlock(&fs_info->profiles_lock);
3876         if (flags & BTRFS_BLOCK_GROUP_DATA)
3877                 fs_info->avail_data_alloc_bits |= extra_flags;
3878         if (flags & BTRFS_BLOCK_GROUP_METADATA)
3879                 fs_info->avail_metadata_alloc_bits |= extra_flags;
3880         if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
3881                 fs_info->avail_system_alloc_bits |= extra_flags;
3882         write_sequnlock(&fs_info->profiles_lock);
3883 }
3884
3885 /*
3886  * returns target flags in extended format or 0 if restripe for this
3887  * chunk_type is not in progress
3888  *
3889  * should be called with either volume_mutex or balance_lock held
3890  */
3891 static u64 get_restripe_target(struct btrfs_fs_info *fs_info, u64 flags)
3892 {
3893         struct btrfs_balance_control *bctl = fs_info->balance_ctl;
3894         u64 target = 0;
3895
3896         if (!bctl)
3897                 return 0;
3898
3899         if (flags & BTRFS_BLOCK_GROUP_DATA &&
3900             bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) {
3901                 target = BTRFS_BLOCK_GROUP_DATA | bctl->data.target;
3902         } else if (flags & BTRFS_BLOCK_GROUP_SYSTEM &&
3903                    bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
3904                 target = BTRFS_BLOCK_GROUP_SYSTEM | bctl->sys.target;
3905         } else if (flags & BTRFS_BLOCK_GROUP_METADATA &&
3906                    bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) {
3907                 target = BTRFS_BLOCK_GROUP_METADATA | bctl->meta.target;
3908         }
3909
3910         return target;
3911 }
3912
3913 /*
3914  * @flags: available profiles in extended format (see ctree.h)
3915  *
3916  * Returns reduced profile in chunk format.  If profile changing is in
3917  * progress (either running or paused) picks the target profile (if it's
3918  * already available), otherwise falls back to plain reducing.
3919  */
3920 static u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
3921 {
3922         u64 num_devices = root->fs_info->fs_devices->rw_devices;
3923         u64 target;
3924         u64 raid_type;
3925         u64 allowed = 0;
3926
3927         /*
3928          * see if restripe for this chunk_type is in progress; if so,
3929          * try to reduce to the target profile
3930          */
3931         spin_lock(&root->fs_info->balance_lock);
3932         target = get_restripe_target(root->fs_info, flags);
3933         if (target) {
3934                 /* pick target profile only if it's already available */
3935                 if ((flags & target) & BTRFS_EXTENDED_PROFILE_MASK) {
3936                         spin_unlock(&root->fs_info->balance_lock);
3937                         return extended_to_chunk(target);
3938                 }
3939         }
3940         spin_unlock(&root->fs_info->balance_lock);
3941
3942         /* First, mask out the RAID levels which aren't possible */
3943         for (raid_type = 0; raid_type < BTRFS_NR_RAID_TYPES; raid_type++) {
3944                 if (num_devices >= btrfs_raid_array[raid_type].devs_min)
3945                         allowed |= btrfs_raid_group[raid_type];
3946         }
3947         allowed &= flags;
3948
3949         if (allowed & BTRFS_BLOCK_GROUP_RAID6)
3950                 allowed = BTRFS_BLOCK_GROUP_RAID6;
3951         else if (allowed & BTRFS_BLOCK_GROUP_RAID5)
3952                 allowed = BTRFS_BLOCK_GROUP_RAID5;
3953         else if (allowed & BTRFS_BLOCK_GROUP_RAID10)
3954                 allowed = BTRFS_BLOCK_GROUP_RAID10;
3955         else if (allowed & BTRFS_BLOCK_GROUP_RAID1)
3956                 allowed = BTRFS_BLOCK_GROUP_RAID1;
3957         else if (allowed & BTRFS_BLOCK_GROUP_RAID0)
3958                 allowed = BTRFS_BLOCK_GROUP_RAID0;
3959
3960         flags &= ~BTRFS_BLOCK_GROUP_PROFILE_MASK;
3961
3962         return extended_to_chunk(flags | allowed);
3963 }
3964
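/*
 * Example of the reduction above (illustrative): if the extended flags offer
 * both RAID1 and RAID0 and at least two rw devices are present, the
 * preference ladder keeps RAID1 and the returned chunk-format flags contain
 * only that profile; with a single rw device the multi-device profiles never
 * survive the devs_min mask in the first loop.
 */
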
3965 static u64 get_alloc_profile(struct btrfs_root *root, u64 orig_flags)
3966 {
3967         unsigned seq;
3968         u64 flags;
3969
3970         do {
3971                 flags = orig_flags;
3972                 seq = read_seqbegin(&root->fs_info->profiles_lock);
3973
3974                 if (flags & BTRFS_BLOCK_GROUP_DATA)
3975                         flags |= root->fs_info->avail_data_alloc_bits;
3976                 else if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
3977                         flags |= root->fs_info->avail_system_alloc_bits;
3978                 else if (flags & BTRFS_BLOCK_GROUP_METADATA)
3979                         flags |= root->fs_info->avail_metadata_alloc_bits;
3980         } while (read_seqretry(&root->fs_info->profiles_lock, seq));
3981
3982         return btrfs_reduce_alloc_profile(root, flags);
3983 }
3984
3985 u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data)
3986 {
3987         u64 flags;
3988         u64 ret;
3989
3990         if (data)
3991                 flags = BTRFS_BLOCK_GROUP_DATA;
3992         else if (root == root->fs_info->chunk_root)
3993                 flags = BTRFS_BLOCK_GROUP_SYSTEM;
3994         else
3995                 flags = BTRFS_BLOCK_GROUP_METADATA;
3996
3997         ret = get_alloc_profile(root, flags);
3998         return ret;
3999 }
4000
4001 int btrfs_alloc_data_chunk_ondemand(struct inode *inode, u64 bytes)
4002 {
4003         struct btrfs_space_info *data_sinfo;
4004         struct btrfs_root *root = BTRFS_I(inode)->root;
4005         struct btrfs_fs_info *fs_info = root->fs_info;
4006         u64 used;
4007         int ret = 0;
4008         int need_commit = 2;
4009         int have_pinned_space;
4010
4011         /* make sure bytes are sectorsize aligned */
4012         bytes = ALIGN(bytes, root->sectorsize);
4013
4014         if (btrfs_is_free_space_inode(inode)) {
4015                 need_commit = 0;
4016                 ASSERT(current->journal_info);
4017         }
4018
4019         data_sinfo = fs_info->data_sinfo;
4020         if (!data_sinfo)
4021                 goto alloc;
4022
4023 again:
4024         /* make sure we have enough space to handle the data first */
4025         spin_lock(&data_sinfo->lock);
4026         used = data_sinfo->bytes_used + data_sinfo->bytes_reserved +
4027                 data_sinfo->bytes_pinned + data_sinfo->bytes_readonly +
4028                 data_sinfo->bytes_may_use;
4029
4030         if (used + bytes > data_sinfo->total_bytes) {
4031                 struct btrfs_trans_handle *trans;
4032
4033                 /*
4034                  * if we don't have enough free bytes in this space then we need
4035                  * to alloc a new chunk.
4036                  */
4037                 if (!data_sinfo->full) {
4038                         u64 alloc_target;
4039
4040                         data_sinfo->force_alloc = CHUNK_ALLOC_FORCE;
4041                         spin_unlock(&data_sinfo->lock);
4042 alloc:
4043                         alloc_target = btrfs_get_alloc_profile(root, 1);
4044                         /*
4045                          * It is ugly that we don't call a nolock join
4046                          * transaction for the free space inode case here.
4047                          * But it is safe because we only do the data space
4048                          * reservation for the free space cache in the
4049                          * transaction context; the common join transaction
4050                          * just increases the counter of the current
4051                          * transaction handle and doesn't try to acquire
4052                          * the trans_lock of the fs.
4053                          */
4054                         trans = btrfs_join_transaction(root);
4055                         if (IS_ERR(trans))
4056                                 return PTR_ERR(trans);
4057
4058                         ret = do_chunk_alloc(trans, root->fs_info->extent_root,
4059                                              alloc_target,
4060                                              CHUNK_ALLOC_NO_FORCE);
4061                         btrfs_end_transaction(trans, root);
4062                         if (ret < 0) {
4063                                 if (ret != -ENOSPC)
4064                                         return ret;
4065                                 else {
4066                                         have_pinned_space = 1;
4067                                         goto commit_trans;
4068                                 }
4069                         }
4070
4071                         if (!data_sinfo)
4072                                 data_sinfo = fs_info->data_sinfo;
4073
4074                         goto again;
4075                 }
4076
4077                 /*
4078                  * If we don't have enough pinned space to deal with this
4079                  * allocation, and no chunk was removed in the current
4080                  * transaction, don't bother committing the transaction.
4081                  */
4082                 have_pinned_space = percpu_counter_compare(
4083                         &data_sinfo->total_bytes_pinned,
4084                         used + bytes - data_sinfo->total_bytes);
4085                 spin_unlock(&data_sinfo->lock);
4086
4087                 /* commit the current transaction and try again */
4088 commit_trans:
4089                 if (need_commit &&
4090                     !atomic_read(&root->fs_info->open_ioctl_trans)) {
4091                         need_commit--;
4092
4093                         if (need_commit > 0) {
4094                                 btrfs_start_delalloc_roots(fs_info, 0, -1);
4095                                 btrfs_wait_ordered_roots(fs_info, -1);
4096                         }
4097
4098                         trans = btrfs_join_transaction(root);
4099                         if (IS_ERR(trans))
4100                                 return PTR_ERR(trans);
4101                         if (have_pinned_space >= 0 ||
4102                             test_bit(BTRFS_TRANS_HAVE_FREE_BGS,
4103                                      &trans->transaction->flags) ||
4104                             need_commit > 0) {
4105                                 ret = btrfs_commit_transaction(trans, root);
4106                                 if (ret)
4107                                         return ret;
4108                                 /*
4109                                  * The cleaner kthread might still be doing iput
4110                                  * operations. Wait for it to finish so that
4111                                  * more space is released.
4112                                  */
4113                                 mutex_lock(&root->fs_info->cleaner_delayed_iput_mutex);
4114                                 mutex_unlock(&root->fs_info->cleaner_delayed_iput_mutex);
4115                                 goto again;
4116                         } else {
4117                                 btrfs_end_transaction(trans, root);
4118                         }
4119                 }
4120
4121                 trace_btrfs_space_reservation(root->fs_info,
4122                                               "space_info:enospc",
4123                                               data_sinfo->flags, bytes, 1);
4124                 return -ENOSPC;
4125         }
4126         data_sinfo->bytes_may_use += bytes;
4127         trace_btrfs_space_reservation(root->fs_info, "space_info",
4128                                       data_sinfo->flags, bytes, 1);
4129         spin_unlock(&data_sinfo->lock);
4130
4131         return 0;
4132 }
4133
4134 /*
4135  * New check_data_free_space() with the ability to do precise data reservation.
4136  * It will replace the old btrfs_check_data_free_space(); for ease of patch
4137  * splitting, add the new function first and then replace the old one.
4138  */
4139 int btrfs_check_data_free_space(struct inode *inode, u64 start, u64 len)
4140 {
4141         struct btrfs_root *root = BTRFS_I(inode)->root;
4142         int ret;
4143
4144         /* align the range */
4145         len = round_up(start + len, root->sectorsize) -
4146               round_down(start, root->sectorsize);
4147         start = round_down(start, root->sectorsize);
4148
4149         ret = btrfs_alloc_data_chunk_ondemand(inode, len);
4150         if (ret < 0)
4151                 return ret;
4152
4153         /*
4154          * Use the new btrfs_qgroup_reserve_data() to reserve precise data space.
4155          *
4156          * TODO: Find a good way to avoid reserving data space for NOCOW
4157          * ranges without hurting performance when quotas are disabled.
4158          */
4159         ret = btrfs_qgroup_reserve_data(inode, start, len);
4160         return ret;
4161 }
4162
4163 /*
4164  * Called if we need to clear a data reservation for this inode
4165  * Normally in an error case.
4166  *
4167  * This one will *NOT* use the accurate qgroup reserved space API; it is only
4168  * for cases where we can't sleep and are sure it won't affect the qgroup
4169  * reserved space, like clear_bit_hook().
4170  */
4171 void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start,
4172                                             u64 len)
4173 {
4174         struct btrfs_root *root = BTRFS_I(inode)->root;
4175         struct btrfs_space_info *data_sinfo;
4176
4177         /* Make sure the range is aligned to sectorsize */
4178         len = round_up(start + len, root->sectorsize) -
4179               round_down(start, root->sectorsize);
4180         start = round_down(start, root->sectorsize);
4181
4182         data_sinfo = root->fs_info->data_sinfo;
4183         spin_lock(&data_sinfo->lock);
4184         if (WARN_ON(data_sinfo->bytes_may_use < len))
4185                 data_sinfo->bytes_may_use = 0;
4186         else
4187                 data_sinfo->bytes_may_use -= len;
4188         trace_btrfs_space_reservation(root->fs_info, "space_info",
4189                                       data_sinfo->flags, len, 0);
4190         spin_unlock(&data_sinfo->lock);
4191 }
4192
4193 /*
4194  * Called if we need to clear a data reservation for this inode
4195  * Normally in an error case.
4196  *
4197  * This one will handle the per-inode data rsv map for the accurate reserved
4198  * space framework.
4199  */
4200 void btrfs_free_reserved_data_space(struct inode *inode, u64 start, u64 len)
4201 {
4202         btrfs_free_reserved_data_space_noquota(inode, start, len);
4203         btrfs_qgroup_free_data(inode, start, len);
4204 }
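/*
 * Typical usage of the pair above (a sketch, not a verbatim caller from this
 * file): a buffered write first reserves data space for the range it is about
 * to dirty and releases the reservation again if the write fails:
 *
 *	ret = btrfs_check_data_free_space(inode, pos, count);
 *	if (ret < 0)
 *		return ret;
 *	...
 *	if (write_failed)
 *		btrfs_free_reserved_data_space(inode, pos, count);
 */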
4205
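/*
 * Mark every metadata space_info with CHUNK_ALLOC_FORCE so that the next
 * call to do_chunk_alloc() for metadata is guaranteed to allocate a chunk.
 */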
4206 static void force_metadata_allocation(struct btrfs_fs_info *info)
4207 {
4208         struct list_head *head = &info->space_info;
4209         struct btrfs_space_info *found;
4210
4211         rcu_read_lock();
4212         list_for_each_entry_rcu(found, head, list) {
4213                 if (found->flags & BTRFS_BLOCK_GROUP_METADATA)
4214                         found->force_alloc = CHUNK_ALLOC_FORCE;
4215         }
4216         rcu_read_unlock();
4217 }
4218
4219 static inline u64 calc_global_rsv_need_space(struct btrfs_block_rsv *global)
4220 {
4221         return (global->size << 1);
4222 }
4223
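/*
 * Decide whether a new chunk should be allocated for @sinfo.  CHUNK_ALLOC_FORCE
 * always allocates; CHUNK_ALLOC_LIMITED keeps roughly 1% of the filesystem
 * (at least 64M) unallocated; otherwise only allocate once about 80% of the
 * existing space (plus the global reserve, for metadata) is in use.
 */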
4224 static int should_alloc_chunk(struct btrfs_root *root,
4225                               struct btrfs_space_info *sinfo, int force)
4226 {
4227         struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
4228         u64 num_bytes = sinfo->total_bytes - sinfo->bytes_readonly;
4229         u64 num_allocated = sinfo->bytes_used + sinfo->bytes_reserved;
4230         u64 thresh;
4231
4232         if (force == CHUNK_ALLOC_FORCE)
4233                 return 1;
4234
4235         /*
4236          * We need to take into account the global rsv because for all intents
4237          * and purposes it's used space.  Don't worry about locking the
4238          * global_rsv, it doesn't change except when the transaction commits.
4239          */
4240         if (sinfo->flags & BTRFS_BLOCK_GROUP_METADATA)
4241                 num_allocated += calc_global_rsv_need_space(global_rsv);
4242
4243         /*
4244          * in limited mode, we want to have some free space up to
4245          * about 1% of the FS size.
4246          */
4247         if (force == CHUNK_ALLOC_LIMITED) {
4248                 thresh = btrfs_super_total_bytes(root->fs_info->super_copy);
4249                 thresh = max_t(u64, 64 * 1024 * 1024,
4250                                div_factor_fine(thresh, 1));
4251
4252                 if (num_bytes - num_allocated < thresh)
4253                         return 1;
4254         }
4255
4256         if (num_allocated + 2 * 1024 * 1024 < div_factor(num_bytes, 8))
4257                 return 0;
4258         return 1;
4259 }
4260
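/*
 * Return the number of devices whose device items will need an update when
 * allocating or removing a chunk of the given @type: all rw devices for the
 * striped profiles, two for RAID1 and one for DUP or single chunks.
 */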
4261 static u64 get_profile_num_devs(struct btrfs_root *root, u64 type)
4262 {
4263         u64 num_dev;
4264
4265         if (type & (BTRFS_BLOCK_GROUP_RAID10 |
4266                     BTRFS_BLOCK_GROUP_RAID0 |
4267                     BTRFS_BLOCK_GROUP_RAID5 |
4268                     BTRFS_BLOCK_GROUP_RAID6))
4269                 num_dev = root->fs_info->fs_devices->rw_devices;
4270         else if (type & BTRFS_BLOCK_GROUP_RAID1)
4271                 num_dev = 2;
4272         else
4273                 num_dev = 1;    /* DUP or single */
4274
4275         return num_dev;
4276 }
4277
4278 /*
4279  * Reserve space in the system space info necessary for allocating a chunk or
4280  * for removing one: enough to update the device items and to add or remove
4281  * the chunk item.
4282  */
4283 void check_system_chunk(struct btrfs_trans_handle *trans,
4284                         struct btrfs_root *root,
4285                         u64 type)
4286 {
4287         struct btrfs_space_info *info;
4288         u64 left;
4289         u64 thresh;
4290         int ret = 0;
4291         u64 num_devs;
4292
4293         /*
4294          * Needed because we can end up allocating a system chunk and we need
4295          * an atomic, race-free space reservation in the chunk block reserve.
4296          */
4297         ASSERT(mutex_is_locked(&root->fs_info->chunk_mutex));
4298
4299         info = __find_space_info(root->fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
4300         spin_lock(&info->lock);
4301         left = info->total_bytes - info->bytes_used - info->bytes_pinned -
4302                 info->bytes_reserved - info->bytes_readonly -
4303                 info->bytes_may_use;
4304         spin_unlock(&info->lock);
4305
4306         num_devs = get_profile_num_devs(root, type);
4307
4308         /* num_devs device items to update and 1 chunk item to add or remove */
4309         thresh = btrfs_calc_trunc_metadata_size(root, num_devs) +
4310                 btrfs_calc_trans_metadata_size(root, 1);
4311
4312         if (left < thresh && btrfs_test_opt(root, ENOSPC_DEBUG)) {
4313                 btrfs_info(root->fs_info, "left=%llu, need=%llu, flags=%llu",
4314                         left, thresh, type);
4315                 dump_space_info(info, 0, 0);
4316         }
4317
4318         if (left < thresh) {
4319                 u64 flags;
4320
4321                 flags = btrfs_get_alloc_profile(root->fs_info->chunk_root, 0);
4322                 /*
4323                  * Ignore failure to create system chunk. We might end up not
4324                  * needing it, as we might not need to COW all nodes/leafs from
4325                  * the paths we visit in the chunk tree (they were already COWed
4326                  * or created in the current transaction for example).
4327                  */
4328                 ret = btrfs_alloc_chunk(trans, root, flags);
4329         }
4330
4331         if (!ret) {
4332                 ret = btrfs_block_rsv_add(root->fs_info->chunk_root,
4333                                           &root->fs_info->chunk_block_rsv,
4334                                           thresh, BTRFS_RESERVE_NO_FLUSH);
4335                 if (!ret)
4336                         trans->chunk_bytes_reserved += thresh;
4337         }
4338 }
4339
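/*
 * Allocate a new chunk of the given @flags type if should_alloc_chunk() says
 * we need one.  Only one allocation per space_info runs at a time; a second
 * caller waits on chunk_mutex and then rechecks.  Returns 1 if a chunk was
 * allocated, 0 if none was needed, and a negative errno on failure (the
 * space_info is marked full on -ENOSPC).
 */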
4340 static int do_chunk_alloc(struct btrfs_trans_handle *trans,
4341                           struct btrfs_root *extent_root, u64 flags, int force)
4342 {
4343         struct btrfs_space_info *space_info;
4344         struct btrfs_fs_info *fs_info = extent_root->fs_info;
4345         int wait_for_alloc = 0;
4346         int ret = 0;
4347
4348         /* Don't re-enter if we're already allocating a chunk */
4349         if (trans->allocating_chunk)
4350                 return -ENOSPC;
4351
4352         space_info = __find_space_info(extent_root->fs_info, flags);
4353         if (!space_info) {
4354                 ret = update_space_info(extent_root->fs_info, flags,
4355                                         0, 0, &space_info);
4356                 BUG_ON(ret); /* -ENOMEM */
4357         }
4358         BUG_ON(!space_info); /* Logic error */
4359
4360 again:
4361         spin_lock(&space_info->lock);
4362         if (force < space_info->force_alloc)
4363                 force = space_info->force_alloc;
4364         if (space_info->full) {
4365                 if (should_alloc_chunk(extent_root, space_info, force))
4366                         ret = -ENOSPC;
4367                 else
4368                         ret = 0;
4369                 spin_unlock(&space_info->lock);
4370                 return ret;
4371         }
4372
4373         if (!should_alloc_chunk(extent_root, space_info, force)) {
4374                 spin_unlock(&space_info->lock);
4375                 return 0;
4376         } else if (space_info->chunk_alloc) {
4377                 wait_for_alloc = 1;
4378         } else {
4379                 space_info->chunk_alloc = 1;
4380         }
4381
4382         spin_unlock(&space_info->lock);
4383
4384         mutex_lock(&fs_info->chunk_mutex);
4385
4386         /*
4387          * The chunk_mutex is held throughout the entirety of a chunk
4388          * allocation, so once we've acquired the chunk_mutex we know that the
4389          * other guy is done and we need to recheck and see if we should
4390          * allocate.
4391          */
4392         if (wait_for_alloc) {
4393                 mutex_unlock(&fs_info->chunk_mutex);
4394                 wait_for_alloc = 0;
4395                 cond_resched();
4396                 goto again;
4397         }
4398
4399         trans->allocating_chunk = true;
4400
4401         /*
4402          * If we have mixed data/metadata chunks we want to make sure we keep
4403          * allocating mixed chunks instead of individual chunks.
4404          */
4405         if (btrfs_mixed_space_info(space_info))
4406                 flags |= (BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA);
4407
4408         /*
4409          * if we're doing a data chunk, go ahead and make sure that
4410          * we keep a reasonable number of metadata chunks allocated in the
4411          * FS as well.
4412          */
4413         if (flags & BTRFS_BLOCK_GROUP_DATA && fs_info->metadata_ratio) {
4414                 fs_info->data_chunk_allocations++;
4415                 if (!(fs_info->data_chunk_allocations %
4416                       fs_info->metadata_ratio))
4417                         force_metadata_allocation(fs_info);
4418         }
4419
4420         /*
4421          * Check if we have enough space in SYSTEM chunk because we may need
4422          * to update devices.
4423          */
4424         check_system_chunk(trans, extent_root, flags);
4425
4426         ret = btrfs_alloc_chunk(trans, extent_root, flags);
4427         trans->allocating_chunk = false;
4428
4429         spin_lock(&space_info->lock);
4430         if (ret < 0 && ret != -ENOSPC)
4431                 goto out;
4432         if (ret)
4433                 space_info->full = 1;
4434         else
4435                 ret = 1;
4436
4437         space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
4438 out:
4439         space_info->chunk_alloc = 0;
4440         spin_unlock(&space_info->lock);
4441         mutex_unlock(&fs_info->chunk_mutex);
4442         /*
4443          * When we allocate a new chunk we reserve space in the chunk block
4444          * reserve to make sure we can COW nodes/leafs in the chunk tree or
4445          * add new nodes/leafs to it if we end up needing to do it when
4446          * inserting the chunk item and updating device items as part of the
4447          * second phase of chunk allocation, performed by
4448          * btrfs_finish_chunk_alloc(). So make sure we don't accumulate a
4449          * large number of new block groups to create in our transaction
4450          * handle's new_bgs list to avoid exhausting the chunk block reserve
4451          * in extreme cases - like having a single transaction create many new
4452          * block groups when starting to write out the free space caches of all
4453          * the block groups that were made dirty during the lifetime of the
4454          * transaction.
4455          */
4456         if (trans->can_flush_pending_bgs &&
4457             trans->chunk_bytes_reserved >= (2 * 1024 * 1024ull)) {
4458                 btrfs_create_pending_block_groups(trans, trans->root);
4459                 btrfs_trans_release_chunk_metadata(trans);
4460         }
4461         return ret;
4462 }
4463
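/*
 * Return 1 if it is safe to hand out @bytes of reservation beyond what is
 * physically allocated in @space_info.  The unallocated device space is
 * halved for mirrored profiles and then further limited to 1/8 (when a full
 * flush is allowed) or 1/2 (when it is not) before being compared against
 * the outstanding reservations.
 */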
4464 static int can_overcommit(struct btrfs_root *root,
4465                           struct btrfs_space_info *space_info, u64 bytes,
4466                           enum btrfs_reserve_flush_enum flush)
4467 {
4468         struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
4469         u64 profile = btrfs_get_alloc_profile(root, 0);
4470         u64 space_size;
4471         u64 avail;
4472         u64 used;
4473
4474         used = space_info->bytes_used + space_info->bytes_reserved +
4475                 space_info->bytes_pinned + space_info->bytes_readonly;
4476
4477         /*
4478          * We only want to allow over committing if we have lots of actual space
4479          * free, but if we don't have enough space to handle the global reserve
4480          * space then we could end up having a real enospc problem when trying
4481          * to allocate a chunk or some other such important allocation.
4482          */
4483         spin_lock(&global_rsv->lock);
4484         space_size = calc_global_rsv_need_space(global_rsv);
4485         spin_unlock(&global_rsv->lock);
4486         if (used + space_size >= space_info->total_bytes)
4487                 return 0;
4488
4489         used += space_info->bytes_may_use;
4490
4491         spin_lock(&root->fs_info->free_chunk_lock);
4492         avail = root->fs_info->free_chunk_space;
4493         spin_unlock(&root->fs_info->free_chunk_lock);
4494
4495         /*
4496          * If we have dup, raid1 or raid10 then only half of the free
4497          * space is actually usable.  For raid56, the space info used
4498          * doesn't include the parity drive, so we don't have to
4499          * change the math.
4500          */
4501         if (profile & (BTRFS_BLOCK_GROUP_DUP |
4502                        BTRFS_BLOCK_GROUP_RAID1 |
4503                        BTRFS_BLOCK_GROUP_RAID10))
4504                 avail >>= 1;
4505
4506         /*
4507          * If we aren't allowed to flush everything, let us overcommit up to
4508          * half of the space. If we can flush, don't let us overcommit
4509          * too much, let it overcommit up to 1/8 of the space.
4510          */
4511         if (flush == BTRFS_RESERVE_FLUSH_ALL)
4512                 avail >>= 3;
4513         else
4514                 avail >>= 1;
4515
4516         if (used + bytes < space_info->total_bytes + avail)
4517                 return 1;
4518         return 0;
4519 }
4520
4521 static void btrfs_writeback_inodes_sb_nr(struct btrfs_root *root,
4522                                          unsigned long nr_pages, int nr_items)
4523 {
4524         struct super_block *sb = root->fs_info->sb;
4525
4526         if (down_read_trylock(&sb->s_umount)) {
4527                 writeback_inodes_sb_nr(sb, nr_pages, WB_REASON_FS_FREE_SPACE);
4528                 up_read(&sb->s_umount);
4529         } else {
4530                 /*
4531                  * We needn't worry about the filesystem going from r/w to r/o
4532                  * even though we don't acquire the ->s_umount mutex, because the
4533                  * filesystem should guarantee that the delalloc inode list is
4534                  * empty once the filesystem is read-only (all dirty pages are
4535                  * written to the disk).
4536                  */
4537                 btrfs_start_delalloc_roots(root->fs_info, 0, nr_items);
4538                 if (!current->journal_info)
4539                         btrfs_wait_ordered_roots(root->fs_info, nr_items);
4540         }
4541 }
4542
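/*
 * Convert @to_reclaim bytes into a number of metadata items to flush, using
 * the per-item transaction metadata size as the unit (at least one item).
 */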
4543 static inline int calc_reclaim_items_nr(struct btrfs_root *root, u64 to_reclaim)
4544 {
4545         u64 bytes;
4546         int nr;
4547
4548         bytes = btrfs_calc_trans_metadata_size(root, 1);
4549         nr = (int)div64_u64(to_reclaim, bytes);
4550         if (!nr)
4551                 nr = 1;
4552         return nr;
4553 }
4554
4555 #define EXTENT_SIZE_PER_ITEM    (256 * 1024)
4556
4557 /*
4558  * shrink metadata reservation for delalloc
4559  */
4560 static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
4561                             bool wait_ordered)
4562 {
4563         struct btrfs_block_rsv *block_rsv;
4564         struct btrfs_space_info *space_info;
4565         struct btrfs_trans_handle *trans;
4566         u64 delalloc_bytes;
4567         u64 max_reclaim;
4568         long time_left;
4569         unsigned long nr_pages;
4570         int loops;
4571         int items;
4572         enum btrfs_reserve_flush_enum flush;
4573
4574         /* Calc the number of items we need to flush for this space reservation */
4575         items = calc_reclaim_items_nr(root, to_reclaim);
4576         to_reclaim = items * EXTENT_SIZE_PER_ITEM;
4577
4578         trans = (struct btrfs_trans_handle *)current->journal_info;
4579         block_rsv = &root->fs_info->delalloc_block_rsv;
4580         space_info = block_rsv->space_info;
4581
4582         delalloc_bytes = percpu_counter_sum_positive(
4583                                                 &root->fs_info->delalloc_bytes);
4584         if (delalloc_bytes == 0) {
4585                 if (trans)
4586                         return;
4587                 if (wait_ordered)
4588                         btrfs_wait_ordered_roots(root->fs_info, items);
4589                 return;
4590         }
4591
4592         loops = 0;
4593         while (delalloc_bytes && loops < 3) {
4594                 max_reclaim = min(delalloc_bytes, to_reclaim);
4595                 nr_pages = max_reclaim >> PAGE_CACHE_SHIFT;
4596                 btrfs_writeback_inodes_sb_nr(root, nr_pages, items);
4597                 /*
4598                  * We need to wait for the async pages to actually start before
4599                  * we do anything.
4600                  */
4601                 max_reclaim = atomic_read(&root->fs_info->async_delalloc_pages);
4602                 if (!max_reclaim)
4603                         goto skip_async;
4604
4605                 if (max_reclaim <= nr_pages)
4606                         max_reclaim = 0;
4607                 else
4608                         max_reclaim -= nr_pages;
4609
4610                 wait_event(root->fs_info->async_submit_wait,
4611                            atomic_read(&root->fs_info->async_delalloc_pages) <=
4612                            (int)max_reclaim);
4613 skip_async:
4614                 if (!trans)
4615                         flush = BTRFS_RESERVE_FLUSH_ALL;
4616                 else
4617                         flush = BTRFS_RESERVE_NO_FLUSH;
4618                 spin_lock(&space_info->lock);
4619                 if (can_overcommit(root, space_info, orig, flush)) {
4620                         spin_unlock(&space_info->lock);
4621                         break;
4622                 }
4623                 spin_unlock(&space_info->lock);
4624
4625                 loops++;
4626                 if (wait_ordered && !trans) {
4627                         btrfs_wait_ordered_roots(root->fs_info, items);
4628                 } else {
4629                         time_left = schedule_timeout_killable(1);
4630                         if (time_left)
4631                                 break;
4632                 }
4633                 delalloc_bytes = percpu_counter_sum_positive(
4634                                                 &root->fs_info->delalloc_bytes);
4635         }
4636 }
4637
4638 /**
4639  * may_commit_transaction - possibly commit the transaction if it's ok to
4640  * @root - the root we're allocating for
4641  * @bytes - the number of bytes we want to reserve
4642  * @force - force the commit
4643  *
4644  * This will check to make sure that committing the transaction will actually
4645  * get us somewhere and then commit the transaction if it does.  Otherwise it
4646  * will return -ENOSPC.
4647  */
4648 static int may_commit_transaction(struct btrfs_root *root,
4649                                   struct btrfs_space_info *space_info,
4650                                   u64 bytes, int force)
4651 {
4652         struct btrfs_block_rsv *delayed_rsv = &root->fs_info->delayed_block_rsv;
4653         struct btrfs_trans_handle *trans;
4654
4655         trans = (struct btrfs_trans_handle *)current->journal_info;
4656         if (trans)
4657                 return -EAGAIN;
4658
4659         if (force)
4660                 goto commit;
4661
4662         /* See if there is enough pinned space to make this reservation */
4663         if (percpu_counter_compare(&space_info->total_bytes_pinned,
4664                                    bytes) >= 0)
4665                 goto commit;
4666
4667         /*
4668          * See if there is some space in the delayed insertion reservation for
4669          * this reservation.
4670          */
4671         if (space_info != delayed_rsv->space_info)
4672                 return -ENOSPC;
4673
4674         spin_lock(&delayed_rsv->lock);
4675         if (percpu_counter_compare(&space_info->total_bytes_pinned,
4676                                    bytes - delayed_rsv->size) >= 0) {
4677                 spin_unlock(&delayed_rsv->lock);
4678                 return -ENOSPC;
4679         }
4680         spin_unlock(&delayed_rsv->lock);
4681
4682 commit:
4683         trans = btrfs_join_transaction(root);
4684         if (IS_ERR(trans))
4685                 return -ENOSPC;
4686
4687         return btrfs_commit_transaction(trans, root);
4688 }
4689
4690 enum flush_state {
4691         FLUSH_DELAYED_ITEMS_NR  =       1,
4692         FLUSH_DELAYED_ITEMS     =       2,
4693         FLUSH_DELALLOC          =       3,
4694         FLUSH_DELALLOC_WAIT     =       4,
4695         ALLOC_CHUNK             =       5,
4696         COMMIT_TRANS            =       6,
4697 };
4698
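/*
 * Run a single reclaim step for @space_info according to @state: run delayed
 * items, write back and wait for delalloc, allocate a new chunk, or commit
 * the transaction as a last resort.  Returns 0 on success or a negative
 * errno.
 */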
4699 static int flush_space(struct btrfs_root *root,
4700                        struct btrfs_space_info *space_info, u64 num_bytes,
4701                        u64 orig_bytes, int state)
4702 {
4703         struct btrfs_trans_handle *trans;
4704         int nr;
4705         int ret = 0;
4706
4707         switch (state) {
4708         case FLUSH_DELAYED_ITEMS_NR:
4709         case FLUSH_DELAYED_ITEMS:
4710                 if (state == FLUSH_DELAYED_ITEMS_NR)
4711                         nr = calc_reclaim_items_nr(root, num_bytes) * 2;
4712                 else
4713                         nr = -1;
4714
4715                 trans = btrfs_join_transaction(root);
4716                 if (IS_ERR(trans)) {
4717                         ret = PTR_ERR(trans);
4718                         break;
4719                 }
4720                 ret = btrfs_run_delayed_items_nr(trans, root, nr);
4721                 btrfs_end_transaction(trans, root);
4722                 break;
4723         case FLUSH_DELALLOC:
4724         case FLUSH_DELALLOC_WAIT:
4725                 shrink_delalloc(root, num_bytes * 2, orig_bytes,
4726                                 state == FLUSH_DELALLOC_WAIT);
4727                 break;
4728         case ALLOC_CHUNK:
4729                 trans = btrfs_join_transaction(root);
4730                 if (IS_ERR(trans)) {
4731                         ret = PTR_ERR(trans);
4732                         break;
4733                 }
4734                 ret = do_chunk_alloc(trans, root->fs_info->extent_root,
4735                                      btrfs_get_alloc_profile(root, 0),
4736                                      CHUNK_ALLOC_NO_FORCE);
4737                 btrfs_end_transaction(trans, root);
4738                 if (ret == -ENOSPC)
4739                         ret = 0;
4740                 break;
4741         case COMMIT_TRANS:
4742                 ret = may_commit_transaction(root, space_info, orig_bytes, 0);
4743                 break;
4744         default:
4745                 ret = -ENOSPC;
4746                 break;
4747         }
4748
4749         return ret;
4750 }
4751
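/*
 * Estimate how many bytes of metadata need to be reclaimed before new
 * reservations can succeed again.  Returns 0 when we can still overcommit;
 * otherwise the shortfall against roughly 90-95% of the space_info, capped
 * by the amount currently reserved or may_use.
 */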
4752 static inline u64
4753 btrfs_calc_reclaim_metadata_size(struct btrfs_root *root,
4754                                  struct btrfs_space_info *space_info)
4755 {
4756         u64 used;
4757         u64 expected;
4758         u64 to_reclaim;
4759
4760         to_reclaim = min_t(u64, num_online_cpus() * 1024 * 1024,
4761                                 16 * 1024 * 1024);
4762         spin_lock(&space_info->lock);
4763         if (can_overcommit(root, space_info, to_reclaim,
4764                            BTRFS_RESERVE_FLUSH_ALL)) {
4765                 to_reclaim = 0;
4766                 goto out;
4767         }
4768
4769         used = space_info->bytes_used + space_info->bytes_reserved +
4770                space_info->bytes_pinned + space_info->bytes_readonly +
4771                space_info->bytes_may_use;
4772         if (can_overcommit(root, space_info, 1024 * 1024,
4773                            BTRFS_RESERVE_FLUSH_ALL))
4774                 expected = div_factor_fine(space_info->total_bytes, 95);
4775         else
4776                 expected = div_factor_fine(space_info->total_bytes, 90);
4777
4778         if (used > expected)
4779                 to_reclaim = used - expected;
4780         else
4781                 to_reclaim = 0;
4782         to_reclaim = min(to_reclaim, space_info->bytes_may_use +
4783                                      space_info->bytes_reserved);
4784 out:
4785         spin_unlock(&space_info->lock);
4786
4787         return to_reclaim;
4788 }
4789
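/*
 * Return 1 when the async reclaim worker should be kicked: the space_info is
 * more than 98% committed but not already that full of real usage, and the
 * filesystem is neither closing nor being remounted.
 */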
4790 static inline int need_do_async_reclaim(struct btrfs_space_info *space_info,
4791                                         struct btrfs_fs_info *fs_info, u64 used)
4792 {
4793         u64 thresh = div_factor_fine(space_info->total_bytes, 98);
4794
4795         /* If we're just plain full then async reclaim just slows us down. */
4796         if (space_info->bytes_used >= thresh)
4797                 return 0;
4798
4799         return (used >= thresh && !btrfs_fs_closing(fs_info) &&
4800                 !test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state));
4801 }
4802
4803 static int btrfs_need_do_async_reclaim(struct btrfs_space_info *space_info,
4804                                        struct btrfs_fs_info *fs_info,
4805                                        int flush_state)
4806 {
4807         u64 used;
4808
4809         spin_lock(&space_info->lock);
4810         /*
4811          * We've run out of space and have not got any free space via
4812          * flush_space, so don't bother doing async reclaim.
4813          */
4814         if (flush_state > COMMIT_TRANS && space_info->full) {
4815                 spin_unlock(&space_info->lock);
4816                 return 0;
4817         }
4818
4819         used = space_info->bytes_used + space_info->bytes_reserved +
4820                space_info->bytes_pinned + space_info->bytes_readonly +
4821                space_info->bytes_may_use;
4822         if (need_do_async_reclaim(space_info, fs_info, used)) {
4823                 spin_unlock(&space_info->lock);
4824                 return 1;
4825         }
4826         spin_unlock(&space_info->lock);
4827
4828         return 0;
4829 }
4830
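/*
 * Work function behind fs_info->async_reclaim_work: repeatedly run the flush
 * states against the metadata space_info until enough space has been
 * reclaimed or the commit-transaction state has been reached.
 */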
4831 static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
4832 {
4833         struct btrfs_fs_info *fs_info;
4834         struct btrfs_space_info *space_info;
4835         u64 to_reclaim;
4836         int flush_state;
4837
4838         fs_info = container_of(work, struct btrfs_fs_info, async_reclaim_work);
4839         space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
4840
4841         to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info->fs_root,
4842                                                       space_info);
4843         if (!to_reclaim)
4844                 return;
4845
4846         flush_state = FLUSH_DELAYED_ITEMS_NR;
4847         do {
4848                 flush_space(fs_info->fs_root, space_info, to_reclaim,
4849                             to_reclaim, flush_state);
4850                 flush_state++;
4851                 if (!btrfs_need_do_async_reclaim(space_info, fs_info,
4852                                                  flush_state))
4853                         return;
4854         } while (flush_state < COMMIT_TRANS);
4855 }
4856
4857 void btrfs_init_async_reclaim_work(struct work_struct *work)
4858 {
4859         INIT_WORK(work, btrfs_async_reclaim_metadata_space);
4860 }
4861
4862 /**
4863  * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space
4864  * @root - the root we're allocating for
4865  * @block_rsv - the block_rsv we're allocating for
4866  * @orig_bytes - the number of bytes we want
4867  * @flush - whether or not we can flush to make our reservation
4868  *
4869  * This will reserve orig_bytes number of bytes from the space info associated
4870  * with the block_rsv.  If there is not enough space it will make an attempt to
4871  * flush out space to make room.  It will do this by flushing delalloc if
4872  * possible or committing the transaction.  If flush is BTRFS_RESERVE_NO_FLUSH
4873  * then no attempts to regain reservations will be made and this will fail if
4874  * there is not enough space already.
4875  */
4876 static int reserve_metadata_bytes(struct btrfs_root *root,
4877                                   struct btrfs_block_rsv *block_rsv,
4878                                   u64 orig_bytes,
4879                                   enum btrfs_reserve_flush_enum flush)
4880 {
4881         struct btrfs_space_info *space_info = block_rsv->space_info;
4882         u64 used;
4883         u64 num_bytes = orig_bytes;
4884         int flush_state = FLUSH_DELAYED_ITEMS_NR;
4885         int ret = 0;
4886         bool flushing = false;
4887
4888 again:
4889         ret = 0;
4890         spin_lock(&space_info->lock);
4891         /*
4892          * We only want to wait if somebody other than us is flushing and we
4893          * are actually allowed to flush all things.
4894          */
4895         while (flush == BTRFS_RESERVE_FLUSH_ALL && !flushing &&
4896                space_info->flush) {
4897                 spin_unlock(&space_info->lock);
4898                 /*
4899                  * If we have a trans handle we can't wait because the flusher
4900                  * may have to commit the transaction, which would mean we would
4901                  * deadlock since we are waiting for the flusher to finish, but
4902                  * hold the current transaction open.
4903                  */
4904                 if (current->journal_info)
4905                         return -EAGAIN;
4906                 ret = wait_event_killable(space_info->wait, !space_info->flush);
4907                 /* Must have been killed, return */
4908                 if (ret)
4909                         return -EINTR;
4910
4911                 spin_lock(&space_info->lock);
4912         }
4913
4914         ret = -ENOSPC;
4915         used = space_info->bytes_used + space_info->bytes_reserved +
4916                 space_info->bytes_pinned + space_info->bytes_readonly +
4917                 space_info->bytes_may_use;
4918
4919         /*
4920          * The idea here is that if we've not already over-reserved the space
4921          * info then we can go ahead and save our reservation first and then
4922          * start flushing if we need to.  Otherwise, if we've already
4923          * overcommitted, let's start flushing stuff first and then come back
4924          * and try to make our reservation.
4925          */
4926         if (used <= space_info->total_bytes) {
4927                 if (used + orig_bytes <= space_info->total_bytes) {
4928                         space_info->bytes_may_use += orig_bytes;
4929                         trace_btrfs_space_reservation(root->fs_info,
4930                                 "space_info", space_info->flags, orig_bytes, 1);
4931                         ret = 0;
4932                 } else {
4933                         /*
4934                          * Ok set num_bytes to orig_bytes since we aren't
4935                          * overcommitted, this way we only try and reclaim what
4936                          * we need.
4937                          */
4938                         num_bytes = orig_bytes;
4939                 }
4940         } else {
4941                 /*
4942                  * Ok we're over committed, set num_bytes to the overcommitted
4943                  * amount plus the amount of bytes that we need for this
4944                  * reservation.
4945                  */
4946                 num_bytes = used - space_info->total_bytes +
4947                         (orig_bytes * 2);
4948         }
4949
4950         if (ret && can_overcommit(root, space_info, orig_bytes, flush)) {
4951                 space_info->bytes_may_use += orig_bytes;
4952                 trace_btrfs_space_reservation(root->fs_info, "space_info",
4953                                               space_info->flags, orig_bytes,
4954                                               1);
4955                 ret = 0;
4956         }
4957
4958         /*
4959          * Couldn't make our reservation, save our place so while we're trying
4960          * to reclaim space we can actually use it instead of somebody else
4961          * stealing it from us.
4962          *
4963          * We make the other tasks wait for the flush only when we can flush
4964          * all things.
4965          */
4966         if (ret && flush != BTRFS_RESERVE_NO_FLUSH) {
4967                 flushing = true;
4968                 space_info->flush = 1;
4969         } else if (!ret && space_info->flags & BTRFS_BLOCK_GROUP_METADATA) {
4970                 used += orig_bytes;
4971                 /*
4972                  * We will do the space reservation dance during log replay,
4973                  * which means we won't have fs_info->fs_root set, so don't do
4974                  * the async reclaim as we will panic.
4975                  */
4976                 if (!root->fs_info->log_root_recovering &&
4977                     need_do_async_reclaim(space_info, root->fs_info, used) &&
4978                     !work_busy(&root->fs_info->async_reclaim_work))
4979                         queue_work(system_unbound_wq,
4980                                    &root->fs_info->async_reclaim_work);
4981         }
4982         spin_unlock(&space_info->lock);
4983
4984         if (!ret || flush == BTRFS_RESERVE_NO_FLUSH)
4985                 goto out;
4986
4987         ret = flush_space(root, space_info, num_bytes, orig_bytes,
4988                           flush_state);
4989         flush_state++;
4990
4991         /*
4992          * If we are FLUSH_LIMIT, we can not flush delalloc, or a deadlock
4993          * could happen. So skip the delalloc flush.
4994          */
4995         if (flush == BTRFS_RESERVE_FLUSH_LIMIT &&
4996             (flush_state == FLUSH_DELALLOC ||
4997              flush_state == FLUSH_DELALLOC_WAIT))
4998                 flush_state = ALLOC_CHUNK;
4999
5000         if (!ret)
5001                 goto again;
5002         else if (flush == BTRFS_RESERVE_FLUSH_LIMIT &&
5003                  flush_state < COMMIT_TRANS)
5004                 goto again;
5005         else if (flush == BTRFS_RESERVE_FLUSH_ALL &&
5006                  flush_state <= COMMIT_TRANS)
5007                 goto again;
5008
5009 out:
5010         if (ret == -ENOSPC &&
5011             unlikely(root->orphan_cleanup_state == ORPHAN_CLEANUP_STARTED)) {
5012                 struct btrfs_block_rsv *global_rsv =
5013                         &root->fs_info->global_block_rsv;
5014
5015                 if (block_rsv != global_rsv &&
5016                     !block_rsv_use_bytes(global_rsv, orig_bytes))
5017                         ret = 0;
5018         }
5019         if (ret == -ENOSPC)
5020                 trace_btrfs_space_reservation(root->fs_info,
5021                                               "space_info:enospc",
5022                                               space_info->flags, orig_bytes, 1);
5023         if (flushing) {
5024                 spin_lock(&space_info->lock);
5025                 space_info->flush = 0;
5026                 wake_up_all(&space_info->wait);
5027                 spin_unlock(&space_info->lock);
5028         }
5029         return ret;
5030 }
5031
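/*
 * Pick the block reserve to charge for a COW: the transaction's reserve for
 * roots with BTRFS_ROOT_REF_COWS set (and for csum insertion or uuid tree
 * updates), falling back to the root's own reserve and finally to the empty
 * reserve.
 */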
5032 static struct btrfs_block_rsv *get_block_rsv(
5033                                         const struct btrfs_trans_handle *trans,
5034                                         const struct btrfs_root *root)
5035 {
5036         struct btrfs_block_rsv *block_rsv = NULL;
5037
5038         if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) ||
5039             (root == root->fs_info->csum_root && trans->adding_csums) ||
5040              (root == root->fs_info->uuid_root))
5041                 block_rsv = trans->block_rsv;
5042
5043         if (!block_rsv)
5044                 block_rsv = root->block_rsv;
5045
5046         if (!block_rsv)
5047                 block_rsv = &root->fs_info->empty_block_rsv;
5048
5049         return block_rsv;
5050 }
5051
5052 static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv,
5053                                u64 num_bytes)
5054 {
5055         int ret = -ENOSPC;
5056         spin_lock(&block_rsv->lock);
5057         if (block_rsv->reserved >= num_bytes) {
5058                 block_rsv->reserved -= num_bytes;
5059                 if (block_rsv->reserved < block_rsv->size)
5060                         block_rsv->full = 0;
5061                 ret = 0;
5062         }
5063         spin_unlock(&block_rsv->lock);
5064         return ret;
5065 }
5066
5067 static void block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv,
5068                                 u64 num_bytes, int update_size)
5069 {
5070         spin_lock(&block_rsv->lock);
5071         block_rsv->reserved += num_bytes;
5072         if (update_size)
5073                 block_rsv->size += num_bytes;
5074         else if (block_rsv->reserved >= block_rsv->size)
5075                 block_rsv->full = 1;
5076         spin_unlock(&block_rsv->lock);
5077 }
5078
5079 int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info,
5080                              struct btrfs_block_rsv *dest, u64 num_bytes,
5081                              int min_factor)
5082 {
5083         struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
5084         u64 min_bytes;
5085
5086         if (global_rsv->space_info != dest->space_info)
5087                 return -ENOSPC;
5088
5089         spin_lock(&global_rsv->lock);
5090         min_bytes = div_factor(global_rsv->size, min_factor);
5091         if (global_rsv->reserved < min_bytes + num_bytes) {
5092                 spin_unlock(&global_rsv->lock);
5093                 return -ENOSPC;
5094         }
5095         global_rsv->reserved -= num_bytes;
5096         if (global_rsv->reserved < global_rsv->size)
5097                 global_rsv->full = 0;
5098         spin_unlock(&global_rsv->lock);
5099
5100         block_rsv_add_bytes(dest, num_bytes, 1);
5101         return 0;
5102 }
5103
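/*
 * Shrink @block_rsv by @num_bytes ((u64)-1 means the whole reserve).  Any
 * excess reservation above the new size is first moved into @dest (if given
 * and not already full) and whatever is left is returned to the space_info's
 * bytes_may_use.
 */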
5104 static void block_rsv_release_bytes(struct btrfs_fs_info *fs_info,
5105                                     struct btrfs_block_rsv *block_rsv,
5106                                     struct btrfs_block_rsv *dest, u64 num_bytes)
5107 {
5108         struct btrfs_space_info *space_info = block_rsv->space_info;
5109
5110         spin_lock(&block_rsv->lock);
5111         if (num_bytes == (u64)-1)
5112                 num_bytes = block_rsv->size;
5113         block_rsv->size -= num_bytes;
5114         if (block_rsv->reserved >= block_rsv->size) {
5115                 num_bytes = block_rsv->reserved - block_rsv->size;
5116                 block_rsv->reserved = block_rsv->size;
5117                 block_rsv->full = 1;
5118         } else {
5119                 num_bytes = 0;
5120         }
5121         spin_unlock(&block_rsv->lock);
5122
5123         if (num_bytes > 0) {
5124                 if (dest) {
5125                         spin_lock(&dest->lock);
5126                         if (!dest->full) {
5127                                 u64 bytes_to_add;
5128
5129                                 bytes_to_add = dest->size - dest->reserved;
5130                                 bytes_to_add = min(num_bytes, bytes_to_add);
5131                                 dest->reserved += bytes_to_add;
5132                                 if (dest->reserved >= dest->size)
5133                                         dest->full = 1;
5134                                 num_bytes -= bytes_to_add;
5135                         }
5136                         spin_unlock(&dest->lock);
5137                 }
5138                 if (num_bytes) {
5139                         spin_lock(&space_info->lock);
5140                         space_info->bytes_may_use -= num_bytes;
5141                         trace_btrfs_space_reservation(fs_info, "space_info",
5142                                         space_info->flags, num_bytes, 0);
5143                         spin_unlock(&space_info->lock);
5144                 }
5145         }
5146 }
5147
5148 static int block_rsv_migrate_bytes(struct btrfs_block_rsv *src,
5149                                    struct btrfs_block_rsv *dst, u64 num_bytes)
5150 {
5151         int ret;
5152
5153         ret = block_rsv_use_bytes(src, num_bytes);
5154         if (ret)
5155                 return ret;
5156
5157         block_rsv_add_bytes(dst, num_bytes, 1);
5158         return 0;
5159 }
5160
5161 void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type)
5162 {
5163         memset(rsv, 0, sizeof(*rsv));
5164         spin_lock_init(&rsv->lock);
5165         rsv->type = type;
5166 }
5167
5168 struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root,
5169                                               unsigned short type)
5170 {
5171         struct btrfs_block_rsv *block_rsv;
5172         struct btrfs_fs_info *fs_info = root->fs_info;
5173
5174         block_rsv = kmalloc(sizeof(*block_rsv), GFP_NOFS);
5175         if (!block_rsv)
5176                 return NULL;
5177
5178         btrfs_init_block_rsv(block_rsv, type);
5179         block_rsv->space_info = __find_space_info(fs_info,
5180                                                   BTRFS_BLOCK_GROUP_METADATA);
5181         return block_rsv;
5182 }
5183
5184 void btrfs_free_block_rsv(struct btrfs_root *root,
5185                           struct btrfs_block_rsv *rsv)
5186 {
5187         if (!rsv)
5188                 return;
5189         btrfs_block_rsv_release(root, rsv, (u64)-1);
5190         kfree(rsv);
5191 }
5192
5193 void __btrfs_free_block_rsv(struct btrfs_block_rsv *rsv)
5194 {
5195         kfree(rsv);
5196 }
5197
5198 int btrfs_block_rsv_add(struct btrfs_root *root,
5199                         struct btrfs_block_rsv *block_rsv, u64 num_bytes,
5200                         enum btrfs_reserve_flush_enum flush)
5201 {
5202         int ret;
5203
5204         if (num_bytes == 0)
5205                 return 0;
5206
5207         ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
5208         if (!ret) {
5209                 block_rsv_add_bytes(block_rsv, num_bytes, 1);
5210                 return 0;
5211         }
5212
5213         return ret;
5214 }
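/*
 * A typical caller pairs this with a private reserve (sketch only; the names
 * are illustrative and BTRFS_BLOCK_RSV_TEMP is assumed to be the temporary
 * rsv type defined elsewhere in the tree):
 *
 *	rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP);
 *	ret = btrfs_block_rsv_add(root, rsv, num_bytes, BTRFS_RESERVE_FLUSH_ALL);
 *	...
 *	btrfs_free_block_rsv(root, rsv);
 */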
5215
5216 int btrfs_block_rsv_check(struct btrfs_root *root,
5217                           struct btrfs_block_rsv *block_rsv, int min_factor)
5218 {
5219         u64 num_bytes = 0;
5220         int ret = -ENOSPC;
5221
5222         if (!block_rsv)
5223                 return 0;
5224
5225         spin_lock(&block_rsv->lock);
5226         num_bytes = div_factor(block_rsv->size, min_factor);
5227         if (block_rsv->reserved >= num_bytes)
5228                 ret = 0;
5229         spin_unlock(&block_rsv->lock);
5230
5231         return ret;
5232 }
5233
5234 int btrfs_block_rsv_refill(struct btrfs_root *root,
5235                            struct btrfs_block_rsv *block_rsv, u64 min_reserved,
5236                            enum btrfs_reserve_flush_enum flush)
5237 {
5238         u64 num_bytes = 0;
5239         int ret = -ENOSPC;
5240
5241         if (!block_rsv)
5242                 return 0;
5243
5244         spin_lock(&block_rsv->lock);
5245         num_bytes = min_reserved;
5246         if (block_rsv->reserved >= num_bytes)
5247                 ret = 0;
5248         else
5249                 num_bytes -= block_rsv->reserved;
5250         spin_unlock(&block_rsv->lock);
5251
5252         if (!ret)
5253                 return 0;
5254
5255         ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
5256         if (!ret) {
5257                 block_rsv_add_bytes(block_rsv, num_bytes, 0);
5258                 return 0;
5259         }
5260
5261         return ret;
5262 }
5263
5264 int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
5265                             struct btrfs_block_rsv *dst_rsv,
5266                             u64 num_bytes)
5267 {
5268         return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
5269 }
5270
5271 void btrfs_block_rsv_release(struct btrfs_root *root,
5272                              struct btrfs_block_rsv *block_rsv,
5273                              u64 num_bytes)
5274 {
5275         struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
5276         if (global_rsv == block_rsv ||
5277             block_rsv->space_info != global_rsv->space_info)
5278                 global_rsv = NULL;
5279         block_rsv_release_bytes(root->fs_info, block_rsv, global_rsv,
5280                                 num_bytes);
5281 }
5282
5283 /*
5284  * Helper to calculate the size of the global block reservation.
5285  * The desired value is the sum of the space used by the extent tree,
5286  * the checksum tree and the root tree.
5287  */
5288 static u64 calc_global_metadata_size(struct btrfs_fs_info *fs_info)
5289 {
5290         struct btrfs_space_info *sinfo;
5291         u64 num_bytes;
5292         u64 meta_used;
5293         u64 data_used;
5294         int csum_size = btrfs_super_csum_size(fs_info->super_copy);
5295
5296         sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_DATA);
5297         spin_lock(&sinfo->lock);
5298         data_used = sinfo->bytes_used;
5299         spin_unlock(&sinfo->lock);
5300
5301         sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
5302         spin_lock(&sinfo->lock);
5303         if (sinfo->flags & BTRFS_BLOCK_GROUP_DATA)
5304                 data_used = 0;
5305         meta_used = sinfo->bytes_used;
5306         spin_unlock(&sinfo->lock);
5307
5308         num_bytes = (data_used >> fs_info->sb->s_blocksize_bits) *
5309                     csum_size * 2;
5310         num_bytes += div_u64(data_used + meta_used, 50);
5311
5312         if (num_bytes * 3 > meta_used)
5313                 num_bytes = div_u64(meta_used, 3);
5314
5315         return ALIGN(num_bytes, fs_info->extent_root->nodesize << 10);
5316 }
5317
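/*
 * Resize the global block reserve to the value computed by
 * calc_global_metadata_size() (capped at 512M) and refill its reserved bytes
 * from the free space of the metadata space_info, trimming any excess back
 * once the reserve is full.
 */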
5318 static void update_global_block_rsv(struct btrfs_fs_info *fs_info)
5319 {
5320         struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;
5321         struct btrfs_space_info *sinfo = block_rsv->space_info;
5322         u64 num_bytes;
5323
5324         num_bytes = calc_global_metadata_size(fs_info);
5325
5326         spin_lock(&sinfo->lock);
5327         spin_lock(&block_rsv->lock);
5328
5329         block_rsv->size = min_t(u64, num_bytes, 512 * 1024 * 1024);
5330
5331         num_bytes = sinfo->bytes_used + sinfo->bytes_pinned +
5332                     sinfo->bytes_reserved + sinfo->bytes_readonly +
5333                     sinfo->bytes_may_use;
5334
5335         if (sinfo->total_bytes > num_bytes) {
5336                 num_bytes = sinfo->total_bytes - num_bytes;
5337                 block_rsv->reserved += num_bytes;
5338                 sinfo->bytes_may_use += num_bytes;
5339                 trace_btrfs_space_reservation(fs_info, "space_info",
5340                                       sinfo->flags, num_bytes, 1);
5341         }
5342
5343         if (block_rsv->reserved >= block_rsv->size) {
5344                 num_bytes = block_rsv->reserved - block_rsv->size;
5345                 sinfo->bytes_may_use -= num_bytes;
5346                 trace_btrfs_space_reservation(fs_info, "space_info",
5347                                       sinfo->flags, num_bytes, 0);
5348                 block_rsv->reserved = block_rsv->size;
5349                 block_rsv->full = 1;
5350         }
5351
5352         spin_unlock(&block_rsv->lock);
5353         spin_unlock(&sinfo->lock);
5354 }
5355
5356 static void init_global_block_rsv(struct btrfs_fs_info *fs_info)
5357 {
5358         struct btrfs_space_info *space_info;
5359
5360         space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
5361         fs_info->chunk_block_rsv.space_info = space_info;
5362
5363         space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
5364         fs_info->global_block_rsv.space_info = space_info;
5365         fs_info->delalloc_block_rsv.space_info = space_info;
5366         fs_info->trans_block_rsv.space_info = space_info;
5367         fs_info->empty_block_rsv.space_info = space_info;
5368         fs_info->delayed_block_rsv.space_info = space_info;
5369
5370         fs_info->extent_root->block_rsv = &fs_info->global_block_rsv;
5371         fs_info->csum_root->block_rsv = &fs_info->global_block_rsv;
5372         fs_info->dev_root->block_rsv = &fs_info->global_block_rsv;
5373         fs_info->tree_root->block_rsv = &fs_info->global_block_rsv;
5374         if (fs_info->quota_root)
5375                 fs_info->quota_root->block_rsv = &fs_info->global_block_rsv;
5376         fs_info->chunk_root->block_rsv = &fs_info->chunk_block_rsv;
5377
5378         update_global_block_rsv(fs_info);
5379 }
5380
5381 static void release_global_block_rsv(struct btrfs_fs_info *fs_info)
5382 {
5383         block_rsv_release_bytes(fs_info, &fs_info->global_block_rsv, NULL,
5384                                 (u64)-1);
5385         WARN_ON(fs_info->delalloc_block_rsv.size > 0);
5386         WARN_ON(fs_info->delalloc_block_rsv.reserved > 0);
5387         WARN_ON(fs_info->trans_block_rsv.size > 0);
5388         WARN_ON(fs_info->trans_block_rsv.reserved > 0);
5389         WARN_ON(fs_info->chunk_block_rsv.size > 0);
5390         WARN_ON(fs_info->chunk_block_rsv.reserved > 0);
5391         WARN_ON(fs_info->delayed_block_rsv.size > 0);
5392         WARN_ON(fs_info->delayed_block_rsv.reserved > 0);
5393 }
5394
5395 void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
5396                                   struct btrfs_root *root)
5397 {
5398         if (!trans->block_rsv)
5399                 return;
5400
5401         if (!trans->bytes_reserved)
5402                 return;
5403
5404         trace_btrfs_space_reservation(root->fs_info, "transaction",
5405                                       trans->transid, trans->bytes_reserved, 0);
5406         btrfs_block_rsv_release(root, trans->block_rsv, trans->bytes_reserved);
5407         trans->bytes_reserved = 0;
5408 }
5409
5410 /*
5411  * To be called after all the new block groups attached to the transaction
5412  * handle have been created (btrfs_create_pending_block_groups()).
5413  */
5414 void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans)
5415 {
5416         struct btrfs_fs_info *fs_info = trans->root->fs_info;
5417
5418         if (!trans->chunk_bytes_reserved)
5419                 return;
5420
5421         WARN_ON_ONCE(!list_empty(&trans->new_bgs));
5422
5423         block_rsv_release_bytes(fs_info, &fs_info->chunk_block_rsv, NULL,
5424                                 trans->chunk_bytes_reserved);
5425         trans->chunk_bytes_reserved = 0;
5426 }
5427
5428 /* Can only return 0 or -ENOSPC */
5429 int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
5430                                   struct inode *inode)
5431 {
5432         struct btrfs_root *root = BTRFS_I(inode)->root;
5433         struct btrfs_block_rsv *src_rsv = get_block_rsv(trans, root);
5434         struct btrfs_block_rsv *dst_rsv = root->orphan_block_rsv;
5435
5436         /*
5437          * We need to hold space in order to delete our orphan item once we've
5438          * added it, so this takes the reservation now and we release it
5439          * later, when we are truly done with the orphan item.
5440          */
5441         u64 num_bytes = btrfs_calc_trans_metadata_size(root, 1);
5442         trace_btrfs_space_reservation(root->fs_info, "orphan",
5443                                       btrfs_ino(inode), num_bytes, 1);
5444         return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
5445 }
5446
5447 void btrfs_orphan_release_metadata(struct inode *inode)
5448 {
5449         struct btrfs_root *root = BTRFS_I(inode)->root;
5450         u64 num_bytes = btrfs_calc_trans_metadata_size(root, 1);
5451         trace_btrfs_space_reservation(root->fs_info, "orphan",
5452                                       btrfs_ino(inode), num_bytes, 0);
5453         btrfs_block_rsv_release(root, root->orphan_block_rsv, num_bytes);
5454 }
5455
5456 /*
5457  * btrfs_subvolume_reserve_metadata() - reserve space for subvolume operation
5458  * root: the root of the parent directory
5459  * rsv: block reservation
5460  * items: the number of items that we need to reserve for
5461  * qgroup_reserved: used to return the reserved size in qgroup
5462  *
5463  * This function is used to reserve the space for snapshot/subvolume
5464  * creation and deletion. Those operations differ from the common
5465  * file/directory operations: they change two fs/file trees and the
5466  * root tree, and the number of items that the qgroup reserves differs
5467  * from the free space reservation. So we can not use the space
5468  * reservation mechanism in start_transaction().
5469  */
5470 int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
5471                                      struct btrfs_block_rsv *rsv,
5472                                      int items,
5473                                      u64 *qgroup_reserved,
5474                                      bool use_global_rsv)
5475 {
5476         u64 num_bytes;
5477         int ret;
5478         struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
5479
5480         if (root->fs_info->quota_enabled) {
5481                 /* One for parent inode, two for dir entries */
5482                 num_bytes = 3 * root->nodesize;
5483                 ret = btrfs_qgroup_reserve_meta(root, num_bytes);
5484                 if (ret)
5485                         return ret;
5486         } else {
5487                 num_bytes = 0;
5488         }
5489
5490         *qgroup_reserved = num_bytes;
5491
5492         num_bytes = btrfs_calc_trans_metadata_size(root, items);
5493         rsv->space_info = __find_space_info(root->fs_info,
5494                                             BTRFS_BLOCK_GROUP_METADATA);
5495         ret = btrfs_block_rsv_add(root, rsv, num_bytes,
5496                                   BTRFS_RESERVE_FLUSH_ALL);
5497
5498         if (ret == -ENOSPC && use_global_rsv)
5499                 ret = btrfs_block_rsv_migrate(global_rsv, rsv, num_bytes);
5500
5501         if (ret && *qgroup_reserved)
5502                 btrfs_qgroup_free_meta(root, *qgroup_reserved);
5503
5504         return ret;
5505 }
5506
5507 void btrfs_subvolume_release_metadata(struct btrfs_root *root,
5508                                       struct btrfs_block_rsv *rsv,
5509                                       u64 qgroup_reserved)
5510 {
5511         btrfs_block_rsv_release(root, rsv, (u64)-1);
5512 }
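
/*
 * Illustrative use of the reserve/release pair above (hypothetical caller,
 * item count chosen purely for the example):
 *
 *	struct btrfs_block_rsv block_rsv;
 *	u64 qgroup_reserved = 0;
 *
 *	btrfs_init_block_rsv(&block_rsv, BTRFS_BLOCK_RSV_TEMP);
 *	ret = btrfs_subvolume_reserve_metadata(root, &block_rsv, 8,
 *					       &qgroup_reserved, true);
 *	if (ret)
 *		return ret;
 *	... create the subvolume/snapshot items against block_rsv ...
 *	btrfs_subvolume_release_metadata(root, &block_rsv, qgroup_reserved);
 *
 * With quotas enabled and a 16KiB nodesize, the qgroup side reserves
 * 3 * 16KiB = 48KiB (one parent inode plus two dir entries) regardless of
 * the item count passed in; on -ENOSPC the use_global_rsv case falls back
 * to migrating the bytes out of the global reserve.
 */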
5513
5514 /**
5515  * drop_outstanding_extent - drop an outstanding extent
5516  * @inode: the inode we're dropping the extent for
5517  * @num_bytes: the number of bytes we're releasing.
5518  *
5519  * This is called when we are freeing up an outstanding extent, either called
5520  * after an error or after an extent is written.  This will return the number of
5521  * reserved extents that need to be freed.  This must be called with
5522  * BTRFS_I(inode)->lock held.
5523  */
5524 static unsigned drop_outstanding_extent(struct inode *inode, u64 num_bytes)
5525 {
5526         unsigned drop_inode_space = 0;
5527         unsigned dropped_extents = 0;
5528         unsigned num_extents = 0;
5529
5530         num_extents = (unsigned)div64_u64(num_bytes +
5531                                           BTRFS_MAX_EXTENT_SIZE - 1,
5532                                           BTRFS_MAX_EXTENT_SIZE);
5533         ASSERT(num_extents);
5534         ASSERT(BTRFS_I(inode)->outstanding_extents >= num_extents);
5535         BTRFS_I(inode)->outstanding_extents -= num_extents;
5536
5537         if (BTRFS_I(inode)->outstanding_extents == 0 &&
5538             test_and_clear_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
5539                                &BTRFS_I(inode)->runtime_flags))
5540                 drop_inode_space = 1;
5541
5542         /*
5543          * If we have the same number of outstanding extents as reserved, or
5544          * more, then we need to leave the reserved extents count alone.
5545          */
5546         if (BTRFS_I(inode)->outstanding_extents >=
5547             BTRFS_I(inode)->reserved_extents)
5548                 return drop_inode_space;
5549
5550         dropped_extents = BTRFS_I(inode)->reserved_extents -
5551                 BTRFS_I(inode)->outstanding_extents;
5552         BTRFS_I(inode)->reserved_extents -= dropped_extents;
5553         return dropped_extents + drop_inode_space;
5554 }
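
/*
 * Illustrative walk-through of the helper above (numbers invented for the
 * example, with BTRFS_MAX_EXTENT_SIZE == 128MiB):
 *
 *	num_bytes = 300MiB                      -> num_extents = 3
 *	outstanding_extents: 5 -> 2, reserved_extents is still 6
 *	2 < 6, so dropped_extents = 6 - 2 = 4 and reserved_extents becomes 2
 *	return value = 4 (plus 1 more only if outstanding_extents had hit 0
 *	while the DELALLOC_META_RESERVED bit was still set)
 */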
5555
5556 /**
5557  * calc_csum_metadata_size - return the amount of metadata space that must be
5558  *      reserved/free'd for the given bytes.
5559  * @inode: the inode we're manipulating
5560  * @num_bytes: the number of bytes in question
5561  * @reserve: 1 if we are reserving space, 0 if we are freeing space
5562  *
5563  * This adjusts the number of csum_bytes in the inode and then returns the
5564  * correct amount of metadata that must either be reserved or freed.  We
5565  * calculate how many checksums we can fit into one leaf and then divide the
5566  * number of bytes that will need to be checksummed by this value to figure out
5567  * how many checksums will be required.  If we are adding bytes then the number
5568  * may go up and we will return the number of additional bytes that must be
5569  * reserved.  If it is going down we will return the number of bytes that must
5570  * be freed.
5571  *
5572  * This must be called with BTRFS_I(inode)->lock held.
5573  */
5574 static u64 calc_csum_metadata_size(struct inode *inode, u64 num_bytes,
5575                                    int reserve)
5576 {
5577         struct btrfs_root *root = BTRFS_I(inode)->root;
5578         u64 old_csums, num_csums;
5579
5580         if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM &&
5581             BTRFS_I(inode)->csum_bytes == 0)
5582                 return 0;
5583
5584         old_csums = btrfs_csum_bytes_to_leaves(root, BTRFS_I(inode)->csum_bytes);
5585         if (reserve)
5586                 BTRFS_I(inode)->csum_bytes += num_bytes;
5587         else
5588                 BTRFS_I(inode)->csum_bytes -= num_bytes;
5589         num_csums = btrfs_csum_bytes_to_leaves(root, BTRFS_I(inode)->csum_bytes);
5590
5591         /* No change, no need to reserve more */
5592         if (old_csums == num_csums)
5593                 return 0;
5594
5595         if (reserve)
5596                 return btrfs_calc_trans_metadata_size(root,
5597                                                       num_csums - old_csums);
5598
5599         return btrfs_calc_trans_metadata_size(root, old_csums - num_csums);
5600 }
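
/*
 * Rough worked example (illustrative numbers; with crc32c checksums and a
 * 16KiB leaf, one leaf holds checksums for on the order of 16MiB of 4KiB
 * blocks, so the leaf counts below are approximate):
 *
 *	reserve path, num_bytes = 32MiB, csum_bytes goes 8MiB -> 40MiB
 *	old_csums ~= 1 leaf, num_csums ~= 3 leaves
 *	returned reservation = btrfs_calc_trans_metadata_size(root, 2)
 *
 * The free path is symmetric: shrinking csum_bytes back to 8MiB returns the
 * metadata size for the same 2 leaves.
 */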
5601
5602 int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
5603 {
5604         struct btrfs_root *root = BTRFS_I(inode)->root;
5605         struct btrfs_block_rsv *block_rsv = &root->fs_info->delalloc_block_rsv;
5606         u64 to_reserve = 0;
5607         u64 csum_bytes;
5608         unsigned nr_extents = 0;
5609         int extra_reserve = 0;
5610         enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL;
5611         int ret = 0;
5612         bool delalloc_lock = true;
5613         u64 to_free = 0;
5614         unsigned dropped;
5615
5616         /* If we are a free space inode we need to not flush since we will be in
5617          * the middle of a transaction commit.  We also don't need the delalloc
5618          * mutex since we won't race with anybody.  We need this mostly to make
5619          * lockdep shut its filthy mouth.
5620          */
5621         if (btrfs_is_free_space_inode(inode)) {
5622                 flush = BTRFS_RESERVE_NO_FLUSH;
5623                 delalloc_lock = false;
5624         }
5625
5626         if (flush != BTRFS_RESERVE_NO_FLUSH &&
5627             btrfs_transaction_in_commit(root->fs_info))
5628                 schedule_timeout(1);
5629
5630         if (delalloc_lock)
5631                 mutex_lock(&BTRFS_I(inode)->delalloc_mutex);
5632
5633         num_bytes = ALIGN(num_bytes, root->sectorsize);
5634
5635         spin_lock(&BTRFS_I(inode)->lock);
5636         nr_extents = (unsigned)div64_u64(num_bytes +
5637                                          BTRFS_MAX_EXTENT_SIZE - 1,
5638                                          BTRFS_MAX_EXTENT_SIZE);
5639         BTRFS_I(inode)->outstanding_extents += nr_extents;
5640         nr_extents = 0;
5641
5642         if (BTRFS_I(inode)->outstanding_extents >
5643             BTRFS_I(inode)->reserved_extents)
5644                 nr_extents = BTRFS_I(inode)->outstanding_extents -
5645                         BTRFS_I(inode)->reserved_extents;
5646
5647         /*
5648          * Add an item to reserve for updating the inode when we complete the
5649          * delalloc io.
5650          */
5651         if (!test_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
5652                       &BTRFS_I(inode)->runtime_flags)) {
5653                 nr_extents++;
5654                 extra_reserve = 1;
5655         }
5656
5657         to_reserve = btrfs_calc_trans_metadata_size(root, nr_extents);
5658         to_reserve += calc_csum_metadata_size(inode, num_bytes, 1);
5659         csum_bytes = BTRFS_I(inode)->csum_bytes;
5660         spin_unlock(&BTRFS_I(inode)->lock);
5661
5662         if (root->fs_info->quota_enabled) {
5663                 ret = btrfs_qgroup_reserve_meta(root,
5664                                 nr_extents * root->nodesize);
5665                 if (ret)
5666                         goto out_fail;
5667         }
5668
5669         ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush);
5670         if (unlikely(ret)) {
5671                 btrfs_qgroup_free_meta(root, nr_extents * root->nodesize);
5672                 goto out_fail;
5673         }
5674
5675         spin_lock(&BTRFS_I(inode)->lock);
5676         if (extra_reserve) {
5677                 set_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
5678                         &BTRFS_I(inode)->runtime_flags);
5679                 nr_extents--;
5680         }
5681         BTRFS_I(inode)->reserved_extents += nr_extents;
5682         spin_unlock(&BTRFS_I(inode)->lock);
5683
5684         if (delalloc_lock)
5685                 mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);
5686
5687         if (to_reserve)
5688                 trace_btrfs_space_reservation(root->fs_info, "delalloc",
5689                                               btrfs_ino(inode), to_reserve, 1);
5690         block_rsv_add_bytes(block_rsv, to_reserve, 1);
5691
5692         return 0;
5693
5694 out_fail:
5695         spin_lock(&BTRFS_I(inode)->lock);
5696         dropped = drop_outstanding_extent(inode, num_bytes);
5697         /*
5698          * If the inodes csum_bytes is the same as the original
5699          * csum_bytes then we know we haven't raced with any free()ers
5700          * so we can just reduce our inodes csum bytes and carry on.
5701          */
5702         if (BTRFS_I(inode)->csum_bytes == csum_bytes) {
5703                 calc_csum_metadata_size(inode, num_bytes, 0);
5704         } else {
5705                 u64 orig_csum_bytes = BTRFS_I(inode)->csum_bytes;
5706                 u64 bytes;
5707
5708                 /*
5709                  * This is tricky, but first we need to figure out how much we
5710                  * free'd from any free-ers that occurred during this
5711                  * reservation, so we reset ->csum_bytes to the csum_bytes
5712                  * before we dropped our lock, and then call the free for the
5713                  * number of bytes that were freed while we were trying our
5714                  * reservation.
5715                  */
5716                 bytes = csum_bytes - BTRFS_I(inode)->csum_bytes;
5717                 BTRFS_I(inode)->csum_bytes = csum_bytes;
5718                 to_free = calc_csum_metadata_size(inode, bytes, 0);
5719
5720
5721                 /*
5722                  * Now we need to see how much we would have freed had we not
5723                  * been making this reservation and our ->csum_bytes were not
5724                  * artificially inflated.
5725                  */
5726                 BTRFS_I(inode)->csum_bytes = csum_bytes - num_bytes;
5727                 bytes = csum_bytes - orig_csum_bytes;
5728                 bytes = calc_csum_metadata_size(inode, bytes, 0);
5729
5730                 /*
5731                  * Now reset ->csum_bytes to what it should be.  If bytes is
5732                  * more than to_free then we would have free'd more space had we
5733                  * not had an artificially high ->csum_bytes, so we need to free
5734                  * the remainder.  If bytes is the same or less then we don't
5735                  * need to do anything, the other free-ers did the correct
5736                  * thing.
5737                  */
5738                 BTRFS_I(inode)->csum_bytes = orig_csum_bytes - num_bytes;
5739                 if (bytes > to_free)
5740                         to_free = bytes - to_free;
5741                 else
5742                         to_free = 0;
5743         }
5744         spin_unlock(&BTRFS_I(inode)->lock);
5745         if (dropped)
5746                 to_free += btrfs_calc_trans_metadata_size(root, dropped);
5747
5748         if (to_free) {
5749                 btrfs_block_rsv_release(root, block_rsv, to_free);
5750                 trace_btrfs_space_reservation(root->fs_info, "delalloc",
5751                                               btrfs_ino(inode), to_free, 0);
5752         }
5753         if (delalloc_lock)
5754                 mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);
5755         return ret;
5756 }
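
/*
 * Common-case example of the reservation above (illustrative values): a
 * 1MiB buffered write into an inode with no outstanding extents and no
 * pending inode-update reservation adds one outstanding extent plus one
 * extra item for the deferred inode update, so
 *
 *	to_reserve = btrfs_calc_trans_metadata_size(root, 2) +
 *		     calc_csum_metadata_size(inode, 1MiB, 1);
 *
 * On success reserved_extents only grows by 1; the inode-update item is
 * tracked via the BTRFS_INODE_DELALLOC_META_RESERVED bit instead, which is
 * why nr_extents is decremented again before reserved_extents is bumped.
 */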
5757
5758 /**
5759  * btrfs_delalloc_release_metadata - release a metadata reservation for an inode
5760  * @inode: the inode to release the reservation for
5761  * @num_bytes: the number of bytes we're releasing
5762  *
5763  * This will release the metadata reservation for an inode.  This can be called
5764  * once we complete IO for a given set of bytes to release their metadata
5765  * reservations.
5766  */
5767 void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
5768 {
5769         struct btrfs_root *root = BTRFS_I(inode)->root;
5770         u64 to_free = 0;
5771         unsigned dropped;
5772
5773         num_bytes = ALIGN(num_bytes, root->sectorsize);
5774         spin_lock(&BTRFS_I(inode)->lock);
5775         dropped = drop_outstanding_extent(inode, num_bytes);
5776
5777         if (num_bytes)
5778                 to_free = calc_csum_metadata_size(inode, num_bytes, 0);
5779         spin_unlock(&BTRFS_I(inode)->lock);
5780         if (dropped > 0)
5781                 to_free += btrfs_calc_trans_metadata_size(root, dropped);
5782
5783         if (btrfs_test_is_dummy_root(root))
5784                 return;
5785
5786         trace_btrfs_space_reservation(root->fs_info, "delalloc",
5787                                       btrfs_ino(inode), to_free, 0);
5788
5789         btrfs_block_rsv_release(root, &root->fs_info->delalloc_block_rsv,
5790                                 to_free);
5791 }
5792
5793 /**
5794  * btrfs_delalloc_reserve_space - reserve data and metadata space for
5795  * delalloc
5796  * @inode: inode we're writing to
5797  * @start: start range we are writing to
5798  * @len: the length of the range we are writing to
5799  *
5800  * TODO: This function will finally replace the old btrfs_delalloc_reserve_space()
5801  *
5802  * This will do the following things
5803  *
5804  * o reserve space in data space info for num bytes
5805  *   and reserve precious corresponding qgroup space
5806  *   (Done in check_data_free_space)
5807  *
5808  * o reserve space for metadata, based on the number of outstanding
5809  *   extents and how many csums will be needed;
5810  *   also reserve metadata space in a per-root over-reserve manner.
5811  * o add to the inode's ->delalloc_bytes
5812  * o add it to the fs_info's delalloc inodes list.
5813  *   (Above 3 all done in delalloc_reserve_metadata)
5814  *
5815  * Return 0 for success
5816  * Return <0 for error (-ENOSPC or -EDQUOT)
5817  */
5818 int btrfs_delalloc_reserve_space(struct inode *inode, u64 start, u64 len)
5819 {
5820         int ret;
5821
5822         ret = btrfs_check_data_free_space(inode, start, len);
5823         if (ret < 0)
5824                 return ret;
5825         ret = btrfs_delalloc_reserve_metadata(inode, len);
5826         if (ret < 0)
5827                 btrfs_free_reserved_data_space(inode, start, len);
5828         return ret;
5829 }
5830
5831 /**
5832  * btrfs_delalloc_release_space - release data and metadata space for delalloc
5833  * @inode: inode we're releasing space for
5834  * @start: start position of the space already reserved
5835  * @len: the len of the space already reserved
5836  *
5837  * This must be matched with a call to btrfs_delalloc_reserve_space.  This is
5838  * called in the case that we don't need the metadata AND data reservations
5839  * anymore, e.g. if there is an error or we insert an inline extent.
5840  *
5841  * This function will release the metadata space that was not used and will
5842  * decrement ->delalloc_bytes and remove it from the fs_info delalloc_inodes
5843  * list if there are no delalloc bytes left.
5844  * Also it will handle the qgroup reserved space.
5845  */
5846 void btrfs_delalloc_release_space(struct inode *inode, u64 start, u64 len)
5847 {
5848         btrfs_delalloc_release_metadata(inode, len);
5849         btrfs_free_reserved_data_space(inode, start, len);
5850 }
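
/*
 * Illustrative pairing of the two helpers above (hypothetical write-path
 * caller; names are made up for the example):
 *
 *	ret = btrfs_delalloc_reserve_space(inode, pos, write_bytes);
 *	if (ret)
 *		return ret;
 *	... copy the data in and mark the range delalloc ...
 *	if (copy_failed)
 *		btrfs_delalloc_release_space(inode, pos, write_bytes);
 *
 * On the success path only the metadata half is returned later, via
 * btrfs_delalloc_release_metadata() once the delalloc IO completes; the
 * data half is consumed by the extent that finally gets allocated.
 */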
5851
5852 static int update_block_group(struct btrfs_trans_handle *trans,
5853                               struct btrfs_root *root, u64 bytenr,
5854                               u64 num_bytes, int alloc)
5855 {
5856         struct btrfs_block_group_cache *cache = NULL;
5857         struct btrfs_fs_info *info = root->fs_info;
5858         u64 total = num_bytes;
5859         u64 old_val;
5860         u64 byte_in_group;
5861         int factor;
5862
5863         /* block accounting for super block */
5864         spin_lock(&info->delalloc_root_lock);
5865         old_val = btrfs_super_bytes_used(info->super_copy);
5866         if (alloc)
5867                 old_val += num_bytes;
5868         else
5869                 old_val -= num_bytes;
5870         btrfs_set_super_bytes_used(info->super_copy, old_val);
5871         spin_unlock(&info->delalloc_root_lock);
5872
5873         while (total) {
5874                 cache = btrfs_lookup_block_group(info, bytenr);
5875                 if (!cache)
5876                         return -ENOENT;
5877                 if (cache->flags & (BTRFS_BLOCK_GROUP_DUP |
5878                                     BTRFS_BLOCK_GROUP_RAID1 |
5879                                     BTRFS_BLOCK_GROUP_RAID10))
5880                         factor = 2;
5881                 else
5882                         factor = 1;
5883                 /*
5884                  * If this block group has free space cache written out, we
5885                  * need to make sure to load it if we are removing space.  This
5886                  * is because we need the unpinning stage to actually add the
5887                  * space back to the block group, otherwise we will leak space.
5888                  */
5889                 if (!alloc && cache->cached == BTRFS_CACHE_NO)
5890                         cache_block_group(cache, 1);
5891
5892                 byte_in_group = bytenr - cache->key.objectid;
5893                 WARN_ON(byte_in_group > cache->key.offset);
5894
5895                 spin_lock(&cache->space_info->lock);
5896                 spin_lock(&cache->lock);
5897
5898                 if (btrfs_test_opt(root, SPACE_CACHE) &&
5899                     cache->disk_cache_state < BTRFS_DC_CLEAR)
5900                         cache->disk_cache_state = BTRFS_DC_CLEAR;
5901
5902                 old_val = btrfs_block_group_used(&cache->item);
5903                 num_bytes = min(total, cache->key.offset - byte_in_group);
5904                 if (alloc) {
5905                         old_val += num_bytes;
5906                         btrfs_set_block_group_used(&cache->item, old_val);
5907                         cache->reserved -= num_bytes;
5908                         cache->space_info->bytes_reserved -= num_bytes;
5909                         cache->space_info->bytes_used += num_bytes;
5910                         cache->space_info->disk_used += num_bytes * factor;
5911                         spin_unlock(&cache->lock);
5912                         spin_unlock(&cache->space_info->lock);
5913                 } else {
5914                         old_val -= num_bytes;
5915                         btrfs_set_block_group_used(&cache->item, old_val);
5916                         cache->pinned += num_bytes;
5917                         cache->space_info->bytes_pinned += num_bytes;
5918                         cache->space_info->bytes_used -= num_bytes;
5919                         cache->space_info->disk_used -= num_bytes * factor;
5920                         spin_unlock(&cache->lock);
5921                         spin_unlock(&cache->space_info->lock);
5922
5923                         set_extent_dirty(info->pinned_extents,
5924                                          bytenr, bytenr + num_bytes - 1,
5925                                          GFP_NOFS | __GFP_NOFAIL);
5926                 }
5927
5928                 spin_lock(&trans->transaction->dirty_bgs_lock);
5929                 if (list_empty(&cache->dirty_list)) {
5930                         list_add_tail(&cache->dirty_list,
5931                                       &trans->transaction->dirty_bgs);
5932                         trans->transaction->num_dirty_bgs++;
5933                         btrfs_get_block_group(cache);
5934                 }
5935                 spin_unlock(&trans->transaction->dirty_bgs_lock);
5936
5937                 /*
5938                  * No longer have used bytes in this block group, queue it for
5939                  * deletion. We do this after adding the block group to the
5940                  * dirty list to avoid races between cleaner kthread and space
5941                  * cache writeout.
5942                  */
5943                 if (!alloc && old_val == 0) {
5944                         spin_lock(&info->unused_bgs_lock);
5945                         if (list_empty(&cache->bg_list)) {
5946                                 btrfs_get_block_group(cache);
5947                                 list_add_tail(&cache->bg_list,
5948                                               &info->unused_bgs);
5949                         }
5950                         spin_unlock(&info->unused_bgs_lock);
5951                 }
5952
5953                 btrfs_put_block_group(cache);
5954                 total -= num_bytes;
5955                 bytenr += num_bytes;
5956         }
5957         return 0;
5958 }
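
/*
 * Accounting example for the loop above (illustrative numbers): freeing
 * (alloc == 0) 1MiB that sits in a RAID1 data block group uses factor == 2,
 * so the block group's used bytes and space_info->bytes_used drop by 1MiB
 * while disk_used drops by 2MiB; the range is moved to pinned and marked in
 * pinned_extents, so it only becomes free space again when the transaction
 * commits and unpins it.  The super block's bytes_used also drops by 1MiB,
 * the group is queued on the transaction's dirty_bgs list, and if its used
 * count reaches zero it is additionally queued on unused_bgs for possible
 * removal by the cleaner.
 */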
5959
5960 static u64 first_logical_byte(struct btrfs_root *root, u64 search_start)
5961 {
5962         struct btrfs_block_group_cache *cache;
5963         u64 bytenr;
5964
5965         spin_lock(&root->fs_info->block_group_cache_lock);
5966         bytenr = root->fs_info->first_logical_byte;
5967         spin_unlock(&root->fs_info->block_group_cache_lock);
5968
5969         if (bytenr < (u64)-1)
5970                 return bytenr;
5971
5972         cache = btrfs_lookup_first_block_group(root->fs_info, search_start);
5973         if (!cache)
5974                 return 0;
5975
5976         bytenr = cache->key.objectid;
5977         btrfs_put_block_group(cache);
5978
5979         return bytenr;
5980 }
5981
5982 static int pin_down_extent(struct btrfs_root *root,
5983                            struct btrfs_block_group_cache *cache,
5984                            u64 bytenr, u64 num_bytes, int reserved)
5985 {
5986         spin_lock(&cache->space_info->lock);
5987         spin_lock(&cache->lock);
5988         cache->pinned += num_bytes;
5989         cache->space_info->bytes_pinned += num_bytes;
5990         if (reserved) {
5991                 cache->reserved -= num_bytes;
5992                 cache->space_info->bytes_reserved -= num_bytes;
5993         }
5994         spin_unlock(&cache->lock);
5995         spin_unlock(&cache->space_info->lock);
5996
5997         set_extent_dirty(root->fs_info->pinned_extents, bytenr,
5998                          bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL);
5999         if (reserved)
6000                 trace_btrfs_reserved_extent_free(root, bytenr, num_bytes);
6001         return 0;
6002 }
6003
6004 /*
6005  * this function must be called within a transaction
6006  */
6007 int btrfs_pin_extent(struct btrfs_root *root,
6008                      u64 bytenr, u64 num_bytes, int reserved)
6009 {
6010         struct btrfs_block_group_cache *cache;
6011
6012         cache = btrfs_lookup_block_group(root->fs_info, bytenr);
6013         BUG_ON(!cache); /* Logic error */
6014
6015         pin_down_extent(root, cache, bytenr, num_bytes, reserved);
6016
6017         btrfs_put_block_group(cache);
6018         return 0;
6019 }
6020
6021 /*
6022  * this function must be called within a transaction
6023  */
6024 int btrfs_pin_extent_for_log_replay(struct btrfs_root *root,
6025                                     u64 bytenr, u64 num_bytes)
6026 {
6027         struct btrfs_block_group_cache *cache;
6028         int ret;
6029
6030         cache = btrfs_lookup_block_group(root->fs_info, bytenr);
6031         if (!cache)
6032                 return -EINVAL;
6033
6034         /*
6035          * pull in the free space cache (if any) so that our pin
6036          * removes the free space from the cache.  We have load_only set
6037          * to one because the slow code to read in the free extents does check
6038          * the pinned extents.
6039          */
6040         cache_block_group(cache, 1);
6041
6042         pin_down_extent(root, cache, bytenr, num_bytes, 0);
6043
6044         /* remove us from the free space cache (if we're there at all) */
6045         ret = btrfs_remove_free_space(cache, bytenr, num_bytes);
6046         btrfs_put_block_group(cache);
6047         return ret;
6048 }
6049
6050 static int __exclude_logged_extent(struct btrfs_root *root, u64 start, u64 num_bytes)
6051 {
6052         int ret;
6053         struct btrfs_block_group_cache *block_group;
6054         struct btrfs_caching_control *caching_ctl;
6055
6056         block_group = btrfs_lookup_block_group(root->fs_info, start);
6057         if (!block_group)
6058                 return -EINVAL;
6059
6060         cache_block_group(block_group, 0);
6061         caching_ctl = get_caching_control(block_group);
6062
6063         if (!caching_ctl) {
6064                 /* Logic error */
6065                 BUG_ON(!block_group_cache_done(block_group));
6066                 ret = btrfs_remove_free_space(block_group, start, num_bytes);
6067         } else {
6068                 mutex_lock(&caching_ctl->mutex);
6069
6070                 if (start >= caching_ctl->progress) {
6071                         ret = add_excluded_extent(root, start, num_bytes);
6072                 } else if (start + num_bytes <= caching_ctl->progress) {
6073                         ret = btrfs_remove_free_space(block_group,
6074                                                       start, num_bytes);
6075                 } else {
6076                         num_bytes = caching_ctl->progress - start;
6077                         ret = btrfs_remove_free_space(block_group,
6078                                                       start, num_bytes);
6079                         if (ret)
6080                                 goto out_lock;
6081
6082                         num_bytes = (start + num_bytes) -
6083                                 caching_ctl->progress;
6084                         start = caching_ctl->progress;
6085                         ret = add_excluded_extent(root, start, num_bytes);
6086                 }
6087 out_lock:
6088                 mutex_unlock(&caching_ctl->mutex);
6089                 put_caching_control(caching_ctl);
6090         }
6091         btrfs_put_block_group(block_group);
6092         return ret;
6093 }
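
/*
 * Example of the straddling case above (illustrative offsets): if the block
 * group is cached up to progress = objectid + 256MiB and the logged extent
 * covers [objectid + 200MiB, objectid + 300MiB), the first 56MiB are already
 * in the free space cache and get removed directly, while the remaining
 * 44MiB beyond progress are recorded as an excluded range so the caching
 * thread will skip them.
 */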
6094
6095 int btrfs_exclude_logged_extents(struct btrfs_root *log,
6096                                  struct extent_buffer *eb)
6097 {
6098         struct btrfs_file_extent_item *item;
6099         struct btrfs_key key;
6100         int found_type;
6101         int i;
6102
6103         if (!btrfs_fs_incompat(log->fs_info, MIXED_GROUPS))
6104                 return 0;
6105
6106         for (i = 0; i < btrfs_header_nritems(eb); i++) {
6107                 btrfs_item_key_to_cpu(eb, &key, i);
6108                 if (key.type != BTRFS_EXTENT_DATA_KEY)
6109                         continue;
6110                 item = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item);
6111                 found_type = btrfs_file_extent_type(eb, item);
6112                 if (found_type == BTRFS_FILE_EXTENT_INLINE)
6113                         continue;
6114                 if (btrfs_file_extent_disk_bytenr(eb, item) == 0)
6115                         continue;
6116                 key.objectid = btrfs_file_extent_disk_bytenr(eb, item);
6117                 key.offset = btrfs_file_extent_disk_num_bytes(eb, item);
6118                 __exclude_logged_extent(log, key.objectid, key.offset);
6119         }
6120
6121         return 0;
6122 }
6123
6124 /**
6125  * btrfs_update_reserved_bytes - update the block_group and space info counters
6126  * @cache:      The cache we are manipulating
6127  * @num_bytes:  The number of bytes in question
6128  * @reserve:    One of the reservation enums
6129  * @delalloc:   The blocks are allocated for the delalloc write
6130  *
6131  * This is called by the allocator when it reserves space, or by somebody who is
6132  * freeing space that was never actually used on disk.  For example if you
6133  * reserve some space for a new leaf in transaction A and before transaction A
6134  * commits you free that leaf, you call this with reserve set to 0 in order to
6135  * clear the reservation.
6136  *
6137  * Metadata reservations should be called with RESERVE_ALLOC so we do the proper
6138  * ENOSPC accounting.  For data we handle the reservation through clearing the
6139  * delalloc bits in the io_tree.  We have to do this since we could end up
6140  * allocating less disk space for the amount of data we have reserved in the
6141  * case of compression.
6142  *
6143  * If this is a reservation and the block group has become read only we cannot
6144  * make the reservation and return -EAGAIN, otherwise this function always
6145  * succeeds.
6146  */
6147 static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
6148                                        u64 num_bytes, int reserve, int delalloc)
6149 {
6150         struct btrfs_space_info *space_info = cache->space_info;
6151         int ret = 0;
6152
6153         spin_lock(&space_info->lock);
6154         spin_lock(&cache->lock);
6155         if (reserve != RESERVE_FREE) {
6156                 if (cache->ro) {
6157                         ret = -EAGAIN;
6158                 } else {
6159                         cache->reserved += num_bytes;
6160                         space_info->bytes_reserved += num_bytes;
6161                         if (reserve == RESERVE_ALLOC) {
6162                                 trace_btrfs_space_reservation(cache->fs_info,
6163                                                 "space_info", space_info->flags,
6164                                                 num_bytes, 0);
6165                                 space_info->bytes_may_use -= num_bytes;
6166                         }
6167
6168                         if (delalloc)
6169                                 cache->delalloc_bytes += num_bytes;
6170                 }
6171         } else {
6172                 if (cache->ro)
6173                         space_info->bytes_readonly += num_bytes;
6174                 cache->reserved -= num_bytes;
6175                 space_info->bytes_reserved -= num_bytes;
6176
6177                 if (delalloc)
6178                         cache->delalloc_bytes -= num_bytes;
6179         }
6180         spin_unlock(&cache->lock);
6181         spin_unlock(&space_info->lock);
6182         return ret;
6183 }
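
/*
 * Example of the transitions above (illustrative size): reserving 16KiB for
 * a new metadata leaf with RESERVE_ALLOC moves those bytes from
 * bytes_may_use to bytes_reserved (both in the space_info and in
 * cache->reserved), or fails with -EAGAIN if the group has gone read-only;
 * RESERVE_ALLOC_NO_ACCOUNT takes the same path but leaves bytes_may_use
 * alone.  If that leaf is freed again before it is ever written, the caller
 * undoes the reservation with RESERVE_FREE, which also credits
 * bytes_readonly when the group is read-only by then.
 */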
6184
6185 void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
6186                                 struct btrfs_root *root)
6187 {
6188         struct btrfs_fs_info *fs_info = root->fs_info;
6189         struct btrfs_caching_control *next;
6190         struct btrfs_caching_control *caching_ctl;
6191         struct btrfs_block_group_cache *cache;
6192
6193         down_write(&fs_info->commit_root_sem);
6194
6195         list_for_each_entry_safe(caching_ctl, next,
6196                                  &fs_info->caching_block_groups, list) {
6197                 cache = caching_ctl->block_group;
6198                 if (block_group_cache_done(cache)) {
6199                         cache->last_byte_to_unpin = (u64)-1;
6200                         list_del_init(&caching_ctl->list);
6201                         put_caching_control(caching_ctl);
6202                 } else {
6203                         cache->last_byte_to_unpin = caching_ctl->progress;
6204                 }
6205         }
6206
6207         if (fs_info->pinned_extents == &fs_info->freed_extents[0])
6208                 fs_info->pinned_extents = &fs_info->freed_extents[1];
6209         else
6210                 fs_info->pinned_extents = &fs_info->freed_extents[0];
6211
6212         up_write(&fs_info->commit_root_sem);
6213
6214         update_global_block_rsv(fs_info);
6215 }
6216
6217 /*
6218  * Returns the free cluster for the given space info and sets empty_cluster to
6219  * what it should be based on the mount options.
6220  */
6221 static struct btrfs_free_cluster *
6222 fetch_cluster_info(struct btrfs_root *root, struct btrfs_space_info *space_info,
6223                    u64 *empty_cluster)
6224 {
6225         struct btrfs_free_cluster *ret = NULL;
6226         bool ssd = btrfs_test_opt(root, SSD);
6227
6228         *empty_cluster = 0;
6229         if (btrfs_mixed_space_info(space_info))
6230                 return ret;
6231
6232         if (ssd)
6233                 *empty_cluster = 2 * 1024 * 1024;
6234         if (space_info->flags & BTRFS_BLOCK_GROUP_METADATA) {
6235                 ret = &root->fs_info->meta_alloc_cluster;
6236                 if (!ssd)
6237                         *empty_cluster = 64 * 1024;
6238         } else if ((space_info->flags & BTRFS_BLOCK_GROUP_DATA) && ssd) {
6239                 ret = &root->fs_info->data_alloc_cluster;
6240         }
6241
6242         return ret;
6243 }
6244
6245 static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end,
6246                               const bool return_free_space)
6247 {
6248         struct btrfs_fs_info *fs_info = root->fs_info;
6249         struct btrfs_block_group_cache *cache = NULL;
6250         struct btrfs_space_info *space_info;
6251         struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
6252         struct btrfs_free_cluster *cluster = NULL;
6253         u64 len;
6254         u64 total_unpinned = 0;
6255         u64 empty_cluster = 0;
6256         bool readonly;
6257
6258         while (start <= end) {
6259                 readonly = false;
6260                 if (!cache ||
6261                     start >= cache->key.objectid + cache->key.offset) {
6262                         if (cache)
6263                                 btrfs_put_block_group(cache);
6264                         total_unpinned = 0;
6265                         cache = btrfs_lookup_block_group(fs_info, start);
6266                         BUG_ON(!cache); /* Logic error */
6267
6268                         cluster = fetch_cluster_info(root,
6269                                                      cache->space_info,
6270                                                      &empty_cluster);
6271                         empty_cluster <<= 1;
6272                 }
6273
6274                 len = cache->key.objectid + cache->key.offset - start;
6275                 len = min(len, end + 1 - start);
6276
6277                 if (start < cache->last_byte_to_unpin) {
6278                         len = min(len, cache->last_byte_to_unpin - start);
6279                         if (return_free_space)
6280                                 btrfs_add_free_space(cache, start, len);
6281                 }
6282
6283                 start += len;
6284                 total_unpinned += len;
6285                 space_info = cache->space_info;
6286
6287                 /*
6288                  * If this space cluster has been marked as fragmented and we've
6289                  * unpinned enough in this block group to potentially allow a
6290                  * cluster to be created inside of it go ahead and clear the
6291                  * fragmented check.
6292                  */
6293                 if (cluster && cluster->fragmented &&
6294                     total_unpinned > empty_cluster) {
6295                         spin_lock(&cluster->lock);
6296                         cluster->fragmented = 0;
6297                         spin_unlock(&cluster->lock);
6298                 }
6299
6300                 spin_lock(&space_info->lock);
6301                 spin_lock(&cache->lock);
6302                 cache->pinned -= len;
6303                 space_info->bytes_pinned -= len;
6304                 space_info->max_extent_size = 0;
6305                 percpu_counter_add(&space_info->total_bytes_pinned, -len);
6306                 if (cache->ro) {
6307                         space_info->bytes_readonly += len;
6308                         readonly = true;
6309                 }
6310                 spin_unlock(&cache->lock);
6311                 if (!readonly && global_rsv->space_info == space_info) {
6312                         spin_lock(&global_rsv->lock);
6313                         if (!global_rsv->full) {
6314                                 len = min(len, global_rsv->size -
6315                                           global_rsv->reserved);
6316                                 global_rsv->reserved += len;
6317                                 space_info->bytes_may_use += len;
6318                                 if (global_rsv->reserved >= global_rsv->size)
6319                                         global_rsv->full = 1;
6320                         }
6321                         spin_unlock(&global_rsv->lock);
6322                 }
6323                 spin_unlock(&space_info->lock);
6324         }
6325
6326         if (cache)
6327                 btrfs_put_block_group(cache);
6328         return 0;
6329 }
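
/*
 * Refill example for the loop above (illustrative sizes): unpinning 4MiB of
 * metadata while the global reserve is 1MiB short of its target drops
 * pinned/bytes_pinned by 4MiB, hands 1MiB back to global_rsv->reserved (and
 * accounts it in bytes_may_use), and simply lets the remaining 3MiB become
 * free space; for a read-only block group the bytes go to bytes_readonly
 * instead.
 */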
6330
6331 int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
6332                                struct btrfs_root *root)
6333 {
6334         struct btrfs_fs_info *fs_info = root->fs_info;
6335         struct btrfs_block_group_cache *block_group, *tmp;
6336         struct list_head *deleted_bgs;
6337         struct extent_io_tree *unpin;
6338         u64 start;
6339         u64 end;
6340         int ret;
6341
6342         if (fs_info->pinned_extents == &fs_info->freed_extents[0])
6343                 unpin = &fs_info->freed_extents[1];
6344         else
6345                 unpin = &fs_info->freed_extents[0];
6346
6347         while (!trans->aborted) {
6348                 mutex_lock(&fs_info->unused_bg_unpin_mutex);
6349                 ret = find_first_extent_bit(unpin, 0, &start, &end,
6350                                             EXTENT_DIRTY, NULL);
6351                 if (ret) {
6352                         mutex_unlock(&fs_info->unused_bg_unpin_mutex);
6353                         break;
6354                 }
6355
6356                 if (btrfs_test_opt(root, DISCARD))
6357                         ret = btrfs_discard_extent(root, start,
6358                                                    end + 1 - start, NULL);
6359
6360                 clear_extent_dirty(unpin, start, end, GFP_NOFS);
6361                 unpin_extent_range(root, start, end, true);
6362                 mutex_unlock(&fs_info->unused_bg_unpin_mutex);
6363                 cond_resched();
6364         }
6365
6366         /*
6367          * Transaction is finished.  We don't need the lock anymore.  We
6368          * do need to clean up the block groups in case of a transaction
6369          * abort.
6370          */
6371         deleted_bgs = &trans->transaction->deleted_bgs;
6372         list_for_each_entry_safe(block_group, tmp, deleted_bgs, bg_list) {
6373                 u64 trimmed = 0;
6374
6375                 ret = -EROFS;
6376                 if (!trans->aborted)
6377                         ret = btrfs_discard_extent(root,
6378                                                    block_group->key.objectid,
6379                                                    block_group->key.offset,
6380                                                    &trimmed);
6381
6382                 list_del_init(&block_group->bg_list);
6383                 btrfs_put_block_group_trimming(block_group);
6384                 btrfs_put_block_group(block_group);
6385
6386                 if (ret) {
6387                         const char *errstr = btrfs_decode_error(ret);
6388                         btrfs_warn(fs_info,
6389                                    "Discard failed while removing blockgroup: errno=%d %s",
6390                                    ret, errstr);
6391                 }
6392         }
6393
6394         return 0;
6395 }
6396
6397 static void add_pinned_bytes(struct btrfs_fs_info *fs_info, u64 num_bytes,
6398                              u64 owner, u64 root_objectid)
6399 {
6400         struct btrfs_space_info *space_info;
6401         u64 flags;
6402
6403         if (owner < BTRFS_FIRST_FREE_OBJECTID) {
6404                 if (root_objectid == BTRFS_CHUNK_TREE_OBJECTID)
6405                         flags = BTRFS_BLOCK_GROUP_SYSTEM;
6406                 else
6407                         flags = BTRFS_BLOCK_GROUP_METADATA;
6408         } else {
6409                 flags = BTRFS_BLOCK_GROUP_DATA;
6410         }
6411
6412         space_info = __find_space_info(fs_info, flags);
6413         BUG_ON(!space_info); /* Logic bug */
6414         percpu_counter_add(&space_info->total_bytes_pinned, num_bytes);
6415 }
6416
6417
6418 static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
6419                                 struct btrfs_root *root,
6420                                 struct btrfs_delayed_ref_node *node, u64 parent,
6421                                 u64 root_objectid, u64 owner_objectid,
6422                                 u64 owner_offset, int refs_to_drop,
6423                                 struct btrfs_delayed_extent_op *extent_op)
6424 {
6425         struct btrfs_key key;
6426         struct btrfs_path *path;
6427         struct btrfs_fs_info *info = root->fs_info;
6428         struct btrfs_root *extent_root = info->extent_root;
6429         struct extent_buffer *leaf;
6430         struct btrfs_extent_item *ei;
6431         struct btrfs_extent_inline_ref *iref;
6432         int ret;
6433         int is_data;
6434         int extent_slot = 0;
6435         int found_extent = 0;
6436         int num_to_del = 1;
6437         u32 item_size;
6438         u64 refs;
6439         u64 bytenr = node->bytenr;
6440         u64 num_bytes = node->num_bytes;
6441         int last_ref = 0;
6442         bool skinny_metadata = btrfs_fs_incompat(root->fs_info,
6443                                                  SKINNY_METADATA);
6444
6445         path = btrfs_alloc_path();
6446         if (!path)
6447                 return -ENOMEM;
6448
6449         path->reada = 1;
6450         path->leave_spinning = 1;
6451
6452         is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID;
6453         BUG_ON(!is_data && refs_to_drop != 1);
6454
6455         if (is_data)
6456                 skinny_metadata = 0;
6457
6458         ret = lookup_extent_backref(trans, extent_root, path, &iref,
6459                                     bytenr, num_bytes, parent,
6460                                     root_objectid, owner_objectid,
6461                                     owner_offset);
6462         if (ret == 0) {
6463                 extent_slot = path->slots[0];
6464                 while (extent_slot >= 0) {
6465                         btrfs_item_key_to_cpu(path->nodes[0], &key,
6466                                               extent_slot);
6467                         if (key.objectid != bytenr)
6468                                 break;
6469                         if (key.type == BTRFS_EXTENT_ITEM_KEY &&
6470                             key.offset == num_bytes) {
6471                                 found_extent = 1;
6472                                 break;
6473                         }
6474                         if (key.type == BTRFS_METADATA_ITEM_KEY &&
6475                             key.offset == owner_objectid) {
6476                                 found_extent = 1;
6477                                 break;
6478                         }
6479                         if (path->slots[0] - extent_slot > 5)
6480                                 break;
6481                         extent_slot--;
6482                 }
6483 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
6484                 item_size = btrfs_item_size_nr(path->nodes[0], extent_slot);
6485                 if (found_extent && item_size < sizeof(*ei))
6486                         found_extent = 0;
6487 #endif
6488                 if (!found_extent) {
6489                         BUG_ON(iref);
6490                         ret = remove_extent_backref(trans, extent_root, path,
6491                                                     NULL, refs_to_drop,
6492                                                     is_data, &last_ref);
6493                         if (ret) {
6494                                 btrfs_abort_transaction(trans, extent_root, ret);
6495                                 goto out;
6496                         }
6497                         btrfs_release_path(path);
6498                         path->leave_spinning = 1;
6499
6500                         key.objectid = bytenr;
6501                         key.type = BTRFS_EXTENT_ITEM_KEY;
6502                         key.offset = num_bytes;
6503
6504                         if (!is_data && skinny_metadata) {
6505                                 key.type = BTRFS_METADATA_ITEM_KEY;
6506                                 key.offset = owner_objectid;
6507                         }
6508
6509                         ret = btrfs_search_slot(trans, extent_root,
6510                                                 &key, path, -1, 1);
6511                         if (ret > 0 && skinny_metadata && path->slots[0]) {
6512                                 /*
6513                                  * Couldn't find our skinny metadata item,
6514                                  * see if we have ye olde extent item.
6515                                  */
6516                                 path->slots[0]--;
6517                                 btrfs_item_key_to_cpu(path->nodes[0], &key,
6518                                                       path->slots[0]);
6519                                 if (key.objectid == bytenr &&
6520                                     key.type == BTRFS_EXTENT_ITEM_KEY &&
6521                                     key.offset == num_bytes)
6522                                         ret = 0;
6523                         }
6524
6525                         if (ret > 0 && skinny_metadata) {
6526                                 skinny_metadata = false;
6527                                 key.objectid = bytenr;
6528                                 key.type = BTRFS_EXTENT_ITEM_KEY;
6529                                 key.offset = num_bytes;
6530                                 btrfs_release_path(path);
6531                                 ret = btrfs_search_slot(trans, extent_root,
6532                                                         &key, path, -1, 1);
6533                         }
6534
6535                         if (ret) {
6536                                 btrfs_err(info, "umm, got %d back from search, was looking for %llu",
6537                                         ret, bytenr);
6538                                 if (ret > 0)
6539                                         btrfs_print_leaf(extent_root,
6540                                                          path->nodes[0]);
6541                         }
6542                         if (ret < 0) {
6543                                 btrfs_abort_transaction(trans, extent_root, ret);
6544                                 goto out;
6545                         }
6546                         extent_slot = path->slots[0];
6547                 }
6548         } else if (WARN_ON(ret == -ENOENT)) {
6549                 btrfs_print_leaf(extent_root, path->nodes[0]);
6550                 btrfs_err(info,
6551                         "unable to find ref byte nr %llu parent %llu root %llu  owner %llu offset %llu",
6552                         bytenr, parent, root_objectid, owner_objectid,
6553                         owner_offset);
6554                 btrfs_abort_transaction(trans, extent_root, ret);
6555                 goto out;
6556         } else {
6557                 btrfs_abort_transaction(trans, extent_root, ret);
6558                 goto out;
6559         }
6560
6561         leaf = path->nodes[0];
6562         item_size = btrfs_item_size_nr(leaf, extent_slot);
6563 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
6564         if (item_size < sizeof(*ei)) {
6565                 BUG_ON(found_extent || extent_slot != path->slots[0]);
6566                 ret = convert_extent_item_v0(trans, extent_root, path,
6567                                              owner_objectid, 0);
6568                 if (ret < 0) {
6569                         btrfs_abort_transaction(trans, extent_root, ret);
6570                         goto out;
6571                 }
6572
6573                 btrfs_release_path(path);
6574                 path->leave_spinning = 1;
6575
6576                 key.objectid = bytenr;
6577                 key.type = BTRFS_EXTENT_ITEM_KEY;
6578                 key.offset = num_bytes;
6579
6580                 ret = btrfs_search_slot(trans, extent_root, &key, path,
6581                                         -1, 1);
6582                 if (ret) {
6583                         btrfs_err(info, "umm, got %d back from search, was looking for %llu",
6584                                 ret, bytenr);
6585                         btrfs_print_leaf(extent_root, path->nodes[0]);
6586                 }
6587                 if (ret < 0) {
6588                         btrfs_abort_transaction(trans, extent_root, ret);
6589                         goto out;
6590                 }
6591
6592                 extent_slot = path->slots[0];
6593                 leaf = path->nodes[0];
6594                 item_size = btrfs_item_size_nr(leaf, extent_slot);
6595         }
6596 #endif
6597         BUG_ON(item_size < sizeof(*ei));
6598         ei = btrfs_item_ptr(leaf, extent_slot,
6599                             struct btrfs_extent_item);
6600         if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID &&
6601             key.type == BTRFS_EXTENT_ITEM_KEY) {
6602                 struct btrfs_tree_block_info *bi;
6603                 BUG_ON(item_size < sizeof(*ei) + sizeof(*bi));
6604                 bi = (struct btrfs_tree_block_info *)(ei + 1);
6605                 WARN_ON(owner_objectid != btrfs_tree_block_level(leaf, bi));
6606         }
6607
6608         refs = btrfs_extent_refs(leaf, ei);
6609         if (refs < refs_to_drop) {
6610                 btrfs_err(info, "trying to drop %d refs but we only have %Lu "
6611                           "for bytenr %Lu", refs_to_drop, refs, bytenr);
6612                 ret = -EINVAL;
6613                 btrfs_abort_transaction(trans, extent_root, ret);
6614                 goto out;
6615         }
6616         refs -= refs_to_drop;
6617
6618         if (refs > 0) {
6619                 if (extent_op)
6620                         __run_delayed_extent_op(extent_op, leaf, ei);
6621                 /*
6622                  * In the case of inline back ref, reference count will
6623                  * be updated by remove_extent_backref
6624                  */
6625                 if (iref) {
6626                         BUG_ON(!found_extent);
6627                 } else {
6628                         btrfs_set_extent_refs(leaf, ei, refs);
6629                         btrfs_mark_buffer_dirty(leaf);
6630                 }
6631                 if (found_extent) {
6632                         ret = remove_extent_backref(trans, extent_root, path,
6633                                                     iref, refs_to_drop,
6634                                                     is_data, &last_ref);
6635                         if (ret) {
6636                                 btrfs_abort_transaction(trans, extent_root, ret);
6637                                 goto out;
6638                         }
6639                 }
6640                 add_pinned_bytes(root->fs_info, -num_bytes, owner_objectid,
6641                                  root_objectid);
6642         } else {
6643                 if (found_extent) {
6644                         BUG_ON(is_data && refs_to_drop !=
6645                                extent_data_ref_count(path, iref));
6646                         if (iref) {
6647                                 BUG_ON(path->slots[0] != extent_slot);
6648                         } else {
6649                                 BUG_ON(path->slots[0] != extent_slot + 1);
6650                                 path->slots[0] = extent_slot;
6651                                 num_to_del = 2;
6652                         }
6653                 }
6654
6655                 last_ref = 1;
6656                 ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
6657                                       num_to_del);
6658                 if (ret) {
6659                         btrfs_abort_transaction(trans, extent_root, ret);
6660                         goto out;
6661                 }
6662                 btrfs_release_path(path);
6663
6664                 if (is_data) {
6665                         ret = btrfs_del_csums(trans, root, bytenr, num_bytes);
6666                         if (ret) {
6667                                 btrfs_abort_transaction(trans, extent_root, ret);
6668                                 goto out;
6669                         }
6670                 }
6671
6672                 ret = update_block_group(trans, root, bytenr, num_bytes, 0);
6673                 if (ret) {
6674                         btrfs_abort_transaction(trans, extent_root, ret);
6675                         goto out;
6676                 }
6677         }
6678         btrfs_release_path(path);
6679
6680 out:
6681         btrfs_free_path(path);
6682         return ret;
6683 }
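/*
 * Recap of the two branches above: while other references remain, only the
 * ref count is updated (remove_extent_backref() handles that itself for
 * inline backrefs) and the dropped backref is removed; once the count reaches
 * zero the extent item itself is deleted (two items when a separate backref
 * item follows it), checksums are dropped for data extents, and
 * update_block_group() releases the space.
 */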
6684
6685 /*
6686  * when we free a block, it is possible (and likely) that we free the last
6687  * delayed ref for that extent as well.  This searches the delayed ref tree for
6688  * a given extent, and if there are no other delayed refs to be processed, it
6689  * removes it from the tree.
6690  */
6691 static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
6692                                       struct btrfs_root *root, u64 bytenr)
6693 {
6694         struct btrfs_delayed_ref_head *head;
6695         struct btrfs_delayed_ref_root *delayed_refs;
6696         int ret = 0;
6697
6698         delayed_refs = &trans->transaction->delayed_refs;
6699         spin_lock(&delayed_refs->lock);
6700         head = btrfs_find_delayed_ref_head(trans, bytenr);
6701         if (!head)
6702                 goto out_delayed_unlock;
6703
6704         spin_lock(&head->lock);
6705         if (!list_empty(&head->ref_list))
6706                 goto out;
6707
6708         if (head->extent_op) {
6709                 if (!head->must_insert_reserved)
6710                         goto out;
6711                 btrfs_free_delayed_extent_op(head->extent_op);
6712                 head->extent_op = NULL;
6713         }
6714
6715         /*
6716          * waiting for the lock here would deadlock.  If someone else has it
6717          * locked, they are already in the process of dropping it anyway.
6718          */
6719         if (!mutex_trylock(&head->mutex))
6720                 goto out;
6721
6722         /*
6723          * at this point we have a head with no other entries.  Go
6724          * ahead and process it.
6725          */
6726         head->node.in_tree = 0;
6727         rb_erase(&head->href_node, &delayed_refs->href_root);
6728
6729         atomic_dec(&delayed_refs->num_entries);
6730
6731         /*
6732          * we don't take a ref on the node because we're removing it from the
6733          * tree, so we just steal the ref the tree was holding.
6734          */
6735         delayed_refs->num_heads--;
6736         if (head->processing == 0)
6737                 delayed_refs->num_heads_ready--;
6738         head->processing = 0;
6739         spin_unlock(&head->lock);
6740         spin_unlock(&delayed_refs->lock);
6741
6742         BUG_ON(head->extent_op);
6743         if (head->must_insert_reserved)
6744                 ret = 1;
6745
6746         mutex_unlock(&head->mutex);
6747         btrfs_put_delayed_ref(&head->node);
6748         return ret;
6749 out:
6750         spin_unlock(&head->lock);
6751
6752 out_delayed_unlock:
6753         spin_unlock(&delayed_refs->lock);
6754         return 0;
6755 }
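/*
 * Note on the return value: check_ref_cleanup() returns 1 only when it
 * managed to remove the last delayed ref head for @bytenr and that head still
 * had must_insert_reserved set; every other outcome returns 0 and leaves the
 * space to the regular delayed ref / pinning paths, which is what
 * btrfs_free_tree_block() below relies on.
 */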
6756
6757 void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
6758                            struct btrfs_root *root,
6759                            struct extent_buffer *buf,
6760                            u64 parent, int last_ref)
6761 {
6762         int pin = 1;
6763         int ret;
6764
6765         if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
6766                 ret = btrfs_add_delayed_tree_ref(root->fs_info, trans,
6767                                         buf->start, buf->len,
6768                                         parent, root->root_key.objectid,
6769                                         btrfs_header_level(buf),
6770                                         BTRFS_DROP_DELAYED_REF, NULL);
6771                 BUG_ON(ret); /* -ENOMEM */
6772         }
6773
6774         if (!last_ref)
6775                 return;
6776
6777         if (btrfs_header_generation(buf) == trans->transid) {
6778                 struct btrfs_block_group_cache *cache;
6779
6780                 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
6781                         ret = check_ref_cleanup(trans, root, buf->start);
6782                         if (!ret)
6783                                 goto out;
6784                 }
6785
6786                 cache = btrfs_lookup_block_group(root->fs_info, buf->start);
6787
6788                 if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
6789                         pin_down_extent(root, cache, buf->start, buf->len, 1);
6790                         btrfs_put_block_group(cache);
6791                         goto out;
6792                 }
6793
6794                 WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags));
6795
6796                 btrfs_add_free_space(cache, buf->start, buf->len);
6797                 btrfs_update_reserved_bytes(cache, buf->len, RESERVE_FREE, 0);
6798                 btrfs_put_block_group(cache);
6799                 trace_btrfs_reserved_extent_free(root, buf->start, buf->len);
6800                 pin = 0;
6801         }
6802 out:
6803         if (pin)
6804                 add_pinned_bytes(root->fs_info, buf->len,
6805                                  btrfs_header_level(buf),
6806                                  root->root_key.objectid);
6807
6808         /*
6809          * Deleting the buffer, clear the corrupt flag since it doesn't matter
6810          * anymore.
6811          */
6812         clear_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags);
6813 }
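/*
 * Minimal shape of a call to btrfs_free_tree_block() (illustrative sketch
 * only, kept disabled; the variable names are made up).  @buf is a tree block
 * this root no longer references, @parent is the parent block's bytenr for
 * shared backrefs or 0 otherwise, and last_ref says whether this was the
 * final reference.
 */
#if 0
	btrfs_free_tree_block(trans, root, buf, parent_bytenr, 1 /* last_ref */);
#endif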
6814
6815 /* Can return -ENOMEM */
6816 int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root,
6817                       u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid,
6818                       u64 owner, u64 offset)
6819 {
6820         int ret;
6821         struct btrfs_fs_info *fs_info = root->fs_info;
6822
6823         if (btrfs_test_is_dummy_root(root))
6824                 return 0;
6825
6826         add_pinned_bytes(root->fs_info, num_bytes, owner, root_objectid);
6827
6828         /*
6829          * tree log blocks never actually go into the extent allocation
6830          * tree, just update pinning info and exit early.
6831          */
6832         if (root_objectid == BTRFS_TREE_LOG_OBJECTID) {
6833                 WARN_ON(owner >= BTRFS_FIRST_FREE_OBJECTID);
6834                 /* unlocks the pinned mutex */
6835                 btrfs_pin_extent(root, bytenr, num_bytes, 1);
6836                 ret = 0;
6837         } else if (owner < BTRFS_FIRST_FREE_OBJECTID) {
6838                 ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr,
6839                                         num_bytes,
6840                                         parent, root_objectid, (int)owner,
6841                                         BTRFS_DROP_DELAYED_REF, NULL);
6842         } else {
6843                 ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr,
6844                                                 num_bytes,
6845                                                 parent, root_objectid, owner,
6846                                                 offset, 0,
6847                                                 BTRFS_DROP_DELAYED_REF, NULL);
6848         }
6849         return ret;
6850 }
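/*
 * Shape of a typical data-extent call (illustrative sketch only, kept
 * disabled; the variable names are made up).  Metadata owners
 * (< BTRFS_FIRST_FREE_OBJECTID) and tree log blocks take the other two
 * branches above.
 */
#if 0
	ret = btrfs_free_extent(trans, root,
				extent_start, extent_len,
				0,				/* parent: keyed, not shared */
				root->root_key.objectid,	/* root holding the ref */
				inode_objectid,			/* owner */
				file_offset);			/* offset key for the data ref */
	if (ret)					/* -ENOMEM, per the comment above */
		btrfs_abort_transaction(trans, root, ret);
#endif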
6851
6852 /*
6853  * when we wait for progress in the block group caching, it's because
6854  * our allocation attempt failed at least once.  So, we must sleep
6855  * and let some progress happen before we try again.
6856  *
6857  * This function will sleep at least once waiting for new free space to
6858  * show up, and then it will check the block group free space numbers
6859  * for our min num_bytes.  Another option is to have it go ahead
6860  * and look in the rbtree for a free extent of a given size, but this
6861  * is a good start.
6862  *
6863  * Callers of this must check if cache->cached == BTRFS_CACHE_ERROR before using
6864  * any of the information in this block group.
6865  */
6866 static noinline void
6867 wait_block_group_cache_progress(struct btrfs_block_group_cache *cache,
6868                                 u64 num_bytes)
6869 {
6870         struct btrfs_caching_control *caching_ctl;
6871
6872         caching_ctl = get_caching_control(cache);
6873         if (!caching_ctl)
6874                 return;
6875
6876         wait_event(caching_ctl->wait, block_group_cache_done(cache) ||
6877                    (cache->free_space_ctl->free_space >= num_bytes));
6878
6879         put_caching_control(caching_ctl);
6880 }
6881
6882 static noinline int
6883 wait_block_group_cache_done(struct btrfs_block_group_cache *cache)
6884 {
6885         struct btrfs_caching_control *caching_ctl;
6886         int ret = 0;
6887
6888         caching_ctl = get_caching_control(cache);
6889         if (!caching_ctl)
6890                 return (cache->cached == BTRFS_CACHE_ERROR) ? -EIO : 0;
6891
6892         wait_event(caching_ctl->wait, block_group_cache_done(cache));
6893         if (cache->cached == BTRFS_CACHE_ERROR)
6894                 ret = -EIO;
6895         put_caching_control(caching_ctl);
6896         return ret;
6897 }
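/*
 * The two waiters above differ in how caching errors surface:
 * wait_block_group_cache_progress() is void, so per the comment above it the
 * caller must check cache->cached == BTRFS_CACHE_ERROR itself, while
 * wait_block_group_cache_done() folds that check into a -EIO return
 * (illustrative sketch, disabled):
 */
#if 0
	ret = wait_block_group_cache_done(cache);
	if (ret)	/* -EIO: the caching thread failed, skip this group */
		return ret;
#endif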
6898
6899 int __get_raid_index(u64 flags)
6900 {
6901         if (flags & BTRFS_BLOCK_GROUP_RAID10)
6902                 return BTRFS_RAID_RAID10;
6903         else if (flags & BTRFS_BLOCK_GROUP_RAID1)
6904                 return BTRFS_RAID_RAID1;
6905         else if (flags & BTRFS_BLOCK_GROUP_DUP)
6906                 return BTRFS_RAID_DUP;
6907         else if (flags & BTRFS_BLOCK_GROUP_RAID0)
6908                 return BTRFS_RAID_RAID0;
6909         else if (flags & BTRFS_BLOCK_GROUP_RAID5)
6910                 return BTRFS_RAID_RAID5;
6911         else if (flags & BTRFS_BLOCK_GROUP_RAID6)
6912                 return BTRFS_RAID_RAID6;
6913
6914         return BTRFS_RAID_SINGLE; /* BTRFS_BLOCK_GROUP_SINGLE */
6915 }
6916
6917 int get_block_group_index(struct btrfs_block_group_cache *cache)
6918 {
6919         return __get_raid_index(cache->flags);
6920 }
6921
6922 static const char *btrfs_raid_type_names[BTRFS_NR_RAID_TYPES] = {
6923         [BTRFS_RAID_RAID10]     = "raid10",
6924         [BTRFS_RAID_RAID1]      = "raid1",
6925         [BTRFS_RAID_DUP]        = "dup",
6926         [BTRFS_RAID_RAID0]      = "raid0",
6927         [BTRFS_RAID_SINGLE]     = "single",
6928         [BTRFS_RAID_RAID5]      = "raid5",
6929         [BTRFS_RAID_RAID6]      = "raid6",
6930 };
6931
6932 static const char *get_raid_name(enum btrfs_raid_types type)
6933 {
6934         if (type >= BTRFS_NR_RAID_TYPES)
6935                 return NULL;
6936
6937         return btrfs_raid_type_names[type];
6938 }
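/*
 * The index and name helpers are meant to be used together, e.g. to turn a
 * block group's flags into something printable (illustrative sketch,
 * disabled):
 */
#if 0
	int index = get_block_group_index(cache);
	const char *name = get_raid_name(index);	/* "raid1", "single", ... */
#endif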
6939
6940 enum btrfs_loop_type {
6941         LOOP_CACHING_NOWAIT = 0,
6942         LOOP_CACHING_WAIT = 1,
6943         LOOP_ALLOC_CHUNK = 2,
6944         LOOP_NO_EMPTY_SIZE = 3,
6945 };
6946
6947 static inline void
6948 btrfs_lock_block_group(struct btrfs_block_group_cache *cache,
6949                        int delalloc)
6950 {
6951         if (delalloc)
6952                 down_read(&cache->data_rwsem);
6953 }
6954
6955 static inline void
6956 btrfs_grab_block_group(struct btrfs_block_group_cache *cache,
6957                        int delalloc)
6958 {
6959         btrfs_get_block_group(cache);
6960         if (delalloc)
6961                 down_read(&cache->data_rwsem);
6962 }
6963
6964 static struct btrfs_block_group_cache *
6965 btrfs_lock_cluster(struct btrfs_block_group_cache *block_group,
6966                    struct btrfs_free_cluster *cluster,
6967                    int delalloc)
6968 {
6969         struct btrfs_block_group_cache *used_bg;
6970         bool locked = false;
6971 again:
6972         spin_lock(&cluster->refill_lock);
6973         if (locked) {
6974                 if (used_bg == cluster->block_group)
6975                         return used_bg;
6976
6977                 up_read(&used_bg->data_rwsem);
6978                 btrfs_put_block_group(used_bg);
6979         }
6980
6981         used_bg = cluster->block_group;
6982         if (!used_bg)
6983                 return NULL;
6984
6985         if (used_bg == block_group)
6986                 return used_bg;
6987
6988         btrfs_get_block_group(used_bg);
6989
6990         if (!delalloc)
6991                 return used_bg;
6992
6993         if (down_read_trylock(&used_bg->data_rwsem))
6994                 return used_bg;
6995
6996         spin_unlock(&cluster->refill_lock);
6997         down_read(&used_bg->data_rwsem);
6998         locked = true;
6999         goto again;
7000 }
7001
7002 static inline void
7003 btrfs_release_block_group(struct btrfs_block_group_cache *cache,
7004                          int delalloc)
7005 {
7006         if (delalloc)
7007                 up_read(&cache->data_rwsem);
7008         btrfs_put_block_group(cache);
7009 }
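/*
 * These helpers keep the delalloc exclusion symmetric: when @delalloc is set
 * they take and release a read lock on the block group's data_rwsem around
 * the allocation, and btrfs_release_block_group() also drops the block group
 * reference.  find_free_extent() below pairs them with the same @delalloc
 * value.
 */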
7010
7011 /*
7012  * walks the btree of allocated extents and finds a hole of a given size.
7013  * The key ins is changed to record the hole:
7014  * ins->objectid == start position
7015  * ins->flags = BTRFS_EXTENT_ITEM_KEY
7016  * ins->offset == the size of the hole.
7017  * Any available blocks before search_start are skipped.
7018  *
7019  * If there is no suitable free space, we will record the size of the
7020  * largest free space extent we saw instead.
7021  */
7022 static noinline int find_free_extent(struct btrfs_root *orig_root,
7023                                      u64 num_bytes, u64 empty_size,
7024                                      u64 hint_byte, struct btrfs_key *ins,
7025                                      u64 flags, int delalloc)
7026 {
7027         int ret = 0;
7028         struct btrfs_root *root = orig_root->fs_info->extent_root;
7029         struct btrfs_free_cluster *last_ptr = NULL;
7030         struct btrfs_block_group_cache *block_group = NULL;
7031         u64 search_start = 0;
7032         u64 max_extent_size = 0;
7033         u64 empty_cluster = 0;
7034         struct btrfs_space_info *space_info;
7035         int loop = 0;
7036         int index = __get_raid_index(flags);
7037         int alloc_type = (flags & BTRFS_BLOCK_GROUP_DATA) ?
7038                 RESERVE_ALLOC_NO_ACCOUNT : RESERVE_ALLOC;
7039         bool failed_cluster_refill = false;
7040         bool failed_alloc = false;
7041         bool use_cluster = true;
7042         bool have_caching_bg = false;
7043         bool orig_have_caching_bg = false;
7044         bool full_search = false;
7045
7046         WARN_ON(num_bytes < root->sectorsize);
7047         ins->type = BTRFS_EXTENT_ITEM_KEY;
7048         ins->objectid = 0;
7049         ins->offset = 0;
7050
7051         trace_find_free_extent(orig_root, num_bytes, empty_size, flags);
7052
7053         space_info = __find_space_info(root->fs_info, flags);
7054         if (!space_info) {
7055                 btrfs_err(root->fs_info, "No space info for %llu", flags);
7056                 return -ENOSPC;
7057         }
7058
7059         /*
7060          * If our free space is heavily fragmented we may not be able to make
7061          * big contiguous allocations, so instead of doing the expensive search
7062          * for free space, simply return ENOSPC with our max_extent_size so we
7063          * can go ahead and search for a more manageable chunk.
7064          *
7065          * If our max_extent_size is large enough for our allocation simply
7066          * disable clustering since we will likely not be able to find enough
7067          * space to create a cluster and induce latency trying.
7068          */
7069         if (unlikely(space_info->max_extent_size)) {
7070                 spin_lock(&space_info->lock);
7071                 if (space_info->max_extent_size &&
7072                     num_bytes > space_info->max_extent_size) {
7073                         ins->offset = space_info->max_extent_size;
7074                         spin_unlock(&space_info->lock);
7075                         return -ENOSPC;
7076                 } else if (space_info->max_extent_size) {
7077                         use_cluster = false;
7078                 }
7079                 spin_unlock(&space_info->lock);
7080         }
7081
7082         last_ptr = fetch_cluster_info(orig_root, space_info, &empty_cluster);
7083         if (last_ptr) {
7084                 spin_lock(&last_ptr->lock);
7085                 if (last_ptr->block_group)
7086                         hint_byte = last_ptr->window_start;
7087                 if (last_ptr->fragmented) {
7088                         /*
7089                          * We still set window_start so we can keep track of the
7090                          * last place we found an allocation to try and save
7091                          * some time.
7092                          */
7093                         hint_byte = last_ptr->window_start;
7094                         use_cluster = false;
7095                 }
7096                 spin_unlock(&last_ptr->lock);
7097         }
7098
7099         search_start = max(search_start, first_logical_byte(root, 0));
7100         search_start = max(search_start, hint_byte);
7101         if (search_start == hint_byte) {
7102                 block_group = btrfs_lookup_block_group(root->fs_info,
7103                                                        search_start);
7104                 /*
7105                  * we don't want to use the block group if it doesn't match our
7106                  * allocation bits, or if it's not cached.
7107                  *
7108                  * However if we are re-searching with an ideal block group
7109                  * picked out then we don't care that the block group is cached.
7110                  */
7111                 if (block_group && block_group_bits(block_group, flags) &&
7112                     block_group->cached != BTRFS_CACHE_NO) {
7113                         down_read(&space_info->groups_sem);
7114                         if (list_empty(&block_group->list) ||
7115                             block_group->ro) {
7116                                 /*
7117                                  * someone is removing this block group,
7118                                  * we can't jump into the have_block_group
7119                                  * target because our list pointers are not
7120                                  * valid
7121                                  */
7122                                 btrfs_put_block_group(block_group);
7123                                 up_read(&space_info->groups_sem);
7124                         } else {
7125                                 index = get_block_group_index(block_group);
7126                                 btrfs_lock_block_group(block_group, delalloc);
7127                                 goto have_block_group;
7128                         }
7129                 } else if (block_group) {
7130                         btrfs_put_block_group(block_group);
7131                 }
7132         }
7133 search:
7134         have_caching_bg = false;
7135         if (index == 0 || index == __get_raid_index(flags))
7136                 full_search = true;
7137         down_read(&space_info->groups_sem);
7138         list_for_each_entry(block_group, &space_info->block_groups[index],
7139                             list) {
7140                 u64 offset;
7141                 int cached;
7142
7143                 btrfs_grab_block_group(block_group, delalloc);
7144                 search_start = block_group->key.objectid;
7145
7146                 /*
7147                  * this can happen if we end up cycling through all the
7148                  * raid types, but we want to make sure we only allocate
7149                  * for the proper type.
7150                  */
7151                 if (!block_group_bits(block_group, flags)) {
7152                     u64 extra = BTRFS_BLOCK_GROUP_DUP |
7153                                 BTRFS_BLOCK_GROUP_RAID1 |
7154                                 BTRFS_BLOCK_GROUP_RAID5 |
7155                                 BTRFS_BLOCK_GROUP_RAID6 |
7156                                 BTRFS_BLOCK_GROUP_RAID10;
7157
7158                         /*
7159                          * if they asked for extra copies and this block group
7160                          * doesn't provide them, bail.  This does allow us to
7161                          * fill raid0 from raid1.
7162                          */
7163                         if ((flags & extra) && !(block_group->flags & extra))
7164                                 goto loop;
7165                 }
7166
7167 have_block_group:
7168                 cached = block_group_cache_done(block_group);
7169                 if (unlikely(!cached)) {
7170                         have_caching_bg = true;
7171                         ret = cache_block_group(block_group, 0);
7172                         BUG_ON(ret < 0);
7173                         ret = 0;
7174                 }
7175
7176                 if (unlikely(block_group->cached == BTRFS_CACHE_ERROR))
7177                         goto loop;
7178                 if (unlikely(block_group->ro))
7179                         goto loop;
7180
7181                 /*
7182                  * OK, we want to try to use the cluster allocator, so
7183                  * let's look there
7184                  */
7185                 if (last_ptr && use_cluster) {
7186                         struct btrfs_block_group_cache *used_block_group;
7187                         unsigned long aligned_cluster;
7188                         /*
7189                          * the refill lock keeps out other
7190                          * people trying to start a new cluster
7191                          */
7192                         used_block_group = btrfs_lock_cluster(block_group,
7193                                                               last_ptr,
7194                                                               delalloc);
7195                         if (!used_block_group)
7196                                 goto refill_cluster;
7197
7198                         if (used_block_group != block_group &&
7199                             (used_block_group->ro ||
7200                              !block_group_bits(used_block_group, flags)))
7201                                 goto release_cluster;
7202
7203                         offset = btrfs_alloc_from_cluster(used_block_group,
7204                                                 last_ptr,
7205                                                 num_bytes,
7206                                                 used_block_group->key.objectid,
7207                                                 &max_extent_size);
7208                         if (offset) {
7209                                 /* we have a block, we're done */
7210                                 spin_unlock(&last_ptr->refill_lock);
7211                                 trace_btrfs_reserve_extent_cluster(root,
7212                                                 used_block_group,
7213                                                 search_start, num_bytes);
7214                                 if (used_block_group != block_group) {
7215                                         btrfs_release_block_group(block_group,
7216                                                                   delalloc);
7217                                         block_group = used_block_group;
7218                                 }
7219                                 goto checks;
7220                         }
7221
7222                         WARN_ON(last_ptr->block_group != used_block_group);
7223 release_cluster:
7224                         /* If we are on LOOP_NO_EMPTY_SIZE, we can't
7225                          * set up a new cluster, so let's just skip it
7226                          * and let the allocator find whatever block
7227                          * it can find.  If we reach this point, we
7228                          * will have tried the cluster allocator
7229                          * plenty of times and not have found
7230                          * anything, so we are likely way too
7231                          * fragmented for the clustering stuff to find
7232                          * anything.
7233                          *
7234                          * However, if the cluster is taken from the
7235                          * current block group, release the cluster
7236                          * first, so that we stand a better chance of
7237                          * succeeding in the unclustered
7238                          * allocation.  */
7239                         if (loop >= LOOP_NO_EMPTY_SIZE &&
7240                             used_block_group != block_group) {
7241                                 spin_unlock(&last_ptr->refill_lock);
7242                                 btrfs_release_block_group(used_block_group,
7243                                                           delalloc);
7244                                 goto unclustered_alloc;
7245                         }
7246
7247                         /*
7248                          * this cluster didn't work out, free it and
7249                          * start over
7250                          */
7251                         btrfs_return_cluster_to_free_space(NULL, last_ptr);
7252
7253                         if (used_block_group != block_group)
7254                                 btrfs_release_block_group(used_block_group,
7255                                                           delalloc);
7256 refill_cluster:
7257                         if (loop >= LOOP_NO_EMPTY_SIZE) {
7258                                 spin_unlock(&last_ptr->refill_lock);
7259                                 goto unclustered_alloc;
7260                         }
7261
7262                         aligned_cluster = max_t(unsigned long,
7263                                                 empty_cluster + empty_size,
7264                                               block_group->full_stripe_len);
7265
7266                         /* allocate a cluster in this block group */
7267                         ret = btrfs_find_space_cluster(root, block_group,
7268                                                        last_ptr, search_start,
7269                                                        num_bytes,
7270                                                        aligned_cluster);
7271                         if (ret == 0) {
7272                                 /*
7273                                  * now pull our allocation out of this
7274                                  * cluster
7275                                  */
7276                                 offset = btrfs_alloc_from_cluster(block_group,
7277                                                         last_ptr,
7278                                                         num_bytes,
7279                                                         search_start,
7280                                                         &max_extent_size);
7281                                 if (offset) {
7282                                         /* we found one, proceed */
7283                                         spin_unlock(&last_ptr->refill_lock);
7284                                         trace_btrfs_reserve_extent_cluster(root,
7285                                                 block_group, search_start,
7286                                                 num_bytes);
7287                                         goto checks;
7288                                 }
7289                         } else if (!cached && loop > LOOP_CACHING_NOWAIT
7290                                    && !failed_cluster_refill) {
7291                                 spin_unlock(&last_ptr->refill_lock);
7292
7293                                 failed_cluster_refill = true;
7294                                 wait_block_group_cache_progress(block_group,
7295                                        num_bytes + empty_cluster + empty_size);
7296                                 goto have_block_group;
7297                         }
7298
7299                         /*
7300                          * at this point we either didn't find a cluster
7301                          * or we weren't able to allocate a block from our
7302                          * cluster.  Free the cluster we've been trying
7303                          * to use, and go to the next block group
7304                          */
7305                         btrfs_return_cluster_to_free_space(NULL, last_ptr);
7306                         spin_unlock(&last_ptr->refill_lock);
7307                         goto loop;
7308                 }
7309
7310 unclustered_alloc:
7311                 /*
7312                  * We are doing an unclustered alloc, set the fragmented flag so
7313                  * we don't bother trying to set up a cluster again until we get
7314                  * more space.
7315                  */
7316                 if (unlikely(last_ptr)) {
7317                         spin_lock(&last_ptr->lock);
7318                         last_ptr->fragmented = 1;
7319                         spin_unlock(&last_ptr->lock);
7320                 }
7321                 spin_lock(&block_group->free_space_ctl->tree_lock);
7322                 if (cached &&
7323                     block_group->free_space_ctl->free_space <
7324                     num_bytes + empty_cluster + empty_size) {
7325                         if (block_group->free_space_ctl->free_space >
7326                             max_extent_size)
7327                                 max_extent_size =
7328                                         block_group->free_space_ctl->free_space;
7329                         spin_unlock(&block_group->free_space_ctl->tree_lock);
7330                         goto loop;
7331                 }
7332                 spin_unlock(&block_group->free_space_ctl->tree_lock);
7333
7334                 offset = btrfs_find_space_for_alloc(block_group, search_start,
7335                                                     num_bytes, empty_size,
7336                                                     &max_extent_size);
7337                 /*
7338                  * If we didn't find a chunk, and we haven't failed on this
7339                  * block group before, and this block group is in the middle of
7340                  * caching and we are ok with waiting, then go ahead and wait
7341                  * for progress to be made, and set failed_alloc to true.
7342                  *
7343                  * If failed_alloc is true then we've already waited on this
7344                  * block group once and should move on to the next block group.
7345                  */
7346                 if (!offset && !failed_alloc && !cached &&
7347                     loop > LOOP_CACHING_NOWAIT) {
7348                         wait_block_group_cache_progress(block_group,
7349                                                 num_bytes + empty_size);
7350                         failed_alloc = true;
7351                         goto have_block_group;
7352                 } else if (!offset) {
7353                         goto loop;
7354                 }
7355 checks:
7356                 search_start = ALIGN(offset, root->stripesize);
7357
7358                 /* move on to the next group */
7359                 if (search_start + num_bytes >
7360                     block_group->key.objectid + block_group->key.offset) {
7361                         btrfs_add_free_space(block_group, offset, num_bytes);
7362                         goto loop;
7363                 }
7364
7365                 if (offset < search_start)
7366                         btrfs_add_free_space(block_group, offset,
7367                                              search_start - offset);
7368                 BUG_ON(offset > search_start);
7369
7370                 ret = btrfs_update_reserved_bytes(block_group, num_bytes,
7371                                                   alloc_type, delalloc);
7372                 if (ret == -EAGAIN) {
7373                         btrfs_add_free_space(block_group, offset, num_bytes);
7374                         goto loop;
7375                 }
7376
7377                 /* we are all good, let's return */
7378                 ins->objectid = search_start;
7379                 ins->offset = num_bytes;
7380
7381                 trace_btrfs_reserve_extent(orig_root, block_group,
7382                                            search_start, num_bytes);
7383                 btrfs_release_block_group(block_group, delalloc);
7384                 break;
7385 loop:
7386                 failed_cluster_refill = false;
7387                 failed_alloc = false;
7388                 BUG_ON(index != get_block_group_index(block_group));
7389                 btrfs_release_block_group(block_group, delalloc);
7390         }
7391         up_read(&space_info->groups_sem);
7392
7393         if ((loop == LOOP_CACHING_NOWAIT) && have_caching_bg
7394                 && !orig_have_caching_bg)
7395                 orig_have_caching_bg = true;
7396
7397         if (!ins->objectid && loop >= LOOP_CACHING_WAIT && have_caching_bg)
7398                 goto search;
7399
7400         if (!ins->objectid && ++index < BTRFS_NR_RAID_TYPES)
7401                 goto search;
7402
7403         /*
7404          * LOOP_CACHING_NOWAIT, search partially cached block groups, kicking
7405          *                      caching kthreads as we move along
7406          * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching
7407          * LOOP_ALLOC_CHUNK, force a chunk allocation and try again
7408          * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try
7409          *                      again
7410          */
7411         if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE) {
7412                 index = 0;
7413                 if (loop == LOOP_CACHING_NOWAIT) {
7414                         /*
7415                          * We want to skip the LOOP_CACHING_WAIT step if we
7416                          * don't have any uncached bgs and we've already done a
7417                          * full search through.
7418                          */
7419                         if (orig_have_caching_bg || !full_search)
7420                                 loop = LOOP_CACHING_WAIT;
7421                         else
7422                                 loop = LOOP_ALLOC_CHUNK;
7423                 } else {
7424                         loop++;
7425                 }
7426
7427                 if (loop == LOOP_ALLOC_CHUNK) {
7428                         struct btrfs_trans_handle *trans;
7429                         int exist = 0;
7430
7431                         trans = current->journal_info;
7432                         if (trans)
7433                                 exist = 1;
7434                         else
7435                                 trans = btrfs_join_transaction(root);
7436
7437                         if (IS_ERR(trans)) {
7438                                 ret = PTR_ERR(trans);
7439                                 goto out;
7440                         }
7441
7442                         ret = do_chunk_alloc(trans, root, flags,
7443                                              CHUNK_ALLOC_FORCE);
7444
7445                         /*
7446                          * If we can't allocate a new chunk, we've already looped
7447                          * through at least once, so move on to the NO_EMPTY_SIZE
7448                          * case.
7449                          */
7450                         if (ret == -ENOSPC)
7451                                 loop = LOOP_NO_EMPTY_SIZE;
7452
7453                         /*
7454                          * Do not bail out on ENOSPC since we
7455                          * can do more things.
7456                          */
7457                         if (ret < 0 && ret != -ENOSPC)
7458                                 btrfs_abort_transaction(trans,
7459                                                         root, ret);
7460                         else
7461                                 ret = 0;
7462                         if (!exist)
7463                                 btrfs_end_transaction(trans, root);
7464                         if (ret)
7465                                 goto out;
7466                 }
7467
7468                 if (loop == LOOP_NO_EMPTY_SIZE) {
7469                         /*
7470                          * Don't loop again if we already have no empty_size and
7471                          * no empty_cluster.
7472                          */
7473                         if (empty_size == 0 &&
7474                             empty_cluster == 0) {
7475                                 ret = -ENOSPC;
7476                                 goto out;
7477                         }
7478                         empty_size = 0;
7479                         empty_cluster = 0;
7480                 }
7481
7482                 goto search;
7483         } else if (!ins->objectid) {
7484                 ret = -ENOSPC;
7485         } else if (ins->objectid) {
7486                 if (!use_cluster && last_ptr) {
7487                         spin_lock(&last_ptr->lock);
7488                         last_ptr->window_start = ins->objectid;
7489                         spin_unlock(&last_ptr->lock);
7490                 }
7491                 ret = 0;
7492         }
7493 out:
7494         if (ret == -ENOSPC) {
7495                 spin_lock(&space_info->lock);
7496                 space_info->max_extent_size = max_extent_size;
7497                 spin_unlock(&space_info->lock);
7498                 ins->offset = max_extent_size;
7499         }
7500         return ret;
7501 }
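/*
 * How the result of find_free_extent() is consumed (illustrative sketch,
 * disabled; it mirrors what btrfs_reserve_extent() below does).  On success
 * @ins is filled exactly as the header comment above the function promises;
 * on -ENOSPC, ins->offset instead carries max_extent_size as a retry hint.
 */
#if 0
	ret = find_free_extent(root, num_bytes, empty_size, hint_byte,
			       &ins, flags, delalloc);
	if (!ret) {
		/* ins.objectid is the start, ins.offset == num_bytes */
	} else if (ret == -ENOSPC && ins.offset) {
		/* largest free extent seen; shrink num_bytes and try again */
	}
#endif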
7502
7503 static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
7504                             int dump_block_groups)
7505 {
7506         struct btrfs_block_group_cache *cache;
7507         int index = 0;
7508
7509         spin_lock(&info->lock);
7510         printk(KERN_INFO "BTRFS: space_info %llu has %llu free, is %sfull\n",
7511                info->flags,
7512                info->total_bytes - info->bytes_used - info->bytes_pinned -
7513                info->bytes_reserved - info->bytes_readonly,
7514                (info->full) ? "" : "not ");
7515         printk(KERN_INFO "BTRFS: space_info total=%llu, used=%llu, pinned=%llu, "
7516                "reserved=%llu, may_use=%llu, readonly=%llu\n",
7517                info->total_bytes, info->bytes_used, info->bytes_pinned,
7518                info->bytes_reserved, info->bytes_may_use,
7519                info->bytes_readonly);
7520         spin_unlock(&info->lock);
7521
7522         if (!dump_block_groups)
7523                 return;
7524
7525         down_read(&info->groups_sem);
7526 again:
7527         list_for_each_entry(cache, &info->block_groups[index], list) {
7528                 spin_lock(&cache->lock);
7529                 printk(KERN_INFO "BTRFS: "
7530                            "block group %llu has %llu bytes, "
7531                            "%llu used %llu pinned %llu reserved %s\n",
7532                        cache->key.objectid, cache->key.offset,
7533                        btrfs_block_group_used(&cache->item), cache->pinned,
7534                        cache->reserved, cache->ro ? "[readonly]" : "");
7535                 btrfs_dump_free_space(cache, bytes);
7536                 spin_unlock(&cache->lock);
7537         }
7538         if (++index < BTRFS_NR_RAID_TYPES)
7539                 goto again;
7540         up_read(&info->groups_sem);
7541 }
7542
7543 int btrfs_reserve_extent(struct btrfs_root *root,
7544                          u64 num_bytes, u64 min_alloc_size,
7545                          u64 empty_size, u64 hint_byte,
7546                          struct btrfs_key *ins, int is_data, int delalloc)
7547 {
7548         bool final_tried = num_bytes == min_alloc_size;
7549         u64 flags;
7550         int ret;
7551
7552         flags = btrfs_get_alloc_profile(root, is_data);
7553 again:
7554         WARN_ON(num_bytes < root->sectorsize);
7555         ret = find_free_extent(root, num_bytes, empty_size, hint_byte, ins,
7556                                flags, delalloc);
7557
7558         if (ret == -ENOSPC) {
7559                 if (!final_tried && ins->offset) {
7560                         num_bytes = min(num_bytes >> 1, ins->offset);
7561                         num_bytes = round_down(num_bytes, root->sectorsize);
7562                         num_bytes = max(num_bytes, min_alloc_size);
7563                         if (num_bytes == min_alloc_size)
7564                                 final_tried = true;
7565                         goto again;
7566                 } else if (btrfs_test_opt(root, ENOSPC_DEBUG)) {
7567                         struct btrfs_space_info *sinfo;
7568
7569                         sinfo = __find_space_info(root->fs_info, flags);
7570                         btrfs_err(root->fs_info, "allocation failed flags %llu, wanted %llu",
7571                                 flags, num_bytes);
7572                         if (sinfo)
7573                                 dump_space_info(sinfo, num_bytes, 1);
7574                 }
7575         }
7576
7577         return ret;
7578 }
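/*
 * Callers that reserve an extent but fail later on are expected to hand the
 * space back through the helpers below.  Rough shape (illustrative sketch,
 * disabled; do_something_with() is a made-up placeholder):
 */
#if 0
	ret = btrfs_reserve_extent(root, num_bytes, root->sectorsize,
				   0, 0, &ins, 1 /* is_data */, 0 /* delalloc */);
	if (ret)
		return ret;

	ret = do_something_with(ins.objectid, ins.offset);
	if (ret)
		btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 0);
#endif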
7579
7580 static int __btrfs_free_reserved_extent(struct btrfs_root *root,
7581                                         u64 start, u64 len,
7582                                         int pin, int delalloc)
7583 {
7584         struct btrfs_block_group_cache *cache;
7585         int ret = 0;
7586
7587         cache = btrfs_lookup_block_group(root->fs_info, start);
7588         if (!cache) {
7589                 btrfs_err(root->fs_info, "Unable to find block group for %llu",
7590                         start);
7591                 return -ENOSPC;
7592         }
7593
7594         if (pin)
7595                 pin_down_extent(root, cache, start, len, 1);
7596         else {
7597                 if (btrfs_test_opt(root, DISCARD))
7598                         ret = btrfs_discard_extent(root, start, len, NULL);
7599                 btrfs_add_free_space(cache, start, len);
7600                 btrfs_update_reserved_bytes(cache, len, RESERVE_FREE, delalloc);
7601         }
7602
7603         btrfs_put_block_group(cache);
7604
7605         trace_btrfs_reserved_extent_free(root, start, len);
7606
7607         return ret;
7608 }
7609
7610 int btrfs_free_reserved_extent(struct btrfs_root *root,
7611                                u64 start, u64 len, int delalloc)
7612 {
7613         return __btrfs_free_reserved_extent(root, start, len, 0, delalloc);
7614 }
7615
7616 int btrfs_free_and_pin_reserved_extent(struct btrfs_root *root,
7617                                        u64 start, u64 len)
7618 {
7619         return __btrfs_free_reserved_extent(root, start, len, 1, 0);
7620 }
7621
7622 static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
7623                                       struct btrfs_root *root,
7624                                       u64 parent, u64 root_objectid,
7625                                       u64 flags, u64 owner, u64 offset,
7626                                       struct btrfs_key *ins, int ref_mod)
7627 {
7628         int ret;
7629         struct btrfs_fs_info *fs_info = root->fs_info;
7630         struct btrfs_extent_item *extent_item;
7631         struct btrfs_extent_inline_ref *iref;
7632         struct btrfs_path *path;
7633         struct extent_buffer *leaf;
7634         int type;
7635         u32 size;
7636
7637         if (parent > 0)
7638                 type = BTRFS_SHARED_DATA_REF_KEY;
7639         else
7640                 type = BTRFS_EXTENT_DATA_REF_KEY;
7641
7642         size = sizeof(*extent_item) + btrfs_extent_inline_ref_size(type);
7643
7644         path = btrfs_alloc_path();
7645         if (!path)
7646                 return -ENOMEM;
7647
7648         path->leave_spinning = 1;
7649         ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
7650                                       ins, size);
7651         if (ret) {
7652                 btrfs_free_path(path);
7653                 return ret;
7654         }
7655
7656         leaf = path->nodes[0];
7657         extent_item = btrfs_item_ptr(leaf, path->slots[0],
7658                                      struct btrfs_extent_item);
7659         btrfs_set_extent_refs(leaf, extent_item, ref_mod);
7660         btrfs_set_extent_generation(leaf, extent_item, trans->transid);
7661         btrfs_set_extent_flags(leaf, extent_item,
7662                                flags | BTRFS_EXTENT_FLAG_DATA);
7663
7664         iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
7665         btrfs_set_extent_inline_ref_type(leaf, iref, type);
7666         if (parent > 0) {
7667                 struct btrfs_shared_data_ref *ref;
7668                 ref = (struct btrfs_shared_data_ref *)(iref + 1);
7669                 btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
7670                 btrfs_set_shared_data_ref_count(leaf, ref, ref_mod);
7671         } else {
7672                 struct btrfs_extent_data_ref *ref;
7673                 ref = (struct btrfs_extent_data_ref *)(&iref->offset);
7674                 btrfs_set_extent_data_ref_root(leaf, ref, root_objectid);
7675                 btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
7676                 btrfs_set_extent_data_ref_offset(leaf, ref, offset);
7677                 btrfs_set_extent_data_ref_count(leaf, ref, ref_mod);
7678         }
7679
7680         btrfs_mark_buffer_dirty(path->nodes[0]);
7681         btrfs_free_path(path);
7682
7683         ret = update_block_group(trans, root, ins->objectid, ins->offset, 1);
7684         if (ret) { /* -ENOENT, logic error */
7685                 btrfs_err(fs_info, "update block group failed for %llu %llu",
7686                         ins->objectid, ins->offset);
7687                 BUG();
7688         }
7689         trace_btrfs_reserved_extent_alloc(root, ins->objectid, ins->offset);
7690         return ret;
7691 }
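/*
 * Item layout written above, for reference: a btrfs_extent_item followed by
 * exactly one inline backref.  With a parent the inline ref is a
 * BTRFS_SHARED_DATA_REF_KEY carrying the parent bytenr and a ref count;
 * without one it is a BTRFS_EXTENT_DATA_REF_KEY carrying root objectid,
 * owner (inode), offset and the ref count.
 */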
7692
7693 static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
7694                                      struct btrfs_root *root,
7695                                      u64 parent, u64 root_objectid,
7696                                      u64 flags, struct btrfs_disk_key *key,
7697                                      int level, struct btrfs_key *ins)
7698 {
7699         int ret;
7700         struct btrfs_fs_info *fs_info = root->fs_info;
7701         struct btrfs_extent_item *extent_item;
7702         struct btrfs_tree_block_info *block_info;
7703         struct btrfs_extent_inline_ref *iref;
7704         struct btrfs_path *path;
7705         struct extent_buffer *leaf;
7706         u32 size = sizeof(*extent_item) + sizeof(*iref);
7707         u64 num_bytes = ins->offset;
7708         bool skinny_metadata = btrfs_fs_incompat(root->fs_info,
7709                                                  SKINNY_METADATA);
7710
7711         if (!skinny_metadata)
7712                 size += sizeof(*block_info);
7713
7714         path = btrfs_alloc_path();
7715         if (!path) {
7716                 btrfs_free_and_pin_reserved_extent(root, ins->objectid,
7717                                                    root->nodesize);
7718                 return -ENOMEM;
7719         }
7720
7721         path->leave_spinning = 1;
7722         ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
7723                                       ins, size);
7724         if (ret) {
7725                 btrfs_free_path(path);
7726                 btrfs_free_and_pin_reserved_extent(root, ins->objectid,
7727                                                    root->nodesize);
7728                 return ret;
7729         }
7730
7731         leaf = path->nodes[0];
7732         extent_item = btrfs_item_ptr(leaf, path->slots[0],
7733                                      struct btrfs_extent_item);
7734         btrfs_set_extent_refs(leaf, extent_item, 1);
7735         btrfs_set_extent_generation(leaf, extent_item, trans->transid);
7736         btrfs_set_extent_flags(leaf, extent_item,
7737                                flags | BTRFS_EXTENT_FLAG_TREE_BLOCK);
7738
7739         if (skinny_metadata) {
7740                 iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
7741                 num_bytes = root->nodesize;
7742         } else {
7743                 block_info = (struct btrfs_tree_block_info *)(extent_item + 1);
7744                 btrfs_set_tree_block_key(leaf, block_info, key);
7745                 btrfs_set_tree_block_level(leaf, block_info, level);
7746                 iref = (struct btrfs_extent_inline_ref *)(block_info + 1);
7747         }
7748
7749         if (parent > 0) {
7750                 BUG_ON(!(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));
7751                 btrfs_set_extent_inline_ref_type(leaf, iref,
7752                                                  BTRFS_SHARED_BLOCK_REF_KEY);
7753                 btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
7754         } else {
7755                 btrfs_set_extent_inline_ref_type(leaf, iref,
7756                                                  BTRFS_TREE_BLOCK_REF_KEY);
7757                 btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);
7758         }
7759
7760         btrfs_mark_buffer_dirty(leaf);
7761         btrfs_free_path(path);
7762
7763         ret = update_block_group(trans, root, ins->objectid, root->nodesize,
7764                                  1);
7765         if (ret) { /* -ENOENT, logic error */
7766                 btrfs_err(fs_info, "update block group failed for %llu %llu",
7767                         ins->objectid, ins->offset);
7768                 BUG();
7769         }
7770
7771         trace_btrfs_reserved_extent_alloc(root, ins->objectid, root->nodesize);
7772         return ret;
7773 }
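/*
 * The skinny-metadata case above differs only in layout: the
 * btrfs_tree_block_info (key + level) is omitted, the inline ref follows the
 * extent item directly, and num_bytes is taken from root->nodesize rather
 * than ins->offset.
 */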
7774
7775 int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
7776                                      struct btrfs_root *root,
7777                                      u64 root_objectid, u64 owner,
7778                                      u64 offset, u64 ram_bytes,
7779                                      struct btrfs_key *ins)
7780 {
7781         int ret;
7782
7783         BUG_ON(root_objectid == BTRFS_TREE_LOG_OBJECTID);
7784
7785         ret = btrfs_add_delayed_data_ref(root->fs_info, trans, ins->objectid,
7786                                          ins->offset, 0,
7787                                          root_objectid, owner, offset,
7788                                          ram_bytes, BTRFS_ADD_DELAYED_EXTENT,
7789                                          NULL);
7790         return ret;
7791 }
7792
7793 /*
7794  * this is used by the tree logging recovery code.  It records that
7795  * an extent has been allocated and makes sure to clear the free
7796  * space cache bits as well.
7797  */
7798 int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
7799                                    struct btrfs_root *root,
7800                                    u64 root_objectid, u64 owner, u64 offset,
7801                                    struct btrfs_key *ins)
7802 {
7803         int ret;
7804         struct btrfs_block_group_cache *block_group;
7805
7806         /*
7807          * Mixed block groups will exclude before processing the log so we only
7808          * need to do the exclude dance if this fs isn't mixed.
7809          */
7810         if (!btrfs_fs_incompat(root->fs_info, MIXED_GROUPS)) {
7811                 ret = __exclude_logged_extent(root, ins->objectid, ins->offset);
7812                 if (ret)
7813                         return ret;
7814         }
7815
7816         block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid);
7817         if (!block_group)
7818                 return -EINVAL;
7819
7820         ret = btrfs_update_reserved_bytes(block_group, ins->offset,
7821                                           RESERVE_ALLOC_NO_ACCOUNT, 0);
7822         BUG_ON(ret); /* logic error */
7823         ret = alloc_reserved_file_extent(trans, root, 0, root_objectid,
7824                                          0, owner, offset, ins, 1);
7825         btrfs_put_block_group(block_group);
7826         return ret;
7827 }
7828
7829 static struct extent_buffer *
7830 btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
7831                       u64 bytenr, int level)
7832 {
7833         struct extent_buffer *buf;
7834
7835         buf = btrfs_find_create_tree_block(root, bytenr);
7836         if (!buf)
7837                 return ERR_PTR(-ENOMEM);
7838
7839         /*
7840          * Extra safety check in case the extent tree is corrupted and the
7841          * extent allocator chooses to use a tree block which is already used and
7842          * locked.
7843          */
7844         if (buf->lock_owner == current->pid) {
7845                 btrfs_err_rl(root->fs_info,
7846 "tree block %llu owner %llu already locked by pid=%d, extent tree corruption detected",
7847                         buf->start, btrfs_header_owner(buf), current->pid);
7848                 free_extent_buffer(buf);
7849                 return ERR_PTR(-EUCLEAN);
7850         }
7851
7852         btrfs_set_header_generation(buf, trans->transid);
7853         btrfs_set_buffer_lockdep_class(root->root_key.objectid, buf, level);
7854         btrfs_tree_lock(buf);
7855         clean_tree_block(trans, root->fs_info, buf);
7856         clear_bit(EXTENT_BUFFER_STALE, &buf->bflags);
7857
7858         btrfs_set_lock_blocking(buf);
7859         btrfs_set_buffer_uptodate(buf);
7860
7861         if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
7862                 buf->log_index = root->log_transid % 2;
7863                 /*
7864                  * we allow two log transactions at a time; use different
7865                  * EXTENT bits to differentiate dirty pages.
7866                  */
7867                 if (buf->log_index == 0)
7868                         set_extent_dirty(&root->dirty_log_pages, buf->start,
7869                                         buf->start + buf->len - 1, GFP_NOFS);
7870                 else
7871                         set_extent_new(&root->dirty_log_pages, buf->start,
7872                                         buf->start + buf->len - 1, GFP_NOFS);
7873         } else {
7874                 buf->log_index = -1;
7875                 set_extent_dirty(&trans->transaction->dirty_pages, buf->start,
7876                          buf->start + buf->len - 1, GFP_NOFS);
7877         }
7878         trans->dirty = true;
7879         /* this returns a buffer locked for blocking */
7880         return buf;
7881 }
7882
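/*
 * Pick the block reserve that a new tree block should be charged to.
 * Try the root's reserve first, then fall back to reserving fresh metadata
 * space and, as a last resort, to the global reserve when the underlying
 * space_info matches.  Returns the reserve that was charged or an ERR_PTR.
 */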
7883 static struct btrfs_block_rsv *
7884 use_block_rsv(struct btrfs_trans_handle *trans,
7885               struct btrfs_root *root, u32 blocksize)
7886 {
7887         struct btrfs_block_rsv *block_rsv;
7888         struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
7889         int ret;
7890         bool global_updated = false;
7891
7892         block_rsv = get_block_rsv(trans, root);
7893
7894         if (unlikely(block_rsv->size == 0))
7895                 goto try_reserve;
7896 again:
7897         ret = block_rsv_use_bytes(block_rsv, blocksize);
7898         if (!ret)
7899                 return block_rsv;
7900
7901         if (block_rsv->failfast)
7902                 return ERR_PTR(ret);
7903
7904         if (block_rsv->type == BTRFS_BLOCK_RSV_GLOBAL && !global_updated) {
7905                 global_updated = true;
7906                 update_global_block_rsv(root->fs_info);
7907                 goto again;
7908         }
7909
7910         if (btrfs_test_opt(root, ENOSPC_DEBUG)) {
7911                 static DEFINE_RATELIMIT_STATE(_rs,
7912                                 DEFAULT_RATELIMIT_INTERVAL * 10,
7913                                 /*DEFAULT_RATELIMIT_BURST*/ 1);
7914                 if (__ratelimit(&_rs))
7915                         WARN(1, KERN_DEBUG
7916                                 "BTRFS: block rsv returned %d\n", ret);
7917         }
7918 try_reserve:
7919         ret = reserve_metadata_bytes(root, block_rsv, blocksize,
7920                                      BTRFS_RESERVE_NO_FLUSH);
7921         if (!ret)
7922                 return block_rsv;
7923         /*
7924          * If we couldn't reserve metadata bytes, try to use some from
7925          * the global reserve, provided its space info is the same as the
7926          * global reserve's.
7927          */
7928         if (block_rsv->type != BTRFS_BLOCK_RSV_GLOBAL &&
7929             block_rsv->space_info == global_rsv->space_info) {
7930                 ret = block_rsv_use_bytes(global_rsv, blocksize);
7931                 if (!ret)
7932                         return global_rsv;
7933         }
7934         return ERR_PTR(ret);
7935 }
7936
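/*
 * Give back a tree block's worth of reservation to @block_rsv and release
 * any excess above its target size back to the space_info.
 */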
7937 static void unuse_block_rsv(struct btrfs_fs_info *fs_info,
7938                             struct btrfs_block_rsv *block_rsv, u32 blocksize)
7939 {
7940         block_rsv_add_bytes(block_rsv, blocksize, 0);
7941         block_rsv_release_bytes(fs_info, block_rsv, NULL, 0);
7942 }
7943
7944 /*
7945  * finds a free extent and does all the dirty work required for allocation;
7946  * returns the tree buffer or an ERR_PTR on error.
7947  */
7948 struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
7949                                         struct btrfs_root *root,
7950                                         u64 parent, u64 root_objectid,
7951                                         struct btrfs_disk_key *key, int level,
7952                                         u64 hint, u64 empty_size)
7953 {
7954         struct btrfs_key ins;
7955         struct btrfs_block_rsv *block_rsv;
7956         struct extent_buffer *buf;
7957         struct btrfs_delayed_extent_op *extent_op;
7958         u64 flags = 0;
7959         int ret;
7960         u32 blocksize = root->nodesize;
7961         bool skinny_metadata = btrfs_fs_incompat(root->fs_info,
7962                                                  SKINNY_METADATA);
7963
7964         if (btrfs_test_is_dummy_root(root)) {
7965                 buf = btrfs_init_new_buffer(trans, root, root->alloc_bytenr,
7966                                             level);
7967                 if (!IS_ERR(buf))
7968                         root->alloc_bytenr += blocksize;
7969                 return buf;
7970         }
7971
7972         block_rsv = use_block_rsv(trans, root, blocksize);
7973         if (IS_ERR(block_rsv))
7974                 return ERR_CAST(block_rsv);
7975
7976         ret = btrfs_reserve_extent(root, blocksize, blocksize,
7977                                    empty_size, hint, &ins, 0, 0);
7978         if (ret)
7979                 goto out_unuse;
7980
7981         buf = btrfs_init_new_buffer(trans, root, ins.objectid, level);
7982         if (IS_ERR(buf)) {
7983                 ret = PTR_ERR(buf);
7984                 goto out_free_reserved;
7985         }
7986
7987         if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) {
7988                 if (parent == 0)
7989                         parent = ins.objectid;
7990                 flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
7991         } else
7992                 BUG_ON(parent > 0);
7993
7994         if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
7995                 extent_op = btrfs_alloc_delayed_extent_op();
7996                 if (!extent_op) {
7997                         ret = -ENOMEM;
7998                         goto out_free_buf;
7999                 }
8000                 if (key)
8001                         memcpy(&extent_op->key, key, sizeof(extent_op->key));
8002                 else
8003                         memset(&extent_op->key, 0, sizeof(extent_op->key));
8004                 extent_op->flags_to_set = flags;
8005                 if (skinny_metadata)
8006                         extent_op->update_key = 0;
8007                 else
8008                         extent_op->update_key = 1;
8009                 extent_op->update_flags = 1;
8010                 extent_op->is_data = 0;
8011                 extent_op->level = level;
8012
8013                 ret = btrfs_add_delayed_tree_ref(root->fs_info, trans,
8014                                                  ins.objectid, ins.offset,
8015                                                  parent, root_objectid, level,
8016                                                  BTRFS_ADD_DELAYED_EXTENT,
8017                                                  extent_op);
8018                 if (ret)
8019                         goto out_free_delayed;
8020         }
8021         return buf;
8022
8023 out_free_delayed:
8024         btrfs_free_delayed_extent_op(extent_op);
8025 out_free_buf:
8026         free_extent_buffer(buf);
8027 out_free_reserved:
8028         btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 0);
8029 out_unuse:
8030         unuse_block_rsv(root->fs_info, block_rsv, blocksize);
8031         return ERR_PTR(ret);
8032 }
8033
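/*
 * State for the tree walk used by btrfs_drop_snapshot() and
 * btrfs_drop_subtree(): per-level reference counts and extent flags, the
 * current stage, the level at which a shared subtree was found, and the
 * readahead bookkeeping.
 *
 * Rough shape of the walk (a sketch, not literal code):
 *
 *	wc->stage = DROP_REFERENCE;
 *	while (1) {
 *		walk_down_tree();	descend, switching to UPDATE_BACKREF
 *					when a shared subtree needs its
 *					backrefs rewritten
 *		walk_up_tree();		free fully visited blocks, switching
 *					back to DROP_REFERENCE at the shared
 *					level
 *	}
 */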
8034 struct walk_control {
8035         u64 refs[BTRFS_MAX_LEVEL];
8036         u64 flags[BTRFS_MAX_LEVEL];
8037         struct btrfs_key update_progress;
8038         int stage;
8039         int level;
8040         int shared_level;
8041         int update_ref;
8042         int keep_locks;
8043         int reada_slot;
8044         int reada_count;
8045         int for_reloc;
8046 };
8047
8048 #define DROP_REFERENCE  1
8049 #define UPDATE_BACKREF  2
8050
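/*
 * Issue readahead for the child blocks of the node at wc->level that the
 * walk is likely to visit, skipping blocks it will not descend into, and
 * adapt wc->reada_count to how much of the previous readahead was consumed.
 */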
8051 static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
8052                                      struct btrfs_root *root,
8053                                      struct walk_control *wc,
8054                                      struct btrfs_path *path)
8055 {
8056         u64 bytenr;
8057         u64 generation;
8058         u64 refs;
8059         u64 flags;
8060         u32 nritems;
8061         u32 blocksize;
8062         struct btrfs_key key;
8063         struct extent_buffer *eb;
8064         int ret;
8065         int slot;
8066         int nread = 0;
8067
8068         if (path->slots[wc->level] < wc->reada_slot) {
8069                 wc->reada_count = wc->reada_count * 2 / 3;
8070                 wc->reada_count = max(wc->reada_count, 2);
8071         } else {
8072                 wc->reada_count = wc->reada_count * 3 / 2;
8073                 wc->reada_count = min_t(int, wc->reada_count,
8074                                         BTRFS_NODEPTRS_PER_BLOCK(root));
8075         }
8076
8077         eb = path->nodes[wc->level];
8078         nritems = btrfs_header_nritems(eb);
8079         blocksize = root->nodesize;
8080
8081         for (slot = path->slots[wc->level]; slot < nritems; slot++) {
8082                 if (nread >= wc->reada_count)
8083                         break;
8084
8085                 cond_resched();
8086                 bytenr = btrfs_node_blockptr(eb, slot);
8087                 generation = btrfs_node_ptr_generation(eb, slot);
8088
8089                 if (slot == path->slots[wc->level])
8090                         goto reada;
8091
8092                 if (wc->stage == UPDATE_BACKREF &&
8093                     generation <= root->root_key.offset)
8094                         continue;
8095
8096                 /* We don't lock the tree block, it's OK to be racy here */
8097                 ret = btrfs_lookup_extent_info(trans, root, bytenr,
8098                                                wc->level - 1, 1, &refs,
8099                                                &flags);
8100                 /* We don't care about errors in readahead. */
8101                 if (ret < 0)
8102                         continue;
8103                 BUG_ON(refs == 0);
8104
8105                 if (wc->stage == DROP_REFERENCE) {
8106                         if (refs == 1)
8107                                 goto reada;
8108
8109                         if (wc->level == 1 &&
8110                             (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
8111                                 continue;
8112                         if (!wc->update_ref ||
8113                             generation <= root->root_key.offset)
8114                                 continue;
8115                         btrfs_node_key_to_cpu(eb, &key, slot);
8116                         ret = btrfs_comp_cpu_keys(&key,
8117                                                   &wc->update_progress);
8118                         if (ret < 0)
8119                                 continue;
8120                 } else {
8121                         if (wc->level == 1 &&
8122                             (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
8123                                 continue;
8124                 }
8125 reada:
8126                 readahead_tree_block(root, bytenr);
8127                 nread++;
8128         }
8129         wc->reada_slot = slot;
8130 }
8131
8132 /*
8133  * These extents may not be seen by the usual inc/dec ref code, so we have to
8134  * record them here for qgroup accounting.
8135  */
8136 static int record_one_subtree_extent(struct btrfs_trans_handle *trans,
8137                                      struct btrfs_root *root, u64 bytenr,
8138                                      u64 num_bytes)
8139 {
8140         struct btrfs_qgroup_extent_record *qrecord;
8141         struct btrfs_delayed_ref_root *delayed_refs;
8142
8143         qrecord = kmalloc(sizeof(*qrecord), GFP_NOFS);
8144         if (!qrecord)
8145                 return -ENOMEM;
8146
8147         qrecord->bytenr = bytenr;
8148         qrecord->num_bytes = num_bytes;
8149         qrecord->old_roots = NULL;
8150
8151         delayed_refs = &trans->transaction->delayed_refs;
8152         spin_lock(&delayed_refs->lock);
8153         if (btrfs_qgroup_insert_dirty_extent(delayed_refs, qrecord))
8154                 kfree(qrecord);
8155         spin_unlock(&delayed_refs->lock);
8156
8157         return 0;
8158 }
8159
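/*
 * Record every on-disk file extent referenced by leaf @eb as a dirty extent
 * for qgroup accounting.  Inline extents and holes are skipped.
 */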
8160 static int account_leaf_items(struct btrfs_trans_handle *trans,
8161                               struct btrfs_root *root,
8162                               struct extent_buffer *eb)
8163 {
8164         int nr = btrfs_header_nritems(eb);
8165         int i, extent_type, ret;
8166         struct btrfs_key key;
8167         struct btrfs_file_extent_item *fi;
8168         u64 bytenr, num_bytes;
8169
8170         /* We can be called directly from walk_up_proc() */
8171         if (!root->fs_info->quota_enabled)
8172                 return 0;
8173
8174         for (i = 0; i < nr; i++) {
8175                 btrfs_item_key_to_cpu(eb, &key, i);
8176
8177                 if (key.type != BTRFS_EXTENT_DATA_KEY)
8178                         continue;
8179
8180                 fi = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item);
8181                 /* filter out non-qgroup-accountable extents */
8182                 extent_type = btrfs_file_extent_type(eb, fi);
8183
8184                 if (extent_type == BTRFS_FILE_EXTENT_INLINE)
8185                         continue;
8186
8187                 bytenr = btrfs_file_extent_disk_bytenr(eb, fi);
8188                 if (!bytenr)
8189                         continue;
8190
8191                 num_bytes = btrfs_file_extent_disk_num_bytes(eb, fi);
8192
8193                 ret = record_one_subtree_extent(trans, root, bytenr, num_bytes);
8194                 if (ret)
8195                         return ret;
8196         }
8197         return 0;
8198 }
8199
8200 /*
8201  * Walk up the tree from the bottom, freeing leaves and any interior
8202  * nodes which have had all slots visited. If a node (leaf or
8203  * interior) is freed, the node above it will have its slot
8204  * incremented. The root node will never be freed.
8205  *
8206  * At the end of this function, we should have a path which has all
8207  * slots incremented to the next position for a search. If we need to
8208  * read a new node it will be NULL and the node above it will have the
8209  * correct slot selected for a later read.
8210  *
8211  * If we increment the root node's slot counter past the number of
8212  * elements, 1 is returned to signal completion of the search.
8213  */
8214 static int adjust_slots_upwards(struct btrfs_root *root,
8215                                 struct btrfs_path *path, int root_level)
8216 {
8217         int level = 0;
8218         int nr, slot;
8219         struct extent_buffer *eb;
8220
8221         if (root_level == 0)
8222                 return 1;
8223
8224         while (level <= root_level) {
8225                 eb = path->nodes[level];
8226                 nr = btrfs_header_nritems(eb);
8227                 path->slots[level]++;
8228                 slot = path->slots[level];
8229                 if (slot >= nr || level == 0) {
8230                         /*
8231                          * Don't free the root; we will detect this
8232                          * condition after our loop and return a
8233                          * positive value for the caller to stop walking the tree.
8234                          */
8235                         if (level != root_level) {
8236                                 btrfs_tree_unlock_rw(eb, path->locks[level]);
8237                                 path->locks[level] = 0;
8238
8239                                 free_extent_buffer(eb);
8240                                 path->nodes[level] = NULL;
8241                                 path->slots[level] = 0;
8242                         }
8243                 } else {
8244                         /*
8245                          * We have a valid slot to walk back down
8246                          * from. Stop here so caller can process these
8247                          * new nodes.
8248                          */
8249                         break;
8250                 }
8251
8252                 level++;
8253         }
8254
8255         eb = path->nodes[root_level];
8256         if (path->slots[root_level] >= btrfs_header_nritems(eb))
8257                 return 1;
8258
8259         return 0;
8260 }
8261
8262 /*
8263  * root_eb is the subtree root and is locked before this function is called.
8264  */
8265 static int account_shared_subtree(struct btrfs_trans_handle *trans,
8266                                   struct btrfs_root *root,
8267                                   struct extent_buffer *root_eb,
8268                                   u64 root_gen,
8269                                   int root_level)
8270 {
8271         int ret = 0;
8272         int level;
8273         struct extent_buffer *eb = root_eb;
8274         struct btrfs_path *path = NULL;
8275
8276         BUG_ON(root_level < 0 || root_level > BTRFS_MAX_LEVEL);
8277         BUG_ON(root_eb == NULL);
8278
8279         if (!root->fs_info->quota_enabled)
8280                 return 0;
8281
8282         if (!extent_buffer_uptodate(root_eb)) {
8283                 ret = btrfs_read_buffer(root_eb, root_gen);
8284                 if (ret)
8285                         goto out;
8286         }
8287
8288         if (root_level == 0) {
8289                 ret = account_leaf_items(trans, root, root_eb);
8290                 goto out;
8291         }
8292
8293         path = btrfs_alloc_path();
8294         if (!path)
8295                 return -ENOMEM;
8296
8297         /*
8298          * Walk down the tree.  Missing extent blocks are filled in as
8299          * we go. Metadata is accounted every time we read a new
8300          * extent block.
8301          *
8302          * When we reach a leaf, we account for file extent items in it,
8303          * walk back up the tree (adjusting slot pointers as we go)
8304          * and restart the search process.
8305          */
8306         extent_buffer_get(root_eb); /* For path */
8307         path->nodes[root_level] = root_eb;
8308         path->slots[root_level] = 0;
8309         path->locks[root_level] = 0; /* so release_path doesn't try to unlock */
8310 walk_down:
8311         level = root_level;
8312         while (level >= 0) {
8313                 if (path->nodes[level] == NULL) {
8314                         int parent_slot;
8315                         u64 child_gen;
8316                         u64 child_bytenr;
8317
8318                         /* We need to get child blockptr/gen from
8319                          * parent before we can read it. */
8320                         eb = path->nodes[level + 1];
8321                         parent_slot = path->slots[level + 1];
8322                         child_bytenr = btrfs_node_blockptr(eb, parent_slot);
8323                         child_gen = btrfs_node_ptr_generation(eb, parent_slot);
8324
8325                         eb = read_tree_block(root, child_bytenr, child_gen);
8326                         if (IS_ERR(eb)) {
8327                                 ret = PTR_ERR(eb);
8328                                 goto out;
8329                         } else if (!extent_buffer_uptodate(eb)) {
8330                                 free_extent_buffer(eb);
8331                                 ret = -EIO;
8332                                 goto out;
8333                         }
8334
8335                         path->nodes[level] = eb;
8336                         path->slots[level] = 0;
8337
8338                         btrfs_tree_read_lock(eb);
8339                         btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
8340                         path->locks[level] = BTRFS_READ_LOCK_BLOCKING;
8341
8342                         ret = record_one_subtree_extent(trans, root, child_bytenr,
8343                                                         root->nodesize);
8344                         if (ret)
8345                                 goto out;
8346                 }
8347
8348                 if (level == 0) {
8349                         ret = account_leaf_items(trans, root, path->nodes[level]);
8350                         if (ret)
8351                                 goto out;
8352
8353                         /* Nonzero return here means we completed our search */
8354                         ret = adjust_slots_upwards(root, path, root_level);
8355                         if (ret)
8356                                 break;
8357
8358                         /* Restart search with new slots */
8359                         goto walk_down;
8360                 }
8361
8362                 level--;
8363         }
8364
8365         ret = 0;
8366 out:
8367         btrfs_free_path(path);
8368
8369         return ret;
8370 }
8371
8372 /*
8373  * helper to process tree block while walking down the tree.
8374  *
8375  * when wc->stage == UPDATE_BACKREF, this function updates
8376  * back refs for pointers in the block.
8377  *
8378  * NOTE: return value 1 means we should stop walking down.
8379  */
8380 static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
8381                                    struct btrfs_root *root,
8382                                    struct btrfs_path *path,
8383                                    struct walk_control *wc, int lookup_info)
8384 {
8385         int level = wc->level;
8386         struct extent_buffer *eb = path->nodes[level];
8387         u64 flag = BTRFS_BLOCK_FLAG_FULL_BACKREF;
8388         int ret;
8389
8390         if (wc->stage == UPDATE_BACKREF &&
8391             btrfs_header_owner(eb) != root->root_key.objectid)
8392                 return 1;
8393
8394         /*
8395          * when the reference count of a tree block is 1, it won't increase
8396          * again. Once the full backref flag is set, we never clear it.
8397          */
8398         if (lookup_info &&
8399             ((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) ||
8400              (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag)))) {
8401                 BUG_ON(!path->locks[level]);
8402                 ret = btrfs_lookup_extent_info(trans, root,
8403                                                eb->start, level, 1,
8404                                                &wc->refs[level],
8405                                                &wc->flags[level]);
8406                 BUG_ON(ret == -ENOMEM);
8407                 if (ret)
8408                         return ret;
8409                 BUG_ON(wc->refs[level] == 0);
8410         }
8411
8412         if (wc->stage == DROP_REFERENCE) {
8413                 if (wc->refs[level] > 1)
8414                         return 1;
8415
8416                 if (path->locks[level] && !wc->keep_locks) {
8417                         btrfs_tree_unlock_rw(eb, path->locks[level]);
8418                         path->locks[level] = 0;
8419                 }
8420                 return 0;
8421         }
8422
8423         /* wc->stage == UPDATE_BACKREF */
8424         if (!(wc->flags[level] & flag)) {
8425                 BUG_ON(!path->locks[level]);
8426                 ret = btrfs_inc_ref(trans, root, eb, 1);
8427                 BUG_ON(ret); /* -ENOMEM */
8428                 ret = btrfs_dec_ref(trans, root, eb, 0);
8429                 BUG_ON(ret); /* -ENOMEM */
8430                 ret = btrfs_set_disk_extent_flags(trans, root, eb->start,
8431                                                   eb->len, flag,
8432                                                   btrfs_header_level(eb), 0);
8433                 BUG_ON(ret); /* -ENOMEM */
8434                 wc->flags[level] |= flag;
8435         }
8436
8437         /*
8438          * the block is shared by multiple trees, so it's not good to
8439          * keep the tree lock
8440          */
8441         if (path->locks[level] && level > 0) {
8442                 btrfs_tree_unlock_rw(eb, path->locks[level]);
8443                 path->locks[level] = 0;
8444         }
8445         return 0;
8446 }
8447
8448 /*
8449  * helper to process tree block pointer.
8450  *
8451  * when wc->stage == DROP_REFERENCE, this function checks
8452  * reference count of the block pointed to. if the block
8453  * is shared and we need to update back refs for the subtree
8454  * rooted at the block, this function changes wc->stage to
8455  * UPDATE_BACKREF. if the block is shared and there is no
8456  * need to update backrefs, this function drops the reference
8457  * to the block.
8458  *
8459  * NOTE: return value 1 means we should stop walking down.
8460  */
8461 static noinline int do_walk_down(struct btrfs_trans_handle *trans,
8462                                  struct btrfs_root *root,
8463                                  struct btrfs_path *path,
8464                                  struct walk_control *wc, int *lookup_info)
8465 {
8466         u64 bytenr;
8467         u64 generation;
8468         u64 parent;
8469         u32 blocksize;
8470         struct btrfs_key key;
8471         struct extent_buffer *next;
8472         int level = wc->level;
8473         int reada = 0;
8474         int ret = 0;
8475         bool need_account = false;
8476
8477         generation = btrfs_node_ptr_generation(path->nodes[level],
8478                                                path->slots[level]);
8479         /*
8480          * if the lower level block was created before the snapshot
8481          * was created, we know there is no need to update back refs
8482          * for the subtree
8483          */
8484         if (wc->stage == UPDATE_BACKREF &&
8485             generation <= root->root_key.offset) {
8486                 *lookup_info = 1;
8487                 return 1;
8488         }
8489
8490         bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]);
8491         blocksize = root->nodesize;
8492
8493         next = btrfs_find_tree_block(root->fs_info, bytenr);
8494         if (!next) {
8495                 next = btrfs_find_create_tree_block(root, bytenr);
8496                 if (!next)
8497                         return -ENOMEM;
8498                 btrfs_set_buffer_lockdep_class(root->root_key.objectid, next,
8499                                                level - 1);
8500                 reada = 1;
8501         }
8502         btrfs_tree_lock(next);
8503         btrfs_set_lock_blocking(next);
8504
8505         ret = btrfs_lookup_extent_info(trans, root, bytenr, level - 1, 1,
8506                                        &wc->refs[level - 1],
8507                                        &wc->flags[level - 1]);
8508         if (ret < 0)
8509                 goto out_unlock;
8510
8511         if (unlikely(wc->refs[level - 1] == 0)) {
8512                 btrfs_err(root->fs_info, "Missing references.");
8513                 ret = -EIO;
8514                 goto out_unlock;
8515         }
8516         *lookup_info = 0;
8517
8518         if (wc->stage == DROP_REFERENCE) {
8519                 if (wc->refs[level - 1] > 1) {
8520                         need_account = true;
8521                         if (level == 1 &&
8522                             (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
8523                                 goto skip;
8524
8525                         if (!wc->update_ref ||
8526                             generation <= root->root_key.offset)
8527                                 goto skip;
8528
8529                         btrfs_node_key_to_cpu(path->nodes[level], &key,
8530                                               path->slots[level]);
8531                         ret = btrfs_comp_cpu_keys(&key, &wc->update_progress);
8532                         if (ret < 0)
8533                                 goto skip;
8534
8535                         wc->stage = UPDATE_BACKREF;
8536                         wc->shared_level = level - 1;
8537                 }
8538         } else {
8539                 if (level == 1 &&
8540                     (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
8541                         goto skip;
8542         }
8543
8544         if (!btrfs_buffer_uptodate(next, generation, 0)) {
8545                 btrfs_tree_unlock(next);
8546                 free_extent_buffer(next);
8547                 next = NULL;
8548                 *lookup_info = 1;
8549         }
8550
8551         if (!next) {
8552                 if (reada && level == 1)
8553                         reada_walk_down(trans, root, wc, path);
8554                 next = read_tree_block(root, bytenr, generation);
8555                 if (IS_ERR(next)) {
8556                         return PTR_ERR(next);
8557                 } else if (!extent_buffer_uptodate(next)) {
8558                         free_extent_buffer(next);
8559                         return -EIO;
8560                 }
8561                 btrfs_tree_lock(next);
8562                 btrfs_set_lock_blocking(next);
8563         }
8564
8565         level--;
8566         ASSERT(level == btrfs_header_level(next));
8567         if (level != btrfs_header_level(next)) {
8568                 btrfs_err(root->fs_info, "mismatched level");
8569                 ret = -EIO;
8570                 goto out_unlock;
8571         }
8572         path->nodes[level] = next;
8573         path->slots[level] = 0;
8574         path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
8575         wc->level = level;
8576         if (wc->level == 1)
8577                 wc->reada_slot = 0;
8578         return 0;
8579 skip:
8580         wc->refs[level - 1] = 0;
8581         wc->flags[level - 1] = 0;
8582         if (wc->stage == DROP_REFERENCE) {
8583                 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
8584                         parent = path->nodes[level]->start;
8585                 } else {
8586                         ASSERT(root->root_key.objectid ==
8587                                btrfs_header_owner(path->nodes[level]));
8588                         if (root->root_key.objectid !=
8589                             btrfs_header_owner(path->nodes[level])) {
8590                                 btrfs_err(root->fs_info,
8591                                                 "mismatched block owner");
8592                                 ret = -EIO;
8593                                 goto out_unlock;
8594                         }
8595                         parent = 0;
8596                 }
8597
8598                 if (need_account) {
8599                         ret = account_shared_subtree(trans, root, next,
8600                                                      generation, level - 1);
8601                         if (ret) {
8602                                 btrfs_err_rl(root->fs_info,
8603                                         "Error %d accounting shared subtree. Quota is out of sync, rescan required.",
8604                                         ret);
8607                         }
8608                 }
8609                 ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent,
8610                                 root->root_key.objectid, level - 1, 0);
8611                 if (ret)
8612                         goto out_unlock;
8613         }
8614
8615         *lookup_info = 1;
8616         ret = 1;
8617
8618 out_unlock:
8619         btrfs_tree_unlock(next);
8620         free_extent_buffer(next);
8621
8622         return ret;
8623 }
8624
8625 /*
8626  * helper to process tree block while walking up the tree.
8627  *
8628  * when wc->stage == DROP_REFERENCE, this function drops
8629  * reference count on the block.
8630  *
8631  * when wc->stage == UPDATE_BACKREF, this function changes
8632  * wc->stage back to DROP_REFERENCE if we changed wc->stage
8633  * to UPDATE_BACKREF previously while processing the block.
8634  *
8635  * NOTE: return value 1 means we should stop walking up.
8636  */
8637 static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
8638                                  struct btrfs_root *root,
8639                                  struct btrfs_path *path,
8640                                  struct walk_control *wc)
8641 {
8642         int ret;
8643         int level = wc->level;
8644         struct extent_buffer *eb = path->nodes[level];
8645         u64 parent = 0;
8646
8647         if (wc->stage == UPDATE_BACKREF) {
8648                 BUG_ON(wc->shared_level < level);
8649                 if (level < wc->shared_level)
8650                         goto out;
8651
8652                 ret = find_next_key(path, level + 1, &wc->update_progress);
8653                 if (ret > 0)
8654                         wc->update_ref = 0;
8655
8656                 wc->stage = DROP_REFERENCE;
8657                 wc->shared_level = -1;
8658                 path->slots[level] = 0;
8659
8660                 /*
8661                  * check reference count again if the block isn't locked.
8662                  * we should start walking down the tree again if reference
8663                  * count is one.
8664                  */
8665                 if (!path->locks[level]) {
8666                         BUG_ON(level == 0);
8667                         btrfs_tree_lock(eb);
8668                         btrfs_set_lock_blocking(eb);
8669                         path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
8670
8671                         ret = btrfs_lookup_extent_info(trans, root,
8672                                                        eb->start, level, 1,
8673                                                        &wc->refs[level],
8674                                                        &wc->flags[level]);
8675                         if (ret < 0) {
8676                                 btrfs_tree_unlock_rw(eb, path->locks[level]);
8677                                 path->locks[level] = 0;
8678                                 return ret;
8679                         }
8680                         BUG_ON(wc->refs[level] == 0);
8681                         if (wc->refs[level] == 1) {
8682                                 btrfs_tree_unlock_rw(eb, path->locks[level]);
8683                                 path->locks[level] = 0;
8684                                 return 1;
8685                         }
8686                 }
8687         }
8688
8689         /* wc->stage == DROP_REFERENCE */
8690         BUG_ON(wc->refs[level] > 1 && !path->locks[level]);
8691
8692         if (wc->refs[level] == 1) {
8693                 if (level == 0) {
8694                         if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
8695                                 ret = btrfs_dec_ref(trans, root, eb, 1);
8696                         else
8697                                 ret = btrfs_dec_ref(trans, root, eb, 0);
8698                         BUG_ON(ret); /* -ENOMEM */
8699                         ret = account_leaf_items(trans, root, eb);
8700                         if (ret) {
8701                                 btrfs_err_rl(root->fs_info,
8702                                         "error %d accounting leaf items. Quota is out of sync, rescan required.",
8703                                         ret);
8706                         }
8707                 }
8708                 /* make block locked assertion in clean_tree_block happy */
8709                 if (!path->locks[level] &&
8710                     btrfs_header_generation(eb) == trans->transid) {
8711                         btrfs_tree_lock(eb);
8712                         btrfs_set_lock_blocking(eb);
8713                         path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
8714                 }
8715                 clean_tree_block(trans, root->fs_info, eb);
8716         }
8717
8718         if (eb == root->node) {
8719                 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
8720                         parent = eb->start;
8721                 else if (root->root_key.objectid != btrfs_header_owner(eb))
8722                         goto owner_mismatch;
8723         } else {
8724                 if (wc->flags[level + 1] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
8725                         parent = path->nodes[level + 1]->start;
8726                 else if (root->root_key.objectid !=
8727                          btrfs_header_owner(path->nodes[level + 1]))
8728                         goto owner_mismatch;
8729         }
8730
8731         btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1);
8732 out:
8733         wc->refs[level] = 0;
8734         wc->flags[level] = 0;
8735         return 0;
8736
8737 owner_mismatch:
8738         btrfs_err_rl(root->fs_info, "unexpected tree owner, have %llu expect %llu",
8739                      btrfs_header_owner(eb), root->root_key.objectid);
8740         return -EUCLEAN;
8741 }
8742
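/*
 * Walk down from wc->level towards the leaves, processing one block per
 * iteration via walk_down_proc()/do_walk_down().  Stops when a leaf is
 * reached, when a shared block ends the descent, or when the current node
 * has no more slots to visit.
 */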
8743 static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
8744                                    struct btrfs_root *root,
8745                                    struct btrfs_path *path,
8746                                    struct walk_control *wc)
8747 {
8748         int level = wc->level;
8749         int lookup_info = 1;
8750         int ret;
8751
8752         while (level >= 0) {
8753                 ret = walk_down_proc(trans, root, path, wc, lookup_info);
8754                 if (ret > 0)
8755                         break;
8756
8757                 if (level == 0)
8758                         break;
8759
8760                 if (path->slots[level] >=
8761                     btrfs_header_nritems(path->nodes[level]))
8762                         break;
8763
8764                 ret = do_walk_down(trans, root, path, wc, &lookup_info);
8765                 if (ret > 0) {
8766                         path->slots[level]++;
8767                         continue;
8768                 } else if (ret < 0)
8769                         return ret;
8770                 level = wc->level;
8771         }
8772         return 0;
8773 }
8774
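/*
 * Walk back up from wc->level, dropping fully visited blocks via
 * walk_up_proc().  Returns 0 when another slot is found to descend into
 * and 1 once everything below @max_level has been processed.
 */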
8775 static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
8776                                  struct btrfs_root *root,
8777                                  struct btrfs_path *path,
8778                                  struct walk_control *wc, int max_level)
8779 {
8780         int level = wc->level;
8781         int ret;
8782
8783         path->slots[level] = btrfs_header_nritems(path->nodes[level]);
8784         while (level < max_level && path->nodes[level]) {
8785                 wc->level = level;
8786                 if (path->slots[level] + 1 <
8787                     btrfs_header_nritems(path->nodes[level])) {
8788                         path->slots[level]++;
8789                         return 0;
8790                 } else {
8791                         ret = walk_up_proc(trans, root, path, wc);
8792                         if (ret > 0)
8793                                 return 0;
8794                         if (ret < 0)
8795                                 return ret;
8796
8797                         if (path->locks[level]) {
8798                                 btrfs_tree_unlock_rw(path->nodes[level],
8799                                                      path->locks[level]);
8800                                 path->locks[level] = 0;
8801                         }
8802                         free_extent_buffer(path->nodes[level]);
8803                         path->nodes[level] = NULL;
8804                         level++;
8805                 }
8806         }
8807         return 1;
8808 }
8809
8810 /*
8811  * drop a subvolume tree.
8812  *
8813  * this function traverses the tree freeing any blocks that are only
8814  * referenced by the tree.
8815  *
8816  * when a shared tree block is found, this function decreases its
8817  * reference count by one. if update_ref is true, this function
8818  * also makes sure backrefs for the shared block and all lower level
8819  * blocks are properly updated.
8820  *
8821  * If called with for_reloc == 0, may exit early with -EAGAIN
8822  */
8823 int btrfs_drop_snapshot(struct btrfs_root *root,
8824                          struct btrfs_block_rsv *block_rsv, int update_ref,
8825                          int for_reloc)
8826 {
8827         struct btrfs_path *path;
8828         struct btrfs_trans_handle *trans;
8829         struct btrfs_root *tree_root = root->fs_info->tree_root;
8830         struct btrfs_root_item *root_item = &root->root_item;
8831         struct walk_control *wc;
8832         struct btrfs_key key;
8833         int err = 0;
8834         int ret;
8835         int level;
8836         bool root_dropped = false;
8837
8838         btrfs_debug(root->fs_info, "Drop subvolume %llu", root->objectid);
8839
8840         path = btrfs_alloc_path();
8841         if (!path) {
8842                 err = -ENOMEM;
8843                 goto out;
8844         }
8845
8846         wc = kzalloc(sizeof(*wc), GFP_NOFS);
8847         if (!wc) {
8848                 btrfs_free_path(path);
8849                 err = -ENOMEM;
8850                 goto out;
8851         }
8852
8853         trans = btrfs_start_transaction(tree_root, 0);
8854         if (IS_ERR(trans)) {
8855                 err = PTR_ERR(trans);
8856                 goto out_free;
8857         }
8858
8859         if (block_rsv)
8860                 trans->block_rsv = block_rsv;
8861
8862         if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
8863                 level = btrfs_header_level(root->node);
8864                 path->nodes[level] = btrfs_lock_root_node(root);
8865                 btrfs_set_lock_blocking(path->nodes[level]);
8866                 path->slots[level] = 0;
8867                 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
8868                 memset(&wc->update_progress, 0,
8869                        sizeof(wc->update_progress));
8870         } else {
8871                 btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);
8872                 memcpy(&wc->update_progress, &key,
8873                        sizeof(wc->update_progress));
8874
8875                 level = root_item->drop_level;
8876                 BUG_ON(level == 0);
8877                 path->lowest_level = level;
8878                 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
8879                 path->lowest_level = 0;
8880                 if (ret < 0) {
8881                         err = ret;
8882                         goto out_end_trans;
8883                 }
8884                 WARN_ON(ret > 0);
8885
8886                 /*
8887                  * unlock our path, this is safe because only this
8888                  * function is allowed to delete this snapshot
8889                  */
8890                 btrfs_unlock_up_safe(path, 0);
8891
8892                 level = btrfs_header_level(root->node);
8893                 while (1) {
8894                         btrfs_tree_lock(path->nodes[level]);
8895                         btrfs_set_lock_blocking(path->nodes[level]);
8896                         path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
8897
8898                         ret = btrfs_lookup_extent_info(trans, root,
8899                                                 path->nodes[level]->start,
8900                                                 level, 1, &wc->refs[level],
8901                                                 &wc->flags[level]);
8902                         if (ret < 0) {
8903                                 err = ret;
8904                                 goto out_end_trans;
8905                         }
8906                         BUG_ON(wc->refs[level] == 0);
8907
8908                         if (level == root_item->drop_level)
8909                                 break;
8910
8911                         btrfs_tree_unlock(path->nodes[level]);
8912                         path->locks[level] = 0;
8913                         WARN_ON(wc->refs[level] != 1);
8914                         level--;
8915                 }
8916         }
8917
8918         wc->level = level;
8919         wc->shared_level = -1;
8920         wc->stage = DROP_REFERENCE;
8921         wc->update_ref = update_ref;
8922         wc->keep_locks = 0;
8923         wc->for_reloc = for_reloc;
8924         wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root);
8925
8926         while (1) {
8928                 ret = walk_down_tree(trans, root, path, wc);
8929                 if (ret < 0) {
8930                         err = ret;
8931                         break;
8932                 }
8933
8934                 ret = walk_up_tree(trans, root, path, wc, BTRFS_MAX_LEVEL);
8935                 if (ret < 0) {
8936                         err = ret;
8937                         break;
8938                 }
8939
8940                 if (ret > 0) {
8941                         BUG_ON(wc->stage != DROP_REFERENCE);
8942                         break;
8943                 }
8944
8945                 if (wc->stage == DROP_REFERENCE) {
8946                         level = wc->level;
8947                         btrfs_node_key(path->nodes[level],
8948                                        &root_item->drop_progress,
8949                                        path->slots[level]);
8950                         root_item->drop_level = level;
8951                 }
8952
8953                 BUG_ON(wc->level == 0);
8954                 if (btrfs_should_end_transaction(trans, tree_root) ||
8955                     (!for_reloc && btrfs_need_cleaner_sleep(root))) {
8956                         ret = btrfs_update_root(trans, tree_root,
8957                                                 &root->root_key,
8958                                                 root_item);
8959                         if (ret) {
8960                                 btrfs_abort_transaction(trans, tree_root, ret);
8961                                 err = ret;
8962                                 goto out_end_trans;
8963                         }
8964
8965                         btrfs_end_transaction_throttle(trans, tree_root);
8966                         if (!for_reloc && btrfs_need_cleaner_sleep(root)) {
8967                                 pr_debug("BTRFS: drop snapshot early exit\n");
8968                                 err = -EAGAIN;
8969                                 goto out_free;
8970                         }
8971
8972                         trans = btrfs_start_transaction(tree_root, 0);
8973                         if (IS_ERR(trans)) {
8974                                 err = PTR_ERR(trans);
8975                                 goto out_free;
8976                         }
8977                         if (block_rsv)
8978                                 trans->block_rsv = block_rsv;
8979                 }
8980         }
8981         btrfs_release_path(path);
8982         if (err)
8983                 goto out_end_trans;
8984
8985         ret = btrfs_del_root(trans, tree_root, &root->root_key);
8986         if (ret) {
8987                 btrfs_abort_transaction(trans, tree_root, ret);
8988                 goto out_end_trans;
8989         }
8990
8991         if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
8992                 ret = btrfs_find_root(tree_root, &root->root_key, path,
8993                                       NULL, NULL);
8994                 if (ret < 0) {
8995                         btrfs_abort_transaction(trans, tree_root, ret);
8996                         err = ret;
8997                         goto out_end_trans;
8998                 } else if (ret > 0) {
8999                         /* if we fail to delete the orphan item this time
9000                          * around, it'll get picked up the next time.
9001                          *
9002                          * The most common failure here is just -ENOENT.
9003                          */
9004                         btrfs_del_orphan_item(trans, tree_root,
9005                                               root->root_key.objectid);
9006                 }
9007         }
9008
9009         if (test_bit(BTRFS_ROOT_IN_RADIX, &root->state)) {
9010                 btrfs_add_dropped_root(trans, root);
9011         } else {
9012                 free_extent_buffer(root->node);
9013                 free_extent_buffer(root->commit_root);
9014                 btrfs_put_fs_root(root);
9015         }
9016         root_dropped = true;
9017 out_end_trans:
9018         btrfs_end_transaction_throttle(trans, tree_root);
9019 out_free:
9020         kfree(wc);
9021         btrfs_free_path(path);
9022 out:
9023         /*
9024          * If we need to stop dropping the snapshot for whatever reason, we
9025          * need to make sure to add it back to the dead root list so that we
9026          * keep trying to do the work later.  This also cleans up roots we
9027          * don't have in the radix (like when we recover after a power failure
9028          * or unmount) so we don't leak memory.
9029          */
9030         if (!for_reloc && root_dropped == false)
9031                 btrfs_add_dead_root(root);
9032         if (err && err != -EAGAIN)
9033                 btrfs_std_error(root->fs_info, err, NULL);
9034         return err;
9035 }
9036
9037 /*
9038  * drop subtree rooted at tree block 'node'.
9039  *
9040  * NOTE: this function will unlock and release tree block 'node'.
9041  * Only used by the relocation code.
9042  */
9043 int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
9044                         struct btrfs_root *root,
9045                         struct extent_buffer *node,
9046                         struct extent_buffer *parent)
9047 {
9048         struct btrfs_path *path;
9049         struct walk_control *wc;
9050         int level;
9051         int parent_level;
9052         int ret = 0;
9053         int wret;
9054
9055         BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
9056
9057         path = btrfs_alloc_path();
9058         if (!path)
9059                 return -ENOMEM;
9060
9061         wc = kzalloc(sizeof(*wc), GFP_NOFS);
9062         if (!wc) {
9063                 btrfs_free_path(path);
9064                 return -ENOMEM;
9065         }
9066
9067         btrfs_assert_tree_locked(parent);
9068         parent_level = btrfs_header_level(parent);
9069         extent_buffer_get(parent);
9070         path->nodes[parent_level] = parent;
9071         path->slots[parent_level] = btrfs_header_nritems(parent);
9072
9073         btrfs_assert_tree_locked(node);
9074         level = btrfs_header_level(node);
9075         path->nodes[level] = node;
9076         path->slots[level] = 0;
9077         path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
9078
9079         wc->refs[parent_level] = 1;
9080         wc->flags[parent_level] = BTRFS_BLOCK_FLAG_FULL_BACKREF;
9081         wc->level = level;
9082         wc->shared_level = -1;
9083         wc->stage = DROP_REFERENCE;
9084         wc->update_ref = 0;
9085         wc->keep_locks = 1;
9086         wc->for_reloc = 1;
9087         wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root);
9088
9089         while (1) {
9090                 wret = walk_down_tree(trans, root, path, wc);
9091                 if (wret < 0) {
9092                         ret = wret;
9093                         break;
9094                 }
9095
9096                 wret = walk_up_tree(trans, root, path, wc, parent_level);
9097                 if (wret < 0)
9098                         ret = wret;
9099                 if (wret != 0)
9100                         break;
9101         }
9102
9103         kfree(wc);
9104         btrfs_free_path(path);
9105         return ret;
9106 }
9107
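/*
 * Work out the profile a block group should be converted to when it is
 * relocated, based on any restripe target that is set and on the number of
 * writable devices (e.g. RAID1/RAID10 degrade to DUP on a single device).
 */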
9108 static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
9109 {
9110         u64 num_devices;
9111         u64 stripped;
9112
9113         /*
9114          * if restripe for this chunk_type is on, pick the target profile and
9115          * return, otherwise do the usual balance
9116          */
9117         stripped = get_restripe_target(root->fs_info, flags);
9118         if (stripped)
9119                 return extended_to_chunk(stripped);
9120
9121         num_devices = root->fs_info->fs_devices->rw_devices;
9122
9123         stripped = BTRFS_BLOCK_GROUP_RAID0 |
9124                 BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6 |
9125                 BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10;
9126
9127         if (num_devices == 1) {
9128                 stripped |= BTRFS_BLOCK_GROUP_DUP;
9129                 stripped = flags & ~stripped;
9130
9131                 /* turn raid0 into single device chunks */
9132                 if (flags & BTRFS_BLOCK_GROUP_RAID0)
9133                         return stripped;
9134
9135                 /* turn mirroring into duplication */
9136                 if (flags & (BTRFS_BLOCK_GROUP_RAID1 |
9137                              BTRFS_BLOCK_GROUP_RAID10))
9138                         return stripped | BTRFS_BLOCK_GROUP_DUP;
9139         } else {
9140                 /* they already had raid on here, just return */
9141                 if (flags & stripped)
9142                         return flags;
9143
9144                 stripped |= BTRFS_BLOCK_GROUP_DUP;
9145                 stripped = flags & ~stripped;
9146
9147                 /* switch duplicated blocks with raid1 */
9148                 if (flags & BTRFS_BLOCK_GROUP_DUP)
9149                         return stripped | BTRFS_BLOCK_GROUP_RAID1;
9150
9151                 /* this is drive concat, leave it alone */
9152         }
9153
9154         return flags;
9155 }
9156
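/*
 * Try to mark @cache read-only, moving its unused space into the
 * space_info's bytes_readonly.  Fails with -ENOSPC if the space_info
 * cannot afford it; for metadata/system space a small minimum is kept
 * free unless @force is set.
 */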
9157 static int inc_block_group_ro(struct btrfs_block_group_cache *cache, int force)
9158 {
9159         struct btrfs_space_info *sinfo = cache->space_info;
9160         u64 num_bytes;
9161         u64 min_allocable_bytes;
9162         int ret = -ENOSPC;
9163
9164         /*
9165          * We need some metadata space and system metadata space for
9166          * allocating chunks in some corner cases, so keep a minimum free
9167          * unless we are forced to mark the block group read-only.
9168          */
9169         if ((sinfo->flags &
9170              (BTRFS_BLOCK_GROUP_SYSTEM | BTRFS_BLOCK_GROUP_METADATA)) &&
9171             !force)
9172                 min_allocable_bytes = 1 * 1024 * 1024;
9173         else
9174                 min_allocable_bytes = 0;
9175
9176         spin_lock(&sinfo->lock);
9177         spin_lock(&cache->lock);
9178
9179         if (cache->ro) {
9180                 cache->ro++;
9181                 ret = 0;
9182                 goto out;
9183         }
9184
9185         num_bytes = cache->key.offset - cache->reserved - cache->pinned -
9186                     cache->bytes_super - btrfs_block_group_used(&cache->item);
9187
9188         if (sinfo->bytes_used + sinfo->bytes_reserved + sinfo->bytes_pinned +
9189             sinfo->bytes_may_use + sinfo->bytes_readonly + num_bytes +
9190             min_allocable_bytes <= sinfo->total_bytes) {
9191                 sinfo->bytes_readonly += num_bytes;
9192                 cache->ro++;
9193                 list_add_tail(&cache->ro_list, &sinfo->ro_bgs);
9194                 ret = 0;
9195         }
9196 out:
9197         spin_unlock(&cache->lock);
9198         spin_unlock(&sinfo->lock);
9199         return ret;
9200 }
9201
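/*
 * Make @cache read-only.  If the block group's profile needs to change, or
 * if setting it read-only would run the space_info out of room, allocate a
 * new chunk first and then retry.
 */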
9202 int btrfs_inc_block_group_ro(struct btrfs_root *root,
9203                              struct btrfs_block_group_cache *cache)
9204
9205 {
9206         struct btrfs_trans_handle *trans;
9207         u64 alloc_flags;
9208         int ret;
9209
9210 again:
9211         trans = btrfs_join_transaction(root);
9212         if (IS_ERR(trans))
9213                 return PTR_ERR(trans);
9214
9215         /*
9216          * we're not allowed to set block groups read-only after the dirty
9217          * block groups cache has started writing.  If it already started,
9218          * back off and let this transaction commit.
9219          */
9220         mutex_lock(&root->fs_info->ro_block_group_mutex);
9221         if (test_bit(BTRFS_TRANS_DIRTY_BG_RUN, &trans->transaction->flags)) {
9222                 u64 transid = trans->transid;
9223
9224                 mutex_unlock(&root->fs_info->ro_block_group_mutex);
9225                 btrfs_end_transaction(trans, root);
9226
9227                 ret = btrfs_wait_for_commit(root, transid);
9228                 if (ret)
9229                         return ret;
9230                 goto again;
9231         }
9232
9233         /*
9234          * if we are changing raid levels, try to allocate a corresponding
9235          * block group with the new raid level.
9236          */
9237         alloc_flags = update_block_group_flags(root, cache->flags);
9238         if (alloc_flags != cache->flags) {
9239                 ret = do_chunk_alloc(trans, root, alloc_flags,
9240                                      CHUNK_ALLOC_FORCE);
9241                 /*
9242                  * ENOSPC is allowed here, we may have enough space
9243                  * already allocated at the new raid level to
9244                  * carry on
9245                  */
9246                 if (ret == -ENOSPC)
9247                         ret = 0;
9248                 if (ret < 0)
9249                         goto out;
9250         }
9251
9252         ret = inc_block_group_ro(cache, 0);
9253         if (!ret)
9254                 goto out;
9255         alloc_flags = get_alloc_profile(root, cache->space_info->flags);
9256         ret = do_chunk_alloc(trans, root, alloc_flags,
9257                              CHUNK_ALLOC_FORCE);
9258         if (ret < 0)
9259                 goto out;
9260         ret = inc_block_group_ro(cache, 0);
9261 out:
9262         if (cache->flags & BTRFS_BLOCK_GROUP_SYSTEM) {
9263                 alloc_flags = update_block_group_flags(root, cache->flags);
9264                 lock_chunks(root->fs_info->chunk_root);
9265                 check_system_chunk(trans, root, alloc_flags);
9266                 unlock_chunks(root->fs_info->chunk_root);
9267         }
9268         mutex_unlock(&root->fs_info->ro_block_group_mutex);
9269
9270         btrfs_end_transaction(trans, root);
9271         return ret;
9272 }
9273
9274 int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans,
9275                             struct btrfs_root *root, u64 type)
9276 {
9277         u64 alloc_flags = get_alloc_profile(root, type);
9278         return do_chunk_alloc(trans, root, alloc_flags,
9279                               CHUNK_ALLOC_FORCE);
9280 }
9281
9282 /*
9283  * helper to account the unused space of all the readonly block groups in the
9284  * space_info. takes mirrors into account.
9285  */
9286 u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo)
9287 {
9288         struct btrfs_block_group_cache *block_group;
9289         u64 free_bytes = 0;
9290         int factor;
9291
9292         /* It's df, we don't care if it's racy */
9293         if (list_empty(&sinfo->ro_bgs))
9294                 return 0;
9295
9296         spin_lock(&sinfo->lock);
9297         list_for_each_entry(block_group, &sinfo->ro_bgs, ro_list) {
9298                 spin_lock(&block_group->lock);
9299
9300                 if (!block_group->ro) {
9301                         spin_unlock(&block_group->lock);
9302                         continue;
9303                 }
9304
9305                 if (block_group->flags & (BTRFS_BLOCK_GROUP_RAID1 |
9306                                           BTRFS_BLOCK_GROUP_RAID10 |
9307                                           BTRFS_BLOCK_GROUP_DUP))
9308                         factor = 2;
9309                 else
9310                         factor = 1;
9311
9312                 free_bytes += (block_group->key.offset -
9313                                btrfs_block_group_used(&block_group->item)) *
9314                                factor;
9315
9316                 spin_unlock(&block_group->lock);
9317         }
9318         spin_unlock(&sinfo->lock);
9319
9320         return free_bytes;
9321 }
9322
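/*
 * Drop one read-only reference on @cache.  When the last reference is
 * gone, return the group's unused space to the space_info's writable
 * accounting and take the group off the ro_bgs list.
 */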
9323 void btrfs_dec_block_group_ro(struct btrfs_root *root,
9324                               struct btrfs_block_group_cache *cache)
9325 {
9326         struct btrfs_space_info *sinfo = cache->space_info;
9327         u64 num_bytes;
9328
9329         BUG_ON(!cache->ro);
9330
9331         spin_lock(&sinfo->lock);
9332         spin_lock(&cache->lock);
9333         if (!--cache->ro) {
9334                 num_bytes = cache->key.offset - cache->reserved -
9335                             cache->pinned - cache->bytes_super -
9336                             btrfs_block_group_used(&cache->item);
9337                 sinfo->bytes_readonly -= num_bytes;
9338                 list_del_init(&cache->ro_list);
9339         }
9340         spin_unlock(&cache->lock);
9341         spin_unlock(&sinfo->lock);
9342 }
9343
9344 /*
9345  * checks to see if it's even possible to relocate this block group.
9346  *
9347  * @return - -1 if it's not a good idea to relocate this block group, 0 if it's
9348  * ok to go ahead and try.
9349  */
9350 int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
9351 {
9352         struct btrfs_block_group_cache *block_group;
9353         struct btrfs_space_info *space_info;
9354         struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
9355         struct btrfs_device *device;
9356         struct btrfs_trans_handle *trans;
9357         u64 min_free;
9358         u64 dev_min = 1;
9359         u64 dev_nr = 0;
9360         u64 target;
9361         int index;
9362         int full = 0;
9363         int ret = 0;
9364
9365         block_group = btrfs_lookup_block_group(root->fs_info, bytenr);
9366
9367         /* odd, couldn't find the block group, leave it alone */
9368         if (!block_group)
9369                 return -1;
9370
9371         min_free = btrfs_block_group_used(&block_group->item);
9372
9373         /* no bytes used, we're good */
9374         if (!min_free)
9375                 goto out;
9376
9377         space_info = block_group->space_info;
9378         spin_lock(&space_info->lock);
9379
9380         full = space_info->full;
9381
9382         /*
9383          * if this is the last block group we have in this space, we can't
9384          * relocate it unless we're able to allocate a new chunk below.
9385          *
9386          * Otherwise, we need to make sure we have room in the space to handle
9387          * all of the extents from this block group.  If we can, we're good
9388          */
9389         if ((space_info->total_bytes != block_group->key.offset) &&
9390             (space_info->bytes_used + space_info->bytes_reserved +
9391              space_info->bytes_pinned + space_info->bytes_readonly +
9392              min_free < space_info->total_bytes)) {
9393                 spin_unlock(&space_info->lock);
9394                 goto out;
9395         }
9396         spin_unlock(&space_info->lock);
9397
9398         /*
9399          * ok we don't have enough space, but maybe we have free space on our
9400          * devices to allocate new chunks for relocation, so loop through our
9401          * alloc devices and guess if we have enough space.  if this block
9402          * group is going to be restriped, run checks against the target
9403          * profile instead of the current one.
9404          */
9405         ret = -1;
9406
9407         /*
9408          * index:
9409          *      0: raid10
9410          *      1: raid1
9411          *      2: dup
9412          *      3: raid0
9413          *      4: single
9414          */
9415         target = get_restripe_target(root->fs_info, block_group->flags);
9416         if (target) {
9417                 index = __get_raid_index(extended_to_chunk(target));
9418         } else {
9419                 /*
9420                  * this is just a balance, so if we were marked as full
9421                  * we know there is no space for a new chunk
9422                  */
9423                 if (full)
9424                         goto out;
9425
9426                 index = get_block_group_index(block_group);
9427         }
9428
9429         if (index == BTRFS_RAID_RAID10) {
9430                 dev_min = 4;
9431                 /* Divide by 2 */
9432                 min_free >>= 1;
9433         } else if (index == BTRFS_RAID_RAID1) {
9434                 dev_min = 2;
9435         } else if (index == BTRFS_RAID_DUP) {
9436                 /* Multiply by 2 */
9437                 min_free <<= 1;
9438         } else if (index == BTRFS_RAID_RAID0) {
9439                 dev_min = fs_devices->rw_devices;
9440                 min_free = div64_u64(min_free, dev_min);
9441         }
9442
9443         /* We need to do this so that we can look at pending chunks */
9444         trans = btrfs_join_transaction(root);
9445         if (IS_ERR(trans)) {
9446                 ret = PTR_ERR(trans);
9447                 goto out;
9448         }
9449
9450         mutex_lock(&root->fs_info->chunk_mutex);
9451         list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
9452                 u64 dev_offset;
9453
9454                 /*
9455                  * check to make sure we can actually find a chunk with enough
9456                  * space to fit our block group in.
9457                  */
9458                 if (device->total_bytes > device->bytes_used + min_free &&
9459                     !device->is_tgtdev_for_dev_replace) {
9460                         ret = find_free_dev_extent(trans, device, min_free,
9461                                                    &dev_offset, NULL);
9462                         if (!ret)
9463                                 dev_nr++;
9464
9465                         if (dev_nr >= dev_min)
9466                                 break;
9467
9468                         ret = -1;
9469                 }
9470         }
9471         mutex_unlock(&root->fs_info->chunk_mutex);
9472         btrfs_end_transaction(trans, root);
9473 out:
9474         btrfs_put_block_group(block_group);
9475         return ret;
9476 }
9477
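/*
 * Walk the extent tree starting at @key and position @path at the first
 * BLOCK_GROUP_ITEM found.  Returns 0 if one was found, > 0 if the search
 * ran off the end of the tree, or < 0 on error.
 */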
9478 static int find_first_block_group(struct btrfs_root *root,
9479                 struct btrfs_path *path, struct btrfs_key *key)
9480 {
9481         int ret = 0;
9482         struct btrfs_key found_key;
9483         struct extent_buffer *leaf;
9484         int slot;
9485
9486         ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
9487         if (ret < 0)
9488                 goto out;
9489
9490         while (1) {
9491                 slot = path->slots[0];
9492                 leaf = path->nodes[0];
9493                 if (slot >= btrfs_header_nritems(leaf)) {
9494                         ret = btrfs_next_leaf(root, path);
9495                         if (ret == 0)
9496                                 continue;
9497                         if (ret < 0)
9498                                 goto out;
9499                         break;
9500                 }
9501                 btrfs_item_key_to_cpu(leaf, &found_key, slot);
9502
9503                 if (found_key.objectid >= key->objectid &&
9504                     found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) {
9505                         ret = 0;
9506                         goto out;
9507                 }
9508                 path->slots[0]++;
9509         }
9510 out:
9511         return ret;
9512 }
9513
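/*
 * Release the inode references (iref) that block groups hold on their
 * free space cache inodes so the inodes can actually be freed.
 */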
9514 void btrfs_put_block_group_cache(struct btrfs_fs_info *info)
9515 {
9516         struct btrfs_block_group_cache *block_group;
9517         u64 last = 0;
9518
9519         while (1) {
9520                 struct inode *inode;
9521
9522                 block_group = btrfs_lookup_first_block_group(info, last);
9523                 while (block_group) {
9524                         wait_block_group_cache_done(block_group);
9525                         spin_lock(&block_group->lock);
9526                         if (block_group->iref)
9527                                 break;
9528                         spin_unlock(&block_group->lock);
9529                         block_group = next_block_group(info->tree_root,
9530                                                        block_group);
9531                 }
9532                 if (!block_group) {
9533                         if (last == 0)
9534                                 break;
9535                         last = 0;
9536                         continue;
9537                 }
9538
9539                 inode = block_group->inode;
9540                 block_group->iref = 0;
9541                 block_group->inode = NULL;
9542                 spin_unlock(&block_group->lock);
9543                 iput(inode);
9544                 last = block_group->key.objectid + block_group->key.offset;
9545                 btrfs_put_block_group(block_group);
9546         }
9547 }
9548
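/*
 * Tear down all in-memory block group caches and space_info structures.
 * Only called during the final stages of unmount, when nothing else can
 * be racing with us.
 */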
9549 int btrfs_free_block_groups(struct btrfs_fs_info *info)
9550 {
9551         struct btrfs_block_group_cache *block_group;
9552         struct btrfs_space_info *space_info;
9553         struct btrfs_caching_control *caching_ctl;
9554         struct rb_node *n;
9555
9556         down_write(&info->commit_root_sem);
9557         while (!list_empty(&info->caching_block_groups)) {
9558                 caching_ctl = list_entry(info->caching_block_groups.next,
9559                                          struct btrfs_caching_control, list);
9560                 list_del(&caching_ctl->list);
9561                 put_caching_control(caching_ctl);
9562         }
9563         up_write(&info->commit_root_sem);
9564
9565         spin_lock(&info->unused_bgs_lock);
9566         while (!list_empty(&info->unused_bgs)) {
9567                 block_group = list_first_entry(&info->unused_bgs,
9568                                                struct btrfs_block_group_cache,
9569                                                bg_list);
9570                 list_del_init(&block_group->bg_list);
9571                 btrfs_put_block_group(block_group);
9572         }
9573         spin_unlock(&info->unused_bgs_lock);
9574
9575         spin_lock(&info->block_group_cache_lock);
9576         while ((n = rb_last(&info->block_group_cache_tree)) != NULL) {
9577                 block_group = rb_entry(n, struct btrfs_block_group_cache,
9578                                        cache_node);
9579                 rb_erase(&block_group->cache_node,
9580                          &info->block_group_cache_tree);
9581                 RB_CLEAR_NODE(&block_group->cache_node);
9582                 spin_unlock(&info->block_group_cache_lock);
9583
9584                 down_write(&block_group->space_info->groups_sem);
9585                 list_del(&block_group->list);
9586                 up_write(&block_group->space_info->groups_sem);
9587
9588                 if (block_group->cached == BTRFS_CACHE_STARTED)
9589                         wait_block_group_cache_done(block_group);
9590
9591                 /*
9592                  * We haven't cached this block group, which means we could
9593                  * possibly have excluded extents on this block group.
9594                  */
9595                 if (block_group->cached == BTRFS_CACHE_NO ||
9596                     block_group->cached == BTRFS_CACHE_ERROR)
9597                         free_excluded_extents(info->extent_root, block_group);
9598
9599                 btrfs_remove_free_space_cache(block_group);
9600                 btrfs_put_block_group(block_group);
9601
9602                 spin_lock(&info->block_group_cache_lock);
9603         }
9604         spin_unlock(&info->block_group_cache_lock);
9605
9606         /* now that all the block groups are freed, go through and
9607          * free all the space_info structs.  This is only called during
9608          * the final stages of unmount, and so we know nobody is
9609          * using them.  We call synchronize_rcu() once before we start,
9610          * just to be on the safe side.
9611          */
9612         synchronize_rcu();
9613
9614         release_global_block_rsv(info);
9615
9616         while (!list_empty(&info->space_info)) {
9617                 int i;
9618
9619                 space_info = list_entry(info->space_info.next,
9620                                         struct btrfs_space_info,
9621                                         list);
9622                 if (btrfs_test_opt(info->tree_root, ENOSPC_DEBUG)) {
9623                         if (WARN_ON(space_info->bytes_pinned > 0 ||
9624                             space_info->bytes_reserved > 0 ||
9625                             space_info->bytes_may_use > 0)) {
9626                                 dump_space_info(space_info, 0, 0);
9627                         }
9628                 }
9629                 list_del(&space_info->list);
9630                 for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
9631                         struct kobject *kobj;
9632                         kobj = space_info->block_group_kobjs[i];
9633                         space_info->block_group_kobjs[i] = NULL;
9634                         if (kobj) {
9635                                 kobject_del(kobj);
9636                                 kobject_put(kobj);
9637                         }
9638                 }
9639                 kobject_del(&space_info->kobj);
9640                 kobject_put(&space_info->kobj);
9641         }
9642         return 0;
9643 }
9644
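/*
 * Add @cache to its space_info's per-RAID-profile list.  The first block
 * group of a given profile also gets a sysfs raid kobject.
 */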
9645 static void __link_block_group(struct btrfs_space_info *space_info,
9646                                struct btrfs_block_group_cache *cache)
9647 {
9648         int index = get_block_group_index(cache);
9649         bool first = false;
9650
9651         down_write(&space_info->groups_sem);
9652         if (list_empty(&space_info->block_groups[index]))
9653                 first = true;
9654         list_add_tail(&cache->list, &space_info->block_groups[index]);
9655         up_write(&space_info->groups_sem);
9656
9657         if (first) {
9658                 struct raid_kobject *rkobj;
9659                 int ret;
9660
9661                 rkobj = kzalloc(sizeof(*rkobj), GFP_NOFS);
9662                 if (!rkobj)
9663                         goto out_err;
9664                 rkobj->raid_type = index;
9665                 kobject_init(&rkobj->kobj, &btrfs_raid_ktype);
9666                 ret = kobject_add(&rkobj->kobj, &space_info->kobj,
9667                                   "%s", get_raid_name(index));
9668                 if (ret) {
9669                         kobject_put(&rkobj->kobj);
9670                         goto out_err;
9671                 }
9672                 space_info->block_group_kobjs[index] = &rkobj->kobj;
9673         }
9674
9675         return;
9676 out_err:
9677         pr_warn("BTRFS: failed to add kobject for block cache. ignoring.\n");
9678 }
9679
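/*
 * Allocate and initialize an in-memory block group cache for the range
 * [start, start + size).  The caller is responsible for inserting it into
 * the fs_info rbtree and linking it to a space_info.
 */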
9680 static struct btrfs_block_group_cache *
9681 btrfs_create_block_group_cache(struct btrfs_root *root, u64 start, u64 size)
9682 {
9683         struct btrfs_block_group_cache *cache;
9684
9685         cache = kzalloc(sizeof(*cache), GFP_NOFS);
9686         if (!cache)
9687                 return NULL;
9688
9689         cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl),
9690                                         GFP_NOFS);
9691         if (!cache->free_space_ctl) {
9692                 kfree(cache);
9693                 return NULL;
9694         }
9695
9696         cache->key.objectid = start;
9697         cache->key.offset = size;
9698         cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
9699
9700         cache->sectorsize = root->sectorsize;
9701         cache->fs_info = root->fs_info;
9702         cache->full_stripe_len = btrfs_full_stripe_len(root,
9703                                                &root->fs_info->mapping_tree,
9704                                                start);
9705         atomic_set(&cache->count, 1);
9706         spin_lock_init(&cache->lock);
9707         init_rwsem(&cache->data_rwsem);
9708         INIT_LIST_HEAD(&cache->list);
9709         INIT_LIST_HEAD(&cache->cluster_list);
9710         INIT_LIST_HEAD(&cache->bg_list);
9711         INIT_LIST_HEAD(&cache->ro_list);
9712         INIT_LIST_HEAD(&cache->dirty_list);
9713         INIT_LIST_HEAD(&cache->io_list);
9714         btrfs_init_free_space_ctl(cache);
9715         atomic_set(&cache->trimming, 0);
9716
9717         return cache;
9718 }
9719
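/*
 * Called at mount time: scan the extent tree for block group items and
 * build the in-memory block group caches and space_info accounting.
 */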
9720 int btrfs_read_block_groups(struct btrfs_root *root)
9721 {
9722         struct btrfs_path *path;
9723         int ret;
9724         struct btrfs_block_group_cache *cache;
9725         struct btrfs_fs_info *info = root->fs_info;
9726         struct btrfs_space_info *space_info;
9727         struct btrfs_key key;
9728         struct btrfs_key found_key;
9729         struct extent_buffer *leaf;
9730         int need_clear = 0;
9731         u64 cache_gen;
9732         u64 feature;
9733         int mixed;
9734
9735         feature = btrfs_super_incompat_flags(info->super_copy);
9736         mixed = !!(feature & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS);
9737
9738         root = info->extent_root;
9739         key.objectid = 0;
9740         key.offset = 0;
9741         key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
9742         path = btrfs_alloc_path();
9743         if (!path)
9744                 return -ENOMEM;
9745         path->reada = 1;
9746
9747         cache_gen = btrfs_super_cache_generation(root->fs_info->super_copy);
9748         if (btrfs_test_opt(root, SPACE_CACHE) &&
9749             btrfs_super_generation(root->fs_info->super_copy) != cache_gen)
9750                 need_clear = 1;
9751         if (btrfs_test_opt(root, CLEAR_CACHE))
9752                 need_clear = 1;
9753
9754         while (1) {
9755                 ret = find_first_block_group(root, path, &key);
9756                 if (ret > 0)
9757                         break;
9758                 if (ret != 0)
9759                         goto error;
9760
9761                 leaf = path->nodes[0];
9762                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
9763
9764                 cache = btrfs_create_block_group_cache(root, found_key.objectid,
9765                                                        found_key.offset);
9766                 if (!cache) {
9767                         ret = -ENOMEM;
9768                         goto error;
9769                 }
9770
9771                 if (need_clear) {
9772                         /*
9773                          * When we mount with an old space cache, we need to
9774                          * set BTRFS_DC_CLEAR and set the dirty flag.
9775                          *
9776                          * a) Setting 'BTRFS_DC_CLEAR' makes sure that we
9777                          *    truncate the old free space cache inode and
9778                          *    set up a new one.
9779                          * b) Setting the 'dirty' flag makes sure that we flush
9780                          *    the new space cache info onto disk.
9781                          */
9782                         if (btrfs_test_opt(root, SPACE_CACHE))
9783                                 cache->disk_cache_state = BTRFS_DC_CLEAR;
9784                 }
9785
9786                 read_extent_buffer(leaf, &cache->item,
9787                                    btrfs_item_ptr_offset(leaf, path->slots[0]),
9788                                    sizeof(cache->item));
9789                 cache->flags = btrfs_block_group_flags(&cache->item);
9790                 if (!mixed &&
9791                     ((cache->flags & BTRFS_BLOCK_GROUP_METADATA) &&
9792                     (cache->flags & BTRFS_BLOCK_GROUP_DATA))) {
9793                         btrfs_err(info,
9794 "bg %llu is a mixed block group but filesystem hasn't enabled mixed block groups",
9795                                   cache->key.objectid);
9796                         ret = -EINVAL;
9797                         goto error;
9798                 }
9799
9800                 key.objectid = found_key.objectid + found_key.offset;
9801                 btrfs_release_path(path);
9802
9803                 /*
9804                  * We need to exclude the super stripes now so that the space
9805                  * info has super bytes accounted for, otherwise we'll think
9806                  * we have more space than we actually do.
9807                  */
9808                 ret = exclude_super_stripes(root, cache);
9809                 if (ret) {
9810                         /*
9811                          * We may have excluded something, so call this just in
9812                          * case.
9813                          */
9814                         free_excluded_extents(root, cache);
9815                         btrfs_put_block_group(cache);
9816                         goto error;
9817                 }
9818
9819                 /*
9820                  * check for two cases: either we are full, and therefore
9821                  * don't need to bother with the caching work since we won't
9822                  * find any space, or we are empty, and we can just add all
9823                  * the space in and be done with it.  This saves us a lot of
9824                  * time, particularly in the full case.
9825                  */
9826                 if (found_key.offset == btrfs_block_group_used(&cache->item)) {
9827                         cache->last_byte_to_unpin = (u64)-1;
9828                         cache->cached = BTRFS_CACHE_FINISHED;
9829                         free_excluded_extents(root, cache);
9830                 } else if (btrfs_block_group_used(&cache->item) == 0) {
9831                         cache->last_byte_to_unpin = (u64)-1;
9832                         cache->cached = BTRFS_CACHE_FINISHED;
9833                         add_new_free_space(cache, root->fs_info,
9834                                            found_key.objectid,
9835                                            found_key.objectid +
9836                                            found_key.offset);
9837                         free_excluded_extents(root, cache);
9838                 }
9839
9840                 ret = btrfs_add_block_group_cache(root->fs_info, cache);
9841                 if (ret) {
9842                         btrfs_remove_free_space_cache(cache);
9843                         btrfs_put_block_group(cache);
9844                         goto error;
9845                 }
9846
9847                 ret = update_space_info(info, cache->flags, found_key.offset,
9848                                         btrfs_block_group_used(&cache->item),
9849                                         &space_info);
9850                 if (ret) {
9851                         btrfs_remove_free_space_cache(cache);
9852                         spin_lock(&info->block_group_cache_lock);
9853                         rb_erase(&cache->cache_node,
9854                                  &info->block_group_cache_tree);
9855                         RB_CLEAR_NODE(&cache->cache_node);
9856                         spin_unlock(&info->block_group_cache_lock);
9857                         btrfs_put_block_group(cache);
9858                         goto error;
9859                 }
9860
9861                 cache->space_info = space_info;
9862                 spin_lock(&cache->space_info->lock);
9863                 cache->space_info->bytes_readonly += cache->bytes_super;
9864                 spin_unlock(&cache->space_info->lock);
9865
9866                 __link_block_group(space_info, cache);
9867
9868                 set_avail_alloc_bits(root->fs_info, cache->flags);
9869                 if (btrfs_chunk_readonly(root, cache->key.objectid)) {
9870                         inc_block_group_ro(cache, 1);
9871                 } else if (btrfs_block_group_used(&cache->item) == 0) {
9872                         spin_lock(&info->unused_bgs_lock);
9873                         /* Should always be true but just in case. */
9874                         if (list_empty(&cache->bg_list)) {
9875                                 btrfs_get_block_group(cache);
9876                                 list_add_tail(&cache->bg_list,
9877                                               &info->unused_bgs);
9878                         }
9879                         spin_unlock(&info->unused_bgs_lock);
9880                 }
9881         }
9882
9883         list_for_each_entry_rcu(space_info, &root->fs_info->space_info, list) {
9884                 if (!(get_alloc_profile(root, space_info->flags) &
9885                       (BTRFS_BLOCK_GROUP_RAID10 |
9886                        BTRFS_BLOCK_GROUP_RAID1 |
9887                        BTRFS_BLOCK_GROUP_RAID5 |
9888                        BTRFS_BLOCK_GROUP_RAID6 |
9889                        BTRFS_BLOCK_GROUP_DUP)))
9890                         continue;
9891                 /*
9892                  * avoid allocating from un-mirrored block groups if there are
9893                  * mirrored block groups.
9894                  */
9895                 list_for_each_entry(cache,
9896                                 &space_info->block_groups[BTRFS_RAID_RAID0],
9897                                 list)
9898                         inc_block_group_ro(cache, 1);
9899                 list_for_each_entry(cache,
9900                                 &space_info->block_groups[BTRFS_RAID_SINGLE],
9901                                 list)
9902                         inc_block_group_ro(cache, 1);
9903         }
9904
9905         init_global_block_rsv(info);
9906         ret = 0;
9907 error:
9908         btrfs_free_path(path);
9909         return ret;
9910 }
9911
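/*
 * Insert the block group items for the block groups created during this
 * transaction (trans->new_bgs) into the extent tree and finish their
 * chunk allocation.
 */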
9912 void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
9913                                        struct btrfs_root *root)
9914 {
9915         struct btrfs_block_group_cache *block_group;
9916         struct btrfs_root *extent_root = root->fs_info->extent_root;
9917         struct btrfs_block_group_item item;
9918         struct btrfs_key key;
9919         int ret = 0;
9920         bool can_flush_pending_bgs = trans->can_flush_pending_bgs;
9921
9922         trans->can_flush_pending_bgs = false;
9923         while (!list_empty(&trans->new_bgs)) {
9924                 block_group = list_first_entry(&trans->new_bgs,
9925                                                struct btrfs_block_group_cache,
9926                                                bg_list);
9927                 if (ret)
9928                         goto next;
9929
9930                 spin_lock(&block_group->lock);
9931                 memcpy(&item, &block_group->item, sizeof(item));
9932                 memcpy(&key, &block_group->key, sizeof(key));
9933                 spin_unlock(&block_group->lock);
9934
9935                 ret = btrfs_insert_item(trans, extent_root, &key, &item,
9936                                         sizeof(item));
9937                 if (ret)
9938                         btrfs_abort_transaction(trans, extent_root, ret);
9939                 ret = btrfs_finish_chunk_alloc(trans, extent_root,
9940                                                key.objectid, key.offset);
9941                 if (ret)
9942                         btrfs_abort_transaction(trans, extent_root, ret);
9943 next:
9944                 list_del_init(&block_group->bg_list);
9945         }
9946         trans->can_flush_pending_bgs = can_flush_pending_bgs;
9947 }
9948
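/*
 * Create a new block group for the chunk at @chunk_offset and queue it on
 * trans->new_bgs; its item is inserted into the extent tree later by
 * btrfs_create_pending_block_groups().
 */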
9949 int btrfs_make_block_group(struct btrfs_trans_handle *trans,
9950                            struct btrfs_root *root, u64 bytes_used,
9951                            u64 type, u64 chunk_objectid, u64 chunk_offset,
9952                            u64 size)
9953 {
9954         int ret;
9955         struct btrfs_root *extent_root;
9956         struct btrfs_block_group_cache *cache;
9957
9958         extent_root = root->fs_info->extent_root;
9959
9960         btrfs_set_log_full_commit(root->fs_info, trans);
9961
9962         cache = btrfs_create_block_group_cache(root, chunk_offset, size);
9963         if (!cache)
9964                 return -ENOMEM;
9965
9966         btrfs_set_block_group_used(&cache->item, bytes_used);
9967         btrfs_set_block_group_chunk_objectid(&cache->item, chunk_objectid);
9968         btrfs_set_block_group_flags(&cache->item, type);
9969
9970         cache->flags = type;
9971         cache->last_byte_to_unpin = (u64)-1;
9972         cache->cached = BTRFS_CACHE_FINISHED;
9973         ret = exclude_super_stripes(root, cache);
9974         if (ret) {
9975                 /*
9976                  * We may have excluded something, so call this just in
9977                  * case.
9978                  */
9979                 free_excluded_extents(root, cache);
9980                 btrfs_put_block_group(cache);
9981                 return ret;
9982         }
9983
9984         add_new_free_space(cache, root->fs_info, chunk_offset,
9985                            chunk_offset + size);
9986
9987         free_excluded_extents(root, cache);
9988
9989 #ifdef CONFIG_BTRFS_DEBUG
9990         if (btrfs_should_fragment_free_space(root, cache)) {
9991                 u64 new_bytes_used = size - bytes_used;
9992
9993                 bytes_used += new_bytes_used >> 1;
9994                 fragment_free_space(root, cache);
9995         }
9996 #endif
9997         /*
9998          * Call to ensure the corresponding space_info object is created and
9999          * assigned to our block group, but don't update its counters just yet.
10000          * We want our bg to be added to the rbtree with its ->space_info set.
10001          */
10002         ret = update_space_info(root->fs_info, cache->flags, 0, 0,
10003                                 &cache->space_info);
10004         if (ret) {
10005                 btrfs_remove_free_space_cache(cache);
10006                 btrfs_put_block_group(cache);
10007                 return ret;
10008         }
10009
10010         ret = btrfs_add_block_group_cache(root->fs_info, cache);
10011         if (ret) {
10012                 btrfs_remove_free_space_cache(cache);
10013                 btrfs_put_block_group(cache);
10014                 return ret;
10015         }
10016
10017         /*
10018          * Now that our block group has its ->space_info set and is inserted in
10019          * the rbtree, update the space info's counters.
10020          */
10021         ret = update_space_info(root->fs_info, cache->flags, size, bytes_used,
10022                                 &cache->space_info);
10023         if (ret) {
10024                 btrfs_remove_free_space_cache(cache);
10025                 spin_lock(&root->fs_info->block_group_cache_lock);
10026                 rb_erase(&cache->cache_node,
10027                          &root->fs_info->block_group_cache_tree);
10028                 RB_CLEAR_NODE(&cache->cache_node);
10029                 spin_unlock(&root->fs_info->block_group_cache_lock);
10030                 btrfs_put_block_group(cache);
10031                 return ret;
10032         }
10033         update_global_block_rsv(root->fs_info);
10034
10035         spin_lock(&cache->space_info->lock);
10036         cache->space_info->bytes_readonly += cache->bytes_super;
10037         spin_unlock(&cache->space_info->lock);
10038
10039         __link_block_group(cache->space_info, cache);
10040
10041         list_add_tail(&cache->bg_list, &trans->new_bgs);
10042
10043         set_avail_alloc_bits(extent_root->fs_info, type);
10044
10045         return 0;
10046 }
10047
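/*
 * Clear the extended profile bits in the avail_*_alloc_bits masks for the
 * given block group flags, used when the last block group of a profile is
 * removed.
 */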
10048 static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
10049 {
10050         u64 extra_flags = chunk_to_extended(flags) &
10051                                 BTRFS_EXTENDED_PROFILE_MASK;
10052
10053         write_seqlock(&fs_info->profiles_lock);
10054         if (flags & BTRFS_BLOCK_GROUP_DATA)
10055                 fs_info->avail_data_alloc_bits &= ~extra_flags;
10056         if (flags & BTRFS_BLOCK_GROUP_METADATA)
10057                 fs_info->avail_metadata_alloc_bits &= ~extra_flags;
10058         if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
10059                 fs_info->avail_system_alloc_bits &= ~extra_flags;
10060         write_sequnlock(&fs_info->profiles_lock);
10061 }
10062
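/*
 * Delete a read-only block group: remove its free space cache inode and
 * item, unlink it from every list and the rbtree, adjust the space_info
 * counters and finally delete the block group item itself.
 */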
10063 int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
10064                              struct btrfs_root *root, u64 group_start,
10065                              struct extent_map *em)
10066 {
10067         struct btrfs_path *path;
10068         struct btrfs_block_group_cache *block_group;
10069         struct btrfs_free_cluster *cluster;
10070         struct btrfs_root *tree_root = root->fs_info->tree_root;
10071         struct btrfs_key key;
10072         struct inode *inode;
10073         struct kobject *kobj = NULL;
10074         int ret;
10075         int index;
10076         int factor;
10077         struct btrfs_caching_control *caching_ctl = NULL;
10078         bool remove_em;
10079
10080         root = root->fs_info->extent_root;
10081
10082         block_group = btrfs_lookup_block_group(root->fs_info, group_start);
10083         BUG_ON(!block_group);
10084         BUG_ON(!block_group->ro);
10085
10086         /*
10087          * Free the reserved super bytes from this block group before
10088          * removing it.
10089          */
10090         free_excluded_extents(root, block_group);
10091
10092         memcpy(&key, &block_group->key, sizeof(key));
10093         index = get_block_group_index(block_group);
10094         if (block_group->flags & (BTRFS_BLOCK_GROUP_DUP |
10095                                   BTRFS_BLOCK_GROUP_RAID1 |
10096                                   BTRFS_BLOCK_GROUP_RAID10))
10097                 factor = 2;
10098         else
10099                 factor = 1;
10100
10101         /* make sure this block group isn't part of an allocation cluster */
10102         cluster = &root->fs_info->data_alloc_cluster;
10103         spin_lock(&cluster->refill_lock);
10104         btrfs_return_cluster_to_free_space(block_group, cluster);
10105         spin_unlock(&cluster->refill_lock);
10106
10107         /*
10108          * make sure this block group isn't part of a metadata
10109          * allocation cluster
10110          */
10111         cluster = &root->fs_info->meta_alloc_cluster;
10112         spin_lock(&cluster->refill_lock);
10113         btrfs_return_cluster_to_free_space(block_group, cluster);
10114         spin_unlock(&cluster->refill_lock);
10115
10116         path = btrfs_alloc_path();
10117         if (!path) {
10118                 ret = -ENOMEM;
10119                 goto out;
10120         }
10121
10122         /*
10123          * get the inode first so any iput calls done for the io_list
10124          * aren't the final iput (no unlinks allowed now)
10125          */
10126         inode = lookup_free_space_inode(tree_root, block_group, path);
10127
10128         mutex_lock(&trans->transaction->cache_write_mutex);
10129         /*
10130          * make sure our free space cache IO is done before removing the
10131          * free space inode
10132          */
10133         spin_lock(&trans->transaction->dirty_bgs_lock);
10134         if (!list_empty(&block_group->io_list)) {
10135                 list_del_init(&block_group->io_list);
10136
10137                 WARN_ON(!IS_ERR(inode) && inode != block_group->io_ctl.inode);
10138
10139                 spin_unlock(&trans->transaction->dirty_bgs_lock);
10140                 btrfs_wait_cache_io(root, trans, block_group,
10141                                     &block_group->io_ctl, path,
10142                                     block_group->key.objectid);
10143                 btrfs_put_block_group(block_group);
10144                 spin_lock(&trans->transaction->dirty_bgs_lock);
10145         }
10146
10147         if (!list_empty(&block_group->dirty_list)) {
10148                 list_del_init(&block_group->dirty_list);
10149                 btrfs_put_block_group(block_group);
10150         }
10151         spin_unlock(&trans->transaction->dirty_bgs_lock);
10152         mutex_unlock(&trans->transaction->cache_write_mutex);
10153
10154         if (!IS_ERR(inode)) {
10155                 ret = btrfs_orphan_add(trans, inode);
10156                 if (ret) {
10157                         btrfs_add_delayed_iput(inode);
10158                         goto out;
10159                 }
10160                 clear_nlink(inode);
10161                 /* One for the block group's ref */
10162                 spin_lock(&block_group->lock);
10163                 if (block_group->iref) {
10164                         block_group->iref = 0;
10165                         block_group->inode = NULL;
10166                         spin_unlock(&block_group->lock);
10167                         iput(inode);
10168                 } else {
10169                         spin_unlock(&block_group->lock);
10170                 }
10171                 /* One for our lookup ref */
10172                 btrfs_add_delayed_iput(inode);
10173         }
10174
10175         key.objectid = BTRFS_FREE_SPACE_OBJECTID;
10176         key.offset = block_group->key.objectid;
10177         key.type = 0;
10178
10179         ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1);
10180         if (ret < 0)
10181                 goto out;
10182         if (ret > 0)
10183                 btrfs_release_path(path);
10184         if (ret == 0) {
10185                 ret = btrfs_del_item(trans, tree_root, path);
10186                 if (ret)
10187                         goto out;
10188                 btrfs_release_path(path);
10189         }
10190
10191         spin_lock(&root->fs_info->block_group_cache_lock);
10192         rb_erase(&block_group->cache_node,
10193                  &root->fs_info->block_group_cache_tree);
10194         RB_CLEAR_NODE(&block_group->cache_node);
10195
10196         if (root->fs_info->first_logical_byte == block_group->key.objectid)
10197                 root->fs_info->first_logical_byte = (u64)-1;
10198         spin_unlock(&root->fs_info->block_group_cache_lock);
10199
10200         down_write(&block_group->space_info->groups_sem);
10201         /*
10202          * we must use list_del_init so people can check to see if they
10203          * are still on the list after taking the semaphore
10204          */
10205         list_del_init(&block_group->list);
10206         if (list_empty(&block_group->space_info->block_groups[index])) {
10207                 kobj = block_group->space_info->block_group_kobjs[index];
10208                 block_group->space_info->block_group_kobjs[index] = NULL;
10209                 clear_avail_alloc_bits(root->fs_info, block_group->flags);
10210         }
10211         up_write(&block_group->space_info->groups_sem);
10212         if (kobj) {
10213                 kobject_del(kobj);
10214                 kobject_put(kobj);
10215         }
10216
10217         if (block_group->has_caching_ctl)
10218                 caching_ctl = get_caching_control(block_group);
10219         if (block_group->cached == BTRFS_CACHE_STARTED)
10220                 wait_block_group_cache_done(block_group);
10221         if (block_group->has_caching_ctl) {
10222                 down_write(&root->fs_info->commit_root_sem);
10223                 if (!caching_ctl) {
10224                         struct btrfs_caching_control *ctl;
10225
10226                         list_for_each_entry(ctl,
10227                                     &root->fs_info->caching_block_groups, list)
10228                                 if (ctl->block_group == block_group) {
10229                                         caching_ctl = ctl;
10230                                         atomic_inc(&caching_ctl->count);
10231                                         break;
10232                                 }
10233                 }
10234                 if (caching_ctl)
10235                         list_del_init(&caching_ctl->list);
10236                 up_write(&root->fs_info->commit_root_sem);
10237                 if (caching_ctl) {
10238                         /* Once for the caching bgs list and once for us. */
10239                         put_caching_control(caching_ctl);
10240                         put_caching_control(caching_ctl);
10241                 }
10242         }
10243
10244         spin_lock(&trans->transaction->dirty_bgs_lock);
10245         WARN_ON(!list_empty(&block_group->dirty_list));
10246         WARN_ON(!list_empty(&block_group->io_list));
10251         spin_unlock(&trans->transaction->dirty_bgs_lock);
10252         btrfs_remove_free_space_cache(block_group);
10253
10254         spin_lock(&block_group->space_info->lock);
10255         list_del_init(&block_group->ro_list);
10256
10257         if (btrfs_test_opt(root, ENOSPC_DEBUG)) {
10258                 WARN_ON(block_group->space_info->total_bytes
10259                         < block_group->key.offset);
10260                 WARN_ON(block_group->space_info->bytes_readonly
10261                         < block_group->key.offset);
10262                 WARN_ON(block_group->space_info->disk_total
10263                         < block_group->key.offset * factor);
10264         }
10265         block_group->space_info->total_bytes -= block_group->key.offset;
10266         block_group->space_info->bytes_readonly -= block_group->key.offset;
10267         block_group->space_info->disk_total -= block_group->key.offset * factor;
10268
10269         spin_unlock(&block_group->space_info->lock);
10270
10271         memcpy(&key, &block_group->key, sizeof(key));
10272
10273         lock_chunks(root);
10274         if (!list_empty(&em->list)) {
10275                 /* We're in the transaction->pending_chunks list. */
10276                 free_extent_map(em);
10277         }
10278         spin_lock(&block_group->lock);
10279         block_group->removed = 1;
10280         /*
10281          * At this point trimming can't start on this block group, because we
10282          * removed the block group from the tree fs_info->block_group_cache_tree
10283          * so no one can find it anymore and even if someone already got this
10284          * block group before we removed it from the rbtree, they have already
10285          * incremented block_group->trimming - if they didn't, they won't find
10286          * any free space entries because we already removed them all when we
10287          * called btrfs_remove_free_space_cache().
10288          *
10289          * And we must not remove the extent map from the fs_info->mapping_tree
10290          * to prevent the same logical address range and physical device space
10291          * ranges from being reused for a new block group. This is because our
10292          * fs trim operation (btrfs_trim_fs() / btrfs_ioctl_fitrim()) is
10293          * completely transactionless, so while it is trimming a range the
10294          * currently running transaction might finish and a new one start,
10295          * allowing for new block groups to be created that can reuse the same
10296          * physical device locations unless we take this special care.
10297          *
10298          * There may also be an implicit trim operation if the file system
10299          * is mounted with -odiscard. The same protections must remain
10300          * in place until the extents have been discarded completely when
10301          * the transaction commit has completed.
10302          */
10303         remove_em = (atomic_read(&block_group->trimming) == 0);
10304         /*
10305          * Make sure a trimmer task always sees the em in the pinned_chunks list
10306          * if it sees block_group->removed == 1 (needs to lock block_group->lock
10307          * before checking block_group->removed).
10308          */
10309         if (!remove_em) {
10310                 /*
10311                  * Our em might be in trans->transaction->pending_chunks which
10312                  * is protected by fs_info->chunk_mutex ([lock|unlock]_chunks),
10313                  * and so is the fs_info->pinned_chunks list.
10314                  *
10315                  * So at this point we must be holding the chunk_mutex to avoid
10316                  * any races with chunk allocation (more specifically at
10317                  * volumes.c:contains_pending_extent()), to ensure it always
10318                  * sees the em, either in the pending_chunks list or in the
10319                  * pinned_chunks list.
10320                  */
10321                 list_move_tail(&em->list, &root->fs_info->pinned_chunks);
10322         }
10323         spin_unlock(&block_group->lock);
10324
10325         if (remove_em) {
10326                 struct extent_map_tree *em_tree;
10327
10328                 em_tree = &root->fs_info->mapping_tree.map_tree;
10329                 write_lock(&em_tree->lock);
10330                 /*
10331                  * The em might be in the pending_chunks list, so make sure the
10332                  * chunk mutex is locked, since remove_extent_mapping() will
10333                  * delete us from that list.
10334                  */
10335                 remove_extent_mapping(em_tree, em);
10336                 write_unlock(&em_tree->lock);
10337                 /* once for the tree */
10338                 free_extent_map(em);
10339         }
10340
10341         unlock_chunks(root);
10342
10343         btrfs_put_block_group(block_group);
10344         btrfs_put_block_group(block_group);
10345
10346         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
10347         if (ret > 0)
10348                 ret = -EIO;
10349         if (ret < 0)
10350                 goto out;
10351
10352         ret = btrfs_del_item(trans, root, path);
10353 out:
10354         btrfs_free_path(path);
10355         return ret;
10356 }
10357
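/*
 * Start a transaction with enough metadata units reserved to remove the
 * block group whose chunk starts at @chunk_offset (see the reservation
 * breakdown in the comment below).
 */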
10358 struct btrfs_trans_handle *
10359 btrfs_start_trans_remove_block_group(struct btrfs_fs_info *fs_info,
10360                                      const u64 chunk_offset)
10361 {
10362         struct extent_map_tree *em_tree = &fs_info->mapping_tree.map_tree;
10363         struct extent_map *em;
10364         struct map_lookup *map;
10365         unsigned int num_items;
10366
10367         read_lock(&em_tree->lock);
10368         em = lookup_extent_mapping(em_tree, chunk_offset, 1);
10369         read_unlock(&em_tree->lock);
10370         ASSERT(em && em->start == chunk_offset);
10371
10372         /*
10373          * We need to reserve 3 + N units from the metadata space info in order
10374          * to remove a block group (done at btrfs_remove_chunk() and at
10375          * btrfs_remove_block_group()), which are used for:
10376          *
10377          * 1 unit for adding the free space inode's orphan (located in the tree
10378          * of tree roots).
10379          * 1 unit for deleting the block group item (located in the extent
10380          * tree).
10381          * 1 unit for deleting the free space item (located in tree of tree
10382          * roots).
10383          * N units for deleting N device extent items corresponding to each
10384          * stripe (located in the device tree).
10385          *
10386          * In order to remove a block group we also need to reserve units in the
10387          * system space info in order to update the chunk tree (update one or
10388          * more device items and remove one chunk item), but this is done at
10389          * btrfs_remove_chunk() through a call to check_system_chunk().
10390          */
10391         map = (struct map_lookup *)em->bdev;
10392         num_items = 3 + map->num_stripes;
10393         free_extent_map(em);
10394
10395         return btrfs_start_transaction_fallback_global_rsv(fs_info->extent_root,
10396                                                            num_items, 1);
10397 }
10398
10399 /*
10400  * Process the unused_bgs list and remove any that don't have any allocated
10401  * space inside of them.
10402  */
10403 void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
10404 {
10405         struct btrfs_block_group_cache *block_group;
10406         struct btrfs_space_info *space_info;
10407         struct btrfs_root *root = fs_info->extent_root;
10408         struct btrfs_trans_handle *trans;
10409         int ret = 0;
10410
10411         if (!fs_info->open)
10412                 return;
10413
10414         spin_lock(&fs_info->unused_bgs_lock);
10415         while (!list_empty(&fs_info->unused_bgs)) {
10416                 u64 start, end;
10417                 int trimming;
10418
10419                 block_group = list_first_entry(&fs_info->unused_bgs,
10420                                                struct btrfs_block_group_cache,
10421                                                bg_list);
10422                 list_del_init(&block_group->bg_list);
10423
10424                 space_info = block_group->space_info;
10425
10426                 if (ret || btrfs_mixed_space_info(space_info)) {
10427                         btrfs_put_block_group(block_group);
10428                         continue;
10429                 }
10430                 spin_unlock(&fs_info->unused_bgs_lock);
10431
10432                 mutex_lock(&fs_info->delete_unused_bgs_mutex);
10433
10434                 /* Don't want to race with allocators so take the groups_sem */
10435                 down_write(&space_info->groups_sem);
10436                 spin_lock(&block_group->lock);
10437                 if (block_group->reserved || block_group->pinned ||
10438                     btrfs_block_group_used(&block_group->item) ||
10439                     block_group->ro ||
10440                     list_is_singular(&block_group->list)) {
10441                         /*
10442                          * We want to bail if we made new allocations or have
10443                          * outstanding allocations in this block group.  We do
10444                          * the ro check in case balance is currently acting on
10445                          * this block group.
10446                          */
10447                         spin_unlock(&block_group->lock);
10448                         up_write(&space_info->groups_sem);
10449                         goto next;
10450                 }
10451                 spin_unlock(&block_group->lock);
10452
10453                 /* We don't want to force the issue, only flip if it's ok. */
10454                 ret = inc_block_group_ro(block_group, 0);
10455                 up_write(&space_info->groups_sem);
10456                 if (ret < 0) {
10457                         ret = 0;
10458                         goto next;
10459                 }
10460
10461                 /*
10462                  * Want to do this before we do anything else so we can recover
10463                  * properly if we fail to join the transaction.
10464                  */
10465                 trans = btrfs_start_trans_remove_block_group(fs_info,
10466                                                      block_group->key.objectid);
10467                 if (IS_ERR(trans)) {
10468                         btrfs_dec_block_group_ro(root, block_group);
10469                         ret = PTR_ERR(trans);
10470                         goto next;
10471                 }
10472
10473                 /*
10474                  * We could have pending pinned extents for this block group,
10475                  * just delete them, we don't care about them anymore.
10476                  */
10477                 start = block_group->key.objectid;
10478                 end = start + block_group->key.offset - 1;
10479                 /*
10480                  * Hold the unused_bg_unpin_mutex lock to avoid racing with
10481                  * btrfs_finish_extent_commit(). If we are at transaction N,
10482                  * another task might be running finish_extent_commit() for the
10483                  * previous transaction N - 1, and have seen a range belonging
10484                  * to the block group in freed_extents[] before we were able to
10485                  * clear the whole block group range from freed_extents[]. This
10486                  * means that task can lookup for the block group after we
10487                  * unpinned it from freed_extents[] and removed it, leading to
10488                  * a BUG_ON() at btrfs_unpin_extent_range().
10489                  */
10490                 mutex_lock(&fs_info->unused_bg_unpin_mutex);
10491                 ret = clear_extent_bits(&fs_info->freed_extents[0], start, end,
10492                                   EXTENT_DIRTY, GFP_NOFS);
10493                 if (ret) {
10494                         mutex_unlock(&fs_info->unused_bg_unpin_mutex);
10495                         btrfs_dec_block_group_ro(root, block_group);
10496                         goto end_trans;
10497                 }
10498                 ret = clear_extent_bits(&fs_info->freed_extents[1], start, end,
10499                                   EXTENT_DIRTY, GFP_NOFS);
10500                 if (ret) {
10501                         mutex_unlock(&fs_info->unused_bg_unpin_mutex);
10502                         btrfs_dec_block_group_ro(root, block_group);
10503                         goto end_trans;
10504                 }
10505                 mutex_unlock(&fs_info->unused_bg_unpin_mutex);
10506
10507                 /* Reset pinned so btrfs_put_block_group doesn't complain */
10508                 spin_lock(&space_info->lock);
10509                 spin_lock(&block_group->lock);
10510
10511                 space_info->bytes_pinned -= block_group->pinned;
10512                 space_info->bytes_readonly += block_group->pinned;
10513                 percpu_counter_add(&space_info->total_bytes_pinned,
10514                                    -block_group->pinned);
10515                 block_group->pinned = 0;
10516
10517                 spin_unlock(&block_group->lock);
10518                 spin_unlock(&space_info->lock);
10519
10520                 /* DISCARD can flip during remount */
10521                 trimming = btrfs_test_opt(root, DISCARD);
10522
10523                 /* Implicit trim during transaction commit. */
10524                 if (trimming)
10525                         btrfs_get_block_group_trimming(block_group);
10526
10527                 /*
10528                  * btrfs_remove_chunk() will abort the transaction if things go
10529                  * horribly wrong.
10530                  */
10531                 ret = btrfs_remove_chunk(trans, root,
10532                                          block_group->key.objectid);
10533
10534                 if (ret) {
10535                         if (trimming)
10536                                 btrfs_put_block_group_trimming(block_group);
10537                         goto end_trans;
10538                 }
10539
10540                 /*
10541                  * If we're not mounted with -odiscard, we can just forget
10542                  * about this block group. Otherwise we'll need to wait
10543                  * until transaction commit to do the actual discard.
10544                  */
10545                 if (trimming) {
10546                         spin_lock(&fs_info->unused_bgs_lock);
10547                         /*
10548                          * A concurrent scrub might have added us to the list
10549                          * fs_info->unused_bgs, so use a list_move operation
10550                          * to add the block group to the deleted_bgs list.
10551                          */
10552                         list_move(&block_group->bg_list,
10553                                   &trans->transaction->deleted_bgs);
10554                         spin_unlock(&fs_info->unused_bgs_lock);
10555                         btrfs_get_block_group(block_group);
10556                 }
10557 end_trans:
10558                 btrfs_end_transaction(trans, root);
10559 next:
10560                 mutex_unlock(&fs_info->delete_unused_bgs_mutex);
10561                 btrfs_put_block_group(block_group);
10562                 spin_lock(&fs_info->unused_bgs_lock);
10563         }
10564         spin_unlock(&fs_info->unused_bgs_lock);
10565 }
10566
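/*
 * Create the initial space info entries: always SYSTEM, plus a single mixed
 * METADATA|DATA entry when the MIXED_GROUPS incompat flag is set, or separate
 * METADATA and DATA entries otherwise.
 *
 * Returns 1 if the super block has no root tree pointer, 0 on success, or a
 * negative error from update_space_info().
 */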
10567 int btrfs_init_space_info(struct btrfs_fs_info *fs_info)
10568 {
10569         struct btrfs_space_info *space_info;
10570         struct btrfs_super_block *disk_super;
10571         u64 features;
10572         u64 flags;
10573         int mixed = 0;
10574         int ret;
10575
10576         disk_super = fs_info->super_copy;
10577         if (!btrfs_super_root(disk_super))
10578                 return 1;
10579
10580         features = btrfs_super_incompat_flags(disk_super);
10581         if (features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
10582                 mixed = 1;
10583
10584         flags = BTRFS_BLOCK_GROUP_SYSTEM;
10585         ret = update_space_info(fs_info, flags, 0, 0, &space_info);
10586         if (ret)
10587                 goto out;
10588
10589         if (mixed) {
10590                 flags = BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA;
10591                 ret = update_space_info(fs_info, flags, 0, 0, &space_info);
10592         } else {
10593                 flags = BTRFS_BLOCK_GROUP_METADATA;
10594                 ret = update_space_info(fs_info, flags, 0, 0, &space_info);
10595                 if (ret)
10596                         goto out;
10597
10598                 flags = BTRFS_BLOCK_GROUP_DATA;
10599                 ret = update_space_info(fs_info, flags, 0, 0, &space_info);
10600         }
10601 out:
10602         return ret;
10603 }
10604
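/*
 * Error-path helper: a thin wrapper around unpin_extent_range() that passes
 * false for the last argument, so the unpinned space is not returned to the
 * free space cache.
 */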
10605 int btrfs_error_unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
10606 {
10607         return unpin_extent_range(root, start, end, false);
10608 }
10609
10610 /*
10611  * It used to be that old block groups would be left around forever.
10612  * Iterating over them would be enough to trim unused space.  Since we
10613  * now automatically remove them, we also need to iterate over unallocated
10614  * space.
10615  *
10616  * We don't want a transaction for this since the discard may take a
10617  * substantial amount of time.  We don't require that a transaction be
10618  * running, but we do need to take a running transaction into account
10619  * to ensure that we're not discarding chunks that were released in
10620  * the current transaction.
10621  *
10622  * Holding the chunks lock will prevent other threads from allocating
10623  * or releasing chunks, but it won't prevent a running transaction
10624  * from committing and releasing the memory that the pending chunks
10625  * list head uses.  For that, we need to take a reference to the
10626  * transaction.
10627  */
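/*
 * Lock order inside the loop below: chunk_mutex, then commit_root_sem (read),
 * then trans_lock held just long enough to take a reference on the running
 * transaction; all of them are released again before the discard loop
 * continues.
 */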
10628 static int btrfs_trim_free_extents(struct btrfs_device *device,
10629                                    u64 minlen, u64 *trimmed)
10630 {
10631         u64 start = 0, len = 0;
10632         int ret;
10633
10634         *trimmed = 0;
10635
10636         /* Discard not supported = nothing to do. */
10637         if (!blk_queue_discard(bdev_get_queue(device->bdev)))
10638                 return 0;
10639
10640         /* Not writeable = nothing to do. */
10641         if (!device->writeable)
10642                 return 0;
10643
10644         /* No free space = nothing to do. */
10645         if (device->total_bytes <= device->bytes_used)
10646                 return 0;
10647
10648         ret = 0;
10649
10650         while (1) {
10651                 struct btrfs_fs_info *fs_info = device->dev_root->fs_info;
10652                 struct btrfs_transaction *trans;
10653                 u64 bytes;
10654
10655                 ret = mutex_lock_interruptible(&fs_info->chunk_mutex);
10656                 if (ret)
10657                         return ret;
10658
10659                 down_read(&fs_info->commit_root_sem);
10660
10661                 spin_lock(&fs_info->trans_lock);
10662                 trans = fs_info->running_transaction;
10663                 if (trans)
10664                         atomic_inc(&trans->use_count);
10665                 spin_unlock(&fs_info->trans_lock);
10666
10667                 ret = find_free_dev_extent_start(trans, device, minlen, start,
10668                                                  &start, &len);
10669                 if (trans)
10670                         btrfs_put_transaction(trans);
10671
10672                 if (ret) {
10673                         up_read(&fs_info->commit_root_sem);
10674                         mutex_unlock(&fs_info->chunk_mutex);
10675                         if (ret == -ENOSPC)
10676                                 ret = 0;
10677                         break;
10678                 }
10679
10680                 ret = btrfs_issue_discard(device->bdev, start, len, &bytes);
10681                 up_read(&fs_info->commit_root_sem);
10682                 mutex_unlock(&fs_info->chunk_mutex);
10683
10684                 if (ret)
10685                         break;
10686
10687                 start += len;
10688                 *trimmed += bytes;
10689
10690                 if (fatal_signal_pending(current)) {
10691                         ret = -ERESTARTSYS;
10692                         break;
10693                 }
10694
10695                 cond_resched();
10696         }
10697
10698         return ret;
10699 }
10700
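/*
 * Trim free space across the filesystem; this backs the FITRIM ioctl.  Walk
 * every block group overlapping [range->start, range->start + range->len),
 * loading its free space cache if needed and trimming the free space inside
 * it, then trim the unallocated space on every device.  The total number of
 * bytes trimmed is returned in range->len.
 */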
10701 int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range)
10702 {
10703         struct btrfs_fs_info *fs_info = root->fs_info;
10704         struct btrfs_block_group_cache *cache = NULL;
10705         struct btrfs_device *device;
10706         struct list_head *devices;
10707         u64 group_trimmed;
10708         u64 start;
10709         u64 end;
10710         u64 trimmed = 0;
10711         int ret = 0;
10712
10713         cache = btrfs_lookup_first_block_group(fs_info, range->start);
10714         while (cache) {
10715                 if (cache->key.objectid >= (range->start + range->len)) {
10716                         btrfs_put_block_group(cache);
10717                         break;
10718                 }
10719
10720                 start = max(range->start, cache->key.objectid);
10721                 end = min(range->start + range->len,
10722                                 cache->key.objectid + cache->key.offset);
10723
10724                 if (end - start >= range->minlen) {
10725                         if (!block_group_cache_done(cache)) {
10726                                 ret = cache_block_group(cache, 0);
10727                                 if (ret) {
10728                                         btrfs_put_block_group(cache);
10729                                         break;
10730                                 }
10731                                 ret = wait_block_group_cache_done(cache);
10732                                 if (ret) {
10733                                         btrfs_put_block_group(cache);
10734                                         break;
10735                                 }
10736                         }
10737                         ret = btrfs_trim_block_group(cache,
10738                                                      &group_trimmed,
10739                                                      start,
10740                                                      end,
10741                                                      range->minlen);
10742
10743                         trimmed += group_trimmed;
10744                         if (ret) {
10745                                 btrfs_put_block_group(cache);
10746                                 break;
10747                         }
10748                 }
10749
10750                 cache = next_block_group(fs_info->tree_root, cache);
10751         }
10752
10753         mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
10754         devices = &root->fs_info->fs_devices->devices;
10755         list_for_each_entry(device, devices, dev_list) {
10756                 ret = btrfs_trim_free_extents(device, range->minlen,
10757                                               &group_trimmed);
10758                 if (ret)
10759                         break;
10760
10761                 trimmed += group_trimmed;
10762         }
10763         mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
10764
10765         range->len = trimmed;
10766         return ret;
10767 }
10768
10769 /*
10770  * btrfs_{start,end}_write_no_snapshoting() are similar to
10771  * mnt_{want,drop}_write(): they prevent some tasks from writing data into
10772  * the page cache through nocow before the subvolume is snapshotted and then
10773  * flushing it to disk only after the snapshot is created, and they prevent
10774  * operations that would make the snapshot inconsistent while snapshotting
10775  * is in progress (writes followed by expanding truncates, for example).
10776  */
10777 void btrfs_end_write_no_snapshoting(struct btrfs_root *root)
10778 {
10779         percpu_counter_dec(&root->subv_writers->counter);
10780         /*
10781          * Make sure counter is updated before we wake up waiters.
10782          */
10783         smp_mb();
10784         if (waitqueue_active(&root->subv_writers->wait))
10785                 wake_up(&root->subv_writers->wait);
10786 }
10787
10788 int btrfs_start_write_no_snapshoting(struct btrfs_root *root)
10789 {
10790         if (atomic_read(&root->will_be_snapshoted))
10791                 return 0;
10792
10793         percpu_counter_inc(&root->subv_writers->counter);
10794         /*
10795          * Make sure counter is updated before we check for snapshot creation.
10796          */
10797         smp_mb();
10798         if (atomic_read(&root->will_be_snapshoted)) {
10799                 btrfs_end_write_no_snapshoting(root);
10800                 return 0;
10801         }
10802         return 1;
10803 }
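/*
 * Illustrative caller pattern only; do_nocow_write() is a hypothetical helper
 * and not part of this file.  A nocow writer brackets its work so that it
 * cannot race with snapshot creation (see the comment above
 * btrfs_end_write_no_snapshoting()):
 *
 *	if (!btrfs_start_write_no_snapshoting(root))
 *		return 0;	// a snapshot is pending, fall back to the COW path
 *	ret = do_nocow_write(inode, pos, len);
 *	btrfs_end_write_no_snapshoting(root);
 */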