fs/btrfs/inode.c

   1 /*
   2  * Copyright (C) 2007 Oracle.  All rights reserved.
   3  *
   4  * This program is free software; you can redistribute it and/or
   5  * modify it under the terms of the GNU General Public
   6  * License v2 as published by the Free Software Foundation.
   7  *
   8  * This program is distributed in the hope that it will be useful,
   9  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  11  * General Public License for more details.
  12  *
  13  * You should have received a copy of the GNU General Public
  14  * License along with this program; if not, write to the
  15  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  16  * Boston, MA 02111-1307, USA.
  17  */
  18
  19 #include <linux/kernel.h>
  20 #include <linux/bio.h>
  21 #include <linux/buffer_head.h>
  22 #include <linux/file.h>
  23 #include <linux/fs.h>
  24 #include <linux/pagemap.h>
  25 #include <linux/highmem.h>
  26 #include <linux/time.h>
  27 #include <linux/init.h>
  28 #include <linux/string.h>
  29 #include <linux/backing-dev.h>
  30 #include <linux/mpage.h>
  31 #include <linux/swap.h>
  32 #include <linux/writeback.h>
  33 #include <linux/statfs.h>
  34 #include <linux/compat.h>
  35 #include <linux/aio.h>
  36 #include <linux/bit_spinlock.h>
  37 #include <linux/xattr.h>
  38 #include <linux/posix_acl.h>
  39 #include <linux/falloc.h>
  40 #include <linux/slab.h>
  41 #include <linux/ratelimit.h>
  42 #include <linux/mount.h>
  43 #include <linux/btrfs.h>
  44 #include <linux/blkdev.h>
  45 #include <linux/posix_acl_xattr.h>
  46 #include <linux/uio.h>
  47 #include "ctree.h"
  48 #include "disk-io.h"
  49 #include "transaction.h"
  50 #include "btrfs_inode.h"
  51 #include "print-tree.h"
  52 #include "ordered-data.h"
  53 #include "xattr.h"
  54 #include "tree-log.h"
  55 #include "volumes.h"
  56 #include "compression.h"
  57 #include "locking.h"
  58 #include "free-space-cache.h"
  59 #include "inode-map.h"
  60 #include "backref.h"
  61 #include "hash.h"
  62 #include "props.h"
  63 #include "qgroup.h"
  64
  65 struct btrfs_iget_args {
  66         struct btrfs_key *location;
  67         struct btrfs_root *root;
  68 };
  69
  70 static const struct inode_operations btrfs_dir_inode_operations;
  71 static const struct inode_operations btrfs_symlink_inode_operations;
  72 static const struct inode_operations btrfs_dir_ro_inode_operations;
  73 static const struct inode_operations btrfs_special_inode_operations;
  74 static const struct inode_operations btrfs_file_inode_operations;
  75 static const struct address_space_operations btrfs_aops;
  76 static const struct address_space_operations btrfs_symlink_aops;
  77 static const struct file_operations btrfs_dir_file_operations;
  78 static struct extent_io_ops btrfs_extent_io_ops;
  79
  80 static struct kmem_cache *btrfs_inode_cachep;
  81 static struct kmem_cache *btrfs_delalloc_work_cachep;
  82 struct kmem_cache *btrfs_trans_handle_cachep;
  83 struct kmem_cache *btrfs_transaction_cachep;
  84 struct kmem_cache *btrfs_path_cachep;
  85 struct kmem_cache *btrfs_free_space_cachep;
  86
  87 #define S_SHIFT 12
  88 static unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = {
  89         [S_IFREG >> S_SHIFT]    = BTRFS_FT_REG_FILE,
  90         [S_IFDIR >> S_SHIFT]    = BTRFS_FT_DIR,
  91         [S_IFCHR >> S_SHIFT]    = BTRFS_FT_CHRDEV,
  92         [S_IFBLK >> S_SHIFT]    = BTRFS_FT_BLKDEV,
  93         [S_IFIFO >> S_SHIFT]    = BTRFS_FT_FIFO,
  94         [S_IFSOCK >> S_SHIFT]   = BTRFS_FT_SOCK,
  95         [S_IFLNK >> S_SHIFT]    = BTRFS_FT_SYMLINK,
  96 };
  97
  98 static int btrfs_setsize(struct inode *inode, struct iattr *attr);
  99 static int btrfs_truncate(struct inode *inode);
 100 static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent);
 101 static noinline int cow_file_range(struct inode *inode,
 102                                    struct page *locked_page,
 103                                    u64 start, u64 end, int *page_started,
 104                                    unsigned long *nr_written, int unlock);
 105 static struct extent_map *create_pinned_em(struct inode *inode, u64 start,
 106                                            u64 len, u64 orig_start,
 107                                            u64 block_start, u64 block_len,
 108                                            u64 orig_block_len, u64 ram_bytes,
 109                                            int type);
 110
 111 static int btrfs_dirty_inode(struct inode *inode);
 112
 113 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
 114 void btrfs_test_inode_set_ops(struct inode *inode)
 115 {
 116         BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
 117 }
 118 #endif
 119
 120 static int btrfs_init_inode_security(struct btrfs_trans_handle *trans,
 121                                      struct inode *inode,  struct inode *dir,
 122                                      const struct qstr *qstr)
 123 {
 124         int err;
 125
 126         err = btrfs_init_acl(trans, inode, dir);
 127         if (!err)
 128                 err = btrfs_xattr_security_init(trans, inode, dir, qstr);
 129         return err;
 130 }
 131
 132 /*
 133  * this does all the hard work for inserting an inline extent into
 134  * the btree.  The caller should have done a btrfs_drop_extents so that
 135  * no overlapping inline items exist in the btree
 136  */
 137 static int insert_inline_extent(struct btrfs_trans_handle *trans,
 138                                 struct btrfs_path *path, int extent_inserted,
 139                                 struct btrfs_root *root, struct inode *inode,
 140                                 u64 start, size_t size, size_t compressed_size,
 141                                 int compress_type,
 142                                 struct page **compressed_pages)
 143 {
 144         struct extent_buffer *leaf;
 145         struct page *page = NULL;
 146         char *kaddr;
 147         unsigned long ptr;
 148         struct btrfs_file_extent_item *ei;
 149         int err = 0;
 150         int ret;
 151         size_t cur_size = size;
 152         unsigned long offset;
 153
 154         if (compressed_size && compressed_pages)
 155                 cur_size = compressed_size;
 156
 157         inode_add_bytes(inode, size);
 158
 159         if (!extent_inserted) {
 160                 struct btrfs_key key;
 161                 size_t datasize;
 162
 163                 key.objectid = btrfs_ino(inode);
 164                 key.offset = start;
 165                 key.type = BTRFS_EXTENT_DATA_KEY;
 166
 167                 datasize = btrfs_file_extent_calc_inline_size(cur_size);
 168                 path->leave_spinning = 1;
 169                 ret = btrfs_insert_empty_item(trans, root, path, &key,
 170                                               datasize);
 171                 if (ret) {
 172                         err = ret;
 173                         goto fail;
 174                 }
 175         }
 176         leaf = path->nodes[0];
 177         ei = btrfs_item_ptr(leaf, path->slots[0],
 178                             struct btrfs_file_extent_item);
 179         btrfs_set_file_extent_generation(leaf, ei, trans->transid);
 180         btrfs_set_file_extent_type(leaf, ei, BTRFS_FILE_EXTENT_INLINE);
 181         btrfs_set_file_extent_encryption(leaf, ei, 0);
 182         btrfs_set_file_extent_other_encoding(leaf, ei, 0);
 183         btrfs_set_file_extent_ram_bytes(leaf, ei, size);
 184         ptr = btrfs_file_extent_inline_start(ei);
 185
 186         if (compress_type != BTRFS_COMPRESS_NONE) {
 187                 struct page *cpage;
 188                 int i = 0;
 189                 while (compressed_size > 0) {
 190                         cpage = compressed_pages[i];
 191                         cur_size = min_t(unsigned long, compressed_size,
 192                                        PAGE_CACHE_SIZE);
 193
 194                         kaddr = kmap_atomic(cpage);
 195                         write_extent_buffer(leaf, kaddr, ptr, cur_size);
 196                         kunmap_atomic(kaddr);
 197
 198                         i++;
 199                         ptr += cur_size;
 200                         compressed_size -= cur_size;
 201                 }
 202                 btrfs_set_file_extent_compression(leaf, ei,
 203                                                   compress_type);
 204         } else {
 205                 page = find_get_page(inode->i_mapping,
 206                                      start >> PAGE_CACHE_SHIFT);
 207                 btrfs_set_file_extent_compression(leaf, ei, 0);
 208                 kaddr = kmap_atomic(page);
 209                 offset = start & (PAGE_CACHE_SIZE - 1);
 210                 write_extent_buffer(leaf, kaddr + offset, ptr, size);
 211                 kunmap_atomic(kaddr);
 212                 page_cache_release(page);
 213         }
 214         btrfs_mark_buffer_dirty(leaf);
 215         btrfs_release_path(path);
 216
 217         /*
 218          * we're an inline extent, so nobody can
 219          * extend the file past i_size without locking
 220          * a page we already have locked.
 221          *
 222          * We must do any isize and inode updates
 223          * before we unlock the pages.  Otherwise we
 224          * could end up racing with unlink.
 225          */
 226         BTRFS_I(inode)->disk_i_size = inode->i_size;
 227         ret = btrfs_update_inode(trans, root, inode);
 228
 229         return ret;
 230 fail:
 231         return err;
 232 }
 233
 234
 235 /*
 236  * conditionally insert an inline extent into the file.  This
 237  * does the checks required to make sure the data is small enough
 238  * to fit as an inline extent.
 239  */
 240 static noinline int cow_file_range_inline(struct btrfs_root *root,
 241                                           struct inode *inode, u64 start,
 242                                           u64 end, size_t compressed_size,
 243                                           int compress_type,
 244                                           struct page **compressed_pages)
 245 {
 246         struct btrfs_trans_handle *trans;
 247         u64 isize = i_size_read(inode);
 248         u64 actual_end = min(end + 1, isize);
 249         u64 inline_len = actual_end - start;
 250         u64 aligned_end = ALIGN(end, root->sectorsize);
 251         u64 data_len = inline_len;
 252         int ret;
 253         struct btrfs_path *path;
 254         int extent_inserted = 0;
 255         u32 extent_item_size;
 256
 257         if (compressed_size)
 258                 data_len = compressed_size;
 259
 260         if (start > 0 ||
 261             actual_end > PAGE_CACHE_SIZE ||
 262             data_len > BTRFS_MAX_INLINE_DATA_SIZE(root) ||
 263             (!compressed_size &&
 264             (actual_end & (root->sectorsize - 1)) == 0) ||
 265             end + 1 < isize ||
 266             data_len > root->fs_info->max_inline) {
 267                 return 1;
 268         }
 269
 270         path = btrfs_alloc_path();
 271         if (!path)
 272                 return -ENOMEM;
 273
 274         trans = btrfs_join_transaction(root);
 275         if (IS_ERR(trans)) {
 276                 btrfs_free_path(path);
 277                 return PTR_ERR(trans);
 278         }
 279         trans->block_rsv = &root->fs_info->delalloc_block_rsv;
 280
 281         if (compressed_size && compressed_pages)
 282                 extent_item_size = btrfs_file_extent_calc_inline_size(
 283                    compressed_size);
 284         else
 285                 extent_item_size = btrfs_file_extent_calc_inline_size(
 286                     inline_len);
 287
 288         ret = __btrfs_drop_extents(trans, root, inode, path,
 289                                    start, aligned_end, NULL,
 290                                    1, 1, extent_item_size, &extent_inserted);
 291         if (ret) {
 292                 btrfs_abort_transaction(trans, root, ret);
 293                 goto out;
 294         }
 295
 296         if (isize > actual_end)
 297                 inline_len = min_t(u64, isize, actual_end);
 298         ret = insert_inline_extent(trans, path, extent_inserted,
 299                                    root, inode, start,
 300                                    inline_len, compressed_size,
 301                                    compress_type, compressed_pages);
 302         if (ret && ret != -ENOSPC) {
 303                 btrfs_abort_transaction(trans, root, ret);
 304                 goto out;
 305         } else if (ret == -ENOSPC) {
 306                 ret = 1;
 307                 goto out;
 308         }
 309
 310         set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags);
 311         btrfs_delalloc_release_metadata(inode, end + 1 - start);
 312         btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0);
 313 out:
 314         btrfs_free_path(path);
 315         btrfs_end_transaction(trans, root);
 316         return ret;
 317 }
 318
 319 struct async_extent {
 320         u64 start;
 321         u64 ram_size;
 322         u64 compressed_size;
 323         struct page **pages;
 324         unsigned long nr_pages;
 325         int compress_type;
 326         struct list_head list;
 327 };
 328
 329 struct async_cow {
 330         struct inode *inode;
 331         struct btrfs_root *root;
 332         struct page *locked_page;
 333         u64 start;
 334         u64 end;
 335         struct list_head extents;
 336         struct btrfs_work work;
 337 };
 338
 339 static noinline int add_async_extent(struct async_cow *cow,
 340                                      u64 start, u64 ram_size,
 341                                      u64 compressed_size,
 342                                      struct page **pages,
 343                                      unsigned long nr_pages,
 344                                      int compress_type)
 345 {
 346         struct async_extent *async_extent;
 347
 348         async_extent = kmalloc(sizeof(*async_extent), GFP_NOFS);
 349         BUG_ON(!async_extent); /* -ENOMEM */
 350         async_extent->start = start;
 351         async_extent->ram_size = ram_size;
 352         async_extent->compressed_size = compressed_size;
 353         async_extent->pages = pages;
 354         async_extent->nr_pages = nr_pages;
 355         async_extent->compress_type = compress_type;
 356         list_add_tail(&async_extent->list, &cow->extents);
 357         return 0;
 358 }
 359
 360 static inline int inode_need_compress(struct inode *inode)
 361 {
 362         struct btrfs_root *root = BTRFS_I(inode)->root;
 363
 364         /* force compress */
 365         if (btrfs_test_opt(root, FORCE_COMPRESS))
 366                 return 1;
 367         /* bad compression ratios */
 368         if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS)
 369                 return 0;
 370         if (btrfs_test_opt(root, COMPRESS) ||
 371             BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS ||
 372             BTRFS_I(inode)->force_compress)
 373                 return 1;
 374         return 0;
 375 }
 376
/*
 * we create compressed extents in two phases.  The first
 * phase compresses a range of pages that have already been
 * locked (both pages and state bits are locked).
 *
 * This is done inside an ordered work queue, and the compression
 * is spread across many cpus.  The actual IO submission is step
 * two, and the ordered work queue takes care of making sure that
 * happens in the same order things were put onto the queue by
 * writepages and friends.
 *
 * If this code finds it can't get good compression, it puts an
 * entry onto the work queue to write the uncompressed bytes.  This
 * makes sure that both compressed inodes and uncompressed inodes
 * are written in the same order that the flusher thread sent them
 * down.
 *
 * @inode:        inode whose data is being written
 * @locked_page:  page that is redirtied here if it lies inside a range
 *                that falls back to uncompressed IO
 * @start, @end:  byte range to process, @end inclusive
 * @async_cow:    receives one async_extent per queued piece of work
 * @num_added:    incremented once for every async_extent queued
 *
 * Nothing is queued when the whole range ends up as an inline extent
 * or when creating that inline extent returned an error.
 */
static noinline void compress_file_range(struct inode *inode,
                                        struct page *locked_page,
                                        u64 start, u64 end,
                                        struct async_cow *async_cow,
                                        int *num_added)
{
        struct btrfs_root *root = BTRFS_I(inode)->root;
        u64 num_bytes;
        u64 blocksize = root->sectorsize;
        u64 actual_end;
        u64 isize = i_size_read(inode);
        int ret = 0;
        struct page **pages = NULL;
        unsigned long nr_pages;
        unsigned long nr_pages_ret = 0;
        unsigned long total_compressed = 0;
        unsigned long total_in = 0;
        unsigned long max_compressed = 128 * 1024;
        unsigned long max_uncompressed = 128 * 1024;
        int i;
        int will_compress;
        int compress_type = root->fs_info->compress_type;
        int redirty = 0;

        /* if this is a small write inside eof, kick off a defrag */
        if ((end - start + 1) < 16 * 1024 &&
            (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size))
                btrfs_add_inode_defrag(NULL, inode);

        actual_end = min_t(u64, isize, end + 1);
again:
        /* each pass handles at most 128K worth of pages, beginning at start */
        will_compress = 0;
        nr_pages = (end >> PAGE_CACHE_SHIFT) - (start >> PAGE_CACHE_SHIFT) + 1;
        nr_pages = min(nr_pages, (128 * 1024UL) / PAGE_CACHE_SIZE);

        /*
         * we don't want to send crud past the end of i_size through
         * compression, that's just a waste of CPU time.  So, if the
         * end of the file is before the start of our current
         * requested range of bytes, we bail out to the uncompressed
         * cleanup code that can deal with all of this.
         *
         * It isn't really the fastest way to fix things, but this is a
         * very uncommon corner.
         */
        if (actual_end <= start)
                goto cleanup_and_bail_uncompressed;

        total_compressed = actual_end - start;

        /*
         * skip compression for a small file range(<=blocksize) that
         * isn't an inline extent, since it doesn't save disk space at all.
         */
        if (total_compressed <= blocksize &&
           (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size))
                goto cleanup_and_bail_uncompressed;

        /* we want to make sure that amount of ram required to uncompress
         * an extent is reasonable, so we limit the total size in ram
         * of a compressed extent to 128k.  This is a crucial number
         * because it also controls how easily we can spread reads across
         * cpus for decompression.
         *
         * We also want to make sure the amount of IO required to do
         * a random read is reasonably small, so we limit the size of
         * a compressed extent to 128k.
         */
        total_compressed = min(total_compressed, max_uncompressed);
        num_bytes = ALIGN(end - start + 1, blocksize);
        num_bytes = max(blocksize,  num_bytes);
        total_in = 0;
        ret = 0;

        /*
         * we do compression for mount -o compress and when the
         * inode has not been flagged as nocompress.  This flag can
         * change at any time if we discover bad compression ratios.
         */
        if (inode_need_compress(inode)) {
                WARN_ON(pages);
                pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
                if (!pages) {
                        /* just bail out to the uncompressed code */
                        goto cont;
                }

                if (BTRFS_I(inode)->force_compress)
                        compress_type = BTRFS_I(inode)->force_compress;

                /*
                 * we need to call clear_page_dirty_for_io on each
                 * page in the range.  Otherwise applications with the file
                 * mmap'd can wander in and change the page contents while
                 * we are compressing them.
                 *
                 * If the compression fails for any reason, we set the pages
                 * dirty again later on.
                 */
                extent_range_clear_dirty_for_io(inode, start, end);
                redirty = 1;
                ret = btrfs_compress_pages(compress_type,
                                           inode->i_mapping, start,
                                           total_compressed, pages,
                                           nr_pages, &nr_pages_ret,
                                           &total_in,
                                           &total_compressed,
                                           max_compressed);

                if (!ret) {
                        unsigned long offset = total_compressed &
                                (PAGE_CACHE_SIZE - 1);
                        struct page *page = pages[nr_pages_ret - 1];
                        char *kaddr;

                        /* zero the tail end of the last page, we might be
                         * sending it down to disk
                         */
                        if (offset) {
                                kaddr = kmap_atomic(page);
                                memset(kaddr + offset, 0,
                                       PAGE_CACHE_SIZE - offset);
                                kunmap_atomic(kaddr);
                        }
                        will_compress = 1;
                }
        }
cont:
        if (start == 0) {
                /* lets try to make an inline extent */
                if (ret || total_in < (actual_end - start)) {
                        /* we didn't compress the entire range, try
                         * to make an uncompressed inline extent.
                         */
                        ret = cow_file_range_inline(root, inode, start, end,
                                                    0, 0, NULL);
                } else {
                        /* try making a compressed inline extent */
                        ret = cow_file_range_inline(root, inode, start, end,
                                                    total_compressed,
                                                    compress_type, pages);
                }
                /* ret > 0 means "not inlined": carry on with a regular extent */
                if (ret <= 0) {
                        unsigned long clear_flags = EXTENT_DELALLOC |
                                EXTENT_DEFRAG;
                        unsigned long page_error_op;

                        clear_flags |= (ret < 0) ? EXTENT_DO_ACCOUNTING : 0;
                        page_error_op = ret < 0 ? PAGE_SET_ERROR : 0;

                        /*
                         * inline extent creation worked or returned error,
                         * we don't need to create any more async work items.
                         * Unlock and free up our temp pages.
                         */
                        extent_clear_unlock_delalloc(inode, start, end, NULL,
                                                     clear_flags, PAGE_UNLOCK |
                                                     PAGE_CLEAR_DIRTY |
                                                     PAGE_SET_WRITEBACK |
                                                     page_error_op |
                                                     PAGE_END_WRITEBACK);
                        goto free_pages_out;
                }
        }

        if (will_compress) {
                /*
                 * we aren't doing an inline extent, so round the compressed
                 * size up to a block size boundary so the allocator does
                 * sane things
                 */
                total_compressed = ALIGN(total_compressed, blocksize);

                /*
                 * one last check to make sure the compression is really a
                 * win, compare the page count read with the blocks on disk
                 */
                total_in = ALIGN(total_in, PAGE_CACHE_SIZE);
                if (total_compressed >= total_in) {
                        will_compress = 0;
                } else {
                        num_bytes = total_in;
                }
        }
        if (!will_compress && pages) {
                /*
                 * the compression code ran but failed to make things smaller,
                 * free any pages it allocated and our page pointer array
                 */
                for (i = 0; i < nr_pages_ret; i++) {
                        WARN_ON(pages[i]->mapping);
                        page_cache_release(pages[i]);
                }
                kfree(pages);
                pages = NULL;
                total_compressed = 0;
                nr_pages_ret = 0;

                /* flag the file so we don't compress in the future */
                if (!btrfs_test_opt(root, FORCE_COMPRESS) &&
                    !(BTRFS_I(inode)->force_compress)) {
                        BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS;
                }
        }
        if (will_compress) {
                *num_added += 1;

                /* the async work queues will take care of doing actual
                 * allocation on disk for these compressed pages,
                 * and will submit them to the elevator.
                 */
                add_async_extent(async_cow, start, num_bytes,
                                 total_compressed, pages, nr_pages_ret,
                                 compress_type);

                /*
                 * more of the range is left: move past the piece just
                 * queued and go around again.  The pages array is now
                 * referenced by the queued async_extent, so drop our
                 * pointer to it.
                 */
                if (start + num_bytes < end) {
                        start += num_bytes;
                        pages = NULL;
                        cond_resched();
                        goto again;
                }
        } else {
cleanup_and_bail_uncompressed:
                /*
                 * No compression, but we still need to write the pages in
                 * the file we've been given so far.  redirty the locked
                 * page if it corresponds to our extent and set things up
                 * for the async work queue to run cow_file_range to do
                 * the normal delalloc dance
                 */
                if (page_offset(locked_page) >= start &&
                    page_offset(locked_page) <= end) {
                        __set_page_dirty_nobuffers(locked_page);
                        /* unlocked later on in the async handlers */
                }
                /* undo extent_range_clear_dirty_for_io() from above */
                if (redirty)
                        extent_range_redirty_for_io(inode, start, end);
                add_async_extent(async_cow, start, end - start + 1,
                                 0, NULL, 0, BTRFS_COMPRESS_NONE);
                *num_added += 1;
        }

        return;

free_pages_out:
        /* inline extent path: nothing was queued, release the temp pages */
        for (i = 0; i < nr_pages_ret; i++) {
                WARN_ON(pages[i]->mapping);
                page_cache_release(pages[i]);
        }
        kfree(pages);
}
 645
 646 static void free_async_extent_pages(struct async_extent *async_extent)
 647 {
 648         int i;
 649
 650         if (!async_extent->pages)
 651                 return;
 652
 653         for (i = 0; i < async_extent->nr_pages; i++) {
 654                 WARN_ON(async_extent->pages[i]->mapping);
 655                 page_cache_release(async_extent->pages[i]);
 656         }
 657         kfree(async_extent->pages);
 658         async_extent->nr_pages = 0;
 659         async_extent->pages = NULL;
 660 }
 661
 662 /*
 663  * phase two of compressed writeback.  This is the ordered portion
 664  * of the code, which only gets called in the order the work was
 665  * queued.  We walk all the async extents created by compress_file_range
 666  * and send them down to the disk.
 667  */
 668 static noinline void submit_compressed_extents(struct inode *inode,
 669                                               struct async_cow *async_cow)
 670 {
 671         struct async_extent *async_extent;
 672         u64 alloc_hint = 0;
 673         struct btrfs_key ins;
 674         struct extent_map *em;
 675         struct btrfs_root *root = BTRFS_I(inode)->root;
 676         struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
 677         struct extent_io_tree *io_tree;
 678         int ret = 0;
 679
 680 again:
 681         while (!list_empty(&async_cow->extents)) {
 682                 async_extent = list_entry(async_cow->extents.next,
 683                                           struct async_extent, list);
 684                 list_del(&async_extent->list);
 685
 686                 io_tree = &BTRFS_I(inode)->io_tree;
 687
 688 retry:
 689                 /* did the compression code fall back to uncompressed IO? */
 690                 if (!async_extent->pages) {
 691                         int page_started = 0;
 692                         unsigned long nr_written = 0;
 693
 694                         lock_extent(io_tree, async_extent->start,
 695                                          async_extent->start +
 696                                          async_extent->ram_size - 1);
 697
 698                         /* allocate blocks */
 699                         ret = cow_file_range(inode, async_cow->locked_page,
 700                                              async_extent->start,
 701                                              async_extent->start +
 702                                              async_extent->ram_size - 1,
 703                                              &page_started, &nr_written, 0);
 704
 705                         /* JDM XXX */
 706
 707                         /*
 708                          * if page_started, cow_file_range inserted an
 709                          * inline extent and took care of all the unlocking
 710                          * and IO for us.  Otherwise, we need to submit
 711                          * all those pages down to the drive.
 712                          */
 713                         if (!page_started && !ret)
 714                                 extent_write_locked_range(io_tree,
 715                                                   inode, async_extent->start,
 716                                                   async_extent->start +
 717                                                   async_extent->ram_size - 1,
 718                                                   btrfs_get_extent,
 719                                                   WB_SYNC_ALL);
 720                         else if (ret)
 721                                 unlock_page(async_cow->locked_page);
 722                         kfree(async_extent);
 723                         cond_resched();
 724                         continue;
 725                 }
 726
 727                 lock_extent(io_tree, async_extent->start,
 728                             async_extent->start + async_extent->ram_size - 1);
 729
 730                 ret = btrfs_reserve_extent(root,
 731                                            async_extent->compressed_size,
 732                                            async_extent->compressed_size,
 733                                            0, alloc_hint, &ins, 1, 1);
 734                 if (ret) {
 735                         free_async_extent_pages(async_extent);
 736
 737                         if (ret == -ENOSPC) {
 738                                 unlock_extent(io_tree, async_extent->start,
 739                                               async_extent->start +
 740                                               async_extent->ram_size - 1);
 741
 742                                 /*
 743                                  * we need to redirty the pages if we decide to
 744                                  * fallback to uncompressed IO, otherwise we
 745                                  * will not submit these pages down to lower
 746                                  * layers.
 747                                  */
 748                                 extent_range_redirty_for_io(inode,
 749                                                 async_extent->start,
 750                                                 async_extent->start +
 751                                                 async_extent->ram_size - 1);
 752
 753                                 goto retry;
 754                         }
 755                         goto out_free;
 756                 }
 757                 /*
 758                  * here we're doing allocation and writeback of the
 759                  * compressed pages
 760                  */
 761                 btrfs_drop_extent_cache(inode, async_extent->start,
 762                                         async_extent->start +
 763                                         async_extent->ram_size - 1, 0);
 764
 765                 em = alloc_extent_map();
 766                 if (!em) {
 767                         ret = -ENOMEM;
 768                         goto out_free_reserve;
 769                 }
 770                 em->start = async_extent->start;
 771                 em->len = async_extent->ram_size;
 772                 em->orig_start = em->start;
 773                 em->mod_start = em->start;
 774                 em->mod_len = em->len;
 775
 776                 em->block_start = ins.objectid;
 777                 em->block_len = ins.offset;
 778                 em->orig_block_len = ins.offset;
 779                 em->ram_bytes = async_extent->ram_size;
 780                 em->bdev = root->fs_info->fs_devices->latest_bdev;
 781                 em->compress_type = async_extent->compress_type;
 782                 set_bit(EXTENT_FLAG_PINNED, &em->flags);
 783                 set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
 784                 em->generation = -1;
 785
 786                 while (1) {
 787                         write_lock(&em_tree->lock);
 788                         ret = add_extent_mapping(em_tree, em, 1);
 789                         write_unlock(&em_tree->lock);
 790                         if (ret != -EEXIST) {
 791                                 free_extent_map(em);
 792                                 break;
 793                         }
 794                         btrfs_drop_extent_cache(inode, async_extent->start,
 795                                                 async_extent->start +
 796                                                 async_extent->ram_size - 1, 0);
 797                 }
 798
 799                 if (ret)
 800                         goto out_free_reserve;
 801
 802                 ret = btrfs_add_ordered_extent_compress(inode,
 803                                                 async_extent->start,
 804                                                 ins.objectid,
 805                                                 async_extent->ram_size,
 806                                                 ins.offset,
 807                                                 BTRFS_ORDERED_COMPRESSED,
 808                                                 async_extent->compress_type);
 809                 if (ret) {
 810                         btrfs_drop_extent_cache(inode, async_extent->start,
 811                                                 async_extent->start +
 812                                                 async_extent->ram_size - 1, 0);
 813                         goto out_free_reserve;
 814                 }
 815
 816                 /*
 817                  * clear dirty, set writeback and unlock the pages.
 818                  */
 819                 extent_clear_unlock_delalloc(inode, async_extent->start,
 820                                 async_extent->start +
 821                                 async_extent->ram_size - 1,
 822                                 NULL, EXTENT_LOCKED | EXTENT_DELALLOC,
 823                                 PAGE_UNLOCK | PAGE_CLEAR_DIRTY |
 824                                 PAGE_SET_WRITEBACK);
 825                 ret = btrfs_submit_compressed_write(inode,
 826                                     async_extent->start,
 827                                     async_extent->ram_size,
 828                                     ins.objectid,
 829                                     ins.offset, async_extent->pages,
 830                                     async_extent->nr_pages);
 831                 if (ret) {
 832                         struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
 833                         struct page *p = async_extent->pages[0];
 834                         const u64 start = async_extent->start;
 835                         const u64 end = start + async_extent->ram_size - 1;
 836
 837                         p->mapping = inode->i_mapping;
 838                         tree->ops->writepage_end_io_hook(p, start, end,
 839                                                          NULL, 0);
 840                         p->mapping = NULL;
 841                         extent_clear_unlock_delalloc(inode, start, end, NULL, 0,
 842                                                      PAGE_END_WRITEBACK |
 843                                                      PAGE_SET_ERROR);
 844                         free_async_extent_pages(async_extent);
 845                 }
 846                 alloc_hint = ins.objectid + ins.offset;
 847                 kfree(async_extent);
 848                 cond_resched();
 849         }
 850         return;
 851 out_free_reserve:
 852         btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
 853 out_free:
 854         extent_clear_unlock_delalloc(inode, async_extent->start,
 855                                      async_extent->start +
 856                                      async_extent->ram_size - 1,
 857                                      NULL, EXTENT_LOCKED | EXTENT_DELALLOC |
 858                                      EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING,
 859                                      PAGE_UNLOCK | PAGE_CLEAR_DIRTY |
 860                                      PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK |
 861                                      PAGE_SET_ERROR);
 862         free_async_extent_pages(async_extent);
 863         kfree(async_extent);
 864         goto again;
 865 }
 866
 867 static u64 get_extent_allocation_hint(struct inode *inode, u64 start,
 868                                       u64 num_bytes)
 869 {
 870         struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
 871         struct extent_map *em;
 872         u64 alloc_hint = 0;
 873
 874         read_lock(&em_tree->lock);
 875         em = search_extent_mapping(em_tree, start, num_bytes);
 876         if (em) {
 877                 /*
 878                  * if block start isn't an actual block number then find the
 879                  * first block in this inode and use that as a hint.  If that
 880                  * block is also bogus then just don't worry about it.
 881                  */
 882                 if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
 883                         free_extent_map(em);
 884                         em = search_extent_mapping(em_tree, 0, 0);
 885                         if (em && em->block_start < EXTENT_MAP_LAST_BYTE)
 886                                 alloc_hint = em->block_start;
 887                         if (em)
 888                                 free_extent_map(em);
 889                 } else {
 890                         alloc_hint = em->block_start;
 891                         free_extent_map(em);
 892                 }
 893         }
 894         read_unlock(&em_tree->lock);
 895
 896         return alloc_hint;
 897 }
 898
 899 /*
 900  * when extent_io.c finds a delayed allocation range in the file,
 901  * the call backs end up in this code.  The basic idea is to
 902  * allocate extents on disk for the range, and create ordered data structs
 903  * in ram to track those extents.
 904  *
 905  * locked_page is the page that writepage had locked already.  We use
 906  * it to make sure we don't do extra locks or unlocks.
 907  *
 908  * *page_started is set to one if we unlock locked_page and do everything
 909  * required to start IO on it.  It may be clean and already done with
 910  * IO when we return.
 911  */
 912 static noinline int cow_file_range(struct inode *inode,
 913                                    struct page *locked_page,
 914                                    u64 start, u64 end, int *page_started,
 915                                    unsigned long *nr_written,
 916                                    int unlock)
 917 {
 918         struct btrfs_root *root = BTRFS_I(inode)->root;
 919         u64 alloc_hint = 0;
 920         u64 num_bytes;
 921         unsigned long ram_size;
 922         u64 disk_num_bytes;
 923         u64 cur_alloc_size;
 924         u64 blocksize = root->sectorsize;
 925         struct btrfs_key ins;
 926         struct extent_map *em;
 927         struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
 928         int ret = 0;
 929
 930         if (btrfs_is_free_space_inode(inode)) {
 931                 WARN_ON_ONCE(1);
 932                 ret = -EINVAL;
 933                 goto out_unlock;
 934         }
 935
 936         num_bytes = ALIGN(end - start + 1, blocksize);
 937         num_bytes = max(blocksize,  num_bytes);
 938         disk_num_bytes = num_bytes;
 939
 940         /* if this is a small write inside eof, kick off defrag */
 941         if (num_bytes < 64 * 1024 &&
 942             (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size))
 943                 btrfs_add_inode_defrag(NULL, inode);
 944
 945         if (start == 0) {
 946                 /* lets try to make an inline extent */
 947                 ret = cow_file_range_inline(root, inode, start, end, 0, 0,
 948                                             NULL);
 949                 if (ret == 0) {
 950                         extent_clear_unlock_delalloc(inode, start, end, NULL,
 951                                      EXTENT_LOCKED | EXTENT_DELALLOC |
 952                                      EXTENT_DEFRAG, PAGE_UNLOCK |
 953                                      PAGE_CLEAR_DIRTY | PAGE_SET_WRITEBACK |
 954                                      PAGE_END_WRITEBACK);
 955
 956                         *nr_written = *nr_written +
 957                              (end - start + PAGE_CACHE_SIZE) / PAGE_CACHE_SIZE;
 958                         *page_started = 1;
 959                         goto out;
 960                 } else if (ret < 0) {
 961                         goto out_unlock;
 962                 }
 963         }
 964
 965         BUG_ON(disk_num_bytes >
 966                btrfs_super_total_bytes(root->fs_info->super_copy));
 967
 968         alloc_hint = get_extent_allocation_hint(inode, start, num_bytes);
 969         btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0);
 970
 971         while (disk_num_bytes > 0) {
 972                 unsigned long op;
 973
 974                 cur_alloc_size = disk_num_bytes;
 975                 ret = btrfs_reserve_extent(root, cur_alloc_size,
 976                                            root->sectorsize, 0, alloc_hint,
 977                                            &ins, 1, 1);
 978                 if (ret < 0)
 979                         goto out_unlock;
 980
 981                 em = alloc_extent_map();
 982                 if (!em) {
 983                         ret = -ENOMEM;
 984                         goto out_reserve;
 985                 }
 986                 em->start = start;
 987                 em->orig_start = em->start;
 988                 ram_size = ins.offset;
 989                 em->len = ins.offset;
 990                 em->mod_start = em->start;
 991                 em->mod_len = em->len;
 992
 993                 em->block_start = ins.objectid;
 994                 em->block_len = ins.offset;
 995                 em->orig_block_len = ins.offset;
 996                 em->ram_bytes = ram_size;
 997                 em->bdev = root->fs_info->fs_devices->latest_bdev;
 998                 set_bit(EXTENT_FLAG_PINNED, &em->flags);
 999                 em->generation = -1;
1000
1001                 while (1) {
1002                         write_lock(&em_tree->lock);
1003                         ret = add_extent_mapping(em_tree, em, 1);
1004                         write_unlock(&em_tree->lock);
1005                         if (ret != -EEXIST) {
1006                                 free_extent_map(em);
1007                                 break;
1008                         }
1009                         btrfs_drop_extent_cache(inode, start,
1010                                                 start + ram_size - 1, 0);
1011                 }
1012                 if (ret)
1013                         goto out_reserve;
1014
1015                 cur_alloc_size = ins.offset;
1016                 ret = btrfs_add_ordered_extent(inode, start, ins.objectid,
1017                                                ram_size, cur_alloc_size, 0);
1018                 if (ret)
1019                         goto out_drop_extent_cache;
1020
1021                 if (root->root_key.objectid ==
1022                     BTRFS_DATA_RELOC_TREE_OBJECTID) {
1023                         ret = btrfs_reloc_clone_csums(inode, start,
1024                                                       cur_alloc_size);
1025                         if (ret)
1026                                 goto out_drop_extent_cache;
1027                 }
1028
1029                 if (disk_num_bytes < cur_alloc_size)
1030                         break;
1031
1032                 /* we're not doing compressed IO, don't unlock the first
1033                  * page (which the caller expects to stay locked), don't
1034                  * clear any dirty bits and don't set any writeback bits
1035                  *
1036                  * Do set the Private2 bit so we know this page was properly
1037                  * setup for writepage
1038                  */
1039                 op = unlock ? PAGE_UNLOCK : 0;
1040                 op |= PAGE_SET_PRIVATE2;
1041
1042                 extent_clear_unlock_delalloc(inode, start,
1043                                              start + ram_size - 1, locked_page,
1044                                              EXTENT_LOCKED | EXTENT_DELALLOC,
1045                                              op);
1046                 disk_num_bytes -= cur_alloc_size;
1047                 num_bytes -= cur_alloc_size;
1048                 alloc_hint = ins.objectid + ins.offset;
1049                 start += cur_alloc_size;
1050         }
1051 out:
1052         return ret;
1053
1054 out_drop_extent_cache:
1055         btrfs_drop_extent_cache(inode, start, start + ram_size - 1, 0);
1056 out_reserve:
1057         btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
1058 out_unlock:
1059         extent_clear_unlock_delalloc(inode, start, end, locked_page,
1060                                      EXTENT_LOCKED | EXTENT_DO_ACCOUNTING |
1061                                      EXTENT_DELALLOC | EXTENT_DEFRAG,
1062                                      PAGE_UNLOCK | PAGE_CLEAR_DIRTY |
1063                                      PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK);
1064         goto out;
1065 }
1066
1067 /*
1068  * work queue call back to started compression on a file and pages
1069  */
1070 static noinline void async_cow_start(struct btrfs_work *work)
1071 {
1072         struct async_cow *async_cow;
1073         int num_added = 0;
1074         async_cow = container_of(work, struct async_cow, work);
1075
1076         compress_file_range(async_cow->inode, async_cow->locked_page,
1077                             async_cow->start, async_cow->end, async_cow,
1078                             &num_added);
1079         if (num_added == 0) {
1080                 btrfs_add_delayed_iput(async_cow->inode);
1081                 async_cow->inode = NULL;
1082         }
1083 }
1084
1085 /*
1086  * work queue call back to submit previously compressed pages
1087  */
1088 static noinline void async_cow_submit(struct btrfs_work *work)
1089 {
1090         struct async_cow *async_cow;
1091         struct btrfs_root *root;
1092         unsigned long nr_pages;
1093
1094         async_cow = container_of(work, struct async_cow, work);
1095
1096         root = async_cow->root;
1097         nr_pages = (async_cow->end - async_cow->start + PAGE_CACHE_SIZE) >>
1098                 PAGE_CACHE_SHIFT;
1099
1100         if (atomic_sub_return(nr_pages, &root->fs_info->async_delalloc_pages) <
1101             5 * 1024 * 1024 &&
1102             waitqueue_active(&root->fs_info->async_submit_wait))
1103                 wake_up(&root->fs_info->async_submit_wait);
1104
1105         if (async_cow->inode)
1106                 submit_compressed_extents(async_cow->inode, async_cow);
1107 }
1108
1109 static noinline void async_cow_free(struct btrfs_work *work)
1110 {
1111         struct async_cow *async_cow;
1112         async_cow = container_of(work, struct async_cow, work);
1113         if (async_cow->inode)
1114                 btrfs_add_delayed_iput(async_cow->inode);
1115         kfree(async_cow);
1116 }
1117
1118 static int cow_file_range_async(struct inode *inode, struct page *locked_page,
1119                                 u64 start, u64 end, int *page_started,
1120                                 unsigned long *nr_written)
1121 {
1122         struct async_cow *async_cow;
1123         struct btrfs_root *root = BTRFS_I(inode)->root;
1124         unsigned long nr_pages;
1125         u64 cur_end;
1126         int limit = 10 * 1024 * 1024;
1127
1128         clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED,
1129                          1, 0, NULL, GFP_NOFS);
1130         while (start < end) {
1131                 async_cow = kmalloc(sizeof(*async_cow), GFP_NOFS);
1132                 BUG_ON(!async_cow); /* -ENOMEM */
1133                 async_cow->inode = igrab(inode);
1134                 async_cow->root = root;
1135                 async_cow->locked_page = locked_page;
1136                 async_cow->start = start;
1137
1138                 if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS &&
1139                     !btrfs_test_opt(root, FORCE_COMPRESS))
1140                         cur_end = end;
1141                 else
1142                         cur_end = min(end, start + 512 * 1024 - 1);
1143
1144                 async_cow->end = cur_end;
1145                 INIT_LIST_HEAD(&async_cow->extents);
1146
1147                 btrfs_init_work(&async_cow->work,
1148                                 btrfs_delalloc_helper,
1149                                 async_cow_start, async_cow_submit,
1150                                 async_cow_free);
1151
1152                 nr_pages = (cur_end - start + PAGE_CACHE_SIZE) >>
1153                         PAGE_CACHE_SHIFT;
1154                 atomic_add(nr_pages, &root->fs_info->async_delalloc_pages);
1155
1156                 btrfs_queue_work(root->fs_info->delalloc_workers,
1157                                  &async_cow->work);
1158
1159                 if (atomic_read(&root->fs_info->async_delalloc_pages) > limit) {
1160                         wait_event(root->fs_info->async_submit_wait,
1161                            (atomic_read(&root->fs_info->async_delalloc_pages) <
1162                             limit));
1163                 }
1164
1165                 while (atomic_read(&root->fs_info->async_submit_draining) &&
1166                       atomic_read(&root->fs_info->async_delalloc_pages)) {
1167                         wait_event(root->fs_info->async_submit_wait,
1168                           (atomic_read(&root->fs_info->async_delalloc_pages) ==
1169                            0));
1170                 }
1171
1172                 *nr_written += nr_pages;
1173                 start = cur_end + 1;
1174         }
1175         *page_started = 1;
1176         return 0;
1177 }
1178
1179 static noinline int csum_exist_in_range(struct btrfs_root *root,
1180                                         u64 bytenr, u64 num_bytes)
1181 {
1182         int ret;
1183         struct btrfs_ordered_sum *sums;
1184         LIST_HEAD(list);
1185
1186         ret = btrfs_lookup_csums_range(root->fs_info->csum_root, bytenr,
1187                                        bytenr + num_bytes - 1, &list, 0);
1188         if (ret == 0 && list_empty(&list))
1189                 return 0;
1190
1191         while (!list_empty(&list)) {
1192                 sums = list_entry(list.next, struct btrfs_ordered_sum, list);
1193                 list_del(&sums->list);
1194                 kfree(sums);
1195         }
1196         return 1;
1197 }
1198
/*
 * Callback used for nocow writeback.  This checks for snapshots or COW copies
 * of the extents that exist in the file, and COWs the file as required.
 *
 * If no COW copies or snapshots exist, we write directly to the existing
 * blocks on disk.
 */
1206 static noinline int run_delalloc_nocow(struct inode *inode,
1207                                        struct page *locked_page,
1208                               u64 start, u64 end, int *page_started, int force,
1209                               unsigned long *nr_written)
1210 {
1211         struct btrfs_root *root = BTRFS_I(inode)->root;
1212         struct btrfs_trans_handle *trans;
1213         struct extent_buffer *leaf;
1214         struct btrfs_path *path;
1215         struct btrfs_file_extent_item *fi;
1216         struct btrfs_key found_key;
1217         u64 cow_start;
1218         u64 cur_offset;
1219         u64 extent_end;
1220         u64 extent_offset;
1221         u64 disk_bytenr;
1222         u64 num_bytes;
1223         u64 disk_num_bytes;
1224         u64 ram_bytes;
1225         int extent_type;
1226         int ret, err;
1227         int type;
1228         int nocow;
1229         int check_prev = 1;
1230         bool nolock;
1231         u64 ino = btrfs_ino(inode);
1232
1233         path = btrfs_alloc_path();
1234         if (!path) {
1235                 extent_clear_unlock_delalloc(inode, start, end, locked_page,
1236                                              EXTENT_LOCKED | EXTENT_DELALLOC |
1237                                              EXTENT_DO_ACCOUNTING |
1238                                              EXTENT_DEFRAG, PAGE_UNLOCK |
1239                                              PAGE_CLEAR_DIRTY |
1240                                              PAGE_SET_WRITEBACK |
1241                                              PAGE_END_WRITEBACK);
1242                 return -ENOMEM;
1243         }
1244
1245         nolock = btrfs_is_free_space_inode(inode);
1246
1247         if (nolock)
1248                 trans = btrfs_join_transaction_nolock(root);
1249         else
1250                 trans = btrfs_join_transaction(root);
1251
1252         if (IS_ERR(trans)) {
1253                 extent_clear_unlock_delalloc(inode, start, end, locked_page,
1254                                              EXTENT_LOCKED | EXTENT_DELALLOC |
1255                                              EXTENT_DO_ACCOUNTING |
1256                                              EXTENT_DEFRAG, PAGE_UNLOCK |
1257                                              PAGE_CLEAR_DIRTY |
1258                                              PAGE_SET_WRITEBACK |
1259                                              PAGE_END_WRITEBACK);
1260                 btrfs_free_path(path);
1261                 return PTR_ERR(trans);
1262         }
1263
1264         trans->block_rsv = &root->fs_info->delalloc_block_rsv;
1265
1266         cow_start = (u64)-1;
1267         cur_offset = start;
1268         while (1) {
1269                 ret = btrfs_lookup_file_extent(trans, root, path, ino,
1270                                                cur_offset, 0);
1271                 if (ret < 0)
1272                         goto error;
1273                 if (ret > 0 && path->slots[0] > 0 && check_prev) {
1274                         leaf = path->nodes[0];
1275                         btrfs_item_key_to_cpu(leaf, &found_key,
1276                                               path->slots[0] - 1);
1277                         if (found_key.objectid == ino &&
1278                             found_key.type == BTRFS_EXTENT_DATA_KEY)
1279                                 path->slots[0]--;
1280                 }
1281                 check_prev = 0;
1282 next_slot:
1283                 leaf = path->nodes[0];
1284                 if (path->slots[0] >= btrfs_header_nritems(leaf)) {
1285                         ret = btrfs_next_leaf(root, path);
1286                         if (ret < 0)
1287                                 goto error;
1288                         if (ret > 0)
1289                                 break;
1290                         leaf = path->nodes[0];
1291                 }
1292
1293                 nocow = 0;
1294                 disk_bytenr = 0;
1295                 num_bytes = 0;
1296                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
1297
1298                 if (found_key.objectid > ino ||
1299                     found_key.type > BTRFS_EXTENT_DATA_KEY ||
1300                     found_key.offset > end)
1301                         break;
1302
1303                 if (found_key.offset > cur_offset) {
1304                         extent_end = found_key.offset;
1305                         extent_type = 0;
1306                         goto out_check;
1307                 }
1308
1309                 fi = btrfs_item_ptr(leaf, path->slots[0],
1310                                     struct btrfs_file_extent_item);
1311                 extent_type = btrfs_file_extent_type(leaf, fi);
1312
1313                 ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
1314                 if (extent_type == BTRFS_FILE_EXTENT_REG ||
1315                     extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
1316                         disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
1317                         extent_offset = btrfs_file_extent_offset(leaf, fi);
1318                         extent_end = found_key.offset +
1319                                 btrfs_file_extent_num_bytes(leaf, fi);
1320                         disk_num_bytes =
1321                                 btrfs_file_extent_disk_num_bytes(leaf, fi);
1322                         if (extent_end <= start) {
1323                                 path->slots[0]++;
1324                                 goto next_slot;
1325                         }
1326                         if (disk_bytenr == 0)
1327                                 goto out_check;
1328                         if (btrfs_file_extent_compression(leaf, fi) ||
1329                             btrfs_file_extent_encryption(leaf, fi) ||
1330                             btrfs_file_extent_other_encoding(leaf, fi))
1331                                 goto out_check;
1332                         if (extent_type == BTRFS_FILE_EXTENT_REG && !force)
1333                                 goto out_check;
1334                         if (btrfs_extent_readonly(root, disk_bytenr))
1335                                 goto out_check;
1336                         if (btrfs_cross_ref_exist(trans, root, ino,
1337                                                   found_key.offset -
1338                                                   extent_offset, disk_bytenr))
1339                                 goto out_check;
1340                         disk_bytenr += extent_offset;
1341                         disk_bytenr += cur_offset - found_key.offset;
1342                         num_bytes = min(end + 1, extent_end) - cur_offset;
1343                         /*
1344                          * if there are pending snapshots for this root,
1345                          * we fall into common COW way.
1346                          */
1347                         if (!nolock) {
1348                                 err = btrfs_start_write_no_snapshoting(root);
1349                                 if (!err)
1350                                         goto out_check;
1351                         }
1352                         /*
1353                          * force cow if csum exists in the range.
1354                          * this ensure that csum for a given extent are
1355                          * either valid or do not exist.
1356                          */
1357                         if (csum_exist_in_range(root, disk_bytenr, num_bytes))
1358                                 goto out_check;
1359                         nocow = 1;
1360                 } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
1361                         extent_end = found_key.offset +
1362                                 btrfs_file_extent_inline_len(leaf,
1363                                                      path->slots[0], fi);
1364                         extent_end = ALIGN(extent_end, root->sectorsize);
1365                 } else {
1366                         BUG_ON(1);
1367                 }
1368 out_check:
1369                 if (extent_end <= start) {
1370                         path->slots[0]++;
1371                         if (!nolock && nocow)
1372                                 btrfs_end_write_no_snapshoting(root);
1373                         goto next_slot;
1374                 }
1375                 if (!nocow) {
1376                         if (cow_start == (u64)-1)
1377                                 cow_start = cur_offset;
1378                         cur_offset = extent_end;
1379                         if (cur_offset > end)
1380                                 break;
1381                         path->slots[0]++;
1382                         goto next_slot;
1383                 }
1384
1385                 btrfs_release_path(path);
1386                 if (cow_start != (u64)-1) {
1387                         ret = cow_file_range(inode, locked_page,
1388                                              cow_start, found_key.offset - 1,
1389                                              page_started, nr_written, 1);
1390                         if (ret) {
1391                                 if (!nolock && nocow)
1392                                         btrfs_end_write_no_snapshoting(root);
1393                                 goto error;
1394                         }
1395                         cow_start = (u64)-1;
1396                 }
1397
1398                 if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
1399                         struct extent_map *em;
1400                         struct extent_map_tree *em_tree;
1401                         em_tree = &BTRFS_I(inode)->extent_tree;
1402                         em = alloc_extent_map();
1403                         BUG_ON(!em); /* -ENOMEM */
1404                         em->start = cur_offset;
1405                         em->orig_start = found_key.offset - extent_offset;
1406                         em->len = num_bytes;
1407                         em->block_len = num_bytes;
1408                         em->block_start = disk_bytenr;
1409                         em->orig_block_len = disk_num_bytes;
1410                         em->ram_bytes = ram_bytes;
1411                         em->bdev = root->fs_info->fs_devices->latest_bdev;
1412                         em->mod_start = em->start;
1413                         em->mod_len = em->len;
1414                         set_bit(EXTENT_FLAG_PINNED, &em->flags);
1415                         set_bit(EXTENT_FLAG_FILLING, &em->flags);
1416                         em->generation = -1;
1417                         while (1) {
1418                                 write_lock(&em_tree->lock);
1419                                 ret = add_extent_mapping(em_tree, em, 1);
1420                                 write_unlock(&em_tree->lock);
1421                                 if (ret != -EEXIST) {
1422                                         free_extent_map(em);
1423                                         break;
1424                                 }
1425                                 btrfs_drop_extent_cache(inode, em->start,
1426                                                 em->start + em->len - 1, 0);
1427                         }
1428                         type = BTRFS_ORDERED_PREALLOC;
1429                 } else {
1430                         type = BTRFS_ORDERED_NOCOW;
1431                 }
1432
1433                 ret = btrfs_add_ordered_extent(inode, cur_offset, disk_bytenr,
1434                                                num_bytes, num_bytes, type);
1435                 BUG_ON(ret); /* -ENOMEM */
1436
1437                 if (root->root_key.objectid ==
1438                     BTRFS_DATA_RELOC_TREE_OBJECTID) {
1439                         ret = btrfs_reloc_clone_csums(inode, cur_offset,
1440                                                       num_bytes);
1441                         if (ret) {
1442                                 if (!nolock && nocow)
1443                                         btrfs_end_write_no_snapshoting(root);
1444                                 goto error;
1445                         }
1446                 }
1447
1448                 extent_clear_unlock_delalloc(inode, cur_offset,
1449                                              cur_offset + num_bytes - 1,
1450                                              locked_page, EXTENT_LOCKED |
1451                                              EXTENT_DELALLOC, PAGE_UNLOCK |
1452                                              PAGE_SET_PRIVATE2);
1453                 if (!nolock && nocow)
1454                         btrfs_end_write_no_snapshoting(root);
1455                 cur_offset = extent_end;
1456                 if (cur_offset > end)
1457                         break;
1458         }
1459         btrfs_release_path(path);
1460
1461         if (cur_offset <= end && cow_start == (u64)-1) {
1462                 cow_start = cur_offset;
1463                 cur_offset = end;
1464         }
1465
1466         if (cow_start != (u64)-1) {
1467                 ret = cow_file_range(inode, locked_page, cow_start, end,
1468                                      page_started, nr_written, 1);
1469                 if (ret)
1470                         goto error;
1471         }
1472
1473 error:
1474         err = btrfs_end_transaction(trans, root);
1475         if (!ret)
1476                 ret = err;
1477
1478         if (ret && cur_offset < end)
1479                 extent_clear_unlock_delalloc(inode, cur_offset, end,
1480                                              locked_page, EXTENT_LOCKED |
1481                                              EXTENT_DELALLOC | EXTENT_DEFRAG |
1482                                              EXTENT_DO_ACCOUNTING, PAGE_UNLOCK |
1483                                              PAGE_CLEAR_DIRTY |
1484                                              PAGE_SET_WRITEBACK |
1485                                              PAGE_END_WRITEBACK);
1486         btrfs_free_path(path);
1487         return ret;
1488 }
1489
1490 static inline int need_force_cow(struct inode *inode, u64 start, u64 end)
1491 {
1492
1493         if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
1494             !(BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC))
1495                 return 0;
1496
1497         /*
1498          * @defrag_bytes is a hint value, no spinlock held here,
1499          * if is not zero, it means the file is defragging.
1500          * Force cow if given extent needs to be defragged.
1501          */
1502         if (BTRFS_I(inode)->defrag_bytes &&
1503             test_range_bit(&BTRFS_I(inode)->io_tree, start, end,
1504                            EXTENT_DEFRAG, 0, NULL))
1505                 return 1;
1506
1507         return 0;
1508 }
1509
1510 /*
1511  * extent_io.c call back to do delayed allocation processing
1512  */
1513 static int run_delalloc_range(struct inode *inode, struct page *locked_page,
1514                               u64 start, u64 end, int *page_started,
1515                               unsigned long *nr_written)
1516 {
1517         int ret;
1518         int force_cow = need_force_cow(inode, start, end);
1519
1520         if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW && !force_cow) {
1521                 ret = run_delalloc_nocow(inode, locked_page, start, end,
1522                                          page_started, 1, nr_written);
1523         } else if (BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC && !force_cow) {
1524                 ret = run_delalloc_nocow(inode, locked_page, start, end,
1525                                          page_started, 0, nr_written);
1526         } else if (!inode_need_compress(inode)) {
1527                 ret = cow_file_range(inode, locked_page, start, end,
1528                                       page_started, nr_written, 1);
1529         } else {
1530                 set_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
1531                         &BTRFS_I(inode)->runtime_flags);
1532                 ret = cow_file_range_async(inode, locked_page, start, end,
1533                                            page_started, nr_written);
1534         }
1535         return ret;
1536 }
1537
1538 static void btrfs_split_extent_hook(struct inode *inode,
1539                                     struct extent_state *orig, u64 split)
1540 {
1541         u64 size;
1542
1543         /* not delalloc, ignore it */
1544         if (!(orig->state & EXTENT_DELALLOC))
1545                 return;
1546
1547         size = orig->end - orig->start + 1;
1548         if (size > BTRFS_MAX_EXTENT_SIZE) {
1549                 u64 num_extents;
1550                 u64 new_size;
1551
1552                 /*
1553                  * See the explanation in btrfs_merge_extent_hook, the same
1554                  * applies here, just in reverse.
1555                  */
1556                 new_size = orig->end - split + 1;
1557                 num_extents = div64_u64(new_size + BTRFS_MAX_EXTENT_SIZE - 1,
1558                                         BTRFS_MAX_EXTENT_SIZE);
1559                 new_size = split - orig->start;
1560                 num_extents += div64_u64(new_size + BTRFS_MAX_EXTENT_SIZE - 1,
1561                                         BTRFS_MAX_EXTENT_SIZE);
1562                 if (div64_u64(size + BTRFS_MAX_EXTENT_SIZE - 1,
1563                               BTRFS_MAX_EXTENT_SIZE) >= num_extents)
1564                         return;
1565         }
1566
1567         spin_lock(&BTRFS_I(inode)->lock);
1568         BTRFS_I(inode)->outstanding_extents++;
1569         spin_unlock(&BTRFS_I(inode)->lock);
1570 }
1571
1572 /*
1573  * extent_io.c merge_extent_hook, used to track merged delayed allocation
1574  * extents so we can keep track of new extents that are just merged onto old
1575  * extents, such as when we are doing sequential writes, so we can properly
1576  * account for the metadata space we'll need.
1577  */
1578 static void btrfs_merge_extent_hook(struct inode *inode,
1579                                     struct extent_state *new,
1580                                     struct extent_state *other)
1581 {
1582         u64 new_size, old_size;
1583         u64 num_extents;
1584
1585         /* not delalloc, ignore it */
1586         if (!(other->state & EXTENT_DELALLOC))
1587                 return;
1588
1589         if (new->start > other->start)
1590                 new_size = new->end - other->start + 1;
1591         else
1592                 new_size = other->end - new->start + 1;
1593
1594         /* we're not bigger than the max, unreserve the space and go */
1595         if (new_size <= BTRFS_MAX_EXTENT_SIZE) {
1596                 spin_lock(&BTRFS_I(inode)->lock);
1597                 BTRFS_I(inode)->outstanding_extents--;
1598                 spin_unlock(&BTRFS_I(inode)->lock);
1599                 return;
1600         }
1601
1602         /*
1603          * We have to add up either side to figure out how many extents were
1604          * accounted for before we merged into one big extent.  If the number of
1605          * extents we accounted for is <= the amount we need for the new range
1606          * then we can return, otherwise drop.  Think of it like this
1607          *
1608          * [ 4k][MAX_SIZE]
1609          *
1610          * So we've grown the extent by a MAX_SIZE extent, this would mean we
1611          * need 2 outstanding extents, on one side we have 1 and the other side
1612          * we have 1 so they are == and we can return.  But in this case
1613          *
1614          * [MAX_SIZE+4k][MAX_SIZE+4k]
1615          *
1616          * Each range on their own accounts for 2 extents, but merged together
1617          * they are only 3 extents worth of accounting, so we need to drop in
1618          * this case.
1619          */
1620         old_size = other->end - other->start + 1;
1621         num_extents = div64_u64(old_size + BTRFS_MAX_EXTENT_SIZE - 1,
1622                                 BTRFS_MAX_EXTENT_SIZE);
1623         old_size = new->end - new->start + 1;
1624         num_extents += div64_u64(old_size + BTRFS_MAX_EXTENT_SIZE - 1,
1625                                  BTRFS_MAX_EXTENT_SIZE);
1626
1627         if (div64_u64(new_size + BTRFS_MAX_EXTENT_SIZE - 1,
1628                       BTRFS_MAX_EXTENT_SIZE) >= num_extents)
1629                 return;
1630
1631         spin_lock(&BTRFS_I(inode)->lock);
1632         BTRFS_I(inode)->outstanding_extents--;
1633         spin_unlock(&BTRFS_I(inode)->lock);
1634 }
1635
1636 static void btrfs_add_delalloc_inodes(struct btrfs_root *root,
1637                                       struct inode *inode)
1638 {
1639         spin_lock(&root->delalloc_lock);
1640         if (list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
1641                 list_add_tail(&BTRFS_I(inode)->delalloc_inodes,
1642                               &root->delalloc_inodes);
1643                 set_bit(BTRFS_INODE_IN_DELALLOC_LIST,
1644                         &BTRFS_I(inode)->runtime_flags);
1645                 root->nr_delalloc_inodes++;
1646                 if (root->nr_delalloc_inodes == 1) {
1647                         spin_lock(&root->fs_info->delalloc_root_lock);
1648                         BUG_ON(!list_empty(&root->delalloc_root));
1649                         list_add_tail(&root->delalloc_root,
1650                                       &root->fs_info->delalloc_roots);
1651                         spin_unlock(&root->fs_info->delalloc_root_lock);
1652                 }
1653         }
1654         spin_unlock(&root->delalloc_lock);
1655 }
1656
1657 static void btrfs_del_delalloc_inode(struct btrfs_root *root,
1658                                      struct inode *inode)
1659 {
1660         spin_lock(&root->delalloc_lock);
1661         if (!list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
1662                 list_del_init(&BTRFS_I(inode)->delalloc_inodes);
1663                 clear_bit(BTRFS_INODE_IN_DELALLOC_LIST,
1664                           &BTRFS_I(inode)->runtime_flags);
1665                 root->nr_delalloc_inodes--;
1666                 if (!root->nr_delalloc_inodes) {
1667                         spin_lock(&root->fs_info->delalloc_root_lock);
1668                         BUG_ON(list_empty(&root->delalloc_root));
1669                         list_del_init(&root->delalloc_root);
1670                         spin_unlock(&root->fs_info->delalloc_root_lock);
1671                 }
1672         }
1673         spin_unlock(&root->delalloc_lock);
1674 }
1675
1676 /*
1677  * extent_io.c set_bit_hook, used to track delayed allocation
1678  * bytes in this file, and to maintain the list of inodes that
1679  * have pending delalloc work to be done.
1680  */
1681 static void btrfs_set_bit_hook(struct inode *inode,
1682                                struct extent_state *state, unsigned *bits)
1683 {
1684
1685         if ((*bits & EXTENT_DEFRAG) && !(*bits & EXTENT_DELALLOC))
1686                 WARN_ON(1);
1687         /*
1688          * set_bit and clear bit hooks normally require _irqsave/restore
1689          * but in this case, we are only testing for the DELALLOC
1690          * bit, which is only set or cleared with irqs on
1691          */
1692         if (!(state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
1693                 struct btrfs_root *root = BTRFS_I(inode)->root;
1694                 u64 len = state->end + 1 - state->start;
1695                 bool do_list = !btrfs_is_free_space_inode(inode);
1696
1697                 if (*bits & EXTENT_FIRST_DELALLOC) {
1698                         *bits &= ~EXTENT_FIRST_DELALLOC;
1699                 } else {
1700                         spin_lock(&BTRFS_I(inode)->lock);
1701                         BTRFS_I(inode)->outstanding_extents++;
1702                         spin_unlock(&BTRFS_I(inode)->lock);
1703                 }
1704
1705                 /* For sanity tests */
1706                 if (btrfs_test_is_dummy_root(root))
1707                         return;
1708
1709                 __percpu_counter_add(&root->fs_info->delalloc_bytes, len,
1710                                      root->fs_info->delalloc_batch);
1711                 spin_lock(&BTRFS_I(inode)->lock);
1712                 BTRFS_I(inode)->delalloc_bytes += len;
1713                 if (*bits & EXTENT_DEFRAG)
1714                         BTRFS_I(inode)->defrag_bytes += len;
1715                 if (do_list && !test_bit(BTRFS_INODE_IN_DELALLOC_LIST,
1716                                          &BTRFS_I(inode)->runtime_flags))
1717                         btrfs_add_delalloc_inodes(root, inode);
1718                 spin_unlock(&BTRFS_I(inode)->lock);
1719         }
1720 }
1721
1722 /*
1723  * extent_io.c clear_bit_hook, see set_bit_hook for why
1724  */
1725 static void btrfs_clear_bit_hook(struct inode *inode,
1726                                  struct extent_state *state,
1727                                  unsigned *bits)
1728 {
1729         u64 len = state->end + 1 - state->start;
1730         u64 num_extents = div64_u64(len + BTRFS_MAX_EXTENT_SIZE -1,
1731                                     BTRFS_MAX_EXTENT_SIZE);
1732
1733         spin_lock(&BTRFS_I(inode)->lock);
1734         if ((state->state & EXTENT_DEFRAG) && (*bits & EXTENT_DEFRAG))
1735                 BTRFS_I(inode)->defrag_bytes -= len;
1736         spin_unlock(&BTRFS_I(inode)->lock);
1737
1738         /*
1739          * set_bit and clear bit hooks normally require _irqsave/restore
1740          * but in this case, we are only testing for the DELALLOC
1741          * bit, which is only set or cleared with irqs on
1742          */
1743         if ((state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
1744                 struct btrfs_root *root = BTRFS_I(inode)->root;
1745                 bool do_list = !btrfs_is_free_space_inode(inode);
1746
1747                 if (*bits & EXTENT_FIRST_DELALLOC) {
1748                         *bits &= ~EXTENT_FIRST_DELALLOC;
1749                 } else if (!(*bits & EXTENT_DO_ACCOUNTING)) {
1750                         spin_lock(&BTRFS_I(inode)->lock);
1751                         BTRFS_I(inode)->outstanding_extents -= num_extents;
1752                         spin_unlock(&BTRFS_I(inode)->lock);
1753                 }
1754
1755                 /*
1756                  * We don't reserve metadata space for space cache inodes so we
1757                  * don't need to call dellalloc_release_metadata if there is an
1758                  * error.
1759                  */
1760                 if (*bits & EXTENT_DO_ACCOUNTING &&
1761                     root != root->fs_info->tree_root)
1762                         btrfs_delalloc_release_metadata(inode, len);
1763
1764                 /* For sanity tests. */
1765                 if (btrfs_test_is_dummy_root(root))
1766                         return;
1767
1768                 if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID
1769                     && do_list && !(state->state & EXTENT_NORESERVE))
1770                         btrfs_free_reserved_data_space(inode, len);
1771
1772                 __percpu_counter_add(&root->fs_info->delalloc_bytes, -len,
1773                                      root->fs_info->delalloc_batch);
1774                 spin_lock(&BTRFS_I(inode)->lock);
1775                 BTRFS_I(inode)->delalloc_bytes -= len;
1776                 if (do_list && BTRFS_I(inode)->delalloc_bytes == 0 &&
1777                     test_bit(BTRFS_INODE_IN_DELALLOC_LIST,
1778                              &BTRFS_I(inode)->runtime_flags))
1779                         btrfs_del_delalloc_inode(root, inode);
1780                 spin_unlock(&BTRFS_I(inode)->lock);
1781         }
1782 }
1783
1784 /*
1785  * extent_io.c merge_bio_hook, this must check the chunk tree to make sure
1786  * we don't create bios that span stripes or chunks
1787  */
1788 int btrfs_merge_bio_hook(int rw, struct page *page, unsigned long offset,
1789                          size_t size, struct bio *bio,
1790                          unsigned long bio_flags)
1791 {
1792         struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
1793         u64 logical = (u64)bio->bi_iter.bi_sector << 9;
1794         u64 length = 0;
1795         u64 map_length;
1796         int ret;
1797
1798         if (bio_flags & EXTENT_BIO_COMPRESSED)
1799                 return 0;
1800
1801         length = bio->bi_iter.bi_size;
1802         map_length = length;
1803         ret = btrfs_map_block(root->fs_info, rw, logical,
1804                               &map_length, NULL, 0);
1805         /* Will always return 0 with map_multi == NULL */
1806         BUG_ON(ret < 0);
1807         if (map_length < length + size)
1808                 return 1;
1809         return 0;
1810 }
1811
1812 /*
1813  * in order to insert checksums into the metadata in large chunks,
1814  * we wait until bio submission time.   All the pages in the bio are
1815  * checksummed and sums are attached onto the ordered extent record.
1816  *
1817  * At IO completion time the cums attached on the ordered extent record
1818  * are inserted into the btree
1819  */
1820 static int __btrfs_submit_bio_start(struct inode *inode, int rw,
1821                                     struct bio *bio, int mirror_num,
1822                                     unsigned long bio_flags,
1823                                     u64 bio_offset)
1824 {
1825         struct btrfs_root *root = BTRFS_I(inode)->root;
1826         int ret = 0;
1827
1828         ret = btrfs_csum_one_bio(root, inode, bio, 0, 0);
1829         BUG_ON(ret); /* -ENOMEM */
1830         return 0;
1831 }
1832
1833 /*
1834  * in order to insert checksums into the metadata in large chunks,
1835  * we wait until bio submission time.   All the pages in the bio are
1836  * checksummed and sums are attached onto the ordered extent record.
1837  *
1838  * At IO completion time the cums attached on the ordered extent record
1839  * are inserted into the btree
1840  */
1841 static int __btrfs_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
1842                           int mirror_num, unsigned long bio_flags,
1843                           u64 bio_offset)
1844 {
1845         struct btrfs_root *root = BTRFS_I(inode)->root;
1846         int ret;
1847
1848         ret = btrfs_map_bio(root, rw, bio, mirror_num, 1);
1849         if (ret)
1850                 bio_endio(bio, ret);
1851         return ret;
1852 }
1853
1854 /*
1855  * extent_io.c submission hook. This does the right thing for csum calculation
1856  * on write, or reading the csums from the tree before a read
1857  */
1858 static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
1859                           int mirror_num, unsigned long bio_flags,
1860                           u64 bio_offset)
1861 {
1862         struct btrfs_root *root = BTRFS_I(inode)->root;
1863         int ret = 0;
1864         int skip_sum;
1865         int metadata = 0;
1866         int async = !atomic_read(&BTRFS_I(inode)->sync_writers);
1867
1868         skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
1869
1870         if (btrfs_is_free_space_inode(inode))
1871                 metadata = 2;
1872
1873         if (!(rw & REQ_WRITE)) {
1874                 ret = btrfs_bio_wq_end_io(root->fs_info, bio, metadata);
1875                 if (ret)
1876                         goto out;
1877
1878                 if (bio_flags & EXTENT_BIO_COMPRESSED) {
1879                         ret = btrfs_submit_compressed_read(inode, bio,
1880                                                            mirror_num,
1881                                                            bio_flags);
1882                         goto out;
1883                 } else if (!skip_sum) {
1884                         ret = btrfs_lookup_bio_sums(root, inode, bio, NULL);
1885                         if (ret)
1886                                 goto out;
1887                 }
1888                 goto mapit;
1889         } else if (async && !skip_sum) {
1890                 /* csum items have already been cloned */
1891                 if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
1892                         goto mapit;
1893                 /* we're doing a write, do the async checksumming */
1894                 ret = btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
1895                                    inode, rw, bio, mirror_num,
1896                                    bio_flags, bio_offset,
1897                                    __btrfs_submit_bio_start,
1898                                    __btrfs_submit_bio_done);
1899                 goto out;
1900         } else if (!skip_sum) {
1901                 ret = btrfs_csum_one_bio(root, inode, bio, 0, 0);
1902                 if (ret)
1903                         goto out;
1904         }
1905
1906 mapit:
1907         ret = btrfs_map_bio(root, rw, bio, mirror_num, 0);
1908
1909 out:
1910         if (ret < 0)
1911                 bio_endio(bio, ret);
1912         return ret;
1913 }
1914
1915 /*
1916  * given a list of ordered sums record them in the inode.  This happens
1917  * at IO completion time based on sums calculated at bio submission time.
1918  */
1919 static noinline int add_pending_csums(struct btrfs_trans_handle *trans,
1920                              struct inode *inode, u64 file_offset,
1921                              struct list_head *list)
1922 {
1923         struct btrfs_ordered_sum *sum;
1924
1925         list_for_each_entry(sum, list, list) {
1926                 trans->adding_csums = 1;
1927                 btrfs_csum_file_blocks(trans,
1928                        BTRFS_I(inode)->root->fs_info->csum_root, sum);
1929                 trans->adding_csums = 0;
1930         }
1931         return 0;
1932 }
1933
1934 int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
1935                               struct extent_state **cached_state)
1936 {
1937         WARN_ON((end & (PAGE_CACHE_SIZE - 1)) == 0);
1938         return set_extent_delalloc(&BTRFS_I(inode)->io_tree, start, end,
1939                                    cached_state, GFP_NOFS);
1940 }
1941
1942 /*
      * Deferred fixup request for one page; see btrfs_writepage_start_hook
      * for details on why this is required.
      *
      * btrfs_writepage_start_hook allocates one of these, takes a page
      * reference and queues @work.  btrfs_writepage_fixup_worker drops the
      * page reference and frees the structure.
      */
1943 struct btrfs_writepage_fixup {
             /* page to fix up */
1944         struct page *page;
             /* work item whose handler is btrfs_writepage_fixup_worker */
1945         struct btrfs_work work;
1946 };
1947
1948 static void btrfs_writepage_fixup_worker(struct btrfs_work *work)
1949 {
1950         struct btrfs_writepage_fixup *fixup;
1951         struct btrfs_ordered_extent *ordered;
1952         struct extent_state *cached_state = NULL;
1953         struct page *page;
1954         struct inode *inode;
1955         u64 page_start;
1956         u64 page_end;
1957         int ret;
1958
1959         fixup = container_of(work, struct btrfs_writepage_fixup, work);
1960         page = fixup->page;
1961 again:
1962         lock_page(page);
1963         if (!page->mapping || !PageDirty(page) || !PageChecked(page)) {
1964                 ClearPageChecked(page);
1965                 goto out_page;
1966         }
1967
1968         inode = page->mapping->host;
1969         page_start = page_offset(page);
1970         page_end = page_offset(page) + PAGE_CACHE_SIZE - 1;
1971
1972         lock_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end, 0,
1973                          &cached_state);
1974
1975         /* already ordered? We're done */
1976         if (PagePrivate2(page))
1977                 goto out;
1978
1979         ordered = btrfs_lookup_ordered_extent(inode, page_start);
1980         if (ordered) {
1981                 unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start,
1982                                      page_end, &cached_state, GFP_NOFS);
1983                 unlock_page(page);
1984                 btrfs_start_ordered_extent(inode, ordered, 1);
1985                 btrfs_put_ordered_extent(ordered);
1986                 goto again;
1987         }
1988
1989         ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
1990         if (ret) {
1991                 mapping_set_error(page->mapping, ret);
1992                 end_extent_writepage(page, ret, page_start, page_end);
1993                 ClearPageChecked(page);
1994                 goto out;
1995          }
1996
1997         btrfs_set_extent_delalloc(inode, page_start, page_end, &cached_state);
1998         ClearPageChecked(page);
1999         set_page_dirty(page);
2000 out:
2001         unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start, page_end,
2002                              &cached_state, GFP_NOFS);
2003 out_page:
2004         unlock_page(page);
2005         page_cache_release(page);
2006         kfree(fixup);
2007 }
2008
2009 /*
2010  * There are a few paths in the higher layers of the kernel that directly
2011  * set the page dirty bit without asking the filesystem if it is a
2012  * good idea.  This causes problems because we want to make sure COW
2013  * properly happens and the data=ordered rules are followed.
2014  *
2015  * In our case any range that doesn't have the ORDERED bit set
2016  * hasn't been properly setup for IO.  We kick off an async process
2017  * to fix it up.  The async helper will wait for ordered extents, set
2018  * the delalloc bit and make it safe to write the page.
2019  */
2020 static int btrfs_writepage_start_hook(struct page *page, u64 start, u64 end)
2021 {
2022         struct inode *inode = page->mapping->host;
2023         struct btrfs_writepage_fixup *fixup;
2024         struct btrfs_root *root = BTRFS_I(inode)->root;
2025
2026         /* this page is properly in the ordered list */
2027         if (TestClearPagePrivate2(page))
2028                 return 0;
2029
2030         if (PageChecked(page))
2031                 return -EAGAIN;
2032
2033         fixup = kzalloc(sizeof(*fixup), GFP_NOFS);
2034         if (!fixup)
2035                 return -EAGAIN;
2036
2037         SetPageChecked(page);
2038         page_cache_get(page);
2039         btrfs_init_work(&fixup->work, btrfs_fixup_helper,
2040                         btrfs_writepage_fixup_worker, NULL, NULL);
2041         fixup->page = page;
2042         btrfs_queue_work(root->fs_info->fixup_workers, &fixup->work);
2043         return -EBUSY;
2044 }
2045
2046 static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
2047                                        struct inode *inode, u64 file_pos,
2048                                        u64 disk_bytenr, u64 disk_num_bytes,
2049                                        u64 num_bytes, u64 ram_bytes,
2050                                        u8 compression, u8 encryption,
2051                                        u16 other_encoding, int extent_type)
2052 {
2053         struct btrfs_root *root = BTRFS_I(inode)->root;
2054         struct btrfs_file_extent_item *fi;
2055         struct btrfs_path *path;
2056         struct extent_buffer *leaf;
2057         struct btrfs_key ins;
2058         int extent_inserted = 0;
2059         int ret;
2060
2061         path = btrfs_alloc_path();
2062         if (!path)
2063                 return -ENOMEM;
2064
2065         /*
2066          * we may be replacing one extent in the tree with another.
2067          * The new extent is pinned in the extent map, and we don't want
2068          * to drop it from the cache until it is completely in the btree.
2069          *
2070          * So, tell btrfs_drop_extents to leave this extent in the cache.
2071          * the caller is expected to unpin it and allow it to be merged
2072          * with the others.
2073          */
2074         ret = __btrfs_drop_extents(trans, root, inode, path, file_pos,
2075                                    file_pos + num_bytes, NULL, 0,
2076                                    1, sizeof(*fi), &extent_inserted);
2077         if (ret)
2078                 goto out;
2079
2080         if (!extent_inserted) {
2081                 ins.objectid = btrfs_ino(inode);
2082                 ins.offset = file_pos;
2083                 ins.type = BTRFS_EXTENT_DATA_KEY;
2084
2085                 path->leave_spinning = 1;
2086                 ret = btrfs_insert_empty_item(trans, root, path, &ins,
2087                                               sizeof(*fi));
2088                 if (ret)
2089                         goto out;
2090         }
2091         leaf = path->nodes[0];
2092         fi = btrfs_item_ptr(leaf, path->slots[0],
2093                             struct btrfs_file_extent_item);
2094         btrfs_set_file_extent_generation(leaf, fi, trans->transid);
2095         btrfs_set_file_extent_type(leaf, fi, extent_type);
2096         btrfs_set_file_extent_disk_bytenr(leaf, fi, disk_bytenr);
2097         btrfs_set_file_extent_disk_num_bytes(leaf, fi, disk_num_bytes);
2098         btrfs_set_file_extent_offset(leaf, fi, 0);
2099         btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes);
2100         btrfs_set_file_extent_ram_bytes(leaf, fi, ram_bytes);
2101         btrfs_set_file_extent_compression(leaf, fi, compression);
2102         btrfs_set_file_extent_encryption(leaf, fi, encryption);
2103         btrfs_set_file_extent_other_encoding(leaf, fi, other_encoding);
2104
2105         btrfs_mark_buffer_dirty(leaf);
2106         btrfs_release_path(path);
2107
2108         inode_add_bytes(inode, num_bytes);
2109
2110         ins.objectid = disk_bytenr;
2111         ins.offset = disk_num_bytes;
2112         ins.type = BTRFS_EXTENT_ITEM_KEY;
2113         ret = btrfs_alloc_reserved_file_extent(trans, root,
2114                                         root->root_key.objectid,
2115                                         btrfs_ino(inode), file_pos, &ins);
2116 out:
2117         btrfs_free_path(path);
2118
2119         return ret;
2120 }
2121
2122 /* snapshot-aware defrag */
     /*
      * One file extent item, in some root/inode other than the one being
      * defragmented, that references an old extent.  Filled in by
      * record_one_backref() and kept in the rb-tree rooted at
      * new_sa_defrag_extent.root, ordered by backref_comp().
      */
2123 struct sa_defrag_extent_backref {
2124         struct rb_node node;                    /* link in new_sa_defrag_extent.root */
2125         struct old_sa_defrag_extent *old;       /* old extent this backref was found for */
2126         u64 root_id;                            /* objectid of the root holding the reference */
2127         u64 inum;                               /* inode number holding the reference */
2128         u64 file_pos;                           /* key.offset of the referencing file extent item */
2129         u64 extent_offset;                      /* 'offset' field of that file extent item */
2130         u64 num_bytes;                          /* 'num_bytes' field of that file extent item */
2131         u64 generation;                         /* 'generation' field, rechecked before relinking */
2132 };
2133
     /*
      * One pre-defrag file extent overlapping the defragmented file range.
      * Built by record_old_file_extents() and queued on
      * new_sa_defrag_extent.head.
      */
2134 struct old_sa_defrag_extent {
2135         struct list_head list;                  /* link in new_sa_defrag_extent.head */
2136         struct new_sa_defrag_extent *new;       /* owning description of the new extent */
2137 
2138         u64 extent_offset;      /* 'offset' field of the old file extent item */
2139         u64 bytenr;             /* disk bytenr of the old extent */
2140         u64 offset;             /* start of the overlap, relative to the item's key.offset */
2141         u64 len;                /* length of the overlap with the new extent's file range */
2142         int count;              /* number of backrefs recorded for this extent */
2143 };
2144
     /*
      * Description of the newly written extent, taken from the ordered extent
      * in record_old_file_extents(), together with everything recorded about
      * the old extents it replaces.
      */
2145 struct new_sa_defrag_extent {
2146         struct rb_root root;            /* tree of sa_defrag_extent_backref */
2147         struct list_head head;          /* list of old_sa_defrag_extent */
2148         struct btrfs_path *path;        /* scratch path, set by record_extent_backrefs() */
2149         struct inode *inode;            /* inode the ordered extent belongs to */
2150         u64 file_pos;                   /* ordered->file_offset */
2151         u64 len;                        /* ordered->len */
2152         u64 bytenr;                     /* ordered->start */
2153         u64 disk_len;                   /* ordered->disk_len */
2154         u8 compress_type;               /* ordered->compress_type */
2155 };
2156
2157 static int backref_comp(struct sa_defrag_extent_backref *b1,
2158                         struct sa_defrag_extent_backref *b2)
2159 {
2160         if (b1->root_id < b2->root_id)
2161                 return -1;
2162         else if (b1->root_id > b2->root_id)
2163                 return 1;
2164
2165         if (b1->inum < b2->inum)
2166                 return -1;
2167         else if (b1->inum > b2->inum)
2168                 return 1;
2169
2170         if (b1->file_pos < b2->file_pos)
2171                 return -1;
2172         else if (b1->file_pos > b2->file_pos)
2173                 return 1;
2174
2175         /*
2176          * [------------------------------] ===> (a range of space)
2177          *     |<--->|   |<---->| =============> (fs/file tree A)
2178          * |<---------------------------->| ===> (fs/file tree B)
2179          *
2180          * A range of space can refer to two file extents in one tree while
2181          * refer to only one file extent in another tree.
2182          *
2183          * So we may process a disk offset more than one time(two extents in A)
2184          * and locate at the same extent(one extent in B), then insert two same
2185          * backrefs(both refer to the extent in B).
2186          */
2187         return 0;
2188 }
2189
2190 static void backref_insert(struct rb_root *root,
2191                            struct sa_defrag_extent_backref *backref)
2192 {
2193         struct rb_node **p = &root->rb_node;
2194         struct rb_node *parent = NULL;
2195         struct sa_defrag_extent_backref *entry;
2196         int ret;
2197
2198         while (*p) {
2199                 parent = *p;
2200                 entry = rb_entry(parent, struct sa_defrag_extent_backref, node);
2201
2202                 ret = backref_comp(backref, entry);
2203                 if (ret < 0)
2204                         p = &(*p)->rb_left;
2205                 else
2206                         p = &(*p)->rb_right;
2207         }
2208
2209         rb_link_node(&backref->node, parent, p);
2210         rb_insert_color(&backref->node, root);
2211 }
2212
2213 /*
2214  * Note the backref might has changed, and in this case we just return 0.
2215  */
2216 static noinline int record_one_backref(u64 inum, u64 offset, u64 root_id,
2217                                        void *ctx)
2218 {
2219         struct btrfs_file_extent_item *extent;
2220         struct btrfs_fs_info *fs_info;
2221         struct old_sa_defrag_extent *old = ctx;
2222         struct new_sa_defrag_extent *new = old->new;
2223         struct btrfs_path *path = new->path;
2224         struct btrfs_key key;
2225         struct btrfs_root *root;
2226         struct sa_defrag_extent_backref *backref;
2227         struct extent_buffer *leaf;
2228         struct inode *inode = new->inode;
2229         int slot;
2230         int ret;
2231         u64 extent_offset;
2232         u64 num_bytes;
2233
2234         if (BTRFS_I(inode)->root->root_key.objectid == root_id &&
2235             inum == btrfs_ino(inode))
2236                 return 0;
2237
2238         key.objectid = root_id;
2239         key.type = BTRFS_ROOT_ITEM_KEY;
2240         key.offset = (u64)-1;
2241
2242         fs_info = BTRFS_I(inode)->root->fs_info;
2243         root = btrfs_read_fs_root_no_name(fs_info, &key);
2244         if (IS_ERR(root)) {
2245                 if (PTR_ERR(root) == -ENOENT)
2246                         return 0;
2247                 WARN_ON(1);
2248                 pr_debug("inum=%llu, offset=%llu, root_id=%llu\n",
2249                          inum, offset, root_id);
2250                 return PTR_ERR(root);
2251         }
2252
2253         key.objectid = inum;
2254         key.type = BTRFS_EXTENT_DATA_KEY;
2255         if (offset > (u64)-1 << 32)
2256                 key.offset = 0;
2257         else
2258                 key.offset = offset;
2259
2260         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2261         if (WARN_ON(ret < 0))
2262                 return ret;
2263         ret = 0;
2264
2265         while (1) {
2266                 cond_resched();
2267
2268                 leaf = path->nodes[0];
2269                 slot = path->slots[0];
2270
2271                 if (slot >= btrfs_header_nritems(leaf)) {
2272                         ret = btrfs_next_leaf(root, path);
2273                         if (ret < 0) {
2274                                 goto out;
2275                         } else if (ret > 0) {
2276                                 ret = 0;
2277                                 goto out;
2278                         }
2279                         continue;
2280                 }
2281
2282                 path->slots[0]++;
2283
2284                 btrfs_item_key_to_cpu(leaf, &key, slot);
2285
2286                 if (key.objectid > inum)
2287                         goto out;
2288
2289                 if (key.objectid < inum || key.type != BTRFS_EXTENT_DATA_KEY)
2290                         continue;
2291
2292                 extent = btrfs_item_ptr(leaf, slot,
2293                                         struct btrfs_file_extent_item);
2294
2295                 if (btrfs_file_extent_disk_bytenr(leaf, extent) != old->bytenr)
2296                         continue;
2297
2298                 /*
2299                  * 'offset' refers to the exact key.offset,
2300                  * NOT the 'offset' field in btrfs_extent_data_ref, ie.
2301                  * (key.offset - extent_offset).
2302                  */
2303                 if (key.offset != offset)
2304                         continue;
2305
2306                 extent_offset = btrfs_file_extent_offset(leaf, extent);
2307                 num_bytes = btrfs_file_extent_num_bytes(leaf, extent);
2308
2309                 if (extent_offset >= old->extent_offset + old->offset +
2310                     old->len || extent_offset + num_bytes <=
2311                     old->extent_offset + old->offset)
2312                         continue;
2313                 break;
2314         }
2315
2316         backref = kmalloc(sizeof(*backref), GFP_NOFS);
2317         if (!backref) {
2318                 ret = -ENOENT;
2319                 goto out;
2320         }
2321
2322         backref->root_id = root_id;
2323         backref->inum = inum;
2324         backref->file_pos = offset;
2325         backref->num_bytes = num_bytes;
2326         backref->extent_offset = extent_offset;
2327         backref->generation = btrfs_file_extent_generation(leaf, extent);
2328         backref->old = old;
2329         backref_insert(&new->root, backref);
2330         old->count++;
2331 out:
2332         btrfs_release_path(path);
2333         WARN_ON(ret);
2334         return ret;
2335 }
2336
2337 static noinline bool record_extent_backrefs(struct btrfs_path *path,
2338                                    struct new_sa_defrag_extent *new)
2339 {
2340         struct btrfs_fs_info *fs_info = BTRFS_I(new->inode)->root->fs_info;
2341         struct old_sa_defrag_extent *old, *tmp;
2342         int ret;
2343
2344         new->path = path;
2345
2346         list_for_each_entry_safe(old, tmp, &new->head, list) {
2347                 ret = iterate_inodes_from_logical(old->bytenr +
2348                                                   old->extent_offset, fs_info,
2349                                                   path, record_one_backref,
2350                                                   old);
2351                 if (ret < 0 && ret != -ENOENT)
2352                         return false;
2353
2354                 /* no backref to be processed for this extent */
2355                 if (!old->count) {
2356                         list_del(&old->list);
2357                         kfree(old);
2358                 }
2359         }
2360
2361         if (list_empty(&new->head))
2362                 return false;
2363
2364         return true;
2365 }
2366
2367 static int relink_is_mergable(struct extent_buffer *leaf,
2368                               struct btrfs_file_extent_item *fi,
2369                               struct new_sa_defrag_extent *new)
2370 {
2371         if (btrfs_file_extent_disk_bytenr(leaf, fi) != new->bytenr)
2372                 return 0;
2373
2374         if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG)
2375                 return 0;
2376
2377         if (btrfs_file_extent_compression(leaf, fi) != new->compress_type)
2378                 return 0;
2379
2380         if (btrfs_file_extent_encryption(leaf, fi) ||
2381             btrfs_file_extent_other_encoding(leaf, fi))
2382                 return 0;
2383
2384         return 1;
2385 }
2386
2387 /*
      * Make the file extent described by @backref point at the new
      * (defragmented) extent instead of the old one.
      *
      * @path:    scratch path; released again before returning
      * @prev:    backref relinked by the previous call, or NULL
      * @backref: backref to relink now
      *
      * Note the backref might have changed since it was recorded, and in this
      * case we just return 0.
      *
      * Returns 1 when the range was relinked (a new item was inserted or the
      * preceding item was extended), 0 when the backref was skipped (root
      * missing or read-only, inode not loadable, ordered extent present on
      * the range, item not found or its generation changed), or a negative
      * errno.
      */
2390 static noinline int relink_extent_backref(struct btrfs_path *path,
2391                                  struct sa_defrag_extent_backref *prev,
2392                                  struct sa_defrag_extent_backref *backref)
2393 {
2394         struct btrfs_file_extent_item *extent;
2395         struct btrfs_file_extent_item *item;
2396         struct btrfs_ordered_extent *ordered;
2397         struct btrfs_trans_handle *trans;
2398         struct btrfs_fs_info *fs_info;
2399         struct btrfs_root *root;
2400         struct btrfs_key key;
2401         struct extent_buffer *leaf;
2402         struct old_sa_defrag_extent *old = backref->old;
2403         struct new_sa_defrag_extent *new = old->new;
2404         struct inode *src_inode = new->inode;
2405         struct inode *inode;
2406         struct extent_state *cached = NULL;
2407         int ret = 0;
2408         u64 start;
2409         u64 len;
2410         u64 lock_start;
2411         u64 lock_end;
2412         bool merge = false;
2413         int index;
2414 
             /*
              * Only try to merge when the previously relinked backref is in the
              * same root and inode and ends exactly where this one begins.
              */
2415         if (prev && prev->root_id == backref->root_id &&
2416             prev->inum == backref->inum &&
2417             prev->file_pos + prev->num_bytes == backref->file_pos)
2418                 merge = true;
2419 
2420         /* step 1: get root */
2421         key.objectid = backref->root_id;
2422         key.type = BTRFS_ROOT_ITEM_KEY;
2423         key.offset = (u64)-1;
2424 
2425         fs_info = BTRFS_I(src_inode)->root->fs_info;
             /* subvol_srcu is held across the root lookup and the iget below */
2426         index = srcu_read_lock(&fs_info->subvol_srcu);
2427 
2428         root = btrfs_read_fs_root_no_name(fs_info, &key);
2429         if (IS_ERR(root)) {
2430                 srcu_read_unlock(&fs_info->subvol_srcu, index);
2431                 if (PTR_ERR(root) == -ENOENT)
2432                         return 0;
2433                 return PTR_ERR(root);
2434         }
2435 
             /* read-only roots are left untouched */
2436         if (btrfs_root_readonly(root)) {
2437                 srcu_read_unlock(&fs_info->subvol_srcu, index);
2438                 return 0;
2439         }
2440 
2441         /* step 2: get inode */
2442         key.objectid = backref->inum;
2443         key.type = BTRFS_INODE_ITEM_KEY;
2444         key.offset = 0;
2445 
2446         inode = btrfs_iget(fs_info->sb, &key, root, NULL);
2447         if (IS_ERR(inode)) {
                     /* failure to load the inode is not reported, the backref is skipped */
2448                 srcu_read_unlock(&fs_info->subvol_srcu, index);
2449                 return 0;
2450         }
2451 
2452         srcu_read_unlock(&fs_info->subvol_srcu, index);
2453 
2454         /* step 3: relink backref */
2455         lock_start = backref->file_pos;
2456         lock_end = backref->file_pos + backref->num_bytes - 1;
2457         lock_extent_bits(&BTRFS_I(inode)->io_tree, lock_start, lock_end,
2458                          0, &cached);
2459 
             /* an ordered extent was found for this range: skip the backref (ret stays 0) */
2460         ordered = btrfs_lookup_first_ordered_extent(inode, lock_end);
2461         if (ordered) {
2462                 btrfs_put_ordered_extent(ordered);
2463                 goto out_unlock;
2464         }
2465 
2466         trans = btrfs_join_transaction(root);
2467         if (IS_ERR(trans)) {
2468                 ret = PTR_ERR(trans);
2469                 goto out_unlock;
2470         }
2471 
             /* re-read the file extent item and make sure it is still the recorded one */
2472         key.objectid = backref->inum;
2473         key.type = BTRFS_EXTENT_DATA_KEY;
2474         key.offset = backref->file_pos;
2475 
2476         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2477         if (ret < 0) {
2478                 goto out_free_path;
2479         } else if (ret > 0) {
                     /* the item no longer exists */
2480                 ret = 0;
2481                 goto out_free_path;
2482         }
2483 
2484         extent = btrfs_item_ptr(path->nodes[0], path->slots[0],
2485                                 struct btrfs_file_extent_item);
2486 
             /* ret is 0 here, so a changed generation means "skipped" */
2487         if (btrfs_file_extent_generation(path->nodes[0], extent) !=
2488             backref->generation)
2489                 goto out_free_path;
2490 
2491         btrfs_release_path(path);
2492 
             /*
              * Work out the file range [start, start + len) to relink: the part
              * of the backref's extent range that intersects the replaced part
              * of the old extent (old->extent_offset + old->offset, old->len).
              */
2493         start = backref->file_pos;
2494         if (backref->extent_offset < old->extent_offset + old->offset)
2495                 start += old->extent_offset + old->offset -
2496                          backref->extent_offset;
2497 
2498         len = min(backref->extent_offset + backref->num_bytes,
2499                   old->extent_offset + old->offset + old->len);
2500         len -= max(backref->extent_offset, old->extent_offset + old->offset);
2501 
2502         ret = btrfs_drop_extents(trans, root, inode, start,
2503                                  start + len, 1);
2504         if (ret)
2505                 goto out_free_path;
2506 again:
2507         key.objectid = btrfs_ino(inode);
2508         key.type = BTRFS_EXTENT_DATA_KEY;
2509         key.offset = start;
2510 
2511         path->leave_spinning = 1;
2512         if (merge) {
2513                 struct btrfs_file_extent_item *fi;
2514                 u64 extent_len;
2515                 struct btrfs_key found_key;
2516 
2517                 ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
2518                 if (ret < 0)
2519                         goto out_free_path;
2520 
                     /* look at the item in the slot just before the search position */
2521                 path->slots[0]--;
2522                 leaf = path->nodes[0];
2523                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
2524 
2525                 fi = btrfs_item_ptr(leaf, path->slots[0],
2526                                     struct btrfs_file_extent_item);
2527                 extent_len = btrfs_file_extent_num_bytes(leaf, fi);
2528 
                     /*
                      * If that item ends at 'start' and already points at the new
                      * extent, just grow it; no new item and no extra extent
                      * reference are added on this path.
                      */
2529                 if (extent_len + found_key.offset == start &&
2530                     relink_is_mergable(leaf, fi, new)) {
2531                         btrfs_set_file_extent_num_bytes(leaf, fi,
2532                                                         extent_len + len);
2533                         btrfs_mark_buffer_dirty(leaf);
2534                         inode_add_bytes(inode, len);
2535 
2536                         ret = 1;
2537                         goto out_free_path;
2538                 } else {
                             /* cannot merge: retry once, this time inserting a new item */
2539                         merge = false;
2540                         btrfs_release_path(path);
2541                         goto again;
2542                 }
2543         }
2544 
2545         ret = btrfs_insert_empty_item(trans, root, path, &key,
2546                                         sizeof(*extent));
2547         if (ret) {
2548                 btrfs_abort_transaction(trans, root, ret);
2549                 goto out_free_path;
2550         }
2551 
             /* fill in a regular file extent item pointing into the new extent */
2552         leaf = path->nodes[0];
2553         item = btrfs_item_ptr(leaf, path->slots[0],
2554                                 struct btrfs_file_extent_item);
2555         btrfs_set_file_extent_disk_bytenr(leaf, item, new->bytenr);
2556         btrfs_set_file_extent_disk_num_bytes(leaf, item, new->disk_len);
2557         btrfs_set_file_extent_offset(leaf, item, start - new->file_pos);
2558         btrfs_set_file_extent_num_bytes(leaf, item, len);
2559         btrfs_set_file_extent_ram_bytes(leaf, item, new->len);
2560         btrfs_set_file_extent_generation(leaf, item, trans->transid);
2561         btrfs_set_file_extent_type(leaf, item, BTRFS_FILE_EXTENT_REG);
2562         btrfs_set_file_extent_compression(leaf, item, new->compress_type);
2563         btrfs_set_file_extent_encryption(leaf, item, 0);
2564         btrfs_set_file_extent_other_encoding(leaf, item, 0);
2565 
2566         btrfs_mark_buffer_dirty(leaf);
2567         inode_add_bytes(inode, len);
2568         btrfs_release_path(path);
2569 
             /* account for the new reference held by this root/inode */
2570         ret = btrfs_inc_extent_ref(trans, root, new->bytenr,
2571                         new->disk_len, 0,
2572                         backref->root_id, backref->inum,
2573                         new->file_pos, 0);      /* start - extent_offset */
2574         if (ret) {
2575                 btrfs_abort_transaction(trans, root, ret);
2576                 goto out_free_path;
2577         }
2578 
2579         ret = 1;
2580 out_free_path:
2581         btrfs_release_path(path);
2582         path->leave_spinning = 0;
2583         btrfs_end_transaction(trans, root);
2584 out_unlock:
2585         unlock_extent_cached(&BTRFS_I(inode)->io_tree, lock_start, lock_end,
2586                              &cached, GFP_NOFS);
2587         iput(inode);
2588         return ret;
2589 }
2590
2591 static void free_sa_defrag_extent(struct new_sa_defrag_extent *new)
2592 {
2593         struct old_sa_defrag_extent *old, *tmp;
2594
2595         if (!new)
2596                 return;
2597
2598         list_for_each_entry_safe(old, tmp, &new->head, list) {
2599                 list_del(&old->list);
2600                 kfree(old);
2601         }
2602         kfree(new);
2603 }
2604
2605 static void relink_file_extents(struct new_sa_defrag_extent *new)
2606 {
2607         struct btrfs_path *path;
2608         struct sa_defrag_extent_backref *backref;
2609         struct sa_defrag_extent_backref *prev = NULL;
2610         struct inode *inode;
2611         struct btrfs_root *root;
2612         struct rb_node *node;
2613         int ret;
2614
2615         inode = new->inode;
2616         root = BTRFS_I(inode)->root;
2617
2618         path = btrfs_alloc_path();
2619         if (!path)
2620                 return;
2621
2622         if (!record_extent_backrefs(path, new)) {
2623                 btrfs_free_path(path);
2624                 goto out;
2625         }
2626         btrfs_release_path(path);
2627
2628         while (1) {
2629                 node = rb_first(&new->root);
2630                 if (!node)
2631                         break;
2632                 rb_erase(node, &new->root);
2633
2634                 backref = rb_entry(node, struct sa_defrag_extent_backref, node);
2635
2636                 ret = relink_extent_backref(path, prev, backref);
2637                 WARN_ON(ret < 0);
2638
2639                 kfree(prev);
2640
2641                 if (ret == 1)
2642                         prev = backref;
2643                 else
2644                         prev = NULL;
2645                 cond_resched();
2646         }
2647         kfree(prev);
2648
2649         btrfs_free_path(path);
2650 out:
2651         free_sa_defrag_extent(new);
2652
2653         atomic_dec(&root->fs_info->defrag_running);
2654         wake_up(&root->fs_info->transaction_wait);
2655 }
2656
2657 static struct new_sa_defrag_extent *
2658 record_old_file_extents(struct inode *inode,
2659                         struct btrfs_ordered_extent *ordered)
2660 {
2661         struct btrfs_root *root = BTRFS_I(inode)->root;
2662         struct btrfs_path *path;
2663         struct btrfs_key key;
2664         struct old_sa_defrag_extent *old;
2665         struct new_sa_defrag_extent *new;
2666         int ret;
2667
2668         new = kmalloc(sizeof(*new), GFP_NOFS);
2669         if (!new)
2670                 return NULL;
2671
2672         new->inode = inode;
2673         new->file_pos = ordered->file_offset;
2674         new->len = ordered->len;
2675         new->bytenr = ordered->start;
2676         new->disk_len = ordered->disk_len;
2677         new->compress_type = ordered->compress_type;
2678         new->root = RB_ROOT;
2679         INIT_LIST_HEAD(&new->head);
2680
2681         path = btrfs_alloc_path();
2682         if (!path)
2683                 goto out_kfree;
2684
2685         key.objectid = btrfs_ino(inode);
2686         key.type = BTRFS_EXTENT_DATA_KEY;
2687         key.offset = new->file_pos;
2688
2689         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2690         if (ret < 0)
2691                 goto out_free_path;
2692         if (ret > 0 && path->slots[0] > 0)
2693                 path->slots[0]--;
2694
2695         /* find out all the old extents for the file range */
2696         while (1) {
2697                 struct btrfs_file_extent_item *extent;
2698                 struct extent_buffer *l;
2699                 int slot;
2700                 u64 num_bytes;
2701                 u64 offset;
2702                 u64 end;
2703                 u64 disk_bytenr;
2704                 u64 extent_offset;
2705
2706                 l = path->nodes[0];
2707                 slot = path->slots[0];
2708
2709                 if (slot >= btrfs_header_nritems(l)) {
2710                         ret = btrfs_next_leaf(root, path);
2711                         if (ret < 0)
2712                                 goto out_free_path;
2713                         else if (ret > 0)
2714                                 break;
2715                         continue;
2716                 }
2717
2718                 btrfs_item_key_to_cpu(l, &key, slot);
2719
2720                 if (key.objectid != btrfs_ino(inode))
2721                         break;
2722                 if (key.type != BTRFS_EXTENT_DATA_KEY)
2723                         break;
2724                 if (key.offset >= new->file_pos + new->len)
2725                         break;
2726
2727                 extent = btrfs_item_ptr(l, slot, struct btrfs_file_extent_item);
2728
2729                 num_bytes = btrfs_file_extent_num_bytes(l, extent);
2730                 if (key.offset + num_bytes < new->file_pos)
2731                         goto next;
2732
2733                 disk_bytenr = btrfs_file_extent_disk_bytenr(l, extent);
2734                 if (!disk_bytenr)
2735                         goto next;
2736
2737                 extent_offset = btrfs_file_extent_offset(l, extent);
2738
2739                 old = kmalloc(sizeof(*old), GFP_NOFS);
2740                 if (!old)
2741                         goto out_free_path;
2742
2743                 offset = max(new->file_pos, key.offset);
2744                 end = min(new->file_pos + new->len, key.offset + num_bytes);
2745
2746                 old->bytenr = disk_bytenr;
2747                 old->extent_offset = extent_offset;
2748                 old->offset = offset - key.offset;
2749                 old->len = end - offset;
2750                 old->new = new;
2751                 old->count = 0;
2752                 list_add_tail(&old->list, &new->head);
2753 next:
2754                 path->slots[0]++;
2755                 cond_resched();
2756         }
2757
2758         btrfs_free_path(path);
2759         atomic_inc(&root->fs_info->defrag_running);
2760
2761         return new;
2762
2763 out_free_path:
2764         btrfs_free_path(path);
2765 out_kfree:
2766         free_sa_defrag_extent(new);
2767         return NULL;
2768 }
2769
2770 static void btrfs_release_delalloc_bytes(struct btrfs_root *root,
2771                                          u64 start, u64 len)
2772 {
2773         struct btrfs_block_group_cache *cache;
2774
2775         cache = btrfs_lookup_block_group(root->fs_info, start);
2776         ASSERT(cache);
2777
2778         spin_lock(&cache->lock);
2779         cache->delalloc_bytes -= len;
2780         spin_unlock(&cache->lock);
2781
2782         btrfs_put_block_group(cache);
2783 }
2784
2785 /* As ordered data IO finishes, this gets called so we can finish
2786  * an ordered extent if the range of bytes in the file it covers are
2787  * fully written.
      *
      * Depending on the ordered extent's flags this updates the inode item
      * only (NOCOW), marks a preallocated range as written (PREALLOC) or
      * inserts the file extent item for the reserved extent, then adds the
      * pending checksums and updates i_size.  On every path the ordered
      * extent is removed from the inode and both references to it are
      * dropped.
      *
      * Returns 0 on success or a negative errno.
2788  */
2789 static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
2790 {
2791         struct inode *inode = ordered_extent->inode;
2792         struct btrfs_root *root = BTRFS_I(inode)->root;
2793         struct btrfs_trans_handle *trans = NULL;
2794         struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
2795         struct extent_state *cached_state = NULL;
2796         struct new_sa_defrag_extent *new = NULL;
2797         int compress_type = 0;
2798         int ret = 0;
2799         u64 logical_len = ordered_extent->len;
2800         bool nolock;
2801         bool truncated = false;
2802 
             /* free space inodes join the transaction through the _nolock variant */
2803         nolock = btrfs_is_free_space_inode(inode);
2804 
             /* the IO failed: skip all metadata updates and go straight to cleanup */
2805         if (test_bit(BTRFS_ORDERED_IOERR, &ordered_extent->flags)) {
2806                 ret = -EIO;
2807                 goto out;
2808         }
2809 
2810         btrfs_free_io_failure_record(inode, ordered_extent->file_offset,
2811                                      ordered_extent->file_offset +
2812                                      ordered_extent->len - 1);
2813 
2814         if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags)) {
2815                 truncated = true;
2816                 logical_len = ordered_extent->truncated_len;
2817                 /* Truncated the entire extent, don't bother adding */
2818                 if (!logical_len)
2819                         goto out;
2820         }
2821 
             /* NOCOW: only i_size and the inode item are updated in this branch */
2822         if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) {
2823                 BUG_ON(!list_empty(&ordered_extent->list)); /* Logic error */
2824                 btrfs_ordered_update_i_size(inode, 0, ordered_extent);
2825                 if (nolock)
2826                         trans = btrfs_join_transaction_nolock(root);
2827                 else
2828                         trans = btrfs_join_transaction(root);
2829                 if (IS_ERR(trans)) {
2830                         ret = PTR_ERR(trans);
                             /* keep 'trans' NULL so the cleanup does not end it */
2831                         trans = NULL;
2832                         goto out;
2833                 }
2834                 trans->block_rsv = &root->fs_info->delalloc_block_rsv;
2835                 ret = btrfs_update_inode_fallback(trans, root, inode);
2836                 if (ret) /* -ENOMEM or corruption */
2837                         btrfs_abort_transaction(trans, root, ret);
2838                 goto out;
2839         }
2840 
             /* the whole file range stays locked until out_unlock */
2841         lock_extent_bits(io_tree, ordered_extent->file_offset,
2842                          ordered_extent->file_offset + ordered_extent->len - 1,
2843                          0, &cached_state);
2844 
2845         ret = test_range_bit(io_tree, ordered_extent->file_offset,
2846                         ordered_extent->file_offset + ordered_extent->len - 1,
2847                         EXTENT_DEFRAG, 1, cached_state);
2848         if (ret) {
2849                 u64 last_snapshot = btrfs_root_last_snapshot(&root->root_item);
                     /*
                      * The constant 0 makes this condition always false, so
                      * record_old_file_extents() is never called here and 'new'
                      * stays NULL: snapshot-aware defrag is effectively
                      * disabled in this function.
                      */
2850                 if (0 && last_snapshot >= BTRFS_I(inode)->generation)
2851                         /* the inode is shared */
2852                         new = record_old_file_extents(inode, ordered_extent);
2853 
2854                 clear_extent_bit(io_tree, ordered_extent->file_offset,
2855                         ordered_extent->file_offset + ordered_extent->len - 1,
2856                         EXTENT_DEFRAG, 0, 0, &cached_state, GFP_NOFS);
2857         }
2858 
2859         if (nolock)
2860                 trans = btrfs_join_transaction_nolock(root);
2861         else
2862                 trans = btrfs_join_transaction(root);
2863         if (IS_ERR(trans)) {
2864                 ret = PTR_ERR(trans);
2865                 trans = NULL;
2866                 goto out_unlock;
2867         }
2868 
2869         trans->block_rsv = &root->fs_info->delalloc_block_rsv;
2870 
2871         if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags))
2872                 compress_type = ordered_extent->compress_type;
2873         if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
                     /* preallocated range: mark the written part as written */
2874                 BUG_ON(compress_type);
2875                 ret = btrfs_mark_extent_written(trans, inode,
2876                                                 ordered_extent->file_offset,
2877                                                 ordered_extent->file_offset +
2878                                                 logical_len);
2879         } else {
                     /* regular write: insert the file extent item for the reserved extent */
2880                 BUG_ON(root == root->fs_info->tree_root);
2881                 ret = insert_reserved_file_extent(trans, inode,
2882                                                 ordered_extent->file_offset,
2883                                                 ordered_extent->start,
2884                                                 ordered_extent->disk_len,
2885                                                 logical_len, logical_len,
2886                                                 compress_type, 0, 0,
2887                                                 BTRFS_FILE_EXTENT_REG);
2888                 if (!ret)
2889                         btrfs_release_delalloc_bytes(root,
2890                                                      ordered_extent->start,
2891                                                      ordered_extent->disk_len);
2892         }
             /* the extent map is unpinned before 'ret' is checked, i.e. on failure too */
2893         unpin_extent_cache(&BTRFS_I(inode)->extent_tree,
2894                            ordered_extent->file_offset, ordered_extent->len,
2895                            trans->transid);
2896         if (ret < 0) {
2897                 btrfs_abort_transaction(trans, root, ret);
2898                 goto out_unlock;
2899         }
2900 
2901         add_pending_csums(trans, inode, ordered_extent->file_offset,
2902                           &ordered_extent->list);
2903 
2904         btrfs_ordered_update_i_size(inode, 0, ordered_extent);
2905         ret = btrfs_update_inode_fallback(trans, root, inode);
2906         if (ret) { /* -ENOMEM or corruption */
2907                 btrfs_abort_transaction(trans, root, ret);
2908                 goto out_unlock;
2909         }
2910         ret = 0;
2911 out_unlock:
2912         unlock_extent_cached(io_tree, ordered_extent->file_offset,
2913                              ordered_extent->file_offset +
2914                              ordered_extent->len - 1, &cached_state, GFP_NOFS);
2915 out:
             /* the metadata reservation is released for every root except the tree root */
2916         if (root != root->fs_info->tree_root)
2917                 btrfs_delalloc_release_metadata(inode, ordered_extent->len);
2918         if (trans)
2919                 btrfs_end_transaction(trans, root);
2920 
2921         if (ret || truncated) {
2922                 u64 start, end;
2923 
                     /* on truncation only the tail past logical_len is invalidated */
2924                 if (truncated)
2925                         start = ordered_extent->file_offset + logical_len;
2926                 else
2927                         start = ordered_extent->file_offset;
2928                 end = ordered_extent->file_offset + ordered_extent->len - 1;
2929                 clear_extent_uptodate(io_tree, start, end, NULL, GFP_NOFS);
2930 
2931                 /* Drop the cache for the part of the extent we didn't write. */
2932                 btrfs_drop_extent_cache(inode, start, end, 0);
2933 
2934                 /*
2935                  * If the ordered extent had an IOERR or something else went
2936                  * wrong we need to return the space for this ordered extent
2937                  * back to the allocator.  We only free the extent in the
2938                  * truncated case if we didn't write out the extent at all.
2939                  */
2940                 if ((ret || !logical_len) &&
2941                     !test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) &&
2942                     !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags))
2943                         btrfs_free_reserved_extent(root, ordered_extent->start,
2944                                                    ordered_extent->disk_len, 1);
2945         }
2946 
2947 
2948         /*
2949          * This needs to be done to make sure anybody waiting knows we are done
2950          * updating everything for this ordered extent.
2951          */
2952         btrfs_remove_ordered_extent(inode, ordered_extent);
2953 
2954         /* for snapshot-aware defrag */
2955         if (new) {
2956                 if (ret) {
                             /* undo what record_old_file_extents() set up */
2957                         free_sa_defrag_extent(new);
2958                         atomic_dec(&root->fs_info->defrag_running);
2959                 } else {
2960                         relink_file_extents(new);
2961                 }
2962         }
2963 
2964         /* once for us */
2965         btrfs_put_ordered_extent(ordered_extent);
2966         /* once for the tree */
2967         btrfs_put_ordered_extent(ordered_extent);
2968 
2969         return ret;
2970 }
2971
2972 static void finish_ordered_fn(struct btrfs_work *work)
2973 {
2974         struct btrfs_ordered_extent *ordered_extent;
2975         ordered_extent = container_of(work, struct btrfs_ordered_extent, work);
2976         btrfs_finish_ordered_io(ordered_extent);
2977 }
2978
2979 static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
2980                                 struct extent_state *state, int uptodate)
2981 {
2982         struct inode *inode = page->mapping->host;
2983         struct btrfs_root *root = BTRFS_I(inode)->root;
2984         struct btrfs_ordered_extent *ordered_extent = NULL;
2985         struct btrfs_workqueue *wq;
2986         btrfs_work_func_t func;
2987
2988         trace_btrfs_writepage_end_io_hook(page, start, end, uptodate);
2989
2990         ClearPagePrivate2(page);
2991         if (!btrfs_dec_test_ordered_pending(inode, &ordered_extent, start,
2992                                             end - start + 1, uptodate))
2993                 return 0;
2994
2995         if (btrfs_is_free_space_inode(inode)) {
2996                 wq = root->fs_info->endio_freespace_worker;
2997                 func = btrfs_freespace_write_helper;
2998         } else {
2999                 wq = root->fs_info->endio_write_workers;
3000                 func = btrfs_endio_write_helper;
3001         }
3002
3003         btrfs_init_work(&ordered_extent->work, func, finish_ordered_fn, NULL,
3004                         NULL);
3005         btrfs_queue_work(wq, &ordered_extent->work);
3006
3007         return 0;
3008 }
3009
3010 static int __readpage_endio_check(struct inode *inode,
3011                                   struct btrfs_io_bio *io_bio,
3012                                   int icsum, struct page *page,
3013                                   int pgoff, u64 start, size_t len)
3014 {
3015         char *kaddr;
3016         u32 csum_expected;
3017         u32 csum = ~(u32)0;
3018         static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
3019                                       DEFAULT_RATELIMIT_BURST);
3020
3021         csum_expected = *(((u32 *)io_bio->csum) + icsum);
3022
3023         kaddr = kmap_atomic(page);
3024         csum = btrfs_csum_data(kaddr + pgoff, csum,  len);
3025         btrfs_csum_final(csum, (char *)&csum);
3026         if (csum != csum_expected)
3027                 goto zeroit;
3028
3029         kunmap_atomic(kaddr);
3030         return 0;
3031 zeroit:
3032         if (__ratelimit(&_rs))
3033                 btrfs_warn(BTRFS_I(inode)->root->fs_info,
3034                            "csum failed ino %llu off %llu csum %u expected csum %u",
3035                            btrfs_ino(inode), start, csum, csum_expected);
3036         memset(kaddr + pgoff, 1, len);
3037         flush_dcache_page(page);
3038         kunmap_atomic(kaddr);
3039         if (csum_expected == 0)
3040                 return 0;
3041         return -EIO;
3042 }
3043
3044 /*
3045  * when reads are done, we need to check csums to verify the data is correct
3046  * if there's a match, we allow the bio to finish.  If not, the code in
3047  * extent_io.c will try to find good copies for us.
3048  */
3049 static int btrfs_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
3050                                       u64 phy_offset, struct page *page,
3051                                       u64 start, u64 end, int mirror)
3052 {
3053         size_t offset = start - page_offset(page);
3054         struct inode *inode = page->mapping->host;
3055         struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
3056         struct btrfs_root *root = BTRFS_I(inode)->root;
3057
3058         if (PageChecked(page)) {
3059                 ClearPageChecked(page);
3060                 return 0;
3061         }
3062
3063         if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)
3064                 return 0;
3065
3066         if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID &&
3067             test_range_bit(io_tree, start, end, EXTENT_NODATASUM, 1, NULL)) {
3068                 clear_extent_bits(io_tree, start, end, EXTENT_NODATASUM,
3069                                   GFP_NOFS);
3070                 return 0;
3071         }
3072
3073         phy_offset >>= inode->i_sb->s_blocksize_bits;
3074         return __readpage_endio_check(inode, io_bio, phy_offset, page, offset,
3075                                       start, (size_t)(end - start + 1));
3076 }
3077
3078 struct delayed_iput {
3079         struct list_head list;
3080         struct inode *inode;
3081 };
3082
3083 /* JDM: If this is fs-wide, why can't we add a pointer to
3084  * btrfs_inode instead and avoid the allocation? */
3085 void btrfs_add_delayed_iput(struct inode *inode)
3086 {
3087         struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
3088         struct delayed_iput *delayed;
3089
3090         if (atomic_add_unless(&inode->i_count, -1, 1))
3091                 return;
3092
3093         delayed = kmalloc(sizeof(*delayed), GFP_NOFS | __GFP_NOFAIL);
3094         delayed->inode = inode;
3095
3096         spin_lock(&fs_info->delayed_iput_lock);
3097         list_add_tail(&delayed->list, &fs_info->delayed_iputs);
3098         spin_unlock(&fs_info->delayed_iput_lock);
3099 }
3100
3101 void btrfs_run_delayed_iputs(struct btrfs_root *root)
3102 {
3103         LIST_HEAD(list);
3104         struct btrfs_fs_info *fs_info = root->fs_info;
3105         struct delayed_iput *delayed;
3106         int empty;
3107
3108         spin_lock(&fs_info->delayed_iput_lock);
3109         empty = list_empty(&fs_info->delayed_iputs);
3110         spin_unlock(&fs_info->delayed_iput_lock);
3111         if (empty)
3112                 return;
3113
3114         down_read(&fs_info->delayed_iput_sem);
3115
3116         spin_lock(&fs_info->delayed_iput_lock);
3117         list_splice_init(&fs_info->delayed_iputs, &list);
3118         spin_unlock(&fs_info->delayed_iput_lock);
3119
3120         while (!list_empty(&list)) {
3121                 delayed = list_entry(list.next, struct delayed_iput, list);
3122                 list_del(&delayed->list);
3123                 iput(delayed->inode);
3124                 kfree(delayed);
3125         }
3126
3127         up_read(&root->fs_info->delayed_iput_sem);
3128 }
3129
3130 /*
3131  * This is called in transaction commit time. If there are no orphan
3132  * files in the subvolume, it removes orphan item and frees block_rsv
3133  * structure.
3134  */
3135 void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans,
3136                               struct btrfs_root *root)
3137 {
3138         struct btrfs_block_rsv *block_rsv;
3139         int ret;
3140
3141         if (atomic_read(&root->orphan_inodes) ||
3142             root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE)
3143                 return;
3144
3145         spin_lock(&root->orphan_lock);
3146         if (atomic_read(&root->orphan_inodes)) {
3147                 spin_unlock(&root->orphan_lock);
3148                 return;
3149         }
3150
3151         if (root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE) {
3152                 spin_unlock(&root->orphan_lock);
3153                 return;
3154         }
3155
3156         block_rsv = root->orphan_block_rsv;
3157         root->orphan_block_rsv = NULL;
3158         spin_unlock(&root->orphan_lock);
3159
3160         if (test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state) &&
3161             btrfs_root_refs(&root->root_item) > 0) {
3162                 ret = btrfs_del_orphan_item(trans, root->fs_info->tree_root,
3163                                             root->root_key.objectid);
3164                 if (ret)
3165                         btrfs_abort_transaction(trans, root, ret);
3166                 else
3167                         clear_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED,
3168                                   &root->state);
3169         }
3170
3171         if (block_rsv) {
3172                 WARN_ON(block_rsv->size > 0);
3173                 btrfs_free_block_rsv(root, block_rsv);
3174         }
3175 }
3176
3177 /*
3178  * This creates an orphan entry for the given inode in case something goes
3179  * wrong in the middle of an unlink/truncate.
3180  *
3181  * NOTE: caller of this function should reserve 5 units of metadata for
3182  *       this function.
3183  */
3184 int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
3185 {
3186         struct btrfs_root *root = BTRFS_I(inode)->root;
3187         struct btrfs_block_rsv *block_rsv = NULL;
3188         int reserve = 0;
3189         int insert = 0;
3190         int ret;
3191
3192         if (!root->orphan_block_rsv) {
3193                 block_rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP);
3194                 if (!block_rsv)
3195                         return -ENOMEM;
3196         }
3197
3198         spin_lock(&root->orphan_lock);
3199         if (!root->orphan_block_rsv) {
3200                 root->orphan_block_rsv = block_rsv;
3201         } else if (block_rsv) {
3202                 btrfs_free_block_rsv(root, block_rsv);
3203                 block_rsv = NULL;
3204         }
3205
3206         if (!test_and_set_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
3207                               &BTRFS_I(inode)->runtime_flags)) {
3208 #if 0
3209                 /*
3210                  * For proper ENOSPC handling, we should do orphan
3211                  * cleanup when mounting. But this introduces backward
3212                  * compatibility issue.
3213                  */
3214                 if (!xchg(&root->orphan_item_inserted, 1))
3215                         insert = 2;
3216                 else
3217                         insert = 1;
3218 #endif
3219                 insert = 1;
3220                 atomic_inc(&root->orphan_inodes);
3221         }
3222
3223         if (!test_and_set_bit(BTRFS_INODE_ORPHAN_META_RESERVED,
3224                               &BTRFS_I(inode)->runtime_flags))
3225                 reserve = 1;
3226         spin_unlock(&root->orphan_lock);
3227
3228         /* grab metadata reservation from transaction handle */
3229         if (reserve) {
3230                 ret = btrfs_orphan_reserve_metadata(trans, inode);
3231                 BUG_ON(ret); /* -ENOSPC in reservation; Logic error? JDM */
3232         }
3233
3234         /* insert an orphan item to track this unlinked/truncated file */
3235         if (insert >= 1) {
3236                 ret = btrfs_insert_orphan_item(trans, root, btrfs_ino(inode));
3237                 if (ret) {
3238                         atomic_dec(&root->orphan_inodes);
3239                         if (reserve) {
3240                                 clear_bit(BTRFS_INODE_ORPHAN_META_RESERVED,
3241                                           &BTRFS_I(inode)->runtime_flags);
3242                                 btrfs_orphan_release_metadata(inode);
3243                         }
3244                         if (ret != -EEXIST) {
3245                                 clear_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
3246                                           &BTRFS_I(inode)->runtime_flags);
3247                                 btrfs_abort_transaction(trans, root, ret);
3248                                 return ret;
3249                         }
3250                 }
3251                 ret = 0;
3252         }
3253
3254         /* insert an orphan item to track subvolume contains orphan files */
3255         if (insert >= 2) {
3256                 ret = btrfs_insert_orphan_item(trans, root->fs_info->tree_root,
3257                                                root->root_key.objectid);
3258                 if (ret && ret != -EEXIST) {
3259                         btrfs_abort_transaction(trans, root, ret);
3260                         return ret;
3261                 }
3262         }
3263         return 0;
3264 }
3265
3266 /*
3267  * We have done the truncate/delete so we can go ahead and remove the orphan
3268  * item for this particular inode.
3269  */
3270 static int btrfs_orphan_del(struct btrfs_trans_handle *trans,
3271                             struct inode *inode)
3272 {
3273         struct btrfs_root *root = BTRFS_I(inode)->root;
3274         int delete_item = 0;
3275         int release_rsv = 0;
3276         int ret = 0;
3277
3278         spin_lock(&root->orphan_lock);
3279         if (test_and_clear_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
3280                                &BTRFS_I(inode)->runtime_flags))
3281                 delete_item = 1;
3282
3283         if (test_and_clear_bit(BTRFS_INODE_ORPHAN_META_RESERVED,
3284                                &BTRFS_I(inode)->runtime_flags))
3285                 release_rsv = 1;
3286         spin_unlock(&root->orphan_lock);
3287
3288         if (delete_item) {
3289                 atomic_dec(&root->orphan_inodes);
3290                 if (trans)
3291                         ret = btrfs_del_orphan_item(trans, root,
3292                                                     btrfs_ino(inode));
3293         }
3294
3295         if (release_rsv)
3296                 btrfs_orphan_release_metadata(inode);
3297
3298         return ret;
3299 }
3300
3301 /*
3302  * this cleans up any orphans that may be left on the list from the last use
3303  * of this root.
3304  */
3305 int btrfs_orphan_cleanup(struct btrfs_root *root)
3306 {
3307         struct btrfs_path *path;
3308         struct extent_buffer *leaf;
3309         struct btrfs_key key, found_key;
3310         struct btrfs_trans_handle *trans;
3311         struct inode *inode;
3312         u64 last_objectid = 0;
3313         int ret = 0, nr_unlink = 0, nr_truncate = 0;
3314
3315         if (cmpxchg(&root->orphan_cleanup_state, 0, ORPHAN_CLEANUP_STARTED))
3316                 return 0;
3317
3318         path = btrfs_alloc_path();
3319         if (!path) {
3320                 ret = -ENOMEM;
3321                 goto out;
3322         }
3323         path->reada = -1;
3324
3325         key.objectid = BTRFS_ORPHAN_OBJECTID;
3326         key.type = BTRFS_ORPHAN_ITEM_KEY;
3327         key.offset = (u64)-1;
3328
3329         while (1) {
3330                 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3331                 if (ret < 0)
3332                         goto out;
3333
3334                 /*
3335                  * if ret == 0 means we found what we were searching for, which
3336                  * is weird, but possible, so only screw with path if we didn't
3337                  * find the key and see if we have stuff that matches
3338                  */
3339                 if (ret > 0) {
3340                         ret = 0;
3341                         if (path->slots[0] == 0)
3342                                 break;
3343                         path->slots[0]--;
3344                 }
3345
3346                 /* pull out the item */
3347                 leaf = path->nodes[0];
3348                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
3349
3350                 /* make sure the item matches what we want */
3351                 if (found_key.objectid != BTRFS_ORPHAN_OBJECTID)
3352                         break;
3353                 if (found_key.type != BTRFS_ORPHAN_ITEM_KEY)
3354                         break;
3355
3356                 /* release the path since we're done with it */
3357                 btrfs_release_path(path);
3358
3359                 /*
3360                  * this is where we are basically btrfs_lookup, without the
3361                  * crossing root thing.  we store the inode number in the
3362                  * offset of the orphan item.
3363                  */
3364
3365                 if (found_key.offset == last_objectid) {
3366                         btrfs_err(root->fs_info,
3367                                 "Error removing orphan entry, stopping orphan cleanup");
3368                         ret = -EINVAL;
3369                         goto out;
3370                 }
3371
3372                 last_objectid = found_key.offset;
3373
3374                 found_key.objectid = found_key.offset;
3375                 found_key.type = BTRFS_INODE_ITEM_KEY;
3376                 found_key.offset = 0;
3377                 inode = btrfs_iget(root->fs_info->sb, &found_key, root, NULL);
3378                 ret = PTR_ERR_OR_ZERO(inode);
3379                 if (ret && ret != -ESTALE)
3380                         goto out;
3381
3382                 if (ret == -ESTALE && root == root->fs_info->tree_root) {
3383                         struct btrfs_root *dead_root;
3384                         struct btrfs_fs_info *fs_info = root->fs_info;
3385                         int is_dead_root = 0;
3386
3387                         /*
3388                          * this is an orphan in the tree root. Currently these
3389                          * could come from 2 sources:
3390                          *  a) a snapshot deletion in progress
3391                          *  b) a free space cache inode
3392                          * We need to distinguish those two, as the snapshot
3393                          * orphan must not get deleted.
3394                          * find_dead_roots already ran before us, so if this
3395                          * is a snapshot deletion, we should find the root
3396                          * in the dead_roots list
3397                          */
3398                         spin_lock(&fs_info->trans_lock);
3399                         list_for_each_entry(dead_root, &fs_info->dead_roots,
3400                                             root_list) {
3401                                 if (dead_root->root_key.objectid ==
3402                                     found_key.objectid) {
3403                                         is_dead_root = 1;
3404                                         break;
3405                                 }
3406                         }
3407                         spin_unlock(&fs_info->trans_lock);
3408                         if (is_dead_root) {
3409                                 /* prevent this orphan from being found again */
3410                                 key.offset = found_key.objectid - 1;
3411                                 continue;
3412                         }
3413                 }
3414                 /*
3415                  * Inode is already gone but the orphan item is still there,
3416                  * kill the orphan item.
3417                  */
3418                 if (ret == -ESTALE) {
3419                         trans = btrfs_start_transaction(root, 1);
3420                         if (IS_ERR(trans)) {
3421                                 ret = PTR_ERR(trans);
3422                                 goto out;
3423                         }
3424                         btrfs_debug(root->fs_info, "auto deleting %Lu",
3425                                 found_key.objectid);
3426                         ret = btrfs_del_orphan_item(trans, root,
3427                                                     found_key.objectid);
3428                         btrfs_end_transaction(trans, root);
3429                         if (ret)
3430                                 goto out;
3431                         continue;
3432                 }
3433
3434                 /*
3435                  * add this inode to the orphan list so btrfs_orphan_del does
3436                  * the proper thing when we hit it
3437                  */
3438                 set_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
3439                         &BTRFS_I(inode)->runtime_flags);
3440                 atomic_inc(&root->orphan_inodes);
3441
3442                 /* if we have links, this was a truncate, lets do that */
3443                 if (inode->i_nlink) {
3444                         if (WARN_ON(!S_ISREG(inode->i_mode))) {
3445                                 iput(inode);
3446                                 continue;
3447                         }
3448                         nr_truncate++;
3449
3450                         /* 1 for the orphan item deletion. */
3451                         trans = btrfs_start_transaction(root, 1);
3452                         if (IS_ERR(trans)) {
3453                                 iput(inode);
3454                                 ret = PTR_ERR(trans);
3455                                 goto out;
3456                         }
3457                         ret = btrfs_orphan_add(trans, inode);
3458                         btrfs_end_transaction(trans, root);
3459                         if (ret) {
3460                                 iput(inode);
3461                                 goto out;
3462                         }
3463
3464                         ret = btrfs_truncate(inode);
3465                         if (ret)
3466                                 btrfs_orphan_del(NULL, inode);
3467                 } else {
3468                         nr_unlink++;
3469                 }
3470
3471                 /* this will do delete_inode and everything for us */
3472                 iput(inode);
3473                 if (ret)
3474                         goto out;
3475         }
3476         /* release the path since we're done with it */
3477         btrfs_release_path(path);
3478
3479         root->orphan_cleanup_state = ORPHAN_CLEANUP_DONE;
3480
3481         if (root->orphan_block_rsv)
3482                 btrfs_block_rsv_release(root, root->orphan_block_rsv,
3483                                         (u64)-1);
3484
3485         if (root->orphan_block_rsv ||
3486             test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state)) {
3487                 trans = btrfs_join_transaction(root);
3488                 if (!IS_ERR(trans))
3489                         btrfs_end_transaction(trans, root);
3490         }
3491
3492         if (nr_unlink)
3493                 btrfs_debug(root->fs_info, "unlinked %d orphans", nr_unlink);
3494         if (nr_truncate)
3495                 btrfs_debug(root->fs_info, "truncated %d orphans", nr_truncate);
3496
3497 out:
3498         if (ret)
3499                 btrfs_err(root->fs_info,
3500                         "could not do orphan cleanup %d", ret);
3501         btrfs_free_path(path);
3502         return ret;
3503 }
3504
3505 /*
3506  * very simple check to peek ahead in the leaf looking for xattrs.  If we
3507  * don't find any xattrs, we know there can't be any acls.
3508  *
3509  * slot is the slot the inode is in, objectid is the objectid of the inode
3510  */
3511 static noinline int acls_after_inode_item(struct extent_buffer *leaf,
3512                                           int slot, u64 objectid,
3513                                           int *first_xattr_slot)
3514 {
3515         u32 nritems = btrfs_header_nritems(leaf);
3516         struct btrfs_key found_key;
3517         static u64 xattr_access = 0;
3518         static u64 xattr_default = 0;
3519         int scanned = 0;
3520
3521         if (!xattr_access) {
3522                 xattr_access = btrfs_name_hash(POSIX_ACL_XATTR_ACCESS,
3523                                         strlen(POSIX_ACL_XATTR_ACCESS));
3524                 xattr_default = btrfs_name_hash(POSIX_ACL_XATTR_DEFAULT,
3525                                         strlen(POSIX_ACL_XATTR_DEFAULT));
3526         }
3527
3528         slot++;
3529         *first_xattr_slot = -1;
3530         while (slot < nritems) {
3531                 btrfs_item_key_to_cpu(leaf, &found_key, slot);
3532
3533                 /* we found a different objectid, there must not be acls */
3534                 if (found_key.objectid != objectid)
3535                         return 0;
3536
3537                 /* we found an xattr, assume we've got an acl */
3538                 if (found_key.type == BTRFS_XATTR_ITEM_KEY) {
3539                         if (*first_xattr_slot == -1)
3540                                 *first_xattr_slot = slot;
3541                         if (found_key.offset == xattr_access ||
3542                             found_key.offset == xattr_default)
3543                                 return 1;
3544                 }
3545
3546                 /*
3547                  * we found a key greater than an xattr key, there can't
3548                  * be any acls later on
3549                  */
3550                 if (found_key.type > BTRFS_XATTR_ITEM_KEY)
3551                         return 0;
3552
3553                 slot++;
3554                 scanned++;
3555
3556                 /*
3557                  * it goes inode, inode backrefs, xattrs, extents,
3558                  * so if there are a ton of hard links to an inode there can
3559                  * be a lot of backrefs.  Don't waste time searching too hard,
3560                  * this is just an optimization
3561                  */
3562                 if (scanned >= 8)
3563                         break;
3564         }
3565         /* we hit the end of the leaf before we found an xattr or
3566          * something larger than an xattr.  We have to assume the inode
3567          * has acls
3568          */
3569         if (*first_xattr_slot == -1)
3570                 *first_xattr_slot = slot;
3571         return 1;
3572 }
3573
3574 /*
3575  * read an inode from the btree into the in-memory inode
3576  */
3577 static void btrfs_read_locked_inode(struct inode *inode)
3578 {
3579         struct btrfs_path *path;
3580         struct extent_buffer *leaf;
3581         struct btrfs_inode_item *inode_item;
3582         struct btrfs_root *root = BTRFS_I(inode)->root;
3583         struct btrfs_key location;
3584         unsigned long ptr;
3585         int maybe_acls;
3586         u32 rdev;
3587         int ret;
3588         bool filled = false;
3589         int first_xattr_slot;
3590
3591         ret = btrfs_fill_inode(inode, &rdev);
3592         if (!ret)
3593                 filled = true;
3594
3595         path = btrfs_alloc_path();
3596         if (!path)
3597                 goto make_bad;
3598
3599         memcpy(&location, &BTRFS_I(inode)->location, sizeof(location));
3600
3601         ret = btrfs_lookup_inode(NULL, root, path, &location, 0);
3602         if (ret)
3603                 goto make_bad;
3604
3605         leaf = path->nodes[0];
3606
3607         if (filled)
3608                 goto cache_index;
3609
3610         inode_item = btrfs_item_ptr(leaf, path->slots[0],
3611                                     struct btrfs_inode_item);
3612         inode->i_mode = btrfs_inode_mode(leaf, inode_item);
3613         set_nlink(inode, btrfs_inode_nlink(leaf, inode_item));
3614         i_uid_write(inode, btrfs_inode_uid(leaf, inode_item));
3615         i_gid_write(inode, btrfs_inode_gid(leaf, inode_item));
3616         btrfs_i_size_write(inode, btrfs_inode_size(leaf, inode_item));
3617
3618         inode->i_atime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->atime);
3619         inode->i_atime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->atime);
3620
3621         inode->i_mtime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->mtime);
3622         inode->i_mtime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->mtime);
3623
3624         inode->i_ctime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->ctime);
3625         inode->i_ctime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->ctime);
3626
3627         BTRFS_I(inode)->i_otime.tv_sec =
3628                 btrfs_timespec_sec(leaf, &inode_item->otime);
3629         BTRFS_I(inode)->i_otime.tv_nsec =
3630                 btrfs_timespec_nsec(leaf, &inode_item->otime);
3631
3632         inode_set_bytes(inode, btrfs_inode_nbytes(leaf, inode_item));
3633         BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item);
3634         BTRFS_I(inode)->last_trans = btrfs_inode_transid(leaf, inode_item);
3635
3636         /*
3637          * If we were modified in the current generation and evicted from memory
3638          * and then re-read we need to do a full sync since we don't have any
3639          * idea about which extents were modified before we were evicted from
3640          * cache.
3641          */
3642         if (BTRFS_I(inode)->last_trans == root->fs_info->generation)
3643                 set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
3644                         &BTRFS_I(inode)->runtime_flags);
3645
3646         inode->i_version = btrfs_inode_sequence(leaf, inode_item);
3647         inode->i_generation = BTRFS_I(inode)->generation;
3648         inode->i_rdev = 0;
3649         rdev = btrfs_inode_rdev(leaf, inode_item);
3650
3651         BTRFS_I(inode)->index_cnt = (u64)-1;
3652         BTRFS_I(inode)->flags = btrfs_inode_flags(leaf, inode_item);
3653
3654 cache_index:
3655         path->slots[0]++;
3656         if (inode->i_nlink != 1 ||
3657             path->slots[0] >= btrfs_header_nritems(leaf))
3658                 goto cache_acl;
3659
3660         btrfs_item_key_to_cpu(leaf, &location, path->slots[0]);
3661         if (location.objectid != btrfs_ino(inode))
3662                 goto cache_acl;
3663
3664         ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
3665         if (location.type == BTRFS_INODE_REF_KEY) {
3666                 struct btrfs_inode_ref *ref;
3667
3668                 ref = (struct btrfs_inode_ref *)ptr;
3669                 BTRFS_I(inode)->dir_index = btrfs_inode_ref_index(leaf, ref);
3670         } else if (location.type == BTRFS_INODE_EXTREF_KEY) {
3671                 struct btrfs_inode_extref *extref;
3672
3673                 extref = (struct btrfs_inode_extref *)ptr;
3674                 BTRFS_I(inode)->dir_index = btrfs_inode_extref_index(leaf,
3675                                                                      extref);
3676         }
3677 cache_acl:
3678         /*
3679          * try to precache a NULL acl entry for files that don't have
3680          * any xattrs or acls
3681          */
3682         maybe_acls = acls_after_inode_item(leaf, path->slots[0],
3683                                            btrfs_ino(inode), &first_xattr_slot);
3684         if (first_xattr_slot != -1) {
3685                 path->slots[0] = first_xattr_slot;
3686                 ret = btrfs_load_inode_props(inode, path);
3687                 if (ret)
3688                         btrfs_err(root->fs_info,
3689                                   "error loading props for ino %llu (root %llu): %d",
3690                                   btrfs_ino(inode),
3691                                   root->root_key.objectid, ret);
3692         }
3693         btrfs_free_path(path);
3694
3695         if (!maybe_acls)
3696                 cache_no_acl(inode);
3697
3698         switch (inode->i_mode & S_IFMT) {
3699         case S_IFREG:
3700                 inode->i_mapping->a_ops = &btrfs_aops;
3701                 BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
3702                 inode->i_fop = &btrfs_file_operations;
3703                 inode->i_op = &btrfs_file_inode_operations;
3704                 break;
3705         case S_IFDIR:
3706                 inode->i_fop = &btrfs_dir_file_operations;
3707                 if (root == root->fs_info->tree_root)
3708                         inode->i_op = &btrfs_dir_ro_inode_operations;
3709                 else
3710                         inode->i_op = &btrfs_dir_inode_operations;
3711                 break;
3712         case S_IFLNK:
3713                 inode->i_op = &btrfs_symlink_inode_operations;
3714                 inode->i_mapping->a_ops = &btrfs_symlink_aops;
3715                 break;
3716         default:
3717                 inode->i_op = &btrfs_special_inode_operations;
3718                 init_special_inode(inode, inode->i_mode, rdev);
3719                 break;
3720         }
3721
3722         btrfs_update_iflags(inode);
3723         return;
3724
3725 make_bad:
3726         btrfs_free_path(path);
3727         make_bad_inode(inode);
3728 }
3729
3730 /*
3731  * given a leaf and an inode, copy the inode fields into the leaf
3732  */
3733 static void fill_inode_item(struct btrfs_trans_handle *trans,
3734                             struct extent_buffer *leaf,
3735                             struct btrfs_inode_item *item,
3736                             struct inode *inode)
3737 {
3738         struct btrfs_map_token token;
3739
3740         btrfs_init_map_token(&token);
3741
3742         btrfs_set_token_inode_uid(leaf, item, i_uid_read(inode), &token);
3743         btrfs_set_token_inode_gid(leaf, item, i_gid_read(inode), &token);
3744         btrfs_set_token_inode_size(leaf, item, BTRFS_I(inode)->disk_i_size,
3745                                    &token);
3746         btrfs_set_token_inode_mode(leaf, item, inode->i_mode, &token);
3747         btrfs_set_token_inode_nlink(leaf, item, inode->i_nlink, &token);
3748
3749         btrfs_set_token_timespec_sec(leaf, &item->atime,
3750                                      inode->i_atime.tv_sec, &token);
3751         btrfs_set_token_timespec_nsec(leaf, &item->atime,
3752                                       inode->i_atime.tv_nsec, &token);
3753
3754         btrfs_set_token_timespec_sec(leaf, &item->mtime,
3755                                      inode->i_mtime.tv_sec, &token);
3756         btrfs_set_token_timespec_nsec(leaf, &item->mtime,
3757                                       inode->i_mtime.tv_nsec, &token);
3758
3759         btrfs_set_token_timespec_sec(leaf, &item->ctime,
3760                                      inode->i_ctime.tv_sec, &token);
3761         btrfs_set_token_timespec_nsec(leaf, &item->ctime,
3762                                       inode->i_ctime.tv_nsec, &token);
3763
3764         btrfs_set_token_timespec_sec(leaf, &item->otime,
3765                                      BTRFS_I(inode)->i_otime.tv_sec, &token);
3766         btrfs_set_token_timespec_nsec(leaf, &item->otime,
3767                                       BTRFS_I(inode)->i_otime.tv_nsec, &token);
3768
3769         btrfs_set_token_inode_nbytes(leaf, item, inode_get_bytes(inode),
3770                                      &token);
3771         btrfs_set_token_inode_generation(leaf, item, BTRFS_I(inode)->generation,
3772                                          &token);
3773         btrfs_set_token_inode_sequence(leaf, item, inode->i_version, &token);
3774         btrfs_set_token_inode_transid(leaf, item, trans->transid, &token);
3775         btrfs_set_token_inode_rdev(leaf, item, inode->i_rdev, &token);
3776         btrfs_set_token_inode_flags(leaf, item, BTRFS_I(inode)->flags, &token);
3777         btrfs_set_token_inode_block_group(leaf, item, 0, &token);
3778 }
3779
3780 /*
3781  * copy everything in the in-memory inode into the btree.
3782  */
3783 static noinline int btrfs_update_inode_item(struct btrfs_trans_handle *trans,
3784                                 struct btrfs_root *root, struct inode *inode)
3785 {
3786         struct btrfs_inode_item *inode_item;
3787         struct btrfs_path *path;
3788         struct extent_buffer *leaf;
3789         int ret;
3790
3791         path = btrfs_alloc_path();
3792         if (!path)
3793                 return -ENOMEM;
3794
3795         path->leave_spinning = 1;
3796         ret = btrfs_lookup_inode(trans, root, path, &BTRFS_I(inode)->location,
3797                                  1);
3798         if (ret) {
3799                 if (ret > 0)
3800                         ret = -ENOENT;
3801                 goto failed;
3802         }
3803
3804         leaf = path->nodes[0];
3805         inode_item = btrfs_item_ptr(leaf, path->slots[0],
3806                                     struct btrfs_inode_item);
3807
3808         fill_inode_item(trans, leaf, inode_item, inode);
3809         btrfs_mark_buffer_dirty(leaf);
3810         btrfs_set_inode_last_trans(trans, inode);
3811         ret = 0;
3812 failed:
3813         btrfs_free_path(path);
3814         return ret;
3815 }
3816
3817 /*
3818  * copy everything in the in-memory inode into the btree.
3819  */
3820 noinline int btrfs_update_inode(struct btrfs_trans_handle *trans,
3821                                 struct btrfs_root *root, struct inode *inode)
3822 {
3823         int ret;
3824
3825         /*
3826          * If the inode is a free space inode, we can deadlock during commit
3827          * if we put it into the delayed code.
3828          *
3829          * The data relocation inode should also be directly updated
3830          * without delay
3831          */
3832         if (!btrfs_is_free_space_inode(inode)
3833             && root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID
3834             && !root->fs_info->log_root_recovering) {
3835                 btrfs_update_root_times(trans, root);
3836
3837                 ret = btrfs_delayed_update_inode(trans, root, inode);
3838                 if (!ret)
3839                         btrfs_set_inode_last_trans(trans, inode);
3840                 return ret;
3841         }
3842
3843         return btrfs_update_inode_item(trans, root, inode);
3844 }
3845
3846 noinline int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans,
3847                                          struct btrfs_root *root,
3848                                          struct inode *inode)
3849 {
3850         int ret;
3851
3852         ret = btrfs_update_inode(trans, root, inode);
3853         if (ret == -ENOSPC)
3854                 return btrfs_update_inode_item(trans, root, inode);
3855         return ret;
3856 }
3857
3858 /*
3859  * unlink helper that gets used here in inode.c and in the tree logging
3860  * recovery code.  It remove a link in a directory with a given name, and
3861  * also drops the back refs in the inode to the directory
3862  */
3863 static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans,
3864                                 struct btrfs_root *root,
3865                                 struct inode *dir, struct inode *inode,
3866                                 const char *name, int name_len)
3867 {
3868         struct btrfs_path *path;
3869         int ret = 0;
3870         struct extent_buffer *leaf;
3871         struct btrfs_dir_item *di;
3872         struct btrfs_key key;
3873         u64 index;
3874         u64 ino = btrfs_ino(inode);
3875         u64 dir_ino = btrfs_ino(dir);
3876
3877         path = btrfs_alloc_path();
3878         if (!path) {
3879                 ret = -ENOMEM;
3880                 goto out;
3881         }
3882
3883         path->leave_spinning = 1;
3884         di = btrfs_lookup_dir_item(trans, root, path, dir_ino,
3885                                     name, name_len, -1);
3886         if (IS_ERR(di)) {
3887                 ret = PTR_ERR(di);
3888                 goto err;
3889         }
3890         if (!di) {
3891                 ret = -ENOENT;
3892                 goto err;
3893         }
3894         leaf = path->nodes[0];
3895         btrfs_dir_item_key_to_cpu(leaf, di, &key);
3896         ret = btrfs_delete_one_dir_name(trans, root, path, di);
3897         if (ret)
3898                 goto err;
3899         btrfs_release_path(path);
3900
3901         /*
3902          * If we don't have dir index, we have to get it by looking up
3903          * the inode ref, since we get the inode ref, remove it directly,
3904          * it is unnecessary to do delayed deletion.
3905          *
3906          * But if we have dir index, needn't search inode ref to get it.
3907          * Since the inode ref is close to the inode item, it is better
3908          * that we delay to delete it, and just do this deletion when
3909          * we update the inode item.
3910          */
3911         if (BTRFS_I(inode)->dir_index) {
3912                 ret = btrfs_delayed_delete_inode_ref(inode);
3913                 if (!ret) {
3914                         index = BTRFS_I(inode)->dir_index;
3915                         goto skip_backref;
3916                 }
3917         }
3918
3919         ret = btrfs_del_inode_ref(trans, root, name, name_len, ino,
3920                                   dir_ino, &index);
3921         if (ret) {
3922                 btrfs_info(root->fs_info,
3923                         "failed to delete reference to %.*s, inode %llu parent %llu",
3924                         name_len, name, ino, dir_ino);
3925                 btrfs_abort_transaction(trans, root, ret);
3926                 goto err;
3927         }
3928 skip_backref:
3929         ret = btrfs_delete_delayed_dir_index(trans, root, dir, index);
3930         if (ret) {
3931                 btrfs_abort_transaction(trans, root, ret);
3932                 goto err;
3933         }
3934
3935         ret = btrfs_del_inode_ref_in_log(trans, root, name, name_len,
3936                                          inode, dir_ino);
3937         if (ret != 0 && ret != -ENOENT) {
3938                 btrfs_abort_transaction(trans, root, ret);
3939                 goto err;
3940         }
3941
3942         ret = btrfs_del_dir_entries_in_log(trans, root, name, name_len,
3943                                            dir, index);
3944         if (ret == -ENOENT)
3945                 ret = 0;
3946         else if (ret)
3947                 btrfs_abort_transaction(trans, root, ret);
3948 err:
3949         btrfs_free_path(path);
3950         if (ret)
3951                 goto out;
3952
3953         btrfs_i_size_write(dir, dir->i_size - name_len * 2);
3954         inode_inc_iversion(inode);
3955         inode_inc_iversion(dir);
3956         inode->i_ctime = dir->i_mtime = dir->i_ctime = CURRENT_TIME;
3957         ret = btrfs_update_inode(trans, root, dir);
3958 out:
3959         return ret;
3960 }
3961
/*
 * Remove the link @name from @dir via __btrfs_unlink_inode() and, if that
 * succeeds, drop one link from @inode and write @inode back.
 *
 * Returns 0 on success or the first error encountered.
 */
int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
                       struct btrfs_root *root,
                       struct inode *dir, struct inode *inode,
                       const char *name, int name_len)
{
        int ret = __btrfs_unlink_inode(trans, root, dir, inode, name,
                                       name_len);

        if (ret)
                return ret;

        drop_nlink(inode);
        return btrfs_update_inode(trans, root, inode);
}
3975
3976 /*
3977  * helper to start transaction for unlink and rmdir.
3978  *
3979  * unlink and rmdir are special in btrfs, they do not always free space, so
3980  * if we cannot make our reservations the normal way try and see if there is
3981  * plenty of slack room in the global reserve to migrate, otherwise we cannot
3982  * allow the unlink to occur.
3983  */
3984 static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir)
3985 {
3986         struct btrfs_trans_handle *trans;
3987         struct btrfs_root *root = BTRFS_I(dir)->root;
3988         int ret;
3989
3990         /*
3991          * 1 for the possible orphan item
3992          * 1 for the dir item
3993          * 1 for the dir index
3994          * 1 for the inode ref
3995          * 1 for the inode
3996          */
3997         trans = btrfs_start_transaction(root, 5);
3998         if (!IS_ERR(trans) || PTR_ERR(trans) != -ENOSPC)
3999                 return trans;
4000
4001         if (PTR_ERR(trans) == -ENOSPC) {
4002                 u64 num_bytes = btrfs_calc_trans_metadata_size(root, 5);
4003
4004                 trans = btrfs_start_transaction(root, 0);
4005                 if (IS_ERR(trans))
4006                         return trans;
4007                 ret = btrfs_cond_migrate_bytes(root->fs_info,
4008                                                &root->fs_info->trans_block_rsv,
4009                                                num_bytes, 5);
4010                 if (ret) {
4011                         btrfs_end_transaction(trans, root);
4012                         return ERR_PTR(ret);
4013                 }
4014                 trans->block_rsv = &root->fs_info->trans_block_rsv;
4015                 trans->bytes_reserved = num_bytes;
4016         }
4017         return trans;
4018 }
4019
4020 static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
4021 {
4022         struct btrfs_root *root = BTRFS_I(dir)->root;
4023         struct btrfs_trans_handle *trans;
4024         struct inode *inode = dentry->d_inode;
4025         int ret;
4026
4027         trans = __unlink_start_trans(dir);
4028         if (IS_ERR(trans))
4029                 return PTR_ERR(trans);
4030
4031         btrfs_record_unlink_dir(trans, dir, dentry->d_inode, 0);
4032
4033         ret = btrfs_unlink_inode(trans, root, dir, dentry->d_inode,
4034                                  dentry->d_name.name, dentry->d_name.len);
4035         if (ret)
4036                 goto out;
4037
4038         if (inode->i_nlink == 0) {
4039                 ret = btrfs_orphan_add(trans, inode);
4040                 if (ret)
4041                         goto out;
4042         }
4043
4044 out:
4045         btrfs_end_transaction(trans, root);
4046         btrfs_btree_balance_dirty(root);
4047         return ret;
4048 }
4049
4050 int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
4051                         struct btrfs_root *root,
4052                         struct inode *dir, u64 objectid,
4053                         const char *name, int name_len)
4054 {
4055         struct btrfs_path *path;
4056         struct extent_buffer *leaf;
4057         struct btrfs_dir_item *di;
4058         struct btrfs_key key;
4059         u64 index;
4060         int ret;
4061         u64 dir_ino = btrfs_ino(dir);
4062
4063         path = btrfs_alloc_path();
4064         if (!path)
4065                 return -ENOMEM;
4066
4067         di = btrfs_lookup_dir_item(trans, root, path, dir_ino,
4068                                    name, name_len, -1);
4069         if (IS_ERR_OR_NULL(di)) {
4070                 if (!di)
4071                         ret = -ENOENT;
4072                 else
4073                         ret = PTR_ERR(di);
4074                 goto out;
4075         }
4076
4077         leaf = path->nodes[0];
4078         btrfs_dir_item_key_to_cpu(leaf, di, &key);
4079         WARN_ON(key.type != BTRFS_ROOT_ITEM_KEY || key.objectid != objectid);
4080         ret = btrfs_delete_one_dir_name(trans, root, path, di);
4081         if (ret) {
4082                 btrfs_abort_transaction(trans, root, ret);
4083                 goto out;
4084         }
4085         btrfs_release_path(path);
4086
4087         ret = btrfs_del_root_ref(trans, root->fs_info->tree_root,
4088                                  objectid, root->root_key.objectid,
4089                                  dir_ino, &index, name, name_len);
4090         if (ret < 0) {
4091                 if (ret != -ENOENT) {
4092                         btrfs_abort_transaction(trans, root, ret);
4093                         goto out;
4094                 }
4095                 di = btrfs_search_dir_index_item(root, path, dir_ino,
4096                                                  name, name_len);
4097                 if (IS_ERR_OR_NULL(di)) {
4098                         if (!di)
4099                                 ret = -ENOENT;
4100                         else
4101                                 ret = PTR_ERR(di);
4102                         btrfs_abort_transaction(trans, root, ret);
4103                         goto out;
4104                 }
4105
4106                 leaf = path->nodes[0];
4107                 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
4108                 btrfs_release_path(path);
4109                 index = key.offset;
4110         }
4111         btrfs_release_path(path);
4112
4113         ret = btrfs_delete_delayed_dir_index(trans, root, dir, index);
4114         if (ret) {
4115                 btrfs_abort_transaction(trans, root, ret);
4116                 goto out;
4117         }
4118
4119         btrfs_i_size_write(dir, dir->i_size - name_len * 2);
4120         inode_inc_iversion(dir);
4121         dir->i_mtime = dir->i_ctime = CURRENT_TIME;
4122         ret = btrfs_update_inode_fallback(trans, root, dir);
4123         if (ret)
4124                 btrfs_abort_transaction(trans, root, ret);
4125 out:
4126         btrfs_free_path(path);
4127         return ret;
4128 }
4129
4130 static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
4131 {
4132         struct inode *inode = dentry->d_inode;
4133         int err = 0;
4134         struct btrfs_root *root = BTRFS_I(dir)->root;
4135         struct btrfs_trans_handle *trans;
4136
4137         if (inode->i_size > BTRFS_EMPTY_DIR_SIZE)
4138                 return -ENOTEMPTY;
4139         if (btrfs_ino(inode) == BTRFS_FIRST_FREE_OBJECTID)
4140                 return -EPERM;
4141
4142         trans = __unlink_start_trans(dir);
4143         if (IS_ERR(trans))
4144                 return PTR_ERR(trans);
4145
4146         if (unlikely(btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
4147                 err = btrfs_unlink_subvol(trans, root, dir,
4148                                           BTRFS_I(inode)->location.objectid,
4149                                           dentry->d_name.name,
4150                                           dentry->d_name.len);
4151                 goto out;
4152         }
4153
4154         err = btrfs_orphan_add(trans, inode);
4155         if (err)
4156                 goto out;
4157
4158         /* now the directory is empty */
4159         err = btrfs_unlink_inode(trans, root, dir, dentry->d_inode,
4160                                  dentry->d_name.name, dentry->d_name.len);
4161         if (!err)
4162                 btrfs_i_size_write(inode, 0);
4163 out:
4164         btrfs_end_transaction(trans, root);
4165         btrfs_btree_balance_dirty(root);
4166
4167         return err;
4168 }
4169
4170 static int truncate_space_check(struct btrfs_trans_handle *trans,
4171                                 struct btrfs_root *root,
4172                                 u64 bytes_deleted)
4173 {
4174         int ret;
4175
4176         bytes_deleted = btrfs_csum_bytes_to_leaves(root, bytes_deleted);
4177         ret = btrfs_block_rsv_add(root, &root->fs_info->trans_block_rsv,
4178                                   bytes_deleted, BTRFS_RESERVE_NO_FLUSH);
4179         if (!ret)
4180                 trans->bytes_reserved += bytes_deleted;
4181         return ret;
4182
4183 }
4184
4185 /*
4186  * this can truncate away extent items, csum items and directory items.
4187  * It starts at a high offset and removes keys until it can't find
4188  * any higher than new_size
4189  *
4190  * csum items that cross the new i_size are truncated to the new size
4191  * as well.
4192  *
4193  * min_type is the minimum key type to truncate down to.  If set to 0, this
4194  * will kill all the items on this inode, including the INODE_ITEM_KEY.
4195  */
4196 int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
4197                                struct btrfs_root *root,
4198                                struct inode *inode,
4199                                u64 new_size, u32 min_type)
4200 {
4201         struct btrfs_path *path;
4202         struct extent_buffer *leaf;
4203         struct btrfs_file_extent_item *fi;
4204         struct btrfs_key key;
4205         struct btrfs_key found_key;
4206         u64 extent_start = 0;
4207         u64 extent_num_bytes = 0;
4208         u64 extent_offset = 0;
4209         u64 item_end = 0;
4210         u64 last_size = (u64)-1;
4211         u32 found_type = (u8)-1;
4212         int found_extent;
4213         int del_item;
4214         int pending_del_nr = 0;
4215         int pending_del_slot = 0;
4216         int extent_type = -1;
4217         int ret;
4218         int err = 0;
4219         u64 ino = btrfs_ino(inode);
4220         u64 bytes_deleted = 0;
4221         bool be_nice = 0;
4222         bool should_throttle = 0;
4223         bool should_end = 0;
4224
4225         BUG_ON(new_size > 0 && min_type != BTRFS_EXTENT_DATA_KEY);
4226
4227         /*
4228          * for non-free space inodes and ref cows, we want to back off from
4229          * time to time
4230          */
4231         if (!btrfs_is_free_space_inode(inode) &&
4232             test_bit(BTRFS_ROOT_REF_COWS, &root->state))
4233                 be_nice = 1;
4234
4235         path = btrfs_alloc_path();
4236         if (!path)
4237                 return -ENOMEM;
4238         path->reada = -1;
4239
4240         /*
4241          * We want to drop from the next block forward in case this new size is
4242          * not block aligned since we will be keeping the last block of the
4243          * extent just the way it is.
4244          */
4245         if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) ||
4246             root == root->fs_info->tree_root)
4247                 btrfs_drop_extent_cache(inode, ALIGN(new_size,
4248                                         root->sectorsize), (u64)-1, 0);
4249
        /*
         * This function is also used to drop the items in the log tree before
         * we relog the inode, so if root != BTRFS_I(inode)->root, it means
         * it is being used to drop the logged items.  In that case we must
         * not kill the delayed items.
         */
4256         if (min_type == 0 && root == BTRFS_I(inode)->root)
4257                 btrfs_kill_delayed_inode_items(inode);
4258
4259         key.objectid = ino;
4260         key.offset = (u64)-1;
4261         key.type = (u8)-1;
4262
4263 search_again:
4264         /*
4265          * with a 16K leaf size and 128MB extents, you can actually queue
4266          * up a huge file in a single leaf.  Most of the time that
4267          * bytes_deleted is > 0, it will be huge by the time we get here
4268          */
4269         if (be_nice && bytes_deleted > 32 * 1024 * 1024) {
4270                 if (btrfs_should_end_transaction(trans, root)) {
4271                         err = -EAGAIN;
4272                         goto error;
4273                 }
4274         }
4275
4276
4277         path->leave_spinning = 1;
4278         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
4279         if (ret < 0) {
4280                 err = ret;
4281                 goto out;
4282         }
4283
4284         if (ret > 0) {
4285                 /* there are no items in the tree for us to truncate, we're
4286                  * done
4287                  */
4288                 if (path->slots[0] == 0)
4289                         goto out;
4290                 path->slots[0]--;
4291         }
4292
4293         while (1) {
4294                 fi = NULL;
4295                 leaf = path->nodes[0];
4296                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
4297                 found_type = found_key.type;
4298
4299                 if (found_key.objectid != ino)
4300                         break;
4301
4302                 if (found_type < min_type)
4303                         break;
4304
4305                 item_end = found_key.offset;
4306                 if (found_type == BTRFS_EXTENT_DATA_KEY) {
4307                         fi = btrfs_item_ptr(leaf, path->slots[0],
4308                                             struct btrfs_file_extent_item);
4309                         extent_type = btrfs_file_extent_type(leaf, fi);
4310                         if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
4311                                 item_end +=
4312                                     btrfs_file_extent_num_bytes(leaf, fi);
4313                         } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
4314                                 item_end += btrfs_file_extent_inline_len(leaf,
4315                                                          path->slots[0], fi);
4316                         }
4317                         item_end--;
4318                 }
4319                 if (found_type > min_type) {
4320                         del_item = 1;
4321                 } else {
4322                         if (item_end < new_size)
4323                                 break;
4324                         if (found_key.offset >= new_size)
4325                                 del_item = 1;
4326                         else
4327                                 del_item = 0;
4328                 }
4329                 found_extent = 0;
4330                 /* FIXME, shrink the extent if the ref count is only 1 */
4331                 if (found_type != BTRFS_EXTENT_DATA_KEY)
4332                         goto delete;
4333
4334                 if (del_item)
4335                         last_size = found_key.offset;
4336                 else
4337                         last_size = new_size;
4338
4339                 if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
4340                         u64 num_dec;
4341                         extent_start = btrfs_file_extent_disk_bytenr(leaf, fi);
4342                         if (!del_item) {
4343                                 u64 orig_num_bytes =
4344                                         btrfs_file_extent_num_bytes(leaf, fi);
4345                                 extent_num_bytes = ALIGN(new_size -
4346                                                 found_key.offset,
4347                                                 root->sectorsize);
4348                                 btrfs_set_file_extent_num_bytes(leaf, fi,
4349                                                          extent_num_bytes);
4350                                 num_dec = (orig_num_bytes -
4351                                            extent_num_bytes);
4352                                 if (test_bit(BTRFS_ROOT_REF_COWS,
4353                                              &root->state) &&
4354                                     extent_start != 0)
4355                                         inode_sub_bytes(inode, num_dec);
4356                                 btrfs_mark_buffer_dirty(leaf);
4357                         } else {
4358                                 extent_num_bytes =
4359                                         btrfs_file_extent_disk_num_bytes(leaf,
4360                                                                          fi);
4361                                 extent_offset = found_key.offset -
4362                                         btrfs_file_extent_offset(leaf, fi);
4363
4364                                 /* FIXME blocksize != 4096 */
4365                                 num_dec = btrfs_file_extent_num_bytes(leaf, fi);
4366                                 if (extent_start != 0) {
4367                                         found_extent = 1;
4368                                         if (test_bit(BTRFS_ROOT_REF_COWS,
4369                                                      &root->state))
4370                                                 inode_sub_bytes(inode, num_dec);
4371                                 }
4372                         }
4373                 } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
4374                         /*
4375                          * we can't truncate inline items that have had
4376                          * special encodings
4377                          */
4378                         if (!del_item &&
4379                             btrfs_file_extent_compression(leaf, fi) == 0 &&
4380                             btrfs_file_extent_encryption(leaf, fi) == 0 &&
4381                             btrfs_file_extent_other_encoding(leaf, fi) == 0) {
4382                                 u32 size = new_size - found_key.offset;
4383
4384                                 if (test_bit(BTRFS_ROOT_REF_COWS, &root->state))
4385                                         inode_sub_bytes(inode, item_end + 1 -
4386                                                         new_size);
4387
4388                                 /*
4389                                  * update the ram bytes to properly reflect
4390                                  * the new size of our item
4391                                  */
4392                                 btrfs_set_file_extent_ram_bytes(leaf, fi, size);
4393                                 size =
4394                                     btrfs_file_extent_calc_inline_size(size);
4395                                 btrfs_truncate_item(root, path, size, 1);
4396                         } else if (test_bit(BTRFS_ROOT_REF_COWS,
4397                                             &root->state)) {
4398                                 inode_sub_bytes(inode, item_end + 1 -
4399                                                 found_key.offset);
4400                         }
4401                 }
4402 delete:
4403                 if (del_item) {
4404                         if (!pending_del_nr) {
4405                                 /* no pending yet, add ourselves */
4406                                 pending_del_slot = path->slots[0];
4407                                 pending_del_nr = 1;
4408                         } else if (pending_del_nr &&
4409                                    path->slots[0] + 1 == pending_del_slot) {
4410                                 /* hop on the pending chunk */
4411                                 pending_del_nr++;
4412                                 pending_del_slot = path->slots[0];
4413                         } else {
4414                                 BUG();
4415                         }
4416                 } else {
4417                         break;
4418                 }
4419                 should_throttle = 0;
4420
4421                 if (found_extent &&
4422                     (test_bit(BTRFS_ROOT_REF_COWS, &root->state) ||
4423                      root == root->fs_info->tree_root)) {
4424                         btrfs_set_path_blocking(path);
4425                         bytes_deleted += extent_num_bytes;
4426                         ret = btrfs_free_extent(trans, root, extent_start,
4427                                                 extent_num_bytes, 0,
4428                                                 btrfs_header_owner(leaf),
4429                                                 ino, extent_offset, 0);
4430                         BUG_ON(ret);
4431                         if (btrfs_should_throttle_delayed_refs(trans, root))
4432                                 btrfs_async_run_delayed_refs(root,
4433                                         trans->delayed_ref_updates * 2, 0);
4434                         if (be_nice) {
4435                                 if (truncate_space_check(trans, root,
4436                                                          extent_num_bytes)) {
4437                                         should_end = 1;
4438                                 }
4439                                 if (btrfs_should_throttle_delayed_refs(trans,
4440                                                                        root)) {
4441                                         should_throttle = 1;
4442                                 }
4443                         }
4444                 }
4445
4446                 if (found_type == BTRFS_INODE_ITEM_KEY)
4447                         break;
4448
4449                 if (path->slots[0] == 0 ||
4450                     path->slots[0] != pending_del_slot ||
4451                     should_throttle || should_end) {
4452                         if (pending_del_nr) {
4453                                 ret = btrfs_del_items(trans, root, path,
4454                                                 pending_del_slot,
4455                                                 pending_del_nr);
4456                                 if (ret) {
4457                                         btrfs_abort_transaction(trans,
4458                                                                 root, ret);
4459                                         goto error;
4460                                 }
4461                                 pending_del_nr = 0;
4462                         }
4463                         btrfs_release_path(path);
4464                         if (should_throttle) {
4465                                 unsigned long updates = trans->delayed_ref_updates;
4466                                 if (updates) {
4467                                         trans->delayed_ref_updates = 0;
4468                                         ret = btrfs_run_delayed_refs(trans, root, updates * 2);
4469                                         if (ret && !err)
4470                                                 err = ret;
4471                                 }
4472                         }
4473                         /*
4474                          * if we failed to refill our space rsv, bail out
4475                          * and let the transaction restart
4476                          */
4477                         if (should_end) {
4478                                 err = -EAGAIN;
4479                                 goto error;
4480                         }
4481                         goto search_again;
4482                 } else {
4483                         path->slots[0]--;
4484                 }
4485         }
4486 out:
4487         if (pending_del_nr) {
4488                 ret = btrfs_del_items(trans, root, path, pending_del_slot,
4489                                       pending_del_nr);
4490                 if (ret)
4491                         btrfs_abort_transaction(trans, root, ret);
4492         }
4493 error:
4494         if (last_size != (u64)-1 &&
4495             root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
4496                 btrfs_ordered_update_i_size(inode, last_size, NULL);
4497
4498         btrfs_free_path(path);
4499
4500         if (be_nice && bytes_deleted > 32 * 1024 * 1024) {
4501                 unsigned long updates = trans->delayed_ref_updates;
4502                 if (updates) {
4503                         trans->delayed_ref_updates = 0;
4504                         ret = btrfs_run_delayed_refs(trans, root, updates * 2);
4505                         if (ret && !err)
4506                                 err = ret;
4507                 }
4508         }
4509         return err;
4510 }
4511
4512 /*
4513  * btrfs_truncate_page - read, zero a chunk and write a page
4514  * @inode - inode that we're zeroing
4515  * @from - the offset to start zeroing
4516  * @len - the length to zero, 0 to zero the entire range respective to the
4517  *      offset
4518  * @front - zero up to the offset instead of from the offset on
4519  *
4520  * This will find the page for the "from" offset and cow the page and zero the
4521  * part we want to zero.  This is used with truncate and hole punching.
4522  */
4523 int btrfs_truncate_page(struct inode *inode, loff_t from, loff_t len,
4524                         int front)
4525 {
4526         struct address_space *mapping = inode->i_mapping;
4527         struct btrfs_root *root = BTRFS_I(inode)->root;
4528         struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
4529         struct btrfs_ordered_extent *ordered;
4530         struct extent_state *cached_state = NULL;
4531         char *kaddr;
4532         u32 blocksize = root->sectorsize;
4533         pgoff_t index = from >> PAGE_CACHE_SHIFT;
4534         unsigned offset = from & (PAGE_CACHE_SIZE-1);
4535         struct page *page;
4536         gfp_t mask = btrfs_alloc_write_mask(mapping);
4537         int ret = 0;
4538         u64 page_start;
4539         u64 page_end;
4540
4541         if ((offset & (blocksize - 1)) == 0 &&
4542             (!len || ((len & (blocksize - 1)) == 0)))
4543                 goto out;
4544         ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
4545         if (ret)
4546                 goto out;
4547
4548 again:
4549         page = find_or_create_page(mapping, index, mask);
4550         if (!page) {
4551                 btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
4552                 ret = -ENOMEM;
4553                 goto out;
4554         }
4555
4556         page_start = page_offset(page);
4557         page_end = page_start + PAGE_CACHE_SIZE - 1;
4558
4559         if (!PageUptodate(page)) {
4560                 ret = btrfs_readpage(NULL, page);
4561                 lock_page(page);
4562                 if (page->mapping != mapping) {
4563                         unlock_page(page);
4564                         page_cache_release(page);
4565                         goto again;
4566                 }
4567                 if (!PageUptodate(page)) {
4568                         ret = -EIO;
4569                         goto out_unlock;
4570                 }
4571         }
4572         wait_on_page_writeback(page);
4573
4574         lock_extent_bits(io_tree, page_start, page_end, 0, &cached_state);
4575         set_page_extent_mapped(page);
4576
4577         ordered = btrfs_lookup_ordered_extent(inode, page_start);
4578         if (ordered) {
4579                 unlock_extent_cached(io_tree, page_start, page_end,
4580                                      &cached_state, GFP_NOFS);
4581                 unlock_page(page);
4582                 page_cache_release(page);
4583                 btrfs_start_ordered_extent(inode, ordered, 1);
4584                 btrfs_put_ordered_extent(ordered);
4585                 goto again;
4586         }
4587
4588         clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, page_end,
4589                           EXTENT_DIRTY | EXTENT_DELALLOC |
4590                           EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
4591                           0, 0, &cached_state, GFP_NOFS);
4592
4593         ret = btrfs_set_extent_delalloc(inode, page_start, page_end,
4594                                         &cached_state);
4595         if (ret) {
4596                 unlock_extent_cached(io_tree, page_start, page_end,
4597                                      &cached_state, GFP_NOFS);
4598                 goto out_unlock;
4599         }
4600
4601         if (offset != PAGE_CACHE_SIZE) {
4602                 if (!len)
4603                         len = PAGE_CACHE_SIZE - offset;
4604                 kaddr = kmap(page);
4605                 if (front)
4606                         memset(kaddr, 0, offset);
4607                 else
4608                         memset(kaddr + offset, 0, len);
4609                 flush_dcache_page(page);
4610                 kunmap(page);
4611         }
4612         ClearPageChecked(page);
4613         set_page_dirty(page);
4614         unlock_extent_cached(io_tree, page_start, page_end, &cached_state,
4615                              GFP_NOFS);
4616
4617 out_unlock:
4618         if (ret)
4619                 btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
4620         unlock_page(page);
4621         page_cache_release(page);
4622 out:
4623         return ret;
4624 }
4625
4626 static int maybe_insert_hole(struct btrfs_root *root, struct inode *inode,
4627                              u64 offset, u64 len)
4628 {
4629         struct btrfs_trans_handle *trans;
4630         int ret;
4631
4632         /*
4633          * Still need to make sure the inode looks like it's been updated so
4634          * that any holes get logged if we fsync.
4635          */
4636         if (btrfs_fs_incompat(root->fs_info, NO_HOLES)) {
4637                 BTRFS_I(inode)->last_trans = root->fs_info->generation;
4638                 BTRFS_I(inode)->last_sub_trans = root->log_transid;
4639                 BTRFS_I(inode)->last_log_commit = root->last_log_commit;
4640                 return 0;
4641         }
4642
4643         /*
4644          * 1 - for the one we're dropping
4645          * 1 - for the one we're adding
4646          * 1 - for updating the inode.
4647          */
4648         trans = btrfs_start_transaction(root, 3);
4649         if (IS_ERR(trans))
4650                 return PTR_ERR(trans);
4651
4652         ret = btrfs_drop_extents(trans, root, inode, offset, offset + len, 1);
4653         if (ret) {
4654                 btrfs_abort_transaction(trans, root, ret);
4655                 btrfs_end_transaction(trans, root);
4656                 return ret;
4657         }
4658
4659         ret = btrfs_insert_file_extent(trans, root, btrfs_ino(inode), offset,
4660                                        0, 0, len, 0, len, 0, 0, 0);
4661         if (ret)
4662                 btrfs_abort_transaction(trans, root, ret);
4663         else
4664                 btrfs_update_inode(trans, root, inode);
4665         btrfs_end_transaction(trans, root);
4666         return ret;
4667 }
4668
4669 /*
4670  * This function puts in dummy file extents for the area we're creating a hole
4671  * for.  So if we are truncating this file to a larger size we need to insert
4672  * these file extents so that btrfs_get_extent will return a EXTENT_MAP_HOLE for
4673  * the range between oldsize and size
4674  */
4675 int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
4676 {
4677         struct btrfs_root *root = BTRFS_I(inode)->root;
4678         struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
4679         struct extent_map *em = NULL;
4680         struct extent_state *cached_state = NULL;
4681         struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
4682         u64 hole_start = ALIGN(oldsize, root->sectorsize);
4683         u64 block_end = ALIGN(size, root->sectorsize);
4684         u64 last_byte;
4685         u64 cur_offset;
4686         u64 hole_size;
4687         int err = 0;
4688
4689         /*
4690          * If our size started in the middle of a page we need to zero out the
4691          * rest of the page before we expand the i_size, otherwise we could
4692          * expose stale data.
4693          */
4694         err = btrfs_truncate_page(inode, oldsize, 0, 0);
4695         if (err)
4696                 return err;
4697
4698         if (size <= hole_start)
4699                 return 0;
4700
4701         while (1) {
4702                 struct btrfs_ordered_extent *ordered;
4703
4704                 lock_extent_bits(io_tree, hole_start, block_end - 1, 0,
4705                                  &cached_state);
4706                 ordered = btrfs_lookup_ordered_range(inode, hole_start,
4707                                                      block_end - hole_start);
4708                 if (!ordered)
4709                         break;
4710                 unlock_extent_cached(io_tree, hole_start, block_end - 1,
4711                                      &cached_state, GFP_NOFS);
4712                 btrfs_start_ordered_extent(inode, ordered, 1);
4713                 btrfs_put_ordered_extent(ordered);
4714         }
4715
4716         cur_offset = hole_start;
4717         while (1) {
4718                 em = btrfs_get_extent(inode, NULL, 0, cur_offset,
4719                                 block_end - cur_offset, 0);
4720                 if (IS_ERR(em)) {
4721                         err = PTR_ERR(em);
4722                         em = NULL;
4723                         break;
4724                 }
4725                 last_byte = min(extent_map_end(em), block_end);
4726                 last_byte = ALIGN(last_byte , root->sectorsize);
4727                 if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
4728                         struct extent_map *hole_em;
4729                         hole_size = last_byte - cur_offset;
4730
4731                         err = maybe_insert_hole(root, inode, cur_offset,
4732                                                 hole_size);
4733                         if (err)
4734                                 break;
4735                         btrfs_drop_extent_cache(inode, cur_offset,
4736                                                 cur_offset + hole_size - 1, 0);
4737                         hole_em = alloc_extent_map();
4738                         if (!hole_em) {
4739                                 set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
4740                                         &BTRFS_I(inode)->runtime_flags);
4741                                 goto next;
4742                         }
4743                         hole_em->start = cur_offset;
4744                         hole_em->len = hole_size;
4745                         hole_em->orig_start = cur_offset;
4746
4747                         hole_em->block_start = EXTENT_MAP_HOLE;
4748                         hole_em->block_len = 0;
4749                         hole_em->orig_block_len = 0;
4750                         hole_em->ram_bytes = hole_size;
4751                         hole_em->bdev = root->fs_info->fs_devices->latest_bdev;
4752                         hole_em->compress_type = BTRFS_COMPRESS_NONE;
4753                         hole_em->generation = root->fs_info->generation;
4754
4755                         while (1) {
4756                                 write_lock(&em_tree->lock);
4757                                 err = add_extent_mapping(em_tree, hole_em, 1);
4758                                 write_unlock(&em_tree->lock);
4759                                 if (err != -EEXIST)
4760                                         break;
4761                                 btrfs_drop_extent_cache(inode, cur_offset,
4762                                                         cur_offset +
4763                                                         hole_size - 1, 0);
4764                         }
4765                         free_extent_map(hole_em);
4766                 }
4767 next:
4768                 free_extent_map(em);
4769                 em = NULL;
4770                 cur_offset = last_byte;
4771                 if (cur_offset >= block_end)
4772                         break;
4773         }
4774         free_extent_map(em);
4775         unlock_extent_cached(io_tree, hole_start, block_end - 1, &cached_state,
4776                              GFP_NOFS);
4777         return err;
4778 }
4779
4780 static int wait_snapshoting_atomic_t(atomic_t *a)
4781 {
4782         schedule();
4783         return 0;
4784 }
4785
4786 static void wait_for_snapshot_creation(struct btrfs_root *root)
4787 {
4788         while (true) {
4789                 int ret;
4790
4791                 ret = btrfs_start_write_no_snapshoting(root);
4792                 if (ret)
4793                         break;
4794                 wait_on_atomic_t(&root->will_be_snapshoted,
4795                                  wait_snapshoting_atomic_t,
4796                                  TASK_UNINTERRUPTIBLE);
4797         }
4798 }
4799
4800 static int btrfs_setsize(struct inode *inode, struct iattr *attr)
4801 {
4802         struct btrfs_root *root = BTRFS_I(inode)->root;
4803         struct btrfs_trans_handle *trans;
4804         loff_t oldsize = i_size_read(inode);
4805         loff_t newsize = attr->ia_size;
4806         int mask = attr->ia_valid;
4807         int ret;
4808
4809         /*
4810          * The regular truncate() case without ATTR_CTIME and ATTR_MTIME is a
4811          * special case where we need to update the times despite not having
4812          * these flags set.  For all other operations the VFS set these flags
4813          * explicitly if it wants a timestamp update.
4814          */
4815         if (newsize != oldsize) {
4816                 inode_inc_iversion(inode);
4817                 if (!(mask & (ATTR_CTIME | ATTR_MTIME)))
4818                         inode->i_ctime = inode->i_mtime =
4819                                 current_fs_time(inode->i_sb);
4820         }
4821
4822         if (newsize > oldsize) {
4823                 truncate_pagecache(inode, newsize);
4824                 /*
4825                  * Don't do an expanding truncate while snapshoting is ongoing.
4826                  * This is to ensure the snapshot captures a fully consistent
4827                  * state of this file - if the snapshot captures this expanding
4828                  * truncation, it must capture all writes that happened before
4829                  * this truncation.
4830                  */
4831                 wait_for_snapshot_creation(root);
4832                 ret = btrfs_cont_expand(inode, oldsize, newsize);
4833                 if (ret) {
4834                         btrfs_end_write_no_snapshoting(root);
4835                         return ret;
4836                 }
4837
4838                 trans = btrfs_start_transaction(root, 1);
4839                 if (IS_ERR(trans)) {
4840                         btrfs_end_write_no_snapshoting(root);
4841                         return PTR_ERR(trans);
4842                 }
4843
4844                 i_size_write(inode, newsize);
4845                 btrfs_ordered_update_i_size(inode, i_size_read(inode), NULL);
4846                 ret = btrfs_update_inode(trans, root, inode);
4847                 btrfs_end_write_no_snapshoting(root);
4848                 btrfs_end_transaction(trans, root);
4849         } else {
4850
4851                 /*
4852                  * We're truncating a file that used to have good data down to
4853                  * zero. Make sure it gets into the ordered flush list so that
4854                  * any new writes get down to disk quickly.
4855                  */
4856                 if (newsize == 0)
4857                         set_bit(BTRFS_INODE_ORDERED_DATA_CLOSE,
4858                                 &BTRFS_I(inode)->runtime_flags);
4859
4860                 /*
4861                  * 1 for the orphan item we're going to add
4862                  * 1 for the orphan item deletion.
4863                  */
4864                 trans = btrfs_start_transaction(root, 2);
4865                 if (IS_ERR(trans))
4866                         return PTR_ERR(trans);
4867
4868                 /*
4869                  * We need to do this in case we fail at _any_ point during the
4870                  * actual truncate.  Once we do the truncate_setsize we could
4871                  * invalidate pages which forces any outstanding ordered io to
4872                  * be instantly completed which will give us extents that need
4873                  * to be truncated.  If we fail to get an orphan inode down we
4874                  * could have left over extents that were never meant to live,
4875                  * so we need to garuntee from this point on that everything
4876                  * will be consistent.
4877                  */
4878                 ret = btrfs_orphan_add(trans, inode);
4879                 btrfs_end_transaction(trans, root);
4880                 if (ret)
4881                         return ret;
4882
4883                 /* we don't support swapfiles, so vmtruncate shouldn't fail */
4884                 truncate_setsize(inode, newsize);
4885
4886                 /* Disable nonlocked read DIO to avoid the end less truncate */
4887                 btrfs_inode_block_unlocked_dio(inode);
4888                 inode_dio_wait(inode);
4889                 btrfs_inode_resume_unlocked_dio(inode);
4890
4891                 ret = btrfs_truncate(inode);
4892                 if (ret && inode->i_nlink) {
4893                         int err;
4894
4895                         /*
4896                          * failed to truncate, disk_i_size is only adjusted down
4897                          * as we remove extents, so it should represent the true
4898                          * size of the inode, so reset the in memory size and
4899                          * delete our orphan entry.
4900                          */
4901                         trans = btrfs_join_transaction(root);
4902                         if (IS_ERR(trans)) {
4903                                 btrfs_orphan_del(NULL, inode);
4904                                 return ret;
4905                         }
4906                         i_size_write(inode, BTRFS_I(inode)->disk_i_size);
4907                         err = btrfs_orphan_del(trans, inode);
4908                         if (err)
4909                                 btrfs_abort_transaction(trans, root, err);
4910                         btrfs_end_transaction(trans, root);
4911                 }
4912         }
4913
4914         return ret;
4915 }
4916
4917 static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
4918 {
4919         struct inode *inode = dentry->d_inode;
4920         struct btrfs_root *root = BTRFS_I(inode)->root;
4921         int err;
4922
4923         if (btrfs_root_readonly(root))
4924                 return -EROFS;
4925
4926         err = inode_change_ok(inode, attr);
4927         if (err)
4928                 return err;
4929
4930         if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
4931                 err = btrfs_setsize(inode, attr);
4932                 if (err)
4933                         return err;
4934         }
4935
4936         if (attr->ia_valid) {
4937                 setattr_copy(inode, attr);
4938                 inode_inc_iversion(inode);
4939                 err = btrfs_dirty_inode(inode);
4940
4941                 if (!err && attr->ia_valid & ATTR_MODE)
4942                         err = posix_acl_chmod(inode, inode->i_mode);
4943         }
4944
4945         return err;
4946 }
4947
4948 /*
4949  * While truncating the inode pages during eviction, we get the VFS calling
4950  * btrfs_invalidatepage() against each page of the inode. This is slow because
4951  * the calls to btrfs_invalidatepage() result in a huge amount of calls to
4952  * lock_extent_bits() and clear_extent_bit(), which keep merging and splitting
4953  * extent_state structures over and over, wasting lots of time.
4954  *
4955  * Therefore if the inode is being evicted, let btrfs_invalidatepage() skip all
4956  * those expensive operations on a per page basis and do only the ordered io
4957  * finishing, while we release here the extent_map and extent_state structures,
4958  * without the excessive merging and splitting.
4959  */
4960 static void evict_inode_truncate_pages(struct inode *inode)
4961 {
4962         struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
4963         struct extent_map_tree *map_tree = &BTRFS_I(inode)->extent_tree;
4964         struct rb_node *node;
4965
4966         ASSERT(inode->i_state & I_FREEING);
4967         truncate_inode_pages_final(&inode->i_data);
4968
4969         write_lock(&map_tree->lock);
4970         while (!RB_EMPTY_ROOT(&map_tree->map)) {
4971                 struct extent_map *em;
4972
4973                 node = rb_first(&map_tree->map);
4974                 em = rb_entry(node, struct extent_map, rb_node);
4975                 clear_bit(EXTENT_FLAG_PINNED, &em->flags);
4976                 clear_bit(EXTENT_FLAG_LOGGING, &em->flags);
4977                 remove_extent_mapping(map_tree, em);
4978                 free_extent_map(em);
4979                 if (need_resched()) {
4980                         write_unlock(&map_tree->lock);
4981                         cond_resched();
4982                         write_lock(&map_tree->lock);
4983                 }
4984         }
4985         write_unlock(&map_tree->lock);
4986
4987         spin_lock(&io_tree->lock);
4988         while (!RB_EMPTY_ROOT(&io_tree->state)) {
4989                 struct extent_state *state;
4990                 struct extent_state *cached_state = NULL;
4991
4992                 node = rb_first(&io_tree->state);
4993                 state = rb_entry(node, struct extent_state, rb_node);
4994                 atomic_inc(&state->refs);
4995                 spin_unlock(&io_tree->lock);
4996
4997                 lock_extent_bits(io_tree, state->start, state->end,
4998                                  0, &cached_state);
4999                 clear_extent_bit(io_tree, state->start, state->end,
5000                                  EXTENT_LOCKED | EXTENT_DIRTY |
5001                                  EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
5002                                  EXTENT_DEFRAG, 1, 1,
5003                                  &cached_state, GFP_NOFS);
5004                 free_extent_state(state);
5005
5006                 cond_resched();
5007                 spin_lock(&io_tree->lock);
5008         }
5009         spin_unlock(&io_tree->lock);
5010 }
5011
5012 void btrfs_evict_inode(struct inode *inode)
5013 {
5014         struct btrfs_trans_handle *trans;
5015         struct btrfs_root *root = BTRFS_I(inode)->root;
5016         struct btrfs_block_rsv *rsv, *global_rsv;
5017         int steal_from_global = 0;
5018         u64 min_size = btrfs_calc_trunc_metadata_size(root, 1);
5019         int ret;
5020
5021         trace_btrfs_inode_evict(inode);
5022
5023         evict_inode_truncate_pages(inode);
5024
5025         if (inode->i_nlink &&
5026             ((btrfs_root_refs(&root->root_item) != 0 &&
5027               root->root_key.objectid != BTRFS_ROOT_TREE_OBJECTID) ||
5028              btrfs_is_free_space_inode(inode)))
5029                 goto no_delete;
5030
5031         if (is_bad_inode(inode)) {
5032                 btrfs_orphan_del(NULL, inode);
5033                 goto no_delete;
5034         }
5035         /* do we really want it for ->i_nlink > 0 and zero btrfs_root_refs? */
5036         btrfs_wait_ordered_range(inode, 0, (u64)-1);
5037
5038         btrfs_free_io_failure_record(inode, 0, (u64)-1);
5039
5040         if (root->fs_info->log_root_recovering) {
5041                 BUG_ON(test_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
5042                                  &BTRFS_I(inode)->runtime_flags));
5043                 goto no_delete;
5044         }
5045
5046         if (inode->i_nlink > 0) {
5047                 BUG_ON(btrfs_root_refs(&root->root_item) != 0 &&
5048                        root->root_key.objectid != BTRFS_ROOT_TREE_OBJECTID);
5049                 goto no_delete;
5050         }
5051
5052         ret = btrfs_commit_inode_delayed_inode(inode);
5053         if (ret) {
5054                 btrfs_orphan_del(NULL, inode);
5055                 goto no_delete;
5056         }
5057
5058         rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP);
5059         if (!rsv) {
5060                 btrfs_orphan_del(NULL, inode);
5061                 goto no_delete;
5062         }
5063         rsv->size = min_size;
5064         rsv->failfast = 1;
5065         global_rsv = &root->fs_info->global_block_rsv;
5066
5067         btrfs_i_size_write(inode, 0);
5068
5069         /*
5070          * This is a bit simpler than btrfs_truncate since we've already
5071          * reserved our space for our orphan item in the unlink, so we just
5072          * need to reserve some slack space in case we add bytes and update
5073          * inode item when doing the truncate.
5074          */
5075         while (1) {
5076                 ret = btrfs_block_rsv_refill(root, rsv, min_size,
5077                                              BTRFS_RESERVE_FLUSH_LIMIT);
5078
5079                 /*
5080                  * Try and steal from the global reserve since we will
5081                  * likely not use this space anyway, we want to try as
5082                  * hard as possible to get this to work.
5083                  */
5084                 if (ret)
5085                         steal_from_global++;
5086                 else
5087                         steal_from_global = 0;
5088                 ret = 0;
5089
5090                 /*
5091                  * steal_from_global == 0: we reserved stuff, hooray!
5092                  * steal_from_global == 1: we didn't reserve stuff, boo!
5093                  * steal_from_global == 2: we've committed, still not a lot of
5094                  * room but maybe we'll have room in the global reserve this
5095                  * time.
5096                  * steal_from_global == 3: abandon all hope!
5097                  */
5098                 if (steal_from_global > 2) {
5099                         btrfs_warn(root->fs_info,
5100                                 "Could not get space for a delete, will truncate on mount %d",
5101                                 ret);
5102                         btrfs_orphan_del(NULL, inode);
5103                         btrfs_free_block_rsv(root, rsv);
5104                         goto no_delete;
5105                 }
5106
5107                 trans = btrfs_join_transaction(root);
5108                 if (IS_ERR(trans)) {
5109                         btrfs_orphan_del(NULL, inode);
5110                         btrfs_free_block_rsv(root, rsv);
5111                         goto no_delete;
5112                 }
5113
5114                 /*
5115                  * We can't just steal from the global reserve, we need tomake
5116                  * sure there is room to do it, if not we need to commit and try
5117                  * again.
5118                  */
5119                 if (steal_from_global) {
5120                         if (!btrfs_check_space_for_delayed_refs(trans, root))
5121                                 ret = btrfs_block_rsv_migrate(global_rsv, rsv,
5122                                                               min_size);
5123                         else
5124                                 ret = -ENOSPC;
5125                 }
5126
5127                 /*
5128                  * Couldn't steal from the global reserve, we have too much
5129                  * pending stuff built up, commit the transaction and try it
5130                  * again.
5131                  */
5132                 if (ret) {
5133                         ret = btrfs_commit_transaction(trans, root);
5134                         if (ret) {
5135                                 btrfs_orphan_del(NULL, inode);
5136                                 btrfs_free_block_rsv(root, rsv);
5137                                 goto no_delete;
5138                         }
5139                         continue;
5140                 } else {
5141                         steal_from_global = 0;
5142                 }
5143
5144                 trans->block_rsv = rsv;
5145
5146                 ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0);
5147                 if (ret != -ENOSPC && ret != -EAGAIN)
5148                         break;
5149
5150                 trans->block_rsv = &root->fs_info->trans_block_rsv;
5151                 btrfs_end_transaction(trans, root);
5152                 trans = NULL;
5153                 btrfs_btree_balance_dirty(root);
5154         }
5155
5156         btrfs_free_block_rsv(root, rsv);
5157
5158         /*
5159          * Errors here aren't a big deal, it just means we leave orphan items
5160          * in the tree.  They will be cleaned up on the next mount.
5161          */
5162         if (ret == 0) {
5163                 trans->block_rsv = root->orphan_block_rsv;
5164                 btrfs_orphan_del(trans, inode);
5165         } else {
5166                 btrfs_orphan_del(NULL, inode);
5167         }
5168
5169         trans->block_rsv = &root->fs_info->trans_block_rsv;
5170         if (!(root == root->fs_info->tree_root ||
5171               root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID))
5172                 btrfs_return_ino(root, btrfs_ino(inode));
5173
5174         btrfs_end_transaction(trans, root);
5175         btrfs_btree_balance_dirty(root);
5176 no_delete:
5177         btrfs_remove_delayed_node(inode);
5178         clear_inode(inode);
5179         return;
5180 }
5181
5182 /*
5183  * this returns the key found in the dir entry in the location pointer.
5184  * If no dir entries were found, location->objectid is 0.
5185  */
5186 static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry,
5187                                struct btrfs_key *location)
5188 {
5189         const char *name = dentry->d_name.name;
5190         int namelen = dentry->d_name.len;
5191         struct btrfs_dir_item *di;
5192         struct btrfs_path *path;
5193         struct btrfs_root *root = BTRFS_I(dir)->root;
5194         int ret = 0;
5195
5196         path = btrfs_alloc_path();
5197         if (!path)
5198                 return -ENOMEM;
5199
5200         di = btrfs_lookup_dir_item(NULL, root, path, btrfs_ino(dir), name,
5201                                     namelen, 0);
5202         if (IS_ERR(di))
5203                 ret = PTR_ERR(di);
5204
5205         if (IS_ERR_OR_NULL(di))
5206                 goto out_err;
5207
5208         btrfs_dir_item_key_to_cpu(path->nodes[0], di, location);
5209 out:
5210         btrfs_free_path(path);
5211         return ret;
5212 out_err:
5213         location->objectid = 0;
5214         goto out;
5215 }
5216
5217 /*
5218  * when we hit a tree root in a directory, the btrfs part of the inode
5219  * needs to be changed to reflect the root directory of the tree root.  This
5220  * is kind of like crossing a mount point.
5221  */
5222 static int fixup_tree_root_location(struct btrfs_root *root,
5223                                     struct inode *dir,
5224                                     struct dentry *dentry,
5225                                     struct btrfs_key *location,
5226                                     struct btrfs_root **sub_root)
5227 {
5228         struct btrfs_path *path;
5229         struct btrfs_root *new_root;
5230         struct btrfs_root_ref *ref;
5231         struct extent_buffer *leaf;
5232         struct btrfs_key key;
5233         int ret;
5234         int err = 0;
5235
5236         path = btrfs_alloc_path();
5237         if (!path) {
5238                 err = -ENOMEM;
5239                 goto out;
5240         }
5241
5242         err = -ENOENT;
5243         key.objectid = BTRFS_I(dir)->root->root_key.objectid;
5244         key.type = BTRFS_ROOT_REF_KEY;
5245         key.offset = location->objectid;
5246
5247         ret = btrfs_search_slot(NULL, root->fs_info->tree_root, &key, path,
5248                                 0, 0);
5249         if (ret) {
5250                 if (ret < 0)
5251                         err = ret;
5252                 goto out;
5253         }
5254
5255         leaf = path->nodes[0];
5256         ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref);
5257         if (btrfs_root_ref_dirid(leaf, ref) != btrfs_ino(dir) ||
5258             btrfs_root_ref_name_len(leaf, ref) != dentry->d_name.len)
5259                 goto out;
5260
5261         ret = memcmp_extent_buffer(leaf, dentry->d_name.name,
5262                                    (unsigned long)(ref + 1),
5263                                    dentry->d_name.len);
5264         if (ret)
5265                 goto out;
5266
5267         btrfs_release_path(path);
5268
5269         new_root = btrfs_read_fs_root_no_name(root->fs_info, location);
5270         if (IS_ERR(new_root)) {
5271                 err = PTR_ERR(new_root);
5272                 goto out;
5273         }
5274
5275         *sub_root = new_root;
5276         location->objectid = btrfs_root_dirid(&new_root->root_item);
5277         location->type = BTRFS_INODE_ITEM_KEY;
5278         location->offset = 0;
5279         err = 0;
5280 out:
5281         btrfs_free_path(path);
5282         return err;
5283 }
5284
5285 static void inode_tree_add(struct inode *inode)
5286 {
5287         struct btrfs_root *root = BTRFS_I(inode)->root;
5288         struct btrfs_inode *entry;
5289         struct rb_node **p;
5290         struct rb_node *parent;
5291         struct rb_node *new = &BTRFS_I(inode)->rb_node;
5292         u64 ino = btrfs_ino(inode);
5293
5294         if (inode_unhashed(inode))
5295                 return;
5296         parent = NULL;
5297         spin_lock(&root->inode_lock);
5298         p = &root->inode_tree.rb_node;
5299         while (*p) {
5300                 parent = *p;
5301                 entry = rb_entry(parent, struct btrfs_inode, rb_node);
5302
5303                 if (ino < btrfs_ino(&entry->vfs_inode))
5304                         p = &parent->rb_left;
5305                 else if (ino > btrfs_ino(&entry->vfs_inode))
5306                         p = &parent->rb_right;
5307                 else {
5308                         WARN_ON(!(entry->vfs_inode.i_state &
5309                                   (I_WILL_FREE | I_FREEING)));
5310                         rb_replace_node(parent, new, &root->inode_tree);
5311                         RB_CLEAR_NODE(parent);
5312                         spin_unlock(&root->inode_lock);
5313                         return;
5314                 }
5315         }
5316         rb_link_node(new, parent, p);
5317         rb_insert_color(new, &root->inode_tree);
5318         spin_unlock(&root->inode_lock);
5319 }
5320
5321 static void inode_tree_del(struct inode *inode)
5322 {
5323         struct btrfs_root *root = BTRFS_I(inode)->root;
5324         int empty = 0;
5325
5326         spin_lock(&root->inode_lock);
5327         if (!RB_EMPTY_NODE(&BTRFS_I(inode)->rb_node)) {
5328                 rb_erase(&BTRFS_I(inode)->rb_node, &root->inode_tree);
5329                 RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node);
5330                 empty = RB_EMPTY_ROOT(&root->inode_tree);
5331         }
5332         spin_unlock(&root->inode_lock);
5333
5334         if (empty && btrfs_root_refs(&root->root_item) == 0) {
5335                 synchronize_srcu(&root->fs_info->subvol_srcu);
5336                 spin_lock(&root->inode_lock);
5337                 empty = RB_EMPTY_ROOT(&root->inode_tree);
5338                 spin_unlock(&root->inode_lock);
5339                 if (empty)
5340                         btrfs_add_dead_root(root);
5341         }
5342 }
5343
5344 void btrfs_invalidate_inodes(struct btrfs_root *root)
5345 {
5346         struct rb_node *node;
5347         struct rb_node *prev;
5348         struct btrfs_inode *entry;
5349         struct inode *inode;
5350         u64 objectid = 0;
5351
5352         if (!test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state))
5353                 WARN_ON(btrfs_root_refs(&root->root_item) != 0);
5354
5355         spin_lock(&root->inode_lock);
5356 again:
5357         node = root->inode_tree.rb_node;
5358         prev = NULL;
5359         while (node) {
5360                 prev = node;
5361                 entry = rb_entry(node, struct btrfs_inode, rb_node);
5362
5363                 if (objectid < btrfs_ino(&entry->vfs_inode))
5364                         node = node->rb_left;
5365                 else if (objectid > btrfs_ino(&entry->vfs_inode))
5366                         node = node->rb_right;
5367                 else
5368                         break;
5369         }
5370         if (!node) {
5371                 while (prev) {
5372                         entry = rb_entry(prev, struct btrfs_inode, rb_node);
5373                         if (objectid <= btrfs_ino(&entry->vfs_inode)) {
5374                                 node = prev;
5375                                 break;
5376                         }
5377                         prev = rb_next(prev);
5378                 }
5379         }
5380         while (node) {
5381                 entry = rb_entry(node, struct btrfs_inode, rb_node);
5382                 objectid = btrfs_ino(&entry->vfs_inode) + 1;
5383                 inode = igrab(&entry->vfs_inode);
5384                 if (inode) {
5385                         spin_unlock(&root->inode_lock);
5386                         if (atomic_read(&inode->i_count) > 1)
5387                                 d_prune_aliases(inode);
5388                         /*
5389                          * btrfs_drop_inode will have it removed from
5390                          * the inode cache when its usage count
5391                          * hits zero.
5392                          */
5393                         iput(inode);
5394                         cond_resched();
5395                         spin_lock(&root->inode_lock);
5396                         goto again;
5397                 }
5398
5399                 if (cond_resched_lock(&root->inode_lock))
5400                         goto again;
5401
5402                 node = rb_next(node);
5403         }
5404         spin_unlock(&root->inode_lock);
5405 }
5406
5407 static int btrfs_init_locked_inode(struct inode *inode, void *p)
5408 {
5409         struct btrfs_iget_args *args = p;
5410         inode->i_ino = args->location->objectid;
5411         memcpy(&BTRFS_I(inode)->location, args->location,
5412                sizeof(*args->location));
5413         BTRFS_I(inode)->root = args->root;
5414         return 0;
5415 }
5416
5417 static int btrfs_find_actor(struct inode *inode, void *opaque)
5418 {
5419         struct btrfs_iget_args *args = opaque;
5420         return args->location->objectid == BTRFS_I(inode)->location.objectid &&
5421                 args->root == BTRFS_I(inode)->root;
5422 }
5423
5424 static struct inode *btrfs_iget_locked(struct super_block *s,
5425                                        struct btrfs_key *location,
5426                                        struct btrfs_root *root)
5427 {
5428         struct inode *inode;
5429         struct btrfs_iget_args args;
5430         unsigned long hashval = btrfs_inode_hash(location->objectid, root);
5431
5432         args.location = location;
5433         args.root = root;
5434
5435         inode = iget5_locked(s, hashval, btrfs_find_actor,
5436                              btrfs_init_locked_inode,
5437                              (void *)&args);
5438         return inode;
5439 }
5440
5441 /* Get an inode object given its location and corresponding root.
5442  * Returns in *is_new if the inode was read from disk
5443  */
5444 struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
5445                          struct btrfs_root *root, int *new)
5446 {
5447         struct inode *inode;
5448
5449         inode = btrfs_iget_locked(s, location, root);
5450         if (!inode)
5451                 return ERR_PTR(-ENOMEM);
5452
5453         if (inode->i_state & I_NEW) {
5454                 btrfs_read_locked_inode(inode);
5455                 if (!is_bad_inode(inode)) {
5456                         inode_tree_add(inode);
5457                         unlock_new_inode(inode);
5458                         if (new)
5459                                 *new = 1;
5460                 } else {
5461                         unlock_new_inode(inode);
5462                         iput(inode);
5463                         inode = ERR_PTR(-ESTALE);
5464                 }
5465         }
5466
5467         return inode;
5468 }
5469
5470 static struct inode *new_simple_dir(struct super_block *s,
5471                                     struct btrfs_key *key,
5472                                     struct btrfs_root *root)
5473 {
5474         struct inode *inode = new_inode(s);
5475
5476         if (!inode)
5477                 return ERR_PTR(-ENOMEM);
5478
5479         BTRFS_I(inode)->root = root;
5480         memcpy(&BTRFS_I(inode)->location, key, sizeof(*key));
5481         set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags);
5482
5483         inode->i_ino = BTRFS_EMPTY_SUBVOL_DIR_OBJECTID;
5484         inode->i_op = &btrfs_dir_ro_inode_operations;
5485         inode->i_fop = &simple_dir_operations;
5486         inode->i_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO;
5487         inode->i_mtime = CURRENT_TIME;
5488         inode->i_atime = inode->i_mtime;
5489         inode->i_ctime = inode->i_mtime;
5490         BTRFS_I(inode)->i_otime = inode->i_mtime;
5491
5492         return inode;
5493 }
5494
5495 struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
5496 {
5497         struct inode *inode;
5498         struct btrfs_root *root = BTRFS_I(dir)->root;
5499         struct btrfs_root *sub_root = root;
5500         struct btrfs_key location;
5501         int index;
5502         int ret = 0;
5503
5504         if (dentry->d_name.len > BTRFS_NAME_LEN)
5505                 return ERR_PTR(-ENAMETOOLONG);
5506
5507         ret = btrfs_inode_by_name(dir, dentry, &location);
5508         if (ret < 0)
5509                 return ERR_PTR(ret);
5510
5511         if (location.objectid == 0)
5512                 return ERR_PTR(-ENOENT);
5513
5514         if (location.type == BTRFS_INODE_ITEM_KEY) {
5515                 inode = btrfs_iget(dir->i_sb, &location, root, NULL);
5516                 return inode;
5517         }
5518
5519         BUG_ON(location.type != BTRFS_ROOT_ITEM_KEY);
5520
5521         index = srcu_read_lock(&root->fs_info->subvol_srcu);
5522         ret = fixup_tree_root_location(root, dir, dentry,
5523                                        &location, &sub_root);
5524         if (ret < 0) {
5525                 if (ret != -ENOENT)
5526                         inode = ERR_PTR(ret);
5527                 else
5528                         inode = new_simple_dir(dir->i_sb, &location, sub_root);
5529         } else {
5530                 inode = btrfs_iget(dir->i_sb, &location, sub_root, NULL);
5531         }
5532         srcu_read_unlock(&root->fs_info->subvol_srcu, index);
5533
5534         if (!IS_ERR(inode) && root != sub_root) {
5535                 down_read(&root->fs_info->cleanup_work_sem);
5536                 if (!(inode->i_sb->s_flags & MS_RDONLY))
5537                         ret = btrfs_orphan_cleanup(sub_root);
5538                 up_read(&root->fs_info->cleanup_work_sem);
5539                 if (ret) {
5540                         iput(inode);
5541                         inode = ERR_PTR(ret);
5542                 }
5543         }
5544
5545         return inode;
5546 }
5547
5548 static int btrfs_dentry_delete(const struct dentry *dentry)
5549 {
5550         struct btrfs_root *root;
5551         struct inode *inode = dentry->d_inode;
5552
5553         if (!inode && !IS_ROOT(dentry))
5554                 inode = dentry->d_parent->d_inode;
5555
5556         if (inode) {
5557                 root = BTRFS_I(inode)->root;
5558                 if (btrfs_root_refs(&root->root_item) == 0)
5559                         return 1;
5560
5561                 if (btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
5562                         return 1;
5563         }
5564         return 0;
5565 }
5566
5567 static void btrfs_dentry_release(struct dentry *dentry)
5568 {
5569         kfree(dentry->d_fsdata);
5570 }
5571
5572 static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry,
5573                                    unsigned int flags)
5574 {
5575         struct inode *inode;
5576
5577         inode = btrfs_lookup_dentry(dir, dentry);
5578         if (IS_ERR(inode)) {
5579                 if (PTR_ERR(inode) == -ENOENT)
5580                         inode = NULL;
5581                 else
5582                         return ERR_CAST(inode);
5583         }
5584
5585         return d_splice_alias(inode, dentry);
5586 }
5587
/*
 * readdir d_type values, indexed by the type value stored in a directory
 * item.  The table has eight entries (indexes 0 to 7).
 */
unsigned char btrfs_filetype_table[] = {
        DT_UNKNOWN,     /* 0 */
        DT_REG,         /* 1 */
        DT_DIR,         /* 2 */
        DT_CHR,         /* 3 */
        DT_BLK,         /* 4 */
        DT_FIFO,        /* 5 */
        DT_SOCK,        /* 6 */
        DT_LNK,         /* 7 */
};
5591
5592 static int btrfs_real_readdir(struct file *file, struct dir_context *ctx)
5593 {
5594         struct inode *inode = file_inode(file);
5595         struct btrfs_root *root = BTRFS_I(inode)->root;
5596         struct btrfs_item *item;
5597         struct btrfs_dir_item *di;
5598         struct btrfs_key key;
5599         struct btrfs_key found_key;
5600         struct btrfs_path *path;
5601         struct list_head ins_list;
5602         struct list_head del_list;
5603         int ret;
5604         struct extent_buffer *leaf;
5605         int slot;
5606         unsigned char d_type;
5607         int over = 0;
5608         u32 di_cur;
5609         u32 di_total;
5610         u32 di_len;
5611         int key_type = BTRFS_DIR_INDEX_KEY;
5612         char tmp_name[32];
5613         char *name_ptr;
5614         int name_len;
5615         int is_curr = 0;        /* ctx->pos points to the current index? */
5616
5617         /* FIXME, use a real flag for deciding about the key type */
5618         if (root->fs_info->tree_root == root)
5619                 key_type = BTRFS_DIR_ITEM_KEY;
5620
5621         if (!dir_emit_dots(file, ctx))
5622                 return 0;
5623
5624         path = btrfs_alloc_path();
5625         if (!path)
5626                 return -ENOMEM;
5627
5628         path->reada = 1;
5629
5630         if (key_type == BTRFS_DIR_INDEX_KEY) {
5631                 INIT_LIST_HEAD(&ins_list);
5632                 INIT_LIST_HEAD(&del_list);
5633                 btrfs_get_delayed_items(inode, &ins_list, &del_list);
5634         }
5635
5636         key.type = key_type;
5637         key.offset = ctx->pos;
5638         key.objectid = btrfs_ino(inode);
5639
5640         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5641         if (ret < 0)
5642                 goto err;
5643
5644         while (1) {
5645                 leaf = path->nodes[0];
5646                 slot = path->slots[0];
5647                 if (slot >= btrfs_header_nritems(leaf)) {
5648                         ret = btrfs_next_leaf(root, path);
5649                         if (ret < 0)
5650                                 goto err;
5651                         else if (ret > 0)
5652                                 break;
5653                         continue;
5654                 }
5655
5656                 item = btrfs_item_nr(slot);
5657                 btrfs_item_key_to_cpu(leaf, &found_key, slot);
5658
5659                 if (found_key.objectid != key.objectid)
5660                         break;
5661                 if (found_key.type != key_type)
5662                         break;
5663                 if (found_key.offset < ctx->pos)
5664                         goto next;
5665                 if (key_type == BTRFS_DIR_INDEX_KEY &&
5666                     btrfs_should_delete_dir_index(&del_list,
5667                                                   found_key.offset))
5668                         goto next;
5669
5670                 ctx->pos = found_key.offset;
5671                 is_curr = 1;
5672
5673                 di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item);
5674                 di_cur = 0;
5675                 di_total = btrfs_item_size(leaf, item);
5676
5677                 while (di_cur < di_total) {
5678                         struct btrfs_key location;
5679
5680                         if (verify_dir_item(root, leaf, di))
5681                                 break;
5682
5683                         name_len = btrfs_dir_name_len(leaf, di);
5684                         if (name_len <= sizeof(tmp_name)) {
5685                                 name_ptr = tmp_name;
5686                         } else {
5687                                 name_ptr = kmalloc(name_len, GFP_NOFS);
5688                                 if (!name_ptr) {
5689                                         ret = -ENOMEM;
5690                                         goto err;
5691                                 }
5692                         }
5693                         read_extent_buffer(leaf, name_ptr,
5694                                            (unsigned long)(di + 1), name_len);
5695
5696                         d_type = btrfs_filetype_table[btrfs_dir_type(leaf, di)];
5697                         btrfs_dir_item_key_to_cpu(leaf, di, &location);
5698
5699
5700                         /* is this a reference to our own snapshot? If so
5701                          * skip it.
5702                          *
5703                          * In contrast to old kernels, we insert the snapshot's
5704                          * dir item and dir index after it has been created, so
5705                          * we won't find a reference to our own snapshot. We
5706                          * still keep the following code for backward
5707                          * compatibility.
5708                          */
5709                         if (location.type == BTRFS_ROOT_ITEM_KEY &&
5710                             location.objectid == root->root_key.objectid) {
5711                                 over = 0;
5712                                 goto skip;
5713                         }
5714                         over = !dir_emit(ctx, name_ptr, name_len,
5715                                        location.objectid, d_type);
5716
5717 skip:
5718                         if (name_ptr != tmp_name)
5719                                 kfree(name_ptr);
5720
5721                         if (over)
5722                                 goto nopos;
5723                         di_len = btrfs_dir_name_len(leaf, di) +
5724                                  btrfs_dir_data_len(leaf, di) + sizeof(*di);
5725                         di_cur += di_len;
5726                         di = (struct btrfs_dir_item *)((char *)di + di_len);
5727                 }
5728 next:
5729                 path->slots[0]++;
5730         }
5731
5732         if (key_type == BTRFS_DIR_INDEX_KEY) {
5733                 if (is_curr)
5734                         ctx->pos++;
5735                 ret = btrfs_readdir_delayed_dir_index(ctx, &ins_list);
5736                 if (ret)
5737                         goto nopos;
5738         }
5739
5740         /* Reached end of directory/root. Bump pos past the last item. */
5741         ctx->pos++;
5742
5743         /*
5744          * Stop new entries from being returned after we return the last
5745          * entry.
5746          *
5747          * New directory entries are assigned a strictly increasing
5748          * offset.  This means that new entries created during readdir
5749          * are *guaranteed* to be seen in the future by that readdir.
5750          * This has broken buggy programs which operate on names as
5751          * they're returned by readdir.  Until we re-use freed offsets
5752          * we have this hack to stop new entries from being returned
5753          * under the assumption that they'll never reach this huge
5754          * offset.
5755          *
5756          * This is being careful not to overflow 32bit loff_t unless the
5757          * last entry requires it because doing so has broken 32bit apps
5758          * in the past.
5759          */
5760         if (key_type == BTRFS_DIR_INDEX_KEY) {
5761                 if (ctx->pos >= INT_MAX)
5762                         ctx->pos = LLONG_MAX;
5763                 else
5764                         ctx->pos = INT_MAX;
5765         }
5766 nopos:
5767         ret = 0;
5768 err:
5769         if (key_type == BTRFS_DIR_INDEX_KEY)
5770                 btrfs_put_delayed_items(&ins_list, &del_list);
5771         btrfs_free_path(path);
5772         return ret;
5773 }
5774
5775 int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc)
5776 {
5777         struct btrfs_root *root = BTRFS_I(inode)->root;
5778         struct btrfs_trans_handle *trans;
5779         int ret = 0;
5780         bool nolock = false;
5781
5782         if (test_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags))
5783                 return 0;
5784
5785         if (btrfs_fs_closing(root->fs_info) && btrfs_is_free_space_inode(inode))
5786                 nolock = true;
5787
5788         if (wbc->sync_mode == WB_SYNC_ALL) {
5789                 if (nolock)
5790                         trans = btrfs_join_transaction_nolock(root);
5791                 else
5792                         trans = btrfs_join_transaction(root);
5793                 if (IS_ERR(trans))
5794                         return PTR_ERR(trans);
5795                 ret = btrfs_commit_transaction(trans, root);
5796         }
5797         return ret;
5798 }
5799
5800 /*
5801  * This is somewhat expensive, updating the tree every time the
5802  * inode changes.  But, it is most likely to find the inode in cache.
5803  * FIXME, needs more benchmarking...there are no reasons other than performance
5804  * to keep or drop this code.
5805  */
5806 static int btrfs_dirty_inode(struct inode *inode)
5807 {
5808         struct btrfs_root *root = BTRFS_I(inode)->root;
5809         struct btrfs_trans_handle *trans;
5810         int ret;
5811
5812         if (test_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags))
5813                 return 0;
5814
5815         trans = btrfs_join_transaction(root);
5816         if (IS_ERR(trans))
5817                 return PTR_ERR(trans);
5818
5819         ret = btrfs_update_inode(trans, root, inode);
5820         if (ret && ret == -ENOSPC) {
5821                 /* whoops, lets try again with the full transaction */
5822                 btrfs_end_transaction(trans, root);
5823                 trans = btrfs_start_transaction(root, 1);
5824                 if (IS_ERR(trans))
5825                         return PTR_ERR(trans);
5826
5827                 ret = btrfs_update_inode(trans, root, inode);
5828         }
5829         btrfs_end_transaction(trans, root);
5830         if (BTRFS_I(inode)->delayed_node)
5831                 btrfs_balance_delayed_items(root);
5832
5833         return ret;
5834 }
5835
5836 /*
5837  * This is a copy of file_update_time.  We need this so we can return error on
5838  * ENOSPC for updating the inode in the case of file write and mmap writes.
5839  */
5840 static int btrfs_update_time(struct inode *inode, struct timespec *now,
5841                              int flags)
5842 {
5843         struct btrfs_root *root = BTRFS_I(inode)->root;
5844
5845         if (btrfs_root_readonly(root))
5846                 return -EROFS;
5847
5848         if (flags & S_VERSION)
5849                 inode_inc_iversion(inode);
5850         if (flags & S_CTIME)
5851                 inode->i_ctime = *now;
5852         if (flags & S_MTIME)
5853                 inode->i_mtime = *now;
5854         if (flags & S_ATIME)
5855                 inode->i_atime = *now;
5856         return btrfs_dirty_inode(inode);
5857 }
5858
5859 /*
5860  * find the highest existing sequence number in a directory
5861  * and then set the in-memory index_cnt variable to reflect
5862  * free sequence numbers
5863  */
5864 static int btrfs_set_inode_index_count(struct inode *inode)
5865 {
5866         struct btrfs_root *root = BTRFS_I(inode)->root;
5867         struct btrfs_key key, found_key;
5868         struct btrfs_path *path;
5869         struct extent_buffer *leaf;
5870         int ret;
5871
5872         key.objectid = btrfs_ino(inode);
5873         key.type = BTRFS_DIR_INDEX_KEY;
5874         key.offset = (u64)-1;
5875
5876         path = btrfs_alloc_path();
5877         if (!path)
5878                 return -ENOMEM;
5879
5880         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5881         if (ret < 0)
5882                 goto out;
5883         /* FIXME: we should be able to handle this */
5884         if (ret == 0)
5885                 goto out;
5886         ret = 0;
5887
5888         /*
5889          * MAGIC NUMBER EXPLANATION:
5890          * since we search a directory based on f_pos we have to start at 2
5891          * since '.' and '..' have f_pos of 0 and 1 respectively, so everybody
5892          * else has to start at 2
5893          */
5894         if (path->slots[0] == 0) {
5895                 BTRFS_I(inode)->index_cnt = 2;
5896                 goto out;
5897         }
5898
5899         path->slots[0]--;
5900
5901         leaf = path->nodes[0];
5902         btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
5903
5904         if (found_key.objectid != btrfs_ino(inode) ||
5905             found_key.type != BTRFS_DIR_INDEX_KEY) {
5906                 BTRFS_I(inode)->index_cnt = 2;
5907                 goto out;
5908         }
5909
5910         BTRFS_I(inode)->index_cnt = found_key.offset + 1;
5911 out:
5912         btrfs_free_path(path);
5913         return ret;
5914 }
5915
5916 /*
5917  * helper to find a free sequence number in a given directory.  This current
5918  * code is very simple, later versions will do smarter things in the btree
5919  */
5920 int btrfs_set_inode_index(struct inode *dir, u64 *index)
5921 {
5922         int ret = 0;
5923
5924         if (BTRFS_I(dir)->index_cnt == (u64)-1) {
5925                 ret = btrfs_inode_delayed_dir_index_count(dir);
5926                 if (ret) {
5927                         ret = btrfs_set_inode_index_count(dir);
5928                         if (ret)
5929                                 return ret;
5930                 }
5931         }
5932
5933         *index = BTRFS_I(dir)->index_cnt;
5934         BTRFS_I(dir)->index_cnt++;
5935
5936         return ret;
5937 }
5938
5939 static int btrfs_insert_inode_locked(struct inode *inode)
5940 {
5941         struct btrfs_iget_args args;
5942         args.location = &BTRFS_I(inode)->location;
5943         args.root = BTRFS_I(inode)->root;
5944
5945         return insert_inode_locked4(inode,
5946                    btrfs_inode_hash(inode->i_ino, BTRFS_I(inode)->root),
5947                    btrfs_find_actor, &args);
5948 }
5949
5950 static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
5951                                      struct btrfs_root *root,
5952                                      struct inode *dir,
5953                                      const char *name, int name_len,
5954                                      u64 ref_objectid, u64 objectid,
5955                                      umode_t mode, u64 *index)
5956 {
5957         struct inode *inode;
5958         struct btrfs_inode_item *inode_item;
5959         struct btrfs_key *location;
5960         struct btrfs_path *path;
5961         struct btrfs_inode_ref *ref;
5962         struct btrfs_key key[2];
5963         u32 sizes[2];
5964         int nitems = name ? 2 : 1;
5965         unsigned long ptr;
5966         int ret;
5967
5968         path = btrfs_alloc_path();
5969         if (!path)
5970                 return ERR_PTR(-ENOMEM);
5971
5972         inode = new_inode(root->fs_info->sb);
5973         if (!inode) {
5974                 btrfs_free_path(path);
5975                 return ERR_PTR(-ENOMEM);
5976         }
5977
5978         /*
5979          * O_TMPFILE, set link count to 0, so that after this point,
5980          * we fill in an inode item with the correct link count.
5981          */
5982         if (!name)
5983                 set_nlink(inode, 0);
5984
5985         /*
5986          * we have to initialize this early, so we can reclaim the inode
5987          * number if we fail afterwards in this function.
5988          */
5989         inode->i_ino = objectid;
5990
5991         if (dir && name) {
5992                 trace_btrfs_inode_request(dir);
5993
5994                 ret = btrfs_set_inode_index(dir, index);
5995                 if (ret) {
5996                         btrfs_free_path(path);
5997                         iput(inode);
5998                         return ERR_PTR(ret);
5999                 }
6000         } else if (dir) {
6001                 *index = 0;
6002         }
6003         /*
6004          * index_cnt is ignored for everything but a dir,
6005          * btrfs_get_inode_index_count has an explanation for the magic
6006          * number
6007          */
6008         BTRFS_I(inode)->index_cnt = 2;
6009         BTRFS_I(inode)->dir_index = *index;
6010         BTRFS_I(inode)->root = root;
6011         BTRFS_I(inode)->generation = trans->transid;
6012         inode->i_generation = BTRFS_I(inode)->generation;
6013
6014         /*
6015          * We could have gotten an inode number from somebody who was fsynced
6016          * and then removed in this same transaction, so let's just set full
6017          * sync since it will be a full sync anyway and this will blow away the
6018          * old info in the log.
6019          */
6020         set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags);
6021
6022         key[0].objectid = objectid;
6023         key[0].type = BTRFS_INODE_ITEM_KEY;
6024         key[0].offset = 0;
6025
6026         sizes[0] = sizeof(struct btrfs_inode_item);
6027
6028         if (name) {
6029                 /*
6030                  * Start new inodes with an inode_ref. This is slightly more
6031                  * efficient for small numbers of hard links since they will
6032                  * be packed into one item. Extended refs will kick in if we
6033                  * add more hard links than can fit in the ref item.
6034                  */
6035                 key[1].objectid = objectid;
6036                 key[1].type = BTRFS_INODE_REF_KEY;
6037                 key[1].offset = ref_objectid;
6038
6039                 sizes[1] = name_len + sizeof(*ref);
6040         }
6041
6042         location = &BTRFS_I(inode)->location;
6043         location->objectid = objectid;
6044         location->offset = 0;
6045         location->type = BTRFS_INODE_ITEM_KEY;
6046
6047         ret = btrfs_insert_inode_locked(inode);
6048         if (ret < 0)
6049                 goto fail;
6050
6051         path->leave_spinning = 1;
6052         ret = btrfs_insert_empty_items(trans, root, path, key, sizes, nitems);
6053         if (ret != 0)
6054                 goto fail_unlock;
6055
6056         inode_init_owner(inode, dir, mode);
6057         inode_set_bytes(inode, 0);
6058
6059         inode->i_mtime = CURRENT_TIME;
6060         inode->i_atime = inode->i_mtime;
6061         inode->i_ctime = inode->i_mtime;
6062         BTRFS_I(inode)->i_otime = inode->i_mtime;
6063
6064         inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
6065                                   struct btrfs_inode_item);
6066         memset_extent_buffer(path->nodes[0], 0, (unsigned long)inode_item,
6067                              sizeof(*inode_item));
6068         fill_inode_item(trans, path->nodes[0], inode_item, inode);
6069
6070         if (name) {
6071                 ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1,
6072                                      struct btrfs_inode_ref);
6073                 btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len);
6074                 btrfs_set_inode_ref_index(path->nodes[0], ref, *index);
6075                 ptr = (unsigned long)(ref + 1);
6076                 write_extent_buffer(path->nodes[0], name, ptr, name_len);
6077         }
6078
6079         btrfs_mark_buffer_dirty(path->nodes[0]);
6080         btrfs_free_path(path);
6081
6082         btrfs_inherit_iflags(inode, dir);
6083
6084         if (S_ISREG(mode)) {
6085                 if (btrfs_test_opt(root, NODATASUM))
6086                         BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM;
6087                 if (btrfs_test_opt(root, NODATACOW))
6088                         BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW |
6089                                 BTRFS_INODE_NODATASUM;
6090         }
6091
6092         inode_tree_add(inode);
6093
6094         trace_btrfs_inode_new(inode);
6095         btrfs_set_inode_last_trans(trans, inode);
6096
6097         btrfs_update_root_times(trans, root);
6098
6099         ret = btrfs_inode_inherit_props(trans, inode, dir);
6100         if (ret)
6101                 btrfs_err(root->fs_info,
6102                           "error inheriting props for ino %llu (root %llu): %d",
6103                           btrfs_ino(inode), root->root_key.objectid, ret);
6104
6105         return inode;
6106
6107 fail_unlock:
6108         unlock_new_inode(inode);
6109 fail:
6110         if (dir && name)
6111                 BTRFS_I(dir)->index_cnt--;
6112         btrfs_free_path(path);
6113         iput(inode);
6114         return ERR_PTR(ret);
6115 }
6116
6117 static inline u8 btrfs_inode_type(struct inode *inode)
6118 {
6119         return btrfs_type_by_mode[(inode->i_mode & S_IFMT) >> S_SHIFT];
6120 }
6121
6122 /*
6123  * utility function to add 'inode' into 'parent_inode' with
6124  * a give name and a given sequence number.
6125  * if 'add_backref' is true, also insert a backref from the
6126  * inode to the parent directory.
6127  */
6128 int btrfs_add_link(struct btrfs_trans_handle *trans,
6129                    struct inode *parent_inode, struct inode *inode,
6130                    const char *name, int name_len, int add_backref, u64 index)
6131 {
6132         int ret = 0;
6133         struct btrfs_key key;
6134         struct btrfs_root *root = BTRFS_I(parent_inode)->root;
6135         u64 ino = btrfs_ino(inode);
6136         u64 parent_ino = btrfs_ino(parent_inode);
6137
6138         if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) {
6139                 memcpy(&key, &BTRFS_I(inode)->root->root_key, sizeof(key));
6140         } else {
6141                 key.objectid = ino;
6142                 key.type = BTRFS_INODE_ITEM_KEY;
6143                 key.offset = 0;
6144         }
6145
6146         if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) {
6147                 ret = btrfs_add_root_ref(trans, root->fs_info->tree_root,
6148                                          key.objectid, root->root_key.objectid,
6149                                          parent_ino, index, name, name_len);
6150         } else if (add_backref) {
6151                 ret = btrfs_insert_inode_ref(trans, root, name, name_len, ino,
6152                                              parent_ino, index);
6153         }
6154
6155         /* Nothing to clean up yet */
6156         if (ret)
6157                 return ret;
6158
6159         ret = btrfs_insert_dir_item(trans, root, name, name_len,
6160                                     parent_inode, &key,
6161                                     btrfs_inode_type(inode), index);
6162         if (ret == -EEXIST || ret == -EOVERFLOW)
6163                 goto fail_dir_item;
6164         else if (ret) {
6165                 btrfs_abort_transaction(trans, root, ret);
6166                 return ret;
6167         }
6168
6169         btrfs_i_size_write(parent_inode, parent_inode->i_size +
6170                            name_len * 2);
6171         inode_inc_iversion(parent_inode);
6172         parent_inode->i_mtime = parent_inode->i_ctime = CURRENT_TIME;
6173         ret = btrfs_update_inode(trans, root, parent_inode);
6174         if (ret)
6175                 btrfs_abort_transaction(trans, root, ret);
6176         return ret;
6177
6178 fail_dir_item:
6179         if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) {
6180                 u64 local_index;
6181                 int err;
6182                 err = btrfs_del_root_ref(trans, root->fs_info->tree_root,
6183                                  key.objectid, root->root_key.objectid,
6184                                  parent_ino, &local_index, name, name_len);
6185
6186         } else if (add_backref) {
6187                 u64 local_index;
6188                 int err;
6189
6190                 err = btrfs_del_inode_ref(trans, root, name, name_len,
6191                                           ino, parent_ino, &local_index);
6192         }
6193         return ret;
6194 }
6195
6196 static int btrfs_add_nondir(struct btrfs_trans_handle *trans,
6197                             struct inode *dir, struct dentry *dentry,
6198                             struct inode *inode, int backref, u64 index)
6199 {
6200         int err = btrfs_add_link(trans, dir, inode,
6201                                  dentry->d_name.name, dentry->d_name.len,
6202                                  backref, index);
6203         if (err > 0)
6204                 err = -EEXIST;
6205         return err;
6206 }
6207
6208 static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
6209                         umode_t mode, dev_t rdev)
6210 {
6211         struct btrfs_trans_handle *trans;
6212         struct btrfs_root *root = BTRFS_I(dir)->root;
6213         struct inode *inode = NULL;
6214         int err;
6215         int drop_inode = 0;
6216         u64 objectid;
6217         u64 index = 0;
6218
6219         if (!new_valid_dev(rdev))
6220                 return -EINVAL;
6221
6222         /*
6223          * 2 for inode item and ref
6224          * 2 for dir items
6225          * 1 for xattr if selinux is on
6226          */
6227         trans = btrfs_start_transaction(root, 5);
6228         if (IS_ERR(trans))
6229                 return PTR_ERR(trans);
6230
6231         err = btrfs_find_free_ino(root, &objectid);
6232         if (err)
6233                 goto out_unlock;
6234
6235         inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
6236                                 dentry->d_name.len, btrfs_ino(dir), objectid,
6237                                 mode, &index);
6238         if (IS_ERR(inode)) {
6239                 err = PTR_ERR(inode);
6240                 goto out_unlock;
6241         }
6242
6243         /*
6244         * If the active LSM wants to access the inode during
6245         * d_instantiate it needs these. Smack checks to see
6246         * if the filesystem supports xattrs by looking at the
6247         * ops vector.
6248         */
6249         inode->i_op = &btrfs_special_inode_operations;
6250         init_special_inode(inode, inode->i_mode, rdev);
6251
6252         err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
6253         if (err)
6254                 goto out_unlock_inode;
6255
6256         err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
6257         if (err) {
6258                 goto out_unlock_inode;
6259         } else {
6260                 btrfs_update_inode(trans, root, inode);
6261                 unlock_new_inode(inode);
6262                 d_instantiate(dentry, inode);
6263         }
6264
6265 out_unlock:
6266         btrfs_end_transaction(trans, root);
6267         btrfs_balance_delayed_items(root);
6268         btrfs_btree_balance_dirty(root);
6269         if (drop_inode) {
6270                 inode_dec_link_count(inode);
6271                 iput(inode);
6272         }
6273         return err;
6274
6275 out_unlock_inode:
6276         drop_inode = 1;
6277         unlock_new_inode(inode);
6278         goto out_unlock;
6279
6280 }
6281
6282 static int btrfs_create(struct inode *dir, struct dentry *dentry,
6283                         umode_t mode, bool excl)
6284 {
6285         struct btrfs_trans_handle *trans;
6286         struct btrfs_root *root = BTRFS_I(dir)->root;
6287         struct inode *inode = NULL;
6288         int drop_inode_on_err = 0;
6289         int err;
6290         u64 objectid;
6291         u64 index = 0;
6292
6293         /*
6294          * 2 for inode item and ref
6295          * 2 for dir items
6296          * 1 for xattr if selinux is on
6297          */
6298         trans = btrfs_start_transaction(root, 5);
6299         if (IS_ERR(trans))
6300                 return PTR_ERR(trans);
6301
6302         err = btrfs_find_free_ino(root, &objectid);
6303         if (err)
6304                 goto out_unlock;
6305
6306         inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
6307                                 dentry->d_name.len, btrfs_ino(dir), objectid,
6308                                 mode, &index);
6309         if (IS_ERR(inode)) {
6310                 err = PTR_ERR(inode);
6311                 goto out_unlock;
6312         }
6313         drop_inode_on_err = 1;
6314         /*
6315         * If the active LSM wants to access the inode during
6316         * d_instantiate it needs these. Smack checks to see
6317         * if the filesystem supports xattrs by looking at the
6318         * ops vector.
6319         */
6320         inode->i_fop = &btrfs_file_operations;
6321         inode->i_op = &btrfs_file_inode_operations;
6322         inode->i_mapping->a_ops = &btrfs_aops;
6323
6324         err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
6325         if (err)
6326                 goto out_unlock_inode;
6327
6328         err = btrfs_update_inode(trans, root, inode);
6329         if (err)
6330                 goto out_unlock_inode;
6331
6332         err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
6333         if (err)
6334                 goto out_unlock_inode;
6335
6336         BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
6337         unlock_new_inode(inode);
6338         d_instantiate(dentry, inode);
6339
6340 out_unlock:
6341         btrfs_end_transaction(trans, root);
6342         if (err && drop_inode_on_err) {
6343                 inode_dec_link_count(inode);
6344                 iput(inode);
6345         }
6346         btrfs_balance_delayed_items(root);
6347         btrfs_btree_balance_dirty(root);
6348         return err;
6349
6350 out_unlock_inode:
6351         unlock_new_inode(inode);
6352         goto out_unlock;
6353
6354 }
6355
6356 static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
6357                       struct dentry *dentry)
6358 {
6359         struct btrfs_trans_handle *trans;
6360         struct btrfs_root *root = BTRFS_I(dir)->root;
6361         struct inode *inode = old_dentry->d_inode;
6362         u64 index;
6363         int err;
6364         int drop_inode = 0;
6365
6366         /* do not allow sys_link's with other subvols of the same device */
6367         if (root->objectid != BTRFS_I(inode)->root->objectid)
6368                 return -EXDEV;
6369
6370         if (inode->i_nlink >= BTRFS_LINK_MAX)
6371                 return -EMLINK;
6372
6373         err = btrfs_set_inode_index(dir, &index);
6374         if (err)
6375                 goto fail;
6376
6377         /*
6378          * 2 items for inode and inode ref
6379          * 2 items for dir items
6380          * 1 item for parent inode
6381          */
6382         trans = btrfs_start_transaction(root, 5);
6383         if (IS_ERR(trans)) {
6384                 err = PTR_ERR(trans);
6385                 goto fail;
6386         }
6387
6388         /* There are several dir indexes for this inode, clear the cache. */
6389         BTRFS_I(inode)->dir_index = 0ULL;
6390         inc_nlink(inode);
6391         inode_inc_iversion(inode);
6392         inode->i_ctime = CURRENT_TIME;
6393         ihold(inode);
6394         set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags);
6395
6396         err = btrfs_add_nondir(trans, dir, dentry, inode, 1, index);
6397
6398         if (err) {
6399                 drop_inode = 1;
6400         } else {
6401                 struct dentry *parent = dentry->d_parent;
6402                 err = btrfs_update_inode(trans, root, inode);
6403                 if (err)
6404                         goto fail;
6405                 if (inode->i_nlink == 1) {
6406                         /*
6407                          * If new hard link count is 1, it's a file created
6408                          * with open(2) O_TMPFILE flag.
6409                          */
6410                         err = btrfs_orphan_del(trans, inode);
6411                         if (err)
6412                                 goto fail;
6413                 }
6414                 d_instantiate(dentry, inode);
6415                 btrfs_log_new_name(trans, inode, NULL, parent);
6416         }
6417
6418         btrfs_end_transaction(trans, root);
6419         btrfs_balance_delayed_items(root);
6420 fail:
6421         if (drop_inode) {
6422                 inode_dec_link_count(inode);
6423                 iput(inode);
6424         }
6425         btrfs_btree_balance_dirty(root);
6426         return err;
6427 }
6428
6429 static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
6430 {
6431         struct inode *inode = NULL;
6432         struct btrfs_trans_handle *trans;
6433         struct btrfs_root *root = BTRFS_I(dir)->root;
6434         int err = 0;
6435         int drop_on_err = 0;
6436         u64 objectid = 0;
6437         u64 index = 0;
6438
6439         /*
6440          * 2 items for inode and ref
6441          * 2 items for dir items
6442          * 1 for xattr if selinux is on
6443          */
6444         trans = btrfs_start_transaction(root, 5);
6445         if (IS_ERR(trans))
6446                 return PTR_ERR(trans);
6447
6448         err = btrfs_find_free_ino(root, &objectid);
6449         if (err)
6450                 goto out_fail;
6451
6452         inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
6453                                 dentry->d_name.len, btrfs_ino(dir), objectid,
6454                                 S_IFDIR | mode, &index);
6455         if (IS_ERR(inode)) {
6456                 err = PTR_ERR(inode);
6457                 goto out_fail;
6458         }
6459
6460         drop_on_err = 1;
6461         /* these must be set before we unlock the inode */
6462         inode->i_op = &btrfs_dir_inode_operations;
6463         inode->i_fop = &btrfs_dir_file_operations;
6464
6465         err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
6466         if (err)
6467                 goto out_fail_inode;
6468
6469         btrfs_i_size_write(inode, 0);
6470         err = btrfs_update_inode(trans, root, inode);
6471         if (err)
6472                 goto out_fail_inode;
6473
6474         err = btrfs_add_link(trans, dir, inode, dentry->d_name.name,
6475                              dentry->d_name.len, 0, index);
6476         if (err)
6477                 goto out_fail_inode;
6478
6479         d_instantiate(dentry, inode);
6480         /*
6481          * mkdir is special.  We're unlocking after we call d_instantiate
6482          * to avoid a race with nfsd calling d_instantiate.
6483          */
6484         unlock_new_inode(inode);
6485         drop_on_err = 0;
6486
6487 out_fail:
6488         btrfs_end_transaction(trans, root);
6489         if (drop_on_err) {
6490                 inode_dec_link_count(inode);
6491                 iput(inode);
6492         }
6493         btrfs_balance_delayed_items(root);
6494         btrfs_btree_balance_dirty(root);
6495         return err;
6496
6497 out_fail_inode:
6498         unlock_new_inode(inode);
6499         goto out_fail;
6500 }
6501
6502 /* Find next extent map of a given extent map, caller needs to ensure locks */
6503 static struct extent_map *next_extent_map(struct extent_map *em)
6504 {
6505         struct rb_node *next;
6506
6507         next = rb_next(&em->rb_node);
6508         if (!next)
6509                 return NULL;
6510         return container_of(next, struct extent_map, rb_node);
6511 }
6512
6513 static struct extent_map *prev_extent_map(struct extent_map *em)
6514 {
6515         struct rb_node *prev;
6516
6517         prev = rb_prev(&em->rb_node);
6518         if (!prev)
6519                 return NULL;
6520         return container_of(prev, struct extent_map, rb_node);
6521 }
6522
6523 /* helper for btfs_get_extent.  Given an existing extent in the tree,
6524  * the existing extent is the nearest extent to map_start,
6525  * and an extent that you want to insert, deal with overlap and insert
6526  * the best fitted new extent into the tree.
6527  */
6528 static int merge_extent_mapping(struct extent_map_tree *em_tree,
6529                                 struct extent_map *existing,
6530                                 struct extent_map *em,
6531                                 u64 map_start)
6532 {
6533         struct extent_map *prev;
6534         struct extent_map *next;
6535         u64 start;
6536         u64 end;
6537         u64 start_diff;
6538
6539         BUG_ON(map_start < em->start || map_start >= extent_map_end(em));
6540
6541         if (existing->start > map_start) {
6542                 next = existing;
6543                 prev = prev_extent_map(next);
6544         } else {
6545                 prev = existing;
6546                 next = next_extent_map(prev);
6547         }
6548
6549         start = prev ? extent_map_end(prev) : em->start;
6550         start = max_t(u64, start, em->start);
6551         end = next ? next->start : extent_map_end(em);
6552         end = min_t(u64, end, extent_map_end(em));
6553         start_diff = start - em->start;
6554         em->start = start;
6555         em->len = end - start;
6556         if (em->block_start < EXTENT_MAP_LAST_BYTE &&
6557             !test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
6558                 em->block_start += start_diff;
6559                 em->block_len -= start_diff;
6560         }
6561         return add_extent_mapping(em_tree, em, 0);
6562 }
6563
6564 static noinline int uncompress_inline(struct btrfs_path *path,
6565                                       struct inode *inode, struct page *page,
6566                                       size_t pg_offset, u64 extent_offset,
6567                                       struct btrfs_file_extent_item *item)
6568 {
6569         int ret;
6570         struct extent_buffer *leaf = path->nodes[0];
6571         char *tmp;
6572         size_t max_size;
6573         unsigned long inline_size;
6574         unsigned long ptr;
6575         int compress_type;
6576
6577         WARN_ON(pg_offset != 0);
6578         compress_type = btrfs_file_extent_compression(leaf, item);
6579         max_size = btrfs_file_extent_ram_bytes(leaf, item);
6580         inline_size = btrfs_file_extent_inline_item_len(leaf,
6581                                         btrfs_item_nr(path->slots[0]));
6582         tmp = kmalloc(inline_size, GFP_NOFS);
6583         if (!tmp)
6584                 return -ENOMEM;
6585         ptr = btrfs_file_extent_inline_start(item);
6586
6587         read_extent_buffer(leaf, tmp, ptr, inline_size);
6588
6589         max_size = min_t(unsigned long, PAGE_CACHE_SIZE, max_size);
6590         ret = btrfs_decompress(compress_type, tmp, page,
6591                                extent_offset, inline_size, max_size);
6592         kfree(tmp);
6593         return ret;
6594 }
6595
6596 /*
6597  * a bit scary, this does extent mapping from logical file offset to the disk.
6598  * the ugly parts come from merging extents from the disk with the in-ram
6599  * representation.  This gets more complex because of the data=ordered code,
6600  * where the in-ram extents might be locked pending data=ordered completion.
6601  *
6602  * This also copies inline extents directly into the page.
6603  */
6604
6605 struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
6606                                     size_t pg_offset, u64 start, u64 len,
6607                                     int create)
6608 {
6609         int ret;
6610         int err = 0;
6611         u64 extent_start = 0;
6612         u64 extent_end = 0;
6613         u64 objectid = btrfs_ino(inode);
6614         u32 found_type;
6615         struct btrfs_path *path = NULL;
6616         struct btrfs_root *root = BTRFS_I(inode)->root;
6617         struct btrfs_file_extent_item *item;
6618         struct extent_buffer *leaf;
6619         struct btrfs_key found_key;
6620         struct extent_map *em = NULL;
6621         struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
6622         struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
6623         struct btrfs_trans_handle *trans = NULL;
6624         const bool new_inline = !page || create;
6625
6626 again:
6627         read_lock(&em_tree->lock);
6628         em = lookup_extent_mapping(em_tree, start, len);
6629         if (em)
6630                 em->bdev = root->fs_info->fs_devices->latest_bdev;
6631         read_unlock(&em_tree->lock);
6632
6633         if (em) {
6634                 if (em->start > start || em->start + em->len <= start)
6635                         free_extent_map(em);
6636                 else if (em->block_start == EXTENT_MAP_INLINE && page)
6637                         free_extent_map(em);
6638                 else
6639                         goto out;
6640         }
6641         em = alloc_extent_map();
6642         if (!em) {
6643                 err = -ENOMEM;
6644                 goto out;
6645         }
6646         em->bdev = root->fs_info->fs_devices->latest_bdev;
6647         em->start = EXTENT_MAP_HOLE;
6648         em->orig_start = EXTENT_MAP_HOLE;
6649         em->len = (u64)-1;
6650         em->block_len = (u64)-1;
6651
6652         if (!path) {
6653                 path = btrfs_alloc_path();
6654                 if (!path) {
6655                         err = -ENOMEM;
6656                         goto out;
6657                 }
6658                 /*
6659                  * Chances are we'll be called again, so go ahead and do
6660                  * readahead
6661                  */
6662                 path->reada = 1;
6663         }
6664
6665         ret = btrfs_lookup_file_extent(trans, root, path,
6666                                        objectid, start, trans != NULL);
6667         if (ret < 0) {
6668                 err = ret;
6669                 goto out;
6670         }
6671
6672         if (ret != 0) {
6673                 if (path->slots[0] == 0)
6674                         goto not_found;
6675                 path->slots[0]--;
6676         }
6677
6678         leaf = path->nodes[0];
6679         item = btrfs_item_ptr(leaf, path->slots[0],
6680                               struct btrfs_file_extent_item);
6681         /* are we inside the extent that was found? */
6682         btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
6683         found_type = found_key.type;
6684         if (found_key.objectid != objectid ||
6685             found_type != BTRFS_EXTENT_DATA_KEY) {
6686                 /*
6687                  * If we backup past the first extent we want to move forward
6688                  * and see if there is an extent in front of us, otherwise we'll
6689                  * say there is a hole for our whole search range which can
6690                  * cause problems.
6691                  */
6692                 extent_end = start;
6693                 goto next;
6694         }
6695
6696         found_type = btrfs_file_extent_type(leaf, item);
6697         extent_start = found_key.offset;
6698         if (found_type == BTRFS_FILE_EXTENT_REG ||
6699             found_type == BTRFS_FILE_EXTENT_PREALLOC) {
6700                 extent_end = extent_start +
6701                        btrfs_file_extent_num_bytes(leaf, item);
6702         } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
6703                 size_t size;
6704                 size = btrfs_file_extent_inline_len(leaf, path->slots[0], item);
6705                 extent_end = ALIGN(extent_start + size, root->sectorsize);
6706         }
6707 next:
6708         if (start >= extent_end) {
6709                 path->slots[0]++;
6710                 if (path->slots[0] >= btrfs_header_nritems(leaf)) {
6711                         ret = btrfs_next_leaf(root, path);
6712                         if (ret < 0) {
6713                                 err = ret;
6714                                 goto out;
6715                         }
6716                         if (ret > 0)
6717                                 goto not_found;
6718                         leaf = path->nodes[0];
6719                 }
6720                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
6721                 if (found_key.objectid != objectid ||
6722                     found_key.type != BTRFS_EXTENT_DATA_KEY)
6723                         goto not_found;
6724                 if (start + len <= found_key.offset)
6725                         goto not_found;
6726                 if (start > found_key.offset)
6727                         goto next;
6728                 em->start = start;
6729                 em->orig_start = start;
6730                 em->len = found_key.offset - start;
6731                 goto not_found_em;
6732         }
6733
6734         btrfs_extent_item_to_extent_map(inode, path, item, new_inline, em);
6735
6736         if (found_type == BTRFS_FILE_EXTENT_REG ||
6737             found_type == BTRFS_FILE_EXTENT_PREALLOC) {
6738                 goto insert;
6739         } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
6740                 unsigned long ptr;
6741                 char *map;
6742                 size_t size;
6743                 size_t extent_offset;
6744                 size_t copy_size;
6745
6746                 if (new_inline)
6747                         goto out;
6748
6749                 size = btrfs_file_extent_inline_len(leaf, path->slots[0], item);
6750                 extent_offset = page_offset(page) + pg_offset - extent_start;
6751                 copy_size = min_t(u64, PAGE_CACHE_SIZE - pg_offset,
6752                                 size - extent_offset);
6753                 em->start = extent_start + extent_offset;
6754                 em->len = ALIGN(copy_size, root->sectorsize);
6755                 em->orig_block_len = em->len;
6756                 em->orig_start = em->start;
6757                 ptr = btrfs_file_extent_inline_start(item) + extent_offset;
6758                 if (create == 0 && !PageUptodate(page)) {
6759                         if (btrfs_file_extent_compression(leaf, item) !=
6760                             BTRFS_COMPRESS_NONE) {
6761                                 ret = uncompress_inline(path, inode, page,
6762                                                         pg_offset,
6763                                                         extent_offset, item);
6764                                 if (ret) {
6765                                         err = ret;
6766                                         goto out;
6767                                 }
6768                         } else {
6769                                 map = kmap(page);
6770                                 read_extent_buffer(leaf, map + pg_offset, ptr,
6771                                                    copy_size);
6772                                 if (pg_offset + copy_size < PAGE_CACHE_SIZE) {
6773                                         memset(map + pg_offset + copy_size, 0,
6774                                                PAGE_CACHE_SIZE - pg_offset -
6775                                                copy_size);
6776                                 }
6777                                 kunmap(page);
6778                         }
6779                         flush_dcache_page(page);
6780                 } else if (create && PageUptodate(page)) {
6781                         BUG();
6782                         if (!trans) {
6783                                 kunmap(page);
6784                                 free_extent_map(em);
6785                                 em = NULL;
6786
6787                                 btrfs_release_path(path);
6788                                 trans = btrfs_join_transaction(root);
6789
6790                                 if (IS_ERR(trans))
6791                                         return ERR_CAST(trans);
6792                                 goto again;
6793                         }
6794                         map = kmap(page);
6795                         write_extent_buffer(leaf, map + pg_offset, ptr,
6796                                             copy_size);
6797                         kunmap(page);
6798                         btrfs_mark_buffer_dirty(leaf);
6799                 }
6800                 set_extent_uptodate(io_tree, em->start,
6801                                     extent_map_end(em) - 1, NULL, GFP_NOFS);
6802                 goto insert;
6803         }
6804 not_found:
6805         em->start = start;
6806         em->orig_start = start;
6807         em->len = len;
6808 not_found_em:
6809         em->block_start = EXTENT_MAP_HOLE;
6810         set_bit(EXTENT_FLAG_VACANCY, &em->flags);
6811 insert:
6812         btrfs_release_path(path);
6813         if (em->start > start || extent_map_end(em) <= start) {
6814                 btrfs_err(root->fs_info, "bad extent! em: [%llu %llu] passed [%llu %llu]",
6815                         em->start, em->len, start, len);
6816                 err = -EIO;
6817                 goto out;
6818         }
6819
6820         err = 0;
6821         write_lock(&em_tree->lock);
6822         ret = add_extent_mapping(em_tree, em, 0);
6823         /* it is possible that someone inserted the extent into the tree
6824          * while we had the lock dropped.  It is also possible that
6825          * an overlapping map exists in the tree
6826          */
6827         if (ret == -EEXIST) {
6828                 struct extent_map *existing;
6829
6830                 ret = 0;
6831
6832                 existing = search_extent_mapping(em_tree, start, len);
6833                 /*
6834                  * existing will always be non-NULL, since there must be
6835                  * extent causing the -EEXIST.
6836                  */
6837                 if (start >= extent_map_end(existing) ||
6838                     start <= existing->start) {
6839                         /*
6840                          * The existing extent map is the one nearest to
6841                          * the [start, start + len) range which overlaps
6842                          */
6843                         err = merge_extent_mapping(em_tree, existing,
6844                                                    em, start);
6845                         free_extent_map(existing);
6846                         if (err) {
6847                                 free_extent_map(em);
6848                                 em = NULL;
6849                         }
6850                 } else {
6851                         free_extent_map(em);
6852                         em = existing;
6853                         err = 0;
6854                 }
6855         }
6856         write_unlock(&em_tree->lock);
6857 out:
6858
6859         trace_btrfs_get_extent(root, em);
6860
6861         if (path)
6862                 btrfs_free_path(path);
6863         if (trans) {
6864                 ret = btrfs_end_transaction(trans, root);
6865                 if (!err)
6866                         err = ret;
6867         }
6868         if (err) {
6869                 free_extent_map(em);
6870                 return ERR_PTR(err);
6871         }
6872         BUG_ON(!em); /* Error is always set */
6873         return em;
6874 }
6875
/*
 * Variant of btrfs_get_extent() that additionally reports delalloc ranges.
 *
 * The extent map is first looked up with btrfs_get_extent().  If it maps
 * something other than a hole or a preallocated extent it is returned
 * unchanged.  Otherwise the inode's io_tree is searched for EXTENT_DELALLOC
 * bits and, if any are found inside the requested range, a newly allocated
 * extent map is returned that describes either
 *   - the part of the hole/prealloc extent that precedes the delalloc
 *     bytes, or
 *   - the delalloc range itself (block_start == EXTENT_MAP_DELALLOC).
 *
 * Returns the extent map, or ERR_PTR(-ENOMEM) / the ERR_PTR() produced by
 * btrfs_get_extent() on failure.  The result is NULL only if
 * btrfs_get_extent() returned NULL and no delalloc was found.
 */
6876 struct extent_map *btrfs_get_extent_fiemap(struct inode *inode, struct page *page,
6877                                            size_t pg_offset, u64 start, u64 len,
6878                                            int create)
6879 {
6880         struct extent_map *em;
6881         struct extent_map *hole_em = NULL;
6882         u64 range_start = start;
6883         u64 end;
6884         u64 found;
6885         u64 found_end;
6886         int err = 0;
6887
6888         em = btrfs_get_extent(inode, page, pg_offset, start, len, create);
6889         if (IS_ERR(em))
6890                 return em;
6891         if (em) {
6892                 /*
6893                  * if our em maps to
6894                  * -  a hole or
6895                  * -  a pre-alloc extent,
6896                  * there might actually be delalloc bytes behind it.
6897                  */
6898                 if (em->block_start != EXTENT_MAP_HOLE &&
6899                     !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
6900                         return em;
6901                 else
6902                         hole_em = em;
6903         }
6904
6905         /* check to see if we've wrapped (len == -1 or similar) */
             /* end becomes the inclusive last byte of the requested range */
6906         end = start + len;
6907         if (end < start)
6908                 end = (u64)-1;
6909         else
6910                 end -= 1;
6911
6912         em = NULL;
6913
6914         /* ok, we didn't find anything, lets look for delalloc */
             /*
              * range_start is handed to count_range_bits() by pointer; the
              * code below treats the (possibly updated) range_start plus the
              * returned count as the delalloc range [range_start, found_end).
              */
6915         found = count_range_bits(&BTRFS_I(inode)->io_tree, &range_start,
6916                                  end, len, EXTENT_DELALLOC, 1);
6917         found_end = range_start + found;
             /* clamp on u64 wrap-around */
6918         if (found_end < range_start)
6919                 found_end = (u64)-1;
6920
6921         /*
6922          * we didn't find anything useful, return
6923          * the original results from get_extent()
6924          */
6925         if (range_start > end || found_end <= start) {
                     /* move the reference from hole_em to em so "out" keeps it */
6926                 em = hole_em;
6927                 hole_em = NULL;
6928                 goto out;
6929         }
6930
6931         /* adjust the range_start to make sure it doesn't
6932          * go backwards from the start they passed in
6933          */
6934         range_start = max(start, range_start);
6935         found = found_end - range_start;
6936
6937         if (found > 0) {
6938                 u64 hole_start = start;
6939                 u64 hole_len = len;
6940
6941                 em = alloc_extent_map();
6942                 if (!em) {
6943                         err = -ENOMEM;
6944                         goto out;
6945                 }
6946                 /*
6947                  * when btrfs_get_extent can't find anything it
6948                  * returns one huge hole
6949                  *
6950                  * make sure what it found really fits our range, and
6951                  * adjust to make sure it is based on the start from
6952                  * the caller
6953                  */
6954                 if (hole_em) {
6955                         u64 calc_end = extent_map_end(hole_em);
6956
6957                         if (calc_end <= start || (hole_em->start > end)) {
                                     /* hole lies entirely outside the request: drop it */
6958                                 free_extent_map(hole_em);
6959                                 hole_em = NULL;
6960                         } else {
6961                                 hole_start = max(hole_em->start, start);
6962                                 hole_len = calc_end - hole_start;
6963                         }
6964                 }
6965                 em->bdev = NULL;
6966                 if (hole_em && range_start > hole_start) {
6967                         /* our hole starts before our delalloc, so we
6968                          * have to return just the parts of the hole
6969                          * that go until  the delalloc starts
6970                          */
6971                         em->len = min(hole_len,
6972                                       range_start - hole_start);
6973                         em->start = hole_start;
6974                         em->orig_start = hole_start;
6975                         /*
6976                          * don't adjust block start at all,
6977                          * it is fixed at EXTENT_MAP_HOLE
6978                          */
6979                         em->block_start = hole_em->block_start;
6980                         em->block_len = hole_len;
                             /* carry the prealloc flag over to the trimmed copy */
6981                         if (test_bit(EXTENT_FLAG_PREALLOC, &hole_em->flags))
6982                                 set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
6983                 } else {
                             /* the delalloc range starts at or before the hole */
6984                         em->start = range_start;
6985                         em->len = found;
6986                         em->orig_start = range_start;
6987                         em->block_start = EXTENT_MAP_DELALLOC;
6988                         em->block_len = found;
6989                 }
6990         } else if (hole_em) {
                     /* no delalloc bytes left after clamping: return the hole as is */
6991                 return hole_em;
6992         }
6993 out:
6994
             /*
              * hole_em is NULL here whenever its reference was handed over to
              * em or already dropped above.
              */
6995         free_extent_map(hole_em);
6996         if (err) {
6997                 free_extent_map(em);
6998                 return ERR_PTR(err);
6999         }
7000         return em;
7001 }
7002
7003 static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
7004                                                   u64 start, u64 len)
7005 {
7006         struct btrfs_root *root = BTRFS_I(inode)->root;
7007         struct extent_map *em;
7008         struct btrfs_key ins;
7009         u64 alloc_hint;
7010         int ret;
7011
7012         alloc_hint = get_extent_allocation_hint(inode, start, len);
7013         ret = btrfs_reserve_extent(root, len, root->sectorsize, 0,
7014                                    alloc_hint, &ins, 1, 1);
7015         if (ret)
7016                 return ERR_PTR(ret);
7017
7018         em = create_pinned_em(inode, start, ins.offset, start, ins.objectid,
7019                               ins.offset, ins.offset, ins.offset, 0);
7020         if (IS_ERR(em)) {
7021                 btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
7022                 return em;
7023         }
7024
7025         ret = btrfs_add_ordered_extent_dio(inode, start, ins.objectid,
7026                                            ins.offset, ins.offset, 0);
7027         if (ret) {
7028                 btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
7029                 free_extent_map(em);
7030                 return ERR_PTR(ret);
7031         }
7032
7033         return em;
7034 }
7035
7036 /*
7037  * Decide whether a write of *len bytes at file offset @offset may be done
      * in place on the existing file extent instead of being cow'd.
      *
      * returns 1 when the nocow is safe, 0 if the block must be cow'd and
7038  * < 0 on error (-ENOMEM, a failed file extent lookup, or -EAGAIN when a
      * preallocated extent on an inode without NODATACOW has EXTENT_DELALLOC
      * set in the io_tree for the range).
      *
      * On a return of 1, *len is trimmed so the range does not run past the
      * end of the extent.  If @orig_start is non-NULL, *orig_start,
      * *orig_block_len and *ram_bytes are filled in from the file extent item
      * as soon as it has passed the type/encoding checks, which can also be
      * the case when 0 or -EAGAIN is returned afterwards.
7039  */
7040 noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
7041                               u64 *orig_start, u64 *orig_block_len,
7042                               u64 *ram_bytes)
7043 {
7044         struct btrfs_trans_handle *trans;
7045         struct btrfs_path *path;
7046         int ret;
7047         struct extent_buffer *leaf;
7048         struct btrfs_root *root = BTRFS_I(inode)->root;
7049         struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
7050         struct btrfs_file_extent_item *fi;
7051         struct btrfs_key key;
7052         u64 disk_bytenr;
7053         u64 backref_offset;
7054         u64 extent_end;
7055         u64 num_bytes;
7056         int slot;
7057         int found_type;
7058         bool nocow = (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW);
7059
7060         path = btrfs_alloc_path();
7061         if (!path)
7062                 return -ENOMEM;
7063
7064         ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(inode),
7065                                        offset, 0);
7066         if (ret < 0)
7067                 goto out;
7068
7069         slot = path->slots[0];
             /* ret == 1: no exact match, so look at the item in the previous slot */
7070         if (ret == 1) {
7071                 if (slot == 0) {
7072                         /* can't find the item, must cow */
7073                         ret = 0;
7074                         goto out;
7075                 }
7076                 slot--;
7077         }
             /* from here on every "goto out" means "must cow" unless ret is changed */
7078         ret = 0;
7079         leaf = path->nodes[0];
7080         btrfs_item_key_to_cpu(leaf, &key, slot);
7081         if (key.objectid != btrfs_ino(inode) ||
7082             key.type != BTRFS_EXTENT_DATA_KEY) {
7083                 /* not our file or wrong item type, must cow */
7084                 goto out;
7085         }
7086
7087         if (key.offset > offset) {
7088                 /* Wrong offset, must cow */
7089                 goto out;
7090         }
7091
7092         fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
7093         found_type = btrfs_file_extent_type(leaf, fi);
7094         if (found_type != BTRFS_FILE_EXTENT_REG &&
7095             found_type != BTRFS_FILE_EXTENT_PREALLOC) {
7096                 /* not a regular extent, must cow */
7097                 goto out;
7098         }
7099
             /* regular extents are only overwritten in place on NODATACOW inodes */
7100         if (!nocow && found_type == BTRFS_FILE_EXTENT_REG)
7101                 goto out;
7102
             /* the extent must actually cover @offset */
7103         extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
7104         if (extent_end <= offset)
7105                 goto out;
7106
             /* NOTE(review): disk_bytenr == 0 presumably marks a hole - confirm */
7107         disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
7108         if (disk_bytenr == 0)
7109                 goto out;
7110
             /* any compression/encryption/other encoding forces cow */
7111         if (btrfs_file_extent_compression(leaf, fi) ||
7112             btrfs_file_extent_encryption(leaf, fi) ||
7113             btrfs_file_extent_other_encoding(leaf, fi))
7114                 goto out;
7115
7116         backref_offset = btrfs_file_extent_offset(leaf, fi);
7117
             /*
              * only @orig_start is checked; @orig_block_len and @ram_bytes are
              * written whenever it is non-NULL
              */
7118         if (orig_start) {
7119                 *orig_start = key.offset - backref_offset;
7120                 *orig_block_len = btrfs_file_extent_disk_num_bytes(leaf, fi);
7121                 *ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
7122         }
7123
7124         if (btrfs_extent_readonly(root, disk_bytenr))
7125                 goto out;
7126
             /* number of bytes of the request that fall inside this extent */
7127         num_bytes = min(offset + *len, extent_end) - offset;
7128         if (!nocow && found_type == BTRFS_FILE_EXTENT_PREALLOC) {
7129                 u64 range_end;
7130
7131                 range_end = round_up(offset + num_bytes, root->sectorsize) - 1;
7132                 ret = test_range_bit(io_tree, offset, range_end,
7133                                      EXTENT_DELALLOC, 0, NULL);
7134                 if (ret) {
7135                         ret = -EAGAIN;
7136                         goto out;
7137                 }
7138         }
7139
7140         btrfs_release_path(path);
7141
7142         /*
7143          * look for other files referencing this extent, if we
7144          * find any we must cow
7145          */
7146         trans = btrfs_join_transaction(root);
7147         if (IS_ERR(trans)) {
                     /* failing to join a transaction is reported as "must cow" */
7148                 ret = 0;
7149                 goto out;
7150         }
7151
7152         ret = btrfs_cross_ref_exist(trans, root, btrfs_ino(inode),
7153                                     key.offset - backref_offset, disk_bytenr);
7154         btrfs_end_transaction(trans, root);
             /* any non-zero result (reference found or error) means cow */
7155         if (ret) {
7156                 ret = 0;
7157                 goto out;
7158         }
7159
7160         /*
7161          * adjust disk_bytenr and num_bytes to cover just the bytes
7162          * in this extent we are about to write.  If there
7163          * are any csums in that range we have to cow in order
7164          * to keep the csums correct
7165          */
7166         disk_bytenr += backref_offset;
7167         disk_bytenr += offset - key.offset;
             /* ret is still 0 here, so this reports "must cow" */
7168         if (csum_exist_in_range(root, disk_bytenr, num_bytes))
7169                                 goto out;
7170         /*
7171          * all of the above have passed, it is safe to overwrite this extent
7172          * without cow
7173          */
7174         *len = num_bytes;
7175         ret = 1;
7176 out:
7177         btrfs_free_path(path);
7178         return ret;
7179 }
7180
7181 bool btrfs_page_exists_in_range(struct inode *inode, loff_t start, loff_t end)
7182 {
7183         struct radix_tree_root *root = &inode->i_mapping->page_tree;
7184         int found = false;
7185         void **pagep = NULL;
7186         struct page *page = NULL;
7187         int start_idx;
7188         int end_idx;
7189
7190         start_idx = start >> PAGE_CACHE_SHIFT;
7191
7192         /*
7193          * end is the last byte in the last page.  end == start is legal
7194          */
7195         end_idx = end >> PAGE_CACHE_SHIFT;
7196
7197         rcu_read_lock();
7198
7199         /* Most of the code in this while loop is lifted from
7200          * find_get_page.  It's been modified to begin searching from a
7201          * page and return just the first page found in that range.  If the
7202          * found idx is less than or equal to the end idx then we know that
7203          * a page exists.  If no pages are found or if those pages are
7204          * outside of the range then we're fine (yay!) */
7205         while (page == NULL &&
7206                radix_tree_gang_lookup_slot(root, &pagep, NULL, start_idx, 1)) {
7207                 page = radix_tree_deref_slot(pagep);
7208                 if (unlikely(!page))
7209                         break;
7210
7211                 if (radix_tree_exception(page)) {
7212                         if (radix_tree_deref_retry(page)) {
7213                                 page = NULL;
7214                                 continue;
7215                         }
7216                         /*
7217                          * Otherwise, shmem/tmpfs must be storing a swap entry
7218                          * here as an exceptional entry: so return it without
7219                          * attempting to raise page count.
7220                          */
7221                         page = NULL;
7222                         break; /* TODO: Is this relevant for this use case? */
7223                 }
7224
7225                 if (!page_cache_get_speculative(page)) {
7226                         page = NULL;
7227                         continue;
7228                 }
7229
7230                 /*
7231                  * Has the page moved?
7232                  * This is part of the lockless pagecache protocol. See
7233                  * include/linux/pagemap.h for details.
7234                  */
7235                 if (unlikely(page != *pagep)) {
7236                         page_cache_release(page);
7237                         page = NULL;
7238                 }
7239         }
7240
7241         if (page) {
7242                 if (page->index <= end_idx)
7243                         found = true;
7244                 page_cache_release(page);
7245         }
7246
7247         rcu_read_unlock();
7248         return found;
7249 }
7250
/*
 * Lock [lockstart, lockend] in the inode's io_tree for direct I/O.
 *
 * The lock is retried until no ordered extent covers the range and, when
 * @writing is set, btrfs_page_exists_in_range() reports no page in it.
 * Between attempts the range is unlocked and either the ordered extent is
 * started and waited for, or the pages in the range are written back,
 * waited on and invalidated.
 *
 * Returns 0 with the range locked.  A non-zero return is the error from
 * btrfs_fdatawrite_range(), filemap_fdatawait_range() or
 * invalidate_inode_pages2_range(); in that case the range has already been
 * unlocked again.
 */
7251 static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
7252                               struct extent_state **cached_state, int writing)
7253 {
7254         struct btrfs_ordered_extent *ordered;
7255         int ret = 0;
7256
7257         while (1) {
7258                 lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
7259                                  0, cached_state);
7260                 /*
7261                  * We're concerned with the entire range that we're going to be
7262                  * doing DIO to, so we need to make sure theres no ordered
7263                  * extents in this range.
7264                  */
7265                 ordered = btrfs_lookup_ordered_range(inode, lockstart,
7266                                                      lockend - lockstart + 1);
7267
7268                 /*
7269                  * We need to make sure there are no buffered pages in this
7270                  * range either, we could have raced between the invalidate in
7271                  * generic_file_direct_write and locking the extent.  The
7272                  * invalidate needs to happen so that reads after a write do not
7273                  * get stale data.
7274                  */
                     /* success: leave the loop with the range still locked */
7275                 if (!ordered &&
7276                     (!writing ||
7277                      !btrfs_page_exists_in_range(inode, lockstart, lockend)))
7278                         break;
7279
                     /* drop the lock before waiting on ordered I/O or writeback */
7280                 unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
7281                                      cached_state, GFP_NOFS);
7282
7283                 if (ordered) {
7284                         btrfs_start_ordered_extent(inode, ordered, 1);
7285                         btrfs_put_ordered_extent(ordered);
7286                 } else {
                             /*
                              * No ordered extent, so a page was found in the
                              * range: flush it, wait for it and invalidate it.
                              */
7287                         /* Screw you mmap */
7288                         ret = btrfs_fdatawrite_range(inode, lockstart, lockend);
7289                         if (ret)
7290                                 break;
7291                         ret = filemap_fdatawait_range(inode->i_mapping,
7292                                                       lockstart,
7293                                                       lockend);
7294                         if (ret)
7295                                 break;
7296
7297                         /*
7298                          * If we found a page that couldn't be invalidated just
7299                          * fall back to buffered.
7300                          */
7301                         ret = invalidate_inode_pages2_range(inode->i_mapping,
7302                                         lockstart >> PAGE_CACHE_SHIFT,
7303                                         lockend >> PAGE_CACHE_SHIFT);
7304                         if (ret)
7305                                 break;
7306                 }
7307
7308                 cond_resched();
7309         }
7310
7311         return ret;
7312 }
7313
7314 static struct extent_map *create_pinned_em(struct inode *inode, u64 start,
7315                                            u64 len, u64 orig_start,
7316                                            u64 block_start, u64 block_len,
7317                                            u64 orig_block_len, u64 ram_bytes,
7318                                            int type)
7319 {
7320         struct extent_map_tree *em_tree;
7321         struct extent_map *em;
7322         struct btrfs_root *root = BTRFS_I(inode)->root;
7323         int ret;
7324
7325         em_tree = &BTRFS_I(inode)->extent_tree;
7326         em = alloc_extent_map();
7327         if (!em)
7328                 return ERR_PTR(-ENOMEM);
7329
7330         em->start = start;
7331         em->orig_start = orig_start;
7332         em->mod_start = start;
7333         em->mod_len = len;
7334         em->len = len;
7335         em->block_len = block_len;
7336         em->block_start = block_start;
7337         em->bdev = root->fs_info->fs_devices->latest_bdev;
7338         em->orig_block_len = orig_block_len;
7339         em->ram_bytes = ram_bytes;
7340         em->generation = -1;
7341         set_bit(EXTENT_FLAG_PINNED, &em->flags);
7342         if (type == BTRFS_ORDERED_PREALLOC)
7343                 set_bit(EXTENT_FLAG_FILLING, &em->flags);
7344
7345         do {
7346                 btrfs_drop_extent_cache(inode, em->start,
7347                                 em->start + em->len - 1, 0);
7348                 write_lock(&em_tree->lock);
7349                 ret = add_extent_mapping(em_tree, em, 1);
7350                 write_unlock(&em_tree->lock);
7351         } while (ret == -EEXIST);
7352
7353         if (ret) {
7354                 free_extent_map(em);
7355                 return ERR_PTR(ret);
7356         }
7357
7358         return em;
7359 }
7360
7361
7362 static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
7363                                    struct buffer_head *bh_result, int create)
7364 {
7365         struct extent_map *em;
7366         struct btrfs_root *root = BTRFS_I(inode)->root;
7367         struct extent_state *cached_state = NULL;
7368         u64 start = iblock << inode->i_blkbits;
7369         u64 lockstart, lockend;
7370         u64 len = bh_result->b_size;
7371         u64 *outstanding_extents = NULL;
7372         int unlock_bits = EXTENT_LOCKED;
7373         int ret = 0;
7374
7375         if (create)
7376                 unlock_bits |= EXTENT_DIRTY;
7377         else
7378                 len = min_t(u64, len, root->sectorsize);
7379
7380         lockstart = start;
7381         lockend = start + len - 1;
7382
7383         if (current->journal_info) {
7384                 /*
7385                  * Need to pull our outstanding extents and set journal_info to NULL so
7386                  * that anything that needs to check if there's a transction doesn't get
7387                  * confused.
7388                  */
7389                 outstanding_extents = current->journal_info;
7390                 current->journal_info = NULL;
7391         }
7392
7393         /*
7394          * If this errors out it's because we couldn't invalidate pagecache for
7395          * this range and we need to fallback to buffered.
7396          */
7397         if (lock_extent_direct(inode, lockstart, lockend, &cached_state, create))
7398                 return -ENOTBLK;
7399
7400         em = btrfs_get_extent(inode, NULL, 0, start, len, 0);
7401         if (IS_ERR(em)) {
7402                 ret = PTR_ERR(em);
7403                 goto unlock_err;
7404         }
7405
7406         /*
7407          * Ok for INLINE and COMPRESSED extents we need to fallback on buffered
7408          * io.  INLINE is special, and we could probably kludge it in here, but
7409          * it's still buffered so for safety lets just fall back to the generic
7410          * buffered path.
7411          *
7412          * For COMPRESSED we _have_ to read the entire extent in so we can
7413          * decompress it, so there will be buffering required no matter what we
7414          * do, so go ahead and fallback to buffered.
7415          *
7416          * We return -ENOTBLK because thats what makes DIO go ahead and go back
7417          * to buffered IO.  Don't blame me, this is the price we pay for using
7418          * the generic code.
7419          */
7420         if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) ||
7421             em->block_start == EXTENT_MAP_INLINE) {
7422                 free_extent_map(em);
7423                 ret = -ENOTBLK;
7424                 goto unlock_err;
7425         }
7426
7427         /* Just a good old fashioned hole, return */
7428         if (!create && (em->block_start == EXTENT_MAP_HOLE ||
7429                         test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
7430                 free_extent_map(em);
7431                 goto unlock_err;
7432         }
7433
7434         /*
7435          * We don't allocate a new extent in the following cases
7436          *
7437          * 1) The inode is marked as NODATACOW.  In this case we'll just use the
7438          * existing extent.
7439          * 2) The extent is marked as PREALLOC.  We're good to go here and can
7440          * just use the extent.
7441          *
7442          */
7443         if (!create) {
7444                 len = min(len, em->len - (start - em->start));
7445                 lockstart = start + len;
7446                 goto unlock;
7447         }
7448
7449         if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) ||
7450             ((BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
7451              em->block_start != EXTENT_MAP_HOLE)) {
7452                 int type;
7453                 u64 block_start, orig_start, orig_block_len, ram_bytes;
7454
7455                 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
7456                         type = BTRFS_ORDERED_PREALLOC;
7457                 else
7458                         type = BTRFS_ORDERED_NOCOW;
7459                 len = min(len, em->len - (start - em->start));
7460                 block_start = em->block_start + (start - em->start);
7461
7462                 if (can_nocow_extent(inode, start, &len, &orig_start,
7463                                      &orig_block_len, &ram_bytes) == 1) {
7464                         if (type == BTRFS_ORDERED_PREALLOC) {
7465                                 free_extent_map(em);
7466                                 em = create_pinned_em(inode, start, len,
7467                                                        orig_start,
7468                                                        block_start, len,
7469                                                        orig_block_len,
7470                                                        ram_bytes, type);
7471                                 if (IS_ERR(em)) {
7472                                         ret = PTR_ERR(em);
7473                                         goto unlock_err;
7474                                 }
7475                         }
7476
7477                         ret = btrfs_add_ordered_extent_dio(inode, start,
7478                                            block_start, len, len, type);
7479                         if (ret) {
7480                                 free_extent_map(em);
7481                                 goto unlock_err;
7482                         }
7483                         goto unlock;
7484                 }
7485         }
7486
7487         /*
7488          * this will cow the extent, reset the len in case we changed
7489          * it above
7490          */
7491         len = bh_result->b_size;
7492         free_extent_map(em);
7493         em = btrfs_new_extent_direct(inode, start, len);
7494         if (IS_ERR(em)) {
7495                 ret = PTR_ERR(em);
7496                 goto unlock_err;
7497         }
7498         len = min(len, em->len - (start - em->start));
7499 unlock:
7500         bh_result->b_blocknr = (em->block_start + (start - em->start)) >>
7501                 inode->i_blkbits;
7502         bh_result->b_size = len;
7503         bh_result->b_bdev = em->bdev;
7504         set_buffer_mapped(bh_result);
7505         if (create) {
7506                 if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
7507                         set_buffer_new(bh_result);
7508
7509                 /*
7510                  * Need to update the i_size under the extent lock so buffered
7511                  * readers will get the updated i_size when we unlock.
7512                  */
7513                 if (start + len > i_size_read(inode))
7514                         i_size_write(inode, start + len);
7515
7516                 /*
7517                  * If we have an outstanding_extents count still set then we're
7518                  * within our reservation, otherwise we need to adjust our inode
7519                  * counter appropriately.
7520                  */
7521                 if (*outstanding_extents) {
7522                         (*outstanding_extents)--;
7523                 } else {
7524                         spin_lock(&BTRFS_I(inode)->lock);
7525                         BTRFS_I(inode)->outstanding_extents++;
7526                         spin_unlock(&BTRFS_I(inode)->lock);
7527                 }
7528
7529                 current->journal_info = outstanding_extents;
7530                 btrfs_free_reserved_data_space(inode, len);
7531         }
7532
7533         /*
7534          * In the case of write we need to clear and unlock the entire range,
7535          * in the case of read we need to unlock only the end area that we
7536          * aren't using if there is any left over space.
7537          */
7538         if (lockstart < lockend) {
7539                 clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
7540                                  lockend, unlock_bits, 1, 0,
7541                                  &cached_state, GFP_NOFS);
7542         } else {
7543                 free_extent_state(cached_state);
7544         }
7545
7546         free_extent_map(em);
7547
7548         return 0;
7549
7550 unlock_err:
7551         clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
7552                          unlock_bits, 1, 0, &cached_state, GFP_NOFS);
7553         if (outstanding_extents)
7554                 current->journal_info = outstanding_extents;
7555         return ret;
7556 }
7557
7558 static inline int submit_dio_repair_bio(struct inode *inode, struct bio *bio,
7559                                         int rw, int mirror_num)
7560 {
7561         struct btrfs_root *root = BTRFS_I(inode)->root;
7562         int ret;
7563
7564         BUG_ON(rw & REQ_WRITE);
7565
7566         bio_get(bio);
7567
7568         ret = btrfs_bio_wq_end_io(root->fs_info, bio,
7569                                   BTRFS_WQ_ENDIO_DIO_REPAIR);
7570         if (ret)
7571                 goto err;
7572
7573         ret = btrfs_map_bio(root, rw, bio, mirror_num, 0);
7574 err:
7575         bio_put(bio);
7576         return ret;
7577 }
7578
7579 static int btrfs_check_dio_repairable(struct inode *inode,
7580                                       struct bio *failed_bio,
7581                                       struct io_failure_record *failrec,
7582                                       int failed_mirror)
7583 {
7584         int num_copies;
7585
7586         num_copies = btrfs_num_copies(BTRFS_I(inode)->root->fs_info,
7587                                       failrec->logical, failrec->len);
7588         if (num_copies == 1) {
7589                 /*
7590                  * we only have a single copy of the data, so don't bother with
7591                  * all the retry and error correction code that follows. no
7592                  * matter what the error is, it is very likely to persist.
7593                  */
7594                 pr_debug("Check DIO Repairable: cannot repair, num_copies=%d, next_mirror %d, failed_mirror %d\n",
7595                          num_copies, failrec->this_mirror, failed_mirror);
7596                 return 0;
7597         }
7598
7599         failrec->failed_mirror = failed_mirror;
7600         failrec->this_mirror++;
7601         if (failrec->this_mirror == failed_mirror)
7602                 failrec->this_mirror++;
7603
7604         if (failrec->this_mirror > num_copies) {
7605                 pr_debug("Check DIO Repairable: (fail) num_copies=%d, next_mirror %d, failed_mirror %d\n",
7606                          num_copies, failrec->this_mirror, failed_mirror);
7607                 return 0;
7608         }
7609
7610         return 1;
7611 }
7612
7613 static int dio_read_error(struct inode *inode, struct bio *failed_bio,
7614                           struct page *page, u64 start, u64 end,
7615                           int failed_mirror, bio_end_io_t *repair_endio,
7616                           void *repair_arg)
7617 {
7618         struct io_failure_record *failrec;
7619         struct bio *bio;
7620         int isector;
7621         int read_mode;
7622         int ret;
7623
7624         BUG_ON(failed_bio->bi_rw & REQ_WRITE);
7625
7626         ret = btrfs_get_io_failure_record(inode, start, end, &failrec);
7627         if (ret)
7628                 return ret;
7629
7630         ret = btrfs_check_dio_repairable(inode, failed_bio, failrec,
7631                                          failed_mirror);
7632         if (!ret) {
7633                 free_io_failure(inode, failrec);
7634                 return -EIO;
7635         }
7636
7637         if (failed_bio->bi_vcnt > 1)
7638                 read_mode = READ_SYNC | REQ_FAILFAST_DEV;
7639         else
7640                 read_mode = READ_SYNC;
7641
7642         isector = start - btrfs_io_bio(failed_bio)->logical;
7643         isector >>= inode->i_sb->s_blocksize_bits;
7644         bio = btrfs_create_repair_bio(inode, failed_bio, failrec, page,
7645                                       0, isector, repair_endio, repair_arg);
7646         if (!bio) {
7647                 free_io_failure(inode, failrec);
7648                 return -EIO;
7649         }
7650
7651         btrfs_debug(BTRFS_I(inode)->root->fs_info,
7652                     "Repair DIO Read Error: submitting new dio read[%#x] to this_mirror=%d, in_validation=%d\n",
7653                     read_mode, failrec->this_mirror, failrec->in_validation);
7654
7655         ret = submit_dio_repair_bio(inode, bio, read_mode,
7656                                     failrec->this_mirror);
7657         if (ret) {
7658                 free_io_failure(inode, failrec);
7659                 bio_put(bio);
7660         }
7661
7662         return ret;
7663 }
7664
7665 struct btrfs_retry_complete {
7666         struct completion done;
7667         struct inode *inode;
7668         u64 start;
7669         int uptodate;
7670 };
7671
7672 static void btrfs_retry_endio_nocsum(struct bio *bio, int err)
7673 {
7674         struct btrfs_retry_complete *done = bio->bi_private;
7675         struct bio_vec *bvec;
7676         int i;
7677
7678         if (err)
7679                 goto end;
7680
7681         done->uptodate = 1;
7682         bio_for_each_segment_all(bvec, bio, i)
7683                 clean_io_failure(done->inode, done->start, bvec->bv_page, 0);
7684 end:
7685         complete(&done->done);
7686         bio_put(bio);
7687 }
7688
7689 static int __btrfs_correct_data_nocsum(struct inode *inode,
7690                                        struct btrfs_io_bio *io_bio)
7691 {
7692         struct bio_vec *bvec;
7693         struct btrfs_retry_complete done;
7694         u64 start;
7695         int i;
7696         int ret;
7697
7698         start = io_bio->logical;
7699         done.inode = inode;
7700
7701         bio_for_each_segment_all(bvec, &io_bio->bio, i) {
7702 try_again:
7703                 done.uptodate = 0;
7704                 done.start = start;
7705                 init_completion(&done.done);
7706
7707                 ret = dio_read_error(inode, &io_bio->bio, bvec->bv_page, start,
7708                                      start + bvec->bv_len - 1,
7709                                      io_bio->mirror_num,
7710                                      btrfs_retry_endio_nocsum, &done);
7711                 if (ret)
7712                         return ret;
7713
7714                 wait_for_completion(&done.done);
7715
7716                 if (!done.uptodate) {
7717                         /* We might have another mirror, so try again */
7718                         goto try_again;
7719                 }
7720
7721                 start += bvec->bv_len;
7722         }
7723
7724         return 0;
7725 }
7726
7727 static void btrfs_retry_endio(struct bio *bio, int err)
7728 {
7729         struct btrfs_retry_complete *done = bio->bi_private;
7730         struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
7731         struct bio_vec *bvec;
7732         int uptodate;
7733         int ret;
7734         int i;
7735
7736         if (err)
7737                 goto end;
7738
7739         uptodate = 1;
7740         bio_for_each_segment_all(bvec, bio, i) {
7741                 ret = __readpage_endio_check(done->inode, io_bio, i,
7742                                              bvec->bv_page, 0,
7743                                              done->start, bvec->bv_len);
7744                 if (!ret)
7745                         clean_io_failure(done->inode, done->start,
7746                                          bvec->bv_page, 0);
7747                 else
7748                         uptodate = 0;
7749         }
7750
7751         done->uptodate = uptodate;
7752 end:
7753         complete(&done->done);
7754         bio_put(bio);
7755 }
7756
7757 static int __btrfs_subio_endio_read(struct inode *inode,
7758                                     struct btrfs_io_bio *io_bio, int err)
7759 {
7760         struct bio_vec *bvec;
7761         struct btrfs_retry_complete done;
7762         u64 start;
7763         u64 offset = 0;
7764         int i;
7765         int ret;
7766
7767         err = 0;
7768         start = io_bio->logical;
7769         done.inode = inode;
7770
7771         bio_for_each_segment_all(bvec, &io_bio->bio, i) {
7772                 ret = __readpage_endio_check(inode, io_bio, i, bvec->bv_page,
7773                                              0, start, bvec->bv_len);
7774                 if (likely(!ret))
7775                         goto next;
7776 try_again:
7777                 done.uptodate = 0;
7778                 done.start = start;
7779                 init_completion(&done.done);
7780
7781                 ret = dio_read_error(inode, &io_bio->bio, bvec->bv_page, start,
7782                                      start + bvec->bv_len - 1,
7783                                      io_bio->mirror_num,
7784                                      btrfs_retry_endio, &done);
7785                 if (ret) {
7786                         err = ret;
7787                         goto next;
7788                 }
7789
7790                 wait_for_completion(&done.done);
7791
7792                 if (!done.uptodate) {
7793                         /* We might have another mirror, so try again */
7794                         goto try_again;
7795                 }
7796 next:
7797                 offset += bvec->bv_len;
7798                 start += bvec->bv_len;
7799         }
7800
7801         return err;
7802 }
7803
7804 static int btrfs_subio_endio_read(struct inode *inode,
7805                                   struct btrfs_io_bio *io_bio, int err)
7806 {
7807         bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
7808
7809         if (skip_csum) {
7810                 if (unlikely(err))
7811                         return __btrfs_correct_data_nocsum(inode, io_bio);
7812                 else
7813                         return 0;
7814         } else {
7815                 return __btrfs_subio_endio_read(inode, io_bio, err);
7816         }
7817 }
7818
7819 static void btrfs_endio_direct_read(struct bio *bio, int err)
7820 {
7821         struct btrfs_dio_private *dip = bio->bi_private;
7822         struct inode *inode = dip->inode;
7823         struct bio *dio_bio;
7824         struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
7825
7826         if (dip->flags & BTRFS_DIO_ORIG_BIO_SUBMITTED)
7827                 err = btrfs_subio_endio_read(inode, io_bio, err);
7828
7829         unlock_extent(&BTRFS_I(inode)->io_tree, dip->logical_offset,
7830                       dip->logical_offset + dip->bytes - 1);
7831         dio_bio = dip->dio_bio;
7832
7833         kfree(dip);
7834
7835         /* If we had a csum failure make sure to clear the uptodate flag */
7836         if (err)
7837                 clear_bit(BIO_UPTODATE, &dio_bio->bi_flags);
7838         dio_end_io(dio_bio, err);
7839
7840         if (io_bio->end_io)
7841                 io_bio->end_io(io_bio, err);
7842         bio_put(bio);
7843 }
7844
7845 static void btrfs_endio_direct_write(struct bio *bio, int err)
7846 {
7847         struct btrfs_dio_private *dip = bio->bi_private;
7848         struct inode *inode = dip->inode;
7849         struct btrfs_root *root = BTRFS_I(inode)->root;
7850         struct btrfs_ordered_extent *ordered = NULL;
7851         u64 ordered_offset = dip->logical_offset;
7852         u64 ordered_bytes = dip->bytes;
7853         struct bio *dio_bio;
7854         int ret;
7855
7856         if (err)
7857                 goto out_done;
7858 again:
7859         ret = btrfs_dec_test_first_ordered_pending(inode, &ordered,
7860                                                    &ordered_offset,
7861                                                    ordered_bytes, !err);
7862         if (!ret)
7863                 goto out_test;
7864
7865         btrfs_init_work(&ordered->work, btrfs_endio_write_helper,
7866                         finish_ordered_fn, NULL, NULL);
7867         btrfs_queue_work(root->fs_info->endio_write_workers,
7868                          &ordered->work);
7869 out_test:
7870         /*
7871          * our bio might span multiple ordered extents.  If we haven't
7872          * completed the accounting for the whole dio, go back and try again
7873          */
7874         if (ordered_offset < dip->logical_offset + dip->bytes) {
7875                 ordered_bytes = dip->logical_offset + dip->bytes -
7876                         ordered_offset;
7877                 ordered = NULL;
7878                 goto again;
7879         }
7880 out_done:
7881         dio_bio = dip->dio_bio;
7882
7883         kfree(dip);
7884
7885         /* If we had an error make sure to clear the uptodate flag */
7886         if (err)
7887                 clear_bit(BIO_UPTODATE, &dio_bio->bi_flags);
7888         dio_end_io(dio_bio, err);
7889         bio_put(bio);
7890 }
7891
7892 static int __btrfs_submit_bio_start_direct_io(struct inode *inode, int rw,
7893                                     struct bio *bio, int mirror_num,
7894                                     unsigned long bio_flags, u64 offset)
7895 {
7896         int ret;
7897         struct btrfs_root *root = BTRFS_I(inode)->root;
7898         ret = btrfs_csum_one_bio(root, inode, bio, offset, 1);
7899         BUG_ON(ret); /* -ENOMEM */
7900         return 0;
7901 }
7902
7903 static void btrfs_end_dio_bio(struct bio *bio, int err)
7904 {
7905         struct btrfs_dio_private *dip = bio->bi_private;
7906
7907         if (err)
7908                 btrfs_warn(BTRFS_I(dip->inode)->root->fs_info,
7909                            "direct IO failed ino %llu rw %lu sector %#Lx len %u err no %d",
7910                            btrfs_ino(dip->inode), bio->bi_rw,
7911                            (unsigned long long)bio->bi_iter.bi_sector,
7912                            bio->bi_iter.bi_size, err);
7913
7914         if (dip->subio_endio)
7915                 err = dip->subio_endio(dip->inode, btrfs_io_bio(bio), err);
7916
7917         if (err) {
7918                 dip->errors = 1;
7919
7920                 /*
7921                  * before atomic variable goto zero, we must make sure
7922                  * dip->errors is perceived to be set.
7923                  */
7924                 smp_mb__before_atomic();
7925         }
7926
7927         /* if there are more bios still pending for this dio, just exit */
7928         if (!atomic_dec_and_test(&dip->pending_bios))
7929                 goto out;
7930
7931         if (dip->errors) {
7932                 bio_io_error(dip->orig_bio);
7933         } else {
7934                 set_bit(BIO_UPTODATE, &dip->dio_bio->bi_flags);
7935                 bio_endio(dip->orig_bio, 0);
7936         }
7937 out:
7938         bio_put(bio);
7939 }
7940
7941 static struct bio *btrfs_dio_bio_alloc(struct block_device *bdev,
7942                                        u64 first_sector, gfp_t gfp_flags)
7943 {
7944         int nr_vecs = bio_get_nr_vecs(bdev);
7945         return btrfs_bio_alloc(bdev, first_sector, nr_vecs, gfp_flags);
7946 }
7947
7948 static inline int btrfs_lookup_and_bind_dio_csum(struct btrfs_root *root,
7949                                                  struct inode *inode,
7950                                                  struct btrfs_dio_private *dip,
7951                                                  struct bio *bio,
7952                                                  u64 file_offset)
7953 {
7954         struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
7955         struct btrfs_io_bio *orig_io_bio = btrfs_io_bio(dip->orig_bio);
7956         int ret;
7957
7958         /*
7959          * We load all the csum data we need when we submit
7960          * the first bio to reduce the csum tree search and
7961          * contention.
7962          */
7963         if (dip->logical_offset == file_offset) {
7964                 ret = btrfs_lookup_bio_sums_dio(root, inode, dip->orig_bio,
7965                                                 file_offset);
7966                 if (ret)
7967                         return ret;
7968         }
7969
7970         if (bio == dip->orig_bio)
7971                 return 0;
7972
7973         file_offset -= dip->logical_offset;
7974         file_offset >>= inode->i_sb->s_blocksize_bits;
7975         io_bio->csum = (u8 *)(((u32 *)orig_io_bio->csum) + file_offset);
7976
7977         return 0;
7978 }
7979
7980 static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode,
7981                                          int rw, u64 file_offset, int skip_sum,
7982                                          int async_submit)
7983 {
7984         struct btrfs_dio_private *dip = bio->bi_private;
7985         int write = rw & REQ_WRITE;
7986         struct btrfs_root *root = BTRFS_I(inode)->root;
7987         int ret;
7988
7989         if (async_submit)
7990                 async_submit = !atomic_read(&BTRFS_I(inode)->sync_writers);
7991
7992         bio_get(bio);
7993
7994         if (!write) {
7995                 ret = btrfs_bio_wq_end_io(root->fs_info, bio,
7996                                 BTRFS_WQ_ENDIO_DATA);
7997                 if (ret)
7998                         goto err;
7999         }
8000
8001         if (skip_sum)
8002                 goto map;
8003
8004         if (write && async_submit) {
8005                 ret = btrfs_wq_submit_bio(root->fs_info,
8006                                    inode, rw, bio, 0, 0,
8007                                    file_offset,
8008                                    __btrfs_submit_bio_start_direct_io,
8009                                    __btrfs_submit_bio_done);
8010                 goto err;
8011         } else if (write) {
8012                 /*
8013                  * If we aren't doing async submit, calculate the csum of the
8014                  * bio now.
8015                  */
8016                 ret = btrfs_csum_one_bio(root, inode, bio, file_offset, 1);
8017                 if (ret)
8018                         goto err;
8019         } else {
8020                 ret = btrfs_lookup_and_bind_dio_csum(root, inode, dip, bio,
8021                                                      file_offset);
8022                 if (ret)
8023                         goto err;
8024         }
8025 map:
8026         ret = btrfs_map_bio(root, rw, bio, 0, async_submit);
8027 err:
8028         bio_put(bio);
8029         return ret;
8030 }
8031
8032 static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
8033                                     int skip_sum)
8034 {
8035         struct inode *inode = dip->inode;
8036         struct btrfs_root *root = BTRFS_I(inode)->root;
8037         struct bio *bio;
8038         struct bio *orig_bio = dip->orig_bio;
8039         struct bio_vec *bvec = orig_bio->bi_io_vec;
8040         u64 start_sector = orig_bio->bi_iter.bi_sector;
8041         u64 file_offset = dip->logical_offset;
8042         u64 submit_len = 0;
8043         u64 map_length;
8044         int nr_pages = 0;
8045         int ret;
8046         int async_submit = 0;
8047
8048         map_length = orig_bio->bi_iter.bi_size;
8049         ret = btrfs_map_block(root->fs_info, rw, start_sector << 9,
8050                               &map_length, NULL, 0);
8051         if (ret)
8052                 return -EIO;
8053
8054         if (map_length >= orig_bio->bi_iter.bi_size) {
8055                 bio = orig_bio;
8056                 dip->flags |= BTRFS_DIO_ORIG_BIO_SUBMITTED;
8057                 goto submit;
8058         }
8059
8060         /* async crcs make it difficult to collect full stripe writes. */
8061         if (btrfs_get_alloc_profile(root, 1) & BTRFS_BLOCK_GROUP_RAID56_MASK)
8062                 async_submit = 0;
8063         else
8064                 async_submit = 1;
8065
8066         bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev, start_sector, GFP_NOFS);
8067         if (!bio)
8068                 return -ENOMEM;
8069
8070         bio->bi_private = dip;
8071         bio->bi_end_io = btrfs_end_dio_bio;
8072         btrfs_io_bio(bio)->logical = file_offset;
8073         atomic_inc(&dip->pending_bios);
8074
8075         while (bvec <= (orig_bio->bi_io_vec + orig_bio->bi_vcnt - 1)) {
8076                 if (map_length < submit_len + bvec->bv_len ||
8077                     bio_add_page(bio, bvec->bv_page, bvec->bv_len,
8078                                  bvec->bv_offset) < bvec->bv_len) {
8079                         /*
8080                          * inc the count before we submit the bio so
8081                          * we know the end IO handler won't happen before
8082                          * we inc the count. Otherwise, the dip might get freed
8083                          * before we're done setting it up
8084                          */
8085                         atomic_inc(&dip->pending_bios);
8086                         ret = __btrfs_submit_dio_bio(bio, inode, rw,
8087                                                      file_offset, skip_sum,
8088                                                      async_submit);
8089                         if (ret) {
8090                                 bio_put(bio);
8091                                 atomic_dec(&dip->pending_bios);
8092                                 goto out_err;
8093                         }
8094
8095                         start_sector += submit_len >> 9;
8096                         file_offset += submit_len;
8097
8098                         submit_len = 0;
8099                         nr_pages = 0;
8100
8101                         bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev,
8102                                                   start_sector, GFP_NOFS);
8103                         if (!bio)
8104                                 goto out_err;
8105                         bio->bi_private = dip;
8106                         bio->bi_end_io = btrfs_end_dio_bio;
8107                         btrfs_io_bio(bio)->logical = file_offset;
8108
8109                         map_length = orig_bio->bi_iter.bi_size;
8110                         ret = btrfs_map_block(root->fs_info, rw,
8111                                               start_sector << 9,
8112                                               &map_length, NULL, 0);
8113                         if (ret) {
8114                                 bio_put(bio);
8115                                 goto out_err;
8116                         }
8117                 } else {
8118                         submit_len += bvec->bv_len;
8119                         nr_pages++;
8120                         bvec++;
8121                 }
8122         }
8123
8124 submit:
8125         ret = __btrfs_submit_dio_bio(bio, inode, rw, file_offset, skip_sum,
8126                                      async_submit);
8127         if (!ret)
8128                 return 0;
8129
8130         bio_put(bio);
8131 out_err:
8132         dip->errors = 1;
8133         /*
8134          * before atomic variable goto zero, we must
8135          * make sure dip->errors is perceived to be set.
8136          */
8137         smp_mb__before_atomic();
8138         if (atomic_dec_and_test(&dip->pending_bios))
8139                 bio_io_error(dip->orig_bio);
8140
8141         /* bio_end_io() will handle error, so we needn't return it */
8142         return 0;
8143 }
8144
8145 static void btrfs_submit_direct(int rw, struct bio *dio_bio,
8146                                 struct inode *inode, loff_t file_offset)
8147 {
8148         struct btrfs_root *root = BTRFS_I(inode)->root;
8149         struct btrfs_dio_private *dip;
8150         struct bio *io_bio;
8151         struct btrfs_io_bio *btrfs_bio;
8152         int skip_sum;
8153         int write = rw & REQ_WRITE;
8154         int ret = 0;
8155
8156         skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
8157
8158         io_bio = btrfs_bio_clone(dio_bio, GFP_NOFS);
8159         if (!io_bio) {
8160                 ret = -ENOMEM;
8161                 goto free_ordered;
8162         }
8163
8164         dip = kzalloc(sizeof(*dip), GFP_NOFS);
8165         if (!dip) {
8166                 ret = -ENOMEM;
8167                 goto free_io_bio;
8168         }
8169
8170         dip->private = dio_bio->bi_private;
8171         dip->inode = inode;
8172         dip->logical_offset = file_offset;
8173         dip->bytes = dio_bio->bi_iter.bi_size;
8174         dip->disk_bytenr = (u64)dio_bio->bi_iter.bi_sector << 9;
8175         io_bio->bi_private = dip;
8176         dip->orig_bio = io_bio;
8177         dip->dio_bio = dio_bio;
8178         atomic_set(&dip->pending_bios, 0);
8179         btrfs_bio = btrfs_io_bio(io_bio);
8180         btrfs_bio->logical = file_offset;
8181
8182         if (write) {
8183                 io_bio->bi_end_io = btrfs_endio_direct_write;
8184         } else {
8185                 io_bio->bi_end_io = btrfs_endio_direct_read;
8186                 dip->subio_endio = btrfs_subio_endio_read;
8187         }
8188
8189         ret = btrfs_submit_direct_hook(rw, dip, skip_sum);
8190         if (!ret)
8191                 return;
8192
8193         if (btrfs_bio->end_io)
8194                 btrfs_bio->end_io(btrfs_bio, ret);
8195 free_io_bio:
8196         bio_put(io_bio);
8197
8198 free_ordered:
8199         /*
8200          * If this is a write, we need to clean up the reserved space and kill
8201          * the ordered extent.
8202          */
8203         if (write) {
8204                 struct btrfs_ordered_extent *ordered;
8205                 ordered = btrfs_lookup_ordered_extent(inode, file_offset);
8206                 if (!test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags) &&
8207                     !test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags))
8208                         btrfs_free_reserved_extent(root, ordered->start,
8209                                                    ordered->disk_len, 1);
8210                 btrfs_put_ordered_extent(ordered);
8211                 btrfs_put_ordered_extent(ordered);
8212         }
8213         bio_endio(dio_bio, ret);
8214 }
8215
8216 static ssize_t check_direct_IO(struct btrfs_root *root, int rw, struct kiocb *iocb,
8217                         const struct iov_iter *iter, loff_t offset)
8218 {
8219         int seg;
8220         int i;
8221         unsigned blocksize_mask = root->sectorsize - 1;
8222         ssize_t retval = -EINVAL;
8223
8224         if (offset & blocksize_mask)
8225                 goto out;
8226
8227         if (iov_iter_alignment(iter) & blocksize_mask)
8228                 goto out;
8229
8230         /* If this is a write we don't need to check anymore */
8231         if (rw & WRITE)
8232                 return 0;
8233         /*
8234          * Check to make sure we don't have duplicate iov_base's in this
8235          * iovec, if so return EINVAL, otherwise we'll get csum errors
8236          * when reading back.
8237          */
8238         for (seg = 0; seg < iter->nr_segs; seg++) {
8239                 for (i = seg + 1; i < iter->nr_segs; i++) {
8240                         if (iter->iov[seg].iov_base == iter->iov[i].iov_base)
8241                                 goto out;
8242                 }
8243         }
8244         retval = 0;
8245 out:
8246         return retval;
8247 }
8248
8249 static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
8250                         struct iov_iter *iter, loff_t offset)
8251 {
8252         struct file *file = iocb->ki_filp;
8253         struct inode *inode = file->f_mapping->host;
8254         u64 outstanding_extents = 0;
8255         size_t count = 0;
8256         int flags = 0;
8257         bool wakeup = true;
8258         bool relock = false;
8259         ssize_t ret;
8260
8261         if (check_direct_IO(BTRFS_I(inode)->root, rw, iocb, iter, offset))
8262                 return 0;
8263
8264         atomic_inc(&inode->i_dio_count);
8265         smp_mb__after_atomic();
8266
8267         /*
8268          * The generic stuff only does filemap_write_and_wait_range, which
8269          * isn't enough if we've written compressed pages to this area, so
8270          * we need to flush the dirty pages again to make absolutely sure
8271          * that any outstanding dirty pages are on disk.
8272          */
8273         count = iov_iter_count(iter);
8274         if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
8275                      &BTRFS_I(inode)->runtime_flags))
8276                 filemap_fdatawrite_range(inode->i_mapping, offset,
8277                                          offset + count - 1);
8278
8279         if (rw & WRITE) {
8280                 /*
8281                  * If the write DIO is beyond the EOF, we need update
8282                  * the isize, but it is protected by i_mutex. So we can
8283                  * not unlock the i_mutex at this case.
8284                  */
8285                 if (offset + count <= inode->i_size) {
8286                         mutex_unlock(&inode->i_mutex);
8287                         relock = true;
8288                 }
8289                 ret = btrfs_delalloc_reserve_space(inode, count);
8290                 if (ret)
8291                         goto out;
8292                 outstanding_extents = div64_u64(count +
8293                                                 BTRFS_MAX_EXTENT_SIZE - 1,
8294                                                 BTRFS_MAX_EXTENT_SIZE);
8295
8296                 /*
8297                  * We need to know how many extents we reserved so that we can
8298                  * do the accounting properly if we go over the number we
8299                  * originally calculated.  Abuse current->journal_info for this.
8300                  */
8301                 current->journal_info = &outstanding_extents;
8302         } else if (test_bit(BTRFS_INODE_READDIO_NEED_LOCK,
8303                                      &BTRFS_I(inode)->runtime_flags)) {
8304                 inode_dio_done(inode);
8305                 flags = DIO_LOCKING | DIO_SKIP_HOLES;
8306                 wakeup = false;
8307         }
8308
8309         ret = __blockdev_direct_IO(rw, iocb, inode,
8310                         BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev,
8311                         iter, offset, btrfs_get_blocks_direct, NULL,
8312                         btrfs_submit_direct, flags);
8313         if (rw & WRITE) {
8314                 current->journal_info = NULL;
8315                 if (ret < 0 && ret != -EIOCBQUEUED)
8316                         btrfs_delalloc_release_space(inode, count);
8317                 else if (ret >= 0 && (size_t)ret < count)
8318                         btrfs_delalloc_release_space(inode,
8319                                                      count - (size_t)ret);
8320         }
8321 out:
8322         if (wakeup)
8323                 inode_dio_done(inode);
8324         if (relock)
8325                 mutex_lock(&inode->i_mutex);
8326
8327         return ret;
8328 }
8329
8330 #define BTRFS_FIEMAP_FLAGS      (FIEMAP_FLAG_SYNC)
8331
8332 static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
8333                 __u64 start, __u64 len)
8334 {
8335         int     ret;
8336
8337         ret = fiemap_check_flags(fieinfo, BTRFS_FIEMAP_FLAGS);
8338         if (ret)
8339                 return ret;
8340
8341         return extent_fiemap(inode, fieinfo, start, len, btrfs_get_extent_fiemap);
8342 }
8343
8344 int btrfs_readpage(struct file *file, struct page *page)
8345 {
8346         struct extent_io_tree *tree;
8347         tree = &BTRFS_I(page->mapping->host)->io_tree;
8348         return extent_read_full_page(tree, page, btrfs_get_extent, 0);
8349 }
8350
8351 static int btrfs_writepage(struct page *page, struct writeback_control *wbc)
8352 {
8353         struct extent_io_tree *tree;
8354
8355
8356         if (current->flags & PF_MEMALLOC) {
8357                 redirty_page_for_writepage(wbc, page);
8358                 unlock_page(page);
8359                 return 0;
8360         }
8361         tree = &BTRFS_I(page->mapping->host)->io_tree;
8362         return extent_write_full_page(tree, page, btrfs_get_extent, wbc);
8363 }
8364
8365 static int btrfs_writepages(struct address_space *mapping,
8366                             struct writeback_control *wbc)
8367 {
8368         struct extent_io_tree *tree;
8369
8370         tree = &BTRFS_I(mapping->host)->io_tree;
8371         return extent_writepages(tree, mapping, btrfs_get_extent, wbc);
8372 }
8373
8374 static int
8375 btrfs_readpages(struct file *file, struct address_space *mapping,
8376                 struct list_head *pages, unsigned nr_pages)
8377 {
8378         struct extent_io_tree *tree;
8379         tree = &BTRFS_I(mapping->host)->io_tree;
8380         return extent_readpages(tree, mapping, pages, nr_pages,
8381                                 btrfs_get_extent);
8382 }
8383 static int __btrfs_releasepage(struct page *page, gfp_t gfp_flags)
8384 {
8385         struct extent_io_tree *tree;
8386         struct extent_map_tree *map;
8387         int ret;
8388
8389         tree = &BTRFS_I(page->mapping->host)->io_tree;
8390         map = &BTRFS_I(page->mapping->host)->extent_tree;
8391         ret = try_release_extent_mapping(map, tree, page, gfp_flags);
8392         if (ret == 1) {
8393                 ClearPagePrivate(page);
8394                 set_page_private(page, 0);
8395                 page_cache_release(page);
8396         }
8397         return ret;
8398 }
8399
8400 static int btrfs_releasepage(struct page *page, gfp_t gfp_flags)
8401 {
8402         if (PageWriteback(page) || PageDirty(page))
8403                 return 0;
8404         return __btrfs_releasepage(page, gfp_flags & GFP_NOFS);
8405 }
8406
8407 static void btrfs_invalidatepage(struct page *page, unsigned int offset,
8408                                  unsigned int length)
8409 {
8410         struct inode *inode = page->mapping->host;
8411         struct extent_io_tree *tree;
8412         struct btrfs_ordered_extent *ordered;
8413         struct extent_state *cached_state = NULL;
8414         u64 page_start = page_offset(page);
8415         u64 page_end = page_start + PAGE_CACHE_SIZE - 1;
8416         int inode_evicting = inode->i_state & I_FREEING;
8417
8418         /*
8419          * we have the page locked, so new writeback can't start,
8420          * and the dirty bit won't be cleared while we are here.
8421          *
8422          * Wait for IO on this page so that we can safely clear
8423          * the PagePrivate2 bit and do ordered accounting
8424          */
8425         wait_on_page_writeback(page);
8426
8427         tree = &BTRFS_I(inode)->io_tree;
8428         if (offset) {
8429                 btrfs_releasepage(page, GFP_NOFS);
8430                 return;
8431         }
8432
8433         if (!inode_evicting)
8434                 lock_extent_bits(tree, page_start, page_end, 0, &cached_state);
8435         ordered = btrfs_lookup_ordered_extent(inode, page_start);
8436         if (ordered) {
8437                 /*
8438                  * IO on this page will never be started, so we need
8439                  * to account for any ordered extents now
8440                  */
8441                 if (!inode_evicting)
8442                         clear_extent_bit(tree, page_start, page_end,
8443                                          EXTENT_DIRTY | EXTENT_DELALLOC |
8444                                          EXTENT_LOCKED | EXTENT_DO_ACCOUNTING |
8445                                          EXTENT_DEFRAG, 1, 0, &cached_state,
8446                                          GFP_NOFS);
8447                 /*
8448                  * whoever cleared the private bit is responsible
8449                  * for the finish_ordered_io
8450                  */
8451                 if (TestClearPagePrivate2(page)) {
8452                         struct btrfs_ordered_inode_tree *tree;
8453                         u64 new_len;
8454
8455                         tree = &BTRFS_I(inode)->ordered_tree;
8456
8457                         spin_lock_irq(&tree->lock);
8458                         set_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags);
8459                         new_len = page_start - ordered->file_offset;
8460                         if (new_len < ordered->truncated_len)
8461                                 ordered->truncated_len = new_len;
8462                         spin_unlock_irq(&tree->lock);
8463
8464                         if (btrfs_dec_test_ordered_pending(inode, &ordered,
8465                                                            page_start,
8466                                                            PAGE_CACHE_SIZE, 1))
8467                                 btrfs_finish_ordered_io(ordered);
8468                 }
8469                 btrfs_put_ordered_extent(ordered);
8470                 if (!inode_evicting) {
8471                         cached_state = NULL;
8472                         lock_extent_bits(tree, page_start, page_end, 0,
8473                                          &cached_state);
8474                 }
8475         }
8476
8477         if (!inode_evicting) {
8478                 clear_extent_bit(tree, page_start, page_end,
8479                                  EXTENT_LOCKED | EXTENT_DIRTY |
8480                                  EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
8481                                  EXTENT_DEFRAG, 1, 1,
8482                                  &cached_state, GFP_NOFS);
8483
8484                 __btrfs_releasepage(page, GFP_NOFS);
8485         }
8486
8487         ClearPageChecked(page);
8488         if (PagePrivate(page)) {
8489                 ClearPagePrivate(page);
8490                 set_page_private(page, 0);
8491                 page_cache_release(page);
8492         }
8493 }
8494
8495 /*
8496  * btrfs_page_mkwrite() is not allowed to change the file size as it gets
8497  * called from a page fault handler when a page is first dirtied. Hence we must
8498  * be careful to check for EOF conditions here. We set the page up correctly
8499  * for a written page which means we get ENOSPC checking when writing into
8500  * holes and correct delalloc and unwritten extent mapping on filesystems that
8501  * support these features.
8502  *
8503  * We are not allowed to take the i_mutex here so we have to play games to
8504  * protect against truncate races as the page could now be beyond EOF.  Because
8505  * vmtruncate() writes the inode size before removing pages, once we have the
8506  * page lock we can determine safely if the page is beyond EOF. If it is not
8507  * beyond EOF, then the page is guaranteed safe against truncation until we
8508  * unlock the page.
8509  */
8510 int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
8511 {
8512         struct page *page = vmf->page;
8513         struct inode *inode = file_inode(vma->vm_file);
8514         struct btrfs_root *root = BTRFS_I(inode)->root;
8515         struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
8516         struct btrfs_ordered_extent *ordered;
8517         struct extent_state *cached_state = NULL;
8518         char *kaddr;
8519         unsigned long zero_start;
8520         loff_t size;
8521         int ret;
8522         int reserved = 0;
8523         u64 page_start;
8524         u64 page_end;
8525
8526         sb_start_pagefault(inode->i_sb);
8527         ret  = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
8528         if (!ret) {
8529                 ret = file_update_time(vma->vm_file);
8530                 reserved = 1;
8531         }
8532         if (ret) {
8533                 if (ret == -ENOMEM)
8534                         ret = VM_FAULT_OOM;
8535                 else /* -ENOSPC, -EIO, etc */
8536                         ret = VM_FAULT_SIGBUS;
8537                 if (reserved)
8538                         goto out;
8539                 goto out_noreserve;
8540         }
8541
8542         ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */
8543 again:
8544         lock_page(page);
8545         size = i_size_read(inode);
8546         page_start = page_offset(page);
8547         page_end = page_start + PAGE_CACHE_SIZE - 1;
8548
8549         if ((page->mapping != inode->i_mapping) ||
8550             (page_start >= size)) {
8551                 /* page got truncated out from underneath us */
8552                 goto out_unlock;
8553         }
8554         wait_on_page_writeback(page);
8555
8556         lock_extent_bits(io_tree, page_start, page_end, 0, &cached_state);
8557         set_page_extent_mapped(page);
8558
8559         /*
8560          * we can't set the delalloc bits if there are pending ordered
8561          * extents.  Drop our locks and wait for them to finish
8562          */
8563         ordered = btrfs_lookup_ordered_extent(inode, page_start);
8564         if (ordered) {
8565                 unlock_extent_cached(io_tree, page_start, page_end,
8566                                      &cached_state, GFP_NOFS);
8567                 unlock_page(page);
8568                 btrfs_start_ordered_extent(inode, ordered, 1);
8569                 btrfs_put_ordered_extent(ordered);
8570                 goto again;
8571         }
8572
8573         /*
8574          * XXX - page_mkwrite gets called every time the page is dirtied, even
8575          * if it was already dirty, so for space accounting reasons we need to
8576          * clear any delalloc bits for the range we are fixing to save.  There
8577          * is probably a better way to do this, but for now keep consistent with
8578          * prepare_pages in the normal write path.
8579          */
8580         clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, page_end,
8581                           EXTENT_DIRTY | EXTENT_DELALLOC |
8582                           EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
8583                           0, 0, &cached_state, GFP_NOFS);
8584
8585         ret = btrfs_set_extent_delalloc(inode, page_start, page_end,
8586                                         &cached_state);
8587         if (ret) {
8588                 unlock_extent_cached(io_tree, page_start, page_end,
8589                                      &cached_state, GFP_NOFS);
8590                 ret = VM_FAULT_SIGBUS;
8591                 goto out_unlock;
8592         }
8593         ret = 0;
8594
8595         /* page is wholly or partially inside EOF */
8596         if (page_start + PAGE_CACHE_SIZE > size)
8597                 zero_start = size & ~PAGE_CACHE_MASK;
8598         else
8599                 zero_start = PAGE_CACHE_SIZE;
8600
8601         if (zero_start != PAGE_CACHE_SIZE) {
8602                 kaddr = kmap(page);
8603                 memset(kaddr + zero_start, 0, PAGE_CACHE_SIZE - zero_start);
8604                 flush_dcache_page(page);
8605                 kunmap(page);
8606         }
8607         ClearPageChecked(page);
8608         set_page_dirty(page);
8609         SetPageUptodate(page);
8610
8611         BTRFS_I(inode)->last_trans = root->fs_info->generation;
8612         BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid;
8613         BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->root->last_log_commit;
8614
8615         unlock_extent_cached(io_tree, page_start, page_end, &cached_state, GFP_NOFS);
8616
8617 out_unlock:
8618         if (!ret) {
8619                 sb_end_pagefault(inode->i_sb);
8620                 return VM_FAULT_LOCKED;
8621         }
8622         unlock_page(page);
8623 out:
8624         btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
8625 out_noreserve:
8626         sb_end_pagefault(inode->i_sb);
8627         return ret;
8628 }
8629
8630 static int btrfs_truncate(struct inode *inode)
8631 {
8632         struct btrfs_root *root = BTRFS_I(inode)->root;
8633         struct btrfs_block_rsv *rsv;
8634         int ret = 0;
8635         int err = 0;
8636         struct btrfs_trans_handle *trans;
8637         u64 mask = root->sectorsize - 1;
8638         u64 min_size = btrfs_calc_trunc_metadata_size(root, 1);
8639
8640         ret = btrfs_wait_ordered_range(inode, inode->i_size & (~mask),
8641                                        (u64)-1);
8642         if (ret)
8643                 return ret;
8644
8645         /*
8646          * Yes ladies and gentelment, this is indeed ugly.  The fact is we have
8647          * 3 things going on here
8648          *
8649          * 1) We need to reserve space for our orphan item and the space to
8650          * delete our orphan item.  Lord knows we don't want to have a dangling
8651          * orphan item because we didn't reserve space to remove it.
8652          *
8653          * 2) We need to reserve space to update our inode.
8654          *
8655          * 3) We need to have something to cache all the space that is going to
8656          * be free'd up by the truncate operation, but also have some slack
8657          * space reserved in case it uses space during the truncate (thank you
8658          * very much snapshotting).
8659          *
8660          * And we need these to all be seperate.  The fact is we can use alot of
8661          * space doing the truncate, and we have no earthly idea how much space
8662          * we will use, so we need the truncate reservation to be seperate so it
8663          * doesn't end up using space reserved for updating the inode or
8664          * removing the orphan item.  We also need to be able to stop the
8665          * transaction and start a new one, which means we need to be able to
8666          * update the inode several times, and we have no idea of knowing how
8667          * many times that will be, so we can't just reserve 1 item for the
8668          * entirety of the opration, so that has to be done seperately as well.
8669          * Then there is the orphan item, which does indeed need to be held on
8670          * to for the whole operation, and we need nobody to touch this reserved
8671          * space except the orphan code.
8672          *
8673          * So that leaves us with
8674          *
8675          * 1) root->orphan_block_rsv - for the orphan deletion.
8676          * 2) rsv - for the truncate reservation, which we will steal from the
8677          * transaction reservation.
8678          * 3) fs_info->trans_block_rsv - this will have 1 items worth left for
8679          * updating the inode.
8680          */
8681         rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP);
8682         if (!rsv)
8683                 return -ENOMEM;
8684         rsv->size = min_size;
8685         rsv->failfast = 1;
8686
8687         /*
8688          * 1 for the truncate slack space
8689          * 1 for updating the inode.
8690          */
8691         trans = btrfs_start_transaction(root, 2);
8692         if (IS_ERR(trans)) {
8693                 err = PTR_ERR(trans);
8694                 goto out;
8695         }
8696
8697         /* Migrate the slack space for the truncate to our reserve */
8698         ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv, rsv,
8699                                       min_size);
8700         BUG_ON(ret);
8701
8702         /*
8703          * So if we truncate and then write and fsync we normally would just
8704          * write the extents that changed, which is a problem if we need to
8705          * first truncate that entire inode.  So set this flag so we write out
8706          * all of the extents in the inode to the sync log so we're completely
8707          * safe.
8708          */
8709         set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags);
8710         trans->block_rsv = rsv;
8711
8712         while (1) {
8713                 ret = btrfs_truncate_inode_items(trans, root, inode,
8714                                                  inode->i_size,
8715                                                  BTRFS_EXTENT_DATA_KEY);
8716                 if (ret != -ENOSPC && ret != -EAGAIN) {
8717                         err = ret;
8718                         break;
8719                 }
8720
8721                 trans->block_rsv = &root->fs_info->trans_block_rsv;
8722                 ret = btrfs_update_inode(trans, root, inode);
8723                 if (ret) {
8724                         err = ret;
8725                         break;
8726                 }
8727
8728                 btrfs_end_transaction(trans, root);
8729                 btrfs_btree_balance_dirty(root);
8730
8731                 trans = btrfs_start_transaction(root, 2);
8732                 if (IS_ERR(trans)) {
8733                         ret = err = PTR_ERR(trans);
8734                         trans = NULL;
8735                         break;
8736                 }
8737
8738                 ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv,
8739                                               rsv, min_size);
8740                 BUG_ON(ret);    /* shouldn't happen */
8741                 trans->block_rsv = rsv;
8742         }
8743
8744         if (ret == 0 && inode->i_nlink > 0) {
8745                 trans->block_rsv = root->orphan_block_rsv;
8746                 ret = btrfs_orphan_del(trans, inode);
8747                 if (ret)
8748                         err = ret;
8749         }
8750
8751         if (trans) {
8752                 trans->block_rsv = &root->fs_info->trans_block_rsv;
8753                 ret = btrfs_update_inode(trans, root, inode);
8754                 if (ret && !err)
8755                         err = ret;
8756
8757                 ret = btrfs_end_transaction(trans, root);
8758                 btrfs_btree_balance_dirty(root);
8759         }
8760
8761 out:
8762         btrfs_free_block_rsv(root, rsv);
8763
8764         if (ret && !err)
8765                 err = ret;
8766
8767         return err;
8768 }
8769
8770 /*
8771  * create a new subvolume directory/inode (helper for the ioctl).
8772  */
8773 int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
8774                              struct btrfs_root *new_root,
8775                              struct btrfs_root *parent_root,
8776                              u64 new_dirid)
8777 {
8778         struct inode *inode;
8779         int err;
8780         u64 index = 0;
8781
8782         inode = btrfs_new_inode(trans, new_root, NULL, "..", 2,
8783                                 new_dirid, new_dirid,
8784                                 S_IFDIR | (~current_umask() & S_IRWXUGO),
8785                                 &index);
8786         if (IS_ERR(inode))
8787                 return PTR_ERR(inode);
8788         inode->i_op = &btrfs_dir_inode_operations;
8789         inode->i_fop = &btrfs_dir_file_operations;
8790
8791         set_nlink(inode, 1);
8792         btrfs_i_size_write(inode, 0);
8793         unlock_new_inode(inode);
8794
8795         err = btrfs_subvol_inherit_props(trans, new_root, parent_root);
8796         if (err)
8797                 btrfs_err(new_root->fs_info,
8798                           "error inheriting subvolume %llu properties: %d",
8799                           new_root->root_key.objectid, err);
8800
8801         err = btrfs_update_inode(trans, new_root, inode);
8802
8803         iput(inode);
8804         return err;
8805 }
8806
8807 struct inode *btrfs_alloc_inode(struct super_block *sb)
8808 {
8809         struct btrfs_inode *ei;
8810         struct inode *inode;
8811
8812         ei = kmem_cache_alloc(btrfs_inode_cachep, GFP_NOFS);
8813         if (!ei)
8814                 return NULL;
8815
8816         ei->root = NULL;
8817         ei->generation = 0;
8818         ei->last_trans = 0;
8819         ei->last_sub_trans = 0;
8820         ei->logged_trans = 0;
8821         ei->delalloc_bytes = 0;
8822         ei->defrag_bytes = 0;
8823         ei->disk_i_size = 0;
8824         ei->flags = 0;
8825         ei->csum_bytes = 0;
8826         ei->index_cnt = (u64)-1;
8827         ei->dir_index = 0;
8828         ei->last_unlink_trans = 0;
8829         ei->last_log_commit = 0;
8830
8831         spin_lock_init(&ei->lock);
8832         ei->outstanding_extents = 0;
8833         ei->reserved_extents = 0;
8834
8835         ei->runtime_flags = 0;
8836         ei->force_compress = BTRFS_COMPRESS_NONE;
8837
8838         ei->delayed_node = NULL;
8839
8840         ei->i_otime.tv_sec = 0;
8841         ei->i_otime.tv_nsec = 0;
8842
8843         inode = &ei->vfs_inode;
8844         extent_map_tree_init(&ei->extent_tree);
8845         extent_io_tree_init(&ei->io_tree, &inode->i_data);
8846         extent_io_tree_init(&ei->io_failure_tree, &inode->i_data);
8847         ei->io_tree.track_uptodate = 1;
8848         ei->io_failure_tree.track_uptodate = 1;
8849         atomic_set(&ei->sync_writers, 0);
8850         mutex_init(&ei->log_mutex);
8851         mutex_init(&ei->delalloc_mutex);
8852         btrfs_ordered_inode_tree_init(&ei->ordered_tree);
8853         INIT_LIST_HEAD(&ei->delalloc_inodes);
8854         RB_CLEAR_NODE(&ei->rb_node);
8855
8856         return inode;
8857 }
8858
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
/*
 * Sanity-test variant of inode destruction: drop the whole extent cache
 * range of @inode and return the btrfs inode to its slab cache right away.
 */
void btrfs_test_destroy_inode(struct inode *inode)
{
        struct btrfs_inode *binode = BTRFS_I(inode);

        btrfs_drop_extent_cache(inode, 0, (u64)-1, 0);
        kmem_cache_free(btrfs_inode_cachep, binode);
}
#endif
8866
8867 static void btrfs_i_callback(struct rcu_head *head)
8868 {
8869         struct inode *inode = container_of(head, struct inode, i_rcu);
8870         kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
8871 }
8872
8873 void btrfs_destroy_inode(struct inode *inode)
8874 {
8875         struct btrfs_ordered_extent *ordered;
8876         struct btrfs_root *root = BTRFS_I(inode)->root;
8877
8878         WARN_ON(!hlist_empty(&inode->i_dentry));
8879         WARN_ON(inode->i_data.nrpages);
8880         WARN_ON(BTRFS_I(inode)->outstanding_extents);
8881         WARN_ON(BTRFS_I(inode)->reserved_extents);
8882         WARN_ON(BTRFS_I(inode)->delalloc_bytes);
8883         WARN_ON(BTRFS_I(inode)->csum_bytes);
8884         WARN_ON(BTRFS_I(inode)->defrag_bytes);
8885
8886         /*
8887          * This can happen where we create an inode, but somebody else also
8888          * created the same inode and we need to destroy the one we already
8889          * created.
8890          */
8891         if (!root)
8892                 goto free;
8893
8894         if (test_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
8895                      &BTRFS_I(inode)->runtime_flags)) {
8896                 btrfs_info(root->fs_info, "inode %llu still on the orphan list",
8897                         btrfs_ino(inode));
8898                 atomic_dec(&root->orphan_inodes);
8899         }
8900
8901         while (1) {
8902                 ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1);
8903                 if (!ordered)
8904                         break;
8905                 else {
8906                         btrfs_err(root->fs_info, "found ordered extent %llu %llu on inode cleanup",
8907                                 ordered->file_offset, ordered->len);
8908                         btrfs_remove_ordered_extent(inode, ordered);
8909                         btrfs_put_ordered_extent(ordered);
8910                         btrfs_put_ordered_extent(ordered);
8911                 }
8912         }
8913         inode_tree_del(inode);
8914         btrfs_drop_extent_cache(inode, 0, (u64)-1, 0);
8915 free:
8916         call_rcu(&inode->i_rcu, btrfs_i_callback);
8917 }
8918
8919 int btrfs_drop_inode(struct inode *inode)
8920 {
8921         struct btrfs_root *root = BTRFS_I(inode)->root;
8922
8923         if (root == NULL)
8924                 return 1;
8925
8926         /* the snap/subvol tree is on deleting */
8927         if (btrfs_root_refs(&root->root_item) == 0)
8928                 return 1;
8929         else
8930                 return generic_drop_inode(inode);
8931 }
8932
8933 static void init_once(void *foo)
8934 {
8935         struct btrfs_inode *ei = (struct btrfs_inode *) foo;
8936
8937         inode_init_once(&ei->vfs_inode);
8938 }
8939
8940 void btrfs_destroy_cachep(void)
8941 {
8942         /*
8943          * Make sure all delayed rcu free inodes are flushed before we
8944          * destroy cache.
8945          */
8946         rcu_barrier();
8947         if (btrfs_inode_cachep)
8948                 kmem_cache_destroy(btrfs_inode_cachep);
8949         if (btrfs_trans_handle_cachep)
8950                 kmem_cache_destroy(btrfs_trans_handle_cachep);
8951         if (btrfs_transaction_cachep)
8952                 kmem_cache_destroy(btrfs_transaction_cachep);
8953         if (btrfs_path_cachep)
8954                 kmem_cache_destroy(btrfs_path_cachep);
8955         if (btrfs_free_space_cachep)
8956                 kmem_cache_destroy(btrfs_free_space_cachep);
8957         if (btrfs_delalloc_work_cachep)
8958                 kmem_cache_destroy(btrfs_delalloc_work_cachep);
8959 }
8960
8961 int btrfs_init_cachep(void)
8962 {
8963         btrfs_inode_cachep = kmem_cache_create("btrfs_inode",
8964                         sizeof(struct btrfs_inode), 0,
8965                         SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, init_once);
8966         if (!btrfs_inode_cachep)
8967                 goto fail;
8968
8969         btrfs_trans_handle_cachep = kmem_cache_create("btrfs_trans_handle",
8970                         sizeof(struct btrfs_trans_handle), 0,
8971                         SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
8972         if (!btrfs_trans_handle_cachep)
8973                 goto fail;
8974
8975         btrfs_transaction_cachep = kmem_cache_create("btrfs_transaction",
8976                         sizeof(struct btrfs_transaction), 0,
8977                         SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
8978         if (!btrfs_transaction_cachep)
8979                 goto fail;
8980
8981         btrfs_path_cachep = kmem_cache_create("btrfs_path",
8982                         sizeof(struct btrfs_path), 0,
8983                         SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
8984         if (!btrfs_path_cachep)
8985                 goto fail;
8986
8987         btrfs_free_space_cachep = kmem_cache_create("btrfs_free_space",
8988                         sizeof(struct btrfs_free_space), 0,
8989                         SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
8990         if (!btrfs_free_space_cachep)
8991                 goto fail;
8992
8993         btrfs_delalloc_work_cachep = kmem_cache_create("btrfs_delalloc_work",
8994                         sizeof(struct btrfs_delalloc_work), 0,
8995                         SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
8996                         NULL);
8997         if (!btrfs_delalloc_work_cachep)
8998                 goto fail;
8999
9000         return 0;
9001 fail:
9002         btrfs_destroy_cachep();
9003         return -ENOMEM;
9004 }
9005
9006 static int btrfs_getattr(struct vfsmount *mnt,
9007                          struct dentry *dentry, struct kstat *stat)
9008 {
9009         u64 delalloc_bytes;
9010         struct inode *inode = dentry->d_inode;
9011         u32 blocksize = inode->i_sb->s_blocksize;
9012
9013         generic_fillattr(inode, stat);
9014         stat->dev = BTRFS_I(inode)->root->anon_dev;
9015         stat->blksize = PAGE_CACHE_SIZE;
9016
9017         spin_lock(&BTRFS_I(inode)->lock);
9018         delalloc_bytes = BTRFS_I(inode)->delalloc_bytes;
9019         spin_unlock(&BTRFS_I(inode)->lock);
9020         stat->blocks = (ALIGN(inode_get_bytes(inode), blocksize) +
9021                         ALIGN(delalloc_bytes, blocksize)) >> 9;
9022         return 0;
9023 }
9024
9025 static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
9026                            struct inode *new_dir, struct dentry *new_dentry)
9027 {
9028         struct btrfs_trans_handle *trans;
9029         struct btrfs_root *root = BTRFS_I(old_dir)->root;
9030         struct btrfs_root *dest = BTRFS_I(new_dir)->root;
9031         struct inode *new_inode = new_dentry->d_inode;
9032         struct inode *old_inode = old_dentry->d_inode;
9033         struct timespec ctime = CURRENT_TIME;
9034         u64 index = 0;
9035         u64 root_objectid;
9036         int ret;
9037         u64 old_ino = btrfs_ino(old_inode);
9038
9039         if (btrfs_ino(new_dir) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
9040                 return -EPERM;
9041
9042         /* we only allow rename subvolume link between subvolumes */
9043         if (old_ino != BTRFS_FIRST_FREE_OBJECTID && root != dest)
9044                 return -EXDEV;
9045
9046         if (old_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID ||
9047             (new_inode && btrfs_ino(new_inode) == BTRFS_FIRST_FREE_OBJECTID))
9048                 return -ENOTEMPTY;
9049
9050         if (S_ISDIR(old_inode->i_mode) && new_inode &&
9051             new_inode->i_size > BTRFS_EMPTY_DIR_SIZE)
9052                 return -ENOTEMPTY;
9053
9054
9055         /* check for collisions, even if the  name isn't there */
9056         ret = btrfs_check_dir_item_collision(dest, new_dir->i_ino,
9057                              new_dentry->d_name.name,
9058                              new_dentry->d_name.len);
9059
9060         if (ret) {
9061                 if (ret == -EEXIST) {
9062                         /* we shouldn't get
9063                          * eexist without a new_inode */
9064                         if (WARN_ON(!new_inode)) {
9065                                 return ret;
9066                         }
9067                 } else {
9068                         /* maybe -EOVERFLOW */
9069                         return ret;
9070                 }
9071         }
9072         ret = 0;
9073
9074         /*
9075          * we're using rename to replace one file with another.  Start IO on it
9076          * now so  we don't add too much work to the end of the transaction
9077          */
9078         if (new_inode && S_ISREG(old_inode->i_mode) && new_inode->i_size)
9079                 filemap_flush(old_inode->i_mapping);
9080
9081         /* close the racy window with snapshot create/destroy ioctl */
9082         if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
9083                 down_read(&root->fs_info->subvol_sem);
9084         /*
9085          * We want to reserve the absolute worst case amount of items.  So if
9086          * both inodes are subvols and we need to unlink them then that would
9087          * require 4 item modifications, but if they are both normal inodes it
9088          * would require 5 item modifications, so we'll assume their normal
9089          * inodes.  So 5 * 2 is 10, plus 1 for the new link, so 11 total items
9090          * should cover the worst case number of items we'll modify.
9091          */
9092         trans = btrfs_start_transaction(root, 11);
9093         if (IS_ERR(trans)) {
9094                 ret = PTR_ERR(trans);
9095                 goto out_notrans;
9096         }
9097
9098         if (dest != root)
9099                 btrfs_record_root_in_trans(trans, dest);
9100
9101         ret = btrfs_set_inode_index(new_dir, &index);
9102         if (ret)
9103                 goto out_fail;
9104
9105         BTRFS_I(old_inode)->dir_index = 0ULL;
9106         if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) {
9107                 /* force full log commit if subvolume involved. */
9108                 btrfs_set_log_full_commit(root->fs_info, trans);
9109         } else {
9110                 ret = btrfs_insert_inode_ref(trans, dest,
9111                                              new_dentry->d_name.name,
9112                                              new_dentry->d_name.len,
9113                                              old_ino,
9114                                              btrfs_ino(new_dir), index);
9115                 if (ret)
9116                         goto out_fail;
9117                 /*
9118                  * this is an ugly little race, but the rename is required
9119                  * to make sure that if we crash, the inode is either at the
9120                  * old name or the new one.  pinning the log transaction lets
9121                  * us make sure we don't allow a log commit to come in after
9122                  * we unlink the name but before we add the new name back in.
9123                  */
9124                 btrfs_pin_log_trans(root);
9125         }
9126
9127         inode_inc_iversion(old_dir);
9128         inode_inc_iversion(new_dir);
9129         inode_inc_iversion(old_inode);
9130         old_dir->i_ctime = old_dir->i_mtime = ctime;
9131         new_dir->i_ctime = new_dir->i_mtime = ctime;
9132         old_inode->i_ctime = ctime;
9133
9134         if (old_dentry->d_parent != new_dentry->d_parent)
9135                 btrfs_record_unlink_dir(trans, old_dir, old_inode, 1);
9136
9137         if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) {
9138                 root_objectid = BTRFS_I(old_inode)->root->root_key.objectid;
9139                 ret = btrfs_unlink_subvol(trans, root, old_dir, root_objectid,
9140                                         old_dentry->d_name.name,
9141                                         old_dentry->d_name.len);
9142         } else {
9143                 ret = __btrfs_unlink_inode(trans, root, old_dir,
9144                                         old_dentry->d_inode,
9145                                         old_dentry->d_name.name,
9146                                         old_dentry->d_name.len);
9147                 if (!ret)
9148                         ret = btrfs_update_inode(trans, root, old_inode);
9149         }
9150         if (ret) {
9151                 btrfs_abort_transaction(trans, root, ret);
9152                 goto out_fail;
9153         }
9154
9155         if (new_inode) {
9156                 inode_inc_iversion(new_inode);
9157                 new_inode->i_ctime = CURRENT_TIME;
9158                 if (unlikely(btrfs_ino(new_inode) ==
9159                              BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
9160                         root_objectid = BTRFS_I(new_inode)->location.objectid;
9161                         ret = btrfs_unlink_subvol(trans, dest, new_dir,
9162                                                 root_objectid,
9163                                                 new_dentry->d_name.name,
9164                                                 new_dentry->d_name.len);
9165                         BUG_ON(new_inode->i_nlink == 0);
9166                 } else {
9167                         ret = btrfs_unlink_inode(trans, dest, new_dir,
9168                                                  new_dentry->d_inode,
9169                                                  new_dentry->d_name.name,
9170                                                  new_dentry->d_name.len);
9171                 }
9172                 if (!ret && new_inode->i_nlink == 0)
9173                         ret = btrfs_orphan_add(trans, new_dentry->d_inode);
9174                 if (ret) {
9175                         btrfs_abort_transaction(trans, root, ret);
9176                         goto out_fail;
9177                 }
9178         }
9179
9180         ret = btrfs_add_link(trans, new_dir, old_inode,
9181                              new_dentry->d_name.name,
9182                              new_dentry->d_name.len, 0, index);
9183         if (ret) {
9184                 btrfs_abort_transaction(trans, root, ret);
9185                 goto out_fail;
9186         }
9187
9188         if (old_inode->i_nlink == 1)
9189                 BTRFS_I(old_inode)->dir_index = index;
9190
9191         if (old_ino != BTRFS_FIRST_FREE_OBJECTID) {
9192                 struct dentry *parent = new_dentry->d_parent;
9193                 btrfs_log_new_name(trans, old_inode, old_dir, parent);
9194                 btrfs_end_log_trans(root);
9195         }
9196 out_fail:
9197         btrfs_end_transaction(trans, root);
9198 out_notrans:
9199         if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
9200                 up_read(&root->fs_info->subvol_sem);
9201
9202         return ret;
9203 }
9204
9205 static int btrfs_rename2(struct inode *old_dir, struct dentry *old_dentry,
9206                          struct inode *new_dir, struct dentry *new_dentry,
9207                          unsigned int flags)
9208 {
9209         if (flags & ~RENAME_NOREPLACE)
9210                 return -EINVAL;
9211
9212         return btrfs_rename(old_dir, old_dentry, new_dir, new_dentry);
9213 }
9214
9215 static void btrfs_run_delalloc_work(struct btrfs_work *work)
9216 {
9217         struct btrfs_delalloc_work *delalloc_work;
9218         struct inode *inode;
9219
9220         delalloc_work = container_of(work, struct btrfs_delalloc_work,
9221                                      work);
9222         inode = delalloc_work->inode;
9223         if (delalloc_work->wait) {
9224                 btrfs_wait_ordered_range(inode, 0, (u64)-1);
9225         } else {
9226                 filemap_flush(inode->i_mapping);
9227                 if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
9228                              &BTRFS_I(inode)->runtime_flags))
9229                         filemap_flush(inode->i_mapping);
9230         }
9231
9232         if (delalloc_work->delay_iput)
9233                 btrfs_add_delayed_iput(inode);
9234         else
9235                 iput(inode);
9236         complete(&delalloc_work->completion);
9237 }
9238
9239 struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode,
9240                                                     int wait, int delay_iput)
9241 {
9242         struct btrfs_delalloc_work *work;
9243
9244         work = kmem_cache_zalloc(btrfs_delalloc_work_cachep, GFP_NOFS);
9245         if (!work)
9246                 return NULL;
9247
9248         init_completion(&work->completion);
9249         INIT_LIST_HEAD(&work->list);
9250         work->inode = inode;
9251         work->wait = wait;
9252         work->delay_iput = delay_iput;
9253         WARN_ON_ONCE(!inode);
9254         btrfs_init_work(&work->work, btrfs_flush_delalloc_helper,
9255                         btrfs_run_delalloc_work, NULL, NULL);
9256
9257         return work;
9258 }
9259
9260 void btrfs_wait_and_free_delalloc_work(struct btrfs_delalloc_work *work)
9261 {
9262         wait_for_completion(&work->completion);
9263         kmem_cache_free(btrfs_delalloc_work_cachep, work);
9264 }
9265
9266 /*
9267  * some fairly slow code that needs optimization. This walks the list
9268  * of all the inodes with pending delalloc and forces them to disk.
      *
      * @root:       root whose ->delalloc_inodes list is walked
      * @delay_iput: non-zero to drop inode references through
      *              btrfs_add_delayed_iput() instead of iput()
      * @nr:         stop after this many work items were queued; -1 means
      *              no limit
      *
      * The whole walk runs under root->delalloc_mutex.  The list is spliced
      * onto a local head under root->delalloc_lock; every entry visited is
      * moved back to the tail of root->delalloc_inodes before a reference is
      * taken with igrab().  For each inode that could be grabbed a
      * non-waiting delalloc work item is queued on fs_info->flush_workers.
      * Before returning, every queued item is waited for and freed.
      *
      * Returns the number of work items queued (>= 0), or -ENOMEM if a work
      * item could not be allocated.
9269  */
9270 static int __start_delalloc_inodes(struct btrfs_root *root, int delay_iput,
9271                                    int nr)
9272 {
9273         struct btrfs_inode *binode;
9274         struct inode *inode;
9275         struct btrfs_delalloc_work *work, *next;
9276         struct list_head works;
9277         struct list_head splice;
9278         int ret = 0;
9279
9280         INIT_LIST_HEAD(&works);
9281         INIT_LIST_HEAD(&splice);
9282
9283         mutex_lock(&root->delalloc_mutex);
9284         spin_lock(&root->delalloc_lock);
9285         list_splice_init(&root->delalloc_inodes, &splice);
9286         while (!list_empty(&splice)) {
9287                 binode = list_entry(splice.next, struct btrfs_inode,
9288                                     delalloc_inodes);
9289
                     /* put the entry back on the root's list right away */
9290                 list_move_tail(&binode->delalloc_inodes,
9291                                &root->delalloc_inodes);
9292                 inode = igrab(&binode->vfs_inode);
9293                 if (!inode) {
                             /* no reference obtained: skip, lock still held */
9294                         cond_resched_lock(&root->delalloc_lock);
9295                         continue;
9296                 }
9297                 spin_unlock(&root->delalloc_lock);
9298
                     /* wait == 0: the worker flushes instead of waiting */
9299                 work = btrfs_alloc_delalloc_work(inode, 0, delay_iput);
9300                 if (!work) {
                             /* drop the igrab() reference ourselves */
9301                         if (delay_iput)
9302                                 btrfs_add_delayed_iput(inode);
9303                         else
9304                                 iput(inode);
9305                         ret = -ENOMEM;
9306                         goto out;
9307                 }
9308                 list_add_tail(&work->list, &works);
9309                 btrfs_queue_work(root->fs_info->flush_workers,
9310                                  &work->work);
                     /* ret doubles as the count of queued work items */
9311                 ret++;
9312                 if (nr != -1 && ret >= nr)
9313                         goto out;
9314                 cond_resched();
9315                 spin_lock(&root->delalloc_lock);
9316         }
9317         spin_unlock(&root->delalloc_lock);
9318
     /* reached with root->delalloc_lock released on every path */
9319 out:
9320         list_for_each_entry_safe(work, next, &works, list) {
9321                 list_del_init(&work->list);
9322                 btrfs_wait_and_free_delalloc_work(work);
9323         }
9324
             /* early exit: give the unvisited entries back to the root */
9325         if (!list_empty_careful(&splice)) {
9326                 spin_lock(&root->delalloc_lock);
9327                 list_splice_tail(&splice, &root->delalloc_inodes);
9328                 spin_unlock(&root->delalloc_lock);
9329         }
9330         mutex_unlock(&root->delalloc_mutex);
9331         return ret;
9332 }
9333
9334 int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
9335 {
9336         int ret;
9337
9338         if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state))
9339                 return -EROFS;
9340
9341         ret = __start_delalloc_inodes(root, delay_iput, -1);
9342         if (ret > 0)
9343                 ret = 0;
9344         /*
9345          * the filemap_flush will queue IO into the worker threads, but
9346          * we have to make sure the IO is actually started and that
9347          * ordered extents get created before we return
9348          */
9349         atomic_inc(&root->fs_info->async_submit_draining);
9350         while (atomic_read(&root->fs_info->nr_async_submits) ||
9351               atomic_read(&root->fs_info->async_delalloc_pages)) {
9352                 wait_event(root->fs_info->async_submit_wait,
9353                    (atomic_read(&root->fs_info->nr_async_submits) == 0 &&
9354                     atomic_read(&root->fs_info->async_delalloc_pages) == 0));
9355         }
9356         atomic_dec(&root->fs_info->async_submit_draining);
9357         return ret;
9358 }
9359
/*
 * Start delalloc writeback on the roots listed in fs_info->delalloc_roots.
 *
 * @fs_info:    filesystem whose delalloc roots are walked
 * @delay_iput: forwarded unchanged to __start_delalloc_inodes()
 * @nr:         budget of work items to queue across all roots; -1 means
 *              unlimited, 0 makes the root walk a no-op
 *
 * Runs under fs_info->delalloc_root_mutex.  The root list is spliced onto
 * a local head under fs_info->delalloc_root_lock; each visited root is
 * referenced, moved back to the tail of fs_info->delalloc_roots and then
 * processed with the spinlock dropped.  After the walk the function waits
 * until both nr_async_submits and async_delalloc_pages read zero.
 *
 * Returns 0 on success, -EROFS if BTRFS_FS_STATE_ERROR is set, or the
 * negative error from __start_delalloc_inodes(); in that last case the
 * async-submit wait is skipped.
 */
9360 int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int delay_iput,
9361                                int nr)
9362 {
9363         struct btrfs_root *root;
9364         struct list_head splice;
9365         int ret;
9366
9367         if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
9368                 return -EROFS;
9369
9370         INIT_LIST_HEAD(&splice);
9371
9372         mutex_lock(&fs_info->delalloc_root_mutex);
9373         spin_lock(&fs_info->delalloc_root_lock);
9374         list_splice_init(&fs_info->delalloc_roots, &splice);
9375         while (!list_empty(&splice) && nr) {
9376                 root = list_first_entry(&splice, struct btrfs_root,
9377                                         delalloc_root);
                     /* hold a reference while the spinlock is dropped below */
9378                 root = btrfs_grab_fs_root(root);
9379                 BUG_ON(!root);
9380                 list_move_tail(&root->delalloc_root,
9381                                &fs_info->delalloc_roots);
9382                 spin_unlock(&fs_info->delalloc_root_lock);
9383
9384                 ret = __start_delalloc_inodes(root, delay_iput, nr);
9385                 btrfs_put_fs_root(root);
9386                 if (ret < 0)
9387                         goto out;
9388
                     /* ret is the number of items queued for this root */
9389                 if (nr != -1) {
9390                         nr -= ret;
9391                         WARN_ON(nr < 0);
9392                 }
9393                 spin_lock(&fs_info->delalloc_root_lock);
9394         }
9395         spin_unlock(&fs_info->delalloc_root_lock);
9396
9397         ret = 0;
             /* wait for all outstanding async submissions to finish */
9398         atomic_inc(&fs_info->async_submit_draining);
9399         while (atomic_read(&fs_info->nr_async_submits) ||
9400               atomic_read(&fs_info->async_delalloc_pages)) {
9401                 wait_event(fs_info->async_submit_wait,
9402                    (atomic_read(&fs_info->nr_async_submits) == 0 &&
9403                     atomic_read(&fs_info->async_delalloc_pages) == 0));
9404         }
9405         atomic_dec(&fs_info->async_submit_draining);
9406 out:
             /* roots that were never visited go back onto the global list */
9407         if (!list_empty_careful(&splice)) {
9408                 spin_lock(&fs_info->delalloc_root_lock);
9409                 list_splice_tail(&splice, &fs_info->delalloc_roots);
9410                 spin_unlock(&fs_info->delalloc_root_lock);
9411         }
9412         mutex_unlock(&fs_info->delalloc_root_mutex);
9413         return ret;
9414 }
9415
/*
 * Create the symbolic link @dentry in directory @dir pointing at @symname.
 *
 * The target string is stored as an inline file extent item keyed
 * (ino, BTRFS_EXTENT_DATA_KEY, 0); exactly strlen(@symname) bytes are
 * written, and the inode's byte count and i_size are set to that length.
 *
 * Returns 0 on success, -ENAMETOOLONG if the target is longer than
 * BTRFS_MAX_INLINE_DATA_SIZE(root), -ENOMEM if no path could be
 * allocated, or the error returned by one of the helpers called below.
 *
 * Error unwinding: failures before the inode exists jump to out_unlock;
 * later failures go through out_unlock_inode, which unlocks the new
 * inode and flags it so that out_unlock drops its link count and
 * reference.  The transaction is ended on every path that started one.
 */
9416 static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
9417                          const char *symname)
9418 {
9419         struct btrfs_trans_handle *trans;
9420         struct btrfs_root *root = BTRFS_I(dir)->root;
9421         struct btrfs_path *path;
9422         struct btrfs_key key;
9423         struct inode *inode = NULL;
9424         int err;
9425         int drop_inode = 0;
9426         u64 objectid;
9427         u64 index = 0;
9428         int name_len;
9429         int datasize;
9430         unsigned long ptr;
9431         struct btrfs_file_extent_item *ei;
9432         struct extent_buffer *leaf;
9433
             /* the whole target must fit into a single inline extent */
9434         name_len = strlen(symname);
9435         if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(root))
9436                 return -ENAMETOOLONG;
9437
9438         /*
9439          * 2 items for inode item and ref
9440          * 2 items for dir items
9441          * 1 item for xattr if selinux is on
9442          */
9443         trans = btrfs_start_transaction(root, 5);
9444         if (IS_ERR(trans))
9445                 return PTR_ERR(trans);
9446
9447         err = btrfs_find_free_ino(root, &objectid);
9448         if (err)
9449                 goto out_unlock;
9450
9451         inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
9452                                 dentry->d_name.len, btrfs_ino(dir), objectid,
9453                                 S_IFLNK|S_IRWXUGO, &index);
9454         if (IS_ERR(inode)) {
9455                 err = PTR_ERR(inode);
9456                 goto out_unlock;
9457         }
9458
9459         /*
9460         * If the active LSM wants to access the inode during
9461         * d_instantiate it needs these. Smack checks to see
9462         * if the filesystem supports xattrs by looking at the
9463         * ops vector.
9464         */
9465         inode->i_fop = &btrfs_file_operations;
9466         inode->i_op = &btrfs_file_inode_operations;
9467         inode->i_mapping->a_ops = &btrfs_aops;
9468         BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
9469
9470         err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
9471         if (err)
9472                 goto out_unlock_inode;
9473
9474         err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
9475         if (err)
9476                 goto out_unlock_inode;
9477
9478         path = btrfs_alloc_path();
9479         if (!path) {
9480                 err = -ENOMEM;
9481                 goto out_unlock_inode;
9482         }
             /* item that will carry the link target as inline data */
9483         key.objectid = btrfs_ino(inode);
9484         key.offset = 0;
9485         key.type = BTRFS_EXTENT_DATA_KEY;
9486         datasize = btrfs_file_extent_calc_inline_size(name_len);
9487         err = btrfs_insert_empty_item(trans, root, path, &key,
9488                                       datasize);
9489         if (err) {
9490                 btrfs_free_path(path);
9491                 goto out_unlock_inode;
9492         }
9493         leaf = path->nodes[0];
9494         ei = btrfs_item_ptr(leaf, path->slots[0],
9495                             struct btrfs_file_extent_item);
             /* inline extent: no encryption, compression or other encoding */
9496         btrfs_set_file_extent_generation(leaf, ei, trans->transid);
9497         btrfs_set_file_extent_type(leaf, ei,
9498                                    BTRFS_FILE_EXTENT_INLINE);
9499         btrfs_set_file_extent_encryption(leaf, ei, 0);
9500         btrfs_set_file_extent_compression(leaf, ei, 0);
9501         btrfs_set_file_extent_other_encoding(leaf, ei, 0);
9502         btrfs_set_file_extent_ram_bytes(leaf, ei, name_len);
9503
             /* copy name_len bytes of the target; no terminating NUL */
9504         ptr = btrfs_file_extent_inline_start(ei);
9505         write_extent_buffer(leaf, symname, ptr, name_len);
9506         btrfs_mark_buffer_dirty(leaf);
9507         btrfs_free_path(path);
9508
             /* replace the temporary file ops with the symlink ones */
9509         inode->i_op = &btrfs_symlink_inode_operations;
9510         inode->i_mapping->a_ops = &btrfs_symlink_aops;
9511         inode_set_bytes(inode, name_len);
9512         btrfs_i_size_write(inode, name_len);
9513         err = btrfs_update_inode(trans, root, inode);
9514         if (err) {
9515                 drop_inode = 1;
9516                 goto out_unlock_inode;
9517         }
9518
9519         unlock_new_inode(inode);
9520         d_instantiate(dentry, inode);
9521
9522 out_unlock:
9523         btrfs_end_transaction(trans, root);
9524         if (drop_inode) {
9525                 inode_dec_link_count(inode);
9526                 iput(inode);
9527         }
9528         btrfs_btree_balance_dirty(root);
9529         return err;
9530
     /* failure after the inode was created: unlock it and have it dropped */
9531 out_unlock_inode:
9532         drop_inode = 1;
9533         unlock_new_inode(inode);
9534         goto out_unlock;
9535 }
9536
/*
 * Preallocate extents for @inode covering @num_bytes starting at @start.
 *
 * @inode:      inode to preallocate for
 * @mode:       fallocate mode; only FALLOC_FL_KEEP_SIZE is looked at here
 * @start:      file offset of the first byte to preallocate
 * @num_bytes:  number of bytes still to be covered
 * @min_size:   smallest extent accepted from btrfs_reserve_extent(); also
 *              the lower bound of each reservation request
 * @actual_len: upper limit for i_size when the file size is extended
 * @alloc_hint: in/out allocation hint; updated to the end of the extent
 *              reserved last
 * @trans:      transaction to use, or NULL to start and end a separate
 *              transaction (3 items) for every extent
 *
 * Each loop iteration reserves one extent of at most 256MiB (but at least
 * @min_size), records it as a BTRFS_FILE_EXTENT_PREALLOC file extent,
 * replaces the cached extent mapping for the range, and updates the inode
 * item.  Unless FALLOC_FL_KEEP_SIZE is set, i_size is grown up to
 * @actual_len as the offset advances.
 *
 * Returns 0 on success or a negative error.  Failures of
 * insert_reserved_file_extent() or btrfs_update_inode() abort the
 * transaction, including one supplied by the caller.
 *
 * NOTE(review): num_bytes is decremented by the size of the extent that
 * was actually reserved without checking for underflow; this presumably
 * relies on callers passing min_size <= num_bytes - confirm.
 */
9537 static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
9538                                        u64 start, u64 num_bytes, u64 min_size,
9539                                        loff_t actual_len, u64 *alloc_hint,
9540                                        struct btrfs_trans_handle *trans)
9541 {
9542         struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
9543         struct extent_map *em;
9544         struct btrfs_root *root = BTRFS_I(inode)->root;
9545         struct btrfs_key ins;
9546         u64 cur_offset = start;
9547         u64 i_size;
9548         u64 cur_bytes;
9549         int ret = 0;
9550         bool own_trans = true;
9551
             /* a caller-supplied handle is used for all extents, never ended */
9552         if (trans)
9553                 own_trans = false;
9554         while (num_bytes > 0) {
9555                 if (own_trans) {
9556                         trans = btrfs_start_transaction(root, 3);
9557                         if (IS_ERR(trans)) {
9558                                 ret = PTR_ERR(trans);
9559                                 break;
9560                         }
9561                 }
9562
                     /* request at most 256MiB, but never less than min_size */
9563                 cur_bytes = min(num_bytes, 256ULL * 1024 * 1024);
9564                 cur_bytes = max(cur_bytes, min_size);
9565                 ret = btrfs_reserve_extent(root, cur_bytes, min_size, 0,
9566                                            *alloc_hint, &ins, 1, 0);
9567                 if (ret) {
9568                         if (own_trans)
9569                                 btrfs_end_transaction(trans, root);
9570                         break;
9571                 }
9572
                     /* ins.objectid / ins.offset: start and size of the extent */
9573                 ret = insert_reserved_file_extent(trans, inode,
9574                                                   cur_offset, ins.objectid,
9575                                                   ins.offset, ins.offset,
9576                                                   ins.offset, 0, 0, 0,
9577                                                   BTRFS_FILE_EXTENT_PREALLOC);
9578                 if (ret) {
9579                         btrfs_free_reserved_extent(root, ins.objectid,
9580                                                    ins.offset, 0);
9581                         btrfs_abort_transaction(trans, root, ret);
9582                         if (own_trans)
9583                                 btrfs_end_transaction(trans, root);
9584                         break;
9585                 }
9586
9587                 btrfs_drop_extent_cache(inode, cur_offset,
9588                                         cur_offset + ins.offset -1, 0);
9589
9590                 em = alloc_extent_map();
9591                 if (!em) {
                             /* no cached mapping: flag the inode instead */
9592                         set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
9593                                 &BTRFS_I(inode)->runtime_flags);
9594                         goto next;
9595                 }
9596
9597                 em->start = cur_offset;
9598                 em->orig_start = cur_offset;
9599                 em->len = ins.offset;
9600                 em->block_start = ins.objectid;
9601                 em->block_len = ins.offset;
9602                 em->orig_block_len = ins.offset;
9603                 em->ram_bytes = ins.offset;
9604                 em->bdev = root->fs_info->fs_devices->latest_bdev;
9605                 set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
9606                 em->generation = trans->transid;
9607
                     /* retry until no overlapping mapping is left in the tree */
9608                 while (1) {
9609                         write_lock(&em_tree->lock);
9610                         ret = add_extent_mapping(em_tree, em, 1);
9611                         write_unlock(&em_tree->lock);
9612                         if (ret != -EEXIST)
9613                                 break;
9614                         btrfs_drop_extent_cache(inode, cur_offset,
9615                                                 cur_offset + ins.offset - 1,
9616                                                 0);
9617                 }
9618                 free_extent_map(em);
9619 next:
                     /* advance past the extent just preallocated */
9620                 num_bytes -= ins.offset;
9621                 cur_offset += ins.offset;
9622                 *alloc_hint = ins.objectid + ins.offset;
9623
9624                 inode_inc_iversion(inode);
9625                 inode->i_ctime = CURRENT_TIME;
9626                 BTRFS_I(inode)->flags |= BTRFS_INODE_PREALLOC;
                     /* grow i_size to min(cur_offset, actual_len) if allowed */
9627                 if (!(mode & FALLOC_FL_KEEP_SIZE) &&
9628                     (actual_len > inode->i_size) &&
9629                     (cur_offset > inode->i_size)) {
9630                         if (cur_offset > actual_len)
9631                                 i_size = actual_len;
9632                         else
9633                                 i_size = cur_offset;
9634                         i_size_write(inode, i_size);
9635                         btrfs_ordered_update_i_size(inode, i_size, NULL);
9636                 }
9637
9638                 ret = btrfs_update_inode(trans, root, inode);
9639
9640                 if (ret) {
9641                         btrfs_abort_transaction(trans, root, ret);
9642                         if (own_trans)
9643                                 btrfs_end_transaction(trans, root);
9644                         break;
9645                 }
9646
9647                 if (own_trans)
9648                         btrfs_end_transaction(trans, root);
9649         }
9650         return ret;
9651 }
9652
9653 int btrfs_prealloc_file_range(struct inode *inode, int mode,
9654                               u64 start, u64 num_bytes, u64 min_size,
9655                               loff_t actual_len, u64 *alloc_hint)
9656 {
9657         return __btrfs_prealloc_file_range(inode, mode, start, num_bytes,
9658                                            min_size, actual_len, alloc_hint,
9659                                            NULL);
9660 }
9661
9662 int btrfs_prealloc_file_range_trans(struct inode *inode,
9663                                     struct btrfs_trans_handle *trans, int mode,
9664                                     u64 start, u64 num_bytes, u64 min_size,
9665                                     loff_t actual_len, u64 *alloc_hint)
9666 {
9667         return __btrfs_prealloc_file_range(inode, mode, start, num_bytes,
9668                                            min_size, actual_len, alloc_hint, trans);
9669 }
9670
/*
 * set_page_dirty hook: delegates to __set_page_dirty_nobuffers() and
 * returns whatever it reports.
 */
static int btrfs_set_page_dirty(struct page *page)
{
        int dirtied = __set_page_dirty_nobuffers(page);

        return dirtied;
}
9675
9676 static int btrfs_permission(struct inode *inode, int mask)
9677 {
9678         struct btrfs_root *root = BTRFS_I(inode)->root;
9679         umode_t mode = inode->i_mode;
9680
9681         if (mask & MAY_WRITE &&
9682             (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode))) {
9683                 if (btrfs_root_readonly(root))
9684                         return -EROFS;
9685                 if (BTRFS_I(inode)->flags & BTRFS_INODE_READONLY)
9686                         return -EACCES;
9687         }
9688         return generic_permission(inode, mask);
9689 }
9690
9691 static int btrfs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
9692 {
9693         struct btrfs_trans_handle *trans;
9694         struct btrfs_root *root = BTRFS_I(dir)->root;
9695         struct inode *inode = NULL;
9696         u64 objectid;
9697         u64 index;
9698         int ret = 0;
9699
9700         /*
9701          * 5 units required for adding orphan entry
9702          */
9703         trans = btrfs_start_transaction(root, 5);
9704         if (IS_ERR(trans))
9705                 return PTR_ERR(trans);
9706
9707         ret = btrfs_find_free_ino(root, &objectid);
9708         if (ret)
9709                 goto out;
9710
9711         inode = btrfs_new_inode(trans, root, dir, NULL, 0,
9712                                 btrfs_ino(dir), objectid, mode, &index);
9713         if (IS_ERR(inode)) {
9714                 ret = PTR_ERR(inode);
9715                 inode = NULL;
9716                 goto out;
9717         }
9718
9719         inode->i_fop = &btrfs_file_operations;
9720         inode->i_op = &btrfs_file_inode_operations;
9721
9722         inode->i_mapping->a_ops = &btrfs_aops;
9723         BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
9724
9725         ret = btrfs_init_inode_security(trans, inode, dir, NULL);
9726         if (ret)
9727                 goto out_inode;
9728
9729         ret = btrfs_update_inode(trans, root, inode);
9730         if (ret)
9731                 goto out_inode;
9732         ret = btrfs_orphan_add(trans, inode);
9733         if (ret)
9734                 goto out_inode;
9735
9736         /*
9737          * We set number of links to 0 in btrfs_new_inode(), and here we set
9738          * it to 1 because d_tmpfile() will issue a warning if the count is 0,
9739          * through:
9740          *
9741          *    d_tmpfile() -> inode_dec_link_count() -> drop_nlink()
9742          */
9743         set_nlink(inode, 1);
9744         unlock_new_inode(inode);
9745         d_tmpfile(dentry, inode);
9746         mark_inode_dirty(inode);
9747
9748 out:
9749         btrfs_end_transaction(trans, root);
9750         if (ret)
9751                 iput(inode);
9752         btrfs_balance_delayed_items(root);
9753         btrfs_btree_balance_dirty(root);
9754         return ret;
9755
9756 out_inode:
9757         unlock_new_inode(inode);
9758         goto out;
9759
9760 }
9761
9762 /* Inspired by filemap_check_errors() */
9763 int btrfs_inode_check_errors(struct inode *inode)
9764 {
9765         int ret = 0;
9766
9767         if (test_bit(AS_ENOSPC, &inode->i_mapping->flags) &&
9768             test_and_clear_bit(AS_ENOSPC, &inode->i_mapping->flags))
9769                 ret = -ENOSPC;
9770         if (test_bit(AS_EIO, &inode->i_mapping->flags) &&
9771             test_and_clear_bit(AS_EIO, &inode->i_mapping->flags))
9772                 ret = -EIO;
9773
9774         return ret;
9775 }
9776
9777 static const struct inode_operations btrfs_dir_inode_operations = {
9778         .getattr        = btrfs_getattr,
9779         .lookup         = btrfs_lookup,
9780         .create         = btrfs_create,
9781         .unlink         = btrfs_unlink,
9782         .link           = btrfs_link,
9783         .mkdir          = btrfs_mkdir,
9784         .rmdir          = btrfs_rmdir,
9785         .rename2        = btrfs_rename2,
9786         .symlink        = btrfs_symlink,
9787         .setattr        = btrfs_setattr,
9788         .mknod          = btrfs_mknod,
9789         .setxattr       = btrfs_setxattr,
9790         .getxattr       = btrfs_getxattr,
9791         .listxattr      = btrfs_listxattr,
9792         .removexattr    = btrfs_removexattr,
9793         .permission     = btrfs_permission,
9794         .get_acl        = btrfs_get_acl,
9795         .set_acl        = btrfs_set_acl,
9796         .update_time    = btrfs_update_time,
9797         .tmpfile        = btrfs_tmpfile,
9798 };
9799 static const struct inode_operations btrfs_dir_ro_inode_operations = {
9800         .lookup         = btrfs_lookup,
9801         .permission     = btrfs_permission,
9802         .get_acl        = btrfs_get_acl,
9803         .set_acl        = btrfs_set_acl,
9804         .update_time    = btrfs_update_time,
9805 };
9806
9807 static const struct file_operations btrfs_dir_file_operations = {
9808         .llseek         = generic_file_llseek,
9809         .read           = generic_read_dir,
9810         .iterate        = btrfs_real_readdir,
9811         .unlocked_ioctl = btrfs_ioctl,
9812 #ifdef CONFIG_COMPAT
9813         .compat_ioctl   = btrfs_ioctl,
9814 #endif
9815         .release        = btrfs_release_file,
9816         .fsync          = btrfs_sync_file,
9817 };
9818
9819 static struct extent_io_ops btrfs_extent_io_ops = {
9820         .fill_delalloc = run_delalloc_range,
9821         .submit_bio_hook = btrfs_submit_bio_hook,
9822         .merge_bio_hook = btrfs_merge_bio_hook,
9823         .readpage_end_io_hook = btrfs_readpage_end_io_hook,
9824         .writepage_end_io_hook = btrfs_writepage_end_io_hook,
9825         .writepage_start_hook = btrfs_writepage_start_hook,
9826         .set_bit_hook = btrfs_set_bit_hook,
9827         .clear_bit_hook = btrfs_clear_bit_hook,
9828         .merge_extent_hook = btrfs_merge_extent_hook,
9829         .split_extent_hook = btrfs_split_extent_hook,
9830 };
9831
9832 /*
9833  * btrfs doesn't support the bmap operation because swapfiles
9834  * use bmap to make a mapping of extents in the file.  They assume
9835  * these extents won't change over the life of the file and they
9836  * use the bmap result to do IO directly to the drive.
9837  *
9838  * the btrfs bmap call would return logical addresses that aren't
9839  * suitable for IO and they also will change frequently as COW
9840  * operations happen.  So, swapfile + btrfs == corruption.
9841  *
9842  * For now we're avoiding this by dropping bmap.
9843  */
9844 static const struct address_space_operations btrfs_aops = {
9845         .readpage       = btrfs_readpage,
9846         .writepage      = btrfs_writepage,
9847         .writepages     = btrfs_writepages,
9848         .readpages      = btrfs_readpages,
9849         .direct_IO      = btrfs_direct_IO,
9850         .invalidatepage = btrfs_invalidatepage,
9851         .releasepage    = btrfs_releasepage,
9852         .set_page_dirty = btrfs_set_page_dirty,
9853         .error_remove_page = generic_error_remove_page,
9854 };
9855
9856 static const struct address_space_operations btrfs_symlink_aops = {
9857         .readpage       = btrfs_readpage,
9858         .writepage      = btrfs_writepage,
9859         .invalidatepage = btrfs_invalidatepage,
9860         .releasepage    = btrfs_releasepage,
9861 };
9862
9863 static const struct inode_operations btrfs_file_inode_operations = {
9864         .getattr        = btrfs_getattr,
9865         .setattr        = btrfs_setattr,
9866         .setxattr       = btrfs_setxattr,
9867         .getxattr       = btrfs_getxattr,
9868         .listxattr      = btrfs_listxattr,
9869         .removexattr    = btrfs_removexattr,
9870         .permission     = btrfs_permission,
9871         .fiemap         = btrfs_fiemap,
9872         .get_acl        = btrfs_get_acl,
9873         .set_acl        = btrfs_set_acl,
9874         .update_time    = btrfs_update_time,
9875 };
9876 static const struct inode_operations btrfs_special_inode_operations = {
9877         .getattr        = btrfs_getattr,
9878         .setattr        = btrfs_setattr,
9879         .permission     = btrfs_permission,
9880         .setxattr       = btrfs_setxattr,
9881         .getxattr       = btrfs_getxattr,
9882         .listxattr      = btrfs_listxattr,
9883         .removexattr    = btrfs_removexattr,
9884         .get_acl        = btrfs_get_acl,
9885         .set_acl        = btrfs_set_acl,
9886         .update_time    = btrfs_update_time,
9887 };
9888 static const struct inode_operations btrfs_symlink_inode_operations = {
9889         .readlink       = generic_readlink,
9890         .follow_link    = page_follow_link_light,
9891         .put_link       = page_put_link,
9892         .getattr        = btrfs_getattr,
9893         .setattr        = btrfs_setattr,
9894         .permission     = btrfs_permission,
9895         .setxattr       = btrfs_setxattr,
9896         .getxattr       = btrfs_getxattr,
9897         .listxattr      = btrfs_listxattr,
9898         .removexattr    = btrfs_removexattr,
9899         .update_time    = btrfs_update_time,
9900 };
9901
9902 const struct dentry_operations btrfs_dentry_operations = {
9903         .d_delete       = btrfs_dentry_delete,
9904         .d_release      = btrfs_dentry_release,
9905 };