Merge tag 'xfs-for-linus-4.1-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/dgc/linux-xfs
author    Linus Torvalds <torvalds@linux-foundation.org>
          Fri, 24 Apr 2015 14:08:41 +0000 (07:08 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
          Fri, 24 Apr 2015 14:08:41 +0000 (07:08 -0700)
Pull xfs update from Dave Chinner:
 "This update contains:

   - RENAME_WHITEOUT support

   - conversion of per-cpu superblock accounting to use generic
     counters (the generic API is sketched after the commit list)

   - new inode mmap lock so that we can lock page faults out of
     truncate, hole punch and other direct extent manipulation functions
     to avoid racing mmap writes from causing data corruption

   - rework of direct IO submission and completion to solve a data
     corruption issue when running concurrent extending DIO writes.
     This also solves the problem of running IO completion
     transactions in interrupt context during size-extending AIO
     writes.

   - FALLOC_FL_INSERT_RANGE support for inserting holes into a file
     via direct extent manipulation, avoiding the need to copy data
     within the file (a usage sketch follows this message)

   - attribute block header field overflow fix for 64k block size
     filesystems

   - lots of changes to make log messaging more informative and
     concise when errors occur, and to prevent unnecessary log
     spamming due to cascading failures in error conditions

   - lots of cleanups and bug fixes

  One thing of note is the set of direct IO fixes that we merged last
  week after the window opened.  Even though they are a little late,
  they fix a user-reported data corruption and have been pretty well
  tested.  I figured there was not much point waiting another 2 weeks
  for -rc1 to be released just so I could send them to you..."
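
To make the new fallocate mode concrete, here is a minimal userspace sketch (illustrative only, not part of this merge): it inserts a block-aligned hole at a given offset, shifting everything from that offset onwards toward the end of the file and growing the file by the inserted length. Both offset and length must be multiples of the filesystem block size, and the offset must lie within the current file size; the 4k block size below is an assumption for the example.

/* insert_range.c - hedged example of FALLOC_FL_INSERT_RANGE usage */
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <linux/falloc.h>

int main(int argc, char **argv)
{
	off_t offset = 4096;	/* assumes a 4k filesystem block size */
	off_t len = 4096;	/* size of the hole to insert */
	int fd;

	if (argc < 2) {
		fprintf(stderr, "usage: %s <file>\n", argv[0]);
		return 1;
	}
	fd = open(argv[1], O_RDWR);
	if (fd < 0) {
		perror("open");
		return 1;
	}
	/* shift data at and beyond 'offset' up by 'len' bytes */
	if (fallocate(fd, FALLOC_FL_INSERT_RANGE, offset, len) < 0) {
		perror("fallocate(FALLOC_FL_INSERT_RANGE)");
		return 1;
	}
	return 0;
}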

* tag 'xfs-for-linus-4.1-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/dgc/linux-xfs: (49 commits)
  xfs: using generic_file_direct_write() is unnecessary
  xfs: direct IO EOF zeroing needs to drain AIO
  xfs: DIO write completion size updates race
  xfs: DIO writes within EOF don't need an ioend
  xfs: handle DIO overwrite EOF update completion correctly
  xfs: DIO needs an ioend for writes
  xfs: move DIO mapping size calculation
  xfs: factor DIO write mapping from get_blocks
  xfs: unlock i_mutex in xfs_break_layouts
  xfs: kill unnecessary firstused overflow check on attr3 leaf removal
  xfs: use larger in-core attr firstused field and detect overflow
  xfs: pass attr geometry to attr leaf header conversion functions
  xfs: disallow ro->rw remount on norecovery mount
  xfs: xfs_shift_file_space can be static
  xfs: Add support FALLOC_FL_INSERT_RANGE for fallocate
  fs: Add support FALLOC_FL_INSERT_RANGE for fallocate
  xfs: Fix incorrect positive ENOMEM return
  xfs: xfs_mru_cache_insert() should use GFP_NOFS
  xfs: %pF is only for function pointers
  xfs: fix shadow warning in xfs_da3_root_split()
  ...
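
The per-cpu superblock accounting conversion noted above replaces XFS's hand-rolled per-cpu counter machinery (xfs_icsb_modify_counters() and friends) with the kernel's generic percpu_counter API; the xfs_mod_fdblocks() calls visible in the xfs_bmap.c hunks further down are the new entry point for free-block accounting. As a rough sketch of the generic API the series builds on (illustrative usage, not code from this merge):

#include <linux/percpu_counter.h>
#include <linux/printk.h>

static struct percpu_counter free_blocks;

static int counter_example(void)
{
	int error;

	/* second argument is the initial count */
	error = percpu_counter_init(&free_blocks, 1000, GFP_KERNEL);
	if (error)
		return error;

	/* fast path: updates a per-cpu delta, no global lock taken */
	percpu_counter_add(&free_blocks, -8);

	/* exact sum walks all CPUs; only pay for it when precision matters */
	if (percpu_counter_sum(&free_blocks) < 0)
		pr_warn("free space overcommitted\n");

	percpu_counter_destroy(&free_blocks);
	return 0;
}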

49 files changed:
Documentation/filesystems/xfs.txt
fs/open.c
fs/xfs/libxfs/xfs_alloc.c
fs/xfs/libxfs/xfs_attr_leaf.c
fs/xfs/libxfs/xfs_attr_leaf.h
fs/xfs/libxfs/xfs_bmap.c
fs/xfs/libxfs/xfs_bmap.h
fs/xfs/libxfs/xfs_btree.c
fs/xfs/libxfs/xfs_da_btree.c
fs/xfs/libxfs/xfs_da_format.h
fs/xfs/libxfs/xfs_dir2_data.c
fs/xfs/libxfs/xfs_format.h
fs/xfs/libxfs/xfs_ialloc.c
fs/xfs/libxfs/xfs_sb.c
fs/xfs/xfs_aops.c
fs/xfs/xfs_attr_inactive.c
fs/xfs/xfs_attr_list.c
fs/xfs/xfs_bmap_util.c
fs/xfs/xfs_bmap_util.h
fs/xfs/xfs_buf_item.c
fs/xfs/xfs_discard.c
fs/xfs/xfs_error.c
fs/xfs/xfs_error.h
fs/xfs/xfs_file.c
fs/xfs/xfs_filestream.c
fs/xfs/xfs_fsops.c
fs/xfs/xfs_icache.c
fs/xfs/xfs_inode.c
fs/xfs/xfs_inode.h
fs/xfs/xfs_ioctl.c
fs/xfs/xfs_iomap.c
fs/xfs/xfs_iops.c
fs/xfs/xfs_iops.h
fs/xfs/xfs_itable.c
fs/xfs/xfs_linux.h
fs/xfs/xfs_log_recover.c
fs/xfs/xfs_mount.c
fs/xfs/xfs_mount.h
fs/xfs/xfs_mru_cache.c
fs/xfs/xfs_pnfs.c
fs/xfs/xfs_pnfs.h
fs/xfs/xfs_qm.c
fs/xfs/xfs_super.c
fs/xfs/xfs_super.h
fs/xfs/xfs_symlink.c
fs/xfs/xfs_trace.h
fs/xfs/xfs_trans.c
include/linux/falloc.h
include/uapi/linux/falloc.h

diff --git a/Documentation/filesystems/xfs.txt b/Documentation/filesystems/xfs.txt
index 0bfafe1..5a5a055 100644
--- a/Documentation/filesystems/xfs.txt
+++ b/Documentation/filesystems/xfs.txt
@@ -228,30 +228,19 @@ default behaviour.
 Deprecated Mount Options
 ========================
 
-  delaylog/nodelaylog
-       Delayed logging is the only logging method that XFS supports
-       now, so these mount options are now ignored.
-
-       Due for removal in 3.12.
-
-  ihashsize=value
-       In memory inode hashes have been removed, so this option has
-       no function as of August 2007. Option is deprecated.
-
-       Due for removal in 3.12.
+None at present.
 
-  irixsgid
-       This behaviour is now controlled by a sysctl, so the mount
-       option is ignored.
 
-       Due for removal in 3.12.
+Removed Mount Options
+=====================
 
-  osyncisdsync
-  osyncisosync
-       O_SYNC and O_DSYNC are fully supported, so there is no need
-       for these options any more.
+  Name                         Removed
+  ----                         -------
+  delaylog/nodelaylog          v3.20
+  ihashsize                    v3.20
+  irixsgid                     v3.20
+  osyncisdsync/osyncisosync    v3.20
 
-       Due for removal in 3.12.
 
 sysctls
 =======
diff --git a/fs/open.c b/fs/open.c
index 6796f04..98e5a52 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -231,8 +231,7 @@ int vfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
                return -EINVAL;
 
        /* Return error if mode is not supported */
-       if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
-                    FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE))
+       if (mode & ~FALLOC_FL_SUPPORTED_MASK)
                return -EOPNOTSUPP;
 
        /* Punch hole and zero range are mutually exclusive */
@@ -250,6 +249,11 @@ int vfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
            (mode & ~FALLOC_FL_COLLAPSE_RANGE))
                return -EINVAL;
 
+       /* Insert range should only be used exclusively. */
+       if ((mode & FALLOC_FL_INSERT_RANGE) &&
+           (mode & ~FALLOC_FL_INSERT_RANGE))
+               return -EINVAL;
+
        if (!(file->f_mode & FMODE_WRITE))
                return -EBADF;
 
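FALLOC_FL_SUPPORTED_MASK referenced in the hunk above is introduced by this series in include/linux/falloc.h (that hunk is not shown on this page). Judging from the open-coded flag list it replaces, it presumably collapses the supported modes into a single mask along these lines:

#define FALLOC_FL_SUPPORTED_MASK	(FALLOC_FL_KEEP_SIZE |		\
					 FALLOC_FL_PUNCH_HOLE |		\
					 FALLOC_FL_COLLAPSE_RANGE |	\
					 FALLOC_FL_ZERO_RANGE |		\
					 FALLOC_FL_INSERT_RANGE)

Note also that the new exclusivity check mirrors the existing COLLAPSE_RANGE one: inserting a range changes the file size by definition, so combining it with modes such as FALLOC_FL_KEEP_SIZE makes no sense and is rejected with -EINVAL.
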
diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
index a6fbf44..516162b 100644
--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -260,6 +260,7 @@ xfs_alloc_fix_len(
                rlen = rlen - (k - args->mod);
        else
                rlen = rlen - args->prod + (args->mod - k);
+       /* casts to (int) catch length underflows */
        if ((int)rlen < (int)args->minlen)
                return;
        ASSERT(rlen >= args->minlen && rlen <= args->maxlen);
@@ -286,7 +287,8 @@ xfs_alloc_fix_minleft(
        if (diff >= 0)
                return 1;
        args->len += diff;              /* shrink the allocated space */
-       if (args->len >= args->minlen)
+       /* casts to (int) catch length underflows */
+       if ((int)args->len >= (int)args->minlen)
                return 1;
        args->agbno = NULLAGBLOCK;
        return 0;
@@ -315,6 +317,9 @@ xfs_alloc_fixup_trees(
        xfs_agblock_t   nfbno2;         /* second new free startblock */
        xfs_extlen_t    nflen1=0;       /* first new free length */
        xfs_extlen_t    nflen2=0;       /* second new free length */
+       struct xfs_mount *mp;
+
+       mp = cnt_cur->bc_mp;
 
        /*
         * Look up the record in the by-size tree if necessary.
@@ -323,13 +328,13 @@ xfs_alloc_fixup_trees(
 #ifdef DEBUG
                if ((error = xfs_alloc_get_rec(cnt_cur, &nfbno1, &nflen1, &i)))
                        return error;
-               XFS_WANT_CORRUPTED_RETURN(
+               XFS_WANT_CORRUPTED_RETURN(mp,
                        i == 1 && nfbno1 == fbno && nflen1 == flen);
 #endif
        } else {
                if ((error = xfs_alloc_lookup_eq(cnt_cur, fbno, flen, &i)))
                        return error;
-               XFS_WANT_CORRUPTED_RETURN(i == 1);
+               XFS_WANT_CORRUPTED_RETURN(mp, i == 1);
        }
        /*
         * Look up the record in the by-block tree if necessary.
@@ -338,13 +343,13 @@ xfs_alloc_fixup_trees(
 #ifdef DEBUG
                if ((error = xfs_alloc_get_rec(bno_cur, &nfbno1, &nflen1, &i)))
                        return error;
-               XFS_WANT_CORRUPTED_RETURN(
+               XFS_WANT_CORRUPTED_RETURN(mp,
                        i == 1 && nfbno1 == fbno && nflen1 == flen);
 #endif
        } else {
                if ((error = xfs_alloc_lookup_eq(bno_cur, fbno, flen, &i)))
                        return error;
-               XFS_WANT_CORRUPTED_RETURN(i == 1);
+               XFS_WANT_CORRUPTED_RETURN(mp, i == 1);
        }
 
 #ifdef DEBUG
@@ -355,7 +360,7 @@ xfs_alloc_fixup_trees(
                bnoblock = XFS_BUF_TO_BLOCK(bno_cur->bc_bufs[0]);
                cntblock = XFS_BUF_TO_BLOCK(cnt_cur->bc_bufs[0]);
 
-               XFS_WANT_CORRUPTED_RETURN(
+               XFS_WANT_CORRUPTED_RETURN(mp,
                        bnoblock->bb_numrecs == cntblock->bb_numrecs);
        }
 #endif
@@ -386,25 +391,25 @@ xfs_alloc_fixup_trees(
         */
        if ((error = xfs_btree_delete(cnt_cur, &i)))
                return error;
-       XFS_WANT_CORRUPTED_RETURN(i == 1);
+       XFS_WANT_CORRUPTED_RETURN(mp, i == 1);
        /*
         * Add new by-size btree entry(s).
         */
        if (nfbno1 != NULLAGBLOCK) {
                if ((error = xfs_alloc_lookup_eq(cnt_cur, nfbno1, nflen1, &i)))
                        return error;
-               XFS_WANT_CORRUPTED_RETURN(i == 0);
+               XFS_WANT_CORRUPTED_RETURN(mp, i == 0);
                if ((error = xfs_btree_insert(cnt_cur, &i)))
                        return error;
-               XFS_WANT_CORRUPTED_RETURN(i == 1);
+               XFS_WANT_CORRUPTED_RETURN(mp, i == 1);
        }
        if (nfbno2 != NULLAGBLOCK) {
                if ((error = xfs_alloc_lookup_eq(cnt_cur, nfbno2, nflen2, &i)))
                        return error;
-               XFS_WANT_CORRUPTED_RETURN(i == 0);
+               XFS_WANT_CORRUPTED_RETURN(mp, i == 0);
                if ((error = xfs_btree_insert(cnt_cur, &i)))
                        return error;
-               XFS_WANT_CORRUPTED_RETURN(i == 1);
+               XFS_WANT_CORRUPTED_RETURN(mp, i == 1);
        }
        /*
         * Fix up the by-block btree entry(s).
@@ -415,7 +420,7 @@ xfs_alloc_fixup_trees(
                 */
                if ((error = xfs_btree_delete(bno_cur, &i)))
                        return error;
-               XFS_WANT_CORRUPTED_RETURN(i == 1);
+               XFS_WANT_CORRUPTED_RETURN(mp, i == 1);
        } else {
                /*
                 * Update the by-block entry to start later|be shorter.
@@ -429,10 +434,10 @@ xfs_alloc_fixup_trees(
                 */
                if ((error = xfs_alloc_lookup_eq(bno_cur, nfbno2, nflen2, &i)))
                        return error;
-               XFS_WANT_CORRUPTED_RETURN(i == 0);
+               XFS_WANT_CORRUPTED_RETURN(mp, i == 0);
                if ((error = xfs_btree_insert(bno_cur, &i)))
                        return error;
-               XFS_WANT_CORRUPTED_RETURN(i == 1);
+               XFS_WANT_CORRUPTED_RETURN(mp, i == 1);
        }
        return 0;
 }
@@ -682,7 +687,7 @@ xfs_alloc_ag_vextent_exact(
        error = xfs_alloc_get_rec(bno_cur, &fbno, &flen, &i);
        if (error)
                goto error0;
-       XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+       XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0);
        ASSERT(fbno <= args->agbno);
 
        /*
@@ -783,7 +788,7 @@ xfs_alloc_find_best_extent(
                error = xfs_alloc_get_rec(*scur, sbno, slen, &i);
                if (error)
                        goto error0;
-               XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+               XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0);
                xfs_alloc_compute_aligned(args, *sbno, *slen, sbnoa, slena);
 
                /*
@@ -946,7 +951,7 @@ restart:
                                if ((error = xfs_alloc_get_rec(cnt_cur, &ltbno,
                                                &ltlen, &i)))
                                        goto error0;
-                               XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+                               XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0);
                                if (ltlen >= args->minlen)
                                        break;
                                if ((error = xfs_btree_increment(cnt_cur, 0, &i)))
@@ -966,7 +971,7 @@ restart:
                         */
                        if ((error = xfs_alloc_get_rec(cnt_cur, &ltbno, &ltlen, &i)))
                                goto error0;
-                       XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+                       XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0);
                        xfs_alloc_compute_aligned(args, ltbno, ltlen,
                                                  &ltbnoa, &ltlena);
                        if (ltlena < args->minlen)
@@ -999,7 +1004,7 @@ restart:
                cnt_cur->bc_ptrs[0] = besti;
                if ((error = xfs_alloc_get_rec(cnt_cur, &ltbno, &ltlen, &i)))
                        goto error0;
-               XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+               XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0);
                ASSERT(ltbno + ltlen <= be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length));
                args->len = blen;
                if (!xfs_alloc_fix_minleft(args)) {
@@ -1088,7 +1093,7 @@ restart:
                if (bno_cur_lt) {
                        if ((error = xfs_alloc_get_rec(bno_cur_lt, &ltbno, &ltlen, &i)))
                                goto error0;
-                       XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+                       XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0);
                        xfs_alloc_compute_aligned(args, ltbno, ltlen,
                                                  &ltbnoa, &ltlena);
                        if (ltlena >= args->minlen)
@@ -1104,7 +1109,7 @@ restart:
                if (bno_cur_gt) {
                        if ((error = xfs_alloc_get_rec(bno_cur_gt, &gtbno, &gtlen, &i)))
                                goto error0;
-                       XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+                       XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0);
                        xfs_alloc_compute_aligned(args, gtbno, gtlen,
                                                  &gtbnoa, &gtlena);
                        if (gtlena >= args->minlen)
@@ -1303,7 +1308,7 @@ restart:
                        error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen, &i);
                        if (error)
                                goto error0;
-                       XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+                       XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0);
 
                        xfs_alloc_compute_aligned(args, fbno, flen,
                                                  &rbno, &rlen);
@@ -1342,7 +1347,7 @@ restart:
         * This can't happen in the second case above.
         */
        rlen = XFS_EXTLEN_MIN(args->maxlen, rlen);
-       XFS_WANT_CORRUPTED_GOTO(rlen == 0 ||
+       XFS_WANT_CORRUPTED_GOTO(args->mp, rlen == 0 ||
                        (rlen <= flen && rbno + rlen <= fbno + flen), error0);
        if (rlen < args->maxlen) {
                xfs_agblock_t   bestfbno;
@@ -1362,13 +1367,13 @@ restart:
                        if ((error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen,
                                        &i)))
                                goto error0;
-                       XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+                       XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0);
                        if (flen < bestrlen)
                                break;
                        xfs_alloc_compute_aligned(args, fbno, flen,
                                                  &rbno, &rlen);
                        rlen = XFS_EXTLEN_MIN(args->maxlen, rlen);
-                       XFS_WANT_CORRUPTED_GOTO(rlen == 0 ||
+                       XFS_WANT_CORRUPTED_GOTO(args->mp, rlen == 0 ||
                                (rlen <= flen && rbno + rlen <= fbno + flen),
                                error0);
                        if (rlen > bestrlen) {
@@ -1383,7 +1388,7 @@ restart:
                if ((error = xfs_alloc_lookup_eq(cnt_cur, bestfbno, bestflen,
                                &i)))
                        goto error0;
-               XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+               XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0);
                rlen = bestrlen;
                rbno = bestrbno;
                flen = bestflen;
@@ -1408,7 +1413,7 @@ restart:
        if (!xfs_alloc_fix_minleft(args))
                goto out_nominleft;
        rlen = args->len;
-       XFS_WANT_CORRUPTED_GOTO(rlen <= flen, error0);
+       XFS_WANT_CORRUPTED_GOTO(args->mp, rlen <= flen, error0);
        /*
         * Allocate and initialize a cursor for the by-block tree.
         */
@@ -1422,7 +1427,7 @@ restart:
        cnt_cur = bno_cur = NULL;
        args->len = rlen;
        args->agbno = rbno;
-       XFS_WANT_CORRUPTED_GOTO(
+       XFS_WANT_CORRUPTED_GOTO(args->mp,
                args->agbno + args->len <=
                        be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length),
                error0);
@@ -1467,7 +1472,7 @@ xfs_alloc_ag_vextent_small(
        if (i) {
                if ((error = xfs_alloc_get_rec(ccur, &fbno, &flen, &i)))
                        goto error0;
-               XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+               XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0);
        }
        /*
         * Nothing in the btree, try the freelist.  Make sure
@@ -1493,7 +1498,7 @@ xfs_alloc_ag_vextent_small(
                        }
                        args->len = 1;
                        args->agbno = fbno;
-                       XFS_WANT_CORRUPTED_GOTO(
+                       XFS_WANT_CORRUPTED_GOTO(args->mp,
                                args->agbno + args->len <=
                                be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length),
                                error0);
@@ -1579,7 +1584,7 @@ xfs_free_ag_extent(
                 */
                if ((error = xfs_alloc_get_rec(bno_cur, &ltbno, &ltlen, &i)))
                        goto error0;
-               XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+               XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
                /*
                 * It's not contiguous, though.
                 */
@@ -1591,7 +1596,8 @@ xfs_free_ag_extent(
                         * space was invalid, it's (partly) already free.
                         * Very bad.
                         */
-                       XFS_WANT_CORRUPTED_GOTO(ltbno + ltlen <= bno, error0);
+                       XFS_WANT_CORRUPTED_GOTO(mp,
+                                               ltbno + ltlen <= bno, error0);
                }
        }
        /*
@@ -1606,7 +1612,7 @@ xfs_free_ag_extent(
                 */
                if ((error = xfs_alloc_get_rec(bno_cur, &gtbno, &gtlen, &i)))
                        goto error0;
-               XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+               XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
                /*
                 * It's not contiguous, though.
                 */
@@ -1618,7 +1624,7 @@ xfs_free_ag_extent(
                         * space was invalid, it's (partly) already free.
                         * Very bad.
                         */
-                       XFS_WANT_CORRUPTED_GOTO(gtbno >= bno + len, error0);
+                       XFS_WANT_CORRUPTED_GOTO(mp, gtbno >= bno + len, error0);
                }
        }
        /*
@@ -1635,31 +1641,31 @@ xfs_free_ag_extent(
                 */
                if ((error = xfs_alloc_lookup_eq(cnt_cur, ltbno, ltlen, &i)))
                        goto error0;
-               XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+               XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
                if ((error = xfs_btree_delete(cnt_cur, &i)))
                        goto error0;
-               XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+               XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
                /*
                 * Delete the old by-size entry on the right.
                 */
                if ((error = xfs_alloc_lookup_eq(cnt_cur, gtbno, gtlen, &i)))
                        goto error0;
-               XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+               XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
                if ((error = xfs_btree_delete(cnt_cur, &i)))
                        goto error0;
-               XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+               XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
                /*
                 * Delete the old by-block entry for the right block.
                 */
                if ((error = xfs_btree_delete(bno_cur, &i)))
                        goto error0;
-               XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+               XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
                /*
                 * Move the by-block cursor back to the left neighbor.
                 */
                if ((error = xfs_btree_decrement(bno_cur, 0, &i)))
                        goto error0;
-               XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+               XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
 #ifdef DEBUG
                /*
                 * Check that this is the right record: delete didn't
@@ -1672,7 +1678,7 @@ xfs_free_ag_extent(
                        if ((error = xfs_alloc_get_rec(bno_cur, &xxbno, &xxlen,
                                        &i)))
                                goto error0;
-                       XFS_WANT_CORRUPTED_GOTO(
+                       XFS_WANT_CORRUPTED_GOTO(mp,
                                i == 1 && xxbno == ltbno && xxlen == ltlen,
                                error0);
                }
@@ -1695,17 +1701,17 @@ xfs_free_ag_extent(
                 */
                if ((error = xfs_alloc_lookup_eq(cnt_cur, ltbno, ltlen, &i)))
                        goto error0;
-               XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+               XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
                if ((error = xfs_btree_delete(cnt_cur, &i)))
                        goto error0;
-               XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+               XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
                /*
                 * Back up the by-block cursor to the left neighbor, and
                 * update its length.
                 */
                if ((error = xfs_btree_decrement(bno_cur, 0, &i)))
                        goto error0;
-               XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+               XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
                nbno = ltbno;
                nlen = len + ltlen;
                if ((error = xfs_alloc_update(bno_cur, nbno, nlen)))
@@ -1721,10 +1727,10 @@ xfs_free_ag_extent(
                 */
                if ((error = xfs_alloc_lookup_eq(cnt_cur, gtbno, gtlen, &i)))
                        goto error0;
-               XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+               XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
                if ((error = xfs_btree_delete(cnt_cur, &i)))
                        goto error0;
-               XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+               XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
                /*
                 * Update the starting block and length of the right
                 * neighbor in the by-block tree.
@@ -1743,7 +1749,7 @@ xfs_free_ag_extent(
                nlen = len;
                if ((error = xfs_btree_insert(bno_cur, &i)))
                        goto error0;
-               XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+               XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
        }
        xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
        bno_cur = NULL;
@@ -1752,10 +1758,10 @@ xfs_free_ag_extent(
         */
        if ((error = xfs_alloc_lookup_eq(cnt_cur, nbno, nlen, &i)))
                goto error0;
-       XFS_WANT_CORRUPTED_GOTO(i == 0, error0);
+       XFS_WANT_CORRUPTED_GOTO(mp, i == 0, error0);
        if ((error = xfs_btree_insert(cnt_cur, &i)))
                goto error0;
-       XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+       XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
        xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
        cnt_cur = NULL;
 
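Most of the xfs_alloc.c churn above is mechanical: a struct xfs_mount pointer is threaded into the corruption-check macros so that the resulting error report can identify which filesystem hit the inconsistency rather than reporting against a NULL mount. A sketch of the post-series shape of the macro, per fs/xfs/xfs_error.h in the changed-file list (the exact body may differ slightly):

#define XFS_WANT_CORRUPTED_GOTO(mp, x, l)				\
	{								\
		int fs_is_ok = (x);					\
		ASSERT(fs_is_ok);					\
		if (unlikely(!fs_is_ok)) {				\
			XFS_ERROR_REPORT("XFS_WANT_CORRUPTED_GOTO",	\
					 XFS_ERRLEVEL_LOW, mp);		\
			error = -EFSCORRUPTED;				\
			goto l;						\
		}							\
	}

XFS_WANT_CORRUPTED_RETURN(mp, x) follows the same pattern but returns -EFSCORRUPTED directly instead of jumping to an error label.
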
diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c
index 15105db..04e79d5 100644
--- a/fs/xfs/libxfs/xfs_attr_leaf.c
+++ b/fs/xfs/libxfs/xfs_attr_leaf.c
@@ -86,8 +86,83 @@ STATIC void xfs_attr3_leaf_moveents(struct xfs_da_args *args,
                        int move_count);
 STATIC int xfs_attr_leaf_entsize(xfs_attr_leafblock_t *leaf, int index);
 
+/*
+ * attr3 block 'firstused' conversion helpers.
+ *
+ * firstused refers to the offset of the first used byte of the nameval region
+ * of an attr leaf block. The region starts at the tail of the block and expands
+ * backwards towards the middle. As such, firstused is initialized to the block
+ * size for an empty leaf block and is reduced from there.
+ *
+ * The attr3 block size is pegged to the fsb size and the maximum fsb is 64k.
+ * The in-core firstused field is 32-bit and thus supports the maximum fsb size.
+ * The on-disk field is only 16-bit, however, and overflows at 64k. Since this
+ * only occurs at exactly 64k, we use zero as a magic on-disk value to represent
+ * the attr block size. The following helpers manage the conversion between the
+ * in-core and on-disk formats.
+ */
+
+static void
+xfs_attr3_leaf_firstused_from_disk(
+       struct xfs_da_geometry          *geo,
+       struct xfs_attr3_icleaf_hdr     *to,
+       struct xfs_attr_leafblock       *from)
+{
+       struct xfs_attr3_leaf_hdr       *hdr3;
+
+       if (from->hdr.info.magic == cpu_to_be16(XFS_ATTR3_LEAF_MAGIC)) {
+               hdr3 = (struct xfs_attr3_leaf_hdr *) from;
+               to->firstused = be16_to_cpu(hdr3->firstused);
+       } else {
+               to->firstused = be16_to_cpu(from->hdr.firstused);
+       }
+
+       /*
+        * Convert from the magic fsb size value to actual blocksize. This
+        * should only occur for empty blocks when the block size overflows
+        * 16-bits.
+        */
+       if (to->firstused == XFS_ATTR3_LEAF_NULLOFF) {
+               ASSERT(!to->count && !to->usedbytes);
+               ASSERT(geo->blksize > USHRT_MAX);
+               to->firstused = geo->blksize;
+       }
+}
+
+static void
+xfs_attr3_leaf_firstused_to_disk(
+       struct xfs_da_geometry          *geo,
+       struct xfs_attr_leafblock       *to,
+       struct xfs_attr3_icleaf_hdr     *from)
+{
+       struct xfs_attr3_leaf_hdr       *hdr3;
+       uint32_t                        firstused;
+
+       /* magic value should only be seen on disk */
+       ASSERT(from->firstused != XFS_ATTR3_LEAF_NULLOFF);
+
+       /*
+        * Scale down the 32-bit in-core firstused value to the 16-bit on-disk
+        * value. This only overflows at the max supported value of 64k. Use the
+        * magic on-disk value to represent block size in this case.
+        */
+       firstused = from->firstused;
+       if (firstused > USHRT_MAX) {
+               ASSERT(from->firstused == geo->blksize);
+               firstused = XFS_ATTR3_LEAF_NULLOFF;
+       }
+
+       if (from->magic == XFS_ATTR3_LEAF_MAGIC) {
+               hdr3 = (struct xfs_attr3_leaf_hdr *) to;
+               hdr3->firstused = cpu_to_be16(firstused);
+       } else {
+               to->hdr.firstused = cpu_to_be16(firstused);
+       }
+}
+
 void
 xfs_attr3_leaf_hdr_from_disk(
+       struct xfs_da_geometry          *geo,
        struct xfs_attr3_icleaf_hdr     *to,
        struct xfs_attr_leafblock       *from)
 {
@@ -104,7 +179,7 @@ xfs_attr3_leaf_hdr_from_disk(
                to->magic = be16_to_cpu(hdr3->info.hdr.magic);
                to->count = be16_to_cpu(hdr3->count);
                to->usedbytes = be16_to_cpu(hdr3->usedbytes);
-               to->firstused = be16_to_cpu(hdr3->firstused);
+               xfs_attr3_leaf_firstused_from_disk(geo, to, from);
                to->holes = hdr3->holes;
 
                for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; i++) {
@@ -118,7 +193,7 @@ xfs_attr3_leaf_hdr_from_disk(
        to->magic = be16_to_cpu(from->hdr.info.magic);
        to->count = be16_to_cpu(from->hdr.count);
        to->usedbytes = be16_to_cpu(from->hdr.usedbytes);
-       to->firstused = be16_to_cpu(from->hdr.firstused);
+       xfs_attr3_leaf_firstused_from_disk(geo, to, from);
        to->holes = from->hdr.holes;
 
        for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; i++) {
@@ -129,10 +204,11 @@ xfs_attr3_leaf_hdr_from_disk(
 
 void
 xfs_attr3_leaf_hdr_to_disk(
+       struct xfs_da_geometry          *geo,
        struct xfs_attr_leafblock       *to,
        struct xfs_attr3_icleaf_hdr     *from)
 {
-       int     i;
+       int                             i;
 
        ASSERT(from->magic == XFS_ATTR_LEAF_MAGIC ||
               from->magic == XFS_ATTR3_LEAF_MAGIC);
@@ -145,7 +221,7 @@ xfs_attr3_leaf_hdr_to_disk(
                hdr3->info.hdr.magic = cpu_to_be16(from->magic);
                hdr3->count = cpu_to_be16(from->count);
                hdr3->usedbytes = cpu_to_be16(from->usedbytes);
-               hdr3->firstused = cpu_to_be16(from->firstused);
+               xfs_attr3_leaf_firstused_to_disk(geo, to, from);
                hdr3->holes = from->holes;
                hdr3->pad1 = 0;
 
@@ -160,7 +236,7 @@ xfs_attr3_leaf_hdr_to_disk(
        to->hdr.info.magic = cpu_to_be16(from->magic);
        to->hdr.count = cpu_to_be16(from->count);
        to->hdr.usedbytes = cpu_to_be16(from->usedbytes);
-       to->hdr.firstused = cpu_to_be16(from->firstused);
+       xfs_attr3_leaf_firstused_to_disk(geo, to, from);
        to->hdr.holes = from->holes;
        to->hdr.pad1 = 0;
 
@@ -178,7 +254,7 @@ xfs_attr3_leaf_verify(
        struct xfs_attr_leafblock *leaf = bp->b_addr;
        struct xfs_attr3_icleaf_hdr ichdr;
 
-       xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf);
+       xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &ichdr, leaf);
 
        if (xfs_sb_version_hascrc(&mp->m_sb)) {
                struct xfs_da3_node_hdr *hdr3 = bp->b_addr;
@@ -757,9 +833,10 @@ xfs_attr_shortform_allfit(
        struct xfs_attr3_icleaf_hdr leafhdr;
        int                     bytes;
        int                     i;
+       struct xfs_mount        *mp = bp->b_target->bt_mount;
 
        leaf = bp->b_addr;
-       xfs_attr3_leaf_hdr_from_disk(&leafhdr, leaf);
+       xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &leafhdr, leaf);
        entry = xfs_attr3_leaf_entryp(leaf);
 
        bytes = sizeof(struct xfs_attr_sf_hdr);
@@ -812,7 +889,7 @@ xfs_attr3_leaf_to_shortform(
        memcpy(tmpbuffer, bp->b_addr, args->geo->blksize);
 
        leaf = (xfs_attr_leafblock_t *)tmpbuffer;
-       xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf);
+       xfs_attr3_leaf_hdr_from_disk(args->geo, &ichdr, leaf);
        entry = xfs_attr3_leaf_entryp(leaf);
 
        /* XXX (dgc): buffer is about to be marked stale - why zero it? */
@@ -923,7 +1000,7 @@ xfs_attr3_leaf_to_node(
        btree = dp->d_ops->node_tree_p(node);
 
        leaf = bp2->b_addr;
-       xfs_attr3_leaf_hdr_from_disk(&icleafhdr, leaf);
+       xfs_attr3_leaf_hdr_from_disk(args->geo, &icleafhdr, leaf);
        entries = xfs_attr3_leaf_entryp(leaf);
 
        /* both on-disk, don't endian-flip twice */
@@ -988,7 +1065,7 @@ xfs_attr3_leaf_create(
        }
        ichdr.freemap[0].size = ichdr.firstused - ichdr.freemap[0].base;
 
-       xfs_attr3_leaf_hdr_to_disk(leaf, &ichdr);
+       xfs_attr3_leaf_hdr_to_disk(args->geo, leaf, &ichdr);
        xfs_trans_log_buf(args->trans, bp, 0, args->geo->blksize - 1);
 
        *bpp = bp;
@@ -1073,7 +1150,7 @@ xfs_attr3_leaf_add(
        trace_xfs_attr_leaf_add(args);
 
        leaf = bp->b_addr;
-       xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf);
+       xfs_attr3_leaf_hdr_from_disk(args->geo, &ichdr, leaf);
        ASSERT(args->index >= 0 && args->index <= ichdr.count);
        entsize = xfs_attr_leaf_newentsize(args, NULL);
 
@@ -1126,7 +1203,7 @@ xfs_attr3_leaf_add(
        tmp = xfs_attr3_leaf_add_work(bp, &ichdr, args, 0);
 
 out_log_hdr:
-       xfs_attr3_leaf_hdr_to_disk(leaf, &ichdr);
+       xfs_attr3_leaf_hdr_to_disk(args->geo, leaf, &ichdr);
        xfs_trans_log_buf(args->trans, bp,
                XFS_DA_LOGRANGE(leaf, &leaf->hdr,
                                xfs_attr3_leaf_hdr_size(leaf)));
@@ -1294,7 +1371,7 @@ xfs_attr3_leaf_compact(
                                                ichdr_dst->freemap[0].base;
 
        /* write the header back to initialise the underlying buffer */
-       xfs_attr3_leaf_hdr_to_disk(leaf_dst, ichdr_dst);
+       xfs_attr3_leaf_hdr_to_disk(args->geo, leaf_dst, ichdr_dst);
 
        /*
         * Copy all entry's in the same (sorted) order,
@@ -1344,9 +1421,10 @@ xfs_attr_leaf_order(
 {
        struct xfs_attr3_icleaf_hdr ichdr1;
        struct xfs_attr3_icleaf_hdr ichdr2;
+       struct xfs_mount *mp = leaf1_bp->b_target->bt_mount;
 
-       xfs_attr3_leaf_hdr_from_disk(&ichdr1, leaf1_bp->b_addr);
-       xfs_attr3_leaf_hdr_from_disk(&ichdr2, leaf2_bp->b_addr);
+       xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &ichdr1, leaf1_bp->b_addr);
+       xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &ichdr2, leaf2_bp->b_addr);
        return xfs_attr3_leaf_order(leaf1_bp, &ichdr1, leaf2_bp, &ichdr2);
 }
 
@@ -1388,8 +1466,8 @@ xfs_attr3_leaf_rebalance(
        ASSERT(blk2->magic == XFS_ATTR_LEAF_MAGIC);
        leaf1 = blk1->bp->b_addr;
        leaf2 = blk2->bp->b_addr;
-       xfs_attr3_leaf_hdr_from_disk(&ichdr1, leaf1);
-       xfs_attr3_leaf_hdr_from_disk(&ichdr2, leaf2);
+       xfs_attr3_leaf_hdr_from_disk(state->args->geo, &ichdr1, leaf1);
+       xfs_attr3_leaf_hdr_from_disk(state->args->geo, &ichdr2, leaf2);
        ASSERT(ichdr2.count == 0);
        args = state->args;
 
@@ -1490,8 +1568,8 @@ xfs_attr3_leaf_rebalance(
                                        ichdr1.count, count);
        }
 
-       xfs_attr3_leaf_hdr_to_disk(leaf1, &ichdr1);
-       xfs_attr3_leaf_hdr_to_disk(leaf2, &ichdr2);
+       xfs_attr3_leaf_hdr_to_disk(state->args->geo, leaf1, &ichdr1);
+       xfs_attr3_leaf_hdr_to_disk(state->args->geo, leaf2, &ichdr2);
        xfs_trans_log_buf(args->trans, blk1->bp, 0, args->geo->blksize - 1);
        xfs_trans_log_buf(args->trans, blk2->bp, 0, args->geo->blksize - 1);
 
@@ -1684,7 +1762,7 @@ xfs_attr3_leaf_toosmall(
         */
        blk = &state->path.blk[ state->path.active-1 ];
        leaf = blk->bp->b_addr;
-       xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf);
+       xfs_attr3_leaf_hdr_from_disk(state->args->geo, &ichdr, leaf);
        bytes = xfs_attr3_leaf_hdr_size(leaf) +
                ichdr.count * sizeof(xfs_attr_leaf_entry_t) +
                ichdr.usedbytes;
@@ -1740,7 +1818,7 @@ xfs_attr3_leaf_toosmall(
                if (error)
                        return error;
 
-               xfs_attr3_leaf_hdr_from_disk(&ichdr2, bp->b_addr);
+               xfs_attr3_leaf_hdr_from_disk(state->args->geo, &ichdr2, bp->b_addr);
 
                bytes = state->args->geo->blksize -
                        (state->args->geo->blksize >> 2) -
@@ -1805,7 +1883,7 @@ xfs_attr3_leaf_remove(
        trace_xfs_attr_leaf_remove(args);
 
        leaf = bp->b_addr;
-       xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf);
+       xfs_attr3_leaf_hdr_from_disk(args->geo, &ichdr, leaf);
 
        ASSERT(ichdr.count > 0 && ichdr.count < args->geo->blksize / 8);
        ASSERT(args->index >= 0 && args->index < ichdr.count);
@@ -1918,12 +1996,11 @@ xfs_attr3_leaf_remove(
                                tmp = be16_to_cpu(entry->nameidx);
                }
                ichdr.firstused = tmp;
-               if (!ichdr.firstused)
-                       ichdr.firstused = tmp - XFS_ATTR_LEAF_NAME_ALIGN;
+               ASSERT(ichdr.firstused != 0);
        } else {
                ichdr.holes = 1;        /* mark as needing compaction */
        }
-       xfs_attr3_leaf_hdr_to_disk(leaf, &ichdr);
+       xfs_attr3_leaf_hdr_to_disk(args->geo, leaf, &ichdr);
        xfs_trans_log_buf(args->trans, bp,
                          XFS_DA_LOGRANGE(leaf, &leaf->hdr,
                                          xfs_attr3_leaf_hdr_size(leaf)));
@@ -1957,8 +2034,8 @@ xfs_attr3_leaf_unbalance(
 
        drop_leaf = drop_blk->bp->b_addr;
        save_leaf = save_blk->bp->b_addr;
-       xfs_attr3_leaf_hdr_from_disk(&drophdr, drop_leaf);
-       xfs_attr3_leaf_hdr_from_disk(&savehdr, save_leaf);
+       xfs_attr3_leaf_hdr_from_disk(state->args->geo, &drophdr, drop_leaf);
+       xfs_attr3_leaf_hdr_from_disk(state->args->geo, &savehdr, save_leaf);
        entry = xfs_attr3_leaf_entryp(drop_leaf);
 
        /*
@@ -2012,7 +2089,7 @@ xfs_attr3_leaf_unbalance(
                tmphdr.firstused = state->args->geo->blksize;
 
                /* write the header to the temp buffer to initialise it */
-               xfs_attr3_leaf_hdr_to_disk(tmp_leaf, &tmphdr);
+               xfs_attr3_leaf_hdr_to_disk(state->args->geo, tmp_leaf, &tmphdr);
 
                if (xfs_attr3_leaf_order(save_blk->bp, &savehdr,
                                         drop_blk->bp, &drophdr)) {
@@ -2039,7 +2116,7 @@ xfs_attr3_leaf_unbalance(
                kmem_free(tmp_leaf);
        }
 
-       xfs_attr3_leaf_hdr_to_disk(save_leaf, &savehdr);
+       xfs_attr3_leaf_hdr_to_disk(state->args->geo, save_leaf, &savehdr);
        xfs_trans_log_buf(state->args->trans, save_blk->bp, 0,
                                           state->args->geo->blksize - 1);
 
@@ -2085,7 +2162,7 @@ xfs_attr3_leaf_lookup_int(
        trace_xfs_attr_leaf_lookup(args);
 
        leaf = bp->b_addr;
-       xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf);
+       xfs_attr3_leaf_hdr_from_disk(args->geo, &ichdr, leaf);
        entries = xfs_attr3_leaf_entryp(leaf);
        ASSERT(ichdr.count < args->geo->blksize / 8);
 
@@ -2190,7 +2267,7 @@ xfs_attr3_leaf_getvalue(
        int                     valuelen;
 
        leaf = bp->b_addr;
-       xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf);
+       xfs_attr3_leaf_hdr_from_disk(args->geo, &ichdr, leaf);
        ASSERT(ichdr.count < args->geo->blksize / 8);
        ASSERT(args->index < ichdr.count);
 
@@ -2391,8 +2468,9 @@ xfs_attr_leaf_lasthash(
 {
        struct xfs_attr3_icleaf_hdr ichdr;
        struct xfs_attr_leaf_entry *entries;
+       struct xfs_mount *mp = bp->b_target->bt_mount;
 
-       xfs_attr3_leaf_hdr_from_disk(&ichdr, bp->b_addr);
+       xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &ichdr, bp->b_addr);
        entries = xfs_attr3_leaf_entryp(bp->b_addr);
        if (count)
                *count = ichdr.count;
@@ -2486,7 +2564,7 @@ xfs_attr3_leaf_clearflag(
        ASSERT(entry->flags & XFS_ATTR_INCOMPLETE);
 
 #ifdef DEBUG
-       xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf);
+       xfs_attr3_leaf_hdr_from_disk(args->geo, &ichdr, leaf);
        ASSERT(args->index < ichdr.count);
        ASSERT(args->index >= 0);
 
@@ -2550,7 +2628,7 @@ xfs_attr3_leaf_setflag(
 
        leaf = bp->b_addr;
 #ifdef DEBUG
-       xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf);
+       xfs_attr3_leaf_hdr_from_disk(args->geo, &ichdr, leaf);
        ASSERT(args->index < ichdr.count);
        ASSERT(args->index >= 0);
 #endif
@@ -2629,11 +2707,11 @@ xfs_attr3_leaf_flipflags(
        entry2 = &xfs_attr3_leaf_entryp(leaf2)[args->index2];
 
 #ifdef DEBUG
-       xfs_attr3_leaf_hdr_from_disk(&ichdr1, leaf1);
+       xfs_attr3_leaf_hdr_from_disk(args->geo, &ichdr1, leaf1);
        ASSERT(args->index < ichdr1.count);
        ASSERT(args->index >= 0);
 
-       xfs_attr3_leaf_hdr_from_disk(&ichdr2, leaf2);
+       xfs_attr3_leaf_hdr_from_disk(args->geo, &ichdr2, leaf2);
        ASSERT(args->index2 < ichdr2.count);
        ASSERT(args->index2 >= 0);
 
diff --git a/fs/xfs/libxfs/xfs_attr_leaf.h b/fs/xfs/libxfs/xfs_attr_leaf.h
index e2929da..025c4b8 100644
--- a/fs/xfs/libxfs/xfs_attr_leaf.h
+++ b/fs/xfs/libxfs/xfs_attr_leaf.h
@@ -100,9 +100,11 @@ int        xfs_attr_leaf_newentsize(struct xfs_da_args *args, int *local);
 int    xfs_attr3_leaf_read(struct xfs_trans *tp, struct xfs_inode *dp,
                        xfs_dablk_t bno, xfs_daddr_t mappedbno,
                        struct xfs_buf **bpp);
-void   xfs_attr3_leaf_hdr_from_disk(struct xfs_attr3_icleaf_hdr *to,
+void   xfs_attr3_leaf_hdr_from_disk(struct xfs_da_geometry *geo,
+                                    struct xfs_attr3_icleaf_hdr *to,
                                     struct xfs_attr_leafblock *from);
-void   xfs_attr3_leaf_hdr_to_disk(struct xfs_attr_leafblock *to,
+void   xfs_attr3_leaf_hdr_to_disk(struct xfs_da_geometry *geo,
+                                  struct xfs_attr_leafblock *to,
                                   struct xfs_attr3_icleaf_hdr *from);
 
 #endif /* __XFS_ATTR_LEAF_H__ */
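
To make the firstused overflow handling concrete: on a 64k-block filesystem, an empty attr3 leaf initializes its in-core 32-bit firstused to the block size (65536), which does not fit in the 16-bit on-disk field. The conversion helpers therefore store the magic XFS_ATTR3_LEAF_NULLOFF value on disk and restore geo->blksize on the way back in. A standalone sketch of that round-trip (illustrative only; XFS_ATTR3_LEAF_NULLOFF is the new on-disk magic, defined as 0 in xfs_da_format.h by this series):

#include <stdint.h>
#include <limits.h>

#define XFS_ATTR3_LEAF_NULLOFF	0	/* "offset is the block size" magic */

/* scale the 32-bit in-core value down to the 16-bit on-disk field */
static uint16_t firstused_to_disk(uint32_t incore, uint32_t blksize)
{
	if (incore > USHRT_MAX)		/* only possible when blksize == 64k */
		return XFS_ATTR3_LEAF_NULLOFF;
	return (uint16_t)incore;
}

/* expand the on-disk value back to the in-core representation */
static uint32_t firstused_from_disk(uint16_t ondisk, uint32_t blksize)
{
	if (ondisk == XFS_ATTR3_LEAF_NULLOFF)	/* empty leaf, 64k block */
		return blksize;
	return ondisk;
}
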
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index 61ec015..aeffeaa 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -244,30 +244,6 @@ xfs_bmap_forkoff_reset(
        }
 }
 
-/*
- * Debug/sanity checking code
- */
-
-STATIC int
-xfs_bmap_sanity_check(
-       struct xfs_mount        *mp,
-       struct xfs_buf          *bp,
-       int                     level)
-{
-       struct xfs_btree_block  *block = XFS_BUF_TO_BLOCK(bp);
-
-       if (block->bb_magic != cpu_to_be32(XFS_BMAP_CRC_MAGIC) &&
-           block->bb_magic != cpu_to_be32(XFS_BMAP_MAGIC))
-               return 0;
-
-       if (be16_to_cpu(block->bb_level) != level ||
-           be16_to_cpu(block->bb_numrecs) == 0 ||
-           be16_to_cpu(block->bb_numrecs) > mp->m_bmap_dmxr[level != 0])
-               return 0;
-
-       return 1;
-}
-
 #ifdef DEBUG
 STATIC struct xfs_buf *
 xfs_bmap_get_bp(
@@ -410,9 +386,6 @@ xfs_bmap_check_leaf_extents(
                                goto error_norelse;
                }
                block = XFS_BUF_TO_BLOCK(bp);
-               XFS_WANT_CORRUPTED_GOTO(
-                       xfs_bmap_sanity_check(mp, bp, level),
-                       error0);
                if (level == 0)
                        break;
 
@@ -424,7 +397,8 @@ xfs_bmap_check_leaf_extents(
                xfs_check_block(block, mp, 0, 0);
                pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]);
                bno = be64_to_cpu(*pp);
-               XFS_WANT_CORRUPTED_GOTO(XFS_FSB_SANITY_CHECK(mp, bno), error0);
+               XFS_WANT_CORRUPTED_GOTO(mp,
+                                       XFS_FSB_SANITY_CHECK(mp, bno), error0);
                if (bp_release) {
                        bp_release = 0;
                        xfs_trans_brelse(NULL, bp);
@@ -1029,7 +1003,7 @@ xfs_bmap_add_attrfork_btree(
                if ((error = xfs_bmbt_lookup_ge(cur, 0, 0, 0, &stat)))
                        goto error0;
                /* must be at least one entry */
-               XFS_WANT_CORRUPTED_GOTO(stat == 1, error0);
+               XFS_WANT_CORRUPTED_GOTO(mp, stat == 1, error0);
                if ((error = xfs_btree_new_iroot(cur, flags, &stat)))
                        goto error0;
                if (stat == 0) {
@@ -1311,14 +1285,12 @@ xfs_bmap_read_extents(
                if (error)
                        return error;
                block = XFS_BUF_TO_BLOCK(bp);
-               XFS_WANT_CORRUPTED_GOTO(
-                       xfs_bmap_sanity_check(mp, bp, level),
-                       error0);
                if (level == 0)
                        break;
                pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]);
                bno = be64_to_cpu(*pp);
-               XFS_WANT_CORRUPTED_GOTO(XFS_FSB_SANITY_CHECK(mp, bno), error0);
+               XFS_WANT_CORRUPTED_GOTO(mp,
+                       XFS_FSB_SANITY_CHECK(mp, bno), error0);
                xfs_trans_brelse(tp, bp);
        }
        /*
@@ -1345,9 +1317,6 @@ xfs_bmap_read_extents(
                                XFS_ERRLEVEL_LOW, ip->i_mount, block);
                        goto error0;
                }
-               XFS_WANT_CORRUPTED_GOTO(
-                       xfs_bmap_sanity_check(mp, bp, 0),
-                       error0);
                /*
                 * Read-ahead the next leaf block, if any.
                 */
@@ -1755,7 +1724,9 @@ xfs_bmap_add_extent_delay_real(
        xfs_filblks_t           temp=0; /* value for da_new calculations */
        xfs_filblks_t           temp2=0;/* value for da_new calculations */
        int                     tmp_rval;       /* partial logging flags */
+       struct xfs_mount        *mp;
 
+       mp  = bma->tp ? bma->tp->t_mountp : NULL;
        ifp = XFS_IFORK_PTR(bma->ip, XFS_DATA_FORK);
 
        ASSERT(bma->idx >= 0);
@@ -1866,15 +1837,15 @@ xfs_bmap_add_extent_delay_real(
                                        RIGHT.br_blockcount, &i);
                        if (error)
                                goto done;
-                       XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+                       XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
                        error = xfs_btree_delete(bma->cur, &i);
                        if (error)
                                goto done;
-                       XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+                       XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
                        error = xfs_btree_decrement(bma->cur, 0, &i);
                        if (error)
                                goto done;
-                       XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+                       XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
                        error = xfs_bmbt_update(bma->cur, LEFT.br_startoff,
                                        LEFT.br_startblock,
                                        LEFT.br_blockcount +
@@ -1907,7 +1878,7 @@ xfs_bmap_add_extent_delay_real(
                                        &i);
                        if (error)
                                goto done;
-                       XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+                       XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
                        error = xfs_bmbt_update(bma->cur, LEFT.br_startoff,
                                        LEFT.br_startblock,
                                        LEFT.br_blockcount +
@@ -1938,7 +1909,7 @@ xfs_bmap_add_extent_delay_real(
                                        RIGHT.br_blockcount, &i);
                        if (error)
                                goto done;
-                       XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+                       XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
                        error = xfs_bmbt_update(bma->cur, PREV.br_startoff,
                                        new->br_startblock,
                                        PREV.br_blockcount +
@@ -1968,12 +1939,12 @@ xfs_bmap_add_extent_delay_real(
                                        &i);
                        if (error)
                                goto done;
-                       XFS_WANT_CORRUPTED_GOTO(i == 0, done);
+                       XFS_WANT_CORRUPTED_GOTO(mp, i == 0, done);
                        bma->cur->bc_rec.b.br_state = XFS_EXT_NORM;
                        error = xfs_btree_insert(bma->cur, &i);
                        if (error)
                                goto done;
-                       XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+                       XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
                }
                break;
 
@@ -2001,7 +1972,7 @@ xfs_bmap_add_extent_delay_real(
                                        &i);
                        if (error)
                                goto done;
-                       XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+                       XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
                        error = xfs_bmbt_update(bma->cur, LEFT.br_startoff,
                                        LEFT.br_startblock,
                                        LEFT.br_blockcount +
@@ -2038,12 +2009,12 @@ xfs_bmap_add_extent_delay_real(
                                        &i);
                        if (error)
                                goto done;
-                       XFS_WANT_CORRUPTED_GOTO(i == 0, done);
+                       XFS_WANT_CORRUPTED_GOTO(mp, i == 0, done);
                        bma->cur->bc_rec.b.br_state = XFS_EXT_NORM;
                        error = xfs_btree_insert(bma->cur, &i);
                        if (error)
                                goto done;
-                       XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+                       XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
                }
 
                if (xfs_bmap_needs_btree(bma->ip, XFS_DATA_FORK)) {
@@ -2084,7 +2055,7 @@ xfs_bmap_add_extent_delay_real(
                                        RIGHT.br_blockcount, &i);
                        if (error)
                                goto done;
-                       XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+                       XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
                        error = xfs_bmbt_update(bma->cur, new->br_startoff,
                                        new->br_startblock,
                                        new->br_blockcount +
@@ -2122,12 +2093,12 @@ xfs_bmap_add_extent_delay_real(
                                        &i);
                        if (error)
                                goto done;
-                       XFS_WANT_CORRUPTED_GOTO(i == 0, done);
+                       XFS_WANT_CORRUPTED_GOTO(mp, i == 0, done);
                        bma->cur->bc_rec.b.br_state = XFS_EXT_NORM;
                        error = xfs_btree_insert(bma->cur, &i);
                        if (error)
                                goto done;
-                       XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+                       XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
                }
 
                if (xfs_bmap_needs_btree(bma->ip, XFS_DATA_FORK)) {
@@ -2191,12 +2162,12 @@ xfs_bmap_add_extent_delay_real(
                                        &i);
                        if (error)
                                goto done;
-                       XFS_WANT_CORRUPTED_GOTO(i == 0, done);
+                       XFS_WANT_CORRUPTED_GOTO(mp, i == 0, done);
                        bma->cur->bc_rec.b.br_state = XFS_EXT_NORM;
                        error = xfs_btree_insert(bma->cur, &i);
                        if (error)
                                goto done;
-                       XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+                       XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
                }
 
                if (xfs_bmap_needs_btree(bma->ip, XFS_DATA_FORK)) {
@@ -2212,9 +2183,8 @@ xfs_bmap_add_extent_delay_real(
                diff = (int)(temp + temp2 - startblockval(PREV.br_startblock) -
                        (bma->cur ? bma->cur->bc_private.b.allocated : 0));
                if (diff > 0) {
-                       error = xfs_icsb_modify_counters(bma->ip->i_mount,
-                                       XFS_SBS_FDBLOCKS,
-                                       -((int64_t)diff), 0);
+                       error = xfs_mod_fdblocks(bma->ip->i_mount,
+                                                -((int64_t)diff), false);
                        ASSERT(!error);
                        if (error)
                                goto done;
@@ -2265,9 +2235,8 @@ xfs_bmap_add_extent_delay_real(
                        temp += bma->cur->bc_private.b.allocated;
                ASSERT(temp <= da_old);
                if (temp < da_old)
-                       xfs_icsb_modify_counters(bma->ip->i_mount,
-                                       XFS_SBS_FDBLOCKS,
-                                       (int64_t)(da_old - temp), 0);
+                       xfs_mod_fdblocks(bma->ip->i_mount,
+                                       (int64_t)(da_old - temp), false);
        }
 
        /* clear out the allocated field, done with it now in any case. */
@@ -2309,6 +2278,7 @@ xfs_bmap_add_extent_unwritten_real(
                                        /* left is 0, right is 1, prev is 2 */
        int                     rval=0; /* return value (logging flags) */
        int                     state = 0;/* state bits, accessed thru macros */
+       struct xfs_mount        *mp = tp->t_mountp;
 
        *logflagsp = 0;
 
@@ -2421,19 +2391,19 @@ xfs_bmap_add_extent_unwritten_real(
                                        RIGHT.br_startblock,
                                        RIGHT.br_blockcount, &i)))
                                goto done;
-                       XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+                       XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
                        if ((error = xfs_btree_delete(cur, &i)))
                                goto done;
-                       XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+                       XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
                        if ((error = xfs_btree_decrement(cur, 0, &i)))
                                goto done;
-                       XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+                       XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
                        if ((error = xfs_btree_delete(cur, &i)))
                                goto done;
-                       XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+                       XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
                        if ((error = xfs_btree_decrement(cur, 0, &i)))
                                goto done;
-                       XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+                       XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
                        if ((error = xfs_bmbt_update(cur, LEFT.br_startoff,
                                LEFT.br_startblock,
                                LEFT.br_blockcount + PREV.br_blockcount +
@@ -2464,13 +2434,13 @@ xfs_bmap_add_extent_unwritten_real(
                                        PREV.br_startblock, PREV.br_blockcount,
                                        &i)))
                                goto done;
-                       XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+                       XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
                        if ((error = xfs_btree_delete(cur, &i)))
                                goto done;
-                       XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+                       XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
                        if ((error = xfs_btree_decrement(cur, 0, &i)))
                                goto done;
-                       XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+                       XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
                        if ((error = xfs_bmbt_update(cur, LEFT.br_startoff,
                                LEFT.br_startblock,
                                LEFT.br_blockcount + PREV.br_blockcount,
@@ -2499,13 +2469,13 @@ xfs_bmap_add_extent_unwritten_real(
                                        RIGHT.br_startblock,
                                        RIGHT.br_blockcount, &i)))
                                goto done;
-                       XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+                       XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
                        if ((error = xfs_btree_delete(cur, &i)))
                                goto done;
-                       XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+                       XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
                        if ((error = xfs_btree_decrement(cur, 0, &i)))
                                goto done;
-                       XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+                       XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
                        if ((error = xfs_bmbt_update(cur, new->br_startoff,
                                new->br_startblock,
                                new->br_blockcount + RIGHT.br_blockcount,
@@ -2532,7 +2502,7 @@ xfs_bmap_add_extent_unwritten_real(
                                        new->br_startblock, new->br_blockcount,
                                        &i)))
                                goto done;
-                       XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+                       XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
                        if ((error = xfs_bmbt_update(cur, new->br_startoff,
                                new->br_startblock, new->br_blockcount,
                                newext)))
@@ -2569,7 +2539,7 @@ xfs_bmap_add_extent_unwritten_real(
                                        PREV.br_startblock, PREV.br_blockcount,
                                        &i)))
                                goto done;
-                       XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+                       XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
                        if ((error = xfs_bmbt_update(cur,
                                PREV.br_startoff + new->br_blockcount,
                                PREV.br_startblock + new->br_blockcount,
@@ -2611,7 +2581,7 @@ xfs_bmap_add_extent_unwritten_real(
                                        PREV.br_startblock, PREV.br_blockcount,
                                        &i)))
                                goto done;
-                       XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+                       XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
                        if ((error = xfs_bmbt_update(cur,
                                PREV.br_startoff + new->br_blockcount,
                                PREV.br_startblock + new->br_blockcount,
@@ -2621,7 +2591,7 @@ xfs_bmap_add_extent_unwritten_real(
                        cur->bc_rec.b = *new;
                        if ((error = xfs_btree_insert(cur, &i)))
                                goto done;
-                       XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+                       XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
                }
                break;
 
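Every XFS_WANT_CORRUPTED_GOTO()/XFS_WANT_CORRUPTED_RETURN() call site in this series grows a struct xfs_mount argument so that the corruption report can name the filesystem that tripped the check. As a rough sketch of what the reworked macro looks like (the real definition lives in fs/xfs/xfs_error.h, which this merge also touches; this version is inferred from the call sites and assumes a local 'error' variable in scope):

	#define XFS_WANT_CORRUPTED_GOTO(mp, x, l)			\
	{								\
		int fs_is_ok = (x);					\
		ASSERT(fs_is_ok);					\
		if (unlikely(!fs_is_ok)) {				\
			XFS_ERROR_REPORT("XFS_WANT_CORRUPTED_GOTO",	\
					 XFS_ERRLEVEL_LOW, (mp));	\
			error = -EFSCORRUPTED;				\
			goto l;						\
		}							\
	}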
@@ -2651,7 +2621,7 @@ xfs_bmap_add_extent_unwritten_real(
                                        PREV.br_startblock,
                                        PREV.br_blockcount, &i)))
                                goto done;
-                       XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+                       XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
                        if ((error = xfs_bmbt_update(cur, PREV.br_startoff,
                                PREV.br_startblock,
                                PREV.br_blockcount - new->br_blockcount,
@@ -2689,7 +2659,7 @@ xfs_bmap_add_extent_unwritten_real(
                                        PREV.br_startblock, PREV.br_blockcount,
                                        &i)))
                                goto done;
-                       XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+                       XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
                        if ((error = xfs_bmbt_update(cur, PREV.br_startoff,
                                PREV.br_startblock,
                                PREV.br_blockcount - new->br_blockcount,
@@ -2699,11 +2669,11 @@ xfs_bmap_add_extent_unwritten_real(
                                        new->br_startblock, new->br_blockcount,
                                        &i)))
                                goto done;
-                       XFS_WANT_CORRUPTED_GOTO(i == 0, done);
+                       XFS_WANT_CORRUPTED_GOTO(mp, i == 0, done);
                        cur->bc_rec.b.br_state = XFS_EXT_NORM;
                        if ((error = xfs_btree_insert(cur, &i)))
                                goto done;
-                       XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+                       XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
                }
                break;
 
@@ -2737,7 +2707,7 @@ xfs_bmap_add_extent_unwritten_real(
                                        PREV.br_startblock, PREV.br_blockcount,
                                        &i)))
                                goto done;
-                       XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+                       XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
                        /* new right extent - oldext */
                        if ((error = xfs_bmbt_update(cur, r[1].br_startoff,
                                r[1].br_startblock, r[1].br_blockcount,
@@ -2749,7 +2719,7 @@ xfs_bmap_add_extent_unwritten_real(
                                new->br_startoff - PREV.br_startoff;
                        if ((error = xfs_btree_insert(cur, &i)))
                                goto done;
-                       XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+                       XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
                        /*
                         * Reset the cursor to the position of the new extent
                         * we are about to insert as we can't trust it after
@@ -2759,12 +2729,12 @@ xfs_bmap_add_extent_unwritten_real(
                                        new->br_startblock, new->br_blockcount,
                                        &i)))
                                goto done;
-                       XFS_WANT_CORRUPTED_GOTO(i == 0, done);
+                       XFS_WANT_CORRUPTED_GOTO(mp, i == 0, done);
                        /* new middle extent - newext */
                        cur->bc_rec.b.br_state = new->br_state;
                        if ((error = xfs_btree_insert(cur, &i)))
                                goto done;
-                       XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+                       XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
                }
                break;
 
@@ -2944,8 +2914,8 @@ xfs_bmap_add_extent_hole_delay(
        }
        if (oldlen != newlen) {
                ASSERT(oldlen > newlen);
-               xfs_icsb_modify_counters(ip->i_mount, XFS_SBS_FDBLOCKS,
-                       (int64_t)(oldlen - newlen), 0);
+               xfs_mod_fdblocks(ip->i_mount, (int64_t)(oldlen - newlen),
+                                false);
                /*
                 * Nothing to do for disk quota accounting here.
                 */
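From here on, the open-coded xfs_icsb_modify_counters()/xfs_mod_incore_sb() calls against XFS_SBS_FDBLOCKS and XFS_SBS_FREXTENTS give way to dedicated helpers on the new generic per-cpu counters. A hedged sketch of the interface as used in this file (signatures inferred from the call sites; 'rsvd' selects whether the reserved block pool may be dipped into when the free-space count would otherwise go negative):

	/* Inferred from usage in this patch, not quoted from it: */
	int xfs_mod_fdblocks(struct xfs_mount *mp, int64_t delta, bool rsvd);
	int xfs_mod_frextents(struct xfs_mount *mp, int64_t delta);

Both return 0 on success or a negative errno (typically -ENOSPC) when a decrement cannot be satisfied, which is why the reservation paths below check the result and unwind.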
@@ -2968,7 +2938,9 @@ xfs_bmap_add_extent_hole_real(
        xfs_bmbt_irec_t         right;  /* right neighbor extent entry */
        int                     rval=0; /* return value (logging flags) */
        int                     state;  /* state bits, accessed thru macros */
+       struct xfs_mount        *mp;
 
+       mp = bma->tp ? bma->tp->t_mountp : NULL;
        ifp = XFS_IFORK_PTR(bma->ip, whichfork);
 
        ASSERT(bma->idx >= 0);
@@ -3056,15 +3028,15 @@ xfs_bmap_add_extent_hole_real(
                                        &i);
                        if (error)
                                goto done;
-                       XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+                       XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
                        error = xfs_btree_delete(bma->cur, &i);
                        if (error)
                                goto done;
-                       XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+                       XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
                        error = xfs_btree_decrement(bma->cur, 0, &i);
                        if (error)
                                goto done;
-                       XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+                       XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
                        error = xfs_bmbt_update(bma->cur, left.br_startoff,
                                        left.br_startblock,
                                        left.br_blockcount +
@@ -3097,7 +3069,7 @@ xfs_bmap_add_extent_hole_real(
                                        &i);
                        if (error)
                                goto done;
-                       XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+                       XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
                        error = xfs_bmbt_update(bma->cur, left.br_startoff,
                                        left.br_startblock,
                                        left.br_blockcount +
@@ -3131,7 +3103,7 @@ xfs_bmap_add_extent_hole_real(
                                        right.br_blockcount, &i);
                        if (error)
                                goto done;
-                       XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+                       XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
                        error = xfs_bmbt_update(bma->cur, new->br_startoff,
                                        new->br_startblock,
                                        new->br_blockcount +
@@ -3161,12 +3133,12 @@ xfs_bmap_add_extent_hole_real(
                                        new->br_blockcount, &i);
                        if (error)
                                goto done;
-                       XFS_WANT_CORRUPTED_GOTO(i == 0, done);
+                       XFS_WANT_CORRUPTED_GOTO(mp, i == 0, done);
                        bma->cur->bc_rec.b.br_state = new->br_state;
                        error = xfs_btree_insert(bma->cur, &i);
                        if (error)
                                goto done;
-                       XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+                       XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
                }
                break;
        }
@@ -4160,18 +4132,15 @@ xfs_bmapi_reserve_delalloc(
        ASSERT(indlen > 0);
 
        if (rt) {
-               error = xfs_mod_incore_sb(mp, XFS_SBS_FREXTENTS,
-                                         -((int64_t)extsz), 0);
+               error = xfs_mod_frextents(mp, -((int64_t)extsz));
        } else {
-               error = xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS,
-                                                -((int64_t)alen), 0);
+               error = xfs_mod_fdblocks(mp, -((int64_t)alen), false);
        }
 
        if (error)
                goto out_unreserve_quota;
 
-       error = xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS,
-                                        -((int64_t)indlen), 0);
+       error = xfs_mod_fdblocks(mp, -((int64_t)indlen), false);
        if (error)
                goto out_unreserve_blocks;
 
@@ -4198,9 +4167,9 @@ xfs_bmapi_reserve_delalloc(
 
 out_unreserve_blocks:
        if (rt)
-               xfs_mod_incore_sb(mp, XFS_SBS_FREXTENTS, extsz, 0);
+               xfs_mod_frextents(mp, extsz);
        else
-               xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS, alen, 0);
+               xfs_mod_fdblocks(mp, alen, false);
 out_unreserve_quota:
        if (XFS_IS_QUOTA_ON(mp))
                xfs_trans_unreserve_quota_nblks(NULL, ip, (long)alen, 0, rt ?
@@ -4801,7 +4770,7 @@ xfs_bmap_del_extent(
                                        got.br_startblock, got.br_blockcount,
                                        &i)))
                                goto done;
-                       XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+                       XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
                }
                da_old = da_new = 0;
        } else {
@@ -4835,7 +4804,7 @@ xfs_bmap_del_extent(
                }
                if ((error = xfs_btree_delete(cur, &i)))
                        goto done;
-               XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+               XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
                break;
 
        case 2:
@@ -4935,7 +4904,8 @@ xfs_bmap_del_extent(
                                                        got.br_startblock,
                                                        temp, &i)))
                                                goto done;
-                                       XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+                                       XFS_WANT_CORRUPTED_GOTO(mp,
+                                                               i == 1, done);
                                        /*
                                         * Update the btree record back
                                         * to the original value.
@@ -4956,7 +4926,7 @@ xfs_bmap_del_extent(
                                        error = -ENOSPC;
                                        goto done;
                                }
-                               XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+                               XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
                        } else
                                flags |= xfs_ilog_fext(whichfork);
                        XFS_IFORK_NEXT_SET(ip, whichfork,
@@ -5012,10 +4982,8 @@ xfs_bmap_del_extent(
         * Nothing to do for disk quota accounting here.
         */
        ASSERT(da_old >= da_new);
-       if (da_old > da_new) {
-               xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS,
-                       (int64_t)(da_old - da_new), 0);
-       }
+       if (da_old > da_new)
+               xfs_mod_fdblocks(mp, (int64_t)(da_old - da_new), false);
 done:
        *logflagsp = flags;
        return error;
@@ -5284,14 +5252,13 @@ xfs_bunmapi(
 
                                rtexts = XFS_FSB_TO_B(mp, del.br_blockcount);
                                do_div(rtexts, mp->m_sb.sb_rextsize);
-                               xfs_mod_incore_sb(mp, XFS_SBS_FREXTENTS,
-                                               (int64_t)rtexts, 0);
+                               xfs_mod_frextents(mp, (int64_t)rtexts);
                                (void)xfs_trans_reserve_quota_nblks(NULL,
                                        ip, -((long)del.br_blockcount), 0,
                                        XFS_QMOPT_RES_RTBLKS);
                        } else {
-                               xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS,
-                                               (int64_t)del.br_blockcount, 0);
+                               xfs_mod_fdblocks(mp, (int64_t)del.br_blockcount,
+                                                false);
                                (void)xfs_trans_reserve_quota_nblks(NULL,
                                        ip, -((long)del.br_blockcount), 0,
                                        XFS_QMOPT_RES_REGBLKS);
@@ -5453,6 +5420,7 @@ xfs_bmse_merge(
        struct xfs_bmbt_irec            left;
        xfs_filblks_t                   blockcount;
        int                             error, i;
+       struct xfs_mount                *mp = ip->i_mount;
 
        xfs_bmbt_get_all(gotp, &got);
        xfs_bmbt_get_all(leftp, &left);
@@ -5487,19 +5455,19 @@ xfs_bmse_merge(
                                   got.br_blockcount, &i);
        if (error)
                return error;
-       XFS_WANT_CORRUPTED_RETURN(i == 1);
+       XFS_WANT_CORRUPTED_RETURN(mp, i == 1);
 
        error = xfs_btree_delete(cur, &i);
        if (error)
                return error;
-       XFS_WANT_CORRUPTED_RETURN(i == 1);
+       XFS_WANT_CORRUPTED_RETURN(mp, i == 1);
 
        /* lookup and update size of the previous extent */
        error = xfs_bmbt_lookup_eq(cur, left.br_startoff, left.br_startblock,
                                   left.br_blockcount, &i);
        if (error)
                return error;
-       XFS_WANT_CORRUPTED_RETURN(i == 1);
+       XFS_WANT_CORRUPTED_RETURN(mp, i == 1);
 
        left.br_blockcount = blockcount;
 
@@ -5518,50 +5486,92 @@ xfs_bmse_shift_one(
        int                             *current_ext,
        struct xfs_bmbt_rec_host        *gotp,
        struct xfs_btree_cur            *cur,
-       int                             *logflags)
+       int                             *logflags,
+       enum shift_direction            direction)
 {
        struct xfs_ifork                *ifp;
+       struct xfs_mount                *mp;
        xfs_fileoff_t                   startoff;
-       struct xfs_bmbt_rec_host        *leftp;
+       struct xfs_bmbt_rec_host        *adj_irecp;
        struct xfs_bmbt_irec            got;
-       struct xfs_bmbt_irec            left;
+       struct xfs_bmbt_irec            adj_irec;
        int                             error;
        int                             i;
+       int                             total_extents;
 
+       mp = ip->i_mount;
        ifp = XFS_IFORK_PTR(ip, whichfork);
+       total_extents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t);
 
        xfs_bmbt_get_all(gotp, &got);
-       startoff = got.br_startoff - offset_shift_fsb;
 
        /* delalloc extents should be prevented by caller */
-       XFS_WANT_CORRUPTED_RETURN(!isnullstartblock(got.br_startblock));
+       XFS_WANT_CORRUPTED_RETURN(mp, !isnullstartblock(got.br_startblock));
 
-       /*
-        * Check for merge if we've got an extent to the left, otherwise make
-        * sure there's enough room at the start of the file for the shift.
-        */
-       if (*current_ext) {
-               /* grab the left extent and check for a large enough hole */
-               leftp = xfs_iext_get_ext(ifp, *current_ext - 1);
-               xfs_bmbt_get_all(leftp, &left);
+       if (direction == SHIFT_LEFT) {
+               startoff = got.br_startoff - offset_shift_fsb;
+
+               /*
+                * Check for merge if we've got an extent to the left,
+                * otherwise make sure there's enough room at the start
+                * of the file for the shift.
+                */
+               if (!*current_ext) {
+                       if (got.br_startoff < offset_shift_fsb)
+                               return -EINVAL;
+                       goto update_current_ext;
+               }
+               /*
+                * grab the left extent and check for a large
+                * enough hole.
+                */
+               adj_irecp = xfs_iext_get_ext(ifp, *current_ext - 1);
+               xfs_bmbt_get_all(adj_irecp, &adj_irec);
 
-               if (startoff < left.br_startoff + left.br_blockcount)
+               if (startoff <
+                   adj_irec.br_startoff + adj_irec.br_blockcount)
                        return -EINVAL;
 
                /* check whether to merge the extent or shift it down */
-               if (xfs_bmse_can_merge(&left, &got, offset_shift_fsb)) {
+               if (xfs_bmse_can_merge(&adj_irec, &got,
+                                      offset_shift_fsb)) {
                        return xfs_bmse_merge(ip, whichfork, offset_shift_fsb,
-                                             *current_ext, gotp, leftp, cur,
-                                             logflags);
+                                             *current_ext, gotp, adj_irecp,
+                                             cur, logflags);
                }
-       } else if (got.br_startoff < offset_shift_fsb)
-               return -EINVAL;
-
+       } else {
+               startoff = got.br_startoff + offset_shift_fsb;
+               /* nothing to move if this is the last extent */
+               if (*current_ext >= (total_extents - 1))
+                       goto update_current_ext;
+               /*
+                * If this is not the last extent in the file, make sure there
+                * is enough room between the current extent and the next one
+                * to accommodate the shift.
+                */
+               adj_irecp = xfs_iext_get_ext(ifp, *current_ext + 1);
+               xfs_bmbt_get_all(adj_irecp, &adj_irec);
+               if (startoff + got.br_blockcount > adj_irec.br_startoff)
+                       return -EINVAL;
+               /*
+                * Unlike a left shift (which involves a hole punch),
+                * a right shift does not modify extent neighbors
+                * in any way. We should never find mergeable extents
+                * in this scenario. Check anyway and warn if we
+                * encounter two extents that could be one.
+                */
+               if (xfs_bmse_can_merge(&got, &adj_irec, offset_shift_fsb))
+                       WARN_ON_ONCE(1);
+       }
        /*
         * Increment the extent index for the next iteration, update the start
         * offset of the in-core extent and update the btree if applicable.
         */
-       (*current_ext)++;
+update_current_ext:
+       if (direction == SHIFT_LEFT)
+               (*current_ext)++;
+       else
+               (*current_ext)--;
        xfs_bmbt_set_startoff(gotp, startoff);
        *logflags |= XFS_ILOG_CORE;
        if (!cur) {
@@ -5573,18 +5583,18 @@ xfs_bmse_shift_one(
                                   got.br_blockcount, &i);
        if (error)
                return error;
-       XFS_WANT_CORRUPTED_RETURN(i == 1);
+       XFS_WANT_CORRUPTED_RETURN(mp, i == 1);
 
        got.br_startoff = startoff;
        return xfs_bmbt_update(cur, got.br_startoff, got.br_startblock,
-                               got.br_blockcount, got.br_state);
+                              got.br_blockcount, got.br_state);
 }
 
 /*
- * Shift extent records to the left to cover a hole.
+ * Shift extent records to the left/right to cover/create a hole.
  *
  * The maximum number of extents to be shifted in a single operation is
- * @num_exts. @start_fsb specifies the file offset to start the shift and the
+ * @num_exts. @stop_fsb specifies the file offset at which to stop the shift and the
  * file offset where we've left off is returned in @next_fsb. @offset_shift_fsb
  * is the length by which each extent is shifted. If there is no hole to shift
 * the extents into, this will be considered an invalid operation and we abort
@@ -5594,12 +5604,13 @@ int
 xfs_bmap_shift_extents(
        struct xfs_trans        *tp,
        struct xfs_inode        *ip,
-       xfs_fileoff_t           start_fsb,
+       xfs_fileoff_t           *next_fsb,
        xfs_fileoff_t           offset_shift_fsb,
        int                     *done,
-       xfs_fileoff_t           *next_fsb,
+       xfs_fileoff_t           stop_fsb,
        xfs_fsblock_t           *firstblock,
        struct xfs_bmap_free    *flist,
+       enum shift_direction    direction,
        int                     num_exts)
 {
        struct xfs_btree_cur            *cur = NULL;
@@ -5609,10 +5620,11 @@ xfs_bmap_shift_extents(
        struct xfs_ifork                *ifp;
        xfs_extnum_t                    nexts = 0;
        xfs_extnum_t                    current_ext;
+       xfs_extnum_t                    total_extents;
+       xfs_extnum_t                    stop_extent;
        int                             error = 0;
        int                             whichfork = XFS_DATA_FORK;
        int                             logflags = 0;
-       int                             total_extents;
 
        if (unlikely(XFS_TEST_ERROR(
            (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS &&
@@ -5628,6 +5640,8 @@ xfs_bmap_shift_extents(
 
        ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
        ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+       ASSERT(direction == SHIFT_LEFT || direction == SHIFT_RIGHT);
+       ASSERT(*next_fsb != NULLFSBLOCK || direction == SHIFT_RIGHT);
 
        ifp = XFS_IFORK_PTR(ip, whichfork);
        if (!(ifp->if_flags & XFS_IFEXTENTS)) {
@@ -5645,43 +5659,83 @@ xfs_bmap_shift_extents(
        }
 
        /*
+        * There may be delalloc extents in the data fork before the range we
+        * are collapsing out, so we cannot use the count of real extents here.
+        * Instead we have to calculate it from the incore fork.
+        */
+       total_extents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t);
+       if (total_extents == 0) {
+               *done = 1;
+               goto del_cursor;
+       }
+
+       /*
+        * In the case of the first right shift, we need to initialize next_fsb.
+        */
+       if (*next_fsb == NULLFSBLOCK) {
+               gotp = xfs_iext_get_ext(ifp, total_extents - 1);
+               xfs_bmbt_get_all(gotp, &got);
+               *next_fsb = got.br_startoff;
+               if (stop_fsb > *next_fsb) {
+                       *done = 1;
+                       goto del_cursor;
+               }
+       }
+
+       /* Look up the extent index at which we have to stop */
+       if (direction == SHIFT_RIGHT) {
+               gotp = xfs_iext_bno_to_ext(ifp, stop_fsb, &stop_extent);
+               /* Make stop_extent exclusive of shift range */
+               stop_extent--;
+       } else
+               stop_extent = total_extents;
+
+       /*
         * Look up the extent index for the fsb where we start shifting. We can
         * henceforth iterate with current_ext as extent list changes are locked
         * out via ilock.
         *
         * gotp can be null in 2 cases: 1) if there are no extents or 2)
-        * start_fsb lies in a hole beyond which there are no extents. Either
+        * *next_fsb lies in a hole beyond which there are no extents. Either
         * way, we are done.
         */
-       gotp = xfs_iext_bno_to_ext(ifp, start_fsb, &current_ext);
+       gotp = xfs_iext_bno_to_ext(ifp, *next_fsb, &current_ext);
        if (!gotp) {
                *done = 1;
                goto del_cursor;
        }
 
-       /*
-        * There may be delalloc extents in the data fork before the range we
-        * are collapsing out, so we cannot use the count of real extents here.
-        * Instead we have to calculate it from the incore fork.
-        */
-       total_extents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t);
-       while (nexts++ < num_exts && current_ext < total_extents) {
+       /* some sanity checking before we finally start shifting extents */
+       if ((direction == SHIFT_LEFT && current_ext >= stop_extent) ||
+            (direction == SHIFT_RIGHT && current_ext <= stop_extent)) {
+               error = -EIO;
+               goto del_cursor;
+       }
+
+       while (nexts++ < num_exts) {
                error = xfs_bmse_shift_one(ip, whichfork, offset_shift_fsb,
-                                       &current_ext, gotp, cur, &logflags);
+                                          &current_ext, gotp, cur, &logflags,
+                                          direction);
                if (error)
                        goto del_cursor;
+               /*
+                * If there was an extent merge during the shift, the extent
+                * count can change. Update the total and grab the next record.
+                */
+               if (direction == SHIFT_LEFT) {
+                       total_extents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t);
+                       stop_extent = total_extents;
+               }
 
-               /* update total extent count and grab the next record */
-               total_extents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t);
-               if (current_ext >= total_extents)
+               if (current_ext == stop_extent) {
+                       *done = 1;
+                       *next_fsb = NULLFSBLOCK;
                        break;
+               }
                gotp = xfs_iext_get_ext(ifp, current_ext);
        }
 
-       /* Check if we are done */
-       if (current_ext == total_extents) {
-               *done = 1;
-       } else if (next_fsb) {
+       if (!*done) {
                xfs_bmbt_get_all(gotp, &got);
                *next_fsb = got.br_startoff;
        }
@@ -5696,3 +5750,189 @@ del_cursor:
 
        return error;
 }
+
+/*
+ * Split an extent into two extents at the split_fsb block, so that
+ * split_fsb becomes the first block of the new (second) extent.
+ * @current_ext is the target extent to be split. @split_fsb is the
+ * block where the extent is split. If split_fsb lies in a hole or at
+ * the first block of an extent, just return 0.
+ */
+STATIC int
+xfs_bmap_split_extent_at(
+       struct xfs_trans        *tp,
+       struct xfs_inode        *ip,
+       xfs_fileoff_t           split_fsb,
+       xfs_fsblock_t           *firstfsb,
+       struct xfs_bmap_free    *free_list)
+{
+       int                             whichfork = XFS_DATA_FORK;
+       struct xfs_btree_cur            *cur = NULL;
+       struct xfs_bmbt_rec_host        *gotp;
+       struct xfs_bmbt_irec            got;
+       struct xfs_bmbt_irec            new; /* split extent */
+       struct xfs_mount                *mp = ip->i_mount;
+       struct xfs_ifork                *ifp;
+       xfs_fsblock_t                   gotblkcnt; /* new block count for got */
+       xfs_extnum_t                    current_ext;
+       int                             error = 0;
+       int                             logflags = 0;
+       int                             i = 0;
+
+       if (unlikely(XFS_TEST_ERROR(
+           (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS &&
+            XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE),
+            mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) {
+               XFS_ERROR_REPORT("xfs_bmap_split_extent_at",
+                                XFS_ERRLEVEL_LOW, mp);
+               return -EFSCORRUPTED;
+       }
+
+       if (XFS_FORCED_SHUTDOWN(mp))
+               return -EIO;
+
+       ifp = XFS_IFORK_PTR(ip, whichfork);
+       if (!(ifp->if_flags & XFS_IFEXTENTS)) {
+               /* Read in all the extents */
+               error = xfs_iread_extents(tp, ip, whichfork);
+               if (error)
+                       return error;
+       }
+
+       /*
+        * gotp can be null in 2 cases: 1) if there are no extents
+        * or 2) split_fsb lies in a hole beyond which there are
+        * no extents. Either way, we are done.
+        */
+       gotp = xfs_iext_bno_to_ext(ifp, split_fsb, &current_ext);
+       if (!gotp)
+               return 0;
+
+       xfs_bmbt_get_all(gotp, &got);
+
+       /*
+        * Check whether split_fsb lies in a hole or at the start boundary
+        * offset of the extent.
+        */
+       if (got.br_startoff >= split_fsb)
+               return 0;
+
+       gotblkcnt = split_fsb - got.br_startoff;
+       new.br_startoff = split_fsb;
+       new.br_startblock = got.br_startblock + gotblkcnt;
+       new.br_blockcount = got.br_blockcount - gotblkcnt;
+       new.br_state = got.br_state;
+
+       if (ifp->if_flags & XFS_IFBROOT) {
+               cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork);
+               cur->bc_private.b.firstblock = *firstfsb;
+               cur->bc_private.b.flist = free_list;
+               cur->bc_private.b.flags = 0;
+               error = xfs_bmbt_lookup_eq(cur, got.br_startoff,
+                               got.br_startblock,
+                               got.br_blockcount,
+                               &i);
+               if (error)
+                       goto del_cursor;
+               XFS_WANT_CORRUPTED_GOTO(mp, i == 1, del_cursor);
+       }
+
+       xfs_bmbt_set_blockcount(gotp, gotblkcnt);
+       got.br_blockcount = gotblkcnt;
+
+       logflags = XFS_ILOG_CORE;
+       if (cur) {
+               error = xfs_bmbt_update(cur, got.br_startoff,
+                               got.br_startblock,
+                               got.br_blockcount,
+                               got.br_state);
+               if (error)
+                       goto del_cursor;
+       } else
+               logflags |= XFS_ILOG_DEXT;
+
+       /* Add new extent */
+       current_ext++;
+       xfs_iext_insert(ip, current_ext, 1, &new, 0);
+       XFS_IFORK_NEXT_SET(ip, whichfork,
+                          XFS_IFORK_NEXTENTS(ip, whichfork) + 1);
+
+       if (cur) {
+               error = xfs_bmbt_lookup_eq(cur, new.br_startoff,
+                               new.br_startblock, new.br_blockcount,
+                               &i);
+               if (error)
+                       goto del_cursor;
+               XFS_WANT_CORRUPTED_GOTO(mp, i == 0, del_cursor);
+               cur->bc_rec.b.br_state = new.br_state;
+
+               error = xfs_btree_insert(cur, &i);
+               if (error)
+                       goto del_cursor;
+               XFS_WANT_CORRUPTED_GOTO(mp, i == 1, del_cursor);
+       }
+
+       /*
+        * Convert to a btree if necessary.
+        */
+       if (xfs_bmap_needs_btree(ip, whichfork)) {
+               int tmp_logflags; /* partial log flag return val */
+
+               ASSERT(cur == NULL);
+               error = xfs_bmap_extents_to_btree(tp, ip, firstfsb, free_list,
+                               &cur, 0, &tmp_logflags, whichfork);
+               logflags |= tmp_logflags;
+       }
+
+del_cursor:
+       if (cur) {
+               cur->bc_private.b.allocated = 0;
+               xfs_btree_del_cursor(cur,
+                               error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
+       }
+
+       if (logflags)
+               xfs_trans_log_inode(tp, ip, logflags);
+       return error;
+}
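To make the split arithmetic above concrete (hypothetical numbers, not taken from the patch): if the found extent covers file blocks [100, 150) starting at disk block 5000 and split_fsb is 120, then:

	gotblkcnt         = 120 - 100;		/* = 20 blocks kept in the left half    */
	new.br_startoff   = 120;		/* right half starts at the split point */
	new.br_startblock = 5000 + 20;		/* = 5020                               */
	new.br_blockcount = 50 - 20;		/* = 30                                 */
	new.br_state      = got.br_state;	/* written/unwritten state carried over */

The trimmed left half is logged via xfs_bmbt_update() while the right half is inserted as a brand-new record, bumping the fork's extent count by one.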
+
+int
+xfs_bmap_split_extent(
+       struct xfs_inode        *ip,
+       xfs_fileoff_t           split_fsb)
+{
+       struct xfs_mount        *mp = ip->i_mount;
+       struct xfs_trans        *tp;
+       struct xfs_bmap_free    free_list;
+       xfs_fsblock_t           firstfsb;
+       int                     committed;
+       int                     error;
+
+       tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
+       error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write,
+                       XFS_DIOSTRAT_SPACE_RES(mp, 0), 0);
+       if (error) {
+               xfs_trans_cancel(tp, 0);
+               return error;
+       }
+
+       xfs_ilock(ip, XFS_ILOCK_EXCL);
+       xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+
+       xfs_bmap_init(&free_list, &firstfsb);
+
+       error = xfs_bmap_split_extent_at(tp, ip, split_fsb,
+                       &firstfsb, &free_list);
+       if (error)
+               goto out;
+
+       error = xfs_bmap_finish(&tp, &free_list, &committed);
+       if (error)
+               goto out;
+
+       return xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+
+out:
+       xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
+       return error;
+}
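xfs_bmap_split_extent() follows the usual XFS transaction shape: allocate and reserve a transaction, take the ilock and join the inode, run the low-level bmap change, finish the deferred free list, then commit (or cancel with abort on any failure). A hedged sketch of how an insert-range caller is expected to drive the two new primitives together (the loop structure is illustrative; the VFS-facing plumbing lives in fs/xfs/xfs_bmap_util.c and fs/open.c elsewhere in this merge):

	/* Split the extent straddling the insertion offset... */
	error = xfs_bmap_split_extent(ip, stop_fsb);
	if (error)
		return error;

	/* ...then shift everything at or beyond it to the right. */
	next_fsb = NULLFSBLOCK;	/* first SHIFT_RIGHT pass seeds this itself */
	while (!error && !done)
		error = xfs_bmap_shift_extents(tp, ip, &next_fsb, shift_fsb,
					       &done, stop_fsb, &first_block,
					       &free_list, SHIFT_RIGHT,
					       XFS_BMAP_MAX_SHIFT_EXTENTS);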
index b9d8a49..6aaa0c1 100644
@@ -166,6 +166,11 @@ static inline void xfs_bmap_init(xfs_bmap_free_t *flp, xfs_fsblock_t *fbp)
  */
 #define XFS_BMAP_MAX_SHIFT_EXTENTS     1
 
+enum shift_direction {
+       SHIFT_LEFT = 0,
+       SHIFT_RIGHT,
+};
+
 #ifdef DEBUG
 void   xfs_bmap_trace_exlist(struct xfs_inode *ip, xfs_extnum_t cnt,
                int whichfork, unsigned long caller_ip);
@@ -211,8 +216,10 @@ int        xfs_check_nostate_extents(struct xfs_ifork *ifp, xfs_extnum_t idx,
                xfs_extnum_t num);
 uint   xfs_default_attroffset(struct xfs_inode *ip);
 int    xfs_bmap_shift_extents(struct xfs_trans *tp, struct xfs_inode *ip,
-               xfs_fileoff_t start_fsb, xfs_fileoff_t offset_shift_fsb,
-               int *done, xfs_fileoff_t *next_fsb, xfs_fsblock_t *firstblock,
-               struct xfs_bmap_free *flist, int num_exts);
+               xfs_fileoff_t *next_fsb, xfs_fileoff_t offset_shift_fsb,
+               int *done, xfs_fileoff_t stop_fsb, xfs_fsblock_t *firstblock,
+               struct xfs_bmap_free *flist, enum shift_direction direction,
+               int num_exts);
+int    xfs_bmap_split_extent(struct xfs_inode *ip, xfs_fileoff_t split_offset);
 
 #endif /* __XFS_BMAP_H__ */
index 81cad43..c72283d 100644
@@ -168,7 +168,7 @@ xfs_btree_check_lptr(
        xfs_fsblock_t           bno,    /* btree block disk address */
        int                     level)  /* btree block level */
 {
-       XFS_WANT_CORRUPTED_RETURN(
+       XFS_WANT_CORRUPTED_RETURN(cur->bc_mp,
                level > 0 &&
                bno != NULLFSBLOCK &&
                XFS_FSB_SANITY_CHECK(cur->bc_mp, bno));
@@ -187,7 +187,7 @@ xfs_btree_check_sptr(
 {
        xfs_agblock_t           agblocks = cur->bc_mp->m_sb.sb_agblocks;
 
-       XFS_WANT_CORRUPTED_RETURN(
+       XFS_WANT_CORRUPTED_RETURN(cur->bc_mp,
                level > 0 &&
                bno != NULLAGBLOCK &&
                bno != 0 &&
@@ -1825,7 +1825,7 @@ xfs_btree_lookup(
                        error = xfs_btree_increment(cur, 0, &i);
                        if (error)
                                goto error0;
-                       XFS_WANT_CORRUPTED_RETURN(i == 1);
+                       XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, i == 1);
                        XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
                        *stat = 1;
                        return 0;
@@ -2285,7 +2285,7 @@ xfs_btree_rshift(
        if (error)
                goto error0;
        i = xfs_btree_lastrec(tcur, level);
-       XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+       XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0);
 
        error = xfs_btree_increment(tcur, level, &i);
        if (error)
@@ -3138,7 +3138,7 @@ xfs_btree_insert(
                        goto error0;
                }
 
-               XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+               XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0);
                level++;
 
                /*
@@ -3582,15 +3582,15 @@ xfs_btree_delrec(
                 * Actually any entry but the first would suffice.
                 */
                i = xfs_btree_lastrec(tcur, level);
-               XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+               XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0);
 
                error = xfs_btree_increment(tcur, level, &i);
                if (error)
                        goto error0;
-               XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+               XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0);
 
                i = xfs_btree_lastrec(tcur, level);
-               XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+               XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0);
 
                /* Grab a pointer to the block. */
                right = xfs_btree_get_block(tcur, level, &rbp);
@@ -3634,12 +3634,12 @@ xfs_btree_delrec(
                rrecs = xfs_btree_get_numrecs(right);
                if (!xfs_btree_ptr_is_null(cur, &lptr)) {
                        i = xfs_btree_firstrec(tcur, level);
-                       XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+                       XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0);
 
                        error = xfs_btree_decrement(tcur, level, &i);
                        if (error)
                                goto error0;
-                       XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+                       XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0);
                }
        }
 
@@ -3653,13 +3653,13 @@ xfs_btree_delrec(
                 * previous block.
                 */
                i = xfs_btree_firstrec(tcur, level);
-               XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+               XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0);
 
                error = xfs_btree_decrement(tcur, level, &i);
                if (error)
                        goto error0;
                i = xfs_btree_firstrec(tcur, level);
-               XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+               XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0);
 
                /* Grab a pointer to the block. */
                left = xfs_btree_get_block(tcur, level, &lbp);
index 9cb0115..2385f8c 100644
@@ -538,12 +538,12 @@ xfs_da3_root_split(
        oldroot = blk1->bp->b_addr;
        if (oldroot->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC) ||
            oldroot->hdr.info.magic == cpu_to_be16(XFS_DA3_NODE_MAGIC)) {
-               struct xfs_da3_icnode_hdr nodehdr;
+               struct xfs_da3_icnode_hdr icnodehdr;
 
-               dp->d_ops->node_hdr_from_disk(&nodehdr, oldroot);
+               dp->d_ops->node_hdr_from_disk(&icnodehdr, oldroot);
                btree = dp->d_ops->node_tree_p(oldroot);
-               size = (int)((char *)&btree[nodehdr.count] - (char *)oldroot);
-               level = nodehdr.level;
+               size = (int)((char *)&btree[icnodehdr.count] - (char *)oldroot);
+               level = icnodehdr.level;
 
                /*
                 * we are about to copy oldroot to bp, so set up the type
index 0a49b02..74bcbab 100644
@@ -725,7 +725,13 @@ struct xfs_attr3_icleaf_hdr {
        __uint16_t      magic;
        __uint16_t      count;
        __uint16_t      usedbytes;
-       __uint16_t      firstused;
+       /*
+        * firstused is 32-bit here instead of 16-bit like the on-disk variant
+        * to support a maximum fsb size of 64k without overflow issues throughout
+        * the attr code. Instead, the overflow condition is handled on
+        * conversion to/from disk.
+        */
+       __uint32_t      firstused;
        __u8            holes;
        struct {
                __uint16_t      base;
@@ -734,6 +740,12 @@ struct xfs_attr3_icleaf_hdr {
 };
 
 /*
+ * Special value to represent fs block size in the leaf header firstused field.
+ * Only used when the block size overflows the 2 bytes available on disk.
+ */
+#define XFS_ATTR3_LEAF_NULLOFF 0
+
+/*
  * Flags used in the leaf_entry[i].flags field.
  * NOTE: the INCOMPLETE bit must not collide with the flags bits specified
  * on the system call, they are "or"ed together for various operations.
index 5ff31be..de1ea16 100644
@@ -89,7 +89,7 @@ __xfs_dir3_data_check(
                 * so just ensure that the count falls somewhere inside the
                 * block right now.
                 */
-               XFS_WANT_CORRUPTED_RETURN(be32_to_cpu(btp->count) <
+               XFS_WANT_CORRUPTED_RETURN(mp, be32_to_cpu(btp->count) <
                        ((char *)btp - p) / sizeof(struct xfs_dir2_leaf_entry));
                break;
        case cpu_to_be32(XFS_DIR3_DATA_MAGIC):
@@ -107,21 +107,21 @@ __xfs_dir3_data_check(
        bf = ops->data_bestfree_p(hdr);
        count = lastfree = freeseen = 0;
        if (!bf[0].length) {
-               XFS_WANT_CORRUPTED_RETURN(!bf[0].offset);
+               XFS_WANT_CORRUPTED_RETURN(mp, !bf[0].offset);
                freeseen |= 1 << 0;
        }
        if (!bf[1].length) {
-               XFS_WANT_CORRUPTED_RETURN(!bf[1].offset);
+               XFS_WANT_CORRUPTED_RETURN(mp, !bf[1].offset);
                freeseen |= 1 << 1;
        }
        if (!bf[2].length) {
-               XFS_WANT_CORRUPTED_RETURN(!bf[2].offset);
+               XFS_WANT_CORRUPTED_RETURN(mp, !bf[2].offset);
                freeseen |= 1 << 2;
        }
 
-       XFS_WANT_CORRUPTED_RETURN(be16_to_cpu(bf[0].length) >=
+       XFS_WANT_CORRUPTED_RETURN(mp, be16_to_cpu(bf[0].length) >=
                                                be16_to_cpu(bf[1].length));
-       XFS_WANT_CORRUPTED_RETURN(be16_to_cpu(bf[1].length) >=
+       XFS_WANT_CORRUPTED_RETURN(mp, be16_to_cpu(bf[1].length) >=
                                                be16_to_cpu(bf[2].length));
        /*
         * Loop over the data/unused entries.
@@ -134,18 +134,18 @@ __xfs_dir3_data_check(
                 * doesn't need to be there.
                 */
                if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) {
-                       XFS_WANT_CORRUPTED_RETURN(lastfree == 0);
-                       XFS_WANT_CORRUPTED_RETURN(
+                       XFS_WANT_CORRUPTED_RETURN(mp, lastfree == 0);
+                       XFS_WANT_CORRUPTED_RETURN(mp,
                                be16_to_cpu(*xfs_dir2_data_unused_tag_p(dup)) ==
                                               (char *)dup - (char *)hdr);
                        dfp = xfs_dir2_data_freefind(hdr, bf, dup);
                        if (dfp) {
                                i = (int)(dfp - bf);
-                               XFS_WANT_CORRUPTED_RETURN(
+                               XFS_WANT_CORRUPTED_RETURN(mp,
                                        (freeseen & (1 << i)) == 0);
                                freeseen |= 1 << i;
                        } else {
-                               XFS_WANT_CORRUPTED_RETURN(
+                               XFS_WANT_CORRUPTED_RETURN(mp,
                                        be16_to_cpu(dup->length) <=
                                                be16_to_cpu(bf[2].length));
                        }
@@ -160,13 +160,13 @@ __xfs_dir3_data_check(
                 * The linear search is crude but this is DEBUG code.
                 */
                dep = (xfs_dir2_data_entry_t *)p;
-               XFS_WANT_CORRUPTED_RETURN(dep->namelen != 0);
-               XFS_WANT_CORRUPTED_RETURN(
+               XFS_WANT_CORRUPTED_RETURN(mp, dep->namelen != 0);
+               XFS_WANT_CORRUPTED_RETURN(mp,
                        !xfs_dir_ino_validate(mp, be64_to_cpu(dep->inumber)));
-               XFS_WANT_CORRUPTED_RETURN(
+               XFS_WANT_CORRUPTED_RETURN(mp,
                        be16_to_cpu(*ops->data_entry_tag_p(dep)) ==
                                               (char *)dep - (char *)hdr);
-               XFS_WANT_CORRUPTED_RETURN(
+               XFS_WANT_CORRUPTED_RETURN(mp,
                                ops->data_get_ftype(dep) < XFS_DIR3_FT_MAX);
                count++;
                lastfree = 0;
@@ -183,14 +183,15 @@ __xfs_dir3_data_check(
                                    be32_to_cpu(lep[i].hashval) == hash)
                                        break;
                        }
-                       XFS_WANT_CORRUPTED_RETURN(i < be32_to_cpu(btp->count));
+                       XFS_WANT_CORRUPTED_RETURN(mp,
+                                                 i < be32_to_cpu(btp->count));
                }
                p += ops->data_entsize(dep->namelen);
        }
        /*
         * Need to have seen all the entries and all the bestfree slots.
         */
-       XFS_WANT_CORRUPTED_RETURN(freeseen == 7);
+       XFS_WANT_CORRUPTED_RETURN(mp, freeseen == 7);
        if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) ||
            hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)) {
                for (i = stale = 0; i < be32_to_cpu(btp->count); i++) {
@@ -198,13 +199,13 @@ __xfs_dir3_data_check(
                            cpu_to_be32(XFS_DIR2_NULL_DATAPTR))
                                stale++;
                        if (i > 0)
-                               XFS_WANT_CORRUPTED_RETURN(
+                               XFS_WANT_CORRUPTED_RETURN(mp,
                                        be32_to_cpu(lep[i].hashval) >=
                                                be32_to_cpu(lep[i - 1].hashval));
                }
-               XFS_WANT_CORRUPTED_RETURN(count ==
+               XFS_WANT_CORRUPTED_RETURN(mp, count ==
                        be32_to_cpu(btp->count) - be32_to_cpu(btp->stale));
-               XFS_WANT_CORRUPTED_RETURN(stale == be32_to_cpu(btp->stale));
+               XFS_WANT_CORRUPTED_RETURN(mp, stale == be32_to_cpu(btp->stale));
        }
        return 0;
 }
index 8eb7189..4daaa66 100644
@@ -264,68 +264,6 @@ typedef struct xfs_dsb {
        /* must be padded to 64 bit alignment */
 } xfs_dsb_t;
 
-/*
- * Sequence number values for the fields.
- */
-typedef enum {
-       XFS_SBS_MAGICNUM, XFS_SBS_BLOCKSIZE, XFS_SBS_DBLOCKS, XFS_SBS_RBLOCKS,
-       XFS_SBS_REXTENTS, XFS_SBS_UUID, XFS_SBS_LOGSTART, XFS_SBS_ROOTINO,
-       XFS_SBS_RBMINO, XFS_SBS_RSUMINO, XFS_SBS_REXTSIZE, XFS_SBS_AGBLOCKS,
-       XFS_SBS_AGCOUNT, XFS_SBS_RBMBLOCKS, XFS_SBS_LOGBLOCKS,
-       XFS_SBS_VERSIONNUM, XFS_SBS_SECTSIZE, XFS_SBS_INODESIZE,
-       XFS_SBS_INOPBLOCK, XFS_SBS_FNAME, XFS_SBS_BLOCKLOG,
-       XFS_SBS_SECTLOG, XFS_SBS_INODELOG, XFS_SBS_INOPBLOG, XFS_SBS_AGBLKLOG,
-       XFS_SBS_REXTSLOG, XFS_SBS_INPROGRESS, XFS_SBS_IMAX_PCT, XFS_SBS_ICOUNT,
-       XFS_SBS_IFREE, XFS_SBS_FDBLOCKS, XFS_SBS_FREXTENTS, XFS_SBS_UQUOTINO,
-       XFS_SBS_GQUOTINO, XFS_SBS_QFLAGS, XFS_SBS_FLAGS, XFS_SBS_SHARED_VN,
-       XFS_SBS_INOALIGNMT, XFS_SBS_UNIT, XFS_SBS_WIDTH, XFS_SBS_DIRBLKLOG,
-       XFS_SBS_LOGSECTLOG, XFS_SBS_LOGSECTSIZE, XFS_SBS_LOGSUNIT,
-       XFS_SBS_FEATURES2, XFS_SBS_BAD_FEATURES2, XFS_SBS_FEATURES_COMPAT,
-       XFS_SBS_FEATURES_RO_COMPAT, XFS_SBS_FEATURES_INCOMPAT,
-       XFS_SBS_FEATURES_LOG_INCOMPAT, XFS_SBS_CRC, XFS_SBS_PAD,
-       XFS_SBS_PQUOTINO, XFS_SBS_LSN,
-       XFS_SBS_FIELDCOUNT
-} xfs_sb_field_t;
-
-/*
- * Mask values, defined based on the xfs_sb_field_t values.
- * Only define the ones we're using.
- */
-#define        XFS_SB_MVAL(x)          (1LL << XFS_SBS_ ## x)
-#define        XFS_SB_UUID             XFS_SB_MVAL(UUID)
-#define        XFS_SB_FNAME            XFS_SB_MVAL(FNAME)
-#define        XFS_SB_ROOTINO          XFS_SB_MVAL(ROOTINO)
-#define        XFS_SB_RBMINO           XFS_SB_MVAL(RBMINO)
-#define        XFS_SB_RSUMINO          XFS_SB_MVAL(RSUMINO)
-#define        XFS_SB_VERSIONNUM       XFS_SB_MVAL(VERSIONNUM)
-#define XFS_SB_UQUOTINO                XFS_SB_MVAL(UQUOTINO)
-#define XFS_SB_GQUOTINO                XFS_SB_MVAL(GQUOTINO)
-#define XFS_SB_QFLAGS          XFS_SB_MVAL(QFLAGS)
-#define XFS_SB_SHARED_VN       XFS_SB_MVAL(SHARED_VN)
-#define XFS_SB_UNIT            XFS_SB_MVAL(UNIT)
-#define XFS_SB_WIDTH           XFS_SB_MVAL(WIDTH)
-#define XFS_SB_ICOUNT          XFS_SB_MVAL(ICOUNT)
-#define XFS_SB_IFREE           XFS_SB_MVAL(IFREE)
-#define XFS_SB_FDBLOCKS                XFS_SB_MVAL(FDBLOCKS)
-#define XFS_SB_FEATURES2       (XFS_SB_MVAL(FEATURES2) | \
-                                XFS_SB_MVAL(BAD_FEATURES2))
-#define XFS_SB_FEATURES_COMPAT XFS_SB_MVAL(FEATURES_COMPAT)
-#define XFS_SB_FEATURES_RO_COMPAT XFS_SB_MVAL(FEATURES_RO_COMPAT)
-#define XFS_SB_FEATURES_INCOMPAT XFS_SB_MVAL(FEATURES_INCOMPAT)
-#define XFS_SB_FEATURES_LOG_INCOMPAT XFS_SB_MVAL(FEATURES_LOG_INCOMPAT)
-#define XFS_SB_CRC             XFS_SB_MVAL(CRC)
-#define XFS_SB_PQUOTINO                XFS_SB_MVAL(PQUOTINO)
-#define        XFS_SB_NUM_BITS         ((int)XFS_SBS_FIELDCOUNT)
-#define        XFS_SB_ALL_BITS         ((1LL << XFS_SB_NUM_BITS) - 1)
-#define        XFS_SB_MOD_BITS         \
-       (XFS_SB_UUID | XFS_SB_ROOTINO | XFS_SB_RBMINO | XFS_SB_RSUMINO | \
-        XFS_SB_VERSIONNUM | XFS_SB_UQUOTINO | XFS_SB_GQUOTINO | \
-        XFS_SB_QFLAGS | XFS_SB_SHARED_VN | XFS_SB_UNIT | XFS_SB_WIDTH | \
-        XFS_SB_ICOUNT | XFS_SB_IFREE | XFS_SB_FDBLOCKS | XFS_SB_FEATURES2 | \
-        XFS_SB_FEATURES_COMPAT | XFS_SB_FEATURES_RO_COMPAT | \
-        XFS_SB_FEATURES_INCOMPAT | XFS_SB_FEATURES_LOG_INCOMPAT | \
-        XFS_SB_PQUOTINO)
-
 
 /*
  * Misc. Flags - warning - these will be cleared by xfs_repair unless
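The xfs_sb_field_t enum and the XFS_SB_* masks can be deleted outright: the hot counters (sb_icount, sb_ifree, sb_fdblocks) are no longer modified through per-field IDs but live in generic per-cpu counters hanging off struct xfs_mount. A minimal sketch of the replacement pattern, assuming the stock <linux/percpu_counter.h> API that the later hunks in this merge use directly:

	#include <linux/percpu_counter.h>

	/* Fast, approximate read - fine for allocation heuristics: */
	s64 icount = percpu_counter_read(&mp->m_icount);

	/* Exact fold across all CPUs - for when the value must be precise,
	 * e.g. before the superblock is written back to disk: */
	s64 exact = percpu_counter_sum(&mp->m_icount);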
index 116ef1d..07349a1 100644
@@ -376,7 +376,8 @@ xfs_ialloc_ag_alloc(
         */
        newlen = args.mp->m_ialloc_inos;
        if (args.mp->m_maxicount &&
-           args.mp->m_sb.sb_icount + newlen > args.mp->m_maxicount)
+           percpu_counter_read(&args.mp->m_icount) + newlen >
+                                                       args.mp->m_maxicount)
                return -ENOSPC;
        args.minlen = args.maxlen = args.mp->m_ialloc_blks;
        /*
@@ -700,7 +701,7 @@ xfs_ialloc_next_rec(
                error = xfs_inobt_get_rec(cur, rec, &i);
                if (error)
                        return error;
-               XFS_WANT_CORRUPTED_RETURN(i == 1);
+               XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, i == 1);
        }
 
        return 0;
@@ -724,7 +725,7 @@ xfs_ialloc_get_rec(
                error = xfs_inobt_get_rec(cur, rec, &i);
                if (error)
                        return error;
-               XFS_WANT_CORRUPTED_RETURN(i == 1);
+               XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, i == 1);
        }
 
        return 0;
@@ -783,12 +784,12 @@ xfs_dialloc_ag_inobt(
                error = xfs_inobt_lookup(cur, pagino, XFS_LOOKUP_LE, &i);
                if (error)
                        goto error0;
-               XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+               XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
 
                error = xfs_inobt_get_rec(cur, &rec, &j);
                if (error)
                        goto error0;
-               XFS_WANT_CORRUPTED_GOTO(j == 1, error0);
+               XFS_WANT_CORRUPTED_GOTO(mp, j == 1, error0);
 
                if (rec.ir_freecount > 0) {
                        /*
@@ -944,19 +945,19 @@ newino:
        error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &i);
        if (error)
                goto error0;
-       XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+       XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
 
        for (;;) {
                error = xfs_inobt_get_rec(cur, &rec, &i);
                if (error)
                        goto error0;
-               XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+               XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
                if (rec.ir_freecount > 0)
                        break;
                error = xfs_btree_increment(cur, 0, &i);
                if (error)
                        goto error0;
-               XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+               XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
        }
 
 alloc_inode:
@@ -1016,7 +1017,7 @@ xfs_dialloc_ag_finobt_near(
                error = xfs_inobt_get_rec(lcur, rec, &i);
                if (error)
                        return error;
-               XFS_WANT_CORRUPTED_RETURN(i == 1);
+               XFS_WANT_CORRUPTED_RETURN(lcur->bc_mp, i == 1);
 
                /*
                 * See if we've landed in the parent inode record. The finobt
@@ -1039,10 +1040,10 @@ xfs_dialloc_ag_finobt_near(
                error = xfs_inobt_get_rec(rcur, &rrec, &j);
                if (error)
                        goto error_rcur;
-               XFS_WANT_CORRUPTED_GOTO(j == 1, error_rcur);
+               XFS_WANT_CORRUPTED_GOTO(lcur->bc_mp, j == 1, error_rcur);
        }
 
-       XFS_WANT_CORRUPTED_GOTO(i == 1 || j == 1, error_rcur);
+       XFS_WANT_CORRUPTED_GOTO(lcur->bc_mp, i == 1 || j == 1, error_rcur);
        if (i == 1 && j == 1) {
                /*
                 * Both the left and right records are valid. Choose the closer
@@ -1095,7 +1096,7 @@ xfs_dialloc_ag_finobt_newino(
                        error = xfs_inobt_get_rec(cur, rec, &i);
                        if (error)
                                return error;
-                       XFS_WANT_CORRUPTED_RETURN(i == 1);
+                       XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, i == 1);
                        return 0;
                }
        }
@@ -1106,12 +1107,12 @@ xfs_dialloc_ag_finobt_newino(
        error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &i);
        if (error)
                return error;
-       XFS_WANT_CORRUPTED_RETURN(i == 1);
+       XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, i == 1);
 
        error = xfs_inobt_get_rec(cur, rec, &i);
        if (error)
                return error;
-       XFS_WANT_CORRUPTED_RETURN(i == 1);
+       XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, i == 1);
 
        return 0;
 }
@@ -1133,19 +1134,19 @@ xfs_dialloc_ag_update_inobt(
        error = xfs_inobt_lookup(cur, frec->ir_startino, XFS_LOOKUP_EQ, &i);
        if (error)
                return error;
-       XFS_WANT_CORRUPTED_RETURN(i == 1);
+       XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, i == 1);
 
        error = xfs_inobt_get_rec(cur, &rec, &i);
        if (error)
                return error;
-       XFS_WANT_CORRUPTED_RETURN(i == 1);
+       XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, i == 1);
        ASSERT((XFS_AGINO_TO_OFFSET(cur->bc_mp, rec.ir_startino) %
                                   XFS_INODES_PER_CHUNK) == 0);
 
        rec.ir_free &= ~XFS_INOBT_MASK(offset);
        rec.ir_freecount--;
 
-       XFS_WANT_CORRUPTED_RETURN((rec.ir_free == frec->ir_free) &&
+       XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, (rec.ir_free == frec->ir_free) &&
                                  (rec.ir_freecount == frec->ir_freecount));
 
        return xfs_inobt_update(cur, &rec);
@@ -1340,7 +1341,8 @@ xfs_dialloc(
         * inode.
         */
        if (mp->m_maxicount &&
-           mp->m_sb.sb_icount + mp->m_ialloc_inos > mp->m_maxicount) {
+           percpu_counter_read(&mp->m_icount) + mp->m_ialloc_inos >
+                                                       mp->m_maxicount) {
                noroom = 1;
                okalloc = 0;
        }
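Both m_maxicount checks in this file now use percpu_counter_read(), which returns the central count without folding in every CPU's unflushed local batch, so it can lag the true value slightly. That is acceptable here: m_maxicount is a soft cap used as an allocation heuristic, not an exact accounting invariant, so a marginally stale read only shifts the point at which inode chunk allocation is refused.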
@@ -1475,14 +1477,14 @@ xfs_difree_inobt(
                        __func__, error);
                goto error0;
        }
-       XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+       XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
        error = xfs_inobt_get_rec(cur, &rec, &i);
        if (error) {
                xfs_warn(mp, "%s: xfs_inobt_get_rec() returned error %d.",
                        __func__, error);
                goto error0;
        }
-       XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+       XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
        /*
         * Get the offset in the inode chunk.
         */
@@ -1592,7 +1594,7 @@ xfs_difree_finobt(
                 * freed an inode in a previously fully allocated chunk. If not,
                 * something is out of sync.
                 */
-               XFS_WANT_CORRUPTED_GOTO(ibtrec->ir_freecount == 1, error);
+               XFS_WANT_CORRUPTED_GOTO(mp, ibtrec->ir_freecount == 1, error);
 
                error = xfs_inobt_insert_rec(cur, ibtrec->ir_freecount,
                                             ibtrec->ir_free, &i);
@@ -1613,12 +1615,12 @@ xfs_difree_finobt(
        error = xfs_inobt_get_rec(cur, &rec, &i);
        if (error)
                goto error;
-       XFS_WANT_CORRUPTED_GOTO(i == 1, error);
+       XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error);
 
        rec.ir_free |= XFS_INOBT_MASK(offset);
        rec.ir_freecount++;
 
-       XFS_WANT_CORRUPTED_GOTO((rec.ir_free == ibtrec->ir_free) &&
+       XFS_WANT_CORRUPTED_GOTO(mp, (rec.ir_free == ibtrec->ir_free) &&
                                (rec.ir_freecount == ibtrec->ir_freecount),
                                error);
 
index b0a5fe9..dc4bfc5 100644 (file)
@@ -111,14 +111,6 @@ xfs_mount_validate_sb(
        bool            check_inprogress,
        bool            check_version)
 {
-
-       /*
-        * If the log device and data device have the
-        * same device number, the log is internal.
-        * Consequently, the sb_logstart should be non-zero.  If
-        * we have a zero sb_logstart in this case, we may be trying to mount
-        * a volume filesystem in a non-volume manner.
-        */
        if (sbp->sb_magicnum != XFS_SB_MAGIC) {
                xfs_warn(mp, "bad magic number");
                return -EWRONGFS;
@@ -743,17 +735,15 @@ xfs_initialize_perag_data(
                btree += pag->pagf_btreeblks;
                xfs_perag_put(pag);
        }
-       /*
-        * Overwrite incore superblock counters with just-read data
-        */
+
+       /* Overwrite incore superblock counters with just-read data */
        spin_lock(&mp->m_sb_lock);
        sbp->sb_ifree = ifree;
        sbp->sb_icount = ialloc;
        sbp->sb_fdblocks = bfree + bfreelst + btree;
        spin_unlock(&mp->m_sb_lock);
 
-       /* Fixup the per-cpu counters as well. */
-       xfs_icsb_reinit_counters(mp);
+       xfs_reinit_percpu_counters(mp);
 
        return 0;
 }
@@ -771,6 +761,10 @@ xfs_log_sb(
        struct xfs_mount        *mp = tp->t_mountp;
        struct xfs_buf          *bp = xfs_trans_getsb(tp, mp, 0);
 
+       mp->m_sb.sb_icount = percpu_counter_sum(&mp->m_icount);
+       mp->m_sb.sb_ifree = percpu_counter_sum(&mp->m_ifree);
+       mp->m_sb.sb_fdblocks = percpu_counter_sum(&mp->m_fdblocks);
+
        xfs_sb_to_disk(XFS_BUF_TO_SBP(bp), &mp->m_sb);
        xfs_trans_buf_set_type(tp, bp, XFS_BLFT_SB_BUF);
        xfs_trans_log_buf(tp, bp, 0, sizeof(struct xfs_dsb));
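
The two counter hunks above show both halves of the new scheme: xfs_dialloc() uses percpu_counter_read() for a cheap, possibly stale admission check, while xfs_log_sb() uses percpu_counter_sum() to fold every per-cpu delta into an exact value before it goes to disk. Below is a minimal userspace model of those semantics; it is an illustrative sketch, not the kernel's percpu_counter implementation, and NR_CPUS/BATCH are made-up values.

    /* model: reads are cheap but may be stale by up to NR_CPUS * BATCH */
    #include <stdio.h>

    #define NR_CPUS 4
    #define BATCH   32

    struct pcp_counter {
            long long count;          /* global value, updated in batches */
            long      pcpu[NR_CPUS];  /* per-cpu deltas not yet folded in */
    };

    static void pcp_add(struct pcp_counter *c, int cpu, long amount)
    {
            c->pcpu[cpu] += amount;
            if (c->pcpu[cpu] >= BATCH || c->pcpu[cpu] <= -BATCH) {
                    c->count += c->pcpu[cpu];  /* fold the batch into the global count */
                    c->pcpu[cpu] = 0;
            }
    }

    /* fast, possibly stale: the percpu_counter_read() side */
    static long long pcp_read(struct pcp_counter *c)
    {
            return c->count;
    }

    /* exact but touches every cpu: the percpu_counter_sum() side */
    static long long pcp_sum(struct pcp_counter *c)
    {
            long long sum = c->count;

            for (int cpu = 0; cpu < NR_CPUS; cpu++)
                    sum += c->pcpu[cpu];
            return sum;
    }

    int main(void)
    {
            struct pcp_counter icount = { 0 };

            for (int i = 0; i < 10; i++)
                    pcp_add(&icount, i % NR_CPUS, 1);

            /* prints read=0 sum=10: reads lag until a batch is folded in */
            printf("read=%lld sum=%lld\n", pcp_read(&icount), pcp_sum(&icount));
            return 0;
    }

This is why the m_maxicount check above tolerates percpu_counter_read(): it is a heuristic that is revalidated during the actual allocation, whereas the superblock fields must be exact at logging time.
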
index 1d8eef9..a56960d 100644 (file)
@@ -1232,6 +1232,117 @@ xfs_vm_releasepage(
        return try_to_free_buffers(page);
 }
 
+/*
+ * When we map a DIO buffer, we may need to attach an ioend that describes the
+ * type of write IO we are doing. This passes to the completion function the
+ * operations it needs to perform. If the mapping is for an overwrite wholly
+ * within the EOF then we don't need an ioend and so we don't allocate one.
+ * This avoids the unnecessary overhead of allocating and freeing ioends for
+ * workloads that don't require transactions on IO completion.
+ *
+ * If we get multiple mappings in a single IO, we might be mapping different
+ * types. But because the direct IO can only have a single private pointer, we
+ * need to ensure that:
+ *
+ * a) i) the ioend spans the entire region of unwritten mappings; or
+ *    ii) the ioend spans all the mappings that cross or are beyond EOF; and
+ * b) if it contains unwritten extents, it is *permanently* marked as such
+ *
+ * We could do this by chaining ioends like buffered IO does, but we only
+ * actually get one IO completion callback from the direct IO, and that spans
+ * the entire IO regardless of how many mappings and IOs are needed to complete
+ * the DIO. There is only going to be one reference to the ioend and its life
+ * cycle is constrained by the DIO completion code. Hence we don't need
+ * reference counting here.
+ */
+static void
+xfs_map_direct(
+       struct inode            *inode,
+       struct buffer_head      *bh_result,
+       struct xfs_bmbt_irec    *imap,
+       xfs_off_t               offset)
+{
+       struct xfs_ioend        *ioend;
+       xfs_off_t               size = bh_result->b_size;
+       int                     type;
+
+       if (ISUNWRITTEN(imap))
+               type = XFS_IO_UNWRITTEN;
+       else
+               type = XFS_IO_OVERWRITE;
+
+       trace_xfs_gbmap_direct(XFS_I(inode), offset, size, type, imap);
+
+       if (bh_result->b_private) {
+               ioend = bh_result->b_private;
+               ASSERT(ioend->io_size > 0);
+               ASSERT(offset >= ioend->io_offset);
+               if (offset + size > ioend->io_offset + ioend->io_size)
+                       ioend->io_size = offset - ioend->io_offset + size;
+
+               if (type == XFS_IO_UNWRITTEN && type != ioend->io_type)
+                       ioend->io_type = XFS_IO_UNWRITTEN;
+
+               trace_xfs_gbmap_direct_update(XFS_I(inode), ioend->io_offset,
+                                             ioend->io_size, ioend->io_type,
+                                             imap);
+       } else if (type == XFS_IO_UNWRITTEN ||
+                  offset + size > i_size_read(inode)) {
+               ioend = xfs_alloc_ioend(inode, type);
+               ioend->io_offset = offset;
+               ioend->io_size = size;
+
+               bh_result->b_private = ioend;
+               set_buffer_defer_completion(bh_result);
+
+               trace_xfs_gbmap_direct_new(XFS_I(inode), offset, size, type,
+                                          imap);
+       } else {
+               trace_xfs_gbmap_direct_none(XFS_I(inode), offset, size, type,
+                                           imap);
+       }
+}
+
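As a worked instance of the above (illustrative numbers, assuming 4k blocks and i_size = 8k): a 12k DIO write starting at offset 0 might first map [0, 8k), an overwrite wholly within EOF, so no ioend is attached and the _none trace fires. The second mapping for [8k, 12k) has offset + size > i_size, so an ioend covering it is allocated; with written blocks its type is XFS_IO_OVERWRITE, and completion then runs the size-update transaction from task context.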
+/*
+ * If this is O_DIRECT or the mpage code calling, tell them how large the
+ * mapping is so that we can avoid repeated get_blocks calls.
+ *
+ * If the mapping spans EOF, then we have to break the mapping up as the mapping
+ * for blocks beyond EOF must be marked new so that sub block regions can be
+ * correctly zeroed. We can't do this for mappings within EOF unless the mapping
+ * was just allocated or is unwritten, otherwise the callers would overwrite
+ * existing data with zeros. Hence we have to split the mapping into a range up
+ * to and including EOF, and a second mapping for beyond EOF.
+ */
+static void
+xfs_map_trim_size(
+       struct inode            *inode,
+       sector_t                iblock,
+       struct buffer_head      *bh_result,
+       struct xfs_bmbt_irec    *imap,
+       xfs_off_t               offset,
+       ssize_t                 size)
+{
+       xfs_off_t               mapping_size;
+
+       mapping_size = imap->br_startoff + imap->br_blockcount - iblock;
+       mapping_size <<= inode->i_blkbits;
+
+       ASSERT(mapping_size > 0);
+       if (mapping_size > size)
+               mapping_size = size;
+       if (offset < i_size_read(inode) &&
+           offset + mapping_size >= i_size_read(inode)) {
+               /* limit mapping to block that spans EOF */
+               mapping_size = roundup_64(i_size_read(inode) - offset,
+                                         1 << inode->i_blkbits);
+       }
+       if (mapping_size > LONG_MAX)
+               mapping_size = LONG_MAX;
+
+       bh_result->b_size = mapping_size;
+}
+
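A worked instance of the trim above (hypothetical numbers): with 4096-byte blocks, i_size = 10000 and a request at offset = 8192 for size = 16384, the mapping spans EOF, so it is trimmed to roundup_64(10000 - 8192, 4096) = 4096 bytes. The caller gets only the block containing EOF; the region beyond must come back from a later get_blocks call so it can be marked new and sub-block zeroed correctly.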
 STATIC int
 __xfs_get_blocks(
        struct inode            *inode,
@@ -1320,31 +1431,37 @@ __xfs_get_blocks(
 
                        xfs_iunlock(ip, lockmode);
                }
-
-               trace_xfs_get_blocks_alloc(ip, offset, size, 0, &imap);
+               trace_xfs_get_blocks_alloc(ip, offset, size,
+                               ISUNWRITTEN(&imap) ? XFS_IO_UNWRITTEN
+                                                  : XFS_IO_DELALLOC, &imap);
        } else if (nimaps) {
-               trace_xfs_get_blocks_found(ip, offset, size, 0, &imap);
+               trace_xfs_get_blocks_found(ip, offset, size,
+                               ISUNWRITTEN(&imap) ? XFS_IO_UNWRITTEN
+                                                  : XFS_IO_OVERWRITE, &imap);
                xfs_iunlock(ip, lockmode);
        } else {
                trace_xfs_get_blocks_notfound(ip, offset, size);
                goto out_unlock;
        }
 
+       /* trim mapping down to size requested */
+       if (direct || size > (1 << inode->i_blkbits))
+               xfs_map_trim_size(inode, iblock, bh_result,
+                                 &imap, offset, size);
+
+       /*
+        * For unwritten extents do not report a disk address in the buffered
+        * read case (treat as if we're reading into a hole).
+        */
        if (imap.br_startblock != HOLESTARTBLOCK &&
-           imap.br_startblock != DELAYSTARTBLOCK) {
-               /*
-                * For unwritten extents do not report a disk address on
-                * the read case (treat as if we're reading into a hole).
-                */
-               if (create || !ISUNWRITTEN(&imap))
-                       xfs_map_buffer(inode, bh_result, &imap, offset);
-               if (create && ISUNWRITTEN(&imap)) {
-                       if (direct) {
-                               bh_result->b_private = inode;
-                               set_buffer_defer_completion(bh_result);
-                       }
+           imap.br_startblock != DELAYSTARTBLOCK &&
+           (create || !ISUNWRITTEN(&imap))) {
+               xfs_map_buffer(inode, bh_result, &imap, offset);
+               if (ISUNWRITTEN(&imap))
                        set_buffer_unwritten(bh_result);
-               }
+               /* direct IO needs special help */
+               if (create && direct)
+                       xfs_map_direct(inode, bh_result, &imap, offset);
        }
 
        /*
@@ -1377,39 +1494,6 @@ __xfs_get_blocks(
                }
        }
 
-       /*
-        * If this is O_DIRECT or the mpage code calling tell them how large
-        * the mapping is, so that we can avoid repeated get_blocks calls.
-        *
-        * If the mapping spans EOF, then we have to break the mapping up as the
-        * mapping for blocks beyond EOF must be marked new so that sub block
-        * regions can be correctly zeroed. We can't do this for mappings within
-        * EOF unless the mapping was just allocated or is unwritten, otherwise
-        * the callers would overwrite existing data with zeros. Hence we have
-        * to split the mapping into a range up to and including EOF, and a
-        * second mapping for beyond EOF.
-        */
-       if (direct || size > (1 << inode->i_blkbits)) {
-               xfs_off_t               mapping_size;
-
-               mapping_size = imap.br_startoff + imap.br_blockcount - iblock;
-               mapping_size <<= inode->i_blkbits;
-
-               ASSERT(mapping_size > 0);
-               if (mapping_size > size)
-                       mapping_size = size;
-               if (offset < i_size_read(inode) &&
-                   offset + mapping_size >= i_size_read(inode)) {
-                       /* limit mapping to block that spans EOF */
-                       mapping_size = roundup_64(i_size_read(inode) - offset,
-                                                 1 << inode->i_blkbits);
-               }
-               if (mapping_size > LONG_MAX)
-                       mapping_size = LONG_MAX;
-
-               bh_result->b_size = mapping_size;
-       }
-
        return 0;
 
 out_unlock:
@@ -1440,9 +1524,11 @@ xfs_get_blocks_direct(
 /*
  * Complete a direct I/O write request.
  *
- * If the private argument is non-NULL __xfs_get_blocks signals us that we
- * need to issue a transaction to convert the range from unwritten to written
- * extents.
+ * The ioend structure is passed from __xfs_get_blocks() to tell us what to do.
+ * If no ioend exists (i.e. @private == NULL) then the write IO is an overwrite
+ * wholly within the EOF and so there is nothing for us to do. Note that in this
+ * case the completion can be called in interrupt context, whereas if we have an
+ * ioend we will always be called in task context (i.e. from a workqueue).
  */
 STATIC void
 xfs_end_io_direct_write(
@@ -1454,43 +1540,71 @@ xfs_end_io_direct_write(
        struct inode            *inode = file_inode(iocb->ki_filp);
        struct xfs_inode        *ip = XFS_I(inode);
        struct xfs_mount        *mp = ip->i_mount;
+       struct xfs_ioend        *ioend = private;
 
-       if (XFS_FORCED_SHUTDOWN(mp))
+       trace_xfs_gbmap_direct_endio(ip, offset, size,
+                                    ioend ? ioend->io_type : 0, NULL);
+
+       if (!ioend) {
+               ASSERT(offset + size <= i_size_read(inode));
                return;
+       }
+
+       if (XFS_FORCED_SHUTDOWN(mp))
+               goto out_end_io;
 
        /*
-        * While the generic direct I/O code updates the inode size, it does
-        * so only after the end_io handler is called, which means our
-        * end_io handler thinks the on-disk size is outside the in-core
-        * size.  To prevent this just update it a little bit earlier here.
+        * dio completion end_io functions are only called on writes if more
+        * than 0 bytes were written.
         */
+       ASSERT(size > 0);
+
+       /*
+        * The ioend only maps whole blocks, while the IO may be sector aligned.
+        * Hence the ioend offset/size may not match the IO offset/size exactly.
+        * Because we don't map overwrites within EOF into the ioend, the offset
+        * may not match, but only if the endio spans EOF.  Either way, write
+        * the IO offset and size into the ioend so that completion processing
+        * does the right thing.
+        */
+       ASSERT(offset + size <= ioend->io_offset + ioend->io_size);
+       ioend->io_size = size;
+       ioend->io_offset = offset;
+
+       /*
+        * The ioend tells us whether we are doing unwritten extent conversion
+        * or an append transaction that updates the on-disk file size. These
+        * cases are the only cases where we should *potentially* be needing
+        * to update the VFS inode size.
+        *
+        * We need to update the in-core inode size here so that we don't end up
+        * with the on-disk inode size being outside the in-core inode size. We
+        * have no other method of updating EOF for AIO, so always do it here
+        * if necessary.
+        *
+        * We need to lock the test/set EOF update as we can be racing with
+        * other IO completions here to update the EOF. Failing to serialise
+        * here can result in EOF moving backwards and Bad Things Happen when
+        * that occurs.
+        */
+       spin_lock(&ip->i_flags_lock);
        if (offset + size > i_size_read(inode))
                i_size_write(inode, offset + size);
+       spin_unlock(&ip->i_flags_lock);
 
        /*
-        * For direct I/O we do not know if we need to allocate blocks or not,
-        * so we can't preallocate an append transaction, as that results in
-        * nested reservations and log space deadlocks. Hence allocate the
-        * transaction here. While this is sub-optimal and can block IO
-        * completion for some time, we're stuck with doing it this way until
-        * we can pass the ioend to the direct IO allocation callbacks and
-        * avoid nesting that way.
+        * If we are doing an append IO that needs to update the EOF on disk,
+        * do the transaction reserve now so we can use common end io
+        * processing. Stashing the error (if there is one) in the ioend lets
+        * the ioend processing pass the error on where possible, as we can't
+        * return it from here.
         */
-       if (private && size > 0) {
-               xfs_iomap_write_unwritten(ip, offset, size);
-       } else if (offset + size > ip->i_d.di_size) {
-               struct xfs_trans        *tp;
-               int                     error;
-
-               tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS);
-               error = xfs_trans_reserve(tp, &M_RES(mp)->tr_fsyncts, 0, 0);
-               if (error) {
-                       xfs_trans_cancel(tp, 0);
-                       return;
-               }
+       if (ioend->io_type == XFS_IO_OVERWRITE)
+               ioend->io_error = xfs_setfilesize_trans_alloc(ioend);
 
-               xfs_setfilesize(ip, tp, offset, size);
-       }
+out_end_io:
+       xfs_end_io(&ioend->io_work);
+       return;
 }
 
 STATIC ssize_t
index 83af4c1..f9c1c64 100644 (file)
@@ -132,9 +132,10 @@ xfs_attr3_leaf_inactive(
        int                     size;
        int                     tmp;
        int                     i;
+       struct xfs_mount        *mp = bp->b_target->bt_mount;
 
        leaf = bp->b_addr;
-       xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf);
+       xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &ichdr, leaf);
 
        /*
         * Count the number of "remote" value extents.
index a43d370..65fb37a 100644 (file)
@@ -225,6 +225,7 @@ xfs_attr_node_list(xfs_attr_list_context_t *context)
        int error, i;
        struct xfs_buf *bp;
        struct xfs_inode        *dp = context->dp;
+       struct xfs_mount        *mp = dp->i_mount;
 
        trace_xfs_attr_node_list(context);
 
@@ -256,7 +257,8 @@ xfs_attr_node_list(xfs_attr_list_context_t *context)
                        case XFS_ATTR_LEAF_MAGIC:
                        case XFS_ATTR3_LEAF_MAGIC:
                                leaf = bp->b_addr;
-                               xfs_attr3_leaf_hdr_from_disk(&leafhdr, leaf);
+                               xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo,
+                                                            &leafhdr, leaf);
                                entries = xfs_attr3_leaf_entryp(leaf);
                                if (cursor->hashval > be32_to_cpu(
                                                entries[leafhdr.count - 1].hashval)) {
@@ -340,7 +342,7 @@ xfs_attr_node_list(xfs_attr_list_context_t *context)
                        xfs_trans_brelse(NULL, bp);
                        return error;
                }
-               xfs_attr3_leaf_hdr_from_disk(&leafhdr, leaf);
+               xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &leafhdr, leaf);
                if (context->seen_enough || leafhdr.forw == 0)
                        break;
                cursor->blkno = leafhdr.forw;
@@ -368,11 +370,12 @@ xfs_attr3_leaf_list_int(
        struct xfs_attr_leaf_entry      *entry;
        int                             retval;
        int                             i;
+       struct xfs_mount                *mp = context->dp->i_mount;
 
        trace_xfs_attr_list_leaf(context);
 
        leaf = bp->b_addr;
-       xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf);
+       xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &ichdr, leaf);
        entries = xfs_attr3_leaf_entryp(leaf);
 
        cursor = context->cursor;
index 22a5dcb..a52bbd3 100644 (file)
@@ -1376,22 +1376,19 @@ out:
 }
 
 /*
- * xfs_collapse_file_space()
- *     This routine frees disk space and shift extent for the given file.
- *     The first thing we do is to free data blocks in the specified range
- *     by calling xfs_free_file_space(). It would also sync dirty data
- *     and invalidate page cache over the region on which collapse range
- *     is working. And Shift extent records to the left to cover a hole.
- * RETURNS:
- *     0 on success
- *     errno on error
- *
+ * @next_fsb will keep track of the extent currently undergoing shift.
+ * @stop_fsb will keep track of the extent at which we have to stop.
+ * If we are shifting left, we start with the block at (offset + len) and
+ * shift each extent up to the last extent.
+ * If we are shifting right, we start with the last extent inside the file
+ * and continue until we reach the block corresponding to offset.
  */
-int
-xfs_collapse_file_space(
-       struct xfs_inode        *ip,
-       xfs_off_t               offset,
-       xfs_off_t               len)
+static int
+xfs_shift_file_space(
+       struct xfs_inode        *ip,
+       xfs_off_t               offset,
+       xfs_off_t               len,
+       enum shift_direction    direction)
 {
        int                     done = 0;
        struct xfs_mount        *mp = ip->i_mount;
@@ -1400,21 +1397,26 @@ xfs_collapse_file_space(
        struct xfs_bmap_free    free_list;
        xfs_fsblock_t           first_block;
        int                     committed;
-       xfs_fileoff_t           start_fsb;
+       xfs_fileoff_t           stop_fsb;
        xfs_fileoff_t           next_fsb;
        xfs_fileoff_t           shift_fsb;
 
-       ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
+       ASSERT(direction == SHIFT_LEFT || direction == SHIFT_RIGHT);
 
-       trace_xfs_collapse_file_space(ip);
+       if (direction == SHIFT_LEFT) {
+               next_fsb = XFS_B_TO_FSB(mp, offset + len);
+               stop_fsb = XFS_B_TO_FSB(mp, VFS_I(ip)->i_size);
+       } else {
+               /*
+                * If we are shifting right, delegate initializing next_fsb to
+                * xfs_bmap_shift_extents, which runs with the ilock held.
+                */
+               next_fsb = NULLFSBLOCK;
+               stop_fsb = XFS_B_TO_FSB(mp, offset);
+       }
 
-       next_fsb = XFS_B_TO_FSB(mp, offset + len);
        shift_fsb = XFS_B_TO_FSB(mp, len);
 
-       error = xfs_free_file_space(ip, offset, len);
-       if (error)
-               return error;
-
        /*
         * Trim eofblocks to avoid shifting uninitialized post-eof preallocation
         * into the accessible region of the file.
@@ -1427,20 +1429,28 @@ xfs_collapse_file_space(
 
        /*
         * Writeback and invalidate cache for the remainder of the file as we're
-        * about to shift down every extent from the collapse range to EOF. The
-        * free of the collapse range above might have already done some of
-        * this, but we shouldn't rely on it to do anything outside of the range
-        * that was freed.
+        * about to shift every extent from offset to EOF.
         */
        error = filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
-                                            offset + len, -1);
+                                            offset, -1);
        if (error)
                return error;
        error = invalidate_inode_pages2_range(VFS_I(ip)->i_mapping,
-                                       (offset + len) >> PAGE_CACHE_SHIFT, -1);
+                                       offset >> PAGE_CACHE_SHIFT, -1);
        if (error)
                return error;
 
+       /*
+        * The extent shifting code works on extent granularity. So, if
+        * stop_fsb is not the starting block of an extent, we need to split
+        * the extent at stop_fsb.
+        */
+       if (direction == SHIFT_RIGHT) {
+               error = xfs_bmap_split_extent(ip, stop_fsb);
+               if (error)
+                       return error;
+       }
+
        while (!error && !done) {
                tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
                /*
@@ -1464,7 +1474,7 @@ xfs_collapse_file_space(
                if (error)
                        goto out;
 
-               xfs_trans_ijoin(tp, ip, 0);
+               xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
 
                xfs_bmap_init(&free_list, &first_block);
 
@@ -1472,10 +1482,9 @@ xfs_collapse_file_space(
                 * We are using the write transaction in which max 2 bmbt
                 * updates are allowed
                 */
-               start_fsb = next_fsb;
-               error = xfs_bmap_shift_extents(tp, ip, start_fsb, shift_fsb,
-                               &done, &next_fsb, &first_block, &free_list,
-                               XFS_BMAP_MAX_SHIFT_EXTENTS);
+               error = xfs_bmap_shift_extents(tp, ip, &next_fsb, shift_fsb,
+                               &done, stop_fsb, &first_block, &free_list,
+                               direction, XFS_BMAP_MAX_SHIFT_EXTENTS);
                if (error)
                        goto out;
 
@@ -1484,18 +1493,70 @@ xfs_collapse_file_space(
                        goto out;
 
                error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
-               xfs_iunlock(ip, XFS_ILOCK_EXCL);
        }
 
        return error;
 
 out:
        xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
-       xfs_iunlock(ip, XFS_ILOCK_EXCL);
        return error;
 }
 
 /*
+ * xfs_collapse_file_space()
+ *     This routine frees disk space and shifts extents for the given file.
+ *     The first thing we do is free the data blocks in the specified range
+ *     by calling xfs_free_file_space(). That also syncs dirty data and
+ *     invalidates the page cache over the region on which the collapse is
+ *     working. Extent records are then shifted left to cover the hole.
+ * RETURNS:
+ *     0 on success
+ *     errno on error
+ *
+ */
+int
+xfs_collapse_file_space(
+       struct xfs_inode        *ip,
+       xfs_off_t               offset,
+       xfs_off_t               len)
+{
+       int error;
+
+       ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
+       trace_xfs_collapse_file_space(ip);
+
+       error = xfs_free_file_space(ip, offset, len);
+       if (error)
+               return error;
+
+       return xfs_shift_file_space(ip, offset, len, SHIFT_LEFT);
+}
+
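For example (illustrative numbers, assuming 4k blocks): collapsing [64k, 128k) of a 256k file first punches out that 64k with xfs_free_file_space(), then xfs_shift_file_space(..., SHIFT_LEFT) starts at the block for offset + len (128k) and moves every extent from there through the last extent 16 blocks (64k) left; after the caller trims i_size, the file is 192k long with no hole.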
+/*
+ * xfs_insert_file_space()
+ *     This routine creates hole space by shifting extents for the given file.
+ *     The first thing we do is sync dirty data and invalidate the page cache
+ *     over the region on which the insert range is working. We then split
+ *     the extent at the given offset into two by calling
+ *     xfs_bmap_split_extent, and shift all extent records lying between
+ *     [offset, last allocated extent] to the right to make room for the hole.
+ * RETURNS:
+ *     0 on success
+ *     errno on error
+ */
+int
+xfs_insert_file_space(
+       struct xfs_inode        *ip,
+       loff_t                  offset,
+       loff_t                  len)
+{
+       ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
+       trace_xfs_insert_file_space(ip);
+
+       return xfs_shift_file_space(ip, offset, len, SHIFT_RIGHT);
+}
+
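Under the same illustrative layout, inserting 64k at offset 128k of a 256k file works the other way: the fallocate caller first grows i_size to 320k, the extent straddling offset is split at 128k, and every extent from the last one back to 128k is shifted 16 blocks (64k) right, leaving a 64k hole at [128k, 192k).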
+/*
  * We need to check that the format of the data fork in the temporary inode is
  * valid for the target inode before doing the swap. This is not a problem with
  * attr1 because of the fixed fork offset, but attr2 has a dynamically sized
@@ -1599,13 +1660,6 @@ xfs_swap_extent_flush(
        /* Verify O_DIRECT for ftmp */
        if (VFS_I(ip)->i_mapping->nrpages)
                return -EINVAL;
-
-       /*
-        * Don't try to swap extents on mmap()d files because we can't lock
-        * out races against page faults safely.
-        */
-       if (mapping_mapped(VFS_I(ip)->i_mapping))
-               return -EBUSY;
        return 0;
 }
 
@@ -1633,13 +1687,14 @@ xfs_swap_extents(
        }
 
        /*
-        * Lock up the inodes against other IO and truncate to begin with.
-        * Then we can ensure the inodes are flushed and have no page cache
-        * safely. Once we have done this we can take the ilocks and do the rest
-        * of the checks.
+        * Lock the inodes against other IO, page faults and truncate to
+        * begin with.  Then we can ensure the inodes are flushed and have no
+        * page cache safely. Once we have done this we can take the ilocks and
+        * do the rest of the checks.
         */
-       lock_flags = XFS_IOLOCK_EXCL;
+       lock_flags = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
        xfs_lock_two_inodes(ip, tip, XFS_IOLOCK_EXCL);
+       xfs_lock_two_inodes(ip, tip, XFS_MMAPLOCK_EXCL);
 
        /* Verify that both files have the same format */
        if ((ip->i_d.di_mode & S_IFMT) != (tip->i_d.di_mode & S_IFMT)) {
@@ -1666,8 +1721,16 @@ xfs_swap_extents(
                xfs_trans_cancel(tp, 0);
                goto out_unlock;
        }
+
+       /*
+        * Lock and join the inodes to the transaction so that transaction commit
+        * or cancel will unlock the inodes from this point onwards.
+        */
        xfs_lock_two_inodes(ip, tip, XFS_ILOCK_EXCL);
        lock_flags |= XFS_ILOCK_EXCL;
+       xfs_trans_ijoin(tp, ip, lock_flags);
+       xfs_trans_ijoin(tp, tip, lock_flags);
 
        /* Verify all data are being swapped */
        if (sxp->sx_offset != 0 ||
@@ -1720,9 +1783,6 @@ xfs_swap_extents(
                        goto out_trans_cancel;
        }
 
-       xfs_trans_ijoin(tp, ip, lock_flags);
-       xfs_trans_ijoin(tp, tip, lock_flags);
-
        /*
         * Before we've swapped the forks, lets set the owners of the forks
         * appropriately. We have to do this as we are demand paging the btree
@@ -1856,5 +1916,5 @@ out_unlock:
 
 out_trans_cancel:
        xfs_trans_cancel(tp, 0);
-       goto out_unlock;
+       goto out;
 }
index 736429a..af97d9a 100644 (file)
@@ -63,6 +63,8 @@ int   xfs_zero_file_space(struct xfs_inode *ip, xfs_off_t offset,
                            xfs_off_t len);
 int    xfs_collapse_file_space(struct xfs_inode *, xfs_off_t offset,
                                xfs_off_t len);
+int    xfs_insert_file_space(struct xfs_inode *, xfs_off_t offset,
+                               xfs_off_t len);
 
 /* EOF block manipulation functions */
 bool   xfs_can_free_eofblocks(struct xfs_inode *ip, bool force);
index 507d96a..092d652 100644 (file)
@@ -537,9 +537,9 @@ xfs_buf_item_push(
 
        /* has a previous flush failed due to IO errors? */
        if ((bp->b_flags & XBF_WRITE_FAIL) &&
-           ___ratelimit(&xfs_buf_write_fail_rl_state, "XFS:")) {
+           ___ratelimit(&xfs_buf_write_fail_rl_state, "XFS: Failing async write")) {
                xfs_warn(bp->b_target->bt_mount,
-"Detected failing async write on buffer block 0x%llx. Retrying async write.",
+"Failing async write on buffer block 0x%llx. Retrying async write.",
                         (long long)bp->b_bn);
        }
 
index 799e5a2..e85a951 100644 (file)
@@ -84,7 +84,7 @@ xfs_trim_extents(
                error = xfs_alloc_get_rec(cur, &fbno, &flen, &i);
                if (error)
                        goto out_del_cursor;
-               XFS_WANT_CORRUPTED_GOTO(i == 1, out_del_cursor);
+               XFS_WANT_CORRUPTED_GOTO(mp, i == 1, out_del_cursor);
                ASSERT(flen <= be32_to_cpu(XFS_BUF_TO_AGF(agbp)->agf_longest));
 
                /*
index 3ee186a..338e50b 100644 (file)
@@ -131,7 +131,7 @@ xfs_error_report(
 {
        if (level <= xfs_error_level) {
                xfs_alert_tag(mp, XFS_PTAG_ERROR_REPORT,
-               "Internal error %s at line %d of file %s.  Caller %pF",
+               "Internal error %s at line %d of file %s.  Caller %pS",
                            tag, linenum, filename, ra);
 
                xfs_stack_trace();
index 279a76e..c0394ed 100644 (file)
@@ -40,25 +40,25 @@ extern void xfs_verifier_error(struct xfs_buf *bp);
 /*
  * Macros to set EFSCORRUPTED & return/branch.
  */
-#define        XFS_WANT_CORRUPTED_GOTO(x,l)    \
+#define        XFS_WANT_CORRUPTED_GOTO(mp, x, l)       \
        { \
                int fs_is_ok = (x); \
                ASSERT(fs_is_ok); \
                if (unlikely(!fs_is_ok)) { \
                        XFS_ERROR_REPORT("XFS_WANT_CORRUPTED_GOTO", \
-                                        XFS_ERRLEVEL_LOW, NULL); \
+                                        XFS_ERRLEVEL_LOW, mp); \
                        error = -EFSCORRUPTED; \
                        goto l; \
                } \
        }
 
-#define        XFS_WANT_CORRUPTED_RETURN(x)    \
+#define        XFS_WANT_CORRUPTED_RETURN(mp, x)        \
        { \
                int fs_is_ok = (x); \
                ASSERT(fs_is_ok); \
                if (unlikely(!fs_is_ok)) { \
                        XFS_ERROR_REPORT("XFS_WANT_CORRUPTED_RETURN", \
-                                        XFS_ERRLEVEL_LOW, NULL); \
+                                        XFS_ERRLEVEL_LOW, mp); \
                        return -EFSCORRUPTED; \
                } \
        }
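
A minimal usage sketch of the updated convention (example_check() is hypothetical): the GOTO variant expands to an assignment to a local 'error' plus a jump to the given label, so callers must declare both, while the RETURN variant returns -EFSCORRUPTED directly. In both cases the report now names the mount rather than passing NULL.

    /* hypothetical caller showing what the macros require in scope */
    static int
    example_check(struct xfs_mount *mp, int stat)
    {
            int error = 0;          /* assigned by the GOTO variant on failure */

            XFS_WANT_CORRUPTED_GOTO(mp, stat == 1, out);
            XFS_WANT_CORRUPTED_RETURN(mp, stat == 1);
            return 0;
    out:
            return error;
    }
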
index 1f12ad0..8121e75 100644 (file)
@@ -559,7 +559,7 @@ restart:
        if (error <= 0)
                return error;
 
-       error = xfs_break_layouts(inode, iolock);
+       error = xfs_break_layouts(inode, iolock, true);
        if (error)
                return error;
 
@@ -569,21 +569,42 @@ restart:
         * write.  If zeroing is needed and we are currently holding the
         * iolock shared, we need to update it to exclusive which implies
         * having to redo all checks before.
+        *
+        * We need to serialise against EOF updates that occur in IO
+        * completions here. We want to make sure that nobody is changing the
+        * size while we do this check until we have placed an IO barrier (i.e.
+        * hold the XFS_IOLOCK_EXCL) that prevents new IO from being dispatched.
+        * The spinlock effectively forms a memory barrier once we have the
+        * XFS_IOLOCK_EXCL so we are guaranteed to see the latest EOF value
+        * and hence be able to correctly determine if we need to run zeroing.
         */
+       spin_lock(&ip->i_flags_lock);
        if (iocb->ki_pos > i_size_read(inode)) {
                bool    zero = false;
 
+               spin_unlock(&ip->i_flags_lock);
                if (*iolock == XFS_IOLOCK_SHARED) {
                        xfs_rw_iunlock(ip, *iolock);
                        *iolock = XFS_IOLOCK_EXCL;
                        xfs_rw_ilock(ip, *iolock);
                        iov_iter_reexpand(from, count);
+
+                       /*
+                        * We now have an IO submission barrier in place, but
+                        * AIO can do EOF updates during IO completion and hence
+                        * we now need to wait for all of them to drain. Non-AIO
+                        * DIO will have drained before we are given the
+                        * XFS_IOLOCK_EXCL, and so for most cases this wait is a
+                        * no-op.
+                        */
+                       inode_dio_wait(inode);
                        goto restart;
                }
                error = xfs_zero_eof(ip, iocb->ki_pos, i_size_read(inode), &zero);
                if (error)
                        return error;
-       }
+       } else
+               spin_unlock(&ip->i_flags_lock);
 
        /*
         * Updating the timestamps will grab the ilock again from
@@ -645,6 +666,8 @@ xfs_file_dio_aio_write(
        int                     iolock;
        size_t                  count = iov_iter_count(from);
        loff_t                  pos = iocb->ki_pos;
+       loff_t                  end;
+       struct iov_iter         data;
        struct xfs_buftarg      *target = XFS_IS_REALTIME_INODE(ip) ?
                                        mp->m_rtdev_targp : mp->m_ddev_targp;
 
@@ -685,10 +708,11 @@ xfs_file_dio_aio_write(
                goto out;
        count = iov_iter_count(from);
        pos = iocb->ki_pos;
+       end = pos + count - 1;
 
        if (mapping->nrpages) {
                ret = filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
-                                                   pos, pos + count - 1);
+                                                  pos, end);
                if (ret)
                        goto out;
                /*
@@ -698,7 +722,7 @@ xfs_file_dio_aio_write(
                 */
                ret = invalidate_inode_pages2_range(VFS_I(ip)->i_mapping,
                                        pos >> PAGE_CACHE_SHIFT,
-                                       (pos + count - 1) >> PAGE_CACHE_SHIFT);
+                                       end >> PAGE_CACHE_SHIFT);
                WARN_ON_ONCE(ret);
                ret = 0;
        }
@@ -715,8 +739,22 @@ xfs_file_dio_aio_write(
        }
 
        trace_xfs_file_direct_write(ip, count, iocb->ki_pos, 0);
-       ret = generic_file_direct_write(iocb, from, pos);
 
+       data = *from;
+       ret = mapping->a_ops->direct_IO(iocb, &data, pos);
+
+       /* see generic_file_direct_write() for why this is necessary */
+       if (mapping->nrpages) {
+               invalidate_inode_pages2_range(mapping,
+                                             pos >> PAGE_CACHE_SHIFT,
+                                             end >> PAGE_CACHE_SHIFT);
+       }
+
+       if (ret > 0) {
+               pos += ret;
+               iov_iter_advance(from, ret);
+               iocb->ki_pos = pos;
+       }
 out:
        xfs_rw_iunlock(ip, iolock);
 
@@ -822,6 +860,11 @@ xfs_file_write_iter(
        return ret;
 }
 
+#define        XFS_FALLOC_FL_SUPPORTED                                         \
+               (FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |           \
+                FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE |      \
+                FALLOC_FL_INSERT_RANGE)
+
 STATIC long
 xfs_file_fallocate(
        struct file             *file,
@@ -835,18 +878,21 @@ xfs_file_fallocate(
        enum xfs_prealloc_flags flags = 0;
        uint                    iolock = XFS_IOLOCK_EXCL;
        loff_t                  new_size = 0;
+       bool                    do_file_insert = false;
 
        if (!S_ISREG(inode->i_mode))
                return -EINVAL;
-       if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
-                    FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE))
+       if (mode & ~XFS_FALLOC_FL_SUPPORTED)
                return -EOPNOTSUPP;
 
        xfs_ilock(ip, iolock);
-       error = xfs_break_layouts(inode, &iolock);
+       error = xfs_break_layouts(inode, &iolock, false);
        if (error)
                goto out_unlock;
 
+       xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
+       iolock |= XFS_MMAPLOCK_EXCL;
+
        if (mode & FALLOC_FL_PUNCH_HOLE) {
                error = xfs_free_file_space(ip, offset, len);
                if (error)
@@ -873,6 +919,27 @@ xfs_file_fallocate(
                error = xfs_collapse_file_space(ip, offset, len);
                if (error)
                        goto out_unlock;
+       } else if (mode & FALLOC_FL_INSERT_RANGE) {
+               unsigned blksize_mask = (1 << inode->i_blkbits) - 1;
+
+               new_size = i_size_read(inode) + len;
+               if (offset & blksize_mask || len & blksize_mask) {
+                       error = -EINVAL;
+                       goto out_unlock;
+               }
+
+               /* check the new inode size does not exceed the maximum file size */
+               if (new_size > inode->i_sb->s_maxbytes) {
+                       error = -EFBIG;
+                       goto out_unlock;
+               }
+
+               /* Offset should be less than i_size */
+               if (offset >= i_size_read(inode)) {
+                       error = -EINVAL;
+                       goto out_unlock;
+               }
+               do_file_insert = true;
        } else {
                flags |= XFS_PREALLOC_SET;
 
@@ -907,8 +974,19 @@ xfs_file_fallocate(
                iattr.ia_valid = ATTR_SIZE;
                iattr.ia_size = new_size;
                error = xfs_setattr_size(ip, &iattr);
+               if (error)
+                       goto out_unlock;
        }
 
+       /*
+        * Perform hole insertion now that the file size has been
+        * updated, so that if we crash during the operation we don't
+        * leave shifted extents past EOF and hence lose access to
+        * the data contained within them.
+        */
+       if (do_file_insert)
+               error = xfs_insert_file_space(ip, offset, len);
+
 out_unlock:
        xfs_iunlock(ip, iolock);
        return error;
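
With the new branch above in place, the mode is driven from userspace via fallocate(2). A short sketch follows; the path and sizes are illustrative, and offset and length must be multiples of the filesystem block size with offset below EOF, matching the -EINVAL/-EFBIG checks above.

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <linux/falloc.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
            int fd = open("/mnt/scratch/testfile", O_RDWR);

            if (fd < 0) {
                    perror("open");
                    return 1;
            }

            /* insert a 64k hole at offset 64k; both multiples of a 4k block size */
            if (fallocate(fd, FALLOC_FL_INSERT_RANGE, 65536, 65536) < 0)
                    perror("fallocate(FALLOC_FL_INSERT_RANGE)");

            close(fd);
            return 0;
    }
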
@@ -997,20 +1075,6 @@ xfs_file_mmap(
 }
 
 /*
- * mmap()d file has taken write protection fault and is being made
- * writable. We can set the page state up correctly for a writable
- * page, which means we can do correct delalloc accounting (ENOSPC
- * checking!) and unwritten extent mapping.
- */
-STATIC int
-xfs_vm_page_mkwrite(
-       struct vm_area_struct   *vma,
-       struct vm_fault         *vmf)
-{
-       return block_page_mkwrite(vma, vmf, xfs_get_blocks);
-}
-
-/*
  * This type is designed to indicate the type of offset we would like
  * to search from page cache for xfs_seek_hole_data().
  */
@@ -1385,6 +1449,55 @@ xfs_file_llseek(
        }
 }
 
+/*
+ * Locking for serialisation of IO during page faults. This results in a lock
+ * ordering of:
+ *
+ * mmap_sem (MM)
+ *   i_mmap_lock (XFS - truncate serialisation)
+ *     page_lock (MM)
+ *       i_lock (XFS - extent map serialisation)
+ */
+STATIC int
+xfs_filemap_fault(
+       struct vm_area_struct   *vma,
+       struct vm_fault         *vmf)
+{
+       struct xfs_inode        *ip = XFS_I(vma->vm_file->f_mapping->host);
+       int                     error;
+
+       trace_xfs_filemap_fault(ip);
+
+       xfs_ilock(ip, XFS_MMAPLOCK_SHARED);
+       error = filemap_fault(vma, vmf);
+       xfs_iunlock(ip, XFS_MMAPLOCK_SHARED);
+
+       return error;
+}
+
+/*
+ * mmap()d file has taken write protection fault and is being made writable. We
+ * can set the page state up correctly for a writable page, which means we can
+ * do correct delalloc accounting (ENOSPC checking!) and unwritten extent
+ * mapping.
+ */
+STATIC int
+xfs_filemap_page_mkwrite(
+       struct vm_area_struct   *vma,
+       struct vm_fault         *vmf)
+{
+       struct xfs_inode        *ip = XFS_I(vma->vm_file->f_mapping->host);
+       int                     error;
+
+       trace_xfs_filemap_page_mkwrite(ip);
+
+       xfs_ilock(ip, XFS_MMAPLOCK_SHARED);
+       error = block_page_mkwrite(vma, vmf, xfs_get_blocks);
+       xfs_iunlock(ip, XFS_MMAPLOCK_SHARED);
+
+       return error;
+}
+
 const struct file_operations xfs_file_operations = {
        .llseek         = xfs_file_llseek,
        .read_iter      = xfs_file_read_iter,
@@ -1415,7 +1528,7 @@ const struct file_operations xfs_dir_file_operations = {
 };
 
 static const struct vm_operations_struct xfs_file_vm_ops = {
-       .fault          = filemap_fault,
+       .fault          = xfs_filemap_fault,
        .map_pages      = filemap_map_pages,
-       .page_mkwrite   = xfs_vm_page_mkwrite,
+       .page_mkwrite   = xfs_filemap_page_mkwrite,
 };
index a2e86e8..8f9f854 100644 (file)
@@ -322,7 +322,7 @@ xfs_filestream_lookup_ag(
 
        pip = xfs_filestream_get_parent(ip);
        if (!pip)
-               goto out;
+               return NULLAGNUMBER;
 
        mru = xfs_mru_cache_lookup(mp->m_filestream, pip->i_ino);
        if (mru) {
index 74efe5b..cb7e8a2 100644 (file)
@@ -637,12 +637,13 @@ xfs_fs_counts(
        xfs_mount_t             *mp,
        xfs_fsop_counts_t       *cnt)
 {
-       xfs_icsb_sync_counters(mp, XFS_ICSB_LAZY_COUNT);
+       cnt->allocino = percpu_counter_read_positive(&mp->m_icount);
+       cnt->freeino = percpu_counter_read_positive(&mp->m_ifree);
+       cnt->freedata = percpu_counter_read_positive(&mp->m_fdblocks) -
+                                                       XFS_ALLOC_SET_ASIDE(mp);
+
        spin_lock(&mp->m_sb_lock);
-       cnt->freedata = mp->m_sb.sb_fdblocks - XFS_ALLOC_SET_ASIDE(mp);
        cnt->freertx = mp->m_sb.sb_frextents;
-       cnt->freeino = mp->m_sb.sb_ifree;
-       cnt->allocino = mp->m_sb.sb_icount;
        spin_unlock(&mp->m_sb_lock);
        return 0;
 }
@@ -692,14 +693,9 @@ xfs_reserve_blocks(
         * what to do. This means that the amount of free space can
         * change while we do this, so we need to retry if we end up
         * trying to reserve more space than is available.
-        *
-        * We also use the xfs_mod_incore_sb() interface so that we
-        * don't have to care about whether per cpu counter are
-        * enabled, disabled or even compiled in....
         */
 retry:
        spin_lock(&mp->m_sb_lock);
-       xfs_icsb_sync_counters_locked(mp, 0);
 
        /*
         * If our previous reservation was larger than the current value,
@@ -716,7 +712,8 @@ retry:
        } else {
                __int64_t       free;
 
-               free =  mp->m_sb.sb_fdblocks - XFS_ALLOC_SET_ASIDE(mp);
+               free = percpu_counter_sum(&mp->m_fdblocks) -
+                                                       XFS_ALLOC_SET_ASIDE(mp);
                if (!free)
                        goto out; /* ENOSPC and fdblks_delta = 0 */
 
@@ -755,8 +752,7 @@ out:
                 * the extra reserve blocks from the reserve.....
                 */
                int error;
-               error = xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS,
-                                                fdblks_delta, 0);
+               error = xfs_mod_fdblocks(mp, fdblks_delta, 0);
                if (error == -ENOSPC)
                        goto retry;
        }
index 9771b7e..76a9f27 100644 (file)
@@ -439,11 +439,11 @@ again:
        *ipp = ip;
 
        /*
-        * If we have a real type for an on-disk inode, we can set ops(&unlock)
+        * If we have a real type for an on-disk inode, we can set up the inode
         * now.  If it's a new inode being created, xfs_ialloc will handle it.
         */
        if (xfs_iflags_test(ip, XFS_INEW) && ip->i_d.di_mode != 0)
-               xfs_setup_inode(ip);
+               xfs_setup_existing_inode(ip);
        return 0;
 
 out_error_or_again:
index 6163767..d6ebc85 100644 (file)
@@ -117,24 +117,34 @@ xfs_ilock_attr_map_shared(
 }
 
 /*
- * The xfs inode contains 2 locks: a multi-reader lock called the
- * i_iolock and a multi-reader lock called the i_lock.  This routine
- * allows either or both of the locks to be obtained.
+ * The xfs inode contains 3 multi-reader locks: the i_iolock, the i_mmap_lock and
+ * the i_lock.  This routine allows various combinations of the locks to be
+ * obtained.
  *
- * The 2 locks should always be ordered so that the IO lock is
- * obtained first in order to prevent deadlock.
+ * The 3 locks should always be ordered so that the IO lock is obtained first,
+ * the mmap lock second and the ilock last in order to prevent deadlock.
  *
- * ip -- the inode being locked
- * lock_flags -- this parameter indicates the inode's locks
- *       to be locked.  It can be:
- *             XFS_IOLOCK_SHARED,
- *             XFS_IOLOCK_EXCL,
- *             XFS_ILOCK_SHARED,
- *             XFS_ILOCK_EXCL,
- *             XFS_IOLOCK_SHARED | XFS_ILOCK_SHARED,
- *             XFS_IOLOCK_SHARED | XFS_ILOCK_EXCL,
- *             XFS_IOLOCK_EXCL | XFS_ILOCK_SHARED,
- *             XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL
+ * Basic locking order:
+ *
+ * i_iolock -> i_mmap_lock -> page_lock -> i_lock
+ *
+ * mmap_sem locking order:
+ *
+ * i_iolock -> page_lock -> mmap_sem
+ * mmap_sem -> i_mmap_lock -> page_lock
+ *
+ * The difference in mmap_sem locking order means that we cannot hold the
+ * i_mmap_lock over syscall based read(2)/write(2) based IO. These IO paths can
+ * fault in pages during copy in/out (for buffered IO) or require the mmap_sem
+ * in get_user_pages() to map the user pages into the kernel address space for
+ * direct IO. Similarly the i_iolock cannot be taken inside a page fault because
+ * page faults already hold the mmap_sem.
+ *
+ * Hence to serialise fully against both syscall and mmap based IO, we need to
+ * take both the i_iolock and the i_mmap_lock. These locks should *only* be both
+ * taken in places where we need to invalidate the page cache in a race
+ * free manner (e.g. truncate, hole punch and other extent manipulation
+ * functions).
  */
 void
 xfs_ilock(
@@ -150,6 +160,8 @@ xfs_ilock(
         */
        ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
               (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
+       ASSERT((lock_flags & (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL)) !=
+              (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL));
        ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
               (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
        ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0);
@@ -159,6 +171,11 @@ xfs_ilock(
        else if (lock_flags & XFS_IOLOCK_SHARED)
                mraccess_nested(&ip->i_iolock, XFS_IOLOCK_DEP(lock_flags));
 
+       if (lock_flags & XFS_MMAPLOCK_EXCL)
+               mrupdate_nested(&ip->i_mmaplock, XFS_MMAPLOCK_DEP(lock_flags));
+       else if (lock_flags & XFS_MMAPLOCK_SHARED)
+               mraccess_nested(&ip->i_mmaplock, XFS_MMAPLOCK_DEP(lock_flags));
+
        if (lock_flags & XFS_ILOCK_EXCL)
                mrupdate_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags));
        else if (lock_flags & XFS_ILOCK_SHARED)
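
A hedged sketch of the ordering described in the comment above, as an invalidating path such as fallocate follows it (the function name is illustrative; compare the xfs_file_fallocate() hunk earlier, which takes the iolock and then the mmaplock in exactly this order):

    static void
    example_invalidate_path(struct xfs_inode *ip)
    {
            /* 1. block new syscall-based IO */
            xfs_ilock(ip, XFS_IOLOCK_EXCL);
            /* 2. block page faults so invalidation can't race with mmap IO */
            xfs_ilock(ip, XFS_MMAPLOCK_EXCL);

            /*
             * ... flush and invalidate the page cache, then take
             * XFS_ILOCK_EXCL around the actual extent manipulation ...
             */

            xfs_iunlock(ip, XFS_MMAPLOCK_EXCL | XFS_IOLOCK_EXCL);
    }
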
@@ -191,6 +208,8 @@ xfs_ilock_nowait(
         */
        ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
               (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
+       ASSERT((lock_flags & (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL)) !=
+              (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL));
        ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
               (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
        ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0);
@@ -202,21 +221,35 @@ xfs_ilock_nowait(
                if (!mrtryaccess(&ip->i_iolock))
                        goto out;
        }
+
+       if (lock_flags & XFS_MMAPLOCK_EXCL) {
+               if (!mrtryupdate(&ip->i_mmaplock))
+                       goto out_undo_iolock;
+       } else if (lock_flags & XFS_MMAPLOCK_SHARED) {
+               if (!mrtryaccess(&ip->i_mmaplock))
+                       goto out_undo_iolock;
+       }
+
        if (lock_flags & XFS_ILOCK_EXCL) {
                if (!mrtryupdate(&ip->i_lock))
-                       goto out_undo_iolock;
+                       goto out_undo_mmaplock;
        } else if (lock_flags & XFS_ILOCK_SHARED) {
                if (!mrtryaccess(&ip->i_lock))
-                       goto out_undo_iolock;
+                       goto out_undo_mmaplock;
        }
        return 1;
 
- out_undo_iolock:
+out_undo_mmaplock:
+       if (lock_flags & XFS_MMAPLOCK_EXCL)
+               mrunlock_excl(&ip->i_mmaplock);
+       else if (lock_flags & XFS_MMAPLOCK_SHARED)
+               mrunlock_shared(&ip->i_mmaplock);
+out_undo_iolock:
        if (lock_flags & XFS_IOLOCK_EXCL)
                mrunlock_excl(&ip->i_iolock);
        else if (lock_flags & XFS_IOLOCK_SHARED)
                mrunlock_shared(&ip->i_iolock);
- out:
+out:
        return 0;
 }
 
@@ -244,6 +277,8 @@ xfs_iunlock(
         */
        ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
               (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
+       ASSERT((lock_flags & (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL)) !=
+              (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL));
        ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
               (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
        ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0);
@@ -254,6 +289,11 @@ xfs_iunlock(
        else if (lock_flags & XFS_IOLOCK_SHARED)
                mrunlock_shared(&ip->i_iolock);
 
+       if (lock_flags & XFS_MMAPLOCK_EXCL)
+               mrunlock_excl(&ip->i_mmaplock);
+       else if (lock_flags & XFS_MMAPLOCK_SHARED)
+               mrunlock_shared(&ip->i_mmaplock);
+
        if (lock_flags & XFS_ILOCK_EXCL)
                mrunlock_excl(&ip->i_lock);
        else if (lock_flags & XFS_ILOCK_SHARED)
@@ -271,11 +311,14 @@ xfs_ilock_demote(
        xfs_inode_t             *ip,
        uint                    lock_flags)
 {
-       ASSERT(lock_flags & (XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL));
-       ASSERT((lock_flags & ~(XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL)) == 0);
+       ASSERT(lock_flags & (XFS_IOLOCK_EXCL|XFS_MMAPLOCK_EXCL|XFS_ILOCK_EXCL));
+       ASSERT((lock_flags &
+               ~(XFS_IOLOCK_EXCL|XFS_MMAPLOCK_EXCL|XFS_ILOCK_EXCL)) == 0);
 
        if (lock_flags & XFS_ILOCK_EXCL)
                mrdemote(&ip->i_lock);
+       if (lock_flags & XFS_MMAPLOCK_EXCL)
+               mrdemote(&ip->i_mmaplock);
        if (lock_flags & XFS_IOLOCK_EXCL)
                mrdemote(&ip->i_iolock);
 
@@ -294,6 +337,12 @@ xfs_isilocked(
                return rwsem_is_locked(&ip->i_lock.mr_lock);
        }
 
+       if (lock_flags & (XFS_MMAPLOCK_EXCL|XFS_MMAPLOCK_SHARED)) {
+               if (!(lock_flags & XFS_MMAPLOCK_SHARED))
+                       return !!ip->i_mmaplock.mr_writer;
+               return rwsem_is_locked(&ip->i_mmaplock.mr_lock);
+       }
+
        if (lock_flags & (XFS_IOLOCK_EXCL|XFS_IOLOCK_SHARED)) {
                if (!(lock_flags & XFS_IOLOCK_SHARED))
                        return !!ip->i_iolock.mr_writer;
@@ -314,14 +363,27 @@ int xfs_lock_delays;
 #endif
 
 /*
- * Bump the subclass so xfs_lock_inodes() acquires each lock with
- * a different value
+ * Bump the subclass so xfs_lock_inodes() acquires each lock with a different
+ * value. This shouldn't be called for page fault locking, but we also need to
+ * ensure we don't overrun the number of lockdep subclasses for the iolock or
+ * mmaplock as that is limited to 12 by the mmap lock lockdep annotations.
  */
 static inline int
 xfs_lock_inumorder(int lock_mode, int subclass)
 {
-       if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL))
+       if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL)) {
+               ASSERT(subclass + XFS_LOCK_INUMORDER <
+                       (1 << (XFS_MMAPLOCK_SHIFT - XFS_IOLOCK_SHIFT)));
                lock_mode |= (subclass + XFS_LOCK_INUMORDER) << XFS_IOLOCK_SHIFT;
+       }
+
+       if (lock_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL)) {
+               ASSERT(subclass + XFS_LOCK_INUMORDER <
+                       (1 << (XFS_ILOCK_SHIFT - XFS_MMAPLOCK_SHIFT)));
+               lock_mode |= (subclass + XFS_LOCK_INUMORDER) <<
+                                                       XFS_MMAPLOCK_SHIFT;
+       }
+
        if (lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL))
                lock_mode |= (subclass + XFS_LOCK_INUMORDER) << XFS_ILOCK_SHIFT;
 
@@ -329,15 +391,14 @@ xfs_lock_inumorder(int lock_mode, int subclass)
 }
 
 /*
- * The following routine will lock n inodes in exclusive mode.
- * We assume the caller calls us with the inodes in i_ino order.
+ * The following routine will lock n inodes in exclusive mode.  We assume the
+ * caller calls us with the inodes in i_ino order.
  *
- * We need to detect deadlock where an inode that we lock
- * is in the AIL and we start waiting for another inode that is locked
- * by a thread in a long running transaction (such as truncate). This can
- * result in deadlock since the long running trans might need to wait
- * for the inode we just locked in order to push the tail and free space
- * in the log.
+ * We need to detect deadlock where an inode that we lock is in the AIL and we
+ * start waiting for another inode that is locked by a thread in a long running
+ * transaction (such as truncate). This can result in deadlock since the long
+ * running trans might need to wait for the inode we just locked in order to
+ * push the tail and free space in the log.
  */
 void
 xfs_lock_inodes(
@@ -348,30 +409,27 @@ xfs_lock_inodes(
        int             attempts = 0, i, j, try_lock;
        xfs_log_item_t  *lp;
 
-       ASSERT(ips && (inodes >= 2)); /* we need at least two */
+       /* currently supports between 2 and 5 inodes */
+       ASSERT(ips && inodes >= 2 && inodes <= 5);
 
        try_lock = 0;
        i = 0;
-
 again:
        for (; i < inodes; i++) {
                ASSERT(ips[i]);
 
-               if (i && (ips[i] == ips[i-1]))  /* Already locked */
+               if (i && (ips[i] == ips[i - 1]))        /* Already locked */
                        continue;
 
                /*
-                * If try_lock is not set yet, make sure all locked inodes
-                * are not in the AIL.
-                * If any are, set try_lock to be used later.
+                * If try_lock is not set yet, make sure all locked inodes are
+                * not in the AIL.  If any are, set try_lock to be used later.
                 */
-
                if (!try_lock) {
                        for (j = (i - 1); j >= 0 && !try_lock; j--) {
                                lp = (xfs_log_item_t *)ips[j]->i_itemp;
-                               if (lp && (lp->li_flags & XFS_LI_IN_AIL)) {
+                               if (lp && (lp->li_flags & XFS_LI_IN_AIL))
                                        try_lock++;
-                               }
                        }
                }
 
@@ -381,51 +439,42 @@ again:
                 * we can't get any, we must release all we have
                 * and try again.
                 */
+               if (!try_lock) {
+                       xfs_ilock(ips[i], xfs_lock_inumorder(lock_mode, i));
+                       continue;
+               }
+
+               /* try_lock means we have an inode locked that is in the AIL. */
+               ASSERT(i != 0);
+               if (xfs_ilock_nowait(ips[i], xfs_lock_inumorder(lock_mode, i)))
+                       continue;
 
-               if (try_lock) {
-                       /* try_lock must be 0 if i is 0. */
+               /*
+                * Unlock all previous guys and try again.  xfs_iunlock will try
+                * to push the tail if the inode is in the AIL.
+                */
+               attempts++;
+               for (j = i - 1; j >= 0; j--) {
                        /*
-                        * try_lock means we have an inode locked
-                        * that is in the AIL.
+                        * Check to see if we've already unlocked this one.  Not
+                        * the first one going back, and the inode ptr is the
+                        * same.
                         */
-                       ASSERT(i != 0);
-                       if (!xfs_ilock_nowait(ips[i], xfs_lock_inumorder(lock_mode, i))) {
-                               attempts++;
-
-                               /*
-                                * Unlock all previous guys and try again.
-                                * xfs_iunlock will try to push the tail
-                                * if the inode is in the AIL.
-                                */
-
-                               for(j = i - 1; j >= 0; j--) {
-
-                                       /*
-                                        * Check to see if we've already
-                                        * unlocked this one.
-                                        * Not the first one going back,
-                                        * and the inode ptr is the same.
-                                        */
-                                       if ((j != (i - 1)) && ips[j] ==
-                                                               ips[j+1])
-                                               continue;
-
-                                       xfs_iunlock(ips[j], lock_mode);
-                               }
+                       if (j != (i - 1) && ips[j] == ips[j + 1])
+                               continue;
+
+                       xfs_iunlock(ips[j], lock_mode);
+               }
 
-                               if ((attempts % 5) == 0) {
-                                       delay(1); /* Don't just spin the CPU */
+               if ((attempts % 5) == 0) {
+                       delay(1); /* Don't just spin the CPU */
 #ifdef DEBUG
-                                       xfs_lock_delays++;
+                       xfs_lock_delays++;
 #endif
-                               }
-                               i = 0;
-                               try_lock = 0;
-                               goto again;
-                       }
-               } else {
-                       xfs_ilock(ips[i], xfs_lock_inumorder(lock_mode, i));
                }
+               i = 0;
+               try_lock = 0;
+               goto again;
        }
 
 #ifdef DEBUG
@@ -440,10 +489,10 @@ again:
 }
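
The lock-in-order/trylock/back-off structure above generalises beyond XFS.
As a rough userspace sketch of the same idiom (pthreads stand in for the
inode locks; the function name and the flat trylock loop are illustrative
simplifications, not kernel APIs):

    #include <pthread.h>
    #include <unistd.h>

    /*
     * Lock every mutex in the caller-supplied order.  If a trylock fails,
     * drop everything already held, back off briefly, and restart from the
     * top, mirroring the goto-again structure of xfs_lock_inodes().
     */
    static void lock_all(pthread_mutex_t **locks, int n)
    {
            int i, j;

    again:
            for (i = 0; i < n; i++) {
                    if (pthread_mutex_trylock(locks[i]) == 0)
                            continue;
                    for (j = i - 1; j >= 0; j--)
                            pthread_mutex_unlock(locks[j]);
                    usleep(1000);   /* don't just spin the CPU */
                    goto again;
            }
    }

Unlike the kernel code, this sketch trylocks every mutex; xfs_lock_inodes()
only falls back to trylock once it knows a held inode is in the AIL.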
 
 /*
- * xfs_lock_two_inodes() can only be used to lock one type of lock
- * at a time - the iolock or the ilock, but not both at once. If
- * we lock both at once, lockdep will report false positives saying
- * we have violated locking orders.
+ * xfs_lock_two_inodes() can only be used to lock one type of lock at a time -
+ * the iolock, the mmaplock or the ilock, but not more than one at a time. If we
+ * lock more than one at a time, lockdep will report false positives saying we
+ * have violated locking orders.
  */
 void
 xfs_lock_two_inodes(
@@ -455,8 +504,12 @@ xfs_lock_two_inodes(
        int                     attempts = 0;
        xfs_log_item_t          *lp;
 
-       if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL))
-               ASSERT((lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL)) == 0);
+       if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL)) {
+               ASSERT(!(lock_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL)));
+               ASSERT(!(lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL)));
+       } else if (lock_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL))
+               ASSERT(!(lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL)));
+
        ASSERT(ip0->i_ino != ip1->i_ino);
 
        if (ip0->i_ino > ip1->i_ino) {
@@ -818,7 +871,7 @@ xfs_ialloc(
        xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
        xfs_trans_log_inode(tp, ip, flags);
 
-       /* now that we have an i_mode we can setup inode ops and unlock */
+       /* now that we have an i_mode we can set up the inode structure */
        xfs_setup_inode(ip);
 
        *ipp = ip;
@@ -1235,12 +1288,14 @@ xfs_create(
        xfs_trans_cancel(tp, cancel_flags);
  out_release_inode:
        /*
-        * Wait until after the current transaction is aborted to
-        * release the inode.  This prevents recursive transactions
-        * and deadlocks from xfs_inactive.
+        * Wait until after the current transaction is aborted to finish the
+        * setup of the inode and release the inode.  This prevents recursive
+        * transactions and deadlocks from xfs_inactive.
         */
-       if (ip)
+       if (ip) {
+               xfs_finish_inode_setup(ip);
                IRELE(ip);
+       }
 
        xfs_qm_dqrele(udqp);
        xfs_qm_dqrele(gdqp);
@@ -1345,12 +1400,14 @@ xfs_create_tmpfile(
        xfs_trans_cancel(tp, cancel_flags);
  out_release_inode:
        /*
-        * Wait until after the current transaction is aborted to
-        * release the inode.  This prevents recursive transactions
-        * and deadlocks from xfs_inactive.
+        * Wait until after the current transaction is aborted to finish the
+        * setup of the inode and release the inode.  This prevents recursive
+        * transactions and deadlocks from xfs_inactive.
         */
-       if (ip)
+       if (ip) {
+               xfs_finish_inode_setup(ip);
                IRELE(ip);
+       }
 
        xfs_qm_dqrele(udqp);
        xfs_qm_dqrele(gdqp);
@@ -2611,19 +2668,22 @@ xfs_remove(
 /*
  * Enter all inodes for a rename transaction into a sorted array.
  */
+#define __XFS_SORT_INODES      5
 STATIC void
 xfs_sort_for_rename(
-       xfs_inode_t     *dp1,   /* in: old (source) directory inode */
-       xfs_inode_t     *dp2,   /* in: new (target) directory inode */
-       xfs_inode_t     *ip1,   /* in: inode of old entry */
-       xfs_inode_t     *ip2,   /* in: inode of new entry, if it
-                                  already exists, NULL otherwise. */
-       xfs_inode_t     **i_tab,/* out: array of inode returned, sorted */
-       int             *num_inodes)  /* out: number of inodes in array */
+       struct xfs_inode        *dp1,   /* in: old (source) directory inode */
+       struct xfs_inode        *dp2,   /* in: new (target) directory inode */
+       struct xfs_inode        *ip1,   /* in: inode of old entry */
+       struct xfs_inode        *ip2,   /* in: inode of new entry */
+       struct xfs_inode        *wip,   /* in: whiteout inode */
+       struct xfs_inode        **i_tab,/* out: sorted array of inodes */
+       int                     *num_inodes)  /* in/out: inodes in array */
 {
-       xfs_inode_t             *temp;
        int                     i, j;
 
+       ASSERT(*num_inodes == __XFS_SORT_INODES);
+       memset(i_tab, 0, *num_inodes * sizeof(struct xfs_inode *));
+
        /*
         * i_tab contains a list of pointers to inodes.  We initialize
         * the table here & we'll sort it.  We will then use it to
@@ -2631,25 +2691,24 @@ xfs_sort_for_rename(
         *
         * Note that the table may contain duplicates.  e.g., dp1 == dp2.
         */
-       i_tab[0] = dp1;
-       i_tab[1] = dp2;
-       i_tab[2] = ip1;
-       if (ip2) {
-               *num_inodes = 4;
-               i_tab[3] = ip2;
-       } else {
-               *num_inodes = 3;
-               i_tab[3] = NULL;
-       }
+       i = 0;
+       i_tab[i++] = dp1;
+       i_tab[i++] = dp2;
+       i_tab[i++] = ip1;
+       if (ip2)
+               i_tab[i++] = ip2;
+       if (wip)
+               i_tab[i++] = wip;
+       *num_inodes = i;
 
        /*
         * Sort the elements via bubble sort.  (Remember, there are at
-        * most 4 elements to sort, so this is adequate.)
+        * most 5 elements to sort, so this is adequate.)
         */
        for (i = 0; i < *num_inodes; i++) {
                for (j = 1; j < *num_inodes; j++) {
                        if (i_tab[j]->i_ino < i_tab[j-1]->i_ino) {
-                               temp = i_tab[j];
+                               struct xfs_inode *temp = i_tab[j];
                                i_tab[j] = i_tab[j-1];
                                i_tab[j-1] = temp;
                        }
@@ -2657,6 +2716,31 @@ xfs_sort_for_rename(
        }
 }
 
+static int
+xfs_finish_rename(
+       struct xfs_trans        *tp,
+       struct xfs_bmap_free    *free_list)
+{
+       int                     committed = 0;
+       int                     error;
+
+       /*
+        * If this is a synchronous mount, make sure that the rename transaction
+        * goes to disk before returning to the user.
+        */
+       if (tp->t_mountp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC))
+               xfs_trans_set_sync(tp);
+
+       error = xfs_bmap_finish(&tp, free_list, &committed);
+       if (error) {
+               xfs_bmap_cancel(free_list);
+               xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT);
+               return error;
+       }
+
+       return xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+}
+
 /*
  * xfs_cross_rename()
  *
@@ -2685,14 +2769,14 @@ xfs_cross_rename(
                                ip2->i_ino,
                                first_block, free_list, spaceres);
        if (error)
-               goto out;
+               goto out_trans_abort;
 
        /* Swap inode number for dirent in second parent */
        error = xfs_dir_replace(tp, dp2, name2,
                                ip1->i_ino,
                                first_block, free_list, spaceres);
        if (error)
-               goto out;
+               goto out_trans_abort;
 
        /*
         * If we're renaming one or more directories across different parents,
@@ -2707,16 +2791,16 @@ xfs_cross_rename(
                                                dp1->i_ino, first_block,
                                                free_list, spaceres);
                        if (error)
-                               goto out;
+                               goto out_trans_abort;
 
                        /* transfer ip2 ".." reference to dp1 */
                        if (!S_ISDIR(ip1->i_d.di_mode)) {
                                error = xfs_droplink(tp, dp2);
                                if (error)
-                                       goto out;
+                                       goto out_trans_abort;
                                error = xfs_bumplink(tp, dp1);
                                if (error)
-                                       goto out;
+                                       goto out_trans_abort;
                        }
 
                        /*
@@ -2734,16 +2818,16 @@ xfs_cross_rename(
                                                dp2->i_ino, first_block,
                                                free_list, spaceres);
                        if (error)
-                               goto out;
+                               goto out_trans_abort;
 
                        /* transfer ip1 ".." reference to dp2 */
                        if (!S_ISDIR(ip2->i_d.di_mode)) {
                                error = xfs_droplink(tp, dp1);
                                if (error)
-                                       goto out;
+                                       goto out_trans_abort;
                                error = xfs_bumplink(tp, dp2);
                                if (error)
-                                       goto out;
+                                       goto out_trans_abort;
                        }
 
                        /*
@@ -2771,66 +2855,108 @@ xfs_cross_rename(
        }
        xfs_trans_ichgtime(tp, dp1, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
        xfs_trans_log_inode(tp, dp1, XFS_ILOG_CORE);
-out:
+       return xfs_finish_rename(tp, free_list);
+
+out_trans_abort:
+       xfs_bmap_cancel(free_list);
+       xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT);
        return error;
 }
 
 /*
+ * xfs_rename_alloc_whiteout()
+ *
+ * Return a referenced, unlinked, unlocked inode that can be used as a
+ * whiteout in a rename transaction. We use a tmpfile inode here so that if we
+ * crash between allocating the inode and linking it into the rename
+ * transaction, recovery will free the inode and we won't leak it.
+ */
+static int
+xfs_rename_alloc_whiteout(
+       struct xfs_inode        *dp,
+       struct xfs_inode        **wip)
+{
+       struct xfs_inode        *tmpfile;
+       int                     error;
+
+       error = xfs_create_tmpfile(dp, NULL, S_IFCHR | WHITEOUT_MODE, &tmpfile);
+       if (error)
+               return error;
+
+       /* Satisfy xfs_bumplink that this is a real tmpfile */
+       xfs_finish_inode_setup(tmpfile);
+       VFS_I(tmpfile)->i_state |= I_LINKABLE;
+
+       *wip = tmpfile;
+       return 0;
+}
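
A whiteout created this way is, at the VFS level, just a character device
inode with device number 0 (S_IFCHR | WHITEOUT_MODE above), so a consumer
such as an overlay filesystem can recognise one with an ordinary lstat().
A minimal userspace sketch, assuming a POSIX environment (the helper name
is illustrative):

    #include <stdio.h>
    #include <sys/stat.h>

    /* Returns 1 if path looks like a whiteout: a char device with rdev 0. */
    static int is_whiteout(const char *path)
    {
            struct stat st;

            if (lstat(path, &st) != 0)
                    return 0;
            return S_ISCHR(st.st_mode) && st.st_rdev == 0;
    }

    int main(int argc, char **argv)
    {
            if (argc == 2)
                    printf("%s: %s\n", argv[1],
                           is_whiteout(argv[1]) ? "whiteout" : "not a whiteout");
            return 0;
    }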
+
+/*
  * xfs_rename
  */
 int
 xfs_rename(
-       xfs_inode_t     *src_dp,
-       struct xfs_name *src_name,
-       xfs_inode_t     *src_ip,
-       xfs_inode_t     *target_dp,
-       struct xfs_name *target_name,
-       xfs_inode_t     *target_ip,
-       unsigned int    flags)
+       struct xfs_inode        *src_dp,
+       struct xfs_name         *src_name,
+       struct xfs_inode        *src_ip,
+       struct xfs_inode        *target_dp,
+       struct xfs_name         *target_name,
+       struct xfs_inode        *target_ip,
+       unsigned int            flags)
 {
-       xfs_trans_t     *tp = NULL;
-       xfs_mount_t     *mp = src_dp->i_mount;
-       int             new_parent;             /* moving to a new dir */
-       int             src_is_directory;       /* src_name is a directory */
-       int             error;
-       xfs_bmap_free_t free_list;
-       xfs_fsblock_t   first_block;
-       int             cancel_flags;
-       int             committed;
-       xfs_inode_t     *inodes[4];
-       int             spaceres;
-       int             num_inodes;
+       struct xfs_mount        *mp = src_dp->i_mount;
+       struct xfs_trans        *tp;
+       struct xfs_bmap_free    free_list;
+       xfs_fsblock_t           first_block;
+       struct xfs_inode        *wip = NULL;            /* whiteout inode */
+       struct xfs_inode        *inodes[__XFS_SORT_INODES];
+       int                     num_inodes = __XFS_SORT_INODES;
+       bool                    new_parent = (src_dp != target_dp);
+       bool                    src_is_directory = S_ISDIR(src_ip->i_d.di_mode);
+       int                     cancel_flags = 0;
+       int                     spaceres;
+       int                     error;
 
        trace_xfs_rename(src_dp, target_dp, src_name, target_name);
 
-       new_parent = (src_dp != target_dp);
-       src_is_directory = S_ISDIR(src_ip->i_d.di_mode);
+       if ((flags & RENAME_EXCHANGE) && !target_ip)
+               return -EINVAL;
 
-       xfs_sort_for_rename(src_dp, target_dp, src_ip, target_ip,
+       /*
+        * If we are doing a whiteout operation, allocate the whiteout inode
+        * we will leave behind in place of the source entry and ensure the
+        * dirent type is set appropriately.
+        */
+       if (flags & RENAME_WHITEOUT) {
+               ASSERT(!(flags & (RENAME_NOREPLACE | RENAME_EXCHANGE)));
+               error = xfs_rename_alloc_whiteout(target_dp, &wip);
+               if (error)
+                       return error;
+
+               /* set up the replacement source dirent as a whiteout */
+               src_name->type = XFS_DIR3_FT_CHRDEV;
+       }
+
+       xfs_sort_for_rename(src_dp, target_dp, src_ip, target_ip, wip,
                                inodes, &num_inodes);
 
-       xfs_bmap_init(&free_list, &first_block);
        tp = xfs_trans_alloc(mp, XFS_TRANS_RENAME);
-       cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
        spaceres = XFS_RENAME_SPACE_RES(mp, target_name->len);
        error = xfs_trans_reserve(tp, &M_RES(mp)->tr_rename, spaceres, 0);
        if (error == -ENOSPC) {
                spaceres = 0;
                error = xfs_trans_reserve(tp, &M_RES(mp)->tr_rename, 0, 0);
        }
-       if (error) {
-               xfs_trans_cancel(tp, 0);
-               goto std_return;
-       }
+       if (error)
+               goto out_trans_cancel;
+       cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
 
        /*
         * Attach the dquots to the inodes
         */
        error = xfs_qm_vop_rename_dqattach(inodes);
-       if (error) {
-               xfs_trans_cancel(tp, cancel_flags);
-               goto std_return;
-       }
+       if (error)
+               goto out_trans_cancel;
 
        /*
         * Lock all the participating inodes. Depending upon whether
@@ -2851,6 +2977,8 @@ xfs_rename(
        xfs_trans_ijoin(tp, src_ip, XFS_ILOCK_EXCL);
        if (target_ip)
                xfs_trans_ijoin(tp, target_ip, XFS_ILOCK_EXCL);
+       if (wip)
+               xfs_trans_ijoin(tp, wip, XFS_ILOCK_EXCL);
 
        /*
         * If we are using project inheritance, we only allow renames
@@ -2860,24 +2988,16 @@ xfs_rename(
        if (unlikely((target_dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) &&
                     (xfs_get_projid(target_dp) != xfs_get_projid(src_ip)))) {
                error = -EXDEV;
-               goto error_return;
+               goto out_trans_cancel;
        }
 
-       /*
-        * Handle RENAME_EXCHANGE flags
-        */
-       if (flags & RENAME_EXCHANGE) {
-               if (target_ip == NULL) {
-                       error = -EINVAL;
-                       goto error_return;
-               }
-               error = xfs_cross_rename(tp, src_dp, src_name, src_ip,
-                                        target_dp, target_name, target_ip,
-                                        &free_list, &first_block, spaceres);
-               if (error)
-                       goto abort_return;
-               goto finish_rename;
-       }
+       xfs_bmap_init(&free_list, &first_block);
+
+       /* RENAME_EXCHANGE is unique from here on. */
+       if (flags & RENAME_EXCHANGE)
+               return xfs_cross_rename(tp, src_dp, src_name, src_ip,
+                                       target_dp, target_name, target_ip,
+                                       &free_list, &first_block, spaceres);
 
        /*
         * Set up the target.
@@ -2890,7 +3010,7 @@ xfs_rename(
                if (!spaceres) {
                        error = xfs_dir_canenter(tp, target_dp, target_name);
                        if (error)
-                               goto error_return;
+                               goto out_trans_cancel;
                }
                /*
                 * If target does not exist and the rename crosses
@@ -2901,9 +3021,9 @@ xfs_rename(
                                                src_ip->i_ino, &first_block,
                                                &free_list, spaceres);
                if (error == -ENOSPC)
-                       goto error_return;
+                       goto out_bmap_cancel;
                if (error)
-                       goto abort_return;
+                       goto out_trans_abort;
 
                xfs_trans_ichgtime(tp, target_dp,
                                        XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
@@ -2911,7 +3031,7 @@ xfs_rename(
                if (new_parent && src_is_directory) {
                        error = xfs_bumplink(tp, target_dp);
                        if (error)
-                               goto abort_return;
+                               goto out_trans_abort;
                }
        } else { /* target_ip != NULL */
                /*
@@ -2926,7 +3046,7 @@ xfs_rename(
                        if (!(xfs_dir_isempty(target_ip)) ||
                            (target_ip->i_d.di_nlink > 2)) {
                                error = -EEXIST;
-                               goto error_return;
+                               goto out_trans_cancel;
                        }
                }
 
@@ -2943,7 +3063,7 @@ xfs_rename(
                                        src_ip->i_ino,
                                        &first_block, &free_list, spaceres);
                if (error)
-                       goto abort_return;
+                       goto out_trans_abort;
 
                xfs_trans_ichgtime(tp, target_dp,
                                        XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
@@ -2954,7 +3074,7 @@ xfs_rename(
                 */
                error = xfs_droplink(tp, target_ip);
                if (error)
-                       goto abort_return;
+                       goto out_trans_abort;
 
                if (src_is_directory) {
                        /*
@@ -2962,7 +3082,7 @@ xfs_rename(
                         */
                        error = xfs_droplink(tp, target_ip);
                        if (error)
-                               goto abort_return;
+                               goto out_trans_abort;
                }
        } /* target_ip != NULL */
 
@@ -2979,7 +3099,7 @@ xfs_rename(
                                        &first_block, &free_list, spaceres);
                ASSERT(error != -EEXIST);
                if (error)
-                       goto abort_return;
+                       goto out_trans_abort;
        }
 
        /*
@@ -3005,49 +3125,67 @@ xfs_rename(
                 */
                error = xfs_droplink(tp, src_dp);
                if (error)
-                       goto abort_return;
+                       goto out_trans_abort;
        }
 
-       error = xfs_dir_removename(tp, src_dp, src_name, src_ip->i_ino,
+       /*
+        * For whiteouts, we only need to update the source dirent with the
+        * inode number of the whiteout inode rather than removing it
+        * altogether.
+        */
+       if (wip) {
+               error = xfs_dir_replace(tp, src_dp, src_name, wip->i_ino,
                                        &first_block, &free_list, spaceres);
+       } else
+               error = xfs_dir_removename(tp, src_dp, src_name, src_ip->i_ino,
+                                          &first_block, &free_list, spaceres);
        if (error)
-               goto abort_return;
-
-       xfs_trans_ichgtime(tp, src_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
-       xfs_trans_log_inode(tp, src_dp, XFS_ILOG_CORE);
-       if (new_parent)
-               xfs_trans_log_inode(tp, target_dp, XFS_ILOG_CORE);
+               goto out_trans_abort;
 
-finish_rename:
        /*
-        * If this is a synchronous mount, make sure that the
-        * rename transaction goes to disk before returning to
-        * the user.
+        * For whiteouts, we need to bump the link count on the whiteout inode.
+        * This means that failures all the way up to this point leave the inode
+        * on the unlinked list and so cleanup is a simple matter of dropping
+        * the remaining reference to it. If we fail here after bumping the link
+        * count, we're shutting down the filesystem so we'll never see the
+        * intermediate state on disk.
         */
-       if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
-               xfs_trans_set_sync(tp);
-       }
+       if (wip) {
+               ASSERT(wip->i_d.di_nlink == 0);
+               error = xfs_bumplink(tp, wip);
+               if (error)
+                       goto out_trans_abort;
+               error = xfs_iunlink_remove(tp, wip);
+               if (error)
+                       goto out_trans_abort;
+               xfs_trans_log_inode(tp, wip, XFS_ILOG_CORE);
 
-       error = xfs_bmap_finish(&tp, &free_list, &committed);
-       if (error) {
-               xfs_bmap_cancel(&free_list);
-               xfs_trans_cancel(tp, (XFS_TRANS_RELEASE_LOG_RES |
-                                XFS_TRANS_ABORT));
-               goto std_return;
+               /*
+                * Now we have a real link, clear the "I'm a tmpfile" state
+                * flag from the inode so it doesn't accidentally get misused in
+                * future.
+                */
+               VFS_I(wip)->i_state &= ~I_LINKABLE;
        }
 
-       /*
-        * trans_commit will unlock src_ip, target_ip & decrement
-        * the vnode references.
-        */
-       return xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+       xfs_trans_ichgtime(tp, src_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+       xfs_trans_log_inode(tp, src_dp, XFS_ILOG_CORE);
+       if (new_parent)
+               xfs_trans_log_inode(tp, target_dp, XFS_ILOG_CORE);
 
- abort_return:
+       error = xfs_finish_rename(tp, &free_list);
+       if (wip)
+               IRELE(wip);
+       return error;
+
+out_trans_abort:
        cancel_flags |= XFS_TRANS_ABORT;
- error_return:
+out_bmap_cancel:
        xfs_bmap_cancel(&free_list);
+out_trans_cancel:
        xfs_trans_cancel(tp, cancel_flags);
- std_return:
+       if (wip)
+               IRELE(wip);
        return error;
 }
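
For reference, the flag this path implements is driven from userspace via
renameat2(): RENAME_WHITEOUT atomically renames the source to the target
and leaves a whiteout behind at the source. A hedged usage sketch (older
glibc has no renameat2() wrapper, hence the raw syscall; "old" and "new"
are placeholder paths):

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <stdio.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    #ifndef RENAME_WHITEOUT
    #define RENAME_WHITEOUT (1 << 2)        /* from linux/fs.h */
    #endif

    int main(void)
    {
            /* rename "old" -> "new", leaving a whiteout at "old" */
            if (syscall(SYS_renameat2, AT_FDCWD, "old", AT_FDCWD, "new",
                        RENAME_WHITEOUT) != 0) {
                    perror("renameat2");
                    return 1;
            }
            return 0;
    }

On a filesystem without whiteout support the call typically fails with
EINVAL, which is why xfs_vn_rename below must start accepting the flag.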
 
index a1cd55f..8f22d20 100644 (file)
@@ -56,6 +56,7 @@ typedef struct xfs_inode {
        struct xfs_inode_log_item *i_itemp;     /* logging information */
        mrlock_t                i_lock;         /* inode lock */
        mrlock_t                i_iolock;       /* inode IO lock */
+       mrlock_t                i_mmaplock;     /* inode mmap IO lock */
        atomic_t                i_pincount;     /* inode pin count */
        spinlock_t              i_flags_lock;   /* inode i_flags lock */
        /* Miscellaneous state. */
@@ -263,15 +264,20 @@ static inline int xfs_isiflocked(struct xfs_inode *ip)
 #define        XFS_IOLOCK_SHARED       (1<<1)
 #define        XFS_ILOCK_EXCL          (1<<2)
 #define        XFS_ILOCK_SHARED        (1<<3)
+#define        XFS_MMAPLOCK_EXCL       (1<<4)
+#define        XFS_MMAPLOCK_SHARED     (1<<5)
 
 #define XFS_LOCK_MASK          (XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED \
-                               | XFS_ILOCK_EXCL | XFS_ILOCK_SHARED)
+                               | XFS_ILOCK_EXCL | XFS_ILOCK_SHARED \
+                               | XFS_MMAPLOCK_EXCL | XFS_MMAPLOCK_SHARED)
 
 #define XFS_LOCK_FLAGS \
        { XFS_IOLOCK_EXCL,      "IOLOCK_EXCL" }, \
        { XFS_IOLOCK_SHARED,    "IOLOCK_SHARED" }, \
        { XFS_ILOCK_EXCL,       "ILOCK_EXCL" }, \
-       { XFS_ILOCK_SHARED,     "ILOCK_SHARED" }
+       { XFS_ILOCK_SHARED,     "ILOCK_SHARED" }, \
+       { XFS_MMAPLOCK_EXCL,    "MMAPLOCK_EXCL" }, \
+       { XFS_MMAPLOCK_SHARED,  "MMAPLOCK_SHARED" }
 
 
 /*
@@ -302,17 +308,26 @@ static inline int xfs_isiflocked(struct xfs_inode *ip)
 #define XFS_IOLOCK_SHIFT       16
 #define        XFS_IOLOCK_PARENT       (XFS_LOCK_PARENT << XFS_IOLOCK_SHIFT)
 
+#define XFS_MMAPLOCK_SHIFT     20
+
 #define XFS_ILOCK_SHIFT                24
 #define        XFS_ILOCK_PARENT        (XFS_LOCK_PARENT << XFS_ILOCK_SHIFT)
 #define        XFS_ILOCK_RTBITMAP      (XFS_LOCK_RTBITMAP << XFS_ILOCK_SHIFT)
 #define        XFS_ILOCK_RTSUM         (XFS_LOCK_RTSUM << XFS_ILOCK_SHIFT)
 
-#define XFS_IOLOCK_DEP_MASK    0x00ff0000
+#define XFS_IOLOCK_DEP_MASK    0x000f0000
+#define XFS_MMAPLOCK_DEP_MASK  0x00f00000
 #define XFS_ILOCK_DEP_MASK     0xff000000
-#define XFS_LOCK_DEP_MASK      (XFS_IOLOCK_DEP_MASK | XFS_ILOCK_DEP_MASK)
+#define XFS_LOCK_DEP_MASK      (XFS_IOLOCK_DEP_MASK | \
+                                XFS_MMAPLOCK_DEP_MASK | \
+                                XFS_ILOCK_DEP_MASK)
 
-#define XFS_IOLOCK_DEP(flags)  (((flags) & XFS_IOLOCK_DEP_MASK) >> XFS_IOLOCK_SHIFT)
-#define XFS_ILOCK_DEP(flags)   (((flags) & XFS_ILOCK_DEP_MASK) >> XFS_ILOCK_SHIFT)
+#define XFS_IOLOCK_DEP(flags)  (((flags) & XFS_IOLOCK_DEP_MASK) \
+                                       >> XFS_IOLOCK_SHIFT)
+#define XFS_MMAPLOCK_DEP(flags)        (((flags) & XFS_MMAPLOCK_DEP_MASK) \
+                                       >> XFS_MMAPLOCK_SHIFT)
+#define XFS_ILOCK_DEP(flags)   (((flags) & XFS_ILOCK_DEP_MASK) \
+                                       >> XFS_ILOCK_SHIFT)
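
As a worked example of this encoding, a lockdep subclass is simply stored
in the nibble reserved for its lock class and recovered with the matching
_DEP() macro. A standalone sketch (constants copied from above; note that
xfs_lock_inumorder() also adds an inumorder base to the subclass, which is
omitted here for clarity):

    #include <stdio.h>

    #define XFS_MMAPLOCK_SHIFT      20
    #define XFS_MMAPLOCK_DEP_MASK   0x00f00000
    #define XFS_MMAPLOCK_DEP(f)     (((f) & XFS_MMAPLOCK_DEP_MASK) \
                                            >> XFS_MMAPLOCK_SHIFT)

    int main(void)
    {
            /* e.g. the third inode of an in-order multi-lock: subclass 2 */
            unsigned int lock_mode = (1u << 5)      /* XFS_MMAPLOCK_SHARED */
                                   | (2u << XFS_MMAPLOCK_SHIFT);

            printf("mmaplock subclass = %u\n", XFS_MMAPLOCK_DEP(lock_mode));
            return 0;
    }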
 
 /*
  * For multiple groups support: if S_ISGID bit is set in the parent
@@ -391,6 +406,28 @@ int        xfs_zero_eof(struct xfs_inode *ip, xfs_off_t offset,
 int    xfs_iozero(struct xfs_inode *ip, loff_t pos, size_t count);
 
 
+/* from xfs_iops.c */
+/*
+ * When setting up a newly allocated inode, we need to call
+ * xfs_finish_inode_setup() once the inode is fully instantiated at
+ * the VFS level to prevent the rest of the world seeing the inode
+ * before we've completed instantiation. Otherwise we can do it
+ * the moment the inode lookup is complete.
+ */
+extern void xfs_setup_inode(struct xfs_inode *ip);
+static inline void xfs_finish_inode_setup(struct xfs_inode *ip)
+{
+       xfs_iflags_clear(ip, XFS_INEW);
+       barrier();
+       unlock_new_inode(VFS_I(ip));
+}
+
+static inline void xfs_setup_existing_inode(struct xfs_inode *ip)
+{
+       xfs_setup_inode(ip);
+       xfs_finish_inode_setup(ip);
+}
+
 #define IHOLD(ip) \
 do { \
        ASSERT(atomic_read(&VFS_I(ip)->i_count) > 0) ; \
index ac4feae..5f4a396 100644 (file)
@@ -631,7 +631,7 @@ xfs_ioc_space(
 
        if (filp->f_flags & O_DSYNC)
                flags |= XFS_PREALLOC_SYNC;
-       if (ioflags & XFS_IO_INVIS)     
+       if (ioflags & XFS_IO_INVIS)
                flags |= XFS_PREALLOC_INVISIBLE;
 
        error = mnt_want_write_file(filp);
@@ -639,10 +639,13 @@ xfs_ioc_space(
                return error;
 
        xfs_ilock(ip, iolock);
-       error = xfs_break_layouts(inode, &iolock);
+       error = xfs_break_layouts(inode, &iolock, false);
        if (error)
                goto out_unlock;
 
+       xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
+       iolock |= XFS_MMAPLOCK_EXCL;
+
        switch (bf->l_whence) {
        case 0: /*SEEK_SET*/
                break;
index ccb1dd0..38e633b 100644 (file)
@@ -460,8 +460,7 @@ xfs_iomap_prealloc_size(
        alloc_blocks = XFS_FILEOFF_MIN(roundup_pow_of_two(MAXEXTLEN),
                                       alloc_blocks);
 
-       xfs_icsb_sync_counters(mp, XFS_ICSB_LAZY_COUNT);
-       freesp = mp->m_sb.sb_fdblocks;
+       freesp = percpu_counter_read_positive(&mp->m_fdblocks);
        if (freesp < mp->m_low_space[XFS_LOWSP_5_PCNT]) {
                shift = 2;
                if (freesp < mp->m_low_space[XFS_LOWSP_4_PCNT])
index e53a903..2f1839e 100644 (file)
@@ -187,6 +187,8 @@ xfs_generic_create(
        else
                d_instantiate(dentry, inode);
 
+       xfs_finish_inode_setup(ip);
+
  out_free_acl:
        if (default_acl)
                posix_acl_release(default_acl);
@@ -195,6 +197,7 @@ xfs_generic_create(
        return error;
 
  out_cleanup_inode:
+       xfs_finish_inode_setup(ip);
        if (!tmpfile)
                xfs_cleanup_inode(dir, inode, dentry);
        iput(inode);
@@ -367,9 +370,11 @@ xfs_vn_symlink(
                goto out_cleanup_inode;
 
        d_instantiate(dentry, inode);
+       xfs_finish_inode_setup(cip);
        return 0;
 
  out_cleanup_inode:
+       xfs_finish_inode_setup(cip);
        xfs_cleanup_inode(dir, inode, dentry);
        iput(inode);
  out:
@@ -389,7 +394,7 @@ xfs_vn_rename(
        struct xfs_name oname;
        struct xfs_name nname;
 
-       if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE))
+       if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT))
                return -EINVAL;
 
        /* if we are exchanging files, we need to set i_mode of both files */
@@ -766,6 +771,7 @@ xfs_setattr_size(
                return error;
 
        ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
+       ASSERT(xfs_isilocked(ip, XFS_MMAPLOCK_EXCL));
        ASSERT(S_ISREG(ip->i_d.di_mode));
        ASSERT((iattr->ia_valid & (ATTR_UID|ATTR_GID|ATTR_ATIME|ATTR_ATIME_SET|
                ATTR_MTIME_SET|ATTR_KILL_PRIV|ATTR_TIMES_SET)) == 0);
@@ -829,55 +835,27 @@ xfs_setattr_size(
        inode_dio_wait(inode);
 
        /*
-        * Do all the page cache truncate work outside the transaction context
-        * as the "lock" order is page lock->log space reservation.  i.e.
-        * locking pages inside the transaction can ABBA deadlock with
-        * writeback. We have to do the VFS inode size update before we truncate
-        * the pagecache, however, to avoid racing with page faults beyond the
-        * new EOF they are not serialised against truncate operations except by
-        * page locks and size updates.
+        * We've already locked out new page faults, so now we can safely remove
+        * pages from the page cache knowing they won't get refaulted until we
+        * drop the XFS_MMAPLOCK_EXCL lock after the extent manipulations are
+        * complete. The truncate_setsize() call also cleans partial EOF page
+        * PTEs on extending truncates and hence ensures sub-page block size
+        * filesystems are correctly handled, too.
         *
-        * Hence we are in a situation where a truncate can fail with ENOMEM
-        * from xfs_trans_reserve(), but having already truncated the in-memory
-        * version of the file (i.e. made user visible changes). There's not
-        * much we can do about this, except to hope that the caller sees ENOMEM
-        * and retries the truncate operation.
+        * We have to do all the page cache truncate work outside the
+        * transaction context as the "lock" order is page lock->log space
+        * reservation as defined by extent allocation in the writeback path.
+        * Hence a truncate can fail with ENOMEM from xfs_trans_reserve(), but
+        * having already truncated the in-memory version of the file (i.e. made
+        * user visible changes). There's not much we can do about this, except
+        * to hope that the caller sees ENOMEM and retries the truncate
+        * operation.
         */
        error = block_truncate_page(inode->i_mapping, newsize, xfs_get_blocks);
        if (error)
                return error;
        truncate_setsize(inode, newsize);
 
-       /*
-        * The "we can't serialise against page faults" pain gets worse.
-        *
-        * If the file is mapped then we have to clean the page at the old EOF
-        * when extending the file. Extending the file can expose changes the
-        * underlying page mapping (e.g. from beyond EOF to a hole or
-        * unwritten), and so on the next attempt to write to that page we need
-        * to remap it for write. i.e. we need .page_mkwrite() to be called.
-        * Hence we need to clean the page to clean the pte and so a new write
-        * fault will be triggered appropriately.
-        *
-        * If we do it before we change the inode size, then we can race with a
-        * page fault that maps the page with exactly the same problem. If we do
-        * it after we change the file size, then a new page fault can come in
-        * and allocate space before we've run the rest of the truncate
-        * transaction. That's kinda grotesque, but it's better than have data
-        * over a hole, and so that's the lesser evil that has been chosen here.
-        *
-        * The real solution, however, is to have some mechanism for locking out
-        * page faults while a truncate is in progress.
-        */
-       if (newsize > oldsize && mapping_mapped(VFS_I(ip)->i_mapping)) {
-               error = filemap_write_and_wait_range(
-                               VFS_I(ip)->i_mapping,
-                               round_down(oldsize, PAGE_CACHE_SIZE),
-                               round_up(oldsize, PAGE_CACHE_SIZE) - 1);
-               if (error)
-                       return error;
-       }
-
        tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_SIZE);
        error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0);
        if (error)
@@ -975,9 +953,13 @@ xfs_vn_setattr(
                uint            iolock = XFS_IOLOCK_EXCL;
 
                xfs_ilock(ip, iolock);
-               error = xfs_break_layouts(dentry->d_inode, &iolock);
-               if (!error)
+               error = xfs_break_layouts(dentry->d_inode, &iolock, true);
+               if (!error) {
+                       xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
+                       iolock |= XFS_MMAPLOCK_EXCL;
+
                        error = xfs_setattr_size(ip, iattr);
+               }
                xfs_iunlock(ip, iolock);
        } else {
                error = xfs_setattr_nonsize(ip, iattr, 0);
@@ -1228,16 +1210,12 @@ xfs_diflags_to_iflags(
 }
 
 /*
- * Initialize the Linux inode, set up the operation vectors and
- * unlock the inode.
+ * Initialize the Linux inode and set up the operation vectors.
  *
- * When reading existing inodes from disk this is called directly
- * from xfs_iget, when creating a new inode it is called from
- * xfs_ialloc after setting up the inode.
- *
- * We are always called with an uninitialised linux inode here.
- * We need to initialise the necessary fields and take a reference
- * on it.
+ * When reading existing inodes from disk this is called directly from xfs_iget;
+ * when creating a new inode it is called from xfs_ialloc after setting up the
+ * inode. These callers have different criteria for clearing XFS_INEW, so leave
+ * it up to the caller to deal with unlocking the inode appropriately.
  */
 void
 xfs_setup_inode(
@@ -1324,9 +1302,4 @@ xfs_setup_inode(
                inode_has_no_xattr(inode);
                cache_no_acl(inode);
        }
-
-       xfs_iflags_clear(ip, XFS_INEW);
-       barrier();
-
-       unlock_new_inode(inode);
 }
index ea7a98e..a0f84ab 100644 (file)
@@ -25,8 +25,6 @@ extern const struct file_operations xfs_dir_file_operations;
 
 extern ssize_t xfs_vn_listxattr(struct dentry *, char *data, size_t size);
 
-extern void xfs_setup_inode(struct xfs_inode *);
-
 /*
  * Internal setattr interfaces.
  */
index 82e3142..8042989 100644 (file)
@@ -229,7 +229,7 @@ xfs_bulkstat_grab_ichunk(
        error = xfs_inobt_get_rec(cur, irec, &stat);
        if (error)
                return error;
-       XFS_WANT_CORRUPTED_RETURN(stat == 1);
+       XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, stat == 1);
 
        /* Check if the record contains the inode in request */
        if (irec->ir_startino + XFS_INODES_PER_CHUNK <= agino) {
index c31d2c2..7c7842c 100644 (file)
@@ -116,15 +116,6 @@ typedef __uint64_t __psunsigned_t;
 #undef XFS_NATIVE_HOST
 #endif
 
-/*
- * Feature macros (disable/enable)
- */
-#ifdef CONFIG_SMP
-#define HAVE_PERCPU_SB /* per cpu superblock counters are a 2.6 feature */
-#else
-#undef  HAVE_PERCPU_SB /* per cpu superblock counters are a 2.6 feature */
-#endif
-
 #define irix_sgid_inherit      xfs_params.sgid_inherit.val
 #define irix_symlink_mode      xfs_params.symlink_mode.val
 #define xfs_panic_mask         xfs_params.panic_mask.val
index a5a945f..4f5784f 100644 (file)
@@ -4463,10 +4463,10 @@ xlog_do_recover(
        xfs_sb_from_disk(sbp, XFS_BUF_TO_SBP(bp));
        ASSERT(sbp->sb_magicnum == XFS_SB_MAGIC);
        ASSERT(xfs_sb_good_version(sbp));
+       xfs_reinit_percpu_counters(log->l_mp);
+
        xfs_buf_relse(bp);
 
-       /* We've re-read the superblock so re-initialize per-cpu counters */
-       xfs_icsb_reinit_counters(log->l_mp);
 
        xlog_recover_check_summary(log);
 
index 4fa80e6..2ce7ee3 100644 (file)
 #include "xfs_sysfs.h"
 
 
-#ifdef HAVE_PERCPU_SB
-STATIC void    xfs_icsb_balance_counter(xfs_mount_t *, xfs_sb_field_t,
-                                               int);
-STATIC void    xfs_icsb_balance_counter_locked(xfs_mount_t *, xfs_sb_field_t,
-                                               int);
-STATIC void    xfs_icsb_disable_counter(xfs_mount_t *, xfs_sb_field_t);
-#else
-
-#define xfs_icsb_balance_counter(mp, a, b)             do { } while (0)
-#define xfs_icsb_balance_counter_locked(mp, a, b)      do { } while (0)
-#endif
-
 static DEFINE_MUTEX(xfs_uuid_table_mutex);
 static int xfs_uuid_table_size;
 static uuid_t *xfs_uuid_table;
@@ -347,8 +335,7 @@ reread:
                goto reread;
        }
 
-       /* Initialize per-cpu counters */
-       xfs_icsb_reinit_counters(mp);
+       xfs_reinit_percpu_counters(mp);
 
        /* no need to be quiet anymore, so reset the buf ops */
        bp->b_ops = &xfs_sb_buf_ops;
@@ -1087,8 +1074,6 @@ xfs_log_sbcount(xfs_mount_t *mp)
        if (!xfs_fs_writable(mp, SB_FREEZE_COMPLETE))
                return 0;
 
-       xfs_icsb_sync_counters(mp, 0);
-
        /*
         * we don't need to do this if we are updating the superblock
         * counters on every modification.
@@ -1099,253 +1084,136 @@ xfs_log_sbcount(xfs_mount_t *mp)
        return xfs_sync_sb(mp, true);
 }
 
-/*
- * xfs_mod_incore_sb_unlocked() is a utility routine commonly used to apply
- * a delta to a specified field in the in-core superblock.  Simply
- * switch on the field indicated and apply the delta to that field.
- * Fields are not allowed to dip below zero, so if the delta would
- * do this do not apply it and return EINVAL.
- *
- * The m_sb_lock must be held when this routine is called.
- */
-STATIC int
-xfs_mod_incore_sb_unlocked(
-       xfs_mount_t     *mp,
-       xfs_sb_field_t  field,
-       int64_t         delta,
-       int             rsvd)
+int
+xfs_mod_icount(
+       struct xfs_mount        *mp,
+       int64_t                 delta)
 {
-       int             scounter;       /* short counter for 32 bit fields */
-       long long       lcounter;       /* long counter for 64 bit fields */
-       long long       res_used, rem;
-
-       /*
-        * With the in-core superblock spin lock held, switch
-        * on the indicated field.  Apply the delta to the
-        * proper field.  If the fields value would dip below
-        * 0, then do not apply the delta and return EINVAL.
-        */
-       switch (field) {
-       case XFS_SBS_ICOUNT:
-               lcounter = (long long)mp->m_sb.sb_icount;
-               lcounter += delta;
-               if (lcounter < 0) {
-                       ASSERT(0);
-                       return -EINVAL;
-               }
-               mp->m_sb.sb_icount = lcounter;
-               return 0;
-       case XFS_SBS_IFREE:
-               lcounter = (long long)mp->m_sb.sb_ifree;
-               lcounter += delta;
-               if (lcounter < 0) {
-                       ASSERT(0);
-                       return -EINVAL;
-               }
-               mp->m_sb.sb_ifree = lcounter;
-               return 0;
-       case XFS_SBS_FDBLOCKS:
-               lcounter = (long long)
-                       mp->m_sb.sb_fdblocks - XFS_ALLOC_SET_ASIDE(mp);
-               res_used = (long long)(mp->m_resblks - mp->m_resblks_avail);
-
-               if (delta > 0) {                /* Putting blocks back */
-                       if (res_used > delta) {
-                               mp->m_resblks_avail += delta;
-                       } else {
-                               rem = delta - res_used;
-                               mp->m_resblks_avail = mp->m_resblks;
-                               lcounter += rem;
-                       }
-               } else {                                /* Taking blocks away */
-                       lcounter += delta;
-                       if (lcounter >= 0) {
-                               mp->m_sb.sb_fdblocks = lcounter +
-                                                       XFS_ALLOC_SET_ASIDE(mp);
-                               return 0;
-                       }
-
-                       /*
-                        * We are out of blocks, use any available reserved
-                        * blocks if were allowed to.
-                        */
-                       if (!rsvd)
-                               return -ENOSPC;
-
-                       lcounter = (long long)mp->m_resblks_avail + delta;
-                       if (lcounter >= 0) {
-                               mp->m_resblks_avail = lcounter;
-                               return 0;
-                       }
-                       printk_once(KERN_WARNING
-                               "Filesystem \"%s\": reserve blocks depleted! "
-                               "Consider increasing reserve pool size.",
-                               mp->m_fsname);
-                       return -ENOSPC;
-               }
-
-               mp->m_sb.sb_fdblocks = lcounter + XFS_ALLOC_SET_ASIDE(mp);
-               return 0;
-       case XFS_SBS_FREXTENTS:
-               lcounter = (long long)mp->m_sb.sb_frextents;
-               lcounter += delta;
-               if (lcounter < 0) {
-                       return -ENOSPC;
-               }
-               mp->m_sb.sb_frextents = lcounter;
-               return 0;
-       case XFS_SBS_DBLOCKS:
-               lcounter = (long long)mp->m_sb.sb_dblocks;
-               lcounter += delta;
-               if (lcounter < 0) {
-                       ASSERT(0);
-                       return -EINVAL;
-               }
-               mp->m_sb.sb_dblocks = lcounter;
-               return 0;
-       case XFS_SBS_AGCOUNT:
-               scounter = mp->m_sb.sb_agcount;
-               scounter += delta;
-               if (scounter < 0) {
-                       ASSERT(0);
-                       return -EINVAL;
-               }
-               mp->m_sb.sb_agcount = scounter;
-               return 0;
-       case XFS_SBS_IMAX_PCT:
-               scounter = mp->m_sb.sb_imax_pct;
-               scounter += delta;
-               if (scounter < 0) {
-                       ASSERT(0);
-                       return -EINVAL;
-               }
-               mp->m_sb.sb_imax_pct = scounter;
-               return 0;
-       case XFS_SBS_REXTSIZE:
-               scounter = mp->m_sb.sb_rextsize;
-               scounter += delta;
-               if (scounter < 0) {
-                       ASSERT(0);
-                       return -EINVAL;
-               }
-               mp->m_sb.sb_rextsize = scounter;
-               return 0;
-       case XFS_SBS_RBMBLOCKS:
-               scounter = mp->m_sb.sb_rbmblocks;
-               scounter += delta;
-               if (scounter < 0) {
-                       ASSERT(0);
-                       return -EINVAL;
-               }
-               mp->m_sb.sb_rbmblocks = scounter;
-               return 0;
-       case XFS_SBS_RBLOCKS:
-               lcounter = (long long)mp->m_sb.sb_rblocks;
-               lcounter += delta;
-               if (lcounter < 0) {
-                       ASSERT(0);
-                       return -EINVAL;
-               }
-               mp->m_sb.sb_rblocks = lcounter;
-               return 0;
-       case XFS_SBS_REXTENTS:
-               lcounter = (long long)mp->m_sb.sb_rextents;
-               lcounter += delta;
-               if (lcounter < 0) {
-                       ASSERT(0);
-                       return -EINVAL;
-               }
-               mp->m_sb.sb_rextents = lcounter;
-               return 0;
-       case XFS_SBS_REXTSLOG:
-               scounter = mp->m_sb.sb_rextslog;
-               scounter += delta;
-               if (scounter < 0) {
-                       ASSERT(0);
-                       return -EINVAL;
-               }
-               mp->m_sb.sb_rextslog = scounter;
-               return 0;
-       default:
+       /* deltas are +/-64, hence the large batch size of 128. */
+       __percpu_counter_add(&mp->m_icount, delta, 128);
+       if (percpu_counter_compare(&mp->m_icount, 0) < 0) {
                ASSERT(0);
+               percpu_counter_add(&mp->m_icount, -delta);
                return -EINVAL;
        }
+       return 0;
 }
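
The batch comment above is the crux of the per-cpu conversion: each CPU
accumulates deltas locally and only folds them into the global count once
they exceed the batch, so the global value can be stale by up to
batch * nr_cpus. A single-threaded userspace analogue (all names are
illustrative, not the kernel percpu_counter API):

    #include <stdio.h>

    struct approx_counter {
            long    global;
            long    local;          /* one per CPU in the real thing */
            long    batch;
    };

    static void counter_add(struct approx_counter *c, long delta)
    {
            c->local += delta;
            if (c->local >= c->batch || c->local <= -c->batch) {
                    c->global += c->local;  /* done under a lock in the kernel */
                    c->local = 0;
            }
    }

    int main(void)
    {
            struct approx_counter c = { .global = 1000, .local = 0, .batch = 128 };

            counter_add(&c, -64);   /* stays local: global still reads 1000 */
            counter_add(&c, -64);   /* hits the batch: global drops to 872 */
            printf("global=%ld local=%ld\n", c.global, c.local);
            return 0;
    }

With inode-count deltas capped at +/-64, a batch of 128 keeps most
modifications entirely CPU-local while bounding the drift.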
 
-/*
- * xfs_mod_incore_sb() is used to change a field in the in-core
- * superblock structure by the specified delta.  This modification
- * is protected by the m_sb_lock.  Just use the xfs_mod_incore_sb_unlocked()
- * routine to do the work.
- */
 int
-xfs_mod_incore_sb(
+xfs_mod_ifree(
        struct xfs_mount        *mp,
-       xfs_sb_field_t          field,
-       int64_t                 delta,
-       int                     rsvd)
+       int64_t                 delta)
 {
-       int                     status;
-
-#ifdef HAVE_PERCPU_SB
-       ASSERT(field < XFS_SBS_ICOUNT || field > XFS_SBS_FDBLOCKS);
-#endif
-       spin_lock(&mp->m_sb_lock);
-       status = xfs_mod_incore_sb_unlocked(mp, field, delta, rsvd);
-       spin_unlock(&mp->m_sb_lock);
-
-       return status;
+       percpu_counter_add(&mp->m_ifree, delta);
+       if (percpu_counter_compare(&mp->m_ifree, 0) < 0) {
+               ASSERT(0);
+               percpu_counter_add(&mp->m_ifree, -delta);
+               return -EINVAL;
+       }
+       return 0;
 }
 
-/*
- * Change more than one field in the in-core superblock structure at a time.
- *
- * The fields and changes to those fields are specified in the array of
- * xfs_mod_sb structures passed in.  Either all of the specified deltas
- * will be applied or none of them will.  If any modified field dips below 0,
- * then all modifications will be backed out and EINVAL will be returned.
- *
- * Note that this function may not be used for the superblock values that
- * are tracked with the in-memory per-cpu counters - a direct call to
- * xfs_icsb_modify_counters is required for these.
- */
 int
-xfs_mod_incore_sb_batch(
+xfs_mod_fdblocks(
        struct xfs_mount        *mp,
-       xfs_mod_sb_t            *msb,
-       uint                    nmsb,
-       int                     rsvd)
+       int64_t                 delta,
+       bool                    rsvd)
 {
-       xfs_mod_sb_t            *msbp;
-       int                     error = 0;
+       int64_t                 lcounter;
+       long long               res_used;
+       s32                     batch;
+
+       if (delta > 0) {
+               /*
+                * If the reserve pool is depleted, put blocks back into it
+                * first. Most of the time the pool is full.
+                */
+               if (likely(mp->m_resblks == mp->m_resblks_avail)) {
+                       percpu_counter_add(&mp->m_fdblocks, delta);
+                       return 0;
+               }
+
+               spin_lock(&mp->m_sb_lock);
+               res_used = (long long)(mp->m_resblks - mp->m_resblks_avail);
+
+               if (res_used > delta) {
+                       mp->m_resblks_avail += delta;
+               } else {
+                       delta -= res_used;
+                       mp->m_resblks_avail = mp->m_resblks;
+                       percpu_counter_add(&mp->m_fdblocks, delta);
+               }
+               spin_unlock(&mp->m_sb_lock);
+               return 0;
+       }
 
        /*
-        * Loop through the array of mod structures and apply each individually.
-        * If any fail, then back out all those which have already been applied.
-        * Do all of this within the scope of the m_sb_lock so that all of the
-        * changes will be atomic.
+        * Taking blocks away, need to be more accurate the closer we
+        * are to zero.
+        *
+        * batch size is set to a maximum of 1024 blocks - if we are
+        * allocating or freeing extents larger than this then we aren't
+        * going to be hammering the counter lock so a lock per update
+        * is not a problem.
+        *
+        * If the counter has a value of less than 2 * max batch size,
+        * then make everything serialise as we are really close to
+        * ENOSPC.
+        */
+#define __BATCH        1024
+       if (percpu_counter_compare(&mp->m_fdblocks, 2 * __BATCH) < 0)
+               batch = 1;
+       else
+               batch = __BATCH;
+#undef __BATCH
+
+       __percpu_counter_add(&mp->m_fdblocks, delta, batch);
+       if (percpu_counter_compare(&mp->m_fdblocks,
+                                  XFS_ALLOC_SET_ASIDE(mp)) >= 0) {
+               /* we had space! */
+               return 0;
+       }
+
+       /*
+        * Take the m_sb_lock so we can dip into the reserve pool, before
+        * handing back the space that just took us to ENOSPC.
         */
        spin_lock(&mp->m_sb_lock);
-       for (msbp = msb; msbp < (msb + nmsb); msbp++) {
-               ASSERT(msbp->msb_field < XFS_SBS_ICOUNT ||
-                      msbp->msb_field > XFS_SBS_FDBLOCKS);
+       percpu_counter_add(&mp->m_fdblocks, -delta);
+       if (!rsvd)
+               goto fdblocks_enospc;
 
-               error = xfs_mod_incore_sb_unlocked(mp, msbp->msb_field,
-                                                  msbp->msb_delta, rsvd);
-               if (error)
-                       goto unwind;
+       lcounter = (long long)mp->m_resblks_avail + delta;
+       if (lcounter >= 0) {
+               mp->m_resblks_avail = lcounter;
+               spin_unlock(&mp->m_sb_lock);
+               return 0;
        }
+       printk_once(KERN_WARNING
+               "Filesystem \"%s\": reserve blocks depleted! "
+               "Consider increasing reserve pool size.",
+               mp->m_fsname);
+fdblocks_enospc:
        spin_unlock(&mp->m_sb_lock);
-       return 0;
+       return -ENOSPC;
+}
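
The shape of the slow path above is: optimistically take the blocks from
the fast counter, and only if that drives it below the set-aside, give
them back under the lock and try the reserve pool. A condensed userspace
sketch of just that control flow (all names and numbers illustrative; the
real code compares against XFS_ALLOC_SET_ASIDE rather than zero):

    #include <stdio.h>

    static long fdblocks = 10;              /* free blocks, fast counter */
    static long resblks_avail = 20;         /* reserve pool */

    static int mod_fdblocks(long delta, int rsvd)   /* delta < 0: allocate */
    {
            fdblocks += delta;
            if (fdblocks >= 0)
                    return 0;               /* fast path: we had space */

            fdblocks -= delta;              /* undo the optimistic take */
            if (rsvd && resblks_avail + delta >= 0) {
                    resblks_avail += delta; /* serve it from the reserves */
                    return 0;
            }
            return -1;                      /* ENOSPC */
    }

    int main(void)
    {
            printf("ret=%d\n", mod_fdblocks(-12, 1));   /* dips into reserves */
            printf("fdblocks=%ld resblks_avail=%ld\n", fdblocks, resblks_avail);
            return 0;
    }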
 
-unwind:
-       while (--msbp >= msb) {
-               error = xfs_mod_incore_sb_unlocked(mp, msbp->msb_field,
-                                                  -msbp->msb_delta, rsvd);
-               ASSERT(error == 0);
-       }
+int
+xfs_mod_frextents(
+       struct xfs_mount        *mp,
+       int64_t                 delta)
+{
+       int64_t                 lcounter;
+       int                     ret = 0;
+
+       spin_lock(&mp->m_sb_lock);
+       lcounter = mp->m_sb.sb_frextents + delta;
+       if (lcounter < 0)
+               ret = -ENOSPC;
+       else
+               mp->m_sb.sb_frextents = lcounter;
        spin_unlock(&mp->m_sb_lock);
-       return error;
+       return ret;
 }
 
 /*
@@ -1407,573 +1275,3 @@ xfs_dev_is_read_only(
        }
        return 0;
 }
-
-#ifdef HAVE_PERCPU_SB
-/*
- * Per-cpu incore superblock counters
- *
- * Simple concept, difficult implementation
- *
- * Basically, replace the incore superblock counters with a distributed per cpu
- * counter for contended fields (e.g.  free block count).
- *
- * Difficulties arise in that the incore sb is used for ENOSPC checking, and
- * hence needs to be accurately read when we are running low on space. Hence
- * there is a method to enable and disable the per-cpu counters based on how
- * much "stuff" is available in them.
- *
- * Basically, a counter is enabled if there is enough free resource to justify
- * running a per-cpu fast-path. If the per-cpu counter runs out (i.e. a local
- * ENOSPC), then we disable the counters to synchronise all callers and
- * re-distribute the available resources.
- *
- * If, once we redistributed the available resources, we still get a failure,
- * we disable the per-cpu counter and go through the slow path.
- *
- * The slow path is the current xfs_mod_incore_sb() function.  This means that
- * when we disable a per-cpu counter, we need to drain its resources back to
- * the global superblock. We do this after disabling the counter to prevent
- * more threads from queueing up on the counter.
- *
- * Essentially, this means that we still need a lock in the fast path to enable
- * synchronisation between the global counters and the per-cpu counters. This
- * is not a problem because the lock will be local to a CPU almost all the time
- * and have little contention except when we get to ENOSPC conditions.
- *
- * Basically, this lock becomes a barrier that enables us to lock out the fast
- * path while we do things like enabling and disabling counters and
- * synchronising the counters.
- *
- * Locking rules:
- *
- *     1. m_sb_lock before picking up per-cpu locks
- *     2. per-cpu locks always picked up via for_each_online_cpu() order
- *     3. accurate counter sync requires m_sb_lock + per cpu locks
- *     4. modifying per-cpu counters requires holding per-cpu lock
- *     5. modifying global counters requires holding m_sb_lock
- *     6. enabling or disabling a counter requires holding the m_sb_lock 
- *        and _none_ of the per-cpu locks.
- *
- * Disabled counters are only ever re-enabled by a balance operation
- * that results in more free resources per CPU than a given threshold.
- * To ensure counters don't remain disabled, they are rebalanced when
- * the global resource goes above a higher threshold (i.e. some hysteresis
- * is present to prevent thrashing).
- */
-
-#ifdef CONFIG_HOTPLUG_CPU
-/*
- * hot-plug CPU notifier support.
- *
- * We need a notifier per filesystem as we need to be able to identify
- * the filesystem to balance the counters out. This is achieved by
- * having a notifier block embedded in the xfs_mount_t and doing pointer
- * magic to get the mount pointer from the notifier block address.
- */
-STATIC int
-xfs_icsb_cpu_notify(
-       struct notifier_block *nfb,
-       unsigned long action,
-       void *hcpu)
-{
-       xfs_icsb_cnts_t *cntp;
-       xfs_mount_t     *mp;
-
-       mp = (xfs_mount_t *)container_of(nfb, xfs_mount_t, m_icsb_notifier);
-       cntp = (xfs_icsb_cnts_t *)
-                       per_cpu_ptr(mp->m_sb_cnts, (unsigned long)hcpu);
-       switch (action) {
-       case CPU_UP_PREPARE:
-       case CPU_UP_PREPARE_FROZEN:
-               /* Easy Case - initialize the area and locks, and
-                * then rebalance when online does everything else for us. */
-               memset(cntp, 0, sizeof(xfs_icsb_cnts_t));
-               break;
-       case CPU_ONLINE:
-       case CPU_ONLINE_FROZEN:
-               xfs_icsb_lock(mp);
-               xfs_icsb_balance_counter(mp, XFS_SBS_ICOUNT, 0);
-               xfs_icsb_balance_counter(mp, XFS_SBS_IFREE, 0);
-               xfs_icsb_balance_counter(mp, XFS_SBS_FDBLOCKS, 0);
-               xfs_icsb_unlock(mp);
-               break;
-       case CPU_DEAD:
-       case CPU_DEAD_FROZEN:
-               /* Disable all the counters, then fold the dead cpu's
-                * count into the total on the global superblock and
-                * re-enable the counters. */
-               xfs_icsb_lock(mp);
-               spin_lock(&mp->m_sb_lock);
-               xfs_icsb_disable_counter(mp, XFS_SBS_ICOUNT);
-               xfs_icsb_disable_counter(mp, XFS_SBS_IFREE);
-               xfs_icsb_disable_counter(mp, XFS_SBS_FDBLOCKS);
-
-               mp->m_sb.sb_icount += cntp->icsb_icount;
-               mp->m_sb.sb_ifree += cntp->icsb_ifree;
-               mp->m_sb.sb_fdblocks += cntp->icsb_fdblocks;
-
-               memset(cntp, 0, sizeof(xfs_icsb_cnts_t));
-
-               xfs_icsb_balance_counter_locked(mp, XFS_SBS_ICOUNT, 0);
-               xfs_icsb_balance_counter_locked(mp, XFS_SBS_IFREE, 0);
-               xfs_icsb_balance_counter_locked(mp, XFS_SBS_FDBLOCKS, 0);
-               spin_unlock(&mp->m_sb_lock);
-               xfs_icsb_unlock(mp);
-               break;
-       }
-
-       return NOTIFY_OK;
-}
-#endif /* CONFIG_HOTPLUG_CPU */
-
-int
-xfs_icsb_init_counters(
-       xfs_mount_t     *mp)
-{
-       xfs_icsb_cnts_t *cntp;
-       int             i;
-
-       mp->m_sb_cnts = alloc_percpu(xfs_icsb_cnts_t);
-       if (mp->m_sb_cnts == NULL)
-               return -ENOMEM;
-
-       for_each_online_cpu(i) {
-               cntp = (xfs_icsb_cnts_t *)per_cpu_ptr(mp->m_sb_cnts, i);
-               memset(cntp, 0, sizeof(xfs_icsb_cnts_t));
-       }
-
-       mutex_init(&mp->m_icsb_mutex);
-
-       /*
-        * start with all counters disabled so that the
-        * initial balance kicks us off correctly
-        */
-       mp->m_icsb_counters = -1;
-
-#ifdef CONFIG_HOTPLUG_CPU
-       mp->m_icsb_notifier.notifier_call = xfs_icsb_cpu_notify;
-       mp->m_icsb_notifier.priority = 0;
-       register_hotcpu_notifier(&mp->m_icsb_notifier);
-#endif /* CONFIG_HOTPLUG_CPU */
-
-       return 0;
-}
-
-void
-xfs_icsb_reinit_counters(
-       xfs_mount_t     *mp)
-{
-       xfs_icsb_lock(mp);
-       /*
-        * start with all counters disabled so that the
-        * initial balance kicks us off correctly
-        */
-       mp->m_icsb_counters = -1;
-       xfs_icsb_balance_counter(mp, XFS_SBS_ICOUNT, 0);
-       xfs_icsb_balance_counter(mp, XFS_SBS_IFREE, 0);
-       xfs_icsb_balance_counter(mp, XFS_SBS_FDBLOCKS, 0);
-       xfs_icsb_unlock(mp);
-}
-
-void
-xfs_icsb_destroy_counters(
-       xfs_mount_t     *mp)
-{
-       if (mp->m_sb_cnts) {
-               unregister_hotcpu_notifier(&mp->m_icsb_notifier);
-               free_percpu(mp->m_sb_cnts);
-       }
-       mutex_destroy(&mp->m_icsb_mutex);
-}
-
-STATIC void
-xfs_icsb_lock_cntr(
-       xfs_icsb_cnts_t *icsbp)
-{
-       while (test_and_set_bit(XFS_ICSB_FLAG_LOCK, &icsbp->icsb_flags)) {
-               ndelay(1000);
-       }
-}
-
-STATIC void
-xfs_icsb_unlock_cntr(
-       xfs_icsb_cnts_t *icsbp)
-{
-       clear_bit(XFS_ICSB_FLAG_LOCK, &icsbp->icsb_flags);
-}
-
-
-STATIC void
-xfs_icsb_lock_all_counters(
-       xfs_mount_t     *mp)
-{
-       xfs_icsb_cnts_t *cntp;
-       int             i;
-
-       for_each_online_cpu(i) {
-               cntp = (xfs_icsb_cnts_t *)per_cpu_ptr(mp->m_sb_cnts, i);
-               xfs_icsb_lock_cntr(cntp);
-       }
-}
-
-STATIC void
-xfs_icsb_unlock_all_counters(
-       xfs_mount_t     *mp)
-{
-       xfs_icsb_cnts_t *cntp;
-       int             i;
-
-       for_each_online_cpu(i) {
-               cntp = (xfs_icsb_cnts_t *)per_cpu_ptr(mp->m_sb_cnts, i);
-               xfs_icsb_unlock_cntr(cntp);
-       }
-}
-
-STATIC void
-xfs_icsb_count(
-       xfs_mount_t     *mp,
-       xfs_icsb_cnts_t *cnt,
-       int             flags)
-{
-       xfs_icsb_cnts_t *cntp;
-       int             i;
-
-       memset(cnt, 0, sizeof(xfs_icsb_cnts_t));
-
-       if (!(flags & XFS_ICSB_LAZY_COUNT))
-               xfs_icsb_lock_all_counters(mp);
-
-       for_each_online_cpu(i) {
-               cntp = (xfs_icsb_cnts_t *)per_cpu_ptr(mp->m_sb_cnts, i);
-               cnt->icsb_icount += cntp->icsb_icount;
-               cnt->icsb_ifree += cntp->icsb_ifree;
-               cnt->icsb_fdblocks += cntp->icsb_fdblocks;
-       }
-
-       if (!(flags & XFS_ICSB_LAZY_COUNT))
-               xfs_icsb_unlock_all_counters(mp);
-}
-
-STATIC int
-xfs_icsb_counter_disabled(
-       xfs_mount_t     *mp,
-       xfs_sb_field_t  field)
-{
-       ASSERT((field >= XFS_SBS_ICOUNT) && (field <= XFS_SBS_FDBLOCKS));
-       return test_bit(field, &mp->m_icsb_counters);
-}
-
-STATIC void
-xfs_icsb_disable_counter(
-       xfs_mount_t     *mp,
-       xfs_sb_field_t  field)
-{
-       xfs_icsb_cnts_t cnt;
-
-       ASSERT((field >= XFS_SBS_ICOUNT) && (field <= XFS_SBS_FDBLOCKS));
-
-       /*
-        * If we are already disabled, then there is nothing to do
-        * here. We check before locking all the counters to avoid
-        * the expensive lock operation when being called in the
-        * slow path and the counter is already disabled. This is
-        * safe because the only time we set or clear this state is under
-        * the m_icsb_mutex.
-        */
-       if (xfs_icsb_counter_disabled(mp, field))
-               return;
-
-       xfs_icsb_lock_all_counters(mp);
-       if (!test_and_set_bit(field, &mp->m_icsb_counters)) {
-               /* drain back to superblock */
-
-               xfs_icsb_count(mp, &cnt, XFS_ICSB_LAZY_COUNT);
-               switch(field) {
-               case XFS_SBS_ICOUNT:
-                       mp->m_sb.sb_icount = cnt.icsb_icount;
-                       break;
-               case XFS_SBS_IFREE:
-                       mp->m_sb.sb_ifree = cnt.icsb_ifree;
-                       break;
-               case XFS_SBS_FDBLOCKS:
-                       mp->m_sb.sb_fdblocks = cnt.icsb_fdblocks;
-                       break;
-               default:
-                       BUG();
-               }
-       }
-
-       xfs_icsb_unlock_all_counters(mp);
-}
-
-STATIC void
-xfs_icsb_enable_counter(
-       xfs_mount_t     *mp,
-       xfs_sb_field_t  field,
-       uint64_t        count,
-       uint64_t        resid)
-{
-       xfs_icsb_cnts_t *cntp;
-       int             i;
-
-       ASSERT((field >= XFS_SBS_ICOUNT) && (field <= XFS_SBS_FDBLOCKS));
-
-       xfs_icsb_lock_all_counters(mp);
-       for_each_online_cpu(i) {
-               cntp = per_cpu_ptr(mp->m_sb_cnts, i);
-               switch (field) {
-               case XFS_SBS_ICOUNT:
-                       cntp->icsb_icount = count + resid;
-                       break;
-               case XFS_SBS_IFREE:
-                       cntp->icsb_ifree = count + resid;
-                       break;
-               case XFS_SBS_FDBLOCKS:
-                       cntp->icsb_fdblocks = count + resid;
-                       break;
-               default:
-                       BUG();
-                       break;
-               }
-               resid = 0;
-       }
-       clear_bit(field, &mp->m_icsb_counters);
-       xfs_icsb_unlock_all_counters(mp);
-}
-
-void
-xfs_icsb_sync_counters_locked(
-       xfs_mount_t     *mp,
-       int             flags)
-{
-       xfs_icsb_cnts_t cnt;
-
-       xfs_icsb_count(mp, &cnt, flags);
-
-       if (!xfs_icsb_counter_disabled(mp, XFS_SBS_ICOUNT))
-               mp->m_sb.sb_icount = cnt.icsb_icount;
-       if (!xfs_icsb_counter_disabled(mp, XFS_SBS_IFREE))
-               mp->m_sb.sb_ifree = cnt.icsb_ifree;
-       if (!xfs_icsb_counter_disabled(mp, XFS_SBS_FDBLOCKS))
-               mp->m_sb.sb_fdblocks = cnt.icsb_fdblocks;
-}
-
-/*
- * Accurate update of per-cpu counters to incore superblock
- */
-void
-xfs_icsb_sync_counters(
-       xfs_mount_t     *mp,
-       int             flags)
-{
-       spin_lock(&mp->m_sb_lock);
-       xfs_icsb_sync_counters_locked(mp, flags);
-       spin_unlock(&mp->m_sb_lock);
-}
-
-/*
- * Balance and enable/disable counters as necessary.
- *
- * Thresholds for re-enabling counters are somewhat magic.  inode counts are
- * chosen to be the same number as single on disk allocation chunk per CPU, and
- * free blocks is something far enough zero that we aren't going thrash when we
- * get near ENOSPC. We also need to supply a minimum we require per cpu to
- * prevent looping endlessly when xfs_alloc_space asks for more than will
- * be distributed to a single CPU but each CPU has enough blocks to be
- * reenabled.
- *
- * Note that we can be called when counters are already disabled.
- * xfs_icsb_disable_counter() optimises the counter locking in this case to
- * prevent locking every per-cpu counter needlessly.
- */
-
-#define XFS_ICSB_INO_CNTR_REENABLE     (uint64_t)64
-#define XFS_ICSB_FDBLK_CNTR_REENABLE(mp) \
-               (uint64_t)(512 + XFS_ALLOC_SET_ASIDE(mp))
-STATIC void
-xfs_icsb_balance_counter_locked(
-       xfs_mount_t     *mp,
-       xfs_sb_field_t  field,
-       int             min_per_cpu)
-{
-       uint64_t        count, resid;
-       int             weight = num_online_cpus();
-       uint64_t        min = (uint64_t)min_per_cpu;
-
-       /* disable counter and sync counter */
-       xfs_icsb_disable_counter(mp, field);
-
-       /* update counters  - first CPU gets residual*/
-       switch (field) {
-       case XFS_SBS_ICOUNT:
-               count = mp->m_sb.sb_icount;
-               resid = do_div(count, weight);
-               if (count < max(min, XFS_ICSB_INO_CNTR_REENABLE))
-                       return;
-               break;
-       case XFS_SBS_IFREE:
-               count = mp->m_sb.sb_ifree;
-               resid = do_div(count, weight);
-               if (count < max(min, XFS_ICSB_INO_CNTR_REENABLE))
-                       return;
-               break;
-       case XFS_SBS_FDBLOCKS:
-               count = mp->m_sb.sb_fdblocks;
-               resid = do_div(count, weight);
-               if (count < max(min, XFS_ICSB_FDBLK_CNTR_REENABLE(mp)))
-                       return;
-               break;
-       default:
-               BUG();
-               count = resid = 0;      /* quiet, gcc */
-               break;
-       }
-
-       xfs_icsb_enable_counter(mp, field, count, resid);
-}
-
-STATIC void
-xfs_icsb_balance_counter(
-       xfs_mount_t     *mp,
-       xfs_sb_field_t  fields,
-       int             min_per_cpu)
-{
-       spin_lock(&mp->m_sb_lock);
-       xfs_icsb_balance_counter_locked(mp, fields, min_per_cpu);
-       spin_unlock(&mp->m_sb_lock);
-}
-
-int
-xfs_icsb_modify_counters(
-       xfs_mount_t     *mp,
-       xfs_sb_field_t  field,
-       int64_t         delta,
-       int             rsvd)
-{
-       xfs_icsb_cnts_t *icsbp;
-       long long       lcounter;       /* long counter for 64 bit fields */
-       int             ret = 0;
-
-       might_sleep();
-again:
-       preempt_disable();
-       icsbp = this_cpu_ptr(mp->m_sb_cnts);
-
-       /*
-        * if the counter is disabled, go to slow path
-        */
-       if (unlikely(xfs_icsb_counter_disabled(mp, field)))
-               goto slow_path;
-       xfs_icsb_lock_cntr(icsbp);
-       if (unlikely(xfs_icsb_counter_disabled(mp, field))) {
-               xfs_icsb_unlock_cntr(icsbp);
-               goto slow_path;
-       }
-
-       switch (field) {
-       case XFS_SBS_ICOUNT:
-               lcounter = icsbp->icsb_icount;
-               lcounter += delta;
-               if (unlikely(lcounter < 0))
-                       goto balance_counter;
-               icsbp->icsb_icount = lcounter;
-               break;
-
-       case XFS_SBS_IFREE:
-               lcounter = icsbp->icsb_ifree;
-               lcounter += delta;
-               if (unlikely(lcounter < 0))
-                       goto balance_counter;
-               icsbp->icsb_ifree = lcounter;
-               break;
-
-       case XFS_SBS_FDBLOCKS:
-               BUG_ON((mp->m_resblks - mp->m_resblks_avail) != 0);
-
-               lcounter = icsbp->icsb_fdblocks - XFS_ALLOC_SET_ASIDE(mp);
-               lcounter += delta;
-               if (unlikely(lcounter < 0))
-                       goto balance_counter;
-               icsbp->icsb_fdblocks = lcounter + XFS_ALLOC_SET_ASIDE(mp);
-               break;
-       default:
-               BUG();
-               break;
-       }
-       xfs_icsb_unlock_cntr(icsbp);
-       preempt_enable();
-       return 0;
-
-slow_path:
-       preempt_enable();
-
-       /*
-        * serialise with a mutex so we don't burn lots of cpu on
-        * the superblock lock. We still need to hold the superblock
-        * lock, however, when we modify the global structures.
-        */
-       xfs_icsb_lock(mp);
-
-       /*
-        * Now running atomically.
-        *
-        * If the counter is enabled, someone has beaten us to rebalancing.
-        * Drop the lock and try again in the fast path....
-        */
-       if (!(xfs_icsb_counter_disabled(mp, field))) {
-               xfs_icsb_unlock(mp);
-               goto again;
-       }
-
-       /*
-        * The counter is currently disabled. Because we are
-        * running atomically here, we know a rebalance cannot
-        * be in progress. Hence we can go straight to operating
-        * on the global superblock. We do not call xfs_mod_incore_sb()
-        * here even though we need to get the m_sb_lock. Doing so
-        * will cause us to re-enter this function and deadlock.
-        * Hence we get the m_sb_lock ourselves and then call
-        * xfs_mod_incore_sb_unlocked() as the unlocked path operates
-        * directly on the global counters.
-        */
-       spin_lock(&mp->m_sb_lock);
-       ret = xfs_mod_incore_sb_unlocked(mp, field, delta, rsvd);
-       spin_unlock(&mp->m_sb_lock);
-
-       /*
-        * Now that we've modified the global superblock, we
-        * may be able to re-enable the distributed counters
-        * (e.g. lots of space just got freed). After that
-        * we are done.
-        */
-       if (ret != -ENOSPC)
-               xfs_icsb_balance_counter(mp, field, 0);
-       xfs_icsb_unlock(mp);
-       return ret;
-
-balance_counter:
-       xfs_icsb_unlock_cntr(icsbp);
-       preempt_enable();
-
-       /*
-        * We may have multiple threads here if multiple per-cpu
-        * counters run dry at the same time. This will mean we can
-        * do more balances than strictly necessary but it is not
-        * the common slowpath case.
-        */
-       xfs_icsb_lock(mp);
-
-       /*
-        * running atomically.
-        *
-        * This will leave the counter in the correct state for future
-        * accesses. After the rebalance, we simply try again and our retry
-        * will either succeed through the fast path or slow path without
-        * another balance operation being required.
-        */
-       xfs_icsb_balance_counter(mp, field, delta);
-       xfs_icsb_unlock(mp);
-       goto again;
-}
-
-#endif
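
All of the hand-rolled machinery removed above is superseded by the kernel's generic percpu counters. The replacement helpers land earlier in this file (not every hunk is shown in this excerpt); a representative sketch of the simplest one, assuming the generic percpu_counter API and the batch size chosen by the series:

        int
        xfs_mod_icount(
                struct xfs_mount        *mp,
                int64_t                 delta)
        {
                /* batched per-cpu update; percpu_counter_compare() only
                 * computes an exact sum when the result is in doubt */
                __percpu_counter_add(&mp->m_icount, delta, 128);
                if (percpu_counter_compare(&mp->m_icount, 0) < 0) {
                        ASSERT(0);
                        percpu_counter_add(&mp->m_icount, -delta);
                        return -EINVAL;
                }
                return 0;
        }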
index 0d8abd6..8c995a2 100644 (file)
@@ -18,8 +18,6 @@
 #ifndef __XFS_MOUNT_H__
 #define        __XFS_MOUNT_H__
 
-#ifdef __KERNEL__
-
 struct xlog;
 struct xfs_inode;
 struct xfs_mru_cache;
@@ -29,44 +27,6 @@ struct xfs_quotainfo;
 struct xfs_dir_ops;
 struct xfs_da_geometry;
 
-#ifdef HAVE_PERCPU_SB
-
-/*
- * Valid per-cpu incore superblock counters. Note that if you add new counters,
- * you may need to define new counter disabled bit field descriptors as there
- * are more possible fields in the superblock that can fit in a bitfield on a
- * 32 bit platform. The XFS_SBS_* values for the current current counters just
- * fit.
- */
-typedef struct xfs_icsb_cnts {
-       uint64_t        icsb_fdblocks;
-       uint64_t        icsb_ifree;
-       uint64_t        icsb_icount;
-       unsigned long   icsb_flags;
-} xfs_icsb_cnts_t;
-
-#define XFS_ICSB_FLAG_LOCK     (1 << 0)        /* counter lock bit */
-
-#define XFS_ICSB_LAZY_COUNT    (1 << 1)        /* accuracy not needed */
-
-extern int     xfs_icsb_init_counters(struct xfs_mount *);
-extern void    xfs_icsb_reinit_counters(struct xfs_mount *);
-extern void    xfs_icsb_destroy_counters(struct xfs_mount *);
-extern void    xfs_icsb_sync_counters(struct xfs_mount *, int);
-extern void    xfs_icsb_sync_counters_locked(struct xfs_mount *, int);
-extern int     xfs_icsb_modify_counters(struct xfs_mount *, xfs_sb_field_t,
-                                               int64_t, int);
-
-#else
-#define xfs_icsb_init_counters(mp)             (0)
-#define xfs_icsb_destroy_counters(mp)          do { } while (0)
-#define xfs_icsb_reinit_counters(mp)           do { } while (0)
-#define xfs_icsb_sync_counters(mp, flags)      do { } while (0)
-#define xfs_icsb_sync_counters_locked(mp, flags) do { } while (0)
-#define xfs_icsb_modify_counters(mp, field, delta, rsvd) \
-       xfs_mod_incore_sb(mp, field, delta, rsvd)
-#endif
-
 /* dynamic preallocation free space thresholds, 5% down to 1% */
 enum {
        XFS_LOWSP_1_PCNT = 0,
@@ -81,8 +41,13 @@ typedef struct xfs_mount {
        struct super_block      *m_super;
        xfs_tid_t               m_tid;          /* next unused tid for fs */
        struct xfs_ail          *m_ail;         /* fs active log item list */
-       xfs_sb_t                m_sb;           /* copy of fs superblock */
+
+       struct xfs_sb           m_sb;           /* copy of fs superblock */
        spinlock_t              m_sb_lock;      /* sb counter lock */
+       struct percpu_counter   m_icount;       /* allocated inodes counter */
+       struct percpu_counter   m_ifree;        /* free inodes counter */
+       struct percpu_counter   m_fdblocks;     /* free block counter */
+
        struct xfs_buf          *m_sb_bp;       /* buffer for superblock */
        char                    *m_fsname;      /* filesystem name */
        int                     m_fsname_len;   /* strlen of fs name */
@@ -152,12 +117,6 @@ typedef struct xfs_mount {
        const struct xfs_dir_ops *m_nondir_inode_ops; /* !dir inode ops */
        uint                    m_chsize;       /* size of next field */
        atomic_t                m_active_trans; /* number trans frozen */
-#ifdef HAVE_PERCPU_SB
-       xfs_icsb_cnts_t __percpu *m_sb_cnts;    /* per-cpu superblock counters */
-       unsigned long           m_icsb_counters; /* disabled per-cpu counters */
-       struct notifier_block   m_icsb_notifier; /* hotplug cpu notifier */
-       struct mutex            m_icsb_mutex;   /* balancer sync lock */
-#endif
        struct xfs_mru_cache    *m_filestream;  /* per-mount filestream data */
        struct delayed_work     m_reclaim_work; /* background inode reclaim */
        struct delayed_work     m_eofblocks_work; /* background eof blocks
@@ -301,35 +260,6 @@ xfs_daddr_to_agbno(struct xfs_mount *mp, xfs_daddr_t d)
 }
 
 /*
- * Per-cpu superblock locking functions
- */
-#ifdef HAVE_PERCPU_SB
-static inline void
-xfs_icsb_lock(xfs_mount_t *mp)
-{
-       mutex_lock(&mp->m_icsb_mutex);
-}
-
-static inline void
-xfs_icsb_unlock(xfs_mount_t *mp)
-{
-       mutex_unlock(&mp->m_icsb_mutex);
-}
-#else
-#define xfs_icsb_lock(mp)
-#define xfs_icsb_unlock(mp)
-#endif
-
-/*
- * This structure is for use by the xfs_mod_incore_sb_batch() routine.
- * xfs_growfs can specify a few fields which are more than int limit
- */
-typedef struct xfs_mod_sb {
-       xfs_sb_field_t  msb_field;      /* Field to modify, see below */
-       int64_t         msb_delta;      /* Change to make to specified field */
-} xfs_mod_sb_t;
-
-/*
  * Per-ag incore structure, copies of information in agf and agi, to improve the
  * performance of allocation group selection.
  */
@@ -383,11 +313,14 @@ extern __uint64_t xfs_default_resblks(xfs_mount_t *mp);
 extern int     xfs_mountfs(xfs_mount_t *mp);
 extern int     xfs_initialize_perag(xfs_mount_t *mp, xfs_agnumber_t agcount,
                                     xfs_agnumber_t *maxagi);
-
 extern void    xfs_unmountfs(xfs_mount_t *);
-extern int     xfs_mod_incore_sb(xfs_mount_t *, xfs_sb_field_t, int64_t, int);
-extern int     xfs_mod_incore_sb_batch(xfs_mount_t *, xfs_mod_sb_t *,
-                       uint, int);
+
+extern int     xfs_mod_icount(struct xfs_mount *mp, int64_t delta);
+extern int     xfs_mod_ifree(struct xfs_mount *mp, int64_t delta);
+extern int     xfs_mod_fdblocks(struct xfs_mount *mp, int64_t delta,
+                                bool reserved);
+extern int     xfs_mod_frextents(struct xfs_mount *mp, int64_t delta);
+
 extern int     xfs_mount_log_sb(xfs_mount_t *);
 extern struct xfs_buf *xfs_getsb(xfs_mount_t *, int);
 extern int     xfs_readsb(xfs_mount_t *, int);
@@ -399,6 +332,4 @@ extern int  xfs_dev_is_read_only(struct xfs_mount *, char *);
 
 extern void    xfs_set_low_space_thresholds(struct xfs_mount *);
 
-#endif /* __KERNEL__ */
-
 #endif /* __XFS_MOUNT_H__ */
index 30ecca3..f8a674d 100644 (file)
@@ -437,7 +437,7 @@ xfs_mru_cache_insert(
        if (!mru || !mru->lists)
                return -EINVAL;
 
-       if (radix_tree_preload(GFP_KERNEL))
+       if (radix_tree_preload(GFP_NOFS))
                return -ENOMEM;
 
        INIT_LIST_HEAD(&elem->list_node);
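
GFP_NOFS matters here because the insert runs in the write path with filesystem locks held: a GFP_KERNEL preload could recurse into XFS via memory reclaim and deadlock. For reference, the usual preload pairing (field and variable names assumed):

        if (radix_tree_preload(GFP_NOFS))
                return -ENOMEM;

        spin_lock(&mru->lock);
        error = radix_tree_insert(&mru->store, key, elem);
        spin_unlock(&mru->lock);

        /* preloaded nodes are released whether or not the insert used them */
        radix_tree_preload_end();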
index 365dd57..981a657 100644 (file)
@@ -31,7 +31,8 @@
 int
 xfs_break_layouts(
        struct inode            *inode,
-       uint                    *iolock)
+       uint                    *iolock,
+       bool                    with_imutex)
 {
        struct xfs_inode        *ip = XFS_I(inode);
        int                     error;
@@ -40,8 +41,12 @@ xfs_break_layouts(
 
        while ((error = break_layout(inode, false)) == -EWOULDBLOCK) {
                xfs_iunlock(ip, *iolock);
+               if (with_imutex && (*iolock & XFS_IOLOCK_EXCL))
+                       mutex_unlock(&inode->i_mutex);
                error = break_layout(inode, true);
                *iolock = XFS_IOLOCK_EXCL;
+               if (with_imutex)
+                       mutex_lock(&inode->i_mutex);
                xfs_ilock(ip, *iolock);
        }
 
index b7fbfce..8147ac1 100644 (file)
@@ -8,9 +8,10 @@ int xfs_fs_map_blocks(struct inode *inode, loff_t offset, u64 length,
 int xfs_fs_commit_blocks(struct inode *inode, struct iomap *maps, int nr_maps,
                struct iattr *iattr);
 
-int xfs_break_layouts(struct inode *inode, uint *iolock);
+int xfs_break_layouts(struct inode *inode, uint *iolock, bool with_imutex);
 #else
-static inline int xfs_break_layouts(struct inode *inode, uint *iolock)
+static inline int
+xfs_break_layouts(struct inode *inode, uint *iolock, bool with_imutex)
 {
        return 0;
 }
index fbbb9e6..5538468 100644 (file)
@@ -719,6 +719,7 @@ xfs_qm_qino_alloc(
        xfs_trans_t     *tp;
        int             error;
        int             committed;
+       bool            need_alloc = true;
 
        *ip = NULL;
        /*
@@ -747,6 +748,7 @@ xfs_qm_qino_alloc(
                                return error;
                        mp->m_sb.sb_gquotino = NULLFSINO;
                        mp->m_sb.sb_pquotino = NULLFSINO;
+                       need_alloc = false;
                }
        }
 
@@ -758,7 +760,7 @@ xfs_qm_qino_alloc(
                return error;
        }
 
-       if (!*ip) {
+       if (need_alloc) {
                error = xfs_dir_ialloc(&tp, NULL, S_IFREG, 1, 0, 0, 1, ip,
                                                                &committed);
                if (error) {
@@ -794,11 +796,14 @@ xfs_qm_qino_alloc(
        spin_unlock(&mp->m_sb_lock);
        xfs_log_sb(tp);
 
-       if ((error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES))) {
+       error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+       if (error) {
+               ASSERT(XFS_FORCED_SHUTDOWN(mp));
                xfs_alert(mp, "%s failed (error %d)!", __func__, error);
-               return error;
        }
-       return 0;
+       if (need_alloc)
+               xfs_finish_inode_setup(*ip);
+       return error;
 }
 
 
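xfs_finish_inode_setup() comes from the inode setup split elsewhere in this series; its hunks are not shown here, but the assumed shape is below. A freshly allocated inode is exposed to the VFS in I_NEW state and only unlocked once the creating transaction can no longer abort, which is why need_alloc gates the call to inodes this function actually allocated:

        /* assumed helper shape, not part of this hunk */
        static inline void
        xfs_finish_inode_setup(
                struct xfs_inode        *ip)
        {
                xfs_iflags_clear(ip, XFS_INEW);
                barrier();
                unlock_new_inode(VFS_I(ip));
        }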
index 8fcc4cc..5f357ca 100644 (file)
@@ -109,8 +109,6 @@ static struct xfs_kobj xfs_dbg_kobj;        /* global debug sysfs attrs */
 #define MNTOPT_GQUOTANOENF "gqnoenforce"/* group quota limit enforcement */
 #define MNTOPT_PQUOTANOENF "pqnoenforce"/* project quota limit enforcement */
 #define MNTOPT_QUOTANOENF  "qnoenforce"        /* same as uqnoenforce */
-#define MNTOPT_DELAYLOG    "delaylog"  /* Delayed logging enabled */
-#define MNTOPT_NODELAYLOG  "nodelaylog"        /* Delayed logging disabled */
 #define MNTOPT_DISCARD    "discard"    /* Discard unused blocks */
 #define MNTOPT_NODISCARD   "nodiscard" /* Do not discard unused blocks */
 
@@ -361,28 +359,10 @@ xfs_parseargs(
                } else if (!strcmp(this_char, MNTOPT_GQUOTANOENF)) {
                        mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE);
                        mp->m_qflags &= ~XFS_GQUOTA_ENFD;
-               } else if (!strcmp(this_char, MNTOPT_DELAYLOG)) {
-                       xfs_warn(mp,
-       "delaylog is the default now, option is deprecated.");
-               } else if (!strcmp(this_char, MNTOPT_NODELAYLOG)) {
-                       xfs_warn(mp,
-       "nodelaylog support has been removed, option is deprecated.");
                } else if (!strcmp(this_char, MNTOPT_DISCARD)) {
                        mp->m_flags |= XFS_MOUNT_DISCARD;
                } else if (!strcmp(this_char, MNTOPT_NODISCARD)) {
                        mp->m_flags &= ~XFS_MOUNT_DISCARD;
-               } else if (!strcmp(this_char, "ihashsize")) {
-                       xfs_warn(mp,
-       "ihashsize no longer used, option is deprecated.");
-               } else if (!strcmp(this_char, "osyncisdsync")) {
-                       xfs_warn(mp,
-       "osyncisdsync has no effect, option is deprecated.");
-               } else if (!strcmp(this_char, "osyncisosync")) {
-                       xfs_warn(mp,
-       "osyncisosync has no effect, option is deprecated.");
-               } else if (!strcmp(this_char, "irixsgid")) {
-                       xfs_warn(mp,
-       "irixsgid is now a sysctl(2) variable, option is deprecated.");
                } else {
                        xfs_warn(mp, "unknown mount option [%s].", this_char);
                        return -EINVAL;
@@ -986,6 +966,8 @@ xfs_fs_inode_init_once(
        atomic_set(&ip->i_pincount, 0);
        spin_lock_init(&ip->i_flags_lock);
 
+       mrlock_init(&ip->i_mmaplock, MRLOCK_ALLOW_EQUAL_PRI|MRLOCK_BARRIER,
+                    "xfsino", ip->i_ino);
        mrlock_init(&ip->i_lock, MRLOCK_ALLOW_EQUAL_PRI|MRLOCK_BARRIER,
                     "xfsino", ip->i_ino);
 }
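
The i_mmaplock initialised above is the new per-inode lock that serialises page faults against truncate, hole punch and other direct extent manipulations. A sketch of the fault-handler side (assumed shape; the xfs_file.c hunks are not included in this excerpt):

        STATIC int
        xfs_filemap_fault(
                struct vm_area_struct   *vma,
                struct vm_fault         *vmf)
        {
                struct xfs_inode        *ip = XFS_I(vma->vm_file->f_mapping->host);
                int                     ret;

                trace_xfs_filemap_fault(ip);

                /* faults take the mmap lock shared; truncate and hole punch
                 * take it exclusive to lock the fault path out */
                xfs_ilock(ip, XFS_MMAPLOCK_SHARED);
                ret = filemap_fault(vma, vmf);
                xfs_iunlock(ip, XFS_MMAPLOCK_SHARED);
                return ret;
        }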
@@ -1033,23 +1015,6 @@ xfs_free_fsname(
        kfree(mp->m_logname);
 }
 
-STATIC void
-xfs_fs_put_super(
-       struct super_block      *sb)
-{
-       struct xfs_mount        *mp = XFS_M(sb);
-
-       xfs_filestream_unmount(mp);
-       xfs_unmountfs(mp);
-
-       xfs_freesb(mp);
-       xfs_icsb_destroy_counters(mp);
-       xfs_destroy_mount_workqueues(mp);
-       xfs_close_devices(mp);
-       xfs_free_fsname(mp);
-       kfree(mp);
-}
-
 STATIC int
 xfs_fs_sync_fs(
        struct super_block      *sb,
@@ -1085,6 +1050,9 @@ xfs_fs_statfs(
        xfs_sb_t                *sbp = &mp->m_sb;
        struct xfs_inode        *ip = XFS_I(dentry->d_inode);
        __uint64_t              fakeinos, id;
+       __uint64_t              icount;
+       __uint64_t              ifree;
+       __uint64_t              fdblocks;
        xfs_extlen_t            lsize;
        __int64_t               ffree;
 
@@ -1095,17 +1063,21 @@ xfs_fs_statfs(
        statp->f_fsid.val[0] = (u32)id;
        statp->f_fsid.val[1] = (u32)(id >> 32);
 
-       xfs_icsb_sync_counters(mp, XFS_ICSB_LAZY_COUNT);
+       icount = percpu_counter_sum(&mp->m_icount);
+       ifree = percpu_counter_sum(&mp->m_ifree);
+       fdblocks = percpu_counter_sum(&mp->m_fdblocks);
 
        spin_lock(&mp->m_sb_lock);
        statp->f_bsize = sbp->sb_blocksize;
        lsize = sbp->sb_logstart ? sbp->sb_logblocks : 0;
        statp->f_blocks = sbp->sb_dblocks - lsize;
-       statp->f_bfree = statp->f_bavail =
-                               sbp->sb_fdblocks - XFS_ALLOC_SET_ASIDE(mp);
+       spin_unlock(&mp->m_sb_lock);
+
+       statp->f_bfree = fdblocks - XFS_ALLOC_SET_ASIDE(mp);
+       statp->f_bavail = statp->f_bfree;
+
        fakeinos = statp->f_bfree << sbp->sb_inopblog;
-       statp->f_files =
-           MIN(sbp->sb_icount + fakeinos, (__uint64_t)XFS_MAXINUMBER);
+       statp->f_files = MIN(icount + fakeinos, (__uint64_t)XFS_MAXINUMBER);
        if (mp->m_maxicount)
                statp->f_files = min_t(typeof(statp->f_files),
                                        statp->f_files,
@@ -1117,10 +1089,9 @@ xfs_fs_statfs(
                                        sbp->sb_icount);
 
        /* make sure statp->f_ffree does not underflow */
-       ffree = statp->f_files - (sbp->sb_icount - sbp->sb_ifree);
+       ffree = statp->f_files - (icount - ifree);
        statp->f_ffree = max_t(__int64_t, ffree, 0);
 
-       spin_unlock(&mp->m_sb_lock);
 
        if ((ip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) &&
            ((mp->m_qflags & (XFS_PQUOTA_ACCT|XFS_PQUOTA_ENFD))) ==
@@ -1256,6 +1227,12 @@ xfs_fs_remount(
 
        /* ro -> rw */
        if ((mp->m_flags & XFS_MOUNT_RDONLY) && !(*flags & MS_RDONLY)) {
+               if (mp->m_flags & XFS_MOUNT_NORECOVERY) {
+                       xfs_warn(mp,
+               "ro->rw transition prohibited on norecovery mount");
+                       return -EINVAL;
+               }
+
                mp->m_flags &= ~XFS_MOUNT_RDONLY;
 
                /*
@@ -1401,6 +1378,51 @@ xfs_finish_flags(
        return 0;
 }
 
+static int
+xfs_init_percpu_counters(
+       struct xfs_mount        *mp)
+{
+       int             error;
+
+       error = percpu_counter_init(&mp->m_icount, 0, GFP_KERNEL);
+       if (error)
+               return -ENOMEM;
+
+       error = percpu_counter_init(&mp->m_ifree, 0, GFP_KERNEL);
+       if (error)
+               goto free_icount;
+
+       error = percpu_counter_init(&mp->m_fdblocks, 0, GFP_KERNEL);
+       if (error)
+               goto free_ifree;
+
+       return 0;
+
+free_ifree:
+       percpu_counter_destroy(&mp->m_ifree);
+free_icount:
+       percpu_counter_destroy(&mp->m_icount);
+       return -ENOMEM;
+}
+
+void
+xfs_reinit_percpu_counters(
+       struct xfs_mount        *mp)
+{
+       percpu_counter_set(&mp->m_icount, mp->m_sb.sb_icount);
+       percpu_counter_set(&mp->m_ifree, mp->m_sb.sb_ifree);
+       percpu_counter_set(&mp->m_fdblocks, mp->m_sb.sb_fdblocks);
+}
+
+static void
+xfs_destroy_percpu_counters(
+       struct xfs_mount        *mp)
+{
+       percpu_counter_destroy(&mp->m_icount);
+       percpu_counter_destroy(&mp->m_ifree);
+       percpu_counter_destroy(&mp->m_fdblocks);
+}
+
 STATIC int
 xfs_fs_fill_super(
        struct super_block      *sb,
@@ -1449,7 +1471,7 @@ xfs_fs_fill_super(
        if (error)
                goto out_close_devices;
 
-       error = xfs_icsb_init_counters(mp);
+       error = xfs_init_percpu_counters(mp);
        if (error)
                goto out_destroy_workqueues;
 
@@ -1507,7 +1529,7 @@ xfs_fs_fill_super(
  out_free_sb:
        xfs_freesb(mp);
  out_destroy_counters:
-       xfs_icsb_destroy_counters(mp);
+       xfs_destroy_percpu_counters(mp);
 out_destroy_workqueues:
        xfs_destroy_mount_workqueues(mp);
  out_close_devices:
@@ -1524,6 +1546,24 @@ out_destroy_workqueues:
        goto out_free_sb;
 }
 
+STATIC void
+xfs_fs_put_super(
+       struct super_block      *sb)
+{
+       struct xfs_mount        *mp = XFS_M(sb);
+
+       xfs_notice(mp, "Unmounting Filesystem");
+       xfs_filestream_unmount(mp);
+       xfs_unmountfs(mp);
+
+       xfs_freesb(mp);
+       xfs_destroy_percpu_counters(mp);
+       xfs_destroy_mount_workqueues(mp);
+       xfs_close_devices(mp);
+       xfs_free_fsname(mp);
+       kfree(mp);
+}
+
 STATIC struct dentry *
 xfs_fs_mount(
        struct file_system_type *fs_type,
index 2b830c2..499058f 100644 (file)
@@ -72,6 +72,8 @@ extern const struct export_operations xfs_export_operations;
 extern const struct xattr_handler *xfs_xattr_handlers[];
 extern const struct quotactl_ops xfs_quotactl_operations;
 
+extern void xfs_reinit_percpu_counters(struct xfs_mount *mp);
+
 #define XFS_M(sb)              ((struct xfs_mount *)((sb)->s_fs_info))
 
 #endif /* __XFS_SUPER_H__ */
index 25791df..3df411e 100644 (file)
@@ -177,7 +177,7 @@ xfs_symlink(
        int                     pathlen;
        struct xfs_bmap_free    free_list;
        xfs_fsblock_t           first_block;
-       bool                    unlock_dp_on_error = false;
+       bool                    unlock_dp_on_error = false;
        uint                    cancel_flags;
        int                     committed;
        xfs_fileoff_t           first_fsb;
@@ -221,7 +221,7 @@ xfs_symlink(
                        XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT,
                        &udqp, &gdqp, &pdqp);
        if (error)
-               goto std_return;
+               return error;
 
        tp = xfs_trans_alloc(mp, XFS_TRANS_SYMLINK);
        cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
@@ -241,7 +241,7 @@ xfs_symlink(
        }
        if (error) {
                cancel_flags = 0;
-               goto error_return;
+               goto out_trans_cancel;
        }
 
        xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
@@ -252,7 +252,7 @@ xfs_symlink(
         */
        if (dp->i_d.di_flags & XFS_DIFLAG_NOSYMLINKS) {
                error = -EPERM;
-               goto error_return;
+               goto out_trans_cancel;
        }
 
        /*
@@ -261,7 +261,7 @@ xfs_symlink(
        error = xfs_trans_reserve_quota(tp, mp, udqp, gdqp,
                                                pdqp, resblks, 1, 0);
        if (error)
-               goto error_return;
+               goto out_trans_cancel;
 
        /*
         * Check for ability to enter directory entry, if no space reserved.
@@ -269,7 +269,7 @@ xfs_symlink(
        if (!resblks) {
                error = xfs_dir_canenter(tp, dp, link_name);
                if (error)
-                       goto error_return;
+                       goto out_trans_cancel;
        }
        /*
         * Initialize the bmap freelist prior to calling either
@@ -282,15 +282,14 @@ xfs_symlink(
         */
        error = xfs_dir_ialloc(&tp, dp, S_IFLNK | (mode & ~S_IFMT), 1, 0,
                               prid, resblks > 0, &ip, NULL);
-       if (error) {
-               if (error == -ENOSPC)
-                       goto error_return;
-               goto error1;
-       }
+       if (error)
+               goto out_trans_cancel;
 
        /*
-        * An error after we've joined dp to the transaction will result in the
-        * transaction cancel unlocking dp so don't do it explicitly in the
+        * Now we join the directory inode to the transaction.  We do not do it
+        * earlier because xfs_dir_ialloc might commit the previous transaction
+        * (and release all the locks).  An error from here on will result in
+        * the transaction cancel unlocking dp so don't do it explicitly in the
         * error path.
         */
        xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
@@ -330,7 +329,7 @@ xfs_symlink(
                                  XFS_BMAPI_METADATA, &first_block, resblks,
                                  mval, &nmaps, &free_list);
                if (error)
-                       goto error2;
+                       goto out_bmap_cancel;
 
                if (resblks)
                        resblks -= fs_blocks;
@@ -348,7 +347,7 @@ xfs_symlink(
                                               BTOBB(byte_cnt), 0);
                        if (!bp) {
                                error = -ENOMEM;
-                               goto error2;
+                               goto out_bmap_cancel;
                        }
                        bp->b_ops = &xfs_symlink_buf_ops;
 
@@ -378,7 +377,7 @@ xfs_symlink(
        error = xfs_dir_createname(tp, dp, link_name, ip->i_ino,
                                        &first_block, &free_list, resblks);
        if (error)
-               goto error2;
+               goto out_bmap_cancel;
        xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
        xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
 
@@ -392,10 +391,13 @@ xfs_symlink(
        }
 
        error = xfs_bmap_finish(&tp, &free_list, &committed);
-       if (error) {
-               goto error2;
-       }
+       if (error)
+               goto out_bmap_cancel;
+
        error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+       if (error)
+               goto out_release_inode;
+
        xfs_qm_dqrele(udqp);
        xfs_qm_dqrele(gdqp);
        xfs_qm_dqrele(pdqp);
@@ -403,20 +405,28 @@ xfs_symlink(
        *ipp = ip;
        return 0;
 
- error2:
-       IRELE(ip);
- error1:
+out_bmap_cancel:
        xfs_bmap_cancel(&free_list);
        cancel_flags |= XFS_TRANS_ABORT;
- error_return:
+out_trans_cancel:
        xfs_trans_cancel(tp, cancel_flags);
+out_release_inode:
+       /*
+        * Wait until after the current transaction is aborted to finish the
+        * setup of the inode and release the inode.  This prevents recursive
+        * transactions and deadlocks from xfs_inactive.
+        */
+       if (ip) {
+               xfs_finish_inode_setup(ip);
+               IRELE(ip);
+       }
+
        xfs_qm_dqrele(udqp);
        xfs_qm_dqrele(gdqp);
        xfs_qm_dqrele(pdqp);
 
        if (unlock_dp_on_error)
                xfs_iunlock(dp, XFS_ILOCK_EXCL);
- std_return:
        return error;
 }
 
index 51372e3..615781b 100644 (file)
@@ -115,7 +115,7 @@ DECLARE_EVENT_CLASS(xfs_perag_class,
                __entry->refcount = refcount;
                __entry->caller_ip = caller_ip;
        ),
-       TP_printk("dev %d:%d agno %u refcount %d caller %pf",
+       TP_printk("dev %d:%d agno %u refcount %d caller %ps",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  __entry->agno,
                  __entry->refcount,
@@ -239,7 +239,7 @@ TRACE_EVENT(xfs_iext_insert,
                __entry->caller_ip = caller_ip;
        ),
        TP_printk("dev %d:%d ino 0x%llx state %s idx %ld "
-                 "offset %lld block %lld count %lld flag %d caller %pf",
+                 "offset %lld block %lld count %lld flag %d caller %ps",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  __entry->ino,
                  __print_flags(__entry->bmap_state, "|", XFS_BMAP_EXT_FLAGS),
@@ -283,7 +283,7 @@ DECLARE_EVENT_CLASS(xfs_bmap_class,
                __entry->caller_ip = caller_ip;
        ),
        TP_printk("dev %d:%d ino 0x%llx state %s idx %ld "
-                 "offset %lld block %lld count %lld flag %d caller %pf",
+                 "offset %lld block %lld count %lld flag %d caller %ps",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  __entry->ino,
                  __print_flags(__entry->bmap_state, "|", XFS_BMAP_EXT_FLAGS),
@@ -329,7 +329,7 @@ DECLARE_EVENT_CLASS(xfs_buf_class,
                __entry->caller_ip = caller_ip;
        ),
        TP_printk("dev %d:%d bno 0x%llx nblks 0x%x hold %d pincount %d "
-                 "lock %d flags %s caller %pf",
+                 "lock %d flags %s caller %ps",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long long)__entry->bno,
                  __entry->nblks,
@@ -402,7 +402,7 @@ DECLARE_EVENT_CLASS(xfs_buf_flags_class,
                __entry->caller_ip = caller_ip;
        ),
        TP_printk("dev %d:%d bno 0x%llx len 0x%zx hold %d pincount %d "
-                 "lock %d flags %s caller %pf",
+                 "lock %d flags %s caller %ps",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long long)__entry->bno,
                  __entry->buffer_length,
@@ -447,7 +447,7 @@ TRACE_EVENT(xfs_buf_ioerror,
                __entry->caller_ip = caller_ip;
        ),
        TP_printk("dev %d:%d bno 0x%llx len 0x%zx hold %d pincount %d "
-                 "lock %d error %d flags %s caller %pf",
+                 "lock %d error %d flags %s caller %ps",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long long)__entry->bno,
                  __entry->buffer_length,
@@ -613,7 +613,7 @@ DECLARE_EVENT_CLASS(xfs_lock_class,
                __entry->lock_flags = lock_flags;
                __entry->caller_ip = caller_ip;
        ),
-       TP_printk("dev %d:%d ino 0x%llx flags %s caller %pf",
+       TP_printk("dev %d:%d ino 0x%llx flags %s caller %ps",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  __entry->ino,
                  __print_flags(__entry->lock_flags, "|", XFS_LOCK_FLAGS),
@@ -664,6 +664,7 @@ DEFINE_INODE_EVENT(xfs_alloc_file_space);
 DEFINE_INODE_EVENT(xfs_free_file_space);
 DEFINE_INODE_EVENT(xfs_zero_file_space);
 DEFINE_INODE_EVENT(xfs_collapse_file_space);
+DEFINE_INODE_EVENT(xfs_insert_file_space);
 DEFINE_INODE_EVENT(xfs_readdir);
 #ifdef CONFIG_XFS_POSIX_ACL
 DEFINE_INODE_EVENT(xfs_get_acl);
@@ -685,6 +686,9 @@ DEFINE_INODE_EVENT(xfs_inode_set_eofblocks_tag);
 DEFINE_INODE_EVENT(xfs_inode_clear_eofblocks_tag);
 DEFINE_INODE_EVENT(xfs_inode_free_eofblocks_invalid);
 
+DEFINE_INODE_EVENT(xfs_filemap_fault);
+DEFINE_INODE_EVENT(xfs_filemap_page_mkwrite);
+
 DECLARE_EVENT_CLASS(xfs_iref_class,
        TP_PROTO(struct xfs_inode *ip, unsigned long caller_ip),
        TP_ARGS(ip, caller_ip),
@@ -702,7 +706,7 @@ DECLARE_EVENT_CLASS(xfs_iref_class,
                __entry->pincount = atomic_read(&ip->i_pincount);
                __entry->caller_ip = caller_ip;
        ),
-       TP_printk("dev %d:%d ino 0x%llx count %d pincount %d caller %pf",
+       TP_printk("dev %d:%d ino 0x%llx count %d pincount %d caller %ps",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  __entry->ino,
                  __entry->count,
@@ -1217,6 +1221,11 @@ DEFINE_IOMAP_EVENT(xfs_map_blocks_found);
 DEFINE_IOMAP_EVENT(xfs_map_blocks_alloc);
 DEFINE_IOMAP_EVENT(xfs_get_blocks_found);
 DEFINE_IOMAP_EVENT(xfs_get_blocks_alloc);
+DEFINE_IOMAP_EVENT(xfs_gbmap_direct);
+DEFINE_IOMAP_EVENT(xfs_gbmap_direct_new);
+DEFINE_IOMAP_EVENT(xfs_gbmap_direct_update);
+DEFINE_IOMAP_EVENT(xfs_gbmap_direct_none);
+DEFINE_IOMAP_EVENT(xfs_gbmap_direct_endio);
 
 DECLARE_EVENT_CLASS(xfs_simple_io_class,
        TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count),
@@ -1333,7 +1342,7 @@ TRACE_EVENT(xfs_bunmap,
                __entry->flags = flags;
        ),
        TP_printk("dev %d:%d ino 0x%llx size 0x%llx bno 0x%llx len 0x%llx"
-                 "flags %s caller %pf",
+                 "flags %s caller %ps",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  __entry->ino,
                  __entry->size,
@@ -1466,7 +1475,7 @@ TRACE_EVENT(xfs_agf,
        ),
        TP_printk("dev %d:%d agno %u flags %s length %u roots b %u c %u "
                  "levels b %u c %u flfirst %u fllast %u flcount %u "
-                 "freeblks %u longest %u caller %pf",
+                 "freeblks %u longest %u caller %ps",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  __entry->agno,
                  __print_flags(__entry->flags, "|", XFS_AGF_FLAGS),
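
Background for the format-string changes above: %pF/%pf expect a function pointer and dereference the function descriptor on ABIs that use descriptors (ia64, ppc64, parisc), while %pS/%ps take a plain text address. The caller_ip values traced here are return addresses captured with _RET_IP_, so %ps is the correct specifier. A minimal illustration:

        /* caller_ip is a raw code address, not a function pointer */
        printk(KERN_DEBUG "called from %ps\n", (void *)_RET_IP_);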
index eb90cd5..220ef2c 100644 (file)
@@ -173,7 +173,7 @@ xfs_trans_reserve(
        uint                    rtextents)
 {
        int             error = 0;
-       int             rsvd = (tp->t_flags & XFS_TRANS_RESERVE) != 0;
+       bool            rsvd = (tp->t_flags & XFS_TRANS_RESERVE) != 0;
 
        /* Mark this thread as being in a transaction */
        current_set_flags_nested(&tp->t_pflags, PF_FSTRANS);
@@ -184,8 +184,7 @@ xfs_trans_reserve(
         * fail if the count would go below zero.
         */
        if (blocks > 0) {
-               error = xfs_icsb_modify_counters(tp->t_mountp, XFS_SBS_FDBLOCKS,
-                                         -((int64_t)blocks), rsvd);
+               error = xfs_mod_fdblocks(tp->t_mountp, -((int64_t)blocks), rsvd);
                if (error != 0) {
                        current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
                        return -ENOSPC;
@@ -236,8 +235,7 @@ xfs_trans_reserve(
         * fail if the count would go below zero.
         */
        if (rtextents > 0) {
-               error = xfs_mod_incore_sb(tp->t_mountp, XFS_SBS_FREXTENTS,
-                                         -((int64_t)rtextents), rsvd);
+               error = xfs_mod_frextents(tp->t_mountp, -((int64_t)rtextents));
                if (error) {
                        error = -ENOSPC;
                        goto undo_log;
@@ -268,8 +266,7 @@ undo_log:
 
 undo_blocks:
        if (blocks > 0) {
-               xfs_icsb_modify_counters(tp->t_mountp, XFS_SBS_FDBLOCKS,
-                                        (int64_t)blocks, rsvd);
+               xfs_mod_fdblocks(tp->t_mountp, (int64_t)blocks, rsvd);
                tp->t_blk_res = 0;
        }
 
@@ -488,6 +485,54 @@ xfs_trans_apply_sb_deltas(
                                  sizeof(sbp->sb_frextents) - 1);
 }
 
+STATIC int
+xfs_sb_mod8(
+       uint8_t                 *field,
+       int8_t                  delta)
+{
+       int8_t                  counter = *field;
+
+       counter += delta;
+       if (counter < 0) {
+               ASSERT(0);
+               return -EINVAL;
+       }
+       *field = counter;
+       return 0;
+}
+
+STATIC int
+xfs_sb_mod32(
+       uint32_t                *field,
+       int32_t                 delta)
+{
+       int32_t                 counter = *field;
+
+       counter += delta;
+       if (counter < 0) {
+               ASSERT(0);
+               return -EINVAL;
+       }
+       *field = counter;
+       return 0;
+}
+
+STATIC int
+xfs_sb_mod64(
+       uint64_t                *field,
+       int64_t                 delta)
+{
+       int64_t                 counter = *field;
+
+       counter += delta;
+       if (counter < 0) {
+               ASSERT(0);
+               return -EINVAL;
+       }
+       *field = counter;
+       return 0;
+}
+
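
Three width-specific helpers exist because the went-negative check has to run in a signed temporary of the field's own width: a result outside that width's positive range wraps negative and is rejected before being written back, which a wider temporary would miss. A tiny illustration of the 8-bit case (values assumed):

        uint8_t imax_pct = 25;          /* current sb_imax_pct */
        int8_t  counter = imax_pct;

        counter += -30;                 /* delta overdraws the field */
        if (counter < 0)
                return -EINVAL;         /* 25 - 30 = -5: rejected */
        imax_pct = counter;             /* written back only when valid */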
 /*
  * xfs_trans_unreserve_and_mod_sb() is called to release unused reservations
  * and apply superblock counter changes to the in-core superblock.  The
@@ -495,13 +540,6 @@ xfs_trans_apply_sb_deltas(
  * applied to the in-core superblock.  The idea is that that has already been
  * done.
  *
- * This is done efficiently with a single call to xfs_mod_incore_sb_batch().
- * However, we have to ensure that we only modify each superblock field only
- * once because the application of the delta values may not be atomic. That can
- * lead to ENOSPC races occurring if we have two separate modifcations of the
- * free space counter to put back the entire reservation and then take away
- * what we used.
- *
  * If we are not logging superblock counters, then the inode allocated/free and
  * used block counts are not updated in the on disk superblock. In this case,
  * XFS_TRANS_SB_DIRTY will not be set when the transaction is updated but we
@@ -509,21 +547,15 @@ xfs_trans_apply_sb_deltas(
  */
 void
 xfs_trans_unreserve_and_mod_sb(
-       xfs_trans_t     *tp)
+       struct xfs_trans        *tp)
 {
-       xfs_mod_sb_t    msb[9]; /* If you add cases, add entries */
-       xfs_mod_sb_t    *msbp;
-       xfs_mount_t     *mp = tp->t_mountp;
-       /* REFERENCED */
-       int             error;
-       int             rsvd;
-       int64_t         blkdelta = 0;
-       int64_t         rtxdelta = 0;
-       int64_t         idelta = 0;
-       int64_t         ifreedelta = 0;
-
-       msbp = msb;
-       rsvd = (tp->t_flags & XFS_TRANS_RESERVE) != 0;
+       struct xfs_mount        *mp = tp->t_mountp;
+       bool                    rsvd = (tp->t_flags & XFS_TRANS_RESERVE) != 0;
+       int64_t                 blkdelta = 0;
+       int64_t                 rtxdelta = 0;
+       int64_t                 idelta = 0;
+       int64_t                 ifreedelta = 0;
+       int                     error;
 
        /* calculate deltas */
        if (tp->t_blk_res > 0)
@@ -547,97 +579,115 @@ xfs_trans_unreserve_and_mod_sb(
 
        /* apply the per-cpu counters */
        if (blkdelta) {
-               error = xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS,
-                                                blkdelta, rsvd);
+               error = xfs_mod_fdblocks(mp, blkdelta, rsvd);
                if (error)
                        goto out;
        }
 
        if (idelta) {
-               error = xfs_icsb_modify_counters(mp, XFS_SBS_ICOUNT,
-                                                idelta, rsvd);
+               error = xfs_mod_icount(mp, idelta);
                if (error)
                        goto out_undo_fdblocks;
        }
 
        if (ifreedelta) {
-               error = xfs_icsb_modify_counters(mp, XFS_SBS_IFREE,
-                                                ifreedelta, rsvd);
+               error = xfs_mod_ifree(mp, ifreedelta);
                if (error)
                        goto out_undo_icount;
        }
 
+       if (rtxdelta == 0 && !(tp->t_flags & XFS_TRANS_SB_DIRTY))
+               return;
+
        /* apply remaining deltas */
-       if (rtxdelta != 0) {
-               msbp->msb_field = XFS_SBS_FREXTENTS;
-               msbp->msb_delta = rtxdelta;
-               msbp++;
+       spin_lock(&mp->m_sb_lock);
+       if (rtxdelta) {
+               error = xfs_sb_mod64(&mp->m_sb.sb_frextents, rtxdelta);
+               if (error)
+                       goto out_undo_ifree;
        }
 
-       if (tp->t_flags & XFS_TRANS_SB_DIRTY) {
-               if (tp->t_dblocks_delta != 0) {
-                       msbp->msb_field = XFS_SBS_DBLOCKS;
-                       msbp->msb_delta = tp->t_dblocks_delta;
-                       msbp++;
-               }
-               if (tp->t_agcount_delta != 0) {
-                       msbp->msb_field = XFS_SBS_AGCOUNT;
-                       msbp->msb_delta = tp->t_agcount_delta;
-                       msbp++;
-               }
-               if (tp->t_imaxpct_delta != 0) {
-                       msbp->msb_field = XFS_SBS_IMAX_PCT;
-                       msbp->msb_delta = tp->t_imaxpct_delta;
-                       msbp++;
-               }
-               if (tp->t_rextsize_delta != 0) {
-                       msbp->msb_field = XFS_SBS_REXTSIZE;
-                       msbp->msb_delta = tp->t_rextsize_delta;
-                       msbp++;
-               }
-               if (tp->t_rbmblocks_delta != 0) {
-                       msbp->msb_field = XFS_SBS_RBMBLOCKS;
-                       msbp->msb_delta = tp->t_rbmblocks_delta;
-                       msbp++;
-               }
-               if (tp->t_rblocks_delta != 0) {
-                       msbp->msb_field = XFS_SBS_RBLOCKS;
-                       msbp->msb_delta = tp->t_rblocks_delta;
-                       msbp++;
-               }
-               if (tp->t_rextents_delta != 0) {
-                       msbp->msb_field = XFS_SBS_REXTENTS;
-                       msbp->msb_delta = tp->t_rextents_delta;
-                       msbp++;
-               }
-               if (tp->t_rextslog_delta != 0) {
-                       msbp->msb_field = XFS_SBS_REXTSLOG;
-                       msbp->msb_delta = tp->t_rextslog_delta;
-                       msbp++;
-               }
+       if (tp->t_dblocks_delta != 0) {
+               error = xfs_sb_mod64(&mp->m_sb.sb_dblocks, tp->t_dblocks_delta);
+               if (error)
+                       goto out_undo_frextents;
        }
-
-       /*
-        * If we need to change anything, do it.
-        */
-       if (msbp > msb) {
-               error = xfs_mod_incore_sb_batch(tp->t_mountp, msb,
-                       (uint)(msbp - msb), rsvd);
+       if (tp->t_agcount_delta != 0) {
+               error = xfs_sb_mod32(&mp->m_sb.sb_agcount, tp->t_agcount_delta);
                if (error)
-                       goto out_undo_ifreecount;
+                       goto out_undo_dblocks;
        }
-
+       if (tp->t_imaxpct_delta != 0) {
+               error = xfs_sb_mod8(&mp->m_sb.sb_imax_pct, tp->t_imaxpct_delta);
+               if (error)
+                       goto out_undo_agcount;
+       }
+       if (tp->t_rextsize_delta != 0) {
+               error = xfs_sb_mod32(&mp->m_sb.sb_rextsize,
+                                    tp->t_rextsize_delta);
+               if (error)
+                       goto out_undo_imaxpct;
+       }
+       if (tp->t_rbmblocks_delta != 0) {
+               error = xfs_sb_mod32(&mp->m_sb.sb_rbmblocks,
+                                    tp->t_rbmblocks_delta);
+               if (error)
+                       goto out_undo_rextsize;
+       }
+       if (tp->t_rblocks_delta != 0) {
+               error = xfs_sb_mod64(&mp->m_sb.sb_rblocks, tp->t_rblocks_delta);
+               if (error)
+                       goto out_undo_rbmblocks;
+       }
+       if (tp->t_rextents_delta != 0) {
+               error = xfs_sb_mod64(&mp->m_sb.sb_rextents,
+                                    tp->t_rextents_delta);
+               if (error)
+                       goto out_undo_rblocks;
+       }
+       if (tp->t_rextslog_delta != 0) {
+               error = xfs_sb_mod8(&mp->m_sb.sb_rextslog,
+                                    tp->t_rextslog_delta);
+               if (error)
+                       goto out_undo_rextents;
+       }
+       spin_unlock(&mp->m_sb_lock);
        return;
 
-out_undo_ifreecount:
+out_undo_rextents:
+       if (tp->t_rextents_delta)
+               xfs_sb_mod64(&mp->m_sb.sb_rextents, -tp->t_rextents_delta);
+out_undo_rblocks:
+       if (tp->t_rblocks_delta)
+               xfs_sb_mod64(&mp->m_sb.sb_rblocks, -tp->t_rblocks_delta);
+out_undo_rbmblocks:
+       if (tp->t_rbmblocks_delta)
+               xfs_sb_mod32(&mp->m_sb.sb_rbmblocks, -tp->t_rbmblocks_delta);
+out_undo_rextsize:
+       if (tp->t_rextsize_delta)
+               xfs_sb_mod32(&mp->m_sb.sb_rextsize, -tp->t_rextsize_delta);
+out_undo_imaxpct:
+       if (tp->t_imaxpct_delta)
+               xfs_sb_mod8(&mp->m_sb.sb_imax_pct, -tp->t_imaxpct_delta);
+out_undo_agcount:
+       if (tp->t_agcount_delta)
+               xfs_sb_mod32(&mp->m_sb.sb_agcount, -tp->t_agcount_delta);
+out_undo_dblocks:
+       if (tp->t_dblocks_delta)
+               xfs_sb_mod64(&mp->m_sb.sb_dblocks, -tp->t_dblocks_delta);
+out_undo_frextents:
+       if (rtxdelta)
+               xfs_sb_mod64(&mp->m_sb.sb_frextents, -rtxdelta);
+out_undo_ifree:
+       spin_unlock(&mp->m_sb_lock);
        if (ifreedelta)
-               xfs_icsb_modify_counters(mp, XFS_SBS_IFREE, -ifreedelta, rsvd);
+               xfs_mod_ifree(mp, -ifreedelta);
 out_undo_icount:
        if (idelta)
-               xfs_icsb_modify_counters(mp, XFS_SBS_ICOUNT, -idelta, rsvd);
+               xfs_mod_icount(mp, -idelta);
 out_undo_fdblocks:
        if (blkdelta)
-               xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS, -blkdelta, rsvd);
+               xfs_mod_fdblocks(mp, -blkdelta, rsvd);
 out:
        ASSERT(error == 0);
        return;
index 3159168..9961110 100644 (file)
@@ -21,4 +21,10 @@ struct space_resv {
 #define FS_IOC_RESVSP          _IOW('X', 40, struct space_resv)
 #define FS_IOC_RESVSP64                _IOW('X', 42, struct space_resv)
 
+#define        FALLOC_FL_SUPPORTED_MASK        (FALLOC_FL_KEEP_SIZE |          \
+                                        FALLOC_FL_PUNCH_HOLE |         \
+                                        FALLOC_FL_COLLAPSE_RANGE |     \
+                                        FALLOC_FL_ZERO_RANGE |         \
+                                        FALLOC_FL_INSERT_RANGE)
+
 #endif /* _FALLOC_H_ */
index d1197ae..3e445a7 100644 (file)
  */
 #define FALLOC_FL_ZERO_RANGE           0x10
 
+/*
+ * FALLOC_FL_INSERT_RANGE is used to insert space within the file size without
+ * overwriting any existing data. The contents of the file beyond offset are
+ * shifted right by len bytes to create a hole.  As such, this
+ * operation will increase the size of the file by len bytes.
+ *
+ * Different filesystems may implement different limitations on the granularity
+ * of the operation. Most will limit operations to filesystem block size
+ * boundaries, but this boundary may be larger or smaller depending on
+ * the filesystem and/or the configuration of the filesystem or file.
+ *
+ * Attempting to insert space using this flag at or beyond the end of
+ * the file is considered an illegal operation - just use ftruncate(2) or
+ * fallocate(2) with mode 0 for such operations.
+ */
+#define FALLOC_FL_INSERT_RANGE         0x20
+
 #endif /* _UAPI_FALLOC_H_ */
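
A userspace usage sketch for the new flag (illustrative; the file name and sizes are assumptions). Both offset and len must normally be aligned to the filesystem block size, and offset must lie strictly inside the current file size:

        #define _GNU_SOURCE
        #include <fcntl.h>
        #include <linux/falloc.h>
        #include <stdio.h>
        #include <unistd.h>

        int
        main(int argc, char **argv)
        {
                int     fd;

                if (argc != 2) {
                        fprintf(stderr, "usage: %s <file>\n", argv[0]);
                        return 1;
                }
                fd = open(argv[1], O_RDWR);
                if (fd < 0) {
                        perror("open");
                        return 1;
                }

                /* shift everything from offset 4096 onwards right by 1 MiB,
                 * leaving a hole; the file grows by the same amount */
                if (fallocate(fd, FALLOC_FL_INSERT_RANGE, 4096, 1 << 20) < 0)
                        perror("FALLOC_FL_INSERT_RANGE");

                close(fd);
                return 0;
        }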