fs/xfs/libxfs/xfs_ialloc.c

   1 // SPDX-License-Identifier: GPL-2.0
   2 /*
   3  * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
   4  * All Rights Reserved.
   5  */
   6 #include "xfs.h"
   7 #include "xfs_fs.h"
   8 #include "xfs_shared.h"
   9 #include "xfs_format.h"
  10 #include "xfs_log_format.h"
  11 #include "xfs_trans_resv.h"
  12 #include "xfs_bit.h"
  13 #include "xfs_mount.h"
  14 #include "xfs_inode.h"
  15 #include "xfs_btree.h"
  16 #include "xfs_ialloc.h"
  17 #include "xfs_ialloc_btree.h"
  18 #include "xfs_alloc.h"
  19 #include "xfs_errortag.h"
  20 #include "xfs_error.h"
  21 #include "xfs_bmap.h"
  22 #include "xfs_trans.h"
  23 #include "xfs_buf_item.h"
  24 #include "xfs_icreate_item.h"
  25 #include "xfs_icache.h"
  26 #include "xfs_trace.h"
  27 #include "xfs_log.h"
  28 #include "xfs_rmap.h"
  29 #include "xfs_ag.h"
  30
  31 /*
  32  * Lookup a record by ino in the btree given by cur.
  33  */
  34 int                                     /* error */
  35 xfs_inobt_lookup(
  36         struct xfs_btree_cur    *cur,   /* btree cursor */
  37         xfs_agino_t             ino,    /* starting inode of chunk */
  38         xfs_lookup_t            dir,    /* <=, >=, == */
  39         int                     *stat)  /* success/failure */
  40 {
  41         cur->bc_rec.i.ir_startino = ino;
  42         cur->bc_rec.i.ir_holemask = 0;
  43         cur->bc_rec.i.ir_count = 0;
  44         cur->bc_rec.i.ir_freecount = 0;
  45         cur->bc_rec.i.ir_free = 0;
  46         return xfs_btree_lookup(cur, dir, stat);
  47 }
  48
  49 /*
  50  * Update the record referred to by cur to the value given.
  51  * This either works (return 0) or gets an EFSCORRUPTED error.
  52  */
  53 STATIC int                              /* error */
  54 xfs_inobt_update(
  55         struct xfs_btree_cur    *cur,   /* btree cursor */
  56         xfs_inobt_rec_incore_t  *irec)  /* btree record */
  57 {
  58         union xfs_btree_rec     rec;
  59
  60         rec.inobt.ir_startino = cpu_to_be32(irec->ir_startino);
  61         if (xfs_has_sparseinodes(cur->bc_mp)) {
  62                 rec.inobt.ir_u.sp.ir_holemask = cpu_to_be16(irec->ir_holemask);
  63                 rec.inobt.ir_u.sp.ir_count = irec->ir_count;
  64                 rec.inobt.ir_u.sp.ir_freecount = irec->ir_freecount;
  65         } else {
  66                 /* ir_holemask/ir_count not supported on-disk */
  67                 rec.inobt.ir_u.f.ir_freecount = cpu_to_be32(irec->ir_freecount);
  68         }
  69         rec.inobt.ir_free = cpu_to_be64(irec->ir_free);
  70         return xfs_btree_update(cur, &rec);
  71 }
  72
  73 /* Convert on-disk btree record to incore inobt record. */
  74 void
  75 xfs_inobt_btrec_to_irec(
  76         struct xfs_mount                *mp,
  77         const union xfs_btree_rec       *rec,
  78         struct xfs_inobt_rec_incore     *irec)
  79 {
  80         irec->ir_startino = be32_to_cpu(rec->inobt.ir_startino);
  81         if (xfs_has_sparseinodes(mp)) {
  82                 irec->ir_holemask = be16_to_cpu(rec->inobt.ir_u.sp.ir_holemask);
  83                 irec->ir_count = rec->inobt.ir_u.sp.ir_count;
  84                 irec->ir_freecount = rec->inobt.ir_u.sp.ir_freecount;
  85         } else {
  86                 /*
  87                  * ir_holemask/ir_count not supported on-disk. Fill in hardcoded
  88                  * values for full inode chunks.
  89                  */
  90                 irec->ir_holemask = XFS_INOBT_HOLEMASK_FULL;
  91                 irec->ir_count = XFS_INODES_PER_CHUNK;
  92                 irec->ir_freecount =
  93                                 be32_to_cpu(rec->inobt.ir_u.f.ir_freecount);
  94         }
  95         irec->ir_free = be64_to_cpu(rec->inobt.ir_free);
  96 }
  97
  98 /*
  99  * Get the data from the pointed-to record.
 100  */
 101 int
 102 xfs_inobt_get_rec(
 103         struct xfs_btree_cur            *cur,
 104         struct xfs_inobt_rec_incore     *irec,
 105         int                             *stat)
 106 {
 107         struct xfs_mount                *mp = cur->bc_mp;
 108         union xfs_btree_rec             *rec;
 109         int                             error;
 110         uint64_t                        realfree;
 111
 112         error = xfs_btree_get_rec(cur, &rec, stat);
 113         if (error || *stat == 0)
 114                 return error;
 115
 116         xfs_inobt_btrec_to_irec(mp, rec, irec);
 117
 118         if (!xfs_verify_agino(cur->bc_ag.pag, irec->ir_startino))
 119                 goto out_bad_rec;
 120         if (irec->ir_count < XFS_INODES_PER_HOLEMASK_BIT ||
 121             irec->ir_count > XFS_INODES_PER_CHUNK)
 122                 goto out_bad_rec;
 123         if (irec->ir_freecount > XFS_INODES_PER_CHUNK)
 124                 goto out_bad_rec;
 125
 126         /* if there are no holes, return the first available offset */
 127         if (!xfs_inobt_issparse(irec->ir_holemask))
 128                 realfree = irec->ir_free;
 129         else
 130                 realfree = irec->ir_free & xfs_inobt_irec_to_allocmask(irec);
 131         if (hweight64(realfree) != irec->ir_freecount)
 132                 goto out_bad_rec;
 133
 134         return 0;
 135
 136 out_bad_rec:
 137         xfs_warn(mp,
 138                 "%s Inode BTree record corruption in AG %d detected!",
 139                 cur->bc_btnum == XFS_BTNUM_INO ? "Used" : "Free",
 140                 cur->bc_ag.pag->pag_agno);
 141         xfs_warn(mp,
 142 "start inode 0x%x, count 0x%x, free 0x%x freemask 0x%llx, holemask 0x%x",
 143                 irec->ir_startino, irec->ir_count, irec->ir_freecount,
 144                 irec->ir_free, irec->ir_holemask);
 145         return -EFSCORRUPTED;
 146 }
 147
 148 /*
 149  * Insert a single inobt record. Cursor must already point to desired location.
 150  */
 151 int
 152 xfs_inobt_insert_rec(
 153         struct xfs_btree_cur    *cur,
 154         uint16_t                holemask,
 155         uint8_t                 count,
 156         int32_t                 freecount,
 157         xfs_inofree_t           free,
 158         int                     *stat)
 159 {
 160         cur->bc_rec.i.ir_holemask = holemask;
 161         cur->bc_rec.i.ir_count = count;
 162         cur->bc_rec.i.ir_freecount = freecount;
 163         cur->bc_rec.i.ir_free = free;
 164         return xfs_btree_insert(cur, stat);
 165 }
 166
 167 /*
 168  * Insert records describing a newly allocated inode chunk into the inobt.
 169  */
 170 STATIC int
 171 xfs_inobt_insert(
 172         struct xfs_perag        *pag,
 173         struct xfs_trans        *tp,
 174         struct xfs_buf          *agbp,
 175         xfs_agino_t             newino,
 176         xfs_agino_t             newlen,
 177         xfs_btnum_t             btnum)
 178 {
 179         struct xfs_btree_cur    *cur;
 180         xfs_agino_t             thisino;
 181         int                     i;
 182         int                     error;
 183
 184         cur = xfs_inobt_init_cursor(pag, tp, agbp, btnum);
 185
 186         for (thisino = newino;
 187              thisino < newino + newlen;
 188              thisino += XFS_INODES_PER_CHUNK) {
 189                 error = xfs_inobt_lookup(cur, thisino, XFS_LOOKUP_EQ, &i);
 190                 if (error) {
 191                         xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
 192                         return error;
 193                 }
 194                 ASSERT(i == 0);
 195
 196                 error = xfs_inobt_insert_rec(cur, XFS_INOBT_HOLEMASK_FULL,
 197                                              XFS_INODES_PER_CHUNK,
 198                                              XFS_INODES_PER_CHUNK,
 199                                              XFS_INOBT_ALL_FREE, &i);
 200                 if (error) {
 201                         xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
 202                         return error;
 203                 }
 204                 ASSERT(i == 1);
 205         }
 206
 207         xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
 208
 209         return 0;
 210 }
 211
 212 /*
 213  * Verify that the number of free inodes in the AGI is correct.
 214  */
 215 #ifdef DEBUG
 216 static int
 217 xfs_check_agi_freecount(
 218         struct xfs_btree_cur    *cur)
 219 {
 220         if (cur->bc_nlevels == 1) {
 221                 xfs_inobt_rec_incore_t rec;
 222                 int             freecount = 0;
 223                 int             error;
 224                 int             i;
 225
 226                 error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &i);
 227                 if (error)
 228                         return error;
 229
 230                 do {
 231                         error = xfs_inobt_get_rec(cur, &rec, &i);
 232                         if (error)
 233                                 return error;
 234
 235                         if (i) {
 236                                 freecount += rec.ir_freecount;
 237                                 error = xfs_btree_increment(cur, 0, &i);
 238                                 if (error)
 239                                         return error;
 240                         }
 241                 } while (i == 1);
 242
 243                 if (!xfs_is_shutdown(cur->bc_mp))
 244                         ASSERT(freecount == cur->bc_ag.pag->pagi_freecount);
 245         }
 246         return 0;
 247 }
 248 #else
 249 #define xfs_check_agi_freecount(cur)    0
 250 #endif
 251
 252 /*
 253  * Initialise a new set of inodes. When called without a transaction context
 254  * (e.g. from recovery) we initiate a delayed write of the inode buffers rather
 255  * than logging them (which in a transaction context puts them into the AIL
 256  * for writeback rather than the xfsbufd queue).
 257  */
 258 int
 259 xfs_ialloc_inode_init(
 260         struct xfs_mount        *mp,
 261         struct xfs_trans        *tp,
 262         struct list_head        *buffer_list,
 263         int                     icount,
 264         xfs_agnumber_t          agno,
 265         xfs_agblock_t           agbno,
 266         xfs_agblock_t           length,
 267         unsigned int            gen)
 268 {
 269         struct xfs_buf          *fbuf;
 270         struct xfs_dinode       *free;
 271         int                     nbufs;
 272         int                     version;
 273         int                     i, j;
 274         xfs_daddr_t             d;
 275         xfs_ino_t               ino = 0;
 276         int                     error;
 277
 278         /*
 279          * Loop over the new block(s), filling in the inodes.  For small block
 280          * sizes, manipulate the inodes in buffers  which are multiples of the
 281          * blocks size.
 282          */
 283         nbufs = length / M_IGEO(mp)->blocks_per_cluster;
 284
 285         /*
 286          * Figure out what version number to use in the inodes we create.  If
 287          * the superblock version has caught up to the one that supports the new
 288          * inode format, then use the new inode version.  Otherwise use the old
 289          * version so that old kernels will continue to be able to use the file
 290          * system.
 291          *
 292          * For v3 inodes, we also need to write the inode number into the inode,
 293          * so calculate the first inode number of the chunk here as
 294          * XFS_AGB_TO_AGINO() only works within a filesystem block, not
 295          * across multiple filesystem blocks (such as a cluster) and so cannot
 296          * be used in the cluster buffer loop below.
 297          *
 298          * Further, because we are writing the inode directly into the buffer
 299          * and calculating a CRC on the entire inode, we have ot log the entire
 300          * inode so that the entire range the CRC covers is present in the log.
 301          * That means for v3 inode we log the entire buffer rather than just the
 302          * inode cores.
 303          */
 304         if (xfs_has_v3inodes(mp)) {
 305                 version = 3;
 306                 ino = XFS_AGINO_TO_INO(mp, agno, XFS_AGB_TO_AGINO(mp, agbno));
 307
 308                 /*
 309                  * log the initialisation that is about to take place as an
 310                  * logical operation. This means the transaction does not
 311                  * need to log the physical changes to the inode buffers as log
 312                  * recovery will know what initialisation is actually needed.
 313                  * Hence we only need to log the buffers as "ordered" buffers so
 314                  * they track in the AIL as if they were physically logged.
 315                  */
 316                 if (tp)
 317                         xfs_icreate_log(tp, agno, agbno, icount,
 318                                         mp->m_sb.sb_inodesize, length, gen);
 319         } else
 320                 version = 2;
 321
 322         for (j = 0; j < nbufs; j++) {
 323                 /*
 324                  * Get the block.
 325                  */
 326                 d = XFS_AGB_TO_DADDR(mp, agno, agbno +
 327                                 (j * M_IGEO(mp)->blocks_per_cluster));
 328                 error = xfs_trans_get_buf(tp, mp->m_ddev_targp, d,
 329                                 mp->m_bsize * M_IGEO(mp)->blocks_per_cluster,
 330                                 XBF_UNMAPPED, &fbuf);
 331                 if (error)
 332                         return error;
 333
 334                 /* Initialize the inode buffers and log them appropriately. */
 335                 fbuf->b_ops = &xfs_inode_buf_ops;
 336                 xfs_buf_zero(fbuf, 0, BBTOB(fbuf->b_length));
 337                 for (i = 0; i < M_IGEO(mp)->inodes_per_cluster; i++) {
 338                         int     ioffset = i << mp->m_sb.sb_inodelog;
 339
 340                         free = xfs_make_iptr(mp, fbuf, i);
 341                         free->di_magic = cpu_to_be16(XFS_DINODE_MAGIC);
 342                         free->di_version = version;
 343                         free->di_gen = cpu_to_be32(gen);
 344                         free->di_next_unlinked = cpu_to_be32(NULLAGINO);
 345
 346                         if (version == 3) {
 347                                 free->di_ino = cpu_to_be64(ino);
 348                                 ino++;
 349                                 uuid_copy(&free->di_uuid,
 350                                           &mp->m_sb.sb_meta_uuid);
 351                                 xfs_dinode_calc_crc(mp, free);
 352                         } else if (tp) {
 353                                 /* just log the inode core */
 354                                 xfs_trans_log_buf(tp, fbuf, ioffset,
 355                                           ioffset + XFS_DINODE_SIZE(mp) - 1);
 356                         }
 357                 }
 358
 359                 if (tp) {
 360                         /*
 361                          * Mark the buffer as an inode allocation buffer so it
 362                          * sticks in AIL at the point of this allocation
 363                          * transaction. This ensures the they are on disk before
 364                          * the tail of the log can be moved past this
 365                          * transaction (i.e. by preventing relogging from moving
 366                          * it forward in the log).
 367                          */
 368                         xfs_trans_inode_alloc_buf(tp, fbuf);
 369                         if (version == 3) {
 370                                 /*
 371                                  * Mark the buffer as ordered so that they are
 372                                  * not physically logged in the transaction but
 373                                  * still tracked in the AIL as part of the
 374                                  * transaction and pin the log appropriately.
 375                                  */
 376                                 xfs_trans_ordered_buf(tp, fbuf);
 377                         }
 378                 } else {
 379                         fbuf->b_flags |= XBF_DONE;
 380                         xfs_buf_delwri_queue(fbuf, buffer_list);
 381                         xfs_buf_relse(fbuf);
 382                 }
 383         }
 384         return 0;
 385 }
 386
 387 /*
 388  * Align startino and allocmask for a recently allocated sparse chunk such that
 389  * they are fit for insertion (or merge) into the on-disk inode btrees.
 390  *
 391  * Background:
 392  *
 393  * When enabled, sparse inode support increases the inode alignment from cluster
 394  * size to inode chunk size. This means that the minimum range between two
 395  * non-adjacent inode records in the inobt is large enough for a full inode
 396  * record. This allows for cluster sized, cluster aligned block allocation
 397  * without need to worry about whether the resulting inode record overlaps with
 398  * another record in the tree. Without this basic rule, we would have to deal
 399  * with the consequences of overlap by potentially undoing recent allocations in
 400  * the inode allocation codepath.
 401  *
 402  * Because of this alignment rule (which is enforced on mount), there are two
 403  * inobt possibilities for newly allocated sparse chunks. One is that the
 404  * aligned inode record for the chunk covers a range of inodes not already
 405  * covered in the inobt (i.e., it is safe to insert a new sparse record). The
 406  * other is that a record already exists at the aligned startino that considers
 407  * the newly allocated range as sparse. In the latter case, record content is
 408  * merged in hope that sparse inode chunks fill to full chunks over time.
 409  */
 410 STATIC void
 411 xfs_align_sparse_ino(
 412         struct xfs_mount                *mp,
 413         xfs_agino_t                     *startino,
 414         uint16_t                        *allocmask)
 415 {
 416         xfs_agblock_t                   agbno;
 417         xfs_agblock_t                   mod;
 418         int                             offset;
 419
 420         agbno = XFS_AGINO_TO_AGBNO(mp, *startino);
 421         mod = agbno % mp->m_sb.sb_inoalignmt;
 422         if (!mod)
 423                 return;
 424
 425         /* calculate the inode offset and align startino */
 426         offset = XFS_AGB_TO_AGINO(mp, mod);
 427         *startino -= offset;
 428
 429         /*
 430          * Since startino has been aligned down, left shift allocmask such that
 431          * it continues to represent the same physical inodes relative to the
 432          * new startino.
 433          */
 434         *allocmask <<= offset / XFS_INODES_PER_HOLEMASK_BIT;
 435 }
 436
 437 /*
 438  * Determine whether the source inode record can merge into the target. Both
 439  * records must be sparse, the inode ranges must match and there must be no
 440  * allocation overlap between the records.
 441  */
 442 STATIC bool
 443 __xfs_inobt_can_merge(
 444         struct xfs_inobt_rec_incore     *trec,  /* tgt record */
 445         struct xfs_inobt_rec_incore     *srec)  /* src record */
 446 {
 447         uint64_t                        talloc;
 448         uint64_t                        salloc;
 449
 450         /* records must cover the same inode range */
 451         if (trec->ir_startino != srec->ir_startino)
 452                 return false;
 453
 454         /* both records must be sparse */
 455         if (!xfs_inobt_issparse(trec->ir_holemask) ||
 456             !xfs_inobt_issparse(srec->ir_holemask))
 457                 return false;
 458
 459         /* both records must track some inodes */
 460         if (!trec->ir_count || !srec->ir_count)
 461                 return false;
 462
 463         /* can't exceed capacity of a full record */
 464         if (trec->ir_count + srec->ir_count > XFS_INODES_PER_CHUNK)
 465                 return false;
 466
 467         /* verify there is no allocation overlap */
 468         talloc = xfs_inobt_irec_to_allocmask(trec);
 469         salloc = xfs_inobt_irec_to_allocmask(srec);
 470         if (talloc & salloc)
 471                 return false;
 472
 473         return true;
 474 }
 475
 476 /*
 477  * Merge the source inode record into the target. The caller must call
 478  * __xfs_inobt_can_merge() to ensure the merge is valid.
 479  */
 480 STATIC void
 481 __xfs_inobt_rec_merge(
 482         struct xfs_inobt_rec_incore     *trec,  /* target */
 483         struct xfs_inobt_rec_incore     *srec)  /* src */
 484 {
 485         ASSERT(trec->ir_startino == srec->ir_startino);
 486
 487         /* combine the counts */
 488         trec->ir_count += srec->ir_count;
 489         trec->ir_freecount += srec->ir_freecount;
 490
 491         /*
 492          * Merge the holemask and free mask. For both fields, 0 bits refer to
 493          * allocated inodes. We combine the allocated ranges with bitwise AND.
 494          */
 495         trec->ir_holemask &= srec->ir_holemask;
 496         trec->ir_free &= srec->ir_free;
 497 }
 498
 499 /*
 500  * Insert a new sparse inode chunk into the associated inode btree. The inode
 501  * record for the sparse chunk is pre-aligned to a startino that should match
 502  * any pre-existing sparse inode record in the tree. This allows sparse chunks
 503  * to fill over time.
 504  *
 505  * This function supports two modes of handling preexisting records depending on
 506  * the merge flag. If merge is true, the provided record is merged with the
 507  * existing record and updated in place. The merged record is returned in nrec.
 508  * If merge is false, an existing record is replaced with the provided record.
 509  * If no preexisting record exists, the provided record is always inserted.
 510  *
 511  * It is considered corruption if a merge is requested and not possible. Given
 512  * the sparse inode alignment constraints, this should never happen.
 513  */
 514 STATIC int
 515 xfs_inobt_insert_sprec(
 516         struct xfs_perag                *pag,
 517         struct xfs_trans                *tp,
 518         struct xfs_buf                  *agbp,
 519         int                             btnum,
 520         struct xfs_inobt_rec_incore     *nrec,  /* in/out: new/merged rec. */
 521         bool                            merge)  /* merge or replace */
 522 {
 523         struct xfs_mount                *mp = pag->pag_mount;
 524         struct xfs_btree_cur            *cur;
 525         int                             error;
 526         int                             i;
 527         struct xfs_inobt_rec_incore     rec;
 528
 529         cur = xfs_inobt_init_cursor(pag, tp, agbp, btnum);
 530
 531         /* the new record is pre-aligned so we know where to look */
 532         error = xfs_inobt_lookup(cur, nrec->ir_startino, XFS_LOOKUP_EQ, &i);
 533         if (error)
 534                 goto error;
 535         /* if nothing there, insert a new record and return */
 536         if (i == 0) {
 537                 error = xfs_inobt_insert_rec(cur, nrec->ir_holemask,
 538                                              nrec->ir_count, nrec->ir_freecount,
 539                                              nrec->ir_free, &i);
 540                 if (error)
 541                         goto error;
 542                 if (XFS_IS_CORRUPT(mp, i != 1)) {
 543                         error = -EFSCORRUPTED;
 544                         goto error;
 545                 }
 546
 547                 goto out;
 548         }
 549
 550         /*
 551          * A record exists at this startino. Merge or replace the record
 552          * depending on what we've been asked to do.
 553          */
 554         if (merge) {
 555                 error = xfs_inobt_get_rec(cur, &rec, &i);
 556                 if (error)
 557                         goto error;
 558                 if (XFS_IS_CORRUPT(mp, i != 1)) {
 559                         error = -EFSCORRUPTED;
 560                         goto error;
 561                 }
 562                 if (XFS_IS_CORRUPT(mp, rec.ir_startino != nrec->ir_startino)) {
 563                         error = -EFSCORRUPTED;
 564                         goto error;
 565                 }
 566
 567                 /*
 568                  * This should never fail. If we have coexisting records that
 569                  * cannot merge, something is seriously wrong.
 570                  */
 571                 if (XFS_IS_CORRUPT(mp, !__xfs_inobt_can_merge(nrec, &rec))) {
 572                         error = -EFSCORRUPTED;
 573                         goto error;
 574                 }
 575
 576                 trace_xfs_irec_merge_pre(mp, pag->pag_agno, rec.ir_startino,
 577                                          rec.ir_holemask, nrec->ir_startino,
 578                                          nrec->ir_holemask);
 579
 580                 /* merge to nrec to output the updated record */
 581                 __xfs_inobt_rec_merge(nrec, &rec);
 582
 583                 trace_xfs_irec_merge_post(mp, pag->pag_agno, nrec->ir_startino,
 584                                           nrec->ir_holemask);
 585
 586                 error = xfs_inobt_rec_check_count(mp, nrec);
 587                 if (error)
 588                         goto error;
 589         }
 590
 591         error = xfs_inobt_update(cur, nrec);
 592         if (error)
 593                 goto error;
 594
 595 out:
 596         xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
 597         return 0;
 598 error:
 599         xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
 600         return error;
 601 }
 602
 603 /*
 604  * Allocate new inodes in the allocation group specified by agbp.  Returns 0 if
 605  * inodes were allocated in this AG; -EAGAIN if there was no space in this AG so
 606  * the caller knows it can try another AG, a hard -ENOSPC when over the maximum
 607  * inode count threshold, or the usual negative error code for other errors.
 608  */
 609 STATIC int
 610 xfs_ialloc_ag_alloc(
 611         struct xfs_perag        *pag,
 612         struct xfs_trans        *tp,
 613         struct xfs_buf          *agbp)
 614 {
 615         struct xfs_agi          *agi;
 616         struct xfs_alloc_arg    args;
 617         int                     error;
 618         xfs_agino_t             newino;         /* new first inode's number */
 619         xfs_agino_t             newlen;         /* new number of inodes */
 620         int                     isaligned = 0;  /* inode allocation at stripe */
 621                                                 /* unit boundary */
 622         /* init. to full chunk */
 623         struct xfs_inobt_rec_incore rec;
 624         struct xfs_ino_geometry *igeo = M_IGEO(tp->t_mountp);
 625         uint16_t                allocmask = (uint16_t) -1;
 626         int                     do_sparse = 0;
 627
 628         memset(&args, 0, sizeof(args));
 629         args.tp = tp;
 630         args.mp = tp->t_mountp;
 631         args.fsbno = NULLFSBLOCK;
 632         args.oinfo = XFS_RMAP_OINFO_INODES;
 633         args.pag = pag;
 634
 635 #ifdef DEBUG
 636         /* randomly do sparse inode allocations */
 637         if (xfs_has_sparseinodes(tp->t_mountp) &&
 638             igeo->ialloc_min_blks < igeo->ialloc_blks)
 639                 do_sparse = get_random_u32_below(2);
 640 #endif
 641
 642         /*
 643          * Locking will ensure that we don't have two callers in here
 644          * at one time.
 645          */
 646         newlen = igeo->ialloc_inos;
 647         if (igeo->maxicount &&
 648             percpu_counter_read_positive(&args.mp->m_icount) + newlen >
 649                                                         igeo->maxicount)
 650                 return -ENOSPC;
 651         args.minlen = args.maxlen = igeo->ialloc_blks;
 652         /*
 653          * First try to allocate inodes contiguous with the last-allocated
 654          * chunk of inodes.  If the filesystem is striped, this will fill
 655          * an entire stripe unit with inodes.
 656          */
 657         agi = agbp->b_addr;
 658         newino = be32_to_cpu(agi->agi_newino);
 659         args.agbno = XFS_AGINO_TO_AGBNO(args.mp, newino) +
 660                      igeo->ialloc_blks;
 661         if (do_sparse)
 662                 goto sparse_alloc;
 663         if (likely(newino != NULLAGINO &&
 664                   (args.agbno < be32_to_cpu(agi->agi_length)))) {
 665                 args.prod = 1;
 666
 667                 /*
 668                  * We need to take into account alignment here to ensure that
 669                  * we don't modify the free list if we fail to have an exact
 670                  * block. If we don't have an exact match, and every oher
 671                  * attempt allocation attempt fails, we'll end up cancelling
 672                  * a dirty transaction and shutting down.
 673                  *
 674                  * For an exact allocation, alignment must be 1,
 675                  * however we need to take cluster alignment into account when
 676                  * fixing up the freelist. Use the minalignslop field to
 677                  * indicate that extra blocks might be required for alignment,
 678                  * but not to use them in the actual exact allocation.
 679                  */
 680                 args.alignment = 1;
 681                 args.minalignslop = igeo->cluster_align - 1;
 682
 683                 /* Allow space for the inode btree to split. */
 684                 args.minleft = igeo->inobt_maxlevels;
 685                 error = xfs_alloc_vextent_exact_bno(&args,
 686                                 XFS_AGB_TO_FSB(args.mp, pag->pag_agno,
 687                                                 args.agbno));
 688                 if (error)
 689                         return error;
 690
 691                 /*
 692                  * This request might have dirtied the transaction if the AG can
 693                  * satisfy the request, but the exact block was not available.
 694                  * If the allocation did fail, subsequent requests will relax
 695                  * the exact agbno requirement and increase the alignment
 696                  * instead. It is critical that the total size of the request
 697                  * (len + alignment + slop) does not increase from this point
 698                  * on, so reset minalignslop to ensure it is not included in
 699                  * subsequent requests.
 700                  */
 701                 args.minalignslop = 0;
 702         }
 703
 704         if (unlikely(args.fsbno == NULLFSBLOCK)) {
 705                 /*
 706                  * Set the alignment for the allocation.
 707                  * If stripe alignment is turned on then align at stripe unit
 708                  * boundary.
 709                  * If the cluster size is smaller than a filesystem block
 710                  * then we're doing I/O for inodes in filesystem block size
 711                  * pieces, so don't need alignment anyway.
 712                  */
 713                 isaligned = 0;
 714                 if (igeo->ialloc_align) {
 715                         ASSERT(!xfs_has_noalign(args.mp));
 716                         args.alignment = args.mp->m_dalign;
 717                         isaligned = 1;
 718                 } else
 719                         args.alignment = igeo->cluster_align;
 720                 /*
 721                  * Allocate a fixed-size extent of inodes.
 722                  */
 723                 args.prod = 1;
 724                 /*
 725                  * Allow space for the inode btree to split.
 726                  */
 727                 args.minleft = igeo->inobt_maxlevels;
 728                 error = xfs_alloc_vextent_near_bno(&args,
 729                                 XFS_AGB_TO_FSB(args.mp, pag->pag_agno,
 730                                                 be32_to_cpu(agi->agi_root)));
 731                 if (error)
 732                         return error;
 733         }
 734
 735         /*
 736          * If stripe alignment is turned on, then try again with cluster
 737          * alignment.
 738          */
 739         if (isaligned && args.fsbno == NULLFSBLOCK) {
 740                 args.alignment = igeo->cluster_align;
 741                 error = xfs_alloc_vextent_near_bno(&args,
 742                                 XFS_AGB_TO_FSB(args.mp, pag->pag_agno,
 743                                                 be32_to_cpu(agi->agi_root)));
 744                 if (error)
 745                         return error;
 746         }
 747
 748         /*
 749          * Finally, try a sparse allocation if the filesystem supports it and
 750          * the sparse allocation length is smaller than a full chunk.
 751          */
 752         if (xfs_has_sparseinodes(args.mp) &&
 753             igeo->ialloc_min_blks < igeo->ialloc_blks &&
 754             args.fsbno == NULLFSBLOCK) {
 755 sparse_alloc:
 756                 args.alignment = args.mp->m_sb.sb_spino_align;
 757                 args.prod = 1;
 758
 759                 args.minlen = igeo->ialloc_min_blks;
 760                 args.maxlen = args.minlen;
 761
 762                 /*
 763                  * The inode record will be aligned to full chunk size. We must
 764                  * prevent sparse allocation from AG boundaries that result in
 765                  * invalid inode records, such as records that start at agbno 0
 766                  * or extend beyond the AG.
 767                  *
 768                  * Set min agbno to the first aligned, non-zero agbno and max to
 769                  * the last aligned agbno that is at least one full chunk from
 770                  * the end of the AG.
 771                  */
 772                 args.min_agbno = args.mp->m_sb.sb_inoalignmt;
 773                 args.max_agbno = round_down(args.mp->m_sb.sb_agblocks,
 774                                             args.mp->m_sb.sb_inoalignmt) -
 775                                  igeo->ialloc_blks;
 776
 777                 error = xfs_alloc_vextent_near_bno(&args,
 778                                 XFS_AGB_TO_FSB(args.mp, pag->pag_agno,
 779                                                 be32_to_cpu(agi->agi_root)));
 780                 if (error)
 781                         return error;
 782
 783                 newlen = XFS_AGB_TO_AGINO(args.mp, args.len);
 784                 ASSERT(newlen <= XFS_INODES_PER_CHUNK);
 785                 allocmask = (1 << (newlen / XFS_INODES_PER_HOLEMASK_BIT)) - 1;
 786         }
 787
 788         if (args.fsbno == NULLFSBLOCK)
 789                 return -EAGAIN;
 790
 791         ASSERT(args.len == args.minlen);
 792
 793         /*
 794          * Stamp and write the inode buffers.
 795          *
 796          * Seed the new inode cluster with a random generation number. This
 797          * prevents short-term reuse of generation numbers if a chunk is
 798          * freed and then immediately reallocated. We use random numbers
 799          * rather than a linear progression to prevent the next generation
 800          * number from being easily guessable.
 801          */
 802         error = xfs_ialloc_inode_init(args.mp, tp, NULL, newlen, pag->pag_agno,
 803                         args.agbno, args.len, get_random_u32());
 804
 805         if (error)
 806                 return error;
 807         /*
 808          * Convert the results.
 809          */
 810         newino = XFS_AGB_TO_AGINO(args.mp, args.agbno);
 811
 812         if (xfs_inobt_issparse(~allocmask)) {
 813                 /*
 814                  * We've allocated a sparse chunk. Align the startino and mask.
 815                  */
 816                 xfs_align_sparse_ino(args.mp, &newino, &allocmask);
 817
 818                 rec.ir_startino = newino;
 819                 rec.ir_holemask = ~allocmask;
 820                 rec.ir_count = newlen;
 821                 rec.ir_freecount = newlen;
 822                 rec.ir_free = XFS_INOBT_ALL_FREE;
 823
 824                 /*
 825                  * Insert the sparse record into the inobt and allow for a merge
 826                  * if necessary. If a merge does occur, rec is updated to the
 827                  * merged record.
 828                  */
 829                 error = xfs_inobt_insert_sprec(pag, tp, agbp,
 830                                 XFS_BTNUM_INO, &rec, true);
 831                 if (error == -EFSCORRUPTED) {
 832                         xfs_alert(args.mp,
 833         "invalid sparse inode record: ino 0x%llx holemask 0x%x count %u",
 834                                   XFS_AGINO_TO_INO(args.mp, pag->pag_agno,
 835                                                    rec.ir_startino),
 836                                   rec.ir_holemask, rec.ir_count);
 837                         xfs_force_shutdown(args.mp, SHUTDOWN_CORRUPT_INCORE);
 838                 }
 839                 if (error)
 840                         return error;
 841
 842                 /*
 843                  * We can't merge the part we've just allocated as for the inobt
 844                  * due to finobt semantics. The original record may or may not
 845                  * exist independent of whether physical inodes exist in this
 846                  * sparse chunk.
 847                  *
 848                  * We must update the finobt record based on the inobt record.
 849                  * rec contains the fully merged and up to date inobt record
 850                  * from the previous call. Set merge false to replace any
 851                  * existing record with this one.
 852                  */
 853                 if (xfs_has_finobt(args.mp)) {
 854                         error = xfs_inobt_insert_sprec(pag, tp, agbp,
 855                                        XFS_BTNUM_FINO, &rec, false);
 856                         if (error)
 857                                 return error;
 858                 }
 859         } else {
 860                 /* full chunk - insert new records to both btrees */
 861                 error = xfs_inobt_insert(pag, tp, agbp, newino, newlen,
 862                                          XFS_BTNUM_INO);
 863                 if (error)
 864                         return error;
 865
 866                 if (xfs_has_finobt(args.mp)) {
 867                         error = xfs_inobt_insert(pag, tp, agbp, newino,
 868                                                  newlen, XFS_BTNUM_FINO);
 869                         if (error)
 870                                 return error;
 871                 }
 872         }
 873
 874         /*
 875          * Update AGI counts and newino.
 876          */
 877         be32_add_cpu(&agi->agi_count, newlen);
 878         be32_add_cpu(&agi->agi_freecount, newlen);
 879         pag->pagi_freecount += newlen;
 880         pag->pagi_count += newlen;
 881         agi->agi_newino = cpu_to_be32(newino);
 882
 883         /*
 884          * Log allocation group header fields
 885          */
 886         xfs_ialloc_log_agi(tp, agbp,
 887                 XFS_AGI_COUNT | XFS_AGI_FREECOUNT | XFS_AGI_NEWINO);
 888         /*
 889          * Modify/log superblock values for inode count and inode free count.
 890          */
 891         xfs_trans_mod_sb(tp, XFS_TRANS_SB_ICOUNT, (long)newlen);
 892         xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, (long)newlen);
 893         return 0;
 894 }
 895
 896 /*
 897  * Try to retrieve the next record to the left/right from the current one.
 898  */
 899 STATIC int
 900 xfs_ialloc_next_rec(
 901         struct xfs_btree_cur    *cur,
 902         xfs_inobt_rec_incore_t  *rec,
 903         int                     *done,
 904         int                     left)
 905 {
 906         int                     error;
 907         int                     i;
 908
 909         if (left)
 910                 error = xfs_btree_decrement(cur, 0, &i);
 911         else
 912                 error = xfs_btree_increment(cur, 0, &i);
 913
 914         if (error)
 915                 return error;
 916         *done = !i;
 917         if (i) {
 918                 error = xfs_inobt_get_rec(cur, rec, &i);
 919                 if (error)
 920                         return error;
 921                 if (XFS_IS_CORRUPT(cur->bc_mp, i != 1))
 922                         return -EFSCORRUPTED;
 923         }
 924
 925         return 0;
 926 }
 927
 928 STATIC int
 929 xfs_ialloc_get_rec(
 930         struct xfs_btree_cur    *cur,
 931         xfs_agino_t             agino,
 932         xfs_inobt_rec_incore_t  *rec,
 933         int                     *done)
 934 {
 935         int                     error;
 936         int                     i;
 937
 938         error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_EQ, &i);
 939         if (error)
 940                 return error;
 941         *done = !i;
 942         if (i) {
 943                 error = xfs_inobt_get_rec(cur, rec, &i);
 944                 if (error)
 945                         return error;
 946                 if (XFS_IS_CORRUPT(cur->bc_mp, i != 1))
 947                         return -EFSCORRUPTED;
 948         }
 949
 950         return 0;
 951 }
 952
 953 /*
 954  * Return the offset of the first free inode in the record. If the inode chunk
 955  * is sparsely allocated, we convert the record holemask to inode granularity
 956  * and mask off the unallocated regions from the inode free mask.
 957  */
 958 STATIC int
 959 xfs_inobt_first_free_inode(
 960         struct xfs_inobt_rec_incore     *rec)
 961 {
 962         xfs_inofree_t                   realfree;
 963
 964         /* if there are no holes, return the first available offset */
 965         if (!xfs_inobt_issparse(rec->ir_holemask))
 966                 return xfs_lowbit64(rec->ir_free);
 967
 968         realfree = xfs_inobt_irec_to_allocmask(rec);
 969         realfree &= rec->ir_free;
 970
 971         return xfs_lowbit64(realfree);
 972 }
 973
 974 /*
 975  * Allocate an inode using the inobt-only algorithm.
 976  */
 977 STATIC int
 978 xfs_dialloc_ag_inobt(
 979         struct xfs_perag        *pag,
 980         struct xfs_trans        *tp,
 981         struct xfs_buf          *agbp,
 982         xfs_ino_t               parent,
 983         xfs_ino_t               *inop)
 984 {
 985         struct xfs_mount        *mp = tp->t_mountp;
 986         struct xfs_agi          *agi = agbp->b_addr;
 987         xfs_agnumber_t          pagno = XFS_INO_TO_AGNO(mp, parent);
 988         xfs_agino_t             pagino = XFS_INO_TO_AGINO(mp, parent);
 989         struct xfs_btree_cur    *cur, *tcur;
 990         struct xfs_inobt_rec_incore rec, trec;
 991         xfs_ino_t               ino;
 992         int                     error;
 993         int                     offset;
 994         int                     i, j;
 995         int                     searchdistance = 10;
 996
 997         ASSERT(xfs_perag_initialised_agi(pag));
 998         ASSERT(xfs_perag_allows_inodes(pag));
 999         ASSERT(pag->pagi_freecount > 0);
1000
1001  restart_pagno:
1002         cur = xfs_inobt_init_cursor(pag, tp, agbp, XFS_BTNUM_INO);
1003         /*
1004          * If pagino is 0 (this is the root inode allocation) use newino.
1005          * This must work because we've just allocated some.
1006          */
1007         if (!pagino)
1008                 pagino = be32_to_cpu(agi->agi_newino);
1009
1010         error = xfs_check_agi_freecount(cur);
1011         if (error)
1012                 goto error0;
1013
1014         /*
1015          * If in the same AG as the parent, try to get near the parent.
1016          */
1017         if (pagno == pag->pag_agno) {
1018                 int             doneleft;       /* done, to the left */
1019                 int             doneright;      /* done, to the right */
1020
1021                 error = xfs_inobt_lookup(cur, pagino, XFS_LOOKUP_LE, &i);
1022                 if (error)
1023                         goto error0;
1024                 if (XFS_IS_CORRUPT(mp, i != 1)) {
1025                         error = -EFSCORRUPTED;
1026                         goto error0;
1027                 }
1028
1029                 error = xfs_inobt_get_rec(cur, &rec, &j);
1030                 if (error)
1031                         goto error0;
1032                 if (XFS_IS_CORRUPT(mp, j != 1)) {
1033                         error = -EFSCORRUPTED;
1034                         goto error0;
1035                 }
1036
1037                 if (rec.ir_freecount > 0) {
1038                         /*
1039                          * Found a free inode in the same chunk
1040                          * as the parent, done.
1041                          */
1042                         goto alloc_inode;
1043                 }
1044
1045
1046                 /*
1047                  * In the same AG as parent, but parent's chunk is full.
1048                  */
1049
1050                 /* duplicate the cursor, search left & right simultaneously */
1051                 error = xfs_btree_dup_cursor(cur, &tcur);
1052                 if (error)
1053                         goto error0;
1054
1055                 /*
1056                  * Skip to last blocks looked up if same parent inode.
1057                  */
1058                 if (pagino != NULLAGINO &&
1059                     pag->pagl_pagino == pagino &&
1060                     pag->pagl_leftrec != NULLAGINO &&
1061                     pag->pagl_rightrec != NULLAGINO) {
1062                         error = xfs_ialloc_get_rec(tcur, pag->pagl_leftrec,
1063                                                    &trec, &doneleft);
1064                         if (error)
1065                                 goto error1;
1066
1067                         error = xfs_ialloc_get_rec(cur, pag->pagl_rightrec,
1068                                                    &rec, &doneright);
1069                         if (error)
1070                                 goto error1;
1071                 } else {
1072                         /* search left with tcur, back up 1 record */
1073                         error = xfs_ialloc_next_rec(tcur, &trec, &doneleft, 1);
1074                         if (error)
1075                                 goto error1;
1076
1077                         /* search right with cur, go forward 1 record. */
1078                         error = xfs_ialloc_next_rec(cur, &rec, &doneright, 0);
1079                         if (error)
1080                                 goto error1;
1081                 }
1082
1083                 /*
1084                  * Loop until we find an inode chunk with a free inode.
1085                  */
1086                 while (--searchdistance > 0 && (!doneleft || !doneright)) {
1087                         int     useleft;  /* using left inode chunk this time */
1088
1089                         /* figure out the closer block if both are valid. */
1090                         if (!doneleft && !doneright) {
1091                                 useleft = pagino -
1092                                  (trec.ir_startino + XFS_INODES_PER_CHUNK - 1) <
1093                                   rec.ir_startino - pagino;
1094                         } else {
1095                                 useleft = !doneleft;
1096                         }
1097
1098                         /* free inodes to the left? */
1099                         if (useleft && trec.ir_freecount) {
1100                                 xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
1101                                 cur = tcur;
1102
1103                                 pag->pagl_leftrec = trec.ir_startino;
1104                                 pag->pagl_rightrec = rec.ir_startino;
1105                                 pag->pagl_pagino = pagino;
1106                                 rec = trec;
1107                                 goto alloc_inode;
1108                         }
1109
1110                         /* free inodes to the right? */
1111                         if (!useleft && rec.ir_freecount) {
1112                                 xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
1113
1114                                 pag->pagl_leftrec = trec.ir_startino;
1115                                 pag->pagl_rightrec = rec.ir_startino;
1116                                 pag->pagl_pagino = pagino;
1117                                 goto alloc_inode;
1118                         }
1119
1120                         /* get next record to check */
1121                         if (useleft) {
1122                                 error = xfs_ialloc_next_rec(tcur, &trec,
1123                                                                  &doneleft, 1);
1124                         } else {
1125                                 error = xfs_ialloc_next_rec(cur, &rec,
1126                                                                  &doneright, 0);
1127                         }
1128                         if (error)
1129                                 goto error1;
1130                 }
1131
1132                 if (searchdistance <= 0) {
1133                         /*
1134                          * Not in range - save last search
1135                          * location and allocate a new inode
1136                          */
1137                         xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
1138                         pag->pagl_leftrec = trec.ir_startino;
1139                         pag->pagl_rightrec = rec.ir_startino;
1140                         pag->pagl_pagino = pagino;
1141
1142                 } else {
1143                         /*
1144                          * We've reached the end of the btree. because
1145                          * we are only searching a small chunk of the
1146                          * btree each search, there is obviously free
1147                          * inodes closer to the parent inode than we
1148                          * are now. restart the search again.
1149                          */
1150                         pag->pagl_pagino = NULLAGINO;
1151                         pag->pagl_leftrec = NULLAGINO;
1152                         pag->pagl_rightrec = NULLAGINO;
1153                         xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
1154                         xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
1155                         goto restart_pagno;
1156                 }
1157         }
1158
1159         /*
1160          * In a different AG from the parent.
1161          * See if the most recently allocated block has any free.
1162          */
1163         if (agi->agi_newino != cpu_to_be32(NULLAGINO)) {
1164                 error = xfs_inobt_lookup(cur, be32_to_cpu(agi->agi_newino),
1165                                          XFS_LOOKUP_EQ, &i);
1166                 if (error)
1167                         goto error0;
1168
1169                 if (i == 1) {
1170                         error = xfs_inobt_get_rec(cur, &rec, &j);
1171                         if (error)
1172                                 goto error0;
1173
1174                         if (j == 1 && rec.ir_freecount > 0) {
1175                                 /*
1176                                  * The last chunk allocated in the group
1177                                  * still has a free inode.
1178                                  */
1179                                 goto alloc_inode;
1180                         }
1181                 }
1182         }
1183
1184         /*
1185          * None left in the last group, search the whole AG
1186          */
1187         error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &i);
1188         if (error)
1189                 goto error0;
1190         if (XFS_IS_CORRUPT(mp, i != 1)) {
1191                 error = -EFSCORRUPTED;
1192                 goto error0;
1193         }
1194
1195         for (;;) {
1196                 error = xfs_inobt_get_rec(cur, &rec, &i);
1197                 if (error)
1198                         goto error0;
1199                 if (XFS_IS_CORRUPT(mp, i != 1)) {
1200                         error = -EFSCORRUPTED;
1201                         goto error0;
1202                 }
1203                 if (rec.ir_freecount > 0)
1204                         break;
1205                 error = xfs_btree_increment(cur, 0, &i);
1206                 if (error)
1207                         goto error0;
1208                 if (XFS_IS_CORRUPT(mp, i != 1)) {
1209                         error = -EFSCORRUPTED;
1210                         goto error0;
1211                 }
1212         }
1213
1214 alloc_inode:
1215         offset = xfs_inobt_first_free_inode(&rec);
1216         ASSERT(offset >= 0);
1217         ASSERT(offset < XFS_INODES_PER_CHUNK);
1218         ASSERT((XFS_AGINO_TO_OFFSET(mp, rec.ir_startino) %
1219                                    XFS_INODES_PER_CHUNK) == 0);
1220         ino = XFS_AGINO_TO_INO(mp, pag->pag_agno, rec.ir_startino + offset);
1221         rec.ir_free &= ~XFS_INOBT_MASK(offset);
1222         rec.ir_freecount--;
1223         error = xfs_inobt_update(cur, &rec);
1224         if (error)
1225                 goto error0;
1226         be32_add_cpu(&agi->agi_freecount, -1);
1227         xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT);
1228         pag->pagi_freecount--;
1229
1230         error = xfs_check_agi_freecount(cur);
1231         if (error)
1232                 goto error0;
1233
1234         xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
1235         xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -1);
1236         *inop = ino;
1237         return 0;
1238 error1:
1239         xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
1240 error0:
1241         xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
1242         return error;
1243 }
1244
1245 /*
1246  * Use the free inode btree to allocate an inode based on distance from the
1247  * parent. Note that the provided cursor may be deleted and replaced.
1248  */
1249 STATIC int
1250 xfs_dialloc_ag_finobt_near(
1251         xfs_agino_t                     pagino,
1252         struct xfs_btree_cur            **ocur,
1253         struct xfs_inobt_rec_incore     *rec)
1254 {
1255         struct xfs_btree_cur            *lcur = *ocur;  /* left search cursor */
1256         struct xfs_btree_cur            *rcur;  /* right search cursor */
1257         struct xfs_inobt_rec_incore     rrec;
1258         int                             error;
1259         int                             i, j;
1260
1261         error = xfs_inobt_lookup(lcur, pagino, XFS_LOOKUP_LE, &i);
1262         if (error)
1263                 return error;
1264
1265         if (i == 1) {
1266                 error = xfs_inobt_get_rec(lcur, rec, &i);
1267                 if (error)
1268                         return error;
1269                 if (XFS_IS_CORRUPT(lcur->bc_mp, i != 1))
1270                         return -EFSCORRUPTED;
1271
1272                 /*
1273                  * See if we've landed in the parent inode record. The finobt
1274                  * only tracks chunks with at least one free inode, so record
1275                  * existence is enough.
1276                  */
1277                 if (pagino >= rec->ir_startino &&
1278                     pagino < (rec->ir_startino + XFS_INODES_PER_CHUNK))
1279                         return 0;
1280         }
1281
1282         error = xfs_btree_dup_cursor(lcur, &rcur);
1283         if (error)
1284                 return error;
1285
1286         error = xfs_inobt_lookup(rcur, pagino, XFS_LOOKUP_GE, &j);
1287         if (error)
1288                 goto error_rcur;
1289         if (j == 1) {
1290                 error = xfs_inobt_get_rec(rcur, &rrec, &j);
1291                 if (error)
1292                         goto error_rcur;
1293                 if (XFS_IS_CORRUPT(lcur->bc_mp, j != 1)) {
1294                         error = -EFSCORRUPTED;
1295                         goto error_rcur;
1296                 }
1297         }
1298
1299         if (XFS_IS_CORRUPT(lcur->bc_mp, i != 1 && j != 1)) {
1300                 error = -EFSCORRUPTED;
1301                 goto error_rcur;
1302         }
1303         if (i == 1 && j == 1) {
1304                 /*
1305                  * Both the left and right records are valid. Choose the closer
1306                  * inode chunk to the target.
1307                  */
1308                 if ((pagino - rec->ir_startino + XFS_INODES_PER_CHUNK - 1) >
1309                     (rrec.ir_startino - pagino)) {
1310                         *rec = rrec;
1311                         xfs_btree_del_cursor(lcur, XFS_BTREE_NOERROR);
1312                         *ocur = rcur;
1313                 } else {
1314                         xfs_btree_del_cursor(rcur, XFS_BTREE_NOERROR);
1315                 }
1316         } else if (j == 1) {
1317                 /* only the right record is valid */
1318                 *rec = rrec;
1319                 xfs_btree_del_cursor(lcur, XFS_BTREE_NOERROR);
1320                 *ocur = rcur;
1321         } else if (i == 1) {
1322                 /* only the left record is valid */
1323                 xfs_btree_del_cursor(rcur, XFS_BTREE_NOERROR);
1324         }
1325
1326         return 0;
1327
1328 error_rcur:
1329         xfs_btree_del_cursor(rcur, XFS_BTREE_ERROR);
1330         return error;
1331 }
1332
1333 /*
1334  * Use the free inode btree to find a free inode based on a newino hint. If
1335  * the hint is NULL, find the first free inode in the AG.
1336  */
1337 STATIC int
1338 xfs_dialloc_ag_finobt_newino(
1339         struct xfs_agi                  *agi,
1340         struct xfs_btree_cur            *cur,
1341         struct xfs_inobt_rec_incore     *rec)
1342 {
1343         int error;
1344         int i;
1345
1346         if (agi->agi_newino != cpu_to_be32(NULLAGINO)) {
1347                 error = xfs_inobt_lookup(cur, be32_to_cpu(agi->agi_newino),
1348                                          XFS_LOOKUP_EQ, &i);
1349                 if (error)
1350                         return error;
1351                 if (i == 1) {
1352                         error = xfs_inobt_get_rec(cur, rec, &i);
1353                         if (error)
1354                                 return error;
1355                         if (XFS_IS_CORRUPT(cur->bc_mp, i != 1))
1356                                 return -EFSCORRUPTED;
1357                         return 0;
1358                 }
1359         }
1360
1361         /*
1362          * Find the first inode available in the AG.
1363          */
1364         error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &i);
1365         if (error)
1366                 return error;
1367         if (XFS_IS_CORRUPT(cur->bc_mp, i != 1))
1368                 return -EFSCORRUPTED;
1369
1370         error = xfs_inobt_get_rec(cur, rec, &i);
1371         if (error)
1372                 return error;
1373         if (XFS_IS_CORRUPT(cur->bc_mp, i != 1))
1374                 return -EFSCORRUPTED;
1375
1376         return 0;
1377 }
1378
1379 /*
1380  * Update the inobt based on a modification made to the finobt. Also ensure that
1381  * the records from both trees are equivalent post-modification.
1382  */
1383 STATIC int
1384 xfs_dialloc_ag_update_inobt(
1385         struct xfs_btree_cur            *cur,   /* inobt cursor */
1386         struct xfs_inobt_rec_incore     *frec,  /* finobt record */
1387         int                             offset) /* inode offset */
1388 {
1389         struct xfs_inobt_rec_incore     rec;
1390         int                             error;
1391         int                             i;
1392
1393         error = xfs_inobt_lookup(cur, frec->ir_startino, XFS_LOOKUP_EQ, &i);
1394         if (error)
1395                 return error;
1396         if (XFS_IS_CORRUPT(cur->bc_mp, i != 1))
1397                 return -EFSCORRUPTED;
1398
1399         error = xfs_inobt_get_rec(cur, &rec, &i);
1400         if (error)
1401                 return error;
1402         if (XFS_IS_CORRUPT(cur->bc_mp, i != 1))
1403                 return -EFSCORRUPTED;
1404         ASSERT((XFS_AGINO_TO_OFFSET(cur->bc_mp, rec.ir_startino) %
1405                                    XFS_INODES_PER_CHUNK) == 0);
1406
1407         rec.ir_free &= ~XFS_INOBT_MASK(offset);
1408         rec.ir_freecount--;
1409
1410         if (XFS_IS_CORRUPT(cur->bc_mp,
1411                            rec.ir_free != frec->ir_free ||
1412                            rec.ir_freecount != frec->ir_freecount))
1413                 return -EFSCORRUPTED;
1414
1415         return xfs_inobt_update(cur, &rec);
1416 }
1417
1418 /*
1419  * Allocate an inode using the free inode btree, if available. Otherwise, fall
1420  * back to the inobt search algorithm.
1421  *
1422  * The caller selected an AG for us, and made sure that free inodes are
1423  * available.
1424  */
1425 static int
1426 xfs_dialloc_ag(
1427         struct xfs_perag        *pag,
1428         struct xfs_trans        *tp,
1429         struct xfs_buf          *agbp,
1430         xfs_ino_t               parent,
1431         xfs_ino_t               *inop)
1432 {
1433         struct xfs_mount                *mp = tp->t_mountp;
1434         struct xfs_agi                  *agi = agbp->b_addr;
1435         xfs_agnumber_t                  pagno = XFS_INO_TO_AGNO(mp, parent);
1436         xfs_agino_t                     pagino = XFS_INO_TO_AGINO(mp, parent);
1437         struct xfs_btree_cur            *cur;   /* finobt cursor */
1438         struct xfs_btree_cur            *icur;  /* inobt cursor */
1439         struct xfs_inobt_rec_incore     rec;
1440         xfs_ino_t                       ino;
1441         int                             error;
1442         int                             offset;
1443         int                             i;
1444
1445         if (!xfs_has_finobt(mp))
1446                 return xfs_dialloc_ag_inobt(pag, tp, agbp, parent, inop);
1447
1448         /*
1449          * If pagino is 0 (this is the root inode allocation) use newino.
1450          * This must work because we've just allocated some.
1451          */
1452         if (!pagino)
1453                 pagino = be32_to_cpu(agi->agi_newino);
1454
1455         cur = xfs_inobt_init_cursor(pag, tp, agbp, XFS_BTNUM_FINO);
1456
1457         error = xfs_check_agi_freecount(cur);
1458         if (error)
1459                 goto error_cur;
1460
1461         /*
1462          * The search algorithm depends on whether we're in the same AG as the
1463          * parent. If so, find the closest available inode to the parent. If
1464          * not, consider the agi hint or find the first free inode in the AG.
1465          */
1466         if (pag->pag_agno == pagno)
1467                 error = xfs_dialloc_ag_finobt_near(pagino, &cur, &rec);
1468         else
1469                 error = xfs_dialloc_ag_finobt_newino(agi, cur, &rec);
1470         if (error)
1471                 goto error_cur;
1472
1473         offset = xfs_inobt_first_free_inode(&rec);
1474         ASSERT(offset >= 0);
1475         ASSERT(offset < XFS_INODES_PER_CHUNK);
1476         ASSERT((XFS_AGINO_TO_OFFSET(mp, rec.ir_startino) %
1477                                    XFS_INODES_PER_CHUNK) == 0);
1478         ino = XFS_AGINO_TO_INO(mp, pag->pag_agno, rec.ir_startino + offset);
1479
1480         /*
1481          * Modify or remove the finobt record.
1482          */
1483         rec.ir_free &= ~XFS_INOBT_MASK(offset);
1484         rec.ir_freecount--;
1485         if (rec.ir_freecount)
1486                 error = xfs_inobt_update(cur, &rec);
1487         else
1488                 error = xfs_btree_delete(cur, &i);
1489         if (error)
1490                 goto error_cur;
1491
1492         /*
1493          * The finobt has now been updated appropriately. We haven't updated the
1494          * agi and superblock yet, so we can create an inobt cursor and validate
1495          * the original freecount. If all is well, make the equivalent update to
1496          * the inobt using the finobt record and offset information.
1497          */
1498         icur = xfs_inobt_init_cursor(pag, tp, agbp, XFS_BTNUM_INO);
1499
1500         error = xfs_check_agi_freecount(icur);
1501         if (error)
1502                 goto error_icur;
1503
1504         error = xfs_dialloc_ag_update_inobt(icur, &rec, offset);
1505         if (error)
1506                 goto error_icur;
1507
1508         /*
1509          * Both trees have now been updated. We must update the perag and
1510          * superblock before we can check the freecount for each btree.
1511          */
1512         be32_add_cpu(&agi->agi_freecount, -1);
1513         xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT);
1514         pag->pagi_freecount--;
1515
1516         xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -1);
1517
1518         error = xfs_check_agi_freecount(icur);
1519         if (error)
1520                 goto error_icur;
1521         error = xfs_check_agi_freecount(cur);
1522         if (error)
1523                 goto error_icur;
1524
1525         xfs_btree_del_cursor(icur, XFS_BTREE_NOERROR);
1526         xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
1527         *inop = ino;
1528         return 0;
1529
1530 error_icur:
1531         xfs_btree_del_cursor(icur, XFS_BTREE_ERROR);
1532 error_cur:
1533         xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
1534         return error;
1535 }
1536
1537 static int
1538 xfs_dialloc_roll(
1539         struct xfs_trans        **tpp,
1540         struct xfs_buf          *agibp)
1541 {
1542         struct xfs_trans        *tp = *tpp;
1543         struct xfs_dquot_acct   *dqinfo;
1544         int                     error;
1545
1546         /*
1547          * Hold to on to the agibp across the commit so no other allocation can
1548          * come in and take the free inodes we just allocated for our caller.
1549          */
1550         xfs_trans_bhold(tp, agibp);
1551
1552         /*
1553          * We want the quota changes to be associated with the next transaction,
1554          * NOT this one. So, detach the dqinfo from this and attach it to the
1555          * next transaction.
1556          */
1557         dqinfo = tp->t_dqinfo;
1558         tp->t_dqinfo = NULL;
1559
1560         error = xfs_trans_roll(&tp);
1561
1562         /* Re-attach the quota info that we detached from prev trx. */
1563         tp->t_dqinfo = dqinfo;
1564
1565         /*
1566          * Join the buffer even on commit error so that the buffer is released
1567          * when the caller cancels the transaction and doesn't have to handle
1568          * this error case specially.
1569          */
1570         xfs_trans_bjoin(tp, agibp);
1571         *tpp = tp;
1572         return error;
1573 }
1574
1575 static bool
1576 xfs_dialloc_good_ag(
1577         struct xfs_perag        *pag,
1578         struct xfs_trans        *tp,
1579         umode_t                 mode,
1580         int                     flags,
1581         bool                    ok_alloc)
1582 {
1583         struct xfs_mount        *mp = tp->t_mountp;
1584         xfs_extlen_t            ineed;
1585         xfs_extlen_t            longest = 0;
1586         int                     needspace;
1587         int                     error;
1588
1589         if (!pag)
1590                 return false;
1591         if (!xfs_perag_allows_inodes(pag))
1592                 return false;
1593
1594         if (!xfs_perag_initialised_agi(pag)) {
1595                 error = xfs_ialloc_read_agi(pag, tp, NULL);
1596                 if (error)
1597                         return false;
1598         }
1599
1600         if (pag->pagi_freecount)
1601                 return true;
1602         if (!ok_alloc)
1603                 return false;
1604
1605         if (!xfs_perag_initialised_agf(pag)) {
1606                 error = xfs_alloc_read_agf(pag, tp, flags, NULL);
1607                 if (error)
1608                         return false;
1609         }
1610
1611         /*
1612          * Check that there is enough free space for the file plus a chunk of
1613          * inodes if we need to allocate some. If this is the first pass across
1614          * the AGs, take into account the potential space needed for alignment
1615          * of inode chunks when checking the longest contiguous free space in
1616          * the AG - this prevents us from getting ENOSPC because we have free
1617          * space larger than ialloc_blks but alignment constraints prevent us
1618          * from using it.
1619          *
1620          * If we can't find an AG with space for full alignment slack to be
1621          * taken into account, we must be near ENOSPC in all AGs.  Hence we
1622          * don't include alignment for the second pass and so if we fail
1623          * allocation due to alignment issues then it is most likely a real
1624          * ENOSPC condition.
1625          *
1626          * XXX(dgc): this calculation is now bogus thanks to the per-ag
1627          * reservations that xfs_alloc_fix_freelist() now does via
1628          * xfs_alloc_space_available(). When the AG fills up, pagf_freeblks will
1629          * be more than large enough for the check below to succeed, but
1630          * xfs_alloc_space_available() will fail because of the non-zero
1631          * metadata reservation and hence we won't actually be able to allocate
1632          * more inodes in this AG. We do soooo much unnecessary work near ENOSPC
1633          * because of this.
1634          */
1635         ineed = M_IGEO(mp)->ialloc_min_blks;
1636         if (flags && ineed > 1)
1637                 ineed += M_IGEO(mp)->cluster_align;
1638         longest = pag->pagf_longest;
1639         if (!longest)
1640                 longest = pag->pagf_flcount > 0;
1641         needspace = S_ISDIR(mode) || S_ISREG(mode) || S_ISLNK(mode);
1642
1643         if (pag->pagf_freeblks < needspace + ineed || longest < ineed)
1644                 return false;
1645         return true;
1646 }
1647
1648 static int
1649 xfs_dialloc_try_ag(
1650         struct xfs_perag        *pag,
1651         struct xfs_trans        **tpp,
1652         xfs_ino_t               parent,
1653         xfs_ino_t               *new_ino,
1654         bool                    ok_alloc)
1655 {
1656         struct xfs_buf          *agbp;
1657         xfs_ino_t               ino;
1658         int                     error;
1659
1660         /*
1661          * Then read in the AGI buffer and recheck with the AGI buffer
1662          * lock held.
1663          */
1664         error = xfs_ialloc_read_agi(pag, *tpp, &agbp);
1665         if (error)
1666                 return error;
1667
1668         if (!pag->pagi_freecount) {
1669                 if (!ok_alloc) {
1670                         error = -EAGAIN;
1671                         goto out_release;
1672                 }
1673
1674                 error = xfs_ialloc_ag_alloc(pag, *tpp, agbp);
1675                 if (error < 0)
1676                         goto out_release;
1677
1678                 /*
1679                  * We successfully allocated space for an inode cluster in this
1680                  * AG.  Roll the transaction so that we can allocate one of the
1681                  * new inodes.
1682                  */
1683                 ASSERT(pag->pagi_freecount > 0);
1684                 error = xfs_dialloc_roll(tpp, agbp);
1685                 if (error)
1686                         goto out_release;
1687         }
1688
1689         /* Allocate an inode in the found AG */
1690         error = xfs_dialloc_ag(pag, *tpp, agbp, parent, &ino);
1691         if (!error)
1692                 *new_ino = ino;
1693         return error;
1694
1695 out_release:
1696         xfs_trans_brelse(*tpp, agbp);
1697         return error;
1698 }
1699
1700 /*
1701  * Allocate an on-disk inode.
1702  *
1703  * Mode is used to tell whether the new inode is a directory and hence where to
1704  * locate it. The on-disk inode that is allocated will be returned in @new_ino
1705  * on success, otherwise an error will be set to indicate the failure (e.g.
1706  * -ENOSPC).
1707  */
1708 int
1709 xfs_dialloc(
1710         struct xfs_trans        **tpp,
1711         xfs_ino_t               parent,
1712         umode_t                 mode,
1713         xfs_ino_t               *new_ino)
1714 {
1715         struct xfs_mount        *mp = (*tpp)->t_mountp;
1716         xfs_agnumber_t          agno;
1717         int                     error = 0;
1718         xfs_agnumber_t          start_agno;
1719         struct xfs_perag        *pag;
1720         struct xfs_ino_geometry *igeo = M_IGEO(mp);
1721         bool                    ok_alloc = true;
1722         bool                    low_space = false;
1723         int                     flags;
1724         xfs_ino_t               ino = NULLFSINO;
1725
1726         /*
1727          * Directories, symlinks, and regular files frequently allocate at least
1728          * one block, so factor that potential expansion when we examine whether
1729          * an AG has enough space for file creation.
1730          */
1731         if (S_ISDIR(mode))
1732                 start_agno = (atomic_inc_return(&mp->m_agirotor) - 1) %
1733                                 mp->m_maxagi;
1734         else {
1735                 start_agno = XFS_INO_TO_AGNO(mp, parent);
1736                 if (start_agno >= mp->m_maxagi)
1737                         start_agno = 0;
1738         }
1739
1740         /*
1741          * If we have already hit the ceiling of inode blocks then clear
1742          * ok_alloc so we scan all available agi structures for a free
1743          * inode.
1744          *
1745          * Read rough value of mp->m_icount by percpu_counter_read_positive,
1746          * which will sacrifice the preciseness but improve the performance.
1747          */
1748         if (igeo->maxicount &&
1749             percpu_counter_read_positive(&mp->m_icount) + igeo->ialloc_inos
1750                                                         > igeo->maxicount) {
1751                 ok_alloc = false;
1752         }
1753
1754         /*
1755          * If we are near to ENOSPC, we want to prefer allocation from AGs that
1756          * have free inodes in them rather than use up free space allocating new
1757          * inode chunks. Hence we turn off allocation for the first non-blocking
1758          * pass through the AGs if we are near ENOSPC to consume free inodes
1759          * that we can immediately allocate, but then we allow allocation on the
1760          * second pass if we fail to find an AG with free inodes in it.
1761          */
1762         if (percpu_counter_read_positive(&mp->m_fdblocks) <
1763                         mp->m_low_space[XFS_LOWSP_1_PCNT]) {
1764                 ok_alloc = false;
1765                 low_space = true;
1766         }
1767
1768         /*
1769          * Loop until we find an allocation group that either has free inodes
1770          * or in which we can allocate some inodes.  Iterate through the
1771          * allocation groups upward, wrapping at the end.
1772          */
1773         flags = XFS_ALLOC_FLAG_TRYLOCK;
1774 retry:
1775         for_each_perag_wrap_at(mp, start_agno, mp->m_maxagi, agno, pag) {
1776                 if (xfs_dialloc_good_ag(pag, *tpp, mode, flags, ok_alloc)) {
1777                         error = xfs_dialloc_try_ag(pag, tpp, parent,
1778                                         &ino, ok_alloc);
1779                         if (error != -EAGAIN)
1780                                 break;
1781                         error = 0;
1782                 }
1783
1784                 if (xfs_is_shutdown(mp)) {
1785                         error = -EFSCORRUPTED;
1786                         break;
1787                 }
1788         }
1789         if (pag)
1790                 xfs_perag_rele(pag);
1791         if (error)
1792                 return error;
1793         if (ino == NULLFSINO) {
1794                 if (flags) {
1795                         flags = 0;
1796                         if (low_space)
1797                                 ok_alloc = true;
1798                         goto retry;
1799                 }
1800                 return -ENOSPC;
1801         }
1802         *new_ino = ino;
1803         return 0;
1804 }
1805
1806 /*
1807  * Free the blocks of an inode chunk. We must consider that the inode chunk
1808  * might be sparse and only free the regions that are allocated as part of the
1809  * chunk.
1810  */
1811 STATIC void
1812 xfs_difree_inode_chunk(
1813         struct xfs_trans                *tp,
1814         xfs_agnumber_t                  agno,
1815         struct xfs_inobt_rec_incore     *rec)
1816 {
1817         struct xfs_mount                *mp = tp->t_mountp;
1818         xfs_agblock_t                   sagbno = XFS_AGINO_TO_AGBNO(mp,
1819                                                         rec->ir_startino);
1820         int                             startidx, endidx;
1821         int                             nextbit;
1822         xfs_agblock_t                   agbno;
1823         int                             contigblk;
1824         DECLARE_BITMAP(holemask, XFS_INOBT_HOLEMASK_BITS);
1825
1826         if (!xfs_inobt_issparse(rec->ir_holemask)) {
1827                 /* not sparse, calculate extent info directly */
1828                 xfs_free_extent_later(tp, XFS_AGB_TO_FSB(mp, agno, sagbno),
1829                                   M_IGEO(mp)->ialloc_blks,
1830                                   &XFS_RMAP_OINFO_INODES);
1831                 return;
1832         }
1833
1834         /* holemask is only 16-bits (fits in an unsigned long) */
1835         ASSERT(sizeof(rec->ir_holemask) <= sizeof(holemask[0]));
1836         holemask[0] = rec->ir_holemask;
1837
1838         /*
1839          * Find contiguous ranges of zeroes (i.e., allocated regions) in the
1840          * holemask and convert the start/end index of each range to an extent.
1841          * We start with the start and end index both pointing at the first 0 in
1842          * the mask.
1843          */
1844         startidx = endidx = find_first_zero_bit(holemask,
1845                                                 XFS_INOBT_HOLEMASK_BITS);
1846         nextbit = startidx + 1;
1847         while (startidx < XFS_INOBT_HOLEMASK_BITS) {
1848                 nextbit = find_next_zero_bit(holemask, XFS_INOBT_HOLEMASK_BITS,
1849                                              nextbit);
1850                 /*
1851                  * If the next zero bit is contiguous, update the end index of
1852                  * the current range and continue.
1853                  */
1854                 if (nextbit != XFS_INOBT_HOLEMASK_BITS &&
1855                     nextbit == endidx + 1) {
1856                         endidx = nextbit;
1857                         goto next;
1858                 }
1859
1860                 /*
1861                  * nextbit is not contiguous with the current end index. Convert
1862                  * the current start/end to an extent and add it to the free
1863                  * list.
1864                  */
1865                 agbno = sagbno + (startidx * XFS_INODES_PER_HOLEMASK_BIT) /
1866                                   mp->m_sb.sb_inopblock;
1867                 contigblk = ((endidx - startidx + 1) *
1868                              XFS_INODES_PER_HOLEMASK_BIT) /
1869                             mp->m_sb.sb_inopblock;
1870
1871                 ASSERT(agbno % mp->m_sb.sb_spino_align == 0);
1872                 ASSERT(contigblk % mp->m_sb.sb_spino_align == 0);
1873                 xfs_free_extent_later(tp, XFS_AGB_TO_FSB(mp, agno, agbno),
1874                                   contigblk, &XFS_RMAP_OINFO_INODES);
1875
1876                 /* reset range to current bit and carry on... */
1877                 startidx = endidx = nextbit;
1878
1879 next:
1880                 nextbit++;
1881         }
1882 }
1883
1884 STATIC int
1885 xfs_difree_inobt(
1886         struct xfs_perag                *pag,
1887         struct xfs_trans                *tp,
1888         struct xfs_buf                  *agbp,
1889         xfs_agino_t                     agino,
1890         struct xfs_icluster             *xic,
1891         struct xfs_inobt_rec_incore     *orec)
1892 {
1893         struct xfs_mount                *mp = pag->pag_mount;
1894         struct xfs_agi                  *agi = agbp->b_addr;
1895         struct xfs_btree_cur            *cur;
1896         struct xfs_inobt_rec_incore     rec;
1897         int                             ilen;
1898         int                             error;
1899         int                             i;
1900         int                             off;
1901
1902         ASSERT(agi->agi_magicnum == cpu_to_be32(XFS_AGI_MAGIC));
1903         ASSERT(XFS_AGINO_TO_AGBNO(mp, agino) < be32_to_cpu(agi->agi_length));
1904
1905         /*
1906          * Initialize the cursor.
1907          */
1908         cur = xfs_inobt_init_cursor(pag, tp, agbp, XFS_BTNUM_INO);
1909
1910         error = xfs_check_agi_freecount(cur);
1911         if (error)
1912                 goto error0;
1913
1914         /*
1915          * Look for the entry describing this inode.
1916          */
1917         if ((error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &i))) {
1918                 xfs_warn(mp, "%s: xfs_inobt_lookup() returned error %d.",
1919                         __func__, error);
1920                 goto error0;
1921         }
1922         if (XFS_IS_CORRUPT(mp, i != 1)) {
1923                 error = -EFSCORRUPTED;
1924                 goto error0;
1925         }
1926         error = xfs_inobt_get_rec(cur, &rec, &i);
1927         if (error) {
1928                 xfs_warn(mp, "%s: xfs_inobt_get_rec() returned error %d.",
1929                         __func__, error);
1930                 goto error0;
1931         }
1932         if (XFS_IS_CORRUPT(mp, i != 1)) {
1933                 error = -EFSCORRUPTED;
1934                 goto error0;
1935         }
1936         /*
1937          * Get the offset in the inode chunk.
1938          */
1939         off = agino - rec.ir_startino;
1940         ASSERT(off >= 0 && off < XFS_INODES_PER_CHUNK);
1941         ASSERT(!(rec.ir_free & XFS_INOBT_MASK(off)));
1942         /*
1943          * Mark the inode free & increment the count.
1944          */
1945         rec.ir_free |= XFS_INOBT_MASK(off);
1946         rec.ir_freecount++;
1947
1948         /*
1949          * When an inode chunk is free, it becomes eligible for removal. Don't
1950          * remove the chunk if the block size is large enough for multiple inode
1951          * chunks (that might not be free).
1952          */
1953         if (!xfs_has_ikeep(mp) && rec.ir_free == XFS_INOBT_ALL_FREE &&
1954             mp->m_sb.sb_inopblock <= XFS_INODES_PER_CHUNK) {
1955                 struct xfs_perag        *pag = agbp->b_pag;
1956
1957                 xic->deleted = true;
1958                 xic->first_ino = XFS_AGINO_TO_INO(mp, pag->pag_agno,
1959                                 rec.ir_startino);
1960                 xic->alloc = xfs_inobt_irec_to_allocmask(&rec);
1961
1962                 /*
1963                  * Remove the inode cluster from the AGI B+Tree, adjust the
1964                  * AGI and Superblock inode counts, and mark the disk space
1965                  * to be freed when the transaction is committed.
1966                  */
1967                 ilen = rec.ir_freecount;
1968                 be32_add_cpu(&agi->agi_count, -ilen);
1969                 be32_add_cpu(&agi->agi_freecount, -(ilen - 1));
1970                 xfs_ialloc_log_agi(tp, agbp, XFS_AGI_COUNT | XFS_AGI_FREECOUNT);
1971                 pag->pagi_freecount -= ilen - 1;
1972                 pag->pagi_count -= ilen;
1973                 xfs_trans_mod_sb(tp, XFS_TRANS_SB_ICOUNT, -ilen);
1974                 xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -(ilen - 1));
1975
1976                 if ((error = xfs_btree_delete(cur, &i))) {
1977                         xfs_warn(mp, "%s: xfs_btree_delete returned error %d.",
1978                                 __func__, error);
1979                         goto error0;
1980                 }
1981
1982                 xfs_difree_inode_chunk(tp, pag->pag_agno, &rec);
1983         } else {
1984                 xic->deleted = false;
1985
1986                 error = xfs_inobt_update(cur, &rec);
1987                 if (error) {
1988                         xfs_warn(mp, "%s: xfs_inobt_update returned error %d.",
1989                                 __func__, error);
1990                         goto error0;
1991                 }
1992
1993                 /*
1994                  * Change the inode free counts and log the ag/sb changes.
1995                  */
1996                 be32_add_cpu(&agi->agi_freecount, 1);
1997                 xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT);
1998                 pag->pagi_freecount++;
1999                 xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, 1);
2000         }
2001
2002         error = xfs_check_agi_freecount(cur);
2003         if (error)
2004                 goto error0;
2005
2006         *orec = rec;
2007         xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
2008         return 0;
2009
2010 error0:
2011         xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
2012         return error;
2013 }
2014
2015 /*
2016  * Free an inode in the free inode btree.
2017  */
2018 STATIC int
2019 xfs_difree_finobt(
2020         struct xfs_perag                *pag,
2021         struct xfs_trans                *tp,
2022         struct xfs_buf                  *agbp,
2023         xfs_agino_t                     agino,
2024         struct xfs_inobt_rec_incore     *ibtrec) /* inobt record */
2025 {
2026         struct xfs_mount                *mp = pag->pag_mount;
2027         struct xfs_btree_cur            *cur;
2028         struct xfs_inobt_rec_incore     rec;
2029         int                             offset = agino - ibtrec->ir_startino;
2030         int                             error;
2031         int                             i;
2032
2033         cur = xfs_inobt_init_cursor(pag, tp, agbp, XFS_BTNUM_FINO);
2034
2035         error = xfs_inobt_lookup(cur, ibtrec->ir_startino, XFS_LOOKUP_EQ, &i);
2036         if (error)
2037                 goto error;
2038         if (i == 0) {
2039                 /*
2040                  * If the record does not exist in the finobt, we must have just
2041                  * freed an inode in a previously fully allocated chunk. If not,
2042                  * something is out of sync.
2043                  */
2044                 if (XFS_IS_CORRUPT(mp, ibtrec->ir_freecount != 1)) {
2045                         error = -EFSCORRUPTED;
2046                         goto error;
2047                 }
2048
2049                 error = xfs_inobt_insert_rec(cur, ibtrec->ir_holemask,
2050                                              ibtrec->ir_count,
2051                                              ibtrec->ir_freecount,
2052                                              ibtrec->ir_free, &i);
2053                 if (error)
2054                         goto error;
2055                 ASSERT(i == 1);
2056
2057                 goto out;
2058         }
2059
2060         /*
2061          * Read and update the existing record. We could just copy the ibtrec
2062          * across here, but that would defeat the purpose of having redundant
2063          * metadata. By making the modifications independently, we can catch
2064          * corruptions that we wouldn't see if we just copied from one record
2065          * to another.
2066          */
2067         error = xfs_inobt_get_rec(cur, &rec, &i);
2068         if (error)
2069                 goto error;
2070         if (XFS_IS_CORRUPT(mp, i != 1)) {
2071                 error = -EFSCORRUPTED;
2072                 goto error;
2073         }
2074
2075         rec.ir_free |= XFS_INOBT_MASK(offset);
2076         rec.ir_freecount++;
2077
2078         if (XFS_IS_CORRUPT(mp,
2079                            rec.ir_free != ibtrec->ir_free ||
2080                            rec.ir_freecount != ibtrec->ir_freecount)) {
2081                 error = -EFSCORRUPTED;
2082                 goto error;
2083         }
2084
2085         /*
2086          * The content of inobt records should always match between the inobt
2087          * and finobt. The lifecycle of records in the finobt is different from
2088          * the inobt in that the finobt only tracks records with at least one
2089          * free inode. Hence, if all of the inodes are free and we aren't
2090          * keeping inode chunks permanently on disk, remove the record.
2091          * Otherwise, update the record with the new information.
2092          *
2093          * Note that we currently can't free chunks when the block size is large
2094          * enough for multiple chunks. Leave the finobt record to remain in sync
2095          * with the inobt.
2096          */
2097         if (!xfs_has_ikeep(mp) && rec.ir_free == XFS_INOBT_ALL_FREE &&
2098             mp->m_sb.sb_inopblock <= XFS_INODES_PER_CHUNK) {
2099                 error = xfs_btree_delete(cur, &i);
2100                 if (error)
2101                         goto error;
2102                 ASSERT(i == 1);
2103         } else {
2104                 error = xfs_inobt_update(cur, &rec);
2105                 if (error)
2106                         goto error;
2107         }
2108
2109 out:
2110         error = xfs_check_agi_freecount(cur);
2111         if (error)
2112                 goto error;
2113
2114         xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
2115         return 0;
2116
2117 error:
2118         xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
2119         return error;
2120 }
2121
2122 /*
2123  * Free disk inode.  Carefully avoids touching the incore inode, all
2124  * manipulations incore are the caller's responsibility.
2125  * The on-disk inode is not changed by this operation, only the
2126  * btree (free inode mask) is changed.
2127  */
2128 int
2129 xfs_difree(
2130         struct xfs_trans        *tp,
2131         struct xfs_perag        *pag,
2132         xfs_ino_t               inode,
2133         struct xfs_icluster     *xic)
2134 {
2135         /* REFERENCED */
2136         xfs_agblock_t           agbno;  /* block number containing inode */
2137         struct xfs_buf          *agbp;  /* buffer for allocation group header */
2138         xfs_agino_t             agino;  /* allocation group inode number */
2139         int                     error;  /* error return value */
2140         struct xfs_mount        *mp = tp->t_mountp;
2141         struct xfs_inobt_rec_incore rec;/* btree record */
2142
2143         /*
2144          * Break up inode number into its components.
2145          */
2146         if (pag->pag_agno != XFS_INO_TO_AGNO(mp, inode)) {
2147                 xfs_warn(mp, "%s: agno != pag->pag_agno (%d != %d).",
2148                         __func__, XFS_INO_TO_AGNO(mp, inode), pag->pag_agno);
2149                 ASSERT(0);
2150                 return -EINVAL;
2151         }
2152         agino = XFS_INO_TO_AGINO(mp, inode);
2153         if (inode != XFS_AGINO_TO_INO(mp, pag->pag_agno, agino))  {
2154                 xfs_warn(mp, "%s: inode != XFS_AGINO_TO_INO() (%llu != %llu).",
2155                         __func__, (unsigned long long)inode,
2156                         (unsigned long long)XFS_AGINO_TO_INO(mp, pag->pag_agno, agino));
2157                 ASSERT(0);
2158                 return -EINVAL;
2159         }
2160         agbno = XFS_AGINO_TO_AGBNO(mp, agino);
2161         if (agbno >= mp->m_sb.sb_agblocks)  {
2162                 xfs_warn(mp, "%s: agbno >= mp->m_sb.sb_agblocks (%d >= %d).",
2163                         __func__, agbno, mp->m_sb.sb_agblocks);
2164                 ASSERT(0);
2165                 return -EINVAL;
2166         }
2167         /*
2168          * Get the allocation group header.
2169          */
2170         error = xfs_ialloc_read_agi(pag, tp, &agbp);
2171         if (error) {
2172                 xfs_warn(mp, "%s: xfs_ialloc_read_agi() returned error %d.",
2173                         __func__, error);
2174                 return error;
2175         }
2176
2177         /*
2178          * Fix up the inode allocation btree.
2179          */
2180         error = xfs_difree_inobt(pag, tp, agbp, agino, xic, &rec);
2181         if (error)
2182                 goto error0;
2183
2184         /*
2185          * Fix up the free inode btree.
2186          */
2187         if (xfs_has_finobt(mp)) {
2188                 error = xfs_difree_finobt(pag, tp, agbp, agino, &rec);
2189                 if (error)
2190                         goto error0;
2191         }
2192
2193         return 0;
2194
2195 error0:
2196         return error;
2197 }
2198
2199 STATIC int
2200 xfs_imap_lookup(
2201         struct xfs_perag        *pag,
2202         struct xfs_trans        *tp,
2203         xfs_agino_t             agino,
2204         xfs_agblock_t           agbno,
2205         xfs_agblock_t           *chunk_agbno,
2206         xfs_agblock_t           *offset_agbno,
2207         int                     flags)
2208 {
2209         struct xfs_mount        *mp = pag->pag_mount;
2210         struct xfs_inobt_rec_incore rec;
2211         struct xfs_btree_cur    *cur;
2212         struct xfs_buf          *agbp;
2213         int                     error;
2214         int                     i;
2215
2216         error = xfs_ialloc_read_agi(pag, tp, &agbp);
2217         if (error) {
2218                 xfs_alert(mp,
2219                         "%s: xfs_ialloc_read_agi() returned error %d, agno %d",
2220                         __func__, error, pag->pag_agno);
2221                 return error;
2222         }
2223
2224         /*
2225          * Lookup the inode record for the given agino. If the record cannot be
2226          * found, then it's an invalid inode number and we should abort. Once
2227          * we have a record, we need to ensure it contains the inode number
2228          * we are looking up.
2229          */
2230         cur = xfs_inobt_init_cursor(pag, tp, agbp, XFS_BTNUM_INO);
2231         error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &i);
2232         if (!error) {
2233                 if (i)
2234                         error = xfs_inobt_get_rec(cur, &rec, &i);
2235                 if (!error && i == 0)
2236                         error = -EINVAL;
2237         }
2238
2239         xfs_trans_brelse(tp, agbp);
2240         xfs_btree_del_cursor(cur, error);
2241         if (error)
2242                 return error;
2243
2244         /* check that the returned record contains the required inode */
2245         if (rec.ir_startino > agino ||
2246             rec.ir_startino + M_IGEO(mp)->ialloc_inos <= agino)
2247                 return -EINVAL;
2248
2249         /* for untrusted inodes check it is allocated first */
2250         if ((flags & XFS_IGET_UNTRUSTED) &&
2251             (rec.ir_free & XFS_INOBT_MASK(agino - rec.ir_startino)))
2252                 return -EINVAL;
2253
2254         *chunk_agbno = XFS_AGINO_TO_AGBNO(mp, rec.ir_startino);
2255         *offset_agbno = agbno - *chunk_agbno;
2256         return 0;
2257 }
2258
2259 /*
2260  * Return the location of the inode in imap, for mapping it into a buffer.
2261  */
2262 int
2263 xfs_imap(
2264         struct xfs_perag        *pag,
2265         struct xfs_trans        *tp,
2266         xfs_ino_t               ino,    /* inode to locate */
2267         struct xfs_imap         *imap,  /* location map structure */
2268         uint                    flags)  /* flags for inode btree lookup */
2269 {
2270         struct xfs_mount        *mp = pag->pag_mount;
2271         xfs_agblock_t           agbno;  /* block number of inode in the alloc group */
2272         xfs_agino_t             agino;  /* inode number within alloc group */
2273         xfs_agblock_t           chunk_agbno;    /* first block in inode chunk */
2274         xfs_agblock_t           cluster_agbno;  /* first block in inode cluster */
2275         int                     error;  /* error code */
2276         int                     offset; /* index of inode in its buffer */
2277         xfs_agblock_t           offset_agbno;   /* blks from chunk start to inode */
2278
2279         ASSERT(ino != NULLFSINO);
2280
2281         /*
2282          * Split up the inode number into its parts.
2283          */
2284         agino = XFS_INO_TO_AGINO(mp, ino);
2285         agbno = XFS_AGINO_TO_AGBNO(mp, agino);
2286         if (agbno >= mp->m_sb.sb_agblocks ||
2287             ino != XFS_AGINO_TO_INO(mp, pag->pag_agno, agino)) {
2288                 error = -EINVAL;
2289 #ifdef DEBUG
2290                 /*
2291                  * Don't output diagnostic information for untrusted inodes
2292                  * as they can be invalid without implying corruption.
2293                  */
2294                 if (flags & XFS_IGET_UNTRUSTED)
2295                         return error;
2296                 if (agbno >= mp->m_sb.sb_agblocks) {
2297                         xfs_alert(mp,
2298                 "%s: agbno (0x%llx) >= mp->m_sb.sb_agblocks (0x%lx)",
2299                                 __func__, (unsigned long long)agbno,
2300                                 (unsigned long)mp->m_sb.sb_agblocks);
2301                 }
2302                 if (ino != XFS_AGINO_TO_INO(mp, pag->pag_agno, agino)) {
2303                         xfs_alert(mp,
2304                 "%s: ino (0x%llx) != XFS_AGINO_TO_INO() (0x%llx)",
2305                                 __func__, ino,
2306                                 XFS_AGINO_TO_INO(mp, pag->pag_agno, agino));
2307                 }
2308                 xfs_stack_trace();
2309 #endif /* DEBUG */
2310                 return error;
2311         }
2312
2313         /*
2314          * For bulkstat and handle lookups, we have an untrusted inode number
2315          * that we have to verify is valid. We cannot do this just by reading
2316          * the inode buffer as it may have been unlinked and removed leaving
2317          * inodes in stale state on disk. Hence we have to do a btree lookup
2318          * in all cases where an untrusted inode number is passed.
2319          */
2320         if (flags & XFS_IGET_UNTRUSTED) {
2321                 error = xfs_imap_lookup(pag, tp, agino, agbno,
2322                                         &chunk_agbno, &offset_agbno, flags);
2323                 if (error)
2324                         return error;
2325                 goto out_map;
2326         }
2327
2328         /*
2329          * If the inode cluster size is the same as the blocksize or
2330          * smaller we get to the buffer by simple arithmetics.
2331          */
2332         if (M_IGEO(mp)->blocks_per_cluster == 1) {
2333                 offset = XFS_INO_TO_OFFSET(mp, ino);
2334                 ASSERT(offset < mp->m_sb.sb_inopblock);
2335
2336                 imap->im_blkno = XFS_AGB_TO_DADDR(mp, pag->pag_agno, agbno);
2337                 imap->im_len = XFS_FSB_TO_BB(mp, 1);
2338                 imap->im_boffset = (unsigned short)(offset <<
2339                                                         mp->m_sb.sb_inodelog);
2340                 return 0;
2341         }
2342
2343         /*
2344          * If the inode chunks are aligned then use simple maths to
2345          * find the location. Otherwise we have to do a btree
2346          * lookup to find the location.
2347          */
2348         if (M_IGEO(mp)->inoalign_mask) {
2349                 offset_agbno = agbno & M_IGEO(mp)->inoalign_mask;
2350                 chunk_agbno = agbno - offset_agbno;
2351         } else {
2352                 error = xfs_imap_lookup(pag, tp, agino, agbno,
2353                                         &chunk_agbno, &offset_agbno, flags);
2354                 if (error)
2355                         return error;
2356         }
2357
2358 out_map:
2359         ASSERT(agbno >= chunk_agbno);
2360         cluster_agbno = chunk_agbno +
2361                 ((offset_agbno / M_IGEO(mp)->blocks_per_cluster) *
2362                  M_IGEO(mp)->blocks_per_cluster);
2363         offset = ((agbno - cluster_agbno) * mp->m_sb.sb_inopblock) +
2364                 XFS_INO_TO_OFFSET(mp, ino);
2365
2366         imap->im_blkno = XFS_AGB_TO_DADDR(mp, pag->pag_agno, cluster_agbno);
2367         imap->im_len = XFS_FSB_TO_BB(mp, M_IGEO(mp)->blocks_per_cluster);
2368         imap->im_boffset = (unsigned short)(offset << mp->m_sb.sb_inodelog);
2369
2370         /*
2371          * If the inode number maps to a block outside the bounds
2372          * of the file system then return NULL rather than calling
2373          * read_buf and panicing when we get an error from the
2374          * driver.
2375          */
2376         if ((imap->im_blkno + imap->im_len) >
2377             XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)) {
2378                 xfs_alert(mp,
2379         "%s: (im_blkno (0x%llx) + im_len (0x%llx)) > sb_dblocks (0x%llx)",
2380                         __func__, (unsigned long long) imap->im_blkno,
2381                         (unsigned long long) imap->im_len,
2382                         XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks));
2383                 return -EINVAL;
2384         }
2385         return 0;
2386 }
2387
2388 /*
2389  * Log specified fields for the ag hdr (inode section). The growth of the agi
2390  * structure over time requires that we interpret the buffer as two logical
2391  * regions delineated by the end of the unlinked list. This is due to the size
2392  * of the hash table and its location in the middle of the agi.
2393  *
2394  * For example, a request to log a field before agi_unlinked and a field after
2395  * agi_unlinked could cause us to log the entire hash table and use an excessive
2396  * amount of log space. To avoid this behavior, log the region up through
2397  * agi_unlinked in one call and the region after agi_unlinked through the end of
2398  * the structure in another.
2399  */
2400 void
2401 xfs_ialloc_log_agi(
2402         struct xfs_trans        *tp,
2403         struct xfs_buf          *bp,
2404         uint32_t                fields)
2405 {
2406         int                     first;          /* first byte number */
2407         int                     last;           /* last byte number */
2408         static const short      offsets[] = {   /* field starting offsets */
2409                                         /* keep in sync with bit definitions */
2410                 offsetof(xfs_agi_t, agi_magicnum),
2411                 offsetof(xfs_agi_t, agi_versionnum),
2412                 offsetof(xfs_agi_t, agi_seqno),
2413                 offsetof(xfs_agi_t, agi_length),
2414                 offsetof(xfs_agi_t, agi_count),
2415                 offsetof(xfs_agi_t, agi_root),
2416                 offsetof(xfs_agi_t, agi_level),
2417                 offsetof(xfs_agi_t, agi_freecount),
2418                 offsetof(xfs_agi_t, agi_newino),
2419                 offsetof(xfs_agi_t, agi_dirino),
2420                 offsetof(xfs_agi_t, agi_unlinked),
2421                 offsetof(xfs_agi_t, agi_free_root),
2422                 offsetof(xfs_agi_t, agi_free_level),
2423                 offsetof(xfs_agi_t, agi_iblocks),
2424                 sizeof(xfs_agi_t)
2425         };
2426 #ifdef DEBUG
2427         struct xfs_agi          *agi = bp->b_addr;
2428
2429         ASSERT(agi->agi_magicnum == cpu_to_be32(XFS_AGI_MAGIC));
2430 #endif
2431
2432         /*
2433          * Compute byte offsets for the first and last fields in the first
2434          * region and log the agi buffer. This only logs up through
2435          * agi_unlinked.
2436          */
2437         if (fields & XFS_AGI_ALL_BITS_R1) {
2438                 xfs_btree_offsets(fields, offsets, XFS_AGI_NUM_BITS_R1,
2439                                   &first, &last);
2440                 xfs_trans_log_buf(tp, bp, first, last);
2441         }
2442
2443         /*
2444          * Mask off the bits in the first region and calculate the first and
2445          * last field offsets for any bits in the second region.
2446          */
2447         fields &= ~XFS_AGI_ALL_BITS_R1;
2448         if (fields) {
2449                 xfs_btree_offsets(fields, offsets, XFS_AGI_NUM_BITS_R2,
2450                                   &first, &last);
2451                 xfs_trans_log_buf(tp, bp, first, last);
2452         }
2453 }
2454
2455 static xfs_failaddr_t
2456 xfs_agi_verify(
2457         struct xfs_buf  *bp)
2458 {
2459         struct xfs_mount *mp = bp->b_mount;
2460         struct xfs_agi  *agi = bp->b_addr;
2461         int             i;
2462
2463         if (xfs_has_crc(mp)) {
2464                 if (!uuid_equal(&agi->agi_uuid, &mp->m_sb.sb_meta_uuid))
2465                         return __this_address;
2466                 if (!xfs_log_check_lsn(mp, be64_to_cpu(agi->agi_lsn)))
2467                         return __this_address;
2468         }
2469
2470         /*
2471          * Validate the magic number of the agi block.
2472          */
2473         if (!xfs_verify_magic(bp, agi->agi_magicnum))
2474                 return __this_address;
2475         if (!XFS_AGI_GOOD_VERSION(be32_to_cpu(agi->agi_versionnum)))
2476                 return __this_address;
2477
2478         if (be32_to_cpu(agi->agi_level) < 1 ||
2479             be32_to_cpu(agi->agi_level) > M_IGEO(mp)->inobt_maxlevels)
2480                 return __this_address;
2481
2482         if (xfs_has_finobt(mp) &&
2483             (be32_to_cpu(agi->agi_free_level) < 1 ||
2484              be32_to_cpu(agi->agi_free_level) > M_IGEO(mp)->inobt_maxlevels))
2485                 return __this_address;
2486
2487         /*
2488          * during growfs operations, the perag is not fully initialised,
2489          * so we can't use it for any useful checking. growfs ensures we can't
2490          * use it by using uncached buffers that don't have the perag attached
2491          * so we can detect and avoid this problem.
2492          */
2493         if (bp->b_pag && be32_to_cpu(agi->agi_seqno) != bp->b_pag->pag_agno)
2494                 return __this_address;
2495
2496         for (i = 0; i < XFS_AGI_UNLINKED_BUCKETS; i++) {
2497                 if (agi->agi_unlinked[i] == cpu_to_be32(NULLAGINO))
2498                         continue;
2499                 if (!xfs_verify_ino(mp, be32_to_cpu(agi->agi_unlinked[i])))
2500                         return __this_address;
2501         }
2502
2503         return NULL;
2504 }
2505
2506 static void
2507 xfs_agi_read_verify(
2508         struct xfs_buf  *bp)
2509 {
2510         struct xfs_mount *mp = bp->b_mount;
2511         xfs_failaddr_t  fa;
2512
2513         if (xfs_has_crc(mp) &&
2514             !xfs_buf_verify_cksum(bp, XFS_AGI_CRC_OFF))
2515                 xfs_verifier_error(bp, -EFSBADCRC, __this_address);
2516         else {
2517                 fa = xfs_agi_verify(bp);
2518                 if (XFS_TEST_ERROR(fa, mp, XFS_ERRTAG_IALLOC_READ_AGI))
2519                         xfs_verifier_error(bp, -EFSCORRUPTED, fa);
2520         }
2521 }
2522
2523 static void
2524 xfs_agi_write_verify(
2525         struct xfs_buf  *bp)
2526 {
2527         struct xfs_mount        *mp = bp->b_mount;
2528         struct xfs_buf_log_item *bip = bp->b_log_item;
2529         struct xfs_agi          *agi = bp->b_addr;
2530         xfs_failaddr_t          fa;
2531
2532         fa = xfs_agi_verify(bp);
2533         if (fa) {
2534                 xfs_verifier_error(bp, -EFSCORRUPTED, fa);
2535                 return;
2536         }
2537
2538         if (!xfs_has_crc(mp))
2539                 return;
2540
2541         if (bip)
2542                 agi->agi_lsn = cpu_to_be64(bip->bli_item.li_lsn);
2543         xfs_buf_update_cksum(bp, XFS_AGI_CRC_OFF);
2544 }
2545
2546 const struct xfs_buf_ops xfs_agi_buf_ops = {
2547         .name = "xfs_agi",
2548         .magic = { cpu_to_be32(XFS_AGI_MAGIC), cpu_to_be32(XFS_AGI_MAGIC) },
2549         .verify_read = xfs_agi_read_verify,
2550         .verify_write = xfs_agi_write_verify,
2551         .verify_struct = xfs_agi_verify,
2552 };
2553
2554 /*
2555  * Read in the allocation group header (inode allocation section)
2556  */
2557 int
2558 xfs_read_agi(
2559         struct xfs_perag        *pag,
2560         struct xfs_trans        *tp,
2561         struct xfs_buf          **agibpp)
2562 {
2563         struct xfs_mount        *mp = pag->pag_mount;
2564         int                     error;
2565
2566         trace_xfs_read_agi(pag->pag_mount, pag->pag_agno);
2567
2568         error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
2569                         XFS_AG_DADDR(mp, pag->pag_agno, XFS_AGI_DADDR(mp)),
2570                         XFS_FSS_TO_BB(mp, 1), 0, agibpp, &xfs_agi_buf_ops);
2571         if (error)
2572                 return error;
2573         if (tp)
2574                 xfs_trans_buf_set_type(tp, *agibpp, XFS_BLFT_AGI_BUF);
2575
2576         xfs_buf_set_ref(*agibpp, XFS_AGI_REF);
2577         return 0;
2578 }
2579
2580 /*
2581  * Read in the agi and initialise the per-ag data. If the caller supplies a
2582  * @agibpp, return the locked AGI buffer to them, otherwise release it.
2583  */
2584 int
2585 xfs_ialloc_read_agi(
2586         struct xfs_perag        *pag,
2587         struct xfs_trans        *tp,
2588         struct xfs_buf          **agibpp)
2589 {
2590         struct xfs_buf          *agibp;
2591         struct xfs_agi          *agi;
2592         int                     error;
2593
2594         trace_xfs_ialloc_read_agi(pag->pag_mount, pag->pag_agno);
2595
2596         error = xfs_read_agi(pag, tp, &agibp);
2597         if (error)
2598                 return error;
2599
2600         agi = agibp->b_addr;
2601         if (!xfs_perag_initialised_agi(pag)) {
2602                 pag->pagi_freecount = be32_to_cpu(agi->agi_freecount);
2603                 pag->pagi_count = be32_to_cpu(agi->agi_count);
2604                 set_bit(XFS_AGSTATE_AGI_INIT, &pag->pag_opstate);
2605         }
2606
2607         /*
2608          * It's possible for these to be out of sync if
2609          * we are in the middle of a forced shutdown.
2610          */
2611         ASSERT(pag->pagi_freecount == be32_to_cpu(agi->agi_freecount) ||
2612                 xfs_is_shutdown(pag->pag_mount));
2613         if (agibpp)
2614                 *agibpp = agibp;
2615         else
2616                 xfs_trans_brelse(tp, agibp);
2617         return 0;
2618 }
2619
2620 /* Is there an inode record covering a given range of inode numbers? */
2621 int
2622 xfs_ialloc_has_inode_record(
2623         struct xfs_btree_cur    *cur,
2624         xfs_agino_t             low,
2625         xfs_agino_t             high,
2626         bool                    *exists)
2627 {
2628         struct xfs_inobt_rec_incore     irec;
2629         xfs_agino_t             agino;
2630         uint16_t                holemask;
2631         int                     has_record;
2632         int                     i;
2633         int                     error;
2634
2635         *exists = false;
2636         error = xfs_inobt_lookup(cur, low, XFS_LOOKUP_LE, &has_record);
2637         while (error == 0 && has_record) {
2638                 error = xfs_inobt_get_rec(cur, &irec, &has_record);
2639                 if (error || irec.ir_startino > high)
2640                         break;
2641
2642                 agino = irec.ir_startino;
2643                 holemask = irec.ir_holemask;
2644                 for (i = 0; i < XFS_INOBT_HOLEMASK_BITS; holemask >>= 1,
2645                                 i++, agino += XFS_INODES_PER_HOLEMASK_BIT) {
2646                         if (holemask & 1)
2647                                 continue;
2648                         if (agino + XFS_INODES_PER_HOLEMASK_BIT > low &&
2649                                         agino <= high) {
2650                                 *exists = true;
2651                                 return 0;
2652                         }
2653                 }
2654
2655                 error = xfs_btree_increment(cur, 0, &has_record);
2656         }
2657         return error;
2658 }
2659
2660 /* Is there an inode record covering a given extent? */
2661 int
2662 xfs_ialloc_has_inodes_at_extent(
2663         struct xfs_btree_cur    *cur,
2664         xfs_agblock_t           bno,
2665         xfs_extlen_t            len,
2666         bool                    *exists)
2667 {
2668         xfs_agino_t             low;
2669         xfs_agino_t             high;
2670
2671         low = XFS_AGB_TO_AGINO(cur->bc_mp, bno);
2672         high = XFS_AGB_TO_AGINO(cur->bc_mp, bno + len) - 1;
2673
2674         return xfs_ialloc_has_inode_record(cur, low, high, exists);
2675 }
2676
2677 struct xfs_ialloc_count_inodes {
2678         xfs_agino_t                     count;
2679         xfs_agino_t                     freecount;
2680 };
2681
2682 /* Record inode counts across all inobt records. */
2683 STATIC int
2684 xfs_ialloc_count_inodes_rec(
2685         struct xfs_btree_cur            *cur,
2686         const union xfs_btree_rec       *rec,
2687         void                            *priv)
2688 {
2689         struct xfs_inobt_rec_incore     irec;
2690         struct xfs_ialloc_count_inodes  *ci = priv;
2691
2692         xfs_inobt_btrec_to_irec(cur->bc_mp, rec, &irec);
2693         ci->count += irec.ir_count;
2694         ci->freecount += irec.ir_freecount;
2695
2696         return 0;
2697 }
2698
2699 /* Count allocated and free inodes under an inobt. */
2700 int
2701 xfs_ialloc_count_inodes(
2702         struct xfs_btree_cur            *cur,
2703         xfs_agino_t                     *count,
2704         xfs_agino_t                     *freecount)
2705 {
2706         struct xfs_ialloc_count_inodes  ci = {0};
2707         int                             error;
2708
2709         ASSERT(cur->bc_btnum == XFS_BTNUM_INO);
2710         error = xfs_btree_query_all(cur, xfs_ialloc_count_inodes_rec, &ci);
2711         if (error)
2712                 return error;
2713
2714         *count = ci.count;
2715         *freecount = ci.freecount;
2716         return 0;
2717 }
2718
2719 /*
2720  * Initialize inode-related geometry information.
2721  *
2722  * Compute the inode btree min and max levels and set maxicount.
2723  *
2724  * Set the inode cluster size.  This may still be overridden by the file
2725  * system block size if it is larger than the chosen cluster size.
2726  *
2727  * For v5 filesystems, scale the cluster size with the inode size to keep a
2728  * constant ratio of inode per cluster buffer, but only if mkfs has set the
2729  * inode alignment value appropriately for larger cluster sizes.
2730  *
2731  * Then compute the inode cluster alignment information.
2732  */
2733 void
2734 xfs_ialloc_setup_geometry(
2735         struct xfs_mount        *mp)
2736 {
2737         struct xfs_sb           *sbp = &mp->m_sb;
2738         struct xfs_ino_geometry *igeo = M_IGEO(mp);
2739         uint64_t                icount;
2740         uint                    inodes;
2741
2742         igeo->new_diflags2 = 0;
2743         if (xfs_has_bigtime(mp))
2744                 igeo->new_diflags2 |= XFS_DIFLAG2_BIGTIME;
2745         if (xfs_has_large_extent_counts(mp))
2746                 igeo->new_diflags2 |= XFS_DIFLAG2_NREXT64;
2747
2748         /* Compute inode btree geometry. */
2749         igeo->agino_log = sbp->sb_inopblog + sbp->sb_agblklog;
2750         igeo->inobt_mxr[0] = xfs_inobt_maxrecs(mp, sbp->sb_blocksize, 1);
2751         igeo->inobt_mxr[1] = xfs_inobt_maxrecs(mp, sbp->sb_blocksize, 0);
2752         igeo->inobt_mnr[0] = igeo->inobt_mxr[0] / 2;
2753         igeo->inobt_mnr[1] = igeo->inobt_mxr[1] / 2;
2754
2755         igeo->ialloc_inos = max_t(uint16_t, XFS_INODES_PER_CHUNK,
2756                         sbp->sb_inopblock);
2757         igeo->ialloc_blks = igeo->ialloc_inos >> sbp->sb_inopblog;
2758
2759         if (sbp->sb_spino_align)
2760                 igeo->ialloc_min_blks = sbp->sb_spino_align;
2761         else
2762                 igeo->ialloc_min_blks = igeo->ialloc_blks;
2763
2764         /* Compute and fill in value of m_ino_geo.inobt_maxlevels. */
2765         inodes = (1LL << XFS_INO_AGINO_BITS(mp)) >> XFS_INODES_PER_CHUNK_LOG;
2766         igeo->inobt_maxlevels = xfs_btree_compute_maxlevels(igeo->inobt_mnr,
2767                         inodes);
2768         ASSERT(igeo->inobt_maxlevels <= xfs_iallocbt_maxlevels_ondisk());
2769
2770         /*
2771          * Set the maximum inode count for this filesystem, being careful not
2772          * to use obviously garbage sb_inopblog/sb_inopblock values.  Regular
2773          * users should never get here due to failing sb verification, but
2774          * certain users (xfs_db) need to be usable even with corrupt metadata.
2775          */
2776         if (sbp->sb_imax_pct && igeo->ialloc_blks) {
2777                 /*
2778                  * Make sure the maximum inode count is a multiple
2779                  * of the units we allocate inodes in.
2780                  */
2781                 icount = sbp->sb_dblocks * sbp->sb_imax_pct;
2782                 do_div(icount, 100);
2783                 do_div(icount, igeo->ialloc_blks);
2784                 igeo->maxicount = XFS_FSB_TO_INO(mp,
2785                                 icount * igeo->ialloc_blks);
2786         } else {
2787                 igeo->maxicount = 0;
2788         }
2789
2790         /*
2791          * Compute the desired size of an inode cluster buffer size, which
2792          * starts at 8K and (on v5 filesystems) scales up with larger inode
2793          * sizes.
2794          *
2795          * Preserve the desired inode cluster size because the sparse inodes
2796          * feature uses that desired size (not the actual size) to compute the
2797          * sparse inode alignment.  The mount code validates this value, so we
2798          * cannot change the behavior.
2799          */
2800         igeo->inode_cluster_size_raw = XFS_INODE_BIG_CLUSTER_SIZE;
2801         if (xfs_has_v3inodes(mp)) {
2802                 int     new_size = igeo->inode_cluster_size_raw;
2803
2804                 new_size *= mp->m_sb.sb_inodesize / XFS_DINODE_MIN_SIZE;
2805                 if (mp->m_sb.sb_inoalignmt >= XFS_B_TO_FSBT(mp, new_size))
2806                         igeo->inode_cluster_size_raw = new_size;
2807         }
2808
2809         /* Calculate inode cluster ratios. */
2810         if (igeo->inode_cluster_size_raw > mp->m_sb.sb_blocksize)
2811                 igeo->blocks_per_cluster = XFS_B_TO_FSBT(mp,
2812                                 igeo->inode_cluster_size_raw);
2813         else
2814                 igeo->blocks_per_cluster = 1;
2815         igeo->inode_cluster_size = XFS_FSB_TO_B(mp, igeo->blocks_per_cluster);
2816         igeo->inodes_per_cluster = XFS_FSB_TO_INO(mp, igeo->blocks_per_cluster);
2817
2818         /* Calculate inode cluster alignment. */
2819         if (xfs_has_align(mp) &&
2820             mp->m_sb.sb_inoalignmt >= igeo->blocks_per_cluster)
2821                 igeo->cluster_align = mp->m_sb.sb_inoalignmt;
2822         else
2823                 igeo->cluster_align = 1;
2824         igeo->inoalign_mask = igeo->cluster_align - 1;
2825         igeo->cluster_align_inodes = XFS_FSB_TO_INO(mp, igeo->cluster_align);
2826
2827         /*
2828          * If we are using stripe alignment, check whether
2829          * the stripe unit is a multiple of the inode alignment
2830          */
2831         if (mp->m_dalign && igeo->inoalign_mask &&
2832             !(mp->m_dalign & igeo->inoalign_mask))
2833                 igeo->ialloc_align = mp->m_dalign;
2834         else
2835                 igeo->ialloc_align = 0;
2836 }
2837
2838 /* Compute the location of the root directory inode that is laid out by mkfs. */
2839 xfs_ino_t
2840 xfs_ialloc_calc_rootino(
2841         struct xfs_mount        *mp,
2842         int                     sunit)
2843 {
2844         struct xfs_ino_geometry *igeo = M_IGEO(mp);
2845         xfs_agblock_t           first_bno;
2846
2847         /*
2848          * Pre-calculate the geometry of AG 0.  We know what it looks like
2849          * because libxfs knows how to create allocation groups now.
2850          *
2851          * first_bno is the first block in which mkfs could possibly have
2852          * allocated the root directory inode, once we factor in the metadata
2853          * that mkfs formats before it.  Namely, the four AG headers...
2854          */
2855         first_bno = howmany(4 * mp->m_sb.sb_sectsize, mp->m_sb.sb_blocksize);
2856
2857         /* ...the two free space btree roots... */
2858         first_bno += 2;
2859
2860         /* ...the inode btree root... */
2861         first_bno += 1;
2862
2863         /* ...the initial AGFL... */
2864         first_bno += xfs_alloc_min_freelist(mp, NULL);
2865
2866         /* ...the free inode btree root... */
2867         if (xfs_has_finobt(mp))
2868                 first_bno++;
2869
2870         /* ...the reverse mapping btree root... */
2871         if (xfs_has_rmapbt(mp))
2872                 first_bno++;
2873
2874         /* ...the reference count btree... */
2875         if (xfs_has_reflink(mp))
2876                 first_bno++;
2877
2878         /*
2879          * ...and the log, if it is allocated in the first allocation group.
2880          *
2881          * This can happen with filesystems that only have a single
2882          * allocation group, or very odd geometries created by old mkfs
2883          * versions on very small filesystems.
2884          */
2885         if (xfs_ag_contains_log(mp, 0))
2886                  first_bno += mp->m_sb.sb_logblocks;
2887
2888         /*
2889          * Now round first_bno up to whatever allocation alignment is given
2890          * by the filesystem or was passed in.
2891          */
2892         if (xfs_has_dalign(mp) && igeo->ialloc_align > 0)
2893                 first_bno = roundup(first_bno, sunit);
2894         else if (xfs_has_align(mp) &&
2895                         mp->m_sb.sb_inoalignmt > 1)
2896                 first_bno = roundup(first_bno, mp->m_sb.sb_inoalignmt);
2897
2898         return XFS_AGINO_TO_INO(mp, 0, XFS_AGB_TO_AGINO(mp, first_bno));
2899 }
2900
2901 /*
2902  * Ensure there are not sparse inode clusters that cross the new EOAG.
2903  *
2904  * This is a no-op for non-spinode filesystems since clusters are always fully
2905  * allocated and checking the bnobt suffices.  However, a spinode filesystem
2906  * could have a record where the upper inodes are free blocks.  If those blocks
2907  * were removed from the filesystem, the inode record would extend beyond EOAG,
2908  * which will be flagged as corruption.
2909  */
2910 int
2911 xfs_ialloc_check_shrink(
2912         struct xfs_perag        *pag,
2913         struct xfs_trans        *tp,
2914         struct xfs_buf          *agibp,
2915         xfs_agblock_t           new_length)
2916 {
2917         struct xfs_inobt_rec_incore rec;
2918         struct xfs_btree_cur    *cur;
2919         xfs_agino_t             agino;
2920         int                     has;
2921         int                     error;
2922
2923         if (!xfs_has_sparseinodes(pag->pag_mount))
2924                 return 0;
2925
2926         cur = xfs_inobt_init_cursor(pag, tp, agibp, XFS_BTNUM_INO);
2927
2928         /* Look up the inobt record that would correspond to the new EOFS. */
2929         agino = XFS_AGB_TO_AGINO(pag->pag_mount, new_length);
2930         error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &has);
2931         if (error || !has)
2932                 goto out;
2933
2934         error = xfs_inobt_get_rec(cur, &rec, &has);
2935         if (error)
2936                 goto out;
2937
2938         if (!has) {
2939                 error = -EFSCORRUPTED;
2940                 goto out;
2941         }
2942
2943         /* If the record covers inodes that would be beyond EOFS, bail out. */
2944         if (rec.ir_startino + XFS_INODES_PER_CHUNK > agino) {
2945                 error = -ENOSPC;
2946                 goto out;
2947         }
2948 out:
2949         xfs_btree_del_cursor(cur, error);
2950         return error;
2951 }