OSDN Git Service

ocfs2: use allocation reservations for directory data
[android-x86/kernel.git] / fs / ocfs2 / suballoc.c
1 /* -*- mode: c; c-basic-offset: 8; -*-
2  * vim: noexpandtab sw=8 ts=8 sts=0:
3  *
4  * suballoc.c
5  *
6  * metadata alloc and free
7  * Inspired by ext3 block groups.
8  *
9  * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
10  *
11  * This program is free software; you can redistribute it and/or
12  * modify it under the terms of the GNU General Public
13  * License as published by the Free Software Foundation; either
14  * version 2 of the License, or (at your option) any later version.
15  *
16  * This program is distributed in the hope that it will be useful,
17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
19  * General Public License for more details.
20  *
21  * You should have received a copy of the GNU General Public
22  * License along with this program; if not, write to the
23  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
24  * Boston, MA 02111-1307, USA.
25  */
26
27 #include <linux/fs.h>
28 #include <linux/types.h>
29 #include <linux/slab.h>
30 #include <linux/highmem.h>
31
32 #define MLOG_MASK_PREFIX ML_DISK_ALLOC
33 #include <cluster/masklog.h>
34
35 #include "ocfs2.h"
36
37 #include "alloc.h"
38 #include "blockcheck.h"
39 #include "dlmglue.h"
40 #include "inode.h"
41 #include "journal.h"
42 #include "localalloc.h"
43 #include "suballoc.h"
44 #include "super.h"
45 #include "sysfile.h"
46 #include "uptodate.h"
47
48 #include "buffer_head_io.h"
49
/*
 * Values for the 'flags' argument of ocfs2_reserve_suballoc_bits(), which
 * also forwards them to ocfs2_block_group_alloc().
 */
/* Do not grow the allocator: a full allocator yields -ENOSPC. */
#define NOT_ALLOC_NEW_GROUP             0
/* A full allocator may be extended with a newly allocated block group. */
#define ALLOC_NEW_GROUP                 0x1
/* NOTE(review): not referenced in this part of the file; presumably asks
 * for the new group's clusters to come from the global bitmap - confirm. */
#define ALLOC_GROUPS_FROM_GLOBAL        0x2

/*
 * Once a steal counter (s_num_meta_stolen / s_num_inodes_stolen) reaches
 * this value the reserve functions stop going straight to the steal path
 * and try our own slot again.
 */
#define OCFS2_MAX_TO_STEAL              1024
55
/*
 * Forward declarations of file-local helpers, so that they can be called
 * before the point at which they are defined.
 */
static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg);
static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe);
static inline u16 ocfs2_find_victim_chain(struct ocfs2_chain_list *cl);
static int ocfs2_block_group_fill(handle_t *handle,
                                  struct inode *alloc_inode,
                                  struct buffer_head *bg_bh,
                                  u64 group_blkno,
                                  u16 my_chain,
                                  struct ocfs2_chain_list *cl);
static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
                                   struct inode *alloc_inode,
                                   struct buffer_head *bh,
                                   u64 max_block,
                                   u64 *last_alloc_group,
                                   int flags);

/* Group search callbacks; one of them is stored in ac->ac_group_search. */
static int ocfs2_cluster_group_search(struct inode *inode,
                                      struct buffer_head *group_bh,
                                      u32 bits_wanted, u32 min_bits,
                                      u64 max_block,
                                      u16 *bit_off, u16 *bits_found);
static int ocfs2_block_group_search(struct inode *inode,
                                    struct buffer_head *group_bh,
                                    u32 bits_wanted, u32 min_bits,
                                    u64 max_block,
                                    u16 *bit_off, u16 *bits_found);
static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb,
                                     struct ocfs2_alloc_context *ac,
                                     handle_t *handle,
                                     u32 bits_wanted,
                                     u32 min_bits,
                                     u16 *bit_off,
                                     unsigned int *num_bits,
                                     u64 *bg_blkno);
static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh,
                                         int nr);
static inline int ocfs2_block_group_set_bits(handle_t *handle,
                                             struct inode *alloc_inode,
                                             struct ocfs2_group_desc *bg,
                                             struct buffer_head *group_bh,
                                             unsigned int bit_off,
                                             unsigned int num_bits);
static int ocfs2_relink_block_group(handle_t *handle,
                                    struct inode *alloc_inode,
                                    struct buffer_head *fe_bh,
                                    struct buffer_head *bg_bh,
                                    struct buffer_head *prev_bg_bh,
                                    u16 chain);
static inline int ocfs2_block_group_reasonably_empty(struct ocfs2_group_desc *bg,
                                                     u32 wanted);
static inline u32 ocfs2_desc_bitmap_to_cluster_off(struct inode *inode,
                                                   u64 bg_blkno,
                                                   u16 bg_bit_off);
static inline void ocfs2_block_to_cluster_group(struct inode *inode,
                                                u64 data_blkno,
                                                u64 *bg_blkno,
                                                u16 *bg_bit_off);
static int ocfs2_reserve_clusters_with_limit(struct ocfs2_super *osb,
                                             u32 bits_wanted, u64 max_block,
                                             int flags,
                                             struct ocfs2_alloc_context **ac);
117
118 void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac)
119 {
120         struct inode *inode = ac->ac_inode;
121
122         if (inode) {
123                 if (ac->ac_which != OCFS2_AC_USE_LOCAL)
124                         ocfs2_inode_unlock(inode, 1);
125
126                 mutex_unlock(&inode->i_mutex);
127
128                 iput(inode);
129                 ac->ac_inode = NULL;
130         }
131         brelse(ac->ac_bh);
132         ac->ac_bh = NULL;
133         ac->ac_resv = NULL;
134 }
135
/*
 * Destroy an allocation context: release whatever it still holds via
 * ocfs2_free_ac_resource(), then free the context memory itself.
 * @ac must not be used after this call.
 */
void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac)
{
        /* Locks, inode and buffer references go first ... */
        ocfs2_free_ac_resource(ac);

        /* ... then the structure that tracked them. */
        kfree(ac);
}
141
142 static u32 ocfs2_bits_per_group(struct ocfs2_chain_list *cl)
143 {
144         return (u32)le16_to_cpu(cl->cl_cpg) * (u32)le16_to_cpu(cl->cl_bpc);
145 }
146
/*
 * Report a group descriptor validation problem.
 *
 * The expansion refers to 'resize' and 'sb', which must both be in scope
 * where the macro is used.  With a non-zero 'resize' the message is only
 * logged through mlog(ML_ERROR) with a newline appended; otherwise it is
 * handed to ocfs2_error() for the superblock.
 */
#define do_error(fmt, ...)                                              \
        do{                                                             \
                if (resize)                                     \
                        mlog(ML_ERROR, fmt "\n", ##__VA_ARGS__);        \
                else                                                    \
                        ocfs2_error(sb, fmt, ##__VA_ARGS__);            \
        } while (0)
154
155 static int ocfs2_validate_gd_self(struct super_block *sb,
156                                   struct buffer_head *bh,
157                                   int resize)
158 {
159         struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
160
161         if (!OCFS2_IS_VALID_GROUP_DESC(gd)) {
162                 do_error("Group descriptor #%llu has bad signature %.*s",
163                          (unsigned long long)bh->b_blocknr, 7,
164                          gd->bg_signature);
165                 return -EINVAL;
166         }
167
168         if (le64_to_cpu(gd->bg_blkno) != bh->b_blocknr) {
169                 do_error("Group descriptor #%llu has an invalid bg_blkno "
170                          "of %llu",
171                          (unsigned long long)bh->b_blocknr,
172                          (unsigned long long)le64_to_cpu(gd->bg_blkno));
173                 return -EINVAL;
174         }
175
176         if (le32_to_cpu(gd->bg_generation) != OCFS2_SB(sb)->fs_generation) {
177                 do_error("Group descriptor #%llu has an invalid "
178                          "fs_generation of #%u",
179                          (unsigned long long)bh->b_blocknr,
180                          le32_to_cpu(gd->bg_generation));
181                 return -EINVAL;
182         }
183
184         if (le16_to_cpu(gd->bg_free_bits_count) > le16_to_cpu(gd->bg_bits)) {
185                 do_error("Group descriptor #%llu has bit count %u but "
186                          "claims that %u are free",
187                          (unsigned long long)bh->b_blocknr,
188                          le16_to_cpu(gd->bg_bits),
189                          le16_to_cpu(gd->bg_free_bits_count));
190                 return -EINVAL;
191         }
192
193         if (le16_to_cpu(gd->bg_bits) > (8 * le16_to_cpu(gd->bg_size))) {
194                 do_error("Group descriptor #%llu has bit count %u but "
195                          "max bitmap bits of %u",
196                          (unsigned long long)bh->b_blocknr,
197                          le16_to_cpu(gd->bg_bits),
198                          8 * le16_to_cpu(gd->bg_size));
199                 return -EINVAL;
200         }
201
202         return 0;
203 }
204
205 static int ocfs2_validate_gd_parent(struct super_block *sb,
206                                     struct ocfs2_dinode *di,
207                                     struct buffer_head *bh,
208                                     int resize)
209 {
210         unsigned int max_bits;
211         struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
212
213         if (di->i_blkno != gd->bg_parent_dinode) {
214                 do_error("Group descriptor #%llu has bad parent "
215                          "pointer (%llu, expected %llu)",
216                          (unsigned long long)bh->b_blocknr,
217                          (unsigned long long)le64_to_cpu(gd->bg_parent_dinode),
218                          (unsigned long long)le64_to_cpu(di->i_blkno));
219                 return -EINVAL;
220         }
221
222         max_bits = le16_to_cpu(di->id2.i_chain.cl_cpg) * le16_to_cpu(di->id2.i_chain.cl_bpc);
223         if (le16_to_cpu(gd->bg_bits) > max_bits) {
224                 do_error("Group descriptor #%llu has bit count of %u",
225                          (unsigned long long)bh->b_blocknr,
226                          le16_to_cpu(gd->bg_bits));
227                 return -EINVAL;
228         }
229
230         /* In resize, we may meet the case bg_chain == cl_next_free_rec. */
231         if ((le16_to_cpu(gd->bg_chain) >
232              le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) ||
233             ((le16_to_cpu(gd->bg_chain) ==
234              le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) && !resize)) {
235                 do_error("Group descriptor #%llu has bad chain %u",
236                          (unsigned long long)bh->b_blocknr,
237                          le16_to_cpu(gd->bg_chain));
238                 return -EINVAL;
239         }
240
241         return 0;
242 }
243
244 #undef do_error
245
246 /*
247  * This version only prints errors.  It does not fail the filesystem, and
248  * exists only for resize.
249  */
250 int ocfs2_check_group_descriptor(struct super_block *sb,
251                                  struct ocfs2_dinode *di,
252                                  struct buffer_head *bh)
253 {
254         int rc;
255         struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
256
257         BUG_ON(!buffer_uptodate(bh));
258
259         /*
260          * If the ecc fails, we return the error but otherwise
261          * leave the filesystem running.  We know any error is
262          * local to this block.
263          */
264         rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &gd->bg_check);
265         if (rc) {
266                 mlog(ML_ERROR,
267                      "Checksum failed for group descriptor %llu\n",
268                      (unsigned long long)bh->b_blocknr);
269         } else
270                 rc = ocfs2_validate_gd_self(sb, bh, 1);
271         if (!rc)
272                 rc = ocfs2_validate_gd_parent(sb, di, bh, 1);
273
274         return rc;
275 }
276
277 static int ocfs2_validate_group_descriptor(struct super_block *sb,
278                                            struct buffer_head *bh)
279 {
280         int rc;
281         struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
282
283         mlog(0, "Validating group descriptor %llu\n",
284              (unsigned long long)bh->b_blocknr);
285
286         BUG_ON(!buffer_uptodate(bh));
287
288         /*
289          * If the ecc fails, we return the error but otherwise
290          * leave the filesystem running.  We know any error is
291          * local to this block.
292          */
293         rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &gd->bg_check);
294         if (rc)
295                 return rc;
296
297         /*
298          * Errors after here are fatal.
299          */
300
301         return ocfs2_validate_gd_self(sb, bh, 0);
302 }
303
304 int ocfs2_read_group_descriptor(struct inode *inode, struct ocfs2_dinode *di,
305                                 u64 gd_blkno, struct buffer_head **bh)
306 {
307         int rc;
308         struct buffer_head *tmp = *bh;
309
310         rc = ocfs2_read_block(INODE_CACHE(inode), gd_blkno, &tmp,
311                               ocfs2_validate_group_descriptor);
312         if (rc)
313                 goto out;
314
315         rc = ocfs2_validate_gd_parent(inode->i_sb, di, tmp, 0);
316         if (rc) {
317                 brelse(tmp);
318                 goto out;
319         }
320
321         /* If ocfs2_read_block() got us a new bh, pass it up. */
322         if (!*bh)
323                 *bh = tmp;
324
325 out:
326         return rc;
327 }
328
329 static int ocfs2_block_group_fill(handle_t *handle,
330                                   struct inode *alloc_inode,
331                                   struct buffer_head *bg_bh,
332                                   u64 group_blkno,
333                                   u16 my_chain,
334                                   struct ocfs2_chain_list *cl)
335 {
336         int status = 0;
337         struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
338         struct super_block * sb = alloc_inode->i_sb;
339
340         mlog_entry_void();
341
342         if (((unsigned long long) bg_bh->b_blocknr) != group_blkno) {
343                 ocfs2_error(alloc_inode->i_sb, "group block (%llu) != "
344                             "b_blocknr (%llu)",
345                             (unsigned long long)group_blkno,
346                             (unsigned long long) bg_bh->b_blocknr);
347                 status = -EIO;
348                 goto bail;
349         }
350
351         status = ocfs2_journal_access_gd(handle,
352                                          INODE_CACHE(alloc_inode),
353                                          bg_bh,
354                                          OCFS2_JOURNAL_ACCESS_CREATE);
355         if (status < 0) {
356                 mlog_errno(status);
357                 goto bail;
358         }
359
360         memset(bg, 0, sb->s_blocksize);
361         strcpy(bg->bg_signature, OCFS2_GROUP_DESC_SIGNATURE);
362         bg->bg_generation = cpu_to_le32(OCFS2_SB(sb)->fs_generation);
363         bg->bg_size = cpu_to_le16(ocfs2_group_bitmap_size(sb));
364         bg->bg_bits = cpu_to_le16(ocfs2_bits_per_group(cl));
365         bg->bg_chain = cpu_to_le16(my_chain);
366         bg->bg_next_group = cl->cl_recs[my_chain].c_blkno;
367         bg->bg_parent_dinode = cpu_to_le64(OCFS2_I(alloc_inode)->ip_blkno);
368         bg->bg_blkno = cpu_to_le64(group_blkno);
369         /* set the 1st bit in the bitmap to account for the descriptor block */
370         ocfs2_set_bit(0, (unsigned long *)bg->bg_bitmap);
371         bg->bg_free_bits_count = cpu_to_le16(le16_to_cpu(bg->bg_bits) - 1);
372
373         ocfs2_journal_dirty(handle, bg_bh);
374
375         /* There is no need to zero out or otherwise initialize the
376          * other blocks in a group - All valid FS metadata in a block
377          * group stores the superblock fs_generation value at
378          * allocation time. */
379
380 bail:
381         mlog_exit(status);
382         return status;
383 }
384
385 static inline u16 ocfs2_find_smallest_chain(struct ocfs2_chain_list *cl)
386 {
387         u16 curr, best;
388
389         best = curr = 0;
390         while (curr < le16_to_cpu(cl->cl_count)) {
391                 if (le32_to_cpu(cl->cl_recs[best].c_total) >
392                     le32_to_cpu(cl->cl_recs[curr].c_total))
393                         best = curr;
394                 curr++;
395         }
396         return best;
397 }
398
399 /*
400  * We expect the block group allocator to already be locked.
401  */
402 static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
403                                    struct inode *alloc_inode,
404                                    struct buffer_head *bh,
405                                    u64 max_block,
406                                    u64 *last_alloc_group,
407                                    int flags)
408 {
409         int status, credits;
410         struct ocfs2_dinode *fe = (struct ocfs2_dinode *) bh->b_data;
411         struct ocfs2_chain_list *cl;
412         struct ocfs2_alloc_context *ac = NULL;
413         handle_t *handle = NULL;
414         u32 bit_off, num_bits;
415         u16 alloc_rec;
416         u64 bg_blkno;
417         struct buffer_head *bg_bh = NULL;
418         struct ocfs2_group_desc *bg;
419
420         BUG_ON(ocfs2_is_cluster_bitmap(alloc_inode));
421
422         mlog_entry_void();
423
424         cl = &fe->id2.i_chain;
425         status = ocfs2_reserve_clusters_with_limit(osb,
426                                                    le16_to_cpu(cl->cl_cpg),
427                                                    max_block, flags, &ac);
428         if (status < 0) {
429                 if (status != -ENOSPC)
430                         mlog_errno(status);
431                 goto bail;
432         }
433
434         credits = ocfs2_calc_group_alloc_credits(osb->sb,
435                                                  le16_to_cpu(cl->cl_cpg));
436         handle = ocfs2_start_trans(osb, credits);
437         if (IS_ERR(handle)) {
438                 status = PTR_ERR(handle);
439                 handle = NULL;
440                 mlog_errno(status);
441                 goto bail;
442         }
443
444         if (last_alloc_group && *last_alloc_group != 0) {
445                 mlog(0, "use old allocation group %llu for block group alloc\n",
446                      (unsigned long long)*last_alloc_group);
447                 ac->ac_last_group = *last_alloc_group;
448         }
449         status = ocfs2_claim_clusters(osb,
450                                       handle,
451                                       ac,
452                                       le16_to_cpu(cl->cl_cpg),
453                                       &bit_off,
454                                       &num_bits);
455         if (status < 0) {
456                 if (status != -ENOSPC)
457                         mlog_errno(status);
458                 goto bail;
459         }
460
461         alloc_rec = ocfs2_find_smallest_chain(cl);
462
463         /* setup the group */
464         bg_blkno = ocfs2_clusters_to_blocks(osb->sb, bit_off);
465         mlog(0, "new descriptor, record %u, at block %llu\n",
466              alloc_rec, (unsigned long long)bg_blkno);
467
468         bg_bh = sb_getblk(osb->sb, bg_blkno);
469         if (!bg_bh) {
470                 status = -EIO;
471                 mlog_errno(status);
472                 goto bail;
473         }
474         ocfs2_set_new_buffer_uptodate(INODE_CACHE(alloc_inode), bg_bh);
475
476         status = ocfs2_block_group_fill(handle,
477                                         alloc_inode,
478                                         bg_bh,
479                                         bg_blkno,
480                                         alloc_rec,
481                                         cl);
482         if (status < 0) {
483                 mlog_errno(status);
484                 goto bail;
485         }
486
487         bg = (struct ocfs2_group_desc *) bg_bh->b_data;
488
489         status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode),
490                                          bh, OCFS2_JOURNAL_ACCESS_WRITE);
491         if (status < 0) {
492                 mlog_errno(status);
493                 goto bail;
494         }
495
496         le32_add_cpu(&cl->cl_recs[alloc_rec].c_free,
497                      le16_to_cpu(bg->bg_free_bits_count));
498         le32_add_cpu(&cl->cl_recs[alloc_rec].c_total, le16_to_cpu(bg->bg_bits));
499         cl->cl_recs[alloc_rec].c_blkno  = cpu_to_le64(bg_blkno);
500         if (le16_to_cpu(cl->cl_next_free_rec) < le16_to_cpu(cl->cl_count))
501                 le16_add_cpu(&cl->cl_next_free_rec, 1);
502
503         le32_add_cpu(&fe->id1.bitmap1.i_used, le16_to_cpu(bg->bg_bits) -
504                                         le16_to_cpu(bg->bg_free_bits_count));
505         le32_add_cpu(&fe->id1.bitmap1.i_total, le16_to_cpu(bg->bg_bits));
506         le32_add_cpu(&fe->i_clusters, le16_to_cpu(cl->cl_cpg));
507
508         ocfs2_journal_dirty(handle, bh);
509
510         spin_lock(&OCFS2_I(alloc_inode)->ip_lock);
511         OCFS2_I(alloc_inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
512         fe->i_size = cpu_to_le64(ocfs2_clusters_to_bytes(alloc_inode->i_sb,
513                                              le32_to_cpu(fe->i_clusters)));
514         spin_unlock(&OCFS2_I(alloc_inode)->ip_lock);
515         i_size_write(alloc_inode, le64_to_cpu(fe->i_size));
516         alloc_inode->i_blocks = ocfs2_inode_sector_count(alloc_inode);
517
518         status = 0;
519
520         /* save the new last alloc group so that the caller can cache it. */
521         if (last_alloc_group)
522                 *last_alloc_group = ac->ac_last_group;
523
524 bail:
525         if (handle)
526                 ocfs2_commit_trans(osb, handle);
527
528         if (ac)
529                 ocfs2_free_alloc_context(ac);
530
531         brelse(bg_bh);
532
533         mlog_exit(status);
534         return status;
535 }
536
537 static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb,
538                                        struct ocfs2_alloc_context *ac,
539                                        int type,
540                                        u32 slot,
541                                        u64 *last_alloc_group,
542                                        int flags)
543 {
544         int status;
545         u32 bits_wanted = ac->ac_bits_wanted;
546         struct inode *alloc_inode;
547         struct buffer_head *bh = NULL;
548         struct ocfs2_dinode *fe;
549         u32 free_bits;
550
551         mlog_entry_void();
552
553         alloc_inode = ocfs2_get_system_file_inode(osb, type, slot);
554         if (!alloc_inode) {
555                 mlog_errno(-EINVAL);
556                 return -EINVAL;
557         }
558
559         mutex_lock(&alloc_inode->i_mutex);
560
561         status = ocfs2_inode_lock(alloc_inode, &bh, 1);
562         if (status < 0) {
563                 mutex_unlock(&alloc_inode->i_mutex);
564                 iput(alloc_inode);
565
566                 mlog_errno(status);
567                 return status;
568         }
569
570         ac->ac_inode = alloc_inode;
571         ac->ac_alloc_slot = slot;
572
573         fe = (struct ocfs2_dinode *) bh->b_data;
574
575         /* The bh was validated by the inode read inside
576          * ocfs2_inode_lock().  Any corruption is a code bug. */
577         BUG_ON(!OCFS2_IS_VALID_DINODE(fe));
578
579         if (!(fe->i_flags & cpu_to_le32(OCFS2_CHAIN_FL))) {
580                 ocfs2_error(alloc_inode->i_sb, "Invalid chain allocator %llu",
581                             (unsigned long long)le64_to_cpu(fe->i_blkno));
582                 status = -EIO;
583                 goto bail;
584         }
585
586         free_bits = le32_to_cpu(fe->id1.bitmap1.i_total) -
587                 le32_to_cpu(fe->id1.bitmap1.i_used);
588
589         if (bits_wanted > free_bits) {
590                 /* cluster bitmap never grows */
591                 if (ocfs2_is_cluster_bitmap(alloc_inode)) {
592                         mlog(0, "Disk Full: wanted=%u, free_bits=%u\n",
593                              bits_wanted, free_bits);
594                         status = -ENOSPC;
595                         goto bail;
596                 }
597
598                 if (!(flags & ALLOC_NEW_GROUP)) {
599                         mlog(0, "Alloc File %u Full: wanted=%u, free_bits=%u, "
600                              "and we don't alloc a new group for it.\n",
601                              slot, bits_wanted, free_bits);
602                         status = -ENOSPC;
603                         goto bail;
604                 }
605
606                 status = ocfs2_block_group_alloc(osb, alloc_inode, bh,
607                                                  ac->ac_max_block,
608                                                  last_alloc_group, flags);
609                 if (status < 0) {
610                         if (status != -ENOSPC)
611                                 mlog_errno(status);
612                         goto bail;
613                 }
614                 atomic_inc(&osb->alloc_stats.bg_extends);
615
616                 /* You should never ask for this much metadata */
617                 BUG_ON(bits_wanted >
618                        (le32_to_cpu(fe->id1.bitmap1.i_total)
619                         - le32_to_cpu(fe->id1.bitmap1.i_used)));
620         }
621
622         get_bh(bh);
623         ac->ac_bh = bh;
624 bail:
625         brelse(bh);
626
627         mlog_exit(status);
628         return status;
629 }
630
631 static void ocfs2_init_inode_steal_slot(struct ocfs2_super *osb)
632 {
633         spin_lock(&osb->osb_lock);
634         osb->s_inode_steal_slot = OCFS2_INVALID_SLOT;
635         spin_unlock(&osb->osb_lock);
636         atomic_set(&osb->s_num_inodes_stolen, 0);
637 }
638
639 static void ocfs2_init_meta_steal_slot(struct ocfs2_super *osb)
640 {
641         spin_lock(&osb->osb_lock);
642         osb->s_meta_steal_slot = OCFS2_INVALID_SLOT;
643         spin_unlock(&osb->osb_lock);
644         atomic_set(&osb->s_num_meta_stolen, 0);
645 }
646
/*
 * Reset both pieces of stealing state (inode and metadata) to "no steal
 * slot, nothing stolen".
 */
void ocfs2_init_steal_slots(struct ocfs2_super *osb)
{
        /* Inode allocator state first, then the extent allocator state. */
        ocfs2_init_inode_steal_slot(osb);
        ocfs2_init_meta_steal_slot(osb);
}
652
653 static void __ocfs2_set_steal_slot(struct ocfs2_super *osb, int slot, int type)
654 {
655         spin_lock(&osb->osb_lock);
656         if (type == INODE_ALLOC_SYSTEM_INODE)
657                 osb->s_inode_steal_slot = slot;
658         else if (type == EXTENT_ALLOC_SYSTEM_INODE)
659                 osb->s_meta_steal_slot = slot;
660         spin_unlock(&osb->osb_lock);
661 }
662
663 static int __ocfs2_get_steal_slot(struct ocfs2_super *osb, int type)
664 {
665         int slot = OCFS2_INVALID_SLOT;
666
667         spin_lock(&osb->osb_lock);
668         if (type == INODE_ALLOC_SYSTEM_INODE)
669                 slot = osb->s_inode_steal_slot;
670         else if (type == EXTENT_ALLOC_SYSTEM_INODE)
671                 slot = osb->s_meta_steal_slot;
672         spin_unlock(&osb->osb_lock);
673
674         return slot;
675 }
676
677 static int ocfs2_get_inode_steal_slot(struct ocfs2_super *osb)
678 {
679         return __ocfs2_get_steal_slot(osb, INODE_ALLOC_SYSTEM_INODE);
680 }
681
682 static int ocfs2_get_meta_steal_slot(struct ocfs2_super *osb)
683 {
684         return __ocfs2_get_steal_slot(osb, EXTENT_ALLOC_SYSTEM_INODE);
685 }
686
687 static int ocfs2_steal_resource(struct ocfs2_super *osb,
688                                 struct ocfs2_alloc_context *ac,
689                                 int type)
690 {
691         int i, status = -ENOSPC;
692         int slot = __ocfs2_get_steal_slot(osb, type);
693
694         /* Start to steal resource from the first slot after ours. */
695         if (slot == OCFS2_INVALID_SLOT)
696                 slot = osb->slot_num + 1;
697
698         for (i = 0; i < osb->max_slots; i++, slot++) {
699                 if (slot == osb->max_slots)
700                         slot = 0;
701
702                 if (slot == osb->slot_num)
703                         continue;
704
705                 status = ocfs2_reserve_suballoc_bits(osb, ac,
706                                                      type,
707                                                      (u32)slot, NULL,
708                                                      NOT_ALLOC_NEW_GROUP);
709                 if (status >= 0) {
710                         __ocfs2_set_steal_slot(osb, slot, type);
711                         break;
712                 }
713
714                 ocfs2_free_ac_resource(ac);
715         }
716
717         return status;
718 }
719
720 static int ocfs2_steal_inode(struct ocfs2_super *osb,
721                              struct ocfs2_alloc_context *ac)
722 {
723         return ocfs2_steal_resource(osb, ac, INODE_ALLOC_SYSTEM_INODE);
724 }
725
726 static int ocfs2_steal_meta(struct ocfs2_super *osb,
727                             struct ocfs2_alloc_context *ac)
728 {
729         return ocfs2_steal_resource(osb, ac, EXTENT_ALLOC_SYSTEM_INODE);
730 }
731
732 int ocfs2_reserve_new_metadata_blocks(struct ocfs2_super *osb,
733                                       int blocks,
734                                       struct ocfs2_alloc_context **ac)
735 {
736         int status;
737         int slot = ocfs2_get_meta_steal_slot(osb);
738
739         *ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
740         if (!(*ac)) {
741                 status = -ENOMEM;
742                 mlog_errno(status);
743                 goto bail;
744         }
745
746         (*ac)->ac_bits_wanted = blocks;
747         (*ac)->ac_which = OCFS2_AC_USE_META;
748         (*ac)->ac_group_search = ocfs2_block_group_search;
749
750         if (slot != OCFS2_INVALID_SLOT &&
751                 atomic_read(&osb->s_num_meta_stolen) < OCFS2_MAX_TO_STEAL)
752                 goto extent_steal;
753
754         atomic_set(&osb->s_num_meta_stolen, 0);
755         status = ocfs2_reserve_suballoc_bits(osb, (*ac),
756                                              EXTENT_ALLOC_SYSTEM_INODE,
757                                              (u32)osb->slot_num, NULL,
758                                              ALLOC_NEW_GROUP);
759
760
761         if (status >= 0) {
762                 status = 0;
763                 if (slot != OCFS2_INVALID_SLOT)
764                         ocfs2_init_meta_steal_slot(osb);
765                 goto bail;
766         } else if (status < 0 && status != -ENOSPC) {
767                 mlog_errno(status);
768                 goto bail;
769         }
770
771         ocfs2_free_ac_resource(*ac);
772
773 extent_steal:
774         status = ocfs2_steal_meta(osb, *ac);
775         atomic_inc(&osb->s_num_meta_stolen);
776         if (status < 0) {
777                 if (status != -ENOSPC)
778                         mlog_errno(status);
779                 goto bail;
780         }
781
782         status = 0;
783 bail:
784         if ((status < 0) && *ac) {
785                 ocfs2_free_alloc_context(*ac);
786                 *ac = NULL;
787         }
788
789         mlog_exit(status);
790         return status;
791 }
792
/*
 * Reserve the metadata blocks reported by ocfs2_extend_meta_needed() for
 * the extent list @root_el.  See ocfs2_reserve_new_metadata_blocks() for
 * the meaning of the return value and of *@ac.
 */
int ocfs2_reserve_new_metadata(struct ocfs2_super *osb,
                               struct ocfs2_extent_list *root_el,
                               struct ocfs2_alloc_context **ac)
{
        int blocks_needed = ocfs2_extend_meta_needed(root_el);

        return ocfs2_reserve_new_metadata_blocks(osb, blocks_needed, ac);
}
801
802 int ocfs2_reserve_new_inode(struct ocfs2_super *osb,
803                             struct ocfs2_alloc_context **ac)
804 {
805         int status;
806         int slot = ocfs2_get_inode_steal_slot(osb);
807         u64 alloc_group;
808
809         *ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
810         if (!(*ac)) {
811                 status = -ENOMEM;
812                 mlog_errno(status);
813                 goto bail;
814         }
815
816         (*ac)->ac_bits_wanted = 1;
817         (*ac)->ac_which = OCFS2_AC_USE_INODE;
818
819         (*ac)->ac_group_search = ocfs2_block_group_search;
820
821         /*
822          * stat(2) can't handle i_ino > 32bits, so we tell the
823          * lower levels not to allocate us a block group past that
824          * limit.  The 'inode64' mount option avoids this behavior.
825          */
826         if (!(osb->s_mount_opt & OCFS2_MOUNT_INODE64))
827                 (*ac)->ac_max_block = (u32)~0U;
828
829         /*
830          * slot is set when we successfully steal inode from other nodes.
831          * It is reset in 3 places:
832          * 1. when we flush the truncate log
833          * 2. when we complete local alloc recovery.
834          * 3. when we successfully allocate from our own slot.
835          * After it is set, we will go on stealing inodes until we find the
836          * need to check our slots to see whether there is some space for us.
837          */
838         if (slot != OCFS2_INVALID_SLOT &&
839             atomic_read(&osb->s_num_inodes_stolen) < OCFS2_MAX_TO_STEAL)
840                 goto inode_steal;
841
842         atomic_set(&osb->s_num_inodes_stolen, 0);
843         alloc_group = osb->osb_inode_alloc_group;
844         status = ocfs2_reserve_suballoc_bits(osb, *ac,
845                                              INODE_ALLOC_SYSTEM_INODE,
846                                              (u32)osb->slot_num,
847                                              &alloc_group,
848                                              ALLOC_NEW_GROUP |
849                                              ALLOC_GROUPS_FROM_GLOBAL);
850         if (status >= 0) {
851                 status = 0;
852
853                 spin_lock(&osb->osb_lock);
854                 osb->osb_inode_alloc_group = alloc_group;
855                 spin_unlock(&osb->osb_lock);
856                 mlog(0, "after reservation, new allocation group is "
857                      "%llu\n", (unsigned long long)alloc_group);
858
859                 /*
860                  * Some inodes must be freed by us, so try to allocate
861                  * from our own next time.
862                  */
863                 if (slot != OCFS2_INVALID_SLOT)
864                         ocfs2_init_inode_steal_slot(osb);
865                 goto bail;
866         } else if (status < 0 && status != -ENOSPC) {
867                 mlog_errno(status);
868                 goto bail;
869         }
870
871         ocfs2_free_ac_resource(*ac);
872
873 inode_steal:
874         status = ocfs2_steal_inode(osb, *ac);
875         atomic_inc(&osb->s_num_inodes_stolen);
876         if (status < 0) {
877                 if (status != -ENOSPC)
878                         mlog_errno(status);
879                 goto bail;
880         }
881
882         status = 0;
883 bail:
884         if ((status < 0) && *ac) {
885                 ocfs2_free_alloc_context(*ac);
886                 *ac = NULL;
887         }
888
889         mlog_exit(status);
890         return status;
891 }
892
893 /* local alloc code has to do the same thing, so rather than do this
894  * twice.. */
895 int ocfs2_reserve_cluster_bitmap_bits(struct ocfs2_super *osb,
896                                       struct ocfs2_alloc_context *ac)
897 {
898         int status;
899
900         ac->ac_which = OCFS2_AC_USE_MAIN;
901         ac->ac_group_search = ocfs2_cluster_group_search;
902
903         status = ocfs2_reserve_suballoc_bits(osb, ac,
904                                              GLOBAL_BITMAP_SYSTEM_INODE,
905                                              OCFS2_INVALID_SLOT, NULL,
906                                              ALLOC_NEW_GROUP);
907         if (status < 0 && status != -ENOSPC) {
908                 mlog_errno(status);
909                 goto bail;
910         }
911
912 bail:
913         return status;
914 }
915
916 /* Callers don't need to care which bitmap (local alloc or main) to
917  * use so we figure it out for them, but unfortunately this clutters
918  * things a bit. */
919 static int ocfs2_reserve_clusters_with_limit(struct ocfs2_super *osb,
920                                              u32 bits_wanted, u64 max_block,
921                                              int flags,
922                                              struct ocfs2_alloc_context **ac)
923 {
924         int status;
925
926         mlog_entry_void();
927
928         *ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
929         if (!(*ac)) {
930                 status = -ENOMEM;
931                 mlog_errno(status);
932                 goto bail;
933         }
934
935         (*ac)->ac_bits_wanted = bits_wanted;
936         (*ac)->ac_max_block = max_block;
937
938         status = -ENOSPC;
939         if (!(flags & ALLOC_GROUPS_FROM_GLOBAL) &&
940             ocfs2_alloc_should_use_local(osb, bits_wanted)) {
941                 status = ocfs2_reserve_local_alloc_bits(osb,
942                                                         bits_wanted,
943                                                         *ac);
944                 if (status == -EFBIG) {
945                         /* The local alloc window is outside ac_max_block.
946                          * use the main bitmap. */
947                         status = -ENOSPC;
948                 } else if ((status < 0) && (status != -ENOSPC)) {
949                         mlog_errno(status);
950                         goto bail;
951                 }
952         }
953
954         if (status == -ENOSPC) {
955                 status = ocfs2_reserve_cluster_bitmap_bits(osb, *ac);
956                 if (status < 0) {
957                         if (status != -ENOSPC)
958                                 mlog_errno(status);
959                         goto bail;
960                 }
961         }
962
963         status = 0;
964 bail:
965         if ((status < 0) && *ac) {
966                 ocfs2_free_alloc_context(*ac);
967                 *ac = NULL;
968         }
969
970         mlog_exit(status);
971         return status;
972 }
973
974 int ocfs2_reserve_clusters(struct ocfs2_super *osb,
975                            u32 bits_wanted,
976                            struct ocfs2_alloc_context **ac)
977 {
978         return ocfs2_reserve_clusters_with_limit(osb, bits_wanted, 0,
979                                                  ALLOC_NEW_GROUP, ac);
980 }
981
982 /*
983  * More or less lifted from ext3. I'll leave their description below:
984  *
985  * "For ext3 allocations, we must not reuse any blocks which are
986  * allocated in the bitmap buffer's "last committed data" copy.  This
987  * prevents deletes from freeing up the page for reuse until we have
988  * committed the delete transaction.
989  *
990  * If we didn't do this, then deleting something and reallocating it as
991  * data would allow the old block to be overwritten before the
992  * transaction committed (because we force data to disk before commit).
993  * This would lead to corruption if we crashed between overwriting the
994  * data and committing the delete.
995  *
996  * @@@ We may want to make this allocation behaviour conditional on
997  * data-writes at some point, and disable it for metadata allocations or
998  * sync-data inodes."
999  *
1000  * Note: OCFS2 already does this differently for metadata vs data
1001  * allocations, as those bitmaps are separate and undo access is never
1002  * called on a metadata group descriptor.
1003  */
1004 static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh,
1005                                          int nr)
1006 {
1007         struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
1008         int ret;
1009
1010         if (ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap))
1011                 return 0;
1012
1013         if (!buffer_jbd(bg_bh))
1014                 return 1;
1015
1016         jbd_lock_bh_state(bg_bh);
1017         bg = (struct ocfs2_group_desc *) bh2jh(bg_bh)->b_committed_data;
1018         if (bg)
1019                 ret = !ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap);
1020         else
1021                 ret = 1;
1022         jbd_unlock_bh_state(bg_bh);
1023
1024         return ret;
1025 }
1026
1027 static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb,
1028                                              struct buffer_head *bg_bh,
1029                                              unsigned int bits_wanted,
1030                                              unsigned int total_bits,
1031                                              u16 *bit_off,
1032                                              u16 *bits_found)
1033 {
1034         void *bitmap;
1035         u16 best_offset, best_size;
1036         int offset, start, found, status = 0;
1037         struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
1038
1039         /* Callers got this descriptor from
1040          * ocfs2_read_group_descriptor().  Any corruption is a code bug. */
1041         BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
1042
1043         found = start = best_offset = best_size = 0;
1044         bitmap = bg->bg_bitmap;
1045
1046         while((offset = ocfs2_find_next_zero_bit(bitmap, total_bits, start)) != -1) {
1047                 if (offset == total_bits)
1048                         break;
1049
1050                 if (!ocfs2_test_bg_bit_allocatable(bg_bh, offset)) {
1051                         /* We found a zero, but we can't use it as it
1052                          * hasn't been put to disk yet! */
1053                         found = 0;
1054                         start = offset + 1;
1055                 } else if (offset == start) {
1056                         /* we found a zero */
1057                         found++;
1058                         /* move start to the next bit to test */
1059                         start++;
1060                 } else {
1061                         /* got a zero after some ones */
1062                         found = 1;
1063                         start = offset + 1;
1064                 }
1065                 if (found > best_size) {
1066                         best_size = found;
1067                         best_offset = start - found;
1068                 }
1069                 /* we got everything we needed */
1070                 if (found == bits_wanted) {
1071                         /* mlog(0, "Found it all!\n"); */
1072                         break;
1073                 }
1074         }
1075
1076         /* XXX: I think the first clause is equivalent to the second
1077          *      - jlbec */
1078         if (found == bits_wanted) {
1079                 *bit_off = start - found;
1080                 *bits_found = found;
1081         } else if (best_size) {
1082                 *bit_off = best_offset;
1083                 *bits_found = best_size;
1084         } else {
1085                 status = -ENOSPC;
1086                 /* No error log here -- see the comment above
1087                  * ocfs2_test_bg_bit_allocatable */
1088         }
1089
1090         return status;
1091 }
1092
1093 static inline int ocfs2_block_group_set_bits(handle_t *handle,
1094                                              struct inode *alloc_inode,
1095                                              struct ocfs2_group_desc *bg,
1096                                              struct buffer_head *group_bh,
1097                                              unsigned int bit_off,
1098                                              unsigned int num_bits)
1099 {
1100         int status;
1101         void *bitmap = bg->bg_bitmap;
1102         int journal_type = OCFS2_JOURNAL_ACCESS_WRITE;
1103
1104         mlog_entry_void();
1105
1106         /* All callers get the descriptor via
1107          * ocfs2_read_group_descriptor().  Any corruption is a code bug. */
1108         BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
1109         BUG_ON(le16_to_cpu(bg->bg_free_bits_count) < num_bits);
1110
1111         mlog(0, "block_group_set_bits: off = %u, num = %u\n", bit_off,
1112              num_bits);
1113
1114         if (ocfs2_is_cluster_bitmap(alloc_inode))
1115                 journal_type = OCFS2_JOURNAL_ACCESS_UNDO;
1116
1117         status = ocfs2_journal_access_gd(handle,
1118                                          INODE_CACHE(alloc_inode),
1119                                          group_bh,
1120                                          journal_type);
1121         if (status < 0) {
1122                 mlog_errno(status);
1123                 goto bail;
1124         }
1125
1126         le16_add_cpu(&bg->bg_free_bits_count, -num_bits);
1127         while(num_bits--)
1128                 ocfs2_set_bit(bit_off++, bitmap);
1129
1130         ocfs2_journal_dirty(handle, group_bh);
1131
1132 bail:
1133         mlog_exit(status);
1134         return status;
1135 }
1136
1137 /* find the one with the most empty bits */
1138 static inline u16 ocfs2_find_victim_chain(struct ocfs2_chain_list *cl)
1139 {
1140         u16 curr, best;
1141
1142         BUG_ON(!cl->cl_next_free_rec);
1143
1144         best = curr = 0;
1145         while (curr < le16_to_cpu(cl->cl_next_free_rec)) {
1146                 if (le32_to_cpu(cl->cl_recs[curr].c_free) >
1147                     le32_to_cpu(cl->cl_recs[best].c_free))
1148                         best = curr;
1149                 curr++;
1150         }
1151
1152         BUG_ON(best >= le16_to_cpu(cl->cl_next_free_rec));
1153         return best;
1154 }
1155
1156 static int ocfs2_relink_block_group(handle_t *handle,
1157                                     struct inode *alloc_inode,
1158                                     struct buffer_head *fe_bh,
1159                                     struct buffer_head *bg_bh,
1160                                     struct buffer_head *prev_bg_bh,
1161                                     u16 chain)
1162 {
1163         int status;
1164         /* there is a really tiny chance the journal calls could fail,
1165          * but we wouldn't want inconsistent blocks in *any* case. */
1166         u64 fe_ptr, bg_ptr, prev_bg_ptr;
1167         struct ocfs2_dinode *fe = (struct ocfs2_dinode *) fe_bh->b_data;
1168         struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
1169         struct ocfs2_group_desc *prev_bg = (struct ocfs2_group_desc *) prev_bg_bh->b_data;
1170
1171         /* The caller got these descriptors from
1172          * ocfs2_read_group_descriptor().  Any corruption is a code bug. */
1173         BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
1174         BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(prev_bg));
1175
1176         mlog(0, "Suballoc %llu, chain %u, move group %llu to top, prev = %llu\n",
1177              (unsigned long long)le64_to_cpu(fe->i_blkno), chain,
1178              (unsigned long long)le64_to_cpu(bg->bg_blkno),
1179              (unsigned long long)le64_to_cpu(prev_bg->bg_blkno));
1180
1181         fe_ptr = le64_to_cpu(fe->id2.i_chain.cl_recs[chain].c_blkno);
1182         bg_ptr = le64_to_cpu(bg->bg_next_group);
1183         prev_bg_ptr = le64_to_cpu(prev_bg->bg_next_group);
1184
1185         status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode),
1186                                          prev_bg_bh,
1187                                          OCFS2_JOURNAL_ACCESS_WRITE);
1188         if (status < 0) {
1189                 mlog_errno(status);
1190                 goto out_rollback;
1191         }
1192
1193         prev_bg->bg_next_group = bg->bg_next_group;
1194         ocfs2_journal_dirty(handle, prev_bg_bh);
1195
1196         status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode),
1197                                          bg_bh, OCFS2_JOURNAL_ACCESS_WRITE);
1198         if (status < 0) {
1199                 mlog_errno(status);
1200                 goto out_rollback;
1201         }
1202
1203         bg->bg_next_group = fe->id2.i_chain.cl_recs[chain].c_blkno;
1204         ocfs2_journal_dirty(handle, bg_bh);
1205
1206         status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode),
1207                                          fe_bh, OCFS2_JOURNAL_ACCESS_WRITE);
1208         if (status < 0) {
1209                 mlog_errno(status);
1210                 goto out_rollback;
1211         }
1212
1213         fe->id2.i_chain.cl_recs[chain].c_blkno = bg->bg_blkno;
1214         ocfs2_journal_dirty(handle, fe_bh);
1215
1216 out_rollback:
1217         if (status < 0) {
1218                 fe->id2.i_chain.cl_recs[chain].c_blkno = cpu_to_le64(fe_ptr);
1219                 bg->bg_next_group = cpu_to_le64(bg_ptr);
1220                 prev_bg->bg_next_group = cpu_to_le64(prev_bg_ptr);
1221         }
1222
1223         mlog_exit(status);
1224         return status;
1225 }
1226
1227 static inline int ocfs2_block_group_reasonably_empty(struct ocfs2_group_desc *bg,
1228                                                      u32 wanted)
1229 {
1230         return le16_to_cpu(bg->bg_free_bits_count) > wanted;
1231 }
1232
1233 /* return 0 on success, -ENOSPC to keep searching and any other < 0
1234  * value on error. */
1235 static int ocfs2_cluster_group_search(struct inode *inode,
1236                                       struct buffer_head *group_bh,
1237                                       u32 bits_wanted, u32 min_bits,
1238                                       u64 max_block,
1239                                       u16 *bit_off, u16 *bits_found)
1240 {
1241         int search = -ENOSPC;
1242         int ret;
1243         u64 blkoff;
1244         struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *) group_bh->b_data;
1245         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1246         u16 tmp_off, tmp_found;
1247         unsigned int max_bits, gd_cluster_off;
1248
1249         BUG_ON(!ocfs2_is_cluster_bitmap(inode));
1250
1251         if (gd->bg_free_bits_count) {
1252                 max_bits = le16_to_cpu(gd->bg_bits);
1253
1254                 /* Tail groups in cluster bitmaps which aren't cpg
1255                  * aligned are prone to partial extention by a failed
1256                  * fs resize. If the file system resize never got to
1257                  * update the dinode cluster count, then we don't want
1258                  * to trust any clusters past it, regardless of what
1259                  * the group descriptor says. */
1260                 gd_cluster_off = ocfs2_blocks_to_clusters(inode->i_sb,
1261                                                           le64_to_cpu(gd->bg_blkno));
1262                 if ((gd_cluster_off + max_bits) >
1263                     OCFS2_I(inode)->ip_clusters) {
1264                         max_bits = OCFS2_I(inode)->ip_clusters - gd_cluster_off;
1265                         mlog(0, "Desc %llu, bg_bits %u, clusters %u, use %u\n",
1266                              (unsigned long long)le64_to_cpu(gd->bg_blkno),
1267                              le16_to_cpu(gd->bg_bits),
1268                              OCFS2_I(inode)->ip_clusters, max_bits);
1269                 }
1270
1271                 ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb),
1272                                                         group_bh, bits_wanted,
1273                                                         max_bits,
1274                                                         &tmp_off, &tmp_found);
1275                 if (ret)
1276                         return ret;
1277
1278                 if (max_block) {
1279                         blkoff = ocfs2_clusters_to_blocks(inode->i_sb,
1280                                                           gd_cluster_off +
1281                                                           tmp_off + tmp_found);
1282                         mlog(0, "Checking %llu against %llu\n",
1283                              (unsigned long long)blkoff,
1284                              (unsigned long long)max_block);
1285                         if (blkoff > max_block)
1286                                 return -ENOSPC;
1287                 }
1288
1289                 /* ocfs2_block_group_find_clear_bits() might
1290                  * return success, but we still want to return
1291                  * -ENOSPC unless it found the minimum number
1292                  * of bits. */
1293                 if (min_bits <= tmp_found) {
1294                         *bit_off = tmp_off;
1295                         *bits_found = tmp_found;
1296                         search = 0; /* success */
1297                 } else if (tmp_found) {
1298                         /*
1299                          * Don't show bits which we'll be returning
1300                          * for allocation to the local alloc bitmap.
1301                          */
1302                         ocfs2_local_alloc_seen_free_bits(osb, tmp_found);
1303                 }
1304         }
1305
1306         return search;
1307 }
1308
1309 static int ocfs2_block_group_search(struct inode *inode,
1310                                     struct buffer_head *group_bh,
1311                                     u32 bits_wanted, u32 min_bits,
1312                                     u64 max_block,
1313                                     u16 *bit_off, u16 *bits_found)
1314 {
1315         int ret = -ENOSPC;
1316         u64 blkoff;
1317         struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) group_bh->b_data;
1318
1319         BUG_ON(min_bits != 1);
1320         BUG_ON(ocfs2_is_cluster_bitmap(inode));
1321
1322         if (bg->bg_free_bits_count) {
1323                 ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb),
1324                                                         group_bh, bits_wanted,
1325                                                         le16_to_cpu(bg->bg_bits),
1326                                                         bit_off, bits_found);
1327                 if (!ret && max_block) {
1328                         blkoff = le64_to_cpu(bg->bg_blkno) + *bit_off +
1329                                 *bits_found;
1330                         mlog(0, "Checking %llu against %llu\n",
1331                              (unsigned long long)blkoff,
1332                              (unsigned long long)max_block);
1333                         if (blkoff > max_block)
1334                                 ret = -ENOSPC;
1335                 }
1336         }
1337
1338         return ret;
1339 }
1340
1341 static int ocfs2_alloc_dinode_update_counts(struct inode *inode,
1342                                        handle_t *handle,
1343                                        struct buffer_head *di_bh,
1344                                        u32 num_bits,
1345                                        u16 chain)
1346 {
1347         int ret;
1348         u32 tmp_used;
1349         struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data;
1350         struct ocfs2_chain_list *cl = (struct ocfs2_chain_list *) &di->id2.i_chain;
1351
1352         ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
1353                                       OCFS2_JOURNAL_ACCESS_WRITE);
1354         if (ret < 0) {
1355                 mlog_errno(ret);
1356                 goto out;
1357         }
1358
1359         tmp_used = le32_to_cpu(di->id1.bitmap1.i_used);
1360         di->id1.bitmap1.i_used = cpu_to_le32(num_bits + tmp_used);
1361         le32_add_cpu(&cl->cl_recs[chain].c_free, -num_bits);
1362         ocfs2_journal_dirty(handle, di_bh);
1363
1364 out:
1365         return ret;
1366 }
1367
1368 static int ocfs2_search_one_group(struct ocfs2_alloc_context *ac,
1369                                   handle_t *handle,
1370                                   u32 bits_wanted,
1371                                   u32 min_bits,
1372                                   u16 *bit_off,
1373                                   unsigned int *num_bits,
1374                                   u64 gd_blkno,
1375                                   u16 *bits_left)
1376 {
1377         int ret;
1378         u16 found;
1379         struct buffer_head *group_bh = NULL;
1380         struct ocfs2_group_desc *gd;
1381         struct ocfs2_dinode *di = (struct ocfs2_dinode *)ac->ac_bh->b_data;
1382         struct inode *alloc_inode = ac->ac_inode;
1383
1384         ret = ocfs2_read_group_descriptor(alloc_inode, di, gd_blkno,
1385                                           &group_bh);
1386         if (ret < 0) {
1387                 mlog_errno(ret);
1388                 return ret;
1389         }
1390
1391         gd = (struct ocfs2_group_desc *) group_bh->b_data;
1392         ret = ac->ac_group_search(alloc_inode, group_bh, bits_wanted, min_bits,
1393                                   ac->ac_max_block, bit_off, &found);
1394         if (ret < 0) {
1395                 if (ret != -ENOSPC)
1396                         mlog_errno(ret);
1397                 goto out;
1398         }
1399
1400         *num_bits = found;
1401
1402         ret = ocfs2_alloc_dinode_update_counts(alloc_inode, handle, ac->ac_bh,
1403                                                *num_bits,
1404                                                le16_to_cpu(gd->bg_chain));
1405         if (ret < 0) {
1406                 mlog_errno(ret);
1407                 goto out;
1408         }
1409
1410         ret = ocfs2_block_group_set_bits(handle, alloc_inode, gd, group_bh,
1411                                          *bit_off, *num_bits);
1412         if (ret < 0)
1413                 mlog_errno(ret);
1414
1415         *bits_left = le16_to_cpu(gd->bg_free_bits_count);
1416
1417 out:
1418         brelse(group_bh);
1419
1420         return ret;
1421 }
1422
1423 static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
1424                               handle_t *handle,
1425                               u32 bits_wanted,
1426                               u32 min_bits,
1427                               u16 *bit_off,
1428                               unsigned int *num_bits,
1429                               u64 *bg_blkno,
1430                               u16 *bits_left)
1431 {
1432         int status;
1433         u16 chain, tmp_bits;
1434         u32 tmp_used;
1435         u64 next_group;
1436         struct inode *alloc_inode = ac->ac_inode;
1437         struct buffer_head *group_bh = NULL;
1438         struct buffer_head *prev_group_bh = NULL;
1439         struct ocfs2_dinode *fe = (struct ocfs2_dinode *) ac->ac_bh->b_data;
1440         struct ocfs2_chain_list *cl = (struct ocfs2_chain_list *) &fe->id2.i_chain;
1441         struct ocfs2_group_desc *bg;
1442
1443         chain = ac->ac_chain;
1444         mlog(0, "trying to alloc %u bits from chain %u, inode %llu\n",
1445              bits_wanted, chain,
1446              (unsigned long long)OCFS2_I(alloc_inode)->ip_blkno);
1447
1448         status = ocfs2_read_group_descriptor(alloc_inode, fe,
1449                                              le64_to_cpu(cl->cl_recs[chain].c_blkno),
1450                                              &group_bh);
1451         if (status < 0) {
1452                 mlog_errno(status);
1453                 goto bail;
1454         }
1455         bg = (struct ocfs2_group_desc *) group_bh->b_data;
1456
1457         status = -ENOSPC;
1458         /* for now, the chain search is a bit simplistic. We just use
1459          * the 1st group with any empty bits. */
1460         while ((status = ac->ac_group_search(alloc_inode, group_bh,
1461                                              bits_wanted, min_bits,
1462                                              ac->ac_max_block, bit_off,
1463                                              &tmp_bits)) == -ENOSPC) {
1464                 if (!bg->bg_next_group)
1465                         break;
1466
1467                 brelse(prev_group_bh);
1468                 prev_group_bh = NULL;
1469
1470                 next_group = le64_to_cpu(bg->bg_next_group);
1471                 prev_group_bh = group_bh;
1472                 group_bh = NULL;
1473                 status = ocfs2_read_group_descriptor(alloc_inode, fe,
1474                                                      next_group, &group_bh);
1475                 if (status < 0) {
1476                         mlog_errno(status);
1477                         goto bail;
1478                 }
1479                 bg = (struct ocfs2_group_desc *) group_bh->b_data;
1480         }
1481         if (status < 0) {
1482                 if (status != -ENOSPC)
1483                         mlog_errno(status);
1484                 goto bail;
1485         }
1486
1487         mlog(0, "alloc succeeds: we give %u bits from block group %llu\n",
1488              tmp_bits, (unsigned long long)le64_to_cpu(bg->bg_blkno));
1489
1490         *num_bits = tmp_bits;
1491
1492         BUG_ON(*num_bits == 0);
1493
1494         /*
1495          * Keep track of previous block descriptor read. When
1496          * we find a target, if we have read more than X
1497          * number of descriptors, and the target is reasonably
1498          * empty, relink him to top of his chain.
1499          *
1500          * We've read 0 extra blocks and only send one more to
1501          * the transaction, yet the next guy to search has a
1502          * much easier time.
1503          *
1504          * Do this *after* figuring out how many bits we're taking out
1505          * of our target group.
1506          */
1507         if (ac->ac_allow_chain_relink &&
1508             (prev_group_bh) &&
1509             (ocfs2_block_group_reasonably_empty(bg, *num_bits))) {
1510                 status = ocfs2_relink_block_group(handle, alloc_inode,
1511                                                   ac->ac_bh, group_bh,
1512                                                   prev_group_bh, chain);
1513                 if (status < 0) {
1514                         mlog_errno(status);
1515                         goto bail;
1516                 }
1517         }
1518
1519         /* Ok, claim our bits now: set the info on dinode, chainlist
1520          * and then the group */
1521         status = ocfs2_journal_access_di(handle,
1522                                          INODE_CACHE(alloc_inode),
1523                                          ac->ac_bh,
1524                                          OCFS2_JOURNAL_ACCESS_WRITE);
1525         if (status < 0) {
1526                 mlog_errno(status);
1527                 goto bail;
1528         }
1529
1530         tmp_used = le32_to_cpu(fe->id1.bitmap1.i_used);
1531         fe->id1.bitmap1.i_used = cpu_to_le32(*num_bits + tmp_used);
1532         le32_add_cpu(&cl->cl_recs[chain].c_free, -(*num_bits));
1533         ocfs2_journal_dirty(handle, ac->ac_bh);
1534
1535         status = ocfs2_block_group_set_bits(handle,
1536                                             alloc_inode,
1537                                             bg,
1538                                             group_bh,
1539                                             *bit_off,
1540                                             *num_bits);
1541         if (status < 0) {
1542                 mlog_errno(status);
1543                 goto bail;
1544         }
1545
1546         mlog(0, "Allocated %u bits from suballocator %llu\n", *num_bits,
1547              (unsigned long long)le64_to_cpu(fe->i_blkno));
1548
1549         *bg_blkno = le64_to_cpu(bg->bg_blkno);
1550         *bits_left = le16_to_cpu(bg->bg_free_bits_count);
1551 bail:
1552         brelse(group_bh);
1553         brelse(prev_group_bh);
1554
1555         mlog_exit(status);
1556         return status;
1557 }
1558
1559 /* will give out up to bits_wanted contiguous bits. */
1560 static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb,
1561                                      struct ocfs2_alloc_context *ac,
1562                                      handle_t *handle,
1563                                      u32 bits_wanted,
1564                                      u32 min_bits,
1565                                      u16 *bit_off,
1566                                      unsigned int *num_bits,
1567                                      u64 *bg_blkno)
1568 {
1569         int status;
1570         u16 victim, i;
1571         u16 bits_left = 0;
1572         u64 hint_blkno = ac->ac_last_group;
1573         struct ocfs2_chain_list *cl;
1574         struct ocfs2_dinode *fe;
1575
1576         mlog_entry_void();
1577
1578         BUG_ON(ac->ac_bits_given >= ac->ac_bits_wanted);
1579         BUG_ON(bits_wanted > (ac->ac_bits_wanted - ac->ac_bits_given));
1580         BUG_ON(!ac->ac_bh);
1581
1582         fe = (struct ocfs2_dinode *) ac->ac_bh->b_data;
1583
1584         /* The bh was validated by the inode read during
1585          * ocfs2_reserve_suballoc_bits().  Any corruption is a code bug. */
1586         BUG_ON(!OCFS2_IS_VALID_DINODE(fe));
1587
1588         if (le32_to_cpu(fe->id1.bitmap1.i_used) >=
1589             le32_to_cpu(fe->id1.bitmap1.i_total)) {
1590                 ocfs2_error(osb->sb, "Chain allocator dinode %llu has %u used "
1591                             "bits but only %u total.",
1592                             (unsigned long long)le64_to_cpu(fe->i_blkno),
1593                             le32_to_cpu(fe->id1.bitmap1.i_used),
1594                             le32_to_cpu(fe->id1.bitmap1.i_total));
1595                 status = -EIO;
1596                 goto bail;
1597         }
1598
1599         if (hint_blkno) {
1600                 /* Attempt to short-circuit the usual search mechanism
1601                  * by jumping straight to the most recently used
1602                  * allocation group. This helps us mantain some
1603                  * contiguousness across allocations. */
1604                 status = ocfs2_search_one_group(ac, handle, bits_wanted,
1605                                                 min_bits, bit_off, num_bits,
1606                                                 hint_blkno, &bits_left);
1607                 if (!status) {
1608                         /* Be careful to update *bg_blkno here as the
1609                          * caller is expecting it to be filled in, and
1610                          * ocfs2_search_one_group() won't do that for
1611                          * us. */
1612                         *bg_blkno = hint_blkno;
1613                         goto set_hint;
1614                 }
1615                 if (status < 0 && status != -ENOSPC) {
1616                         mlog_errno(status);
1617                         goto bail;
1618                 }
1619         }
1620
1621         cl = (struct ocfs2_chain_list *) &fe->id2.i_chain;
1622
1623         victim = ocfs2_find_victim_chain(cl);
1624         ac->ac_chain = victim;
1625         ac->ac_allow_chain_relink = 1;
1626
1627         status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits, bit_off,
1628                                     num_bits, bg_blkno, &bits_left);
1629         if (!status)
1630                 goto set_hint;
1631         if (status < 0 && status != -ENOSPC) {
1632                 mlog_errno(status);
1633                 goto bail;
1634         }
1635
1636         mlog(0, "Search of victim chain %u came up with nothing, "
1637              "trying all chains now.\n", victim);
1638
1639         /* If we didn't pick a good victim, then just default to
1640          * searching each chain in order. Don't allow chain relinking
1641          * because we only calculate enough journal credits for one
1642          * relink per alloc. */
1643         ac->ac_allow_chain_relink = 0;
1644         for (i = 0; i < le16_to_cpu(cl->cl_next_free_rec); i ++) {
1645                 if (i == victim)
1646                         continue;
1647                 if (!cl->cl_recs[i].c_free)
1648                         continue;
1649
1650                 ac->ac_chain = i;
1651                 status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits,
1652                                             bit_off, num_bits, bg_blkno,
1653                                             &bits_left);
1654                 if (!status)
1655                         break;
1656                 if (status < 0 && status != -ENOSPC) {
1657                         mlog_errno(status);
1658                         goto bail;
1659                 }
1660         }
1661
1662 set_hint:
1663         if (status != -ENOSPC) {
1664                 /* If the next search of this group is not likely to
1665                  * yield a suitable extent, then we reset the last
1666                  * group hint so as to not waste a disk read */
1667                 if (bits_left < min_bits)
1668                         ac->ac_last_group = 0;
1669                 else
1670                         ac->ac_last_group = *bg_blkno;
1671         }
1672
1673 bail:
1674         mlog_exit(status);
1675         return status;
1676 }
1677
1678 int ocfs2_claim_metadata(struct ocfs2_super *osb,
1679                          handle_t *handle,
1680                          struct ocfs2_alloc_context *ac,
1681                          u32 bits_wanted,
1682                          u16 *suballoc_bit_start,
1683                          unsigned int *num_bits,
1684                          u64 *blkno_start)
1685 {
1686         int status;
1687         u64 bg_blkno;
1688
1689         BUG_ON(!ac);
1690         BUG_ON(ac->ac_bits_wanted < (ac->ac_bits_given + bits_wanted));
1691         BUG_ON(ac->ac_which != OCFS2_AC_USE_META);
1692
1693         status = ocfs2_claim_suballoc_bits(osb,
1694                                            ac,
1695                                            handle,
1696                                            bits_wanted,
1697                                            1,
1698                                            suballoc_bit_start,
1699                                            num_bits,
1700                                            &bg_blkno);
1701         if (status < 0) {
1702                 mlog_errno(status);
1703                 goto bail;
1704         }
1705         atomic_inc(&osb->alloc_stats.bg_allocs);
1706
1707         *blkno_start = bg_blkno + (u64) *suballoc_bit_start;
1708         ac->ac_bits_given += (*num_bits);
1709         status = 0;
1710 bail:
1711         mlog_exit(status);
1712         return status;
1713 }
1714
1715 static void ocfs2_init_inode_ac_group(struct inode *dir,
1716                                       struct buffer_head *parent_fe_bh,
1717                                       struct ocfs2_alloc_context *ac)
1718 {
1719         struct ocfs2_dinode *fe = (struct ocfs2_dinode *)parent_fe_bh->b_data;
1720         /*
1721          * Try to allocate inodes from some specific group.
1722          *
1723          * If the parent dir has recorded the last group used in allocation,
1724          * cool, use it. Otherwise if we try to allocate new inode from the
1725          * same slot the parent dir belongs to, use the same chunk.
1726          *
1727          * We are very careful here to avoid the mistake of setting
1728          * ac_last_group to a group descriptor from a different (unlocked) slot.
1729          */
1730         if (OCFS2_I(dir)->ip_last_used_group &&
1731             OCFS2_I(dir)->ip_last_used_slot == ac->ac_alloc_slot)
1732                 ac->ac_last_group = OCFS2_I(dir)->ip_last_used_group;
1733         else if (le16_to_cpu(fe->i_suballoc_slot) == ac->ac_alloc_slot)
1734                 ac->ac_last_group = ocfs2_which_suballoc_group(
1735                                         le64_to_cpu(fe->i_blkno),
1736                                         le16_to_cpu(fe->i_suballoc_bit));
1737 }
1738
1739 static inline void ocfs2_save_inode_ac_group(struct inode *dir,
1740                                              struct ocfs2_alloc_context *ac)
1741 {
1742         OCFS2_I(dir)->ip_last_used_group = ac->ac_last_group;
1743         OCFS2_I(dir)->ip_last_used_slot = ac->ac_alloc_slot;
1744 }
1745
1746 int ocfs2_claim_new_inode(struct ocfs2_super *osb,
1747                           handle_t *handle,
1748                           struct inode *dir,
1749                           struct buffer_head *parent_fe_bh,
1750                           struct ocfs2_alloc_context *ac,
1751                           u16 *suballoc_bit,
1752                           u64 *fe_blkno)
1753 {
1754         int status;
1755         unsigned int num_bits;
1756         u64 bg_blkno;
1757
1758         mlog_entry_void();
1759
1760         BUG_ON(!ac);
1761         BUG_ON(ac->ac_bits_given != 0);
1762         BUG_ON(ac->ac_bits_wanted != 1);
1763         BUG_ON(ac->ac_which != OCFS2_AC_USE_INODE);
1764
1765         ocfs2_init_inode_ac_group(dir, parent_fe_bh, ac);
1766
1767         status = ocfs2_claim_suballoc_bits(osb,
1768                                            ac,
1769                                            handle,
1770                                            1,
1771                                            1,
1772                                            suballoc_bit,
1773                                            &num_bits,
1774                                            &bg_blkno);
1775         if (status < 0) {
1776                 mlog_errno(status);
1777                 goto bail;
1778         }
1779         atomic_inc(&osb->alloc_stats.bg_allocs);
1780
1781         BUG_ON(num_bits != 1);
1782
1783         *fe_blkno = bg_blkno + (u64) (*suballoc_bit);
1784         ac->ac_bits_given++;
1785         ocfs2_save_inode_ac_group(dir, ac);
1786         status = 0;
1787 bail:
1788         mlog_exit(status);
1789         return status;
1790 }
1791
1792 /* translate a group desc. blkno and it's bitmap offset into
1793  * disk cluster offset. */
1794 static inline u32 ocfs2_desc_bitmap_to_cluster_off(struct inode *inode,
1795                                                    u64 bg_blkno,
1796                                                    u16 bg_bit_off)
1797 {
1798         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1799         u32 cluster = 0;
1800
1801         BUG_ON(!ocfs2_is_cluster_bitmap(inode));
1802
1803         if (bg_blkno != osb->first_cluster_group_blkno)
1804                 cluster = ocfs2_blocks_to_clusters(inode->i_sb, bg_blkno);
1805         cluster += (u32) bg_bit_off;
1806         return cluster;
1807 }
1808
1809 /* given a cluster offset, calculate which block group it belongs to
1810  * and return that block offset. */
1811 u64 ocfs2_which_cluster_group(struct inode *inode, u32 cluster)
1812 {
1813         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1814         u32 group_no;
1815
1816         BUG_ON(!ocfs2_is_cluster_bitmap(inode));
1817
1818         group_no = cluster / osb->bitmap_cpg;
1819         if (!group_no)
1820                 return osb->first_cluster_group_blkno;
1821         return ocfs2_clusters_to_blocks(inode->i_sb,
1822                                         group_no * osb->bitmap_cpg);
1823 }
1824
1825 /* given the block number of a cluster start, calculate which cluster
1826  * group and descriptor bitmap offset that corresponds to. */
1827 static inline void ocfs2_block_to_cluster_group(struct inode *inode,
1828                                                 u64 data_blkno,
1829                                                 u64 *bg_blkno,
1830                                                 u16 *bg_bit_off)
1831 {
1832         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1833         u32 data_cluster = ocfs2_blocks_to_clusters(osb->sb, data_blkno);
1834
1835         BUG_ON(!ocfs2_is_cluster_bitmap(inode));
1836
1837         *bg_blkno = ocfs2_which_cluster_group(inode,
1838                                               data_cluster);
1839
1840         if (*bg_blkno == osb->first_cluster_group_blkno)
1841                 *bg_bit_off = (u16) data_cluster;
1842         else
1843                 *bg_bit_off = (u16) ocfs2_blocks_to_clusters(osb->sb,
1844                                                              data_blkno - *bg_blkno);
1845 }
1846
1847 /*
1848  * min_bits - minimum contiguous chunk from this total allocation we
1849  * can handle. set to what we asked for originally for a full
1850  * contig. allocation, set to '1' to indicate we can deal with extents
1851  * of any size.
1852  */
1853 int __ocfs2_claim_clusters(struct ocfs2_super *osb,
1854                            handle_t *handle,
1855                            struct ocfs2_alloc_context *ac,
1856                            u32 min_clusters,
1857                            u32 max_clusters,
1858                            u32 *cluster_start,
1859                            u32 *num_clusters)
1860 {
1861         int status;
1862         unsigned int bits_wanted = max_clusters;
1863         u64 bg_blkno = 0;
1864         u16 bg_bit_off;
1865
1866         mlog_entry_void();
1867
1868         BUG_ON(ac->ac_bits_given >= ac->ac_bits_wanted);
1869
1870         BUG_ON(ac->ac_which != OCFS2_AC_USE_LOCAL
1871                && ac->ac_which != OCFS2_AC_USE_MAIN);
1872
1873         if (ac->ac_which == OCFS2_AC_USE_LOCAL) {
1874                 status = ocfs2_claim_local_alloc_bits(osb,
1875                                                       handle,
1876                                                       ac,
1877                                                       bits_wanted,
1878                                                       cluster_start,
1879                                                       num_clusters);
1880                 if (!status)
1881                         atomic_inc(&osb->alloc_stats.local_data);
1882         } else {
1883                 if (min_clusters > (osb->bitmap_cpg - 1)) {
1884                         /* The only paths asking for contiguousness
1885                          * should know about this already. */
1886                         mlog(ML_ERROR, "minimum allocation requested %u exceeds "
1887                              "group bitmap size %u!\n", min_clusters,
1888                              osb->bitmap_cpg);
1889                         status = -ENOSPC;
1890                         goto bail;
1891                 }
1892                 /* clamp the current request down to a realistic size. */
1893                 if (bits_wanted > (osb->bitmap_cpg - 1))
1894                         bits_wanted = osb->bitmap_cpg - 1;
1895
1896                 status = ocfs2_claim_suballoc_bits(osb,
1897                                                    ac,
1898                                                    handle,
1899                                                    bits_wanted,
1900                                                    min_clusters,
1901                                                    &bg_bit_off,
1902                                                    num_clusters,
1903                                                    &bg_blkno);
1904                 if (!status) {
1905                         *cluster_start =
1906                                 ocfs2_desc_bitmap_to_cluster_off(ac->ac_inode,
1907                                                                  bg_blkno,
1908                                                                  bg_bit_off);
1909                         atomic_inc(&osb->alloc_stats.bitmap_data);
1910                 }
1911         }
1912         if (status < 0) {
1913                 if (status != -ENOSPC)
1914                         mlog_errno(status);
1915                 goto bail;
1916         }
1917
1918         ac->ac_bits_given += *num_clusters;
1919
1920 bail:
1921         mlog_exit(status);
1922         return status;
1923 }
1924
1925 int ocfs2_claim_clusters(struct ocfs2_super *osb,
1926                          handle_t *handle,
1927                          struct ocfs2_alloc_context *ac,
1928                          u32 min_clusters,
1929                          u32 *cluster_start,
1930                          u32 *num_clusters)
1931 {
1932         unsigned int bits_wanted = ac->ac_bits_wanted - ac->ac_bits_given;
1933
1934         return __ocfs2_claim_clusters(osb, handle, ac, min_clusters,
1935                                       bits_wanted, cluster_start, num_clusters);
1936 }
1937
1938 static int ocfs2_block_group_clear_bits(handle_t *handle,
1939                                         struct inode *alloc_inode,
1940                                         struct ocfs2_group_desc *bg,
1941                                         struct buffer_head *group_bh,
1942                                         unsigned int bit_off,
1943                                         unsigned int num_bits,
1944                                         void (*undo_fn)(unsigned int bit,
1945                                                         unsigned long *bmap))
1946 {
1947         int status;
1948         unsigned int tmp;
1949         struct ocfs2_group_desc *undo_bg = NULL;
1950
1951         mlog_entry_void();
1952
1953         /* The caller got this descriptor from
1954          * ocfs2_read_group_descriptor().  Any corruption is a code bug. */
1955         BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
1956
1957         mlog(0, "off = %u, num = %u\n", bit_off, num_bits);
1958
1959         BUG_ON(undo_fn && !ocfs2_is_cluster_bitmap(alloc_inode));
1960         status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode),
1961                                          group_bh,
1962                                          undo_fn ?
1963                                          OCFS2_JOURNAL_ACCESS_UNDO :
1964                                          OCFS2_JOURNAL_ACCESS_WRITE);
1965         if (status < 0) {
1966                 mlog_errno(status);
1967                 goto bail;
1968         }
1969
1970         if (undo_fn) {
1971                 jbd_lock_bh_state(group_bh);
1972                 undo_bg = (struct ocfs2_group_desc *)
1973                                         bh2jh(group_bh)->b_committed_data;
1974                 BUG_ON(!undo_bg);
1975         }
1976
1977         tmp = num_bits;
1978         while(tmp--) {
1979                 ocfs2_clear_bit((bit_off + tmp),
1980                                 (unsigned long *) bg->bg_bitmap);
1981                 if (undo_fn)
1982                         undo_fn(bit_off + tmp,
1983                                 (unsigned long *) undo_bg->bg_bitmap);
1984         }
1985         le16_add_cpu(&bg->bg_free_bits_count, num_bits);
1986
1987         if (undo_fn)
1988                 jbd_unlock_bh_state(group_bh);
1989
1990         ocfs2_journal_dirty(handle, group_bh);
1991 bail:
1992         return status;
1993 }
1994
1995 /*
1996  * expects the suballoc inode to already be locked.
1997  */
1998 static int _ocfs2_free_suballoc_bits(handle_t *handle,
1999                                      struct inode *alloc_inode,
2000                                      struct buffer_head *alloc_bh,
2001                                      unsigned int start_bit,
2002                                      u64 bg_blkno,
2003                                      unsigned int count,
2004                                      void (*undo_fn)(unsigned int bit,
2005                                                      unsigned long *bitmap))
2006 {
2007         int status = 0;
2008         u32 tmp_used;
2009         struct ocfs2_dinode *fe = (struct ocfs2_dinode *) alloc_bh->b_data;
2010         struct ocfs2_chain_list *cl = &fe->id2.i_chain;
2011         struct buffer_head *group_bh = NULL;
2012         struct ocfs2_group_desc *group;
2013
2014         mlog_entry_void();
2015
2016         /* The alloc_bh comes from ocfs2_free_dinode() or
2017          * ocfs2_free_clusters().  The callers have all locked the
2018          * allocator and gotten alloc_bh from the lock call.  This
2019          * validates the dinode buffer.  Any corruption that has happended
2020          * is a code bug. */
2021         BUG_ON(!OCFS2_IS_VALID_DINODE(fe));
2022         BUG_ON((count + start_bit) > ocfs2_bits_per_group(cl));
2023
2024         mlog(0, "%llu: freeing %u bits from group %llu, starting at %u\n",
2025              (unsigned long long)OCFS2_I(alloc_inode)->ip_blkno, count,
2026              (unsigned long long)bg_blkno, start_bit);
2027
2028         status = ocfs2_read_group_descriptor(alloc_inode, fe, bg_blkno,
2029                                              &group_bh);
2030         if (status < 0) {
2031                 mlog_errno(status);
2032                 goto bail;
2033         }
2034         group = (struct ocfs2_group_desc *) group_bh->b_data;
2035
2036         BUG_ON((count + start_bit) > le16_to_cpu(group->bg_bits));
2037
2038         status = ocfs2_block_group_clear_bits(handle, alloc_inode,
2039                                               group, group_bh,
2040                                               start_bit, count, undo_fn);
2041         if (status < 0) {
2042                 mlog_errno(status);
2043                 goto bail;
2044         }
2045
2046         status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode),
2047                                          alloc_bh, OCFS2_JOURNAL_ACCESS_WRITE);
2048         if (status < 0) {
2049                 mlog_errno(status);
2050                 goto bail;
2051         }
2052
2053         le32_add_cpu(&cl->cl_recs[le16_to_cpu(group->bg_chain)].c_free,
2054                      count);
2055         tmp_used = le32_to_cpu(fe->id1.bitmap1.i_used);
2056         fe->id1.bitmap1.i_used = cpu_to_le32(tmp_used - count);
2057         ocfs2_journal_dirty(handle, alloc_bh);
2058
2059 bail:
2060         brelse(group_bh);
2061
2062         mlog_exit(status);
2063         return status;
2064 }
2065
2066 int ocfs2_free_suballoc_bits(handle_t *handle,
2067                              struct inode *alloc_inode,
2068                              struct buffer_head *alloc_bh,
2069                              unsigned int start_bit,
2070                              u64 bg_blkno,
2071                              unsigned int count)
2072 {
2073         return _ocfs2_free_suballoc_bits(handle, alloc_inode, alloc_bh,
2074                                          start_bit, bg_blkno, count, NULL);
2075 }
2076
2077 int ocfs2_free_dinode(handle_t *handle,
2078                       struct inode *inode_alloc_inode,
2079                       struct buffer_head *inode_alloc_bh,
2080                       struct ocfs2_dinode *di)
2081 {
2082         u64 blk = le64_to_cpu(di->i_blkno);
2083         u16 bit = le16_to_cpu(di->i_suballoc_bit);
2084         u64 bg_blkno = ocfs2_which_suballoc_group(blk, bit);
2085
2086         return ocfs2_free_suballoc_bits(handle, inode_alloc_inode,
2087                                         inode_alloc_bh, bit, bg_blkno, 1);
2088 }
2089
2090 static int _ocfs2_free_clusters(handle_t *handle,
2091                                 struct inode *bitmap_inode,
2092                                 struct buffer_head *bitmap_bh,
2093                                 u64 start_blk,
2094                                 unsigned int num_clusters,
2095                                 void (*undo_fn)(unsigned int bit,
2096                                                 unsigned long *bitmap))
2097 {
2098         int status;
2099         u16 bg_start_bit;
2100         u64 bg_blkno;
2101         struct ocfs2_dinode *fe;
2102
2103         /* You can't ever have a contiguous set of clusters
2104          * bigger than a block group bitmap so we never have to worry
2105          * about looping on them. */
2106
2107         mlog_entry_void();
2108
2109         /* This is expensive. We can safely remove once this stuff has
2110          * gotten tested really well. */
2111         BUG_ON(start_blk != ocfs2_clusters_to_blocks(bitmap_inode->i_sb, ocfs2_blocks_to_clusters(bitmap_inode->i_sb, start_blk)));
2112
2113         fe = (struct ocfs2_dinode *) bitmap_bh->b_data;
2114
2115         ocfs2_block_to_cluster_group(bitmap_inode, start_blk, &bg_blkno,
2116                                      &bg_start_bit);
2117
2118         mlog(0, "want to free %u clusters starting at block %llu\n",
2119              num_clusters, (unsigned long long)start_blk);
2120         mlog(0, "bg_blkno = %llu, bg_start_bit = %u\n",
2121              (unsigned long long)bg_blkno, bg_start_bit);
2122
2123         status = _ocfs2_free_suballoc_bits(handle, bitmap_inode, bitmap_bh,
2124                                            bg_start_bit, bg_blkno,
2125                                            num_clusters, undo_fn);
2126         if (status < 0) {
2127                 mlog_errno(status);
2128                 goto out;
2129         }
2130
2131         ocfs2_local_alloc_seen_free_bits(OCFS2_SB(bitmap_inode->i_sb),
2132                                          num_clusters);
2133
2134 out:
2135         mlog_exit(status);
2136         return status;
2137 }
2138
2139 int ocfs2_free_clusters(handle_t *handle,
2140                         struct inode *bitmap_inode,
2141                         struct buffer_head *bitmap_bh,
2142                         u64 start_blk,
2143                         unsigned int num_clusters)
2144 {
2145         return _ocfs2_free_clusters(handle, bitmap_inode, bitmap_bh,
2146                                     start_blk, num_clusters,
2147                                     _ocfs2_set_bit);
2148 }
2149
2150 /*
2151  * Give never-used clusters back to the global bitmap.  We don't need
2152  * to protect these bits in the undo buffer.
2153  */
2154 int ocfs2_release_clusters(handle_t *handle,
2155                            struct inode *bitmap_inode,
2156                            struct buffer_head *bitmap_bh,
2157                            u64 start_blk,
2158                            unsigned int num_clusters)
2159 {
2160         return _ocfs2_free_clusters(handle, bitmap_inode, bitmap_bh,
2161                                     start_blk, num_clusters,
2162                                     _ocfs2_clear_bit);
2163 }
2164
2165 static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg)
2166 {
2167         printk("Block Group:\n");
2168         printk("bg_signature:       %s\n", bg->bg_signature);
2169         printk("bg_size:            %u\n", bg->bg_size);
2170         printk("bg_bits:            %u\n", bg->bg_bits);
2171         printk("bg_free_bits_count: %u\n", bg->bg_free_bits_count);
2172         printk("bg_chain:           %u\n", bg->bg_chain);
2173         printk("bg_generation:      %u\n", le32_to_cpu(bg->bg_generation));
2174         printk("bg_next_group:      %llu\n",
2175                (unsigned long long)bg->bg_next_group);
2176         printk("bg_parent_dinode:   %llu\n",
2177                (unsigned long long)bg->bg_parent_dinode);
2178         printk("bg_blkno:           %llu\n",
2179                (unsigned long long)bg->bg_blkno);
2180 }
2181
2182 static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe)
2183 {
2184         int i;
2185
2186         printk("Suballoc Inode %llu:\n", (unsigned long long)fe->i_blkno);
2187         printk("i_signature:                  %s\n", fe->i_signature);
2188         printk("i_size:                       %llu\n",
2189                (unsigned long long)fe->i_size);
2190         printk("i_clusters:                   %u\n", fe->i_clusters);
2191         printk("i_generation:                 %u\n",
2192                le32_to_cpu(fe->i_generation));
2193         printk("id1.bitmap1.i_used:           %u\n",
2194                le32_to_cpu(fe->id1.bitmap1.i_used));
2195         printk("id1.bitmap1.i_total:          %u\n",
2196                le32_to_cpu(fe->id1.bitmap1.i_total));
2197         printk("id2.i_chain.cl_cpg:           %u\n", fe->id2.i_chain.cl_cpg);
2198         printk("id2.i_chain.cl_bpc:           %u\n", fe->id2.i_chain.cl_bpc);
2199         printk("id2.i_chain.cl_count:         %u\n", fe->id2.i_chain.cl_count);
2200         printk("id2.i_chain.cl_next_free_rec: %u\n",
2201                fe->id2.i_chain.cl_next_free_rec);
2202         for(i = 0; i < fe->id2.i_chain.cl_next_free_rec; i++) {
2203                 printk("fe->id2.i_chain.cl_recs[%d].c_free:  %u\n", i,
2204                        fe->id2.i_chain.cl_recs[i].c_free);
2205                 printk("fe->id2.i_chain.cl_recs[%d].c_total: %u\n", i,
2206                        fe->id2.i_chain.cl_recs[i].c_total);
2207                 printk("fe->id2.i_chain.cl_recs[%d].c_blkno: %llu\n", i,
2208                        (unsigned long long)fe->id2.i_chain.cl_recs[i].c_blkno);
2209         }
2210 }
2211
2212 /*
2213  * For a given allocation, determine which allocators will need to be
2214  * accessed, and lock them, reserving the appropriate number of bits.
2215  *
2216  * Sparse file systems call this from ocfs2_write_begin_nolock()
2217  * and ocfs2_allocate_unwritten_extents().
2218  *
2219  * File systems which don't support holes call this from
2220  * ocfs2_extend_allocation().
2221  */
2222 int ocfs2_lock_allocators(struct inode *inode,
2223                           struct ocfs2_extent_tree *et,
2224                           u32 clusters_to_add, u32 extents_to_split,
2225                           struct ocfs2_alloc_context **data_ac,
2226                           struct ocfs2_alloc_context **meta_ac)
2227 {
2228         int ret = 0, num_free_extents;
2229         unsigned int max_recs_needed = clusters_to_add + 2 * extents_to_split;
2230         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
2231
2232         *meta_ac = NULL;
2233         if (data_ac)
2234                 *data_ac = NULL;
2235
2236         BUG_ON(clusters_to_add != 0 && data_ac == NULL);
2237
2238         num_free_extents = ocfs2_num_free_extents(osb, et);
2239         if (num_free_extents < 0) {
2240                 ret = num_free_extents;
2241                 mlog_errno(ret);
2242                 goto out;
2243         }
2244
2245         /*
2246          * Sparse allocation file systems need to be more conservative
2247          * with reserving room for expansion - the actual allocation
2248          * happens while we've got a journal handle open so re-taking
2249          * a cluster lock (because we ran out of room for another
2250          * extent) will violate ordering rules.
2251          *
2252          * Most of the time we'll only be seeing this 1 cluster at a time
2253          * anyway.
2254          *
2255          * Always lock for any unwritten extents - we might want to
2256          * add blocks during a split.
2257          */
2258         if (!num_free_extents ||
2259             (ocfs2_sparse_alloc(osb) && num_free_extents < max_recs_needed)) {
2260                 ret = ocfs2_reserve_new_metadata(osb, et->et_root_el, meta_ac);
2261                 if (ret < 0) {
2262                         if (ret != -ENOSPC)
2263                                 mlog_errno(ret);
2264                         goto out;
2265                 }
2266         }
2267
2268         if (clusters_to_add == 0)
2269                 goto out;
2270
2271         ret = ocfs2_reserve_clusters(osb, clusters_to_add, data_ac);
2272         if (ret < 0) {
2273                 if (ret != -ENOSPC)
2274                         mlog_errno(ret);
2275                 goto out;
2276         }
2277
2278 out:
2279         if (ret) {
2280                 if (*meta_ac) {
2281                         ocfs2_free_alloc_context(*meta_ac);
2282                         *meta_ac = NULL;
2283                 }
2284
2285                 /*
2286                  * We cannot have an error and a non null *data_ac.
2287                  */
2288         }
2289
2290         return ret;
2291 }
2292
2293 /*
2294  * Read the inode specified by blkno to get suballoc_slot and
2295  * suballoc_bit.
2296  */
2297 static int ocfs2_get_suballoc_slot_bit(struct ocfs2_super *osb, u64 blkno,
2298                                        u16 *suballoc_slot, u16 *suballoc_bit)
2299 {
2300         int status;
2301         struct buffer_head *inode_bh = NULL;
2302         struct ocfs2_dinode *inode_fe;
2303
2304         mlog_entry("blkno: %llu\n", (unsigned long long)blkno);
2305
2306         /* dirty read disk */
2307         status = ocfs2_read_blocks_sync(osb, blkno, 1, &inode_bh);
2308         if (status < 0) {
2309                 mlog(ML_ERROR, "read block %llu failed %d\n",
2310                      (unsigned long long)blkno, status);
2311                 goto bail;
2312         }
2313
2314         inode_fe = (struct ocfs2_dinode *) inode_bh->b_data;
2315         if (!OCFS2_IS_VALID_DINODE(inode_fe)) {
2316                 mlog(ML_ERROR, "invalid inode %llu requested\n",
2317                      (unsigned long long)blkno);
2318                 status = -EINVAL;
2319                 goto bail;
2320         }
2321
2322         if (le16_to_cpu(inode_fe->i_suballoc_slot) != (u16)OCFS2_INVALID_SLOT &&
2323             (u32)le16_to_cpu(inode_fe->i_suballoc_slot) > osb->max_slots - 1) {
2324                 mlog(ML_ERROR, "inode %llu has invalid suballoc slot %u\n",
2325                      (unsigned long long)blkno,
2326                      (u32)le16_to_cpu(inode_fe->i_suballoc_slot));
2327                 status = -EINVAL;
2328                 goto bail;
2329         }
2330
2331         if (suballoc_slot)
2332                 *suballoc_slot = le16_to_cpu(inode_fe->i_suballoc_slot);
2333         if (suballoc_bit)
2334                 *suballoc_bit = le16_to_cpu(inode_fe->i_suballoc_bit);
2335
2336 bail:
2337         brelse(inode_bh);
2338
2339         mlog_exit(status);
2340         return status;
2341 }
2342
2343 /*
2344  * test whether bit is SET in allocator bitmap or not.  on success, 0
2345  * is returned and *res is 1 for SET; 0 otherwise.  when fails, errno
2346  * is returned and *res is meaningless.  Call this after you have
2347  * cluster locked against suballoc, or you may get a result based on
2348  * non-up2date contents
2349  */
2350 static int ocfs2_test_suballoc_bit(struct ocfs2_super *osb,
2351                                    struct inode *suballoc,
2352                                    struct buffer_head *alloc_bh, u64 blkno,
2353                                    u16 bit, int *res)
2354 {
2355         struct ocfs2_dinode *alloc_fe;
2356         struct ocfs2_group_desc *group;
2357         struct buffer_head *group_bh = NULL;
2358         u64 bg_blkno;
2359         int status;
2360
2361         mlog_entry("blkno: %llu bit: %u\n", (unsigned long long)blkno,
2362                    (unsigned int)bit);
2363
2364         alloc_fe = (struct ocfs2_dinode *)alloc_bh->b_data;
2365         if ((bit + 1) > ocfs2_bits_per_group(&alloc_fe->id2.i_chain)) {
2366                 mlog(ML_ERROR, "suballoc bit %u out of range of %u\n",
2367                      (unsigned int)bit,
2368                      ocfs2_bits_per_group(&alloc_fe->id2.i_chain));
2369                 status = -EINVAL;
2370                 goto bail;
2371         }
2372
2373         bg_blkno = ocfs2_which_suballoc_group(blkno, bit);
2374         status = ocfs2_read_group_descriptor(suballoc, alloc_fe, bg_blkno,
2375                                              &group_bh);
2376         if (status < 0) {
2377                 mlog(ML_ERROR, "read group %llu failed %d\n",
2378                      (unsigned long long)bg_blkno, status);
2379                 goto bail;
2380         }
2381
2382         group = (struct ocfs2_group_desc *) group_bh->b_data;
2383         *res = ocfs2_test_bit(bit, (unsigned long *)group->bg_bitmap);
2384
2385 bail:
2386         brelse(group_bh);
2387
2388         mlog_exit(status);
2389         return status;
2390 }
2391
2392 /*
2393  * Test if the bit representing this inode (blkno) is set in the
2394  * suballocator.
2395  *
2396  * On success, 0 is returned and *res is 1 for SET; 0 otherwise.
2397  *
2398  * In the event of failure, a negative value is returned and *res is
2399  * meaningless.
2400  *
2401  * Callers must make sure to hold nfs_sync_lock to prevent
2402  * ocfs2_delete_inode() on another node from accessing the same
2403  * suballocator concurrently.
2404  */
2405 int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res)
2406 {
2407         int status;
2408         u16 suballoc_bit = 0, suballoc_slot = 0;
2409         struct inode *inode_alloc_inode;
2410         struct buffer_head *alloc_bh = NULL;
2411
2412         mlog_entry("blkno: %llu", (unsigned long long)blkno);
2413
2414         status = ocfs2_get_suballoc_slot_bit(osb, blkno, &suballoc_slot,
2415                                              &suballoc_bit);
2416         if (status < 0) {
2417                 mlog(ML_ERROR, "get alloc slot and bit failed %d\n", status);
2418                 goto bail;
2419         }
2420
2421         inode_alloc_inode =
2422                 ocfs2_get_system_file_inode(osb, INODE_ALLOC_SYSTEM_INODE,
2423                                             suballoc_slot);
2424         if (!inode_alloc_inode) {
2425                 /* the error code could be inaccurate, but we are not able to
2426                  * get the correct one. */
2427                 status = -EINVAL;
2428                 mlog(ML_ERROR, "unable to get alloc inode in slot %u\n",
2429                      (u32)suballoc_slot);
2430                 goto bail;
2431         }
2432
2433         mutex_lock(&inode_alloc_inode->i_mutex);
2434         status = ocfs2_inode_lock(inode_alloc_inode, &alloc_bh, 0);
2435         if (status < 0) {
2436                 mutex_unlock(&inode_alloc_inode->i_mutex);
2437                 mlog(ML_ERROR, "lock on alloc inode on slot %u failed %d\n",
2438                      (u32)suballoc_slot, status);
2439                 goto bail;
2440         }
2441
2442         status = ocfs2_test_suballoc_bit(osb, inode_alloc_inode, alloc_bh,
2443                                          blkno, suballoc_bit, res);
2444         if (status < 0)
2445                 mlog(ML_ERROR, "test suballoc bit failed %d\n", status);
2446
2447         ocfs2_inode_unlock(inode_alloc_inode, 0);
2448         mutex_unlock(&inode_alloc_inode->i_mutex);
2449
2450         iput(inode_alloc_inode);
2451         brelse(alloc_bh);
2452 bail:
2453         mlog_exit(status);
2454         return status;
2455 }