2 * linux/fs/jbd2/commit.c
4 * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
6 * Copyright 1998 Red Hat corp --- All Rights Reserved
8 * This file is part of the Linux kernel and is made available under
9 * the terms of the GNU General Public License, version 2, or at your
10 * option, any later version, incorporated herein by reference.
12 * Journal commit routines for the generic filesystem journaling code;
13 * part of the ext2fs journaling system.
16 #include <linux/time.h>
18 #include <linux/jbd2.h>
19 #include <linux/errno.h>
20 #include <linux/slab.h>
22 #include <linux/pagemap.h>
23 #include <linux/jiffies.h>
24 #include <linux/crc32.h>
25 #include <linux/writeback.h>
26 #include <linux/backing-dev.h>
27 #include <linux/bio.h>
28 #include <linux/blkdev.h>
29 #include <linux/bitops.h>
30 #include <trace/events/jbd2.h>
33 * IO end handler for temporary buffer_heads handling writes to the journal.
/*
 * journal_end_buffer_io_sync - I/O completion callback for a journal write.
 *
 * @bh:       buffer_head whose I/O has finished; its b_private field is read
 *            as a pointer to the originating buffer_head (orig_bh).
 * @uptodate: completion status supplied by the caller.
 *
 * NOTE(review): this listing carries embedded line numbers and omits lines
 * 36, 38-40, 42, 44 and 48-50, so the tests that choose between the
 * set/clear calls and anything guarding the orig_bh accesses are not
 * visible here.
 */
35 static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
37 struct buffer_head *orig_bh = bh->b_private;
/* Both outcomes are present; presumably selected by @uptodate - confirm. */
41 set_buffer_uptodate(bh);
43 clear_buffer_uptodate(bh);
/*
 * Clear BH_Shadow in the originating buffer's state word, issue a memory
 * barrier, then wake any task waiting on that bit.
 */
45 clear_bit_unlock(BH_Shadow, &orig_bh->b_state);
46 smp_mb__after_atomic();
47 wake_up_bit(&orig_bh->b_state, BH_Shadow);
53 * When an ext4 file is truncated, it is possible that some pages are not
54 * successfully freed, because they are attached to a committing transaction.
55 * After the transaction commits, these pages are left on the LRU, with no
56 * ->mapping, and with attached buffers. These pages are trivially reclaimable
57 * by the VM, but their apparent absence upsets the VM accounting, and it makes
58 * the numbers in /proc/meminfo look odd.
60 * So here, we have a buffer which has just come off the forget list. Look to
61 * see if we can strip all buffers from the backing page.
63 * Called under lock_journal(), and possibly under journal_datalist_lock. The
64 * caller provided us with a ref against the buffer, and we drop that here.
/*
 * release_buffer_page - try to strip the buffers from a page.
 *
 * @bh: buffer_head taken as the starting point.
 *
 * NOTE(review): `page` is used below but its declaration and assignment
 * fall on listing lines that are not shown (67-71, 73-79), as do the
 * statements guarded by the two `if` tests and the acquisition of the page
 * reference that is released at the end.
 */
66 static void release_buffer_page(struct buffer_head *bh)
/* Tests for b_count != 1; the guarded statement is not shown. */
72 if (atomic_read(&bh->b_count) != 1)
80 /* OK, it's a truncated page */
/* Tests for a failed page-lock attempt; the guarded statement is not shown. */
81 if (!trylock_page(page))
/* Attempt to free the page's buffers; the return value is not used here. */
86 try_to_free_buffers(page);
/* Release a reference on the page. */
88 page_cache_release(page);
/*
 * jbd2_commit_block_csum_set - store a checksum in a commit block.
 *
 * @j:  journal supplying the checksum seed (j_csum_seed) and block size.
 * @bh: buffer holding the commit block; bh->b_data is viewed as a
 *      struct commit_header.
 *
 * NOTE(review): the declaration of `csum`, the statement guarded by the
 * feature test, and listing line 106 are not shown; confirm whether
 * h_chksum[0] is cleared before the checksum is taken.
 */
95 static void jbd2_commit_block_csum_set(journal_t *j, struct buffer_head *bh)
97 struct commit_header *h;
/* Tests for the v2/v3 checksum feature being absent. */
100 if (!jbd2_journal_has_csum_v2or3(j))
103 h = (struct commit_header *)(bh->b_data);
/* Zero the type and size fields before checksumming the block. */
104 h->h_chksum_type = 0;
105 h->h_chksum_size = 0;
/* Checksum j_blocksize bytes of the block, seeded with j_csum_seed. */
107 csum = jbd2_chksum(j, j->j_csum_seed, bh->b_data, j->j_blocksize);
/* Store the result big-endian in the first checksum slot. */
108 h->h_chksum[0] = cpu_to_be32(csum);
112 * Done it all: now submit the commit record. We should have
113 * cleaned up our previous buffers by now, so if we are in abort
114 * mode we can now just skip the rest of the journal write
117 * Returns 1 if the journal needs to be aborted or 0 on success
/*
 * journal_submit_commit_record - build the commit block and submit it.
 *
 * @journal:            journal being committed.
 * @commit_transaction: transaction whose t_tid is written as the sequence.
 * @cbh:                not referenced in the lines visible here.
 *
 * NOTE(review): the parameter list is cut after @cbh; `crc32_sum` is used
 * below and is presumably the missing parameter. The declaration of `ret`,
 * the statements guarded by the abort test, any check of the descriptor
 * buffer result, and the return path are on listing lines not shown.
 */
119 static int journal_submit_commit_record(journal_t *journal,
120 transaction_t *commit_transaction,
121 struct buffer_head **cbh,
124 struct commit_header *tmp;
125 struct buffer_head *bh;
/* Timestamp recorded in the commit header below. */
127 struct timespec64 now = current_kernel_time64();
131 if (is_journal_aborted(journal))
134 bh = jbd2_journal_get_descriptor_buffer(journal);
/* Fill in the commit header; every field is stored big-endian. */
138 tmp = (struct commit_header *)bh->b_data;
139 tmp->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
140 tmp->h_blocktype = cpu_to_be32(JBD2_COMMIT_BLOCK);
141 tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid);
142 tmp->h_commit_sec = cpu_to_be64(now.tv_sec);
143 tmp->h_commit_nsec = cpu_to_be32(now.tv_nsec);
/* With the checksum feature, record the CRC32 passed in as crc32_sum. */
145 if (jbd2_has_feature_checksum(journal)) {
146 tmp->h_chksum_type = JBD2_CRC32_CHKSUM;
147 tmp->h_chksum_size = JBD2_CRC32_CHKSUM_SIZE;
148 tmp->h_chksum[0] = cpu_to_be32(crc32_sum);
150 jbd2_commit_block_csum_set(journal, bh);
152 BUFFER_TRACE(bh, "submit commit block");
/* Prepare the buffer state and completion handler before submission. */
154 clear_buffer_dirty(bh);
155 set_buffer_uptodate(bh);
156 bh->b_end_io = journal_end_buffer_io_sync;
/*
 * WRITE_FLUSH_FUA is added when JBD2_BARRIER is set and the async-commit
 * feature is off. The second submit_bh() is presumably the else branch
 * (listing line 161 is not shown).
 */
158 if (journal->j_flags & JBD2_BARRIER &&
159 !jbd2_has_feature_async_commit(journal))
160 ret = submit_bh(WRITE_SYNC | WRITE_FLUSH_FUA, bh);
162 ret = submit_bh(WRITE_SYNC, bh);
169 * This function along with journal_submit_commit_record
170 * allows the commit record to be written asynchronously.
/*
 * journal_wait_on_commit_record - finish handling of a submitted commit block.
 *
 * @journal: not referenced in the lines visible here.
 * @bh:      buffer_head of the commit block.
 *
 * NOTE(review): listing lines 174-176, 178-179, 181 and 183 onward are not
 * shown, so the actual wait, the statement guarded by the uptodate test and
 * the return value cannot be confirmed from this excerpt.
 */
172 static int journal_wait_on_commit_record(journal_t *journal,
173 struct buffer_head *bh)
177 clear_buffer_dirty(bh);
/* Tests for the buffer not being uptodate, hinted as the unlikely case. */
180 if (unlikely(!buffer_uptodate(bh)))
182 put_bh(bh); /* One for getblk() */
188 * write the filemap data using writepage() address_space_operations.
189 * We don't do block allocation here even for delalloc. We don't
190 * use writepages() because with delayed allocation we may be doing
191 * block allocation in writepages().
/*
 * journal_submit_inode_data_buffers - write out the pages of one mapping.
 *
 * @mapping: address space passed to generic_writepages().
 *
 * NOTE(review): the declaration of `ret`, listing line 199 of the
 * initializer, its closing brace and the return statement are not shown.
 */
193 static int journal_submit_inode_data_buffers(struct address_space *mapping)
196 struct writeback_control wbc = {
197 .sync_mode = WB_SYNC_ALL,
/* Write budget: twice the number of pages currently in the mapping. */
198 .nr_to_write = mapping->nrpages * 2,
/* Upper bound of the range is the host inode's current size. */
200 .range_end = i_size_read(mapping->host),
203 ret = generic_writepages(mapping, &wbc);
208 * Submit all the data buffers of inode associated with the transaction to
211 * We are in a committing transaction. Therefore no new inode can be added to
212 * our inode list. We use JI_COMMIT_RUNNING flag to protect inode we currently
213 * operate on from being released while we write out pages.
/*
 * journal_submit_data_buffers - start data writeout for every inode on the
 * committing transaction's inode list.
 *
 * @journal:            journal whose j_list_lock protects the list walk.
 * @commit_transaction: transaction whose t_inode_list is iterated.
 *
 * NOTE(review): the declaration of `err`, whatever is done with `err` after
 * the writeout call (listing lines 235-236), the loop's closing brace and
 * the return statement are not shown in this excerpt.
 */
215 static int journal_submit_data_buffers(journal_t *journal,
216 transaction_t *commit_transaction)
218 struct jbd2_inode *jinode;
220 struct address_space *mapping;
222 spin_lock(&journal->j_list_lock);
223 list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
224 mapping = jinode->i_vfs_inode->i_mapping;
/* Flag the inode, then drop the list lock around the writeout call. */
225 set_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
226 spin_unlock(&journal->j_list_lock);
228 * submit the inode data buffers. We use writepage
229 * instead of writepages. Because writepages can do
230 * block allocation with delalloc. We need to write
231 * only allocated blocks here.
233 trace_jbd2_submit_inode_data(jinode->i_vfs_inode);
234 err = journal_submit_inode_data_buffers(mapping);
/*
 * Retake the lock, assert the inode still belongs to this transaction,
 * then clear the flag and wake anyone waiting on it.
 */
237 spin_lock(&journal->j_list_lock);
238 J_ASSERT(jinode->i_transaction == commit_transaction);
239 clear_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
240 smp_mb__after_atomic();
241 wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
243 spin_unlock(&journal->j_list_lock);
248 * Wait for data submitted for writeout, refile inodes to proper
249 * transaction if needed.
/*
 * journal_finish_inode_data_buffers - wait for inode data writeout, then
 * move each inode off the committing transaction's list.
 *
 * @journal:            journal whose j_list_lock protects the inode list.
 * @commit_transaction: transaction whose t_inode_list is processed.
 *
 * NOTE(review): the declarations of `err` (and any accumulated return
 * value), the test on `err`, the start of the call that ends on listing
 * line 271, several closing braces and the return statement are not shown.
 */
252 static int journal_finish_inode_data_buffers(journal_t *journal,
253 transaction_t *commit_transaction)
255 struct jbd2_inode *jinode, *next_i;
258 /* For locking, see the comment in journal_submit_data_buffers() */
259 spin_lock(&journal->j_list_lock);
/* First pass: wait on each inode's mapping with the list lock dropped. */
260 list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
261 set_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
262 spin_unlock(&journal->j_list_lock);
263 err = filemap_fdatawait(jinode->i_vfs_inode->i_mapping);
266 * Because AS_EIO is cleared by
267 * filemap_fdatawait_range(), set it again so
268 * that user process can get -EIO from fsync().
/* Tail of a call on the mapping's flags; presumably re-sets AS_EIO - confirm. */
271 &jinode->i_vfs_inode->i_mapping->flags);
276 spin_lock(&journal->j_list_lock);
277 clear_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
278 smp_mb__after_atomic();
279 wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
282 /* Now refile inode to proper lists */
/* Second pass uses the _safe iterator because entries are unlinked. */
283 list_for_each_entry_safe(jinode, next_i,
284 &commit_transaction->t_inode_list, i_list) {
285 list_del(&jinode->i_list);
/* An inode with a pending next transaction is moved onto that one's list. */
286 if (jinode->i_next_transaction) {
287 jinode->i_transaction = jinode->i_next_transaction;
288 jinode->i_next_transaction = NULL;
289 list_add(&jinode->i_list,
290 &jinode->i_transaction->t_inode_list);
/* Presumably the else branch: the inode is left with no transaction. */
292 jinode->i_transaction = NULL;
295 spin_unlock(&journal->j_list_lock);
/*
 * jbd2_checksum_data - fold one buffer's contents into a running CRC32.
 *
 * @crc32_sum: checksum value passed to crc32_be() as the starting value.
 * @bh:        buffer whose b_size bytes are checksummed.
 *
 * NOTE(review): the declarations of `addr` and `checksum`, any unmapping of
 * the page and the return statement are on listing lines not shown.
 */
300 static __u32 jbd2_checksum_data(__u32 crc32_sum, struct buffer_head *bh)
302 struct page *page = bh->b_page;
/* Map the page, then checksum from b_data's offset within that page. */
306 addr = kmap_atomic(page);
307 checksum = crc32_be(crc32_sum,
308 (void *)(addr + offset_in_page(bh->b_data)), bh->b_size);
/*
 * write_tag_block - store a block number in a journal block tag.
 *
 * @j:     journal, consulted for the 64bit feature.
 * @tag:   tag to fill in.
 * @block: block number to record.
 *
 * NOTE(review): this listing carries embedded line numbers and the brace
 * lines (316, 320) are not shown.
 */
314 static void write_tag_block(journal_t *j, journal_block_tag_t *tag,
315 unsigned long long block)
/* Low 32 bits of the block number, stored big-endian. */
317 tag->t_blocknr = cpu_to_be32(block & (u32)~0);
/* With the 64bit feature, also store the bits above 32 (shift done in two steps). */
318 if (jbd2_has_feature_64bit(j))
319 tag->t_blocknr_high = cpu_to_be32((block >> 31) >> 1);
/*
 * jbd2_descr_block_csum_set - store a checksum in a descriptor block's tail.
 *
 * @j:  journal supplying the checksum seed (j_csum_seed) and block size.
 * @bh: buffer holding the descriptor block.
 *
 * NOTE(review): the declaration of `csum` and the statement guarded by the
 * feature test are on listing lines not shown.
 */
322 static void jbd2_descr_block_csum_set(journal_t *j,
323 struct buffer_head *bh)
325 struct jbd2_journal_block_tail *tail;
/* Tests for the v2/v3 checksum feature being absent. */
328 if (!jbd2_journal_has_csum_v2or3(j))
/* The tail structure occupies the last bytes of the j_blocksize-sized block. */
331 tail = (struct jbd2_journal_block_tail *)(bh->b_data + j->j_blocksize -
332 sizeof(struct jbd2_journal_block_tail));
/* Zero the field first so the checksum covers a block with t_checksum == 0. */
333 tail->t_checksum = 0;
334 csum = jbd2_chksum(j, j->j_csum_seed, bh->b_data, j->j_blocksize);
335 tail->t_checksum = cpu_to_be32(csum);
/*
 * jbd2_block_tag_csum_set - store a per-block checksum in a block tag.
 *
 * @j:        journal supplying the checksum seed and feature flags.
 * @tag:      tag to receive the checksum; also viewed as a
 *            journal_block_tag3_t.
 * @bh:       buffer whose data is checksummed.
 * @sequence: value converted to big-endian and checksummed ahead of the data.
 *
 * NOTE(review): the declarations of `addr`, `csum32` and `seq`, the
 * statement guarded by the first feature test, and listing lines 354-356
 * (the rest of the second jbd2_chksum() call and whatever follows it) are
 * not shown.
 */
338 static void jbd2_block_tag_csum_set(journal_t *j, journal_block_tag_t *tag,
339 struct buffer_head *bh, __u32 sequence)
341 journal_block_tag3_t *tag3 = (journal_block_tag3_t *)tag;
342 struct page *page = bh->b_page;
/* Tests for the v2/v3 checksum feature being absent. */
347 if (!jbd2_journal_has_csum_v2or3(j))
350 seq = cpu_to_be32(sequence);
351 addr = kmap_atomic(page);
/* Checksum the sequence first, then continue over the buffer data. */
352 csum32 = jbd2_chksum(j, j->j_csum_seed, (__u8 *)&seq, sizeof(seq));
353 csum32 = jbd2_chksum(j, csum32, addr + offset_in_page(bh->b_data),
/*
 * With the csum3 feature the full 32-bit value goes into tag3. The 16-bit
 * store into tag is presumably the else branch (listing line 359 not shown).
 */
357 if (jbd2_has_feature_csum3(j))
358 tag3->t_checksum = cpu_to_be32(csum32);
360 tag->t_checksum = cpu_to_be16(csum32);
363 * jbd2_journal_commit_transaction
365 * The primary function for committing a transaction to the log. This
366 * function is called by the journal thread to begin a complete commit.
368 void jbd2_journal_commit_transaction(journal_t *journal)
370 struct transaction_stats_s stats;
371 transaction_t *commit_transaction;
372 struct journal_head *jh;
373 struct buffer_head *descriptor;
374 struct buffer_head **wbuf = journal->j_wbuf;
378 unsigned long long blocknr;
382 journal_header_t *header;
383 journal_block_tag_t *tag = NULL;
388 int tag_bytes = journal_tag_bytes(journal);
389 struct buffer_head *cbh = NULL; /* For transactional checksums */
390 __u32 crc32_sum = ~0;
391 struct blk_plug plug;
392 /* Tail of the journal */
393 unsigned long first_block;
400 if (jbd2_journal_has_csum_v2or3(journal))
401 csum_size = sizeof(struct jbd2_journal_block_tail);
404 * First job: lock down the current transaction and wait for
405 * all outstanding updates to complete.
408 /* Do we need to erase the effects of a prior jbd2_journal_flush? */
409 if (journal->j_flags & JBD2_FLUSHED) {
410 jbd_debug(3, "super block updated\n");
411 mutex_lock(&journal->j_checkpoint_mutex);
413 * We hold j_checkpoint_mutex so tail cannot change under us.
414 * We don't need any special data guarantees for writing sb
415 * since journal is empty and it is ok for write to be
416 * flushed only with transaction commit.
418 jbd2_journal_update_sb_log_tail(journal,
419 journal->j_tail_sequence,
422 mutex_unlock(&journal->j_checkpoint_mutex);
424 jbd_debug(3, "superblock not updated\n");
427 J_ASSERT(journal->j_running_transaction != NULL);
428 J_ASSERT(journal->j_committing_transaction == NULL);
430 commit_transaction = journal->j_running_transaction;
432 trace_jbd2_start_commit(journal, commit_transaction);
433 jbd_debug(1, "JBD2: starting commit of transaction %d\n",
434 commit_transaction->t_tid);
436 write_lock(&journal->j_state_lock);
437 J_ASSERT(commit_transaction->t_state == T_RUNNING);
438 commit_transaction->t_state = T_LOCKED;
440 trace_jbd2_commit_locking(journal, commit_transaction);
441 stats.run.rs_wait = commit_transaction->t_max_wait;
442 stats.run.rs_request_delay = 0;
443 stats.run.rs_locked = jiffies;
444 if (commit_transaction->t_requested)
445 stats.run.rs_request_delay =
446 jbd2_time_diff(commit_transaction->t_requested,
447 stats.run.rs_locked);
448 stats.run.rs_running = jbd2_time_diff(commit_transaction->t_start,
449 stats.run.rs_locked);
451 spin_lock(&commit_transaction->t_handle_lock);
452 while (atomic_read(&commit_transaction->t_updates)) {
455 prepare_to_wait(&journal->j_wait_updates, &wait,
456 TASK_UNINTERRUPTIBLE);
457 if (atomic_read(&commit_transaction->t_updates)) {
458 spin_unlock(&commit_transaction->t_handle_lock);
459 write_unlock(&journal->j_state_lock);
461 write_lock(&journal->j_state_lock);
462 spin_lock(&commit_transaction->t_handle_lock);
464 finish_wait(&journal->j_wait_updates, &wait);
466 spin_unlock(&commit_transaction->t_handle_lock);
468 J_ASSERT (atomic_read(&commit_transaction->t_outstanding_credits) <=
469 journal->j_max_transaction_buffers);
472 * First thing we are allowed to do is to discard any remaining
473 * BJ_Reserved buffers. Note, it is _not_ permissible to assume
474 * that there are no such buffers: if a large filesystem
475 * operation like a truncate needs to split itself over multiple
476 * transactions, then it may try to do a jbd2_journal_restart() while
477 * there are still BJ_Reserved buffers outstanding. These must
478 * be released cleanly from the current transaction.
480 * In this case, the filesystem must still reserve write access
481 * again before modifying the buffer in the new transaction, but
482 * we do not require it to remember exactly which old buffers it
483 * has reserved. This is consistent with the existing behaviour
484 * that multiple jbd2_journal_get_write_access() calls to the same
485 * buffer are perfectly permissible.
487 while (commit_transaction->t_reserved_list) {
488 jh = commit_transaction->t_reserved_list;
489 JBUFFER_TRACE(jh, "reserved, unused: refile");
491 * A jbd2_journal_get_undo_access()+jbd2_journal_release_buffer() may
492 * leave undo-committed data.
494 if (jh->b_committed_data) {
495 struct buffer_head *bh = jh2bh(jh);
497 jbd_lock_bh_state(bh);
498 jbd2_free(jh->b_committed_data, bh->b_size);
499 jh->b_committed_data = NULL;
500 jbd_unlock_bh_state(bh);
502 jbd2_journal_refile_buffer(journal, jh);
506 * Now try to drop any written-back buffers from the journal's
507 * checkpoint lists. We do this *before* commit because it potentially
510 spin_lock(&journal->j_list_lock);
511 __jbd2_journal_clean_checkpoint_list(journal, false);
512 spin_unlock(&journal->j_list_lock);
514 jbd_debug(3, "JBD2: commit phase 1\n");
517 * Clear revoked flag to reflect there are no revoked buffers
518 * in the next transaction which is going to be started.
520 jbd2_clear_buffer_revoked_flags(journal);
523 * Switch to a new revoke table.
525 jbd2_journal_switch_revoke_table(journal);
528 * Reserved credits cannot be claimed anymore, free them
530 atomic_sub(atomic_read(&journal->j_reserved_credits),
531 &commit_transaction->t_outstanding_credits);
533 trace_jbd2_commit_flushing(journal, commit_transaction);
534 stats.run.rs_flushing = jiffies;
535 stats.run.rs_locked = jbd2_time_diff(stats.run.rs_locked,
536 stats.run.rs_flushing);
538 commit_transaction->t_state = T_FLUSH;
539 journal->j_committing_transaction = commit_transaction;
540 journal->j_running_transaction = NULL;
541 start_time = ktime_get();
542 commit_transaction->t_log_start = journal->j_head;
543 wake_up(&journal->j_wait_transaction_locked);
544 write_unlock(&journal->j_state_lock);
546 jbd_debug(3, "JBD2: commit phase 2a\n");
549 * Now start flushing things to disk, in the order they appear
550 * on the transaction lists. Data blocks go first.
552 err = journal_submit_data_buffers(journal, commit_transaction);
554 jbd2_journal_abort(journal, err);
556 blk_start_plug(&plug);
557 jbd2_journal_write_revoke_records(journal, commit_transaction,
558 &log_bufs, WRITE_SYNC);
560 jbd_debug(3, "JBD2: commit phase 2b\n");
563 * Way to go: we have now written out all of the data for a
564 * transaction! Now comes the tricky part: we need to write out
565 * metadata. Loop over the transaction's entire buffer list:
567 write_lock(&journal->j_state_lock);
568 commit_transaction->t_state = T_COMMIT;
569 write_unlock(&journal->j_state_lock);
571 trace_jbd2_commit_logging(journal, commit_transaction);
572 stats.run.rs_logging = jiffies;
573 stats.run.rs_flushing = jbd2_time_diff(stats.run.rs_flushing,
574 stats.run.rs_logging);
575 stats.run.rs_blocks =
576 atomic_read(&commit_transaction->t_outstanding_credits);
577 stats.run.rs_blocks_logged = 0;
579 J_ASSERT(commit_transaction->t_nr_buffers <=
580 atomic_read(&commit_transaction->t_outstanding_credits));
585 while (commit_transaction->t_buffers) {
587 /* Find the next buffer to be journaled... */
589 jh = commit_transaction->t_buffers;
591 /* If we're in abort mode, we just un-journal the buffer and
594 if (is_journal_aborted(journal)) {
595 clear_buffer_jbddirty(jh2bh(jh));
596 JBUFFER_TRACE(jh, "journal is aborting: refile");
597 jbd2_buffer_abort_trigger(jh,
599 jh->b_frozen_triggers :
601 jbd2_journal_refile_buffer(journal, jh);
602 /* If that was the last one, we need to clean up
603 * any descriptor buffers which may have been
604 * already allocated, even if we are now
606 if (!commit_transaction->t_buffers)
607 goto start_journal_io;
611 /* Make sure we have a descriptor block in which to
612 record the metadata buffer. */
615 J_ASSERT (bufs == 0);
617 jbd_debug(4, "JBD2: get descriptor\n");
619 descriptor = jbd2_journal_get_descriptor_buffer(journal);
621 jbd2_journal_abort(journal, -EIO);
625 jbd_debug(4, "JBD2: got buffer %llu (%p)\n",
626 (unsigned long long)descriptor->b_blocknr,
628 header = (journal_header_t *)descriptor->b_data;
629 header->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
630 header->h_blocktype = cpu_to_be32(JBD2_DESCRIPTOR_BLOCK);
631 header->h_sequence = cpu_to_be32(commit_transaction->t_tid);
633 tagp = &descriptor->b_data[sizeof(journal_header_t)];
634 space_left = descriptor->b_size -
635 sizeof(journal_header_t);
637 set_buffer_jwrite(descriptor);
638 set_buffer_dirty(descriptor);
639 wbuf[bufs++] = descriptor;
641 /* Record it so that we can wait for IO
643 BUFFER_TRACE(descriptor, "ph3: file as descriptor");
644 jbd2_file_log_bh(&log_bufs, descriptor);
647 /* Where is the buffer to be written? */
649 err = jbd2_journal_next_log_block(journal, &blocknr);
650 /* If the block mapping failed, just abandon the buffer
651 and repeat this loop: we'll fall into the
652 refile-on-abort condition above. */
654 jbd2_journal_abort(journal, err);
659 * start_this_handle() uses t_outstanding_credits to determine
660 * the free space in the log, but this counter is changed
661 * by jbd2_journal_next_log_block() also.
663 atomic_dec(&commit_transaction->t_outstanding_credits);
665 /* Bump b_count to prevent truncate from stumbling over
666 the shadowed buffer! @@@ This can go if we ever get
667 rid of the shadow pairing of buffers. */
668 atomic_inc(&jh2bh(jh)->b_count);
671 * Make a temporary IO buffer with which to write it out
672 * (this will requeue the metadata buffer to BJ_Shadow).
674 set_bit(BH_JWrite, &jh2bh(jh)->b_state);
675 JBUFFER_TRACE(jh, "ph3: write metadata");
676 flags = jbd2_journal_write_metadata_buffer(commit_transaction,
677 jh, &wbuf[bufs], blocknr);
679 jbd2_journal_abort(journal, flags);
682 jbd2_file_log_bh(&io_bufs, wbuf[bufs]);
684 /* Record the new block's tag in the current descriptor
689 tag_flag |= JBD2_FLAG_ESCAPE;
691 tag_flag |= JBD2_FLAG_SAME_UUID;
693 tag = (journal_block_tag_t *) tagp;
694 write_tag_block(journal, tag, jh2bh(jh)->b_blocknr);
695 tag->t_flags = cpu_to_be16(tag_flag);
696 jbd2_block_tag_csum_set(journal, tag, wbuf[bufs],
697 commit_transaction->t_tid);
699 space_left -= tag_bytes;
703 memcpy (tagp, journal->j_uuid, 16);
709 /* If there's no more to do, or if the descriptor is full,
712 if (bufs == journal->j_wbufsize ||
713 commit_transaction->t_buffers == NULL ||
714 space_left < tag_bytes + 16 + csum_size) {
716 jbd_debug(4, "JBD2: Submit %d IOs\n", bufs);
718 /* Write an end-of-descriptor marker before
719 submitting the IOs. "tag" still points to
720 the last tag we set up. */
722 tag->t_flags |= cpu_to_be16(JBD2_FLAG_LAST_TAG);
724 jbd2_descr_block_csum_set(journal, descriptor);
726 for (i = 0; i < bufs; i++) {
727 struct buffer_head *bh = wbuf[i];
731 if (jbd2_has_feature_checksum(journal)) {
733 jbd2_checksum_data(crc32_sum, bh);
737 clear_buffer_dirty(bh);
738 set_buffer_uptodate(bh);
739 bh->b_end_io = journal_end_buffer_io_sync;
740 submit_bh(WRITE_SYNC, bh);
744 /* Force a new descriptor to be generated next
745 time round the loop. */
751 err = journal_finish_inode_data_buffers(journal, commit_transaction);
754 "JBD2: Detected IO errors while flushing file data "
755 "on %s\n", journal->j_devname);
756 if (journal->j_flags & JBD2_ABORT_ON_SYNCDATA_ERR)
757 jbd2_journal_abort(journal, err);
762 * Get current oldest transaction in the log before we issue flush
763 * to the filesystem device. After the flush we can be sure that
764 * blocks of all older transactions are checkpointed to persistent
765 * storage and we will be safe to update journal start in the
766 * superblock with the numbers we get here.
769 jbd2_journal_get_log_tail(journal, &first_tid, &first_block);
771 write_lock(&journal->j_state_lock);
773 long freed = first_block - journal->j_tail;
775 if (first_block < journal->j_tail)
776 freed += journal->j_last - journal->j_first;
777 /* Update tail only if we free significant amount of space */
778 if (freed < journal->j_maxlen / 4)
781 J_ASSERT(commit_transaction->t_state == T_COMMIT);
782 commit_transaction->t_state = T_COMMIT_DFLUSH;
783 write_unlock(&journal->j_state_lock);
786 * If the journal is not located on the file system device,
787 * then we must flush the file system device before we issue
790 if (commit_transaction->t_need_data_flush &&
791 (journal->j_fs_dev != journal->j_dev) &&
792 (journal->j_flags & JBD2_BARRIER))
793 blkdev_issue_flush(journal->j_fs_dev, GFP_NOFS, NULL);
795 /* Done it all: now write the commit record asynchronously. */
796 if (jbd2_has_feature_async_commit(journal)) {
797 err = journal_submit_commit_record(journal, commit_transaction,
800 __jbd2_journal_abort_hard(journal);
803 blk_finish_plug(&plug);
805 /* Lo and behold: we have just managed to send a transaction to
806 the log. Before we can commit it, wait for the IO so far to
807 complete. Control buffers being written are on the
808 transaction's t_log_list queue, and metadata buffers are on
811 Wait for the buffers in reverse order. That way we are
812 less likely to be woken up until all IOs have completed, and
813 so we incur less scheduling load.
816 jbd_debug(3, "JBD2: commit phase 3\n");
818 while (!list_empty(&io_bufs)) {
819 struct buffer_head *bh = list_entry(io_bufs.prev,
826 if (unlikely(!buffer_uptodate(bh)))
828 jbd2_unfile_log_bh(bh);
829 stats.run.rs_blocks_logged++;
832 * The list contains temporary buffer heads created by
833 * jbd2_journal_write_metadata_buffer().
835 BUFFER_TRACE(bh, "dumping temporary bh");
837 J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0);
838 free_buffer_head(bh);
840 /* We also have to refile the corresponding shadowed buffer */
841 jh = commit_transaction->t_shadow_list->b_tprev;
843 clear_buffer_jwrite(bh);
844 J_ASSERT_BH(bh, buffer_jbddirty(bh));
845 J_ASSERT_BH(bh, !buffer_shadow(bh));
847 /* The metadata is now released for reuse, but we need
848 to remember it against this transaction so that when
849 we finally commit, we can do any checkpointing
851 JBUFFER_TRACE(jh, "file as BJ_Forget");
852 jbd2_journal_file_buffer(jh, commit_transaction, BJ_Forget);
853 JBUFFER_TRACE(jh, "brelse shadowed buffer");
857 J_ASSERT (commit_transaction->t_shadow_list == NULL);
859 jbd_debug(3, "JBD2: commit phase 4\n");
861 /* Here we wait for the revoke record and descriptor record buffers */
862 while (!list_empty(&log_bufs)) {
863 struct buffer_head *bh;
865 bh = list_entry(log_bufs.prev, struct buffer_head, b_assoc_buffers);
869 if (unlikely(!buffer_uptodate(bh)))
872 BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile");
873 clear_buffer_jwrite(bh);
874 jbd2_unfile_log_bh(bh);
875 stats.run.rs_blocks_logged++;
876 __brelse(bh); /* One for getblk */
877 /* AKPM: bforget here */
881 jbd2_journal_abort(journal, err);
883 jbd_debug(3, "JBD2: commit phase 5\n");
884 write_lock(&journal->j_state_lock);
885 J_ASSERT(commit_transaction->t_state == T_COMMIT_DFLUSH);
886 commit_transaction->t_state = T_COMMIT_JFLUSH;
887 write_unlock(&journal->j_state_lock);
889 if (!jbd2_has_feature_async_commit(journal)) {
890 err = journal_submit_commit_record(journal, commit_transaction,
893 __jbd2_journal_abort_hard(journal);
896 err = journal_wait_on_commit_record(journal, cbh);
897 stats.run.rs_blocks_logged++;
898 if (jbd2_has_feature_async_commit(journal) &&
899 journal->j_flags & JBD2_BARRIER) {
900 blkdev_issue_flush(journal->j_dev, GFP_NOFS, NULL);
904 jbd2_journal_abort(journal, err);
907 * Now disk caches for filesystem device are flushed so we are safe to
908 * erase checkpointed transactions from the log by updating journal
912 jbd2_update_log_tail(journal, first_tid, first_block);
914 /* End of a transaction! Finally, we can do checkpoint
915 processing: any buffers committed as a result of this
916 transaction can be removed from any checkpoint list it was on
919 jbd_debug(3, "JBD2: commit phase 6\n");
921 J_ASSERT(list_empty(&commit_transaction->t_inode_list));
922 J_ASSERT(commit_transaction->t_buffers == NULL);
923 J_ASSERT(commit_transaction->t_checkpoint_list == NULL);
924 J_ASSERT(commit_transaction->t_shadow_list == NULL);
928 * As there are other places (journal_unmap_buffer()) adding buffers
929 * to this list we have to be careful and hold the j_list_lock.
931 spin_lock(&journal->j_list_lock);
932 while (commit_transaction->t_forget) {
933 transaction_t *cp_transaction;
934 struct buffer_head *bh;
937 jh = commit_transaction->t_forget;
938 spin_unlock(&journal->j_list_lock);
941 * Get a reference so that bh cannot be freed before we are
945 jbd_lock_bh_state(bh);
946 J_ASSERT_JH(jh, jh->b_transaction == commit_transaction);
949 * If there is undo-protected committed data against
950 * this buffer, then we can remove it now. If it is a
951 * buffer needing such protection, the old frozen_data
952 * field now points to a committed version of the
953 * buffer, so rotate that field to the new committed
956 * Otherwise, we can just throw away the frozen data now.
958 * We also know that the frozen data has already fired
959 * its triggers if they exist, so we can clear that too.
961 if (jh->b_committed_data) {
962 jbd2_free(jh->b_committed_data, bh->b_size);
963 jh->b_committed_data = NULL;
964 if (jh->b_frozen_data) {
965 jh->b_committed_data = jh->b_frozen_data;
966 jh->b_frozen_data = NULL;
967 jh->b_frozen_triggers = NULL;
969 } else if (jh->b_frozen_data) {
970 jbd2_free(jh->b_frozen_data, bh->b_size);
971 jh->b_frozen_data = NULL;
972 jh->b_frozen_triggers = NULL;
975 spin_lock(&journal->j_list_lock);
976 cp_transaction = jh->b_cp_transaction;
977 if (cp_transaction) {
978 JBUFFER_TRACE(jh, "remove from old cp transaction");
979 cp_transaction->t_chp_stats.cs_dropped++;
980 __jbd2_journal_remove_checkpoint(jh);
983 /* Only re-checkpoint the buffer_head if it is marked
984 * dirty. If the buffer was added to the BJ_Forget list
985 * by jbd2_journal_forget, it may no longer be dirty and
986 * there's no point in keeping a checkpoint record for
990 * A buffer which has been freed while still being journaled by
991 * a previous transaction.
993 if (buffer_freed(bh)) {
995 * If the running transaction is the one containing
996 * "add to orphan" operation (b_next_transaction !=
997 * NULL), we have to wait for that transaction to
998 * commit before we can really get rid of the buffer.
999 * So just clear b_modified to not confuse transaction
1000 * credit accounting and refile the buffer to
1001 * BJ_Forget of the running transaction. If the just
1002 * committed transaction contains "add to orphan"
1003 * operation, we can completely invalidate the buffer
1004 * now. We are rather thorough in that since the
1005 * buffer may be still accessible when blocksize <
1006 * pagesize and it is attached to the last partial
1010 if (!jh->b_next_transaction) {
1011 clear_buffer_freed(bh);
1012 clear_buffer_jbddirty(bh);
1013 clear_buffer_mapped(bh);
1014 clear_buffer_new(bh);
1015 clear_buffer_req(bh);
1020 if (buffer_jbddirty(bh)) {
1021 JBUFFER_TRACE(jh, "add to new checkpointing trans");
1022 __jbd2_journal_insert_checkpoint(jh, commit_transaction);
1023 if (is_journal_aborted(journal))
1024 clear_buffer_jbddirty(bh);
1026 J_ASSERT_BH(bh, !buffer_dirty(bh));
1028 * The buffer on BJ_Forget list and not jbddirty means
1029 * it has been freed by this transaction and hence it
1030 * could not have been reallocated until this
1031 * transaction has committed. *BUT* it could be
1032 * reallocated once we have written all the data to
1033 * disk and before we process the buffer on BJ_Forget
1036 if (!jh->b_next_transaction)
1039 JBUFFER_TRACE(jh, "refile or unfile buffer");
1040 __jbd2_journal_refile_buffer(jh);
1041 jbd_unlock_bh_state(bh);
1043 release_buffer_page(bh); /* Drops bh reference */
1046 cond_resched_lock(&journal->j_list_lock);
1048 spin_unlock(&journal->j_list_lock);
1050 * This is a bit sleazy. We use j_list_lock to protect transition
1051 * of a transaction into T_FINISHED state and calling
1052 * __jbd2_journal_drop_transaction(). Otherwise we could race with
1053 * other checkpointing code processing the transaction...
1055 write_lock(&journal->j_state_lock);
1056 spin_lock(&journal->j_list_lock);
1058 * Now recheck if some buffers did not get attached to the transaction
1059 * while the lock was dropped...
1061 if (commit_transaction->t_forget) {
1062 spin_unlock(&journal->j_list_lock);
1063 write_unlock(&journal->j_state_lock);
1067 /* Add the transaction to the checkpoint list
1068 * __journal_remove_checkpoint() can not destroy transaction
1069 * under us because it is not marked as T_FINISHED yet */
1070 if (journal->j_checkpoint_transactions == NULL) {
1071 journal->j_checkpoint_transactions = commit_transaction;
1072 commit_transaction->t_cpnext = commit_transaction;
1073 commit_transaction->t_cpprev = commit_transaction;
1075 commit_transaction->t_cpnext =
1076 journal->j_checkpoint_transactions;
1077 commit_transaction->t_cpprev =
1078 commit_transaction->t_cpnext->t_cpprev;
1079 commit_transaction->t_cpnext->t_cpprev =
1081 commit_transaction->t_cpprev->t_cpnext =
1084 spin_unlock(&journal->j_list_lock);
1086 /* Done with this transaction! */
1088 jbd_debug(3, "JBD2: commit phase 7\n");
1090 J_ASSERT(commit_transaction->t_state == T_COMMIT_JFLUSH);
1092 commit_transaction->t_start = jiffies;
1093 stats.run.rs_logging = jbd2_time_diff(stats.run.rs_logging,
1094 commit_transaction->t_start);
1097 * File the transaction statistics
1099 stats.ts_tid = commit_transaction->t_tid;
1100 stats.run.rs_handle_count =
1101 atomic_read(&commit_transaction->t_handle_count);
1102 trace_jbd2_run_stats(journal->j_fs_dev->bd_dev,
1103 commit_transaction->t_tid, &stats.run);
1104 stats.ts_requested = (commit_transaction->t_requested) ? 1 : 0;
1106 commit_transaction->t_state = T_COMMIT_CALLBACK;
1107 J_ASSERT(commit_transaction == journal->j_committing_transaction);
1108 journal->j_commit_sequence = commit_transaction->t_tid;
1109 journal->j_committing_transaction = NULL;
1110 commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
1113 * weight the commit time higher than the average time so we don't
1114 * react too strongly to vast changes in the commit time
1116 if (likely(journal->j_average_commit_time))
1117 journal->j_average_commit_time = (commit_time +
1118 journal->j_average_commit_time*3) / 4;
1120 journal->j_average_commit_time = commit_time;
1122 write_unlock(&journal->j_state_lock);
1124 if (journal->j_commit_callback)
1125 journal->j_commit_callback(journal, commit_transaction);
1127 trace_jbd2_end_commit(journal, commit_transaction);
1128 jbd_debug(1, "JBD2: commit %d complete, head %d\n",
1129 journal->j_commit_sequence, journal->j_tail_sequence);
1131 write_lock(&journal->j_state_lock);
1132 spin_lock(&journal->j_list_lock);
1133 commit_transaction->t_state = T_FINISHED;
1134 /* Check if the transaction can be dropped now that we are finished */
1135 if (commit_transaction->t_checkpoint_list == NULL &&
1136 commit_transaction->t_checkpoint_io_list == NULL) {
1137 __jbd2_journal_drop_transaction(journal, commit_transaction);
1138 jbd2_journal_free_transaction(commit_transaction);
1140 spin_unlock(&journal->j_list_lock);
1141 write_unlock(&journal->j_state_lock);
1142 wake_up(&journal->j_wait_done_commit);
1145 * Calculate overall stats
1147 spin_lock(&journal->j_history_lock);
1148 journal->j_stats.ts_tid++;
1149 journal->j_stats.ts_requested += stats.ts_requested;
1150 journal->j_stats.run.rs_wait += stats.run.rs_wait;
1151 journal->j_stats.run.rs_request_delay += stats.run.rs_request_delay;
1152 journal->j_stats.run.rs_running += stats.run.rs_running;
1153 journal->j_stats.run.rs_locked += stats.run.rs_locked;
1154 journal->j_stats.run.rs_flushing += stats.run.rs_flushing;
1155 journal->j_stats.run.rs_logging += stats.run.rs_logging;
1156 journal->j_stats.run.rs_handle_count += stats.run.rs_handle_count;
1157 journal->j_stats.run.rs_blocks += stats.run.rs_blocks;
1158 journal->j_stats.run.rs_blocks_logged += stats.run.rs_blocks_logged;
1159 spin_unlock(&journal->j_history_lock);