fs/jbd2/commit.c

   1 /*
   2  * linux/fs/jbd2/commit.c
   3  *
   4  * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
   5  *
   6  * Copyright 1998 Red Hat corp --- All Rights Reserved
   7  *
   8  * This file is part of the Linux kernel and is made available under
   9  * the terms of the GNU General Public License, version 2, or at your
  10  * option, any later version, incorporated herein by reference.
  11  *
  12  * Journal commit routines for the generic filesystem journaling code;
  13  * part of the ext2fs journaling system.
  14  */
  15
  16 #include <linux/time.h>
  17 #include <linux/fs.h>
  18 #include <linux/jbd2.h>
  19 #include <linux/errno.h>
  20 #include <linux/slab.h>
  21 #include <linux/mm.h>
  22 #include <linux/pagemap.h>
  23 #include <linux/jiffies.h>
  24 #include <linux/crc32.h>
  25 #include <linux/writeback.h>
  26 #include <linux/backing-dev.h>
  27 #include <linux/bio.h>
  28 #include <linux/blkdev.h>
  29 #include <trace/events/jbd2.h>
  30
  31 /*
  32  * Default IO end handler for temporary BJ_IO buffer_heads.
  33  */
  34 static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
  35 {
  36         BUFFER_TRACE(bh, "");
  37         if (uptodate)
  38                 set_buffer_uptodate(bh);
  39         else
  40                 clear_buffer_uptodate(bh);
  41         unlock_buffer(bh);
  42 }
  43
  44 /*
  45  * When an ext4 file is truncated, it is possible that some pages are not
  46  * successfully freed, because they are attached to a committing transaction.
  47  * After the transaction commits, these pages are left on the LRU, with no
  48  * ->mapping, and with attached buffers.  These pages are trivially reclaimable
  49  * by the VM, but their apparent absence upsets the VM accounting, and it makes
  50  * the numbers in /proc/meminfo look odd.
  51  *
  52  * So here, we have a buffer which has just come off the forget list.  Look to
  53  * see if we can strip all buffers from the backing page.
  54  *
  55  * Called under lock_journal(), and possibly under journal_datalist_lock.  The
  56  * caller provided us with a ref against the buffer, and we drop that here.
  57  */
  58 static void release_buffer_page(struct buffer_head *bh)
  59 {
  60         struct page *page;
  61
  62         if (buffer_dirty(bh))
  63                 goto nope;
  64         if (atomic_read(&bh->b_count) != 1)
  65                 goto nope;
  66         page = bh->b_page;
  67         if (!page)
  68                 goto nope;
  69         if (page->mapping)
  70                 goto nope;
  71
  72         /* OK, it's a truncated page */
  73         if (!trylock_page(page))
  74                 goto nope;
  75
  76         page_cache_get(page);
  77         __brelse(bh);
  78         try_to_free_buffers(page);
  79         unlock_page(page);
  80         page_cache_release(page);
  81         return;
  82
  83 nope:
  84         __brelse(bh);
  85 }
  86
  87 /*
  88  * Done it all: now submit the commit record.  We should have
  89  * cleaned up our previous buffers by now, so if we are in abort
  90  * mode we can now just skip the rest of the journal write
  91  * entirely.
  92  *
  93  * Returns 1 if the journal needs to be aborted or 0 on success
  94  */
  95 static int journal_submit_commit_record(journal_t *journal,
  96                                         transaction_t *commit_transaction,
  97                                         struct buffer_head **cbh,
  98                                         __u32 crc32_sum)
  99 {
 100         struct journal_head *descriptor;
 101         struct commit_header *tmp;
 102         struct buffer_head *bh;
 103         int ret;
 104         struct timespec now = current_kernel_time();
 105
 106         if (is_journal_aborted(journal))
 107                 return 0;
 108
 109         descriptor = jbd2_journal_get_descriptor_buffer(journal);
 110         if (!descriptor)
 111                 return 1;
 112
 113         bh = jh2bh(descriptor);
 114
 115         tmp = (struct commit_header *)bh->b_data;
 116         tmp->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
 117         tmp->h_blocktype = cpu_to_be32(JBD2_COMMIT_BLOCK);
 118         tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid);
 119         tmp->h_commit_sec = cpu_to_be64(now.tv_sec);
 120         tmp->h_commit_nsec = cpu_to_be32(now.tv_nsec);
 121
 122         if (JBD2_HAS_COMPAT_FEATURE(journal,
 123                                     JBD2_FEATURE_COMPAT_CHECKSUM)) {
 124                 tmp->h_chksum_type      = JBD2_CRC32_CHKSUM;
 125                 tmp->h_chksum_size      = JBD2_CRC32_CHKSUM_SIZE;
 126                 tmp->h_chksum[0]        = cpu_to_be32(crc32_sum);
 127         }
 128
 129         JBUFFER_TRACE(descriptor, "submit commit block");
 130         lock_buffer(bh);
 131         clear_buffer_dirty(bh);
 132         set_buffer_uptodate(bh);
 133         bh->b_end_io = journal_end_buffer_io_sync;
 134
 135         if (journal->j_flags & JBD2_BARRIER &&
 136             !JBD2_HAS_INCOMPAT_FEATURE(journal,
 137                                        JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT))
 138                 ret = submit_bh(WRITE_SYNC_PLUG | WRITE_FLUSH_FUA, bh);
 139         else
 140                 ret = submit_bh(WRITE_SYNC_PLUG, bh);
 141
 142         *cbh = bh;
 143         return ret;
 144 }
 145
 146 /*
 147  * This function along with journal_submit_commit_record
 148  * allows to write the commit record asynchronously.
 149  */
 150 static int journal_wait_on_commit_record(journal_t *journal,
 151                                          struct buffer_head *bh)
 152 {
 153         int ret = 0;
 154
 155         clear_buffer_dirty(bh);
 156         wait_on_buffer(bh);
 157
 158         if (unlikely(!buffer_uptodate(bh)))
 159                 ret = -EIO;
 160         put_bh(bh);            /* One for getblk() */
 161         jbd2_journal_put_journal_head(bh2jh(bh));
 162
 163         return ret;
 164 }
 165
 166 /*
 167  * write the filemap data using writepage() address_space_operations.
 168  * We don't do block allocation here even for delalloc. We don't
 169  * use writepages() because with dealyed allocation we may be doing
 170  * block allocation in writepages().
 171  */
 172 static int journal_submit_inode_data_buffers(struct address_space *mapping)
 173 {
 174         int ret;
 175         struct writeback_control wbc = {
 176                 .sync_mode =  WB_SYNC_ALL,
 177                 .nr_to_write = mapping->nrpages * 2,
 178                 .range_start = 0,
 179                 .range_end = i_size_read(mapping->host),
 180         };
 181
 182         ret = generic_writepages(mapping, &wbc);
 183         return ret;
 184 }
 185
 186 /*
 187  * Submit all the data buffers of inode associated with the transaction to
 188  * disk.
 189  *
 190  * We are in a committing transaction. Therefore no new inode can be added to
 191  * our inode list. We use JI_COMMIT_RUNNING flag to protect inode we currently
 192  * operate on from being released while we write out pages.
 193  */
 194 static int journal_submit_data_buffers(journal_t *journal,
 195                 transaction_t *commit_transaction)
 196 {
 197         struct jbd2_inode *jinode;
 198         int err, ret = 0;
 199         struct address_space *mapping;
 200
 201         spin_lock(&journal->j_list_lock);
 202         list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
 203                 mapping = jinode->i_vfs_inode->i_mapping;
 204                 jinode->i_flags |= JI_COMMIT_RUNNING;
 205                 spin_unlock(&journal->j_list_lock);
 206                 /*
 207                  * submit the inode data buffers. We use writepage
 208                  * instead of writepages. Because writepages can do
 209                  * block allocation  with delalloc. We need to write
 210                  * only allocated blocks here.
 211                  */
 212                 trace_jbd2_submit_inode_data(jinode->i_vfs_inode);
 213                 err = journal_submit_inode_data_buffers(mapping);
 214                 if (!ret)
 215                         ret = err;
 216                 spin_lock(&journal->j_list_lock);
 217                 J_ASSERT(jinode->i_transaction == commit_transaction);
 218                 commit_transaction->t_flushed_data_blocks = 1;
 219                 jinode->i_flags &= ~JI_COMMIT_RUNNING;
 220                 wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
 221         }
 222         spin_unlock(&journal->j_list_lock);
 223         return ret;
 224 }
 225
 226 /*
 227  * Wait for data submitted for writeout, refile inodes to proper
 228  * transaction if needed.
 229  *
 230  */
 231 static int journal_finish_inode_data_buffers(journal_t *journal,
 232                 transaction_t *commit_transaction)
 233 {
 234         struct jbd2_inode *jinode, *next_i;
 235         int err, ret = 0;
 236
 237         /* For locking, see the comment in journal_submit_data_buffers() */
 238         spin_lock(&journal->j_list_lock);
 239         list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
 240                 jinode->i_flags |= JI_COMMIT_RUNNING;
 241                 spin_unlock(&journal->j_list_lock);
 242                 err = filemap_fdatawait(jinode->i_vfs_inode->i_mapping);
 243                 if (err) {
 244                         /*
 245                          * Because AS_EIO is cleared by
 246                          * filemap_fdatawait_range(), set it again so
 247                          * that user process can get -EIO from fsync().
 248                          */
 249                         set_bit(AS_EIO,
 250                                 &jinode->i_vfs_inode->i_mapping->flags);
 251
 252                         if (!ret)
 253                                 ret = err;
 254                 }
 255                 spin_lock(&journal->j_list_lock);
 256                 jinode->i_flags &= ~JI_COMMIT_RUNNING;
 257                 wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
 258         }
 259
 260         /* Now refile inode to proper lists */
 261         list_for_each_entry_safe(jinode, next_i,
 262                                  &commit_transaction->t_inode_list, i_list) {
 263                 list_del(&jinode->i_list);
 264                 if (jinode->i_next_transaction) {
 265                         jinode->i_transaction = jinode->i_next_transaction;
 266                         jinode->i_next_transaction = NULL;
 267                         list_add(&jinode->i_list,
 268                                 &jinode->i_transaction->t_inode_list);
 269                 } else {
 270                         jinode->i_transaction = NULL;
 271                 }
 272         }
 273         spin_unlock(&journal->j_list_lock);
 274
 275         return ret;
 276 }
 277
 278 static __u32 jbd2_checksum_data(__u32 crc32_sum, struct buffer_head *bh)
 279 {
 280         struct page *page = bh->b_page;
 281         char *addr;
 282         __u32 checksum;
 283
 284         addr = kmap_atomic(page, KM_USER0);
 285         checksum = crc32_be(crc32_sum,
 286                 (void *)(addr + offset_in_page(bh->b_data)), bh->b_size);
 287         kunmap_atomic(addr, KM_USER0);
 288
 289         return checksum;
 290 }
 291
 292 static void write_tag_block(int tag_bytes, journal_block_tag_t *tag,
 293                                    unsigned long long block)
 294 {
 295         tag->t_blocknr = cpu_to_be32(block & (u32)~0);
 296         if (tag_bytes > JBD2_TAG_SIZE32)
 297                 tag->t_blocknr_high = cpu_to_be32((block >> 31) >> 1);
 298 }
 299
 300 /*
 301  * jbd2_journal_commit_transaction
 302  *
 303  * The primary function for committing a transaction to the log.  This
 304  * function is called by the journal thread to begin a complete commit.
 305  */
 306 void jbd2_journal_commit_transaction(journal_t *journal)
 307 {
 308         struct transaction_stats_s stats;
 309         transaction_t *commit_transaction;
 310         struct journal_head *jh, *new_jh, *descriptor;
 311         struct buffer_head **wbuf = journal->j_wbuf;
 312         int bufs;
 313         int flags;
 314         int err;
 315         unsigned long long blocknr;
 316         ktime_t start_time;
 317         u64 commit_time;
 318         char *tagp = NULL;
 319         journal_header_t *header;
 320         journal_block_tag_t *tag = NULL;
 321         int space_left = 0;
 322         int first_tag = 0;
 323         int tag_flag;
 324         int i, to_free = 0;
 325         int tag_bytes = journal_tag_bytes(journal);
 326         struct buffer_head *cbh = NULL; /* For transactional checksums */
 327         __u32 crc32_sum = ~0;
 328         int write_op = WRITE_SYNC;
 329
 330         /*
 331          * First job: lock down the current transaction and wait for
 332          * all outstanding updates to complete.
 333          */
 334
 335 #ifdef COMMIT_STATS
 336         spin_lock(&journal->j_list_lock);
 337         summarise_journal_usage(journal);
 338         spin_unlock(&journal->j_list_lock);
 339 #endif
 340
 341         /* Do we need to erase the effects of a prior jbd2_journal_flush? */
 342         if (journal->j_flags & JBD2_FLUSHED) {
 343                 jbd_debug(3, "super block updated\n");
 344                 jbd2_journal_update_superblock(journal, 1);
 345         } else {
 346                 jbd_debug(3, "superblock not updated\n");
 347         }
 348
 349         J_ASSERT(journal->j_running_transaction != NULL);
 350         J_ASSERT(journal->j_committing_transaction == NULL);
 351
 352         commit_transaction = journal->j_running_transaction;
 353         J_ASSERT(commit_transaction->t_state == T_RUNNING);
 354
 355         trace_jbd2_start_commit(journal, commit_transaction);
 356         jbd_debug(1, "JBD: starting commit of transaction %d\n",
 357                         commit_transaction->t_tid);
 358
 359         write_lock(&journal->j_state_lock);
 360         commit_transaction->t_state = T_LOCKED;
 361
 362         /*
 363          * Use plugged writes here, since we want to submit several before
 364          * we unplug the device. We don't do explicit unplugging in here,
 365          * instead we rely on sync_buffer() doing the unplug for us.
 366          */
 367         if (commit_transaction->t_synchronous_commit)
 368                 write_op = WRITE_SYNC_PLUG;
 369         trace_jbd2_commit_locking(journal, commit_transaction);
 370         stats.run.rs_wait = commit_transaction->t_max_wait;
 371         stats.run.rs_locked = jiffies;
 372         stats.run.rs_running = jbd2_time_diff(commit_transaction->t_start,
 373                                               stats.run.rs_locked);
 374
 375         spin_lock(&commit_transaction->t_handle_lock);
 376         while (atomic_read(&commit_transaction->t_updates)) {
 377                 DEFINE_WAIT(wait);
 378
 379                 prepare_to_wait(&journal->j_wait_updates, &wait,
 380                                         TASK_UNINTERRUPTIBLE);
 381                 if (atomic_read(&commit_transaction->t_updates)) {
 382                         spin_unlock(&commit_transaction->t_handle_lock);
 383                         write_unlock(&journal->j_state_lock);
 384                         schedule();
 385                         write_lock(&journal->j_state_lock);
 386                         spin_lock(&commit_transaction->t_handle_lock);
 387                 }
 388                 finish_wait(&journal->j_wait_updates, &wait);
 389         }
 390         spin_unlock(&commit_transaction->t_handle_lock);
 391
 392         J_ASSERT (atomic_read(&commit_transaction->t_outstanding_credits) <=
 393                         journal->j_max_transaction_buffers);
 394
 395         /*
 396          * First thing we are allowed to do is to discard any remaining
 397          * BJ_Reserved buffers.  Note, it is _not_ permissible to assume
 398          * that there are no such buffers: if a large filesystem
 399          * operation like a truncate needs to split itself over multiple
 400          * transactions, then it may try to do a jbd2_journal_restart() while
 401          * there are still BJ_Reserved buffers outstanding.  These must
 402          * be released cleanly from the current transaction.
 403          *
 404          * In this case, the filesystem must still reserve write access
 405          * again before modifying the buffer in the new transaction, but
 406          * we do not require it to remember exactly which old buffers it
 407          * has reserved.  This is consistent with the existing behaviour
 408          * that multiple jbd2_journal_get_write_access() calls to the same
  409          * buffer are perfectly permissible.
 410          */
 411         while (commit_transaction->t_reserved_list) {
 412                 jh = commit_transaction->t_reserved_list;
 413                 JBUFFER_TRACE(jh, "reserved, unused: refile");
 414                 /*
 415                  * A jbd2_journal_get_undo_access()+jbd2_journal_release_buffer() may
 416                  * leave undo-committed data.
 417                  */
 418                 if (jh->b_committed_data) {
 419                         struct buffer_head *bh = jh2bh(jh);
 420
 421                         jbd_lock_bh_state(bh);
 422                         jbd2_free(jh->b_committed_data, bh->b_size);
 423                         jh->b_committed_data = NULL;
 424                         jbd_unlock_bh_state(bh);
 425                 }
 426                 jbd2_journal_refile_buffer(journal, jh);
 427         }
 428
 429         /*
 430          * Now try to drop any written-back buffers from the journal's
 431          * checkpoint lists.  We do this *before* commit because it potentially
 432          * frees some memory
 433          */
 434         spin_lock(&journal->j_list_lock);
 435         __jbd2_journal_clean_checkpoint_list(journal);
 436         spin_unlock(&journal->j_list_lock);
 437
 438         jbd_debug (3, "JBD: commit phase 1\n");
 439
 440         /*
 441          * Switch to a new revoke table.
 442          */
 443         jbd2_journal_switch_revoke_table(journal);
 444
 445         trace_jbd2_commit_flushing(journal, commit_transaction);
 446         stats.run.rs_flushing = jiffies;
 447         stats.run.rs_locked = jbd2_time_diff(stats.run.rs_locked,
 448                                              stats.run.rs_flushing);
 449
 450         commit_transaction->t_state = T_FLUSH;
 451         journal->j_committing_transaction = commit_transaction;
 452         journal->j_running_transaction = NULL;
 453         start_time = ktime_get();
 454         commit_transaction->t_log_start = journal->j_head;
 455         wake_up(&journal->j_wait_transaction_locked);
 456         write_unlock(&journal->j_state_lock);
 457
 458         jbd_debug (3, "JBD: commit phase 2\n");
 459
 460         /*
 461          * Now start flushing things to disk, in the order they appear
 462          * on the transaction lists.  Data blocks go first.
 463          */
 464         err = journal_submit_data_buffers(journal, commit_transaction);
 465         if (err)
 466                 jbd2_journal_abort(journal, err);
 467
 468         jbd2_journal_write_revoke_records(journal, commit_transaction,
 469                                           write_op);
 470
 471         jbd_debug(3, "JBD: commit phase 2\n");
 472
 473         /*
 474          * Way to go: we have now written out all of the data for a
 475          * transaction!  Now comes the tricky part: we need to write out
 476          * metadata.  Loop over the transaction's entire buffer list:
 477          */
 478         write_lock(&journal->j_state_lock);
 479         commit_transaction->t_state = T_COMMIT;
 480         write_unlock(&journal->j_state_lock);
 481
 482         trace_jbd2_commit_logging(journal, commit_transaction);
 483         stats.run.rs_logging = jiffies;
 484         stats.run.rs_flushing = jbd2_time_diff(stats.run.rs_flushing,
 485                                                stats.run.rs_logging);
 486         stats.run.rs_blocks =
 487                 atomic_read(&commit_transaction->t_outstanding_credits);
 488         stats.run.rs_blocks_logged = 0;
 489
 490         J_ASSERT(commit_transaction->t_nr_buffers <=
 491                  atomic_read(&commit_transaction->t_outstanding_credits));
 492
 493         err = 0;
 494         descriptor = NULL;
 495         bufs = 0;
 496         while (commit_transaction->t_buffers) {
 497
 498                 /* Find the next buffer to be journaled... */
 499
 500                 jh = commit_transaction->t_buffers;
 501
 502                 /* If we're in abort mode, we just un-journal the buffer and
 503                    release it. */
 504
 505                 if (is_journal_aborted(journal)) {
 506                         clear_buffer_jbddirty(jh2bh(jh));
 507                         JBUFFER_TRACE(jh, "journal is aborting: refile");
 508                         jbd2_buffer_abort_trigger(jh,
 509                                                   jh->b_frozen_data ?
 510                                                   jh->b_frozen_triggers :
 511                                                   jh->b_triggers);
 512                         jbd2_journal_refile_buffer(journal, jh);
 513                         /* If that was the last one, we need to clean up
 514                          * any descriptor buffers which may have been
 515                          * already allocated, even if we are now
 516                          * aborting. */
 517                         if (!commit_transaction->t_buffers)
 518                                 goto start_journal_io;
 519                         continue;
 520                 }
 521
 522                 /* Make sure we have a descriptor block in which to
 523                    record the metadata buffer. */
 524
 525                 if (!descriptor) {
 526                         struct buffer_head *bh;
 527
 528                         J_ASSERT (bufs == 0);
 529
 530                         jbd_debug(4, "JBD: get descriptor\n");
 531
 532                         descriptor = jbd2_journal_get_descriptor_buffer(journal);
 533                         if (!descriptor) {
 534                                 jbd2_journal_abort(journal, -EIO);
 535                                 continue;
 536                         }
 537
 538                         bh = jh2bh(descriptor);
 539                         jbd_debug(4, "JBD: got buffer %llu (%p)\n",
 540                                 (unsigned long long)bh->b_blocknr, bh->b_data);
 541                         header = (journal_header_t *)&bh->b_data[0];
 542                         header->h_magic     = cpu_to_be32(JBD2_MAGIC_NUMBER);
 543                         header->h_blocktype = cpu_to_be32(JBD2_DESCRIPTOR_BLOCK);
 544                         header->h_sequence  = cpu_to_be32(commit_transaction->t_tid);
 545
 546                         tagp = &bh->b_data[sizeof(journal_header_t)];
 547                         space_left = bh->b_size - sizeof(journal_header_t);
 548                         first_tag = 1;
 549                         set_buffer_jwrite(bh);
 550                         set_buffer_dirty(bh);
 551                         wbuf[bufs++] = bh;
 552
 553                         /* Record it so that we can wait for IO
 554                            completion later */
 555                         BUFFER_TRACE(bh, "ph3: file as descriptor");
 556                         jbd2_journal_file_buffer(descriptor, commit_transaction,
 557                                         BJ_LogCtl);
 558                 }
 559
 560                 /* Where is the buffer to be written? */
 561
 562                 err = jbd2_journal_next_log_block(journal, &blocknr);
 563                 /* If the block mapping failed, just abandon the buffer
 564                    and repeat this loop: we'll fall into the
 565                    refile-on-abort condition above. */
 566                 if (err) {
 567                         jbd2_journal_abort(journal, err);
 568                         continue;
 569                 }
 570
 571                 /*
 572                  * start_this_handle() uses t_outstanding_credits to determine
 573                  * the free space in the log, but this counter is changed
 574                  * by jbd2_journal_next_log_block() also.
 575                  */
 576                 atomic_dec(&commit_transaction->t_outstanding_credits);
 577
 578                 /* Bump b_count to prevent truncate from stumbling over
 579                    the shadowed buffer!  @@@ This can go if we ever get
 580                    rid of the BJ_IO/BJ_Shadow pairing of buffers. */
 581                 atomic_inc(&jh2bh(jh)->b_count);
 582
 583                 /* Make a temporary IO buffer with which to write it out
 584                    (this will requeue both the metadata buffer and the
 585                    temporary IO buffer). new_bh goes on BJ_IO*/
 586
 587                 set_bit(BH_JWrite, &jh2bh(jh)->b_state);
 588                 /*
 589                  * akpm: jbd2_journal_write_metadata_buffer() sets
 590                  * new_bh->b_transaction to commit_transaction.
 591                  * We need to clean this up before we release new_bh
 592                  * (which is of type BJ_IO)
 593                  */
 594                 JBUFFER_TRACE(jh, "ph3: write metadata");
 595                 flags = jbd2_journal_write_metadata_buffer(commit_transaction,
 596                                                       jh, &new_jh, blocknr);
 597                 if (flags < 0) {
 598                         jbd2_journal_abort(journal, flags);
 599                         continue;
 600                 }
 601                 set_bit(BH_JWrite, &jh2bh(new_jh)->b_state);
 602                 wbuf[bufs++] = jh2bh(new_jh);
 603
 604                 /* Record the new block's tag in the current descriptor
 605                    buffer */
 606
 607                 tag_flag = 0;
 608                 if (flags & 1)
 609                         tag_flag |= JBD2_FLAG_ESCAPE;
 610                 if (!first_tag)
 611                         tag_flag |= JBD2_FLAG_SAME_UUID;
 612
 613                 tag = (journal_block_tag_t *) tagp;
 614                 write_tag_block(tag_bytes, tag, jh2bh(jh)->b_blocknr);
 615                 tag->t_flags = cpu_to_be32(tag_flag);
 616                 tagp += tag_bytes;
 617                 space_left -= tag_bytes;
 618
 619                 if (first_tag) {
 620                         memcpy (tagp, journal->j_uuid, 16);
 621                         tagp += 16;
 622                         space_left -= 16;
 623                         first_tag = 0;
 624                 }
 625
 626                 /* If there's no more to do, or if the descriptor is full,
 627                    let the IO rip! */
 628
 629                 if (bufs == journal->j_wbufsize ||
 630                     commit_transaction->t_buffers == NULL ||
 631                     space_left < tag_bytes + 16) {
 632
 633                         jbd_debug(4, "JBD: Submit %d IOs\n", bufs);
 634
 635                         /* Write an end-of-descriptor marker before
 636                            submitting the IOs.  "tag" still points to
 637                            the last tag we set up. */
 638
 639                         tag->t_flags |= cpu_to_be32(JBD2_FLAG_LAST_TAG);
 640
 641 start_journal_io:
 642                         for (i = 0; i < bufs; i++) {
 643                                 struct buffer_head *bh = wbuf[i];
 644                                 /*
 645                                  * Compute checksum.
 646                                  */
 647                                 if (JBD2_HAS_COMPAT_FEATURE(journal,
 648                                         JBD2_FEATURE_COMPAT_CHECKSUM)) {
 649                                         crc32_sum =
 650                                             jbd2_checksum_data(crc32_sum, bh);
 651                                 }
 652
 653                                 lock_buffer(bh);
 654                                 clear_buffer_dirty(bh);
 655                                 set_buffer_uptodate(bh);
 656                                 bh->b_end_io = journal_end_buffer_io_sync;
 657                                 submit_bh(write_op, bh);
 658                         }
 659                         cond_resched();
 660                         stats.run.rs_blocks_logged += bufs;
 661
 662                         /* Force a new descriptor to be generated next
 663                            time round the loop. */
 664                         descriptor = NULL;
 665                         bufs = 0;
 666                 }
 667         }
 668
 669         err = journal_finish_inode_data_buffers(journal, commit_transaction);
 670         if (err) {
 671                 printk(KERN_WARNING
 672                         "JBD2: Detected IO errors while flushing file data "
 673                        "on %s\n", journal->j_devname);
 674                 if (journal->j_flags & JBD2_ABORT_ON_SYNCDATA_ERR)
 675                         jbd2_journal_abort(journal, err);
 676                 err = 0;
 677         }
 678
 679         /*
 680          * If the journal is not located on the file system device,
 681          * then we must flush the file system device before we issue
 682          * the commit record
 683          */
 684         if (commit_transaction->t_flushed_data_blocks &&
 685             (journal->j_fs_dev != journal->j_dev) &&
 686             (journal->j_flags & JBD2_BARRIER))
 687                 blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL);
 688
 689         /* Done it all: now write the commit record asynchronously. */
 690         if (JBD2_HAS_INCOMPAT_FEATURE(journal,
 691                                       JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
 692                 err = journal_submit_commit_record(journal, commit_transaction,
 693                                                  &cbh, crc32_sum);
 694                 if (err)
 695                         __jbd2_journal_abort_hard(journal);
 696         }
 697
 698         /* Lo and behold: we have just managed to send a transaction to
 699            the log.  Before we can commit it, wait for the IO so far to
 700            complete.  Control buffers being written are on the
 701            transaction's t_log_list queue, and metadata buffers are on
 702            the t_iobuf_list queue.
 703
 704            Wait for the buffers in reverse order.  That way we are
 705            less likely to be woken up until all IOs have completed, and
 706            so we incur less scheduling load.
 707         */
 708
 709         jbd_debug(3, "JBD: commit phase 3\n");
 710
 711         /*
 712          * akpm: these are BJ_IO, and j_list_lock is not needed.
 713          * See __journal_try_to_free_buffer.
 714          */
 715 wait_for_iobuf:
 716         while (commit_transaction->t_iobuf_list != NULL) {
 717                 struct buffer_head *bh;
 718
 719                 jh = commit_transaction->t_iobuf_list->b_tprev;
 720                 bh = jh2bh(jh);
 721                 if (buffer_locked(bh)) {
 722                         wait_on_buffer(bh);
 723                         goto wait_for_iobuf;
 724                 }
 725                 if (cond_resched())
 726                         goto wait_for_iobuf;
 727
 728                 if (unlikely(!buffer_uptodate(bh)))
 729                         err = -EIO;
 730
 731                 clear_buffer_jwrite(bh);
 732
 733                 JBUFFER_TRACE(jh, "ph4: unfile after journal write");
 734                 jbd2_journal_unfile_buffer(journal, jh);
 735
 736                 /*
 737                  * ->t_iobuf_list should contain only dummy buffer_heads
 738                  * which were created by jbd2_journal_write_metadata_buffer().
 739                  */
 740                 BUFFER_TRACE(bh, "dumping temporary bh");
 741                 jbd2_journal_put_journal_head(jh);
 742                 __brelse(bh);
 743                 J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0);
 744                 free_buffer_head(bh);
 745
 746                 /* We also have to unlock and free the corresponding
 747                    shadowed buffer */
 748                 jh = commit_transaction->t_shadow_list->b_tprev;
 749                 bh = jh2bh(jh);
 750                 clear_bit(BH_JWrite, &bh->b_state);
 751                 J_ASSERT_BH(bh, buffer_jbddirty(bh));
 752
 753                 /* The metadata is now released for reuse, but we need
 754                    to remember it against this transaction so that when
 755                    we finally commit, we can do any checkpointing
 756                    required. */
 757                 JBUFFER_TRACE(jh, "file as BJ_Forget");
 758                 jbd2_journal_file_buffer(jh, commit_transaction, BJ_Forget);
 759                 /* Wake up any transactions which were waiting for this
 760                    IO to complete */
 761                 wake_up_bit(&bh->b_state, BH_Unshadow);
 762                 JBUFFER_TRACE(jh, "brelse shadowed buffer");
 763                 __brelse(bh);
 764         }
 765
 766         J_ASSERT (commit_transaction->t_shadow_list == NULL);
 767
 768         jbd_debug(3, "JBD: commit phase 4\n");
 769
 770         /* Here we wait for the revoke record and descriptor record buffers */
 771  wait_for_ctlbuf:
 772         while (commit_transaction->t_log_list != NULL) {
 773                 struct buffer_head *bh;
 774
 775                 jh = commit_transaction->t_log_list->b_tprev;
 776                 bh = jh2bh(jh);
 777                 if (buffer_locked(bh)) {
 778                         wait_on_buffer(bh);
 779                         goto wait_for_ctlbuf;
 780                 }
 781                 if (cond_resched())
 782                         goto wait_for_ctlbuf;
 783
 784                 if (unlikely(!buffer_uptodate(bh)))
 785                         err = -EIO;
 786
 787                 BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile");
 788                 clear_buffer_jwrite(bh);
 789                 jbd2_journal_unfile_buffer(journal, jh);
 790                 jbd2_journal_put_journal_head(jh);
 791                 __brelse(bh);           /* One for getblk */
 792                 /* AKPM: bforget here */
 793         }
 794
 795         if (err)
 796                 jbd2_journal_abort(journal, err);
 797
 798         jbd_debug(3, "JBD: commit phase 5\n");
 799
 800         if (!JBD2_HAS_INCOMPAT_FEATURE(journal,
 801                                        JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
 802                 err = journal_submit_commit_record(journal, commit_transaction,
 803                                                 &cbh, crc32_sum);
 804                 if (err)
 805                         __jbd2_journal_abort_hard(journal);
 806         }
 807         if (!err && !is_journal_aborted(journal))
 808                 err = journal_wait_on_commit_record(journal, cbh);
 809         if (JBD2_HAS_INCOMPAT_FEATURE(journal,
 810                                       JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT) &&
 811             journal->j_flags & JBD2_BARRIER) {
 812                 blkdev_issue_flush(journal->j_dev, GFP_KERNEL, NULL);
 813         }
 814
 815         if (err)
 816                 jbd2_journal_abort(journal, err);
 817
 818         /* End of a transaction!  Finally, we can do checkpoint
 819            processing: any buffers committed as a result of this
 820            transaction can be removed from any checkpoint list they were on
 821            before. */
 822
 823         jbd_debug(3, "JBD: commit phase 6\n");
 824
 825         J_ASSERT(list_empty(&commit_transaction->t_inode_list));
 826         J_ASSERT(commit_transaction->t_buffers == NULL);
 827         J_ASSERT(commit_transaction->t_checkpoint_list == NULL);
 828         J_ASSERT(commit_transaction->t_iobuf_list == NULL);
 829         J_ASSERT(commit_transaction->t_shadow_list == NULL);
 830         J_ASSERT(commit_transaction->t_log_list == NULL);
 831
 832 restart_loop:
 833         /*
 834          * As there are other places (journal_unmap_buffer()) adding buffers
 835          * to this list we have to be careful and hold the j_list_lock.
 836          */
 837         spin_lock(&journal->j_list_lock);
 838         while (commit_transaction->t_forget) {
 839                 transaction_t *cp_transaction;
 840                 struct buffer_head *bh;
 841
 842                 jh = commit_transaction->t_forget;
 843                 spin_unlock(&journal->j_list_lock);
 844                 bh = jh2bh(jh);
 845                 jbd_lock_bh_state(bh);
 846                 J_ASSERT_JH(jh, jh->b_transaction == commit_transaction);
 847
 848                 /*
 849                  * If there is undo-protected committed data against
 850                  * this buffer, then we can remove it now.  If it is a
 851                  * buffer needing such protection, the old frozen_data
 852                  * field now points to a committed version of the
 853                  * buffer, so rotate that field to the new committed
 854                  * data.
 855                  *
 856                  * Otherwise, we can just throw away the frozen data now.
 857                  *
 858                  * We also know that the frozen data has already fired
 859                  * its triggers if they exist, so we can clear that too.
 860                  */
 861                 if (jh->b_committed_data) {
 862                         jbd2_free(jh->b_committed_data, bh->b_size);
 863                         jh->b_committed_data = NULL;
 864                         if (jh->b_frozen_data) {
 865                                 jh->b_committed_data = jh->b_frozen_data;
 866                                 jh->b_frozen_data = NULL;
 867                                 jh->b_frozen_triggers = NULL;
 868                         }
 869                 } else if (jh->b_frozen_data) {
 870                         jbd2_free(jh->b_frozen_data, bh->b_size);
 871                         jh->b_frozen_data = NULL;
 872                         jh->b_frozen_triggers = NULL;
 873                 }
 874
 875                 spin_lock(&journal->j_list_lock);
 876                 cp_transaction = jh->b_cp_transaction;
 877                 if (cp_transaction) {
 878                         JBUFFER_TRACE(jh, "remove from old cp transaction");
 879                         cp_transaction->t_chp_stats.cs_dropped++;
 880                         __jbd2_journal_remove_checkpoint(jh);
 881                 }
 882
 883                 /* Only re-checkpoint the buffer_head if it is marked
 884                  * dirty.  If the buffer was added to the BJ_Forget list
 885                  * by jbd2_journal_forget, it may no longer be dirty and
 886                  * there's no point in keeping a checkpoint record for
 887                  * it. */
 888
 889                 /* A buffer which has been freed while still being
 890                  * journaled by a previous transaction may end up still
 891                  * being dirty here, but we want to avoid writing back
 892                  * that buffer in the future after the "add to orphan"
 893                  * operation has been committed.  That's not only a performance
 894                  * gain, it also stops aliasing problems if the buffer is
 895                  * left behind for writeback and gets reallocated for another
 896                  * use in a different page. */
 897                 if (buffer_freed(bh) && !jh->b_next_transaction) {
 898                         clear_buffer_freed(bh);
 899                         clear_buffer_jbddirty(bh);
 900                 }
 901
 902                 if (buffer_jbddirty(bh)) {
 903                         JBUFFER_TRACE(jh, "add to new checkpointing trans");
 904                         __jbd2_journal_insert_checkpoint(jh, commit_transaction);
 905                         if (is_journal_aborted(journal))
 906                                 clear_buffer_jbddirty(bh);
 907                         JBUFFER_TRACE(jh, "refile for checkpoint writeback");
 908                         __jbd2_journal_refile_buffer(jh);
 909                         jbd_unlock_bh_state(bh);
 910                 } else {
 911                         J_ASSERT_BH(bh, !buffer_dirty(bh));
 912                         /* The buffer on BJ_Forget list and not jbddirty means
 913                          * it has been freed by this transaction and hence it
 914                          * could not have been reallocated until this
 915                          * transaction has committed. *BUT* it could be
 916                          * reallocated once we have written all the data to
 917                          * disk and before we process the buffer on BJ_Forget
 918                          * list. */
 919                         JBUFFER_TRACE(jh, "refile or unfile freed buffer");
 920                         __jbd2_journal_refile_buffer(jh);
 921                         if (!jh->b_transaction) {
 922                                 jbd_unlock_bh_state(bh);
 923                                  /* needs a brelse */
 924                                 jbd2_journal_remove_journal_head(bh);
 925                                 release_buffer_page(bh);
 926                         } else
 927                                 jbd_unlock_bh_state(bh);
 928                 }
 929                 cond_resched_lock(&journal->j_list_lock);
 930         }
 931         spin_unlock(&journal->j_list_lock);
 932         /*
 933          * This is a bit sleazy.  We use j_list_lock to protect transition
 934          * of a transaction into T_FINISHED state and calling
 935          * __jbd2_journal_drop_transaction(). Otherwise we could race with
 936          * other checkpointing code processing the transaction...
 937          */
 938         write_lock(&journal->j_state_lock);
 939         spin_lock(&journal->j_list_lock);
 940         /*
 941          * Now recheck if some buffers did not get attached to the transaction
 942          * while the lock was dropped...
 943          */
 944         if (commit_transaction->t_forget) {
 945                 spin_unlock(&journal->j_list_lock);
 946                 write_unlock(&journal->j_state_lock);
 947                 goto restart_loop;
 948         }
 949
 950         /* Done with this transaction! */
 951
 952         jbd_debug(3, "JBD: commit phase 7\n");
 953
 954         J_ASSERT(commit_transaction->t_state == T_COMMIT);
 955
 956         commit_transaction->t_start = jiffies;
 957         stats.run.rs_logging = jbd2_time_diff(stats.run.rs_logging,
 958                                               commit_transaction->t_start);
 959
 960         /*
 961          * File the transaction statistics
 962          */
 963         stats.ts_tid = commit_transaction->t_tid;
 964         stats.run.rs_handle_count =
 965                 atomic_read(&commit_transaction->t_handle_count);
 966         trace_jbd2_run_stats(journal->j_fs_dev->bd_dev,
 967                              commit_transaction->t_tid, &stats.run);
 968
 969         /*
 970          * Calculate overall stats
 971          */
 972         spin_lock(&journal->j_history_lock);
 973         journal->j_stats.ts_tid++;
 974         journal->j_stats.run.rs_wait += stats.run.rs_wait;
 975         journal->j_stats.run.rs_running += stats.run.rs_running;
 976         journal->j_stats.run.rs_locked += stats.run.rs_locked;
 977         journal->j_stats.run.rs_flushing += stats.run.rs_flushing;
 978         journal->j_stats.run.rs_logging += stats.run.rs_logging;
 979         journal->j_stats.run.rs_handle_count += stats.run.rs_handle_count;
 980         journal->j_stats.run.rs_blocks += stats.run.rs_blocks;
 981         journal->j_stats.run.rs_blocks_logged += stats.run.rs_blocks_logged;
 982         spin_unlock(&journal->j_history_lock);
 983
 984         commit_transaction->t_state = T_FINISHED;
 985         J_ASSERT(commit_transaction == journal->j_committing_transaction);
 986         journal->j_commit_sequence = commit_transaction->t_tid;
 987         journal->j_committing_transaction = NULL;
 988         commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
 989
 990         /*
 991          * weight the average time higher than the latest commit time so we
 992          * don't react too strongly to vast changes in the commit time
 993          */
 994         if (likely(journal->j_average_commit_time))
 995                 journal->j_average_commit_time = (commit_time +
 996                                 journal->j_average_commit_time*3) / 4;
 997         else
 998                 journal->j_average_commit_time = commit_time;
 999         write_unlock(&journal->j_state_lock);
1000
1001         if (commit_transaction->t_checkpoint_list == NULL &&
1002             commit_transaction->t_checkpoint_io_list == NULL) {
1003                 __jbd2_journal_drop_transaction(journal, commit_transaction);
1004                 to_free = 1;
1005         } else {
1006                 if (journal->j_checkpoint_transactions == NULL) {
1007                         journal->j_checkpoint_transactions = commit_transaction;
1008                         commit_transaction->t_cpnext = commit_transaction;
1009                         commit_transaction->t_cpprev = commit_transaction;
1010                 } else {
1011                         commit_transaction->t_cpnext =
1012                                 journal->j_checkpoint_transactions;
1013                         commit_transaction->t_cpprev =
1014                                 commit_transaction->t_cpnext->t_cpprev;
1015                         commit_transaction->t_cpnext->t_cpprev =
1016                                 commit_transaction;
1017                         commit_transaction->t_cpprev->t_cpnext =
1018                                 commit_transaction;
1019                 }
1020         }
1021         spin_unlock(&journal->j_list_lock);
1022
1023         if (journal->j_commit_callback)
1024                 journal->j_commit_callback(journal, commit_transaction);
1025
1026         trace_jbd2_end_commit(journal, commit_transaction);
1027         jbd_debug(1, "JBD: commit %d complete, head %d\n",
1028                   journal->j_commit_sequence, journal->j_tail_sequence);
1029         if (to_free)
1030                 kfree(commit_transaction);
1031
1032         wake_up(&journal->j_wait_done_commit);
1033 }