Merge branch 'next' into upstream-merge

author Theodore Ts'o <tytso@mit.edu>

Thu, 28 Oct 2010 03:44:47 +0000 (23:44 -0400)

committer Theodore Ts'o <tytso@mit.edu>

Thu, 28 Oct 2010 03:44:47 +0000 (23:44 -0400)
author Theodore Ts'o <tytso@mit.edu>
Thu, 28 Oct 2010 03:44:47 +0000 (23:44 -0400)
committer Theodore Ts'o <tytso@mit.edu>
Thu, 28 Oct 2010 03:44:47 +0000 (23:44 -0400)
diff --combined fs/ext4/extents.c

index 06328d3e5717fd368ce7872fe0aa3db879cf6817,a17a676a310652475e6d86d3fc104d099b3a6f51..0554c48cb1fddbc97bc81c3443218b6a9e97fca8
--- 1/fs/ext4/extents.c
--- 2/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@@ -44,55 -44,6 +44,6 @@@
   #include "ext4_jbd2.h"
   #include "ext4_extents.h"
   
- 
- /*
-  * ext_pblock:
-  * combine low and high parts of physical block number into ext4_fsblk_t
-  */
- ext4_fsblk_t ext_pblock(struct ext4_extent *ex)
- {
-       ext4_fsblk_t block;
- 
-       block = le32_to_cpu(ex->ee_start_lo);
-       block |= ((ext4_fsblk_t) le16_to_cpu(ex->ee_start_hi) << 31) << 1;
-       return block;
- }
- 
- /*
-  * idx_pblock:
-  * combine low and high parts of a leaf physical block number into ext4_fsblk_t
-  */
- ext4_fsblk_t idx_pblock(struct ext4_extent_idx *ix)
- {
-       ext4_fsblk_t block;
- 
-       block = le32_to_cpu(ix->ei_leaf_lo);
-       block |= ((ext4_fsblk_t) le16_to_cpu(ix->ei_leaf_hi) << 31) << 1;
-       return block;
- }
- 
- /*
-  * ext4_ext_store_pblock:
-  * stores a large physical block number into an extent struct,
-  * breaking it into parts
-  */
- void ext4_ext_store_pblock(struct ext4_extent *ex, ext4_fsblk_t pb)
- {
-       ex->ee_start_lo = cpu_to_le32((unsigned long) (pb & 0xffffffff));
-       ex->ee_start_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) & 0xffff);
- }
- 
- /*
-  * ext4_idx_store_pblock:
-  * stores a large physical block number into an index struct,
-  * breaking it into parts
-  */
- static void ext4_idx_store_pblock(struct ext4_extent_idx *ix, ext4_fsblk_t pb)
- {
-       ix->ei_leaf_lo = cpu_to_le32((unsigned long) (pb & 0xffffffff));
-       ix->ei_leaf_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) & 0xffff);
- }
- 
   static int ext4_ext_truncate_extend_restart(handle_t *handle,
                                             struct inode *inode,
                                             int needed)
@@@ -169,7 -120,8 +120,8 @@@ static ext4_fsblk_t ext4_ext_find_goal(
                 /* try to predict block placement */
                 ex = path[depth].p_ext;
                 if (ex)
-                       return ext_pblock(ex)+(block-le32_to_cpu(ex->ee_block));
+                       return (ext4_ext_pblock(ex) +
+                               (block - le32_to_cpu(ex->ee_block)));
   
                 /* it looks like index is empty;
                  * try to find starting block from index itself */
@@@ -354,7 -306,7 +306,7 @@@ ext4_ext_max_entries(struct inode *inod
   
   static int ext4_valid_extent(struct inode *inode, struct ext4_extent *ext)
   {
-       ext4_fsblk_t block = ext_pblock(ext);
+       ext4_fsblk_t block = ext4_ext_pblock(ext);
         int len = ext4_ext_get_actual_len(ext);
   
         return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, len);
@@@ -363,7 -315,7 +315,7 @@@
   static int ext4_valid_extent_idx(struct inode *inode,
                                 struct ext4_extent_idx *ext_idx)
   {
-       ext4_fsblk_t block = idx_pblock(ext_idx);
+       ext4_fsblk_t block = ext4_idx_pblock(ext_idx);
   
         return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, 1);
   }
@@@ -463,13 -415,13 +415,13 @@@ static void ext4_ext_show_path(struct i
         for (k = 0; k <= l; k++, path++) {
                 if (path->p_idx) {
                   ext_debug("  %d->%llu", le32_to_cpu(path->p_idx->ei_block),
-                           idx_pblock(path->p_idx));
+                           ext4_idx_pblock(path->p_idx));
                 } else if (path->p_ext) {
                         ext_debug("  %d:[%d]%d:%llu ",
                                   le32_to_cpu(path->p_ext->ee_block),
                                   ext4_ext_is_uninitialized(path->p_ext),
                                   ext4_ext_get_actual_len(path->p_ext),
-                                 ext_pblock(path->p_ext));
+                                 ext4_ext_pblock(path->p_ext));
                 } else
                         ext_debug("  []");
         }
@@@ -494,7 -446,7 +446,7 @@@ static void ext4_ext_show_leaf(struct i
         for (i = 0; i < le16_to_cpu(eh->eh_entries); i++, ex++) {
                 ext_debug("%d:[%d]%d:%llu ", le32_to_cpu(ex->ee_block),
                           ext4_ext_is_uninitialized(ex),
-                         ext4_ext_get_actual_len(ex), ext_pblock(ex));
+                         ext4_ext_get_actual_len(ex), ext4_ext_pblock(ex));
         }
         ext_debug("\n");
   }
@@@ -545,7 -497,7 +497,7 @@@ ext4_ext_binsearch_idx(struct inode *in
   
         path->p_idx = l - 1;
         ext_debug("  -> %d->%lld ", le32_to_cpu(path->p_idx->ei_block),
-                 idx_pblock(path->p_idx));
+                 ext4_idx_pblock(path->p_idx));
   
   #ifdef CHECK_BINSEARCH
         {
@@@ -614,7 -566,7 +566,7 @@@ ext4_ext_binsearch(struct inode *inode
         path->p_ext = l - 1;
         ext_debug("  -> %d:%llu:[%d]%d ",
                         le32_to_cpu(path->p_ext->ee_block),
-                       ext_pblock(path->p_ext),
+                       ext4_ext_pblock(path->p_ext),
                         ext4_ext_is_uninitialized(path->p_ext),
                         ext4_ext_get_actual_len(path->p_ext));
   
@@@ -682,7 -634,7 +634,7 @@@ ext4_ext_find_extent(struct inode *inod
                           ppos, le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max));
   
                 ext4_ext_binsearch_idx(inode, path + ppos, block);
-               path[ppos].p_block = idx_pblock(path[ppos].p_idx);
+               path[ppos].p_block = ext4_idx_pblock(path[ppos].p_idx);
                 path[ppos].p_depth = i;
                 path[ppos].p_ext = NULL;
   
@@@ -721,7 -673,7 +673,7 @@@
         ext4_ext_binsearch(inode, path + ppos, block);
         /* if not an empty leaf */
         if (path[ppos].p_ext)
-               path[ppos].p_block = ext_pblock(path[ppos].p_ext);
+               path[ppos].p_block = ext4_ext_pblock(path[ppos].p_ext);
   
         ext4_ext_show_path(inode, path);
   
@@@ -739,9 -691,9 +691,9 @@@ err
    * insert new index [@logical;@ptr] into the block at @curp;
    * check where to insert: before @curp or after @curp
    */
- int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
-                               struct ext4_ext_path *curp,
-                               int logical, ext4_fsblk_t ptr)
+ static int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
+                                struct ext4_ext_path *curp,
+                                int logical, ext4_fsblk_t ptr)
   {
         struct ext4_extent_idx *ix;
         int len, err;
@@@ -917,7 -869,7 +869,7 @@@ static int ext4_ext_split(handle_t *han
                         EXT_MAX_EXTENT(path[depth].p_hdr)) {
                 ext_debug("move %d:%llu:[%d]%d in new leaf %llu\n",
                                 le32_to_cpu(path[depth].p_ext->ee_block),
-                               ext_pblock(path[depth].p_ext),
+                               ext4_ext_pblock(path[depth].p_ext),
                                 ext4_ext_is_uninitialized(path[depth].p_ext),
                                 ext4_ext_get_actual_len(path[depth].p_ext),
                                 newblock);
@@@ -1007,7 -959,7 +959,7 @@@
                 while (path[i].p_idx <= EXT_MAX_INDEX(path[i].p_hdr)) {
                         ext_debug("%d: move %d:%llu in new index %llu\n", i,
                                         le32_to_cpu(path[i].p_idx->ei_block),
-                                       idx_pblock(path[i].p_idx),
+                                       ext4_idx_pblock(path[i].p_idx),
                                         newblock);
                         /*memmove(++fidx, path[i].p_idx++,
                                         sizeof(struct ext4_extent_idx));
@@@ -1146,7 -1098,7 +1098,7 @@@ static int ext4_ext_grow_indepth(handle
         ext_debug("new root: num %d(%d), lblock %d, ptr %llu\n",
                   le16_to_cpu(neh->eh_entries), le16_to_cpu(neh->eh_max),
                   le32_to_cpu(EXT_FIRST_INDEX(neh)->ei_block),
-                 idx_pblock(EXT_FIRST_INDEX(neh)));
+                 ext4_idx_pblock(EXT_FIRST_INDEX(neh)));
   
         neh->eh_depth = cpu_to_le16(path->p_depth + 1);
         err = ext4_ext_dirty(handle, inode, curp);
@@@ -1232,9 -1184,9 +1184,9 @@@ out
    * returns 0 at @phys
    * return value contains 0 (success) or error code
    */
- int
- ext4_ext_search_left(struct inode *inode, struct ext4_ext_path *path,
-                       ext4_lblk_t *logical, ext4_fsblk_t *phys)
+ static int ext4_ext_search_left(struct inode *inode,
+                               struct ext4_ext_path *path,
+                               ext4_lblk_t *logical, ext4_fsblk_t *phys)
   {
         struct ext4_extent_idx *ix;
         struct ext4_extent *ex;
@@@ -1286,7 -1238,7 +1238,7 @@@
         }
   
         *logical = le32_to_cpu(ex->ee_block) + ee_len - 1;
-       *phys = ext_pblock(ex) + ee_len - 1;
+       *phys = ext4_ext_pblock(ex) + ee_len - 1;
         return 0;
   }
   
@@@ -1297,9 -1249,9 +1249,9 @@@
    * returns 0 at @phys
    * return value contains 0 (success) or error code
    */
- int
- ext4_ext_search_right(struct inode *inode, struct ext4_ext_path *path,
-                       ext4_lblk_t *logical, ext4_fsblk_t *phys)
+ static int ext4_ext_search_right(struct inode *inode,
+                                struct ext4_ext_path *path,
+                                ext4_lblk_t *logical, ext4_fsblk_t *phys)
   {
         struct buffer_head *bh = NULL;
         struct ext4_extent_header *eh;
@@@ -1342,7 -1294,7 +1294,7 @@@
                         }
                 }
                 *logical = le32_to_cpu(ex->ee_block);
-               *phys = ext_pblock(ex);
+               *phys = ext4_ext_pblock(ex);
                 return 0;
         }
   
@@@ -1357,7 -1309,7 +1309,7 @@@
                 /* next allocated block in this leaf */
                 ex++;
                 *logical = le32_to_cpu(ex->ee_block);
-               *phys = ext_pblock(ex);
+               *phys = ext4_ext_pblock(ex);
                 return 0;
         }
   
@@@ -1376,7 -1328,7 +1328,7 @@@ got_index
          * follow it and find the closest allocated
          * block to the right */
         ix++;
-       block = idx_pblock(ix);
+       block = ext4_idx_pblock(ix);
         while (++depth < path->p_depth) {
                 bh = sb_bread(inode->i_sb, block);
                 if (bh == NULL)
@@@ -1388,7 -1340,7 +1340,7 @@@
                         return -EIO;
                 }
                 ix = EXT_FIRST_INDEX(eh);
-               block = idx_pblock(ix);
+               block = ext4_idx_pblock(ix);
                 put_bh(bh);
         }
   
@@@ -1402,7 -1354,7 +1354,7 @@@
         }
         ex = EXT_FIRST_EXTENT(eh);
         *logical = le32_to_cpu(ex->ee_block);
-       *phys = ext_pblock(ex);
+       *phys = ext4_ext_pblock(ex);
         put_bh(bh);
         return 0;
   }
@@@ -1573,7 -1525,7 +1525,7 @@@ ext4_can_extents_be_merged(struct inod
                 return 0;
   #endif
   
-       if (ext_pblock(ex1) + ext1_ee_len == ext_pblock(ex2))
+       if (ext4_ext_pblock(ex1) + ext1_ee_len == ext4_ext_pblock(ex2))
                 return 1;
         return 0;
   }
@@@ -1585,9 -1537,9 +1537,9 @@@
    * Returns 0 if the extents (ex and ex+1) were _not_ merged and returns
    * 1 if they got merged.
    */
- int ext4_ext_try_to_merge(struct inode *inode,
-                         struct ext4_ext_path *path,
-                         struct ext4_extent *ex)
+ static int ext4_ext_try_to_merge(struct inode *inode,
+                                struct ext4_ext_path *path,
+                                struct ext4_extent *ex)
   {
         struct ext4_extent_header *eh;
         unsigned int depth, len;
@@@ -1632,9 -1584,9 +1584,9 @@@
    * such that there will be no overlap, and then returns 1.
    * If there is no overlap found, it returns 0.
    */
- unsigned int ext4_ext_check_overlap(struct inode *inode,
-                                   struct ext4_extent *newext,
-                                   struct ext4_ext_path *path)
+ static unsigned int ext4_ext_check_overlap(struct inode *inode,
+                                          struct ext4_extent *newext,
+                                          struct ext4_ext_path *path)
   {
         ext4_lblk_t b1, b2;
         unsigned int depth, len1;
@@@ -1706,11 -1658,12 +1658,12 @@@ int ext4_ext_insert_extent(handle_t *ha
         if (ex && !(flag & EXT4_GET_BLOCKS_PRE_IO)
                 && ext4_can_extents_be_merged(inode, ex, newext)) {
                 ext_debug("append [%d]%d block to %d:[%d]%d (from %llu)\n",
-                               ext4_ext_is_uninitialized(newext),
-                               ext4_ext_get_actual_len(newext),
-                               le32_to_cpu(ex->ee_block),
-                               ext4_ext_is_uninitialized(ex),
-                               ext4_ext_get_actual_len(ex), ext_pblock(ex));
+                         ext4_ext_is_uninitialized(newext),
+                         ext4_ext_get_actual_len(newext),
+                         le32_to_cpu(ex->ee_block),
+                         ext4_ext_is_uninitialized(ex),
+                         ext4_ext_get_actual_len(ex),
+                         ext4_ext_pblock(ex));
                 err = ext4_ext_get_access(handle, inode, path + depth);
                 if (err)
                         return err;
@@@ -1780,7 -1733,7 +1733,7 @@@ has_space
                 /* there is no extent in this leaf, create first one */
                 ext_debug("first extent in the leaf: %d:%llu:[%d]%d\n",
                                 le32_to_cpu(newext->ee_block),
-                               ext_pblock(newext),
+                               ext4_ext_pblock(newext),
                                 ext4_ext_is_uninitialized(newext),
                                 ext4_ext_get_actual_len(newext));
                 path[depth].p_ext = EXT_FIRST_EXTENT(eh);
@@@ -1794,7 -1747,7 +1747,7 @@@
                         ext_debug("insert %d:%llu:[%d]%d after: nearest 0x%p, "
                                         "move %d from 0x%p to 0x%p\n",
                                         le32_to_cpu(newext->ee_block),
-                                       ext_pblock(newext),
+                                       ext4_ext_pblock(newext),
                                         ext4_ext_is_uninitialized(newext),
                                         ext4_ext_get_actual_len(newext),
                                         nearex, len, nearex + 1, nearex + 2);
@@@ -1808,7 -1761,7 +1761,7 @@@
                 ext_debug("insert %d:%llu:[%d]%d before: nearest 0x%p, "
                                 "move %d from 0x%p to 0x%p\n",
                                 le32_to_cpu(newext->ee_block),
-                               ext_pblock(newext),
+                               ext4_ext_pblock(newext),
                                 ext4_ext_is_uninitialized(newext),
                                 ext4_ext_get_actual_len(newext),
                                 nearex, len, nearex + 1, nearex + 2);
@@@ -1819,7 -1772,7 +1772,7 @@@
         le16_add_cpu(&eh->eh_entries, 1);
         nearex = path[depth].p_ext;
         nearex->ee_block = newext->ee_block;
-       ext4_ext_store_pblock(nearex, ext_pblock(newext));
+       ext4_ext_store_pblock(nearex, ext4_ext_pblock(newext));
         nearex->ee_len = newext->ee_len;
   
   merge:
@@@ -1845,9 -1798,9 +1798,9 @@@ cleanup
         return err;
   }
   
- int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block,
-                       ext4_lblk_t num, ext_prepare_callback func,
-                       void *cbdata)
+ static int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block,
+                              ext4_lblk_t num, ext_prepare_callback func,
+                              void *cbdata)
   {
         struct ext4_ext_path *path = NULL;
         struct ext4_ext_cache cbex;
@@@ -1923,7 -1876,7 +1876,7 @@@
                 } else {
                         cbex.ec_block = le32_to_cpu(ex->ee_block);
                         cbex.ec_len = ext4_ext_get_actual_len(ex);
-                       cbex.ec_start = ext_pblock(ex);
+                       cbex.ec_start = ext4_ext_pblock(ex);
                         cbex.ec_type = EXT4_EXT_CACHE_EXTENT;
                 }
   
@@@ -2073,7 -2026,7 +2026,7 @@@ static int ext4_ext_rm_idx(handle_t *ha
   
         /* free index block */
         path--;
-       leaf = idx_pblock(path->p_idx);
+       leaf = ext4_idx_pblock(path->p_idx);
         if (unlikely(path->p_hdr->eh_entries == 0)) {
                 EXT4_ERROR_INODE(inode, "path->p_hdr->eh_entries == 0");
                 return -EIO;
@@@ -2181,7 -2134,7 +2134,7 @@@ static int ext4_remove_blocks(handle_t 
                 ext4_fsblk_t start;
   
                 num = le32_to_cpu(ex->ee_block) + ee_len - from;
-               start = ext_pblock(ex) + ee_len - num;
+               start = ext4_ext_pblock(ex) + ee_len - num;
                 ext_debug("free last %u blocks starting %llu\n", num, start);
                 ext4_free_blocks(handle, inode, 0, start, num, flags);
         } else if (from == le32_to_cpu(ex->ee_block)
@@@ -2310,7 -2263,7 +2263,7 @@@ ext4_ext_rm_leaf(handle_t *handle, stru
                         goto out;
   
                 ext_debug("new extent: %u:%u:%llu\n", block, num,
-                               ext_pblock(ex));
+                               ext4_ext_pblock(ex));
                 ex--;
                 ex_ee_block = le32_to_cpu(ex->ee_block);
                 ex_ee_len = ext4_ext_get_actual_len(ex);
@@@ -2421,9 -2374,9 +2374,9 @@@ again
                         struct buffer_head *bh;
                         /* go to the next level */
                         ext_debug("move to level %d (block %llu)\n",
-                                 i + 1, idx_pblock(path[i].p_idx));
+                                 i + 1, ext4_idx_pblock(path[i].p_idx));
                         memset(path + i + 1, 0, sizeof(*path));
-                       bh = sb_bread(sb, idx_pblock(path[i].p_idx));
+                       bh = sb_bread(sb, ext4_idx_pblock(path[i].p_idx));
                         if (!bh) {
                                 /* should we reset i_size? */
                                 err = -EIO;
@@@ -2535,77 -2488,22 +2488,21 @@@ void ext4_ext_release(struct super_bloc
   #endif
   }
   
- static void bi_complete(struct bio *bio, int error)
- {
-       complete((struct completion *)bio->bi_private);
- }
- 
   /* FIXME!! we need to try to merge to left or right after zero-out  */
   static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
   {
+       ext4_fsblk_t ee_pblock;
+       unsigned int ee_len;
         int ret;
-       struct bio *bio;
-       int blkbits, blocksize;
-       sector_t ee_pblock;
-       struct completion event;
-       unsigned int ee_len, len, done, offset;
   
- 
-       blkbits   = inode->i_blkbits;
-       blocksize = inode->i_sb->s_blocksize;
         ee_len    = ext4_ext_get_actual_len(ex);
-       ee_pblock = ext_pblock(ex);
- 
-       /* convert ee_pblock to 512 byte sectors */
-       ee_pblock = ee_pblock << (blkbits - 9);
- 
-       while (ee_len > 0) {
- 
-               if (ee_len > BIO_MAX_PAGES)
-                       len = BIO_MAX_PAGES;
-               else
-                       len = ee_len;
- 
-               bio = bio_alloc(GFP_NOIO, len);
-               if (!bio)
-                       return -ENOMEM;
+       ee_pblock = ext4_ext_pblock(ex);
   
-               bio->bi_sector = ee_pblock;
-               bio->bi_bdev   = inode->i_sb->s_bdev;
- -      ret = sb_issue_zeroout(inode->i_sb, ee_pblock, ee_len,
- -                             GFP_NOFS, BLKDEV_IFL_WAIT);
++      ret = sb_issue_zeroout(inode->i_sb, ee_pblock, ee_len, GFP_NOFS);
+       if (ret > 0)
+               ret = 0;
   
-               done = 0;
-               offset = 0;
-               while (done < len) {
-                       ret = bio_add_page(bio, ZERO_PAGE(0),
-                                                       blocksize, offset);
-                       if (ret != blocksize) {
-                               /*
-                                * We can't add any more pages because of
-                                * hardware limitations.  Start a new bio.
-                                */
-                               break;
-                       }
-                       done++;
-                       offset += blocksize;
-                       if (offset >= PAGE_CACHE_SIZE)
-                               offset = 0;
-               }
- 
-               init_completion(&event);
-               bio->bi_private = &event;
-               bio->bi_end_io = bi_complete;
-               submit_bio(WRITE, bio);
-               wait_for_completion(&event);
- 
-               if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) {
-                       bio_put(bio);
-                       return -EIO;
-               }
-               bio_put(bio);
-               ee_len    -= done;
-               ee_pblock += done  << (blkbits - 9);
-       }
-       return 0;
+       return ret;
   }
   
   #define EXT4_EXT_ZERO_LEN 7
@@@ -2651,12 -2549,12 +2548,12 @@@ static int ext4_ext_convert_to_initiali
         ee_block = le32_to_cpu(ex->ee_block);
         ee_len = ext4_ext_get_actual_len(ex);
         allocated = ee_len - (map->m_lblk - ee_block);
-       newblock = map->m_lblk - ee_block + ext_pblock(ex);
+       newblock = map->m_lblk - ee_block + ext4_ext_pblock(ex);
   
         ex2 = ex;
         orig_ex.ee_block = ex->ee_block;
         orig_ex.ee_len   = cpu_to_le16(ee_len);
-       ext4_ext_store_pblock(&orig_ex, ext_pblock(ex));
+       ext4_ext_store_pblock(&orig_ex, ext4_ext_pblock(ex));
   
         /*
          * It is safe to convert extent to initialized via explicit
@@@ -2675,7 -2573,7 +2572,7 @@@
                 /* update the extent length and mark as initialized */
                 ex->ee_block = orig_ex.ee_block;
                 ex->ee_len   = orig_ex.ee_len;
-               ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
+               ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
                 ext4_ext_dirty(handle, inode, path + depth);
                 /* zeroed the full extent */
                 return allocated;
@@@ -2710,7 -2608,7 +2607,7 @@@
                         ex->ee_block = orig_ex.ee_block;
                         ex->ee_len   = cpu_to_le16(ee_len - allocated);
                         ext4_ext_mark_uninitialized(ex);
-                       ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
+                       ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
                         ext4_ext_dirty(handle, inode, path + depth);
   
                         ex3 = &newex;
@@@ -2725,7 -2623,8 +2622,8 @@@
                                         goto fix_extent_len;
                                 ex->ee_block = orig_ex.ee_block;
                                 ex->ee_len   = orig_ex.ee_len;
-                               ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
+                               ext4_ext_store_pblock(ex,
+                                       ext4_ext_pblock(&orig_ex));
                                 ext4_ext_dirty(handle, inode, path + depth);
                                 /* blocks available from map->m_lblk */
                                 return allocated;
@@@ -2782,7 -2681,7 +2680,7 @@@
                         /* update the extent length and mark as initialized */
                         ex->ee_block = orig_ex.ee_block;
                         ex->ee_len   = orig_ex.ee_len;
-                       ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
+                       ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
                         ext4_ext_dirty(handle, inode, path + depth);
                         /* zeroed the full extent */
                         /* blocks available from map->m_lblk */
@@@ -2833,7 -2732,7 +2731,7 @@@
                         /* update the extent length and mark as initialized */
                         ex->ee_block = orig_ex.ee_block;
                         ex->ee_len   = orig_ex.ee_len;
-                       ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
+                       ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
                         ext4_ext_dirty(handle, inode, path + depth);
                         /* zero out the first half */
                         /* blocks available from map->m_lblk */
@@@ -2902,7 -2801,7 +2800,7 @@@ insert
                 /* update the extent length and mark as initialized */
                 ex->ee_block = orig_ex.ee_block;
                 ex->ee_len   = orig_ex.ee_len;
-               ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
+               ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
                 ext4_ext_dirty(handle, inode, path + depth);
                 /* zero out the first half */
                 return allocated;
@@@ -2915,7 -2814,7 +2813,7 @@@ out
   fix_extent_len:
         ex->ee_block = orig_ex.ee_block;
         ex->ee_len   = orig_ex.ee_len;
-       ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
+       ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
         ext4_ext_mark_uninitialized(ex);
         ext4_ext_dirty(handle, inode, path + depth);
         return err;
@@@ -2973,12 -2872,12 +2871,12 @@@ static int ext4_split_unwritten_extents
         ee_block = le32_to_cpu(ex->ee_block);
         ee_len = ext4_ext_get_actual_len(ex);
         allocated = ee_len - (map->m_lblk - ee_block);
-       newblock = map->m_lblk - ee_block + ext_pblock(ex);
+       newblock = map->m_lblk - ee_block + ext4_ext_pblock(ex);
   
         ex2 = ex;
         orig_ex.ee_block = ex->ee_block;
         orig_ex.ee_len   = cpu_to_le16(ee_len);
-       ext4_ext_store_pblock(&orig_ex, ext_pblock(ex));
+       ext4_ext_store_pblock(&orig_ex, ext4_ext_pblock(ex));
   
         /*
          * It is safe to convert extent to initialized via explicit
@@@ -3027,7 -2926,7 +2925,7 @@@
                         /* update the extent length and mark as initialized */
                         ex->ee_block = orig_ex.ee_block;
                         ex->ee_len   = orig_ex.ee_len;
-                       ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
+                       ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
                         ext4_ext_dirty(handle, inode, path + depth);
                         /* zeroed the full extent */
                         /* blocks available from map->m_lblk */
@@@ -3099,7 -2998,7 +2997,7 @@@ insert
                 /* update the extent length and mark as initialized */
                 ex->ee_block = orig_ex.ee_block;
                 ex->ee_len   = orig_ex.ee_len;
-               ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
+               ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
                 ext4_ext_dirty(handle, inode, path + depth);
                 /* zero out the first half */
                 return allocated;
@@@ -3112,7 -3011,7 +3010,7 @@@ out
   fix_extent_len:
         ex->ee_block = orig_ex.ee_block;
         ex->ee_len   = orig_ex.ee_len;
-       ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
+       ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
         ext4_ext_mark_uninitialized(ex);
         ext4_ext_dirty(handle, inode, path + depth);
         return err;
@@@ -3180,6 -3079,57 +3078,57 @@@ static void unmap_underlying_metadata_b
                   unmap_underlying_metadata(bdev, block + i);
   }
   
+ /*
+  * Handle EOFBLOCKS_FL flag, clearing it if necessary
+  */
+ static int check_eofblocks_fl(handle_t *handle, struct inode *inode,
+                             struct ext4_map_blocks *map,
+                             struct ext4_ext_path *path,
+                             unsigned int len)
+ {
+       int i, depth;
+       struct ext4_extent_header *eh;
+       struct ext4_extent *ex, *last_ex;
+ 
+       if (!ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS))
+               return 0;
+ 
+       depth = ext_depth(inode);
+       eh = path[depth].p_hdr;
+       ex = path[depth].p_ext;
+ 
+       if (unlikely(!eh->eh_entries)) {
+               EXT4_ERROR_INODE(inode, "eh->eh_entries == 0 and "
+                                "EOFBLOCKS_FL set");
+               return -EIO;
+       }
+       last_ex = EXT_LAST_EXTENT(eh);
+       /*
+        * We should clear the EOFBLOCKS_FL flag if we are writing the
+        * last block in the last extent in the file.  We test this by
+        * first checking to see if the caller to
+        * ext4_ext_get_blocks() was interested in the last block (or
+        * a block beyond the last block) in the current extent.  If
+        * this turns out to be false, we can bail out from this
+        * function immediately.
+        */
+       if (map->m_lblk + len < le32_to_cpu(last_ex->ee_block) +
+           ext4_ext_get_actual_len(last_ex))
+               return 0;
+       /*
+        * If the caller does appear to be planning to write at or
+        * beyond the end of the current extent, we then test to see
+        * if the current extent is the last extent in the file, by
+        * checking to make sure it was reached via the rightmost node
+        * at each level of the tree.
+        */
+       for (i = depth-1; i >= 0; i--)
+               if (path[i].p_idx != EXT_LAST_INDEX(path[i].p_hdr))
+                       return 0;
+       ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
+       return ext4_mark_inode_dirty(handle, inode);
+ }
+ 
   static int
   ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
                         struct ext4_map_blocks *map,
@@@ -3206,7 -3156,7 +3155,7 @@@
                  * completed
                  */
                 if (io)
-                       io->flag = EXT4_IO_UNWRITTEN;
+                       io->flag = EXT4_IO_END_UNWRITTEN;
                 else
                         ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
                 if (ext4_should_dioread_nolock(inode))
@@@ -3217,8 -3167,12 +3166,12 @@@
         if ((flags & EXT4_GET_BLOCKS_CONVERT)) {
                 ret = ext4_convert_unwritten_extents_endio(handle, inode,
                                                         path);
-               if (ret >= 0)
+               if (ret >= 0) {
                         ext4_update_inode_fsync_trans(handle, inode, 1);
+                       err = check_eofblocks_fl(handle, inode, map, path,
+                                                map->m_len);
+               } else
+                       err = ret;
                 goto out2;
         }
         /* buffered IO case */
@@@ -3244,8 -3198,13 +3197,13 @@@
   
         /* buffered write, writepage time, convert*/
         ret = ext4_ext_convert_to_initialized(handle, inode, map, path);
-       if (ret >= 0)
+       if (ret >= 0) {
                 ext4_update_inode_fsync_trans(handle, inode, 1);
+               err = check_eofblocks_fl(handle, inode, map, path, map->m_len);
+               if (err < 0)
+                       goto out2;
+       }
+ 
   out:
         if (ret <= 0) {
                 err = ret;
@@@ -3292,6 -3251,7 +3250,7 @@@ out2
         }
         return err ? err : allocated;
   }
+ 
   /*
    * Block allocation/map/preallocation routine for extents based files
    *
@@@ -3315,9 -3275,9 +3274,9 @@@ int ext4_ext_map_blocks(handle_t *handl
   {
         struct ext4_ext_path *path = NULL;
         struct ext4_extent_header *eh;
-       struct ext4_extent newex, *ex, *last_ex;
+       struct ext4_extent newex, *ex;
         ext4_fsblk_t newblock;
-       int i, err = 0, depth, ret, cache_type;
+       int err = 0, depth, ret, cache_type;
         unsigned int allocated = 0;
         struct ext4_allocation_request ar;
         ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio;
@@@ -3341,7 -3301,7 +3300,7 @@@
                         /* block is already allocated */
                         newblock = map->m_lblk
                                    - le32_to_cpu(newex.ee_block)
-                                  + ext_pblock(&newex);
+                                  + ext4_ext_pblock(&newex);
                         /* number of remaining blocks in the extent */
                         allocated = ext4_ext_get_actual_len(&newex) -
                                 (map->m_lblk - le32_to_cpu(newex.ee_block));
@@@ -3379,7 -3339,7 +3338,7 @@@
         ex = path[depth].p_ext;
         if (ex) {
                 ext4_lblk_t ee_block = le32_to_cpu(ex->ee_block);
-               ext4_fsblk_t ee_start = ext_pblock(ex);
+               ext4_fsblk_t ee_start = ext4_ext_pblock(ex);
                 unsigned short ee_len;
   
                 /*
@@@ -3488,7 -3448,7 +3447,7 @@@
                  */
                 if ((flags & EXT4_GET_BLOCKS_PRE_IO)) {
                         if (io)
-                               io->flag = EXT4_IO_UNWRITTEN;
+                               io->flag = EXT4_IO_END_UNWRITTEN;
                         else
                                 ext4_set_inode_state(inode,
                                                      EXT4_STATE_DIO_UNWRITTEN);
@@@ -3497,44 -3457,23 +3456,23 @@@
                         map->m_flags |= EXT4_MAP_UNINIT;
         }
   
-       if (unlikely(ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS))) {
-               if (unlikely(!eh->eh_entries)) {
-                       EXT4_ERROR_INODE(inode,
-                                        "eh->eh_entries == 0 and "
-                                        "EOFBLOCKS_FL set");
-                       err = -EIO;
-                       goto out2;
-               }
-               last_ex = EXT_LAST_EXTENT(eh);
-               /*
-                * If the current leaf block was reached by looking at
-                * the last index block all the way down the tree, and
-                * we are extending the inode beyond the last extent
-                * in the current leaf block, then clear the
-                * EOFBLOCKS_FL flag.
-                */
-               for (i = depth-1; i >= 0; i--) {
-                       if (path[i].p_idx != EXT_LAST_INDEX(path[i].p_hdr))
-                               break;
-               }
-               if ((i < 0) &&
-                   (map->m_lblk + ar.len > le32_to_cpu(last_ex->ee_block) +
-                    ext4_ext_get_actual_len(last_ex)))
-                       ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
-       }
+       err = check_eofblocks_fl(handle, inode, map, path, ar.len);
+       if (err)
+               goto out2;
+ 
         err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
         if (err) {
                 /* free data blocks we just allocated */
                 /* not a good idea to call discard here directly,
                  * but otherwise we'd need to call it every free() */
                 ext4_discard_preallocations(inode);
-               ext4_free_blocks(handle, inode, 0, ext_pblock(&newex),
+               ext4_free_blocks(handle, inode, 0, ext4_ext_pblock(&newex),
                                  ext4_ext_get_actual_len(&newex), 0);
                 goto out2;
         }
   
         /* previous routine could use block we allocated */
-       newblock = ext_pblock(&newex);
+       newblock = ext4_ext_pblock(&newex);
         allocated = ext4_ext_get_actual_len(&newex);
         if (allocated > map->m_len)
                 allocated = map->m_len;
@@@ -3729,7 -3668,7 +3667,7 @@@ retry
                         printk(KERN_ERR "%s: ext4_ext_map_blocks "
                                     "returned error inode#%lu, block=%u, "
                                     "max_blocks=%u", __func__,
-                                   inode->i_ino, block, max_blocks);
+                                   inode->i_ino, map.m_lblk, max_blocks);
   #endif
                         ext4_mark_inode_dirty(handle, inode);
                         ret2 = ext4_journal_stop(handle);
diff --combined fs/ext4/fsync.c

index 3f3ff5ee8f9d620b70c4d4f3adf119cde0b343af,1c701f635961491b2f433a7c7ddf4acf95bbcee2..c1a7bc923cf6084c84f32b91294ec5feecf8b92c
--- 1/fs/ext4/fsync.c
--- 2/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@@ -34,6 -34,89 +34,89 @@@
   
   #include <trace/events/ext4.h>
   
+ static void dump_completed_IO(struct inode * inode)
+ {
+ #ifdef        EXT4_DEBUG      /* debug aid: the body is empty otherwise */
+       struct list_head *cur, *before, *after;
+       ext4_io_end_t *io, *io0, *io1;
+       unsigned long flags;
+ 
+       if (list_empty(&EXT4_I(inode)->i_completed_io_list)){
+               ext4_debug("inode %lu completed_io list is empty\n", inode->i_ino);
+               return;
+       }
+ 
+       ext4_debug("Dump inode %lu completed_io list \n", inode->i_ino);
+       spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags);
+       list_for_each_entry(io, &EXT4_I(inode)->i_completed_io_list, list){
+               cur = &io->list;
+               before = cur->prev;     /* neighbours are only printed, */
+               io0 = container_of(before, ext4_io_end_t, list);
+               after = cur->next;      /* never dereferenced */
+               io1 = container_of(after, ext4_io_end_t, list);
+ 
+               ext4_debug("io 0x%p from inode %lu,prev 0x%p,next 0x%p\n",
+                           io, inode->i_ino, io0, io1);
+       }
+       spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags);
+ #endif /* EXT4_DEBUG */
+ }
+ 
+ /*
+  * This function is called from ext4_sync_file().
+  *
+  * When IO is completed, the work to convert unwritten extents to
+  * written is queued on a workqueue but may not get scheduled
+  * immediately. When fsync is called, we need to ensure the
+  * conversion is complete before fsync returns.
+  * The inode keeps track of a list of pending/completed IO that
+  * might need the conversion. This function walks through the list
+  * and converts the related unwritten extents for completed IO to written.
+  * Returns 0 on success (including an empty list), otherwise the last
+  * negative error returned by ext4_end_io_nolock().
+  */
+ static int flush_completed_IO(struct inode *inode)
+ {
+       ext4_io_end_t *io;
+       struct ext4_inode_info *ei = EXT4_I(inode);
+       unsigned long flags;
+       int ret = 0;
+       int ret2 = 0;
+ 
+       if (list_empty(&ei->i_completed_io_list))
+               return ret;
+ 
+       dump_completed_IO(inode);
+       spin_lock_irqsave(&ei->i_completed_io_lock, flags);
+       while (!list_empty(&ei->i_completed_io_list)){
+               io = list_entry(ei->i_completed_io_list.next,
+                               ext4_io_end_t, list);
+               /*
+                * Call ext4_end_io_nolock() to convert completed IO to
+                * written; the list lock is dropped around the call.
+                *
+                * When ext4_sync_file() is called, run_queue() may already
+                * be about to flush the work for this io structure, and it
+                * will be upset if it finds that structure freed. Thus the
+                * io structure must stay valid after the conversion; it has
+                * a flag to avoid double conversion from both fsync and the
+                * background workqueue.
+                *
+                * NOTE(review): on error the io stays at the head of the
+                * list, so the loop retries it -- confirm this terminates.
+                */
+               spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
+               ret = ext4_end_io_nolock(io);
+               spin_lock_irqsave(&ei->i_completed_io_lock, flags);
+               if (ret < 0)
+                       ret2 = ret;
+               else
+                       list_del_init(&io->list);
+       }
+       spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
+       return (ret2 < 0) ? ret2 : 0;
+ }
+ 
   /*
    * If we're not journaling and this is a just-created file, we have to
    * sync our parent directory (if it was freshly created) since
@@@ -128,9 -211,10 +211,9 @@@ int ext4_sync_file(struct file *file, i
                     (journal->j_fs_dev != journal->j_dev) &&
                     (journal->j_flags & JBD2_BARRIER))
                         blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL,
- -                                      NULL, BLKDEV_IFL_WAIT);
+ +                                      NULL);
                 ret = jbd2_log_wait_commit(journal, commit_tid);
         } else if (journal->j_flags & JBD2_BARRIER)
- -              blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL,
- -                      BLKDEV_IFL_WAIT);
+ +              blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
         return ret;
   }
diff --combined fs/ext4/ialloc.c

index 45853e0d1f218a673809fb23522b7bdc5966d9e0,509f429f71e8b0a8f8108293feb9ee8eea41bf41..1ce240a23ebb84963ce2126b78f126657fd7d6df
--- 1/fs/ext4/ialloc.c
--- 2/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@@ -50,7 -50,7 +50,7 @@@
    * need to use it within a single byte (to ensure we get endianness right).
    * We can use memset for the rest of the bitmap as there are no other users.
    */
- void mark_bitmap_end(int start_bit, int end_bit, char *bitmap)
+ void ext4_mark_bitmap_end(int start_bit, int end_bit, char *bitmap)
   {
         int i;
   
@@@ -65,9 -65,10 +65,10 @@@
   }
   
   /* Initializes an uninitialized inode bitmap */
- unsigned ext4_init_inode_bitmap(struct super_block *sb, struct buffer_head *bh,
-                               ext4_group_t block_group,
-                               struct ext4_group_desc *gdp)
+ static unsigned ext4_init_inode_bitmap(struct super_block *sb,
+                                      struct buffer_head *bh,
+                                      ext4_group_t block_group,
+                                      struct ext4_group_desc *gdp)
   {
         struct ext4_sb_info *sbi = EXT4_SB(sb);
   
@@@ -85,7 -86,7 +86,7 @@@
         }
   
         memset(bh->b_data, 0, (EXT4_INODES_PER_GROUP(sb) + 7) / 8);
-       mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), sb->s_blocksize * 8,
+       ext4_mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), sb->s_blocksize * 8,
                         bh->b_data);
   
         return EXT4_INODES_PER_GROUP(sb);
@@@ -107,6 -108,7 +108,7 @@@ ext4_read_inode_bitmap(struct super_blo
         desc = ext4_get_group_desc(sb, block_group, NULL);
         if (!desc)
                 return NULL;
+ 
         bitmap_blk = ext4_inode_bitmap(sb, desc);
         bh = sb_getblk(sb, bitmap_blk);
         if (unlikely(!bh)) {
@@@ -123,6 -125,7 +125,7 @@@
                 unlock_buffer(bh);
                 return bh;
         }
+ 
         ext4_lock_group(sb, block_group);
         if (desc->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) {
                 ext4_init_inode_bitmap(sb, bh, block_group, desc);
@@@ -133,6 -136,7 +136,7 @@@
                 return bh;
         }
         ext4_unlock_group(sb, block_group);
+ 
         if (buffer_uptodate(bh)) {
                 /*
                  * if not uninit if bh is uptodate,
@@@ -411,8 -415,8 +415,8 @@@ struct orlov_stats 
    * for a particular block group or flex_bg.  If flex_size is 1, then g
    * is a block group number; otherwise it is flex_bg number.
    */
- void get_orlov_stats(struct super_block *sb, ext4_group_t g,
-                      int flex_size, struct orlov_stats *stats)
+ static void get_orlov_stats(struct super_block *sb, ext4_group_t g,
+                           int flex_size, struct orlov_stats *stats)
   {
         struct ext4_group_desc *desc;
         struct flex_groups *flex_group = EXT4_SB(sb)->s_flex_groups;
@@@ -712,8 -716,17 +716,17 @@@ static int ext4_claim_inode(struct supe
   {
         int free = 0, retval = 0, count;
         struct ext4_sb_info *sbi = EXT4_SB(sb);
+       struct ext4_group_info *grp = ext4_get_group_info(sb, group);
         struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group, NULL);
   
+       /*
+        * We have to be sure that new inode allocation does not race with
+        * inode table initialization, because otherwise we may end up
+        * allocating and writing new inode right before sb_issue_zeroout
+        * takes place and overwriting our new inode with zeroes. So we
+        * take alloc_sem to prevent it.
+        */
+       down_read(&grp->alloc_sem);
         ext4_lock_group(sb, group);
         if (ext4_set_bit(ino, inode_bitmap_bh->b_data)) {
                 /* not a free inode */
@@@ -724,6 -737,7 +737,7 @@@
         if ((group == 0 && ino < EXT4_FIRST_INO(sb)) ||
                         ino > EXT4_INODES_PER_GROUP(sb)) {
                 ext4_unlock_group(sb, group);
+               up_read(&grp->alloc_sem);
                 ext4_error(sb, "reserved inode or inode > inodes count - "
                            "block_group = %u, inode=%lu", group,
                            ino + group * EXT4_INODES_PER_GROUP(sb));
@@@ -772,6 -786,7 +786,7 @@@
         gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp);
   err_ret:
         ext4_unlock_group(sb, group);
+       up_read(&grp->alloc_sem);
         return retval;
   }
   
@@@ -1205,3 -1220,110 +1220,109 @@@ unsigned long ext4_count_dirs(struct su
         }
         return count;
   }
- -      unsigned long flags = BLKDEV_IFL_WAIT;
+ 
+ /*
+  * Zero the not-yet-zeroed part of a group's inode table, skipping blocks
+  * that hold used inodes. Must be called without any spinlock held. On an
+  * active filesystem the only caller is the ext4lazyinit thread, so no
+  * special locks are needed beyond alloc_sem, which blocks ext4_claim_inode
+  * until we are finished. Returns 0 if done or nothing to do, 1 if the fs
+  * is read-only or the group looks inconsistent, else the failing call's error.
+  */
+ extern int ext4_init_inode_table(struct super_block *sb, ext4_group_t group,
+                                int barrier)
+ {
+       struct ext4_group_info *grp = ext4_get_group_info(sb, group);
+       struct ext4_sb_info *sbi = EXT4_SB(sb);
+       struct ext4_group_desc *gdp = NULL;
+       struct buffer_head *group_desc_bh;
+       handle_t *handle;
+       ext4_fsblk_t blk;
+       int num, ret = 0, used_blks = 0;
- -      if (barrier)
- -              flags |= BLKDEV_IFL_BARRIER;
- -      ret = sb_issue_zeroout(sb, blk, num, GFP_NOFS, flags);
+ 
+       /* This should not happen, but just to be sure check this */
+       if (sb->s_flags & MS_RDONLY) {
+               ret = 1;
+               goto out;
+       }
+ 
+       gdp = ext4_get_group_desc(sb, group, &group_desc_bh);
+       if (!gdp)
+               goto out;
+ 
+       /*
+        * We do not need to lock this, because we are the only one
+        * handling this flag.
+        */
+       if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED))
+               goto out;
+ 
+       handle = ext4_journal_start_sb(sb, 1);
+       if (IS_ERR(handle)) {
+               ret = PTR_ERR(handle);
+               goto out;
+       }
+ 
+       down_write(&grp->alloc_sem);
+       /*
+        * If inode bitmap was already initialized there may be some
+        * used inodes so we need to skip blocks with used inodes in
+        * inode table.
+        */
+       if (!(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)))
+               used_blks = DIV_ROUND_UP((EXT4_INODES_PER_GROUP(sb) -
+                           ext4_itable_unused_count(sb, gdp)),
+                           sbi->s_inodes_per_block);
+ 
+       if ((used_blks < 0) || (used_blks > sbi->s_itb_per_group)) {
+               ext4_error(sb, "Something is wrong with group %u\n"
+                          "Used itable blocks: %d, "
+                          "itable unused count: %u\n",
+                          group, used_blks,
+                          ext4_itable_unused_count(sb, gdp));
+               ret = 1;
+               goto err_out;   /* alloc_sem and the handle are held here */
+       }
+ 
+       blk = ext4_inode_table(sb, gdp) + used_blks;
+       num = sbi->s_itb_per_group - used_blks;
+ 
+       BUFFER_TRACE(group_desc_bh, "get_write_access");
+       ret = ext4_journal_get_write_access(handle,
+                                           group_desc_bh);
+       if (ret)
+               goto err_out;
+ 
+       /*
+        * Skip zeroout if the inode table is full. But we set the ZEROED
+        * flag anyway, because obviously, when it is full it does not need
+        * further zeroing.
+        */
+       if (unlikely(num == 0))
+               goto skip_zeroout;
+ 
+       ext4_debug("going to zero out inode table in group %d\n",
+                  group);
++      ret = sb_issue_zeroout(sb, blk, num, GFP_NOFS);
+       if (ret < 0)
+               goto err_out;
++      if (barrier)
++              blkdev_issue_flush(sb->s_bdev, GFP_NOFS, NULL);
+ 
+ skip_zeroout:
+       ext4_lock_group(sb, group);
+       gdp->bg_flags |= cpu_to_le16(EXT4_BG_INODE_ZEROED);
+       gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp);
+       ext4_unlock_group(sb, group);
+ 
+       BUFFER_TRACE(group_desc_bh,
+                    "call ext4_handle_dirty_metadata");
+       ret = ext4_handle_dirty_metadata(handle, NULL,
+                                        group_desc_bh);
+ 
+ err_out:
+       up_write(&grp->alloc_sem);
+       ext4_journal_stop(handle);
+ out:
+       return ret;
+ }
diff --combined fs/ext4/inode.c

index 49635ef236f84d40b0090a78a1a96860fae1fb6b,3ba237b0b2aa6ef2738b286eb924b7def916f74e..2d6c6c8c036df496f0b44e549d800d26494abdef
--- 1/fs/ext4/inode.c
--- 2/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@@ -60,6 -60,12 +60,12 @@@ static inline int ext4_begin_ordered_tr
   }
   
   static void ext4_invalidatepage(struct page *page, unsigned long offset);
+ static int noalloc_get_block_write(struct inode *inode, sector_t iblock,
+                                  struct buffer_head *bh_result, int create);
+ static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode);
+ static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate);
+ static int __ext4_journalled_writepage(struct page *page, unsigned int len);
+ static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh);
   
   /*
    * Test whether an inode is a fast symlink.
@@@ -755,6 -761,11 +761,11 @@@ static int ext4_alloc_branch(handle_t *
                  * parent to disk.
                  */
                 bh = sb_getblk(inode->i_sb, new_blocks[n-1]);
+               if (unlikely(!bh)) {
+                       err = -EIO;
+                       goto failed;
+               }
+ 
                 branch[n].bh = bh;
                 lock_buffer(bh);
                 BUFFER_TRACE(bh, "call get_create_access");
@@@ -1207,8 -1218,10 +1218,10 @@@ static pgoff_t ext4_num_dirty_pages(str
                                 break;
                         idx++;
                         num++;
-                       if (num >= max_pages)
+                       if (num >= max_pages) {
+                               done = 1;
                                 break;
+                       }
                 }
                 pagevec_release(&pvec);
         }
@@@ -1538,10 -1551,10 +1551,10 @@@ static int do_journal_get_write_access(
         if (!buffer_mapped(bh) || buffer_freed(bh))
                 return 0;
         /*
- -       * __block_prepare_write() could have dirtied some buffers. Clean
+ +       * __block_write_begin() could have dirtied some buffers. Clean
          * the dirty bit as jbd2_journal_get_write_access() could complain
          * otherwise about fs integrity issues. Setting of the dirty bit
- -       * by __block_prepare_write() isn't a real problem here as we clear
+ +       * by __block_write_begin() isn't a real problem here as we clear
          * the bit before releasing a page lock and thus writeback cannot
          * ever write the buffer.
          */
@@@ -1995,16 -2008,23 +2008,23 @@@ static void ext4_da_page_release_reserv
    *
    * As pages are already locked by write_cache_pages(), we can't use it
    */
- static int mpage_da_submit_io(struct mpage_da_data *mpd)
+ static int mpage_da_submit_io(struct mpage_da_data *mpd,
+                             struct ext4_map_blocks *map)
   {
-       long pages_skipped;
         struct pagevec pvec;
         unsigned long index, end;
         int ret = 0, err, nr_pages, i;
         struct inode *inode = mpd->inode;
         struct address_space *mapping = inode->i_mapping;
+       loff_t size = i_size_read(inode);
+       unsigned int len, block_start;
+       struct buffer_head *bh, *page_bufs = NULL;
+       int journal_data = ext4_should_journal_data(inode);
+       sector_t pblock = 0, cur_logical = 0;
+       struct ext4_io_submit io_submit;
   
         BUG_ON(mpd->next_page <= mpd->first_page);
+       memset(&io_submit, 0, sizeof(io_submit));
         /*
          * We need to start from the first_page to the next_page - 1
          * to make sure we also write the mapped dirty buffer_heads.
@@@ -2020,122 -2040,108 +2040,108 @@@
                 if (nr_pages == 0)
                         break;
                 for (i = 0; i < nr_pages; i++) {
+                       int commit_write = 0, redirty_page = 0;
                         struct page *page = pvec.pages[i];
   
                         index = page->index;
                         if (index > end)
                                 break;
+ 
+                       if (index == size >> PAGE_CACHE_SHIFT)
+                               len = size & ~PAGE_CACHE_MASK;
+                       else
+                               len = PAGE_CACHE_SIZE;
+                       if (map) {
+                               cur_logical = index << (PAGE_CACHE_SHIFT -
+                                                       inode->i_blkbits);
+                               pblock = map->m_pblk + (cur_logical -
+                                                       map->m_lblk);
+                       }
                         index++;
   
                         BUG_ON(!PageLocked(page));
                         BUG_ON(PageWriteback(page));
   
-                       pages_skipped = mpd->wbc->pages_skipped;
-                       err = mapping->a_ops->writepage(page, mpd->wbc);
-                       if (!err && (pages_skipped == mpd->wbc->pages_skipped))
-                               /*
-                                * have successfully written the page
-                                * without skipping the same
-                                */
-                               mpd->pages_written++;
                         /*
-                        * In error case, we have to continue because
-                        * remaining pages are still locked
-                        * XXX: unlock and re-dirty them?
+                        * If the page does not have buffers (for
+                        * whatever reason), try to create them using
- -                       * block_prepare_write.  If this fails,
++                       * __block_write_begin.  If this fails,
+                        * redirty the page and move on.
                          */
-                       if (ret == 0)
-                               ret = err;
-               }
-               pagevec_release(&pvec);
-       }
-       return ret;
- }
- 
- /*
-  * mpage_put_bnr_to_bhs - walk blocks and assign them actual numbers
-  *
-  * the function goes through all passed space and put actual disk
-  * block numbers into buffer heads, dropping BH_Delay and BH_Unwritten
-  */
- static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd,
-                                struct ext4_map_blocks *map)
- {
-       struct inode *inode = mpd->inode;
-       struct address_space *mapping = inode->i_mapping;
-       int blocks = map->m_len;
-       sector_t pblock = map->m_pblk, cur_logical;
-       struct buffer_head *head, *bh;
-       pgoff_t index, end;
-       struct pagevec pvec;
-       int nr_pages, i;
- 
-       index = map->m_lblk >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
-       end = (map->m_lblk + blocks - 1) >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
-       cur_logical = index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
- 
-       pagevec_init(&pvec, 0);
- 
-       while (index <= end) {
-               /* XXX: optimize tail */
-               nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
-               if (nr_pages == 0)
-                       break;
-               for (i = 0; i < nr_pages; i++) {
-                       struct page *page = pvec.pages[i];
- 
-                       index = page->index;
-                       if (index > end)
-                               break;
-                       index++;
- 
-                       BUG_ON(!PageLocked(page));
-                       BUG_ON(PageWriteback(page));
-                       BUG_ON(!page_has_buffers(page));
- 
-                       bh = page_buffers(page);
-                       head = bh;
- 
-                       /* skip blocks out of the range */
-                       do {
-                               if (cur_logical >= map->m_lblk)
-                                       break;
-                               cur_logical++;
-                       } while ((bh = bh->b_this_page) != head);
+                       if (!page_has_buffers(page)) {
- -                              if (block_prepare_write(page, 0, len,
++                              if (__block_write_begin(page, 0, len,
+                                               noalloc_get_block_write)) {
+                               redirty_page:
+                                       redirty_page_for_writepage(mpd->wbc,
+                                                                  page);
+                                       unlock_page(page);
+                                       continue;
+                               }
+                               commit_write = 1;
+                       }
   
+                       bh = page_bufs = page_buffers(page);
+                       block_start = 0;
                         do {
-                               if (cur_logical >= map->m_lblk + blocks)
-                                       break;
- 
-                               if (buffer_delay(bh) || buffer_unwritten(bh)) {
- 
-                                       BUG_ON(bh->b_bdev != inode->i_sb->s_bdev);
- 
+                               if (!bh)
+                                       goto redirty_page;
+                               if (map && (cur_logical >= map->m_lblk) &&
+                                   (cur_logical <= (map->m_lblk +
+                                                    (map->m_len - 1)))) {
                                         if (buffer_delay(bh)) {
                                                 clear_buffer_delay(bh);
                                                 bh->b_blocknr = pblock;
-                                       } else {
-                                               /*
-                                                * unwritten already should have
-                                                * blocknr assigned. Verify that
-                                                */
-                                               clear_buffer_unwritten(bh);
-                                               BUG_ON(bh->b_blocknr != pblock);
                                         }
+                                       if (buffer_unwritten(bh) ||
+                                           buffer_mapped(bh))
+                                               BUG_ON(bh->b_blocknr != pblock);
+                                       if (map->m_flags & EXT4_MAP_UNINIT)
+                                               set_buffer_uninit(bh);
+                                       clear_buffer_unwritten(bh);
+                               }
   
-                               } else if (buffer_mapped(bh))
-                                       BUG_ON(bh->b_blocknr != pblock);
- 
-                               if (map->m_flags & EXT4_MAP_UNINIT)
-                                       set_buffer_uninit(bh);
+                               /* redirty page if block allocation undone */
+                               if (buffer_delay(bh) || buffer_unwritten(bh))
+                                       redirty_page = 1;
+                               bh = bh->b_this_page;
+                               block_start += bh->b_size;
                                 cur_logical++;
                                 pblock++;
-                       } while ((bh = bh->b_this_page) != head);
+                       } while (bh != page_bufs);
+ 
+                       if (redirty_page)
+                               goto redirty_page;
+ 
+                       if (commit_write)
+                               /* mark the buffer_heads as dirty & uptodate */
+                               block_commit_write(page, 0, len);
+ 
+                       /*
+                        * Delalloc doesn't support data journalling,
+                        * but eventually maybe we'll lift this
+                        * restriction.
+                        */
+                       if (unlikely(journal_data && PageChecked(page)))
+                               err = __ext4_journalled_writepage(page, len);
+                       else
+                               err = ext4_bio_write_page(&io_submit, page,
+                                                         len, mpd->wbc);
+ 
+                       if (!err)
+                               mpd->pages_written++;
+                       /*
+                        * In error case, we have to continue because
+                        * remaining pages are still locked
+                        */
+                       if (ret == 0)
+                               ret = err;
                 }
                 pagevec_release(&pvec);
         }
+       ext4_io_submit(&io_submit);
+       return ret;
   }
   
- 
   static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd,
                                         sector_t logical, long blk_cnt)
   {
@@@ -2187,35 -2193,32 +2193,32 @@@ static void ext4_print_free_blocks(stru
   }
   
   /*
-  * mpage_da_map_blocks - go through given space
+  * mpage_da_map_and_submit - go through given space, map them
+  *       if necessary, and then submit them for I/O
    *
    * @mpd - bh describing space
    *
    * The function skips space we know is already mapped to disk blocks.
    *
    */
- static int mpage_da_map_blocks(struct mpage_da_data *mpd)
+ static void mpage_da_map_and_submit(struct mpage_da_data *mpd)
   {
         int err, blks, get_blocks_flags;
-       struct ext4_map_blocks map;
+       struct ext4_map_blocks map, *mapp = NULL;
         sector_t next = mpd->b_blocknr;
         unsigned max_blocks = mpd->b_size >> mpd->inode->i_blkbits;
         loff_t disksize = EXT4_I(mpd->inode)->i_disksize;
         handle_t *handle = NULL;
   
         /*
-        * We consider only non-mapped and non-allocated blocks
-        */
-       if ((mpd->b_state  & (1 << BH_Mapped)) &&
-               !(mpd->b_state & (1 << BH_Delay)) &&
-               !(mpd->b_state & (1 << BH_Unwritten)))
-               return 0;
- 
-       /*
-        * If we didn't accumulate anything to write simply return
+        * If the blocks are mapped already, or we couldn't accumulate
+        * any blocks, then proceed immediately to the submission stage.
          */
-       if (!mpd->b_size)
-               return 0;
+       if ((mpd->b_size == 0) ||
+           ((mpd->b_state  & (1 << BH_Mapped)) &&
+            !(mpd->b_state & (1 << BH_Delay)) &&
+            !(mpd->b_state & (1 << BH_Unwritten))))
+               goto submit_io;
   
         handle = ext4_journal_current_handle();
         BUG_ON(!handle);
@@@ -2252,17 -2255,18 +2255,18 @@@
   
                 err = blks;
                 /*
-                * If get block returns with error we simply
-                * return. Later writepage will redirty the page and
-                * writepages will find the dirty page again
+                * If get block returns EAGAIN or ENOSPC and there
+                * appear to be free blocks we will call
+                * ext4_writepage() for all of the pages which will
+                * just redirty the pages.
                  */
                 if (err == -EAGAIN)
-                       return 0;
+                       goto submit_io;
   
                 if (err == -ENOSPC &&
                     ext4_count_free_blocks(sb)) {
                         mpd->retval = err;
-                       return 0;
+                       goto submit_io;
                 }
   
                 /*
@@@ -2287,10 -2291,11 +2291,11 @@@
                 /* invalidate all the pages */
                 ext4_da_block_invalidatepages(mpd, next,
                                 mpd->b_size >> mpd->inode->i_blkbits);
-               return err;
+               return;
         }
         BUG_ON(blks == 0);
   
+       mapp = &map;
         if (map.m_flags & EXT4_MAP_NEW) {
                 struct block_device *bdev = mpd->inode->i_sb->s_bdev;
                 int i;
@@@ -2299,18 -2304,11 +2304,11 @@@
                         unmap_underlying_metadata(bdev, map.m_pblk + i);
         }
   
-       /*
-        * If blocks are delayed marked, we need to
-        * put actual blocknr and drop delayed bit
-        */
-       if ((mpd->b_state & (1 << BH_Delay)) ||
-           (mpd->b_state & (1 << BH_Unwritten)))
-               mpage_put_bnr_to_bhs(mpd, &map);
- 
         if (ext4_should_order_data(mpd->inode)) {
                 err = ext4_jbd2_file_inode(handle, mpd->inode);
                 if (err)
-                       return err;
+                       /* This only happens if the journal is aborted */
+                       return;
         }
   
         /*
@@@ -2321,10 -2319,16 +2319,16 @@@
                 disksize = i_size_read(mpd->inode);
         if (disksize > EXT4_I(mpd->inode)->i_disksize) {
                 ext4_update_i_disksize(mpd->inode, disksize);
-               return ext4_mark_inode_dirty(handle, mpd->inode);
+               err = ext4_mark_inode_dirty(handle, mpd->inode);
+               if (err)
+                       ext4_error(mpd->inode->i_sb,
+                                  "Failed to mark inode %lu dirty",
+                                  mpd->inode->i_ino);
         }
   
-       return 0;
+ submit_io:
+       mpage_da_submit_io(mpd, mapp);
+       mpd->io_done = 1;
   }
   
   #define BH_FLAGS ((1 << BH_Uptodate) | (1 << BH_Mapped) | \
@@@ -2401,9 -2405,7 +2405,7 @@@ flush_it
          * We couldn't merge the block to our extent, so we
          * need to flush current  extent and start new one
          */
-       if (mpage_da_map_blocks(mpd) == 0)
-               mpage_da_submit_io(mpd);
-       mpd->io_done = 1;
+       mpage_da_map_and_submit(mpd);
         return;
   }
   
@@@ -2422,9 -2424,9 +2424,9 @@@ static int ext4_bh_delay_or_unwritten(h
    * The function finds extents of pages and scan them for all blocks.
    */
   static int __mpage_da_writepage(struct page *page,
-                               struct writeback_control *wbc, void *data)
+                               struct writeback_control *wbc,
+                               struct mpage_da_data *mpd)
   {
-       struct mpage_da_data *mpd = data;
         struct inode *inode = mpd->inode;
         struct buffer_head *bh, *head;
         sector_t logical;
@@@ -2435,15 -2437,13 +2437,13 @@@
         if (mpd->next_page != page->index) {
                 /*
                  * Nope, we can't. So, we map non-allocated blocks
-                * and start IO on them using writepage()
+                * and start IO on them
                  */
                 if (mpd->next_page != mpd->first_page) {
-                       if (mpage_da_map_blocks(mpd) == 0)
-                               mpage_da_submit_io(mpd);
+                       mpage_da_map_and_submit(mpd);
                         /*
                          * skip rest of the page in the page_vec
                          */
-                       mpd->io_done = 1;
                         redirty_page_for_writepage(wbc, page);
                         unlock_page(page);
                         return MPAGE_DA_EXTENT_TAIL;
@@@ -2550,7 -2550,8 +2550,7 @@@ static int ext4_da_get_block_prep(struc
                 if (buffer_delay(bh))
                         return 0; /* Not sure this could or should happen */
                 /*
- -               * XXX: __block_prepare_write() unmaps passed block,
- -               * is it OK?
+ +               * XXX: __block_write_begin() unmaps passed block, is it OK?
                  */
                 ret = ext4_da_reserve_space(inode, iblock);
                 if (ret)
@@@ -2582,7 -2583,7 +2582,7 @@@
   /*
    * This function is used as a standard get_block_t calback function
    * when there is no desire to allocate any blocks.  It is used as a
- - * callback function for block_prepare_write() and block_write_full_page().
+ + * callback function for block_write_begin() and block_write_full_page().
    * These functions should only try to map a single block at a time.
    *
    * Since this function doesn't do block allocations even if the caller
@@@ -2622,6 -2623,7 +2622,7 @@@ static int __ext4_journalled_writepage(
         int ret = 0;
         int err;
   
+       ClearPageChecked(page);
         page_bufs = page_buffers(page);
         BUG_ON(!page_bufs);
         walk_page_buffers(handle, page_bufs, 0, len, NULL, bget_one);
@@@ -2699,7 -2701,7 +2700,7 @@@ static void ext4_end_io_buffer_write(st
   static int ext4_writepage(struct page *page,
                           struct writeback_control *wbc)
   {
-       int ret = 0;
+       int ret = 0, commit_write = 0;
         loff_t size;
         unsigned int len;
         struct buffer_head *page_bufs = NULL;
@@@ -2712,71 -2714,46 +2713,46 @@@
         else
                 len = PAGE_CACHE_SIZE;
   
-       if (page_has_buffers(page)) {
-               page_bufs = page_buffers(page);
-               if (walk_page_buffers(NULL, page_bufs, 0, len, NULL,
-                                       ext4_bh_delay_or_unwritten)) {
-                       /*
-                        * We don't want to do  block allocation
-                        * So redirty the page and return
-                        * We may reach here when we do a journal commit
-                        * via journal_submit_inode_data_buffers.
-                        * If we don't have mapping block we just ignore
-                        * them. We can also reach here via shrink_page_list
-                        */
+       /*
+        * If the page does not have buffers (for whatever reason),
- -       * try to create them using block_prepare_write.  If this
++       * try to create them using __block_write_begin.  If this
+        * fails, redirty the page and move on.
+        */
+       if (!page_buffers(page)) {
- -              if (block_prepare_write(page, 0, len,
++              if (__block_write_begin(page, 0, len,
+                                       noalloc_get_block_write)) {
+               redirty_page:
                         redirty_page_for_writepage(wbc, page);
                         unlock_page(page);
                         return 0;
                 }
-       } else {
+               commit_write = 1;
+       }
+       page_bufs = page_buffers(page);
+       if (walk_page_buffers(NULL, page_bufs, 0, len, NULL,
+                             ext4_bh_delay_or_unwritten)) {
                 /*
-                * The test for page_has_buffers() is subtle:
-                * We know the page is dirty but it lost buffers. That means
-                * that at some moment in time after write_begin()/write_end()
-                * has been called all buffers have been clean and thus they
-                * must have been written at least once. So they are all
-                * mapped and we can happily proceed with mapping them
-                * and writing the page.
-                *
-                * Try to initialize the buffer_heads and check whether
-                * all are mapped and non delay. We don't want to
-                * do block allocation here.
+                * We don't want to do block allocation, so redirty the
+                * page and return.  We may reach here when we do a
+                * journal commit via
+                * journal_submit_inode_data_buffers.  If we don't
+                * have a mapped block we just ignore it. We can also
+                * reach here via shrink_page_list.
                  */
-               ret = __block_write_begin(page, 0, len,
-                                         noalloc_get_block_write);
-               if (!ret) {
-                       page_bufs = page_buffers(page);
-                       /* check whether all are mapped and non delay */
-                       if (walk_page_buffers(NULL, page_bufs, 0, len, NULL,
-                                               ext4_bh_delay_or_unwritten)) {
-                               redirty_page_for_writepage(wbc, page);
-                               unlock_page(page);
-                               return 0;
-                       }
-               } else {
-                       /*
-                        * We can't do block allocation here
-                        * so just redity the page and unlock
-                        * and return
-                        */
-                       redirty_page_for_writepage(wbc, page);
-                       unlock_page(page);
-                       return 0;
-               }
+               goto redirty_page;
+       }
+       if (commit_write)
                 /* now mark the buffer_heads as dirty and uptodate */
                 block_commit_write(page, 0, len);
-       }
   
-       if (PageChecked(page) && ext4_should_journal_data(inode)) {
+       if (PageChecked(page) && ext4_should_journal_data(inode))
                 /*
                  * It's mmapped pagecache.  Add buffers and journal it.  There
                  * doesn't seem much point in redirtying the page here.
                  */
-               ClearPageChecked(page);
                 return __ext4_journalled_writepage(page, len);
-       }
   
-       if (page_bufs && buffer_uninit(page_bufs)) {
+       if (buffer_uninit(page_bufs)) {
                 ext4_set_bh_endio(page_bufs, inode);
                 ret = block_write_full_page_endio(page, noalloc_get_block_write,
                                             wbc, ext4_end_io_buffer_write);
@@@ -2823,25 -2800,32 +2799,32 @@@ static int ext4_da_writepages_trans_blo
    */
   static int write_cache_pages_da(struct address_space *mapping,
                                 struct writeback_control *wbc,
-                               struct mpage_da_data *mpd)
+                               struct mpage_da_data *mpd,
+                               pgoff_t *done_index)
   {
         int ret = 0;
         int done = 0;
         struct pagevec pvec;
-       int nr_pages;
+       unsigned nr_pages;
         pgoff_t index;
         pgoff_t end;            /* Inclusive */
         long nr_to_write = wbc->nr_to_write;
+       int tag;
   
         pagevec_init(&pvec, 0);
         index = wbc->range_start >> PAGE_CACHE_SHIFT;
         end = wbc->range_end >> PAGE_CACHE_SHIFT;
   
+       if (wbc->sync_mode == WB_SYNC_ALL)
+               tag = PAGECACHE_TAG_TOWRITE;
+       else
+               tag = PAGECACHE_TAG_DIRTY;
+ 
+       *done_index = index;
         while (!done && (index <= end)) {
                 int i;
   
-               nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
-                             PAGECACHE_TAG_DIRTY,
+               nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
                               min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
                 if (nr_pages == 0)
                         break;
@@@ -2861,6 -2845,8 +2844,8 @@@
                                 break;
                         }
   
+                       *done_index = page->index + 1;
+ 
                         lock_page(page);
   
                         /*
@@@ -2946,6 -2932,8 +2931,8 @@@ static int ext4_da_writepages(struct ad
         long desired_nr_to_write, nr_to_writebump = 0;
         loff_t range_start = wbc->range_start;
         struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
+       pgoff_t done_index = 0;
+       pgoff_t end;
   
         trace_ext4_da_writepages(inode, wbc);
   
@@@ -2981,8 -2969,11 +2968,11 @@@
                 wbc->range_start = index << PAGE_CACHE_SHIFT;
                 wbc->range_end  = LLONG_MAX;
                 wbc->range_cyclic = 0;
-       } else
+               end = -1;
+       } else {
                 index = wbc->range_start >> PAGE_CACHE_SHIFT;
+               end = wbc->range_end >> PAGE_CACHE_SHIFT;
+       }
   
         /*
          * This works around two forms of stupidity.  The first is in
@@@ -3001,9 -2992,12 +2991,12 @@@
          * sbi->max_writeback_mb_bump whichever is smaller.
          */
         max_pages = sbi->s_max_writeback_mb_bump << (20 - PAGE_CACHE_SHIFT);
-       if (!range_cyclic && range_whole)
-               desired_nr_to_write = wbc->nr_to_write * 8;
-       else
+       if (!range_cyclic && range_whole) {
+               if (wbc->nr_to_write == LONG_MAX)
+                       desired_nr_to_write = wbc->nr_to_write;
+               else
+                       desired_nr_to_write = wbc->nr_to_write * 8;
+       } else
                 desired_nr_to_write = ext4_num_dirty_pages(inode, index,
                                                            max_pages);
         if (desired_nr_to_write > max_pages)
@@@ -3020,6 -3014,9 +3013,9 @@@
         pages_skipped = wbc->pages_skipped;
   
   retry:
+       if (wbc->sync_mode == WB_SYNC_ALL)
+               tag_pages_for_writeback(mapping, index, end);
+ 
         while (!ret && wbc->nr_to_write > 0) {
   
                 /*
@@@ -3058,16 -3055,14 +3054,14 @@@
                 mpd.io_done = 0;
                 mpd.pages_written = 0;
                 mpd.retval = 0;
-               ret = write_cache_pages_da(mapping, wbc, &mpd);
+               ret = write_cache_pages_da(mapping, wbc, &mpd, &done_index);
                 /*
                  * If we have a contiguous extent of pages and we
                  * haven't done the I/O yet, map the blocks and submit
                  * them for I/O.
                  */
                 if (!mpd.io_done && mpd.next_page != mpd.first_page) {
-                       if (mpage_da_map_blocks(&mpd) == 0)
-                               mpage_da_submit_io(&mpd);
-                       mpd.io_done = 1;
+                       mpage_da_map_and_submit(&mpd);
                         ret = MPAGE_DA_EXTENT_TAIL;
                 }
                 trace_ext4_da_write_pages(inode, &mpd);
@@@ -3114,14 -3109,13 +3108,13 @@@
                          __func__, wbc->nr_to_write, ret);
   
         /* Update index */
-       index += pages_written;
         wbc->range_cyclic = range_cyclic;
         if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
                 /*
                  * set the writeback_index so that range_cyclic
                  * mode will write it back later
                  */
-               mapping->writeback_index = index;
+               mapping->writeback_index = done_index;
   
   out_writepages:
         wbc->nr_to_write -= nr_to_writebump;
@@@ -3456,15 -3450,6 +3449,6 @@@ ext4_readpages(struct file *file, struc
         return mpage_readpages(mapping, pages, nr_pages, ext4_get_block);
   }
   
- static void ext4_free_io_end(ext4_io_end_t *io)
- {
-       BUG_ON(!io);
-       if (io->page)
-               put_page(io->page);
-       iput(io->inode);
-       kfree(io);
- }
- 
   static void ext4_invalidatepage_free_endio(struct page *page, unsigned long offset)
   {
         struct buffer_head *head, *bh;
@@@ -3641,173 -3626,6 +3625,6 @@@ static int ext4_get_block_write(struct 
                                EXT4_GET_BLOCKS_IO_CREATE_EXT);
   }
   
- static void dump_completed_IO(struct inode * inode)
- {
- #ifdef        EXT4_DEBUG
-       struct list_head *cur, *before, *after;
-       ext4_io_end_t *io, *io0, *io1;
-       unsigned long flags;
- 
-       if (list_empty(&EXT4_I(inode)->i_completed_io_list)){
-               ext4_debug("inode %lu completed_io list is empty\n", inode->i_ino);
-               return;
-       }
- 
-       ext4_debug("Dump inode %lu completed_io list \n", inode->i_ino);
-       spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags);
-       list_for_each_entry(io, &EXT4_I(inode)->i_completed_io_list, list){
-               cur = &io->list;
-               before = cur->prev;
-               io0 = container_of(before, ext4_io_end_t, list);
-               after = cur->next;
-               io1 = container_of(after, ext4_io_end_t, list);
- 
-               ext4_debug("io 0x%p from inode %lu,prev 0x%p,next 0x%p\n",
-                           io, inode->i_ino, io0, io1);
-       }
-       spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags);
- #endif
- }
- 
- /*
-  * check a range of space and convert unwritten extents to written.
-  */
- static int ext4_end_io_nolock(ext4_io_end_t *io)
- {
-       struct inode *inode = io->inode;
-       loff_t offset = io->offset;
-       ssize_t size = io->size;
-       int ret = 0;
- 
-       ext4_debug("ext4_end_io_nolock: io 0x%p from inode %lu,list->next 0x%p,"
-                  "list->prev 0x%p\n",
-                  io, inode->i_ino, io->list.next, io->list.prev);
- 
-       if (list_empty(&io->list))
-               return ret;
- 
-       if (io->flag != EXT4_IO_UNWRITTEN)
-               return ret;
- 
-       ret = ext4_convert_unwritten_extents(inode, offset, size);
-       if (ret < 0) {
-               printk(KERN_EMERG "%s: failed to convert unwritten"
-                       "extents to written extents, error is %d"
-                       " io is still on inode %lu aio dio list\n",
-                        __func__, ret, inode->i_ino);
-               return ret;
-       }
- 
-       if (io->iocb)
-               aio_complete(io->iocb, io->result, 0);
-       /* clear the DIO AIO unwritten flag */
-       io->flag = 0;
-       return ret;
- }
- 
- /*
-  * work on completed aio dio IO, to convert unwritten extents to extents
-  */
- static void ext4_end_io_work(struct work_struct *work)
- {
-       ext4_io_end_t           *io = container_of(work, ext4_io_end_t, work);
-       struct inode            *inode = io->inode;
-       struct ext4_inode_info  *ei = EXT4_I(inode);
-       unsigned long           flags;
-       int                     ret;
- 
-       mutex_lock(&inode->i_mutex);
-       ret = ext4_end_io_nolock(io);
-       if (ret < 0) {
-               mutex_unlock(&inode->i_mutex);
-               return;
-       }
- 
-       spin_lock_irqsave(&ei->i_completed_io_lock, flags);
-       if (!list_empty(&io->list))
-               list_del_init(&io->list);
-       spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
-       mutex_unlock(&inode->i_mutex);
-       ext4_free_io_end(io);
- }
- 
- /*
-  * This function is called from ext4_sync_file().
-  *
-  * When IO is completed, the work to convert unwritten extents to
-  * written is queued on workqueue but may not get immediately
-  * scheduled. When fsync is called, we need to ensure the
-  * conversion is complete before fsync returns.
-  * The inode keeps track of a list of pending/completed IO that
-  * might needs to do the conversion. This function walks through
-  * the list and convert the related unwritten extents for completed IO
-  * to written.
-  * The function return the number of pending IOs on success.
-  */
- int flush_completed_IO(struct inode *inode)
- {
-       ext4_io_end_t *io;
-       struct ext4_inode_info *ei = EXT4_I(inode);
-       unsigned long flags;
-       int ret = 0;
-       int ret2 = 0;
- 
-       if (list_empty(&ei->i_completed_io_list))
-               return ret;
- 
-       dump_completed_IO(inode);
-       spin_lock_irqsave(&ei->i_completed_io_lock, flags);
-       while (!list_empty(&ei->i_completed_io_list)){
-               io = list_entry(ei->i_completed_io_list.next,
-                               ext4_io_end_t, list);
-               /*
-                * Calling ext4_end_io_nolock() to convert completed
-                * IO to written.
-                *
-                * When ext4_sync_file() is called, run_queue() may already
-                * about to flush the work corresponding to this io structure.
-                * It will be upset if it founds the io structure related
-                * to the work-to-be schedule is freed.
-                *
-                * Thus we need to keep the io structure still valid here after
-                * convertion finished. The io structure has a flag to
-                * avoid double converting from both fsync and background work
-                * queue work.
-                */
-               spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
-               ret = ext4_end_io_nolock(io);
-               spin_lock_irqsave(&ei->i_completed_io_lock, flags);
-               if (ret < 0)
-                       ret2 = ret;
-               else
-                       list_del_init(&io->list);
-       }
-       spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
-       return (ret2 < 0) ? ret2 : 0;
- }
- 
- static ext4_io_end_t *ext4_init_io_end (struct inode *inode, gfp_t flags)
- {
-       ext4_io_end_t *io = NULL;
- 
-       io = kmalloc(sizeof(*io), flags);
- 
-       if (io) {
-               igrab(inode);
-               io->inode = inode;
-               io->flag = 0;
-               io->offset = 0;
-               io->size = 0;
-               io->page = NULL;
-               io->iocb = NULL;
-               io->result = 0;
-               INIT_WORK(&io->work, ext4_end_io_work);
-               INIT_LIST_HEAD(&io->list);
-       }
- 
-       return io;
- }
- 
   static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
                             ssize_t size, void *private, int ret,
                             bool is_async)
@@@ -3827,7 -3645,7 +3644,7 @@@
                   size);
   
         /* if not aio dio with unwritten extents, just free io and return */
-       if (io_end->flag != EXT4_IO_UNWRITTEN){
+       if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) {
                 ext4_free_io_end(io_end);
                 iocb->private = NULL;
   out:
@@@ -3844,14 -3662,14 +3661,14 @@@
         }
         wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq;
   
-       /* queue the work to convert unwritten extents to written */
-       queue_work(wq, &io_end->work);
- 
         /* Add the io_end to per-inode completed aio dio list*/
         ei = EXT4_I(io_end->inode);
         spin_lock_irqsave(&ei->i_completed_io_lock, flags);
         list_add_tail(&io_end->list, &ei->i_completed_io_list);
         spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
+ 
+       /* queue the work to convert unwritten extents to written */
+       queue_work(wq, &io_end->work);
         iocb->private = NULL;
   }
   
@@@ -3872,7 -3690,7 +3689,7 @@@ static void ext4_end_io_buffer_write(st
                 goto out;
         }
   
-       io_end->flag = EXT4_IO_UNWRITTEN;
+       io_end->flag = EXT4_IO_END_UNWRITTEN;
         inode = io_end->inode;
   
         /* Add the io_end to per-inode completed io list*/
@@@ -5463,6 -5281,7 +5280,7 @@@ int ext4_setattr(struct dentry *dentry
   {
         struct inode *inode = dentry->d_inode;
         int error, rc = 0;
+       int orphan = 0;
         const unsigned int ia_valid = attr->ia_valid;
   
         error = inode_change_ok(inode, attr);
@@@ -5518,8 -5337,10 +5336,10 @@@
                         error = PTR_ERR(handle);
                         goto err_out;
                 }
- 
-               error = ext4_orphan_add(handle, inode);
+               if (ext4_handle_valid(handle)) {
+                       error = ext4_orphan_add(handle, inode);
+                       orphan = 1;
+               }
                 EXT4_I(inode)->i_disksize = attr->ia_size;
                 rc = ext4_mark_inode_dirty(handle, inode);
                 if (!error)
@@@ -5537,6 -5358,7 +5357,7 @@@
                                         goto err_out;
                                 }
                                 ext4_orphan_del(handle, inode);
+                               orphan = 0;
                                 ext4_journal_stop(handle);
                                 goto err_out;
                         }
@@@ -5559,7 -5381,7 +5380,7 @@@
          * If the call to ext4_truncate failed to get a transaction handle at
          * all, we need to clean up the in-core orphan list manually.
          */
-       if (inode->i_nlink)
+       if (orphan && inode->i_nlink)
                 ext4_orphan_del(NULL, inode);
   
         if (!rc && (ia_valid & ATTR_MODE))
@@@ -5642,7 -5464,7 +5463,7 @@@ static int ext4_index_trans_blocks(stru
    *
    * Also account for superblock, inode, quota and xattr blocks
    */
- int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk)
+ static int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk)
   {
         ext4_group_t groups, ngroups = ext4_get_groups_count(inode->i_sb);
         int gdpblocks;
diff --combined fs/ext4/mballoc.c

index 42f77b1dc72d810f48deb583baeb0f984c6fbc61,328ea9cec57b6bc2f7474ec9fb90997a4edb3fba..c58eba34724a4281f1cb8bb405ab607ef09b49ff
--- 1/fs/ext4/mballoc.c
--- 2/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@@ -338,6 -338,14 +338,14 @@@
   static struct kmem_cache *ext4_pspace_cachep;
   static struct kmem_cache *ext4_ac_cachep;
   static struct kmem_cache *ext4_free_ext_cachep;
+ 
+ /* We create slab caches for groupinfo data structures based on the
+  * superblock block size.  There will be one per mounted filesystem for
+  * each unique s_blocksize_bits */
+ #define NR_GRPINFO_CACHES     \
+       (EXT4_MAX_BLOCK_LOG_SIZE - EXT4_MIN_BLOCK_LOG_SIZE + 1)
+ static struct kmem_cache *ext4_groupinfo_caches[NR_GRPINFO_CACHES];
+ 
   static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
                                         ext4_group_t group);
   static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
@@@ -938,6 -946,85 +946,85 @@@ out
         return err;
   }
   
+ /*
+  * Lock the group_info alloc_sem of all the groups
+  * belonging to the same buddy cache page. This
+  * makes sure other parallel operations on the buddy
+  * cache don't happen while holding the buddy cache
+  * lock.
+  */
+ static int ext4_mb_get_buddy_cache_lock(struct super_block *sb,
+                                       ext4_group_t group)
+ {
+       int i;
+       int block, pnum;
+       int blocks_per_page;
+       int groups_per_page;
+       ext4_group_t ngroups = ext4_get_groups_count(sb);
+       ext4_group_t first_group;
+       struct ext4_group_info *grp;
+ 
+       blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
+       /*
+        * the buddy cache inode stores the block bitmap
+        * and buddy information in consecutive blocks.
+        * So for each group we need two blocks.
+        */
+       block = group * 2;
+       pnum = block / blocks_per_page;
+       first_group = pnum * blocks_per_page / 2;
+ 
+       groups_per_page = blocks_per_page >> 1;
+       if (groups_per_page == 0)
+               groups_per_page = 1;
+       /* read all groups the page covers into the cache */
+       for (i = 0; i < groups_per_page; i++) {
+ 
+               if ((first_group + i) >= ngroups)
+                       break;
+               grp = ext4_get_group_info(sb, first_group + i);
+               /* take the write allocation semaphore of
+                * every group. This makes sure there is
+                * no block allocation going on in any
+                * of those groups
+                */
+               down_write_nested(&grp->alloc_sem, i);
+       }
+       return i;
+ }
+ 
+ static void ext4_mb_put_buddy_cache_lock(struct super_block *sb,
+                                        ext4_group_t group, int locked_group)
+ {
+       int i;
+       int block, pnum;
+       int blocks_per_page;
+       ext4_group_t first_group;
+       struct ext4_group_info *grp;
+ 
+       blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
+       /*
+        * the buddy cache inode stores the block bitmap
+        * and buddy information in consecutive blocks.
+        * So for each group we need two blocks.
+        */
+       block = group * 2;
+       pnum = block / blocks_per_page;
+       first_group = pnum * blocks_per_page / 2;
+       /* release locks on all the groups */
+       for (i = 0; i < locked_group; i++) {
+ 
+               grp = ext4_get_group_info(sb, first_group + i);
+               /* release the write allocation
+                * semaphore of every locked group
+                * (the counterpart of the down_write in
+                * ext4_mb_get_buddy_cache_lock())
+                */
+               up_write(&grp->alloc_sem);
+       }
+ 
+ }
+ 
   /*
    * Locking note:  This routine calls ext4_mb_init_cache(), which takes the
    * block group lock of all groups for this page; do not hold the BG lock when
@@@ -1915,84 -2002,6 +2002,6 @@@ static int ext4_mb_good_group(struct ex
         return 0;
   }
   
- /*
-  * lock the group_info alloc_sem of all the groups
-  * belonging to the same buddy cache page. This
-  * make sure other parallel operation on the buddy
-  * cache doesn't happen  whild holding the buddy cache
-  * lock
-  */
- int ext4_mb_get_buddy_cache_lock(struct super_block *sb, ext4_group_t group)
- {
-       int i;
-       int block, pnum;
-       int blocks_per_page;
-       int groups_per_page;
-       ext4_group_t ngroups = ext4_get_groups_count(sb);
-       ext4_group_t first_group;
-       struct ext4_group_info *grp;
- 
-       blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
-       /*
-        * the buddy cache inode stores the block bitmap
-        * and buddy information in consecutive blocks.
-        * So for each group we need two blocks.
-        */
-       block = group * 2;
-       pnum = block / blocks_per_page;
-       first_group = pnum * blocks_per_page / 2;
- 
-       groups_per_page = blocks_per_page >> 1;
-       if (groups_per_page == 0)
-               groups_per_page = 1;
-       /* read all groups the page covers into the cache */
-       for (i = 0; i < groups_per_page; i++) {
- 
-               if ((first_group + i) >= ngroups)
-                       break;
-               grp = ext4_get_group_info(sb, first_group + i);
-               /* take all groups write allocation
-                * semaphore. This make sure there is
-                * no block allocation going on in any
-                * of that groups
-                */
-               down_write_nested(&grp->alloc_sem, i);
-       }
-       return i;
- }
- 
- void ext4_mb_put_buddy_cache_lock(struct super_block *sb,
-                                       ext4_group_t group, int locked_group)
- {
-       int i;
-       int block, pnum;
-       int blocks_per_page;
-       ext4_group_t first_group;
-       struct ext4_group_info *grp;
- 
-       blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
-       /*
-        * the buddy cache inode stores the block bitmap
-        * and buddy information in consecutive blocks.
-        * So for each group we need two blocks.
-        */
-       block = group * 2;
-       pnum = block / blocks_per_page;
-       first_group = pnum * blocks_per_page / 2;
-       /* release locks on all the groups */
-       for (i = 0; i < locked_group; i++) {
- 
-               grp = ext4_get_group_info(sb, first_group + i);
-               /* take all groups write allocation
-                * semaphore. This make sure there is
-                * no block allocation going on in any
-                * of that groups
-                */
-               up_write(&grp->alloc_sem);
-       }
- 
- }
- 
   static noinline_for_stack int
   ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
   {
@@@ -2233,15 -2242,24 +2242,24 @@@ static const struct file_operations ext
         .release        = seq_release,
   };
   
+ /*
+  * Look up the slab cache used for ext4_group_info allocations for a given
+  * block size.  The caches live in ext4_groupinfo_caches[], indexed by
+  * (blocksize_bits - EXT4_MIN_BLOCK_LOG_SIZE).  A missing cache is treated
+  * as a programming error: the function BUG()s rather than returning NULL.
+  */
+ static struct kmem_cache *get_groupinfo_cache(int blocksize_bits)
+ {
+       int cache_index = blocksize_bits - EXT4_MIN_BLOCK_LOG_SIZE;
+       struct kmem_cache *cachep = ext4_groupinfo_caches[cache_index];
+ 
+       BUG_ON(!cachep);
+       return cachep;
+ }
   
   /* Create and initialize ext4_group_info data for the given group. */
   int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
                           struct ext4_group_desc *desc)
   {
-       int i, len;
+       int i;
         int metalen = 0;
         struct ext4_sb_info *sbi = EXT4_SB(sb);
         struct ext4_group_info **meta_group_info;
+       struct kmem_cache *cachep = get_groupinfo_cache(sb->s_blocksize_bits);
   
         /*
          * First check if this group is the first of a reserved block.
@@@ -2261,22 -2279,16 +2279,16 @@@
                         meta_group_info;
         }
   
-       /*
-        * calculate needed size. if change bb_counters size,
-        * don't forget about ext4_mb_generate_buddy()
-        */
-       len = offsetof(typeof(**meta_group_info),
-                      bb_counters[sb->s_blocksize_bits + 2]);
- 
         meta_group_info =
                 sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)];
         i = group & (EXT4_DESC_PER_BLOCK(sb) - 1);
   
-       meta_group_info[i] = kzalloc(len, GFP_KERNEL);
+       meta_group_info[i] = kmem_cache_alloc(cachep, GFP_KERNEL);
         if (meta_group_info[i] == NULL) {
                 printk(KERN_ERR "EXT4-fs: can't allocate buddy mem\n");
                 goto exit_group_info;
         }
+       memset(meta_group_info[i], 0, kmem_cache_size(cachep));
         set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT,
                 &(meta_group_info[i]->bb_state));
   
@@@ -2331,6 -2343,7 +2343,7 @@@ static int ext4_mb_init_backend(struct 
         int num_meta_group_infos_max;
         int array_size;
         struct ext4_group_desc *desc;
+       struct kmem_cache *cachep;
   
         /* This is the number of blocks used by GDT */
         num_meta_group_infos = (ngroups + EXT4_DESC_PER_BLOCK(sb) -
@@@ -2373,7 -2386,6 +2386,7 @@@
                 printk(KERN_ERR "EXT4-fs: can't get new inode\n");
                 goto err_freesgi;
         }
+ +      sbi->s_buddy_cache->i_ino = get_next_ino();
         EXT4_I(sbi->s_buddy_cache)->i_disksize = 0;
         for (i = 0; i < ngroups; i++) {
                 desc = ext4_get_group_desc(sb, i, NULL);
@@@ -2389,8 -2401,9 +2402,9 @@@
         return 0;
   
   err_freebuddy:
+       cachep = get_groupinfo_cache(sb->s_blocksize_bits);
         while (i-- > 0)
-               kfree(ext4_get_group_info(sb, i));
+               kmem_cache_free(cachep, ext4_get_group_info(sb, i));
         i = num_meta_group_infos;
         while (i-- > 0)
                 kfree(sbi->s_group_info[i]);
@@@ -2407,19 -2420,48 +2421,48 @@@ int ext4_mb_init(struct super_block *sb
         unsigned offset;
         unsigned max;
         int ret;
+       int cache_index;
+       struct kmem_cache *cachep;
+       char *namep = NULL;
   
         i = (sb->s_blocksize_bits + 2) * sizeof(*sbi->s_mb_offsets);
   
         sbi->s_mb_offsets = kmalloc(i, GFP_KERNEL);
         if (sbi->s_mb_offsets == NULL) {
-               return -ENOMEM;
+               ret = -ENOMEM;
+               goto out;
         }
   
         i = (sb->s_blocksize_bits + 2) * sizeof(*sbi->s_mb_maxs);
         sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL);
         if (sbi->s_mb_maxs == NULL) {
-               kfree(sbi->s_mb_offsets);
-               return -ENOMEM;
+               ret = -ENOMEM;
+               goto out;
+       }
+ 
+       cache_index = sb->s_blocksize_bits - EXT4_MIN_BLOCK_LOG_SIZE;
+       cachep = ext4_groupinfo_caches[cache_index];
+       if (!cachep) {
+               char name[32];
+               int len = offsetof(struct ext4_group_info,
+                                       bb_counters[sb->s_blocksize_bits + 2]);
+ 
+               sprintf(name, "ext4_groupinfo_%d", sb->s_blocksize_bits);
+               namep = kstrdup(name, GFP_KERNEL);
+               if (!namep) {
+                       ret = -ENOMEM;
+                       goto out;
+               }
+ 
+               /* Need to free the kmem_cache_name() when we
+                * destroy the slab */
+               cachep = kmem_cache_create(namep, len, 0,
+                                            SLAB_RECLAIM_ACCOUNT, NULL);
+               if (!cachep) {
+                       ret = -ENOMEM;
+                       goto out;
+               }
+               ext4_groupinfo_caches[cache_index] = cachep;
+               /*
+                * The name string now belongs to the registered cache and
+                * is released in ext4_exit_mballoc().  Drop our reference
+                * so that a later failure in this function does not
+                * kfree() it at the "out" label while the cache still
+                * points at it.
+                */
+               namep = NULL;
         }
   
         /* order 0 is regular bitmap */
@@@ -2440,9 -2482,7 +2483,7 @@@
         /* init file for buddy data */
         ret = ext4_mb_init_backend(sb);
         if (ret != 0) {
-               kfree(sbi->s_mb_offsets);
-               kfree(sbi->s_mb_maxs);
-               return ret;
+               goto out;
         }
   
         spin_lock_init(&sbi->s_md_lock);
@@@ -2457,9 -2497,8 +2498,8 @@@
   
         sbi->s_locality_groups = alloc_percpu(struct ext4_locality_group);
         if (sbi->s_locality_groups == NULL) {
-               kfree(sbi->s_mb_offsets);
-               kfree(sbi->s_mb_maxs);
-               return -ENOMEM;
+               ret = -ENOMEM;
+               goto out;
         }
         for_each_possible_cpu(i) {
                 struct ext4_locality_group *lg;
@@@ -2476,7 -2515,13 +2516,13 @@@
   
         if (sbi->s_journal)
                 sbi->s_journal->j_commit_callback = release_blocks_on_commit;
-       return 0;
+ out:
+       if (ret) {
+               kfree(sbi->s_mb_offsets);
+               kfree(sbi->s_mb_maxs);
+               kfree(namep);
+       }
+       return ret;
   }
   
   /* need to called with the ext4 group lock held */
@@@ -2504,6 -2549,7 +2550,7 @@@ int ext4_mb_release(struct super_block 
         int num_meta_group_infos;
         struct ext4_group_info *grinfo;
         struct ext4_sb_info *sbi = EXT4_SB(sb);
+       struct kmem_cache *cachep = get_groupinfo_cache(sb->s_blocksize_bits);
   
         if (sbi->s_group_info) {
                 for (i = 0; i < ngroups; i++) {
@@@ -2514,7 -2560,7 +2561,7 @@@
                         ext4_lock_group(sb, i);
                         ext4_mb_cleanup_pa(grinfo);
                         ext4_unlock_group(sb, i);
-                       kfree(grinfo);
+                       kmem_cache_free(cachep, grinfo);
                 }
                 num_meta_group_infos = (ngroups +
                                 EXT4_DESC_PER_BLOCK(sb) - 1) >>
@@@ -2558,7 -2604,7 +2605,7 @@@
         return 0;
   }
   
- static inline void ext4_issue_discard(struct super_block *sb,
+ static inline int ext4_issue_discard(struct super_block *sb,
                 ext4_group_t block_group, ext4_grpblk_t block, int count)
   {
         int ret;
@@@ -2567,11 -2613,12 +2614,12 @@@
         discard_block = block + ext4_group_first_block_no(sb, block_group);
         trace_ext4_discard_blocks(sb,
                         (unsigned long long) discard_block, count);
- -      ret = sb_issue_discard(sb, discard_block, count);
+ +      ret = sb_issue_discard(sb, discard_block, count, GFP_NOFS, 0);
-       if (ret == EOPNOTSUPP) {
+       if (ret == -EOPNOTSUPP) {
                 ext4_warning(sb, "discard not supported, disabling");
                 clear_opt(EXT4_SB(sb)->s_mount_opt, DISCARD);
         }
+       return ret;
   }
   
   /*
@@@ -2659,28 -2706,22 +2707,22 @@@ static void ext4_remove_debugfs_entry(v
   
   #endif
   
- int __init init_ext4_mballoc(void)
+ int __init ext4_init_mballoc(void)
   {
-       ext4_pspace_cachep =
-               kmem_cache_create("ext4_prealloc_space",
-                                    sizeof(struct ext4_prealloc_space),
-                                    0, SLAB_RECLAIM_ACCOUNT, NULL);
+       ext4_pspace_cachep = KMEM_CACHE(ext4_prealloc_space,
+                                       SLAB_RECLAIM_ACCOUNT);
         if (ext4_pspace_cachep == NULL)
                 return -ENOMEM;
   
-       ext4_ac_cachep =
-               kmem_cache_create("ext4_alloc_context",
-                                    sizeof(struct ext4_allocation_context),
-                                    0, SLAB_RECLAIM_ACCOUNT, NULL);
+       ext4_ac_cachep = KMEM_CACHE(ext4_allocation_context,
+                                   SLAB_RECLAIM_ACCOUNT);
         if (ext4_ac_cachep == NULL) {
                 kmem_cache_destroy(ext4_pspace_cachep);
                 return -ENOMEM;
         }
   
-       ext4_free_ext_cachep =
-               kmem_cache_create("ext4_free_block_extents",
-                                    sizeof(struct ext4_free_data),
-                                    0, SLAB_RECLAIM_ACCOUNT, NULL);
+       ext4_free_ext_cachep = KMEM_CACHE(ext4_free_data,
+                                         SLAB_RECLAIM_ACCOUNT);
         if (ext4_free_ext_cachep == NULL) {
                 kmem_cache_destroy(ext4_pspace_cachep);
                 kmem_cache_destroy(ext4_ac_cachep);
@@@ -2690,8 -2731,9 +2732,9 @@@
         return 0;
   }
   
- void exit_ext4_mballoc(void)
+ void ext4_exit_mballoc(void)
   {
+       int i;
         /*
          * Wait for completion of call_rcu()'s on ext4_pspace_cachep
          * before destroying the slab cache.
@@@ -2700,6 -2742,15 +2743,15 @@@
         kmem_cache_destroy(ext4_pspace_cachep);
         kmem_cache_destroy(ext4_ac_cachep);
         kmem_cache_destroy(ext4_free_ext_cachep);
+ 
+       for (i = 0; i < NR_GRPINFO_CACHES; i++) {
+               struct kmem_cache *cachep = ext4_groupinfo_caches[i];
+               if (cachep) {
+                       char *name = (char *)kmem_cache_name(cachep);
+                       kmem_cache_destroy(cachep);
+                       kfree(name);
+               }
+       }
         ext4_remove_debugfs_entry();
   }
   
@@@ -3536,8 -3587,7 +3588,7 @@@ static int ext4_mb_new_preallocation(st
    */
   static noinline_for_stack int
   ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
-                       struct ext4_prealloc_space *pa,
-                       struct ext4_allocation_context *ac)
+                       struct ext4_prealloc_space *pa)
   {
         struct super_block *sb = e4b->bd_sb;
         struct ext4_sb_info *sbi = EXT4_SB(sb);
@@@ -3555,11 -3605,6 +3606,6 @@@
         BUG_ON(group != e4b->bd_group && pa->pa_len != 0);
         end = bit + pa->pa_len;
   
-       if (ac) {
-               ac->ac_sb = sb;
-               ac->ac_inode = pa->pa_inode;
-       }
- 
         while (bit < end) {
                 bit = mb_find_next_zero_bit(bitmap_bh->b_data, end, bit);
                 if (bit >= end)
@@@ -3570,16 -3615,9 +3616,9 @@@
                          (unsigned) next - bit, (unsigned) group);
                 free += next - bit;
   
-               if (ac) {
-                       ac->ac_b_ex.fe_group = group;
-                       ac->ac_b_ex.fe_start = bit;
-                       ac->ac_b_ex.fe_len = next - bit;
-                       ac->ac_b_ex.fe_logical = 0;
-                       trace_ext4_mballoc_discard(ac);
-               }
- 
-               trace_ext4_mb_release_inode_pa(sb, ac, pa, grp_blk_start + bit,
-                                              next - bit);
+               trace_ext4_mballoc_discard(sb, NULL, group, bit, next - bit);
+               trace_ext4_mb_release_inode_pa(sb, pa->pa_inode, pa,
+                                              grp_blk_start + bit, next - bit);
                 mb_free_blocks(pa->pa_inode, e4b, bit, next - bit);
                 bit = next + 1;
         }
@@@ -3602,29 -3640,19 +3641,19 @@@
   
   static noinline_for_stack int
   ext4_mb_release_group_pa(struct ext4_buddy *e4b,
-                               struct ext4_prealloc_space *pa,
-                               struct ext4_allocation_context *ac)
+                               struct ext4_prealloc_space *pa)
   {
         struct super_block *sb = e4b->bd_sb;
         ext4_group_t group;
         ext4_grpblk_t bit;
   
-       trace_ext4_mb_release_group_pa(sb, ac, pa);
+       trace_ext4_mb_release_group_pa(sb, pa);
         BUG_ON(pa->pa_deleted == 0);
         ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit);
         BUG_ON(group != e4b->bd_group && pa->pa_len != 0);
         mb_free_blocks(pa->pa_inode, e4b, bit, pa->pa_len);
         atomic_add(pa->pa_len, &EXT4_SB(sb)->s_mb_discarded);
- 
-       if (ac) {
-               ac->ac_sb = sb;
-               ac->ac_inode = NULL;
-               ac->ac_b_ex.fe_group = group;
-               ac->ac_b_ex.fe_start = bit;
-               ac->ac_b_ex.fe_len = pa->pa_len;
-               ac->ac_b_ex.fe_logical = 0;
-               trace_ext4_mballoc_discard(ac);
-       }
+       trace_ext4_mballoc_discard(sb, NULL, group, bit, pa->pa_len);
   
         return 0;
   }
@@@ -3645,7 -3673,6 +3674,6 @@@ ext4_mb_discard_group_preallocations(st
         struct ext4_group_info *grp = ext4_get_group_info(sb, group);
         struct buffer_head *bitmap_bh = NULL;
         struct ext4_prealloc_space *pa, *tmp;
-       struct ext4_allocation_context *ac;
         struct list_head list;
         struct ext4_buddy e4b;
         int err;
@@@ -3674,9 -3701,6 +3702,6 @@@
                 needed = EXT4_BLOCKS_PER_GROUP(sb) + 1;
   
         INIT_LIST_HEAD(&list);
-       ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);
-       if (ac)
-               ac->ac_sb = sb;
   repeat:
         ext4_lock_group(sb, group);
         list_for_each_entry_safe(pa, tmp,
@@@ -3731,9 -3755,9 +3756,9 @@@
                 spin_unlock(pa->pa_obj_lock);
   
                 if (pa->pa_type == MB_GROUP_PA)
-                       ext4_mb_release_group_pa(&e4b, pa, ac);
+                       ext4_mb_release_group_pa(&e4b, pa);
                 else
-                       ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa, ac);
+                       ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa);
   
                 list_del(&pa->u.pa_tmp_list);
                 call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
@@@ -3741,8 -3765,6 +3766,6 @@@
   
   out:
         ext4_unlock_group(sb, group);
-       if (ac)
-               kmem_cache_free(ext4_ac_cachep, ac);
         ext4_mb_unload_buddy(&e4b);
         put_bh(bitmap_bh);
         return free;
@@@ -3763,7 -3785,6 +3786,6 @@@ void ext4_discard_preallocations(struc
         struct super_block *sb = inode->i_sb;
         struct buffer_head *bitmap_bh = NULL;
         struct ext4_prealloc_space *pa, *tmp;
-       struct ext4_allocation_context *ac;
         ext4_group_t group = 0;
         struct list_head list;
         struct ext4_buddy e4b;
@@@ -3779,11 -3800,6 +3801,6 @@@
   
         INIT_LIST_HEAD(&list);
   
-       ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);
-       if (ac) {
-               ac->ac_sb = sb;
-               ac->ac_inode = inode;
-       }
   repeat:
         /* first, collect all pa's in the inode */
         spin_lock(&ei->i_prealloc_lock);
@@@ -3853,7 -3869,7 +3870,7 @@@
   
                 ext4_lock_group(sb, group);
                 list_del(&pa->pa_group_list);
-               ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa, ac);
+               ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa);
                 ext4_unlock_group(sb, group);
   
                 ext4_mb_unload_buddy(&e4b);
@@@ -3862,8 -3878,6 +3879,6 @@@
                 list_del(&pa->u.pa_tmp_list);
                 call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
         }
-       if (ac)
-               kmem_cache_free(ext4_ac_cachep, ac);
   }
   
   /*
@@@ -4061,14 -4075,10 +4076,10 @@@ ext4_mb_discard_lg_preallocations(struc
         struct ext4_buddy e4b;
         struct list_head discard_list;
         struct ext4_prealloc_space *pa, *tmp;
-       struct ext4_allocation_context *ac;
   
         mb_debug(1, "discard locality group preallocation\n");
   
         INIT_LIST_HEAD(&discard_list);
-       ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);
-       if (ac)
-               ac->ac_sb = sb;
   
         spin_lock(&lg->lg_prealloc_lock);
         list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[order],
@@@ -4120,15 -4130,13 +4131,13 @@@
                 }
                 ext4_lock_group(sb, group);
                 list_del(&pa->pa_group_list);
-               ext4_mb_release_group_pa(&e4b, pa, ac);
+               ext4_mb_release_group_pa(&e4b, pa);
                 ext4_unlock_group(sb, group);
   
                 ext4_mb_unload_buddy(&e4b);
                 list_del(&pa->u.pa_tmp_list);
                 call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
         }
-       if (ac)
-               kmem_cache_free(ext4_ac_cachep, ac);
   }
   
   /*
@@@ -4492,7 -4500,6 +4501,6 @@@ void ext4_free_blocks(handle_t *handle
   {
         struct buffer_head *bitmap_bh = NULL;
         struct super_block *sb = inode->i_sb;
-       struct ext4_allocation_context *ac = NULL;
         struct ext4_group_desc *gdp;
         unsigned long freed = 0;
         unsigned int overflow;
@@@ -4532,6 -4539,8 +4540,8 @@@
                         if (!bh)
                                 tbh = sb_find_get_block(inode->i_sb,
                                                         block + i);
+                       if (unlikely(!tbh))
+                               continue;
                         ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA,
                                     inode, tbh, block + i);
                 }
@@@ -4547,12 -4556,6 +4557,6 @@@
         if (!ext4_should_writeback_data(inode))
                 flags |= EXT4_FREE_BLOCKS_METADATA;
   
-       ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);
-       if (ac) {
-               ac->ac_inode = inode;
-               ac->ac_sb = sb;
-       }
- 
   do_more:
         overflow = 0;
         ext4_get_group_no_and_offset(sb, block, &block_group, &bit);
@@@ -4610,12 -4613,7 +4614,7 @@@
                         BUG_ON(!mb_test_bit(bit + i, bitmap_bh->b_data));
         }
   #endif
-       if (ac) {
-               ac->ac_b_ex.fe_group = block_group;
-               ac->ac_b_ex.fe_start = bit;
-               ac->ac_b_ex.fe_len = count;
-               trace_ext4_mballoc_free(ac);
-       }
+       trace_ext4_mballoc_free(sb, inode, block_group, bit, count);
   
         err = ext4_mb_load_buddy(sb, block_group, &e4b);
         if (err)
@@@ -4641,12 -4639,12 +4640,12 @@@
                  * with group lock held. generate_buddy look at
                  * them with group lock_held
                  */
+               if (test_opt(sb, DISCARD))
+                       ext4_issue_discard(sb, block_group, bit, count);
                 ext4_lock_group(sb, block_group);
                 mb_clear_bits(bitmap_bh->b_data, bit, count);
                 mb_free_blocks(inode, &e4b, bit, count);
                 ext4_mb_return_to_preallocation(inode, &e4b, block, count);
-               if (test_opt(sb, DISCARD))
-                       ext4_issue_discard(sb, block_group, bit, count);
         }
   
         ret = ext4_free_blks_count(sb, gdp) + count;
@@@ -4686,7 -4684,190 +4685,190 @@@ error_return
                 dquot_free_block(inode, freed);
         brelse(bitmap_bh);
         ext4_std_error(sb, err);
-       if (ac)
-               kmem_cache_free(ext4_ac_cachep, ac);
         return;
   }
+ 
+ /**
+  * ext4_trim_extent -- function to TRIM one single free extent in the group
+  * @sb:               super block for the file system
+  * @start:    starting block of the free extent in the alloc. group
+  * @count:    number of blocks to TRIM
+  * @group:    alloc. group we are working with
+  * @e4b:      ext4 buddy for the group
+  *
+  * Trim "count" blocks starting at "start" in the "group". To ensure that no
+  * one will allocate those blocks while the discard is in flight, they are
+  * marked as used in the buddy bitmap first and freed again afterwards.
+  *
+  * This must be called with the group lock held. The lock is dropped while
+  * the discard is issued and re-taken before returning, so the caller still
+  * holds it on return.
+  *
+  * Returns the value returned by ext4_issue_discard().
+  */
+ static int ext4_trim_extent(struct super_block *sb, int start, int count,
+               ext4_group_t group, struct ext4_buddy *e4b)
+ {
+       struct ext4_free_extent ex;
+       int ret = 0;
+ 
+       assert_spin_locked(ext4_group_lock_ptr(sb, group));
+ 
+       ex.fe_start = start;
+       ex.fe_group = group;
+       ex.fe_len = count;
+ 
+       /*
+        * Mark blocks used, so no one can reuse them while
+        * being trimmed.
+        */
+       mb_mark_used(e4b, &ex);
+       ext4_unlock_group(sb, group);
+ 
+       ret = ext4_issue_discard(sb, group, start, count);
+       if (ret)
+               ext4_std_error(sb, ret);
+ 
+       /* Give the blocks back to the buddy even if the discard failed. */
+       ext4_lock_group(sb, group);
+       mb_free_blocks(NULL, e4b, start, ex.fe_len);
+       return ret;
+ }
+ 
+ /**
+  * ext4_trim_all_free -- function to trim all free space in alloc. group
+  * @sb:                       super block for file system
+  * @e4b:              ext4 buddy
+  * @start:            first group block to examine
+  * @max:              last group block to examine
+  * @minblocks:                minimum extent block count
+  *
+  * ext4_trim_all_free walks through the group's block bitmap searching for
+  * free extents. When a free extent of at least @minblocks blocks is found,
+  * ext4_trim_extent is called on it: the extent is marked as used in the
+  * group buddy bitmap, a TRIM command is issued for it, and it is then freed
+  * again in the buddy bitmap. This is repeated until @max is reached, an
+  * error occurs, a fatal signal is pending, or the free blocks not yet
+  * counted as trimmed drop below @minblocks.
+  *
+  * Returns the number of blocks trimmed, -ERESTARTSYS if a fatal signal was
+  * pending, or the negative error returned by ext4_trim_extent.
+  */
+ ext4_grpblk_t ext4_trim_all_free(struct super_block *sb, struct ext4_buddy *e4b,
+               ext4_grpblk_t start, ext4_grpblk_t max, ext4_grpblk_t minblocks)
+ {
+       void *bitmap;
+       ext4_grpblk_t next, count = 0;
+       ext4_group_t group;
+       int ret = 0;
+ 
+       BUG_ON(e4b == NULL);
+ 
+       bitmap = e4b->bd_bitmap;
+       group = e4b->bd_group;
+       /* Nothing is free below bb_first_free, so do not start before it. */
+       start = (e4b->bd_info->bb_first_free > start) ?
+               e4b->bd_info->bb_first_free : start;
+       ext4_lock_group(sb, group);
+ 
+       while (start < max) {
+               /* [start, next) is the next run of clear (free) bits. */
+               start = mb_find_next_zero_bit(bitmap, max, start);
+               if (start >= max)
+                       break;
+               next = mb_find_next_bit(bitmap, max, start);
+ 
+               if ((next - start) >= minblocks) {
+                       ret = ext4_trim_extent(sb, start,
+                               next - start, group, e4b);
+                       if (ret < 0)
+                               break;
+                       count += next - start;
+               }
+               /* Bit "next" is in use (or is max), so resume just past it. */
+               start = next + 1;
+ 
+               if (fatal_signal_pending(current)) {
+                       count = -ERESTARTSYS;
+                       break;
+               }
+ 
+               /* Do not hold the group lock across a reschedule. */
+               if (need_resched()) {
+                       ext4_unlock_group(sb, group);
+                       cond_resched();
+                       ext4_lock_group(sb, group);
+               }
+ 
+               /* Too little free space left to contain another extent. */
+               if ((e4b->bd_info->bb_free - count) < minblocks)
+                       break;
+       }
+       ext4_unlock_group(sb, group);
+ 
+       ext4_debug("trimmed %d blocks in the group %d\n",
+               count, group);
+ 
+       if (ret < 0)
+               count = ret;
+ 
+       return count;
+ }
+ 
+ /**
+  * ext4_trim_fs() -- trim ioctl handle function
+  * @sb:                       superblock for filesystem
+  * @range:            fstrim_range structure
+  *
+  * start:     First Byte to trim
+  * len:               number of Bytes to trim from start
+  * minlen:    minimum extent length in Bytes
+  * ext4_trim_fs goes through all allocation groups containing Bytes from
+  * start to start+len. For each such a group ext4_trim_all_free function
+  * is invoked to trim all free space.
+  *
+  * On return range->len is overwritten with the number of bytes trimmed.
+  * Returns 0 on success, -EINVAL if minlen exceeds a group or the range lies
+  * beyond the last group, or the error from loading a buddy or trimming.
+  */
+ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
+ {
+       struct ext4_buddy e4b;
+       ext4_group_t first_group, last_group;
+       ext4_group_t group, ngroups = ext4_get_groups_count(sb);
+       ext4_grpblk_t cnt = 0, first_block, last_block;
+       uint64_t start, len, minlen, trimmed;
+       int ret = 0;
+ 
+       /* The ioctl speaks bytes; everything below works in fs blocks. */
+       start = range->start >> sb->s_blocksize_bits;
+       len = range->len >> sb->s_blocksize_bits;
+       minlen = range->minlen >> sb->s_blocksize_bits;
+       trimmed = 0;
+ 
+       if (unlikely(minlen > EXT4_BLOCKS_PER_GROUP(sb)))
+               return -EINVAL;
+ 
+       /* Determine first and last group to examine based on start and len */
+       ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) start,
+                                    &first_group, &first_block);
+       ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) (start + len),
+                                    &last_group, &last_block);
+       last_group = (last_group > ngroups - 1) ? ngroups - 1 : last_group;
+       last_block = EXT4_BLOCKS_PER_GROUP(sb);
+ 
+       if (first_group > last_group)
+               return -EINVAL;
+ 
+       for (group = first_group; group <= last_group; group++) {
+               ret = ext4_mb_load_buddy(sb, group, &e4b);
+               if (ret) {
+                       ext4_error(sb, "Error in loading buddy "
+                                       "information for %u", group);
+                       break;
+               }
+ 
+               /*
+                * For every group but the one in which the range ends,
+                * the last block is EXT4_BLOCKS_PER_GROUP(sb).  The end
+                * within the final group must be measured from first_block,
+                * not from the start of the group, otherwise a range that
+                * begins in the middle of a group is cut short or missed.
+                */
+               if (first_block + len < EXT4_BLOCKS_PER_GROUP(sb))
+                       last_block = first_block + len;
+               len -= last_block - first_block;
+ 
+               if (e4b.bd_info->bb_free >= minlen) {
+                       cnt = ext4_trim_all_free(sb, &e4b, first_block,
+                                               last_block, minlen);
+                       if (cnt < 0) {
+                               ret = cnt;
+                               ext4_mb_unload_buddy(&e4b);
+                               break;
+                       }
+                       /*
+                        * Only account for groups that were actually
+                        * trimmed; a skipped group must not re-add the
+                        * count left over from the previous one.
+                        */
+                       trimmed += cnt;
+               }
+               ext4_mb_unload_buddy(&e4b);
+               first_block = 0;
+       }
+       range->len = trimmed * sb->s_blocksize;
+ 
+       return ret;
+ }
diff --combined fs/ext4/namei.c

index bd39885b599854329922fa443a016e405aeac1f9,86a7870babbde3c35c2ea2e1b320473fa8aed2c5..92203b8a099f076ebbc0d794b3356289096c663f
--- 1/fs/ext4/namei.c
--- 2/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@@ -856,6 -856,7 +856,7 @@@ static struct buffer_head * ext4_find_e
         struct buffer_head *bh_use[NAMEI_RA_SIZE];
         struct buffer_head *bh, *ret = NULL;
         ext4_lblk_t start, block, b;
+       const u8 *name = d_name->name;
         int ra_max = 0;         /* Number of bh's in the readahead
                                    buffer, bh_use[] */
         int ra_ptr = 0;         /* Current index into readahead
@@@ -870,6 -871,16 +871,16 @@@
         namelen = d_name->len;
         if (namelen > EXT4_NAME_LEN)
                 return NULL;
+       if ((namelen <= 2) && (name[0] == '.') &&
+           (name[1] == '.' || name[1] == '\0')) {
+               /*
+                * "." or ".." will only be in the first block
+                * NFS may look up ".."; "." should be handled by the VFS
+                *
+                * name[1] is compared against the NUL terminator (not the
+                * digit '0') so that "." is recognised and a two-character
+                * name such as ".0" still goes through the normal lookup.
+                */
+               block = start = 0;
+               nblocks = 1;
+               goto restart;
+       }
         if (is_dx(dir)) {
                 bh = ext4_dx_find_entry(dir, d_name, res_dir, &err);
                 /*
@@@ -960,55 -971,35 +971,35 @@@ cleanup_and_exit
   static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct qstr *d_name,
                        struct ext4_dir_entry_2 **res_dir, int *err)
   {
-       struct super_block * sb;
+       struct super_block * sb = dir->i_sb;
         struct dx_hash_info     hinfo;
-       u32 hash;
         struct dx_frame frames[2], *frame;
-       struct ext4_dir_entry_2 *de, *top;
         struct buffer_head *bh;
         ext4_lblk_t block;
         int retval;
-       int namelen = d_name->len;
-       const u8 *name = d_name->name;
   
-       sb = dir->i_sb;
-       /* NFS may look up ".." - look at dx_root directory block */
-       if (namelen > 2 || name[0] != '.'||(name[1] != '.' && name[1] != '\0')){
-               if (!(frame = dx_probe(d_name, dir, &hinfo, frames, err)))
-                       return NULL;
-       } else {
-               frame = frames;
-               frame->bh = NULL;                       /* for dx_release() */
-               frame->at = (struct dx_entry *)frames;  /* hack for zero entry*/
-               dx_set_block(frame->at, 0);             /* dx_root block is 0 */
-       }
-       hash = hinfo.hash;
+       if (!(frame = dx_probe(d_name, dir, &hinfo, frames, err)))
+               return NULL;
         do {
                 block = dx_get_block(frame->at);
-               if (!(bh = ext4_bread (NULL,dir, block, 0, err)))
+               if (!(bh = ext4_bread(NULL, dir, block, 0, err)))
                         goto errout;
-               de = (struct ext4_dir_entry_2 *) bh->b_data;
-               top = (struct ext4_dir_entry_2 *) ((char *) de + sb->s_blocksize -
-                                      EXT4_DIR_REC_LEN(0));
-               for (; de < top; de = ext4_next_entry(de, sb->s_blocksize)) {
-                       int off = (block << EXT4_BLOCK_SIZE_BITS(sb))
-                                 + ((char *) de - bh->b_data);
- 
-                       if (!ext4_check_dir_entry(dir, de, bh, off)) {
-                               brelse(bh);
-                               *err = ERR_BAD_DX_DIR;
-                               goto errout;
-                       }
   
-                       if (ext4_match(namelen, name, de)) {
-                               *res_dir = de;
-                               dx_release(frames);
-                               return bh;
-                       }
+               retval = search_dirblock(bh, dir, d_name,
+                                        block << EXT4_BLOCK_SIZE_BITS(sb),
+                                        res_dir);
+               if (retval == 1) {      /* Success! */
+                       dx_release(frames);
+                       return bh;
                 }
                 brelse(bh);
+               if (retval == -1) {
+                       *err = ERR_BAD_DX_DIR;
+                       goto errout;
+               }
+ 
                 /* Check to see if we should continue to search */
-               retval = ext4_htree_next_block(dir, hash, frame,
+               retval = ext4_htree_next_block(dir, hinfo.hash, frame,
                                                frames, NULL);
                 if (retval < 0) {
                         ext4_warning(sb,
@@@ -2312,7 -2303,7 +2303,7 @@@ retry
   
         inode->i_ctime = ext4_current_time(inode);
         ext4_inc_count(handle, inode);
- -      atomic_inc(&inode->i_count);
+ +      ihold(inode);
   
         err = ext4_add_entry(handle, dentry, inode);
         if (!err) {
diff --combined fs/ext4/resize.c

index ca5c8aa00a2fe10a621348913a0c85908c6e7f6e,f398474e2784cd0bdd9af0f067cbbdfa6a661b60..dc963929de652cb997550e38338855823832e53c
--- 1/fs/ext4/resize.c
--- 2/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@@ -226,23 -226,13 +226,13 @@@ static int setup_new_group_blocks(struc
         }
   
         /* Zero out all of the reserved backup group descriptor table blocks */
-       for (i = 0, bit = gdblocks + 1, block = start + bit;
-            i < reserved_gdb; i++, block++, bit++) {
-               struct buffer_head *gdb;
- 
-               ext4_debug("clear reserved block %#04llx (+%d)\n", block, bit);
- 
-               if ((err = extend_or_restart_transaction(handle, 1, bh)))
-                       goto exit_bh;
+       ext4_debug("clear inode table blocks %#04llx -> %#04llx\n",
+                       block, sbi->s_itb_per_group);
+       err = sb_issue_zeroout(sb, gdblocks + start + 1, reserved_gdb,
- -                             GFP_NOFS, BLKDEV_IFL_WAIT);
++                             GFP_NOFS);
+       if (err)
+               goto exit_bh;
   
-               if (IS_ERR(gdb = bclean(handle, sb, block))) {
-                       err = PTR_ERR(gdb);
-                       goto exit_bh;
-               }
-               ext4_handle_dirty_metadata(handle, NULL, gdb);
-               ext4_set_bit(bit, bh->b_data);
-               brelse(gdb);
-       }
         ext4_debug("mark block bitmap %#04llx (+%llu)\n", input->block_bitmap,
                    input->block_bitmap - start);
         ext4_set_bit(input->block_bitmap - start, bh->b_data);
@@@ -251,28 -241,19 +241,18 @@@
         ext4_set_bit(input->inode_bitmap - start, bh->b_data);
   
         /* Zero out all of the inode table blocks */
-       for (i = 0, block = input->inode_table, bit = block - start;
-            i < sbi->s_itb_per_group; i++, bit++, block++) {
-               struct buffer_head *it;
- 
-               ext4_debug("clear inode block %#04llx (+%d)\n", block, bit);
- 
-               if ((err = extend_or_restart_transaction(handle, 1, bh)))
-                       goto exit_bh;
- 
-               if (IS_ERR(it = bclean(handle, sb, block))) {
-                       err = PTR_ERR(it);
-                       goto exit_bh;
-               }
-               ext4_handle_dirty_metadata(handle, NULL, it);
-               brelse(it);
-               ext4_set_bit(bit, bh->b_data);
-       }
+       block = input->inode_table;
+       ext4_debug("clear inode table blocks %#04llx -> %#04llx\n",
+                       block, sbi->s_itb_per_group);
- -      err = sb_issue_zeroout(sb, block, sbi->s_itb_per_group,
- -                             GFP_NOFS, BLKDEV_IFL_WAIT);
++      err = sb_issue_zeroout(sb, block, sbi->s_itb_per_group, GFP_NOFS);
+       if (err)
+               goto exit_bh;
   
         if ((err = extend_or_restart_transaction(handle, 2, bh)))
                 goto exit_bh;
   
-       mark_bitmap_end(input->blocks_count, sb->s_blocksize * 8, bh->b_data);
+       ext4_mark_bitmap_end(input->blocks_count, sb->s_blocksize * 8,
+                            bh->b_data);
         ext4_handle_dirty_metadata(handle, NULL, bh);
         brelse(bh);
         /* Mark unused entries in inode bitmap used */
@@@ -283,8 -264,8 +263,8 @@@
                 goto exit_journal;
         }
   
-       mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), sb->s_blocksize * 8,
-                       bh->b_data);
+       ext4_mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), sb->s_blocksize * 8,
+                            bh->b_data);
         ext4_handle_dirty_metadata(handle, NULL, bh);
   exit_bh:
         brelse(bh);
diff --combined fs/ext4/super.c

index 8ecc1e590303841b0c5f13d07568082230acdf4d,3b4984d37a681769dcbcdc4d2682f00218f9c4f7..0348ce0665929f45933bb04fa538d2c18fe863f8
--- 1/fs/ext4/super.c
--- 2/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@@ -26,6 -26,7 +26,6 @@@
   #include <linux/init.h>
   #include <linux/blkdev.h>
   #include <linux/parser.h>
- -#include <linux/smp_lock.h>
   #include <linux/buffer_head.h>
   #include <linux/exportfs.h>
   #include <linux/vfs.h>
@@@ -40,6 -41,9 +40,9 @@@
   #include <linux/crc16.h>
   #include <asm/uaccess.h>
   
+ #include <linux/kthread.h>
+ #include <linux/freezer.h>
+ 
   #include "ext4.h"
   #include "ext4_jbd2.h"
   #include "xattr.h"
@@@ -49,8 -53,11 +52,11 @@@
   #define CREATE_TRACE_POINTS
   #include <trace/events/ext4.h>
   
- struct proc_dir_entry *ext4_proc_root;
+ static struct proc_dir_entry *ext4_proc_root;
   static struct kset *ext4_kset;
+ struct ext4_lazy_init *ext4_li_info;
+ struct mutex ext4_li_mtx;
+ struct ext4_features *ext4_feat;
   
   static int ext4_load_journal(struct super_block *, struct ext4_super_block *,
                              unsigned long journal_devnum);
@@@ -69,6 -76,8 +75,8 @@@ static void ext4_write_super(struct sup
   static int ext4_freeze(struct super_block *sb);
   static int ext4_get_sb(struct file_system_type *fs_type, int flags,
                        const char *dev_name, void *data, struct vfsmount *mnt);
+ static void ext4_destroy_lazyinit_thread(void);
+ static void ext4_unregister_li_request(struct super_block *sb);
   
   #if !defined(CONFIG_EXT3_FS) && !defined(CONFIG_EXT3_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
   static struct file_system_type ext3_fs_type = {
@@@ -701,12 -710,14 +709,13 @@@ static void ext4_put_super(struct super
         struct ext4_super_block *es = sbi->s_es;
         int i, err;
   
+       ext4_unregister_li_request(sb);
         dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED);
   
         flush_workqueue(sbi->dio_unwritten_wq);
         destroy_workqueue(sbi->dio_unwritten_wq);
   
         lock_super(sb);
- -      lock_kernel();
         if (sb->s_dirt)
                 ext4_commit_super(sb, 1);
   
@@@ -717,6 -728,7 +726,7 @@@
                         ext4_abort(sb, "Couldn't clean up the journal");
         }
   
+       del_timer(&sbi->s_err_report);
         ext4_release_system_zone(sb);
         ext4_mb_release(sb);
         ext4_ext_release(sb);
@@@ -773,6 -785,7 +783,6 @@@
          * Now that we are completely done shutting down the
          * superblock, we need to actually destroy the kobject.
          */
- -      unlock_kernel();
         unlock_super(sb);
         kobject_put(&sbi->s_kobj);
         wait_for_completion(&sbi->s_kobj_unregister);
@@@ -1042,6 -1055,12 +1052,12 @@@ static int ext4_show_options(struct seq
             !(def_mount_opts & EXT4_DEFM_BLOCK_VALIDITY))
                 seq_puts(seq, ",block_validity");
   
+       if (!test_opt(sb, INIT_INODE_TABLE))
+               seq_puts(seq, ",noinit_inode_table");
+       else if (sbi->s_li_wait_mult)
+               seq_printf(seq, ",init_inode_table=%u",
+                          (unsigned) sbi->s_li_wait_mult);
+ 
         ext4_show_quota_options(seq, sb);
   
         return 0;
@@@ -1170,6 -1189,7 +1186,7 @@@ static const struct super_operations ex
         .quota_write    = ext4_quota_write,
   #endif
         .bdev_try_to_free_page = bdev_try_to_free_page,
+       .trim_fs        = ext4_trim_fs
   };
   
   static const struct super_operations ext4_nojournal_sops = {
@@@ -1216,6 -1236,7 +1233,7 @@@ enum 
         Opt_inode_readahead_blks, Opt_journal_ioprio,
         Opt_dioread_nolock, Opt_dioread_lock,
         Opt_discard, Opt_nodiscard,
+       Opt_init_inode_table, Opt_noinit_inode_table,
   };
   
   static const match_table_t tokens = {
@@@ -1286,6 -1307,9 +1304,9 @@@
         {Opt_dioread_lock, "dioread_lock"},
         {Opt_discard, "discard"},
         {Opt_nodiscard, "nodiscard"},
+       {Opt_init_inode_table, "init_itable=%u"},
+       {Opt_init_inode_table, "init_itable"},
+       {Opt_noinit_inode_table, "noinit_itable"},
         {Opt_err, NULL},
   };
   
@@@ -1756,6 -1780,20 +1777,20 @@@ set_qf_format
                 case Opt_dioread_lock:
                         clear_opt(sbi->s_mount_opt, DIOREAD_NOLOCK);
                         break;
+               case Opt_init_inode_table:
+                       set_opt(sbi->s_mount_opt, INIT_INODE_TABLE);
+                       if (args[0].from) {
+                               if (match_int(&args[0], &option))
+                                       return 0;
+                       } else
+                               option = EXT4_DEF_LI_WAIT_MULT;
+                       if (option < 0)
+                               return 0;
+                       sbi->s_li_wait_mult = option;
+                       break;
+               case Opt_noinit_inode_table:
+                       clear_opt(sbi->s_mount_opt, INIT_INODE_TABLE);
+                       break;
                 default:
                         ext4_msg(sb, KERN_ERR,
                                "Unrecognized mount option \"%s\" "
@@@ -1939,7 -1977,8 +1974,8 @@@ int ext4_group_desc_csum_verify(struct 
   }
   
   /* Called at mount-time, super-block is locked */
- static int ext4_check_descriptors(struct super_block *sb)
+ static int ext4_check_descriptors(struct super_block *sb,
+                                 ext4_group_t *first_not_zeroed)
   {
         struct ext4_sb_info *sbi = EXT4_SB(sb);
         ext4_fsblk_t first_block = le32_to_cpu(sbi->s_es->s_first_data_block);
@@@ -1948,7 -1987,7 +1984,7 @@@
         ext4_fsblk_t inode_bitmap;
         ext4_fsblk_t inode_table;
         int flexbg_flag = 0;
-       ext4_group_t i;
+       ext4_group_t i, grp = sbi->s_groups_count;
   
         if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG))
                 flexbg_flag = 1;
@@@ -1964,6 -2003,10 +2000,10 @@@
                         last_block = first_block +
                                 (EXT4_BLOCKS_PER_GROUP(sb) - 1);
   
+               if ((grp == sbi->s_groups_count) &&
+                  !(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED)))
+                       grp = i;
+ 
                 block_bitmap = ext4_block_bitmap(sb, gdp);
                 if (block_bitmap < first_block || block_bitmap > last_block) {
                         ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
@@@ -2001,6 -2044,8 +2041,8 @@@
                 if (!flexbg_flag)
                         first_block += EXT4_BLOCKS_PER_GROUP(sb);
         }
+       if (NULL != first_not_zeroed)
+               *first_not_zeroed = grp;
   
         ext4_free_blocks_count_set(sbi->s_es, ext4_count_free_blocks(sb));
         sbi->s_es->s_free_inodes_count =cpu_to_le32(ext4_count_free_inodes(sb));
@@@ -2373,6 -2418,7 +2415,7 @@@ static struct ext4_attr ext4_attr_##_na
   #define EXT4_ATTR(name, mode, show, store) \
   static struct ext4_attr ext4_attr_##name = __ATTR(name, mode, show, store)
   
+ #define EXT4_INFO_ATTR(name) EXT4_ATTR(name, 0444, NULL, NULL)
   #define EXT4_RO_ATTR(name) EXT4_ATTR(name, 0444, name##_show, NULL)
   #define EXT4_RW_ATTR(name) EXT4_ATTR(name, 0644, name##_show, name##_store)
   #define EXT4_RW_ATTR_SBI_UI(name, elname)     \
@@@ -2409,6 -2455,16 +2452,16 @@@ static struct attribute *ext4_attrs[] 
         NULL,
   };
   
+ /* Features this copy of ext4 supports */
+ EXT4_INFO_ATTR(lazy_itable_init);
+ EXT4_INFO_ATTR(batched_discard);
+ 
+ static struct attribute *ext4_feat_attrs[] = {
+       ATTR_LIST(lazy_itable_init),
+       ATTR_LIST(batched_discard),
+       NULL,
+ };
+ 
   static ssize_t ext4_attr_show(struct kobject *kobj,
                               struct attribute *attr, char *buf)
   {
@@@ -2437,7 -2493,6 +2490,6 @@@ static void ext4_sb_release(struct kobj
         complete(&sbi->s_kobj_unregister);
   }
   
- 
   static const struct sysfs_ops ext4_attr_ops = {
         .show   = ext4_attr_show,
         .store  = ext4_attr_store,
@@@ -2449,6 -2504,17 +2501,17 @@@ static struct kobj_type ext4_ktype = 
         .release        = ext4_sb_release,
   };
   
+ static void ext4_feat_release(struct kobject *kobj)
+ {
+       complete(&ext4_feat->f_kobj_unregister);
+ }
+ 
+ static struct kobj_type ext4_feat_ktype = {
+       .default_attrs  = ext4_feat_attrs,
+       .sysfs_ops      = &ext4_attr_ops,
+       .release        = ext4_feat_release,
+ };
+ 
   /*
    * Check whether this filesystem can be mounted based on
    * the features present and the RDONLY/RDWR mount requested.
@@@ -2539,6 -2605,372 +2602,372 @@@ static void print_daily_error_info(unsi
         mod_timer(&sbi->s_err_report, jiffies + 24*60*60*HZ);  /* Once a day */
   }
   
+ static void ext4_lazyinode_timeout(unsigned long data)
+ {
+       struct task_struct *p = (struct task_struct *)data;
+       wake_up_process(p);
+ }
+ 
+ /* Find next suitable group and run ext4_init_inode_table */
+ static int ext4_run_li_request(struct ext4_li_request *elr)
+ {
+       struct ext4_group_desc *gdp = NULL;
+       ext4_group_t group, ngroups;
+       struct super_block *sb;
+       unsigned long timeout = 0;
+       int ret = 0;
+ 
+       sb = elr->lr_super;
+       ngroups = EXT4_SB(sb)->s_groups_count;
+ 
+       for (group = elr->lr_next_group; group < ngroups; group++) {
+               gdp = ext4_get_group_desc(sb, group, NULL);
+               if (!gdp) {
+                       ret = 1;
+                       break;
+               }
+ 
+               if (!(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED)))
+                       break;
+       }
+ 
+       if (group == ngroups)
+               ret = 1;
+ 
+       if (!ret) {
+               timeout = jiffies;
+               ret = ext4_init_inode_table(sb, group,
+                                           elr->lr_timeout ? 0 : 1);
+               if (elr->lr_timeout == 0) {
+                       timeout = jiffies - timeout;
+                       if (elr->lr_sbi->s_li_wait_mult)
+                               timeout *= elr->lr_sbi->s_li_wait_mult;
+                       else
+                               timeout *= 20;
+                       elr->lr_timeout = timeout;
+               }
+               elr->lr_next_sched = jiffies + elr->lr_timeout;
+               elr->lr_next_group = group + 1;
+       }
+ 
+       return ret;
+ }
+ 
+ /*
+  * Remove lr_request from the list_request and free the
+  * request structure. Should be called with li_list_mtx held
+  */
+ static void ext4_remove_li_request(struct ext4_li_request *elr)
+ {
+       struct ext4_sb_info *sbi;
+ 
+       if (!elr)
+               return;
+ 
+       sbi = elr->lr_sbi;
+ 
+       list_del(&elr->lr_request);
+       sbi->s_li_request = NULL;
+       kfree(elr);
+ }
+ 
+ static void ext4_unregister_li_request(struct super_block *sb)
+ {
+       struct ext4_li_request *elr = EXT4_SB(sb)->s_li_request;
+ 
+       if (!ext4_li_info)
+               return;
+ 
+       mutex_lock(&ext4_li_info->li_list_mtx);
+       ext4_remove_li_request(elr);
+       mutex_unlock(&ext4_li_info->li_list_mtx);
+ }
+ 
+ /*
+  * This is the function where ext4lazyinit thread lives. It walks
+  * through the request list searching for next scheduled filesystem.
+  * When such a fs is found, run the lazy initialization request
+  * (ext4_run_li_request) and keep track of the time spent in this
+  * function. Based on that time we compute next schedule time of
+  * the request. When walking through the list is complete, compute
+  * next waking time and put itself into sleep.
+  */
+ static int ext4_lazyinit_thread(void *arg)
+ {
+       struct ext4_lazy_init *eli = (struct ext4_lazy_init *)arg;
+       struct list_head *pos, *n;
+       struct ext4_li_request *elr;
+       unsigned long next_wakeup;
+       DEFINE_WAIT(wait);
+       int ret;
+ 
+       BUG_ON(NULL == eli);
+ 
+       eli->li_timer.data = (unsigned long)current;
+       eli->li_timer.function = ext4_lazyinode_timeout;
+ 
+       eli->li_task = current;
+       wake_up(&eli->li_wait_task);
+ 
+ cont_thread:
+       while (true) {
+               next_wakeup = MAX_JIFFY_OFFSET;
+ 
+               mutex_lock(&eli->li_list_mtx);
+               if (list_empty(&eli->li_request_list)) {
+                       mutex_unlock(&eli->li_list_mtx);
+                       goto exit_thread;
+               }
+ 
+               list_for_each_safe(pos, n, &eli->li_request_list) {
+                       elr = list_entry(pos, struct ext4_li_request,
+                                        lr_request);
+ 
+                       if (time_after_eq(jiffies, elr->lr_next_sched))
+                               ret = ext4_run_li_request(elr);
+ 
+                       if (ret) {
+                               ret = 0;
+                               ext4_remove_li_request(elr);
+                               continue;
+                       }
+ 
+                       if (time_before(elr->lr_next_sched, next_wakeup))
+                               next_wakeup = elr->lr_next_sched;
+               }
+               mutex_unlock(&eli->li_list_mtx);
+ 
+               if (freezing(current))
+                       refrigerator();
+ 
+               if (time_after_eq(jiffies, next_wakeup)) {
+                       cond_resched();
+                       continue;
+               }
+ 
+               eli->li_timer.expires = next_wakeup;
+               add_timer(&eli->li_timer);
+               prepare_to_wait(&eli->li_wait_daemon, &wait,
+                               TASK_INTERRUPTIBLE);
+               if (time_before(jiffies, next_wakeup))
+                       schedule();
+               finish_wait(&eli->li_wait_daemon, &wait);
+       }
+ 
+ exit_thread:
+       /*
+        * It looks like the request list is empty, but we need
+        * to check it under the li_list_mtx lock, to prevent any
+        * additions into it, and of course we should lock ext4_li_mtx
+        * to atomically free the list and ext4_li_info, because at
+        * this point another ext4 filesystem could be registering
+        * new one.
+        */
+       mutex_lock(&ext4_li_mtx);
+       mutex_lock(&eli->li_list_mtx);
+       if (!list_empty(&eli->li_request_list)) {
+               mutex_unlock(&eli->li_list_mtx);
+               mutex_unlock(&ext4_li_mtx);
+               goto cont_thread;
+       }
+       mutex_unlock(&eli->li_list_mtx);
+       del_timer_sync(&ext4_li_info->li_timer);
+       eli->li_task = NULL;
+       wake_up(&eli->li_wait_task);
+ 
+       kfree(ext4_li_info);
+       ext4_li_info = NULL;
+       mutex_unlock(&ext4_li_mtx);
+ 
+       return 0;
+ }
+ 
+ static void ext4_clear_request_list(void)
+ {
+       struct list_head *pos, *n;
+       struct ext4_li_request *elr;
+ 
+       mutex_lock(&ext4_li_info->li_list_mtx);
+       if (list_empty(&ext4_li_info->li_request_list))
+               return;
+ 
+       list_for_each_safe(pos, n, &ext4_li_info->li_request_list) {
+               elr = list_entry(pos, struct ext4_li_request,
+                                lr_request);
+               ext4_remove_li_request(elr);
+       }
+       mutex_unlock(&ext4_li_info->li_list_mtx);
+ }
+ 
+ static int ext4_run_lazyinit_thread(void)
+ {
+       struct task_struct *t;
+ 
+       t = kthread_run(ext4_lazyinit_thread, ext4_li_info, "ext4lazyinit");
+       if (IS_ERR(t)) {
+               int err = PTR_ERR(t);
+               ext4_clear_request_list();
+               del_timer_sync(&ext4_li_info->li_timer);
+               kfree(ext4_li_info);
+               ext4_li_info = NULL;
+               printk(KERN_CRIT "EXT4: error %d creating inode table "
+                                "initialization thread\n",
+                                err);
+               return err;
+       }
+       ext4_li_info->li_state |= EXT4_LAZYINIT_RUNNING;
+ 
+       wait_event(ext4_li_info->li_wait_task, ext4_li_info->li_task != NULL);
+       return 0;
+ }
+ 
+ /*
+  * Check whether it makes sense to run itable init. thread or not.
+  * If there is at least one uninitialized inode table, return
+  * corresponding group number, else the loop goes through all
+  * groups and return total number of groups.
+  */
+ static ext4_group_t ext4_has_uninit_itable(struct super_block *sb)
+ {
+       ext4_group_t group, ngroups = EXT4_SB(sb)->s_groups_count;
+       struct ext4_group_desc *gdp = NULL;
+ 
+       for (group = 0; group < ngroups; group++) {
+               gdp = ext4_get_group_desc(sb, group, NULL);
+               if (!gdp)
+                       continue;
+ 
+               if (!(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED)))
+                       break;
+       }
+ 
+       return group;
+ }
+ 
+ static int ext4_li_info_new(void)
+ {
+       struct ext4_lazy_init *eli = NULL;
+ 
+       eli = kzalloc(sizeof(*eli), GFP_KERNEL);
+       if (!eli)
+               return -ENOMEM;
+ 
+       eli->li_task = NULL;
+       INIT_LIST_HEAD(&eli->li_request_list);
+       mutex_init(&eli->li_list_mtx);
+ 
+       init_waitqueue_head(&eli->li_wait_daemon);
+       init_waitqueue_head(&eli->li_wait_task);
+       init_timer(&eli->li_timer);
+       eli->li_state |= EXT4_LAZYINIT_QUIT;
+ 
+       ext4_li_info = eli;
+ 
+       return 0;
+ }
+ 
+ static struct ext4_li_request *ext4_li_request_new(struct super_block *sb,
+                                           ext4_group_t start)
+ {
+       struct ext4_sb_info *sbi = EXT4_SB(sb);
+       struct ext4_li_request *elr;
+       unsigned long rnd;
+ 
+       elr = kzalloc(sizeof(*elr), GFP_KERNEL);
+       if (!elr)
+               return NULL;
+ 
+       elr->lr_super = sb;
+       elr->lr_sbi = sbi;
+       elr->lr_next_group = start;
+ 
+       /*
+        * Randomize first schedule time of the request to
+        * spread the inode table initialization requests
+        * better.
+        */
+       get_random_bytes(&rnd, sizeof(rnd));
+       elr->lr_next_sched = jiffies + (unsigned long)rnd %
+                            (EXT4_DEF_LI_MAX_START_DELAY * HZ);
+ 
+       return elr;
+ }
+ 
+ static int ext4_register_li_request(struct super_block *sb,
+                                   ext4_group_t first_not_zeroed)
+ {
+       struct ext4_sb_info *sbi = EXT4_SB(sb);
+       struct ext4_li_request *elr;
+       ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count;
+       int ret;
+ 
+       if (sbi->s_li_request != NULL)
+               return 0;
+ 
+       if (first_not_zeroed == ngroups ||
+           (sb->s_flags & MS_RDONLY) ||
+           !test_opt(sb, INIT_INODE_TABLE)) {
+               sbi->s_li_request = NULL;
+               return 0;
+       }
+ 
+       if (first_not_zeroed == ngroups) {
+               sbi->s_li_request = NULL;
+               return 0;
+       }
+ 
+       elr = ext4_li_request_new(sb, first_not_zeroed);
+       if (!elr)
+               return -ENOMEM;
+ 
+       mutex_lock(&ext4_li_mtx);
+ 
+       if (NULL == ext4_li_info) {
+               ret = ext4_li_info_new();
+               if (ret)
+                       goto out;
+       }
+ 
+       mutex_lock(&ext4_li_info->li_list_mtx);
+       list_add(&elr->lr_request, &ext4_li_info->li_request_list);
+       mutex_unlock(&ext4_li_info->li_list_mtx);
+ 
+       sbi->s_li_request = elr;
+ 
+       if (!(ext4_li_info->li_state & EXT4_LAZYINIT_RUNNING)) {
+               ret = ext4_run_lazyinit_thread();
+               if (ret)
+                       goto out;
+       }
+ out:
+       mutex_unlock(&ext4_li_mtx);
+       if (ret)
+               kfree(elr);
+       return ret;
+ }
+ 
+ /*
+  * We do not need to lock anything since this is called on
+  * module unload.
+  */
+ static void ext4_destroy_lazyinit_thread(void)
+ {
+       /*
+        * If thread exited earlier
+        * there's nothing to be done.
+        */
+       if (!ext4_li_info)
+               return;
+ 
+       ext4_clear_request_list();
+ 
+       while (ext4_li_info->li_task) {
+               wake_up(&ext4_li_info->li_wait_daemon);
+               wait_event(ext4_li_info->li_wait_task,
+                          ext4_li_info->li_task == NULL);
+       }
+ }
+ 
   static int ext4_fill_super(struct super_block *sb, void *data, int silent)
                                 __releases(kernel_lock)
                                 __acquires(kernel_lock)
@@@ -2564,6 -2996,7 +2993,7 @@@
         __u64 blocks_count;
         int err;
         unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO;
+       ext4_group_t first_not_zeroed;
   
         sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
         if (!sbi)
@@@ -2585,6 -3018,8 +3015,6 @@@
                 sbi->s_sectors_written_start =
                         part_stat_read(sb->s_bdev->bd_part, sectors[1]);
   
- -      unlock_kernel();
- -
         /* Cleanup superblock name */
         for (cp = sb->s_id; (cp = strchr(cp, '/'));)
                 *cp = '!';
@@@ -2624,6 -3059,7 +3054,7 @@@
   
         /* Set defaults before we parse the mount options */
         def_mount_opts = le32_to_cpu(es->s_default_mount_opts);
+       set_opt(sbi->s_mount_opt, INIT_INODE_TABLE);
         if (def_mount_opts & EXT4_DEFM_DEBUG)
                 set_opt(sbi->s_mount_opt, DEBUG);
         if (def_mount_opts & EXT4_DEFM_BSDGROUPS) {
@@@ -2826,13 -3262,15 +3257,13 @@@
          * Test whether we have more sectors than will fit in sector_t,
          * and whether the max offset is addressable by the page cache.
          */
- -      if ((ext4_blocks_count(es) >
- -           (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) ||
- -          (ext4_blocks_count(es) >
- -           (pgoff_t)(~0ULL) >> (PAGE_CACHE_SHIFT - sb->s_blocksize_bits))) {
+ +      ret = generic_check_addressable(sb->s_blocksize_bits,
+ +                                      ext4_blocks_count(es));
+ +      if (ret) {
                 ext4_msg(sb, KERN_ERR, "filesystem"
                          " too large to mount safely on this system");
                 if (sizeof(sector_t) < 8)
                         ext4_msg(sb, KERN_WARNING, "CONFIG_LBDAF not enabled");
- -              ret = -EFBIG;
                 goto failed_mount;
         }
   
@@@ -2901,7 -3339,7 +3332,7 @@@
                         goto failed_mount2;
                 }
         }
-       if (!ext4_check_descriptors(sb)) {
+       if (!ext4_check_descriptors(sb, &first_not_zeroed)) {
                 ext4_msg(sb, KERN_ERR, "group descriptors corrupted!");
                 goto failed_mount2;
         }
@@@ -3122,6 -3560,10 +3553,10 @@@ no_journal
                 goto failed_mount4;
         }
   
+       err = ext4_register_li_request(sb, first_not_zeroed);
+       if (err)
+               goto failed_mount4;
+ 
         sbi->s_kobj.kset = ext4_kset;
         init_completion(&sbi->s_kobj_unregister);
         err = kobject_init_and_add(&sbi->s_kobj, &ext4_ktype, NULL,
@@@ -3159,6 -3601,7 +3594,6 @@@
         if (es->s_error_count)
                 mod_timer(&sbi->s_err_report, jiffies + 300*HZ); /* 5 minutes */
   
- -      lock_kernel();
         kfree(orig_data);
         return 0;
   
@@@ -3205,6 -3648,7 +3640,6 @@@ out_fail
         sb->s_fs_info = NULL;
         kfree(sbi->s_blockgroup_lock);
         kfree(sbi);
- -      lock_kernel();
   out_free_orig:
         kfree(orig_data);
         return ret;
@@@ -3461,7 -3905,7 +3896,7 @@@ static int ext4_load_journal(struct sup
         EXT4_SB(sb)->s_journal = journal;
         ext4_clear_journal_err(sb, es);
   
-       if (journal_devnum &&
+       if (!really_read_only && journal_devnum &&
             journal_devnum != le32_to_cpu(es->s_journal_dev)) {
                 es->s_journal_dev = cpu_to_le32(journal_devnum);
   
@@@ -3514,9 -3958,12 +3949,12 @@@ static int ext4_commit_super(struct sup
         else
                 es->s_kbytes_written =
                         cpu_to_le64(EXT4_SB(sb)->s_kbytes_written);
-       ext4_free_blocks_count_set(es, percpu_counter_sum_positive(
+       if (percpu_counter_initialized(&EXT4_SB(sb)->s_freeblocks_counter))
+               ext4_free_blocks_count_set(es, percpu_counter_sum_positive(
                                         &EXT4_SB(sb)->s_freeblocks_counter));
-       es->s_free_inodes_count = cpu_to_le32(percpu_counter_sum_positive(
+       if (percpu_counter_initialized(&EXT4_SB(sb)->s_freeinodes_counter))
+               es->s_free_inodes_count =
+                       cpu_to_le32(percpu_counter_sum_positive(
                                         &EXT4_SB(sb)->s_freeinodes_counter));
         sb->s_dirt = 0;
         BUFFER_TRACE(sbh, "marking dirty");
@@@ -3713,6 -4160,8 +4151,6 @@@ static int ext4_remount(struct super_bl
   #endif
         char *orig_data = kstrdup(data, GFP_KERNEL);
   
- -      lock_kernel();
- -
         /* Store the original options */
         lock_super(sb);
         old_sb_flags = sb->s_flags;
@@@ -3835,6 -4284,19 +4273,19 @@@
                         enable_quota = 1;
                 }
         }
+ 
+       /*
+        * Reinitialize lazy itable initialization thread based on
+        * current settings
+        */
+       if ((sb->s_flags & MS_RDONLY) || !test_opt(sb, INIT_INODE_TABLE))
+               ext4_unregister_li_request(sb);
+       else {
+               ext4_group_t first_not_zeroed;
+               first_not_zeroed = ext4_has_uninit_itable(sb);
+               ext4_register_li_request(sb, first_not_zeroed);
+       }
+ 
         ext4_setup_system_zone(sb);
         if (sbi->s_journal == NULL)
                 ext4_commit_super(sb, 1);
@@@ -3847,6 -4309,7 +4298,6 @@@
                         kfree(old_opts.s_qf_names[i]);
   #endif
         unlock_super(sb);
- -      unlock_kernel();
         if (enable_quota)
                 dquot_resume(sb, -1);
   
@@@ -3872,6 -4335,7 +4323,6 @@@ restore_opts
         }
   #endif
         unlock_super(sb);
- -      unlock_kernel();
         kfree(orig_data);
         return err;
   }
@@@ -4276,23 -4740,53 +4727,53 @@@ static struct file_system_type ext4_fs_
         .fs_flags       = FS_REQUIRES_DEV,
   };
   
- static int __init init_ext4_fs(void)
+ int __init ext4_init_feat_adverts(void)
+ {
+       struct ext4_features *ef;
+       int ret = -ENOMEM;
+ 
+       ef = kzalloc(sizeof(struct ext4_features), GFP_KERNEL);
+       if (!ef)
+               goto out;
+ 
+       ef->f_kobj.kset = ext4_kset;
+       init_completion(&ef->f_kobj_unregister);
+       ret = kobject_init_and_add(&ef->f_kobj, &ext4_feat_ktype, NULL,
+                                  "features");
+       if (ret) {
+               kfree(ef);
+               goto out;
+       }
+ 
+       ext4_feat = ef;
+       ret = 0;
+ out:
+       return ret;
+ }
+ 
+ static int __init ext4_init_fs(void)
   {
         int err;
   
         ext4_check_flag_values();
-       err = init_ext4_system_zone();
+       err = ext4_init_pageio();
         if (err)
                 return err;
+       err = ext4_init_system_zone();
+       if (err)
+               goto out5;
         ext4_kset = kset_create_and_add("ext4", NULL, fs_kobj);
         if (!ext4_kset)
                 goto out4;
         ext4_proc_root = proc_mkdir("fs/ext4", NULL);
-       err = init_ext4_mballoc();
+ 
+       err = ext4_init_feat_adverts();
+ 
+       err = ext4_init_mballoc();
         if (err)
                 goto out3;
   
-       err = init_ext4_xattr();
+       err = ext4_init_xattr();
         if (err)
                 goto out2;
         err = init_inodecache();
@@@ -4303,38 -4797,46 +4784,46 @@@
         err = register_filesystem(&ext4_fs_type);
         if (err)
                 goto out;
+ 
+       ext4_li_info = NULL;
+       mutex_init(&ext4_li_mtx);
         return 0;
   out:
         unregister_as_ext2();
         unregister_as_ext3();
         destroy_inodecache();
   out1:
-       exit_ext4_xattr();
+       ext4_exit_xattr();
   out2:
-       exit_ext4_mballoc();
+       ext4_exit_mballoc();
   out3:
+       kfree(ext4_feat);
         remove_proc_entry("fs/ext4", NULL);
         kset_unregister(ext4_kset);
   out4:
-       exit_ext4_system_zone();
+       ext4_exit_system_zone();
+ out5:
+       ext4_exit_pageio();
         return err;
   }
   
- static void __exit exit_ext4_fs(void)
+ static void __exit ext4_exit_fs(void)
   {
+       ext4_destroy_lazyinit_thread();
         unregister_as_ext2();
         unregister_as_ext3();
         unregister_filesystem(&ext4_fs_type);
         destroy_inodecache();
-       exit_ext4_xattr();
-       exit_ext4_mballoc();
+       ext4_exit_xattr();
+       ext4_exit_mballoc();
         remove_proc_entry("fs/ext4", NULL);
         kset_unregister(ext4_kset);
-       exit_ext4_system_zone();
+       ext4_exit_system_zone();
+       ext4_exit_pageio();
   }
   
   MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others");
   MODULE_DESCRIPTION("Fourth Extended Filesystem");
   MODULE_LICENSE("GPL");
- module_init(init_ext4_fs)
- module_exit(exit_ext4_fs)
+ module_init(ext4_init_fs)
+ module_exit(ext4_exit_fs)
diff --combined fs/jbd2/checkpoint.c

index 6571a056e55d6df336056925381499ec9b937d68,524800dce207a3fb577fa82b8812a9012621cb60..6a79fd0a1a32cdd018fea9879a6026e4dcf9c8b6
--- 1/fs/jbd2/checkpoint.c
--- 2/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@@ -299,6 -299,16 +299,16 @@@ static int __process_buffer(journal_t *
                 transaction->t_chp_stats.cs_forced_to_close++;
                 spin_unlock(&journal->j_list_lock);
                 jbd_unlock_bh_state(bh);
+               if (unlikely(journal->j_flags & JBD2_UNMOUNT))
+                       /*
+                        * The journal thread is dead; so starting and
+                        * waiting for a commit to finish will cause
+                        * us to wait for a _very_ long time.
+                        */
+                       printk(KERN_ERR "JBD2: %s: "
+                              "Waiting for Godot: block %llu\n",
+                              journal->j_devname,
+                              (unsigned long long) bh->b_blocknr);
                 jbd2_log_start_commit(journal, tid);
                 jbd2_log_wait_commit(journal, tid);
                 ret = 1;
@@@ -532,7 -542,8 +542,7 @@@ int jbd2_cleanup_journal_tail(journal_
          */
         if ((journal->j_fs_dev != journal->j_dev) &&
             (journal->j_flags & JBD2_BARRIER))
- -              blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL,
- -                      BLKDEV_IFL_WAIT);
+ +              blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL);
         if (!(journal->j_flags & JBD2_ABORT))
                 jbd2_journal_update_superblock(journal, 1);
         return 0;
diff --combined fs/jbd2/commit.c

index bc6be8bda1cc067d3230acfbe20847b45906ee34,6494c81e3b0a9cb3d07d2eec52d585e6e3a46117..f3ad1598b20128bc3acaaa1bd81e7ece1e27e270
--- 1/fs/jbd2/commit.c
--- 2/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@@ -26,7 -26,9 +26,9 @@@
   #include <linux/backing-dev.h>
   #include <linux/bio.h>
   #include <linux/blkdev.h>
+ #include <linux/bitops.h>
   #include <trace/events/jbd2.h>
+ #include <asm/system.h>
   
   /*
    * Default IO end handler for temporary BJ_IO buffer_heads.
@@@ -134,11 -136,25 +136,11 @@@ static int journal_submit_commit_record
   
         if (journal->j_flags & JBD2_BARRIER &&
             !JBD2_HAS_INCOMPAT_FEATURE(journal,
- -                                     JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
- -              ret = submit_bh(WRITE_SYNC_PLUG | WRITE_BARRIER, bh);
- -              if (ret == -EOPNOTSUPP) {
- -                      printk(KERN_WARNING
- -                             "JBD2: Disabling barriers on %s, "
- -                             "not supported by device\n", journal->j_devname);
- -                      write_lock(&journal->j_state_lock);
- -                      journal->j_flags &= ~JBD2_BARRIER;
- -                      write_unlock(&journal->j_state_lock);
- -
- -                      /* And try again, without the barrier */
- -                      lock_buffer(bh);
- -                      set_buffer_uptodate(bh);
- -                      clear_buffer_dirty(bh);
- -                      ret = submit_bh(WRITE_SYNC_PLUG, bh);
- -              }
- -      } else {
+ +                                     JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT))
+ +              ret = submit_bh(WRITE_SYNC_PLUG | WRITE_FLUSH_FUA, bh);
+ +      else
                 ret = submit_bh(WRITE_SYNC_PLUG, bh);
- -      }
+ +
         *cbh = bh;
         return ret;
   }
@@@ -152,8 -168,29 +154,8 @@@ static int journal_wait_on_commit_recor
   {
         int ret = 0;
   
- -retry:
         clear_buffer_dirty(bh);
         wait_on_buffer(bh);
- -      if (buffer_eopnotsupp(bh) && (journal->j_flags & JBD2_BARRIER)) {
- -              printk(KERN_WARNING
- -                     "JBD2: %s: disabling barries on %s - not supported "
- -                     "by device\n", __func__, journal->j_devname);
- -              write_lock(&journal->j_state_lock);
- -              journal->j_flags &= ~JBD2_BARRIER;
- -              write_unlock(&journal->j_state_lock);
- -
- -              lock_buffer(bh);
- -              clear_buffer_dirty(bh);
- -              set_buffer_uptodate(bh);
- -              bh->b_end_io = journal_end_buffer_io_sync;
- -
- -              ret = submit_bh(WRITE_SYNC_PLUG, bh);
- -              if (ret) {
- -                      unlock_buffer(bh);
- -                      return ret;
- -              }
- -              goto retry;
- -      }
   
         if (unlikely(!buffer_uptodate(bh)))
                 ret = -EIO;
@@@ -201,7 -238,7 +203,7 @@@ static int journal_submit_data_buffers(
         spin_lock(&journal->j_list_lock);
         list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
                 mapping = jinode->i_vfs_inode->i_mapping;
-               jinode->i_flags |= JI_COMMIT_RUNNING;
+               set_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
                 spin_unlock(&journal->j_list_lock);
                 /*
                  * submit the inode data buffers. We use writepage
@@@ -216,7 -253,8 +218,8 @@@
                 spin_lock(&journal->j_list_lock);
                 J_ASSERT(jinode->i_transaction == commit_transaction);
                 commit_transaction->t_flushed_data_blocks = 1;
-               jinode->i_flags &= ~JI_COMMIT_RUNNING;
+               clear_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
+               smp_mb__after_clear_bit();
                 wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
         }
         spin_unlock(&journal->j_list_lock);
@@@ -237,7 -275,7 +240,7 @@@ static int journal_finish_inode_data_bu
         /* For locking, see the comment in journal_submit_data_buffers() */
         spin_lock(&journal->j_list_lock);
         list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
-               jinode->i_flags |= JI_COMMIT_RUNNING;
+               set_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
                 spin_unlock(&journal->j_list_lock);
                 err = filemap_fdatawait(jinode->i_vfs_inode->i_mapping);
                 if (err) {
@@@ -253,7 -291,8 +256,8 @@@
                                 ret = err;
                 }
                 spin_lock(&journal->j_list_lock);
-               jinode->i_flags &= ~JI_COMMIT_RUNNING;
+               clear_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
+               smp_mb__after_clear_bit();
                 wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
         }
   
@@@ -325,7 -364,7 +329,7 @@@ void jbd2_journal_commit_transaction(jo
         int tag_bytes = journal_tag_bytes(journal);
         struct buffer_head *cbh = NULL; /* For transactional checksums */
         __u32 crc32_sum = ~0;
- -      int write_op = WRITE;
+ +      int write_op = WRITE_SYNC;
   
         /*
          * First job: lock down the current transaction and wait for
@@@ -666,16 -705,6 +670,16 @@@ start_journal_io
                 }
         }
   
+ +      err = journal_finish_inode_data_buffers(journal, commit_transaction);
+ +      if (err) {
+ +              printk(KERN_WARNING
+ +                      "JBD2: Detected IO errors while flushing file data "
+ +                     "on %s\n", journal->j_devname);
+ +              if (journal->j_flags & JBD2_ABORT_ON_SYNCDATA_ERR)
+ +                      jbd2_journal_abort(journal, err);
+ +              err = 0;
+ +      }
+ +
         /* 
          * If the journal is not located on the file system device,
          * then we must flush the file system device before we issue
@@@ -684,7 -713,8 +688,7 @@@
         if (commit_transaction->t_flushed_data_blocks &&
             (journal->j_fs_dev != journal->j_dev) &&
             (journal->j_flags & JBD2_BARRIER))
- -              blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL,
- -                      BLKDEV_IFL_WAIT);
+ +              blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL);
   
         /* Done it all: now write the commit record asynchronously. */
         if (JBD2_HAS_INCOMPAT_FEATURE(journal,
@@@ -693,6 -723,19 +697,6 @@@
                                                  &cbh, crc32_sum);
                 if (err)
                         __jbd2_journal_abort_hard(journal);
- -              if (journal->j_flags & JBD2_BARRIER)
- -                      blkdev_issue_flush(journal->j_dev, GFP_KERNEL, NULL,
- -                              BLKDEV_IFL_WAIT);
- -      }
- -
- -      err = journal_finish_inode_data_buffers(journal, commit_transaction);
- -      if (err) {
- -              printk(KERN_WARNING
- -                      "JBD2: Detected IO errors while flushing file data "
- -                     "on %s\n", journal->j_devname);
- -              if (journal->j_flags & JBD2_ABORT_ON_SYNCDATA_ERR)
- -                      jbd2_journal_abort(journal, err);
- -              err = 0;
         }
   
         /* Lo and behold: we have just managed to send a transaction to
@@@ -806,11 -849,6 +810,11 @@@ wait_for_iobuf
         }
         if (!err && !is_journal_aborted(journal))
                 err = journal_wait_on_commit_record(journal, cbh);
+ +      if (JBD2_HAS_INCOMPAT_FEATURE(journal,
+ +                                    JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT) &&
+ +          journal->j_flags & JBD2_BARRIER) {
+ +              blkdev_issue_flush(journal->j_dev, GFP_KERNEL, NULL);
+ +      }
   
         if (err)
                 jbd2_journal_abort(journal, err);
diff --combined fs/jbd2/journal.c

index 262419f83d800bfb6e4bbfb0ca93e3af2c3f64be,75e1b5a0bc2defa4aac837354dcb95cd4dedd328..13b0a92f96cf15270cc84e920f2df0645963dd88
--- 1/fs/jbd2/journal.c
--- 2/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@@ -42,12 -42,14 +42,14 @@@
   #include <linux/log2.h>
   #include <linux/vmalloc.h>
   #include <linux/backing-dev.h>
+ #include <linux/bitops.h>
   
   #define CREATE_TRACE_POINTS
   #include <trace/events/jbd2.h>
   
   #include <asm/uaccess.h>
   #include <asm/page.h>
+ #include <asm/system.h>
   
   EXPORT_SYMBOL(jbd2_journal_extend);
   EXPORT_SYMBOL(jbd2_journal_stop);
@@@ -1371,10 -1373,6 +1373,10 @@@ int jbd2_journal_check_used_features (j
   
         if (!compat && !ro && !incompat)
                 return 1;
+ +      /* Load journal superblock if it is not loaded yet. */
+ +      if (journal->j_format_version == 0 &&
+ +          journal_get_superblock(journal) != 0)
+ +              return 0;
         if (journal->j_format_version == 1)
                 return 0;
   
@@@ -2210,7 -2208,7 +2212,7 @@@ void jbd2_journal_release_jbd_inode(jou
   restart:
         spin_lock(&journal->j_list_lock);
         /* Is commit writing out inode - we have to wait */
-       if (jinode->i_flags & JI_COMMIT_RUNNING) {
+       if (test_bit(__JI_COMMIT_RUNNING, &jinode->i_flags)) {
                 wait_queue_head_t *wq;
                 DEFINE_WAIT_BIT(wait, &jinode->i_flags, __JI_COMMIT_RUNNING);
                 wq = bit_waitqueue(&jinode->i_flags, __JI_COMMIT_RUNNING);
diff --combined include/linux/blkdev.h

index 646b462d04df6819e542113f94695d0d716cf721,e5cb4d029689a1b9daef52eed22878581462cb82..5027a599077d89cd72aa5c09aaaa56254b971886
--- 1/include/linux/blkdev.h
--- 2/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@@ -124,9 -124,6 +124,9 @@@ struct request 
          * physical address coalescing is performed.
          */
         unsigned short nr_phys_segments;
+ +#if defined(CONFIG_BLK_DEV_INTEGRITY)
+ +      unsigned short nr_integrity_segments;
+ +#endif
   
         unsigned short ioprio;
   
@@@ -246,7 -243,6 +246,7 @@@ struct queue_limits 
   
         unsigned short          logical_block_size;
         unsigned short          max_segments;
+ +      unsigned short          max_integrity_segments;
   
         unsigned char           misaligned;
         unsigned char           discard_misaligned;
@@@ -359,25 -355,18 +359,25 @@@ struct request_queu
         struct blk_trace        *blk_trace;
   #endif
         /*
- -       * reserved for flush operations
+ +       * for flush operations
          */
- -      unsigned int            ordered, next_ordered, ordseq;
- -      int                     orderr, ordcolor;
- -      struct request          pre_flush_rq, bar_rq, post_flush_rq;
- -      struct request          *orig_bar_rq;
+ +      unsigned int            flush_flags;
+ +      unsigned int            flush_seq;
+ +      int                     flush_err;
+ +      struct request          flush_rq;
+ +      struct request          *orig_flush_rq;
+ +      struct list_head        pending_flushes;
   
         struct mutex            sysfs_lock;
   
   #if defined(CONFIG_BLK_DEV_BSG)
         struct bsg_class_device bsg_dev;
   #endif
+ +
+ +#ifdef CONFIG_BLK_DEV_THROTTLING
+ +      /* Throttle data */
+ +      struct throtl_data *td;
+ +#endif
   };
   
   #define QUEUE_FLAG_CLUSTER    0       /* cluster several segments into 1 */
@@@ -473,6 -462,56 +473,6 @@@ static inline void queue_flag_clear(uns
         __clear_bit(flag, &q->queue_flags);
   }
   
- -enum {
- -      /*
- -       * Hardbarrier is supported with one of the following methods.
- -       *
- -       * NONE         : hardbarrier unsupported
- -       * DRAIN        : ordering by draining is enough
- -       * DRAIN_FLUSH  : ordering by draining w/ pre and post flushes
- -       * DRAIN_FUA    : ordering by draining w/ pre flush and FUA write
- -       * TAG          : ordering by tag is enough
- -       * TAG_FLUSH    : ordering by tag w/ pre and post flushes
- -       * TAG_FUA      : ordering by tag w/ pre flush and FUA write
- -       */
- -      QUEUE_ORDERED_BY_DRAIN          = 0x01,
- -      QUEUE_ORDERED_BY_TAG            = 0x02,
- -      QUEUE_ORDERED_DO_PREFLUSH       = 0x10,
- -      QUEUE_ORDERED_DO_BAR            = 0x20,
- -      QUEUE_ORDERED_DO_POSTFLUSH      = 0x40,
- -      QUEUE_ORDERED_DO_FUA            = 0x80,
- -
- -      QUEUE_ORDERED_NONE              = 0x00,
- -
- -      QUEUE_ORDERED_DRAIN             = QUEUE_ORDERED_BY_DRAIN |
- -                                        QUEUE_ORDERED_DO_BAR,
- -      QUEUE_ORDERED_DRAIN_FLUSH       = QUEUE_ORDERED_DRAIN |
- -                                        QUEUE_ORDERED_DO_PREFLUSH |
- -                                        QUEUE_ORDERED_DO_POSTFLUSH,
- -      QUEUE_ORDERED_DRAIN_FUA         = QUEUE_ORDERED_DRAIN |
- -                                        QUEUE_ORDERED_DO_PREFLUSH |
- -                                        QUEUE_ORDERED_DO_FUA,
- -
- -      QUEUE_ORDERED_TAG               = QUEUE_ORDERED_BY_TAG |
- -                                        QUEUE_ORDERED_DO_BAR,
- -      QUEUE_ORDERED_TAG_FLUSH         = QUEUE_ORDERED_TAG |
- -                                        QUEUE_ORDERED_DO_PREFLUSH |
- -                                        QUEUE_ORDERED_DO_POSTFLUSH,
- -      QUEUE_ORDERED_TAG_FUA           = QUEUE_ORDERED_TAG |
- -                                        QUEUE_ORDERED_DO_PREFLUSH |
- -                                        QUEUE_ORDERED_DO_FUA,
- -
- -      /*
- -       * Ordered operation sequence
- -       */
- -      QUEUE_ORDSEQ_STARTED    = 0x01, /* flushing in progress */
- -      QUEUE_ORDSEQ_DRAIN      = 0x02, /* waiting for the queue to be drained */
- -      QUEUE_ORDSEQ_PREFLUSH   = 0x04, /* pre-flushing in progress */
- -      QUEUE_ORDSEQ_BAR        = 0x08, /* original barrier req in progress */
- -      QUEUE_ORDSEQ_POSTFLUSH  = 0x10, /* post-flushing in progress */
- -      QUEUE_ORDSEQ_DONE       = 0x20,
- -};
- -
   #define blk_queue_plugged(q)  test_bit(QUEUE_FLAG_PLUGGED, &(q)->queue_flags)
   #define blk_queue_tagged(q)   test_bit(QUEUE_FLAG_QUEUED, &(q)->queue_flags)
   #define blk_queue_stopped(q)  test_bit(QUEUE_FLAG_STOPPED, &(q)->queue_flags)
@@@ -482,6 -521,7 +482,6 @@@
   #define blk_queue_nonrot(q)   test_bit(QUEUE_FLAG_NONROT, &(q)->queue_flags)
   #define blk_queue_io_stat(q)  test_bit(QUEUE_FLAG_IO_STAT, &(q)->queue_flags)
   #define blk_queue_add_random(q)       test_bit(QUEUE_FLAG_ADD_RANDOM, &(q)->queue_flags)
- -#define blk_queue_flushing(q) ((q)->ordseq)
   #define blk_queue_stackable(q)        \
         test_bit(QUEUE_FLAG_STACKABLE, &(q)->queue_flags)
   #define blk_queue_discard(q)  test_bit(QUEUE_FLAG_DISCARD, &(q)->queue_flags)
@@@ -552,8 -592,7 +552,8 @@@ static inline void blk_clear_queue_full
    * it already be started by driver.
    */
   #define RQ_NOMERGE_FLAGS      \
- -      (REQ_NOMERGE | REQ_STARTED | REQ_HARDBARRIER | REQ_SOFTBARRIER)
+ +      (REQ_NOMERGE | REQ_STARTED | REQ_HARDBARRIER | REQ_SOFTBARRIER | \
+ +       REQ_FLUSH | REQ_FUA)
   #define rq_mergeable(rq)      \
         (!((rq)->cmd_flags & RQ_NOMERGE_FLAGS) && \
          (((rq)->cmd_flags & REQ_DISCARD) || \
@@@ -812,7 -851,7 +812,7 @@@ extern void blk_queue_max_segment_size(
   extern void blk_queue_max_discard_sectors(struct request_queue *q,
                 unsigned int max_discard_sectors);
   extern void blk_queue_logical_block_size(struct request_queue *, unsigned short);
- -extern void blk_queue_physical_block_size(struct request_queue *, unsigned short);
+ +extern void blk_queue_physical_block_size(struct request_queue *, unsigned int);
   extern void blk_queue_alignment_offset(struct request_queue *q,
                                        unsigned int alignment);
   extern void blk_limits_io_min(struct queue_limits *limits, unsigned int min);
@@@ -842,8 -881,12 +842,8 @@@ extern void blk_queue_update_dma_alignm
   extern void blk_queue_softirq_done(struct request_queue *, softirq_done_fn *);
   extern void blk_queue_rq_timed_out(struct request_queue *, rq_timed_out_fn *);
   extern void blk_queue_rq_timeout(struct request_queue *, unsigned int);
+ +extern void blk_queue_flush(struct request_queue *q, unsigned int flush);
   extern struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev);
- -extern int blk_queue_ordered(struct request_queue *, unsigned);
- -extern bool blk_do_ordered(struct request_queue *, struct request **);
- -extern unsigned blk_ordered_cur_seq(struct request_queue *);
- -extern unsigned blk_ordered_req_seq(struct request *);
- -extern bool blk_ordered_complete_seq(struct request_queue *, unsigned, int);
   
   extern int blk_rq_map_sg(struct request_queue *, struct request *, struct scatterlist *);
   extern void blk_dump_rq_flags(struct request *, char *);
@@@ -876,21 -919,36 +876,29 @@@ static inline struct request *blk_map_q
                 return NULL;
         return bqt->tag_index[tag];
   }
- -enum{
- -      BLKDEV_WAIT,    /* wait for completion */
- -      BLKDEV_BARRIER, /* issue request with barrier */
- -      BLKDEV_SECURE,  /* secure discard */
- -};
- -#define BLKDEV_IFL_WAIT               (1 << BLKDEV_WAIT)
- -#define BLKDEV_IFL_BARRIER    (1 << BLKDEV_BARRIER)
- -#define BLKDEV_IFL_SECURE     (1 << BLKDEV_SECURE)
- -extern int blkdev_issue_flush(struct block_device *, gfp_t, sector_t *,
- -                      unsigned long);
+ +
+ +#define BLKDEV_DISCARD_SECURE  0x01    /* secure discard */
+ +
+ +extern int blkdev_issue_flush(struct block_device *, gfp_t, sector_t *);
   extern int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
                 sector_t nr_sects, gfp_t gfp_mask, unsigned long flags);
   extern int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
- -                      sector_t nr_sects, gfp_t gfp_mask, unsigned long flags);
- -static inline int sb_issue_discard(struct super_block *sb,
- -                                 sector_t block, sector_t nr_blocks)
+ +                      sector_t nr_sects, gfp_t gfp_mask);
+ +static inline int sb_issue_discard(struct super_block *sb, sector_t block,
+ +              sector_t nr_blocks, gfp_t gfp_mask, unsigned long flags)
   {
- -      block <<= (sb->s_blocksize_bits - 9);
- -      nr_blocks <<= (sb->s_blocksize_bits - 9);
- -      return blkdev_issue_discard(sb->s_bdev, block, nr_blocks, GFP_NOFS,
- -                                 BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER);
+ +      return blkdev_issue_discard(sb->s_bdev, block << (sb->s_blocksize_bits - 9),
+ +                                  nr_blocks << (sb->s_blocksize_bits - 9),
+ +                                  gfp_mask, flags);
   }
- -              sector_t nr_blocks, gfp_t gfp_mask, unsigned long flags)
+ static inline int sb_issue_zeroout(struct super_block *sb, sector_t block,
- -                                  gfp_mask, flags);
++              sector_t nr_blocks, gfp_t gfp_mask)
+ {
+       return blkdev_issue_zeroout(sb->s_bdev,
+                                   block << (sb->s_blocksize_bits - 9),
+                                   nr_blocks << (sb->s_blocksize_bits - 9),
++                                  gfp_mask);
+ }
   
   extern int blk_verify_command(unsigned char *cmd, fmode_t has_write_perm);
   
@@@ -954,7 -1012,7 +962,7 @@@ static inline unsigned int queue_physic
         return q->limits.physical_block_size;
   }
   
- -static inline int bdev_physical_block_size(struct block_device *bdev)
+ +static inline unsigned int bdev_physical_block_size(struct block_device *bdev)
   {
         return queue_physical_block_size(bdev_get_queue(bdev));
   }
@@@ -1043,11 -1101,11 +1051,11 @@@ static inline int queue_dma_alignment(s
         return q ? q->dma_alignment : 511;
   }
   
- -static inline int blk_rq_aligned(struct request_queue *q, void *addr,
+ +static inline int blk_rq_aligned(struct request_queue *q, unsigned long addr,
                                  unsigned int len)
   {
         unsigned int alignment = queue_dma_alignment(q) | q->dma_pad_mask;
- -      return !((unsigned long)addr & alignment) && !(len & alignment);
+ +      return !(addr & alignment) && !(len & alignment);
   }
   
   /* assumes size > 256 */
@@@ -1077,7 -1135,6 +1085,7 @@@ static inline void put_dev_sector(Secto
   
   struct work_struct;
   int kblockd_schedule_work(struct request_queue *q, struct work_struct *work);
+ +int kblockd_schedule_delayed_work(struct request_queue *q, struct delayed_work *dwork, unsigned long delay);
   
   #ifdef CONFIG_BLK_CGROUP
   /*
@@@ -1121,24 -1178,6 +1129,24 @@@ static inline uint64_t rq_io_start_time
   }
   #endif
   
+ +#ifdef CONFIG_BLK_DEV_THROTTLING
+ +extern int blk_throtl_init(struct request_queue *q);
+ +extern void blk_throtl_exit(struct request_queue *q);
+ +extern int blk_throtl_bio(struct request_queue *q, struct bio **bio);
+ +extern void throtl_schedule_delayed_work(struct request_queue *q, unsigned long delay);
+ +extern void throtl_shutdown_timer_wq(struct request_queue *q);
+ +#else /* CONFIG_BLK_DEV_THROTTLING */
+ +static inline int blk_throtl_bio(struct request_queue *q, struct bio **bio)
+ +{
+ +      return 0;
+ +}
+ +
+ +static inline int blk_throtl_init(struct request_queue *q) { return 0; }
+ +static inline int blk_throtl_exit(struct request_queue *q) { return 0; }
+ +static inline void throtl_schedule_delayed_work(struct request_queue *q, unsigned long delay) {}
+ +static inline void throtl_shutdown_timer_wq(struct request_queue *q) {}
+ +#endif /* CONFIG_BLK_DEV_THROTTLING */
+ +
   #define MODULE_ALIAS_BLOCKDEV(major,minor) \
         MODULE_ALIAS("block-major-" __stringify(major) "-" __stringify(minor))
   #define MODULE_ALIAS_BLOCKDEV_MAJOR(major) \
@@@ -1182,13 -1221,8 +1190,13 @@@ struct blk_integrity 
   extern int blk_integrity_register(struct gendisk *, struct blk_integrity *);
   extern void blk_integrity_unregister(struct gendisk *);
   extern int blk_integrity_compare(struct gendisk *, struct gendisk *);
- -extern int blk_rq_map_integrity_sg(struct request *, struct scatterlist *);
- -extern int blk_rq_count_integrity_sg(struct request *);
+ +extern int blk_rq_map_integrity_sg(struct request_queue *, struct bio *,
+ +                                 struct scatterlist *);
+ +extern int blk_rq_count_integrity_sg(struct request_queue *, struct bio *);
+ +extern int blk_integrity_merge_rq(struct request_queue *, struct request *,
+ +                                struct request *);
+ +extern int blk_integrity_merge_bio(struct request_queue *, struct request *,
+ +                                 struct bio *);
   
   static inline
   struct blk_integrity *bdev_get_integrity(struct block_device *bdev)
@@@ -1209,32 -1243,16 +1217,32 @@@ static inline int blk_integrity_rq(stru
         return bio_integrity(rq->bio);
   }
   
+ +static inline void blk_queue_max_integrity_segments(struct request_queue *q,
+ +                                                  unsigned int segs)
+ +{
+ +      q->limits.max_integrity_segments = segs;
+ +}
+ +
+ +static inline unsigned short
+ +queue_max_integrity_segments(struct request_queue *q)
+ +{
+ +      return q->limits.max_integrity_segments;
+ +}
+ +
   #else /* CONFIG_BLK_DEV_INTEGRITY */
   
   #define blk_integrity_rq(rq)                  (0)
- -#define blk_rq_count_integrity_sg(a)          (0)
- -#define blk_rq_map_integrity_sg(a, b)         (0)
+ +#define blk_rq_count_integrity_sg(a, b)               (0)
+ +#define blk_rq_map_integrity_sg(a, b, c)      (0)
   #define bdev_get_integrity(a)                 (0)
   #define blk_get_integrity(a)                  (0)
   #define blk_integrity_compare(a, b)           (0)
   #define blk_integrity_register(a, b)          (0)
   #define blk_integrity_unregister(a)           do { } while (0);
+ +#define blk_queue_max_integrity_segments(a, b)        do { } while (0);
+ +#define queue_max_integrity_segments(a)               (0)
+ +#define blk_integrity_merge_rq(a, b, c)               (0)
+ +#define blk_integrity_merge_bio(a, b, c)      (0)
   
   #endif /* CONFIG_BLK_DEV_INTEGRITY */
   
diff --combined include/linux/fs.h

index b2a6009cba10f6e018093dd50fec8d913e7d5817,7008268e9b5ade024f7f552b4afcf3c4610e1d26..6ed7ace74b7cedf08f539c6b5c0e4412471dad2e
--- 1/include/linux/fs.h
--- 2/include/linux/fs.h
+++ b/include/linux/fs.h
@@@ -32,11 -32,17 +32,17 @@@
   #define SEEK_END      2       /* seek relative to end of file */
   #define SEEK_MAX      SEEK_END
   
+ struct fstrim_range {
+       uint64_t start;
+       uint64_t len;
+       uint64_t minlen;
+ };
+ 
   /* And dynamically-tunable limits and defaults: */
   struct files_stat_struct {
- -      int nr_files;           /* read only */
- -      int nr_free_files;      /* read only */
- -      int max_files;          /* tunable */
+ +      unsigned long nr_files;         /* read only */
+ +      unsigned long nr_free_files;    /* read only */
+ +      unsigned long max_files;                /* tunable */
   };
   
   struct inodes_stat_t {
@@@ -92,9 -98,6 +98,9 @@@
   /* Expect random access pattern */
   #define FMODE_RANDOM          ((__force fmode_t)0x1000)
   
+ +/* File is huge (eg. /dev/kmem): treat loff_t as unsigned */
+ +#define FMODE_UNSIGNED_OFFSET ((__force fmode_t)0x2000)
+ +
   /* File was opened by fanotify and shouldn't generate fanotify events */
   #define FMODE_NONOTIFY                ((__force fmode_t)0x1000000)
   
@@@ -138,12 -141,12 +144,12 @@@
    *                    immediately after submission. The write equivalent
    *                    of READ_SYNC.
    * WRITE_ODIRECT_PLUG Special case write for O_DIRECT only.
- - * WRITE_BARRIER      Like WRITE_SYNC, but tells the block layer that all
- - *                    previously submitted writes must be safely on storage
- - *                    before this one is started. Also guarantees that when
- - *                    this write is complete, it itself is also safely on
- - *                    storage. Prevents reordering of writes on both sides
- - *                    of this IO.
+ + * WRITE_FLUSH                Like WRITE_SYNC but with preceding cache flush.
+ + * WRITE_FUA          Like WRITE_SYNC but data is guaranteed to be on
+ + *                    non-volatile media on completion.
+ + * WRITE_FLUSH_FUA    Combination of WRITE_FLUSH and FUA. The IO is preceded
+ + *                    by a cache flush and data is guaranteed to be on
+ + *                    non-volatile media on completion.
    *
    */
   #define RW_MASK                       REQ_WRITE
@@@ -159,12 -162,16 +165,12 @@@
   #define WRITE_SYNC            (WRITE | REQ_SYNC | REQ_NOIDLE | REQ_UNPLUG)
   #define WRITE_ODIRECT_PLUG    (WRITE | REQ_SYNC)
   #define WRITE_META            (WRITE | REQ_META)
- -#define WRITE_BARRIER         (WRITE | REQ_SYNC | REQ_NOIDLE | REQ_UNPLUG | \
- -                               REQ_HARDBARRIER)
- -
- -/*
- - * These aren't really reads or writes, they pass down information about
- - * parts of device that are now unused by the file system.
- - */
- -#define DISCARD_NOBARRIER     (WRITE | REQ_DISCARD)
- -#define DISCARD_BARRIER               (WRITE | REQ_DISCARD | REQ_HARDBARRIER)
- -#define DISCARD_SECURE                (DISCARD_NOBARRIER | REQ_SECURE)
+ +#define WRITE_FLUSH           (WRITE | REQ_SYNC | REQ_NOIDLE | REQ_UNPLUG | \
+ +                               REQ_FLUSH)
+ +#define WRITE_FUA             (WRITE | REQ_SYNC | REQ_NOIDLE | REQ_UNPLUG | \
+ +                               REQ_FUA)
+ +#define WRITE_FLUSH_FUA               (WRITE | REQ_SYNC | REQ_NOIDLE | REQ_UNPLUG | \
+ +                               REQ_FLUSH | REQ_FUA)
   
   #define SEL_IN                1
   #define SEL_OUT               2
@@@ -234,7 -241,6 +240,7 @@@
   #define S_NOCMTIME    128     /* Do not update file c/mtime */
   #define S_SWAPFILE    256     /* Do not truncate: swapon got its bmaps */
   #define S_PRIVATE     512     /* Inode is fs-internal */
+ +#define S_IMA         1024    /* Inode has an associated IMA struct */
   
   /*
    * Note that nosuid etc flags are inode-specific: setting some file-system
@@@ -269,7 -275,6 +275,7 @@@
   #define IS_NOCMTIME(inode)    ((inode)->i_flags & S_NOCMTIME)
   #define IS_SWAPFILE(inode)    ((inode)->i_flags & S_SWAPFILE)
   #define IS_PRIVATE(inode)     ((inode)->i_flags & S_PRIVATE)
+ +#define IS_IMA(inode)         ((inode)->i_flags & S_IMA)
   
   /* the read-only stuff doesn't really belong here, but any other place is
      probably as bad and I don't want to create yet another include file. */
@@@ -317,6 -322,7 +323,7 @@@
   #define FIGETBSZ   _IO(0x00,2)        /* get the block size used for bmap */
   #define FIFREEZE      _IOWR('X', 119, int)    /* Freeze */
   #define FITHAW                _IOWR('X', 120, int)    /* Thaw */
+ #define FITRIM                _IOWR('X', 121, struct fstrim_range)    /* Trim */
   
   #define       FS_IOC_GETFLAGS                 _IOR('f', 1, long)
   #define       FS_IOC_SETFLAGS                 _IOW('f', 2, long)
@@@ -405,7 -411,7 +412,7 @@@ extern void __init inode_init_early(voi
   extern void __init files_init(unsigned long);
   
   extern struct files_stat_struct files_stat;
- -extern int get_max_files(void);
+ +extern unsigned long get_max_files(void);
   extern int sysctl_nr_open;
   extern struct inodes_stat_t inodes_stat;
   extern int leases_enable, lease_break_time;
@@@ -725,8 -731,7 +732,8 @@@ struct posix_acl
   
   struct inode {
         struct hlist_node       i_hash;
- -      struct list_head        i_list;         /* backing dev IO list */
+ +      struct list_head        i_wb_list;      /* backing dev IO list */
+ +      struct list_head        i_lru;          /* inode LRU list */
         struct list_head        i_sb_list;
         struct list_head        i_dentry;
         unsigned long           i_ino;
@@@ -778,10 -783,6 +785,10 @@@
   
         unsigned int            i_flags;
   
+ +#ifdef CONFIG_IMA
+ +      /* protected by i_lock */
+ +      unsigned int            i_readcount; /* struct files open RO */
+ +#endif
         atomic_t                i_writecount;
   #ifdef CONFIG_SECURITY
         void                    *i_security;
@@@ -793,11 -794,6 +800,11 @@@
         void                    *i_private; /* fs or device private pointer */
   };
   
+ +static inline int inode_unhashed(struct inode *inode)
+ +{
+ +      return hlist_unhashed(&inode->i_hash);
+ +}
+ +
   /*
    * inode->i_mutex nesting subclasses for the lock validator:
    *
@@@ -1104,6 -1100,10 +1111,6 @@@ struct file_lock 
   
   #include <linux/fcntl.h>
   
- -/* temporary stubs for BKL removal */
- -#define lock_flocks() lock_kernel()
- -#define unlock_flocks() unlock_kernel()
- -
   extern void send_sigio(struct fown_struct *fown, int fd, int band);
   
   #ifdef CONFIG_FILE_LOCKING
@@@ -1122,7 -1122,6 +1129,7 @@@ extern int fcntl_getlease(struct file *
   
   /* fs/locks.c */
   extern void locks_init_lock(struct file_lock *);
+ +extern struct file_lock * locks_alloc_lock(void);
   extern void locks_copy_lock(struct file_lock *, struct file_lock *);
   extern void __locks_copy_lock(struct file_lock *, const struct file_lock *);
   extern void locks_remove_posix(struct file *, fl_owner_t);
@@@ -1143,8 -1142,6 +1150,8 @@@ extern int vfs_setlease(struct file *, 
   extern int lease_modify(struct file_lock **, int);
   extern int lock_may_read(struct inode *, loff_t start, unsigned long count);
   extern int lock_may_write(struct inode *, loff_t start, unsigned long count);
+ +extern void lock_flocks(void);
+ +extern void unlock_flocks(void);
   #else /* !CONFIG_FILE_LOCKING */
   static inline int fcntl_getlk(struct file *file, struct flock __user *user)
   {
@@@ -1287,14 -1284,6 +1294,14 @@@ static inline int lock_may_write(struc
         return 1;
   }
   
+ +static inline void lock_flocks(void)
+ +{
+ +}
+ +
+ +static inline void unlock_flocks(void)
+ +{
+ +}
+ +
   #endif /* !CONFIG_FILE_LOCKING */
   
   
@@@ -1311,11 -1300,6 +1318,11 @@@ struct fasync_struct 
   
   /* SMP safe fasync helpers: */
   extern int fasync_helper(int, struct file *, int, struct fasync_struct **);
+ +extern struct fasync_struct *fasync_insert_entry(int, struct file *, struct fasync_struct **, struct fasync_struct *);
+ +extern int fasync_remove_entry(struct file *, struct fasync_struct **);
+ +extern struct fasync_struct *fasync_alloc(void);
+ +extern void fasync_free(struct fasync_struct *);
+ +
   /* can be called from interrupts */
   extern void kill_fasync(struct fasync_struct **, int, int);
   
@@@ -1407,7 -1391,7 +1414,7 @@@ struct super_block 
          * Saved mount options for lazy filesystems using
          * generic_show_options()
          */
- -      char *s_options;
+ +      char __rcu *s_options;
   };
   
   extern struct timespec current_fs_time(struct super_block *sb);
@@@ -1604,6 -1588,7 +1611,7 @@@ struct super_operations 
         ssize_t (*quota_write)(struct super_block *, int, const char *, size_t, loff_t);
   #endif
         int (*bdev_try_to_free_page)(struct super_block*, struct page*, gfp_t);
+       int (*trim_fs) (struct super_block *, struct fstrim_range *);
   };
   
   /*
@@@ -1654,17 -1639,16 +1662,17 @@@
    *
    * Q: What is the difference between I_WILL_FREE and I_FREEING?
    */
- -#define I_DIRTY_SYNC          1
- -#define I_DIRTY_DATASYNC      2
- -#define I_DIRTY_PAGES         4
+ +#define I_DIRTY_SYNC          (1 << 0)
+ +#define I_DIRTY_DATASYNC      (1 << 1)
+ +#define I_DIRTY_PAGES         (1 << 2)
   #define __I_NEW                       3
   #define I_NEW                 (1 << __I_NEW)
- -#define I_WILL_FREE           16
- -#define I_FREEING             32
- -#define I_CLEAR                       64
+ +#define I_WILL_FREE           (1 << 4)
+ +#define I_FREEING             (1 << 5)
+ +#define I_CLEAR                       (1 << 6)
   #define __I_SYNC              7
   #define I_SYNC                        (1 << __I_SYNC)
+ +#define I_REFERENCED          (1 << 8)
   
   #define I_DIRTY (I_DIRTY_SYNC | I_DIRTY_DATASYNC | I_DIRTY_PAGES)
   
@@@ -1756,7 -1740,6 +1764,7 @@@ static inline void file_accessed(struc
   }
   
   int sync_inode(struct inode *inode, struct writeback_control *wbc);
+ +int sync_inode_metadata(struct inode *inode, int wait);
   
   struct file_system_type {
         const char *name;
@@@ -2101,6 -2084,7 +2109,6 @@@ extern int check_disk_change(struct blo
   extern int __invalidate_device(struct block_device *);
   extern int invalidate_partition(struct gendisk *, int);
   #endif
- -extern int invalidate_inodes(struct super_block *);
   unsigned long invalidate_mapping_pages(struct address_space *mapping,
                                         pgoff_t start, pgoff_t end);
   
@@@ -2184,7 -2168,7 +2192,7 @@@ extern loff_t vfs_llseek(struct file *f
   
   extern int inode_init_always(struct super_block *, struct inode *);
   extern void inode_init_once(struct inode *);
- -extern void inode_add_to_lists(struct super_block *, struct inode *);
+ +extern void ihold(struct inode * inode);
   extern void iput(struct inode *);
   extern struct inode * igrab(struct inode *);
   extern ino_t iunique(struct super_block *, ino_t);
@@@ -2204,11 -2188,11 +2212,11 @@@ extern struct inode * iget_locked(struc
   extern int insert_inode_locked4(struct inode *, unsigned long, int (*test)(struct inode *, void *), void *);
   extern int insert_inode_locked(struct inode *);
   extern void unlock_new_inode(struct inode *);
+ +extern unsigned int get_next_ino(void);
   
   extern void __iget(struct inode * inode);
   extern void iget_failed(struct inode *);
   extern void end_writeback(struct inode *);
- -extern void destroy_inode(struct inode *);
   extern void __destroy_inode(struct inode *);
   extern struct inode *new_inode(struct super_block *);
   extern int should_remove_suid(struct dentry *);
@@@ -2216,11 -2200,9 +2224,11 @@@ extern int file_remove_suid(struct fil
   
   extern void __insert_inode_hash(struct inode *, unsigned long hashval);
   extern void remove_inode_hash(struct inode *);
- -static inline void insert_inode_hash(struct inode *inode) {
+ +static inline void insert_inode_hash(struct inode *inode)
+ +{
         __insert_inode_hash(inode, inode->i_ino);
   }
+ +extern void inode_sb_list_add(struct inode *inode);
   
   #ifdef CONFIG_BLOCK
   extern void submit_bio(int, struct bio *);
@@@ -2404,8 -2386,6 +2412,8 @@@ extern ssize_t simple_write_to_buffer(v
   
   extern int generic_file_fsync(struct file *, int);
   
+ +extern int generic_check_addressable(unsigned, u64);
+ +
   #ifdef CONFIG_MIGRATION
   extern int buffer_migrate_page(struct address_space *,
                                 struct page *, struct page *);
@@@ -2482,7 -2462,6 +2490,7 @@@ static const struct file_operations __f
         .release = simple_attr_release,                                 \
         .read    = simple_attr_read,                                    \
         .write   = simple_attr_write,                                   \
+ +      .llseek  = generic_file_llseek,                                 \
   };
   
   static inline void __attribute__((format(printf, 1, 2)))
@@@ -2503,10 -2482,7 +2511,10 @@@ ssize_t simple_attr_write(struct file *
   struct ctl_table;
   int proc_nr_files(struct ctl_table *table, int write,
                   void __user *buffer, size_t *lenp, loff_t *ppos);
- -
+ +int proc_nr_dentry(struct ctl_table *table, int write,
+ +                void __user *buffer, size_t *lenp, loff_t *ppos);
+ +int proc_nr_inodes(struct ctl_table *table, int write,
+ +                 void __user *buffer, size_t *lenp, loff_t *ppos);
   int __init get_filesystem_list(char *buf);
   
   #define ACC_MODE(x) ("\004\002\006\006"[(x)&O_ACCMODE])
diff --combined include/linux/writeback.h

index d5c7aaadda59a926032794a4d3a27a46f01bab3c,3d132bfb4f3d33d7497ec3ff398421139f263d3d..09eec350054d0259fb36f2e1b1447f6b5a0bde72
--- 1/include/linux/writeback.h
--- 2/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@@ -10,6 -10,8 +10,6 @@@
   struct backing_dev_info;
   
   extern spinlock_t inode_lock;
- -extern struct list_head inode_in_use;
- -extern struct list_head inode_unused;
   
   /*
    * fs/fs-writeback.c
@@@ -141,14 -143,14 +141,16 @@@ typedef int (*writepage_t)(struct page 
   
   int generic_writepages(struct address_space *mapping,
                        struct writeback_control *wbc);
+ void tag_pages_for_writeback(struct address_space *mapping,
+                            pgoff_t start, pgoff_t end);
   int write_cache_pages(struct address_space *mapping,
                       struct writeback_control *wbc, writepage_t writepage,
                       void *data);
   int do_writepages(struct address_space *mapping, struct writeback_control *wbc);
   void set_page_dirty_balance(struct page *page, int page_mkwrite);
   void writeback_set_ratelimit(void);
+ +void tag_pages_for_writeback(struct address_space *mapping,
+ +                           pgoff_t start, pgoff_t end);
   
   /* pdflush.c */
   extern int nr_pdflush_threads;        /* Global so it can be exported to sysctl
diff --combined include/trace/events/ext4.h

index 6bcb00645de47ffb9262c1c366fa14e82291e887,8f59db107bbbff793ffb6f496d17343aa8cd675c..289010d3270bfa99191cfbdfa95a02564b6638bf
--- 1/include/trace/events/ext4.h
--- 2/include/trace/events/ext4.h
+++ b/include/trace/events/ext4.h
@@@ -21,7 -21,8 +21,8 @@@ TRACE_EVENT(ext4_free_inode
         TP_ARGS(inode),
   
         TP_STRUCT__entry(
-               __field(        dev_t,  dev                     )
+               __field(        int,   dev_major                )
+               __field(        int,   dev_minor                )
                 __field(        ino_t,  ino                     )
                 __field(        umode_t, mode                   )
                 __field(        uid_t,  uid                     )
@@@ -30,7 -31,8 +31,8 @@@
         ),
   
         TP_fast_assign(
-               __entry->dev    = inode->i_sb->s_dev;
+               __entry->dev_major = MAJOR(inode->i_sb->s_dev);
+               __entry->dev_minor = MINOR(inode->i_sb->s_dev);
                 __entry->ino    = inode->i_ino;
                 __entry->mode   = inode->i_mode;
                 __entry->uid    = inode->i_uid;
@@@ -38,9 -40,10 +40,10 @@@
                 __entry->blocks = inode->i_blocks;
         ),
   
-       TP_printk("dev %s ino %lu mode 0%o uid %u gid %u blocks %llu",
-                 jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino,
-                 __entry->mode, __entry->uid, __entry->gid,
+       TP_printk("dev %d,%d ino %lu mode 0%o uid %u gid %u blocks %llu",
+                 __entry->dev_major, __entry->dev_minor,
+                 (unsigned long) __entry->ino, __entry->mode,
+                 __entry->uid, __entry->gid,
                   (unsigned long long) __entry->blocks)
   );
   
@@@ -50,20 -53,22 +53,22 @@@ TRACE_EVENT(ext4_request_inode
         TP_ARGS(dir, mode),
   
         TP_STRUCT__entry(
-               __field(        dev_t,  dev                     )
+               __field(        int,   dev_major                )
+               __field(        int,   dev_minor                )
                 __field(        ino_t,  dir                     )
                 __field(        umode_t, mode                   )
         ),
   
         TP_fast_assign(
-               __entry->dev    = dir->i_sb->s_dev;
+               __entry->dev_major = MAJOR(dir->i_sb->s_dev);
+               __entry->dev_minor = MINOR(dir->i_sb->s_dev);
                 __entry->dir    = dir->i_ino;
                 __entry->mode   = mode;
         ),
   
-       TP_printk("dev %s dir %lu mode 0%o",
-                 jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->dir,
-                 __entry->mode)
+       TP_printk("dev %d,%d dir %lu mode 0%o",
+                 __entry->dev_major, __entry->dev_minor,
+                 (unsigned long) __entry->dir, __entry->mode)
   );
   
   TRACE_EVENT(ext4_allocate_inode,
@@@ -72,21 -77,24 +77,24 @@@
         TP_ARGS(inode, dir, mode),
   
         TP_STRUCT__entry(
-               __field(        dev_t,  dev                     )
+               __field(        int,   dev_major                )
+               __field(        int,   dev_minor                )
                 __field(        ino_t,  ino                     )
                 __field(        ino_t,  dir                     )
                 __field(        umode_t, mode                   )
         ),
   
         TP_fast_assign(
-               __entry->dev    = inode->i_sb->s_dev;
+               __entry->dev_major = MAJOR(inode->i_sb->s_dev);
+               __entry->dev_minor = MINOR(inode->i_sb->s_dev);
                 __entry->ino    = inode->i_ino;
                 __entry->dir    = dir->i_ino;
                 __entry->mode   = mode;
         ),
   
-       TP_printk("dev %s ino %lu dir %lu mode 0%o",
-                 jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino,
+       TP_printk("dev %d,%d ino %lu dir %lu mode 0%o",
+                 __entry->dev_major, __entry->dev_minor,
+                 (unsigned long) __entry->ino,
                   (unsigned long) __entry->dir, __entry->mode)
   );
   
@@@ -98,7 -106,8 +106,8 @@@ DECLARE_EVENT_CLASS(ext4__write_begin
         TP_ARGS(inode, pos, len, flags),
   
         TP_STRUCT__entry(
-               __field(        dev_t,  dev                     )
+               __field(        int,   dev_major                )
+               __field(        int,   dev_minor                )
                 __field(        ino_t,  ino                     )
                 __field(        loff_t, pos                     )
                 __field(        unsigned int, len               )
@@@ -106,15 -115,17 +115,17 @@@
         ),
   
         TP_fast_assign(
-               __entry->dev    = inode->i_sb->s_dev;
+               __entry->dev_major = MAJOR(inode->i_sb->s_dev);
+               __entry->dev_minor = MINOR(inode->i_sb->s_dev);
                 __entry->ino    = inode->i_ino;
                 __entry->pos    = pos;
                 __entry->len    = len;
                 __entry->flags  = flags;
         ),
   
-       TP_printk("dev %s ino %lu pos %llu len %u flags %u",
-                 jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino,
+       TP_printk("dev %d,%d ino %lu pos %llu len %u flags %u",
+                 __entry->dev_major, __entry->dev_minor,
+                 (unsigned long) __entry->ino,
                   __entry->pos, __entry->len, __entry->flags)
   );
   
@@@ -141,7 -152,8 +152,8 @@@ DECLARE_EVENT_CLASS(ext4__write_end
         TP_ARGS(inode, pos, len, copied),
   
         TP_STRUCT__entry(
-               __field(        dev_t,  dev                     )
+               __field(        int,   dev_major                )
+               __field(        int,   dev_minor                )
                 __field(        ino_t,  ino                     )
                 __field(        loff_t, pos                     )
                 __field(        unsigned int, len               )
@@@ -149,16 -161,18 +161,18 @@@
         ),
   
         TP_fast_assign(
-               __entry->dev    = inode->i_sb->s_dev;
+               __entry->dev_major = MAJOR(inode->i_sb->s_dev);
+               __entry->dev_minor = MINOR(inode->i_sb->s_dev);
                 __entry->ino    = inode->i_ino;
                 __entry->pos    = pos;
                 __entry->len    = len;
                 __entry->copied = copied;
         ),
   
-       TP_printk("dev %s ino %lu pos %llu len %u copied %u",
-                 jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino,
-                 __entry->pos, __entry->len, __entry->copied)
+       TP_printk("dev %d,%d ino %lu pos %llu len %u copied %u",
+                 __entry->dev_major, __entry->dev_minor,
+                 (unsigned long) __entry->ino, __entry->pos,
+                 __entry->len, __entry->copied)
   );
   
   DEFINE_EVENT(ext4__write_end, ext4_ordered_write_end,
@@@ -199,21 -213,23 +213,23 @@@ TRACE_EVENT(ext4_writepage
         TP_ARGS(inode, page),
   
         TP_STRUCT__entry(
-               __field(        dev_t,  dev                     )
+               __field(        int,   dev_major                )
+               __field(        int,   dev_minor                )
                 __field(        ino_t,  ino                     )
                 __field(        pgoff_t, index                  )
   
         ),
   
         TP_fast_assign(
-               __entry->dev    = inode->i_sb->s_dev;
+               __entry->dev_major = MAJOR(inode->i_sb->s_dev);
+               __entry->dev_minor = MINOR(inode->i_sb->s_dev);
                 __entry->ino    = inode->i_ino;
                 __entry->index  = page->index;
         ),
   
-       TP_printk("dev %s ino %lu page_index %lu",
-                 jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino,
-                 __entry->index)
+       TP_printk("dev %d,%d ino %lu page_index %lu",
+                 __entry->dev_major, __entry->dev_minor,
+                 (unsigned long) __entry->ino, __entry->index)
   );
   
   TRACE_EVENT(ext4_da_writepages,
@@@ -222,13 -238,14 +238,13 @@@
         TP_ARGS(inode, wbc),
   
         TP_STRUCT__entry(
-               __field(        dev_t,  dev                     )
+               __field(        int,   dev_major                )
+               __field(        int,   dev_minor                )
                 __field(        ino_t,  ino                     )
                 __field(        long,   nr_to_write             )
                 __field(        long,   pages_skipped           )
                 __field(        loff_t, range_start             )
                 __field(        loff_t, range_end               )
--              __field(        char,   nonblocking             )
                 __field(        char,   for_kupdate             )
                 __field(        char,   for_reclaim             )
                 __field(        char,   range_cyclic            )
@@@ -236,26 -253,25 +252,27 @@@
         ),
   
         TP_fast_assign(
-               __entry->dev            = inode->i_sb->s_dev;
+               __entry->dev_major      = MAJOR(inode->i_sb->s_dev);
+               __entry->dev_minor      = MINOR(inode->i_sb->s_dev);
                 __entry->ino            = inode->i_ino;
                 __entry->nr_to_write    = wbc->nr_to_write;
                 __entry->pages_skipped  = wbc->pages_skipped;
                 __entry->range_start    = wbc->range_start;
                 __entry->range_end      = wbc->range_end;
- -              __entry->nonblocking    = wbc->nonblocking;
                 __entry->for_kupdate    = wbc->for_kupdate;
                 __entry->for_reclaim    = wbc->for_reclaim;
                 __entry->range_cyclic   = wbc->range_cyclic;
                 __entry->writeback_index = inode->i_mapping->writeback_index;
         ),
   
-       TP_printk("dev %s ino %lu nr_to_write %ld pages_skipped %ld "
- -      TP_printk("dev %d,%d ino %lu nr_to_write %ld pages_skipped %ld range_start %llu range_end %llu nonblocking %d for_kupdate %d for_reclaim %d range_cyclic %d writeback_index %lu",
++      TP_printk("dev %d,%d ino %lu nr_to_write %ld pages_skipped %ld "
+ +                "range_start %llu range_end %llu "
+ +                "for_kupdate %d for_reclaim %d "
+ +                "range_cyclic %d writeback_index %lu",
-                 jbd2_dev_to_name(__entry->dev),
+                 __entry->dev_major, __entry->dev_minor,
                   (unsigned long) __entry->ino, __entry->nr_to_write,
                   __entry->pages_skipped, __entry->range_start,
- -                __entry->range_end, __entry->nonblocking,
+ +                __entry->range_end,
                   __entry->for_kupdate, __entry->for_reclaim,
                   __entry->range_cyclic,
                   (unsigned long) __entry->writeback_index)
@@@ -267,7 -283,8 +284,8 @@@ TRACE_EVENT(ext4_da_write_pages
         TP_ARGS(inode, mpd),
   
         TP_STRUCT__entry(
-               __field(        dev_t,  dev                     )
+               __field(        int,   dev_major                )
+               __field(        int,   dev_minor                )
                 __field(        ino_t,  ino                     )
                 __field(        __u64,  b_blocknr               )
                 __field(        __u32,  b_size                  )
@@@ -278,7 -295,8 +296,8 @@@
         ),
   
         TP_fast_assign(
-               __entry->dev            = inode->i_sb->s_dev;
+               __entry->dev_major      = MAJOR(inode->i_sb->s_dev);
+               __entry->dev_minor      = MINOR(inode->i_sb->s_dev);
                 __entry->ino            = inode->i_ino;
                 __entry->b_blocknr      = mpd->b_blocknr;
                 __entry->b_size         = mpd->b_size;
@@@ -288,8 -306,9 +307,9 @@@
                 __entry->pages_written  = mpd->pages_written;
         ),
   
-       TP_printk("dev %s ino %lu b_blocknr %llu b_size %u b_state 0x%04x first_page %lu io_done %d pages_written %d",
-                 jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino,
+       TP_printk("dev %d,%d ino %lu b_blocknr %llu b_size %u b_state 0x%04x first_page %lu io_done %d pages_written %d",
+                 __entry->dev_major, __entry->dev_minor,
+                 (unsigned long) __entry->ino,
                   __entry->b_blocknr, __entry->b_size,
                   __entry->b_state, __entry->first_page,
                   __entry->io_done, __entry->pages_written)
@@@ -302,7 -321,8 +322,8 @@@ TRACE_EVENT(ext4_da_writepages_result
         TP_ARGS(inode, wbc, ret, pages_written),
   
         TP_STRUCT__entry(
-               __field(        dev_t,  dev                     )
+               __field(        int,   dev_major                )
+               __field(        int,   dev_minor                )
                 __field(        ino_t,  ino                     )
                 __field(        int,    ret                     )
                 __field(        int,    pages_written           )
@@@ -312,7 -332,8 +333,8 @@@
         ),
   
         TP_fast_assign(
-               __entry->dev            = inode->i_sb->s_dev;
+               __entry->dev_major      = MAJOR(inode->i_sb->s_dev);
+               __entry->dev_minor      = MINOR(inode->i_sb->s_dev);
                 __entry->ino            = inode->i_ino;
                 __entry->ret            = ret;
                 __entry->pages_written  = pages_written;
@@@ -321,8 -342,8 +343,8 @@@
                 __entry->writeback_index = inode->i_mapping->writeback_index;
         ),
   
-       TP_printk("dev %s ino %lu ret %d pages_written %d pages_skipped %ld more_io %d writeback_index %lu",
-                 jbd2_dev_to_name(__entry->dev),
+       TP_printk("dev %d,%d ino %lu ret %d pages_written %d pages_skipped %ld more_io %d writeback_index %lu",
+                 __entry->dev_major, __entry->dev_minor,
                   (unsigned long) __entry->ino, __entry->ret,
                   __entry->pages_written, __entry->pages_skipped,
                   __entry->more_io,
@@@ -336,20 -357,23 +358,23 @@@ TRACE_EVENT(ext4_discard_blocks
         TP_ARGS(sb, blk, count),
   
         TP_STRUCT__entry(
-               __field(        dev_t,  dev                     )
+               __field(        int,   dev_major                )
+               __field(        int,   dev_minor                )
                 __field(        __u64,  blk                     )
                 __field(        __u64,  count                   )
   
         ),
   
         TP_fast_assign(
-               __entry->dev    = sb->s_dev;
+               __entry->dev_major = MAJOR(sb->s_dev);
+               __entry->dev_minor = MINOR(sb->s_dev);
                 __entry->blk    = blk;
                 __entry->count  = count;
         ),
   
-       TP_printk("dev %s blk %llu count %llu",
-                 jbd2_dev_to_name(__entry->dev), __entry->blk, __entry->count)
+       TP_printk("dev %d,%d blk %llu count %llu",
+                 __entry->dev_major, __entry->dev_minor,
+                 __entry->blk, __entry->count)
   );
   
   DECLARE_EVENT_CLASS(ext4__mb_new_pa,
@@@ -359,7 -383,8 +384,8 @@@
         TP_ARGS(ac, pa),
   
         TP_STRUCT__entry(
-               __field(        dev_t,  dev                     )
+               __field(        int,   dev_major                )
+               __field(        int,   dev_minor                )
                 __field(        ino_t,  ino                     )
                 __field(        __u64,  pa_pstart               )
                 __field(        __u32,  pa_len                  )
@@@ -368,16 -393,18 +394,18 @@@
         ),
   
         TP_fast_assign(
-               __entry->dev            = ac->ac_sb->s_dev;
+               __entry->dev_major      = MAJOR(ac->ac_sb->s_dev);
+               __entry->dev_minor      = MINOR(ac->ac_sb->s_dev);
                 __entry->ino            = ac->ac_inode->i_ino;
                 __entry->pa_pstart      = pa->pa_pstart;
                 __entry->pa_len         = pa->pa_len;
                 __entry->pa_lstart      = pa->pa_lstart;
         ),
   
-       TP_printk("dev %s ino %lu pstart %llu len %u lstart %llu",
-                 jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino,
-                 __entry->pa_pstart, __entry->pa_len, __entry->pa_lstart)
+       TP_printk("dev %d,%d ino %lu pstart %llu len %u lstart %llu",
+                 __entry->dev_major, __entry->dev_minor,
+                 (unsigned long) __entry->ino, __entry->pa_pstart,
+                 __entry->pa_len, __entry->pa_lstart)
   );
   
   DEFINE_EVENT(ext4__mb_new_pa, ext4_mb_new_inode_pa,
@@@ -398,14 -425,15 +426,15 @@@ DEFINE_EVENT(ext4__mb_new_pa, ext4_mb_n
   
   TRACE_EVENT(ext4_mb_release_inode_pa,
         TP_PROTO(struct super_block *sb,
-                struct ext4_allocation_context *ac,
+                struct inode *inode,
                  struct ext4_prealloc_space *pa,
                  unsigned long long block, unsigned int count),
   
-       TP_ARGS(sb, ac, pa, block, count),
+       TP_ARGS(sb, inode, pa, block, count),
   
         TP_STRUCT__entry(
-               __field(        dev_t,  dev                     )
+               __field(        int,   dev_major                )
+               __field(        int,   dev_minor                )
                 __field(        ino_t,  ino                     )
                 __field(        __u64,  block                   )
                 __field(        __u32,  count                   )
@@@ -413,43 -441,42 +442,42 @@@
         ),
   
         TP_fast_assign(
-               __entry->dev            = sb->s_dev;
-               __entry->ino            = (ac && ac->ac_inode) ? 
-                                               ac->ac_inode->i_ino : 0;
+               __entry->dev_major      = MAJOR(sb->s_dev);
+               __entry->dev_minor      = MINOR(sb->s_dev);
+               __entry->ino            = inode->i_ino;
                 __entry->block          = block;
                 __entry->count          = count;
         ),
   
-       TP_printk("dev %s ino %lu block %llu count %u",
-                 jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino,
-                 __entry->block, __entry->count)
+       TP_printk("dev %d,%d ino %lu block %llu count %u",
+                 __entry->dev_major, __entry->dev_minor,
+                 (unsigned long) __entry->ino, __entry->block, __entry->count)
   );
   
   TRACE_EVENT(ext4_mb_release_group_pa,
         TP_PROTO(struct super_block *sb,
-                struct ext4_allocation_context *ac,
                  struct ext4_prealloc_space *pa),
   
-       TP_ARGS(sb, ac, pa),
+       TP_ARGS(sb, pa),
   
         TP_STRUCT__entry(
-               __field(        dev_t,  dev                     )
-               __field(        ino_t,  ino                     )
+               __field(        int,   dev_major                )
+               __field(        int,   dev_minor                )
                 __field(        __u64,  pa_pstart               )
                 __field(        __u32,  pa_len                  )
   
         ),
   
         TP_fast_assign(
-               __entry->dev            = sb->s_dev;
-               __entry->ino            = (ac && ac->ac_inode) ?
-                                               ac->ac_inode->i_ino : 0;
+               __entry->dev_major      = MAJOR(sb->s_dev);
+               __entry->dev_minor      = MINOR(sb->s_dev);
                 __entry->pa_pstart      = pa->pa_pstart;
                 __entry->pa_len         = pa->pa_len;
         ),
   
-       TP_printk("dev %s pstart %llu len %u",
-                 jbd2_dev_to_name(__entry->dev), __entry->pa_pstart, __entry->pa_len)
+       TP_printk("dev %d,%d pstart %llu len %u",
+                 __entry->dev_major, __entry->dev_minor,
+                 __entry->pa_pstart, __entry->pa_len)
   );
   
   TRACE_EVENT(ext4_discard_preallocations,
@@@ -458,18 -485,21 +486,21 @@@
         TP_ARGS(inode),
   
         TP_STRUCT__entry(
-               __field(        dev_t,  dev                     )
+               __field(        int,   dev_major                )
+               __field(        int,   dev_minor                )
                 __field(        ino_t,  ino                     )
   
         ),
   
         TP_fast_assign(
-               __entry->dev    = inode->i_sb->s_dev;
+               __entry->dev_major = MAJOR(inode->i_sb->s_dev);
+               __entry->dev_minor = MINOR(inode->i_sb->s_dev);
                 __entry->ino    = inode->i_ino;
         ),
   
-       TP_printk("dev %s ino %lu",
-                 jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino)
+       TP_printk("dev %d,%d ino %lu",
+                 __entry->dev_major, __entry->dev_minor,
+                 (unsigned long) __entry->ino)
   );
   
   TRACE_EVENT(ext4_mb_discard_preallocations,
@@@ -478,18 -508,20 +509,20 @@@
         TP_ARGS(sb, needed),
   
         TP_STRUCT__entry(
-               __field(        dev_t,  dev                     )
+               __field(        int,   dev_major                )
+               __field(        int,   dev_minor                )
                 __field(        int,    needed                  )
   
         ),
   
         TP_fast_assign(
-               __entry->dev    = sb->s_dev;
+               __entry->dev_major = MAJOR(sb->s_dev);
+               __entry->dev_minor = MINOR(sb->s_dev);
                 __entry->needed = needed;
         ),
   
-       TP_printk("dev %s needed %d",
-                 jbd2_dev_to_name(__entry->dev), __entry->needed)
+       TP_printk("dev %d,%d needed %d",
+                 __entry->dev_major, __entry->dev_minor, __entry->needed)
   );
   
   TRACE_EVENT(ext4_request_blocks,
@@@ -498,7 -530,8 +531,8 @@@
         TP_ARGS(ar),
   
         TP_STRUCT__entry(
-               __field(        dev_t,  dev                     )
+               __field(        int,   dev_major                )
+               __field(        int,   dev_minor                )
                 __field(        ino_t,  ino                     )
                 __field(        unsigned int, flags             )
                 __field(        unsigned int, len               )
@@@ -511,7 -544,8 +545,8 @@@
         ),
   
         TP_fast_assign(
-               __entry->dev    = ar->inode->i_sb->s_dev;
+               __entry->dev_major = MAJOR(ar->inode->i_sb->s_dev);
+               __entry->dev_minor = MINOR(ar->inode->i_sb->s_dev);
                 __entry->ino    = ar->inode->i_ino;
                 __entry->flags  = ar->flags;
                 __entry->len    = ar->len;
@@@ -523,8 -557,9 +558,9 @@@
                 __entry->pright = ar->pright;
         ),
   
-       TP_printk("dev %s ino %lu flags %u len %u lblk %llu goal %llu lleft %llu lright %llu pleft %llu pright %llu ",
-                 jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino,
+       TP_printk("dev %d,%d ino %lu flags %u len %u lblk %llu goal %llu lleft %llu lright %llu pleft %llu pright %llu ",
+                 __entry->dev_major, __entry->dev_minor,
+                 (unsigned long) __entry->ino,
                   __entry->flags, __entry->len,
                   (unsigned long long) __entry->logical,
                   (unsigned long long) __entry->goal,
@@@ -540,7 -575,8 +576,8 @@@ TRACE_EVENT(ext4_allocate_blocks
         TP_ARGS(ar, block),
   
         TP_STRUCT__entry(
-               __field(        dev_t,  dev                     )
+               __field(        int,   dev_major                )
+               __field(        int,   dev_minor                )
                 __field(        ino_t,  ino                     )
                 __field(        __u64,  block                   )
                 __field(        unsigned int, flags             )
@@@ -554,7 -590,8 +591,8 @@@
         ),
   
         TP_fast_assign(
-               __entry->dev    = ar->inode->i_sb->s_dev;
+               __entry->dev_major = MAJOR(ar->inode->i_sb->s_dev);
+               __entry->dev_minor = MINOR(ar->inode->i_sb->s_dev);
                 __entry->ino    = ar->inode->i_ino;
                 __entry->block  = block;
                 __entry->flags  = ar->flags;
@@@ -567,9 -604,10 +605,10 @@@
                 __entry->pright = ar->pright;
         ),
   
-       TP_printk("dev %s ino %lu flags %u len %u block %llu lblk %llu goal %llu lleft %llu lright %llu pleft %llu pright %llu ",
-                 jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino,
-                 __entry->flags, __entry->len, __entry->block,
+       TP_printk("dev %d,%d ino %lu flags %u len %u block %llu lblk %llu goal %llu lleft %llu lright %llu pleft %llu pright %llu ",
+                 __entry->dev_major, __entry->dev_minor,
+                 (unsigned long) __entry->ino, __entry->flags,
+                 __entry->len, __entry->block,
                   (unsigned long long) __entry->logical,
                   (unsigned long long) __entry->goal,
                   (unsigned long long) __entry->lleft,
@@@ -585,7 -623,8 +624,8 @@@ TRACE_EVENT(ext4_free_blocks
         TP_ARGS(inode, block, count, flags),
   
         TP_STRUCT__entry(
-               __field(        dev_t,  dev                     )
+               __field(        int,   dev_major                )
+               __field(        int,   dev_minor                )
                 __field(        ino_t,  ino                     )
                 __field(      umode_t, mode                     )
                 __field(        __u64,  block                   )
@@@ -594,7 -633,8 +634,8 @@@
         ),
   
         TP_fast_assign(
-               __entry->dev            = inode->i_sb->s_dev;
+               __entry->dev_major      = MAJOR(inode->i_sb->s_dev);
+               __entry->dev_minor      = MINOR(inode->i_sb->s_dev);
                 __entry->ino            = inode->i_ino;
                 __entry->mode           = inode->i_mode;
                 __entry->block          = block;
@@@ -602,8 -642,9 +643,9 @@@
                 __entry->flags          = flags;
         ),
   
-       TP_printk("dev %s ino %lu mode 0%o block %llu count %lu flags %d",
-                 jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino,
+       TP_printk("dev %d,%d ino %lu mode 0%o block %llu count %lu flags %d",
+                 __entry->dev_major, __entry->dev_minor,
+                 (unsigned long) __entry->ino,
                   __entry->mode, __entry->block, __entry->count,
                   __entry->flags)
   );
@@@ -614,7 -655,8 +656,8 @@@ TRACE_EVENT(ext4_sync_file
         TP_ARGS(file, datasync),
   
         TP_STRUCT__entry(
-               __field(        dev_t,  dev                     )
+               __field(        int,   dev_major                )
+               __field(        int,   dev_minor                )
                 __field(        ino_t,  ino                     )
                 __field(        ino_t,  parent                  )
                 __field(        int,    datasync                )
@@@ -623,14 -665,16 +666,16 @@@
         TP_fast_assign(
                 struct dentry *dentry = file->f_path.dentry;
   
-               __entry->dev            = dentry->d_inode->i_sb->s_dev;
+               __entry->dev_major      = MAJOR(dentry->d_inode->i_sb->s_dev);
+               __entry->dev_minor      = MINOR(dentry->d_inode->i_sb->s_dev);
                 __entry->ino            = dentry->d_inode->i_ino;
                 __entry->datasync       = datasync;
                 __entry->parent         = dentry->d_parent->d_inode->i_ino;
         ),
   
-       TP_printk("dev %s ino %ld parent %ld datasync %d ",
-                 jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino,
+       TP_printk("dev %d,%d ino %ld parent %ld datasync %d ",
+                 __entry->dev_major, __entry->dev_minor,
+                 (unsigned long) __entry->ino,
                   (unsigned long) __entry->parent, __entry->datasync)
   );
   
@@@ -640,18 -684,20 +685,20 @@@ TRACE_EVENT(ext4_sync_fs
         TP_ARGS(sb, wait),
   
         TP_STRUCT__entry(
-               __field(        dev_t,  dev                     )
+               __field(        int,   dev_major                )
+               __field(        int,   dev_minor                )
                 __field(        int,    wait                    )
   
         ),
   
         TP_fast_assign(
-               __entry->dev    = sb->s_dev;
+               __entry->dev_major = MAJOR(sb->s_dev);
+               __entry->dev_minor = MINOR(sb->s_dev);
                 __entry->wait   = wait;
         ),
   
-       TP_printk("dev %s wait %d", jbd2_dev_to_name(__entry->dev),
-                 __entry->wait)
+       TP_printk("dev %d,%d wait %d", __entry->dev_major,
+                 __entry->dev_minor, __entry->wait)
   );
   
   TRACE_EVENT(ext4_alloc_da_blocks,
@@@ -660,21 -706,24 +707,24 @@@
         TP_ARGS(inode),
   
         TP_STRUCT__entry(
-               __field(        dev_t,  dev                     )
+               __field(        int,   dev_major                )
+               __field(        int,   dev_minor                )
                 __field(        ino_t,  ino                     )
                 __field( unsigned int,  data_blocks     )
                 __field( unsigned int,  meta_blocks     )
         ),
   
         TP_fast_assign(
-               __entry->dev    = inode->i_sb->s_dev;
+               __entry->dev_major = MAJOR(inode->i_sb->s_dev);
+               __entry->dev_minor = MINOR(inode->i_sb->s_dev);
                 __entry->ino    = inode->i_ino;
                 __entry->data_blocks = EXT4_I(inode)->i_reserved_data_blocks;
                 __entry->meta_blocks = EXT4_I(inode)->i_reserved_meta_blocks;
         ),
   
-       TP_printk("dev %s ino %lu data_blocks %u meta_blocks %u",
-                 jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino,
+       TP_printk("dev %d,%d ino %lu data_blocks %u meta_blocks %u",
+                 __entry->dev_major, __entry->dev_minor,
+                 (unsigned long) __entry->ino,
                   __entry->data_blocks, __entry->meta_blocks)
   );
   
@@@ -684,7 -733,8 +734,8 @@@ TRACE_EVENT(ext4_mballoc_alloc
         TP_ARGS(ac),
   
         TP_STRUCT__entry(
-               __field(        dev_t,  dev                     )
+               __field(        int,   dev_major                )
+               __field(        int,   dev_minor                )
                 __field(        ino_t,  ino                     )
                 __field(        __u16,  found                   )
                 __field(        __u16,  groups                  )
@@@ -707,7 -757,8 +758,8 @@@
         ),
   
         TP_fast_assign(
-               __entry->dev            = ac->ac_inode->i_sb->s_dev;
+               __entry->dev_major      = MAJOR(ac->ac_inode->i_sb->s_dev);
+               __entry->dev_minor      = MINOR(ac->ac_inode->i_sb->s_dev);
                 __entry->ino            = ac->ac_inode->i_ino;
                 __entry->found          = ac->ac_found;
                 __entry->flags          = ac->ac_flags;
@@@ -729,10 -780,11 +781,11 @@@
                 __entry->result_len     = ac->ac_f_ex.fe_len;
         ),
   
-       TP_printk("dev %s inode %lu orig %u/%d/%u@%u goal %u/%d/%u@%u "
+       TP_printk("dev %d,%d inode %lu orig %u/%d/%u@%u goal %u/%d/%u@%u "
                   "result %u/%d/%u@%u blks %u grps %u cr %u flags 0x%04x "
                   "tail %u broken %u",
-                 jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino,
+                 __entry->dev_major, __entry->dev_minor,
+                 (unsigned long) __entry->ino,
                   __entry->orig_group, __entry->orig_start,
                   __entry->orig_len, __entry->orig_logical,
                   __entry->goal_group, __entry->goal_start,
@@@ -750,7 -802,8 +803,8 @@@ TRACE_EVENT(ext4_mballoc_prealloc
         TP_ARGS(ac),
   
         TP_STRUCT__entry(
-               __field(        dev_t,  dev                     )
+               __field(        int,   dev_major                )
+               __field(        int,   dev_minor                )
                 __field(        ino_t,  ino                     )
                 __field(        __u32,  orig_logical            )
                 __field(          int,  orig_start              )
@@@ -763,7 -816,8 +817,8 @@@
         ),
   
         TP_fast_assign(
-               __entry->dev            = ac->ac_inode->i_sb->s_dev;
+               __entry->dev_major      = MAJOR(ac->ac_inode->i_sb->s_dev);
+               __entry->dev_minor      = MINOR(ac->ac_inode->i_sb->s_dev);
                 __entry->ino            = ac->ac_inode->i_ino;
                 __entry->orig_logical   = ac->ac_o_ex.fe_logical;
                 __entry->orig_start     = ac->ac_o_ex.fe_start;
@@@ -775,8 -829,9 +830,9 @@@
                 __entry->result_len     = ac->ac_b_ex.fe_len;
         ),
   
-       TP_printk("dev %s inode %lu orig %u/%d/%u@%u result %u/%d/%u@%u",
-                 jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino,
+       TP_printk("dev %d,%d inode %lu orig %u/%d/%u@%u result %u/%d/%u@%u",
+                 __entry->dev_major, __entry->dev_minor,
+                 (unsigned long) __entry->ino,
                   __entry->orig_group, __entry->orig_start,
                   __entry->orig_len, __entry->orig_logical,
                   __entry->result_group, __entry->result_start,
@@@ -784,46 -839,59 +840,59 @@@
   );
   
   DECLARE_EVENT_CLASS(ext4__mballoc,
-       TP_PROTO(struct ext4_allocation_context *ac),
+       TP_PROTO(struct super_block *sb,
+                struct inode *inode,
+                ext4_group_t group,
+                ext4_grpblk_t start,
+                ext4_grpblk_t len),
   
-       TP_ARGS(ac),
+       TP_ARGS(sb, inode, group, start, len),
   
         TP_STRUCT__entry(
-               __field(        dev_t,  dev                     )
+               __field(        int,   dev_major                )
+               __field(        int,   dev_minor                )
                 __field(        ino_t,  ino                     )
-               __field(        __u32,  result_logical          )
                 __field(          int,  result_start            )
                 __field(        __u32,  result_group            )
                 __field(          int,  result_len              )
         ),
   
         TP_fast_assign(
-               __entry->dev            = ac->ac_inode->i_sb->s_dev;
-               __entry->ino            = ac->ac_inode->i_ino;
-               __entry->result_logical = ac->ac_b_ex.fe_logical;
-               __entry->result_start   = ac->ac_b_ex.fe_start;
-               __entry->result_group   = ac->ac_b_ex.fe_group;
-               __entry->result_len     = ac->ac_b_ex.fe_len;
+               __entry->dev_major      = MAJOR(sb->s_dev);
+               __entry->dev_minor      = MINOR(sb->s_dev);
+               __entry->ino            = inode ? inode->i_ino : 0;
+               __entry->result_start   = start;
+               __entry->result_group   = group;
+               __entry->result_len     = len;
         ),
   
-       TP_printk("dev %s inode %lu extent %u/%d/%u@%u ",
-                 jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino,
+       TP_printk("dev %d,%d inode %lu extent %u/%d/%u ",
+                 __entry->dev_major, __entry->dev_minor,
+                 (unsigned long) __entry->ino,
                   __entry->result_group, __entry->result_start,
-                 __entry->result_len, __entry->result_logical)
+                 __entry->result_len)
   );
   
   DEFINE_EVENT(ext4__mballoc, ext4_mballoc_discard,
   
-       TP_PROTO(struct ext4_allocation_context *ac),
+       TP_PROTO(struct super_block *sb,
+                struct inode *inode,
+                ext4_group_t group,
+                ext4_grpblk_t start,
+                ext4_grpblk_t len),
   
-       TP_ARGS(ac)
+       TP_ARGS(sb, inode, group, start, len)
   );
   
   DEFINE_EVENT(ext4__mballoc, ext4_mballoc_free,
   
-       TP_PROTO(struct ext4_allocation_context *ac),
+       TP_PROTO(struct super_block *sb,
+                struct inode *inode,
+                ext4_group_t group,
+                ext4_grpblk_t start,
+                ext4_grpblk_t len),
   
-       TP_ARGS(ac)
+       TP_ARGS(sb, inode, group, start, len)
   );
   
   TRACE_EVENT(ext4_forget,
@@@ -832,7 -900,8 +901,8 @@@
         TP_ARGS(inode, is_metadata, block),
   
         TP_STRUCT__entry(
-               __field(        dev_t,  dev                     )
+               __field(        int,   dev_major                )
+               __field(        int,   dev_minor                )
                 __field(        ino_t,  ino                     )
                 __field(        umode_t, mode                   )
                 __field(        int,    is_metadata             )
@@@ -840,16 -909,18 +910,18 @@@
         ),
   
         TP_fast_assign(
-               __entry->dev    = inode->i_sb->s_dev;
+               __entry->dev_major = MAJOR(inode->i_sb->s_dev);
+               __entry->dev_minor = MINOR(inode->i_sb->s_dev);
                 __entry->ino    = inode->i_ino;
                 __entry->mode   = inode->i_mode;
                 __entry->is_metadata = is_metadata;
                 __entry->block  = block;
         ),
   
-       TP_printk("dev %s ino %lu mode 0%o is_metadata %d block %llu",
-                 jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino,
-                 __entry->mode, __entry->is_metadata, __entry->block)
+       TP_printk("dev %d,%d ino %lu mode 0%o is_metadata %d block %llu",
+                 __entry->dev_major, __entry->dev_minor,
+                 (unsigned long) __entry->ino, __entry->mode,
+                 __entry->is_metadata, __entry->block)
   );
   
   TRACE_EVENT(ext4_da_update_reserve_space,
@@@ -858,7 -929,8 +930,8 @@@
         TP_ARGS(inode, used_blocks),
   
         TP_STRUCT__entry(
-               __field(        dev_t,  dev                     )
+               __field(        int,   dev_major                )
+               __field(        int,   dev_minor                )
                 __field(        ino_t,  ino                     )
                 __field(        umode_t, mode                   )
                 __field(        __u64,  i_blocks                )
@@@ -869,7 -941,8 +942,8 @@@
         ),
   
         TP_fast_assign(
-               __entry->dev    = inode->i_sb->s_dev;
+               __entry->dev_major = MAJOR(inode->i_sb->s_dev);
+               __entry->dev_minor = MINOR(inode->i_sb->s_dev);
                 __entry->ino    = inode->i_ino;
                 __entry->mode   = inode->i_mode;
                 __entry->i_blocks = inode->i_blocks;
@@@ -879,9 -952,10 +953,10 @@@
                 __entry->allocated_meta_blocks = EXT4_I(inode)->i_allocated_meta_blocks;
         ),
   
-       TP_printk("dev %s ino %lu mode 0%o i_blocks %llu used_blocks %d reserved_data_blocks %d reserved_meta_blocks %d allocated_meta_blocks %d",
-                 jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino,
-                 __entry->mode,  (unsigned long long) __entry->i_blocks,
+       TP_printk("dev %d,%d ino %lu mode 0%o i_blocks %llu used_blocks %d reserved_data_blocks %d reserved_meta_blocks %d allocated_meta_blocks %d",
+                 __entry->dev_major, __entry->dev_minor,
+                 (unsigned long) __entry->ino, __entry->mode,
+                 (unsigned long long) __entry->i_blocks,
                   __entry->used_blocks, __entry->reserved_data_blocks,
                   __entry->reserved_meta_blocks, __entry->allocated_meta_blocks)
   );
@@@ -892,7 -966,8 +967,8 @@@ TRACE_EVENT(ext4_da_reserve_space
         TP_ARGS(inode, md_needed),
   
         TP_STRUCT__entry(
-               __field(        dev_t,  dev                     )
+               __field(        int,   dev_major                )
+               __field(        int,   dev_minor                )
                 __field(        ino_t,  ino                     )
                 __field(        umode_t, mode                   )
                 __field(        __u64,  i_blocks                )
@@@ -902,7 -977,8 +978,8 @@@
         ),
   
         TP_fast_assign(
-               __entry->dev    = inode->i_sb->s_dev;
+               __entry->dev_major = MAJOR(inode->i_sb->s_dev);
+               __entry->dev_minor = MINOR(inode->i_sb->s_dev);
                 __entry->ino    = inode->i_ino;
                 __entry->mode   = inode->i_mode;
                 __entry->i_blocks = inode->i_blocks;
@@@ -911,8 -987,9 +988,9 @@@
                 __entry->reserved_meta_blocks = EXT4_I(inode)->i_reserved_meta_blocks;
         ),
   
-       TP_printk("dev %s ino %lu mode 0%o i_blocks %llu md_needed %d reserved_data_blocks %d reserved_meta_blocks %d",
-                 jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino,
+       TP_printk("dev %d,%d ino %lu mode 0%o i_blocks %llu md_needed %d reserved_data_blocks %d reserved_meta_blocks %d",
+                 __entry->dev_major, __entry->dev_minor,
+                 (unsigned long) __entry->ino,
                   __entry->mode, (unsigned long long) __entry->i_blocks,
                   __entry->md_needed, __entry->reserved_data_blocks,
                   __entry->reserved_meta_blocks)
@@@ -924,7 -1001,8 +1002,8 @@@ TRACE_EVENT(ext4_da_release_space
         TP_ARGS(inode, freed_blocks),
   
         TP_STRUCT__entry(
-               __field(        dev_t,  dev                     )
+               __field(        int,   dev_major                )
+               __field(        int,   dev_minor                )
                 __field(        ino_t,  ino                     )
                 __field(        umode_t, mode                   )
                 __field(        __u64,  i_blocks                )
@@@ -935,7 -1013,8 +1014,8 @@@
         ),
   
         TP_fast_assign(
-               __entry->dev    = inode->i_sb->s_dev;
+               __entry->dev_major = MAJOR(inode->i_sb->s_dev);
+               __entry->dev_minor = MINOR(inode->i_sb->s_dev);
                 __entry->ino    = inode->i_ino;
                 __entry->mode   = inode->i_mode;
                 __entry->i_blocks = inode->i_blocks;
@@@ -945,8 -1024,9 +1025,9 @@@
                 __entry->allocated_meta_blocks = EXT4_I(inode)->i_allocated_meta_blocks;
         ),
   
-       TP_printk("dev %s ino %lu mode 0%o i_blocks %llu freed_blocks %d reserved_data_blocks %d reserved_meta_blocks %d allocated_meta_blocks %d",
-                 jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino,
+       TP_printk("dev %d,%d ino %lu mode 0%o i_blocks %llu freed_blocks %d reserved_data_blocks %d reserved_meta_blocks %d allocated_meta_blocks %d",
+                 __entry->dev_major, __entry->dev_minor,
+                 (unsigned long) __entry->ino,
                   __entry->mode, (unsigned long long) __entry->i_blocks,
                   __entry->freed_blocks, __entry->reserved_data_blocks,
                   __entry->reserved_meta_blocks, __entry->allocated_meta_blocks)
@@@ -958,18 -1038,20 +1039,20 @@@ DECLARE_EVENT_CLASS(ext4__bitmap_load
         TP_ARGS(sb, group),
   
         TP_STRUCT__entry(
-               __field(        dev_t,  dev                     )
+               __field(        int,   dev_major                )
+               __field(        int,   dev_minor                )
                 __field(        __u32,  group                   )
   
         ),
   
         TP_fast_assign(
-               __entry->dev    = sb->s_dev;
+               __entry->dev_major = MAJOR(sb->s_dev);
+               __entry->dev_minor = MINOR(sb->s_dev);
                 __entry->group  = group;
         ),
   
-       TP_printk("dev %s group %u",
-                 jbd2_dev_to_name(__entry->dev), __entry->group)
+       TP_printk("dev %d,%d group %u",
+                 __entry->dev_major, __entry->dev_minor, __entry->group)
   );
   
   DEFINE_EVENT(ext4__bitmap_load, ext4_mb_bitmap_load,
author	Theodore Ts'o <tytso@mit.edu>
	Thu, 28 Oct 2010 03:44:47 +0000 (23:44 -0400)
committer	Theodore Ts'o <tytso@mit.edu>
	Thu, 28 Oct 2010 03:44:47 +0000 (23:44 -0400)
		1	2
fs/ext4/extents.c	patch \|	diff1 \|	diff2 \|	blob \| history
fs/ext4/fsync.c	patch \|	diff1 \|	diff2 \|	blob \| history
fs/ext4/ialloc.c	patch \|	diff1 \|	diff2 \|	blob \| history
fs/ext4/inode.c	patch \|	diff1 \|	diff2 \|	blob \| history
fs/ext4/mballoc.c	patch \|	diff1 \|	diff2 \|	blob \| history
fs/ext4/namei.c	patch \|	diff1 \|	diff2 \|	blob \| history
fs/ext4/resize.c	patch \|	diff1 \|	diff2 \|	blob \| history
fs/ext4/super.c	patch \|	diff1 \|	diff2 \|	blob \| history
fs/jbd2/checkpoint.c	patch \|	diff1 \|	diff2 \|	blob \| history
fs/jbd2/commit.c	patch \|	diff1 \|	diff2 \|	blob \| history
fs/jbd2/journal.c	patch \|	diff1 \|	diff2 \|	blob \| history
include/linux/blkdev.h	patch \|	diff1 \|	diff2 \|	blob \| history
include/linux/fs.h	patch \|	diff1 \|	diff2 \|	blob \| history
include/linux/writeback.h	patch \|	diff1 \|	diff2 \|	blob \| history
include/trace/events/ext4.h	patch \|	diff1 \|	diff2 \|	blob \| history