struct extent_buffer *eb;
int level;
int ret;
- u64 refs;
+ u64 refs = 1;
for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
if (!path->nodes[level])
btrfs_set_trans_block_group(trans, inode);
ret = btrfs_update_inode(trans, root, inode);
- if (ret)
- printk(KERN_ERR"btrfs: fail to dirty inode %lu error %d\n",
- inode->i_ino, ret);
+ if (ret && ret == -ENOSPC) {
+ /* whoops, lets try again with the full transaction */
+ btrfs_end_transaction(trans, root);
+ trans = btrfs_start_transaction(root, 1);
+ if (IS_ERR(trans)) {
+ if (printk_ratelimit()) {
+ printk(KERN_ERR "btrfs: fail to "
+ "dirty inode %lu error %ld\n",
+ inode->i_ino, PTR_ERR(trans));
+ }
+ return;
+ }
+ btrfs_set_trans_block_group(trans, inode);
+ ret = btrfs_update_inode(trans, root, inode);
+ if (ret) {
+ if (printk_ratelimit()) {
+ printk(KERN_ERR "btrfs: fail to "
+ "dirty inode %lu error %d\n",
+ inode->i_ino, ret);
+ }
+ }
+ }
btrfs_end_transaction(trans, root);
}
if (ret != 0)
goto fail;
- inode->i_uid = current_fsuid();
-
- if (dir && (dir->i_mode & S_ISGID)) {
- inode->i_gid = dir->i_gid;
- if (S_ISDIR(mode))
- mode |= S_ISGID;
- } else
- inode->i_gid = current_fsgid();
-
- inode->i_mode = mode;
+ inode_init_owner(inode, dir, mode);
inode->i_ino = objectid;
inode_set_bytes(inode, 0);
inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
return em;
}
+/*
+ * returns 1 when the nocow is safe, < 1 on error, 0 if the
+ * block must be cow'd
+ */
+static noinline int can_nocow_odirect(struct btrfs_trans_handle *trans,
+ struct inode *inode, u64 offset, u64 len)
+{
+ struct btrfs_path *path;
+ int ret;
+ struct extent_buffer *leaf;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_file_extent_item *fi;
+ struct btrfs_key key;
+ u64 disk_bytenr;
+ u64 backref_offset;
+ u64 extent_end;
+ u64 num_bytes;
+ int slot;
+ int found_type;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ ret = btrfs_lookup_file_extent(trans, root, path, inode->i_ino,
+ offset, 0);
+ if (ret < 0)
+ goto out;
+
+ slot = path->slots[0];
+ if (ret == 1) {
+ if (slot == 0) {
+ /* can't find the item, must cow */
+ ret = 0;
+ goto out;
+ }
+ slot--;
+ }
+ ret = 0;
+ leaf = path->nodes[0];
+ btrfs_item_key_to_cpu(leaf, &key, slot);
+ if (key.objectid != inode->i_ino ||
+ key.type != BTRFS_EXTENT_DATA_KEY) {
+ /* not our file or wrong item type, must cow */
+ goto out;
+ }
+
+ if (key.offset > offset) {
+ /* Wrong offset, must cow */
+ goto out;
+ }
+
+ fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
+ found_type = btrfs_file_extent_type(leaf, fi);
+ if (found_type != BTRFS_FILE_EXTENT_REG &&
+ found_type != BTRFS_FILE_EXTENT_PREALLOC) {
+ /* not a regular extent, must cow */
+ goto out;
+ }
+ disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
+ backref_offset = btrfs_file_extent_offset(leaf, fi);
+
+ extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
+ if (extent_end < offset + len) {
+ /* extent doesn't include our full range, must cow */
+ goto out;
+ }
+
+ if (btrfs_extent_readonly(root, disk_bytenr))
+ goto out;
+
+ /*
+ * look for other files referencing this extent, if we
+ * find any we must cow
+ */
+ if (btrfs_cross_ref_exist(trans, root, inode->i_ino,
+ key.offset - backref_offset, disk_bytenr))
+ goto out;
+
+ /*
+ * adjust disk_bytenr and num_bytes to cover just the bytes
+ * in this extent we are about to write. If there
+ * are any csums in that range we have to cow in order
+ * to keep the csums correct
+ */
+ disk_bytenr += backref_offset;
+ disk_bytenr += offset - key.offset;
+ num_bytes = min(offset + len, extent_end) - offset;
+ if (csum_exist_in_range(root, disk_bytenr, num_bytes))
+ goto out;
+ /*
+ * all of the above have passed, it is safe to overwrite this extent
+ * without cow
+ */
+ ret = 1;
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
struct buffer_head *bh_result, int create)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
u64 start = iblock << inode->i_blkbits;
u64 len = bh_result->b_size;
+ struct btrfs_trans_handle *trans;
em = btrfs_get_extent(inode, NULL, 0, start, len, 0);
if (IS_ERR(em))
* just use the extent.
*
*/
- if (!create)
+ if (!create) {
+ len = em->len - (start - em->start);
goto map;
+ }
if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) ||
((BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
em->block_start != EXTENT_MAP_HOLE)) {
- u64 block_start;
int type;
int ret;
+ u64 block_start;
if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
type = BTRFS_ORDERED_PREALLOC;
else
type = BTRFS_ORDERED_NOCOW;
- len = min(len, em->block_len - (start - em->start));
+ len = min(len, em->len - (start - em->start));
block_start = em->block_start + (start - em->start);
- ret = btrfs_add_ordered_extent_dio(inode, start,
- start, len, len, type);
- if (ret) {
- free_extent_map(em);
- return ret;
+
+ /*
+ * we're not going to log anything, but we do need
+ * to make sure the current transaction stays open
+ * while we look for nocow cross refs
+ */
+ trans = btrfs_join_transaction(root, 0);
+ if (!trans)
+ goto must_cow;
+
+ if (can_nocow_odirect(trans, inode, start, len) == 1) {
+ ret = btrfs_add_ordered_extent_dio(inode, start,
+ block_start, len, len, type);
+ btrfs_end_transaction(trans, root);
+ if (ret) {
+ free_extent_map(em);
+ return ret;
+ }
+ goto unlock;
}
- } else {
- free_extent_map(em);
- em = btrfs_new_extent_direct(inode, start, len);
- if (IS_ERR(em))
- return PTR_ERR(em);
- len = min(len, em->block_len);
+ btrfs_end_transaction(trans, root);
}
- unlock_extent(&BTRFS_I(inode)->io_tree, start, start + len - 1,
- GFP_NOFS);
+must_cow:
+ /*
+ * this will cow the extent, reset the len in case we changed
+ * it above
+ */
+ len = bh_result->b_size;
+ free_extent_map(em);
+ em = btrfs_new_extent_direct(inode, start, len);
+ if (IS_ERR(em))
+ return PTR_ERR(em);
+ len = min(len, em->len - (start - em->start));
+unlock:
+ clear_extent_bit(&BTRFS_I(inode)->io_tree, start, start + len - 1,
+ EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DIRTY, 1,
+ 0, NULL, GFP_NOFS);
map:
bh_result->b_blocknr = (em->block_start + (start - em->start)) >>
inode->i_blkbits;
- bh_result->b_size = em->len - (start - em->start);
+ bh_result->b_size = len;
bh_result->b_bdev = em->bdev;
set_buffer_mapped(bh_result);
if (create && !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
bvec++;
} while (bvec <= (bio->bi_io_vec + bio->bi_vcnt - 1));
- dip->disk_bytenr = bio->bi_sector << 9;
+ dip->disk_bytenr = (u64)bio->bi_sector << 9;
bio->bi_private = dip;
if (write)
bio_endio(bio, ret);
}
+static ssize_t check_direct_IO(struct btrfs_root *root, int rw, struct kiocb *iocb,
+ const struct iovec *iov, loff_t offset,
+ unsigned long nr_segs)
+{
+ int seg;
+ size_t size;
+ unsigned long addr;
+ unsigned blocksize_mask = root->sectorsize - 1;
+ ssize_t retval = -EINVAL;
+ loff_t end = offset;
+
+ if (offset & blocksize_mask)
+ goto out;
+
+ /* Check the memory alignment. Blocks cannot straddle pages */
+ for (seg = 0; seg < nr_segs; seg++) {
+ addr = (unsigned long)iov[seg].iov_base;
+ size = iov[seg].iov_len;
+ end += size;
+ if ((addr & blocksize_mask) || (size & blocksize_mask))
+ goto out;
+ }
+ retval = 0;
+out:
+ return retval;
+}
static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
const struct iovec *iov, loff_t offset,
unsigned long nr_segs)
struct file *file = iocb->ki_filp;
struct inode *inode = file->f_mapping->host;
struct btrfs_ordered_extent *ordered;
+ struct extent_state *cached_state = NULL;
u64 lockstart, lockend;
ssize_t ret;
+ int writing = rw & WRITE;
+ int write_bits = 0;
+ size_t count = iov_length(iov, nr_segs);
+
+ if (check_direct_IO(BTRFS_I(inode)->root, rw, iocb, iov,
+ offset, nr_segs)) {
+ return 0;
+ }
lockstart = offset;
- lockend = offset + iov_length(iov, nr_segs) - 1;
+ lockend = offset + count - 1;
+
+ if (writing) {
+ ret = btrfs_delalloc_reserve_space(inode, count);
+ if (ret)
+ goto out;
+ }
+
while (1) {
- lock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
- GFP_NOFS);
+ lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+ 0, &cached_state, GFP_NOFS);
/*
* We're concerned with the entire range that we're going to be
* doing DIO to, so we need to make sure theres no ordered
lockend - lockstart + 1);
if (!ordered)
break;
- unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
- GFP_NOFS);
+ unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+ &cached_state, GFP_NOFS);
btrfs_start_ordered_extent(inode, ordered, 1);
btrfs_put_ordered_extent(ordered);
cond_resched();
}
- ret = __blockdev_direct_IO(rw, iocb, inode, NULL, iov, offset, nr_segs,
- btrfs_get_blocks_direct, NULL,
- btrfs_submit_direct, 0);
+ /*
+ * we don't use btrfs_set_extent_delalloc because we don't want
+ * the dirty or uptodate bits
+ */
+ if (writing) {
+ write_bits = EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING;
+ ret = set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+ EXTENT_DELALLOC, 0, NULL, &cached_state,
+ GFP_NOFS);
+ if (ret) {
+ clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
+ lockend, EXTENT_LOCKED | write_bits,
+ 1, 0, &cached_state, GFP_NOFS);
+ goto out;
+ }
+ }
+
+ free_extent_state(cached_state);
+ cached_state = NULL;
+
+ ret = __blockdev_direct_IO(rw, iocb, inode,
+ BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev,
+ iov, offset, nr_segs, btrfs_get_blocks_direct, NULL,
+ btrfs_submit_direct, 0);
if (ret < 0 && ret != -EIOCBQUEUED) {
- unlock_extent(&BTRFS_I(inode)->io_tree, offset,
- offset + iov_length(iov, nr_segs) - 1, GFP_NOFS);
+ clear_extent_bit(&BTRFS_I(inode)->io_tree, offset,
+ offset + iov_length(iov, nr_segs) - 1,
+ EXTENT_LOCKED | write_bits, 1, 0,
+ &cached_state, GFP_NOFS);
} else if (ret >= 0 && ret < iov_length(iov, nr_segs)) {
/*
* We're falling back to buffered, unlock the section we didn't
* do IO on.
*/
- unlock_extent(&BTRFS_I(inode)->io_tree, offset + ret,
- offset + iov_length(iov, nr_segs) - 1, GFP_NOFS);
+ clear_extent_bit(&BTRFS_I(inode)->io_tree, offset + ret,
+ offset + iov_length(iov, nr_segs) - 1,
+ EXTENT_LOCKED | write_bits, 1, 0,
+ &cached_state, GFP_NOFS);
}
-
+out:
+ free_extent_state(cached_state);
return ret;
}
if (em->block_start == EXTENT_MAP_HOLE ||
(cur_offset >= inode->i_size &&
!test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
- ret = btrfs_prealloc_file_range(inode, 0, cur_offset,
+ ret = btrfs_prealloc_file_range(inode, mode, cur_offset,
last_byte - cur_offset,
1 << inode->i_blkbits,
offset + len,