[net-next-2.6.git] / fs / reiserfs / file.c

/*
 * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
 */

#include <linux/time.h>
#include <linux/reiserfs_fs.h>
#include <linux/reiserfs_acl.h>
#include <linux/reiserfs_xattr.h>
#include <asm/uaccess.h>
#include <linux/pagemap.h>
#include <linux/swap.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/buffer_head.h>
#include <linux/quotaops.h>

/*
** We pack the tails of files on file close, not at the time they are written.
** This implies an unnecessary copy of the tail and an unnecessary indirect item
** insertion/balancing, for files that are written in one write.
** It avoids unnecessary tail packings (balances) for files that are written in
** multiple writes and are small enough to have tails.
**
** file_release is called by the VFS layer when the file is closed.  If
** this is the last open file descriptor, and the file is
** small enough to have a tail, and the tail is currently in an
** unformatted node, the tail is converted back into a direct item.
**
** We use reiserfs_truncate_file to pack the tail, since it already has
** all the conditions coded.
*/
static int reiserfs_file_release(struct inode *inode, struct file *filp)
{
	struct reiserfs_transaction_handle th;
	int begin_err = 0;	/* remembers a journal_begin() failure */
	int err = 0;

	BUG_ON(!S_ISREG(inode->i_mode));

	/*
	 * Drop our opener count unless it is exactly 1.  If the decrement
	 * happened, other openers remain and there is nothing more to do.
	 */
	if (atomic_add_unless(&REISERFS_I(inode)->openers, -1, 1))
		return 0;

	mutex_lock(&REISERFS_I(inode)->tailpack);

	/* Re-check under the mutex: only proceed if the count hits zero. */
	if (!atomic_dec_and_test(&REISERFS_I(inode)->openers))
		goto out_tailpack;

	/*
	 * Fast exit: no tail that needs packing and no preallocated
	 * blocks left to give back.
	 */
	if (!((REISERFS_I(inode)->i_flags & i_pack_on_close_mask) &&
	      tail_has_to_be_packed(inode)) &&
	    REISERFS_I(inode)->i_prealloc_count <= 0)
		goto out_tailpack;

	reiserfs_write_lock(inode->i_sb);

	/*
	 * Freeing preallocation only relogs blocks that are already part
	 * of the current transaction; preallocation is freed at the end of
	 * each transaction, so no additional blocks (quota blocks
	 * included) can end up being logged here.
	 */
	err = journal_begin(&th, inode->i_sb, 1);
	if (err) {
		/*
		 * The inode must not go away while preallocated blocks are
		 * still pending, so try to join the aborted transaction.
		 */
		begin_err = err;
		err = journal_join_abort(&th, inode->i_sb, 1);
	}
	if (err) {
		/*
		 * No good choice is left.  Doing nothing would corrupt
		 * random memory on unmount; forcibly dropping the file from
		 * the preallocation list would leak blocks on disk.  Pin
		 * the inode instead (which prevents unmount from ever
		 * happening) and tell the admin about it.
		 */
		igrab(inode);
		reiserfs_warning(inode->i_sb, "clm-9001",
				 "pinning inode %lu because the "
				 "preallocation can't be freed",
				 inode->i_ino);
		goto out_write_unlock;
	}

	reiserfs_update_inode_transaction(inode);

#ifdef REISERFS_PREALLOCATE
	reiserfs_discard_prealloc(&th, inode);
#endif
	err = journal_end(&th, inode->i_sb, 1);

	/* A clean journal_end() must not hide the journal_begin() error. */
	if (!err)
		err = begin_err;
	if (err)
		goto out_write_unlock;

	/*
	 * The last holder released a regular file that was appended to
	 * (appends go to unformatted nodes only) or whose direct item(s)
	 * had to be converted: it may need an indirect-to-direct
	 * conversion now.
	 */
	if ((REISERFS_I(inode)->i_flags & i_pack_on_close_mask) &&
	    tail_has_to_be_packed(inode))
		err = reiserfs_truncate_file(inode, 0);

out_write_unlock:
	reiserfs_write_unlock(inode->i_sb);
out_tailpack:
	mutex_unlock(&REISERFS_I(inode)->tailpack);
	return err;
}

/*
 * Open hook for regular files: run the generic/quota open checks via
 * dquot_file_open() and, on success, account for one more opener so that
 * reiserfs_file_release() can detect the final close.
 *
 * Returns 0 on success or the error reported by dquot_file_open().
 */
static int reiserfs_file_open(struct inode *inode, struct file *file)
{
	int err = dquot_file_open(inode, file);

	/*
	 * The VFS does not call ->release for a file whose ->open failed,
	 * so an opener count taken here would never be dropped and the
	 * final-close work in reiserfs_file_release() would be skipped for
	 * this inode from then on.  Only count opens that succeed.
	 */
	if (err)
		return err;

	if (!atomic_inc_not_zero(&REISERFS_I(inode)->openers)) {
		/* somebody might be tailpacking on final close; wait for it */
		mutex_lock(&REISERFS_I(inode)->tailpack);
		atomic_inc(&REISERFS_I(inode)->openers);
		mutex_unlock(&REISERFS_I(inode)->tailpack);
	}
	return 0;
}

static void reiserfs_vfs_truncate_file(struct inode *inode)
{
	/*
	 * The tailpack mutex is also held around the truncate call made on
	 * final close in reiserfs_file_release(), so taking it here keeps
	 * the two truncate paths from running at the same time.
	 */
	mutex_lock(&REISERFS_I(inode)->tailpack);
	reiserfs_truncate_file(inode, 1);
	mutex_unlock(&REISERFS_I(inode)->tailpack);
}

/* Sync a reiserfs file. */

/*
 * FIXME: sync_mapping_buffers() never has anything to sync.  Can
 * be removed...
 */

static int reiserfs_sync_file(struct file *filp, int datasync)
{
	struct inode *inode = filp->f_mapping->host;
	int err;
	int barrier_done;

	BUG_ON(!S_ISREG(inode->i_mode));
	err = sync_mapping_buffers(inode->i_mapping);
	reiserfs_write_lock(inode->i_sb);
	barrier_done = reiserfs_commit_for_inode(inode);
	reiserfs_write_unlock(inode->i_sb);
	if (barrier_done != 1 && reiserfs_barrier_flush(inode->i_sb))
		blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
	if (barrier_done < 0)
		return barrier_done;
	return (err < 0) ? -EIO : 0;
}

/* taken from fs/buffer.c:__block_commit_write */
int reiserfs_commit_page(struct inode *inode, struct page *page,
			 unsigned from, unsigned to)
{
	struct reiserfs_transaction_handle th;
	struct buffer_head *head, *bh;
	unsigned long i_size_index = inode->i_size >> PAGE_CACHE_SHIFT;
	int logit = reiserfs_file_data_log(inode);
	struct super_block *s = inode->i_sb;
	int bh_per_page = PAGE_CACHE_SIZE / s->s_blocksize;
	unsigned blocksize = 1 << inode->i_blkbits;
	unsigned block_start = 0;
	unsigned block_end;
	int partial = 0;	/* set when a buffer outside [from, to) is stale */
	int was_new;
	int ret = 0;

	th.t_trans_id = 0;

	/*
	 * When file data is journaled, the whole walk runs inside one
	 * transaction sized for every buffer of the page plus one block.
	 */
	if (logit) {
		reiserfs_write_lock(s);
		ret = journal_begin(&th, s, bh_per_page + 1);
		if (ret) {
			reiserfs_write_unlock(s);
			goto out;
		}
		reiserfs_update_inode_transaction(inode);
	}

	/* Walk the page's circular buffer list exactly once. */
	head = page_buffers(page);
	bh = head;
	do {
		was_new = buffer_new(bh);
		clear_buffer_new(bh);
		block_end = block_start + blocksize;

		if (block_end > from && block_start < to) {
			/* Buffer overlaps the written byte range. */
			set_buffer_uptodate(bh);
			if (logit) {
				reiserfs_prepare_for_journal(s, bh, 1);
				journal_mark_dirty(&th, s, bh);
			} else if (!buffer_dirty(bh)) {
				mark_buffer_dirty(bh);
				/*
				 * data=ordered applies to any page past the
				 * end of file and to any buffer that was
				 * marked BH_New.
				 */
				if (reiserfs_data_ordered(s) &&
				    (was_new || page->index >= i_size_index))
					reiserfs_add_ordered_list(inode, bh);
			}
		} else if (!buffer_uptodate(bh)) {
			/* Untouched buffer that still holds no valid data. */
			partial = 1;
		}

		block_start = block_end;
		bh = bh->b_this_page;
	} while (bh != head);

	if (logit) {
		ret = journal_end(&th, s, bh_per_page + 1);
		reiserfs_write_unlock(s);
	}

out:
	/*
	 * If this (possibly partial) write left every buffer uptodate, the
	 * page as a whole is uptodate and the next read() can skip a
	 * pointless readpage().
	 *
	 * NOTE(review): when journal_begin() fails the buffers are never
	 * examined, so 'partial' is still 0 and the page is marked
	 * uptodate anyway.  Kept as in the original; confirm that this is
	 * intended.
	 */
	if (!partial)
		SetPageUptodate(page);
	return ret;
}

/* Write @count bytes at position @ppos in a file indicated by @file
   from the buffer @buf.

   generic_file_write() is only appropriate for filesystems that are not seeking to optimize performance and want
   something simple that works.  It is not for serious use by general purpose filesystems, excepting the one that it was
   written for (ext2/3).  This is for several reasons:

   * It has no understanding of any filesystem specific optimizations.

   * It enters the filesystem repeatedly for each page that is written.

   * It depends on reiserfs_get_block() function which if implemented by reiserfs performs costly search_by_key
   * operation for each page it is supplied with. By contrast reiserfs_file_write() feeds as much as possible at a time
   * to reiserfs which allows for fewer tree traversals.

   * Each indirect pointer insertion takes a lot of cpu, because it involves memory moves inside of blocks.

   * Asking the block allocation code for blocks one at a time is slightly less efficient.

   All of these reasons for not using only generic file write were understood back when reiserfs was first miscoded to
   use it, but we were in a hurry to make code freeze, and so it couldn't be revised then.  This new code should make
   things right finally.

   Future Features: providing search_by_key with hints.

*/
static ssize_t reiserfs_file_write(struct file *file,	/* the file we are going to write into */
				   const char __user * buf,	/* pointer to user supplied data
								   (in userspace) */
				   size_t count,	/* amount of bytes to write */
				   loff_t * ppos	/* pointer to position in file that we start writing at. Should be updated to
							 * new current position before returning. */
				   )
{
	/* Inode of the file that we are writing to. */
	struct inode *inode = file->f_path.dentry->d_inode;

	/*
	 * If a filesystem is converted from 3.5 to 3.6, we'll have v3.5
	 * items lying around (most of the disk, in fact).  Despite the
	 * filesystem now being a v3.6 format, the old items still can't
	 * support large file sizes.  Catch this case here, as the rest of
	 * the VFS layer is oblivious to the different limitations between
	 * old and new items.  reiserfs_setattr catches this for truncates.
	 * This chunk is lifted from generic_write_checks.
	 */
	if (get_inode_item_key_version(inode) == KEY_FORMAT_3_5 &&
	    *ppos + count > MAX_NON_LFS) {
		/* Starting at or beyond the limit: nothing can be written. */
		if (*ppos >= MAX_NON_LFS)
			return -EFBIG;

		/* Otherwise shorten the write so it ends at the limit. */
		if (count > MAX_NON_LFS - (unsigned long)*ppos)
			count = MAX_NON_LFS - (unsigned long)*ppos;
	}

	/* The write itself is delegated to do_sync_write(). */
	return do_sync_write(file, buf, count, ppos);
}

const struct file_operations reiserfs_file_operations = {
	/* file position */
	.llseek = generic_file_llseek,

	/* read side */
	.read = do_sync_read,
	.aio_read = generic_file_aio_read,
	.splice_read = generic_file_splice_read,

	/* write side; .write applies the v3.5 item size limit first */
	.write = reiserfs_file_write,
	.aio_write = generic_file_aio_write,
	.splice_write = generic_file_splice_write,

	.mmap = generic_file_mmap,

	/* ioctl entry points */
	.unlocked_ioctl = reiserfs_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl = reiserfs_compat_ioctl,
#endif

	/* open/close accounting (tail packing on final close) and fsync */
	.open = reiserfs_file_open,
	.release = reiserfs_file_release,
	.fsync = reiserfs_sync_file,
};

const struct inode_operations reiserfs_file_inode_operations = {
	/* size and attribute changes; truncate runs under the tailpack mutex */
	.truncate = reiserfs_vfs_truncate_file,
	.setattr = reiserfs_setattr,
	.permission = reiserfs_permission,

	/* extended attributes */
	.getxattr = reiserfs_getxattr,
	.setxattr = reiserfs_setxattr,
	.listxattr = reiserfs_listxattr,
	.removexattr = reiserfs_removexattr,
};
Commit	Line	Data
	1	/*
	2	* Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
	3	*/
	4
	5	#include <linux/time.h>
	6	#include <linux/reiserfs_fs.h>
	7	#include <linux/reiserfs_acl.h>
	8	#include <linux/reiserfs_xattr.h>
	9	#include <asm/uaccess.h>
	10	#include <linux/pagemap.h>
	11	#include <linux/swap.h>
	12	#include <linux/writeback.h>
	13	#include <linux/blkdev.h>
	14	#include <linux/buffer_head.h>
	15	#include <linux/quotaops.h>
	16
	17	/*
	18	** We pack the tails of files on file close, not at the time they are written.
	19	** This implies an unnecessary copy of the tail and an unnecessary indirect item
	20	** insertion/balancing, for files that are written in one write.
	21	** It avoids unnecessary tail packings (balances) for files that are written in
	22	** multiple writes and are small enough to have tails.
	23	**
	24	** file_release is called by the VFS layer when the file is closed. If
	25	** this is the last open file descriptor, and the file
	26	** small enough to have a tail, and the tail is currently in an
	27	** unformatted node, the tail is converted back into a direct item.
	28	**
	29	** We use reiserfs_truncate_file to pack the tail, since it already has
	30	** all the conditions coded.
	31	*/
	32	static int reiserfs_file_release(struct inode inode, struct file filp)
	33	{
	34
	35	struct reiserfs_transaction_handle th;
	36	int err;
	37	int jbegin_failure = 0;
	38
	39	BUG_ON(!S_ISREG(inode->i_mode));
	40
	41	if (atomic_add_unless(&REISERFS_I(inode)->openers, -1, 1))
	42	return 0;
	43
	44	mutex_lock(&(REISERFS_I(inode)->tailpack));
	45
	46	if (!atomic_dec_and_test(&REISERFS_I(inode)->openers)) {
	47	mutex_unlock(&(REISERFS_I(inode)->tailpack));
	48	return 0;
	49	}
	50
	51	/* fast out for when nothing needs to be done */
	52	if ((!(REISERFS_I(inode)->i_flags & i_pack_on_close_mask) \|\|
	53	!tail_has_to_be_packed(inode)) &&
	54	REISERFS_I(inode)->i_prealloc_count <= 0) {
	55	mutex_unlock(&(REISERFS_I(inode)->tailpack));
	56	return 0;
	57	}
	58
	59	reiserfs_write_lock(inode->i_sb);
	60	/* freeing preallocation only involves relogging blocks that
	61	* are already in the current transaction. preallocation gets
	62	* freed at the end of each transaction, so it is impossible for
	63	* us to log any additional blocks (including quota blocks)
	64	*/
	65	err = journal_begin(&th, inode->i_sb, 1);
	66	if (err) {
	67	/* uh oh, we can't allow the inode to go away while there
	68	* is still preallocation blocks pending. Try to join the
	69	* aborted transaction
	70	*/
	71	jbegin_failure = err;
	72	err = journal_join_abort(&th, inode->i_sb, 1);
	73
	74	if (err) {
	75	/* hmpf, our choices here aren't good. We can pin the inode
	76	* which will disallow unmount from every happening, we can
	77	* do nothing, which will corrupt random memory on unmount,
	78	* or we can forcibly remove the file from the preallocation
	79	* list, which will leak blocks on disk. Lets pin the inode
	80	* and let the admin know what is going on.
	81	*/
	82	igrab(inode);
	83	reiserfs_warning(inode->i_sb, "clm-9001",
	84	"pinning inode %lu because the "
	85	"preallocation can't be freed",
	86	inode->i_ino);
	87	goto out;
	88	}
	89	}
	90	reiserfs_update_inode_transaction(inode);
	91
	92	#ifdef REISERFS_PREALLOCATE
	93	reiserfs_discard_prealloc(&th, inode);
	94	#endif
	95	err = journal_end(&th, inode->i_sb, 1);
	96
	97	/* copy back the error code from journal_begin */
	98	if (!err)
	99	err = jbegin_failure;
	100
	101	if (!err &&
	102	(REISERFS_I(inode)->i_flags & i_pack_on_close_mask) &&
	103	tail_has_to_be_packed(inode)) {
	104
	105	/* if regular file is released by last holder and it has been
	106	appended (we append by unformatted node only) or its direct
	107	item(s) had to be converted, then it may have to be
	108	indirect2direct converted */
	109	err = reiserfs_truncate_file(inode, 0);
	110	}
	111	out:
	112	reiserfs_write_unlock(inode->i_sb);
	113	mutex_unlock(&(REISERFS_I(inode)->tailpack));
	114	return err;
	115	}
	116
	117	static int reiserfs_file_open(struct inode inode, struct file file)
	118	{
	119	int err = dquot_file_open(inode, file);
	120	if (!atomic_inc_not_zero(&REISERFS_I(inode)->openers)) {
	121	/* somebody might be tailpacking on final close; wait for it */
	122	mutex_lock(&(REISERFS_I(inode)->tailpack));
	123	atomic_inc(&REISERFS_I(inode)->openers);
	124	mutex_unlock(&(REISERFS_I(inode)->tailpack));
	125	}
	126	return err;
	127	}
	128
	129	static void reiserfs_vfs_truncate_file(struct inode *inode)
	130	{
	131	mutex_lock(&(REISERFS_I(inode)->tailpack));
	132	reiserfs_truncate_file(inode, 1);
	133	mutex_unlock(&(REISERFS_I(inode)->tailpack));
	134	}
	135
	136	/* Sync a reiserfs file. */
	137
	138	/*
	139	* FIXME: sync_mapping_buffers() never has anything to sync. Can
	140	* be removed...
	141	*/
	142
	143	static int reiserfs_sync_file(struct file *filp, int datasync)
	144	{
	145	struct inode *inode = filp->f_mapping->host;
	146	int err;
	147	int barrier_done;
	148
	149	BUG_ON(!S_ISREG(inode->i_mode));
	150	err = sync_mapping_buffers(inode->i_mapping);
	151	reiserfs_write_lock(inode->i_sb);
	152	barrier_done = reiserfs_commit_for_inode(inode);
	153	reiserfs_write_unlock(inode->i_sb);
	154	if (barrier_done != 1 && reiserfs_barrier_flush(inode->i_sb))
	155	blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
	156	if (barrier_done < 0)
	157	return barrier_done;
	158	return (err < 0) ? -EIO : 0;
	159	}
	160
	161	/* taken fs/buffer.c:__block_commit_write */
	162	int reiserfs_commit_page(struct inode inode, struct page page,
	163	unsigned from, unsigned to)
	164	{
	165	unsigned block_start, block_end;
	166	int partial = 0;
	167	unsigned blocksize;
	168	struct buffer_head bh, head;
	169	unsigned long i_size_index = inode->i_size >> PAGE_CACHE_SHIFT;
	170	int new;
	171	int logit = reiserfs_file_data_log(inode);
	172	struct super_block *s = inode->i_sb;
	173	int bh_per_page = PAGE_CACHE_SIZE / s->s_blocksize;
	174	struct reiserfs_transaction_handle th;
	175	int ret = 0;
	176
	177	th.t_trans_id = 0;
	178	blocksize = 1 << inode->i_blkbits;
	179
	180	if (logit) {
	181	reiserfs_write_lock(s);
	182	ret = journal_begin(&th, s, bh_per_page + 1);
	183	if (ret)
	184	goto drop_write_lock;
	185	reiserfs_update_inode_transaction(inode);
	186	}
	187	for (bh = head = page_buffers(page), block_start = 0;
	188	bh != head \|\| !block_start;
	189	block_start = block_end, bh = bh->b_this_page) {
	190
	191	new = buffer_new(bh);
	192	clear_buffer_new(bh);
	193	block_end = block_start + blocksize;
	194	if (block_end <= from \|\| block_start >= to) {
	195	if (!buffer_uptodate(bh))
	196	partial = 1;
	197	} else {
	198	set_buffer_uptodate(bh);
	199	if (logit) {
	200	reiserfs_prepare_for_journal(s, bh, 1);
	201	journal_mark_dirty(&th, s, bh);
	202	} else if (!buffer_dirty(bh)) {
	203	mark_buffer_dirty(bh);
	204	/* do data=ordered on any page past the end
	205	* of file and any buffer marked BH_New.
	206	*/
	207	if (reiserfs_data_ordered(inode->i_sb) &&
	208	(new \|\| page->index >= i_size_index)) {
	209	reiserfs_add_ordered_list(inode, bh);
	210	}
	211	}
	212	}
	213	}
	214	if (logit) {
	215	ret = journal_end(&th, s, bh_per_page + 1);
	216	drop_write_lock:
	217	reiserfs_write_unlock(s);
	218	}
	219	/*
	220	* If this is a partial write which happened to make all buffers
	221	* uptodate then we can optimize away a bogus readpage() for
	222	* the next read(). Here we 'discover' whether the page went
	223	* uptodate as a result of this (potentially partial) write.
	224	*/
	225	if (!partial)
	226	SetPageUptodate(page);
	227	return ret;
	228	}
	229
	230	/* Write @count bytes at position @ppos in a file indicated by @file
	231	from the buffer @buf.
	232
	233	generic_file_write() is only appropriate for filesystems that are not seeking to optimize performance and want
	234	something simple that works. It is not for serious use by general purpose filesystems, excepting the one that it was
	235	written for (ext2/3). This is for several reasons:
	236
	237	* It has no understanding of any filesystem specific optimizations.
	238
	239	* It enters the filesystem repeatedly for each page that is written.
	240
	241	* It depends on reiserfs_get_block() function which if implemented by reiserfs performs costly search_by_key
	242	* operation for each page it is supplied with. By contrast reiserfs_file_write() feeds as much as possible at a time
	243	* to reiserfs which allows for fewer tree traversals.
	244
	245	* Each indirect pointer insertion takes a lot of cpu, because it involves memory moves inside of blocks.
	246
	247	* Asking the block allocation code for blocks one at a time is slightly less efficient.
	248
	249	All of these reasons for not using only generic file write were understood back when reiserfs was first miscoded to
	250	use it, but we were in a hurry to make code freeze, and so it couldn't be revised then. This new code should make
	251	things right finally.
	252
	253	Future Features: providing search_by_key with hints.
	254
	255	*/
	256	static ssize_t reiserfs_file_write(struct file file, / the file we are going to write into */
	257	const char __user * buf, /* pointer to user supplied data
	258	(in userspace) */
	259	size_t count, /* amount of bytes to write */
	260	loff_t * ppos /* pointer to position in file that we start writing at. Should be updated to
	261	* new current position before returning. */
	262	)
	263	{
	264	struct inode *inode = file->f_path.dentry->d_inode; // Inode of the file that we are writing to.
	265	/* To simplify coding at this time, we store
	266	locked pages in array for now */
	267	struct reiserfs_transaction_handle th;
	268	th.t_trans_id = 0;
	269
	270	/* If a filesystem is converted from 3.5 to 3.6, we'll have v3.5 items
	271	* lying around (most of the disk, in fact). Despite the filesystem
	272	* now being a v3.6 format, the old items still can't support large
	273	* file sizes. Catch this case here, as the rest of the VFS layer is
	274	* oblivious to the different limitations between old and new items.
	275	* reiserfs_setattr catches this for truncates. This chunk is lifted
	276	* from generic_write_checks. */
	277	if (get_inode_item_key_version (inode) == KEY_FORMAT_3_5 &&
	278	*ppos + count > MAX_NON_LFS) {
	279	if (*ppos >= MAX_NON_LFS) {
	280	return -EFBIG;
	281	}
	282	if (count > MAX_NON_LFS - (unsigned long)*ppos)
	283	count = MAX_NON_LFS - (unsigned long)*ppos;
	284	}
	285
	286	return do_sync_write(file, buf, count, ppos);
	287	}
	288
	289	const struct file_operations reiserfs_file_operations = {
	290	.read = do_sync_read,
	291	.write = reiserfs_file_write,
	292	.unlocked_ioctl = reiserfs_ioctl,
	293	#ifdef CONFIG_COMPAT
	294	.compat_ioctl = reiserfs_compat_ioctl,
	295	#endif
	296	.mmap = generic_file_mmap,
	297	.open = reiserfs_file_open,
	298	.release = reiserfs_file_release,
	299	.fsync = reiserfs_sync_file,
	300	.aio_read = generic_file_aio_read,
	301	.aio_write = generic_file_aio_write,
	302	.splice_read = generic_file_splice_read,
	303	.splice_write = generic_file_splice_write,
	304	.llseek = generic_file_llseek,
	305	};
	306
	307	const struct inode_operations reiserfs_file_inode_operations = {
	308	.truncate = reiserfs_vfs_truncate_file,
	309	.setattr = reiserfs_setattr,
	310	.setxattr = reiserfs_setxattr,
	311	.getxattr = reiserfs_getxattr,
	312	.listxattr = reiserfs_listxattr,
	313	.removexattr = reiserfs_removexattr,
	314	.permission = reiserfs_permission,
	315	};