[net-next-2.6.git] / fs / xfs / linux-2.6 / xfs_sync.c

/*
 * Copyright (c) 2000-2005 Silicon Graphics, Inc.
 * All Rights Reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it would be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write the Free Software Foundation,
 * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_types.h"
#include "xfs_bit.h"
#include "xfs_log.h"
#include "xfs_inum.h"
#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
#include "xfs_dir2.h"
#include "xfs_dmapi.h"
#include "xfs_mount.h"
#include "xfs_bmap_btree.h"
#include "xfs_alloc_btree.h"
#include "xfs_ialloc_btree.h"
#include "xfs_btree.h"
#include "xfs_dir2_sf.h"
#include "xfs_attr_sf.h"
#include "xfs_inode.h"
#include "xfs_dinode.h"
#include "xfs_error.h"
#include "xfs_mru_cache.h"
#include "xfs_filestream.h"
#include "xfs_vnodeops.h"
#include "xfs_utils.h"
#include "xfs_buf_item.h"
#include "xfs_inode_item.h"
#include "xfs_rw.h"

#include <linux/kthread.h>
#include <linux/freezer.h>

/*
 * xfs_sync flushes any pending I/O to the file system described by mp.
 *
 * This routine is called by vfs_sync() to make sure that things make it
 * out to disk eventually, on sync() system calls to flush out everything,
 * and when the file system is unmounted.  For the vfs_sync() case, all
 * we really need to do is sync out the log to make all of our meta-data
 * updates permanent (except for timestamps).  For calls from pflushd(),
 * dirty pages are kept moving by calling pdflush() on the inodes
 * containing them.  We also flush the inodes that we can lock without
 * sleeping and the superblock if we can lock it without sleeping from
 * vfs_sync() so that items at the tail of the log are always moving out.
 *
 * Flags:
 *      SYNC_BDFLUSH - We're being called from vfs_sync() so we don't want
 *		       to sleep if we can help it.  All we really need
 *		       to do is ensure that the log is synced at least
 *		       periodically.  We also push the inodes and
 *		       superblock if we can lock them without sleeping
 *			and they are not pinned.
 *      SYNC_ATTR    - We need to flush the inodes.  If SYNC_BDFLUSH is not
 *		       set, then we really want to lock each inode and flush
 *		       it.
 *      SYNC_WAIT    - All the flushes that take place in this call should
 *		       be synchronous.
 *      SYNC_DELWRI  - This tells us to push dirty pages associated with
 *		       inodes.  SYNC_WAIT and SYNC_BDFLUSH are used to
 *		       determine if they should be flushed sync, async, or
 *		       delwri.
 *      SYNC_CLOSE   - This flag is passed when the system is being
 *		       unmounted.  We should sync and invalidate everything.
 *      SYNC_FSDATA  - This indicates that the caller would like to make
 *		       sure the superblock is safe on disk.  We can ensure
 *		       this by simply making sure the log gets flushed
 *		       if SYNC_BDFLUSH is set, and by actually writing it
 *		       out otherwise.
 *	SYNC_IOWAIT  - The caller wants us to wait for all data I/O to complete
 *		       before we return (including direct I/O). Forms the drain
 *		       side of the write barrier needed to safely quiesce the
 *		       filesystem.
 *
 */
int
xfs_sync(
	xfs_mount_t	*mp,
	int		flags)
{
	/*
	 * Start with the quota manager so the dquots get flushed.
	 * XFS_QM_DQSYNC always yields zero when quota support is not
	 * enabled or this filesystem instance does not use quotas.
	 */
	int		error = XFS_QM_DQSYNC(mp, flags);

	if (error != 0) {
		/*
		 * An I/O error here means a shutdown is under way, in
		 * which case there is nothing more for us to do.  Any
		 * other dquot sync error is not fatal to the rest of
		 * the sync and is not reported to the caller.
		 */
		ASSERT(error != EIO || XFS_FORCED_SHUTDOWN(mp));
		if (XFS_FORCED_SHUTDOWN(mp))
			return XFS_ERROR(error);
	}

	/* callers that asked for SYNC_IOWAIT also get filestreams flushed */
	if (flags & SYNC_IOWAIT)
		xfs_filestream_flush(mp);

	/* the rest of the work is shared with internal callers */
	return xfs_syncsub(mp, flags, NULL);
}

/*
 * Sync all the inodes in the given AG according to the
 * direction given by the flags.
 *
 * The per-AG inode radix tree is walked one inode at a time, under
 * pag_ici_lock, using single-entry gang lookups.
 *
 *	mp	 - mount the AG belongs to
 *	ag	 - index of the AG in mp->m_perag
 *	flags	 - SYNC_* flags; SYNC_DELWRI, SYNC_CLOSE, SYNC_WAIT,
 *		   SYNC_IOWAIT and SYNC_ATTR are examined here
 *	bypassed - optional counter; bumped for every dirty inode whose
 *		   flush lock could not be taken without sleeping when
 *		   SYNC_WAIT is not set
 *
 * Returns the last error seen while walking the AG (0 if none).  The
 * walk stops immediately and returns EFSCORRUPTED if a flush reports
 * it, and stops with 0 if the filesystem has been shut down and
 * SYNC_CLOSE is not set.
 */
STATIC int
xfs_sync_inodes_ag(
	xfs_mount_t	*mp,
	int		ag,
	int		flags,
	int		*bypassed)
{
	xfs_inode_t	*ip = NULL;
	struct inode	*vp = NULL;
	xfs_perag_t	*pag = &mp->m_perag[ag];
	boolean_t	vnode_refed = B_FALSE;
	int		nr_found;
	/*
	 * The radix tree index is an unsigned long and the values stored
	 * in it are AG inode numbers; a plain int would go negative (and
	 * sign-extend) for large inode numbers.
	 */
	unsigned long	first_index = 0;
	int		done = 0;
	int		error = 0;
	int		last_error = 0;
	int		fflag = XFS_B_ASYNC;
	int		lock_flags = XFS_ILOCK_SHARED;

	if (flags & SYNC_DELWRI)
		fflag = XFS_B_DELWRI;
	if (flags & SYNC_WAIT)
		fflag = 0;		/* synchronous overrides all */

	if (flags & (SYNC_DELWRI | SYNC_CLOSE)) {
		/*
		 * We need the I/O lock if we're going to call any of
		 * the flush/inval routines.
		 */
		lock_flags |= XFS_IOLOCK_SHARED;
	}

	do {
		/*
		 * use a gang lookup to find the next inode in the tree
		 * as the tree is sparse and a gang lookup walks to find
		 * the number of objects requested.
		 */
		read_lock(&pag->pag_ici_lock);
		nr_found = radix_tree_gang_lookup(&pag->pag_ici_root,
				(void**)&ip, first_index, 1);

		if (!nr_found) {
			read_unlock(&pag->pag_ici_lock);
			break;
		}

		/*
		 * Update the index for the next lookup.  If this is the
		 * last inode number the AG can hold, adding one wraps the
		 * AG-relative index back to a smaller value; looking that
		 * up would restart the walk from the beginning and never
		 * terminate.  Treat the wrap as the end of the walk: this
		 * inode is still processed (or skipped) below and then
		 * the loop exits.
		 */
		first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
		if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
			done = 1;

		/*
		 * skip inodes in reclaim. Let xfs_syncsub do that for
		 * us so we don't need to worry.
		 */
		vp = VFS_I(ip);
		if (!vp) {
			read_unlock(&pag->pag_ici_lock);
			continue;
		}

		/* bad inodes are dealt with elsewhere */
		if (VN_BAD(vp)) {
			read_unlock(&pag->pag_ici_lock);
			continue;
		}

		/* nothing to sync during shutdown */
		if (XFS_FORCED_SHUTDOWN(mp) && !(flags & SYNC_CLOSE)) {
			read_unlock(&pag->pag_ici_lock);
			return 0;
		}

		/*
		 * The inode lock here actually coordinates with the almost
		 * spurious inode lock in xfs_ireclaim() to prevent the vnode
		 * we handle here without a reference from being freed while we
		 * reference it.  If we lock the inode while it's on the mount
		 * list here, then the spurious inode lock in xfs_ireclaim()
		 * after the inode is pulled from the mount list will sleep
		 * until we release it here.  This keeps the vnode from being
		 * freed while we reference it.
		 */
		if (xfs_ilock_nowait(ip, lock_flags) == 0) {
			/*
			 * The trylock failed, so we will have to sleep for
			 * the lock.  Take a vnode reference while the tree
			 * lock still pins the inode, then drop the tree lock
			 * before blocking.
			 */
			vp = vn_grab(vp);
			read_unlock(&pag->pag_ici_lock);
			if (!vp)
				continue;
			xfs_ilock(ip, lock_flags);

			ASSERT(vp == VFS_I(ip));
			ASSERT(ip->i_mount == mp);

			vnode_refed = B_TRUE;
		} else {
			/*
			 * We got the inode lock without sleeping.  No vnode
			 * reference is taken in this case; holding the inode
			 * lock is what protects the inode (see above), so the
			 * tree lock can be dropped now.
			 */
			read_unlock(&pag->pag_ici_lock);
		}
		/*
		 * If we have to flush data or wait for I/O completion
		 * we need to drop the ilock that we currently hold.
		 * Only XFS_ILOCK_SHARED is dropped and retaken around
		 * the flush; the I/O lock taken above stays held.
		 */
		if (flags & SYNC_CLOSE) {
			xfs_iunlock(ip, XFS_ILOCK_SHARED);
			if (XFS_FORCED_SHUTDOWN(mp))
				xfs_tosspages(ip, 0, -1, FI_REMAPF);
			else
				error = xfs_flushinval_pages(ip, 0, -1,
							FI_REMAPF);
			/* wait for I/O on freeze */
			if (flags & SYNC_IOWAIT)
				vn_iowait(ip);

			xfs_ilock(ip, XFS_ILOCK_SHARED);
		}

		if ((flags & SYNC_DELWRI) && VN_DIRTY(vp)) {
			xfs_iunlock(ip, XFS_ILOCK_SHARED);
			error = xfs_flush_pages(ip, 0, -1, fflag, FI_NONE);
			if (flags & SYNC_IOWAIT)
				vn_iowait(ip);
			xfs_ilock(ip, XFS_ILOCK_SHARED);
		}

		if ((flags & SYNC_ATTR) && !xfs_inode_clean(ip)) {
			/*
			 * The inode is re-checked once the flush lock is
			 * held: it may have been cleaned while we waited.
			 * When the inode turns out to be clean the flush
			 * lock is released here.
			 */
			if (flags & SYNC_WAIT) {
				xfs_iflock(ip);
				if (!xfs_inode_clean(ip))
					error = xfs_iflush(ip, XFS_IFLUSH_SYNC);
				else
					xfs_ifunlock(ip);
			} else if (xfs_iflock_nowait(ip)) {
				if (!xfs_inode_clean(ip))
					error = xfs_iflush(ip, XFS_IFLUSH_DELWRI);
				else
					xfs_ifunlock(ip);
			} else if (bypassed) {
				(*bypassed)++;
			}
		}

		if (lock_flags)
			xfs_iunlock(ip, lock_flags);

		if (vnode_refed) {
			IRELE(ip);
			vnode_refed = B_FALSE;
		}

		if (error)
			last_error = error;
		/*
		 * bail out if the filesystem is corrupted.
		 */
		if (error == EFSCORRUPTED)
			return XFS_ERROR(error);

	} while (nr_found && !done);

	return last_error;
}

/*
 * Sync the inodes of every AG whose in-core inode cache has been
 * initialised, by calling xfs_sync_inodes_ag() on each in turn.
 *
 * *bypassed (if supplied) is reset to zero before anything else and is
 * then handed down to the per-AG walk.  Read-only mounts are a no-op.
 *
 * Returns the last error reported by any AG; an EFSCORRUPTED result
 * stops the scan early.
 */
int
xfs_sync_inodes(
	xfs_mount_t	*mp,
	int		flags,
	int             *bypassed)
{
	int		last_error = 0;
	int		agno;

	if (bypassed)
		*bypassed = 0;
	if (mp->m_flags & XFS_MOUNT_RDONLY)
		return 0;

	for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
		int	error;

		/* nothing cached for this AG yet */
		if (!mp->m_perag[agno].pag_ici_init)
			continue;

		error = xfs_sync_inodes_ag(mp, agno, flags, bypassed);
		if (!error)
			continue;

		last_error = error;
		if (error == EFSCORRUPTED)
			break;
	}
	return XFS_ERROR(last_error);
}

/*
 * xfs sync routine for internal use
 *
 * This routine supports all of the flags defined for the generic vfs_sync
 * interface as explained above under xfs_sync.
 *
 * bypassed, if not NULL, is handed to xfs_sync_inodes() when the inodes
 * are synced without SYNC_BDFLUSH.
 *
 * Returns 0, or the most recent error reported by the inode sync, the
 * superblock write or the commit of the dummy log-covering transaction.
 * A failure to reserve log space for the dummy transaction is returned
 * immediately.
 */
int
xfs_syncsub(
	xfs_mount_t	*mp,
	int		flags,
	int             *bypassed)
{
	int		error = 0;
	int		last_error = 0;
	uint		log_flags = XFS_LOG_FORCE;
	xfs_buf_t	*bp;
	xfs_buf_log_item_t	*bip;

	/*
	 * Sync out the log.  This ensures that the log is periodically
	 * flushed even if there is not enough activity to fill it up.
	 */
	if (flags & SYNC_WAIT)
		log_flags |= XFS_LOG_SYNC;

	xfs_log_force(mp, (xfs_lsn_t)0, log_flags);

	if (flags & (SYNC_ATTR|SYNC_DELWRI)) {
		if (flags & SYNC_BDFLUSH)
			xfs_finish_reclaim_all(mp, 1, XFS_IFLUSH_DELWRI_ELSE_ASYNC);
		else
			error = xfs_sync_inodes(mp, flags, bypassed);
		/*
		 * Record the inode sync result now: "error" is reused
		 * by the superblock write and the dummy transaction
		 * below, and the failure would otherwise be lost.
		 */
		if (error)
			last_error = error;
	}

	/*
	 * Flushing out dirty data above probably generated more
	 * log activity, so if this isn't vfs_sync() then flush
	 * the log again.
	 */
	if (flags & SYNC_DELWRI) {
		xfs_log_force(mp, (xfs_lsn_t)0, log_flags);
	}

	if (flags & SYNC_FSDATA) {
		/*
		 * If this is vfs_sync() then only sync the superblock
		 * if we can lock it without sleeping and it is not pinned.
		 */
		if (flags & SYNC_BDFLUSH) {
			bp = xfs_getsb(mp, XFS_BUF_TRYLOCK);
			if (bp != NULL) {
				bip = XFS_BUF_FSPRIVATE(bp,xfs_buf_log_item_t*);
				if ((bip != NULL) &&
				    xfs_buf_item_dirty(bip)) {
					if (!(XFS_BUF_ISPINNED(bp))) {
						XFS_BUF_ASYNC(bp);
						error = xfs_bwrite(mp, bp);
					} else {
						xfs_buf_relse(bp);
					}
				} else {
					xfs_buf_relse(bp);
				}
			}
		} else {
			bp = xfs_getsb(mp, 0);
			/*
			 * If the buffer is pinned then push on the log so
			 * we won't get stuck waiting in the write for
			 * someone, maybe ourselves, to flush the log.
			 * Even though we just pushed the log above, we
			 * did not have the superblock buffer locked at
			 * that point so it can become pinned in between
			 * there and here.
			 */
			if (XFS_BUF_ISPINNED(bp))
				xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE);
			if (flags & SYNC_WAIT)
				XFS_BUF_UNASYNC(bp);
			else
				XFS_BUF_ASYNC(bp);
			error = xfs_bwrite(mp, bp);
		}
		if (error) {
			last_error = error;
		}
	}

	/*
	 * Now check to see if the log needs a "dummy" transaction.
	 */
	if (!(flags & SYNC_REMOUNT) && xfs_log_need_covered(mp)) {
		xfs_trans_t *tp;
		xfs_inode_t *ip;

		/*
		 * Put a dummy transaction in the log to tell
		 * recovery that all others are OK.
		 */
		tp = xfs_trans_alloc(mp, XFS_TRANS_DUMMY1);
		if ((error = xfs_trans_reserve(tp, 0,
				XFS_ICHANGE_LOG_RES(mp),
				0, 0, 0)))  {
			xfs_trans_cancel(tp, 0);
			return error;
		}

		/* the dummy transaction just logs the root inode core */
		ip = mp->m_rootip;
		xfs_ilock(ip, XFS_ILOCK_EXCL);

		xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
		xfs_trans_ihold(tp, ip);
		xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
		error = xfs_trans_commit(tp, 0);
		xfs_iunlock(ip, XFS_ILOCK_EXCL);
		xfs_log_force(mp, (xfs_lsn_t)0, log_flags);
		/* a failed commit must be reported, not silently dropped */
		if (error)
			last_error = error;
	}

	/*
	 * When shutting down, we need to ensure that the AIL is pushed
	 * to disk or the filesystem can appear corrupt from the PROM.
	 */
	if ((flags & (SYNC_CLOSE|SYNC_WAIT)) == (SYNC_CLOSE|SYNC_WAIT)) {
		XFS_bflush(mp->m_ddev_targp);
		if (mp->m_rtdev_targp) {
			XFS_bflush(mp->m_rtdev_targp);
		}
	}

	return XFS_ERROR(last_error);
}

/*
 * Enqueue a work item to be picked up by the vfs xfssyncd thread.
 * Doing this has two advantages:
 * - It saves on stack space, which is tight in certain situations
 * - It can be used (with care) as a mechanism to avoid deadlocks.
 * Flushing while allocating in a full filesystem requires both.
 */
STATIC void
xfs_syncd_queue_work(
	struct xfs_mount *mp,
	void		*data,
	void		(*syncer)(struct xfs_mount *, void *))
{
	struct bhv_vfs_sync_work *item;

	/* build the work item: syncer(mp, data) is what xfssyncd will call */
	item = kmem_alloc(sizeof(*item), KM_SLEEP);
	item->w_mount = mp;
	item->w_data = data;
	item->w_syncer = syncer;
	INIT_LIST_HEAD(&item->w_list);

	/* append it to the mount's pending list under m_sync_lock */
	spin_lock(&mp->m_sync_lock);
	list_add_tail(&item->w_list, &mp->m_sync_list);
	spin_unlock(&mp->m_sync_lock);

	/* and kick the sync thread so it notices the new entry */
	wake_up_process(mp->m_sync_task);
}

/*
 * Flush delayed allocate data, attempting to free up reserved space
 * from existing allocations.  At this point a new allocation attempt
 * has failed with ENOSPC and we are in the process of scratching our
 * heads, looking about for more room...
 */
STATIC void
xfs_flush_inode_work(
	struct xfs_mount *mp,
	void		*arg)
{
	/* arg is the VFS inode that xfs_flush_inode() grabbed and queued */
	struct inode	*vi = (struct inode *)arg;

	/* start writeback on the inode's page cache mapping ... */
	filemap_flush(vi->i_mapping);
	/* ... and give back the reference that came with the work item */
	iput(vi);
}

/*
 * Ask the sync thread to flush the data of one inode, then sleep for
 * half a second to give that flush a chance to make progress.
 *
 * A reference is taken on the VFS inode for the queued work item;
 * xfs_flush_inode_work() drops it when the work has run.
 */
void
xfs_flush_inode(
	xfs_inode_t	*ip)
{
	struct inode	*inode = VFS_I(ip);

	/*
	 * igrab() returns NULL when the inode is already being torn down.
	 * Queueing the work anyway would make the worker flush and iput()
	 * an inode we hold no reference on, so there is nothing to do.
	 */
	if (!igrab(inode))
		return;
	xfs_syncd_queue_work(ip->i_mount, inode, xfs_flush_inode_work);
	delay(msecs_to_jiffies(500));
}

/*
 * This is the "bigger hammer" version of xfs_flush_inode_work...
 * (IOW, "If at first you don't succeed, use a Bigger Hammer").
 */
STATIC void
xfs_flush_device_work(
	struct xfs_mount *mp,
	void		*arg)
{
	/* arg is the VFS inode whose reference rides along with the work */
	struct inode	*vi = (struct inode *)arg;

	/* write out the whole block device backing this superblock */
	sync_blockdev(mp->m_super->s_bdev);
	/* release the inode reference carried by the work item */
	iput(vi);
}

/*
 * Ask the sync thread to sync the whole block device, sleep for half a
 * second to let it make progress, then force the log synchronously.
 *
 * The inode is only carried along so that it stays referenced until
 * the queued work has run; the device flush itself does not use it.
 */
void
xfs_flush_device(
	xfs_inode_t	*ip)
{
	/*
	 * igrab() returns NULL if the inode is already being torn down.
	 * Pass its result (rather than the raw inode) to the worker so
	 * that the worker's iput() only ever drops a reference that was
	 * really taken; iput(NULL) is a no-op, and the device still gets
	 * synced either way.
	 */
	struct inode	*inode = igrab(VFS_I(ip));

	xfs_syncd_queue_work(ip->i_mount, inode, xfs_flush_device_work);
	delay(msecs_to_jiffies(500));
	xfs_log_force(ip->i_mount, (xfs_lsn_t)0, XFS_LOG_FORCE|XFS_LOG_SYNC);
}

/*
 * The periodic sync work item run by xfssyncd.
 *
 * Unless the mount is read-only, push out the superblock and inode
 * metadata in non-blocking (SYNC_BDFLUSH) mode.  Then bump m_sync_seq
 * and wake anyone sleeping on m_wait_single_sync_task.
 *
 * The second argument is the work item's data pointer and is not used.
 */
STATIC void
xfs_sync_worker(
	struct xfs_mount *mp,
	void		*unused)
{
	/*
	 * The result of the sync is not acted upon: this function returns
	 * void and the next periodic run will simply try again.
	 */
	if (!(mp->m_flags & XFS_MOUNT_RDONLY))
		(void)xfs_sync(mp, SYNC_FSDATA | SYNC_BDFLUSH | SYNC_ATTR);
	mp->m_sync_seq++;
	wake_up(&mp->m_wait_single_sync_task);
}

/*
 * Main loop of the per-mount sync thread.
 *
 * arg is the struct xfs_mount being served.  The thread sleeps until it
 * is woken or its interval of xfs_syncd_centisecs hundredths of a
 * second expires.  It then runs every work item found on
 * mp->m_sync_list; the mount's own periodic item (mp->m_sync_work) is
 * added to that batch when the interval expired or the list was empty.
 *
 * Returns 0 once the thread has been asked to stop and no work is
 * left on the list.
 */
STATIC int
xfssyncd(
	void			*arg)
{
	struct xfs_mount	*mp = arg;
	long			timeleft;
	bhv_vfs_sync_work_t	*work, *n;
	LIST_HEAD		(tmp);

	set_freezable();
	/*
	 * Convert the whole interval to jiffies in one go.  Multiplying
	 * msecs_to_jiffies(10) by the centisecond count instead multiplies
	 * the rounding error when 10ms is not a whole number of jiffies
	 * (e.g. HZ=250), stretching the interval noticeably.
	 */
	timeleft = msecs_to_jiffies(xfs_syncd_centisecs * 10);
	for (;;) {
		timeleft = schedule_timeout_interruptible(timeleft);
		/* swsusp */
		try_to_freeze();
		/* only exit once all queued work has been run */
		if (kthread_should_stop() && list_empty(&mp->m_sync_list))
			break;

		spin_lock(&mp->m_sync_lock);
		/*
		 * We can get woken by laptop mode, to do a sync -
		 * that's the (only!) case where the list would be
		 * empty with time remaining.
		 */
		if (!timeleft || list_empty(&mp->m_sync_list)) {
			if (!timeleft)
				timeleft = msecs_to_jiffies(
						xfs_syncd_centisecs * 10);
			INIT_LIST_HEAD(&mp->m_sync_work.w_list);
			list_add_tail(&mp->m_sync_work.w_list,
					&mp->m_sync_list);
		}
		/*
		 * Move everything to a private list so the work can be
		 * run without holding m_sync_lock.
		 */
		list_for_each_entry_safe(work, n, &mp->m_sync_list, w_list)
			list_move(&work->w_list, &tmp);
		spin_unlock(&mp->m_sync_lock);

		list_for_each_entry_safe(work, n, &tmp, w_list) {
			(*work->w_syncer)(mp, work->w_data);
			list_del(&work->w_list);
			/* m_sync_work lives inside the mount; never free it */
			if (work == &mp->m_sync_work)
				continue;
			kmem_free(work);
		}
	}

	return 0;
}

/*
 * Set up the mount's periodic sync work item and start the "xfssyncd"
 * kernel thread for it.
 *
 * Returns 0 on success.  If the thread could not be started, the error
 * encoded in the returned pointer is negated and returned.
 */
int
xfs_syncd_init(
	struct xfs_mount	*mp)
{
	/* the embedded work item runs xfs_sync_worker() against this mount */
	mp->m_sync_work.w_mount = mp;
	mp->m_sync_work.w_syncer = xfs_sync_worker;

	mp->m_sync_task = kthread_run(xfssyncd, mp, "xfssyncd");
	if (!IS_ERR(mp->m_sync_task))
		return 0;

	return -PTR_ERR(mp->m_sync_task);
}

/*
 * Stop the mount's sync thread.  xfssyncd only leaves its loop once its
 * work list is empty, so queued work is run before the thread exits.
 */
void
xfs_syncd_stop(
	struct xfs_mount	*mp)
{
	struct task_struct	*syncd = mp->m_sync_task;

	kthread_stop(syncd);
}
Commit	Line	Data
fe4fa4b8 DC	1	/*
	2	* Copyright (c) 2000-2005 Silicon Graphics, Inc.
	3	* All Rights Reserved.
	4	*
	5	* This program is free software; you can redistribute it and/or
	6	* modify it under the terms of the GNU General Public License as
	7	* published by the Free Software Foundation.
	8	*
	9	* This program is distributed in the hope that it would be useful,
	10	* but WITHOUT ANY WARRANTY; without even the implied warranty of
	11	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	12	* GNU General Public License for more details.
	13	*
	14	* You should have received a copy of the GNU General Public License
	15	* along with this program; if not, write the Free Software Foundation,
	16	* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
	17	*/
	18	#include "xfs.h"
	19	#include "xfs_fs.h"
	20	#include "xfs_types.h"
	21	#include "xfs_bit.h"
	22	#include "xfs_log.h"
	23	#include "xfs_inum.h"
	24	#include "xfs_trans.h"
	25	#include "xfs_sb.h"
	26	#include "xfs_ag.h"
	27	#include "xfs_dir2.h"
	28	#include "xfs_dmapi.h"
	29	#include "xfs_mount.h"
	30	#include "xfs_bmap_btree.h"
	31	#include "xfs_alloc_btree.h"
	32	#include "xfs_ialloc_btree.h"
	33	#include "xfs_btree.h"
	34	#include "xfs_dir2_sf.h"
	35	#include "xfs_attr_sf.h"
	36	#include "xfs_inode.h"
	37	#include "xfs_dinode.h"
	38	#include "xfs_error.h"
	39	#include "xfs_mru_cache.h"
	40	#include "xfs_filestream.h"
	41	#include "xfs_vnodeops.h"
	42	#include "xfs_utils.h"
	43	#include "xfs_buf_item.h"
	44	#include "xfs_inode_item.h"
	45	#include "xfs_rw.h"
	46
a167b17e DC	47	#include <linux/kthread.h>
	48	#include <linux/freezer.h>
	49
fe4fa4b8 DC	50	/*
	51	* xfs_sync flushes any pending I/O to file system vfsp.
	52	*
	53	* This routine is called by vfs_sync() to make sure that things make it
	54	* out to disk eventually, on sync() system calls to flush out everything,
	55	* and when the file system is unmounted. For the vfs_sync() case, all
	56	* we really need to do is sync out the log to make all of our meta-data
	57	* updates permanent (except for timestamps). For calls from pflushd(),
	58	* dirty pages are kept moving by calling pdflush() on the inodes
	59	* containing them. We also flush the inodes that we can lock without
	60	* sleeping and the superblock if we can lock it without sleeping from
	61	* vfs_sync() so that items at the tail of the log are always moving out.
	62	*
	63	* Flags:
	64	* SYNC_BDFLUSH - We're being called from vfs_sync() so we don't want
	65	* to sleep if we can help it. All we really need
	66	* to do is ensure that the log is synced at least
	67	* periodically. We also push the inodes and
	68	* superblock if we can lock them without sleeping
	69	* and they are not pinned.
	70	* SYNC_ATTR - We need to flush the inodes. If SYNC_BDFLUSH is not
	71	* set, then we really want to lock each inode and flush
	72	* it.
	73	* SYNC_WAIT - All the flushes that take place in this call should
	74	* be synchronous.
	75	* SYNC_DELWRI - This tells us to push dirty pages associated with
	76	* inodes. SYNC_WAIT and SYNC_BDFLUSH are used to
	77	* determine if they should be flushed sync, async, or
	78	* delwri.
	79	* SYNC_CLOSE - This flag is passed when the system is being
	80	* unmounted. We should sync and invalidate everything.
	81	* SYNC_FSDATA - This indicates that the caller would like to make
	82	* sure the superblock is safe on disk. We can ensure
	83	* this by simply making sure the log gets flushed
	84	* if SYNC_BDFLUSH is set, and by actually writing it
	85	* out otherwise.
	86	* SYNC_IOWAIT - The caller wants us to wait for all data I/O to complete
	87	* before we return (including direct I/O). Forms the drain
	88	* side of the write barrier needed to safely quiesce the
	89	* filesystem.
	90	*
	91	*/
	92	int
	93	xfs_sync(
	94	xfs_mount_t *mp,
	95	int flags)
	96	{
	97	int error;
	98
	99	/*
	100	* Get the Quota Manager to flush the dquots.
	101	*
	102	* If XFS quota support is not enabled or this filesystem
	103	* instance does not use quotas XFS_QM_DQSYNC will always
	104	* return zero.
	105	*/
	106	error = XFS_QM_DQSYNC(mp, flags);
	107	if (error) {
	108	/*
	109	* If we got an IO error, we will be shutting down.
	110	* So, there's nothing more for us to do here.
	111	*/
	112	ASSERT(error != EIO \|\| XFS_FORCED_SHUTDOWN(mp));
	113	if (XFS_FORCED_SHUTDOWN(mp))
114	return XFS_ERROR(error);
115	}
116
117	if (flags & SYNC_IOWAIT)
118	xfs_filestream_flush(mp);
119
120	return xfs_syncsub(mp, flags, NULL);
121	}
122
123	/*
683a8970 DC	124	* Sync all the inodes in the given AG according to the
683a8970 DC	125	* direction given by the flags.
fe4fa4b8	126	*/
683a8970 DC	127	STATIC int
683a8970 DC	128	xfs_sync_inodes_ag(
fe4fa4b8	129	xfs_mount_t *mp,
683a8970	130	int ag,
fe4fa4b8	131	int flags,
683a8970	132	int *bypassed)
fe4fa4b8 DC	133	{
	134	xfs_inode_t *ip = NULL;
	135	struct inode *vp = NULL;
683a8970 DC	136	xfs_perag_t *pag = &mp->m_perag[ag];
	137	boolean_t vnode_refed = B_FALSE;
	138	int nr_found;
	139	int first_index = 0;
	140	int error = 0;
	141	int last_error = 0;
	142	int fflag = XFS_B_ASYNC;
	143	int lock_flags = XFS_ILOCK_SHARED;
fe4fa4b8	144
fe4fa4b8 DC	145	if (flags & SYNC_DELWRI)
	146	fflag = XFS_B_DELWRI;
	147	if (flags & SYNC_WAIT)
	148	fflag = 0; /* synchronous overrides all */
	149
fe4fa4b8 DC	150	if (flags & (SYNC_DELWRI \| SYNC_CLOSE)) {
	151	/*
	152	* We need the I/O lock if we're going to call any of
	153	* the flush/inval routines.
	154	*/
683a8970	155	lock_flags \|= XFS_IOLOCK_SHARED;
fe4fa4b8 DC	156	}
fe4fa4b8 DC	157
fe4fa4b8	158	do {
fe4fa4b8	159	/*
683a8970 DC	160	* use a gang lookup to find the next inode in the tree
	161	* as the tree is sparse and a gang lookup walks to find
	162	* the number of objects requested.
fe4fa4b8	163	*/
683a8970 DC	164	read_lock(&pag->pag_ici_lock);
	165	nr_found = radix_tree_gang_lookup(&pag->pag_ici_root,
	166	(void**)&ip, first_index, 1);
fe4fa4b8	167
683a8970 DC	168	if (!nr_found) {
	169	read_unlock(&pag->pag_ici_lock);
	170	break;
fe4fa4b8 DC	171	}
fe4fa4b8 DC	172
683a8970 DC	173	/* update the index for the next lookup */
683a8970 DC	174	first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
fe4fa4b8 DC	175
fe4fa4b8 DC	176	/*
683a8970 DC	177	* skip inodes in reclaim. Let xfs_syncsub do that for
683a8970 DC	178	* us so we don't need to worry.
fe4fa4b8	179	*/
683a8970 DC	180	vp = VFS_I(ip);
	181	if (!vp) {
	182	read_unlock(&pag->pag_ici_lock);
fe4fa4b8 DC	183	continue;
	184	}
	185
683a8970	186	/* bad inodes are dealt with elsewhere */
fe4fa4b8	187	if (VN_BAD(vp)) {
683a8970	188	read_unlock(&pag->pag_ici_lock);
fe4fa4b8 DC	189	continue;
	190	}
	191
683a8970	192	/* nothing to sync during shutdown */
fe4fa4b8	193	if (XFS_FORCED_SHUTDOWN(mp) && !(flags & SYNC_CLOSE)) {
683a8970	194	read_unlock(&pag->pag_ici_lock);
fe4fa4b8 DC	195	return 0;
	196	}
	197
	198	/*
683a8970 DC	199	* The inode lock here actually coordinates with the almost
	200	* spurious inode lock in xfs_ireclaim() to prevent the vnode
	201	* we handle here without a reference from being freed while we
	202	* reference it. If we lock the inode while it's on the mount
	203	* list here, then the spurious inode lock in xfs_ireclaim()
	204	* after the inode is pulled from the mount list will sleep
	205	* until we release it here. This keeps the vnode from being
	206	* freed while we reference it.
fe4fa4b8 DC	207	*/
fe4fa4b8 DC	208	if (xfs_ilock_nowait(ip, lock_flags) == 0) {
fe4fa4b8	209	vp = vn_grab(vp);
683a8970 DC	210	read_unlock(&pag->pag_ici_lock);
683a8970 DC	211	if (!vp)
fe4fa4b8	212	continue;
fe4fa4b8 DC	213	xfs_ilock(ip, lock_flags);
	214
	215	ASSERT(vp == VFS_I(ip));
	216	ASSERT(ip->i_mount == mp);
	217
	218	vnode_refed = B_TRUE;
683a8970 DC	219	} else {
	220	/* safe to unlock here as we have a reference */
	221	read_unlock(&pag->pag_ici_lock);
fe4fa4b8	222	}
fe4fa4b8 DC	223	/*
	224	* If we have to flush data or wait for I/O completion
	225	* we need to drop the ilock that we currently hold.
	226	* If we need to drop the lock, insert a marker if we
	227	* have not already done so.
	228	*/
683a8970	229	if (flags & SYNC_CLOSE) {
fe4fa4b8	230	xfs_iunlock(ip, XFS_ILOCK_SHARED);
683a8970 DC	231	if (XFS_FORCED_SHUTDOWN(mp))
	232	xfs_tosspages(ip, 0, -1, FI_REMAPF);
	233	else
	234	error = xfs_flushinval_pages(ip, 0, -1,
	235	FI_REMAPF);
	236	/* wait for I/O on freeze */
fe4fa4b8 DC	237	if (flags & SYNC_IOWAIT)
	238	vn_iowait(ip);
	239
	240	xfs_ilock(ip, XFS_ILOCK_SHARED);
	241	}
	242
683a8970 DC	243	if ((flags & SYNC_DELWRI) && VN_DIRTY(vp)) {
	244	xfs_iunlock(ip, XFS_ILOCK_SHARED);
	245	error = xfs_flush_pages(ip, 0, -1, fflag, FI_NONE);
	246	if (flags & SYNC_IOWAIT)
	247	vn_iowait(ip);
	248	xfs_ilock(ip, XFS_ILOCK_SHARED);
	249	}
fe4fa4b8	250
683a8970	251	if ((flags & SYNC_ATTR) && !xfs_inode_clean(ip)) {
fe4fa4b8 DC	252	if (flags & SYNC_WAIT) {
fe4fa4b8 DC	253	xfs_iflock(ip);
683a8970 DC	254	if (!xfs_inode_clean(ip))
	255	error = xfs_iflush(ip, XFS_IFLUSH_SYNC);
	256	else
	257	xfs_ifunlock(ip);
fe4fa4b8	258	} else if (xfs_iflock_nowait(ip)) {
683a8970 DC	259	if (!xfs_inode_clean(ip))
	260	error = xfs_iflush(ip, XFS_IFLUSH_DELWRI);
	261	else
	262	xfs_ifunlock(ip);
fe4fa4b8 DC	263	} else if (bypassed) {
	264	(*bypassed)++;
	265	}
	266	}
	267
683a8970	268	if (lock_flags)
fe4fa4b8	269	xfs_iunlock(ip, lock_flags);
fe4fa4b8 DC	270
fe4fa4b8 DC	271	if (vnode_refed) {
fe4fa4b8	272	IRELE(ip);
fe4fa4b8 DC	273	vnode_refed = B_FALSE;
	274	}
	275
683a8970	276	if (error)
fe4fa4b8	277	last_error = error;
fe4fa4b8 DC	278	/*
	279	* bail out if the filesystem is corrupted.
	280	*/
683a8970	281	if (error == EFSCORRUPTED)
fe4fa4b8	282	return XFS_ERROR(error);
fe4fa4b8	283
683a8970	284	} while (nr_found);
fe4fa4b8	285
683a8970 DC	286	return last_error;
683a8970 DC	287	}
fe4fa4b8	288
683a8970 DC	289	int
	290	xfs_sync_inodes(
	291	xfs_mount_t *mp,
	292	int flags,
	293	int *bypassed)
	294	{
	295	int error;
	296	int last_error;
	297	int i;
fe4fa4b8	298
683a8970 DC	299	if (bypassed)
	300	*bypassed = 0;
	301	if (mp->m_flags & XFS_MOUNT_RDONLY)
	302	return 0;
	303	error = 0;
	304	last_error = 0;
fe4fa4b8	305
683a8970 DC	306	for (i = 0; i < mp->m_sb.sb_agcount; i++) {
	307	if (!mp->m_perag[i].pag_ici_init)
	308	continue;
	309	error = xfs_sync_inodes_ag(mp, i, flags, bypassed);
	310	if (error)
	311	last_error = error;
	312	if (error == EFSCORRUPTED)
	313	break;
	314	}
fe4fa4b8 DC	315	return XFS_ERROR(last_error);
	316	}
	317
	318	/*
	319	* xfs sync routine for internal use
	320	*
	321	* This routine supports all of the flags defined for the generic vfs_sync
	322	* interface as explained above under xfs_sync.
	323	*
	324	*/
	325	int
	326	xfs_syncsub(
	327	xfs_mount_t *mp,
	328	int flags,
	329	int *bypassed)
	330	{
	331	int error = 0;
	332	int last_error = 0;
	333	uint log_flags = XFS_LOG_FORCE;
	334	xfs_buf_t *bp;
	335	xfs_buf_log_item_t *bip;
	336
	337	/*
	338	* Sync out the log. This ensures that the log is periodically
	339	* flushed even if there is not enough activity to fill it up.
	340	*/
	341	if (flags & SYNC_WAIT)
	342	log_flags \|= XFS_LOG_SYNC;
	343
	344	xfs_log_force(mp, (xfs_lsn_t)0, log_flags);
	345
	346	if (flags & (SYNC_ATTR\|SYNC_DELWRI)) {
	347	if (flags & SYNC_BDFLUSH)
75c68f41	348	xfs_finish_reclaim_all(mp, 1, XFS_IFLUSH_DELWRI_ELSE_ASYNC);
fe4fa4b8 DC	349	else
	350	error = xfs_sync_inodes(mp, flags, bypassed);
	351	}
	352
	353	/*
	354	* Flushing out dirty data above probably generated more
	355	* log activity, so if this isn't vfs_sync() then flush
	356	* the log again.
	357	*/
	358	if (flags & SYNC_DELWRI) {
	359	xfs_log_force(mp, (xfs_lsn_t)0, log_flags);
	360	}
	361
	362	if (flags & SYNC_FSDATA) {
	363	/*
	364	* If this is vfs_sync() then only sync the superblock
	365	* if we can lock it without sleeping and it is not pinned.
	366	*/
	367	if (flags & SYNC_BDFLUSH) {
	368	bp = xfs_getsb(mp, XFS_BUF_TRYLOCK);
	369	if (bp != NULL) {
	370	bip = XFS_BUF_FSPRIVATE(bp,xfs_buf_log_item_t*);
	371	if ((bip != NULL) &&
	372	xfs_buf_item_dirty(bip)) {
	373	if (!(XFS_BUF_ISPINNED(bp))) {
	374	XFS_BUF_ASYNC(bp);
	375	error = xfs_bwrite(mp, bp);
	376	} else {
	377	xfs_buf_relse(bp);
	378	}
	379	} else {
	380	xfs_buf_relse(bp);
	381	}
	382	}
	383	} else {
	384	bp = xfs_getsb(mp, 0);
	385	/*
	386	* If the buffer is pinned then push on the log so
	387	* we won't get stuck waiting in the write for
	388	* someone, maybe ourselves, to flush the log.
	389	* Even though we just pushed the log above, we
	390	* did not have the superblock buffer locked at
	391	* that point so it can become pinned in between
	392	* there and here.
	393	*/
	394	if (XFS_BUF_ISPINNED(bp))
	395	xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE);
	396	if (flags & SYNC_WAIT)
	397	XFS_BUF_UNASYNC(bp);
	398	else
	399	XFS_BUF_ASYNC(bp);
	400	error = xfs_bwrite(mp, bp);
	401	}
	402	if (error) {
	403	last_error = error;
	404	}
	405	}
	406
	407	/*
	408	* Now check to see if the log needs a "dummy" transaction.
	409	*/
	410	if (!(flags & SYNC_REMOUNT) && xfs_log_need_covered(mp)) {
	411	xfs_trans_t *tp;
	412	xfs_inode_t *ip;
413
414	/*
415	* Put a dummy transaction in the log to tell
416	* recovery that all others are OK.
417	*/
418	tp = xfs_trans_alloc(mp, XFS_TRANS_DUMMY1);
419	if ((error = xfs_trans_reserve(tp, 0,
420	XFS_ICHANGE_LOG_RES(mp),
421	0, 0, 0))) {
422	xfs_trans_cancel(tp, 0);
423	return error;
424	}
425
426	ip = mp->m_rootip;
427	xfs_ilock(ip, XFS_ILOCK_EXCL);
428
429	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
430	xfs_trans_ihold(tp, ip);
431	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
432	error = xfs_trans_commit(tp, 0);
433	xfs_iunlock(ip, XFS_ILOCK_EXCL);
434	xfs_log_force(mp, (xfs_lsn_t)0, log_flags);
435	}
436
437	/*
438	* When shutting down, we need to insure that the AIL is pushed
439	* to disk or the filesystem can appear corrupt from the PROM.
440	*/
441	if ((flags & (SYNC_CLOSE\|SYNC_WAIT)) == (SYNC_CLOSE\|SYNC_WAIT)) {
442	XFS_bflush(mp->m_ddev_targp);
443	if (mp->m_rtdev_targp) {
444	XFS_bflush(mp->m_rtdev_targp);
445	}
446	}
447
448	return XFS_ERROR(last_error);
449	}
a167b17e DC	450
	451	/*
	452	* Enqueue a work item to be picked up by the vfs xfssyncd thread.
	453	* Doing this has two advantages:
	454	* - It saves on stack space, which is tight in certain situations
	455	* - It can be used (with care) as a mechanism to avoid deadlocks.
	456	* Flushing while allocating in a full filesystem requires both.
	457	*/
	458	STATIC void
	459	xfs_syncd_queue_work(
	460	struct xfs_mount *mp,
	461	void *data,
	462	void (syncer)(struct xfs_mount , void *))
	463	{
	464	struct bhv_vfs_sync_work *work;
	465
	466	work = kmem_alloc(sizeof(struct bhv_vfs_sync_work), KM_SLEEP);
	467	INIT_LIST_HEAD(&work->w_list);
	468	work->w_syncer = syncer;
	469	work->w_data = data;
	470	work->w_mount = mp;
	471	spin_lock(&mp->m_sync_lock);
	472	list_add_tail(&work->w_list, &mp->m_sync_list);
	473	spin_unlock(&mp->m_sync_lock);
	474	wake_up_process(mp->m_sync_task);
	475	}
	476
	477	/*
	478	* Flush delayed allocate data, attempting to free up reserved space
	479	* from existing allocations. At this point a new allocation attempt
	480	* has failed with ENOSPC and we are in the process of scratching our
	481	* heads, looking about for more room...
	482	*/
	483	STATIC void
	484	xfs_flush_inode_work(
	485	struct xfs_mount *mp,
	486	void *arg)
	487	{
	488	struct inode *inode = arg;
	489	filemap_flush(inode->i_mapping);
	490	iput(inode);
	491	}
	492
	493	void
	494	xfs_flush_inode(
	495	xfs_inode_t *ip)
	496	{
	497	struct inode *inode = VFS_I(ip);
	498
	499	igrab(inode);
	500	xfs_syncd_queue_work(ip->i_mount, inode, xfs_flush_inode_work);
	501	delay(msecs_to_jiffies(500));
	502	}
	503
	504	/*
	505	* This is the "bigger hammer" version of xfs_flush_inode_work...
	506	* (IOW, "If at first you don't succeed, use a Bigger Hammer").
	507	*/
	508	STATIC void
	509	xfs_flush_device_work(
	510	struct xfs_mount *mp,
	511	void *arg)
	512	{
	513	struct inode *inode = arg;
514	sync_blockdev(mp->m_super->s_bdev);
515	iput(inode);
516	}
517
518	void
519	xfs_flush_device(
520	xfs_inode_t *ip)
521	{
522	struct inode *inode = VFS_I(ip);
523
524	igrab(inode);
525	xfs_syncd_queue_work(ip->i_mount, inode, xfs_flush_device_work);
526	delay(msecs_to_jiffies(500));
527	xfs_log_force(ip->i_mount, (xfs_lsn_t)0, XFS_LOG_FORCE\|XFS_LOG_SYNC);
528	}
529
530	STATIC void
531	xfs_sync_worker(
532	struct xfs_mount *mp,
533	void *unused)
534	{
535	int error;
536
537	if (!(mp->m_flags & XFS_MOUNT_RDONLY))
538	error = xfs_sync(mp, SYNC_FSDATA \| SYNC_BDFLUSH \| SYNC_ATTR);
539	mp->m_sync_seq++;
540	wake_up(&mp->m_wait_single_sync_task);
541	}
542
543	STATIC int
544	xfssyncd(
545	void *arg)
546	{
547	struct xfs_mount *mp = arg;
548	long timeleft;
549	bhv_vfs_sync_work_t work, n;
550	LIST_HEAD (tmp);
551
552	set_freezable();
553	timeleft = xfs_syncd_centisecs * msecs_to_jiffies(10);
554	for (;;) {
555	timeleft = schedule_timeout_interruptible(timeleft);
556	/* swsusp */
557	try_to_freeze();
558	if (kthread_should_stop() && list_empty(&mp->m_sync_list))
559	break;
560
561	spin_lock(&mp->m_sync_lock);
562	/*
563	* We can get woken by laptop mode, to do a sync -
564	* that's the (only!) case where the list would be
565	* empty with time remaining.
566	*/
567	if (!timeleft \|\| list_empty(&mp->m_sync_list)) {
568	if (!timeleft)
569	timeleft = xfs_syncd_centisecs *
570	msecs_to_jiffies(10);
571	INIT_LIST_HEAD(&mp->m_sync_work.w_list);
572	list_add_tail(&mp->m_sync_work.w_list,
573	&mp->m_sync_list);
574	}
575	list_for_each_entry_safe(work, n, &mp->m_sync_list, w_list)
576	list_move(&work->w_list, &tmp);
577	spin_unlock(&mp->m_sync_lock);
578
579	list_for_each_entry_safe(work, n, &tmp, w_list) {
580	(*work->w_syncer)(mp, work->w_data);
581	list_del(&work->w_list);
582	if (work == &mp->m_sync_work)
583	continue;
584	kmem_free(work);
585	}
586	}
587
588	return 0;
589	}
590
591	int
592	xfs_syncd_init(
593	struct xfs_mount *mp)
594	{
595	mp->m_sync_work.w_syncer = xfs_sync_worker;
596	mp->m_sync_work.w_mount = mp;
597	mp->m_sync_task = kthread_run(xfssyncd, mp, "xfssyncd");
598	if (IS_ERR(mp->m_sync_task))
599	return -PTR_ERR(mp->m_sync_task);
600	return 0;
601	}
602
603	void
604	xfs_syncd_stop(
605	struct xfs_mount *mp)
606	{
607	kthread_stop(mp->m_sync_task);
608	}
609