/*
 * Copyright (C) 2006 Jens Axboe <axboe@kernel.dk>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
 *
 */
#include <linux/kernel.h>
#include <linux/blkdev.h>
#include <linux/blktrace_api.h>
#include <linux/percpu.h>
#include <linux/init.h>
#include <linux/mutex.h>
#include <linux/debugfs.h>
#include <linux/time.h>

#include <asm/uaccess.h>

static DEFINE_PER_CPU(unsigned long long, blk_trace_cpu_offset) = { 0, };
static unsigned int blktrace_seq __read_mostly = 1;

/*
 * Send out a notify message.
 */
static void trace_note(struct blk_trace *bt, pid_t pid, int action,
		       const void *data, size_t len)
{
	struct blk_io_trace *t;

	t = relay_reserve(bt->rchan, sizeof(*t) + len);
	if (t) {
		const int cpu = smp_processor_id();

		t->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION;
		t->time = sched_clock() - per_cpu(blk_trace_cpu_offset, cpu);
		t->device = bt->dev;
		t->action = action;
		t->pid = pid;
		t->cpu = cpu;
		t->pdu_len = len;
		memcpy((void *) t + sizeof(*t), data, len);
	}
}

/*
 * Send out a notify for this process, if we haven't done so since a trace
 * started
 */
static void trace_note_tsk(struct blk_trace *bt, struct task_struct *tsk)
{
	tsk->btrace_seq = blktrace_seq;
	trace_note(bt, tsk->pid, BLK_TN_PROCESS, tsk->comm, sizeof(tsk->comm));
}

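/*
 * Record the current wall clock time in a trace note, so that user space
 * can anchor the relative sched_clock() timestamps to real time.
 */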
static void trace_note_time(struct blk_trace *bt)
{
	struct timespec now;
	unsigned long flags;
	u32 words[2];

	getnstimeofday(&now);
	words[0] = now.tv_sec;
	words[1] = now.tv_nsec;

	local_irq_save(flags);
	trace_note(bt, 0, BLK_TN_TIMESTAMP, words, sizeof(words));
	local_irq_restore(flags);
}

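/*
 * Return 1 if this event should be filtered out: its action bits are not
 * in the configured mask, its sector lies outside the traced range, or a
 * pid filter is set and does not match.
 */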
static int act_log_check(struct blk_trace *bt, u32 what, sector_t sector,
			 pid_t pid)
{
	if (((bt->act_mask << BLK_TC_SHIFT) & what) == 0)
		return 1;
	if (sector < bt->start_lba || sector > bt->end_lba)
		return 1;
	if (bt->pid && pid != bt->pid)
		return 1;

	return 0;
}

/*
 * Data direction bit lookup
 */
static u32 ddir_act[2] __read_mostly = { BLK_TC_ACT(BLK_TC_READ), BLK_TC_ACT(BLK_TC_WRITE) };

/*
 * Bio action bits of interest
 */
static u32 bio_act[9] __read_mostly = { 0, BLK_TC_ACT(BLK_TC_BARRIER), BLK_TC_ACT(BLK_TC_SYNC), 0, BLK_TC_ACT(BLK_TC_AHEAD), 0, 0, 0, BLK_TC_ACT(BLK_TC_META) };

/*
 * More could be added as needed, taking care to adjust the shift amounts
 * in the macros below so that each flag still maps to a distinct index
 * into bio_act[]
 */
#define trace_barrier_bit(rw)	\
	(((rw) & (1 << BIO_RW_BARRIER)) >> (BIO_RW_BARRIER - 0))
#define trace_sync_bit(rw)	\
	(((rw) & (1 << BIO_RW_SYNC)) >> (BIO_RW_SYNC - 1))
#define trace_ahead_bit(rw)	\
	(((rw) & (1 << BIO_RW_AHEAD)) << (2 - BIO_RW_AHEAD))
#define trace_meta_bit(rw)	\
	(((rw) & (1 << BIO_RW_META)) >> (BIO_RW_META - 3))
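
/*
 * For example, with the bio flag values assumed here (BIO_RW_AHEAD == 1,
 * BIO_RW_BARRIER == 2, BIO_RW_SYNC == 4, BIO_RW_META == 5): a barrier
 * request maps to index 1, sync to index 2, readahead to index 4 and
 * meta to index 8 - exactly the non-zero slots of bio_act[] above.
 */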

/*
 * The worker for the various blk_add_trace*() types. Fills out a
 * blk_io_trace structure and places it in a per-cpu subbuffer.
 */
void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
		     int rw, u32 what, int error, int pdu_len, void *pdu_data)
{
	struct task_struct *tsk = current;
	struct blk_io_trace *t;
	unsigned long flags;
	unsigned long *sequence;
	pid_t pid;
	int cpu;

	if (unlikely(bt->trace_state != Blktrace_running))
		return;

	what |= ddir_act[rw & WRITE];
	what |= bio_act[trace_barrier_bit(rw)];
	what |= bio_act[trace_sync_bit(rw)];
	what |= bio_act[trace_ahead_bit(rw)];
	what |= bio_act[trace_meta_bit(rw)];

	pid = tsk->pid;
	if (unlikely(act_log_check(bt, what, sector, pid)))
		return;

	/*
	 * A word about the locking here - we disable interrupts to reserve
	 * some space in the relay per-cpu buffer, to prevent an irq
	 * from coming in and stepping on our toes. Once reserved, it's
	 * enough to get preemption disabled to prevent read of this data
	 * before we are through filling it. get_cpu()/put_cpu() does this
	 * for us
	 */
	local_irq_save(flags);

	if (unlikely(tsk->btrace_seq != blktrace_seq))
		trace_note_tsk(bt, tsk);

	t = relay_reserve(bt->rchan, sizeof(*t) + pdu_len);
	if (t) {
		cpu = smp_processor_id();
		sequence = per_cpu_ptr(bt->sequence, cpu);

		t->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION;
		t->sequence = ++(*sequence);
		t->time = sched_clock() - per_cpu(blk_trace_cpu_offset, cpu);
		t->sector = sector;
		t->bytes = bytes;
		t->action = what;
		t->pid = pid;
		t->device = bt->dev;
		t->cpu = cpu;
		t->error = error;
		t->pdu_len = pdu_len;

		if (pdu_len)
			memcpy((void *) t + sizeof(*t), pdu_data, pdu_len);
	}

	local_irq_restore(flags);
}

EXPORT_SYMBOL_GPL(__blk_add_trace);

static struct dentry *blk_tree_root;
static struct mutex blk_tree_mutex;
static unsigned int root_users;

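/*
 * The debugfs "block" directory is shared by all traced devices: it is
 * created on first use and removed again when the last per-device
 * directory goes away. blk_tree_mutex protects blk_tree_root and
 * root_users.
 */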
static inline void blk_remove_root(void)
{
	if (blk_tree_root) {
		debugfs_remove(blk_tree_root);
		blk_tree_root = NULL;
	}
}

static void blk_remove_tree(struct dentry *dir)
{
	mutex_lock(&blk_tree_mutex);
	debugfs_remove(dir);
	if (--root_users == 0)
		blk_remove_root();
	mutex_unlock(&blk_tree_mutex);
}

static struct dentry *blk_create_tree(const char *blk_name)
{
	struct dentry *dir = NULL;

	mutex_lock(&blk_tree_mutex);

	if (!blk_tree_root) {
		blk_tree_root = debugfs_create_dir("block", NULL);
		if (!blk_tree_root)
			goto err;
	}

	dir = debugfs_create_dir(blk_name, blk_tree_root);
	if (dir)
		root_users++;
	else
		blk_remove_root();

err:
	mutex_unlock(&blk_tree_mutex);
	return dir;
}
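/*
 * Tear down a trace: close the relay channel and remove the debugfs
 * entries, then free the per-cpu sequence counters and the trace itself.
 */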
static void blk_trace_cleanup(struct blk_trace *bt)
{
	relay_close(bt->rchan);
	debugfs_remove(bt->dropped_file);
	blk_remove_tree(bt->dir);
	free_percpu(bt->sequence);
	kfree(bt);
}

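/*
 * Detach the trace from the queue. The structures are only freed if the
 * trace is in the setup or stopped state, i.e. not currently running.
 */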
static int blk_trace_remove(request_queue_t *q)
{
	struct blk_trace *bt;

	bt = xchg(&q->blk_trace, NULL);
	if (!bt)
		return -EINVAL;

	if (bt->trace_state == Blktrace_setup ||
	    bt->trace_state == Blktrace_stopped)
		blk_trace_cleanup(bt);

	return 0;
}

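/*
 * "dropped" is a debugfs file reporting how many events were lost
 * because the relay subbuffers were full; see
 * blk_subbuf_start_callback() below.
 */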
static int blk_dropped_open(struct inode *inode, struct file *filp)
{
	filp->private_data = inode->i_private;

	return 0;
}

static ssize_t blk_dropped_read(struct file *filp, char __user *buffer,
				size_t count, loff_t *ppos)
{
	struct blk_trace *bt = filp->private_data;
	char buf[16];

	snprintf(buf, sizeof(buf), "%u\n", atomic_read(&bt->dropped));

	return simple_read_from_buffer(buffer, count, ppos, buf, strlen(buf));
}

static struct file_operations blk_dropped_fops = {
	.owner =	THIS_MODULE,
	.open =		blk_dropped_open,
	.read =		blk_dropped_read,
};

/*
 * Keep track of how many times we encountered a full subbuffer, to aid
 * the user space app in telling how many lost events there were.
 */
static int blk_subbuf_start_callback(struct rchan_buf *buf, void *subbuf,
				     void *prev_subbuf, size_t prev_padding)
{
	struct blk_trace *bt;

	if (!relay_buf_full(buf))
		return 1;

	bt = buf->chan->private_data;
	atomic_inc(&bt->dropped);
	return 0;
}

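/*
 * relay calls back into these to create and remove the per-cpu trace
 * files under the per-device debugfs directory.
 */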
static int blk_remove_buf_file_callback(struct dentry *dentry)
{
	debugfs_remove(dentry);
	return 0;
}

static struct dentry *blk_create_buf_file_callback(const char *filename,
						   struct dentry *parent,
						   int mode,
						   struct rchan_buf *buf,
						   int *is_global)
{
	return debugfs_create_file(filename, mode, parent, buf,
				   &relay_file_operations);
}

static struct rchan_callbacks blk_relay_callbacks = {
	.subbuf_start =		blk_subbuf_start_callback,
	.create_buf_file =	blk_create_buf_file_callback,
	.remove_buf_file =	blk_remove_buf_file_callback,
};

/*
 * Setup everything required to start tracing
 */
static int blk_trace_setup(request_queue_t *q, struct block_device *bdev,
			   char __user *arg)
{
	struct blk_user_trace_setup buts;
	struct blk_trace *old_bt, *bt = NULL;
	struct dentry *dir = NULL;
	char b[BDEVNAME_SIZE];
	int ret, i;

	if (copy_from_user(&buts, arg, sizeof(buts)))
		return -EFAULT;

	if (!buts.buf_size || !buts.buf_nr)
		return -EINVAL;

	strcpy(buts.name, bdevname(bdev, b));

	/*
	 * some device names have larger paths - convert the slashes
	 * to underscores for this to work as expected
	 */
	for (i = 0; i < strlen(buts.name); i++)
		if (buts.name[i] == '/')
			buts.name[i] = '_';

	if (copy_to_user(arg, &buts, sizeof(buts)))
		return -EFAULT;

	ret = -ENOMEM;
	bt = kzalloc(sizeof(*bt), GFP_KERNEL);
	if (!bt)
		goto err;

	bt->sequence = alloc_percpu(unsigned long);
	if (!bt->sequence)
		goto err;

	ret = -ENOENT;
	dir = blk_create_tree(buts.name);
	if (!dir)
		goto err;

	bt->dir = dir;
	bt->dev = bdev->bd_dev;
	atomic_set(&bt->dropped, 0);

	ret = -EIO;
	bt->dropped_file = debugfs_create_file("dropped", 0444, dir, bt, &blk_dropped_fops);
	if (!bt->dropped_file)
		goto err;

	bt->rchan = relay_open("trace", dir, buts.buf_size, buts.buf_nr, &blk_relay_callbacks);
	if (!bt->rchan)
		goto err;
	bt->rchan->private_data = bt;

	bt->act_mask = buts.act_mask;
	if (!bt->act_mask)
		bt->act_mask = (u16) -1;

	bt->start_lba = buts.start_lba;
	bt->end_lba = buts.end_lba;
	if (!bt->end_lba)
		bt->end_lba = -1ULL;

	bt->pid = buts.pid;
	bt->trace_state = Blktrace_setup;

	ret = -EBUSY;
	old_bt = xchg(&q->blk_trace, bt);
	if (old_bt) {
		(void) xchg(&q->blk_trace, old_bt);
		goto err;
	}

	return 0;
err:
	if (dir)
		blk_remove_tree(dir);
	if (bt) {
		if (bt->dropped_file)
			debugfs_remove(bt->dropped_file);
		free_percpu(bt->sequence);
		if (bt->rchan)
			relay_close(bt->rchan);
		kfree(bt);
	}
	return ret;
}

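/*
 * Start or stop an existing trace on this queue. Starting bumps
 * blktrace_seq, so each process is re-announced via trace_note_tsk()
 * the next time it submits I/O.
 */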
static int blk_trace_startstop(request_queue_t *q, int start)
{
	struct blk_trace *bt;
	int ret;

	if ((bt = q->blk_trace) == NULL)
		return -EINVAL;

	/*
	 * For starting a trace, we can transition from a setup or stopped
	 * trace. For stopping a trace, the state must be running
	 */
	ret = -EINVAL;
	if (start) {
		if (bt->trace_state == Blktrace_setup ||
		    bt->trace_state == Blktrace_stopped) {
			blktrace_seq++;
			smp_mb();
			bt->trace_state = Blktrace_running;

			trace_note_time(bt);
			ret = 0;
		}
	} else {
		if (bt->trace_state == Blktrace_running) {
			bt->trace_state = Blktrace_stopped;
			relay_flush(bt->rchan);
			ret = 0;
		}
	}

	return ret;
}

/**
 * blk_trace_ioctl - handle the ioctls associated with tracing
 * @bdev:	the block device
 * @cmd:	the ioctl cmd
 * @arg:	the argument data, if any
 *
 **/
int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg)
{
	request_queue_t *q;
	int ret, start = 0;

	q = bdev_get_queue(bdev);
	if (!q)
		return -ENXIO;

	mutex_lock(&bdev->bd_mutex);

	switch (cmd) {
	case BLKTRACESETUP:
		ret = blk_trace_setup(q, bdev, arg);
		break;
	case BLKTRACESTART:
		start = 1;
		/* fall through */
	case BLKTRACESTOP:
		ret = blk_trace_startstop(q, start);
		break;
	case BLKTRACETEARDOWN:
		ret = blk_trace_remove(q);
		break;
	default:
		ret = -ENOTTY;
		break;
	}

	mutex_unlock(&bdev->bd_mutex);
	return ret;
}
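
/*
 * A minimal user-space sketch of how these ioctls are typically driven
 * (roughly what the blktrace(8) utility does); the device path and
 * buffer sizes below are purely illustrative:
 *
 *	struct blk_user_trace_setup buts = {
 *		.buf_size = 512 * 1024,		// bytes per relay subbuffer
 *		.buf_nr = 4,			// number of subbuffers
 *	};
 *	int fd = open("/dev/sda", O_RDONLY);
 *
 *	ioctl(fd, BLKTRACESETUP, &buts);	// creates the debugfs entries
 *	ioctl(fd, BLKTRACESTART);		// state -> Blktrace_running
 *	// ... consume the per-cpu trace files under debugfs ...
 *	ioctl(fd, BLKTRACESTOP);		// state -> Blktrace_stopped
 *	ioctl(fd, BLKTRACETEARDOWN);		// blk_trace_remove()
 */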

/**
 * blk_trace_shutdown - stop and cleanup trace structures
 * @q:    the request queue associated with the device
 *
 **/
void blk_trace_shutdown(request_queue_t *q)
{
	if (q->blk_trace) {
		blk_trace_startstop(q, 0);
		blk_trace_remove(q);
	}
}

/*
 * Average offset over two calls to sched_clock() with a gettimeofday()
 * in the middle
 */
static void blk_check_time(unsigned long long *t)
{
	unsigned long long a, b;
	struct timeval tv;

	a = sched_clock();
	do_gettimeofday(&tv);
	b = sched_clock();

	*t = tv.tv_sec * 1000000000 + tv.tv_usec * 1000;
	*t -= (a + b) / 2;
}

/*
 * calibrate our inter-CPU timings
 */
static void blk_trace_check_cpu_time(void *data)
{
	unsigned long long *t;
	int cpu = get_cpu();

	t = &per_cpu(blk_trace_cpu_offset, cpu);

	/*
	 * Just call it twice, hopefully the second call will be cache hot
	 * and a little more precise
	 */
	blk_check_time(t);
	blk_check_time(t);

	put_cpu();
}

static void blk_trace_set_ht_offsets(void)
{
#if defined(CONFIG_SCHED_SMT)
	int cpu, i;

	/*
	 * now make sure HT siblings have the same time offset
	 */
	preempt_disable();
	for_each_online_cpu(cpu) {
		unsigned long long *cpu_off, *sibling_off;

		for_each_cpu_mask(i, cpu_sibling_map[cpu]) {
			if (i == cpu)
				continue;

			cpu_off = &per_cpu(blk_trace_cpu_offset, cpu);
			sibling_off = &per_cpu(blk_trace_cpu_offset, i);
			*sibling_off = *cpu_off;
		}
	}
	preempt_enable();
#endif
}
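/*
 * Init at boot (or module load): calibrate the per-cpu sched_clock()
 * offsets on every cpu, then copy each cpu's offset to its HT siblings.
 */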
static __init int blk_trace_init(void)
{
	mutex_init(&blk_tree_mutex);
	on_each_cpu(blk_trace_check_cpu_time, NULL, 1, 1);
	blk_trace_set_ht_offsets();

	return 0;
}

module_init(blk_trace_init);