[net-next-2.6.git] / kernel / sched_stats.h


#ifdef CONFIG_SCHEDSTATS
/*
 * Version number printed on the "version" line by show_schedstat().
 *
 * bump this up when changing the output format or the meaning of an existing
 * format, so that tools can adapt (or abort)
 */
#define SCHEDSTAT_VERSION 15

/*
 * show_schedstat - seq_file show callback producing the schedstat text.
 *
 * @seq: seq_file that receives the output.
 * @v:   iterator cookie; not used by this function.
 *
 * Emits a "version" line, a "timestamp" line carrying jiffies, and then one
 * "cpuN" line of runqueue counters for every online cpu.  With CONFIG_SMP each
 * cpu line is followed by one "domainN" line per sched domain of that cpu:
 * the domain's cpu span, eight load-balance counters for every cpu_idle_type,
 * and twelve further per-domain counters.
 *
 * Returns 0 on success, or -ENOMEM when the scratch buffer used to format the
 * domain span cannot be allocated.
 */
static int show_schedstat(struct seq_file *seq, void *v)
{
	/* 9 bytes per 32 cpus -- presumably 8 hex digits plus a separator */
	int span_len = DIV_ROUND_UP(NR_CPUS, 32) * 9;
	char *span_buf;
	int cpu;

	span_buf = kmalloc(span_len, GFP_KERNEL);
	if (!span_buf)
		return -ENOMEM;

	seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION);
	seq_printf(seq, "timestamp %lu\n", jiffies);

	for_each_online_cpu(cpu) {
		struct rq *rq = cpu_rq(cpu);
#ifdef CONFIG_SMP
		struct sched_domain *sd;
		int domain_idx = 0;
#endif

		/* one line of per-runqueue counters */
		seq_printf(seq,
			   "cpu%d %u %u %u %u %u %u %llu %llu %lu",
			   cpu,
			   rq->yld_count,
			   rq->sched_switch,
			   rq->sched_count,
			   rq->sched_goidle,
			   rq->ttwu_count,
			   rq->ttwu_local,
			   rq->rq_cpu_time,
			   rq->rq_sched_info.run_delay,
			   rq->rq_sched_info.pcount);

		seq_printf(seq, "\n");

#ifdef CONFIG_SMP
		/*
		 * One line per sched domain of this cpu.  The walk is done
		 * with preemption disabled.
		 */
		preempt_disable();
		for_each_domain(cpu, sd) {
			enum cpu_idle_type idle = CPU_IDLE;

			cpumask_scnprintf(span_buf, span_len,
					  sched_domain_span(sd));
			seq_printf(seq, "domain%d %s", domain_idx, span_buf);
			domain_idx++;

			/* eight load-balance counters per idle type */
			while (idle < CPU_MAX_IDLE_TYPES) {
				seq_printf(seq, " %u %u %u %u %u %u %u %u",
					   sd->lb_count[idle],
					   sd->lb_balanced[idle],
					   sd->lb_failed[idle],
					   sd->lb_imbalance[idle],
					   sd->lb_gained[idle],
					   sd->lb_hot_gained[idle],
					   sd->lb_nobusyq[idle],
					   sd->lb_nobusyg[idle]);
				idle++;
			}

			/* alb_*, sbe_*, sbf_* and ttwu_* counters end the line */
			seq_printf(seq,
				   " %u %u %u %u %u %u %u %u %u %u %u %u\n",
				   sd->alb_count,
				   sd->alb_failed,
				   sd->alb_pushed,
				   sd->sbe_count,
				   sd->sbe_balanced,
				   sd->sbe_pushed,
				   sd->sbf_count,
				   sd->sbf_balanced,
				   sd->sbf_pushed,
				   sd->ttwu_wake_remote,
				   sd->ttwu_move_affine,
				   sd->ttwu_move_balance);
		}
		preempt_enable();
#endif
	}

	kfree(span_buf);
	return 0;
}

/*
 * schedstat_open - open handler for the schedstat proc file.
 *
 * @inode: inode of the file being opened; not used here.
 * @file:  file being opened; handed to single_open().
 *
 * Allocates an output buffer of one page plus one further page for every 32
 * online cpus, opens the file through single_open() with show_schedstat() as
 * the show routine, and stores the buffer and its size in the seq_file found
 * in file->private_data.
 *
 * Returns 0 on success, -ENOMEM if the buffer cannot be allocated, or the
 * non-zero result of single_open(), in which case the buffer is freed.
 */
static int schedstat_open(struct inode *inode, struct file *file)
{
	unsigned int size = PAGE_SIZE * (1 + num_online_cpus() / 32);
	struct seq_file *m;
	char *buf;
	int res;

	buf = kmalloc(size, GFP_KERNEL);
	if (buf == NULL)
		return -ENOMEM;

	res = single_open(file, show_schedstat, NULL);
	if (res) {
		/* open failed: nobody took ownership of the buffer */
		kfree(buf);
		return res;
	}

	m = file->private_data;
	m->buf = buf;
	m->size = size;

	return 0;
}

/*
 * File operations for the "schedstat" proc entry: open goes through
 * schedstat_open(); read, llseek and release use seq_read(), seq_lseek()
 * and single_release().
 */
static const struct file_operations proc_schedstat_operations = {
	.open    = schedstat_open,
	.read    = seq_read,
	.llseek  = seq_lseek,
	.release = single_release,
};

/*
 * Init hook (registered through module_init() below) that creates the
 * "schedstat" proc entry backed by proc_schedstat_operations, passing 0 and
 * NULL as the mode and parent arguments.  The result of proc_create() is not
 * checked; this function always returns 0.
 */
static int __init proc_schedstat_init(void)
{
	proc_create("schedstat", 0, NULL, &proc_schedstat_operations);
	return 0;
}
module_init(proc_schedstat_init);

/*
 * Runqueue side of a task arrival: count one more arrival on @rq and add
 * @delta to the runqueue's accumulated run delay.  A NULL @rq is ignored.
 *
 * Expects runqueue lock to be held for atomicity of update
 */
static inline void
rq_sched_info_arrive(struct rq *rq, unsigned long long delta)
{
	if (!rq)
		return;

	rq->rq_sched_info.pcount++;
	rq->rq_sched_info.run_delay += delta;
}

/*
 * Runqueue side of a task departure: add @delta to the cpu time accumulated
 * on @rq.  A NULL @rq is ignored.
 *
 * Expects runqueue lock to be held for atomicity of update
 */
static inline void
rq_sched_info_depart(struct rq *rq, unsigned long long delta)
{
	if (!rq)
		return;

	rq->rq_cpu_time += delta;
}

/*
 * Runqueue side of a dequeue: add @delta to the run delay accumulated on
 * @rq.  A NULL @rq is ignored.
 */
static inline void
rq_sched_info_dequeued(struct rq *rq, unsigned long long delta)
{
	if (!rq)
		return;

	rq->rq_sched_info.run_delay += delta;
}
/*
 * Statistics update helpers, active variants:
 *   schedstat_inc(rq, field)      - increase rq->field by one
 *   schedstat_add(rq, field, amt) - increase rq->field by amt
 *   schedstat_set(var, val)       - assign val to var
 */
# define schedstat_inc(rq, field)		\
	do {					\
		(rq)->field += 1;		\
	} while (0)
# define schedstat_add(rq, field, amt)		\
	do {					\
		(rq)->field += (amt);		\
	} while (0)
# define schedstat_set(var, val)		\
	do {					\
		var = (val);			\
	} while (0)
#else /* !CONFIG_SCHEDSTATS */
/* Without CONFIG_SCHEDSTATS runqueue arrival accounting does nothing. */
static inline void rq_sched_info_arrive(struct rq *rq,
					unsigned long long delta)
{
}
/* Without CONFIG_SCHEDSTATS runqueue dequeue accounting does nothing. */
static inline void rq_sched_info_dequeued(struct rq *rq,
					  unsigned long long delta)
{
}
/* Without CONFIG_SCHEDSTATS runqueue departure accounting does nothing. */
static inline void rq_sched_info_depart(struct rq *rq,
					unsigned long long delta)
{
}
/*
 * Statistics update helpers, disabled variants: each expands to an empty
 * statement and does not evaluate its arguments.
 */
# define schedstat_inc(rq, field)		\
	do { } while (0)
# define schedstat_add(rq, field, amt)		\
	do { } while (0)
# define schedstat_set(var, val)		\
	do { } while (0)
#endif

#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
/*
 * Clear the task's queued timestamp.  The other helpers in this file treat a
 * zero last_queued as "no queue time recorded".
 */
static inline void sched_info_reset_dequeued(struct task_struct *t)
{
	t->sched_info.last_queued = 0;
}

/*
 * We are interested in knowing how long it was from the *first* time a
 * task was queued to the time that it finally hit a cpu, we call this routine
 * from dequeue_task() to account for possible rq->clock skew across cpus. The
 * delta taken on each cpu would annul the skew.
 *
 * The time waited so far is only measured when sched_info_on() is true and a
 * queue timestamp is recorded; otherwise zero is accounted.  The timestamp is
 * cleared in every case, and the measured time is added to both the task's
 * and the runqueue's run delay.
 */
static inline void sched_info_dequeued(struct task_struct *t)
{
	unsigned long long now = task_rq(t)->clock;
	unsigned long long delta = 0;

	if (unlikely(sched_info_on()) && t->sched_info.last_queued)
		delta = now - t->sched_info.last_queued;

	sched_info_reset_dequeued(t);
	t->sched_info.run_delay += delta;

	rq_sched_info_dequeued(task_rq(t), delta);
}

/*
 * Called when a task finally hits the cpu.  The time since it was queued (zero
 * if no queue timestamp is recorded) is added to the task's run delay, the
 * queue timestamp is cleared, the arrival time is remembered so that the
 * length of the upcoming run can be computed on departure, and the task's
 * arrival count is bumped.  The same delay is then accounted on the runqueue.
 */
static void sched_info_arrive(struct task_struct *t)
{
	unsigned long long now = task_rq(t)->clock;
	unsigned long long waited;

	if (t->sched_info.last_queued)
		waited = now - t->sched_info.last_queued;
	else
		waited = 0;

	sched_info_reset_dequeued(t);
	t->sched_info.run_delay += waited;
	t->sched_info.last_arrival = now;
	t->sched_info.pcount++;

	rq_sched_info_arrive(task_rq(t), waited);
}

/*
 * Record the time at which a task started waiting on its runqueue.  Besides
 * the enqueue path this is also called from sched_info_depart() for a task
 * that is still runnable.  The timestamp is only written when sched_info_on()
 * is true and no timestamp is set yet.  It's assumed that
 * sched_info_dequeued() will clear that stamp when appropriate.
 */
static inline void sched_info_queued(struct task_struct *t)
{
	if (unlikely(sched_info_on()) && !t->sched_info.last_queued)
		t->sched_info.last_queued = task_rq(t)->clock;
}

/*
 * Called when a process ceases being the active-running process, either
 * voluntarily or involuntarily.  The time since its last arrival is charged
 * to the runqueue's cpu time.  If the process is still in the TASK_RUNNING
 * state, sched_info_queued() is called to mark that it has now again started
 * waiting on the runqueue.
 */
static inline void sched_info_depart(struct task_struct *t)
{
	unsigned long long ran;

	ran = task_rq(t)->clock - t->sched_info.last_arrival;
	rq_sched_info_depart(task_rq(t), ran);

	if (t->state != TASK_RUNNING)
		return;

	sched_info_queued(t);
}

/*
 * Account a switch from @prev to @next on @prev's runqueue: @prev departs the
 * cpu and @next arrives on it.  (This may also be called when switching to or
 * from the idle task.)  We are only called when prev != next.
 */
static inline void
__sched_info_switch(struct task_struct *prev, struct task_struct *next)
{
	struct rq *rq = task_rq(prev);
	struct task_struct *idle = rq->idle;

	/*
	 * It's not interesting to record stats about how efficient we
	 * were at scheduling the idle process, so it is skipped on both
	 * the departing and the arriving side.
	 */
	if (prev != idle)
		sched_info_depart(prev);

	if (next != idle)
		sched_info_arrive(next);
}
/*
 * Entry point for switch accounting: does nothing unless sched_info_on()
 * is true, in which case the work is done by __sched_info_switch().
 */
static inline void
sched_info_switch(struct task_struct *prev, struct task_struct *next)
{
	if (!unlikely(sched_info_on()))
		return;

	__sched_info_switch(prev, next);
}
#else
/*
 * Neither CONFIG_SCHEDSTATS nor CONFIG_TASK_DELAY_ACCT is set: the sched_info
 * hooks expand to empty statements and do not evaluate their arguments.
 */
#define sched_info_queued(t)		\
	do { } while (0)
#define sched_info_reset_dequeued(t)	\
	do { } while (0)
#define sched_info_dequeued(t)		\
	do { } while (0)
#define sched_info_switch(t, next)	\
	do { } while (0)
#endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */

/*
 * The following are functions that support scheduler-internal time accounting.
 * These functions are generally called at the timer tick.  None of this depends
 * on CONFIG_SCHEDSTATS.
 */

/**
 * account_group_user_time - Maintain utime for a thread group.
 *
 * @tsk:	Pointer to task structure.
 * @cputime:	Time value by which to increment the utime field of the
 *		thread group's cputimer.
 *
 * Nothing is done unless the cputimer in @tsk's signal struct is marked
 * running.  Otherwise the utime field is updated with the cputimer lock held.
 */
static inline void account_group_user_time(struct task_struct *tsk,
					   cputime_t cputime)
{
	struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;

	if (cputimer->running) {
		spin_lock(&cputimer->lock);
		cputimer->cputime.utime =
			cputime_add(cputimer->cputime.utime, cputime);
		spin_unlock(&cputimer->lock);
	}
}

/**
 * account_group_system_time - Maintain stime for a thread group.
 *
 * @tsk:	Pointer to task structure.
 * @cputime:	Time value by which to increment the stime field of the
 *		thread group's cputimer.
 *
 * Nothing is done unless the cputimer in @tsk's signal struct is marked
 * running.  Otherwise the stime field is updated with the cputimer lock held.
 */
static inline void account_group_system_time(struct task_struct *tsk,
					     cputime_t cputime)
{
	struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;

	if (cputimer->running) {
		spin_lock(&cputimer->lock);
		cputimer->cputime.stime =
			cputime_add(cputimer->cputime.stime, cputime);
		spin_unlock(&cputimer->lock);
	}
}

/**
 * account_group_exec_runtime - Maintain exec runtime for a thread group.
 *
 * @tsk:	Pointer to task structure.
 * @ns:		Time value by which to increment the sum_exec_runtime field
 *		of the thread group's cputimer.
 *
 * Nothing is done unless the cputimer in @tsk's signal struct is marked
 * running.  Otherwise sum_exec_runtime is updated with the cputimer lock
 * held.
 */
static inline void account_group_exec_runtime(struct task_struct *tsk,
					      unsigned long long ns)
{
	struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;

	if (cputimer->running) {
		spin_lock(&cputimer->lock);
		cputimer->cputime.sum_exec_runtime += ns;
		spin_unlock(&cputimer->lock);
	}
}
Commit	Line	Data
	1
	2	#ifdef CONFIG_SCHEDSTATS
	3	/*
	4	* bump this up when changing the output format or the meaning of an existing
	5	* format, so that tools can adapt (or abort)
	6	*/
	7	#define SCHEDSTAT_VERSION 15
	8
	9	static int show_schedstat(struct seq_file *seq, void *v)
	10	{
	11	int cpu;
	12	int mask_len = DIV_ROUND_UP(NR_CPUS, 32) * 9;
	13	char *mask_str = kmalloc(mask_len, GFP_KERNEL);
	14
	15	if (mask_str == NULL)
	16	return -ENOMEM;
	17
	18	seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION);
	19	seq_printf(seq, "timestamp %lu\n", jiffies);
	20	for_each_online_cpu(cpu) {
	21	struct rq *rq = cpu_rq(cpu);
	22	#ifdef CONFIG_SMP
	23	struct sched_domain *sd;
	24	int dcount = 0;
	25	#endif
	26
	27	/* runqueue-specific stats */
	28	seq_printf(seq,
	29	"cpu%d %u %u %u %u %u %u %llu %llu %lu",
	30	cpu, rq->yld_count,
	31	rq->sched_switch, rq->sched_count, rq->sched_goidle,
	32	rq->ttwu_count, rq->ttwu_local,
	33	rq->rq_cpu_time,
	34	rq->rq_sched_info.run_delay, rq->rq_sched_info.pcount);
	35
	36	seq_printf(seq, "\n");
	37
	38	#ifdef CONFIG_SMP
	39	/* domain-specific stats */
	40	preempt_disable();
	41	for_each_domain(cpu, sd) {
	42	enum cpu_idle_type itype;
	43
	44	cpumask_scnprintf(mask_str, mask_len,
	45	sched_domain_span(sd));
	46	seq_printf(seq, "domain%d %s", dcount++, mask_str);
	47	for (itype = CPU_IDLE; itype < CPU_MAX_IDLE_TYPES;
	48	itype++) {
	49	seq_printf(seq, " %u %u %u %u %u %u %u %u",
	50	sd->lb_count[itype],
	51	sd->lb_balanced[itype],
	52	sd->lb_failed[itype],
	53	sd->lb_imbalance[itype],
	54	sd->lb_gained[itype],
	55	sd->lb_hot_gained[itype],
	56	sd->lb_nobusyq[itype],
	57	sd->lb_nobusyg[itype]);
	58	}
	59	seq_printf(seq,
	60	" %u %u %u %u %u %u %u %u %u %u %u %u\n",
	61	sd->alb_count, sd->alb_failed, sd->alb_pushed,
	62	sd->sbe_count, sd->sbe_balanced, sd->sbe_pushed,
	63	sd->sbf_count, sd->sbf_balanced, sd->sbf_pushed,
	64	sd->ttwu_wake_remote, sd->ttwu_move_affine,
	65	sd->ttwu_move_balance);
	66	}
	67	preempt_enable();
	68	#endif
	69	}
	70	kfree(mask_str);
	71	return 0;
	72	}
	73
	74	static int schedstat_open(struct inode *inode, struct file *file)
	75	{
	76	unsigned int size = PAGE_SIZE * (1 + num_online_cpus() / 32);
	77	char *buf = kmalloc(size, GFP_KERNEL);
	78	struct seq_file *m;
	79	int res;
	80
	81	if (!buf)
	82	return -ENOMEM;
	83	res = single_open(file, show_schedstat, NULL);
	84	if (!res) {
	85	m = file->private_data;
	86	m->buf = buf;
	87	m->size = size;
	88	} else
	89	kfree(buf);
	90	return res;
	91	}
	92
	93	static const struct file_operations proc_schedstat_operations = {
	94	.open = schedstat_open,
	95	.read = seq_read,
	96	.llseek = seq_lseek,
	97	.release = single_release,
	98	};
	99
	100	static int __init proc_schedstat_init(void)
	101	{
	102	proc_create("schedstat", 0, NULL, &proc_schedstat_operations);
	103	return 0;
	104	}
	105	module_init(proc_schedstat_init);
	106
	107	/*
	108	* Expects runqueue lock to be held for atomicity of update
	109	*/
	110	static inline void
	111	rq_sched_info_arrive(struct rq *rq, unsigned long long delta)
	112	{
	113	if (rq) {
	114	rq->rq_sched_info.run_delay += delta;
	115	rq->rq_sched_info.pcount++;
	116	}
	117	}
	118
	119	/*
	120	* Expects runqueue lock to be held for atomicity of update
	121	*/
	122	static inline void
	123	rq_sched_info_depart(struct rq *rq, unsigned long long delta)
	124	{
	125	if (rq)
	126	rq->rq_cpu_time += delta;
	127	}
	128
	129	static inline void
	130	rq_sched_info_dequeued(struct rq *rq, unsigned long long delta)
	131	{
	132	if (rq)
	133	rq->rq_sched_info.run_delay += delta;
	134	}
	135	# define schedstat_inc(rq, field) do { (rq)->field++; } while (0)
	136	# define schedstat_add(rq, field, amt) do { (rq)->field += (amt); } while (0)
	137	# define schedstat_set(var, val) do { var = (val); } while (0)
	138	#else /* !CONFIG_SCHEDSTATS */
	139	static inline void
	140	rq_sched_info_arrive(struct rq *rq, unsigned long long delta)
	141	{}
	142	static inline void
	143	rq_sched_info_dequeued(struct rq *rq, unsigned long long delta)
	144	{}
	145	static inline void
	146	rq_sched_info_depart(struct rq *rq, unsigned long long delta)
	147	{}
	148	# define schedstat_inc(rq, field) do { } while (0)
	149	# define schedstat_add(rq, field, amt) do { } while (0)
	150	# define schedstat_set(var, val) do { } while (0)
	151	#endif
	152
	153	#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
	154	static inline void sched_info_reset_dequeued(struct task_struct *t)
	155	{
	156	t->sched_info.last_queued = 0;
	157	}
	158
	159	/*
	160	* We are interested in knowing how long it was from the *first* time a
	161	* task was queued to the time that it finally hit a cpu, we call this routine
	162	* from dequeue_task() to account for possible rq->clock skew across cpus. The
	163	* delta taken on each cpu would annul the skew.
	164	*/
	165	static inline void sched_info_dequeued(struct task_struct *t)
	166	{
	167	unsigned long long now = task_rq(t)->clock, delta = 0;
	168
	169	if (unlikely(sched_info_on()))
	170	if (t->sched_info.last_queued)
	171	delta = now - t->sched_info.last_queued;
	172	sched_info_reset_dequeued(t);
	173	t->sched_info.run_delay += delta;
	174
	175	rq_sched_info_dequeued(task_rq(t), delta);
	176	}
	177
	178	/*
	179	* Called when a task finally hits the cpu. We can now calculate how
	180	* long it was waiting to run. We also note when it began so that we
	181	* can keep stats on how long its timeslice is.
	182	*/
	183	static void sched_info_arrive(struct task_struct *t)
	184	{
	185	unsigned long long now = task_rq(t)->clock, delta = 0;
	186
	187	if (t->sched_info.last_queued)
	188	delta = now - t->sched_info.last_queued;
	189	sched_info_reset_dequeued(t);
	190	t->sched_info.run_delay += delta;
	191	t->sched_info.last_arrival = now;
	192	t->sched_info.pcount++;
	193
	194	rq_sched_info_arrive(task_rq(t), delta);
	195	}
	196
	197	/*
	198	* This function is only called from enqueue_task(), but also only updates
	199	* the timestamp if it is already not set. It's assumed that
	200	* sched_info_dequeued() will clear that stamp when appropriate.
	201	*/
	202	static inline void sched_info_queued(struct task_struct *t)
	203	{
	204	if (unlikely(sched_info_on()))
	205	if (!t->sched_info.last_queued)
	206	t->sched_info.last_queued = task_rq(t)->clock;
	207	}
	208
	209	/*
	210	* Called when a process ceases being the active-running process, either
	211	* voluntarily or involuntarily. Now we can calculate how long we ran.
	212	* Also, if the process is still in the TASK_RUNNING state, call
	213	* sched_info_queued() to mark that it has now again started waiting on
	214	* the runqueue.
	215	*/
	216	static inline void sched_info_depart(struct task_struct *t)
	217	{
	218	unsigned long long delta = task_rq(t)->clock -
	219	t->sched_info.last_arrival;
	220
	221	rq_sched_info_depart(task_rq(t), delta);
	222
	223	if (t->state == TASK_RUNNING)
	224	sched_info_queued(t);
	225	}
	226
	227	/*
	228	* Called when tasks are switched involuntarily due, typically, to expiring
	229	* their time slice. (This may also be called when switching to or from
	230	* the idle task.) We are only called when prev != next.
	231	*/
	232	static inline void
	233	__sched_info_switch(struct task_struct *prev, struct task_struct *next)
	234	{
	235	struct rq *rq = task_rq(prev);
	236
	237	/*
	238	* prev now departs the cpu. It's not interesting to record
	239	* stats about how efficient we were at scheduling the idle
	240	* process, however.
	241	*/
	242	if (prev != rq->idle)
	243	sched_info_depart(prev);
	244
	245	if (next != rq->idle)
	246	sched_info_arrive(next);
	247	}
	248	static inline void
	249	sched_info_switch(struct task_struct *prev, struct task_struct *next)
	250	{
	251	if (unlikely(sched_info_on()))
	252	__sched_info_switch(prev, next);
	253	}
	254	#else
	255	#define sched_info_queued(t) do { } while (0)
	256	#define sched_info_reset_dequeued(t) do { } while (0)
	257	#define sched_info_dequeued(t) do { } while (0)
	258	#define sched_info_switch(t, next) do { } while (0)
	259	#endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */
	260
	261	/*
	262	* The following are functions that support scheduler-internal time accounting.
	263	* These functions are generally called at the timer tick. None of this depends
	264	* on CONFIG_SCHEDSTATS.
	265	*/
	266
	267	/**
	268	* account_group_user_time - Maintain utime for a thread group.
	269	*
	270	* @tsk: Pointer to task structure.
	271	* @cputime: Time value by which to increment the utime field of the
	272	* thread_group_cputime structure.
	273	*
	274	* If thread group time is being maintained, get the structure for the
	275	* running CPU and update the utime field there.
	276	*/
	277	static inline void account_group_user_time(struct task_struct *tsk,
	278	cputime_t cputime)
	279	{
	280	struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
	281
	282	if (!cputimer->running)
	283	return;
	284
	285	spin_lock(&cputimer->lock);
	286	cputimer->cputime.utime =
	287	cputime_add(cputimer->cputime.utime, cputime);
	288	spin_unlock(&cputimer->lock);
	289	}
	290
	291	/**
	292	* account_group_system_time - Maintain stime for a thread group.
	293	*
	294	* @tsk: Pointer to task structure.
	295	* @cputime: Time value by which to increment the stime field of the
	296	* thread_group_cputime structure.
	297	*
	298	* If thread group time is being maintained, get the structure for the
	299	* running CPU and update the stime field there.
	300	*/
	301	static inline void account_group_system_time(struct task_struct *tsk,
	302	cputime_t cputime)
	303	{
	304	struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
	305
	306	if (!cputimer->running)
	307	return;
	308
	309	spin_lock(&cputimer->lock);
	310	cputimer->cputime.stime =
	311	cputime_add(cputimer->cputime.stime, cputime);
	312	spin_unlock(&cputimer->lock);
	313	}
	314
	315	/**
	316	* account_group_exec_runtime - Maintain exec runtime for a thread group.
	317	*
	318	* @tsk: Pointer to task structure.
	319	* @ns: Time value by which to increment the sum_exec_runtime field
	320	* of the thread_group_cputime structure.
	321	*
	322	* If thread group time is being maintained, get the structure for the
	323	* running CPU and update the sum_exec_runtime field there.
	324	*/
	325	static inline void account_group_exec_runtime(struct task_struct *tsk,
	326	unsigned long long ns)
	327	{
	328	struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
	329
	330	if (!cputimer->running)
	331	return;
	332
	333	spin_lock(&cputimer->lock);
	334	cputimer->cputime.sum_exec_runtime += ns;
	335	spin_unlock(&cputimer->lock);
	336	}