[net-next-2.6.git] / kernel / profile.c

/*
 *  linux/kernel/profile.c
 *  Simple profiling. Manages a direct-mapped profile hit count buffer,
 *  with configurable resolution, support for restricting the cpus on
 *  which profiling is done, and switching between cpu time and
 *  schedule() calls via kernel command line parameters passed at boot.
 *
 *  Scheduler profiling support, Arjan van de Ven and Ingo Molnar,
 *	Red Hat, July 2004
 *  Consolidation of architecture support code for profiling,
 *	William Irwin, Oracle, July 2004
 *  Amortized hit count accounting via per-cpu open-addressed hashtables
 *	to resolve timer interrupt livelocks, William Irwin, Oracle, 2004
 */

#include <linux/module.h>
#include <linux/profile.h>
#include <linux/bootmem.h>
#include <linux/notifier.h>
#include <linux/mm.h>
#include <linux/cpumask.h>
#include <linux/cpu.h>
#include <linux/highmem.h>
#include <linux/mutex.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <asm/sections.h>
#include <asm/irq_regs.h>
#include <asm/ptrace.h>

/*
 * One entry of a per-cpu pending-hit hashtable: @pc is a profile buffer
 * slot index and @hits is the number of hits to that slot that have not
 * yet been added to prof_buffer.
 */
struct profile_hit {
	u32 pc, hits;
};
/* log2 of the number of entries probed together as one hash group */
#define PROFILE_GRPSHIFT	3
/* entries per hash group */
#define PROFILE_GRPSZ		(1 << PROFILE_GRPSHIFT)
/* entries in one page-sized hashtable */
#define NR_PROFILE_HIT		(PAGE_SIZE/sizeof(struct profile_hit))
/* hash groups in one page-sized hashtable */
#define NR_PROFILE_GRP		(NR_PROFILE_HIT/PROFILE_GRPSZ)

/* Oprofile timer tick hook */
static int (*timer_hook)(struct pt_regs *) __read_mostly;

/* Global hit counters, one per profiled text slot; allocated by profile_init() */
static atomic_t *prof_buffer;
/*
 * prof_len: number of slots in prof_buffer.
 * prof_shift: right shift that maps a text offset to a slot index.
 */
static unsigned long prof_len, prof_shift;

/* Active profiling type as set by profile_setup(); 0 means profiling is off */
int prof_on __read_mostly;
EXPORT_SYMBOL_GPL(prof_on);

/* CPUs on which profile_tick() records hits */
static cpumask_var_t prof_cpu_mask;
#ifdef CONFIG_SMP
/* Per-cpu pair of pending-hit hashtables (one page each) */
static DEFINE_PER_CPU(struct profile_hit *[2], cpu_profile_hits);
/* Index (0 or 1) of the table profile_hits() currently fills on this cpu */
static DEFINE_PER_CPU(int, cpu_profile_flip);
/* Serializes the buffer flips done by the flip/discard helpers */
static DEFINE_MUTEX(profile_flip_mutex);
#endif /* CONFIG_SMP */

int profile_setup(char *str)
{
	static char schedstr[] = "schedule";
	static char sleepstr[] = "sleep";
	static char kvmstr[] = "kvm";
	int par;

	if (!strncmp(str, sleepstr, strlen(sleepstr))) {
#ifdef CONFIG_SCHEDSTATS
		prof_on = SLEEP_PROFILING;
		if (str[strlen(sleepstr)] == ',')
			str += strlen(sleepstr) + 1;
		if (get_option(&str, &par))
			prof_shift = par;
		printk(KERN_INFO
			"kernel sleep profiling enabled (shift: %ld)\n",
			prof_shift);
#else
		printk(KERN_WARNING
			"kernel sleep profiling requires CONFIG_SCHEDSTATS\n");
#endif /* CONFIG_SCHEDSTATS */
	} else if (!strncmp(str, schedstr, strlen(schedstr))) {
		prof_on = SCHED_PROFILING;
		if (str[strlen(schedstr)] == ',')
			str += strlen(schedstr) + 1;
		if (get_option(&str, &par))
			prof_shift = par;
		printk(KERN_INFO
			"kernel schedule profiling enabled (shift: %ld)\n",
			prof_shift);
	} else if (!strncmp(str, kvmstr, strlen(kvmstr))) {
		prof_on = KVM_PROFILING;
		if (str[strlen(kvmstr)] == ',')
			str += strlen(kvmstr) + 1;
		if (get_option(&str, &par))
			prof_shift = par;
		printk(KERN_INFO
			"kernel KVM profiling enabled (shift: %ld)\n",
			prof_shift);
	} else if (get_option(&str, &par)) {
		prof_shift = par;
		prof_on = CPU_PROFILING;
		printk(KERN_INFO "kernel profiling enabled (shift: %ld)\n",
			prof_shift);
	}
	return 1;
}
__setup("profile=", profile_setup);


/*
 * Allocate the global profile buffer and the profiling cpumask.
 *
 * Does nothing when profiling was not requested (prof_on == 0).
 * The buffer has one atomic_t per (1 << prof_shift) bytes of kernel text
 * and is zero-filled; allocation is attempted with kzalloc(), then
 * alloc_pages_exact(), then vmalloc().
 *
 * Returns 0 on success or when profiling is off, -EINVAL when prof_shift
 * is so large that the buffer would have no slots (profiling is then
 * switched off), -ENOMEM when an allocation fails.
 */
int __ref profile_init(void)
{
	size_t buffer_bytes;

	if (!prof_on)
		return 0;

	/* only text is profiled */
	prof_len = (_etext - _stext) >> prof_shift;
	if (!prof_len) {
		/*
		 * A zero-length buffer must be refused: kzalloc(0) returns a
		 * non-NULL pointer and profile_hits() clamps to prof_len - 1,
		 * which would wrap around and write outside the buffer.
		 */
		printk(KERN_WARNING "profiling shift: %lu too large\n",
			prof_shift);
		prof_on = 0;
		return -EINVAL;
	}
	buffer_bytes = prof_len * sizeof(atomic_t);

	if (!alloc_cpumask_var(&prof_cpu_mask, GFP_KERNEL))
		return -ENOMEM;

	/* profile every possible cpu until told otherwise */
	cpumask_copy(prof_cpu_mask, cpu_possible_mask);

	prof_buffer = kzalloc(buffer_bytes, GFP_KERNEL|__GFP_NOWARN);
	if (prof_buffer)
		return 0;

	prof_buffer = alloc_pages_exact(buffer_bytes,
					GFP_KERNEL|__GFP_ZERO|__GFP_NOWARN);
	if (prof_buffer)
		return 0;

	prof_buffer = vmalloc(buffer_bytes);
	if (prof_buffer) {
		memset(prof_buffer, 0, buffer_bytes);
		return 0;
	}

	free_cpumask_var(prof_cpu_mask);
	return -ENOMEM;
}

/* Profile event notifications */

/* Blocking chain run by profile_task_exit() */
static BLOCKING_NOTIFIER_HEAD(task_exit_notifier);
/* Atomic chain run by profile_handoff_task() */
static ATOMIC_NOTIFIER_HEAD(task_free_notifier);
/* Blocking chain run by profile_munmap() */
static BLOCKING_NOTIFIER_HEAD(munmap_notifier);

/* Run the task_exit notifier chain, passing @task as the notifier data. */
void profile_task_exit(struct task_struct *task)
{
	blocking_notifier_call_chain(&task_exit_notifier, 0, task);
}

/*
 * Run the task_free notifier chain for @task.
 *
 * Returns 1 when the chain's result is exactly NOTIFY_OK, 0 for any
 * other result.
 */
int profile_handoff_task(struct task_struct *task)
{
	if (atomic_notifier_call_chain(&task_free_notifier, 0, task) == NOTIFY_OK)
		return 1;
	return 0;
}

/* Run the munmap notifier chain; @addr is passed to the notifiers cast to a pointer. */
void profile_munmap(unsigned long addr)
{
	blocking_notifier_call_chain(&munmap_notifier, 0, (void *)addr);
}

/*
 * Add @n to the task_free notifier chain (the chain run by
 * profile_handoff_task()). Returns the result of the chain registration.
 */
int task_handoff_register(struct notifier_block *n)
{
	return atomic_notifier_chain_register(&task_free_notifier, n);
}
EXPORT_SYMBOL_GPL(task_handoff_register);

/*
 * Remove @n from the task_free notifier chain. Returns the result of the
 * chain unregistration.
 */
int task_handoff_unregister(struct notifier_block *n)
{
	return atomic_notifier_chain_unregister(&task_free_notifier, n);
}
EXPORT_SYMBOL_GPL(task_handoff_unregister);

/*
 * Register @n on the notifier chain selected by @type.
 *
 * PROFILE_TASK_EXIT selects the task-exit chain and PROFILE_MUNMAP the
 * munmap chain. Returns the registration result, or -EINVAL for any
 * other @type.
 */
int profile_event_register(enum profile_type type, struct notifier_block *n)
{
	struct blocking_notifier_head *chain;

	if (type == PROFILE_TASK_EXIT)
		chain = &task_exit_notifier;
	else if (type == PROFILE_MUNMAP)
		chain = &munmap_notifier;
	else
		return -EINVAL;

	return blocking_notifier_chain_register(chain, n);
}
EXPORT_SYMBOL_GPL(profile_event_register);

/*
 * Remove @n from the notifier chain selected by @type.
 *
 * PROFILE_TASK_EXIT selects the task-exit chain and PROFILE_MUNMAP the
 * munmap chain. Returns the unregistration result, or -EINVAL for any
 * other @type.
 */
int profile_event_unregister(enum profile_type type, struct notifier_block *n)
{
	struct blocking_notifier_head *chain;

	if (type == PROFILE_TASK_EXIT)
		chain = &task_exit_notifier;
	else if (type == PROFILE_MUNMAP)
		chain = &munmap_notifier;
	else
		return -EINVAL;

	return blocking_notifier_chain_unregister(chain, n);
}
EXPORT_SYMBOL_GPL(profile_event_unregister);

/*
 * Install @hook as the timer tick hook called from profile_tick().
 *
 * Only one hook can be installed at a time: returns -EBUSY if one is
 * already set, 0 once @hook has been stored.
 */
int register_timer_hook(int (*hook)(struct pt_regs *))
{
	if (!timer_hook) {
		timer_hook = hook;
		return 0;
	}
	return -EBUSY;
}
EXPORT_SYMBOL_GPL(register_timer_hook);

/*
 * Remove the timer tick hook.
 *
 * Warns if @hook is not the hook currently installed, but clears the
 * installed hook in either case. The hook pointer is cleared first and
 * only then does the function wait in synchronize_sched().
 */
void unregister_timer_hook(int (*hook)(struct pt_regs *))
{
	WARN_ON(hook != timer_hook);
	timer_hook = NULL;
	/* make sure all CPUs see the NULL hook */
	synchronize_sched();  /* Allow ongoing interrupts to complete. */
}
EXPORT_SYMBOL_GPL(unregister_timer_hook);


#ifdef CONFIG_SMP
/*
 * Each cpu has a pair of open-addressed hashtables for pending
 * profile hits. read_profile() IPI's all cpus to request them
 * to flip buffers and flushes their contents to prof_buffer itself.
 * Flip requests are serialized by the profile_flip_mutex. The sole
 * use of having a second hashtable is for avoiding cacheline
 * contention that would otherwise happen during flushes of pending
 * profile hits required for the accuracy of reported profile hits
 * and so resurrect the interrupt livelock issue.
 *
 * The open-addressed hashtables are indexed by profile buffer slot
 * and hold the number of pending hits to that profile buffer slot on
 * a cpu in an entry. When the hashtable overflows, all pending hits
 * are accounted to their corresponding profile buffer slots with
 * atomic_add() and the hashtable emptied. As numerous pending hits
 * may be accounted to a profile buffer slot in a hashtable entry,
 * this amortizes a number of atomic profile buffer increments likely
 * to be far larger than the number of entries in the hashtable,
 * particularly given that the number of distinct profile buffer
 * positions to which hits are accounted during short intervals (e.g.
 * several seconds) is usually very small. Exclusion from buffer
 * flipping is provided by interrupt disablement (note that for
 * SCHED_PROFILING or SLEEP_PROFILING profile_hit() may be called from
 * process context).
 * The hash function is meant to be lightweight as opposed to strong,
 * and was vaguely inspired by ppc64 firmware-supported inverted
 * pagetable hash functions, but uses a full hashtable full of finite
 * collision chains, not just pairs of them.
 *
 * -- wli
 */
/*
 * Toggle which of the two pending-hit tables is active on the cpu this
 * runs on. Used as the on_each_cpu() callback of the flip helpers;
 * @unused is ignored.
 */
static void __profile_flip_buffers(void *unused)
{
	int *flip = &per_cpu(cpu_profile_flip, smp_processor_id());

	*flip = !*flip;
}

/*
 * Flip the active pending-hit table on every cpu, then fold the tables
 * that were active before the flip into prof_buffer and empty them.
 *
 * The index of the table to drain is sampled from the calling cpu only.
 * The whole operation runs under profile_flip_mutex.
 */
static void profile_flip_buffers(void)
{
	int idx, cpu, drained;

	mutex_lock(&profile_flip_mutex);
	/* table index that was active (on this cpu) before the flip */
	drained = per_cpu(cpu_profile_flip, get_cpu());
	put_cpu();
	on_each_cpu(__profile_flip_buffers, NULL, 1);
	for_each_online_cpu(cpu) {
		struct profile_hit *table = per_cpu(cpu_profile_hits, cpu)[drained];

		for (idx = 0; idx < NR_PROFILE_HIT; ++idx) {
			struct profile_hit *slot = &table[idx];

			if (slot->hits) {
				/* account pending hits, then release the slot */
				atomic_add(slot->hits, &prof_buffer[slot->pc]);
				slot->pc = 0;
				slot->hits = 0;
			} else if (slot->pc) {
				/* no hits pending: just clear the stale slot index */
				slot->pc = 0;
			}
		}
	}
	mutex_unlock(&profile_flip_mutex);
}

static void profile_discard_flip_buffers(void)
{
	int i, cpu;

	mutex_lock(&profile_flip_mutex);
	i = per_cpu(cpu_profile_flip, get_cpu());
	put_cpu();
	on_each_cpu(__profile_flip_buffers, NULL, 1);
	for_each_online_cpu(cpu) {
		struct profile_hit *hits = per_cpu(cpu_profile_hits, cpu)[i];
		memset(hits, 0, NR_PROFILE_HIT*sizeof(struct profile_hit));
	}
	mutex_unlock(&profile_flip_mutex);
}

/*
 * Record @nr_hits profile hits of kind @type at text address @__pc
 * (SMP version).
 *
 * Nothing is recorded unless @type is the active profiling type and
 * prof_buffer exists, or if the current cpu has no active pending-hit
 * table. Otherwise the hits are added to the cpu's active table; if no
 * slot can be found, they go straight to prof_buffer and the whole table
 * is flushed there as well.
 */
void profile_hits(int type, void *__pc, unsigned int nr_hits)
{
	unsigned long primary, secondary, flags, pc = (unsigned long)__pc;
	int i, j, cpu;
	struct profile_hit *hits;

	if (prof_on != type || !prof_buffer)
		return;
	/* text offset -> slot index, clamped to the last slot */
	pc = min((pc - (unsigned long)_stext) >> prof_shift, prof_len - 1);
	/* first entry of the starting hash group */
	i = primary = (pc & (NR_PROFILE_GRP - 1)) << PROFILE_GRPSHIFT;
	/*
	 * Probe stride, in entries. The low bit of ~(pc << 1) is always set,
	 * so the stride is an odd number of groups.
	 * NOTE(review): presumably NR_PROFILE_GRP is a power of two, so that
	 * the probe sequence visits every group -- confirm.
	 */
	secondary = (~(pc << 1) & (NR_PROFILE_GRP - 1)) << PROFILE_GRPSHIFT;
	cpu = get_cpu();
	hits = per_cpu(cpu_profile_hits, cpu)[per_cpu(cpu_profile_flip, cpu)];
	if (!hits) {
		put_cpu();
		return;
	}
	/*
	 * We buffer the global profiler buffer into a per-CPU
	 * queue and thus reduce the number of global (and possibly
	 * NUMA-alien) accesses. The write-queue is self-coalescing:
	 */
	local_irq_save(flags);
	do {
		for (j = 0; j < PROFILE_GRPSZ; ++j) {
			if (hits[i + j].pc == pc) {
				/* slot already tracked: coalesce */
				hits[i + j].hits += nr_hits;
				goto out;
			} else if (!hits[i + j].hits) {
				/* entry with no pending hits: claim it */
				hits[i + j].pc = pc;
				hits[i + j].hits = nr_hits;
				goto out;
			}
		}
		/* group full: step to the next group, wrapping within the table */
		i = (i + secondary) & (NR_PROFILE_HIT - 1);
	} while (i != primary);

	/*
	 * Add the current hit(s) and flush the write-queue out
	 * to the global buffer:
	 */
	atomic_add(nr_hits, &prof_buffer[pc]);
	for (i = 0; i < NR_PROFILE_HIT; ++i) {
		atomic_add(hits[i].hits, &prof_buffer[hits[i].pc]);
		hits[i].pc = hits[i].hits = 0;
	}
out:
	local_irq_restore(flags);
	put_cpu();
}

/*
 * CPU hotplug notifier for the per-cpu pending-hit tables.
 *
 * UP_PREPARE:          reset the cpu's flip index and allocate whichever
 *                      of its two tables is missing (table 1 first); if
 *                      the second allocation fails, table 1 is released
 *                      and -ENOMEM is reported through the notifier.
 * ONLINE:              add the cpu to prof_cpu_mask.
 * UP_CANCELED / DEAD:  remove the cpu from prof_cpu_mask and free both
 *                      of its tables.
 *
 * The *_FROZEN variants are handled like their plain counterparts.
 * Returns NOTIFY_OK unless an allocation failed.
 */
static int __cpuinit profile_cpu_callback(struct notifier_block *info,
					unsigned long action, void *__cpu)
{
	int cpu = (unsigned long)__cpu;
	struct profile_hit **tables = per_cpu(cpu_profile_hits, cpu);
	struct page *page;
	int node, idx;

	switch (action) {
	case CPU_UP_PREPARE:
	case CPU_UP_PREPARE_FROZEN:
		node = cpu_to_mem(cpu);
		per_cpu(cpu_profile_flip, cpu) = 0;
		if (!tables[1]) {
			page = alloc_pages_exact_node(node,
					GFP_KERNEL | __GFP_ZERO,
					0);
			if (!page)
				return notifier_from_errno(-ENOMEM);
			tables[1] = page_address(page);
		}
		if (!tables[0]) {
			page = alloc_pages_exact_node(node,
					GFP_KERNEL | __GFP_ZERO,
					0);
			if (!page) {
				/* undo: give table 1 back before failing */
				page = virt_to_page(tables[1]);
				tables[1] = NULL;
				__free_page(page);
				return notifier_from_errno(-ENOMEM);
			}
			tables[0] = page_address(page);
		}
		return NOTIFY_OK;

	case CPU_ONLINE:
	case CPU_ONLINE_FROZEN:
		if (prof_cpu_mask != NULL)
			cpumask_set_cpu(cpu, prof_cpu_mask);
		return NOTIFY_OK;

	case CPU_UP_CANCELED:
	case CPU_UP_CANCELED_FROZEN:
	case CPU_DEAD:
	case CPU_DEAD_FROZEN:
		if (prof_cpu_mask != NULL)
			cpumask_clear_cpu(cpu, prof_cpu_mask);
		/* release table 0, then table 1 */
		for (idx = 0; idx < 2; idx++) {
			if (!tables[idx])
				continue;
			page = virt_to_page(tables[idx]);
			tables[idx] = NULL;
			__free_page(page);
		}
		return NOTIFY_OK;
	}
	return NOTIFY_OK;
}
#else /* !CONFIG_SMP */
/*
 * !CONFIG_SMP build: the per-cpu pending-hit tables are not compiled in,
 * so the flip/discard helpers expand to nothing and no hotplug callback
 * is supplied.
 */
#define profile_flip_buffers()		do { } while (0)
#define profile_discard_flip_buffers()	do { } while (0)
#define profile_cpu_callback		NULL

/*
 * Record @nr_hits profile hits of kind @type at text address @__pc
 * (!CONFIG_SMP version): the hits are added directly to the matching
 * prof_buffer slot, with the slot index clamped to the last slot.
 *
 * Nothing is recorded unless @type is the active profiling type and
 * prof_buffer exists.
 */
void profile_hits(int type, void *__pc, unsigned int nr_hits)
{
	unsigned long slot, last;

	if (prof_on == type && prof_buffer) {
		slot = ((unsigned long)__pc - (unsigned long)_stext) >> prof_shift;
		last = prof_len - 1;
		if (slot > last)
			slot = last;
		atomic_add(nr_hits, &prof_buffer[slot]);
	}
}
#endif /* !CONFIG_SMP */
EXPORT_SYMBOL_GPL(profile_hits);

/*
 * Per-tick profiling entry point.
 *
 * For CPU_PROFILING ticks the installed timer hook, if any, is called
 * with the interrupted registers. A hit at the interrupted pc is then
 * recorded, provided the registers are not user-mode ones, a profiling
 * cpumask exists and the current cpu is set in it.
 */
void profile_tick(int type)
{
	struct pt_regs *regs = get_irq_regs();

	if (type == CPU_PROFILING && timer_hook)
		timer_hook(regs);

	if (user_mode(regs))
		return;
	if (prof_cpu_mask == NULL)
		return;
	if (!cpumask_test_cpu(smp_processor_id(), prof_cpu_mask))
		return;

	profile_hit(type, (void *)profile_pc(regs));
}

#ifdef CONFIG_PROC_FS
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <asm/uaccess.h>

/* seq_file show callback: print prof_cpu_mask followed by a newline. */
static int prof_cpu_mask_proc_show(struct seq_file *m, void *v)
{
	seq_cpumask(m, prof_cpu_mask);
	seq_putc(m, '\n');
	return 0;
}

/* open callback: bind the file to prof_cpu_mask_proc_show() with no private data. */
static int prof_cpu_mask_proc_open(struct inode *inode, struct file *file)
{
	return single_open(file, prof_cpu_mask_proc_show, NULL);
}

/*
 * write callback: parse a cpumask from the user buffer and, if parsing
 * succeeds, copy it over prof_cpu_mask.
 *
 * Returns @count on success, -ENOMEM if the temporary mask cannot be
 * allocated, or the error from cpumask_parse_user().
 */
static ssize_t prof_cpu_mask_proc_write(struct file *file,
	const char __user *buffer, size_t count, loff_t *pos)
{
	cpumask_var_t parsed;
	int ret;

	if (!alloc_cpumask_var(&parsed, GFP_KERNEL))
		return -ENOMEM;

	ret = cpumask_parse_user(buffer, count, parsed);
	if (ret)
		goto out;

	cpumask_copy(prof_cpu_mask, parsed);
	ret = count;
out:
	free_cpumask_var(parsed);
	return ret;
}

/*
 * File operations for the prof_cpu_mask proc entry: seq_file based
 * reading plus a write handler that replaces the mask.
 */
static const struct file_operations prof_cpu_mask_proc_fops = {
	.open		= prof_cpu_mask_proc_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= single_release,
	.write		= prof_cpu_mask_proc_write,
};

/*
 * Create the "prof_cpu_mask" entry (mode 0600) under @root_irq_dir.
 * The result of proc_create() is not checked.
 */
void create_prof_cpu_mask(struct proc_dir_entry *root_irq_dir)
{
	/* create /proc/irq/prof_cpu_mask */
	proc_create("prof_cpu_mask", 0600, root_irq_dir, &prof_cpu_mask_proc_fops);
}

/*
 * This function accesses profiling information. The returned data is
 * binary: the sampling step and the actual contents of the profile
 * buffer. Use of the program readprofile is recommended in order to
 * get meaningful info out of these data.
 */
static ssize_t
read_profile(struct file *file, char __user *buf, size_t count, loff_t *ppos)
{
	unsigned long p = *ppos;
	ssize_t read;
	char *pnt;
	/* NOTE(review): int-typed shift; relies on prof_shift being below the width of int -- confirm */
	unsigned int sample_step = 1 << prof_shift;

	/* fold pending per-cpu hits into prof_buffer before reporting (no-op on !SMP) */
	profile_flip_buffers();
	/* file layout: one unsigned int (sample_step) followed by prof_len counters */
	if (p >= (prof_len+1)*sizeof(unsigned int))
		return 0;
	/* trim the request to what is left of the file */
	if (count > (prof_len+1)*sizeof(unsigned int) - p)
		count = (prof_len+1)*sizeof(unsigned int) - p;
	read = 0;

	/* copy out the part of sample_step the request covers, byte by byte */
	while (p < sizeof(unsigned int) && count > 0) {
		if (put_user(*((char *)(&sample_step)+p), buf))
			return -EFAULT;
		buf++; p++; count--; read++;
	}
	/* file offset p corresponds to prof_buffer offset p - sizeof(atomic_t) */
	pnt = (char *)prof_buffer + p - sizeof(atomic_t);
	if (copy_to_user(buf, (void *)pnt, count))
		return -EFAULT;
	read += count;
	*ppos += read;
	return read;
}

/*
 * Writing to /proc/profile resets the counters
 *
 * Writing a 'profiling multiplier' value into it also re-sets the profiling
 * interrupt frequency, on architectures that support this.
 */
static ssize_t write_profile(struct file *file, const char __user *buf,
			     size_t count, loff_t *ppos)
{
#ifdef CONFIG_SMP
	extern int setup_profiling_timer(unsigned int multiplier);
	unsigned int mult;

	/* a write of exactly one int carries a profiling multiplier */
	if (count == sizeof(mult)) {
		if (copy_from_user(&mult, buf, sizeof(mult)))
			return -EFAULT;

		if (setup_profiling_timer(mult))
			return -EINVAL;
	}
#endif
	/* any write resets the counters: drop pending hits, then zero the buffer */
	profile_discard_flip_buffers();
	memset(prof_buffer, 0, sizeof(atomic_t) * prof_len);
	return count;
}

/* File operations of the "profile" proc entry created by create_proc_profile() */
static const struct file_operations proc_profile_operations = {
	.read		= read_profile,
	.write		= write_profile,
	.llseek		= default_llseek,
};

#ifdef CONFIG_SMP
/*
 * Empty cross-cpu callback. create_hash_tables() runs it on every cpu,
 * waiting for completion, after clearing prof_on on its failure path.
 * NOTE(review): presumably this makes sure no cpu is still inside
 * profile_hits() before the tables are freed -- confirm.
 */
static void profile_nop(void *unused)
{
}

/*
 * Allocate both pending-hit tables (one zeroed page each, on the cpu's
 * memory node) for every online cpu.
 *
 * Returns 0 on success. If any allocation fails, profiling is switched
 * off (prof_on = 0), every online cpu runs profile_nop(), all tables of
 * the online cpus are freed again and -1 is returned.
 */
static int create_hash_tables(void)
{
	struct page *page;
	int cpu, idx;

	for_each_online_cpu(cpu) {
		int node = cpu_to_mem(cpu);

		/* table 1 is allocated first, then table 0 */
		for (idx = 1; idx >= 0; idx--) {
			page = alloc_pages_exact_node(node,
					GFP_KERNEL | __GFP_ZERO | GFP_THISNODE,
					0);
			if (!page)
				goto out_cleanup;
			per_cpu(cpu_profile_hits, cpu)[idx] = page_address(page);
		}
	}
	return 0;

out_cleanup:
	prof_on = 0;
	smp_mb();
	on_each_cpu(profile_nop, NULL, 1);
	for_each_online_cpu(cpu) {
		/* release table 0, then table 1 */
		for (idx = 0; idx < 2; idx++) {
			struct profile_hit *table = per_cpu(cpu_profile_hits, cpu)[idx];

			if (!table)
				continue;
			page = virt_to_page(table);
			per_cpu(cpu_profile_hits, cpu)[idx] = NULL;
			__free_page(page);
		}
	}
	return -1;
}
#else
#define create_hash_tables()			({ 0; })
#endif

/*
 * Initcall: when profiling is enabled, set up the per-cpu hash tables,
 * create the "profile" proc entry and register the cpu hotplug callback.
 *
 * Returns -ENOMEM if the hash tables cannot be created, 0 otherwise
 * (including when profiling is off or the proc entry cannot be created).
 */
int __ref create_proc_profile(void) /* false positive from hotcpu_notifier */
{
	struct proc_dir_entry *entry;

	if (!prof_on)
		return 0;
	if (create_hash_tables())
		return -ENOMEM;

	entry = proc_create("profile", S_IWUSR | S_IRUGO,
			    NULL, &proc_profile_operations);
	if (entry) {
		/* one leading word for the sample step plus prof_len counters */
		entry->size = (1+prof_len) * sizeof(atomic_t);
		hotcpu_notifier(profile_cpu_callback, 0);
	}
	return 0;
}
module_init(create_proc_profile);
#endif /* CONFIG_PROC_FS */
Commit	Line	Data
	1	/*
	2	* linux/kernel/profile.c
	3	* Simple profiling. Manages a direct-mapped profile hit count buffer,
	4	* with configurable resolution, support for restricting the cpus on
	5	* which profiling is done, and switching between cpu time and
	6	* schedule() calls via kernel command line parameters passed at boot.
	7	*
	8	* Scheduler profiling support, Arjan van de Ven and Ingo Molnar,
	9	* Red Hat, July 2004
	10	* Consolidation of architecture support code for profiling,
	11	* William Irwin, Oracle, July 2004
	12	* Amortized hit count accounting via per-cpu open-addressed hashtables
	13	* to resolve timer interrupt livelocks, William Irwin, Oracle, 2004
	14	*/
	15
	16	#include <linux/module.h>
	17	#include <linux/profile.h>
	18	#include <linux/bootmem.h>
	19	#include <linux/notifier.h>
	20	#include <linux/mm.h>
	21	#include <linux/cpumask.h>
	22	#include <linux/cpu.h>
	23	#include <linux/highmem.h>
	24	#include <linux/mutex.h>
	25	#include <linux/slab.h>
	26	#include <linux/vmalloc.h>
	27	#include <asm/sections.h>
	28	#include <asm/irq_regs.h>
	29	#include <asm/ptrace.h>
	30
	31	struct profile_hit {
	32	u32 pc, hits;
	33	};
	34	#define PROFILE_GRPSHIFT 3
	35	#define PROFILE_GRPSZ (1 << PROFILE_GRPSHIFT)
	36	#define NR_PROFILE_HIT (PAGE_SIZE/sizeof(struct profile_hit))
	37	#define NR_PROFILE_GRP (NR_PROFILE_HIT/PROFILE_GRPSZ)
	38
	39	/* Oprofile timer tick hook */
	40	static int (*timer_hook)(struct pt_regs *) __read_mostly;
	41
	42	static atomic_t *prof_buffer;
	43	static unsigned long prof_len, prof_shift;
	44
	45	int prof_on __read_mostly;
	46	EXPORT_SYMBOL_GPL(prof_on);
	47
	48	static cpumask_var_t prof_cpu_mask;
	49	#ifdef CONFIG_SMP
	50	static DEFINE_PER_CPU(struct profile_hit *[2], cpu_profile_hits);
	51	static DEFINE_PER_CPU(int, cpu_profile_flip);
	52	static DEFINE_MUTEX(profile_flip_mutex);
	53	#endif /* CONFIG_SMP */
	54
	55	int profile_setup(char *str)
	56	{
	57	static char schedstr[] = "schedule";
	58	static char sleepstr[] = "sleep";
	59	static char kvmstr[] = "kvm";
	60	int par;
	61
	62	if (!strncmp(str, sleepstr, strlen(sleepstr))) {
	63	#ifdef CONFIG_SCHEDSTATS
	64	prof_on = SLEEP_PROFILING;
	65	if (str[strlen(sleepstr)] == ',')
	66	str += strlen(sleepstr) + 1;
	67	if (get_option(&str, &par))
	68	prof_shift = par;
	69	printk(KERN_INFO
	70	"kernel sleep profiling enabled (shift: %ld)\n",
	71	prof_shift);
	72	#else
	73	printk(KERN_WARNING
	74	"kernel sleep profiling requires CONFIG_SCHEDSTATS\n");
	75	#endif /* CONFIG_SCHEDSTATS */
	76	} else if (!strncmp(str, schedstr, strlen(schedstr))) {
	77	prof_on = SCHED_PROFILING;
	78	if (str[strlen(schedstr)] == ',')
	79	str += strlen(schedstr) + 1;
	80	if (get_option(&str, &par))
	81	prof_shift = par;
	82	printk(KERN_INFO
	83	"kernel schedule profiling enabled (shift: %ld)\n",
	84	prof_shift);
	85	} else if (!strncmp(str, kvmstr, strlen(kvmstr))) {
	86	prof_on = KVM_PROFILING;
	87	if (str[strlen(kvmstr)] == ',')
	88	str += strlen(kvmstr) + 1;
	89	if (get_option(&str, &par))
	90	prof_shift = par;
	91	printk(KERN_INFO
	92	"kernel KVM profiling enabled (shift: %ld)\n",
	93	prof_shift);
	94	} else if (get_option(&str, &par)) {
	95	prof_shift = par;
	96	prof_on = CPU_PROFILING;
	97	printk(KERN_INFO "kernel profiling enabled (shift: %ld)\n",
	98	prof_shift);
	99	}
	100	return 1;
	101	}
	102	__setup("profile=", profile_setup);
	103
	104
	105	int __ref profile_init(void)
	106	{
	107	int buffer_bytes;
	108	if (!prof_on)
	109	return 0;
	110
	111	/* only text is profiled */
	112	prof_len = (_etext - _stext) >> prof_shift;
	113	buffer_bytes = prof_len*sizeof(atomic_t);
	114
	115	if (!alloc_cpumask_var(&prof_cpu_mask, GFP_KERNEL))
	116	return -ENOMEM;
	117
	118	cpumask_copy(prof_cpu_mask, cpu_possible_mask);
	119
	120	prof_buffer = kzalloc(buffer_bytes, GFP_KERNEL|__GFP_NOWARN);
	121	if (prof_buffer)
	122	return 0;
	123
	124	prof_buffer = alloc_pages_exact(buffer_bytes,
	125	GFP_KERNEL|__GFP_ZERO|__GFP_NOWARN);
	126	if (prof_buffer)
	127	return 0;
	128
	129	prof_buffer = vmalloc(buffer_bytes);
	130	if (prof_buffer) {
	131	memset(prof_buffer, 0, buffer_bytes);
	132	return 0;
	133	}
	134
	135	free_cpumask_var(prof_cpu_mask);
	136	return -ENOMEM;
	137	}
	138
	139	/* Profile event notifications */
	140
	141	static BLOCKING_NOTIFIER_HEAD(task_exit_notifier);
	142	static ATOMIC_NOTIFIER_HEAD(task_free_notifier);
	143	static BLOCKING_NOTIFIER_HEAD(munmap_notifier);
	144
	145	void profile_task_exit(struct task_struct *task)
	146	{
	147	blocking_notifier_call_chain(&task_exit_notifier, 0, task);
	148	}
	149
	150	int profile_handoff_task(struct task_struct *task)
	151	{
	152	int ret;
	153	ret = atomic_notifier_call_chain(&task_free_notifier, 0, task);
	154	return (ret == NOTIFY_OK) ? 1 : 0;
	155	}
	156
	157	void profile_munmap(unsigned long addr)
	158	{
	159	blocking_notifier_call_chain(&munmap_notifier, 0, (void *)addr);
	160	}
	161
	162	int task_handoff_register(struct notifier_block *n)
	163	{
	164	return atomic_notifier_chain_register(&task_free_notifier, n);
	165	}
	166	EXPORT_SYMBOL_GPL(task_handoff_register);
	167
	168	int task_handoff_unregister(struct notifier_block *n)
	169	{
	170	return atomic_notifier_chain_unregister(&task_free_notifier, n);
	171	}
	172	EXPORT_SYMBOL_GPL(task_handoff_unregister);
	173
	174	int profile_event_register(enum profile_type type, struct notifier_block *n)
	175	{
	176	int err = -EINVAL;
	177
	178	switch (type) {
	179	case PROFILE_TASK_EXIT:
	180	err = blocking_notifier_chain_register(
	181	&task_exit_notifier, n);
	182	break;
	183	case PROFILE_MUNMAP:
	184	err = blocking_notifier_chain_register(
	185	&munmap_notifier, n);
	186	break;
	187	}
	188
	189	return err;
	190	}
	191	EXPORT_SYMBOL_GPL(profile_event_register);
	192
	193	int profile_event_unregister(enum profile_type type, struct notifier_block *n)
	194	{
	195	int err = -EINVAL;
	196
	197	switch (type) {
	198	case PROFILE_TASK_EXIT:
	199	err = blocking_notifier_chain_unregister(
	200	&task_exit_notifier, n);
	201	break;
	202	case PROFILE_MUNMAP:
	203	err = blocking_notifier_chain_unregister(
	204	&munmap_notifier, n);
	205	break;
	206	}
	207
	208	return err;
	209	}
	210	EXPORT_SYMBOL_GPL(profile_event_unregister);
	211
	212	int register_timer_hook(int (*hook)(struct pt_regs *))
	213	{
	214	if (timer_hook)
	215	return -EBUSY;
	216	timer_hook = hook;
	217	return 0;
	218	}
	219	EXPORT_SYMBOL_GPL(register_timer_hook);
	220
	221	void unregister_timer_hook(int (*hook)(struct pt_regs *))
	222	{
	223	WARN_ON(hook != timer_hook);
	224	timer_hook = NULL;
	225	/* make sure all CPUs see the NULL hook */
	226	synchronize_sched(); /* Allow ongoing interrupts to complete. */
	227	}
	228	EXPORT_SYMBOL_GPL(unregister_timer_hook);
	229
	230
	231	#ifdef CONFIG_SMP
	232	/*
	233	* Each cpu has a pair of open-addressed hashtables for pending
	234	* profile hits. read_profile() IPI's all cpus to request them
	235	* to flip buffers and flushes their contents to prof_buffer itself.
	236	* Flip requests are serialized by the profile_flip_mutex. The sole
	237	* use of having a second hashtable is for avoiding cacheline
	238	* contention that would otherwise happen during flushes of pending
	239	* profile hits required for the accuracy of reported profile hits
	240	* and so resurrect the interrupt livelock issue.
	241	*
	242	* The open-addressed hashtables are indexed by profile buffer slot
	243	* and hold the number of pending hits to that profile buffer slot on
	244	* a cpu in an entry. When the hashtable overflows, all pending hits
	245	* are accounted to their corresponding profile buffer slots with
	246	* atomic_add() and the hashtable emptied. As numerous pending hits
	247	* may be accounted to a profile buffer slot in a hashtable entry,
	248	* this amortizes a number of atomic profile buffer increments likely
	249	* to be far larger than the number of entries in the hashtable,
	250	* particularly given that the number of distinct profile buffer
	251	* positions to which hits are accounted during short intervals (e.g.
	252	* several seconds) is usually very small. Exclusion from buffer
	253	* flipping is provided by interrupt disablement (note that for
	254	* SCHED_PROFILING or SLEEP_PROFILING profile_hit() may be called from
	255	* process context).
	256	* The hash function is meant to be lightweight as opposed to strong,
	257	* and was vaguely inspired by ppc64 firmware-supported inverted
	258	* pagetable hash functions, but uses a full hashtable full of finite
	259	* collision chains, not just pairs of them.
	260	*
	261	* -- wli
	262	*/
	263	static void __profile_flip_buffers(void *unused)
	264	{
	265	int cpu = smp_processor_id();
	266
	267	per_cpu(cpu_profile_flip, cpu) = !per_cpu(cpu_profile_flip, cpu);
	268	}
	269
	270	static void profile_flip_buffers(void)
	271	{
	272	int i, j, cpu;
	273
	274	mutex_lock(&profile_flip_mutex);
	275	j = per_cpu(cpu_profile_flip, get_cpu());
	276	put_cpu();
	277	on_each_cpu(__profile_flip_buffers, NULL, 1);
	278	for_each_online_cpu(cpu) {
	279	struct profile_hit *hits = per_cpu(cpu_profile_hits, cpu)[j];
	280	for (i = 0; i < NR_PROFILE_HIT; ++i) {
	281	if (!hits[i].hits) {
	282	if (hits[i].pc)
	283	hits[i].pc = 0;
	284	continue;
	285	}
	286	atomic_add(hits[i].hits, &prof_buffer[hits[i].pc]);
	287	hits[i].hits = hits[i].pc = 0;
	288	}
	289	}
	290	mutex_unlock(&profile_flip_mutex);
	291	}
	292
	293	static void profile_discard_flip_buffers(void)
	294	{
	295	int i, cpu;
	296
	297	mutex_lock(&profile_flip_mutex);
	298	i = per_cpu(cpu_profile_flip, get_cpu());
	299	put_cpu();
	300	on_each_cpu(__profile_flip_buffers, NULL, 1);
	301	for_each_online_cpu(cpu) {
	302	struct profile_hit *hits = per_cpu(cpu_profile_hits, cpu)[i];
	303	memset(hits, 0, NR_PROFILE_HIT*sizeof(struct profile_hit));
	304	}
	305	mutex_unlock(&profile_flip_mutex);
	306	}
	307
	308	void profile_hits(int type, void *__pc, unsigned int nr_hits)
	309	{
	310	unsigned long primary, secondary, flags, pc = (unsigned long)__pc;
	311	int i, j, cpu;
	312	struct profile_hit *hits;
	313
	314	if (prof_on != type || !prof_buffer)
	315	return;
	316	pc = min((pc - (unsigned long)_stext) >> prof_shift, prof_len - 1);
	317	i = primary = (pc & (NR_PROFILE_GRP - 1)) << PROFILE_GRPSHIFT;
	318	secondary = (~(pc << 1) & (NR_PROFILE_GRP - 1)) << PROFILE_GRPSHIFT;
	319	cpu = get_cpu();
	320	hits = per_cpu(cpu_profile_hits, cpu)[per_cpu(cpu_profile_flip, cpu)];
	321	if (!hits) {
	322	put_cpu();
	323	return;
	324	}
	325	/*
	326	* We buffer the global profiler buffer into a per-CPU
	327	* queue and thus reduce the number of global (and possibly
	328	* NUMA-alien) accesses. The write-queue is self-coalescing:
	329	*/
	330	local_irq_save(flags);
	331	do {
	332	for (j = 0; j < PROFILE_GRPSZ; ++j) {
	333	if (hits[i + j].pc == pc) {
	334	hits[i + j].hits += nr_hits;
	335	goto out;
	336	} else if (!hits[i + j].hits) {
	337	hits[i + j].pc = pc;
	338	hits[i + j].hits = nr_hits;
	339	goto out;
	340	}
	341	}
	342	i = (i + secondary) & (NR_PROFILE_HIT - 1);
	343	} while (i != primary);
	344
	345	/*
	346	* Add the current hit(s) and flush the write-queue out
	347	* to the global buffer:
	348	*/
	349	atomic_add(nr_hits, &prof_buffer[pc]);
	350	for (i = 0; i < NR_PROFILE_HIT; ++i) {
	351	atomic_add(hits[i].hits, &prof_buffer[hits[i].pc]);
	352	hits[i].pc = hits[i].hits = 0;
	353	}
	354	out:
	355	local_irq_restore(flags);
	356	put_cpu();
	357	}
	358
	359	static int __cpuinit profile_cpu_callback(struct notifier_block *info,
	360	unsigned long action, void *__cpu)
	361	{
	362	int node, cpu = (unsigned long)__cpu;
	363	struct page *page;
	364
	365	switch (action) {
	366	case CPU_UP_PREPARE:
	367	case CPU_UP_PREPARE_FROZEN:
	368	node = cpu_to_mem(cpu);
	369	per_cpu(cpu_profile_flip, cpu) = 0;
	370	if (!per_cpu(cpu_profile_hits, cpu)[1]) {
	371	page = alloc_pages_exact_node(node,
	372	GFP_KERNEL | __GFP_ZERO,
	373	0);
	374	if (!page)
	375	return notifier_from_errno(-ENOMEM);
	376	per_cpu(cpu_profile_hits, cpu)[1] = page_address(page);
	377	}
	378	if (!per_cpu(cpu_profile_hits, cpu)[0]) {
	379	page = alloc_pages_exact_node(node,
	380	GFP_KERNEL | __GFP_ZERO,
	381	0);
	382	if (!page)
	383	goto out_free;
	384	per_cpu(cpu_profile_hits, cpu)[0] = page_address(page);
	385	}
	386	break;
	387	out_free:
	388	page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[1]);
	389	per_cpu(cpu_profile_hits, cpu)[1] = NULL;
	390	__free_page(page);
	391	return notifier_from_errno(-ENOMEM);
	392	case CPU_ONLINE:
	393	case CPU_ONLINE_FROZEN:
	394	if (prof_cpu_mask != NULL)
	395	cpumask_set_cpu(cpu, prof_cpu_mask);
	396	break;
	397	case CPU_UP_CANCELED:
	398	case CPU_UP_CANCELED_FROZEN:
	399	case CPU_DEAD:
	400	case CPU_DEAD_FROZEN:
	401	if (prof_cpu_mask != NULL)
	402	cpumask_clear_cpu(cpu, prof_cpu_mask);
	403	if (per_cpu(cpu_profile_hits, cpu)[0]) {
	404	page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[0]);
	405	per_cpu(cpu_profile_hits, cpu)[0] = NULL;
	406	__free_page(page);
	407	}
	408	if (per_cpu(cpu_profile_hits, cpu)[1]) {
	409	page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[1]);
	410	per_cpu(cpu_profile_hits, cpu)[1] = NULL;
	411	__free_page(page);
	412	}
	413	break;
	414	}
	415	return NOTIFY_OK;
	416	}
	417	#else /* !CONFIG_SMP */
	418	#define profile_flip_buffers() do { } while (0)
	419	#define profile_discard_flip_buffers() do { } while (0)
	420	#define profile_cpu_callback NULL
	421
	422	void profile_hits(int type, void *__pc, unsigned int nr_hits)
	423	{
	424	unsigned long pc;
	425
	426	if (prof_on != type || !prof_buffer)
	427	return;
	428	pc = ((unsigned long)__pc - (unsigned long)_stext) >> prof_shift;
	429	atomic_add(nr_hits, &prof_buffer[min(pc, prof_len - 1)]);
	430	}
	431	#endif /* !CONFIG_SMP */
	432	EXPORT_SYMBOL_GPL(profile_hits);
	433
	434	void profile_tick(int type)
	435	{
	436	struct pt_regs *regs = get_irq_regs();
	437
	438	if (type == CPU_PROFILING && timer_hook)
	439	timer_hook(regs);
	440	if (!user_mode(regs) && prof_cpu_mask != NULL &&
	441	cpumask_test_cpu(smp_processor_id(), prof_cpu_mask))
	442	profile_hit(type, (void *)profile_pc(regs));
	443	}
	444
	445	#ifdef CONFIG_PROC_FS
	446	#include <linux/proc_fs.h>
	447	#include <linux/seq_file.h>
	448	#include <asm/uaccess.h>
	449
	450	static int prof_cpu_mask_proc_show(struct seq_file *m, void *v)
	451	{
	452	seq_cpumask(m, prof_cpu_mask);
	453	seq_putc(m, '\n');
	454	return 0;
	455	}
	456
	457	static int prof_cpu_mask_proc_open(struct inode *inode, struct file *file)
	458	{
	459	return single_open(file, prof_cpu_mask_proc_show, NULL);
	460	}
	461
	462	static ssize_t prof_cpu_mask_proc_write(struct file *file,
	463	const char __user *buffer, size_t count, loff_t *pos)
	464	{
	465	cpumask_var_t new_value;
	466	int err;
	467
	468	if (!alloc_cpumask_var(&new_value, GFP_KERNEL))
	469	return -ENOMEM;
	470
	471	err = cpumask_parse_user(buffer, count, new_value);
	472	if (!err) {
	473	cpumask_copy(prof_cpu_mask, new_value);
	474	err = count;
	475	}
	476	free_cpumask_var(new_value);
	477	return err;
	478	}
	479
	480	static const struct file_operations prof_cpu_mask_proc_fops = {
	481	.open = prof_cpu_mask_proc_open,
	482	.read = seq_read,
	483	.llseek = seq_lseek,
	484	.release = single_release,
	485	.write = prof_cpu_mask_proc_write,
	486	};
	487
	488	void create_prof_cpu_mask(struct proc_dir_entry *root_irq_dir)
	489	{
	490	/* create /proc/irq/prof_cpu_mask */
	491	proc_create("prof_cpu_mask", 0600, root_irq_dir, &prof_cpu_mask_proc_fops);
	492	}
	493
	494	/*
	495	* This function accesses profiling information. The returned data is
	496	* binary: the sampling step and the actual contents of the profile
	497	* buffer. Use of the program readprofile is recommended in order to
	498	* get meaningful info out of these data.
	499	*/
	500	static ssize_t
	501	read_profile(struct file file, char __user buf, size_t count, loff_t *ppos)
	502	{
	503	unsigned long p = *ppos;
	504	ssize_t read;
	505	char *pnt;
	506	unsigned int sample_step = 1 << prof_shift;
	507
	508	profile_flip_buffers();
	509	if (p >= (prof_len+1)*sizeof(unsigned int))
	510	return 0;
	511	if (count > (prof_len+1)*sizeof(unsigned int) - p)
	512	count = (prof_len+1)*sizeof(unsigned int) - p;
	513	read = 0;
	514
	515	while (p < sizeof(unsigned int) && count > 0) {
	516	if (put_user(((char )(&sample_step)+p), buf))
	517	return -EFAULT;
	518	buf++; p++; count--; read++;
	519	}
	520	pnt = (char *)prof_buffer + p - sizeof(atomic_t);
	521	if (copy_to_user(buf, (void *)pnt, count))
	522	return -EFAULT;
	523	read += count;
	524	*ppos += read;
	525	return read;
	526	}
	527
	528	/*
	529	* Writing to /proc/profile resets the counters
	530	*
	531	* Writing a 'profiling multiplier' value into it also re-sets the profiling
	532	* interrupt frequency, on architectures that support this.
	533	*/
	534	static ssize_t write_profile(struct file file, const char __user buf,
	535	size_t count, loff_t *ppos)
	536	{
	537	#ifdef CONFIG_SMP
	538	extern int setup_profiling_timer(unsigned int multiplier);
	539
	540	if (count == sizeof(int)) {
	541	unsigned int multiplier;
	542
	543	if (copy_from_user(&multiplier, buf, sizeof(int)))
	544	return -EFAULT;
	545
	546	if (setup_profiling_timer(multiplier))
	547	return -EINVAL;
	548	}
	549	#endif
	550	profile_discard_flip_buffers();
	551	memset(prof_buffer, 0, prof_len * sizeof(atomic_t));
	552	return count;
	553	}
	554
	555	static const struct file_operations proc_profile_operations = {
	556	.read = read_profile,
	557	.write = write_profile,
	558	.llseek = default_llseek,
	559	};
	560
	561	#ifdef CONFIG_SMP
	/*
	 * Deliberately empty.  create_hash_tables() runs this on every CPU via
	 * on_each_cpu() on its failure path.
	 */
	static void profile_nop(void *unused)
	{
		/* nothing to do */
	}
	565
	566	static int create_hash_tables(void)
	567	{
	568	int cpu;
	569
	570	for_each_online_cpu(cpu) {
	571	int node = cpu_to_mem(cpu);
	572	struct page *page;
	573
	574	page = alloc_pages_exact_node(node,
	575	GFP_KERNEL \| __GFP_ZERO \| GFP_THISNODE,
	576	0);
	577	if (!page)
	578	goto out_cleanup;
	579	per_cpu(cpu_profile_hits, cpu)[1]
	580	= (struct profile_hit *)page_address(page);
	581	page = alloc_pages_exact_node(node,
	582	GFP_KERNEL \| __GFP_ZERO \| GFP_THISNODE,
	583	0);
	584	if (!page)
	585	goto out_cleanup;
	586	per_cpu(cpu_profile_hits, cpu)[0]
	587	= (struct profile_hit *)page_address(page);
	588	}
	589	return 0;
	590	out_cleanup:
	591	prof_on = 0;
	592	smp_mb();
	593	on_each_cpu(profile_nop, NULL, 1);
	594	for_each_online_cpu(cpu) {
	595	struct page *page;
	596
	597	if (per_cpu(cpu_profile_hits, cpu)[0]) {
	598	page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[0]);
	599	per_cpu(cpu_profile_hits, cpu)[0] = NULL;
	600	__free_page(page);
	601	}
	602	if (per_cpu(cpu_profile_hits, cpu)[1]) {
	603	page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[1]);
	604	per_cpu(cpu_profile_hits, cpu)[1] = NULL;
	605	__free_page(page);
	606	}
	607	}
	608	return -1;
	609	}
	610	#else
	/* Uniprocessor build: no per-cpu hit buffers to allocate; always "succeeds". */
	#define create_hash_tables()			({ 0; })
	612	#endif
	613
	614	int __ref create_proc_profile(void) /* false positive from hotcpu_notifier */
	615	{
	616	struct proc_dir_entry *entry;
	617
	618	if (!prof_on)
	619	return 0;
	620	if (create_hash_tables())
	621	return -ENOMEM;
	622	entry = proc_create("profile", S_IWUSR \| S_IRUGO,
	623	NULL, &proc_profile_operations);
	624	if (!entry)
	625	return 0;
	626	entry->size = (1+prof_len) * sizeof(atomic_t);
	627	hotcpu_notifier(profile_cpu_callback, 0);
	628	return 0;
	629	}
	630	module_init(create_proc_profile);
	631	#endif /* CONFIG_PROC_FS */