/*
 * Performance counter core code
 *
 *  Copyright(C) 2008 Thomas Gleixner <tglx@linutronix.de>
 *  Copyright(C) 2008 Red Hat, Inc., Ingo Molnar
 *
 *  For licensing details see kernel-base/COPYING
 */

#include <linux/fs.h>
#include <linux/cpu.h>
#include <linux/smp.h>
#include <linux/poll.h>
#include <linux/sysfs.h>
#include <linux/ptrace.h>
#include <linux/percpu.h>
#include <linux/uaccess.h>
#include <linux/syscalls.h>
#include <linux/anon_inodes.h>
#include <linux/perf_counter.h>

/*
 * Each CPU has a list of per CPU counters:
 */
DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context);

int perf_max_counters __read_mostly;
static int perf_reserved_percpu __read_mostly;
static int perf_overcommit __read_mostly = 1;

/*
 * Mutex for (sysadmin-configurable) counter reservations:
 */
static DEFINE_MUTEX(perf_resource_mutex);

/*
 * Architecture provided APIs - weak aliases:
 */

int __weak hw_perf_counter_init(struct perf_counter *counter, u32 hw_event_type)
{
        return -EINVAL;
}

void __weak hw_perf_counter_enable(struct perf_counter *counter)        { }
void __weak hw_perf_counter_disable(struct perf_counter *counter)       { }
void __weak hw_perf_counter_read(struct perf_counter *counter)          { }
void __weak hw_perf_disable_all(void) { }
void __weak hw_perf_enable_all(void) { }
void __weak hw_perf_counter_setup(void) { }

#if BITS_PER_LONG == 64

/*
 * Read the cached counter value in counter->count, safe against
 * cross-CPU / NMI modifications. 64-bit version - no complications.
 */
static inline u64 perf_read_counter_safe(struct perf_counter *counter)
{
        return (u64) atomic64_read(&counter->count);
}

#else

/*
 * Read the cached counter value in counter->count, safe against
 * cross-CPU / NMI modifications. 32-bit version.
 */
static u64 perf_read_counter_safe(struct perf_counter *counter)
{
        u32 cntl, cnth;

        local_irq_disable();
        do {
                cnth = atomic_read(&counter->count32[1]);
                cntl = atomic_read(&counter->count32[0]);
        } while (cnth != atomic_read(&counter->count32[1]));

        local_irq_enable();

        return cntl | ((u64) cnth) << 32;
}

#endif

/*
 * Cross CPU call to remove a performance counter
 *
 * We disable the counter on the hardware level first. After that we
 * remove it from the context list.
 */
static void __perf_remove_from_context(void *info)
{
        struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
        struct perf_counter *counter = info;
        struct perf_counter_context *ctx = counter->ctx;

        /*
         * If this is a task context, we need to check whether it is
         * the current task context of this cpu. If not, it has been
         * scheduled out before the smp call arrived.
         */
        if (ctx->task && cpuctx->task_ctx != ctx)
                return;

        spin_lock(&ctx->lock);

        if (counter->active) {
                hw_perf_counter_disable(counter);
                counter->active = 0;
                ctx->nr_active--;
                cpuctx->active_oncpu--;
                counter->task = NULL;
        }
        ctx->nr_counters--;

        /*
         * Protect the list operation against NMI by disabling the
         * counters on a global level. NOP for non-NMI based counters.
         */
        hw_perf_disable_all();
        list_del_init(&counter->list);
        hw_perf_enable_all();

        if (!ctx->task) {
                /*
                 * Allow more per task counters with respect to the
                 * reservation:
                 */
                cpuctx->max_pertask =
                        min(perf_max_counters - ctx->nr_counters,
                            perf_max_counters - perf_reserved_percpu);
        }

        spin_unlock(&ctx->lock);
}

/*
 * Remove the counter from a task's (or a CPU's) list of counters.
 *
 * Must be called with counter->mutex held.
 *
 * CPU counters are removed with an smp call. For task counters we only
 * call when the task is on a CPU.
 */
static void perf_remove_from_context(struct perf_counter *counter)
{
        struct perf_counter_context *ctx = counter->ctx;
        struct task_struct *task = ctx->task;

        if (!task) {
                /*
                 * Per cpu counters are removed via an smp call and
                 * the removal is always successful.
                 */
                smp_call_function_single(counter->cpu,
                                         __perf_remove_from_context,
                                         counter, 1);
                return;
        }

retry:
        task_oncpu_function_call(task, __perf_remove_from_context,
                                 counter);

        spin_lock_irq(&ctx->lock);
        /*
         * If the context is active we need to retry the smp call.
         */
        if (ctx->nr_active && !list_empty(&counter->list)) {
                spin_unlock_irq(&ctx->lock);
                goto retry;
        }

        /*
         * The lock prevents the context from being scheduled in, so we
         * can remove the counter safely if the call above did not
         * succeed.
         */
        if (!list_empty(&counter->list)) {
                ctx->nr_counters--;
                list_del_init(&counter->list);
                counter->task = NULL;
        }
        spin_unlock_irq(&ctx->lock);
}

/*
 * Cross CPU call to install and enable a performance counter
 */
static void __perf_install_in_context(void *info)
{
        struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
        struct perf_counter *counter = info;
        struct perf_counter_context *ctx = counter->ctx;
        int cpu = smp_processor_id();

        /*
         * If this is a task context, we need to check whether it is
         * the current task context of this cpu. If not, it has been
         * scheduled out before the smp call arrived.
         */
        if (ctx->task && cpuctx->task_ctx != ctx)
                return;

        spin_lock(&ctx->lock);

        /*
         * Protect the list operation against NMI by disabling the
         * counters on a global level. NOP for non-NMI based counters.
         */
        hw_perf_disable_all();
        list_add_tail(&counter->list, &ctx->counters);
        hw_perf_enable_all();

        ctx->nr_counters++;

        if (cpuctx->active_oncpu < perf_max_counters) {
                hw_perf_counter_enable(counter);
                counter->active = 1;
                counter->oncpu = cpu;
                ctx->nr_active++;
                cpuctx->active_oncpu++;
        }

        if (!ctx->task && cpuctx->max_pertask)
                cpuctx->max_pertask--;

        spin_unlock(&ctx->lock);
}

/*
 * Attach a performance counter to a context
 *
 * First we add the counter to the list with the hardware enable bit
 * in counter->hw_config cleared.
 *
 * If the counter is attached to a task which is on a CPU we use an smp
 * call to enable it in the task context. The task might have been
 * scheduled away, but we check this in the smp call again.
 */
static void
perf_install_in_context(struct perf_counter_context *ctx,
                        struct perf_counter *counter,
                        int cpu)
{
        struct task_struct *task = ctx->task;

        counter->ctx = ctx;
        if (!task) {
                /*
                 * Per cpu counters are installed via an smp call and
                 * the install is always successful.
                 */
                smp_call_function_single(cpu, __perf_install_in_context,
                                         counter, 1);
                return;
        }

        counter->task = task;
retry:
        task_oncpu_function_call(task, __perf_install_in_context,
                                 counter);

        spin_lock_irq(&ctx->lock);
        /*
         * If the context is active and the counter has not been added
         * we need to retry the smp call.
         */
        if (ctx->nr_active && list_empty(&counter->list)) {
                spin_unlock_irq(&ctx->lock);
                goto retry;
        }

        /*
         * The lock prevents the context from being scheduled in, so we
         * can add the counter safely if the call above did not
         * succeed.
         */
        if (list_empty(&counter->list)) {
                list_add_tail(&counter->list, &ctx->counters);
                ctx->nr_counters++;
        }
        spin_unlock_irq(&ctx->lock);
}

/*
 * Called from scheduler to remove the counters of the current task,
 * with interrupts disabled.
 *
 * We stop each counter and update the counter value in counter->count.
 *
 * This does not protect us against NMI, but hw_perf_counter_disable()
 * sets the disabled bit in the control field of counter _before_
 * accessing the counter control register. If an NMI hits, then it will
 * not restart the counter.
 */
void perf_counter_task_sched_out(struct task_struct *task, int cpu)
{
        struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
        struct perf_counter_context *ctx = &task->perf_counter_ctx;
        struct perf_counter *counter;

        if (likely(!cpuctx->task_ctx))
                return;

        spin_lock(&ctx->lock);
        list_for_each_entry(counter, &ctx->counters, list) {
                if (!ctx->nr_active)
                        break;
                if (counter->active) {
                        hw_perf_counter_disable(counter);
                        counter->active = 0;
                        counter->oncpu = -1;
                        ctx->nr_active--;
                        cpuctx->active_oncpu--;
                }
        }
        spin_unlock(&ctx->lock);
        cpuctx->task_ctx = NULL;
}

/*
 * Called from scheduler to add the counters of the current task,
 * with interrupts disabled.
 *
 * We restore the counter value and then enable it.
 *
 * This does not protect us against NMI, but hw_perf_counter_enable()
 * sets the enabled bit in the control field of counter _before_
 * accessing the counter control register. If an NMI hits, then it will
 * keep the counter running.
 */
void perf_counter_task_sched_in(struct task_struct *task, int cpu)
{
        struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
        struct perf_counter_context *ctx = &task->perf_counter_ctx;
        struct perf_counter *counter;

        if (likely(!ctx->nr_counters))
                return;

        spin_lock(&ctx->lock);
        list_for_each_entry(counter, &ctx->counters, list) {
                if (ctx->nr_active == cpuctx->max_pertask)
                        break;
                if (counter->cpu != -1 && counter->cpu != cpu)
                        continue;

                hw_perf_counter_enable(counter);
                counter->active = 1;
                counter->oncpu = cpu;
                ctx->nr_active++;
                cpuctx->active_oncpu++;
        }
        spin_unlock(&ctx->lock);
        cpuctx->task_ctx = ctx;
}
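
/*
 * Called from the scheduler tick: deschedule the task's counters,
 * rotate the first list entry to the tail so counters that did not fit
 * on the hardware get their turn, then schedule the counters back in.
 */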
void perf_counter_task_tick(struct task_struct *curr, int cpu)
{
        struct perf_counter_context *ctx = &curr->perf_counter_ctx;
        struct perf_counter *counter;

        if (likely(!ctx->nr_counters))
                return;

        perf_counter_task_sched_out(curr, cpu);

        spin_lock(&ctx->lock);

        /*
         * Rotate the first entry last:
         */
        hw_perf_disable_all();
        list_for_each_entry(counter, &ctx->counters, list) {
                list_del(&counter->list);
                list_add_tail(&counter->list, &ctx->counters);
                break;
        }
        hw_perf_enable_all();

        spin_unlock(&ctx->lock);

        perf_counter_task_sched_in(curr, cpu);
}

/*
 * Initialize the perf_counter context in task_struct
 */
void perf_counter_init_task(struct task_struct *task)
{
        struct perf_counter_context *ctx = &task->perf_counter_ctx;

        spin_lock_init(&ctx->lock);
        INIT_LIST_HEAD(&ctx->counters);
        ctx->nr_counters = 0;
        ctx->task = task;
}

/*
 * Cross CPU call to read the hardware counter
 */
static void __hw_perf_counter_read(void *info)
{
        hw_perf_counter_read(info);
}

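/*
 * Read the current value of a counter. If the counter is active on a
 * remote CPU, refresh the cached value there first via an smp call.
 */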
static u64 perf_read_counter(struct perf_counter *counter)
{
        /*
         * If counter is enabled and currently active on a CPU, update the
         * value in the counter structure:
         */
        if (counter->active) {
                smp_call_function_single(counter->oncpu,
                                         __hw_perf_counter_read, counter, 1);
        }

        return perf_read_counter_safe(counter);
}

/*
 * Cross CPU call to switch performance data pointers
 */
static void __perf_switch_irq_data(void *info)
{
        struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
        struct perf_counter *counter = info;
        struct perf_counter_context *ctx = counter->ctx;
        struct perf_data *oldirqdata = counter->irqdata;

        /*
         * If this is a task context, we need to check whether it is
         * the current task context of this cpu. If not, it has been
         * scheduled out before the smp call arrived.
         */
        if (ctx->task) {
                if (cpuctx->task_ctx != ctx)
                        return;
                spin_lock(&ctx->lock);
        }

        /* Change the pointer in an NMI-safe way */
        atomic_long_set((atomic_long_t *)&counter->irqdata,
                        (unsigned long) counter->usrdata);
        counter->usrdata = oldirqdata;

        if (ctx->task)
                spin_unlock(&ctx->lock);
}

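/*
 * Swap the counter's irqdata and usrdata buffers so that pending IRQ
 * data can be drained to user space. For task counters this retries
 * until the swap has happened on the CPU the task is running on.
 */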
static struct perf_data *perf_switch_irq_data(struct perf_counter *counter)
{
        struct perf_counter_context *ctx = counter->ctx;
        struct perf_data *oldirqdata = counter->irqdata;
        struct task_struct *task = ctx->task;

        if (!task) {
                smp_call_function_single(counter->cpu,
                                         __perf_switch_irq_data,
                                         counter, 1);
                return counter->usrdata;
        }

retry:
        spin_lock_irq(&ctx->lock);
        if (!counter->active) {
                counter->irqdata = counter->usrdata;
                counter->usrdata = oldirqdata;
                spin_unlock_irq(&ctx->lock);
                return oldirqdata;
        }
        spin_unlock_irq(&ctx->lock);
        task_oncpu_function_call(task, __perf_switch_irq_data, counter);
        /* Might have failed, because task was scheduled out */
        if (counter->irqdata == oldirqdata)
                goto retry;

        return counter->usrdata;
}

static void put_context(struct perf_counter_context *ctx)
{
        if (ctx->task)
                put_task_struct(ctx->task);
}

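/*
 * Find the counter context for a pid/cpu pair: the per-CPU context when
 * cpu != -1 (root only), otherwise the per-task context of the target
 * task, with a reference on the task taken.
 */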
static struct perf_counter_context *find_get_context(pid_t pid, int cpu)
{
        struct perf_cpu_context *cpuctx;
        struct perf_counter_context *ctx;
        struct task_struct *task;

        /*
         * If cpu is not a wildcard then this is a percpu counter:
         */
        if (cpu != -1) {
                /* Must be root to operate on a CPU counter: */
                if (!capable(CAP_SYS_ADMIN))
                        return ERR_PTR(-EACCES);

                if (cpu < 0 || cpu >= num_possible_cpus())
                        return ERR_PTR(-EINVAL);

                /*
                 * We could be clever and allow attaching a counter to an
                 * offline CPU and activate it when the CPU comes up, but
                 * that's for later.
                 */
                if (!cpu_isset(cpu, cpu_online_map))
                        return ERR_PTR(-ENODEV);

                cpuctx = &per_cpu(perf_cpu_context, cpu);
                ctx = &cpuctx->ctx;

                WARN_ON_ONCE(ctx->task);
                return ctx;
        }

        rcu_read_lock();
        if (!pid)
                task = current;
        else
                task = find_task_by_vpid(pid);
        if (task)
                get_task_struct(task);
        rcu_read_unlock();

        if (!task)
                return ERR_PTR(-ESRCH);

        ctx = &task->perf_counter_ctx;
        ctx->task = task;

        /* Reuse ptrace permission checks for now. */
        if (!ptrace_may_access(task, PTRACE_MODE_READ)) {
                put_context(ctx);
                return ERR_PTR(-EACCES);
        }

        return ctx;
}

/*
 * Called when the last reference to the file is gone.
 */
static int perf_release(struct inode *inode, struct file *file)
{
        struct perf_counter *counter = file->private_data;
        struct perf_counter_context *ctx = counter->ctx;

        file->private_data = NULL;

        mutex_lock(&counter->mutex);

        perf_remove_from_context(counter);
        put_context(ctx);

        mutex_unlock(&counter->mutex);

        kfree(counter);

        return 0;
}

/*
 * Read the performance counter - simple non-blocking version for now
 */
static ssize_t
perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count)
{
        u64 cntval;

        if (count != sizeof(cntval))
                return -EINVAL;

        mutex_lock(&counter->mutex);
        cntval = perf_read_counter(counter);
        mutex_unlock(&counter->mutex);

        return put_user(cntval, (u64 __user *) buf) ? -EFAULT : sizeof(cntval);
}

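/*
 * Copy up to 'count' bytes of pending data from a perf_data buffer to
 * user space and advance the buffer's read index accordingly.
 */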
static ssize_t
perf_copy_usrdata(struct perf_data *usrdata, char __user *buf, size_t count)
{
        if (!usrdata->len)
                return 0;

        count = min(count, (size_t)usrdata->len);
        if (copy_to_user(buf, usrdata->data + usrdata->rd_idx, count))
                return -EFAULT;

        /* Adjust the counters */
        usrdata->len -= count;
        if (!usrdata->len)
                usrdata->rd_idx = 0;
        else
                usrdata->rd_idx += count;

        return count;
}

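/*
 * Read IRQ-generated data. Unless O_NONBLOCK was set, wait until enough
 * data is available, then drain the user-space buffer first and switch
 * in the IRQ buffer to satisfy the rest of the request.
 */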
static ssize_t
perf_read_irq_data(struct perf_counter  *counter,
                   char __user          *buf,
                   size_t               count,
                   int                  nonblocking)
{
        struct perf_data *irqdata, *usrdata;
        DECLARE_WAITQUEUE(wait, current);
        ssize_t res;

        irqdata = counter->irqdata;
        usrdata = counter->usrdata;

        if (usrdata->len + irqdata->len >= count)
                goto read_pending;

        if (nonblocking)
                return -EAGAIN;

        spin_lock_irq(&counter->waitq.lock);
        __add_wait_queue(&counter->waitq, &wait);
        for (;;) {
                set_current_state(TASK_INTERRUPTIBLE);
                if (usrdata->len + irqdata->len >= count)
                        break;

                if (signal_pending(current))
                        break;

                spin_unlock_irq(&counter->waitq.lock);
                schedule();
                spin_lock_irq(&counter->waitq.lock);
        }
        __remove_wait_queue(&counter->waitq, &wait);
        __set_current_state(TASK_RUNNING);
        spin_unlock_irq(&counter->waitq.lock);

        if (usrdata->len + irqdata->len < count)
                return -ERESTARTSYS;
read_pending:
        mutex_lock(&counter->mutex);

        /* Drain pending data first: */
        res = perf_copy_usrdata(usrdata, buf, count);
        if (res < 0 || res == count)
                goto out;

        /* Switch irq buffer: */
        usrdata = perf_switch_irq_data(counter);
        if (perf_copy_usrdata(usrdata, buf + res, count - res) < 0) {
                if (!res)
                        res = -EFAULT;
        } else {
                res = count;
        }
out:
        mutex_unlock(&counter->mutex);

        return res;
}

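/*
 * Dispatch a read() on the counter fd according to the record type:
 * plain counter value for PERF_RECORD_SIMPLE, buffered IRQ data
 * otherwise.
 */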
static ssize_t
perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
{
        struct perf_counter *counter = file->private_data;

        switch (counter->record_type) {
        case PERF_RECORD_SIMPLE:
                return perf_read_hw(counter, buf, count);

        case PERF_RECORD_IRQ:
        case PERF_RECORD_GROUP:
                return perf_read_irq_data(counter, buf, count,
                                          file->f_flags & O_NONBLOCK);
        }
        return -EINVAL;
}

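/*
 * poll() support: report POLLIN when either data buffer has pending
 * bytes.
 */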
static unsigned int perf_poll(struct file *file, poll_table *wait)
{
        struct perf_counter *counter = file->private_data;
        unsigned int events = 0;
        unsigned long flags;

        poll_wait(file, &counter->waitq, wait);

        spin_lock_irqsave(&counter->waitq.lock, flags);
        if (counter->usrdata->len || counter->irqdata->len)
                events |= POLLIN;
        spin_unlock_irqrestore(&counter->waitq.lock, flags);

        return events;
}

static const struct file_operations perf_fops = {
        .release                = perf_release,
        .read                   = perf_read,
        .poll                   = perf_poll,
};

/*
 * Allocate and initialize a counter structure
 */
static struct perf_counter *
perf_counter_alloc(u32 hw_event_period, int cpu, u32 record_type)
{
        struct perf_counter *counter = kzalloc(sizeof(*counter), GFP_KERNEL);

        if (!counter)
                return NULL;

        mutex_init(&counter->mutex);
        INIT_LIST_HEAD(&counter->list);
        init_waitqueue_head(&counter->waitq);

        counter->irqdata        = &counter->data[0];
        counter->usrdata        = &counter->data[1];
        counter->cpu            = cpu;
        counter->record_type    = record_type;
        counter->__irq_period   = hw_event_period;
        counter->wakeup_pending = 0;

        return counter;
}

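/*
 * Illustrative user-space usage (editor's sketch, not part of this
 * file; the syscall number macro and the PERF_COUNT_CYCLES event type
 * are assumptions for the example only):
 *
 *      fd = syscall(__NR_perf_counter_open, PERF_COUNT_CYCLES, 0,
 *                   PERF_RECORD_SIMPLE, 0, -1);
 *      read(fd, &value, sizeof(u64));
 *
 * pid == 0 selects the current task, cpu == -1 means "any CPU".
 */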
/**
 * sys_perf_counter_open - open a performance counter and associate it to a task or CPU
 * @hw_event_type:      event type for monitoring/sampling
 * @hw_event_period:    sampling period (stored in counter->__irq_period)
 * @record_type:        PERF_RECORD_SIMPLE, PERF_RECORD_IRQ or PERF_RECORD_GROUP
 * @pid:                target pid (0 means the current task)
 * @cpu:                cpu to count on, or -1 to not restrict counting to one cpu
 */
asmlinkage int
sys_perf_counter_open(u32 hw_event_type,
                      u32 hw_event_period,
                      u32 record_type,
                      pid_t pid,
                      int cpu)
{
        struct perf_counter_context *ctx;
        struct perf_counter *counter;
        int ret;

        ctx = find_get_context(pid, cpu);
        if (IS_ERR(ctx))
                return PTR_ERR(ctx);

        ret = -ENOMEM;
        counter = perf_counter_alloc(hw_event_period, cpu, record_type);
        if (!counter)
                goto err_put_context;

        ret = hw_perf_counter_init(counter, hw_event_type);
        if (ret)
                goto err_free_put_context;

        perf_install_in_context(ctx, counter, cpu);

        ret = anon_inode_getfd("[perf_counter]", &perf_fops, counter, 0);
        if (ret < 0)
                goto err_remove_free_put_context;

        return ret;

err_remove_free_put_context:
        mutex_lock(&counter->mutex);
        perf_remove_from_context(counter);
        mutex_unlock(&counter->mutex);

err_free_put_context:
        kfree(counter);

err_put_context:
        put_context(ctx);

        return ret;
}

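/*
 * Set up the per-CPU counter context and its reservation limit when a
 * CPU is brought up.
 */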
static void __cpuinit perf_init_cpu(int cpu)
{
        struct perf_cpu_context *ctx;

        ctx = &per_cpu(perf_cpu_context, cpu);
        spin_lock_init(&ctx->ctx.lock);
        INIT_LIST_HEAD(&ctx->ctx.counters);

        mutex_lock(&perf_resource_mutex);
        ctx->max_pertask = perf_max_counters - perf_reserved_percpu;
        mutex_unlock(&perf_resource_mutex);
        hw_perf_counter_setup();
}

#ifdef CONFIG_HOTPLUG_CPU
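/*
 * Remove all counters from the CPU-local context; runs via an smp call
 * on the CPU that is about to go offline.
 */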
static void __perf_exit_cpu(void *info)
{
        struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
        struct perf_counter_context *ctx = &cpuctx->ctx;
        struct perf_counter *counter, *tmp;

        list_for_each_entry_safe(counter, tmp, &ctx->counters, list)
                __perf_remove_from_context(counter);
}

static void perf_exit_cpu(int cpu)
{
        smp_call_function_single(cpu, __perf_exit_cpu, NULL, 1);
}
#else
static inline void perf_exit_cpu(int cpu) { }
#endif

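/*
 * CPU hotplug notifier: initialize counters on CPU bring-up, tear them
 * down before a CPU goes offline.
 */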
static int __cpuinit
perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
{
        unsigned int cpu = (long)hcpu;

        switch (action) {

        case CPU_UP_PREPARE:
        case CPU_UP_PREPARE_FROZEN:
                perf_init_cpu(cpu);
                break;

        case CPU_DOWN_PREPARE:
        case CPU_DOWN_PREPARE_FROZEN:
                perf_exit_cpu(cpu);
                break;

        default:
                break;
        }

        return NOTIFY_OK;
}

static struct notifier_block __cpuinitdata perf_cpu_nb = {
        .notifier_call          = perf_cpu_notify,
};

static int __init perf_counter_init(void)
{
        perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE,
                        (void *)(long)smp_processor_id());
        register_cpu_notifier(&perf_cpu_nb);

        return 0;
}
early_initcall(perf_counter_init);

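/*
 * sysfs interface (group "perf_counters" under the cpu sysdev class):
 * "reserve_percpu" sets how many counters are reserved per CPU and
 * adjusts each CPU's max_pertask limit; "overcommit" is a 0/1 knob
 * stored in perf_overcommit.
 */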
static ssize_t perf_show_reserve_percpu(struct sysdev_class *class, char *buf)
{
        return sprintf(buf, "%d\n", perf_reserved_percpu);
}

static ssize_t
perf_set_reserve_percpu(struct sysdev_class *class,
                        const char *buf,
                        size_t count)
{
        struct perf_cpu_context *cpuctx;
        unsigned long val;
        int err, cpu, mpt;

        err = strict_strtoul(buf, 10, &val);
        if (err)
                return err;
        if (val > perf_max_counters)
                return -EINVAL;

        mutex_lock(&perf_resource_mutex);
        perf_reserved_percpu = val;
        for_each_online_cpu(cpu) {
                cpuctx = &per_cpu(perf_cpu_context, cpu);
                spin_lock_irq(&cpuctx->ctx.lock);
                mpt = min(perf_max_counters - cpuctx->ctx.nr_counters,
                          perf_max_counters - perf_reserved_percpu);
                cpuctx->max_pertask = mpt;
                spin_unlock_irq(&cpuctx->ctx.lock);
        }
        mutex_unlock(&perf_resource_mutex);

        return count;
}

static ssize_t perf_show_overcommit(struct sysdev_class *class, char *buf)
{
        return sprintf(buf, "%d\n", perf_overcommit);
}

static ssize_t
perf_set_overcommit(struct sysdev_class *class, const char *buf, size_t count)
{
        unsigned long val;
        int err;

        err = strict_strtoul(buf, 10, &val);
        if (err)
                return err;
        if (val > 1)
                return -EINVAL;

        mutex_lock(&perf_resource_mutex);
        perf_overcommit = val;
        mutex_unlock(&perf_resource_mutex);

        return count;
}

static SYSDEV_CLASS_ATTR(
                                reserve_percpu,
                                0644,
                                perf_show_reserve_percpu,
                                perf_set_reserve_percpu
                        );

static SYSDEV_CLASS_ATTR(
                                overcommit,
                                0644,
                                perf_show_overcommit,
                                perf_set_overcommit
                        );

static struct attribute *perfclass_attrs[] = {
        &attr_reserve_percpu.attr,
        &attr_overcommit.attr,
        NULL
};

static struct attribute_group perfclass_attr_group = {
        .attrs                  = perfclass_attrs,
        .name                   = "perf_counters",
};

static int __init perf_counter_sysfs_init(void)
{
        return sysfs_create_group(&cpu_sysdev_class.kset.kobj,
                                  &perfclass_attr_group);
}
device_initcall(perf_counter_sysfs_init);