]> bbs.cooldavid.org Git - net-next-2.6.git/blob - kernel/perf_event.c
perf/core: Provide a kernel-internal interface to get to performance counters
[net-next-2.6.git] / kernel / perf_event.c
1 /*
2  * Performance events core code:
3  *
4  *  Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
5  *  Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar
6  *  Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
7  *  Copyright  ©  2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
8  *
9  * For licensing details see kernel-base/COPYING
10  */
11
12 #include <linux/fs.h>
13 #include <linux/mm.h>
14 #include <linux/cpu.h>
15 #include <linux/smp.h>
16 #include <linux/file.h>
17 #include <linux/poll.h>
18 #include <linux/sysfs.h>
19 #include <linux/dcache.h>
20 #include <linux/percpu.h>
21 #include <linux/ptrace.h>
22 #include <linux/vmstat.h>
23 #include <linux/vmalloc.h>
24 #include <linux/hardirq.h>
25 #include <linux/rculist.h>
26 #include <linux/uaccess.h>
27 #include <linux/syscalls.h>
28 #include <linux/anon_inodes.h>
29 #include <linux/kernel_stat.h>
30 #include <linux/perf_event.h>
31 #include <linux/ftrace_event.h>
32
33 #include <asm/irq_regs.h>
34
35 /*
36  * Each CPU has a list of per CPU events:
 *
 * The cross-CPU callbacks in this file fetch the local instance with
 * &__get_cpu_var(perf_cpu_context).
37  */
38 DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context);
39
/*
 * Event-count limits.  perf_max_events and perf_reserved_percpu are
 * combined in __perf_event_remove_from_context() to recompute a CPU
 * context's max_pertask.
 * NOTE(review): perf_overcommit is not referenced in the visible part
 * of this file - confirm its users further down.
 */
40 int perf_max_events __read_mostly = 1;
41 static int perf_reserved_percpu __read_mostly;
42 static int perf_overcommit __read_mostly = 1;
43
/*
 * Global atomic counters.
 * NOTE(review): presumably the number of live events overall and of
 * events requesting mmap/comm/task records - the updates are outside
 * the visible part of this file, confirm there.
 */
44 static atomic_t nr_events __read_mostly;
45 static atomic_t nr_mmap_events __read_mostly;
46 static atomic_t nr_comm_events __read_mostly;
47 static atomic_t nr_task_events __read_mostly;
48
49 /*
50  * perf event paranoia level:
51  *  -1 - not paranoid at all
52  *   0 - disallow raw tracepoint access for unpriv
53  *   1 - disallow cpu events for unpriv
54  *   2 - disallow kernel profiling for unpriv
 *
 * The default level is 1.  The perf_paranoid_*() helpers that follow
 * compare this value against the thresholds listed above.
55  */
56 int sysctl_perf_event_paranoid __read_mostly = 1;
57
58 static inline bool perf_paranoid_tracepoint_raw(void)
59 {
60         return sysctl_perf_event_paranoid > -1;
61 }
62
63 static inline bool perf_paranoid_cpu(void)
64 {
65         return sysctl_perf_event_paranoid > 0;
66 }
67
68 static inline bool perf_paranoid_kernel(void)
69 {
70         return sysctl_perf_event_paranoid > 1;
71 }
72
/*
 * Defaults to 512; the trailing note describes it as the 'free' kb
 * per user.
 * NOTE(review): presumably an mlock allowance for event buffers - the
 * users are outside the visible part of this file, confirm there.
 */
73 int sysctl_perf_event_mlock __read_mostly = 512; /* 'free' kb per user */
74
75 /*
76  * max perf event sample rate
 *
 * Defaults to 100000.
 * NOTE(review): units (presumably samples per second) are not
 * established by the visible code - confirm against the users.
77  */
78 int sysctl_perf_event_sample_rate __read_mostly = 100000;
79
/*
 * 64-bit atomic counter.
 * NOTE(review): presumably the source of the per-event ->id values
 * read by primary_event_id() - the allocation site is outside the
 * visible part of this file, confirm there.
 */
80 static atomic64_t perf_event_id;
81
82 /*
83  * Lock for (sysadmin-configurable) event reservations:
84  */
85 static DEFINE_SPINLOCK(perf_resource_lock);
86
87 /*
88  * Architecture provided APIs - weak aliases:
89  */
90 extern __weak const struct pmu *hw_perf_event_init(struct perf_event *event)
91 {
92         return NULL;
93 }
94
95 void __weak hw_perf_disable(void)               { barrier(); }
96 void __weak hw_perf_enable(void)                { barrier(); }
97
98 void __weak hw_perf_event_setup(int cpu)        { barrier(); }
99 void __weak hw_perf_event_setup_online(int cpu) { barrier(); }
100
101 int __weak
102 hw_perf_group_sched_in(struct perf_event *group_leader,
103                struct perf_cpu_context *cpuctx,
104                struct perf_event_context *ctx, int cpu)
105 {
106         return 0;
107 }
108
109 void __weak perf_event_print_debug(void)        { }
110
/*
 * Per-CPU nesting depth of perf_disable(): incremented by
 * __perf_disable() and decremented by __perf_enable(), which reports
 * when the depth drops back to zero.
 */
111 static DEFINE_PER_CPU(int, perf_disable_count);
112
113 void __perf_disable(void)
114 {
115         __get_cpu_var(perf_disable_count)++;
116 }
117
118 bool __perf_enable(void)
119 {
120         return !--__get_cpu_var(perf_disable_count);
121 }
122
/*
 * Disable events on this CPU.  Calls nest: the per-CPU depth is
 * bumped first, then the hardware hook is invoked on every call.
 */
void perf_disable(void)
{
	__perf_disable();
	hw_perf_disable();
}
128
/*
 * Undo one perf_disable().  The hardware hook is only invoked once
 * the outermost disable has been undone.
 */
void perf_enable(void)
{
	if (!__perf_enable())
		return;

	hw_perf_enable();
}
134
135 static void get_ctx(struct perf_event_context *ctx)
136 {
137         WARN_ON(!atomic_inc_not_zero(&ctx->refcount));
138 }
139
140 static void free_ctx(struct rcu_head *head)
141 {
142         struct perf_event_context *ctx;
143
144         ctx = container_of(head, struct perf_event_context, rcu_head);
145         kfree(ctx);
146 }
147
148 static void put_ctx(struct perf_event_context *ctx)
149 {
150         if (atomic_dec_and_test(&ctx->refcount)) {
151                 if (ctx->parent_ctx)
152                         put_ctx(ctx->parent_ctx);
153                 if (ctx->task)
154                         put_task_struct(ctx->task);
155                 call_rcu(&ctx->rcu_head, free_ctx);
156         }
157 }
158
159 static void unclone_ctx(struct perf_event_context *ctx)
160 {
161         if (ctx->parent_ctx) {
162                 put_ctx(ctx->parent_ctx);
163                 ctx->parent_ctx = NULL;
164         }
165 }
166
167 /*
168  * If we inherit events we want to return the parent event id
169  * to userspace.
170  */
171 static u64 primary_event_id(struct perf_event *event)
172 {
173         u64 id = event->id;
174
175         if (event->parent)
176                 id = event->parent->id;
177
178         return id;
179 }
180
181 /*
182  * Get the perf_event_context for a task and lock it.
183  * This has to cope with with the fact that until it is locked,
184  * the context could get moved to another task.
185  */
186 static struct perf_event_context *
187 perf_lock_task_context(struct task_struct *task, unsigned long *flags)
188 {
189         struct perf_event_context *ctx;
190
191         rcu_read_lock();
192  retry:
193         ctx = rcu_dereference(task->perf_event_ctxp);
194         if (ctx) {
195                 /*
196                  * If this context is a clone of another, it might
197                  * get swapped for another underneath us by
198                  * perf_event_task_sched_out, though the
199                  * rcu_read_lock() protects us from any context
200                  * getting freed.  Lock the context and check if it
201                  * got swapped before we could get the lock, and retry
202                  * if so.  If we locked the right context, then it
203                  * can't get swapped on us any more.
204                  */
205                 spin_lock_irqsave(&ctx->lock, *flags);
206                 if (ctx != rcu_dereference(task->perf_event_ctxp)) {
207                         spin_unlock_irqrestore(&ctx->lock, *flags);
208                         goto retry;
209                 }
210
211                 if (!atomic_inc_not_zero(&ctx->refcount)) {
212                         spin_unlock_irqrestore(&ctx->lock, *flags);
213                         ctx = NULL;
214                 }
215         }
216         rcu_read_unlock();
217         return ctx;
218 }
219
220 /*
221  * Get the context for a task and increment its pin_count so it
222  * can't get swapped to another task.  This also increments its
223  * reference count so that the context can't get freed.
224  */
225 static struct perf_event_context *perf_pin_task_context(struct task_struct *task)
226 {
227         struct perf_event_context *ctx;
228         unsigned long flags;
229
230         ctx = perf_lock_task_context(task, &flags);
231         if (ctx) {
232                 ++ctx->pin_count;
233                 spin_unlock_irqrestore(&ctx->lock, flags);
234         }
235         return ctx;
236 }
237
238 static void perf_unpin_context(struct perf_event_context *ctx)
239 {
240         unsigned long flags;
241
242         spin_lock_irqsave(&ctx->lock, flags);
243         --ctx->pin_count;
244         spin_unlock_irqrestore(&ctx->lock, flags);
245         put_ctx(ctx);
246 }
247
248 /*
249  * Add a event from the lists for its context.
250  * Must be called with ctx->mutex and ctx->lock held.
251  */
252 static void
253 list_add_event(struct perf_event *event, struct perf_event_context *ctx)
254 {
255         struct perf_event *group_leader = event->group_leader;
256
257         /*
258          * Depending on whether it is a standalone or sibling event,
259          * add it straight to the context's event list, or to the group
260          * leader's sibling list:
261          */
262         if (group_leader == event)
263                 list_add_tail(&event->group_entry, &ctx->group_list);
264         else {
265                 list_add_tail(&event->group_entry, &group_leader->sibling_list);
266                 group_leader->nr_siblings++;
267         }
268
269         list_add_rcu(&event->event_entry, &ctx->event_list);
270         ctx->nr_events++;
271         if (event->attr.inherit_stat)
272                 ctx->nr_stat++;
273 }
274
275 /*
276  * Remove a event from the lists for its context.
277  * Must be called with ctx->mutex and ctx->lock held.
278  */
279 static void
280 list_del_event(struct perf_event *event, struct perf_event_context *ctx)
281 {
282         struct perf_event *sibling, *tmp;
283
284         if (list_empty(&event->group_entry))
285                 return;
286         ctx->nr_events--;
287         if (event->attr.inherit_stat)
288                 ctx->nr_stat--;
289
290         list_del_init(&event->group_entry);
291         list_del_rcu(&event->event_entry);
292
293         if (event->group_leader != event)
294                 event->group_leader->nr_siblings--;
295
296         /*
297          * If this was a group event with sibling events then
298          * upgrade the siblings to singleton events by adding them
299          * to the context list directly:
300          */
301         list_for_each_entry_safe(sibling, tmp, &event->sibling_list, group_entry) {
302
303                 list_move_tail(&sibling->group_entry, &ctx->group_list);
304                 sibling->group_leader = sibling;
305         }
306 }
307
308 static void
309 event_sched_out(struct perf_event *event,
310                   struct perf_cpu_context *cpuctx,
311                   struct perf_event_context *ctx)
312 {
313         if (event->state != PERF_EVENT_STATE_ACTIVE)
314                 return;
315
316         event->state = PERF_EVENT_STATE_INACTIVE;
317         if (event->pending_disable) {
318                 event->pending_disable = 0;
319                 event->state = PERF_EVENT_STATE_OFF;
320         }
321         event->tstamp_stopped = ctx->time;
322         event->pmu->disable(event);
323         event->oncpu = -1;
324
325         if (!is_software_event(event))
326                 cpuctx->active_oncpu--;
327         ctx->nr_active--;
328         if (event->attr.exclusive || !cpuctx->active_oncpu)
329                 cpuctx->exclusive = 0;
330 }
331
332 static void
333 group_sched_out(struct perf_event *group_event,
334                 struct perf_cpu_context *cpuctx,
335                 struct perf_event_context *ctx)
336 {
337         struct perf_event *event;
338
339         if (group_event->state != PERF_EVENT_STATE_ACTIVE)
340                 return;
341
342         event_sched_out(group_event, cpuctx, ctx);
343
344         /*
345          * Schedule out siblings (if any):
346          */
347         list_for_each_entry(event, &group_event->sibling_list, group_entry)
348                 event_sched_out(event, cpuctx, ctx);
349
350         if (group_event->attr.exclusive)
351                 cpuctx->exclusive = 0;
352 }
353
354 /*
355  * Cross CPU call to remove a performance event
356  *
357  * We disable the event on the hardware level first. After that we
358  * remove it from the context list.
359  */
360 static void __perf_event_remove_from_context(void *info)
361 {
362         struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
363         struct perf_event *event = info;
364         struct perf_event_context *ctx = event->ctx;
365
366         /*
367          * If this is a task context, we need to check whether it is
368          * the current task context of this cpu. If not it has been
369          * scheduled out before the smp call arrived.
370          */
371         if (ctx->task && cpuctx->task_ctx != ctx)
372                 return;
373
374         spin_lock(&ctx->lock);
375         /*
376          * Protect the list operation against NMI by disabling the
377          * events on a global level.
378          */
379         perf_disable();
380
381         event_sched_out(event, cpuctx, ctx);
382
383         list_del_event(event, ctx);
384
385         if (!ctx->task) {
386                 /*
387                  * Allow more per task events with respect to the
388                  * reservation:
389                  */
390                 cpuctx->max_pertask =
391                         min(perf_max_events - ctx->nr_events,
392                             perf_max_events - perf_reserved_percpu);
393         }
394
395         perf_enable();
396         spin_unlock(&ctx->lock);
397 }
398
399
400 /*
401  * Remove the event from a task's (or a CPU's) list of events.
402  *
403  * Must be called with ctx->mutex held.
404  *
405  * CPU events are removed with a smp call. For task events we only
406  * call when the task is on a CPU.
407  *
408  * If event->ctx is a cloned context, callers must make sure that
409  * every task struct that event->ctx->task could possibly point to
410  * remains valid.  This is OK when called from perf_release since
411  * that only calls us on the top-level context, which can't be a clone.
412  * When called from perf_event_exit_task, it's OK because the
413  * context has been detached from its task.
414  */
415 static void perf_event_remove_from_context(struct perf_event *event)
416 {
417         struct perf_event_context *ctx = event->ctx;
418         struct task_struct *task = ctx->task;
419
420         if (!task) {
421                 /*
422                  * Per cpu events are removed via an smp call and
423                  * the removal is always sucessful.
424                  */
425                 smp_call_function_single(event->cpu,
426                                          __perf_event_remove_from_context,
427                                          event, 1);
428                 return;
429         }
430
431 retry:
432         task_oncpu_function_call(task, __perf_event_remove_from_context,
433                                  event);
434
435         spin_lock_irq(&ctx->lock);
436         /*
437          * If the context is active we need to retry the smp call.
438          */
439         if (ctx->nr_active && !list_empty(&event->group_entry)) {
440                 spin_unlock_irq(&ctx->lock);
441                 goto retry;
442         }
443
444         /*
445          * The lock prevents that this context is scheduled in so we
446          * can remove the event safely, if the call above did not
447          * succeed.
448          */
449         if (!list_empty(&event->group_entry)) {
450                 list_del_event(event, ctx);
451         }
452         spin_unlock_irq(&ctx->lock);
453 }
454
455 static inline u64 perf_clock(void)
456 {
457         return cpu_clock(smp_processor_id());
458 }
459
460 /*
461  * Update the record of the current time in a context.
462  */
463 static void update_context_time(struct perf_event_context *ctx)
464 {
465         u64 now = perf_clock();
466
467         ctx->time += now - ctx->timestamp;
468         ctx->timestamp = now;
469 }
470
471 /*
472  * Update the total_time_enabled and total_time_running fields for a event.
473  */
474 static void update_event_times(struct perf_event *event)
475 {
476         struct perf_event_context *ctx = event->ctx;
477         u64 run_end;
478
479         if (event->state < PERF_EVENT_STATE_INACTIVE ||
480             event->group_leader->state < PERF_EVENT_STATE_INACTIVE)
481                 return;
482
483         event->total_time_enabled = ctx->time - event->tstamp_enabled;
484
485         if (event->state == PERF_EVENT_STATE_INACTIVE)
486                 run_end = event->tstamp_stopped;
487         else
488                 run_end = ctx->time;
489
490         event->total_time_running = run_end - event->tstamp_running;
491 }
492
493 /*
494  * Update total_time_enabled and total_time_running for all events in a group.
495  */
496 static void update_group_times(struct perf_event *leader)
497 {
498         struct perf_event *event;
499
500         update_event_times(leader);
501         list_for_each_entry(event, &leader->sibling_list, group_entry)
502                 update_event_times(event);
503 }
504
505 /*
506  * Cross CPU call to disable a performance event
507  */
508 static void __perf_event_disable(void *info)
509 {
510         struct perf_event *event = info;
511         struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
512         struct perf_event_context *ctx = event->ctx;
513
514         /*
515          * If this is a per-task event, need to check whether this
516          * event's task is the current task on this cpu.
517          */
518         if (ctx->task && cpuctx->task_ctx != ctx)
519                 return;
520
521         spin_lock(&ctx->lock);
522
523         /*
524          * If the event is on, turn it off.
525          * If it is in error state, leave it in error state.
526          */
527         if (event->state >= PERF_EVENT_STATE_INACTIVE) {
528                 update_context_time(ctx);
529                 update_group_times(event);
530                 if (event == event->group_leader)
531                         group_sched_out(event, cpuctx, ctx);
532                 else
533                         event_sched_out(event, cpuctx, ctx);
534                 event->state = PERF_EVENT_STATE_OFF;
535         }
536
537         spin_unlock(&ctx->lock);
538 }
539
540 /*
541  * Disable a event.
542  *
543  * If event->ctx is a cloned context, callers must make sure that
544  * every task struct that event->ctx->task could possibly point to
545  * remains valid.  This condition is satisifed when called through
546  * perf_event_for_each_child or perf_event_for_each because they
547  * hold the top-level event's child_mutex, so any descendant that
548  * goes to exit will block in sync_child_event.
549  * When called from perf_pending_event it's OK because event->ctx
550  * is the current context on this CPU and preemption is disabled,
551  * hence we can't get into perf_event_task_sched_out for this context.
552  */
553 static void perf_event_disable(struct perf_event *event)
554 {
555         struct perf_event_context *ctx = event->ctx;
556         struct task_struct *task = ctx->task;
557
558         if (!task) {
559                 /*
560                  * Disable the event on the cpu that it's on
561                  */
562                 smp_call_function_single(event->cpu, __perf_event_disable,
563                                          event, 1);
564                 return;
565         }
566
567  retry:
568         task_oncpu_function_call(task, __perf_event_disable, event);
569
570         spin_lock_irq(&ctx->lock);
571         /*
572          * If the event is still active, we need to retry the cross-call.
573          */
574         if (event->state == PERF_EVENT_STATE_ACTIVE) {
575                 spin_unlock_irq(&ctx->lock);
576                 goto retry;
577         }
578
579         /*
580          * Since we have the lock this context can't be scheduled
581          * in, so we can change the state safely.
582          */
583         if (event->state == PERF_EVENT_STATE_INACTIVE) {
584                 update_group_times(event);
585                 event->state = PERF_EVENT_STATE_OFF;
586         }
587
588         spin_unlock_irq(&ctx->lock);
589 }
590
591 static int
592 event_sched_in(struct perf_event *event,
593                  struct perf_cpu_context *cpuctx,
594                  struct perf_event_context *ctx,
595                  int cpu)
596 {
597         if (event->state <= PERF_EVENT_STATE_OFF)
598                 return 0;
599
600         event->state = PERF_EVENT_STATE_ACTIVE;
601         event->oncpu = cpu;     /* TODO: put 'cpu' into cpuctx->cpu */
602         /*
603          * The new state must be visible before we turn it on in the hardware:
604          */
605         smp_wmb();
606
607         if (event->pmu->enable(event)) {
608                 event->state = PERF_EVENT_STATE_INACTIVE;
609                 event->oncpu = -1;
610                 return -EAGAIN;
611         }
612
613         event->tstamp_running += ctx->time - event->tstamp_stopped;
614
615         if (!is_software_event(event))
616                 cpuctx->active_oncpu++;
617         ctx->nr_active++;
618
619         if (event->attr.exclusive)
620                 cpuctx->exclusive = 1;
621
622         return 0;
623 }
624
625 static int
626 group_sched_in(struct perf_event *group_event,
627                struct perf_cpu_context *cpuctx,
628                struct perf_event_context *ctx,
629                int cpu)
630 {
631         struct perf_event *event, *partial_group;
632         int ret;
633
634         if (group_event->state == PERF_EVENT_STATE_OFF)
635                 return 0;
636
637         ret = hw_perf_group_sched_in(group_event, cpuctx, ctx, cpu);
638         if (ret)
639                 return ret < 0 ? ret : 0;
640
641         if (event_sched_in(group_event, cpuctx, ctx, cpu))
642                 return -EAGAIN;
643
644         /*
645          * Schedule in siblings as one group (if any):
646          */
647         list_for_each_entry(event, &group_event->sibling_list, group_entry) {
648                 if (event_sched_in(event, cpuctx, ctx, cpu)) {
649                         partial_group = event;
650                         goto group_error;
651                 }
652         }
653
654         return 0;
655
656 group_error:
657         /*
658          * Groups can be scheduled in as one unit only, so undo any
659          * partial group before returning:
660          */
661         list_for_each_entry(event, &group_event->sibling_list, group_entry) {
662                 if (event == partial_group)
663                         break;
664                 event_sched_out(event, cpuctx, ctx);
665         }
666         event_sched_out(group_event, cpuctx, ctx);
667
668         return -EAGAIN;
669 }
670
671 /*
672  * Return 1 for a group consisting entirely of software events,
673  * 0 if the group contains any hardware events.
674  */
675 static int is_software_only_group(struct perf_event *leader)
676 {
677         struct perf_event *event;
678
679         if (!is_software_event(leader))
680                 return 0;
681
682         list_for_each_entry(event, &leader->sibling_list, group_entry)
683                 if (!is_software_event(event))
684                         return 0;
685
686         return 1;
687 }
688
689 /*
690  * Work out whether we can put this event group on the CPU now.
691  */
692 static int group_can_go_on(struct perf_event *event,
693                            struct perf_cpu_context *cpuctx,
694                            int can_add_hw)
695 {
696         /*
697          * Groups consisting entirely of software events can always go on.
698          */
699         if (is_software_only_group(event))
700                 return 1;
701         /*
702          * If an exclusive group is already on, no other hardware
703          * events can go on.
704          */
705         if (cpuctx->exclusive)
706                 return 0;
707         /*
708          * If this group is exclusive and there are already
709          * events on the CPU, it can't go on.
710          */
711         if (event->attr.exclusive && cpuctx->active_oncpu)
712                 return 0;
713         /*
714          * Otherwise, try to add it if all previous groups were able
715          * to go on.
716          */
717         return can_add_hw;
718 }
719
720 static void add_event_to_ctx(struct perf_event *event,
721                                struct perf_event_context *ctx)
722 {
723         list_add_event(event, ctx);
724         event->tstamp_enabled = ctx->time;
725         event->tstamp_running = ctx->time;
726         event->tstamp_stopped = ctx->time;
727 }
728
729 /*
730  * Cross CPU call to install and enable a performance event
731  *
732  * Must be called with ctx->mutex held
733  */
734 static void __perf_install_in_context(void *info)
735 {
736         struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
737         struct perf_event *event = info;
738         struct perf_event_context *ctx = event->ctx;
739         struct perf_event *leader = event->group_leader;
740         int cpu = smp_processor_id();
741         int err;
742
743         /*
744          * If this is a task context, we need to check whether it is
745          * the current task context of this cpu. If not it has been
746          * scheduled out before the smp call arrived.
747          * Or possibly this is the right context but it isn't
748          * on this cpu because it had no events.
749          */
750         if (ctx->task && cpuctx->task_ctx != ctx) {
751                 if (cpuctx->task_ctx || ctx->task != current)
752                         return;
753                 cpuctx->task_ctx = ctx;
754         }
755
756         spin_lock(&ctx->lock);
757         ctx->is_active = 1;
758         update_context_time(ctx);
759
760         /*
761          * Protect the list operation against NMI by disabling the
762          * events on a global level. NOP for non NMI based events.
763          */
764         perf_disable();
765
766         add_event_to_ctx(event, ctx);
767
768         /*
769          * Don't put the event on if it is disabled or if
770          * it is in a group and the group isn't on.
771          */
772         if (event->state != PERF_EVENT_STATE_INACTIVE ||
773             (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE))
774                 goto unlock;
775
776         /*
777          * An exclusive event can't go on if there are already active
778          * hardware events, and no hardware event can go on if there
779          * is already an exclusive event on.
780          */
781         if (!group_can_go_on(event, cpuctx, 1))
782                 err = -EEXIST;
783         else
784                 err = event_sched_in(event, cpuctx, ctx, cpu);
785
786         if (err) {
787                 /*
788                  * This event couldn't go on.  If it is in a group
789                  * then we have to pull the whole group off.
790                  * If the event group is pinned then put it in error state.
791                  */
792                 if (leader != event)
793                         group_sched_out(leader, cpuctx, ctx);
794                 if (leader->attr.pinned) {
795                         update_group_times(leader);
796                         leader->state = PERF_EVENT_STATE_ERROR;
797                 }
798         }
799
800         if (!err && !ctx->task && cpuctx->max_pertask)
801                 cpuctx->max_pertask--;
802
803  unlock:
804         perf_enable();
805
806         spin_unlock(&ctx->lock);
807 }
808
809 /*
810  * Attach a performance event to a context
811  *
812  * First we add the event to the list with the hardware enable bit
813  * in event->hw_config cleared.
814  *
815  * If the event is attached to a task which is on a CPU we use a smp
816  * call to enable it in the task context. The task might have been
817  * scheduled away, but we check this in the smp call again.
818  *
819  * Must be called with ctx->mutex held.
820  */
821 static void
822 perf_install_in_context(struct perf_event_context *ctx,
823                         struct perf_event *event,
824                         int cpu)
825 {
826         struct task_struct *task = ctx->task;
827
828         if (!task) {
829                 /*
830                  * Per cpu events are installed via an smp call and
831                  * the install is always sucessful.
832                  */
833                 smp_call_function_single(cpu, __perf_install_in_context,
834                                          event, 1);
835                 return;
836         }
837
838 retry:
839         task_oncpu_function_call(task, __perf_install_in_context,
840                                  event);
841
842         spin_lock_irq(&ctx->lock);
843         /*
844          * we need to retry the smp call.
845          */
846         if (ctx->is_active && list_empty(&event->group_entry)) {
847                 spin_unlock_irq(&ctx->lock);
848                 goto retry;
849         }
850
851         /*
852          * The lock prevents that this context is scheduled in so we
853          * can add the event safely, if it the call above did not
854          * succeed.
855          */
856         if (list_empty(&event->group_entry))
857                 add_event_to_ctx(event, ctx);
858         spin_unlock_irq(&ctx->lock);
859 }
860
861 /*
862  * Put a event into inactive state and update time fields.
863  * Enabling the leader of a group effectively enables all
864  * the group members that aren't explicitly disabled, so we
865  * have to update their ->tstamp_enabled also.
866  * Note: this works for group members as well as group leaders
867  * since the non-leader members' sibling_lists will be empty.
868  */
869 static void __perf_event_mark_enabled(struct perf_event *event,
870                                         struct perf_event_context *ctx)
871 {
872         struct perf_event *sub;
873
874         event->state = PERF_EVENT_STATE_INACTIVE;
875         event->tstamp_enabled = ctx->time - event->total_time_enabled;
876         list_for_each_entry(sub, &event->sibling_list, group_entry)
877                 if (sub->state >= PERF_EVENT_STATE_INACTIVE)
878                         sub->tstamp_enabled =
879                                 ctx->time - sub->total_time_enabled;
880 }
881
882 /*
883  * Cross CPU call to enable a performance event
884  */
885 static void __perf_event_enable(void *info)
886 {
887         struct perf_event *event = info;
888         struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
889         struct perf_event_context *ctx = event->ctx;
890         struct perf_event *leader = event->group_leader;
891         int err;
892
893         /*
894          * If this is a per-task event, need to check whether this
895          * event's task is the current task on this cpu.
896          */
897         if (ctx->task && cpuctx->task_ctx != ctx) {
898                 if (cpuctx->task_ctx || ctx->task != current)
899                         return;
900                 cpuctx->task_ctx = ctx;
901         }
902
903         spin_lock(&ctx->lock);
904         ctx->is_active = 1;
905         update_context_time(ctx);
906
907         if (event->state >= PERF_EVENT_STATE_INACTIVE)
908                 goto unlock;
909         __perf_event_mark_enabled(event, ctx);
910
911         /*
912          * If the event is in a group and isn't the group leader,
913          * then don't put it on unless the group is on.
914          */
915         if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE)
916                 goto unlock;
917
918         if (!group_can_go_on(event, cpuctx, 1)) {
919                 err = -EEXIST;
920         } else {
921                 perf_disable();
922                 if (event == leader)
923                         err = group_sched_in(event, cpuctx, ctx,
924                                              smp_processor_id());
925                 else
926                         err = event_sched_in(event, cpuctx, ctx,
927                                                smp_processor_id());
928                 perf_enable();
929         }
930
931         if (err) {
932                 /*
933                  * If this event can't go on and it's part of a
934                  * group, then the whole group has to come off.
935                  */
936                 if (leader != event)
937                         group_sched_out(leader, cpuctx, ctx);
938                 if (leader->attr.pinned) {
939                         update_group_times(leader);
940                         leader->state = PERF_EVENT_STATE_ERROR;
941                 }
942         }
943
944  unlock:
945         spin_unlock(&ctx->lock);
946 }
947
948 /*
949  * Enable a event.
950  *
951  * If event->ctx is a cloned context, callers must make sure that
952  * every task struct that event->ctx->task could possibly point to
953  * remains valid.  This condition is satisfied when called through
954  * perf_event_for_each_child or perf_event_for_each as described
955  * for perf_event_disable.
956  */
957 static void perf_event_enable(struct perf_event *event)
958 {
959         struct perf_event_context *ctx = event->ctx;
960         struct task_struct *task = ctx->task;
961
962         if (!task) {
963                 /*
964                  * Enable the event on the cpu that it's on
965                  */
966                 smp_call_function_single(event->cpu, __perf_event_enable,
967                                          event, 1);
968                 return;
969         }
970
971         spin_lock_irq(&ctx->lock);
972         if (event->state >= PERF_EVENT_STATE_INACTIVE)
973                 goto out;
974
975         /*
976          * If the event is in error state, clear that first.
977          * That way, if we see the event in error state below, we
978          * know that it has gone back into error state, as distinct
979          * from the task having been scheduled away before the
980          * cross-call arrived.
981          */
982         if (event->state == PERF_EVENT_STATE_ERROR)
983                 event->state = PERF_EVENT_STATE_OFF;
984
985  retry:
986         spin_unlock_irq(&ctx->lock);
987         task_oncpu_function_call(task, __perf_event_enable, event);
988
989         spin_lock_irq(&ctx->lock);
990
991         /*
992          * If the context is active and the event is still off,
993          * we need to retry the cross-call.
994          */
995         if (ctx->is_active && event->state == PERF_EVENT_STATE_OFF)
996                 goto retry;
997
998         /*
999          * Since we have the lock this context can't be scheduled
1000          * in, so we can change the state safely.
1001          */
1002         if (event->state == PERF_EVENT_STATE_OFF)
1003                 __perf_event_mark_enabled(event, ctx);
1004
1005  out:
1006         spin_unlock_irq(&ctx->lock);
1007 }
1008
1009 static int perf_event_refresh(struct perf_event *event, int refresh)
1010 {
1011         /*
1012          * not supported on inherited events
1013          */
1014         if (event->attr.inherit)
1015                 return -EINVAL;
1016
1017         atomic_add(refresh, &event->event_limit);
1018         perf_event_enable(event);
1019
1020         return 0;
1021 }
1022
1023 void __perf_event_sched_out(struct perf_event_context *ctx,
1024                               struct perf_cpu_context *cpuctx)
1025 {
1026         struct perf_event *event;
1027
1028         spin_lock(&ctx->lock);
1029         ctx->is_active = 0;
1030         if (likely(!ctx->nr_events))
1031                 goto out;
1032         update_context_time(ctx);
1033
1034         perf_disable();
1035         if (ctx->nr_active)
1036                 list_for_each_entry(event, &ctx->group_list, group_entry)
1037                         group_sched_out(event, cpuctx, ctx);
1038
1039         perf_enable();
1040  out:
1041         spin_unlock(&ctx->lock);
1042 }
1043
1044 /*
1045  * Test whether two contexts are equivalent, i.e. whether they
1046  * have both been cloned from the same version of the same context
1047  * and they both have the same number of enabled events.
1048  * If the number of enabled events is the same, then the set
1049  * of enabled events should be the same, because these are both
1050  * inherited contexts, therefore we can't access individual events
1051  * in them directly with an fd; we can only enable/disable all
1052  * events via prctl, or enable/disable all events in a family
1053  * via ioctl, which will have the same effect on both contexts.
1054  */
1055 static int context_equiv(struct perf_event_context *ctx1,
1056                          struct perf_event_context *ctx2)
1057 {
1058         return ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx
1059                 && ctx1->parent_gen == ctx2->parent_gen
1060                 && !ctx1->pin_count && !ctx2->pin_count;
1061 }
1062
1063 static void __perf_event_read(void *event);
1064
1065 static void __perf_event_sync_stat(struct perf_event *event,
1066                                      struct perf_event *next_event)
1067 {
1068         u64 value;
1069
1070         if (!event->attr.inherit_stat)
1071                 return;
1072
1073         /*
1074          * Update the event value, we cannot use perf_event_read()
1075          * because we're in the middle of a context switch and have IRQs
1076          * disabled, which upsets smp_call_function_single(), however
1077          * we know the event must be on the current CPU, therefore we
1078          * don't need to use it.
1079          */
1080         switch (event->state) {
1081         case PERF_EVENT_STATE_ACTIVE:
1082                 __perf_event_read(event);
1083                 break;
1084
1085         case PERF_EVENT_STATE_INACTIVE:
1086                 update_event_times(event);
1087                 break;
1088
1089         default:
1090                 break;
1091         }
1092
1093         /*
1094          * In order to keep per-task stats reliable we need to flip the event
1095          * values when we flip the contexts.
1096          */
1097         value = atomic64_read(&next_event->count);
1098         value = atomic64_xchg(&event->count, value);
1099         atomic64_set(&next_event->count, value);
1100
1101         swap(event->total_time_enabled, next_event->total_time_enabled);
1102         swap(event->total_time_running, next_event->total_time_running);
1103
1104         /*
1105          * Since we swizzled the values, update the user visible data too.
1106          */
1107         perf_event_update_userpage(event);
1108         perf_event_update_userpage(next_event);
1109 }
1110
/*
 * list_next_entry - get the entry following @pos in its list
 * @pos:	pointer to the current entry
 * @member:	name of the list_head field within the entry type
 *
 * @pos is parenthesized so that an expression argument (e.g. "p + i")
 * expands correctly.  No end-of-list check is done here; that is left
 * to the caller.
 */
#define list_next_entry(pos, member) \
	list_entry((pos)->member.next, typeof(*(pos)), member)
1113
1114 static void perf_event_sync_stat(struct perf_event_context *ctx,
1115                                    struct perf_event_context *next_ctx)
1116 {
1117         struct perf_event *event, *next_event;
1118
1119         if (!ctx->nr_stat)
1120                 return;
1121
1122         event = list_first_entry(&ctx->event_list,
1123                                    struct perf_event, event_entry);
1124
1125         next_event = list_first_entry(&next_ctx->event_list,
1126                                         struct perf_event, event_entry);
1127
1128         while (&event->event_entry != &ctx->event_list &&
1129                &next_event->event_entry != &next_ctx->event_list) {
1130
1131                 __perf_event_sync_stat(event, next_event);
1132
1133                 event = list_next_entry(event, event_entry);
1134                 next_event = list_next_entry(next_event, event_entry);
1135         }
1136 }
1137
1138 /*
1139  * Called from scheduler to remove the events of the current task,
1140  * with interrupts disabled.
1141  *
1142  * We stop each event and update the event value in event->count.
1143  *
1144  * This does not protect us against NMI, but disable()
1145  * sets the disabled bit in the control field of event _before_
1146  * accessing the event control register. If a NMI hits, then it will
1147  * not restart the event.
1148  */
1149 void perf_event_task_sched_out(struct task_struct *task,
1150                                  struct task_struct *next, int cpu)
1151 {
1152         struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
1153         struct perf_event_context *ctx = task->perf_event_ctxp;
1154         struct perf_event_context *next_ctx;
1155         struct perf_event_context *parent;
1156         struct pt_regs *regs;
1157         int do_switch = 1;
1158
1159         regs = task_pt_regs(task);
1160         perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, regs, 0);
1161
1162         if (likely(!ctx || !cpuctx->task_ctx))
1163                 return;
1164
1165         update_context_time(ctx);
1166
1167         rcu_read_lock();
1168         parent = rcu_dereference(ctx->parent_ctx);
1169         next_ctx = next->perf_event_ctxp;
1170         if (parent && next_ctx &&
1171             rcu_dereference(next_ctx->parent_ctx) == parent) {
1172                 /*
1173                  * Looks like the two contexts are clones, so we might be
1174                  * able to optimize the context switch.  We lock both
1175                  * contexts and check that they are clones under the
1176                  * lock (including re-checking that neither has been
1177                  * uncloned in the meantime).  It doesn't matter which
1178                  * order we take the locks because no other cpu could
1179                  * be trying to lock both of these tasks.
1180                  */
1181                 spin_lock(&ctx->lock);
1182                 spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
1183                 if (context_equiv(ctx, next_ctx)) {
1184                         /*
1185                          * XXX do we need a memory barrier of sorts
1186                          * wrt to rcu_dereference() of perf_event_ctxp
1187                          */
1188                         task->perf_event_ctxp = next_ctx;
1189                         next->perf_event_ctxp = ctx;
1190                         ctx->task = next;
1191                         next_ctx->task = task;
1192                         do_switch = 0;
1193
1194                         perf_event_sync_stat(ctx, next_ctx);
1195                 }
1196                 spin_unlock(&next_ctx->lock);
1197                 spin_unlock(&ctx->lock);
1198         }
1199         rcu_read_unlock();
1200
1201         if (do_switch) {
1202                 __perf_event_sched_out(ctx, cpuctx);
1203                 cpuctx->task_ctx = NULL;
1204         }
1205 }
1206
1207 /*
1208  * Called with IRQs disabled
1209  */
1210 static void __perf_event_task_sched_out(struct perf_event_context *ctx)
1211 {
1212         struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
1213
1214         if (!cpuctx->task_ctx)
1215                 return;
1216
1217         if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
1218                 return;
1219
1220         __perf_event_sched_out(ctx, cpuctx);
1221         cpuctx->task_ctx = NULL;
1222 }
1223
1224 /*
1225  * Called with IRQs disabled
1226  */
1227 static void perf_event_cpu_sched_out(struct perf_cpu_context *cpuctx)
1228 {
1229         __perf_event_sched_out(&cpuctx->ctx, cpuctx);
1230 }
1231
1232 static void
1233 __perf_event_sched_in(struct perf_event_context *ctx,
1234                         struct perf_cpu_context *cpuctx, int cpu)
1235 {
1236         struct perf_event *event;
1237         int can_add_hw = 1;
1238
1239         spin_lock(&ctx->lock);
1240         ctx->is_active = 1;
1241         if (likely(!ctx->nr_events))
1242                 goto out;
1243
1244         ctx->timestamp = perf_clock();
1245
1246         perf_disable();
1247
1248         /*
1249          * First go through the list and put on any pinned groups
1250          * in order to give them the best chance of going on.
1251          */
1252         list_for_each_entry(event, &ctx->group_list, group_entry) {
1253                 if (event->state <= PERF_EVENT_STATE_OFF ||
1254                     !event->attr.pinned)
1255                         continue;
1256                 if (event->cpu != -1 && event->cpu != cpu)
1257                         continue;
1258
1259                 if (group_can_go_on(event, cpuctx, 1))
1260                         group_sched_in(event, cpuctx, ctx, cpu);
1261
1262                 /*
1263                  * If this pinned group hasn't been scheduled,
1264                  * put it in error state.
1265                  */
1266                 if (event->state == PERF_EVENT_STATE_INACTIVE) {
1267                         update_group_times(event);
1268                         event->state = PERF_EVENT_STATE_ERROR;
1269                 }
1270         }
1271
1272         list_for_each_entry(event, &ctx->group_list, group_entry) {
1273                 /*
1274                  * Ignore events in OFF or ERROR state, and
1275                  * ignore pinned events since we did them already.
1276                  */
1277                 if (event->state <= PERF_EVENT_STATE_OFF ||
1278                     event->attr.pinned)
1279                         continue;
1280
1281                 /*
1282                  * Listen to the 'cpu' scheduling filter constraint
1283                  * of events:
1284                  */
1285                 if (event->cpu != -1 && event->cpu != cpu)
1286                         continue;
1287
1288                 if (group_can_go_on(event, cpuctx, can_add_hw))
1289                         if (group_sched_in(event, cpuctx, ctx, cpu))
1290                                 can_add_hw = 0;
1291         }
1292         perf_enable();
1293  out:
1294         spin_unlock(&ctx->lock);
1295 }
1296
1297 /*
1298  * Called from scheduler to add the events of the current task
1299  * with interrupts disabled.
1300  *
1301  * We restore the event value and then enable it.
1302  *
1303  * This does not protect us against NMI, but enable()
1304  * sets the enabled bit in the control field of event _before_
1305  * accessing the event control register. If a NMI hits, then it will
1306  * keep the event running.
1307  */
1308 void perf_event_task_sched_in(struct task_struct *task, int cpu)
1309 {
1310         struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
1311         struct perf_event_context *ctx = task->perf_event_ctxp;
1312
1313         if (likely(!ctx))
1314                 return;
1315         if (cpuctx->task_ctx == ctx)
1316                 return;
1317         __perf_event_sched_in(ctx, cpuctx, cpu);
1318         cpuctx->task_ctx = ctx;
1319 }
1320
1321 static void perf_event_cpu_sched_in(struct perf_cpu_context *cpuctx, int cpu)
1322 {
1323         struct perf_event_context *ctx = &cpuctx->ctx;
1324
1325         __perf_event_sched_in(ctx, cpuctx, cpu);
1326 }
1327
1328 #define MAX_INTERRUPTS (~0ULL)
1329
1330 static void perf_log_throttle(struct perf_event *event, int enable);
1331
1332 static void perf_adjust_period(struct perf_event *event, u64 events)
1333 {
1334         struct hw_perf_event *hwc = &event->hw;
1335         u64 period, sample_period;
1336         s64 delta;
1337
1338         events *= hwc->sample_period;
1339         period = div64_u64(events, event->attr.sample_freq);
1340
1341         delta = (s64)(period - hwc->sample_period);
1342         delta = (delta + 7) / 8; /* low pass filter */
1343
1344         sample_period = hwc->sample_period + delta;
1345
1346         if (!sample_period)
1347                 sample_period = 1;
1348
1349         hwc->sample_period = sample_period;
1350 }
1351
1352 static void perf_ctx_adjust_freq(struct perf_event_context *ctx)
1353 {
1354         struct perf_event *event;
1355         struct hw_perf_event *hwc;
1356         u64 interrupts, freq;
1357
1358         spin_lock(&ctx->lock);
1359         list_for_each_entry(event, &ctx->group_list, group_entry) {
1360                 if (event->state != PERF_EVENT_STATE_ACTIVE)
1361                         continue;
1362
1363                 hwc = &event->hw;
1364
1365                 interrupts = hwc->interrupts;
1366                 hwc->interrupts = 0;
1367
1368                 /*
1369                  * unthrottle events on the tick
1370                  */
1371                 if (interrupts == MAX_INTERRUPTS) {
1372                         perf_log_throttle(event, 1);
1373                         event->pmu->unthrottle(event);
1374                         interrupts = 2*sysctl_perf_event_sample_rate/HZ;
1375                 }
1376
1377                 if (!event->attr.freq || !event->attr.sample_freq)
1378                         continue;
1379
1380                 /*
1381                  * if the specified freq < HZ then we need to skip ticks
1382                  */
1383                 if (event->attr.sample_freq < HZ) {
1384                         freq = event->attr.sample_freq;
1385
1386                         hwc->freq_count += freq;
1387                         hwc->freq_interrupts += interrupts;
1388
1389                         if (hwc->freq_count < HZ)
1390                                 continue;
1391
1392                         interrupts = hwc->freq_interrupts;
1393                         hwc->freq_interrupts = 0;
1394                         hwc->freq_count -= HZ;
1395                 } else
1396                         freq = HZ;
1397
1398                 perf_adjust_period(event, freq * interrupts);
1399
1400                 /*
1401                  * In order to avoid being stalled by an (accidental) huge
1402                  * sample period, force reset the sample period if we didn't
1403                  * get any events in this freq period.
1404                  */
1405                 if (!interrupts) {
1406                         perf_disable();
1407                         event->pmu->disable(event);
1408                         atomic64_set(&hwc->period_left, 0);
1409                         event->pmu->enable(event);
1410                         perf_enable();
1411                 }
1412         }
1413         spin_unlock(&ctx->lock);
1414 }
1415
1416 /*
1417  * Round-robin a context's events:
1418  */
1419 static void rotate_ctx(struct perf_event_context *ctx)
1420 {
1421         struct perf_event *event;
1422
1423         if (!ctx->nr_events)
1424                 return;
1425
1426         spin_lock(&ctx->lock);
1427         /*
1428          * Rotate the first entry last (works just fine for group events too):
1429          */
1430         perf_disable();
1431         list_for_each_entry(event, &ctx->group_list, group_entry) {
1432                 list_move_tail(&event->group_entry, &ctx->group_list);
1433                 break;
1434         }
1435         perf_enable();
1436
1437         spin_unlock(&ctx->lock);
1438 }
1439
1440 void perf_event_task_tick(struct task_struct *curr, int cpu)
1441 {
1442         struct perf_cpu_context *cpuctx;
1443         struct perf_event_context *ctx;
1444
1445         if (!atomic_read(&nr_events))
1446                 return;
1447
1448         cpuctx = &per_cpu(perf_cpu_context, cpu);
1449         ctx = curr->perf_event_ctxp;
1450
1451         perf_ctx_adjust_freq(&cpuctx->ctx);
1452         if (ctx)
1453                 perf_ctx_adjust_freq(ctx);
1454
1455         perf_event_cpu_sched_out(cpuctx);
1456         if (ctx)
1457                 __perf_event_task_sched_out(ctx);
1458
1459         rotate_ctx(&cpuctx->ctx);
1460         if (ctx)
1461                 rotate_ctx(ctx);
1462
1463         perf_event_cpu_sched_in(cpuctx, cpu);
1464         if (ctx)
1465                 perf_event_task_sched_in(curr, cpu);
1466 }
1467
1468 /*
1469  * Enable all of a task's events that have been marked enable-on-exec.
1470  * This expects task == current.
1471  */
1472 static void perf_event_enable_on_exec(struct task_struct *task)
1473 {
1474         struct perf_event_context *ctx;
1475         struct perf_event *event;
1476         unsigned long flags;
1477         int enabled = 0;
1478
1479         local_irq_save(flags);
1480         ctx = task->perf_event_ctxp;
1481         if (!ctx || !ctx->nr_events)
1482                 goto out;
1483
1484         __perf_event_task_sched_out(ctx);
1485
1486         spin_lock(&ctx->lock);
1487
1488         list_for_each_entry(event, &ctx->group_list, group_entry) {
1489                 if (!event->attr.enable_on_exec)
1490                         continue;
1491                 event->attr.enable_on_exec = 0;
1492                 if (event->state >= PERF_EVENT_STATE_INACTIVE)
1493                         continue;
1494                 __perf_event_mark_enabled(event, ctx);
1495                 enabled = 1;
1496         }
1497
1498         /*
1499          * Unclone this context if we enabled any event.
1500          */
1501         if (enabled)
1502                 unclone_ctx(ctx);
1503
1504         spin_unlock(&ctx->lock);
1505
1506         perf_event_task_sched_in(task, smp_processor_id());
1507  out:
1508         local_irq_restore(flags);
1509 }
1510
1511 /*
1512  * Cross CPU call to read the hardware event
1513  */
1514 static void __perf_event_read(void *info)
1515 {
1516         struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
1517         struct perf_event *event = info;
1518         struct perf_event_context *ctx = event->ctx;
1519         unsigned long flags;
1520
1521         /*
1522          * If this is a task context, we need to check whether it is
1523          * the current task context of this cpu.  If not it has been
1524          * scheduled out before the smp call arrived.  In that case
1525          * event->count would have been updated to a recent sample
1526          * when the event was scheduled out.
1527          */
1528         if (ctx->task && cpuctx->task_ctx != ctx)
1529                 return;
1530
1531         local_irq_save(flags);
1532         if (ctx->is_active)
1533                 update_context_time(ctx);
1534         event->pmu->read(event);
1535         update_event_times(event);
1536         local_irq_restore(flags);
1537 }
1538
1539 static u64 perf_event_read(struct perf_event *event)
1540 {
1541         /*
1542          * If event is enabled and currently active on a CPU, update the
1543          * value in the event structure:
1544          */
1545         if (event->state == PERF_EVENT_STATE_ACTIVE) {
1546                 smp_call_function_single(event->oncpu,
1547                                          __perf_event_read, event, 1);
1548         } else if (event->state == PERF_EVENT_STATE_INACTIVE) {
1549                 update_event_times(event);
1550         }
1551
1552         return atomic64_read(&event->count);
1553 }
1554
1555 /*
1556  * Initialize the perf_event context in a task_struct:
1557  */
1558 static void
1559 __perf_event_init_context(struct perf_event_context *ctx,
1560                             struct task_struct *task)
1561 {
1562         memset(ctx, 0, sizeof(*ctx));
1563         spin_lock_init(&ctx->lock);
1564         mutex_init(&ctx->mutex);
1565         INIT_LIST_HEAD(&ctx->group_list);
1566         INIT_LIST_HEAD(&ctx->event_list);
1567         atomic_set(&ctx->refcount, 1);
1568         ctx->task = task;
1569 }
1570
1571 static struct perf_event_context *find_get_context(pid_t pid, int cpu)
1572 {
1573         struct perf_event_context *ctx;
1574         struct perf_cpu_context *cpuctx;
1575         struct task_struct *task;
1576         unsigned long flags;
1577         int err;
1578
1579         /*
1580          * If cpu is not a wildcard then this is a percpu event:
1581          */
1582         if (cpu != -1) {
1583                 /* Must be root to operate on a CPU event: */
1584                 if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
1585                         return ERR_PTR(-EACCES);
1586
1587                 if (cpu < 0 || cpu > num_possible_cpus())
1588                         return ERR_PTR(-EINVAL);
1589
1590                 /*
1591                  * We could be clever and allow to attach a event to an
1592                  * offline CPU and activate it when the CPU comes up, but
1593                  * that's for later.
1594                  */
1595                 if (!cpu_isset(cpu, cpu_online_map))
1596                         return ERR_PTR(-ENODEV);
1597
1598                 cpuctx = &per_cpu(perf_cpu_context, cpu);
1599                 ctx = &cpuctx->ctx;
1600                 get_ctx(ctx);
1601
1602                 return ctx;
1603         }
1604
1605         rcu_read_lock();
1606         if (!pid)
1607                 task = current;
1608         else
1609                 task = find_task_by_vpid(pid);
1610         if (task)
1611                 get_task_struct(task);
1612         rcu_read_unlock();
1613
1614         if (!task)
1615                 return ERR_PTR(-ESRCH);
1616
1617         /*
1618          * Can't attach events to a dying task.
1619          */
1620         err = -ESRCH;
1621         if (task->flags & PF_EXITING)
1622                 goto errout;
1623
1624         /* Reuse ptrace permission checks for now. */
1625         err = -EACCES;
1626         if (!ptrace_may_access(task, PTRACE_MODE_READ))
1627                 goto errout;
1628
1629  retry:
1630         ctx = perf_lock_task_context(task, &flags);
1631         if (ctx) {
1632                 unclone_ctx(ctx);
1633                 spin_unlock_irqrestore(&ctx->lock, flags);
1634         }
1635
1636         if (!ctx) {
1637                 ctx = kmalloc(sizeof(struct perf_event_context), GFP_KERNEL);
1638                 err = -ENOMEM;
1639                 if (!ctx)
1640                         goto errout;
1641                 __perf_event_init_context(ctx, task);
1642                 get_ctx(ctx);
1643                 if (cmpxchg(&task->perf_event_ctxp, NULL, ctx)) {
1644                         /*
1645                          * We raced with some other task; use
1646                          * the context they set.
1647                          */
1648                         kfree(ctx);
1649                         goto retry;
1650                 }
1651                 get_task_struct(task);
1652         }
1653
1654         put_task_struct(task);
1655         return ctx;
1656
1657  errout:
1658         put_task_struct(task);
1659         return ERR_PTR(err);
1660 }
1661
1662 static void perf_event_free_filter(struct perf_event *event);
1663
1664 static void free_event_rcu(struct rcu_head *head)
1665 {
1666         struct perf_event *event;
1667
1668         event = container_of(head, struct perf_event, rcu_head);
1669         if (event->ns)
1670                 put_pid_ns(event->ns);
1671         perf_event_free_filter(event);
1672         kfree(event);
1673 }
1674
1675 static void perf_pending_sync(struct perf_event *event);
1676
1677 static void free_event(struct perf_event *event)
1678 {
1679         perf_pending_sync(event);
1680
1681         if (!event->parent) {
1682                 atomic_dec(&nr_events);
1683                 if (event->attr.mmap)
1684                         atomic_dec(&nr_mmap_events);
1685                 if (event->attr.comm)
1686                         atomic_dec(&nr_comm_events);
1687                 if (event->attr.task)
1688                         atomic_dec(&nr_task_events);
1689         }
1690
1691         if (event->output) {
1692                 fput(event->output->filp);
1693                 event->output = NULL;
1694         }
1695
1696         if (event->destroy)
1697                 event->destroy(event);
1698
1699         put_ctx(event->ctx);
1700         call_rcu(&event->rcu_head, free_event_rcu);
1701 }
1702
1703 /*
1704  * Called when the last reference to the file is gone.
1705  */
1706 static int perf_release(struct inode *inode, struct file *file)
1707 {
1708         struct perf_event *event = file->private_data;
1709         struct perf_event_context *ctx = event->ctx;
1710
1711         file->private_data = NULL;
1712
1713         WARN_ON_ONCE(ctx->parent_ctx);
1714         mutex_lock(&ctx->mutex);
1715         perf_event_remove_from_context(event);
1716         mutex_unlock(&ctx->mutex);
1717
1718         mutex_lock(&event->owner->perf_event_mutex);
1719         list_del_init(&event->owner_entry);
1720         mutex_unlock(&event->owner->perf_event_mutex);
1721         put_task_struct(event->owner);
1722
1723         free_event(event);
1724
1725         return 0;
1726 }
1727
1728 int perf_event_release_kernel(struct perf_event *event)
1729 {
1730         struct perf_event_context *ctx = event->ctx;
1731
1732         WARN_ON_ONCE(ctx->parent_ctx);
1733         mutex_lock(&ctx->mutex);
1734         perf_event_remove_from_context(event);
1735         mutex_unlock(&ctx->mutex);
1736
1737         mutex_lock(&event->owner->perf_event_mutex);
1738         list_del_init(&event->owner_entry);
1739         mutex_unlock(&event->owner->perf_event_mutex);
1740         put_task_struct(event->owner);
1741
1742         free_event(event);
1743
1744         return 0;
1745 }
1746 EXPORT_SYMBOL_GPL(perf_event_release_kernel);
1747
1748 static int perf_event_read_size(struct perf_event *event)
1749 {
1750         int entry = sizeof(u64); /* value */
1751         int size = 0;
1752         int nr = 1;
1753
1754         if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
1755                 size += sizeof(u64);
1756
1757         if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
1758                 size += sizeof(u64);
1759
1760         if (event->attr.read_format & PERF_FORMAT_ID)
1761                 entry += sizeof(u64);
1762
1763         if (event->attr.read_format & PERF_FORMAT_GROUP) {
1764                 nr += event->group_leader->nr_siblings;
1765                 size += sizeof(u64);
1766         }
1767
1768         size += entry * nr;
1769
1770         return size;
1771 }
1772
1773 u64 perf_event_read_value(struct perf_event *event)
1774 {
1775         struct perf_event *child;
1776         u64 total = 0;
1777
1778         total += perf_event_read(event);
1779         list_for_each_entry(child, &event->child_list, child_list)
1780                 total += perf_event_read(child);
1781
1782         return total;
1783 }
1784 EXPORT_SYMBOL_GPL(perf_event_read_value);
1785
1786 static int perf_event_read_entry(struct perf_event *event,
1787                                    u64 read_format, char __user *buf)
1788 {
1789         int n = 0, count = 0;
1790         u64 values[2];
1791
1792         values[n++] = perf_event_read_value(event);
1793         if (read_format & PERF_FORMAT_ID)
1794                 values[n++] = primary_event_id(event);
1795
1796         count = n * sizeof(u64);
1797
1798         if (copy_to_user(buf, values, count))
1799                 return -EFAULT;
1800
1801         return count;
1802 }
1803
1804 static int perf_event_read_group(struct perf_event *event,
1805                                    u64 read_format, char __user *buf)
1806 {
1807         struct perf_event *leader = event->group_leader, *sub;
1808         int n = 0, size = 0, err = -EFAULT;
1809         u64 values[3];
1810
1811         values[n++] = 1 + leader->nr_siblings;
1812         if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
1813                 values[n++] = leader->total_time_enabled +
1814                         atomic64_read(&leader->child_total_time_enabled);
1815         }
1816         if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
1817                 values[n++] = leader->total_time_running +
1818                         atomic64_read(&leader->child_total_time_running);
1819         }
1820
1821         size = n * sizeof(u64);
1822
1823         if (copy_to_user(buf, values, size))
1824                 return -EFAULT;
1825
1826         err = perf_event_read_entry(leader, read_format, buf + size);
1827         if (err < 0)
1828                 return err;
1829
1830         size += err;
1831
1832         list_for_each_entry(sub, &leader->sibling_list, group_entry) {
1833                 err = perf_event_read_entry(sub, read_format,
1834                                 buf + size);
1835                 if (err < 0)
1836                         return err;
1837
1838                 size += err;
1839         }
1840
1841         return size;
1842 }
1843
1844 static int perf_event_read_one(struct perf_event *event,
1845                                  u64 read_format, char __user *buf)
1846 {
1847         u64 values[4];
1848         int n = 0;
1849
1850         values[n++] = perf_event_read_value(event);
1851         if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
1852                 values[n++] = event->total_time_enabled +
1853                         atomic64_read(&event->child_total_time_enabled);
1854         }
1855         if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
1856                 values[n++] = event->total_time_running +
1857                         atomic64_read(&event->child_total_time_running);
1858         }
1859         if (read_format & PERF_FORMAT_ID)
1860                 values[n++] = primary_event_id(event);
1861
1862         if (copy_to_user(buf, values, n * sizeof(u64)))
1863                 return -EFAULT;
1864
1865         return n * sizeof(u64);
1866 }
1867
1868 /*
1869  * Read the performance event - simple non blocking version for now
1870  */
1871 static ssize_t
1872 perf_read_hw(struct perf_event *event, char __user *buf, size_t count)
1873 {
1874         u64 read_format = event->attr.read_format;
1875         int ret;
1876
1877         /*
1878          * Return end-of-file for a read on a event that is in
1879          * error state (i.e. because it was pinned but it couldn't be
1880          * scheduled on to the CPU at some point).
1881          */
1882         if (event->state == PERF_EVENT_STATE_ERROR)
1883                 return 0;
1884
1885         if (count < perf_event_read_size(event))
1886                 return -ENOSPC;
1887
1888         WARN_ON_ONCE(event->ctx->parent_ctx);
1889         mutex_lock(&event->child_mutex);
1890         if (read_format & PERF_FORMAT_GROUP)
1891                 ret = perf_event_read_group(event, read_format, buf);
1892         else
1893                 ret = perf_event_read_one(event, read_format, buf);
1894         mutex_unlock(&event->child_mutex);
1895
1896         return ret;
1897 }
1898
1899 static ssize_t
1900 perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
1901 {
1902         struct perf_event *event = file->private_data;
1903
1904         return perf_read_hw(event, buf, count);
1905 }
1906
1907 static unsigned int perf_poll(struct file *file, poll_table *wait)
1908 {
1909         struct perf_event *event = file->private_data;
1910         struct perf_mmap_data *data;
1911         unsigned int events = POLL_HUP;
1912
1913         rcu_read_lock();
1914         data = rcu_dereference(event->data);
1915         if (data)
1916                 events = atomic_xchg(&data->poll, 0);
1917         rcu_read_unlock();
1918
1919         poll_wait(file, &event->waitq, wait);
1920
1921         return events;
1922 }
1923
1924 static void perf_event_reset(struct perf_event *event)
1925 {
1926         (void)perf_event_read(event);
1927         atomic64_set(&event->count, 0);
1928         perf_event_update_userpage(event);
1929 }
1930
1931 /*
1932  * Holding the top-level event's child_mutex means that any
1933  * descendant process that has inherited this event will block
1934  * in sync_child_event if it goes to exit, thus satisfying the
1935  * task existence requirements of perf_event_enable/disable.
1936  */
1937 static void perf_event_for_each_child(struct perf_event *event,
1938                                         void (*func)(struct perf_event *))
1939 {
1940         struct perf_event *child;
1941
1942         WARN_ON_ONCE(event->ctx->parent_ctx);
1943         mutex_lock(&event->child_mutex);
1944         func(event);
1945         list_for_each_entry(child, &event->child_list, child_list)
1946                 func(child);
1947         mutex_unlock(&event->child_mutex);
1948 }
1949
1950 static void perf_event_for_each(struct perf_event *event,
1951                                   void (*func)(struct perf_event *))
1952 {
1953         struct perf_event_context *ctx = event->ctx;
1954         struct perf_event *sibling;
1955
1956         WARN_ON_ONCE(ctx->parent_ctx);
1957         mutex_lock(&ctx->mutex);
1958         event = event->group_leader;
1959
1960         perf_event_for_each_child(event, func);
1961         func(event);
1962         list_for_each_entry(sibling, &event->sibling_list, group_entry)
1963                 perf_event_for_each_child(event, func);
1964         mutex_unlock(&ctx->mutex);
1965 }
1966
1967 static int perf_event_period(struct perf_event *event, u64 __user *arg)
1968 {
1969         struct perf_event_context *ctx = event->ctx;
1970         unsigned long size;
1971         int ret = 0;
1972         u64 value;
1973
1974         if (!event->attr.sample_period)
1975                 return -EINVAL;
1976
1977         size = copy_from_user(&value, arg, sizeof(value));
1978         if (size != sizeof(value))
1979                 return -EFAULT;
1980
1981         if (!value)
1982                 return -EINVAL;
1983
1984         spin_lock_irq(&ctx->lock);
1985         if (event->attr.freq) {
1986                 if (value > sysctl_perf_event_sample_rate) {
1987                         ret = -EINVAL;
1988                         goto unlock;
1989                 }
1990
1991                 event->attr.sample_freq = value;
1992         } else {
1993                 event->attr.sample_period = value;
1994                 event->hw.sample_period = value;
1995         }
1996 unlock:
1997         spin_unlock_irq(&ctx->lock);
1998
1999         return ret;
2000 }
2001
2002 static int perf_event_set_output(struct perf_event *event, int output_fd);
2003 static int perf_event_set_filter(struct perf_event *event, void __user *arg);
2004
2005 static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
2006 {
2007         struct perf_event *event = file->private_data;
2008         void (*func)(struct perf_event *);
2009         u32 flags = arg;
2010
2011         switch (cmd) {
2012         case PERF_EVENT_IOC_ENABLE:
2013                 func = perf_event_enable;
2014                 break;
2015         case PERF_EVENT_IOC_DISABLE:
2016                 func = perf_event_disable;
2017                 break;
2018         case PERF_EVENT_IOC_RESET:
2019                 func = perf_event_reset;
2020                 break;
2021
2022         case PERF_EVENT_IOC_REFRESH:
2023                 return perf_event_refresh(event, arg);
2024
2025         case PERF_EVENT_IOC_PERIOD:
2026                 return perf_event_period(event, (u64 __user *)arg);
2027
2028         case PERF_EVENT_IOC_SET_OUTPUT:
2029                 return perf_event_set_output(event, arg);
2030
2031         case PERF_EVENT_IOC_SET_FILTER:
2032                 return perf_event_set_filter(event, (void __user *)arg);
2033
2034         default:
2035                 return -ENOTTY;
2036         }
2037
2038         if (flags & PERF_IOC_FLAG_GROUP)
2039                 perf_event_for_each(event, func);
2040         else
2041                 perf_event_for_each_child(event, func);
2042
2043         return 0;
2044 }
2045
2046 int perf_event_task_enable(void)
2047 {
2048         struct perf_event *event;
2049
2050         mutex_lock(&current->perf_event_mutex);
2051         list_for_each_entry(event, &current->perf_event_list, owner_entry)
2052                 perf_event_for_each_child(event, perf_event_enable);
2053         mutex_unlock(&current->perf_event_mutex);
2054
2055         return 0;
2056 }
2057
2058 int perf_event_task_disable(void)
2059 {
2060         struct perf_event *event;
2061
2062         mutex_lock(&current->perf_event_mutex);
2063         list_for_each_entry(event, &current->perf_event_list, owner_entry)
2064                 perf_event_for_each_child(event, perf_event_disable);
2065         mutex_unlock(&current->perf_event_mutex);
2066
2067         return 0;
2068 }
2069
2070 #ifndef PERF_EVENT_INDEX_OFFSET
2071 # define PERF_EVENT_INDEX_OFFSET 0
2072 #endif
2073
2074 static int perf_event_index(struct perf_event *event)
2075 {
2076         if (event->state != PERF_EVENT_STATE_ACTIVE)
2077                 return 0;
2078
2079         return event->hw.idx + 1 - PERF_EVENT_INDEX_OFFSET;
2080 }
2081
2082 /*
2083  * Callers need to ensure there can be no nesting of this function, otherwise
2084  * the seqlock logic goes bad. We can not serialize this because the arch
2085  * code calls this from NMI context.
2086  */
2087 void perf_event_update_userpage(struct perf_event *event)
2088 {
2089         struct perf_event_mmap_page *userpg;
2090         struct perf_mmap_data *data;
2091
2092         rcu_read_lock();
2093         data = rcu_dereference(event->data);
2094         if (!data)
2095                 goto unlock;
2096
2097         userpg = data->user_page;
2098
2099         /*
2100          * Disable preemption so as to not let the corresponding user-space
2101          * spin too long if we get preempted.
2102          */
2103         preempt_disable();
2104         ++userpg->lock;
2105         barrier();
2106         userpg->index = perf_event_index(event);
2107         userpg->offset = atomic64_read(&event->count);
2108         if (event->state == PERF_EVENT_STATE_ACTIVE)
2109                 userpg->offset -= atomic64_read(&event->hw.prev_count);
2110
2111         userpg->time_enabled = event->total_time_enabled +
2112                         atomic64_read(&event->child_total_time_enabled);
2113
2114         userpg->time_running = event->total_time_running +
2115                         atomic64_read(&event->child_total_time_running);
2116
2117         barrier();
2118         ++userpg->lock;
2119         preempt_enable();
2120 unlock:
2121         rcu_read_unlock();
2122 }
2123
2124 static unsigned long perf_data_size(struct perf_mmap_data *data)
2125 {
2126         return data->nr_pages << (PAGE_SHIFT + data->data_order);
2127 }
2128
2129 #ifndef CONFIG_PERF_USE_VMALLOC
2130
2131 /*
2132  * Back perf_mmap() with regular GFP_KERNEL-0 pages.
2133  */
2134
2135 static struct page *
2136 perf_mmap_to_page(struct perf_mmap_data *data, unsigned long pgoff)
2137 {
2138         if (pgoff > data->nr_pages)
2139                 return NULL;
2140
2141         if (pgoff == 0)
2142                 return virt_to_page(data->user_page);
2143
2144         return virt_to_page(data->data_pages[pgoff - 1]);
2145 }
2146
2147 static struct perf_mmap_data *
2148 perf_mmap_data_alloc(struct perf_event *event, int nr_pages)
2149 {
2150         struct perf_mmap_data *data;
2151         unsigned long size;
2152         int i;
2153
2154         WARN_ON(atomic_read(&event->mmap_count));
2155
2156         size = sizeof(struct perf_mmap_data);
2157         size += nr_pages * sizeof(void *);
2158
2159         data = kzalloc(size, GFP_KERNEL);
2160         if (!data)
2161                 goto fail;
2162
2163         data->user_page = (void *)get_zeroed_page(GFP_KERNEL);
2164         if (!data->user_page)
2165                 goto fail_user_page;
2166
2167         for (i = 0; i < nr_pages; i++) {
2168                 data->data_pages[i] = (void *)get_zeroed_page(GFP_KERNEL);
2169                 if (!data->data_pages[i])
2170                         goto fail_data_pages;
2171         }
2172
2173         data->data_order = 0;
2174         data->nr_pages = nr_pages;
2175
2176         return data;
2177
2178 fail_data_pages:
2179         for (i--; i >= 0; i--)
2180                 free_page((unsigned long)data->data_pages[i]);
2181
2182         free_page((unsigned long)data->user_page);
2183
2184 fail_user_page:
2185         kfree(data);
2186
2187 fail:
2188         return NULL;
2189 }
2190
2191 static void perf_mmap_free_page(unsigned long addr)
2192 {
2193         struct page *page = virt_to_page((void *)addr);
2194
2195         page->mapping = NULL;
2196         __free_page(page);
2197 }
2198
2199 static void perf_mmap_data_free(struct perf_mmap_data *data)
2200 {
2201         int i;
2202
2203         perf_mmap_free_page((unsigned long)data->user_page);
2204         for (i = 0; i < data->nr_pages; i++)
2205                 perf_mmap_free_page((unsigned long)data->data_pages[i]);
2206 }
2207
2208 #else
2209
2210 /*
2211  * Back perf_mmap() with vmalloc memory.
2212  *
2213  * Required for architectures that have d-cache aliasing issues.
2214  */
2215
2216 static struct page *
2217 perf_mmap_to_page(struct perf_mmap_data *data, unsigned long pgoff)
2218 {
2219         if (pgoff > (1UL << data->data_order))
2220                 return NULL;
2221
2222         return vmalloc_to_page((void *)data->user_page + pgoff * PAGE_SIZE);
2223 }
2224
2225 static void perf_mmap_unmark_page(void *addr)
2226 {
2227         struct page *page = vmalloc_to_page(addr);
2228
2229         page->mapping = NULL;
2230 }
2231
2232 static void perf_mmap_data_free_work(struct work_struct *work)
2233 {
2234         struct perf_mmap_data *data;
2235         void *base;
2236         int i, nr;
2237
2238         data = container_of(work, struct perf_mmap_data, work);
2239         nr = 1 << data->data_order;
2240
2241         base = data->user_page;
2242         for (i = 0; i < nr + 1; i++)
2243                 perf_mmap_unmark_page(base + (i * PAGE_SIZE));
2244
2245         vfree(base);
2246 }
2247
2248 static void perf_mmap_data_free(struct perf_mmap_data *data)
2249 {
2250         schedule_work(&data->work);
2251 }
2252
2253 static struct perf_mmap_data *
2254 perf_mmap_data_alloc(struct perf_event *event, int nr_pages)
2255 {
2256         struct perf_mmap_data *data;
2257         unsigned long size;
2258         void *all_buf;
2259
2260         WARN_ON(atomic_read(&event->mmap_count));
2261
2262         size = sizeof(struct perf_mmap_data);
2263         size += sizeof(void *);
2264
2265         data = kzalloc(size, GFP_KERNEL);
2266         if (!data)
2267                 goto fail;
2268
2269         INIT_WORK(&data->work, perf_mmap_data_free_work);
2270
2271         all_buf = vmalloc_user((nr_pages + 1) * PAGE_SIZE);
2272         if (!all_buf)
2273                 goto fail_all_buf;
2274
2275         data->user_page = all_buf;
2276         data->data_pages[0] = all_buf + PAGE_SIZE;
2277         data->data_order = ilog2(nr_pages);
2278         data->nr_pages = 1;
2279
2280         return data;
2281
2282 fail_all_buf:
2283         kfree(data);
2284
2285 fail:
2286         return NULL;
2287 }
2288
2289 #endif
2290
2291 static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
2292 {
2293         struct perf_event *event = vma->vm_file->private_data;
2294         struct perf_mmap_data *data;
2295         int ret = VM_FAULT_SIGBUS;
2296
2297         if (vmf->flags & FAULT_FLAG_MKWRITE) {
2298                 if (vmf->pgoff == 0)
2299                         ret = 0;
2300                 return ret;
2301         }
2302
2303         rcu_read_lock();
2304         data = rcu_dereference(event->data);
2305         if (!data)
2306                 goto unlock;
2307
2308         if (vmf->pgoff && (vmf->flags & FAULT_FLAG_WRITE))
2309                 goto unlock;
2310
2311         vmf->page = perf_mmap_to_page(data, vmf->pgoff);
2312         if (!vmf->page)
2313                 goto unlock;
2314
2315         get_page(vmf->page);
2316         vmf->page->mapping = vma->vm_file->f_mapping;
2317         vmf->page->index   = vmf->pgoff;
2318
2319         ret = 0;
2320 unlock:
2321         rcu_read_unlock();
2322
2323         return ret;
2324 }
2325
2326 static void
2327 perf_mmap_data_init(struct perf_event *event, struct perf_mmap_data *data)
2328 {
2329         long max_size = perf_data_size(data);
2330
2331         atomic_set(&data->lock, -1);
2332
2333         if (event->attr.watermark) {
2334                 data->watermark = min_t(long, max_size,
2335                                         event->attr.wakeup_watermark);
2336         }
2337
2338         if (!data->watermark)
2339                 data->watermark = max_t(long, PAGE_SIZE, max_size / 2);
2340
2341
2342         rcu_assign_pointer(event->data, data);
2343 }
2344
2345 static void perf_mmap_data_free_rcu(struct rcu_head *rcu_head)
2346 {
2347         struct perf_mmap_data *data;
2348
2349         data = container_of(rcu_head, struct perf_mmap_data, rcu_head);
2350         perf_mmap_data_free(data);
2351         kfree(data);
2352 }
2353
2354 static void perf_mmap_data_release(struct perf_event *event)
2355 {
2356         struct perf_mmap_data *data = event->data;
2357
2358         WARN_ON(atomic_read(&event->mmap_count));
2359
2360         rcu_assign_pointer(event->data, NULL);
2361         call_rcu(&data->rcu_head, perf_mmap_data_free_rcu);
2362 }
2363
2364 static void perf_mmap_open(struct vm_area_struct *vma)
2365 {
2366         struct perf_event *event = vma->vm_file->private_data;
2367
2368         atomic_inc(&event->mmap_count);
2369 }
2370
2371 static void perf_mmap_close(struct vm_area_struct *vma)
2372 {
2373         struct perf_event *event = vma->vm_file->private_data;
2374
2375         WARN_ON_ONCE(event->ctx->parent_ctx);
2376         if (atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) {
2377                 unsigned long size = perf_data_size(event->data);
2378                 struct user_struct *user = current_user();
2379
2380                 atomic_long_sub((size >> PAGE_SHIFT) + 1, &user->locked_vm);
2381                 vma->vm_mm->locked_vm -= event->data->nr_locked;
2382                 perf_mmap_data_release(event);
2383                 mutex_unlock(&event->mmap_mutex);
2384         }
2385 }
2386
2387 static const struct vm_operations_struct perf_mmap_vmops = {
2388         .open           = perf_mmap_open,
2389         .close          = perf_mmap_close,
2390         .fault          = perf_mmap_fault,
2391         .page_mkwrite   = perf_mmap_fault,
2392 };
2393
2394 static int perf_mmap(struct file *file, struct vm_area_struct *vma)
2395 {
2396         struct perf_event *event = file->private_data;
2397         unsigned long user_locked, user_lock_limit;
2398         struct user_struct *user = current_user();
2399         unsigned long locked, lock_limit;
2400         struct perf_mmap_data *data;
2401         unsigned long vma_size;
2402         unsigned long nr_pages;
2403         long user_extra, extra;
2404         int ret = 0;
2405
2406         if (!(vma->vm_flags & VM_SHARED))
2407                 return -EINVAL;
2408
2409         vma_size = vma->vm_end - vma->vm_start;
2410         nr_pages = (vma_size / PAGE_SIZE) - 1;
2411
2412         /*
2413          * If we have data pages ensure they're a power-of-two number, so we
2414          * can do bitmasks instead of modulo.
2415          */
2416         if (nr_pages != 0 && !is_power_of_2(nr_pages))
2417                 return -EINVAL;
2418
2419         if (vma_size != PAGE_SIZE * (1 + nr_pages))
2420                 return -EINVAL;
2421
2422         if (vma->vm_pgoff != 0)
2423                 return -EINVAL;
2424
2425         WARN_ON_ONCE(event->ctx->parent_ctx);
2426         mutex_lock(&event->mmap_mutex);
2427         if (event->output) {
2428                 ret = -EINVAL;
2429                 goto unlock;
2430         }
2431
2432         if (atomic_inc_not_zero(&event->mmap_count)) {
2433                 if (nr_pages != event->data->nr_pages)
2434                         ret = -EINVAL;
2435                 goto unlock;
2436         }
2437
2438         user_extra = nr_pages + 1;
2439         user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10);
2440
2441         /*
2442          * Increase the limit linearly with more CPUs:
2443          */
2444         user_lock_limit *= num_online_cpus();
2445
2446         user_locked = atomic_long_read(&user->locked_vm) + user_extra;
2447
2448         extra = 0;
2449         if (user_locked > user_lock_limit)
2450                 extra = user_locked - user_lock_limit;
2451
2452         lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
2453         lock_limit >>= PAGE_SHIFT;
2454         locked = vma->vm_mm->locked_vm + extra;
2455
2456         if ((locked > lock_limit) && perf_paranoid_tracepoint_raw() &&
2457                 !capable(CAP_IPC_LOCK)) {
2458                 ret = -EPERM;
2459                 goto unlock;
2460         }
2461
2462         WARN_ON(event->data);
2463
2464         data = perf_mmap_data_alloc(event, nr_pages);
2465         ret = -ENOMEM;
2466         if (!data)
2467                 goto unlock;
2468
2469         ret = 0;
2470         perf_mmap_data_init(event, data);
2471
2472         atomic_set(&event->mmap_count, 1);
2473         atomic_long_add(user_extra, &user->locked_vm);
2474         vma->vm_mm->locked_vm += extra;
2475         event->data->nr_locked = extra;
2476         if (vma->vm_flags & VM_WRITE)
2477                 event->data->writable = 1;
2478
2479 unlock:
2480         mutex_unlock(&event->mmap_mutex);
2481
2482         vma->vm_flags |= VM_RESERVED;
2483         vma->vm_ops = &perf_mmap_vmops;
2484
2485         return ret;
2486 }
2487
2488 static int perf_fasync(int fd, struct file *filp, int on)
2489 {
2490         struct inode *inode = filp->f_path.dentry->d_inode;
2491         struct perf_event *event = filp->private_data;
2492         int retval;
2493
2494         mutex_lock(&inode->i_mutex);
2495         retval = fasync_helper(fd, filp, on, &event->fasync);
2496         mutex_unlock(&inode->i_mutex);
2497
2498         if (retval < 0)
2499                 return retval;
2500
2501         return 0;
2502 }
2503
2504 static const struct file_operations perf_fops = {
2505         .release                = perf_release,
2506         .read                   = perf_read,
2507         .poll                   = perf_poll,
2508         .unlocked_ioctl         = perf_ioctl,
2509         .compat_ioctl           = perf_ioctl,
2510         .mmap                   = perf_mmap,
2511         .fasync                 = perf_fasync,
2512 };
2513
2514 /*
2515  * Perf event wakeup
2516  *
2517  * If there's data, ensure we set the poll() state and publish everything
2518  * to user-space before waking everybody up.
2519  */
2520
2521 void perf_event_wakeup(struct perf_event *event)
2522 {
2523         wake_up_all(&event->waitq);
2524
2525         if (event->pending_kill) {
2526                 kill_fasync(&event->fasync, SIGIO, event->pending_kill);
2527                 event->pending_kill = 0;
2528         }
2529 }
2530
2531 /*
2532  * Pending wakeups
2533  *
2534  * Handle the case where we need to wakeup up from NMI (or rq->lock) context.
2535  *
2536  * The NMI bit means we cannot possibly take locks. Therefore, maintain a
2537  * single linked list and use cmpxchg() to add entries lockless.
2538  */
2539
2540 static void perf_pending_event(struct perf_pending_entry *entry)
2541 {
2542         struct perf_event *event = container_of(entry,
2543                         struct perf_event, pending);
2544
2545         if (event->pending_disable) {
2546                 event->pending_disable = 0;
2547                 __perf_event_disable(event);
2548         }
2549
2550         if (event->pending_wakeup) {
2551                 event->pending_wakeup = 0;
2552                 perf_event_wakeup(event);
2553         }
2554 }
2555
2556 #define PENDING_TAIL ((struct perf_pending_entry *)-1UL)
2557
2558 static DEFINE_PER_CPU(struct perf_pending_entry *, perf_pending_head) = {
2559         PENDING_TAIL,
2560 };
2561
2562 static void perf_pending_queue(struct perf_pending_entry *entry,
2563                                void (*func)(struct perf_pending_entry *))
2564 {
2565         struct perf_pending_entry **head;
2566
2567         if (cmpxchg(&entry->next, NULL, PENDING_TAIL) != NULL)
2568                 return;
2569
2570         entry->func = func;
2571
2572         head = &get_cpu_var(perf_pending_head);
2573
2574         do {
2575                 entry->next = *head;
2576         } while (cmpxchg(head, entry->next, entry) != entry->next);
2577
2578         set_perf_event_pending();
2579
2580         put_cpu_var(perf_pending_head);
2581 }
2582
2583 static int __perf_pending_run(void)
2584 {
2585         struct perf_pending_entry *list;
2586         int nr = 0;
2587
2588         list = xchg(&__get_cpu_var(perf_pending_head), PENDING_TAIL);
2589         while (list != PENDING_TAIL) {
2590                 void (*func)(struct perf_pending_entry *);
2591                 struct perf_pending_entry *entry = list;
2592
2593                 list = list->next;
2594
2595                 func = entry->func;
2596                 entry->next = NULL;
2597                 /*
2598                  * Ensure we observe the unqueue before we issue the wakeup,
2599                  * so that we won't be waiting forever.
2600                  * -- see perf_not_pending().
2601                  */
2602                 smp_wmb();
2603
2604                 func(entry);
2605                 nr++;
2606         }
2607
2608         return nr;
2609 }
2610
2611 static inline int perf_not_pending(struct perf_event *event)
2612 {
2613         /*
2614          * If we flush on whatever cpu we run, there is a chance we don't
2615          * need to wait.
2616          */
2617         get_cpu();
2618         __perf_pending_run();
2619         put_cpu();
2620
2621         /*
2622          * Ensure we see the proper queue state before going to sleep
2623          * so that we do not miss the wakeup. -- see perf_pending_handle()
2624          */
2625         smp_rmb();
2626         return event->pending.next == NULL;
2627 }
2628
2629 static void perf_pending_sync(struct perf_event *event)
2630 {
2631         wait_event(event->waitq, perf_not_pending(event));
2632 }
2633
/*
 * Run all pending entries queued on the current CPU; the number of
 * entries processed is not reported.
 */
void perf_event_do_pending(void)
{
	(void)__perf_pending_run();
}
2638
2639 /*
2640  * Callchain support -- arch specific
2641  */
2642
2643 __weak struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
2644 {
2645         return NULL;
2646 }
2647
2648 /*
2649  * Output
2650  */
2651 static bool perf_output_space(struct perf_mmap_data *data, unsigned long tail,
2652                               unsigned long offset, unsigned long head)
2653 {
2654         unsigned long mask;
2655
2656         if (!data->writable)
2657                 return true;
2658
2659         mask = perf_data_size(data) - 1;
2660
2661         offset = (offset - tail) & mask;
2662         head   = (head   - tail) & mask;
2663
2664         if ((int)(head - offset) < 0)
2665                 return false;
2666
2667         return true;
2668 }
2669
2670 static void perf_output_wakeup(struct perf_output_handle *handle)
2671 {
2672         atomic_set(&handle->data->poll, POLL_IN);
2673
2674         if (handle->nmi) {
2675                 handle->event->pending_wakeup = 1;
2676                 perf_pending_queue(&handle->event->pending,
2677                                    perf_pending_event);
2678         } else
2679                 perf_event_wakeup(handle->event);
2680 }
2681
2682 /*
2683  * Curious locking construct.
2684  *
2685  * We need to ensure a later event_id doesn't publish a head when a former
2686  * event_id isn't done writing. However since we need to deal with NMIs we
2687  * cannot fully serialize things.
2688  *
2689  * What we do is serialize between CPUs so we only have to deal with NMI
2690  * nesting on a single CPU.
2691  *
2692  * We only publish the head (and generate a wakeup) when the outer-most
2693  * event_id completes.
2694  */
2695 static void perf_output_lock(struct perf_output_handle *handle)
2696 {
2697         struct perf_mmap_data *data = handle->data;
2698         int cpu;
2699
2700         handle->locked = 0;
2701
2702         local_irq_save(handle->flags);
2703         cpu = smp_processor_id();
2704
2705         if (in_nmi() && atomic_read(&data->lock) == cpu)
2706                 return;
2707
2708         while (atomic_cmpxchg(&data->lock, -1, cpu) != -1)
2709                 cpu_relax();
2710
2711         handle->locked = 1;
2712 }
2713
2714 static void perf_output_unlock(struct perf_output_handle *handle)
2715 {
2716         struct perf_mmap_data *data = handle->data;
2717         unsigned long head;
2718         int cpu;
2719
2720         data->done_head = data->head;
2721
2722         if (!handle->locked)
2723                 goto out;
2724
2725 again:
2726         /*
2727          * The xchg implies a full barrier that ensures all writes are done
2728          * before we publish the new head, matched by a rmb() in userspace when
2729          * reading this position.
2730          */
2731         while ((head = atomic_long_xchg(&data->done_head, 0)))
2732                 data->user_page->data_head = head;
2733
2734         /*
2735          * NMI can happen here, which means we can miss a done_head update.
2736          */
2737
2738         cpu = atomic_xchg(&data->lock, -1);
2739         WARN_ON_ONCE(cpu != smp_processor_id());
2740
2741         /*
2742          * Therefore we have to validate we did not indeed do so.
2743          */
2744         if (unlikely(atomic_long_read(&data->done_head))) {
2745                 /*
2746                  * Since we had it locked, we can lock it again.
2747                  */
2748                 while (atomic_cmpxchg(&data->lock, -1, cpu) != -1)
2749                         cpu_relax();
2750
2751                 goto again;
2752         }
2753
2754         if (atomic_xchg(&data->wakeup, 0))
2755                 perf_output_wakeup(handle);
2756 out:
2757         local_irq_restore(handle->flags);
2758 }
2759
2760 void perf_output_copy(struct perf_output_handle *handle,
2761                       const void *buf, unsigned int len)
2762 {
2763         unsigned int pages_mask;
2764         unsigned long offset;
2765         unsigned int size;
2766         void **pages;
2767
2768         offset          = handle->offset;
2769         pages_mask      = handle->data->nr_pages - 1;
2770         pages           = handle->data->data_pages;
2771
2772         do {
2773                 unsigned long page_offset;
2774                 unsigned long page_size;
2775                 int nr;
2776
2777                 nr          = (offset >> PAGE_SHIFT) & pages_mask;
2778                 page_size   = 1UL << (handle->data->data_order + PAGE_SHIFT);
2779                 page_offset = offset & (page_size - 1);
2780                 size        = min_t(unsigned int, page_size - page_offset, len);
2781
2782                 memcpy(pages[nr] + page_offset, buf, size);
2783
2784                 len         -= size;
2785                 buf         += size;
2786                 offset      += size;
2787         } while (len);
2788
2789         handle->offset = offset;
2790
2791         /*
2792          * Check we didn't copy past our reservation window, taking the
2793          * possible unsigned int wrap into account.
2794          */
2795         WARN_ON_ONCE(((long)(handle->head - handle->offset)) < 0);
2796 }
2797
2798 int perf_output_begin(struct perf_output_handle *handle,
2799                       struct perf_event *event, unsigned int size,
2800                       int nmi, int sample)
2801 {
2802         struct perf_event *output_event;
2803         struct perf_mmap_data *data;
2804         unsigned long tail, offset, head;
2805         int have_lost;
2806         struct {
2807                 struct perf_event_header header;
2808                 u64                      id;
2809                 u64                      lost;
2810         } lost_event;
2811
2812         rcu_read_lock();
2813         /*
2814          * For inherited events we send all the output towards the parent.
2815          */
2816         if (event->parent)
2817                 event = event->parent;
2818
2819         output_event = rcu_dereference(event->output);
2820         if (output_event)
2821                 event = output_event;
2822
2823         data = rcu_dereference(event->data);
2824         if (!data)
2825                 goto out;
2826
2827         handle->data    = data;
2828         handle->event   = event;
2829         handle->nmi     = nmi;
2830         handle->sample  = sample;
2831
2832         if (!data->nr_pages)
2833                 goto fail;
2834
2835         have_lost = atomic_read(&data->lost);
2836         if (have_lost)
2837                 size += sizeof(lost_event);
2838
2839         perf_output_lock(handle);
2840
2841         do {
2842                 /*
2843                  * Userspace could choose to issue a mb() before updating the
2844                  * tail pointer. So that all reads will be completed before the
2845                  * write is issued.
2846                  */
2847                 tail = ACCESS_ONCE(data->user_page->data_tail);
2848                 smp_rmb();
2849                 offset = head = atomic_long_read(&data->head);
2850                 head += size;
2851                 if (unlikely(!perf_output_space(data, tail, offset, head)))
2852                         goto fail;
2853         } while (atomic_long_cmpxchg(&data->head, offset, head) != offset);
2854
2855         handle->offset  = offset;
2856         handle->head    = head;
2857
2858         if (head - tail > data->watermark)
2859                 atomic_set(&data->wakeup, 1);
2860
2861         if (have_lost) {
2862                 lost_event.header.type = PERF_RECORD_LOST;
2863                 lost_event.header.misc = 0;
2864                 lost_event.header.size = sizeof(lost_event);
2865                 lost_event.id          = event->id;
2866                 lost_event.lost        = atomic_xchg(&data->lost, 0);
2867
2868                 perf_output_put(handle, lost_event);
2869         }
2870
2871         return 0;
2872
2873 fail:
2874         atomic_inc(&data->lost);
2875         perf_output_unlock(handle);
2876 out:
2877         rcu_read_unlock();
2878
2879         return -ENOSPC;
2880 }
2881
2882 void perf_output_end(struct perf_output_handle *handle)
2883 {
2884         struct perf_event *event = handle->event;
2885         struct perf_mmap_data *data = handle->data;
2886
2887         int wakeup_events = event->attr.wakeup_events;
2888
2889         if (handle->sample && wakeup_events) {
2890                 int events = atomic_inc_return(&data->events);
2891                 if (events >= wakeup_events) {
2892                         atomic_sub(wakeup_events, &data->events);
2893                         atomic_set(&data->wakeup, 1);
2894                 }
2895         }
2896
2897         perf_output_unlock(handle);
2898         rcu_read_unlock();
2899 }
2900
2901 static u32 perf_event_pid(struct perf_event *event, struct task_struct *p)
2902 {
2903         /*
2904          * only top level events have the pid namespace they were created in
2905          */
2906         if (event->parent)
2907                 event = event->parent;
2908
2909         return task_tgid_nr_ns(p, event->ns);
2910 }
2911
2912 static u32 perf_event_tid(struct perf_event *event, struct task_struct *p)
2913 {
2914         /*
2915          * only top level events have the pid namespace they were created in
2916          */
2917         if (event->parent)
2918                 event = event->parent;
2919
2920         return task_pid_nr_ns(p, event->ns);
2921 }
2922
2923 static void perf_output_read_one(struct perf_output_handle *handle,
2924                                  struct perf_event *event)
2925 {
2926         u64 read_format = event->attr.read_format;
2927         u64 values[4];
2928         int n = 0;
2929
2930         values[n++] = atomic64_read(&event->count);
2931         if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
2932                 values[n++] = event->total_time_enabled +
2933                         atomic64_read(&event->child_total_time_enabled);
2934         }
2935         if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
2936                 values[n++] = event->total_time_running +
2937                         atomic64_read(&event->child_total_time_running);
2938         }
2939         if (read_format & PERF_FORMAT_ID)
2940                 values[n++] = primary_event_id(event);
2941
2942         perf_output_copy(handle, values, n * sizeof(u64));
2943 }
2944
2945 /*
2946  * XXX PERF_FORMAT_GROUP vs inherited events seems difficult.
2947  */
2948 static void perf_output_read_group(struct perf_output_handle *handle,
2949                             struct perf_event *event)
2950 {
2951         struct perf_event *leader = event->group_leader, *sub;
2952         u64 read_format = event->attr.read_format;
2953         u64 values[5];
2954         int n = 0;
2955
2956         values[n++] = 1 + leader->nr_siblings;
2957
2958         if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
2959                 values[n++] = leader->total_time_enabled;
2960
2961         if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
2962                 values[n++] = leader->total_time_running;
2963
2964         if (leader != event)
2965                 leader->pmu->read(leader);
2966
2967         values[n++] = atomic64_read(&leader->count);
2968         if (read_format & PERF_FORMAT_ID)
2969                 values[n++] = primary_event_id(leader);
2970
2971         perf_output_copy(handle, values, n * sizeof(u64));
2972
2973         list_for_each_entry(sub, &leader->sibling_list, group_entry) {
2974                 n = 0;
2975
2976                 if (sub != event)
2977                         sub->pmu->read(sub);
2978
2979                 values[n++] = atomic64_read(&sub->count);
2980                 if (read_format & PERF_FORMAT_ID)
2981                         values[n++] = primary_event_id(sub);
2982
2983                 perf_output_copy(handle, values, n * sizeof(u64));
2984         }
2985 }
2986
2987 static void perf_output_read(struct perf_output_handle *handle,
2988                              struct perf_event *event)
2989 {
2990         if (event->attr.read_format & PERF_FORMAT_GROUP)
2991                 perf_output_read_group(handle, event);
2992         else
2993                 perf_output_read_one(handle, event);
2994 }
2995
2996 void perf_output_sample(struct perf_output_handle *handle,
2997                         struct perf_event_header *header,
2998                         struct perf_sample_data *data,
2999                         struct perf_event *event)
3000 {
3001         u64 sample_type = data->type;
3002
3003         perf_output_put(handle, *header);
3004
3005         if (sample_type & PERF_SAMPLE_IP)
3006                 perf_output_put(handle, data->ip);
3007
3008         if (sample_type & PERF_SAMPLE_TID)
3009                 perf_output_put(handle, data->tid_entry);
3010
3011         if (sample_type & PERF_SAMPLE_TIME)
3012                 perf_output_put(handle, data->time);
3013
3014         if (sample_type & PERF_SAMPLE_ADDR)
3015                 perf_output_put(handle, data->addr);
3016
3017         if (sample_type & PERF_SAMPLE_ID)
3018                 perf_output_put(handle, data->id);
3019
3020         if (sample_type & PERF_SAMPLE_STREAM_ID)
3021                 perf_output_put(handle, data->stream_id);
3022
3023         if (sample_type & PERF_SAMPLE_CPU)
3024                 perf_output_put(handle, data->cpu_entry);
3025
3026         if (sample_type & PERF_SAMPLE_PERIOD)
3027                 perf_output_put(handle, data->period);
3028
3029         if (sample_type & PERF_SAMPLE_READ)
3030                 perf_output_read(handle, event);
3031
3032         if (sample_type & PERF_SAMPLE_CALLCHAIN) {
3033                 if (data->callchain) {
3034                         int size = 1;
3035
3036                         if (data->callchain)
3037                                 size += data->callchain->nr;
3038
3039                         size *= sizeof(u64);
3040
3041                         perf_output_copy(handle, data->callchain, size);
3042                 } else {
3043                         u64 nr = 0;
3044                         perf_output_put(handle, nr);
3045                 }
3046         }
3047
3048         if (sample_type & PERF_SAMPLE_RAW) {
3049                 if (data->raw) {
3050                         perf_output_put(handle, data->raw->size);
3051                         perf_output_copy(handle, data->raw->data,
3052                                          data->raw->size);
3053                 } else {
3054                         struct {
3055                                 u32     size;
3056                                 u32     data;
3057                         } raw = {
3058                                 .size = sizeof(u32),
3059                                 .data = 0,
3060                         };
3061                         perf_output_put(handle, raw);
3062                 }
3063         }
3064 }
3065
3066 void perf_prepare_sample(struct perf_event_header *header,
3067                          struct perf_sample_data *data,
3068                          struct perf_event *event,
3069                          struct pt_regs *regs)
3070 {
3071         u64 sample_type = event->attr.sample_type;
3072
3073         data->type = sample_type;
3074
3075         header->type = PERF_RECORD_SAMPLE;
3076         header->size = sizeof(*header);
3077
3078         header->misc = 0;
3079         header->misc |= perf_misc_flags(regs);
3080
3081         if (sample_type & PERF_SAMPLE_IP) {
3082                 data->ip = perf_instruction_pointer(regs);
3083
3084                 header->size += sizeof(data->ip);
3085         }
3086
3087         if (sample_type & PERF_SAMPLE_TID) {
3088                 /* namespace issues */
3089                 data->tid_entry.pid = perf_event_pid(event, current);
3090                 data->tid_entry.tid = perf_event_tid(event, current);
3091
3092                 header->size += sizeof(data->tid_entry);
3093         }
3094
3095         if (sample_type & PERF_SAMPLE_TIME) {
3096                 data->time = perf_clock();
3097
3098                 header->size += sizeof(data->time);
3099         }
3100
3101         if (sample_type & PERF_SAMPLE_ADDR)
3102                 header->size += sizeof(data->addr);
3103
3104         if (sample_type & PERF_SAMPLE_ID) {
3105                 data->id = primary_event_id(event);
3106
3107                 header->size += sizeof(data->id);
3108         }
3109
3110         if (sample_type & PERF_SAMPLE_STREAM_ID) {
3111                 data->stream_id = event->id;
3112
3113                 header->size += sizeof(data->stream_id);
3114         }
3115
3116         if (sample_type & PERF_SAMPLE_CPU) {
3117                 data->cpu_entry.cpu             = raw_smp_processor_id();
3118                 data->cpu_entry.reserved        = 0;
3119
3120                 header->size += sizeof(data->cpu_entry);
3121         }
3122
3123         if (sample_type & PERF_SAMPLE_PERIOD)
3124                 header->size += sizeof(data->period);
3125
3126         if (sample_type & PERF_SAMPLE_READ)
3127                 header->size += perf_event_read_size(event);
3128
3129         if (sample_type & PERF_SAMPLE_CALLCHAIN) {
3130                 int size = 1;
3131
3132                 data->callchain = perf_callchain(regs);
3133
3134                 if (data->callchain)
3135                         size += data->callchain->nr;
3136
3137                 header->size += size * sizeof(u64);
3138         }
3139
3140         if (sample_type & PERF_SAMPLE_RAW) {
3141                 int size = sizeof(u32);
3142
3143                 if (data->raw)
3144                         size += data->raw->size;
3145                 else
3146                         size += sizeof(u32);
3147
3148                 WARN_ON_ONCE(size & (sizeof(u64)-1));
3149                 header->size += size;
3150         }
3151 }
3152
3153 static void perf_event_output(struct perf_event *event, int nmi,
3154                                 struct perf_sample_data *data,
3155                                 struct pt_regs *regs)
3156 {
3157         struct perf_output_handle handle;
3158         struct perf_event_header header;
3159
3160         perf_prepare_sample(&header, data, event, regs);
3161
3162         if (perf_output_begin(&handle, event, header.size, nmi, 1))
3163                 return;
3164
3165         perf_output_sample(&handle, &header, data, event);
3166
3167         perf_output_end(&handle);
3168 }
3169
3170 /*
3171  * read event_id
3172  */
3173
3174 struct perf_read_event {
3175         struct perf_event_header        header;
3176
3177         u32                             pid;
3178         u32                             tid;
3179 };
3180
3181 static void
3182 perf_event_read_event(struct perf_event *event,
3183                         struct task_struct *task)
3184 {
3185         struct perf_output_handle handle;
3186         struct perf_read_event read_event = {
3187                 .header = {
3188                         .type = PERF_RECORD_READ,
3189                         .misc = 0,
3190                         .size = sizeof(read_event) + perf_event_read_size(event),
3191                 },
3192                 .pid = perf_event_pid(event, task),
3193                 .tid = perf_event_tid(event, task),
3194         };
3195         int ret;
3196
3197         ret = perf_output_begin(&handle, event, read_event.header.size, 0, 0);
3198         if (ret)
3199                 return;
3200
3201         perf_output_put(&handle, read_event);
3202         perf_output_read(&handle, event);
3203
3204         perf_output_end(&handle);
3205 }
3206
3207 /*
3208  * task tracking -- fork/exit
3209  *
3210  * enabled by: attr.comm | attr.mmap | attr.task
3211  */
3212
3213 struct perf_task_event {
3214         struct task_struct              *task;
3215         struct perf_event_context       *task_ctx;
3216
3217         struct {
3218                 struct perf_event_header        header;
3219
3220                 u32                             pid;
3221                 u32                             ppid;
3222                 u32                             tid;
3223                 u32                             ptid;
3224                 u64                             time;
3225         } event_id;
3226 };
3227
3228 static void perf_event_task_output(struct perf_event *event,
3229                                      struct perf_task_event *task_event)
3230 {
3231         struct perf_output_handle handle;
3232         int size;
3233         struct task_struct *task = task_event->task;
3234         int ret;
3235
3236         size  = task_event->event_id.header.size;
3237         ret = perf_output_begin(&handle, event, size, 0, 0);
3238
3239         if (ret)
3240                 return;
3241
3242         task_event->event_id.pid = perf_event_pid(event, task);
3243         task_event->event_id.ppid = perf_event_pid(event, current);
3244
3245         task_event->event_id.tid = perf_event_tid(event, task);
3246         task_event->event_id.ptid = perf_event_tid(event, current);
3247
3248         task_event->event_id.time = perf_clock();
3249
3250         perf_output_put(&handle, task_event->event_id);
3251
3252         perf_output_end(&handle);
3253 }
3254
3255 static int perf_event_task_match(struct perf_event *event)
3256 {
3257         if (event->attr.comm || event->attr.mmap || event->attr.task)
3258                 return 1;
3259
3260         return 0;
3261 }
3262
3263 static void perf_event_task_ctx(struct perf_event_context *ctx,
3264                                   struct perf_task_event *task_event)
3265 {
3266         struct perf_event *event;
3267
3268         if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
3269                 return;
3270
3271         rcu_read_lock();
3272         list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
3273                 if (perf_event_task_match(event))
3274                         perf_event_task_output(event, task_event);
3275         }
3276         rcu_read_unlock();
3277 }
3278
3279 static void perf_event_task_event(struct perf_task_event *task_event)
3280 {
3281         struct perf_cpu_context *cpuctx;
3282         struct perf_event_context *ctx = task_event->task_ctx;
3283
3284         cpuctx = &get_cpu_var(perf_cpu_context);
3285         perf_event_task_ctx(&cpuctx->ctx, task_event);
3286         put_cpu_var(perf_cpu_context);
3287
3288         rcu_read_lock();
3289         if (!ctx)
3290                 ctx = rcu_dereference(task_event->task->perf_event_ctxp);
3291         if (ctx)
3292                 perf_event_task_ctx(ctx, task_event);
3293         rcu_read_unlock();
3294 }
3295
3296 static void perf_event_task(struct task_struct *task,
3297                               struct perf_event_context *task_ctx,
3298                               int new)
3299 {
3300         struct perf_task_event task_event;
3301
3302         if (!atomic_read(&nr_comm_events) &&
3303             !atomic_read(&nr_mmap_events) &&
3304             !atomic_read(&nr_task_events))
3305                 return;
3306
3307         task_event = (struct perf_task_event){
3308                 .task     = task,
3309                 .task_ctx = task_ctx,
3310                 .event_id    = {
3311                         .header = {
3312                                 .type = new ? PERF_RECORD_FORK : PERF_RECORD_EXIT,
3313                                 .misc = 0,
3314                                 .size = sizeof(task_event.event_id),
3315                         },
3316                         /* .pid  */
3317                         /* .ppid */
3318                         /* .tid  */
3319                         /* .ptid */
3320                 },
3321         };
3322
3323         perf_event_task_event(&task_event);
3324 }
3325
3326 void perf_event_fork(struct task_struct *task)
3327 {
3328         perf_event_task(task, NULL, 1);
3329 }
3330
3331 /*
3332  * comm tracking
3333  */
3334
3335 struct perf_comm_event {
3336         struct task_struct      *task;
3337         char                    *comm;
3338         int                     comm_size;
3339
3340         struct {
3341                 struct perf_event_header        header;
3342
3343                 u32                             pid;
3344                 u32                             tid;
3345         } event_id;
3346 };
3347
3348 static void perf_event_comm_output(struct perf_event *event,
3349                                      struct perf_comm_event *comm_event)
3350 {
3351         struct perf_output_handle handle;
3352         int size = comm_event->event_id.header.size;
3353         int ret = perf_output_begin(&handle, event, size, 0, 0);
3354
3355         if (ret)
3356                 return;
3357
3358         comm_event->event_id.pid = perf_event_pid(event, comm_event->task);
3359         comm_event->event_id.tid = perf_event_tid(event, comm_event->task);
3360
3361         perf_output_put(&handle, comm_event->event_id);
3362         perf_output_copy(&handle, comm_event->comm,
3363                                    comm_event->comm_size);
3364         perf_output_end(&handle);
3365 }
3366
3367 static int perf_event_comm_match(struct perf_event *event)
3368 {
3369         if (event->attr.comm)
3370                 return 1;
3371
3372         return 0;
3373 }
3374
3375 static void perf_event_comm_ctx(struct perf_event_context *ctx,
3376                                   struct perf_comm_event *comm_event)
3377 {
3378         struct perf_event *event;
3379
3380         if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
3381                 return;
3382
3383         rcu_read_lock();
3384         list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
3385                 if (perf_event_comm_match(event))
3386                         perf_event_comm_output(event, comm_event);
3387         }
3388         rcu_read_unlock();
3389 }
3390
3391 static void perf_event_comm_event(struct perf_comm_event *comm_event)
3392 {
3393         struct perf_cpu_context *cpuctx;
3394         struct perf_event_context *ctx;
3395         unsigned int size;
3396         char comm[TASK_COMM_LEN];
3397
3398         memset(comm, 0, sizeof(comm));
3399         strncpy(comm, comm_event->task->comm, sizeof(comm));
3400         size = ALIGN(strlen(comm)+1, sizeof(u64));
3401
3402         comm_event->comm = comm;
3403         comm_event->comm_size = size;
3404
3405         comm_event->event_id.header.size = sizeof(comm_event->event_id) + size;
3406
3407         cpuctx = &get_cpu_var(perf_cpu_context);
3408         perf_event_comm_ctx(&cpuctx->ctx, comm_event);
3409         put_cpu_var(perf_cpu_context);
3410
3411         rcu_read_lock();
3412         /*
3413          * doesn't really matter which of the child contexts the
3414          * events ends up in.
3415          */
3416         ctx = rcu_dereference(current->perf_event_ctxp);
3417         if (ctx)
3418                 perf_event_comm_ctx(ctx, comm_event);
3419         rcu_read_unlock();
3420 }
3421
3422 void perf_event_comm(struct task_struct *task)
3423 {
3424         struct perf_comm_event comm_event;
3425
3426         if (task->perf_event_ctxp)
3427                 perf_event_enable_on_exec(task);
3428
3429         if (!atomic_read(&nr_comm_events))
3430                 return;
3431
3432         comm_event = (struct perf_comm_event){
3433                 .task   = task,
3434                 /* .comm      */
3435                 /* .comm_size */
3436                 .event_id  = {
3437                         .header = {
3438                                 .type = PERF_RECORD_COMM,
3439                                 .misc = 0,
3440                                 /* .size */
3441                         },
3442                         /* .pid */
3443                         /* .tid */
3444                 },
3445         };
3446
3447         perf_event_comm_event(&comm_event);
3448 }
3449
3450 /*
3451  * mmap tracking
3452  */
3453
3454 struct perf_mmap_event {
3455         struct vm_area_struct   *vma;
3456
3457         const char              *file_name;
3458         int                     file_size;
3459
3460         struct {
3461                 struct perf_event_header        header;
3462
3463                 u32                             pid;
3464                 u32                             tid;
3465                 u64                             start;
3466                 u64                             len;
3467                 u64                             pgoff;
3468         } event_id;
3469 };
3470
3471 static void perf_event_mmap_output(struct perf_event *event,
3472                                      struct perf_mmap_event *mmap_event)
3473 {
3474         struct perf_output_handle handle;
3475         int size = mmap_event->event_id.header.size;
3476         int ret = perf_output_begin(&handle, event, size, 0, 0);
3477
3478         if (ret)
3479                 return;
3480
3481         mmap_event->event_id.pid = perf_event_pid(event, current);
3482         mmap_event->event_id.tid = perf_event_tid(event, current);
3483
3484         perf_output_put(&handle, mmap_event->event_id);
3485         perf_output_copy(&handle, mmap_event->file_name,
3486                                    mmap_event->file_size);
3487         perf_output_end(&handle);
3488 }
3489
3490 static int perf_event_mmap_match(struct perf_event *event,
3491                                    struct perf_mmap_event *mmap_event)
3492 {
3493         if (event->attr.mmap)
3494                 return 1;
3495
3496         return 0;
3497 }
3498
3499 static void perf_event_mmap_ctx(struct perf_event_context *ctx,
3500                                   struct perf_mmap_event *mmap_event)
3501 {
3502         struct perf_event *event;
3503
3504         if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
3505                 return;
3506
3507         rcu_read_lock();
3508         list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
3509                 if (perf_event_mmap_match(event, mmap_event))
3510                         perf_event_mmap_output(event, mmap_event);
3511         }
3512         rcu_read_unlock();
3513 }
3514
3515 static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
3516 {
3517         struct perf_cpu_context *cpuctx;
3518         struct perf_event_context *ctx;
3519         struct vm_area_struct *vma = mmap_event->vma;
3520         struct file *file = vma->vm_file;
3521         unsigned int size;
3522         char tmp[16];
3523         char *buf = NULL;
3524         const char *name;
3525
3526         memset(tmp, 0, sizeof(tmp));
3527
3528         if (file) {
3529                 /*
3530                  * d_path works from the end of the buffer backwards, so we
3531                  * need to add enough zero bytes after the string to handle
3532                  * the 64bit alignment we do later.
3533                  */
3534                 buf = kzalloc(PATH_MAX + sizeof(u64), GFP_KERNEL);
3535                 if (!buf) {
3536                         name = strncpy(tmp, "//enomem", sizeof(tmp));
3537                         goto got_name;
3538                 }
3539                 name = d_path(&file->f_path, buf, PATH_MAX);
3540                 if (IS_ERR(name)) {
3541                         name = strncpy(tmp, "//toolong", sizeof(tmp));
3542                         goto got_name;
3543                 }
3544         } else {
3545                 if (arch_vma_name(mmap_event->vma)) {
3546                         name = strncpy(tmp, arch_vma_name(mmap_event->vma),
3547                                        sizeof(tmp));
3548                         goto got_name;
3549                 }
3550
3551                 if (!vma->vm_mm) {
3552                         name = strncpy(tmp, "[vdso]", sizeof(tmp));
3553                         goto got_name;
3554                 }
3555
3556                 name = strncpy(tmp, "//anon", sizeof(tmp));
3557                 goto got_name;
3558         }
3559
3560 got_name:
3561         size = ALIGN(strlen(name)+1, sizeof(u64));
3562
3563         mmap_event->file_name = name;
3564         mmap_event->file_size = size;
3565
3566         mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size;
3567
3568         cpuctx = &get_cpu_var(perf_cpu_context);
3569         perf_event_mmap_ctx(&cpuctx->ctx, mmap_event);
3570         put_cpu_var(perf_cpu_context);
3571
3572         rcu_read_lock();
3573         /*
3574          * doesn't really matter which of the child contexts the
3575          * events ends up in.
3576          */
3577         ctx = rcu_dereference(current->perf_event_ctxp);
3578         if (ctx)
3579                 perf_event_mmap_ctx(ctx, mmap_event);
3580         rcu_read_unlock();
3581
3582         kfree(buf);
3583 }
3584
3585 void __perf_event_mmap(struct vm_area_struct *vma)
3586 {
3587         struct perf_mmap_event mmap_event;
3588
3589         if (!atomic_read(&nr_mmap_events))
3590                 return;
3591
3592         mmap_event = (struct perf_mmap_event){
3593                 .vma    = vma,
3594                 /* .file_name */
3595                 /* .file_size */
3596                 .event_id  = {
3597                         .header = {
3598                                 .type = PERF_RECORD_MMAP,
3599                                 .misc = 0,
3600                                 /* .size */
3601                         },
3602                         /* .pid */
3603                         /* .tid */
3604                         .start  = vma->vm_start,
3605                         .len    = vma->vm_end - vma->vm_start,
3606                         .pgoff  = vma->vm_pgoff,
3607                 },
3608         };
3609
3610         perf_event_mmap_event(&mmap_event);
3611 }
3612
3613 /*
3614  * IRQ throttle logging
3615  */
3616
3617 static void perf_log_throttle(struct perf_event *event, int enable)
3618 {
3619         struct perf_output_handle handle;
3620         int ret;
3621
3622         struct {
3623                 struct perf_event_header        header;
3624                 u64                             time;
3625                 u64                             id;
3626                 u64                             stream_id;
3627         } throttle_event = {
3628                 .header = {
3629                         .type = PERF_RECORD_THROTTLE,
3630                         .misc = 0,
3631                         .size = sizeof(throttle_event),
3632                 },
3633                 .time           = perf_clock(),
3634                 .id             = primary_event_id(event),
3635                 .stream_id      = event->id,
3636         };
3637
3638         if (enable)
3639                 throttle_event.header.type = PERF_RECORD_UNTHROTTLE;
3640
3641         ret = perf_output_begin(&handle, event, sizeof(throttle_event), 1, 0);
3642         if (ret)
3643                 return;
3644
3645         perf_output_put(&handle, throttle_event);
3646         perf_output_end(&handle);
3647 }
3648
3649 /*
3650  * Generic event overflow handling, sampling.
3651  */
3652
3653 static int __perf_event_overflow(struct perf_event *event, int nmi,
3654                                    int throttle, struct perf_sample_data *data,
3655                                    struct pt_regs *regs)
3656 {
3657         int events = atomic_read(&event->event_limit);
3658         struct hw_perf_event *hwc = &event->hw;
3659         int ret = 0;
3660
3661         throttle = (throttle && event->pmu->unthrottle != NULL);
3662
3663         if (!throttle) {
3664                 hwc->interrupts++;
3665         } else {
3666                 if (hwc->interrupts != MAX_INTERRUPTS) {
3667                         hwc->interrupts++;
3668                         if (HZ * hwc->interrupts >
3669                                         (u64)sysctl_perf_event_sample_rate) {
3670                                 hwc->interrupts = MAX_INTERRUPTS;
3671                                 perf_log_throttle(event, 0);
3672                                 ret = 1;
3673                         }
3674                 } else {
3675                         /*
3676                          * Keep re-disabling events even though on the previous
3677                          * pass we disabled it - just in case we raced with a
3678                          * sched-in and the event got enabled again:
3679                          */
3680                         ret = 1;
3681                 }
3682         }
3683
3684         if (event->attr.freq) {
3685                 u64 now = perf_clock();
3686                 s64 delta = now - hwc->freq_stamp;
3687
3688                 hwc->freq_stamp = now;
3689
3690                 if (delta > 0 && delta < TICK_NSEC)
3691                         perf_adjust_period(event, NSEC_PER_SEC / (int)delta);
3692         }
3693
3694         /*
3695          * XXX event_limit might not quite work as expected on inherited
3696          * events
3697          */
3698
3699         event->pending_kill = POLL_IN;
3700         if (events && atomic_dec_and_test(&event->event_limit)) {
3701                 ret = 1;
3702                 event->pending_kill = POLL_HUP;
3703                 if (nmi) {
3704                         event->pending_disable = 1;
3705                         perf_pending_queue(&event->pending,
3706                                            perf_pending_event);
3707                 } else
3708                         perf_event_disable(event);
3709         }
3710
3711         perf_event_output(event, nmi, data, regs);
3712         return ret;
3713 }
3714
/*
 * Overflow entry point: handles the overflow with throttling requested
 * and returns the result of __perf_event_overflow().
 */
int perf_event_overflow(struct perf_event *event, int nmi,
			  struct perf_sample_data *data,
			  struct pt_regs *regs)
{
	const int throttle = 1;

	return __perf_event_overflow(event, nmi, throttle, data, regs);
}
3721
3722 /*
3723  * Generic software event infrastructure
3724  */
3725
3726 /*
3727  * We directly increment event->count and keep a second value in
3728  * event->hw.period_left to count intervals. This period event
3729  * is kept in the range [-sample_period, 0] so that we can use the
3730  * sign as trigger.
3731  */
3732
3733 static u64 perf_swevent_set_period(struct perf_event *event)
3734 {
3735         struct hw_perf_event *hwc = &event->hw;
3736         u64 period = hwc->last_period;
3737         u64 nr, offset;
3738         s64 old, val;
3739
3740         hwc->last_period = hwc->sample_period;
3741
3742 again:
3743         old = val = atomic64_read(&hwc->period_left);
3744         if (val < 0)
3745                 return 0;
3746
3747         nr = div64_u64(period + val, period);
3748         offset = nr * period;
3749         val -= offset;
3750         if (atomic64_cmpxchg(&hwc->period_left, old, val) != old)
3751                 goto again;
3752
3753         return nr;
3754 }
3755
3756 static void perf_swevent_overflow(struct perf_event *event,
3757                                     int nmi, struct perf_sample_data *data,
3758                                     struct pt_regs *regs)
3759 {
3760         struct hw_perf_event *hwc = &event->hw;
3761         int throttle = 0;
3762         u64 overflow;
3763
3764         data->period = event->hw.last_period;
3765         overflow = perf_swevent_set_period(event);
3766
3767         if (hwc->interrupts == MAX_INTERRUPTS)
3768                 return;
3769
3770         for (; overflow; overflow--) {
3771                 if (__perf_event_overflow(event, nmi, throttle,
3772                                             data, regs)) {
3773                         /*
3774                          * We inhibit the overflow from happening when
3775                          * hwc->interrupts == MAX_INTERRUPTS.
3776                          */
3777                         break;
3778                 }
3779                 throttle = 1;
3780         }
3781 }
3782
/*
 * ->unthrottle() callback of the generic software PMU.  Intentionally
 * empty: hwc->interrupts has already been reset by the time this runs.
 */
static void perf_swevent_unthrottle(struct perf_event *event)
{
}
3789
3790 static void perf_swevent_add(struct perf_event *event, u64 nr,
3791                                int nmi, struct perf_sample_data *data,
3792                                struct pt_regs *regs)
3793 {
3794         struct hw_perf_event *hwc = &event->hw;
3795
3796         atomic64_add(nr, &event->count);
3797
3798         if (!hwc->sample_period)
3799                 return;
3800
3801         if (!regs)
3802                 return;
3803
3804         if (!atomic64_add_negative(nr, &hwc->period_left))
3805                 perf_swevent_overflow(event, nmi, data, regs);
3806 }
3807
3808 static int perf_swevent_is_counting(struct perf_event *event)
3809 {
3810         /*
3811          * The event is active, we're good!
3812          */
3813         if (event->state == PERF_EVENT_STATE_ACTIVE)
3814                 return 1;
3815
3816         /*
3817          * The event is off/error, not counting.
3818          */
3819         if (event->state != PERF_EVENT_STATE_INACTIVE)
3820                 return 0;
3821
3822         /*
3823          * The event is inactive, if the context is active
3824          * we're part of a group that didn't make it on the 'pmu',
3825          * not counting.
3826          */
3827         if (event->ctx->is_active)
3828                 return 0;
3829
3830         /*
3831          * We're inactive and the context is too, this means the
3832          * task is scheduled out, we're counting events that happen
3833          * to us, like migration events.
3834          */
3835         return 1;
3836 }
3837
3838 static int perf_tp_event_match(struct perf_event *event,
3839                                 struct perf_sample_data *data);
3840
3841 static int perf_swevent_match(struct perf_event *event,
3842                                 enum perf_type_id type,
3843                                 u32 event_id,
3844                                 struct perf_sample_data *data,
3845                                 struct pt_regs *regs)
3846 {
3847         if (!perf_swevent_is_counting(event))
3848                 return 0;
3849
3850         if (event->attr.type != type)
3851                 return 0;
3852         if (event->attr.config != event_id)
3853                 return 0;
3854
3855         if (regs) {
3856                 if (event->attr.exclude_user && user_mode(regs))
3857                         return 0;
3858
3859                 if (event->attr.exclude_kernel && !user_mode(regs))
3860                         return 0;
3861         }
3862
3863         if (event->attr.type == PERF_TYPE_TRACEPOINT &&
3864             !perf_tp_event_match(event, data))
3865                 return 0;
3866
3867         return 1;
3868 }
3869
3870 static void perf_swevent_ctx_event(struct perf_event_context *ctx,
3871                                      enum perf_type_id type,
3872                                      u32 event_id, u64 nr, int nmi,
3873                                      struct perf_sample_data *data,
3874                                      struct pt_regs *regs)
3875 {
3876         struct perf_event *event;
3877
3878         if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
3879                 return;
3880
3881         rcu_read_lock();
3882         list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
3883                 if (perf_swevent_match(event, type, event_id, data, regs))
3884                         perf_swevent_add(event, nr, nmi, data, regs);
3885         }
3886         rcu_read_unlock();
3887 }
3888
3889 static int *perf_swevent_recursion_context(struct perf_cpu_context *cpuctx)
3890 {
3891         if (in_nmi())
3892                 return &cpuctx->recursion[3];
3893
3894         if (in_irq())
3895                 return &cpuctx->recursion[2];
3896
3897         if (in_softirq())
3898                 return &cpuctx->recursion[1];
3899
3900         return &cpuctx->recursion[0];
3901 }
3902
3903 static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
3904                                     u64 nr, int nmi,
3905                                     struct perf_sample_data *data,
3906                                     struct pt_regs *regs)
3907 {
3908         struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context);
3909         int *recursion = perf_swevent_recursion_context(cpuctx);
3910         struct perf_event_context *ctx;
3911
3912         if (*recursion)
3913                 goto out;
3914
3915         (*recursion)++;
3916         barrier();
3917
3918         perf_swevent_ctx_event(&cpuctx->ctx, type, event_id,
3919                                  nr, nmi, data, regs);
3920         rcu_read_lock();
3921         /*
3922          * doesn't really matter which of the child contexts the
3923          * events ends up in.
3924          */
3925         ctx = rcu_dereference(current->perf_event_ctxp);
3926         if (ctx)
3927                 perf_swevent_ctx_event(ctx, type, event_id, nr, nmi, data, regs);
3928         rcu_read_unlock();
3929
3930         barrier();
3931         (*recursion)--;
3932
3933 out:
3934         put_cpu_var(perf_cpu_context);
3935 }
3936
3937 void __perf_sw_event(u32 event_id, u64 nr, int nmi,
3938                             struct pt_regs *regs, u64 addr)
3939 {
3940         struct perf_sample_data data = {
3941                 .addr = addr,
3942         };
3943
3944         do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, nmi,
3945                                 &data, regs);
3946 }
3947
/*
 * ->read() callback of the generic software PMU.  Intentionally empty:
 * perf_swevent_add() adds to event->count directly, so there is nothing
 * to fold in here.
 */
static void perf_swevent_read(struct perf_event *event)
{
}
3951
3952 static int perf_swevent_enable(struct perf_event *event)
3953 {
3954         struct hw_perf_event *hwc = &event->hw;
3955
3956         if (hwc->sample_period) {
3957                 hwc->last_period = hwc->sample_period;
3958                 perf_swevent_set_period(event);
3959         }
3960         return 0;
3961 }
3962
/*
 * ->disable() callback of the generic software PMU.  Intentionally empty.
 */
static void perf_swevent_disable(struct perf_event *event)
{
}
3966
3967 static const struct pmu perf_ops_generic = {
3968         .enable         = perf_swevent_enable,
3969         .disable        = perf_swevent_disable,
3970         .read           = perf_swevent_read,
3971         .unthrottle     = perf_swevent_unthrottle,
3972 };
3973
3974 /*
3975  * hrtimer based swevent callback
3976  */
3977
3978 static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
3979 {
3980         enum hrtimer_restart ret = HRTIMER_RESTART;
3981         struct perf_sample_data data;
3982         struct pt_regs *regs;
3983         struct perf_event *event;
3984         u64 period;
3985
3986         event   = container_of(hrtimer, struct perf_event, hw.hrtimer);
3987         event->pmu->read(event);
3988
3989         data.addr = 0;
3990         regs = get_irq_regs();
3991         /*
3992          * In case we exclude kernel IPs or are somehow not in interrupt
3993          * context, provide the next best thing, the user IP.
3994          */
3995         if ((event->attr.exclude_kernel || !regs) &&
3996                         !event->attr.exclude_user)
3997                 regs = task_pt_regs(current);
3998
3999         if (regs) {
4000                 if (perf_event_overflow(event, 0, &data, regs))
4001                         ret = HRTIMER_NORESTART;
4002         }
4003
4004         period = max_t(u64, 10000, event->hw.sample_period);
4005         hrtimer_forward_now(hrtimer, ns_to_ktime(period));
4006
4007         return ret;
4008 }
4009
4010 /*
4011  * Software event: cpu wall time clock
4012  */
4013
4014 static void cpu_clock_perf_event_update(struct perf_event *event)
4015 {
4016         int cpu = raw_smp_processor_id();
4017         s64 prev;
4018         u64 now;
4019
4020         now = cpu_clock(cpu);
4021         prev = atomic64_read(&event->hw.prev_count);
4022         atomic64_set(&event->hw.prev_count, now);
4023         atomic64_add(now - prev, &event->count);
4024 }
4025
4026 static int cpu_clock_perf_event_enable(struct perf_event *event)
4027 {
4028         struct hw_perf_event *hwc = &event->hw;
4029         int cpu = raw_smp_processor_id();
4030
4031         atomic64_set(&hwc->prev_count, cpu_clock(cpu));
4032         hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
4033         hwc->hrtimer.function = perf_swevent_hrtimer;
4034         if (hwc->sample_period) {
4035                 u64 period = max_t(u64, 10000, hwc->sample_period);
4036                 __hrtimer_start_range_ns(&hwc->hrtimer,
4037                                 ns_to_ktime(period), 0,
4038                                 HRTIMER_MODE_REL, 0);
4039         }
4040
4041         return 0;
4042 }
4043
4044 static void cpu_clock_perf_event_disable(struct perf_event *event)
4045 {
4046         if (event->hw.sample_period)
4047                 hrtimer_cancel(&event->hw.hrtimer);
4048         cpu_clock_perf_event_update(event);
4049 }
4050
/*
 * ->read() callback of the cpu-clock event: bring event->count up to the
 * current cpu_clock() time.
 */
static void cpu_clock_perf_event_read(struct perf_event *event)
{
	cpu_clock_perf_event_update(event);
}
4055
4056 static const struct pmu perf_ops_cpu_clock = {
4057         .enable         = cpu_clock_perf_event_enable,
4058         .disable        = cpu_clock_perf_event_disable,
4059         .read           = cpu_clock_perf_event_read,
4060 };
4061
4062 /*
4063  * Software event: task time clock
4064  */
4065
4066 static void task_clock_perf_event_update(struct perf_event *event, u64 now)
4067 {
4068         u64 prev;
4069         s64 delta;
4070
4071         prev = atomic64_xchg(&event->hw.prev_count, now);
4072         delta = now - prev;
4073         atomic64_add(delta, &event->count);
4074 }
4075
4076 static int task_clock_perf_event_enable(struct perf_event *event)
4077 {
4078         struct hw_perf_event *hwc = &event->hw;
4079         u64 now;
4080
4081         now = event->ctx->time;
4082
4083         atomic64_set(&hwc->prev_count, now);
4084         hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
4085         hwc->hrtimer.function = perf_swevent_hrtimer;
4086         if (hwc->sample_period) {
4087                 u64 period = max_t(u64, 10000, hwc->sample_period);
4088                 __hrtimer_start_range_ns(&hwc->hrtimer,
4089                                 ns_to_ktime(period), 0,
4090                                 HRTIMER_MODE_REL, 0);
4091         }
4092
4093         return 0;
4094 }
4095
4096 static void task_clock_perf_event_disable(struct perf_event *event)
4097 {
4098         if (event->hw.sample_period)
4099                 hrtimer_cancel(&event->hw.hrtimer);
4100         task_clock_perf_event_update(event, event->ctx->time);
4101
4102 }
4103
4104 static void task_clock_perf_event_read(struct perf_event *event)
4105 {
4106         u64 time;
4107
4108         if (!in_nmi()) {
4109                 update_context_time(event->ctx);
4110                 time = event->ctx->time;
4111         } else {
4112                 u64 now = perf_clock();
4113                 u64 delta = now - event->ctx->timestamp;
4114                 time = event->ctx->time + delta;
4115         }
4116
4117         task_clock_perf_event_update(event, time);
4118 }
4119
4120 static const struct pmu perf_ops_task_clock = {
4121         .enable         = task_clock_perf_event_enable,
4122         .disable        = task_clock_perf_event_disable,
4123         .read           = task_clock_perf_event_read,
4124 };
4125
4126 #ifdef CONFIG_EVENT_PROFILE
4127
4128 void perf_tp_event(int event_id, u64 addr, u64 count, void *record,
4129                           int entry_size)
4130 {
4131         struct perf_raw_record raw = {
4132                 .size = entry_size,
4133                 .data = record,
4134         };
4135
4136         struct perf_sample_data data = {
4137                 .addr = addr,
4138                 .raw = &raw,
4139         };
4140
4141         struct pt_regs *regs = get_irq_regs();
4142
4143         if (!regs)
4144                 regs = task_pt_regs(current);
4145
4146         do_perf_sw_event(PERF_TYPE_TRACEPOINT, event_id, count, 1,
4147                                 &data, regs);
4148 }
4149 EXPORT_SYMBOL_GPL(perf_tp_event);
4150
4151 static int perf_tp_event_match(struct perf_event *event,
4152                                 struct perf_sample_data *data)
4153 {
4154         void *record = data->raw->data;
4155
4156         if (likely(!event->filter) || filter_match_preds(event->filter, record))
4157                 return 1;
4158         return 0;
4159 }
4160
4161 static void tp_perf_event_destroy(struct perf_event *event)
4162 {
4163         ftrace_profile_disable(event->attr.config);
4164 }
4165
4166 static const struct pmu *tp_perf_event_init(struct perf_event *event)
4167 {
4168         /*
4169          * Raw tracepoint data is a severe data leak, only allow root to
4170          * have these.
4171          */
4172         if ((event->attr.sample_type & PERF_SAMPLE_RAW) &&
4173                         perf_paranoid_tracepoint_raw() &&
4174                         !capable(CAP_SYS_ADMIN))
4175                 return ERR_PTR(-EPERM);
4176
4177         if (ftrace_profile_enable(event->attr.config))
4178                 return NULL;
4179
4180         event->destroy = tp_perf_event_destroy;
4181
4182         return &perf_ops_generic;
4183 }
4184
4185 static int perf_event_set_filter(struct perf_event *event, void __user *arg)
4186 {
4187         char *filter_str;
4188         int ret;
4189
4190         if (event->attr.type != PERF_TYPE_TRACEPOINT)
4191                 return -EINVAL;
4192
4193         filter_str = strndup_user(arg, PAGE_SIZE);
4194         if (IS_ERR(filter_str))
4195                 return PTR_ERR(filter_str);
4196
4197         ret = ftrace_profile_set_filter(event, event->attr.config, filter_str);
4198
4199         kfree(filter_str);
4200         return ret;
4201 }
4202
/*
 * Release the event's tracepoint filter via ftrace_profile_free_filter().
 */
static void perf_event_free_filter(struct perf_event *event)
{
	ftrace_profile_free_filter(event);
}
4207
4208 #else
4209
/*
 * Stub used when CONFIG_EVENT_PROFILE is not set: there are no
 * tracepoint filters, so every sample matches.
 */
static int perf_tp_event_match(struct perf_event *event,
			       struct perf_sample_data *data)
{
	return 1;
}
4215
4216 static const struct pmu *tp_perf_event_init(struct perf_event *event)
4217 {
4218         return NULL;
4219 }
4220
4221 static int perf_event_set_filter(struct perf_event *event, void __user *arg)
4222 {
4223         return -ENOENT;
4224 }
4225
/*
 * Stub used when CONFIG_EVENT_PROFILE is not set: there is never a
 * filter to free.
 */
static void perf_event_free_filter(struct perf_event *event)
{
}
4229
4230 #endif /* CONFIG_EVENT_PROFILE */
4231
4232 atomic_t perf_swevent_enabled[PERF_COUNT_SW_MAX];
4233
4234 static void sw_perf_event_destroy(struct perf_event *event)
4235 {
4236         u64 event_id = event->attr.config;
4237
4238         WARN_ON(event->parent);
4239
4240         atomic_dec(&perf_swevent_enabled[event_id]);
4241 }
4242
4243 static const struct pmu *sw_perf_event_init(struct perf_event *event)
4244 {
4245         const struct pmu *pmu = NULL;
4246         u64 event_id = event->attr.config;
4247
4248         /*
4249          * Software events (currently) can't in general distinguish
4250          * between user, kernel and hypervisor events.
4251          * However, context switches and cpu migrations are considered
4252          * to be kernel events, and page faults are never hypervisor
4253          * events.
4254          */
4255         switch (event_id) {
4256         case PERF_COUNT_SW_CPU_CLOCK:
4257                 pmu = &perf_ops_cpu_clock;
4258
4259                 break;
4260         case PERF_COUNT_SW_TASK_CLOCK:
4261                 /*
4262                  * If the user instantiates this as a per-cpu event,
4263                  * use the cpu_clock event instead.
4264                  */
4265                 if (event->ctx->task)
4266                         pmu = &perf_ops_task_clock;
4267                 else
4268                         pmu = &perf_ops_cpu_clock;
4269
4270                 break;
4271         case PERF_COUNT_SW_PAGE_FAULTS:
4272         case PERF_COUNT_SW_PAGE_FAULTS_MIN:
4273         case PERF_COUNT_SW_PAGE_FAULTS_MAJ:
4274         case PERF_COUNT_SW_CONTEXT_SWITCHES:
4275         case PERF_COUNT_SW_CPU_MIGRATIONS:
4276                 if (!event->parent) {
4277                         atomic_inc(&perf_swevent_enabled[event_id]);
4278                         event->destroy = sw_perf_event_destroy;
4279                 }
4280                 pmu = &perf_ops_generic;
4281                 break;
4282         }
4283
4284         return pmu;
4285 }
4286
4287 /*
4288  * Allocate and initialize a event structure
4289  */
4290 static struct perf_event *
4291 perf_event_alloc(struct perf_event_attr *attr,
4292                    int cpu,
4293                    struct perf_event_context *ctx,
4294                    struct perf_event *group_leader,
4295                    struct perf_event *parent_event,
4296                    gfp_t gfpflags)
4297 {
4298         const struct pmu *pmu;
4299         struct perf_event *event;
4300         struct hw_perf_event *hwc;
4301         long err;
4302
4303         event = kzalloc(sizeof(*event), gfpflags);
4304         if (!event)
4305                 return ERR_PTR(-ENOMEM);
4306
4307         /*
4308          * Single events are their own group leaders, with an
4309          * empty sibling list:
4310          */
4311         if (!group_leader)
4312                 group_leader = event;
4313
4314         mutex_init(&event->child_mutex);
4315         INIT_LIST_HEAD(&event->child_list);
4316
4317         INIT_LIST_HEAD(&event->group_entry);
4318         INIT_LIST_HEAD(&event->event_entry);
4319         INIT_LIST_HEAD(&event->sibling_list);
4320         init_waitqueue_head(&event->waitq);
4321
4322         mutex_init(&event->mmap_mutex);
4323
4324         event->cpu              = cpu;
4325         event->attr             = *attr;
4326         event->group_leader     = group_leader;
4327         event->pmu              = NULL;
4328         event->ctx              = ctx;
4329         event->oncpu            = -1;
4330
4331         event->parent           = parent_event;
4332
4333         event->ns               = get_pid_ns(current->nsproxy->pid_ns);
4334         event->id               = atomic64_inc_return(&perf_event_id);
4335
4336         event->state            = PERF_EVENT_STATE_INACTIVE;
4337
4338         if (attr->disabled)
4339                 event->state = PERF_EVENT_STATE_OFF;
4340
4341         pmu = NULL;
4342
4343         hwc = &event->hw;
4344         hwc->sample_period = attr->sample_period;
4345         if (attr->freq && attr->sample_freq)
4346                 hwc->sample_period = 1;
4347         hwc->last_period = hwc->sample_period;
4348
4349         atomic64_set(&hwc->period_left, hwc->sample_period);
4350
4351         /*
4352          * we currently do not support PERF_FORMAT_GROUP on inherited events
4353          */
4354         if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP))
4355                 goto done;
4356
4357         switch (attr->type) {
4358         case PERF_TYPE_RAW:
4359         case PERF_TYPE_HARDWARE:
4360         case PERF_TYPE_HW_CACHE:
4361                 pmu = hw_perf_event_init(event);
4362                 break;
4363
4364         case PERF_TYPE_SOFTWARE:
4365                 pmu = sw_perf_event_init(event);
4366                 break;
4367
4368         case PERF_TYPE_TRACEPOINT:
4369                 pmu = tp_perf_event_init(event);
4370                 break;
4371
4372         default:
4373                 break;
4374         }
4375 done:
4376         err = 0;
4377         if (!pmu)
4378                 err = -EINVAL;
4379         else if (IS_ERR(pmu))
4380                 err = PTR_ERR(pmu);
4381
4382         if (err) {
4383                 if (event->ns)
4384                         put_pid_ns(event->ns);
4385                 kfree(event);
4386                 return ERR_PTR(err);
4387         }
4388
4389         event->pmu = pmu;
4390
4391         if (!event->parent) {
4392                 atomic_inc(&nr_events);
4393                 if (event->attr.mmap)
4394                         atomic_inc(&nr_mmap_events);
4395                 if (event->attr.comm)
4396                         atomic_inc(&nr_comm_events);
4397                 if (event->attr.task)
4398                         atomic_inc(&nr_task_events);
4399         }
4400
4401         return event;
4402 }
4403
4404 static int perf_copy_attr(struct perf_event_attr __user *uattr,
4405                           struct perf_event_attr *attr)
4406 {
4407         u32 size;
4408         int ret;
4409
4410         if (!access_ok(VERIFY_WRITE, uattr, PERF_ATTR_SIZE_VER0))
4411                 return -EFAULT;
4412
4413         /*
4414          * zero the full structure, so that a short copy will be nice.
4415          */
4416         memset(attr, 0, sizeof(*attr));
4417
4418         ret = get_user(size, &uattr->size);
4419         if (ret)
4420                 return ret;
4421
4422         if (size > PAGE_SIZE)   /* silly large */
4423                 goto err_size;
4424
4425         if (!size)              /* abi compat */
4426                 size = PERF_ATTR_SIZE_VER0;
4427
4428         if (size < PERF_ATTR_SIZE_VER0)
4429                 goto err_size;
4430
4431         /*
4432          * If we're handed a bigger struct than we know of,
4433          * ensure all the unknown bits are 0 - i.e. new
4434          * user-space does not rely on any kernel feature
4435          * extensions we dont know about yet.
4436          */
4437         if (size > sizeof(*attr)) {
4438                 unsigned char __user *addr;
4439                 unsigned char __user *end;
4440                 unsigned char val;
4441
4442                 addr = (void __user *)uattr + sizeof(*attr);
4443                 end  = (void __user *)uattr + size;
4444
4445                 for (; addr < end; addr++) {
4446                         ret = get_user(val, addr);
4447                         if (ret)
4448                                 return ret;
4449                         if (val)
4450                                 goto err_size;
4451                 }
4452                 size = sizeof(*attr);
4453         }
4454
4455         ret = copy_from_user(attr, uattr, size);
4456         if (ret)
4457                 return -EFAULT;
4458
4459         /*
4460          * If the type exists, the corresponding creation will verify
4461          * the attr->config.
4462          */
4463         if (attr->type >= PERF_TYPE_MAX)
4464                 return -EINVAL;
4465
4466         if (attr->__reserved_1 || attr->__reserved_2 || attr->__reserved_3)
4467                 return -EINVAL;
4468
4469         if (attr->sample_type & ~(PERF_SAMPLE_MAX-1))
4470                 return -EINVAL;
4471
4472         if (attr->read_format & ~(PERF_FORMAT_MAX-1))
4473                 return -EINVAL;
4474
4475 out:
4476         return ret;
4477
4478 err_size:
4479         put_user(sizeof(*attr), &uattr->size);
4480         ret = -E2BIG;
4481         goto out;
4482 }
4483
4484 static int perf_event_set_output(struct perf_event *event, int output_fd)
4485 {
4486         struct perf_event *output_event = NULL;
4487         struct file *output_file = NULL;
4488         struct perf_event *old_output;
4489         int fput_needed = 0;
4490         int ret = -EINVAL;
4491
4492         if (!output_fd)
4493                 goto set;
4494
4495         output_file = fget_light(output_fd, &fput_needed);
4496         if (!output_file)
4497                 return -EBADF;
4498
4499         if (output_file->f_op != &perf_fops)
4500                 goto out;
4501
4502         output_event = output_file->private_data;
4503
4504         /* Don't chain output fds */
4505         if (output_event->output)
4506                 goto out;
4507
4508         /* Don't set an output fd when we already have an output channel */
4509         if (event->data)
4510                 goto out;
4511
4512         atomic_long_inc(&output_file->f_count);
4513
4514 set:
4515         mutex_lock(&event->mmap_mutex);
4516         old_output = event->output;
4517         rcu_assign_pointer(event->output, output_event);
4518         mutex_unlock(&event->mmap_mutex);
4519
4520         if (old_output) {
4521                 /*
4522                  * we need to make sure no existing perf_output_*()
4523                  * is still referencing this event.
4524                  */
4525                 synchronize_rcu();
4526                 fput(old_output->filp);
4527         }
4528
4529         ret = 0;
4530 out:
4531         fput_light(output_file, fput_needed);
4532         return ret;
4533 }
4534
4535 /**
4536  * sys_perf_event_open - open a performance event, associate it to a task/cpu
4537  *
4538  * @attr_uptr:  event_id type attributes for monitoring/sampling
4539  * @pid:                target pid
4540  * @cpu:                target cpu
4541  * @group_fd:           group leader event fd
4542  */
4543 SYSCALL_DEFINE5(perf_event_open,
4544                 struct perf_event_attr __user *, attr_uptr,
4545                 pid_t, pid, int, cpu, int, group_fd, unsigned long, flags)
4546 {
4547         struct perf_event *event, *group_leader;
4548         struct perf_event_attr attr;
4549         struct perf_event_context *ctx;
4550         struct file *event_file = NULL;
4551         struct file *group_file = NULL;
4552         int fput_needed = 0;
4553         int fput_needed2 = 0;
4554         int err;
4555
4556         /* for future expandability... */
4557         if (flags & ~(PERF_FLAG_FD_NO_GROUP | PERF_FLAG_FD_OUTPUT))
4558                 return -EINVAL;
4559
4560         err = perf_copy_attr(attr_uptr, &attr);
4561         if (err)
4562                 return err;
4563
4564         if (!attr.exclude_kernel) {
4565                 if (perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
4566                         return -EACCES;
4567         }
4568
4569         if (attr.freq) {
4570                 if (attr.sample_freq > sysctl_perf_event_sample_rate)
4571                         return -EINVAL;
4572         }
4573
4574         /*
4575          * Get the target context (task or percpu):
4576          */
4577         ctx = find_get_context(pid, cpu);
4578         if (IS_ERR(ctx))
4579                 return PTR_ERR(ctx);
4580
4581         /*
4582          * Look up the group leader (we will attach this event to it):
4583          */
4584         group_leader = NULL;
4585         if (group_fd != -1 && !(flags & PERF_FLAG_FD_NO_GROUP)) {
4586                 err = -EINVAL;
4587                 group_file = fget_light(group_fd, &fput_needed);
4588                 if (!group_file)
4589                         goto err_put_context;
4590                 if (group_file->f_op != &perf_fops)
4591                         goto err_put_context;
4592
4593                 group_leader = group_file->private_data;
4594                 /*
4595                  * Do not allow a recursive hierarchy (this new sibling
4596                  * becoming part of another group-sibling):
4597                  */
4598                 if (group_leader->group_leader != group_leader)
4599                         goto err_put_context;
4600                 /*
4601                  * Do not allow to attach to a group in a different
4602                  * task or CPU context:
4603                  */
4604                 if (group_leader->ctx != ctx)
4605                         goto err_put_context;
4606                 /*
4607                  * Only a group leader can be exclusive or pinned
4608                  */
4609                 if (attr.exclusive || attr.pinned)
4610                         goto err_put_context;
4611         }
4612
4613         event = perf_event_alloc(&attr, cpu, ctx, group_leader,
4614                                      NULL, GFP_KERNEL);
4615         err = PTR_ERR(event);
4616         if (IS_ERR(event))
4617                 goto err_put_context;
4618
4619         err = anon_inode_getfd("[perf_event]", &perf_fops, event, 0);
4620         if (err < 0)
4621                 goto err_free_put_context;
4622
4623         event_file = fget_light(err, &fput_needed2);
4624         if (!event_file)
4625                 goto err_free_put_context;
4626
4627         if (flags & PERF_FLAG_FD_OUTPUT) {
4628                 err = perf_event_set_output(event, group_fd);
4629                 if (err)
4630                         goto err_fput_free_put_context;
4631         }
4632
4633         event->filp = event_file;
4634         WARN_ON_ONCE(ctx->parent_ctx);
4635         mutex_lock(&ctx->mutex);
4636         perf_install_in_context(ctx, event, cpu);
4637         ++ctx->generation;
4638         mutex_unlock(&ctx->mutex);
4639
4640         event->owner = current;
4641         get_task_struct(current);
4642         mutex_lock(&current->perf_event_mutex);
4643         list_add_tail(&event->owner_entry, &current->perf_event_list);
4644         mutex_unlock(&current->perf_event_mutex);
4645
4646 err_fput_free_put_context:
4647         fput_light(event_file, fput_needed2);
4648
4649 err_free_put_context:
4650         if (err < 0)
4651                 kfree(event);
4652
4653 err_put_context:
4654         if (err < 0)
4655                 put_ctx(ctx);
4656
4657         fput_light(group_file, fput_needed);
4658
4659         return err;
4660 }
4661
4662 /**
4663  * perf_event_create_kernel_counter
4664  *
4665  * @attr: attributes of the counter to create
4666  * @cpu: cpu in which the counter is bound
4667  * @pid: task to profile
4668  */
4669 struct perf_event *
4670 perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
4671                                  pid_t pid)
4672 {
4673         struct perf_event *event;
4674         struct perf_event_context *ctx;
4675         int err;
4676
4677         /*
4678          * Get the target context (task or percpu):
4679          */
4680
4681         ctx = find_get_context(pid, cpu);
4682         if (IS_ERR(ctx))
4683                 return NULL ;
4684
4685         event = perf_event_alloc(attr, cpu, ctx, NULL,
4686                                      NULL, GFP_KERNEL);
4687         err = PTR_ERR(event);
4688         if (IS_ERR(event))
4689                 goto err_put_context;
4690
4691         event->filp = NULL;
4692         WARN_ON_ONCE(ctx->parent_ctx);
4693         mutex_lock(&ctx->mutex);
4694         perf_install_in_context(ctx, event, cpu);
4695         ++ctx->generation;
4696         mutex_unlock(&ctx->mutex);
4697
4698         event->owner = current;
4699         get_task_struct(current);
4700         mutex_lock(&current->perf_event_mutex);
4701         list_add_tail(&event->owner_entry, &current->perf_event_list);
4702         mutex_unlock(&current->perf_event_mutex);
4703
4704         return event;
4705
4706 err_put_context:
4707         if (err < 0)
4708                 put_ctx(ctx);
4709
4710         return NULL;
4711 }
4712 EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter);
4713
4714 /*
4715  * inherit a event from parent task to child task:
4716  */
4717 static struct perf_event *
4718 inherit_event(struct perf_event *parent_event,
4719               struct task_struct *parent,
4720               struct perf_event_context *parent_ctx,
4721               struct task_struct *child,
4722               struct perf_event *group_leader,
4723               struct perf_event_context *child_ctx)
4724 {
4725         struct perf_event *child_event;
4726
4727         /*
4728          * Instead of creating recursive hierarchies of events,
4729          * we link inherited events back to the original parent,
4730          * which has a filp for sure, which we use as the reference
4731          * count:
4732          */
4733         if (parent_event->parent)
4734                 parent_event = parent_event->parent;
4735
4736         child_event = perf_event_alloc(&parent_event->attr,
4737                                            parent_event->cpu, child_ctx,
4738                                            group_leader, parent_event,
4739                                            GFP_KERNEL);
4740         if (IS_ERR(child_event))
4741                 return child_event;
4742         get_ctx(child_ctx);
4743
4744         /*
4745          * Make the child state follow the state of the parent event,
4746          * not its attr.disabled bit.  We hold the parent's mutex,
4747          * so we won't race with perf_event_{en, dis}able_family.
4748          */
4749         if (parent_event->state >= PERF_EVENT_STATE_INACTIVE)
4750                 child_event->state = PERF_EVENT_STATE_INACTIVE;
4751         else
4752                 child_event->state = PERF_EVENT_STATE_OFF;
4753
4754         if (parent_event->attr.freq)
4755                 child_event->hw.sample_period = parent_event->hw.sample_period;
4756
4757         /*
4758          * Link it up in the child's context:
4759          */
4760         add_event_to_ctx(child_event, child_ctx);
4761
4762         /*
4763          * Get a reference to the parent filp - we will fput it
4764          * when the child event exits. This is safe to do because
4765          * we are in the parent and we know that the filp still
4766          * exists and has a nonzero count:
4767          */
4768         atomic_long_inc(&parent_event->filp->f_count);
4769
4770         /*
4771          * Link this into the parent event's child list
4772          */
4773         WARN_ON_ONCE(parent_event->ctx->parent_ctx);
4774         mutex_lock(&parent_event->child_mutex);
4775         list_add_tail(&child_event->child_list, &parent_event->child_list);
4776         mutex_unlock(&parent_event->child_mutex);
4777
4778         return child_event;
4779 }
4780
4781 static int inherit_group(struct perf_event *parent_event,
4782               struct task_struct *parent,
4783               struct perf_event_context *parent_ctx,
4784               struct task_struct *child,
4785               struct perf_event_context *child_ctx)
4786 {
4787         struct perf_event *leader;
4788         struct perf_event *sub;
4789         struct perf_event *child_ctr;
4790
4791         leader = inherit_event(parent_event, parent, parent_ctx,
4792                                  child, NULL, child_ctx);
4793         if (IS_ERR(leader))
4794                 return PTR_ERR(leader);
4795         list_for_each_entry(sub, &parent_event->sibling_list, group_entry) {
4796                 child_ctr = inherit_event(sub, parent, parent_ctx,
4797                                             child, leader, child_ctx);
4798                 if (IS_ERR(child_ctr))
4799                         return PTR_ERR(child_ctr);
4800         }
4801         return 0;
4802 }
4803
4804 static void sync_child_event(struct perf_event *child_event,
4805                                struct task_struct *child)
4806 {
4807         struct perf_event *parent_event = child_event->parent;
4808         u64 child_val;
4809
4810         if (child_event->attr.inherit_stat)
4811                 perf_event_read_event(child_event, child);
4812
4813         child_val = atomic64_read(&child_event->count);
4814
4815         /*
4816          * Add back the child's count to the parent's count:
4817          */
4818         atomic64_add(child_val, &parent_event->count);
4819         atomic64_add(child_event->total_time_enabled,
4820                      &parent_event->child_total_time_enabled);
4821         atomic64_add(child_event->total_time_running,
4822                      &parent_event->child_total_time_running);
4823
4824         /*
4825          * Remove this event from the parent's list
4826          */
4827         WARN_ON_ONCE(parent_event->ctx->parent_ctx);
4828         mutex_lock(&parent_event->child_mutex);
4829         list_del_init(&child_event->child_list);
4830         mutex_unlock(&parent_event->child_mutex);
4831
4832         /*
4833          * Release the parent event, if this was the last
4834          * reference to it.
4835          */
4836         fput(parent_event->filp);
4837 }
4838
4839 static void
4840 __perf_event_exit_task(struct perf_event *child_event,
4841                          struct perf_event_context *child_ctx,
4842                          struct task_struct *child)
4843 {
4844         struct perf_event *parent_event;
4845
4846         update_event_times(child_event);
4847         perf_event_remove_from_context(child_event);
4848
4849         parent_event = child_event->parent;
4850         /*
4851          * It can happen that parent exits first, and has events
4852          * that are still around due to the child reference. These
4853          * events need to be zapped - but otherwise linger.
4854          */
4855         if (parent_event) {
4856                 sync_child_event(child_event, child);
4857                 free_event(child_event);
4858         }
4859 }
4860
4861 /*
4862  * When a child task exits, feed back event values to parent events.
4863  */
4864 void perf_event_exit_task(struct task_struct *child)
4865 {
4866         struct perf_event *child_event, *tmp;
4867         struct perf_event_context *child_ctx;
4868         unsigned long flags;
4869
4870         if (likely(!child->perf_event_ctxp)) {
4871                 perf_event_task(child, NULL, 0);
4872                 return;
4873         }
4874
4875         local_irq_save(flags);
4876         /*
4877          * We can't reschedule here because interrupts are disabled,
4878          * and either child is current or it is a task that can't be
4879          * scheduled, so we are now safe from rescheduling changing
4880          * our context.
4881          */
4882         child_ctx = child->perf_event_ctxp;
4883         __perf_event_task_sched_out(child_ctx);
4884
4885         /*
4886          * Take the context lock here so that if find_get_context is
4887          * reading child->perf_event_ctxp, we wait until it has
4888          * incremented the context's refcount before we do put_ctx below.
4889          */
4890         spin_lock(&child_ctx->lock);
4891         child->perf_event_ctxp = NULL;
4892         /*
4893          * If this context is a clone; unclone it so it can't get
4894          * swapped to another process while we're removing all
4895          * the events from it.
4896          */
4897         unclone_ctx(child_ctx);
4898         spin_unlock_irqrestore(&child_ctx->lock, flags);
4899
4900         /*
4901          * Report the task dead after unscheduling the events so that we
4902          * won't get any samples after PERF_RECORD_EXIT. We can however still
4903          * get a few PERF_RECORD_READ events.
4904          */
4905         perf_event_task(child, child_ctx, 0);
4906
4907         /*
4908          * We can recurse on the same lock type through:
4909          *
4910          *   __perf_event_exit_task()
4911          *     sync_child_event()
4912          *       fput(parent_event->filp)
4913          *         perf_release()
4914          *           mutex_lock(&ctx->mutex)
4915          *
4916          * But since its the parent context it won't be the same instance.
4917          */
4918         mutex_lock_nested(&child_ctx->mutex, SINGLE_DEPTH_NESTING);
4919
4920 again:
4921         list_for_each_entry_safe(child_event, tmp, &child_ctx->group_list,
4922                                  group_entry)
4923                 __perf_event_exit_task(child_event, child_ctx, child);
4924
4925         /*
4926          * If the last event was a group event, it will have appended all
4927          * its siblings to the list, but we obtained 'tmp' before that which
4928          * will still point to the list head terminating the iteration.
4929          */
4930         if (!list_empty(&child_ctx->group_list))
4931                 goto again;
4932
4933         mutex_unlock(&child_ctx->mutex);
4934
4935         put_ctx(child_ctx);
4936 }
4937
4938 /*
4939  * free an unexposed, unused context as created by inheritance by
4940  * init_task below, used by fork() in case of fail.
4941  */
4942 void perf_event_free_task(struct task_struct *task)
4943 {
4944         struct perf_event_context *ctx = task->perf_event_ctxp;
4945         struct perf_event *event, *tmp;
4946
4947         if (!ctx)
4948                 return;
4949
4950         mutex_lock(&ctx->mutex);
4951 again:
4952         list_for_each_entry_safe(event, tmp, &ctx->group_list, group_entry) {
4953                 struct perf_event *parent = event->parent;
4954
4955                 if (WARN_ON_ONCE(!parent))
4956                         continue;
4957
4958                 mutex_lock(&parent->child_mutex);
4959                 list_del_init(&event->child_list);
4960                 mutex_unlock(&parent->child_mutex);
4961
4962                 fput(parent->filp);
4963
4964                 list_del_event(event, ctx);
4965                 free_event(event);
4966         }
4967
4968         if (!list_empty(&ctx->group_list))
4969                 goto again;
4970
4971         mutex_unlock(&ctx->mutex);
4972
4973         put_ctx(ctx);
4974 }
4975
4976 /*
4977  * Initialize the perf_event context in task_struct
4978  */
4979 int perf_event_init_task(struct task_struct *child)
4980 {
4981         struct perf_event_context *child_ctx, *parent_ctx;
4982         struct perf_event_context *cloned_ctx;
4983         struct perf_event *event;
4984         struct task_struct *parent = current;
4985         int inherited_all = 1;
4986         int ret = 0;
4987
4988         child->perf_event_ctxp = NULL;
4989
4990         mutex_init(&child->perf_event_mutex);
4991         INIT_LIST_HEAD(&child->perf_event_list);
4992
4993         if (likely(!parent->perf_event_ctxp))
4994                 return 0;
4995
4996         /*
4997          * This is executed from the parent task context, so inherit
4998          * events that have been marked for cloning.
4999          * First allocate and initialize a context for the child.
5000          */
5001
5002         child_ctx = kmalloc(sizeof(struct perf_event_context), GFP_KERNEL);
5003         if (!child_ctx)
5004                 return -ENOMEM;
5005
5006         __perf_event_init_context(child_ctx, child);
5007         child->perf_event_ctxp = child_ctx;
5008         get_task_struct(child);
5009
5010         /*
5011          * If the parent's context is a clone, pin it so it won't get
5012          * swapped under us.
5013          */
5014         parent_ctx = perf_pin_task_context(parent);
5015
5016         /*
5017          * No need to check if parent_ctx != NULL here; since we saw
5018          * it non-NULL earlier, the only reason for it to become NULL
5019          * is if we exit, and since we're currently in the middle of
5020          * a fork we can't be exiting at the same time.
5021          */
5022
5023         /*
5024          * Lock the parent list. No need to lock the child - not PID
5025          * hashed yet and not running, so nobody can access it.
5026          */
5027         mutex_lock(&parent_ctx->mutex);
5028
5029         /*
5030          * We dont have to disable NMIs - we are only looking at
5031          * the list, not manipulating it:
5032          */
5033         list_for_each_entry(event, &parent_ctx->group_list, group_entry) {
5034
5035                 if (!event->attr.inherit) {
5036                         inherited_all = 0;
5037                         continue;
5038                 }
5039
5040                 ret = inherit_group(event, parent, parent_ctx,
5041                                              child, child_ctx);
5042                 if (ret) {
5043                         inherited_all = 0;
5044                         break;
5045                 }
5046         }
5047
5048         if (inherited_all) {
5049                 /*
5050                  * Mark the child context as a clone of the parent
5051                  * context, or of whatever the parent is a clone of.
5052                  * Note that if the parent is a clone, it could get
5053                  * uncloned at any point, but that doesn't matter
5054                  * because the list of events and the generation
5055                  * count can't have changed since we took the mutex.
5056                  */
5057                 cloned_ctx = rcu_dereference(parent_ctx->parent_ctx);
5058                 if (cloned_ctx) {
5059                         child_ctx->parent_ctx = cloned_ctx;
5060                         child_ctx->parent_gen = parent_ctx->parent_gen;
5061                 } else {
5062                         child_ctx->parent_ctx = parent_ctx;
5063                         child_ctx->parent_gen = parent_ctx->generation;
5064                 }
5065                 get_ctx(child_ctx->parent_ctx);
5066         }
5067
5068         mutex_unlock(&parent_ctx->mutex);
5069
5070         perf_unpin_context(parent_ctx);
5071
5072         return ret;
5073 }
5074
5075 static void __cpuinit perf_event_init_cpu(int cpu)
5076 {
5077         struct perf_cpu_context *cpuctx;
5078
5079         cpuctx = &per_cpu(perf_cpu_context, cpu);
5080         __perf_event_init_context(&cpuctx->ctx, NULL);
5081
5082         spin_lock(&perf_resource_lock);
5083         cpuctx->max_pertask = perf_max_events - perf_reserved_percpu;
5084         spin_unlock(&perf_resource_lock);
5085
5086         hw_perf_event_setup(cpu);
5087 }
5088
#ifdef CONFIG_HOTPLUG_CPU
/*
 * Remove every event group from the per-cpu context of the cpu this
 * function executes on.  @info is unused.
 */
static void __perf_event_exit_cpu(void *info)
{
	struct perf_event_context *ctx = &__get_cpu_var(perf_cpu_context).ctx;
	struct perf_event *event, *next;

	list_for_each_entry_safe(event, next, &ctx->group_list, group_entry)
		__perf_event_remove_from_context(event);
}

/*
 * Empty the per-cpu context of @cpu: the removal itself is run on that
 * cpu through smp_call_function_single() (waiting for completion) while
 * the context mutex is held.
 */
static void perf_event_exit_cpu(int cpu)
{
	struct perf_event_context *ctx = &per_cpu(perf_cpu_context, cpu).ctx;

	mutex_lock(&ctx->mutex);
	smp_call_function_single(cpu, __perf_event_exit_cpu, NULL, 1);
	mutex_unlock(&ctx->mutex);
}
#else
/* Without cpu hotplug there is nothing to tear down. */
static inline void perf_event_exit_cpu(int cpu) { }
#endif
5111
5112 static int __cpuinit
5113 perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
5114 {
5115         unsigned int cpu = (long)hcpu;
5116
5117         switch (action) {
5118
5119         case CPU_UP_PREPARE:
5120         case CPU_UP_PREPARE_FROZEN:
5121                 perf_event_init_cpu(cpu);
5122                 break;
5123
5124         case CPU_ONLINE:
5125         case CPU_ONLINE_FROZEN:
5126                 hw_perf_event_setup_online(cpu);
5127                 break;
5128
5129         case CPU_DOWN_PREPARE:
5130         case CPU_DOWN_PREPARE_FROZEN:
5131                 perf_event_exit_cpu(cpu);
5132                 break;
5133
5134         default:
5135                 break;
5136         }
5137
5138         return NOTIFY_OK;
5139 }
5140
5141 /*
5142  * This has to have a higher priority than migration_notifier in sched.c.
5143  */
5144 static struct notifier_block __cpuinitdata perf_cpu_nb = {
5145         .notifier_call          = perf_cpu_notify,
5146         .priority               = 20,
5147 };
5148
5149 void __init perf_event_init(void)
5150 {
5151         perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE,
5152                         (void *)(long)smp_processor_id());
5153         perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_ONLINE,
5154                         (void *)(long)smp_processor_id());
5155         register_cpu_notifier(&perf_cpu_nb);
5156 }
5157
5158 static ssize_t perf_show_reserve_percpu(struct sysdev_class *class, char *buf)
5159 {
5160         return sprintf(buf, "%d\n", perf_reserved_percpu);
5161 }
5162
5163 static ssize_t
5164 perf_set_reserve_percpu(struct sysdev_class *class,
5165                         const char *buf,
5166                         size_t count)
5167 {
5168         struct perf_cpu_context *cpuctx;
5169         unsigned long val;
5170         int err, cpu, mpt;
5171
5172         err = strict_strtoul(buf, 10, &val);
5173         if (err)
5174                 return err;
5175         if (val > perf_max_events)
5176                 return -EINVAL;
5177
5178         spin_lock(&perf_resource_lock);
5179         perf_reserved_percpu = val;
5180         for_each_online_cpu(cpu) {
5181                 cpuctx = &per_cpu(perf_cpu_context, cpu);
5182                 spin_lock_irq(&cpuctx->ctx.lock);
5183                 mpt = min(perf_max_events - cpuctx->ctx.nr_events,
5184                           perf_max_events - perf_reserved_percpu);
5185                 cpuctx->max_pertask = mpt;
5186                 spin_unlock_irq(&cpuctx->ctx.lock);
5187         }
5188         spin_unlock(&perf_resource_lock);
5189
5190         return count;
5191 }
5192
5193 static ssize_t perf_show_overcommit(struct sysdev_class *class, char *buf)
5194 {
5195         return sprintf(buf, "%d\n", perf_overcommit);
5196 }
5197
5198 static ssize_t
5199 perf_set_overcommit(struct sysdev_class *class, const char *buf, size_t count)
5200 {
5201         unsigned long val;
5202         int err;
5203
5204         err = strict_strtoul(buf, 10, &val);
5205         if (err)
5206                 return err;
5207         if (val > 1)
5208                 return -EINVAL;
5209
5210         spin_lock(&perf_resource_lock);
5211         perf_overcommit = val;
5212         spin_unlock(&perf_resource_lock);
5213
5214         return count;
5215 }
5216
5217 static SYSDEV_CLASS_ATTR(
5218                                 reserve_percpu,
5219                                 0644,
5220                                 perf_show_reserve_percpu,
5221                                 perf_set_reserve_percpu
5222                         );
5223
5224 static SYSDEV_CLASS_ATTR(
5225                                 overcommit,
5226                                 0644,
5227                                 perf_show_overcommit,
5228                                 perf_set_overcommit
5229                         );
5230
5231 static struct attribute *perfclass_attrs[] = {
5232         &attr_reserve_percpu.attr,
5233         &attr_overcommit.attr,
5234         NULL
5235 };
5236
5237 static struct attribute_group perfclass_attr_group = {
5238         .attrs                  = perfclass_attrs,
5239         .name                   = "perf_events",
5240 };
5241
5242 static int __init perf_event_sysfs_init(void)
5243 {
5244         return sysfs_create_group(&cpu_sysdev_class.kset.kobj,
5245                                   &perfclass_attr_group);
5246 }
5247 device_initcall(perf_event_sysfs_init);