]> bbs.cooldavid.org Git - net-next-2.6.git/blame - kernel/trace/trace_syscalls.c
perf, trace: Optimize tracepoints by using per-tracepoint-per-cpu hlist to track...
[net-next-2.6.git] / kernel / trace / trace_syscalls.c
CommitLineData
47788c58 1#include <trace/syscall.h>
1c569f02 2#include <trace/events/syscalls.h>
5a0e3ad6 3#include <linux/slab.h>
ee08c6ec 4#include <linux/kernel.h>
fb34a08c 5#include <linux/ftrace.h>
cdd6c482 6#include <linux/perf_event.h>
ee08c6ec
FW
7#include <asm/syscall.h>
8
9#include "trace_output.h"
10#include "trace.h"
11
/*
 * Taken by the reg_/unreg_ and perf enable/disable helpers in this file
 * around their updates of the refcounts and enable bitmaps.
 */
5be71b61 12static DEFINE_MUTEX(syscall_trace_lock);
fb34a08c
JB
/*
 * Count of registered enter/exit events. The tracepoint probe is
 * registered while the count is still zero and unregistered when it
 * drops back to zero.
 */
13static int sys_refcount_enter;
14static int sys_refcount_exit;
57421dbb
JB
/* One bit per syscall number; tested by the ftrace enter/exit probes. */
15static DECLARE_BITMAP(enabled_enter_syscalls, NR_syscalls);
16static DECLARE_BITMAP(enabled_exit_syscalls, NR_syscalls);
ee08c6ec 17
c44fc770
FW
/* Bounds of the syscall metadata records scanned by find_syscall_meta(). */
18extern unsigned long __start_syscalls_metadata[];
19extern unsigned long __stop_syscalls_metadata[];
20
/*
 * Table indexed by syscall number, allocated and filled in by
 * init_ftrace_syscalls(); NULL until then.
 */
21static struct syscall_metadata **syscalls_metadata;
22
23static struct syscall_metadata *find_syscall_meta(unsigned long syscall)
24{
25 struct syscall_metadata *start;
26 struct syscall_metadata *stop;
27 char str[KSYM_SYMBOL_LEN];
28
29
30 start = (struct syscall_metadata *)__start_syscalls_metadata;
31 stop = (struct syscall_metadata *)__stop_syscalls_metadata;
32 kallsyms_lookup(syscall, NULL, NULL, NULL, str);
33
34 for ( ; start < stop; start++) {
35 /*
36 * Only compare after the "sys" prefix. Archs that use
37 * syscall wrappers may have syscalls symbols aliases prefixed
38 * with "SyS" instead of "sys", leading to an unwanted
39 * mismatch.
40 */
41 if (start->name && !strcmp(start->name + 3, str + 3))
42 return start;
43 }
44 return NULL;
45}
46
47static struct syscall_metadata *syscall_nr_to_meta(int nr)
48{
49 if (!syscalls_metadata || nr >= NR_syscalls || nr < 0)
50 return NULL;
51
52 return syscalls_metadata[nr];
53}
54
bed1ffca
FW
55enum print_line_t
56print_syscall_enter(struct trace_iterator *iter, int flags)
57{
58 struct trace_seq *s = &iter->seq;
59 struct trace_entry *ent = iter->ent;
60 struct syscall_trace_enter *trace;
61 struct syscall_metadata *entry;
62 int i, ret, syscall;
63
64c12e04 64 trace = (typeof(trace))ent;
bed1ffca 65 syscall = trace->nr;
bed1ffca 66 entry = syscall_nr_to_meta(syscall);
64c12e04 67
bed1ffca
FW
68 if (!entry)
69 goto end;
70
fcc19438 71 if (entry->enter_event->id != ent->type) {
64c12e04
JB
72 WARN_ON_ONCE(1);
73 goto end;
74 }
75
bed1ffca
FW
76 ret = trace_seq_printf(s, "%s(", entry->name);
77 if (!ret)
78 return TRACE_TYPE_PARTIAL_LINE;
79
80 for (i = 0; i < entry->nb_args; i++) {
81 /* parameter types */
ba8b3a40 82 if (trace_flags & TRACE_ITER_VERBOSE) {
bed1ffca
FW
83 ret = trace_seq_printf(s, "%s ", entry->types[i]);
84 if (!ret)
85 return TRACE_TYPE_PARTIAL_LINE;
86 }
87 /* parameter values */
4539f077 88 ret = trace_seq_printf(s, "%s: %lx%s", entry->args[i],
bed1ffca 89 trace->args[i],
4539f077 90 i == entry->nb_args - 1 ? "" : ", ");
bed1ffca
FW
91 if (!ret)
92 return TRACE_TYPE_PARTIAL_LINE;
93 }
94
4539f077
LZ
95 ret = trace_seq_putc(s, ')');
96 if (!ret)
97 return TRACE_TYPE_PARTIAL_LINE;
98
bed1ffca 99end:
4539f077
LZ
100 ret = trace_seq_putc(s, '\n');
101 if (!ret)
102 return TRACE_TYPE_PARTIAL_LINE;
103
bed1ffca
FW
104 return TRACE_TYPE_HANDLED;
105}
106
107enum print_line_t
108print_syscall_exit(struct trace_iterator *iter, int flags)
109{
110 struct trace_seq *s = &iter->seq;
111 struct trace_entry *ent = iter->ent;
112 struct syscall_trace_exit *trace;
113 int syscall;
114 struct syscall_metadata *entry;
115 int ret;
116
64c12e04 117 trace = (typeof(trace))ent;
bed1ffca 118 syscall = trace->nr;
bed1ffca 119 entry = syscall_nr_to_meta(syscall);
64c12e04 120
bed1ffca
FW
121 if (!entry) {
122 trace_seq_printf(s, "\n");
123 return TRACE_TYPE_HANDLED;
124 }
125
fcc19438 126 if (entry->exit_event->id != ent->type) {
64c12e04
JB
127 WARN_ON_ONCE(1);
128 return TRACE_TYPE_UNHANDLED;
129 }
130
bed1ffca
FW
131 ret = trace_seq_printf(s, "%s -> 0x%lx\n", entry->name,
132 trace->ret);
133 if (!ret)
134 return TRACE_TYPE_PARTIAL_LINE;
135
136 return TRACE_TYPE_HANDLED;
137}
138
e6971969
LZ
/*
 * Only declared here; referenced by SYSCALL_FIELD() on a size mismatch.
 * NOTE(review): presumably never defined so that a mismatch fails at
 * link time -- confirm.
 */
139extern char *__bad_type_size(void);
140
/*
 * Expands to the argument list (type string, name string, offset, size,
 * signedness) describing member @name of a local variable that must be
 * called "trace". If sizeof(@type) differs from the member's size, the
 * first argument becomes a call to __bad_type_size() instead.
 */
141#define SYSCALL_FIELD(type, name) \
142 sizeof(type) != sizeof(trace.name) ? \
143 __bad_type_size() : \
26a50744
TZ
144 #type, #name, offsetof(typeof(trace), name), \
145 sizeof(trace.name), is_signed_type(type)
e6971969 146
50307a45
LJ
147static
148int __set_enter_print_fmt(struct syscall_metadata *entry, char *buf, int len)
149{
150 int i;
151 int pos = 0;
152
153 /* When len=0, we just calculate the needed length */
154#define LEN_OR_ZERO (len ? len - pos : 0)
155
156 pos += snprintf(buf + pos, LEN_OR_ZERO, "\"");
157 for (i = 0; i < entry->nb_args; i++) {
158 pos += snprintf(buf + pos, LEN_OR_ZERO, "%s: 0x%%0%zulx%s",
159 entry->args[i], sizeof(unsigned long),
160 i == entry->nb_args - 1 ? "" : ", ");
161 }
162 pos += snprintf(buf + pos, LEN_OR_ZERO, "\"");
163
164 for (i = 0; i < entry->nb_args; i++) {
165 pos += snprintf(buf + pos, LEN_OR_ZERO,
166 ", ((unsigned long)(REC->%s))", entry->args[i]);
167 }
168
169#undef LEN_OR_ZERO
170
171 /* return the length of print_fmt */
172 return pos;
173}
174
175static int set_syscall_print_fmt(struct ftrace_event_call *call)
176{
177 char *print_fmt;
178 int len;
179 struct syscall_metadata *entry = call->data;
180
181 if (entry->enter_event != call) {
182 call->print_fmt = "\"0x%lx\", REC->ret";
183 return 0;
184 }
185
186 /* First: called with 0 length to calculate the needed length */
187 len = __set_enter_print_fmt(entry, NULL, 0);
188
189 print_fmt = kmalloc(len + 1, GFP_KERNEL);
190 if (!print_fmt)
191 return -ENOMEM;
192
193 /* Second: actually write the @print_fmt */
194 __set_enter_print_fmt(entry, print_fmt, len + 1);
195 call->print_fmt = print_fmt;
196
197 return 0;
198}
199
200static void free_syscall_print_fmt(struct ftrace_event_call *call)
201{
202 struct syscall_metadata *entry = call->data;
203
204 if (entry->enter_event == call)
205 kfree(call->print_fmt);
206}
207
540b7b8d
LZ
208int syscall_enter_define_fields(struct ftrace_event_call *call)
209{
210 struct syscall_trace_enter trace;
31c16b13 211 struct syscall_metadata *meta = call->data;
540b7b8d 212 int ret;
540b7b8d
LZ
213 int i;
214 int offset = offsetof(typeof(trace), args);
215
0f1ef51d
LJ
216 ret = trace_define_field(call, SYSCALL_FIELD(int, nr), FILTER_OTHER);
217 if (ret)
218 return ret;
219
540b7b8d 220 for (i = 0; i < meta->nb_args; i++) {
aeaeae11
FW
221 ret = trace_define_field(call, meta->types[i],
222 meta->args[i], offset,
43b51ead
LZ
223 sizeof(unsigned long), 0,
224 FILTER_OTHER);
540b7b8d
LZ
225 offset += sizeof(unsigned long);
226 }
227
228 return ret;
229}
230
231int syscall_exit_define_fields(struct ftrace_event_call *call)
232{
233 struct syscall_trace_exit trace;
234 int ret;
235
0f1ef51d
LJ
236 ret = trace_define_field(call, SYSCALL_FIELD(int, nr), FILTER_OTHER);
237 if (ret)
238 return ret;
239
26a50744 240 ret = trace_define_field(call, SYSCALL_FIELD(long, ret),
43b51ead 241 FILTER_OTHER);
540b7b8d
LZ
242
243 return ret;
244}
245
fb34a08c 246void ftrace_syscall_enter(struct pt_regs *regs, long id)
ee08c6ec 247{
bed1ffca
FW
248 struct syscall_trace_enter *entry;
249 struct syscall_metadata *sys_data;
250 struct ring_buffer_event *event;
e77405ad 251 struct ring_buffer *buffer;
bed1ffca 252 int size;
ee08c6ec
FW
253 int syscall_nr;
254
255 syscall_nr = syscall_get_nr(current, regs);
cd0980fc
HB
256 if (syscall_nr < 0)
257 return;
fb34a08c
JB
258 if (!test_bit(syscall_nr, enabled_enter_syscalls))
259 return;
ee08c6ec 260
bed1ffca
FW
261 sys_data = syscall_nr_to_meta(syscall_nr);
262 if (!sys_data)
263 return;
264
265 size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args;
266
fcc19438
LJ
267 event = trace_current_buffer_lock_reserve(&buffer,
268 sys_data->enter_event->id, size, 0, 0);
bed1ffca
FW
269 if (!event)
270 return;
271
272 entry = ring_buffer_event_data(event);
273 entry->nr = syscall_nr;
274 syscall_get_arguments(current, regs, 0, sys_data->nb_args, entry->args);
275
e77405ad
SR
276 if (!filter_current_check_discard(buffer, sys_data->enter_event,
277 entry, event))
278 trace_current_buffer_unlock_commit(buffer, event, 0, 0);
ee08c6ec
FW
279}
280
fb34a08c 281void ftrace_syscall_exit(struct pt_regs *regs, long ret)
ee08c6ec 282{
bed1ffca
FW
283 struct syscall_trace_exit *entry;
284 struct syscall_metadata *sys_data;
285 struct ring_buffer_event *event;
e77405ad 286 struct ring_buffer *buffer;
ee08c6ec
FW
287 int syscall_nr;
288
289 syscall_nr = syscall_get_nr(current, regs);
cd0980fc
HB
290 if (syscall_nr < 0)
291 return;
fb34a08c
JB
292 if (!test_bit(syscall_nr, enabled_exit_syscalls))
293 return;
ee08c6ec 294
bed1ffca
FW
295 sys_data = syscall_nr_to_meta(syscall_nr);
296 if (!sys_data)
297 return;
298
fcc19438
LJ
299 event = trace_current_buffer_lock_reserve(&buffer,
300 sys_data->exit_event->id, sizeof(*entry), 0, 0);
bed1ffca
FW
301 if (!event)
302 return;
303
304 entry = ring_buffer_event_data(event);
305 entry->nr = syscall_nr;
306 entry->ret = syscall_get_return_value(current, regs);
307
e77405ad
SR
308 if (!filter_current_check_discard(buffer, sys_data->exit_event,
309 entry, event))
310 trace_current_buffer_unlock_commit(buffer, event, 0, 0);
ee08c6ec
FW
311}
312
bd1a5c84 313int reg_event_syscall_enter(struct ftrace_event_call *call)
ee08c6ec 314{
fb34a08c
JB
315 int ret = 0;
316 int num;
fb34a08c 317
c252f657 318 num = ((struct syscall_metadata *)call->data)->syscall_nr;
57421dbb 319 if (num < 0 || num >= NR_syscalls)
fb34a08c
JB
320 return -ENOSYS;
321 mutex_lock(&syscall_trace_lock);
322 if (!sys_refcount_enter)
1c569f02 323 ret = register_trace_sys_enter(ftrace_syscall_enter);
3b8e4273 324 if (!ret) {
fb34a08c
JB
325 set_bit(num, enabled_enter_syscalls);
326 sys_refcount_enter++;
327 }
328 mutex_unlock(&syscall_trace_lock);
329 return ret;
ee08c6ec
FW
330}
331
bd1a5c84 332void unreg_event_syscall_enter(struct ftrace_event_call *call)
ee08c6ec 333{
fb34a08c 334 int num;
ee08c6ec 335
c252f657 336 num = ((struct syscall_metadata *)call->data)->syscall_nr;
57421dbb 337 if (num < 0 || num >= NR_syscalls)
fb34a08c
JB
338 return;
339 mutex_lock(&syscall_trace_lock);
340 sys_refcount_enter--;
341 clear_bit(num, enabled_enter_syscalls);
342 if (!sys_refcount_enter)
1c569f02 343 unregister_trace_sys_enter(ftrace_syscall_enter);
fb34a08c
JB
344 mutex_unlock(&syscall_trace_lock);
345}
ee08c6ec 346
bd1a5c84 347int reg_event_syscall_exit(struct ftrace_event_call *call)
ee08c6ec 348{
fb34a08c
JB
349 int ret = 0;
350 int num;
fb34a08c 351
c252f657 352 num = ((struct syscall_metadata *)call->data)->syscall_nr;
57421dbb 353 if (num < 0 || num >= NR_syscalls)
fb34a08c
JB
354 return -ENOSYS;
355 mutex_lock(&syscall_trace_lock);
356 if (!sys_refcount_exit)
1c569f02 357 ret = register_trace_sys_exit(ftrace_syscall_exit);
3b8e4273 358 if (!ret) {
fb34a08c
JB
359 set_bit(num, enabled_exit_syscalls);
360 sys_refcount_exit++;
ee08c6ec 361 }
fb34a08c
JB
362 mutex_unlock(&syscall_trace_lock);
363 return ret;
364}
ee08c6ec 365
bd1a5c84 366void unreg_event_syscall_exit(struct ftrace_event_call *call)
fb34a08c
JB
367{
368 int num;
ee08c6ec 369
c252f657 370 num = ((struct syscall_metadata *)call->data)->syscall_nr;
57421dbb 371 if (num < 0 || num >= NR_syscalls)
fb34a08c
JB
372 return;
373 mutex_lock(&syscall_trace_lock);
374 sys_refcount_exit--;
375 clear_bit(num, enabled_exit_syscalls);
376 if (!sys_refcount_exit)
1c569f02 377 unregister_trace_sys_exit(ftrace_syscall_exit);
fb34a08c 378 mutex_unlock(&syscall_trace_lock);
ee08c6ec 379}
fb34a08c 380
a1301da0
LJ
381int init_syscall_trace(struct ftrace_event_call *call)
382{
383 int id;
384
50307a45
LJ
385 if (set_syscall_print_fmt(call) < 0)
386 return -ENOMEM;
387
c7ef3a90
SR
388 id = trace_event_raw_init(call);
389
390 if (id < 0) {
50307a45 391 free_syscall_print_fmt(call);
c7ef3a90 392 return id;
50307a45 393 }
c7ef3a90
SR
394
395 return id;
a1301da0
LJ
396}
397
e7b8e675
MF
398unsigned long __init arch_syscall_addr(int nr)
399{
400 return (unsigned long)sys_call_table[nr];
401}
402
c44fc770
FW
403int __init init_ftrace_syscalls(void)
404{
405 struct syscall_metadata *meta;
406 unsigned long addr;
407 int i;
408
409 syscalls_metadata = kzalloc(sizeof(*syscalls_metadata) *
410 NR_syscalls, GFP_KERNEL);
411 if (!syscalls_metadata) {
412 WARN_ON(1);
413 return -ENOMEM;
414 }
415
416 for (i = 0; i < NR_syscalls; i++) {
417 addr = arch_syscall_addr(i);
418 meta = find_syscall_meta(addr);
c252f657
LJ
419 if (!meta)
420 continue;
421
422 meta->syscall_nr = i;
c44fc770
FW
423 syscalls_metadata[i] = meta;
424 }
425
426 return 0;
427}
428core_initcall(init_ftrace_syscalls);
429
07b139c8 430#ifdef CONFIG_PERF_EVENTS
19007a67 431
97d5a220
FW
/* One bit per syscall number; tested by the perf enter/exit probes. */
432static DECLARE_BITMAP(enabled_perf_enter_syscalls, NR_syscalls);
433static DECLARE_BITMAP(enabled_perf_exit_syscalls, NR_syscalls);
/*
 * Count of enabled perf enter/exit events. The perf probe is registered
 * while the count is still zero and unregistered when it drops to zero.
 */
434static int sys_perf_refcount_enter;
435static int sys_perf_refcount_exit;
f4b5ffcc 436
97d5a220 437static void perf_syscall_enter(struct pt_regs *regs, long id)
f4b5ffcc
JB
438{
439 struct syscall_metadata *sys_data;
20ab4425 440 struct syscall_trace_enter *rec;
1c024eca 441 struct hlist_head *head;
f4b5ffcc 442 int syscall_nr;
4ed7c92d 443 int rctx;
19007a67 444 int size;
f4b5ffcc
JB
445
446 syscall_nr = syscall_get_nr(current, regs);
97d5a220 447 if (!test_bit(syscall_nr, enabled_perf_enter_syscalls))
f4b5ffcc
JB
448 return;
449
450 sys_data = syscall_nr_to_meta(syscall_nr);
451 if (!sys_data)
452 return;
453
19007a67
FW
454 /* get the size after alignment with the u32 buffer size field */
455 size = sizeof(unsigned long) * sys_data->nb_args + sizeof(*rec);
456 size = ALIGN(size + sizeof(u32), sizeof(u64));
457 size -= sizeof(u32);
458
97d5a220
FW
459 if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE,
460 "perf buffer not large enough"))
20ab4425
FW
461 return;
462
97d5a220 463 rec = (struct syscall_trace_enter *)perf_trace_buf_prepare(size,
b7e2ecef 464 sys_data->enter_event->id, regs, &rctx);
430ad5a6
XG
465 if (!rec)
466 return;
20ab4425 467
20ab4425
FW
468 rec->nr = syscall_nr;
469 syscall_get_arguments(current, regs, 0, sys_data->nb_args,
470 (unsigned long *)&rec->args);
1c024eca
PZ
471
472 head = per_cpu_ptr(sys_data->enter_event->perf_events, smp_processor_id());
473 perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head);
f4b5ffcc
JB
474}
475
97d5a220 476int perf_sysenter_enable(struct ftrace_event_call *call)
f4b5ffcc
JB
477{
478 int ret = 0;
479 int num;
480
3bbe84e9 481 num = ((struct syscall_metadata *)call->data)->syscall_nr;
f4b5ffcc
JB
482
483 mutex_lock(&syscall_trace_lock);
97d5a220
FW
484 if (!sys_perf_refcount_enter)
485 ret = register_trace_sys_enter(perf_syscall_enter);
f4b5ffcc
JB
486 if (ret) {
487 pr_info("event trace: Could not activate"
488 "syscall entry trace point");
489 } else {
97d5a220
FW
490 set_bit(num, enabled_perf_enter_syscalls);
491 sys_perf_refcount_enter++;
f4b5ffcc
JB
492 }
493 mutex_unlock(&syscall_trace_lock);
494 return ret;
495}
496
97d5a220 497void perf_sysenter_disable(struct ftrace_event_call *call)
f4b5ffcc
JB
498{
499 int num;
500
3bbe84e9 501 num = ((struct syscall_metadata *)call->data)->syscall_nr;
f4b5ffcc
JB
502
503 mutex_lock(&syscall_trace_lock);
97d5a220
FW
504 sys_perf_refcount_enter--;
505 clear_bit(num, enabled_perf_enter_syscalls);
506 if (!sys_perf_refcount_enter)
507 unregister_trace_sys_enter(perf_syscall_enter);
f4b5ffcc
JB
508 mutex_unlock(&syscall_trace_lock);
509}
510
97d5a220 511static void perf_syscall_exit(struct pt_regs *regs, long ret)
f4b5ffcc
JB
512{
513 struct syscall_metadata *sys_data;
20ab4425 514 struct syscall_trace_exit *rec;
1c024eca 515 struct hlist_head *head;
f4b5ffcc 516 int syscall_nr;
4ed7c92d 517 int rctx;
20ab4425 518 int size;
f4b5ffcc
JB
519
520 syscall_nr = syscall_get_nr(current, regs);
97d5a220 521 if (!test_bit(syscall_nr, enabled_perf_exit_syscalls))
f4b5ffcc
JB
522 return;
523
524 sys_data = syscall_nr_to_meta(syscall_nr);
525 if (!sys_data)
526 return;
527
20ab4425
FW
528 /* We can probably do that at build time */
529 size = ALIGN(sizeof(*rec) + sizeof(u32), sizeof(u64));
530 size -= sizeof(u32);
19007a67 531
20ab4425
FW
532 /*
533 * Impossible, but be paranoid with the future
534 * How to put this check outside runtime?
535 */
97d5a220
FW
536 if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE,
537 "exit event has grown above perf buffer size"))
20ab4425
FW
538 return;
539
97d5a220 540 rec = (struct syscall_trace_exit *)perf_trace_buf_prepare(size,
b7e2ecef 541 sys_data->exit_event->id, regs, &rctx);
430ad5a6
XG
542 if (!rec)
543 return;
20ab4425 544
20ab4425
FW
545 rec->nr = syscall_nr;
546 rec->ret = syscall_get_return_value(current, regs);
547
1c024eca
PZ
548 head = per_cpu_ptr(sys_data->exit_event->perf_events, smp_processor_id());
549 perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head);
f4b5ffcc
JB
550}
551
97d5a220 552int perf_sysexit_enable(struct ftrace_event_call *call)
f4b5ffcc
JB
553{
554 int ret = 0;
555 int num;
556
3bbe84e9 557 num = ((struct syscall_metadata *)call->data)->syscall_nr;
f4b5ffcc
JB
558
559 mutex_lock(&syscall_trace_lock);
97d5a220
FW
560 if (!sys_perf_refcount_exit)
561 ret = register_trace_sys_exit(perf_syscall_exit);
f4b5ffcc
JB
562 if (ret) {
563 pr_info("event trace: Could not activate"
6574658b 564 "syscall exit trace point");
f4b5ffcc 565 } else {
97d5a220
FW
566 set_bit(num, enabled_perf_exit_syscalls);
567 sys_perf_refcount_exit++;
f4b5ffcc
JB
568 }
569 mutex_unlock(&syscall_trace_lock);
570 return ret;
571}
572
97d5a220 573void perf_sysexit_disable(struct ftrace_event_call *call)
f4b5ffcc
JB
574{
575 int num;
576
3bbe84e9 577 num = ((struct syscall_metadata *)call->data)->syscall_nr;
f4b5ffcc
JB
578
579 mutex_lock(&syscall_trace_lock);
97d5a220
FW
580 sys_perf_refcount_exit--;
581 clear_bit(num, enabled_perf_exit_syscalls);
582 if (!sys_perf_refcount_exit)
583 unregister_trace_sys_exit(perf_syscall_exit);
f4b5ffcc
JB
584 mutex_unlock(&syscall_trace_lock);
585}
586
07b139c8 587#endif /* CONFIG_PERF_EVENTS */
f4b5ffcc 588