/* * builtin-trace.c * * Builtin 'trace' command: * * Display a continuously updated trace of any workload, CPU, specific PID, * system wide, etc. Default format is loosely strace like, but any other * event may be specified using --event. * * Copyright (C) 2012, 2013, 2014, 2015 Red Hat Inc, Arnaldo Carvalho de Melo <acme@redhat.com> * * Initially based on the 'trace' prototype by Thomas Gleixner: * * http://lwn.net/Articles/415728/ ("Announcing a new utility: 'trace'")
*/
/* * strtoul: Go from a string to a value, i.e. for msr: MSR_FS_BASE to 0xc0000100 * * We have to explicitely mark the direction of the flow of data, if from the * kernel to user space or the other way around, since the BPF collector we * have so far copies only from user to kernel space, mark the arguments that * go that direction, so that we don´t end up collecting the previous contents * for syscall args that goes from kernel to user space.
*/ struct syscall_arg_fmt {
size_t (*scnprintf)(char *bf, size_t size, struct syscall_arg *arg); bool (*strtoul)(char *bf, size_t size, struct syscall_arg *arg, u64 *val); unsignedlong (*mask_val)(struct syscall_arg *arg, unsignedlong val); void *parm; constchar *name;
u16 nr_entries; // for arrays bool from_user; bool show_zero; #ifdef HAVE_LIBBPF_SUPPORT conststruct btf_type *type; int type_id; /* used in btf_dump */ #endif
};
/*
 * The evsel->priv as used by 'perf trace'
 * sc:	for raw_syscalls:sys_{enter,exit} and syscalls:sys_{enter,exit}_SYSCALLNAME
 * fmt: for all the other tracepoints
 */
struct evsel_trace {
	/* Embedded by value, so it shares the lifetime of this struct. */
	struct syscall_tp	sc;
	/* Released with zfree() by evsel_trace__delete(). */
	struct syscall_arg_fmt  *fmt;
};
/*
 * Release an evsel_trace together with the argument formatter array
 * hanging off et->fmt.
 *
 * @et: the instance to destroy, NULL is accepted and is a no-op.
 */
static void evsel_trace__delete(struct evsel_trace *et)
{
	if (et == NULL)
		return;

	/* The formatter array is released first, then the container itself. */
	zfree(&et->fmt);
	free(et);
}
/* * Used with raw_syscalls:sys_{enter,exit} and with the * syscalls:sys_{enter,exit}_SYSCALL tracepoints
*/ staticinlinestruct syscall_tp *__evsel__syscall_tp(struct evsel *evsel)
{ struct evsel_trace *et = evsel->priv;
/* * Used with all the other tracepoints.
*/ staticinlinestruct syscall_arg_fmt *__evsel__syscall_arg_fmt(struct evsel *evsel)
{ struct evsel_trace *et = evsel->priv;
static size_t syscall_arg__scnprintf_char_array(char *bf, size_t size, struct syscall_arg *arg)
{ // XXX Hey, maybe for sched:sched_switch prev/next comm fields we can // fill missing comms using thread__set_comm()... // here or in a special syscall_arg__scnprintf_pid_sched_tp... return scnprintf(bf, size, "\"%-.*s\"", arg->fmt->nr_entries ?: arg->len, arg->val);
}
/*
 * Initializer for an argument formatter entry that prints a filename and is
 * flagged as flowing from user space (.from_user = true).
 *
 * 'argname' is just documentational at this point, to remove the previous
 * comment with that info; it is not used in the expansion.
 */
#define SCA_FILENAME_FROM_USER(argname) \
	  { .scnprintf	= SCA_FILENAME, \
	    .from_user	= true, }
/* pretty print the struct data here */ if (btf_dump__dump_type_data(btf_dump, type_id, arg->augmented.args->value, type->size, &dump_data_opts) == 0) return 0;
if (arg_fmt->type == NULL) { // Check if this is an enum and if we have the BTF type for it.
syscall_arg_fmt__cache_btf_enum(arg_fmt, trace->btf, type);
}
// Did we manage to find a BTF type for the syscall/tracepoint argument? if (arg_fmt->type == NULL) return 0;
/**
 * struct syscall - what is known about one system call of one ELF machine.
 */
struct syscall {
	/** @e_machine: The ELF machine associated with the entry. */
	int e_machine;
	/** @id: id value from the tracepoint, the system call number. */
	int id;
	/**
	 * @tp_format: tracepoint format for this syscall, reset to NULL when
	 * reading it fails (see @nonexistent).
	 */
	struct tep_event    *tp_format;
	/** @nr_args: number of arguments, not counting the syscall number field. */
	int		    nr_args;
	/**
	 * @args_size: sum of the sizes of the syscall arguments, anything
	 * after that is augmented stuff: pathname for openat, etc.
	 */
	int		    args_size;
	/** @bpf_prog: BPF programs used for the sys_enter and sys_exit of this syscall. */
	struct {
		struct bpf_program *sys_enter,
				   *sys_exit;
	}		    bpf_prog;
	/** @is_exit: is this "exit" or "exit_group"? */
	bool		    is_exit;
	/**
	 * @is_open: is this "open" or "openat"? To associate the fd returned in
	 * sys_exit with the pathname in sys_enter.
	 */
	bool		    is_open;
	/**
	 * @nonexistent: Name lookup failed. Just a hole in the syscall table,
	 * syscall id not allocated.
	 */
	bool		    nonexistent;
	/** @use_btf: NOTE(review): presumably set when BTF is used for the args -- confirm. */
	bool		    use_btf;
	/** @args: the tracepoint format fields describing the arguments. */
	struct tep_format_field *args;
	/** @name: the syscall name, as printed in front of the argument list. */
	const char	    *name;
	/** @fmt: optional per-syscall formatting description, may be NULL. */
	const struct syscall_fmt  *fmt;
	/** @arg_fmt: per-argument formatters, indexed by argument position. */
	struct syscall_arg_fmt	  *arg_fmt;
};
/* * We need to have this 'calculated' boolean because in some cases we really * don't know what is the duration of a syscall, for instance, when we start * a session and some threads are waiting for a syscall to finish, say 'poll', * in which case all we can do is to print "( ? )" for duration and for the * start timestamp.
*/ static size_t fprintf_duration(unsignedlong t, bool calculated, FILE *fp)
{ double duration = (double)t / NSEC_PER_MSEC;
size_t printed = fprintf(fp, "(");
/** * filename.ptr: The filename char pointer that will be vfs_getname'd * filename.entry_str_pos: Where to insert the string translated from * filename.ptr by the vfs_getname tracepoint/kprobe. * ret_scnprintf: syscall args may set this to a different syscall return * formatter, for instance, fcntl may return fds, file flags, etc.
*/ struct thread_trace {
u64 entry_time; bool entry_pending; unsignedlong nr_events; unsignedlong pfmaj, pfmin; char *entry_str; double runtime_ms;
size_t (*ret_scnprintf)(char *bf, size_t size, struct syscall_arg *arg); struct { unsignedlong ptr; shortint entry_str_pos; bool pending_open; unsignedint namelen; char *name;
} filename; struct { int max; struct file *table;
} files;
static size_t syscall_arg__scnprintf_augmented_string(struct syscall_arg *arg, char *bf, size_t size)
{ struct augmented_arg *augmented_arg = arg->augmented.args;
size_t printed = scnprintf(bf, size, "\"%.*s\"", augmented_arg->size, augmented_arg->value); /* * So that the next arg with a payload can consume its augmented arg, i.e. for rename* syscalls * we would have two strings, each prefixed by its size.
*/ int consumed = sizeof(*augmented_arg) + augmented_arg->size;
/* * We're handling tstamp=0 as an undefined tstamp, i.e. like when we are * using ttrace->entry_time for a thread that receives a sys_exit without * first having received a sys_enter ("poll" issued before tracing session * starts, lost sys_enter exit due to ring buffer overflow).
*/ static size_t trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
{ if (tstamp > 0) return __trace__fprintf_tstamp(trace, tstamp, fp);
if (symbol_conf.kptr_restrict) {
pr_warning("Kernel address maps (/proc/{kallsyms,modules}) are restricted.\n\n" "Check /proc/sys/kernel/kptr_restrict and /proc/sys/kernel/perf_event_paranoid.\n\n" "Kernel samples will not be resolved.\n");
machine->kptr_restrict_warned = true; return NULL;
}
for (; field; field = field->next, ++arg) {
last_field = field;
if (arg->scnprintf) continue;
len = strlen(field->name);
// As far as heuristics (or intention) goes this seems to hold true, and makes sense! if ((field->flags & TEP_FIELD_IS_POINTER) && strstarts(field->type, "const "))
arg->from_user = true;
/* * Fails to read trace point format via sysfs node, so the trace point * doesn't exist. Set the 'nonexistent' flag as true.
*/ if (IS_ERR(sc->tp_format)) {
sc->nonexistent = true;
err = PTR_ERR(sc->tp_format);
sc->tp_format = NULL; return err;
}
/* * The tracepoint format contains __syscall_nr field, so it's one more * than the actual number of syscall arguments.
*/ if (syscall__alloc_arg_fmts(sc, sc->tp_format->format.nr_fields - 1)) return -ENOMEM;
sc->args = sc->tp_format->format.fields; /* * We need to check and discard the first variable '__syscall_nr' * or 'nr' that mean the syscall number. It is needless here. * So drop '__syscall_nr' or 'nr' field but does not exist on older kernels.
*/ if (sc->args && (!strcmp(sc->args->name, "__syscall_nr") || !strcmp(sc->args->name, "nr"))) {
sc->args = sc->args->next;
--sc->nr_args;
}
if (trace->ev_qualifier_ids.entries == NULL) {
fputs("Error:\tNot enough memory for allocating events qualifier ids\n",
trace->output);
err = -EINVAL; goto out;
}
strlist__for_each_entry(pos, trace->ev_qualifier) { constchar *sc = pos->s; /* * TODO: Assume more than the validation/warnings are all for * the same binary type as perf.
*/ int id = syscalltbl__id(EM_HOST, sc), match_next = -1;
if (id < 0) {
id = syscalltbl__strglobmatch_first(EM_HOST, sc, &match_next); if (id >= 0) goto matches;
if (in_ev_qualifier) return !trace->not_ev_qualifier;
return trace->not_ev_qualifier;
}
/* * args is to be interpreted as a series of longs but we need to handle * 8-byte unaligned accesses. args points to raw_data within the event * and raw_data is guaranteed to be 8-byte unaligned because it is * preceded by raw_size which is a u32. So we need to copy args to a temp * variable to read it. Most notably this avoids extended load instructions * on unaligned addresses
*/ unsignedlong syscall_arg__val(struct syscall_arg *arg, u8 idx)
{ unsignedlong val; unsignedchar *p = arg->args + sizeof(unsignedlong) * idx;
/* * Check if the value is in fact zero, i.e. mask whatever needs masking, such * as mount 'flags' argument that needs ignoring some magic flag, see comment * in tools/perf/trace/beauty/mount_flags.c
*/ staticunsignedlong syscall_arg_fmt__mask_val(struct syscall_arg_fmt *fmt, struct syscall_arg *arg, unsignedlong val)
{ if (fmt && fmt->mask_val) return fmt->mask_val(arg, val);
/* * Things like fcntl will set this in its 'cmd' formatter to pick the * right formatter for the return value (an fd? file flags?), which is * not needed for syscalls that always return a given type, say an fd.
*/
ttrace->ret_scnprintf = NULL;
if (sc->args != NULL) { struct tep_format_field *field;
for (field = sc->args; field;
field = field->next, ++arg.idx, bit <<= 1) { if (arg.mask & bit) continue;
arg.fmt = &sc->arg_fmt[arg.idx];
val = syscall_arg__val(&arg, arg.idx); /* * Some syscall args need some mask, most don't and * return val untouched.
*/
val = syscall_arg_fmt__mask_val(&sc->arg_fmt[arg.idx], &arg, val);
/* * Suppress this argument if its value is zero and show_zero * property isn't set. * * If it has a BTF type, then override the zero suppression knob * as the common case is for zero in an enum to have an associated entry.
*/ if (val == 0 && !trace->show_zeros &&
!(sc->arg_fmt && sc->arg_fmt[arg.idx].show_zero) &&
!(sc->arg_fmt && sc->arg_fmt[arg.idx].strtoul == STUL_BTF_TYPE)) continue;
printed += syscall_arg_fmt__scnprintf_val(&sc->arg_fmt[arg.idx],
bf + printed, size - printed, &arg, val);
}
} elseif (IS_ERR(sc->tp_format)) { /* * If we managed to read the tracepoint /format file, then we * may end up not having any args, like with gettid(), so only * print the raw args when we didn't manage to read it.
*/ while (arg.idx < sc->nr_args) { if (arg.mask & bit) goto next_arg;
val = syscall_arg__val(&arg, arg.idx); if (printed)
printed += scnprintf(bf + printed, size - printed, ", ");
printed += syscall__scnprintf_name(sc, bf + printed, size - printed, &arg);
printed += syscall_arg_fmt__scnprintf_val(&sc->arg_fmt[arg.idx], bf + printed, size - printed, &arg, val);
next_arg:
++arg.idx;
bit <<= 1;
}
}
staticstruct syscall *trace__syscall_info(struct trace *trace, struct evsel *evsel, int e_machine, int id)
{ struct syscall *sc; int err = 0;
if (id < 0) {
/* * XXX: Noticed on x86_64, reproduced as far back as 3.0.36, haven't tried * before that, leaving at a higher verbosity level till that is * explained. Reproduced with plain ftrace with: * * echo 1 > /t/events/raw_syscalls/sys_exit/enable * grep "NR -1 " /t/trace_pipe * * After generating some load on the machine.
*/ if (verbose > 1) { static u64 n;
fprintf(trace->output, "Invalid syscall %d id, skipping (%s, %" PRIu64 ") ...\n",
id, evsel__name(evsel), ++n);
} return NULL;
}
staticvoid *syscall__augmented_args(struct syscall *sc, struct perf_sample *sample, int*augmented_args_size, int raw_augmented_args_size)
{ /* * For now with BPF raw_augmented we hook into raw_syscalls:sys_enter * and there we get all 6 syscall args plus the tracepoint common fields * that gets calculated at the start and the syscall_nr (another long). * So we check if that is the case and if so don't look after the * sc->args_size but always after the full raw_syscalls:sys_enter payload, * which is fixed. * * We'll revisit this later to pass s->args_size to the BPF augmenter * (now tools/perf/examples/bpf/augmented_raw_syscalls.c, so that it * copies only what we need for each syscall, like what happens when we * use syscalls:sys_enter_NAME, so that we reduce the kernel/userspace * traffic to just what is needed for each syscall.
*/ int args_size = raw_augmented_args_size ?: sc->args_size;
if ((size_t)(*augmented_args_size) > sizeof(argbuf)) return NULL;
/* * The perf ring-buffer is 8-byte aligned but sample->raw_data * is not because it's preceded by u32 size. Later, beautifier * will use the augmented args with stricter alignments like in * some struct. To make sure it's aligned, let's copy the args * into a static buffer as it's single-threaded for now.
*/
memcpy(argbuf, sample->raw_data + args_size, *augmented_args_size);
return argbuf;
} return NULL;
}
staticint trace__sys_enter(struct trace *trace, struct evsel *evsel, union perf_event *event __maybe_unused, struct perf_sample *sample)
{ char *msg; void *args; int printed = 0; struct thread *thread; int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1; int augmented_args_size = 0, e_machine; void *augmented_args = NULL; struct syscall *sc; struct thread_trace *ttrace;
if (ttrace->entry_str == NULL) {
ttrace->entry_str = malloc(trace__entry_str_size); if (!ttrace->entry_str) goto out_put;
}
if (!(trace->duration_filter || trace->summary_only || trace->min_stack))
trace__printf_interrupted_entry(trace); /* * If this is raw_syscalls.sys_enter, then it always comes with the 6 possible * arguments, even if the syscall being handled, say "openat", uses only 4 arguments * this breaks syscall__augmented_args() check for augmented args, as we calculate * syscall->args_size using each syscalls:sys_enter_NAME tracefs format file, * so when handling, say the openat syscall, we end up getting 6 args for the * raw_syscalls:sys_enter event, when we expected just 4, we end up mistakenly * thinking that the extra 2 u64 args are the augmented filename, so just check * here and avoid using augmented syscalls when the evsel is the raw_syscalls one.
*/ if (evsel != trace->syscalls.events.sys_enter)
augmented_args = syscall__augmented_args(sc, sample, &augmented_args_size, trace->raw_augmented_syscalls_args_size);
ttrace->entry_time = sample->time;
msg = ttrace->entry_str;
printed += scnprintf(msg + printed, trace__entry_str_size - printed, "%s(", sc->name);
thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
e_machine = thread__e_machine(thread, trace->host);
sc = trace__syscall_info(trace, evsel, e_machine, id); if (sc == NULL) goto out_put;
ttrace = thread__trace(thread, trace); /* * We need to get ttrace just to make sure it is there when syscall__scnprintf_args() * and the rest of the beautifiers accessing it via struct syscall_arg touches it.
*/ if (ttrace == NULL) goto out_put;
fprintf(trace->output, "%ld", ret); if (child != NULL) { if (thread__comm_set(child))
fprintf(trace->output, " (%s)", thread__comm_str(child));
thread__put(child);
}
} else goto signed_print;
fputc('\n', trace->output);
/* * We only consider an 'event' for the sake of --max-events a non-filtered * sys_enter + sys_exit and other tracepoint events.
*/ if (++trace->nr_events_printed == trace->max_events && trace->max_events != ULONG_MAX)
interrupted = true;
switch (op) { case BINARY_PRINT_CHAR_DATA: return fprintf(fp, "%c", isprint(ch) ? ch : '.'); case BINARY_PRINT_DATA_BEGIN: case BINARY_PRINT_LINE_BEGIN: case BINARY_PRINT_ADDR: case BINARY_PRINT_NUM_DATA: case BINARY_PRINT_NUM_PAD: case BINARY_PRINT_SEP: case BINARY_PRINT_CHAR_PAD: case BINARY_PRINT_LINE_END: case BINARY_PRINT_DATA_END: default: break;
}
val = (uintptr_t)(sample->raw_data + offset);
} else
val = format_field__intval(field, sample, evsel->needs_swap); /* * Some syscall args need some mask, most don't and * return val untouched.
*/
val = syscall_arg_fmt__mask_val(arg, &syscall_arg, val);
/* Suppress this argument if its value is zero and show_zero property isn't set. */ if (val == 0 && !trace->show_zeros && !arg->show_zero && arg->strtoul != STUL_BTF_TYPE) continue;
/* * XXX: Not having the associated syscall info or not finding/adding * the thread should never happen, but if it does... * fall thru and print it as a bpf_output event.
*/
}
/*
 * Record the session base time from the first suitable sample.
 *
 * trace->base_time is set to sample->time only while it is still zero, when
 * trace->full_time is not set and when the evsel carries PERF_SAMPLE_TIME
 * in its sample_type.
 */
static void trace__set_base_time(struct trace *trace,
				 struct evsel *evsel,
				 struct perf_sample *sample)
{
	/*
	 * BPF events were not setting PERF_SAMPLE_TIME, so be more robust
	 * and don't use sample->time unconditionally, we may end up having
	 * some other event in the future without PERF_SAMPLE_TIME for good
	 * reason, i.e. we may not be interested in its timestamps, just in
	 * it taking place, picking some piece of information when it
	 * appears in our event stream (vfs_getname comes to mind).
	 */
	if (trace->base_time == 0 && !trace->full_time &&
	    (evsel->core.attr.sample_type & PERF_SAMPLE_TIME))
		trace->base_time = sample->time;
}
if (callchain_param.enabled && !trace->kernel_syscallchains) { /* * We're interested only in the user space callchain * leading to the syscall, allow overriding that for * debugging reasons using --kernel_syscall_callchains
*/
sys_exit->core.attr.exclude_callchain_kernel = 1;
}
staticint trace__bpf_sys_enter_beauty_map(struct trace *trace, int e_machine, int key, unsignedint *beauty_array)
{ struct tep_format_field *field; struct syscall *sc = trace__syscall_info(trace, NULL, e_machine, key); conststruct btf_type *bt; char *struct_offset, *tmp, name[32]; bool can_augment = false; int i, cnt;
if (sc == NULL) return -1;
trace__load_vmlinux_btf(trace); if (trace->btf == NULL) return -1;
for (i = 0, field = sc->args; field; ++i, field = field->next) { // XXX We're only collecting pointer payloads _from_ user space if (!sc->arg_fmt[i].from_user) continue;
struct_offset = strstr(field->type, "struct "); if (struct_offset == NULL)
struct_offset = strstr(field->type, "union "); else
struct_offset++; // "union" is shorter
if (field->flags & TEP_FIELD_IS_POINTER && struct_offset) { /* struct or union (think BPF's attr arg) */
struct_offset += 6;
/* for 'struct foo *', we only want 'foo' */ for (tmp = struct_offset, cnt = 0; *tmp != ' ' && *tmp != '\0'; ++tmp, ++cnt) {
}
for (field = sc->args, candidate_field = pair->args;
field && candidate_field; field = field->next, candidate_field = candidate_field->next) { bool is_pointer = field->flags & TEP_FIELD_IS_POINTER,
candidate_is_pointer = candidate_field->flags & TEP_FIELD_IS_POINTER;
if (is_pointer) { if (!candidate_is_pointer) { // The candidate just doesn't copies our pointer arg, might copy other pointers we want. continue;
}
} else { if (candidate_is_pointer) { // The candidate might copy a pointer we don't have, skip it. goto next_candidate;
} continue;
}
if (strcmp(field->type, candidate_field->type)) goto next_candidate;
/* * This is limited in the BPF program but sys_write * uses "const char *" for its "buf" arg so we need to * use some heuristic that is kinda future proof...
*/ if (strcmp(field->type, "const char *") == 0 &&
!(strstr(field->name, "name") ||
strstr(field->name, "path") ||
strstr(field->name, "file") ||
strstr(field->name, "root") ||
strstr(field->name, "description"))) goto next_candidate;
is_candidate = true;
}
if (!is_candidate) goto next_candidate;
/* * Check if the tentative pair syscall augmenter has more pointers, if it has, * then it may be collecting that and we then can't use it, as it would collect * more than what is common to the two syscalls.
*/ if (candidate_field) { for (candidate_field = candidate_field->next; candidate_field; candidate_field = candidate_field->next) if (candidate_field->flags & TEP_FIELD_IS_POINTER) goto next_candidate;
}
pair_prog = pair->bpf_prog.sys_enter; /* * If the pair isn't enabled, then its bpf_prog.sys_enter will not * have been searched for, so search it here and if it returns the * unaugmented one, then ignore it, otherwise we'll reuse that BPF * program for a filtered syscall on a non-filtered one. * * For instance, we have "!syscalls:sys_enter_renameat" and that is * useful for "renameat2".
*/ if (pair_prog == NULL) {
pair_prog = trace__find_syscall_bpf_prog(trace, pair, pair->fmt ? pair->fmt->bpf_prog_name.sys_enter : NULL, "enter"); if (pair_prog == unaugmented_prog) goto next_candidate;
}
staticint trace__init_syscalls_bpf_prog_array_maps(struct trace *trace, int e_machine)
{ int map_enter_fd; int map_exit_fd; int beauty_map_fd; int err = 0; unsignedint beauty_array[6];
if (augmented_syscalls__get_map_fds(&map_enter_fd, &map_exit_fd, &beauty_map_fd) < 0) return -1;
// It'll get at least the "!raw_syscalls:unaugmented"
prog_fd = trace__bpf_prog_sys_enter_fd(trace, e_machine, key);
err = bpf_map_update_elem(map_enter_fd, &key, &prog_fd, BPF_ANY); if (err) break;
prog_fd = trace__bpf_prog_sys_exit_fd(trace, e_machine, key);
err = bpf_map_update_elem(map_exit_fd, &key, &prog_fd, BPF_ANY); if (err) break;
/* use beauty_map to tell BPF how many bytes to collect, set beauty_map's value here */
memset(beauty_array, 0, sizeof(beauty_array));
err = trace__bpf_sys_enter_beauty_map(trace, e_machine, key, (unsignedint *)beauty_array); if (err) continue;
err = bpf_map_update_elem(beauty_map_fd, &key, beauty_array, BPF_ANY); if (err) break;
}
/* * Now let's do a second pass looking for enabled syscalls without * an augmenter that have a signature that is a superset of another * syscall with an augmenter so that we can auto-reuse it. * * I.e. if we have an augmenter for the "open" syscall that has * this signature: * * int open(const char *pathname, int flags, mode_t mode); * * I.e. that will collect just the first string argument, then we * can reuse it for the 'creat' syscall, that has this signature: * * int creat(const char *pathname, mode_t mode); * * and for: * * int stat(const char *pathname, struct stat *statbuf); * int lstat(const char *pathname, struct stat *statbuf); * * Because the 'open' augmenter will collect the first arg as a string, * and leave alone all the other args, which already helps with * beautifying 'stat' and 'lstat''s pathname arg. * * Then, in time, when 'stat' gets an augmenter that collects both * first and second arg (this one on the raw_syscalls:sys_exit prog * array tail call), then that one will be used.
*/ for (int i = 0, num_idx = syscalltbl__num_idx(e_machine); i < num_idx; ++i) { int key = syscalltbl__id_at_idx(e_machine, i); struct syscall *sc = trace__syscall_info(trace, NULL, e_machine, key); struct bpf_program *pair_prog; int prog_fd;
if (sc == NULL || sc->bpf_prog.sys_enter == NULL) continue;
/* * For now we're just reusing the sys_enter prog, and if it * already has an augmenter, we don't need to find one.
*/ if (sc->bpf_prog.sys_enter != unaugmented_prog) continue;
/* * Look at all the other syscalls for one that has a signature * that is close enough that we can share:
*/
pair_prog = trace__find_usable_bpf_prog_entry(trace, sc); if (pair_prog == NULL) continue;
sc->bpf_prog.sys_enter = pair_prog;
/* * Update the BPF_MAP_TYPE_PROG_SHARED for raw_syscalls:sys_enter * with the fd for the program we're reusing:
*/
prog_fd = bpf_program__fd(sc->bpf_prog.sys_enter);
err = bpf_map_update_elem(map_enter_fd, &key, &prog_fd, BPF_ANY); if (err) break;
}
/*
 * Install the pid filters for this session.
 *
 * When pids were explicitly given (trace->filter_pids.nr > 0) they are
 * appended to the tracepoint filters of the evlist and, if that worked,
 * also handed to the augmented syscalls code. Otherwise, when the pid of
 * the first entry in the evlist thread map is -1, trace__set_filter_loop_pids()
 * is used instead.
 *
 * Returns 0 when there was nothing to do or everything worked, otherwise the
 * error returned by the helper that failed.
 */
static int trace__set_filter_pids(struct trace *trace)
{
	int err = 0;
	/*
	 * Better not use !target__has_task() here because we need to cover the
	 * case where no threads were specified in the command line, but a
	 * workload was, and in that case we will fill in the thread_map when
	 * we fork the workload in evlist__prepare_workload.
	 */
	if (trace->filter_pids.nr > 0) {
		err = evlist__append_tp_filter_pids(trace->evlist, trace->filter_pids.nr,
						    trace->filter_pids.entries);
		if (!err) {
			err = augmented_syscalls__set_filter_pids(trace->filter_pids.nr,
								  trace->filter_pids.entries);
		}
	} else if (perf_thread_map__pid(trace->evlist->core.threads, 0) == -1) {
		err = trace__set_filter_loop_pids(trace);
	}

	return err;
}
staticint __trace__deliver_event(struct trace *trace, union perf_event *event)
{ struct evlist *evlist = trace->evlist; struct perf_sample sample; int err;
while ((tok = strpbrk(left, "=<>!")) != NULL) { char *right = tok + 1, *right_end;
if (*right == '=')
++right;
while (isspace(*right))
++right;
if (*right == '\0') break;
while (!isalpha(*left)) if (++left == tok) { /* * Bail out, can't find the name of the argument that is being * used in the filter, let it try to set this filter, will fail later.
*/ return 0;
}
right_end = right + 1; while (isalnum(*right_end) || *right_end == '_' || *right_end == '|')
++right_end;
if (isalpha(*right)) { struct syscall_arg_fmt *fmt; int left_size = tok - left,
right_size = right_end - right; char arg[128], *type;
fmt = evsel__find_syscall_arg_fmt_by_name(evsel, arg, &type); if (fmt == NULL) {
pr_err("\"%s\" not found in \"%s\", can't set filter \"%s\"\n",
arg, evsel->name, evsel->filter); return -1;
}
if (trace->summary_bpf) { if (trace_prepare_bpf_summary(trace->summary_mode) < 0) goto out_delete_evlist;
if (trace->summary_only) goto create_maps;
}
if (!trace->raw_augmented_syscalls) { if (trace->trace_syscalls && trace__add_syscall_newtp(trace)) goto out_error_raw_syscalls;
if (trace->trace_syscalls)
trace->vfs_getname = evlist__add_vfs_getname(evlist);
}
if ((trace->trace_pgfaults & TRACE_PFMAJ)) {
pgfault_maj = evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MAJ); if (pgfault_maj == NULL) goto out_error_mem;
evsel__config_callchain(pgfault_maj, &trace->opts, &callchain_param);
evlist__add(evlist, pgfault_maj);
}
if ((trace->trace_pgfaults & TRACE_PFMIN)) {
pgfault_min = evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MIN); if (pgfault_min == NULL) goto out_error_mem;
evsel__config_callchain(pgfault_min, &trace->opts, &callchain_param);
evlist__add(evlist, pgfault_min);
}
/* Enable ignoring missing threads when -p option is defined. */
trace->opts.ignore_missing_thread = trace->opts.target.pid;
if (trace->sched &&
evlist__add_newtp(evlist, "sched", "sched_stat_runtime", trace__sched_stat_runtime)) goto out_error_sched_stat_runtime; /* * If a global cgroup was set, apply it to all the events without an * explicit cgroup. I.e.: * * trace -G A -e sched:*switch * * Will set all raw_syscalls:sys_{enter,exit}, pgfault, vfs_getname, etc * _and_ sched:sched_switch to the 'A' cgroup, while: * * trace -e sched:*switch -G A * * will only set the sched:sched_switch event to the 'A' cgroup, all the * other events (raw_syscalls:sys_{enter,exit}, etc are left "without" * a cgroup (on the root cgroup, sys wide, etc). * * Multiple cgroups: * * trace -G A -e sched:*switch -G B * * the syscall ones go to the 'A' cgroup, the sched:sched_switch goes * to the 'B' cgroup. * * evlist__set_default_cgroup() grabs a reference of the passed cgroup * only for the evsels still without a cgroup, i.e. evsel->cgroup == NULL.
*/ if (trace->cgroup)
evlist__set_default_cgroup(trace->evlist, trace->cgroup);
create_maps:
err = evlist__create_maps(evlist, &trace->opts.target); if (err < 0) {
fprintf(trace->output, "Problems parsing the target to trace, check your options!\n"); goto out_delete_evlist;
}
err = trace__symbols_init(trace, argc, argv, evlist); if (err < 0) {
fprintf(trace->output, "Problems initializing symbol libraries!\n"); goto out_delete_evlist;
}
if (trace->summary_mode == SUMMARY__BY_TOTAL && !trace->summary_bpf) {
trace->syscall_stats = alloc_syscall_stats(); if (IS_ERR(trace->syscall_stats)) goto out_delete_evlist;
}
if (forks) {
err = evlist__prepare_workload(evlist, &trace->opts.target, argv, false, NULL); if (err < 0) {
fprintf(trace->output, "Couldn't run the workload!\n"); goto out_delete_evlist;
}
workload_pid = evlist->workload.pid;
}
err = evlist__open(evlist); if (err < 0) goto out_error_open;
augmented_syscalls__setup_bpf_output();
err = trace__set_filter_pids(trace); if (err < 0) goto out_error_mem;
/* * TODO: Initialize for all host binary machine types, not just * those matching the perf binary.
*/
trace__init_syscalls_bpf_prog_array_maps(trace, EM_HOST);
if (trace->ev_qualifier_ids.nr > 0) {
err = trace__set_ev_qualifier_filter(trace); if (err < 0) goto out_errno;
/* * If the "close" syscall is not traced, then we will not have the * opportunity to, in syscall_arg__scnprintf_close_fd() invalidate the * fd->pathname table and were ending up showing the last value set by * syscalls opening a pathname and associating it with a descriptor or * reading it from /proc/pid/fd/ in cases where that doesn't make * sense. * * So just disable this beautifier (SCA_FD, SCA_FDAT) when 'close' is * not in use.
*/ /* TODO: support for more than just perf binary machine type close. */
trace->fd_path_disabled = !trace__syscall_enabled(trace, syscalltbl__id(EM_HOST, "close"));
err = trace__expand_filters(trace, &evsel); if (err) goto out_delete_evlist;
err = evlist__apply_filters(evlist, &evsel, &trace->opts.target); if (err < 0) goto out_error_apply_filters;
if (!trace->summary_only || !trace->summary_bpf) {
err = evlist__mmap(evlist, trace->opts.mmap_pages); if (err < 0) goto out_error_mmap;
}
if (!target__none(&trace->opts.target) && !trace->opts.target.initial_delay)
evlist__enable(evlist);
if (forks)
evlist__start_workload(evlist);
if (trace->opts.target.initial_delay) {
usleep(trace->opts.target.initial_delay * 1000);
evlist__enable(evlist);
}
if (trace->summary_bpf)
trace_start_bpf_summary();
/* * Now that we already used evsel->core.attr to ask the kernel to setup the * events, lets reuse evsel->core.attr.sample_max_stack as the limit in * trace__resolve_callchain(), allowing per-event max-stack settings * to override an explicitly set --max-stack global setting.
*/
evlist__for_each_entry(evlist, evsel) { if (evsel__has_callchain(evsel) &&
evsel->core.attr.sample_max_stack == 0)
evsel->core.attr.sample_max_stack = trace->max_stack;
}
again:
before = trace->nr_events;
for (i = 0; i < evlist->core.nr_mmaps; i++) { union perf_event *event; struct mmap *md;
md = &evlist->mmap[i]; if (perf_mmap__read_init(&md->core) < 0) continue;
while ((event = perf_mmap__read_event(&md->core)) != NULL) {
++trace->nr_events;
err = trace__deliver_event(trace, event); if (err) goto out_disable;
if (trace->errno_summary && stats->nr_failures) { int e;
for (e = 0; e < stats->max_errno; ++e) { if (stats->errnos[e] != 0)
fprintf(fp, "\t\t\t\t%s: %d\n", perf_env__arch_strerrno(trace->host->env, e + 1), stats->errnos[e]);
}
}
}
}
/* * XXX: Hackish, just splitting the combined -e+--event (syscalls * (raw_syscalls:{sys_{enter,exit}} + events (tracepoints, HW, SW, etc) to use * existing facilities unchanged (trace->ev_qualifier + parse_options()). * * It'd be better to introduce a parse_options() variant that would return a * list with the terms it didn't match to an event...
*/ staticint trace__parse_events_option(conststruct option *opt, constchar *str, int unset __maybe_unused)
{ struct trace *trace = (struct trace *)opt->value; constchar *s = str; char *sep = NULL, *lists[2] = { NULL, NULL, }; int len = strlen(str) + 1, err = -1, list, idx; char *strace_groups_dir = system_path(STRACE_GROUPS_DIR); char group_name[PATH_MAX]; conststruct syscall_fmt *fmt;
if (strace_groups_dir == NULL) return -1;
if (*s == '!') {
++s;
trace->not_ev_qualifier = true;
}
while (1) { if ((sep = strchr(s, ',')) != NULL)
*sep = '\0';
list = 0; /* TODO: support for more than just perf binary machine type syscalls. */ if (syscalltbl__id(EM_HOST, s) >= 0 ||
syscalltbl__strglobmatch_first(EM_HOST, s, &idx) >= 0) {
list = 1; goto do_concat;
}
fmt = syscall_fmt__find_by_alias(s); if (fmt != NULL) {
list = 1;
s = fmt->name;
} else {
path__join(group_name, sizeof(group_name), strace_groups_dir, s); if (access(group_name, R_OK) == 0)
list = 1;
}
do_concat: if (lists[list]) {
sprintf(lists[list] + strlen(lists[list]), ",%s", s);
} else {
lists[list] = malloc(len); if (lists[list] == NULL) goto out;
strcpy(lists[list], s);
}
if (trace.evlist == NULL) {
pr_err("Not enough memory to run!\n");
err = -ENOMEM; goto out;
}
/* * Parsing .perfconfig may entail creating a BPF event, that may need * to create BPF maps, so bump RLIM_MEMLOCK as the default 64K setting * is too small. This affects just this process, not touching the * global setting. If it fails we'll get something in 'perf trace -v' * to help diagnose the problem.
*/
rlimit__bump_memlock();
err = perf_config(trace__config, &trace); if (err) goto out;
/* * Here we already passed thru trace__parse_events_option() and it has * already figured out whether -e syscall_name was used; if it wasn't but * --event foo:bar was, the user is interested _just_ in those, say, * tracepoint events, not in the strace-like syscall-name-based mode. * * This is important because we need to check if strace-like mode is * needed to decide if we should filter out the eBPF * __augmented_syscalls__ code, if it is in the mix, say, via * .perfconfig trace.add_events, and filter those out.
*/ if (!trace.trace_syscalls && !trace.trace_pgfaults &&
trace.evlist->core.nr_entries == 0 /* Was --events used? */) {
trace.trace_syscalls = true;
} /* * Now that we have --verbose figured out, lets see if we need to parse * events from .perfconfig, so that if those events fail parsing, say some * BPF program fails, then we'll be able to use --verbose to see what went * wrong in more detail.
*/ if (trace.perfconfig_events != NULL) { struct parse_events_error parse_err;
parse_events_error__init(&parse_err);
err = parse_events(trace.evlist, trace.perfconfig_events, &parse_err); if (err)
parse_events_error__print(&parse_err, trace.perfconfig_events);
parse_events_error__exit(&parse_err); if (err) goto out;
}
if ((nr_cgroups || trace.cgroup) && !trace.opts.target.system_wide) {
usage_with_options_msg(trace_usage, trace_options, "cgroup monitoring only available in system-wide mode");
}
if (!trace.trace_syscalls) goto skip_augmentation;
if ((argc >= 1) && (strcmp(argv[0], "record") == 0)) {
pr_debug("Syscall augmentation fails with record, disabling augmentation"); goto skip_augmentation;
}
if (trace.summary_bpf) { if (!trace.opts.target.system_wide) { /* TODO: Add filters in the BPF to support other targets. */
pr_err("Error: --bpf-summary only works for system-wide mode.\n"); goto out;
} if (trace.summary_only) goto skip_augmentation;
}
err = augmented_syscalls__prepare(); if (err < 0) goto skip_augmentation;
trace__add_syscall_newtp(&trace);
err = augmented_syscalls__create_bpf_output(trace.evlist); if (err == 0)
trace.syscalls.events.bpf_output = evlist__last(trace.evlist);
skip_augmentation:
err = -1;
if (trace.trace_pgfaults) {
trace.opts.sample_address = true;
trace.opts.sample_time = true;
}
if (trace.opts.mmap_pages == UINT_MAX)
mmap_pages_user_set = false;
if (callchain_param.enabled) { if (!mmap_pages_user_set && geteuid() == 0)
trace.opts.mmap_pages = perf_event_mlock_kb_in_pages() * 4;
symbol_conf.use_callchain = true;
}
if (trace.evlist->core.nr_entries > 0) { bool use_btf = false;
evlist__set_default_evsel_handler(trace.evlist, trace__event_handler); if (evlist__set_syscall_tp_fields(trace.evlist, &use_btf)) {
perror("failed to set syscalls:* tracepoint fields"); goto out;
}
if (use_btf)
trace__load_vmlinux_btf(&trace);
}
/* * If we are augmenting syscalls, then combine what we put in the * __augmented_syscalls__ BPF map with what is in the * syscalls:sys_exit_FOO tracepoints, i.e. just like we do without BPF, * combining raw_syscalls:sys_enter with raw_syscalls:sys_exit. * * We'll switch to look at two BPF maps, one for sys_enter and the * other for sys_exit when we start augmenting the sys_exit paths with * buffers that are being copied from kernel to userspace, think 'read' * syscall.
*/ if (trace.syscalls.events.bpf_output) {
evlist__for_each_entry(trace.evlist, evsel) { bool raw_syscalls_sys_exit = evsel__name_is(evsel, "raw_syscalls:sys_exit");
if (raw_syscalls_sys_exit) {
trace.raw_augmented_syscalls = true; goto init_augmented_syscall_tp;
}
if (trace.syscalls.events.bpf_output->priv == NULL &&
strstr(evsel__name(evsel), "syscalls:sys_enter")) { struct evsel *augmented = trace.syscalls.events.bpf_output; if (evsel__init_augmented_syscall_tp(augmented, evsel) ||
evsel__init_augmented_syscall_tp_args(augmented)) goto out; /* * Augmented is __augmented_syscalls__ BPF_OUTPUT event * Above we made sure we can get from the payload the tp fields * that we get from syscalls:sys_enter tracefs format file.
*/
augmented->handler = trace__sys_enter; /* * Now we do the same for the *syscalls:sys_enter event so that * if we handle it directly, i.e. if the BPF prog returns 0 so * as not to filter it, then we'll handle it just like we would * for the BPF_OUTPUT one:
*/ if (evsel__init_augmented_syscall_tp(evsel, evsel) ||
evsel__init_augmented_syscall_tp_args(evsel)) goto out;
evsel->handler = trace__sys_enter;
}
if (strstarts(evsel__name(evsel), "syscalls:sys_exit_")) { struct syscall_tp *sc;
init_augmented_syscall_tp: if (evsel__init_augmented_syscall_tp(evsel, evsel)) goto out;
sc = __evsel__syscall_tp(evsel); /* * For now with BPF raw_augmented we hook into * raw_syscalls:sys_enter and there we get all * 6 syscall args plus the tracepoint common * fields and the syscall_nr (another long). * So we check if that is the case and if so * don't look after the sc->args_size but * always after the full raw_syscalls:sys_enter * payload, which is fixed. * * We'll revisit this later to pass * s->args_size to the BPF augmenter (now * tools/perf/examples/bpf/augmented_raw_syscalls.c, * so that it copies only what we need for each * syscall, like what happens when we use * syscalls:sys_enter_NAME, so that we reduce * the kernel/userspace traffic to just what is * needed for each syscall.
*/ if (trace.raw_augmented_syscalls)
trace.raw_augmented_syscalls_args_size = (6 + 1) * sizeof(long) + sc->id.offset;
evsel__init_augmented_syscall_tp_ret(evsel);
evsel->handler = trace__sys_exit;
}
}
}
/* Using just --errno-summary will trigger --summary */ if (trace.errno_summary && !trace.summary && !trace.summary_only)
trace.summary_only = true;
/* summary_only implies summary option, but don't overwrite summary if set */ if (trace.summary_only)
trace.summary = trace.summary_only;
/* Keep exited threads, otherwise information might be lost for summary */ if (trace.summary) {
symbol_conf.keep_exited_threads = true; if (trace.summary_mode == SUMMARY__NONE)
trace.summary_mode = SUMMARY__BY_THREAD;
if (!trace.summary_bpf && trace.summary_mode == SUMMARY__BY_CGROUP) {
pr_err("Error: --summary-mode=cgroup only works with --bpf-summary\n");
err = -EINVAL; goto out;
}
}
if (output_name != NULL) {
err = trace__open_output(&trace, output_name); if (err < 0) {
perror("failed to create output file"); goto out;
}
}
err = evswitch__init(&trace.evswitch, trace.evlist, stderr); if (err) goto out_close;
¤ Diese beiden folgenden Angebotsgruppen bietet das Unternehmen0.137Angebot
(Wie Sie bei der Firma Beratungs- und Dienstleistungen beauftragen können 2026-04-29)
¤
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit noch Richtigkeit
noch Qualität der bereitgestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.