// SPDX-License-Identifier: GPL-2.0
/*
 * event tracer
 *
 * Copyright (C) 2008 Red Hat Inc, Steven Rostedt <srostedt@redhat.com>
 *
 *  - Added format output of fields of the trace point.
 *    This was based off of work by Tom Zanussi <tzanussi@gmail.com>.
 *
 */
/*
 * Walk every event file of every trace array (nested iteration).
 * Double loops, do not use break, only goto's work.
 * NOTE(review): the matching loop-closing macro (presumably
 * while_for_each_event_file()) is not visible in this chunk.
 */
#define do_for_each_event_file(tr, file)			\
	list_for_each_entry(tr, &ftrace_trace_arrays, list) {	\
		list_for_each_entry(file, &tr->events, list)
/*
 * Allocate one ftrace_event_field and register it on @head.
 * Returns -ENOMEM when the slab allocation fails.
 * NOTE(review): the body is truncated in this chunk — the field
 * initialization and list insertion that normally follow are not visible.
 */
static int __trace_define_field(struct list_head *head, const char *type,
				const char *name, int offset, int size,
				int is_signed, int filter_type, int len,
				int need_test)
{
	struct ftrace_event_field *field;

	field = kmem_cache_alloc(field_cachep, GFP_TRACE);
	if (!field)
		return -ENOMEM;
/*
 * Public wrapper to define a field on an event's field list.
 * NOTE(review): body truncated in this chunk after the local declaration;
 * presumably forwards to __trace_define_field() — confirm against full file.
 */
int trace_define_field(struct trace_event_call *call, const char *type,
		       const char *name, int offset, int size, int is_signed,
		       int filter_type)
{
	struct list_head *head;
/*
 * Extended variant of trace_define_field() taking @len and @need_test.
 * NOTE(review): body truncated in this chunk after the local declaration.
 */
static int trace_define_field_ext(struct trace_event_call *call,
				  const char *type, const char *name,
				  int offset, int size, int is_signed,
				  int filter_type, int len, int need_test)
{
	struct list_head *head;
/*
 * Run-time version of trace_event_get_offsets_<call>(): returns the last
 * accessible offset of the trace fields, excluding __dynamic_array bytes.
 */
int trace_event_get_offsets(struct trace_event_call *call)
{
	struct ftrace_event_field *last;

	/*
	 * Fields are prepended as they are defined, so the list head's
	 * first entry is the field added last, i.e. the one with the
	 * largest offset.
	 */
	last = list_first_entry(trace_get_fields(call),
				struct ftrace_event_field, link);

	return last->offset + last->size;
}
/*
 * NOTE(review): interior of a field-lookup helper (presumably
 * find_event_field()); the signature and local declarations are cut off
 * in this chunk. Matches a "REC-><name>" reference against the event's
 * field table and returns the matching entry, or NULL.
 */
	/* Only "REC->"-prefixed references are field accesses */
	if (!(len = str_has_prefix(fmt, "REC->")))
		return NULL;
	fmt += len;
	/* Scan the identifier: alphanumerics and '_' only */
	for (p = fmt; *p; p++) {
		if (!isalnum(*p) && *p != '_')
			break;
	}
	len = p - fmt;

	/* Exact-length match against each declared field name */
	for (; field->type; field++) {
		if (strncmp(field->name, fmt, len) || field->name[len])
			continue;
		return field;
	}
	return NULL;
}
/* * Check if the referenced field is an array and return true, * as arrays are OK to dereference.
*/ staticbool test_field(constchar *fmt, struct trace_event_call *call)
{ struct trace_event_fields *field;
field = find_event_field(fmt, call); if (!field) returnfalse;
/* This is an array and is OK to dereference. */ return strchr(field->type, '[') != NULL;
}
/* Return true when @str occurs within @arg strictly before @end. */
static bool find_print_string(const char *arg, const char *str, const char *end)
{
	const char *hit = strstr(arg, str);

	if (!hit)
		return false;

	return hit < end;
}
/* Return true if the argument pointer is safe */ staticbool process_pointer(constchar *fmt, int len, struct trace_event_call *call)
{ constchar *r, *e, *a;
e = fmt + len;
/* Find the REC-> in the argument */
r = strstr(fmt, "REC->"); if (r && r < e) { /* * Addresses of events on the buffer, or an array on the buffer is * OK to dereference. There's ways to fool this, but * this is to catch common mistakes, not malicious code.
*/
a = strchr(fmt, '&'); if ((a && (a < r)) || test_field(r, call)) returntrue;
} elseif (find_print_string(fmt, "__get_dynamic_array(", e)) { returntrue;
} elseif (find_print_string(fmt, "__get_rel_dynamic_array(", e)) { returntrue;
} elseif (find_print_string(fmt, "__get_dynamic_array_len(", e)) { returntrue;
} elseif (find_print_string(fmt, "__get_rel_dynamic_array_len(", e)) { returntrue;
} elseif (find_print_string(fmt, "__get_sockaddr(", e)) { returntrue;
} elseif (find_print_string(fmt, "__get_rel_sockaddr(", e)) { returntrue;
} returnfalse;
}
/* Return true if the string is safe */
static bool process_string(const char *fmt, int len, struct trace_event_call *call)
{
	struct trace_event_fields *field;
	const char *r, *e, *s;

	e = fmt + len;

	/*
	 * There are several helper functions that return strings.
	 * If the argument contains a function, then assume its field is valid.
	 * It is considered that the argument has a function if it has:
	 *   alphanumeric or '_' before a parenthesis.
	 */
	s = fmt;
	do {
		r = strstr(s, "(");
		if (!r || r >= e)
			break;
		/* Walk backwards from '(' skipping whitespace */
		for (int i = 1; r - i >= s; i++) {
			char ch = *(r - i);
			if (isspace(ch))
				continue;
			if (isalnum(ch) || ch == '_')
				return true;
			/* Anything else, this isn't a function */
			break;
		}
		/* A function could be wrapped in parenthesis, try the next one */
		s = r + 1;
	} while (s < e);

	/*
	 * Check for arrays. If the argument has: foo[REC->val]
	 * then it is very likely that foo is an array of strings
	 * that are safe to use.
	 */
	r = strstr(s, "[");
	if (r && r < e) {
		r = strstr(r, "REC->");
		if (r && r < e)
			return true;
	}

	/*
	 * If there's any strings in the argument consider this arg OK as it
	 * could be: REC->field ? "foo" : "bar" and we don't want to get into
	 * verifying that logic here.
	 */
	if (find_print_string(fmt, "\"", e))
		return true;

	/* Dereferenced strings are also valid like any other pointer */
	if (process_pointer(fmt, len, call))
		return true;

	/* Make sure the field is found */
	field = find_event_field(fmt, call);
	if (!field)
		return false;

	/* Test this field's string before printing the event */
	call->flags |= TRACE_EVENT_FL_TEST_STR;
	field->needs_test = 1;
	/*
	 * NOTE(review): chunk truncated here — the expected trailing
	 * "return true;" and closing brace are not visible in this view.
	 */
/*
 * Examine the print fmt of the event looking for unsafe dereference
 * pointers using %p* that could be recorded in the trace event and
 * much later referenced after the pointer was freed. Dereferencing
 * pointers are OK, if it is dereferenced into the event itself.
 *
 * Implementation: a small state machine over call->print_fmt. The first
 * quoted string is scanned for %p* and %s conversions (recorded per
 * argument position in dereference_flags/string_flags); the remainder is
 * split on top-level commas and each argument whose flag is set is handed
 * to handle_dereference_arg() for validation.
 */
static void test_event_printk(struct trace_event_call *call)
{
	u64 dereference_flags = 0;	/* bit n set: arg n uses %p* deref */
	u64 string_flags = 0;		/* bit n set: arg n is a %s string */
	bool first = true;		/* still inside the leading fmt string */
	const char *fmt;
	int parens = 0;			/* paren depth outside quotes */
	char in_quote = 0;		/* current quote char, or 0 */
	int start_arg = 0;		/* index where current argument begins */
	int arg = 0;			/* current argument number */
	int i, e;

	fmt = call->print_fmt;
	if (!fmt)
		return;

	for (i = 0; fmt[i]; i++) {
		switch (fmt[i]) {
		case '\\':
			/* Skip the escaped character */
			i++;
			if (!fmt[i])
				return;
			continue;
		case '"':
		case '\'':
			/*
			 * The print fmt starts with a string that
			 * is processed first to find %p* usage,
			 * then after the first string, the print fmt
			 * contains arguments that are used to check
			 * if the dereferenced %p* usage is safe.
			 */
			if (first) {
				if (fmt[i] == '\'')
					continue;
				if (in_quote) {
					arg = 0;
					first = false;
					/*
					 * If there was no %p* uses
					 * the fmt is OK.
					 */
					if (!dereference_flags)
						return;
				}
			}
			if (in_quote) {
				if (in_quote == fmt[i])
					in_quote = 0;
			} else {
				in_quote = fmt[i];
			}
			continue;
		case '%':
			/* Conversions only matter inside the leading string */
			if (!first || !in_quote)
				continue;
			i++;
			if (!fmt[i])
				return;
			switch (fmt[i]) {
			case '%':
				continue;
			case 'p':
 do_pointer:
				/* Find dereferencing fields */
				switch (fmt[i + 1]) {
				case 'B': case 'R': case 'r':
				case 'b': case 'M': case 'm':
				case 'I': case 'i': case 'E':
				case 'U': case 'V': case 'N':
				case 'a': case 'd': case 'D':
				case 'g': case 't': case 'C':
				case 'O': case 'f':
					if (WARN_ONCE(arg == 63,
						      "Too many args for event: %s",
						      trace_event_name(call)))
						return;
					dereference_flags |= 1ULL << arg;
				}
				break;
			default:
			{
				bool star = false;
				int j;

				/* Increment arg if %*s exists. */
				for (j = 0; fmt[i + j]; j++) {
					if (isdigit(fmt[i + j]) ||
					    fmt[i + j] == '.')
						continue;
					if (fmt[i + j] == '*') {
						star = true;
						/* Handle %*pbl case */
						if (!j && fmt[i + 1] == 'p') {
							arg++;
							i++;
							goto do_pointer;
						}
						continue;
					}
					if ((fmt[i + j] == 's')) {
						if (star)
							arg++;
						if (WARN_ONCE(arg == 63,
							      "Too many args for event: %s",
							      trace_event_name(call)))
							return;
						dereference_flags |= 1ULL << arg;
						string_flags |= 1ULL << arg;
					}
					break;
				}
				break;
			} /* default */

			} /* switch */

			arg++;
			continue;
		case '(':
			if (in_quote)
				continue;
			parens++;
			continue;
		case ')':
			if (in_quote)
				continue;
			parens--;
			if (WARN_ONCE(parens < 0,
				      "Paren mismatch for event: %s\narg='%s'\n%*s",
				      trace_event_name(call),
				      fmt + start_arg,
				      (i - start_arg) + 5, "^"))
				return;
			continue;
		case ',':
			/* Only top-level commas separate arguments */
			if (in_quote || parens)
				continue;
			e = i;
			i++;
			while (isspace(fmt[i]))
				i++;

			/*
			 * If start_arg is zero, then this is the start of the
			 * first argument. The processing of the argument happens
			 * when the end of the argument is found, as it needs to
			 * handle paranthesis and such.
			 */
			if (!start_arg) {
				start_arg = i;
				/* Balance out the i++ in the for loop */
				i--;
				continue;
			}

			if (dereference_flags & (1ULL << arg)) {
				handle_dereference_arg(fmt + start_arg, string_flags,
						       e - start_arg,
						       &dereference_flags, arg, call);
			}

			start_arg = i;
			arg++;
			/* Balance out the i++ in the for loop */
			i--;
		}
	}

	/* Validate the final argument (no trailing comma follows it) */
	if (dereference_flags & (1ULL << arg)) {
		handle_dereference_arg(fmt + start_arg, string_flags,
				       i - start_arg,
				       &dereference_flags, arg, call);
	}

	/*
	 * If you triggered the below warning, the trace event reported
	 * uses an unsafe dereference pointer %p*. As the data stored
	 * at the trace event time may no longer exist when the trace
	 * event is printed, dereferencing to the original source is
	 * unsafe. The source of the dereference must be copied into the
	 * event itself, and the dereference must access the copy instead.
	 */
	if (WARN_ON_ONCE(dereference_flags)) {
		arg = 1;
		while (!(dereference_flags & 1)) {
			dereference_flags >>= 1;
			arg++;
		}
		pr_warn("event %s has unsafe dereference of argument %d\n",
			trace_event_name(call), arg);
		pr_warn("print_fmt: %s\n", fmt);
	}
}
/*
 * Register the event with the trace event subsystem and check the id.
 * NOTE(review): this chunk is glued — trace_event_raw_init() is cut off
 * after the id check, and the text below (the this_cpu_read return and
 * EXPORT_SYMBOL_GPL) is the tail of a different function, presumably
 * trace_event_ignore_this_pid(). Verify against the full file.
 */
int trace_event_raw_init(struct trace_event_call *call)
{
	int id;

	id = register_trace_event(&call->event);
	if (!id)
		return -ENODEV;
	/*
	 * This is recorded at every sched_switch for this task.
	 * Thus, even if the task migrates the ignore value will be the same.
	 */
	return this_cpu_read(tr->array_buffer.data->ignore_pid) != 0;
}
EXPORT_SYMBOL_GPL(trace_event_ignore_this_pid);
/*
 * NOTE(review): interior of an event-buffer setup path (presumably
 * trace_event_buffer_reserve()); signature and surrounding code are not
 * visible in this chunk.
 */
	/* Drop the event when a PID filter is active and this pid is ignored */
	if ((trace_file->flags & EVENT_FILE_FL_PID_FILTER) &&
	    trace_event_ignore_this_pid(trace_file))
		return NULL;

	/*
	 * If CONFIG_PREEMPTION is enabled, then the tracepoint itself disables
	 * preemption (adding one to the preempt_count). Since we are
	 * interested in the preempt_count at the time the tracepoint was
	 * hit, we need to subtract one to offset the increment.
	 */
	fbuffer->trace_ctx = tracing_gen_ctx_dec();
	fbuffer->trace_file = trace_file;
/*
 * Core enable/disable state machine for a single event file.
 * @enable: 0 to disable, 1 to enable.
 * @soft_disable: non-zero when the request comes from a soft-mode user
 *	(e.g. triggers), tracked via the file->sm_ref counter.
 *
 * NOTE(review): this chunk is truncated inside "case 1" — the
 * "if (!(file->flags & EVENT_FILE_FL_ENABLED)) {" guard and the
 * cmd/tgid local declarations referenced below are not visible here,
 * which is why the braces appear unbalanced. Verify against full file.
 */
static int __ftrace_event_enable_disable(struct trace_event_file *file,
					 int enable, int soft_disable)
{
	struct trace_event_call *call = file->event_call;
	struct trace_array *tr = file->tr;
	bool soft_mode = atomic_read(&file->sm_ref) != 0;
	int ret = 0;
	int disable;

	switch (enable) {
	case 0:
		/*
		 * When soft_disable is set and enable is cleared, the sm_ref
		 * reference counter is decremented. If it reaches 0, we want
		 * to clear the SOFT_DISABLED flag but leave the event in the
		 * state that it was. That is, if the event was enabled and
		 * SOFT_DISABLED isn't set, then do nothing. But if SOFT_DISABLED
		 * is set we do not want the event to be enabled before we
		 * clear the bit.
		 *
		 * When soft_disable is not set but the soft_mode is,
		 * we do nothing. Do not disable the tracepoint, otherwise
		 * "soft enable"s (clearing the SOFT_DISABLED bit) wont work.
		 */
		if (soft_disable) {
			if (atomic_dec_return(&file->sm_ref) > 0)
				break;
			disable = file->flags & EVENT_FILE_FL_SOFT_DISABLED;
			soft_mode = false;
			/* Disable use of trace_buffered_event */
			trace_buffered_event_disable();
		} else
			disable = !soft_mode;

		if (disable && (file->flags & EVENT_FILE_FL_ENABLED)) {
			clear_bit(EVENT_FILE_FL_ENABLED_BIT, &file->flags);
			if (file->flags & EVENT_FILE_FL_RECORDED_CMD) {
				tracing_stop_cmdline_record();
				clear_bit(EVENT_FILE_FL_RECORDED_CMD_BIT, &file->flags);
			}

			if (file->flags & EVENT_FILE_FL_RECORDED_TGID) {
				tracing_stop_tgid_record();
				clear_bit(EVENT_FILE_FL_RECORDED_TGID_BIT, &file->flags);
			}

			ret = call->class->reg(call, TRACE_REG_UNREGISTER, file);
			WARN_ON_ONCE(ret);
		}
		/* If in soft mode, just set the SOFT_DISABLE_BIT, else clear it */
		if (soft_mode)
			set_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &file->flags);
		else
			clear_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &file->flags);
		break;
	case 1:
		/*
		 * When soft_disable is set and enable is set, we want to
		 * register the tracepoint for the event, but leave the event
		 * as is. That means, if the event was already enabled, we do
		 * nothing (but set soft_mode). If the event is disabled, we
		 * set SOFT_DISABLED before enabling the event tracepoint, so
		 * it still seems to be disabled.
		 */
		if (!soft_disable)
			clear_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &file->flags);
		else {
			if (atomic_inc_return(&file->sm_ref) > 1)
				break;
			soft_mode = true;
			/* Enable use of trace_buffered_event */
			trace_buffered_event_enable();
		}
		/* NOTE(review): lines missing here (ENABLED guard, cmd/tgid setup) */
		ret = call->class->reg(call, TRACE_REG_REGISTER, file);
		if (ret) {
			if (cmd)
				tracing_stop_cmdline_record();
			if (tgid)
				tracing_stop_tgid_record();
			pr_info("event trace: Could not enable event "
				"%s\n", trace_event_name(call));
			break;
		}
		set_bit(EVENT_FILE_FL_ENABLED_BIT, &file->flags);

		/* WAS_ENABLED gets set but never cleared. */
		set_bit(EVENT_FILE_FL_WAS_ENABLED_BIT, &file->flags);
		}
		break;
	}

	return ret;
}
/*
 * Public entry point for enabling/disabling one event file; forwards
 * straight to the internal state machine.
 */
int trace_event_enable_disable(struct trace_event_file *file,
			       int enable, int soft_disable)
{
	int ret = __ftrace_event_enable_disable(file, enable, soft_disable);

	return ret;
}
/*
 * NOTE(review): interior of a sched_switch pid-filter probe; the enclosing
 * function is not visible in this chunk.
 */
	/*
	 * Sched switch is funny, as we only want to ignore it
	 * in the notrace case if both prev and next should be ignored.
	 */
	ret = trace_ignore_this_task(NULL, no_pid_list, prev) &&
	      trace_ignore_this_task(NULL, no_pid_list, next);
/*
 * Drop one reference on a subsystem directory; free it when the count
 * reaches zero. Also drops the reference this dir holds on its subsystem.
 */
static void __put_system_dir(struct trace_subsystem_dir *dir)
{
	WARN_ON_ONCE(dir->ref_count == 0);
	/* If the subsystem is about to be freed, the dir must be too */
	WARN_ON_ONCE(system_refcount(dir->subsystem) == 1 && dir->ref_count != 1);

	__put_system(dir->subsystem);

	dir->ref_count--;
	if (dir->ref_count == 0)
		kfree(dir);
}
/*
 * Release one reference on an event file; free the file once the count
 * drops to zero and the file was already marked FREED.
 */
void event_file_put(struct trace_event_file *file)
{
	/* A zero refcount here means put() was called once too often */
	if (WARN_ON_ONCE(!refcount_read(&file->ref))) {
		/* Salvage the object if it was already marked freed */
		if (file->flags & EVENT_FILE_FL_FREED)
			kmem_cache_free(file_cachep, file);
		return;
	}

	if (!refcount_dec_and_test(&file->ref))
		return;

	/* Count should only go to zero when it is freed */
	if (WARN_ON_ONCE(!(file->flags & EVENT_FILE_FL_FREED)))
		return;

	kmem_cache_free(file_cachep, file);
}
/*
 * __ftrace_set_clr_event(NULL, NULL, NULL, set) will set/unset all events.
 *
 * Walk all event files of @tr and enable (@set != 0) or disable those
 * matching @sub/@event (and @mod, when given). Caller holds event_mutex.
 * Returns 0 on success, the first enable/disable error, or -EINVAL when
 * nothing matched.
 */
static int
__ftrace_set_clr_event_nolock(struct trace_array *tr, const char *match,
			      const char *sub, const char *event, int set,
			      const char *mod)
{
	struct trace_event_file *file;
	struct trace_event_call *call;
	/* __free(kfree): auto-freed when this scope exits */
	char *module __free(kfree) = NULL;
	const char *name;
	int ret = -EINVAL;
	int eret = 0;

	if (mod) {
		char *p;

		module = kstrdup(mod, GFP_KERNEL);
		if (!module)
			return -ENOMEM;

		/* Replace all '-' with '_' as that's what modules do */
		for (p = strchr(module, '-'); p; p = strchr(p + 1, '-'))
			*p = '_';
	}

	list_for_each_entry(file, &tr->events, list) {

		call = file->event_call;

		/* If a module is specified, skip events that are not that module */
		if (module && (!call->module || strcmp(module_name(call->module), module)))
			continue;

		name = trace_event_name(call);

		if (!name || !call->class || !call->class->reg)
			continue;

		if (call->flags & TRACE_EVENT_FL_IGNORE_ENABLE)
			continue;

		if (sub && strcmp(sub, call->class->system) != 0)
			continue;

		if (event && strcmp(event, name) != 0)
			continue;

		ret = ftrace_event_enable_disable(file, set);

		/*
		 * Save the first error and return that. Some events
		 * may still have been enabled, but let the user
		 * know that something went wrong.
		 */
		if (ret && !eret)
			eret = ret;

		ret = eret;
	}

	/*
	 * If this is a module setting and nothing was found,
	 * check if the module was loaded. If it wasn't cache it.
	 */
	if (module && ret == -EINVAL && !eret)
		ret = cache_mod(tr, module, set, match, sub, event);

	return ret;
}
staticint __ftrace_set_clr_event(struct trace_array *tr, constchar *match, constchar *sub, constchar *event, int set, constchar *mod)
{ int ret;
mutex_lock(&event_mutex);
ret = __ftrace_set_clr_event_nolock(tr, match, sub, event, set, mod);
mutex_unlock(&event_mutex);
return ret;
}
/*
 * Parse a user-supplied "<subsystem>:<event>[:mod:<module>]" string in @buf
 * (modified in place, restored before return) and enable/disable the
 * matching events of @tr per @set.
 */
int ftrace_set_clr_event(struct trace_array *tr, char *buf, int set)
{
	char *event = NULL, *sub = NULL, *match, *mod;
	int ret;

	if (!tr)
		return -ENOENT;

	/* Modules events can be appended with :mod:<module> */
	mod = strstr(buf, ":mod:");
	if (mod) {
		*mod = '\0';
		/* move to the module name */
		mod += 5;
	}

	/*
	 * The buf format can be <subsystem>:<event-name>
	 *  *:<event-name> means any event by that name.
	 *  :<event-name> is the same.
	 *
	 *  <subsystem>:* means all events in that subsystem
	 *  <subsystem>: means the same.
	 *
	 *  <name> (no ':') means all events in a subsystem with
	 *  the name <name> or any event that matches <name>
	 */

	match = strsep(&buf, ":");
	if (buf) {
		sub = match;
		event = buf;
		match = NULL;

		/* Empty or "*" wildcards mean "any" */
		if (!strlen(sub) || strcmp(sub, "*") == 0)
			sub = NULL;
		if (!strlen(event) || strcmp(event, "*") == 0)
			event = NULL;
	} else if (mod) {
		/* Allow wildcard for no length or star */
		if (!strlen(match) || strcmp(match, "*") == 0)
			match = NULL;
	}

	ret = __ftrace_set_clr_event(tr, match, sub, event, set, mod);

	/* Put back the colon to allow this to be called again */
	if (buf)
		*(buf - 1) = ':';

	return ret;
}
/**
 * trace_set_clr_event - enable or disable an event
 * @system: system name to match (NULL for any system)
 * @event: event name to match (NULL for all events, within system)
 * @set: 1 to enable, 0 to disable
 *
 * This is a way for other parts of the kernel to enable or disable
 * event recording.
 *
 * Returns 0 on success, -EINVAL if the parameters do not match any
 * registered events.
 *
 * NOTE(review): body truncated in this chunk after fetching the top
 * trace array.
 */
int trace_set_clr_event(const char *system, const char *event, int set)
{
	struct trace_array *tr = top_trace_array();
/**
 * trace_array_set_clr_event - enable or disable an event for a trace array.
 * @tr: concerned trace array.
 * @system: system name to match (NULL for any system)
 * @event: event name to match (NULL for all events, within system)
 * @enable: true to enable, false to disable
 *
 * This is a way for other parts of the kernel to enable or disable
 * event recording.
 *
 * Returns 0 on success, -EINVAL if the parameters do not match any
 * registered events.
 *
 * NOTE(review): body truncated in this chunk after the local declaration.
 */
int trace_array_set_clr_event(struct trace_array *tr, const char *system,
			      const char *event, bool enable)
{
	int set;
/*
 * NOTE(review): interior of a seq_file iterator step (presumably s_next or
 * t_next); the signature and preceding code are not visible in this chunk.
 */
	list_for_each_entry_continue(file, &tr->events, list) {
		call = file->event_call;
		/*
		 * The ftrace subsystem is for showing formats only.
		 * They can not be enabled or disabled via the event files.
		 */
		if (call->class && call->class->reg &&
		    !(call->flags & TRACE_EVENT_FL_IGNORE_ENABLE))
			return file;
	}

	/*
	 * The iter is allocated in s_start() and passed via the 'v'
	 * parameter. To stop the iterator, NULL must be returned. But
	 * the return value is what the 'v' parameter in s_stop() receives
	 * and frees. Free iter here as it will no longer be used.
	 */
	kfree(iter);
	return NULL;
}
/*
 * NOTE(review): interior of a seq_file ->show callback; the enclosing
 * function is not visible in this chunk.
 */
	/* Event-file entries delegate to the plain event show routine */
	if (iter->type == SET_EVENT_FILE)
		return t_show(m, iter->file);

	/* When match is set, system and event are not */
	if (iter->event_mod->match) {
		seq_printf(m, "%s:mod:%s\n", iter->event_mod->match,
			   iter->event_mod->module);
		return 0;
	}
/*
 * NOTE(review): interior of the pid-filter seq_file start routine
 * (presumably p_start); signature not visible in this chunk.
 */
	/*
	 * Grab the mutex, to keep calls to p_next() having the same
	 * tr->filtered_pids as p_start() has.
	 * If we just passed the tr->filtered_pids around, then RCU would
	 * have been enough, but doing that makes things more complex.
	 */
	mutex_lock(&event_mutex);
	rcu_read_lock_sched();
/*
 * NOTE(review): interior of an event enable-file write handler; the
 * signature and local declarations are not visible in this chunk.
 * Accepts "0" or "1" from user space and toggles the event accordingly.
 */
	ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
	if (ret)
		return ret;

	/* Released automatically on any return below */
	guard(mutex)(&event_mutex);

	switch (val) {
	case 0:
	case 1:
		file = event_file_file(filp);
		if (!file)
			return -ENODEV;
		ret = tracing_update_buffers(file->tr);
		if (ret < 0)
			return ret;
		ret = ftrace_event_enable_disable(file, val);
		if (ret < 0)
			return ret;
		break;

	default:
		return -EINVAL;
	}

	*ppos += cnt;

	return cnt;
}
/*
 * Returns:
 *   0 : no events exist?
 *   1 : all events are disabled
 *   2 : all events are enabled
 *   3 : some events are enabled and some are disabled
 */
int trace_events_enabled(struct trace_array *tr, const char *system)
{
	struct trace_event_call *call;
	struct trace_event_file *file;
	int set = 0;

	/*
	 * NOTE(review): the loop header iterating tr->events (and the
	 * assignment of call/file) is missing from this chunk, as is the
	 * final return. Verify against the full file.
	 */
		if (system && strcmp(call->class->system, system) != 0)
			continue;

		/*
		 * We need to find out if all the events are set
		 * or if all events or cleared, or if we have
		 * a mixture.
		 */
		set |= (1 << !!(file->flags & EVENT_FILE_FL_ENABLED));

		/*
		 * If we have a mixture, no need to look further.
		 */
		if (set == 3)
			break;
	}
/*
 * NOTE(review): interior of the subsystem file open path (presumably
 * subsystem_open); signature and local declarations are not visible in
 * this chunk.
 */
	/* Make sure the system still exists */
	mutex_lock(&event_mutex);
	mutex_lock(&trace_types_lock);
	list_for_each_entry(iter_tr, &ftrace_trace_arrays, list) {
		list_for_each_entry(iter_dir, &iter_tr->systems, list) {
			if (iter_dir == inode->i_private) {
				/* Don't open systems with no events */
				tr = iter_tr;
				dir = iter_dir;
				if (dir->nr_events) {
					__get_system_dir(dir);
					system = dir->subsystem;
				}
				goto exit_loop;
			}
		}
	}
 exit_loop:
	mutex_unlock(&trace_types_lock);
	mutex_unlock(&event_mutex);

	if (!system)
		return -ENODEV;

	/* Still need to increment the ref count of the system */
	if (trace_array_get(tr) < 0) {
		put_system(dir);
		return -ENODEV;
	}

	/* Drop the references again if the generic open fails */
	ret = tracing_open_generic(inode, filp);
	if (ret < 0) {
		trace_array_put(tr);
		put_system(dir);
	}
/*
 * NOTE(review): interior of a subsystem file release routine; the
 * enclosing function is not visible in this chunk.
 */
	/*
	 * If dir->subsystem is NULL, then this is a temporary
	 * descriptor that was made for a trace_array to enable
	 * all subsystems.
	 */
	if (dir->subsystem)
		put_system(dir);
	else
		kfree(dir);
/*
 * NOTE(review): interior of the per-CPU pid-filter refresh callback
 * (presumably ignore_task_cpu); signature not visible in this chunk.
 */
	/*
	 * This function is called by on_each_cpu() while the
	 * event_mutex is held.
	 */
	pid_list = rcu_dereference_protected(tr->filtered_pids,
					     mutex_is_locked(&event_mutex));
	no_pid_list = rcu_dereference_protected(tr->filtered_no_pids,
						mutex_is_locked(&event_mutex));
/*
 * Hook the sched_switch tracepoint (highest and lowest priority probes)
 * so per-pid event filtering is kept up to date at every task switch,
 * then refresh the ignore state on all currently-running CPUs.
 */
static void register_pid_events(struct trace_array *tr)
{
	/*
	 * Register a probe that is called before all other probes
	 * to set ignore_pid if next or prev do not match.
	 * Register a probe this is called after all other probes
	 * to only keep ignore_pid set if next pid matches.
	 */
	register_trace_prio_sched_switch(event_filter_pid_sched_switch_probe_pre,
					 tr, INT_MAX);
	register_trace_prio_sched_switch(event_filter_pid_sched_switch_probe_post,
					 tr, 0);

	/*
	 * Ignoring of pids is done at task switch. But we have to
	 * check for those tasks that are currently running.
	 * Always do this in case a pid was appended or removed.
	 */
	on_each_cpu(ignore_task_cpu, tr, 1);
	/*
	 * NOTE(review): chunk ends here — the closing brace and any further
	 * probe registrations are not visible in this view.
	 */
/*
 * NOTE(review): trailing non-code residue from a web-page capture (German
 * disclaimer), not part of the kernel source — should be removed.
 * Translation: "The information on this web page was carefully compiled to
 * the best of our knowledge. However, neither completeness, nor correctness,
 * nor quality of the provided information is guaranteed. Note: the color
 * syntax highlighting and the measurement are still experimental."
 */