// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2011, Red Hat Inc, Arnaldo Carvalho de Melo <acme@redhat.com>
 *
 * Parts came from builtin-{top,stat,record}.c, see those files for further
 * copyright notes.
 */
/*
 * Powerpc needs __SANE_USERSPACE_TYPES__ before <linux/types.h> to select
 * 'int-ll64.h' and avoid compile warnings when printing __u64 with %llu.
*/ #define __SANE_USERSPACE_TYPES__
/*
 * __evsel__sample_size - size in bytes of the fixed-size sample fields.
 * @sample_type: sample type bits of the event
 *
 * Every bit of @sample_type that is also present in PERF_SAMPLE_MASK
 * accounts for one u64 slot.
 *
 * Return: number of selected bits multiplied by sizeof(u64).
 */
int __evsel__sample_size(u64 sample_type)
{
	u64 remaining = sample_type & PERF_SAMPLE_MASK;
	int nr_fields = 0;

	/* Each pass clears the lowest set bit, so we loop once per field. */
	while (remaining) {
		remaining &= remaining - 1;
		nr_fields++;
	}

	return nr_fields * sizeof(u64);
}
/**
 * __perf_evsel__calc_id_pos - calculate id_pos.
 * @sample_type: sample type
 *
 * This function returns the position of the event id (PERF_SAMPLE_ID or
 * PERF_SAMPLE_IDENTIFIER) in a sample event i.e. in the array of struct
 * perf_record_sample.
 *
 * Return: 0 when PERF_SAMPLE_IDENTIFIER is set, -1 when neither
 * PERF_SAMPLE_IDENTIFIER nor PERF_SAMPLE_ID is set, otherwise the number
 * of IP/TID/TIME/ADDR bits set in @sample_type (the fields counted ahead
 * of the id).
 */
static int __perf_evsel__calc_id_pos(u64 sample_type)
{
	int idx = 0;

	/* The identifier, when requested, is always the first entry. */
	if (sample_type & PERF_SAMPLE_IDENTIFIER)
		return 0;

	if (!(sample_type & PERF_SAMPLE_ID))
		return -1;

	/* Skip over each field that is counted before the id. */
	if (sample_type & PERF_SAMPLE_IP)
		idx += 1;

	if (sample_type & PERF_SAMPLE_TID)
		idx += 1;

	if (sample_type & PERF_SAMPLE_TIME)
		idx += 1;

	if (sample_type & PERF_SAMPLE_ADDR)
		idx += 1;

	return idx;
}
/** * __perf_evsel__calc_is_pos - calculate is_pos. * @sample_type: sample type * * This function returns the position (counting backwards) of the event id * (PERF_SAMPLE_ID or PERF_SAMPLE_IDENTIFIER) in a non-sample event i.e. if * sample_id_all is used there is an id sample appended to non-sample events.
*/ staticint __perf_evsel__calc_is_pos(u64 sample_type)
{ int idx = 1;
if (sample_type & PERF_SAMPLE_IDENTIFIER) return 1;
if (!(sample_type & PERF_SAMPLE_ID)) return -1;
if (sample_type & PERF_SAMPLE_CPU)
idx += 1;
if (sample_type & PERF_SAMPLE_STREAM_ID)
idx += 1;
/** * evsel__is_function_event - Return whether given evsel is a function * trace event * * @evsel - evsel selector to be tested * * Return %true if event is function trace event
*/ bool evsel__is_function_event(struct evsel *evsel)
{ #define FUNCTION_EVENT "ftrace:function"
/** * evsel__clone - create a new evsel copied from @orig * @orig: original evsel * * The assumption is that @orig is not configured nor opened yet. * So we only care about the attributes that can be set while it's parsed.
*/ struct evsel *evsel__clone(struct evsel *dest, struct evsel *orig)
{ struct evsel *evsel;
/* * Returns the group details for the specified leader, * with following rules. * * For record -e '{cycles,instructions}' * 'anon group { cycles:u, instructions:u }' * * For record -e 'cycles,instructions' and report --group * 'cycles:u, instructions:u'
*/ int evsel__group_desc(struct evsel *evsel, char *buf, size_t size)
{ int ret = 0; bool first = true; struct evsel *pos; constchar *group_name = evsel__group_name(evsel);
if (!evsel->forced_leader)
ret = scnprintf(buf, size, "%s { ", group_name);
for_each_group_evsel(pos, evsel) { if (symbol_conf.skip_empty &&
evsel__hists(pos)->stats.nr_samples == 0) continue;
ret += scnprintf(buf + ret, size - ret, "%s%s",
first ? "" : ", ", evsel__name(pos));
first = false;
}
if (!evsel->forced_leader)
ret += scnprintf(buf + ret, size - ret, " }");
if (opts->kernel_callchains)
attr->exclude_callchain_user = 1; if (opts->user_callchains)
attr->exclude_callchain_kernel = 1; if (param->record_mode == CALLCHAIN_LBR) { if (!opts->branch_stack) { if (attr->exclude_user) {
pr_warning("LBR callstack option is only available " "to get user callchain information. " "Falling back to framepointers.\n");
} else {
evsel__set_sample_bit(evsel, BRANCH_STACK);
attr->branch_sample_type = PERF_SAMPLE_BRANCH_USER |
PERF_SAMPLE_BRANCH_CALL_STACK |
PERF_SAMPLE_BRANCH_NO_CYCLES |
PERF_SAMPLE_BRANCH_NO_FLAGS |
PERF_SAMPLE_BRANCH_HW_INDEX;
}
} else
pr_warning("Cannot use LBR callstack with branch stack. " "Falling back to framepointers.\n");
}
if (param->record_mode == CALLCHAIN_DWARF) { if (!function) { constchar *arch = perf_env__arch(evsel__env(evsel));
evsel__set_sample_bit(evsel, REGS_USER);
evsel__set_sample_bit(evsel, STACK_USER); if (opts->sample_user_regs &&
DWARF_MINIMAL_REGS(arch) != arch__user_reg_mask()) {
attr->sample_regs_user |= DWARF_MINIMAL_REGS(arch);
pr_warning("WARNING: The use of --call-graph=dwarf may require all the user registers, " "specifying a subset with --user-regs may render DWARF unwinding unreliable, " "so the minimal registers set (IP, SP) is explicitly forced.\n");
} else {
attr->sample_regs_user |= arch__user_reg_mask();
}
attr->sample_stack_user = param->dump_size;
attr->exclude_callchain_user = 1;
} else {
pr_info("Cannot use DWARF unwind for function trace event," " falling back to framepointers.\n");
}
}
if (function) {
pr_info("Disabling user space callchains for function trace event.\n");
attr->exclude_callchain_user = 1;
}
}
list_for_each_entry(term, config_terms, list) { switch (term->type) { case EVSEL__CONFIG_TERM_PERIOD: if (!(term->weak && opts->user_interval != ULLONG_MAX)) {
attr->sample_period = term->val.period;
attr->freq = 0;
evsel__reset_sample_bit(evsel, PERIOD);
} break; case EVSEL__CONFIG_TERM_FREQ: if (!(term->weak && opts->user_freq != UINT_MAX)) {
attr->sample_freq = term->val.freq;
attr->freq = 1;
evsel__set_sample_bit(evsel, PERIOD);
} break; case EVSEL__CONFIG_TERM_TIME: if (term->val.time)
evsel__set_sample_bit(evsel, TIME); else
evsel__reset_sample_bit(evsel, TIME); break; case EVSEL__CONFIG_TERM_CALLGRAPH:
callgraph_buf = term->val.str; break; case EVSEL__CONFIG_TERM_BRANCH: if (term->val.str && strcmp(term->val.str, "no")) {
evsel__set_sample_bit(evsel, BRANCH_STACK);
parse_branch_str(term->val.str,
&attr->branch_sample_type);
} else
evsel__reset_sample_bit(evsel, BRANCH_STACK); break; case EVSEL__CONFIG_TERM_STACK_USER:
dump_size = term->val.stack_user; break; case EVSEL__CONFIG_TERM_MAX_STACK:
max_stack = term->val.max_stack; break; case EVSEL__CONFIG_TERM_MAX_EVENTS:
evsel->max_events = term->val.max_events; break; case EVSEL__CONFIG_TERM_INHERIT: /* * attr->inherit should has already been set by * evsel__config. If user explicitly set * inherit using config terms, override global * opt->no_inherit setting.
*/
attr->inherit = term->val.inherit ? 1 : 0; break; case EVSEL__CONFIG_TERM_OVERWRITE:
attr->write_backward = term->val.overwrite ? 1 : 0; break; case EVSEL__CONFIG_TERM_DRV_CFG: break; case EVSEL__CONFIG_TERM_PERCORE: break; case EVSEL__CONFIG_TERM_AUX_OUTPUT:
attr->aux_output = term->val.aux_output ? 1 : 0; break; case EVSEL__CONFIG_TERM_AUX_ACTION: /* Already applied by auxtrace */ break; case EVSEL__CONFIG_TERM_AUX_SAMPLE_SIZE: /* Already applied by auxtrace */ break; case EVSEL__CONFIG_TERM_CFG_CHG: break; default: break;
}
}
/* User explicitly set per-event callgraph, clear the old setting and reset. */ if ((callgraph_buf != NULL) || (dump_size > 0) || max_stack) { bool sample_address = false;
if (max_stack) {
param.max_stack = max_stack; if (callgraph_buf == NULL)
callgraph_buf = "fp";
}
/* parse callgraph parameters */ if (callgraph_buf != NULL) { if (!strcmp(callgraph_buf, "no")) {
param.enabled = false;
param.record_mode = CALLCHAIN_NONE;
} else {
param.enabled = true; if (parse_callchain_record(callgraph_buf, &param)) {
pr_err("per-event callgraph setting for %s failed. " "Apply callgraph global setting for it\n",
evsel->name); return;
} if (param.record_mode == CALLCHAIN_DWARF)
sample_address = true;
}
} if (dump_size > 0) {
dump_size = round_up(dump_size, sizeof(u64));
param.dump_size = dump_size;
}
/* If global callgraph set, clear it */ if (callchain_param.enabled)
evsel__reset_callgraph(evsel, &callchain_param);
/* set perf-event callgraph */ if (param.enabled) { if (sample_address) {
evsel__set_sample_bit(evsel, ADDR);
evsel__set_sample_bit(evsel, DATA_SRC);
evsel->core.attr.mmap_data = track;
}
evsel__config_callchain(evsel, opts, &param);
}
}
}
/*
 * The enable_on_exec/disabled value strategy:
 *
 *  1) For any type of traced program:
 *     - all independent events and group leaders are disabled
 *     - all group members are enabled
 *
 *     Group members are ruled by group leaders. They need to
 *     be enabled, because the group scheduling relies on that.
 *
 *  2) For traced programs executed by perf:
 *     - all independent events and group leaders have
 *       enable_on_exec set
 *     - we don't specifically enable or disable any event during
 *       the record command
 *
 *     Independent events and group leaders are initially disabled
 *     and get enabled by exec. Group members are ruled by group
 *     leaders as stated in 1).
 *
 *  3) For traced programs attached by perf (pid/tid):
 *     - we specifically enable or disable all events during
 *       the record command
 *
 *     When attaching events to already running traced programs we
 *     enable/disable events specifically, as there's no
 *     initial traced exec call.
*/ void evsel__config(struct evsel *evsel, struct record_opts *opts, struct callchain_param *callchain)
{ struct evsel *leader = evsel__leader(evsel); struct perf_event_attr *attr = &evsel->core.attr; int track = evsel->tracking; bool per_cpu = opts->target.default_per_cpu && !opts->target.per_thread;
if (evsel->sample_read) {
evsel__set_sample_bit(evsel, READ);
/* * We need ID even in case of single event, because * PERF_SAMPLE_READ process ID specific data.
*/
evsel__set_sample_id(evsel, false);
/* * Apply group format only if we belong to group * with more than one members.
*/ if (leader->core.nr_members > 1) {
attr->read_format |= PERF_FORMAT_GROUP;
}
/*
 * Inherit + SAMPLE_READ needs the SAMPLE_TID sample bit; PERF_FORMAT_ID
 * is also added to the read_format
*/ if (attr->inherit) {
evsel__set_sample_bit(evsel, TID);
evsel->core.attr.read_format |=
PERF_FORMAT_ID;
}
}
/* * We default some events to have a default interval. But keep * it a weak assumption overridable by the user.
*/ if ((evsel->is_libpfm_event && !attr->sample_period) ||
(!evsel->is_libpfm_event && (!attr->sample_period ||
opts->user_freq != UINT_MAX ||
opts->user_interval != ULLONG_MAX)))
evsel__set_default_freq_period(opts, attr);
/* * If attr->freq was set (here or earlier), ask for period * to be sampled.
*/ if (attr->freq)
evsel__set_sample_bit(evsel, PERIOD);
if (opts->sample_address) {
evsel__set_sample_bit(evsel, ADDR);
attr->mmap_data = track;
}
/* * We don't allow user space callchains for function trace * event, due to issues with page faults while tracing page * fault handler and its overall trickiness nature.
*/ if (evsel__is_function_event(evsel))
evsel->core.attr.exclude_callchain_user = 1;
if (callchain && callchain->enabled && !evsel->no_aux_samples)
evsel__config_callchain(evsel, opts, callchain);
if (target__has_cpu(&opts->target) || opts->sample_cpu)
evsel__set_sample_bit(evsel, CPU);
/* * When the user explicitly disabled time don't force it here.
*/ if (opts->sample_time &&
(!perf_missing_features.sample_id_all &&
(!opts->no_inherit || target__has_cpu(&opts->target) || per_cpu ||
opts->sample_time_set)))
evsel__set_sample_bit(evsel, TIME);
/* * ksymbol is tracked separately with text poke because it needs to be * system wide and enabled immediately.
*/ if (!opts->text_poke)
attr->ksymbol = track && !perf_missing_features.ksymbol;
attr->bpf_event = track && !opts->no_bpf_event && !perf_missing_features.bpf;
if (opts->record_namespaces)
attr->namespaces = track;
if (opts->sample_data_page_size)
evsel__set_sample_bit(evsel, DATA_PAGE_SIZE);
if (opts->sample_code_page_size)
evsel__set_sample_bit(evsel, CODE_PAGE_SIZE);
if (opts->record_switch_events)
attr->context_switch = track;
if (opts->sample_transaction)
evsel__set_sample_bit(evsel, TRANSACTION);
if (opts->running_time) {
evsel->core.attr.read_format |=
PERF_FORMAT_TOTAL_TIME_ENABLED |
PERF_FORMAT_TOTAL_TIME_RUNNING;
}
/* * XXX see the function comment above * * Disabling only independent events or group leaders, * keeping group members enabled.
*/ if (evsel__is_group_leader(evsel))
attr->disabled = 1;
/*
 * Setting enable_on_exec for independent events and
 * group leaders for traced programs executed by perf.
*/ if (target__none(&opts->target) && evsel__is_group_leader(evsel) &&
!opts->target.initial_delay)
attr->enable_on_exec = 1;
if (evsel->immediate) {
attr->disabled = 0;
attr->enable_on_exec = 0;
}
/* The --period option takes the precedence. */ if (opts->period_set) { if (opts->period)
evsel__set_sample_bit(evsel, PERIOD); else
evsel__reset_sample_bit(evsel, PERIOD);
}
/* * A dummy event never triggers any actual counter and therefore * cannot be used with branch_stack. * * For initial_delay, a dummy event is added implicitly. * The software event will trigger -EOPNOTSUPP error out, * if BRANCH_STACK bit is set.
*/ if (evsel__is_dummy_event(evsel))
evsel__reset_sample_bit(evsel, BRANCH_STACK);
if (evsel__is_offcpu_event(evsel)) {
evsel->core.attr.sample_type &= OFFCPU_SAMPLE_TYPES;
attr->inherit = 0;
}
/*
 * Forward the enable request for @cpu_map_idx to perf_evsel__enable_cpu()
 * on the embedded core evsel and hand back its result.
 *
 * Caller has to clear disabled after going through all CPUs.
 */
int evsel__enable_cpu(struct evsel *evsel, int cpu_map_idx)
{
	int ret = perf_evsel__enable_cpu(&evsel->core, cpu_map_idx);

	return ret;
}
/*
 * Enable the event through perf_evsel__enable() and, only when that
 * succeeds, clear the evsel's own disabled flag.
 *
 * Return: the value returned by perf_evsel__enable().
 */
int evsel__enable(struct evsel *evsel)
{
	int ret = perf_evsel__enable(&evsel->core);

	/* On failure leave the disabled flag untouched. */
	if (ret)
		return ret;

	evsel->disabled = false;
	return 0;
}
/*
 * Forward the disable request for @cpu_map_idx to perf_evsel__disable_cpu()
 * on the embedded core evsel and hand back its result.
 *
 * Caller has to set disabled after going through all CPUs.
 */
int evsel__disable_cpu(struct evsel *evsel, int cpu_map_idx)
{
	int ret = perf_evsel__disable_cpu(&evsel->core, cpu_map_idx);

	return ret;
}
int evsel__disable(struct evsel *evsel)
{ int err = perf_evsel__disable(&evsel->core); /* * We mark it disabled here so that tools that disable a event can * ignore events after they disable it. I.e. the ring buffer may have * already a few more events queued up before the kernel got the stop * request.
*/ if (!err)
evsel->disabled = true;
for_each_group_evsel(evsel, leader) { if (evsel__is_retire_lat(evsel))
nr--;
} return nr;
}
/*
 * evsel__group_read_size - size in bytes of the data read for @leader.
 * @leader: group leader whose read_format describes the layout
 *
 * When evsel__group_has_tpebs() reports false the size comes straight from
 * perf_evsel__read_size().  Otherwise it is computed here from the
 * read_format bits: a u64 each for TOTAL_TIME_ENABLED, TOTAL_TIME_RUNNING
 * and (with GROUP) the member count, plus one entry per member made of the
 * value and the optional ID and LOST fields.
 */
static u64 evsel__group_read_size(struct evsel *leader)
{
	const u64 fmt = leader->core.attr.read_format;
	int per_member = sizeof(u64);	/* the counter value itself */
	int header = 0;
	int members = 1;

	if (!evsel__group_has_tpebs(leader))
		return perf_evsel__read_size(&leader->core);

	/* Fields that appear once per read. */
	if (fmt & PERF_FORMAT_TOTAL_TIME_ENABLED)
		header += sizeof(u64);
	if (fmt & PERF_FORMAT_TOTAL_TIME_RUNNING)
		header += sizeof(u64);

	/* Fields that are repeated for every member. */
	if (fmt & PERF_FORMAT_ID)
		per_member += sizeof(u64);
	if (fmt & PERF_FORMAT_LOST)
		per_member += sizeof(u64);

	/* Group reads carry a leading count and one entry per member. */
	if (fmt & PERF_FORMAT_GROUP) {
		members = evsel__group_read_nr_members(leader);
		header += sizeof(u64);
	}

	header += per_member * members;
	return header;
}
staticint evsel__process_group_data(struct evsel *leader, int cpu_map_idx, int thread, u64 *data)
{
u64 read_format = leader->core.attr.read_format; struct sample_read_value *v;
u64 nr, ena = 0, run = 0, lost = 0;
nr = *data++;
if (nr != evsel__group_read_nr_members(leader)) return -EINVAL;
if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
ena = *data++;
if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
run = *data++;
if (e_type != type) { return type == PERF_TYPE_HARDWARE && evsel->pmu && evsel->pmu->is_core &&
evsel->alternate_hw_config == config;
}
if ((type == PERF_TYPE_HARDWARE || type == PERF_TYPE_HW_CACHE) &&
perf_pmus__supports_extended_type())
e_config &= PERF_HW_EVENT_MASK;
return e_config == config;
}
int evsel__read_counter(struct evsel *evsel, int cpu_map_idx, int thread)
{ if (evsel__is_tool(evsel)) return evsel__tool_pmu_read(evsel, cpu_map_idx, thread);
if (evsel__is_hwmon(evsel)) return evsel__hwmon_pmu_read(evsel, cpu_map_idx, thread);
if (evsel__is_drm(evsel)) return evsel__drm_pmu_read(evsel, cpu_map_idx, thread);
if (evsel__is_retire_lat(evsel)) return evsel__tpebs_read(evsel, cpu_map_idx, thread);
if (evsel->core.attr.read_format & PERF_FORMAT_GROUP) return evsel__read_group(evsel, cpu_map_idx, thread);
/* * Since fds for next evsel has not been created, * there is no need to iterate whole event list.
*/ if (pos == evsel) break;
} return 0;
}
/*
 * evsel__ignore_missing_thread - drop a vanished thread after a failed open.
 * @evsel: event being opened
 * @nr_cpus: passed through to update_fds()
 * @cpu_map_idx: passed through to update_fds()
 * @threads: thread map the failing thread belongs to
 * @thread: index into @threads of the thread that failed
 * @err: error reported for the failed open
 *
 * Return: true when the thread was removed from @threads and the failure
 * can be ignored; false when the failure must be reported (feature not
 * requested, system wide event, error other than -ESRCH, last remaining
 * thread, or one of the removal helpers returned non-zero).
 */
static bool evsel__ignore_missing_thread(struct evsel *evsel,
					 int nr_cpus, int cpu_map_idx,
					 struct perf_thread_map *threads,
					 int thread, int err)
{
	/* Fetch the pid now: the entry is removed from the map below. */
	pid_t ignore_pid = perf_thread_map__pid(threads, thread);

	if (!evsel->ignore_missing_thread)
		return false;

	/* The system wide setup does not work with threads. */
	if (evsel->core.system_wide)
		return false;

	/* The -ESRCH is perf event syscall errno for pid's not found. */
	if (err != -ESRCH)
		return false;

	/* If there's only one thread, let it fail. */
	if (threads->nr == 1)
		return false;

	/*
	 * We should remove fd for missing_thread first
	 * because thread_map__remove() will decrease threads->nr.
	 */
	if (update_fds(evsel, nr_cpus, cpu_map_idx, threads->nr, thread))
		return false;

	if (thread_map__remove(threads, thread))
		return false;

	pr_warning("WARNING: Ignored open failure for pid %d\n",
		   ignore_pid);
	return true;
}
/*
 * evsel__precise_ip_fallback - lower attr.precise_ip by one for a retry.
 * @evsel: event whose precise_ip should be relaxed
 *
 * Only acts when @evsel->precise_max is set.  The first call records the
 * starting precise_ip in precise_ip_original; once precise_ip has reached
 * zero the recorded value is restored and no further fallback is offered.
 *
 * Return: true when precise_ip was decremented, false when there is
 * nothing (more) to try.
 */
bool evsel__precise_ip_fallback(struct evsel *evsel)
{
	/* Do not try less precise if not requested. */
	if (!evsel->precise_max)
		return false;

	/*
	 * We tried all the precise_ip values, and it's
	 * still failing, so leave it to standard fallback.
	 */
	if (!evsel->core.attr.precise_ip) {
		evsel->core.attr.precise_ip = evsel->precise_ip_original;
		return false;
	}

	/* Remember where we started so it can be restored above. */
	if (!evsel->precise_ip_original)
		evsel->precise_ip_original = evsel->core.attr.precise_ip;

	evsel->core.attr.precise_ip--;
	pr_debug2_peo("decreasing precise_ip by one (%d)\n", evsel->core.attr.precise_ip);
	display_attr(&evsel->core.attr);
	return true;
}
if (pmu == NULL)
pmu = evsel->pmu = evsel__find_pmu(evsel);
if (pmu == NULL || pmu->missing_features.checked) goto out;
/*
 * Must probe features in the order they were added to the
 * perf_event_attr interface.  These are kernel core limitations, but
 * specific to PMUs with a branch stack, so we can detect them with the
 * given hardware event and stop at the first one that succeeds.
 */
/* Please add new feature detection here. */
attr.exclude_guest = 1; if (has_attr_feature(&attr, /*flags=*/0)) goto found;
pmu->missing_features.exclude_guest = true;
pr_debug2("switching off exclude_guest for PMU %s\n", pmu->name);
/*
 * Must probe features in the order they were added to the
 * perf_event_attr interface.  These are PMU specific limitations,
 * so we can detect them with the given hardware event and stop at the
 * first one that succeeds.
 */
/* Please add new feature detection here. */
attr.branch_sample_type = PERF_SAMPLE_BRANCH_COUNTERS; if (has_attr_feature(&attr, /*flags=*/0)) goto found;
perf_missing_features.branch_counters = true;
pr_debug2("switching off branch counters support\n");
attr.branch_sample_type = PERF_SAMPLE_BRANCH_HW_INDEX; if (has_attr_feature(&attr, /*flags=*/0)) goto found;
perf_missing_features.branch_hw_idx = true;
pr_debug2("switching off branch HW index support\n");
attr.branch_sample_type = PERF_SAMPLE_BRANCH_NO_CYCLES | PERF_SAMPLE_BRANCH_NO_FLAGS; if (has_attr_feature(&attr, /*flags=*/0)) goto found;
perf_missing_features.lbr_flags = true;
pr_debug2_peo("switching off branch sample type no (cycles/flags)\n");
if (__has_attr_feature(&attr, cpu, /*flags=*/0)) {
errno = old_errno; returntrue;
}
/* * EOPNOTSUPP means the kernel supports the feature but the PMU does * not, so keep that distinction if possible.
*/ if (errno != EOPNOTSUPP)
errno = old_errno;
/* * Don't bother probing aux_action if it is not being used or has been * probed before.
*/ if (!evsel->core.attr.aux_action || detection_done) return;
detection_done = true;
/* * The leader is an AUX area event. If it has failed, assume the feature * is not supported.
*/
leader = evsel__leader(evsel); if (evsel == leader) {
perf_missing_features.aux_action = true; return;
}
/* * AUX area event with aux_action must have been opened successfully * already, so feature is supported.
*/ if (leader->core.attr.aux_action) return;
if (!evsel__probe_aux_action(leader, cpu))
perf_missing_features.aux_action = true;
}
if (evsel__has_br_stack(evsel))
evsel__detect_missing_brstack_features(evsel);
if (detection_done) goto check;
old_errno = errno;
/*
 * Must probe features in the order they were added to the
 * perf_event_attr interface.  These are kernel core limitations,
 * not PMU-specific, so we can detect them with a software event and
 * stop at the first one that succeeds.
 */
/* Please add new feature detection here. */
attr.inherit = true;
attr.sample_type = PERF_SAMPLE_READ; if (has_attr_feature(&attr, /*flags=*/0)) goto found;
perf_missing_features.inherit_sample_read = true;
pr_debug2("Using PERF_SAMPLE_READ / :S modifier is not compatible with inherit, falling back to no-inherit.\n");
attr.inherit = false;
attr.sample_type = 0;
attr.read_format = PERF_FORMAT_LOST; if (has_attr_feature(&attr, /*flags=*/0)) goto found;
perf_missing_features.read_lost = true;
pr_debug2("switching off PERF_FORMAT_LOST support\n");
attr.read_format = 0;
attr.sample_type = PERF_SAMPLE_WEIGHT_STRUCT; if (has_attr_feature(&attr, /*flags=*/0)) goto found;
perf_missing_features.weight_struct = true;
pr_debug2("switching off weight struct support\n");
attr.sample_type = 0;
attr.sample_type = PERF_SAMPLE_CODE_PAGE_SIZE; if (has_attr_feature(&attr, /*flags=*/0)) goto found;
perf_missing_features.code_page_size = true;
pr_debug2_peo("Kernel has no PERF_SAMPLE_CODE_PAGE_SIZE support\n");
attr.sample_type = 0;
attr.sample_type = PERF_SAMPLE_DATA_PAGE_SIZE; if (has_attr_feature(&attr, /*flags=*/0)) goto found;
perf_missing_features.data_page_size = true;
pr_debug2_peo("Kernel has no PERF_SAMPLE_DATA_PAGE_SIZE support\n");
attr.sample_type = 0;
attr.cgroup = 1; if (has_attr_feature(&attr, /*flags=*/0)) goto found;
perf_missing_features.cgroup = true;
pr_debug2_peo("Kernel has no cgroup sampling support\n");
attr.cgroup = 0;
attr.aux_output = 1; if (has_attr_feature(&attr, /*flags=*/0)) goto found;
perf_missing_features.aux_output = true;
pr_debug2_peo("Kernel has no attr.aux_output support\n");
attr.aux_output = 0;
attr.bpf_event = 1; if (has_attr_feature(&attr, /*flags=*/0)) goto found;
perf_missing_features.bpf = true;
pr_debug2_peo("switching off bpf_event\n");
attr.bpf_event = 0;
attr.ksymbol = 1; if (has_attr_feature(&attr, /*flags=*/0)) goto found;
perf_missing_features.ksymbol = true;
pr_debug2_peo("switching off ksymbol\n");
attr.ksymbol = 0;
attr.write_backward = 1; if (has_attr_feature(&attr, /*flags=*/0)) goto found;
perf_missing_features.write_backward = true;
pr_debug2_peo("switching off write_backward\n");
attr.write_backward = 0;
/* * If we succeeded but had to kill clockid, fail and * have evsel__open_strerror() print us a nice error.
*/ if (perf_missing_features.clockid ||
perf_missing_features.clockid_wrong) {
err = -EINVAL; goto out_close;
}
}
}
return 0;
try_fallback: if (evsel__ignore_missing_thread(evsel, perf_cpu_map__nr(cpus),
idx, threads, thread, err)) { /* We just removed 1 thread, so lower the upper nthreads limit. */
nthreads--;
/* ... and pretend like nothing has happened. */
err = 0; goto retry_open;
} /* * perf stat needs between 5 and 22 fds per CPU. When we run out * of them try to increase the limits.
*/ if (err == -EMFILE && rlimit__increase_nofile(&set_rlimit)) goto retry_open;
if (err == -EINVAL && evsel__detect_missing_features(evsel, cpu)) goto fallback_missing_features;
if (evsel__precise_ip_fallback(evsel)) goto retry_open;
out_close: if (err)
threads->err_thread = thread;
old_errno = errno; do { while (--thread >= 0) { if (FD(evsel, idx, thread) >= 0)
close(FD(evsel, idx, thread));
FD(evsel, idx, thread) = -1;
}
thread = nthreads;
} while (--idx >= 0);
errno = old_errno; return err;
}
staticint
perf_event__check_size(union perf_event *event, unsignedint sample_size)
{ /* * The evsel's sample_size is based on PERF_SAMPLE_MASK which includes * up to PERF_SAMPLE_PERIOD. After that overflow() must be used to * check the format does not go past the end of the event.
*/ if (sample_size + sizeof(event->header) > event->header.size) return -EFAULT;
if (type & PERF_SAMPLE_RAW) {
OVERFLOW_CHECK_u64(array);
u.val64 = *array;
/* * Undo swap of u64, then swap on individual u32s, * get the size of the raw area and undo all of the * swap. The pevent interface handles endianness by * itself.
*/ if (swapped) {
u.val64 = bswap_64(u.val64);
u.val32[0] = bswap_32(u.val32[0]);
u.val32[1] = bswap_32(u.val32[1]);
}
data->raw_size = u.val32[0];
/* * The raw data is aligned on 64bits including the * u32 size, so it's safe to use mem_bswap_64.
*/ if (swapped)
mem_bswap_64((void *) array, data->raw_size);
if (data->branch_stack->nr > max_branch_nr) return -EFAULT;
sz = data->branch_stack->nr * sizeof(struct branch_entry); if (evsel__has_branch_hw_idx(evsel)) {
sz += sizeof(u64);
e = &data->branch_stack->entries[0];
} else {
data->no_hw_idx = true; /* * if the PERF_SAMPLE_BRANCH_HW_INDEX is not applied, * only nr and entries[] will be output by kernel.
*/
e = (struct branch_entry *)&data->branch_stack->hw_idx;
}
if (swapped) { /* * struct branch_flag does not have endian * specific bit field definition. And bswap * will not resolve the issue, since these * are bit fields. * * evsel__bitfield_swap_branch_flags() uses a * bitfield_swap macro to swap the bit position * based on the host endians.
*/ for (i = 0; i < data->branch_stack->nr; i++, e++)
e->flags.value = evsel__bitfield_swap_branch_flags(e->flags.value);
}
if (!states || field != prev_state_field) {
states = parse_task_states(field); if (!states) return state;
prev_state_field = field;
}
/*
 * Note since the kernel exposes TASK_REPORT_MAX to userspace
 * to denote the 'preempted' state, we might as well report
 * 'R' for this case, which makes sense to users as well.
 *
 * We can change this if we have a good reason in the future.
 */
val = evsel__intval(evsel, sample, name);
bit = val ? ffs(val) : 0;
state = (!bit || bit > strlen(states)) ? 'R' : states[bit-1]; return state;
} #endif
bool evsel__fallback(struct evsel *evsel, struct target *target, int err, char *msg, size_t msgsize)
{ int paranoid;
if ((err == ENOENT || err == ENXIO || err == ENODEV) &&
evsel->core.attr.type == PERF_TYPE_HARDWARE &&
evsel->core.attr.config == PERF_COUNT_HW_CPU_CYCLES) { /* * If it's cycles then fall back to hrtimer based cpu-clock sw * counter, which is always available even if no PMU support. * * PPC returns ENXIO until 2.6.37 (behavior changed with commit * b0a873e).
*/
evsel->core.attr.type = PERF_TYPE_SOFTWARE;
evsel->core.attr.config = target__has_cpu(target)
? PERF_COUNT_SW_CPU_CLOCK
: PERF_COUNT_SW_TASK_CLOCK;
scnprintf(msg, msgsize, "The cycles event is not supported, trying to fall back to %s",
target__has_cpu(target) ? "cpu-clock" : "task-clock");
/* Is there already the separator in the name. */ if (strchr(name, '/') ||
(strchr(name, ':') && !evsel->is_libpfm_event))
sep = "";
if (asprintf(&new_name, "%s%sH", name, sep) < 0) goto no_fallback;
free(evsel->name);
evsel->name = new_name; /* Apple M1 requires exclude_guest */
scnprintf(msg, msgsize, "Trying to fall back to excluding guest samples");
evsel->core.attr.exclude_guest = 1;
returntrue;
}
no_fallback:
scnprintf(msg, msgsize, "No fallback found for '%s' for error %d",
evsel__name(evsel), err); returnfalse;
}
staticbool find_process(constchar *name)
{
size_t len = strlen(name);
DIR *dir; struct dirent *d; int ret = -1;
dir = opendir(procfs__mountpoint()); if (!dir) returnfalse;
/* Walk through the directory. */ while (ret && (d = readdir(dir)) != NULL) { char path[PATH_MAX]; char *data;
size_t size;
if (filename__read_str(path, &data, &size)) continue;
ret = strncmp(name, data, len);
free(data);
}
closedir(dir); return ret ? false : true;
}
staticint dump_perf_event_processes(char *msg, size_t size)
{
DIR *proc_dir; struct dirent *proc_entry; int printed = 0;
proc_dir = opendir(procfs__mountpoint()); if (!proc_dir) return 0;
/* Walk through the /proc directory. */ while ((proc_entry = readdir(proc_dir)) != NULL) { char buf[256];
DIR *fd_dir; struct dirent *fd_entry; int fd_dir_fd;
if (fd_entry->d_type != DT_LNK) continue;
link_size = readlinkat(fd_dir_fd, fd_entry->d_name, buf, sizeof(buf)); if (link_size < 0) continue; /* Take care as readlink doesn't null terminate the string. */ if (!strncmp(buf, "anon_inode:[perf_event]", link_size)) { int cmdline_fd;
ssize_t cmdline_size;
scnprintf(buf, sizeof(buf), "%s/cmdline", proc_entry->d_name);
cmdline_fd = openat(dirfd(proc_dir), buf, O_RDONLY); if (cmdline_fd == -1) continue;
cmdline_size = read(cmdline_fd, buf, sizeof(buf) - 1);
close(cmdline_fd); if (cmdline_size < 0) continue;
buf[cmdline_size] = '\0'; for (ssize_t i = 0; i < cmdline_size; i++) { if (buf[i] == '\0')
buf[i] = ' ';
}
if (printed == 0)
printed += scnprintf(msg, size, "Possible processes:\n");
int evsel__open_strerror(struct evsel *evsel, struct target *target, int err, char *msg, size_t size)
{ char sbuf[STRERR_BUFSIZE]; int printed = 0, enforced = 0; int ret;
switch (err) { case EPERM: case EACCES:
printed += scnprintf(msg + printed, size - printed, "Access to performance monitoring and observability operations is limited.\n");
if (!sysfs__read_int("fs/selinux/enforce", &enforced)) { if (enforced) {
printed += scnprintf(msg + printed, size - printed, "Enforced MAC policy settings (SELinux) can limit access to performance\n" "monitoring and observability operations. Inspect system audit records for\n" "more perf_event access control information and adjusting the policy.\n");
}
}
if (err == EPERM)
printed += scnprintf(msg, size, "No permission to enable %s event.\n\n", evsel__name(evsel));
return printed + scnprintf(msg + printed, size - printed, "Consider adjusting /proc/sys/kernel/perf_event_paranoid setting to open\n" "access to performance monitoring and observability operations for processes\n" "without CAP_PERFMON, CAP_SYS_PTRACE or CAP_SYS_ADMIN Linux capability.\n" "More information can be found at 'Perf events and tool security' document:\n" "https://www.kernel.org/doc/html/latest/admin-guide/perf-security.html\n" "perf_event_paranoid setting is %d:\n" " -1: Allow use of (almost) all events by all users\n" " Ignore mlock limit after perf_event_mlock_kb without CAP_IPC_LOCK\n" ">= 0: Disallow raw and ftrace function tracepoint access\n" ">= 1: Disallow CPU event access\n" ">= 2: Disallow kernel profiling\n" "To make the adjusted perf_event_paranoid setting permanent preserve it\n" "in /etc/sysctl.conf (e.g. kernel.perf_event_paranoid = <setting>)",
perf_event_paranoid()); case ENOENT: return scnprintf(msg, size, "The %s event is not supported.", evsel__name(evsel)); case EMFILE: return scnprintf(msg, size, "%s", "Too many events are opened.\n" "Probably the maximum number of open file descriptors has been reached.\n" "Hint: Try again after reducing the number of events.\n" "Hint: Try increasing the limit with 'ulimit -n <limit>'"); case ENOMEM: if (evsel__has_callchain(evsel) &&
access("/proc/sys/kernel/perf_event_max_stack", F_OK) == 0) return scnprintf(msg, size, "Not enough memory to setup event with callchain.\n" "Hint: Try tweaking /proc/sys/kernel/perf_event_max_stack\n" "Hint: Current value: %d", sysctl__max_stack()); break; case ENODEV: if (target->cpu_list) return scnprintf(msg, size, "%s", "No such device - did you specify an out-of-range profile CPU?"); break; case EOPNOTSUPP: if (evsel->core.attr.sample_type & PERF_SAMPLE_BRANCH_STACK) return scnprintf(msg, size, "%s: PMU Hardware or event type doesn't support branch stack sampling.",
evsel__name(evsel)); if (evsel->core.attr.aux_output) return scnprintf(msg, size, "%s: PMU Hardware doesn't support 'aux_output' feature",
evsel__name(evsel)); if (evsel->core.attr.aux_action) return scnprintf(msg, size, "%s: PMU Hardware doesn't support 'aux_action' feature",
evsel__name(evsel)); if (evsel->core.attr.sample_period != 0) return scnprintf(msg, size, "%s: PMU Hardware doesn't support sampling/overflow-interrupts. Try 'perf stat'",
evsel__name(evsel)); if (evsel->core.attr.precise_ip) return scnprintf(msg, size, "%s", "\'precise\' request may not be supported. Try removing 'p' modifier."); #ifdefined(__i386__) || defined(__x86_64__) if (evsel->core.attr.type == PERF_TYPE_HARDWARE) return scnprintf(msg, size, "%s", "No hardware sampling interrupt available.\n"); #endif if (!target__has_cpu(target)) return scnprintf(msg, size, "Unsupported event (%s) in per-thread mode, enable system wide with '-a'.",
evsel__name(evsel)); break; case EBUSY: if (find_process("oprofiled")) return scnprintf(msg, size, "The PMU counters are busy/taken by another profiler.\n" "We found oprofile daemon running, please stop it and try again.");
printed += scnprintf(
msg, size, "The PMU %s counters are busy and in use by another process.\n",
evsel->pmu ? evsel->pmu->name : ""); return printed + dump_perf_event_processes(msg + printed, size - printed); break; case EINVAL: if (evsel->core.attr.sample_type & PERF_SAMPLE_CODE_PAGE_SIZE && perf_missing_features.code_page_size) return scnprintf(msg, size, "Asking for the code page size isn't supported by this kernel."); if (evsel->core.attr.sample_type & PERF_SAMPLE_DATA_PAGE_SIZE && perf_missing_features.data_page_size) return scnprintf(msg, size, "Asking for the data page size isn't supported by this kernel."); if (evsel->core.attr.write_backward && perf_missing_features.write_backward) return scnprintf(msg, size, "Reading from overwrite event is not supported by this kernel."); if (perf_missing_features.clockid) return scnprintf(msg, size, "clockid feature not supported."); if (perf_missing_features.clockid_wrong) return scnprintf(msg, size, "wrong clockid (%d).", clockid); if (perf_missing_features.aux_action) return scnprintf(msg, size, "The 'aux_action' feature is not supported, update the kernel."); if (perf_missing_features.aux_output) return scnprintf(msg, size, "The 'aux_output' feature is not supported, update the kernel."); if (!target__has_cpu(target)) return scnprintf(msg, size, "Invalid event (%s) in per-thread mode, enable system wide with '-a'.",
evsel__name(evsel));
break; case ENODATA: return scnprintf(msg, size, "Cannot collect data source with the load latency event alone. " "Please add an auxiliary event in front of the load latency event."); default: break;
}
ret = arch_evsel__open_strerror(evsel, msg, size); if (ret) return ret;
return scnprintf(msg, size, "The sys_perf_event_open() syscall returned with %d (%s) for event (%s).\n" "\"dmesg | grep -i perf\" may provide additional information.\n",
err, str_error_r(err, sbuf, sizeof(sbuf)), evsel__name(evsel));
}
if (evsel->per_pkg_mask) {
hashmap__for_each_entry(evsel->per_pkg_mask, cur, bkt)
zfree(&cur->pkey);
hashmap__clear(evsel->per_pkg_mask);
}
}
/** * evsel__is_hybrid - does the evsel have a known PMU that is hybrid. Note, this * will be false on hybrid systems for hardware and legacy * cache events.
*/ bool evsel__is_hybrid(conststruct evsel *evsel)
{ if (!evsel->core.is_pmu_core) returnfalse;
/*
 * Detach @evsel from the group headed by @leader.
 *
 * Nothing happens when @evsel is the leader itself or when
 * arch_evsel__must_be_in_group() reports that the event has to stay grouped
 * (e.g. perf metrics Topdown events).  Otherwise @evsel becomes its own
 * leader with no members and the old leader's member count drops by one.
 */
void evsel__remove_from_group(struct evsel *evsel, struct evsel *leader)
{
	/* Events that must stay grouped, and the leader itself, are left alone. */
	if (arch_evsel__must_be_in_group(evsel) || evsel == leader)
		return;

	evsel__set_leader(evsel, evsel);
	evsel->core.nr_members = 0;
	leader->core.nr_members--;
}
if (counter->pmu && counter->pmu->is_core &&
counter->alternate_hw_config != PERF_COUNT_HW_MAX) { /* A sysfs or json event replacing a legacy event, don't uniquify. */ returnfalse;
}
if (config->aggr_mode == AGGR_NONE) { /* Always unique with no aggregation. */
counter->needs_uniquify = true; returntrue;
}
if (counter->first_wildcard_match != NULL) { /* * If stats are merged then only the first_wildcard_match is * displayed, there is no need to uniquify this evsel as the * name won't be shown.
*/ returnfalse;
}
/* * Do other non-merged events in the evlist have the same name? If so * uniquify is necessary.
*/
evlist__for_each_entry(counter->evlist, evsel) { if (evsel == counter || evsel->first_wildcard_match || evsel->pmu == counter->pmu) continue;
/* No uniquification necessary. */ if (!counter->needs_uniquify) return;
/* The evsel was already uniquified. */ if (counter->uniquified_name) return;
/* Avoid checking to uniquify twice. */
counter->uniquified_name = true;
name = evsel__name(counter);
config = strchr(name, '/');
pmu_name = counter->pmu->name;
/* Already prefixed by the PMU name? */
len = pmu_name_len_no_suffix(pmu_name);
if (!strncmp(name, pmu_name, len)) { /* * If the PMU name is there, then there is no sense in not * having a slash. Do this for robustness.
*/ if (config == NULL)
config = name - 1;
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereitgestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.