// SPDX-License-Identifier: GPL-2.0
/*
 * builtin-record.c
 *
 * Builtin record command: Record the profile of a workload
 * (or a CPU, or a PID) into the perf.data output file - for
 * later analysis via perf report.
 */
#include "builtin.h"
	aio_errno = aio_error(cblock);
	if (aio_errno == EINPROGRESS)
		return 0;

	written = aio_ret = aio_return(cblock);
	if (aio_ret < 0) {
		if (aio_errno != EINTR)
			pr_err("failed to write perf data, error: %m\n");
		written = 0;
	}

	rem_size = cblock->aio_nbytes - written;

	if (rem_size == 0) {
		cblock->aio_fildes = -1;
		/*
		 * md->refcount is incremented in record__aio_pushfn() for
		 * every aio write request started in record__aio_push() so
		 * decrement it because the request is now complete.
		 */
		perf_mmap__put(&md->core);
		rc = 1;
	} else {
		/*
		 * The aio write request may require a restart with the
		 * remainder if the kernel didn't write the whole
		 * chunk at once.
		 */
		rem_off = cblock->aio_offset + written;
		rem_buf = (void *)(cblock->aio_buf + written);
		record__aio_write(cblock, cblock->aio_fildes,
				  rem_buf, rem_size, rem_off);
		rc = 0;
	}
	do {
		do_suspend = 0;
		for (i = 0; i < md->aio.nr_cblocks; ++i) {
			if (cblocks[i].aio_fildes == -1 ||
			    record__aio_complete(md, &cblocks[i])) {
				if (sync_all)
					aiocb[i] = NULL;
				else
					return i;
			} else {
				/*
				 * The started aio write is not complete yet,
				 * so it has to be waited for before the
				 * next allocation.
				 */
				aiocb[i] = &cblocks[i];
				do_suspend = 1;
			}
		}
		if (!do_suspend)
			return -1;

		while (aio_suspend((const struct aiocb **)aiocb, md->aio.nr_cblocks, &timeout)) {
			if (!(errno == EAGAIN || errno == EINTR))
				pr_err("failed to sync perf data, error: %m\n");
		}
	} while (1);
}
	/*
	 * map->core.base data pointed to by buf is copied into a free
	 * map->aio.data[] buffer to release space in the kernel buffer as
	 * fast as possible, calling perf_mmap__consume() from the
	 * perf_mmap__push() function.
	 *
	 * That lets the kernel proceed with storing more profiling data into
	 * the kernel buffer earlier than other per-cpu kernel buffers are
	 * handled.
	 *
	 * Copying can be done in two steps in case the chunk of profiling
	 * data crosses the upper bound of the kernel buffer. In this case
	 * we first move part of the data from map->start till the upper
	 * bound and then the remainder from the beginning of the kernel
	 * buffer till the end of the data chunk.
	 */

	if (!aio->size) {
		/*
		 * Increment map->refcount to guard the map->aio.data[] buffer
		 * from premature deallocation, because the map object can be
		 * released earlier than the aio write request started on the
		 * map->aio.data[] buffer is complete.
		 *
		 * perf_mmap__put() is done at record__aio_complete() after
		 * the started aio request completes, or at record__aio_push()
		 * if the request failed to start.
		 */
		perf_mmap__get(&map->core);
	}

	aio->size += size;

	return size;
}
static int record__aio_push(struct record *rec, struct mmap *map, off_t *off)
{
	int ret, idx;
	int trace_fd = rec->session->data->file.fd;
	struct record_aio aio = { .rec = rec, .size = 0 };

	/*
	 * Call record__aio_sync() to wait till a map->aio.data[] buffer
	 * becomes available after the previous aio write operation.
	 */
	idx = record__aio_sync(map, false);
	aio.data = map->aio.data[idx];
	ret = perf_mmap__push(map, &aio, record__aio_pushfn);
	if (ret != 0) /* ret > 0 - no data, ret < 0 - error */
		return ret;

	rec->samples++;
	ret = record__aio_write(&(map->aio.cblocks[idx]), trace_fd, aio.data, aio.size, *off);
	if (!ret) {
		*off += aio.size;
		rec->bytes_written += aio.size;
		if (switch_output_size(rec))
			trigger_hit(&switch_output_trigger);
	} else {
		/*
		 * Decrement the map->refcount incremented in record__aio_pushfn()
		 * if the record__aio_write() operation failed to start; otherwise
		 * map->refcount is decremented in record__aio_complete() after
		 * the aio write operation finishes successfully.
		 */
		perf_mmap__put(&map->core);
	}
	/*
	 * The record from `zstd_compress` is not 8-byte aligned, which would
	 * cause an asan error. We make it aligned here.
	 */
	event->data_size = compressed - sizeof(struct perf_record_compressed2);
	event->header.size = PERF_ALIGN(compressed, sizeof(u64));
	padding = event->header.size - compressed;
	return record__write(rec, map, bf, compressed) ||
	       record__write(rec, map, &pad, padding);
}
	done = 1;
#ifdef HAVE_EVENTFD_SUPPORT
	if (done_fd >= 0) {
		u64 tmp = 1;
		int orig_errno = errno;

		/*
		 * It is possible for this signal handler to run after done is
		 * checked in the main loop, but before the perf counter fds are
		 * polled. If this happens, the poll() will continue to wait
		 * even though done is set, and will only break out if either
		 * another signal is received, or the counters are ready for
		 * read. To ensure the poll() doesn't sleep when done is set,
		 * use an eventfd (done_fd) to wake up the poll().
		 */
		if (write(done_fd, &tmp, sizeof(tmp)) < 0)
			pr_err("failed to signal wakeup fd, error: %m\n");
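/*
 * Read an AUX area tracing snapshot from all mapped AUX buffers and update
 * the snapshot trigger state: error on failure, ready again on success.
 */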
static void record__read_auxtrace_snapshot(struct record *rec, bool on_exit)
{
	pr_debug("Recording AUX area tracing snapshot\n");
	if (record__auxtrace_read_snapshot_all(rec) < 0) {
		trigger_error(&auxtrace_snapshot_trigger);
	} else {
		if (auxtrace_record__snapshot_finish(rec->itr, on_exit))
			trigger_error(&auxtrace_snapshot_trigger);
		else
			trigger_ready(&auxtrace_snapshot_trigger);
	}
}
static int record__auxtrace_snapshot_exit(struct record *rec)
{
	if (trigger_is_error(&auxtrace_snapshot_trigger))
		return 0;

	if (!auxtrace_record__snapshot_started &&
	    auxtrace_record__snapshot_start(rec->itr))
		return -1;

	record__read_auxtrace_snapshot(rec, true);
	if (trigger_is_error(&auxtrace_snapshot_trigger))
		return -1;

	return 0;
}
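/*
 * Set up AUX area tracing: reject AUX options in parallel streaming mode,
 * create the auxtrace record state if needed, and parse the snapshot,
 * sample, aux-action and filter options.
 */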
static int record__auxtrace_init(struct record *rec)
{
	int err;

	if ((rec->opts.auxtrace_snapshot_opts || rec->opts.auxtrace_sample_opts)
	    && record__threads_enabled(rec)) {
		pr_err("AUX area tracing options are not available in parallel streaming mode.\n");
		return -EINVAL;
	}

	if (!rec->itr) {
		rec->itr = auxtrace_record__init(rec->evlist, &err);
		if (err)
			return err;
	}

	err = auxtrace_parse_snapshot_options(rec->itr, &rec->opts,
					      rec->opts.auxtrace_snapshot_opts);
	if (err)
		return err;

	err = auxtrace_parse_sample_options(rec->itr, rec->evlist, &rec->opts,
					    rec->opts.auxtrace_sample_opts);
	if (err)
		return err;

	err = auxtrace_parse_aux_action(rec->evlist);
	if (err)
		return err;

	return auxtrace_parse_filters(rec->evlist);
}
#else

static inline int record__auxtrace_mmap_read(struct record *rec __maybe_unused,
					     struct mmap *map __maybe_unused)
{
	return 0;
}
	/*
	 * If a non-dummy evsel exists, system_wide sideband is needed to
	 * help parse sample information.
	 * For example, the PERF_EVENT_MMAP event helps parse symbols, and
	 * the PERF_EVENT_COMM event helps parse the task executable name.
	 */
	evlist__for_each_entry(evlist, evsel) {
		if (!evsel__is_dummy_event(evsel))
			return true;
	}
	/*
	 * For initial_delay, system wide or a hybrid system, we need to add
	 * a tracking event so that we can track PERF_RECORD_MMAP to cover
	 * the delay of waiting or event synthesis.
	 */
	if (opts->target.initial_delay || target__has_cpu(&opts->target) ||
	    perf_pmus__num_core_pmus() > 1) {

		/*
		 * User space tasks can migrate between CPUs, so when tracing
		 * selected CPUs, sideband for all CPUs is still needed.
		 */
		if (!!opts->target.cpu_list && record__tracking_system_wide(rec))
			system_wide = true;

		evsel = evlist__findnew_tracking_event(evlist, system_wide);
		if (!evsel)
			return -ENOMEM;

		/*
		 * Enable the tracking event when the process is forked for
		 * initial_delay, immediately for system wide.
		 */
		if (opts->target.initial_delay && !evsel->immediate &&
		    !target__has_cpu(&opts->target))
			evsel->core.attr.enable_on_exec = 1;
		else
			evsel->immediate = 1;
	}

	return 0;
}
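/* Check whether this machine's kcore can be opened for reading. */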
static bool record__kcore_readable(struct machine *machine)
{
	char kcore[PATH_MAX];
	int fd;
static void record__free_thread_data(struct record *rec)
{
	int t;
	struct record_thread *thread_data = rec->thread_data;

	if (thread_data == NULL)
		return;
for (t = 0; t < rec->nr_threads; t++) {
record__thread_data_close_pipes(&thread_data[t]);
zfree(&thread_data[t].maps);
zfree(&thread_data[t].overwrite_maps);
fdarray__exit(&thread_data[t].pollfd);
}
zfree(&rec->thread_data);
}
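/*
 * Record the correspondence between a descriptor's position in the evlist
 * pollfd array and its position in the main thread's pollfd array, so that
 * revents observed by the thread can be propagated back to the evlist.
 */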
static int record__map_thread_evlist_pollfd_indexes(struct record *rec,
						    int evlist_pollfd_index,
						    int thread_pollfd_index)
{
	size_t x = rec->index_map_cnt;

	for (i = 0; i < rec->index_map_cnt; i++) {
		int e_pos = rec->index_map[i].evlist_pollfd_index;
		int t_pos = rec->index_map[i].thread_pollfd_index;

		if (e_entries[e_pos].fd != t_entries[t_pos].fd ||
		    e_entries[e_pos].events != t_entries[t_pos].events) {
			pr_err("Thread and evlist pollfd index mismatch\n");
			err = -EINVAL;
			continue;
		}

		e_entries[e_pos].revents = t_entries[t_pos].revents;
	}

	return err;
}
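/*
 * Duplicate the non-perf-event descriptors from the evlist pollfd array into
 * the main thread's pollfd array and record the index mapping between them.
 */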
static int record__dup_non_perf_events(struct record *rec,
				       struct evlist *evlist,
				       struct record_thread *thread_data)
{
	struct fdarray *fda = &evlist->core.pollfd;
	int i, ret;

	for (i = 0; i < fda->nr; i++) {
		if (!(fda->priv[i].flags & fdarray_flag__non_perf_event))
			continue;
		ret = fdarray__dup_entry_from(&thread_data->pollfd, i, fda);
		if (ret < 0) {
			pr_err("Failed to duplicate descriptor in main thread pollfd\n");
			return ret;
		}
		pr_debug2("thread_data[%p]: pollfd[%d] <- non_perf_event fd=%d\n",
			  thread_data, ret, fda->entries[i].fd);
		ret = record__map_thread_evlist_pollfd_indexes(rec, i, ret);
		if (ret < 0) {
			pr_err("Failed to map thread and evlist pollfd indexes\n");
			return ret;
		}
	}
	return 0;
}
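/*
 * Allocate and initialize the per-thread record data: thread masks, mmaps,
 * pollfd arrays and, for worker threads, the message pipes used to
 * communicate with the main thread.
 */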
static int record__alloc_thread_data(struct record *rec, struct evlist *evlist)
{
	int t, ret;
	struct record_thread *thread_data;

	rec->thread_data = zalloc(rec->nr_threads * sizeof(*(rec->thread_data)));
	if (!rec->thread_data) {
		pr_err("Failed to allocate thread data\n");
		return -ENOMEM;
}
thread_data = rec->thread_data;
for (t = 0; t < rec->nr_threads; t++)
record__thread_data_init_pipes(&thread_data[t]);
for (t = 0; t < rec->nr_threads; t++) {
thread_data[t].rec = rec;
thread_data[t].mask = &rec->thread_masks[t];
		ret = record__thread_data_init_maps(&thread_data[t], evlist);
		if (ret) {
			pr_err("Failed to initialize thread[%d] maps\n", t);
			goto out_free;
		}
		ret = record__thread_data_init_pollfd(&thread_data[t], evlist);
		if (ret) {
			pr_err("Failed to initialize thread[%d] pollfd\n", t);
			goto out_free;
		}
		if (t) {
			thread_data[t].tid = -1;
			ret = record__thread_data_open_pipes(&thread_data[t]);
			if (ret) {
				pr_err("Failed to open thread[%d] communication pipes\n", t);
				goto out_free;
			}
			ret = fdarray__add(&thread_data[t].pollfd, thread_data[t].pipes.msg[0],
					   POLLIN | POLLERR | POLLHUP, fdarray_flag__nonfilterable);
			if (ret < 0) {
				pr_err("Failed to add descriptor to thread[%d] pollfd\n", t);
				goto out_free;
			}
			thread_data[t].ctlfd_pos = ret;
			pr_debug2("thread_data[%p]: pollfd[%d] <- ctl_fd=%d\n",
				  thread_data, thread_data[t].ctlfd_pos,
				  thread_data[t].pipes.msg[0]);
		} else {
			thread_data[t].tid = gettid();
			ret = record__dup_non_perf_events(rec, evlist, &thread_data[t]);
			if (ret < 0)
				goto out_free;
			thread_data[t].ctlfd_pos = -1; /* Not used */
		}
}
return 0;
out_free:
record__free_thread_data(rec);
return ret;
}
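/*
 * Memory map the evlist ring buffers (including AUX area buffers), set up the
 * control fd, allocate per-thread record data and, in parallel streaming
 * mode, create the perf.data directory with one file per mmap.
 */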
static int record__mmap_evlist(struct record *rec, struct evlist *evlist)
{
	int i, ret;
	struct record_opts *opts = &rec->opts;
	bool auxtrace_overwrite = opts->auxtrace_snapshot_mode ||
				  opts->auxtrace_sample_mode;
	char msg[512];

	if (opts->affinity != PERF_AFFINITY_SYS)
		cpu__setup_cpunode_map();

	if (evlist__mmap_ex(evlist, opts->mmap_pages,
			    opts->auxtrace_mmap_pages,
			    auxtrace_overwrite,
			    opts->nr_cblocks, opts->affinity,
			    opts->mmap_flush, opts->comp_level) < 0) {
		if (errno == EPERM) {
			pr_err("Permission error mapping pages.\n"
			       "Consider increasing "
			       "/proc/sys/kernel/perf_event_mlock_kb,\n"
			       "or try again with a smaller value of -m/--mmap_pages.\n"
			       "(current value: %u,%u)\n",
			       opts->mmap_pages, opts->auxtrace_mmap_pages);
			return -errno;
		} else {
			pr_err("failed to mmap with %d (%s)\n", errno,
			       str_error_r(errno, msg, sizeof(msg)));
			if (errno)
				return -errno;
			else
				return -EINVAL;
		}
	}

	if (evlist__initialize_ctlfd(evlist, opts->ctl_fd, opts->ctl_fd_ack))
		return -1;

	ret = record__alloc_thread_data(rec, evlist);
	if (ret)
		return ret;

	if (record__threads_enabled(rec)) {
		ret = perf_data__create_dir(&rec->data, evlist->core.nr_mmaps);
		if (ret) {
			pr_err("Failed to create data directory: %s\n", strerror(-ret));
			return ret;
		}
		for (i = 0; i < evlist->core.nr_mmaps; i++) {
			if (evlist->mmap)
				evlist->mmap[i].file = &rec->data.dir.files[i];
			if (evlist->overwrite_mmap)
				evlist->overwrite_mmap[i].file = &rec->data.dir.files[i];
		}
	}
return 0;
}
static int record__mmap(struct record *rec)
{
	return record__mmap_evlist(rec, rec->evlist);
}
	if (symbol_conf.kptr_restrict && !evlist__exclude_kernel(evlist)) {
		pr_warning(
"WARNING: Kernel address maps (/proc/{kallsyms,modules}) are restricted,\n"
"check /proc/sys/kernel/kptr_restrict and /proc/sys/kernel/perf_event_paranoid.\n\n"
"Samples in kernel functions may not be resolved if a suitable vmlinux\n"
"file is not found in the buildid cache or in the vmlinux path.\n\n"
"Samples in kernel modules won't be resolved at all.\n\n"
"If some relocation was applied (e.g. kexec) symbols may be misresolved\n"
"even with a suitable vmlinux or kallsyms file.\n\n");
	}
	if (evlist__apply_filters(evlist, &pos, &opts->target)) {
		pr_err("failed to set filter \"%s\" on event %s with %d (%s)\n",
			pos->filter ?: "BPF", evsel__name(pos), errno,
			str_error_r(errno, msg, sizeof(msg)));
		rc = -1;
		goto out;
	}
static int process_buildids(struct record *rec)
{
	struct perf_session *session = rec->session;

	if (perf_data__size(&rec->data) == 0)
		return 0;

	/*
	 * During this process, it'll load the kernel map and replace the
	 * dso->long_name with a real pathname it found. In this case
	 * we prefer the vmlinux path like
	 *   /lib/modules/3.16.4/build/vmlinux
	 * rather than the build-id path (in the debug directory):
	 *   $HOME/.debug/.build-id/f0/6e17aa50adf4d00b88925e03775de107611551
	 */
	symbol_conf.ignore_vmlinux_buildid = true;

	/*
	 * If --buildid-all is given, it marks all DSOs regardless of hits,
	 * so there is no need to process samples. But if timestamp_boundary
	 * is enabled, it still needs to walk all samples to get the
	 * timestamps of the first/last samples.
	 */
	if (rec->buildid_all && !rec->timestamp_boundary)
		rec->tool.sample = process_event_sample_stub;

	return perf_session__process_events(session);
}
static void perf_event__synthesize_guest_os(struct machine *machine, void *data)
{
	int err;
	struct perf_tool *tool = data;
	/*
	 * As for the guest kernel when processing the record & report
	 * subcommands, we arrange the module mmap prior to the guest kernel
	 * mmap and trigger a preload dso, because default guest module
	 * symbols are loaded from guest kallsyms instead of
	 * /lib/modules/XXX/XXX. This method is used to avoid missing symbols
	 * when the first addr is in a module instead of in the guest kernel.
	 */
	err = perf_event__synthesize_modules(tool, process_synthesized_event,
					     machine);
	if (err < 0)
		pr_err("Couldn't record guest kernel [%d]'s reference"
		       " relocation symbol.\n", machine->pid);

	/*
	 * We use _stext for the guest kernel because the guest kernel's
	 * /proc/kallsyms sometimes has no _text.
	 */
	err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
						 machine);
	if (err < 0)
		pr_err("Couldn't record guest kernel [%d]'s reference"
		       " relocation symbol.\n", machine->pid);
}
if (record__aio_enabled(rec))
record__aio_set_pos(trace_fd, off);
	/*
	 * Mark the round finished in case we wrote
	 * at least one event.
	 *
	 * No need for round events in directory mode,
	 * because per-cpu maps and files have data
	 * sorted by the kernel.
	 */
	if (!record__threads_enabled(rec) && bytes_written != rec->bytes_written)
		rc = record__write(rec, NULL, &finished_round_event, sizeof(finished_round_event));
if (overwrite)
evlist__toggle_bkw_mmap(evlist, BKW_MMAP_EMPTY);
out:
	return rc;
}
static int record__mmap_read_all(struct record *rec, bool synch)
{
	int err;

	err = record__mmap_read_evlist(rec, rec->evlist, false, synch);
	if (err)
		return err;

static int record__synthesize(struct record *rec, bool tail);
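/*
 * Finish the current output file and switch to a new one (named using the
 * current timestamp), rotating among the configured number of files when
 * --switch-output max files is set. Unless called at exit, tracking events
 * are re-synthesized into the new file.
 */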
static int
record__switch_output(struct record *rec, bool at_exit)
{
	struct perf_data *data = &rec->data;
	char *new_filename = NULL;
	int fd, err;
	/* Same Size: "2015122520103046" */
	char timestamp[] = "InvalidTimestamp";

	record__aio_mmap_read_sync(rec);

	write_finished_init(rec, true);

	record__synthesize(rec, true);
	if (target__none(&rec->opts.target))
		record__synthesize_workload(rec, true);
rec->samples = 0;
record__finish_output(rec);
	err = fetch_current_timestamp(timestamp, sizeof(timestamp));
	if (err) {
		pr_err("Failed to get current timestamp\n");
		return -EINVAL;
}
	if (rec->switch_output.num_files) {
		int n = rec->switch_output.cur_file + 1;

		if (n >= rec->switch_output.num_files)
			n = 0;
		rec->switch_output.cur_file = n;
		if (rec->switch_output.filenames[n]) {
remove(rec->switch_output.filenames[n]);
zfree(&rec->switch_output.filenames[n]);
}
rec->switch_output.filenames[n] = new_filename;
} else {
free(new_filename);
}
	/* Output tracking events */
	if (!at_exit) {
		record__synthesize(rec, false);

		/*
		 * In 'perf record --switch-output' without -a,
		 * record__synthesize() in record__switch_output() won't
		 * generate tracking events because there's no thread_map
		 * in the evlist. This causes the newly created perf.data to
		 * contain no map and comm information.
		 * Create a fake thread_map and directly call
		 * perf_event__synthesize_thread_map() for those events.
		 */
		if (target__none(&rec->opts.target))
			record__synthesize_workload(rec, false);
		write_finished_init(rec, false);
	}
	return fd;
}
static void __record__save_lost_samples(struct record *rec, struct evsel *evsel,
					struct perf_record_lost_samples *lost,
					int cpu_idx, int thread_idx, u64 lost_count,
					u16 misc_flag)
{
	struct perf_sample_id *sid;
	struct perf_sample sample;
	int id_hdr_size;
/*
 * evlist__prepare_workload will send a SIGUSR1
 * if the fork fails, since we asked by setting its
 * want_signal to true.
 */
static void workload_exec_failed_signal(int signo __maybe_unused,
					siginfo_t *info,
					void *ucontext __maybe_unused)
{
workload_exec_errno = info->si_value.sival_int;
done = 1;
child_finished = 1;
}
if (data->is_pipe) {
err = perf_event__synthesize_for_pipe(tool, session, data,
						      process_synthesized_event);
		if (err < 0)
			goto out;
rec->bytes_written += err;
}
err = perf_event__synth_time_conv(record__pick_pc(rec), tool,
					 process_synthesized_event, machine);
	if (err)
		goto out;
/* Synthesize id_index before auxtrace_info */
err = perf_event__synthesize_id_index(tool,
process_synthesized_event,
					      session->evlist, machine);
	if (err)
		goto out;
if (rec->opts.full_auxtrace) {
err = perf_event__synthesize_auxtrace_info(rec->itr, tool,
							  session, process_synthesized_event);
		if (err)
			goto out;
}
if (!evlist__exclude_kernel(rec->evlist)) {
err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
machine);
		WARN_ONCE(err < 0, "Couldn't record kernel reference relocation symbol\n"
				   "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
				   "Check /proc/kallsyms permission or run as root.\n");
err = perf_event__synthesize_modules(tool, process_synthesized_event,
machine);
		WARN_ONCE(err < 0, "Couldn't record kernel module information.\n"
				   "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
				   "Check /proc/modules permission or run as root.\n");
}
if (perf_guest) {
machines__process_guests(&session->machines,
perf_event__synthesize_guest_os, tool);
}
err = perf_event__synthesize_extra_attr(&rec->tool,
rec->evlist,
process_synthesized_event,
						data->is_pipe);
	if (err)
		goto out;
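/*
 * Set up the side-band evlist: attach the SIGUSR2 callback when
 * --switch-output-event populated it, optionally add the BPF side-band
 * event, and start the side-band thread.
 */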
static int record__setup_sb_evlist(struct record *rec)
{
	struct record_opts *opts = &rec->opts;

	if (rec->sb_evlist != NULL) {
		/*
		 * We get here if --switch-output-event populated the
		 * sb_evlist, so associate a callback that will send a SIGUSR2
		 * to the main thread.
		 */
		evlist__set_cb(rec->sb_evlist, record__process_signal_event, rec);
		rec->thread_id = pthread_self();
	}
#ifdef HAVE_LIBBPF_SUPPORT
	if (!opts->no_bpf_event) {
		if (rec->sb_evlist == NULL) {
			rec->sb_evlist = evlist__new();

			if (rec->sb_evlist == NULL) {
				pr_err("Couldn't create side band evlist.\n");
				return -1;
			}
		}

		if (evlist__add_bpf_sb_event(rec->sb_evlist, perf_session__env(rec->session))) {
			pr_err("Couldn't ask for PERF_RECORD_BPF_EVENT side band events.\n");
			return -1;
		}
	}
#endif
	if (evlist__start_sb_thread(rec->sb_evlist, &rec->opts.target)) {
		pr_debug("Couldn't start the BPF side band thread:\nBPF programs starting from now on won't be annotatable\n");
		opts->no_bpf_event = true;
	}
if (rec->opts.kcore &&
!record__kcore_readable(&session->machines.host)) {
		pr_err("ERROR: kcore is not readable.\n");
		return -1;
}
	if (record__init_clock(rec))
		return -1;
record__init_features(rec);
if (forks) {
		err = evlist__prepare_workload(rec->evlist, &opts->target, argv, data->is_pipe,
					       workload_exec_failed_signal);
		if (err < 0) {
			pr_err("Couldn't run the workload!\n");
			status = err;
			goto out_delete_session;
}
}
	/*
	 * If we have just a single event and are sending data
	 * through a pipe, we need to force the ids allocation,
	 * because we synthesize the event name through the pipe
	 * and need the id for that.
	 */
	if (data->is_pipe && rec->evlist->core.nr_entries == 1)
		rec->opts.sample_id = true;