buffer = auxtrace_buffer__next(queue, buffer); /* If no more data, drop the previous auxtrace_buffer and return */ if (!buffer) { if (old_buffer)
auxtrace_buffer__drop_data(old_buffer);
b->len = 0; return 0;
}
speq->buffer = buffer;
/* If the aux_buffer doesn't have data associated, try to load it */ if (!buffer->data) { /* get the file desc associated with the perf data file */ int fd = perf_data__fd(speq->spe->session->data);
buffer->data = auxtrace_buffer__get_data(buffer, fd); if (!buffer->data) return -ENOMEM;
}
b->len = buffer->size;
b->buf = buffer->data;
if (b->len) { if (old_buffer)
auxtrace_buffer__drop_data(old_buffer);
speq->old_buffer = buffer;
} else {
auxtrace_buffer__drop_data(buffer); return arm_spe_get_trace(b, data);
}
if (record->type & ARM_SPE_BRANCH_MISS)
speq->flags |= PERF_IP_FLAG_BRANCH_MISS;
if (record->type & ARM_SPE_BRANCH_NOT_TAKEN)
speq->flags |= PERF_IP_FLAG_NOT_TAKEN;
if (record->type & ARM_SPE_IN_TXN)
speq->flags |= PERF_IP_FLAG_IN_TX;
if (record->op & ARM_SPE_OP_BR_COND)
speq->flags |= PERF_IP_FLAG_CONDITIONAL;
if (record->op & ARM_SPE_OP_BR_CR_BL)
speq->flags |= PERF_IP_FLAG_CALL; elseif (record->op & ARM_SPE_OP_BR_CR_RET)
speq->flags |= PERF_IP_FLAG_RETURN; /* * Indirect branch instruction without link (e.g. BR), * take it as a function return.
*/ elseif (record->op & ARM_SPE_OP_BR_INDIRECT)
speq->flags |= PERF_IP_FLAG_RETURN;
}
}
staticvoid arm_spe__synth_data_source_common(conststruct arm_spe_record *record, union perf_mem_data_src *data_src)
{ /* * Even though four levels of cache hierarchy are possible, no known * production Neoverse systems currently include more than three levels * so for the time being we assume three exist. If a production system * is built with four the this function would have to be changed to * detect the number of levels for reporting.
*/
/* * We have no data on the hit level or data source for stores in the * Neoverse SPE records.
*/ if (record->op & ARM_SPE_OP_ST) {
data_src->mem_lvl = PERF_MEM_LVL_NA;
data_src->mem_lvl_num = PERF_MEM_LVLNUM_NA;
data_src->mem_snoop = PERF_MEM_SNOOP_NA; return;
}
switch (record->source) { case ARM_SPE_COMMON_DS_L1D:
data_src->mem_lvl = PERF_MEM_LVL_L1 | PERF_MEM_LVL_HIT;
data_src->mem_lvl_num = PERF_MEM_LVLNUM_L1;
data_src->mem_snoop = PERF_MEM_SNOOP_NONE; break; case ARM_SPE_COMMON_DS_L2:
data_src->mem_lvl = PERF_MEM_LVL_L2 | PERF_MEM_LVL_HIT;
data_src->mem_lvl_num = PERF_MEM_LVLNUM_L2;
data_src->mem_snoop = PERF_MEM_SNOOP_NONE; break; case ARM_SPE_COMMON_DS_PEER_CORE:
data_src->mem_lvl = PERF_MEM_LVL_L2 | PERF_MEM_LVL_HIT;
data_src->mem_lvl_num = PERF_MEM_LVLNUM_L2;
data_src->mem_snoopx = PERF_MEM_SNOOPX_PEER; break; /* * We don't know if this is L1, L2 but we do know it was a cache-2-cache * transfer, so set SNOOPX_PEER
*/ case ARM_SPE_COMMON_DS_LOCAL_CLUSTER: case ARM_SPE_COMMON_DS_PEER_CLUSTER:
data_src->mem_lvl = PERF_MEM_LVL_L3 | PERF_MEM_LVL_HIT;
data_src->mem_lvl_num = PERF_MEM_LVLNUM_L3;
data_src->mem_snoopx = PERF_MEM_SNOOPX_PEER; break; /* * System cache is assumed to be L3
*/ case ARM_SPE_COMMON_DS_SYS_CACHE:
data_src->mem_lvl = PERF_MEM_LVL_L3 | PERF_MEM_LVL_HIT;
data_src->mem_lvl_num = PERF_MEM_LVLNUM_L3;
data_src->mem_snoop = PERF_MEM_SNOOP_HIT; break; /* * We don't know what level it hit in, except it came from the other * socket
*/ case ARM_SPE_COMMON_DS_REMOTE:
data_src->mem_lvl = PERF_MEM_LVL_NA;
data_src->mem_lvl_num = PERF_MEM_LVLNUM_NA;
data_src->mem_remote = PERF_MEM_REMOTE_REMOTE;
data_src->mem_snoopx = PERF_MEM_SNOOPX_PEER; break; case ARM_SPE_COMMON_DS_DRAM:
data_src->mem_lvl = PERF_MEM_LVL_LOC_RAM | PERF_MEM_LVL_HIT;
data_src->mem_lvl_num = PERF_MEM_LVLNUM_RAM;
data_src->mem_snoop = PERF_MEM_SNOOP_NONE; break; default: break;
}
}
/* * Source is IMPDEF. Here we convert the source code used on AmpereOne cores * to the common (Neoverse, Cortex) to avoid duplicating the decoding code.
*/ staticvoid arm_spe__synth_data_source_ampereone(conststruct arm_spe_record *record, union perf_mem_data_src *data_src)
{ struct arm_spe_record common_record;
switch (record->source) { case ARM_SPE_AMPEREONE_LOCAL_CHIP_CACHE_OR_DEVICE:
common_record.source = ARM_SPE_COMMON_DS_PEER_CORE; break; case ARM_SPE_AMPEREONE_SLC:
common_record.source = ARM_SPE_COMMON_DS_SYS_CACHE; break; case ARM_SPE_AMPEREONE_REMOTE_CHIP_CACHE:
common_record.source = ARM_SPE_COMMON_DS_REMOTE; break; case ARM_SPE_AMPEREONE_DDR:
common_record.source = ARM_SPE_COMMON_DS_DRAM; break; case ARM_SPE_AMPEREONE_L1D:
common_record.source = ARM_SPE_COMMON_DS_L1D; break; case ARM_SPE_AMPEREONE_L2D:
common_record.source = ARM_SPE_COMMON_DS_L2; break; default:
pr_warning_once("AmpereOne: Unknown data source (0x%x)\n",
record->source); return;
}
staticvoid arm_spe__synth_data_source_hisi_hip(conststruct arm_spe_record *record, union perf_mem_data_src *data_src)
{ /* Use common synthesis method to handle store operations */ if (record->op & ARM_SPE_OP_ST) {
arm_spe__synth_data_source_common(record, data_src); return;
}
/* Metadata version 1 assumes all CPUs are the same (old behavior) */ if (spe->metadata_ver == 1) { constchar *cpuid;
pr_warning_once("Old SPE metadata, re-record to improve decode accuracy\n");
cpuid = perf_env__cpuid(perf_session__env(spe->session));
midr = strtol(cpuid, NULL, 16);
} else { /* CPU ID is -1 for per-thread mode */ if (speq->cpu < 0) { /* * On the heterogeneous system, due to CPU ID is -1, * cannot confirm the data source packet is supported.
*/ if (!spe->is_homogeneous) returnfalse;
/* In homogeneous system, simply use CPU0's metadata */ if (spe->metadata)
metadata = spe->metadata[0];
} else {
metadata = arm_spe__get_metadata_by_cpu(spe, speq->cpu);
}
if (!metadata) returnfalse;
midr = metadata[ARM_SPE_CPU_MIDR];
}
for (i = 0; i < ARRAY_SIZE(data_source_handles); i++) { if (is_midr_in_range_list(midr, data_source_handles[i].midr_ranges)) {
data_source_handles[i].ds_synth(record, data_src); returntrue;
}
}
if (spe->sample_flc) { if (record->type & ARM_SPE_L1D_MISS) {
err = arm_spe__synth_mem_sample(speq, spe->l1d_miss_id,
data_src); if (err) return err;
}
if (record->type & ARM_SPE_L1D_ACCESS) {
err = arm_spe__synth_mem_sample(speq, spe->l1d_access_id,
data_src); if (err) return err;
}
}
if (spe->sample_llc) { if (record->type & ARM_SPE_LLC_MISS) {
err = arm_spe__synth_mem_sample(speq, spe->llc_miss_id,
data_src); if (err) return err;
}
if (record->type & ARM_SPE_LLC_ACCESS) {
err = arm_spe__synth_mem_sample(speq, spe->llc_access_id,
data_src); if (err) return err;
}
}
if (spe->sample_tlb) { if (record->type & ARM_SPE_TLB_MISS) {
err = arm_spe__synth_mem_sample(speq, spe->tlb_miss_id,
data_src); if (err) return err;
}
if (record->type & ARM_SPE_TLB_ACCESS) {
err = arm_spe__synth_mem_sample(speq, spe->tlb_access_id,
data_src); if (err) return err;
}
}
if (spe->synth_opts.last_branch &&
(spe->sample_branch || spe->sample_instructions))
arm_spe__prep_branch_stack(speq);
if (spe->sample_branch && (record->op & ARM_SPE_OP_BRANCH_ERET)) {
err = arm_spe__synth_branch_sample(speq, spe->branch_id); if (err) return err;
}
if (spe->sample_remote_access &&
(record->type & ARM_SPE_REMOTE_ACCESS)) {
err = arm_spe__synth_mem_sample(speq, spe->remote_access_id,
data_src); if (err) return err;
}
/* * When data_src is zero it means the record is not a memory operation, * skip to synthesize memory sample for this case.
*/ if (spe->sample_memory && is_ldst_op(record->op)) {
err = arm_spe__synth_mem_sample(speq, spe->memory_id, data_src); if (err) return err;
}
if (spe->sample_instructions) {
err = arm_spe__synth_instruction_sample(speq, spe->instructions_id, data_src); if (err) return err;
}
if (!spe->kernel_start)
spe->kernel_start = machine__kernel_start(spe->machine);
while (1) { /* * The usual logic is firstly to decode the packets, and then * based the record to synthesize sample; but here the flow is * reversed: it calls arm_spe_sample() for synthesizing samples * prior to arm_spe_decode(). * * Two reasons for this code logic: * 1. Firstly, when setup queue in arm_spe__setup_queue(), it * has decoded trace data and generated a record, but the record * is left to generate sample until run to here, so it's correct * to synthesize sample for the left record. * 2. After decoding trace data, it needs to compare the record * timestamp with the coming perf event, if the record timestamp * is later than the perf event, it needs bail out and pushs the * record into auxtrace heap, thus the record can be deferred to * synthesize sample until run to here at the next time; so this * can correlate samples between Arm SPE trace data and other * perf events with correct time ordering.
*/
/* * Update pid/tid info.
*/
record = &speq->decoder->record; if (!spe->timeless_decoding && record->context_id != (u64)-1) {
ret = arm_spe_set_tid(speq, record->context_id); if (ret) return ret;
spe->use_ctx_pkt_for_pid = true;
}
ret = arm_spe_sample(speq); if (ret) return ret;
ret = arm_spe_decode(speq->decoder); if (!ret) {
pr_debug("No data or all data has been processed.\n"); return 1;
}
/* * Error is detected when decode SPE trace data, continue to * the next trace data and find out more records.
*/ if (ret < 0) continue;
record = &speq->decoder->record;
/* Update timestamp for the last record */ if (record->timestamp > speq->timestamp)
speq->timestamp = record->timestamp;
/* * If the timestamp of the queue is later than timestamp of the * coming perf event, bail out so can allow the perf event to * be processed ahead.
*/ if (!spe->timeless_decoding && speq->timestamp >= *timestamp) {
*timestamp = speq->timestamp; return 0;
}
}
/* * Circle through the list of event and complain if we find one * with the time bit set.
*/
evlist__for_each_entry(evlist, evsel) { if ((evsel->core.attr.sample_type & PERF_SAMPLE_TIME))
timeless_decoding = false;
}
/* * A previous context-switch event has set pid/tid in the machine's context, so * here we need to update the pid/tid in the thread and SPE queue.
*/ if (!spe->use_ctx_pkt_for_pid)
arm_spe_set_pid_tid_cpu(spe, queue);
ret = arm_spe_run_decoder(speq, &ts); if (ret < 0) {
auxtrace_heap__add(&spe->heap, queue_nr, ts); return ret;
}
if (!ret) {
ret = auxtrace_heap__add(&spe->heap, queue_nr, ts); if (ret < 0) return ret;
} else {
speq->on_heap = false;
}
}
/* Dump here now we have copied a piped trace out of the pipe */ if (dump_trace) { if (auxtrace_buffer__get_data(buffer, fd)) {
arm_spe_dump_event(spe, buffer->data,
buffer->size);
auxtrace_buffer__put_data(buffer);
}
}
}
ret = arm_spe__update_queues(spe); if (ret < 0) return ret;
if (spe->timeless_decoding) return arm_spe_process_timeless_queues(spe, -1,
MAX_TIMESTAMP - 1);
ret = arm_spe_process_queues(spe, MAX_TIMESTAMP); if (ret) return ret;
if (!spe->use_ctx_pkt_for_pid)
ui__warning("Arm SPE CONTEXT packets not found in the traces.\n" "Matching of TIDs to SPE events could be inaccurate.\n");
return 0;
}
static u64 *arm_spe__alloc_per_cpu_metadata(u64 *buf, int per_cpu_size)
{
u64 *metadata;
metadata = zalloc(per_cpu_size); if (!metadata) return NULL;
for (i = 0; i < hdr_size; i++)
fprintf(stdout, hdr_fmts[i], arr[i]);
arr += hdr_size; for (cpu = 0; cpu < cpu_num; cpu++) { /* * The parameters from ARM_SPE_MAGIC to ARM_SPE_CPU_NR_PARAMS * are fixed. The sequential parameter size is decided by the * field 'ARM_SPE_CPU_NR_PARAMS'.
*/
cpu_size = (ARM_SPE_CPU_NR_PARAMS + 1) + arr[ARM_SPE_CPU_NR_PARAMS]; for (i = 0; i < cpu_size; i++)
fprintf(stdout, metadata_per_cpu_fmts[i], arr[i]);
arr += cpu_size;
}
}
if (spe->synth_opts.last_branch) { if (spe->synth_opts.last_branch_sz > 2)
pr_debug("Arm SPE supports only two bstack entries (PBT+TGT).\n");
attr.sample_type |= PERF_SAMPLE_BRANCH_STACK; /* * We don't use the hardware index, but the sample generation * code uses the new format branch_stack with this field, * so the event attributes must indicate that it's present.
*/
attr.branch_sample_type |= PERF_SAMPLE_BRANCH_HW_INDEX;
}
if (spe->synth_opts.branches) {
spe->sample_branch = true;
if (spe->synth_opts.mem) {
spe->sample_memory = true;
err = perf_session__deliver_synth_attr_event(session, &attr, id); if (err) return err;
spe->memory_id = id;
arm_spe_set_event_name(evlist, id, "memory");
id += 1;
}
if (spe->synth_opts.instructions) { if (spe->synth_opts.period_type != PERF_ITRACE_PERIOD_INSTRUCTIONS) {
pr_warning("Only instruction-based sampling period is currently supported by Arm SPE.\n"); goto synth_instructions_out;
} if (spe->synth_opts.period > 1)
pr_warning("Arm SPE has a hardware-based sample period.\n" "Additional instruction events will be discarded by --itrace\n");
/* * The synthesized event PERF_RECORD_TIME_CONV has been handled ahead * and the parameters for hardware clock are stored in the session * context. Passes these parameters to the struct perf_tsc_conversion * in "spe->tc", which is used for later conversion between clock * counter and timestamp. * * For backward compatibility, copies the fields starting from * "time_cycles" only if they are contained in the event.
*/
spe->tc.time_shift = tc->time_shift;
spe->tc.time_mult = tc->time_mult;
spe->tc.time_zero = tc->time_zero;
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereitgestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.