// SPDX-License-Identifier: GPL-2.0-only /* * Perf support for the Statistical Profiling Extension, introduced as * part of ARMv8.2. * * Copyright (C) 2016 ARM Limited * * Author: Will Deacon <will.deacon@arm.com>
*/
/*
 * Cache if the event is allowed to trace Context information.
 * This allows us to perform the check, i.e., perf_allow_kernel(),
 * in the context of the event owner, once, during the event_init().
 */
#define SPE_PMU_HW_FLAGS_CX 0x00001
/*
 * Convert a free-running index from perf into an SPE buffer offset.
 * The buffer is buf->nr_pages pages long, so the offset is the index
 * taken modulo the byte size of the buffer.
 */
#define PERF_IDX2OFF(idx, buf) \
	((idx) % ((unsigned long)(buf)->nr_pages << PAGE_SHIFT))
/* Keep track of our dynamic hotplug state */ staticenum cpuhp_state arm_spe_pmu_online;
/* This sysfs gunk was really good fun to write. */
enum arm_spe_pmu_capabilities {
	/* Boolean capabilities — presumably exposed as 0/1 in sysfs; verify against the show() handler */
	SPE_PMU_CAP_ARCH_INST = 0,
	SPE_PMU_CAP_ERND,
	/* Sentinel: number of boolean capabilities */
	SPE_PMU_CAP_FEAT_MAX,
	/* Value-returning capabilities start here, aliasing the sentinel */
	SPE_PMU_CAP_CNT_SZ = SPE_PMU_CAP_FEAT_MAX,
	SPE_PMU_CAP_MIN_IVAL,
};
/*
 * The PMSIDR_EL1.Interval field (stored in spe_pmu->min_period) is a
 * recommendation for the minimum interval, not a hardware limitation.
 *
 * According to the Arm ARM (DDI 0487 L.a), section D24.7.12 PMSIRR_EL1,
 * Sampling Interval Reload Register, the INTERVAL field (bits [31:8])
 * states: "Software must set this to a nonzero value". Use 1 as the
 * minimum value.
 */
u64 min_period = FIELD_PREP(PMSIRR_EL1_INTERVAL_MASK, 1);
/*
 * The trace format isn't parseable in reverse, so clamp
 * the limit to half of the buffer size in snapshot mode
 * so that the worst case is half a buffer of records, as
 * opposed to a single record.
 */
if (head < limit >> 1)
	limit >>= 1;

/*
 * If we're within max_record_sz of the limit, we must
 * pad, move the head index and recompute the limit.
 */
if (limit - head < spe_pmu->max_record_sz) {
	arm_spe_pmu_pad_buf(handle, limit - head);
	handle->head = PERF_IDX2OFF(limit, buf);
	/* New limit: half the buffer beyond the (wrapped) head */
	limit = ((buf->nr_pages * PAGE_SIZE) >> 1) + handle->head;
}
/*
 * The head can be misaligned for two reasons:
 *
 * 1. The hardware left PMBPTR pointing to the first byte after
 *    a record when generating a buffer management event.
 *
 * 2. We used perf_aux_output_skip to consume handle->size bytes
 *    and CIRC_SPACE was used to compute the size, which always
 *    leaves one entry free.
 *
 * Deal with this by padding to the next alignment boundary and
 * moving the head index. If we run out of buffer space, we'll
 * reduce handle->size to zero and end up reporting truncation.
 */
head = PERF_IDX2OFF(handle->head, buf);
if (!IS_ALIGNED(head, spe_pmu->align)) {
	unsigned long delta = roundup(head, spe_pmu->align) - head;

	/* Never pad more than the space we actually have left */
	delta = min(delta, handle->size);
	arm_spe_pmu_pad_buf(handle, delta);
	head = PERF_IDX2OFF(handle->head, buf);
}

/* If we've run out of free space, then nothing more to do */
if (!handle->size)
	goto no_space;

/* Compute the tail and wakeup indices now that we've aligned head */
tail = PERF_IDX2OFF(handle->head + handle->size, buf);
wakeup = PERF_IDX2OFF(handle->wakeup, buf);

/*
 * Avoid clobbering unconsumed data. We know we have space, so
 * if we see head == tail we know that the buffer is empty. If
 * head > tail, then there's nothing to clobber prior to
 * wrapping.
 */
if (head < tail)
	limit = round_down(tail, PAGE_SIZE);

/*
 * Wakeup may be arbitrarily far into the future. If it's not in
 * the current generation, either we'll wrap before hitting it,
 * or it's in the past and has been handled already.
 *
 * If there's a wakeup before we wrap, arrange to be woken up by
 * the page boundary following it. Keep the tail boundary if
 * that's lower.
 */
if (handle->wakeup < (handle->head + handle->size) && head <= wakeup)
	limit = min(limit, round_up(wakeup, PAGE_SIZE));

/*
 * If the head has come too close to the end of the buffer,
 * then pad to the end and recompute the limit.
 */
if (limit && (limit - head < spe_pmu->max_record_sz)) {
	arm_spe_pmu_pad_buf(handle, limit - head);
	limit = __arm_spe_pmu_next_off(handle);
}
/* Start a new aux session */
buf = perf_aux_output_begin(handle, event);
if (!buf) {
	/* No AUX buffer available: mark the event stopped */
	event->hw.state |= PERF_HES_STOPPED;
	/*
	 * We still need to clear the limit pointer, since the
	 * profiler might only be disabled by virtue of a fault.
	 */
	limit = 0;
	goto out_write_limit;
}
/*
 * Ensure new profiling data is visible to the CPU and any external
 * aborts have been resolved.
 */
psb_csync();
dsb(nsh);

/* Ensure hardware updates to PMBPTR_EL1 are visible */
isb();

/* Service required? If the service bit is clear, this IRQ was spurious. */
pmbsr = read_sysreg_s(SYS_PMBSR_EL1);
if (!FIELD_GET(PMBSR_EL1_S, pmbsr))
	return SPE_PMU_BUF_FAULT_ACT_SPURIOUS;

/*
 * If we've lost data, disable profiling and also set the PARTIAL
 * flag to indicate that the last record is corrupted.
 */
if (FIELD_GET(PMBSR_EL1_DL, pmbsr))
	perf_aux_output_flag(handle, PERF_AUX_FLAG_TRUNCATED |
			     PERF_AUX_FLAG_PARTIAL);

/* Report collisions to userspace so that it can up the period */
if (FIELD_GET(PMBSR_EL1_COLL, pmbsr))
	perf_aux_output_flag(handle, PERF_AUX_FLAG_COLLISION);

/* We only expect buffer management events */
switch (FIELD_GET(PMBSR_EL1_EC, pmbsr)) {
case PMBSR_EL1_EC_BUF:
	/* Handled below */
	break;
case PMBSR_EL1_EC_FAULT_S1:
case PMBSR_EL1_EC_FAULT_S2:
	err_str = "Unexpected buffer fault";
	goto out_err;
default:
	err_str = "Unknown error code";
	goto out_err;
}

/* Buffer management event */
switch (FIELD_GET(PMBSR_EL1_BUF_BSC_MASK, pmbsr)) {
case PMBSR_EL1_BUF_BSC_FULL:
	ret = SPE_PMU_BUF_FAULT_ACT_OK;
	goto out_stop;
default:
	/* Unknown status: fall through to the fatal-error path below */
	err_str = "Unknown buffer status code";
}

out_err:
	pr_err_ratelimited("%s on CPU %d [PMBSR=0x%016llx, PMBPTR=0x%016llx, PMBLIMITR=0x%016llx]\n",
			   err_str, smp_processor_id(), pmbsr,
			   read_sysreg_s(SYS_PMBPTR_EL1),
			   read_sysreg_s(SYS_PMBLIMITR_EL1));
	ret = SPE_PMU_BUF_FAULT_ACT_FATAL;
/* Classify the buffer fault; spurious IRQs are not ours to handle */
act = arm_spe_pmu_buf_get_fault_act(handle);
if (act == SPE_PMU_BUF_FAULT_ACT_SPURIOUS)
	return IRQ_NONE;

/*
 * Ensure perf callbacks have completed, which may disable the
 * profiling buffer in response to a TRUNCATION flag.
 */
irq_work_run();

switch (act) {
case SPE_PMU_BUF_FAULT_ACT_FATAL:
	/*
	 * If a fatal exception occurred then leaving the profiling
	 * buffer enabled is a recipe waiting to happen. Since
	 * fatal faults don't always imply truncation, make sure
	 * that the profiling buffer is disabled explicitly before
	 * clearing the syndrome register.
	 */
	arm_spe_pmu_disable_and_drain_local();
	break;
case SPE_PMU_BUF_FAULT_ACT_OK:
	/*
	 * We handled the fault (the buffer was full), so resume
	 * profiling as long as we didn't detect truncation.
	 * PMBPTR might be misaligned, but we'll burn that bridge
	 * when we get to it.
	 */
	if (!(handle->aux_flags & PERF_AUX_FLAG_TRUNCATED)) {
		arm_spe_perf_aux_output_begin(handle, event);
		isb();
	}
	break;
case SPE_PMU_BUF_FAULT_ACT_SPURIOUS:
	/* We've seen you before, but GCC has the memory of a sieve. */
	break;
}

/* The buffer pointers are now sane, so resume profiling. */
write_sysreg_s(0, SYS_PMBSR_EL1);
return IRQ_HANDLED;
}
/*
 * Return the RES0 mask for PMSEVFR_EL1 corresponding to the given
 * SPE version (ID_AA64DFR0_EL1.PMSVer). Versions newer than the
 * highest one we know about get the newest mask we support.
 */
static u64 arm_spe_pmsevfr_res0(u16 pmsver)
{
	if (pmsver == ID_AA64DFR0_EL1_PMSVer_IMP)
		return PMSEVFR_EL1_RES0_IMP;

	if (pmsver == ID_AA64DFR0_EL1_PMSVer_V1P1)
		return PMSEVFR_EL1_RES0_V1P1;

	/* V1P2 and anything we don't recognise: use the highest supported */
	return PMSEVFR_EL1_RES0_V1P2;
}
/* This is, of course, deeply driver-specific */
if (attr->type != event->pmu->type)
	return -ENOENT;

/* Reject events bound to a CPU outside our supported mask */
if (event->cpu >= 0 &&
    !cpumask_test_cpu(event->cpu, &spe_pmu->supported_cpus))
	return -ENOENT;

/* Reject filter values that set RES0 bits for this SPE version */
if (arm_spe_event_to_pmsevfr(event) & arm_spe_pmsevfr_res0(spe_pmu->pmsver))
	return -EOPNOTSUPP;

if (arm_spe_event_to_pmsnevfr(event) & arm_spe_pmsevfr_res0(spe_pmu->pmsver))
	return -EOPNOTSUPP;

if (attr->exclude_idle)
	return -EOPNOTSUPP;

/*
 * Feedback-directed frequency throttling doesn't work when we
 * have a buffer of samples. We'd need to manually count the
 * samples in the buffer when it fills up and adjust the event
 * count to reflect that. Instead, just force the user to specify
 * a sample period.
 */
if (attr->freq)
	return -EINVAL;
/* If we're already stopped, then nothing to do */ if (hwc->state & PERF_HES_STOPPED) return;
/* Stop all trace generation */
arm_spe_pmu_disable_and_drain_local();
if (flags & PERF_EF_UPDATE) { /* * If there's a fault pending then ensure we contain it * to this buffer, since we might be on the context-switch * path.
*/ if (perf_get_aux(handle)) { enum arm_spe_pmu_buf_fault_action act;
/* * This may also contain ECOUNT, but nobody else should * be looking at period_left, since we forbid frequency * based sampling.
*/
local64_set(&hwc->period_left, read_sysreg_s(SYS_PMSICR_EL1));
hwc->state |= PERF_HES_UPTODATE;
}
hwc->state |= PERF_HES_STOPPED;
}
/*
 * perf 'add' callback: schedule the event on this CPU.
 * Fails with -ENOENT if the current (or requested) CPU is not
 * in the PMU's supported mask.
 */
static int arm_spe_pmu_add(struct perf_event *event, int flags)
{
	int ret = 0;
	struct arm_spe_pmu *spe_pmu = to_spe_pmu(event->pmu);
	struct hw_perf_event *hwc = &event->hw;
	/* Per-task events (cpu == -1) run on the current CPU */
	int cpu = event->cpu == -1 ? smp_processor_id() : event->cpu;

	if (!cpumask_test_cpu(cpu, &spe_pmu->supported_cpus))
		return -ENOENT;
/*
 * Allocate and map the AUX buffer for an SPE event. The perf core
 * hands us the page array; we vmap it so the driver sees a single
 * contiguous virtual range. Returns NULL on failure.
 */
static void *arm_spe_pmu_setup_aux(struct perf_event *event, void **pages,
				   int nr_pages, bool snapshot)
{
	int i, cpu = event->cpu;
	struct page **pglist;
	struct arm_spe_pmu_buf *buf;

	/* We need at least two pages for this to work. */
	if (nr_pages < 2)
		return NULL;

	/*
	 * We require an even number of pages for snapshot mode, so that
	 * we can effectively treat the buffer as consisting of two equal
	 * parts and give userspace a fighting chance of getting some
	 * useful data out of it.
	 */
	if (snapshot && (nr_pages & 1))
		return NULL;

	if (cpu == -1)
		cpu = raw_smp_processor_id();

	/* Allocate on the node of the CPU that will drain the buffer */
	buf = kzalloc_node(sizeof(*buf), GFP_KERNEL, cpu_to_node(cpu));
	if (!buf)
		return NULL;

	pglist = kcalloc(nr_pages, sizeof(*pglist), GFP_KERNEL);
	if (!pglist)
		goto out_free_buf;

	for (i = 0; i < nr_pages; ++i)
		pglist[i] = virt_to_page(pages[i]);

	buf->base = vmap(pglist, nr_pages, VM_MAP, PAGE_KERNEL);
	if (!buf->base)
		goto out_free_pglist;
int idx;
char *name;
struct device *dev = &spe_pmu->pdev->dev;

/* Fill in the perf_pmu_ops and capabilities for this SPE instance */
spe_pmu->pmu = (struct pmu) {
	.module = THIS_MODULE,
	.parent = &spe_pmu->pdev->dev,
	.capabilities = PERF_PMU_CAP_EXCLUSIVE | PERF_PMU_CAP_ITRACE,
	.attr_groups = arm_spe_pmu_attr_groups,
	/*
	 * We hitch a ride on the software context here, so that
	 * we can support per-task profiling (which is not possible
	 * with the invalid context as it doesn't get sched callbacks).
	 * This requires that userspace either uses a dummy event for
	 * perf_event_open, since the aux buffer is not setup until
	 * a subsequent mmap, or creates the profiling event in a
	 * disabled state and explicitly PERF_EVENT_IOC_ENABLEs it
	 * once the buffer has been created.
	 */
	.task_ctx_nr = perf_sw_context,
	.event_init = arm_spe_pmu_event_init,
	.add = arm_spe_pmu_add,
	.del = arm_spe_pmu_del,
	.start = arm_spe_pmu_start,
	.stop = arm_spe_pmu_stop,
	.read = arm_spe_pmu_read,
	.setup_aux = arm_spe_pmu_setup_aux,
	.free_aux = arm_spe_pmu_free_aux,
};

/* Give each probed instance a unique name, e.g. <PMUNAME>_0 */
idx = atomic_inc_return(&pmu_idx);
name = devm_kasprintf(dev, GFP_KERNEL, "%s_%d", PMUNAME, idx);
if (!name) {
	dev_err(dev, "failed to allocate name for pmu %d\n", idx);
	return -ENOMEM;
}
/* SPE present at all? ID_AA64DFR0_EL1.PMSVer == 0 means no SPE */
fld = cpuid_feature_extract_unsigned_field(read_cpuid(ID_AA64DFR0_EL1),
					   ID_AA64DFR0_EL1_PMSVer_SHIFT);
if (!fld) {
	dev_err(dev,
		"unsupported ID_AA64DFR0_EL1.PMSVer [%d] on CPU %d\n",
		fld, smp_processor_id());
	return;
}
spe_pmu->pmsver = (u16)fld;

/* Read PMBIDR first to determine whether or not we have access */
reg = read_sysreg_s(SYS_PMBIDR_EL1);
if (FIELD_GET(PMBIDR_EL1_P, reg)) {
	dev_err(dev,
		"profiling buffer owned by higher exception level\n");
	return;
}

/* Minimum alignment. If it's out-of-range, then fail the probe */
fld = FIELD_GET(PMBIDR_EL1_ALIGN, reg);
spe_pmu->align = 1 << fld;
if (spe_pmu->align > SZ_2K) {
	dev_err(dev, "unsupported PMBIDR.Align [%d] on CPU %d\n",
		fld, smp_processor_id());
	return;
}

/* It's now safe to read PMSIDR and figure out what we've got */
reg = read_sysreg_s(SYS_PMSIDR_EL1);
if (FIELD_GET(PMSIDR_EL1_FE, reg))
	spe_pmu->features |= SPE_PMU_FEAT_FILT_EVT;

if (FIELD_GET(PMSIDR_EL1_FnE, reg))
	spe_pmu->features |= SPE_PMU_FEAT_INV_FILT_EVT;

if (FIELD_GET(PMSIDR_EL1_FT, reg))
	spe_pmu->features |= SPE_PMU_FEAT_FILT_TYP;

if (FIELD_GET(PMSIDR_EL1_FL, reg))
	spe_pmu->features |= SPE_PMU_FEAT_FILT_LAT;

if (FIELD_GET(PMSIDR_EL1_ARCHINST, reg))
	spe_pmu->features |= SPE_PMU_FEAT_ARCH_INST;

if (FIELD_GET(PMSIDR_EL1_LDS, reg))
	spe_pmu->features |= SPE_PMU_FEAT_LDS;

if (FIELD_GET(PMSIDR_EL1_ERND, reg))
	spe_pmu->features |= SPE_PMU_FEAT_ERND;

/* Discard mode is a v1.2 addition, keyed off the version field */
if (spe_pmu->pmsver >= ID_AA64DFR0_EL1_PMSVer_V1P2)
	spe_pmu->features |= SPE_PMU_FEAT_DISCARD;

/* This field has a spaced out encoding, so just use a look-up */
fld = FIELD_GET(PMSIDR_EL1_INTERVAL, reg);
switch (fld) {
case PMSIDR_EL1_INTERVAL_256:
	spe_pmu->min_period = 256;
	break;
case PMSIDR_EL1_INTERVAL_512:
	spe_pmu->min_period = 512;
	break;
case PMSIDR_EL1_INTERVAL_768:
	spe_pmu->min_period = 768;
	break;
case PMSIDR_EL1_INTERVAL_1024:
	spe_pmu->min_period = 1024;
	break;
case PMSIDR_EL1_INTERVAL_1536:
	spe_pmu->min_period = 1536;
	break;
case PMSIDR_EL1_INTERVAL_2048:
	spe_pmu->min_period = 2048;
	break;
case PMSIDR_EL1_INTERVAL_3072:
	spe_pmu->min_period = 3072;
	break;
default:
	/* Unknown encodings are treated like encoding 8 (4096) */
	dev_warn(dev, "unknown PMSIDR_EL1.Interval [%d]; assuming 8\n",
		 fld);
	fallthrough;
case PMSIDR_EL1_INTERVAL_4096:
	spe_pmu->min_period = 4096;
}

/* Maximum record size. If it's out-of-range, then fail the probe */
fld = FIELD_GET(PMSIDR_EL1_MAXSIZE, reg);
spe_pmu->max_record_sz = 1 << fld;
if (spe_pmu->max_record_sz > SZ_2K || spe_pmu->max_record_sz < 16) {
	dev_err(dev, "unsupported PMSIDR_EL1.MaxSize [%d] on CPU %d\n",
		fld, smp_processor_id());
	return;
}

dev_info(dev,
	 "probed SPEv1.%d for CPUs %*pbl [max_record_sz %u, align %u, features 0x%llx]\n",
	 spe_pmu->pmsver - 1, cpumask_pr_args(&spe_pmu->supported_cpus),
	 spe_pmu->max_record_sz, spe_pmu->align, spe_pmu->features);

/* Mark the probe as successful for the caller */
spe_pmu->features |= SPE_PMU_FEAT_DEV_PROBED;
}
/*
 * Reset the local CPU's SPE state: stop and drain any in-flight
 * profiling, then clear the buffer base pointer.
 */
static void __arm_spe_pmu_reset_local(void)
{
	/*
	 * This is probably overkill, as we have no idea where we're
	 * draining any buffered data to...
	 */
	arm_spe_pmu_disable_and_drain_local();

	/* Reset the buffer base pointer */
	write_sysreg_s(0, SYS_PMBPTR_EL1);
	isb();
}
/* Make sure we probe the hardware on a relevant CPU */
ret = smp_call_function_any(mask, __arm_spe_pmu_dev_probe, spe_pmu, 1);
if (ret || !(spe_pmu->features & SPE_PMU_FEAT_DEV_PROBED))
	return -ENXIO;

/* Request our PPIs (note that the IRQ is still disabled) */
ret = request_percpu_irq(spe_pmu->irq, arm_spe_pmu_irq_handler, DRVNAME,
			 spe_pmu->handle);
if (ret)
	return ret;

/*
 * Register our hotplug notifier now so we don't miss any events.
 * This will enable the IRQ for any supported CPUs that are already
 * up.
 */
ret = cpuhp_state_add_instance(arm_spe_pmu_online,
			       &spe_pmu->hotplug_node);
if (ret)
	/* Undo the IRQ request on hotplug registration failure */
	free_percpu_irq(spe_pmu->irq, spe_pmu->handle);
/*
 * If kernelspace is unmapped when running at EL0, then the SPE
 * buffer will fault and prematurely terminate the AUX session.
 */
if (arm64_kernel_unmapped_at_el0()) {
	dev_warn_once(dev, "profiling buffer inaccessible. Try passing \"kpti=off\" on the kernel command line\n");
	return -EPERM;
}

/* Device-managed allocation: freed automatically on probe failure */
spe_pmu = devm_kzalloc(dev, sizeof(*spe_pmu), GFP_KERNEL);
if (!spe_pmu)
	return -ENOMEM;

/* One perf_output_handle per CPU for the IRQ handler to use */
spe_pmu->handle = alloc_percpu(typeof(*spe_pmu->handle));
if (!spe_pmu->handle)
	return -ENOMEM;
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit noch Richtigkeit
noch Qualität der bereitgestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.