/* * IBS states: * * ENABLED; tracks the pmu::add(), pmu::del() state, when set the counter is taken * and any further add()s must fail. * * STARTED/STOPPING/STOPPED; deal with pmu::start(), pmu::stop() state but are * complicated by the fact that the IBS hardware can send late NMIs (ie. after * we've cleared the EN bit). * * In order to consume these late NMIs we have the STOPPED state, any NMI that * happens after we've cleared the EN state will clear this bit and report the * NMI handled (this is fundamentally racy in the face or multiple NMI sources, * someone else can consume our BIT and our NMI will go unhandled). * * And since we cannot set/clear this separate bit together with the EN bit, * there are races; if we cleared STARTED early, an NMI could land in * between clearing STARTED and clearing the EN bit (in fact multiple NMIs * could happen if the period is small enough), and consume our STOPPED bit * and trigger streams of unhandled NMIs. * * If, however, we clear STARTED late, an NMI can hit between clearing the * EN bit and clearing STARTED, still see STARTED set and process the event. * If this event will have the VALID bit clear, we bail properly, but this * is not a given. With VALID set we can end up calling pmu::stop() again * (the throttle logic) and trigger the WARNs in there. * * So what we do is set STOPPING before clearing EN to avoid the pmu::stop() * nesting, and clear STARTED late, so that we have a well defined state over * the clearing of the EN bit. * * XXX: we could probably be using !atomic bitops for all this.
*/
/*
 * NOTE(review): this fragment is mangled by extraction. The tail below
 * (prev_raw_count/new_raw_count/delta/shift) appears to belong to a
 * separate counter-update routine that was spliced in, and the variables
 * it uses are never declared here, so the function as shown cannot
 * compile. Code kept byte-identical; comments only.
 *
 * Intended contract (from the visible logic): recompute the next hw
 * sampling period from hwc->period_left, clamped to [min, max], and
 * report via the return whether a sw overflow should be accounted.
 */
staticint
perf_event_set_period(struct hw_perf_event *hwc, u64 min, u64 max, u64 *hw_period)
{
s64 left = local64_read(&hwc->period_left);
s64 period = hwc->sample_period; int overflow = 0;
/* * If we are way outside a reasonable range then just skip forward:
*/ if (unlikely(left <= -period)) {
left = period;
local64_set(&hwc->period_left, left);
hwc->last_period = period;
overflow = 1;
}
if (unlikely(left < (s64)min)) {
left += period;
local64_set(&hwc->period_left, left);
hwc->last_period = period;
overflow = 1;
}
/* * If the hw period that triggers the sw overflow is too short * we might hit the irq handler. This biases the results. * Thus we shorten the next-to-last period and set the last * period to the max period.
*/ if (left > max) {
left -= max; if (left > max)
left = max; elseif (left < min)
left = min;
}
/* NOTE(review): everything below here uses undeclared locals and looks
 * like the lost-NMI-safe counter read-out of a different function. */
/* * Careful: an NMI might modify the previous event value. * * Our tactic to handle this is to first atomically read and * exchange a new raw count - then add that new-prev delta * count to the generic event atomically:
*/
prev_raw_count = local64_read(&hwc->prev_count); if (!local64_try_cmpxchg(&hwc->prev_count,
&prev_raw_count, new_raw_count)) return 0;
/* * Now we have the new raw value and have updated the prev * timestamp already. We can now calculate the elapsed delta * (event-)time and add that to the generic event. * * Careful, not all hw sign-extends above the physical width * of the count.
*/
delta = (new_raw_count << shift) - (prev_raw_count << shift);
delta >>= shift;
staticstruct perf_ibs *get_ibs_pmu(int type)
{ if (perf_ibs_fetch.pmu.type == type) return &perf_ibs_fetch; if (perf_ibs_op.pmu.type == type) return &perf_ibs_op; return NULL;
}
/*
 * core pmu config -> IBS config
 *
 *  perf record -a -e cpu-cycles:p ...    # use ibs op counting cycle count
 *  perf record -a -e r076:p ...          # same as -e cpu-cycles:p
 *  perf record -a -e r0C1:p ...          # use ibs op counting micro-ops
 *
 * IbsOpCntCtl (bit 19) of IBS Execution Control Register (IbsOpCtl,
 * MSRC001_1033) is used to select either cycle or micro-ops counting
 * mode.
 *
 * Returns 0 and fills *config on a translatable event, -ENOENT for
 * event types IBS does not handle, -EOPNOTSUPP for unknown configs of
 * a handled type.
 */
static int core_pmu_ibs_config(struct perf_event *event, u64 *config)
{
	u64 cfg = event->attr.config;

	if (event->attr.type == PERF_TYPE_HARDWARE) {
		if (cfg == PERF_COUNT_HW_CPU_CYCLES) {
			*config = 0;
			return 0;
		}
	} else if (event->attr.type == PERF_TYPE_RAW) {
		if (cfg == 0x0076) {
			/* Raw cycles event: cycle counting mode. */
			*config = 0;
			return 0;
		}
		if (cfg == 0x00C1) {
			/* Retired micro-ops: set IbsOpCntCtl. */
			*config = IBS_OP_CNT_CTL;
			return 0;
		}
	} else {
		return -ENOENT;
	}

	return -EOPNOTSUPP;
}
/* * The rip of IBS samples has skid 0. Thus, IBS supports precise * levels 1 and 2 and the PERF_EFLAGS_EXACT is set. In rare cases the * rip is invalid when IBS was not able to record the rip correctly. * We clear PERF_EFLAGS_EXACT and take the rip from pt_regs then.
*/ int forward_event_to_ibs(struct perf_event *event)
{
u64 config = 0;
if (!event->attr.precise_ip || event->attr.precise_ip > 2) return -EOPNOTSUPP;
/* NOTE(review): fragment truncated by extraction - the remainder that
 * builds 'config' and hands the event over to the IBS op pmu (and the
 * closing brace) is missing from this view. */
/* * Grouping of IBS events is not possible since IBS can have only * one event active at any point in time.
*/ staticint validate_group(struct perf_event *event)
{ struct perf_event *sibling;
if (event->group_leader == event) return 0;
if (event->group_leader->pmu == event->pmu) return -EINVAL;
/* NOTE(review): truncated by extraction - the walk over the group
 * siblings ('sibling' is declared but never used here) and the closing
 * brace are missing from this view. */
/*
 * NOTE(review): unrelated fragments spliced together by extraction:
 * a sample-period bookkeeping snippet, a current-op-count read-out
 * (op_ctl, count, ibs_caps are undeclared here), and a
 * perf_event_try_update() retry loop. No function header is visible.
 * Code kept byte-identical; comments only.
 */
/* * If we modify hwc->sample_period, we also need to update * hwc->last_period and hwc->period_left.
*/
hwc->last_period = hwc->sample_period;
local64_set(&hwc->period_left, hwc->sample_period);
/* * If the internal 27-bit counter rolled over, the count is MaxCnt * and the lower 7 bits of CurCnt are randomized. * Otherwise CurCnt has the full 27-bit current counter value.
*/ if (op_ctl.op_val) {
count = op_ctl.opmaxcnt << 4; if (ibs_caps & IBS_CAPS_OPCNTEXT)
count += op_ctl.opmaxcnt_ext << 20;
} elseif (ibs_caps & IBS_CAPS_RDWROPCNT) {
count = op_ctl.opcurcnt;
}
/* * Set width to 64 since we do not overflow on max width but * instead on max count. In perf_ibs_set_period() we clear * prev count manually on overflow.
*/ while (!perf_event_try_update(event, count, 64)) {
rdmsrq(event->hw.config_base, *config);
count = perf_ibs->get_count(*config);
}
}
/*
 * Erratum #420 Instruction-Based Sampling Engine May Generate
 * Interrupt that Cannot Be Cleared:
 *
 * Must clear counter mask first, then clear the enable bit. See
 * Revision Guide for AMD Family 10h Processors, Publication #41322.
 */
static inline void
perf_ibs_disable_event(struct perf_ibs *perf_ibs,
		       struct hw_perf_event *hwc, u64 config)
{
	config &= ~perf_ibs->cnt_mask;
	/* Family 10h needs the masked count written first (erratum #420). */
	if (boot_cpu_data.x86 == 0x10)
		wrmsrq(hwc->config_base, config);
	config &= ~perf_ibs->enable_mask;
	wrmsrq(hwc->config_base, config);
}
/* * We cannot restore the ibs pmu state, so we always needs to update * the event while stopping it and then reset the state when starting * again. Thus, ignoring PERF_EF_RELOAD and PERF_EF_UPDATE flags in * perf_ibs_start()/perf_ibs_stop() and instead always do it.
*/ staticvoid perf_ibs_start(struct perf_event *event, int flags)
{ struct hw_perf_event *hwc = &event->hw; struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu); struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu);
u64 period, config = 0;
if (WARN_ON_ONCE(!(hwc->state & PERF_HES_STOPPED))) return;
/* In freq mode, never program a period below the pmu minimum. */
if (event->attr.freq && hwc->sample_period < perf_ibs->min_period)
hwc->sample_period = perf_ibs->min_period;
/* OPCNTEXT splits the max-count field: extended bits stay in place,
 * the low 20 bits are shifted right by 4 into MaxCnt below. */
perf_ibs_set_period(perf_ibs, hwc, &period); if (perf_ibs == &perf_ibs_op && (ibs_caps & IBS_CAPS_OPCNTEXT)) {
config |= period & IBS_OP_MAX_CNT_EXT_MASK;
period &= ~IBS_OP_MAX_CNT_EXT_MASK;
}
config |= period >> 4;
/* * Set STARTED before enabling the hardware, such that a subsequent NMI * must observe it.
*/
set_bit(IBS_STARTED, pcpu->state);
clear_bit(IBS_STOPPING, pcpu->state);
perf_ibs_enable_event(perf_ibs, hwc, config);
/* NOTE(review): truncated by extraction - the tail of perf_ibs_start()
 * (hwc->state reset / userpage update and closing brace) is missing;
 * the lines that follow in this file belong to perf_ibs_stop(). */
/*
 * NOTE(review): fragment without its function header - this is the body
 * of what appears to be perf_ibs_stop() (STOPPING/STOPPED handshake with
 * the NMI handler), with the tail of a separate period-validation helper
 * ('low_nibble' / 'value' are undeclared here) spliced onto the end.
 * Code kept byte-identical; comments only.
 */
if (test_and_set_bit(IBS_STOPPING, pcpu->state)) return;
stopping = test_bit(IBS_STARTED, pcpu->state);
if (!stopping && (hwc->state & PERF_HES_UPTODATE)) return;
rdmsrq(hwc->config_base, config);
if (stopping) { /* * Set STOPPED before disabling the hardware, such that it * must be visible to NMIs the moment we clear the EN bit, * at which point we can generate an !VALID sample which * we need to consume.
*/
set_bit(IBS_STOPPED, pcpu->state);
perf_ibs_disable_event(perf_ibs, hwc, config); /* * Clear STARTED after disabling the hardware; if it were * cleared before an NMI hitting after the clear but before * clearing the EN bit might think it a spurious NMI and not * handle it. * * Clearing it after, however, creates the problem of the NMI * handler seeing STARTED but not having a valid sample.
*/
clear_bit(IBS_STARTED, pcpu->state);
WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
hwc->state |= PERF_HES_STOPPED;
}
if (hwc->state & PERF_HES_UPTODATE) return;
/* * Clear valid bit to not count rollovers on update, rollovers * are only updated in the irq handler.
*/
config &= ~perf_ibs->valid_mask;
/* NOTE(review): from here on the code belongs to a different helper. */
/* * This contradicts with perf_ibs_init() which allows sample period * with lower nibble bits set but silently masks them off. Whereas * this returns error.
*/ if (low_nibble || value < perf_ibs->min_period) return -EINVAL;
return 0;
}
/* * We need to initialize with empty group if all attributes in the * group are dynamic.
*/ staticstruct attribute *attrs_empty[] = {
NULL,
};
/*
 * NOTE(review): fragment without a function header - this classifies the
 * memory hierarchy level of an IBS op sample (L()/LN() helper macros,
 * ibs_data_src, op_data2/3 are all defined outside this view), with the
 * tail of a separate lock-flag snippet appended after the closing brace.
 * Code kept byte-identical; comments only.
 */
/* * DcMiss, L2Miss, DataSrc, DcMissLat etc. are all invalid for Uncached * memory accesses. So, check DcUcMemAcc bit early.
*/ if (op_data3->dc_uc_mem_acc && ibs_data_src != IBS_DATA_SRC_EXT_IO) return L(UNC) | LN(UNC);
/* L1 Hit */ if (op_data3->dc_miss == 0) return L(L1) | LN(L1);
/* L2 Hit */ if (op_data3->l2_miss == 0) { /* Erratum #1293 */ if (boot_cpu_data.x86 != 0x19 || boot_cpu_data.x86_model > 0xF ||
!(op_data3->sw_pf || op_data3->dc_miss_no_mab_alloc)) return L(L2) | LN(L2);
}
/* * OP_DATA2 is valid only for load ops. Skip all checks which * uses OP_DATA2[DataSrc].
*/ if (data_src->mem_op != PERF_MEM_OP_LOAD) goto check_mab;
if (ibs_caps & IBS_CAPS_ZEN4) {
u64 val = g_zen4_data_src[ibs_data_src];
if (!val) goto check_mab;
/* HOPS_1 because IBS doesn't provide remote socket detail */ if (op_data2->rmt_node && ZEN4_RMT_NODE_APPLICABLE(ibs_data_src)) { if (ibs_data_src == IBS_DATA_SRC_EXT_DRAM)
val = L(REM_RAM1) | LN(RAM) | REM | HOPS(1); else
val |= REM | HOPS(1);
}
return val;
} else {
u64 val = g_data_src[ibs_data_src];
if (!val) goto check_mab;
/* HOPS_1 because IBS doesn't provide remote socket detail */ if (op_data2->rmt_node && RMT_NODE_APPLICABLE(ibs_data_src)) { if (ibs_data_src == IBS_DATA_SRC_DRAM)
val = L(REM_RAM1) | LN(RAM) | REM | HOPS(1); else
val |= REM | HOPS(1);
}
return val;
}
check_mab: /* * MAB (Miss Address Buffer) Hit. MAB keeps track of outstanding * DC misses. However, such data may come from any level in mem * hierarchy. IBS provides detail about both MAB as well as actual * DataSrc simultaneously. Prioritize DataSrc over MAB, i.e. set * MAB only when IBS fails to provide DataSrc.
*/ if (op_data3->dc_miss_no_mab_alloc) return L(LFB) | LN(LFB);
/* Don't set HIT with NA */ return PERF_MEM_S(LVL, NA) | LN(NA);
}
/* NOTE(review): trailing snippet from a different function. */
if (op_data3->dc_locked_op)
data_src->mem_lock = PERF_MEM_LOCK_LOCKED;
}
/*
 * Translate an IBS MSR number into an index into the per-sample
 * register buffer. Be careful. Works only for contiguous MSRs.
 *
 * Fix: parenthesize the macro argument so an expression argument
 * (e.g. a conditional) cannot mis-associate with the subtraction.
 */
#define ibs_fetch_msr_idx(msr)	((msr) - MSR_AMD64_IBSFETCHCTL)
#define ibs_op_msr_idx(msr)	((msr) - MSR_AMD64_IBSOPCTL)
staticvoid perf_ibs_get_data_src(struct perf_ibs_data *ibs_data, struct perf_sample_data *data, union ibs_op_data2 *op_data2, union ibs_op_data3 *op_data3)
{ union perf_mem_data_src *data_src = &data->data_src;
/* NOTE(review): body truncated by extraction - only the declaration of
 * the data_src alias survives; the rest of the function is missing. */
static __u64 perf_ibs_get_op_data2(struct perf_ibs_data *ibs_data, union ibs_op_data3 *op_data3)
{
__u64 val = ibs_data->regs[ibs_op_msr_idx(MSR_AMD64_IBSOPDATA2)];
/* Erratum #1293 */ if (boot_cpu_data.x86 == 0x19 && boot_cpu_data.x86_model <= 0xF &&
(op_data3->sw_pf || op_data3->dc_miss_no_mab_alloc)) { /* * OP_DATA2 has only two fields on Zen3: DataSrc and RmtNode. * DataSrc=0 is 'No valid status' and RmtNode is invalid when * DataSrc=0.
*/
val = 0;
} return val;
}
staticvoid perf_ibs_parse_ld_st_data(__u64 sample_type, struct perf_ibs_data *ibs_data, struct perf_sample_data *data)
{ union ibs_op_data3 op_data3; union ibs_op_data2 op_data2; union ibs_op_data op_data;
/* NOTE(review): body truncated by extraction - only the local
 * declarations survive; the parsing logic is missing from this view. */
/*
 * NOTE(review): fragments of the IBS NMI handler, mangled by extraction:
 * no function header, several locals (msr, offset, buf, size, regs,
 * ibs_data, check_rip, ...) are declared outside this view, and
 * intermediate code between the pieces is missing. Code kept
 * byte-identical; comments only.
 */
if (!test_bit(IBS_STARTED, pcpu->state)) {
fail: /* * Catch spurious interrupts after stopping IBS: After * disabling IBS there could be still incoming NMIs * with samples that even have the valid bit cleared. * Mark all this NMIs as handled.
*/ if (test_and_clear_bit(IBS_STOPPED, pcpu->state)) return 1;
/* Collect the staggered per-sample MSRs into the raw-data buffer. */
do {
rdmsrq(msr + offset, *buf++);
size++;
offset = find_next_bit(perf_ibs->offset_mask,
perf_ibs->offset_max,
offset + 1);
} while (offset < offset_max);
if (perf_ibs_ldlat_event(perf_ibs, event)) { union ibs_op_data3 op_data3;
op_data3.val = ibs_data.regs[ibs_op_msr_idx(MSR_AMD64_IBSOPDATA3)]; /* * Opening event is errored out if load latency threshold is * outside of [128, 2048] range. Since the event has reached * interrupt handler, we can safely assume the threshold is * within [128, 2048] range.
*/ if (!op_data3.ld_op || !op_data3.dc_miss ||
op_data3.dc_miss_lat <= (event->attr.config1 & 0xFFF)) goto out;
}
/* * Read IbsBrTarget, IbsOpData4, and IbsExtdCtl separately * depending on their availability. * Can't add to offset_max as they are staggered
*/ if (event->attr.sample_type & PERF_SAMPLE_RAW) { if (perf_ibs == &perf_ibs_op) { if (ibs_caps & IBS_CAPS_BRNTRGT) {
rdmsrq(MSR_AMD64_IBSBRTARGET, *buf++);
br_target_idx = size;
size++;
} if (ibs_caps & IBS_CAPS_OPDATA4) {
rdmsrq(MSR_AMD64_IBSOPDATA4, *buf++);
size++;
}
} if (perf_ibs == &perf_ibs_fetch && (ibs_caps & IBS_CAPS_FETCHCTLEXTD)) {
rdmsrq(MSR_AMD64_ICIBSEXTDCTL, *buf++);
size++;
}
}
ibs_data.size = sizeof(u64) * size;
regs = *iregs; if (check_rip && (ibs_data.regs[2] & IBS_RIP_INVALID)) {
regs.flags &= ~PERF_EFLAGS_EXACT;
} else { /* Workaround for erratum #1197 */ if (perf_ibs->fetch_ignore_if_zero_rip && !(ibs_data.regs[1])) goto out;
if (perf_ibs == &perf_ibs_op)
perf_ibs_parse_ld_st_data(event->attr.sample_type, &ibs_data, &data);
/* * rip recorded by IbsOpRip will not be consistent with rsp and rbp * recorded as part of interrupt regs. Thus we need to use rip from * interrupt regs while unwinding call stack.
*/
perf_sample_save_callchain(&data, event, iregs);
/*
 * Allocate the per-cpu IBS state and register the pmu under the given
 * name. On registration failure the per-cpu allocation is rolled back.
 * Returns 0 on success or a negative errno.
 */
static __init int perf_ibs_pmu_init(struct perf_ibs *perf_ibs, char *name)
{
	struct cpu_perf_ibs __percpu *pcpu;
	int err;

	pcpu = alloc_percpu(struct cpu_perf_ibs);
	if (!pcpu)
		return -ENOMEM;

	perf_ibs->pcpu = pcpu;

	err = perf_pmu_register(&perf_ibs->pmu, name, -1);
	if (err) {
		/* Undo the allocation; leave no dangling pointer behind. */
		perf_ibs->pcpu = NULL;
		free_percpu(pcpu);
	}

	return err;
}
static __init int perf_ibs_fetch_init(void)
{ /* * Some chips fail to reset the fetch count when it is written; instead * they need a 0-1 transition of IbsFetchEn.
*/ if (boot_cpu_data.x86 >= 0x16 && boot_cpu_data.x86 <= 0x18)
perf_ibs_fetch.fetch_count_reset_broken = 1;
/* NOTE(review): truncated by extraction - the remainder of the fetch
 * pmu setup (attributes, registration, closing brace) is missing. */
/*
 * NOTE(review): tail of a function whose header is missing from this
 * extract - by the use of get_eilvt()/preempt_enable() it appears to be
 * the EILVT-offset validation helper. 'offset', 'val' and 'valid' are
 * declared outside this view. Code kept byte-identical; comments only.
 */
if (!get_eilvt(offset)) {
pr_err(FW_BUG "cpu %d, IBS interrupt offset %d not available (MSR%08X=0x%016llx)\n",
smp_processor_id(), offset, MSR_AMD64_IBSCTL, val); goto out;
}
valid = 1;
out:
preempt_enable();
return valid;
}
/*
 * Program the IBS LVT offset into the IBSCTL register of every
 * northbridge (one PCI device per node) and read it back to verify the
 * write took effect. Returns 0 on success, -EINVAL when verification
 * fails, -ENODEV when no node device is found.
 */
static int setup_ibs_ctl(int ibs_eilvt_off)
{
	struct pci_dev *cpu_cfg = NULL;
	u32 value = 0;
	int nodes = 0;

	for (;;) {
		cpu_cfg = pci_get_device(PCI_VENDOR_ID_AMD,
					 PCI_DEVICE_ID_AMD_10H_NB_MISC,
					 cpu_cfg);
		if (!cpu_cfg)
			break;
		++nodes;
		pci_write_config_dword(cpu_cfg, IBSCTL, ibs_eilvt_off
				       | IBSCTL_LVT_OFFSET_VALID);
		pci_read_config_dword(cpu_cfg, IBSCTL, &value);
		if (value != (ibs_eilvt_off | IBSCTL_LVT_OFFSET_VALID)) {
			/* Drop the reference the iterator handed us. */
			pci_dev_put(cpu_cfg);
			pr_debug("Failed to setup IBS LVT offset, IBSCTL = 0x%08x\n",
				 value);
			return -EINVAL;
		}
	}

	if (!nodes) {
		pr_debug("No CPU node configured for IBS\n");
		return -ENODEV;
	}

	return 0;
}
/* * This runs only on the current cpu. We try to find an LVT offset and * setup the local APIC. For this we must disable preemption. On * success we initialize all nodes with this offset. This updates then * the offset in the IBS_CTL per-node msr. The per-core APIC setup of * the IBS interrupt vector is handled by perf_ibs_cpu_notifier that * is using the new offset.
*/ staticvoid force_ibs_eilvt_setup(void)
{ int offset; int ret;
preempt_disable(); /* find the next free available EILVT entry, skip offset 0 */ for (offset = 1; offset < APIC_EILVT_NR_MAX; offset++) { if (get_eilvt(offset)) break;
}
preempt_enable();
/* NOTE(review): truncated by extraction - the checks on 'offset', the
 * call into setup_ibs_ctl() ('ret' is declared but unused here) and the
 * closing brace are missing from this view. */
static void ibs_eilvt_setup(void)
{
	/*
	 * Force LVT offset assignment for family 10h: The offsets are
	 * not assigned by the BIOS for this family, so the OS is
	 * responsible for doing it. If the OS assignment fails, fall
	 * back to BIOS settings and try to setup this.
	 */
	if (boot_cpu_data.x86 != 0x10)
		return;

	force_ibs_eilvt_setup();
}
/*
 * NOTE(review): tail of amd_ibs_init() - the function header and the
 * declaration of 'caps' are missing from this extract. Code kept
 * byte-identical; comments only.
 */
caps = __get_ibs_caps(); if (!caps) return -ENODEV; /* ibs not supported by the cpu */
ibs_eilvt_setup();
if (!ibs_eilvt_valid()) return -EINVAL;
perf_ibs_pm_init();
ibs_caps = caps; /* make ibs_caps visible to other cpus: */
smp_mb(); /* * x86_pmu_amd_ibs_starting_cpu will be called from core on * all online cpus.
*/
cpuhp_setup_state(CPUHP_AP_PERF_X86_AMD_IBS_STARTING, "perf/x86/amd/ibs:starting",
x86_pmu_amd_ibs_starting_cpu,
x86_pmu_amd_ibs_dying_cpu);
return perf_event_ibs_init();
}
/* Since we need the pci subsystem to init ibs we can't do this earlier: */
/* Registers amd_ibs_init() (defined above, partially visible) to run at
 * device-initcall time during boot. */
device_initcall(amd_ibs_init);
/*
 * NOTE(review): the text below is boilerplate appended by the web tool
 * that rendered this file (a processing-time measurement and a German
 * disclaimer); it is not part of the source. Translated to English and
 * fenced in a comment so the file remains valid C:
 *
 * Measurement V0.5
 *   Processing time: 0.9 seconds (preprocessed)
 *
 * The information on this web page was compiled carefully and to the
 * best of our knowledge. However, no guarantee is given for the
 * completeness, correctness, or quality of the information provided.
 *
 * Remark: the syntax colouring and the measurement are still
 * experimental.
 */