// SPDX-License-Identifier: GPL-2.0-only
/*
 * Per core/cpu state
 *
 * Used to coordinate shared registers between HT threads or
 * among events on a single PMU.
 */
/* * When HT is off these events can only run on the bottom 4 counters * When HT is on, they are impacted by the HT bug and require EXCL access
*/
INTEL_EXCLEVT_CONSTRAINT(0xd0, 0xf), /* MEM_UOPS_RETIRED.* */
INTEL_EXCLEVT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */
INTEL_EXCLEVT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */
INTEL_EXCLEVT_CONSTRAINT(0xd3, 0xf), /* MEM_LOAD_UOPS_LLC_MISS_RETIRED.* */
/* * When HT is off these events can only run on the bottom 4 counters * When HT is on, they are impacted by the HT bug and require EXCL access
*/
INTEL_EXCLEVT_CONSTRAINT(0xd0, 0xf), /* MEM_UOPS_RETIRED.* */
INTEL_EXCLEVT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */
INTEL_EXCLEVT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */
INTEL_EXCLEVT_CONSTRAINT(0xd3, 0xf), /* MEM_LOAD_UOPS_LLC_MISS_RETIRED.* */
INTEL_EVENT_CONSTRAINT(0x2e, 0xff),
INTEL_EVENT_CONSTRAINT(0x3c, 0xff), /* * Generally event codes < 0x90 are restricted to counters 0-3. * The 0x2E and 0x3C are exception, which has no restriction.
*/
INTEL_EVENT_CONSTRAINT_RANGE(0x01, 0x8f, 0xf),
INTEL_UEVENT_CONSTRAINT(0x01a3, 0xf),
INTEL_UEVENT_CONSTRAINT(0x02a3, 0xf),
INTEL_UEVENT_CONSTRAINT(0x08a3, 0xf),
INTEL_UEVENT_CONSTRAINT(0x04a4, 0x1),
INTEL_UEVENT_CONSTRAINT(0x08a4, 0x1),
INTEL_UEVENT_CONSTRAINT(0x02cd, 0x1),
INTEL_EVENT_CONSTRAINT(0xce, 0x1),
INTEL_EVENT_CONSTRAINT_RANGE(0xd0, 0xdf, 0xf), /* * Generally event codes >= 0x90 are likely to have no restrictions. * The exception are defined as above.
*/
INTEL_EVENT_CONSTRAINT_RANGE(0x90, 0xfe, 0xff),
/* * topdown events for Intel Core CPUs. * * The events are all in slots, which is a free slot in a 4 wide * pipeline. Some events are already reported in slots, for cycle * events we multiply by the pipeline width (4). * * With Hyper Threading on, topdown metrics are either summed or averaged * between the threads of a core: (count_t0 + count_t1). * * For the average case the metric is always scaled to pipeline width, * so we use factor 2 ((count_t0 + count_t1) / 2 * 4)
*/
/* * When HT is off these events can only run on the bottom 4 counters * When HT is on, they are impacted by the HT bug and require EXCL access
*/
INTEL_EXCLEVT_CONSTRAINT(0xd0, 0xf), /* MEM_UOPS_RETIRED.* */
INTEL_EXCLEVT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */
INTEL_EXCLEVT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */
INTEL_EXCLEVT_CONSTRAINT(0xd3, 0xf), /* MEM_LOAD_UOPS_LLC_MISS_RETIRED.* */
EVENT_CONSTRAINT_END
};
staticstruct event_constraint intel_bdw_event_constraints[] = {
FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */
INTEL_UEVENT_CONSTRAINT(0x148, 0x4), /* L1D_PEND_MISS.PENDING */
INTEL_UBIT_EVENT_CONSTRAINT(0x8a3, 0x4), /* CYCLE_ACTIVITY.CYCLES_L1D_MISS */ /* * when HT is off, these can only run on the bottom 4 counters
*/
INTEL_EVENT_CONSTRAINT(0xd0, 0xf), /* MEM_INST_RETIRED.* */
INTEL_EVENT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_RETIRED.* */
INTEL_EVENT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_L3_HIT_RETIRED.* */
INTEL_EVENT_CONSTRAINT(0xcd, 0xf), /* MEM_TRANS_RETIRED.* */
EVENT_CONSTRAINT_END
};
/* * Notes on the events: * - data reads do not include code reads (comparable to earlier tables) * - data counts include speculative execution (except L1 write, dtlb, bpu) * - remote node access includes remote memory, remote cache, remote mmio. * - prefetches are not included in the counts. * - icache miss does not include decoded icache
*/
/* * Notes on the events: * - data reads do not include code reads (comparable to earlier tables) * - data counts include speculative execution (except L1 write, dtlb, bpu) * - remote node access includes remote memory, remote cache, remote mmio. * - prefetches are not included in the counts because they are not * reliably counted.
*/
/*
 * Used from PMIs where the LBRs are already disabled.
 *
 * This function may be called several times in a row and must leave the PMU
 * in the disabled state each time. Consecutive calls write the same disable
 * value to the related registers, so the PMU state remains unchanged.
 *
 * intel_bts events don't coexist with intel PMU's BTS events because of
 * x86_add_exclusive(x86_lbr_exclusive_lbr); there's no need to keep them
 * disabled around intel PMU's event batching etc, only inside the PMI handler.
 *
 * Avoid PEBS_ENABLE MSR access in PMIs.
 * The GLOBAL_CTRL has been disabled, so none of the counters count anymore
 * and it doesn't matter whether PEBS is enabled or not.
 * Usually the PEBS status is not changed in PMIs, so accessing the
 * PEBS_ENABLE MSR in disable_all()/enable_all() is unnecessary.
 * However, there are some cases which may change the PEBS status, e.g. PMI
 * throttle. The PEBS_ENABLE MSR should be updated where the status changes.
 *
 * @bts: when true, also disable BTS if the BTS index is set in this CPU's
 *       active_mask.
 */
static __always_inline void __intel_pmu_disable_all(bool bts)
{
	struct cpu_hw_events *cpu_events = this_cpu_ptr(&cpu_hw_events);

	/* Stop every counter in one go. */
	wrmsrq(MSR_CORE_PERF_GLOBAL_CTRL, 0);

	/* BTS is only touched when the caller asks for it ... */
	if (!bts)
		return;

	/* ... and only when a BTS event is currently active on this CPU. */
	if (!test_bit(INTEL_PMC_IDX_FIXED_BTS, cpu_events->active_mask))
		return;

	intel_pmu_disable_bts();
}
/* must not have branches... */
local_irq_save(flags);
__intel_pmu_disable_all(false); /* we don't care about BTS */
__intel_pmu_lbr_disable(); /* ... until here */ return __intel_pmu_snapshot_branch_stack(entries, cnt, flags);
}
/* must not have branches... */
local_irq_save(flags);
__intel_pmu_disable_all(false); /* we don't care about BTS */
__intel_pmu_arch_lbr_disable(); /* ... until here */ return __intel_pmu_snapshot_branch_stack(entries, cnt, flags);
}
/* * Workaround for: * Intel Errata AAK100 (model 26) * Intel Errata AAP53 (model 30) * Intel Errata BD53 (model 44) * * The official story: * These chips need to be 'reset' when adding counters by programming the * magic three (non-counting) events 0x4300B5, 0x4300D2, and 0x4300B1 either * in sequence on the same PMC or on different PMCs. * * In practice it appears some of these events do in fact count, and * we need to program all 4 events.
*/ staticvoid intel_pmu_nhm_workaround(void)
{ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); staticconstunsignedlong nhm_magic[4] = {
0x4300B5,
0x4300D2,
0x4300B1,
0x4300B1
}; struct perf_event *event; int i;
/*
 * The Errata requires below steps:
 * 1) Clear MSR_IA32_PEBS_ENABLE and MSR_CORE_PERF_GLOBAL_CTRL;
 * 2) Configure 4 PERFEVTSELx with the magic events and clear
 *    the corresponding PMCx;
 * 3) set bit0~bit3 of MSR_CORE_PERF_GLOBAL_CTRL;
 * 4) Clear MSR_CORE_PERF_GLOBAL_CTRL;
 * 5) Clear 4 pairs of PERFEVTSELx and PMCx;
 */
/* * The real steps we choose are a little different from above. * A) To reduce MSR operations, we don't run step 1) as they * are already cleared before this function is called; * B) Call x86_perf_event_update to save PMCx before configuring * PERFEVTSELx with magic number; * C) With step 5), we do clear only when the PERFEVTSELx is * not used currently. * D) Call x86_perf_event_set_period to restore PMCx;
*/
/* We always operate 4 pairs of PERF Counters */ for (i = 0; i < 4; i++) {
event = cpuc->events[i]; if (event)
static_call(x86_pmu_update)(event);
}
for (i = 0; i < 4; i++) {
wrmsrq(MSR_ARCH_PERFMON_EVENTSEL0 + i, nhm_magic[i]);
wrmsrq(MSR_ARCH_PERFMON_PERFCTR0 + i, 0x0);
}
/*
 * Commit-scheduling hook for the TFA handling.
 *
 * @cpuc: per-CPU event state handed on to intel_set_tfa()
 * @idx:  index of the event being committed (not used by this hook)
 * @cntr: counter the event was assigned to
 */
static void intel_tfa_commit_scheduling(struct cpu_hw_events *cpuc, int idx, int cntr)
{
	/*
	 * We're going to use PMC3, make sure TFA is set before we touch it.
	 */
	if (cntr == 3)
		intel_set_tfa(cpuc, true);
}
if (is_topdown_idx(idx)) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
/* * When there are other active TopDown events, * don't disable the fixed counter 3.
*/ if (*(u64 *)cpuc->active_mask & INTEL_PMC_OTHER_TOPDOWN_BITS(idx)) return;
idx = INTEL_PMC_IDX_FIXED_SLOTS;
}
switch (idx) { case 0 ... INTEL_PMC_IDX_FIXED - 1:
intel_clear_masks(event, idx);
x86_pmu_disable_event(event); break; case INTEL_PMC_IDX_FIXED ... INTEL_PMC_IDX_FIXED_BTS - 1: case INTEL_PMC_IDX_METRIC_BASE ... INTEL_PMC_IDX_METRIC_END:
intel_pmu_disable_fixed(event); break; case INTEL_PMC_IDX_FIXED_BTS:
intel_pmu_disable_bts();
intel_pmu_drain_bts_buffer(); return; case INTEL_PMC_IDX_FIXED_VLBR:
intel_clear_masks(event, idx); break; default:
intel_clear_masks(event, idx);
pr_warn("Failed to disable the event with invalid index %d\n",
idx); return;
}
/* * Needs to be called after x86_pmu_disable_event, * so we don't trigger the event without PEBS bit set.
*/ if (unlikely(event->attr.precise_ip))
static_call(x86_pmu_pebs_disable)(event);
}
/*
 * Called when @event is assigned counter index @idx.
 *
 * For events for which is_pebs_pt() is true, the assigned index is reported
 * through perf_report_aux_output_id(); all other events need no extra work
 * here.
 */
static void intel_pmu_assign_event(struct perf_event *event, int idx)
{
	if (is_pebs_pt(event))
		perf_report_aux_output_id(event, idx);
}
/* * The values in PERF_METRICS MSR are derived from fixed counter 3. * Software should start both registers, PERF_METRICS and fixed * counter 3, from zero. * Clear PERF_METRICS and Fixed counter 3 in initialization. * After that, both MSRs will be cleared for each read. * Don't need to clear them again.
*/ if (left == x86_pmu.max_period) {
wrmsrq(MSR_CORE_PERF_FIXED_CTR3, 0);
wrmsrq(MSR_PERF_METRICS, 0);
hwc->saved_slots = 0;
hwc->saved_metric = 0;
}
if ((hwc->saved_slots) && is_slots_event(event)) {
wrmsrq(MSR_CORE_PERF_FIXED_CTR3, hwc->saved_slots);
wrmsrq(MSR_PERF_METRICS, hwc->saved_metric);
}
delta = icl_get_topdown_value(event, slots, metrics); if (last_slots)
last = icl_get_topdown_value(event, last_slots, last_metrics);
/* * The 8bit integer fraction of metric may be not accurate, * especially when the changes is very small. * For example, if only a few bad_spec happens, the fraction * may be reduced from 1 to 0. If so, the bad_spec event value * will be 0 which is definitely less than the last value. * Avoid update event->count for this case.
*/ if (delta > last) {
delta -= last;
local64_add(delta, &event->count);
}
}
for_each_set_bit(idx, cpuc->active_mask, metric_end + 1) { if (!is_topdown_idx(idx)) continue;
other = cpuc->events[idx];
other->hw.saved_slots = slots;
other->hw.saved_metric = metrics;
}
}
/*
 * Update all active Topdown events.
 *
 * The PERF_METRICS and Fixed counter 3 are read separately. The values may be
 * modified by an NMI. PMU has to be disabled before calling this function.
 */
/* * Check and update this event, which may have been cleared * in active_mask e.g. x86_pmu_stop()
*/ if (event && !test_bit(event->hw.idx, cpuc->active_mask)) {
__icl_update_topdown_event(event, slots, metrics,
event->hw.saved_slots,
event->hw.saved_metric);
/* * In x86_pmu_stop(), the event is cleared in active_mask first, * then drain the delta, which indicates context switch for * counting. * Save metric and slots for context switch. * Don't need to reset the PERF_METRICS and Fixed counter 3. * Because the values will be restored in next schedule in.
*/
update_saved_topdown_regs(event, slots, metrics, metric_end);
reset = false;
}
if (reset) { /* The fixed counter 3 has to be written before the PERF_METRICS. */
wrmsrq(MSR_CORE_PERF_FIXED_CTR3, 0);
wrmsrq(MSR_PERF_METRICS, 0); if (event)
update_saved_topdown_regs(event, 0, 0, metric_end);
}
/* Only need to call update_topdown_event() once for group read. */ if (is_metric_event(event) && (cpuc->txn_flags & PERF_PMU_TXN_READ)) return;
cpuc->enabled = 0; if (pmu_enabled)
intel_pmu_disable_all();
/* * If the PEBS counters snapshotting is enabled, * the topdown event is available in PEBS records.
*/ if (is_topdown_count(event) && !is_pebs_counter_event_group(event))
static_call(intel_pmu_update_topdown_event)(event, NULL); else
intel_pmu_drain_pebs_buffer();
cpuc->enabled = pmu_enabled; if (pmu_enabled)
intel_pmu_enable_all(0);
if (is_topdown_idx(idx)) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); /* * When there are other active TopDown events, * don't enable the fixed counter 3 again.
*/ if (*(u64 *)cpuc->active_mask & INTEL_PMC_OTHER_TOPDOWN_BITS(idx)) return;
idx = INTEL_PMC_IDX_FIXED_SLOTS;
if (event->attr.config1 & INTEL_TD_CFG_METRIC_CLEAR)
bits |= INTEL_FIXED_3_METRICS_CLEAR;
}
intel_set_masks(event, idx);
/* * Enable IRQ generation (0x8), if not PEBS, * and enable ring-3 counting (0x2) and ring-0 counting (0x1) * if requested:
*/ if (!event->attr.precise_ip)
bits |= INTEL_FIXED_0_ENABLE_PMI; if (hwc->config & ARCH_PERFMON_EVENTSEL_USR)
bits |= INTEL_FIXED_0_USER; if (hwc->config & ARCH_PERFMON_EVENTSEL_OS)
bits |= INTEL_FIXED_0_KERNEL;
/* * ANY bit is supported in v3 and up
*/ if (x86_pmu.version > 2 && hwc->config & ARCH_PERFMON_EVENTSEL_ANY)
bits |= INTEL_FIXED_0_ANYTHREAD;
if (cpuc->acr_cfg_b[idx] != mask) {
wrmsrl(msr_b + msr_offset, mask);
cpuc->acr_cfg_b[idx] = mask;
} /* Only need to update the reload value when there is a valid config value. */ if (mask && cpuc->acr_cfg_c[idx] != reload) {
wrmsrl(msr_c + msr_offset, reload);
cpuc->acr_cfg_c[idx] = reload;
}
}
/*
 * Per-event setup performed when @event is added to the PMU.
 *
 * The three checks are independent of each other:
 *  - precise (PEBS) events are registered with the PEBS code,
 *  - events that need a branch stack are registered with the LBR code,
 *  - members of a PEBS-counter or ACR event group bump this CPU's
 *    n_late_setup count.
 */
static void intel_pmu_add_event(struct perf_event *event)
{
	if (event->attr.precise_ip)
		intel_pmu_pebs_add(event);

	if (intel_pmu_needs_branch_stack(event))
		intel_pmu_lbr_add(event);

	if (is_pebs_counter_event_group(event) ||
	    is_acr_event_group(event))
		this_cpu_ptr(&cpu_hw_events)->n_late_setup++;
}
/*
 * Save and restart an expired event.
 *
 * This runs in NMI context, so it has to be careful about preempting
 * normal event operations.
 *
 * Returns the result of the x86_pmu_set_period static call.
 */
int intel_pmu_save_and_restart(struct perf_event *event)
{
	int ret;

	static_call(x86_pmu_update)(event);

	/*
	 * A checkpointed counter is always reset back to 0. Otherwise the
	 * counter could overflow, abort the transaction, be set back to
	 * shortly before the overflow, and then overflow and abort again.
	 */
	if (unlikely(event_is_checkpointed(event))) {
		/* No race with NMIs because the counter should not be armed */
		wrmsrq(event->hw.event_base, 0);
		local64_set(&event->hw.prev_count, 0);
	}

	ret = static_call(x86_pmu_set_period)(event);

	return ret;
}
staticint intel_pmu_set_period(struct perf_event *event)
{ if (unlikely(is_topdown_count(event))) return static_call(intel_pmu_set_topdown_event_period)(event);
/* Ack all overflows and disable fixed counters */ if (x86_pmu.version >= 2) {
intel_pmu_ack_status(intel_pmu_get_status());
wrmsrq(MSR_CORE_PERF_GLOBAL_CTRL, 0);
}
/* Reset LBRs and LBR freezing */ if (x86_pmu.lbr_nr) {
update_debugctlmsr(get_debugctlmsr() &
~(DEBUGCTLMSR_FREEZE_LBRS_ON_PMI|DEBUGCTLMSR_LBR));
}
local_irq_restore(flags);
}
/* * We may be running with guest PEBS events created by KVM, and the * PEBS records are logged into the guest's DS and invisible to host. * * In the case of guest PEBS overflow, we only trigger a fake event * to emulate the PEBS overflow PMI for guest PEBS counters in KVM. * The guest will then vm-entry and check the guest DS area to read * the guest PEBS records. * * The contents and other behavior of the guest event do not matter.
*/ staticvoid x86_pmu_handle_guest_pebs(struct pt_regs *regs, struct perf_sample_data *data)
{ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
u64 guest_pebs_idxs = cpuc->pebs_enabled & ~cpuc->intel_ctrl_host_mask; struct perf_event *event = NULL; int bit;
if (!unlikely(perf_guest_state())) return;
if (!x86_pmu.pebs_ept || !x86_pmu.pebs_active ||
!guest_pebs_idxs) return;
/* * Ignore a range of extra bits in status that do not indicate * overflow by themselves.
*/
status &= ~(GLOBAL_STATUS_COND_CHG |
GLOBAL_STATUS_ASIF |
GLOBAL_STATUS_LBRS_FROZEN); if (!status) return 0; /* * In case multiple PEBS events are sampled at the same time, * it is possible to have GLOBAL_STATUS bit 62 set indicating * PEBS buffer overflow and also seeing at most 3 PEBS counters * having their bits set in the status register. This is a sign * that there was at least one PEBS record pending at the time * of the PMU interrupt. PEBS counters must only be processed * via the drain_pebs() calls and not via the regular sample * processing loop coming after that the function, otherwise * phony regular samples may be generated in the sampling buffer * not marked with the EXACT tag. Another possibility is to have * one PEBS event and at least one non-PEBS event which overflows * while PEBS has armed. In this case, bit 62 of GLOBAL_STATUS will * not be set, yet the overflow status bit for the PEBS counter will * be on Skylake. * * To avoid this problem, we systematically ignore the PEBS-enabled * counters from the GLOBAL_STATUS mask and we always process PEBS * events via drain_pebs().
*/
status &= ~(cpuc->pebs_enabled & x86_pmu.pebs_capable);
/* * PEBS overflow sets bit 62 in the global status register
*/ if (__test_and_clear_bit(GLOBAL_STATUS_BUFFER_OVF_BIT, (unsignedlong *)&status)) {
u64 pebs_enabled = cpuc->pebs_enabled;
/* * PMI throttle may be triggered, which stops the PEBS event. * Although cpuc->pebs_enabled is updated accordingly, the * MSR_IA32_PEBS_ENABLE is not updated. Because the * cpuc->enabled has been forced to 0 in PMI. * Update the MSR if pebs_enabled is changed.
*/ if (pebs_enabled != cpuc->pebs_enabled)
wrmsrq(MSR_IA32_PEBS_ENABLE, cpuc->pebs_enabled);
/* * Above PEBS handler (PEBS counters snapshotting) has updated fixed * counter 3 and perf metrics counts if they are in counter group, * unnecessary to update again.
*/ if (cpuc->events[INTEL_PMC_IDX_FIXED_SLOTS] &&
is_pebs_counter_event_group(cpuc->events[INTEL_PMC_IDX_FIXED_SLOTS]))
status &= ~GLOBAL_STATUS_PERF_METRICS_OVF_BIT;
}
/* * Intel PT
*/ if (__test_and_clear_bit(GLOBAL_STATUS_TRACE_TOPAPMI_BIT, (unsignedlong *)&status)) {
handled++; if (!perf_guest_handle_intel_pt_intr())
intel_pt_interrupt();
}
/* * Checkpointed counters can lead to 'spurious' PMIs because the * rollback caused by the PMI will have cleared the overflow status * bit. Therefore always force probe these counters.
*/
status |= cpuc->intel_cp_status;
/* * There may be unprocessed PEBS records in the PEBS buffer, * which still stores the previous values. * Process those records first before handling the latest value. * For example, * A is a regular counter * B is a PEBS event which reads A * C is a PEBS event * * The following can happen: * B-assist A=1 * C A=2 * B-assist A=3 * A-overflow-PMI A=4 * C-assist-PMI (PEBS buffer) A=5 * * The PEBS buffer has to be drained before handling the A-PMI
*/ if (is_pebs_counter_event_group(event))
x86_pmu.drain_pebs(regs, &data);
last_period = event->hw.last_period;
if (!intel_pmu_save_and_restart(event)) continue;
perf_sample_data_init(&data, 0, last_period);
if (has_branch_stack(event))
intel_pmu_lbr_save_brstack(&data, cpuc, event);
perf_event_overflow(event, &data, regs);
}
return handled;
}
/*
 * This handler is triggered by the local APIC, so the APIC IRQ handling
 * rules apply.
 *
 * Visible flow: remember cpuc->enabled, disable the PMU (including BTS),
 * collect handled work from the BTS buffer drain and the BTS interrupt,
 * read the overflow status, then restore the PMU state and return the
 * number of handled items. The point at which the LVTPC entry is rewritten
 * depends on the late_ack / mid_ack bits of the PMU.
 *
 * NOTE(review): in this view `goto again` has no matching `again:` label and
 * `loops` is declared but never used, so the body of the retry loop appears
 * to be missing from this excerpt -- confirm against the full file.
 * NOTE(review): `staticint` looks like a fused `static int` -- confirm.
 */ staticint intel_pmu_handle_irq(struct pt_regs *regs)
{ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); bool late_ack = hybrid_bit(cpuc->pmu, late_ack); bool mid_ack = hybrid_bit(cpuc->pmu, mid_ack); int loops;
u64 status; int handled; int pmu_enabled;
/*
 * Save the PMU state.
 * It needs to be restored when leaving the handler.
 */
pmu_enabled = cpuc->enabled; /*
 * In general, the early ACK is only applied for old platforms.
 * For the big core starts from Haswell, the late ACK should be
 * applied.
 * For the small core after Tremont, we have to do the ACK right
 * before re-enabling counters, which is in the middle of the
 * NMI handler.
 */ if (!late_ack && !mid_ack)
apic_write(APIC_LVTPC, APIC_DM_NMI);
intel_bts_disable_local();
/* Mark the PMU as disabled for the duration of the handler. */
cpuc->enabled = 0;
__intel_pmu_disable_all(true);
handled = intel_pmu_drain_bts_buffer();
handled += intel_bts_interrupt();
/* Nothing flagged in the status: skip straight to the restore path. */
status = intel_pmu_get_status(); if (!status) goto done;
/*
 * Repeat if there is more work to be done:
 */
status = intel_pmu_get_status(); if (status) goto again;
done: if (mid_ack)
apic_write(APIC_LVTPC, APIC_DM_NMI); /* Only restore PMU state when it's active. See x86_pmu_disable(). */
cpuc->enabled = pmu_enabled; if (pmu_enabled)
__intel_pmu_enable_all(0, true);
intel_bts_enable_local();
/*
 * Only unmask the NMI after the overflow counters
 * have been reset. This avoids spurious NMIs on
 * Haswell CPUs.
 */ if (late_ack)
apic_write(APIC_LVTPC, APIC_DM_NMI); return handled;
}
/* * manage allocation of shared extra msr for certain events * * sharing can be: * per-cpu: to be shared between the various events on a single PMU * per-core: per-cpu + shared by HT threads
*/ staticstruct event_constraint *
__intel_shared_reg_get_constraints(struct cpu_hw_events *cpuc, struct perf_event *event, struct hw_perf_event_extra *reg)
{ struct event_constraint *c = &emptyconstraint; struct er_account *era; unsignedlong flags; int idx = reg->idx;
/* * reg->alloc can be set due to existing state, so for fake cpuc we * need to ignore this, otherwise we might fail to allocate proper fake * state for this extra reg constraint. Also see the comment below.
*/ if (reg->alloc && !cpuc->is_fake) return NULL; /* call x86_get_event_constraint() */
again:
era = &cpuc->shared_regs->regs[idx]; /* * we use spin_lock_irqsave() to avoid lockdep issues when * passing a fake cpuc
*/
raw_spin_lock_irqsave(&era->lock, flags);
if (!atomic_read(&era->ref) || era->config == reg->config) {
/* * If its a fake cpuc -- as per validate_{group,event}() we * shouldn't touch event state and we can avoid doing so * since both will only call get_event_constraints() once * on each event, this avoids the need for reg->alloc. * * Not doing the ER fixup will only result in era->reg being * wrong, but since we won't actually try and program hardware * this isn't a problem either.
*/ if (!cpuc->is_fake) { if (idx != reg->idx)
intel_fixup_er(event, idx);
/* * x86_schedule_events() can call get_event_constraints() * multiple times on events in the case of incremental * scheduling(). reg->alloc ensures we only do the ER * allocation once.
*/
reg->alloc = 1;
}
/* lock in msr value */
era->config = reg->config;
era->reg = reg->reg;
/* one more user */
atomic_inc(&era->ref);
/* * need to call x86_get_event_constraint() * to check if associated event has constraints
*/
c = NULL;
} else {
idx = intel_alt_er(cpuc, idx, reg->config); if (idx != reg->idx) {
raw_spin_unlock_irqrestore(&era->lock, flags); goto again;
}
}
raw_spin_unlock_irqrestore(&era->lock, flags);
/* * Only put constraint if extra reg was actually allocated. Also takes * care of event which do not use an extra shared reg. * * Also, if this is a fake cpuc we shouldn't touch any event state * (reg->alloc) and we don't care about leaving inconsistent cpuc state * either since it'll be thrown out.
*/ if (!reg->alloc || cpuc->is_fake) return;
/*
 * Open a scheduling pass for the cross-thread exclusion state.
 *
 * Marks this thread's exclusion state as "scheduling started" and takes
 * excl_cntrs->lock. The function returns with the lock held unless one of
 * the early-return checks below fires; the lock is released again in
 * intel_stop_scheduling().
 */
static void
intel_start_scheduling(struct cpu_hw_events *cpuc)
{
	struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs;
	struct intel_excl_states *xl;
	int tid = cpuc->excl_thread_id;

	/*
	 * nothing needed if in group validation mode
	 */
	if (cpuc->is_fake || !is_ht_workaround_enabled())
		return;

	/*
	 * no exclusion needed
	 */
	if (WARN_ON_ONCE(!excl_cntrs))
		return;

	xl = &excl_cntrs->states[tid];

	xl->sched_started = true;
	/*
	 * lock shared state until we are done scheduling
	 * in intel_stop_scheduling()
	 * makes scheduling appear as a transaction
	 */
	raw_spin_lock(&excl_cntrs->lock);
}
staticvoid intel_commit_scheduling(struct cpu_hw_events *cpuc, int idx, int cntr)
{ struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs; struct event_constraint *c = cpuc->event_constraint[idx]; struct intel_excl_states *xl; int tid = cpuc->excl_thread_id;
if (cpuc->is_fake || !is_ht_workaround_enabled()) return;
/*
 * Close a scheduling pass for the cross-thread exclusion state.
 *
 * Clears this thread's "scheduling started" flag and drops excl_cntrs->lock.
 * The early-return checks mirror the ones in intel_start_scheduling(), which
 * is where the lock was taken.
 */
static void
intel_stop_scheduling(struct cpu_hw_events *cpuc)
{
	struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs;
	struct intel_excl_states *xl;
	int tid = cpuc->excl_thread_id;

	/*
	 * nothing needed if in group validation mode
	 */
	if (cpuc->is_fake || !is_ht_workaround_enabled())
		return;

	/*
	 * no exclusion needed
	 */
	if (WARN_ON_ONCE(!excl_cntrs))
		return;

	xl = &excl_cntrs->states[tid];

	xl->sched_started = false;
	/*
	 * release shared state lock (acquired in intel_start_scheduling())
	 */
	raw_spin_unlock(&excl_cntrs->lock);
}
/* * mark constraint as dynamic
*/
cx->flags |= PERF_X86_EVENT_DYNAMIC;
c = cx;
}
return c;
}
staticstruct event_constraint *
intel_get_excl_constraints(struct cpu_hw_events *cpuc, struct perf_event *event, int idx, struct event_constraint *c)
{ struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs; struct intel_excl_states *xlo; int tid = cpuc->excl_thread_id; int is_excl, i, w;
/* * validating a group does not require * enforcing cross-thread exclusion
*/ if (cpuc->is_fake || !is_ht_workaround_enabled()) return c;
/* * no exclusion needed
*/ if (WARN_ON_ONCE(!excl_cntrs)) return c;
/* * because we modify the constraint, we need * to make a copy. Static constraints come * from static const tables. * * only needed when constraint has not yet * been cloned (marked dynamic)
*/
c = dyn_constraint(cpuc, c, idx);
/*
 * From here on, the constraint is dynamic.
 * Either it was just allocated above, or it
 * was allocated during an earlier invocation
 * of this function
 */
/* * state of sibling HT
*/
xlo = &excl_cntrs->states[tid ^ 1];
/* * Modify static constraint with current dynamic * state of thread * * EXCLUSIVE: sibling counter measuring exclusive event * SHARED : sibling counter measuring non-exclusive event * UNUSED : sibling counter unused
*/
w = c->weight;
for_each_set_bit(i, c->idxmsk, X86_PMC_IDX_MAX) { /* * exclusive event in sibling counter * our corresponding counter cannot be used * regardless of our event
*/ if (xlo->state[i] == INTEL_EXCL_EXCLUSIVE) {
__clear_bit(i, c->idxmsk);
w--; continue;
} /* * if measuring an exclusive event, sibling * measuring non-exclusive, then counter cannot * be used
*/ if (is_excl && xlo->state[i] == INTEL_EXCL_SHARED) {
__clear_bit(i, c->idxmsk);
w--; continue;
}
}
/* * if we return an empty mask, then switch * back to static empty constraint to avoid * the cost of freeing later on
*/ if (!w)
c = &emptyconstraint;
/* * nothing needed if in group validation mode
*/ if (cpuc->is_fake) return;
if (WARN_ON_ONCE(!excl_cntrs)) return;
if (hwc->flags & PERF_X86_EVENT_EXCL_ACCT) {
hwc->flags &= ~PERF_X86_EVENT_EXCL_ACCT; if (!--cpuc->n_excl)
WRITE_ONCE(excl_cntrs->has_exclusive[tid], 0);
}
/* * If event was actually assigned, then mark the counter state as * unused now.
*/ if (hwc->idx >= 0) {
xl = &excl_cntrs->states[tid];
/* * put_constraint may be called from x86_schedule_events() * which already has the lock held so here make locking * conditional.
*/ if (!xl->sched_started)
raw_spin_lock(&excl_cntrs->lock);
xl->state[hwc->idx] = INTEL_EXCL_UNUSED;
if (!xl->sched_started)
raw_spin_unlock(&excl_cntrs->lock);
}
}
/* * is PMU has exclusive counter restrictions, then * all events are subject to and must call the * put_excl_constraints() routine
*/ if (cpuc->excl_cntrs)
intel_put_excl_constraints(cpuc, event);
}
staticvoid intel_pebs_aliases_core2(struct perf_event *event)
{ if ((event->hw.config & X86_RAW_EVENT_MASK) == 0x003c) { /* * Use an alternative encoding for CPU_CLK_UNHALTED.THREAD_P * (0x003c) so that we can use it with PEBS. * * The regular CPU_CLK_UNHALTED.THREAD_P event (0x003c) isn't * PEBS capable. However we can use INST_RETIRED.ANY_P * (0x00c0), which is a PEBS capable event, to get the same * count. * * INST_RETIRED.ANY_P counts the number of cycles that retires * CNTMASK instructions. By setting CNTMASK to a value (16) * larger than the maximum number of instructions that can be * retired per cycle (4) and then inverting the condition, we * count all cycles that retire 16 or less instructions, which * is every cycle. * * Thereby we gain a PEBS capable cycle counter.
*/
u64 alt_config = X86_CONFIG(.event=0xc0, .inv=1, .cmask=16);
staticvoid intel_pebs_aliases_snb(struct perf_event *event)
{ if ((event->hw.config & X86_RAW_EVENT_MASK) == 0x003c) { /* * Use an alternative encoding for CPU_CLK_UNHALTED.THREAD_P * (0x003c) so that we can use it with PEBS. * * The regular CPU_CLK_UNHALTED.THREAD_P event (0x003c) isn't * PEBS capable. However we can use UOPS_RETIRED.ALL * (0x01c2), which is a PEBS capable event, to get the same * count. * * UOPS_RETIRED.ALL counts the number of cycles that retires * CNTMASK micro-ops. By setting CNTMASK to a value (16) * larger than the maximum number of micro-ops that can be * retired per cycle (4) and then inverting the condition, we * count all cycles that retire 16 or less micro-ops, which * is every cycle. * * Thereby we gain a PEBS capable cycle counter.
*/
u64 alt_config = X86_CONFIG(.event=0xc2, .umask=0x01, .inv=1, .cmask=16);
staticvoid intel_pebs_aliases_precdist(struct perf_event *event)
{ if ((event->hw.config & X86_RAW_EVENT_MASK) == 0x003c) { /* * Use an alternative encoding for CPU_CLK_UNHALTED.THREAD_P * (0x003c) so that we can use it with PEBS. * * The regular CPU_CLK_UNHALTED.THREAD_P event (0x003c) isn't * PEBS capable. However we can use INST_RETIRED.PREC_DIST * (0x01c0), which is a PEBS capable event, to get the same * count. * * The PREC_DIST event has special support to minimize sample * shadowing effects. One drawback is that it can be * only programmed on counter 1, but that seems like an * acceptable trade off.
*/
u64 alt_config = X86_CONFIG(.event=0xc0, .umask=0x01, .inv=1, .cmask=16);
static u64 intel_pmu_freq_start_period(struct perf_event *event)
{ int type = event->attr.type;
u64 config, factor;
s64 start;
/* * The 127 is the lowest possible recommended SAV (sample after value) * for a 4000 freq (default freq), according to the event list JSON file. * Also, assume the workload is idle 50% time.
*/
factor = 64 * 4000; if (type != PERF_TYPE_HARDWARE && type != PERF_TYPE_HW_CACHE) goto end;
/* * The estimation of the start period in the freq mode is * based on the below assumption. * * For a cycles or an instructions event, 1GHZ of the * underlying platform, 1 IPC. The workload is idle 50% time. * The start period = 1,000,000,000 * 1 / freq / 2. * = 500,000,000 / freq * * Usually, the branch-related events occur less than the * instructions event. According to the Intel event list JSON * file, the SAV (sample after value) of a branch-related event * is usually 1/4 of an instruction event. * The start period of branch-related events = 125,000,000 / freq. * * The cache-related events occurs even less. The SAV is usually * 1/20 of an instruction event. * The start period of cache-related events = 25,000,000 / freq.
*/
config = event->attr.config & PERF_HW_EVENT_MASK; if (type == PERF_TYPE_HARDWARE) { switch (config) { case PERF_COUNT_HW_CPU_CYCLES: case PERF_COUNT_HW_INSTRUCTIONS: case PERF_COUNT_HW_BUS_CYCLES: case PERF_COUNT_HW_STALLED_CYCLES_FRONTEND: case PERF_COUNT_HW_STALLED_CYCLES_BACKEND: case PERF_COUNT_HW_REF_CPU_CYCLES:
factor = 500000000; break; case PERF_COUNT_HW_BRANCH_INSTRUCTIONS: case PERF_COUNT_HW_BRANCH_MISSES:
factor = 125000000; break; case PERF_COUNT_HW_CACHE_REFERENCES: case PERF_COUNT_HW_CACHE_MISSES:
factor = 25000000; break; default: goto end;
}
}
if (type == PERF_TYPE_HW_CACHE)
factor = 25000000;
end: /* * Usually, a prime or a number with less factors (close to prime) * is chosen as an SAV, which makes it less likely that the sampling * period synchronizes with some periodic event in the workload. * Minus 1 to make it at least avoiding values near power of twos * for the default freq.
*/
start = DIV_ROUND_UP_ULL(factor, event->attr.sample_freq) - 1;
if (start > x86_pmu.max_period)
start = x86_pmu.max_period;
if (x86_pmu.limit_period)
x86_pmu.limit_period(event, &start);
if (event->attr.precise_ip) { if ((event->attr.config & INTEL_ARCH_EVENT_MASK) == INTEL_FIXED_VLBR_EVENT) return -EINVAL;
if (!(event->attr.freq || (event->attr.wakeup_events && !event->attr.watermark))) {
event->hw.flags |= PERF_X86_EVENT_AUTO_RELOAD; if (!(event->attr.sample_type & ~intel_pmu_large_pebs_flags(event)) &&
!has_aux_action(event)) {
event->hw.flags |= PERF_X86_EVENT_LARGE_PEBS;
event->attach_state |= PERF_ATTACH_SCHED_CB;
}
} if (x86_pmu.pebs_aliases)
x86_pmu.pebs_aliases(event);
}
if (needs_branch_stack(event)) { /* Avoid branch stack setup for counting events in SAMPLE READ */ if (is_sampling_event(event) ||
!(event->attr.sample_type & PERF_SAMPLE_READ))
event->hw.flags |= PERF_X86_EVENT_NEEDS_BRANCH_STACK;
}
if (branch_sample_counters(event)) { struct perf_event *leader, *sibling; int num = 0;
if (!(x86_pmu.flags & PMU_FL_BR_CNTR) ||
(event->attr.config & ~INTEL_ARCH_EVENT_MASK)) return -EINVAL;
/* * The branch counter logging is not supported in the call stack * mode yet, since we cannot simply flush the LBR during e.g., * multiplexing. Also, there is no obvious usage with the call * stack mode. Simply forbids it for now. * * If any events in the group enable the branch counter logging * feature, the group is treated as a branch counter logging * group, which requires the extra space to store the counters.
*/
leader = event->group_leader; if (branch_sample_call_stack(leader)) return -EINVAL; if (branch_sample_counters(leader)) {
num++;
leader->hw.dyn_constraint &= x86_pmu.lbr_counters;
}
leader->hw.flags |= PERF_X86_EVENT_BRANCH_COUNTERS;
for_each_sibling_event(sibling, leader) { if (branch_sample_call_stack(sibling)) return -EINVAL; if (branch_sample_counters(sibling)) {
num++;
sibling->hw.dyn_constraint &= x86_pmu.lbr_counters;
}
}
if (num > fls(x86_pmu.lbr_counters)) return -EINVAL; /* * Only applying the PERF_SAMPLE_BRANCH_COUNTERS doesn't * require any branch stack setup. * Clear the bit to avoid unnecessary branch stack setup.
*/ if (0 == (event->attr.branch_sample_type &
~(PERF_SAMPLE_BRANCH_PLM_ALL |
PERF_SAMPLE_BRANCH_COUNTERS)))
event->hw.flags &= ~PERF_X86_EVENT_NEEDS_BRANCH_STACK;
/* * Force the leader to be a LBR event. So LBRs can be reset * with the leader event. See intel_pmu_lbr_del() for details.
*/ if (!intel_pmu_needs_branch_stack(leader)) return -EINVAL;
}
if (intel_pmu_needs_branch_stack(event)) {
ret = intel_pmu_setup_lbr_filter(event); if (ret) return ret;
event->attach_state |= PERF_ATTACH_SCHED_CB;
/* * BTS is set up earlier in this path, so don't account twice
*/ if (!unlikely(intel_pmu_has_bts(event))) { /* disallow lbr if conflicting events are present */ if (x86_add_exclusive(x86_lbr_exclusive_lbr)) return -EBUSY;
event->destroy = hw_perf_lbr_event_destroy;
}
}
if (event->attr.aux_output) { if (!event->attr.precise_ip) return -EINVAL;
/* Not support perf metrics */ if (is_metric_event(event)) return -EINVAL;
/* Not support freq mode */ if (event->attr.freq) return -EINVAL;
/* PDist is not supported */ if (event->attr.config2 && event->attr.precise_ip > 2) return -EINVAL;
/* The reload value cannot exceeds the max period */ if (event->attr.sample_period > x86_pmu.max_period) return -EINVAL; /* * The counter-constraints of each event cannot be finalized * unless the whole group is scanned. However, it's hard * to know whether the event is the last one of the group. * Recalculate the counter-constraints for each event when * adding a new event. * * The group is traversed twice, which may be optimized later. * In the first round, * - Find all events which do reload when other events * overflow and set the corresponding counter-constraints * - Add all events, which can cause other events reload, * in the cause_mask * - Error out if the number of events exceeds the HW limit * - The ACR events must be contiguous. * Error out if there are non-X86 events between ACR events. * This is not a HW limit, but a SW limit. * With the assumption, the intel_pmu_acr_late_setup() can * easily convert the event idx to counter idx without * traversing the whole event list.
*/ if (!is_x86_event(leader)) return -EINVAL;
if (leader->attr.config2)
intel_pmu_set_acr_cntr_constr(leader, &cause_mask, &num);
if (leader->nr_siblings) {
for_each_sibling_event(sibling, leader) { if (!is_x86_event(sibling)) {
has_sw_event = true; continue;
} if (!sibling->attr.config2) continue; if (has_sw_event) return -EINVAL;
intel_pmu_set_acr_cntr_constr(sibling, &cause_mask, &num);
}
} if (leader != event && event->attr.config2) { if (has_sw_event) return -EINVAL;
intel_pmu_set_acr_cntr_constr(event, &cause_mask, &num);
}
if (hweight64(cause_mask) > hweight64(hybrid(pmu, acr_cause_mask64)) ||
num > hweight64(hybrid(event->pmu, acr_cntr_mask64))) return -EINVAL; /* * In the second round, apply the counter-constraints for * the events which can cause other events reload.
*/
intel_pmu_set_acr_caused_constr(leader, idx++, cause_mask);
if (leader->nr_siblings) {
for_each_sibling_event(sibling, leader)
intel_pmu_set_acr_caused_constr(sibling, idx++, cause_mask);
}
if (leader != event)
intel_pmu_set_acr_caused_constr(event, idx, cause_mask);
leader->hw.flags |= PERF_X86_EVENT_ACR;
}
if ((event->attr.type == PERF_TYPE_HARDWARE) ||
(event->attr.type == PERF_TYPE_HW_CACHE)) return 0;
/* * Config Topdown slots and metric events * * The slots event on Fixed Counter 3 can support sampling, * which will be handled normally in x86_perf_event_update(). * * Metric events don't support sampling and require being paired * with a slots event as group leader. When the slots event * is used in a metrics group, it too cannot support sampling.
*/ if (intel_pmu_has_cap(event, PERF_CAP_METRICS_IDX) && is_topdown_event(event)) { /* The metrics_clear can only be set for the slots event */ if (event->attr.config1 &&
(!is_slots_event(event) || (event->attr.config1 & ~INTEL_TD_CFG_METRIC_CLEAR))) return -EINVAL;
if (event->attr.config2) return -EINVAL;
/* * The TopDown metrics events and slots event don't * support any filters.
*/ if (event->attr.config & X86_ALL_EVENT_FLAGS) return -EINVAL;
if (is_available_metric_event(event)) { struct perf_event *leader = event->group_leader;
/* The metric events don't support sampling. */ if (is_sampling_event(event)) return -EINVAL;
/* The metric events require a slots group leader. */ if (!is_slots_event(leader)) return -EINVAL;
/* * The leader/SLOTS must not be a sampling event for * metric use; hardware requires it starts at 0 when used * in conjunction with MSR_PERF_METRICS.
*/ if (is_sampling_event(leader)) return -EINVAL;
event->event_caps |= PERF_EV_CAP_SIBLING; /* * Only once we have a METRICs sibling do we * need TopDown magic.
*/
leader->hw.flags |= PERF_X86_EVENT_TOPDOWN;
event->hw.flags |= PERF_X86_EVENT_TOPDOWN;
}
}
/* * The load latency event X86_CONFIG(.event=0xcd, .umask=0x01) on SPR * doesn't function quite right. As a work-around it always needs to be * co-scheduled with an auxiliary event X86_CONFIG(.event=0x03, .umask=0x82). * The actual count of this second event is irrelevant; it just needs * to be active to make the first event function correctly. * * In a group, the auxiliary event must be in front of the load latency * event. This rule simplifies the implementation of the check, * because perf cannot have a complete group at the moment.
*/ if (require_mem_loads_aux_event(event) &&
(event->attr.sample_type & PERF_SAMPLE_DATA_SRC) &&
is_mem_loads_event(event)) { struct perf_event *leader = event->group_leader; struct perf_event *sibling = NULL;
/* * When this memload event is also the first event (no group * exists yet), then there is no aux event before it.
*/ if (leader == event) return -ENODATA;
if (!is_mem_loads_aux_event(leader)) {
for_each_sibling_event(sibling, leader) { if (is_mem_loads_aux_event(sibling)) break;
} if (list_entry_is_head(sibling, &leader->sibling_list, sibling_list)) return -ENODATA;
}
}
if (!(event->attr.config & ARCH_PERFMON_EVENTSEL_ANY)) return 0;
if (x86_pmu.version < 3) return -EINVAL;
ret = perf_allow_cpu(); if (ret) return ret;
event->hw.config |= ARCH_PERFMON_EVENTSEL_ANY;
return 0;
}
/* * Currently, the only caller of this function is the atomic_switch_perf_msrs(). * The host perf context helps to prepare the values of the real hardware for * a set of msrs that need to be switched atomically in a vmx transaction. * * For example, the pseudocode needed to add a new msr should look like: * * arr[(*nr)++] = (struct perf_guest_switch_msr){ * .msr = the hardware msr address, * .host = the value the hardware has when it doesn't run a guest, * .guest = the value the hardware has when it runs a guest, * }; * * These values have nothing to do with the emulated values the guest sees * when it uses {RD,WR}MSR, which should be handled by the KVM context, * specifically in the intel_pmu_{get,set}_msr().
*/ staticstruct perf_guest_switch_msr *intel_guest_get_msrs(int *nr, void *data)
{ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); struct perf_guest_switch_msr *arr = cpuc->guest_switch_msrs; struct kvm_pmu *kvm_pmu = (struct kvm_pmu *)data;
u64 intel_ctrl = hybrid(cpuc->pmu, intel_ctrl);
u64 pebs_mask = cpuc->pebs_enabled & x86_pmu.pebs_capable; int global_ctrl, pebs_enable;
/* * In addition to obeying exclude_guest/exclude_host, remove bits being * used for PEBS when running a guest, because PEBS writes to virtual * addresses (not physical addresses).
*/
*nr = 0;
global_ctrl = (*nr)++;
arr[global_ctrl] = (struct perf_guest_switch_msr){
.msr = MSR_CORE_PERF_GLOBAL_CTRL,
.host = intel_ctrl & ~cpuc->intel_ctrl_guest_mask,
.guest = intel_ctrl & ~cpuc->intel_ctrl_host_mask & ~pebs_mask,
};
if (!x86_pmu.ds_pebs) return arr;
/* * If PMU counter has PEBS enabled it is not enough to * disable counter on a guest entry since PEBS memory * write can overshoot guest entry and corrupt guest * memory. Disabling PEBS solves the problem. * * Don't do this if the CPU already enforces it.
*/ if (x86_pmu.pebs_no_isolation) {
arr[(*nr)++] = (struct perf_guest_switch_msr){
.msr = MSR_IA32_PEBS_ENABLE,
.host = cpuc->pebs_enabled,
.guest = 0,
}; return arr;
}
/*
 * Haswell flavour of the hw_config callback.
 *
 * Runs the generic intel_pmu_hw_config() first and, when the CPU exposes
 * RTM or HLE, copies the IN_TX / IN_TX_CHECKPOINTED filter bits requested
 * in attr.config into hw.config, rejecting the combinations listed below.
 *
 * Returns 0 on success, the error from intel_pmu_hw_config(), or
 * -EOPNOTSUPP for a filter combination that is refused.
 */
static int hsw_hw_config(struct perf_event *event)
{
	int err;

	err = intel_pmu_hw_config(event);
	if (err)
		return err;

	/* Without RTM and HLE there are no transactional filters to set up. */
	if (!(boot_cpu_has(X86_FEATURE_RTM) || boot_cpu_has(X86_FEATURE_HLE)))
		return 0;

	event->hw.config |= event->attr.config & (HSW_IN_TX | HSW_IN_TX_CHECKPOINTED);

	/*
	 * The Haswell PMU does not support the IN_TX/IN_TX-CP filters
	 * together with PEBS or with ANY-thread mode. The results would be
	 * nonsensical, so refuse the combination.
	 */
	if (event->hw.config & (HSW_IN_TX | HSW_IN_TX_CHECKPOINTED)) {
		if (event->hw.config & ARCH_PERFMON_EVENTSEL_ANY)
			return -EOPNOTSUPP;
		if (event->attr.precise_ip > 0)
			return -EOPNOTSUPP;
	}

	if (!event_is_checkpointed(event))
		return 0;

	/*
	 * Sampling a checkpointed event can make the CPU abort constantly:
	 * the overflow aborts the transaction, the counter is rolled back to
	 * the checkpoint and the overflow is ignored. Forbid checkpointing
	 * for sampling.
	 *
	 * A long sampling period is still allowed, so that perf stat from
	 * KVM keeps working.
	 */
	return (event->attr.sample_period > 0 &&
		event->attr.sample_period < 0x7fffffff) ? -EOPNOTSUPP : 0;
}
c = intel_get_event_constraints(cpuc, idx, event);
/* Handle special quirk on in_tx_checkpointed only in counter 2 */ if (event->hw.config & HSW_IN_TX_CHECKPOINTED) { if (c->idxmsk64 & (1U << 2)) return &counter2_constraint; return &emptyconstraint;
}
return c;
}
staticstruct event_constraint *
icl_get_event_constraints(struct cpu_hw_events *cpuc, int idx, struct perf_event *event)
{ /* * Fixed counter 0 has less skid. * Force instruction:ppp in Fixed counter 0
*/ if ((event->attr.precise_ip == 3) &&
constraint_match(&fixed0_constraint, event->hw.config)) return &fixed0_constraint;
/* * The :ppp indicates the Precise Distribution (PDist) facility, which * is only supported on the GP counter 0. If a :ppp event which is not * available on the GP counter 0, error out. * Exception: Instruction PDIR is only available on the fixed counter 0.
*/ if ((event->attr.precise_ip == 3) &&
!constraint_match(&fixed0_constraint, event->hw.config)) { if (c->idxmsk64 & BIT_ULL(0)) return &counter0_constraint;
c = intel_get_event_constraints(cpuc, idx, event);
/* * :ppp means to do reduced skid PEBS, * which is available on PMC0 and fixed counter 0.
*/ if (event->attr.precise_ip == 3) { /* Force instruction:ppp on PMC0 and Fixed counter 0 */ if (constraint_match(&fixed0_constraint, event->hw.config)) return &fixed0_counter0_constraint;
/* * Without TFA we must not use PMC3.
*/ if (!allow_tsx_force_abort && test_bit(3, c->idxmsk)) {
c = dyn_constraint(cpuc, c, idx);
c->idxmsk64 &= ~(1ULL << 3);
c->weight--;
}
c = intel_get_event_constraints(cpuc, idx, event);
/* * The :ppp indicates the Precise Distribution (PDist) facility, which * is only supported on the GP counter 0 & 1 and Fixed counter 0. * If a :ppp event which is not available on the above eligible counters, * error out.
*/ if (event->attr.precise_ip == 3) { /* Force instruction:ppp on PMC0, 1 and Fixed counter 0 */ if (constraint_match(&fixed0_constraint, event->hw.config)) { /* The fixed counter 0 doesn't support LBR event logging. */ if (branch_sample_counters(event)) return &counter0_1_constraint; else return &fixed0_counter0_1_constraint;
}
switch (c->idxmsk64 & 0x3ull) { case 0x1: return &counter0_constraint; case 0x2: return &counter1_constraint; case 0x3: return &counter0_1_constraint;
} return &emptyconstraint;
}
/* The Retire Latency is not supported by the fixed counter 0. */ if (event->attr.precise_ip &&
(event->attr.sample_type & PERF_SAMPLE_WEIGHT_TYPE) &&
constraint_match(&fixed0_constraint, event->hw.config)) { /* * The Instruction PDIR is only available * on the fixed counter 0. Error out for this case.
*/ if (event->attr.precise_ip == 3) return &emptyconstraint; return &counters_1_7_constraint;
}
if (pmu->pmu_type == hybrid_tiny) return intel_pmu_hw_config(event);
return adl_hw_config(event);
}
/*
 * Minimum sampling periods on Haswell.
 *
 * HSW11 requires a period larger than 100, the same as BDM11, so a minimum
 * period of 128 is enforced for INST_RETIRED.ALL as well.
 *
 * The message 'interrupt took too long' can be observed on any counter
 * that was armed with a period < 32 when two events expired in the same
 * NMI, so all remaining events get a minimum period of 32.
 *
 * @event: event whose period is being programmed
 * @left:  in/out period; raised to the applicable minimum if it is smaller
 */
static void hsw_limit_period(struct perf_event *event, s64 *left)
{
	s64 floor = 32;

	/* erratum_hsw11() selects the events that need the larger minimum. */
	if (erratum_hsw11(event))
		floor = 128;

	if (*left < floor)
		*left = floor;
}
/*
 * Broadwell period limiting.
 *
 * The INST_RETIRED.ALL period must always have its lowest 6 bits cleared
 * (BDM55) and must not be smaller than 100 (BDM11). Both are combined into
 * a minimum period of 128: the smallest value that is >= 100 and has bits
 * 0-5 clear.
 *
 * Because of how x86_perf_event_set_period() works, truncating the low 6
 * bits is 'harmless': the 'error' is carried in period_left, so a longer
 * period is occasionally programmed to make up for the 'lost' events.
 *
 * The effective (average) period therefore matches the requested one,
 * despite the coarser hardware granularity.
 *
 * @event: event whose period is being programmed
 * @left:  in/out period; only modified when erratum_hsw11() matches @event
 */
static void bdw_limit_period(struct perf_event *event, s64 *left)
{
	s64 period;

	if (!erratum_hsw11(event))
		return;

	period = *left;
	if (period < 128)
		period = 128;

	/* Drop bits 0-5. */
	*left = period & ~0x3fULL;
}
/* * The umask and umask2 have different formats but share the * same attr name. In update mode, the previous value of the * umask is unconditionally removed before is_visible. If * umask2 format is not enumerated, it's impossible to roll * back to the old format. * Does the check in umask2_show rather than is_visible.
*/ if (i == 0) return attr->mode;
mask = hybrid(dev_get_drvdata(dev), config_mask); if (i == 1) return (mask & ARCH_PERFMON_EVENTSEL_EQ) ? attr->mode : 0;
/* PERF_CAPABILITIES.RDPMC_METRICS_CLEAR */ if (i == 2) { union perf_capabilities intel_cap = hybrid(dev_get_drvdata(dev), intel_cap);
staticstruct intel_shared_regs *allocate_shared_regs(int cpu)
{ struct intel_shared_regs *regs; int i;
regs = kzalloc_node(sizeof(struct intel_shared_regs),
GFP_KERNEL, cpu_to_node(cpu)); if (regs) { /* * initialize the locks to keep lockdep happy
*/ for (i = 0; i < EXTRA_REG_MAX; i++)
raw_spin_lock_init(®s->regs[i].lock);
if (eax.split.acr_subleaf) {
cpuid_count(ARCH_PERFMON_EXT_LEAF, ARCH_PERFMON_ACR_LEAF,
&cntr, &fixed_cntr, &ecx, &edx); /* The mask of the counters which can be reloaded */
hybrid(pmu, acr_cntr_mask64) = cntr | ((u64)fixed_cntr << INTEL_PMC_IDX_FIXED);
/* The mask of the counters which can cause a reload of reloadable counters */
hybrid(pmu, acr_cause_mask64) = ecx | ((u64)edx << INTEL_PMC_IDX_FIXED);
}
if (!intel_pmu_broken_perf_cap()) { /* Perf Metric (Bit 15) and PEBS via PT (Bit 16) are hybrid enumeration */
rdmsrq(MSR_IA32_PERF_CAPABILITIES, hybrid(pmu, intel_cap).capabilities);
}
}
/* * This is running on a CPU model that is known to have hybrid * configurations. But the CPU told us it is not hybrid, shame * on it. There should be a fixup function provided for these * troublesome CPUs (->get_hybrid_cpu_type).
*/ if (cpu_type == INTEL_CPU_TYPE_UNKNOWN) { if (x86_pmu.get_hybrid_cpu_type)
cpu_type = x86_pmu.get_hybrid_cpu_type(); else return NULL;
}
/* * This essentially just maps between the 'hybrid_cpu_type' * and 'hybrid_pmu_type' enums except for ARL-H processor * which needs to compare atom uarch native id since ARL-H * contains two different atom uarchs.
*/ for (i = 0; i < x86_pmu.num_hybrid_pmus; i++) { enum hybrid_pmu_type pmu_type = x86_pmu.hybrid_pmu[i].pmu_type;
u32 native_id;
if (cpu_type == INTEL_CPU_TYPE_CORE && pmu_type == hybrid_big) return &x86_pmu.hybrid_pmu[i]; if (cpu_type == INTEL_CPU_TYPE_ATOM) { if (x86_pmu.num_hybrid_pmus == 2 && pmu_type == hybrid_small) return &x86_pmu.hybrid_pmu[i];
staticvoid intel_pmu_cpu_starting(int cpu)
{ struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); int core_id = topology_core_id(cpu); int i;
if (is_hybrid() && !init_hybrid_pmu(cpu)) return;
init_debug_store_on_cpu(cpu); /* * Deal with CPUs that don't clear their LBRs on power-up, and that may * even boot with LBRs enabled.
*/ if (!static_cpu_has(X86_FEATURE_ARCH_LBR) && x86_pmu.lbr_nr)
msr_clear_bit(MSR_IA32_DEBUGCTLMSR, DEBUGCTLMSR_LBR_BIT);
intel_pmu_lbr_reset();
if (x86_pmu.version > 1)
flip_smm_bit(&x86_pmu.attr_freeze_on_smi);
/* * Disable perf metrics if any added CPU doesn't support it. * * Turn off the check for a hybrid architecture, because the * architecture MSR, MSR_IA32_PERF_CAPABILITIES, only indicate * the architecture features. The perf metrics is a model-specific * feature for now. The corresponding bit should always be 0 on * a hybrid platform, e.g., Alder Lake.
*/ if (!is_hybrid() && x86_pmu.intel_cap.perf_metrics) { union perf_capabilities perf_cap;
/* * Intel PMCs cannot be accessed sanely above 32-bit width, * so we install an artificial 1<<31 period regardless of * the generic event period:
*/
.max_period = (1ULL<<31) - 1,
.get_event_constraints = intel_get_event_constraints,
.put_event_constraints = intel_put_event_constraints,
.event_constraints = intel_core_event_constraints,
.guest_get_msrs = core_guest_get_msrs,
.format_attrs = intel_arch_formats_attr,
.events_sysfs_show = intel_event_sysfs_show,
/* * Virtual (or funny metal) CPU can define x86_pmu.extra_regs * together with PMU version 1 and thus be using core_pmu with * shared_regs. We need following callbacks here to allocate * it properly.
*/
.cpu_prepare = intel_pmu_cpu_prepare,
.cpu_starting = intel_pmu_cpu_starting,
.cpu_dying = intel_pmu_cpu_dying,
.cpu_dead = intel_pmu_cpu_dead,
/* * SMM has access to all 4 rings and while traditionally SMM code only * ran in CPL0, 2021-era firmware is starting to make use of CPL3 in SMM. * * Since the EVENTSEL.{USR,OS} CPL filtering makes no distinction * between SMM or not, this results in what should be pure userspace * counters including SMM data. * * This is a clear privilege issue, therefore globally disable * counting SMM by default.
*/
.attr_freeze_on_smi = 1,
};
/*
 * Quirk: turn PEBS off altogether, warning the user about it.
 */
static __init void intel_clovertown_quirk(void)
{
	/*
	 * PEBS cannot be trusted here because of these errata:
	 *
	 *   AJ67  - PEBS may experience CPL leaks
	 *   AJ68  - PEBS PMI may be delayed by one event
	 *   AJ69  - GLOBAL_STATUS[62] will only be set when DEBUGCTL[12]
	 *   AJ106 - FREEZE_LBRS_ON_PMI doesn't work in combination with PEBS
	 *
	 * Possible individual workarounds:
	 *
	 *   AJ67:  restrict the OS/USR flags.
	 *   AJ69:  set PMU_FREEZE_ON_PMI.
	 *   AJ106: possibly, do not allow LBR usage from PEBS, including
	 *          the fixup.
	 *   AJ68:  possibly, always program a pebs_event_reset[0] value and
	 *          cope with the lost events.
	 *
	 * Taken together, though, it makes more sense to simply not enable
	 * PEBS on these chips.
	 */
	pr_warn("PEBS disabled due to CPU errata\n");
	x86_pmu.pebs_constraints = NULL;
	x86_pmu.ds_pebs = 0;
}
/*
 * Re-evaluate whether PEBS is broken and, if the answer differs from the
 * recorded x86_pmu.pebs_broken state, flip that state and log the change.
 */
static void intel_snb_check_microcode(void)
{
	if (intel_snb_pebs_broken() != x86_pmu.pebs_broken) {
		/*
		 * Serialized by the microcode lock.
		 */
		if (!x86_pmu.pebs_broken) {
			pr_info("PEBS disabled due to CPU errata, please upgrade microcode\n");
			x86_pmu.pebs_broken = 1;
		} else {
			pr_info("PEBS enabled due to microcode update\n");
			x86_pmu.pebs_broken = 0;
		}
	}
}
/* * Under certain circumstances, accessing certain MSRs may cause a #GP. * This function tests whether the given MSR can be safely accessed.
*/ staticbool check_msr(unsignedlong msr, u64 mask)
{
u64 val_old, val_new, val_tmp;
/* * Disable the check for real HW, so we don't * mess with potentially enabled registers:
*/ if (!boot_cpu_has(X86_FEATURE_HYPERVISOR)) returntrue;
/* * Read the current value, change it and read it back to see if it * matches, this is needed to detect certain hardware emulators * (qemu/kvm) that don't trap on the MSR access and always return 0s.
*/ if (rdmsrq_safe(msr, &val_old)) returnfalse;
/* * Only change the bits which can be updated by wrmsrq.
*/
val_tmp = val_old ^ mask;
if (is_lbr_from(msr))
val_tmp = lbr_from_signext_quirk_wr(val_tmp);
if (wrmsrq_safe(msr, val_tmp) ||
rdmsrq_safe(msr, &val_new)) returnfalse;
/* * Quirk only affects validation in wrmsr(), so wrmsrq()'s value * should equal rdmsrq()'s even with the quirk.
*/ if (val_new != val_tmp) returnfalse;
if (is_lbr_from(msr))
val_old = lbr_from_signext_quirk_wr(val_old);
/* Here it's sure that the MSR can be safely accessed. * Restore the old value and return.
*/
wrmsrq(msr, val_old);
/*
 * Quirk: walk the bits set in x86_pmu.events_mask and, for each one, zero
 * the matching intel_perfmon_event_map[] slot and warn about the event.
 */
static __init void intel_arch_events_quirk(void)
{
	const unsigned long nr_arch_events = ARRAY_SIZE(intel_arch_events_map);
	int idx;

	/* Disable every event that CPUID reports as not present. */
	for_each_set_bit(idx, x86_pmu.events_mask, nr_arch_events) {
		intel_perfmon_event_map[intel_arch_events_map[idx].id] = 0;
		pr_warn("CPUID marked event: \'%s\' unavailable\n",
			intel_arch_events_map[idx].name);
	}
}
static __init void intel_nehalem_quirk(void)
{ union cpuid10_ebx ebx;
ebx.full = x86_pmu.events_maskl; if (ebx.split.no_branch_misses_retired) { /* * Erratum AAJ80 detected, we work it around by using * the BR_MISP_EXEC.ANY event. This will over-count * branch-misses, but it's still much better than the * architectural event which is often completely bogus:
*/
intel_perfmon_event_map[PERF_COUNT_HW_BRANCH_MISSES] = 0x7f89;
ebx.split.no_branch_misses_retired = 0;
x86_pmu.events_maskl = ebx.full;
pr_info("CPU erratum AAJ80 worked around\n");
}
}
/* * enable software workaround for errata: * SNB: BJ122 * IVB: BV98 * HSW: HSD29 * * Only needed when HT is enabled. However detecting * if HT is enabled is difficult (model specific). So instead, * we enable the workaround in the early boot, and verify if * it is needed in a later initcall phase once we have valid * topology information to check if HT is actually enabled
*/ static __init void intel_ht_bug(void)
{
x86_pmu.flags |= PMU_FL_EXCL_CNTRS | PMU_FL_EXCL_ENABLED;
/* * check if PMC3 is used * and if so force schedule out for all event types all contexts
*/ if (test_bit(3, cpuc->active_mask))
perf_pmu_resched(x86_get_pmu(smp_processor_id()));
}
static umode_t
td_is_visible(struct kobject *kobj, struct attribute *attr, int i)
{ /* * Hide the perf metrics topdown events * if the feature is not enumerated.
*/ if (x86_pmu.num_topdown_events) return x86_pmu.intel_cap.perf_metrics ? attr->mode : 0;
/* Must be in IDX order */
/*
 * Memory-access event aliases declared through EVENT_ATTR_STR_HYBRID().
 *
 * mem-loads and mem-stores each carry two ';'-separated encodings and are
 * tagged hybrid_big_small; mem-loads-aux carries a single encoding and is
 * tagged hybrid_big.
 * NOTE(review): presumably each ';'-separated encoding belongs to one of the
 * PMU types named by the last argument - confirm against the
 * EVENT_ATTR_STR_HYBRID() definition.
 */
EVENT_ATTR_STR_HYBRID(mem-loads, mem_ld_adl, "event=0xd0,umask=0x5,ldlat=3;event=0xcd,umask=0x1,ldlat=3", hybrid_big_small);
EVENT_ATTR_STR_HYBRID(mem-stores, mem_st_adl, "event=0xd0,umask=0x6;event=0xcd,umask=0x2", hybrid_big_small);
/* Same event=0x03,umask=0x82 encoding as the auxiliary event required by the load latency work-around. */
EVENT_ATTR_STR_HYBRID(mem-loads-aux, mem_ld_aux_adl, "event=0x03,umask=0x82", hybrid_big);
/* * event on fixed counter2 (REF_CYCLES) only works on this * counter, so do not extend mask to generic counters
*/
for_each_event_constraint(c, event_constraints) { /* * Don't extend the topdown slots and metrics * events to the generic counters.
*/ if (c->idxmsk64 & INTEL_PMC_MSK_TOPDOWN) { /* * Disable topdown slots and metrics events, * if slots event is not in CPUID.
*/ if (!(INTEL_PMC_MSK_FIXED_SLOTS & intel_ctrl))
c->idxmsk64 = 0;
c->weight = hweight64(c->idxmsk64); continue;
}
if (c->cmask == FIXED_EVENT_FLAGS) { /* Disabled fixed counters which are not in CPUID */
c->idxmsk64 &= intel_ctrl;
/* * Don't extend the pseudo-encoding to the * generic counters
*/ if (!use_fixed_pseudo_encoding(c->code))
c->idxmsk64 |= cntr_mask;
}
c->idxmsk64 &= cntr_mask | (fixed_cntr_mask << INTEL_PMC_IDX_FIXED);
c->weight = hweight64(c->idxmsk64);
}
}
/* * Accessing an extra MSR may cause a #GP under certain circumstances, * e.g. KVM doesn't support offcore events. * Check all extra_regs here.
*/ if (!extra_regs) return;
/* Architectural Perfmon was introduced starting with Core "Yonah" */ if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) { switch (boot_cpu_data.x86) { case 6: if (boot_cpu_data.x86_vfm < INTEL_CORE_YONAH) return p6_pmu_init(); break; case 11: return knc_pmu_init(); case 15: return p4_pmu_init();
}
pr_cont("unsupported CPU family %d model %d ",
boot_cpu_data.x86, boot_cpu_data.x86_model); return -ENODEV;
}
/* * Check whether the Architectural PerfMon supports * Branch Misses Retired hw_event or not.
*/
cpuid(10, &eax.full, &ebx.full, &fixed_mask, &edx.full); if (eax.split.mask_length < ARCH_PERFMON_EVENTS_COUNT) return -ENODEV;
version = eax.split.version_id; if (version < 2)
x86_pmu = core_pmu; else
x86_pmu = intel_pmu;
/* * Quirk: v2 perfmon does not report fixed-purpose events, so * assume at least 3 events, when not running in a hypervisor:
*/ if (version > 1 && version < 5) { int assume = 3 * !boot_cpu_has(X86_FEATURE_HYPERVISOR);
if (boot_cpu_has(X86_FEATURE_ARCH_LBR))
intel_pmu_arch_lbr_init();
intel_pebs_init();
x86_add_quirk(intel_arch_events_quirk); /* Install first, so it runs last */
if (version >= 5) {
x86_pmu.intel_cap.anythread_deprecated = edx.split.anythread_deprecated; if (x86_pmu.intel_cap.anythread_deprecated)
pr_cont(" AnyThread deprecated, ");
}
/* * Many features on and after V6 require dynamic constraint, * e.g., Arch PEBS, ACR.
*/ if (version >= 6)
x86_pmu.flags |= PMU_FL_DYN_CONSTRAINT; /* * Install the hw-cache-events table:
*/ switch (boot_cpu_data.x86_vfm) { case INTEL_CORE_YONAH:
pr_cont("Core events, ");
name = "core"; break;
case INTEL_CORE2_MEROM:
x86_add_quirk(intel_clovertown_quirk);
fallthrough;
case INTEL_CORE2_MEROM_L: case INTEL_CORE2_PENRYN: case INTEL_CORE2_DUNNINGTON:
memcpy(hw_cache_event_ids, core2_hw_cache_event_ids, sizeof(hw_cache_event_ids));
case INTEL_NEHALEM: case INTEL_NEHALEM_EP: case INTEL_NEHALEM_EX:
memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids, sizeof(hw_cache_event_ids));
memcpy(hw_cache_extra_regs, nehalem_hw_cache_extra_regs, sizeof(hw_cache_extra_regs));
pr_cont("Nehalem events, ");
name = "nehalem"; break;
case INTEL_ATOM_BONNELL: case INTEL_ATOM_BONNELL_MID: case INTEL_ATOM_SALTWELL: case INTEL_ATOM_SALTWELL_MID: case INTEL_ATOM_SALTWELL_TABLET:
memcpy(hw_cache_event_ids, atom_hw_cache_event_ids, sizeof(hw_cache_event_ids));
case INTEL_ATOM_SILVERMONT: case INTEL_ATOM_SILVERMONT_D: case INTEL_ATOM_SILVERMONT_MID: case INTEL_ATOM_AIRMONT: case INTEL_ATOM_SILVERMONT_MID2:
memcpy(hw_cache_event_ids, slm_hw_cache_event_ids, sizeof(hw_cache_event_ids));
memcpy(hw_cache_extra_regs, slm_hw_cache_extra_regs, sizeof(hw_cache_extra_regs));
case INTEL_ATOM_GOLDMONT: case INTEL_ATOM_GOLDMONT_D:
memcpy(hw_cache_event_ids, glm_hw_cache_event_ids, sizeof(hw_cache_event_ids));
memcpy(hw_cache_extra_regs, glm_hw_cache_extra_regs, sizeof(hw_cache_extra_regs));
intel_pmu_lbr_init_skl();
x86_pmu.event_constraints = intel_slm_event_constraints;
x86_pmu.pebs_constraints = intel_glm_pebs_event_constraints;
x86_pmu.extra_regs = intel_glm_extra_regs; /* * It's recommended to use CPU_CLK_UNHALTED.CORE_P + NPEBS * for precise cycles. * :pp is identical to :ppp
*/
x86_pmu.pebs_aliases = NULL;
x86_pmu.pebs_prec_dist = true;
x86_pmu.lbr_pt_coexist = true;
x86_pmu.flags |= PMU_FL_HAS_RSP_1;
td_attr = glm_events_attrs;
extra_attr = slm_format_attr;
pr_cont("Goldmont events, ");
name = "goldmont"; break;
case INTEL_ATOM_GOLDMONT_PLUS:
memcpy(hw_cache_event_ids, glp_hw_cache_event_ids, sizeof(hw_cache_event_ids));
memcpy(hw_cache_extra_regs, glp_hw_cache_extra_regs, sizeof(hw_cache_extra_regs));
intel_pmu_lbr_init_skl();
x86_pmu.event_constraints = intel_slm_event_constraints;
x86_pmu.extra_regs = intel_glm_extra_regs; /* * It's recommended to use CPU_CLK_UNHALTED.CORE_P + NPEBS * for precise cycles.
*/
x86_pmu.pebs_aliases = NULL;
x86_pmu.pebs_prec_dist = true;
x86_pmu.lbr_pt_coexist = true;
x86_pmu.pebs_capable = ~0ULL;
x86_pmu.flags |= PMU_FL_HAS_RSP_1;
x86_pmu.flags |= PMU_FL_PEBS_ALL;
x86_pmu.get_event_constraints = glp_get_event_constraints;
td_attr = glm_events_attrs; /* Goldmont Plus has 4-wide pipeline */
event_attr_td_total_slots_scale_glm.event_str = "4";
extra_attr = slm_format_attr;
pr_cont("Goldmont plus events, ");
name = "goldmont_plus"; break;
case INTEL_ATOM_TREMONT_D: case INTEL_ATOM_TREMONT: case INTEL_ATOM_TREMONT_L:
x86_pmu.late_ack = true;
memcpy(hw_cache_event_ids, glp_hw_cache_event_ids, sizeof(hw_cache_event_ids));
memcpy(hw_cache_extra_regs, tnt_hw_cache_extra_regs, sizeof(hw_cache_extra_regs));
hw_cache_event_ids[C(ITLB)][C(OP_READ)][C(RESULT_ACCESS)] = -1;
case INTEL_WESTMERE: case INTEL_WESTMERE_EP: case INTEL_WESTMERE_EX:
memcpy(hw_cache_event_ids, westmere_hw_cache_event_ids, sizeof(hw_cache_event_ids));
memcpy(hw_cache_extra_regs, nehalem_hw_cache_extra_regs, sizeof(hw_cache_extra_regs));
pr_cont("SandyBridge events, ");
name = "sandybridge"; break;
case INTEL_IVYBRIDGE: case INTEL_IVYBRIDGE_X:
x86_add_quirk(intel_ht_bug);
memcpy(hw_cache_event_ids, snb_hw_cache_event_ids, sizeof(hw_cache_event_ids)); /* dTLB-load-misses on IVB is different than SNB */
hw_cache_event_ids[C(DTLB)][C(OP_READ)][C(RESULT_MISS)] = 0x8108; /* DTLB_LOAD_MISSES.DEMAND_LD_MISS_CAUSES_A_WALK */
pr_cont("IvyBridge events, ");
name = "ivybridge"; break;
case INTEL_HASWELL: case INTEL_HASWELL_X: case INTEL_HASWELL_L: case INTEL_HASWELL_G:
x86_add_quirk(intel_ht_bug);
x86_add_quirk(intel_pebs_isolation_quirk);
x86_pmu.late_ack = true;
memcpy(hw_cache_event_ids, hsw_hw_cache_event_ids, sizeof(hw_cache_event_ids));
memcpy(hw_cache_extra_regs, hsw_hw_cache_extra_regs, sizeof(hw_cache_extra_regs));
intel_pmu_lbr_init_hsw();
x86_pmu.event_constraints = intel_hsw_event_constraints;
x86_pmu.pebs_constraints = intel_hsw_pebs_event_constraints;
x86_pmu.extra_regs = intel_snbep_extra_regs;
x86_pmu.pebs_aliases = intel_pebs_aliases_ivb;
x86_pmu.pebs_prec_dist = true; /* all extra regs are per-cpu when HT is on */
x86_pmu.flags |= PMU_FL_HAS_RSP_1;
x86_pmu.flags |= PMU_FL_NO_HT_SHARING;
case INTEL_BROADWELL: case INTEL_BROADWELL_D: case INTEL_BROADWELL_G: case INTEL_BROADWELL_X:
x86_add_quirk(intel_pebs_isolation_quirk);
x86_pmu.late_ack = true;
memcpy(hw_cache_event_ids, hsw_hw_cache_event_ids, sizeof(hw_cache_event_ids));
memcpy(hw_cache_extra_regs, hsw_hw_cache_extra_regs, sizeof(hw_cache_extra_regs));
/* all extra regs are per-cpu when HT is on */
x86_pmu.flags |= PMU_FL_HAS_RSP_1;
x86_pmu.flags |= PMU_FL_NO_HT_SHARING;
extra_attr = slm_format_attr;
pr_cont("Knights Landing/Mill events, ");
name = "knights-landing"; break;
case INTEL_SKYLAKE_X:
pmem = true;
fallthrough; case INTEL_SKYLAKE_L: case INTEL_SKYLAKE: case INTEL_KABYLAKE_L: case INTEL_KABYLAKE: case INTEL_COMETLAKE_L: case INTEL_COMETLAKE:
x86_add_quirk(intel_pebs_isolation_quirk);
x86_pmu.late_ack = true;
memcpy(hw_cache_event_ids, skl_hw_cache_event_ids, sizeof(hw_cache_event_ids));
memcpy(hw_cache_extra_regs, skl_hw_cache_extra_regs, sizeof(hw_cache_extra_regs));
intel_pmu_lbr_init_skl();
/* INT_MISC.RECOVERY_CYCLES has umask 1 in Skylake */
event_attr_td_recovery_bubbles.event_str_noht = "event=0xd,umask=0x1,cmask=1";
event_attr_td_recovery_bubbles.event_str_ht = "event=0xd,umask=0x1,cmask=1,any=1";
x86_pmu.event_constraints = intel_skl_event_constraints;
x86_pmu.pebs_constraints = intel_skl_pebs_event_constraints;
x86_pmu.extra_regs = intel_skl_extra_regs;
x86_pmu.pebs_aliases = intel_pebs_aliases_skl;
x86_pmu.pebs_prec_dist = true; /* all extra regs are per-cpu when HT is on */
x86_pmu.flags |= PMU_FL_HAS_RSP_1;
x86_pmu.flags |= PMU_FL_NO_HT_SHARING;
/* * Processors with CPUID.RTM_ALWAYS_ABORT have TSX deprecated by default. * TSX force abort hooks are not required on these systems. Only deploy * workaround when microcode has not enabled X86_FEATURE_RTM_ALWAYS_ABORT.
*/ if (boot_cpu_has(X86_FEATURE_TSX_FORCE_ABORT) &&
!boot_cpu_has(X86_FEATURE_RTM_ALWAYS_ABORT)) {
x86_pmu.flags |= PMU_FL_TFA;
x86_pmu.get_event_constraints = tfa_get_event_constraints;
x86_pmu.enable_all = intel_tfa_pmu_enable_all;
x86_pmu.commit_scheduling = intel_tfa_commit_scheduling;
}
pr_cont("Skylake events, ");
name = "skylake"; break;
case INTEL_ICELAKE_X: case INTEL_ICELAKE_D:
x86_pmu.pebs_ept = 1;
pmem = true;
fallthrough; case INTEL_ICELAKE_L: case INTEL_ICELAKE: case INTEL_TIGERLAKE_L: case INTEL_TIGERLAKE: case INTEL_ROCKETLAKE:
x86_pmu.late_ack = true;
memcpy(hw_cache_event_ids, skl_hw_cache_event_ids, sizeof(hw_cache_event_ids));
memcpy(hw_cache_extra_regs, skl_hw_cache_extra_regs, sizeof(hw_cache_extra_regs));
hw_cache_event_ids[C(ITLB)][C(OP_READ)][C(RESULT_ACCESS)] = -1;
intel_pmu_lbr_init_skl();
case INTEL_SAPPHIRERAPIDS_X: case INTEL_EMERALDRAPIDS_X:
x86_pmu.flags |= PMU_FL_MEM_LOADS_AUX;
x86_pmu.extra_regs = intel_glc_extra_regs;
pr_cont("Sapphire Rapids events, ");
name = "sapphire_rapids"; goto glc_common;
case INTEL_GRANITERAPIDS_X: case INTEL_GRANITERAPIDS_D:
x86_pmu.extra_regs = intel_rwc_extra_regs;
pr_cont("Granite Rapids events, ");
name = "granite_rapids";
case INTEL_ALDERLAKE: case INTEL_ALDERLAKE_L: case INTEL_RAPTORLAKE: case INTEL_RAPTORLAKE_P: case INTEL_RAPTORLAKE_S: /* * Alder Lake has 2 types of CPU, core and atom. * * Initialize the common PerfMon capabilities here.
*/
intel_pmu_init_hybrid(hybrid_big_small);
/* * Quirk: For some Alder Lake machine, when all E-cores are disabled in * a BIOS, the leaf 0xA will enumerate all counters of P-cores. However, * the X86_FEATURE_HYBRID_CPU is still set. The above codes will * mistakenly add extra counters for P-cores. Correct the number of * counters here.
*/ if ((x86_pmu_num_counters(&pmu->pmu) > 8) || (x86_pmu_num_counters_fixed(&pmu->pmu) > 4)) {
pmu->cntr_mask64 = x86_pmu.cntr_mask64;
pmu->fixed_cntr_mask64 = x86_pmu.fixed_cntr_mask64;
}
/* Initialize big core specific PerfMon capabilities. */
pmu = &x86_pmu.hybrid_pmu[X86_HYBRID_PMU_CORE_IDX];
intel_pmu_init_lnc(&pmu->pmu);
/* Initialize Atom core specific PerfMon capabilities. */
pmu = &x86_pmu.hybrid_pmu[X86_HYBRID_PMU_ATOM_IDX];
intel_pmu_init_skt(&pmu->pmu);
/* Initialize Lower Power Atom specific PerfMon capabilities. */
pmu = &x86_pmu.hybrid_pmu[X86_HYBRID_PMU_TINY_IDX];
intel_pmu_init_grt(&pmu->pmu);
pmu->extra_regs = intel_cmt_extra_regs;
intel_pmu_pebs_data_source_arl_h();
pr_cont("ArrowLake-H Hybrid events, ");
name = "arrowlake_h_hybrid"; break;
default: switch (x86_pmu.version) { case 1:
x86_pmu.event_constraints = intel_v1_event_constraints;
pr_cont("generic architected perfmon v1, ");
name = "generic_arch_v1"; break; case 2: case 3: case 4: /* * default constraints for v2 and up
*/
x86_pmu.event_constraints = intel_gen_event_constraints;
pr_cont("generic architected perfmon, ");
name = "generic_arch_v2+"; break; default: /* * The default constraints for v5 and up can support up to * 16 fixed counters. For the fixed counters 4 and later, * the pseudo-encoding is applied. * The constraints may be cut according to the CPUID enumeration * by inserting the EVENT_CONSTRAINT_END.
*/ if (fls64(x86_pmu.fixed_cntr_mask64) > INTEL_PMC_MAX_FIXED)
x86_pmu.fixed_cntr_mask64 &= GENMASK_ULL(INTEL_PMC_MAX_FIXED - 1, 0);
intel_v5_gen_event_constraints[fls64(x86_pmu.fixed_cntr_mask64)].weight = -1;
x86_pmu.event_constraints = intel_v5_gen_event_constraints;
pr_cont("generic architected perfmon, ");
name = "generic_arch_v5+"; break;
}
}
/* * The archPerfmonExt (0x23) includes an enhanced enumeration of * PMU architectural features with a per-core view. For non-hybrid, * each core has the same PMU capabilities. It's good enough to * update the x86_pmu from the booting CPU. For hybrid, the x86_pmu * is used to keep the common capabilities. Still keep the values * from the leaf 0xa. The core specific update will be done later * when a new type is online.
*/ if (!is_hybrid() && boot_cpu_has(X86_FEATURE_ARCH_PERFMON_EXT))
update_pmu_cap(NULL);
/* AnyThread may be deprecated on arch perfmon v5 or later */ if (x86_pmu.intel_cap.anythread_deprecated)
x86_pmu.format_attrs = intel_arch_formats_attr;
intel_pmu_check_event_constraints(x86_pmu.event_constraints,
x86_pmu.cntr_mask64,
x86_pmu.fixed_cntr_mask64,
x86_pmu.intel_ctrl); /* * Access LBR MSR may cause #GP under certain circumstances. * Check all LBR MSR here. * Disable LBR access if any LBR MSRs can not be accessed.
*/ if (x86_pmu.lbr_tos && !check_msr(x86_pmu.lbr_tos, 0x3UL))
x86_pmu.lbr_nr = 0; for (i = 0; i < x86_pmu.lbr_nr; i++) { if (!(check_msr(x86_pmu.lbr_from + i, 0xffffUL) &&
check_msr(x86_pmu.lbr_to + i, 0xffffUL)))
x86_pmu.lbr_nr = 0;
}
if (x86_pmu.lbr_nr) {
intel_pmu_lbr_init();
pr_cont("%d-deep LBR, ", x86_pmu.lbr_nr);
/* only support branch_stack snapshot for perfmon >= v2 */ if (x86_pmu.disable_all == intel_pmu_disable_all) { if (boot_cpu_has(X86_FEATURE_ARCH_LBR)) {
static_call_update(perf_snapshot_branch_stack,
intel_pmu_snapshot_arch_branch_stack);
} else {
static_call_update(perf_snapshot_branch_stack,
intel_pmu_snapshot_branch_stack);
}
}
}
intel_pmu_check_extra_regs(x86_pmu.extra_regs);
/* Support full width counters using alternative MSR range */ if (x86_pmu.intel_cap.full_width_write) {
x86_pmu.max_period = x86_pmu.cntval_mask >> 1;
x86_pmu.perfctr = MSR_IA32_PMC0;
pr_cont("full-width counters, ");
}
if (!is_hybrid() && x86_pmu.intel_cap.perf_metrics)
x86_pmu.intel_ctrl |= 1ULL << GLOBAL_CTRL_EN_PERF_METRICS;
if (x86_pmu.intel_cap.pebs_timing_info)
x86_pmu.flags |= PMU_FL_RETIRE_LATENCY;
intel_aux_output_init();
return 0;
}
/*
 * HT bug: phase 2 init.
 * Called once we have valid topology information, to check
 * whether or not HT is enabled.
 * If HT is off, then we disable the workaround.
*/ static __init int fixup_ht_bug(void)
{ int c; /* * problem not present on this CPU model, nothing to do
*/ if (!(x86_pmu.flags & PMU_FL_EXCL_ENABLED)) return 0;
if (topology_max_smt_threads() > 1) {
pr_info("PMU erratum BJ122, BV98, HSD29 worked around, HT is on\n"); return 0;
}
¤ Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereitgestellten Informationen zugesichert. 0.148 Bemerkung:
(vorverarbeitet am 2026-04-26)
¤
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereit gestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.