/*
 * NOTE:
 * - Each perf counter is defined as "struct kvm_pmc";
 * - There are two types of perf counters: general purpose (gp) and fixed.
 *   gp counters are stored in gp_counters[] and fixed counters are stored
 *   in fixed_counters[] respectively.  Both of them are part of "struct
 *   kvm_pmu";
 * - pmu.c understands the difference between gp counters and fixed counters.
 *   However AMD doesn't support fixed-counters;
 * - There are three types of index to access perf counters (PMC):
 *   1. MSR (named msr): For example Intel has MSR_IA32_PERFCTRn and AMD
 *      has MSR_K7_PERFCTRn and, for families 15H and later,
 *      MSR_F15H_PERF_CTRn, where MSR_F15H_PERF_CTR[0-3] are
 *      aliased to MSR_K7_PERFCTRn.
 *   2. MSR Index (named idx): This normally is used by RDPMC instruction.
 *      For instance AMD RDPMC instruction uses 0000_0003h in ECX to access
 *      C001_0007h (MSR_K7_PERCTR3).  Intel has a similar mechanism, except
 *      that it also supports fixed counters.  idx can be used to as index to
 *      gp and fixed counters.
 *   3. Global PMC Index (named pmc): pmc is an index specific to PMU
 *      code.  Each pmc, stored in kvm_pmc.idx field, is unique across
 *      all perf counters (both gp and fixed).  The mapping relationship
 *      between pmc and perf counters is as the following:
 *      * Intel: [0 .. KVM_MAX_NR_INTEL_GP_COUNTERS-1] <=> gp counters
 *               [KVM_FIXED_PMC_BASE_IDX .. KVM_FIXED_PMC_BASE_IDX + 2] <=> fixed
 *      * AMD:   [0 .. AMD64_NUM_COUNTERS-1] and, for families 15H
 *               and later, [0 .. AMD64_NUM_COUNTERS_CORE-1] <=> gp counters
 */
/*
 * NOTE(review): fragment of an overflow handler; the enclosing function
 * header is not visible in this chunk.  Decide whether to record the
 * overflow in global_status and whether to inject a PMI into the guest.
 */
if (pmc->perf_event && pmc->perf_event->attr.precise_ip) {
	if (!in_pmi) {
		/*
		 * TODO: KVM is currently _choosing_ to not generate records
		 * for emulated instructions, avoiding BUFFER_OVF PMI when
		 * there are no records.  Strictly speaking, it should be done
		 * as well in the right context to improve sampling accuracy.
		 */
		skip_pmi = true;
	} else {
		/* Indicate PEBS overflow PMI to guest. */
		skip_pmi = __test_and_set_bit(GLOBAL_STATUS_BUFFER_OVF_BIT,
					      (unsigned long *)&pmu->global_status);
	}
} else {
	__set_bit(pmc->idx, (unsigned long *)&pmu->global_status);
}

if (pmc->intr && !skip_pmi)
	kvm_make_request(KVM_REQ_PMI, pmc->vcpu);
}
/*
 * Ignore asynchronous overflow events for counters that are scheduled
 * to be reprogrammed, e.g. if a PMI for the previous event races with
 * KVM's handling of a related guest WRMSR.
 */
if (test_and_set_bit(pmc->idx, pmc_to_pmu(pmc)->reprogram_pmi))
	return;

__kvm_perf_overflow(pmc, true);

kvm_make_request(KVM_REQ_PMU, pmc->vcpu);
}
/*
 * Map a vPMC to the PEBS precision level requested from perf.
 *
 * For some model-specific PEBS counters with special capabilities
 * (PDIR, PDIR++, PDIST), KVM needs to raise the event precise level
 * to the maximum value (currently 3, backwards compatible) so that
 * the perf subsystem assigns the specific hardware counter with that
 * capability for the vPMC.
 */
static u64 pmc_get_pebs_precise_level(struct kvm_pmc *pmc)
{
	if ((pmc->idx == 0 && x86_match_cpu(vmx_pebs_pdist_cpu)) ||
	    (pmc->idx == 32 && x86_match_cpu(vmx_pebs_pdir_cpu)))
		return 3;

	/*
	 * The non-zero precision level of guest event makes the ordinary
	 * guest event becomes a guest PEBS event and triggers the host
	 * PEBS PMI handler to determine whether the PEBS overflow PMI
	 * comes from the host counters or the guest.
	 */
	return 1;
}
if ((attr.config & HSW_IN_TX_CHECKPOINTED) &&
    (boot_cpu_has(X86_FEATURE_RTM) || boot_cpu_has(X86_FEATURE_HLE))) {
	/*
	 * HSW_IN_TX_CHECKPOINTED is not supported with nonzero
	 * period.  Just clear the sample period so at least
	 * allocating the counter doesn't fail.
	 */
	attr.sample_period = 0;
}

if (pebs) {
	/*
	 * For most PEBS hardware events, the difference in the software
	 * precision levels of guest and host PEBS events will not affect
	 * the accuracy of the PEBS profiling result, because the "event IP"
	 * in the PEBS record is calibrated on the guest side.
	 */
	attr.precise_ip = pmc_get_pebs_precise_level(pmc);
}
/* update counter, reset event value to avoid redundant accumulation */
if (pmc->perf_event && !pmc->is_paused)
	counter += perf_event_pause(pmc->perf_event, true);

/*
 * Snapshot the previous counter *after* accumulating state from perf.
 * If overflow already happened, hardware (via perf) is responsible for
 * generating a PMI.  KVM just needs to detect overflow on emulated
 * counter events that haven't yet been processed.
 */
prev_counter = counter & pmc_bitmask(pmc);
staticbool pmc_resume_counter(struct kvm_pmc *pmc)
{ if (!pmc->perf_event) returnfalse;
/* recalibrate sample period and check if it's accepted by perf core */ if (is_sampling_event(pmc->perf_event) &&
perf_event_period(pmc->perf_event,
get_sample_period(pmc, pmc->counter))) returnfalse;
if (test_bit(pmc->idx, (unsignedlong *)&pmc_to_pmu(pmc)->pebs_enable) !=
(!!pmc->perf_event->attr.precise_ip)) returnfalse;
/* reuse perf_event to serve as pmc_reprogram_counter() does*/
perf_event_enable(pmc->perf_event);
pmc->is_paused = false;
/* Emulate a guest WRMSR to a counter MSR: overwrite, don't accumulate. */
void pmc_write_counter(struct kvm_pmc *pmc, u64 val)
{
	/*
	 * Drop any unconsumed accumulated counts, the WRMSR is a write, not a
	 * read-modify-write.  Adjust the counter value so that its value is
	 * relative to the current count, as reading the current count from
	 * perf is faster than pausing and reprogramming the event in order to
	 * reset it to '0'.  Note, this very sneakily offsets the accumulated
	 * emulated count too, by using pmc_read_counter()!
	 */
	pmc->emulated_counter = 0;
	pmc->counter += val - pmc_read_counter(pmc);
	pmc->counter &= pmc_bitmask(pmc);
	pmc_update_sample_period(pmc);
}
EXPORT_SYMBOL_GPL(pmc_write_counter);
/* * For the event filter, searching is done on the 'includes' list and * 'excludes' list separately rather than on the 'events' list (which * has both). As a result the exclude bit can be ignored.
*/ staticint filter_event_cmp(constvoid *pa, constvoid *pb)
{ return filter_cmp(pa, pb, (KVM_PMU_MASKED_ENTRY_EVENT_SELECT));
}
/*
 * NOTE(review): fragment of a filter-matching helper; the enclosing
 * function header is not visible in this chunk.
 */
index = find_filter_index(events, nevents, event_select);
if (index < 0)
	return false;

/*
 * Entries are sorted by the event select.  Walk the list in both
 * directions to process all entries with the targeted event select.
 */
for (i = index; i < nevents; i++) {
	if (filter_event_cmp(&events[i], &event_select))
		break;

	if (is_filter_entry_match(events[i], umask))
		return true;
}

for (i = index - 1; i >= 0; i--) {
	if (filter_event_cmp(&events[i], &event_select))
		break;

	if (is_filter_entry_match(events[i], umask))
		return true;
}
/*
 * The reprogramming bitmap can be written asynchronously by something
 * other than the task that holds vcpu->mutex, take care to clear only
 * the bits that will actually be processed.
 */
BUILD_BUG_ON(sizeof(bitmap) != sizeof(atomic64_t));
atomic64_andnot(*(s64 *)bitmap, &pmu->__reprogram_pmi);

kvm_for_each_pmc(pmu, pmc, bit, bitmap) {
	/*
	 * If reprogramming fails, e.g. due to contention, re-set the
	 * reprogram bit, i.e. opportunistically try again on the next
	 * PMU refresh.  Don't make a new request as doing so can stall
	 * the guest if reprogramming repeatedly fails.
	 */
	if (reprogram_counter(pmc))
		set_bit(pmc->idx, pmu->reprogram_pmi);
}

/*
 * Release unused perf_events if the corresponding guest MSRs weren't
 * accessed during the last vCPU time slice (need_cleanup is set when
 * the vCPU is scheduled back in).
 */
if (unlikely(pmu->need_cleanup))
	kvm_pmu_cleanup(vcpu);
}
int kvm_pmu_check_rdpmc_early(struct kvm_vcpu *vcpu, unsignedint idx)
{ /* * On Intel, VMX interception has priority over RDPMC exceptions that * aren't already handled by the emulator, i.e. there are no additional * check needed for Intel PMUs. * * On AMD, _all_ exceptions on RDPMC have priority over SVM intercepts, * i.e. an invalid PMC results in a #GP, not #VMEXIT.
*/ if (!kvm_pmu_ops.check_rdpmc_early) return 0;
/* Read the architectural global PMU MSRs; defer everything else to the vendor. */
switch (msr) {
case MSR_CORE_PERF_GLOBAL_STATUS:
case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS:
	msr_info->data = pmu->global_status;
	break;
case MSR_AMD64_PERF_CNTR_GLOBAL_CTL:
case MSR_CORE_PERF_GLOBAL_CTRL:
	msr_info->data = pmu->global_ctrl;
	break;
case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR:
case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_SET:
case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
	/* Write-only style MSRs always read back as zero. */
	msr_info->data = 0;
	break;
default:
	return kvm_pmu_call(get_msr)(vcpu, msr_info);
}
/*
 * Note, AMD ignores writes to reserved bits and read-only PMU MSRs,
 * whereas Intel generates #GP on attempts to write reserved/RO MSRs.
 */
switch (msr) {
case MSR_CORE_PERF_GLOBAL_STATUS:
	if (!msr_info->host_initiated)
		return 1; /* RO MSR */
	fallthrough;
case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS:
	/* Per PPR, Read-only MSR. Writes are ignored. */
	if (!msr_info->host_initiated)
		break;

	if (data & pmu->global_status_rsvd)
		return 1;

	pmu->global_status = data;
	break;
case MSR_AMD64_PERF_CNTR_GLOBAL_CTL:
	data &= ~pmu->global_ctrl_rsvd;
	fallthrough;
case MSR_CORE_PERF_GLOBAL_CTRL:
	if (!kvm_valid_perf_global_ctrl(pmu, data))
		return 1;

	if (pmu->global_ctrl != data) {
		diff = pmu->global_ctrl ^ data;
		pmu->global_ctrl = data;
		reprogram_counters(pmu, diff);
	}
	break;
case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
	/*
	 * GLOBAL_OVF_CTRL, a.k.a. GLOBAL STATUS_RESET, clears bits in
	 * GLOBAL_STATUS, and so the set of reserved bits is the same.
	 */
	if (data & pmu->global_status_rsvd)
		return 1;
	fallthrough;
case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR:
	if (!msr_info->host_initiated)
		pmu->global_status &= ~data;
	break;
case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_SET:
	if (!msr_info->host_initiated)
		pmu->global_status |= data & ~pmu->global_status_rsvd;
	break;
default:
	kvm_pmu_mark_pmc_in_use(vcpu, msr_info->index);
	return kvm_pmu_call(set_msr)(vcpu, msr_info);
}
/*
 * At RESET, both Intel and AMD CPUs set all enable bits for general
 * purpose counters in IA32_PERF_GLOBAL_CTRL (so that software that
 * was written for v1 PMUs don't unknowingly leave GP counters disabled
 * in the global controls).  Emulate that behavior when refreshing the
 * PMU so that userspace doesn't need to manually set PERF_GLOBAL_CTRL.
 */
if (kvm_pmu_has_perf_global_ctrl(pmu) && pmu->nr_arch_gp_counters)
	pmu->global_ctrl = GENMASK_ULL(pmu->nr_arch_gp_counters - 1, 0);
}
/* Release perf_events for vPMCs that have been unused for a full time slice. */
void kvm_pmu_cleanup(struct kvm_vcpu *vcpu)
{
	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
	struct kvm_pmc *pmc = NULL;
	DECLARE_BITMAP(bitmask, X86_PMC_IDX_MAX);
	int i;
/*
 * NOTE(review): this fragment appears to belong to a CPL-matching helper,
 * not to the function started just above — the chunk looks scrambled here;
 * confirm placement against the full file.
 *
 * Skip the CPL lookup, which isn't free on Intel, if the result will
 * be the same regardless of the CPL.
 */
if (select_os == select_user)
	return select_os;
kvm_for_each_pmc(pmu, pmc, i, bitmap) {
	/*
	 * Ignore checks for edge detect (all events currently emulated
	 * but KVM are always rising edges), pin control (unsupported
	 * by modern CPUs), and counter mask and its invert flag (KVM
	 * doesn't emulate multiple events in a single clock cycle).
	 *
	 * Note, the uppermost nibble of AMD's mask overlaps Intel's
	 * IN_TX (bit 32) and IN_TXCP (bit 33), as well as two reserved
	 * bits (bits 35:34).  Checking the "in HLE/RTM transaction"
	 * flags is correct as the vCPU can't be in a transaction if
	 * KVM is emulating an instruction.  Checking the reserved bits
	 * might be wrong if they are defined in the future, but so
	 * could ignoring them, so do the simple thing for now.
	 */
	if (((pmc->eventsel ^ eventsel) & AMD64_RAW_EVENT_MASK_NB) ||
	    !pmc_event_is_allowed(pmc) || !cpl_is_matched(pmc))
		continue;
for (i = 0; i < filter->nevents; i++) { if (filter->events[i] & ~mask) returnfalse;
}
returntrue;
}
/*
 * Canonicalize a legacy (unmasked) filter into the masked-event format so
 * that a single code path can filter both flavors.  Compacts the events
 * array in place and updates nevents accordingly.
 */
static void convert_to_masked_filter(struct kvm_x86_pmu_event_filter *filter)
{
	int i, j;

	for (i = 0, j = 0; i < filter->nevents; i++) {
		/*
		 * Skip events that are impossible to match against a guest
		 * event.  When filtering, only the event select + unit mask
		 * of the guest event is used.  To maintain backwards
		 * compatibility, impossible filters can't be rejected :-(
		 */
		if (filter->events[i] & ~(kvm_pmu_ops.EVENTSEL_EVENT |
					  ARCH_PERFMON_EVENTSEL_UMASK))
			continue;
		/*
		 * Convert userspace events to a common in-kernel event so
		 * only one code path is needed to support both events.  For
		 * the in-kernel events use masked events because they are
		 * flexible enough to handle both cases.  To convert to masked
		 * events all that's needed is to add an "all ones" umask_mask,
		 * (unmasked filter events don't support EXCLUDE).
		 */
		filter->events[j++] = filter->events[i] |
				      (0xFFULL << KVM_PMU_MASKED_ENTRY_UMASK_MASK_SHIFT);
	}

	filter->nevents = j;
}
staticint prepare_filter_lists(struct kvm_x86_pmu_event_filter *filter)
{ int i;
if (!(filter->flags & KVM_PMU_EVENT_FLAG_MASKED_EVENTS))
convert_to_masked_filter(filter); elseif (!is_masked_filter_valid(filter)) return -EINVAL;
/* * Sort entries by event select and includes vs. excludes so that all * entries for a given event select can be processed efficiently during * filtering. The EXCLUDE flag uses a more significant bit than the * event select, and so the sorted list is also effectively split into * includes and excludes sub-lists.
*/
sort(&filter->events, filter->nevents, sizeof(filter->events[0]),
filter_sort_cmp, NULL);
i = filter->nevents; /* Find the first EXCLUDE event (only supported for masked events). */ if (filter->flags & KVM_PMU_EVENT_FLAG_MASKED_EVENTS) { for (i = 0; i < filter->nevents; i++) { if (filter->events[i] & KVM_PMU_MASKED_ENTRY_EXCLUDE) break;
}
}
/*
 * NOTE(review): extraneous trailing text (a German website disclaimer) was
 * embedded in this file and is not part of the source; commented out and
 * translated pending removal:
 * "The information on this web page was compiled carefully to the best of
 *  our knowledge.  However, neither completeness, nor correctness, nor
 *  quality of the provided information is guaranteed.
 *  Remark: the colored syntax highlighting and the measurement are still
 *  experimental."
 */