/* 14 is the version for Xeon and Pentium 8.4.8 */
#define APIC_VERSION			0x14UL
/* Length in bytes of the local APIC MMIO window: 1 << 12, i.e. 4096. */
#define LAPIC_MMIO_LENGTH		(1 << 12)
/* * Enable local APIC timer advancement (tscdeadline mode only) with adaptive * tuning. When enabled, KVM programs the host timer event to fire early, i.e. * before the deadline expires, to account for the delay between taking the * VM-Exit (to inject the guest event) and the subsequent VM-Enter to resume * the guest, i.e. so that the interrupt arrives in the guest with minimal * latency relative to the deadline programmed by the guest.
*/ staticbool lapic_timer_advance __read_mostly = true;
module_param(lapic_timer_advance, bool, 0444);
/* * For simplicity, KVM always allocates enough space for all possible * xAPIC IDs. Yell, but don't kill the VM, as KVM can continue on * without the optimized map.
*/ if (WARN_ON_ONCE(xapic_id > new->max_apic_id)) return -EINVAL;
/* * Bail if a vCPU was added and/or enabled its APIC between allocating * the map and doing the actual calculations for the map. Note, KVM * hardcodes the x2APIC ID to vcpu_id, i.e. there's no TOCTOU bug if * the compiler decides to reload x2apic_id after this check.
*/ if (x2apic_id > new->max_apic_id) return -E2BIG;
/* * Deliberately truncate the vCPU ID when detecting a mismatched APIC * ID to avoid false positives if the vCPU ID, i.e. x2APIC ID, is a * 32-bit value. Any unwanted aliasing that results from the truncation * will be detected below.
*/ if (!apic_x2apic_mode(apic) && xapic_id != (u8)vcpu->vcpu_id)
*xapic_id_mismatch = true;
/* * Apply KVM's hotplug hack if userspace has enabled 32-bit APIC IDs. * Allow sending events to vCPUs by their x2APIC ID even if the target * vCPU is in legacy xAPIC mode, and silently ignore aliased xAPIC IDs * (the x2APIC ID is truncated to 8 bits, causing IDs > 0xff to wrap * and collide). * * Honor the architectural (and KVM's non-optimized) behavior if * userspace has not enabled 32-bit x2APIC IDs. Each APIC is supposed * to process messages independently. If multiple vCPUs have the same * effective APIC ID, e.g. due to the x2APIC wrap or because the guest * manually modified its xAPIC IDs, events targeting that ID are * supposed to be recognized by all vCPUs with said ID.
*/ if (vcpu->kvm->arch.x2apic_format) { /* See also kvm_apic_match_physical_addr(). */ if (apic_x2apic_mode(apic) || x2apic_id > 0xff)
new->phys_map[x2apic_id] = apic;
if (!apic_x2apic_mode(apic) && !new->phys_map[xapic_id])
new->phys_map[xapic_id] = apic;
} else { /* * Disable the optimized map if the physical APIC ID is already * mapped, i.e. is aliased to multiple vCPUs. The optimized * map requires a strict 1:1 mapping between IDs and vCPUs.
*/ if (apic_x2apic_mode(apic))
physical_id = x2apic_id; else
physical_id = xapic_id;
/* * To optimize logical mode delivery, all software-enabled APICs must * be configured for the same mode.
*/ if (new->logical_mode == KVM_APIC_MODE_SW_DISABLED) {
new->logical_mode = logical_mode;
} elseif (new->logical_mode != logical_mode) {
new->logical_mode = KVM_APIC_MODE_MAP_DISABLED; return;
}
/* * In x2APIC mode, the LDR is read-only and derived directly from the * x2APIC ID, thus is guaranteed to be addressable. KVM reuses * kvm_apic_map.phys_map to optimize logical mode x2APIC interrupts by * reversing the LDR calculation to get cluster of APICs, i.e. no * additional work is required.
*/ if (apic_x2apic_mode(apic)) return;
/*
 * States of the APIC map dirty tracker.
 *
 * Transitions performed without holding a lock:
 *   CLEAN -> DIRTY
 *   UPDATE_IN_PROGRESS -> DIRTY
 *
 * Transitions performed with apic_map_lock held:
 *   DIRTY -> UPDATE_IN_PROGRESS
 *   UPDATE_IN_PROGRESS -> CLEAN
 */
enum {
	CLEAN			= 0,
	UPDATE_IN_PROGRESS	= 1,
	DIRTY			= 2,
};
staticvoid kvm_recalculate_apic_map(struct kvm *kvm)
{ struct kvm_apic_map *new, *old = NULL; struct kvm_vcpu *vcpu; unsignedlong i;
u32 max_id = 255; /* enough space for any xAPIC ID */ bool xapic_id_mismatch; int r;
/* Read kvm->arch.apic_map_dirty before kvm->arch.apic_map. */ if (atomic_read_acquire(&kvm->arch.apic_map_dirty) == CLEAN) return;
WARN_ONCE(!irqchip_in_kernel(kvm), "Dirty APIC map without an in-kernel local APIC");
mutex_lock(&kvm->arch.apic_map_lock);
retry: /* * Read kvm->arch.apic_map_dirty before kvm->arch.apic_map (if clean) * or the APIC registers (if dirty). Note, on retry the map may have * not yet been marked dirty by whatever task changed a vCPU's x2APIC * ID, i.e. the map may still show up as in-progress. In that case * this task still needs to retry and complete its calculation.
*/ if (atomic_cmpxchg_acquire(&kvm->arch.apic_map_dirty,
DIRTY, UPDATE_IN_PROGRESS) == CLEAN) { /* Someone else has updated the map. */
mutex_unlock(&kvm->arch.apic_map_lock); return;
}
/* * Reset the mismatch flag between attempts so that KVM does the right * thing if a vCPU changes its xAPIC ID, but do NOT reset max_id, i.e. * keep max_id strictly increasing. Disallowing max_id from shrinking * ensures KVM won't get stuck in an infinite loop, e.g. if the vCPU * with the highest x2APIC ID is toggling its APIC on and off.
*/
xapic_id_mismatch = false;
kvm_for_each_vcpu(i, vcpu, kvm) if (kvm_apic_present(vcpu))
max_id = max(max_id, kvm_x2apic_id(vcpu->arch.apic));
kvm_for_each_vcpu(i, vcpu, kvm) { if (!kvm_apic_present(vcpu)) continue;
r = kvm_recalculate_phys_map(new, vcpu, &xapic_id_mismatch); if (r) {
kvfree(new); new = NULL; if (r == -E2BIG) {
cond_resched(); goto retry;
}
goto out;
}
kvm_recalculate_logical_map(new, vcpu);
}
out: /* * The optimized map is effectively KVM's internal version of APICv, * and all unwanted aliasing that results in disabling the optimized * map also applies to APICv.
*/ if (!new)
kvm_set_apicv_inhibit(kvm, APICV_INHIBIT_REASON_PHYSICAL_ID_ALIASED); else
kvm_clear_apicv_inhibit(kvm, APICV_INHIBIT_REASON_PHYSICAL_ID_ALIASED);
if (xapic_id_mismatch)
kvm_set_apicv_inhibit(kvm, APICV_INHIBIT_REASON_APIC_ID_MODIFIED); else
kvm_clear_apicv_inhibit(kvm, APICV_INHIBIT_REASON_APIC_ID_MODIFIED);
old = rcu_dereference_protected(kvm->arch.apic_map,
lockdep_is_held(&kvm->arch.apic_map_lock));
rcu_assign_pointer(kvm->arch.apic_map, new); /* * Write kvm->arch.apic_map before clearing apic->apic_map_dirty. * If another update has come in, leave it DIRTY.
*/
atomic_cmpxchg_release(&kvm->arch.apic_map_dirty,
UPDATE_IN_PROGRESS, CLEAN);
mutex_unlock(&kvm->arch.apic_map_lock);
/* Check if there are APF page ready requests pending */ if (enabled) {
kvm_make_request(KVM_REQ_APF_READY, apic->vcpu);
kvm_xen_sw_enable_lapic(apic->vcpu);
}
}
v = APIC_VERSION | ((apic->nr_lvt_entries - 1) << 16);
/* * KVM emulates 82093AA datasheet (with in-kernel IOAPIC implementation) * which doesn't have EOI register; Some buggy OSes (e.g. Windows with * Hyper-V role) disable EOI broadcast in lapic not checking for IOAPIC * version first and level-triggered interrupts never get EOIed in * IOAPIC.
*/ if (guest_cpu_cap_has(vcpu, X86_FEATURE_X2APIC) &&
!ioapic_in_kernel(vcpu->kvm))
v |= APIC_LVR_DIRECTED_EOI;
kvm_lapic_set_reg(apic, APIC_LVR, v);
}
/*
 * Bring the local APIC's LVT entry count in line with the value computed by
 * kvm_apic_calc_nr_lvt_entries() for @vcpu.
 *
 * Does nothing if the vCPU has no in-kernel local APIC or if the count is
 * already up to date.  Otherwise every newly exposed LVT entry is written
 * with APIC_LVT_MASKED, the new count is recorded, and the version register
 * is regenerated.
 */
void kvm_apic_after_set_mcg_cap(struct kvm_vcpu *vcpu)
{
	int wanted = kvm_apic_calc_nr_lvt_entries(vcpu);
	struct kvm_lapic *apic = vcpu->arch.apic;
	int idx;

	if (!lapic_in_kernel(vcpu))
		return;

	if (wanted == apic->nr_lvt_entries)
		return;

	/* Entries in [old count, new count) are "new": start them out masked. */
	idx = apic->nr_lvt_entries;
	while (idx < wanted) {
		kvm_lapic_set_reg(apic, APIC_LVTx(idx), APIC_LVT_MASKED);
		idx++;
	}

	apic->nr_lvt_entries = wanted;

	/* The version register reflects the number of LVT entries; refresh it. */
	kvm_apic_set_version(vcpu);
}
/* * With APIC virtualization enabled, all caching is disabled * because the processor can modify ISR under the hood. Instead * just set SVI.
*/ if (unlikely(apic->apicv_active))
kvm_x86_call(hwapic_isr_update)(apic->vcpu, vec); else {
++apic->isr_count;
BUG_ON(apic->isr_count > MAX_APIC_VECTOR); /* * ISR (in service register) bit is set when injecting an interrupt. * The highest vector is injected. Thus the latest bit set matches * the highest bit in ISR.
*/
apic->highest_isr_cache = vec;
}
}
staticinlineint apic_find_highest_isr(struct kvm_lapic *apic)
{ int result;
/* * Note that isr_count is always 1, and highest_isr_cache * is always -1, with APIC virtualization enabled.
*/ if (!apic->isr_count) return -1; if (likely(apic->highest_isr_cache != -1)) return apic->highest_isr_cache;
result = apic_find_highest_vector(apic->regs + APIC_ISR);
ASSERT(result == -1 || result >= 16);
/* * We do get here for APIC virtualization enabled if the guest * uses the Hyper-V APIC enlightenment. In this case we may need * to trigger a new interrupt delivery by writing the SVI field; * on the other hand isr_count and highest_isr_cache are unused * and must be left alone.
*/ if (unlikely(apic->apicv_active))
kvm_x86_call(hwapic_isr_update)(apic->vcpu, apic_find_highest_isr(apic)); else {
--apic->isr_count;
BUG_ON(apic->isr_count < 0);
apic->highest_isr_cache = -1;
}
}
/*
 * Exported wrapper that returns apic_find_highest_irr() for @vcpu's local
 * APIC.
 */
int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu)
{
	struct kvm_lapic *apic = vcpu->arch.apic;

	/*
	 * This lookup can race with __apic_accept_irq() setting an IRR bit, so
	 * the value returned may be wrong.  That is tolerated because the
	 * kvm_vcpu_kick() in __apic_accept_irq() causes an immediate vmexit and
	 * the value is recalculated on the next vmentry.
	 */
	return apic_find_highest_irr(apic);
}
EXPORT_SYMBOL_GPL(kvm_lapic_find_highest_irr);
/* Forward declaration; the definition appears further down in this file. */
static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
			     int vector, int level, int trig_mode,
			     struct dest_map *dest_map);
if (val && pv_eoi_put_user(vcpu, KVM_PV_EOI_DISABLED) < 0) returnfalse;
/* * Clear pending bit in any case: it will be set again on vmentry. * While this might not be ideal from performance point of view, * this makes sure pv eoi is only enabled when we know it's safe.
*/
__clear_bit(KVM_APIC_PV_EOI_PENDING, &vcpu->arch.apic_attention);
/* * Hotplug hack: Accept interrupts for vCPUs in xAPIC mode as if they * were in x2APIC mode if the target APIC ID can't be encoded as an * xAPIC ID. This allows unique addressing of hotplugged vCPUs (which * start in xAPIC mode) with an APIC ID that is unaddressable in xAPIC * mode. Match the x2APIC ID if and only if the target APIC ID can't * be encoded in xAPIC to avoid spurious matches against a vCPU that * changed its (addressable) xAPIC ID (which is writable).
*/ if (apic_x2apic_mode(apic) || mda > 0xff) return mda == kvm_x2apic_id(apic);
/* The KVM local APIC implementation has two quirks: * * - Real hardware delivers interrupts destined to x2APIC ID > 0xff to LAPICs * in xAPIC mode if the "destination & 0xff" matches its xAPIC ID. * KVM doesn't do that aliasing. * * - in-kernel IOAPIC messages have to be delivered directly to * x2APIC, because the kernel does not support interrupt remapping. * In order to support broadcast without interrupt remapping, x2APIC * rewrites the destination of non-IPI messages from APIC_BROADCAST * to X2APIC_BROADCAST. * * The broadcast quirk can be disabled with KVM_CAP_X2APIC_API. This is * important when userspace wants to use x2APIC-format MSIs, because * APIC_BROADCAST (0xff) is a legal route for "cluster 0, CPUs 0-7".
*/ static u32 kvm_apic_mda(struct kvm_vcpu *vcpu, unsignedint dest_id, struct kvm_lapic *source, struct kvm_lapic *target)
{ bool ipi = source != NULL;
/* Return true if the interrupt can be handled by using *bitmap as index mask * for valid destinations in *dst array. * Return false if kvm_apic_map_get_dest_lapic did nothing useful. * Note: we may have zero kvm_lapic destinations when we return true, which * means that the interrupt should be dropped. In this case, *bitmap would be * zero and *dst undefined.
*/ staticinlinebool kvm_apic_map_get_dest_lapic(struct kvm *kvm, struct kvm_lapic **src, struct kvm_lapic_irq *irq, struct kvm_apic_map *map, struct kvm_lapic ***dst, unsignedlong *bitmap)
{ int i, lowest;
ret = kvm_apic_map_get_dest_lapic(kvm, &src, irq, map, &dst, &bitmap); if (ret) {
*r = 0;
for_each_set_bit(i, &bitmap, 16) { if (!dst[i]) continue;
*r += kvm_apic_set_irq(dst[i]->vcpu, irq, dest_map);
}
}
rcu_read_unlock(); return ret;
}
/* * This routine tries to handle interrupts in posted mode, here is how * it deals with different cases: * - For single-destination interrupts, handle it in posted mode * - Else if vector hashing is enabled and it is a lowest-priority * interrupt, handle it in posted mode and use the following mechanism * to find the destination vCPU. * 1. For lowest-priority interrupts, store all the possible * destination vCPUs in an array. * 2. Use "guest vector % max number of destination vCPUs" to find * the right destination vCPU in the array for the lowest-priority * interrupt. * - Otherwise, use remapped mode to inject the interrupt.
*/ bool kvm_intr_is_single_vcpu_fast(struct kvm *kvm, struct kvm_lapic_irq *irq, struct kvm_vcpu **dest_vcpu)
{ struct kvm_apic_map *map; unsignedlong bitmap; struct kvm_lapic **dst = NULL; bool ret = false;
if (kvm_apic_map_get_dest_lapic(kvm, NULL, irq, map, &dst, &bitmap) &&
hweight16(bitmap) == 1) { unsignedlong i = find_first_bit(&bitmap, 16);
if (dst[i]) {
*dest_vcpu = dst[i]->vcpu;
ret = true;
}
}
rcu_read_unlock(); return ret;
}
/* * Add a pending IRQ into lapic. * Return 1 if successfully added and 0 if discarded.
*/ staticint __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, int vector, int level, int trig_mode, struct dest_map *dest_map)
{ int result = 0; struct kvm_vcpu *vcpu = apic->vcpu;
trace_kvm_apic_accept_irq(vcpu->vcpu_id, delivery_mode,
trig_mode, vector); switch (delivery_mode) { case APIC_DM_LOWEST:
vcpu->arch.apic_arb_prio++;
fallthrough; case APIC_DM_FIXED: if (unlikely(trig_mode && !level)) break;
/* FIXME add logic for vcpu on reset */ if (unlikely(!apic_enabled(apic))) break;
result = 1;
if (dest_map) {
__set_bit(vcpu->vcpu_id, dest_map->map);
dest_map->vectors[vcpu->vcpu_id] = vector;
}
if (apic_test_vector(vector, apic->regs + APIC_TMR) != !!trig_mode) { if (trig_mode)
apic_set_vector(vector, apic->regs + APIC_TMR); else
apic_clear_vector(vector, apic->regs + APIC_TMR);
}
case APIC_DM_REMRD:
result = 1;
vcpu->arch.pv.pv_unhalted = 1;
kvm_make_request(KVM_REQ_EVENT, vcpu);
kvm_vcpu_kick(vcpu); break;
case APIC_DM_SMI: if (!kvm_inject_smi(vcpu)) {
kvm_vcpu_kick(vcpu);
result = 1;
} break;
case APIC_DM_NMI:
result = 1;
kvm_inject_nmi(vcpu);
kvm_vcpu_kick(vcpu); break;
case APIC_DM_INIT: if (!trig_mode || level) {
result = 1; /* assumes that there are only KVM_APIC_INIT/SIPI */
apic->pending_events = (1UL << KVM_APIC_INIT);
kvm_make_request(KVM_REQ_EVENT, vcpu);
kvm_vcpu_kick(vcpu);
} break;
case APIC_DM_STARTUP:
result = 1;
apic->sipi_vector = vector; /* make sure sipi_vector is visible for the receiver */
smp_wmb();
set_bit(KVM_APIC_SIPI, &apic->pending_events);
kvm_make_request(KVM_REQ_EVENT, vcpu);
kvm_vcpu_kick(vcpu); break;
case APIC_DM_EXTINT: /* * Should only be called by kvm_apic_local_deliver() with LVT0, * before NMI watchdog was enabled. Already handled by * kvm_apic_accept_pic_intr().
*/ break;
/* * This routine identifies the destination vcpus mask meant to receive the * IOAPIC interrupts. It either uses kvm_apic_map_get_dest_lapic() to find * out the destination vcpus array and set the bitmap or it traverses to * each available vcpu to identify the same.
*/ void kvm_bitmap_or_dest_vcpus(struct kvm *kvm, struct kvm_lapic_irq *irq, unsignedlong *vcpu_bitmap)
{ struct kvm_lapic **dest_vcpu = NULL; struct kvm_lapic *src = NULL; struct kvm_apic_map *map; struct kvm_vcpu *vcpu; unsignedlong bitmap, i; int vcpu_idx; bool ret;
staticvoid kvm_ioapic_send_eoi(struct kvm_lapic *apic, int vector)
{ int __maybe_unused trigger_mode;
/* Eoi the ioapic only if the ioapic doesn't own the vector. */ if (!kvm_ioapic_handles_vector(apic, vector)) return;
/* * If the intercepted EOI is for an IRQ that was pending from previous * routing, then re-scan the I/O APIC routes as EOIs for the IRQ likely * no longer need to be intercepted.
*/ if (apic->vcpu->arch.highest_stale_pending_ioapic_eoi == vector)
kvm_make_request(KVM_REQ_SCAN_IOAPIC, apic->vcpu);
/* Request a KVM exit to inform the userspace IOAPIC. */ if (irqchip_split(apic->vcpu->kvm)) {
apic->vcpu->arch.pending_ioapic_eoi = vector;
kvm_make_request(KVM_REQ_IOAPIC_EOI_EXIT, apic->vcpu); return;
}
/* * this interface assumes a trap-like exit, which has already finished * desired side effect including vISR and vPPR update.
*/ void kvm_apic_set_eoi_accelerated(struct kvm_vcpu *vcpu, int vector)
{ struct kvm_lapic *apic = vcpu->arch.apic;
case APIC_TMCCT: /* Timer CCR */ if (apic_lvtt_tscdeadline(apic)) return 0;
val = apic_get_tmcct(apic); break; case APIC_PROCPRI:
apic_update_ppr(apic);
val = kvm_lapic_get_reg(apic, offset); break; case APIC_TASKPRI:
report_tpr_access(apic, false);
fallthrough; default:
val = kvm_lapic_get_reg(apic, offset); break;
}
if (kvm_lapic_lvt_supported(apic, LVT_CMCI))
valid_reg_mask |= APIC_REG_MASK(APIC_LVTCMCI);
/* ARBPRI, DFR, and ICR2 are not valid in x2APIC mode. */ if (!apic_x2apic_mode(apic))
valid_reg_mask |= APIC_REG_MASK(APIC_ARBPRI) |
APIC_REG_MASK(APIC_DFR) |
APIC_REG_MASK(APIC_ICR2);
/* * WARN if KVM reads ICR in x2APIC mode, as it's an 8-byte register in * x2APIC and needs to be manually handled by the caller.
*/
WARN_ON_ONCE(apic_x2apic_mode(apic) && offset == APIC_ICR);
if (alignment + len > 4) return 1;
if (offset > 0x3f0 ||
!(kvm_lapic_readable_reg_mask(apic) & APIC_REG_MASK(offset))) return 1;
result = __apic_read(apic, offset & ~0xf);
trace_kvm_apic_read(offset, result);
switch (len) { case 1: case 2: case 4:
memcpy(data, (char *)&result + alignment, len); break; default:
printk(KERN_ERR "Local APIC read with len = %x, " "should be 1,2, or 4 instead\n", len); break;
} return 0;
}
/*
 * Clamp the period of a periodic local APIC timer to a minimum of
 * min_timer_period_us (converted to nanoseconds).
 *
 * Guests must not be allowed to program periodic timers with a tiny
 * interval, because the hrtimers are not throttled by the host scheduler.
 * Timers that are not periodic, or whose period is zero, are left alone.
 */
static void limit_periodic_timer_frequency(struct kvm_lapic *apic)
{
	s64 floor_ns;

	if (!apic_lvtt_period(apic) || !apic->lapic_timer.period)
		return;

	floor_ns = min_timer_period_us * 1000LL;
	if (apic->lapic_timer.period >= floor_ns)
		return;

	/* Log the clamp (once) before overwriting the requested period. */
	pr_info_once(
	    "vcpu %i: requested %lld ns "
	    "lapic timer period limited to %lld ns\n",
	    apic->vcpu->vcpu_id,
	    apic->lapic_timer.period, floor_ns);
	apic->lapic_timer.period = floor_ns;
}
/* * Assume a timer IRQ was "injected" if the APIC is protected. KVM's * copy of the vIRR is bogus, it's the responsibility of the caller to * precisely check whether or not a timer IRQ is pending.
*/ if (apic->guest_apic_protected) returntrue;
reg = kvm_lapic_get_reg(apic, APIC_LVTT); if (kvm_apic_hw_enabled(apic)) { int vec = reg & APIC_VECTOR_MASK; void *bitmap = apic->regs + APIC_ISR;
if (apic->apicv_active)
bitmap = apic->regs + APIC_IRR;
if (apic_test_vector(vec, bitmap)) returntrue;
} returnfalse;
}
/* * If the guest TSC is running at a different ratio than the host, then * convert the delay to nanoseconds to achieve an accurate delay. Note * that __delay() uses delay_tsc whenever the hardware has TSC, thus * always for VMX enabled hardware.
*/ if (vcpu->arch.tsc_scaling_ratio == kvm_caps.default_tsc_scaling_ratio) {
__delay(min(guest_cycles,
nsec_to_cycles(vcpu, timer_advance_ns)));
} else {
u64 delay_ns = guest_cycles * 1000000ULL;
do_div(delay_ns, vcpu->arch.virtual_tsc_khz);
ndelay(min_t(u32, delay_ns, timer_advance_ns));
}
}
/* Do not adjust for tiny fluctuations or large random spikes. */ if (abs(advance_expire_delta) > LAPIC_TIMER_ADVANCE_ADJUST_MAX ||
abs(advance_expire_delta) < LAPIC_TIMER_ADVANCE_ADJUST_MIN) return;
/* too early */ if (advance_expire_delta < 0) {
ns = -advance_expire_delta * 1000000ULL;
do_div(ns, vcpu->arch.virtual_tsc_khz);
timer_advance_ns -= ns/LAPIC_TIMER_ADVANCE_ADJUST_STEP;
} else { /* too late */
ns = advance_expire_delta * 1000000ULL;
do_div(ns, vcpu->arch.virtual_tsc_khz);
timer_advance_ns += ns/LAPIC_TIMER_ADVANCE_ADJUST_STEP;
}
/* * If the timer fired early, reread the TSC to account for the overhead * of the above adjustment to avoid waiting longer than is necessary.
*/ if (guest_tsc < tsc_deadline)
guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc());
if (guest_tsc < tsc_deadline)
__wait_lapic_expire(vcpu, tsc_deadline - guest_tsc);
}
if (kvm_use_posted_timer_interrupt(apic->vcpu)) { /* * Ensure the guest's timer has truly expired before posting an * interrupt. Open code the relevant checks to avoid querying * lapic_timer_int_injected(), which will be false since the * interrupt isn't yet injected. Waiting until after injecting * is not an option since that won't help a posted interrupt.
*/ if (vcpu->arch.apic->lapic_timer.expired_tscdeadline &&
vcpu->arch.apic->lapic_timer.timer_advance_ns)
__kvm_wait_lapic_expire(vcpu);
kvm_apic_inject_pending_timer_irqs(apic); return;
}
atomic_inc(&apic->lapic_timer.pending);
kvm_make_request(KVM_REQ_UNBLOCK, vcpu); if (from_timer_fn)
kvm_vcpu_kick(vcpu);
}
/* * Synchronize both deadlines to the same time source or * differences in the periods (caused by differences in the * underlying clocks or numerical approximation errors) will * cause the two to drift apart over time as the errors * accumulate.
*/
apic->lapic_timer.target_expiration =
ktime_add_ns(apic->lapic_timer.target_expiration,
apic->lapic_timer.period);
delta = ktime_sub(apic->lapic_timer.target_expiration, now);
apic->lapic_timer.tscdeadline = kvm_read_l1_tsc(apic->vcpu, tscl) +
nsec_to_cycles(apic->vcpu, delta);
}
staticvoid start_sw_period(struct kvm_lapic *apic)
{ if (!apic->lapic_timer.period) return;
if (ktime_after(ktime_get(),
apic->lapic_timer.target_expiration)) {
apic_timer_expired(apic, false);
/* * To simplify handling the periodic timer, leave the hv timer running * even if the deadline timer has expired, i.e. rely on the resulting * VM-Exit to recompute the periodic timer's target expiration.
*/ if (!apic_lvtt_period(apic)) { /* * Cancel the hv timer if the sw timer fired while the hv timer * was being programmed, or if the hv timer itself expired.
*/ if (atomic_read(&ktimer->pending)) {
cancel_hv_timer(apic);
} elseif (expired) {
apic_timer_expired(apic, false);
cancel_hv_timer(apic);
}
}
WARN_ON(preemptible()); if (apic->lapic_timer.hv_timer_in_use)
cancel_hv_timer(apic); if (!apic_lvtt_period(apic) && atomic_read(&ktimer->pending)) return;
preempt_disable(); /* If the preempt notifier has already run, it also called apic_timer_expired */ if (!apic->lapic_timer.hv_timer_in_use) goto out;
WARN_ON(kvm_vcpu_is_blocking(vcpu));
apic_timer_expired(apic, false);
cancel_hv_timer(apic);
preempt_disable(); /* Possibly the TSC deadline timer is not enabled yet */ if (apic->lapic_timer.hv_timer_in_use)
start_sw_timer(apic);
preempt_enable();
}
staticint kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
{ int ret = 0;
trace_kvm_apic_write(reg, val);
switch (reg) { case APIC_ID: /* Local APIC ID */ if (!apic_x2apic_mode(apic)) {
kvm_apic_set_xapic_id(apic, val >> 24);
} else {
ret = 1;
} break;
case APIC_TASKPRI:
report_tpr_access(apic, true);
apic_set_tpr(apic, val & 0xff); break;
case APIC_EOI:
apic_set_eoi(apic); break;
case APIC_LDR: if (!apic_x2apic_mode(apic))
kvm_apic_set_ldr(apic, val & APIC_LDR_MASK); else
ret = 1; break;
case APIC_DFR: if (!apic_x2apic_mode(apic))
kvm_apic_set_dfr(apic, val | 0x0FFFFFFF); else
ret = 1; break;
case APIC_SPIV: {
u32 mask = 0x3ff; if (kvm_lapic_get_reg(apic, APIC_LVR) & APIC_LVR_DIRECTED_EOI)
mask |= APIC_SPIV_DIRECTED_EOI;
apic_set_spiv(apic, val & mask); if (!(val & APIC_SPIV_APIC_ENABLED)) { int i;
for (i = 0; i < apic->nr_lvt_entries; i++) {
kvm_lapic_set_reg(apic, APIC_LVTx(i),
kvm_lapic_get_reg(apic, APIC_LVTx(i)) | APIC_LVT_MASKED);
}
apic_update_lvtt(apic);
atomic_set(&apic->lapic_timer.pending, 0);
} break;
} case APIC_ICR:
WARN_ON_ONCE(apic_x2apic_mode(apic));
/* No delay here, so we always clear the pending bit */
val &= ~APIC_ICR_BUSY;
kvm_apic_send_ipi(apic, val, kvm_lapic_get_reg(apic, APIC_ICR2));
kvm_lapic_set_reg(apic, APIC_ICR, val); break; case APIC_ICR2: if (apic_x2apic_mode(apic))
ret = 1; else
kvm_lapic_set_reg(apic, APIC_ICR2, val & 0xff000000); break;
case APIC_LVT0:
apic_manage_nmi_watchdog(apic, val);
fallthrough; case APIC_LVTTHMR: case APIC_LVTPC: case APIC_LVT1: case APIC_LVTERR: case APIC_LVTCMCI: {
u32 index = get_lvt_index(reg); if (!kvm_lapic_lvt_supported(apic, index)) {
ret = 1; break;
} if (!kvm_apic_sw_enabled(apic))
val |= APIC_LVT_MASKED;
val &= apic_lvt_mask[index];
kvm_lapic_set_reg(apic, reg, val); break;
}
case APIC_LVTT: if (!kvm_apic_sw_enabled(apic))
val |= APIC_LVT_MASKED;
val &= (apic_lvt_mask[LVT_TIMER] | apic->lapic_timer.timer_mode_mask);
kvm_lapic_set_reg(apic, APIC_LVTT, val);
apic_update_lvtt(apic); break;
case APIC_TMICT: if (apic_lvtt_tscdeadline(apic)) break;
case APIC_TDCR: {
uint32_t old_divisor = apic->divide_count;
kvm_lapic_set_reg(apic, APIC_TDCR, val & 0xb);
update_divide_count(apic); if (apic->divide_count != old_divisor &&
apic->lapic_timer.period) {
hrtimer_cancel(&apic->lapic_timer.timer);
update_target_expiration(apic, old_divisor);
restart_apic_timer(apic);
} break;
} case APIC_ESR: if (apic_x2apic_mode(apic) && val != 0)
ret = 1; break;
case APIC_SELF_IPI: /* * Self-IPI exists only when x2APIC is enabled. Bits 7:0 hold * the vector, everything else is reserved.
*/ if (!apic_x2apic_mode(apic) || (val & ~APIC_VECTOR_MASK))
ret = 1; else
kvm_apic_send_ipi(apic, APIC_DEST_SELF | val, 0); break; default:
ret = 1; break;
}
/* * Recalculate APIC maps if necessary, e.g. if the software enable bit * was toggled, the APIC ID changed, etc... The maps are marked dirty * on relevant changes, i.e. this is a nop for most writes.
*/
kvm_recalculate_apic_map(apic->vcpu->kvm);
int kvm_x2apic_icr_write(struct kvm_lapic *apic, u64 data)
{ if (data & X2APIC_ICR_RESERVED_BITS) return 1;
/* * The BUSY bit is reserved on both Intel and AMD in x2APIC mode, but * only AMD requires it to be zero, Intel essentially just ignores the * bit. And if IPI virtualization (Intel) or x2AVIC (AMD) is enabled, * the CPU performs the reserved bits checks, i.e. the underlying CPU * behavior will "win". Arbitrarily clear the BUSY bit, as there is no * sane way to provide consistent behavior with respect to hardware.
*/
data &= ~APIC_ICR_BUSY;
/* * ICR is a single 64-bit register when x2APIC is enabled, all other * registers hold 32-bit values. For legacy xAPIC, ICR writes need to * go down the common path to get the upper half from ICR2. * * Note, using the write helpers may incur an unnecessary write to the * virtual APIC state, but KVM needs to conditionally modify the value * in certain cases, e.g. to clear the ICR busy bit. The cost of extra * conditional branches is likely a wash relative to the cost of the * maybe-unnecessary write, and both are in the noise anyways.
*/ if (apic_x2apic_mode(apic) && offset == APIC_ICR)
WARN_ON_ONCE(kvm_x2apic_icr_write(apic, kvm_x2apic_icr_read(apic))); else
kvm_lapic_reg_write(apic, offset, kvm_lapic_get_reg(apic, offset));
}
EXPORT_SYMBOL_GPL(kvm_apic_write_nodecode);
/* * When APICv is enabled, KVM must always search the IRR for a pending * IRQ, as other vCPUs and devices can set IRR bits even if the vCPU * isn't running. If APICv is disabled, KVM _should_ search the IRR * for a pending IRQ. But KVM currently doesn't ensure *all* hardware, * e.g. CPUs and IOMMUs, has seen the change in state, i.e. searching * the IRR at this time could race with IRQ delivery from hardware that * still sees APICv as being enabled. * * FIXME: Ensure other vCPUs and devices observe the change in APICv * state prior to updating KVM's metadata caches, so that KVM * can safely search the IRR and set irr_pending accordingly.
*/
apic->irr_pending = true;
int kvm_alloc_apic_access_page(struct kvm *kvm)
{ void __user *hva; int ret = 0;
mutex_lock(&kvm->slots_lock); if (kvm->arch.apic_access_memslot_enabled ||
kvm->arch.apic_access_memslot_inhibited) goto out;
hva = __x86_set_memory_region(kvm, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT,
APIC_DEFAULT_PHYS_BASE, PAGE_SIZE); if (IS_ERR(hva)) {
ret = PTR_ERR(hva); goto out;
}
if (!kvm->arch.apic_access_memslot_enabled) return;
kvm_vcpu_srcu_read_unlock(vcpu);
mutex_lock(&kvm->slots_lock);
if (kvm->arch.apic_access_memslot_enabled) {
__x86_set_memory_region(kvm, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT, 0, 0); /* * Clear "enabled" after the memslot is deleted so that a * different vCPU doesn't get a false negative when checking * the flag out of slots_lock. No additional memory barrier is * needed as modifying memslots requires waiting other vCPUs to * drop SRCU (see above), and false positives are ok as the * flag is rechecked after acquiring slots_lock.
*/
kvm->arch.apic_access_memslot_enabled = false;
/* * Mark the memslot as inhibited to prevent reallocating the * memslot during vCPU creation, e.g. if a vCPU is hotplugged.
*/
kvm->arch.apic_access_memslot_inhibited = true;
}
if (!init_event) {
msr_val = APIC_DEFAULT_PHYS_BASE | MSR_IA32_APICBASE_ENABLE; if (kvm_vcpu_is_reset_bsp(vcpu))
msr_val |= MSR_IA32_APICBASE_BSP;
/* * Use the inner helper to avoid an extra recalculation of the * optimized APIC map if some other task has dirtied the map. * The recalculation needed for this vCPU will be done after * all APIC state has been initialized (see below).
*/
__kvm_apic_set_base(vcpu, msr_val);
}
if (!apic) return;
/* Stop the timer in case it's a reset to an active apic */
hrtimer_cancel(&apic->lapic_timer.timer);
/* The xAPIC ID is set at RESET even if the APIC was already enabled. */ if (!init_event)
kvm_apic_set_xapic_id(apic, vcpu->vcpu_id);
kvm_apic_set_version(apic->vcpu);
for (i = 0; i < apic->nr_lvt_entries; i++)
kvm_lapic_set_reg(apic, APIC_LVTx(i), APIC_LVT_MASKED);
apic_update_lvtt(apic); if (kvm_vcpu_is_reset_bsp(vcpu) &&
kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_LINT0_REENABLED))
kvm_lapic_set_reg(apic, APIC_LVT0,
SET_APIC_DELIVERY_MODE(0, APIC_MODE_EXTINT));
apic_manage_nmi_watchdog(apic, kvm_lapic_get_reg(apic, APIC_LVT0));
kvm_apic_set_dfr(apic, 0xffffffffU);
apic_set_spiv(apic, 0xff);
kvm_lapic_set_reg(apic, APIC_TASKPRI, 0); if (!apic_x2apic_mode(apic))
kvm_apic_set_ldr(apic, 0);
kvm_lapic_set_reg(apic, APIC_ESR, 0); if (!apic_x2apic_mode(apic)) {
kvm_lapic_set_reg(apic, APIC_ICR, 0);
kvm_lapic_set_reg(apic, APIC_ICR2, 0);
} else {
kvm_lapic_set_reg64(apic, APIC_ICR, 0);
}
kvm_lapic_set_reg(apic, APIC_TDCR, 0);
kvm_lapic_set_reg(apic, APIC_TMICT, 0); for (i = 0; i < 8; i++) {
kvm_lapic_set_reg(apic, APIC_IRR + 0x10 * i, 0);
kvm_lapic_set_reg(apic, APIC_ISR + 0x10 * i, 0);
kvm_lapic_set_reg(apic, APIC_TMR + 0x10 * i, 0);
}
kvm_apic_update_apicv(vcpu);
update_divide_count(apic);
atomic_set(&apic->lapic_timer.pending, 0);
hrtimer_setup(&apic->lapic_timer.timer, apic_timer_fn, CLOCK_MONOTONIC,
HRTIMER_MODE_ABS_HARD); if (lapic_timer_advance)
apic->lapic_timer.timer_advance_ns = LAPIC_TIMER_ADVANCE_NS_INIT;
/* * Stuff the APIC ENABLE bit in lieu of temporarily incrementing * apic_hw_disabled; the full RESET value is set by kvm_lapic_reset().
*/
vcpu->arch.apic_base = MSR_IA32_APICBASE_ENABLE;
static_branch_inc(&apic_sw_disabled.key); /* sw disabled at reset */
kvm_iodevice_init(&apic->dev, &apic_mmio_ops);
/* * Defer evaluating inhibits until the vCPU is first run, as this vCPU * will not get notified of any changes until this vCPU is visible to * other vCPUs (marked online and added to the set of vCPUs). * * Opportunistically mark APICv active as VMX in particular is highly * unlikely to have inhibits. Ignore the current per-VM APICv state so * that vCPU creation is guaranteed to run with a deterministic value, * the request will ensure the vCPU gets the correct state before VM-Entry.
*/ if (enable_apicv) {
apic->apicv_active = true;
kvm_make_request(KVM_REQ_APICV_UPDATE, vcpu);
}
/* * We get here even with APIC virtualization enabled, if doing * nested virtualization and L1 runs with the "acknowledge interrupt * on exit" mode. Then we cannot inject the interrupt via RVI, * because the process would deliver it through the IDT.
*/
apic_clear_irr(vector, apic); if (kvm_hv_synic_auto_eoi_set(vcpu, vector)) { /* * For auto-EOI interrupts, there might be another pending * interrupt above PPR, so check whether to raise another * KVM_REQ_EVENT.
*/
apic_update_ppr(apic);
} else { /* * For normal interrupts, PPR has been raised and there cannot * be a higher-priority pending interrupt---except if there was * a concurrent interrupt injection, but that would have * triggered KVM_REQ_EVENT already.
*/
apic_set_isr(vector, apic);
__apic_update_ppr(apic, &ppr);
}
if (vcpu->kvm->arch.x2apic_format) { if (*id != x2apic_id) return -EINVAL;
} else { /* * Ignore the userspace value when setting APIC state. * KVM's model is that the x2APIC ID is readonly, e.g. * KVM only supports delivering interrupts to KVM's * version of the x2APIC ID. However, for backwards * compatibility, don't reject attempts to set a * mismatched ID for userspace that hasn't opted into * x2apic_format.
*/ if (set)
*id = x2apic_id; else
*id = x2apic_id << 24;
}
/* * In x2APIC mode, the LDR is fixed and based on the id. And * if the ICR is _not_ split, ICR is internally a single 64-bit * register, but needs to be split to ICR+ICR2 in userspace for * backwards compatibility.
*/ if (set)
*ldr = kvm_apic_calc_x2apic_ldr(x2apic_id);
/* * Get calculated timer current count for remaining timer period (if * any) and store it in the returned register set.
*/
apic_set_reg(s->regs, APIC_TMCCT, __apic_read(vcpu->arch.apic, APIC_TMCCT));
return kvm_apic_state_fixup(vcpu, s, false);
}
int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s)
{ struct kvm_lapic *apic = vcpu->arch.apic; int r;
kvm_x86_call(apicv_pre_state_restore)(vcpu);
/* set SPIV separately to get count of SW disabled APICs right */
apic_set_spiv(apic, *((u32 *)(s->regs + APIC_SPIV)));
r = kvm_apic_state_fixup(vcpu, s, true); if (r) {
kvm_recalculate_apic_map(vcpu->kvm); return r;
}
memcpy(vcpu->arch.apic->regs, s->regs, sizeof(*s));
if (!lapic_in_kernel(vcpu) ||
kvm_can_post_timer_interrupt(vcpu)) return;
timer = &vcpu->arch.apic->lapic_timer.timer; if (hrtimer_cancel(timer))
hrtimer_start_expires(timer, HRTIMER_MODE_ABS_HARD);
}
/*
 * apic_sync_pv_eoi_from_guest - called on vmexit or cancel interrupt
 *
 * Detect whether the guest triggered PV EOI since the last entry.
 * If yes, set EOI on the guest's behalf.
 * Clear PV EOI in guest memory in any case.
 *
 * @vcpu: vCPU whose PV EOI state is inspected.
 * @apic: local APIC on which the EOI is performed when the guest signalled
 *        one.
 */
static void apic_sync_pv_eoi_from_guest(struct kvm_vcpu *vcpu,
					struct kvm_lapic *apic)
{
	int vector;

	/*
	 * PV EOI state is derived from KVM_APIC_PV_EOI_PENDING in host
	 * and KVM_PV_EOI_ENABLED in guest memory as follows:
	 *
	 * KVM_APIC_PV_EOI_PENDING is unset:
	 *	-> host disabled PV EOI.
	 * KVM_APIC_PV_EOI_PENDING is set, KVM_PV_EOI_ENABLED is set:
	 *	-> host enabled PV EOI, guest did not execute EOI yet.
	 * KVM_APIC_PV_EOI_PENDING is set, KVM_PV_EOI_ENABLED is unset:
	 *	-> host enabled PV EOI, guest executed EOI.
	 */
	BUG_ON(!pv_eoi_enabled(vcpu));

	/*
	 * A true result is treated as "nothing to do"; going by the table
	 * above this presumably means the guest has not executed the EOI
	 * (NOTE(review): confirm against pv_eoi_test_and_clr_pending()).
	 */
	if (pv_eoi_test_and_clr_pending(vcpu))
		return;

	/* Perform the EOI for the guest and trace the affected vector. */
	vector = apic_set_eoi(apic);
	trace_kvm_pv_eoi(apic, vector);
}
if (test_bit(KVM_APIC_PV_EOI_PENDING, &vcpu->arch.apic_attention))
apic_sync_pv_eoi_from_guest(vcpu, vcpu->arch.apic);
if (!test_bit(KVM_APIC_CHECK_VAPIC, &vcpu->arch.apic_attention)) return;
if (kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.apic->vapic_cache, &data, sizeof(u32))) return;
apic_set_tpr(vcpu->arch.apic, data & 0xff);
}
/* * apic_sync_pv_eoi_to_guest - called before vmentry * * Detect whether it's safe to enable PV EOI and * if yes do so.
*/ staticvoid apic_sync_pv_eoi_to_guest(struct kvm_vcpu *vcpu, struct kvm_lapic *apic)
{ if (!pv_eoi_enabled(vcpu) || /* IRR set or many bits in ISR: could be nested. */
apic->irr_pending || /* Cache not set: could be safe but we don't bother. */
apic->highest_isr_cache == -1 || /* Need EOI to update ioapic. */
kvm_ioapic_handles_vector(apic, apic->highest_isr_cache)) { /* * PV EOI was disabled by apic_sync_pv_eoi_from_guest * so we need not do anything here.
*/ return;
}
if (kvm_lapic_reg_read(apic, reg, 4, &low)) return 1;
*data = low;
return 0;
}
staticint kvm_lapic_msr_write(struct kvm_lapic *apic, u32 reg, u64 data)
{ /* * ICR is a 64-bit register in x2APIC mode (and Hyper-V PV vAPIC) and * can be written as such, all other registers remain accessible only * through 32-bit reads/writes.
*/ if (reg == APIC_ICR) return kvm_x2apic_icr_write(apic, data);
/* Bits 63:32 are reserved in all other registers. */ if (data >> 32) return 1;
/*
 * kvm_lapic_set_pv_eoi - record a new PV EOI MSR value for a vCPU.
 *
 * @vcpu: vCPU whose PV EOI configuration is updated.
 * @data: raw MSR value; KVM_MSR_ENABLED is the enable flag and the
 *        remaining bits are used as the guest address.
 * @len:  number of bytes the guest-memory cache must cover.
 *
 * Returns 1 if the address is not 4-byte aligned, the non-zero result of
 * kvm_gfn_to_hva_cache_init() if (re)initialising the cache fails, and 0 on
 * success.  The stored MSR value is only updated on success.
 */
int kvm_lapic_set_pv_eoi(struct kvm_vcpu *vcpu, u64 data, unsigned long len)
{
	u64 addr = data & ~KVM_MSR_ENABLED;
	struct gfn_to_hva_cache *ghc = &vcpu->arch.pv_eoi.data;
	unsigned long new_len;
	int ret;

	if (!IS_ALIGNED(addr, 4))
		return 1;

	if (data & KVM_MSR_ENABLED) {
		/*
		 * If the address is unchanged and the cache already covers at
		 * least @len bytes, keep the existing (possibly larger) length
		 * rather than shrinking the cache.
		 */
		if (addr == ghc->gpa && len <= ghc->len)
			new_len = ghc->len;
		else
			new_len = len;

		ret = kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, addr, new_len);
		if (ret)
			return ret;
	}

	vcpu->arch.pv_eoi.msr_val = data;

	return 0;
}
/*
 * kvm_apic_accept_events - process pending INIT and SIPI events of a vCPU.
 *
 * Returns 0 in all cases except when kvm_check_nested_events() reports a
 * negative error other than -EBUSY, which is passed back to the caller.
 */
int kvm_apic_accept_events(struct kvm_vcpu *vcpu)
{
	struct kvm_lapic *apic = vcpu->arch.apic;
	u8 vector;
	int ret;

	/* Bail early unless the helper reports a pending INIT or SIPI. */
	if (!kvm_apic_has_pending_init_or_sipi(vcpu))
		return 0;

	if (is_guest_mode(vcpu)) {
		ret = kvm_check_nested_events(vcpu);
		/* -EBUSY is not reported to the caller as a failure. */
		if (ret == -EBUSY)
			return 0;
		if (ret < 0)
			return ret;
		/*
		 * Continue processing INIT/SIPI even if a nested VM-Exit
		 * occurred, e.g. pending SIPIs should be dropped if INIT+SIPI
		 * are blocked as a result of transitioning to VMX root mode.
		 */
	}

	/*
	 * INITs are blocked while CPU is in specific states (SMM, VMX root
	 * mode, SVM with GIF=0), while SIPIs are dropped if the CPU isn't in
	 * wait-for-SIPI (WFS).
	 */
	if (!kvm_apic_init_sipi_allowed(vcpu)) {
		WARN_ON_ONCE(vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED);
		clear_bit(KVM_APIC_SIPI, &apic->pending_events);
		return 0;
	}

	/* INIT: reset the vCPU; the BSP becomes runnable, others wait. */
	if (test_and_clear_bit(KVM_APIC_INIT, &apic->pending_events)) {
		kvm_vcpu_reset(vcpu, true);
		kvm_set_mp_state(vcpu, kvm_vcpu_is_bsp(apic->vcpu) ?
				       KVM_MP_STATE_RUNNABLE :
				       KVM_MP_STATE_INIT_RECEIVED);
	}

	/* SIPI: consumed unconditionally, delivered only after an INIT. */
	if (!test_and_clear_bit(KVM_APIC_SIPI, &apic->pending_events))
		return 0;

	if (vcpu->arch.mp_state != KVM_MP_STATE_INIT_RECEIVED)
		return 0;

	/* evaluate pending_events before reading the vector */
	smp_rmb();
	vector = apic->sipi_vector;
	kvm_x86_call(vcpu_deliver_sipi_vector)(vcpu, vector);
	kvm_set_mp_state(vcpu, KVM_MP_STATE_RUNNABLE);

	return 0;
}
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereitgestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.