/* Grab CPPR of the most favored pending interrupt */
cppr = ack & 0xff; if (cppr < 8)
xc->pending |= 1 << cppr;
/* Check consistency */ if (cppr >= xc->hw_cppr)
pr_warn("KVM-XIVE: CPU %d odd ack CPPR, got %d at %d\n",
smp_processor_id(), cppr, xc->hw_cppr);
/* * Update our image of the HW CPPR. We don't yet modify * xc->cppr, this will be done as we scan for interrupts * in the queues.
*/
xc->hw_cppr = cppr;
}
val = __raw_readq(__x_eoi_page(xd) + offset); #ifdef __LITTLE_ENDIAN__
val >>= 64-8; #endif return (u8)val;
}
/*
 * Perform an EOI (End Of Interrupt) on a source, picking the EOI
 * mechanism required by the source's flags.
 *
 * @hw_irq: global HW interrupt number of the source
 * @xd:     XIVE data for the source (flags, ESB EOI/trigger pages)
 */
static void xive_vm_source_eoi(u32 hw_irq, struct xive_irq_data *xd)
{
	/* If the XIVE supports the new "store EOI" facility, use it */
	if (xd->flags & XIVE_IRQ_FLAG_STORE_EOI) {
		__raw_writeq(0, __x_eoi_page(xd) + XIVE_ESB_STORE_EOI);
	} else if (xd->flags & XIVE_IRQ_FLAG_LSI) {
		/*
		 * For LSIs the HW EOI cycle is used rather than PQ bits,
		 * as they are automatically re-triggered in HW when still
		 * pending.
		 */
		__raw_readq(__x_eoi_page(xd) + XIVE_ESB_LOAD_EOI);
	} else {
		uint64_t eoi_val;

		/*
		 * Otherwise for EOI, we use the special MMIO that does
		 * a clear of both P and Q and returns the old Q,
		 * except for LSIs where we use the "EOI cycle" special
		 * load.
		 *
		 * This allows us to then do a re-trigger if Q was set
		 * rather than synthesizing an interrupt in software.
		 */
		eoi_val = xive_vm_esb_load(xd, XIVE_ESB_SET_PQ_00);

		/* Re-trigger if needed (old Q was set and we have a trigger page) */
		if ((eoi_val & 1) && __x_trig_page(xd))
			__raw_writeq(0, __x_trig_page(xd));
	}
}
/* * Snapshot the queue page. The test further down for EOI * must use the same "copy" that was used by __xive_read_eq * since qpage can be set concurrently and we don't want * to miss an EOI.
*/
qpage = READ_ONCE(q->qpage);
skip_ipi: /* * Try to fetch from the queue. Will return 0 for a * non-queueing priority (ie, qpage = 0).
*/
hirq = __xive_read_eq(qpage, q->msk, &idx, &toggle);
/* * If this was a signal for an MFRR change done by * H_IPI we skip it. Additionally, if we were fetching * we EOI it now, thus re-enabling reception of a new * such signal. * * We also need to do that if prio is 0 and we had no * page for the queue. In this case, we have non-queued * IPI that needs to be EOId. * * This is safe because if we have another pending MFRR * change that wasn't observed above, the Q bit will have * been set and another occurrence of the IPI will trigger.
*/ if (hirq == XICS_IPI || (prio == 0 && !qpage)) { if (scan_type == scan_fetch) {
xive_vm_source_eoi(xc->vp_ipi,
&xc->vp_ipi_data);
q->idx = idx;
q->toggle = toggle;
} /* Loop back on same queue with updated idx/toggle */
WARN_ON(hirq && hirq != XICS_IPI); if (hirq) goto skip_ipi;
}
/* If it's the dummy interrupt, continue searching */ if (hirq == XICS_DUMMY) goto skip_ipi;
/* Clear the pending bit if the queue is now empty */ if (!hirq) {
pending &= ~(1 << prio);
/* * Check if the queue count needs adjusting due to * interrupts being moved away.
*/ if (atomic_read(&q->pending_count)) { int p = atomic_xchg(&q->pending_count, 0);
/* * If the most favoured prio we found pending is less * favored (or equal) than a pending IPI, we return * the IPI instead.
*/ if (prio >= xc->mfrr && xc->mfrr < xc->cppr) {
prio = xc->mfrr;
hirq = XICS_IPI; break;
}
/* If fetching, update queue pointers */ if (scan_type == scan_fetch) {
q->idx = idx;
q->toggle = toggle;
}
}
/* If we are just taking a "peek", do nothing else */ if (scan_type == scan_poll) return hirq;
/* Update the pending bits */
xc->pending = pending;
/* * If this is an EOI that's it, no CPPR adjustment done here, * all we needed was cleanup the stale pending bits and check * if there's anything left.
*/ if (scan_type == scan_eoi) return hirq;
/* * If we found an interrupt, adjust what the guest CPPR should * be as if we had just fetched that interrupt from HW. * * Note: This can only make xc->cppr smaller as the previous * loop will only exit with hirq != 0 if prio is lower than * the current xc->cppr. Thus we don't need to re-check xc->mfrr * for pending IPIs.
*/ if (hirq)
xc->cppr = prio; /* * If it was an IPI the HW CPPR might have been lowered too much * as the HW interrupt we use for IPIs is routed to priority 0. * * We re-sync it here.
*/ if (xc->cppr != xc->hw_cppr) {
xc->hw_cppr = xc->cppr;
__raw_writeb(xc->cppr, xive_tima + TM_QW1_OS + TM_CPPR);
}
/* That should never hit */ if (hirq & 0xff000000)
pr_warn("XIVE: Weird guest interrupt number 0x%08x\n", hirq);
/* * XXX We could check if the interrupt is masked here and * filter it. If we chose to do so, we would need to do: * * if (masked) { * lock(); * if (masked) { * old_Q = true; * hirq = 0; * } * unlock(); * }
*/
/* Return interrupt and old CPPR in GPR4 */
kvmppc_set_gpr(vcpu, 4, hirq | (old_cppr << 24));
/* Grab the target VCPU if not the current one */ if (xc->server_num != server) {
vcpu = kvmppc_xive_find_server(vcpu->kvm, server); if (!vcpu) return H_PARAMETER;
xc = vcpu->arch.xive_vcpu;
/* Scan all priorities */
pending = 0xff;
} else { /* Grab pending interrupt if any */
__be64 qw1 = __raw_readq(xive_tima + TM_QW1_OS);
u8 pipr = be64_to_cpu(qw1) & 0xff;
/* Remember old and update SW state */
old_cppr = xc->cppr;
xc->cppr = cppr;
/* * Order the above update of xc->cppr with the subsequent * read of xc->mfrr inside push_pending_to_hw()
*/
smp_mb();
if (cppr > old_cppr) { /* * We are masking less, we need to look for pending things * to deliver and set VP pending bits accordingly to trigger * a new interrupt otherwise we might miss MFRR changes for * which we have optimized out sending an IPI signal.
*/
xive_vm_push_pending_to_hw(xc);
} else { /* * We are masking more, we need to check the queue for any * interrupt that has been routed to another CPU, take * it out (replace it with the dummy) and retrigger it. * * This is necessary since those interrupts may otherwise * never be processed, at least not until this CPU restores * its CPPR. * * This is in theory racy vs. HW adding new interrupts to * the queue. In practice this works because the interesting * cases are when the guest has done a set_xive() to move the * interrupt away, which flushes the xive, followed by the * target CPU doing a H_CPPR. So any new interrupt coming into * the queue must still be routed to us and isn't a source * of concern.
*/
xive_vm_scan_for_rerouted_irqs(xive, xc);
}
/* * IPIs are synthesized from MFRR and thus don't need * any special EOI handling. The underlying interrupt * used to signal MFRR changes is EOId when fetched from * the queue.
*/ if (irq == XICS_IPI || irq == 0) { /* * This barrier orders the setting of xc->cppr vs. * subsequent test of xc->mfrr done inside * scan_interrupts and push_pending_to_hw
*/
smp_mb(); goto bail;
}
/* Find interrupt source */
sb = kvmppc_xive_find_source(xive, irq, &src); if (!sb) {
pr_devel(" source not found !\n");
rc = H_PARAMETER; /* Same as above */
smp_mb(); goto bail;
}
state = &sb->irq_state[src];
kvmppc_xive_select_irq(state, &hw_num, &xd);
state->in_eoi = true;
/* * This barrier orders both setting of in_eoi above vs, * subsequent test of guest_priority, and the setting * of xc->cppr vs. subsequent test of xc->mfrr done inside * scan_interrupts and push_pending_to_hw
*/
smp_mb();
again: if (state->guest_priority == MASKED) {
arch_spin_lock(&sb->lock); if (state->guest_priority != MASKED) {
arch_spin_unlock(&sb->lock); goto again;
}
pr_devel(" EOI on saved P...\n");
/* Clear old_p, that will cause unmask to perform an EOI */
state->old_p = false;
arch_spin_unlock(&sb->lock);
} else {
pr_devel(" EOI on source...\n");
/* Perform EOI on the source */
xive_vm_source_eoi(hw_num, xd);
/* If it's an emulated LSI, check level and resend */ if (state->lsi && state->asserted)
__raw_writeq(0, __x_trig_page(xd));
}
/* * This barrier orders the above guest_priority check * and spin_lock/unlock with clearing in_eoi below. * * It also has to be a full mb() as it must ensure * the MMIOs done in source_eoi() are completed before * state->in_eoi is visible.
*/
mb();
state->in_eoi = false;
bail:
/* Re-evaluate pending IRQs and update HW */
xive_vm_scan_interrupts(xc, xc->pending, scan_eoi);
xive_vm_push_pending_to_hw(xc);
pr_devel(" after scan pending=%02x\n", xc->pending);
/* * The load of xc->cppr below and the subsequent MMIO store * to the IPI must happen after the above mfrr update is * globally visible so that: * * - Synchronize with another CPU doing an H_EOI or a H_CPPR * updating xc->cppr then reading xc->mfrr. * * - The target of the IPI sees the xc->mfrr update
*/
mb();
/* Shoot the IPI if most favored than target cppr */ if (mfrr < xc->cppr)
__raw_writeq(0, __x_trig_page(&xc->vp_ipi_data));
return H_SUCCESS;
}
/* * We leave a gap of a couple of interrupts in the queue to * account for the IPI and additional safety guard.
*/ #define XIVE_Q_GAP 2
if (xive->flags & KVMPPC_XIVE_FLAG_SAVE_RESTORE) return kvmppc_xive_vcpu_has_save_restore(vcpu);
returntrue;
}
/*
 * Push a vcpu's context to the XIVE on guest entry.
 * This assumes we are in virtual mode (MMU on)
 */
void kvmppc_xive_push_vcpu(struct kvm_vcpu *vcpu)
{
	void __iomem *tima = local_paca->kvm_hstate.xive_tima_virt;
	u64 old_pq;

	/*
	 * Nothing to do if the platform doesn't have a XIVE
	 * or this vCPU doesn't have its own XIVE context
	 * (e.g. because it's not using an in-kernel interrupt controller).
	 */
	if (!tima || !vcpu->arch.xive_cam_word)
		return;

	/*
	 * Clear the irq_pending flag. There is a small chance of a
	 * race vs. the escalation interrupt happening on another
	 * processor setting it again, but the only consequence is a
	 * spurious wakeup on the next H_CEDE, which is not an issue.
	 */
	vcpu->arch.irq_pending = 0;

	/*
	 * In single escalation mode, if the escalation interrupt is
	 * on, we mask it.
	 */
	if (!vcpu->arch.xive_esc_on)
		return;

	old_pq = __raw_readq((void __iomem *)(vcpu->arch.xive_esc_vaddr +
					      XIVE_ESB_SET_PQ_01));
	mb();

	/*
	 * We have a possible subtle race here: the escalation
	 * interrupt might have fired and be on its way to the
	 * host queue while we mask it, and if we unmask it
	 * early enough (re-cede right away), there is a
	 * theoretical possibility that it fires again, thus
	 * landing in the target queue more than once which is
	 * a big no-no.
	 *
	 * Fortunately, solving this is rather easy. If the
	 * above load setting PQ to 01 returns a previous
	 * value where P is set, then we know the escalation
	 * interrupt is somewhere on its way to the host. In
	 * that case we simply don't clear the xive_esc_on
	 * flag below. It will be eventually cleared by the
	 * handler for the escalation interrupt.
	 *
	 * Then, when doing a cede, we check that flag again
	 * before re-enabling the escalation interrupt, and if
	 * set, we abort the cede.
	 */
	if (!(old_pq & XIVE_ESB_VAL_P))
		/* Now P is 0, we can clear the flag */
		vcpu->arch.xive_esc_on = 0;
}
EXPORT_SYMBOL_GPL(kvmppc_xive_push_vcpu);
/*
 * Pull a vcpu's context from the XIVE on guest exit.
 * This assumes we are in virtual mode (MMU on)
 */
void kvmppc_xive_pull_vcpu(struct kvm_vcpu *vcpu)
{
	void __iomem *tima = local_paca->kvm_hstate.xive_tima_virt;

	if (!vcpu->arch.xive_pushed)
		return;

	/* Should not have been pushed if there is no tima */
	if (WARN_ON(!tima))
		return;

	eieio();

	/* First load to pull the context, we ignore the value */
	__raw_readl(tima + TM_SPC_PULL_OS_CTX);

	/* Second load to recover the context state (Words 0 and 1) */
	if (!kvmppc_xive_vcpu_has_save_restore(vcpu))
		vcpu->arch.xive_saved_state.w01 = __raw_readq(tima + TM_QW1_OS);

	/* Fixup some of the state for the next load */
	vcpu->arch.xive_saved_state.lsmfb = 0;
	vcpu->arch.xive_saved_state.ack = 0xff;
	vcpu->arch.xive_pushed = 0;
	eieio();
}
EXPORT_SYMBOL_GPL(kvmppc_xive_pull_vcpu);
if (vcpu->arch.xive_esc_on) { /* * If we still have a pending escalation, abort the cede, * and we must set PQ to 10 rather than 00 so that we don't * potentially end up with two entries for the escalation * interrupt in the XIVE interrupt queue. In that case * we also don't want to set xive_esc_on to 1 here in * case we race with xive_esc_irq().
*/
ret = false; /* * The escalation interrupts are special as we don't EOI them. * There is no need to use the load-after-store ordering offset * to set PQ to 10 as we won't use StoreEOI.
*/
__raw_readq(esc_vaddr + XIVE_ESB_SET_PQ_10);
} else {
vcpu->arch.xive_esc_on = true;
mb();
__raw_readq(esc_vaddr + XIVE_ESB_SET_PQ_00);
}
mb();
/* * This is a simple trigger for a generic XIVE IRQ. This must * only be called for interrupts that support a trigger page
*/ staticbool xive_irq_trigger(struct xive_irq_data *xd)
{ /* This should be only for MSIs */ if (WARN_ON(xd->flags & XIVE_IRQ_FLAG_LSI)) returnfalse;
/* Those interrupts should always have a trigger page */ if (WARN_ON(!xd->trig_mmio)) returnfalse;
vcpu->arch.irq_pending = 1;
smp_mb(); if (vcpu->arch.ceded || vcpu->arch.nested)
kvmppc_fast_vcpu_kick(vcpu);
/* Since we have the no-EOI flag, the interrupt is effectively * disabled now. Clearing xive_esc_on means we won't bother * doing so on the next entry. * * This also allows the entry code to know that if a PQ combination * of 10 is observed while xive_esc_on is true, it means the queue * contains an unprocessed escalation interrupt. We don't make use of * that knowledge today but might (see comment in book3s_hv_rmhandler.S)
*/
vcpu->arch.xive_esc_on = false;
/* This orders xive_esc_on = false vs. subsequent stale_p = true */
smp_wmb(); /* goes with smp_mb() in cleanup_single_escalation */
/* Already there ? */ if (xc->esc_virq[prio]) return 0;
/* Hook up the escalation interrupt */
xc->esc_virq[prio] = irq_create_mapping(NULL, q->esc_irq); if (!xc->esc_virq[prio]) {
pr_err("Failed to map escalation interrupt for queue %d of VCPU %d\n",
prio, xc->server_num); return -EIO;
}
if (single_escalation)
name = kasprintf(GFP_KERNEL, "kvm-%lld-%d",
vcpu->kvm->arch.lpid, xc->server_num); else
name = kasprintf(GFP_KERNEL, "kvm-%lld-%d-%d",
vcpu->kvm->arch.lpid, xc->server_num, prio); if (!name) {
pr_err("Failed to allocate escalation irq name for queue %d of VCPU %d\n",
prio, xc->server_num);
rc = -ENOMEM; goto error;
}
rc = request_irq(xc->esc_virq[prio], xive_esc_irq,
IRQF_NO_THREAD, name, vcpu); if (rc) {
pr_err("Failed to request escalation interrupt for queue %d of VCPU %d\n",
prio, xc->server_num); goto error;
}
xc->esc_virq_names[prio] = name;
/* In single escalation mode, we grab the ESB MMIO of the * interrupt and mask it. Also populate the VCPU v/raddr * of the ESB page for use by asm entry/exit code. Finally * set the XIVE_IRQ_FLAG_NO_EOI flag which will prevent the * core code from performing an EOI on the escalation * interrupt, thus leaving it effectively masked after * it fires once.
*/ if (single_escalation) { struct irq_data *d = irq_get_irq_data(xc->esc_virq[prio]); struct xive_irq_data *xd = irq_data_get_irq_handler_data(d);
/* Allocate the queue and retrieve infos on current node for now */
qpage = (__be32 *)__get_free_pages(GFP_KERNEL, xive->q_page_order); if (!qpage) {
pr_err("Failed to allocate queue %d for VCPU %d\n",
prio, xc->server_num); return -ENOMEM;
}
memset(qpage, 0, 1 << xive->q_order);
/* * Reconfigure the queue. This will set q->qpage only once the * queue is fully configured. This is a requirement for prio 0 * as we will stop doing EOIs for every IPI as soon as we observe * qpage being non-NULL, and instead will only EOI when we receive * corresponding queue 0 entries
*/
rc = xive_native_configure_queue(xc->vp_id, q, prio, qpage,
xive->q_order, true); if (rc)
pr_err("Failed to configure queue %d for VCPU %d\n",
prio, xc->server_num); return rc;
}
/* Called with xive->lock held */ staticint xive_check_provisioning(struct kvm *kvm, u8 prio)
{ struct kvmppc_xive *xive = kvm->arch.xive; struct kvm_vcpu *vcpu; unsignedlong i; int rc;
/* * Take the lock, set masked, try again if racing * with H_EOI
*/ for (;;) {
arch_spin_lock(&sb->lock);
old_prio = state->guest_priority;
state->guest_priority = MASKED;
mb(); if (!state->in_eoi) break;
state->guest_priority = old_prio;
arch_spin_unlock(&sb->lock);
}
/* No change ? Bail */ if (old_prio == MASKED) return old_prio;
/* Get the right irq */
kvmppc_xive_select_irq(state, &hw_num, &xd);
/* Set PQ to 10, return old P and old Q and remember them */
val = xive_vm_esb_load(xd, XIVE_ESB_SET_PQ_10);
state->old_p = !!(val & 2);
state->old_q = !!(val & 1);
/* * Synchronize hardware to ensure the queues are updated when * masking
*/
xive_native_sync_source(hw_num);
return old_prio;
}
/*
 * Take the source block lock in preparation for an unmask operation,
 * spinning until we are not racing with a concurrent H_EOI on this
 * source (state->in_eoi clear).
 *
 * Returns with sb->lock held; the caller is responsible for unlocking.
 */
static void xive_lock_for_unmask(struct kvmppc_xive_src_block *sb,
				 struct kvmppc_xive_irq_state *state)
{
	/*
	 * Take the lock, try again if racing with H_EOI
	 */
	for (;;) {
		arch_spin_lock(&sb->lock);
		if (!state->in_eoi)
			break;
		arch_spin_unlock(&sb->lock);
	}
}
/* If we aren't changing a thing, move on */ if (state->guest_priority != MASKED) goto bail;
/* Get the right irq */
kvmppc_xive_select_irq(state, &hw_num, &xd);
/* Old Q set, set PQ to 11 */ if (state->old_q)
xive_vm_esb_load(xd, XIVE_ESB_SET_PQ_11);
/* * If not old P, then perform an "effective" EOI, * on the source. This will handle the cases where * FW EOI is needed.
*/ if (!state->old_p)
xive_vm_source_eoi(hw_num, xd);
/* Synchronize ordering and mark unmasked */
mb();
bail:
state->guest_priority = prio;
}
/* * Target an interrupt to a given server/prio, this will fallback * to another server if necessary and perform the HW targetting * updates as needed * * NOTE: Must be called with the state lock held
*/ staticint xive_target_interrupt(struct kvm *kvm, struct kvmppc_xive_irq_state *state,
u32 server, u8 prio)
{ struct kvmppc_xive *xive = kvm->arch.xive;
u32 hw_num; int rc;
/* * This will return a tentative server and actual * priority. The count for that new target will have * already been incremented.
*/
rc = kvmppc_xive_select_target(kvm, &server, prio);
/* * We failed to find a target ? Not much we can do * at least until we support the GIQ.
*/ if (rc) return rc;
/* * Increment the old queue pending count if there * was one so that the old queue count gets adjusted later * when observed to be empty.
*/ if (state->act_priority != MASKED)
xive_inc_q_pending(kvm,
state->act_server,
state->act_priority); /* * Update state and HW
*/
state->act_priority = prio;
state->act_server = server;
/* Get the right irq */
kvmppc_xive_select_irq(state, &hw_num, NULL);
/* * Targetting rules: In order to avoid losing track of * pending interrupts across mask and unmask, which would * allow queue overflows, we implement the following rules: * * - Unless it was never enabled (or we run out of capacity) * an interrupt is always targetted at a valid server/queue * pair even when "masked" by the guest. This pair tends to * be the last one used but it can be changed under some * circumstances. That allows us to separate targetting * from masking, we only handle accounting during (re)targetting, * this also allows us to let an interrupt drain into its target * queue after masking, avoiding complex schemes to remove * interrupts out of remote processor queues. * * - When masking, we set PQ to 10 and save the previous value * of P and Q. * * - When unmasking, if saved Q was set, we set PQ to 11 * otherwise we leave PQ to the HW state which will be either * 10 if nothing happened or 11 if the interrupt fired while * masked. Effectively we are OR'ing the previous Q into the * HW Q. * * Then if saved P is clear, we do an effective EOI (Q->P->Trigger) * which will unmask the interrupt and shoot a new one if Q was * set. * * Otherwise (saved P is set) we leave PQ unchanged (so 10 or 11, * effectively meaning an H_EOI from the guest is still expected * for that interrupt). * * - If H_EOI occurs while masked, we clear the saved P. * * - When changing target, we account on the new target and * increment a separate "pending" counter on the old one. * This pending counter will be used to decrement the old * target's count when its queue has been observed empty.
*/
sb = kvmppc_xive_find_source(xive, irq, &idx); if (!sb) return -EINVAL;
state = &sb->irq_state[idx];
/* * We first handle masking/unmasking since the locking * might need to be retried due to EOIs, we'll handle * targetting changes later. These functions will return * with the SB lock held. * * xive_lock_and_mask() will also set state->guest_priority * but won't otherwise change other fields of the state. * * xive_lock_for_unmask will not actually unmask, this will * be done later by xive_finish_unmask() once the targetting * has been done, so we don't try to unmask an interrupt * that hasn't yet been targetted.
*/ if (priority == MASKED)
xive_lock_and_mask(xive, sb, state); else
xive_lock_for_unmask(sb, state);
/* * Then we handle targetting. * * First calculate a new "actual priority"
*/
new_act_prio = state->act_priority; if (priority != MASKED)
new_act_prio = xive_prio_from_guest(priority);
/* * Then check if we actually need to change anything, * * The condition for re-targetting the interrupt is that * we have a valid new priority (new_act_prio is not 0xff) * and either the server or the priority changed. * * Note: If act_priority was ff and the new priority is * also ff, we don't do anything and leave the interrupt * untargetted. An attempt of doing an int_on on an * untargetted interrupt will fail. If that is a problem * we could initialize interrupts with valid default
*/
if (new_act_prio != MASKED &&
(state->act_server != server ||
state->act_priority != new_act_prio))
rc = xive_target_interrupt(kvm, state, server, new_act_prio);
/* * Perform the final unmasking of the interrupt source * if necessary
*/ if (priority != MASKED)
xive_finish_unmask(xive, sb, state, priority);
/* * Finally Update saved_priority to match. Only int_on/off * set this field to a different value.
*/
state->saved_priority = priority;
/* * We can't update the state of a "pushed" VCPU, but that * shouldn't happen because the vcpu->mutex makes running a * vcpu mutually exclusive with doing one_reg get/set on it.
*/ if (WARN_ON(vcpu->arch.xive_pushed)) return -EIO;
/* * Update MFRR state. If it's not 0xff, we mark the VCPU as * having a pending MFRR change, which will re-evaluate the * target. The VCPU will thus potentially get a spurious * interrupt but that's not a big deal.
*/
xc->mfrr = mfrr; if (mfrr < cppr)
xive_irq_trigger(&xc->vp_ipi_data);
/* * Now saved XIRR is "interesting". It means there's something in * the legacy "1 element" queue... for an IPI we simply ignore it, * as the MFRR restore will handle that. For anything else we need * to force a resend of the source. * However the source may not have been setup yet. If that's the * case, we keep that info and increment a counter in the xive to * tell subsequent xive_set_source() to go look.
*/ if (xisr > XICS_IPI && !xive_restore_pending_irq(xive, xisr)) {
xc->delayed_irq = xisr;
xive->delayed_irqs++;
pr_devel(" xisr restore delayed\n");
}
sb = kvmppc_xive_find_source(xive, guest_irq, &idx); if (!sb) return -EINVAL;
state = &sb->irq_state[idx];
/* * Mark the passed-through interrupt as going to a VCPU, * this will prevent further EOIs and similar operations * from the XIVE code. It will also mask the interrupt * to either PQ=10 or 11 state, the latter if the interrupt * is pending. This will allow us to unmask or retrigger it * after routing it to the guest with a simple EOI. * * The "state" argument is a "token", all it needs is to be * non-NULL to switch to passed-through or NULL for the * other way around. We may not yet have an actual VCPU * target here and we don't really care.
*/
rc = irq_set_vcpu_affinity(host_irq, state); if (rc) {
pr_err("Failed to set VCPU affinity for host IRQ %ld\n", host_irq); return rc;
}
/* * Mask and read state of IPI. We need to know if its P bit * is set as that means it's potentially already using a * queue entry in the target
*/
prio = xive_lock_and_mask(xive, sb, state);
pr_devel(" old IPI prio %02x P:%d Q:%d\n", prio,
state->old_p, state->old_q);
/* Turn the IPI hard off */
xive_vm_esb_load(&state->ipi_data, XIVE_ESB_SET_PQ_01);
/* * Reset ESB guest mapping. Needed when ESB pages are exposed * to the guest in XIVE native mode
*/ if (xive->ops && xive->ops->reset_mapped)
xive->ops->reset_mapped(kvm, guest_irq);
/* Grab info about irq */
state->pt_number = hw_irq;
state->pt_data = irq_data_get_irq_handler_data(host_data);
/* * Configure the IRQ to match the existing configuration of * the IPI if it was already targetted. Otherwise this will * mask the interrupt in a lossy way (act_priority is 0xff) * which is fine for a never started interrupt.
*/
xive_native_configure_irq(hw_irq,
kvmppc_xive_vp(xive, state->act_server),
state->act_priority, state->number);
/* * We do an EOI to enable the interrupt (and retrigger if needed) * if the guest has the interrupt unmasked and the P bit was *not* * set in the IPI. If it was set, we know a slot may still be in * use in the target queue thus we have to wait for a guest * originated EOI
*/ if (prio != MASKED && !state->old_p)
xive_vm_source_eoi(hw_irq, state->pt_data);
/* Clear old_p/old_q as they are no longer relevant */
state->old_p = state->old_q = false;
sb = kvmppc_xive_find_source(xive, guest_irq, &idx); if (!sb) return -EINVAL;
state = &sb->irq_state[idx];
/* * Mask and read state of IRQ. We need to know if its P bit * is set as that means it's potentially already using a * queue entry in the target
*/
prio = xive_lock_and_mask(xive, sb, state);
pr_devel(" old IRQ prio %02x P:%d Q:%d\n", prio,
state->old_p, state->old_q);
/* * If old_p is set, the interrupt is pending, we switch it to * PQ=11. This will force a resend in the host so the interrupt * isn't lost to whatever host driver may pick it up
*/ if (state->old_p)
xive_vm_esb_load(state->pt_data, XIVE_ESB_SET_PQ_11);
/* Release the passed-through interrupt to the host */
rc = irq_set_vcpu_affinity(host_irq, NULL); if (rc) {
pr_err("Failed to clr VCPU affinity for host IRQ %ld\n", host_irq); return rc;
}
/* Forget about the IRQ */
state->pt_number = 0;
state->pt_data = NULL;
/* * Reset ESB guest mapping. Needed when ESB pages are exposed * to the guest in XIVE native mode
*/ if (xive->ops && xive->ops->reset_mapped) {
xive->ops->reset_mapped(kvm, guest_irq);
}
/* Reconfigure the IPI */
xive_native_configure_irq(state->ipi_number,
kvmppc_xive_vp(xive, state->act_server),
state->act_priority, state->number);
/* * If old_p is set (we have a queue entry potentially * occupied) or the interrupt is masked, we set the IPI * to PQ=10 state. Otherwise we just re-enable it (PQ=00).
*/ if (prio == MASKED || state->old_p)
xive_vm_esb_load(&state->ipi_data, XIVE_ESB_SET_PQ_10); else
xive_vm_esb_load(&state->ipi_data, XIVE_ESB_SET_PQ_00);
/* * Clear pointers to escalation interrupt ESB. * This is safe because the vcpu->mutex is held, preventing * any other CPU from concurrently executing a KVM_RUN ioctl.
*/
vcpu->arch.xive_esc_vaddr = 0;
vcpu->arch.xive_esc_raddr = 0;
}
/*
 * In single escalation mode, the escalation interrupt is marked so
 * that EOI doesn't re-enable it, but just sets the stale_p flag to
 * indicate that the P bit has already been dealt with. However, the
 * assembly code that enters the guest sets PQ to 00 without clearing
 * stale_p (because it has no easy way to address it). Hence we have
 * to adjust stale_p before shutting down the interrupt.
 */
void xive_cleanup_single_escalation(struct kvm_vcpu *vcpu, int irq)
{
	struct irq_data *irqd = irq_get_irq_data(irq);
	struct xive_irq_data *esc_xd = irq_data_get_irq_handler_data(irqd);

	/*
	 * This slightly odd sequence gives the right result
	 * (i.e. stale_p set if xive_esc_on is false) even if
	 * we race with xive_esc_irq() and xive_irq_eoi().
	 */
	esc_xd->stale_p = false;
	smp_mb();	/* paired with smb_wmb in xive_esc_irq */
	if (!vcpu->arch.xive_esc_on)
		esc_xd->stale_p = true;
}
staticbool kvmppc_xive_vcpu_id_valid(struct kvmppc_xive *xive, u32 cpu)
{ /* We have a block of xive->nr_servers VPs. We just need to check * packed vCPU ids are below that.
*/ return kvmppc_pack_vcpu_id(xive->kvm, cpu) < xive->nr_servers;
}
r = xive_native_get_vp_info(xc->vp_id, &xc->vp_cam, &xc->vp_chip_id); if (r) goto bail;
if (!kvmppc_xive_check_save_restore(vcpu)) {
pr_err("inconsistent save-restore setup for VCPU %d\n", cpu);
r = -EIO; goto bail;
}
/* Configure VCPU fields for use by assembly push/pull */
vcpu->arch.xive_saved_state.w01 = cpu_to_be64(0xff000000);
vcpu->arch.xive_cam_word = cpu_to_be32(xc->vp_cam | TM_QW1W2_VO);
/* Allocate IPI */
xc->vp_ipi = xive_native_alloc_irq(); if (!xc->vp_ipi) {
pr_err("Failed to allocate xive irq for VCPU IPI\n");
r = -EIO; goto bail;
}
pr_devel(" IPI=0x%x\n", xc->vp_ipi);
r = xive_native_populate_irq_data(xc->vp_ipi, &xc->vp_ipi_data); if (r) goto bail;
/* * Enable the VP first as the single escalation mode will * affect escalation interrupts numbering
*/
r = xive_native_enable_vp(xc->vp_id, kvmppc_xive_has_single_escalation(xive)); if (r) {
pr_err("Failed to enable VP in OPAL, err %d\n", r); goto bail;
}
/* * Initialize queues. Initially we set them all for no queueing * and we enable escalation for queue 0 only which we'll use for * our mfrr change notifications. If the VCPU is hot-plugged, we * do handle provisioning however based on the existing "map" * of enabled queues.
*/ for (i = 0; i < KVMPPC_XIVE_Q_COUNT; i++) { struct xive_q *q = &xc->queues[i];
/* Single escalation, no queue 7 */ if (i == 7 && kvmppc_xive_has_single_escalation(xive)) break;
/* Is queue already enabled ? Provision it */ if (xive->qmap & (1 << i)) {
r = xive_provision_queue(vcpu, i); if (r == 0 && !kvmppc_xive_has_single_escalation(xive))
kvmppc_xive_attach_escalation(
vcpu, i, kvmppc_xive_has_single_escalation(xive)); if (r) goto bail;
} else {
r = xive_native_configure_queue(xc->vp_id,
q, i, NULL, 0, true); if (r) {
pr_err("Failed to configure queue %d for VCPU %d\n",
i, cpu); goto bail;
}
}
}
/* If not done above, attach priority 0 escalation */
r = kvmppc_xive_attach_escalation(vcpu, 0, kvmppc_xive_has_single_escalation(xive)); if (r) goto bail;
/* Route the IPI */
r = xive_native_configure_irq(xc->vp_ipi, xc->vp_id, 0, XICS_IPI); if (!r)
xive_vm_esb_load(&xc->vp_ipi_data, XIVE_ESB_SET_PQ_00);
bail:
mutex_unlock(&xive->lock); if (r) {
kvmppc_xive_cleanup_vcpu(vcpu); return r;
}
sb = kvmppc_xive_find_source(xive, irq, &idx); if (!sb) return;
state = &sb->irq_state[idx];
/* Some sanity checking */ if (!state->valid) {
pr_err("invalid irq 0x%x in cpu queue!\n", irq); return;
}
/* * If the interrupt is in a queue it should have P set. * We warn so that gets reported. A backtrace isn't useful * so no need to use a WARN_ON.
*/ if (!state->saved_p)
pr_err("Interrupt 0x%x is marked in a queue but P not set !\n", irq);
/* * Lock / exclude EOI (not technically necessary if the * guest isn't running concurrently. If this becomes a * performance issue we can probably remove the lock.
*/
xive_lock_for_unmask(sb, state);
/* Restore mask/prio if it wasn't masked */ if (state->saved_scan_prio != MASKED)
xive_finish_unmask(xive, sb, state, state->saved_scan_prio);
/* * See comment in xive_get_source() about how this * work. Collect a stable state for all interrupts
*/ for (i = 0; i <= xive->max_sbid; i++) { struct kvmppc_xive_src_block *sb = xive->src_blocks[i]; if (!sb) continue; for (j = 0; j < KVMPPC_XICS_IRQ_PER_ICS; j++)
xive_pre_save_mask_irq(xive, sb, j);
}
/* Then scan the queues and update the "in_queue" flag */
kvm_for_each_vcpu(i, vcpu, xive->kvm) { struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu; if (!xc) continue; for (j = 0; j < KVMPPC_XIVE_Q_COUNT; j++) { if (xc->queues[j].qpage)
xive_pre_save_queue(xive, &xc->queues[j]);
}
}
/* Finally restore interrupt states */ for (i = 0; i <= xive->max_sbid; i++) { struct kvmppc_xive_src_block *sb = xive->src_blocks[i]; if (!sb) continue; for (j = 0; j < KVMPPC_XICS_IRQ_PER_ICS; j++)
xive_pre_save_unmask_irq(xive, sb, j);
}
}
/*
 * Undo the transient state left behind by xive_pre_save_scan():
 * clear every source's in_queue flag and reset the saved-source
 * counter so that the next xive_get_source() performs a fresh scan.
 */
static void xive_post_save_scan(struct kvmppc_xive *xive)
{
	u32 i, j;

	/* Clear all the in_queue flags */
	for (i = 0; i <= xive->max_sbid; i++) {
		struct kvmppc_xive_src_block *sb = xive->src_blocks[i];

		if (!sb)
			continue;
		for (j = 0; j < KVMPPC_XICS_IRQ_PER_ICS; j++)
			sb->irq_state[j].in_queue = false;
	}

	/* Next get_source() will do a new scan */
	xive->saved_src_count = 0;
}
/*
 * This returns the source configuration and state to user space,
 * encoded as a XICS-compatible 64-bit value (server, priority and
 * KVM_XICS_* state flags) written to @addr.
 *
 * Returns 0 on success, -ENOENT for an unknown/invalid source,
 * -EFAULT if the userspace copy fails.
 */
static int xive_get_source(struct kvmppc_xive *xive, long irq, u64 addr)
{
	struct kvmppc_xive_src_block *sb;
	struct kvmppc_xive_irq_state *state;
	u64 __user *ubufp = (u64 __user *) addr;
	u64 val, prio;
	u16 idx;

	sb = kvmppc_xive_find_source(xive, irq, &idx);
	if (!sb)
		return -ENOENT;

	state = &sb->irq_state[idx];

	if (!state->valid)
		return -ENOENT;

	pr_devel("get_source(%ld)...\n", irq);

	/*
	 * So to properly save the state into something that looks like a
	 * XICS migration stream we cannot treat interrupts individually.
	 *
	 * We need, instead, mask them all (& save their previous PQ state)
	 * to get a stable state in the HW, then sync them to ensure that
	 * any interrupt that had already fired hits its queue, and finally
	 * scan all the queues to collect which interrupts are still present
	 * in the queues, so we can set the "pending" flag on them and
	 * they can be resent on restore.
	 *
	 * So we do it all when the "first" interrupt gets saved, all the
	 * state is collected at that point, the rest of xive_get_source()
	 * will merely collect and convert that state to the expected
	 * userspace bit mask.
	 */
	if (xive->saved_src_count == 0)
		xive_pre_save_scan(xive);
	xive->saved_src_count++;

	/* Convert saved state into something compatible with xics */
	val = state->act_server;
	prio = state->saved_scan_prio;

	if (prio == MASKED) {
		val |= KVM_XICS_MASKED;
		prio = state->saved_priority;
	}
	val |= prio << KVM_XICS_PRIORITY_SHIFT;
	if (state->lsi) {
		val |= KVM_XICS_LEVEL_SENSITIVE;
		if (state->saved_p)
			val |= KVM_XICS_PENDING;
	} else {
		if (state->saved_p)
			val |= KVM_XICS_PRESENTED;
		if (state->saved_q)
			val |= KVM_XICS_QUEUED;

		/*
		 * We mark it pending (which will attempt a re-delivery)
		 * if we are in a queue *or* we were masked and had
		 * Q set which is equivalent to the XICS "masked pending"
		 * state
		 */
		if (state->in_queue || (prio == MASKED && state->saved_q))
			val |= KVM_XICS_PENDING;
	}

	/*
	 * If that was the last interrupt saved, reset the
	 * in_queue flags
	 */
	if (xive->saved_src_count == xive->src_count)
		xive_post_save_scan(xive);

	/* Copy the result to userspace */
	if (put_user(val, ubufp))
		return -EFAULT;

	return 0;
}
/*
 * Allocate the source block (ICS) covering the given guest irq number,
 * unless another caller already created it.
 * NOTE(review): chunk is truncated below — the initialization of the new
 * block, the "out:" label and the return are not visible in this view.
 */
struct kvmppc_xive_src_block *kvmppc_xive_create_src_block(
	struct kvmppc_xive *xive, int irq)
{
	struct kvmppc_xive_src_block *sb;
	int i, bid;

	/* Block id is the irq number's ICS index */
	bid = irq >> KVMPPC_XICS_ICS_SHIFT;

	/* xive->lock serializes concurrent block creation */
	mutex_lock(&xive->lock);

	/* block already exists - somebody else got here first */
	if (xive->src_blocks[bid])
		goto out;

	/* Create the ICS */
	sb = kzalloc(sizeof(*sb), GFP_KERNEL);
	if (!sb)
		goto out;
	/*
	 * NOTE(review): this span is the interior of the source-restore
	 * path (kvmppc_xive_set_source) — its beginning is outside this
	 * chunk; state, sb, val, guest_prio, server, rc and act_prio are
	 * declared above. Confirm against the full file.
	 */
	/*
	 * If the source doesn't already have an IPI, allocate
	 * one and get the corresponding data
	 */
	if (!state->ipi_number) {
		state->ipi_number = xive_native_alloc_irq();
		if (state->ipi_number == 0) {
			pr_devel("Failed to allocate IPI !\n");
			return -ENOMEM;
		}
		xive_native_populate_irq_data(state->ipi_number, &state->ipi_data);
		pr_devel(" src_ipi=0x%x\n", state->ipi_number);
	}

	/*
	 * We use lock_and_mask() to set us in the right masked
	 * state. We will override that state from the saved state
	 * further down, but this will handle the cases of interrupts
	 * that need FW masking. We set the initial guest_priority to
	 * 0 before calling it to ensure it actually performs the masking.
	 */
	state->guest_priority = 0;
	xive_lock_and_mask(xive, sb, state);

	/*
	 * Now, we select a target if we have one. If we don't we
	 * leave the interrupt untargetted. It means that an interrupt
	 * can become "untargetted" across migration if it was masked
	 * by set_xive() but there is little we can do about it.
	 */

	/* First convert prio and mark interrupt as untargetted */
	act_prio = xive_prio_from_guest(guest_prio);
	state->act_priority = MASKED;

	/*
	 * We need to drop the lock due to the mutex below. Hopefully
	 * nothing is touching that interrupt yet since it hasn't been
	 * advertized to a running guest yet
	 */
	arch_spin_unlock(&sb->lock);

	/* If we have a priority target the interrupt */
	if (act_prio != MASKED) {
		/* First, check provisioning of queues */
		mutex_lock(&xive->lock);
		rc = xive_check_provisioning(xive->kvm, act_prio);
		mutex_unlock(&xive->lock);

		/* Target interrupt */
		if (rc == 0)
			rc = xive_target_interrupt(xive->kvm, state,
						   server, act_prio);
		/*
		 * If provisioning or targetting failed, leave it
		 * alone and masked. It will remain disabled until
		 * the guest re-targets it.
		 */
	}

	/*
	 * Find out if this was a delayed irq stashed in an ICP,
	 * in which case, treat it as pending
	 */
	if (xive->delayed_irqs && xive_check_delayed_irq(xive, irq)) {
		val |= KVM_XICS_PENDING;
		pr_devel(" Found delayed ! forcing PENDING !\n");
	}

	/* Cleanup the SW state */
	state->old_p = false;
	state->old_q = false;
	state->lsi = false;
	state->asserted = false;

	/* Restore LSI state */
	if (val & KVM_XICS_LEVEL_SENSITIVE) {
		state->lsi = true;
		if (val & KVM_XICS_PENDING)
			state->asserted = true;
		pr_devel(" LSI ! Asserted=%d\n", state->asserted);
	}

	/*
	 * Restore P and Q. If the interrupt was pending, we
	 * force Q and !P, which will trigger a resend.
	 *
	 * That means that a guest that had both an interrupt
	 * pending (queued) and Q set will restore with only
	 * one instance of that interrupt instead of 2, but that
	 * is perfectly fine as coalescing interrupts that haven't
	 * been presented yet is always allowed.
	 */
	if (val & KVM_XICS_PRESENTED && !(val & KVM_XICS_PENDING))
		state->old_p = true;
	if (val & KVM_XICS_QUEUED || val & KVM_XICS_PENDING)
		state->old_q = true;

	/*
	 * If the interrupt was unmasked, update guest priority and
	 * perform the appropriate state transition and do a
	 * re-trigger if necessary.
	 */
	if (val & KVM_XICS_MASKED) {
		pr_devel(" masked, saving prio\n");
		state->guest_priority = MASKED;
		state->saved_priority = guest_prio;
	} else {
		pr_devel(" unmasked, restoring to prio %d\n", guest_prio);
		xive_finish_unmask(xive, sb, state, guest_prio);
		state->saved_priority = guest_prio;
	}

	/* Increment the number of valid sources and mark this one valid */
	if (!state->valid)
		xive->src_count++;
	state->valid = true;

	return 0;
}
/*
 * Inject (trigger) a guest interrupt through the irqfd/irqchip path.
 * Returns -ENODEV if XIVE isn't active, -EINVAL for an unknown, invalid
 * or passed-through source.
 * NOTE(review): truncated below — the actual assert/trigger logic of
 * this function lies outside this chunk.
 */
int kvmppc_xive_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level,
	bool line_status)
{
	struct kvmppc_xive *xive = kvm->arch.xive;
	struct kvmppc_xive_src_block *sb;
	struct kvmppc_xive_irq_state *state;
	u16 idx;

	if (!xive)
		return -ENODEV;

	sb = kvmppc_xive_find_source(xive, irq, &idx);
	if (!sb)
		return -EINVAL;

	/* Perform locklessly .... (we need to do some RCUisms here...) */
	state = &sb->irq_state[idx];
	if (!state->valid)
		return -EINVAL;

	/* We don't allow a trigger on a passed-through interrupt */
	if (state->pt_number)
		return -EINVAL;
/*
 * NOTE(review): stray website-disclaimer text (German) left over from the
 * code-browsing page this file was extracted from — not part of the source.
 * Wrapped in a comment so the file stays syntactically valid. Translation:
 * "The information on this website has been compiled carefully to the best
 * of our knowledge. However, neither completeness, nor correctness, nor
 * quality of the information provided is guaranteed.
 * Remark: the colored syntax highlighting and the measurement are still
 * experimental."
 */