// SPDX-License-Identifier: GPL-2.0-only /* * Copyright 2011 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com> * Copyright (C) 2009. SUSE Linux Products GmbH. All rights reserved. * * Authors: * Paul Mackerras <paulus@au1.ibm.com> * Alexander Graf <agraf@suse.de> * Kevin Wolf <mail@kevin-wolf.de> * * Description: KVM functions specific to running on Book 3S * processors in hypervisor mode (specifically POWER7 and later). * * This file is derived from arch/powerpc/kvm/book3s.c, * by Alexander Graf <agraf@suse.de>.
*/
/*
 * File-scope resume codes, module parameters and helper macros.
 * Each resume code below is RESUME_GUEST with one architecture-specific
 * flag bit OR-ed in.
 */
/* Used to indicate that a guest page fault needs to be handled */ #define RESUME_PAGE_FAULT (RESUME_GUEST | RESUME_FLAG_ARCH1) /* Used to indicate that a guest passthrough interrupt needs to be handled */ #define RESUME_PASSTHROUGH (RESUME_GUEST | RESUME_FLAG_ARCH2)
/* Used as a "null" value for timebase values */ #define TB_NIL (~(u64)0)
/* Module parameter (mode 0644); initial value 6, which the description string explains as "2 or 4". */
staticint dynamic_mt_modes = 6;
module_param(dynamic_mt_modes, int, 0644);
/* target_smt_mode has no initializer; the description string documents 0 as "max". */
MODULE_PARM_DESC(dynamic_mt_modes, "Set of allowed dynamic micro-threading modes: 0 (= none), 2, 4, or 6 (= 2 or 4)"); staticint target_smt_mode;
module_param(target_smt_mode, int, 0644);
MODULE_PARM_DESC(target_smt_mode, "Target threads per core (0 = max)");
/* Boolean module parameter, readable by all and writable by the owner (S_IRUGO | S_IWUSR). */
staticbool one_vm_per_core;
module_param(one_vm_per_core, bool, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(one_vm_per_core, "Only run vCPUs from the same VM on a core (requires POWER8 or older)");
/*
 * NOTE(review): the declarations of h_ipi_redirect and module_param_ops, and
 * the conditional that the #endif below closes, are not visible in this
 * section - confirm against the full file.
 */
module_param_cb(h_ipi_redirect, &module_param_ops, &h_ipi_redirect, 0644);
MODULE_PARM_DESC(h_ipi_redirect, "Redirect H_IPI wakeup to a free host core"); #endif
/* If set, guests are allowed to create and control nested guests */ staticbool nested = true;
module_param(nested, bool, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(nested, "Enable nested virtualization (only on POWER9)");
/* * RWMR values for POWER8. These control the rate at which PURR * and SPURR count and should be set according to the number of * online threads in the vcore being run.
*/ #define RWMR_RPA_P8_1THREAD 0x164520C62609AECAUL #define RWMR_RPA_P8_2THREAD 0x7FFF2908450D8DA9UL #define RWMR_RPA_P8_3THREAD 0x164520C62609AECAUL #define RWMR_RPA_P8_4THREAD 0x199A421245058DA9UL #define RWMR_RPA_P8_5THREAD 0x164520C62609AECAUL #define RWMR_RPA_P8_6THREAD 0x164520C62609AECAUL #define RWMR_RPA_P8_7THREAD 0x164520C62609AECAUL #define RWMR_RPA_P8_8THREAD 0x164520C62609AECAUL
/*
 * The iterator below starts i at -1 and keeps calling next_runnable_thread()
 * until it yields a NULL vcpu.
 */
/* Used to traverse the list of runnable threads for a given vcore */ #define for_each_runnable_thread(i, vcpu, vc) \ for (i = -1; (vcpu = next_runnable_thread(vc, &i)); )
/* If we're a nested hypervisor, fall back to ordinary IPIs for now */ if (kvmhv_on_pseries()) returnfalse;
/* On POWER9 we can use msgsnd to IPI any cpu */ if (cpu_has_feature(CPU_FTR_ARCH_300)) {
msg |= get_hard_smp_processor_id(cpu);
smp_mb();
__asm__ __volatile__ (PPC_MSGSND(%0) : : "r" (msg)); returntrue;
}
/* On POWER8 for IPIs to threads in the same core, use msgsnd */ if (cpu_has_feature(CPU_FTR_ARCH_207S)) {
preempt_disable(); if (cpu_first_thread_sibling(cpu) ==
cpu_first_thread_sibling(smp_processor_id())) {
msg |= cpu_thread_in_core(cpu);
smp_mb();
__asm__ __volatile__ (PPC_MSGSND(%0) : : "r" (msg));
preempt_enable(); returntrue;
}
preempt_enable();
}
#ifdefined(CONFIG_PPC_ICP_NATIVE) && defined(CONFIG_SMP) if (cpu >= 0 && cpu < nr_cpu_ids) { if (paca_ptrs[cpu]->kvm_hstate.xics_phys) {
xics_wake_cpu(cpu); returntrue;
}
opal_int_set_mfrr(get_hard_smp_processor_id(cpu), IPI_PRIORITY); returntrue;
} #endif
returnfalse;
}
/*
 * Kick a vcpu: wake its waiter, then try to interrupt the hardware thread
 * recorded in arch.thread_cpu; if that is not possible, fall back to a
 * reschedule IPI aimed at vcpu->cpu.
 */
static void kvmppc_fast_vcpu_kick_hv(struct kvm_vcpu *vcpu)
{
	struct rcuwait *waiter = kvm_arch_vcpu_get_wait(vcpu);
	int target;

	/*
	 * rcuwait_wake_up contains smp_mb() which orders prior stores that
	 * create pending work vs below loads of cpu fields. The other side
	 * is the barrier in vcpu run that orders setting the cpu fields vs
	 * testing for pending work.
	 */
	if (rcuwait_wake_up(waiter))
		++vcpu->stat.generic.halt_wakeup;

	/* First choice: a direct IPI to the thread the vcpu is on. */
	target = READ_ONCE(vcpu->arch.thread_cpu);
	if (target >= 0 && kvmppc_ipi_thread(target))
		return;

	/* CPU points to the first thread of the core */
	target = vcpu->cpu;
	if (target < 0 || target >= nr_cpu_ids || !cpu_online(target))
		return;

	smp_send_reschedule(target);
}
/* * We use the vcpu_load/put functions to measure stolen time. * * Stolen time is counted as time when either the vcpu is able to * run as part of a virtual core, but the task running the vcore * is preempted or sleeping, or when the vcpu needs something done * in the kernel by the task running the vcpu, but that task is * preempted or sleeping. Those two things have to be counted * separately, since one of the vcpu tasks will take on the job * of running the core, and the other vcpu tasks in the vcore will * sleep waiting for it to do that, but that sleep shouldn't count * as stolen time. * * Hence we accumulate stolen time when the vcpu can run as part of * a vcore using vc->stolen_tb, and the stolen time when the vcpu * needs its task to do other things in the kernel (for example, * service a page fault) in busy_stolen. We don't accumulate * stolen time for a vcore when it is inactive, or for a vcpu * when it is in state RUNNING or NOTREADY. NOTREADY is a bit of * a misnomer; it means that the vcpu task is not executing in * the KVM_VCPU_RUN ioctl, i.e. it is in userspace or elsewhere in * the kernel. We don't have any way of dividing up that time * between time that the vcpu is genuinely stopped, time that * the task is actively working on behalf of the vcpu, and time * that the task is preempted, so we don't count any of it as * stolen. * * Updates to busy_stolen are protected by arch.tbacct_lock; * updates to vc->stolen_tb are protected by the vcore->stoltb_lock * lock. The stolen times are measured in units of timebase ticks. * (Note that the != TB_NIL checks below are purely defensive; * they should never fail.) * * The POWER9 path is simpler, one vcpu per virtual core so the * former case does not exist. If a vcpu is preempted when it is * BUSY_IN_HOST and not ceded or otherwise blocked, then accumulate * the stolen cycles in busy_stolen. RUNNING is not a preemptible * state in the P9 path.
*/
/* * We can test vc->runner without taking the vcore lock, * because only this task ever sets vc->runner to this * vcpu, and once it is set to this vcpu, only this task * ever sets it to NULL.
*/ if (vc->runner == vcpu && vc->vcore_state >= VCORE_SLEEPING)
kvmppc_core_end_stolen(vc, now);
if (cpu_has_feature(CPU_FTR_ARCH_300)) { /* * In the P9 path, RUNNABLE is not preemptible * (nor takes host interrupts)
*/
WARN_ON_ONCE(vcpu->arch.state == KVMPPC_VCPU_RUNNABLE); /* * Account stolen time when preempted while the vcpu task is * running in the kernel (but not in qemu, which is INACTIVE).
*/ if (task_is_running(current) &&
vcpu->arch.state == KVMPPC_VCPU_BUSY_IN_HOST)
vcpu->arch.busy_preempt = mftb(); return;
}
now = mftb();
if (vc->runner == vcpu && vc->vcore_state >= VCORE_SLEEPING)
kvmppc_core_start_stolen(vc, now);
/* Dummy value used in computing PCR value below */ #define PCR_ARCH_31 (PCR_ARCH_300 << 1)
staticinlineunsignedlong map_pcr_to_cap(unsignedlong pcr)
{ unsignedlong cap = 0;
switch (pcr) { case PCR_ARCH_300:
cap = H_GUEST_CAP_POWER9; break; case PCR_ARCH_31: if (cpu_has_feature(CPU_FTR_P11_PVR))
cap = H_GUEST_CAP_POWER11; else
cap = H_GUEST_CAP_POWER10; break; default: break;
}
/* We can (emulate) our own architecture version and anything older */ if (cpu_has_feature(CPU_FTR_P11_PVR) || cpu_has_feature(CPU_FTR_ARCH_31))
host_pcr_bit = PCR_ARCH_31; elseif (cpu_has_feature(CPU_FTR_ARCH_300))
host_pcr_bit = PCR_ARCH_300; elseif (cpu_has_feature(CPU_FTR_ARCH_207S))
host_pcr_bit = PCR_ARCH_207; elseif (cpu_has_feature(CPU_FTR_ARCH_206))
host_pcr_bit = PCR_ARCH_206; else
host_pcr_bit = PCR_ARCH_205;
/* Determine lowest PCR bit needed to run guest in given PVR level */
guest_pcr_bit = host_pcr_bit; if (arch_compat) { switch (arch_compat) { case PVR_ARCH_205:
guest_pcr_bit = PCR_ARCH_205; break; case PVR_ARCH_206: case PVR_ARCH_206p:
guest_pcr_bit = PCR_ARCH_206; break; case PVR_ARCH_207:
guest_pcr_bit = PCR_ARCH_207; break; case PVR_ARCH_300:
guest_pcr_bit = PCR_ARCH_300; break; case PVR_ARCH_31: case PVR_ARCH_31_P11:
guest_pcr_bit = PCR_ARCH_31; break; default: return -EINVAL;
}
}
if (kvmhv_on_pseries() && kvmhv_is_nestedv2()) { /* * 'arch_compat == 0' would mean the guest should default to * L1's compatibility. In this case, the guest would pick * host's PCR and evaluate the corresponding capabilities.
*/
cap = map_pcr_to_cap(guest_pcr_bit); if (!(cap & nested_capabilities)) return -EINVAL;
}
spin_lock(&vc->lock);
vc->arch_compat = arch_compat;
kvmhv_nestedv2_mark_dirty(vcpu, KVMPPC_GSID_LOGICAL_PVR); /* * Set all PCR bits for which guest_pcr_bit <= bit < host_pcr_bit * Also set all reserved PCR bits
*/
vc->pcr = (host_pcr_bit - guest_pcr_bit) | PCR_MASK;
spin_unlock(&vc->lock);
return 0;
}
staticvoid kvmppc_dump_regs(struct kvm_vcpu *vcpu)
{ int r;
/*
 * Leading layout of a per-processor buffer: a 4-byte field (dummy) followed
 * by the length. The union lets a reader view that length either as a
 * big-endian 16-bit or as a big-endian 32-bit value; which view applies to
 * which kind of buffer is decided by code outside this definition.
 */
/* Length for a per-processor buffer is passed in at offset 4 in the buffer */ struct reg_vpa {
u32 dummy; union {
__be16 hword;
__be32 word;
} length;
};
switch (subfunc) { case H_VPA_REG_VPA: /* register VPA */ /* * The size of our lppaca is 1kB because of the way we align * it for the guest to avoid crossing a 4kB boundary. We only * use 640 bytes of the structure though, so we should accept * clients that set a size of 640.
*/
BUILD_BUG_ON(sizeof(struct lppaca) != 640); if (len < sizeof(struct lppaca)) break;
vpap = &tvcpu->arch.vpa;
err = 0; break;
case H_VPA_REG_DTL: /* register DTL */ if (len < sizeof(struct dtl_entry)) break;
len -= len % sizeof(struct dtl_entry);
/* Check that they have previously registered a VPA */
err = H_RESOURCE; if (!vpa_is_registered(&tvcpu->arch.vpa)) break;
vpap = &tvcpu->arch.dtl;
err = 0; break;
case H_VPA_REG_SLB: /* register SLB shadow buffer */ /* Check that they have previously registered a VPA */
err = H_RESOURCE; if (!vpa_is_registered(&tvcpu->arch.vpa)) break;
vpap = &tvcpu->arch.slb_shadow;
err = 0; break;
case H_VPA_DEREG_VPA: /* deregister VPA */ /* Check they don't still have a DTL or SLB buf registered */
err = H_RESOURCE; if (vpa_is_registered(&tvcpu->arch.dtl) ||
vpa_is_registered(&tvcpu->arch.slb_shadow)) break;
/* * We need to pin the page pointed to by vpap->next_gpa, * but we can't call kvmppc_pin_guest_page under the lock * as it does get_user_pages() and down_read(). So we * have to drop the lock, pin the page, then get the lock * again and check that a new area didn't get registered * in the meantime.
*/ for (;;) {
gpa = vpap->next_gpa;
spin_unlock(&vcpu->arch.vpa_update_lock);
va = NULL;
nb = 0; if (gpa)
va = kvmppc_pin_guest_page(kvm, gpa, &nb);
spin_lock(&vcpu->arch.vpa_update_lock); if (gpa == vpap->next_gpa) break; /* sigh... unpin that one and try again */ if (va)
kvmppc_unpin_guest_page(kvm, va, gpa, false);
}
vpap->update_pending = 0; if (va && nb < vpap->len) { /* * If it's now too short, it must be that userspace * has changed the mappings underlying guest memory, * so unregister the region.
*/
kvmppc_unpin_guest_page(kvm, va, gpa, false);
va = NULL;
}
*old_vpap = *vpap;
vpap->gpa = gpa;
vpap->pinned_addr = va;
vpap->dirty = false; if (va)
vpap->pinned_end = va + vpap->len;
}
if (!(vcpu->arch.vpa.update_pending ||
vcpu->arch.slb_shadow.update_pending ||
vcpu->arch.dtl.update_pending)) return;
spin_lock(&vcpu->arch.vpa_update_lock); if (vcpu->arch.vpa.update_pending) {
kvmppc_update_vpa(vcpu, &vcpu->arch.vpa, &old_vpa); if (old_vpa.pinned_addr) { if (kvmhv_is_nestedv2())
kvmhv_nestedv2_set_vpa(vcpu, ~0ull);
kvmppc_unpin_guest_page(kvm, old_vpa.pinned_addr, old_vpa.gpa,
old_vpa.dirty);
} if (vcpu->arch.vpa.pinned_addr) {
init_vpa(vcpu, vcpu->arch.vpa.pinned_addr); if (kvmhv_is_nestedv2())
kvmhv_nestedv2_set_vpa(vcpu, __pa(vcpu->arch.vpa.pinned_addr));
}
} if (vcpu->arch.dtl.update_pending) {
kvmppc_update_vpa(vcpu, &vcpu->arch.dtl, &old_vpa); if (old_vpa.pinned_addr)
kvmppc_unpin_guest_page(kvm, old_vpa.pinned_addr, old_vpa.gpa,
old_vpa.dirty);
vcpu->arch.dtl_ptr = vcpu->arch.dtl.pinned_addr;
vcpu->arch.dtl_index = 0;
} if (vcpu->arch.slb_shadow.update_pending) {
kvmppc_update_vpa(vcpu, &vcpu->arch.slb_shadow, &old_vpa); if (old_vpa.pinned_addr)
kvmppc_unpin_guest_page(kvm, old_vpa.pinned_addr, old_vpa.gpa,
old_vpa.dirty);
}
spin_unlock(&vcpu->arch.vpa_update_lock);
}
/*
 * Report how much time has been stolen from the vcore up until `now'.
 * The caller should hold the vcore lock.
 *
 * The result is vc->stolen_tb plus, when the vcore is not inactive and a
 * preemption timestamp is recorded, the time elapsed since that timestamp.
 * Warns once if called on a CPU with CPU_FTR_ARCH_300.
 */
static u64 vcore_stolen_time(struct kvmppc_vcore *vc, u64 now)
{
	unsigned long irqflags;
	u64 stolen;

	WARN_ON_ONCE(cpu_has_feature(CPU_FTR_ARCH_300));

	spin_lock_irqsave(&vc->stoltb_lock, irqflags);

	stolen = vc->stolen_tb;
	/* Add the still-open preemption interval, if there is one. */
	if (vc->vcore_state != VCORE_INACTIVE && vc->preempt_tb != TB_NIL)
		stolen += now - vc->preempt_tb;

	spin_unlock_irqrestore(&vc->stoltb_lock, irqflags);

	return stolen;
}
/* See if there is a doorbell interrupt pending for a vcpu */ staticbool kvmppc_doorbell_pending(struct kvm_vcpu *vcpu)
{ int thr; struct kvmppc_vcore *vc;
if (vcpu->arch.doorbell_request) returntrue; if (cpu_has_feature(CPU_FTR_ARCH_300)) returnfalse; /* * Ensure that the read of vcore->dpdes comes after the read * of vcpu->doorbell_request. This barrier matches the * smp_wmb() in kvmppc_guest_entry_inject().
*/
smp_rmb();
vc = vcpu->arch.vcore;
thr = vcpu->vcpu_id - vc->first_vcpuid; return !!(vc->dpdes & (1 << thr));
}
staticbool kvmppc_power8_compatible(struct kvm_vcpu *vcpu)
{ if (kvmppc_get_arch_compat(vcpu) >= PVR_ARCH_207) returntrue; if ((!kvmppc_get_arch_compat(vcpu)) &&
cpu_has_feature(CPU_FTR_ARCH_207S)) returntrue; returnfalse;
}
staticint kvmppc_h_set_mode(struct kvm_vcpu *vcpu, unsignedlong mflags, unsignedlong resource, unsignedlong value1, unsignedlong value2)
{ switch (resource) { case H_SET_MODE_RESOURCE_SET_CIABR: if (!kvmppc_power8_compatible(vcpu)) return H_P2; if (value2) return H_P4; if (mflags) return H_UNSUPPORTED_FLAG_START; /* Guests can't breakpoint the hypervisor */ if ((value1 & CIABR_PRIV) == CIABR_PRIV_HYPER) return H_P3;
kvmppc_set_ciabr_hv(vcpu, value1); return H_SUCCESS; case H_SET_MODE_RESOURCE_SET_DAWR0: if (!kvmppc_power8_compatible(vcpu)) return H_P2; if (!ppc_breakpoint_available()) return H_P2; if (mflags) return H_UNSUPPORTED_FLAG_START; if (value2 & DABRX_HYP) return H_P4;
kvmppc_set_dawr0_hv(vcpu, value1);
kvmppc_set_dawrx0_hv(vcpu, value2); return H_SUCCESS; case H_SET_MODE_RESOURCE_SET_DAWR1: if (!kvmppc_power8_compatible(vcpu)) return H_P2; if (!ppc_breakpoint_available()) return H_P2; if (!cpu_has_feature(CPU_FTR_DAWR1)) return H_P2; if (!vcpu->kvm->arch.dawr1_enabled) return H_FUNCTION; if (mflags) return H_UNSUPPORTED_FLAG_START; if (value2 & DABRX_HYP) return H_P4;
kvmppc_set_dawr1_hv(vcpu, value1);
kvmppc_set_dawrx1_hv(vcpu, value2); return H_SUCCESS; case H_SET_MODE_RESOURCE_ADDR_TRANS_MODE: /* * KVM does not support mflags=2 (AIL=2) and AIL=1 is reserved. * Keep this in synch with kvmppc_filter_guest_lpcr_hv.
*/ if (cpu_has_feature(CPU_FTR_P9_RADIX_PREFETCH_BUG) &&
kvmhv_vcpu_is_radix(vcpu) && mflags == 3) return H_UNSUPPORTED_FLAG_START; return H_TOO_HARD; default: return H_TOO_HARD;
}
}
/*
 * Translate the guest physical range [gpa, gpa + len) into a host virtual
 * address, requiring the whole range to lie inside the single memslot that
 * contains gpa.
 *
 * Returns 0 and stores the address in *hva on success, -EFAULT when no
 * memslot or host address backs gpa, and -EINVAL when the range runs past
 * the end of the memslot.
 */
static int kvmppc_copy_guest_gpa_to_hva(struct kvm *kvm, gpa_t gpa,
					unsigned long len, unsigned long *hva)
{
	struct kvm_memory_slot *memslot;
	unsigned long addr;
	gpa_t slot_end;

	memslot = gfn_to_memslot(kvm, gpa >> PAGE_SHIFT);
	if (!memslot)
		return -EFAULT;

	/*
	 * slot_end is the first byte past the memslot, so a range ending
	 * exactly there is still inside it. gpa lies within the slot, so
	 * slot_end - gpa cannot underflow, and comparing against len this
	 * way avoids the wrap-around that gpa + len could suffer.
	 */
	slot_end = (memslot->base_gfn + memslot->npages) << PAGE_SHIFT;
	if (len > slot_end - gpa)
		return -EINVAL;

	addr = gfn_to_hva_memslot(memslot, gpa >> PAGE_SHIFT);
	if (kvm_is_error_hva(addr))
		return -EFAULT;

	/* Re-apply the offset within the page. */
	*hva = addr | (gpa & (PAGE_SIZE - 1));
	return 0;
}

/*
 * Copy guest memory in place - each of the source and destination ranges
 * must reside within a single memslot.
 *
 * @to and @from are guest physical addresses, @len the number of bytes.
 * Returns 0 on success, -EFAULT if an address cannot be translated or the
 * copy faults, and -EINVAL if a range extends beyond its memslot. The page
 * containing @to is marked dirty on success.
 */
static int kvmppc_copy_guest(struct kvm *kvm, gpa_t to, gpa_t from,
			     unsigned long len)
{
	unsigned long to_addr, from_addr;
	int r;

	/* Get the host address for the from address */
	r = kvmppc_copy_guest_gpa_to_hva(kvm, from, len, &from_addr);
	if (r)
		return r;

	/* Get the host address for the to address */
	r = kvmppc_copy_guest_gpa_to_hva(kvm, to, len, &to_addr);
	if (r)
		return r;

	/* Perform copy */
	r = raw_copy_in_user((void __user *)to_addr, (void __user *)from_addr,
			     len);
	if (r)
		return -EFAULT;

	mark_page_dirty(kvm, to >> PAGE_SHIFT);
	return 0;
}
/* Check for invalid flags (H_PAGE_SET_LOANED covers all CMO flags) */ if (flags & ~(H_ICACHE_INVALIDATE | H_ICACHE_SYNCHRONIZE |
H_ZERO_PAGE | H_COPY_PAGE | H_PAGE_SET_LOANED)) return H_PARAMETER;
/* dest (and src if copy_page flag set) must be page aligned */ if ((dest & pg_mask) || ((flags & H_COPY_PAGE) && (src & pg_mask))) return H_PARAMETER;
/* zero and/or copy the page as determined by the flags */ if (flags & H_COPY_PAGE) {
ret = kvmppc_copy_guest(vcpu->kvm, dest, src, pg_sz); if (ret < 0) return H_PARAMETER;
} elseif (flags & H_ZERO_PAGE) {
ret = kvm_clear_guest(vcpu->kvm, dest, pg_sz); if (ret < 0) return H_PARAMETER;
}
/* * We expect to have been called by the real mode handler * (kvmppc_rm_h_confer()) which would have directly returned * H_SUCCESS if the source vcore wasn't idle (e.g. if it may * have useful work to do and should not confer) so we don't * recheck that here. * * In the case of the P9 single vcpu per vcore case, the real * mode handler is not called but no other threads are in the * source vcore.
*/ if (!cpu_has_feature(CPU_FTR_ARCH_300)) {
spin_lock(&vcore->lock); if (target->arch.state == KVMPPC_VCPU_RUNNABLE &&
vcore->vcore_state != VCORE_INACTIVE &&
vcore->runner)
target = vcore->runner;
spin_unlock(&vcore->lock);
}
/* Send the error out to userspace via KVM_RUN */ return rc; case H_LOGICAL_CI_LOAD:
ret = kvmppc_h_logical_ci_load(vcpu); if (ret == H_TOO_HARD) return RESUME_HOST; break; case H_LOGICAL_CI_STORE:
ret = kvmppc_h_logical_ci_store(vcpu); if (ret == H_TOO_HARD) return RESUME_HOST; break; case H_SET_MODE:
ret = kvmppc_h_set_mode(vcpu, kvmppc_get_gpr(vcpu, 4),
kvmppc_get_gpr(vcpu, 5),
kvmppc_get_gpr(vcpu, 6),
kvmppc_get_gpr(vcpu, 7)); if (ret == H_TOO_HARD) return RESUME_HOST; break; case H_XIRR: case H_CPPR: case H_EOI: case H_IPI: case H_IPOLL: case H_XIRR_X: if (kvmppc_xics_enabled(vcpu)) { if (xics_on_xive()) {
ret = H_NOT_AVAILABLE; return RESUME_GUEST;
}
ret = kvmppc_xics_hcall(vcpu, req); break;
} return RESUME_HOST; case H_SET_DABR:
ret = kvmppc_h_set_dabr(vcpu, kvmppc_get_gpr(vcpu, 4)); break; case H_SET_XDABR:
ret = kvmppc_h_set_xdabr(vcpu, kvmppc_get_gpr(vcpu, 4),
kvmppc_get_gpr(vcpu, 5)); break; #ifdef CONFIG_SPAPR_TCE_IOMMU case H_GET_TCE:
ret = kvmppc_h_get_tce(vcpu, kvmppc_get_gpr(vcpu, 4),
kvmppc_get_gpr(vcpu, 5)); if (ret == H_TOO_HARD) return RESUME_HOST; break; case H_PUT_TCE:
ret = kvmppc_h_put_tce(vcpu, kvmppc_get_gpr(vcpu, 4),
kvmppc_get_gpr(vcpu, 5),
kvmppc_get_gpr(vcpu, 6)); if (ret == H_TOO_HARD) return RESUME_HOST; break; case H_PUT_TCE_INDIRECT:
ret = kvmppc_h_put_tce_indirect(vcpu, kvmppc_get_gpr(vcpu, 4),
kvmppc_get_gpr(vcpu, 5),
kvmppc_get_gpr(vcpu, 6),
kvmppc_get_gpr(vcpu, 7)); if (ret == H_TOO_HARD) return RESUME_HOST; break; case H_STUFF_TCE:
ret = kvmppc_h_stuff_tce(vcpu, kvmppc_get_gpr(vcpu, 4),
kvmppc_get_gpr(vcpu, 5),
kvmppc_get_gpr(vcpu, 6),
kvmppc_get_gpr(vcpu, 7)); if (ret == H_TOO_HARD) return RESUME_HOST; break; #endif case H_RANDOM: { unsignedlong rand;
if (!arch_get_random_seed_longs(&rand, 1))
ret = H_HARDWARE;
kvmppc_set_gpr(vcpu, 4, rand); break;
} case H_RPT_INVALIDATE:
ret = kvmppc_h_rpt_invalidate(vcpu, kvmppc_get_gpr(vcpu, 4),
kvmppc_get_gpr(vcpu, 5),
kvmppc_get_gpr(vcpu, 6),
kvmppc_get_gpr(vcpu, 7),
kvmppc_get_gpr(vcpu, 8),
kvmppc_get_gpr(vcpu, 9)); break;
case H_SET_PARTITION_TABLE:
ret = H_FUNCTION; if (nesting_enabled(kvm))
ret = kvmhv_set_partition_table(vcpu); break; case H_ENTER_NESTED:
ret = H_FUNCTION; if (!nesting_enabled(kvm)) break;
ret = kvmhv_enter_nested_guest(vcpu); if (ret == H_INTERRUPT) {
kvmppc_set_gpr(vcpu, 3, 0);
vcpu->arch.hcall_needed = 0; return -EINTR;
} elseif (ret == H_TOO_HARD) {
kvmppc_set_gpr(vcpu, 3, 0);
vcpu->arch.hcall_needed = 0; return RESUME_HOST;
} break; case H_TLB_INVALIDATE:
ret = H_FUNCTION; if (nesting_enabled(kvm))
ret = kvmhv_do_nested_tlbie(vcpu); break; case H_COPY_TOFROM_GUEST:
ret = H_FUNCTION; if (nesting_enabled(kvm))
ret = kvmhv_copy_tofrom_guest_nested(vcpu); break; case H_PAGE_INIT:
ret = kvmppc_h_page_init(vcpu, kvmppc_get_gpr(vcpu, 4),
kvmppc_get_gpr(vcpu, 5),
kvmppc_get_gpr(vcpu, 6)); break; case H_SVM_PAGE_IN:
ret = H_UNSUPPORTED; if (kvmppc_get_srr1(vcpu) & MSR_S)
ret = kvmppc_h_svm_page_in(kvm,
kvmppc_get_gpr(vcpu, 4),
kvmppc_get_gpr(vcpu, 5),
kvmppc_get_gpr(vcpu, 6)); break; case H_SVM_PAGE_OUT:
ret = H_UNSUPPORTED; if (kvmppc_get_srr1(vcpu) & MSR_S)
ret = kvmppc_h_svm_page_out(kvm,
kvmppc_get_gpr(vcpu, 4),
kvmppc_get_gpr(vcpu, 5),
kvmppc_get_gpr(vcpu, 6)); break; case H_SVM_INIT_START:
ret = H_UNSUPPORTED; if (kvmppc_get_srr1(vcpu) & MSR_S)
ret = kvmppc_h_svm_init_start(kvm); break; case H_SVM_INIT_DONE:
ret = H_UNSUPPORTED; if (kvmppc_get_srr1(vcpu) & MSR_S)
ret = kvmppc_h_svm_init_done(kvm); break; case H_SVM_INIT_ABORT: /* * Even if that call is made by the Ultravisor, the SRR1 value * is the guest context one, with the secure bit clear as it has * not yet been secured. So we can't check it here. * Instead the kvm->arch.secure_guest flag is checked inside * kvmppc_h_svm_init_abort().
*/
ret = kvmppc_h_svm_init_abort(kvm); break;
/* * Handle H_CEDE in the P9 path where we don't call the real-mode hcall * handlers in book3s_hv_rmhandlers.S. * * This has to be done early, not in kvmppc_pseries_do_hcall(), so * that the cede logic in kvmppc_run_single_vcpu() works properly.
*/ staticvoid kvmppc_cede(struct kvm_vcpu *vcpu)
{
__kvmppc_set_msr_hv(vcpu, __kvmppc_get_msr_hv(vcpu) | MSR_EE);
vcpu->arch.ceded = 1;
smp_mb(); if (vcpu->arch.prodded) {
vcpu->arch.prodded = 0;
smp_mb();
vcpu->arch.ceded = 0;
}
}
staticint kvmppc_hcall_impl_hv(unsignedlong cmd)
{ switch (cmd) { case H_CEDE: case H_PROD: case H_CONFER: case H_REGISTER_VPA: case H_SET_MODE: #ifdef CONFIG_SPAPR_TCE_IOMMU case H_GET_TCE: case H_PUT_TCE: case H_PUT_TCE_INDIRECT: case H_STUFF_TCE: #endif case H_LOGICAL_CI_LOAD: case H_LOGICAL_CI_STORE: #ifdef CONFIG_KVM_XICS case H_XIRR: case H_CPPR: case H_EOI: case H_IPI: case H_IPOLL: case H_XIRR_X: #endif case H_PAGE_INIT: case H_RPT_INVALIDATE: return 1;
}
/* See if it's in the real-mode table */ return kvmppc_hcall_impl_hv_realmode(cmd);
}
if (kvmppc_get_last_inst(vcpu, INST_GENERIC, &last_inst) !=
EMULATE_DONE) { /* * Fetch failed, so return to guest and * try executing it again.
*/ return RESUME_GUEST;
}
nthreads = vcpu->kvm->arch.emul_smt_mode;
dpdes = 0;
cpu = vcpu->vcpu_id & ~(nthreads - 1); for (thr = 0; thr < nthreads; ++thr, ++cpu) {
v = kvmppc_find_vcpu(vcpu->kvm, cpu); if (!v) continue; /* * If the vcpu is currently running on a physical cpu thread, * interrupt it in order to pull it out of the guest briefly, * which will update its vcore->dpdes value.
*/
pcpu = READ_ONCE(v->cpu); if (pcpu >= 0)
smp_call_function_single(pcpu, do_nothing, NULL, 1); if (kvmppc_doorbell_pending(v))
dpdes |= 1 << thr;
} return dpdes;
}
/* * On POWER9, emulate doorbell-related instructions in order to * give the guest the illusion of running on a multi-threaded core. * The instructions emulated are msgsndp, msgclrp, mfspr TIR, * and mfspr DPDES.
*/ staticint kvmppc_emulate_doorbell_instr(struct kvm_vcpu *vcpu)
{
u32 inst, rb, thr; unsignedlong arg; struct kvm *kvm = vcpu->kvm; struct kvm_vcpu *tvcpu;
ppc_inst_t pinst;
/* * If the lppaca had pmcregs_in_use clear when we exited the guest, then * HFSCR_PM is cleared for next entry. If the guest then tries to access * the PMU SPRs, we get this facility unavailable interrupt. Putting HFSCR_PM * back in the guest HFSCR will cause the next entry to load the PMU SPRs and * allow the guest access to continue.
*/ staticint kvmppc_pmu_unavailable(struct kvm_vcpu *vcpu)
{ if (!(vcpu->arch.hfscr_permitted & HFSCR_PM)) return EMULATE_FAIL;
/*
 * Decide what to do after a guest exit, based on vcpu->arch.trap.
 *
 * @vcpu: the vcpu that exited; its run structure (vcpu->run) is updated
 *        with the exit reason.
 * @tsk:  not referenced anywhere in the visible body.
 *
 * Returns a RESUME_* code. Most cases produce RESUME_GUEST or RESUME_HOST;
 * storage faults can produce RESUME_PAGE_FAULT, BOOK3S_INTERRUPT_HV_RM_HARD
 * produces RESUME_PASSTHROUGH, and the debug/emulation helpers' return
 * values are passed through unchanged.
 */
staticint kvmppc_handle_exit_hv(struct kvm_vcpu *vcpu, struct task_struct *tsk)
{ struct kvm_run *run = vcpu->run; int r = RESUME_HOST;
vcpu->stat.sum_exits++;
/* * This can happen if an interrupt occurs in the last stages * of guest entry or the first stages of guest exit (i.e. after * setting paca->kvm_hstate.in_guest to KVM_GUEST_MODE_GUEST_HV * and before setting it to KVM_GUEST_MODE_HOST_HV). * That can happen due to a bug, or due to a machine check * occurring at just the wrong time.
*/ if (!kvmhv_is_nestedv2() && (__kvmppc_get_msr_hv(vcpu) & MSR_HV)) {
printk(KERN_EMERG "KVM trap in HV mode!\n");
printk(KERN_EMERG "trap=0x%x | pc=0x%lx | msr=0x%llx\n",
vcpu->arch.trap, kvmppc_get_pc(vcpu),
vcpu->arch.shregs.msr);
kvmppc_dump_regs(vcpu);
run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
run->hw.hardware_exit_reason = vcpu->arch.trap; return RESUME_HOST;
}
/* Default exit reason; individual cases below override it where needed. */
run->exit_reason = KVM_EXIT_UNKNOWN;
run->ready_for_interrupt_injection = 1; switch (vcpu->arch.trap) { /* We're good on these - the host merely wanted to get our attention */ case BOOK3S_INTERRUPT_NESTED_HV_DECREMENTER:
WARN_ON_ONCE(1); /* Should never happen */
vcpu->arch.trap = BOOK3S_INTERRUPT_HV_DECREMENTER;
fallthrough; case BOOK3S_INTERRUPT_HV_DECREMENTER:
vcpu->stat.dec_exits++;
r = RESUME_GUEST; break; case BOOK3S_INTERRUPT_EXTERNAL: case BOOK3S_INTERRUPT_H_DOORBELL: case BOOK3S_INTERRUPT_H_VIRT:
vcpu->stat.ext_intr_exits++;
r = RESUME_GUEST; break; /* SR/HMI/PMI are HV interrupts that host has handled. Resume guest.*/ case BOOK3S_INTERRUPT_HMI: case BOOK3S_INTERRUPT_PERFMON: case BOOK3S_INTERRUPT_SYSTEM_RESET:
r = RESUME_GUEST; break; case BOOK3S_INTERRUPT_MACHINE_CHECK: { static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL,
DEFAULT_RATELIMIT_BURST); /* * Print the MCE event to host console. Ratelimit so the guest * can't flood the host log.
*/ if (__ratelimit(&rs))
machine_check_print_event_info(&vcpu->arch.mce_evt,false, true);
/* * If the guest can do FWNMI, exit to userspace so it can * deliver a FWNMI to the guest. * Otherwise we synthesize a machine check for the guest * so that it knows that the machine check occurred.
*/ if (!vcpu->kvm->arch.fwnmi_enabled) {
ulong flags = (__kvmppc_get_msr_hv(vcpu) & 0x083c0000) |
(kvmppc_get_msr(vcpu) & SRR1_PREFIXED);
kvmppc_core_queue_machine_check(vcpu, flags);
r = RESUME_GUEST; break;
}
/* Exit to guest with KVM_EXIT_NMI as exit reason */
run->exit_reason = KVM_EXIT_NMI;
run->hw.hardware_exit_reason = vcpu->arch.trap; /* Clear out the old NMI status from run->flags */
run->flags &= ~KVM_RUN_PPC_NMI_DISP_MASK; /* Now set the NMI status */ if (vcpu->arch.mce_evt.disposition == MCE_DISPOSITION_RECOVERED)
run->flags |= KVM_RUN_PPC_NMI_DISP_FULLY_RECOV; else
run->flags |= KVM_RUN_PPC_NMI_DISP_NOT_RECOV;
r = RESUME_HOST; break;
} case BOOK3S_INTERRUPT_PROGRAM:
{
ulong flags; /* * Normally program interrupts are delivered directly * to the guest by the hardware, but we can get here * as a result of a hypervisor emulation interrupt * (e40) getting turned into a 700 by BML RTAS.
*/
flags = (__kvmppc_get_msr_hv(vcpu) & 0x1f0000ull) |
(kvmppc_get_msr(vcpu) & SRR1_PREFIXED);
kvmppc_core_queue_program(vcpu, flags);
r = RESUME_GUEST; break;
} case BOOK3S_INTERRUPT_SYSCALL:
{ int i;
if (!kvmhv_is_nestedv2() && unlikely(__kvmppc_get_msr_hv(vcpu) & MSR_PR)) { /* * Guest userspace executed sc 1. This can only be * reached by the P9 path because the old path * handles this case in realmode hcall handlers.
*/ if (!kvmhv_vcpu_is_radix(vcpu)) { /* * A guest could be running PR KVM, so this * may be a PR KVM hcall. It must be reflected * to the guest kernel as a sc interrupt.
*/
kvmppc_core_queue_syscall(vcpu);
} else { /* * Radix guests can not run PR KVM or nested HV * hash guests which might run PR KVM, so this * is always a privilege fault. Send a program * check to guest kernel.
*/
kvmppc_core_queue_program(vcpu, SRR1_PROGPRIV);
}
r = RESUME_GUEST; break;
}
/* * hcall - gather args and set exit_reason. This will next be * handled by kvmppc_pseries_do_hcall which may be able to deal * with it and resume guest, or may punt to userspace.
*/
/* The hcall number comes from GPR3 and its nine arguments from GPR4..GPR12. */
run->papr_hcall.nr = kvmppc_get_gpr(vcpu, 3); for (i = 0; i < 9; ++i)
run->papr_hcall.args[i] = kvmppc_get_gpr(vcpu, 4 + i);
run->exit_reason = KVM_EXIT_PAPR_HCALL;
vcpu->arch.hcall_needed = 1;
r = RESUME_HOST; break;
} /* * We get these next two if the guest accesses a page which it thinks * it has mapped but which is not actually present, either because * it is for an emulated I/O device or because the corresponding * host page has been paged out. * * Any other HDSI/HISI interrupts have been handled already for P7/8 * guests. For POWER9 hash guests not using rmhandlers, basic hash * fault handling is done here.
*/ case BOOK3S_INTERRUPT_H_DATA_STORAGE: { unsignedlong vsid; long err;
if (cpu_has_feature(CPU_FTR_P9_RADIX_PREFETCH_BUG) &&
unlikely(vcpu->arch.fault_dsisr == HDSISR_CANARY)) {
r = RESUME_GUEST; /* Just retry if it's the canary */ break;
}
if (kvm_is_radix(vcpu->kvm) || !cpu_has_feature(CPU_FTR_ARCH_300)) { /* * Radix doesn't require anything, and pre-ISAv3.0 hash * already attempted to handle this in rmhandlers. The * hash fault handling below is v3 only (it uses ASDR * via fault_gpa).
*/
r = RESUME_PAGE_FAULT; break;
}
/* Faults that are neither "no HPTE" nor protection faults are reflected to the guest. */
if (!(vcpu->arch.fault_dsisr & (DSISR_NOHPTE | DSISR_PROTFAULT))) {
kvmppc_core_queue_data_storage(vcpu,
kvmppc_get_msr(vcpu) & SRR1_PREFIXED,
vcpu->arch.fault_dar, vcpu->arch.fault_dsisr);
r = RESUME_GUEST; break;
}
/*
 * NOTE(review): no assignment to vsid is visible between its declaration
 * and its use as an argument below, and a failure in this data-storage
 * case is queued through kvmppc_core_queue_inst_storage(). The comment
 * above also speaks of "these next two" cases while only one case label
 * is visible. Confirm against the full file that no lines are missing
 * here.
 */
err = kvmppc_hpte_hv_fault(vcpu, vcpu->arch.fault_dar,
vsid, vcpu->arch.fault_dsisr, false); if (err == 0) {
r = RESUME_GUEST;
} elseif (err == -1) {
r = RESUME_PAGE_FAULT;
} else {
kvmppc_core_queue_inst_storage(vcpu,
err | (kvmppc_get_msr(vcpu) & SRR1_PREFIXED));
r = RESUME_GUEST;
} break;
}
/* * This occurs if the guest executes an illegal instruction. * If the guest debug is disabled, generate a program interrupt * to the guest. If guest debug is enabled, we need to check * whether the instruction is a software breakpoint instruction. * Accordingly return to Guest or Host.
*/ case BOOK3S_INTERRUPT_H_EMUL_ASSIST: if (vcpu->arch.emul_inst != KVM_INST_FETCH_FAILED)
vcpu->arch.last_inst = kvmppc_need_byteswap(vcpu) ?
swab32(vcpu->arch.emul_inst) :
vcpu->arch.emul_inst; if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) {
r = kvmppc_emulate_debug_inst(vcpu);
} else {
kvmppc_core_queue_program(vcpu, SRR1_PROGILL |
(kvmppc_get_msr(vcpu) & SRR1_PREFIXED));
r = RESUME_GUEST;
} break;
#ifdef CONFIG_PPC_TRANSACTIONAL_MEM case BOOK3S_INTERRUPT_HV_SOFTPATCH: /* * This occurs for various TM-related instructions that * we need to emulate on POWER9 DD2.2. We have already * handled the cases where the guest was in real-suspend * mode and was transitioning to transactional state.
*/
r = kvmhv_p9_tm_emulation(vcpu); if (r != -1) break;
fallthrough; /* go to facility unavailable handler */ #endif
/* * This occurs if the guest (kernel or userspace), does something that * is prohibited by HFSCR. * On POWER9, this could be a doorbell instruction that we need * to emulate. * Otherwise, we just generate a program interrupt to the guest.
*/ case BOOK3S_INTERRUPT_H_FAC_UNAVAIL: {
/* The cause code is taken from the top byte of the guest HFSCR value. */
u64 cause = kvmppc_get_hfscr_hv(vcpu) >> 56;
r = EMULATE_FAIL; if (cpu_has_feature(CPU_FTR_ARCH_300)) { switch (cause) { case FSCR_MSGP_LG:
r = kvmppc_emulate_doorbell_instr(vcpu); break; case FSCR_PM_LG:
r = kvmppc_pmu_unavailable(vcpu); break; case FSCR_EBB_LG:
r = kvmppc_ebb_unavailable(vcpu); break; case FSCR_TM_LG:
r = kvmppc_tm_unavailable(vcpu); break; default: break;
}
} if (r == EMULATE_FAIL) {
kvmppc_core_queue_program(vcpu, SRR1_PROGILL |
(kvmppc_get_msr(vcpu) & SRR1_PREFIXED));
r = RESUME_GUEST;
} break;
}
case BOOK3S_INTERRUPT_HV_RM_HARD:
r = RESUME_PASSTHROUGH; break; default:
/* Unrecognised trap: dump state and hand the raw trap number to userspace. */
kvmppc_dump_regs(vcpu);
printk(KERN_EMERG "trap=0x%x | pc=0x%lx | msr=0x%llx\n",
vcpu->arch.trap, kvmppc_get_pc(vcpu),
__kvmppc_get_msr_hv(vcpu));
run->hw.hardware_exit_reason = vcpu->arch.trap;
r = RESUME_HOST; break;
}
return r;
}
staticint kvmppc_handle_nested_exit(struct kvm_vcpu *vcpu)
{ int r; int srcu_idx;
vcpu->stat.sum_exits++;
/* * This can happen if an interrupt occurs in the last stages * of guest entry or the first stages of guest exit (i.e. after * setting paca->kvm_hstate.in_guest to KVM_GUEST_MODE_GUEST_HV * and before setting it to KVM_GUEST_MODE_HOST_HV). * That can happen due to a bug, or due to a machine check * occurring at just the wrong time.
*/ if (__kvmppc_get_msr_hv(vcpu) & MSR_HV) {
pr_emerg("KVM trap in HV mode while nested!\n");
pr_emerg("trap=0x%x | pc=0x%lx | msr=0x%llx\n",
vcpu->arch.trap, kvmppc_get_pc(vcpu),
__kvmppc_get_msr_hv(vcpu));
kvmppc_dump_regs(vcpu); return RESUME_HOST;
} switch (vcpu->arch.trap) { /* We're good on these - the host merely wanted to get our attention */ case BOOK3S_INTERRUPT_HV_DECREMENTER:
vcpu->stat.dec_exits++;
r = RESUME_GUEST; break; case BOOK3S_INTERRUPT_EXTERNAL:
vcpu->stat.ext_intr_exits++;
r = RESUME_HOST; break; case BOOK3S_INTERRUPT_H_DOORBELL: case BOOK3S_INTERRUPT_H_VIRT:
vcpu->stat.ext_intr_exits++;
r = RESUME_GUEST; break; /* These need to go to the nested HV */ case BOOK3S_INTERRUPT_NESTED_HV_DECREMENTER:
vcpu->arch.trap = BOOK3S_INTERRUPT_HV_DECREMENTER;
vcpu->stat.dec_exits++;
r = RESUME_HOST; break; /* SR/HMI/PMI are HV interrupts that host has handled. Resume guest.*/ case BOOK3S_INTERRUPT_HMI: case BOOK3S_INTERRUPT_PERFMON: case BOOK3S_INTERRUPT_SYSTEM_RESET:
r = RESUME_GUEST; break; case BOOK3S_INTERRUPT_MACHINE_CHECK:
{ static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL,
DEFAULT_RATELIMIT_BURST); /* Pass the machine check to the L1 guest */
r = RESUME_HOST; /* Print the MCE event to host console. */ if (__ratelimit(&rs))
machine_check_print_event_info(&vcpu->arch.mce_evt, false, true); break;
} /* * We get these next two if the guest accesses a page which it thinks * it has mapped but which is not actually present, either because * it is for an emulated I/O device or because the corresonding * host page has been paged out.
*/ case BOOK3S_INTERRUPT_H_DATA_STORAGE:
srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
r = kvmhv_nested_page_fault(vcpu);
srcu_read_unlock(&vcpu->kvm->srcu, srcu_idx); break; case BOOK3S_INTERRUPT_H_INST_STORAGE:
vcpu->arch.fault_dar = kvmppc_get_pc(vcpu);
vcpu->arch.fault_dsisr = kvmppc_get_msr(vcpu) &
DSISR_SRR1_MATCH_64S; if (__kvmppc_get_msr_hv(vcpu) & HSRR1_HISI_WRITE)
vcpu->arch.fault_dsisr |= DSISR_ISSTORE;
srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
r = kvmhv_nested_page_fault(vcpu);
srcu_read_unlock(&vcpu->kvm->srcu, srcu_idx); break;
#ifdef CONFIG_PPC_TRANSACTIONAL_MEM case BOOK3S_INTERRUPT_HV_SOFTPATCH: /* * This occurs for various TM-related instructions that * we need to emulate on POWER9 DD2.2. We have already * handled the cases where the guest was in real-suspend * mode and was transitioning to transactional state.
*/
r = kvmhv_p9_tm_emulation(vcpu); if (r != -1) break;
fallthrough; /* go to facility unavailable handler */ #endif
case BOOK3S_INTERRUPT_H_FAC_UNAVAIL:
r = RESUME_HOST; break;
case BOOK3S_INTERRUPT_HV_RM_HARD:
vcpu->arch.trap = 0;
r = RESUME_GUEST; if (!xics_on_xive())
kvmppc_xics_rm_complete(vcpu, 0); break; case BOOK3S_INTERRUPT_SYSCALL:
{ unsignedlong req = kvmppc_get_gpr(vcpu, 3);
/* * The H_RPT_INVALIDATE hcalls issued by nested * guests for process-scoped invalidations when * GTSE=0, are handled here in L0.
*/ if (req == H_RPT_INVALIDATE) {
r = kvmppc_nested_h_rpt_invalidate(vcpu); break;
}
r = RESUME_HOST; break;
} default:
r = RESUME_HOST; break;
}
return r;
}
/*
 * Export the vcpu's special-register state into *sregs.
 *
 * The whole structure is zeroed first; then the PVR and the first
 * vcpu->arch.slb_max saved SLB entries (orige/origv) are copied out.
 * Always returns 0.
 */
static int kvm_arch_vcpu_ioctl_get_sregs_hv(struct kvm_vcpu *vcpu,
					    struct kvm_sregs *sregs)
{
	int n = 0;

	memset(sregs, 0, sizeof(*sregs));
	sregs->pvr = vcpu->arch.pvr;

	while (n < vcpu->arch.slb_max) {
		sregs->u.s.ppc64.slb[n].slbe = vcpu->arch.slb[n].orige;
		sregs->u.s.ppc64.slb[n].slbv = vcpu->arch.slb[n].origv;
		n++;
	}

	return 0;
}
/*
 * Load special-register state from *sregs into the vcpu.
 *
 * The supplied PVR must equal the vcpu's current PVR (it cannot be
 * spoofed), otherwise -EINVAL is returned and nothing is modified.
 * Of the first vcpu->arch.slb_nr entries supplied, only those with
 * SLB_ESID_V set are kept; they are packed to the front of
 * vcpu->arch.slb[] and slb_max is set to the number kept.
 * Returns 0 on success.
 */
static int kvm_arch_vcpu_ioctl_set_sregs_hv(struct kvm_vcpu *vcpu,
					    struct kvm_sregs *sregs)
{
	int src;
	int dst = 0;

	if (sregs->pvr != vcpu->arch.pvr)
		return -EINVAL;

	for (src = 0; src < vcpu->arch.slb_nr; src++) {
		/* Skip entries that are not marked valid */
		if (!(sregs->u.s.ppc64.slb[src].slbe & SLB_ESID_V))
			continue;

		vcpu->arch.slb[dst].orige = sregs->u.s.ppc64.slb[src].slbe;
		vcpu->arch.slb[dst].origv = sregs->u.s.ppc64.slb[src].slbv;
		dst++;
	}
	vcpu->arch.slb_max = dst;

	return 0;
}
/* * Enforce limits on guest LPCR values based on hardware availability, * guest configuration, and possibly hypervisor support and security * concerns.
*/ unsignedlong kvmppc_filter_lpcr_hv(struct kvm *kvm, unsignedlong lpcr)
{ /* LPCR_TC only applies to HPT guests */ if (kvm_is_radix(kvm))
lpcr &= ~LPCR_TC;
/* On POWER8 and above, userspace can modify AIL */ if (!cpu_has_feature(CPU_FTR_ARCH_207S))
lpcr &= ~LPCR_AIL; if ((lpcr & LPCR_AIL) != LPCR_AIL_3)
lpcr &= ~LPCR_AIL; /* LPCR[AIL]=1/2 is disallowed */ /* * On some POWER9s we force AIL off for radix guests to prevent * executing in MSR[HV]=1 mode with the MMU enabled and PIDR set to * guest, which can result in Q0 translations with LPID=0 PID=PIDR to * be cached, which the host TLB management does not expect.
*/ if (kvm_is_radix(kvm) && cpu_has_feature(CPU_FTR_P9_RADIX_PREFETCH_BUG))
lpcr &= ~LPCR_AIL;
/* * On POWER9, allow userspace to enable large decrementer for the * guest, whether or not the host has it enabled.
*/ if (!cpu_has_feature(CPU_FTR_ARCH_300))
lpcr &= ~LPCR_LD;
/* * If ILE (interrupt little-endian) has changed, update the * MSR_LE bit in the intr_msr for each vcpu in this vcore.
*/ if ((new_lpcr & LPCR_ILE) != (vc->lpcr & LPCR_ILE)) { struct kvm_vcpu *vcpu; unsignedlong i;
/*
 * Read one register, selected by @id, into *@val.
 *
 * Returns 0 on success, -EINVAL if @id is not handled here, or -ENXIO
 * for the transactional-memory vector registers when the CPU does not
 * have CPU_FTR_ALTIVEC.
 */
static int kvmppc_get_one_reg_hv(struct kvm_vcpu *vcpu, u64 id,
				 union kvmppc_one_reg *val)
{
	long int idx;

	switch (id) {
	case KVM_REG_PPC_DEBUG_INST:
		*val = get_reg_val(id, KVMPPC_INST_SW_BREAKPOINT);
		break;
	case KVM_REG_PPC_HIOR:
		/* Always reads back as zero */
		*val = get_reg_val(id, 0);
		break;
	case KVM_REG_PPC_DABR:
		*val = get_reg_val(id, vcpu->arch.dabr);
		break;
	case KVM_REG_PPC_DABRX:
		*val = get_reg_val(id, vcpu->arch.dabrx);
		break;
	case KVM_REG_PPC_DSCR:
		*val = get_reg_val(id, kvmppc_get_dscr_hv(vcpu));
		break;
	case KVM_REG_PPC_PURR:
		*val = get_reg_val(id, kvmppc_get_purr_hv(vcpu));
		break;
	case KVM_REG_PPC_SPURR:
		*val = get_reg_val(id, kvmppc_get_spurr_hv(vcpu));
		break;
	case KVM_REG_PPC_AMR:
		*val = get_reg_val(id, kvmppc_get_amr_hv(vcpu));
		break;
	case KVM_REG_PPC_UAMOR:
		*val = get_reg_val(id, kvmppc_get_uamor_hv(vcpu));
		break;

	/* Performance monitor registers */
	case KVM_REG_PPC_MMCR0 ... KVM_REG_PPC_MMCR1:
		idx = id - KVM_REG_PPC_MMCR0;
		*val = get_reg_val(id, kvmppc_get_mmcr_hv(vcpu, idx));
		break;
	case KVM_REG_PPC_MMCR2:
		*val = get_reg_val(id, kvmppc_get_mmcr_hv(vcpu, 2));
		break;
	case KVM_REG_PPC_MMCRA:
		*val = get_reg_val(id, kvmppc_get_mmcra_hv(vcpu));
		break;
	case KVM_REG_PPC_MMCRS:
		*val = get_reg_val(id, vcpu->arch.mmcrs);
		break;
	case KVM_REG_PPC_MMCR3:
		*val = get_reg_val(id, kvmppc_get_mmcr_hv(vcpu, 3));
		break;
	case KVM_REG_PPC_PMC1 ... KVM_REG_PPC_PMC8:
		idx = id - KVM_REG_PPC_PMC1;
		*val = get_reg_val(id, kvmppc_get_pmc_hv(vcpu, idx));
		break;
	case KVM_REG_PPC_SPMC1 ... KVM_REG_PPC_SPMC2:
		idx = id - KVM_REG_PPC_SPMC1;
		*val = get_reg_val(id, vcpu->arch.spmc[idx]);
		break;
	case KVM_REG_PPC_SIAR:
		*val = get_reg_val(id, kvmppc_get_siar_hv(vcpu));
		break;
	case KVM_REG_PPC_SDAR:
		*val = get_reg_val(id, kvmppc_get_sdar_hv(vcpu));
		break;
	case KVM_REG_PPC_SIER:
		*val = get_reg_val(id, kvmppc_get_sier_hv(vcpu, 0));
		break;
	case KVM_REG_PPC_SIER2:
		*val = get_reg_val(id, kvmppc_get_sier_hv(vcpu, 1));
		break;
	case KVM_REG_PPC_SIER3:
		*val = get_reg_val(id, kvmppc_get_sier_hv(vcpu, 2));
		break;

	case KVM_REG_PPC_IAMR:
		*val = get_reg_val(id, kvmppc_get_iamr_hv(vcpu));
		break;
	case KVM_REG_PPC_PSPB:
		*val = get_reg_val(id, kvmppc_get_pspb_hv(vcpu));
		break;
	case KVM_REG_PPC_DPDES:
		/*
		 * On POWER9, where we are emulating msgsndp etc.,
		 * we return 1 bit for each vcpu, which can come from
		 * either vcore->dpdes or doorbell_request.
		 * On POWER8, doorbell_request is 0.
		 */
		if (cpu_has_feature(CPU_FTR_ARCH_300))
			*val = get_reg_val(id, vcpu->arch.doorbell_request);
		else
			*val = get_reg_val(id, vcpu->arch.vcore->dpdes);
		break;
	case KVM_REG_PPC_VTB:
		*val = get_reg_val(id, kvmppc_get_vtb(vcpu));
		break;

	/* Watchpoint / breakpoint registers */
	case KVM_REG_PPC_DAWR:
		*val = get_reg_val(id, kvmppc_get_dawr0_hv(vcpu));
		break;
	case KVM_REG_PPC_DAWRX:
		*val = get_reg_val(id, kvmppc_get_dawrx0_hv(vcpu));
		break;
	case KVM_REG_PPC_DAWR1:
		*val = get_reg_val(id, kvmppc_get_dawr1_hv(vcpu));
		break;
	case KVM_REG_PPC_DAWRX1:
		*val = get_reg_val(id, kvmppc_get_dawrx1_hv(vcpu));
		break;
	case KVM_REG_PPC_DEXCR:
		*val = get_reg_val(id, kvmppc_get_dexcr_hv(vcpu));
		break;
	case KVM_REG_PPC_HASHKEYR:
		*val = get_reg_val(id, kvmppc_get_hashkeyr_hv(vcpu));
		break;
	case KVM_REG_PPC_HASHPKEYR:
		*val = get_reg_val(id, kvmppc_get_hashpkeyr_hv(vcpu));
		break;
	case KVM_REG_PPC_CIABR:
		*val = get_reg_val(id, kvmppc_get_ciabr_hv(vcpu));
		break;

	case KVM_REG_PPC_CSIGR:
		*val = get_reg_val(id, vcpu->arch.csigr);
		break;
	case KVM_REG_PPC_TACR:
		*val = get_reg_val(id, vcpu->arch.tacr);
		break;
	case KVM_REG_PPC_TCSCR:
		*val = get_reg_val(id, vcpu->arch.tcscr);
		break;
	case KVM_REG_PPC_PID:
		*val = get_reg_val(id, kvmppc_get_pid(vcpu));
		break;
	case KVM_REG_PPC_ACOP:
		*val = get_reg_val(id, vcpu->arch.acop);
		break;
	case KVM_REG_PPC_WORT:
		*val = get_reg_val(id, kvmppc_get_wort_hv(vcpu));
		break;
	case KVM_REG_PPC_TIDR:
		*val = get_reg_val(id, vcpu->arch.tid);
		break;
	case KVM_REG_PPC_PSSCR:
		*val = get_reg_val(id, vcpu->arch.psscr);
		break;

	/* VPA-related areas are sampled under vpa_update_lock */
	case KVM_REG_PPC_VPA_ADDR:
		spin_lock(&vcpu->arch.vpa_update_lock);
		*val = get_reg_val(id, vcpu->arch.vpa.next_gpa);
		spin_unlock(&vcpu->arch.vpa_update_lock);
		break;
	case KVM_REG_PPC_VPA_SLB:
		spin_lock(&vcpu->arch.vpa_update_lock);
		val->vpaval.addr = vcpu->arch.slb_shadow.next_gpa;
		val->vpaval.length = vcpu->arch.slb_shadow.len;
		spin_unlock(&vcpu->arch.vpa_update_lock);
		break;
	case KVM_REG_PPC_VPA_DTL:
		spin_lock(&vcpu->arch.vpa_update_lock);
		val->vpaval.addr = vcpu->arch.dtl.next_gpa;
		val->vpaval.length = vcpu->arch.dtl.len;
		spin_unlock(&vcpu->arch.vpa_update_lock);
		break;

	case KVM_REG_PPC_TB_OFFSET:
		*val = get_reg_val(id, kvmppc_get_tb_offset(vcpu));
		break;
	case KVM_REG_PPC_LPCR:
	case KVM_REG_PPC_LPCR_64:
		*val = get_reg_val(id, kvmppc_get_lpcr(vcpu));
		break;
	case KVM_REG_PPC_PPR:
		*val = get_reg_val(id, kvmppc_get_ppr_hv(vcpu));
		break;
#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
	case KVM_REG_PPC_TFHAR:
		*val = get_reg_val(id, vcpu->arch.tfhar);
		break;
	case KVM_REG_PPC_TFIAR:
		*val = get_reg_val(id, vcpu->arch.tfiar);
		break;
	case KVM_REG_PPC_TEXASR:
		*val = get_reg_val(id, vcpu->arch.texasr);
		break;
	case KVM_REG_PPC_TM_GPR0 ... KVM_REG_PPC_TM_GPR31:
		idx = id - KVM_REG_PPC_TM_GPR0;
		*val = get_reg_val(id, vcpu->arch.gpr_tm[idx]);
		break;
	case KVM_REG_PPC_TM_VSR0 ... KVM_REG_PPC_TM_VSR63:
	{
		int word;

		idx = id - KVM_REG_PPC_TM_VSR0;
		if (idx >= 32) {
			/* Upper half is served from vr_tm, which needs Altivec */
			if (!cpu_has_feature(CPU_FTR_ALTIVEC))
				return -ENXIO;
			val->vval = vcpu->arch.vr_tm.vr[idx - 32];
			break;
		}
		/* Lower half is served from fp_tm */
		for (word = 0; word < TS_FPRWIDTH; word++)
			val->vsxval[word] = vcpu->arch.fp_tm.fpr[idx][word];
		break;
	}
	case KVM_REG_PPC_TM_CR:
		*val = get_reg_val(id, vcpu->arch.cr_tm);
		break;
	case KVM_REG_PPC_TM_XER:
		*val = get_reg_val(id, vcpu->arch.xer_tm);
		break;
	case KVM_REG_PPC_TM_LR:
		*val = get_reg_val(id, vcpu->arch.lr_tm);
		break;
	case KVM_REG_PPC_TM_CTR:
		*val = get_reg_val(id, vcpu->arch.ctr_tm);
		break;
	case KVM_REG_PPC_TM_FPSCR:
		*val = get_reg_val(id, vcpu->arch.fp_tm.fpscr);
		break;
	case KVM_REG_PPC_TM_AMR:
		*val = get_reg_val(id, vcpu->arch.amr_tm);
		break;
	case KVM_REG_PPC_TM_PPR:
		*val = get_reg_val(id, vcpu->arch.ppr_tm);
		break;
	case KVM_REG_PPC_TM_VRSAVE:
		*val = get_reg_val(id, vcpu->arch.vrsave_tm);
		break;
	case KVM_REG_PPC_TM_VSCR:
		if (!cpu_has_feature(CPU_FTR_ALTIVEC))
			return -ENXIO;
		*val = get_reg_val(id, vcpu->arch.vr_tm.vscr.u[3]);
		break;
	case KVM_REG_PPC_TM_DSCR:
		*val = get_reg_val(id, vcpu->arch.dscr_tm);
		break;
	case KVM_REG_PPC_TM_TAR:
		*val = get_reg_val(id, vcpu->arch.tar_tm);
		break;
#endif
	case KVM_REG_PPC_ARCH_COMPAT:
		*val = get_reg_val(id, kvmppc_get_arch_compat(vcpu));
		break;
	case KVM_REG_PPC_DEC_EXPIRY:
		*val = get_reg_val(id, kvmppc_get_dec_expires(vcpu));
		break;
	case KVM_REG_PPC_ONLINE:
		*val = get_reg_val(id, vcpu->arch.online);
		break;
	case KVM_REG_PPC_PTCR:
		*val = get_reg_val(id, vcpu->kvm->arch.l1_ptcr);
		break;
	case KVM_REG_PPC_FSCR:
		*val = get_reg_val(id, kvmppc_get_fscr_hv(vcpu));
		break;
	default:
		return -EINVAL;
	}

	return 0;
}
/*
 * Write one register, selected by @id, from *@val.
 *
 * Returns 0 on success; -EINVAL if @id is not handled here or the value
 * is rejected (non-zero HIOR, inconsistent VPA/SLB-shadow/DTL
 * registration); -ENXIO for the transactional-memory vector registers
 * when the CPU does not have CPU_FTR_ALTIVEC; otherwise whatever
 * set_vpa() or kvmppc_set_arch_compat() returns for the registers that
 * delegate to them.
 */
static int kvmppc_set_one_reg_hv(struct kvm_vcpu *vcpu, u64 id,
				 union kvmppc_one_reg *val)
{
	int r = 0;
	long int i;
	unsigned long addr, len;

	switch (id) {
	case KVM_REG_PPC_HIOR:
		/* Only allow this to be set to zero */
		if (set_reg_val(id, *val))
			r = -EINVAL;
		break;
	case KVM_REG_PPC_DABR:
		vcpu->arch.dabr = set_reg_val(id, *val);
		break;
	case KVM_REG_PPC_DABRX:
		/* The DABRX_HYP bit is never accepted from userspace */
		vcpu->arch.dabrx = set_reg_val(id, *val) & ~DABRX_HYP;
		break;
	case KVM_REG_PPC_DSCR:
		kvmppc_set_dscr_hv(vcpu, set_reg_val(id, *val));
		break;
	case KVM_REG_PPC_PURR:
		kvmppc_set_purr_hv(vcpu, set_reg_val(id, *val));
		break;
	case KVM_REG_PPC_SPURR:
		kvmppc_set_spurr_hv(vcpu, set_reg_val(id, *val));
		break;
	case KVM_REG_PPC_AMR:
		kvmppc_set_amr_hv(vcpu, set_reg_val(id, *val));
		break;
	case KVM_REG_PPC_UAMOR:
		kvmppc_set_uamor_hv(vcpu, set_reg_val(id, *val));
		break;
	case KVM_REG_PPC_MMCR0 ... KVM_REG_PPC_MMCR1:
		i = id - KVM_REG_PPC_MMCR0;
		kvmppc_set_mmcr_hv(vcpu, i, set_reg_val(id, *val));
		break;
	case KVM_REG_PPC_MMCR2:
		kvmppc_set_mmcr_hv(vcpu, 2, set_reg_val(id, *val));
		break;
	case KVM_REG_PPC_MMCRA:
		kvmppc_set_mmcra_hv(vcpu, set_reg_val(id, *val));
		break;
	case KVM_REG_PPC_MMCRS:
		vcpu->arch.mmcrs = set_reg_val(id, *val);
		break;
	case KVM_REG_PPC_MMCR3:
		kvmppc_set_mmcr_hv(vcpu, 3, set_reg_val(id, *val));
		break;
	case KVM_REG_PPC_PMC1 ... KVM_REG_PPC_PMC8:
		i = id - KVM_REG_PPC_PMC1;
		kvmppc_set_pmc_hv(vcpu, i, set_reg_val(id, *val));
		break;
	case KVM_REG_PPC_SPMC1 ... KVM_REG_PPC_SPMC2:
		i = id - KVM_REG_PPC_SPMC1;
		vcpu->arch.spmc[i] = set_reg_val(id, *val);
		break;
	case KVM_REG_PPC_SIAR:
		kvmppc_set_siar_hv(vcpu, set_reg_val(id, *val));
		break;
	case KVM_REG_PPC_SDAR:
		kvmppc_set_sdar_hv(vcpu, set_reg_val(id, *val));
		break;
	case KVM_REG_PPC_SIER:
		kvmppc_set_sier_hv(vcpu, 0, set_reg_val(id, *val));
		break;
	case KVM_REG_PPC_SIER2:
		kvmppc_set_sier_hv(vcpu, 1, set_reg_val(id, *val));
		break;
	case KVM_REG_PPC_SIER3:
		kvmppc_set_sier_hv(vcpu, 2, set_reg_val(id, *val));
		break;
	case KVM_REG_PPC_IAMR:
		kvmppc_set_iamr_hv(vcpu, set_reg_val(id, *val));
		break;
	case KVM_REG_PPC_PSPB:
		kvmppc_set_pspb_hv(vcpu, set_reg_val(id, *val));
		break;
	case KVM_REG_PPC_DPDES:
		/*
		 * With CPU_FTR_ARCH_300 only bit 0 is kept, in the per-vcpu
		 * doorbell_request; otherwise the whole value goes to the
		 * vcore's dpdes.
		 */
		if (cpu_has_feature(CPU_FTR_ARCH_300))
			vcpu->arch.doorbell_request = set_reg_val(id, *val) & 1;
		else
			vcpu->arch.vcore->dpdes = set_reg_val(id, *val);
		break;
	case KVM_REG_PPC_VTB:
		kvmppc_set_vtb(vcpu, set_reg_val(id, *val));
		break;
	case KVM_REG_PPC_DAWR:
		kvmppc_set_dawr0_hv(vcpu, set_reg_val(id, *val));
		break;
	case KVM_REG_PPC_DAWRX:
		kvmppc_set_dawrx0_hv(vcpu, set_reg_val(id, *val) & ~DAWRX_HYP);
		break;
	case KVM_REG_PPC_DAWR1:
		kvmppc_set_dawr1_hv(vcpu, set_reg_val(id, *val));
		break;
	case KVM_REG_PPC_DAWRX1:
		kvmppc_set_dawrx1_hv(vcpu, set_reg_val(id, *val) & ~DAWRX_HYP);
		break;
	case KVM_REG_PPC_DEXCR:
		kvmppc_set_dexcr_hv(vcpu, set_reg_val(id, *val));
		break;
	case KVM_REG_PPC_HASHKEYR:
		kvmppc_set_hashkeyr_hv(vcpu, set_reg_val(id, *val));
		break;
	case KVM_REG_PPC_HASHPKEYR:
		kvmppc_set_hashpkeyr_hv(vcpu, set_reg_val(id, *val));
		break;
	case KVM_REG_PPC_CIABR:
		kvmppc_set_ciabr_hv(vcpu, set_reg_val(id, *val));
		/* Don't allow setting breakpoints in hypervisor code */
		if ((kvmppc_get_ciabr_hv(vcpu) & CIABR_PRIV) == CIABR_PRIV_HYPER)
			kvmppc_set_ciabr_hv(vcpu, kvmppc_get_ciabr_hv(vcpu) & ~CIABR_PRIV);
		break;
	case KVM_REG_PPC_CSIGR:
		vcpu->arch.csigr = set_reg_val(id, *val);
		break;
	case KVM_REG_PPC_TACR:
		vcpu->arch.tacr = set_reg_val(id, *val);
		break;
	case KVM_REG_PPC_TCSCR:
		vcpu->arch.tcscr = set_reg_val(id, *val);
		break;
	case KVM_REG_PPC_PID:
		kvmppc_set_pid(vcpu, set_reg_val(id, *val));
		break;
	case KVM_REG_PPC_ACOP:
		vcpu->arch.acop = set_reg_val(id, *val);
		break;
	case KVM_REG_PPC_WORT:
		kvmppc_set_wort_hv(vcpu, set_reg_val(id, *val));
		break;
	case KVM_REG_PPC_TIDR:
		vcpu->arch.tid = set_reg_val(id, *val);
		break;
	case KVM_REG_PPC_PSSCR:
		vcpu->arch.psscr = set_reg_val(id, *val) & PSSCR_GUEST_VIS;
		break;
	case KVM_REG_PPC_VPA_ADDR:
		addr = set_reg_val(id, *val);
		r = -EINVAL;
		/* Refuse to clear the VPA while an SLB shadow or DTL is set */
		if (!addr && (vcpu->arch.slb_shadow.next_gpa ||
			      vcpu->arch.dtl.next_gpa))
			break;
		r = set_vpa(vcpu, &vcpu->arch.vpa, addr, sizeof(struct lppaca));
		break;
	case KVM_REG_PPC_VPA_SLB:
		addr = val->vpaval.addr;
		len = val->vpaval.length;
		r = -EINVAL;
		/* An SLB shadow may only be set once a VPA is set */
		if (addr && !vcpu->arch.vpa.next_gpa)
			break;
		r = set_vpa(vcpu, &vcpu->arch.slb_shadow, addr, len);
		break;
	case KVM_REG_PPC_VPA_DTL:
		addr = val->vpaval.addr;
		len = val->vpaval.length;
		r = -EINVAL;
		/* A DTL needs room for at least one entry and a VPA that is set */
		if (addr && (len < sizeof(struct dtl_entry) ||
			     !vcpu->arch.vpa.next_gpa))
			break;
		/* Trim the length down to a whole number of entries */
		len -= len % sizeof(struct dtl_entry);
		r = set_vpa(vcpu, &vcpu->arch.dtl, addr, len);
		break;
	case KVM_REG_PPC_TB_OFFSET:
	{
		/* round up to multiple of 2^24 */
		u64 tb_offset = ALIGN(set_reg_val(id, *val), 1UL << 24);

		/*
		 * Now that we know the timebase offset, update the
		 * decrementer expiry with a guest timebase value. If
		 * the userspace does not set DEC_EXPIRY, this ensures
		 * a migrated vcpu at least starts with an expired
		 * decrementer, which is better than a large one that
		 * causes a hang.
		 */
		kvmppc_set_tb_offset(vcpu, tb_offset);
		if (!kvmppc_get_dec_expires(vcpu) && tb_offset)
			kvmppc_set_dec_expires(vcpu, get_tb() + tb_offset);

		/*
		 * NOTE(review): this repeats the call above with the same
		 * arguments; kept as-is since it looks redundant but the
		 * helper's side effects are not visible here - confirm.
		 */
		kvmppc_set_tb_offset(vcpu, tb_offset);
		break;
	}
	case KVM_REG_PPC_LPCR:
		kvmppc_set_lpcr(vcpu, set_reg_val(id, *val), true);
		break;
	case KVM_REG_PPC_LPCR_64:
		kvmppc_set_lpcr(vcpu, set_reg_val(id, *val), false);
		break;
	case KVM_REG_PPC_PPR:
		kvmppc_set_ppr_hv(vcpu, set_reg_val(id, *val));
		break;
#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
	case KVM_REG_PPC_TFHAR:
		vcpu->arch.tfhar = set_reg_val(id, *val);
		break;
	case KVM_REG_PPC_TFIAR:
		vcpu->arch.tfiar = set_reg_val(id, *val);
		break;
	case KVM_REG_PPC_TEXASR:
		vcpu->arch.texasr = set_reg_val(id, *val);
		break;
	case KVM_REG_PPC_TM_GPR0 ... KVM_REG_PPC_TM_GPR31:
		i = id - KVM_REG_PPC_TM_GPR0;
		vcpu->arch.gpr_tm[i] = set_reg_val(id, *val);
		break;
	case KVM_REG_PPC_TM_VSR0 ... KVM_REG_PPC_TM_VSR63:
	{
		int j;

		i = id - KVM_REG_PPC_TM_VSR0;
		if (i < 32) {
			for (j = 0; j < TS_FPRWIDTH; j++)
				vcpu->arch.fp_tm.fpr[i][j] = val->vsxval[j];
		} else {
			if (cpu_has_feature(CPU_FTR_ALTIVEC))
				vcpu->arch.vr_tm.vr[i-32] = val->vval;
			else
				r = -ENXIO;
		}
		break;
	}
	case KVM_REG_PPC_TM_CR:
		vcpu->arch.cr_tm = set_reg_val(id, *val);
		break;
	case KVM_REG_PPC_TM_XER:
		vcpu->arch.xer_tm = set_reg_val(id, *val);
		break;
	case KVM_REG_PPC_TM_LR:
		vcpu->arch.lr_tm = set_reg_val(id, *val);
		break;
	case KVM_REG_PPC_TM_CTR:
		vcpu->arch.ctr_tm = set_reg_val(id, *val);
		break;
	case KVM_REG_PPC_TM_FPSCR:
		vcpu->arch.fp_tm.fpscr = set_reg_val(id, *val);
		break;
	case KVM_REG_PPC_TM_AMR:
		vcpu->arch.amr_tm = set_reg_val(id, *val);
		break;
	case KVM_REG_PPC_TM_PPR:
		vcpu->arch.ppr_tm = set_reg_val(id, *val);
		break;
	case KVM_REG_PPC_TM_VRSAVE:
		vcpu->arch.vrsave_tm = set_reg_val(id, *val);
		break;
	case KVM_REG_PPC_TM_VSCR:
		/*
		 * Write the TM copy (vr_tm), matching what the get side
		 * reads back for this id, rather than the live vr.vscr.
		 */
		if (cpu_has_feature(CPU_FTR_ALTIVEC))
			vcpu->arch.vr_tm.vscr.u[3] = set_reg_val(id, *val);
		else
			r = -ENXIO;
		break;
	case KVM_REG_PPC_TM_DSCR:
		vcpu->arch.dscr_tm = set_reg_val(id, *val);
		break;
	case KVM_REG_PPC_TM_TAR:
		vcpu->arch.tar_tm = set_reg_val(id, *val);
		break;
#endif
	case KVM_REG_PPC_ARCH_COMPAT:
		r = kvmppc_set_arch_compat(vcpu, set_reg_val(id, *val));
		break;
	case KVM_REG_PPC_DEC_EXPIRY:
		kvmppc_set_dec_expires(vcpu, set_reg_val(id, *val));
		break;
	case KVM_REG_PPC_ONLINE:
		i = set_reg_val(id, *val);
		/* Keep the vcore's online_count in step with the transition */
		if (i && !vcpu->arch.online)
			atomic_inc(&vcpu->arch.vcore->online_count);
		else if (!i && vcpu->arch.online)
			atomic_dec(&vcpu->arch.vcore->online_count);
		vcpu->arch.online = i;
		break;
	case KVM_REG_PPC_PTCR:
		vcpu->kvm->arch.l1_ptcr = set_reg_val(id, *val);
		break;
	case KVM_REG_PPC_FSCR:
		kvmppc_set_fscr_hv(vcpu, set_reg_val(id, *val));
		break;
	default:
		r = -EINVAL;
		break;
	}

	return r;
}
/*
 * Number of hardware threads that make up one virtual core.
 *
 * On POWER9, threads are independent and can be in different partitions,
 * so each thread is treated as its own subcore and the answer is 1.
 * (All threads still have to be in the same MMU mode, radix or HPT, but
 * since only HPT guests on a HPT host are supported so far, that isn't
 * an impediment yet.)  On earlier CPUs it is threads_per_subcore.
 */
static int threads_per_vcore(struct kvm *kvm)
{
	return cpu_has_feature(CPU_FTR_ARCH_300) ? 1 : threads_per_subcore;
}
staticint kvmppc_core_vcpu_create_hv(struct kvm_vcpu *vcpu)
{ int err; int core; struct kvmppc_vcore *vcore; struct kvm *kvm; unsignedint id;
kvm = vcpu->kvm;
id = vcpu->vcpu_id;
vcpu->arch.shared = &vcpu->arch.shregs; #ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE /* * The shared struct is never shared on HV, * so we can always use host endianness
*/ #ifdef __BIG_ENDIAN__
vcpu->arch.shared_big_endian = true; #else
vcpu->arch.shared_big_endian = false; #endif #endif
if (kvmhv_is_nestedv2()) {
err = kvmhv_nestedv2_vcpu_create(vcpu, &vcpu->arch.nestedv2_io); if (err < 0) return err;
}
kvmppc_set_ctrl_hv(vcpu, CTRL_RUNLATCH); /* default to host PVR, since we can't spoof it */
kvmppc_set_pvr_hv(vcpu, mfspr(SPRN_PVR));
spin_lock_init(&vcpu->arch.vpa_update_lock);
spin_lock_init(&vcpu->arch.tbacct_lock);
vcpu->arch.busy_preempt = TB_NIL;
__kvmppc_set_msr_hv(vcpu, MSR_ME);
vcpu->arch.intr_msr = MSR_SF | MSR_ME;
/* * Set the default HFSCR for the guest from the host value. * This value is only used on POWER9 and later. * On >= POWER9, we want to virtualize the doorbell facility, so we * don't set the HFSCR_MSGP bit, and that causes those instructions * to trap and then we emulate them.
*/
kvmppc_set_hfscr_hv(vcpu, HFSCR_TAR | HFSCR_EBB | HFSCR_PM | HFSCR_BHRB |
HFSCR_DSCR | HFSCR_VECVSX | HFSCR_FP);
/* On POWER10 and later, allow prefixed instructions */ if (cpu_has_feature(CPU_FTR_ARCH_31))
kvmppc_set_hfscr_hv(vcpu, kvmppc_get_hfscr_hv(vcpu) | HFSCR_PREFIX);
if (cpu_has_feature(CPU_FTR_HVMODE)) {
kvmppc_set_hfscr_hv(vcpu, kvmppc_get_hfscr_hv(vcpu) & mfspr(SPRN_HFSCR));
#ifdef CONFIG_PPC_TRANSACTIONAL_MEM if (cpu_has_feature(CPU_FTR_P9_TM_HV_ASSIST))
kvmppc_set_hfscr_hv(vcpu, kvmppc_get_hfscr_hv(vcpu) | HFSCR_TM); #endif
} if (cpu_has_feature(CPU_FTR_TM_COMP))
vcpu->arch.hfscr |= HFSCR_TM;
staticint kvmhv_set_smt_mode(struct kvm *kvm, unsignedlong smt_mode, unsignedlong flags)
{ int err; int esmt = 0;
if (flags) return -EINVAL; if (smt_mode > MAX_SMT_THREADS || !is_power_of_2(smt_mode)) return -EINVAL; if (!cpu_has_feature(CPU_FTR_ARCH_300)) { /* * On POWER8 (or POWER7), the threading mode is "strict", * so we pack smt_mode vcpus per vcore.
*/ if (smt_mode > threads_per_subcore) return -EINVAL;
} else { /* * On POWER9, the threading mode is "loose", * so each vcpu gets its own vcore.
*/
esmt = smt_mode;
smt_mode = 1;
}
mutex_lock(&kvm->lock);
err = -EBUSY; if (!kvm->arch.online_vcores) {
kvm->arch.smt_mode = smt_mode;
kvm->arch.emul_smt_mode = esmt;
err = 0;
}
mutex_unlock(&kvm->lock);
/* Ensure the thread won't go into the kernel if it wakes */
tpaca->kvm_hstate.kvm_vcpu = NULL;
tpaca->kvm_hstate.kvm_vcore = NULL;
tpaca->kvm_hstate.napping = 0;
smp_wmb();
tpaca->kvm_hstate.hwthread_req = 1;
/* * If the thread is already executing in the kernel (e.g. handling * a stray interrupt), wait for it to get back to nap mode. * The smp_mb() is to ensure that our setting of hwthread_req * is visible before we look at hwthread_state, so if this * races with the code at system_reset_pSeries and the thread * misses our setting of hwthread_req, we are sure to see its * setting of hwthread_state, and vice versa.
*/
smp_mb(); while (tpaca->kvm_hstate.hwthread_state == KVM_HWTHREAD_IN_KERNEL) { if (--timeout <= 0) {
pr_err("KVM: couldn't grab cpu %d\n", cpu); return -EBUSY;
}
udelay(1);
} return 0;
}
/*
 * Mark every TLB-sharing sibling thread of @cpu as needing a TLB flush
 * for this guest, then poke any of those siblings that are currently
 * running @kvm.
 *
 * The flush mask used is the nested guest's when @vcpu is running a
 * nested guest, otherwise the one belonging to @kvm.
 */
static void radix_flush_cpu(struct kvm *kvm, int cpu, struct kvm_vcpu *vcpu)
{
	struct kvm_nested_guest *ng = vcpu->arch.nested;
	cpumask_t *flush_mask;
	int sibling;

	flush_mask = ng ? &ng->need_tlb_flush : &kvm->arch.need_tlb_flush;

	cpu = cpu_first_tlb_thread_sibling(cpu);

	sibling = cpu;
	while (sibling <= cpu_last_tlb_thread_sibling(cpu)) {
		cpumask_set_cpu(sibling, flush_mask);
		sibling += cpu_tlb_thread_sibling_step();
	}

	/*
	 * Make sure setting of bit in need_tlb_flush precedes testing of
	 * cpu_in_guest.  The matching barrier on the other side is hwsync
	 * when switching to guest MMU mode, which happens between
	 * cpu_in_guest being set to the guest kvm, and need_tlb_flush bit
	 * being tested.
	 */
	smp_mb();

	sibling = cpu;
	while (sibling <= cpu_last_tlb_thread_sibling(cpu)) {
		struct kvm *running = *per_cpu_ptr(&cpu_in_guest, sibling);

		if (running == kvm)
			smp_call_function_single(sibling, do_nothing, NULL, 1);
		sibling += cpu_tlb_thread_sibling_step();
	}
}
/* * If the guest has GTSE, it may execute tlbie, so do a eieio; tlbsync; * ptesync sequence on the old CPU before migrating to a new one, in * case we interrupted the guest between a tlbie ; eieio ; * tlbsync; ptesync sequence. * * Otherwise, ptesync is sufficient for ordering tlbiel sequences.
*/ if (kvm->arch.lpcr & LPCR_GTSE) asmvolatile("eieio; tlbsync; ptesync"); else asmvolatile("ptesync");
}
staticvoid kvmppc_prepare_radix_vcpu(struct kvm_vcpu *vcpu, int pcpu)
{ struct kvm_nested_guest *nested = vcpu->arch.nested; struct kvm *kvm = vcpu->kvm; int prev_cpu;
if (!cpu_has_feature(CPU_FTR_HVMODE)) return;
if (nested)
prev_cpu = nested->prev_cpu[vcpu->arch.nested_vcpu_id]; else
prev_cpu = vcpu->arch.prev_cpu;
/* * With radix, the guest can do TLB invalidations itself, * and it could choose to use the local form (tlbiel) if * it is invalidating a translation that has only ever been * used on one vcpu. However, that doesn't mean it has * only ever been used on one physical cpu, since vcpus * can move around between pcpus. To cope with this, when * a vcpu moves from one pcpu to another, we need to tell * any vcpus running on the same core as this vcpu previously * ran to flush the TLB.
*/ if (prev_cpu != pcpu) { if (prev_cpu >= 0) { if (cpu_first_tlb_thread_sibling(prev_cpu) !=
cpu_first_tlb_thread_sibling(pcpu))
radix_flush_cpu(kvm, prev_cpu, vcpu);
cpu = vc->pcpu; if (vcpu) { if (vcpu->arch.timer_running) {
hrtimer_try_to_cancel(&vcpu->arch.dec_timer);
vcpu->arch.timer_running = 0;
}
cpu += vcpu->arch.ptid;
vcpu->cpu = vc->pcpu;
vcpu->arch.thread_cpu = cpu;
}
tpaca = paca_ptrs[cpu];
tpaca->kvm_hstate.kvm_vcpu = vcpu;
tpaca->kvm_hstate.ptid = cpu - vc->pcpu;
tpaca->kvm_hstate.fake_suspend = 0; /* Order stores to hstate.kvm_vcpu etc. before store to kvm_vcore */
smp_wmb();
tpaca->kvm_hstate.kvm_vcore = vc; if (cpu != smp_processor_id())
kvmppc_ipi_thread(cpu);
}
staticvoid kvmppc_wait_for_nap(int n_threads)
{ int cpu = smp_processor_id(); int i, loops;
if (n_threads <= 1) return; for (loops = 0; loops < 1000000; ++loops) { /* * Check if all threads are finished. * We set the vcore pointer when starting a thread * and the thread clears it when finished, so we look * for any threads that still have a non-NULL vcore ptr.
*/ for (i = 1; i < n_threads; ++i) if (paca_ptrs[cpu + i]->kvm_hstate.kvm_vcore) break; if (i == n_threads) {
HMT_medium(); return;
}
HMT_low();
}
HMT_medium(); for (i = 1; i < n_threads; ++i) if (paca_ptrs[cpu + i]->kvm_hstate.kvm_vcore)
pr_err("KVM: CPU %d seems to be stuck\n", cpu + i);
}
/*
 * Check that we are on thread 0 and that any other threads in
 * this core are off-line.  Then grab the threads so they can't
 * enter the kernel.
 *
 * Returns 1 when all secondary threads of the subcore were grabbed,
 * 0 otherwise (in which case any thread grabbed so far is released).
 */
static int on_primary_thread(void)
{
	int self = smp_processor_id();
	int t;

	/* Are we on a primary subcore? */
	if (cpu_thread_in_subcore(self))
		return 0;

	/* Every secondary thread must be offline */
	for (t = 1; t < threads_per_subcore; t++) {
		if (cpu_online(self + t))
			return 0;
	}

	/* Grab all hw threads so they can't go into the kernel */
	for (t = 1; t < threads_per_subcore; t++) {
		if (!kvmppc_grab_hwthread(self + t))
			continue;

		/* Couldn't grab one; let the others go */
		do {
			kvmppc_release_hwthread(self + t);
		} while (--t > 0);
		return 0;
	}

	return 1;
}
/*
 * A list of virtual cores for each physical CPU.
 * These are vcores that could run but their runner VCPU tasks are
 * (or may be) preempted.
 */
struct preempted_vcore_list {
	/* Head of the list of such vcores */
	struct list_head list;
	/* NOTE(review): presumably serializes access to @list - confirm against the users */
	spinlock_t lock;
};
/*
 * This stores information about the virtual cores currently
 * assigned to a physical core.
 */
struct core_info {
	/* Number of entries of vc[] in use; callers iterate sub = 0 .. n_subcores-1 */
	int n_subcores;
	/* NOTE(review): presumably the largest value in subcore_threads[] - confirm */
	int max_subcore_threads;
	/* Thread total; can_piggyback() adds a candidate's num_threads to it and compares with the target */
	int total_threads;
	/* NOTE(review): presumably the thread count of each subcore - confirm */
	int subcore_threads[MAX_SUBCORES];
	/* The vcore assigned to each subcore */
	struct kvmppc_vcore *vc[MAX_SUBCORES];
};
/*
 * First hardware thread used by each subcore index.
 *
 * This mapping means subcores 0 and 1 can use threads 0-3 and 4-7
 * respectively in 2-way micro-threading (split-core) mode on POWER8.
 */
static int subcore_thread_map[MAX_SUBCORES] = {
	[0] = 0,
	[1] = 4,
	[2] = 2,
	[3] = 6,
};
staticbool subcore_config_ok(int n_subcores, int n_threads)
{ /* * POWER9 "SMT4" cores are permanently in what is effectively a 4-way * split-core mode, with one thread per subcore.
*/ if (cpu_has_feature(CPU_FTR_ARCH_300)) return n_subcores <= 4 && n_threads == 1;
/* On POWER8, can only dynamically split if unsplit to begin with */ if (n_subcores > 1 && threads_per_subcore < MAX_SMT_THREADS) returnfalse; if (n_subcores > MAX_SUBCORES) returnfalse; if (n_subcores > 1) { if (!(dynamic_mt_modes & 2))
n_subcores = 4; if (n_subcores > 2 && !(dynamic_mt_modes & 4)) returnfalse;
}
/* * Work out whether it is possible to piggyback the execution of * vcore *pvc onto the execution of the other vcores described in *cip.
*/ staticbool can_piggyback(struct kvmppc_vcore *pvc, struct core_info *cip, int target_threads)
{ if (cip->total_threads + pvc->num_threads > target_threads) returnfalse;
return can_dynamic_split(pvc, cip);
}
staticvoid prepare_threads(struct kvmppc_vcore *vc)
{ int i; struct kvm_vcpu *vcpu;
for (sub = 0; sub < cip->n_subcores; ++sub) {
vc = cip->vc[sub]; if (!vc->kvm->arch.mmu_ready) returntrue;
for_each_runnable_thread(i, vcpu, vc) if (signal_pending(vcpu->arch.run_task)) returntrue;
} returnfalse;
}
/*
 * Process each runnable vcpu of @vc after the guest has exited: handle
 * its exit, then either leave it runnable or remove it and wake its task.
 * For a vcore that is not the master (@is_master false), also settle the
 * vcore's state and make sure some runner candidate is awake.
 *
 * Takes and releases vc->lock itself; the lock is dropped around the
 * per-vcpu exit handling.
 */
static void post_guest_process(struct kvmppc_vcore *vc, bool is_master)
{
	struct kvm_vcpu *vcpu;
	u64 tb_now;
	long exit_ret;
	int n_active = 0;
	int i;

	spin_lock(&vc->lock);
	tb_now = get_tb();

	for_each_runnable_thread(i, vcpu, vc) {
		/*
		 * It's safe to unlock the vcore in the loop here, because
		 * for_each_runnable_thread() is safe against removal of
		 * the vcpu, and the vcore state is VCORE_EXITING here,
		 * so any vcpus becoming runnable will have their arch.trap
		 * set to zero and can't actually run in the guest.
		 */
		spin_unlock(&vc->lock);

		/* cancel pending dec exception if dec is positive */
		if (tb_now < kvmppc_dec_expires_host_tb(vcpu) &&
		    kvmppc_core_pending_dec(vcpu))
			kvmppc_core_dequeue_dec(vcpu);

		trace_kvm_guest_exit(vcpu);

		/* No trap recorded means there is nothing to handle */
		exit_ret = vcpu->arch.trap ?
			kvmppc_handle_exit_hv(vcpu, vcpu->arch.run_task) :
			RESUME_GUEST;

		vcpu->arch.ret = exit_ret;
		vcpu->arch.trap = 0;

		spin_lock(&vc->lock);
		if (!is_kvmppc_resume_guest(vcpu->arch.ret)) {
			kvmppc_remove_runnable(vc, vcpu, mftb());
			wake_up(&vcpu->arch.cpu_run);
		} else {
			if (vcpu->arch.pending_exceptions)
				kvmppc_core_prepare_to_enter(vcpu);
			if (vcpu->arch.ceded)
				kvmppc_set_timer(vcpu);
			else
				++n_active;
		}
	}

	if (is_master)
		goto out_unlock;

	if (n_active > 0) {
		kvmppc_vcore_preempt(vc);
	} else if (vc->runner) {
		vc->vcore_state = VCORE_PREEMPT;
		kvmppc_core_start_stolen(vc, mftb());
	} else {
		vc->vcore_state = VCORE_INACTIVE;
	}

	if (vc->n_runnable > 0 && vc->runner == NULL) {
		/* make sure there's a candidate runner awake */
		i = -1;
		vcpu = next_runnable_thread(vc, &i);
		wake_up(&vcpu->arch.cpu_run);
	}

out_unlock:
	spin_unlock(&vc->lock);
}
/* * Clear core from the list of active host cores as we are about to * enter the guest. Only do this if it is the primary thread of the * core (not if a subcore) that is entering the guest.
*/ staticinlineint kvmppc_clear_host_core(unsignedint cpu)
{ int core;
if (!kvmppc_host_rm_ops_hv || cpu_thread_in_core(cpu)) return 0; /* * Memory barrier can be omitted here as we will do a smp_wmb() * later in kvmppc_start_thread and we need ensure that state is * visible to other CPUs only after we enter guest.
*/
core = cpu >> threads_shift;
kvmppc_host_rm_ops_hv->rm_core[core].rm_state.in_host = 0; return 0;
}
/* * Advertise this core as an active host core since we exited the guest * Only need to do this if it is the primary thread of the core that is * exiting.
*/ staticinlineint kvmppc_set_host_core(unsignedint cpu)
{ int core;
if (!kvmppc_host_rm_ops_hv || cpu_thread_in_core(cpu)) return 0;
/* * Memory barrier can be omitted here because we do a spin_unlock * immediately after this which provides the memory barrier.
*/
core = cpu >> threads_shift;
kvmppc_host_rm_ops_hv->rm_core[core].rm_state.in_host = 1; return 0;
}
/*
 * Record, for the given exit @trap, the matching pending-interrupt bit in
 * local_paca->irq_happened (external, doorbell or HMI), or call
 * replay_system_reset() for a system reset.  Any other trap value is
 * ignored.
 */
static void set_irq_happened(int trap)
{
	if (trap == BOOK3S_INTERRUPT_EXTERNAL)
		local_paca->irq_happened |= PACA_IRQ_EE;
	else if (trap == BOOK3S_INTERRUPT_H_DOORBELL)
		local_paca->irq_happened |= PACA_IRQ_DBELL;
	else if (trap == BOOK3S_INTERRUPT_HMI)
		local_paca->irq_happened |= PACA_IRQ_HMI;
	else if (trap == BOOK3S_INTERRUPT_SYSTEM_RESET)
		replay_system_reset();
}
/* * Run a set of guest threads on a physical core. * Called with vc->lock held.
*/ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
{ struct kvm_vcpu *vcpu; int i; int srcu_idx; struct core_info core_info; struct kvmppc_vcore *pvc; struct kvm_split_mode split_info, *sip; int split, subcore_size, active; int sub; bool thr0_done; unsignedlong cmd_bit, stat_bit; int pcpu, thr; int target_threads; int controlled_threads; int trap; bool is_power8;
if (WARN_ON_ONCE(cpu_has_feature(CPU_FTR_ARCH_300))) return;
/* * Remove from the list any threads that have a signal pending * or need a VPA update done
*/
prepare_threads(vc);
/* if the runner is no longer runnable, let the caller pick a new one */ if (vc->runner->arch.state != KVMPPC_VCPU_RUNNABLE) return;
/* * Number of threads that we will be controlling: the same as * the number of threads per subcore, except on POWER9, * where it's 1 because the threads are (mostly) independent.
*/
controlled_threads = threads_per_vcore(vc->kvm);
/* * Make sure we are running on primary threads, and that secondary * threads are offline. Also check if the number of threads in this * guest are greater than the current system threads per guest.
*/ if ((controlled_threads > 1) &&
((vc->num_threads > threads_per_subcore) || !on_primary_thread())) {
for_each_runnable_thread(i, vcpu, vc) {
vcpu->arch.ret = -EBUSY;
kvmppc_remove_runnable(vc, vcpu, mftb());
wake_up(&vcpu->arch.cpu_run);
} goto out;
}
/* * See if we could run any other vcores on the physical core * along with this one.
*/
init_core_info(&core_info, vc);
pcpu = smp_processor_id();
target_threads = controlled_threads; if (target_smt_mode && target_smt_mode < target_threads)
target_threads = target_smt_mode; if (vc->num_threads < target_threads)
collect_piggybacks(&core_info, target_threads);
/* * Hard-disable interrupts, and check resched flag and signals. * If we need to reschedule or deliver a signal, clean up * and return without going into the guest(s). * If the mmu_ready flag has been cleared, don't go into the * guest because that means a HPT resize operation is in progress.
*/
local_irq_disable();
hard_irq_disable(); if (lazy_irq_pending() || need_resched() ||
recheck_signals_and_mmu(&core_info)) {
local_irq_enable();
vc->vcore_state = VCORE_INACTIVE; /* Unlock all except the primary vcore */ for (sub = 1; sub < core_info.n_subcores; ++sub) {
pvc = core_info.vc[sub]; /* Put back on to the preempted vcores list */
kvmppc_vcore_preempt(pvc);
spin_unlock(&pvc->lock);
} for (i = 0; i < controlled_threads; ++i)
kvmppc_release_hwthread(pcpu + i); return;
}
/* * On POWER8, set RWMR register. * Since it only affects PURR and SPURR, it doesn't affect * the host, so we don't save/restore the host value.
*/ if (is_power8) { unsignedlong rwmr_val = RWMR_RPA_P8_8THREAD; int n_online = atomic_read(&vc->online_count);
/* * Use the 8-thread value if we're doing split-core * or if the vcore's online count looks bogus.
*/ if (split == 1 && threads_per_subcore == MAX_SMT_THREADS &&
n_online >= 1 && n_online <= MAX_SMT_THREADS)
rwmr_val = p8_rwmr_values[n_online];
mtspr(SPRN_RWMR, rwmr_val);
}
/* Start all the threads */
active = 0; for (sub = 0; sub < core_info.n_subcores; ++sub) {
thr = is_power8 ? subcore_thread_map[sub] : sub;
thr0_done = false;
active |= 1 << thr;
pvc = core_info.vc[sub];
pvc->pcpu = pcpu + thr;
for_each_runnable_thread(i, vcpu, pvc) { /* * XXX: is kvmppc_start_thread called too late here? * It updates vcpu->cpu and vcpu->arch.thread_cpu * which are used by kvmppc_fast_vcpu_kick_hv(), but * kick is called after new exceptions become available * and exceptions are checked earlier than here, by * kvmppc_core_prepare_to_enter.
*/
kvmppc_start_thread(vcpu, pvc);
kvmppc_update_vpa_dispatch(vcpu, pvc);
trace_kvm_guest_enter(vcpu); if (!vcpu->arch.ptid)
thr0_done = true;
active |= 1 << (thr + vcpu->arch.ptid);
} /* * We need to start the first thread of each subcore * even if it doesn't have a vcpu.
*/ if (!thr0_done)
kvmppc_start_thread(NULL, pvc);
}
/* * Ensure that split_info.do_nap is set after setting * the vcore pointer in the PACA of the secondaries.
*/
smp_mb();
/* * When doing micro-threading, poke the inactive threads as well. * This gets them to the nap instruction after kvm_do_nap, * which reduces the time taken to unsplit later.
*/ if (cmd_bit) {
split_info.do_nap = 1; /* ask secondaries to nap when done */ for (thr = 1; thr < threads_per_subcore; ++thr) if (!(active & (1 << thr)))
kvmppc_ipi_thread(pcpu + thr);
}
if (!vtime_accounting_enabled_this_cpu()) {
local_irq_enable(); /* * Service IRQs here before guest_timing_exit_irqoff() so any * ticks that occurred while running the guest are accounted to * the guest. If vtime accounting is enabled, accounting uses * TB rather than ticks, so it can be done without enabling * interrupts here, which has the problem that it accounts * interrupt processing overhead to the host.
*/
local_irq_disable();
}
guest_timing_exit_irqoff();
local_irq_enable();
/* Let secondaries go back to the offline loop */ for (i = 0; i < controlled_threads; ++i) {
kvmppc_release_hwthread(pcpu + i); if (sip && sip->napped[i])
kvmppc_ipi_thread(pcpu + i);
}
spin_unlock(&vc->lock);
/* make sure updates to secondary vcpu structs are visible now */
smp_mb();
preempt_enable();
for (sub = 0; sub < core_info.n_subcores; ++sub) {
pvc = core_info.vc[sub];
post_guest_process(pvc, pvc == vc);
}
if (rc != H_SUCCESS) {
pr_err("KVM Guest Run VCPU hcall failed\n"); if (rc == H_INVALID_ELEMENT_ID)
pr_err("KVM: Guest Run VCPU invalid element id at %ld\n", i); elseif (rc == H_INVALID_ELEMENT_SIZE)
pr_err("KVM: Guest Run VCPU invalid element size at %ld\n", i); elseif (rc == H_INVALID_ELEMENT_VALUE)
pr_err("KVM: Guest Run VCPU invalid element value at %ld\n", i); return -EINVAL;
}
accumulate_time(vcpu, &vcpu->arch.guest_exit);
rc = kvmhv_nestedv2_parse_output(vcpu); if (rc < 0) return -EINVAL;
timer_rearm_host_dec(*tb);
/* Record context switch and guest_run_time data */ if (kvmhv_get_l2_counters_status())
do_trace_nested_cs_time(vcpu);
return trap;
}
/*
 * Call our hypervisor to load up HV regs and go.
 *
 * Visible flow: snapshot MSR and the host OS SPRs, remember the host
 * PSSCR_PR value, call kvmppc_msr_hard_disable_set_facilities(), return 0
 * if lazy_irq_pending() is true, call load_vcpu_state(), install the
 * guest PSSCR_PR when it differs from the host value, clear a pending
 * doorbell request, then program DEC, read it back, refresh *tb with
 * mftb(), recompute vcpu->arch.dec_expires, re-arm the host decrementer
 * and restore the host OS SPRs and PSSCR_PR.
 *
 * Returns 0 on the early lazy-IRQ exit, otherwise the local `trap`.
 *
 * NOTE(review): in the lines visible here `trap` is returned without ever
 * being assigned, and `hvregs` / `time_limit` are never used. The step that
 * actually enters the guest and yields the trap value appears to be missing
 * between the DEC write and the DEC read-back -- confirm against the full
 * file before relying on this text.
 * NOTE(review): `staticint` / `unsignedlong` look like `static int` /
 * `unsigned long` with the separating space lost -- confirm.
 */ staticint kvmhv_vcpu_entry_p9_nested(struct kvm_vcpu *vcpu, u64 time_limit, unsignedlong lpcr, u64 *tb)
{ unsignedlong host_psscr; unsignedlong msr; struct hv_guest_state hvregs; struct p9_host_os_sprs host_os_sprs;
s64 dec; int trap;
/* Host state captured here is put back by restore_p9_host_os_sprs() at the end. */
msr = mfmsr();
save_p9_host_os_sprs(&host_os_sprs);
/* * We need to save and restore the guest visible part of the * psscr (i.e. using SPRN_PSSCR_PR) since the hypervisor * doesn't do this for us. Note only required if pseries since * this is done in kvmhv_vcpu_entry_p9() below otherwise.
*/
host_psscr = mfspr(SPRN_PSSCR_PR);
/* A lazily-pending interrupt aborts the entry; 0 is returned to the caller. */
kvmppc_msr_hard_disable_set_facilities(vcpu, msr); if (lazy_irq_pending()) return 0;
if (unlikely(load_vcpu_state(vcpu, &host_os_sprs)))
msr = mfmsr(); /* TM restore can update msr */
/* Only touch the SPR when guest and host values actually differ. */
if (vcpu->arch.psscr != host_psscr)
mtspr(SPRN_PSSCR_PR, vcpu->arch.psscr);
/* * hvregs has the doorbell status, so zero it here which * enables us to receive doorbells when H_ENTER_NESTED is * in progress for this vCPU
*/
if (vcpu->arch.doorbell_request)
vcpu->arch.doorbell_request = 0;
/* * When setting DEC, we must always deal with irq_work_raise * via NMI vs setting DEC. The problem occurs right as we * switch into guest mode if a NMI hits and sets pending work * and sets DEC, then that will apply to the guest and not * bring us back to the host. * * irq_work_raise could check a flag (or possibly LPCR[HDICE] * for example) and set HDEC to 1? That wouldn't solve the * nested hv case which needs to abort the hcall or zero the * time limit. * * XXX: Another day's problem.
*/
mtspr(SPRN_DEC, kvmppc_dec_expires_host_tb(vcpu) - *tb);
/* NOTE(review): the guest-entry call is expected around here but is not visible. */
dec = mfspr(SPRN_DEC); if (!(lpcr & LPCR_LD)) /* Sign extend if not using large decrementer */
dec = (s32) dec;
/* Re-read the timebase and express the remaining DEC as an expiry value. */
*tb = mftb();
vcpu->arch.dec_expires = dec + (*tb + kvmppc_get_tb_offset(vcpu));
timer_rearm_host_dec(*tb);
/* Undo the host-state changes made above, mirroring the PSSCR_PR check on entry. */
restore_p9_host_os_sprs(vcpu, &host_os_sprs); if (vcpu->arch.psscr != host_psscr)
mtspr(SPRN_PSSCR_PR, host_psscr);
return trap;
}
/*
 * Guest entry for POWER9 and later CPUs.
 *
 * Returns BOOK3S_INTERRUPT_HV_DECREMENTER without entering the guest when
 * *tb has already reached timer_get_next_tb(); clamps time_limit to that
 * next timer, or returns BOOK3S_INTERRUPT_NESTED_HV_DECREMENTER when *tb
 * has reached time_limit. Otherwise clears vcpu->arch.ceded, calls
 * vcpu_vpa_increment_dispatch() before and after the entry, and returns
 * `trap` (set to 0 where an hcall was fully handled here).
 *
 * NOTE(review): from the "XIVE rearm" comment onward the visible text uses
 * `req`, which is not declared in these lines, and the closing braces do not
 * pair with the visible opening ones. The non-pseries entry path appears to
 * be missing -- confirm against the full file.
 * NOTE(review): `staticint` / `unsignedlong` / `elseif` look like
 * `static int` / `unsigned long` / `else if` with the space lost -- confirm.
 */ staticint kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit, unsignedlong lpcr, u64 *tb)
{ struct kvm *kvm = vcpu->kvm; struct kvm_nested_guest *nested = vcpu->arch.nested;
u64 next_timer; int trap;
/* Bail out early if a host timer or the caller's time limit has already expired. */
next_timer = timer_get_next_tb(); if (*tb >= next_timer) return BOOK3S_INTERRUPT_HV_DECREMENTER; if (next_timer < time_limit)
time_limit = next_timer; elseif (*tb >= time_limit) /* nested time limit */ return BOOK3S_INTERRUPT_NESTED_HV_DECREMENTER;
vcpu->arch.ceded = 0;
vcpu_vpa_increment_dispatch(vcpu);
/* When running on pseries, entry goes through one of the two nested-API variants. */
if (kvmhv_on_pseries()) { if (kvmhv_is_nestedv1())
trap = kvmhv_vcpu_entry_p9_nested(vcpu, time_limit, lpcr, tb); else
trap = kvmhv_vcpu_entry_nestedv2(vcpu, time_limit, lpcr, tb);
/* H_CEDE has to be handled now, not later */ if (trap == BOOK3S_INTERRUPT_SYSCALL && !nested &&
kvmppc_get_gpr(vcpu, 3) == H_CEDE) {
kvmppc_cede(vcpu);
kvmppc_set_gpr(vcpu, 3, 0);
trap = 0;
}
/* NOTE(review): `req` below has no visible declaration; lines appear to be missing here. */
/* * XIVE rearm and XICS hcalls must be handled * before xive context is pulled (is this * true?)
*/ if (req == H_CEDE) { /* H_CEDE has to be handled now */
kvmppc_cede(vcpu); if (!kvmppc_xive_rearm_escalation(vcpu)) { /* * Pending escalation so abort * the cede.
*/
vcpu->arch.ceded = 0;
}
kvmppc_set_gpr(vcpu, 3, 0);
trap = 0;
} elseif (req == H_ENTER_NESTED) { /* * L2 should not run with the L1 * context so rearm and pull it.
*/ if (!kvmppc_xive_rearm_escalation(vcpu)) { /* * Pending escalation so abort * H_ENTER_NESTED.
*/
kvmppc_set_gpr(vcpu, 3, 0);
trap = 0;
}
} elseif (hcall_is_xics(req)) { int ret;
/* Any result other than H_TOO_HARD is returned to the guest in r3 and ends the trap. */
ret = kvmppc_xive_xics_hcall(vcpu, req); if (ret != H_TOO_HARD) {
kvmppc_set_gpr(vcpu, 3, ret);
trap = 0;
}
}
}
kvmppc_xive_pull_vcpu(vcpu);
if (kvm_is_radix(kvm))
vcpu->arch.slb_max = 0;
}
vcpu_vpa_increment_dispatch(vcpu);
return trap;
}
/* * Wait for some other vcpu thread to execute us, and * wake us up when we need to handle something in the host.
*/ staticvoid kvmppc_wait_for_exec(struct kvmppc_vcore *vc, struct kvm_vcpu *vcpu, int wait_state)
{
DEFINE_WAIT(wait);
/*
 * Check to see if any of the runnable vcpus on the vcore have pending
 * exceptions or are no longer ceded.
 *
 * Walks the runnable threads of @vc and returns 1 as soon as
 * kvmppc_vcpu_check_block() reports non-zero for one of them; returns 0
 * when it reports zero for all of them.
 */
static int kvmppc_vcore_check_block(struct kvmppc_vcore *vc)
{
	struct kvm_vcpu *vcpu;
	int i;

	for_each_runnable_thread(i, vcpu, vc) {
		if (kvmppc_vcpu_check_block(vcpu))
			return 1;
	}

	return 0;
}
/*
 * All the vcpus in this vcore are idle, so wait for a decrementer
 * or external interrupt to one of the vcpus. vc->lock is held.
 *
 * Visible flow: optionally account a halt-poll attempt, register on
 * vc->wait and re-check kvmppc_vcore_check_block() before sleeping, then
 * grow/shrink/clamp vc->halt_poll_ns from block_ns and emit the
 * kvmppc_vcore_wakeup trace event.
 *
 * NOTE(review): the lines visible here `goto out` without any `out:` label,
 * read `block_ns` without assigning it, never use `cur`/`stop`/`start_wait`
 * after setting or declaring them, and test `!do_sleep` while it is still 1.
 * The polling loop, the actual sleep and the wakeup accounting appear to be
 * missing -- confirm against the full file.
 * NOTE(review): `staticvoid` / `elseif` look like `static void` / `else if`
 * with the separating space lost -- confirm.
 */ staticvoid kvmppc_vcore_blocked(struct kvmppc_vcore *vc)
{
ktime_t cur, start_poll, start_wait; int do_sleep = 1;
u64 block_ns;
/* This path is not expected on CPU_FTR_ARCH_300 CPUs; warn once if it is hit there. */
WARN_ON_ONCE(cpu_has_feature(CPU_FTR_ARCH_300));
/* Poll for pending exceptions and ceded state */
cur = start_poll = ktime_get(); if (vc->halt_poll_ns) {
ktime_t stop = ktime_add_ns(start_poll, vc->halt_poll_ns);
++vc->runner->stat.generic.halt_attempted_poll;
/* NOTE(review): nothing visible clears do_sleep before this test. */
if (!do_sleep) {
++vc->runner->stat.generic.halt_successful_poll; goto out;
}
}
/* Register as a waiter first, then re-check, so a wakeup in between is not lost. */
prepare_to_rcuwait(&vc->wait);
set_current_state(TASK_INTERRUPTIBLE); if (kvmppc_vcore_check_block(vc)) {
finish_rcuwait(&vc->wait);
do_sleep = 0; /* If we polled, count this as a successful poll */ if (vc->halt_poll_ns)
++vc->runner->stat.generic.halt_successful_poll; goto out;
}
/*
 * halt_poll_ns (no vc-> prefix) is not declared in this function; presumably a
 * file- or module-level limit -- verify. When it is zero, per-vcore polling is
 * switched off; otherwise vc->halt_poll_ns is left alone, shrunk or grown
 * depending on block_ns and finally clamped to that limit.
 */
/* Adjust poll time */ if (halt_poll_ns) { if (block_ns <= vc->halt_poll_ns)
; /* We slept and blocked for longer than the max halt time */ elseif (vc->halt_poll_ns && block_ns > halt_poll_ns)
shrink_halt_poll_ns(vc); /* We slept and our poll time is too small */ elseif (vc->halt_poll_ns < halt_poll_ns &&
block_ns < halt_poll_ns)
grow_halt_poll_ns(vc); if (vc->halt_poll_ns > halt_poll_ns)
vc->halt_poll_ns = halt_poll_ns;
} else
vc->halt_poll_ns = 0;
trace_kvmppc_vcore_wakeup(do_sleep, block_ns);
}
/*
 * Set up the guest MMU if that has not been done yet.
 *
 * Under kvm->arch.mmu_setup_lock: if kvm->arch.mmu_ready is still clear,
 * a non-radix guest first goes through kvmppc_hv_setup_htab_rma(); on
 * success the partition table is set up (CPU_FTR_ARCH_300 CPUs only) and
 * mmu_ready is set to 1. Nothing is done when mmu_ready is already set.
 *
 * This never fails for a radix guest, as none of the operations it does
 * for a radix guest can fail or have a way to report failure.
 *
 * Returns 0 on success, or the error from kvmppc_hv_setup_htab_rma(), in
 * which case mmu_ready is left clear.
 */
static int kvmhv_setup_mmu(struct kvm_vcpu *vcpu)
{
	int r = 0;
	struct kvm *kvm = vcpu->kvm;

	mutex_lock(&kvm->arch.mmu_setup_lock);
	if (!kvm->arch.mmu_ready) {
		if (!kvm_is_radix(kvm))
			r = kvmppc_hv_setup_htab_rma(vcpu);
		if (!r) {
			if (cpu_has_feature(CPU_FTR_ARCH_300))
				kvmppc_setup_partition_table(kvm);
			kvm->arch.mmu_ready = 1;
		}
	}
	mutex_unlock(&kvm->arch.mmu_setup_lock);

	return r;
}
staticint kvmppc_run_vcpu(struct kvm_vcpu *vcpu)
{ struct kvm_run *run = vcpu->run; int n_ceded, i, r; struct kvmppc_vcore *vc; struct kvm_vcpu *v;
/* * Synchronize with other threads in this virtual core
*/
vc = vcpu->arch.vcore;
spin_lock(&vc->lock);
vcpu->arch.ceded = 0;
vcpu->arch.run_task = current;
vcpu->arch.stolen_logged = vcore_stolen_time(vc, mftb());
vcpu->arch.state = KVMPPC_VCPU_RUNNABLE;
vcpu->arch.busy_preempt = TB_NIL;
WRITE_ONCE(vc->runnable_threads[vcpu->arch.ptid], vcpu);
++vc->n_runnable;
/* * This happens the first time this is called for a vcpu. * If the vcore is already running, we may be able to start * this thread straight away and have it join in.
*/ if (!signal_pending(current)) { if ((vc->vcore_state == VCORE_PIGGYBACK ||
vc->vcore_state == VCORE_RUNNING) &&
!VCORE_IS_EXITING(vc)) {
kvmppc_update_vpa_dispatch(vcpu, vc);
kvmppc_start_thread(vcpu, vc);
trace_kvm_guest_enter(vcpu);
} elseif (vc->vcore_state == VCORE_SLEEPING) {
rcuwait_wake_up(&vc->wait);
}
}
while (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE &&
!signal_pending(current)) { /* See if the MMU is ready to go */ if (!vcpu->kvm->arch.mmu_ready) {
spin_unlock(&vc->lock);
r = kvmhv_setup_mmu(vcpu);
spin_lock(&vc->lock); if (r) {
run->exit_reason = KVM_EXIT_FAIL_ENTRY;
run->fail_entry.
hardware_entry_failure_reason = 0;
vcpu->arch.ret = r; break;
}
}
if (vc->vcore_state == VCORE_PREEMPT && vc->runner == NULL)
kvmppc_vcore_end_preempt(vc);
if (vc->n_runnable && vc->vcore_state == VCORE_INACTIVE) { /* Wake up some vcpu to run the core */
i = -1;
v = next_runnable_thread(vc, &i);
wake_up(&v->arch.cpu_run);
}
/* See if the MMU is ready to go */ if (unlikely(!kvm->arch.mmu_ready)) {
r = kvmhv_setup_mmu(vcpu); if (r) {
run->exit_reason = KVM_EXIT_FAIL_ENTRY;
run->fail_entry.hardware_entry_failure_reason = 0;
vcpu->arch.ret = r; return r;
}
}
if (need_resched())
cond_resched();
kvmppc_update_vpas(vcpu);
preempt_disable();
pcpu = smp_processor_id(); if (kvm_is_radix(kvm))
kvmppc_prepare_radix_vcpu(vcpu, pcpu);
/* flags save not required, but irq_pmu has no disable/enable API */
powerpc_local_irq_pmu_save(flags);
vcpu->arch.state = KVMPPC_VCPU_RUNNABLE;
if (signal_pending(current)) goto sigpend; if (need_resched() || !kvm->arch.mmu_ready) goto out;
/* * Orders set cpu/thread_cpu vs testing for pending interrupts and * doorbells below. The other side is when these fields are set vs * kvmppc_fast_vcpu_kick_hv reading the cpu/thread_cpu fields to * kick a vCPU to notice the pending interrupt.
*/
smp_mb();
if (!nested) {
kvmppc_core_prepare_to_enter(vcpu); if (test_bit(BOOK3S_IRQPRIO_EXTERNAL,
&vcpu->arch.pending_exceptions) ||
xive_interrupt_pending(vcpu)) { /* * For nested HV, don't synthesize but always pass MER, * the L0 will be able to optimise that more * effectively than manipulating registers directly.
*/ if (!kvmhv_on_pseries() && (__kvmppc_get_msr_hv(vcpu) & MSR_EE))
kvmppc_inject_interrupt_hv(vcpu,
BOOK3S_INTERRUPT_EXTERNAL, 0); else
lpcr |= LPCR_MER;
} else { /* * L1's copy of L2's LPCR (vcpu->arch.vcore->lpcr) can get its MER bit * unexpectedly set - for e.g. during NMI handling when all register * states are synchronized from L0 to L1. L1 needs to inform L0 about * MER=1 only when there are pending external interrupts. * In the above if check, MER bit is set if there are pending * external interrupts. Hence, explicitly mask off MER bit * here as otherwise it may generate spurious interrupts in L2 KVM * causing an endless loop, which results in L2 guest getting hung.
*/
lpcr &= ~LPCR_MER;
}
} elseif (vcpu->arch.pending_exceptions ||
xive_interrupt_pending(vcpu)) {
vcpu->arch.ret = RESUME_HOST; goto out;
}
if (vcpu->arch.timer_running) {
hrtimer_try_to_cancel(&vcpu->arch.dec_timer);
vcpu->arch.timer_running = 0;
}
if (!vtime_accounting_enabled_this_cpu()) {
powerpc_local_irq_pmu_restore(flags); /* * Service IRQs here before guest_timing_exit_irqoff() so any * ticks that occurred while running the guest are accounted to * the guest. If vtime accounting is enabled, accounting uses * TB rather than ticks, so it can be done without enabling * interrupts here, which has the problem that it accounts * interrupt processing overhead to the host.
*/
powerpc_local_irq_pmu_save(flags);
}
guest_timing_exit_irqoff();
powerpc_local_irq_pmu_restore(flags);
preempt_enable();
/* * cancel pending decrementer exception if DEC is now positive, or if * entering a nested guest in which case the decrementer is now owned * by L2 and the L1 decrementer is provided in hdec_expires
*/ if (kvmppc_core_pending_dec(vcpu) &&
((tb < kvmppc_dec_expires_host_tb(vcpu)) ||
(trap == BOOK3S_INTERRUPT_SYSCALL &&
kvmppc_get_gpr(vcpu, 3) == H_ENTER_NESTED)))
kvmppc_core_dequeue_dec(vcpu);
trace_kvm_guest_exit(vcpu);
r = RESUME_GUEST; if (trap) { if (!nested)
r = kvmppc_handle_exit_hv(vcpu, current); else
r = kvmppc_handle_nested_exit(vcpu);
}
vcpu->arch.ret = r;
if (is_kvmppc_resume_guest(r) && !kvmppc_vcpu_check_block(vcpu)) {
kvmppc_set_timer(vcpu);
prepare_to_rcuwait(wait); for (;;) {
set_current_state(TASK_INTERRUPTIBLE); if (signal_pending(current)) {
vcpu->stat.signal_exits++;
run->exit_reason = KVM_EXIT_INTR;
vcpu->arch.ret = -EINTR; break;
}
staticint kvmppc_vcpu_run_hv(struct kvm_vcpu *vcpu)
{ struct kvm_run *run = vcpu->run; int r; int srcu_idx; struct kvm *kvm; unsignedlong msr;
start_timing(vcpu, &vcpu->arch.vcpu_entry);
if (!vcpu->arch.sane) {
run->exit_reason = KVM_EXIT_INTERNAL_ERROR; return -EINVAL;
}
/* No need to go into the guest when all we'll do is come back out */ if (signal_pending(current)) {
run->exit_reason = KVM_EXIT_INTR; return -EINTR;
}
#ifdef CONFIG_PPC_TRANSACTIONAL_MEM /* * Don't allow entry with a suspended transaction, because * the guest entry/exit code will lose it.
*/ if (cpu_has_feature(CPU_FTR_TM) && current->thread.regs &&
(current->thread.regs->msr & MSR_TM)) { if (MSR_TM_ACTIVE(current->thread.regs->msr)) {
run->exit_reason = KVM_EXIT_FAIL_ENTRY;
run->fail_entry.hardware_entry_failure_reason = 0; return -EINVAL;
}
} #endif
/* * Force online to 1 for the sake of old userspace which doesn't * set it.
*/ if (!vcpu->arch.online) {
atomic_inc(&vcpu->arch.vcore->online_count);
vcpu->arch.online = 1;
}
kvmppc_core_prepare_to_enter(vcpu);
kvm = vcpu->kvm;
atomic_inc(&kvm->arch.vcpus_running); /* Order vcpus_running vs. mmu_ready, see kvmppc_alloc_reset_hpt */
smp_mb();
msr = 0; if (IS_ENABLED(CONFIG_PPC_FPU))
msr |= MSR_FP; if (cpu_has_feature(CPU_FTR_ALTIVEC))
msr |= MSR_VEC; if (cpu_has_feature(CPU_FTR_VSX))
msr |= MSR_VSX; if ((cpu_has_feature(CPU_FTR_TM) ||
cpu_has_feature(CPU_FTR_P9_TM_HV_ASSIST)) &&
(kvmppc_get_hfscr_hv(vcpu) & HFSCR_TM))
msr |= MSR_TM;
msr = msr_check_and_set(msr);
do {
accumulate_time(vcpu, &vcpu->arch.guest_entry); if (cpu_has_feature(CPU_FTR_ARCH_300))
r = kvmhv_run_single_vcpu(vcpu, ~(u64)0,
vcpu->arch.vcore->lpcr); else
r = kvmppc_run_vcpu(vcpu);
if (run->exit_reason == KVM_EXIT_PAPR_HCALL) {
accumulate_time(vcpu, &vcpu->arch.hcall);
if (!kvmhv_is_nestedv2() && WARN_ON_ONCE(__kvmppc_get_msr_hv(vcpu) & MSR_PR)) { /* * These should have been caught reflected * into the guest by now. Final sanity check: * don't allow userspace to execute hcalls in * the hypervisor.
*/
r = RESUME_GUEST; continue;
}
trace_kvm_hcall_enter(vcpu);
r = kvmppc_pseries_do_hcall(vcpu);
trace_kvm_hcall_exit(vcpu, r);
kvmppc_core_prepare_to_enter(vcpu);
} elseif (r == RESUME_PAGE_FAULT) {
accumulate_time(vcpu, &vcpu->arch.pg_fault);
srcu_idx = srcu_read_lock(&kvm->srcu);
r = kvmppc_book3s_hv_page_fault(vcpu,
vcpu->arch.fault_dar, vcpu->arch.fault_dsisr);
srcu_read_unlock(&kvm->srcu, srcu_idx);
} elseif (r == RESUME_PASSTHROUGH) { if (WARN_ON(xics_on_xive()))
r = H_SUCCESS; else
r = kvmppc_xics_rm_complete(vcpu, 0);
}
} while (is_kvmppc_resume_guest(r));
accumulate_time(vcpu, &vcpu->arch.vcpu_exit);
/* * POWER7, POWER8 and POWER9 all support 32 storage keys for data. * POWER7 doesn't support keys for instruction accesses, * POWER8 and POWER9 do.
*/
info->data_keys = 32;
info->instr_keys = cpu_has_feature(CPU_FTR_ARCH_207S) ? 32 : 0;
/* POWER7, 8 and 9 all have 1T segments and 32-entry SLB */
info->flags = KVM_PPC_PAGE_SIZES_REAL | KVM_PPC_1T_SEGMENTS;
info->slb_size = 32;
/* We only support these sizes for now, and no multi-size segments */
sps = &info->sps[0];
kvmppc_add_seg_page_size(&sps, 12, 0);
kvmppc_add_seg_page_size(&sps, 16, SLB_VSID_L | SLB_VSID_LP_01);
kvmppc_add_seg_page_size(&sps, 24, SLB_VSID_L);
/* If running as a nested hypervisor, we don't support HPT guests */ if (kvmhv_on_pseries())
info->flags |= KVM_PPC_NO_HASH;
return 0;
}
/*
 * Get (and clear) the dirty memory log for a memory slot.
 *
 * log->slot selects the memslot and log->dirty_bitmap is the userspace
 * buffer that receives the resulting bitmap. kvm->slots_lock is held for
 * the whole operation.
 *
 * Returns 0 on success, -EINVAL if log->slot is not below
 * KVM_USER_MEM_SLOTS, -ENOENT if the slot does not exist or has no dirty
 * bitmap, -EFAULT if copying to userspace fails, or the non-zero result of
 * the radix/HPT harvesting helper.
 */
static int kvm_vm_ioctl_get_dirty_log_hv(struct kvm *kvm,
					 struct kvm_dirty_log *log)
{
	struct kvm_memslots *slots;
	struct kvm_memory_slot *memslot;
	int r;
	unsigned long n, i;
	unsigned long *buf, *p;
	struct kvm_vcpu *vcpu;

	mutex_lock(&kvm->slots_lock);

	r = -EINVAL;
	if (log->slot >= KVM_USER_MEM_SLOTS)
		goto out;

	slots = kvm_memslots(kvm);
	memslot = id_to_memslot(slots, log->slot);
	r = -ENOENT;
	if (!memslot || !memslot->dirty_bitmap)
		goto out;

	/*
	 * Use second half of bitmap area because both HPT and radix
	 * accumulate bits in the first half.
	 */
	n = kvm_dirty_bitmap_bytes(memslot);
	buf = memslot->dirty_bitmap + n / sizeof(long);
	memset(buf, 0, n);

	if (kvm_is_radix(kvm))
		r = kvmppc_hv_get_dirty_log_radix(kvm, memslot, buf);
	else
		r = kvmppc_hv_get_dirty_log_hpt(kvm, memslot, buf);
	if (r)
		goto out;

	/*
	 * We accumulate dirty bits in the first half of the
	 * memslot's dirty_bitmap area, for when pages are paged
	 * out or modified by the host directly.  Pick up these
	 * bits and add them to the map.  xchg() fetches and clears
	 * each word in one step.
	 */
	p = memslot->dirty_bitmap;
	for (i = 0; i < n / sizeof(long); ++i)
		buf[i] |= xchg(&p[i], 0);

	/* Harvest dirty bits from VPA and DTL updates */
	/* Note: we never modify the SLB shadow buffer areas */
	kvm_for_each_vcpu(i, vcpu, kvm) {
		spin_lock(&vcpu->arch.vpa_update_lock);
		kvmppc_harvest_vpa_dirty(&vcpu->arch.vpa, memslot, buf);
		kvmppc_harvest_vpa_dirty(&vcpu->arch.dtl, memslot, buf);
		spin_unlock(&vcpu->arch.vpa_update_lock);
	}

	r = -EFAULT;
	if (copy_to_user(log->dirty_bitmap, buf, n))
		goto out;

	r = 0;
out:
	mutex_unlock(&kvm->slots_lock);
	return r;
}
staticvoid kvmppc_core_commit_memory_region_hv(struct kvm *kvm, struct kvm_memory_slot *old, conststruct kvm_memory_slot *new, enum kvm_mr_change change)
{ /* * If we are creating or modifying a memslot, it might make * some address that was previously cached as emulated * MMIO be no longer emulated MMIO, so invalidate * all the caches of emulated MMIO translations.
*/ if (change != KVM_MR_DELETE)
atomic64_inc(&kvm->arch.mmio_update);
/* * For change == KVM_MR_MOVE or KVM_MR_DELETE, higher levels * have already called kvm_arch_flush_shadow_memslot() to * flush shadow mappings. For KVM_MR_CREATE we have no * previous mappings. So the only case to handle is * KVM_MR_FLAGS_ONLY when the KVM_MEM_LOG_DIRTY_PAGES bit * has been changed. * For radix guests, we flush on setting KVM_MEM_LOG_DIRTY_PAGES * to get rid of any THP PTEs in the partition-scoped page tables * so we can track dirtiness at the page level; we flush when * clearing KVM_MEM_LOG_DIRTY_PAGES so that we can go back to * using THP PTEs.
*/ if (change == KVM_MR_FLAGS_ONLY && kvm_is_radix(kvm) &&
((new->flags ^ old->flags) & KVM_MEM_LOG_DIRTY_PAGES))
kvmppc_radix_flush_memslot(kvm, old); /* * If UV hasn't yet called H_SVM_INIT_START, don't register memslots.
*/ if (!kvm->arch.secure_guest) return;
switch (change) { case KVM_MR_CREATE: /* * @TODO kvmppc_uvmem_memslot_create() can fail and * return error. Fix this.
*/
kvmppc_uvmem_memslot_create(kvm, new); break; case KVM_MR_DELETE:
kvmppc_uvmem_memslot_delete(kvm, old); break; default: /* TODO: Handle KVM_MR_MOVE */ break;
}
}
/* * Update LPCR values in kvm->arch and in vcores. * Caller must hold kvm->arch.mmu_setup_lock (for mutual exclusion * of kvm->arch.lpcr update).
*/ void kvmppc_update_lpcr(struct kvm *kvm, unsignedlong lpcr, unsignedlong mask)
{ longint i;
u32 cores_done = 0;
if ((kvm->arch.lpcr & mask) == lpcr) return;
kvm->arch.lpcr = (kvm->arch.lpcr & ~mask) | lpcr;
for (i = 0; i < KVM_MAX_VCORES; ++i) { struct kvmppc_vcore *vc = kvm->arch.vcores[i]; if (!vc) continue;
if (!kvm_is_radix(kvm)) { /* PS field - page size for VRMA */
dw0 = ((kvm->arch.vrma_slb_v & SLB_VSID_L) >> 1) |
((kvm->arch.vrma_slb_v & SLB_VSID_LP) << 1); /* HTABSIZE and HTABORG fields */
dw0 |= kvm->arch.sdr1;
/* Second dword as set by userspace */
dw1 = kvm->arch.process_table;
} else {
dw0 = PATB_HR | radix__get_tree_size() |
__pa(kvm->arch.pgtable) | RADIX_PGD_INDEX_SIZE;
dw1 = PATB_GR | kvm->arch.process_table;
}
kvmhv_set_ptbl_entry(kvm->arch.lpid, dw0, dw1);
}
/* * Set up HPT (hashed page table) and RMA (real-mode area). * Must be called with kvm->arch.mmu_setup_lock held.
*/ staticint kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu)
{ int err = 0; struct kvm *kvm = vcpu->kvm; unsignedlong hva; struct kvm_memory_slot *memslot; struct vm_area_struct *vma; unsignedlong lpcr = 0, senc; unsignedlong psize, porder; int srcu_idx;
/* Allocate hashed page table (if not done already) and reset it */ if (!kvm->arch.hpt.virt) { int order = KVM_DEFAULT_HPT_ORDER; struct kvm_hpt_info info;
err = kvmppc_allocate_hpt(&info, order); /* If we get here, it means userspace didn't specify a * size explicitly. So, try successively smaller
* sizes if the default failed. */ while ((err == -ENOMEM) && --order >= PPC_MIN_HPT_ORDER)
err = kvmppc_allocate_hpt(&info, order);
/* Look up the memslot for guest physical address 0 */
srcu_idx = srcu_read_lock(&kvm->srcu);
memslot = gfn_to_memslot(kvm, 0);
/* We must have some memory at 0 by now */
err = -EINVAL; if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID)) goto out_srcu;
/* Look up the VMA for the start of this memory slot */
hva = memslot->userspace_addr;
mmap_read_lock(kvm->mm);
vma = vma_lookup(kvm->mm, hva); if (!vma || (vma->vm_flags & VM_IO)) goto up_out;
psize = vma_kernel_pagesize(vma);
mmap_read_unlock(kvm->mm);
/* We can handle 4k, 64k or 16M pages in the VRMA */ if (psize >= 0x1000000)
psize = 0x1000000; elseif (psize >= 0x10000)
psize = 0x10000; else
psize = 0x1000;
porder = __ilog2(psize);
senc = slb_pgsize_encoding(psize);
kvm->arch.vrma_slb_v = senc | SLB_VSID_B_1T |
(VRMA_VSID << SLB_VSID_SHIFT_1T); /* Create HPTEs in the hash page table for the VRMA */
kvmppc_map_vrma(vcpu, memslot, porder);
/* Update VRMASD field in the LPCR */ if (!cpu_has_feature(CPU_FTR_ARCH_300)) { /* the -4 is to account for senc values starting at 0x10 */
lpcr = senc << (LPCR_VRMASD_SH - 4);
kvmppc_update_lpcr(kvm, lpcr, LPCR_VRMASD);
}
/* Order updates to kvm->arch.lpcr etc. vs. mmu_ready */
smp_wmb();
err = 0;
out_srcu:
srcu_read_unlock(&kvm->srcu, srcu_idx);
out: return err;
/* * Must be called with kvm->arch.mmu_setup_lock held and * mmu_ready = 0 and no vcpus running.
*/ int kvmppc_switch_mmu_to_hpt(struct kvm *kvm)
{ unsignedlong lpcr, lpcr_mask;
if (nesting_enabled(kvm))
kvmhv_release_all_nested(kvm);
kvmppc_rmap_reset(kvm);
kvm->arch.process_table = 0; /* Mutual exclusion with kvm_unmap_gfn_range etc. */
spin_lock(&kvm->mmu_lock);
kvm->arch.radix = 0;
spin_unlock(&kvm->mmu_lock);
kvmppc_free_radix(kvm);
/* * Must be called with kvm->arch.mmu_setup_lock held and * mmu_ready = 0 and no vcpus running.
*/ int kvmppc_switch_mmu_to_radix(struct kvm *kvm)
{ unsignedlong lpcr, lpcr_mask; int err;
err = kvmppc_init_vm_radix(kvm); if (err) return err;
kvmppc_rmap_reset(kvm); /* Mutual exclusion with kvm_unmap_gfn_range etc. */
spin_lock(&kvm->mmu_lock);
kvm->arch.radix = 1;
spin_unlock(&kvm->mmu_lock);
kvmppc_free_hpt(&kvm->arch.hpt);
#ifdef CONFIG_KVM_XICS /* * Allocate a per-core structure for managing state about which cores are * running in the host versus the guest and for exchanging data between * real mode KVM and CPU running in the host. * This is only done for the first VM. * The allocated structure stays even if all VMs have stopped. * It is only freed when the kvm-hv module is unloaded. * It's OK for this routine to fail, we just don't support host * core operations like redirecting H_IPI wakeups.
*/ void kvmppc_alloc_host_rm_ops(void)
{ struct kvmppc_host_rm_ops *ops; unsignedlong l_ops; int cpu, core; int size;
if (cpu_has_feature(CPU_FTR_ARCH_300)) return;
/* Not the first time here ? */ if (kvmppc_host_rm_ops_hv != NULL) return;
ops = kzalloc(sizeof(struct kvmppc_host_rm_ops), GFP_KERNEL); if (!ops) return;
for (cpu = 0; cpu < nr_cpu_ids; cpu += threads_per_core) { if (!cpu_online(cpu)) continue;
core = cpu >> threads_shift;
ops->rm_core[core].rm_state.in_host = 1;
}
ops->vcpu_kick = kvmppc_fast_vcpu_kick_hv;
/* * Make the contents of the kvmppc_host_rm_ops structure visible * to other CPUs before we assign it to the global variable. * Do an atomic assignment (no locks used here), but if someone * beats us to it, just free our copy and return.
*/
smp_wmb();
l_ops = (unsignedlong) ops;
switch (rc) { case H_PARAMETER: case H_FUNCTION: case H_STATE: return -EINVAL; case H_NOT_ENOUGH_RESOURCES: case H_ABORTED: return -ENOMEM; case H_AUTHORITY: return -EPERM; case H_NOT_AVAILABLE: return -EBUSY;
}
kvm->arch.lpid = guest_id;
}
/* * Since we don't flush the TLB when tearing down a VM, * and this lpid might have previously been used, * make sure we flush on each core before running the new VM. * On POWER9, the tlbie in mmu_partition_table_set_entry() * does this flush for us.
*/ if (!cpu_has_feature(CPU_FTR_ARCH_300))
cpumask_setall(&kvm->arch.need_tlb_flush);
/* Start out with the default set of hcalls enabled */
memcpy(kvm->arch.enabled_hcalls, default_enabled_hcalls, sizeof(kvm->arch.enabled_hcalls));
if (!cpu_has_feature(CPU_FTR_ARCH_300))
kvm->arch.host_sdr1 = mfspr(SPRN_SDR1);
/* Init LPCR for virtual RMA mode */ if (cpu_has_feature(CPU_FTR_HVMODE)) {
kvm->arch.host_lpid = mfspr(SPRN_LPID);
kvm->arch.host_lpcr = lpcr = mfspr(SPRN_LPCR);
lpcr &= LPCR_PECE | LPCR_LPES;
} else { /* * The L2 LPES mode will be set by the L0 according to whether * or not it needs to take external interrupts in HV mode.
*/
lpcr = 0;
}
lpcr |= (4UL << LPCR_DPFD_SH) | LPCR_HDICE |
LPCR_VPM0 | LPCR_VPM1;
kvm->arch.vrma_slb_v = SLB_VSID_B_1T |
(VRMA_VSID << SLB_VSID_SHIFT_1T); /* On POWER8 turn on online bit to enable PURR/SPURR */ if (cpu_has_feature(CPU_FTR_ARCH_207S))
lpcr |= LPCR_ONL; /* * On POWER9, VPM0 bit is reserved (VPM0=1 behaviour is assumed) * Set HVICE bit to enable hypervisor virtualization interrupts. * Set HEIC to prevent OS interrupts to go to hypervisor (should * be unnecessary but better safe than sorry in case we re-enable * EE in HV mode with this LPCR still set)
*/ if (cpu_has_feature(CPU_FTR_ARCH_300)) {
lpcr &= ~LPCR_VPM0;
lpcr |= LPCR_HVICE | LPCR_HEIC;
/* * If xive is enabled, we route 0x500 interrupts directly * to the guest.
*/ if (xics_on_xive())
lpcr |= LPCR_LPES;
}
/* * If the host uses radix, the guest starts out as radix.
*/ if (radix_enabled()) {
kvm->arch.radix = 1;
kvm->arch.mmu_ready = 1;
lpcr &= ~LPCR_VPM1;
lpcr |= LPCR_UPRT | LPCR_GTSE | LPCR_HR; if (cpu_has_feature(CPU_FTR_HVMODE) &&
cpu_has_feature(CPU_FTR_ARCH_31) &&
(kvm->arch.host_lpcr & LPCR_HAIL))
lpcr |= LPCR_HAIL;
ret = kvmppc_init_vm_radix(kvm); if (ret) { if (kvmhv_is_nestedv2())
plpar_guest_delete(0, kvm->arch.lpid); else
kvmppc_free_lpid(kvm->arch.lpid); return ret;
}
kvmppc_setup_partition_table(kvm);
}
verify_lpcr(kvm, lpcr);
kvm->arch.lpcr = lpcr;
/* Initialization for future HPT resizes */
kvm->arch.resize_hpt = NULL;
/* * Work out how many sets the TLB has, for the use of * the TLB invalidation loop in book3s_hv_rmhandlers.S.
*/ if (cpu_has_feature(CPU_FTR_ARCH_31)) { /* * P10 will flush all the congruence class with a single tlbiel
*/
kvm->arch.tlb_sets = 1;
} elseif (radix_enabled())
kvm->arch.tlb_sets = POWER9_TLB_SETS_RADIX; /* 128 */ elseif (cpu_has_feature(CPU_FTR_ARCH_300))
kvm->arch.tlb_sets = POWER9_TLB_SETS_HASH; /* 256 */ elseif (cpu_has_feature(CPU_FTR_ARCH_207S))
kvm->arch.tlb_sets = POWER8_TLB_SETS; /* 512 */ else
kvm->arch.tlb_sets = POWER7_TLB_SETS; /* 128 */
/* * Track that we now have a HV mode VM active. This blocks secondary * CPU threads from coming online.
*/ if (!cpu_has_feature(CPU_FTR_ARCH_300))
kvm_hv_vm_activated();
/* * Initialize smt_mode depending on processor. * POWER8 and earlier have to use "strict" threading, where * all vCPUs in a vcore have to run on the same (sub)core, * whereas on POWER9 the threads can each run a different * guest.
*/ if (!cpu_has_feature(CPU_FTR_ARCH_300))
kvm->arch.smt_mode = threads_per_subcore; else
kvm->arch.smt_mode = 1;
kvm->arch.emul_smt_mode = 1;
for (i = 0; i < KVM_MAX_VCORES; ++i)
kfree(kvm->arch.vcores[i]);
kvm->arch.online_vcores = 0;
}
staticvoid kvmppc_core_destroy_vm_hv(struct kvm *kvm)
{ if (!cpu_has_feature(CPU_FTR_ARCH_300))
kvm_hv_vm_deactivated();
kvmppc_free_vcores(kvm);
if (kvm_is_radix(kvm))
kvmppc_free_radix(kvm); else
kvmppc_free_hpt(&kvm->arch.hpt);
/* Perform global invalidation and return lpid to the pool */ if (cpu_has_feature(CPU_FTR_ARCH_300)) { if (nesting_enabled(kvm))
kvmhv_release_all_nested(kvm);
kvm->arch.process_table = 0; if (kvm->arch.secure_guest)
uv_svm_terminate(kvm->arch.lpid); if (!kvmhv_is_nestedv2())
kvmhv_set_ptbl_entry(kvm->arch.lpid, 0, 0);
}
/* We don't need to emulate any privileged instructions or dcbz */ staticint kvmppc_core_emulate_op_hv(struct kvm_vcpu *vcpu, unsignedint inst, int *advance)
{ return EMULATE_FAIL;
}
/*
 * Record a passthrough mapping from a host interrupt to a guest interrupt.
 *
 * @kvm:       VM whose passthrough IRQ map (kvm->arch.pimap) is updated
 * @host_irq:  host interrupt number
 * @guest_gsi: guest interrupt number to associate with it
 *
 * The map is allocated on first use.  All map accesses are done with
 * kvm->lock held.
 *
 * Returns 1 when kvm_irq_bypass is off, -EIO when host_irq has no
 * descriptor, -ENOMEM when the map cannot be allocated, -ENOENT when the
 * interrupt chip is not an OPAL MSI chip, -EINVAL when guest_gsi is
 * already mapped to a hardware IRQ, -EAGAIN when the table is full, and
 * 0 otherwise.  Note that 0 is returned even if kvmppc_xive_set_mapped()
 * fails; in that case the entry's r_hwirq is reset to 0.
 */
static int kvmppc_set_passthru_irq(struct kvm *kvm, int host_irq, int guest_gsi)
{
	struct irq_desc *desc;
	struct kvmppc_irq_map *irq_map;
	struct kvmppc_passthru_irqmap *pimap;
	struct irq_chip *chip;
	int i, rc = 0;
	struct irq_data *host_data;

	if (!kvm_irq_bypass)
		return 1;

	desc = irq_to_desc(host_irq);
	if (!desc)
		return -EIO;

	mutex_lock(&kvm->lock);

	pimap = kvm->arch.pimap;
	if (pimap == NULL) {
		/* First call, allocate structure to hold IRQ map */
		pimap = kvmppc_alloc_pimap();
		if (pimap == NULL) {
			mutex_unlock(&kvm->lock);
			return -ENOMEM;
		}
		kvm->arch.pimap = pimap;
	}

	/*
	 * For now, we only support interrupts for which the EOI operation
	 * is an OPAL call followed by a write to XIRR, since that's
	 * what our real-mode EOI code does.
	 */
	chip = irq_data_get_irq_chip(&desc->irq_data);
	if (!chip || !is_pnv_opal_msi(chip)) {
		pr_warn("kvmppc_set_passthru_irq_hv: Could not assign IRQ map for (%d,%d)\n",
			host_irq, guest_gsi);
		mutex_unlock(&kvm->lock);
		return -ENOENT;
	}

	/*
	 * See if we already have an entry for this guest IRQ number.
	 * If it's mapped to a hardware IRQ number, that's an error,
	 * otherwise re-use this entry.
	 */
	for (i = 0; i < pimap->n_mapped; i++) {
		if (guest_gsi == pimap->mapped[i].v_hwirq) {
			if (pimap->mapped[i].r_hwirq) {
				mutex_unlock(&kvm->lock);
				return -EINVAL;
			}
			break;
		}
	}

	if (i == KVMPPC_PIRQ_MAPPED) {
		mutex_unlock(&kvm->lock);
		return -EAGAIN;		/* table is full */
	}

	/*
	 * Select the slot (either the re-used entry found above or the
	 * first free one) before touching it; without this assignment
	 * irq_map would be dereferenced uninitialized below.
	 */
	irq_map = &pimap->mapped[i];

	irq_map->v_hwirq = guest_gsi;
	irq_map->desc = desc;

	/*
	 * Order the above two stores before the next to serialize with
	 * the KVM real mode handler.
	 */
	smp_wmb();

	/*
	 * The 'host_irq' number is mapped in the PCI-MSI domain but
	 * the underlying calls, which will EOI the interrupt in real
	 * mode, need an HW IRQ number mapped in the XICS IRQ domain.
	 */
	host_data = irq_domain_get_irq_data(irq_get_default_domain(), host_irq);
	irq_map->r_hwirq = (unsigned int)irqd_to_hwirq(host_data);

	/* A brand new slot was consumed: account for it */
	if (i == pimap->n_mapped)
		pimap->n_mapped++;

	if (xics_on_xive())
		rc = kvmppc_xive_set_mapped(kvm, guest_gsi, host_irq);
	else
		kvmppc_xics_set_mapped(kvm, guest_gsi, irq_map->r_hwirq);
	/* On failure leave the entry present but not mapped to a HW IRQ */
	if (rc)
		irq_map->r_hwirq = 0;

	mutex_unlock(&kvm->lock);

	return 0;
}
/*
 * Remove the passthrough mapping for a guest interrupt.
 *
 * @kvm:       VM whose passthrough IRQ map (kvm->arch.pimap) is updated
 * @host_irq:  host interrupt number
 * @guest_gsi: guest interrupt number whose mapping is dropped
 *
 * Returns 0 when kvm_irq_bypass is off or no map has been allocated,
 * -EIO when host_irq has no descriptor, -ENODEV when guest_gsi has no
 * entry in the map, and otherwise the result of the unmapping step
 * (0 on the XICS path, the kvmppc_xive_clr_mapped() result on XIVE).
 */
static int kvmppc_clr_passthru_irq(struct kvm *kvm, int host_irq, int guest_gsi)
{
	struct irq_desc *desc;
	struct kvmppc_passthru_irqmap *pimap;
	int i, rc = 0;

	if (!kvm_irq_bypass)
		return 0;

	desc = irq_to_desc(host_irq);
	if (!desc)
		return -EIO;

	mutex_lock(&kvm->lock);
	if (!kvm->arch.pimap)
		goto unlock;

	pimap = kvm->arch.pimap;

	for (i = 0; i < pimap->n_mapped; i++) {
		if (guest_gsi == pimap->mapped[i].v_hwirq)
			break;
	}

	if (i == pimap->n_mapped) {
		mutex_unlock(&kvm->lock);
		return -ENODEV;
	}

	/*
	 * Undo what kvmppc_set_passthru_irq() registered with the
	 * interrupt controller emulation, mirroring its XIVE/XICS split.
	 */
	if (xics_on_xive())
		rc = kvmppc_xive_clr_mapped(kvm, guest_gsi, host_irq);
	else
		kvmppc_xics_clr_mapped(kvm, guest_gsi, pimap->mapped[i].r_hwirq);

	/* invalidate the entry (what to do on error from the above ?) */
	pimap->mapped[i].r_hwirq = 0;

	/*
	 * We don't free this structure even when the count goes to
	 * zero. The structure is freed when we destroy the VM.
	 */
 unlock:
	mutex_unlock(&kvm->lock);
	return rc;
}
staticint kvmppc_irq_bypass_add_producer_hv(struct irq_bypass_consumer *cons, struct irq_bypass_producer *prod)
{ int ret = 0; struct kvm_kernel_irqfd *irqfd =
container_of(cons, struct kvm_kernel_irqfd, consumer);
irqfd->producer = prod;
ret = kvmppc_set_passthru_irq(irqfd->kvm, prod->irq, irqfd->gsi); if (ret)
pr_info("kvmppc_set_passthru_irq (irq %d, gsi %d) fails: %d\n",
prod->irq, irqfd->gsi, ret);
/* * When producer of consumer is unregistered, we change back to * default external interrupt handling mode - KVM real mode * will switch back to host.
*/
ret = kvmppc_clr_passthru_irq(irqfd->kvm, prod->irq, irqfd->gsi); if (ret)
pr_warn("kvmppc_clr_passthru_irq (irq %d, gsi %d) fails: %d\n",
prod->irq, irqfd->gsi, ret);
} #endif
/* If we're a nested hypervisor, we currently only support radix */ if (kvmhv_on_pseries()) {
r = -EOPNOTSUPP; break;
}
r = -EFAULT; if (get_user(htab_order, (u32 __user *)argp)) break;
r = kvmppc_alloc_reset_hpt(kvm, htab_order); if (r) break;
r = 0; break;
}
case KVM_PPC_GET_HTAB_FD: { struct kvm_get_htab_fd ghf;
r = -EFAULT; if (copy_from_user(&ghf, argp, sizeof(ghf))) break;
r = kvm_vm_ioctl_get_htab_fd(kvm, &ghf); break;
}
case KVM_PPC_RESIZE_HPT_PREPARE: { struct kvm_ppc_resize_hpt rhpt;
r = -EFAULT; if (copy_from_user(&rhpt, argp, sizeof(rhpt))) break;
r = kvm_vm_ioctl_resize_hpt_prepare(kvm, &rhpt); break;
}
case KVM_PPC_RESIZE_HPT_COMMIT: { struct kvm_ppc_resize_hpt rhpt;
r = -EFAULT; if (copy_from_user(&rhpt, argp, sizeof(rhpt))) break;
r = kvm_vm_ioctl_resize_hpt_commit(kvm, &rhpt); break;
}
default:
r = -ENOTTY;
}
return r;
}
/* * List of hcall numbers to enable by default. * For compatibility with old userspace, we enable by default * all hcalls that were implemented before the hcall-enabling * facility was added. Note this list should not include H_RTAS.
*/ staticunsignedint default_hcall_list[] = {
H_REMOVE,
H_ENTER,
H_READ,
H_PROTECT,
H_BULK_REMOVE, #ifdef CONFIG_SPAPR_TCE_IOMMU
H_GET_TCE,
H_PUT_TCE, #endif
H_SET_DABR,
H_SET_XDABR,
H_CEDE,
H_PROD,
H_CONFER,
H_REGISTER_VPA, #ifdef CONFIG_KVM_XICS
H_EOI,
H_CPPR,
H_IPI,
H_IPOLL,
H_XIRR,
H_XIRR_X, #endif
0
};
/*
 * Build the default_enabled_hcalls bitmap from default_hcall_list.
 *
 * Walks the list up to its 0 terminator.  Each hcall number is divided
 * by 4 to obtain its bit index in the bitmap.  A warning is raised for
 * any listed hcall that kvmppc_hcall_impl_hv() does not report as
 * implemented; the bit is set regardless.
 */
static void init_default_hcalls(void)
{
	int i;
	unsigned int hcall;

	for (i = 0; default_hcall_list[i]; ++i) {
		hcall = default_hcall_list[i];
		WARN_ON(!kvmppc_hcall_impl_hv(hcall));
		__set_bit(hcall / 4, default_enabled_hcalls);
	}
}
staticint kvmhv_configure_mmu(struct kvm *kvm, struct kvm_ppc_mmuv3_cfg *cfg)
{ unsignedlong lpcr; int radix; int err;
/* If not on a POWER9, reject it */ if (!cpu_has_feature(CPU_FTR_ARCH_300)) return -ENODEV;
/* If any unknown flags set, reject it */ if (cfg->flags & ~(KVM_PPC_MMUV3_RADIX | KVM_PPC_MMUV3_GTSE)) return -EINVAL;
/* GR (guest radix) bit in process_table field must match */
radix = !!(cfg->flags & KVM_PPC_MMUV3_RADIX); if (!!(cfg->process_table & PATB_GR) != radix) return -EINVAL;
/* Process table size field must be reasonable, i.e. <= 24 */ if ((cfg->process_table & PRTS_MASK) > 24) return -EINVAL;
/* We can change a guest to/from radix now, if the host is radix */ if (radix && !radix_enabled()) return -EINVAL;
/* If we're a nested hypervisor, we currently only support radix */ if (kvmhv_on_pseries() && !radix) return -EINVAL;
mutex_lock(&kvm->arch.mmu_setup_lock); if (radix != kvm_is_radix(kvm)) { if (kvm->arch.mmu_ready) {
kvm->arch.mmu_ready = 0; /* order mmu_ready vs. vcpus_running */
smp_mb(); if (atomic_read(&kvm->arch.vcpus_running)) {
kvm->arch.mmu_ready = 1;
err = -EBUSY; goto out_unlock;
}
} if (radix)
err = kvmppc_switch_mmu_to_radix(kvm); else
err = kvmppc_switch_mmu_to_hpt(kvm); if (err) goto out_unlock;
}
/*
 * Enable a guest to become a secure VM, or test whether
 * that could be enabled.
 * Called when the KVM_CAP_PPC_SECURE_GUEST capability is
 * tested (kvm == NULL) or enabled (kvm != NULL).
 *
 * Returns -EINVAL when kvmppc_uvmem_available() reports false;
 * otherwise returns 0 and, if @kvm is non-NULL, sets
 * kvm->arch.svm_enabled.
 */
static int kvmhv_enable_svm(struct kvm *kvm)
{
	if (!kvmppc_uvmem_available())
		return -EINVAL;
	if (kvm)
		kvm->arch.svm_enabled = 1;
	return 0;
}
/* * IOCTL handler to turn off secure mode of guest * * - Release all device pages * - Issue ucall to terminate the guest on the UV side * - Unpin the VPA pages. * - Reinit the partition scoped page tables
*/ staticint kvmhv_svm_off(struct kvm *kvm)
{ struct kvm_vcpu *vcpu; int mmu_was_ready; int srcu_idx; int ret = 0; unsignedlong i;
if (!(kvm->arch.secure_guest & KVMPPC_SECURE_INIT_START)) return ret;
mutex_lock(&kvm->arch.mmu_setup_lock);
mmu_was_ready = kvm->arch.mmu_ready; if (kvm->arch.mmu_ready) {
kvm->arch.mmu_ready = 0; /* order mmu_ready vs. vcpus_running */
smp_mb(); if (atomic_read(&kvm->arch.vcpus_running)) {
kvm->arch.mmu_ready = 1;
ret = -EBUSY; goto out;
}
}
srcu_idx = srcu_read_lock(&kvm->srcu); for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) { struct kvm_memory_slot *memslot; struct kvm_memslots *slots = __kvm_memslots(kvm, i); int bkt;
ret = uv_svm_terminate(kvm->arch.lpid); if (ret != U_SUCCESS) {
ret = -EINVAL; goto out;
}
/* * When secure guest is reset, all the guest pages are sent * to UV via UV_PAGE_IN before the non-boot vcpus get a * chance to run and unpin their VPA pages. Unpinning of all * VPA pages is done here explicitly so that VPA pages * can be migrated to the secure side. * * This is required to for the secure SMP guest to reboot * correctly.
*/
kvm_for_each_vcpu(i, vcpu, kvm) {
spin_lock(&vcpu->arch.vpa_update_lock);
unpin_vpa_reset(kvm, &vcpu->arch.dtl);
unpin_vpa_reset(kvm, &vcpu->arch.slb_shadow);
unpin_vpa_reset(kvm, &vcpu->arch.vpa);
spin_unlock(&vcpu->arch.vpa_update_lock);
}
/*
 * Enable the second DAWR for a guest, or test whether it could be enabled.
 *
 * Returns -ENODEV when the CPU lacks CPU_FTR_DAWR1; otherwise returns 0
 * and, if @kvm is non-NULL, sets kvm->arch.dawr1_enabled.
 */
static int kvmhv_enable_dawr1(struct kvm *kvm)
{
	if (!cpu_has_feature(CPU_FTR_DAWR1))
		return -ENODEV;

	/* kvm == NULL means the caller is testing if the capability exists */
	if (kvm)
		kvm->arch.dawr1_enabled = true;
	return 0;
}
staticbool kvmppc_hash_v3_possible(void)
{ if (!cpu_has_feature(CPU_FTR_ARCH_300)) returnfalse;
if (!cpu_has_feature(CPU_FTR_HVMODE)) returnfalse;
/* * POWER9 chips before version 2.02 can't have some threads in * HPT mode and some in radix mode on the same core.
*/ if (radix_enabled()) { unsignedint pvr = mfspr(SPRN_PVR); if ((pvr >> 16) == PVR_POWER9 &&
(((pvr & 0xe000) == 0 && (pvr & 0xfff) < 0x202) ||
((pvr & 0xe000) == 0x2000 && (pvr & 0xfff) < 0x101))) returnfalse;
}
if (!tlbie_capable) {
pr_err("KVM-HV: Host does not support TLBIE\n"); return -ENODEV;
}
/* * FIXME!! Do we need to check on all cpus ?
*/
r = kvmppc_core_check_processor_compat_hv(); if (r < 0) return -ENODEV;
r = kvmhv_nested_init(); if (r) return r;
if (!cpu_has_feature(CPU_FTR_ARCH_300)) {
r = kvm_init_subcore_bitmap(); if (r) goto err;
}
/* * We need a way of accessing the XICS interrupt controller, * either directly, via paca_ptrs[cpu]->kvm_hstate.xics_phys, or * indirectly, via OPAL.
*/ #ifdef CONFIG_SMP if (!xics_on_xive() && !kvmhv_on_pseries() &&
!local_paca->kvm_hstate.xics_phys) { struct device_node *np;
np = of_find_compatible_node(NULL, NULL, "ibm,opal-intc"); if (!np) {
pr_err("KVM-HV: Cannot determine method for accessing XICS\n");
r = -ENODEV; goto err;
} /* presence of intc confirmed - node can be dropped again */
of_node_put(np);
} #endif
init_default_hcalls();
init_vcore_lists();
r = kvmppc_mmu_hv_init(); if (r) goto err;
if (kvmppc_radix_possible()) {
r = kvmppc_radix_init(); if (r) goto err;
}
r = kvmppc_uvmem_init(); if (r < 0) {
pr_err("KVM-HV: kvmppc_uvmem_init failed %d\n", r); return r;
}
#ifdefined(CONFIG_KVM_XICS) /* * IRQ bypass is supported only for interrupts whose EOI operations are * handled via OPAL calls. Therefore, register IRQ bypass handlers * exclusively for PowerNV KVM when booted with 'xive=off', indicating * the use of the emulated XICS interrupt controller.
*/ if (!kvmhv_on_pseries()) {
pr_info("KVM-HV: Enabling IRQ bypass\n");
kvm_ops_hv.irq_bypass_add_producer =
kvmppc_irq_bypass_add_producer_hv;
kvm_ops_hv.irq_bypass_del_producer =
kvmppc_irq_bypass_del_producer_hv;
} #endif
/* Module load and unload entry points for HV KVM. */
module_init(kvmppc_book3s_init_hv);
module_exit(kvmppc_book3s_exit_hv);
/* Module metadata, plus aliases for the KVM misc device minor and /dev/kvm. */
MODULE_DESCRIPTION("KVM on Book3S (POWER8 and later) in hypervisor mode");
MODULE_LICENSE("GPL");
MODULE_ALIAS_MISCDEV(KVM_MINOR);
MODULE_ALIAS("devname:kvm");
/*
 * Measurement V0.5 in percent
 * Processing time: 0.135 seconds
 * (preprocessed on 2026-04-25)
 *
 * The information on this web page was compiled carefully and to the
 * best of our knowledge. However, neither the completeness, nor the
 * correctness, nor the quality of the information provided is guaranteed.
 *
 * Note:
 * The coloured syntax highlighting and the measurement are still
 * experimental.
 */