if (entry->index == KVM_TDX_CPUID_NO_SUBLEAF)
entry->index = 0;
/* * The TDX module doesn't allow configuring the guest phys addr bits * (EAX[23:16]). However, KVM uses it as an interface to the userspace * to configure the GPAW. Report these bits as configurable.
*/ if (entry->function == 0x80000008)
entry->eax = tdx_set_guest_phys_addr_bits(entry->eax, 0xff);
for (i = 0; i < td_conf->num_cpuid_config; i++)
td_init_cpuid_entry2(&caps->cpuid.entries[i], i);
return 0;
}
/* * Some SEAMCALLs acquire the TDX module globally, and can fail with * TDX_OPERAND_BUSY. Use a global mutex to serialize these SEAMCALLs.
*/ static DEFINE_MUTEX(tdx_lock);
/* * A per-CPU list of TD vCPUs associated with a given CPU. * Protected by interrupt mask. Only manipulated by the CPU owning this per-CPU * list. * - When a vCPU is loaded onto a CPU, it is removed from the per-CPU list of * the old CPU during the IPI callback running on the old CPU, and then added * to the per-CPU list of the new CPU. * - When a TD is tearing down, all vCPUs are disassociated from their current * running CPUs and removed from the per-CPU list during the IPI callback * running on those CPUs. * - When a CPU is brought down, traverse the per-CPU list to disassociate all * associated TD vCPUs and remove them from the per-CPU list.
*/ static DEFINE_PER_CPU(struct list_head, associated_tdvcpus);
/* * Ensure tdx->cpu_list is updated before setting vcpu->cpu to -1, * otherwise, a different CPU can see vcpu->cpu = -1 and add the vCPU * to its list before it's deleted from this CPU's list.
*/
smp_wmb();
/* * The page could have been poisoned. MOVDIR64B also clears * the poison bit so the kernel can safely use the page again.
*/ for (i = 0; i < PAGE_SIZE; i += 64)
movdir64b(dest + i, zero_page); /* * MOVDIR64B store uses WC buffer. Prevent following memory reads * from seeing potentially poisoned cache.
*/
__mb();
}
/* * No need to check for TDX_OPERAND_BUSY; all TD pages are freed * before the HKID is released and control pages have also been * released at this point, so there is no possibility of contention.
*/ if (WARN_ON_ONCE(err)) {
pr_tdx_error_3(TDH_PHYMEM_PAGE_RECLAIM, err, rcx, rdx, r8); return -EIO;
} return 0;
}
/*
 * Reclaim a page from the TDX module and, on success, scrub it so the
 * kernel can reuse it.
 *
 * Return: 0 on success, a negative error code if the reclaim SEAMCALL
 * failed (the page must then be leaked by the caller).
 */
static int tdx_reclaim_page(struct page *page)
{
	int r;

	r = __tdx_reclaim_page(page);
	if (!r)
		tdx_clear_page(page);
	return r;
}
/* * Reclaim the TD control page(s) which are crypto-protected by TDX guest's * private KeyID. Assume the cache associated with the TDX private KeyID has * been flushed.
*/ staticvoid tdx_reclaim_control_page(struct page *ctrl_page)
{ /* * Leak the page if the kernel failed to reclaim the page. * The kernel cannot use it safely anymore.
*/ if (tdx_reclaim_page(ctrl_page)) return;
/* Task migration can race with CPU offlining. */ if (unlikely(vcpu->cpu != raw_smp_processor_id())) return;
/* * No need to do TDH_VP_FLUSH if the vCPU hasn't been initialized. The * list tracking still needs to be updated so that it's correct if/when * the vCPU does get initialized.
*/ if (to_tdx(vcpu)->state != VCPU_TD_STATE_UNINITIALIZED) { /* * No need to retry. TDX Resources needed for TDH.VP.FLUSH are: * TDVPR as exclusive, TDR as shared, and TDCS as shared. This * vp flush function is called when destructing vCPU/TD or vCPU * migration. No other thread uses TDVPR in those cases.
*/
err = tdh_vp_flush(&to_tdx(vcpu)->vp); if (unlikely(err && err != TDX_VCPU_NOT_ASSOCIATED)) { /* * This function is called in IPI context. Do not use * printk to avoid console semaphore. * The caller prints out the error message, instead.
*/ if (err)
arg->err = err;
}
}
tdx_disassociate_vp(vcpu);
}
/*
 * Flush the vCPU's TD state on the pCPU it is currently associated with,
 * via an IPI to that pCPU.  A vcpu->cpu of -1 means the vCPU is not
 * associated with any pCPU, so there is nothing to flush.
 */
static void tdx_flush_vp_on_cpu(struct kvm_vcpu *vcpu)
{
	struct tdx_flush_vp_arg arg = {
		.vcpu = vcpu,
	};
	int cpu = vcpu->cpu;

	if (unlikely(cpu == -1))
		return;

	smp_call_function_single(cpu, tdx_flush_vp, &arg, 1);
	if (KVM_BUG_ON(arg.err, vcpu->kvm))
		pr_tdx_error(TDH_VP_FLUSH, arg.err);
}
/* * TDH.PHYMEM.CACHE.WB flushes caches associated with any TDX private * KeyID on the package or core. The TDX module may not finish the * cache flush but return TDX_INTERRUPTED_RESUMEABLE instead. The * kernel should retry it until it returns success w/o rescheduling.
*/ for (i = TDX_SEAMCALL_RETRIES; i > 0; i--) {
resume = !!err;
err = tdh_phymem_cache_wb(resume); switch (err) { case TDX_INTERRUPTED_RESUMABLE: continue; case TDX_NO_HKID_READY_TO_WBCACHE:
err = TDX_SUCCESS; /* Already done by other thread */
fallthrough; default: goto out;
}
}
out: if (WARN_ON_ONCE(err))
pr_tdx_error(TDH_PHYMEM_CACHE_WB, err);
}
/* * TDH.PHYMEM.CACHE.WB tries to acquire the TDX module global lock * and can fail with TDX_OPERAND_BUSY when it fails to get the lock. * Multiple TDX guests can be destroyed simultaneously. Take the * mutex to prevent it from getting error.
*/
mutex_lock(&tdx_lock);
/* * Releasing HKID is in vm_destroy(). * After the above flushing vps, there should be no more vCPU * associations, as all vCPU fds have been released at this stage.
*/
err = tdh_mng_vpflushdone(&kvm_tdx->td); if (err == TDX_FLUSHVP_NOT_DONE) goto out; if (KVM_BUG_ON(err, kvm)) {
pr_tdx_error(TDH_MNG_VPFLUSHDONE, err);
pr_err("tdh_mng_vpflushdone() failed. HKID %d is leaked.\n",
kvm_tdx->hkid); goto out;
}
for_each_online_cpu(i) { if (packages_allocated &&
cpumask_test_and_set_cpu(topology_physical_package_id(i),
packages)) continue; if (targets_allocated)
cpumask_set_cpu(i, targets);
} if (targets_allocated)
on_each_cpu_mask(targets, smp_func_do_phymem_cache_wb, NULL, true); else
on_each_cpu(smp_func_do_phymem_cache_wb, NULL, true); /* * In the case of error in smp_func_do_phymem_cache_wb(), the following * tdh_mng_key_freeid() will fail.
*/
err = tdh_mng_key_freeid(&kvm_tdx->td); if (KVM_BUG_ON(err, kvm)) {
pr_tdx_error(TDH_MNG_KEY_FREEID, err);
pr_err("tdh_mng_key_freeid() failed. HKID %d is leaked.\n",
kvm_tdx->hkid);
} else {
tdx_hkid_free(kvm_tdx);
}
/* * tdx_mmu_release_hkid() failed to reclaim HKID. Something went wrong * heavily with TDX module. Give up freeing TD pages. As the function * already warned, don't warn it again.
*/ if (is_hkid_assigned(kvm_tdx)) return;
if (kvm_tdx->td.tdcs_pages) { for (i = 0; i < kvm_tdx->td.tdcs_nr_pages; i++) { if (!kvm_tdx->td.tdcs_pages[i]) continue;
if (__tdx_reclaim_page(kvm_tdx->td.tdr_page)) return;
/* * Use a SEAMCALL to ask the TDX module to flush the cache based on the * KeyID. TDX module may access TDR while operating on TD (Especially * when it is reclaiming TDCS).
*/
err = tdh_phymem_page_wbinvd_tdr(&kvm_tdx->td); if (KVM_BUG_ON(err, kvm)) {
pr_tdx_error(TDH_PHYMEM_PAGE_WBINVD, err); return;
}
tdx_clear_page(kvm_tdx->td.tdr_page);
/* * Because guest TD is protected, VMM can't parse the instruction in TD. * Instead, guest uses MMIO hypercall. For unmodified device driver, * #VE needs to be injected for MMIO and #VE handler in TD converts MMIO * instruction into MMIO hypercall. * * SPTE value for MMIO needs to be setup so that #VE is injected into * TD instead of triggering EPT MISCONFIG. * - RWX=0 so that EPT violation is triggered. * - suppress #VE bit is cleared to inject #VE.
*/
kvm_mmu_set_mmio_spte_value(kvm, 0);
/* * TDX has its own limit of maximum vCPUs it can support for all * TDX guests in addition to KVM_MAX_VCPUS. TDX module reports * such limit via the MAX_VCPU_PER_TD global metadata. In * practice, it reflects the number of logical CPUs that ALL * platforms that the TDX module supports can possibly have. * * Limit TDX guest's maximum vCPUs to the number of logical CPUs * the platform has. Simply forwarding the MAX_VCPU_PER_TD to * userspace would result in an unpredictable ABI.
*/
kvm->max_vcpus = min_t(int, kvm->max_vcpus, num_present_cpus());
if (kvm_tdx->state != TD_STATE_INITIALIZED) return -EIO;
/* * TDX module mandates APICv, which requires an in-kernel local APIC. * Disallow an in-kernel I/O APIC, because level-triggered interrupts * and thus the I/O APIC as a whole can't be faithfully emulated in KVM.
*/ if (!irqchip_split(vcpu->kvm)) return -EINVAL;
vmx_vcpu_pi_load(vcpu, cpu); if (vcpu->cpu == cpu || !is_hkid_assigned(to_kvm_tdx(vcpu->kvm))) return;
tdx_flush_vp_on_cpu(vcpu);
KVM_BUG_ON(cpu != raw_smp_processor_id(), vcpu->kvm);
local_irq_disable(); /* * Pairs with the smp_wmb() in tdx_disassociate_vp() to ensure * vcpu->cpu is read before tdx->cpu_list.
*/
smp_rmb();
bool tdx_interrupt_allowed(struct kvm_vcpu *vcpu)
{ /* * KVM can't get the interrupt status of TDX guest and it assumes * interrupt is always allowed unless TDX guest calls TDVMCALL with HLT, * which passes the interrupt blocked flag.
*/ return vmx_get_exit_reason(vcpu).basic != EXIT_REASON_HLT ||
!to_tdx(vcpu)->vp_enter_args.r12;
}
/* * Only check RVI pending for HALTED case with IRQ enabled. * For non-HLT cases, KVM doesn't care about STI/SS shadows. And if the * interrupt was pending before TD exit, then it _must_ be blocked, * otherwise the interrupt would have been serviced at the instruction * boundary.
*/ if (vmx_get_exit_reason(vcpu).basic != EXIT_REASON_HLT ||
to_tdx(vcpu)->vp_enter_args.r12) returnfalse;
/* * Compared to vmx_prepare_switch_to_guest(), there is not much to do * as SEAMCALL/SEAMRET calls take care of most of save and restore.
*/ void tdx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
{ struct vcpu_vt *vt = to_vt(vcpu);
if (vt->guest_state_loaded) return;
if (likely(is_64bit_mm(current->mm)))
vt->msr_host_kernel_gs_base = current->thread.gsbase; else
vt->msr_host_kernel_gs_base = read_msr(MSR_KERNEL_GS_BASE);
/* * It is not possible to reclaim pages while hkid is assigned. It might * be assigned if: * 1. the TD VM is being destroyed but freeing hkid failed, in which * case the pages are leaked * 2. TD VCPU creation failed and this on the error path, in which case * there is nothing to do anyway
*/ if (is_hkid_assigned(kvm_tdx)) return;
if (tdx->vp.tdcx_pages) { for (i = 0; i < kvm_tdx->td.tdcx_nr_pages; i++) { if (tdx->vp.tdcx_pages[i])
tdx_reclaim_control_page(tdx->vp.tdcx_pages[i]);
}
kfree(tdx->vp.tdcx_pages);
tdx->vp.tdcx_pages = NULL;
} if (tdx->vp.tdvpr_page) {
tdx_reclaim_control_page(tdx->vp.tdvpr_page);
tdx->vp.tdvpr_page = 0;
tdx->vp.tdvpr_pa = 0;
}
tdx->state = VCPU_TD_STATE_UNINITIALIZED;
}
/*
 * Validate that both the vCPU and the TD are in a runnable state before
 * entering the guest.
 *
 * Return: 1 if entry may proceed, -EINVAL otherwise.
 */
int tdx_vcpu_pre_run(struct kvm_vcpu *vcpu)
{
	if (unlikely(to_tdx(vcpu)->state != VCPU_TD_STATE_INITIALIZED))
		return -EINVAL;

	if (unlikely(to_kvm_tdx(vcpu->kvm)->state != TD_STATE_RUNNABLE))
		return -EINVAL;

	return 1;
}
static __always_inline u32 tdcall_to_vmx_exit_reason(struct kvm_vcpu *vcpu)
{ switch (tdvmcall_leaf(vcpu)) { case EXIT_REASON_CPUID: case EXIT_REASON_HLT: case EXIT_REASON_IO_INSTRUCTION: case EXIT_REASON_MSR_READ: case EXIT_REASON_MSR_WRITE: return tdvmcall_leaf(vcpu); case EXIT_REASON_EPT_VIOLATION: return EXIT_REASON_EPT_MISCONFIG; default: break;
}
switch (tdx->vp_enter_ret & TDX_SEAMCALL_STATUS_MASK) { case TDX_SUCCESS: case TDX_NON_RECOVERABLE_VCPU: case TDX_NON_RECOVERABLE_TD: case TDX_NON_RECOVERABLE_TD_NON_ACCESSIBLE: case TDX_NON_RECOVERABLE_TD_WRONG_APIC_MODE: break; default: return -1u;
}
exit_reason = tdx->vp_enter_ret;
switch (exit_reason) { case EXIT_REASON_TDCALL: if (tdvmcall_exit_type(vcpu)) return EXIT_REASON_VMCALL;
return tdcall_to_vmx_exit_reason(vcpu); case EXIT_REASON_EPT_MISCONFIG: /* * Defer KVM_BUG_ON() until tdx_handle_exit() because this is in * non-instrumentable code with interrupts disabled.
*/ return -1u; default: break;
}
/* * TDX_OPERAND_BUSY could be returned for SEPT due to 0-step mitigation * or for TD EPOCH due to contention with TDH.MEM.TRACK on TDH.VP.ENTER. * * When KVM requests KVM_REQ_OUTSIDE_GUEST_MODE, which has both * KVM_REQUEST_WAIT and KVM_REQUEST_NO_ACTION set, it requires target * vCPUs leaving fastpath so that interrupt can be enabled to ensure the * IPIs can be delivered. Return EXIT_FASTPATH_EXIT_HANDLED instead of * EXIT_FASTPATH_REENTER_GUEST to exit fastpath, otherwise, the * requester may be blocked endlessly.
*/ if (unlikely(tdx_operand_busy(vp_enter_ret))) return EXIT_FASTPATH_EXIT_HANDLED;
/* * All TDX hosts support PKRU; but even if they didn't, * vcpu->arch.host_pkru would be 0 and the wrpkru would be * skipped.
*/ if (vcpu->arch.host_pkru != 0)
wrpkru(vcpu->arch.host_pkru);
if (kvm_host.xcr0 != (kvm_tdx->xfam & kvm_caps.supported_xcr0))
xsetbv(XCR_XFEATURE_ENABLED_MASK, kvm_host.xcr0);
/* * Likewise, even if a TDX hosts didn't support XSS both arms of * the comparison would be 0 and the wrmsrl would be skipped.
*/ if (kvm_host.xss != (kvm_tdx->xfam & kvm_caps.supported_xss))
wrmsrl(MSR_IA32_XSS, kvm_host.xss);
}
/* * WARN if KVM wants to force an immediate exit, as the TDX module does * not guarantee entry into the guest, i.e. it's possible for KVM to * _think_ it completed entry to the guest and forced an immediate exit * without actually having done so. Luckily, KVM never needs to force * an immediate exit for TDX (KVM can't do direct event injection, so * just WARN and continue on.
*/
WARN_ON_ONCE(run_flags);
/* * Wait until retry of SEPT-zap-related SEAMCALL completes before * allowing vCPU entry to avoid contention with tdh_vp_enter() and * TDCALLs.
*/ if (unlikely(READ_ONCE(to_kvm_tdx(vcpu->kvm)->wait_for_sept_zap))) return EXIT_FASTPATH_EXIT_HANDLED;
if (unlikely(tdx->vp_enter_ret == EXIT_REASON_EPT_MISCONFIG)) return EXIT_FASTPATH_NONE;
if (unlikely((tdx->vp_enter_ret & TDX_SW_ERROR) == TDX_SW_ERROR)) return EXIT_FASTPATH_NONE;
if (unlikely(vmx_get_exit_reason(vcpu).basic == EXIT_REASON_MCE_DURING_VMENTRY))
kvm_machine_check();
trace_kvm_exit(vcpu, KVM_ISA_VMX);
if (unlikely(tdx_failed_vmentry(vcpu))) return EXIT_FASTPATH_NONE;
return tdx_exit_handlers_fastpath(vcpu);
}
/* Inject an NMI into a TDX vCPU by setting PEND_NMI in the TD management
 * structure; the TDX module delivers it to the guest.
 */
void tdx_inject_nmi(struct kvm_vcpu *vcpu)
{
	++vcpu->stat.nmi_injections;
	td_management_write8(to_tdx(vcpu), TD_VCPU_PEND_NMI, 1);

	/*
	 * From KVM's perspective, NMI injection is completed right after
	 * writing to PEND_NMI.  KVM doesn't care whether an NMI is injected
	 * by the TDX module or not.
	 */
	vcpu->arch.nmi_injected = false;

	/*
	 * TDX doesn't support KVM to request NMI window exit.  If there is
	 * still a pending vNMI, KVM is not able to inject it along with the
	 * one pending in the TDX module in a back-to-back way.  Since the
	 * previous vNMI is still pending in the TDX module, i.e. it has not
	 * been delivered to the TDX guest yet, it's OK to collapse the
	 * pending vNMI into the previous one.  The guest is expected to
	 * handle all the NMI sources when handling the first vNMI.
	 */
	vcpu->arch.nmi_pending = 0;
}
/* * Machine checks are handled by handle_exception_irqoff(), or by * tdx_handle_exit() with TDX_NON_RECOVERABLE set if a #MC occurs on * VM-Entry. NMIs are handled by tdx_vcpu_enter_exit().
*/ if (is_nmi(intr_info) || is_machine_check(intr_info)) return 1;
/* * Split into chunks and check interrupt pending between chunks. This allows * for timely injection of interrupts to prevent issues with guest lockup * detection.
*/ #define TDX_MAP_GPA_MAX_LEN (2 * 1024 * 1024) staticvoid __tdx_map_gpa(struct vcpu_tdx *tdx);
tdx->map_gpa_next += TDX_MAP_GPA_MAX_LEN; if (tdx->map_gpa_next >= tdx->map_gpa_end) return 1;
/* * Stop processing the remaining part if there is a pending interrupt, * which could be qualified to deliver. Skip checking pending RVI for * TDVMCALL_MAP_GPA, see comments in tdx_protected_apic_has_interrupt().
*/ if (kvm_vcpu_has_events(vcpu)) {
tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_RETRY);
tdx->vp_enter_args.r11 = tdx->map_gpa_next; return 1;
}
if (size > TDX_MAP_GPA_MAX_LEN)
size = TDX_MAP_GPA_MAX_LEN;
tdx->vcpu.run->exit_reason = KVM_EXIT_HYPERCALL;
tdx->vcpu.run->hypercall.nr = KVM_HC_MAP_GPA_RANGE; /* * In principle this should have been -KVM_ENOSYS, but userspace (QEMU <=9.2) * assumed that vcpu->run->hypercall.ret is never changed by KVM and thus that * it was always zero on KVM_EXIT_HYPERCALL. Since KVM is now overwriting * vcpu->run->hypercall.ret, ensuring that it is zero to not break QEMU.
*/
tdx->vcpu.run->hypercall.ret = 0;
tdx->vcpu.run->hypercall.args[0] = gpa & ~gfn_to_gpa(kvm_gfn_direct_bits(tdx->vcpu.kvm));
tdx->vcpu.run->hypercall.args[1] = size / PAGE_SIZE;
tdx->vcpu.run->hypercall.args[2] = vt_is_tdx_private_gpa(tdx->vcpu.kvm, gpa) ?
KVM_MAP_GPA_RANGE_ENCRYPTED :
KVM_MAP_GPA_RANGE_DECRYPTED;
tdx->vcpu.run->hypercall.flags = KVM_EXIT_HYPERCALL_LONG_MODE;
/* * Converting TDVMCALL_MAP_GPA to KVM_HC_MAP_GPA_RANGE requires * userspace to enable KVM_CAP_EXIT_HYPERCALL with KVM_HC_MAP_GPA_RANGE * bit set. This is a base call so it should always be supported, but * KVM has no way to ensure that userspace implements the GHCI correctly. * So if KVM_HC_MAP_GPA_RANGE does not cause a VMEXIT, return an error * to the guest.
*/ if (!user_exit_on_hypercall(vcpu->kvm, KVM_HC_MAP_GPA_RANGE)) {
ret = TDVMCALL_STATUS_SUBFUNC_UNSUPPORTED; goto error;
}
/* * TDG.VP.VMCALL<MMIO> allows only shared GPA, it makes no sense to * do MMIO emulation for private GPA.
*/ if (vt_is_tdx_private_gpa(vcpu->kvm, gpa) ||
vt_is_tdx_private_gpa(vcpu->kvm, gpa + size - 1)) goto error;
/* * For now, there is no TDVMCALL beyond GHCI base API supported by KVM * directly without the support from userspace, just set the value * returned from userspace.
*/
tdx->vp_enter_args.r11 = vcpu->run->tdx.get_tdvmcall_info.r11;
tdx->vp_enter_args.r12 = vcpu->run->tdx.get_tdvmcall_info.r12;
tdx->vp_enter_args.r13 = vcpu->run->tdx.get_tdvmcall_info.r13;
tdx->vp_enter_args.r14 = vcpu->run->tdx.get_tdvmcall_info.r14;
/* The gpa of buffer must have shared bit set. */ if (vt_is_tdx_private_gpa(vcpu->kvm, gpa)) {
tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND); return 1;
}
/*
 * KVM_TDX_INIT_MEM_REGION calls kvm_gmem_populate() to map guest pages; the
 * callback tdx_gmem_post_populate() then maps pages into private memory
 * through the SEAMCALL TDH.MEM.PAGE.ADD().  The SEAMCALL also requires the
 * private EPT structures for the page to have been built before, which is
 * done via kvm_tdp_map_page().  nr_premapped counts the number of pages that
 * were added to the EPT structures but not added with TDH.MEM.PAGE.ADD().
 * The counter has to be zero on KVM_TDX_FINALIZE_VM, to ensure that there
 * are no half-initialized shared EPT pages.
 */
static int tdx_mem_page_record_premap_cnt(struct kvm *kvm, gfn_t gfn,
					  enum pg_level level, kvm_pfn_t pfn)
{
	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);

	if (KVM_BUG_ON(kvm->arch.pre_fault_allowed, kvm))
		return -EINVAL;

	/* nr_premapped will be decreased when tdh_mem_page_add() is called. */
	atomic64_inc(&kvm_tdx->nr_premapped);
	return 0;
}
/* TODO: handle large pages. */ if (KVM_BUG_ON(level != PG_LEVEL_4K, kvm)) return -EINVAL;
/* * Because guest_memfd doesn't support page migration with * a_ops->migrate_folio (yet), no callback is triggered for KVM on page * migration. Until guest_memfd supports page migration, prevent page * migration. * TODO: Once guest_memfd introduces callback on page migration, * implement it and remove get_page/put_page().
*/
get_page(page);
/* * Read 'pre_fault_allowed' before 'kvm_tdx->state'; see matching * barrier in tdx_td_finalize().
*/
smp_rmb(); if (likely(kvm_tdx->state == TD_STATE_RUNNABLE)) return tdx_mem_page_aug(kvm, gfn, level, page);
/* TODO: handle large pages. */ if (KVM_BUG_ON(level != PG_LEVEL_4K, kvm)) return -EINVAL;
if (KVM_BUG_ON(!is_hkid_assigned(kvm_tdx), kvm)) return -EINVAL;
/* * When zapping private page, write lock is held. So no race condition * with other vcpu sept operation. * Race with TDH.VP.ENTER due to (0-step mitigation) and Guest TDCALLs.
*/
err = tdh_mem_page_remove(&kvm_tdx->td, gpa, tdx_level, &entry,
&level_state);
if (unlikely(tdx_operand_busy(err))) { /* * The second retry is expected to succeed after kicking off all * other vCPUs and prevent them from invoking TDH.VP.ENTER.
*/
tdx_no_vcpus_enter_start(kvm);
err = tdh_mem_page_remove(&kvm_tdx->td, gpa, tdx_level, &entry,
&level_state);
tdx_no_vcpus_enter_stop(kvm);
}
/* * Check if the error returned from a SEPT zap SEAMCALL is due to that a page is * mapped by KVM_TDX_INIT_MEM_REGION without tdh_mem_page_add() being called * successfully. * * Since tdh_mem_sept_add() must have been invoked successfully before a * non-leaf entry present in the mirrored page table, the SEPT ZAP related * SEAMCALLs should not encounter err TDX_EPT_WALK_FAILED. They should instead * find TDX_EPT_ENTRY_STATE_INCORRECT due to an empty leaf entry found in the * SEPT. * * Further check if the returned entry from SEPT walking is with RWX permissions * to filter out anything unexpected. * * Note: @level is pg_level, not the tdx_level. The tdx_level extracted from * level_state returned from a SEAMCALL error is the same as that passed into * the SEAMCALL.
*/ staticint tdx_is_sept_zap_err_due_to_premap(struct kvm_tdx *kvm_tdx, u64 err,
u64 entry, int level)
{ if (!err || kvm_tdx->state == TD_STATE_RUNNABLE) returnfalse;
if (err != (TDX_EPT_ENTRY_STATE_INCORRECT | TDX_OPERAND_ID_RCX)) returnfalse;
if ((is_last_spte(entry, level) && (entry & VMX_EPT_RWX_MASK))) returnfalse;
/* * Ensure shared and private EPTs to be flushed on all vCPUs. * tdh_mem_track() is the only caller that increases TD epoch. An increase in * the TD epoch (e.g., to value "N + 1") is successful only if no vCPUs are * running in guest mode with the value "N - 1". * * A successful execution of tdh_mem_track() ensures that vCPUs can only run in * guest mode with TD epoch value "N" if no TD exit occurs after the TD epoch * being increased to "N + 1". * * Kicking off all vCPUs after that further results in no vCPUs can run in guest * mode with TD epoch value "N", which unblocks the next tdh_mem_track() (e.g. * to increase TD epoch to "N + 2"). * * TDX module will flush EPT on the next TD enter and make vCPUs to run in * guest mode with TD epoch value "N + 1". * * kvm_make_all_cpus_request() guarantees all vCPUs are out of guest mode by * waiting empty IPI handler ack_kick(). * * No action is required to the vCPUs being kicked off since the kicking off * occurs certainly after TD epoch increment and before the next * tdh_mem_track().
*/ staticvoid tdx_track(struct kvm *kvm)
{ struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
u64 err;
/* If TD isn't finalized, it's before any vcpu running. */ if (unlikely(kvm_tdx->state != TD_STATE_RUNNABLE)) return;
lockdep_assert_held_write(&kvm->mmu_lock);
err = tdh_mem_track(&kvm_tdx->td); if (unlikely(tdx_operand_busy(err))) { /* After no vCPUs enter, the second retry is expected to succeed */
tdx_no_vcpus_enter_start(kvm);
err = tdh_mem_track(&kvm_tdx->td);
tdx_no_vcpus_enter_stop(kvm);
}
if (KVM_BUG_ON(err, kvm))
pr_tdx_error(TDH_MEM_TRACK, err);
/* * free_external_spt() is only called after hkid is freed when TD is * tearing down. * KVM doesn't (yet) zap page table pages in mirror page table while * TD is active, though guest pages mapped in mirror page table could be * zapped during TD is active, e.g. for shared <-> private conversion * and slot move/deletion.
*/ if (KVM_BUG_ON(is_hkid_assigned(kvm_tdx), kvm)) return -EINVAL;
/* * The HKID assigned to this TD was already freed and cache was * already flushed. We don't have to flush again.
*/ return tdx_reclaim_page(virt_to_page(private_spt));
}
/* * HKID is released after all private pages have been removed, and set * before any might be populated. Warn if zapping is attempted when * there can't be anything populated in the private EPT.
*/ if (KVM_BUG_ON(!is_hkid_assigned(to_kvm_tdx(kvm)), kvm)) return -EINVAL;
ret = tdx_sept_zap_private_spte(kvm, gfn, level, page); if (ret <= 0) return ret;
/* * TDX requires TLB tracking before dropping private page. Do * it here, although it is also done later.
*/
tdx_track(kvm);
if (vt_is_tdx_private_gpa(vcpu->kvm, gpa)) { if (tdx_is_sept_violation_unexpected_pending(vcpu)) {
pr_warn("Guest access before accepting 0x%llx on vCPU %d\n",
gpa, vcpu->vcpu_id);
kvm_vm_dead(vcpu->kvm); return -EIO;
} /* * Always treat SEPT violations as write faults. Ignore the * EXIT_QUALIFICATION reported by TDX-SEAM for SEPT violations. * TD private pages are always RWX in the SEPT tables, * i.e. they're always mapped writable. Just as importantly, * treating SEPT violations as write faults is necessary to * avoid COW allocations, which will cause TDAUGPAGE failures * due to aliasing a single HPA to multiple GPAs.
*/
exit_qual = EPT_VIOLATION_ACC_WRITE;
/* Only private GPA triggers zero-step mitigation */
local_retry = true;
} else {
exit_qual = vmx_get_exit_qual(vcpu); /* * EPT violation due to instruction fetch should never be * triggered from shared memory in TDX guest. If such EPT * violation occurs, treat it as broken hardware.
*/ if (KVM_BUG_ON(exit_qual & EPT_VIOLATION_ACC_INSTR, vcpu->kvm)) return -EIO;
}
trace_kvm_page_fault(vcpu, gpa, exit_qual);
/* * To minimize TDH.VP.ENTER invocations, retry locally for private GPA * mapping in TDX. * * KVM may return RET_PF_RETRY for private GPA due to * - contentions when atomically updating SPTEs of the mirror page table * - in-progress GFN invalidation or memslot removal. * - TDX_OPERAND_BUSY error from TDH.MEM.PAGE.AUG or TDH.MEM.SEPT.ADD, * caused by contentions with TDH.VP.ENTER (with zero-step mitigation) * or certain TDCALLs. * * If TDH.VP.ENTER is invoked more times than the threshold set by the * TDX module before KVM resolves the private GPA mapping, the TDX * module will activate zero-step mitigation during TDH.VP.ENTER. This * process acquires an SEPT tree lock in the TDX module, leading to * further contentions with TDH.MEM.PAGE.AUG or TDH.MEM.SEPT.ADD * operations on other vCPUs. * * Breaking out of local retries for kvm_vcpu_has_events() is for * interrupt injection. kvm_vcpu_has_events() should not see pending * events for TDX. Since KVM can't determine if IRQs (or NMIs) are * blocked by TDs, false positives are inevitable i.e., KVM may re-enter * the guest even if the IRQ/NMI can't be delivered. * * Note: even without breaking out of local retries, zero-step * mitigation may still occur due to * - invoking of TDH.VP.ENTER after KVM_EXIT_MEMORY_FAULT, * - a single RIP causing EPT violations for more GFNs than the * threshold count. * This is safe, as triggering zero-step mitigation only introduces * contentions to page installation SEAMCALLs on other vCPUs, which will * handle retries locally in their EPT violation handlers.
*/ while (1) {
ret = __vmx_handle_ept_violation(vcpu, gpa, exit_qual);
if (ret != RET_PF_RETRY || !local_retry) break;
if (kvm_vcpu_has_events(vcpu) || signal_pending(current)) break;
if (kvm_check_request(KVM_REQ_VM_DEAD, vcpu)) {
ret = -EIO; break;
}
cond_resched();
} return ret;
}
int tdx_complete_emulated_msr(struct kvm_vcpu *vcpu, int err)
{ if (err) {
tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND); return 1;
}
if (vmx_get_exit_reason(vcpu).basic == EXIT_REASON_MSR_READ)
tdvmcall_set_return_val(vcpu, kvm_read_edx_eax(vcpu));
if (unlikely(vp_enter_ret == EXIT_REASON_EPT_MISCONFIG)) {
KVM_BUG_ON(1, vcpu->kvm); return -EIO;
}
/* * Handle TDX SW errors, including TDX_SEAMCALL_UD, TDX_SEAMCALL_GP and * TDX_SEAMCALL_VMFAILINVALID.
*/ if (unlikely((vp_enter_ret & TDX_SW_ERROR) == TDX_SW_ERROR)) {
KVM_BUG_ON(!kvm_rebooting, vcpu->kvm); goto unhandled_exit;
}
if (unlikely(tdx_failed_vmentry(vcpu))) { /* * If the guest state is protected, that means off-TD debug is * not enabled, TDX_NON_RECOVERABLE must be set.
*/
WARN_ON_ONCE(vcpu->arch.guest_state_protected &&
!(vp_enter_ret & TDX_NON_RECOVERABLE));
vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
vcpu->run->fail_entry.hardware_entry_failure_reason = exit_reason.full;
vcpu->run->fail_entry.cpu = vcpu->arch.last_vmentry_cpu; return 0;
}
switch (exit_reason.basic) { case EXIT_REASON_TRIPLE_FAULT:
vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN;
vcpu->mmio_needed = 0; return 0; case EXIT_REASON_EXCEPTION_NMI: return tdx_handle_exception_nmi(vcpu); case EXIT_REASON_EXTERNAL_INTERRUPT:
++vcpu->stat.irq_exits; return 1; case EXIT_REASON_CPUID: return tdx_emulate_cpuid(vcpu); case EXIT_REASON_HLT: return kvm_emulate_halt_noskip(vcpu); case EXIT_REASON_TDCALL: return handle_tdvmcall(vcpu); case EXIT_REASON_VMCALL: return tdx_emulate_vmcall(vcpu); case EXIT_REASON_IO_INSTRUCTION: return tdx_emulate_io(vcpu); case EXIT_REASON_MSR_READ:
kvm_rcx_write(vcpu, tdx->vp_enter_args.r12); return kvm_emulate_rdmsr(vcpu); case EXIT_REASON_MSR_WRITE:
kvm_rcx_write(vcpu, tdx->vp_enter_args.r12);
kvm_rax_write(vcpu, tdx->vp_enter_args.r13 & -1u);
kvm_rdx_write(vcpu, tdx->vp_enter_args.r13 >> 32); return kvm_emulate_wrmsr(vcpu); case EXIT_REASON_EPT_MISCONFIG: return tdx_emulate_mmio(vcpu); case EXIT_REASON_EPT_VIOLATION: return tdx_handle_ept_violation(vcpu); case EXIT_REASON_OTHER_SMI: /* * Unlike VMX, SMI in SEAM non-root mode (i.e. when * TD guest vCPU is running) will cause VM exit to TDX module, * then SEAMRET to KVM. Once it exits to KVM, SMI is delivered * and handled by kernel handler right away. * * The Other SMI exit can also be caused by the SEAM non-root * machine check delivered via Machine Check System Management * Interrupt (MSMI), but it has already been handled by the * kernel machine check handler, i.e., the memory page has been * marked as poisoned and it won't be freed to the free list * when the TDX guest is terminated (the TDX module marks the * guest as dead and prevent it from further running when * machine check happens in SEAM non-root). * * - A MSMI will not reach here, it's handled as non_recoverable * case above. * - If it's not an MSMI, no need to do anything here.
*/ return 1; default: break;
}
/*
 * Report whether KVM emulates accesses to @index for TDX guests.  MSRs that
 * are virtualized by the CPU/TDX module (notably most x2APIC registers) are
 * not emulated, as KVM doesn't have access to the virtual APIC page.
 */
bool tdx_has_emulated_msr(u32 index)
{
	switch (index) {
	case MSR_IA32_UCODE_REV:
	case MSR_IA32_ARCH_CAPABILITIES:
	case MSR_IA32_POWER_CTL:
	case MSR_IA32_CR_PAT:
	case MSR_MTRRcap:
	case MTRRphysBase_MSR(0) ... MSR_MTRRfix4K_F8000:
	case MSR_MTRRdefType:
	case MSR_IA32_TSC_DEADLINE:
	case MSR_IA32_MISC_ENABLE:
	case MSR_PLATFORM_INFO:
	case MSR_MISC_FEATURES_ENABLES:
	case MSR_IA32_APICBASE:
	case MSR_EFER:
	case MSR_IA32_FEAT_CTL:
	case MSR_IA32_MCG_CAP:
	case MSR_IA32_MCG_STATUS:
	case MSR_IA32_MCG_CTL:
	case MSR_IA32_MCG_EXT_CTL:
	case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1:
	case MSR_IA32_MC0_CTL2 ... MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) - 1:
		/* MSR_IA32_MCx_{CTL, STATUS, ADDR, MISC, CTL2} */
	case MSR_KVM_POLL_CONTROL:
		return true;
	case APIC_BASE_MSR ... APIC_BASE_MSR + 0xff:
		/*
		 * x2APIC registers that are virtualized by the CPU can't be
		 * emulated, KVM doesn't have access to the virtual APIC page.
		 */
		switch (index) {
		case X2APIC_MSR(APIC_TASKPRI):
		case X2APIC_MSR(APIC_PROCPRI):
		case X2APIC_MSR(APIC_EOI):
		case X2APIC_MSR(APIC_ISR) ... X2APIC_MSR(APIC_ISR + APIC_ISR_NR):
		case X2APIC_MSR(APIC_TMR) ... X2APIC_MSR(APIC_TMR + APIC_ISR_NR):
		case X2APIC_MSR(APIC_IRR) ... X2APIC_MSR(APIC_IRR + APIC_ISR_NR):
			return false;
		default:
			return true;
		}
	default:
		return false;
	}
}
staticbool tdx_is_read_only_msr(u32 index)
{ return index == MSR_IA32_APICBASE || index == MSR_EFER ||
index == MSR_IA32_FEAT_CTL;
}
/*
 * Read an emulated MSR on behalf of a TDX vCPU.
 *
 * Return: 0 on success with msr->data filled in, 1 if the MSR is not
 * emulated or the access is not permitted.
 */
int tdx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
{
	switch (msr->index) {
	case MSR_IA32_FEAT_CTL:
		/*
		 * MCE and MCA are advertised via cpuid.  Guest kernel could
		 * check if LMCE is enabled or not.
		 */
		msr->data = FEAT_CTL_LOCKED;
		if (vcpu->arch.mcg_cap & MCG_LMCE_P)
			msr->data |= FEAT_CTL_LMCE_ENABLED;
		return 0;
	case MSR_IA32_MCG_EXT_CTL:
		if (!msr->host_initiated && !(vcpu->arch.mcg_cap & MCG_LMCE_P))
			return 1;
		msr->data = vcpu->arch.mcg_ext_ctl;
		return 0;
	default:
		if (!tdx_has_emulated_msr(msr->index))
			return 1;

		return kvm_get_msr_common(vcpu, msr);
	}
}
int tdx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
{ switch (msr->index) { case MSR_IA32_MCG_EXT_CTL: if ((!msr->host_initiated && !(vcpu->arch.mcg_cap & MCG_LMCE_P)) ||
(msr->data & ~MCG_EXT_CTL_LMCE_EN)) return 1;
vcpu->arch.mcg_ext_ctl = msr->data; return 0; default: if (tdx_is_read_only_msr(msr->index)) return 1;
/* * KVM reports guest physical address in CPUID.0x800000008.EAX[23:16], which is * similar to TDX's GPAW. Use this field as the interface for userspace to
--> --------------------
--> maximum size reached
--> --------------------
Messung V0.5
¤ Dauer der Verarbeitung: 0.12 Sekunden
(vorverarbeitet)
¤
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereit gestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.