// SPDX-License-Identifier: GPL-2.0-only
/*
 * Kernel-based Virtual Machine driver for Linux
 *
 * AMD SVM-SEV support
 *
 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
*/ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
/* Called with the sev_bitmap_lock held, or on shutdown */ staticint sev_flush_asids(unsignedint min_asid, unsignedint max_asid)
{ int ret, error = 0; unsignedint asid;
/* Check if there are any ASIDs to reclaim before performing a flush */
asid = find_next_bit(sev_reclaim_asid_bitmap, nr_asids, min_asid); if (asid > max_asid) return -EBUSY;
/* * DEACTIVATE will clear the WBINVD indicator causing DF_FLUSH to fail, * so it must be guarded.
*/
down_write(&sev_deactivate_lock);
/* SNP firmware requires use of WBINVD for ASID recycling. */
wbinvd_on_all_cpus();
if (sev_snp_enabled)
ret = sev_do_cmd(SEV_CMD_SNP_DF_FLUSH, NULL, &error); else
ret = sev_guest_df_flush(&error);
/* Must be called with the sev_bitmap_lock held */ staticbool __sev_recycle_asids(unsignedint min_asid, unsignedint max_asid)
{ if (sev_flush_asids(min_asid, max_asid)) returnfalse;
/* The flush process will flush all reclaimable SEV and SEV-ES ASIDs */
bitmap_xor(sev_asid_bitmap, sev_asid_bitmap, sev_reclaim_asid_bitmap,
nr_asids);
bitmap_zero(sev_reclaim_asid_bitmap, nr_asids);
staticint sev_asid_new(struct kvm_sev_info *sev)
{ /* * SEV-enabled guests must use asid from min_sev_asid to max_sev_asid. * SEV-ES-enabled guest can use from 1 to min_sev_asid - 1. * Note: min ASID can end up larger than the max if basic SEV support is * effectively disabled by disallowing use of ASIDs for SEV guests.
*/ unsignedint min_asid = sev->es_active ? 1 : min_sev_asid; unsignedint max_asid = sev->es_active ? min_sev_asid - 1 : max_sev_asid; unsignedint asid; bool retry = true; int ret;
if (min_asid > max_asid) return -ENOTTY;
WARN_ON(sev->misc_cg);
sev->misc_cg = get_current_misc_cg();
ret = sev_misc_cg_try_charge(sev); if (ret) {
put_misc_cg(sev->misc_cg);
sev->misc_cg = NULL; return ret;
}
/*
 * Transition a page to hypervisor-owned/shared state in the RMP table.  This
 * should not fail under normal conditions, but leak the page should that
 * happen since it will no longer be usable by the host due to RMP protections.
 *
 * @kvm:   VM on whose behalf the transition is performed; passed to
 *         KVM_BUG_ON() if the transition fails.
 * @pfn:   first PFN of the range to make shared.
 * @level: mapping level of the range.
 *
 * Returns 0 on success, -EIO if rmp_make_shared() reported a failure.
 */
static int kvm_rmp_make_shared(struct kvm *kvm, u64 pfn, enum pg_level level)
{
	if (KVM_BUG_ON(rmp_make_shared(pfn, level), kvm)) {
		/* Convert the byte size of the mapping level into a page count. */
		snp_leak_pages(pfn, page_level_size(level) >> PAGE_SHIFT);
		return -EIO;
	}

	return 0;
}
/* * Certain page-states, such as Pre-Guest and Firmware pages (as documented * in Chapter 5 of the SEV-SNP Firmware ABI under "Page States") cannot be * directly transitioned back to normal/hypervisor-owned state via RMPUPDATE * unless they are reclaimed first. * * Until they are reclaimed and subsequently transitioned via RMPUPDATE, they * might not be usable by the host due to being set as immutable or still * being associated with a guest ASID. * * Bug the VM and leak the page if reclaim fails, or if the RMP entry can't be * converted back to shared, as the page is no longer usable due to RMP * protections, and it's infeasible for the guest to continue on.
*/ staticint snp_page_reclaim(struct kvm *kvm, u64 pfn)
{ struct sev_data_snp_page_reclaim data = {0}; int fw_err, rc;
/* Guard DEACTIVATE against WBINVD/DF_FLUSH used in ASID recycling */
down_read(&sev_deactivate_lock);
sev_guest_deactivate(&deactivate, NULL);
up_read(&sev_deactivate_lock);
sev_decommission(handle);
}
/* * This sets up bounce buffers/firmware pages to handle SNP Guest Request * messages (e.g. attestation requests). See "SNP Guest Request" in the GHCB * 2.0 specification for more details. * * Technically, when an SNP Guest Request is issued, the guest will provide its * own request/response pages, which could in theory be passed along directly * to firmware rather than using bounce pages. However, these pages would need * special care: * * - Both pages are from shared guest memory, so they need to be protected * from migration/etc. occurring while firmware reads/writes to them. At a * minimum, this requires elevating the ref counts and potentially needing * an explicit pinning of the memory. This places additional restrictions * on what type of memory backends userspace can use for shared guest * memory since there is some reliance on using refcounted pages. * * - The response page needs to be switched to Firmware-owned[1] state * before the firmware can write to it, which can lead to potential * host RMP #PFs if the guest is misbehaved and hands the host a * guest page that KVM might write to for other reasons (e.g. virtio * buffers/etc.). * * Both of these issues can be avoided completely by using separately-allocated * bounce pages for both the request/response pages and passing those to * firmware instead. So that's what is being set up here. * * Guest requests rely on message sequence numbers to ensure requests are * issued to firmware in the order the guest issues them, so concurrent guest * requests generally shouldn't happen. But a misbehaved guest could issue * concurrent guest requests in theory, so a mutex is used to serialize * access to the bounce buffers. * * [1] See the "Page States" section of the SEV-SNP Firmware ABI for more * details on Firmware-owned pages, along with "RMP and VMPL Access Checks" * in the APM for details on the related RMP restrictions.
*/ staticint snp_guest_req_init(struct kvm *kvm)
{ struct kvm_sev_info *sev = to_kvm_sev_info(kvm); struct page *req_page;
req_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO); if (!req_page) return -ENOMEM;
/* * Currently KVM supports the full range of mandatory features defined * by version 2 of the GHCB protocol, so default to that for SEV-ES * guests created via KVM_SEV_INIT2.
*/ if (sev->es_active && !sev->ghcb_version)
sev->ghcb_version = GHCB_VERSION_DEFAULT;
if (vm_type == KVM_X86_SNP_VM)
sev->vmsa_features |= SVM_SEV_FEAT_SNP_ACTIVE;
ret = sev_asid_new(sev); if (ret) goto e_no_asid;
init_args.probe = false;
ret = sev_platform_init(&init_args); if (ret) goto e_free_asid;
if (!zalloc_cpumask_var(&sev->have_run_cpus, GFP_KERNEL_ACCOUNT)) {
ret = -ENOMEM; goto e_free_asid;
}
/* This needs to happen after SEV/SNP firmware initialization. */ if (vm_type == KVM_X86_SNP_VM) {
ret = snp_guest_req_init(kvm); if (ret) goto e_free;
}
/* * KVM_SEV_ES_INIT has been deprecated by KVM_SEV_INIT2, so it will * continue to only ever support the minimal GHCB protocol version.
*/ if (vm_type == KVM_X86_SEV_ES_VM)
data.ghcb_version = GHCB_VERSION_MIN;
for (i = 0; i < npages; i++) {
page_virtual = kmap_local_page(pages[i]);
clflush_cache_range(page_virtual, PAGE_SIZE);
kunmap_local(page_virtual);
cond_resched();
}
}
/*
 * Write back caches on every CPU recorded in the VM's have_run_cpus mask.
 *
 * @kvm: VM whose SEV info holds the mask of CPUs to flush.
 */
static void sev_writeback_caches(struct kvm *kvm)
{
	/*
	 * Ensure that all dirty guest tagged cache entries are written back
	 * before releasing the pages back to the system for use.  CLFLUSH will
	 * not do this without SME_COHERENT, and flushing many cache lines
	 * individually is slower than blasting WBINVD for large VMs, so issue
	 * WBNOINVD (or WBINVD if the "no invalidate" variant is unsupported)
	 * on CPUs that have done VMRUN, i.e. may have dirtied data using the
	 * VM's ASID.
	 *
	 * For simplicity, never remove CPUs from the bitmap.  Ideally, KVM
	 * would clear the mask when flushing caches, but doing so requires
	 * serializing multiple calls and having responding CPUs (to the IPI)
	 * mark themselves as still running if they are running (or about to
	 * run) a vCPU for the VM.
	 *
	 * Note, the caller is responsible for ensuring correctness if the mask
	 * can be modified, e.g. if a CPU could be doing VMRUN.
	 */
	wbnoinvd_on_cpus_mask(to_kvm_sev_info(kvm)->have_run_cpus);
}
/* Lock the user memory. */
inpages = sev_pin_memory(kvm, vaddr, size, &npages, FOLL_WRITE); if (IS_ERR(inpages)) return PTR_ERR(inpages);
/* * Flush (on non-coherent CPUs) before LAUNCH_UPDATE encrypts pages in * place; the cache may contain the data that was written unencrypted.
*/
sev_clflush_pages(inpages, npages);
for (i = 0; vaddr < vaddr_end; vaddr = next_vaddr, i += pages) { int offset, len;
/* * If the user buffer is not page-aligned, calculate the offset * within the page.
*/
offset = vaddr & (PAGE_SIZE - 1);
/* Calculate the number of pages that can be encrypted in one go. */
pages = get_num_contig_pages(i, inpages, npages);
len = min_t(size_t, ((pages * PAGE_SIZE) - offset), size);
data.len = len;
data.address = __sme_page_pa(inpages[i]) + offset;
ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_UPDATE_DATA, &data, &argp->error); if (ret) goto e_unpin;
size -= len;
next_vaddr = vaddr + len;
}
e_unpin: /* content of memory is updated, mark pages dirty */ for (i = 0; i < npages; i++) {
set_page_dirty_lock(inpages[i]);
mark_page_accessed(inpages[i]);
} /* unlock the user pages */
sev_unpin_memory(kvm, inpages, npages); return ret;
}
/* Check some debug related fields before encrypting the VMSA */ if (svm->vcpu.guest_debug || (svm->vmcb->save.dr7 & ~DR7_FIXED_1)) return -EINVAL;
/* * SEV-ES will use a VMSA that is pointed to by the VMCB, not * the traditional VMSA that is part of the VMCB. Copy the * traditional VMSA as it has been built so far (in prep * for LAUNCH_UPDATE_VMSA) to be the initial SEV-ES state.
*/
memcpy(save, &svm->vmcb->save, sizeof(svm->vmcb->save));
for (i = 0; i < 8; i++) { /* * The format of the x87 save area is undocumented and * definitely not what you would expect. It consists of * an 8*8 bytes area with bytes 0-7, and an 8*2 bytes * area with bytes 8-9 of each register.
*/
d = save->fpreg_x87 + i * 8;
s = ((u8 *)xsave->i387.st_space) + i * 16;
memcpy(d, s, 8);
save->fpreg_x87[64 + i * 2] = s[8];
save->fpreg_x87[64 + i * 2 + 1] = s[9];
}
memcpy(save->fpreg_xmm, xsave->i387.xmm_space, 256);
s = get_xsave_addr(xsave, XFEATURE_YMM); if (s)
memcpy(save->fpreg_ymm, s, 256); else
memset(save->fpreg_ymm, 0, 256);
}
pr_debug("Virtual Machine Save Area (VMSA):\n");
print_hex_dump_debug("", DUMP_PREFIX_NONE, 16, 1, save, sizeof(*save), false);
return 0;
}
staticint __sev_launch_update_vmsa(struct kvm *kvm, struct kvm_vcpu *vcpu, int *error)
{ struct sev_data_launch_update_vmsa vmsa; struct vcpu_svm *svm = to_svm(vcpu); int ret;
if (vcpu->guest_debug) {
pr_warn_once("KVM_SET_GUEST_DEBUG for SEV-ES guest is not supported"); return -EINVAL;
}
/* Perform some pre-encryption checks against the VMSA */
ret = sev_es_sync_vmsa(svm); if (ret) return ret;
/* * The LAUNCH_UPDATE_VMSA command will perform in-place encryption of * the VMSA memory content (i.e it will write the same memory region * with the guest's key), so invalidate it first.
*/
clflush_cache_range(svm->sev_es.vmsa, PAGE_SIZE);
/* * SEV-ES guests maintain an encrypted version of their FPU * state which is restored and saved on VMRUN and VMEXIT. * Mark vcpu->arch.guest_fpu->fpstate as scratch so it won't * do xsave/xrstor on it.
*/
fpstate_set_confidential(&vcpu->arch.guest_fpu);
vcpu->arch.guest_state_protected = true;
/* * SEV-ES guest mandates LBR Virtualization to be _always_ ON. Enable it * only after setting guest_state_protected because KVM_SET_MSRS allows * dynamic toggling of LBRV (for performance reason) on write access to * MSR_IA32_DEBUGCTLMSR when guest_state_protected is not set.
*/
svm_enable_lbrv(vcpu); return 0;
}
staticint __sev_dbg_decrypt(struct kvm *kvm, unsignedlong src_paddr, unsignedlong dst_paddr, int sz, int *err)
{ int offset;
/* * It's safe to read more than we are asked; the caller should ensure that * the destination has enough space.
*/
offset = src_paddr & 15;
src_paddr = round_down(src_paddr, 16);
sz = round_up(sz + offset, 16);
/*
 * Decrypt guest memory for a debug request and, when a bounce page was
 * needed, copy the result out to userspace.
 *
 * @kvm:       VM the request is issued for.
 * @paddr:     source physical address to decrypt.
 * @dst_uaddr: userspace destination, written only when a bounce page is used.
 * @dst_paddr: destination physical address, used directly when @paddr,
 *             @dst_paddr and @size are all 16-byte aligned.
 * @size:      number of bytes requested.
 * @err:       out parameter forwarded to __sev_dbg_decrypt().
 *
 * Returns 0 on success, -ENOMEM if the bounce page cannot be allocated,
 * -EFAULT if the copy to userspace fails, or the error returned by
 * __sev_dbg_decrypt().
 */
static int __sev_dbg_decrypt_user(struct kvm *kvm, unsigned long paddr,
				  void __user *dst_uaddr,
				  unsigned long dst_paddr,
				  int size, int *err)
{
	struct page *tpage = NULL;
	int ret, offset;

	/* If the inputs are not 16-byte aligned, use an intermediate buffer. */
	if (!IS_ALIGNED(dst_paddr, 16) ||
	    !IS_ALIGNED(paddr, 16) ||
	    !IS_ALIGNED(size, 16)) {
		tpage = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
		if (!tpage)
			return -ENOMEM;

		dst_paddr = __sme_page_pa(tpage);
	}

	ret = __sev_dbg_decrypt(kvm, paddr, dst_paddr, size, err);
	if (ret)
		goto e_free;

	if (tpage) {
		/* Skip the low-order bytes of the source address within the bounce page. */
		offset = paddr & 15;
		if (copy_to_user(dst_uaddr, page_address(tpage) + offset, size))
			ret = -EFAULT;
	}

e_free:
	if (tpage)
		__free_page(tpage);

	return ret;
}
staticint __sev_dbg_encrypt_user(struct kvm *kvm, unsignedlong paddr, void __user *vaddr, unsignedlong dst_paddr, void __user *dst_vaddr, int size, int *error)
{ struct page *src_tpage = NULL; struct page *dst_tpage = NULL; int ret, len = size;
/* If source buffer is not aligned then use an intermediate buffer */ if (!IS_ALIGNED((unsignedlong)vaddr, 16)) {
src_tpage = alloc_page(GFP_KERNEL_ACCOUNT); if (!src_tpage) return -ENOMEM;
if (copy_from_user(page_address(src_tpage), vaddr, size)) {
__free_page(src_tpage); return -EFAULT;
}
paddr = __sme_page_pa(src_tpage);
}
/* * If destination buffer or length is not aligned then do read-modify-write: * - decrypt destination in an intermediate buffer * - copy the source buffer in an intermediate buffer * - use the intermediate buffer as source buffer
*/ if (!IS_ALIGNED((unsignedlong)dst_vaddr, 16) || !IS_ALIGNED(size, 16)) { int dst_offset;
dst_tpage = alloc_page(GFP_KERNEL_ACCOUNT); if (!dst_tpage) {
ret = -ENOMEM; goto e_free;
}
ret = __sev_dbg_decrypt(kvm, dst_paddr,
__sme_page_pa(dst_tpage), size, error); if (ret) goto e_free;
/* * If source is kernel buffer then use memcpy() otherwise * copy_from_user().
*/
dst_offset = dst_paddr & 15;
if (src_tpage)
memcpy(page_address(dst_tpage) + dst_offset,
page_address(src_tpage), size); else { if (copy_from_user(page_address(dst_tpage) + dst_offset,
vaddr, size)) {
ret = -EFAULT; goto e_free;
}
}
/* * Flush (on non-coherent CPUs) before DBG_{DE,EN}CRYPT read or modify * the pages; flush the destination too so that future accesses do not * see stale data.
*/
sev_clflush_pages(src_p, 1);
sev_clflush_pages(dst_p, 1);
/* * Since user buffer may not be page aligned, calculate the * offset within the page.
*/
s_off = vaddr & ~PAGE_MASK;
d_off = dst_vaddr & ~PAGE_MASK;
len = min_t(size_t, (PAGE_SIZE - s_off), size);
/* * Flush (on non-coherent CPUs) before LAUNCH_SECRET encrypts pages in * place; the cache may contain the data that was written unencrypted.
*/
sev_clflush_pages(pages, n);
/* * The secret must be copied into contiguous memory region, lets verify * that userspace memory pages are contiguous before we issue command.
*/ if (get_num_contig_pages(0, pages, n) != n) {
ret = -EINVAL; goto e_unpin_memory;
}
if (copy_from_user(¶ms, u64_to_user_ptr(argp->data), sizeof(struct kvm_sev_send_start))) return -EFAULT;
/* if session_len is zero, userspace wants to query the session length */ if (!params.session_len) return __sev_send_start_query_session_length(kvm, argp,
¶ms);
/* some sanity checks */ if (!params.pdh_cert_uaddr || !params.pdh_cert_len ||
!params.session_uaddr || params.session_len > SEV_FW_BLOB_MAX_SIZE) return -EINVAL;
/* allocate the memory to hold the session data blob */
session_data = kzalloc(params.session_len, GFP_KERNEL_ACCOUNT); if (!session_data) return -ENOMEM;
/* copy the certificate blobs from userspace */
pdh_cert = psp_copy_user_blob(params.pdh_cert_uaddr,
params.pdh_cert_len); if (IS_ERR(pdh_cert)) {
ret = PTR_ERR(pdh_cert); goto e_free_session;
}
plat_certs = psp_copy_user_blob(params.plat_certs_uaddr,
params.plat_certs_len); if (IS_ERR(plat_certs)) {
ret = PTR_ERR(plat_certs); goto e_free_pdh;
}
amd_certs = psp_copy_user_blob(params.amd_certs_uaddr,
params.amd_certs_len); if (IS_ERR(amd_certs)) {
ret = PTR_ERR(amd_certs); goto e_free_plat_cert;
}
if (copy_from_user(¶ms, u64_to_user_ptr(argp->data), sizeof(struct kvm_sev_send_update_data))) return -EFAULT;
/* userspace wants to query either header or trans length */ if (!params.trans_len || !params.hdr_len) return __sev_send_update_data_query_lengths(kvm, argp, ¶ms);
if (!params.trans_uaddr || !params.guest_uaddr ||
!params.guest_len || !params.hdr_uaddr) return -EINVAL;
/* Check if we are crossing the page boundary */
offset = params.guest_uaddr & (PAGE_SIZE - 1); if (params.guest_len > PAGE_SIZE || (params.guest_len + offset) > PAGE_SIZE) return -EINVAL;
/* The SEND_UPDATE_DATA command requires C-bit to be always set. */
data.guest_address = (page_to_pfn(guest_page[0]) << PAGE_SHIFT) + offset;
data.guest_address |= sev_me_mask;
data.guest_len = params.guest_len;
data.handle = to_kvm_sev_info(kvm)->handle;
ret = sev_issue_cmd(kvm, SEV_CMD_SEND_UPDATE_DATA, &data, &argp->error);
if (ret) goto e_free_trans_data;
/* copy transport buffer to user space */ if (copy_to_user(u64_to_user_ptr(params.trans_uaddr),
trans_data, params.trans_len)) {
ret = -EFAULT; goto e_free_trans_data;
}
/* Copy packet header to userspace. */ if (copy_to_user(u64_to_user_ptr(params.hdr_uaddr), hdr,
params.hdr_len))
ret = -EFAULT;
/* * Flush (on non-coherent CPUs) before RECEIVE_UPDATE_DATA, the PSP * encrypts the written data with the guest's key, and the cache may * contain dirty, unencrypted data.
*/
sev_clflush_pages(guest_page, n);
/* The RECEIVE_UPDATE_DATA command requires C-bit to be always set. */
data.guest_address = (page_to_pfn(guest_page[0]) << PAGE_SHIFT) + offset;
data.guest_address |= sev_me_mask;
data.guest_len = params.guest_len;
data.handle = to_kvm_sev_info(kvm)->handle;
ret = sev_issue_cmd(kvm, SEV_CMD_RECEIVE_UPDATE_DATA, &data,
&argp->error);
staticbool is_cmd_allowed_from_mirror(u32 cmd_id)
{ /* * Allow mirrors VM to call KVM_SEV_LAUNCH_UPDATE_VMSA to enable SEV-ES * active mirror VMs. Also allow the debugging and status commands.
*/ if (cmd_id == KVM_SEV_LAUNCH_UPDATE_VMSA ||
cmd_id == KVM_SEV_GUEST_STATUS || cmd_id == KVM_SEV_DBG_DECRYPT ||
cmd_id == KVM_SEV_DBG_ENCRYPT) returntrue;
/* * Bail if these VMs are already involved in a migration to avoid * deadlock between two VMs trying to migrate to/from each other.
*/ if (atomic_cmpxchg_acquire(&dst_sev->migration_in_progress, 0, 1)) return -EBUSY;
if (atomic_cmpxchg_acquire(&src_sev->migration_in_progress, 0, 1)) goto release_dst;
r = -EINTR; if (mutex_lock_killable(&dst_kvm->lock)) goto release_src; if (mutex_lock_killable_nested(&src_kvm->lock, SINGLE_DEPTH_NESTING)) goto unlock_dst; return 0;
/* * If this VM has mirrors, "transfer" each mirror's refcount of the * source to the destination (this KVM). The caller holds a reference * to the source, so there's no danger of use-after-free.
*/
list_cut_before(&dst->mirror_vms, &src->mirror_vms, &src->mirror_vms);
list_for_each_entry(mirror, &dst->mirror_vms, mirror_entry) {
kvm_get_kvm(dst_kvm);
kvm_put_kvm(src_kvm);
mirror->enc_context_owner = dst_kvm;
}
/* * If this VM is a mirror, remove the old mirror from the owners list * and add the new mirror to the list.
*/ if (is_mirroring_enc_context(dst_kvm)) { struct kvm_sev_info *owner_sev_info = to_kvm_sev_info(dst->enc_context_owner);
/* * Note, the source is not required to have the same number of * vCPUs as the destination when migrating a vanilla SEV VM.
*/
src_vcpu = kvm_get_vcpu(src_kvm, i);
src_svm = to_svm(src_vcpu);
/* * Transfer VMSA and GHCB state to the destination. Nullify and * clear source fields as appropriate, the state now belongs to * the destination.
*/
memcpy(&dst_svm->sev_es, &src_svm->sev_es, sizeof(src_svm->sev_es));
dst_svm->vmcb->control.ghcb_gpa = src_svm->vmcb->control.ghcb_gpa;
dst_svm->vmcb->control.vmsa_pa = src_svm->vmcb->control.vmsa_pa;
dst_vcpu->arch.guest_state_protected = true;
source_kvm = fd_file(f)->private_data;
ret = sev_lock_two_vms(kvm, source_kvm); if (ret) return ret;
if (kvm->arch.vm_type != source_kvm->arch.vm_type ||
sev_guest(kvm) || !sev_guest(source_kvm)) {
ret = -EINVAL; goto out_unlock;
}
src_sev = to_kvm_sev_info(source_kvm);
dst_sev->misc_cg = get_current_misc_cg();
cg_cleanup_sev = dst_sev; if (dst_sev->misc_cg != src_sev->misc_cg) {
ret = sev_misc_cg_try_charge(dst_sev); if (ret) goto out_dst_cgroup;
charged = true;
}
ret = kvm_lock_all_vcpus(kvm); if (ret) goto out_dst_cgroup;
ret = kvm_lock_all_vcpus(source_kvm); if (ret) goto out_dst_vcpu;
ret = sev_check_source_vcpus(kvm, source_kvm); if (ret) goto out_source_vcpu;
/* * Allocate a new have_run_cpus for the destination, i.e. don't copy * the set of CPUs from the source. If a CPU was used to run a vCPU in * the source VM but is never used for the destination VM, then the CPU * can only have cached memory that was accessible to the source VM.
*/ if (!zalloc_cpumask_var(&dst_sev->have_run_cpus, GFP_KERNEL_ACCOUNT)) {
ret = -ENOMEM; goto out_source_vcpu;
}
sev_migrate_from(kvm, source_kvm);
kvm_vm_dead(source_kvm);
cg_cleanup_sev = src_sev;
ret = 0;
out_source_vcpu:
kvm_unlock_all_vcpus(source_kvm);
out_dst_vcpu:
kvm_unlock_all_vcpus(kvm);
out_dst_cgroup: /* Operates on the source on success, on the destination on failure. */ if (charged)
sev_misc_cg_uncharge(cg_cleanup_sev);
put_misc_cg(cg_cleanup_sev->misc_cg);
cg_cleanup_sev->misc_cg = NULL;
out_unlock:
sev_unlock_two_vms(kvm, source_kvm); return ret;
}
/*
 * Report the value of a device attribute in the SEV attribute group.
 *
 * @group: attribute group; only KVM_X86_GRP_SEV is recognized.
 * @attr:  attribute within the group; only KVM_X86_SEV_VMSA_FEATURES is
 *         recognized.
 * @val:   receives sev_supported_vmsa_features on success.
 *
 * Returns 0 on success, -ENXIO for any other group or attribute (in which
 * case @val is left untouched).
 */
int sev_dev_get_attr(u32 group, u64 attr, u64 *val)
{
	if (group != KVM_X86_GRP_SEV)
		return -ENXIO;

	if (attr != KVM_X86_SEV_VMSA_FEATURES)
		return -ENXIO;

	*val = sev_supported_vmsa_features;
	return 0;
}
/* * The guest context contains all the information, keys and metadata * associated with the guest that the firmware tracks to implement SEV * and SNP features. The firmware stores the guest context in a * hypervisor-provided page via the SNP_GCTX_CREATE command.
*/ staticvoid *snp_context_create(struct kvm *kvm, struct kvm_sev_cmd *argp)
{ struct sev_data_snp_addr data = {}; void *context; int rc;
/* Allocate memory for context page */
context = snp_alloc_firmware_page(GFP_KERNEL_ACCOUNT); if (!context) return NULL;
ret = __sev_issue_cmd(sev_populate_args->sev_fd, SEV_CMD_SNP_LAUNCH_UPDATE,
&fw_args, &sev_populate_args->fw_error); if (ret) goto fw_err;
}
return 0;
fw_err: /* * If the firmware command failed handle the reclaim and cleanup of that * PFN specially vs. prior pages which can be cleaned up below without * needing to reclaim in advance. * * Additionally, when invalid CPUID function entries are detected, * firmware writes the expected values into the page and leaves it * unencrypted so it can be used for debugging and error-reporting. * * Copy this page back into the source buffer so userspace can use this * information to provide information on which CPUID leaves/fields * failed CPUID validation.
*/ if (!snp_page_reclaim(kvm, pfn + i) &&
sev_populate_args->type == KVM_SEV_SNP_PAGE_TYPE_CPUID &&
sev_populate_args->fw_error == SEV_RET_INVALID_PARAM) { void *vaddr = kmap_local_pfn(pfn + i);
if (copy_to_user(src + i * PAGE_SIZE, vaddr, PAGE_SIZE))
pr_debug("Failed to write CPUID page back to userspace\n");
kunmap_local(vaddr);
}
/* pfn + i is hypervisor-owned now, so skip below cleanup for it. */
n_private--;
err:
pr_debug("%s: exiting with error ret %d (fw_error %d), restoring %d gmem PFNs to shared.\n",
__func__, ret, sev_populate_args->fw_error, n_private); for (i = 0; i < n_private; i++)
kvm_rmp_make_shared(kvm, pfn + i, PG_LEVEL_4K);
/* * For each GFN that's being prepared as part of the initial guest * state, the following pre-conditions are verified: * * 1) The backing memslot is a valid private memslot. * 2) The GFN has been set to private via KVM_SET_MEMORY_ATTRIBUTES * beforehand. * 3) The PFN of the guest_memfd has not already been set to private * in the RMP table. * * The KVM MMU relies on kvm->mmu_invalidate_seq to retry nested page * faults if there's a race between a fault and an attribute update via * KVM_SET_MEMORY_ATTRIBUTES, and a similar approach could be utilized * here. However, kvm->slots_lock guards against both this as well as * concurrent memslot updates occurring while these checks are being * performed, so use that here to make it easier to reason about the * initial expected state and better guard against unexpected * situations.
*/
mutex_lock(&kvm->slots_lock);
memslot = gfn_to_memslot(kvm, params.gfn_start); if (!kvm_slot_can_be_private(memslot)) {
ret = -EINVAL; goto out;
}
/* Transition the VMSA page to a firmware state. */
ret = rmp_make_private(pfn, INITIAL_VMSA_GPA, PG_LEVEL_4K, sev->asid, true); if (ret) return ret;
/* Issue the SNP command to encrypt the VMSA */
data.address = __sme_pa(svm->sev_es.vmsa);
ret = __sev_issue_cmd(argp->sev_fd, SEV_CMD_SNP_LAUNCH_UPDATE,
&data, &argp->error); if (ret) {
snp_page_reclaim(kvm, pfn);
return ret;
}
svm->vcpu.arch.guest_state_protected = true; /* * SEV-ES (and thus SNP) guest mandates LBR Virtualization to * be _always_ ON. Enable it only after setting * guest_state_protected because KVM_SET_MSRS allows dynamic * toggling of LBRV (for performance reason) on write access to * MSR_IA32_DEBUGCTLMSR when guest_state_protected is not set.
*/
svm_enable_lbrv(vcpu);
}
/* * Now that there will be no more SNP_LAUNCH_UPDATE ioctls, private pages * can be given to the guest simply by marking the RMP entry as private. * This can happen on first access and also with KVM_PRE_FAULT_MEMORY.
*/ if (!ret)
kvm->arch.pre_fault_allowed = true;
kfree(id_auth);
e_free_id_block:
kfree(id_block);
e_free:
kfree(data);
return ret;
}
/*
 * Dispatch an SEV command (struct kvm_sev_cmd) supplied by userspace to the
 * matching handler, under kvm->lock.
 *
 * @kvm:  VM the command targets.
 * @argp: userspace pointer to a struct kvm_sev_cmd; NULL is accepted and
 *        treated as a no-op.
 *
 * Returns:
 *   -ENOTTY  SEV is disabled, or KVM_SEV_ES_INIT was requested while SEV-ES
 *            is disabled;
 *   0        @argp is NULL;
 *   -EFAULT  the command could not be copied from or back to userspace;
 *   -EINVAL  the command is unknown, or the VM mirrors another VM's
 *            encryption context and the command is not allowed from a mirror;
 *   -EPERM   the VM is an SNP guest and the command precedes
 *            KVM_SEV_SNP_LAUNCH_START;
 *   otherwise the handler's return value.
 */
int sev_mem_enc_ioctl(struct kvm *kvm, void __user *argp)
{
	struct kvm_sev_cmd cmd;
	int ret;

	if (!sev_enabled)
		return -ENOTTY;

	if (!argp)
		return 0;

	if (copy_from_user(&cmd, argp, sizeof(cmd)))
		return -EFAULT;

	mutex_lock(&kvm->lock);

	/* Only the enc_context_owner handles some memory enc operations. */
	if (is_mirroring_enc_context(kvm) &&
	    !is_cmd_allowed_from_mirror(cmd.id)) {
		ret = -EINVAL;
		goto unlock;
	}

	/*
	 * Once KVM_SEV_INIT2 initializes a KVM instance as an SNP guest, only
	 * allow the use of SNP-specific commands.
	 */
	if (sev_snp_guest(kvm) && cmd.id < KVM_SEV_SNP_LAUNCH_START) {
		ret = -EPERM;
		goto unlock;
	}

	switch (cmd.id) {
	case KVM_SEV_ES_INIT:
		if (!sev_es_enabled) {
			ret = -ENOTTY;
			goto unlock;
		}
		fallthrough;
	case KVM_SEV_INIT:
		ret = sev_guest_init(kvm, &cmd);
		break;
	case KVM_SEV_INIT2:
		ret = sev_guest_init2(kvm, &cmd);
		break;
	case KVM_SEV_LAUNCH_START:
		ret = sev_launch_start(kvm, &cmd);
		break;
	case KVM_SEV_LAUNCH_UPDATE_DATA:
		ret = sev_launch_update_data(kvm, &cmd);
		break;
	case KVM_SEV_LAUNCH_UPDATE_VMSA:
		ret = sev_launch_update_vmsa(kvm, &cmd);
		break;
	case KVM_SEV_LAUNCH_MEASURE:
		ret = sev_launch_measure(kvm, &cmd);
		break;
	case KVM_SEV_LAUNCH_FINISH:
		ret = sev_launch_finish(kvm, &cmd);
		break;
	case KVM_SEV_GUEST_STATUS:
		ret = sev_guest_status(kvm, &cmd);
		break;
	case KVM_SEV_DBG_DECRYPT:
		ret = sev_dbg_crypt(kvm, &cmd, true);
		break;
	case KVM_SEV_DBG_ENCRYPT:
		ret = sev_dbg_crypt(kvm, &cmd, false);
		break;
	case KVM_SEV_LAUNCH_SECRET:
		ret = sev_launch_secret(kvm, &cmd);
		break;
	case KVM_SEV_GET_ATTESTATION_REPORT:
		ret = sev_get_attestation_report(kvm, &cmd);
		break;
	case KVM_SEV_SEND_START:
		ret = sev_send_start(kvm, &cmd);
		break;
	case KVM_SEV_SEND_UPDATE_DATA:
		ret = sev_send_update_data(kvm, &cmd);
		break;
	case KVM_SEV_SEND_FINISH:
		ret = sev_send_finish(kvm, &cmd);
		break;
	case KVM_SEV_SEND_CANCEL:
		ret = sev_send_cancel(kvm, &cmd);
		break;
	case KVM_SEV_RECEIVE_START:
		ret = sev_receive_start(kvm, &cmd);
		break;
	case KVM_SEV_RECEIVE_UPDATE_DATA:
		ret = sev_receive_update_data(kvm, &cmd);
		break;
	case KVM_SEV_RECEIVE_FINISH:
		ret = sev_receive_finish(kvm, &cmd);
		break;
	case KVM_SEV_SNP_LAUNCH_START:
		ret = snp_launch_start(kvm, &cmd);
		break;
	case KVM_SEV_SNP_LAUNCH_UPDATE:
		ret = snp_launch_update(kvm, &cmd);
		break;
	case KVM_SEV_SNP_LAUNCH_FINISH:
		ret = snp_launch_finish(kvm, &cmd);
		break;
	default:
		/* Unknown command: nothing is copied back to userspace. */
		ret = -EINVAL;
		goto unlock;
	}

	/* Hand the (possibly updated) command structure back to the caller. */
	if (copy_to_user(argp, &cmd, sizeof(cmd)))
		ret = -EFAULT;

unlock:
	mutex_unlock(&kvm->lock);
	return ret;
}
int sev_mem_enc_register_region(struct kvm *kvm, struct kvm_enc_region *range)
{ struct kvm_sev_info *sev = to_kvm_sev_info(kvm); struct enc_region *region; int ret = 0;
if (!sev_guest(kvm)) return -ENOTTY;
/* If kvm is mirroring encryption context it isn't responsible for it */ if (is_mirroring_enc_context(kvm)) return -EINVAL;
if (range->addr > ULONG_MAX || range->size > ULONG_MAX) return -EINVAL;
region = kzalloc(sizeof(*region), GFP_KERNEL_ACCOUNT); if (!region) return -ENOMEM;
mutex_lock(&kvm->lock);
region->pages = sev_pin_memory(kvm, range->addr, range->size, ®ion->npages,
FOLL_WRITE | FOLL_LONGTERM); if (IS_ERR(region->pages)) {
ret = PTR_ERR(region->pages);
mutex_unlock(&kvm->lock); goto e_free;
}
/* * The guest may change the memory encryption attribute from C=0 -> C=1 * or vice versa for this memory range. Lets make sure caches are * flushed to ensure that guest data gets written into memory with * correct C-bit. Note, this must be done before dropping kvm->lock, * as region and its array of pages can be freed by a different task * once kvm->lock is released.
*/
sev_clflush_pages(region->pages, region->npages);
/*
 * Unregister a previously registered encrypted memory region.
 *
 * @kvm:   VM that owns the region.
 * @range: userspace description of the region to look up and remove.
 *
 * Returns 0 on success, -EINVAL if the VM mirrors another VM's encryption
 * context or no matching region is found, -ENOTTY if the VM is not an SEV
 * guest.
 */
int sev_mem_enc_unregister_region(struct kvm *kvm,
				  struct kvm_enc_region *range)
{
	struct enc_region *found;
	int rc = 0;

	/* A VM mirroring an encryption context isn't responsible for it. */
	if (is_mirroring_enc_context(kvm))
		return -EINVAL;

	mutex_lock(&kvm->lock);

	if (!sev_guest(kvm)) {
		rc = -ENOTTY;
		goto unlock;
	}

	found = find_enc_region(kvm, range);
	if (!found) {
		rc = -EINVAL;
		goto unlock;
	}

	/* Write back caches before the region is torn down. */
	sev_writeback_caches(kvm);
	__unregister_enc_region_locked(kvm, found);

unlock:
	mutex_unlock(&kvm->lock);
	return rc;
}
int sev_vm_copy_enc_context_from(struct kvm *kvm, unsignedint source_fd)
{ CLASS(fd, f)(source_fd); struct kvm *source_kvm; struct kvm_sev_info *source_sev, *mirror_sev; int ret;
if (fd_empty(f)) return -EBADF;
if (!file_is_kvm(fd_file(f))) return -EBADF;
source_kvm = fd_file(f)->private_data;
ret = sev_lock_two_vms(kvm, source_kvm); if (ret) return ret;
/* * Mirrors of mirrors should work, but let's not get silly. Also * disallow out-of-band SEV/SEV-ES init if the target is already an * SEV guest, or if vCPUs have been created. KVM relies on vCPUs being * created after SEV/SEV-ES initialization, e.g. to init intercepts.
*/ if (sev_guest(kvm) || !sev_guest(source_kvm) ||
is_mirroring_enc_context(source_kvm) || kvm->created_vcpus) {
ret = -EINVAL; goto e_unlock;
}
mirror_sev = to_kvm_sev_info(kvm); if (!zalloc_cpumask_var(&mirror_sev->have_run_cpus, GFP_KERNEL_ACCOUNT)) {
ret = -ENOMEM; goto e_unlock;
}
/* * The mirror kvm holds an enc_context_owner ref so its asid can't * disappear until we're done with it
*/
source_sev = to_kvm_sev_info(source_kvm);
kvm_get_kvm(source_kvm);
list_add_tail(&mirror_sev->mirror_entry, &source_sev->mirror_vms);
/* Set enc_context_owner and copy its encryption context over */
mirror_sev->enc_context_owner = source_kvm;
mirror_sev->active = true;
mirror_sev->asid = source_sev->asid;
mirror_sev->fd = source_sev->fd;
mirror_sev->es_active = source_sev->es_active;
mirror_sev->need_init = false;
mirror_sev->handle = source_sev->handle;
INIT_LIST_HEAD(&mirror_sev->regions_list);
INIT_LIST_HEAD(&mirror_sev->mirror_vms);
ret = 0;
/* * Do not copy ap_jump_table, since the mirror does not share the same * KVM contexts as the original, and they may have different * memory-views.
*/
staticint snp_decommission_context(struct kvm *kvm)
{ struct kvm_sev_info *sev = to_kvm_sev_info(kvm); struct sev_data_snp_addr data = {}; int ret;
/* If context is not created then do nothing */ if (!sev->snp_context) return 0;
/* Do the decommission, which will unbind the ASID from the SNP context */
data.address = __sme_pa(sev->snp_context);
down_write(&sev_deactivate_lock);
ret = sev_do_cmd(SEV_CMD_SNP_DECOMMISSION, &data, NULL);
up_write(&sev_deactivate_lock);
if (WARN_ONCE(ret, "Failed to release guest context, ret %d", ret)) return ret;
/* * If this is a mirror VM, remove it from the owner's list of mirrors * and skip ASID cleanup (the ASID is tied to the lifetime of the owner). * Note, mirror VMs don't support registering encrypted regions.
*/ if (is_mirroring_enc_context(kvm)) { struct kvm *owner_kvm = sev->enc_context_owner;
/* * if userspace was terminated before unregistering the memory regions * then lets unpin all the registered memory.
*/ if (!list_empty(head)) {
list_for_each_safe(pos, q, head) {
__unregister_enc_region_locked(kvm,
list_entry(pos, struct enc_region, list));
cond_resched();
}
}
if (sev_snp_guest(kvm)) {
snp_guest_req_cleanup(kvm);
/* * Decommission handles unbinding of the ASID. If it fails for * some unexpected reason, just leak the ASID.
*/ if (snp_decommission_context(kvm)) return;
} else {
sev_unbind_asid(kvm, sev->handle);
}
sev_asid_free(sev);
}
/*
 * Advertise every SEV flavor that is currently enabled: set the matching KVM
 * CPU capability bit and add the corresponding VM type to the set of VM types
 * KVM reports as supported.  The three flavors are checked independently.
 */
void __init sev_set_cpu_caps(void)
{
	/* VM types to add; folded into kvm_caps in a single step below. */
	unsigned long vm_types = 0;

	if (sev_enabled) {
		vm_types |= BIT(KVM_X86_SEV_VM);
		kvm_cpu_cap_set(X86_FEATURE_SEV);
	}

	if (sev_es_enabled) {
		vm_types |= BIT(KVM_X86_SEV_ES_VM);
		kvm_cpu_cap_set(X86_FEATURE_SEV_ES);
	}

	if (sev_snp_enabled) {
		vm_types |= BIT(KVM_X86_SNP_VM);
		kvm_cpu_cap_set(X86_FEATURE_SEV_SNP);
	}

	kvm_caps.supported_vm_types |= vm_types;
}
if (!sev_enabled || !npt_enabled || !nrips) goto out;
/* * SEV must obviously be supported in hardware. Sanity check that the * CPU supports decode assists, which is mandatory for SEV guests to * support instruction emulation. Ditto for flushing by ASID, as SEV * guests are bound to a single ASID, i.e. KVM can't rotate to a new * ASID to effect a TLB flush.
*/ if (!boot_cpu_has(X86_FEATURE_SEV) ||
WARN_ON_ONCE(!boot_cpu_has(X86_FEATURE_DECODEASSISTS)) ||
WARN_ON_ONCE(!boot_cpu_has(X86_FEATURE_FLUSHBYASID))) goto out;
/* * The kernel's initcall infrastructure lacks the ability to express * dependencies between initcalls, whereas the modules infrastructure * automatically handles dependencies via symbol loading. Ensure the * PSP SEV driver is initialized before proceeding if KVM is built-in, * as the dependency isn't handled by the initcall infrastructure.
*/ if (IS_BUILTIN(CONFIG_KVM_AMD) && sev_module_init()) goto out;
/* Retrieve SEV CPUID information */
cpuid(0x8000001f, &eax, &ebx, &ecx, &edx);
/* Set encryption bit location for SEV-ES guests */
sev_enc_bit = ebx & 0x3f;
/* Maximum number of encrypted guests supported simultaneously */
max_sev_asid = ecx; if (!max_sev_asid) goto out;
/* Minimum ASID value that should be used for SEV guest */
min_sev_asid = edx;
sev_me_mask = 1UL << (ebx & 0x3f);
/* * Initialize SEV ASID bitmaps. Allocate space for ASID 0 in the bitmap, * even though it's never used, so that the bitmap is indexed by the * actual ASID.
*/
nr_asids = max_sev_asid + 1;
sev_asid_bitmap = bitmap_zalloc(nr_asids, GFP_KERNEL); if (!sev_asid_bitmap) goto out;
/* SEV-ES support requested? */ if (!sev_es_enabled) goto out;
/* * SEV-ES requires MMIO caching as KVM doesn't have access to the guest * instruction stream, i.e. can't emulate in response to a #NPF and * instead relies on #NPF(RSVD) being reflected into the guest as #VC * (the guest can then do a #VMGEXIT to request MMIO emulation).
*/ if (!enable_mmio_caching) goto out;
/* Does the CPU support SEV-ES? */ if (!boot_cpu_has(X86_FEATURE_SEV_ES)) goto out;
if (!lbrv) {
WARN_ONCE(!boot_cpu_has(X86_FEATURE_LBRV), "LBRV must be present for SEV-ES support"); goto out;
}
/* Has the system been allocated ASIDs for SEV-ES? */ if (min_sev_asid == 1) goto out;
/*
 * Allocate @sd's zero-initialized table of VMCB pointers, one slot per ASID
 * (nr_asids entries, indexed by the actual ASID).
 *
 * Nothing is allocated when SEV is disabled.
 *
 * Return: 0 on success (or when SEV is disabled), -ENOMEM if the table
 * could not be allocated.
 */
int sev_cpu_init(struct svm_cpu_data *sd)
{
	if (sev_enabled) {
		sd->sev_vmcbs = kcalloc(nr_asids, sizeof(void *), GFP_KERNEL);
		if (!sd->sev_vmcbs)
			return -ENOMEM;
	}

	return 0;
}
/* * Pages used by hardware to hold guest encrypted state must be flushed before * returning them to the system.
*/ staticvoid sev_flush_encrypted_page(struct kvm_vcpu *vcpu, void *va)
{ unsignedint asid = sev_get_asid(vcpu->kvm);
/* * Note! The address must be a kernel address, as regular page walk * checks are performed by VM_PAGE_FLUSH, i.e. operating on a user * address is non-deterministic and unsafe. This function deliberately * takes a pointer to deter passing in a user address.
*/ unsignedlong addr = (unsignedlong)va;
/* * If CPU enforced cache coherency for encrypted mappings of the * same physical page is supported, use CLFLUSHOPT instead. NOTE: cache * flush is still needed in order to work properly with DMA devices.
*/ if (boot_cpu_has(X86_FEATURE_SME_COHERENT)) {
clflush_cache_range(va, PAGE_SIZE); return;
}
/* * VM Page Flush takes a host virtual address and a guest ASID. Fall * back to full writeback of caches if this faults so as not to make * any problems worse by leaving stale encrypted data in the cache.
*/ if (WARN_ON_ONCE(wrmsrq_safe(MSR_AMD64_VM_PAGE_FLUSH, addr | asid))) goto do_sev_writeback_caches;
void sev_guest_memory_reclaimed(struct kvm *kvm)
{ /* * With SNP+gmem, private/encrypted memory is unreachable via the * hva-based mmu notifiers, i.e. these events are explicitly scoped to * shared pages, where there's no need to flush caches.
*/ if (!sev_guest(kvm) || sev_snp_guest(kvm)) return;
/* * If it's an SNP guest, then the VMSA was marked in the RMP table as * a guest-owned page. Transition the page to hypervisor state before * releasing it back to the system.
*/ if (sev_snp_guest(vcpu->kvm)) {
u64 pfn = __pa(svm->sev_es.vmsa) >> PAGE_SHIFT;
if (kvm_rmp_make_shared(vcpu->kvm, pfn, PG_LEVEL_4K)) goto skip_vmsa_free;
}
if (vcpu->arch.guest_state_protected)
sev_flush_encrypted_page(vcpu, svm->sev_es.vmsa);
__free_page(virt_to_page(svm->sev_es.vmsa));
skip_vmsa_free: if (svm->sev_es.ghcb_sa_free)
kvfree(svm->sev_es.ghcb_sa);
}
/* Re-use the dump_invalid_vmcb module parameter */ if (!dump_invalid_vmcb) {
pr_warn_ratelimited("set kvm_amd.dump_invalid_vmcb=1 to dump internal KVM state.\n"); return;
}
nbits = sizeof(svm->sev_es.valid_bitmap) * 8;
/* * Print KVM's snapshot of the GHCB values that were (unsuccessfully) * used to handle the exit. If the guest has since modified the GHCB * itself, dumping the raw GHCB won't help debug why KVM was unable to * handle the VMGEXIT that KVM observed.
*/
pr_err("GHCB (GPA=%016llx) snapshot:\n", svm->vmcb->control.ghcb_gpa);
pr_err("%-20s%016llx is_valid: %u\n", "sw_exit_code",
kvm_ghcb_get_sw_exit_code(control), kvm_ghcb_sw_exit_code_is_valid(svm));
pr_err("%-20s%016llx is_valid: %u\n", "sw_exit_info_1",
control->exit_info_1, kvm_ghcb_sw_exit_info_1_is_valid(svm));
pr_err("%-20s%016llx is_valid: %u\n", "sw_exit_info_2",
control->exit_info_2, kvm_ghcb_sw_exit_info_2_is_valid(svm));
pr_err("%-20s%016llx is_valid: %u\n", "sw_scratch",
svm->sev_es.sw_scratch, kvm_ghcb_sw_scratch_is_valid(svm));
pr_err("%-20s%*pb\n", "valid_bitmap", nbits, svm->sev_es.valid_bitmap);
}
/* * The GHCB protocol so far allows for the following data * to be returned: * GPRs RAX, RBX, RCX, RDX * * Copy their values, even if they may not have been written during the * VM-Exit. It's the guest's responsibility to not consume random data.
*/
ghcb_set_rax(ghcb, vcpu->arch.regs[VCPU_REGS_RAX]);
ghcb_set_rbx(ghcb, vcpu->arch.regs[VCPU_REGS_RBX]);
ghcb_set_rcx(ghcb, vcpu->arch.regs[VCPU_REGS_RCX]);
ghcb_set_rdx(ghcb, vcpu->arch.regs[VCPU_REGS_RDX]);
}
/* * The GHCB protocol so far allows for the following data * to be supplied: * GPRs RAX, RBX, RCX, RDX * XCR0 * CPL * * VMMCALL allows the guest to provide extra registers. KVM also * expects RSI for hypercalls, so include that, too. * * Copy their values to the appropriate location if supplied.
*/
memset(vcpu->arch.regs, 0, sizeof(vcpu->arch.regs));
/* * Retrieve the exit code now even though it may not be marked valid * as it could help with debugging.
*/
exit_code = kvm_ghcb_get_sw_exit_code(control);
/* Only GHCB Usage code 0 is supported */ if (svm->sev_es.ghcb->ghcb_usage) {
reason = GHCB_ERR_INVALID_USAGE; goto vmgexit_err;
}
reason = GHCB_ERR_MISSING_INPUT;
if (!kvm_ghcb_sw_exit_code_is_valid(svm) ||
!kvm_ghcb_sw_exit_info_1_is_valid(svm) ||
!kvm_ghcb_sw_exit_info_2_is_valid(svm)) goto vmgexit_err;
switch (exit_code) { case SVM_EXIT_READ_DR7: break; case SVM_EXIT_WRITE_DR7: if (!kvm_ghcb_rax_is_valid(svm)) goto vmgexit_err; break; case SVM_EXIT_RDTSC: break; case SVM_EXIT_RDPMC: if (!kvm_ghcb_rcx_is_valid(svm)) goto vmgexit_err; break; case SVM_EXIT_CPUID: if (!kvm_ghcb_rax_is_valid(svm) ||
!kvm_ghcb_rcx_is_valid(svm)) goto vmgexit_err; if (vcpu->arch.regs[VCPU_REGS_RAX] == 0xd) if (!kvm_ghcb_xcr0_is_valid(svm)) goto vmgexit_err; break; case SVM_EXIT_INVD: break; case SVM_EXIT_IOIO: if (control->exit_info_1 & SVM_IOIO_STR_MASK) { if (!kvm_ghcb_sw_scratch_is_valid(svm)) goto vmgexit_err;
} else { if (!(control->exit_info_1 & SVM_IOIO_TYPE_MASK)) if (!kvm_ghcb_rax_is_valid(svm)) goto vmgexit_err;
} break; case SVM_EXIT_MSR: if (!kvm_ghcb_rcx_is_valid(svm)) goto vmgexit_err; if (control->exit_info_1) { if (!kvm_ghcb_rax_is_valid(svm) ||
!kvm_ghcb_rdx_is_valid(svm)) goto vmgexit_err;
} break; case SVM_EXIT_VMMCALL: if (!kvm_ghcb_rax_is_valid(svm) ||
!kvm_ghcb_cpl_is_valid(svm)) goto vmgexit_err; break; case SVM_EXIT_RDTSCP: break; case SVM_EXIT_WBINVD: break; case SVM_EXIT_MONITOR: if (!kvm_ghcb_rax_is_valid(svm) ||
!kvm_ghcb_rcx_is_valid(svm) ||
!kvm_ghcb_rdx_is_valid(svm)) goto vmgexit_err; break; case SVM_EXIT_MWAIT: if (!kvm_ghcb_rax_is_valid(svm) ||
!kvm_ghcb_rcx_is_valid(svm)) goto vmgexit_err; break; case SVM_VMGEXIT_MMIO_READ: case SVM_VMGEXIT_MMIO_WRITE: if (!kvm_ghcb_sw_scratch_is_valid(svm)) goto vmgexit_err; break; case SVM_VMGEXIT_AP_CREATION: if (!sev_snp_guest(vcpu->kvm)) goto vmgexit_err; if (lower_32_bits(control->exit_info_1) != SVM_VMGEXIT_AP_DESTROY) if (!kvm_ghcb_rax_is_valid(svm)) goto vmgexit_err; break; case SVM_VMGEXIT_NMI_COMPLETE: case SVM_VMGEXIT_AP_HLT_LOOP: case SVM_VMGEXIT_AP_JUMP_TABLE: case SVM_VMGEXIT_UNSUPPORTED_EVENT: case SVM_VMGEXIT_HV_FEATURES: case SVM_VMGEXIT_TERM_REQUEST: break; case SVM_VMGEXIT_PSC: if (!sev_snp_guest(vcpu->kvm) || !kvm_ghcb_sw_scratch_is_valid(svm)) goto vmgexit_err; break; case SVM_VMGEXIT_GUEST_REQUEST: case SVM_VMGEXIT_EXT_GUEST_REQUEST: if (!sev_snp_guest(vcpu->kvm) ||
!PAGE_ALIGNED(control->exit_info_1) ||
!PAGE_ALIGNED(control->exit_info_2) ||
control->exit_info_1 == control->exit_info_2) goto vmgexit_err; break; default:
reason = GHCB_ERR_INVALID_EVENT; goto vmgexit_err;
}
return 0;
vmgexit_err: if (reason == GHCB_ERR_INVALID_USAGE) {
vcpu_unimpl(vcpu, "vmgexit: ghcb usage %#x is not valid\n",
svm->sev_es.ghcb->ghcb_usage);
} elseif (reason == GHCB_ERR_INVALID_EVENT) {
vcpu_unimpl(vcpu, "vmgexit: exit code %#llx is not valid\n",
exit_code);
} else {
vcpu_unimpl(vcpu, "vmgexit: exit code %#llx input is not valid\n",
exit_code);
dump_ghcb(svm);
}
svm_vmgexit_bad_input(svm, reason);
/* Resume the guest to "return" the error code. */ return 1;
}
void sev_es_unmap_ghcb(struct vcpu_svm *svm)
{ /* Clear any indication that the vCPU is in a type of AP Reset Hold */
svm->sev_es.ap_reset_hold_type = AP_RESET_HOLD_NONE;
if (!svm->sev_es.ghcb) return;
if (svm->sev_es.ghcb_sa_free) { /* * The scratch area lives outside the GHCB, so there is a * buffer that, depending on the operation performed, may * need to be synced, then freed.
*/ if (svm->sev_es.ghcb_sa_sync) {
kvm_write_guest(svm->vcpu.kvm,
svm->sev_es.sw_scratch,
svm->sev_es.ghcb_sa,
svm->sev_es.ghcb_sa_len);
svm->sev_es.ghcb_sa_sync = false;
}
/* * Reject KVM_RUN if userspace attempts to run the vCPU with an invalid * VMSA, e.g. if userspace forces the vCPU to be RUNNABLE after an SNP * AP Destroy event.
*/ if (sev_es_guest(kvm) && !VALID_PAGE(svm->vmcb->control.vmsa_pa)) return -EINVAL;
/* * To optimize cache flushes when memory is reclaimed from an SEV VM, * track physical CPUs that enter the guest for SEV VMs and thus can * have encrypted, dirty data in the cache, and flush caches only for * CPUs that have entered the guest.
*/ if (!cpumask_test_cpu(cpu, to_kvm_sev_info(kvm)->have_run_cpus))
cpumask_set_cpu(cpu, to_kvm_sev_info(kvm)->have_run_cpus);
/* Assign the asid allocated with this SEV guest */
svm->asid = asid;
/* * Flush guest TLB: * * 1) when different VMCB for the same ASID is to be run on the same host CPU. * 2) or this VMCB was executed on different host CPU in previous VMRUNs.
*/ if (sd->sev_vmcbs[asid] == svm->vmcb &&
svm->vcpu.arch.last_vmentry_cpu == cpu) return 0;
scratch_gpa_beg = svm->sev_es.sw_scratch; if (!scratch_gpa_beg) {
pr_err("vmgexit: scratch gpa not provided\n"); goto e_scratch;
}
scratch_gpa_end = scratch_gpa_beg + len; if (scratch_gpa_end < scratch_gpa_beg) {
pr_err("vmgexit: scratch length (%#llx) not valid for scratch address (%#llx)\n",
len, scratch_gpa_beg); goto e_scratch;
}
if ((scratch_gpa_beg & PAGE_MASK) == control->ghcb_gpa) { /* Scratch area begins within GHCB */
ghcb_scratch_beg = control->ghcb_gpa +
offsetof(struct ghcb, shared_buffer);
ghcb_scratch_end = control->ghcb_gpa +
offsetof(struct ghcb, reserved_0xff0);
/* * If the scratch area begins within the GHCB, it must be * completely contained in the GHCB shared buffer area.
*/ if (scratch_gpa_beg < ghcb_scratch_beg ||
scratch_gpa_end > ghcb_scratch_end) {
pr_err("vmgexit: scratch area is outside of GHCB shared buffer area (%#llx - %#llx)\n",
scratch_gpa_beg, scratch_gpa_end); goto e_scratch;
}
scratch_va = (void *)svm->sev_es.ghcb;
scratch_va += (scratch_gpa_beg - control->ghcb_gpa);
} else { /* * The guest memory must be read into a kernel buffer, so * limit the size
*/ if (len > GHCB_SCRATCH_AREA_LIMIT) {
pr_err("vmgexit: scratch area exceeds KVM limits (%#llx requested, %#llx limit)\n",
len, GHCB_SCRATCH_AREA_LIMIT); goto e_scratch;
}
scratch_va = kvzalloc(len, GFP_KERNEL_ACCOUNT); if (!scratch_va) return -ENOMEM;
if (kvm_read_guest(svm->vcpu.kvm, scratch_gpa_beg, scratch_va, len)) { /* Unable to copy scratch area from guest */
pr_err("vmgexit: kvm_read_guest for scratch area failed\n");
kvfree(scratch_va); return -EFAULT;
}
/* * The scratch area is outside the GHCB. The operation will * dictate whether the buffer needs to be synced before running * the vCPU next time (i.e. a read was requested so the data * must be written back to the guest memory).
*/
svm->sev_es.ghcb_sa_sync = sync;
svm->sev_es.ghcb_sa_free = true;
}
/* * PSMASH_FAIL_INUSE indicates another processor is modifying the * entry, so retry until that's no longer the case.
*/ do {
ret = psmash(pfn);
} while (ret == PSMASH_FAIL_INUSE);
vcpu->run->exit_reason = KVM_EXIT_HYPERCALL;
vcpu->run->hypercall.nr = KVM_HC_MAP_GPA_RANGE; /* * In principle this should have been -KVM_ENOSYS, but userspace (QEMU <=9.2) * assumed that vcpu->run->hypercall.ret is never changed by KVM and thus that * it was always zero on KVM_EXIT_HYPERCALL. Since KVM is now overwriting * vcpu->run->hypercall.ret, ensuring that it is zero to not break QEMU.
*/
vcpu->run->hypercall.ret = 0;
vcpu->run->hypercall.args[0] = gpa;
vcpu->run->hypercall.args[1] = 1;
vcpu->run->hypercall.args[2] = (op == SNP_PAGE_STATE_PRIVATE)
? KVM_MAP_GPA_RANGE_ENCRYPTED
: KVM_MAP_GPA_RANGE_DECRYPTED;
vcpu->run->hypercall.args[2] |= KVM_MAP_GPA_RANGE_PAGE_SZ_4K;
/* * PSC requests always get a "no action" response in SW_EXITINFO1, with * a PSC-specific return code in SW_EXITINFO2 that provides the "real" * return code. E.g. if the PSC request was interrupted, the need to * retry is communicated via SW_EXITINFO2, not SW_EXITINFO1.
*/
svm_vmgexit_no_action(svm, psc_ret);
}
/* * Everything in-flight has been processed successfully. Update the * corresponding entries in the guest's PSC buffer and zero out the * count of in-flight PSC entries.
*/ for (idx = svm->sev_es.psc_idx; svm->sev_es.psc_inflight;
svm->sev_es.psc_inflight--, idx++) { struct psc_entry *entry = &entries[idx];
if (!user_exit_on_hypercall(vcpu->kvm, KVM_HC_MAP_GPA_RANGE)) {
snp_complete_psc(svm, VMGEXIT_PSC_ERROR_GENERIC); return 1;
}
next_range: /* There should be no other PSCs in-flight at this point. */ if (WARN_ON_ONCE(svm->sev_es.psc_inflight)) {
snp_complete_psc(svm, VMGEXIT_PSC_ERROR_GENERIC); return 1;
}
/* * The PSC descriptor buffer can be modified by a misbehaved guest after * validation, so take care to only use validated copies of values used * for things like array indexing.
*/
idx_start = hdr->cur_entry;
idx_end = hdr->end_entry;
if (idx_end >= VMGEXIT_PSC_MAX_COUNT) {
snp_complete_psc(svm, VMGEXIT_PSC_ERROR_INVALID_HDR); return 1;
}
/* Find the start of the next range which needs processing. */ for (idx = idx_start; idx <= idx_end; idx++, hdr->cur_entry++) {
entry_start = entries[idx];
if (entry_start.cur_page) { /* * If this is a partially-completed 2M range, force 4K handling * for the remaining pages since they're effectively split at * this point. Subsequent code should ensure this doesn't get * combined with adjacent PSC entries where 2M handling is still * possible.
*/
npages -= entry_start.cur_page;
gfn += entry_start.cur_page;
huge = false;
}
if (npages) break;
}
if (idx > idx_end) { /* Nothing more to process. */
snp_complete_psc(svm, 0); return 1;
}
/* * Find all subsequent PSC entries that contain adjacent GPA * ranges/operations and can be combined into a single * KVM_HC_MAP_GPA_RANGE exit.
*/ while (++idx <= idx_end) { struct psc_entry entry = entries[idx];
switch (entry_start.operation) { case VMGEXIT_PSC_OP_PRIVATE: case VMGEXIT_PSC_OP_SHARED:
vcpu->run->exit_reason = KVM_EXIT_HYPERCALL;
vcpu->run->hypercall.nr = KVM_HC_MAP_GPA_RANGE; /* * In principle this should have been -KVM_ENOSYS, but userspace (QEMU <=9.2) * assumed that vcpu->run->hypercall.ret is never changed by KVM and thus that * it was always zero on KVM_EXIT_HYPERCALL. Since KVM is now overwriting * vcpu->run->hypercall.ret, ensuring that it is zero to not break QEMU.
*/
vcpu->run->hypercall.ret = 0;
vcpu->run->hypercall.args[0] = gfn_to_gpa(gfn);
vcpu->run->hypercall.args[1] = npages;
vcpu->run->hypercall.args[2] = entry_start.operation == VMGEXIT_PSC_OP_PRIVATE
? KVM_MAP_GPA_RANGE_ENCRYPTED
: KVM_MAP_GPA_RANGE_DECRYPTED;
vcpu->run->hypercall.args[2] |= entry_start.pagesize
? KVM_MAP_GPA_RANGE_PAGE_SZ_2M
: KVM_MAP_GPA_RANGE_PAGE_SZ_4K;
vcpu->arch.complete_userspace_io = snp_complete_one_psc; return 0; /* forward request to userspace */ default: /* * Only shared/private PSC operations are currently supported, so if the * entire range consists of unsupported operations (e.g. SMASH/UNSMASH), * then consider the entire range completed and avoid exiting to * userspace. In theory snp_complete_psc() can always be called directly * at this point to complete the current range and start the next one, * but that could lead to unexpected levels of recursion.
*/
__snp_complete_one_psc(svm); goto next_range;
}
BUG();
}
/*
 * Invoked as part of svm_vcpu_reset() processing of an init event.
 *
 * If an SNP AP reset is pending for @vcpu, stop using the current VMSA and
 * halt the vCPU.  Then, if a valid guest VMSA GPA has been recorded, resolve
 * it to a PFN via the gmem backend, point the VMCB at that page and mark the
 * vCPU runnable again.  Does nothing for non-SNP guests or when no reset is
 * pending.
 */
void sev_snp_init_protected_guest_state(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	struct kvm_memory_slot *slot;
	struct page *page;
	kvm_pfn_t pfn;
	gfn_t gfn;

	if (!sev_snp_guest(vcpu->kvm))
		return;

	/* Scope-based guard: the mutex is dropped on every return path. */
	guard(mutex)(&svm->sev_es.snp_vmsa_mutex);

	if (!svm->sev_es.snp_ap_waiting_for_reset)
		return;

	svm->sev_es.snp_ap_waiting_for_reset = false;

	/* Mark the vCPU as offline and not runnable */
	vcpu->arch.pv.pv_unhalted = false;
	kvm_set_mp_state(vcpu, KVM_MP_STATE_HALTED);

	/* Clear use of the VMSA */
	svm->vmcb->control.vmsa_pa = INVALID_PAGE;

	/*
	 * When replacing the VMSA during SEV-SNP AP creation,
	 * mark the VMCB dirty so that full state is always reloaded.
	 */
	vmcb_mark_all_dirty(svm->vmcb);

	/* No (valid) new VMSA was supplied: leave the vCPU halted. */
	if (!VALID_PAGE(svm->sev_es.snp_vmsa_gpa))
		return;

	/*
	 * Derive the frame number of the new VMSA from the validated GPA.
	 * Without this, the memslot and gmem lookups below would operate on
	 * an uninitialized gfn.
	 */
	gfn = gpa_to_gfn(svm->sev_es.snp_vmsa_gpa);

	slot = gfn_to_memslot(vcpu->kvm, gfn);
	if (!slot)
		return;

	/*
	 * The new VMSA will be private guest memory, so retrieve the
	 * PFN from the gmem backend.
	 */
	if (kvm_gmem_get_pfn(vcpu->kvm, slot, gfn, &pfn, &page, NULL))
		return;

	/*
	 * From this point forward, the VMSA will always be a guest-mapped page
	 * rather than the initial one allocated by KVM in svm->sev_es.vmsa. In
	 * theory, svm->sev_es.vmsa could be free'd and cleaned up here, but
	 * that involves cleanups like flushing caches, which would ideally be
	 * handled during teardown rather than guest boot. Deferring that also
	 * allows the existing logic for SEV-ES VMSAs to be re-used with
	 * minimal SNP-specific changes.
	 */
	svm->sev_es.snp_has_guest_vmsa = true;

	/* Use the new VMSA */
	svm->vmcb->control.vmsa_pa = pfn_to_hpa(pfn);

	/* Mark the vCPU as runnable */
	kvm_set_mp_state(vcpu, KVM_MP_STATE_RUNNABLE);

	/*
	 * gmem pages aren't currently migratable, but if this ever changes
	 * then care should be taken to ensure svm->sev_es.vmsa is pinned
	 * through some other means.
	 */
	kvm_release_page_clean(page);
}
/* Validate the APIC ID */
target_vcpu = kvm_get_vcpu_by_id(vcpu->kvm, apic_id); if (!target_vcpu) {
vcpu_unimpl(vcpu, "vmgexit: invalid AP APIC ID [%#x] from guest\n",
apic_id); return -EINVAL;
}
target_svm = to_svm(target_vcpu);
guard(mutex)(&target_svm->sev_es.snp_vmsa_mutex);
switch (request) { case SVM_VMGEXIT_AP_CREATE_ON_INIT: case SVM_VMGEXIT_AP_CREATE: if (vcpu->arch.regs[VCPU_REGS_RAX] != sev->vmsa_features) {
vcpu_unimpl(vcpu, "vmgexit: mismatched AP sev_features [%#lx] != [%#llx] from guest\n",
vcpu->arch.regs[VCPU_REGS_RAX], sev->vmsa_features); return -EINVAL;
}
if (!page_address_valid(vcpu, svm->vmcb->control.exit_info_2)) {
vcpu_unimpl(vcpu, "vmgexit: invalid AP VMSA address [%#llx] from guest\n",
svm->vmcb->control.exit_info_2); return -EINVAL;
}
/* * Malicious guest can RMPADJUST a large page into VMSA which * will hit the SNP erratum where the CPU will incorrectly signal * an RMP violation #PF if a hugepage collides with the RMP entry * of VMSA page, reject the AP CREATE request if VMSA address from * guest is 2M aligned.
*/ if (IS_ALIGNED(svm->vmcb->control.exit_info_2, PMD_SIZE)) {
vcpu_unimpl(vcpu, "vmgexit: AP VMSA address [%llx] from guest is unsafe as it is 2M aligned\n",
svm->vmcb->control.exit_info_2); return -EINVAL;
}
target_svm->sev_es.snp_vmsa_gpa = svm->vmcb->control.exit_info_2; break; case SVM_VMGEXIT_AP_DESTROY:
target_svm->sev_es.snp_vmsa_gpa = INVALID_PAGE; break; default:
vcpu_unimpl(vcpu, "vmgexit: invalid AP creation request [%#x] from guest\n",
request); return -EINVAL;
}
/* * Unless Creation is deferred until INIT, signal the vCPU to update * its state.
*/ if (request != SVM_VMGEXIT_AP_CREATE_ON_INIT)
kvm_make_request_and_kick(KVM_REQ_UPDATE_PROTECTED_GUEST_STATE, target_vcpu);
/* * Firmware failures are propagated on to guest, but any other failure * condition along the way should be reported to userspace. E.g. if * the PSP is dead and commands are timing out.
*/
ret = sev_issue_cmd(kvm, SEV_CMD_SNP_GUEST_REQUEST, &data, &fw_err); if (ret && !fw_err) goto out_unlock;
if (kvm_write_guest(kvm, resp_gpa, sev->guest_resp_buf, PAGE_SIZE)) {
ret = -EIO; goto out_unlock;
}
/* No action is requested *from KVM* if there was a firmware error. */
svm_vmgexit_no_action(svm, SNP_GUEST_ERR(0, fw_err));
/* * As per GHCB spec, requests of type MSG_REPORT_REQ also allow for * additional certificate data to be provided alongside the attestation * report via the guest-provided data pages indicated by RAX/RBX. The * certificate data is optional and requires additional KVM enablement * to provide an interface for userspace to provide it, but KVM still * needs to be able to handle extended guest requests either way. So * provide a stub implementation that will always return an empty * certificate table in the guest-provided data pages.
*/ if (msg_type == SNP_MSG_REPORT_REQ) { struct kvm_vcpu *vcpu = &svm->vcpu;
u64 data_npages;
gpa_t data_gpa;
if (!kvm_ghcb_rax_is_valid(svm) || !kvm_ghcb_rbx_is_valid(svm)) goto request_invalid;
if (!PAGE_ALIGNED(data_gpa)) goto request_invalid;
/* * As per GHCB spec (see "SNP Extended Guest Request"), the * certificate table is terminated by 24-bytes of zeroes.
*/ if (data_npages && kvm_clear_guest(kvm, data_gpa, 24)) return -EIO;
}
set_ghcb_msr_bits(svm, GHCB_MSR_CPUID_RESP,
GHCB_MSR_INFO_MASK,
GHCB_MSR_INFO_POS); break;
} case GHCB_MSR_AP_RESET_HOLD_REQ:
svm->sev_es.ap_reset_hold_type = AP_RESET_HOLD_MSR_PROTO;
ret = kvm_emulate_ap_reset_hold(&svm->vcpu);
/* * Preset the result to a non-SIPI return and then only set * the result to non-zero when delivering a SIPI.
*/
set_ghcb_msr_bits(svm, 0,
GHCB_MSR_AP_RESET_HOLD_RESULT_MASK,
GHCB_MSR_AP_RESET_HOLD_RESULT_POS);
set_ghcb_msr_bits(svm, GHCB_MSR_AP_RESET_HOLD_RESP,
GHCB_MSR_INFO_MASK,
GHCB_MSR_INFO_POS); break; case GHCB_MSR_HV_FT_REQ:
set_ghcb_msr_bits(svm, GHCB_HV_FT_SUPPORTED,
GHCB_MSR_HV_FT_MASK, GHCB_MSR_HV_FT_POS);
set_ghcb_msr_bits(svm, GHCB_MSR_HV_FT_RESP,
GHCB_MSR_INFO_MASK, GHCB_MSR_INFO_POS); break; case GHCB_MSR_PREF_GPA_REQ: if (!sev_snp_guest(vcpu->kvm)) goto out_terminate;
/* SEV-SNP guest requires that the GHCB GPA must be registered */ if (sev_snp_guest(svm->vcpu.kvm) && !ghcb_gpa_is_registered(svm, ghcb_gpa)) {
vcpu_unimpl(&svm->vcpu, "vmgexit: GHCB GPA [%#llx] is not registered.\n", ghcb_gpa); return -EINVAL;
}
ret = sev_es_validate_vmgexit(svm); if (ret) return ret;
svm_vmgexit_success(svm, 0);
exit_code = kvm_ghcb_get_sw_exit_code(control); switch (exit_code) { case SVM_VMGEXIT_MMIO_READ:
ret = setup_vmgexit_scratch(svm, true, control->exit_info_2); if (ret) break;
ret = kvm_sev_es_mmio_read(vcpu,
control->exit_info_1,
control->exit_info_2,
svm->sev_es.ghcb_sa); break; case SVM_VMGEXIT_MMIO_WRITE:
ret = setup_vmgexit_scratch(svm, false, control->exit_info_2); if (ret) break;
ret = kvm_sev_es_mmio_write(vcpu,
control->exit_info_1,
control->exit_info_2,
svm->sev_es.ghcb_sa); break; case SVM_VMGEXIT_NMI_COMPLETE:
++vcpu->stat.nmi_window_exits;
svm->nmi_masked = false;
kvm_make_request(KVM_REQ_EVENT, vcpu);
ret = 1; break; case SVM_VMGEXIT_AP_HLT_LOOP:
svm->sev_es.ap_reset_hold_type = AP_RESET_HOLD_NAE_EVENT;
ret = kvm_emulate_ap_reset_hold(vcpu); break; case SVM_VMGEXIT_AP_JUMP_TABLE: { struct kvm_sev_info *sev = to_kvm_sev_info(vcpu->kvm);
switch (control->exit_info_1) { case 0: /* Set AP jump table address */
sev->ap_jump_table = control->exit_info_2; break; case 1: /* Get AP jump table address */
svm_vmgexit_success(svm, sev->ap_jump_table); break; default:
pr_err("svm: vmgexit: unsupported AP jump table request - exit_info_1=%#llx\n",
control->exit_info_1);
svm_vmgexit_bad_input(svm, GHCB_ERR_INVALID_INPUT);
}
ret = 1; break;
} case SVM_VMGEXIT_HV_FEATURES:
svm_vmgexit_success(svm, GHCB_HV_FT_SUPPORTED);
ret = 1; break; case SVM_VMGEXIT_TERM_REQUEST:
pr_info("SEV-ES guest requested termination: reason %#llx info %#llx\n",
control->exit_info_1, control->exit_info_2);
vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT;
vcpu->run->system_event.type = KVM_SYSTEM_EVENT_SEV_TERM;
vcpu->run->system_event.ndata = 1;
vcpu->run->system_event.data[0] = control->ghcb_gpa; break; case SVM_VMGEXIT_PSC:
ret = setup_vmgexit_scratch(svm, true, control->exit_info_2); if (ret) break;
ret = snp_begin_psc(svm, svm->sev_es.ghcb_sa); break; case SVM_VMGEXIT_AP_CREATION:
ret = sev_snp_ap_creation(svm); if (ret) {
svm_vmgexit_bad_input(svm, GHCB_ERR_INVALID_INPUT);
}
ret = 1; break; case SVM_VMGEXIT_GUEST_REQUEST:
ret = snp_handle_guest_req(svm, control->exit_info_1, control->exit_info_2); break; case SVM_VMGEXIT_EXT_GUEST_REQUEST:
ret = snp_handle_ext_guest_req(svm, control->exit_info_1, control->exit_info_2); break; case SVM_VMGEXIT_UNSUPPORTED_EVENT:
vcpu_unimpl(vcpu, "vmgexit: unsupported event - exit_info_1=%#llx, exit_info_2=%#llx\n",
control->exit_info_1, control->exit_info_2);
ret = -EINVAL; break; default:
ret = svm_invoke_exit_handler(vcpu, exit_code);
}
return ret;
}
int sev_es_string_io(struct vcpu_svm *svm, int size, unsignedint port, int in)
{ int count; int bytes; int r;
if (svm->vmcb->control.exit_info_2 > INT_MAX) return -EINVAL;
count = svm->vmcb->control.exit_info_2; if (unlikely(check_mul_overflow(count, size, &bytes))) return -EINVAL;
r = setup_vmgexit_scratch(svm, in, bytes); if (r) return r;
void sev_es_recalc_msr_intercepts(struct kvm_vcpu *vcpu)
{ /* Clear intercepts on MSRs that are context switched by hardware. */
svm_disable_intercept_for_msr(vcpu, MSR_AMD64_SEV_ES_GHCB, MSR_TYPE_RW);
svm_disable_intercept_for_msr(vcpu, MSR_EFER, MSR_TYPE_RW);
svm_disable_intercept_for_msr(vcpu, MSR_IA32_CR_PAT, MSR_TYPE_RW);
if (boot_cpu_has(X86_FEATURE_V_TSC_AUX))
svm_set_intercept_for_msr(vcpu, MSR_TSC_AUX, MSR_TYPE_RW,
!guest_cpu_cap_has(vcpu, X86_FEATURE_RDTSCP) &&
!guest_cpu_cap_has(vcpu, X86_FEATURE_RDPID));
/* * For SEV-ES, accesses to MSR_IA32_XSS should not be intercepted if * the host/guest supports its use. * * KVM treats the guest as being capable of using XSAVES even if XSAVES * isn't enabled in guest CPUID as there is no intercept for XSAVES, * i.e. the guest can use XSAVES/XRSTOR to read/write XSS if XSAVE is * exposed to the guest and XSAVES is supported in hardware. Condition * full XSS passthrough on the guest being able to use XSAVES *and* * XSAVES being exposed to the guest so that KVM can at least honor * guest CPUID for RDMSR and WRMSR.
*/
svm_set_intercept_for_msr(vcpu, MSR_IA32_XSS, MSR_TYPE_RW,
!guest_cpu_cap_has(vcpu, X86_FEATURE_XSAVES) ||
!guest_cpuid_has(vcpu, X86_FEATURE_XSAVES));
}
/* For sev guests, the memory encryption bit is not reserved in CR3. */
best = kvm_find_cpuid_entry(vcpu, 0x8000001F); if (best)
vcpu->arch.reserved_gpa_bits &= ~(1UL << (best->ebx & 0x3f));
}
/* * An SEV-ES guest requires a VMSA area that is separate from the * VMCB page. Do not include the encryption mask on the VMSA physical * address since hardware will access it using the guest key. Note, * the VMSA will be NULL if this vCPU is the destination for intrahost * migration, and will be copied later.
*/ if (!svm->sev_es.snp_has_guest_vmsa) { if (svm->sev_es.vmsa)
svm->vmcb->control.vmsa_pa = __pa(svm->sev_es.vmsa); else
svm->vmcb->control.vmsa_pa = INVALID_PAGE;
}
if (cpu_feature_enabled(X86_FEATURE_ALLOWED_SEV_FEATURES))
svm->vmcb->control.allowed_sev_features = sev->vmsa_features |
VMCB_ALLOWED_SEV_FEATURES_VALID;
vmcb->control.intercepts[INTERCEPT_DR] = 0; if (!sev_vcpu_has_debug_swap(svm)) {
vmcb_set_intercept(&vmcb->control, INTERCEPT_DR7_READ);
vmcb_set_intercept(&vmcb->control, INTERCEPT_DR7_WRITE);
recalc_intercepts(svm);
} else { /* * Disable #DB intercept iff DebugSwap is enabled. KVM doesn't * allow debugging SEV-ES guests, and enables DebugSwap iff * NO_NESTED_DATA_BP is supported, so there's no reason to * intercept #DB when DebugSwap is enabled. For simplicity * with respect to guest debug, intercept #DB for other VMs * even if NO_NESTED_DATA_BP is supported, i.e. even if the * guest can't DoS the CPU with infinite #DB vectoring.
*/
clr_exception_intercept(svm, DB_VECTOR);
}
/* * Don't intercept #GP for SEV guests, e.g. for the VMware backdoor, as * KVM can't decrypt guest memory to decode the faulting instruction.
*/
clr_exception_intercept(svm, GP_VECTOR);
if (sev_es_guest(svm->vcpu.kvm))
sev_es_init_vmcb(svm);
}
/* * Set the GHCB MSR value as per the GHCB specification when emulating * vCPU RESET for an SEV-ES guest.
*/
set_ghcb_msr(svm, GHCB_MSR_SEV_INFO((__u64)sev->ghcb_version,
GHCB_VERSION_MIN,
sev_enc_bit));
/* * All host state for SEV-ES guests is categorized into three swap types * based on how it is handled by hardware during a world switch: * * A: VMRUN: Host state saved in host save area * VMEXIT: Host state loaded from host save area * * B: VMRUN: Host state _NOT_ saved in host save area * VMEXIT: Host state loaded from host save area * * C: VMRUN: Host state _NOT_ saved in host save area * VMEXIT: Host state initialized to default(reset) values * * Manually save type-B state, i.e. state that is loaded by VMEXIT but * isn't saved by VMRUN, that isn't already saved by VMSAVE (performed * by common SVM code).
*/
hostsa->xcr0 = kvm_host.xcr0;
hostsa->pkru = read_pkru();
hostsa->xss = kvm_host.xss;
/* * If DebugSwap is enabled, debug registers are loaded but NOT saved by * the CPU (Type-B). If DebugSwap is disabled/unsupported, the CPU does * not save or load debug registers. Sadly, KVM can't prevent SNP * guests from lying about DebugSwap on secondary vCPUs, i.e. the * SEV_FEATURES provided at "AP Create" isn't guaranteed to match what * the guest has actually enabled (or not!) in the VMSA. * * If DebugSwap is *possible*, save the masks so that they're restored * if the guest enables DebugSwap. But for the DRs themselves, do NOT * rely on the CPU to restore the host values; KVM will restore them as * needed in common code, via hw_breakpoint_restore(). Note, KVM does * NOT support virtualizing Breakpoint Extensions, i.e. the mask MSRs * don't need to be restored per se, KVM just needs to ensure they are * loaded with the correct values *if* the CPU writes the MSRs.
*/ if (sev_vcpu_has_debug_swap(svm) ||
(sev_snp_guest(kvm) && cpu_feature_enabled(X86_FEATURE_DEBUG_SWAP))) {
hostsa->dr0_addr_mask = amd_get_dr_addr_mask(0);
hostsa->dr1_addr_mask = amd_get_dr_addr_mask(1);
hostsa->dr2_addr_mask = amd_get_dr_addr_mask(2);
hostsa->dr3_addr_mask = amd_get_dr_addr_mask(3);
}
/* * TSC_AUX is always virtualized for SEV-ES guests when the feature is * available, i.e. TSC_AUX is loaded on #VMEXIT from the host save area. * Set the save area to the current hardware value, i.e. the current * user return value, so that the correct value is restored on #VMEXIT.
*/ if (cpu_feature_enabled(X86_FEATURE_V_TSC_AUX) &&
!WARN_ON_ONCE(tsc_aux_uret_slot < 0))
hostsa->tsc_aux = kvm_get_user_return_msr(tsc_aux_uret_slot);
}
/* First SIPI: Use the values as initially set by the VMM */ if (!svm->sev_es.received_first_sipi) {
svm->sev_es.received_first_sipi = true; return;
}
/* Subsequent SIPI */ switch (svm->sev_es.ap_reset_hold_type) { case AP_RESET_HOLD_NAE_EVENT: /* * Return from an AP Reset Hold VMGEXIT, where the guest will * set the CS and RIP. Set SW_EXIT_INFO_2 to a non-zero value.
*/
svm_vmgexit_success(svm, 1); break; case AP_RESET_HOLD_MSR_PROTO: /* * Return from an AP Reset Hold VMGEXIT, where the guest will * set the CS and RIP. Set GHCB data field to a non-zero value.
*/
set_ghcb_msr_bits(svm, 1,
GHCB_MSR_AP_RESET_HOLD_RESULT_MASK,
GHCB_MSR_AP_RESET_HOLD_RESULT_POS);
if (!cc_platform_has(CC_ATTR_HOST_SEV_SNP)) return alloc_pages_node(node, gfp | __GFP_ZERO, 0);
/* * Allocate an SNP-safe page to work around the SNP erratum where * the CPU will incorrectly signal an RMP violation #PF if a * hugepage (2MB or 1GB) collides with the RMP entry of a * 2MB-aligned VMCB, VMSA, or AVIC backing page. * * Allocate one extra page, choose a page which is not * 2MB-aligned, and free the other.
*/
p = alloc_pages_node(node, gfp | __GFP_ZERO, 1); if (!p) return NULL;
/* * The only time RMP faults occur for shared pages is when the guest is * triggering an RMP fault for an implicit page-state change from * shared->private. Implicit page-state changes are forwarded to * userspace via KVM_EXIT_MEMORY_FAULT events, however, so RMP faults * for shared pages should not end up here.
*/ if (!kvm_mem_is_private(kvm, gfn)) {
pr_warn_ratelimited("SEV: Unexpected RMP fault for non-private GPA 0x%llx\n",
gpa); return;
}
slot = gfn_to_memslot(kvm, gfn); if (!kvm_slot_can_be_private(slot)) {
pr_warn_ratelimited("SEV: Unexpected RMP fault, non-private slot for GPA 0x%llx\n",
gpa); return;
}
ret = kvm_gmem_get_pfn(kvm, slot, gfn, &pfn, &page, &order); if (ret) {
pr_warn_ratelimited("SEV: Unexpected RMP fault, no backing page for private GPA 0x%llx\n",
gpa); return;
}
ret = snp_lookup_rmpentry(pfn, &assigned, &rmp_level); if (ret || !assigned) {
pr_warn_ratelimited("SEV: Unexpected RMP fault, no assigned RMP entry found for GPA 0x%llx PFN 0x%llx error %d\n",
gpa, pfn, ret); goto out_no_trace;
}
/* * There are 2 cases where a PSMASH may be needed to resolve an #NPF * with PFERR_GUEST_RMP_BIT set: * * 1) RMPADJUST/PVALIDATE can trigger an #NPF with PFERR_GUEST_SIZEM * bit set if the guest issues them with a smaller granularity than * what is indicated by the page-size bit in the 2MB RMP entry for * the PFN that backs the GPA. * * 2) Guest access via NPT can trigger an #NPF if the NPT mapping is * smaller than what is indicated by the 2MB RMP entry for the PFN * that backs the GPA. * * In both these cases, the corresponding 2M RMP entry needs to * be PSMASH'd to 512 4K RMP entries. If the RMP entry is already * split into 4K RMP entries, then this is likely a spurious case which * can occur when there are concurrent accesses by the guest to a 2MB * GPA range that is backed by a 2MB-aligned PFN whose RMP entry is in * the process of being PSMASH'd into 4K entries. These cases should * resolve automatically on subsequent accesses, so just ignore them * here.
*/ if (rmp_level == PG_LEVEL_4K) goto out;
ret = snp_rmptable_psmash(pfn); if (ret) { /* * Look it up again. If it's 4K now then the PSMASH may have * raced with another process and the issue has already resolved * itself.
*/ if (!snp_lookup_rmpentry(pfn, &assigned, &rmp_level) &&
assigned && rmp_level == PG_LEVEL_4K) goto out;
pr_warn_ratelimited("SEV: Unable to split RMP entry for GPA 0x%llx PFN 0x%llx ret %d\n",
gpa, pfn, ret);
}
/* * If this is a large folio, and the entire 2M range containing the * PFN is currently shared, then the entire 2M-aligned range can be * set to private via a single 2M RMP entry.
*/ if (max_level_for_order(order) > PG_LEVEL_4K &&
is_pfn_range_shared(pfn_aligned, pfn_aligned + PTRS_PER_PMD)) returntrue;
returnfalse;
}
int sev_gmem_prepare(struct kvm *kvm, kvm_pfn_t pfn, gfn_t gfn, int max_order)
{ struct kvm_sev_info *sev = to_kvm_sev_info(kvm);
kvm_pfn_t pfn_aligned;
gfn_t gfn_aligned; int level, rc; bool assigned;
if (!sev_snp_guest(kvm)) return 0;
rc = snp_lookup_rmpentry(pfn, &assigned, &level); if (rc) {
pr_err_ratelimited("SEV: Failed to look up RMP entry: GFN %llx PFN %llx error %d\n",
gfn, pfn, rc); return -ENOENT;
}
/* * If an unaligned PFN corresponds to a 2M region assigned as a * large page in the RMP table, PSMASH the region into individual * 4K RMP entries before attempting to convert a 4K sub-page.
*/ if (!use_2m_update && rmp_level > PG_LEVEL_4K) { /* * This shouldn't fail, but if it does, report it, but * still try to update RMP entry to shared and pray this * was a spurious error that can be addressed later.
*/
rc = snp_rmptable_psmash(pfn);
WARN_ONCE(rc, "SEV: Failed to PSMASH RMP entry for PFN 0x%llx error %d\n",
pfn, rc);
}
rc = rmp_make_shared(pfn, use_2m_update ? PG_LEVEL_2M : PG_LEVEL_4K); if (WARN_ONCE(rc, "SEV: Failed to update RMP entry for PFN 0x%llx error %d\n",
pfn, rc)) goto next_pfn;
/* * SEV-ES avoids host/guest cache coherency issues through * WBNOINVD hooks issued via MMU notifiers during run-time, and * KVM's VM destroy path at shutdown. Those MMU notifier events * don't cover gmem since there is no requirement to map pages * to a HVA in order to use them for a running guest. While the * shutdown path would still likely cover things for SNP guests, * userspace may also free gmem pages during run-time via * hole-punching operations on the guest_memfd, so flush the * cache entries for these pages before free'ing them back to * the host.
*/
clflush_cache_range(__va(pfn_to_hpa(pfn)),
use_2m_update ? PMD_SIZE : PAGE_SIZE);
next_pfn:
pfn += use_2m_update ? PTRS_PER_PMD : 1;
cond_resched();
}
}
int sev_private_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn)
{ int level, rc; bool assigned;
/* * If the VMSA has not yet been encrypted, return a pointer to the * current un-encrypted VMSA.
*/ if (!vcpu->arch.guest_state_protected) return (struct vmcb_save_area *)svm->sev_es.vmsa;
sev = to_kvm_sev_info(vcpu->kvm);
/* Check if the SEV policy allows debugging */ if (sev_snp_guest(vcpu->kvm)) { if (!(sev->policy & SNP_POLICY_DEBUG)) return NULL;
} else { if (sev->policy & SEV_POLICY_NODBG) return NULL;
}
if (sev_snp_guest(vcpu->kvm)) { struct sev_data_snp_dbg dbg = {0};
vmsa = snp_alloc_firmware_page(__GFP_ZERO); if (!vmsa) return NULL;
ret = sev_do_cmd(SEV_CMD_SNP_DBG_DECRYPT, &dbg, &error);
/* * Return the target page to a hypervisor page no matter what. * If this fails, the page can't be used, so leak it and don't * try to use it.
*/ if (snp_page_reclaim(vcpu->kvm, PHYS_PFN(__pa(vmsa)))) return NULL;
ret = sev_do_cmd(SEV_CMD_DBG_DECRYPT, &dbg, &error); if (ret) {
pr_err("SEV: SEV_CMD_DBG_DECRYPT failed ret=%d, fw_error=%d (0x%x)\n",
ret, error, error);
__free_page(vmsa_page);
return NULL;
}
}
return vmsa;
}
/*
 * Release a VMSA pointer that the caller is done inspecting.
 *
 * @vcpu: vCPU the VMSA pointer was obtained for.
 * @vmsa: page-backed VMSA pointer to release; may be NULL.
 *
 * Nothing is freed when the vCPU's guest state is not protected or when
 * @vmsa is NULL; otherwise the page backing @vmsa is returned to the page
 * allocator via free_page().
 */
void sev_free_decrypted_vmsa(struct kvm_vcpu *vcpu, struct vmcb_save_area *vmsa)
{
	/* If the VMSA has not yet been encrypted, nothing was allocated */
	if (!vcpu->arch.guest_state_protected || !vmsa)
		return;

	/* free_page() takes the page's address as an unsigned long. */
	free_page((unsigned long)vmsa);
}
Messung V0.5 in Prozent
¤ Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereit gestellten Informationen zugesichert. 0.89 Bemerkung:
(vorverarbeitet am 2026-04-26)
¤
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereit gestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.