/* * Hyper-V requires all of these, so mark them as supported even though * they are just treated the same as all-context.
*/ #define VMX_VPID_EXTENT_SUPPORTED_MASK \
(VMX_VPID_EXTENT_INDIVIDUAL_ADDR_BIT | \
VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT | \
VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT | \
VMX_VPID_EXTENT_SINGLE_NON_GLOBAL_BIT)
for (i = j = 0; i < max_shadow_read_only_fields; i++) { struct shadow_vmcs_field entry = shadow_read_only_fields[i];
u16 field = entry.encoding;
if (vmcs_field_width(field) == VMCS_FIELD_WIDTH_U64 &&
(i + 1 == max_shadow_read_only_fields ||
shadow_read_only_fields[i + 1].encoding != field + 1))
pr_err("Missing field from shadow_read_only_field %x\n",
field + 1);
for (i = j = 0; i < max_shadow_read_write_fields; i++) { struct shadow_vmcs_field entry = shadow_read_write_fields[i];
u16 field = entry.encoding;
if (vmcs_field_width(field) == VMCS_FIELD_WIDTH_U64 &&
(i + 1 == max_shadow_read_write_fields ||
shadow_read_write_fields[i + 1].encoding != field + 1))
pr_err("Missing field from shadow_read_write_field %x\n",
field + 1);
WARN_ONCE(field >= GUEST_ES_AR_BYTES &&
field <= GUEST_TR_AR_BYTES, "Update vmcs12_write_any() to drop reserved bits from AR_BYTES");
/* * PML and the preemption timer can be emulated, but the * processor cannot vmwrite to fields that don't exist * on bare metal.
*/ switch (field) { case GUEST_PML_INDEX: if (!cpu_has_vmx_pml()) continue; break; case VMX_PREEMPTION_TIMER_VALUE: if (!cpu_has_vmx_preemption_timer()) continue; break; case GUEST_INTR_STATUS: if (!cpu_has_vmx_apicv()) continue; break; default: break;
}
staticint nested_vmx_failValid(struct kvm_vcpu *vcpu,
u32 vm_instruction_error)
{
vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
& ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
X86_EFLAGS_SF | X86_EFLAGS_OF))
| X86_EFLAGS_ZF);
get_vmcs12(vcpu)->vm_instruction_error = vm_instruction_error; /* * We don't need to force sync to shadow VMCS because * VM_INSTRUCTION_ERROR is not shadowed. Enlightened VMCS 'shadows' all * fields and thus must be synced.
*/ if (nested_vmx_is_evmptr12_set(to_vmx(vcpu)))
to_vmx(vcpu)->nested.need_vmcs12_to_shadow_sync = true;
/* * failValid writes the error number to the current VMCS, which * can't be done if there isn't a current VMCS.
*/ if (vmx->nested.current_vmptr == INVALID_GPA &&
!nested_vmx_is_evmptr12_valid(vmx)) return nested_vmx_failInvalid(vcpu);
staticbool nested_evmcs_handle_vmclear(struct kvm_vcpu *vcpu, gpa_t vmptr)
{ #ifdef CONFIG_KVM_HYPERV struct vcpu_vmx *vmx = to_vmx(vcpu); /* * When Enlightened VMEntry is enabled on the calling CPU we treat * memory area pointer by vmptr as Enlightened VMCS (as there's no good * way to distinguish it from VMCS12) and we must not corrupt it by * writing to the non-existent 'launch_state' field. The area doesn't * have to be the currently active EVMCS on the calling CPU and there's * nothing KVM has to do to transition it from 'active' to 'non-active' * state. It is possible that the area will stay mapped as * vmx->nested.hv_evmcs but this shouldn't be a problem.
*/ if (!guest_cpu_cap_has_evmcs(vcpu) ||
!evmptr_is_valid(nested_get_evmptr(vcpu))) returnfalse;
if (nested_vmx_evmcs(vmx) && vmptr == vmx->nested.hv_evmcs_vmptr)
nested_release_evmcs(vcpu);
/* * Free whatever needs to be freed from vmx->nested when L1 goes down, or * just stops using VMX.
*/ staticvoid free_nested(struct kvm_vcpu *vcpu)
{ struct vcpu_vmx *vmx = to_vmx(vcpu);
if (WARN_ON_ONCE(vmx->loaded_vmcs != &vmx->vmcs01))
vmx_switch_vmcs(vcpu, &vmx->vmcs01);
if (!vmx->nested.vmxon && !vmx->nested.smm.vmxon) return;
/* * Ensure that the current vmcs of the logical processor is the * vmcs01 of the vcpu before calling free_nested().
*/ void nested_vmx_free_vcpu(struct kvm_vcpu *vcpu)
{
vcpu_load(vcpu);
vmx_leave_nested(vcpu);
vcpu_put(vcpu);
}
if (vmx->nested.pml_full) {
vm_exit_reason = EXIT_REASON_PML_FULL;
vmx->nested.pml_full = false;
/* * It should be impossible to trigger a nested PML Full VM-Exit * for anything other than an EPT Violation from L2. KVM *can* * trigger nEPT page fault injection in response to an EPT * Misconfig, e.g. if the MMIO SPTE was stale and L1's EPT * tables also changed, but KVM should not treat EPT Misconfig * VM-Exits as writes.
*/
WARN_ON_ONCE(vmx->vt.exit_reason.basic != EXIT_REASON_EPT_VIOLATION);
/* * PML Full and EPT Violation VM-Exits both use bit 12 to report * "NMI unblocking due to IRET", i.e. the bit can be propagated * as-is from the original EXIT_QUALIFICATION.
*/
exit_qualification = vmx_get_exit_qual(vcpu) & INTR_INFO_UNBLOCK_NMI;
} else { if (fault->error_code & PFERR_RSVD_MASK) {
vm_exit_reason = EXIT_REASON_EPT_MISCONFIG;
exit_qualification = 0;
} else {
exit_qualification = fault->exit_qualification;
exit_qualification |= vmx_get_exit_qual(vcpu) &
(EPT_VIOLATION_GVA_IS_VALID |
EPT_VIOLATION_GVA_TRANSLATED);
vm_exit_reason = EXIT_REASON_EPT_VIOLATION;
}
/* * Although the caller (kvm_inject_emulated_page_fault) would * have already synced the faulting address in the shadow EPT * tables for the current EPTP12, we also need to sync it for * any other cached EPTP02s based on the same EP4TA, since the * TLB associates mappings to the EP4TA rather than the full EPTP.
*/
nested_ept_invalidate_addr(vcpu, vmcs12->ept_pointer,
fault->address);
}
/* * Drop bits 31:16 of the error code when performing the #PF mask+match * check. All VMCS fields involved are 32 bits, but Intel CPUs never * set bits 31:16 and VMX disallows setting bits 31:16 in the injected * error code. Including the to-be-dropped bits in the check might * result in an "impossible" or missed exit from L1's perspective.
*/ if (vector == PF_VECTOR) return nested_vmx_is_page_fault_vmexit(vmcs12, (u16)error_code);
if (CC(!page_address_valid(vcpu, vmcs12->virtual_apic_page_addr))) return -EINVAL;
return 0;
}
/* * For x2APIC MSRs, ignore the vmcs01 bitmap. L1 can enable x2APIC without L1 * itself utilizing x2APIC. All MSRs were previously set to be intercepted, * only the "disable intercept" case needs to be handled.
*/ staticvoid nested_vmx_disable_intercept_for_x2apic_msr(unsignedlong *msr_bitmap_l1, unsignedlong *msr_bitmap_l0,
u32 msr, int type)
{ if (type & MSR_TYPE_R && !vmx_test_msr_bitmap_read(msr_bitmap_l1, msr))
vmx_clear_msr_bitmap_read(msr_bitmap_l0, msr);
if (type & MSR_TYPE_W && !vmx_test_msr_bitmap_write(msr_bitmap_l1, msr))
vmx_clear_msr_bitmap_write(msr_bitmap_l0, msr);
}
staticinlinevoid enable_x2apic_msr_intercepts(unsignedlong *msr_bitmap)
{ int msr;
for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) { unsigned word = msr / BITS_PER_LONG;
staticinlinevoid nested_vmx_set_intercept_for_msr(struct vcpu_vmx *vmx, unsignedlong *msr_bitmap_l1, unsignedlong *msr_bitmap_l0,
u32 msr, int types)
{ if (types & MSR_TYPE_R)
nested_vmx_set_msr_read_intercept(vmx, msr_bitmap_l1,
msr_bitmap_l0, msr); if (types & MSR_TYPE_W)
nested_vmx_set_msr_write_intercept(vmx, msr_bitmap_l1,
msr_bitmap_l0, msr);
}
/* * Merge L0's and L1's MSR bitmap, return false to indicate that * we do not use the hardware.
*/ staticinlinebool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
{ struct vcpu_vmx *vmx = to_vmx(vcpu); int msr; unsignedlong *msr_bitmap_l1; unsignedlong *msr_bitmap_l0 = vmx->nested.vmcs02.msr_bitmap; struct kvm_host_map map;
/* Nothing to do if the MSR bitmap is not in use. */ if (!cpu_has_vmx_msr_bitmap() ||
!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS)) returnfalse;
/* * MSR bitmap update can be skipped when: * - MSR bitmap for L1 hasn't changed. * - Nested hypervisor (L1) is attempting to launch the same L2 as * before. * - Nested hypervisor (L1) has enabled 'Enlightened MSR Bitmap' feature * and tells KVM (L0) there were no changes in MSR bitmap for L2.
*/ if (!vmx->nested.force_msr_bitmap_recalc) { struct hv_enlightened_vmcs *evmcs = nested_vmx_evmcs(vmx);
if (evmcs && evmcs->hv_enlightenments_control.msr_bitmap &&
evmcs->hv_clean_fields & HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP) returntrue;
}
if (kvm_vcpu_map_readonly(vcpu, gpa_to_gfn(vmcs12->msr_bitmap), &map)) returnfalse;
msr_bitmap_l1 = (unsignedlong *)map.hva;
/* * To keep the control flow simple, pay eight 8-byte writes (sixteen * 4-byte writes on 32-bit systems) up front to enable intercepts for * the x2APIC MSR range and selectively toggle those relevant to L2.
*/
enable_x2apic_msr_intercepts(msr_bitmap_l0);
if (nested_cpu_has_virt_x2apic_mode(vmcs12)) { if (nested_cpu_has_apic_reg_virt(vmcs12)) { /* * L0 need not intercept reads for MSRs between 0x800 * and 0x8ff, it just lets the processor take the value * from the virtual-APIC page; take those 256 bits * directly from the L1 bitmap.
*/ for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) { unsigned word = msr / BITS_PER_LONG;
/* * If virtualize x2apic mode is enabled, * virtualize apic access must be disabled.
*/ if (CC(nested_cpu_has_virt_x2apic_mode(vmcs12) &&
nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))) return -EINVAL;
/* * If virtual interrupt delivery is enabled, * we must exit on external interrupts.
*/ if (CC(nested_cpu_has_vid(vmcs12) && !nested_exit_on_intr(vcpu))) return -EINVAL;
/* * bits 15:8 should be zero in posted_intr_nv, * the descriptor address has been already checked * in nested_get_vmcs12_pages. * * bits 5:0 of posted_intr_desc_addr should be zero.
*/ if (nested_cpu_has_posted_intr(vmcs12) &&
(CC(!nested_cpu_has_vid(vmcs12)) ||
CC(!nested_exit_intr_ack_set(vcpu)) ||
CC((vmcs12->posted_intr_nv & 0xff00)) ||
CC(!kvm_vcpu_is_legal_aligned_gpa(vcpu, vmcs12->posted_intr_desc_addr, 64)))) return -EINVAL;
/* tpr shadow is needed by all apicv features. */ if (CC(!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW))) return -EINVAL;
/* * Exceeding the limit results in architecturally _undefined_ behavior, * i.e. KVM is allowed to do literally anything in response to a bad * limit. Immediately generate a consistency check so that code that * consumes the count doesn't need to worry about extreme edge cases.
*/ if (count > nested_vmx_max_atomic_switch_msrs(vcpu)) return -EINVAL;
staticint nested_vmx_store_msr_check(struct kvm_vcpu *vcpu, struct vmx_msr_entry *e)
{ if (CC(e->index == MSR_IA32_SMBASE) || /* SMM is not supported */
nested_vmx_msr_check_common(vcpu, e)) return -EINVAL; return 0;
}
/* * Load guest's/host's msr at nested entry/exit. * return 0 for success, entry index for failure. * * One of the failure modes for MSR load/store is when a list exceeds the * virtual hardware's capacity. To maintain compatibility with hardware inasmuch * as possible, process all valid entries before failing rather than precheck * for a capacity violation.
*/ static u32 nested_vmx_load_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
{
u32 i; struct vmx_msr_entry e;
u32 max_msr_list_size = nested_vmx_max_atomic_switch_msrs(vcpu);
for (i = 0; i < count; i++) { if (WARN_ON_ONCE(i >= max_msr_list_size)) goto fail;
if (kvm_vcpu_read_guest(vcpu, gpa + i * sizeof(e),
&e, sizeof(e))) {
pr_debug_ratelimited( "%s cannot read MSR entry (%u, 0x%08llx)\n",
__func__, i, gpa + i * sizeof(e)); goto fail;
} if (nested_vmx_load_msr_check(vcpu, &e)) {
pr_debug_ratelimited( "%s check failed (%u, 0x%x, 0x%x)\n",
__func__, i, e.index, e.reserved); goto fail;
} if (kvm_set_msr_with_filter(vcpu, e.index, e.value)) {
pr_debug_ratelimited( "%s cannot write MSR (%u, 0x%x, 0x%llx)\n",
__func__, i, e.index, e.value); goto fail;
}
} return 0;
fail: /* Note, max_msr_list_size is at most 4096, i.e. this can't wrap. */ return i + 1;
}
/* * If the L0 hypervisor stored a more accurate value for the TSC that * does not include the time taken for emulation of the L2->L1 * VM-exit in L0, use the more accurate value.
*/ if (msr_index == MSR_IA32_TSC) { int i = vmx_find_loadstore_msr_slot(&vmx->msr_autostore.guest,
MSR_IA32_TSC);
if (i >= 0) {
u64 val = vmx->msr_autostore.guest.val[i].value;
if (in_vmcs12_store_list && !in_autostore_list) { if (autostore->nr == MAX_NR_LOADSTORE_MSRS) { /* * Emulated VMEntry does not fail here. Instead a less * accurate value will be returned by * nested_vmx_get_vmexit_msr_value() by reading KVM's * internal MSR state instead of reading the value from * the vmcs02 VMExit MSR-store area.
*/
pr_warn_ratelimited( "Not enough msr entries in msr_autostore. Can't add msr %x\n",
msr_index); return;
}
last = autostore->nr++;
autostore->val[last].index = msr_index;
} elseif (!in_vmcs12_store_list && in_autostore_list) {
last = --autostore->nr;
autostore->val[msr_autostore_slot] = autostore->val[last];
}
}
/* * Load guest's/host's cr3 at nested entry/exit. @nested_ept is true if we are * emulating VM-Entry into a guest with EPT enabled. On failure, the expected * Exit Qualification (for a VM-Entry consistency check VM-Exit) is assigned to * @entry_failure_code.
*/ staticint nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsignedlong cr3, bool nested_ept, bool reload_pdptrs, enum vm_entry_failure_code *entry_failure_code)
{ if (CC(!kvm_vcpu_is_legal_cr3(vcpu, cr3))) {
*entry_failure_code = ENTRY_FAIL_DEFAULT; return -EINVAL;
}
/* * If PAE paging and EPT are both on, CR3 is not used by the CPU and * must not be dereferenced.
*/ if (reload_pdptrs && !nested_ept && is_pae_paging(vcpu) &&
CC(!load_pdptrs(vcpu, cr3))) {
*entry_failure_code = ENTRY_FAIL_PDPTE; return -EINVAL;
}
/* Re-initialize the MMU, e.g. to pick up CR4 MMU role changes. */
kvm_init_mmu(vcpu);
if (!nested_ept)
kvm_mmu_new_pgd(vcpu, cr3);
return 0;
}
/* * Returns if KVM is able to config CPU to tag TLB entries * populated by L2 differently than TLB entries populated * by L1. * * If L0 uses EPT, L1 and L2 run with different EPTP because * guest_mode is part of kvm_mmu_page_role. Thus, TLB entries * are tagged with different EPTP. * * If L1 uses VPID and we allocated a vpid02, TLB entries are tagged * with different VPID (L1 entries are tagged with vmx->vpid * while L2 entries are tagged with vmx->nested.vpid02).
*/ staticbool nested_has_guest_tlb_tag(struct kvm_vcpu *vcpu)
{ struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
/* * If VPID is disabled, then guest TLB accesses use VPID=0, i.e. the * same VPID as the host, and so architecturally, linear and combined * mappings for VPID=0 must be flushed at VM-Enter and VM-Exit. KVM * emulates L2 sharing L1's VPID=0 by using vpid01 while running L2, * and so KVM must also emulate TLB flush of VPID=0, i.e. vpid01. This * is required if VPID is disabled in KVM, as a TLB flush (there are no * VPIDs) still occurs from L1's perspective, and KVM may need to * synchronize the MMU in response to the guest TLB flush. * * Note, using TLB_FLUSH_GUEST is correct even if nested EPT is in use. * EPT is a special snowflake, as guest-physical mappings aren't * flushed on VPID invalidations, including VM-Enter or VM-Exit with * VPID disabled. As a result, KVM _never_ needs to sync nEPT * entries on VM-Enter because L1 can't rely on VM-Enter to flush * those mappings.
*/ if (!nested_cpu_has_vpid(vmcs12)) {
kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu); return;
}
/* L2 should never have a VPID if VPID is disabled. */
WARN_ON(!enable_vpid);
/* * VPID is enabled and in use by vmcs12. If vpid12 is changing, then * emulate a guest TLB flush as KVM does not track vpid12 history nor * is the VPID incorporated into the MMU context. I.e. KVM must assume * that the new vpid12 has never been used and thus represents a new * guest ASID that cannot have entries in the TLB.
*/ if (is_vmenter && vmcs12->virtual_processor_id != vmx->nested.last_vpid) {
vmx->nested.last_vpid = vmcs12->virtual_processor_id;
kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu); return;
}
/* * If VPID is enabled, used by vmc12, and vpid12 is not changing but * does not have a unique TLB tag (ASID), i.e. EPT is disabled and * KVM was unable to allocate a VPID for L2, flush the current context * as the effective ASID is common to both L1 and L2.
*/ if (!nested_has_guest_tlb_tag(vcpu))
kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
}
/* * Except for 32BIT_PHYS_ADDR_ONLY, which is an anti-feature bit (has * inverted polarity), the incoming value must not set feature bits or * reserved bits that aren't allowed/supported by KVM. Fields, i.e. * multi-bit values, are explicitly checked below.
*/ if (!is_bitwise_subset(vmx_basic, data, feature_bits | reserved_bits)) return -EINVAL;
/* * KVM does not emulate a version of VMX that constrains physical * addresses of VMX structures (e.g. VMCS) to 32-bits.
*/ if (data & VMX_BASIC_32BIT_PHYS_ADDR_ONLY) return -EINVAL;
if (vmx_basic_vmcs_revision_id(vmx_basic) !=
vmx_basic_vmcs_revision_id(data)) return -EINVAL;
if (vmx_basic_vmcs_size(vmx_basic) > vmx_basic_vmcs_size(data)) return -EINVAL;
/* * The incoming value must not set feature bits or reserved bits that * aren't allowed/supported by KVM. Fields, i.e. multi-bit values, are * explicitly checked below.
*/ if (!is_bitwise_subset(vmx_misc, data, feature_bits | reserved_bits)) return -EINVAL;
if ((vmx->nested.msrs.pinbased_ctls_high &
PIN_BASED_VMX_PREEMPTION_TIMER) &&
vmx_misc_preemption_timer_rate(data) !=
vmx_misc_preemption_timer_rate(vmx_misc)) return -EINVAL;
if (vmx_misc_cr3_count(data) > vmx_misc_cr3_count(vmx_misc)) return -EINVAL;
if (vmx_misc_max_msr(data) > vmx_misc_max_msr(vmx_misc)) return -EINVAL;
if (vmx_misc_mseg_revid(data) != vmx_misc_mseg_revid(vmx_misc)) return -EINVAL;
vmx->nested.msrs.misc_low = data;
vmx->nested.msrs.misc_high = data >> 32;
/* * 1 bits (which indicates bits which "must-be-1" during VMX operation) * must be 1 in the restored value.
*/ if (!is_bitwise_subset(data, *msr, -1ULL)) return -EINVAL;
/* * Called when userspace is restoring VMX MSRs. * * Returns 0 on success, non-0 otherwise.
*/ int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
{ struct vcpu_vmx *vmx = to_vmx(vcpu);
/* * Don't allow changes to the VMX capability MSRs while the vCPU * is in VMX operation.
*/ if (vmx->nested.vmxon) return -EBUSY;
switch (msr_index) { case MSR_IA32_VMX_BASIC: return vmx_restore_vmx_basic(vmx, data); case MSR_IA32_VMX_PINBASED_CTLS: case MSR_IA32_VMX_PROCBASED_CTLS: case MSR_IA32_VMX_EXIT_CTLS: case MSR_IA32_VMX_ENTRY_CTLS: /* * The "non-true" VMX capability MSRs are generated from the * "true" MSRs, so we do not support restoring them directly. * * If userspace wants to emulate VMX_BASIC[55]=0, userspace * should restore the "true" MSRs with the must-be-1 bits * set according to the SDM Vol 3. A.2 "RESERVED CONTROLS AND * DEFAULT SETTINGS".
*/ return -EINVAL; case MSR_IA32_VMX_TRUE_PINBASED_CTLS: case MSR_IA32_VMX_TRUE_PROCBASED_CTLS: case MSR_IA32_VMX_TRUE_EXIT_CTLS: case MSR_IA32_VMX_TRUE_ENTRY_CTLS: case MSR_IA32_VMX_PROCBASED_CTLS2: return vmx_restore_control_msr(vmx, msr_index, data); case MSR_IA32_VMX_MISC: return vmx_restore_vmx_misc(vmx, data); case MSR_IA32_VMX_CR0_FIXED0: case MSR_IA32_VMX_CR4_FIXED0: return vmx_restore_fixed0_msr(vmx, msr_index, data); case MSR_IA32_VMX_CR0_FIXED1: case MSR_IA32_VMX_CR4_FIXED1: /* * These MSRs are generated based on the vCPU's CPUID, so we * do not support restoring them directly.
*/ return -EINVAL; case MSR_IA32_VMX_EPT_VPID_CAP: return vmx_restore_vmx_ept_vpid_cap(vmx, data); case MSR_IA32_VMX_VMCS_ENUM:
vmx->nested.msrs.vmcs_enum = data; return 0; case MSR_IA32_VMX_VMFUNC: if (data & ~vmcs_config.nested.vmfunc_controls) return -EINVAL;
vmx->nested.msrs.vmfunc_controls = data; return 0; default: /* * The rest of the VMX capability MSRs do not support restore.
*/ return -EINVAL;
}
}
/* Returns 0 on success, non-0 otherwise. */ int vmx_get_vmx_msr(struct nested_vmx_msrs *msrs, u32 msr_index, u64 *pdata)
{ switch (msr_index) { case MSR_IA32_VMX_BASIC:
*pdata = msrs->basic; break; case MSR_IA32_VMX_TRUE_PINBASED_CTLS: case MSR_IA32_VMX_PINBASED_CTLS:
*pdata = vmx_control_msr(
msrs->pinbased_ctls_low,
msrs->pinbased_ctls_high); if (msr_index == MSR_IA32_VMX_PINBASED_CTLS)
*pdata |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR; break; case MSR_IA32_VMX_TRUE_PROCBASED_CTLS: case MSR_IA32_VMX_PROCBASED_CTLS:
*pdata = vmx_control_msr(
msrs->procbased_ctls_low,
msrs->procbased_ctls_high); if (msr_index == MSR_IA32_VMX_PROCBASED_CTLS)
*pdata |= CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR; break; case MSR_IA32_VMX_TRUE_EXIT_CTLS: case MSR_IA32_VMX_EXIT_CTLS:
*pdata = vmx_control_msr(
msrs->exit_ctls_low,
msrs->exit_ctls_high); if (msr_index == MSR_IA32_VMX_EXIT_CTLS)
*pdata |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR; break; case MSR_IA32_VMX_TRUE_ENTRY_CTLS: case MSR_IA32_VMX_ENTRY_CTLS:
*pdata = vmx_control_msr(
msrs->entry_ctls_low,
msrs->entry_ctls_high); if (msr_index == MSR_IA32_VMX_ENTRY_CTLS)
*pdata |= VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR; break; case MSR_IA32_VMX_MISC:
*pdata = vmx_control_msr(
msrs->misc_low,
msrs->misc_high); break; case MSR_IA32_VMX_CR0_FIXED0:
*pdata = msrs->cr0_fixed0; break; case MSR_IA32_VMX_CR0_FIXED1:
*pdata = msrs->cr0_fixed1; break; case MSR_IA32_VMX_CR4_FIXED0:
*pdata = msrs->cr4_fixed0; break; case MSR_IA32_VMX_CR4_FIXED1:
*pdata = msrs->cr4_fixed1; break; case MSR_IA32_VMX_VMCS_ENUM:
*pdata = msrs->vmcs_enum; break; case MSR_IA32_VMX_PROCBASED_CTLS2:
*pdata = vmx_control_msr(
msrs->secondary_ctls_low,
msrs->secondary_ctls_high); break; case MSR_IA32_VMX_EPT_VPID_CAP:
*pdata = msrs->ept_caps |
((u64)msrs->vpid_caps << 32); break; case MSR_IA32_VMX_VMFUNC:
*pdata = msrs->vmfunc_controls; break; default: return 1;
}
return 0;
}
/* * Copy the writable VMCS shadow fields back to the VMCS12, in case they have * been modified by the L1 guest. Note, "writable" in this context means * "writable by the guest", i.e. tagged SHADOW_FIELD_RW; the set of * fields tagged SHADOW_FIELD_RO may or may not align with the "read-only" * VM-exit information fields (which are actually writable if the vCPU is * configured to support "VMWRITE to any supported field in the VMCS").
*/ staticvoid copy_shadow_to_vmcs12(struct vcpu_vmx *vmx)
{ struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs; struct vmcs12 *vmcs12 = get_vmcs12(&vmx->vcpu); struct shadow_vmcs_field field; unsignedlong val; int i;
if (WARN_ON(!shadow_vmcs)) return;
preempt_disable();
vmcs_load(shadow_vmcs);
for (i = 0; i < max_shadow_read_write_fields; i++) {
field = shadow_read_write_fields[i];
val = __vmcs_readl(field.encoding);
vmcs12_write_any(vmcs12, field.encoding, field.offset, val);
}
for (q = 0; q < ARRAY_SIZE(fields); q++) { for (i = 0; i < max_fields[q]; i++) {
field = fields[q][i];
val = vmcs12_read_any(vmcs12, field.encoding,
field.offset);
__vmcs_writel(field.encoding, val);
}
}
/* * Currently, KVM only supports eVMCS version 1 * (== KVM_EVMCS_VERSION) and thus we expect guest to set this * value to first u32 field of eVMCS which should specify eVMCS * VersionNumber. * * Guest should be aware of supported eVMCS versions by host by * examining CPUID.0x4000000A.EAX[0:15]. Host userspace VMM is * expected to set this CPUID leaf according to the value * returned in vmcs_version from nested_enable_evmcs(). * * However, it turns out that Microsoft Hyper-V fails to comply * to their own invented interface: When Hyper-V use eVMCS, it * just sets first u32 field of eVMCS to revision_id specified * in MSR_IA32_VMX_BASIC. Instead of used eVMCS version number * which is one of the supported versions specified in * CPUID.0x4000000A.EAX[0:15]. * * To overcome Hyper-V bug, we accept here either a supported * eVMCS version or VMCS12 revision_id as valid values for first * u32 field of eVMCS.
*/ if ((vmx->nested.hv_evmcs->revision_id != KVM_EVMCS_VERSION) &&
(vmx->nested.hv_evmcs->revision_id != VMCS12_REVISION)) {
nested_release_evmcs(vcpu); return EVMPTRLD_VMFAIL;
}
vmx->nested.hv_evmcs_vmptr = evmcs_gpa;
evmcs_gpa_changed = true; /* * Unlike normal vmcs12, enlightened vmcs12 is not fully * reloaded from guest's memory (read only fields, fields not * present in struct hv_enlightened_vmcs, ...). Make sure there * are no leftovers.
*/ if (from_launch) { struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
memset(vmcs12, 0, sizeof(*vmcs12));
vmcs12->hdr.revision_id = VMCS12_REVISION;
}
}
/* * Clean fields data can't be used on VMLAUNCH and when we switch * between different L2 guests as KVM keeps a single VMCS12 per L1.
*/ if (from_launch || evmcs_gpa_changed) {
vmx->nested.hv_evmcs->hv_clean_fields &=
~HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;
/* * A timer value of zero is architecturally guaranteed to cause * a VMExit prior to executing any instructions in the guest.
*/ if (preemption_timeout == 0) {
vmx_preemption_timer_fn(&vmx->nested.preemption_timer); return;
}
/* * If vmcs02 hasn't been initialized, set the constant vmcs02 state * according to L0's settings (vmcs12 is irrelevant here). Host * fields that come from L0 and are not constant, e.g. HOST_CR3, * will be set as needed prior to VMLAUNCH/VMRESUME.
*/ if (vmx->nested.vmcs02_initialized) return;
vmx->nested.vmcs02_initialized = true;
/* * We don't care what the EPTP value is we just need to guarantee * it's valid so we don't get a false positive when doing early * consistency checks.
*/ if (enable_ept && nested_early_check)
vmcs_write64(EPT_POINTER,
construct_eptp(&vmx->vcpu, 0, PT64_ROOT_4LEVEL));
if (vmx->ve_info)
vmcs_write64(VE_INFORMATION_ADDRESS, __pa(vmx->ve_info));
/* All VMFUNCs are currently emulated through L0 vmexits. */ if (cpu_has_vmx_vmfunc())
vmcs_write64(VM_FUNCTION_CONTROL, 0);
if (cpu_has_vmx_posted_intr())
vmcs_write16(POSTED_INTR_NV, POSTED_INTR_NESTED_VECTOR);
if (cpu_has_vmx_msr_bitmap())
vmcs_write64(MSR_BITMAP, __pa(vmx->nested.vmcs02.msr_bitmap));
/* * PML is emulated for L2, but never enabled in hardware as the MMU * handles A/D emulation. Disabling PML for L2 also avoids having to * deal with filtering out L2 GPAs from the buffer.
*/ if (enable_pml) {
vmcs_write64(PML_ADDRESS, 0);
vmcs_write16(GUEST_PML_INDEX, -1);
}
if (cpu_has_vmx_encls_vmexit())
vmcs_write64(ENCLS_EXITING_BITMAP, INVALID_GPA);
if (kvm_notify_vmexit_enabled(kvm))
vmcs_write32(NOTIFY_WINDOW, kvm->arch.notify_window);
/* * Set the MSR load/store lists to match L0's settings. Only the * addresses are constant (for vmcs02), the counts can change based * on L2's behavior, e.g. switching to/from long mode.
*/
vmcs_write64(VM_EXIT_MSR_STORE_ADDR, __pa(vmx->msr_autostore.guest.val));
vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host.val));
vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest.val));
/* * If VPID is disabled, then guest TLB accesses use VPID=0, i.e. the * same VPID as the host. Emulate this behavior by using vpid01 for L2 * if VPID is disabled in vmcs12. Note, if VPID is disabled, VM-Enter * and VM-Exit are architecturally required to flush VPID=0, but *only* * VPID=0. I.e. using vpid02 would be ok (so long as KVM emulates the * required flushes), but doing so would cause KVM to over-flush. E.g. * if L1 runs L2 X with VPID12=1, then runs L2 Y with VPID12 disabled, * and then runs L2 X again, then KVM can and should retain TLB entries * for VPID12=1.
*/ if (enable_vpid) { if (nested_cpu_has_vpid(vmcs12) && vmx->nested.vpid02)
vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->nested.vpid02); else
vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);
}
}
/* * A vmexit (to either L1 hypervisor or L0 userspace) is always needed * for I/O port accesses.
*/
exec_control |= CPU_BASED_UNCOND_IO_EXITING;
exec_control &= ~CPU_BASED_USE_IO_BITMAPS;
/* * This bit will be computed in nested_get_vmcs12_pages, because * we do not have access to L1's MSR bitmap yet. For now, keep * the same bit as before, hoping to avoid multiple VMWRITEs that * only set/clear this bit.
*/
exec_control &= ~CPU_BASED_USE_MSR_BITMAPS;
exec_control |= exec_controls_get(vmx) & CPU_BASED_USE_MSR_BITMAPS;
/* Take the following fields only from vmcs12 */
exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
SECONDARY_EXEC_ENABLE_INVPCID |
SECONDARY_EXEC_ENABLE_RDTSCP |
SECONDARY_EXEC_ENABLE_XSAVES |
SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE |
SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
SECONDARY_EXEC_APIC_REGISTER_VIRT |
SECONDARY_EXEC_ENABLE_VMFUNC |
SECONDARY_EXEC_DESC);
if (nested_cpu_has(vmcs12,
CPU_BASED_ACTIVATE_SECONDARY_CONTROLS))
exec_control |= vmcs12->secondary_vm_exec_control;
/* PML is emulated and never enabled in hardware for L2. */
exec_control &= ~SECONDARY_EXEC_ENABLE_PML;
/* VMCS shadowing for L2 is emulated for now */
exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS;
/* * Preset *DT exiting when emulating UMIP, so that vmx_set_cr4() * will not have to rewrite the controls just for this bit.
*/ if (vmx_umip_emulated() && (vmcs12->guest_cr4 & X86_CR4_UMIP))
exec_control |= SECONDARY_EXEC_DESC;
if (exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY)
vmcs_write16(GUEST_INTR_STATUS,
vmcs12->guest_intr_status);
if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST))
exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST;
if (exec_control & SECONDARY_EXEC_ENCLS_EXITING)
vmx_write_encls_bitmap(&vmx->vcpu, vmcs12);
secondary_exec_controls_set(vmx, exec_control);
}
/* * ENTRY CONTROLS * * vmcs12's VM_{ENTRY,EXIT}_LOAD_IA32_EFER and VM_ENTRY_IA32E_MODE * are emulated by vmx_set_efer() in prepare_vmcs02(), but speculate * on the related bits (if supported by the CPU) in the hope that * we can avoid VMWrites during vmx_set_efer(). * * Similarly, take vmcs01's PERF_GLOBAL_CTRL in the hope that if KVM is * loading PERF_GLOBAL_CTRL via the VMCS for L1, then KVM will want to * do the same for L2.
*/
exec_control = __vm_entry_controls_get(vmcs01);
exec_control |= (vmcs12->vm_entry_controls &
~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL);
exec_control &= ~(VM_ENTRY_IA32E_MODE | VM_ENTRY_LOAD_IA32_EFER); if (cpu_has_load_ia32_efer()) { if (guest_efer & EFER_LMA)
exec_control |= VM_ENTRY_IA32E_MODE; if (guest_efer != kvm_host.efer)
exec_control |= VM_ENTRY_LOAD_IA32_EFER;
}
vm_entry_controls_set(vmx, exec_control);
/* * EXIT CONTROLS * * L2->L1 exit controls are emulated - the hardware exit is to L0 so * we should use its exit controls. Note that VM_EXIT_LOAD_IA32_EFER * bits may be modified by vmx_set_efer() in prepare_vmcs02().
*/
exec_control = __vm_exit_controls_get(vmcs01); if (cpu_has_load_ia32_efer() && guest_efer != kvm_host.efer)
exec_control |= VM_EXIT_LOAD_IA32_EFER; else
exec_control &= ~VM_EXIT_LOAD_IA32_EFER;
vm_exit_controls_set(vmx, exec_control);
/* * L1 may access the L2's PDPTR, so save them to construct * vmcs12
*/ if (enable_ept) {
vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0);
vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1);
vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2);
vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3);
}
if (nested_cpu_has_xsaves(vmcs12))
vmcs_write64(XSS_EXIT_BITMAP, vmcs12->xss_exit_bitmap);
/* * Whether page-faults are trapped is determined by a combination of * 3 settings: PFEC_MASK, PFEC_MATCH and EXCEPTION_BITMAP.PF. If L0 * doesn't care about page faults then we should set all of these to * L1's desires. However, if L0 does care about (some) page faults, it * is not easy (if at all possible?) to merge L0 and L1's desires, we * simply ask to exit on each and every L2 page fault. This is done by * setting MASK=MATCH=0 and (see below) EB.PF=1. * Note that below we don't need special code to set EB.PF beyond the * "or"ing of the EB of vmcs01 and vmcs12, because when enable_ept, * vmcs01's EB.PF is 0 so the "or" will take vmcs12's value, and when * !enable_ept, EB.PF is 1, so the "or" will always be 1.
*/ if (vmx_need_pf_intercept(&vmx->vcpu)) { /* * TODO: if both L0 and L1 need the same MASK and MATCH, * go ahead and use it?
*/
vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0);
vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0);
} else {
vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, vmcs12->page_fault_error_code_mask);
vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, vmcs12->page_fault_error_code_match);
}
/* * Make sure the msr_autostore list is up to date before we set the * count in the vmcs02.
*/
prepare_vmx_msr_autostore_list(&vmx->vcpu, MSR_IA32_TSC);
/* * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it * with L0's requirements for its guest (a.k.a. vmcs01), so we can run the L2 * guest in a way that will both be appropriate to L1's requests, and our * needs. In addition to modifying the active vmcs (which is vmcs02), this * function also has additional necessary side-effects, like setting various * vcpu->arch fields. * Returns 0 on success, 1 on failure. Invalid state exit qualification code * is assigned to entry_failure_code on failure.
*/ staticint prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, bool from_vmentry, enum vm_entry_failure_code *entry_failure_code)
{ struct vcpu_vmx *vmx = to_vmx(vcpu); struct hv_enlightened_vmcs *evmcs = nested_vmx_evmcs(vmx); bool load_guest_pdptrs_vmcs12 = false;
if (vmx->nested.dirty_vmcs12 || nested_vmx_is_evmptr12_valid(vmx)) {
prepare_vmcs02_rare(vmx, vmcs12);
vmx->nested.dirty_vmcs12 = false;
/* EXCEPTION_BITMAP and CR0_GUEST_HOST_MASK should basically be the * bitwise-or of what L1 wants to trap for L2, and what we want to * trap. Note that CR0.TS also needs updating - we do this later.
*/
vmx_update_exception_bitmap(vcpu);
vcpu->arch.cr0_guest_owned_bits &= ~vmcs12->cr0_guest_host_mask;
vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);
if (nested_cpu_has_ept(vmcs12))
nested_ept_init_mmu_context(vcpu);
/* * Override the CR0/CR4 read shadows after setting the effective guest * CR0/CR4. The common helpers also set the shadows, but they don't * account for vmcs12's cr0/4_guest_host_mask.
*/
vmx_set_cr0(vcpu, vmcs12->guest_cr0);
vmcs_writel(CR0_READ_SHADOW, nested_read_cr0(vmcs12));
vcpu->arch.efer = nested_vmx_calc_efer(vmx, vmcs12); /* Note: may modify VM_ENTRY/EXIT_CONTROLS and GUEST/HOST_IA32_EFER */
vmx_set_efer(vcpu, vcpu->arch.efer);
/* * Guest state is invalid and unrestricted guest is disabled, * which means L1 attempted VMEntry to L2 with invalid state. * Fail the VMEntry. * * However when force loading the guest state (SMM exit or * loading nested state after migration, it is possible to * have invalid guest state now, which will be later fixed by * restoring L2 register state
*/ if (CC(from_vmentry && !vmx_guest_state_valid(vcpu))) {
*entry_failure_code = ENTRY_FAIL_DEFAULT; return -EINVAL;
}
/* Shadow page tables on either EPT or shadow page tables. */ if (nested_vmx_load_cr3(vcpu, vmcs12->guest_cr3, nested_cpu_has_ept(vmcs12),
from_vmentry, entry_failure_code)) return -EINVAL;
/* * Immediately write vmcs02.GUEST_CR3. It will be propagated to vmcs12 * on nested VM-Exit, which can occur without actually running L2 and * thus without hitting vmx_load_mmu_pgd(), e.g. if L1 is entering L2 with * vmcs12.GUEST_ACTIVITYSTATE=HLT, in which case KVM will intercept the * transition to HLT instead of running L2.
*/ if (enable_ept)
vmcs_writel(GUEST_CR3, vmcs12->guest_cr3);
/* Late preparation of GUEST_PDPTRs now that EFER and CRs are set. */ if (load_guest_pdptrs_vmcs12 && nested_cpu_has_ept(vmcs12) &&
is_pae_paging(vcpu)) {
vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0);
vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1);
vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2);
vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3);
}
/* * It was observed that genuine Hyper-V running in L1 doesn't reset * 'hv_clean_fields' by itself, it only sets the corresponding dirty * bits when it changes a field in eVMCS. Mark all fields as clean * here.
*/ if (nested_vmx_is_evmptr12_valid(vmx))
evmcs->hv_clean_fields |= HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;
staticbool is_l1_noncanonical_address_on_vmexit(u64 la, struct vmcs12 *vmcs12)
{ /* * Check that the given linear address is canonical after a VM exit * from L2, based on HOST_CR4.LA57 value that will be loaded for L1.
*/
u8 l1_address_bits_on_exit = (vmcs12->host_cr4 & X86_CR4_LA57) ? 57 : 48;
/* * If the load IA32_EFER VM-exit control is 1, bits reserved in the * IA32_EFER MSR must be 0 in the field for that register. In addition, * the values of the LMA and LME bits in the field must each be that of * the host address-space size VM-exit control.
*/ if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) { if (CC(!kvm_valid_efer(vcpu, vmcs12->host_ia32_efer)) ||
CC(ia32e != !!(vmcs12->host_ia32_efer & EFER_LMA)) ||
CC(ia32e != !!(vmcs12->host_ia32_efer & EFER_LME))) return -EINVAL;
}
/* * If the load IA32_EFER VM-entry control is 1, the following checks * are performed on the field for the IA32_EFER MSR: * - Bits reserved in the IA32_EFER MSR must be 0. * - Bit 10 (corresponding to IA32_EFER.LMA) must equal the value of * the IA-32e mode guest VM-exit control. It must also be identical * to bit 8 (LME) if bit 31 in the CR0 field (corresponding to * CR0.PG) is 1.
*/ if (to_vmx(vcpu)->nested.nested_run_pending &&
(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)) { if (CC(!kvm_valid_efer(vcpu, vmcs12->guest_ia32_efer)) ||
CC(ia32e != !!(vmcs12->guest_ia32_efer & EFER_LMA)) ||
CC(((vmcs12->guest_cr0 & X86_CR0_PG) &&
ia32e != !!(vmcs12->guest_ia32_efer & EFER_LME)))) return -EINVAL;
}
if (vmx->msr_autoload.host.nr)
vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0); if (vmx->msr_autoload.guest.nr)
vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0);
preempt_disable();
vmx_prepare_switch_to_guest(vcpu);
/* * Induce a consistency check VMExit by clearing bit 1 in GUEST_RFLAGS, * which is reserved to '1' by hardware. GUEST_RFLAGS is guaranteed to * be written (by prepare_vmcs02()) before the "real" VMEnter, i.e. * there is no need to preserve other bits or save/restore the field.
*/
vmcs_writel(GUEST_RFLAGS, 0);
if (vmx->msr_autoload.host.nr)
vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr); if (vmx->msr_autoload.guest.nr)
vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr);
if (vm_fail) {
u32 error = vmcs_read32(VM_INSTRUCTION_ERROR);
/* * VMExit clears RFLAGS.IF and DR7, even on a consistency check.
*/ if (hw_breakpoint_active())
set_debugreg(__this_cpu_read(cpu_dr7), 7);
local_irq_enable();
preempt_enable();
/* * A non-failing VMEntry means we somehow entered guest mode with * an illegal RIP, and that's just the tip of the iceberg. There * is no telling what memory has been modified or what state has * been exposed to unknown code. Hitting this all but guarantees * a (very critical) hardware issue.
*/
WARN_ON(!(vmcs_read32(VM_EXIT_REASON) &
VMX_EXIT_REASONS_FAILED_VMENTRY));
/* * hv_evmcs may end up being not mapped after migration (when * L2 was running), map it here to make sure vmcs12 changes are * properly reflected.
*/ if (guest_cpu_cap_has_evmcs(vcpu) &&
vmx->nested.hv_evmcs_vmptr == EVMPTR_MAP_PENDING) { enum nested_evmptrld_status evmptrld_status =
nested_vmx_handle_enlightened_vmptrld(vcpu, false);
if (evmptrld_status == EVMPTRLD_VMFAIL ||
evmptrld_status == EVMPTRLD_ERROR) returnfalse;
/* * Post migration VMCS12 always provides the most actual * information, copy it to eVMCS upon entry.
*/
vmx->nested.need_vmcs12_to_shadow_sync = true;
}
if (!vcpu->arch.pdptrs_from_userspace &&
!nested_cpu_has_ept(vmcs12) && is_pae_paging(vcpu)) { /* * Reload the guest's PDPTRs since after a migration * the guest CR3 might be restored prior to setting the nested * state which can lead to a load of wrong PDPTRs.
*/ if (CC(!load_pdptrs(vcpu, vcpu->arch.cr3))) returnfalse;
}
if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) {
map = &vmx->nested.apic_access_page_map;
if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->apic_access_addr), map)) {
vmcs_write64(APIC_ACCESS_ADDR, pfn_to_hpa(map->pfn));
} else {
pr_debug_ratelimited("%s: no backing for APIC-access address in vmcs12\n",
__func__);
vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
vcpu->run->internal.suberror =
KVM_INTERNAL_ERROR_EMULATION;
vcpu->run->internal.ndata = 0; returnfalse;
}
}
if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) {
map = &vmx->nested.virtual_apic_map;
if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->virtual_apic_page_addr), map)) {
vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, pfn_to_hpa(map->pfn));
} elseif (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING) &&
nested_cpu_has(vmcs12, CPU_BASED_CR8_STORE_EXITING) &&
!nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) { /* * The processor will never use the TPR shadow, simply * clear the bit from the execution control. Such a * configuration is useless, but it happens in tests. * For any other configuration, failing the vm entry is * _not_ what the processor does but it's basically the * only possibility we have.
*/
exec_controls_clearbit(vmx, CPU_BASED_TPR_SHADOW);
} else { /* * Write an illegal value to VIRTUAL_APIC_PAGE_ADDR to * force VM-Entry to fail.
*/
vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, INVALID_GPA);
}
}
if (nested_cpu_has_posted_intr(vmcs12)) {
map = &vmx->nested.pi_desc_map;
if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->posted_intr_desc_addr), map)) {
vmx->nested.pi_desc =
(struct pi_desc *)(((void *)map->hva) +
offset_in_page(vmcs12->posted_intr_desc_addr));
vmcs_write64(POSTED_INTR_DESC_ADDR,
pfn_to_hpa(map->pfn) + offset_in_page(vmcs12->posted_intr_desc_addr));
} else { /* * Defer the KVM_INTERNAL_EXIT until KVM tries to * access the contents of the VMCS12 posted interrupt * descriptor. (Note that KVM may do this when it * should not, per the architectural specification.)
*/
vmx->nested.pi_desc = NULL;
pin_controls_clearbit(vmx, PIN_BASED_POSTED_INTR);
}
} if (nested_vmx_prepare_msr_bitmap(vcpu, vmcs12))
exec_controls_setbit(vmx, CPU_BASED_USE_MSR_BITMAPS); else
exec_controls_clearbit(vmx, CPU_BASED_USE_MSR_BITMAPS);
returntrue;
}
staticbool vmx_get_nested_state_pages(struct kvm_vcpu *vcpu)
{ #ifdef CONFIG_KVM_HYPERV /* * Note: nested_get_evmcs_page() also updates 'vp_assist_page' copy * in 'struct kvm_vcpu_hv' in case eVMCS is in use, this is mandatory * to make nested_evmcs_l2_tlb_flush_enabled() work correctly post * migration.
*/ if (!nested_get_evmcs_page(vcpu)) {
pr_debug_ratelimited("%s: enlightened vmptrld failed\n",
__func__);
vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
vcpu->run->internal.suberror =
KVM_INTERNAL_ERROR_EMULATION;
vcpu->run->internal.ndata = 0;
returnfalse;
} #endif
if (is_guest_mode(vcpu) && !nested_get_vmcs12_pages(vcpu)) returnfalse;
/* * Check if PML is enabled for the nested guest. Whether eptp bit 6 is * set is already checked as part of A/D emulation.
*/
vmcs12 = get_vmcs12(vcpu); if (!nested_cpu_has_pml(vmcs12)) return 0;
if (kvm_write_guest_page(vcpu->kvm, gpa_to_gfn(dst), &gpa,
offset_in_page(dst), sizeof(gpa))) return 0;
vmcs12->guest_pml_index--;
return 0;
}
/* * Intel's VMX Instruction Reference specifies a common set of prerequisites * for running VMX instructions (except VMXON, whose prerequisites are * slightly different). It also specifies what exception to inject otherwise. * Note that many of these exceptions have priority over VM exits, so they * don't have to be checked again here.
*/ staticint nested_vmx_check_permission(struct kvm_vcpu *vcpu)
{ if (!to_vmx(vcpu)->nested.vmxon) {
kvm_queue_exception(vcpu, UD_VECTOR); return 0;
}
if (vmx_get_cpl(vcpu)) {
kvm_inject_gp(vcpu, 0); return 0;
}
if (!vmx->nested.nested_run_pending ||
!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS))
vmx->nested.pre_vmenter_debugctl = vmx_guest_debugctl_read(); if (kvm_mpx_supported() &&
(!vmx->nested.nested_run_pending ||
!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)))
vmx->nested.pre_vmenter_bndcfgs = vmcs_read64(GUEST_BNDCFGS);
/* * Overwrite vmcs01.GUEST_CR3 with L1's CR3 if EPT is disabled *and* * nested early checks are disabled. In the event of a "late" VM-Fail, * i.e. a VM-Fail detected by hardware but not KVM, KVM must unwind its * software model to the pre-VMEntry host state. When EPT is disabled, * GUEST_CR3 holds KVM's shadow CR3, not L1's "real" CR3, which causes * nested_vmx_restore_host_state() to corrupt vcpu->arch.cr3. Stuffing * vmcs01.GUEST_CR3 results in the unwind naturally setting arch.cr3 to * the correct value. Smashing vmcs01.GUEST_CR3 is safe because nested * VM-Exits, and the unwind, reset KVM's MMU, i.e. vmcs01.GUEST_CR3 is * guaranteed to be overwritten with a shadow CR3 prior to re-entering * L1. Don't stuff vmcs01.GUEST_CR3 when using nested early checks as * KVM modifies vcpu->arch.cr3 if and only if the early hardware checks * pass, and early VM-Fails do not reset KVM's MMU, i.e. the VM-Fail * path would need to manually save/restore vmcs01.GUEST_CR3.
*/ if (!enable_ept && !nested_early_check)
vmcs_writel(GUEST_CR3, vcpu->arch.cr3);
vmx_switch_vmcs(vcpu, &vmx->nested.vmcs02);
prepare_vmcs02_early(vmx, &vmx->vmcs01, vmcs12);
if (from_vmentry) { if (unlikely(!nested_get_vmcs12_pages(vcpu))) {
vmx_switch_vmcs(vcpu, &vmx->vmcs01); return NVMX_VMENTRY_KVM_INTERNAL_ERROR;
}
if (nested_vmx_check_vmentry_hw(vcpu)) {
vmx_switch_vmcs(vcpu, &vmx->vmcs01); return NVMX_VMENTRY_VMFAIL;
}
if (from_vmentry) {
failed_index = nested_vmx_load_msr(vcpu,
vmcs12->vm_entry_msr_load_addr,
vmcs12->vm_entry_msr_load_count); if (failed_index) {
exit_reason.basic = EXIT_REASON_MSR_LOAD_FAIL;
vmcs12->exit_qualification = failed_index; goto vmentry_fail_vmexit_guest_mode;
}
} else { /* * The MMU is not initialized to point at the right entities yet and * "get pages" would need to read data from the guest (i.e. we will * need to perform gpa to hpa translation). Request a call * to nested_get_vmcs12_pages before the next VM-entry. The MSRs * have already been set at vmentry time and should not be reset.
*/
kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
}
/* * Re-evaluate pending events if L1 had a pending IRQ/NMI/INIT/SIPI * when it executed VMLAUNCH/VMRESUME, as entering non-root mode can * effectively unblock various events, e.g. INIT/SIPI cause VM-Exit * unconditionally. Take care to pull data from vmcs01 as appropriate, * e.g. when checking for interrupt windows, as vmcs02 is now loaded.
*/ if ((__exec_controls_get(&vmx->vmcs01) & (CPU_BASED_INTR_WINDOW_EXITING |
CPU_BASED_NMI_WINDOW_EXITING)) ||
kvm_apic_has_pending_init_or_sipi(vcpu) ||
kvm_apic_has_interrupt(vcpu))
kvm_make_request(KVM_REQ_EVENT, vcpu);
/* * Do not start the preemption timer hrtimer until after we know * we are successful, so that only nested_vmx_vmexit needs to cancel * the timer.
*/
vmx->nested.preemption_timer_expired = false; if (nested_cpu_has_preemption_timer(vmcs12)) {
u64 timer_value = vmx_calc_preemption_timer_value(vcpu);
vmx_start_preemption_timer(vcpu, timer_value);
}
/* * Note no nested_vmx_succeed or nested_vmx_fail here. At this point * we are no longer running L1, and VMLAUNCH/VMRESUME has not yet * returned as far as L1 is concerned. It will only return (and set * the success flag) when L2 exits (see nested_vmx_vmexit()).
*/ return NVMX_VMENTRY_SUCCESS;
/* * A failed consistency check that leads to a VMExit during L1's * VMEnter to L2 is a variation of a normal VMexit, as explained in * 26.7 "VM-entry failures during or after loading guest state".
*/
vmentry_fail_vmexit_guest_mode: if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETTING)
vcpu->arch.tsc_offset -= vmcs12->tsc_offset;
leave_guest_mode(vcpu);
if (CC(evmptrld_status == EVMPTRLD_VMFAIL)) return nested_vmx_failInvalid(vcpu);
if (CC(!nested_vmx_is_evmptr12_valid(vmx) &&
vmx->nested.current_vmptr == INVALID_GPA)) return nested_vmx_failInvalid(vcpu);
vmcs12 = get_vmcs12(vcpu);
/* * Can't VMLAUNCH or VMRESUME a shadow VMCS. Despite the fact * that there *is* a valid VMCS pointer, RFLAGS.CF is set * rather than RFLAGS.ZF, and no error number is stored to the * VM-instruction error field.
*/ if (CC(vmcs12->hdr.shadow_vmcs)) return nested_vmx_failInvalid(vcpu);
if (nested_vmx_is_evmptr12_valid(vmx)) { struct hv_enlightened_vmcs *evmcs = nested_vmx_evmcs(vmx);
copy_enlightened_to_vmcs12(vmx, evmcs->hv_clean_fields); /* Enlightened VMCS doesn't have launch state */
vmcs12->launch_state = !launch;
} elseif (enable_shadow_vmcs) {
copy_shadow_to_vmcs12(vmx);
}
/* * The nested entry process starts with enforcing various prerequisites * on vmcs12 as required by the Intel SDM, and act appropriately when * they fail: As the SDM explains, some conditions should cause the * instruction to fail, while others will cause the instruction to seem * to succeed, but return an EXIT_REASON_INVALID_STATE. * To speed up the normal (success) code path, we should avoid checking * for misconfigurations which will anyway be caught by the processor * when using the merged vmcs02.
*/ if (CC(interrupt_shadow & KVM_X86_SHADOW_INT_MOV_SS)) return nested_vmx_fail(vcpu, VMXERR_ENTRY_EVENTS_BLOCKED_BY_MOV_SS);
if (nested_vmx_check_controls(vcpu, vmcs12)) return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
if (nested_vmx_check_address_space_size(vcpu, vmcs12)) return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_HOST_STATE_FIELD);
if (nested_vmx_check_host_state(vcpu, vmcs12)) return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_HOST_STATE_FIELD);
/* * We're finally done with prerequisite checking, and can start with * the nested entry.
*/
vmx->nested.nested_run_pending = 1;
vmx->nested.has_preemption_timer_deadline = false;
status = nested_vmx_enter_non_root_mode(vcpu, true); if (unlikely(status != NVMX_VMENTRY_SUCCESS)) goto vmentry_failed;
/* Hide L1D cache contents from the nested guest. */
vmx->vcpu.arch.l1tf_flush_l1d = true;
/* * Must happen outside of nested_vmx_enter_non_root_mode() as it will * also be used as part of restoring nVMX state for * snapshot restore (migration). * * In this flow, it is assumed that vmcs12 cache was * transferred as part of captured nVMX state and should * therefore not be read from guest memory (which may not * exist on destination host yet).
*/
nested_cache_shadow_vmcs12(vcpu, vmcs12);
switch (vmcs12->guest_activity_state) { case GUEST_ACTIVITY_HLT: /* * If we're entering a halted L2 vcpu and the L2 vcpu won't be * awakened by event injection or by an NMI-window VM-exit or * by an interrupt-window VM-exit, halt the vcpu.
*/ if (!(vmcs12->vm_entry_intr_info_field & INTR_INFO_VALID_MASK) &&
!nested_cpu_has(vmcs12, CPU_BASED_NMI_WINDOW_EXITING) &&
!(nested_cpu_has(vmcs12, CPU_BASED_INTR_WINDOW_EXITING) &&
(vmcs12->guest_rflags & X86_EFLAGS_IF))) {
vmx->nested.nested_run_pending = 0; return kvm_emulate_halt_noskip(vcpu);
} break; case GUEST_ACTIVITY_WAIT_SIPI:
vmx->nested.nested_run_pending = 0;
kvm_set_mp_state(vcpu, KVM_MP_STATE_INIT_RECEIVED); break; default: break;
}
/* * On a nested exit from L2 to L1, vmcs12.guest_cr0 might not be up-to-date * because L2 may have changed some cr0 bits directly (CR0_GUEST_HOST_MASK). * This function returns the new value we should put in vmcs12.guest_cr0. * It's not enough to just return the vmcs02 GUEST_CR0. Rather, * 1. Bits that neither L0 nor L1 trapped, were set directly by L2 and are now * available in vmcs02 GUEST_CR0. (Note: It's enough to check that L0 * didn't trap the bit, because if L1 did, so would L0). * 2. Bits that L1 asked to trap (and therefore L0 also did) could not have * been modified by L2, and L1 knows it. So just leave the old value of * the bit from vmcs12.guest_cr0. Note that the bit from vmcs02 GUEST_CR0 * isn't relevant, because if L0 traps this bit it can set it to anything. * 3. Bits that L1 didn't trap, but L0 did. L1 believes the guest could have * changed these bits, and therefore they need to be updated, but L0 * didn't necessarily allow them to be changed in GUEST_CR0 - and rather * put them in vmcs02 CR0_READ_SHADOW. So take these bits from there.
*/ staticinlineunsignedlong
vmcs12_guest_cr0(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
{ return /*1*/ (vmcs_readl(GUEST_CR0) & vcpu->arch.cr0_guest_owned_bits) | /*2*/ (vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask) | /*3*/ (vmcs_readl(CR0_READ_SHADOW) & ~(vmcs12->cr0_guest_host_mask |
vcpu->arch.cr0_guest_owned_bits));
}
/* * Per the SDM, VM-Exits due to double and triple faults are never * considered to occur during event delivery, even if the double/triple * fault is the result of an escalating vectoring issue. * * Note, the SDM qualifies the double fault behavior with "The original * event results in a double-fault exception". It's unclear why the * qualification exists since exits due to double fault can occur only * while vectoring a different exception (injected events are never * subject to interception), i.e. there's _always_ an original event. * * The SDM also uses NMI as a confusing example for the "original event * causes the VM exit directly" clause. NMI isn't special in any way, * the same rule applies to all events that cause an exit directly. * NMI is an odd choice for the example because NMIs can only occur on * instruction boundaries, i.e. they _can't_ occur during vectoring.
*/ if ((u16)vm_exit_reason == EXIT_REASON_TRIPLE_FAULT ||
((u16)vm_exit_reason == EXIT_REASON_EXCEPTION_NMI &&
is_double_fault(exit_intr_info))) {
vmcs12->idt_vectoring_info_field = 0;
} elseif (vcpu->arch.exception.injected) {
nr = vcpu->arch.exception.vector;
idt_vectoring = nr | VECTORING_INFO_VALID_MASK;
/* * Unlike AMD's Paged Real Mode, which reports an error code on #PF * VM-Exits even if the CPU is in Real Mode, Intel VMX never sets the * "has error code" flags on VM-Exit if the CPU is in Real Mode.
*/ if (ex->has_error_code && is_protmode(vcpu)) { /* * Intel CPUs do not generate error codes with bits 31:16 set, * and more importantly VMX disallows setting bits 31:16 in the * injected error code for VM-Entry. Drop the bits to mimic * hardware and avoid inducing failure on nested VM-Entry if L1 * chooses to inject the exception back to L2. AMD CPUs _do_ * generate "full" 32-bit error codes, so KVM allows userspace * to inject exception error codes with bits 31:16 set.
*/
vmcs12->vm_exit_intr_error_code = (u16)ex->error_code;
intr_info |= INTR_INFO_DELIVER_CODE_MASK;
}
if (kvm_exception_is_soft(ex->vector))
intr_info |= INTR_TYPE_SOFT_EXCEPTION; else
intr_info |= INTR_TYPE_HARD_EXCEPTION;
if (!(vmcs12->idt_vectoring_info_field & VECTORING_INFO_VALID_MASK) &&
vmx_get_nmi_mask(vcpu))
intr_info |= INTR_INFO_UNBLOCK_NMI;
/* * Returns true if a debug trap is (likely) pending delivery. Infer the class * of a #DB (trap-like vs. fault-like) from the exception payload (to-be-DR6). * Using the payload is flawed because code breakpoints (fault-like) and data * breakpoints (trap-like) set the same bits in DR6 (breakpoint detected), i.e. * this will return false positives if a to-be-injected code breakpoint #DB is * pending (from KVM's perspective, but not "pending" across an instruction * boundary). ICEBP, a.k.a. INT1, is also not reflected here even though it * too is trap-like. * * KVM "works" despite these flaws as ICEBP isn't currently supported by the * emulator, Monitor Trap Flag is not marked pending on intercepted #DBs (the * #DB has already happened), and MTF isn't marked pending on code breakpoints * from the emulator (because such #DBs are fault-like and thus don't trigger * actions that fire on instruction retire).
*/ staticunsignedlong vmx_get_pending_dbg_trap(struct kvm_queued_exception *ex)
{ if (!ex->pending || ex->vector != DB_VECTOR) return 0;
/* General Detect #DBs are always fault-like. */ return ex->payload & ~DR6_BD;
}
/* * Returns true if there's a pending #DB exception that is lower priority than * a pending Monitor Trap Flag VM-Exit. TSS T-flag #DBs are not emulated by * KVM, but could theoretically be injected by userspace. Note, this code is * imperfect, see above.
*/ staticbool vmx_is_low_priority_db_trap(struct kvm_queued_exception *ex)
{ return vmx_get_pending_dbg_trap(ex) & ~DR6_BT;
}
/* * Certain VM-exits set the 'pending debug exceptions' field to indicate a * recognized #DB (data or single-step) that has yet to be delivered. Since KVM * represents these debug traps with a payload that is said to be compatible * with the 'pending debug exceptions' field, write the payload to the VMCS * field if a VM-exit is delivered before the debug trap.
*/ staticvoid nested_vmx_update_pending_dbg(struct kvm_vcpu *vcpu)
{ unsignedlong pending_dbg;
pending_dbg = vmx_get_pending_dbg_trap(&vcpu->arch.exception); if (pending_dbg)
vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, pending_dbg);
}
if (nested_vmx_preemption_timer_pending(vcpu) ||
vmx->nested.mtf_pending) returntrue;
/* * Virtual Interrupt Delivery doesn't require manual injection. Either * the interrupt is already in GUEST_RVI and will be recognized by CPU * at VM-Entry, or there is a KVM_REQ_EVENT pending and KVM will move * the interrupt from the PIR to RVI prior to entering the guest.
*/ if (for_injection) returnfalse;
if (!nested_cpu_has_vid(get_vmcs12(vcpu)) ||
__vmx_interrupt_blocked(vcpu)) returnfalse;
/* * Per the Intel SDM's table "Priority Among Concurrent Events", with minor * edits to fill in missing examples, e.g. #DB due to split-lock accesses, * and less minor edits to splice in the priority of VMX Non-Root specific * events, e.g. MTF and NMI/INTR-window exiting. * * 1 Hardware Reset and Machine Checks * - RESET * - Machine Check * * 2 Trap on Task Switch * - T flag in TSS is set (on task switch) * * 3 External Hardware Interventions * - FLUSH * - STOPCLK * - SMI * - INIT * * 3.5 Monitor Trap Flag (MTF) VM-exit[1] * * 4 Traps on Previous Instruction * - Breakpoints * - Trap-class Debug Exceptions (#DB due to TF flag set, data/I-O * breakpoint, or #DB due to a split-lock access) * * 4.3 VMX-preemption timer expired VM-exit * * 4.6 NMI-window exiting VM-exit[2] * * 5 Nonmaskable Interrupts (NMI) * * 5.5 Interrupt-window exiting VM-exit and Virtual-interrupt delivery * * 6 Maskable Hardware Interrupts * * 7 Code Breakpoint Fault * * 8 Faults from Fetching Next Instruction * - Code-Segment Limit Violation * - Code Page Fault * - Control protection exception (missing ENDBRANCH at target of indirect * call or jump) * * 9 Faults from Decoding Next Instruction * - Instruction length > 15 bytes * - Invalid Opcode * - Coprocessor Not Available * *10 Faults on Executing Instruction * - Overflow * - Bound error * - Invalid TSS * - Segment Not Present * - Stack fault * - General Protection * - Data Page Fault * - Alignment Check * - x86 FPU Floating-point exception * - SIMD floating-point exception * - Virtualization exception * - Control protection exception * * [1] Per the "Monitor Trap Flag" section: System-management interrupts (SMIs), * INIT signals, and higher priority events take priority over MTF VM exits. * MTF VM exits take priority over debug-trap exceptions and lower priority * events. * * [2] Debug-trap exceptions and higher priority events take priority over VM exits * caused by the VMX-preemption timer. VM exits caused by the VMX-preemption * timer take priority over VM exits caused by the "NMI-window exiting" * VM-execution control and lower priority events. * * [3] Debug-trap exceptions and higher priority events take priority over VM exits * caused by "NMI-window exiting". VM exits caused by this control take * priority over non-maskable interrupts (NMIs) and lower priority events. * * [4] Virtual-interrupt delivery has the same priority as that of VM exits due to * the 1-setting of the "interrupt-window exiting" VM-execution control. Thus, * non-maskable interrupts (NMIs) and higher priority events take priority over * delivery of a virtual interrupt; delivery of a virtual interrupt takes * priority over external interrupts and lower priority events.
*/ staticint vmx_check_nested_events(struct kvm_vcpu *vcpu)
{ struct kvm_lapic *apic = vcpu->arch.apic; struct vcpu_vmx *vmx = to_vmx(vcpu); /* * Only a pending nested run blocks a pending exception. If there is a * previously injected event, the pending exception occurred while said * event was being delivered and thus needs to be handled.
*/ bool block_nested_exceptions = vmx->nested.nested_run_pending; /* * Events that don't require injection, i.e. that are virtualized by * hardware, aren't blocked by a pending VM-Enter as KVM doesn't need * to regain control in order to deliver the event, and hardware will * handle event ordering, e.g. with respect to injected exceptions. * * But, new events (not exceptions) are only recognized at instruction * boundaries. If an event needs reinjection, then KVM is handling a * VM-Exit that occurred _during_ instruction execution; new events, * irrespective of whether or not they're injected, are blocked until * the instruction completes.
*/ bool block_non_injected_events = kvm_event_needs_reinjection(vcpu); /* * Inject events are blocked by nested VM-Enter, as KVM is responsible * for managing priority between concurrent events, i.e. KVM needs to * wait until after VM-Enter completes to deliver injected events.
*/ bool block_nested_events = block_nested_exceptions ||
block_non_injected_events;
if (lapic_in_kernel(vcpu) &&
test_bit(KVM_APIC_INIT, &apic->pending_events)) { if (block_nested_events) return -EBUSY;
nested_vmx_update_pending_dbg(vcpu);
clear_bit(KVM_APIC_INIT, &apic->pending_events); if (vcpu->arch.mp_state != KVM_MP_STATE_INIT_RECEIVED)
nested_vmx_vmexit(vcpu, EXIT_REASON_INIT_SIGNAL, 0, 0);
/* MTF is discarded if the vCPU is in WFS. */
vmx->nested.mtf_pending = false; return 0;
}
if (lapic_in_kernel(vcpu) &&
test_bit(KVM_APIC_SIPI, &apic->pending_events)) { if (block_nested_events) return -EBUSY;
clear_bit(KVM_APIC_SIPI, &apic->pending_events); if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) {
nested_vmx_vmexit(vcpu, EXIT_REASON_SIPI_SIGNAL, 0,
apic->sipi_vector & 0xFFUL); return 0;
} /* Fallthrough, the SIPI is completely ignored. */
}
/* * Process exceptions that are higher priority than Monitor Trap Flag: * fault-like exceptions, TSS T flag #DB (not emulated by KVM, but * could theoretically come in from userspace), and ICEBP (INT1). * * TODO: SMIs have higher priority than MTF and trap-like #DBs (except * for TSS T flag #DBs). KVM also doesn't save/restore pending MTF * across SMI/RSM as it should; that needs to be addressed in order to * prioritize SMI over MTF and trap-like #DBs.
*/ if (vcpu->arch.exception_vmexit.pending &&
!vmx_is_low_priority_db_trap(&vcpu->arch.exception_vmexit)) { if (block_nested_exceptions) return -EBUSY;
if (vcpu->arch.exception.pending) { if (block_nested_exceptions) return -EBUSY; goto no_vmexit;
}
if (nested_vmx_preemption_timer_pending(vcpu)) { if (block_nested_events) return -EBUSY;
nested_vmx_vmexit(vcpu, EXIT_REASON_PREEMPTION_TIMER, 0, 0); return 0;
}
if (vcpu->arch.smi_pending && !is_smm(vcpu)) { if (block_nested_events) return -EBUSY; goto no_vmexit;
}
if (vcpu->arch.nmi_pending && !vmx_nmi_blocked(vcpu)) { if (block_nested_events) return -EBUSY; if (!nested_exit_on_nmi(vcpu)) goto no_vmexit;
nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI,
NMI_VECTOR | INTR_TYPE_NMI_INTR |
INTR_INFO_VALID_MASK, 0); /* * The NMI-triggered VM exit counts as injection: * clear this one and block further NMIs.
*/
vcpu->arch.nmi_pending = 0;
vmx_set_nmi_mask(vcpu, true); return 0;
}
if (kvm_cpu_has_interrupt(vcpu) && !vmx_interrupt_blocked(vcpu)) { int irq;
if (!nested_exit_on_intr(vcpu)) { if (block_nested_events) return -EBUSY;
goto no_vmexit;
}
if (!nested_exit_intr_ack_set(vcpu)) { if (block_nested_events) return -EBUSY;
irq = kvm_apic_has_interrupt(vcpu); if (WARN_ON_ONCE(irq < 0)) goto no_vmexit;
/* * If the IRQ is L2's PI notification vector, process posted * interrupts for L2 instead of injecting VM-Exit, as the * detection/morphing architecturally occurs when the IRQ is * delivered to the CPU. Note, only interrupts that are routed * through the local APIC trigger posted interrupt processing, * and enabling posted interrupts requires ACK-on-exit.
*/ if (irq == vmx->nested.posted_intr_nv) { /* * Nested posted interrupts are delivered via RVI, i.e. * aren't injected by KVM, and so can be queued even if * manual event injection is disallowed.
*/ if (block_non_injected_events) return -EBUSY;
/* * ACK the interrupt _after_ emulating VM-Exit, as the IRQ must * be marked as in-service in vmcs01.GUEST_INTERRUPT_STATUS.SVI * if APICv is active.
*/
kvm_apic_ack_interrupt(vcpu, irq); return 0;
}
value = ktime_to_ns(remaining) * vcpu->arch.virtual_tsc_khz;
do_div(value, 1000000); return value >> VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE;
}
staticbool is_vmcs12_ext_field(unsignedlong field)
{ switch (field) { case GUEST_ES_SELECTOR: case GUEST_CS_SELECTOR: case GUEST_SS_SELECTOR: case GUEST_DS_SELECTOR: case GUEST_FS_SELECTOR: case GUEST_GS_SELECTOR: case GUEST_LDTR_SELECTOR: case GUEST_TR_SELECTOR: case GUEST_ES_LIMIT: case GUEST_CS_LIMIT: case GUEST_SS_LIMIT: case GUEST_DS_LIMIT: case GUEST_FS_LIMIT: case GUEST_GS_LIMIT: case GUEST_LDTR_LIMIT: case GUEST_TR_LIMIT: case GUEST_GDTR_LIMIT: case GUEST_IDTR_LIMIT: case GUEST_ES_AR_BYTES: case GUEST_DS_AR_BYTES: case GUEST_FS_AR_BYTES: case GUEST_GS_AR_BYTES: case GUEST_LDTR_AR_BYTES: case GUEST_TR_AR_BYTES: case GUEST_ES_BASE: case GUEST_CS_BASE: case GUEST_SS_BASE: case GUEST_DS_BASE: case GUEST_FS_BASE: case GUEST_GS_BASE: case GUEST_LDTR_BASE: case GUEST_TR_BASE: case GUEST_GDTR_BASE: case GUEST_IDTR_BASE: case GUEST_PENDING_DBG_EXCEPTIONS: case GUEST_BNDCFGS: returntrue; default: break;
}
/* * Update the guest state fields of vmcs12 to reflect changes that * occurred while L2 was running. (The "IA-32e mode guest" bit of the * VM-entry controls is also updated, since this is really a guest * state bit.)
*/ staticvoid sync_vmcs02_to_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
{ struct vcpu_vmx *vmx = to_vmx(vcpu);
if (nested_vmx_is_evmptr12_valid(vmx))
sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12);
/* * In some cases (usually, nested EPT), L2 is allowed to change its * own CR3 without exiting. If it has changed it, we must keep it. * Of course, if L0 is using shadow page tables, GUEST_CR3 was defined * by L0, not L1 or L2, so we mustn't unconditionally copy it to vmcs12. * * Additionally, restore L2's PDPTR to vmcs12.
*/ if (enable_ept) {
vmcs12->guest_cr3 = vmcs_readl(GUEST_CR3); if (nested_cpu_has_ept(vmcs12) && is_pae_paging(vcpu)) {
vmcs12->guest_pdptr0 = vmcs_read64(GUEST_PDPTR0);
vmcs12->guest_pdptr1 = vmcs_read64(GUEST_PDPTR1);
vmcs12->guest_pdptr2 = vmcs_read64(GUEST_PDPTR2);
vmcs12->guest_pdptr3 = vmcs_read64(GUEST_PDPTR3);
}
}
/* * Note! Save DR7, but intentionally don't grab DEBUGCTL from vmcs02. * Writes to DEBUGCTL that aren't intercepted by L1 are immediately * propagated to vmcs12 (see vmx_set_msr()), as the value loaded into * vmcs02 doesn't strictly track vmcs12.
*/ if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_DEBUG_CONTROLS)
vmcs12->guest_dr7 = vcpu->arch.dr7;
if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_EFER)
vmcs12->guest_ia32_efer = vcpu->arch.efer;
}
/* * prepare_vmcs12 is part of what we need to do when the nested L2 guest exits * and we want to prepare to run its L1 parent. L1 keeps a vmcs for L2 (vmcs12), * and this function updates it to reflect the changes to the guest state while * L2 was running (and perhaps made some exits which were handled directly by L0 * without going back to L1), and to reflect the exit reason. * Note that we do not have to copy here all VMCS fields, just those that * could have changed by the L2 guest or the exit - i.e., the guest-state and * exit-information fields only. Other fields are modified by L1 with VMWRITE, * which already writes to vmcs12 directly.
*/ staticvoid prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
u32 vm_exit_reason, u32 exit_intr_info, unsignedlong exit_qualification, u32 exit_insn_len)
{ /* update exit information fields: */
vmcs12->vm_exit_reason = vm_exit_reason; if (vmx_get_exit_reason(vcpu).enclave_mode)
vmcs12->vm_exit_reason |= VMX_EXIT_REASONS_SGX_ENCLAVE_MODE;
vmcs12->exit_qualification = exit_qualification;
/* * On VM-Exit due to a failed VM-Entry, the VMCS isn't marked launched * and only EXIT_REASON and EXIT_QUALIFICATION are updated, all other * exit info fields are unmodified.
*/ if (!(vmcs12->vm_exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY)) {
vmcs12->launch_state = 1;
/* vm_entry_intr_info_field is cleared on exit. Emulate this
* instead of reading the real value. */
vmcs12->vm_entry_intr_info_field &= ~INTR_INFO_VALID_MASK;
/* * Transfer the event that L0 or L1 may wanted to inject into * L2 to IDT_VECTORING_INFO_FIELD.
*/
vmcs12_save_pending_event(vcpu, vmcs12,
vm_exit_reason, exit_intr_info);
/* * According to spec, there's no need to store the guest's * MSRs if the exit is due to a VM-entry failure that occurs * during or after loading the guest state. Since this exit * does not fall in that category, we need to save the MSRs.
*/ if (nested_vmx_store_msr(vcpu,
vmcs12->vm_exit_msr_store_addr,
vmcs12->vm_exit_msr_store_count))
nested_vmx_abort(vcpu,
VMX_ABORT_SAVE_GUEST_MSR_FAIL);
}
}
/* * A part of what we need to when the nested L2 guest exits and we want to * run its L1 parent, is to reset L1's guest state to the host state specified * in vmcs12. * This function is to be called not only on normal nested exit, but also on * a nested entry failure, as explained in Intel's spec, 3B.23.7 ("VM-Entry * Failures During or After Loading Guest State"). * This function should be called when the active VMCS is L1's (vmcs01).
*/ staticvoid load_vmcs12_host_state(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
{ enum vm_entry_failure_code ignored; struct kvm_segment seg;
/* * Note that calling vmx_set_cr0 is important, even if cr0 hasn't * actually changed, because vmx_set_cr0 refers to efer set above. * * CR0_GUEST_HOST_MASK is already set in the original vmcs01 * (KVM doesn't change it);
*/
vcpu->arch.cr0_guest_owned_bits = vmx_l1_guest_owned_cr0_bits();
vmx_set_cr0(vcpu, vmcs12->host_cr0);
/* Same as above - no reason to call set_cr4_guest_host_mask(). */
vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK);
vmx_set_cr4(vcpu, vmcs12->host_cr4);
nested_ept_uninit_mmu_context(vcpu);
/* * Only PDPTE load can fail as the value of cr3 was checked on entry and * couldn't have changed.
*/ if (nested_vmx_load_cr3(vcpu, vmcs12->host_cr3, false, true, &ignored))
nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_PDPTE_FAIL);
/* If not VM_EXIT_CLEAR_BNDCFGS, the L2 value propagates to L1. */ if (vmcs12->vm_exit_controls & VM_EXIT_CLEAR_BNDCFGS)
vmcs_write64(GUEST_BNDCFGS, 0);
if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) {
vmcs_write64(GUEST_IA32_PAT, vmcs12->host_ia32_pat);
vcpu->arch.pat = vmcs12->host_ia32_pat;
} if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) &&
kvm_pmu_has_perf_global_ctrl(vcpu_to_pmu(vcpu)))
WARN_ON_ONCE(kvm_set_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL,
vmcs12->host_ia32_perf_global_ctrl));
if (vm_entry_controls_get(vmx) & VM_ENTRY_LOAD_IA32_EFER) return vmcs_read64(GUEST_IA32_EFER);
if (cpu_has_load_ia32_efer()) return kvm_host.efer;
for (i = 0; i < vmx->msr_autoload.guest.nr; ++i) { if (vmx->msr_autoload.guest.val[i].index == MSR_EFER) return vmx->msr_autoload.guest.val[i].value;
}
efer_msr = vmx_find_uret_msr(vmx, MSR_EFER); if (efer_msr) return efer_msr->data;
if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS) { /* * L1's host DR7 is lost if KVM_GUESTDBG_USE_HW_BP is set * as vmcs01.GUEST_DR7 contains a userspace defined value * and vcpu->arch.dr7 is not squirreled away before the * nested VMENTER (not worth adding a variable in nested_vmx).
*/ if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
kvm_set_dr(vcpu, 7, DR7_FIXED_1); else
WARN_ON(kvm_set_dr(vcpu, 7, vmcs_readl(GUEST_DR7)));
}
/* Reload DEBUGCTL to ensure vmcs01 has a fresh FREEZE_IN_SMM value. */
vmx_reload_guest_debugctl(vcpu);
/* * Note that calling vmx_set_{efer,cr0,cr4} is important as they * handle a variety of side effects to KVM's software model.
*/
vmx_set_efer(vcpu, nested_vmx_get_vmcs01_guest_efer(vmx));
/* * Use ept_save_pdptrs(vcpu) to load the MMU's cached PDPTRs * from vmcs01 (if necessary). The PDPTRs are not loaded on * VMFail, like everything else we just need to ensure our * software model is up-to-date.
*/ if (enable_ept && is_pae_paging(vcpu))
ept_save_pdptrs(vcpu);
kvm_mmu_reset_context(vcpu);
/* * This nasty bit of open coding is a compromise between blindly * loading L1's MSRs using the exit load lists (incorrect emulation * of VMFail), leaving the nested VM's MSRs in the software model * (incorrect behavior) and snapshotting the modified MSRs (too * expensive since the lists are unbound by hardware). For each * MSR that was (prematurely) loaded from the nested VMEntry load * list, reload it from the exit load list if it exists and differs * from the guest value. The intent is to stuff host state as * silently as possible, not to fully process the exit load list.
*/ for (i = 0; i < vmcs12->vm_entry_msr_load_count; i++) {
gpa = vmcs12->vm_entry_msr_load_addr + (i * sizeof(g)); if (kvm_vcpu_read_guest(vcpu, gpa, &g, sizeof(g))) {
pr_debug_ratelimited( "%s read MSR index failed (%u, 0x%08llx)\n",
__func__, i, gpa); goto vmabort;
}
/* * Emulate an exit from nested guest (L2) to L1, i.e., prepare to run L1 * and modify vmcs12 to make it see what it would expect to see there if * L2 was its real guest. Must only be called when in L2 (is_guest_mode())
*/ void __nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason,
u32 exit_intr_info, unsignedlong exit_qualification,
u32 exit_insn_len)
{ struct vcpu_vmx *vmx = to_vmx(vcpu); struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
/* Pending MTF traps are discarded on VM-Exit. */
vmx->nested.mtf_pending = false;
/* trying to cancel vmlaunch/vmresume is a bug */
WARN_ON_ONCE(vmx->nested.nested_run_pending);
#ifdef CONFIG_KVM_HYPERV if (kvm_check_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu)) { /* * KVM_REQ_GET_NESTED_STATE_PAGES is also used to map * Enlightened VMCS after migration and we still need to * do that when something is forcing L2->L1 exit prior to * the first L2 run.
*/
(void)nested_get_evmcs_page(vcpu);
} #endif
/* Service pending TLB flush requests for L2 before switching to L1. */
kvm_service_local_tlb_flush_requests(vcpu);
/* * VCPU_EXREG_PDPTR will be clobbered in arch/x86/kvm/vmx/vmx.h between * now and the new vmentry. Ensure that the VMCS02 PDPTR fields are * up-to-date before switching to L1.
*/ if (enable_ept && is_pae_paging(vcpu))
vmx_ept_load_pdptrs(vcpu);
leave_guest_mode(vcpu);
if (nested_cpu_has_preemption_timer(vmcs12))
hrtimer_cancel(&to_vmx(vcpu)->nested.preemption_timer);
if (nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETTING)) {
vcpu->arch.tsc_offset = vcpu->arch.l1_tsc_offset; if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_TSC_SCALING))
vcpu->arch.tsc_scaling_ratio = vcpu->arch.l1_tsc_scaling_ratio;
}
if (likely(!vmx->fail)) {
sync_vmcs02_to_vmcs12(vcpu, vmcs12);
if (vm_exit_reason != -1)
prepare_vmcs12(vcpu, vmcs12, vm_exit_reason,
exit_intr_info, exit_qualification,
exit_insn_len);
/* * Must happen outside of sync_vmcs02_to_vmcs12() as it will * also be used to capture vmcs12 cache as part of * capturing nVMX state for snapshot (migration). * * Otherwise, this flush will dirty guest memory at a * point it is already assumed by user-space to be * immutable.
*/
nested_flush_cached_shadow_vmcs12(vcpu, vmcs12);
} else { /* * The only expected VM-instruction error is "VM entry with * invalid control field(s)." Anything else indicates a * problem with L0. And we should never get here with a * VMFail of any type if early consistency checks are enabled.
*/
WARN_ON_ONCE(vmcs_read32(VM_INSTRUCTION_ERROR) !=
VMXERR_ENTRY_INVALID_CONTROL_FIELD);
WARN_ON_ONCE(nested_early_check);
}
/* * Drop events/exceptions that were queued for re-injection to L2 * (picked up via vmx_complete_interrupts()), as well as exceptions * that were pending for L2. Note, this must NOT be hoisted above * prepare_vmcs12(), events/exceptions queued for re-injection need to * be captured in vmcs12 (see vmcs12_save_pending_event()).
*/
vcpu->arch.nmi_injected = false;
kvm_clear_exception_queue(vcpu);
kvm_clear_interrupt_queue(vcpu);
vmx_switch_vmcs(vcpu, &vmx->vmcs01);
kvm_nested_vmexit_handle_ibrs(vcpu);
/* Update any VMCS fields that might have changed while L2 ran */
vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr);
vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr);
vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset); if (kvm_caps.has_tsc_control)
vmcs_write64(TSC_MULTIPLIER, vcpu->arch.tsc_scaling_ratio);
if (vmx->nested.l1_tpr_threshold != -1)
vmcs_write32(TPR_THRESHOLD, vmx->nested.l1_tpr_threshold);
if (vmx->nested.change_vmcs01_virtual_apic_mode) {
vmx->nested.change_vmcs01_virtual_apic_mode = false;
vmx_set_virtual_apic_mode(vcpu);
}
if (vmx->nested.update_vmcs01_cpu_dirty_logging) {
vmx->nested.update_vmcs01_cpu_dirty_logging = false;
vmx_update_cpu_dirty_logging(vcpu);
}
nested_put_vmcs12_pages(vcpu);
if (vmx->nested.reload_vmcs01_apic_access_page) {
vmx->nested.reload_vmcs01_apic_access_page = false;
kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);
}
if (vmx->nested.update_vmcs01_apicv_status) {
vmx->nested.update_vmcs01_apicv_status = false;
kvm_make_request(KVM_REQ_APICV_UPDATE, vcpu);
}
if (vmx->nested.update_vmcs01_hwapic_isr) {
vmx->nested.update_vmcs01_hwapic_isr = false;
kvm_apic_update_hwapic_isr(vcpu);
}
/* in case we halted in L2 */
kvm_set_mp_state(vcpu, KVM_MP_STATE_RUNNABLE);
if (likely(!vmx->fail)) { if (vm_exit_reason != -1)
trace_kvm_nested_vmexit_inject(vmcs12->vm_exit_reason,
vmcs12->exit_qualification,
vmcs12->idt_vectoring_info_field,
vmcs12->vm_exit_intr_info,
vmcs12->vm_exit_intr_error_code,
KVM_ISA_VMX);
load_vmcs12_host_state(vcpu, vmcs12);
/* * Process events if an injectable IRQ or NMI is pending, even * if the event is blocked (RFLAGS.IF is cleared on VM-Exit). * If an event became pending while L2 was active, KVM needs to * either inject the event or request an IRQ/NMI window. SMIs * don't need to be processed as SMM is mutually exclusive with * non-root mode. INIT/SIPI don't need to be checked as INIT * is blocked post-VMXON, and SIPIs are ignored.
*/ if (kvm_cpu_has_injectable_intr(vcpu) || vcpu->arch.nmi_pending)
kvm_make_request(KVM_REQ_EVENT, vcpu); return;
}
/* * After an early L2 VM-entry failure, we're now back * in L1 which thinks it just finished a VMLAUNCH or * VMRESUME instruction, so we need to set the failure * flag and the VM-instruction error field of the VMCS * accordingly, and skip the emulated instruction.
*/
(void)nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
/* * Restore L1's host state to KVM's software model. We're here * because a consistency check was caught by hardware, which * means some amount of guest state has been propagated to KVM's * model and needs to be unwound to the host's state.
*/
nested_vmx_restore_host_state(vcpu);
/* * Decode the memory-address operand of a vmx instruction, as recorded on an * exit caused by such an instruction (run by a guest hypervisor). * On success, returns 0. When the operand is invalid, returns 1 and throws * #UD, #GP, or #SS.
*/ int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsignedlong exit_qualification,
u32 vmx_instruction_info, bool wr, int len, gva_t *ret)
{
gva_t off; bool exn; struct kvm_segment s;
/* * According to Vol. 3B, "Information for VM Exits Due to Instruction * Execution", on an exit, vmx_instruction_info holds most of the * addressing components of the operand. Only the displacement part * is put in exit_qualification (see 3B, "Basic VM-Exit Information"). * For how an actual address is calculated from all these components, * refer to Vol. 1, "Operand Addressing".
*/ int scaling = vmx_instruction_info & 3; int addr_size = (vmx_instruction_info >> 7) & 7; bool is_reg = vmx_instruction_info & (1u << 10); int seg_reg = (vmx_instruction_info >> 15) & 7; int index_reg = (vmx_instruction_info >> 18) & 0xf; bool index_is_valid = !(vmx_instruction_info & (1u << 22)); int base_reg = (vmx_instruction_info >> 23) & 0xf; bool base_is_valid = !(vmx_instruction_info & (1u << 27));
if (is_reg) {
kvm_queue_exception(vcpu, UD_VECTOR); return 1;
}
/* Addr = segment_base + offset */ /* offset = base + [index * scale] + displacement */
off = exit_qualification; /* holds the displacement */ if (addr_size == 1)
off = (gva_t)sign_extend64(off, 31); elseif (addr_size == 0)
off = (gva_t)sign_extend64(off, 15); if (base_is_valid)
off += kvm_register_read(vcpu, base_reg); if (index_is_valid)
off += kvm_register_read(vcpu, index_reg) << scaling;
vmx_get_segment(vcpu, &s, seg_reg);
/* * The effective address, i.e. @off, of a memory operand is truncated * based on the address size of the instruction. Note that this is * the *effective address*, i.e. the address prior to accounting for * the segment's base.
*/ if (addr_size == 1) /* 32 bit */
off &= 0xffffffff; elseif (addr_size == 0) /* 16 bit */
off &= 0xffff;
/* Checks for #GP/#SS exceptions. */
exn = false; if (is_long_mode(vcpu)) { /* * The virtual/linear address is never truncated in 64-bit * mode, e.g. a 32-bit address size can yield a 64-bit virtual * address when using FS/GS with a non-zero base.
*/ if (seg_reg == VCPU_SREG_FS || seg_reg == VCPU_SREG_GS)
*ret = s.base + off; else
*ret = off;
*ret = vmx_get_untagged_addr(vcpu, *ret, 0); /* Long mode: #GP(0)/#SS(0) if the memory address is in a * non-canonical form. This is the only check on the memory * destination for long mode!
*/
exn = is_noncanonical_address(*ret, vcpu, 0);
} else { /* * When not in long mode, the virtual/linear address is * unconditionally truncated to 32 bits regardless of the * address size.
*/
*ret = (s.base + off) & 0xffffffff;
/* Protected mode: apply checks for segment validity in the * following order: * - segment type check (#GP(0) may be thrown) * - usability check (#GP(0)/#SS(0)) * - limit check (#GP(0)/#SS(0))
*/ if (wr) /* #GP(0) if the destination operand is located in a * read-only data segment or any code segment.
*/
exn = ((s.type & 0xa) == 0 || (s.type & 8)); else /* #GP(0) if the source operand is located in an * execute-only code segment
*/
exn = ((s.type & 0xa) == 8); if (exn) {
kvm_queue_exception_e(vcpu, GP_VECTOR, 0); return 1;
} /* Protected mode: #GP(0)/#SS(0) if the segment is unusable.
*/
exn = (s.unusable != 0);
/* * Protected mode: #GP(0)/#SS(0) if the memory operand is * outside the segment limit. All CPUs that support VMX ignore * limit checks for flat segments, i.e. segments with base==0, * limit==0xffffffff and of type expand-up data or code.
*/ if (!(s.base == 0 && s.limit == 0xffffffff &&
((s.type & 8) || !(s.type & 4))))
exn = exn || ((u64)off + len - 1 > s.limit);
} if (exn) {
kvm_queue_exception_e(vcpu,
seg_reg == VCPU_SREG_SS ?
SS_VECTOR : GP_VECTOR,
0); return 1;
}
return 0;
}
staticint nested_vmx_get_vmptr(struct kvm_vcpu *vcpu, gpa_t *vmpointer, int *ret)
{
gva_t gva; struct x86_exception e; int r;
r = kvm_read_guest_virt(vcpu, gva, vmpointer, sizeof(*vmpointer), &e); if (r != X86EMUL_CONTINUE) {
*ret = kvm_handle_memory_failure(vcpu, r, &e); return -EINVAL;
}
return 0;
}
/* * Allocate a shadow VMCS and associate it with the currently loaded * VMCS, unless such a shadow VMCS already exists. The newly allocated * VMCS is also VMCLEARed, so that it is ready for use.
*/ staticstruct vmcs *alloc_shadow_vmcs(struct kvm_vcpu *vcpu)
{ struct vcpu_vmx *vmx = to_vmx(vcpu); struct loaded_vmcs *loaded_vmcs = vmx->loaded_vmcs;
/* * KVM allocates a shadow VMCS only when L1 executes VMXON and frees it * when L1 executes VMXOFF or the vCPU is forced out of nested * operation. VMXON faults if the CPU is already post-VMXON, so it * should be impossible to already have an allocated shadow VMCS. KVM * doesn't support virtualization of VMCS shadowing, so vmcs01 should * always be the loaded VMCS.
*/ if (WARN_ON(loaded_vmcs != &vmx->vmcs01 || loaded_vmcs->shadow_vmcs)) return loaded_vmcs->shadow_vmcs;
loaded_vmcs->shadow_vmcs = alloc_vmcs(true); if (loaded_vmcs->shadow_vmcs)
vmcs_clear(loaded_vmcs->shadow_vmcs);
/* * Manually check CR4.VMXE checks, KVM must force CR4.VMXE=1 to enter * the guest and so cannot rely on hardware to perform the check, * which has higher priority than VM-Exit (see Intel SDM's pseudocode * for VMXON). * * Rely on hardware for the other pre-VM-Exit checks, CR0.PE=1, !VM86 * and !COMPATIBILITY modes. For an unrestricted guest, KVM doesn't * force any of the relevant guest state. For a restricted guest, KVM * does force CR0.PE=1, but only to also force VM86 in order to emulate * Real Mode, and so there's no need to check CR0.PE manually.
*/ if (!kvm_is_cr4_bit_set(vcpu, X86_CR4_VMXE)) {
kvm_queue_exception(vcpu, UD_VECTOR); return 1;
}
/* * The CPL is checked for "not in VMX operation" and for "in VMX root", * and has higher priority than the VM-Fail due to being post-VMXON, * i.e. VMXON #GPs outside of VMX non-root if CPL!=0. In VMX non-root, * VMXON causes VM-Exit and KVM unconditionally forwards VMXON VM-Exits * from L2 to L1, i.e. there's no need to check for the vCPU being in * VMX non-root. * * Forwarding the VM-Exit unconditionally, i.e. without performing the * #UD checks (see above), is functionally ok because KVM doesn't allow * L1 to run L2 without CR4.VMXE=0, and because KVM never modifies L2's * CR0 or CR4, i.e. it's L2's responsibility to emulate #UDs that are * missed by hardware due to shadowing CR0 and/or CR4.
*/ if (vmx_get_cpl(vcpu)) {
kvm_inject_gp(vcpu, 0); return 1;
}
if (vmx->nested.vmxon) return nested_vmx_fail(vcpu, VMXERR_VMXON_IN_VMX_ROOT_OPERATION);
/* * Invalid CR0/CR4 generates #GP. These checks are performed if and * only if the vCPU isn't already in VMX operation, i.e. effectively * have lower priority than the VM-Fail above.
*/ if (!nested_host_cr0_valid(vcpu, kvm_read_cr0(vcpu)) ||
!nested_host_cr4_valid(vcpu, kvm_read_cr4(vcpu))) {
kvm_inject_gp(vcpu, 0); return 1;
}
if (nested_vmx_get_vmptr(vcpu, &vmptr, &ret)) return ret;
/* * SDM 3: 24.11.5 * The first 4 bytes of VMXON region contain the supported * VMCS revision identifier * * Note - IA32_VMX_BASIC[48] will never be 1 for the nested case; * which replaces physical address width with 32
*/ if (!page_address_valid(vcpu, vmptr)) return nested_vmx_failInvalid(vcpu);
if (enable_shadow_vmcs) { /* copy to memory all shadowed fields in case
they were modified */
copy_shadow_to_vmcs12(vmx);
vmx_disable_shadow_vmcs(vmx);
}
vmx->nested.posted_intr_nv = -1;
/* Emulate the VMXOFF instruction */ staticint handle_vmxoff(struct kvm_vcpu *vcpu)
{ if (!nested_vmx_check_permission(vcpu)) return 1;
free_nested(vcpu);
if (kvm_apic_has_pending_init_or_sipi(vcpu))
kvm_make_request(KVM_REQ_EVENT, vcpu);
return nested_vmx_succeed(vcpu);
}
/* Emulate the VMCLEAR instruction */ staticint handle_vmclear(struct kvm_vcpu *vcpu)
{ struct vcpu_vmx *vmx = to_vmx(vcpu);
u32 zero = 0;
gpa_t vmptr; int r;
if (!nested_vmx_check_permission(vcpu)) return 1;
if (nested_vmx_get_vmptr(vcpu, &vmptr, &r)) return r;
if (!page_address_valid(vcpu, vmptr)) return nested_vmx_fail(vcpu, VMXERR_VMCLEAR_INVALID_ADDRESS);
if (vmptr == vmx->nested.vmxon_ptr) return nested_vmx_fail(vcpu, VMXERR_VMCLEAR_VMXON_POINTER);
if (likely(!nested_evmcs_handle_vmclear(vcpu, vmptr))) { if (vmptr == vmx->nested.current_vmptr)
nested_release_vmcs12(vcpu);
/* * Silently ignore memory errors on VMCLEAR, Intel's pseudocode * for VMCLEAR includes a "ensure that data for VMCS referenced * by the operand is in memory" clause that guards writes to * memory, i.e. doing nothing for I/O is architecturally valid. * * FIXME: Suppress failures if and only if no memslot is found, * i.e. exit to userspace if __copy_to_user() fails.
*/
(void)kvm_vcpu_write_guest(vcpu,
vmptr + offsetof(struct vmcs12,
launch_state),
&zero, sizeof(zero));
}
/* Decode instruction info and find the field to read */
field = kvm_register_read(vcpu, (((instr_info) >> 28) & 0xf));
if (!nested_vmx_is_evmptr12_valid(vmx)) { /* * In VMX non-root operation, when the VMCS-link pointer is INVALID_GPA, * any VMREAD sets the ALU flags for VMfailInvalid.
*/ if (vmx->nested.current_vmptr == INVALID_GPA ||
(is_guest_mode(vcpu) &&
get_vmcs12(vcpu)->vmcs_link_pointer == INVALID_GPA)) return nested_vmx_failInvalid(vcpu);
offset = get_vmcs12_field_offset(field); if (offset < 0) return nested_vmx_fail(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT);
if (!is_guest_mode(vcpu) && is_vmcs12_ext_field(field))
copy_vmcs02_to_vmcs12_rare(vcpu, vmcs12);
/* Read the field, zero-extended to a u64 value */
value = vmcs12_read_any(vmcs12, field, offset);
} else { /* * Hyper-V TLFS (as of 6.0b) explicitly states, that while an * enlightened VMCS is active VMREAD/VMWRITE instructions are * unsupported. Unfortunately, certain versions of Windows 11 * don't comply with this requirement which is not enforced in * genuine Hyper-V. Allow VMREAD from an enlightened VMCS as a * workaround, as misbehaving guests will panic on VM-Fail. * Note, enlightened VMCS is incompatible with shadow VMCS so * all VMREADs from L2 should go to L1.
*/ if (WARN_ON_ONCE(is_guest_mode(vcpu))) return nested_vmx_failInvalid(vcpu);
/* Read the field, zero-extended to a u64 value */
value = evmcs_read_any(nested_vmx_evmcs(vmx), field, offset);
}
/* * Now copy part of this value to register or memory, as requested. * Note that the number of bits actually copied is 32 or 64 depending * on the guest's mode (32 or 64 bit), not on the given field's length.
*/ if (instr_info & BIT(10)) {
kvm_register_write(vcpu, (((instr_info) >> 3) & 0xf), value);
} else {
len = is_64_bit_mode(vcpu) ? 8 : 4; if (get_vmx_mem_address(vcpu, exit_qualification,
instr_info, true, len, &gva)) return 1; /* _system ok, nested_vmx_check_permission has verified cpl=0 */
r = kvm_write_guest_virt_system(vcpu, gva, &value, len, &e); if (r != X86EMUL_CONTINUE) return kvm_handle_memory_failure(vcpu, r, &e);
}
/* * The value to write might be 32 or 64 bits, depending on L1's long * mode, and eventually we need to write that into a field of several * possible lengths. The code below first zero-extends the value to 64 * bit (value), and then copies only the appropriate number of * bits into the vmcs12 field.
*/
u64 value = 0;
if (!nested_vmx_check_permission(vcpu)) return 1;
/* * In VMX non-root operation, when the VMCS-link pointer is INVALID_GPA, * any VMWRITE sets the ALU flags for VMfailInvalid.
*/ if (vmx->nested.current_vmptr == INVALID_GPA ||
(is_guest_mode(vcpu) &&
get_vmcs12(vcpu)->vmcs_link_pointer == INVALID_GPA)) return nested_vmx_failInvalid(vcpu);
if (instr_info & BIT(10))
value = kvm_register_read(vcpu, (((instr_info) >> 3) & 0xf)); else {
len = is_64_bit_mode(vcpu) ? 8 : 4; if (get_vmx_mem_address(vcpu, exit_qualification,
instr_info, false, len, &gva)) return 1;
r = kvm_read_guest_virt(vcpu, gva, &value, len, &e); if (r != X86EMUL_CONTINUE) return kvm_handle_memory_failure(vcpu, r, &e);
}
field = kvm_register_read(vcpu, (((instr_info) >> 28) & 0xf));
offset = get_vmcs12_field_offset(field); if (offset < 0) return nested_vmx_fail(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT);
/* * If the vCPU supports "VMWRITE to any supported field in the * VMCS," then the "read-only" fields are actually read/write.
*/ if (vmcs_field_readonly(field) &&
!nested_cpu_has_vmwrite_any_field(vcpu)) return nested_vmx_fail(vcpu, VMXERR_VMWRITE_READ_ONLY_VMCS_COMPONENT);
/* * Ensure vmcs12 is up-to-date before any VMWRITE that dirties * vmcs12, else we may crush a field or consume a stale value.
*/ if (!is_guest_mode(vcpu) && !is_shadow_field_rw(field))
copy_vmcs02_to_vmcs12_rare(vcpu, vmcs12);
/* * Some Intel CPUs intentionally drop the reserved bits of the AR byte * fields on VMWRITE. Emulate this behavior to ensure consistent KVM * behavior regardless of the underlying hardware, e.g. if an AR_BYTE * field is intercepted for VMWRITE but not VMREAD (in L1), then VMREAD * from L1 will return a different value than VMREAD from L2 (L1 sees * the stripped down value, L2 sees the full value as stored by KVM).
*/ if (field >= GUEST_ES_AR_BYTES && field <= GUEST_TR_AR_BYTES)
value &= 0x1f0ff;
vmcs12_write_any(vmcs12, field, offset, value);
/* * Do not track vmcs12 dirty-state if in guest-mode as we actually * dirty shadow vmcs12 instead of vmcs12. Fields that can be updated * by L1 without a vmexit are always updated in the vmcs02, i.e. don't * "dirty" vmcs12, all others go down the prepare_vmcs02() slow path.
*/ if (!is_guest_mode(vcpu) && !is_shadow_field_rw(field)) { /* * L1 can read these fields without exiting, ensure the * shadow VMCS is up-to-date.
*/ if (enable_shadow_vmcs && is_shadow_field_ro(field)) {
preempt_disable();
vmcs_load(vmx->vmcs01.shadow_vmcs);
if (kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, vmptr, VMCS12_SIZE)) { /* * Reads from an unbacked page return all 1s, * which means that the 32 bits located at the * given physical address won't match the required * VMCS12_REVISION identifier.
*/ return nested_vmx_fail(vcpu,
VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID);
}
/* * Load VMCS12 from guest memory since it is not already * cached.
*/ if (kvm_read_guest_cached(vcpu->kvm, ghc, vmx->nested.cached_vmcs12,
VMCS12_SIZE)) { return nested_vmx_fail(vcpu,
VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID);
}
/* According to the Intel VMX instruction reference, the memory * operand is read even if it isn't needed (e.g., for type==global)
*/ if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu),
vmx_instruction_info, false, sizeof(operand), &gva)) return 1;
r = kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e); if (r != X86EMUL_CONTINUE) return kvm_handle_memory_failure(vcpu, r, &e);
/* * Nested EPT roots are always held through guest_mmu, * not root_mmu.
*/
mmu = &vcpu->arch.guest_mmu;
switch (type) { case VMX_EPT_EXTENT_CONTEXT: if (!nested_vmx_check_eptp(vcpu, operand.eptp)) return nested_vmx_fail(vcpu,
VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
roots_to_free = 0; if (nested_ept_root_matches(mmu->root.hpa, mmu->root.pgd,
operand.eptp))
roots_to_free |= KVM_MMU_ROOT_CURRENT;
for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) { if (nested_ept_root_matches(mmu->prev_roots[i].hpa,
mmu->prev_roots[i].pgd,
operand.eptp))
roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i);
} break; case VMX_EPT_EXTENT_GLOBAL:
roots_to_free = KVM_MMU_ROOTS_ALL; break; default:
BUG(); break;
}
if (roots_to_free)
kvm_mmu_free_roots(vcpu->kvm, mmu, roots_to_free);
/* according to the intel vmx instruction reference, the memory * operand is read even if it isn't needed (e.g., for type==global)
*/ if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu),
vmx_instruction_info, false, sizeof(operand), &gva)) return 1;
r = kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e); if (r != X86EMUL_CONTINUE) return kvm_handle_memory_failure(vcpu, r, &e);
if (operand.vpid >> 16) return nested_vmx_fail(vcpu,
VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
/* * Always flush the effective vpid02, i.e. never flush the current VPID * and never explicitly flush vpid01. INVVPID targets a VPID, not a * VMCS, and so whether or not the current vmcs12 has VPID enabled is * irrelevant (and there may not be a loaded vmcs12).
*/
vpid02 = nested_get_vpid02(vcpu); switch (type) { case VMX_VPID_EXTENT_INDIVIDUAL_ADDR: /* * LAM doesn't apply to addresses that are inputs to TLB * invalidation.
*/ if (!operand.vpid ||
is_noncanonical_invlpg_address(operand.gla, vcpu)) return nested_vmx_fail(vcpu,
VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
vpid_sync_vcpu_addr(vpid02, operand.gla); break; case VMX_VPID_EXTENT_SINGLE_CONTEXT: case VMX_VPID_EXTENT_SINGLE_NON_GLOBAL: if (!operand.vpid) return nested_vmx_fail(vcpu,
VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
vpid_sync_context(vpid02); break; case VMX_VPID_EXTENT_ALL_CONTEXT:
vpid_sync_context(vpid02); break; default:
WARN_ON_ONCE(1); return kvm_skip_emulated_instruction(vcpu);
}
/* * Sync the shadow page tables if EPT is disabled, L1 is invalidating * linear mappings for L2 (tagged with L2's VPID). Free all guest * roots as VPIDs are not tracked in the MMU role. * * Note, this operates on root_mmu, not guest_mmu, as L1 and L2 share * an MMU when EPT is disabled. * * TODO: sync only the affected SPTEs for INVDIVIDUAL_ADDR.
*/ if (!enable_ept)
kvm_mmu_free_guest_mode_roots(vcpu->kvm, &vcpu->arch.root_mmu);
if (WARN_ON_ONCE(!nested_cpu_has_ept(vmcs12))) return 1; if (index >= VMFUNC_EPTP_ENTRIES) return 1;
if (kvm_vcpu_read_guest_page(vcpu, vmcs12->eptp_list_address >> PAGE_SHIFT,
&new_eptp, index * 8, 8)) return 1;
/* * If the (L2) guest does a vmfunc to the currently * active ept pointer, we don't have to do anything else
*/ if (vmcs12->ept_pointer != new_eptp) { if (!nested_vmx_check_eptp(vcpu, new_eptp)) return 1;
/* * VMFUNC should never execute cleanly while L1 is active; KVM supports * VMFUNC for nested VMs, but not for L1.
*/ if (WARN_ON_ONCE(!is_guest_mode(vcpu))) {
kvm_queue_exception(vcpu, UD_VECTOR); return 1;
}
vmcs12 = get_vmcs12(vcpu);
/* * #UD on out-of-bounds function has priority over VM-Exit, and VMFUNC * is enabled in vmcs02 if and only if it's enabled in vmcs12.
*/ if (WARN_ON_ONCE((function > 63) || !nested_cpu_has_vmfunc(vmcs12))) {
kvm_queue_exception(vcpu, UD_VECTOR); return 1;
}
if (!(vmcs12->vm_function_control & BIT_ULL(function))) goto fail;
switch (function) { case 0: if (nested_vmx_eptp_switching(vcpu, vmcs12)) goto fail; break; default: goto fail;
} return kvm_skip_emulated_instruction(vcpu);
fail: /* * This is effectively a reflected VM-Exit, as opposed to a synthesized * nested VM-Exit. Pass the original exit reason, i.e. don't hardcode * EXIT_REASON_VMFUNC as the exit reason.
*/
nested_vmx_vmexit(vcpu, vmx->vt.exit_reason.full,
vmx_get_intr_info(vcpu),
vmx_get_exit_qual(vcpu)); return 1;
}
/* * Return true if an IO instruction with the specified port and size should cause * a VM-exit into L1.
*/ bool nested_vmx_check_io_bitmaps(struct kvm_vcpu *vcpu, unsignedint port, int size)
{ struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
gpa_t bitmap, last_bitmap;
u8 b;
/* * Return 1 if we should exit from L2 to L1 to handle an MSR access, * rather than handle it ourselves in L0. I.e., check whether L1 expressed * disinterest in the current event (read or write a specific MSR) by using an * MSR bitmap. This may be the case even when L0 doesn't use MSR bitmaps.
*/ staticbool nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, union vmx_exit_reason exit_reason)
{
u32 msr_index;
gpa_t bitmap;
if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS)) returntrue;
/* * The MSR_BITMAP page is divided into four 1024-byte bitmaps, * for the four combinations of read/write and low/high MSR numbers. * First we need to figure out which of the four to use:
*/
bitmap = vmcs12->msr_bitmap; if (exit_reason.basic == EXIT_REASON_MSR_WRITE ||
exit_reason.basic == EXIT_REASON_MSR_WRITE_IMM)
bitmap += 2048; if (msr_index >= 0xc0000000) {
msr_index -= 0xc0000000;
bitmap += 1024;
}
/* Then read the msr_index'th bit from this bitmap: */ if (msr_index < 1024*8) { unsignedchar b; if (kvm_vcpu_read_guest(vcpu, bitmap + msr_index/8, &b, 1)) returntrue; return 1 & (b >> (msr_index & 7));
} else returntrue; /* let L1 handle the wrong parameter */
}
/* * Return 1 if we should exit from L2 to L1 to handle a CR access exit, * rather than handle it ourselves in L0. I.e., check if L1 wanted to * intercept (via guest_host_mask etc.) the current event.
*/ staticbool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
{ unsignedlong exit_qualification = vmx_get_exit_qual(vcpu); int cr = exit_qualification & 15; int reg; unsignedlong val;
switch ((exit_qualification >> 4) & 3) { case 0: /* mov to cr */
reg = (exit_qualification >> 8) & 15;
val = kvm_register_read(vcpu, reg); switch (cr) { case 0: if (vmcs12->cr0_guest_host_mask &
(val ^ vmcs12->cr0_read_shadow)) returntrue; break; case 3: if (nested_cpu_has(vmcs12, CPU_BASED_CR3_LOAD_EXITING)) returntrue; break; case 4: if (vmcs12->cr4_guest_host_mask &
(vmcs12->cr4_read_shadow ^ val)) returntrue; break; case 8: if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING)) returntrue; break;
} break; case 2: /* clts */ if ((vmcs12->cr0_guest_host_mask & X86_CR0_TS) &&
(vmcs12->cr0_read_shadow & X86_CR0_TS)) returntrue; break; case 1: /* mov from cr */ switch (cr) { case 3: if (vmcs12->cpu_based_vm_exec_control &
CPU_BASED_CR3_STORE_EXITING) returntrue; break; case 8: if (vmcs12->cpu_based_vm_exec_control &
CPU_BASED_CR8_STORE_EXITING) returntrue; break;
} break; case 3: /* lmsw */ /* * lmsw can change bits 1..3 of cr0, and only set bit 0 of * cr0. Other attempted changes are ignored, with no exit.
*/
val = (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f; if (vmcs12->cr0_guest_host_mask & 0xe &
(val ^ vmcs12->cr0_read_shadow)) returntrue; if ((vmcs12->cr0_guest_host_mask & 0x1) &&
!(vmcs12->cr0_read_shadow & 0x1) &&
(val & 0x1)) returntrue; break;
} returnfalse;
}
if (!nested_cpu_has_shadow_vmcs(vmcs12)) returntrue;
/* Decode instruction info and find the field to access */
vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
field = kvm_register_read(vcpu, (((vmx_instruction_info) >> 28) & 0xf));
/* Out-of-range fields always cause a VM exit from L2 to L1 */ if (field >> 15) returntrue;
if (kvm_vcpu_read_guest(vcpu, bitmap + field/8, &b, 1)) returntrue;
/* * An MTF VM-exit may be injected into the guest by setting the * interruption-type to 7 (other event) and the vector field to 0. Such * is the case regardless of the 'monitor trap flag' VM-execution * control.
*/ return entry_intr_info == (INTR_INFO_VALID_MASK
| INTR_TYPE_OTHER_EVENT);
}
/* * Return true if L0 wants to handle an exit from L2 regardless of whether or not * L1 wants the exit. Only call this when in is_guest_mode (L2).
*/ staticbool nested_vmx_l0_wants_exit(struct kvm_vcpu *vcpu, union vmx_exit_reason exit_reason)
{
u32 intr_info;
switch ((u16)exit_reason.basic) { case EXIT_REASON_EXCEPTION_NMI:
intr_info = vmx_get_intr_info(vcpu); if (is_nmi(intr_info)) returntrue; elseif (is_page_fault(intr_info)) return vcpu->arch.apf.host_apf_flags ||
vmx_need_pf_intercept(vcpu); elseif (is_debug(intr_info) &&
vcpu->guest_debug &
(KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) returntrue; elseif (is_breakpoint(intr_info) &&
vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) returntrue; elseif (is_alignment_check(intr_info) &&
!vmx_guest_inject_ac(vcpu)) returntrue; elseif (is_ve_fault(intr_info)) returntrue; returnfalse; case EXIT_REASON_EXTERNAL_INTERRUPT: returntrue; case EXIT_REASON_MCE_DURING_VMENTRY: returntrue; case EXIT_REASON_EPT_VIOLATION: /* * L0 always deals with the EPT violation. If nested EPT is * used, and the nested mmu code discovers that the address is * missing in the guest EPT table (EPT12), the EPT violation * will be injected with nested_ept_inject_page_fault()
*/ returntrue; case EXIT_REASON_EPT_MISCONFIG: /* * L2 never uses directly L1's EPT, but rather L0's own EPT * table (shadow on EPT) or a merged EPT table that L0 built * (EPT on EPT). So any problems with the structure of the * table is L0's fault.
*/ returntrue; case EXIT_REASON_PREEMPTION_TIMER: returntrue; case EXIT_REASON_PML_FULL: /* * PML is emulated for an L1 VMM and should never be enabled in * vmcs02, always "handle" PML_FULL by exiting to userspace.
*/ returntrue; case EXIT_REASON_VMFUNC: /* VM functions are emulated through L2->L0 vmexits. */ returntrue; case EXIT_REASON_BUS_LOCK: /* * At present, bus lock VM exit is never exposed to L1. * Handle L2's bus locks in L0 directly.
*/ returntrue; #ifdef CONFIG_KVM_HYPERV case EXIT_REASON_VMCALL: /* Hyper-V L2 TLB flush hypercall is handled by L0 */ return guest_hv_cpuid_has_l2_tlb_flush(vcpu) &&
nested_evmcs_l2_tlb_flush_enabled(vcpu) &&
kvm_hv_is_tlb_flush_hcall(vcpu); #endif default: break;
} returnfalse;
}
/* * Return 1 if L1 wants to intercept an exit from L2. Only call this when in * is_guest_mode (L2).
*/ staticbool nested_vmx_l1_wants_exit(struct kvm_vcpu *vcpu, union vmx_exit_reason exit_reason)
{ struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
u32 intr_info;
switch ((u16)exit_reason.basic) { case EXIT_REASON_EXCEPTION_NMI:
intr_info = vmx_get_intr_info(vcpu); if (is_nmi(intr_info)) returntrue; elseif (is_page_fault(intr_info)) returntrue; return vmcs12->exception_bitmap &
(1u << (intr_info & INTR_INFO_VECTOR_MASK)); case EXIT_REASON_EXTERNAL_INTERRUPT: return nested_exit_on_intr(vcpu); case EXIT_REASON_TRIPLE_FAULT: returntrue; case EXIT_REASON_INTERRUPT_WINDOW: return nested_cpu_has(vmcs12, CPU_BASED_INTR_WINDOW_EXITING); case EXIT_REASON_NMI_WINDOW: return nested_cpu_has(vmcs12, CPU_BASED_NMI_WINDOW_EXITING); case EXIT_REASON_TASK_SWITCH: returntrue; case EXIT_REASON_CPUID: returntrue; case EXIT_REASON_HLT: return nested_cpu_has(vmcs12, CPU_BASED_HLT_EXITING); case EXIT_REASON_INVD: returntrue; case EXIT_REASON_INVLPG: return nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING); case EXIT_REASON_RDPMC: return nested_cpu_has(vmcs12, CPU_BASED_RDPMC_EXITING); case EXIT_REASON_RDRAND: return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDRAND_EXITING); case EXIT_REASON_RDSEED: return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDSEED_EXITING); case EXIT_REASON_RDTSC: case EXIT_REASON_RDTSCP: return nested_cpu_has(vmcs12, CPU_BASED_RDTSC_EXITING); case EXIT_REASON_VMREAD: return nested_vmx_exit_handled_vmcs_access(vcpu, vmcs12,
vmcs12->vmread_bitmap); case EXIT_REASON_VMWRITE: return nested_vmx_exit_handled_vmcs_access(vcpu, vmcs12,
vmcs12->vmwrite_bitmap); case EXIT_REASON_VMCALL: case EXIT_REASON_VMCLEAR: case EXIT_REASON_VMLAUNCH: case EXIT_REASON_VMPTRLD: case EXIT_REASON_VMPTRST: case EXIT_REASON_VMRESUME: case EXIT_REASON_VMOFF: case EXIT_REASON_VMON: case EXIT_REASON_INVEPT: case EXIT_REASON_INVVPID: /* * VMX instructions trap unconditionally. This allows L1 to * emulate them for its L2 guest, i.e., allows 3-level nesting!
*/ returntrue; case EXIT_REASON_CR_ACCESS: return nested_vmx_exit_handled_cr(vcpu, vmcs12); case EXIT_REASON_DR_ACCESS: return nested_cpu_has(vmcs12, CPU_BASED_MOV_DR_EXITING); case EXIT_REASON_IO_INSTRUCTION: return nested_vmx_exit_handled_io(vcpu, vmcs12); case EXIT_REASON_GDTR_IDTR: case EXIT_REASON_LDTR_TR: return nested_cpu_has2(vmcs12, SECONDARY_EXEC_DESC); case EXIT_REASON_MSR_READ: case EXIT_REASON_MSR_WRITE: case EXIT_REASON_MSR_READ_IMM: case EXIT_REASON_MSR_WRITE_IMM: return nested_vmx_exit_handled_msr(vcpu, vmcs12, exit_reason); case EXIT_REASON_INVALID_STATE: returntrue; case EXIT_REASON_MWAIT_INSTRUCTION: return nested_cpu_has(vmcs12, CPU_BASED_MWAIT_EXITING); case EXIT_REASON_MONITOR_TRAP_FLAG: return nested_vmx_exit_handled_mtf(vmcs12); case EXIT_REASON_MONITOR_INSTRUCTION: return nested_cpu_has(vmcs12, CPU_BASED_MONITOR_EXITING); case EXIT_REASON_PAUSE_INSTRUCTION: return nested_cpu_has(vmcs12, CPU_BASED_PAUSE_EXITING) ||
nested_cpu_has2(vmcs12,
SECONDARY_EXEC_PAUSE_LOOP_EXITING); case EXIT_REASON_MCE_DURING_VMENTRY: returntrue; case EXIT_REASON_TPR_BELOW_THRESHOLD: return nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW); case EXIT_REASON_APIC_ACCESS: case EXIT_REASON_APIC_WRITE: case EXIT_REASON_EOI_INDUCED: /* * The controls for "virtualize APIC accesses," "APIC- * register virtualization," and "virtual-interrupt * delivery" only come from vmcs12.
*/ returntrue; case EXIT_REASON_INVPCID: return
nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_INVPCID) &&
nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING); case EXIT_REASON_WBINVD: return nested_cpu_has2(vmcs12, SECONDARY_EXEC_WBINVD_EXITING); case EXIT_REASON_XSETBV: returntrue; case EXIT_REASON_XSAVES: case EXIT_REASON_XRSTORS: /* * This should never happen, since it is not possible to * set XSS to a non-zero value---neither in L1 nor in L2. * If if it were, XSS would have to be checked against * the XSS exit bitmap in vmcs12.
*/ return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_XSAVES); case EXIT_REASON_UMWAIT: case EXIT_REASON_TPAUSE: return nested_cpu_has2(vmcs12,
SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE); case EXIT_REASON_ENCLS: return nested_vmx_exit_handled_encls(vcpu, vmcs12); case EXIT_REASON_NOTIFY: /* Notify VM exit is not exposed to L1 */ returnfalse; case EXIT_REASON_SEAMCALL: case EXIT_REASON_TDCALL: /* * SEAMCALL and TDCALL unconditionally VM-Exit, but aren't * virtualized by KVM for L1 hypervisors, i.e. L1 should * never want or expect such an exit.
*/ returnfalse; default: returntrue;
}
}
/* * Conditionally reflect a VM-Exit into L1. Returns %true if the VM-Exit was * reflected into L1.
*/ bool nested_vmx_reflect_vmexit(struct kvm_vcpu *vcpu)
{ struct vcpu_vmx *vmx = to_vmx(vcpu); union vmx_exit_reason exit_reason = vmx->vt.exit_reason; unsignedlong exit_qual;
u32 exit_intr_info;
WARN_ON_ONCE(vmx->nested.nested_run_pending);
/* * Late nested VM-Fail shares the same flow as nested VM-Exit since KVM * has already loaded L2's state.
*/ if (unlikely(vmx->fail)) {
trace_kvm_nested_vmenter_failed( "hardware VM-instruction error: ",
vmcs_read32(VM_INSTRUCTION_ERROR));
exit_intr_info = 0;
exit_qual = 0; goto reflect_vmexit;
}
trace_kvm_nested_vmexit(vcpu, KVM_ISA_VMX);
/* If L0 (KVM) wants the exit, it trumps L1's desires. */ if (nested_vmx_l0_wants_exit(vcpu, exit_reason)) returnfalse;
/* If L1 doesn't want the exit, handle it in L0. */ if (!nested_vmx_l1_wants_exit(vcpu, exit_reason)) returnfalse;
/* * vmcs.VM_EXIT_INTR_INFO is only valid for EXCEPTION_NMI exits. For * EXTERNAL_INTERRUPT, the value for vmcs12->vm_exit_intr_info would * need to be synthesized by querying the in-kernel LAPIC, but external * interrupts are never reflected to L1 so it's a non-issue.
*/
exit_intr_info = vmx_get_intr_info(vcpu); if (is_exception_with_error_code(exit_intr_info)) { struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
if (copy_to_user(user_kvm_nested_state, &kvm_state, sizeof(kvm_state))) return -EFAULT;
if (!vmx_has_valid_vmcs12(vcpu)) goto out;
/* * When running L2, the authoritative vmcs12 state is in the * vmcs02. When running L1, the authoritative vmcs12 state is * in the shadow or enlightened vmcs linked to vmcs01, unless * need_vmcs12_to_shadow_sync is set, in which case, the authoritative * vmcs12 state is in the vmcs12 already.
*/ if (is_guest_mode(vcpu)) {
sync_vmcs02_to_vmcs12(vcpu, vmcs12);
sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12);
} else {
copy_vmcs02_to_vmcs12_rare(vcpu, get_vmcs12(vcpu)); if (!vmx->nested.need_vmcs12_to_shadow_sync) { if (nested_vmx_is_evmptr12_valid(vmx)) /* * L1 hypervisor is not obliged to keep eVMCS * clean fields data always up-to-date while * not in guest mode, 'hv_clean_fields' is only * supposed to be actual upon vmentry so we need * to ignore it here and do full copy.
*/
copy_enlightened_to_vmcs12(vmx, 0); elseif (enable_shadow_vmcs)
copy_shadow_to_vmcs12(vmx);
}
}
/* * Copy over the full allocated size of vmcs12 rather than just the size * of the struct.
*/ if (copy_to_user(user_vmx_nested_state->vmcs12, vmcs12, VMCS12_SIZE)) return -EFAULT;
if (nested_cpu_has_shadow_vmcs(vmcs12) &&
vmcs12->vmcs_link_pointer != INVALID_GPA) { if (copy_to_user(user_vmx_nested_state->shadow_vmcs12,
get_shadow_vmcs12(vcpu), VMCS12_SIZE)) return -EFAULT;
}
out: return kvm_state.size;
}
if (kvm_state->format != KVM_STATE_NESTED_FORMAT_VMX) return -EINVAL;
if (kvm_state->hdr.vmx.vmxon_pa == INVALID_GPA) { if (kvm_state->hdr.vmx.smm.flags) return -EINVAL;
if (kvm_state->hdr.vmx.vmcs12_pa != INVALID_GPA) return -EINVAL;
/* * KVM_STATE_NESTED_EVMCS used to signal that KVM should * enable eVMCS capability on vCPU. However, since then * code was changed such that flag signals vmcs12 should * be copied into eVMCS in guest memory. * * To preserve backwards compatibility, allow user * to set this flag even when there is no VMXON region.
*/ if (kvm_state->flags & ~KVM_STATE_NESTED_EVMCS) return -EINVAL;
} else { if (!guest_cpu_cap_has(vcpu, X86_FEATURE_VMX)) return -EINVAL;
if (!page_address_valid(vcpu, kvm_state->hdr.vmx.vmxon_pa)) return -EINVAL;
}
if ((kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) &&
(kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE)) return -EINVAL;
if (kvm_state->hdr.vmx.smm.flags &
~(KVM_STATE_NESTED_SMM_GUEST_MODE | KVM_STATE_NESTED_SMM_VMXON)) return -EINVAL;
if (kvm_state->hdr.vmx.flags & ~KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE) return -EINVAL;
/* * SMM temporarily disables VMX, so we cannot be in guest mode, * nor can VMLAUNCH/VMRESUME be pending. Outside SMM, SMM flags * must be zero.
*/ if (is_smm(vcpu) ?
(kvm_state->flags &
(KVM_STATE_NESTED_GUEST_MODE | KVM_STATE_NESTED_RUN_PENDING))
: kvm_state->hdr.vmx.smm.flags) return -EINVAL;
if ((kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) &&
!(kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON)) return -EINVAL;
if (kvm_state->hdr.vmx.vmxon_pa == INVALID_GPA) return 0;
vmx->nested.vmxon_ptr = kvm_state->hdr.vmx.vmxon_pa;
ret = enter_vmx_operation(vcpu); if (ret) return ret;
/* Empty 'VMXON' state is permitted if no VMCS loaded */ if (kvm_state->size < sizeof(*kvm_state) + sizeof(*vmcs12)) { /* See vmx_has_valid_vmcs12. */ if ((kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE) ||
(kvm_state->flags & KVM_STATE_NESTED_EVMCS) ||
(kvm_state->hdr.vmx.vmcs12_pa != INVALID_GPA)) return -EINVAL; else return 0;
}
if (kvm_state->hdr.vmx.vmcs12_pa != INVALID_GPA) { if (kvm_state->hdr.vmx.vmcs12_pa == kvm_state->hdr.vmx.vmxon_pa ||
!page_address_valid(vcpu, kvm_state->hdr.vmx.vmcs12_pa)) return -EINVAL;
set_current_vmptr(vmx, kvm_state->hdr.vmx.vmcs12_pa); #ifdef CONFIG_KVM_HYPERV
} elseif (kvm_state->flags & KVM_STATE_NESTED_EVMCS) { /* * nested_vmx_handle_enlightened_vmptrld() cannot be called * directly from here as HV_X64_MSR_VP_ASSIST_PAGE may not be * restored yet. EVMCS will be mapped from * nested_get_vmcs12_pages().
*/
vmx->nested.hv_evmcs_vmptr = EVMPTR_MAP_PENDING;
kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu); #endif
} else { return -EINVAL;
}
/* * Indexing into the vmcs12 uses the VMCS encoding rotated left by 6. Undo * that madness to get the encoding for comparison.
*/ #define VMCS12_IDX_TO_ENC(idx) ((u16)(((u16)(idx) >> 6) | ((u16)(idx) << 10)))
static u64 nested_vmx_calc_vmcs_enum_msr(void)
{ /* * Note these are the so called "index" of the VMCS field encoding, not * the index into vmcs12.
*/ unsignedint max_idx, idx; int i;
/* * For better or worse, KVM allows VMREAD/VMWRITE to all fields in * vmcs12, regardless of whether or not the associated feature is * exposed to L1. Simply find the field with the highest index.
*/
max_idx = 0; for (i = 0; i < nr_vmcs12_fields; i++) { /* The vmcs12 table is very, very sparsely populated. */ if (!vmcs12_field_offsets[i]) continue;
msrs->procbased_ctls_high = vmcs_conf->cpu_based_exec_ctrl;
msrs->procbased_ctls_high &=
CPU_BASED_INTR_WINDOW_EXITING |
CPU_BASED_NMI_WINDOW_EXITING | CPU_BASED_USE_TSC_OFFSETTING |
CPU_BASED_HLT_EXITING | CPU_BASED_INVLPG_EXITING |
CPU_BASED_MWAIT_EXITING | CPU_BASED_CR3_LOAD_EXITING |
CPU_BASED_CR3_STORE_EXITING | #ifdef CONFIG_X86_64
CPU_BASED_CR8_LOAD_EXITING | CPU_BASED_CR8_STORE_EXITING | #endif
CPU_BASED_MOV_DR_EXITING | CPU_BASED_UNCOND_IO_EXITING |
CPU_BASED_USE_IO_BITMAPS | CPU_BASED_MONITOR_TRAP_FLAG |
CPU_BASED_MONITOR_EXITING | CPU_BASED_RDPMC_EXITING |
CPU_BASED_RDTSC_EXITING | CPU_BASED_PAUSE_EXITING |
CPU_BASED_TPR_SHADOW | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; /* * We can allow some features even when not supported by the * hardware. For example, L1 can specify an MSR bitmap - and we * can use it to avoid exits to L1 - even when L0 runs L2 * without MSR bitmaps.
*/
msrs->procbased_ctls_high |=
CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
CPU_BASED_USE_MSR_BITMAPS;
/* We support free control of CR3 access interception. */
msrs->procbased_ctls_low &=
~(CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING);
}
/* * Advertise EPTP switching irrespective of hardware support, * KVM emulates it in software so long as VMFUNC is supported.
*/ if (cpu_has_vmx_vmfunc())
msrs->vmfunc_controls = VMX_VMFUNC_EPTP_SWITCHING;
}
/* * Old versions of KVM use the single-context version without * checking for support, so declare that it is supported even * though it is treated as global context. The alternative is * not failing the single-context invvpid, and it is worse.
*/ if (enable_vpid) {
msrs->secondary_ctls_high |=
SECONDARY_EXEC_ENABLE_VPID;
msrs->vpid_caps = VMX_VPID_INVVPID_BIT |
VMX_VPID_EXTENT_SUPPORTED_MASK;
}
if (enable_unrestricted_guest)
msrs->secondary_ctls_high |=
SECONDARY_EXEC_UNRESTRICTED_GUEST;
if (flexpriority_enabled)
msrs->secondary_ctls_high |=
SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
if (enable_sgx)
msrs->secondary_ctls_high |= SECONDARY_EXEC_ENCLS_EXITING;
}
staticvoid nested_vmx_setup_basic(struct nested_vmx_msrs *msrs)
{ /* * This MSR reports some information about VMX support. We * should return information about the VMX we emulate for the * guest, and the VMCS structure we give it - not about the * VMX support of the underlying hardware.
*/
msrs->basic = vmx_basic_encode_vmcs_info(VMCS12_REVISION, VMCS12_SIZE,
X86_MEMTYPE_WB);
msrs->basic |= VMX_BASIC_TRUE_CTLS; if (cpu_has_vmx_basic_inout())
msrs->basic |= VMX_BASIC_INOUT;
}
staticvoid nested_vmx_setup_cr_fixed(struct nested_vmx_msrs *msrs)
{ /* * These MSRs specify bits which the guest must keep fixed on * while L1 is in VMXON mode (in L1's root mode, or running an L2). * We picked the standard core2 setting.
*/ #define VMXON_CR0_ALWAYSON (X86_CR0_PE | X86_CR0_PG | X86_CR0_NE) #define VMXON_CR4_ALWAYSON X86_CR4_VMXE
msrs->cr0_fixed0 = VMXON_CR0_ALWAYSON;
msrs->cr4_fixed0 = VMXON_CR4_ALWAYSON;
/* These MSRs specify bits which the guest must keep fixed off. */
rdmsrq(MSR_IA32_VMX_CR0_FIXED1, msrs->cr0_fixed1);
rdmsrq(MSR_IA32_VMX_CR4_FIXED1, msrs->cr4_fixed1);
if (vmx_umip_emulated())
msrs->cr4_fixed1 |= X86_CR4_UMIP;
}
/* * nested_vmx_setup_ctls_msrs() sets up variables containing the values to be * returned for the various VMX controls MSRs when nested VMX is enabled. * The same values should also be used to verify that vmcs12 control fields are * valid during nested entry from L1 to L2. * Each of these control msrs has a low and high 32-bit half: A low bit is on * if the corresponding bit in the (32-bit) control field *must* be on, and a * bit in the high half is on if the corresponding bit in the control field * may be on. See also vmx_control_verify().
*/ void nested_vmx_setup_ctls_msrs(struct vmcs_config *vmcs_conf, u32 ept_caps)
{ struct nested_vmx_msrs *msrs = &vmcs_conf->nested;
/* * Note that as a general rule, the high half of the MSRs (bits in * the control fields which may be 1) should be initialized by the * intersection of the underlying hardware's MSR (i.e., features which * can be supported) and the list of features we want to expose - * because they are known to be properly supported in our code. * Also, usually, the low half of the MSRs (bits which must be 1) can * be set to 0, meaning that L1 may turn off any of these bits. The * reason is that if one of these bits is necessary, it will appear * in vmcs01 and prepare_vmcs02, when it bitwise-or's the control * fields of vmcs01 and vmcs02, will turn these bits off - and * nested_vmx_l1_wants_exit() will not pass related exits to L1. * These rules have exceptions below.
*/
nested_vmx_setup_pinbased_ctls(vmcs_conf, msrs);
if (enable_shadow_vmcs) { for (i = 0; i < VMX_BITMAP_NR; i++)
free_page((unsignedlong)vmx_bitmap[i]);
}
}
__init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *))
{ int i;
if (!cpu_has_vmx_shadow_vmcs())
enable_shadow_vmcs = 0; if (enable_shadow_vmcs) { for (i = 0; i < VMX_BITMAP_NR; i++) { /* * The vmx_bitmap is not tied to a VM and so should * not be charged to a memcg.
*/
vmx_bitmap[i] = (unsignedlong *)
__get_free_page(GFP_KERNEL); if (!vmx_bitmap[i]) {
nested_vmx_hardware_unsetup(); return -ENOMEM;
}
}
¤ Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereit gestellten Informationen zugesichert.0.140Bemerkung:
(vorverarbeitet am 2026-04-26)
¤
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereit gestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.