/* SPDX-License-Identifier: GPL-2.0-only */ /* * Kernel-based Virtual Machine driver for Linux * * This header defines architecture specific interfaces, x86 version
*/
/* * CONFIG_KVM_MAX_NR_VCPUS is defined iff CONFIG_KVM!=n, provide a dummy max if * KVM is disabled (arbitrarily use the default from CONFIG_KVM_MAX_NR_VCPUS).
*/ #ifdef CONFIG_KVM_MAX_NR_VCPUS #define KVM_MAX_VCPUS CONFIG_KVM_MAX_NR_VCPUS #else #define KVM_MAX_VCPUS 1024 #endif
/* * In x86, the VCPU ID corresponds to the APIC ID, and APIC IDs * might be larger than the actual number of VCPUs because the * APIC ID encodes CPU topology information. * * In the worst case, we'll need less than one extra bit for the * Core ID, and less than one extra bit for the Package (Die) ID, * so ratio of 4 should be enough.
*/ #define KVM_VCPU_ID_RATIO 4 #define KVM_MAX_VCPU_IDS (KVM_MAX_VCPUS * KVM_VCPU_ID_RATIO)
/* memory slots that are not exposed to userspace */ #define KVM_INTERNAL_MEM_SLOTS 3
struct x86_emulate_ctxt; struct x86_exception; union kvm_smram; enum x86_intercept; enum x86_intercept_stage;
#define KVM_NR_DB_REGS 4
#define DR6_BUS_LOCK (1 << 11) #define DR6_BD (1 << 13) #define DR6_BS (1 << 14) #define DR6_BT (1 << 15) #define DR6_RTM (1 << 16) /* * DR6_ACTIVE_LOW combines fixed-1 and active-low bits. * We can regard all the bits in DR6_FIXED_1 as active_low bits; * they will never be 0 for now, but when they are defined * in the future it will require no code change. * * DR6_ACTIVE_LOW is also used as the init/reset value for DR6.
*/ #define DR6_ACTIVE_LOW 0xffff0ff0 #define DR6_VOLATILE 0x0001e80f #define DR6_FIXED_1 (DR6_ACTIVE_LOW & ~DR6_VOLATILE)
/* * IMPLICIT_ACCESS is a KVM-defined flag used to correctly perform SMAP checks * when emulating instructions that triggers implicit access.
*/ #define PFERR_IMPLICIT_ACCESS BIT_ULL(48) /* * PRIVATE_ACCESS is a KVM-defined flag us to indicate that a fault occurred * when the guest was accessing private memory.
*/ #define PFERR_PRIVATE_ACCESS BIT_ULL(49) #define PFERR_SYNTHETIC_MASK (PFERR_IMPLICIT_ACCESS | PFERR_PRIVATE_ACCESS)
/* apic attention bits */ #define KVM_APIC_CHECK_VAPIC 0 /* * The following bit is set with PV-EOI, unset on EOI. * We detect PV-EOI changes by guest by comparing * this bit with PV-EOI in guest memory. * See the implementation in apic_update_pv_eoi.
*/ #define KVM_APIC_PV_EOI_PENDING 1
/* * kvm_mmu_page_role tracks the properties of a shadow page (where shadow page * also includes TDP pages) to determine whether or not a page can be used in * the given MMU context. This is a subset of the overall kvm_cpu_role to * minimize the size of kvm_memory_slot.arch.gfn_write_track, i.e. allows * allocating 2 bytes per gfn instead of 4 bytes per gfn. * * Upper-level shadow pages having gptes are tracked for write-protection via * gfn_write_track. As above, gfn_write_track is a 16 bit counter, so KVM must * not create more than 2^16-1 upper-level shadow pages at a single gfn, * otherwise gfn_write_track will overflow and explosions will ensue. * * A unique shadow page (SP) for a gfn is created if and only if an existing SP * cannot be reused. The ability to reuse a SP is tracked by its role, which * incorporates various mode bits and properties of the SP. Roughly speaking, * the number of unique SPs that can theoretically be created is 2^n, where n * is the number of bits that are used to compute the role. * * But, even though there are 20 bits in the mask below, not all combinations * of modes and flags are possible: * * - invalid shadow pages are not accounted, mirror pages are not shadowed, * so the bits are effectively 18. * * - quadrant will only be used if has_4_byte_gpte=1 (non-PAE paging); * execonly and ad_disabled are only used for nested EPT which has * has_4_byte_gpte=0. Therefore, 2 bits are always unused. * * - the 4 bits of level are effectively limited to the values 2/3/4/5, * as 4k SPs are not tracked (allowed to go unsync). In addition non-PAE * paging has exactly one upper level, making level completely redundant * when has_4_byte_gpte=1. * * - on top of this, smep_andnot_wp and smap_andnot_wp are only set if * cr0_wp=0, therefore these three bits only give rise to 5 possibilities. * * Therefore, the maximum number of possible upper-level shadow pages for a * single gfn is a bit less than 2^13.
*/ union kvm_mmu_page_role {
u32 word; struct { unsigned level:4; unsigned has_4_byte_gpte:1; unsigned quadrant:2; unsigned direct:1; unsigned access:3; unsigned invalid:1; unsigned efer_nx:1; unsigned cr0_wp:1; unsigned smep_andnot_wp:1; unsigned smap_andnot_wp:1; unsigned ad_disabled:1; unsigned guest_mode:1; unsigned passthrough:1; unsigned is_mirror:1; unsigned :4;
/* * This is left at the top of the word so that * kvm_memslots_for_spte_role can extract it with a * simple shift. While there is room, give it a whole * byte so it is also faster to load it from memory.
*/ unsigned smm:8;
};
};
/* * kvm_mmu_extended_role complements kvm_mmu_page_role, tracking properties * relevant to the current MMU configuration. When loading CR0, CR4, or EFER, * including on nested transitions, if nothing in the full role changes then * MMU re-configuration can be skipped. @valid bit is set on first usage so we * don't treat all-zero structure as valid data. * * The properties that are tracked in the extended role but not the page role * are for things that either (a) do not affect the validity of the shadow page * or (b) are indirectly reflected in the shadow page's role. For example, * CR4.PKE only affects permission checks for software walks of the guest page * tables (because KVM doesn't support Protection Keys with shadow paging), and * CR0.PG, CR4.PAE, and CR4.PSE are indirectly reflected in role.level. * * Note, SMEP and SMAP are not redundant with sm*p_andnot_wp in the page role. * If CR0.WP=1, KVM can reuse shadow pages for the guest regardless of SMEP and * SMAP, but the MMU's permission checks for software walks need to be SMEP and * SMAP aware regardless of CR0.WP.
*/ union kvm_mmu_extended_role {
u32 word; struct { unsignedint valid:1; unsignedint execonly:1; unsignedint cr4_pse:1; unsignedint cr4_pke:1; unsignedint cr4_smap:1; unsignedint cr4_smep:1; unsignedint cr4_la57:1; unsignedint efer_lma:1;
};
};
union kvm_cpu_role {
u64 as_u64; struct { union kvm_mmu_page_role base; union kvm_mmu_extended_role ext;
};
};
struct kvm_rmap_head {
atomic_long_t val;
};
struct kvm_pio_request { unsignedlong count; int in; int port; int size;
};
/* * x86 supports 4 paging modes (5-level 64-bit, 4-level 64-bit, 3-level 32-bit, * and 2-level 32-bit). The kvm_mmu structure abstracts the details of the * current mmu mode.
*/ struct kvm_mmu { unsignedlong (*get_guest_pgd)(struct kvm_vcpu *vcpu);
u64 (*get_pdptr)(struct kvm_vcpu *vcpu, int index); int (*page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault); void (*inject_page_fault)(struct kvm_vcpu *vcpu, struct x86_exception *fault);
gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
gpa_t gva_or_gpa, u64 access, struct x86_exception *exception); int (*sync_spte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, int i); struct kvm_mmu_root_info root;
hpa_t mirror_root_hpa; union kvm_cpu_role cpu_role; union kvm_mmu_page_role root_role;
/* * The pkru_mask indicates if protection key checks are needed. It * consists of 16 domains indexed by page fault error code bits [4:1], * with PFEC.RSVD replaced by ACC_USER_MASK from the page tables. * Each domain has 2 bits which are ANDed with AD and WD from PKRU.
*/
u32 pkru_mask;
/* * Bitmap; bit set = permission fault * Byte index: page fault error code [4:1] * Bit index: pte permissions in ACC_* format
*/
u8 permissions[16];
u64 *pae_root;
u64 *pml4_root;
u64 *pml5_root;
/* * check zero bits on shadow page table entries, these * bits include not only hardware reserved bits but also * the bits spte never used.
*/ struct rsvd_bits_validate shadow_zero_check;
struct rsvd_bits_validate guest_rsvd_check;
u64 pdptrs[4]; /* pae */
};
enum pmc_type {
KVM_PMC_GP = 0,
KVM_PMC_FIXED,
};
struct kvm_pmc { enum pmc_type type;
u8 idx; bool is_paused; bool intr; /* * Base value of the PMC counter, relative to the *consumed* count in * the associated perf_event. This value includes counter updates from * the perf_event and emulated_count since the last time the counter * was reprogrammed, but it is *not* the current value as seen by the * guest or userspace. * * The count is relative to the associated perf_event so that KVM * doesn't need to reprogram the perf_event every time the guest writes * to the counter.
*/
u64 counter; /* * PMC events triggered by KVM emulation that haven't been fully * processed, i.e. haven't undergone overflow detection.
*/
u64 emulated_counter;
u64 eventsel; struct perf_event *perf_event; struct kvm_vcpu *vcpu; /* * only for creating or reusing perf_event, * eventsel value for general purpose counters, * ctrl value for fixed counters.
*/
u64 current_config;
};
/* More counters may conflict with other existing Architectural MSRs */ #define KVM_MAX(a, b) ((a) >= (b) ? (a) : (b)) #define KVM_MAX_NR_INTEL_GP_COUNTERS 8 #define KVM_MAX_NR_AMD_GP_COUNTERS 6 #define KVM_MAX_NR_GP_COUNTERS KVM_MAX(KVM_MAX_NR_INTEL_GP_COUNTERS, \
KVM_MAX_NR_AMD_GP_COUNTERS)
/* * Overlay the bitmap with a 64-bit atomic so that all bits can be * set in a single access, e.g. to reprogram all counters when the PMU * filter changes.
*/ union {
DECLARE_BITMAP(reprogram_pmi, X86_PMC_IDX_MAX);
atomic64_t __reprogram_pmi;
};
DECLARE_BITMAP(all_valid_pmc_idx, X86_PMC_IDX_MAX);
DECLARE_BITMAP(pmc_in_use, X86_PMC_IDX_MAX);
/* * If a guest counter is cross-mapped to host counter with different * index, its PEBS capability will be temporarily disabled. * * The user should make sure that this mask is updated * after disabling interrupts and before perf_guest_get_msrs();
*/
u64 host_cross_mapped_mask;
/* * The gate to release perf_events not marked in * pmc_in_use only once in a vcpu time slice.
*/ bool need_cleanup;
/* * The total number of programmed perf_events and it helps to avoid * redundant check before cleanup if guest don't use vPMU at all.
*/
u8 event_count;
};
struct kvm_pmu_ops;
enum {
KVM_DEBUGREG_BP_ENABLED = BIT(0),
KVM_DEBUGREG_WONT_EXIT = BIT(1), /* * Guest debug registers (DR0-3, DR6 and DR7) are saved/restored by * hardware on exit from or enter to guest. KVM needn't switch them. * DR0-3, DR6 and DR7 are set to their architectural INIT value on VM * exit, host values need to be restored.
*/
KVM_DEBUGREG_AUTO_SWITCH = BIT(2),
};
/* The maximum number of entries on the TLB flush fifo. */ #define KVM_HV_TLB_FLUSH_FIFO_SIZE (16) /* * Note: the following 'magic' entry is made up by KVM to avoid putting * anything besides GVA on the TLB flush fifo. It is theoretically possible * to observe a request to flush 4095 PFNs starting from 0xfffffffffffff000 * which will look identical. KVM's action to 'flush everything' instead of * flushing these particular addresses is, however, fully legitimate as * flushing more than requested is always OK.
*/ #define KVM_HV_TLB_FLUSHALL_ENTRY ((u64)-1)
/* * Preallocated buffers for handling hypercalls that pass sparse vCPU * sets (for high vCPU counts, they're too large to comfortably fit on * the stack).
*/
u64 sparse_banks[HV_MAX_SPARSE_VCPU_BANKS];
DECLARE_BITMAP(vcpu_mask, KVM_MAX_VCPUS);
/* * Hardware-defined CPUID leafs that are either scattered by the kernel or are * unknown to the kernel, but need to be directly used by KVM. Note, these * word values conflict with the kernel's "bug" caps, but KVM doesn't use those.
*/ enum kvm_only_cpuid_leafs {
CPUID_12_EAX = NCAPINTS,
CPUID_7_1_EDX,
CPUID_8000_0007_EDX,
CPUID_8000_0022_EAX,
CPUID_7_2_EDX,
CPUID_24_0_EBX,
CPUID_8000_0021_ECX,
NR_KVM_CPU_CAPS,
NKVMCAPINTS = NR_KVM_CPU_CAPS - NCAPINTS,
};
struct kvm_vcpu_arch { /* * rip and regs accesses must go through * kvm_{register,rip}_{read,write} functions.
*/ unsignedlong regs[NR_VCPU_REGS];
u32 regs_avail;
u32 regs_dirty;
/* * Paging state of the vcpu * * If the vcpu runs in guest mode with two level paging this still saves * the paging mode of the l1 guest. This context is always used to * handle faults.
*/ struct kvm_mmu *mmu;
/* Non-nested MMU for L1 */ struct kvm_mmu root_mmu;
/* L1 MMU when running nested */ struct kvm_mmu guest_mmu;
/* * Paging state of an L2 guest (used for nested npt) * * This context will save all necessary information to walk page tables * of an L2 guest. This context is only initialized for page table * walking and not for faulting since we never handle l2 page faults on * the host.
*/ struct kvm_mmu nested_mmu;
/* * Pointer to the mmu context currently used for * gva_to_gpa translations.
*/ struct kvm_mmu *walk_mmu;
struct kvm_mmu_memory_cache mmu_pte_list_desc_cache; struct kvm_mmu_memory_cache mmu_shadow_page_cache; struct kvm_mmu_memory_cache mmu_shadowed_info_cache; struct kvm_mmu_memory_cache mmu_page_header_cache; /* * This cache is to allocate external page table. E.g. private EPT used * by the TDX module.
*/ struct kvm_mmu_memory_cache mmu_external_spt_cache;
/* * QEMU userspace and the guest each have their own FPU state. * In vcpu_run, we switch between the user and guest FPU contexts. * While running a VCPU, the VCPU thread will have the guest FPU * context. * * Note that while the PKRU state lives inside the fpu registers, * it is switched out separately at VMENTER and VMEXIT time. The * "guest_fpstate" state here contains the guest FPU context, with the * host PRKU bits.
*/ struct fpu_guest guest_fpu;
/* Exceptions to be injected to the guest. */ struct kvm_queued_exception exception; /* Exception VM-Exits to be synthesized to L1. */ struct kvm_queued_exception exception_vmexit;
int cpuid_nent; struct kvm_cpuid_entry2 *cpuid_entries; bool cpuid_dynamic_bits_dirty; bool is_amd_compatible;
/* * cpu_caps holds the effective guest capabilities, i.e. the features * the vCPU is allowed to use. Typically, but not always, features can * be used by the guest if and only if both KVM and userspace want to * expose the feature to the guest. * * A common exception is for virtualization holes, i.e. when KVM can't * prevent the guest from using a feature, in which case the vCPU "has" * the feature regardless of what KVM or userspace desires. * * Note, features that don't require KVM involvement in any way are * NOT enforced/sanitized by KVM, i.e. are taken verbatim from the * guest CPUID provided by userspace.
*/
u32 cpu_caps[NR_KVM_CPU_CAPS];
u64 reserved_gpa_bits; int maxphyaddr;
/* emulate context */
struct x86_emulate_ctxt *emulate_ctxt; bool emulate_regs_need_sync_to_vcpu; bool emulate_regs_need_sync_from_vcpu; int (*complete_userspace_io)(struct kvm_vcpu *vcpu); unsignedlong cui_linear_rip; int cui_rdmsr_imm_reg;
gpa_t time;
s8 pvclock_tsc_shift;
u32 pvclock_tsc_mul; unsignedint hw_tsc_khz; struct gfn_to_pfn_cache pv_time; /* set guest stopped flag in pvclock flags field */ bool pvclock_set_guest_stopped_request;
atomic_t nmi_queued; /* unprocessed asynchronous NMIs */ /* Number of NMIs pending injection, not including hardware vNMIs. */ unsignedint nmi_pending; bool nmi_injected; /* Trying to inject an NMI this entry */ bool smi_pending; /* SMI queued after currently running handler */
u8 handling_intr_from_guest;
/* * Track the mode of the optimized logical map, as the rules for decoding the * destination vary per mode. Enabling the optimized logical map requires all * software-enabled local APIs to be in the same mode, each addressable APIC to * be mapped to only one MDA, and each MDA to map to at most one APIC.
*/ enum kvm_apic_logical_mode { /* All local APICs are software disabled. */
KVM_APIC_MODE_SW_DISABLED, /* All software enabled local APICs in xAPIC cluster addressing mode. */
KVM_APIC_MODE_XAPIC_CLUSTER, /* All software enabled local APICs in xAPIC flat addressing mode. */
KVM_APIC_MODE_XAPIC_FLAT, /* All software enabled local APICs in x2APIC mode. */
KVM_APIC_MODE_X2APIC, /* * Optimized map disabled, e.g. not all local APICs in the same logical * mode, same logical ID assigned to multiple APICs, etc.
*/
KVM_APIC_MODE_MAP_DISABLED,
};
/* Current state of Hyper-V TSC page clocksource */ enum hv_tsc_page_status { /* TSC page was not set up or disabled */
HV_TSC_PAGE_UNSET = 0, /* TSC page MSR was written by the guest, update pending */
HV_TSC_PAGE_GUEST_CHANGED, /* TSC page update was triggered from the host side */
HV_TSC_PAGE_HOST_CHANGED, /* TSC page was properly set up and is currently active */
HV_TSC_PAGE_SET, /* TSC page was set up with an inaccessible GPA */
HV_TSC_PAGE_BROKEN,
};
enum kvm_irqchip_mode {
KVM_IRQCHIP_NONE,
KVM_IRQCHIP_KERNEL, /* created with KVM_CREATE_IRQCHIP */
KVM_IRQCHIP_SPLIT, /* created with KVM_CAP_SPLIT_IRQCHIP */
};
/********************************************************************/ /* INHIBITs that are relevant to both Intel's APICv and AMD's AVIC. */ /********************************************************************/
/* * APIC acceleration is disabled by a module parameter * and/or not supported in hardware.
*/
APICV_INHIBIT_REASON_DISABLED,
/* * APIC acceleration is inhibited because AutoEOI feature is * being used by a HyperV guest.
*/
APICV_INHIBIT_REASON_HYPERV,
/* * APIC acceleration is inhibited because the userspace didn't yet * enable the kernel/split irqchip.
*/
APICV_INHIBIT_REASON_ABSENT,
/* APIC acceleration is inhibited because KVM_GUESTDBG_BLOCKIRQ * (out of band, debug measure of blocking all interrupts on this vCPU) * was enabled, to avoid AVIC/APICv bypassing it.
*/
APICV_INHIBIT_REASON_BLOCKIRQ,
/* * APICv is disabled because not all vCPUs have a 1:1 mapping between * APIC ID and vCPU, _and_ KVM is not applying its x2APIC hotplug hack.
*/
APICV_INHIBIT_REASON_PHYSICAL_ID_ALIASED,
/* * For simplicity, the APIC acceleration is inhibited * first time either APIC ID or APIC base are changed by the guest * from their reset values.
*/
APICV_INHIBIT_REASON_APIC_ID_MODIFIED,
APICV_INHIBIT_REASON_APIC_BASE_MODIFIED,
/******************************************************/ /* INHIBITs that are relevant only to the AMD's AVIC. */ /******************************************************/
/* * AVIC is inhibited on a vCPU because it runs a nested guest. * * This is needed because unlike APICv, the peers of this vCPU * cannot use the doorbell mechanism to signal interrupts via AVIC when * a vCPU runs nested.
*/
APICV_INHIBIT_REASON_NESTED,
/* * On SVM, the wait for the IRQ window is implemented with pending vIRQ, * which cannot be injected when the AVIC is enabled, thus AVIC * is inhibited while KVM waits for IRQ window.
*/
APICV_INHIBIT_REASON_IRQWIN,
/* * PIT (i8254) 're-inject' mode, relies on EOI intercept, * which AVIC doesn't support for edge triggered interrupts.
*/
APICV_INHIBIT_REASON_PIT_REINJ,
/* * AVIC is disabled because SEV doesn't support it.
*/
APICV_INHIBIT_REASON_SEV,
/* * AVIC is disabled because not all vCPUs with a valid LDR have a 1:1 * mapping between logical ID and vCPU.
*/
APICV_INHIBIT_REASON_LOGICAL_ID_ALIASED,
/* * AVIC is disabled because the vCPU's APIC ID is beyond the max * supported by AVIC/x2AVIC, i.e. the vCPU is unaddressable.
*/
APICV_INHIBIT_REASON_PHYSICAL_ID_TOO_BIG,
struct kvm_arch { unsignedlong n_used_mmu_pages; unsignedlong n_requested_mmu_pages; unsignedlong n_max_mmu_pages; unsignedint indirect_shadow_pages;
u8 mmu_valid_gen;
u8 vm_type; bool has_private_mem; bool has_protected_state; bool pre_fault_allowed; struct hlist_head *mmu_page_hash; struct list_head active_mmu_pages; /* * A list of kvm_mmu_page structs that, if zapped, could possibly be * replaced by an NX huge page. A shadow page is on this list if its * existence disallows an NX huge page (nx_huge_page_disallowed is set) * and there are no other conditions that prevent a huge page, e.g. * the backing host page is huge, dirtly logging is not enabled for its * memslot, etc... Note, zapping shadow pages on this list doesn't * guarantee an NX huge page will be created in its stead, e.g. if the * guest attempts to execute from the region then KVM obviously can't * create an NX huge page (without hanging the guest).
*/ struct list_head possible_nx_huge_pages; #ifdef CONFIG_KVM_EXTERNAL_WRITE_TRACKING struct kvm_page_track_notifier_head track_notifier_head; #endif /* * Protects marking pages unsync during page faults, as TDP MMU page * faults only take mmu_lock for read. For simplicity, the unsync * pages lock is always taken when marking pages unsync regardless of * whether mmu_lock is held for read or write.
*/
spinlock_t mmu_unsync_pages_lock;
/* * This also protects nr_vcpus_matched_tsc which is read from a * preemption-disabled region, so it must be a raw spinlock.
*/
raw_spinlock_t tsc_write_lock;
u64 last_tsc_nsec;
u64 last_tsc_write;
u32 last_tsc_khz;
u64 last_tsc_offset;
u64 cur_tsc_nsec;
u64 cur_tsc_write;
u64 cur_tsc_offset;
u64 cur_tsc_generation; int nr_vcpus_matched_tsc;
u32 notify_window;
u32 notify_vmexit_flags; /* * If exit_on_emulation_error is set, and the in-kernel instruction * emulator fails to emulate an instruction, allow userspace * the opportunity to look at it.
*/ bool exit_on_emulation_error;
/* Deflect RDMSR and WRMSR to user space when they trigger a #GP */
u32 user_space_msr_mask; struct kvm_x86_msr_filter __rcu *msr_filter;
u32 hypercall_exit_enabled;
/* Guest can access the SGX PROVISIONKEY. */ bool sgx_provisioning_allowed;
#ifdef CONFIG_X86_64 #ifdef CONFIG_KVM_PROVE_MMU /* * The number of TDP MMU pages across all roots. Used only to sanity * check that KVM isn't leaking TDP MMU pages.
*/
atomic64_t tdp_mmu_pages; #endif
/* * List of struct kvm_mmu_pages being used as roots. * All struct kvm_mmu_pages in the list should have * tdp_mmu_page set. * * For reads, this list is protected by: * RCU alone or * the MMU lock in read mode + RCU or * the MMU lock in write mode * * For writes, this list is protected by tdp_mmu_pages_lock; see * below for the details. * * Roots will remain in the list until their tdp_mmu_root_count * drops to zero, at which point the thread that decremented the * count to zero should removed the root from the list and clean * it up, freeing the root after an RCU grace period.
*/ struct list_head tdp_mmu_roots;
/* * Protects accesses to the following fields when the MMU lock * is held in read mode: * - tdp_mmu_roots (above) * - the link field of kvm_mmu_page structs used by the TDP MMU * - possible_nx_huge_pages; * - the possible_nx_huge_page_link field of kvm_mmu_page structs used * by the TDP MMU * Because the lock is only taken within the MMU lock, strictly * speaking it is redundant to acquire this lock when the thread * holds the MMU lock in write mode. However it often simplifies * the code to do so.
*/
spinlock_t tdp_mmu_pages_lock; #endif/* CONFIG_X86_64 */
/* * If set, at least one shadow root has been allocated. This flag * is used as one input when determining whether certain memslot * related allocations are necessary.
*/ bool shadow_root_allocated;
#ifdef CONFIG_KVM_EXTERNAL_WRITE_TRACKING /* * If set, the VM has (or had) an external write tracking user, and * thus all write tracking metadata has been allocated, even if KVM * itself isn't using write tracking.
*/ bool external_write_tracking_enabled; #endif
#if IS_ENABLED(CONFIG_HYPERV)
hpa_t hv_root_tdp;
spinlock_t hv_root_tdp_lock; struct hv_partition_assist_pg *hv_pa_pg; #endif /* * VM-scope maximum vCPU ID. Used to determine the size of structures * that increase along with the maximum vCPU ID, in which case, using * the global KVM_MAX_VCPU_IDS may lead to significant memory waste.
*/
u32 max_vcpu_ids;
bool disable_nx_huge_pages;
/* * Memory caches used to allocate shadow pages when performing eager * page splitting. No need for a shadowed_info_cache since eager page * splitting only allocates direct shadow pages. * * Protected by kvm->slots_lock.
*/ struct kvm_mmu_memory_cache split_shadow_page_cache; struct kvm_mmu_memory_cache split_page_header_cache;
/* * Memory cache used to allocate pte_list_desc structs while splitting * huge pages. In the worst case, to split one huge page, 512 * pte_list_desc structs are needed to add each lower level leaf sptep * to the rmap plus 1 to extend the parent_ptes rmap of the lower level * page table. * * Protected by kvm->slots_lock.
*/ #define SPLIT_DESC_CACHE_MIN_NR_OBJECTS (SPTE_ENT_PER_PAGE + 1) struct kvm_mmu_memory_cache split_desc_cache;
gfn_t gfn_direct_bits;
/* * Size of the CPU's dirty log buffer, i.e. VMX's PML buffer. A Zero * value indicates CPU dirty logging is unsupported or disabled in * current VM.
*/ int cpu_dirty_log_size;
};
/* Create, but do not attach this VCPU */ int (*vcpu_precreate)(struct kvm *kvm); int (*vcpu_create)(struct kvm_vcpu *vcpu); void (*vcpu_free)(struct kvm_vcpu *vcpu); void (*vcpu_reset)(struct kvm_vcpu *vcpu, bool init_event);
/* * Mask of DEBUGCTL bits that are owned by the host, i.e. that need to * match the host's value even while the guest is active.
*/ const u64 HOST_OWNED_DEBUGCTL;
/* * Flush any TLB entries associated with the given GVA. * Does not need to flush GPA->HPA mappings. * Can potentially get non-canonical addresses through INVLPGs, which * the implementation may choose to ignore if appropriate.
*/ void (*flush_tlb_gva)(struct kvm_vcpu *vcpu, gva_t addr);
/* * Flush any TLB entries created by the guest. Like tlb_flush_gva(), * does not need to flush GPA->HPA mappings.
*/ void (*flush_tlb_guest)(struct kvm_vcpu *vcpu);
int (*vcpu_pre_run)(struct kvm_vcpu *vcpu); enum exit_fastpath_completion (*vcpu_run)(struct kvm_vcpu *vcpu,
u64 run_flags); int (*handle_exit)(struct kvm_vcpu *vcpu, enum exit_fastpath_completion exit_fastpath); int (*skip_emulated_instruction)(struct kvm_vcpu *vcpu); void (*update_emulated_instruction)(struct kvm_vcpu *vcpu); void (*set_interrupt_shadow)(struct kvm_vcpu *vcpu, int mask);
u32 (*get_interrupt_shadow)(struct kvm_vcpu *vcpu); void (*patch_hypercall)(struct kvm_vcpu *vcpu, unsignedchar *hypercall_addr); void (*inject_irq)(struct kvm_vcpu *vcpu, bool reinjected); void (*inject_nmi)(struct kvm_vcpu *vcpu); void (*inject_exception)(struct kvm_vcpu *vcpu); void (*cancel_injection)(struct kvm_vcpu *vcpu); int (*interrupt_allowed)(struct kvm_vcpu *vcpu, bool for_injection); int (*nmi_allowed)(struct kvm_vcpu *vcpu, bool for_injection); bool (*get_nmi_mask)(struct kvm_vcpu *vcpu); void (*set_nmi_mask)(struct kvm_vcpu *vcpu, bool masked); /* Whether or not a virtual NMI is pending in hardware. */ bool (*is_vnmi_pending)(struct kvm_vcpu *vcpu); /* * Attempt to pend a virtual NMI in hardware. Returns %true on success * to allow using static_call_ret0 as the fallback.
*/ bool (*set_vnmi_pending)(struct kvm_vcpu *vcpu); void (*enable_nmi_window)(struct kvm_vcpu *vcpu); void (*enable_irq_window)(struct kvm_vcpu *vcpu); void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr);
constbool x2apic_icr_is_split; constunsignedlong required_apicv_inhibits; bool allow_apicv_in_x2apic_without_x2apic_virtualization; void (*refresh_apicv_exec_ctrl)(struct kvm_vcpu *vcpu); void (*hwapic_isr_update)(struct kvm_vcpu *vcpu, int isr); void (*load_eoi_exitmap)(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap); void (*set_virtual_apic_mode)(struct kvm_vcpu *vcpu); void (*set_apic_access_page_addr)(struct kvm_vcpu *vcpu); void (*deliver_interrupt)(struct kvm_lapic *apic, int delivery_mode, int trig_mode, int vector); int (*sync_pir_to_irr)(struct kvm_vcpu *vcpu); int (*set_tss_addr)(struct kvm *kvm, unsignedint addr); int (*set_identity_map_addr)(struct kvm *kvm, u64 ident_addr);
u8 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio);
void (*load_mmu_pgd)(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level);
/* Update external mapping with page table link. */ int (*link_external_spt)(struct kvm *kvm, gfn_t gfn, enum pg_level level, void *external_spt); /* Update the external page table from spte getting set. */ int (*set_external_spte)(struct kvm *kvm, gfn_t gfn, enum pg_level level,
kvm_pfn_t pfn_for_gfn);
/* Update external page tables for page table about to be freed. */ int (*free_external_spt)(struct kvm *kvm, gfn_t gfn, enum pg_level level, void *external_spt);
/* Update external page table from spte getting removed, and flush TLB. */ int (*remove_external_spte)(struct kvm *kvm, gfn_t gfn, enum pg_level level,
kvm_pfn_t pfn_for_gfn);
/* * Retrieve somewhat arbitrary exit/entry information. Intended to * be used only from within tracepoints or error paths.
*/ void (*get_exit_info)(struct kvm_vcpu *vcpu, u32 *reason,
u64 *info1, u64 *info2,
u32 *intr_info, u32 *error_code);
int load_pdptrs(struct kvm_vcpu *vcpu, unsignedlong cr3);
int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, constvoid *val, int bytes);
externbool tdp_enabled;
u64 vcpu_tsc_khz(struct kvm_vcpu *vcpu);
/* * EMULTYPE_NO_DECODE - Set when re-emulating an instruction (after completing * userspace I/O) to indicate that the emulation context * should be reused as is, i.e. skip initialization of * emulation context, instruction fetch and decode. * * EMULTYPE_TRAP_UD - Set when emulating an intercepted #UD from hardware. * Indicates that only select instructions (tagged with * EmulateOnUD) should be emulated (to minimize the emulator * attack surface). See also EMULTYPE_TRAP_UD_FORCED. * * EMULTYPE_SKIP - Set when emulating solely to skip an instruction, i.e. to * decode the instruction length. For use *only* by * kvm_x86_ops.skip_emulated_instruction() implementations if * EMULTYPE_COMPLETE_USER_EXIT is not set. * * EMULTYPE_ALLOW_RETRY_PF - Set when the emulator should resume the guest to * retry native execution under certain conditions, * Can only be set in conjunction with EMULTYPE_PF. * * EMULTYPE_TRAP_UD_FORCED - Set when emulating an intercepted #UD that was * triggered by KVM's magic "force emulation" prefix, * which is opt in via module param (off by default). * Bypasses EmulateOnUD restriction despite emulating * due to an intercepted #UD (see EMULTYPE_TRAP_UD). * Used to test the full emulator from userspace. * * EMULTYPE_VMWARE_GP - Set when emulating an intercepted #GP for VMware * backdoor emulation, which is opt in via module param. * VMware backdoor emulation handles select instructions * and reinjects the #GP for all other cases. * * EMULTYPE_PF - Set when an intercepted #PF triggers the emulation, in which case * the CR2/GPA value pass on the stack is valid. * * EMULTYPE_COMPLETE_USER_EXIT - Set when the emulator should update interruptibility * state and inject single-step #DBs after skipping * an instruction (after completing userspace I/O). * * EMULTYPE_WRITE_PF_TO_SP - Set when emulating an intercepted page fault that * is attempting to write a gfn that contains one or * more of the PTEs used to translate the write itself, * and the owning page table is being shadowed by KVM. * If emulation of the faulting instruction fails and * this flag is set, KVM will exit to userspace instead * of retrying emulation as KVM cannot make forward * progress. * * If emulation fails for a write to guest page tables, * KVM unprotects (zaps) the shadow page for the target * gfn and resumes the guest to retry the non-emulatable * instruction (on hardware). Unprotecting the gfn * doesn't allow forward progress for a self-changing * access because doing so also zaps the translation for * the gfn, i.e. retrying the instruction will hit a * !PRESENT fault, which results in a new shadow page * and sends KVM back to square one.
*/ #define EMULTYPE_NO_DECODE (1 << 0) #define EMULTYPE_TRAP_UD (1 << 1) #define EMULTYPE_SKIP (1 << 2) #define EMULTYPE_ALLOW_RETRY_PF (1 << 3) #define EMULTYPE_TRAP_UD_FORCED (1 << 4) #define EMULTYPE_VMWARE_GP (1 << 5) #define EMULTYPE_PF (1 << 6) #define EMULTYPE_COMPLETE_USER_EXIT (1 << 7) #define EMULTYPE_WRITE_PF_TO_SP (1 << 8)
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereit gestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.