/*
 * Unlike regular MMU roots, PAE "roots", a.k.a. PDPTEs/PDPTRs, have a PRESENT
 * bit, and thus are guaranteed to be non-zero when valid.  And, when a guest
 * PDPTR is !PRESENT, its corresponding PAE root cannot be set to INVALID_PAGE,
 * as the CPU would treat that as a PRESENT PDPTR with reserved bits set.  Use
 * '0' instead of INVALID_PAGE to indicate an invalid PAE root.
 */
#define INVALID_PAE_ROOT	0
#define IS_VALID_PAE_ROOT(x)	((x) != INVALID_PAE_ROOT)
struct kvm_mmu_page {
	/*
	 * Note, "link" through "spt" fit in a single 64 byte cache line on
	 * 64-bit kernels, keep it that way unless there's a reason not to.
	 */
	struct list_head link;
	struct hlist_node hash_link;

	bool tdp_mmu_page;
	bool unsync;
	union {
		u8 mmu_valid_gen;

		/* Only accessed under slots_lock. */
		bool tdp_mmu_scheduled_root_to_zap;
	};

	/*
	 * The shadow page can't be replaced by an equivalent huge page
	 * because it is being used to map an executable page in the guest
	 * and the NX huge page mitigation is enabled.
	 */
	bool nx_huge_page_disallowed;

	/*
	 * The following two entries are used to key the shadow page in the
	 * hash table.
	 */
	union kvm_mmu_page_role role;
	gfn_t gfn;

	u64 *spt;

	/*
	 * Stores the result of the guest translation being shadowed by each
	 * SPTE.  KVM shadows two types of guest translations: nGPA -> GPA
	 * (shadow EPT/NPT) and GVA -> GPA (traditional shadow paging). In both
	 * cases the result of the translation is a GPA and a set of access
	 * constraints.
	 *
	 * The GFN is stored in the upper bits (PAGE_SHIFT) and the shadowed
	 * access permissions are stored in the lower bits. Note, for
	 * convenience and uniformity across guests, the access permissions are
	 * stored in KVM format (e.g. ACC_EXEC_MASK) not the raw guest format.
	 */
	u64 *shadowed_translation;

	/* Currently serving as active root */
	union {
		int root_count;
		refcount_t tdp_mmu_root_count;
	};
	bool has_mapped_host_mmio;
	union {
		/* These two members aren't used for TDP MMU */
		struct {
			/* BUGFIX: was the invalid token "unsignedint". */
			unsigned int unsync_children;
			/*
			 * Number of writes since the last time traversal
			 * visited this page.
			 */
			atomic_t write_flooding_count;
		};
		/*
		 * Page table page of external PT.
		 * Passed to TDX module, not accessed by KVM.
		 */
		void *external_spt;
	};
	union {
		struct kvm_rmap_head parent_ptes; /* rmap pointers to parent sptes */
		tdp_ptep_t ptep;
	};
	DECLARE_BITMAP(unsync_child_bitmap, 512);

	/*
	 * Tracks shadow pages that, if zapped, would allow KVM to create an NX
	 * huge page.  A shadow page will have nx_huge_page_disallowed set but
	 * not be on the list if a huge page is disallowed for other reasons,
	 * e.g. because KVM is shadowing a PTE at the same gfn, the memslot
	 * isn't properly aligned, etc...
	 */
	struct list_head possible_nx_huge_page_link;
#ifdef CONFIG_X86_32
	/*
	 * Used out of the mmu-lock to avoid reading spte values while an
	 * update is in progress; see the comments in __get_spte_lockless().
	 */
	int clear_spte_count;
#endif

#ifdef CONFIG_X86_64
	/* Used for freeing the page asynchronously if it is a TDP MMU page. */
	struct rcu_head rcu_head;
#endif
};
/*
 * Allocate the page that backs the external (TDX) page table for @sp.
 *
 * BUGFIX: was declared "staticinlinevoid" (missing spaces), which does not
 * compile.
 */
static inline void kvm_mmu_alloc_external_spt(struct kvm_vcpu *vcpu,
					      struct kvm_mmu_page *sp)
{
	/*
	 * external_spt is allocated for TDX module to hold private EPT mappings,
	 * TDX module will initialize the page by itself.
	 * Therefore, KVM does not need to initialize or access external_spt.
	 * KVM only interacts with sp->spt for private EPT operations.
	 */
	sp->external_spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_external_spt_cache);
}
staticinline gfn_t kvm_gfn_root_bits(conststruct kvm *kvm, conststruct kvm_mmu_page *root)
{ /* * Since mirror SPs are used only for TDX, which maps private memory * at its "natural" GFN, no mask needs to be applied to them - and, dually, * we expect that the bits is only used for the shared PT.
*/ if (is_mirror_sp(root)) return 0; return kvm_gfn_direct_bits(kvm);
}
staticinlinebool kvm_mmu_page_ad_need_write_protect(struct kvm *kvm, struct kvm_mmu_page *sp)
{ /* * When using the EPT page-modification log, the GPAs in the CPU dirty * log would come from L2 rather than L1. Therefore, we need to rely * on write protection to record dirty pages, which bypasses PML, since * writes now result in a vmexit. Note, the check on CPU dirty logging * being enabled is mandatory as the bits used to denote WP-only SPTEs * are reserved for PAE paging (32-bit KVM).
*/ return kvm->arch.cpu_dirty_log_size && sp->role.guest_mode;
}
/* Derived from mmu and global state. */ constbool is_tdp; constbool is_private; constbool nx_huge_page_workaround_enabled;
/* * Whether a >4KB mapping can be created or is forbidden due to NX * hugepages.
*/ bool huge_page_disallowed;
/* * Maximum page size that can be created for this fault; input to * FNAME(fetch), direct_map() and kvm_tdp_mmu_map().
*/
u8 max_level;
/* * Page size that can be created based on the max_level and the * page size used by the host mapping.
*/
u8 req_level;
/* * Page size that will be created based on the req_level and * huge_page_disallowed.
*/
u8 goal_level;
/* * Shifted addr, or result of guest page table walk if addr is a gva. In * the case of VM where memslot's can be mapped at multiple GPA aliases * (i.e. TDX), the gfn field does not contain the bit that selects between * the aliases (i.e. the shared bit for TDX).
*/
gfn_t gfn;
/* The memslot containing gfn. May be NULL. */ struct kvm_memory_slot *slot;
/* * Indicates the guest is trying to write a gfn that contains one or * more of the PTEs used to translate the write itself, i.e. the access * is changing its own translation in the guest page tables.
*/ bool write_fault_to_shadow_pgtable;
};
int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault);
/*
 * Return values of handle_mmio_page_fault(), mmu.page_fault(),
 * fast_page_fault(), and of course kvm_mmu_do_page_fault().
 *
 * RET_PF_CONTINUE:	So far, so good, keep handling the page fault.
 * RET_PF_RETRY:	let CPU fault again on the address.
 * RET_PF_EMULATE:	mmio page fault, emulate the instruction directly.
 * RET_PF_WRITE_PROTECTED: the gfn is write-protected, either unprotected the
 *			gfn and retry, or emulate the instruction directly.
 * RET_PF_INVALID:	the spte is invalid, let the real page fault path
 *			update it.
 * RET_PF_FIXED:	The faulting entry has been fixed.
 * RET_PF_SPURIOUS:	The faulting entry was already fixed, e.g. by another
 *			vCPU.
 *
 * Any names added to this enum should be exported to userspace for use in
 * tracepoints via TRACE_DEFINE_ENUM() in mmutrace.h
 *
 * Note, all values must be greater than or equal to zero so as not to encroach
 * on -errno return values.
 */
enum {
	RET_PF_CONTINUE = 0,
	RET_PF_RETRY,
	RET_PF_EMULATE,
	RET_PF_WRITE_PROTECTED,
	RET_PF_INVALID,
	RET_PF_FIXED,
	RET_PF_SPURIOUS,
};

/*
 * Define RET_PF_CONTINUE as 0 to allow for
 * - efficient machine code when checking for CONTINUE, e.g.
 *   "TEST %rax, %rax, JNZ", as all "stop!" values are non-zero,
 * - kvm_mmu_do_page_fault() to return other RET_PF_* as a positive value.
 */
static_assert(RET_PF_CONTINUE == 0);
if (vcpu->arch.mmu->root_role.direct) { /* * Things like memslots don't understand the concept of a shared * bit. Strip it so that the GFN can be used like normal, and the * fault.addr can be used when the shared bit is needed.
*/
fault.gfn = gpa_to_gfn(fault.addr) & ~kvm_gfn_direct_bits(vcpu->kvm);
fault.slot = kvm_vcpu_gfn_to_memslot(vcpu, fault.gfn);
}
/* * With retpoline being active an indirect call is rather expensive, * so do a direct call in the most common case.
*/ if (IS_ENABLED(CONFIG_MITIGATION_RETPOLINE) && fault.is_tdp)
r = kvm_tdp_page_fault(vcpu, &fault); else
r = vcpu->arch.mmu->page_fault(vcpu, &fault);
/* * Not sure what's happening, but punt to userspace and hope that * they can fix it by changing memory to shared, or they can * provide a better error.
*/ if (r == RET_PF_EMULATE && fault.is_private) {
pr_warn_ratelimited("kvm: unexpected emulation request on private memory\n");
kvm_mmu_prepare_memory_fault_exit(vcpu, &fault); return -EFAULT;
}
if (fault.write_fault_to_shadow_pgtable && emulation_type)
*emulation_type |= EMULTYPE_WRITE_PF_TO_SP; if (level)
*level = fault.goal_level;
/*
 * NOTE(review): The following German disclaimer is website-scrape residue, not
 * part of the original source file; it has been translated and fenced in a
 * comment so the file remains valid C.  Translation: "The information on this
 * website was carefully compiled to the best of our knowledge.  However,
 * neither completeness, nor correctness, nor quality of the provided
 * information is guaranteed.  Note: the colored syntax highlighting and the
 * measurement are still experimental."
 */