// SPDX-License-Identifier: GPL-2.0-only /* * Kernel-based Virtual Machine driver for Linux * * This module enables machines with Intel VT-x extensions to run virtual * machines without emulation or binary translation. * * MMU support * * Copyright (C) 2006 Qumranet, Inc. * Copyright 2010 Red Hat, Inc. and/or its affiliates. * * Authors: * Yaniv Kamay <yaniv@qumranet.com> * Avi Kivity <avi@qumranet.com>
*/ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
/*
 * Setting this variable to true enables Two-Dimensional Paging (TDP), where
 * the hardware walks two page tables:
 * 1. the guest-virtual to guest-physical
 * 2. while doing 1. it walks guest-physical to host-physical
 * If the hardware supports TDP, shadow paging is not needed.
*/ bool tdp_enabled = false;
/* make pte_list_desc fit well in cache lines */ #define PTE_LIST_EXT 14
/*
 * struct pte_list_desc - one node of KVM's custom list of related SPTEs.
 *
 * A list of descriptors tracks a set of related SPTEs, e.g. every SPTE that
 * maps a given GFN when the list backs an rmap.  The custom list exists to
 * optimize for the common case where a GFN has at most a handful of SPTEs
 * pointing at it: several SPTEs are packed into a small footprint, which
 * improves runtime performance through cache locality.
 *
 * A list consists of one or more descriptors, each holding up to
 * PTE_LIST_EXT SPTEs.  When the head descriptor is full and another SPTE
 * must be added, a new descriptor is allocated and becomes the head of the
 * list.  By definition, this means every tail descriptor is full.
 *
 * The metadata fields are deliberately placed first to optimize the
 * cacheline layout: accessing a descriptor touches only a single cacheline
 * so long as @spte_count <= 6 (or if only the metadata is accessed).
 *
 * @more:       next (tail) descriptor in the list
 * @spte_count: number of PTEs stored in _this_ descriptor
 * @tail_count: number of PTEs stored in all tails of this descriptor
 * @sptes:      the SPTE pointers held by this descriptor
 */
struct pte_list_desc {
	struct pte_list_desc *more;
	u32 spte_count;
	u32 tail_count;
	u64 *sptes[PTE_LIST_EXT];
};
/*
 * Yes, lots of underscores.  They're a hint that you probably shouldn't be
 * reading from the role_regs.  Once the root_role is constructed, it becomes
 * the single source of truth for the MMU's state.
 *
 * BUILD_MMU_ROLE_REGS_ACCESSOR(reg, name, flag) emits a helper named
 * ____is_<reg>_<name>() that returns true if any bit of @flag is set in
 * regs-><reg>, false otherwise.
 */
#define BUILD_MMU_ROLE_REGS_ACCESSOR(reg, name, flag)			\
static inline bool __maybe_unused					\
____is_##reg##_##name(const struct kvm_mmu_role_regs *regs)		\
{									\
	return !!(regs->reg & (flag));					\
}
/*
 * Instantiate one ____is_<reg>_<name>() helper per (register, flag) pair;
 * each helper tests the given flag in the matching kvm_mmu_role_regs field.
 */
BUILD_MMU_ROLE_REGS_ACCESSOR(cr0, pg, X86_CR0_PG);
BUILD_MMU_ROLE_REGS_ACCESSOR(cr0, wp, X86_CR0_WP);
BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, pse, X86_CR4_PSE);
BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, pae, X86_CR4_PAE);
BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, smep, X86_CR4_SMEP);
BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, smap, X86_CR4_SMAP);
BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, pke, X86_CR4_PKE);
BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, la57, X86_CR4_LA57);
BUILD_MMU_ROLE_REGS_ACCESSOR(efer, nx, EFER_NX);
BUILD_MMU_ROLE_REGS_ACCESSOR(efer, lma, EFER_LMA);
/*
 * The MMU itself (with a valid role) is the single source of truth for the
 * MMU.  Do not use the regs used to build the MMU/role, nor the vCPU.  The
 * regs don't account for dependencies, e.g. clearing CR4 bits if CR0.PG=1,
 * and the vCPU may be incorrect/irrelevant.
 *
 * BUILD_MMU_ROLE_ACCESSOR(base_or_ext, reg, name) emits is_<reg>_<name>(),
 * which returns true if mmu->cpu_role.<base_or_ext>.<reg>_<name> is
 * non-zero, false otherwise.
 */
#define BUILD_MMU_ROLE_ACCESSOR(base_or_ext, reg, name)			\
static inline bool __maybe_unused is_##reg##_##name(struct kvm_mmu *mmu)	\
{									\
	return !!(mmu->cpu_role. base_or_ext . reg##_##name);		\
}
/*
 * Instantiate one is_<reg>_<name>() helper per role bit; the first argument
 * selects whether the bit is read from cpu_role.base or cpu_role.ext.
 */
BUILD_MMU_ROLE_ACCESSOR(base, cr0, wp);
BUILD_MMU_ROLE_ACCESSOR(ext, cr4, pse);
BUILD_MMU_ROLE_ACCESSOR(ext, cr4, smep);
BUILD_MMU_ROLE_ACCESSOR(ext, cr4, smap);
BUILD_MMU_ROLE_ACCESSOR(ext, cr4, pke);
BUILD_MMU_ROLE_ACCESSOR(ext, cr4, la57);
BUILD_MMU_ROLE_ACCESSOR(base, efer, nx);
BUILD_MMU_ROLE_ACCESSOR(ext, efer, lma);
/*
 * Forward declaration: kvm_flush_remote_tlbs_sptep() calls this helper
 * before a definition is visible (presumably defined later in this file).
 */
static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index);
/* Flush the range of guest memory mapped by the given SPTE. */ staticvoid kvm_flush_remote_tlbs_sptep(struct kvm *kvm, u64 *sptep)
{ struct kvm_mmu_page *sp = sptep_to_sp(sptep);
gfn_t gfn = kvm_mmu_page_get_gfn(sp, spte_index(sptep));
/* * If we map the spte from nonpresent to present, We should store * the high bits firstly, then set present bit, so cpu can not * fetch this spte while we are setting the spte.
*/
smp_wmb();
WRITE_ONCE(ssptep->spte_low, sspte.spte_low);
}
staticvoid __update_clear_spte_fast(u64 *sptep, u64 spte)
{ union split_spte *ssptep, sspte;
/* xchg acts as a barrier before the setting of the high bits */
orig.spte_low = xchg(&ssptep->spte_low, sspte.spte_low);
orig.spte_high = ssptep->spte_high;
ssptep->spte_high = sspte.spte_high;
count_spte_clear(sptep, spte);
return orig.spte;
}
/*
 * The idea of using this lightweight way to get the spte on x86_32 guests
 * comes from gup_get_pte (mm/gup.c).
 *
 * An spte tlb flush may be pending, because they are coalesced and
 * we are running outside of the MMU lock.  Therefore
 * we need to protect against in-progress updates of the spte.
 *
 * Reading the spte while an update is in progress may get the old value
 * for the high part of the spte.  The race is fine for a present->non-present
 * change (because the high part of the spte is ignored for non-present spte),
 * but for a present->present change we must reread the spte.
 *
 * All such changes are done in two steps (present->non-present and
 * non-present->present), hence it is enough to count the number of
 * present->non-present updates: if it changed while reading the spte,
 * we might have hit the race.  This is done using clear_spte_count.
 *
 * Returns the 64-bit spte value assembled from the two halves.
 */
static u64 __get_spte_lockless(u64 *sptep)
{
	struct kvm_mmu_page *sp = sptep_to_sp(sptep);
	union split_spte spte, *orig = (union split_spte *)sptep;
	int count;

retry:
	/* Snapshot the clear counter before reading either half of the spte. */
	count = sp->clear_spte_count;
	smp_rmb();

	spte.spte_low = orig->spte_low;
	smp_rmb();

	spte.spte_high = orig->spte_high;
	smp_rmb();

	/*
	 * Start over if the low half or the clear counter changed while the
	 * two halves were being read.
	 */
	if (unlikely(spte.spte_low != orig->spte_low ||
	      count != sp->clear_spte_count))
		goto retry;

	return spte.spte;
}
#endif
/*
 * Rules for using mmu_spte_set:
 * Set the sptep from nonpresent to present.
 * Note: the sptep being assigned *must* be either not present
 * or in a state where the hardware will not attempt to update
 * the spte.
 *
 * Warns (once) if the existing SPTE is already shadow-present, then installs
 * @new_spte via __set_spte().
 */
static void mmu_spte_set(u64 *sptep, u64 new_spte)
{
	WARN_ON_ONCE(is_shadow_present_pte(*sptep));
	__set_spte(sptep, new_spte);
}
/* Rules for using mmu_spte_update: * Update the state bits, it means the mapped pfn is not changed. * * Returns true if the TLB needs to be flushed
*/ staticbool mmu_spte_update(u64 *sptep, u64 new_spte)
{
u64 old_spte = *sptep;
/* * Rules for using mmu_spte_clear_track_bits: * It sets the sptep from present to nonpresent, and track the * state bits, it is used to clear the last level sptep. * Returns the old PTE.
*/ static u64 mmu_spte_clear_track_bits(struct kvm *kvm, u64 *sptep)
{
u64 old_spte = *sptep; int level = sptep_to_sp(sptep)->role.level;
/*
 * Rules for using mmu_spte_clear_no_track:
 * Directly clear the spte without caring about the state bits of sptep;
 * it is used to set the upper level spte.
 *
 * The SPTE is overwritten with SHADOW_NONPRESENT_VALUE.
 */
static void mmu_spte_clear_no_track(u64 *sptep)
{
	__update_clear_spte_fast(sptep, SHADOW_NONPRESENT_VALUE);
}
/*
 * Enter a lockless shadow page table walk for @vcpu.
 *
 * When the TDP MMU is active for the vCPU, defer to the TDP MMU's own
 * begin helper.  Otherwise disable local IRQs and mark the vCPU as
 * READING_SHADOW_PAGE_TABLES.  Must be paired with
 * walk_shadow_page_lockless_end().
 */
static void walk_shadow_page_lockless_begin(struct kvm_vcpu *vcpu)
{
	if (is_tdp_mmu_active(vcpu)) {
		kvm_tdp_mmu_walk_lockless_begin();
	} else {
		/*
		 * Prevent page table teardown by making any free-er wait during
		 * kvm_flush_remote_tlbs() IPI to all active vcpus.
		 */
		local_irq_disable();

		/*
		 * Make sure a following spte read is not reordered ahead of the write
		 * to vcpu->mode.
		 */
		smp_store_mb(vcpu->mode, READING_SHADOW_PAGE_TABLES);
	}
}
/*
 * Leave a lockless shadow page table walk started by
 * walk_shadow_page_lockless_begin().
 *
 * When the TDP MMU is active for the vCPU, defer to the TDP MMU's own end
 * helper.  Otherwise publish OUTSIDE_GUEST_MODE with release semantics and
 * only then re-enable local IRQs.
 */
static void walk_shadow_page_lockless_end(struct kvm_vcpu *vcpu)
{
	if (is_tdp_mmu_active(vcpu)) {
		kvm_tdp_mmu_walk_lockless_end();
	} else {
		/*
		 * Make sure the write to vcpu->mode is not reordered in front of
		 * reads to sptes.  If it does, kvm_mmu_commit_zap_page() can see us
		 * OUTSIDE_GUEST_MODE and proceed to free the shadow page table.
		 */
		smp_store_release(&vcpu->mode, OUTSIDE_GUEST_MODE);
		local_irq_enable();
	}
}
/*
 * Top up the per-vCPU MMU memory caches.
 *
 * @vcpu:           vCPU whose caches are refilled
 * @maybe_indirect: if true, also top up the shadowed-info cache
 *
 * The external-SPT cache is topped up only when the VM has mirrored TDP.
 * Returns 0 on success, or the first non-zero value returned by
 * kvm_mmu_topup_memory_cache(); later caches are not touched after a failure.
 */
static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu, bool maybe_indirect)
{
	int r;

	/* 1 rmap, 1 parent PTE per level, and the prefetched rmaps. */
	r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache,
				       1 + PT64_ROOT_MAX_LEVEL + PTE_PREFETCH_NUM);
	if (r)
		return r;

	if (kvm_has_mirrored_tdp(vcpu->kvm)) {
		r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_external_spt_cache,
					       PT64_ROOT_MAX_LEVEL);
		if (r)
			return r;
	}

	r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_shadow_page_cache,
				       PT64_ROOT_MAX_LEVEL);
	if (r)
		return r;

	if (maybe_indirect) {
		r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_shadowed_info_cache,
					       PT64_ROOT_MAX_LEVEL);
		if (r)
			return r;
	}

	return kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_page_header_cache,
					  PT64_ROOT_MAX_LEVEL);
}
/*
 * Fetch the *guest* access permissions being shadowed by the leaf SPTE at
 * @index in @sp.  The SPTE itself may be more restrictive than what the guest
 * enforces, e.g. a guest may create an executable huge PTE while KVM
 * disallows execution to mitigate iTLB multihit.
 */
static u32 kvm_mmu_page_get_access(struct kvm_mmu_page *sp, int index)
{
	/*
	 * Without a shadowed_translation array, fall back to the role's access
	 * bits, which are correct in both of the following cases:
	 *
	 * - Direct MMUs (e.g. TDP or non-paging guests) and passthrough SPs:
	 *   KVM is not shadowing any guest page tables, so the "guest access
	 *   permissions" are simply ACC_ALL.
	 *
	 * - Direct SPs in indirect MMUs (shadow paging), i.e. KVM is shadowing
	 *   a guest huge page with small pages: the permissions being shadowed
	 *   are those of the huge page.
	 */
	if (!sp->shadowed_translation)
		return sp->role.access;

	return sp->shadowed_translation[index] & ACC_ALL;
}
/* * Return the pointer to the large page information for a given gfn, * handling slots that are not large page aligned.
*/ staticstruct kvm_lpage_info *lpage_info_slot(gfn_t gfn, conststruct kvm_memory_slot *slot, int level)
{ unsignedlong idx;
/*
 * The most significant bit in disallow_lpage tracks whether or not memory
 * attributes are mixed, i.e. not identical for all gfns at the current level.
 * The lower order bits are used to refcount other cases where a hugepage is
 * disallowed, e.g. if KVM has shadowed a page table at the gfn.
*/ #define KVM_LPAGE_MIXED_FLAG BIT(31)
staticvoid update_gfn_disallow_lpage_count(conststruct kvm_memory_slot *slot,
gfn_t gfn, int count)
{ struct kvm_lpage_info *linfo; int old, i;
for (i = PG_LEVEL_2M; i <= KVM_MAX_HUGEPAGE_LEVEL; ++i) {
linfo = lpage_info_slot(gfn, slot, i);
kvm->arch.indirect_shadow_pages++; /* * Ensure indirect_shadow_pages is elevated prior to re-reading guest * child PTEs in FNAME(gpte_changed), i.e. guarantee either in-flight * emulated writes are visible before re-reading guest PTEs, or that * an emulated write will see the elevated count and acquire mmu_lock * to update SPTEs. Pairs with the smp_mb() in kvm_mmu_track_write().
*/
smp_mb();
/* the non-leaf shadow pages are keeping readonly. */ if (sp->role.level > PG_LEVEL_4K) return __kvm_write_track_add_gfn(kvm, slot, gfn);
kvm_mmu_gfn_disallow_lpage(slot, gfn);
if (kvm_mmu_slot_gfn_write_protect(kvm, slot, gfn, PG_LEVEL_4K))
kvm_flush_remote_tlbs_gfn(kvm, gfn, PG_LEVEL_4K);
}
void track_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp)
{ /* * If it's possible to replace the shadow page with an NX huge page, * i.e. if the shadow page is the only thing currently preventing KVM * from using a huge page, add the shadow page to the list of "to be * zapped for NX recovery" pages. Note, the shadow page can already be * on the list if KVM is reusing an existing shadow page, i.e. if KVM * links a shadow page at multiple points.
*/ if (!list_empty(&sp->possible_nx_huge_page_link)) return;
slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); if (!slot || slot->flags & KVM_MEMSLOT_INVALID) return NULL; if (no_dirty_log && kvm_slot_dirty_track_enabled(slot)) return NULL;
return slot;
}
/* * About rmap_head encoding: * * If the bit zero of rmap_head->val is clear, then it points to the only spte * in this rmap chain. Otherwise, (rmap_head->val & ~3) points to a struct * pte_list_desc containing more mappings.
*/ #define KVM_RMAP_MANY BIT(0)
/* * rmaps and PTE lists are mostly protected by mmu_lock (the shadow MMU always * operates with mmu_lock held for write), but rmaps can be walked without * holding mmu_lock so long as the caller can tolerate SPTEs in the rmap chain * being zapped/dropped _while the rmap is locked_. * * Other than the KVM_RMAP_LOCKED flag, modifications to rmap entries must be * done while holding mmu_lock for write. This allows a task walking rmaps * without holding mmu_lock to concurrently walk the same entries as a task * that is holding mmu_lock but _not_ the rmap lock. Neither task will modify * the rmaps, thus the walks are stable. * * As alluded to above, SPTEs in rmaps are _not_ protected by KVM_RMAP_LOCKED, * only the rmap chains themselves are protected. E.g. holding an rmap's lock * ensures all "struct pte_list_desc" fields are stable.
*/ #define KVM_RMAP_LOCKED BIT(1)
/* * Elide the lock if the rmap is empty, as lockless walkers (read-only * mode) don't need to (and can't) walk an empty rmap, nor can they add * entries to the rmap. I.e. the only paths that process empty rmaps * do so while holding mmu_lock for write, and are mutually exclusive.
*/
old_val = atomic_long_read(&rmap_head->val); if (!old_val) return 0;
do { /* * If the rmap is locked, wait for it to be unlocked before * trying acquire the lock, e.g. to avoid bouncing the cache * line.
*/ while (old_val & KVM_RMAP_LOCKED) {
cpu_relax();
old_val = atomic_long_read(&rmap_head->val);
}
/* * Recheck for an empty rmap, it may have been purged by the * task that held the lock.
*/ if (!old_val) return 0;
new_val = old_val | KVM_RMAP_LOCKED; /* * Use try_cmpxchg_acquire() to prevent reads and writes to the rmap * from being reordered outside of the critical section created by * __kvm_rmap_lock(). * * Pairs with the atomic_long_set_release() in kvm_rmap_unlock(). * * For the !old_val case, no ordering is needed, as there is no rmap * to walk.
*/
} while (!atomic_long_try_cmpxchg_acquire(&rmap_head->val, &old_val, new_val));
/* * Return the old value, i.e. _without_ the LOCKED bit set. It's * impossible for the return value to be 0 (see above), i.e. the read- * only unlock flow can't get a false positive and fail to unlock.
*/ return old_val;
}
staticvoid __kvm_rmap_unlock(struct kvm_rmap_head *rmap_head, unsignedlong val)
{
KVM_MMU_WARN_ON(val & KVM_RMAP_LOCKED); /* * Ensure that all accesses to the rmap have completed before unlocking * the rmap. * * Pairs with the atomic_long_try_cmpxchg_acquire() in __kvm_rmap_lock().
*/
atomic_long_set_release(&rmap_head->val, val);
}
/* * If mmu_lock isn't held, rmaps can only be locked in read-only mode. The * actual locking is the same, but the caller is disallowed from modifying the * rmap, and so the unlock flow is a nop if the rmap is/was empty.
*/ staticunsignedlong kvm_rmap_lock_readonly(struct kvm_rmap_head *rmap_head)
{ unsignedlong rmap_val;
/* * Returns the number of pointers in the rmap chain, not counting the new one.
*/ staticint pte_list_add(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
u64 *spte, struct kvm_rmap_head *rmap_head)
{ unsignedlong old_val, new_val; struct pte_list_desc *desc; int count = 0;
/* * The head descriptor should never be empty. A new head is added only * when adding an entry and the previous head is full, and heads are * removed (this flow) when they become empty.
*/
KVM_BUG_ON_DATA_CORRUPTION(j < 0, kvm);
/* * Replace the to-be-freed SPTE with the last valid entry from the head * descriptor to ensure that tail descriptors are full at all times. * Note, this also means that tail_count is stable for each descriptor.
*/
desc->sptes[i] = head_desc->sptes[j];
head_desc->sptes[j] = NULL;
head_desc->spte_count--; if (head_desc->spte_count) return;
/* * The head descriptor is empty. If there are no tail descriptors, * nullify the rmap head to mark the list as empty, else point the rmap * head at the next descriptor, i.e. the new head.
*/ if (!head_desc->more)
*rmap_val = 0; else
*rmap_val = (unsignedlong)head_desc->more | KVM_RMAP_MANY;
mmu_free_pte_list_desc(head_desc);
}
/* Return true if at least one SPTE was zapped, false otherwise */ staticbool kvm_zap_all_rmap_sptes(struct kvm *kvm, struct kvm_rmap_head *rmap_head)
{ struct pte_list_desc *desc, *next; unsignedlong rmap_val; int i;
rmap_val = kvm_rmap_lock(kvm, rmap_head); if (!rmap_val) returnfalse;
for (; desc; desc = next) { for (i = 0; i < desc->spte_count; i++)
mmu_spte_clear_track_bits(kvm, desc->sptes[i]);
next = desc->more;
mmu_free_pte_list_desc(desc);
}
out: /* rmap_head is meaningless now, remember to reset it */
kvm_rmap_unlock(kvm, rmap_head, 0); returntrue;
}
/* * Unlike rmap_add, rmap_remove does not run in the context of a vCPU * so we have to determine which memslots to use based on context * information in sp->role.
*/
slots = kvm_memslots_for_spte_role(kvm, sp->role);
/*
 * Cursor used by the rmap walking helpers to step through the sptes linked
 * by an rmap.  Every field is private to those helpers and must not be
 * relied upon by other code.
 *
 * @head: private
 * @desc: descriptor that holds the current sptep, if not NULL
 * @pos:  index of the current sptep
 */
struct rmap_iterator {
	struct rmap_head *head;
	struct pte_list_desc *desc;
	int pos;
};
/* * Iteration must be started by this function. This should also be used after * removing/dropping sptes from the rmap link because in such cases the * information in the iterator may not be valid. * * Returns sptep if found, NULL otherwise.
*/ static u64 *rmap_get_first(struct kvm_rmap_head *rmap_head, struct rmap_iterator *iter)
{ unsignedlong rmap_val = kvm_rmap_get(rmap_head);
/* * Must be used with a valid iterator: e.g. after rmap_get_first(). * * Returns sptep if found, NULL otherwise.
*/ static u64 *rmap_get_next(struct rmap_iterator *iter)
{ if (iter->desc) { if (iter->pos < PTE_LIST_EXT - 1) {
++iter->pos; if (iter->desc->sptes[iter->pos]) return iter->desc->sptes[iter->pos];
}
iter->desc = iter->desc->more;
if (iter->desc) {
iter->pos = 0; /* desc->sptes[0] cannot be NULL */ return iter->desc->sptes[iter->pos];
}
}
if (flush)
kvm_flush_remote_tlbs_sptep(kvm, sptep);
}
/*
 * Write-protect the specified @sptep.  @pt_protect indicates whether the
 * spte write-protection is caused by protecting a shadow page table.
 *
 * Note: write protection differs between dirty logging and spte
 * protection:
 * - for dirty logging, the spte can be set to writable at any time if
 *   its dirty bitmap is properly set.
 * - for spte protection, the spte can be writable only after unsync-ing
 *   the shadow page.
 *
 * Return true if the TLB needs to be flushed.
*/ staticbool spte_write_protect(u64 *sptep, bool pt_protect)
{
u64 spte = *sptep;
if (!is_writable_pte(spte) &&
!(pt_protect && is_mmu_writable_spte(spte))) returnfalse;
if (pt_protect)
spte &= ~shadow_mmu_writable_mask;
spte = spte & ~PT_WRITABLE_MASK;
/* * Gets the GFN ready for another round of dirty logging by clearing the * - D bit on ad-enabled SPTEs, and * - W bit on ad-disabled SPTEs. * Returns true iff any D or W bits were cleared.
*/ staticbool __rmap_clear_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head, conststruct kvm_memory_slot *slot)
{
u64 *sptep; struct rmap_iterator iter; bool flush = false;
/* clear the first set bit */
mask &= mask - 1;
}
}
/*
 * (Re)enable dirty logging for the GFNs selected by @mask.
 *
 * @kvm:        the VM
 * @slot:       memslot containing the GFNs
 * @gfn_offset: offset, relative to slot->base_gfn, of the GFN that bit 0 of
 *              @mask corresponds to
 * @mask:       bitmap of GFNs to (re)enable dirty logging for
 */
void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
					     struct kvm_memory_slot *slot,
					     gfn_t gfn_offset, unsigned long mask)
{
	/*
	 * If the slot was assumed to be "initially all dirty", write-protect
	 * huge pages to ensure they are split to 4KiB on the first write (KVM
	 * dirty logs at 4KiB granularity).  If eager page splitting is enabled,
	 * immediately try to split huge pages, e.g. so that vCPUs don't get
	 * saddled with the cost of splitting.
	 *
	 * The gfn_offset is guaranteed to be aligned to 64, but the base_gfn
	 * of memslot has no such restriction, so the range can cross two large
	 * pages.
	 */
	if (kvm_dirty_log_manual_protect_and_init_set(kvm)) {
		gfn_t start = slot->base_gfn + gfn_offset + __ffs(mask);
		gfn_t end = slot->base_gfn + gfn_offset + __fls(mask);

		if (READ_ONCE(eager_page_split))
			kvm_mmu_try_split_huge_pages(kvm, slot, start, end + 1, PG_LEVEL_4K);

		/*
		 * The huge page containing the first GFN must always be
		 * write-protected; the one containing the last GFN needs
		 * separate handling only if it is a different huge page.
		 */
		kvm_mmu_slot_gfn_write_protect(kvm, slot, start, PG_LEVEL_2M);

		/* Cross two large pages? */
		if (ALIGN(start << PAGE_SHIFT, PMD_SIZE) !=
		    ALIGN(end << PAGE_SHIFT, PMD_SIZE))
			kvm_mmu_slot_gfn_write_protect(kvm, slot, end,
						       PG_LEVEL_2M);
	}

	/*
	 * (Re)Enable dirty logging for all 4KiB SPTEs that map the GFNs in
	 * mask.  If PML is enabled and the GFN doesn't need to be write-
	 * protected for other reasons, e.g. shadow paging, clear the Dirty bit.
	 * Otherwise clear the Writable bit.
	 *
	 * Note that kvm_mmu_clear_dirty_pt_masked() is called whenever PML is
	 * enabled but it chooses between clearing the Dirty bit and Writeable
	 * bit based on the context.
	 */
	if (kvm->arch.cpu_dirty_log_size)
		kvm_mmu_clear_dirty_pt_masked(kvm, slot, gfn_offset, mask);
	else
		kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask);
}
/* Report the VM's cpu_dirty_log_size, as recorded in @kvm's arch state. */
int kvm_cpu_dirty_log_size(struct kvm *kvm)
{
	int log_size = kvm->arch.cpu_dirty_log_size;

	return log_size;
}
if (kvm_memslots_have_rmaps(kvm)) { for (i = min_level; i <= KVM_MAX_HUGEPAGE_LEVEL; ++i) {
rmap_head = gfn_to_rmap(gfn, i, slot);
write_protected |= rmap_write_protect(rmap_head, true);
}
}
if (tdp_mmu_enabled)
write_protected |=
kvm_tdp_mmu_write_protect_gfn(kvm, slot, gfn, min_level);
/*
 * Callback applied to a single rmap head of a memslot.  The return value
 * indicates if a TLB flush on all vCPUs is needed.
 */
typedef bool (*slot_rmaps_handler) (struct kvm *kvm,
				    struct kvm_rmap_head *rmap_head,
				    const struct kvm_memory_slot *slot);
/* * To prevent races with vCPUs faulting in a gfn using stale data, * zapping a gfn range must be protected by mmu_invalidate_in_progress * (and mmu_invalidate_seq). The only exception is memslot deletion; * in that case, SRCU synchronization ensures that SPTEs are zapped * after all vCPUs have unlocked SRCU, guaranteeing that vCPUs see the * invalid slot.
*/
lockdep_assert_once(kvm->mmu_invalidate_in_progress ||
lockdep_is_held(&kvm->slots_lock));
if (kvm_memslots_have_rmaps(kvm))
flush = __kvm_rmap_zap_gfn_range(kvm, range->slot,
range->start, range->end,
range->may_block, flush);
if (tdp_mmu_enabled)
flush = kvm_tdp_mmu_unmap_gfn_range(kvm, range, flush);
if (kvm_x86_ops.set_apic_access_page_addr &&
range->slot->id == APIC_ACCESS_PAGE_PRIVATE_MEMSLOT)
kvm_make_all_cpus_request(kvm, KVM_REQ_APIC_PAGE_RELOAD);
for_each_rmap_spte_lockless(rmap_head, &iter, sptep, spte) { if (!is_accessed_spte(spte)) continue;
if (test_only) {
kvm_rmap_unlock_readonly(rmap_head, rmap_val); returntrue;
}
if (spte_ad_enabled(spte))
clear_bit((ffs(shadow_accessed_mask) - 1),
(unsignedlong *)sptep); else /* * If the following cmpxchg fails, the * spte is being concurrently modified * and should most likely stay young.
*/
cmpxchg64(sptep, spte,
mark_spte_for_access_track(spte));
young = true;
}
if (tdp_mmu_enabled)
young = kvm_tdp_mmu_test_age_gfn(kvm, range);
if (young) return young;
if (kvm_may_have_shadow_mmu_sptes(kvm))
young |= kvm_rmap_age_gfn_range(kvm, range, true);
return young;
}
/*
 * Debug check run when a shadow page is freed: with CONFIG_KVM_PROVE_MMU,
 * warn about, and log, every entry of @sp that is still shadow-present.
 * Compiles to an empty function when the option is disabled.
 */
static void kvm_mmu_check_sptes_at_free(struct kvm_mmu_page *sp)
{
#ifdef CONFIG_KVM_PROVE_MMU
	int i;

	for (i = 0; i < SPTE_ENT_PER_PAGE; i++) {
		if (KVM_MMU_WARN_ON(is_shadow_present_pte(sp->spt[i])))
			pr_err_ratelimited("SPTE %llx (@ %p) for gfn %llx shadow-present at free",
					   sp->spt[i], &sp->spt[i],
					   kvm_mmu_page_get_gfn(sp, i));
	}
#endif
}
staticstruct hlist_head *kvm_get_mmu_page_hash(struct kvm *kvm, gfn_t gfn)
{ /* * Ensure the load of the hash table pointer itself is ordered before * loads to walk the table. The pointer is set at runtime outside of * mmu_lock when the TDP MMU is enabled, i.e. when the hash table of * shadow pages becomes necessary only when KVM needs to shadow L1's * TDP for an L2 guest. Pairs with the smp_store_release() in * kvm_mmu_alloc_page_hash().
*/ struct hlist_head *page_hash = smp_load_acquire(&kvm->arch.mmu_page_hash);
/* * Ignore various flags when verifying that it's safe to sync a shadow * page using the current MMU context. * * - level: not part of the overall MMU role and will never match as the MMU's * level tracks the root level * - access: updated based on the new guest PTE * - quadrant: not part of the overall MMU role (similar to level)
*/ constunion kvm_mmu_page_role sync_role_ign = {
.level = 0xf,
.access = 0x7,
.quadrant = 0x3,
.passthrough = 0x1,
};
/* * Direct pages can never be unsync, and KVM should never attempt to * sync a shadow page for a different MMU context, e.g. if the role * differs then the memslot lookup (SMM vs. non-SMM) will be bogus, the * reserved bits checks will be wrong, etc...
*/ if (WARN_ON_ONCE(sp->role.direct || !vcpu->arch.mmu->sync_spte ||
(sp->role.word ^ root_role.word) & ~sync_role_ign.word)) returnfalse;
returntrue;
}
/*
 * Sync entry @i of shadow page @sp using the vCPU's current MMU.
 *
 * Returns 0 without calling into the MMU if the entry still holds
 * SHADOW_NONPRESENT_VALUE; otherwise returns whatever the MMU's sync_spte()
 * hook returns.
 */
static int kvm_sync_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, int i)
{
	/* sp->spt[i] has initial value of shadow page table allocation */
	if (sp->spt[i] == SHADOW_NONPRESENT_VALUE)
		return 0;

	return vcpu->arch.mmu->sync_spte(vcpu, sp, i);
}
/*
 * Sync every entry of shadow page @sp.
 *
 * Returns -1 if the page cannot be synced in the current context or if
 * syncing any entry fails; otherwise returns the OR of the per-entry results,
 * where a non-zero value means a TLB flush is required.
 */
static int __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
{
	int flush = 0;
	int i;

	if (!kvm_sync_page_check(vcpu, sp))
		return -1;

	for (i = 0; i < SPTE_ENT_PER_PAGE; i++) {
		int ret = kvm_sync_spte(vcpu, sp, i);

		/*
		 * Bail on any negative result.  A check of "ret < -1" would let
		 * -1 fall through to the OR below; the final result would still
		 * be negative, but only after pointlessly syncing the remaining
		 * entries of a page that kvm_sync_page() zaps on failure.
		 */
		if (ret < 0)
			return -1;
		flush |= ret;
	}

	/*
	 * Note, any flush is purely for KVM's correctness, e.g. when dropping
	 * an existing SPTE or clearing W/A/D bits to ensure an mmu_notifier
	 * unmap or dirty logging event doesn't fail to flush.  The guest is
	 * responsible for flushing the TLB to ensure any changes in protection
	 * bits are recognized, i.e. until the guest flushes or page faults on
	 * a relevant address, KVM is architecturally allowed to let vCPUs use
	 * cached translations with the old protection bits.
	 */
	return flush;
}
/*
 * Sync shadow page @sp; if the sync fails, queue the page for zapping on
 * @invalid_list.
 *
 * Returns the result of __kvm_sync_page(): negative on failure, otherwise
 * non-zero if a TLB flush is required.
 */
static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
			 struct list_head *invalid_list)
{
	int ret = __kvm_sync_page(vcpu, sp);

	if (ret < 0)
		kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list);
	return ret;
}
/* Also set up a sentinel. Further entries in pvec are all * children of sp, so this element is never overwritten.
*/
parents->parent[level-1] = NULL; return mmu_pages_next(pvec, parents, 0);
}
/* * The vCPU is required when finding indirect shadow pages; the shadow * page may already exist and syncing it needs the vCPU pointer in * order to read guest page tables. Direct shadow pages are never * unsync, thus @vcpu can be NULL if @role.direct is true.
*/ staticstruct kvm_mmu_page *kvm_mmu_find_shadow_page(struct kvm *kvm, struct kvm_vcpu *vcpu,
gfn_t gfn, struct hlist_head *sp_list, union kvm_mmu_page_role role)
{ struct kvm_mmu_page *sp; int ret; int collisions = 0;
LIST_HEAD(invalid_list);
if (sp->role.word != role.word) { /* * If the guest is creating an upper-level page, zap * unsync pages for the same gfn. While it's possible * the guest is using recursive page tables, in all * likelihood the guest has stopped using the unsync * page and is installing a completely unrelated page. * Unsync pages must not be left as is, because the new * upper-level page will be write-protected.
*/ if (role.level > PG_LEVEL_4K && sp->unsync)
kvm_mmu_prepare_zap_page(kvm, sp,
&invalid_list); continue;
}
/* unsync and write-flooding only apply to indirect SPs. */ if (sp->role.direct) goto out;
if (sp->unsync) { if (KVM_BUG_ON(!vcpu, kvm)) break;
/* * The page is good, but is stale. kvm_sync_page does * get the latest guest state, but (unlike mmu_unsync_children) * it doesn't write-protect the page or mark it synchronized! * This way the validity of the mapping is ensured, but the * overhead of write protection is not incurred until the * guest invalidates the TLB mapping. This allows multiple * SPs for a single gfn to be unsync. * * If the sync fails, the page is zapped. If so, break * in order to rebuild it.
*/
ret = kvm_sync_page(vcpu, sp, &invalid_list); if (ret < 0) break;
WARN_ON_ONCE(!list_empty(&invalid_list)); if (ret > 0)
kvm_flush_remote_tlbs(kvm);
}
__clear_sp_write_flooding_count(sp);
goto out;
}
sp = NULL;
++kvm->stat.mmu_cache_miss;
out:
kvm_mmu_commit_zap_page(kvm, &invalid_list);
if (collisions > kvm->stat.max_mmu_page_hash_collisions)
kvm->stat.max_mmu_page_hash_collisions = collisions; return sp;
}
/*
 * Bundle of the memory caches used when allocating a new shadow page.
 *
 * @page_header_cache:   cache named for page headers
 * @shadow_page_cache:   cache named for shadow pages
 * @shadowed_info_cache: cache named for shadowed-info data
 */
struct shadow_page_caches {
	struct kvm_mmu_memory_cache *page_header_cache;
	struct kvm_mmu_memory_cache *shadow_page_cache;
	struct kvm_mmu_memory_cache *shadowed_info_cache;
};
/* * active_mmu_pages must be a FIFO list, as kvm_zap_obsolete_pages() * depends on valid pages being added to the head of the list. See * comments in kvm_zap_obsolete_pages().
*/
sp->mmu_valid_gen = kvm->arch.mmu_valid_gen;
list_add(&sp->link, &kvm->arch.active_mmu_pages);
kvm_account_mmu_page(kvm, sp);
/* Note, @vcpu may be NULL if @role.direct is true; see kvm_mmu_find_shadow_page. */ staticstruct kvm_mmu_page *__kvm_mmu_get_shadow_page(struct kvm *kvm, struct kvm_vcpu *vcpu, struct shadow_page_caches *caches,
gfn_t gfn, union kvm_mmu_page_role role)
{ struct hlist_head *sp_list; struct kvm_mmu_page *sp; bool created = false;
/* * No need for memory barriers, unlike in kvm_get_mmu_page_hash(), as * mmu_page_hash must be set prior to creating the first shadow root, * i.e. reaching this point is fully serialized by slots_arch_lock.
*/
BUG_ON(!kvm->arch.mmu_page_hash);
sp_list = &kvm->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)];
/* * If the guest has 4-byte PTEs then that means it's using 32-bit, * 2-level, non-PAE paging. KVM shadows such guests with PAE paging * (i.e. 8-byte PTEs). The difference in PTE size means that KVM must * shadow each guest page table with multiple shadow page tables, which * requires extra bookkeeping in the role. * * Specifically, to shadow the guest's page directory (which covers a * 4GiB address space), KVM uses 4 PAE page directories, each mapping * 1GiB of the address space. @role.quadrant encodes which quarter of * the address space each maps. * * To shadow the guest's page tables (which each map a 4MiB region), KVM * uses 2 PAE page tables, each mapping a 2MiB region. For these, * @role.quadrant encodes which half of the region they map. * * Concretely, a 4-byte PDE consumes bits 31:22, while an 8-byte PDE * consumes bits 29:21. To consume bits 31:30, KVM's uses 4 shadow * PDPTEs; those 4 PAE page directories are pre-allocated and their * quadrant is assigned in mmu_alloc_root(). A 4-byte PTE consumes * bits 21:12, while an 8-byte PTE consumes bits 20:12. To consume * bit 21 in the PTE (the child here), KVM propagates that bit to the * quadrant, i.e. sets quadrant to '0' or '1'. The parent 8-byte PDE * covers bit 21 (see above), thus the quadrant is calculated from the * _least_ significant bit of the PDE index.
*/ if (role.has_4_byte_gpte) {
WARN_ON_ONCE(role.level != PG_LEVEL_4K);
role.quadrant = spte_index(sptep) & 1;
}
if (iterator->level == PT32E_ROOT_LEVEL) { /* * prev_root is currently only used for 64-bit hosts. So only * the active root_hpa is valid here.
*/
BUG_ON(root != vcpu->arch.mmu->root.hpa);
/* * If an SPTE is present already, it must be a leaf and therefore * a large one. Drop it, and flush the TLB if needed, before * installing sp.
*/ if (is_shadow_present_pte(*sptep))
drop_large_spte(kvm, sptep, flush);
/* * The non-direct sub-pagetable must be updated before linking. For * L1 sp, the pagetable is updated via kvm_sync_page() in * kvm_mmu_find_shadow_page() without write-protecting the gfn, * so sp->unsync can be true or false. For higher level non-direct * sp, the pagetable is updated/synced via mmu_sync_children() in * FNAME(fetch)(), so sp->unsync_children can only be false. * WARN_ON_ONCE() if anything happens unexpectedly.
*/ if (WARN_ON_ONCE(sp->unsync_children) || sp->unsync)
mark_unsync(sptep);
}
/*
 * For the direct sp, if the guest pte's dirty bit
 * changed from clean to dirty, it will corrupt the
 * sp's access: allow writable in the read-only sp,
 * so we should update the spte at this point to get
 * a new sp with the correct access.
*/
child = spte_to_child_sp(*sptep); if (child->role.access == direct_access) return;
/* * Recursively zap nested TDP SPs, parentless SPs are * unlikely to be used again in the near future. This * avoids retaining a large number of stale nested SPs.
*/ if (tdp_enabled && invalid_list &&
child->role.guest_mode &&
!atomic_long_read(&child->parent_ptes.val)) return kvm_mmu_prepare_zap_page(kvm, child,
invalid_list);
}
} elseif (is_mmio_spte(kvm, pte)) {
mmu_spte_clear_no_track(spte);
} return 0;
}
/* Zapping children means active_mmu_pages has become unstable. */
list_unstable = *nr_zapped;
if (!sp->role.invalid && sp_has_gptes(sp))
unaccount_shadowed(kvm, sp);
if (sp->unsync)
kvm_unlink_unsync_page(kvm, sp); if (!sp->root_count) { /* Count self */
(*nr_zapped)++;
/* * Already invalid pages (previously active roots) are not on * the active page list. See list_del() in the "else" case of * !sp->root_count.
*/ if (sp->role.invalid)
list_add(&sp->link, invalid_list); else
list_move(&sp->link, invalid_list);
kvm_unaccount_mmu_page(kvm, sp);
} else { /* * Remove the active root from the active page list, the root * will be explicitly freed when the root_count hits zero.
*/
list_del(&sp->link);
/* * Obsolete pages cannot be used on any vCPUs, see the comment * in kvm_mmu_zap_all_fast(). Note, is_obsolete_sp() also * treats invalid shadow pages as being obsolete.
*/
zapped_root = !is_obsolete_sp(kvm, sp);
}
if (sp->nx_huge_page_disallowed)
unaccount_nx_huge_page(kvm, sp);
sp->role.invalid = 1;
/* * Make the request to free obsolete roots after marking the root * invalid, otherwise other vCPUs may not see it as invalid.
*/ if (zapped_root)
kvm_make_all_cpus_request(kvm, KVM_REQ_MMU_FREE_OBSOLETE_ROOTS); return list_unstable;
}
/* * We need to make sure everyone sees our modifications to * the page tables and see changes to vcpu->mode here. The barrier * in the kvm_flush_remote_tlbs() achieves this. This pairs * with vcpu_enter_guest and walk_shadow_page_lockless_begin/end. * * In addition, kvm_flush_remote_tlbs waits for all vcpus to exit * guest mode and/or lockless shadow page table walks.
*/
kvm_flush_remote_tlbs(kvm);
if (list_empty(&kvm->arch.active_mmu_pages)) return 0;
restart:
list_for_each_entry_safe_reverse(sp, tmp, &kvm->arch.active_mmu_pages, link) { /* * Don't zap active root pages, the page itself can't be freed * and zapping it will just force vCPUs to realloc and reload.
*/ if (sp->root_count) continue;
/* * Note, this check is intentionally soft, it only guarantees that one * page is available, while the caller may end up allocating as many as * four pages, e.g. for PAE roots or for 5-level paging. Temporarily * exceeding the (arbitrary by default) limit will not harm the host, * being too aggressive may unnecessarily kill the guest, and getting an * exact count is far more trouble than it's worth, especially in the * page fault paths.
*/ if (!kvm_mmu_available_pages(vcpu->kvm)) return -ENOSPC; return 0;
}
/* * Changing the number of mmu pages allocated to the vm * Note: if goal_nr_mmu_pages is too small, you will get a deadlock
*/ void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsignedlong goal_nr_mmu_pages)
{
write_lock(&kvm->mmu_lock);
if (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages) {
kvm_mmu_zap_oldest_mmu_pages(kvm, kvm->arch.n_used_mmu_pages -
goal_nr_mmu_pages);
/* * Bail early if there aren't any write-protected shadow pages to avoid * unnecessarily taking mmu_lock lock, e.g. if the gfn is write-tracked * by a third party. Reading indirect_shadow_pages without holding * mmu_lock is safe, as this is purely an optimization, i.e. a false * positive is benign, and a false negative will simply result in KVM * skipping the unprotect+retry path, which is also an optimization.
*/ if (!READ_ONCE(kvm->arch.indirect_shadow_pages)) goto out;
if (!vcpu->arch.mmu->root_role.direct) {
gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2_or_gpa, NULL); if (gpa == INVALID_GPA) goto out;
}
/* * Snapshot the result before zapping, as zapping will remove all list * entries, i.e. checking the list later would yield a false negative.
*/
r = !list_empty(&invalid_list);
kvm_mmu_commit_zap_page(kvm, &invalid_list);
write_unlock(&kvm->mmu_lock);
/* * Attempt to unsync any shadow pages that can be reached by the specified gfn, * KVM is creating a writable mapping for said gfn. Returns 0 if all pages * were marked unsync (or if there is no shadow page), -EPERM if the SPTE must * be write-protected.
*/ int mmu_try_to_unsync_pages(struct kvm *kvm, conststruct kvm_memory_slot *slot,
gfn_t gfn, bool synchronizing, bool prefetch)
{ struct kvm_mmu_page *sp; bool locked = false;
/* * Force write-protection if the page is being tracked. Note, the page * track machinery is used to write-protect upper-level shadow pages, * i.e. this guards the role.level == 4K assertion below!
*/ if (kvm_gfn_is_write_tracked(kvm, slot, gfn)) return -EPERM;
/* * The page is not write-tracked, mark existing shadow pages unsync * unless KVM is synchronizing an unsync SP. In that case, KVM must * complete emulation of the guest TLB flush before allowing shadow * pages to become unsync (writable by the guest).
*/
for_each_gfn_valid_sp_with_gptes(kvm, sp, gfn) { if (synchronizing) return -EPERM;
if (sp->unsync) continue;
if (prefetch) return -EEXIST;
/* * TDP MMU page faults require an additional spinlock as they * run with mmu_lock held for read, not write, and the unsync * logic is not thread safe. Take the spinlock regardless of * the MMU type to avoid extra conditionals/parameters, there's * no meaningful penalty if mmu_lock is held for write.
*/ if (!locked) {
locked = true;
spin_lock(&kvm->arch.mmu_unsync_pages_lock);
/* * Recheck after taking the spinlock, a different vCPU * may have since marked the page unsync. A false * negative on the unprotected check above is not * possible as clearing sp->unsync _must_ hold mmu_lock * for write, i.e. unsync cannot transition from 1->0 * while this CPU holds mmu_lock for read (or write).
*/ if (READ_ONCE(sp->unsync)) continue;
}
WARN_ON_ONCE(sp->role.level != PG_LEVEL_4K);
kvm_unsync_page(kvm, sp);
} if (locked)
spin_unlock(&kvm->arch.mmu_unsync_pages_lock);
/* * We need to ensure that the marking of unsync pages is visible * before the SPTE is updated to allow writes because * kvm_mmu_sync_roots() checks the unsync flags without holding * the MMU lock and so can race with this. If the SPTE was updated * before the page had been marked as unsync-ed, something like the * following could happen: * * CPU 1 CPU 2 * --------------------------------------------------------------------- * 1.2 Host updates SPTE * to be writable * 2.1 Guest writes a GPTE for GVA X. * (GPTE being in the guest page table shadowed * by the SP from CPU 1.) * This reads SPTE during the page table walk. * Since SPTE.W is read as 1, there is no * fault. * * 2.2 Guest issues TLB flush. * That causes a VM Exit. * * 2.3 Walking of unsync pages sees sp->unsync is * false and skips the page. * * 2.4 Guest accesses GVA X. * Since the mapping in the SP was not updated, * so the old mapping for GVA X incorrectly * gets used. * 1.1 Host marks SP * as unsync * (sp->unsync = true) * * The write barrier below ensures that 1.1 happens before 1.2 and thus * the situation in 2.4 does not arise. It pairs with the read barrier * in is_unsync_root(), placed between 2.1's load of SPTE.W and 2.3.
*/
smp_wmb();
if (is_shadow_present_pte(*sptep)) { if (prefetch && is_last_spte(*sptep, level) &&
pfn == spte_to_pfn(*sptep)) return RET_PF_SPURIOUS;
/* * If we overwrite a PTE page pointer with a 2MB PMD, unlink * the parent of the now unreachable PTE.
*/ if (level > PG_LEVEL_4K && !is_large_pte(*sptep)) { struct kvm_mmu_page *child;
u64 pte = *sptep;
for (i = 0; i < nr_pages; i++, gfn++, sptep++) {
mmu_set_spte(vcpu, slot, sptep, access, gfn,
page_to_pfn(pages[i]), NULL);
/* * KVM always prefetches writable pages from the primary MMU, * and KVM can make its SPTE writable in the fast page handler, * without notifying the primary MMU. Mark pages/folios dirty * now to ensure file data is written back if it ends up being * written by the guest. Because KVM's prefetching GUPs * writable PTEs, the probability of unnecessary writeback is * extremely low.
*/
kvm_release_page_dirty(pages[i]);
}
for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) { if (is_shadow_present_pte(*spte) || spte == sptep) { if (!start) continue; if (!direct_pte_prefetch_many(vcpu, sp, start, spte)) return;
/* * Without accessed bits, there's no way to distinguish between * actually accessed translations and prefetched, so disable pte * prefetch if accessed bits aren't available.
*/ if (sp_ad_disabled(sp)) return;
if (sp->role.level > PG_LEVEL_4K) return;
/* * If addresses are being invalidated, skip prefetching to avoid * accidentally prefetching those addresses.
*/ if (unlikely(vcpu->kvm->mmu_invalidate_in_progress)) return;
__direct_pte_prefetch(vcpu, sp, sptep);
}
/*
 * Lookup the mapping level for @gfn in the current mm.
 *
 * Returns PG_LEVEL_1G if the walk finds a leaf PUD, PG_LEVEL_2M if it finds a
 * leaf PMD, and PG_LEVEL_4K in every other case, including when any level of
 * the walk is empty or not present.
 *
 * WARNING!  Use of host_pfn_mapping_level() requires the caller and the end
 * consumer to be tied into KVM's handlers for MMU notifier events!
 *
 * There are several ways to safely use this helper:
 *
 * - Check mmu_invalidate_retry_gfn() after grabbing the mapping level, before
 *   consuming it.  In this case, mmu_lock doesn't need to be held during the
 *   lookup, but it does need to be held while checking the MMU notifier.
 *
 * - Hold mmu_lock AND ensure there is no in-progress MMU notifier invalidation
 *   event for the hva.  This can be done by explicit checking the MMU notifier
 *   or by ensuring that KVM already has a valid mapping that covers the hva.
 *
 * - Do not use the result to install new mappings, e.g. use the host mapping
 *   level only to decide whether or not to zap an entry.  In this case, it's
 *   not required to hold mmu_lock (though it's highly likely the caller will
 *   want to hold mmu_lock anyways, e.g. to modify SPTEs).
 *
 * Note!  The lookup can still race with modifications to host page tables, but
 * the above "rules" ensure KVM will not _consume_ the result of the walk if a
 * race with the primary MMU occurs.
 */
static int host_pfn_mapping_level(struct kvm *kvm, gfn_t gfn,
				  const struct kvm_memory_slot *slot)
{
	int level = PG_LEVEL_4K;
	unsigned long hva;
	unsigned long flags;
	pgd_t pgd;
	p4d_t p4d;
	pud_t pud;
	pmd_t pmd;

	/*
	 * Note, using the already-retrieved memslot and __gfn_to_hva_memslot()
	 * is not solely for performance, it's also necessary to avoid the
	 * "writable" check in __gfn_to_hva_many(), which will always fail on
	 * read-only memslots due to gfn_to_hva() assuming writes.  Earlier
	 * page fault steps have already verified the guest isn't writing a
	 * read-only memslot.
	 */
	hva = __gfn_to_hva_memslot(slot, gfn);

	/*
	 * Disable IRQs to prevent concurrent tear down of host page tables,
	 * e.g. if the primary MMU promotes a P*D to a huge page and then frees
	 * the original page table.
	 */
	local_irq_save(flags);

	/*
	 * Read each entry once.  As above, a non-leaf entry can be promoted to
	 * a huge page _during_ this walk.  Re-reading the entry could send the
	 * walk into the weeds, e.g. p*d_leaf() returns false (sees the old
	 * value) and then p*d_offset() walks into the target huge page instead
	 * of the old page table (sees the new value).
	 */
	pgd = READ_ONCE(*pgd_offset(kvm->mm, hva));
	if (pgd_none(pgd))
		goto out;

	p4d = READ_ONCE(*p4d_offset(&pgd, hva));
	if (p4d_none(p4d) || !p4d_present(p4d))
		goto out;

	pud = READ_ONCE(*pud_offset(&p4d, hva));
	if (pud_none(pud) || !pud_present(pud))
		goto out;

	if (pud_leaf(pud)) {
		level = PG_LEVEL_1G;
		goto out;
	}

	pmd = READ_ONCE(*pmd_offset(&pud, hva));
	if (pmd_none(pmd) || !pmd_present(pmd))
		goto out;

	if (pmd_leaf(pmd))
		level = PG_LEVEL_2M;

out:
	/* Single exit so that IRQs are restored on every path. */
	local_irq_restore(flags);
	return level;
}
staticint __kvm_mmu_max_mapping_level(struct kvm *kvm, conststruct kvm_memory_slot *slot,
gfn_t gfn, int max_level, bool is_private)
{ struct kvm_lpage_info *linfo; int host_level;
if (unlikely(fault->max_level == PG_LEVEL_4K)) return;
if (is_error_noslot_pfn(fault->pfn)) return;
if (kvm_slot_dirty_track_enabled(slot)) return;
/* * Enforce the iTLB multihit workaround after capturing the requested * level, which will be used to do precise, accurate accounting.
*/
fault->req_level = __kvm_mmu_max_mapping_level(vcpu->kvm, slot,
fault->gfn, fault->max_level,
fault->is_private); if (fault->req_level == PG_LEVEL_4K || fault->huge_page_disallowed) return;
/* * mmu_invalidate_retry() was successful and mmu_lock is held, so * the pmd can't be split from under us.
*/
fault->goal_level = fault->req_level;
mask = KVM_PAGES_PER_HPAGE(fault->goal_level) - 1;
VM_BUG_ON((fault->gfn & mask) != (fault->pfn & mask));
fault->pfn &= ~mask;
}
/*
 * Push the fault's goal level down by one if the SPTE found at @cur_level
 * already links a child shadow page that is flagged nx_huge_page_disallowed.
 *
 * In that situation a small mapping exists for this pfn, yet FNAME(fetch),
 * direct_map(), or kvm_tdp_mmu_map() would like to create a large PTE.  The
 * callers are instead forced to descend one more level, and the bits of the
 * gfn that index within @cur_level (but above @cur_level - 1) are patched
 * back into the pfn.
 *
 * @fault:     page fault being resolved; ->pfn and ->goal_level may be updated
 * @spte:      value of the SPTE encountered at @cur_level
 * @cur_level: level of the shadow page walk at which @spte was read
 */
void disallowed_hugepage_adjust(struct kvm_page_fault *fault, u64 spte, int cur_level)
{
	u64 index_bits;

	/* Only the level the fault currently targets can need adjusting. */
	if (cur_level <= PG_LEVEL_4K || cur_level != fault->goal_level)
		return;

	/*
	 * Only a present, non-large SPTE is handed to spte_to_child_sp(), so
	 * these checks must stay ahead of the lookup below.
	 */
	if (!is_shadow_present_pte(spte) || is_large_pte(spte))
		return;

	if (!spte_to_child_sp(spte)->nx_huge_page_disallowed)
		return;

	index_bits = KVM_PAGES_PER_HPAGE(cur_level) -
		     KVM_PAGES_PER_HPAGE(cur_level - 1);

	fault->pfn |= fault->gfn & index_bits;
	fault->goal_level--;
}
trace_kvm_mmu_spte_requested(fault);
for_each_shadow_entry(vcpu, fault->addr, it) { /* * We cannot overwrite existing page tables with an NX * large page, as the leaf could be executable.
*/ if (fault->nx_huge_page_workaround_enabled)
disallowed_hugepage_adjust(fault, *it.sptep, it.level);
base_gfn = gfn_round_for_level(fault->gfn, it.level); if (it.level == fault->goal_level) break;
/* * Do not cache the mmio info caused by writing the readonly gfn * into the spte, otherwise a read access on the readonly gfn can also * cause an mmio page fault and be treated as an mmio access.
*/ if (fault->pfn == KVM_PFN_ERR_RO_FAULT) return RET_PF_EMULATE;
if (fault->pfn == KVM_PFN_ERR_HWPOISON) {
kvm_send_hwpoison_signal(fault->slot, fault->gfn); return RET_PF_RETRY;
}
/* * If MMIO caching is disabled, emulate immediately without * touching the shadow page tables as attempting to install an * MMIO SPTE will just be an expensive nop.
*/ if (unlikely(!enable_mmio_caching)) return RET_PF_EMULATE;
/* * Do not create an MMIO SPTE for a gfn greater than host.MAXPHYADDR, * any guest that generates such gfns is running nested and is being * tricked by L0 userspace (you can observe gfn > L1.MAXPHYADDR if and * only if L1's MAXPHYADDR is inaccurate with respect to the * hardware's).
*/ if (unlikely(fault->gfn > kvm_mmu_max_gfn())) return RET_PF_EMULATE;
return RET_PF_CONTINUE;
}
staticbool page_fault_can_be_fast(struct kvm *kvm, struct kvm_page_fault *fault)
{ /* * Page faults with reserved bits set, i.e. faults on MMIO SPTEs, only * reach the common page fault handler if the SPTE has an invalid MMIO * generation number. Refreshing the MMIO generation needs to go down * the slow path. Note, EPT Misconfigs do NOT set the PRESENT flag!
*/ if (fault->rsvd) returnfalse;
/* * For hardware-protected VMs, certain conditions like attempting to * perform a write to a page which is not in the state that the guest * expects it to be in can result in a nested/extended #PF. In this * case, the below code might misconstrue this situation as being the * result of a write-protected access, and treat it as a spurious case * rather than taking any action to satisfy the real source of the #PF * such as generating a KVM_EXIT_MEMORY_FAULT. This can lead to the * guest spinning on a #PF indefinitely, so don't attempt the fast path * in this case. * * Note that the kvm_mem_is_private() check might race with an * attribute update, but this will either result in the guest spinning * on RET_PF_SPURIOUS until the update completes, or an actual spurious * case might go down the slow path. Either case will resolve itself.
*/ if (kvm->arch.has_private_mem &&
fault->is_private != kvm_mem_is_private(kvm, fault->gfn)) returnfalse;
/* * #PF can be fast if: * * 1. The shadow page table entry is not present and A/D bits are * disabled _by KVM_, which could mean that the fault is potentially * caused by access tracking (if enabled). If A/D bits are enabled * by KVM, but disabled by L1 for L2, KVM is forced to disable A/D * bits for L2 and employ access tracking, but the fast page fault * mechanism only supports direct MMUs. * 2. The shadow page table entry is present, the access is a write, * and no reserved bits are set (MMIO SPTEs cannot be "fixed"), i.e. * the fault was caused by a write-protection violation. If the * SPTE is MMU-writable (determined later), the fault can be fixed * by setting the Writable bit, which can be done out of mmu_lock.
*/ if (!fault->present) return !kvm_ad_enabled;
/* * Note, instruction fetches and writes are mutually exclusive, ignore * the "exec" flag.
*/ return fault->write;
}
/* * Returns true if the SPTE was fixed successfully. Otherwise, * someone else modified the SPTE from its original value.
*/ staticbool fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
u64 *sptep, u64 old_spte, u64 new_spte)
{ /* * Theoretically we could also set dirty bit (and flush TLB) here in * order to eliminate unnecessary PML logging. See comments in * set_spte. But fast_page_fault is very unlikely to happen with PML * enabled, so we do not do this. This might result in the same GPA * to be logged in PML buffer again when the write really happens, and * eventually to be called by mark_page_dirty twice. But it's also no * harm. This also avoids the TLB flush needed after setting dirty bit * so non-PML cases won't be impacted. * * Compare with make_spte() where instead shadow_dirty_mask is set.
*/ if (!try_cmpxchg64(sptep, &old_spte, new_spte)) returnfalse;
if (is_writable_pte(new_spte) && !is_writable_pte(old_spte))
mark_page_dirty_in_slot(vcpu->kvm, fault->slot, fault->gfn);
returntrue;
}
/* * Returns the last level spte pointer of the shadow page walk for the given * gpa, and sets *spte to the spte value. This spte may be non-present. If no * walk could be performed, returns NULL and *spte does not contain valid data. * * Contract: * - Must be called between walk_shadow_page_lockless_{begin,end}. * - The returned sptep must not be used after walk_shadow_page_lockless_end.
*/ static u64 *fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, gpa_t gpa, u64 *spte)
{ struct kvm_shadow_walk_iterator iterator;
u64 old_spte;
u64 *sptep = NULL;
/* * It's entirely possible for the mapping to have been zapped * by a different task, but the root page should always be * available as the vCPU holds a reference to its root(s).
*/ if (WARN_ON_ONCE(!sptep))
spte = FROZEN_SPTE;
if (!is_shadow_present_pte(spte)) break;
sp = sptep_to_sp(sptep); if (!is_last_spte(spte, sp->role.level)) break;
/* * Check whether the memory access that caused the fault would * still cause it if it were to be performed right now. If not, * then this is a spurious fault caused by TLB lazily flushed, * or some other CPU has already fixed the PTE after the * current CPU took the fault. * * Need not check the access of upper level table entries since * they are always ACC_ALL.
*/ if (is_access_allowed(fault, spte)) {
ret = RET_PF_SPURIOUS; break;
}
new_spte = spte;
/* * KVM only supports fixing page faults outside of MMU lock for * direct MMUs, nested MMUs are always indirect, and KVM always * uses A/D bits for non-nested MMUs. Thus, if A/D bits are * enabled, the SPTE can't be an access-tracked SPTE.
*/ if (unlikely(!kvm_ad_enabled) && is_access_track_spte(spte))
new_spte = restore_acc_track_spte(new_spte) |
shadow_accessed_mask;
/* * To keep things simple, only SPTEs that are MMU-writable can * be made fully writable outside of mmu_lock, e.g. only SPTEs * that were write-protected for dirty-logging or access * tracking are handled here. Don't bother checking if the * SPTE is writable to prioritize running with A/D bits enabled. * The is_access_allowed() check above handles the common case * of the fault being spurious, and the SPTE is known to be * shadow-present, i.e. except for access tracking restoration * making the new SPTE writable, the check is wasteful.
*/ if (fault->write && is_mmu_writable_spte(spte)) {
new_spte |= PT_WRITABLE_MASK;
/* * Do not fix write-permission on the large spte when * dirty logging is enabled. Since we only dirty the * first page into the dirty-bitmap in * fast_pf_fix_direct_spte(), other pages are missed * if its slot has dirty logging enabled. * * Instead, we let the slow page fault path create a * normal spte to fix the access.
*/ if (sp->role.level > PG_LEVEL_4K &&
kvm_slot_dirty_track_enabled(fault->slot)) break;
}
/* Verify that the fault can be handled in the fast path */ if (new_spte == spte ||
!is_access_allowed(fault, new_spte)) break;
/* * Currently, fast page fault only works for direct mapping * since the gfn is not stable for indirect shadow page. See * Documentation/virt/kvm/locking.rst to get more detail.
*/ if (fast_pf_fix_direct_spte(vcpu, fault, sptep, spte, new_spte)) {
ret = RET_PF_FIXED; break;
}
if (++retry_count > 4) {
pr_warn_once("Fast #PF retrying more than 4 times.\n"); break;
}
/* Before acquiring the MMU lock, see if we need to do any real work. */
free_active_root = (roots_to_free & KVM_MMU_ROOT_CURRENT)
&& VALID_PAGE(mmu->root.hpa);
if (!free_active_root) { for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) if ((roots_to_free & KVM_MMU_ROOT_PREVIOUS(i)) &&
VALID_PAGE(mmu->prev_roots[i].hpa)) break;
if (i == KVM_MMU_NUM_PREV_ROOTS) return;
}
if (is_tdp_mmu)
read_lock(&kvm->mmu_lock); else
write_lock(&kvm->mmu_lock);
for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) if (roots_to_free & KVM_MMU_ROOT_PREVIOUS(i))
mmu_free_root_page(kvm, &mmu->prev_roots[i].hpa,
&invalid_list);
if (free_active_root) { if (kvm_mmu_is_dummy_root(mmu->root.hpa)) { /* Nothing to cleanup for dummy roots. */
} elseif (root_to_sp(mmu->root.hpa)) {
mmu_free_root_page(kvm, &mmu->root.hpa, &invalid_list);
} elseif (mmu->pae_root) { for (i = 0; i < 4; ++i) { if (!IS_VALID_PAE_ROOT(mmu->pae_root[i])) continue;
/* * This should not be called while L2 is active, L2 can't invalidate * _only_ its own roots, e.g. INVVPID unconditionally exits.
*/
WARN_ON_ONCE(mmu->root_role.guest_mode);
for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
root_hpa = mmu->prev_roots[i].hpa; if (!VALID_PAGE(root_hpa)) continue;
h = kvcalloc(KVM_NUM_MMU_PAGES, sizeof(*h), GFP_KERNEL_ACCOUNT); if (!h) return -ENOMEM;
/* * Ensure the hash table pointer is set only after all stores to zero * the memory are retired. Pairs with the smp_load_acquire() in * kvm_get_mmu_page_hash(). Note, mmu_lock must be held for write to * add (or remove) shadow pages, and so readers are guaranteed to see * an empty list for their current mmu_lock critical section.
*/
smp_store_release(&kvm->arch.mmu_page_hash, h); return 0;
}
staticint mmu_first_shadow_root_alloc(struct kvm *kvm)
{ struct kvm_memslots *slots; struct kvm_memory_slot *slot; int r = 0, i, bkt;
/* * Check if this is the first shadow root being allocated before * taking the lock.
*/ if (kvm_shadow_root_allocated(kvm)) return 0;
mutex_lock(&kvm->slots_arch_lock);
/* Recheck, under the lock, whether this is the first shadow root. */ if (kvm_shadow_root_allocated(kvm)) goto out_unlock;
r = kvm_mmu_alloc_page_hash(kvm); if (r) goto out_unlock;
/* * Check if memslot metadata actually needs to be allocated, e.g. all * metadata will be allocated upfront if TDP is disabled.
*/ if (kvm_memslots_have_rmaps(kvm) &&
kvm_page_track_write_tracking_enabled(kvm)) goto out_success;
for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) {
slots = __kvm_memslots(kvm, i);
kvm_for_each_memslot(slot, bkt, slots) { /* * Both of these functions are no-ops if the target is * already allocated, so unconditionally calling both * is safe. Intentionally do NOT free allocations on * failure to avoid having to track which allocations * were made now versus when the memslot was created. * The metadata is guaranteed to be freed when the slot * is freed, and will be kept/used if userspace retries * KVM_RUN instead of killing the VM.
*/
r = memslot_rmap_alloc(slot, slot->npages); if (r) goto out_unlock;
r = kvm_page_track_write_tracking_alloc(slot); if (r) goto out_unlock;
}
}
/* * Ensure that shadow_root_allocated becomes true strictly after * all the related pointers are set.
*/
out_success:
smp_store_release(&kvm->arch.shadow_root_allocated, true);
if (!kvm_vcpu_is_visible_gfn(vcpu, root_gfn)) {
mmu->root.hpa = kvm_mmu_get_dummy_root(); return 0;
}
/* * On SVM, reading PDPTRs might access guest memory, which might fault * and thus might sleep. Grab the PDPTRs before acquiring mmu_lock.
*/ if (mmu->cpu_role.base.level == PT32E_ROOT_LEVEL) { for (i = 0; i < 4; ++i) {
pdptrs[i] = mmu->get_pdptr(vcpu, i); if (!(pdptrs[i] & PT_PRESENT_MASK)) continue;
if (!kvm_vcpu_is_visible_gfn(vcpu, pdptrs[i] >> PAGE_SHIFT))
pdptrs[i] = 0;
}
}
r = mmu_first_shadow_root_alloc(vcpu->kvm); if (r) return r;
write_lock(&vcpu->kvm->mmu_lock);
r = make_mmu_pages_available(vcpu); if (r < 0) goto out_unlock;
/* * Do we shadow a long mode page table? If so we need to * write-protect the guests page table root.
*/ if (mmu->cpu_role.base.level >= PT64_ROOT_4LEVEL) {
root = mmu_alloc_root(vcpu, root_gfn, 0,
mmu->root_role.level);
mmu->root.hpa = root; goto set_root_pgd;
}
if (WARN_ON_ONCE(!mmu->pae_root)) {
r = -EIO; goto out_unlock;
}
/* * We shadow a 32 bit page table. This may be a legacy 2-level * or a PAE 3-level page table. In either case we need to be aware that * the shadow page table may be a PAE or a long mode page table.
*/
pm_mask = PT_PRESENT_MASK | shadow_me_value; if (mmu->root_role.level >= PT64_ROOT_4LEVEL) {
pm_mask |= PT_ACCESSED_MASK | PT_WRITABLE_MASK | PT_USER_MASK;
if (WARN_ON_ONCE(!mmu->pml4_root)) {
r = -EIO; goto out_unlock;
}
mmu->pml4_root[0] = __pa(mmu->pae_root) | pm_mask;
if (mmu->root_role.level == PT64_ROOT_5LEVEL) { if (WARN_ON_ONCE(!mmu->pml5_root)) {
r = -EIO; goto out_unlock;
}
mmu->pml5_root[0] = __pa(mmu->pml4_root) | pm_mask;
}
}
for (i = 0; i < 4; ++i) {
WARN_ON_ONCE(IS_VALID_PAE_ROOT(mmu->pae_root[i]));
/* * If shadowing 32-bit non-PAE page tables, each PAE page * directory maps one quarter of the guest's non-PAE page * directory. Otherwise each PAE page directory shadows one guest * PAE page directory so that quadrant should be 0.
*/
quadrant = (mmu->cpu_role.base.level == PT32_ROOT_LEVEL) ? i : 0;
/* * When shadowing 32-bit or PAE NPT with 64-bit NPT, the PML4 and PDP * tables are allocated and initialized at root creation as there is no * equivalent level in the guest's NPT to shadow. Allocate the tables * on demand, as running a 32-bit L1 VMM on 64-bit KVM is very rare.
*/ if (mmu->root_role.direct ||
mmu->cpu_role.base.level >= PT64_ROOT_4LEVEL ||
mmu->root_role.level < PT64_ROOT_4LEVEL) return 0;
/* * NPT, the only paging mode that uses this horror, uses a fixed number * of levels for the shadow page tables, e.g. all MMUs are 4-level or * all MMUs are 5-level. Thus, this can safely require that pml5_root * is allocated if the other roots are valid and pml5 is needed, as any * prior MMU would also have required pml5.
*/ if (mmu->pae_root && mmu->pml4_root && (!need_pml5 || mmu->pml5_root)) return 0;
/* * The special roots should always be allocated in concert. Yell and * bail if KVM ends up in a state where only one of the roots is valid.
*/ if (WARN_ON_ONCE(!tdp_enabled || mmu->pae_root || mmu->pml4_root ||
(need_pml5 && mmu->pml5_root))) return -EIO;
/* * Unlike 32-bit NPT, the PDP table doesn't need to be in low mem, and * doesn't need to be decrypted.
*/
pae_root = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT); if (!pae_root) return -ENOMEM;
#ifdef CONFIG_X86_64
pml4_root = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT); if (!pml4_root) goto err_pml4;
if (need_pml5) {
pml5_root = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT); if (!pml5_root) goto err_pml5;
} #endif
if (!VALID_PAGE(root) || kvm_mmu_is_dummy_root(root)) returnfalse;
/* * The read barrier orders the CPU's read of SPTE.W during the page table * walk before the reads of sp->unsync/sp->unsync_children here. * * Even if another CPU was marking the SP as unsync-ed simultaneously, * any guest page table changes are not guaranteed to be visible anyway * until this VCPU issues a TLB flush strictly after those changes are * made. We only need to ensure that the other CPU sets these flags * before any actual changes to the page tables are made. The comments * in mmu_try_to_unsync_pages() describe what could go wrong if this * requirement isn't satisfied.
*/
smp_rmb();
sp = root_to_sp(root);
/* * PAE roots (somewhat arbitrarily) aren't backed by shadow pages, the * PDPTEs for a given PAE root need to be synchronized individually.
*/ if (WARN_ON_ONCE(!sp)) returnfalse;
if (sp->unsync || sp->unsync_children) returntrue;
returnfalse;
}
void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
{ int i; struct kvm_mmu_page *sp;
if (vcpu->arch.mmu->root_role.direct) return;
if (!VALID_PAGE(vcpu->arch.mmu->root.hpa)) return;
vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY);
if (vcpu->arch.mmu->cpu_role.base.level >= PT64_ROOT_4LEVEL) {
hpa_t root = vcpu->arch.mmu->root.hpa;
/*
 * Check whether @addr hits the vCPU's cached MMIO info.
 *
 * @addr is matched as a GPA when @direct is true and as a GVA otherwise.
 * Always returns false when mmu_is_nested() is true for @vcpu.
 */
static bool mmio_info_in_cache(struct kvm_vcpu *vcpu, u64 addr, bool direct)
{
	/*
	 * A nested guest cannot use the MMIO cache if it is using nested
	 * page tables, because cr2 is a nGPA while the cache stores GPAs.
	 */
	if (mmu_is_nested(vcpu))
		return false;

	if (direct)
		return vcpu_match_mmio_gpa(vcpu, addr);

	return vcpu_match_mmio_gva(vcpu, addr);
}
/* * Return the level of the lowest level SPTE added to sptes. * That SPTE may be non-present. * * Must be called between walk_shadow_page_lockless_{begin,end}.
*/ staticint get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, int *root_level)
{ struct kvm_shadow_walk_iterator iterator; int leaf = -1;
u64 spte;
/* * Skip reserved bits checks on the terminal leaf if it's not a valid * SPTE. Note, this also (intentionally) skips MMIO SPTEs, which, by * design, always have reserved bits set. The purpose of the checks is * to detect reserved bits on non-MMIO SPTEs. i.e. buggy SPTEs.
*/ if (!is_shadow_present_pte(sptes[leaf]))
leaf++;
if (!fault->present || !fault->write) returnfalse;
/* * guest is writing the page which is write tracked which can * not be fixed by page fault handler.
*/ if (kvm_gfn_is_write_tracked(vcpu->kvm, fault->slot, fault->gfn)) returntrue;
/*
 * Re-run the page fault described by @work on @vcpu once the async page
 * fault has completed.  Bails out silently whenever the recorded fault no
 * longer applies to the vCPU's current MMU.
 */
void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work)
{
	int ret;

	/* A recorded error code flagged as a private access is unexpected. */
	if (WARN_ON_ONCE(work->arch.error_code & PFERR_PRIVATE_ACCESS))
		return;

	/* The MMU's direct-ness must still match what was recorded in @work. */
	if (vcpu->arch.mmu->root_role.direct != work->arch.direct_map)
		return;

	/* Nothing to re-fault for a "wake up all" work item. */
	if (work->wakeup_all)
		return;

	if (unlikely(kvm_mmu_reload(vcpu)))
		return;

	/*
	 * For an indirect MMU, the recorded CR3 must still be the guest PGD
	 * that the MMU is currently using.
	 */
	if (!vcpu->arch.mmu->root_role.direct &&
	    work->arch.cr3 != kvm_mmu_get_guest_pgd(vcpu, vcpu->arch.mmu))
		return;

	ret = kvm_mmu_do_page_fault(vcpu, work->cr2_or_gpa,
				    work->arch.error_code, true, NULL, NULL);

	/*
	 * Account fixed page faults, otherwise they'll never be counted, but
	 * ignore stats for all other return types.  Page-ready "faults" aren't
	 * truly spurious and never trigger emulation.
	 */
	if (ret != RET_PF_FIXED)
		return;

	vcpu->stat.pf_fixed++;
}
/* * If resolving the page failed because I/O is needed to fault-in the * page, then either set up an asynchronous #PF to do the I/O, or if * doing an async #PF isn't possible, retry with I/O allowed. All * other failures are terminal, i.e. retrying won't help.
*/ if (fault->pfn != KVM_PFN_ERR_NEEDS_IO) return RET_PF_CONTINUE;
/* * Allow gup to bail on pending non-fatal signals when it's also allowed * to wait for IO. Note, gup always bails if it is unable to quickly * get a page and a fatal signal, i.e. SIGKILL, is pending.
*/
foll |= FOLL_INTERRUPTIBLE;
foll &= ~FOLL_NOWAIT;
fault->pfn = __kvm_faultin_pfn(fault->slot, fault->gfn, foll,
&fault->map_writable, &fault->refcounted_page);
if (KVM_BUG_ON(kvm_is_gfn_alias(kvm, fault->gfn), kvm)) return -EFAULT;
	/*
	 * Note that the mmu_invalidate_seq also serves to detect a concurrent
	 * change in attributes.  is_page_fault_stale() will detect an
	 * invalidation related to fault->gfn and resume the guest without
	 * installing a mapping in the page tables.
*/
fault->mmu_seq = vcpu->kvm->mmu_invalidate_seq;
smp_rmb();
/* * Now that we have a snapshot of mmu_invalidate_seq we can check for a * private vs. shared mismatch.
*/ if (fault->is_private != kvm_mem_is_private(kvm, fault->gfn)) {
kvm_mmu_prepare_memory_fault_exit(vcpu, fault); return -EFAULT;
}
if (unlikely(!slot)) return kvm_handle_noslot_fault(vcpu, fault, access);
/* * Retry the page fault if the gfn hit a memslot that is being deleted * or moved. This ensures any existing SPTEs for the old memslot will * be zapped before KVM inserts a new MMIO SPTE for the gfn.
*/ if (slot->flags & KVM_MEMSLOT_INVALID) return RET_PF_RETRY;
if (slot->id == APIC_ACCESS_PAGE_PRIVATE_MEMSLOT) { /* * Don't map L1's APIC access page into L2, KVM doesn't support * using APICv/AVIC to accelerate L2 accesses to L1's APIC, * i.e. the access needs to be emulated. Emulating access to * L1's APIC is also correct if L1 is accelerating L2's own * virtual APIC, but for some reason L1 also maps _L1's_ APIC * into L2. Note, vcpu_is_mmio_gpa() always treats access to * the APIC as MMIO. Allow an MMIO SPTE to be created, as KVM * uses different roots for L1 vs. L2, i.e. there is no danger * of breaking APICv/AVIC for L1.
*/ if (is_guest_mode(vcpu)) return kvm_handle_noslot_fault(vcpu, fault, access);
/* * If the APIC access page exists but is disabled, go directly * to emulation without caching the MMIO access or creating a * MMIO SPTE. That way the cache doesn't need to be purged * when the AVIC is re-enabled.
*/ if (!kvm_apicv_activated(vcpu->kvm)) return RET_PF_EMULATE;
}
	/*
	 * Check for a relevant mmu_notifier invalidation event before getting
	 * the pfn from the primary MMU, and before acquiring mmu_lock.
	 *
	 * For mmu_lock, if there is an in-progress invalidation and the kernel
	 * allows preemption, the invalidation task may drop mmu_lock and yield
	 * in response to mmu_lock being contended, which is *very* counter-
	 * productive as this vCPU can't actually make forward progress until
	 * the invalidation completes.
	 *
	 * Retrying now can also avoid unnecessary lock contention in the primary
	 * MMU, as the primary MMU doesn't necessarily hold a single lock for
	 * the duration of the invalidation, i.e. faulting in a conflicting pfn
	 * can cause the invalidation to take longer by holding locks that are
	 * needed to complete the invalidation.
	 *
	 * Do the pre-check even for non-preemptible kernels, i.e. even if KVM
	 * will never yield mmu_lock in response to contention, as this vCPU is
	 * *guaranteed* to need to retry, i.e. waiting until mmu_lock is held
	 * to detect retry guarantees the worst case latency for the vCPU.
*/ if (mmu_invalidate_retry_gfn_unsafe(kvm, fault->mmu_seq, fault->gfn)) return RET_PF_RETRY;
ret = __kvm_mmu_faultin_pfn(vcpu, fault); if (ret != RET_PF_CONTINUE) return ret;
if (unlikely(is_error_pfn(fault->pfn))) return kvm_handle_error_pfn(vcpu, fault);
if (WARN_ON_ONCE(!fault->slot || is_noslot_pfn(fault->pfn))) return kvm_handle_noslot_fault(vcpu, fault, access);
/* * Check again for a relevant mmu_notifier invalidation event purely to * avoid contending mmu_lock. Most invalidations will be detected by * the previous check, but checking is extremely cheap relative to the * overall cost of failing to detect the invalidation until after * mmu_lock is acquired.
*/ if (mmu_invalidate_retry_gfn_unsafe(kvm, fault->mmu_seq, fault->gfn)) {
kvm_mmu_finish_page_fault(vcpu, fault, RET_PF_RETRY); return RET_PF_RETRY;
}
return RET_PF_CONTINUE;
}
/*
 * Returns true if the page fault is stale and needs to be retried, i.e. if the
 * root was invalidated by a memslot update or a relevant mmu_notifier fired.
 *
 * The final check uses mmu_invalidate_retry_gfn() and is only performed when
 * @fault has a memslot.
 */
static bool is_page_fault_stale(struct kvm_vcpu *vcpu,
				struct kvm_page_fault *fault)
{
	struct kvm_mmu_page *sp = root_to_sp(vcpu->arch.mmu->root.hpa);

	/* Special roots, e.g. pae_root, are not backed by shadow pages. */
	if (sp && is_obsolete_sp(vcpu->kvm, sp))
		return true;

	/*
	 * Roots without an associated shadow page are considered invalid if
	 * there is a pending request to free obsolete roots.  The request is
	 * only a hint that the current root _may_ be obsolete and needs to be
	 * reloaded, e.g. if the guest frees a PGD that KVM is tracking as a
	 * previous root, then __kvm_mmu_prepare_zap_page() signals all vCPUs
	 * to reload even if no vCPU is actively using the root.
	 */
	if (!sp && kvm_test_request(KVM_REQ_MMU_FREE_OBSOLETE_ROOTS, vcpu))
		return true;

	/*
	 * Check for a relevant mmu_notifier invalidation event one last time
	 * now that mmu_lock is held, as the "unsafe" checks performed without
	 * holding mmu_lock can get false negatives.
	 */
	return fault->slot &&
	       mmu_invalidate_retry_gfn(vcpu->kvm, fault->mmu_seq, fault->gfn);
}
staticint direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
{ int r;
/* Dummy roots are used only for shadowing bad guest roots. */ if (WARN_ON_ONCE(kvm_mmu_is_dummy_root(vcpu->arch.mmu->root.hpa))) return RET_PF_RETRY;
if (page_fault_handle_page_track(vcpu, fault)) return RET_PF_WRITE_PROTECTED;
r = fast_page_fault(vcpu, fault); if (r != RET_PF_INVALID) return r;
r = mmu_topup_memory_caches(vcpu, false); if (r) return r;
r = kvm_mmu_faultin_pfn(vcpu, fault, ACC_ALL); if (r != RET_PF_CONTINUE) return r;
r = RET_PF_RETRY;
write_lock(&vcpu->kvm->mmu_lock);
if (is_page_fault_stale(vcpu, fault)) goto out_unlock;
r = make_mmu_pages_available(vcpu); if (r) goto out_unlock;
/*
 * Page fault handler for the non-paging case: cap the mapping level at 2M
 * and hand the fault to direct_page_fault(), whose result is returned.
 */
static int nonpaging_page_fault(struct kvm_vcpu *vcpu,
				struct kvm_page_fault *fault)
{
	/* This path builds a PAE pagetable, we can map 2mb pages at maximum. */
	fault->max_level = PG_LEVEL_2M;
	return direct_page_fault(vcpu, fault);
}
int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code,
u64 fault_address, char *insn, int insn_len)
{ int r = 1;
u32 flags = vcpu->arch.apf.host_apf_flags;
#ifndef CONFIG_X86_64 /* A 64-bit CR2 should be impossible on 32-bit KVM. */ if (WARN_ON_ONCE(fault_address >> 32)) return -EFAULT; #endif /* * Legacy #PF exception only have a 32-bit error code. Simply drop the * upper bits as KVM doesn't use them for #PF (because they are never * set), and to ensure there are no collisions with KVM-defined bits.
*/ if (WARN_ON_ONCE(error_code >> 32))
error_code = lower_32_bits(error_code);
/* * Restrict KVM-defined flags to bits 63:32 so that it's impossible for * them to conflict with #PF error codes, which are limited to 32 bits.
*/
BUILD_BUG_ON(lower_32_bits(PFERR_SYNTHETIC_MASK));
vcpu->arch.l1tf_flush_l1d = true; if (!flags) {
trace_kvm_page_fault(vcpu, fault_address, error_code);
/*
 * TDP page fault entry point.  On 64-bit builds with tdp_mmu_enabled set the
 * fault is handled by kvm_tdp_mmu_page_fault(); in every other case it is
 * handled by direct_page_fault().  Returns the chosen handler's result.
 */
int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
{
#ifdef CONFIG_X86_64
	if (tdp_mmu_enabled)
		return kvm_tdp_mmu_page_fault(vcpu, fault);
#endif

	return direct_page_fault(vcpu, fault);
}
/*
 * Fault in @gpa on @vcpu through the TDP page fault path, retrying for as
 * long as the fault handler asks for a retry.
 *
 * Returns 0 if the fault was fixed, spurious or write-protected, -ENOENT if
 * the fault would require emulation, -EOPNOTSUPP if the vCPU's MMU is not
 * using kvm_tdp_page_fault, -EINTR if a signal is pending, -EIO if the VM is
 * dead or the fault ended in an unexpected state, or the negative error
 * reported by the fault handler.
 */
int kvm_tdp_map_page(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code, u8 *level)
{
	int ret;

	/*
	 * Only the TDP page fault handler indexes the MMU by GPA, so refuse
	 * anything else.
	 */
	if (vcpu->arch.mmu->page_fault != kvm_tdp_page_fault)
		return -EOPNOTSUPP;

	for (;;) {
		if (signal_pending(current))
			return -EINTR;

		if (kvm_check_request(KVM_REQ_VM_DEAD, vcpu))
			return -EIO;

		cond_resched();

		ret = kvm_mmu_do_page_fault(vcpu, gpa, error_code, true, NULL,
					    level);
		if (ret != RET_PF_RETRY)
			break;
	}

	if (ret < 0)
		return ret;

	if (ret == RET_PF_FIXED || ret == RET_PF_SPURIOUS ||
	    ret == RET_PF_WRITE_PROTECTED)
		return 0;

	if (ret == RET_PF_EMULATE)
		return -ENOENT;

	/* RET_PF_RETRY, RET_PF_CONTINUE, RET_PF_INVALID and anything else. */
	WARN_ONCE(1, "could not fix page fault during prefault");
	return -EIO;
}
EXPORT_SYMBOL_GPL(kvm_tdp_map_page);
/* * Shadow paging uses GVA for kvm page fault, so restrict to * two-dimensional paging.
*/
r = kvm_tdp_map_page(vcpu, range->gpa | direct_bits, error_code, &level); if (r < 0) return r;
/* * If the mapping that covers range->gpa can use a huge page, it * may start below it or end after range->gpa + range->size.
*/
end = (range->gpa & KVM_HPAGE_MASK(level)) + KVM_HPAGE_SIZE(level); return min(range->size, end - range->gpa);
}
if (!role.direct && pgd != root->pgd) returnfalse;
sp = root_to_sp(root->hpa); if (WARN_ON_ONCE(!sp)) returnfalse;
return role.word == sp->role.word;
}
/* * Find out if a previously cached root matching the new pgd/role is available, * and insert the current root as the MRU in the cache. * If a matching root is found, it is assigned to kvm_mmu->root and * true is returned. * If no match is found, kvm_mmu->root is left invalid, the LRU root is * evicted to make room for the current root, and false is returned.
*/ staticbool cached_root_find_and_keep_current(struct kvm *kvm, struct kvm_mmu *mmu,
gpa_t new_pgd, union kvm_mmu_page_role new_role)
{
uint i;
if (is_root_usable(&mmu->root, new_pgd, new_role)) returntrue;
for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) { /* * The swaps end up rotating the cache like this: * C 0 1 2 3 (on entry to the function) * 0 C 1 2 3 * 1 C 0 2 3 * 2 C 0 1 3 * 3 C 0 1 2 (on exit from the loop)
*/
swap(mmu->root, mmu->prev_roots[i]); if (is_root_usable(&mmu->root, new_pgd, new_role)) returntrue;
}
/*
 * Find out if a previously cached root matching the new pgd/role is available.
 * On entry, mmu->root is invalid.
 * If a matching root is found, it is assigned to kvm_mmu->root, the LRU entry
 * of the cache becomes invalid, and true is returned.
 * If no match is found, kvm_mmu->root is left invalid and false is returned.
 *
 * @kvm is not referenced by this function.
 */
static bool cached_root_find_without_current(struct kvm *kvm, struct kvm_mmu *mmu,
					     gpa_t new_pgd,
					     union kvm_mmu_page_role new_role)
{
	uint i;

	for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
		if (is_root_usable(&mmu->prev_roots[i], new_pgd, new_role))
			goto hit;

	return false;

hit:
	/* Move the matching entry into mmu->root. */
	swap(mmu->root, mmu->prev_roots[i]);

	/* Bubble up the remaining roots. */
	for (; i < KVM_MMU_NUM_PREV_ROOTS - 1; i++)
		mmu->prev_roots[i] = mmu->prev_roots[i + 1];

	/* The last slot no longer holds a unique root; invalidate it. */
	mmu->prev_roots[i].hpa = INVALID_PAGE;
	return true;
}
staticbool fast_pgd_switch(struct kvm *kvm, struct kvm_mmu *mmu,
gpa_t new_pgd, union kvm_mmu_page_role new_role)
{ /* * Limit reuse to 64-bit hosts+VMs without "special" roots in order to * avoid having to deal with PDPTEs and other complexities.
*/ if (VALID_PAGE(mmu->root.hpa) && !root_to_sp(mmu->root.hpa))
kvm_mmu_free_roots(kvm, mmu, KVM_MMU_ROOT_CURRENT);
/* * Return immediately if no usable root was found, kvm_mmu_reload() * will establish a valid root prior to the next VM-Enter.
*/ if (!fast_pgd_switch(vcpu->kvm, mmu, new_pgd, new_role)) return;
/* * It's possible that the cached previous root page is obsolete because * of a change in the MMU generation number. However, changing the * generation number is accompanied by KVM_REQ_MMU_FREE_OBSOLETE_ROOTS, * which will free the root set here and allocate a new one.
*/
kvm_make_request(KVM_REQ_LOAD_MMU_PGD, vcpu);
if (force_flush_and_sync_on_reuse) {
kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
}
/* * The last MMIO access's GVA and GPA are cached in the VCPU. When * switching to a new CR3, that GVA->GPA mapping may no longer be * valid. So clear any cached MMIO info even when we don't need to sync * the shadow page tables.
*/
vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY);
/* * If this is a direct root page, it doesn't have a write flooding * count. Otherwise, clear the write flooding count.
*/ if (!new_role.direct) { struct kvm_mmu_page *sp = root_to_sp(vcpu->arch.mmu->root.hpa);
if (!WARN_ON_ONCE(!sp))
__clear_sp_write_flooding_count(sp);
}
}
EXPORT_SYMBOL_GPL(kvm_mmu_new_pgd);
/* Note, NX doesn't exist in PDPTEs, this is handled below. */ if (!nx)
high_bits_rsvd |= rsvd_bits(63, 63);
/* * Non-leaf PML4Es and PDPEs reserve bit 8 (which would be the G bit for * leaf entries) on AMD CPUs only.
*/ if (amd)
nonleaf_bit8_rsvd = rsvd_bits(8, 8);
switch (level) { case PT32_ROOT_LEVEL: /* no rsvd bits for 2 level 4K page table entries */
rsvd_check->rsvd_bits_mask[0][1] = 0;
rsvd_check->rsvd_bits_mask[0][0] = 0;
rsvd_check->rsvd_bits_mask[1][0] =
rsvd_check->rsvd_bits_mask[0][0];
if (!pse) {
rsvd_check->rsvd_bits_mask[1][1] = 0; break;
}
bad_mt_xwr = 0xFFull << (2 * 8); /* bits 3..5 must not be 2 */
bad_mt_xwr |= 0xFFull << (3 * 8); /* bits 3..5 must not be 3 */
bad_mt_xwr |= 0xFFull << (7 * 8); /* bits 3..5 must not be 7 */
bad_mt_xwr |= REPEAT_BYTE(1ull << 2); /* bits 0..2 must not be 010 */
bad_mt_xwr |= REPEAT_BYTE(1ull << 6); /* bits 0..2 must not be 110 */ if (!execonly) { /* bits 0..2 must not be 100 unless VMX capabilities allow it */
bad_mt_xwr |= REPEAT_BYTE(1ull << 4);
}
rsvd_check->bad_mt_xwr = bad_mt_xwr;
}
/* * the page table on host is the shadow page table for the page * table in guest or amd nested guest, its mmu features completely * follow the features in guest.
*/ staticvoid reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context)
{ /* @amd adds a check on bit of SPTEs, which KVM shouldn't use anyways. */ bool is_amd = true; /* KVM doesn't use 2-level page tables for the shadow MMU. */ bool is_pse = false; struct rsvd_bits_validate *shadow_zero_check; int i;
for (i = context->root_role.level; --i >= 0;) { /* * So far shadow_me_value is a constant during KVM's life * time. Bits in shadow_me_value are allowed to be set. * Bits in shadow_me_mask but not in shadow_me_value are * not allowed to be set.
*/
shadow_zero_check->rsvd_bits_mask[0][i] |= shadow_me_mask;
shadow_zero_check->rsvd_bits_mask[1][i] |= shadow_me_mask;
shadow_zero_check->rsvd_bits_mask[0][i] &= ~shadow_me_value;
shadow_zero_check->rsvd_bits_mask[1][i] &= ~shadow_me_value;
}
/* * the direct page table on host, use as much mmu features as * possible, however, kvm currently does not do execution-protection.
*/ staticvoid reset_tdp_shadow_zero_bits_mask(struct kvm_mmu *context)
{ struct rsvd_bits_validate *shadow_zero_check; int i;
for (i = context->root_role.level; --i >= 0;) {
shadow_zero_check->rsvd_bits_mask[0][i] &= ~shadow_me_mask;
shadow_zero_check->rsvd_bits_mask[1][i] &= ~shadow_me_mask;
}
}
/*
 * As the comments in reset_shadow_zero_bits_mask(), except this is the
 * shadow page table for an Intel nested guest.
 *
 * Recomputes @context->shadow_zero_check via __reset_rsvds_bits_mask_ept(),
 * passing reserved_hpa_bits(), @execonly and max_huge_page_level.
 */
static void
reset_ept_shadow_zero_bits_mask(struct kvm_mmu *context, bool execonly)
{
	__reset_rsvds_bits_mask_ept(&context->shadow_zero_check,
				    reserved_hpa_bits(), execonly,
				    max_huge_page_level);
}
/* * Each "*f" variable has a 1 bit for each UWX value * that causes a fault with the given PFEC.
*/
/* Faults from writes to non-writable pages */
u8 wf = (pfec & PFERR_WRITE_MASK) ? (u8)~w : 0; /* Faults from user mode accesses to supervisor pages */
u8 uf = (pfec & PFERR_USER_MASK) ? (u8)~u : 0; /* Faults from fetches of non-executable pages*/
u8 ff = (pfec & PFERR_FETCH_MASK) ? (u8)~x : 0; /* Faults from kernel mode fetches of user pages */
u8 smepf = 0; /* Faults from kernel mode accesses of user pages */
u8 smapf = 0;
if (!ept) { /* Faults from kernel mode accesses to user pages */
u8 kf = (pfec & PFERR_USER_MASK) ? 0 : u;
/* Not really needed: !nx will cause pte.nx to fault */ if (!efer_nx)
ff = 0;
/* Allow supervisor writes if !cr0.wp */ if (!cr0_wp)
wf = (pfec & PFERR_USER_MASK) ? wf : 0;
/* Disallow supervisor fetches of user code if cr4.smep */ if (cr4_smep)
smepf = (pfec & PFERR_FETCH_MASK) ? kf : 0;
/* * SMAP:kernel-mode data accesses from user-mode * mappings should fault. A fault is considered * as a SMAP violation if all of the following * conditions are true: * - X86_CR4_SMAP is set in CR4 * - A user page is accessed * - The access is not a fetch * - The access is supervisor mode * - If implicit supervisor access or X86_EFLAGS_AC is clear * * Here, we cover the first four conditions. * The fifth is computed dynamically in permission_fault(); * PFERR_RSVD_MASK bit will be set in PFEC if the access is * *not* subject to SMAP restrictions.
*/ if (cr4_smap)
smapf = (pfec & (PFERR_RSVD_MASK|PFERR_FETCH_MASK)) ? 0 : kf;
}
/* * PKU is an additional mechanism by which the paging controls access to * user-mode addresses based on the value in the PKRU register. Protection * key violations are reported through a bit in the page fault error code. * Unlike other bits of the error code, the PK bit is not known at the * call site of e.g. gva_to_gpa; it must be computed directly in * permission_fault based on two bits of PKRU, on some machine state (CR4, * CR0, EFER, CPL), and on other bits of the error code and the page tables. * * In particular the following conditions come from the error code, the * page tables and the machine state: * - PK is always zero unless CR4.PKE=1 and EFER.LMA=1 * - PK is always zero if RSVD=1 (reserved bit set) or F=1 (instruction fetch) * - PK is always zero if U=0 in the page tables * - PKRU.WD is ignored if CR0.WP=0 and the access is a supervisor access. * * The PKRU bitmask caches the result of these four conditions. The error * code (minus the P bit) and the page table's U bit form an index into the * PKRU bitmask. Two bits of the PKRU bitmask are then extracted and ANDed * with the two bits of the PKRU register corresponding to the protection key. * For the first three conditions above the bits will be 00, thus masking * away both AD and WD. For all reads or if the last condition holds, WD * only will be masked away.
*/ staticvoid update_pkru_bitmask(struct kvm_mmu *mmu)
{ unsigned bit; bool wp;
mmu->pkru_mask = 0;
if (!is_cr4_pke(mmu)) return;
wp = is_cr0_wp(mmu);
for (bit = 0; bit < ARRAY_SIZE(mmu->permissions); ++bit) { unsigned pfec, pkey_bits; bool check_pkey, check_write, ff, uf, wf, pte_user;
/* PFEC.RSVD is replaced by ACC_USER_MASK. */
pte_user = pfec & PFERR_RSVD_MASK;
/* * Only need to check the access which is not an * instruction fetch and is to a user page.
*/
check_pkey = (!ff && pte_user); /* * write access is controlled by PKRU if it is a * user access or CR0.WP = 1.
*/
check_write = check_pkey && wf && (uf || wp);
staticunion kvm_mmu_page_role
kvm_calc_tdp_mmu_root_page_role(struct kvm_vcpu *vcpu, union kvm_cpu_role cpu_role)
{ union kvm_mmu_page_role role = {0};
staticvoid kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, union kvm_cpu_role cpu_role)
{ struct kvm_mmu *context = &vcpu->arch.root_mmu; union kvm_mmu_page_role root_role;
root_role = cpu_role.base;
/* KVM uses PAE paging whenever the guest isn't using 64-bit paging. */
root_role.level = max_t(u32, root_role.level, PT32E_ROOT_LEVEL);
/* * KVM forces EFER.NX=1 when TDP is disabled, reflect it in the MMU role. * KVM uses NX when TDP is disabled to handle a variety of scenarios, * notably for huge SPTEs if iTLB multi-hit mitigation is enabled and * to generate correct permissions for CR0.WP=0/CR4.SMEP=1/EFER.NX=0. * The iTLB multi-hit workaround can be toggled at any time, so assume * NX can be used by any non-nested shadow MMU to avoid having to reset * MMU contexts.
*/
root_role.efer_nx = true;
staticunion kvm_cpu_role
kvm_calc_shadow_ept_root_page_role(struct kvm_vcpu *vcpu, bool accessed_dirty, bool execonly, u8 level)
{ union kvm_cpu_role role = {0};
/* * KVM does not support SMM transfer monitors, and consequently does not * support the "entry to SMM" control either. role.base.smm is always 0.
*/
WARN_ON_ONCE(is_smm(vcpu));
role.base.level = level;
role.base.has_4_byte_gpte = false;
role.base.direct = false;
role.base.ad_disabled = !accessed_dirty;
role.base.guest_mode = true;
role.base.access = ACC_ALL;
if (new_mode.as_u64 != context->cpu_role.as_u64) { /* EPT, and thus nested EPT, does not consume CR0, CR4, nor EFER. */
context->cpu_role.as_u64 = new_mode.as_u64;
context->root_role.word = new_mode.base.word;
/* * L2 page tables are never shadowed, so there is no need to sync * SPTEs.
*/
g_context->sync_spte = NULL;
/* * Note that arch.mmu->gva_to_gpa translates l2_gpa to l1_gpa using * L1's nested page tables (e.g. EPT12). The nested translation * of l2_gva to l1_gpa is done by arch.nested_mmu.gva_to_gpa using * L2's page tables as the first level of translation and L1's * nested page tables as the second level of translation. Basically * the gva_to_gpa functions between mmu and nested_mmu are swapped.
*/ if (!is_paging(vcpu))
g_context->gva_to_gpa = nonpaging_gva_to_gpa; elseif (is_long_mode(vcpu))
g_context->gva_to_gpa = paging64_gva_to_gpa; elseif (is_pae(vcpu))
g_context->gva_to_gpa = paging64_gva_to_gpa; else
g_context->gva_to_gpa = paging32_gva_to_gpa;
void kvm_mmu_after_set_cpuid(struct kvm_vcpu *vcpu)
{
	/*
	 * CPUID information feeds into the reserved bit calculations, so mark
	 * the roles of every MMU as invalid to force them to be reinitialized.
	 *
	 * Correctly handling multiple vCPU models (with respect to paging and
	 * physical address properties) in a single VM would require tracking
	 * all relevant CPUID information in kvm_mmu_page_role.  That is very
	 * undesirable as it would increase the memory requirements for
	 * gfn_write_track (see struct kvm_mmu_page_role comments).  For now
	 * that problem is swept under the rug; KVM's CPUID API is horrific and
	 * it's all but impossible to solve it without introducing a new API.
	 */
	vcpu->arch.root_mmu.root_role.invalid = 1;
	vcpu->arch.root_mmu.cpu_role.ext.valid = 0;

	vcpu->arch.guest_mmu.root_role.invalid = 1;
	vcpu->arch.guest_mmu.cpu_role.ext.valid = 0;

	vcpu->arch.nested_mmu.root_role.invalid = 1;
	vcpu->arch.nested_mmu.cpu_role.ext.valid = 0;

	kvm_mmu_reset_context(vcpu);

	/*
	 * Guest CPUID must not change once the vCPU has run, see the comment
	 * in kvm_arch_vcpu_ioctl().
	 */
	KVM_BUG_ON(kvm_vcpu_has_run(vcpu), vcpu->kvm);
}
r = mmu_topup_memory_caches(vcpu, !vcpu->arch.mmu->root_role.direct); if (r) goto out;
r = mmu_alloc_special_roots(vcpu); if (r) goto out; if (vcpu->arch.mmu->root_role.direct)
r = mmu_alloc_direct_roots(vcpu); else
r = mmu_alloc_shadow_roots(vcpu); if (r) goto out;
kvm_mmu_sync_roots(vcpu);
kvm_mmu_load_pgd(vcpu);
/* * Flush any TLB entries for the new root, the provenance of the root * is unknown. Even if KVM ensures there are no stale TLB entries * for a freed root, in theory another hypervisor could have left * stale entries. Flushing on alloc also allows KVM to skip the TLB * flush when freeing a root (see kvm_tdp_mmu_put_root()).
*/
kvm_x86_call(flush_tlb_current)(vcpu);
out: return r;
}
EXPORT_SYMBOL_GPL(kvm_mmu_load);
/* * When freeing obsolete roots, treat roots as obsolete if they don't * have an associated shadow page, as it's impossible to determine if * such roots are fresh or stale. This does mean KVM will get false * positives and free roots that don't strictly need to be freed, but * such false positives are relatively rare: * * (a) only PAE paging and nested NPT have roots without shadow pages * (or any shadow paging flavor with a dummy root, see note below) * (b) remote reloads due to a memslot update obsoletes _all_ roots * (c) KVM doesn't track previous roots for PAE paging, and the guest * is unlikely to zap an in-use PGD. * * Note! Dummy roots are unique in that they are obsoleted by memslot * _creation_! See also FNAME(fetch).
*/
sp = root_to_sp(root_hpa); return !sp || is_obsolete_sp(kvm, sp);
}
/*
 * Read back the guest PTE covered by a write of *@bytes bytes at *@gpa.
 *
 * *@gpa and *@bytes may be widened in place to cover a full 8-byte entry.
 * Returns the value read, or 0 if the (possibly adjusted) size is neither
 * 4 nor 8 bytes or if the read fails.
 */
static u64 mmu_pte_write_fetch_gpte(struct kvm_vcpu *vcpu, gpa_t *gpa, int *bytes)
{
	u64 gpte = 0;

	/*
	 * The written page table is assumed to be of the same type as the
	 * vCPU's current paging mode, as sptes are only updated when the
	 * modes match.
	 */
	if (is_pae(vcpu) && *bytes == 4) {
		/* Handle a 32-bit guest writing two halves of a 64-bit gpte */
		*gpa &= ~(gpa_t)7;
		*bytes = 8;
	}

	/* Only whole 4-byte or 8-byte entries are fetched. */
	if (*bytes != 4 && *bytes != 8)
		return gpte;

	/* A failed read is reported as an all-zero entry. */
	if (kvm_vcpu_read_guest_atomic(vcpu, *gpa, &gpte, *bytes))
		gpte = 0;

	return gpte;
}
/* * If we're seeing too many writes to a page, it may no longer be a page table, * or we may be forking, in which case it is better to unmap the page.
*/ staticbool detect_write_flooding(struct kvm_mmu_page *sp)
{ /* * Skip write-flooding detected for the sp whose level is 1, because * it can become unsync, then the guest page is not write-protected.
*/ if (sp->role.level == PG_LEVEL_4K) returnfalse;
/* * Misaligned accesses are too much trouble to fix up; also, they usually * indicate a page is not used as a page table.
*/ staticbool detect_write_misaligned(struct kvm_mmu_page *sp, gpa_t gpa, int bytes)
{ unsigned offset, pte_size, misaligned;
/* * Sometimes, the OS only writes the last one bytes to update status * bits, for example, in linux, andb instruction is used in clear_bit().
*/ if (!(offset & (pte_size - 1)) && bytes == 1) returnfalse;
/* * When emulating guest writes, ensure the written value is visible to * any task that is handling page faults before checking whether or not * KVM is shadowing a guest PTE. This ensures either KVM will create * the correct SPTE in the page fault handler, or this task will see * a non-zero indirect_shadow_pages. Pairs with the smp_mb() in * account_shadowed().
*/
smp_mb(); if (!vcpu->kvm->arch.indirect_shadow_pages) return;
/*
 * Decide how to proceed after a page fault hit a write-protected gfn.
 *
 * Returns RET_PF_RETRY if the gfn was unprotected and the guest should simply
 * be resumed, RET_PF_EMULATE otherwise.  When emulation is chosen for a vCPU
 * that is not in guest mode, EMULTYPE_ALLOW_RETRY_PF is OR'd into
 * *@emulation_type.
 */
static int kvm_mmu_write_protect_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
				       u64 error_code, int *emulation_type)
{
	bool direct = vcpu->arch.mmu->root_role.direct;

	/*
	 * Do not try to unprotect and retry if the vCPU re-faulted on the same
	 * RIP with the same address that was previously unprotected, as doing
	 * so will likely put the vCPU into an infinite loop.  E.g. if the vCPU
	 * uses a non-page-table modifying instruction on the PDE that points to
	 * the instruction, then unprotecting the gfn will unmap the
	 * instruction's code, i.e. make it impossible for the instruction to
	 * ever complete.
	 */
	if (vcpu->arch.last_retry_eip == kvm_rip_read(vcpu) &&
	    vcpu->arch.last_retry_addr == cr2_or_gpa)
		return RET_PF_EMULATE;

	/*
	 * Reset the unprotect+retry values that guard against infinite loops.
	 * The values will be refreshed if KVM explicitly unprotects a gfn and
	 * retries, in all other cases it's safe to retry in the future even if
	 * the next page fault happens on the same RIP+address.
	 */
	vcpu->arch.last_retry_eip = 0;
	vcpu->arch.last_retry_addr = 0;

	/*
	 * It should be impossible to reach this point with an MMIO cache hit,
	 * as RET_PF_WRITE_PROTECTED is returned if and only if there's a valid,
	 * writable memslot, and creating a memslot should invalidate the MMIO
	 * cache by way of changing the memslot generation.  WARN and disallow
	 * retry if MMIO is detected, as retrying MMIO emulation is pointless
	 * and could put the vCPU into an infinite loop because the processor
	 * will keep faulting on the non-existent MMIO address.
	 */
	if (WARN_ON_ONCE(mmio_info_in_cache(vcpu, cr2_or_gpa, direct)))
		return RET_PF_EMULATE;

	/*
	 * Before emulating the instruction, check to see if the access was due
	 * to a read-only violation while the CPU was walking non-nested NPT
	 * page tables, i.e. for a direct MMU, for _guest_ page tables in L1.
	 * If L1 is sharing (a subset of) its page tables with L2, e.g. by
	 * having nCR3 share lower level page tables with hCR3, then when KVM
	 * (L0) write-protects the nested NPTs, i.e. npt12 entries, KVM is also
	 * unknowingly write-protecting L1's guest page tables, which KVM isn't
	 * shadowing.
	 *
	 * Because the CPU (by default) walks NPT page tables using a write
	 * access (to ensure the CPU can do A/D updates), page walks in L1 can
	 * trigger write faults for the above case even when L1 isn't modifying
	 * PTEs.  As a result, KVM will unnecessarily emulate (or at least, try
	 * to emulate) an excessive number of L1 instructions; because L1's MMU
	 * isn't shadowed by KVM, there is no need to write-protect L1's gPTEs
	 * and thus no need to emulate in order to guarantee forward progress.
	 *
	 * Try to unprotect the gfn, i.e. zap any shadow pages, so that L1 can
	 * proceed without triggering emulation.  If one or more shadow pages
	 * was zapped, skip emulation and resume L1 to let it natively execute
	 * the instruction.  If no shadow pages were zapped, then the write-
	 * fault is due to something else entirely, i.e. KVM needs to emulate,
	 * as resuming the guest will put it into an infinite loop.
	 *
	 * Note, this code also applies to Intel CPUs, even though it is *very*
	 * unlikely that an L1 will share its page tables (IA32/PAE/paging64
	 * format) with L2's page tables (EPT format).
	 *
	 * For indirect MMUs, i.e. if KVM is shadowing the current MMU, try to
	 * unprotect the gfn and retry if an event is awaiting reinjection.  If
	 * KVM emulates multiple instructions before completing event injection,
	 * the event could be delayed beyond what is architecturally allowed,
	 * e.g. KVM could inject an IRQ after the TPR has been raised.
	 */
	if (((direct && is_write_to_guest_page_table(error_code)) ||
	     (!direct && kvm_event_needs_reinjection(vcpu))) &&
	    kvm_mmu_unprotect_gfn_and_retry(vcpu, cr2_or_gpa))
		return RET_PF_RETRY;

	/*
	 * The gfn is write-protected, but if KVM detects it's emulating an
	 * instruction that is unlikely to be used to modify page tables, or if
	 * emulation fails, KVM can try to unprotect the gfn and let the CPU
	 * re-execute the instruction that caused the page fault.  Do not allow
	 * retrying an instruction from a nested guest as KVM is only explicitly
	 * shadowing L1's page tables, i.e. unprotecting something for L1 isn't
	 * going to magically fix whatever issue caused L2 to fail.
	 */
	if (!is_guest_mode(vcpu))
		*emulation_type |= EMULTYPE_ALLOW_RETRY_PF;

	return RET_PF_EMULATE;
}
int noinline kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code, void *insn, int insn_len)
{ int r, emulation_type = EMULTYPE_PF; bool direct = vcpu->arch.mmu->root_role.direct;
if (WARN_ON_ONCE(!VALID_PAGE(vcpu->arch.mmu->root.hpa))) return RET_PF_RETRY;
/* * Except for reserved faults (emulated MMIO is shared-only), set the * PFERR_PRIVATE_ACCESS flag for software-protected VMs based on the gfn's * current attributes, which are the source of truth for such VMs. Note, * this is wrong for nested MMUs as the GPA is an L2 GPA, but KVM doesn't * currently support nested virtualization (among many other things) * for software-protected VMs.
*/ if (IS_ENABLED(CONFIG_KVM_SW_PROTECTED_VM) &&
!(error_code & PFERR_RSVD_MASK) &&
vcpu->kvm->arch.vm_type == KVM_X86_SW_PROTECTED_VM &&
kvm_mem_is_private(vcpu->kvm, gpa_to_gfn(cr2_or_gpa)))
error_code |= PFERR_PRIVATE_ACCESS;
r = RET_PF_INVALID; if (unlikely(error_code & PFERR_RSVD_MASK)) { if (WARN_ON_ONCE(error_code & PFERR_PRIVATE_ACCESS)) return -EFAULT;
r = handle_mmio_page_fault(vcpu, cr2_or_gpa, direct); if (r == RET_PF_EMULATE) goto emulate;
}
if (r == RET_PF_INVALID) {
vcpu->stat.pf_taken++;
r = kvm_mmu_do_page_fault(vcpu, cr2_or_gpa, error_code, false,
&emulation_type, NULL); if (KVM_BUG_ON(r == RET_PF_INVALID, vcpu->kvm)) return -EIO;
}
if (r < 0) return r;
if (r == RET_PF_WRITE_PROTECTED)
r = kvm_mmu_write_protect_fault(vcpu, cr2_or_gpa, error_code,
&emulation_type);
/* * None of handle_mmio_page_fault(), kvm_mmu_do_page_fault(), or * kvm_mmu_write_protect_fault() return RET_PF_CONTINUE. * kvm_mmu_do_page_fault() only uses RET_PF_CONTINUE internally to * indicate continuing the page fault handling until the final * page table mapping phase.
*/
WARN_ON_ONCE(r == RET_PF_CONTINUE); if (r != RET_PF_EMULATE) return r;
/* * Walking and synchronizing SPTEs both assume they are operating in * the context of the current MMU, and would need to be reworked if * this is ever used to sync the guest_mmu, e.g. to emulate INVEPT.
*/ if (WARN_ON_ONCE(mmu != vcpu->arch.mmu)) return;
/* It's actually a GPA for vcpu->arch.guest_mmu. */ if (mmu != &vcpu->arch.guest_mmu) { /* INVLPG on a non-canonical address is a NOP according to the SDM. */ if (is_noncanonical_invlpg_address(addr, vcpu)) return;
kvm_x86_call(flush_tlb_gva)(vcpu, addr);
}
if (!mmu->sync_spte) return;
if (roots & KVM_MMU_ROOT_CURRENT)
__kvm_mmu_invalidate_addr(vcpu, mmu, addr, mmu->root.hpa);
for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) { if (roots & KVM_MMU_ROOT_PREVIOUS(i))
__kvm_mmu_invalidate_addr(vcpu, mmu, addr, mmu->prev_roots[i].hpa);
}
}
EXPORT_SYMBOL_GPL(kvm_mmu_invalidate_addr);
/*
 * Handle an INVLPG of @gva on behalf of @vcpu: invalidate the address in all
 * roots of the vCPU's walk_mmu and account the event in the invlpg statistic.
 */
void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
{
	/*
	 * INVLPG must drop global mappings for the VA regardless of PCID.
	 * Rather than figuring out which previous roots actually hold a
	 * global mapping (roughly as costly as syncing them), simply sync
	 * every root.
	 *
	 * Mappings that are unreachable from the current or the cached
	 * previous roots get synced when the guest switches to that cr3, so
	 * they need no handling here.
	 */
	kvm_mmu_invalidate_addr(vcpu, vcpu->arch.walk_mmu, gva,
				KVM_MMU_ROOTS_ALL);

	vcpu->stat.invlpg++;
}
EXPORT_SYMBOL_GPL(kvm_mmu_invlpg);
if (pcid == kvm_get_active_pcid(vcpu))
roots |= KVM_MMU_ROOT_CURRENT;
for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) { if (VALID_PAGE(mmu->prev_roots[i].hpa) &&
pcid == kvm_get_pcid(vcpu, mmu->prev_roots[i].pgd))
roots |= KVM_MMU_ROOT_PREVIOUS(i);
}
if (roots)
kvm_mmu_invalidate_addr(vcpu, mmu, gva, roots);
++vcpu->stat.invlpg;
/* * Mappings not reachable via the current cr3 or the prev_roots will be * synced when switching to that cr3, so nothing needs to be done here * for them.
*/
}
void kvm_configure_mmu(bool enable_tdp, int tdp_forced_root_level, int tdp_max_root_level, int tdp_huge_page_level)
{
tdp_enabled = enable_tdp;
tdp_root_level = tdp_forced_root_level;
max_tdp_level = tdp_max_root_level;
#ifdef CONFIG_X86_64
tdp_mmu_enabled = tdp_mmu_allowed && tdp_enabled; #endif /* * max_huge_page_level reflects KVM's MMU capabilities irrespective * of kernel support, e.g. KVM may be capable of using 1GB pages when * the kernel is not. But, KVM never creates a page size greater than * what is used by the kernel for any given HVA, i.e. the kernel's * capabilities are ultimately consulted by kvm_mmu_hugepage_adjust().
*/ if (tdp_enabled)
max_huge_page_level = tdp_huge_page_level; elseif (boot_cpu_has(X86_FEATURE_GBPAGES))
max_huge_page_level = PG_LEVEL_1G; else
max_huge_page_level = PG_LEVEL_2M;
}
EXPORT_SYMBOL_GPL(kvm_configure_mmu);
mmu->root.hpa = INVALID_PAGE;
mmu->root.pgd = 0;
mmu->mirror_root_hpa = INVALID_PAGE; for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
mmu->prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID;
/* vcpu->arch.guest_mmu isn't used when !tdp_enabled. */ if (!tdp_enabled && mmu == &vcpu->arch.guest_mmu) return 0;
/* * When using PAE paging, the four PDPTEs are treated as 'root' pages, * while the PDP table is a per-vCPU construct that's allocated at MMU * creation. When emulating 32-bit mode, cr3 is only 32 bits even on * x86_64. Therefore we need to allocate the PDP table in the first * 4GB of memory, which happens to fit the DMA32 zone. TDP paging * generally doesn't use PAE paging and can skip allocating the PDP * table. The main exception, handled here, is SVM's 32-bit NPT. The * other exception is for shadowing L1's 32-bit or PAE NPT on 64-bit * KVM; that horror is handled on-demand by mmu_alloc_special_roots().
*/ if (tdp_enabled && kvm_mmu_get_tdp_level(vcpu) > PT32E_ROOT_LEVEL) return 0;
page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_DMA32); if (!page) return -ENOMEM;
mmu->pae_root = page_address(page);
/* * CR3 is only 32 bits when PAE paging is used, thus it's impossible to * get the CPU to treat the PDPTEs as encrypted. Decrypt the page so * that KVM's writes and the CPU's reads get along. Note, this is * only necessary when using shadow paging, as 64-bit NPT can get at * the C-bit even when shadowing 32-bit NPT, and SME isn't supported * by 32-bit kernels (when KVM itself uses 32-bit NPT).
*/ if (!tdp_enabled)
set_memory_decrypted((unsignedlong)mmu->pae_root, 1); else
WARN_ON_ONCE(shadow_me_value);
for (i = 0; i < 4; ++i)
mmu->pae_root[i] = INVALID_PAE_ROOT;
return 0;
}
int kvm_mmu_create(struct kvm_vcpu *vcpu)
{ int ret;
restart:
list_for_each_entry_safe_reverse(sp, node,
&kvm->arch.active_mmu_pages, link) { /* * No obsolete valid page exists before a newly created page * since active_mmu_pages is a FIFO list.
*/ if (!is_obsolete_sp(kvm, sp)) break;
/* * Invalid pages should never land back on the list of active * pages. Skip the bogus page, otherwise we'll get stuck in an * infinite loop if the page gets put back on the list (again).
*/ if (WARN_ON_ONCE(sp->role.invalid)) continue;
/* * No need to flush the TLB since we're only zapping shadow * pages with an obsolete generation number and all vCPUS have * loaded a new root, i.e. the shadow pages being zapped cannot * be in active use by the guest.
*/ if (batch >= BATCH_ZAP_PAGES &&
cond_resched_rwlock_write(&kvm->mmu_lock)) {
batch = 0; goto restart;
}
/* * Kick all vCPUs (via remote TLB flush) before freeing the page tables * to ensure KVM is not in the middle of a lockless shadow page table * walk, which may reference the pages. The remote TLB flush itself is * not required and is simply a convenient way to kick vCPUs as needed. * KVM performs a local TLB flush when allocating a new root (see * kvm_mmu_load()), and the reload in the caller ensures no vCPUs are * running with an obsolete MMU.
*/
kvm_mmu_commit_zap_page(kvm, &invalid_list);
}
/*
 * Fast invalidate all shadow pages and use lock-break technique
 * to zap obsolete pages.
 *
 * It's required when memslot is being deleted or VM is being
 * destroyed, in these cases, we should ensure that KVM MMU does
 * not use any resource of the being-deleted slot or all slots
 * after calling the function.
 *
 * The caller must hold kvm->slots_lock.  mmu_lock is acquired for write
 * here and dropped again before the invalidated TDP MMU roots are zapped.
 */
static void kvm_mmu_zap_all_fast(struct kvm *kvm)
{
	lockdep_assert_held(&kvm->slots_lock);

	/*
	 * The generation toggle, the TDP MMU root invalidation and the reload
	 * request below must all happen inside a single mmu_lock write-side
	 * critical section, which is closed by the write_unlock() after
	 * kvm_zap_obsolete_pages().
	 */
	write_lock(&kvm->mmu_lock);

	/*
	 * Toggle mmu_valid_gen between '0' and '1'.  Because slots_lock is
	 * held for the entire duration of zapping obsolete pages, it's
	 * impossible for there to be multiple invalid generations associated
	 * with *valid* shadow pages at any given time, i.e. there is exactly
	 * one valid generation and (at most) one invalid generation.
	 */
	kvm->arch.mmu_valid_gen = kvm->arch.mmu_valid_gen ? 0 : 1;

	/*
	 * In order to ensure all vCPUs drop their soon-to-be invalid roots,
	 * invalidating TDP MMU roots must be done while holding mmu_lock for
	 * write and in the same critical section as making the reload request,
	 * e.g. before kvm_zap_obsolete_pages() could drop mmu_lock and yield.
	 */
	if (tdp_mmu_enabled) {
		/*
		 * External page tables don't support fast zapping, therefore
		 * their mirrors must be invalidated separately by the caller.
		 */
		kvm_tdp_mmu_invalidate_roots(kvm, KVM_DIRECT_ROOTS);
	}

	/*
	 * Notify all vcpus to reload its shadow page table and flush TLB.
	 * Then all vcpus will switch to new shadow page table with the new
	 * mmu_valid_gen.
	 *
	 * Note: we need to do this under the protection of mmu_lock,
	 * otherwise, vcpu would purge shadow page but miss tlb flush.
	 */
	kvm_make_all_cpus_request(kvm, KVM_REQ_MMU_FREE_OBSOLETE_ROOTS);

	kvm_zap_obsolete_pages(kvm);

	write_unlock(&kvm->mmu_lock);

	/*
	 * Zap the invalidated TDP MMU roots, all SPTEs must be dropped before
	 * returning to the caller, e.g. if the zap is in response to a memslot
	 * deletion, mmu_notifier callbacks will be unable to reach the SPTEs
	 * associated with the deleted memslot once the update completes, and
	 * deferring the zap until the final reference to the root is put would
	 * lead to use-after-free.
	 */
	if (tdp_mmu_enabled)
		kvm_tdp_mmu_zap_invalidated_roots(kvm, true);
}
/* * Invalidate (zap) SPTEs that cover GFNs from gfn_start and up to gfn_end * (not including it)
*/ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
{ bool flush;
/*
 * Report whether huge page splitting has to pause.
 *
 * Returns true if a reschedule is pending, mmu_lock is contended, or any of
 * the three split caches needs to be topped up; false otherwise.
 */
static bool need_topup_split_caches_or_resched(struct kvm *kvm)
{
	if (need_resched() || rwlock_needbreak(&kvm->mmu_lock))
		return true;

	/*
	 * In the worst case, SPLIT_DESC_CACHE_MIN_NR_OBJECTS descriptors are needed
	 * to split a single huge page. Calculating how many are actually needed
	 * is possible but not worth the complexity.
	 */
	return need_topup(&kvm->arch.split_desc_cache, SPLIT_DESC_CACHE_MIN_NR_OBJECTS) ||
	       need_topup(&kvm->arch.split_page_header_cache, 1) ||
	       need_topup(&kvm->arch.split_shadow_page_cache, 1);
}
staticint topup_split_caches(struct kvm *kvm)
{ /* * Allocating rmap list entries when splitting huge pages for nested * MMUs is uncommon as KVM needs to use a list if and only if there is * more than one rmap entry for a gfn, i.e. requires an L1 gfn to be * aliased by multiple L2 gfns and/or from multiple nested roots with * different roles. Aliasing gfns when using TDP is atypical for VMMs; * a few gfns are often aliased during boot, e.g. when remapping BIOS, * but aliasing rarely occurs post-boot or for many gfns. If there is * only one rmap entry, rmap->val points directly at that one entry and * doesn't need to allocate a list. Buffer the cache by the default * capacity so that KVM doesn't have to drop mmu_lock to topup if KVM * encounters an aliased gfn or two.
*/ constint capacity = SPLIT_DESC_CACHE_MIN_NR_OBJECTS +
KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE; int r;
lockdep_assert_held(&kvm->slots_lock);
r = __kvm_mmu_topup_memory_cache(&kvm->arch.split_desc_cache, capacity,
SPLIT_DESC_CACHE_MIN_NR_OBJECTS); if (r) return r;
r = kvm_mmu_topup_memory_cache(&kvm->arch.split_page_header_cache, 1); if (r) return r;
/* * Note, huge page splitting always uses direct shadow pages, regardless * of whether the huge page itself is mapped by a direct or indirect * shadow page, since the huge page region itself is being directly * mapped with smaller pages.
*/
role = kvm_mmu_child_role(huge_sptep, /*direct=*/true, access);
/* Direct SPs do not require a shadowed_info_cache. */
caches.page_header_cache = &kvm->arch.split_page_header_cache;
caches.shadow_page_cache = &kvm->arch.split_shadow_page_cache;
/* Safe to pass NULL for vCPU since requesting a direct SP. */ return __kvm_mmu_get_shadow_page(kvm, NULL, &caches, gfn, role);
}
for (index = 0; index < SPTE_ENT_PER_PAGE; index++) {
sptep = &sp->spt[index];
gfn = kvm_mmu_page_get_gfn(sp, index);
/* * The SP may already have populated SPTEs, e.g. if this huge * page is aliased by multiple sptes with the same access * permissions. These entries are guaranteed to map the same * gfn-to-pfn translation since the SP is direct, so no need to * modify them. * * However, if a given SPTE points to a lower level page table, * that lower level page table may only be partially populated. * Installing such SPTEs would effectively unmap a portion of the * huge page. Unmapping guest memory always requires a TLB flush * since a subsequent operation on the unmapped regions would * fail to detect the need to flush.
*/ if (is_shadow_present_pte(*sptep)) {
flush |= !is_last_spte(*sptep, sp->role.level); continue;
}
/* Grab information for the tracepoint before dropping the MMU lock. */
gfn = kvm_mmu_page_get_gfn(huge_sp, spte_index(huge_sptep));
level = huge_sp->role.level;
spte = *huge_sptep;
if (kvm_mmu_available_pages(kvm) <= KVM_MIN_FREE_MMU_PAGES) {
r = -ENOSPC; goto out;
}
if (need_topup_split_caches_or_resched(kvm)) {
write_unlock(&kvm->mmu_lock);
cond_resched(); /* * If the topup succeeds, return -EAGAIN to indicate that the * rmap iterator should be restarted because the MMU lock was * dropped.
*/
r = topup_split_caches(kvm) ?: -EAGAIN;
write_lock(&kvm->mmu_lock); goto out;
}
/* TDP MMU is enabled, so rmap only contains nested MMU SPs. */ if (WARN_ON_ONCE(!sp->role.guest_mode)) continue;
/* The rmaps should never contain non-leaf SPTEs. */ if (WARN_ON_ONCE(!is_large_pte(*huge_sptep))) continue;
/* SPs with level >PG_LEVEL_4K should never by unsync. */ if (WARN_ON_ONCE(sp->unsync)) continue;
/* Don't bother splitting huge pages on invalid SPs. */ if (sp->role.invalid) continue;
r = shadow_mmu_try_split_huge_page(kvm, slot, huge_sptep);
/* * The split succeeded or needs to be retried because the MMU * lock was dropped. Either way, restart the iterator to get it * back into a consistent state.
*/ if (!r || r == -EAGAIN) goto restart;
/* The split failed and shouldn't be retried (e.g. -ENOMEM). */ break;
}
returnfalse;
}
/*
 * Walk @slot's rmaps over the gfn range [@start, @end) once per huge page
 * level above @target_level, invoking shadow_mmu_try_split_huge_pages() on
 * each level in turn.
 */
static void kvm_shadow_mmu_try_split_huge_pages(struct kvm *kvm,
						const struct kvm_memory_slot *slot,
						gfn_t start, gfn_t end,
						int target_level)
{
	int level;

	/*
	 * Split huge pages starting with KVM_MAX_HUGEPAGE_LEVEL and working
	 * down to the target level. This ensures pages are recursively split
	 * all the way to the target level. There's no need to split pages
	 * already at the target level.
	 */
	for (level = KVM_MAX_HUGEPAGE_LEVEL; level > target_level; level--)
		/* @end is exclusive, the walker is handed the last gfn. */
		__walk_slot_rmaps(kvm, slot, shadow_mmu_try_split_huge_pages,
				  level, level, start, end - 1, true, true, false);
}
/*
 * Try to split huge pages mapping [@start, @end) of @memslot down to
 * @target_level.  Does nothing unless the TDP MMU is enabled; the shadow MMU
 * rmaps are processed only if the VM's memslots have rmaps.
 *
 * Must be called with the mmu_lock held in write-mode.
 */
void kvm_mmu_try_split_huge_pages(struct kvm *kvm,
				  const struct kvm_memory_slot *memslot,
				  u64 start, u64 end,
				  int target_level)
{
	if (!tdp_mmu_enabled)
		return;

	if (kvm_memslots_have_rmaps(kvm))
		kvm_shadow_mmu_try_split_huge_pages(kvm, memslot, start, end, target_level);

	/*
	 * No TLB flush is necessary here. KVM will flush TLBs after
	 * write-protecting and/or clearing dirty on the newly split SPTEs to
	 * ensure that guest writes are reflected in the dirty log before the
	 * ioctl to enable dirty logging on this memslot completes. Since the
	 * split SPTEs retain the write and dirty bits of the huge SPTE, it is
	 * safe for KVM to decide if a TLB flush is necessary based on the split
	 * SPTEs.
	 */
}
/* * We cannot do huge page mapping for indirect shadow pages, * which are found on the last rmap (level = 1) when not using * tdp; such shadow pages are synced with the page table in * the guest, and the guest page table is using 4K page size * mapping if the indirect sp has level = 1.
*/ if (sp->role.direct &&
sp->role.level < kvm_mmu_max_mapping_level(kvm, slot, sp->gfn)) {
kvm_zap_one_rmap_spte(kvm, rmap_head, sptep);
if (kvm_available_flush_remote_tlbs_range())
kvm_flush_remote_tlbs_sptep(kvm, sptep); else
need_tlb_flush = 1;
staticvoid kvm_rmap_zap_collapsible_sptes(struct kvm *kvm, conststruct kvm_memory_slot *slot)
{ /* * Note, use KVM_MAX_HUGEPAGE_LEVEL - 1 since there's no need to zap * pages that are already mapped at the maximum hugepage level.
*/ if (walk_slot_rmaps(kvm, slot, kvm_mmu_zap_collapsible_spte,
PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL - 1, true))
kvm_flush_remote_tlbs_memslot(kvm, slot);
}
if (tdp_mmu_enabled) {
read_lock(&kvm->mmu_lock);
kvm_tdp_mmu_recover_huge_pages(kvm, slot);
read_unlock(&kvm->mmu_lock);
}
}
void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm, conststruct kvm_memory_slot *memslot)
{ if (kvm_memslots_have_rmaps(kvm)) {
write_lock(&kvm->mmu_lock); /* * Clear dirty bits only on 4k SPTEs since the legacy MMU only * support dirty logging at a 4k granularity.
*/
walk_slot_rmaps_4k(kvm, memslot, __rmap_clear_dirty, false);
write_unlock(&kvm->mmu_lock);
}
if (tdp_mmu_enabled) {
read_lock(&kvm->mmu_lock);
kvm_tdp_mmu_clear_dirty_slot(kvm, memslot);
read_unlock(&kvm->mmu_lock);
}
/* * The caller will flush the TLBs after this function returns. * * It's also safe to flush TLBs out of mmu lock here as currently this * function is only used for dirty logging, in which case flushing TLB * out of mmu lock also guarantees no dirty pages will be lost in * dirty_bitmap.
*/
}
if (list_empty(&kvm->arch.active_mmu_pages)) goto out_flush;
/* * Since accounting information is stored in struct kvm_arch_memory_slot, * all MMU pages that are shadowing guest PTEs must be zapped before the * memslot is deleted, as freeing such pages after the memslot is freed * will result in use-after-free, e.g. in unaccount_shadowed().
*/ for (i = 0; i < slot->npages; i++) { struct kvm_mmu_page *sp;
gfn_t gfn = slot->base_gfn + i;
/* * Generation numbers are incremented in multiples of the number of * address spaces in order to provide unique generations across all * address spaces. Strip what is effectively the address space * modifier prior to checking for a wrap of the MMIO generation so * that a wrap in any address space is detected.
*/
gen &= ~((u64)kvm_arch_nr_memslot_as_ids(kvm) - 1);
/* * The very rare case: if the MMIO generation number has wrapped, * zap all shadow pages.
*/ if (unlikely(gen == 0)) {
kvm_debug_ratelimited("zapping shadow pages for mmio generation wraparound\n");
kvm_mmu_zap_all_fast(kvm);
}
}
staticvoid kvm_wake_nx_recovery_thread(struct kvm *kvm)
{ /* * The NX recovery thread is spawned on-demand at the first KVM_RUN and * may not be valid even though the VM is globally visible. Do nothing, * as such a VM can't have any possible NX huge pages.
*/ struct vhost_task *nx_thread = READ_ONCE(kvm->arch.nx_huge_page_recovery_thread);
/*
 * Resolve the "auto" setting of the NX huge pages mitigation.
 *
 * Returns true when the CPU has the iTLB multihit bug and CPU mitigations
 * have not been turned off; false otherwise.
 */
static bool get_nx_auto_mode(void)
{
	/* Return true when CPU has the bug, and mitigations are ON */
	return boot_cpu_has_bug(X86_BUG_ITLB_MULTIHIT) && !cpu_mitigations_off();
}
/*
 * Module-load time MMU setup for kvm.ko.
 *
 * nx_huge_pages defaults to -1, which is technically undefined behavior for
 * a boolean, so it has to be settled as true/false here.  The init call is
 * also passed on to the SPTE code so that it can resolve/snapshot its own
 * module params.
 */
void __init kvm_mmu_x86_module_init(void)
{
	/* Still at the "auto" default: pick the value based on the CPU. */
	if (nx_huge_pages == -1)
		__set_nx_huge_pages(get_nx_auto_mode());

	/*
	 * Remember whether userspace asked for the TDP MMU.  The decision on
	 * whether it is really enabled is made later, in kvm_configure_mmu(),
	 * once the vendor module is loaded.
	 */
	tdp_mmu_allowed = tdp_mmu_enabled;

	kvm_mmu_spte_module_init();
}
/* * The bulk of the MMU initialization is deferred until the vendor module is * loaded as many of the masks/values may be modified by VMX or SVM, i.e. need * to be reset when a potentially different vendor module is loaded.
*/ int kvm_mmu_vendor_module_init(void)
{ int ret = -ENOMEM;
/* * MMU roles use union aliasing which is, generally speaking, an * undefined behavior. However, we supposedly know how compilers behave * and the current status quo is unlikely to change. Guardians below are * supposed to let us know if the assumption becomes false.
*/
BUILD_BUG_ON(sizeof(union kvm_mmu_page_role) != sizeof(u32));
BUILD_BUG_ON(sizeof(union kvm_mmu_extended_role) != sizeof(u32));
BUILD_BUG_ON(sizeof(union kvm_cpu_role) != sizeof(u64));
kvm_mmu_reset_all_pte_masks();
pte_list_desc_cache = KMEM_CACHE(pte_list_desc, SLAB_ACCOUNT); if (!pte_list_desc_cache) goto out;
/*
 * Calculate the effective recovery period, accounting for '0' meaning "let KVM
 * select a halving time of 1 hour".
 *
 * @period: out parameter, receives the period in milliseconds; written only
 *          when recovery is enabled.
 *
 * Returns true if recovery is enabled, i.e. if both nx_huge_pages and
 * nx_huge_pages_recovery_ratio are non-zero; false otherwise.
 */
static bool calc_nx_huge_pages_recovery_period(uint *period)
{
	/*
	 * Use READ_ONCE to get the params, this may be called outside of the
	 * param setters, e.g. by the kthread to compute its next timeout.
	 */
	bool enabled = READ_ONCE(nx_huge_pages);
	uint ratio = READ_ONCE(nx_huge_pages_recovery_ratio);

	if (!enabled || !ratio)
		return false;

	*period = READ_ONCE(nx_huge_pages_recovery_period_ms);
	if (!*period) {
		/*
		 * Make sure the period is not less than one second: capping
		 * the ratio at 3600 bounds one hour (in ms) / ratio at 1000.
		 */
		ratio = min(ratio, 3600u);
		*period = 60 * 60 * 1000 / ratio;
	}
	return true;
}
/* * Zapping TDP MMU shadow pages, including the remote TLB flush, must * be done under RCU protection, because the pages are freed via RCU * callback.
*/
rcu_read_lock();
ratio = READ_ONCE(nx_huge_pages_recovery_ratio);
to_zap = ratio ? DIV_ROUND_UP(nx_lpage_splits, ratio) : 0; for ( ; to_zap; --to_zap) { if (list_empty(&kvm->arch.possible_nx_huge_pages)) break;
/* * We use a separate list instead of just using active_mmu_pages * because the number of shadow pages that can be replaced with an * NX huge page is expected to be relatively small compared to * the total number of shadow pages. And because the TDP MMU * doesn't use active_mmu_pages.
*/
sp = list_first_entry(&kvm->arch.possible_nx_huge_pages, struct kvm_mmu_page,
possible_nx_huge_page_link);
WARN_ON_ONCE(!sp->nx_huge_page_disallowed);
WARN_ON_ONCE(!sp->role.direct);
/* * Unaccount and do not attempt to recover any NX Huge Pages * that are being dirty tracked, as they would just be faulted * back in as 4KiB pages. The NX Huge Pages in this slot will be * recovered, along with all the other huge pages in the slot, * when dirty logging is disabled. * * Since gfn_to_memslot() is relatively expensive, it helps to * skip it if the test cannot possibly return true. On the * other hand, if any memslot has logging enabled, chances are * good that all of them do, in which case unaccount_nx_huge_page() * is much cheaper than zapping the page. * * If a memslot update is in progress, reading an incorrect value * of kvm->nr_memslots_dirty_logging is not a problem: if it is * becoming zero, gfn_to_memslot() will be done unnecessarily; if * it is becoming nonzero, the page will be zapped unnecessarily. * Either way, this only affects efficiency in racy situations, * and not correctness.
*/
slot = NULL; if (atomic_read(&kvm->nr_memslots_dirty_logging)) { struct kvm_memslots *slots;
enabled = calc_nx_huge_pages_recovery_period(&period); if (!enabled) returnfalse;
remaining_time = kvm->arch.nx_huge_page_last + msecs_to_jiffies(period)
- get_jiffies_64(); if (remaining_time > 0) {
schedule_timeout(remaining_time); /* check for signals and come back */ returntrue;
}
/* * Zap SPTEs even if the slot can't be mapped PRIVATE. KVM x86 only * supports KVM_MEMORY_ATTRIBUTE_PRIVATE, and so it *seems* like KVM * can simply ignore such slots. But if userspace is making memory * PRIVATE, then KVM must prevent the guest from accessing the memory * as shared. And if userspace is making memory SHARED and this point * is reached, then at least one page within the range was previously * PRIVATE, i.e. the slot's possible hugepage ranges are changing. * Zapping SPTEs in this case ensures KVM will reassess whether or not * a hugepage can be used for affected ranges.
*/ if (WARN_ON_ONCE(!kvm_arch_has_private_mem(kvm))) returnfalse;
if (WARN_ON_ONCE(range->end <= range->start)) returnfalse;
/* * If the head and tail pages of the range currently allow a hugepage, * i.e. reside fully in the slot and don't have mixed attributes, then * add each corresponding hugepage range to the ongoing invalidation, * e.g. to prevent KVM from creating a hugepage in response to a fault * for a gfn whose attributes aren't changing. Note, only the range * of gfns whose attributes are being modified needs to be explicitly * unmapped, as that will unmap any existing hugepages.
*/ for (level = PG_LEVEL_2M; level <= KVM_MAX_HUGEPAGE_LEVEL; level++) {
gfn_t start = gfn_round_for_level(range->start, level);
gfn_t end = gfn_round_for_level(range->end - 1, level);
gfn_t nr_pages = KVM_PAGES_PER_HPAGE(level);
/* * Calculate which ranges can be mapped with hugepages even if the slot * can't map memory PRIVATE. KVM mustn't create a SHARED hugepage over * a range that has PRIVATE GFNs, and conversely converting a range to * SHARED may now allow hugepages.
*/ if (WARN_ON_ONCE(!kvm_arch_has_private_mem(kvm))) returnfalse;
/* * The sequence matters here: upper levels consume the result of lower * level's scanning.
*/ for (level = PG_LEVEL_2M; level <= KVM_MAX_HUGEPAGE_LEVEL; level++) {
gfn_t nr_pages = KVM_PAGES_PER_HPAGE(level);
gfn_t gfn = gfn_round_for_level(range->start, level);
/* Process the head page if it straddles the range. */ if (gfn != range->start || gfn + nr_pages > range->end) { /* * Skip mixed tracking if the aligned gfn isn't covered * by the memslot, KVM can't use a hugepage due to the * misaligned address regardless of memory attributes.
*/ if (gfn >= slot->base_gfn &&
gfn + nr_pages <= slot->base_gfn + slot->npages) { if (hugepage_has_attrs(kvm, slot, gfn, level, attrs))
hugepage_clear_mixed(slot, gfn, level); else
hugepage_set_mixed(slot, gfn, level);
}
gfn += nr_pages;
}
/* * Pages entirely covered by the range are guaranteed to have * only the attributes which were just set.
*/ for ( ; gfn + nr_pages <= range->end; gfn += nr_pages)
hugepage_clear_mixed(slot, gfn, level);
/* * Process the last tail page if it straddles the range and is * contained by the memslot. Like the head page, KVM can't * create a hugepage if the slot size is misaligned.
*/ if (gfn < range->end &&
(gfn + nr_pages) <= (slot->base_gfn + slot->npages)) { if (hugepage_has_attrs(kvm, slot, gfn, level, attrs))
hugepage_clear_mixed(slot, gfn, level); else
hugepage_set_mixed(slot, gfn, level);
}
} returnfalse;
}
void kvm_mmu_init_memslot_memory_attributes(struct kvm *kvm, struct kvm_memory_slot *slot)
{ int level;
if (!kvm_arch_has_private_mem(kvm)) return;
for (level = PG_LEVEL_2M; level <= KVM_MAX_HUGEPAGE_LEVEL; level++) { /* * Don't bother tracking mixed attributes for pages that can't * be huge due to alignment, i.e. process only pages that are * entirely contained by the memslot.
*/
gfn_t end = gfn_round_for_level(slot->base_gfn + slot->npages, level);
gfn_t start = gfn_round_for_level(slot->base_gfn, level);
gfn_t nr_pages = KVM_PAGES_PER_HPAGE(level);
gfn_t gfn;
if (start < slot->base_gfn)
start += nr_pages;
/* * Unlike setting attributes, every potential hugepage needs to * be manually checked as the attributes may already be mixed.
*/ for (gfn = start; gfn < end; gfn += nr_pages) { unsignedlong attrs = kvm_get_memory_attributes(kvm, gfn);
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereitgestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.