// SPDX-License-Identifier: GPL-2.0-only /* * Kernel-based Virtual Machine driver for Linux * * This module enables machines with Intel VT-x extensions to run virtual * machines without emulation or binary translation. * * MMU support * * Copyright (C) 2006 Qumranet, Inc. * Copyright 2010 Red Hat, Inc. and/or its affiliates. * * Authors: * Yaniv Kamay <yaniv@qumranet.com> * Avi Kivity <avi@qumranet.com>
*/ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
/* * When setting this variable to true it enables Two-Dimensional-Paging * where the hardware walks 2 page tables: * 1. the guest-virtual to guest-physical * 2. while doing 1. it walks guest-physical to host-physical * If the hardware supports that we don't need to do shadow paging.
*/ bool tdp_enabled = false;
/* make pte_list_desc fit well in cache lines */ #define PTE_LIST_EXT 14
/*
 * struct pte_list_desc is the core data structure used to implement a custom
 * list for tracking a set of related SPTEs, e.g. all the SPTEs that map a
 * given GFN when used in the context of rmaps.  Using a custom list allows KVM
 * to optimize for the common case where many GFNs will have at most a handful
 * of SPTEs pointing at them, i.e. allows packing multiple SPTEs into a small
 * memory footprint, which in turn improves runtime performance by exploiting
 * cache locality.
 *
 * A list is comprised of one or more pte_list_desc objects (descriptors).
 * Each individual descriptor stores up to PTE_LIST_EXT SPTEs.  If a descriptor
 * is full and a new SPTE needs to be added, a new descriptor is allocated and
 * becomes the head of the list.  This means that by definition, all tail
 * descriptors are full.
 *
 * Note, the meta data fields are deliberately placed at the start of the
 * structure to optimize the cacheline layout; accessing the descriptor will
 * touch only a single cacheline so long as @spte_count <= 6 (or if only the
 * descriptor's metadata is accessed).
 */
struct pte_list_desc {
	struct pte_list_desc *more;
	/* The number of PTEs stored in _this_ descriptor. */
	u32 spte_count;
	/* The number of PTEs stored in all tails of this descriptor. */
	u32 tail_count;
	u64 *sptes[PTE_LIST_EXT];
};
/*
 * Yes, lots of underscores.  They're a hint that you probably shouldn't be
 * reading from the role_regs.  Once the root_role is constructed, it becomes
 * the single source of truth for the MMU's state.
 */
#define BUILD_MMU_ROLE_REGS_ACCESSOR(reg, name, flag)			\
static inline bool __maybe_unused					\
____is_##reg##_##name(const struct kvm_mmu_role_regs *regs)		\
{									\
	return !!(regs->reg & flag);					\
}
BUILD_MMU_ROLE_REGS_ACCESSOR(cr0, pg, X86_CR0_PG);
BUILD_MMU_ROLE_REGS_ACCESSOR(cr0, wp, X86_CR0_WP);
BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, pse, X86_CR4_PSE);
BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, pae, X86_CR4_PAE);
BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, smep, X86_CR4_SMEP);
BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, smap, X86_CR4_SMAP);
BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, pke, X86_CR4_PKE);
BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, la57, X86_CR4_LA57);
BUILD_MMU_ROLE_REGS_ACCESSOR(efer, nx, EFER_NX);
BUILD_MMU_ROLE_REGS_ACCESSOR(efer, lma, EFER_LMA);
/*
 * The MMU itself (with a valid role) is the single source of truth for the
 * MMU.  Do not use the regs used to build the MMU/role, nor the vCPU.  The
 * regs don't account for dependencies, e.g. clearing CR4 bits if CR0.PG=1,
 * and the vCPU may be incorrect/irrelevant.
 */
#define BUILD_MMU_ROLE_ACCESSOR(base_or_ext, reg, name)			\
static inline bool __maybe_unused is_##reg##_##name(struct kvm_mmu *mmu)	\
{									\
	return !!(mmu->cpu_role. base_or_ext . reg##_##name);		\
}
BUILD_MMU_ROLE_ACCESSOR(base, cr0, wp);
BUILD_MMU_ROLE_ACCESSOR(ext, cr4, pse);
BUILD_MMU_ROLE_ACCESSOR(ext, cr4, smep);
BUILD_MMU_ROLE_ACCESSOR(ext, cr4, smap);
BUILD_MMU_ROLE_ACCESSOR(ext, cr4, pke);
BUILD_MMU_ROLE_ACCESSOR(ext, cr4, la57);
BUILD_MMU_ROLE_ACCESSOR(base, efer, nx);
BUILD_MMU_ROLE_ACCESSOR(ext, efer, lma);
static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index);
/* Flush the range of guest memory mapped by the given SPTE. */ staticvoid kvm_flush_remote_tlbs_sptep(struct kvm *kvm, u64 *sptep)
{ struct kvm_mmu_page *sp = sptep_to_sp(sptep);
gfn_t gfn = kvm_mmu_page_get_gfn(sp, spte_index(sptep));
/* * If we map the spte from nonpresent to present, We should store * the high bits firstly, then set present bit, so cpu can not * fetch this spte while we are setting the spte.
*/
smp_wmb();
WRITE_ONCE(ssptep->spte_low, sspte.spte_low);
}
staticvoid __update_clear_spte_fast(u64 *sptep, u64 spte)
{ union split_spte *ssptep, sspte;
/* xchg acts as a barrier before the setting of the high bits */
orig.spte_low = xchg(&ssptep->spte_low, sspte.spte_low);
orig.spte_high = ssptep->spte_high;
ssptep->spte_high = sspte.spte_high;
count_spte_clear(sptep, spte);
return orig.spte;
}
/* * The idea using the light way get the spte on x86_32 guest is from * gup_get_pte (mm/gup.c). * * An spte tlb flush may be pending, because they are coalesced and * we are running out of the MMU lock. Therefore * we need to protect against in-progress updates of the spte. * * Reading the spte while an update is in progress may get the old value * for the high part of the spte. The race is fine for a present->non-present * change (because the high part of the spte is ignored for non-present spte), * but for a present->present change we must reread the spte. * * All such changes are done in two steps (present->non-present and * non-present->present), hence it is enough to count the number of * present->non-present updates: if it changed while reading the spte, * we might have hit the race. This is done using clear_spte_count.
*/ static u64 __get_spte_lockless(u64 *sptep)
{ struct kvm_mmu_page *sp = sptep_to_sp(sptep); union split_spte spte, *orig = (union split_spte *)sptep; int count;
retry:
count = sp->clear_spte_count;
smp_rmb();
spte.spte_low = orig->spte_low;
smp_rmb();
spte.spte_high = orig->spte_high;
smp_rmb();
if (unlikely(spte.spte_low != orig->spte_low ||
count != sp->clear_spte_count)) goto retry;
return spte.spte;
} #endif
/*
 * Rules for using mmu_spte_set:
 * Set the sptep from nonpresent to present.
 * Note: the sptep being assigned *must* be either not present
 * or in a state where the hardware will not attempt to update
 * the spte.
 */
static void mmu_spte_set(u64 *sptep, u64 new_spte)
{
	WARN_ON_ONCE(is_shadow_present_pte(*sptep));
	__set_spte(sptep, new_spte);
}
/* Rules for using mmu_spte_update: * Update the state bits, it means the mapped pfn is not changed. * * Returns true if the TLB needs to be flushed
*/ staticbool mmu_spte_update(u64 *sptep, u64 new_spte)
{
u64 old_spte = *sptep;
/* * Rules for using mmu_spte_clear_track_bits: * It sets the sptep from present to nonpresent, and track the * state bits, it is used to clear the last level sptep. * Returns the old PTE.
*/ static u64 mmu_spte_clear_track_bits(struct kvm *kvm, u64 *sptep)
{
u64 old_spte = *sptep; int level = sptep_to_sp(sptep)->role.level;
/*
 * Rules for using mmu_spte_clear_no_track:
 * Directly clear spte without caring about the state bits of sptep,
 * it is used to set the upper level spte.
 */
static void mmu_spte_clear_no_track(u64 *sptep)
{
	__update_clear_spte_fast(sptep, SHADOW_NONPRESENT_VALUE);
}
/* Begin a lockless walk of the shadow page tables for @vcpu. */
static void walk_shadow_page_lockless_begin(struct kvm_vcpu *vcpu)
{
	if (is_tdp_mmu_active(vcpu)) {
		kvm_tdp_mmu_walk_lockless_begin();
	} else {
		/*
		 * Prevent page table teardown by making any free-er wait during
		 * kvm_flush_remote_tlbs() IPI to all active vcpus.
		 */
		local_irq_disable();

		/*
		 * Make sure a following spte read is not reordered ahead of the
		 * write to vcpu->mode.
		 */
		smp_store_mb(vcpu->mode, READING_SHADOW_PAGE_TABLES);
	}
}
/* End a lockless walk started by walk_shadow_page_lockless_begin(). */
static void walk_shadow_page_lockless_end(struct kvm_vcpu *vcpu)
{
	if (is_tdp_mmu_active(vcpu)) {
		kvm_tdp_mmu_walk_lockless_end();
	} else {
		/*
		 * Make sure the write to vcpu->mode is not reordered in front
		 * of reads to sptes.  If it does, kvm_mmu_commit_zap_page()
		 * can see us OUTSIDE_GUEST_MODE and proceed to free the shadow
		 * page table.
		 */
		smp_store_release(&vcpu->mode, OUTSIDE_GUEST_MODE);
		local_irq_enable();
	}
}
/*
 * Top up the per-vCPU caches used to allocate MMU objects so that allocations
 * while holding mmu_lock cannot fail.  Returns 0 on success, a negative errno
 * from kvm_mmu_topup_memory_cache() on failure.
 */
static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu, bool maybe_indirect)
{
	int r;

	/* 1 rmap, 1 parent PTE per level, and the prefetched rmaps. */
	r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache,
				       1 + PT64_ROOT_MAX_LEVEL + PTE_PREFETCH_NUM);
	if (r)
		return r;
	if (kvm_has_mirrored_tdp(vcpu->kvm)) {
		r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_external_spt_cache,
					       PT64_ROOT_MAX_LEVEL);
		if (r)
			return r;
	}
	r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_shadow_page_cache,
				       PT64_ROOT_MAX_LEVEL);
	if (r)
		return r;
	if (maybe_indirect) {
		r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_shadowed_info_cache,
					       PT64_ROOT_MAX_LEVEL);
		if (r)
			return r;
	}
	return kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_page_header_cache,
					  PT64_ROOT_MAX_LEVEL);
}
/*
 * For leaf SPTEs, fetch the *guest* access permissions being shadowed.  Note
 * that the SPTE itself may have more constrained access permissions than
 * what the guest enforces.  For example, a guest may create an executable
 * huge PTE but KVM may disallow execution to mitigate iTLB multihit.
 */
static u32 kvm_mmu_page_get_access(struct kvm_mmu_page *sp, int index)
{
	/*
	 * For direct MMUs (e.g. TDP or non-paging guests) or passthrough SPs,
	 * KVM is not shadowing any guest page tables, so the "guest access
	 * permissions" are just ACC_ALL.
	 *
	 * For direct SPs in indirect MMUs (shadow paging), i.e. when KVM
	 * is shadowing a guest huge page with small pages, the guest access
	 * permissions being shadowed are the access permissions of the huge
	 * page.
	 *
	 * In both cases, sp->role.access contains the correct access bits.
	 */
	if (!sp->shadowed_translation)
		return sp->role.access;

	return sp->shadowed_translation[index] & ACC_ALL;
}
/* * Return the pointer to the large page information for a given gfn, * handling slots that are not large page aligned.
*/ staticstruct kvm_lpage_info *lpage_info_slot(gfn_t gfn, conststruct kvm_memory_slot *slot, int level)
{ unsignedlong idx;
/* * The most significant bit in disallow_lpage tracks whether or not memory * attributes are mixed, i.e. not identical for all gfns at the current level. * The lower order bits are used to refcount other cases where a hugepage is * disallowed, e.g. if KVM has shadow a page table at the gfn.
*/ #define KVM_LPAGE_MIXED_FLAG BIT(31)
staticvoid update_gfn_disallow_lpage_count(conststruct kvm_memory_slot *slot,
gfn_t gfn, int count)
{ struct kvm_lpage_info *linfo; int old, i;
for (i = PG_LEVEL_2M; i <= KVM_MAX_HUGEPAGE_LEVEL; ++i) {
linfo = lpage_info_slot(gfn, slot, i);
kvm->arch.indirect_shadow_pages++; /* * Ensure indirect_shadow_pages is elevated prior to re-reading guest * child PTEs in FNAME(gpte_changed), i.e. guarantee either in-flight * emulated writes are visible before re-reading guest PTEs, or that * an emulated write will see the elevated count and acquire mmu_lock * to update SPTEs. Pairs with the smp_mb() in kvm_mmu_track_write().
*/
smp_mb();
/* the non-leaf shadow pages are keeping readonly. */ if (sp->role.level > PG_LEVEL_4K) return __kvm_write_track_add_gfn(kvm, slot, gfn);
kvm_mmu_gfn_disallow_lpage(slot, gfn);
if (kvm_mmu_slot_gfn_write_protect(kvm, slot, gfn, PG_LEVEL_4K))
kvm_flush_remote_tlbs_gfn(kvm, gfn, PG_LEVEL_4K);
}
void track_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp)
{ /* * If it's possible to replace the shadow page with an NX huge page, * i.e. if the shadow page is the only thing currently preventing KVM * from using a huge page, add the shadow page to the list of "to be * zapped for NX recovery" pages. Note, the shadow page can already be * on the list if KVM is reusing an existing shadow page, i.e. if KVM * links a shadow page at multiple points.
*/ if (!list_empty(&sp->possible_nx_huge_page_link)) return;
slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); if (!slot || slot->flags & KVM_MEMSLOT_INVALID) return NULL; if (no_dirty_log && kvm_slot_dirty_track_enabled(slot)) return NULL;
return slot;
}
/* * About rmap_head encoding: * * If the bit zero of rmap_head->val is clear, then it points to the only spte * in this rmap chain. Otherwise, (rmap_head->val & ~3) points to a struct * pte_list_desc containing more mappings.
*/ #define KVM_RMAP_MANY BIT(0)
/* * rmaps and PTE lists are mostly protected by mmu_lock (the shadow MMU always * operates with mmu_lock held for write), but rmaps can be walked without * holding mmu_lock so long as the caller can tolerate SPTEs in the rmap chain * being zapped/dropped _while the rmap is locked_. * * Other than the KVM_RMAP_LOCKED flag, modifications to rmap entries must be * done while holding mmu_lock for write. This allows a task walking rmaps * without holding mmu_lock to concurrently walk the same entries as a task * that is holding mmu_lock but _not_ the rmap lock. Neither task will modify * the rmaps, thus the walks are stable. * * As alluded to above, SPTEs in rmaps are _not_ protected by KVM_RMAP_LOCKED, * only the rmap chains themselves are protected. E.g. holding an rmap's lock * ensures all "struct pte_list_desc" fields are stable.
*/ #define KVM_RMAP_LOCKED BIT(1)
/* * Elide the lock if the rmap is empty, as lockless walkers (read-only * mode) don't need to (and can't) walk an empty rmap, nor can they add * entries to the rmap. I.e. the only paths that process empty rmaps * do so while holding mmu_lock for write, and are mutually exclusive.
*/
old_val = atomic_long_read(&rmap_head->val); if (!old_val) return 0;
do { /* * If the rmap is locked, wait for it to be unlocked before * trying acquire the lock, e.g. to avoid bouncing the cache * line.
*/ while (old_val & KVM_RMAP_LOCKED) {
cpu_relax();
old_val = atomic_long_read(&rmap_head->val);
}
/* * Recheck for an empty rmap, it may have been purged by the * task that held the lock.
*/ if (!old_val) return 0;
new_val = old_val | KVM_RMAP_LOCKED; /* * Use try_cmpxchg_acquire() to prevent reads and writes to the rmap * from being reordered outside of the critical section created by * __kvm_rmap_lock(). * * Pairs with the atomic_long_set_release() in kvm_rmap_unlock(). * * For the !old_val case, no ordering is needed, as there is no rmap * to walk.
*/
} while (!atomic_long_try_cmpxchg_acquire(&rmap_head->val, &old_val, new_val));
/* * Return the old value, i.e. _without_ the LOCKED bit set. It's * impossible for the return value to be 0 (see above), i.e. the read- * only unlock flow can't get a false positive and fail to unlock.
*/ return old_val;
}
staticvoid __kvm_rmap_unlock(struct kvm_rmap_head *rmap_head, unsignedlong val)
{
KVM_MMU_WARN_ON(val & KVM_RMAP_LOCKED); /* * Ensure that all accesses to the rmap have completed before unlocking * the rmap. * * Pairs with the atomic_long_try_cmpxchg_acquire() in __kvm_rmap_lock().
*/
atomic_long_set_release(&rmap_head->val, val);
}
/* * If mmu_lock isn't held, rmaps can only be locked in read-only mode. The * actual locking is the same, but the caller is disallowed from modifying the * rmap, and so the unlock flow is a nop if the rmap is/was empty.
*/ staticunsignedlong kvm_rmap_lock_readonly(struct kvm_rmap_head *rmap_head)
{ unsignedlong rmap_val;
/* * Returns the number of pointers in the rmap chain, not counting the new one.
*/ staticint pte_list_add(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
u64 *spte, struct kvm_rmap_head *rmap_head)
{ unsignedlong old_val, new_val; struct pte_list_desc *desc; int count = 0;
/* * The head descriptor should never be empty. A new head is added only * when adding an entry and the previous head is full, and heads are * removed (this flow) when they become empty.
*/
KVM_BUG_ON_DATA_CORRUPTION(j < 0, kvm);
/* * Replace the to-be-freed SPTE with the last valid entry from the head * descriptor to ensure that tail descriptors are full at all times. * Note, this also means that tail_count is stable for each descriptor.
*/
desc->sptes[i] = head_desc->sptes[j];
head_desc->sptes[j] = NULL;
head_desc->spte_count--; if (head_desc->spte_count) return;
/* * The head descriptor is empty. If there are no tail descriptors, * nullify the rmap head to mark the list as empty, else point the rmap * head at the next descriptor, i.e. the new head.
*/ if (!head_desc->more)
*rmap_val = 0; else
*rmap_val = (unsignedlong)head_desc->more | KVM_RMAP_MANY;
mmu_free_pte_list_desc(head_desc);
}
/* Return true if at least one SPTE was zapped, false otherwise */ staticbool kvm_zap_all_rmap_sptes(struct kvm *kvm, struct kvm_rmap_head *rmap_head)
{ struct pte_list_desc *desc, *next; unsignedlong rmap_val; int i;
rmap_val = kvm_rmap_lock(kvm, rmap_head); if (!rmap_val) returnfalse;
for (; desc; desc = next) { for (i = 0; i < desc->spte_count; i++)
mmu_spte_clear_track_bits(kvm, desc->sptes[i]);
next = desc->more;
mmu_free_pte_list_desc(desc);
}
out: /* rmap_head is meaningless now, remember to reset it */
kvm_rmap_unlock(kvm, rmap_head, 0); returntrue;
}
/* * Unlike rmap_add, rmap_remove does not run in the context of a vCPU * so we have to determine which memslots to use based on context * information in sp->role.
*/
slots = kvm_memslots_for_spte_role(kvm, sp->role);
/*
 * Used by the following functions to iterate through the sptes linked by a
 * rmap.  All fields are private and not assumed to be used outside.
 */
struct rmap_iterator {
	/* private fields */
	struct rmap_head *head;
	struct pte_list_desc *desc;	/* holds the sptep if not NULL */
	int pos;			/* index of the sptep */
};
/* * Iteration must be started by this function. This should also be used after * removing/dropping sptes from the rmap link because in such cases the * information in the iterator may not be valid. * * Returns sptep if found, NULL otherwise.
*/ static u64 *rmap_get_first(struct kvm_rmap_head *rmap_head, struct rmap_iterator *iter)
{ unsignedlong rmap_val = kvm_rmap_get(rmap_head);
/* * Must be used with a valid iterator: e.g. after rmap_get_first(). * * Returns sptep if found, NULL otherwise.
*/ static u64 *rmap_get_next(struct rmap_iterator *iter)
{ if (iter->desc) { if (iter->pos < PTE_LIST_EXT - 1) {
++iter->pos; if (iter->desc->sptes[iter->pos]) return iter->desc->sptes[iter->pos];
}
iter->desc = iter->desc->more;
if (iter->desc) {
iter->pos = 0; /* desc->sptes[0] cannot be NULL */ return iter->desc->sptes[iter->pos];
}
}
if (flush)
kvm_flush_remote_tlbs_sptep(kvm, sptep);
}
/* * Write-protect on the specified @sptep, @pt_protect indicates whether * spte write-protection is caused by protecting shadow page table. * * Note: write protection is difference between dirty logging and spte * protection: * - for dirty logging, the spte can be set to writable at anytime if * its dirty bitmap is properly set. * - for spte protection, the spte can be writable only after unsync-ing * shadow page. * * Return true if tlb need be flushed.
*/ staticbool spte_write_protect(u64 *sptep, bool pt_protect)
{
u64 spte = *sptep;
if (!is_writable_pte(spte) &&
!(pt_protect && is_mmu_writable_spte(spte))) returnfalse;
if (pt_protect)
spte &= ~shadow_mmu_writable_mask;
spte = spte & ~PT_WRITABLE_MASK;
/* * Gets the GFN ready for another round of dirty logging by clearing the * - D bit on ad-enabled SPTEs, and * - W bit on ad-disabled SPTEs. * Returns true iff any D or W bits were cleared.
*/ staticbool __rmap_clear_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head, conststruct kvm_memory_slot *slot)
{
u64 *sptep; struct rmap_iterator iter; bool flush = false;
/* clear the first set bit */
mask &= mask - 1;
}
}
/*
 * (Re)enable dirty logging for the 4KiB GFNs set in @mask, relative to
 * @slot->base_gfn + @gfn_offset.  Depending on the dirty-log configuration,
 * the SPTEs are write-protected or have their Dirty bit cleared.
 */
void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
					     struct kvm_memory_slot *slot,
					     gfn_t gfn_offset, unsigned long mask)
{
	/*
	 * If the slot was assumed to be "initially all dirty", write-protect
	 * huge pages to ensure they are split to 4KiB on the first write (KVM
	 * dirty logs at 4KiB granularity).  If eager page splitting is
	 * enabled, immediately try to split huge pages, e.g. so that vCPUs
	 * don't get saddled with the cost of splitting.
	 *
	 * The gfn_offset is guaranteed to be aligned to 64, but the base_gfn
	 * of memslot has no such restriction, so the range can cross two large
	 * pages.
	 */
	if (kvm_dirty_log_manual_protect_and_init_set(kvm)) {
		gfn_t start = slot->base_gfn + gfn_offset + __ffs(mask);
		gfn_t end = slot->base_gfn + gfn_offset + __fls(mask);

		if (READ_ONCE(eager_page_split))
			kvm_mmu_try_split_huge_pages(kvm, slot, start, end + 1, PG_LEVEL_4K);

		/*
		 * Write-protect the huge page containing the start of the
		 * range; without this, the first large page would never be
		 * split on write, contradicting the comment above.
		 */
		kvm_mmu_slot_gfn_write_protect(kvm, slot, start, PG_LEVEL_2M);

		/* Cross two large pages? */
		if (ALIGN(start << PAGE_SHIFT, PMD_SIZE) !=
		    ALIGN(end << PAGE_SHIFT, PMD_SIZE))
			kvm_mmu_slot_gfn_write_protect(kvm, slot, end,
						       PG_LEVEL_2M);
	}

	/*
	 * (Re)Enable dirty logging for all 4KiB SPTEs that map the GFNs in
	 * mask.  If PML is enabled and the GFN doesn't need to be write-
	 * protected for other reasons, e.g. shadow paging, clear the Dirty
	 * bit.  Otherwise clear the Writable bit.
	 *
	 * Note that kvm_mmu_clear_dirty_pt_masked() is called whenever PML is
	 * enabled but it chooses between clearing the Dirty bit and Writeable
	 * bit based on the context.
	 */
	if (kvm->arch.cpu_dirty_log_size)
		kvm_mmu_clear_dirty_pt_masked(kvm, slot, gfn_offset, mask);
	else
		kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask);
}
/* Accessor for the per-VM CPU dirty-log size tracked in kvm->arch. */
int kvm_cpu_dirty_log_size(struct kvm *kvm)
{
	return kvm->arch.cpu_dirty_log_size;
}
if (kvm_memslots_have_rmaps(kvm)) { for (i = min_level; i <= KVM_MAX_HUGEPAGE_LEVEL; ++i) {
rmap_head = gfn_to_rmap(gfn, i, slot);
write_protected |= rmap_write_protect(rmap_head, true);
}
}
if (tdp_mmu_enabled)
write_protected |=
kvm_tdp_mmu_write_protect_gfn(kvm, slot, gfn, min_level);
/* The return value indicates if tlb flush on all vcpus is needed. */ typedefbool (*slot_rmaps_handler) (struct kvm *kvm, struct kvm_rmap_head *rmap_head, conststruct kvm_memory_slot *slot);
/* * To prevent races with vCPUs faulting in a gfn using stale data, * zapping a gfn range must be protected by mmu_invalidate_in_progress * (and mmu_invalidate_seq). The only exception is memslot deletion; * in that case, SRCU synchronization ensures that SPTEs are zapped * after all vCPUs have unlocked SRCU, guaranteeing that vCPUs see the * invalid slot.
*/
lockdep_assert_once(kvm->mmu_invalidate_in_progress ||
lockdep_is_held(&kvm->slots_lock));
if (kvm_memslots_have_rmaps(kvm))
flush = __kvm_rmap_zap_gfn_range(kvm, range->slot,
range->start, range->end,
range->may_block, flush);
if (tdp_mmu_enabled)
flush = kvm_tdp_mmu_unmap_gfn_range(kvm, range, flush);
if (kvm_x86_ops.set_apic_access_page_addr &&
range->slot->id == APIC_ACCESS_PAGE_PRIVATE_MEMSLOT)
kvm_make_all_cpus_request(kvm, KVM_REQ_APIC_PAGE_RELOAD);
for_each_rmap_spte_lockless(rmap_head, &iter, sptep, spte) { if (!is_accessed_spte(spte)) continue;
if (test_only) {
kvm_rmap_unlock_readonly(rmap_head, rmap_val); returntrue;
}
if (spte_ad_enabled(spte))
clear_bit((ffs(shadow_accessed_mask) - 1),
(unsignedlong *)sptep); else /* * If the following cmpxchg fails, the * spte is being concurrently modified * and should most likely stay young.
*/
cmpxchg64(sptep, spte,
mark_spte_for_access_track(spte));
young = true;
}
if (tdp_mmu_enabled)
young = kvm_tdp_mmu_test_age_gfn(kvm, range);
if (young) return young;
if (kvm_may_have_shadow_mmu_sptes(kvm))
young |= kvm_rmap_age_gfn_range(kvm, range, true);
return young;
}
/*
 * Sanity check (CONFIG_KVM_PROVE_MMU only) that no SPTE in @sp is still
 * shadow-present when the shadow page is freed; a no-op otherwise.
 */
static void kvm_mmu_check_sptes_at_free(struct kvm_mmu_page *sp)
{
#ifdef CONFIG_KVM_PROVE_MMU
	int i;

	for (i = 0; i < SPTE_ENT_PER_PAGE; i++) {
		if (KVM_MMU_WARN_ON(is_shadow_present_pte(sp->spt[i])))
			pr_err_ratelimited("SPTE %llx (@ %p) for gfn %llx shadow-present at free",
					   sp->spt[i], &sp->spt[i],
					   kvm_mmu_page_get_gfn(sp, i));
	}
#endif
}
staticstruct hlist_head *kvm_get_mmu_page_hash(struct kvm *kvm, gfn_t gfn)
{ /* * Ensure the load of the hash table pointer itself is ordered before * loads to walk the table. The pointer is set at runtime outside of * mmu_lock when the TDP MMU is enabled, i.e. when the hash table of * shadow pages becomes necessary only when KVM needs to shadow L1's * TDP for an L2 guest. Pairs with the smp_store_release() in * kvm_mmu_alloc_page_hash().
*/ struct hlist_head *page_hash = smp_load_acquire(&kvm->arch.mmu_page_hash);
/* * Ignore various flags when verifying that it's safe to sync a shadow * page using the current MMU context. * * - level: not part of the overall MMU role and will never match as the MMU's * level tracks the root level * - access: updated based on the new guest PTE * - quadrant: not part of the overall MMU role (similar to level)
*/ constunion kvm_mmu_page_role sync_role_ign = {
.level = 0xf,
.access = 0x7,
.quadrant = 0x3,
.passthrough = 0x1,
};
/* * Direct pages can never be unsync, and KVM should never attempt to * sync a shadow page for a different MMU context, e.g. if the role * differs then the memslot lookup (SMM vs. non-SMM) will be bogus, the * reserved bits checks will be wrong, etc...
*/ if (WARN_ON_ONCE(sp->role.direct || !vcpu->arch.mmu->sync_spte ||
(sp->role.word ^ root_role.word) & ~sync_role_ign.word)) returnfalse;
returntrue;
}
/*
 * Sync the SPTE at index @i in @sp with the guest PTE, skipping entries that
 * still hold the initial not-present value.  Returns the sync_spte() result,
 * or 0 if there is nothing to sync.
 */
static int kvm_sync_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, int i)
{
	/* sp->spt[i] has initial value of shadow page table allocation */
	if (sp->spt[i] == SHADOW_NONPRESENT_VALUE)
		return 0;

	return vcpu->arch.mmu->sync_spte(vcpu, sp, i);
}
/*
 * Sync every SPTE in @sp with the guest's page table.  Returns a negative
 * value if the page cannot be synced (and should be zapped), otherwise a
 * non-zero value if a TLB flush is needed.
 */
static int __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
{
	int flush = 0;
	int i;

	if (!kvm_sync_page_check(vcpu, sp))
		return -1;

	for (i = 0; i < SPTE_ENT_PER_PAGE; i++) {
		int ret = kvm_sync_spte(vcpu, sp, i);

		if (ret < -1)
			return -1;
		flush |= ret;
	}

	/*
	 * Note, any flush is purely for KVM's correctness, e.g. when dropping
	 * an existing SPTE or clearing W/A/D bits to ensure an mmu_notifier
	 * unmap or dirty logging event doesn't fail to flush.  The guest is
	 * responsible for flushing the TLB to ensure any changes in protection
	 * bits are recognized, i.e. until the guest flushes or page faults on
	 * a relevant address, KVM is architecturally allowed to let vCPUs use
	 * cached translations with the old protection bits.
	 */
	return flush;
}
/*
 * Sync @sp and, if the sync fails, queue the page for zapping on
 * @invalid_list.  Returns the __kvm_sync_page() result.
 */
static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
			 struct list_head *invalid_list)
{
	int ret = __kvm_sync_page(vcpu, sp);

	if (ret < 0)
		kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list);
	return ret;
}
/* Also set up a sentinel. Further entries in pvec are all * children of sp, so this element is never overwritten.
*/
parents->parent[level-1] = NULL; return mmu_pages_next(pvec, parents, 0);
}
/* * The vCPU is required when finding indirect shadow pages; the shadow * page may already exist and syncing it needs the vCPU pointer in * order to read guest page tables. Direct shadow pages are never * unsync, thus @vcpu can be NULL if @role.direct is true.
*/ staticstruct kvm_mmu_page *kvm_mmu_find_shadow_page(struct kvm *kvm, struct kvm_vcpu *vcpu,
gfn_t gfn, struct hlist_head *sp_list, union kvm_mmu_page_role role)
{ struct kvm_mmu_page *sp; int ret; int collisions = 0;
LIST_HEAD(invalid_list);
if (sp->role.word != role.word) { /* * If the guest is creating an upper-level page, zap * unsync pages for the same gfn. While it's possible * the guest is using recursive page tables, in all * likelihood the guest has stopped using the unsync * page and is installing a completely unrelated page. * Unsync pages must not be left as is, because the new * upper-level page will be write-protected.
*/ if (role.level > PG_LEVEL_4K && sp->unsync)
kvm_mmu_prepare_zap_page(kvm, sp,
&invalid_list); continue;
}
/* unsync and write-flooding only apply to indirect SPs. */ if (sp->role.direct) goto out;
if (sp->unsync) { if (KVM_BUG_ON(!vcpu, kvm)) break;
/* * The page is good, but is stale. kvm_sync_page does * get the latest guest state, but (unlike mmu_unsync_children) * it doesn't write-protect the page or mark it synchronized! * This way the validity of the mapping is ensured, but the * overhead of write protection is not incurred until the * guest invalidates the TLB mapping. This allows multiple * SPs for a single gfn to be unsync. * * If the sync fails, the page is zapped. If so, break * in order to rebuild it.
*/
ret = kvm_sync_page(vcpu, sp, &invalid_list); if (ret < 0) break;
WARN_ON_ONCE(!list_empty(&invalid_list)); if (ret > 0)
kvm_flush_remote_tlbs(kvm);
}
__clear_sp_write_flooding_count(sp);
goto out;
}
sp = NULL;
++kvm->stat.mmu_cache_miss;
out:
kvm_mmu_commit_zap_page(kvm, &invalid_list);
if (collisions > kvm->stat.max_mmu_page_hash_collisions)
kvm->stat.max_mmu_page_hash_collisions = collisions; return sp;
}
/* Caches used when allocating a new shadow page. */ struct shadow_page_caches { struct kvm_mmu_memory_cache *page_header_cache; struct kvm_mmu_memory_cache *shadow_page_cache; struct kvm_mmu_memory_cache *shadowed_info_cache;
};
/* * active_mmu_pages must be a FIFO list, as kvm_zap_obsolete_pages() * depends on valid pages being added to the head of the list. See * comments in kvm_zap_obsolete_pages().
*/
sp->mmu_valid_gen = kvm->arch.mmu_valid_gen;
list_add(&sp->link, &kvm->arch.active_mmu_pages);
--> --------------------
--> maximum size reached
--> --------------------
Messung V0.5
¤ Dauer der Verarbeitung: 0.45 Sekunden
(vorverarbeitet)
¤
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereit gestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.