/*
 * Mark a range of guest physical address space old (all accesses fault) in the
 * VM's GPA page table to allow detection of commonly used pages.
 */
static int kvm_mkold_pte(kvm_pte_t *pte, phys_addr_t addr, kvm_ptw_ctx *ctx)
{
	if (kvm_pte_young(*pte)) {
		*pte = kvm_pte_mkold(*pte);
		return 1;
	}

	return 0;
}
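/*
 * Both kvm_mkold_pte() above and kvm_mkclean_pte() below are per-entry
 * callbacks: the page-table walker invokes them through ctx->ops for every
 * present leaf entry in the requested GPA range, and their non-zero return
 * value is OR-ed into the walk result to signal that a TLB flush is needed.
 */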
/*
 * Mark a range of guest physical address space clean (writes fault) in the VM's
 * GPA page table to allow dirty page tracking.
 */
static int kvm_mkclean_pte(kvm_pte_t *pte, phys_addr_t addr, kvm_ptw_ctx *ctx)
{
	gfn_t offset;
	kvm_pte_t val;

	val = *pte;
	/*
	 * For kvm_arch_mmu_enable_log_dirty_pt_masked with mask, start and end
	 * may cross a hugepage: for the first huge page addr is equal to start,
	 * but for the second huge page addr is the base address of that huge
	 * page rather than start or end.
	 */
	if ((ctx->flag & _KVM_HAS_PGMASK) && !kvm_pte_huge(val)) {
		offset = (addr >> PAGE_SHIFT) - ctx->gfn;
		if (!(BIT(offset) & ctx->mask))
			return 0;
	}

	/*
	 * No need to split the huge page now, just set the write-protect pte bit.
	 * The huge page is split on the next write fault.
	 */
	if (kvm_pte_dirty(val)) {
		*pte = kvm_pte_mkclean(val);
		return 1;
	}

	return 0;
}
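/*
 * Worked example for the _KVM_HAS_PGMASK path in kvm_mkclean_pte(): with
 * ctx->gfn = 0x1000 and ctx->mask = 0x30, only the pages at gfn 0x1004 and
 * 0x1005 are candidates. For addr == (0x1004UL << PAGE_SHIFT), offset is 4 and
 * BIT(4) & mask is non-zero, so that pte may be write-protected; every other
 * pte in the range is skipped with a 0 return.
 */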
/* Fill a page-table page at @addr so that every entry holds @val */
static void _kvm_pte_init(void *addr, unsigned long val)
{
	unsigned long *p, *end;

	p = (unsigned long *)addr;
	end = p + PTRS_PER_PTE;
	do {
		p[0] = val;
		p[1] = val;
		p[2] = val;
		p[3] = val;
		p[4] = val;
		p += 8;
		p[-3] = val;
		p[-2] = val;
		p[-1] = val;
	} while (p != end);
}
/*
 * Caller must hold kvm->mm_lock
 *
 * Walk the page tables of kvm to find the PTE corresponding to the
 * address @addr. If page tables don't exist for @addr, they will be created
 * from the MMU cache if @cache is not NULL.
 */
static kvm_pte_t *kvm_populate_gpa(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
				unsigned long addr, int level)
{
	kvm_ptw_ctx ctx;
	kvm_pte_t *entry, *child;

	kvm_ptw_prepare(kvm, &ctx);
	child = kvm->arch.pgd;
	while (ctx.level > level) {
		entry = kvm_pgtable_offset(&ctx, child, addr);
		if (kvm_pte_none(&ctx, entry)) {
			if (!cache)
				return NULL;

			child = kvm_mmu_memory_cache_alloc(cache);
			_kvm_pte_init(child, ctx.invalid_ptes[ctx.level - 1]);
			kvm_set_pte(entry, __pa(child));
		} else if (kvm_pte_huge(*entry)) {
			return entry;
		} else
			child = (kvm_pte_t *)__va(PHYSADDR(*entry));
		kvm_ptw_enter(&ctx);
	}

	entry = kvm_pgtable_offset(&ctx, child, addr);

	return entry;
}
/*
 * Page walker for VM shadow mmu at the last level
 * The last level is a small pte page or a huge pmd page
 */
static int kvm_ptw_leaf(kvm_pte_t *dir, phys_addr_t addr, phys_addr_t end, kvm_ptw_ctx *ctx)
{
	int ret;
	phys_addr_t next, start, size;
	struct list_head *list;
	kvm_pte_t *entry, *child;

	ret = 0;
	start = addr;
	child = (kvm_pte_t *)__va(PHYSADDR(*dir));
	entry = kvm_pgtable_offset(ctx, child, addr);
	do {
		next = addr + (0x1UL << ctx->pgtable_shift);
		if (!kvm_pte_present(ctx, entry))
			continue;

		ret |= ctx->ops(entry, addr, ctx);
	} while (entry++, addr = next, addr < end);

	if (kvm_need_flush(ctx)) {
		size = 0x1UL << (ctx->pgtable_shift + PAGE_SHIFT - 3);
		if (start + size == end) {
			list = (struct list_head *)child;
			list_add_tail(list, &ctx->list);
			*dir = ctx->invalid_ptes[ctx->level + 1];
		}
	}

	return ret;
}
/*
 * Page walker for VM shadow mmu at the page root table
 */
static int kvm_ptw_top(kvm_pte_t *dir, phys_addr_t addr, phys_addr_t end, kvm_ptw_ctx *ctx)
{
	int ret;
	phys_addr_t next;
	kvm_pte_t *entry;

	ret = 0;
	entry = kvm_pgtable_offset(ctx, dir, addr);
	do {
		next = kvm_pgtable_addr_end(ctx, addr, end);
		if (!kvm_pte_present(ctx, entry))
			continue;

		kvm_ptw_enter(ctx);
		ret |= kvm_ptw_dir(entry, addr, next, ctx);
		kvm_ptw_exit(ctx);
	} while (entry++, addr = next, addr < end);

	return ret;
}
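/*
 * Illustration only: a sketch of how a per-PTE callback such as kvm_mkold_pte()
 * is driven through the walker, assuming a caller of roughly this shape (the
 * function name and the zero flag value are assumptions, not taken from the
 * code shown above).
 */
static int kvm_mkold_gpa_range(struct kvm *kvm, gfn_t start_gfn, gfn_t end_gfn)
{
	kvm_ptw_ctx ctx;

	ctx.ops = kvm_mkold_pte;	/* called for every present leaf entry */
	ctx.flag = 0;			/* no dirty-log mask, no page freeing */
	kvm_ptw_prepare(kvm, &ctx);

	return kvm_ptw_top(kvm->arch.pgd, start_gfn << PAGE_SHIFT,
				end_gfn << PAGE_SHIFT, &ctx);
}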
/*
 * kvm_flush_range() - Flush a range of guest physical addresses.
 * @kvm: KVM pointer.
 * @start_gfn: Guest frame number of first page in GPA range to flush.
 * @end_gfn: Guest frame number of last page in GPA range to flush.
 * @lock: Whether to hold mmu_lock or not
 *
 * Flushes a range of GPA mappings from the GPA page tables.
 */
static void kvm_flush_range(struct kvm *kvm, gfn_t start_gfn, gfn_t end_gfn, int lock)
{
	int ret;
	kvm_ptw_ctx ctx;
	struct list_head *pos, *temp;

	ctx.ops = kvm_flush_pte;
	ctx.flag = _KVM_FLUSH_PGTABLE;
	kvm_ptw_prepare(kvm, &ctx);
	INIT_LIST_HEAD(&ctx.list);

	if (lock) {
		spin_lock(&kvm->mmu_lock);
		ret = kvm_ptw_top(kvm->arch.pgd, start_gfn << PAGE_SHIFT,
					end_gfn << PAGE_SHIFT, &ctx);
		spin_unlock(&kvm->mmu_lock);
	} else
		ret = kvm_ptw_top(kvm->arch.pgd, start_gfn << PAGE_SHIFT,
					end_gfn << PAGE_SHIFT, &ctx);

	/* Flush vpid for each vCPU individually */
	if (ret)
		kvm_flush_remote_tlbs(kvm);

	/*
	 * Free the pte table pages after dropping mmu_lock;
	 * the pte table pages are linked together on ctx.list.
	 */
	list_for_each_safe(pos, temp, &ctx.list) {
		list_del(pos);
		free_page((unsigned long)pos);
	}
}
/*
 * kvm_mkclean_gpa_pt() - Make a range of guest physical addresses clean.
 * @kvm: KVM pointer.
 * @start_gfn: Guest frame number of first page in GPA range to flush.
 * @end_gfn: Guest frame number of last page in GPA range to flush.
 *
 * Make a range of GPA mappings clean so that guest writes will fault and
 * trigger dirty page logging.
 *
 * The caller must hold the @kvm->mmu_lock spinlock.
 *
 * Returns:	Whether any GPA mappings were modified, which would require
 *		derived mappings (GVA page tables & TLB entries) to be
 *		invalidated.
 */
static int kvm_mkclean_gpa_pt(struct kvm *kvm, gfn_t start_gfn, gfn_t end_gfn)
{
	kvm_ptw_ctx ctx;

	ctx.ops = kvm_mkclean_pte;
	ctx.flag = 0;
	kvm_ptw_prepare(kvm, &ctx);

	return kvm_ptw_top(kvm->arch.pgd, start_gfn << PAGE_SHIFT,
				end_gfn << PAGE_SHIFT, &ctx);
}
int kvm_arch_prepare_memory_region(struct kvm *kvm, const struct kvm_memory_slot *old,
				struct kvm_memory_slot *new, enum kvm_mr_change change)
{
	gpa_t gpa_start;
	hva_t hva_start;
	size_t size, gpa_offset, hva_offset;

	if ((change != KVM_MR_MOVE) && (change != KVM_MR_CREATE))
		return 0;
	/*
	 * Prevent userspace from creating a memory region outside of the
	 * VM GPA address space
	 */
	if ((new->base_gfn + new->npages) > (kvm->arch.gpa_size >> PAGE_SHIFT))
		return -ENOMEM;

	new->arch.flags = 0;
	size = new->npages * PAGE_SIZE;
	gpa_start = new->base_gfn << PAGE_SHIFT;
	hva_start = new->userspace_addr;
	if (IS_ALIGNED(size, PMD_SIZE) && IS_ALIGNED(gpa_start, PMD_SIZE)
			&& IS_ALIGNED(hva_start, PMD_SIZE))
		new->arch.flags |= KVM_MEM_HUGEPAGE_CAPABLE;
	else {
		/*
		 * Pages belonging to memslots that don't have the same
		 * alignment within a PMD for userspace and GPA cannot be
		 * mapped with PMD entries, because we'll end up mapping
		 * the wrong pages.
		 *
		 * Consider a layout like the following:
		 *
		 *    memslot->userspace_addr:
		 *    +-----+--------------------+--------------------+---+
		 *    |abcde|fgh  Stage-1 block  |    Stage-1 block tv|xyz|
		 *    +-----+--------------------+--------------------+---+
		 *
		 *    memslot->base_gfn << PAGE_SIZE:
		 *      +---+--------------------+--------------------+-----+
		 *      |abc|def  Stage-2 block  |    Stage-2 block   |tvxyz|
		 *      +---+--------------------+--------------------+-----+
		 *
		 * If we create those stage-2 blocks, we'll end up with this
		 * incorrect mapping:
		 *   d -> f
		 *   e -> g
		 *   f -> h
		 */
		gpa_offset = gpa_start & (PMD_SIZE - 1);
		hva_offset = hva_start & (PMD_SIZE - 1);
		if (gpa_offset != hva_offset) {
			new->arch.flags |= KVM_MEM_HUGEPAGE_INCAPABLE;
		} else {
			if (gpa_offset == 0)
				gpa_offset = PMD_SIZE;
			if ((size + gpa_offset) < (PMD_SIZE * 2))
				new->arch.flags |= KVM_MEM_HUGEPAGE_INCAPABLE;
		}
	}

	return 0;
}
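/*
 * Worked example for the alignment check above (illustrative numbers, assuming
 * PMD_SIZE = 2 MiB = 0x200000): a slot with gpa_start = 0x40100000 and
 * hva_start = 0x7f2a00100000 has gpa_offset = hva_offset = 0x100000, so PMD
 * mappings remain possible provided the slot also spans at least one fully
 * aligned 2 MiB block (size + gpa_offset >= 2 * PMD_SIZE). If instead
 * hva_start = 0x7f2a00180000, then hva_offset = 0x180000 != gpa_offset and the
 * slot is marked KVM_MEM_HUGEPAGE_INCAPABLE, since any 2 MiB stage-2 block
 * would map host pages shifted by 0x80000 from the intended ones.
 */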
void kvm_arch_commit_memory_region(struct kvm *kvm, struct kvm_memory_slot *old,
				const struct kvm_memory_slot *new, enum kvm_mr_change change)
{
	int needs_flush;
	u32 old_flags = old ? old->flags : 0;
	u32 new_flags = new ? new->flags : 0;
	bool log_dirty_pages = new_flags & KVM_MEM_LOG_DIRTY_PAGES;

	/* Only track memslot flags changed */
	if (change != KVM_MR_FLAGS_ONLY)
		return;

	/* Discard dirty page tracking on readonly memslot */
	if ((old_flags & new_flags) & KVM_MEM_READONLY)
		return;

	/*
	 * If dirty page logging is enabled, write protect all pages in the slot
	 * ready for dirty logging.
	 *
	 * There is no need to do this in any of the following cases:
	 * CREATE:	No dirty mappings will already exist.
	 * MOVE/DELETE:	The old mappings will already have been cleaned up by
	 *		kvm_arch_flush_shadow_memslot()
	 */
	if (!(old_flags & KVM_MEM_LOG_DIRTY_PAGES) && log_dirty_pages) {
		/*
		 * Initially-all-set does not require write protecting any page
		 * because they're all assumed to be dirty.
		 */
		if (kvm_dirty_log_manual_protect_and_init_set(kvm))
			return;

		spin_lock(&kvm->mmu_lock);
		/* Write protect GPA page table entries */
		needs_flush = kvm_mkclean_gpa_pt(kvm, new->base_gfn,
					new->base_gfn + new->npages);
		if (needs_flush)
			kvm_flush_remote_tlbs(kvm);
		spin_unlock(&kvm->mmu_lock);
	}
}
void kvm_arch_flush_shadow_memslot(struct kvm *kvm, struct kvm_memory_slot *slot)
{
	/*
	 * The slot has been made invalid (ready for moving or deletion), so we
	 * need to ensure that it can no longer be accessed by any guest vCPUs.
	 */
	kvm_flush_range(kvm, slot->base_gfn, slot->base_gfn + slot->npages, 1);
}
bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
	gpa_t gpa = range->start << PAGE_SHIFT;
	kvm_pte_t *ptep = kvm_populate_gpa(kvm, NULL, gpa, 0);

	if (ptep && kvm_pte_present(NULL, ptep) && kvm_pte_young(*ptep))
		return true;

	return false;
}
/*
 * kvm_map_page_fast() - Fast path GPA fault handler.
 * @vcpu: vCPU pointer.
 * @gpa: Guest physical address of fault.
 * @write: Whether the fault was due to a write.
 *
 * Perform fast path GPA fault handling, doing all that can be done without
 * calling into KVM. This handles marking old pages young (for idle page
 * tracking), and dirtying of clean pages (for dirty page logging).
 *
 * Returns:	0 on success, in which case we can update derived mappings and
 *		resume guest execution.
 *		-EFAULT on failure due to absent GPA mapping or write to
 *		read-only page, in which case KVM must be consulted.
 */
static int kvm_map_page_fast(struct kvm_vcpu *vcpu, unsigned long gpa, bool write)
{
	int ret = 0;
	kvm_pte_t *ptep, changed, new;
	gfn_t gfn = gpa >> PAGE_SHIFT;
	struct kvm *kvm = vcpu->kvm;
	struct kvm_memory_slot *slot;

	spin_lock(&kvm->mmu_lock);

	/* Fast path - just check GPA page table for an existing entry */
	ptep = kvm_populate_gpa(kvm, NULL, gpa, 0);
	if (!ptep || !kvm_pte_present(NULL, ptep)) {
		ret = -EFAULT;
		goto out;
	}

	/* Track access to pages marked old */
	new = kvm_pte_mkyoung(*ptep);
	if (write && !kvm_pte_dirty(new)) {
		if (!kvm_pte_writeable(new)) {
			ret = -EFAULT;
			goto out;
		}

		if (kvm_pte_huge(new)) {
			/*
			 * Do not set write permission when dirty logging is
			 * enabled for HugePages
			 */
			slot = gfn_to_memslot(kvm, gfn);
			if (kvm_slot_dirty_track_enabled(slot)) {
				ret = -EFAULT;
				goto out;
			}
		}

		/* Track dirtying of writeable pages */
		new = kvm_pte_mkdirty(new);
	}

	changed = new ^ (*ptep);
	if (changed)
		kvm_set_pte(ptep, new);

	spin_unlock(&kvm->mmu_lock);

	if (kvm_pte_dirty(changed))
		mark_page_dirty(kvm, gfn);

	return ret;
out:
	spin_unlock(&kvm->mmu_lock);
	return ret;
}
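/*
 * To make the fast-path update above concrete: for a write fault on a clean,
 * writeable small page, new = kvm_pte_mkdirty(kvm_pte_mkyoung(*ptep)), so
 * changed = new ^ *ptep carries exactly the young and dirty bits that were
 * previously clear. The pte is rewritten only when changed is non-zero, and
 * mark_page_dirty() is called after mmu_lock is dropped, keyed on the dirty
 * bit in changed rather than in new.
 */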
static bool fault_supports_huge_mapping(struct kvm_memory_slot *memslot,
				unsigned long hva, bool write)
{
	hva_t start, end;

	/* Disable dirty logging on HugePages */
	if (kvm_slot_dirty_track_enabled(memslot) && write)
		return false;

	if (kvm_hugepage_capable(memslot))
		return true;

	if (kvm_hugepage_incapable(memslot))
		return false;

	start = memslot->userspace_addr;
	end = start + memslot->npages * PAGE_SIZE;

	/*
	 * Next, let's make sure we're not trying to map anything not covered
	 * by the memslot. This means we have to prohibit block size mappings
	 * for the beginning and end of a non-block aligned and non-block sized
	 * memory slot (illustrated by the head and tail parts of the
	 * userspace view above containing pages 'abcde' and 'xyz',
	 * respectively).
	 *
	 * Note that it doesn't matter if we do the check using the
	 * userspace_addr or the base_gfn, as both are equally aligned (per
	 * the check above) and equally sized.
	 */
	return (hva >= ALIGN(start, PMD_SIZE)) && (hva < ALIGN_DOWN(end, PMD_SIZE));
}
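/*
 * Worked example for the range check above (illustrative numbers, with
 * PMD_SIZE = 2 MiB): for start = 0x7f0000001000 and end = 0x7f00005ff000,
 * ALIGN(start, PMD_SIZE) = 0x7f0000200000 and
 * ALIGN_DOWN(end, PMD_SIZE) = 0x7f0000400000, so only faults whose hva falls
 * in [0x7f0000200000, 0x7f0000400000) may use a PMD-sized block; the partial
 * head and tail of the slot are always mapped with base page size.
 */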
/*
 * Lookup the mapping level for @gfn in the current mm.
 *
 * WARNING! Use of host_pfn_mapping_level() requires the caller and the end
 * consumer to be tied into KVM's handlers for MMU notifier events!
 *
 * There are several ways to safely use this helper:
 *
 * - Check mmu_invalidate_retry_gfn() after grabbing the mapping level, before
 *   consuming it. In this case, mmu_lock doesn't need to be held during the
 *   lookup, but it does need to be held while checking the MMU notifier.
 *
 * - Hold mmu_lock AND ensure there is no in-progress MMU notifier invalidation
 *   event for the hva. This can be done by explicitly checking the MMU notifier
 *   or by ensuring that KVM already has a valid mapping that covers the hva.
 *
 * - Do not use the result to install new mappings, e.g. use the host mapping
 *   level only to decide whether or not to zap an entry. In this case, it's
 *   not required to hold mmu_lock (though it's highly likely the caller will
 *   want to hold mmu_lock anyways, e.g. to modify SPTEs).
 *
 * Note! The lookup can still race with modifications to host page tables, but
 * the above "rules" ensure KVM will not _consume_ the result of the walk if a
 * race with the primary MMU occurs.
 */
static int host_pfn_mapping_level(struct kvm *kvm, gfn_t gfn,
				const struct kvm_memory_slot *slot)
{
	int level = 0;
	unsigned long hva;
	unsigned long flags;
	pgd_t pgd;
	p4d_t p4d;
	pud_t pud;
	pmd_t pmd;

	/*
	 * Note, using the already-retrieved memslot and __gfn_to_hva_memslot()
	 * is not solely for performance, it's also necessary to avoid the
	 * "writable" check in __gfn_to_hva_many(), which will always fail on
	 * read-only memslots due to gfn_to_hva() assuming writes. Earlier
	 * page fault steps have already verified the guest isn't writing a
	 * read-only memslot.
	 */
	hva = __gfn_to_hva_memslot(slot, gfn);

	/*
	 * Disable IRQs to prevent concurrent tear down of host page tables,
	 * e.g. if the primary MMU promotes a P*D to a huge page and then frees
	 * the original page table.
	 */
	local_irq_save(flags);

	/*
	 * Read each entry once. As above, a non-leaf entry can be promoted to
	 * a huge page _during_ this walk. Re-reading the entry could send the
	 * walk into the weeds, e.g. p*d_leaf() returns false (sees the old
	 * value) and then p*d_offset() walks into the target huge page instead
	 * of the old page table (sees the new value).
	 */
	pgd = pgdp_get(pgd_offset(kvm->mm, hva));
	if (pgd_none(pgd))
		goto out;

	p4d = p4dp_get(p4d_offset(&pgd, hva));
	if (p4d_none(p4d) || !p4d_present(p4d))
		goto out;

	pud = pudp_get(pud_offset(&p4d, hva));
	if (pud_none(pud) || !pud_present(pud))
		goto out;
	pmd = pmdp_get(pmd_offset(&pud, hva));
	if (pmd_none(pmd) || !pmd_present(pmd))
		goto out;

	if (kvm_pte_huge(pmd_val(pmd)))
		level = 1;
out:
	local_irq_restore(flags);
	return level;
}
static kvm_pte_t *kvm_split_huge(struct kvm_vcpu *vcpu, kvm_pte_t *ptep, gfn_t gfn)
{
	int i;
	kvm_pte_t val, *child;
	struct kvm_mmu_memory_cache *memcache;

	memcache = &vcpu->arch.mmu_page_cache;
	child = kvm_mmu_memory_cache_alloc(memcache);
	val = kvm_pte_mksmall(*ptep);
	for (i = 0; i < PTRS_PER_PTE; i++) {
		kvm_set_pte(child + i, val);
		val += PAGE_SIZE;
	}

	smp_wmb(); /* Make pte visible before pmd */
	/* The later kvm_flush_tlb_gpa() will flush hugepage tlb */
	kvm_set_pte(ptep, __pa(child));

	return child;
}
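/*
 * Net effect of the split above: the single huge mapping is replaced by
 * PTRS_PER_PTE small entries whose target addresses step by PAGE_SIZE, so the
 * GPA-to-HPA translation is unchanged. The smp_wmb() orders the writes that
 * fill the child table before the parent entry is pointed at __pa(child), so a
 * concurrent walker that sees the new parent entry also sees fully initialised
 * child entries.
 */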
/*
 * kvm_map_page() - Map a guest physical page.
 * @vcpu: vCPU pointer.
 * @gpa: Guest physical address of fault.
 * @write: Whether the fault was due to a write.
 *
 * Handle GPA faults by creating a new GPA mapping (or updating an existing
 * one).
 *
 * This takes care of marking pages young or dirty (idle/dirty page tracking),
 * asking KVM for the corresponding PFN, and creating a mapping in the GPA page
 * tables. Derived mappings (GVA page tables and TLBs) must be handled by the
 * caller.
 *
 * Returns:	0 on success
 *		-EFAULT if there is no memory region at @gpa or a write was
 *		attempted to a read-only memory region. This is usually handled
 *		as an MMIO access.
 */
static int kvm_map_page(struct kvm_vcpu *vcpu, unsigned long gpa, bool write)
{
	bool writeable;
	int srcu_idx, err, retry_no = 0, level;
	unsigned long hva, mmu_seq, prot_bits;
	kvm_pfn_t pfn;
	kvm_pte_t *ptep, new_pte;
	gfn_t gfn = gpa >> PAGE_SHIFT;
	struct kvm *kvm = vcpu->kvm;
	struct kvm_memory_slot *memslot;
	struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache;
	struct page *page;

	/* Try the fast path to handle old / clean pages */
	srcu_idx = srcu_read_lock(&kvm->srcu);
	err = kvm_map_page_fast(vcpu, gpa, write);
	if (!err)
		goto out;

	memslot = gfn_to_memslot(kvm, gfn);
	hva = gfn_to_hva_memslot_prot(memslot, gfn, &writeable);
	if (kvm_is_error_hva(hva) || (write && !writeable)) {
		err = -EFAULT;
		goto out;
	}

	/* We need a minimum of cached pages ready for page table creation */
	err = kvm_mmu_topup_memory_cache(memcache, KVM_MMU_CACHE_MIN_PAGES);
	if (err)
		goto out;

retry:
	/*
	 * Used to check for invalidations in progress, of the pfn that is
	 * returned by kvm_faultin_pfn() below.
	 */
	mmu_seq = kvm->mmu_invalidate_seq;
	/*
	 * Ensure the read of mmu_invalidate_seq isn't reordered with PTE reads in
	 * kvm_faultin_pfn() (which calls get_user_pages()), so that we don't
	 * risk the page we get a reference to getting unmapped before we have a
	 * chance to grab the mmu_lock without mmu_invalidate_retry() noticing.
	 *
	 * This smp_rmb() pairs with the effective smp_wmb() of the combination
	 * of the pte_unmap_unlock() after the PTE is zapped, and the
	 * spin_lock() in kvm_mmu_notifier_invalidate_<page|range_end>() before
	 * mmu_invalidate_seq is incremented.
	 */
	smp_rmb();

	/* Slow path - ask KVM core whether we can access this GPA */
	pfn = kvm_faultin_pfn(vcpu, gfn, write, &writeable, &page);
	if (is_error_noslot_pfn(pfn)) {
		err = -EFAULT;
		goto out;
	}

	/* Check if an invalidation has taken place since we got pfn */
	spin_lock(&kvm->mmu_lock);
	if (mmu_invalidate_retry_gfn(kvm, mmu_seq, gfn)) {
		/*
		 * This can happen when mappings are changed asynchronously, but
		 * also synchronously if a COW is triggered by
		 * kvm_faultin_pfn().
		 */
		spin_unlock(&kvm->mmu_lock);
		kvm_release_page_unused(page);
		if (retry_no > 100) {
			retry_no = 0;
			schedule();
		}
		retry_no++;
		goto retry;
	}

	/*
	 * For emulated devices such as virtio devices, the actual cache
	 * attribute is determined by the physical machine.
	 * For a passed-through physical device, it should be uncacheable.
	 */
	prot_bits = _PAGE_PRESENT | __READABLE;
	if (pfn_valid(pfn))
		prot_bits |= _CACHE_CC;
	else
		prot_bits |= _CACHE_SUC;

	if (writeable) {
		prot_bits = kvm_pte_mkwriteable(prot_bits);
		if (write)
			prot_bits = kvm_pte_mkdirty(prot_bits);
	}

	/* Disable dirty logging on HugePages */
	level = 0;
	if (fault_supports_huge_mapping(memslot, hva, write)) {
		/* Check page level about host mmu */
		level = host_pfn_mapping_level(kvm, gfn, memslot);
		if (level == 1) {
			/*
			 * Check page level about secondary mmu.
			 * Disable hugepage if it is already a normal page on
			 * the secondary mmu.
			 */
			ptep = kvm_populate_gpa(kvm, NULL, gpa, 0);
			if (ptep && !kvm_pte_huge(*ptep))
				level = 0;
		}
int kvm_handle_mm_fault(struct kvm_vcpu *vcpu, unsigned long gpa, bool write, int ecode)
{
	int ret;

	ret = kvm_map_page(vcpu, gpa, write);
	if (ret)
		return ret;

	/* Invalidate this entry in the TLB */
	if (!cpu_has_ptw || (ecode == EXCCODE_TLBM)) {
		/*
		 * With HW PTW, an invalid TLB entry is not added on a page
		 * fault. But for an EXCCODE_TLBM exception, a stale TLB entry
		 * may exist because of the previous read access.
		 *
		 * With SW PTW, an invalid TLB entry is added in the TLB refill
		 * exception.
		 */
		vcpu->arch.flush_gpa = gpa;
		kvm_make_request(KVM_REQ_TLB_FLUSH_GPA, vcpu);
	}

	return 0;
}
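/*
 * Illustration only (assumed consumer shape, normally living in the vCPU run
 * path rather than in this file): the KVM_REQ_TLB_FLUSH_GPA request queued
 * above is serviced before the next guest entry. The helper name
 * kvm_flush_tlb_gpa(), the INVALID_GPA sentinel and the function name below
 * are assumptions for this sketch.
 */
static void kvm_service_flush_gpa_request(struct kvm_vcpu *vcpu)
{
	if (kvm_check_request(KVM_REQ_TLB_FLUSH_GPA, vcpu)) {
		if (vcpu->arch.flush_gpa != INVALID_GPA) {
			kvm_flush_tlb_gpa(vcpu, vcpu->arch.flush_gpa);
			vcpu->arch.flush_gpa = INVALID_GPA;
		}
	}
}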