// SPDX-License-Identifier: GPL-2.0-only /* * Stand-alone page-table allocator for hyp stage-1 and guest stage-2. * No bombay mix was harmed in the writing of this file. * * Copyright (C) 2020 Google LLC * Author: Will Deacon <will@kernel.org>
*/
/* Ensure the appropriate lock is held (e.g. RCU lock for stage-2 MMU) */
WARN_ON_ONCE(kvm_pgtable_walk_shared(ctx) && !kvm_pgtable_walk_lock_held()); return walker->cb(ctx, visit);
}
staticbool kvm_pgtable_walk_continue(conststruct kvm_pgtable_walker *walker, int r)
{ /* * Visitor callbacks return EAGAIN when the conditions that led to a * fault are no longer reflected in the page tables due to a race to * update a PTE. In the context of a fault handler this is interpreted * as a signal to retry guest execution. * * Ignore the return code altogether for walkers outside a fault handler * (e.g. write protecting a range of memory) and chug along with the * page table walk.
*/ if (r == -EAGAIN) return !(walker->flags & KVM_PGTABLE_WALK_HANDLE_FAULT);
if (table && (ctx.flags & KVM_PGTABLE_WALK_TABLE_PRE)) {
ret = kvm_pgtable_visitor_cb(data, &ctx, KVM_PGTABLE_WALK_TABLE_PRE);
reload = true;
}
if (!table && (ctx.flags & KVM_PGTABLE_WALK_LEAF)) {
ret = kvm_pgtable_visitor_cb(data, &ctx, KVM_PGTABLE_WALK_LEAF);
reload = true;
}
/* * Reload the page table after invoking the walker callback for leaf * entries or after pre-order traversal, to allow the walker to descend * into a newly installed or replaced table.
*/ if (reload) {
ctx.old = READ_ONCE(*ptep);
table = kvm_pte_table(ctx.old, level);
}
if (!kvm_pgtable_walk_continue(data->walker, ret)) goto out;
if (!(pte & KVM_PTE_LEAF_ATTR_HI_S1_XN))
prot |= KVM_PGTABLE_PROT_X;
ap = FIELD_GET(KVM_PTE_LEAF_ATTR_LO_S1_AP, pte); if (ap == KVM_PTE_LEAF_ATTR_LO_S1_AP_RO)
prot |= KVM_PGTABLE_PROT_R; elseif (ap == KVM_PTE_LEAF_ATTR_LO_S1_AP_RW)
prot |= KVM_PGTABLE_PROT_RW;
vtcr |= kvm_get_parange(mmfr0) << VTCR_EL2_PS_SHIFT;
vtcr |= VTCR_EL2_T0SZ(phys_shift); /* * Use a minimum 2 level page table to prevent splitting * host PMD huge pages at stage2.
*/
lvls = stage2_pgtable_levels(phys_shift); if (lvls < 2)
lvls = 2;
/* * When LPA2 is enabled, the HW supports an extra level of translation * (for 5 in total) when using 4K pages. It also introduces VTCR_EL2.SL2 * to as an addition to SL0 to enable encoding this extra start level. * However, since we always use concatenated pages for the first level * lookup, we will never need this extra level and therefore do not need * to touch SL2.
*/
vtcr |= VTCR_EL2_LVLS_TO_SL0(lvls);
#ifdef CONFIG_ARM64_HW_AFDBM /* * Enable the Hardware Access Flag management, unconditionally * on all CPUs. In systems that have asymmetric support for the feature * this allows KVM to leverage hardware support on the subset of cores * that implement the feature. * * The architecture requires VTCR_EL2.HA to be RES0 (thus ignored by * hardware) on implementations that do not advertise support for the * feature. As such, setting HA unconditionally is safe, unless you * happen to be running on a design that has unadvertised support for * HAFDBS. Here be dragons.
*/ if (!cpus_have_final_cap(ARM64_WORKAROUND_AMPERE_AC03_CPU_38))
vtcr |= VTCR_EL2_HA; #endif/* CONFIG_ARM64_HW_AFDBM */
if (kvm_lpa2_is_enabled())
vtcr |= VTCR_EL2_DS;
/* Set the vmid bits */
vtcr |= (get_vmid_bits(mmfr1) == 16) ?
VTCR_EL2_VS_16BIT :
VTCR_EL2_VS_8BIT;
return vtcr;
}
staticbool stage2_has_fwb(struct kvm_pgtable *pgt)
{ if (!cpus_have_final_cap(ARM64_HAS_STAGE2_FWB)) returnfalse;
staticbool stage2_pte_is_counted(kvm_pte_t pte)
{ /* * The refcount tracks valid entries as well as invalid entries if they * encode ownership of a page to another entity than the page-table * owner, whose id is 0.
*/ return !!pte;
}
/** * stage2_try_break_pte() - Invalidates a pte according to the * 'break-before-make' requirements of the * architecture. * * @ctx: context of the visited pte. * @mmu: stage-2 mmu * * Returns: true if the pte was successfully broken. * * If the removed pte was valid, performs the necessary serialization and TLB * invalidation for the old value. For counted ptes, drops the reference count * on the containing table page.
*/ staticbool stage2_try_break_pte(conststruct kvm_pgtable_visit_ctx *ctx, struct kvm_s2_mmu *mmu)
{ struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops;
if (stage2_pte_is_locked(ctx->old)) { /* * Should never occur if this walker has exclusive access to the * page tables.
*/
WARN_ON(!kvm_pgtable_walk_shared(ctx)); returnfalse;
}
if (!stage2_try_set_pte(ctx, KVM_INVALID_PTE_LOCKED)) returnfalse;
if (!kvm_pgtable_walk_skip_bbm_tlbi(ctx)) { /* * Perform the appropriate TLB invalidation based on the * evicted pte value (if any).
*/ if (kvm_pte_table(ctx->old, ctx->level)) {
u64 size = kvm_granule_size(ctx->level);
u64 addr = ALIGN_DOWN(ctx->addr, size);
if (stage2_pte_is_counted(new))
mm_ops->get_page(ctx->ptep);
smp_store_release(ctx->ptep, new);
}
staticbool stage2_unmap_defer_tlb_flush(struct kvm_pgtable *pgt)
{ /* * If FEAT_TLBIRANGE is implemented, defer the individual * TLB invalidations until the entire walk is finished, and * then use the range-based TLBI instructions to do the * invalidations. Condition deferred TLB invalidation on the * system supporting FWB as the optimization is entirely * pointless when the unmap walker needs to perform CMOs.
*/ return system_supports_tlb_range() && stage2_has_fwb(pgt);
}
/* * Clear the existing PTE, and perform break-before-make if it was * valid. Depending on the system support, defer the TLB maintenance * for the same until the entire unmap walk is completed.
*/ if (kvm_pte_valid(ctx->old)) {
kvm_clear_pte(ctx->ptep);
if (!stage2_leaf_mapping_allowed(ctx, data)) return -E2BIG;
if (!data->annotation) new = kvm_init_valid_leaf_pte(phys, data->attr, ctx->level); else new = kvm_init_invalid_leaf_owner(data->owner_id);
/* * Skip updating the PTE if we are trying to recreate the exact * same mapping or only change the access permissions. Instead, * the vCPU will exit one more time from guest if still needed * and then go through the path of relaxing permissions.
*/ if (!stage2_pte_needs_update(ctx->old, new)) return -EAGAIN;
/* If we're only changing software bits, then store them and go! */ if (!kvm_pgtable_walk_shared(ctx) &&
!((ctx->old ^ new) & ~KVM_PTE_LEAF_ATTR_HI_SW)) { bool old_is_counted = stage2_pte_is_counted(ctx->old);
if (old_is_counted != stage2_pte_is_counted(new)) { if (old_is_counted)
mm_ops->put_page(ctx->ptep); else
mm_ops->get_page(ctx->ptep);
}
WARN_ON_ONCE(!stage2_try_set_pte(ctx, new)); return 0;
}
if (!stage2_try_break_pte(ctx, data->mmu)) return -EAGAIN;
/* Perform CMOs before installation of the guest stage-2 PTE */ if (!kvm_pgtable_walk_skip_cmo(ctx) && mm_ops->dcache_clean_inval_poc &&
stage2_pte_cacheable(pgt, new))
mm_ops->dcache_clean_inval_poc(kvm_pte_follow(new, mm_ops),
granule);
if (!kvm_pgtable_walk_skip_cmo(ctx) && mm_ops->icache_inval_pou &&
stage2_pte_executable(new))
mm_ops->icache_inval_pou(kvm_pte_follow(new, mm_ops), granule);
ret = stage2_map_walker_try_leaf(ctx, data); if (ret != -E2BIG) return ret;
if (WARN_ON(ctx->level == KVM_PGTABLE_LAST_LEVEL)) return -EINVAL;
if (!data->memcache) return -ENOMEM;
childp = mm_ops->zalloc_page(data->memcache); if (!childp) return -ENOMEM;
if (!stage2_try_break_pte(ctx, data->mmu)) {
mm_ops->put_page(childp); return -EAGAIN;
}
/* * If we've run into an existing block mapping then replace it with * a table. Accesses beyond 'end' that fall within the new table * will be mapped lazily.
*/ new = kvm_init_table_pte(childp, mm_ops);
stage2_make_pte(ctx, new);
return 0;
}
/* * The TABLE_PRE callback runs for table entries on the way down, looking * for table entries which we could conceivably replace with a block entry * for this mapping. If it finds one it replaces the entry and calls * kvm_pgtable_mm_ops::free_unlinked_table() to tear down the detached table. * * Otherwise, the LEAF callback performs the mapping at the existing leaves * instead.
*/ staticint stage2_map_walker(conststruct kvm_pgtable_visit_ctx *ctx, enum kvm_pgtable_walk_flags visit)
{ struct stage2_map_data *data = ctx->arg;
switch (visit) { case KVM_PGTABLE_WALK_TABLE_PRE: return stage2_map_walk_table_pre(ctx, data); case KVM_PGTABLE_WALK_LEAF: return stage2_map_walk_leaf(ctx, data); default: return -EINVAL;
}
}
/* * This is similar to the map() path in that we unmap the entire * block entry and rely on the remaining portions being faulted * back lazily.
*/
stage2_unmap_put_pte(ctx, mmu, mm_ops);
if (need_flush && mm_ops->dcache_clean_inval_poc)
mm_ops->dcache_clean_inval_poc(kvm_pte_follow(ctx->old, mm_ops),
kvm_granule_size(ctx->level));
/* * We may race with the CPU trying to set the access flag here, * but worst-case the access flag update gets lost and will be * set on the next access instead.
*/ if (data->pte != pte) { /* * Invalidate instruction cache before updating the guest * stage-2 PTE if we are going to add executable permission.
*/ if (mm_ops->icache_inval_pou &&
stage2_pte_executable(pte) && !stage2_pte_executable(ctx->old))
mm_ops->icache_inval_pou(kvm_pte_follow(pte, mm_ops),
kvm_granule_size(ctx->level));
if (!stage2_try_set_pte(ctx, pte)) return -EAGAIN;
}
if (!kvm_pte_valid(ctx->old) || new == ctx->old) return 0;
data->young = true;
/* * stage2_age_walker() is always called while holding the MMU lock for * write, so this will always succeed. Nonetheless, this deliberately * follows the race detection pattern of the other stage-2 walkers in * case the locking mechanics of the MMU notifiers is ever changed.
*/ if (data->mkold && !stage2_try_set_pte(ctx, new)) return -EAGAIN;
/* * "But where's the TLBI?!", you scream. * "Over in the core code", I sigh. * * See the '->clear_flush_young()' callback on the KVM mmu notifier.
*/ return 0;
}
kvm_pte_t *kvm_pgtable_stage2_create_unlinked(struct kvm_pgtable *pgt,
u64 phys, s8 level, enum kvm_pgtable_prot prot, void *mc, bool force_pte)
{ struct stage2_map_data map_data = {
.phys = phys,
.mmu = pgt->mmu,
.memcache = mc,
.force_pte = force_pte,
}; struct kvm_pgtable_walker walker = {
.cb = stage2_map_walker,
.flags = KVM_PGTABLE_WALK_LEAF |
KVM_PGTABLE_WALK_SKIP_BBM_TLBI |
KVM_PGTABLE_WALK_SKIP_CMO,
.arg = &map_data,
}; /* * The input address (.addr) is irrelevant for walking an * unlinked table. Construct an ambiguous IA range to map * kvm_granule_size(level) worth of memory.
*/ struct kvm_pgtable_walk_data data = {
.walker = &walker,
.addr = 0,
.end = kvm_granule_size(level),
}; struct kvm_pgtable_mm_ops *mm_ops = pgt->mm_ops;
kvm_pte_t *pgtable; int ret;
if (!IS_ALIGNED(phys, kvm_granule_size(level))) return ERR_PTR(-EINVAL);
ret = stage2_set_prot_attr(pgt, prot, &map_data.attr); if (ret) return ERR_PTR(ret);
pgtable = mm_ops->zalloc_page(mc); if (!pgtable) return ERR_PTR(-ENOMEM);
ret = __kvm_pgtable_walk(&data, mm_ops, (kvm_pteref_t)pgtable,
level + 1); if (ret) {
kvm_pgtable_stage2_free_unlinked(mm_ops, pgtable, level); return ERR_PTR(ret);
}
return pgtable;
}
/* * Get the number of page-tables needed to replace a block with a * fully populated tree up to the PTE entries. Note that @level is * interpreted as in "level @level entry".
*/ staticint stage2_block_get_nr_page_tables(s8 level)
{ switch (level) { case 1: return PTRS_PER_PTE + 1; case 2: return 1; case 3: return 0; default:
WARN_ON_ONCE(level < KVM_PGTABLE_MIN_BLOCK_LEVEL ||
level > KVM_PGTABLE_LAST_LEVEL); return -EINVAL;
};
}
/* No huge-pages exist at the last level */ if (level == KVM_PGTABLE_LAST_LEVEL) return 0;
/* We only split valid block mappings */ if (!kvm_pte_valid(pte)) return 0;
nr_pages = stage2_block_get_nr_page_tables(level); if (nr_pages < 0) return nr_pages;
if (mc->nobjs >= nr_pages) { /* Build a tree mapped down to the PTE granularity. */
force_pte = true;
} else { /* * Don't force PTEs, so create_unlinked() below does * not populate the tree up to the PTE level. The * consequence is that the call will require a single * page of level 2 entries at level 1, or a single * page of PTEs at level 2. If we are at level 1, the * PTEs will be created recursively.
*/
force_pte = false;
nr_pages = 1;
}
if (!stage2_try_break_pte(ctx, mmu)) {
kvm_pgtable_stage2_free_unlinked(mm_ops, childp, level); return -EAGAIN;
}
/* * Note, the contents of the page table are guaranteed to be made * visible before the new PTE is assigned because stage2_make_pte() * writes the PTE using smp_store_release().
*/ new = kvm_init_table_pte(childp, mm_ops);
stage2_make_pte(ctx, new); return 0;
}
/* * At this point the IPA really doesn't matter, as the page * table being traversed has already been removed from the stage * 2. Set an appropriate range to cover the entire page table.
*/
.addr = 0,
.end = kvm_granule_size(level),
};
¤ Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereit gestellten Informationen zugesichert.0.6Bemerkung:
(vorverarbeitet)
¤
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereit gestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.