/*
 * Supported radix tree geometry.
 * Like p9, we support either 5 or 9 bits at the first (lowest) level,
 * for a page size of 64k or 4k.
 */
static int p9_supported_radix_bits[4] = { 5, 9, 9, 13 };
/*
 * Copy between the host and a guest's quadrant-mapped address space.
 *
 * @lpid/@pid:	guest partition/process to access
 * @eaddr:	guest effective address (must have the quadrant/top bits clear)
 * @to/@from:	exactly one is a host buffer, the other NULL; a non-NULL @to
 *		means a load from the guest, a non-NULL @from means a store
 * @n:		number of bytes to copy
 *
 * Returns the number of bytes NOT copied (0 on full success), or
 * H_UNSUPPORTED for nested-v2 guests.
 *
 * The copy is performed by temporarily switching SPRN_LPID (and, for
 * quadrant 1, SPRN_PID) to the guest's values and accessing the guest
 * mapping through quadrant 1 or 2 addresses, so preemption is disabled
 * for the duration and page faults are handled inatomically.
 */
unsigned long __kvmhv_copy_tofrom_guest_radix(int lpid, int pid,
					      gva_t eaddr, void *to, void *from,
					      unsigned long n)
{
	int old_pid, old_lpid;
	unsigned long quadrant, ret = n;
	bool is_load = !!to;

	if (kvmhv_is_nestedv2())
		return H_UNSUPPORTED;

	/* Can't access quadrants 1 or 2 in non-HV mode, call the HV to do it */
	if (kvmhv_on_pseries())
		return plpar_hcall_norets(H_COPY_TOFROM_GUEST, lpid, pid, eaddr,
					  (to != NULL) ? __pa(to) : 0,
					  (from != NULL) ? __pa(from) : 0, n);

	if (eaddr & (0xFFFUL << 52))
		return ret;

	quadrant = 1;
	if (!pid)
		quadrant = 2;
	if (is_load)
		from = (void *) (eaddr | (quadrant << 62));
	else
		to = (void *) (eaddr | (quadrant << 62));

	preempt_disable();

	asm volatile("hwsync" ::: "memory");
	isync();
	/* switch the lpid first to avoid running host with unallocated pid */
	old_lpid = mfspr(SPRN_LPID);
	if (old_lpid != lpid)
		mtspr(SPRN_LPID, lpid);
	if (quadrant == 1) {
		old_pid = mfspr(SPRN_PID);
		if (old_pid != pid)
			mtspr(SPRN_PID, pid);
	}
	isync();

	pagefault_disable();
	if (is_load)
		ret = __copy_from_user_inatomic(to, (const void __user *)from, n);
	else
		ret = __copy_to_user_inatomic((void __user *)to, from, n);
	pagefault_enable();

	asm volatile("hwsync" ::: "memory");
	isync();
	/* switch the pid first to avoid running host with unallocated pid */
	if (quadrant == 1 && pid != old_pid)
		mtspr(SPRN_PID, old_pid);
	if (lpid != old_lpid)
		mtspr(SPRN_LPID, old_lpid);
	isync();

	preempt_enable();

	return ret;
}
/*
 * Validate and normalise a guest effective address, work out the lpid/pid
 * to use (nested shadow lpid if applicable, pid 0 for quadrant 3), then
 * hand off to __kvmhv_copy_tofrom_guest_radix.
 *
 * Returns bytes not copied (0 on success) or -EINVAL for an address that
 * would fault in the segment-check bits.
 */
static long kvmhv_copy_tofrom_guest_radix(struct kvm_vcpu *vcpu, gva_t eaddr,
					  void *to, void *from, unsigned long n)
{
	int lpid = vcpu->kvm->arch.lpid;
	int pid;

	/* This would cause a data segment intr so don't allow the access */
	if (eaddr & (0x3FFUL << 52))
		return -EINVAL;

	/* Should we be using the nested lpid */
	if (vcpu->arch.nested)
		lpid = vcpu->arch.nested->shadow_lpid;

	/* If accessing quadrant 3 then pid is expected to be 0 */
	if (((eaddr >> 62) & 0x3) == 0x3)
		pid = 0;
	else
		pid = kvmppc_get_pid(vcpu);

	eaddr &= ~(0xFFFUL << 52);

	return __kvmhv_copy_tofrom_guest_radix(lpid, pid, eaddr, to, from, n);
}
long kvmhv_copy_from_guest_radix(struct kvm_vcpu *vcpu, gva_t eaddr, void *to, unsignedlong n)
{ long ret;
ret = kvmhv_copy_tofrom_guest_radix(vcpu, eaddr, to, NULL, n); if (ret > 0)
memset(to + (n - ret), 0, ret);
/* Current implementations only support 52-bit space */ if (offset != 52) return -EINVAL;
/* Walk each level of the radix tree */ for (level = 3; level >= 0; --level) {
u64 addr; /* Check a valid size */ if (level && bits != p9_supported_radix_bits[level]) return -EINVAL; if (level == 0 && !(bits == 5 || bits == 9)) return -EINVAL;
offset -= bits;
index = (eaddr >> offset) & ((1UL << bits) - 1); /* Check that low bits of page table base are zero */ if (base & ((1UL << (bits + 3)) - 1)) return -EINVAL; /* Read the entry from guest memory */
addr = base + (index * sizeof(rpte));
kvm_vcpu_srcu_read_lock(vcpu);
ret = kvm_read_guest(kvm, addr, &rpte, sizeof(rpte));
kvm_vcpu_srcu_read_unlock(vcpu); if (ret) { if (pte_ret_p)
*pte_ret_p = addr; return ret;
}
pte = __be64_to_cpu(rpte); if (!(pte & _PAGE_PRESENT)) return -ENOENT; /* Check if a leaf entry */ if (pte & _PAGE_PTE) break; /* Get ready to walk the next level */
base = pte & RPDB_MASK;
bits = pte & RPDS_MASK;
}
/* Need a leaf at lowest level; 512GB pages not supported */ if (level < 0 || level == 3) return -EINVAL;
/* We found a valid leaf PTE */ /* Offset is now log base 2 of the page size */
gpa = pte & 0x01fffffffffff000ul; if (gpa & ((1ul << offset) - 1)) return -EINVAL;
gpa |= eaddr & ((1ul << offset) - 1); for (ps = MMU_PAGE_4K; ps < MMU_PAGE_COUNT; ++ps) if (offset == mmu_psize_defs[ps].shift) break;
gpte->page_size = ps;
gpte->page_shift = offset;
gpte->eaddr = eaddr;
gpte->raddr = gpa;
/* Work out permissions */
gpte->may_read = !!(pte & _PAGE_READ);
gpte->may_write = !!(pte & _PAGE_WRITE);
gpte->may_execute = !!(pte & _PAGE_EXEC);
gpte->rc = pte & (_PAGE_ACCESSED | _PAGE_DIRTY);
if (pte_ret_p)
*pte_ret_p = pte;
return 0;
}
/* * Used to walk a partition or process table radix tree in guest memory * Note: We exploit the fact that a partition table and a process * table have the same layout, a partition-scoped page table and a * process-scoped page table have the same layout, and the 2nd * doubleword of a partition table entry has the same layout as * the PTCR register.
*/ int kvmppc_mmu_radix_translate_table(struct kvm_vcpu *vcpu, gva_t eaddr, struct kvmppc_pte *gpte, u64 table, int table_index, u64 *pte_ret_p)
{ struct kvm *kvm = vcpu->kvm; int ret; unsignedlong size, ptbl, root; struct prtb_entry entry;
/* Is the table big enough to contain this entry? */ if ((table_index * sizeof(entry)) >= size) return -EINVAL;
/* Read the table to find the root of the radix tree */
ptbl = (table & PRTB_MASK) + (table_index * sizeof(entry));
kvm_vcpu_srcu_read_lock(vcpu);
ret = kvm_read_guest(kvm, ptbl, &entry, sizeof(entry));
kvm_vcpu_srcu_read_unlock(vcpu); if (ret) return ret;
/* Root is stored in the first double word */
root = be64_to_cpu(entry.prtb0);
if ((old & _PAGE_DIRTY) && memslot->dirty_bitmap)
kvmppc_update_dirty_map(memslot, gfn, page_size);
}
/* * kvmppc_free_p?d are used to free existing page tables, and recursively * descend and clear and free children. * Callers are responsible for flushing the PWC. * * When page tables are being unmapped/freed as part of page fault path * (full == false), valid ptes are generally not expected; however, there * is one situation where they arise, which is when dirty page logging is * turned off for a memslot while the VM is running. The new memslot * becomes visible to page faults before the memslot commit function * gets to flush the memslot, which can lead to a 2MB page mapping being * installed for a guest physical address where there are already 64kB * (or 4kB) mappings (of sub-pages of the same 2MB page).
*/ staticvoid kvmppc_unmap_free_pte(struct kvm *kvm, pte_t *pte, bool full,
u64 lpid)
{ if (full) {
memset(pte, 0, sizeof(long) << RADIX_PTE_INDEX_SIZE);
} else {
pte_t *p = pte; unsignedlong it;
for (it = 0; it < PTRS_PER_PTE; ++it, ++p) { if (pte_val(*p) == 0) continue;
kvmppc_unmap_pte(kvm, p,
pte_pfn(*p) << PAGE_SHIFT,
PAGE_SHIFT, NULL, lpid);
}
}
/* * Clearing the pmd entry then flushing the PWC ensures that the pte * page no longer be cached by the MMU, so can be freed without * flushing the PWC again.
*/
pmd_clear(pmd);
kvmppc_radix_flush_pwc(kvm, lpid);
/* * Clearing the pud entry then flushing the PWC ensures that the pmd * page and any children pte pages will no longer be cached by the MMU, * so can be freed without flushing the PWC again.
*/
pud_clear(pud);
kvmppc_radix_flush_pwc(kvm, lpid);
kvmppc_unmap_free_pmd(kvm, pmd, false, lpid);
}
/*
 * There are a number of bits which may differ between different faults to
 * the same partition scope entry. RC bits, in the course of cleaning and
 * aging. And the write bit can change, either the access could have been
 * upgraded, or a read fault could happen concurrently with a write fault
 * that sets those bits first.
 */
#define PTE_BITS_MUST_MATCH (~(_PAGE_WRITE | _PAGE_DIRTY | _PAGE_ACCESSED))
/* Check if we might have been invalidated; let the guest retry if so */
spin_lock(&kvm->mmu_lock);
ret = -EAGAIN; if (mmu_invalidate_retry(kvm, mmu_seq)) goto out_unlock;
/* Now traverse again under the lock and change the tree */
ret = -ENOMEM; if (p4d_none(*p4d)) { if (!new_pud) goto out_unlock;
p4d_populate(kvm->mm, p4d, new_pud);
new_pud = NULL;
}
pud = pud_offset(p4d, gpa); if (pud_leaf(*pud)) { unsignedlong hgpa = gpa & PUD_MASK;
/* Check if we raced and someone else has set the same thing */ if (level == 2) { if (pud_raw(*pud) == pte_raw(pte)) {
ret = 0; goto out_unlock;
} /* Valid 1GB page here already, add our extra bits */
WARN_ON_ONCE((pud_val(*pud) ^ pte_val(pte)) &
PTE_BITS_MUST_MATCH);
kvmppc_radix_update_pte(kvm, (pte_t *)pud,
0, pte_val(pte), hgpa, PUD_SHIFT);
ret = 0; goto out_unlock;
} /* * If we raced with another CPU which has just put * a 1GB pte in after we saw a pmd page, try again.
*/ if (!new_pmd) {
ret = -EAGAIN; goto out_unlock;
} /* Valid 1GB page here already, remove it */
kvmppc_unmap_pte(kvm, (pte_t *)pud, hgpa, PUD_SHIFT, NULL,
lpid);
} if (level == 2) { if (!pud_none(*pud)) { /* * There's a page table page here, but we wanted to * install a large page, so remove and free the page * table page.
*/
kvmppc_unmap_free_pud_entry_table(kvm, pud, gpa, lpid);
}
kvmppc_radix_set_pte_at(kvm, gpa, (pte_t *)pud, pte); if (rmapp && n_rmap)
kvmhv_insert_nest_rmap(kvm, rmapp, n_rmap);
ret = 0; goto out_unlock;
} if (pud_none(*pud)) { if (!new_pmd) goto out_unlock;
pud_populate(kvm->mm, pud, new_pmd);
new_pmd = NULL;
}
pmd = pmd_offset(pud, gpa); if (pmd_leaf(*pmd)) { unsignedlong lgpa = gpa & PMD_MASK;
/* Check if we raced and someone else has set the same thing */ if (level == 1) { if (pmd_raw(*pmd) == pte_raw(pte)) {
ret = 0; goto out_unlock;
} /* Valid 2MB page here already, add our extra bits */
WARN_ON_ONCE((pmd_val(*pmd) ^ pte_val(pte)) &
PTE_BITS_MUST_MATCH);
kvmppc_radix_update_pte(kvm, pmdp_ptep(pmd),
0, pte_val(pte), lgpa, PMD_SHIFT);
ret = 0; goto out_unlock;
}
/* * If we raced with another CPU which has just put * a 2MB pte in after we saw a pte page, try again.
*/ if (!new_ptep) {
ret = -EAGAIN; goto out_unlock;
} /* Valid 2MB page here already, remove it */
kvmppc_unmap_pte(kvm, pmdp_ptep(pmd), lgpa, PMD_SHIFT, NULL,
lpid);
} if (level == 1) { if (!pmd_none(*pmd)) { /* * There's a page table page here, but we wanted to * install a large page, so remove and free the page * table page.
*/
kvmppc_unmap_free_pmd_entry_table(kvm, pmd, gpa, lpid);
}
kvmppc_radix_set_pte_at(kvm, gpa, pmdp_ptep(pmd), pte); if (rmapp && n_rmap)
kvmhv_insert_nest_rmap(kvm, rmapp, n_rmap);
ret = 0; goto out_unlock;
} if (pmd_none(*pmd)) { if (!new_ptep) goto out_unlock;
pmd_populate(kvm->mm, pmd, new_ptep);
new_ptep = NULL;
}
ptep = pte_offset_kernel(pmd, gpa); if (pte_present(*ptep)) { /* Check if someone else set the same thing */ if (pte_raw(*ptep) == pte_raw(pte)) {
ret = 0; goto out_unlock;
} /* Valid page here already, add our extra bits */
WARN_ON_ONCE((pte_val(*ptep) ^ pte_val(pte)) &
PTE_BITS_MUST_MATCH);
kvmppc_radix_update_pte(kvm, ptep, 0, pte_val(pte), gpa, 0);
ret = 0; goto out_unlock;
}
kvmppc_radix_set_pte_at(kvm, gpa, ptep, pte); if (rmapp && n_rmap)
kvmhv_insert_nest_rmap(kvm, rmapp, n_rmap);
ret = 0;
out_unlock:
spin_unlock(&kvm->mmu_lock); if (new_pud)
pud_free(kvm->mm, new_pud); if (new_pmd)
kvmppc_pmd_free(new_pmd); if (new_ptep)
kvmppc_pte_free(new_ptep); return ret;
}
/* * Need to set an R or C bit in the 2nd-level tables; * since we are just helping out the hardware here, * it is sufficient to do what the hardware does.
*/
pgflags = _PAGE_ACCESSED; if (writing)
pgflags |= _PAGE_DIRTY;
/* used to check for invalidations in progress */
mmu_seq = kvm->mmu_invalidate_seq;
smp_rmb();
hva = gfn_to_hva_memslot(memslot, gfn);
pfn = __kvm_faultin_pfn(memslot, gfn, writing ? FOLL_WRITE : 0,
&upgrade_write, &page); if (is_error_noslot_pfn(pfn)) return -EFAULT;
/* * Read the PTE from the process' radix tree and use that * so we get the shift and attribute bits.
*/
spin_lock(&kvm->mmu_lock);
ptep = find_kvm_host_pte(kvm, mmu_seq, hva, &shift);
pte = __pte(0); if (ptep)
pte = READ_ONCE(*ptep);
spin_unlock(&kvm->mmu_lock); /* * If the PTE disappeared temporarily due to a THP * collapse, just return and let the guest try again.
*/ if (!pte_present(pte)) { if (page)
put_page(page); return RESUME_GUEST;
}
/* If we're logging dirty pages, always map single pages */
large_enable = !(memslot->flags & KVM_MEM_LOG_DIRTY_PAGES);
/* Get pte level from shift/size */ if (large_enable && shift == PUD_SHIFT &&
(gpa & (PUD_SIZE - PAGE_SIZE)) ==
(hva & (PUD_SIZE - PAGE_SIZE))) {
level = 2;
} elseif (large_enable && shift == PMD_SHIFT &&
(gpa & (PMD_SIZE - PAGE_SIZE)) ==
(hva & (PMD_SIZE - PAGE_SIZE))) {
level = 1;
} else {
level = 0; if (shift > PAGE_SHIFT) { /* * If the pte maps more than one page, bring over * bits from the virtual address to get the real * address of the specific single page we want.
*/ unsignedlong rpnmask = (1ul << shift) - PAGE_SIZE;
pte = __pte(pte_val(pte) | (hva & rpnmask));
}
}
/* Allocate space in the tree and write the PTE */
ret = kvmppc_create_pte(kvm, kvm->arch.pgtable, pte, gpa, level,
mmu_seq, kvm->arch.lpid, NULL, NULL); if (inserted_pte)
*inserted_pte = pte; if (levelp)
*levelp = level;
if (page) { if (!ret && (pte_val(pte) & _PAGE_WRITE))
set_page_dirty_lock(page);
put_page(page);
}
/* Increment number of large pages if we (successfully) inserted one */ if (!ret) { if (level == 1)
kvm->stat.num_2M_pages++; elseif (level == 2)
kvm->stat.num_1G_pages++;
}
/* Check for unusual errors */ if (dsisr & DSISR_UNSUPP_MMU) {
pr_err("KVM: Got unsupported MMU fault\n"); return -EFAULT;
} if (dsisr & DSISR_BADACCESS) { /* Reflect to the guest as DSI */
pr_err("KVM: Got radix HV page fault with DSISR=%lx\n", dsisr);
kvmppc_core_queue_data_storage(vcpu,
kvmppc_get_msr(vcpu) & SRR1_PREFIXED,
ea, dsisr); return RESUME_GUEST;
}
/* Translate the logical address */
gpa = vcpu->arch.fault_gpa & ~0xfffUL;
gpa &= ~0xF000000000000000ul;
gfn = gpa >> PAGE_SHIFT; if (!(dsisr & DSISR_PRTABLE_FAULT))
gpa |= ea & 0xfff;
if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_DONE) return kvmppc_send_page_to_uv(kvm, gfn);
/* Get the corresponding memslot */
memslot = gfn_to_memslot(kvm, gfn);
/* No memslot means it's an emulated MMIO region */ if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID)) { if (dsisr & (DSISR_PRTABLE_FAULT | DSISR_BADACCESS |
DSISR_SET_RC)) { /* * Bad address in guest page table tree, or other * unusual error - reflect it to the guest as DSI.
*/
kvmppc_core_queue_data_storage(vcpu,
kvmppc_get_msr(vcpu) & SRR1_PREFIXED,
ea, dsisr); return RESUME_GUEST;
} return kvmppc_hv_emulate_mmio(vcpu, gpa, ea, writing);
}
if (memslot->flags & KVM_MEM_READONLY) { if (writing) { /* give the guest a DSI */
kvmppc_core_queue_data_storage(vcpu,
kvmppc_get_msr(vcpu) & SRR1_PREFIXED,
ea, DSISR_ISSTORE | DSISR_PROTFAULT); return RESUME_GUEST;
}
}
/* Failed to set the reference/change bits */ if (dsisr & DSISR_SET_RC) {
spin_lock(&kvm->mmu_lock); if (kvmppc_hv_handle_set_rc(kvm, false, writing,
gpa, kvm->arch.lpid))
dsisr &= ~DSISR_SET_RC;
spin_unlock(&kvm->mmu_lock);
/*
 * Test and clear the dirty bit of the PTE covering one page of a memslot.
 * Returns the number of PAGE_SIZE pages that are dirty (0 or 1 — with
 * dirty logging enabled only small-page PTEs are expected; VM_BUG_ON
 * below asserts shift == 0).
 */
static int kvm_radix_test_clear_dirty(struct kvm *kvm,
				      struct kvm_memory_slot *memslot,
				      int pagenum)
{
	unsigned long gfn = memslot->base_gfn + pagenum;
	unsigned long gpa = gfn << PAGE_SHIFT;
	pte_t *ptep, pte;
	unsigned int shift;
	int ret = 0;
	unsigned long old, *rmapp;

	/* Secure guests manage their own dirty state via the ultravisor */
	if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_DONE)
		return ret;

	/*
	 * For performance reasons we don't hold kvm->mmu_lock while walking the
	 * partition scoped table.
	 */
	ptep = find_kvm_secondary_pte_unlocked(kvm, gpa, &shift);
	if (!ptep)
		return 0;

	pte = READ_ONCE(*ptep);
	if (pte_present(pte) && pte_dirty(pte)) {
		spin_lock(&kvm->mmu_lock);
		/*
		 * Recheck the pte again
		 */
		if (pte_val(pte) != pte_val(*ptep)) {
			/*
			 * We have KVM_MEM_LOG_DIRTY_PAGES enabled. Hence we can
			 * only find PAGE_SIZE pte entries here. We can continue
			 * to use the pte addr returned by above page table
			 * walk.
			 */
			if (!pte_present(*ptep) || !pte_dirty(*ptep)) {
				spin_unlock(&kvm->mmu_lock);
				return 0;
			}
		}

		ret = 1;
		VM_BUG_ON(shift);
		old = kvmppc_radix_update_pte(kvm, ptep, _PAGE_DIRTY, 0,
					      gpa, shift);
		kvmppc_radix_tlbie_page(kvm, gpa, shift, kvm->arch.lpid);
		/* Also clear bit in ptes in shadow pgtable for nested guests */
		rmapp = &memslot->arch.rmap[gfn - memslot->base_gfn];
		kvmhv_update_nest_rmap_rc_list(kvm, rmapp, _PAGE_DIRTY, 0,
					       old & PTE_RPN_MASK,
					       1UL << shift);
		spin_unlock(&kvm->mmu_lock);
	}
	return ret;
}
long kvmppc_hv_get_dirty_log_radix(struct kvm *kvm, struct kvm_memory_slot *memslot, unsignedlong *map)
{ unsignedlong i, j; int npages;
for (i = 0; i < memslot->npages; i = j) {
npages = kvm_radix_test_clear_dirty(kvm, memslot, i);
/* * Note that if npages > 0 then i must be a multiple of npages, * since huge pages are only used to back the guest at guest * real addresses that are a multiple of their size. * Since we have at most one PTE covering any given guest * real address, if npages > 1 we can skip to i + npages.
*/
j = i + 1; if (npages) {
set_dirty_bits(map, i, npages);
j = i + npages;
}
} return 0;
}
ret = mutex_lock_interruptible(&p->mutex); if (ret) return ret;
if (p->chars_left) {
n = p->chars_left; if (n > len)
n = len;
r = copy_to_user(buf, p->buf + p->buf_index, n);
n -= r;
p->chars_left -= n;
p->buf_index += n;
buf += n;
len -= n;
ret = n; if (r) { if (!n)
ret = -EFAULT; goto out;
}
}
gpa = p->gpa;
nested = NULL;
pgt = NULL; while (len != 0 && p->lpid >= 0) { if (gpa >= RADIX_PGTABLE_RANGE) {
gpa = 0;
pgt = NULL; if (nested) {
kvmhv_put_nested(nested);
nested = NULL;
}
p->lpid = kvmhv_nested_next_lpid(kvm, p->lpid);
p->hdr = 0; if (p->lpid < 0) break;
} if (!pgt) { if (p->lpid == 0) {
pgt = kvm->arch.pgtable;
} else {
nested = kvmhv_get_nested(kvm, p->lpid, false); if (!nested) {
gpa = RADIX_PGTABLE_RANGE; continue;
}
pgt = nested->shadow_pgtable;
}
}
n = 0; if (!p->hdr) { if (p->lpid > 0)
n = scnprintf(p->buf, sizeof(p->buf), "\nNested LPID %d: ", p->lpid);
n += scnprintf(p->buf + n, sizeof(p->buf) - n, "pgdir: %lx\n", (unsignedlong)pgt);
p->hdr = 1; goto copy;
}
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit noch Richtigkeit
noch Qualität der bereitgestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.