if (!ptr)
panic("%s: Failed to allocate %lu bytes align=0x%lx nid=%d from=%pa max_addr=%pa\n",
__func__, size, size, nid, &min_addr, &max_addr);
return ptr;
}
/* * When allocating pud or pmd pointers, we allocate a complete page * of PAGE_SIZE rather than PUD_TABLE_SIZE or PMD_TABLE_SIZE. This * is to ensure that the page obtained from the memblock allocator * can be completely used as page table page and can be freed * correctly when the page table entries are removed.
*/ staticint early_map_kernel_page(unsignedlong ea, unsignedlong pa,
pgprot_t flags, unsignedint map_page_size, int nid, unsignedlong region_start, unsignedlong region_end)
{ unsignedlong pfn = pa >> PAGE_SHIFT;
pgd_t *pgdp;
p4d_t *p4dp;
pud_t *pudp;
pmd_t *pmdp;
pte_t *ptep;
/* * nid, region_start, and region_end are hints to try to place the page * table memory in the same node or region.
*/ staticint __map_kernel_page(unsignedlong ea, unsignedlong pa,
pgprot_t flags, unsignedint map_page_size, int nid, unsignedlong region_start, unsignedlong region_end)
{ unsignedlong pfn = pa >> PAGE_SHIFT;
pgd_t *pgdp;
p4d_t *p4dp;
pud_t *pudp;
pmd_t *pmdp;
pte_t *ptep; /* * Make sure task size is correct as per the max adddr
*/
BUILD_BUG_ON(TASK_SIZE_USER64 > RADIX_PGTABLE_RANGE);
if (unlikely(!slab_is_available())) return early_map_kernel_page(ea, pa, flags, map_page_size,
nid, region_start, region_end);
/*
 * Should make page table allocation functions be able to take a
 * node, so we can place kernel page tables on the right nodes after
 * boot.
 */
pgdp = pgd_offset_k(ea);
p4dp = p4d_offset(pgdp, ea);
pudp = pud_alloc(&init_mm, p4dp, ea); if (!pudp) return -ENOMEM; if (map_page_size == PUD_SIZE) {
ptep = (pte_t *)pudp; goto set_the_pte;
}
pmdp = pmd_alloc(&init_mm, pudp, ea); if (!pmdp) return -ENOMEM; if (map_page_size == PMD_SIZE) {
ptep = pmdp_ptep(pmdp); goto set_the_pte;
}
ptep = pte_alloc_kernel(pmdp, ea); if (!ptep) return -ENOMEM;
// Relocatable kernel running at non-zero real address if (stext_phys != 0) { // The end of interrupts code at zero is a rodata boundary unsignedlong end_intr = __pa_symbol(__end_interrupts) - stext_phys; if (addr < end_intr) return end_intr;
// Start of relocated kernel text is a rodata boundary if (addr < stext_phys) return stext_phys;
}
if (addr < __pa_symbol(__srwx_boundary)) return __pa_symbol(__srwx_boundary); #endif return end;
}
/* * TODO: Support to enable KFENCE after bootup depends on the ability to * split page table mappings. As such support is not currently * implemented for radix pagetables, support enabling KFENCE * only at system startup for now. * * After support for splitting mappings is available on radix, * alloc_kfence_pool() & map_kfence_pool() can be dropped and * mapping for __kfence_pool memory can be * split during arch_kfence_init_pool().
*/ if (!kfence_early_init) goto no_kfence;
kfence_pool = memblock_phys_alloc(KFENCE_POOL_SIZE, PAGE_SIZE); if (!kfence_pool) goto no_kfence;
/* We don't support slb for radix */
slb_set_size(0);
kfence_pool = alloc_kfence_pool();
/*
 * Create the linear mapping
 */
for_each_mem_range(i, &start, &end) { /* * The memblock allocator is up at this point, so the * page tables will be allocated within the range. No * need or a node (which we don't have yet).
*/
if (end >= RADIX_VMALLOC_START) {
pr_warn("Outside the supported range\n"); continue;
}
if (!cpu_has_feature(CPU_FTR_HVMODE) &&
cpu_has_feature(CPU_FTR_P9_RADIX_PREFETCH_BUG)) { /* * Older versions of KVM on these machines prefer if the * guest only uses the low 19 PID bits.
*/
mmu_pid_bits = 19;
}
mmu_base_pid = 1;
/*
 * Allocate Partition table and process table for the
 * host.
 */
BUG_ON(PRTB_SIZE_SHIFT > 36);
process_tb = early_alloc_pgtable(1UL << PRTB_SIZE_SHIFT, -1, 0, 0); /* * Fill in the process table.
*/
rts_field = radix__get_tree_size();
process_tb->prtb0 = cpu_to_be64(rts_field | __pa(init_mm.pgd) | RADIX_PGD_INDEX_SIZE);
/*
 * The init_mm context is given the first available (non-zero) PID,
 * which is the "guard PID" and contains no page table. PIDR should
 * never be set to zero because that duplicates the kernel address
 * space at the 0x0... offset (quadrant 0)!
 *
 * An arbitrary PID that may later be allocated by the PID allocator
 * for userspace processes must not be used either, because that
 * would cause stale user mappings for that PID on CPUs outside of
 * the TLB invalidation scheme (because it won't be in mm_cpumask).
 *
 * So permanently carve out one PID for the purpose of a guard PID.
 */
init_mm.context.id = mmu_base_pid;
mmu_base_pid++;
}
void __init radix__early_init_devtree(void)
{ int rc;
/*
 * Try to find the available page sizes in the device-tree
 */
rc = of_scan_flat_dt(radix_dt_scan_page_sizes, NULL); if (!rc) { /* * No page size details found in device tree. * Let's assume we have page 4k and 64k support
*/
mmu_psize_defs[MMU_PAGE_4K].shift = 12;
mmu_psize_defs[MMU_PAGE_4K].ap = 0x0;
mmu_psize_defs[MMU_PAGE_4K].h_rpt_pgsize =
psize_to_rpti_pgsize(MMU_PAGE_4K);
if (altmap) { unsignedlong alt_start, alt_end; unsignedlong base_pfn = page_to_pfn(page);
/*
 * With 2M vmemmap mapping we can have things set up
 * such that even though altmap is specified we never
 * use altmap.
 */
alt_start = altmap->base_pfn;
alt_end = altmap->base_pfn + altmap->reserve + altmap->free;
int __meminit radix__vmemmap_create_mapping(unsignedlong start, unsignedlong page_size, unsignedlong phys)
{ /* Create a PTE encoding */ int nid = early_pfn_to_nid(phys >> PAGE_SHIFT); int ret;
if ((start + page_size) >= RADIX_VMEMMAP_END) {
pr_warn("Outside the supported range\n"); return -1;
}
ret = __map_kernel_page_nid(start, phys, PAGE_KERNEL, page_size, nid);
BUG_ON(ret);
if (!reuse) { /* * make sure we don't create altmap mappings * covering things outside the device.
*/ if (altmap && altmap_cross_boundary(altmap, addr, PAGE_SIZE))
altmap = NULL;
p = vmemmap_alloc_block_buf(PAGE_SIZE, node, altmap); if (!p && altmap)
p = vmemmap_alloc_block_buf(PAGE_SIZE, node, NULL); if (!p) return NULL;
pr_debug("PAGE_SIZE vmemmap mapping\n");
} else { /* * When a PTE/PMD entry is freed from the init_mm * there's a free_pages() call to this page allocated * above. Thus this get_page() is paired with the * put_page_testzero() on the freeing path. * This can only called by certain ZONE_DEVICE path, * and through vmemmap_populate_compound_pages() when * slab is available.
*/
get_page(reuse);
p = page_to_virt(reuse);
pr_debug("Tail page reuse vmemmap mapping\n");
}
/* All early vmemmap mapping to keep simple do it at PAGE_SIZE */ if (unlikely(p4d_none(*p4dp))) { if (unlikely(!slab_is_available())) {
pud = early_alloc_pgtable(PAGE_SIZE, node, 0, 0);
p4d_populate(&init_mm, p4dp, pud); /* go to the pud_offset */
} else return pud_alloc(&init_mm, p4dp, address);
} return pud_offset(p4dp, address);
}
/* All early vmemmap mapping to keep simple do it at PAGE_SIZE */ if (unlikely(pud_none(*pudp))) { if (unlikely(!slab_is_available())) {
pmd = early_alloc_pgtable(PAGE_SIZE, node, 0, 0);
pud_populate(&init_mm, pudp, pmd);
} else return pmd_alloc(&init_mm, pudp, address);
} return pmd_offset(pudp, address);
}
/* All early vmemmap mapping to keep simple do it at PAGE_SIZE */ if (unlikely(pmd_none(*pmdp))) { if (unlikely(!slab_is_available())) {
pte = early_alloc_pgtable(PAGE_SIZE, node, 0, 0);
pmd_populate(&init_mm, pmdp, pte);
} else return pte_alloc_kernel(pmdp, address);
} return pte_offset_kernel(pmdp, address);
}
/*
 * If altmap is present, make sure we align the start vmemmap addr
 * to PAGE_SIZE so that we calculate the correct start_pfn in
 * altmap boundary check to decide whether we should use altmap or
 * RAM based backing memory allocation. Also the address needs to be
 * aligned for the set_pte operation. If the start addr is already
 * PMD_SIZE aligned and within the altmap boundary then we will
 * try to use a pmd size altmap mapping, else we go for page size
 * mapping.
 *
 * If altmap is not present, align the vmemmap addr to PMD_SIZE and
 * always allocate a PMD size page for vmemmap backing.
 */
/* * keep it simple by checking addr PMD_SIZE alignment * and verifying the device boundary condition. * For us to use a pmd mapping, both addr and pfn should * be aligned. We skip if addr is not aligned and for * pfn we hope we have extra area in the altmap that * can help to find an aligned block. This can result * in altmap block allocation failures, in which case * we fallback to RAM for vmemmap allocation.
*/ if (altmap && (!IS_ALIGNED(addr, PMD_SIZE) ||
altmap_cross_boundary(altmap, addr, PMD_SIZE))) { /* * make sure we don't create altmap mappings * covering things outside the device.
*/ goto base_mapping;
}
p = vmemmap_alloc_block_buf(PMD_SIZE, node, altmap); if (p) {
vmemmap_set_pmd(pmd, p, node, addr, next);
pr_debug("PMD_SIZE vmemmap mapping\n"); continue;
} else { /* * A vmemmap block allocation can fail due to * alignment requirements and we trying to align * things aggressively there by running out of * space. Try base mapping on failure.
*/ goto base_mapping;
}
} elseif (vmemmap_check_pmd(pmd, node, addr, next)) { /* * If a huge mapping exist due to early call to * vmemmap_populate, let's try to use that.
*/ continue;
}
base_mapping: /* * Not able allocate higher order memory to back memmap * or we found a pointer to pte page. Allocate base page * size vmemmap
*/
pte = vmemmap_pte_alloc(pmd, node, addr); if (!pte) return -ENOMEM;
pgd = pgd_offset_k(addr);
p4d = p4d_offset(pgd, addr);
pud = vmemmap_pud_alloc(p4d, node, addr); if (!pud) return NULL;
pmd = vmemmap_pmd_alloc(pud, node, addr); if (!pmd) return NULL; if (pmd_leaf(*pmd)) /* * The second page is mapped as a hugepage due to a nearby request. * Force our mapping to page size without deduplication
*/ return NULL;
pte = vmemmap_pte_alloc(pmd, node, addr); if (!pte) return NULL;
radix__vmemmap_pte_populate(pmd, addr, node, NULL, NULL);
vmemmap_verify(pte, node, addr, addr + PAGE_SIZE);
/* the second vmemmap page which we use for duplication */
map_addr = addr - pfn_offset * sizeof(struct page) + PAGE_SIZE;
pgd = pgd_offset_k(map_addr);
p4d = p4d_offset(pgd, map_addr);
pud = vmemmap_pud_alloc(p4d, node, map_addr); if (!pud) return NULL;
pmd = vmemmap_pmd_alloc(pud, node, map_addr); if (!pmd) return NULL; if (pmd_leaf(*pmd)) /* * The second page is mapped as a hugepage due to a nearby request. * Force our mapping to page size without deduplication
*/ return NULL;
pte = vmemmap_pte_alloc(pmd, node, map_addr); if (!pte) return NULL; /* * Check if there exist a mapping to the left
*/ if (pte_none(*pte)) { /* * Populate the head page vmemmap page. * It can fall in different pmd, hence * vmemmap_populate_address()
*/
pte = radix__vmemmap_populate_address(map_addr - PAGE_SIZE, node, NULL, NULL); if (!pte) return NULL; /* * Populate the tail pages vmemmap page
*/
pte = radix__vmemmap_pte_populate(pmd, map_addr, node, NULL, NULL); if (!pte) return NULL;
vmemmap_verify(pte, node, map_addr, map_addr + PAGE_SIZE); return pte;
} return pte;
}
int __meminit vmemmap_populate_compound_pages(unsignedlong start_pfn, unsignedlong start, unsignedlong end, int node, struct dev_pagemap *pgmap)
{ /* * we want to map things as base page size mapping so that * we can save space in vmemmap. We could have huge mapping * covering out both edges.
*/ unsignedlong addr; unsignedlong addr_pfn = start_pfn; unsignedlong next;
pgd_t *pgd;
p4d_t *p4d;
pud_t *pud;
pmd_t *pmd;
pte_t *pte;
if (pmd_leaf(READ_ONCE(*pmd))) { /* existing huge mapping. Skip the range */
addr_pfn += (PMD_SIZE >> PAGE_SHIFT);
next = pmd_addr_end(addr, end); continue;
}
pte = vmemmap_pte_alloc(pmd, node, addr); if (!pte) return -ENOMEM; if (!pte_none(*pte)) { /* * This could be because we already have a compound * page whose VMEMMAP_RESERVE_NR pages were mapped and * this request fall in those pages.
*/
addr_pfn += 1;
next = addr + PAGE_SIZE; continue;
} else { unsignedlong nr_pages = pgmap_vmemmap_nr(pgmap); unsignedlong pfn_offset = addr_pfn - ALIGN_DOWN(addr_pfn, nr_pages);
pte_t *tail_page_pte;
/* * if the address is aligned to huge page size it is the * head mapping.
*/ if (pfn_offset == 0) { /* Populate the head page vmemmap page */
pte = radix__vmemmap_pte_populate(pmd, addr, node, NULL, NULL); if (!pte) return -ENOMEM;
vmemmap_verify(pte, node, addr, addr + PAGE_SIZE);
/*
 * Populate the tail pages vmemmap page.
 * It can fall in a different pmd, hence
 * vmemmap_populate_address()
 */
pte = radix__vmemmap_populate_address(addr + PAGE_SIZE, node, NULL, NULL); if (!pte) return -ENOMEM;
addr_pfn += 2;
next = addr + 2 * PAGE_SIZE; continue;
} /* * get the 2nd mapping details * Also create it if that doesn't exist
*/
tail_page_pte = vmemmap_compound_tail_page(addr, pfn_offset, node); if (!tail_page_pte) {
/* * For us pgtable_t is pte_t *. Inorder to save the deposisted * page table, we consider the allocated page table as a list * head. On withdraw we need to make sure we zero out the used * list_head memory area.
*/ void radix__pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
pgtable_t pgtable)
{ struct list_head *lh = (struct list_head *) pgtable;
unsignedlong change = pte_val(entry) ^ pte_val(*ptep); /* * On POWER9, the NMMU is not able to relax PTE access permissions * for a translation with a TLB. The PTE must be invalidated, TLB * flushed before the new PTE is installed. * * This only needs to be done for radix, because hash translation does * flush when updating the linux pte (and we don't support NMMU * accelerators on HPT on POWER9 anyway XXX: do we?). * * POWER10 (and P9P) NMMU does behave as per ISA.
*/ if (!cpu_has_feature(CPU_FTR_ARCH_31) && (change & _PAGE_RW) &&
atomic_read(&mm->context.copros) > 0) { unsignedlong old_pte, new_pte;
old_pte = __radix_pte_update(ptep, _PAGE_PRESENT, _PAGE_INVALID);
new_pte = old_pte | set;
radix__flush_tlb_page_psize(mm, address, psize);
__radix_pte_update(ptep, _PAGE_INVALID, new_pte);
} else {
__radix_pte_update(ptep, 0, set); /* * Book3S does not require a TLB flush when relaxing access * restrictions when the address space (modulo the POWER9 nest * MMU issue above) because the MMU will reload the PTE after * taking an access fault, as defined by the architecture. See * "Setting a Reference or Change Bit or Upgrading Access * Authority (PTE Subject to Atomic Hardware Updates)" in * Power ISA Version 3.1B.
*/
} /* See ptesync comment in radix__set_pte_at */
}
/* * POWER9 NMMU must flush the TLB after clearing the PTE before * installing a PTE with more relaxed access permissions, see * radix__ptep_set_access_flags.
*/ if (!cpu_has_feature(CPU_FTR_ARCH_31) &&
is_pte_rw_upgrade(pte_val(old_pte), pte_val(pte)) &&
(atomic_read(&mm->context.copros) > 0))
radix__flush_tlb_page(vma, addr);
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit noch Richtigkeit
noch Qualität der bereitgestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.