/*
 * NOTE: pagetable_init alloc all the fixmap pagetables contiguous on the
 * physical space so we can cache the place of the first one and move
 * around without checking the pgd every time.
 */

/* Bits supported by the hardware: */
pteval_t __supported_pte_mask __read_mostly = ~0;

/* Bits allowed in normal kernel mappings: */
pteval_t __default_kernel_pte_mask __read_mostly = ~0;

EXPORT_SYMBOL_GPL(__supported_pte_mask);
/* Used in PAGE_KERNEL_* macros which are reasonably used out-of-tree: */
EXPORT_SYMBOL(__default_kernel_pte_mask);

/* Controls READ_IMPLIES_EXEC personality for 32-bit tasks; see noexec32= */
int force_personality32;
/*
 * noexec32=on|off
 * Control non executable heap for 32bit processes.
 *
 * on	PROT_READ does not imply PROT_EXEC for 32-bit processes (default)
 * off	PROT_READ implies PROT_EXEC
 */
static int __init nonx32_setup(char *str)
{
	if (!strcmp(str, "on"))
		force_personality32 &= ~READ_IMPLIES_EXEC;
	else if (!strcmp(str, "off"))
		force_personality32 |= READ_IMPLIES_EXEC;
	/* Returning 1 tells the boot-param parser the option was consumed. */
	return 1;
}
__setup("noexec32=", nonx32_setup);
/* * With folded p4d, pgd_none() is always false, we need to * handle synchronization on p4d level.
*/
MAYBE_BUILD_BUG_ON(pgd_none(*pgd_ref));
p4d_ref = p4d_offset(pgd_ref, addr);
/*
 * When memory was added make sure all the processes MM have
 * suitable PGD entries in the local PGD level page.
 */
static void sync_global_pgds(unsigned long start, unsigned long end)
{
	/*
	 * With 5-level paging the PGD is the top hardware level; with
	 * folded p4d (4-level), synchronization is handled one level down.
	 */
	if (pgtable_l5_enabled())
		sync_global_pgds_l5(start, end);
	else
		sync_global_pgds_l4(start, end);
}
/*
 * Make kernel mappings visible in all page tables in the system.
 * This is necessary except when the init task populates kernel mappings
 * during the boot process. In that case, all processes originating from
 * the init task copies the kernel mappings, so there is no issue.
 * Otherwise, missing synchronization could lead to kernel crashes due
 * to missing page table entries for certain kernel mappings.
 *
 * Synchronization is performed at the top level, which is the PGD in
 * 5-level paging systems. But in 4-level paging systems, however,
 * pgd_populate() is a no-op, so synchronization is done at the P4D level.
 * sync_global_pgds() handles this difference between paging levels.
 */
void arch_sync_kernel_mappings(unsigned long start, unsigned long end)
{
	sync_global_pgds(start, end);
}
/* * NOTE: This function is marked __ref because it calls __init function * (alloc_bootmem_pages). It's safe to do it ONLY when after_bootmem == 0.
*/ static __ref void *spp_getpage(void)
{ void *ptr;
/*
 * The head.S code sets up the kernel high mapping:
 *
 *   from __START_KERNEL_map to __START_KERNEL_map + size (== _end-_text)
 *
 * phys_base holds the negative offset to the kernel, which is added
 * to the compile time generated pmds. This results in invalid pmds up
 * to the point where we hit the physaddr 0 mapping.
 *
 * We limit the mappings to the region from _text to _brk_end.  _brk_end
 * is rounded up to the 2MB boundary. This catches the invalid pmds as
 * well, as they are located before _text:
 */
void __init cleanup_highmap(void)
{
	unsigned long vaddr = __START_KERNEL_map;
	unsigned long vaddr_end = __START_KERNEL_map + KERNEL_IMAGE_SIZE;
	unsigned long end = roundup((unsigned long)_brk_end, PMD_SIZE) - 1;
	pmd_t *pmd = level2_kernel_pgt;

	/*
	 * Native path, max_pfn_mapped is not set yet.
	 * Xen has valid max_pfn_mapped set in
	 * arch/x86/xen/mmu.c:xen_setup_kernel_pagetable().
	 */
	if (max_pfn_mapped)
		vaddr_end = __START_KERNEL_map + (max_pfn_mapped << PAGE_SHIFT);

	/* Zap every PMD that falls outside [_text, end]. */
	for (; vaddr + PMD_SIZE - 1 < vaddr_end; pmd++, vaddr += PMD_SIZE) {
		if (pmd_none(*pmd))
			continue;
		if (vaddr < (unsigned long) _text || vaddr > end)
			set_pmd(pmd, __pmd(0));
	}
}
/* * Create PTE level page table mapping for physical addresses. * It returns the last physical address mapped.
*/ staticunsignedlong __meminit
phys_pte_init(pte_t *pte_page, unsignedlong paddr, unsignedlong paddr_end,
pgprot_t prot, bool init)
{ unsignedlong pages = 0, paddr_next; unsignedlong paddr_last = paddr_end;
pte_t *pte; int i;
pte = pte_page + pte_index(paddr);
i = pte_index(paddr);
/* * We will re-use the existing mapping. * Xen for example has some special requirements, like mapping * pagetable pages as RO. So assume someone who pre-setup * these mappings are more intelligent.
*/ if (!pte_none(*pte)) { if (!after_bootmem)
pages++; continue;
}
if (!pmd_none(*pmd)) { if (!pmd_leaf(*pmd)) {
spin_lock(&init_mm.page_table_lock);
pte = (pte_t *)pmd_page_vaddr(*pmd);
paddr_last = phys_pte_init(pte, paddr,
paddr_end, prot,
init);
spin_unlock(&init_mm.page_table_lock); continue;
} /* * If we are ok with PG_LEVEL_2M mapping, then we will * use the existing mapping, * * Otherwise, we will split the large page mapping but * use the same existing protection bits except for * large page, so that we don't violate Intel's TLB * Application note (317080) which says, while changing * the page sizes, new and old translations should * not differ with respect to page frame and * attributes.
*/ if (page_size_mask & (1 << PG_LEVEL_2M)) { if (!after_bootmem)
pages++;
paddr_last = paddr_next; continue;
}
new_prot = pte_pgprot(pte_clrhuge(*(pte_t *)pmd));
}
/* * Create PUD level page table mapping for physical addresses. The virtual * and physical address do not have to be aligned at this level. KASLR can * randomize virtual addresses up to this level. * It returns the last physical address mapped.
*/ staticunsignedlong __meminit
phys_pud_init(pud_t *pud_page, unsignedlong paddr, unsignedlong paddr_end, unsignedlong page_size_mask, pgprot_t _prot, bool init)
{ unsignedlong pages = 0, paddr_next; unsignedlong paddr_last = paddr_end; unsignedlong vaddr = (unsignedlong)__va(paddr); int i = pud_index(vaddr);
for (; i < PTRS_PER_PUD; i++, paddr = paddr_next) {
pud_t *pud;
pmd_t *pmd;
pgprot_t prot = _prot;
if (!pud_none(*pud)) { if (!pud_leaf(*pud)) {
pmd = pmd_offset(pud, 0);
paddr_last = phys_pmd_init(pmd, paddr,
paddr_end,
page_size_mask,
prot, init); continue;
} /* * If we are ok with PG_LEVEL_1G mapping, then we will * use the existing mapping. * * Otherwise, we will split the gbpage mapping but use * the same existing protection bits except for large * page, so that we don't violate Intel's TLB * Application note (317080) which says, while changing * the page sizes, new and old translations should * not differ with respect to page frame and * attributes.
*/ if (page_size_mask & (1 << PG_LEVEL_1G)) { if (!after_bootmem)
pages++;
paddr_last = paddr_next; continue;
}
prot = pte_pgprot(pte_clrhuge(*(pte_t *)pud));
}
if (pgd_changed)
sync_global_pgds(vaddr_start, vaddr_end - 1);
return paddr_last;
}
/*
 * Create page table mapping for the physical memory for specific physical
 * addresses. Note that it can only be used to populate non-present entries.
 * The virtual and physical addresses have to be aligned on PMD level
 * down. It returns the last physical address mapped.
 */
unsigned long __meminit
kernel_physical_mapping_init(unsigned long paddr_start,
			     unsigned long paddr_end,
			     unsigned long page_size_mask, pgprot_t prot)
{
	/* init=true: use the set_{p*}_safe() variants (non-present only). */
	return __kernel_physical_mapping_init(paddr_start, paddr_end,
					      page_size_mask, prot, true);
}
/*
 * This function is similar to kernel_physical_mapping_init() above with the
 * exception that it uses set_{pud,pmd}() instead of the set_{pud,pte}_safe()
 * when updating the mapping. The caller is responsible to flush the TLBs
 * after the function returns.
 */
unsigned long __meminit
kernel_physical_mapping_change(unsigned long paddr_start,
			       unsigned long paddr_end,
			       unsigned long page_size_mask)
{
	/* init=false: allow overwriting present entries; caller flushes TLBs. */
	return __kernel_physical_mapping_init(paddr_start, paddr_end,
					      page_size_mask, PAGE_KERNEL,
					      false);
}
/* * clear the default setting with node 0 * note: don't use nodes_clear here, that is really clearing when * numa support is not compiled in, and later node_set_state * will not set it back.
*/
node_clear_state(0, N_MEMORY);
node_clear_state(0, N_NORMAL_MEMORY);
zone_sizes_init();
}
/* Byte pattern marking vmemmap pages that are populated but not yet used. */
#define PAGE_UNUSED 0xFD

/*
 * The unused vmemmap range, which was not yet memset(PAGE_UNUSED), ranges
 * from unused_pmd_start to next PMD_SIZE boundary.
 */
static unsigned long unused_pmd_start __meminitdata;
#ifdef CONFIG_MEMORY_HOTPLUG /* Returns true if the PMD is completely unused and thus it can be freed */ staticbool __meminit vmemmap_pmd_is_unused(unsignedlong addr, unsignedlong end)
{ unsignedlong start = ALIGN_DOWN(addr, PMD_SIZE);
/* * Flush the unused range cache to ensure that memchr_inv() will work * for the whole range.
*/
vmemmap_flush_unused_pmd();
memset((void *)addr, PAGE_UNUSED, end - addr);
/* Mark the start of a sub-PMD vmemmap range as used (clears PAGE_UNUSED). */
static void __meminit __vmemmap_use_sub_pmd(unsigned long start)
{
	/*
	 * As we expect to add in the same granularity as we remove, it's
	 * sufficient to mark only some piece used to block the memmap page from
	 * getting removed when removing some other adjacent memmap (just in
	 * case the first memmap never gets initialized e.g., because the memory
	 * block never gets onlined).
	 */
	memset((void *)start, 0, sizeof(struct page));
}
/* Mark [start, end) of an existing vmemmap PMD range as used. */
static void __meminit vmemmap_use_sub_pmd(unsigned long start, unsigned long end)
{
	/*
	 * We only optimize if the new used range directly follows the
	 * previously unused range (esp., when populating consecutive sections).
	 */
	if (unused_pmd_start == start) {
		if (likely(IS_ALIGNED(end, PMD_SIZE)))
			unused_pmd_start = 0;
		else
			unused_pmd_start = end;
		return;
	}

	/*
	 * If the range does not contiguously follows previous one, make sure
	 * to mark the unused range of the previous one so it can be removed.
	 */
	vmemmap_flush_unused_pmd();
	__vmemmap_use_sub_pmd(start);
}
/* * Could be our memmap page is filled with PAGE_UNUSED already from a * previous remove. Make sure to reset it.
*/
__vmemmap_use_sub_pmd(start);
/* * Mark with PAGE_UNUSED the unused parts of the new memmap range
*/ if (!IS_ALIGNED(start, PMD_SIZE))
memset((void *)page, PAGE_UNUSED, start - page);
/* * We want to avoid memset(PAGE_UNUSED) when populating the vmemmap of * consecutive sections. Remember for the last added PMD where the * unused range begins.
*/ if (!IS_ALIGNED(end, PMD_SIZE))
unused_pmd_start = end;
}
/* * Memory hotplug specific functions
*/ #ifdef CONFIG_MEMORY_HOTPLUG /* * After memory hotplug the variables max_pfn, max_low_pfn and high_memory need * updating.
*/ staticvoid update_end_of_memory_vars(u64 start, u64 size)
{ unsignedlong end_pfn = PFN_UP(start + size);
int add_pages(int nid, unsignedlong start_pfn, unsignedlong nr_pages, struct mhp_params *params)
{ unsignedlong end = ((start_pfn + nr_pages) << PAGE_SHIFT) - 1; int ret;
if (WARN_ON_ONCE(end > DIRECT_MAP_PHYSMEM_END)) return -ERANGE;
ret = __add_pages(nid, start_pfn, nr_pages, params);
WARN_ON_ONCE(ret);
/* * Special case: add_pages() is called by memremap_pages() for adding device * private pages. Do not bump up max_pfn in the device private path, * because max_pfn changes affect dma_addressing_limited(). * * dma_addressing_limited() returning true when max_pfn is the device's * addressable memory can force device drivers to use bounce buffers * and impact their performance negatively:
*/ if (!params->pgmap) /* update max_pfn, max_low_pfn and high_memory */
update_end_of_memory_vars(start_pfn << PAGE_SHIFT, nr_pages << PAGE_SHIFT);
pte = pte_start + pte_index(addr); for (; addr < end; addr = next, pte++) {
next = (addr + PAGE_SIZE) & PAGE_MASK; if (next > end)
next = end;
if (!pte_present(*pte)) continue;
/* * We mapped [0,1G) memory as identity mapping when * initializing, in arch/x86/kernel/head_64.S. These * pagetables cannot be removed.
*/
phys_addr = pte_val(*pte) + (addr & PAGE_MASK); if (phys_addr < (phys_addr_t)0x40000000) return;
p4d = p4d_start + p4d_index(addr); for (; addr < end; addr = next, p4d++) {
next = p4d_addr_end(addr, end);
if (!p4d_present(*p4d)) continue;
BUILD_BUG_ON(p4d_leaf(*p4d));
pud_base = pud_offset(p4d, 0);
remove_pud_table(pud_base, addr, next, altmap, direct); /* * For 4-level page tables we do not want to free PUDs, but in the * 5-level case we should free them. This code will have to change * to adapt for boot-time switching between 4 and 5 level page tables.
*/ if (pgtable_l5_enabled())
free_pud_table(pud_base, p4d);
}
if (direct)
update_page_count(PG_LEVEL_512G, -pages);
}
/* start and end are both virtual address. */ staticvoid __meminit
remove_pagetable(unsignedlong start, unsignedlong end, bool direct, struct vmem_altmap *altmap)
{ unsignedlong next; unsignedlong addr;
pgd_t *pgd;
p4d_t *p4d;
for (addr = start; addr < end; addr = next) {
next = pgd_addr_end(addr, end);
pgd = pgd_offset_k(addr); if (!pgd_present(*pgd)) continue;
/* * Pre-allocates page-table pages for the vmalloc area in the kernel page-table. * Only the level which needs to be synchronized between all page-tables is * allocated because the synchronization can be expensive.
*/ staticvoid __init preallocate_vmalloc_pages(void)
{ unsignedlong addr; constchar *lvl;
/* * The goal here is to allocate all possibly required * hardware page tables pointed to by the top hardware * level. * * On 4-level systems, the P4D layer is folded away and * the above code does no preallocation. Below, go down * to the pud _software_ level to ensure the second * hardware level is allocated on 4-level systems too.
*/
lvl = "pud";
pud = pud_alloc(&init_mm, p4d, addr); if (!pud) goto failed;
}
return;
failed:
/* * The pages have to be there now or they will be missing in * process page-tables later.
*/
panic("Failed to pre-allocate %s pages for vmalloc area\n", lvl);
}
/* * Must be done after boot memory is put on freelist, because here we * might set fields in deferred struct pages that have not yet been * initialized, and memblock_free_all() initializes all the reserved * deferred pages for us.
*/
register_page_bootmem_info();
/* Register memory areas for /proc/kcore */ if (get_gate_vma(&init_mm))
kclist_add(&kcore_vsyscall, (void *)VSYSCALL_ADDR, PAGE_SIZE, KCORE_USER);
/* * The rodata/data/bss/brk section (but not the kernel text!) * should also be not-executable. * * We align all_end to PMD_SIZE because the existing mapping * is a full PMD. If we would align _brk_end to PAGE_SIZE we * split the PMD and the reminder between _brk_end and the end * of the PMD will remain mapped executable. * * Any PMD which was setup after the one which covers _brk_end * has been zapped already via cleanup_highmem().
*/
all_end = roundup((unsignedlong)_brk_end, PMD_SIZE);
set_memory_nx(text_end, (all_end - text_end) >> PAGE_SHIFT);
/* * Block size is the minimum amount of memory which can be hotplugged or * hotremoved. It must be power of two and must be equal or larger than * MIN_MEMORY_BLOCK_SIZE.
*/ #define MAX_BLOCK_SIZE (2UL << 30)
/* Amount of ram needed to start using large blocks */ #define MEM_SIZE_FOR_LARGE_BLOCK (64UL << 30)
/* If memory block size has been set, then use it */
bz = set_memory_block_size; if (bz) goto done;
/* Use regular block if RAM is smaller than MEM_SIZE_FOR_LARGE_BLOCK */ if (boot_mem_end < MEM_SIZE_FOR_LARGE_BLOCK) {
bz = MIN_MEMORY_BLOCK_SIZE; goto done;
}
/* * When hotplug alignment is not a concern, maximize blocksize * to minimize overhead. Otherwise, align to the lesser of advice * alignment and end of memory alignment.
*/
bz = memory_block_advised_max_size(); if (!bz) {
bz = MAX_BLOCK_SIZE; if (!cpu_feature_enabled(X86_FEATURE_HYPERVISOR)) goto done;
} else {
bz = max(min(bz, MAX_BLOCK_SIZE), MIN_MEMORY_BLOCK_SIZE);
}
/* Find the largest allowed block size that aligns to memory end */ for (; bz > MIN_MEMORY_BLOCK_SIZE; bz >>= 1) { if (IS_ALIGNED(boot_mem_end, bz)) break;
}
done:
pr_info("x86/mm: Memory block size: %ldMB\n", bz >> 20);
return bz;
}
/* Cached result of probe_memory_block_size(); 0 until first probe. */
static unsigned long memory_block_size_probed;

/* Return the memory hotplug block size, probing it on first call. */
unsigned long memory_block_size_bytes(void)
{
	if (!memory_block_size_probed)
		memory_block_size_probed = probe_memory_block_size();

	return memory_block_size_probed;
}
/* * Initialise the sparsemem vmemmap using huge-pages at the PMD level.
*/ staticlong __meminitdata addr_start, addr_end; staticvoid __meminitdata *p_start, *p_end; staticint __meminitdata node_start;
/* check to see if we have contiguous blocks */ if (p_end != p || node_start != node) { if (p_start)
pr_debug(" [%lx-%lx] PMD -> [%p-%p] on node %d\n",
addr_start, addr_end-1, p_start, p_end-1, node_start);
addr_start = addr;
node_start = node;
p_start = p;
}
addr_end = addr + PMD_SIZE;
p_end = p + PMD_SIZE;
if (!IS_ALIGNED(addr, PMD_SIZE) ||
!IS_ALIGNED(next, PMD_SIZE))
vmemmap_use_new_sub_pmd(addr, next);
}
int __meminit vmemmap_check_pmd(pmd_t *pmd, int node, unsignedlong addr, unsignedlong next)
{ int large = pmd_leaf(*pmd);
/*
 * NOTE(review): the following stray text (a German website disclaimer)
 * was accidentally appended to this file by a copy/extraction step; it is
 * preserved here, translated to English, as a comment so the file remains
 * valid C. It should probably be removed entirely:
 *
 * "The information on this web page was compiled carefully to the best of
 *  our knowledge. However, neither completeness, nor correctness, nor
 *  quality of the provided information is guaranteed.
 *  Remark: the colored syntax highlighting and the measurement are still
 *  experimental."
 */