/*
 * We need to define the tracepoints somewhere, and tlb.c
 * is only compiled when SMP=y.
 */
#include <trace/events/tlb.h>

#include "mm_internal.h"
/*
 * Tables translating between page_cache_type_t and pte encoding.
 *
 * The default values are defined statically as minimal supported mode;
 * WC and WT fall back to UC-.  pat_init() updates these values to support
 * more cache modes, WC and WT, when it is safe to do so.  See pat_init()
 * for the details.  Note, __early_ioremap() used during early boot-time
 * takes pgprot_t (pte encoding) and does not use these tables.
 *
 *   Index into __cachemode2pte_tbl[] is the cachemode.
 *
 *   Index into __pte2cachemode_tbl[] are the caching attribute bits of the pte
 *   (_PAGE_PWT, _PAGE_PCD, _PAGE_PAT) at index bit positions 0, 1, 2.
 */
static uint16_t __cachemode2pte_tbl[_PAGE_CACHE_MODE_NUM] = {
[_PAGE_CACHE_MODE_WB ] = 0 | 0 ,
[_PAGE_CACHE_MODE_WC ] = 0 | _PAGE_PCD,
[_PAGE_CACHE_MODE_UC_MINUS] = 0 | _PAGE_PCD,
[_PAGE_CACHE_MODE_UC ] = _PAGE_PWT | _PAGE_PCD,
[_PAGE_CACHE_MODE_WT ] = 0 | _PAGE_PCD,
[_PAGE_CACHE_MODE_WP ] = 0 | _PAGE_PCD,
};
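
/*
 * Illustrative sketch only, not part of the upstream file: one way the table
 * above is consumed when turning a cache mode into PTE protection bits.  The
 * helper name example_cachemode2protval() is made up for this example; the
 * kernel's real conversion helpers live in pgtable_types.h.
 */
static inline unsigned long example_cachemode2protval(enum page_cache_mode pcm)
{
	/* WB is the all-zero encoding, so no table lookup is strictly needed. */
	if (pcm == _PAGE_CACHE_MODE_WB)
		return 0;

	/* e.g. _PAGE_CACHE_MODE_UC yields _PAGE_PWT | _PAGE_PCD */
	return __cachemode2pte_tbl[pcm];
}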
/*
 * Check that the write-protect PAT entry is set for write-protect.
 * To do this without making assumptions how PAT has been set up (Xen has
 * another layout than the kernel), translate the _PAGE_CACHE_MODE_WP cache
 * mode via the __cachemode2pte_tbl[] into protection bits (those protection
 * bits will select a cache mode of WP or better), and then translate the
 * protection bits back into the cache mode using __pte2cm_idx() and the
 * __pte2cachemode_tbl[] array.  This will return the really used cache mode.
 */
bool x86_has_pat_wp(void)
{
	uint16_t prot = __cachemode2pte_tbl[_PAGE_CACHE_MODE_WP];

	return __pte2cachemode_tbl[__pte2cm_idx(prot)] == _PAGE_CACHE_MODE_WP;
}
/*
 * Pages returned are already directly mapped.
 *
 * Changing that is likely to break Xen, see commit:
 *
 *    279b706 x86,xen: introduce x86_init.mapping.pagetable_reserve
 *
 * for detailed information.
 */
__ref void *alloc_low_pages(unsigned int num)
{
	unsigned long pfn;
	int i;

/*
 * By default need to be able to allocate page tables below PGD firstly for
 * the 0-ISA_END_ADDRESS range and secondly for the initial PMD_SIZE mapping.
 * With KASLR memory randomization, depending on the machine e820 memory and the
 * PUD alignment, twice that many pages may be needed when KASLR memory
 * randomization is enabled.
 */
/*
 * Save some of cr4 feature set we're using (e.g. Pentium 4MB
 * enable and PPro Global page enable), so that any CPU's that boot
 * up after us can get the correct flags. Invoked on the boot CPU.
 */
static inline void cr4_set_bits_and_update_boot(unsigned long mask)
{
	mmu_cr4_features |= mask;
	if (trampoline_cr4_features)
		*trampoline_cr4_features = mmu_cr4_features;
	cr4_set_bits(mask);
}
static void __init probe_page_size_mask(void)
{
	/*
	 * For pagealloc debugging, identity mapping will use small pages.
	 * This will simplify cpa(), which otherwise needs to support splitting
	 * large pages into small in interrupt context, etc.
	 */
	if (boot_cpu_has(X86_FEATURE_PSE) && !debug_pagealloc_enabled())
		page_size_mask |= 1 << PG_LEVEL_2M;
	else
		direct_gbpages = 0;

	/* Enable PSE if available */
	if (boot_cpu_has(X86_FEATURE_PSE))
		cr4_set_bits_and_update_boot(X86_CR4_PSE);
	/* Enable PGE if available */
	__supported_pte_mask &= ~_PAGE_GLOBAL;
	if (boot_cpu_has(X86_FEATURE_PGE)) {
cr4_set_bits_and_update_boot(X86_CR4_PGE);
__supported_pte_mask |= _PAGE_GLOBAL;
}
	/* By default, everything is supported: */
	__default_kernel_pte_mask = __supported_pte_mask;
	/* Except with PTI, where the kernel is mostly non-Global: */
	if (cpu_feature_enabled(X86_FEATURE_PTI))
__default_kernel_pte_mask &= ~_PAGE_GLOBAL;
	/* Enable 1 GB linear kernel mappings if available: */
	if (direct_gbpages && boot_cpu_has(X86_FEATURE_GBPAGES)) {
printk(KERN_INFO "Using GB pages for direct mapping\n");
page_size_mask |= 1 << PG_LEVEL_1G;
} else {
direct_gbpages = 0;
}
}
/* * INVLPG may not properly flush Global entries on * these CPUs. New microcode fixes the issue.
*/ staticconststruct x86_cpu_id invlpg_miss_ids[] = {
X86_MATCH_VFM(INTEL_ALDERLAKE, 0x2e),
X86_MATCH_VFM(INTEL_ALDERLAKE_L, 0x42c),
X86_MATCH_VFM(INTEL_ATOM_GRACEMONT, 0x11),
X86_MATCH_VFM(INTEL_RAPTORLAKE, 0x118),
X86_MATCH_VFM(INTEL_RAPTORLAKE_P, 0x4117),
X86_MATCH_VFM(INTEL_RAPTORLAKE_S, 0x2e),
{}
};
	if (invlpg_miss_match &&
	    boot_cpu_data.microcode < invlpg_miss_match->driver_data) {
		pr_info("Incomplete global flushes, disabling PCID");
		setup_clear_cpu_cap(X86_FEATURE_PCID);
		return;
	}
	if (boot_cpu_has(X86_FEATURE_PGE)) {
		/*
		 * This can't be cr4_set_bits_and_update_boot() -- the
		 * trampoline code can't handle CR4.PCIDE and it wouldn't
		 * do any good anyway.  Despite the name,
		 * cr4_set_bits_and_update_boot() doesn't actually cause
		 * the bits in question to remain set all the way through
		 * the secondary boot asm.
		 *
		 * Instead, we brute-force it and set CR4.PCIDE manually in
		 * start_secondary().
		 */
cr4_set_bits(X86_CR4_PCIDE);
	} else {
		/*
		 * flush_tlb_all(), as currently implemented, won't work if
		 * PCID is on but PGE is not.  Since that combination
		 * doesn't exist on real hardware, there's no reason to try
		 * to fully support it, but it's polite to avoid corrupting
		 * data if we're on an improperly configured VM.
		 */
setup_clear_cpu_cap(X86_FEATURE_PCID);
}
}
static int __meminit save_mr(struct map_range *mr, int nr_range,
			     unsigned long start_pfn, unsigned long end_pfn,
			     unsigned long page_size_mask)
{
	if (start_pfn < end_pfn) {
		if (nr_range >= NR_RANGE_MR)
			panic("run out of range for init_memory_mapping\n");
mr[nr_range].start = start_pfn<<PAGE_SHIFT;
mr[nr_range].end = end_pfn<<PAGE_SHIFT;
mr[nr_range].page_size_mask = page_size_mask;
nr_range++;
}
return nr_range;
}
/*
 * Adjust the page_size_mask for small ranges to use a big page size
 * if the surrounding memory is RAM too.
 */
static void __ref adjust_range_page_size_mask(struct map_range *mr,
					      int nr_range)
{
	int i;

	for (i = 0; i < nr_range; i++) {
		if ((page_size_mask & (1<<PG_LEVEL_2M)) &&
		    !(mr[i].page_size_mask & (1<<PG_LEVEL_2M))) {
			unsigned long start = round_down(mr[i].start, PMD_SIZE);
			unsigned long end = round_up(mr[i].end, PMD_SIZE);

#ifdef CONFIG_X86_32
			if ((end >> PAGE_SHIFT) > max_low_pfn)
				continue;
#endif
if (memblock_is_region_memory(start, end - start))
mr[i].page_size_mask |= 1<<PG_LEVEL_2M;
		}
		if ((page_size_mask & (1<<PG_LEVEL_1G)) &&
		    !(mr[i].page_size_mask & (1<<PG_LEVEL_1G))) {
			unsigned long start = round_down(mr[i].start, PUD_SIZE);
			unsigned long end = round_up(mr[i].end, PUD_SIZE);
if (memblock_is_region_memory(start, end - start))
mr[i].page_size_mask |= 1<<PG_LEVEL_1G;
}
}
}
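
/*
 * Worked example (illustrative, not from the upstream source): assume a
 * map_range covering [2M + 4K, 6M) that was split out with only 4K pages.
 * round_down()/round_up() above widen the probe window to [2M, 6M); if
 * memblock reports that whole window as RAM, the 2M bit is added to the
 * range's page_size_mask so it can be mapped with large pages after all.
 */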
static const char *page_size_string(struct map_range *mr)
{
	static const char str_1g[] = "1G";
	static const char str_2m[] = "2M";
	static const char str_4m[] = "4M";
	static const char str_4k[] = "4k";

	if (mr->page_size_mask & (1<<PG_LEVEL_1G))
		return str_1g;
	/*
	 * 32-bit without PAE has a 4M large page size.
	 * PG_LEVEL_2M is misnamed, but we can at least
	 * print out the right size in the string.
	 */
	if (IS_ENABLED(CONFIG_X86_32) &&
	    !IS_ENABLED(CONFIG_X86_PAE) &&
	    mr->page_size_mask & (1<<PG_LEVEL_2M))
		return str_4m;

	if (mr->page_size_mask & (1<<PG_LEVEL_2M))
		return str_2m;

	return str_4k;
}
static int __meminit split_mem_range(struct map_range *mr, int nr_range,
				     unsigned long start,
				     unsigned long end)
{
	unsigned long start_pfn, end_pfn, limit_pfn;
	unsigned long pfn;
	int i;

	limit_pfn = PFN_DOWN(end);

	/* head if not big page alignment ? */
	pfn = start_pfn = PFN_DOWN(start);
#ifdef CONFIG_X86_32
	/*
	 * Don't use a large page for the first 2/4MB of memory
	 * because there are often fixed size MTRRs in there
	 * and overlapping MTRRs into large pages can cause
	 * slowdowns.
	 */
	if (pfn == 0)
		end_pfn = PFN_DOWN(PMD_SIZE);
	else
		end_pfn = round_up(pfn, PFN_DOWN(PMD_SIZE));
#else /* CONFIG_X86_64 */
	end_pfn = round_up(pfn, PFN_DOWN(PMD_SIZE));
#endif
	if (end_pfn > limit_pfn)
		end_pfn = limit_pfn;
	if (start_pfn < end_pfn) {
		nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);
		pfn = end_pfn;
	}
	if (start_pfn < (1UL<<(32-PAGE_SHIFT)))
		max_low_pfn_mapped = max(max_low_pfn_mapped,
					 min(end_pfn, 1UL<<(32-PAGE_SHIFT)));
}
bool pfn_range_is_mapped(unsigned long start_pfn, unsigned long end_pfn)
{
	int i;

	for (i = 0; i < nr_pfn_mapped; i++)
		if ((start_pfn >= pfn_mapped[i].start) &&
		    (end_pfn <= pfn_mapped[i].end))
			return true;

	return false;
}
/*
 * Setup the direct mapping of the physical memory at PAGE_OFFSET.
 * This runs before bootmem is initialized and gets pages directly from
 * the physical memory.  To access them they are temporarily mapped.
 */
unsigned long __ref init_memory_mapping(unsigned long start,
					unsigned long end, pgprot_t prot)
{
	struct map_range mr[NR_RANGE_MR];
	unsigned long ret = 0;
	int nr_range, i;

	pr_debug("init_memory_mapping: [mem %#010lx-%#010lx]\n",
		 start, end - 1);

	memset(mr, 0, sizeof(mr));
	nr_range = split_mem_range(mr, 0, start, end);

	for (i = 0; i < nr_range; i++)
ret = kernel_physical_mapping_init(mr[i].start, mr[i].end,
mr[i].page_size_mask,
prot);
add_pfn_range_mapped(start >> PAGE_SHIFT, ret >> PAGE_SHIFT);
return ret >> PAGE_SHIFT;
}
/*
 * We need to iterate through the E820 memory map and create direct mappings
 * for only E820_TYPE_RAM and E820_KERN_RESERVED regions.  We cannot simply
 * create direct mappings for all pfns from [0 to max_low_pfn) and
 * [4GB to max_pfn) because of possible memory holes in high addresses
 * that cannot be marked as UC by fixed/variable range MTRRs.
 * Depending on the alignment of E820 ranges, this may possibly result
 * in using smaller size (i.e. 4K instead of 2M or 1G) page tables.
 *
 * init_mem_mapping() calls init_range_memory_mapping() with big range.
 * That range would have hole in the middle or ends, and only ram parts
 * will be mapped in init_range_memory_mapping().
 */
static unsigned long __init init_range_memory_mapping(
					   unsigned long r_start,
					   unsigned long r_end)
{
	unsigned long start_pfn, end_pfn;
	unsigned long mapped_ram_size = 0;
	int i;

	for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, NULL) {
		u64 start = clamp_val(PFN_PHYS(start_pfn), r_start, r_end);
		u64 end = clamp_val(PFN_PHYS(end_pfn), r_start, r_end);

		if (start >= end)
			continue;

		/*
		 * if it is overlapping with brk pgt, we need to
		 * alloc pgt buf from memblock instead.
		 */
		can_use_brk_pgt = max(start, (u64)pgt_buf_end<<PAGE_SHIFT) >=
				    min(end, (u64)pgt_buf_top<<PAGE_SHIFT);
init_memory_mapping(start, end, PAGE_KERNEL);
mapped_ram_size += end - start;
can_use_brk_pgt = true;
}
return mapped_ram_size;
}
static unsigned long __init get_new_step_size(unsigned long step_size)
{
	/*
	 * Initial mapped size is PMD_SIZE (2M).
	 * We can not set step_size to be PUD_SIZE (1G) yet.
	 * In worse case, when we cross the 1G boundary, and
	 * PG_LEVEL_2M is not set, we will need 1+1+512 pages (2M + 8k)
	 * to map 1G range with PTE. Hence we use one less than the
	 * difference of page table level shifts.
	 *
	 * Don't need to worry about overflow in the top-down case, on 32bit,
	 * when step_size is 0, round_down() returns 0 for start, and that
	 * turns it into 0x100000000ULL.
	 * In the bottom-up case, round_up(x, 0) returns 0 though too, which
	 * needs to be taken into consideration by the code below.
	 */
	return step_size << (PMD_SHIFT - PAGE_SHIFT - 1);
}
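
/*
 * Worked numbers (illustrative, not part of the upstream source): with the
 * usual x86 values PMD_SHIFT = 21 and PAGE_SHIFT = 12, the shift above is 8,
 * so each call multiplies step_size by 256.  Starting from the initial
 * PMD_SIZE window, the mapping loops below therefore grow their chunk size
 * roughly 2M -> 512M -> 128G per iteration, once enough RAM is mapped to
 * hold the page tables for the next, larger step.
 */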
/**
 * memory_map_top_down - Map [map_start, map_end) top down
 * @map_start: start address of the target memory range
 * @map_end: end address of the target memory range
 *
 * This function will setup direct mapping for memory range
 * [map_start, map_end) in top-down. That said, the page tables
 * will be allocated at the end of the memory, and we map the
 * memory in top-down.
 */
static void __init memory_map_top_down(unsigned long map_start,
				       unsigned long map_end)
{
	unsigned long real_end, last_start;
	unsigned long step_size;
	unsigned long addr;
	unsigned long mapped_ram_size = 0;

	/*
	 * Systems that have many reserved areas near top of the memory,
	 * e.g. QEMU with less than 1G RAM and EFI enabled, or Xen, will
	 * require lots of 4K mappings which may exhaust pgt_buf.
	 * Start with top-most PMD_SIZE range aligned at PMD_SIZE to ensure
	 * there is enough mapped memory that can be allocated from
	 * memblock.
	 */
	addr = memblock_phys_alloc_range(PMD_SIZE, PMD_SIZE, map_start,
					 map_end);
	if (!addr) {
pr_warn("Failed to release memory for alloc_low_pages()");
real_end = max(map_start, ALIGN_DOWN(map_end, PMD_SIZE));
} else {
memblock_phys_free(addr, PMD_SIZE);
real_end = addr + PMD_SIZE;
}
	/* step_size needs to be small so pgt_buf from BRK can cover it */
step_size = PMD_SIZE;
max_pfn_mapped = 0; /* will get exact value next */
min_pfn_mapped = real_end >> PAGE_SHIFT;
last_start = real_end;
	/*
	 * We start from the top (end of memory) and go to the bottom.
	 * The memblock_find_in_range() gets us a block of RAM from the
	 * end of RAM in [min_pfn_mapped, max_pfn_mapped) used as new pages
	 * for page table.
	 */
	while (last_start > map_start) {
		unsigned long start;
if (real_end < map_end)
init_range_memory_mapping(real_end, map_end);
}
/**
 * memory_map_bottom_up - Map [map_start, map_end) bottom up
 * @map_start: start address of the target memory range
 * @map_end: end address of the target memory range
 *
 * This function will setup direct mapping for memory range
 * [map_start, map_end) in bottom-up. Since we have limited the
 * bottom-up allocation above the kernel, the page tables will
 * be allocated just above the kernel and we map the memory
 * in [map_start, map_end) in bottom-up.
 */
static void __init memory_map_bottom_up(unsigned long map_start,
					unsigned long map_end)
{
	unsigned long next, start;
	unsigned long mapped_ram_size = 0;
	/* step_size needs to be small so pgt_buf from BRK can cover it */
	unsigned long step_size = PMD_SIZE;

	start = map_start;
	min_pfn_mapped = start >> PAGE_SHIFT;

	/*
	 * We start from the bottom (@map_start) and go to the top (@map_end).
	 * The memblock_find_in_range() gets us a block of RAM from the
	 * end of RAM in [min_pfn_mapped, max_pfn_mapped) used as new pages
	 * for page table.
	 */
	while (start < map_end) {
		if (step_size && map_end - start > step_size) {
next = round_up(start + 1, step_size); if (next > map_end)
next = map_end;
} else {
next = map_end;
}

		mapped_ram_size += init_range_memory_mapping(start, next);
		start = next;

		if (mapped_ram_size >= step_size)
step_size = get_new_step_size(step_size);
}
}
/*
 * The real mode trampoline, which is required for bootstrapping CPUs
 * occupies only a small area under the low 1MB.  See reserve_real_mode()
 * for details.
 *
 * If KASLR is disabled the first PGD entry of the direct mapping is copied
 * to map the real mode trampoline.
 *
 * If KASLR is enabled, copy only the PUD which covers the low 1MB
 * area.  This limits the randomization granularity to 1GB for both 4-level
 * and 5-level paging.
 */
static void __init init_trampoline(void)
{
#ifdef CONFIG_X86_64
	/*
	 * The code below will alias kernel page-tables in the user-range of the
	 * address space, including the Global bit.  So global TLB entries will
	 * be created when using the trampoline page-table.
	 */
	if (!kaslr_memory_enabled())
		trampoline_pgd_entry = init_top_pgt[pgd_index(__PAGE_OFFSET)];
	else
		init_trampoline_kaslr();
#endif
}
void __init init_mem_mapping(void)
{
	unsigned long end;

#ifdef CONFIG_X86_64
	end = max_pfn << PAGE_SHIFT;
#else
	end = max_low_pfn << PAGE_SHIFT;
#endif
/* the ISA range is always mapped regardless of memory holes */
init_memory_mapping(0, ISA_END_ADDRESS, PAGE_KERNEL);
/* Init the trampoline, possibly with KASLR memory offset */
init_trampoline();
	/*
	 * If the allocation is in bottom-up direction, we setup direct mapping
	 * in bottom-up, otherwise we setup direct mapping in top-down.
	 */
	if (memblock_bottom_up()) {
		unsigned long kernel_end = __pa_symbol(_end);

		/*
		 * we need two separate calls here. This is because we want to
		 * allocate page tables above the kernel. So we first map
		 * [kernel_end, end) to make memory above the kernel be mapped
		 * as soon as possible. And then use page tables allocated above
		 * the kernel to map [ISA_END_ADDRESS, kernel_end).
		 */
memory_map_bottom_up(kernel_end, end);
memory_map_bottom_up(ISA_END_ADDRESS, kernel_end);
} else {
memory_map_top_down(ISA_END_ADDRESS, end);
}
#ifdef CONFIG_X86_64
	if (max_pfn > max_low_pfn) {
		/* can we preserve max_low_pfn ? */
		max_low_pfn = max_pfn;
	}
#else
	early_ioremap_page_table_range_init();
#endif
load_cr3(swapper_pg_dir);
__flush_tlb_all();
x86_init.hyper.init_mem_mapping();
early_memtest(0, max_pfn_mapped << PAGE_SHIFT);
}
/*
 * Initialize an mm_struct to be used during poking and a pointer to be used
 * during patching.
 */
void __init poking_init(void)
{
spinlock_t *ptl;
pte_t *ptep;
text_poke_mm = mm_alloc();
BUG_ON(!text_poke_mm);
/* Xen PV guests need the PGD to be pinned. */
paravirt_enter_mmap(text_poke_mm);
set_notrack_mm(text_poke_mm);
	/*
	 * Randomize the poking address, but make sure that the following page
	 * will be mapped at the same PMD. We need 2 pages, so find space for 3,
	 * and adjust the address if the PMD ends after the first one.
	 */
	text_poke_mm_addr = TASK_UNMAPPED_BASE;
	if (IS_ENABLED(CONFIG_RANDOMIZE_BASE))
		text_poke_mm_addr += (kaslr_get_random_long("Poking") & PAGE_MASK) %
				     (TASK_SIZE - TASK_UNMAPPED_BASE - 3 * PAGE_SIZE);
	/*
	 * We need to trigger the allocation of the page-tables that will be
	 * needed for poking now. Later, poking may be performed in an atomic
	 * section, which might cause allocation to fail.
	 */
ptep = get_locked_pte(text_poke_mm, text_poke_mm_addr, &ptl);
BUG_ON(!ptep);
pte_unmap_unlock(ptep, ptl);
}
/*
 * devmem_is_allowed() checks to see if /dev/mem access to a certain address
 * is valid. The argument is a physical page number.
 *
 * On x86, access has to be given to the first megabyte of RAM because that
 * area traditionally contains BIOS code and data regions used by X, dosemu,
 * and similar apps.  Since they map the entire memory range, the whole range
 * must be allowed (for mapping), but any areas that would otherwise be
 * disallowed are flagged as being "zero filled" instead of rejected.
 * Access has to be given to non-kernel-ram areas as well, these contain the
 * PCI mmio resources as well as potential bios/acpi data regions.
 */
int devmem_is_allowed(unsigned long pagenr)
{
	if (region_intersects(PFN_PHYS(pagenr), PAGE_SIZE,
				IORESOURCE_SYSTEM_RAM, IORES_DESC_NONE)
			!= REGION_DISJOINT) {
		/*
		 * For disallowed memory regions in the low 1MB range,
		 * request that the page be shown as all zeros.
		 */
		if (pagenr < 256)
			return 2;
return 0;
}
	/*
	 * This must follow RAM test, since System RAM is considered a
	 * restricted resource under CONFIG_STRICT_DEVMEM.
	 */
	if (iomem_is_exclusive(pagenr << PAGE_SHIFT)) {
		/* Low 1MB bypasses iomem restrictions. */
		if (pagenr < 256)
			return 1;

		return 0;
	}

	return 1;
}

void free_init_pages(const char *what, unsigned long begin, unsigned long end)
{
	unsigned long begin_aligned, end_aligned;

	/* Make sure boundaries are page aligned */
	begin_aligned = PAGE_ALIGN(begin);
	end_aligned   = end & PAGE_MASK;
if (WARN_ON(begin_aligned != begin || end_aligned != end)) {
begin = begin_aligned;
end = end_aligned;
}
	if (begin >= end)
		return;

	/*
	 * If debugging page accesses then do not free this memory but
	 * mark them not present - any buggy init-section access will
	 * create a kernel page fault:
	 */
	if (debug_pagealloc_enabled()) {
pr_info("debug: unmapping init [mem %#010lx-%#010lx]\n",
			begin, end - 1);
		/*
		 * Inform kmemleak about the hole in the memory since the
		 * corresponding pages will be unmapped.
		 */
kmemleak_free_part((void *)begin, end - begin);
set_memory_np(begin, (end - begin) >> PAGE_SHIFT);
	} else {
		/*
		 * We just marked the kernel text read only above, now that
		 * we are going to free part of that, we need to make that
		 * writeable and non-executable first.
		 */
set_memory_nx(begin, (end - begin) >> PAGE_SHIFT);
set_memory_rw(begin, (end - begin) >> PAGE_SHIFT);

		free_reserved_area((void *)begin, (void *)end,
				   POISON_FREE_INITMEM, what);
	}
}

/*
 * begin/end can be in the direct map or the "high kernel mapping"
 * used for the kernel image only.  free_init_pages() will do the
 * right thing for either kind of address.
 */
void free_kernel_image_pages(const char *what, void *begin, void *end)
{
	unsigned long begin_ul = (unsigned long)begin;
	unsigned long end_ul = (unsigned long)end;
	unsigned long len_pages = (end_ul - begin_ul) >> PAGE_SHIFT;
free_init_pages(what, begin_ul, end_ul);
	/*
	 * PTI maps some of the kernel into userspace.  For performance,
	 * this includes some kernel areas that do not contain secrets.
	 * Those areas might be adjacent to the parts of the kernel image
	 * being freed, which may contain secrets.  Remove the "high kernel
	 * image mapping" for these freed areas, ensuring they are not even
	 * potentially vulnerable to Meltdown regardless of the specific
	 * optimizations PTI is currently using.
	 *
	 * The "noalias" prevents unmapping the direct map alias which is
	 * needed to access the freed pages.
	 *
	 * This is only valid for 64bit kernels. 32bit has only one mapping
	 * which can't be treated in this way for obvious reasons.
	 */
	if (IS_ENABLED(CONFIG_X86_64) && cpu_feature_enabled(X86_FEATURE_PTI))
set_memory_np_noalias(begin_ul, len_pages);
}
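
/*
 * Usage sketch (illustrative, not part of this excerpt): free_initmem() in
 * this same file is the typical caller, handing the init-section boundaries
 * to free_kernel_image_pages().  The function below is a simplified sketch
 * under that assumption, with a made-up name; it is not a verbatim copy of
 * the upstream free_initmem().
 */
void __ref free_initmem_example(void)
{
	/* Free the unused kernel image pages between __init_begin and __init_end. */
	free_kernel_image_pages("unused kernel image (initmem)",
				&__init_begin, &__init_end);
}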
#ifdef CONFIG_BLK_DEV_INITRD
void __init free_initrd_mem(unsigned long start, unsigned long end)
{
	/*
	 * end may not be page aligned, and we cannot align it here: the
	 * decompressor could be confused by an aligned initrd_end.  The
	 * partial end page was already reserved earlier in
	 *   - i386_start_kernel()
	 *   - x86_64_start_kernel()
	 *   - relocate_initrd()
	 * so it is safe to PAGE_ALIGN(end) here and free the partial page.
	 */
free_init_pages("initrd", start, PAGE_ALIGN(end));
}
#endif