// SPDX-License-Identifier: GPL-2.0
/*
 * kaslr.c
 *
 * This contains the routines needed to generate a reasonable level of
 * entropy to choose a randomized kernel base address offset in support
 * of Kernel Address Space Layout Randomization (KASLR). Additionally
 * handles walking the physical memory maps (and tracking memory regions
 * to avoid) in order to select a physical memory location that can
 * contain the entire properly aligned running kernel image.
 *
 */
/*
 * next_args() expects the isspace() from linux/ctype.h in order to filter
 * out "space/lf/tab". boot/ctype.h conflicts with linux/ctype.h because
 * isdigit() is implemented in both of them. Hence disable it here by
 * pre-defining BOOT_CTYPE_H (presumably the include guard of boot/ctype.h
 * -- confirm against that header).
 */
#define BOOT_CTYPE_H
/* Only supporting at most 4 unusable memmap regions with kaslr */
#define MAX_MEMMAP_REGIONS	4

/*
 * Checked before the memory entries are scanned: when true, the scan is
 * aborted because more memmap= arguments were given than are supported.
 */
static bool memmap_too_large;
/*
 * Store memory limit: MAXMEM on 64-bit and KERNEL_IMAGE_SIZE on 32-bit.
 * It may be reduced by "mem=nn[KMG]" or "memmap=nn[KMG]" command line options.
 *
 * A candidate physical address is rejected when the image placed there
 * would end above this limit.
 */
static u64 mem_limit;
/*
 * Number of immovable memory regions.
 *
 * Assigned in mem_avoid_init() from count_immovable_mem_regions(); while it
 * is zero, process_mem_region() hands the region to __process_mem_region()
 * directly.
 */
static int num_immovable_mem;
switch (*p) { case'#': case'$': case'!':
*start = memparse(p + 1, &p); return 0; case'@': /* * memmap=nn@ss specifies usable region, should * be skipped
*/
*size = 0;
fallthrough; default: /* * If w/o offset, only size specified, memmap=nn[KMG] has the * same behaviour as mem=nn[KMG]. It limits the max address * system can use. Region above the limit should be avoided.
*/
*start = 0; return 0;
}
if (!strcmp(p, "nopentium")) continue;
mem_size = memparse(p, &p); if (mem_size == 0) break;
if (mem_size < mem_limit)
mem_limit = mem_size;
}
}
free(tmp_cmdline); return;
}
/* * In theory, KASLR can put the kernel anywhere in the range of [16M, MAXMEM) * on 64-bit, and [16M, KERNEL_IMAGE_SIZE) on 32-bit. * * The mem_avoid array is used to store the ranges that need to be avoided * when KASLR searches for an appropriate random address. We must avoid any * regions that are unsafe to overlap with during decompression, and other * things like the initrd, cmdline and boot_params. This comment seeks to * explain mem_avoid as clearly as possible since incorrect mem_avoid * memory ranges lead to really hard to debug boot failures. * * The initrd, cmdline, and boot_params are trivial to identify for * avoiding. They are MEM_AVOID_INITRD, MEM_AVOID_CMDLINE, and * MEM_AVOID_BOOTPARAMS respectively below. * * What is not obvious how to avoid is the range of memory that is used * during decompression (MEM_AVOID_ZO_RANGE below). This range must cover * the compressed kernel (ZO) and its run space, which is used to extract * the uncompressed kernel (VO) and relocs. * * ZO's full run size sits against the end of the decompression buffer, so * we can calculate where text, data, bss, etc of ZO are positioned more * easily. * * For additional background, the decompression calculations can be found * in header.S, and the memory diagram is based on the one found in misc.c. * * The following conditions are already enforced by the image layouts and * associated code: * - input + input_size >= output + output_size * - kernel_total_size <= init_size * - kernel_total_size <= output_size (see Note below) * - output + init_size >= output + output_size * * (Note that kernel_total_size and output_size have no fundamental * relationship, but output_size is passed to choose_random_location * as a maximum of the two. The diagram is showing a case where * kernel_total_size is larger than output_size, but this case is * handled by bumping output_size.) 
* * The above conditions can be illustrated by a diagram: * * 0 output input input+input_size output+init_size * | | | | | * | | | | | * |-----|--------|--------|--------------|-----------|--|-------------| * | | | * | | | * output+init_size-ZO_INIT_SIZE output+output_size output+kernel_total_size * * [output, output+init_size) is the entire memory range used for * extracting the compressed image. * * [output, output+kernel_total_size) is the range needed for the * uncompressed kernel (VO) and its run size (bss, brk, etc). * * [output, output+output_size) is VO plus relocs (i.e. the entire * uncompressed payload contained by ZO). This is the area of the buffer * written to during decompression. * * [output+init_size-ZO_INIT_SIZE, output+init_size) is the worst-case * range of the copied ZO and decompression code. (i.e. the range * covered backwards of size ZO_INIT_SIZE, starting from output+init_size.) * * [input, input+input_size) is the original copied compressed image (ZO) * (i.e. it does not include its run size). This range must be avoided * because it contains the data used for decompression. * * [input+input_size, output+init_size) is [_text, _end) for ZO. This * range includes ZO's heap and stack, and must be avoided since it * performs the decompression. * * Since the above two ranges need to be avoided and they are adjacent, * they can be merged, resulting in: [input, output+init_size) which * becomes the MEM_AVOID_ZO_RANGE below.
*/ staticvoid mem_avoid_init(unsignedlong input, unsignedlong input_size, unsignedlong output)
{ unsignedlong init_size = boot_params_ptr->hdr.init_size;
u64 initrd_start, initrd_size; unsignedlong cmd_line, cmd_line_size;
/* * Avoid the region that is unsafe to overlap during * decompression.
*/
mem_avoid[MEM_AVOID_ZO_RANGE].start = input;
mem_avoid[MEM_AVOID_ZO_RANGE].size = (output + init_size) - input;
/* Avoid initrd. */
initrd_start = (u64)boot_params_ptr->ext_ramdisk_image << 32;
initrd_start |= boot_params_ptr->hdr.ramdisk_image;
initrd_size = (u64)boot_params_ptr->ext_ramdisk_size << 32;
initrd_size |= boot_params_ptr->hdr.ramdisk_size;
mem_avoid[MEM_AVOID_INITRD].start = initrd_start;
mem_avoid[MEM_AVOID_INITRD].size = initrd_size; /* No need to set mapping for initrd, it will be handled in VO. */
/* We don't need to set a mapping for setup_data. */
/* Mark the memmap regions we need to avoid */
handle_mem_options();
/* Enumerate the immovable memory regions */
num_immovable_mem = count_immovable_mem_regions();
}
/* * Does this memory vector overlap a known avoided area? If so, record the * overlap region with the lowest address.
*/ staticbool mem_avoid_overlap(struct mem_vector *img, struct mem_vector *overlap)
{ int i; struct setup_data *ptr;
u64 earliest = img->start + img->size; bool is_overlapping = false;
for (i = 0; i < MEM_AVOID_MAX; i++) { if (mem_overlaps(img, &mem_avoid[i]) &&
mem_avoid[i].start < earliest) {
*overlap = mem_avoid[i];
earliest = overlap->start;
is_overlapping = true;
}
}
/* Avoid all entries in the setup_data linked list. */
ptr = (struct setup_data *)(unsignedlong)boot_params_ptr->hdr.setup_data; while (ptr) { struct mem_vector avoid;
/* * Skip as many 1GB huge pages as possible in the passed region * according to the number which users specified:
*/ staticvoid
process_gb_huge_pages(struct mem_vector *region, unsignedlong image_size)
{
u64 pud_start, pud_end; unsignedlong gb_huge_pages; struct mem_vector tmp;
if (!IS_ENABLED(CONFIG_X86_64) || !max_gb_huge_pages) {
store_slot_info(region, image_size); return;
}
/* Are there any 1GB pages in the region? */
pud_start = ALIGN(region->start, PUD_SIZE);
pud_end = ALIGN_DOWN(region->start + region->size, PUD_SIZE);
/* No good 1GB huge pages found: */ if (pud_start >= pud_end) {
store_slot_info(region, image_size); return;
}
/* Check if the head part of the region is usable. */ if (pud_start >= region->start + image_size) {
tmp.start = region->start;
tmp.size = pud_start - region->start;
store_slot_info(&tmp, image_size);
}
/* Check if the tail part of the region is usable. */ if (region->start + region->size >= pud_end + image_size) {
tmp.start = pud_end;
tmp.size = region->start + region->size - pud_end;
store_slot_info(&tmp, image_size);
}
}
/* Give up if slot area array is full. */ while (slot_area_index < MAX_SLOT_AREA) { /* Potentially raise address to meet alignment needs. */
region.start = ALIGN(region.start, CONFIG_PHYSICAL_ALIGN);
/* Did we raise the address above the passed in memory entry? */ if (region.start > region_end) return;
/* Reduce size by any delta from the original address. */
region.size = region_end - region.start;
/* Return if region can't contain decompressed kernel */ if (region.size < image_size) return;
/* If nothing overlaps, store the region and return. */ if (!mem_avoid_overlap(®ion, &overlap)) {
process_gb_huge_pages(®ion, image_size); return;
}
/* Store beginning of region if holds at least image_size. */ if (overlap.start >= region.start + image_size) {
region.size = overlap.start - region.start;
process_gb_huge_pages(®ion, image_size);
}
/* Clip off the overlapping region and start over. */
region.start = overlap.start + overlap.size;
}
}
/*
 * Hand @region to __process_mem_region() when num_immovable_mem is zero and
 * report whether the slot area array has filled up.
 *
 * @region:     candidate memory range
 * @minimum:    passed through to __process_mem_region()
 * @image_size: passed through to __process_mem_region()
 *
 * The intended result appears to be true once slot_area_index has reached
 * MAX_SLOT_AREA (after printing a debug message) and false otherwise.
 *
 * NOTE(review): this copy of the function looks damaged. The keywords are
 * fused ("staticbool", "unsignedlong", "returntrue", "returnfalse"), the
 * preprocessor "endif" below has no visible matching "if", the local i is
 * never used, and nothing is done with @region when num_immovable_mem is
 * non-zero. Restore the body from a pristine copy before relying on it.
 */
staticbool process_mem_region(struct mem_vector *region, unsignedlong minimum, unsignedlong image_size)
{ int i; /* If no immovable memory found, or MEMORY_HOTREMOVE disabled, use @region directly.
*/ if (!num_immovable_mem) {
__process_mem_region(region, minimum, image_size);
if (slot_area_index == MAX_SLOT_AREA) {
debug_putstr("Aborted e820/efi memmap scan when walking immovable regions(slot_areas full)!\n"); returntrue;
}
} #endif returnfalse;
}
#ifdef CONFIG_EFI
/*
 * Only EFI_CONVENTIONAL_MEMORY and EFI_UNACCEPTED_MEMORY (if supported) are
 * guaranteed to be free.
 *
 * Pick free memory more conservatively than the EFI spec allows: according to
 * the spec, EFI_BOOT_SERVICES_{CODE|DATA} are also free memory and thus
 * available to place the kernel image into, but in practice there's firmware
 * where using that memory leads to crashes. Buggy vendor EFI code registers
 * for an event that triggers on SetVirtualAddressMap(). The handler assumes
 * that EFI_BOOT_SERVICES_DATA memory has not been touched by loader yet, which
 * is probably true for Windows.
 *
 * Preserve EFI_BOOT_SERVICES_* regions until after SetVirtualAddressMap().
 *
 * @md: EFI memory descriptor to classify
 *
 * Returns true if @md describes conventional memory, or unaccepted memory
 * when CONFIG_UNACCEPTED_MEMORY is enabled; false for every other type.
 */
static inline bool memory_type_is_free(efi_memory_desc_t *md)
{
	if (md->type == EFI_CONVENTIONAL_MEMORY)
		return true;

	if (IS_ENABLED(CONFIG_UNACCEPTED_MEMORY) &&
	    md->type == EFI_UNACCEPTED_MEMORY)
		return true;

	return false;
}
/* * Returns true if we processed the EFI memmap, which we prefer over the E820 * table if it is available.
*/ staticbool
process_efi_entries(unsignedlong minimum, unsignedlong image_size)
{ struct efi_info *e = &boot_params_ptr->efi_info; bool efi_mirror_found = false; struct mem_vector region;
efi_memory_desc_t *md; unsignedlong pmap; char *signature;
u32 nr_desc; int i;
#ifdef CONFIG_X86_32 /* Can't handle data above 4GB at this time */ if (e->efi_memmap_hi) {
warn("EFI memmap is above 4GB, can't be handled now on x86_32. EFI should be disabled.\n"); returnfalse;
}
pmap = e->efi_memmap; #else
pmap = (e->efi_memmap | ((__u64)e->efi_memmap_hi << 32)); #endif
nr_desc = e->efi_memmap_size / e->efi_memdesc_size; for (i = 0; i < nr_desc; i++) {
md = efi_early_memdesc_ptr(pmap, e->efi_memdesc_size, i); if (md->attribute & EFI_MEMORY_MORE_RELIABLE) {
efi_mirror_found = true; break;
}
}
for (i = 0; i < nr_desc; i++) {
md = efi_early_memdesc_ptr(pmap, e->efi_memdesc_size, i);
if (!memory_type_is_free(md)) continue;
if (efi_soft_reserve_enabled() &&
(md->attribute & EFI_MEMORY_SP)) continue;
if (efi_mirror_found &&
!(md->attribute & EFI_MEMORY_MORE_RELIABLE)) continue;
/* Verify potential e820 positions, appending to slots list. */ for (i = 0; i < boot_params_ptr->e820_entries; i++) {
entry = &boot_params_ptr->e820_table[i]; /* Skip non-RAM entries. */ if (entry->type != E820_TYPE_RAM) continue;
region.start = entry->addr;
region.size = entry->size; if (process_mem_region(®ion, minimum, image_size)) break;
}
}
/* * If KHO is active, only process its scratch areas to ensure we are not * stepping onto preserved memory.
*/ staticbool process_kho_entries(unsignedlong minimum, unsignedlong image_size)
{ struct kho_scratch *kho_scratch; struct setup_data *ptr; struct kho_data *kho; int i, nr_areas = 0;
if (!IS_ENABLED(CONFIG_KEXEC_HANDOVER)) returnfalse;
/* Bail out early if it's impossible to succeed. */ if (minimum + image_size > mem_limit) return 0;
/* Check if we had too many memmaps. */ if (memmap_too_large) {
debug_putstr("Aborted memory entries scan (more than 4 memmap= args)!\n"); return 0;
}
/* * During kexec handover only process KHO scratch areas that are known * not to contain any data that must be preserved.
*/ if (!process_kho_entries(minimum, image_size) &&
!process_efi_entries(minimum, image_size))
process_e820_entries(minimum, image_size);
phys_addr = slots_fetch_random();
/* Perform a final check to make sure the address is in range. */ if (phys_addr < minimum || phys_addr + image_size > mem_limit) {
warn("Invalid physical address chosen!\n"); return 0;
}
/* * There are how many CONFIG_PHYSICAL_ALIGN-sized slots * that can hold image_size within the range of minimum to * KERNEL_IMAGE_SIZE?
*/
slots = 1 + (KERNEL_IMAGE_SIZE - minimum - image_size) / CONFIG_PHYSICAL_ALIGN;
/* * Since this function examines addresses much more numerically, * it takes the input and output pointers as 'unsigned long'.
*/ void choose_random_location(unsignedlong input, unsignedlong input_size, unsignedlong *output, unsignedlong output_size, unsignedlong *virt_addr)
{ unsignedlong random_addr, min_addr;
if (cmdline_find_option_bool("nokaslr")) {
warn("KASLR disabled: 'nokaslr' on cmdline."); return;
}
boot_params_ptr->hdr.loadflags |= KASLR_FLAG;
if (IS_ENABLED(CONFIG_X86_32))
mem_limit = KERNEL_IMAGE_SIZE; else
mem_limit = MAXMEM;
/* Record the various known unsafe memory ranges. */
mem_avoid_init(input, input_size, *output);
/* * Low end of the randomization range should be the * smaller of 512M or the initial kernel image * location:
*/
min_addr = min(*output, 512UL << 20); /* Make sure minimum is aligned. */
min_addr = ALIGN(min_addr, CONFIG_PHYSICAL_ALIGN);
/* Walk available memory entries to find a random address. */
random_addr = find_random_phys_addr(min_addr, output_size); if (!random_addr) {
warn("Physical KASLR disabled: no suitable memory region!");
} else { /* Update the new physical address location. */ if (*output != random_addr)
*output = random_addr;
}
/* Pick random virtual address starting from LOAD_PHYSICAL_ADDR. */ if (IS_ENABLED(CONFIG_X86_64))
random_addr = find_random_virt_addr(LOAD_PHYSICAL_ADDR, output_size);
*virt_addr = random_addr;
}
/*
 * Measurement V0.5
 * Processing time: 0.2 seconds (preprocessed)
 *
 * The information on this web page was compiled carefully and to the best
 * of our knowledge. However, neither completeness, nor correctness, nor
 * quality of the information provided is guaranteed.
 *
 * Note: The colored syntax highlighting and the measurement are still
 * experimental.
 */