/*
 * Serialize cpa() (for !DEBUG_PAGEALLOC which uses large identity mappings)
 * using cpa_lock. So that we don't allow any other cpu, with stale large tlb
 * entries change the page attribute in parallel to some other cpu
 * splitting a large page entry along with changing the attribute.
 */
static DEFINE_SPINLOCK(cpa_lock);

/* Bits for struct cpa_data ->flags / the in_flag argument: */
#define CPA_FLUSHTLB 1			/* set once PTEs changed; a TLB flush is needed */
#define CPA_ARRAY 2			/* ->vaddr points to an array of addresses */
#define CPA_PAGES_ARRAY 4		/* operate on an array of struct page (already aligned) */
#define CPA_NO_CHECK_ALIAS 8		/* Do not search for aliases */
#define CPA_COLLAPSE 16			/* try to collapse large pages */
/*
 * The kernel image is mapped into two places in the virtual address space
 * (addresses without KASLR, of course):
 *
 *	1. The kernel direct map (0xffff880000000000)
 *	2. The "high kernel map" (0xffffffff81000000)
 *
 * We actually execute out of #2. If we get the address of a kernel symbol, it
 * points to #2, but almost all physical-to-virtual translations point to #1.
 *
 * This is so that we can have both a directmap of all physical memory *and*
 * take full advantage of the limited (s32) immediate addressing range (2G)
 * of x86_64.
 *
 * See Documentation/arch/x86/x86_64/mm.rst for more detail.
 */

/*
 * Last PFN of the high kernel mapping: the image end (_brk_end) rounded
 * up to a PMD boundary, so the whole large page covering the end of the
 * image is included.
 */
static inline unsigned long highmap_end_pfn(void)
{
	/* Do not reference physical address outside the kernel. */
	return __pa_symbol(roundup(_brk_end, PMD_SIZE) - 1) >> PAGE_SHIFT;
}
/*
 * Return true if @pfn falls inside the high kernel mapping alias of the
 * kernel image, i.e. within [highmap_start_pfn(), highmap_end_pfn()].
 */
static bool __cpa_pfn_in_highmap(unsigned long pfn)
{
	/*
	 * Kernel text has an alias mapping at a high address, known
	 * here as "highmap".
	 */
	return within_inclusive(pfn, highmap_start_pfn(), highmap_end_pfn());
}
#else
/* No PFN can ever be part of the (non-existent) 32-bit highmap alias. */
static bool __cpa_pfn_in_highmap(unsigned long pfn)
{
	/* There is no highmap on 32-bit */
	return false;
}
#endif
/*
 * See set_mce_nospec().
 *
 * Machine check recovery code needs to change the cache mode of poisoned
 * pages to UC to avoid speculative access logging another error. But
 * passing the address of the 1:1 mapping to set_memory_uc() is a fine way
 * to encourage exactly such a speculative access, so set_mce_nospec()
 * cheats and flips the top bit of the address. The page table update code
 * copes with that, but the resulting non-canonical address raises #GP when
 * fed to INVLPG or CLFLUSH during the final TLB/cache flush.
 *
 * Canonicalize the address again: shift the top bit out and sign-extend
 * from bit 62. An already-canonical address passes through unchanged.
 */
static inline unsigned long fix_addr(unsigned long addr)
{
#ifdef CONFIG_X86_64
	return (long)(addr << 1) >> 1;
#else
	return addr;
#endif
}
for (; p < vend; p += clflush_size)
clflushopt(p);
}
/**
 * clflush_cache_range - flush a cache range with clflush
 * @vaddr:	virtual start address
 * @size:	number of bytes to flush
 *
 * CLFLUSHOPT is an unordered instruction which needs fencing with MFENCE or
 * SFENCE to avoid ordering issues.
 */
void clflush_cache_range(void *vaddr, unsigned int size)
{
	/* Full barrier before: order the flushes after prior accesses. */
	mb();
	clflush_cache_range_opt(vaddr, size);
	/* Full barrier after: make the flushes visible before continuing. */
	mb();
}
EXPORT_SYMBOL_GPL(clflush_cache_range);
#ifdef CONFIG_PCI_BIOS
/*
 * The BIOS area between 640k and 1Mb needs to be executable for PCI BIOS
 * based config access (CONFIG_PCI_GOBIOS) support.
 */
/* First and last (inclusive) PFN of the BIOS region. */
#define BIOS_PFN	PFN_DOWN(BIOS_BEGIN)
#define BIOS_PFN_END	PFN_DOWN(BIOS_END - 1)
/* * The .rodata section needs to be read-only. Using the pfn catches all * aliases. This also includes __ro_after_init, so do not enforce until * kernel_set_to_readonly is true.
*/ static pgprotval_t protect_rodata(unsignedlong spfn, unsignedlong epfn)
{ unsignedlong epfn_ro, spfn_ro = PFN_DOWN(__pa_symbol(__start_rodata));
/* * Note: __end_rodata is at page aligned and not inclusive, so * subtract 1 to get the last enforced PFN in the rodata area.
*/
epfn_ro = PFN_DOWN(__pa_symbol(__end_rodata)) - 1;
/* * Protect kernel text against becoming non executable by forbidding * _PAGE_NX. This protects only the high kernel mapping (_text -> _etext) * out of which the kernel actually executes. Do not protect the low * mapping. * * This does not cover __inittext since that is gone after boot.
*/ static pgprotval_t protect_kernel_text(unsignedlong start, unsignedlong end)
{ unsignedlong t_end = (unsignedlong)_etext - 1; unsignedlong t_start = (unsignedlong)_text;
if (overlaps(start, end, t_start, t_end)) return _PAGE_NX; return 0;
}
#ifdefined(CONFIG_X86_64) /* * Once the kernel maps the text as RO (kernel_set_to_readonly is set), * kernel text mappings for the large page aligned text, rodata sections * will be always read-only. For the kernel identity mappings covering the * holes caused by this alignment can be anything that user asks. * * This will preserve the large page mappings for kernel text/data at no * extra cost.
*/ static pgprotval_t protect_kernel_text_ro(unsignedlong start, unsignedlong end)
{ unsignedlong t_end = (unsignedlong)__end_rodata_hpage_align - 1; unsignedlong t_start = (unsignedlong)_text; unsignedint level;
if (!kernel_set_to_readonly || !overlaps(start, end, t_start, t_end)) return 0; /* * Don't enforce the !RW mapping for the kernel text mapping, if * the current mapping is already using small page mapping. No * need to work hard to preserve large page mappings in this case. * * This also fixes the Linux Xen paravirt guest boot failure caused * by unexpected read-only mappings for kernel identity * mappings. In this paravirt guest case, the kernel text mapping * and the kernel identity mapping share the same page-table pages, * so the protections for kernel text and identity mappings have to * be the same.
*/ if (lookup_address(start, &level) && (level != PG_LEVEL_4K)) return _PAGE_RW; return 0;
} #else static pgprotval_t protect_kernel_text_ro(unsignedlong start, unsignedlong end)
{ return 0;
} #endif
/* * Certain areas of memory on x86 require very specific protection flags, * for example the BIOS area or kernel text. Callers don't always get this * right (again, ioremap() on BIOS memory is not uncommon) so this function * checks and fixes these known static required protection bits.
*/ staticinline pgprot_t static_protections(pgprot_t prot, unsignedlong start, unsignedlong pfn, unsignedlong npg, unsignedlong lpsize, int warnlvl)
{
pgprotval_t forbidden, res; unsignedlong end;
/* * There is no point in checking RW/NX conflicts when the requested * mapping is setting the page !PRESENT.
*/ if (!(pgprot_val(prot) & _PAGE_PRESENT)) return prot;
/* Operate on the virtual address */
end = start + npg * PAGE_SIZE - 1;
/* * Special case to preserve a large page. If the change spawns the * full large page mapping then there is no point to split it * up. Happens with ftrace and is going to be removed once ftrace * switched to text_poke().
*/ if (lpsize != (npg * PAGE_SIZE) || (start & (lpsize - 1))) {
res = protect_kernel_text_ro(start, end);
check_conflict(warnlvl, prot, res, start, end, pfn, "Text RO");
forbidden |= res;
}
/* * 32-bit has some unfixable W+X issues, like EFI code * and writeable data being in the same page. Disable * detection and enforcement there.
*/ if (IS_ENABLED(CONFIG_X86_32)) returnnew;
/* Only verify when NX is supported: */ if (!(__supported_pte_mask & _PAGE_NX)) returnnew;
if (!((pgprot_val(old) ^ pgprot_val(new)) & (_PAGE_RW | _PAGE_NX))) returnnew;
if ((pgprot_val(new) & (_PAGE_RW | _PAGE_NX)) != _PAGE_RW) returnnew;
/* Non-leaf translation entries can disable writing or execution. */ if (!rw || nx) returnnew;
/* * For now, allow all permission change attempts by returning the * attempted permissions. This can 'return old' to actively * refuse the permission change at a later time.
*/ returnnew;
}
/* * Lookup the page table entry for a virtual address in a specific pgd. * Return a pointer to the entry (or NULL if the entry does not exist), * the level of the entry, and the effective NX and RW bits of all * page table levels.
*/
pte_t *lookup_address_in_pgd_attr(pgd_t *pgd, unsignedlong address, unsignedint *level, bool *nx, bool *rw)
{
p4d_t *p4d;
pud_t *pud;
pmd_t *pmd;
/* * Lookup the page table entry for a virtual address in a specific pgd. * Return a pointer to the entry and the level of the mapping.
*/
pte_t *lookup_address_in_pgd(pgd_t *pgd, unsignedlong address, unsignedint *level)
{ bool nx, rw;
/*
 * Lookup the page table entry for a virtual address. Return a pointer
 * to the entry and the level of the mapping.
 *
 * Note: the function returns p4d, pud or pmd either when the entry is marked
 * large or when the present bit is not set. Otherwise it returns NULL.
 */
pte_t *lookup_address(unsigned long address, unsigned int *level)
{
	/* Walk the kernel page tables, starting from init_mm's pgd. */
	return lookup_address_in_pgd(pgd_offset_k(address), address, level);
}
EXPORT_SYMBOL_GPL(lookup_address);
/* * Lookup the PMD entry for a virtual address. Return a pointer to the entry * or NULL if not present.
*/
pmd_t *lookup_pmd_address(unsignedlong address)
{
pgd_t *pgd;
p4d_t *p4d;
pud_t *pud;
pgd = pgd_offset_k(address); if (pgd_none(*pgd)) return NULL;
/* * This is necessary because __pa() does not work on some * kinds of memory, like vmalloc() or the alloc_remap() * areas on 32-bit NUMA systems. The percpu areas can * end up in this kind of memory, for instance. * * Note that as long as the PTEs are well-formed with correct PFNs, this * works without checking the PRESENT bit in the leaf PTE. This is unlike * the similar vmalloc_to_page() and derivatives. Callers may depend on * this behavior. * * This could be optimized, but it is only used in paths that are not perf * sensitive, and keeping it unoptimized should increase the testing coverage * for the more obscure platforms.
*/
phys_addr_t slow_virt_to_phys(void *__virt_addr)
{ unsignedlong virt_addr = (unsignedlong)__virt_addr;
phys_addr_t phys_addr; unsignedlong offset; enum pg_level level;
pte_t *pte;
/* * Set the new pmd in all the pgds we know about:
*/ staticvoid __set_pmd_pte(pte_t *kpte, unsignedlong address, pte_t pte)
{ /* change init_mm */
set_pte_atomic(kpte, pte); #ifdef CONFIG_X86_32
{ struct page *page;
static pgprot_t pgprot_clear_protnone_bits(pgprot_t prot)
{ /* * _PAGE_GLOBAL means "global page" for present PTEs. * But, it is also used to indicate _PAGE_PROTNONE * for non-present PTEs. * * This ensures that a _PAGE_GLOBAL PTE going from * present to non-present is not confused as * _PAGE_PROTNONE.
*/ if (!(pgprot_val(prot) & _PAGE_PRESENT))
pgprot_val(prot) &= ~_PAGE_GLOBAL;
/* * Check for races, another CPU might have split this page * up already:
*/
tmp = _lookup_address_cpa(cpa, address, &level, &nx, &rw); if (tmp != kpte) return 1;
/* * Calculate the number of pages, which fit into this large * page starting at address:
*/
lpaddr = (address + psize) & pmask;
numpages = (lpaddr - address) >> PAGE_SHIFT; if (numpages < cpa->numpages)
cpa->numpages = numpages;
/* * We are safe now. Check whether the new pgprot is the same: * Convert protection attributes to 4k-format, as cpa->mask* are set * up accordingly.
*/
/* Clear PSE (aka _PAGE_PAT) and move PAT bit to correct position */
req_prot = pgprot_large_2_4k(old_prot);
/* * req_prot is in format of 4k pages. It must be converted to large * page format: the caching mode includes the PAT bit located at * different bit positions in the two formats.
*/
req_prot = pgprot_4k_2_large(req_prot);
req_prot = pgprot_clear_protnone_bits(req_prot); if (pgprot_val(req_prot) & _PAGE_PRESENT)
pgprot_val(req_prot) |= _PAGE_PSE;
/* * old_pfn points to the large page base pfn. So we need to add the * offset of the virtual address:
*/
pfn = old_pfn + ((address & (psize - 1)) >> PAGE_SHIFT);
cpa->pfn = pfn;
/* * Calculate the large page base address and the number of 4K pages * in the large page
*/
lpaddr = address & pmask;
numpages = psize >> PAGE_SHIFT;
/* * Sanity check that the existing mapping is correct versus the static * protections. static_protections() guards against !PRESENT, so no * extra conditional required here.
*/
chk_prot = static_protections(old_prot, lpaddr, old_pfn, numpages,
psize, CPA_CONFLICT);
if (WARN_ON_ONCE(pgprot_val(chk_prot) != pgprot_val(old_prot))) { /* * Split the large page and tell the split code to * enforce static protections.
*/
cpa->force_static_prot = 1; return 1;
}
/* * Optimization: If the requested pgprot is the same as the current * pgprot, then the large page can be preserved and no updates are * required independent of alignment and length of the requested * range. The above already established that the current pgprot is * correct, which in consequence makes the requested pgprot correct * as well if it is the same. The static protection scan below will * not come to a different conclusion.
*/ if (pgprot_val(req_prot) == pgprot_val(old_prot)) {
cpa_inc_lp_sameprot(level); return 0;
}
/* * If the requested range does not cover the full page, split it up
*/ if (address != lpaddr || cpa->numpages != numpages) return 1;
/* * Check whether the requested pgprot is conflicting with a static * protection requirement in the large page.
*/
new_prot = static_protections(req_prot, lpaddr, old_pfn, numpages,
psize, CPA_DETECT);
/* * If there is a conflict, split the large page. * * There used to be a 4k wise evaluation trying really hard to * preserve the large pages, but experimentation has shown, that this * does not help at all. There might be corner cases which would * preserve one large page occasionally, but it's really not worth the * extra code and cycles for the common case.
*/ if (pgprot_val(req_prot) != pgprot_val(new_prot)) return 1;
/* All checks passed. Update the large page mapping. */
new_pte = pfn_pte(old_pfn, new_prot);
__set_pmd_pte(kpte, address, new_pte);
cpa->flags |= CPA_FLUSHTLB;
cpa_inc_lp_preserved(level); return 0;
}
/* * If should_split_large_page() discovered an inconsistent mapping, * remove the invalid protection in the split mapping.
*/ if (!cpa->force_static_prot) goto set;
/* Hand in lpsize = 0 to enforce the protection mechanism */
prot = static_protections(ref_prot, address, pfn, npg, 0, CPA_PROTECT);
if (pgprot_val(prot) == pgprot_val(ref_prot)) goto set;
/* * If this is splitting a PMD, fix it up. PUD splits cannot be * fixed trivially as that would require to rescan the newly * installed PMD mappings after returning from split_large_page() * so an eventual further split can allocate the necessary PTE * pages. Warn for now and revisit it in case this actually * happens.
*/ if (size == PAGE_SIZE)
ref_prot = prot; else
pr_warn_once("CPA: Cannot fixup static protections for PUD split\n");
set:
set_pte(pte, pfn_pte(pfn, ref_prot));
}
spin_lock(&pgd_lock); /* * Check for races, another CPU might have split this page * up for us already:
*/
tmp = _lookup_address_cpa(cpa, address, &level, &nx, &rw); if (tmp != kpte) {
spin_unlock(&pgd_lock); return 1;
}
paravirt_alloc_pte(&init_mm, page_to_pfn(base));
switch (level) { case PG_LEVEL_2M:
ref_prot = pmd_pgprot(*(pmd_t *)kpte); /* * Clear PSE (aka _PAGE_PAT) and move * PAT bit to correct position.
*/
ref_prot = pgprot_large_2_4k(ref_prot);
ref_pfn = pmd_pfn(*(pmd_t *)kpte);
lpaddr = address & PMD_MASK;
lpinc = PAGE_SIZE; break;
case PG_LEVEL_1G:
ref_prot = pud_pgprot(*(pud_t *)kpte);
ref_pfn = pud_pfn(*(pud_t *)kpte);
pfninc = PMD_SIZE >> PAGE_SHIFT;
lpaddr = address & PUD_MASK;
lpinc = PMD_SIZE; /* * Clear the PSE flags if the PRESENT flag is not set * otherwise pmd_present() will return true even on a non * present pmd.
*/ if (!(pgprot_val(ref_prot) & _PAGE_PRESENT))
pgprot_val(ref_prot) &= ~_PAGE_PSE; break;
default:
spin_unlock(&pgd_lock); return 1;
}
ref_prot = pgprot_clear_protnone_bits(ref_prot);
/* * Get the target pfn from the original entry:
*/
pfn = ref_pfn; for (i = 0; i < PTRS_PER_PTE; i++, pfn += pfninc, lpaddr += lpinc)
split_set_pte(cpa, pbase + i, pfn, ref_prot, lpaddr, lpinc);
if (virt_addr_valid(address)) { unsignedlong pfn = PFN_DOWN(__pa(address));
if (pfn_range_is_mapped(pfn, pfn + 1))
split_page_count(level);
}
/* * Install the new, split up pagetable. * * We use the standard kernel pagetable protections for the new * pagetable protections, the actual ptes set above control the * primary protection behavior:
*/
__set_pmd_pte(kpte, address, mk_pte(base, __pgprot(_KERNPG_TABLE)));
/* * Do a global flush tlb after splitting the large page * and before we do the actual change page attribute in the PTE. * * Without this, we violate the TLB application note, that says: * "The TLBs may contain both ordinary and large-page * translations for a 4-KByte range of linear addresses. This * may occur if software modifies the paging structures so that * the page size used for the address range changes. If the two * translations differ with respect to page frame or attributes * (e.g., permissions), processor behavior is undefined and may * be implementation-specific." * * We do this global tlb flush inside the cpa_lock, so that we * don't allow any other cpu, with stale tlb entries change the * page attribute in parallel, that also falls into the * just split large page entry.
*/
flush_tlb_all();
spin_unlock(&pgd_lock);
if (!debug_pagealloc_enabled())
spin_unlock(&cpa_lock);
base = alloc_pages(GFP_KERNEL, 0); if (!debug_pagealloc_enabled())
spin_lock(&cpa_lock); if (!base) return -ENOMEM;
if (__split_large_page(cpa, kpte, address, base))
__free_page(base);
/* Make sure alignment is suitable */ if (PFN_PHYS(pfn) & ~PMD_MASK) return 0;
/* The page is 4k intentionally */ if (pte_flags(first) & _PAGE_KERNEL_4K) return 0;
/* Check that the rest of PTEs are compatible with the first one */ for (i = 1, pte++; i < PTRS_PER_PTE; i++, pte++) {
pte_t entry = *pte;
if (!pte_present(entry)) return 0; if (pte_flags(entry) != pte_flags(first)) return 0; if (pte_pfn(entry) != pte_pfn(first) + i) return 0;
}
old_pmd = *pmd;
/* Success: set up a large page */
pgprot = pgprot_4k_2_large(pte_pgprot(first));
pgprot_val(pgprot) |= _PAGE_PSE;
_pmd = pfn_pmd(pfn, pgprot);
set_pmd(pmd, _pmd);
/* Queue the page table to be freed after TLB flush */
list_add(&page_ptdesc(pmd_page(old_pmd))->pt_list, pgtables);
if (IS_ENABLED(CONFIG_X86_32)) { struct page *page;
/* Update all PGD tables to use the same large page */
list_for_each_entry(page, &pgd_list, lru) {
pgd_t *pgd = (pgd_t *)page_address(page) + pgd_index(addr);
p4d_t *p4d = p4d_offset(pgd, addr);
pud_t *pud = pud_offset(p4d, addr);
pmd_t *pmd = pmd_offset(pud, addr); /* Something is wrong if entries doesn't match */ if (WARN_ON(pmd_val(old_pmd) != pmd_val(*pmd))) continue;
set_pmd(pmd, _pmd);
}
}
if (virt_addr_valid(addr) && pfn_range_is_mapped(pfn, pfn + 1))
collapse_page_count(PG_LEVEL_2M);
addr &= PUD_MASK;
pmd = pmd_offset(pud, addr);
first = *pmd;
/* * To restore PUD page all PMD entries must be large and * have suitable alignment
*/
pfn = pmd_pfn(first); if (!pmd_leaf(first) || (PFN_PHYS(pfn) & ~PUD_MASK)) return 0;
/* * To restore PUD page, all following PMDs must be compatible with the * first one.
*/ for (i = 1, pmd++; i < PTRS_PER_PMD; i++, pmd++) {
pmd_t entry = *pmd;
if (!pmd_present(entry) || !pmd_leaf(entry)) return 0; if (pmd_flags(entry) != pmd_flags(first)) return 0; if (pmd_pfn(entry) != pmd_pfn(first) + i * PTRS_PER_PTE) return 0;
}
/* Restore PUD page and queue page table to be freed after TLB flush */
list_add(&page_ptdesc(pud_page(*pud))->pt_list, pgtables);
set_pud(pud, pfn_pud(pfn, pmd_pgprot(first)));
if (virt_addr_valid(addr) && pfn_range_is_mapped(pfn, pfn + 1))
collapse_page_count(PG_LEVEL_1G);
return 1;
}
/* * Collapse PMD and PUD pages in the kernel mapping around the address where * possible. * * Caller must flush TLB and free page tables queued on the list before * touching the new entries. CPU must not see TLB entries of different size * with different attributes.
*/ staticint collapse_large_pages(unsignedlong addr, struct list_head *pgtables)
{ int collapsed = 0;
pgd_t *pgd;
p4d_t *p4d;
pud_t *pud;
pmd_t *pmd;
addr &= PMD_MASK;
spin_lock(&pgd_lock);
pgd = pgd_offset_k(addr); if (pgd_none(*pgd)) goto out;
p4d = p4d_offset(pgd, addr); if (p4d_none(*p4d)) goto out;
pud = pud_offset(p4d, addr); if (!pud_present(*pud) || pud_leaf(*pud)) goto out;
pmd = pmd_offset(pud, addr); if (!pmd_present(*pmd) || pmd_leaf(*pmd)) goto out;
/* * Not on a Gb page boundary? => map everything up to it with * smaller pages.
*/ if (start & (PUD_SIZE - 1)) { unsignedlong pre_end; unsignedlong next_page = (start + PUD_SIZE) & PUD_MASK;
/* * Allocate a PUD page and hand it down for mapping.
*/
p4d = p4d_offset(pgd_entry, addr); if (p4d_none(*p4d)) {
pud = (pud_t *)get_zeroed_page(GFP_KERNEL); if (!pud) return -1;
ret = populate_pud(cpa, addr, p4d, pgprot); if (ret < 0) { /* * Leave the PUD page in place in case some other CPU or thread * already found it, but remove any useless entries we just * added to it.
*/
unmap_pud_range(p4d, addr,
addr + (cpa->numpages << PAGE_SHIFT)); return ret;
}
cpa->numpages = ret; return 0;
}
staticint __cpa_process_fault(struct cpa_data *cpa, unsignedlong vaddr, int primary)
{ if (cpa->pgd) { /* * Right now, we only execute this code path when mapping * the EFI virtual memory map regions, no other users * provide a ->pgd value. This may change in the future.
*/ return populate_pgd(cpa, vaddr);
}
/* * Ignore all non primary paths.
*/ if (!primary) {
cpa->numpages = 1; return 0;
}
/* * Ignore the NULL PTE for kernel identity mapping, as it is expected * to have holes. * Also set numpages to '1' indicating that we processed cpa req for * one virtual address page and its pfn. TBD: numpages can be set based * on the initial value and the level returned by lookup_address().
*/ if (within(vaddr, PAGE_OFFSET,
PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT))) {
cpa->numpages = 1;
cpa->pfn = __pa(vaddr) >> PAGE_SHIFT; return 0;
} elseif (__cpa_pfn_in_highmap(cpa->pfn)) { /* Faults in the highmap are OK, so do not warn: */ return -EFAULT;
} else {
WARN(1, KERN_WARNING "CPA: called for zero pte. " "vaddr = %lx cpa->vaddr = %lx\n", vaddr,
*cpa->vaddr);
return -EFAULT;
}
}
staticint __change_page_attr(struct cpa_data *cpa, int primary)
{ unsignedlong address; int do_split, err; unsignedint level;
pte_t *kpte, old_pte; bool nx, rw;
/* * We need to keep the pfn from the existing PTE, * after all we're only going to change its attributes * not the memory it points to
*/
new_pte = pfn_pte(pfn, new_prot);
cpa->pfn = pfn; /* * Do we really change anything ?
*/ if (pte_val(old_pte) != pte_val(new_pte)) {
set_pte_atomic(kpte, new_pte);
cpa->flags |= CPA_FLUSHTLB;
}
cpa->numpages = 1; return 0;
}
/* * Check, whether we can keep the large page intact * and just change the pte:
*/
do_split = should_split_large_page(kpte, address, cpa); /* * When the range fits into the existing large page, * return. cp->numpages and cpa->tlbflush have been updated in * try_large_page:
*/ if (do_split <= 0) return do_split;
/* * We have to split the large page:
*/
err = split_large_page(cpa, kpte, address); if (!err) goto repeat;
return err;
}
staticint __change_page_attr_set_clr(struct cpa_data *cpa, int primary);
/* * Check the directmap and "high kernel map" 'aliases'.
*/ staticint cpa_process_alias(struct cpa_data *cpa)
{ struct cpa_data alias_cpa; unsignedlong laddr = (unsignedlong)__va(cpa->pfn << PAGE_SHIFT); unsignedlong vaddr; int ret;
if (!pfn_range_is_mapped(cpa->pfn, cpa->pfn + 1)) return 0;
/* * No need to redo, when the primary call touched the direct * mapping already:
*/
vaddr = __cpa_addr(cpa, cpa->curpage); if (!(within(vaddr, PAGE_OFFSET,
PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT)))) {
/* Directmap always has NX set, do not modify. */ if (__supported_pte_mask & _PAGE_NX) {
alias_cpa.mask_clr.pgprot &= ~_PAGE_NX;
alias_cpa.mask_set.pgprot &= ~_PAGE_NX;
}
cpa->force_flush_all = 1;
ret = __change_page_attr_set_clr(&alias_cpa, 0); if (ret) return ret;
}
#ifdef CONFIG_X86_64 /* * If the primary call didn't touch the high mapping already * and the physical address is inside the kernel map, we need * to touch the high mapped kernel as well:
*/ if (!within(vaddr, (unsignedlong)_text, _brk_end) &&
__cpa_pfn_in_highmap(cpa->pfn)) { unsignedlong temp_cpa_vaddr = (cpa->pfn << PAGE_SHIFT) +
__START_KERNEL_map - phys_base;
alias_cpa = *cpa;
alias_cpa.vaddr = &temp_cpa_vaddr;
alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY);
alias_cpa.curpage = 0;
/* * [_text, _brk_end) also covers data, do not modify NX except * in cases where the highmap is the primary target.
*/ if (__supported_pte_mask & _PAGE_NX) {
alias_cpa.mask_clr.pgprot &= ~_PAGE_NX;
alias_cpa.mask_set.pgprot &= ~_PAGE_NX;
}
cpa->force_flush_all = 1; /* * The high mapping range is imprecise, so ignore the * return value.
*/
__change_page_attr_set_clr(&alias_cpa, 0);
} #endif
return 0;
}
/*
 * Core loop of a CPA operation: walk the requested range page (or large
 * page) wise, apply the attribute change, and process directmap/highmap
 * aliases on primary passes. Returns 0 on success or the negative error
 * from the per-page helpers; ->numpages is restored on exit.
 */
static int __change_page_attr_set_clr(struct cpa_data *cpa, int primary)
{
	unsigned long numpages = cpa->numpages;
	unsigned long rempages = numpages;
	int ret = 0;

	/*
	 * No changes, easy!
	 */
	if (!(pgprot_val(cpa->mask_set) | pgprot_val(cpa->mask_clr)) &&
	    !cpa->force_split)
		return ret;

	while (rempages) {
		/*
		 * Store the remaining nr of pages for the large page
		 * preservation check.
		 */
		cpa->numpages = rempages;
		/* for array changes, we can't use large page */
		if (cpa->flags & (CPA_ARRAY | CPA_PAGES_ARRAY))
			cpa->numpages = 1;

		/* cpa_lock serializes against large page splits (see top of file) */
		if (!debug_pagealloc_enabled())
			spin_lock(&cpa_lock);
		ret = __change_page_attr(cpa, primary);
		if (!debug_pagealloc_enabled())
			spin_unlock(&cpa_lock);
		if (ret)
			goto out;

		if (primary && !(cpa->flags & CPA_NO_CHECK_ALIAS)) {
			ret = cpa_process_alias(cpa);
			if (ret)
				goto out;
		}

		/*
		 * Adjust the number of pages with the result of the
		 * CPA operation. Either a large page has been
		 * preserved or a single page update happened.
		 */
		BUG_ON(cpa->numpages > rempages || !cpa->numpages);
		rempages -= cpa->numpages;
		cpa->curpage += cpa->numpages;
	}

out:
	/* Restore the original numpages */
	cpa->numpages = numpages;
	return ret;
}
staticint change_page_attr_set_clr(unsignedlong *addr, int numpages,
pgprot_t mask_set, pgprot_t mask_clr, int force_split, int in_flag, struct page **pages)
{ struct cpa_data cpa; int ret, cache;
memset(&cpa, 0, sizeof(cpa));
/* * Check, if we are requested to set a not supported * feature. Clearing non-supported features is OK.
*/
mask_set = canon_pgprot(mask_set);
if (!pgprot_val(mask_set) && !pgprot_val(mask_clr) && !force_split) return 0;
/* Ensure we are PAGE_SIZE aligned */ if (in_flag & CPA_ARRAY) { int i; for (i = 0; i < numpages; i++) { if (addr[i] & ~PAGE_MASK) {
addr[i] &= PAGE_MASK;
WARN_ON_ONCE(1);
}
}
} elseif (!(in_flag & CPA_PAGES_ARRAY)) { /* * in_flag of CPA_PAGES_ARRAY implies it is aligned. * No need to check in that case
*/ if (*addr & ~PAGE_MASK) {
*addr &= PAGE_MASK; /* * People should not be passing in unaligned addresses:
*/
WARN_ON_ONCE(1);
}
}
/* Must avoid aliasing mappings in the highmem code */
kmap_flush_unused();
/*
 * __set_memory_prot is an internal helper for callers that have been passed
 * a pgprot_t value from upper layers and a reservation has already been taken.
 * If you want to set the pgprot to a specific page protocol, use the
 * set_memory_xx() functions.
 */
int __set_memory_prot(unsigned long addr, int numpages, pgprot_t prot)
{
	/* Set exactly the requested bits, clear every bit not in @prot. */
	return change_page_attr_set_clr(&addr, numpages, prot,
					__pgprot(~pgprot_val(prot)), 0, 0,
					NULL);
}
/* Switch @numpages pages at @addr to UC- (no memtype reservation here). */
int _set_memory_uc(unsigned long addr, int numpages)
{
	/*
	 * for now UC MINUS. see comments in ioremap()
	 * If you really need strong UC use ioremap_uc(), but note
	 * that you cannot override IO areas with set_memory_*() as
	 * these helpers cannot work with IO memory.
	 */
	return change_page_attr_set(&addr, numpages,
				    cachemode2pgprot(_PAGE_CACHE_MODE_UC_MINUS),
				    0);
}
int set_memory_uc(unsignedlong addr, int numpages)
{ int ret;
/* * for now UC MINUS. see comments in ioremap()
*/
ret = memtype_reserve(__pa(addr), __pa(addr) + numpages * PAGE_SIZE,
_PAGE_CACHE_MODE_UC_MINUS, NULL); if (ret) goto out_err;
ret = _set_memory_uc(addr, numpages); if (ret) goto out_free;
/* Switch @numpages pages at @addr to write-combining (no memtype reservation). */
int _set_memory_wc(unsigned long addr, int numpages)
{
	int ret;

	/*
	 * Two steps: first UC-, then WC while clearing the old cache-mode
	 * bits. NOTE(review): presumably the intermediate UC- step avoids
	 * a transient mapping with an unintended cache mode -- confirm
	 * against the PAT/memtype rules.
	 */
	ret = change_page_attr_set(&addr, numpages,
				   cachemode2pgprot(_PAGE_CACHE_MODE_UC_MINUS),
				   0);
	if (!ret) {
		ret = change_page_attr_set_clr(&addr, numpages,
					       cachemode2pgprot(_PAGE_CACHE_MODE_WC),
					       __pgprot(_PAGE_CACHE_MASK),
					       0, 0, NULL);
	}
	return ret;
}
/*
 * Reserve the memtype for [addr, addr + numpages * PAGE_SIZE) and switch
 * the range to write-combining. The reservation is released again if the
 * attribute change fails.
 */
int set_memory_wc(unsigned long addr, int numpages)
{
	int ret;

	ret = memtype_reserve(__pa(addr), __pa(addr) + numpages * PAGE_SIZE,
			      _PAGE_CACHE_MODE_WC, NULL);
	if (ret)
		return ret;

	ret = _set_memory_wc(addr, numpages);
	if (ret)
		memtype_free(__pa(addr), __pa(addr) + numpages * PAGE_SIZE);

	return ret;
}
EXPORT_SYMBOL(set_memory_wc);
/* Switch @numpages pages at @addr to write-through (no memtype reservation). */
int _set_memory_wt(unsigned long addr, int numpages)
{
	return change_page_attr_set(&addr, numpages,
				    cachemode2pgprot(_PAGE_CACHE_MODE_WT), 0);
}
/* Switch @numpages pages at @addr back to write-back. */
int _set_memory_wb(unsigned long addr, int numpages)
{
	/* WB cache mode is hard wired to all cache attribute bits being 0 */
	return change_page_attr_clear(&addr, numpages,
				      __pgprot(_PAGE_CACHE_MASK), 0);
}
int set_memory_wb(unsignedlong addr, int numpages)
{ int ret;
ret = _set_memory_wb(addr, numpages); if (ret) return ret;
/* Prevent speculative access to a page by marking it not-present */
#ifdef CONFIG_X86_64
/*
 * set_mce_nospec - unmap the 1:1 mapping of a poisoned @pfn.
 * Returns 0 on success (or for SGX pages outside the 1:1 map), else the
 * set_memory_np() error.
 */
int set_mce_nospec(unsigned long pfn)
{
	unsigned long decoy_addr;
	int rc;

	/* SGX pages are not in the 1:1 map */
	if (arch_is_platform_page(pfn << PAGE_SHIFT))
		return 0;
	/*
	 * We would like to just call:
	 *	set_memory_XX((unsigned long)pfn_to_kaddr(pfn), 1);
	 * but doing that would radically increase the odds of a
	 * speculative access to the poison page because we'd have
	 * the virtual address of the kernel 1:1 mapping sitting
	 * around in registers.
	 * Instead we get tricky. We create a non-canonical address
	 * that looks just like the one we want, but has bit 63 flipped.
	 * This relies on set_memory_XX() properly sanitizing any __pa()
	 * results with __PHYSICAL_MASK or PTE_PFN_MASK.
	 */
	decoy_addr = (pfn << PAGE_SHIFT) + (PAGE_OFFSET ^ BIT(63));

	rc = set_memory_np(decoy_addr, 1);
	if (rc)
		pr_warn("Could not invalidate pfn=0x%lx from 1:1 map\n", pfn);
	return rc;
}
EXPORT_SYMBOL_GPL(set_mce_nospec);
/* Restore full speculative operation to the pfn. */ int clear_mce_nospec(unsignedlong pfn)
{ unsignedlong addr = (unsignedlong) pfn_to_kaddr(pfn);
/* Make @numpages pages at @addr writable (set _PAGE_RW). */
int set_memory_rw(unsigned long addr, int numpages)
{
	return change_page_attr_set(&addr, numpages, __pgprot(_PAGE_RW), 0);
}
/* Mark @numpages pages at @addr not present (clear _PAGE_PRESENT). */
int set_memory_np(unsigned long addr, int numpages)
{
	return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_PRESENT), 0);
}
/*
 * Like set_memory_np(), but pass CPA_NO_CHECK_ALIAS so only this mapping
 * is changed and the directmap/highmap aliases are left alone.
 */
int set_memory_np_noalias(unsigned long addr, int numpages)
{
	return change_page_attr_set_clr(&addr, numpages, __pgprot(0),
					__pgprot(_PAGE_PRESENT), 0,
					CPA_NO_CHECK_ALIAS, NULL);
}
/* Mark @numpages pages at @addr present again (set _PAGE_PRESENT). */
int set_memory_p(unsigned long addr, int numpages)
{
	return change_page_attr_set(&addr, numpages, __pgprot(_PAGE_PRESENT), 0);
}
/*
 * Force 4k mappings for @numpages pages at @addr: sets _PAGE_KERNEL_4K and
 * passes force_split = 1 so covering large pages are split.
 */
int set_memory_4k(unsigned long addr, int numpages)
{
	return change_page_attr_set_clr(&addr, numpages,
					__pgprot(_PAGE_KERNEL_4K),
					__pgprot(0), 1, 0, NULL);
}
/* Clear _PAGE_GLOBAL on @numpages pages at @addr. */
int set_memory_nonglobal(unsigned long addr, int numpages)
{
	return change_page_attr_clear(&addr, numpages,
				      __pgprot(_PAGE_GLOBAL), 0);
}
int set_memory_global(unsignedlong addr, int numpages)
{ return change_page_attr_set(&addr, numpages,
__pgprot(_PAGE_GLOBAL), 0);
}
/*
 * __set_memory_enc_pgtable() is used for the hypervisors that get
 * informed about "encryption" status via page tables.
 *
 * Returns 0 on success; a CPA error or the hypervisor's error code on
 * failure.  @addr must be page aligned, @enc selects the direction
 * (true: shared -> private, false: private -> shared).
 */
static int __set_memory_enc_pgtable(unsigned long addr, int numpages, bool enc)
{
	pgprot_t empty = __pgprot(0);
	struct cpa_data cpa;
	int ret;

	/* Should not be working on unaligned addresses */
	if (WARN_ONCE(addr & ~PAGE_MASK, "misaligned address: %#lx\n", addr))
		addr &= PAGE_MASK;

	/*
	 * BUGFIX: cpa was previously passed to cpa_flush() and
	 * __change_page_attr_set_clr() completely uninitialized.  Set it
	 * up: the mask pair flips the encryption attribute one way for
	 * "encrypt" and the opposite way for "decrypt".
	 */
	memset(&cpa, 0, sizeof(cpa));
	cpa.vaddr = &addr;
	cpa.numpages = numpages;
	cpa.mask_set = enc ? pgprot_encrypted(empty) : pgprot_decrypted(empty);
	cpa.mask_clr = enc ? pgprot_decrypted(empty) : pgprot_encrypted(empty);
	cpa.pgd = init_mm.pgd;

	/* Must avoid aliasing mappings in the highmem code */
	kmap_flush_unused();
	vm_unmap_aliases();

	/* Flush the caches as needed before changing the encryption attribute. */
	if (x86_platform.guest.enc_tlb_flush_required(enc))
		cpa_flush(&cpa, x86_platform.guest.enc_cache_flush_required());

	/* Notify hypervisor that we are about to set/clr encryption attribute. */
	ret = x86_platform.guest.enc_status_change_prepare(addr, numpages, enc);
	if (ret)
		goto vmm_fail;

	ret = __change_page_attr_set_clr(&cpa, 1);

	/*
	 * After changing the encryption attribute, we need to flush TLBs again
	 * in case any speculative TLB caching occurred (but no need to flush
	 * caches again).  We could just use cpa_flush_all(), but in case TLB
	 * flushing gets optimized in the cpa_flush() path use the same logic
	 * as above.
	 */
	cpa_flush(&cpa, 0);

	if (ret)
		return ret;

	/* Notify hypervisor that we have successfully set/clr encryption attribute. */
	ret = x86_platform.guest.enc_status_change_finish(addr, numpages, enc);
	if (ret)
		goto vmm_fail;

	return 0;

vmm_fail:
	/* BUGFIX: the vmm_fail label targeted by the gotos above was missing. */
	WARN_ONCE(1, "CPA VMM failure to convert memory (addr=%p, numpages=%d) to %s: %d\n",
		  (void *)addr, numpages, enc ? "private" : "shared", ret);

	return ret;
}
/*
 * The lock serializes conversions between private and shared memory.
 *
 * It is taken for read on conversion. A write lock guarantees that no
 * concurrent conversions are in progress.
 */
static DECLARE_RWSEM(mem_enc_lock);
/* * Stop new private<->shared conversions. * * Taking the exclusive mem_enc_lock waits for in-flight conversions to complete. * The lock is not released to prevent new conversions from being started.
*/ bool set_memory_enc_stop_conversion(void)
{ /* * In a crash scenario, sleep is not allowed. Try to take the lock. * Failure indicates that there is a race with the conversion.
*/ if (oops_in_progress) return down_write_trylock(&mem_enc_lock);
down_write(&mem_enc_lock);
returntrue;
}
/*
 * Flip the encryption attribute of @numpages pages at @addr under the
 * shared mem_enc_lock.  A no-op (returns 0) on platforms without memory
 * encryption; returns -EBUSY if conversions have been stopped.
 */
static int __set_memory_enc_dec(unsigned long addr, int numpages, bool enc)
{
	int ret;

	if (!cc_platform_has(CC_ATTR_MEM_ENCRYPT))
		return 0;

	if (!down_read_trylock(&mem_enc_lock))
		return -EBUSY;

	ret = __set_memory_enc_pgtable(addr, numpages, enc);
	up_read(&mem_enc_lock);

	return ret;
}
int set_memory_encrypted(unsignedlong addr, int numpages)
{ return __set_memory_enc_dec(addr, numpages, true);
}
EXPORT_SYMBOL_GPL(set_memory_encrypted);
int set_memory_decrypted(unsignedlong addr, int numpages)
{ return __set_memory_enc_dec(addr, numpages, false);
}
EXPORT_SYMBOL_GPL(set_memory_decrypted);
int set_pages_uc(struct page *page, int numpages)
{ unsignedlong addr = (unsignedlong)page_address(page);
staticint _set_pages_array(struct page **pages, int numpages, enum page_cache_mode new_type)
{ unsignedlong start; unsignedlong end; enum page_cache_mode set_type; int i; int free_idx; int ret;
for (i = 0; i < numpages; i++) { if (PageHighMem(pages[i])) continue;
start = page_to_pfn(pages[i]) << PAGE_SHIFT;
end = start + PAGE_SIZE; if (memtype_reserve(start, end, new_type, NULL)) goto err_out;
}
/* If WC, set to UC- first and then WC */
set_type = (new_type == _PAGE_CACHE_MODE_WC) ?
_PAGE_CACHE_MODE_UC_MINUS : new_type;
ret = cpa_set_pages_array(pages, numpages,
cachemode2pgprot(set_type)); if (!ret && new_type == _PAGE_CACHE_MODE_WC)
ret = change_page_attr_set_clr(NULL, numpages,
cachemode2pgprot(
_PAGE_CACHE_MODE_WC),
__pgprot(_PAGE_CACHE_MASK),
0, CPA_PAGES_ARRAY, pages); if (ret) goto err_out; return 0; /* Success */
err_out:
free_idx = i; for (i = 0; i < free_idx; i++) { if (PageHighMem(pages[i])) continue;
start = page_to_pfn(pages[i]) << PAGE_SHIFT;
end = start + PAGE_SIZE;
memtype_free(start, end);
} return -EINVAL;
}
/* Set an array of pages to uncached-minus (UC-). */
int set_pages_array_uc(struct page **pages, int numpages)
{
	return _set_pages_array(pages, numpages, _PAGE_CACHE_MODE_UC_MINUS);
}
EXPORT_SYMBOL(set_pages_array_uc);
/* Set an array of pages to write-combining (WC). */
int set_pages_array_wc(struct page **pages, int numpages)
{
	return _set_pages_array(pages, numpages, _PAGE_CACHE_MODE_WC);
}
EXPORT_SYMBOL(set_pages_array_wc);
int set_pages_wb(struct page *page, int numpages)
{ unsignedlong addr = (unsignedlong)page_address(page);
/*
 * Return an array of pages to write-back (WB) and release the memtype
 * reservations taken by _set_pages_array().
 */
int set_pages_array_wb(struct page **pages, int numpages)
{
	unsigned long pfn_addr;
	int ret;
	int idx;

	/* WB cache mode is hard wired to all cache attribute bits being 0 */
	ret = cpa_clear_pages_array(pages, numpages,
				    __pgprot(_PAGE_CACHE_MASK));
	if (ret)
		return ret;

	for (idx = 0; idx < numpages; idx++) {
		/* Highmem pages carry no memtype reservation. */
		if (PageHighMem(pages[idx]))
			continue;
		pfn_addr = page_to_pfn(pages[idx]) << PAGE_SHIFT;
		memtype_free(pfn_addr, pfn_addr + PAGE_SIZE);
	}

	return 0;
}
EXPORT_SYMBOL(set_pages_array_wb);
/* Make @numpages pages starting at @page read-only via their kernel mapping. */
int set_pages_ro(struct page *page, int numpages)
{
	unsigned long vaddr = (unsigned long)page_address(page);

	return set_memory_ro(vaddr, numpages);
}
int set_pages_rw(struct page *page, int numpages)
{ unsignedlong addr = (unsignedlong)page_address(page);
--> --------------------
--> maximum size reached
--> --------------------
Messung V0.5
¤ Dauer der Verarbeitung: 0.51 Sekunden
(vorverarbeitet)
¤
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereit gestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.