/*
 * Note: pte  --> Linux PTE
 *       HPTE --> PowerPC Hashed Page Table Entry
 *
 * Execution context:
 *   htab_initialize is called with the MMU off (of course), but
 *   the kernel has been copied down to zero so it can directly
 *   reference global data. At this point it is very difficult
 *   to print debug info.
 */
/* * Flush the partition table cache if this is HV mode.
*/ if (early_cpu_has_feature(CPU_FTR_HVMODE))
tlbiel_hash_set_isa300(0, is, 0, 2, 0);
/* * Now invalidate the process table cache. UPRT=0 HPT modes (what * current hardware implements) do not use the process table, but * add the flushes anyway. * * From ISA v3.0B p. 1078: * The following forms are invalid. * * PRS=1, R=0, and RIC!=2 (The only process-scoped * HPT caching is of the Process Table.)
*/
tlbiel_hash_set_isa300(0, is, 0, 2, 1);
/* * Then flush the sets of the TLB proper. Hash mode uses * partition scoped TLB translations, which may be flushed * in !HV mode.
*/ for (set = 0; set < num_sets; set++)
tlbiel_hash_set_isa300(set, is, 0, 0, 0);
staticvoid hash_linear_map_add_slot(phys_addr_t paddr, int slot)
{ if (is_kfence_address(__va(paddr)))
hash_kfence_add_slot(paddr, slot); else
hash_debug_pagealloc_add_slot(paddr, slot);
} #else staticvoid hash_linear_map_add_slot(phys_addr_t paddr, int slot) {} #endif
/* * 'R' and 'C' update notes: * - Under pHyp or KVM, the updatepp path will not set C, thus it *will* * create writeable HPTEs without C set, because the hcall H_PROTECT * that we use in that case will not update C * - The above is however not a problem, because we also don't do that * fancy "no flush" variant of eviction and we use H_REMOVE which will * do the right thing and thus we don't have the race I described earlier * * - Under bare metal, we do have the race, so we need R and C set * - We make sure R is always set and never lost * - C is _PAGE_DIRTY, and *should* always be set for a writeable mapping
*/ unsignedlong htab_convert_pte_flags(unsignedlong pteflags, unsignedlong flags)
{ unsignedlong rflags = 0;
/* _PAGE_EXEC -> NOEXEC */ if ((pteflags & _PAGE_EXEC) == 0)
rflags |= HPTE_R_N; /* * PPP bits: * Linux uses slb key 0 for kernel and 1 for user. * kernel RW areas are mapped with PPP=0b000 * User area is mapped with PPP=0b010 for read/write * or PPP=0b011 for read-only (including writeable but clean pages).
*/ if (pteflags & _PAGE_PRIVILEGED) { /* * Kernel read only mapped with ppp bits 0b110
*/ if (!(pteflags & _PAGE_WRITE)) { if (mmu_has_feature(MMU_FTR_KERNEL_RO))
rflags |= (HPTE_R_PP0 | 0x2); else
rflags |= 0x3;
}
VM_WARN_ONCE(!(pteflags & _PAGE_RWX), "no-access mapping request");
} else { if (pteflags & _PAGE_RWX)
rflags |= 0x2; /* * We should never hit this in normal fault handling because * a permission check (check_pte_access()) will bubble this * to higher level linux handler even for PAGE_NONE.
*/
VM_WARN_ONCE(!(pteflags & _PAGE_RWX), "no-access mapping request"); if (!((pteflags & _PAGE_WRITE) && (pteflags & _PAGE_DIRTY)))
rflags |= 0x1;
} /* * We can't allow hardware to update hpte bits. Hence always * set 'R' bit and set 'C' if it is a write fault
*/
rflags |= HPTE_R_R;
if (pteflags & _PAGE_DIRTY)
rflags |= HPTE_R_C; /* * Add in WIG bits
*/
if ((pteflags & _PAGE_CACHE_CTL) == _PAGE_TOLERANT)
rflags |= HPTE_R_I; elseif ((pteflags & _PAGE_CACHE_CTL) == _PAGE_NON_IDEMPOTENT)
rflags |= (HPTE_R_I | HPTE_R_G); elseif ((pteflags & _PAGE_CACHE_CTL) == _PAGE_SAO)
rflags |= (HPTE_R_W | HPTE_R_I | HPTE_R_M); else /* * Add memory coherence if cache inhibited is not set
*/
rflags |= HPTE_R_M;
int htab_bolt_mapping(unsignedlong vstart, unsignedlong vend, unsignedlong pstart, unsignedlong prot, int psize, int ssize)
{ unsignedlong vaddr, paddr; unsignedint step, shift; int ret = 0;
/* * If we hit a bad address return error.
*/ if (!vsid) return -1; /* Make kernel text executable */ if (overlaps_kernel_text(vaddr, vaddr + step))
tprot &= ~HPTE_R_N;
/* * If relocatable, check if it overlaps interrupt vectors that * are copied down to real 0. For relocatable kernel * (e.g. kdump case) we copy interrupt vectors down to real * address 0. Mark that region as executable. This is * because on p8 system with relocation on exception feature * enabled, exceptions are raised with MMU (IR=DR=1) ON. Hence * in order to execute the interrupt handlers in virtual * mode the vector region need to be marked as executable.
*/ if ((PHYSICAL_START > MEMORY_START) &&
overlaps_interrupt_vector_text(vaddr, vaddr + step))
tprot &= ~HPTE_R_N;
BUG_ON(!mmu_hash_ops.hpte_insert);
repeat:
ret = mmu_hash_ops.hpte_insert(hpteg, vpn, paddr, tprot,
HPTE_V_BOLTED, psize, psize,
ssize); if (ret == -1) { /* * Try to keep bolted entries in primary. * Remove non bolted entries and try insert again
*/
ret = mmu_hash_ops.hpte_remove(hpteg); if (ret != -1)
ret = mmu_hash_ops.hpte_insert(hpteg, vpn, paddr, tprot,
HPTE_V_BOLTED, psize, psize,
ssize); if (ret == -1 && !secondary_hash) {
secondary_hash = true;
hpteg = ((~hash & htab_hash_mask) * HPTES_PER_GROUP); goto repeat;
}
}
if (ret < 0) break;
cond_resched(); /* add slot info in debug_pagealloc / kfence linear map */
hash_linear_map_add_slot(paddr, ret);
} return ret < 0 ? ret : 0;
}
int htab_remove_mapping(unsignedlong vstart, unsignedlong vend, int psize, int ssize)
{ unsignedlong vaddr, time_limit; unsignedint step, shift; int rc; int ret = 0;
/* * For large number of mappings introduce a cond_resched() * to prevent softlockup warnings.
*/ if (time_after(jiffies, time_limit)) {
cond_resched();
time_limit = jiffies + HZ;
} if (rc == -ENOENT) {
ret = -ENOENT; continue;
} if (rc < 0) return rc;
}
/*
 * Per-CPU bookkeeping allocated if stress_hpt is enabled: remembers the
 * last few hash groups this CPU inserted into, so they can be cleared on
 * the next hash fault to force a high HPTE replacement rate.
 */
#define STRESS_MAX_GROUPS 16
struct stress_hpt_struct {
	unsigned long last_group[STRESS_MAX_GROUPS];
};
/* Number of remembered stress groups: many on LPAR, one on bare metal. */
static inline int stress_nr_groups(void)
{
	/*
	 * LPAR H_REMOVE flushes TLB, so need some number > 1 of entries
	 * to allow practical forward progress. Bare metal returns 1, which
	 * seems to help uncover more bugs.
	 */
	if (firmware_has_feature(FW_FEATURE_LPAR))
		return STRESS_MAX_GROUPS;
	else
		return 1;
}
def->shift = base_shift; if (base_shift <= 23)
def->avpnm = 0; else
def->avpnm = (1 << (base_shift - 23)) - 1;
def->sllp = slbenc; /* * We don't know for sure what's up with tlbiel, so * for now we only set it for 4K and 64K pages
*/ if (base_idx == MMU_PAGE_4K || base_idx == MMU_PAGE_64K)
def->tlbiel = 1; else
def->tlbiel = 0;
while (size > 0 && lpnum) { unsignedint shift = be32_to_cpu(prop[0]); int penc = be32_to_cpu(prop[1]);
prop += 2; size -= 2;
lpnum--;
idx = get_idx_from_shift(shift); if (idx < 0) continue;
if (penc == -1)
pr_err("Invalid penc for base_shift=%d " "shift=%d\n", base_shift, shift);
#ifdef CONFIG_HUGETLB_PAGE
/*
 * Scan for 16G memory blocks that have been set aside for huge pages
 * and reserve those blocks for 16G huge pages.
 *
 * Flattened-device-tree scan callback: returns 0 always so the scan
 * continues over all "memory" nodes.
 */
static int __init htab_dt_scan_hugepage_blocks(unsigned long node,
					       const char *uname, int depth,
					       void *data)
{
	const char *type = of_get_flat_dt_prop(node, "device_type", NULL);
	const __be64 *addr_prop;
	const __be32 *page_count_prop;
	unsigned int expected_pages;
	long unsigned int phys_addr;
	long unsigned int block_size;

	/* We are scanning "memory" nodes only */
	if (type == NULL || strcmp(type, "memory") != 0)
		return 0;

	/*
	 * This property is the log base 2 of the number of virtual pages that
	 * will represent this memory block.
	 */
	page_count_prop = of_get_flat_dt_prop(node, "ibm,expected#pages", NULL);
	if (page_count_prop == NULL)
		return 0;
	expected_pages = (1 << be32_to_cpu(page_count_prop[0]));
	addr_prop = of_get_flat_dt_prop(node, "reg", NULL);
	if (addr_prop == NULL)
		return 0;
	phys_addr = be64_to_cpu(addr_prop[0]);
	block_size = be64_to_cpu(addr_prop[1]);
	/* Only 16GB blocks are handled here */
	if (block_size != (16 * GB))
		return 0;
	printk(KERN_INFO "Huge page(16GB) memory: "
			"addr = 0x%lX size = 0x%lX pages = %d\n",
			phys_addr, block_size, expected_pages);
	/* Only reserve if the whole range fits inside DRAM */
	if (phys_addr + block_size * expected_pages <= memblock_end_of_DRAM()) {
		memblock_reserve(phys_addr, block_size * expected_pages);
		pseries_add_gpage(phys_addr, block_size, expected_pages);
	}
	return 0;
}
#endif /* CONFIG_HUGETLB_PAGE */
/* Mark every base/actual page size combination as unsupported (penc = -1). */
static void __init mmu_psize_set_default_penc(void)
{
	int bpsize, apsize;

	for (bpsize = 0; bpsize < MMU_PAGE_COUNT; bpsize++)
		for (apsize = 0; apsize < MMU_PAGE_COUNT; apsize++)
			mmu_psize_defs[bpsize].penc[apsize] = -1;
}
#ifdef CONFIG_PPC_64K_PAGES
/*
 * Whether this system could have an HEA ethernet adapter, which cannot
 * tolerate 64k ioremap pages (see caller).
 */
static bool __init might_have_hea(void)
{
	/*
	 * The HEA ethernet adapter requires awareness of the
	 * GX bus. Without that awareness we can easily assume
	 * we will never see an HEA ethernet device.
	 */
#ifdef CONFIG_IBMEBUS
	return !cpu_has_feature(CPU_FTR_ARCH_207S) &&
		firmware_has_feature(FW_FEATURE_SPLPAR);
#else
	return false;
#endif
}
#endif /* #ifdef CONFIG_PPC_64K_PAGES */
/*
 * Discover the page sizes supported by the MMU, from the device-tree or
 * from built-in defaults, and reserve any 16G huge page blocks.
 *
 * NOTE(review): the visible text was truncated (closing brace and the
 * hugetlb scan tail were missing); reconstructed — confirm against history.
 */
static void __init htab_scan_page_sizes(void)
{
	int rc;

	/* set the invalid penc to -1 */
	mmu_psize_set_default_penc();

	/* Default to 4K pages only */
	memcpy(mmu_psize_defs, mmu_psize_defaults,
	       sizeof(mmu_psize_defaults));

	/*
	 * Try to find the available page sizes in the device-tree
	 */
	rc = of_scan_flat_dt(htab_dt_scan_page_sizes, NULL);
	if (rc == 0 && early_mmu_has_feature(MMU_FTR_16M_PAGE)) {
		/*
		 * Nothing in the device-tree, but the CPU supports 16M pages,
		 * so let's fallback on a known size list for 16M capable CPUs.
		 */
		memcpy(mmu_psize_defs, mmu_psize_defaults_gp,
		       sizeof(mmu_psize_defaults_gp));
	}

#ifdef CONFIG_HUGETLB_PAGE
	if (!hugetlb_disabled && !early_radix_enabled()) {
		/* Reserve 16G huge page memory sections for huge pages */
		of_scan_flat_dt(htab_dt_scan_hugepage_blocks, NULL);
	}
#endif /* CONFIG_HUGETLB_PAGE */
}
/*
 * Fill in the hpte_page_sizes[] array.
 * We go through the mmu_psize_defs[] array looking for all the
 * supported base/actual page size combinations.  Each combination
 * has a unique pagesize encoding (penc) value in the low bits of
 * the LP field of the HPTE.  For actual page sizes less than 1MB,
 * some of the upper LP bits are used for RPN bits, meaning that
 * we need to fill in several entries in hpte_page_sizes[].
 *
 * In diagrammatic form, with r = RPN bits and z = page size bits:
 *        PTE LP     actual page size
 *    rrrr rrrz		>=8KB
 *    rrrr rrzz		>=16KB
 *    rrrr rzzz		>=32KB
 *    rrrr zzzz		>=64KB
 *    ...
 *
 * The zzzz bits are implementation-specific but are chosen so that
 * no encoding for a larger page size uses the same value in its
 * low-order N bits as the encoding for the 2^(12+N) byte page size
 * (if it exists).
 */
static void __init init_hpte_page_sizes(void)
{
	long int ap, bp;
	long int shift, penc;

	for (bp = 0; bp < MMU_PAGE_COUNT; ++bp) {
		if (!mmu_psize_defs[bp].shift)
			continue;	/* not a supported page size */
		for (ap = bp; ap < MMU_PAGE_COUNT; ++ap) {
			penc = mmu_psize_defs[bp].penc[ap];
			if (penc == -1 || !mmu_psize_defs[ap].shift)
				continue;
			shift = mmu_psize_defs[ap].shift - LP_SHIFT;
			if (shift <= 0)
				continue;	/* should never happen */
			/*
			 * For page sizes less than 1MB, this loop
			 * replicates the entry for all possible values
			 * of the rrrr bits.
			 */
			while (penc < (1 << LP_BITS)) {
				hpte_page_sizes[penc] = (ap << 4) | bp;
				penc += 1 << shift;
			}
		}
	}
}
if (!hash_supports_debug_pagealloc() && !kfence_early_init_enabled()) { /* * Pick a size for the linear mapping. Currently, we only * support 16M, 1M and 4K which is the default
*/ if (IS_ENABLED(CONFIG_STRICT_KERNEL_RWX) &&
(unsignedlong)_stext % 0x1000000) { if (mmu_psize_defs[MMU_PAGE_16M].shift)
pr_warn("Kernel not 16M aligned, disabling 16M linear map alignment\n");
aligned = false;
}
#ifdef CONFIG_PPC_64K_PAGES /* * Pick a size for the ordinary pages. Default is 4K, we support * 64K for user mappings and vmalloc if supported by the processor. * We only use 64k for ioremap if the processor * (and firmware) support cache-inhibited large pages. * If not, we use 4k and set mmu_ci_restrictions so that * hash_page knows to switch processes that use cache-inhibited * mappings to 4k pages.
*/ if (mmu_psize_defs[MMU_PAGE_64K].shift) {
mmu_virtual_psize = MMU_PAGE_64K;
mmu_vmalloc_psize = MMU_PAGE_64K; if (mmu_linear_psize == MMU_PAGE_4K)
mmu_linear_psize = MMU_PAGE_64K; if (mmu_has_feature(MMU_FTR_CI_LARGE_PAGE)) { /* * When running on pSeries using 64k pages for ioremap * would stop us accessing the HEA ethernet. So if we * have the chance of ever seeing one, stay at 4k.
*/ if (!might_have_hea())
mmu_io_psize = MMU_PAGE_64K;
} else
mmu_ci_restrictions = 1;
} #endif/* CONFIG_PPC_64K_PAGES */
#ifdef CONFIG_SPARSEMEM_VMEMMAP /* * We try to use 16M pages for vmemmap if that is supported * and we have at least 1G of RAM at boot
*/ if (mmu_psize_defs[MMU_PAGE_16M].shift &&
memblock_phys_mem_size() >= 0x40000000)
mmu_vmemmap_psize = MMU_PAGE_16M; else
mmu_vmemmap_psize = mmu_virtual_psize; #endif/* CONFIG_SPARSEMEM_VMEMMAP */
/* * 2^11 PTEGS of 128 bytes each, ie. 2^18 bytes is the minimum htab * size permitted by the architecture.
*/ return max(pteg_shift + 7, 18U);
}
staticunsignedlong __init htab_get_table_size(void)
{ /* * If hash size isn't already provided by the platform, we try to * retrieve it from the device-tree. If it's not there neither, we * calculate it now based on the total RAM size
*/ if (ppc64_pft_size == 0)
of_scan_flat_dt(htab_dt_scan_pftsize, NULL); if (ppc64_pft_size) return 1UL << ppc64_pft_size;
/* * To avoid lots of HPT resizes if memory size is fluctuating * across a boundary, we deliberately have some hysterisis * here: we immediately increase the HPT size if the target * shift exceeds the current shift, but we won't attempt to * reduce unless the target shift is at least 2 below the * current shift
*/ if (target_hpt_shift > ppc64_pft_size ||
target_hpt_shift < ppc64_pft_size - 1) return mmu_hash_ops.resize_hpt(target_hpt_shift);
return 0;
}
/*
 * Create the linear mapping for a hot-added memory section.
 *
 * NOTE(review): the visible text was truncated after the range check;
 * reconstructed the bolt/rollback tail — confirm against history.
 */
int hash__create_section_mapping(unsigned long start, unsigned long end,
				 int nid, pgprot_t prot)
{
	int rc;

	if (end >= H_VMALLOC_START) {
		pr_warn("Outside the supported range\n");
		return -1;
	}

	rc = htab_bolt_mapping(start, end, __pa(start),
			       pgprot_val(prot), mmu_linear_psize,
			       mmu_kernel_ssize);

	if (rc < 0) {
		/* Roll back any partial mapping on failure */
		int rc2 = htab_remove_mapping(start, end, mmu_linear_psize,
					      mmu_kernel_ssize);
		BUG_ON(rc2 && (rc2 != -ENOENT));
	}
	return rc;
}
/* * PS field (VRMA page size) is not used for LPID 0, hence set to 0. * For now, UPRT is 0 and we have no segment table.
*/
htab_size = __ilog2(htab_size) - 18;
mmu_partition_table_set_entry(0, hash_table | htab_size, 0, false);
pr_info("Partition table %p\n", partition_tb);
}
if (stress_slb_enabled)
static_branch_enable(&stress_slb_key);
if (stress_hpt_enabled) { unsignedlong tmp;
static_branch_enable(&stress_hpt_key); // Too early to use nr_cpu_ids, so use NR_CPUS
tmp = memblock_phys_alloc_range(sizeof(struct stress_hpt_struct) * NR_CPUS,
__alignof__(struct stress_hpt_struct),
0, MEMBLOCK_ALLOC_ANYWHERE);
memset((void *)tmp, 0xff, sizeof(struct stress_hpt_struct) * NR_CPUS);
stress_hpt_struct = __va(tmp);
/* * Calculate the required size of the htab. We want the number of * PTEGs to equal one half the number of real pages.
*/
htab_size_bytes = htab_get_table_size();
pteg_count = htab_size_bytes >> 7;
htab_hash_mask = pteg_count - 1;
if (firmware_has_feature(FW_FEATURE_LPAR) ||
firmware_has_feature(FW_FEATURE_PS3_LV1)) { /* Using a hypervisor which owns the htab */
htab_address = NULL;
_SDR1 = 0; #ifdef CONFIG_FA_DUMP /* * If firmware assisted dump is active firmware preserves * the contents of htab along with entire partition memory. * Clear the htab if firmware assisted dump is active so * that we dont end up using old mappings.
*/ if (is_fadump_active() && mmu_hash_ops.hpte_clear_all)
mmu_hash_ops.hpte_clear_all(); #endif
} else { unsignedlong limit = MEMBLOCK_ALLOC_ANYWHERE;
table = memblock_phys_alloc_range(htab_size_bytes,
htab_size_bytes,
0, limit); if (!table)
panic("ERROR: Failed to allocate %pa bytes below %pa\n",
&htab_size_bytes, &limit);
DBG("Hash table allocated at %lx, size: %lx\n", table,
htab_size_bytes);
/* Initialize the HPT with no entries */
memset((void *)table, 0, htab_size_bytes);
if (!cpu_has_feature(CPU_FTR_ARCH_300)) /* Set SDR1 */
mtspr(SPRN_SDR1, _SDR1); else
hash_init_partition_table(table, htab_size_bytes);
}
prot = pgprot_val(PAGE_KERNEL);
hash_debug_pagealloc_alloc_slots();
hash_kfence_alloc_pool(); /* create bolted the linear mapping in the hash table */
for_each_mem_range(i, &base, &end) {
size = end - base;
base = (unsignedlong)__va(base);
DBG("creating mapping for region: %lx..%lx (prot: %lx)\n",
base, size, prot);
if ((base + size) >= H_VMALLOC_START) {
pr_warn("Outside the supported range\n"); continue;
}
/* * If we have a memory_limit and we've allocated TCEs then we need to * explicitly map the TCE area at the top of RAM. We also cope with the * case that the TCEs start below memory_limit. * tce_alloc_start/end are 16MB aligned so the mapping should work * for either 4K or 16MB pages.
*/ if (tce_alloc_start) {
tce_alloc_start = (unsignedlong)__va(tce_alloc_start);
tce_alloc_end = (unsignedlong)__va(tce_alloc_end);
if (base + size >= tce_alloc_start)
tce_alloc_start = base + size + 1;
staticstruct hash_mm_context init_hash_mm_context; void __init hash__early_init_mmu(void)
{ #ifndef CONFIG_PPC_64K_PAGES /* * We have code in __hash_page_4K() and elsewhere, which assumes it can * do the following: * new_pte |= (slot << H_PAGE_F_GIX_SHIFT) & (H_PAGE_F_SECOND | H_PAGE_F_GIX); * * Where the slot number is between 0-15, and values of 8-15 indicate * the secondary bucket. For that code to work H_PAGE_F_SECOND and * H_PAGE_F_GIX must occupy four contiguous bits in the PTE, and * H_PAGE_F_SECOND must be placed above H_PAGE_F_GIX. Assert that here * with a BUILD_BUG_ON().
*/
BUILD_BUG_ON(H_PAGE_F_SECOND != (1ul << (H_PAGE_F_GIX_SHIFT + 3))); #endif/* CONFIG_PPC_64K_PAGES */
if (!mmu_hash_ops.hpte_insert)
panic("hash__early_init_mmu: No MMU hash ops defined!\n");
/* * Initialize the MMU Hash table and create the linear mapping * of memory. Has to be done before SLB initialization as this is * currently where the page size encoding is obtained.
*/
htab_initialize();
if (addr < SLICE_LOW_TOP) {
psizes = get_paca()->mm_ctx_low_slices_psize;
index = GET_LOW_SLICE_INDEX(addr);
} else {
psizes = get_paca()->mm_ctx_high_slices_psize;
index = GET_HIGH_SLICE_INDEX(addr);
}
mask_index = index & 0x1; return (psizes[index >> 1] >> (mask_index * 4)) & 0xF;
}
/* * Demote a segment to using 4k pages. * For now this makes the whole process use 4k pages.
*/ #ifdef CONFIG_PPC_64K_PAGES void demote_segment_4k(struct mm_struct *mm, unsignedlong addr)
{ if (get_slice_psize(mm, addr) == MMU_PAGE_4K) return;
slice_set_range_psize(mm, addr, 1, MMU_PAGE_4K); #ifdef CONFIG_SPU_BASE
spu_flush_all_slbs(mm); #endif if ((get_paca_psize(addr) != MMU_PAGE_4K) && (current->mm == mm)) {
#ifdef CONFIG_PPC_SUBPAGE_PROT /* * This looks up a 2-bit protection code for a 4k subpage of a 64k page. * Userspace sets the subpage permissions using the subpage_prot system call. * * Result is 0: full permissions, _PAGE_RW: read-only, * _PAGE_RWX: no access.
*/ staticint subpage_protection(struct mm_struct *mm, unsignedlong ea)
{ struct subpage_prot_table *spt = mm_ctx_subpage_prot(&mm->context);
u32 spp = 0;
u32 **sbpm, *sbpp;
/* extract 2-bit bitfield for this 4k subpage */
spp >>= 30 - 2 * ((ea >> 12) & 0xf);
/* * 0 -> full permission * 1 -> Read only * 2 -> no access. * We return the flag that need to be cleared.
*/
spp = ((spp & 2) ? _PAGE_RWX : 0) | ((spp & 1) ? _PAGE_WRITE : 0); return spp;
}
/* Get region & vsid */ switch (get_region_id(ea)) { case USER_REGION_ID:
user_region = 1; if (! mm) {
DBG_LOW(" user region with no mm !\n");
rc = 1; goto bail;
}
psize = get_slice_psize(mm, ea);
ssize = user_segment_size(ea);
vsid = get_user_vsid(&mm->context, ea, ssize); break; case VMALLOC_REGION_ID:
vsid = get_kernel_vsid(ea, mmu_kernel_ssize);
psize = mmu_vmalloc_psize;
ssize = mmu_kernel_ssize;
flags |= HPTE_USE_KERNEL_KEY; break;
case IO_REGION_ID:
vsid = get_kernel_vsid(ea, mmu_kernel_ssize);
psize = mmu_io_psize;
ssize = mmu_kernel_ssize;
flags |= HPTE_USE_KERNEL_KEY; break; default: /* * Not a valid range * Send the problem up to do_page_fault()
*/
rc = 1; goto bail;
}
DBG_LOW(" mm=%p, mm->pgdir=%p, vsid=%016lx\n", mm, mm->pgd, vsid);
/* Bad address. */ if (!vsid) {
DBG_LOW("Bad address!\n");
rc = 1; goto bail;
} /* Get pgdir */
pgdir = mm->pgd; if (pgdir == NULL) {
rc = 1; goto bail;
}
/* Check CPU locality */ if (user_region && mm_is_thread_local(mm))
flags |= HPTE_LOCAL_UPDATE;
#ifndef CONFIG_PPC_64K_PAGES /* * If we use 4K pages and our psize is not 4K, then we might * be hitting a special driver mapping, and need to align the * address before we fetch the PTE. * * It could also be a hugepage mapping, in which case this is * not necessary, but it's not harmful, either.
*/ if (psize != MMU_PAGE_4K)
ea &= ~((1ul << mmu_psize_defs[psize].shift) - 1); #endif/* CONFIG_PPC_64K_PAGES */
/* Get PTE and page size from page tables */
ptep = find_linux_pte(pgdir, ea, &is_thp, &hugeshift); if (ptep == NULL || !pte_present(*ptep)) {
DBG_LOW(" no PTE !\n");
rc = 1; goto bail;
}
if (IS_ENABLED(CONFIG_PPC_4K_PAGES) && !radix_enabled()) { if (hugeshift == PMD_SHIFT && psize == MMU_PAGE_16M)
hugeshift = mmu_psize_defs[MMU_PAGE_16M].shift; if (hugeshift == PUD_SHIFT && psize == MMU_PAGE_16G)
hugeshift = mmu_psize_defs[MMU_PAGE_16G].shift;
}
/* * Add _PAGE_PRESENT to the required access perm. If there are parallel * updates to the pte that can possibly clear _PAGE_PTE, catch that too. * * We can safely use the return pte address in rest of the function * because we do set H_PAGE_BUSY which prevents further updates to pte * from generic code.
*/
access |= _PAGE_PRESENT | _PAGE_PTE;
/* * Pre-check access permissions (will be re-checked atomically * in __hash_page_XX but this pre-check is a fast path
*/ if (!check_pte_access(access, pte_val(*ptep))) {
DBG_LOW(" no access !\n");
rc = 1; goto bail;
}
if (hugeshift) { if (is_thp)
rc = __hash_page_thp(ea, access, vsid, (pmd_t *)ptep,
trap, flags, ssize, psize); #ifdef CONFIG_HUGETLB_PAGE else
rc = __hash_page_huge(ea, access, vsid, ptep, trap,
flags, ssize, hugeshift, psize); #else else { /* * if we have hugeshift, and is not transhuge with * hugetlb disabled, something is really wrong.
*/
rc = 1;
WARN_ON(1);
} #endif if (current->mm == mm)
check_paca_psize(ea, mm, psize, user_region);
goto bail;
}
#ifndef CONFIG_PPC_64K_PAGES
DBG_LOW(" i-pte: %016lx\n", pte_val(*ptep)); #else
DBG_LOW(" i-pte: %016lx %016lx\n", pte_val(*ptep),
pte_val(*(ptep + PTRS_PER_PTE))); #endif /* Do actual hashing */ #ifdef CONFIG_PPC_64K_PAGES /* If H_PAGE_4K_PFN is set, make sure this is a 4k segment */ if ((pte_val(*ptep) & H_PAGE_4K_PFN) && psize == MMU_PAGE_64K) {
demote_segment_4k(mm, ea);
psize = MMU_PAGE_4K;
}
/* * If this PTE is non-cacheable and we have restrictions on * using non cacheable large pages, then we switch to 4k
*/ if (mmu_ci_restrictions && psize == MMU_PAGE_64K && pte_ci(*ptep)) { if (user_region) {
demote_segment_4k(mm, ea);
psize = MMU_PAGE_4K;
} elseif (ea < VMALLOC_END) { /* * some driver did a non-cacheable mapping * in vmalloc space, so switch vmalloc * to 4k pages
*/
printk(KERN_ALERT "Reducing vmalloc segment " "to 4kB pages because of " "non-cacheable mapping\n");
psize = mmu_vmalloc_psize = MMU_PAGE_4K; #ifdef CONFIG_SPU_BASE
spu_flush_all_slbs(mm); #endif
}
}
#endif/* CONFIG_PPC_64K_PAGES */
if (current->mm == mm)
check_paca_psize(ea, mm, psize, user_region);
/* * Dump some info in case of hash insertion failure, they should * never happen so it is really useful to know if/when they do
*/ if (rc == -1)
hash_failure_debug(ea, access, vsid, trap, ssize, psize,
psize, pte_val(*ptep)); #ifndef CONFIG_PPC_64K_PAGES
DBG_LOW(" o-pte: %016lx\n", pte_val(*ptep)); #else
DBG_LOW(" o-pte: %016lx %016lx\n", pte_val(*ptep),
pte_val(*(ptep + PTRS_PER_PTE))); #endif
DBG_LOW(" -> rc=%d\n", rc);
if (unlikely(dsisr & (DSISR_BAD_FAULT_64S | DSISR_KEYFAULT))) {
hash__do_page_fault(regs); return;
}
region_id = get_region_id(ea); if ((region_id == VMALLOC_REGION_ID) || (region_id == IO_REGION_ID))
mm = &init_mm; else
mm = current->mm;
if (dsisr & DSISR_NOHPTE)
flags |= HPTE_NOHPTE_UPDATE;
if (dsisr & DSISR_ISSTORE)
access |= _PAGE_WRITE; /* * We set _PAGE_PRIVILEGED only when * kernel mode access kernel space. * * _PAGE_PRIVILEGED is NOT set * 1) when kernel mode access user space * 2) user space access kernel space.
*/
access |= _PAGE_PRIVILEGED; if (user_mode(regs) || (region_id == USER_REGION_ID))
access &= ~_PAGE_PRIVILEGED;
if (TRAP(regs) == INTERRUPT_INST_STORAGE)
access |= _PAGE_EXEC;
err = hash_page_mm(mm, ea, access, TRAP(regs), flags); if (unlikely(err < 0)) { // failed to insert a hash PTE due to an hypervisor error if (user_mode(regs)) { if (IS_ENABLED(CONFIG_PPC_SUBPAGE_PROT) && err == -2)
_exception(SIGSEGV, regs, SEGV_ACCERR, ea); else
_exception(SIGBUS, regs, BUS_ADRERR, ea);
} else {
bad_page_fault(regs, SIGBUS);
}
err = 0;
/* Get Linux PTE if available */
pgdir = mm->pgd; if (pgdir == NULL) return;
/* Get VSID */
ssize = user_segment_size(ea);
vsid = get_user_vsid(&mm->context, ea, ssize); if (!vsid) return;
#ifdef CONFIG_PPC_64K_PAGES /* If either H_PAGE_4K_PFN or cache inhibited is set (and we are on * a 64K kernel), then we don't preload, hash_page() will take * care of it once we actually try to access the page. * That way we don't have to duplicate all of the logic for segment * page size demotion here * Called with PTL held, hence can be sure the value won't change in * between.
*/ if ((pte_val(*ptep) & H_PAGE_4K_PFN) || pte_ci(*ptep)) return; #endif/* CONFIG_PPC_64K_PAGES */
/* * __hash_page_* must run with interrupts off, including PMI interrupts * off, as it sets the H_PAGE_BUSY bit. * * It's otherwise possible for perf interrupts to hit at any time and * may take a hash fault reading the user stack, which could take a * hash miss and deadlock on the same H_PAGE_BUSY bit. * * Interrupts must also be off for the duration of the * mm_is_thread_local test and update, to prevent preempt running the * mm on another CPU (XXX: this may be racy vs kthread_use_mm).
*/
powerpc_local_irq_pmu_save(flags);
/* Is that local to this CPU ? */ if (mm_is_thread_local(mm))
update_flags |= HPTE_LOCAL_UPDATE;
/* Dump some info in case of hash insertion failure, they should * never happen so it is really useful to know if/when they do
*/ if (rc == -1)
hash_failure_debug(ea, access, vsid, trap, ssize,
mm_ctx_user_psize(&mm->context),
mm_ctx_user_psize(&mm->context),
pte_val(*ptep));
powerpc_local_irq_pmu_restore(flags);
}
/* * This is called at the end of handling a user page fault, when the * fault has been handled by updating a PTE in the linux page tables. * We use it to preload an HPTE into the hash table corresponding to * the updated linux PTE. * * This must always be called with the pte lock held.
*/ void __update_mmu_cache(struct vm_area_struct *vma, unsignedlong address,
pte_t *ptep)
{ /* * We don't need to worry about _PAGE_PRESENT here because we are * called with either mm->page_table_lock held or ptl lock held
*/ unsignedlong trap; bool is_exec;
/* We only want HPTEs for linux PTEs that have _PAGE_ACCESSED set */ if (!pte_young(*ptep) || address >= TASK_SIZE) return;
/* * We try to figure out if we are coming from an instruction * access fault and pass that down to __hash_page so we avoid * double-faulting on execution of fresh text. We have to test * for regs NULL since init will get here first thing at boot. * * We also avoid filling the hash if not coming from a fault.
*/
#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
/* Abort an active HW transaction after a local hash flush (see below). */
static inline void tm_flush_hash_page(int local)
{
	/*
	 * Transactions are not aborted by tlbiel, only tlbie. Without, syncing a
	 * page back to a block device w/PIO could pick up transactional data
	 * (bad!) so we force an abort here. Before the sync the page will be
	 * made read-only, which will flush_hash_page. BIG ISSUE here: if the
	 * kernel uses a page from userspace without unmapping it first, it may
	 * see the speculated version.
	 */
	if (local && cpu_has_feature(CPU_FTR_TM) && current->thread.regs &&
	    MSR_TM_ACTIVE(current->thread.regs->msr)) {
		tm_enable();
		tm_abort(TM_CAUSE_TLBI);
	}
}
#else
static inline void tm_flush_hash_page(int local)
{
}
#endif
/*
 * Return the global hash slot, corresponding to the given PTE, which contains
 * the HPTE.
 *
 * NOTE(review): the body was lost after the local declarations; reconstructed
 * the standard hash/hidx computation — confirm against history.
 */
unsigned long pte_get_hash_gslot(unsigned long vpn, unsigned long shift,
		int ssize, real_pte_t rpte, unsigned int subpg_index)
{
	unsigned long hash, gslot, hidx;

	hash = hpt_hash(vpn, shift, ssize);
	hidx = __rpte_to_hidx(rpte, subpg_index);
	if (hidx & _PTEIDX_SECONDARY)
		hash = ~hash;
	gslot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
	gslot += hidx & _PTEIDX_GROUP_IX;
	return gslot;
}
/*
 * Invalidate the HPTE(s) backing a single Linux PTE, iterating over all
 * hashed subpages for sub-4K-in-64K configurations.
 */
void flush_hash_page(unsigned long vpn, real_pte_t pte, int psize, int ssize,
		     unsigned long flags)
{
	unsigned long index, shift, gslot;
	int local = flags & HPTE_LOCAL_UPDATE;

	DBG_LOW("flush_hash_page(vpn=%016lx)\n", vpn);
	pte_iterate_hashed_subpages(pte, psize, vpn, index, shift) {
		gslot = pte_get_hash_gslot(vpn, shift, ssize, pte, index);
		DBG_LOW(" sub %ld: gslot=%lx\n", index, gslot);
		/*
		 * We use same base page size and actual psize, because we don't
		 * use these functions for hugepage
		 */
		mmu_hash_ops.hpte_invalidate(gslot, vpn, psize, psize,
					     ssize, local);
	} pte_iterate_hashed_end();

	tm_flush_hash_page(local);
}
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/*
 * Invalidate all HPTEs backing a hugepage PMD, using the per-subpage slot
 * array stashed behind the PMD.
 *
 * NOTE(review): the visible text was truncated (slot computation, invalidate
 * call and tm_abort label were missing); reconstructed — confirm.
 */
void flush_hash_hugepage(unsigned long vsid, unsigned long addr,
			 pmd_t *pmdp, unsigned int psize, int ssize,
			 unsigned long flags)
{
	int i, max_hpte_count, valid;
	unsigned long s_addr;
	unsigned char *hpte_slot_array;
	unsigned long hidx, shift, vpn, hash, slot;
	int local = flags & HPTE_LOCAL_UPDATE;

	s_addr = addr & HPAGE_PMD_MASK;
	hpte_slot_array = get_hpte_slot_array(pmdp);
	/*
	 * IF we try to do a HUGE PTE update after a withdraw is done.
	 * we will find the below NULL. This happens when we do
	 * split_huge_pmd
	 */
	if (!hpte_slot_array)
		return;

	if (mmu_hash_ops.hugepage_invalidate) {
		mmu_hash_ops.hugepage_invalidate(vsid, s_addr, hpte_slot_array,
						 psize, ssize, local);
		goto tm_abort;
	}
	/*
	 * No bluk hpte removal support, invalidate each entry
	 */
	shift = mmu_psize_defs[psize].shift;
	max_hpte_count = HPAGE_PMD_SIZE >> shift;
	for (i = 0; i < max_hpte_count; i++) {
		/*
		 * 8 bits per each hpte entries
		 * 000| [ secondary group (one bit) | hidx (3 bits) | valid bit]
		 */
		valid = hpte_valid(hpte_slot_array, i);
		if (!valid)
			continue;
		hidx = hpte_hash_index(hpte_slot_array, i);

		/* get the vpn */
		addr = s_addr + (i * (1ul << shift));
		vpn = hpt_vpn(addr, vsid, ssize);
		hash = hpt_hash(vpn, shift, ssize);
		if (hidx & _PTEIDX_SECONDARY)
			hash = ~hash;

		slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
		slot += hidx & _PTEIDX_GROUP_IX;
		mmu_hash_ops.hpte_invalidate(slot, vpn, psize,
					     MMU_PAGE_16M, ssize, local);
	}
tm_abort:
	tm_flush_hash_page(local);
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
void flush_hash_range(unsignedlong number, int local)
{ if (mmu_hash_ops.flush_hash_range)
mmu_hash_ops.flush_hash_range(number, local); else { int i; struct ppc64_tlb_batch *batch =
this_cpu_ptr(&ppc64_tlb_batch);
for (i = 0; i < number; i++)
flush_hash_page(batch->vpn[i], batch->pte[i],
batch->psize, batch->ssize, local);
}
}
long hpte_insert_repeating(unsignedlong hash, unsignedlong vpn, unsignedlong pa, unsignedlong rflags, unsignedlong vflags, int psize, int ssize)
{ unsignedlong hpte_group; long slot;
/*
 * Remove all HPTEs from the groups this CPU recorded during stress testing,
 * then forget them (mark slots -1).
 */
void hpt_clear_stress(void)
{
	int cpu = raw_smp_processor_id();
	int g;

	for (g = 0; g < stress_nr_groups(); g++) {
		unsigned long last_group;
		last_group = stress_hpt_struct[cpu].last_group[g];

		if (last_group != -1UL) {
			int i;
			for (i = 0; i < HPTES_PER_GROUP; i++) {
				if (mmu_hash_ops.hpte_remove(last_group) == -1)
					break;
			}
			stress_hpt_struct[cpu].last_group[g] = -1;
		}
	}
}
void hpt_do_stress(unsignedlong ea, unsignedlong hpte_group)
{ unsignedlong last_group; int cpu = raw_smp_processor_id();
last_group = stress_hpt_struct[cpu].last_group[stress_nr_groups() - 1]; if (hpte_group == last_group) return;
if (last_group != -1UL) { int i; /* * Concurrent CPUs might be inserting into this group, so * give up after a number of iterations, to prevent a live * lock.
*/ for (i = 0; i < HPTES_PER_GROUP; i++) { if (mmu_hash_ops.hpte_remove(last_group) == -1) break;
}
stress_hpt_struct[cpu].last_group[stress_nr_groups() - 1] = -1;
}
if (ea >= PAGE_OFFSET) { /* * We would really like to prefetch to get the TLB loaded, then * remove the PTE before returning from fault interrupt, to * increase the hash fault rate. * * Unfortunately QEMU TCG does not model the TLB in a way that * makes this possible, and systemsim (mambo) emulator does not * bring in TLBs with prefetches (although loads/stores do * work for non-CI PTEs). * * So remember this PTE and clear it on the next hash fault.
*/
memmove(&stress_hpt_struct[cpu].last_group[1],
&stress_hpt_struct[cpu].last_group[0],
(stress_nr_groups() - 1) * sizeof(unsignedlong));
stress_hpt_struct[cpu].last_group[0] = hpte_group;
}
}
/*
 * Clamp early memblock allocations to the guaranteed-accessible real mode
 * area (RMA) on virtualized systems; bare-metal hash has no such limit.
 *
 * NOTE(review): the visible text was truncated (memblock limit call and the
 * HV-mode else branch were missing); reconstructed — confirm against history.
 */
void hash__setup_initial_memory_limit(phys_addr_t first_memblock_base,
				      phys_addr_t first_memblock_size)
{
	/*
	 * We don't currently support the first MEMBLOCK not mapping 0
	 * physical on those processors
	 */
	BUG_ON(first_memblock_base != 0);

	/*
	 * On virtualized systems the first entry is our RMA region aka VRMA,
	 * non-virtualized 64-bit hash MMU systems don't have a limitation
	 * on real mode access.
	 *
	 * For guests on platforms before POWER9, we clamp the it limit to 1G
	 * to avoid some funky things such as RTAS bugs etc...
	 *
	 * On POWER9 we limit to 1TB in case the host erroneously told us that
	 * the RMA was >1TB. Effective address bits 0:23 are treated as zero
	 * (meaning the access is aliased to zero i.e. addr = addr % 1TB)
	 * for virtual real mode addressing and so it doesn't make sense to
	 * have an area larger than 1TB as it can't be addressed.
	 */
	if (!early_cpu_has_feature(CPU_FTR_HVMODE)) {
		ppc64_rma_size = first_memblock_size;
		if (!early_cpu_has_feature(CPU_FTR_ARCH_300))
			ppc64_rma_size = min_t(u64, ppc64_rma_size, 0x40000000);
		else
			ppc64_rma_size = min_t(u64, ppc64_rma_size,
					       1UL << SID_SHIFT_1T);

		/* Finally limit subsequent allocations */
		memblock_set_current_limit(ppc64_rma_size);
	} else {
		ppc64_rma_size = ULONG_MAX;
	}
}
/*
 * NOTE(review): extraneous webpage boilerplate (German disclaimer) was
 * appended to this file by the extraction tool; it is not source code.
 * Wrapped in a comment to keep the file well-formed — should be deleted.
 *
 * Die Informationen auf dieser Webseite wurden nach bestem Wissen
 * sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit,
 * noch Richtigkeit, noch Qualität der bereit gestellten Informationen
 * zugesichert.
 * Bemerkung: Die farbliche Syntaxdarstellung und die Messung sind noch
 * experimentell.
 */