#if H_PGTABLE_RANGE > (USER_VSID_RANGE * (TASK_SIZE_USER64 / TASK_CONTEXT_SIZE)) #warning Limited user VSID range means pagetable space is wasted #endif
#ifdef CONFIG_SPARSEMEM_VMEMMAP /* * vmemmap is the starting address of the virtual address space where * struct pages are allocated for all possible PFNs present on the system * including holes and bad memory (hence sparse). These virtual struct * pages are stored in sequence in this virtual address space irrespective * of the fact whether the corresponding PFN is valid or not. This achieves * constant relationship between address of struct page and its PFN. * * During boot or memory hotplug operation when a new memory section is * added, physical memory allocation (including hash table bolting) will * be performed for the set of struct pages which are part of the memory * section. This saves memory by not allocating struct pages for PFNs * which are not valid. * * ---------------------------------------------- * | PHYSICAL ALLOCATION OF VIRTUAL STRUCT PAGES| * ---------------------------------------------- * * f000000000000000 c000000000000000 * vmemmap +--------------+ +--------------+ * + | page struct | +--------------> | page struct | * | +--------------+ +--------------+ * | | page struct | +--------------> | page struct | * | +--------------+ | +--------------+ * | | page struct | + +------> | page struct | * | +--------------+ | +--------------+ * | | page struct | | +--> | page struct | * | +--------------+ | | +--------------+ * | | page struct | | | * | +--------------+ | | * | | page struct | | | * | +--------------+ | | * | | page struct | | | * | +--------------+ | | * | | page struct | | | * | +--------------+ | | * | | page struct | +-------+ | * | +--------------+ | * | | page struct | +-----------+ * | +--------------+ * | | page struct | No mapping * | +--------------+ * | | page struct | No mapping * v +--------------+ * * ----------------------------------------- * | RELATION BETWEEN STRUCT PAGES AND PFNS| * ----------------------------------------- * * vmemmap +--------------+ +---------------+ * + | page struct | +-------------> | PFN | * | +--------------+ +---------------+ * | | page struct | +-------------> | PFN | * | +--------------+ +---------------+ * | | page struct | +-------------> | PFN | * | +--------------+ +---------------+ * | | page struct | +-------------> | PFN | * | +--------------+ +---------------+ * | | | * | +--------------+ * | | | * | +--------------+ * | | | * | +--------------+ +---------------+ * | | page struct | +-------------> | PFN | * | +--------------+ +---------------+ * | | | * | +--------------+ * | | | * | +--------------+ +---------------+ * | | page struct | +-------------> | PFN | * | +--------------+ +---------------+ * | | page struct | +-------------> | PFN | * v +--------------+ +---------------+
*/ /* * On hash-based CPUs, the vmemmap is bolted in the hash table. *
*/ int __meminit hash__vmemmap_create_mapping(unsignedlong start, unsignedlong page_size, unsignedlong phys)
{ int rc;
if ((start + page_size) >= H_VMEMMAP_END) {
pr_warn("Outside the supported range\n"); return -1;
}
/* * map_kernel_page currently only called by __ioremap * map_kernel_page adds an entry to the ioremap page table * and adds an entry to the HPT, possibly bolting it
*/ int hash__map_kernel_page(unsignedlong ea, unsignedlong pa, pgprot_t prot)
{
pgd_t *pgdp;
p4d_t *p4dp;
pud_t *pudp;
pmd_t *pmdp;
pte_t *ptep;
BUILD_BUG_ON(TASK_SIZE_USER64 > H_PGTABLE_RANGE); if (slab_is_available()) {
pgdp = pgd_offset_k(ea);
p4dp = p4d_offset(pgdp, ea);
pudp = pud_alloc(&init_mm, p4dp, ea); if (!pudp) return -ENOMEM;
pmdp = pmd_alloc(&init_mm, pudp, ea); if (!pmdp) return -ENOMEM;
ptep = pte_alloc_kernel(pmdp, ea); if (!ptep) return -ENOMEM;
set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT, prot));
} else { /* * If the mm subsystem is not fully up, we cannot create a * linux page table entry for this mapping. Simply bolt an * entry in the hardware page table. *
*/ if (htab_bolt_mapping(ea, ea + PAGE_SIZE, pa, pgprot_val(prot),
mmu_io_psize, mmu_kernel_ssize)) {
printk(KERN_ERR "Failed to do bolted mapping IO " "memory at %016lx !\n", pa); return -ENOMEM;
}
}
pmd = *pmdp;
pmd_clear(pmdp); /* * Wait for all pending hash_page to finish. This is needed * in case of subpage collapse. When we collapse normal pages * to hugepage, we first clear the pmd, then invalidate all * the PTE entries. The assumption here is that any low level * page fault will see a none pmd and take the slow path that * will wait on mmap_lock. But we could very well be in a * hash_page with local ptep pointer value. Such a hash page * can result in adding new HPTE entries for normal subpages. * That means we could be modifying the page content as we * copy them to a huge page. So wait for parallel hash_page * to finish before invalidating HPTE entries. We can do this * by sending an IPI to all the cpus and executing a dummy * function there.
*/
serialize_against_pte_lookup(vma->vm_mm); /* * Now invalidate the hpte entries in the range * covered by pmd. This make sure we take a * fault and will find the pmd as none, which will * result in a major fault which takes mmap_lock and * hence wait for collapse to complete. Without this * the __collapse_huge_page_copy can result in copying * the old content.
*/
flush_hash_table_pmd_range(vma->vm_mm, &pmd, address); return pmd;
}
/* * We want to put the pgtable in pmd and use pgtable for tracking * the base page size hptes
*/ void hash__pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
pgtable_t pgtable)
{
pgtable_t *pgtable_slot;
assert_spin_locked(pmd_lockptr(mm, pmdp)); /* * we store the pgtable in the second half of PMD
*/
pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD;
*pgtable_slot = pgtable; /* * expose the deposited pgtable to other cpus. * before we set the hugepage PTE at pmd level * hash fault code looks at the deposted pgtable * to store hash index values.
*/
smp_wmb();
}
pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD;
pgtable = *pgtable_slot; /* * Once we withdraw, mark the entry NULL.
*/
*pgtable_slot = NULL; /* * We store HPTE information in the deposited PTE fragment. * zero out the content on withdraw.
*/
memset(pgtable, 0, PTE_FRAG_SIZE); return pgtable;
}
/* * A linux hugepage PMD was changed and the corresponding hash table entries * neesd to be flushed.
*/ void hpte_do_hugepage_flush(struct mm_struct *mm, unsignedlong addr,
pmd_t *pmdp, unsignedlong old_pmd)
{ int ssize; unsignedint psize; unsignedlong vsid; unsignedlong flags = 0;
/* get the base page size,vsid and segment size */ #ifdef CONFIG_DEBUG_VM
psize = get_slice_psize(mm, addr);
BUG_ON(psize == MMU_PAGE_16M); #endif if (old_pmd & H_PAGE_COMBO)
psize = MMU_PAGE_4K; else
psize = MMU_PAGE_64K;
old = pmd_hugepage_update(mm, addr, pmdp, ~0UL, 0);
old_pmd = __pmd(old); /* * We have pmd == none and we are holding page_table_lock. * So we can safely go and clear the pgtable hash * index info.
*/
pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD;
pgtable = *pgtable_slot; /* * Let's zero out old valid and hash index details * hash fault look at them.
*/
memset(pgtable, 0, PTE_FRAG_SIZE); return old_pmd;
}
int hash__has_transparent_hugepage(void)
{
if (!mmu_has_feature(MMU_FTR_16M_PAGE)) return 0; /* * We support THP only if PMD_SIZE is 16MB.
*/ if (mmu_psize_defs[MMU_PAGE_16M].shift != PMD_SHIFT) return 0; /* * We need to make sure that we support 16MB hugepage in a segment * with base page size 64K or 4K. We only enable THP with a PAGE_SIZE * of 64K.
*/ /* * If we have 64K HPTE, we will be using that by default
*/ if (mmu_psize_defs[MMU_PAGE_64K].shift &&
(mmu_psize_defs[MMU_PAGE_64K].penc[MMU_PAGE_16M] == -1)) return 0; /* * Ok we only have 4K HPTE
*/ if (mmu_psize_defs[MMU_PAGE_4K].penc[MMU_PAGE_16M] == -1) return 0;
pr_debug("Changing page protection on range 0x%lx-0x%lx, to 0x%lx, step 0x%x\n",
start, end, newpp, step);
for (idx = start; idx < end; idx += step) /* Not sure if we can do much with the return value */
mmu_hash_ops.hpte_updateboltedpp(newpp, idx, mmu_linear_psize,
mmu_kernel_ssize);
}
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereit gestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.