/*
 * Track the number of directly-mapped pages per page-table level for
 * /proc statistics. Compiles to a no-op when CONFIG_PROC_FS is off.
 */
static inline void update_page_count(int level, long count)
{
	if (IS_ENABLED(CONFIG_PROC_FS))
		atomic_long_add(count, &direct_pages_count[level]);
}
/*
 * The S390 doesn't have any external MMU info: the kernel page
 * tables contain all the necessary information.
 */
#define update_mmu_cache(vma, address, ptep)			do { } while (0)
#define update_mmu_cache_range(vmf, vma, addr, ptep, nr)	do { } while (0)
#define update_mmu_cache_pmd(vma, address, ptep)		do { } while (0)
/* * ZERO_PAGE is a global shared page that is always zero; used * for zero-mapped memory areas etc..
*/
/* TODO: s390 cannot support io_remap_pfn_range... */
/* Report a corrupted table entry at each pagetable level */
#define pte_ERROR(e) \
	pr_err("%s:%d: bad pte %016lx.\n", __FILE__, __LINE__, pte_val(e))
#define pmd_ERROR(e) \
	pr_err("%s:%d: bad pmd %016lx.\n", __FILE__, __LINE__, pmd_val(e))
#define pud_ERROR(e) \
	pr_err("%s:%d: bad pud %016lx.\n", __FILE__, __LINE__, pud_val(e))
#define p4d_ERROR(e) \
	pr_err("%s:%d: bad p4d %016lx.\n", __FILE__, __LINE__, p4d_val(e))
#define pgd_ERROR(e) \
	pr_err("%s:%d: bad pgd %016lx.\n", __FILE__, __LINE__, pgd_val(e))
/*
 * The vmalloc and module area will always be on the topmost area of the
 * kernel mapping. 512GB are reserved for vmalloc by default.
 * At the top of the vmalloc area a 2GB area is reserved where modules
 * will reside. That makes sure that inter module branches always
 * happen without trampolines and in addition the placement within a
 * 2GB frame is branch prediction unit friendly.
 */
extern unsigned long VMALLOC_START;
extern unsigned long VMALLOC_END;
#define VMALLOC_DEFAULT_SIZE	((512UL << 30) - MODULES_LEN)
extern struct page *vmemmap;
extern unsigned long vmemmap_size;
/* * A 64 bit pagetable entry of S390 has following format: * | PFRA |0IPC| OS | * 0000000000111111111122222222223333333333444444444455555555556666 * 0123456789012345678901234567890123456789012345678901234567890123 * * I Page-Invalid Bit: Page is not available for address-translation * P Page-Protection Bit: Store access not possible for page * C Change-bit override: HW is not required to set change bit * * A 64 bit segmenttable entry of S390 has following format: * | P-table origin | TT * 0000000000111111111122222222223333333333444444444455555555556666 * 0123456789012345678901234567890123456789012345678901234567890123 * * I Segment-Invalid Bit: Segment is not available for address-translation * C Common-Segment Bit: Segment is not private (PoP 3-30) * P Page-Protection Bit: Store access not possible for page * TT Type 00 * * A 64 bit region table entry of S390 has following format: * | S-table origin | TF TTTL * 0000000000111111111122222222223333333333444444444455555555556666 * 0123456789012345678901234567890123456789012345678901234567890123 * * I Segment-Invalid Bit: Segment is not available for address-translation * TT Type 01 * TF * TL Table length * * The 64 bit regiontable origin of S390 has following format: * | region table origon | DTTL * 0000000000111111111122222222223333333333444444444455555555556666 * 0123456789012345678901234567890123456789012345678901234567890123 * * X Space-Switch event: * G Segment-Invalid Bit: * P Private-Space Bit: * S Storage-Alteration: * R Real space * TL Table-Length: * * A storage key has the following format: * | ACC |F|R|C|0| * 0 3 4 5 6 7 * ACC: access key * F : fetch protection bit * R : referenced bit * C : changed bit
*/
/* Hardware bits in the page table entry */
#define _PAGE_NOEXEC	0x100	/* HW no-execute bit */
#define _PAGE_PROTECT	0x200	/* HW read-only bit */
#define _PAGE_INVALID	0x400	/* HW invalid bit */
#define _PAGE_LARGE	0x800	/* Bit to mark a large pte */

/* Software bits in the page table entry */
#define _PAGE_PRESENT	0x001	/* SW pte present bit */
#define _PAGE_YOUNG	0x004	/* SW pte young bit */
#define _PAGE_DIRTY	0x008	/* SW pte dirty bit */
#define _PAGE_READ	0x010	/* SW pte read bit */
#define _PAGE_WRITE	0x020	/* SW pte write bit */
#define _PAGE_SPECIAL	0x040	/* SW associated with special page */
#define _PAGE_UNUSED	0x080	/* SW bit for pgste usage state */

#define _PAGE_SWP_EXCLUSIVE _PAGE_LARGE	/* SW pte exclusive swap bit */

/* Set of bits not changed in pte_modify */
#define _PAGE_CHG_MASK		(PAGE_MASK | _PAGE_SPECIAL | _PAGE_DIRTY | \
				 _PAGE_YOUNG | _PAGE_SOFT_DIRTY)
/*
 * Mask of bits that must not be changed with RDP. Allow only _PAGE_PROTECT
 * HW bit and all SW bits.
 */
#define _PAGE_RDP_MASK		~(_PAGE_PROTECT | _PAGE_SW_BITS)
/* * handle_pte_fault uses pte_present and pte_none to find out the pte type * WITHOUT holding the page table lock. The _PAGE_PRESENT bit is used to * distinguish present from not-present ptes. It is changed only with the page * table lock held. * * The following table gives the different possible bit combinations for * the pte hardware and software bits in the last 12 bits of a pte * (. unassigned bit, x don't care, t swap type): * * 842100000000 * 000084210000 * 000000008421 * .IR.uswrdy.p * empty .10.00000000 * swap .11..ttttt.0 * prot-none, clean, old .11.xx0000.1 * prot-none, clean, young .11.xx0001.1 * prot-none, dirty, old .11.xx0010.1 * prot-none, dirty, young .11.xx0011.1 * read-only, clean, old .11.xx0100.1 * read-only, clean, young .01.xx0101.1 * read-only, dirty, old .11.xx0110.1 * read-only, dirty, young .01.xx0111.1 * read-write, clean, old .11.xx1100.1 * read-write, clean, young .01.xx1101.1 * read-write, dirty, old .10.xx1110.1 * read-write, dirty, young .00.xx1111.1 * HW-bits: R read-only, I invalid * SW-bits: p present, y young, d dirty, r read, w write, s special, * u unused, l large * * pte_none is true for the bit pattern .10.00000000, pte == 0x400 * pte_swap is true for the bit pattern .11..ooooo.0, (pte & 0x201) == 0x200 * pte_present is true for the bit pattern .xx.xxxxxx.1, (pte & 0x001) == 0x001
*/
/* Bits in the segment/region table address-space-control-element */
#define _ASCE_ORIGIN		~0xfffUL /* region/segment table origin */
#define _ASCE_PRIVATE_SPACE	0x100	/* private space control */
#define _ASCE_ALT_EVENT		0x80	/* storage alteration event control */
#define _ASCE_SPACE_SWITCH	0x40	/* space switch event */
#define _ASCE_REAL_SPACE	0x20	/* real space control */
#define _ASCE_TYPE_MASK		0x0c	/* asce table type mask */
#define _ASCE_TYPE_REGION1	0x0c	/* region first table type */
#define _ASCE_TYPE_REGION2	0x08	/* region second table type */
#define _ASCE_TYPE_REGION3	0x04	/* region third table type */
#define _ASCE_TYPE_SEGMENT	0x00	/* segment table type */
#define _ASCE_TABLE_LENGTH	0x03	/* region table length */

/* Bits in the region table entry */
#define _REGION_ENTRY_ORIGIN	~0xfffUL /* region/segment table origin */
#define _REGION_ENTRY_PROTECT	0x200	/* region protection bit */
#define _REGION_ENTRY_NOEXEC	0x100	/* region no-execute bit */
#define _REGION_ENTRY_OFFSET	0xc0	/* region table offset */
#define _REGION_ENTRY_INVALID	0x20	/* invalid region table entry */
#define _REGION_ENTRY_TYPE_MASK	0x0c	/* region table type mask */
#define _REGION_ENTRY_TYPE_R1	0x0c	/* region first table type */
#define _REGION_ENTRY_TYPE_R2	0x08	/* region second table type */
#define _REGION_ENTRY_TYPE_R3	0x04	/* region third table type */
#define _REGION_ENTRY_LENGTH	0x03	/* region third length */

#define _REGION3_ENTRY_HARDWARE_BITS		0xfffffffffffff6ffUL
#define _REGION3_ENTRY_HARDWARE_BITS_LARGE	0xffffffff8001073cUL
#define _REGION3_ENTRY_ORIGIN_LARGE	~0x7fffffffUL /* large page address */
#define _REGION3_ENTRY_DIRTY	0x2000	/* SW region dirty bit */
#define _REGION3_ENTRY_YOUNG	0x1000	/* SW region young bit */
#define _REGION3_ENTRY_COMM	0x0010	/* Common-Region, marks swap entry */
#define _REGION3_ENTRY_LARGE	0x0400	/* RTTE-format control, large page */
#define _REGION3_ENTRY_WRITE	0x8000	/* SW region write bit */
#define _REGION3_ENTRY_READ	0x4000	/* SW region read bit */

#ifdef CONFIG_MEM_SOFT_DIRTY
#define _REGION3_ENTRY_SOFT_DIRTY 0x0002 /* SW region soft dirty bit */
#else
#define _REGION3_ENTRY_SOFT_DIRTY 0x0000 /* SW region soft dirty bit */
#endif

#define _REGION_ENTRY_BITS	0xfffffffffffff22fUL

/*
 * SW region present bit. For non-leaf region-third-table entries, bits 62-63
 * indicate the TABLE LENGTH and both must be set to 1. But such entries
 * would always be considered as present, so it is safe to use bit 63 as
 * PRESENT bit for PUD.
 */
#define _REGION3_ENTRY_PRESENT	0x0001
/* * Segment table and region3 table entry encoding * (R = read-only, I = invalid, y = young bit): * dy..R...I...wr * prot-none, clean, old 00..1...1...00 * prot-none, clean, young 01..1...1...00 * prot-none, dirty, old 10..1...1...00 * prot-none, dirty, young 11..1...1...00 * read-only, clean, old 00..1...1...01 * read-only, clean, young 01..1...0...01 * read-only, dirty, old 10..1...1...01 * read-only, dirty, young 11..1...0...01 * read-write, clean, old 00..1...1...11 * read-write, clean, young 01..1...0...11 * read-write, dirty, old 10..0...1...11 * read-write, dirty, young 11..0...0...11 * The segment table origin is used to distinguish empty (origin==0) from * read-write, old segment table entries (origin!=0) * HW-bits: R read-only, I invalid * SW-bits: y young, d dirty, r read, w write
*/
/* Page status table bits for virtualization */
#define PGSTE_ACC_BITS	0xf000000000000000UL
#define PGSTE_FP_BIT	0x0800000000000000UL
#define PGSTE_PCL_BIT	0x0080000000000000UL
#define PGSTE_HR_BIT	0x0040000000000000UL
#define PGSTE_HC_BIT	0x0020000000000000UL
#define PGSTE_GR_BIT	0x0004000000000000UL
#define PGSTE_GC_BIT	0x0002000000000000UL
#define PGSTE_ST2_MASK	0x0000ffff00000000UL
#define PGSTE_UC_BIT	0x0000000000008000UL	/* user dirty (migration) */
#define PGSTE_IN_BIT	0x0000000000004000UL	/* IPTE notify bit */
#define PGSTE_VSIE_BIT	0x0000000000002000UL	/* ref'd in a shadow table */

/* Guest Page State used for virtualization */
#define _PGSTE_GPS_ZERO			0x0000000080000000UL
#define _PGSTE_GPS_NODAT		0x0000000040000000UL
#define _PGSTE_GPS_USAGE_MASK		0x0000000003000000UL
#define _PGSTE_GPS_USAGE_STABLE		0x0000000000000000UL
#define _PGSTE_GPS_USAGE_UNUSED		0x0000000001000000UL
#define _PGSTE_GPS_USAGE_POT_VOLATILE	0x0000000002000000UL
#define _PGSTE_GPS_USAGE_VOLATILE	_PGSTE_GPS_USAGE_MASK

/*
 * A user page table pointer has the space-switch-event bit, the
 * private-space-control bit and the storage-alteration-event-control
 * bit set. A kernel page table pointer doesn't need them.
 */
#define _ASCE_USER_BITS		(_ASCE_SPACE_SWITCH | _ASCE_PRIVATE_SPACE | \
				 _ASCE_ALT_EVENT)
/*
 * As soon as the guest uses storage keys or enables PV, we deduplicate all
 * mapped shared zeropages and prevent new shared zeropages from getting
 * mapped.
 */
#define mm_forbids_zeropage mm_forbids_zeropage
static inline int mm_forbids_zeropage(struct mm_struct *mm)
{
#ifdef CONFIG_PGSTE
	/* COW sharing disabled -> the shared zeropage must not be mapped */
	if (!mm->context.allow_cow_sharing)
		return 1;
#endif
	return 0;
}
/*
 * NOTE(review): the body of cspg() below is truncated in this copy -- the
 * CSPG inline asm and the closing brace are missing. Do not edit from this
 * copy; restore the full function from the original source.
 */
/** * cspg() - Compare and Swap and Purge (CSPG) * @ptr: Pointer to the value to be exchanged * @old: The expected old value * @new: The new value * * Return: True if compare and swap was successful, otherwise false.
*/ staticinlinebool cspg(unsignedlong *ptr, unsignedlong old, unsignedlongnew)
{ union register_pair r1 = { .even = old, .odd = new, }; unsignedlong address = (unsignedlong)ptr | 1;
/*
 * NOTE(review): the body of crdte() below is truncated in this copy -- the
 * CRDTE inline asm and the closing brace are missing. Do not edit from this
 * copy; restore the full function from the original source.
 */
/** * crdte() - Compare and Replace DAT Table Entry * @old: The expected old value * @new: The new value * @table: Pointer to the value to be exchanged * @dtt: Table type of the table to be exchanged * @address: The address mapped by the entry to be replaced * @asce: The ASCE of this entry * * Return: True if compare and replace was successful, otherwise false.
*/ staticinlinebool crdte(unsignedlong old, unsignedlongnew, unsignedlong *table, unsignedlong dtt, unsignedlong address, unsignedlong asce)
{ union register_pair r1 = { .even = old, .odd = new, }; union register_pair r2 = { .even = __pa(table) | dtt, .odd = address, };
/*
 * NOTE(review): pte_pgprot() below is truncated in this copy -- the rest of
 * the body and the closing brace are missing. Restore the full function from
 * the original source before building.
 */
/* * Extract the pgprot value from the given pte while at the same time making it * usable for kernel address space mappings where fault driven dirty and * young/old accounting is not supported, i.e _PAGE_PROTECT and _PAGE_INVALID * must not be set.
*/ #define pte_pgprot pte_pgprot staticinline pgprot_t pte_pgprot(pte_t pte)
{ unsignedlong pte_flags = pte_val(pte) & _PAGE_CHG_MASK;
/* * The following pte modification functions only work if * pte_present() is true. Undefined behaviour if not..
*/ staticinline pte_t pte_modify(pte_t pte, pgprot_t newprot)
{
pte = clear_pte_bit(pte, __pgprot(~_PAGE_CHG_MASK));
pte = set_pte_bit(pte, newprot); /* * newprot for PAGE_NONE, PAGE_RO, PAGE_RX, PAGE_RW and PAGE_RWX * has the invalid bit set, clear it again for readable, young pages
*/ if ((pte_val(pte) & _PAGE_YOUNG) && (pte_val(pte) & _PAGE_READ))
pte = clear_pte_bit(pte, __pgprot(_PAGE_INVALID)); /* * newprot for PAGE_RO, PAGE_RX, PAGE_RW and PAGE_RWX has the page * protection bit set, clear it again for writable, dirty pages
*/ if ((pte_val(pte) & _PAGE_DIRTY) && (pte_val(pte) & _PAGE_WRITE))
pte = clear_pte_bit(pte, __pgprot(_PAGE_PROTECT)); return pte;
}
/*
 * NOTE(review): this fragment is the tail of an IPTE helper (presumably
 * __ptep_ipte); the function signature and the setup of pto/address/opt/asce
 * are missing from this copy. It issues IPTE either without options (plain
 * invalidation + TLB flush) or with the options/ASCE variant. Restore the
 * full function from the original source before editing.
 */
if (__builtin_constant_p(opt) && opt == 0) { /* Invalidation + TLB flush for the pte */ asmvolatile( " ipte %[r1],%[r2],0,%[m4]"
: "+m" (*ptep) : [r1] "a" (pto), [r2] "a" (address),
[m4] "i" (local)); return;
}
/* Invalidate ptes with options + TLB flush of the ptes */
opt = opt | (asce & _ASCE_ORIGIN); asmvolatile( " ipte %[r1],%[r2],%[r3],%[m4]"
: [r2] "+a" (address), [r3] "+a" (opt)
: [r1] "a" (pto), [m4] "i" (local) : "memory");
}
/*
 * Invalidate a range of consecutive ptes and flush their TLB entries.
 * The loop repeats the IPTE range form until @nr reaches 255, which per
 * the loop condition marks completion (hardware updates the count in the
 * register operand -- confirm against the Principles of Operation).
 */
static __always_inline void __ptep_ipte_range(unsigned long address, int nr,
					      pte_t *ptep, int local)
{
	unsigned long pto = __pa(ptep);

	/* Invalidate a range of ptes + TLB flush of the ptes */
	do {
		asm volatile(
			"	ipte	%[r1],%[r2],%[r3],%[m4]"
			: [r2] "+a" (address), [r3] "+a" (nr)
			: [r1] "a" (pto), [m4] "i" (local)
			: "memory");
	} while (nr != 255);
}
/*
 * This is hard to understand. ptep_get_and_clear and ptep_clear_flush
 * both clear the TLB for the unmapped pte. The reason is that
 * ptep_get_and_clear is used in common code (e.g. change_pte_range)
 * to modify an active pte. The sequence is
 *   1) ptep_get_and_clear
 *   2) set_pte_at
 *   3) flush_tlb_range
 * On s390 the tlb needs to get flushed with the modification of the pte
 * if the pte is active. The only way how this can be implemented is to
 * have ptep_get_and_clear do the tlb flush. In exchange flush_tlb_range
 * is a nop.
 */
pte_t ptep_xchg_direct(struct mm_struct *, unsigned long, pte_t *, pte_t);
pte_t ptep_xchg_lazy(struct mm_struct *, unsigned long, pte_t *, pte_t);
/*
 * NOTE(review): tail of a ptep_get_and_clear-style helper; the signature and
 * declaration of res are missing from this copy. Exchanges the pte with
 * _PAGE_INVALID and, for protected (PV) guests, presumably converts the page
 * back from secure storage -- confirm against the original source.
 */
res = ptep_xchg_lazy(mm, addr, ptep, __pte(_PAGE_INVALID)); /* At this point the reference through the mapping is still present */ if (mm_is_protected(mm) && pte_present(res))
uv_convert_from_secure_pte(res); return res;
}
/*
 * NOTE(review): tail of a ptep_clear_flush-style helper; the signature and
 * declaration of res are missing from this copy. Uses the direct (flushing)
 * exchange and, for protected (PV) guests, presumably converts the page back
 * from secure storage -- confirm against the original source.
 */
res = ptep_xchg_direct(vma->vm_mm, addr, ptep, __pte(_PAGE_INVALID)); /* At this point the reference through the mapping is still present */ if (mm_is_protected(vma->vm_mm) && pte_present(res))
uv_convert_from_secure_pte(res); return res;
}
/*
 * The batched pte unmap code uses ptep_get_and_clear_full to clear the
 * ptes. Here an optimization is possible. tlb_gather_mmu flushes all
 * tlbs of an mm if it can guarantee that the ptes of the mm_struct
 * cannot be accessed while the batched unmap is running. In this case
 * full==1 and a simple pte_clear is enough. See tlb.h.
 */
#define __HAVE_ARCH_PTEP_GET_AND_CLEAR_FULL
static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm,
					    unsigned long addr,
					    pte_t *ptep, int full)
{
	pte_t res;

	if (full) {
		/* mm teardown: no concurrent access, plain clear is enough */
		res = *ptep;
		set_pte(ptep, __pte(_PAGE_INVALID));
	} else {
		res = ptep_xchg_lazy(mm, addr, ptep, __pte(_PAGE_INVALID));
	}
	/* Nothing to do */
	if (!mm_is_protected(mm) || !pte_present(res))
		return res;
	/*
	 * At this point the reference through the mapping is still present.
	 * The notifier should have destroyed all protected vCPUs at this
	 * point, so the destroy should be successful.
	 */
	if (full && !uv_destroy_pte(res))
		return res;
	/*
	 * If something went wrong and the page could not be destroyed, or
	 * if this is not a mm teardown, the slower export is used as
	 * fallback instead.
	 */
	uv_convert_from_secure_pte(res);
	return res;
}
/*
 * NOTE(review): tail of a wrprotect-style helper; the signature and the
 * load of pte are missing from this copy. Write-protects the pte via a
 * lazy exchange when it is currently writable.
 */
if (pte_write(pte))
ptep_xchg_lazy(mm, addr, ptep, pte_wrprotect(pte));
}
/*
 * NOTE(review): pte_allow_rdp() below is truncated in this copy -- the final
 * masked comparison of old vs. new and the closing brace are missing.
 * Restore the full function from the original source before building.
 */
/* * Check if PTEs only differ in _PAGE_PROTECT HW bit, but also allow SW PTE * bits in the comparison. Those might change e.g. because of dirty and young * tracking.
*/ staticinlineint pte_allow_rdp(pte_t old, pte_t new)
{ /* * Only allow changes from RO to RW
*/ if (!(pte_val(old) & _PAGE_PROTECT) || pte_val(new) & _PAGE_PROTECT) return 0;
static inline void flush_tlb_fix_spurious_fault(struct vm_area_struct *vma,
						unsigned long address,
						pte_t *ptep)
{
	/*
	 * RDP might not have propagated the PTE protection reset to all CPUs,
	 * so there could be spurious TLB protection faults.
	 * NOTE: This will also be called when a racing pagetable update on
	 * another thread already installed the correct PTE. Both cases cannot
	 * really be distinguished.
	 * Therefore, only do the local TLB flush when RDP can be used, and the
	 * PTE does not have _PAGE_PROTECT set, to avoid unnecessary overhead.
	 * A local RDP can be used to do the flush.
	 */
	if (cpu_has_rdp() && !(pte_val(*ptep) & _PAGE_PROTECT))
		__ptep_rdp(address, ptep, 0, 0, 1);
}
#define flush_tlb_fix_spurious_fault flush_tlb_fix_spurious_fault
/*
 * Set multiple PTEs to consecutive pages with a single call. All PTEs
 * are within the same folio, PMD and VMA.
 */
static inline void set_ptes(struct mm_struct *mm, unsigned long addr,
			    pte_t *ptep, pte_t entry, unsigned int nr)
{
	if (pte_present(entry))
		entry = clear_pte_bit(entry, __pgprot(_PAGE_UNUSED));
	if (mm_has_pgste(mm)) {
		/* pgste path needs the address for each pte */
		for (;;) {
			ptep_set_pte_at(mm, addr, ptep, entry);
			if (--nr == 0)
				break;
			ptep++;
			entry = __pte(pte_val(entry) + PAGE_SIZE);
			addr += PAGE_SIZE;
		}
	} else {
		/* plain path: set_pte does not use the address */
		for (;;) {
			set_pte(ptep, entry);
			if (--nr == 0)
				break;
			ptep++;
			entry = __pte(pte_val(entry) + PAGE_SIZE);
		}
	}
}
#define set_ptes set_ptes
/*
 * NOTE(review): mk_pte_phys() below is truncated in this copy -- the
 * remainder of the body and the closing brace are missing. Restore the
 * full function from the original source before building.
 */
/* * Conversion functions: convert a page and protection to a page entry, * and a page entry and page directory to the page they refer to.
*/ staticinline pte_t mk_pte_phys(unsignedlong physpage, pgprot_t pgprot)
{
pte_t __pte;
/* * The pgd_offset function *always* adds the index for the top-level * region/segment table. This is done to get a sequence like the * following to work: * pgdp = pgd_offset(current->mm, addr); * pgd = READ_ONCE(*pgdp); * p4dp = p4d_offset(&pgd, addr); * ... * The subsequent p4d_offset, pud_offset and pmd_offset functions * only add an index if they dereferenced the pointer.
*/ staticinline pgd_t *pgd_offset_raw(pgd_t *pgd, unsignedlong address)
{ unsignedlong rste; unsignedint shift;
/* Get the first entry of the top level table */
rste = pgd_val(*pgd); /* Pick up the shift from the table type of the first entry */
shift = ((rste & _REGION_ENTRY_TYPE_MASK) >> 2) * 11 + 20; return pgd + ((address >> shift) & (PTRS_PER_PGD - 1));
}
/* * 64 bit swap entry format: * A page-table entry has some bits we have to treat in a special way. * Bits 54 and 63 are used to indicate the page type. Bit 53 marks the pte * as invalid. * A swap pte is indicated by bit pattern (pte & 0x201) == 0x200 * | offset |E11XX|type |S0| * |0000000000111111111122222222223333333333444444444455|55555|55566|66| * |0123456789012345678901234567890123456789012345678901|23456|78901|23| * * Bits 0-51 store the offset. * Bit 52 (E) is used to remember PG_anon_exclusive. * Bits 57-61 store the type. * Bit 62 (S) is used for softdirty tracking. * Bits 55 and 56 (X) are unused.
*/
/* * 64 bit swap entry format for REGION3 and SEGMENT table entries (RSTE) * Bits 59 and 63 are used to indicate the swap entry. Bit 58 marks the rste * as invalid. * A swap entry is indicated by bit pattern (rste & 0x011) == 0x010 * | offset |Xtype |11TT|S0| * |0000000000111111111122222222223333333333444444444455|555555|5566|66| * |0123456789012345678901234567890123456789012345678901|234567|8901|23| * * Bits 0-51 store the offset. * Bits 53-57 store the type. * Bit 62 (S) is used for softdirty tracking. * Bits 60-61 (TT) indicate the table type: 0x01 for REGION3 and 0x00 for SEGMENT. * Bit 52 (X) is unused.
*/
/* * s390 has different layout for PTE and region / segment table entries (RSTE). * This is also true for swap entries, and their swap type and offset encoding. * For hugetlbfs PTE_MARKER support, s390 has internal __swp_type_rste() and * __swp_offset_rste() helpers to correctly handle RSTE swap entries. * * But common swap code does not know about this difference, and only uses * __swp_type(), __swp_offset() and __swp_entry() helpers for conversion between * arch-dependent and arch-independent representation of swp_entry_t for all * pagetable levels. On s390, those helpers only work for PTE swap entries. * * Therefore, implement __pmd_to_swp_entry() to build a fake PTE swap entry * and return the arch-dependent representation of that. Correspondingly, * implement __swp_entry_to_pmd() to convert that into a proper PMD swap * entry again. With this, the arch-dependent swp_entry_t representation will * always look like a PTE swap entry in common code. * * This is somewhat similar to fake PTEs in hugetlbfs code for s390, but only * requires conversion of the swap type and offset, and not all the possible * PTE bits.
/*
 * NOTE(review): __pmd_to_swp_entry() below is truncated in this copy -- the
 * conversion logic and the closing brace are missing. Restore the full
 * function from the original source before building.
 */
staticinline swp_entry_t __pmd_to_swp_entry(pmd_t pmd)
{
swp_entry_t arch_entry;
pte_t pte;
/*
 * NOTE(review): the following German website disclaimer text is extraction
 * residue and does not belong in this header. It is preserved verbatim but
 * commented out so the file remains valid C; remove once confirmed.
 *
 * Die Informationen auf dieser Webseite wurden
 * nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
 * noch Qualität der bereit gestellten Informationen zugesichert.
 * Bemerkung:
 * Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.
 */