if (!maybe_change_pte_writable(vma, pte)) returnfalse;
/* * Writable MAP_PRIVATE mapping: We can only special-case on * exclusive anonymous pages, because we know that our * write-fault handler similarly would map them writable without * any additional checks while holding the PT lock.
*/
page = vm_normal_page(vma, addr, pte); return page && PageAnon(page) && PageAnonExclusive(page);
}
/* * Writable MAP_SHARED mapping: "clean" might indicate that the FS still * needs a real write-fault for writenotify * (see vma_wants_writenotify()). If "dirty", the assumption is that the * FS was already notified and we can simply mark the PTE writable * just like the write-fault handler would do.
*/ return pte_dirty(pte);
}
staticbool prot_numa_skip(struct vm_area_struct *vma, unsignedlong addr,
pte_t oldpte, pte_t *pte, int target_node, struct folio *folio)
{ bool ret = true; bool toptier; int nid;
/* Avoid TLB flush if possible */ if (pte_protnone(oldpte)) goto skip;
if (!folio) goto skip;
if (folio_is_zone_device(folio) || folio_test_ksm(folio)) goto skip;
/* Also skip shared copy-on-write pages */ if (is_cow_mapping(vma->vm_flags) &&
(folio_maybe_dma_pinned(folio) || folio_maybe_mapped_shared(folio))) goto skip;
/* * While migration can move some dirty pages, * it cannot move them all from MIGRATE_ASYNC * context.
*/ if (folio_is_file_lru(folio) && folio_test_dirty(folio)) goto skip;
/* * Don't mess with PTEs if page is already on the node * a single-threaded process is running on.
*/
nid = folio_nid(folio); if (target_node == nid) goto skip;
toptier = node_is_toptier(nid);
/* * Skip scanning top tier node if normal numa * balancing is disabled
*/ if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_NORMAL) && toptier) goto skip;
ret = false; if (folio_use_access_time(folio))
folio_xchg_access_time(folio, jiffies_to_msecs(jiffies));
skip: return ret;
}
/* Set nr_ptes number of ptes, starting from idx */ staticvoid prot_commit_flush_ptes(struct vm_area_struct *vma, unsignedlong addr,
pte_t *ptep, pte_t oldpte, pte_t ptent, int nr_ptes, int idx, bool set_write, struct mmu_gather *tlb)
{ /* * Advance the position in the batch by idx; note that if idx > 0, * then the nr_ptes passed here is <= batch size - idx.
*/
addr += idx * PAGE_SIZE;
ptep += idx;
oldpte = pte_advance_pfn(oldpte, idx);
ptent = pte_advance_pfn(ptent, idx);
/* * Get max length of consecutive ptes pointing to PageAnonExclusive() pages or * !PageAnonExclusive() pages, starting from start_idx. Caller must enforce * that the ptes point to consecutive pages of the same anon large folio.
*/ staticint page_anon_exclusive_sub_batch(int start_idx, int max_len, struct page *first_page, bool expected_anon_exclusive)
{ int idx;
/* * This function is a result of trying our very best to retain the * "avoid the write-fault handler" optimization. In can_change_pte_writable(), * if the vma is a private vma, and we cannot determine whether to change * the pte to writable just from the vma and the pte, we then need to look * at the actual page pointed to by the pte. Unfortunately, if we have a * batch of ptes pointing to consecutive pages of the same anon large folio, * the anon-exclusivity (or the negation) of the first page does not guarantee * the anon-exclusivity (or the negation) of the other pages corresponding to * the pte batch; hence in this case it is incorrect to decide to change or * not change the ptes to writable just by using information from the first * pte of the batch. Therefore, we must individually check all pages and * retrieve sub-batches.
*/ staticvoid commit_anon_folio_batch(struct vm_area_struct *vma, struct folio *folio, struct page *first_page, unsignedlong addr, pte_t *ptep,
pte_t oldpte, pte_t ptent, int nr_ptes, struct mmu_gather *tlb)
{ bool expected_anon_exclusive; int sub_batch_idx = 0; int len;
page = vm_normal_page(vma, addr, oldpte); if (page)
folio = page_folio(page); /* * Avoid trapping faults against the zero or KSM * pages. See similar comment in change_huge_pmd.
*/ if (prot_numa) { int ret = prot_numa_skip(vma, addr, oldpte, pte,
target_node, folio); if (ret) {
if (uffd_wp)
ptent = pte_mkuffd_wp(ptent); elseif (uffd_wp_resolve)
ptent = pte_clear_uffd_wp(ptent);
/* * In some writable, shared mappings, we might want * to catch actual write access -- see * vma_wants_writenotify(). * * In all writable, private mappings, we have to * properly handle COW. * * In both cases, we can sometimes still change PTEs * writable and avoid the write-fault handler, for * example, if a PTE is already dirty and no other * COW or special handling is required.
*/ if ((cp_flags & MM_CP_TRY_CHANGE_WRITABLE) &&
!pte_write(ptent))
set_write_prot_commit_flush_ptes(vma, folio, page,
addr, pte, oldpte, ptent, nr_ptes, tlb); else
prot_commit_flush_ptes(vma, addr, pte, oldpte, ptent,
nr_ptes, /* idx = */ 0, /* set_write = */ false, tlb);
pages += nr_ptes;
} elseif (is_swap_pte(oldpte)) {
swp_entry_t entry = pte_to_swp_entry(oldpte);
pte_t newpte;
if (is_writable_migration_entry(entry)) { struct folio *folio = pfn_swap_entry_folio(entry);
/* * A protection check is difficult so * just be safe and disable write
*/ if (folio_test_anon(folio))
entry = make_readable_exclusive_migration_entry(
swp_offset(entry)); else
entry = make_readable_migration_entry(swp_offset(entry));
newpte = swp_entry_to_pte(entry); if (pte_swp_soft_dirty(oldpte))
newpte = pte_swp_mksoft_dirty(newpte);
} elseif (is_writable_device_private_entry(entry)) { /* * We do not preserve soft-dirtiness. See * copy_nonpresent_pte() for explanation.
*/
entry = make_readable_device_private_entry(
swp_offset(entry));
newpte = swp_entry_to_pte(entry); if (pte_swp_uffd_wp(oldpte))
newpte = pte_swp_mkuffd_wp(newpte);
} elseif (is_pte_marker_entry(entry)) { /* * Ignore error swap entries unconditionally, * because any access should sigbus/sigsegv * anyway.
*/ if (is_poisoned_swp_entry(entry) ||
is_guard_swp_entry(entry)) continue; /* * If this is uffd-wp pte marker and we'd like * to unprotect it, drop it; the next page * fault will trigger without uffd trapping.
*/ if (uffd_wp_resolve) {
pte_clear(vma->vm_mm, addr, pte);
pages++;
} continue;
} else {
newpte = oldpte;
}
if (uffd_wp)
newpte = pte_swp_mkuffd_wp(newpte); elseif (uffd_wp_resolve)
newpte = pte_swp_clear_uffd_wp(newpte);
if (!pte_same(oldpte, newpte)) {
set_pte_at(vma->vm_mm, addr, pte, newpte);
pages++;
}
} else { /* It must be an none page, or what else?.. */
WARN_ON_ONCE(!pte_none(oldpte));
/* * Nobody plays with any none ptes besides * userfaultfd when applying the protections.
*/ if (likely(!uffd_wp)) continue;
if (userfaultfd_wp_use_markers(vma)) { /* * For file-backed mem, we need to be able to * wr-protect a none pte, because even if the * pte is none, the page/swap cache could * exist. Doing that by install a marker.
*/
set_pte_at(vma->vm_mm, addr, pte,
make_pte_marker(PTE_MARKER_UFFD_WP));
pages++;
}
}
} while (pte += nr_ptes, addr += nr_ptes * PAGE_SIZE, addr != end);
arch_leave_lazy_mmu_mode();
pte_unmap_unlock(pte - 1, ptl);
return pages;
}
/* * Return true if we want to split THPs into PTE mappings in change * protection procedure, false otherwise.
*/ staticinlinebool
pgtable_split_needed(struct vm_area_struct *vma, unsignedlong cp_flags)
{ /* * pte markers only resides in pte level, if we need pte markers, * we need to split. For example, we cannot wr-protect a file thp * (e.g. 2M shmem) because file thp is handled differently when * split by erasing the pmd so far.
*/ return (cp_flags & MM_CP_UFFD_WP) && !vma_is_anonymous(vma);
}
/* * Return true if we want to populate pgtables in change protection * procedure, false otherwise
*/ staticinlinebool
pgtable_populate_needed(struct vm_area_struct *vma, unsignedlong cp_flags)
{ /* If not within ioctl(UFFDIO_WRITEPROTECT), then don't bother */ if (!(cp_flags & MM_CP_UFFD_WP)) returnfalse;
/* Populate if the userfaultfd mode requires pte markers */ return userfaultfd_wp_use_markers(vma);
}
/*
 * Populate the pgtable underneath for whatever reason if requested.
 * When {pte|pmd|...}_alloc() failed we treat it the same way as pgtable
 * allocation failures during page faults by kicking OOM and returning
 * error.
 *
 * Evaluates to 0 on success or -ENOMEM if the pte table could not be
 * allocated.  Implemented as a statement expression so callers can
 * propagate the error directly.
 */
#define  change_pmd_prepare(vma, pmd, cp_flags)				\
	({								\
		long err = 0;						\
		if (unlikely(pgtable_populate_needed(vma, cp_flags))) {	\
			if (pte_alloc(vma->vm_mm, pmd))			\
				err = -ENOMEM;				\
		}							\
		err;							\
	})
/*
 * This is the general pud/p4d/pgd version of change_pmd_prepare(). We need to
 * have separate change_pmd_prepare() because pte_alloc() returns 0 on success,
 * while {pmd|pud|p4d}_alloc() returns the valid pointer on success.
 *
 * Evaluates to 0 on success or -ENOMEM when the lower-level table could
 * not be allocated.  "high"/"low" name the adjacent page-table levels;
 * token pasting builds the matching type and allocator names.
 */
#define  change_prepare(vma, high, low, addr, cp_flags)			\
	({								\
		long err = 0;						\
		if (unlikely(pgtable_populate_needed(vma, cp_flags))) {	\
			low##_t *p = low##_alloc(vma->vm_mm, high, addr); \
			if (p == NULL)					\
				err = -ENOMEM;				\
		}							\
		err;							\
	})
/* * Do PROT_NONE PFN permission checks here when we can still * bail out without undoing a lot of state. This is a rather * uncommon case, so doesn't need to be very optimized.
*/ if (arch_has_pfn_modify_check() &&
(oldflags & (VM_PFNMAP|VM_MIXEDMAP)) &&
(newflags & VM_ACCESS_FLAGS) == 0) {
pgprot_t new_pgprot = vm_get_page_prot(newflags);
/* * If we make a private mapping writable we increase our commit; * but (without finer accounting) cannot reduce our commit if we * make it unwritable again except in the anonymous case where no * anon_vma has yet to be assigned. * * hugetlb mapping were accounted for even if read-only so there is * no need to account for them here.
*/ if (newflags & VM_WRITE) { /* Check space limits when area turns into data. */ if (!may_expand_vm(mm, newflags, nrpages) &&
may_expand_vm(mm, oldflags, nrpages)) return -ENOMEM; if (!(oldflags & (VM_ACCOUNT|VM_WRITE|VM_HUGETLB|
VM_SHARED|VM_NORESERVE))) {
charged = nrpages; if (security_vm_enough_memory_mm(mm, charged)) return -ENOMEM;
newflags |= VM_ACCOUNT;
}
} elseif ((oldflags & VM_ACCOUNT) && vma_is_anonymous(vma) &&
!vma->anon_vma) {
newflags &= ~VM_ACCOUNT;
}
/* * vm_flags and vm_page_prot are protected by the mmap_lock * held in write mode.
*/
vma_start_write(vma);
vm_flags_reset_once(vma, newflags); if (vma_wants_manual_pte_write_upgrade(vma))
mm_cp_flags |= MM_CP_TRY_CHANGE_WRITABLE;
vma_set_page_prot(vma);
prot &= ~(PROT_GROWSDOWN|PROT_GROWSUP); if (grows == (PROT_GROWSDOWN|PROT_GROWSUP)) /* can't be both */ return -EINVAL;
if (start & ~PAGE_MASK) return -EINVAL; if (!len) return 0;
len = PAGE_ALIGN(len);
end = start + len; if (end <= start) return -ENOMEM; if (!arch_validate_prot(prot, start)) return -EINVAL;
reqprot = prot;
if (mmap_write_lock_killable(current->mm)) return -EINTR;
/* * If userspace did not allocate the pkey, do not let * them use it here.
*/
error = -EINVAL; if ((pkey != -1) && !mm_pkey_is_allocated(current->mm, pkey)) goto out;
if (vma->vm_start != tmp) {
error = -ENOMEM; break;
}
/* Does the application expect PROT_READ to imply PROT_EXEC */ if (rier && (vma->vm_flags & VM_MAYEXEC))
prot |= PROT_EXEC;
/* * Each mprotect() call explicitly passes r/w/x permissions. * If a permission is not passed to mprotect(), it must be * cleared from the VMA.
*/
mask_off_old_flags = VM_ACCESS_FLAGS | VM_FLAGS_CLEAR;
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereitgestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.