static __always_inline bool validate_dst_vma(struct vm_area_struct *dst_vma,
					     unsigned long dst_end)
{
	/* Make sure that the dst range is fully within dst_vma. */
	if (dst_end > dst_vma->vm_end)
		return false;

	/*
	 * Check the vma is registered in uffd, this is required to
	 * enforce the VM_MAYWRITE check done at uffd registration
	 * time.
	 */
	if (!dst_vma->vm_userfaultfd_ctx.ctx)
		return false;

	return true;
}
#ifdef CONFIG_PER_VMA_LOCK
/*
 * uffd_lock_vma() - Lookup and lock vma corresponding to @address.
 * @mm: mm to search vma in.
 * @address: address that the vma should contain.
 *
 * Should be called without holding mmap_lock.
 *
 * Return: A locked vma containing @address, -ENOENT if no vma is found, or
 * -ENOMEM if anon_vma couldn't be allocated.
 */
static struct vm_area_struct *uffd_lock_vma(struct mm_struct *mm,
					    unsigned long address)
{
	struct vm_area_struct *vma;

	vma = lock_vma_under_rcu(mm, address);
	if (vma) {
		/*
		 * We know we're going to need to use anon_vma, so check
		 * that early.
		 */
		if (!(vma->vm_flags & VM_SHARED) && unlikely(!vma->anon_vma))
			vma_end_read(vma);
		else
			return vma;
	}
/* Check if dst_addr is outside of file's size. Must be called with ptl held. */
static bool mfill_file_over_size(struct vm_area_struct *dst_vma,
				 unsigned long dst_addr)
{
	struct inode *inode;
	pgoff_t offset, max_off;
/*
 * Install PTEs, to map dst_addr (within dst_vma) to page.
 *
 * This function handles both MCOPY_ATOMIC_NORMAL and _CONTINUE for both shmem
 * and anon, and for both shared and private VMAs.
 */
int mfill_atomic_install_pte(pmd_t *dst_pmd,
			     struct vm_area_struct *dst_vma,
			     unsigned long dst_addr, struct page *page,
			     bool newly_allocated, uffd_flags_t flags)
{
	int ret;
	struct mm_struct *dst_mm = dst_vma->vm_mm;
	pte_t _dst_pte, *dst_pte;
	bool writable = dst_vma->vm_flags & VM_WRITE;
	bool vm_shared = dst_vma->vm_flags & VM_SHARED;
	spinlock_t *ptl;
	struct folio *folio = page_folio(page);
	bool page_in_cache = folio_mapping(folio);

	_dst_pte = mk_pte(page, dst_vma->vm_page_prot);
	_dst_pte = pte_mkdirty(_dst_pte);
	if (page_in_cache && !vm_shared)
		writable = false;
	if (writable)
		_dst_pte = pte_mkwrite(_dst_pte, dst_vma);
	if (flags & MFILL_ATOMIC_WP)
		_dst_pte = pte_mkuffd_wp(_dst_pte);

	ret = -EAGAIN;
	dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl);
	if (!dst_pte)
		goto out;

	if (mfill_file_over_size(dst_vma, dst_addr)) {
		ret = -EFAULT;
		goto out_unlock;
	}

	ret = -EEXIST;
	/*
	 * We allow to overwrite a pte marker: consider when both MISSING|WP
	 * registered, we firstly wr-protect a none pte which has no page cache
	 * page backing it, then access the page.
	 */
	if (!pte_none_mostly(ptep_get(dst_pte)))
		goto out_unlock;

	if (page_in_cache) {
		/* Usually, cache pages are already added to LRU */
		if (newly_allocated)
			folio_add_lru(folio);
		folio_add_file_rmap_pte(folio, page, dst_vma);
	} else {
		folio_add_new_anon_rmap(folio, dst_vma, dst_addr, RMAP_EXCLUSIVE);
		folio_add_lru_vma(folio, dst_vma);
	}

	/*
	 * Must happen after rmap, as mm_counter() checks mapping (via
	 * PageAnon()), which is set by __page_set_anon_rmap().
	 */
inc_mm_counter(dst_mm, mm_counter(folio));
set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
/* No need to invalidate - it was non-present before */
update_mmu_cache(dst_vma, dst_addr, dst_pte);
ret = 0;
out_unlock:
pte_unmap_unlock(dst_pte, ptl);
out:
	return ret;
}
if (!*foliop) {
ret = -ENOMEM;
folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, dst_vma,
					dst_addr);
		if (!folio)
			goto out;

		kaddr = kmap_local_folio(folio, 0);
		/*
		 * The read mmap_lock is held here.  Despite the
		 * mmap_lock being read recursive a deadlock is still
		 * possible if a writer has taken a lock.  For example:
		 *
		 * process A thread 1 takes read lock on own mmap_lock
		 * process A thread 2 calls mmap, blocks taking write lock
		 * process B thread 1 takes page fault, read lock on own mmap lock
		 * process B thread 2 calls mmap, blocks taking write lock
		 * process A thread 1 blocks taking read lock on process B
		 * process B thread 1 blocks taking read lock on process A
		 *
		 * Disable page faults to prevent potential deadlock
		 * and retry the copy outside the mmap_lock.
		 */
pagefault_disable();
		ret = copy_from_user(kaddr, (const void __user *) src_addr,
PAGE_SIZE);
pagefault_enable();
kunmap_local(kaddr);
		/* fallback to copy_from_user outside mmap_lock */
		if (unlikely(ret)) {
			ret = -ENOENT;
			*foliop = folio;
			/* don't free the page */
			goto out;
}
	/*
	 * The memory barrier inside __folio_mark_uptodate makes sure that
	 * preceding stores to the page contents become visible before
	 * the set_pte_at() write.
	 */
__folio_mark_uptodate(folio);
	ret = -ENOMEM;
	if (mem_cgroup_charge(folio, dst_vma->vm_mm, GFP_KERNEL))
		goto out_release;
ret = mfill_atomic_install_pte(dst_pmd, dst_vma, dst_addr,
				       &folio->page, true, flags);
	if (ret)
		goto out_release;
out:
	return ret;
out_release:
	folio_put(folio);
	goto out;
}
static int mfill_atomic_pte_zeroed_folio(pmd_t *dst_pmd,
					 struct vm_area_struct *dst_vma,
					 unsigned long dst_addr)
{
	struct folio *folio;
	int ret = -ENOMEM;

	folio = vma_alloc_zeroed_movable_folio(dst_vma, dst_addr);
	if (!folio)
		return ret;

	if (mem_cgroup_charge(folio, dst_vma->vm_mm, GFP_KERNEL))
		goto out_put;
	/*
	 * The memory barrier inside __folio_mark_uptodate makes sure that
	 * zeroing out the folio becomes visible before mapping the page
	 * using set_pte_at(). See do_anonymous_page().
	 */
__folio_mark_uptodate(folio);
ret = mfill_atomic_install_pte(dst_pmd, dst_vma, dst_addr,
				       &folio->page, true, 0);
	if (ret)
		goto out_put;
	return 0;
out_put:
	folio_put(folio);
	return ret;
}

static int mfill_atomic_pte_zeropage(pmd_t *dst_pmd,
				     struct vm_area_struct *dst_vma,
				     unsigned long dst_addr)
{
	pte_t _dst_pte, *dst_pte;
	spinlock_t *ptl;
	int ret;

	if (mm_forbids_zeropage(dst_vma->vm_mm))
		return mfill_atomic_pte_zeroed_folio(dst_pmd, dst_vma, dst_addr);
_dst_pte = pte_mkspecial(pfn_pte(my_zero_pfn(dst_addr),
dst_vma->vm_page_prot));
ret = -EAGAIN;
	dst_pte = pte_offset_map_lock(dst_vma->vm_mm, dst_pmd, dst_addr, &ptl);
	if (!dst_pte)
		goto out;
	if (mfill_file_over_size(dst_vma, dst_addr)) {
		ret = -EFAULT;
		goto out_unlock;
}
	ret = -EEXIST;
	if (!pte_none(ptep_get(dst_pte)))
		goto out_unlock;
	set_pte_at(dst_vma->vm_mm, dst_addr, dst_pte, _dst_pte);
	/* No need to invalidate - it was non-present before */
update_mmu_cache(dst_vma, dst_addr, dst_pte);
ret = 0;
out_unlock:
pte_unmap_unlock(dst_pte, ptl);
out:
	return ret;
}
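
/*
 * Editorial usage sketch (not part of the kernel source): the zeropage path
 * above is what userspace reaches through the UFFDIO_ZEROPAGE ioctl; field
 * names follow include/uapi/linux/userfaultfd.h, "uffd" and "dst" are
 * assumed to be a userfaultfd file descriptor and a faulting address:
 *
 *	struct uffdio_zeropage zp = {
 *		.range = { .start = dst, .len = PAGE_SIZE },
 *		.mode  = 0,
 *	};
 *	ioctl(uffd, UFFDIO_ZEROPAGE, &zp);
 *
 * On return zp.zeropage holds the number of bytes zero-filled, or a negative
 * error code.
 */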
/* Handles UFFDIO_CONTINUE for all shmem VMAs (shared or private). */
static int mfill_atomic_pte_continue(pmd_t *dst_pmd,
				     struct vm_area_struct *dst_vma,
				     unsigned long dst_addr,
				     uffd_flags_t flags)
{
	struct inode *inode = file_inode(dst_vma->vm_file);
	pgoff_t pgoff = linear_page_index(dst_vma, dst_addr);
	struct folio *folio;
	struct page *page;
	int ret;

	ret = shmem_get_folio(inode, pgoff, 0, &folio, SGP_NOALLOC);
	/* Our caller expects us to return -EFAULT if we failed to find folio */
	if (ret == -ENOENT)
		ret = -EFAULT;
	if (ret)
		goto out;
	if (!folio) {
		ret = -EFAULT;
		goto out;
	}

	page = folio_file_page(folio, pgoff);
	if (PageHWPoison(page)) {
		ret = -EIO;
		goto out_release;
}
ret = mfill_atomic_install_pte(dst_pmd, dst_vma, dst_addr,
				       page, false, flags);
	if (ret)
		goto out_release;

/* Handles UFFDIO_POISON for all non-hugetlb VMAs. */
static int mfill_atomic_pte_poison(pmd_t *dst_pmd,
				   struct vm_area_struct *dst_vma,
				   unsigned long dst_addr,
				   uffd_flags_t flags)
{
	int ret;
	struct mm_struct *dst_mm = dst_vma->vm_mm;
	pte_t _dst_pte, *dst_pte;
	spinlock_t *ptl;

	_dst_pte = make_pte_marker(PTE_MARKER_POISONED);
ret = -EAGAIN;
	dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl);
	if (!dst_pte)
		goto out;
if (mfill_file_over_size(dst_vma, dst_addr)) {
		ret = -EFAULT;
		goto out_unlock;
}
	ret = -EEXIST;
	/* Refuse to overwrite any PTE, even a PTE marker (e.g. UFFD WP). */
	if (!pte_none(ptep_get(dst_pte)))
		goto out_unlock;
set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
/* No need to invalidate - it was non-present before */
update_mmu_cache(dst_vma, dst_addr, dst_pte);
ret = 0;
out_unlock:
pte_unmap_unlock(dst_pte, ptl);
out:
	return ret;
}
static pmd_t *mm_alloc_pmd(struct mm_struct *mm, unsigned long address)
{
	pgd_t *pgd;
	p4d_t *p4d;
	pud_t *pud;

	pgd = pgd_offset(mm, address);
	p4d = p4d_alloc(mm, pgd, address);
	if (!p4d)
		return NULL;
	pud = pud_alloc(mm, p4d, address);
	if (!pud)
		return NULL;
	/*
	 * Note that we didn't run this because the pmd was
	 * missing, the *pmd may be already established and in
	 * turn it may also be a trans_huge_pmd.
	 */
	return pmd_alloc(mm, pud, address);
}
#ifdef CONFIG_HUGETLB_PAGE
/*
 * mfill_atomic processing for HUGETLB vmas.  Note that this routine is
 * called with either vma-lock or mmap_lock held, it will release the lock
 * before returning.
 */
static __always_inline ssize_t mfill_atomic_hugetlb(
					      struct userfaultfd_ctx *ctx,
					      struct vm_area_struct *dst_vma,
					      unsigned long dst_start,
					      unsigned long src_start,
					      unsigned long len,
					      uffd_flags_t flags)
{
	struct mm_struct *dst_mm = dst_vma->vm_mm;
	ssize_t err;
	pte_t *dst_pte;
	unsigned long src_addr, dst_addr;
	long copied;
	struct folio *folio;
	unsigned long vma_hpagesize;
	pgoff_t idx;
	u32 hash;
	struct address_space *mapping;

	/*
	 * There is no default zero huge page for all huge page sizes as
	 * supported by hugetlb.  A PMD_SIZE huge page may exist as used by
	 * THP.  Since we can not reliably insert a zero page, this
	 * feature is not supported.
	 */
	if (uffd_flags_mode_is(flags, MFILL_ATOMIC_ZEROPAGE)) {
		up_read(&ctx->map_changing_lock);
		uffd_mfill_unlock(dst_vma);
		return -EINVAL;
	}

	vma_hpagesize = vma_kernel_pagesize(dst_vma);

	/*
	 * Validate alignment based on huge page size
	 */
	err = -EINVAL;
	if (dst_start & (vma_hpagesize - 1) || len & (vma_hpagesize - 1))
		goto out_unlock;
retry:
	/*
	 * On routine entry dst_vma is set.  If we had to drop mmap_lock and
	 * retry, dst_vma will be set to NULL and we must lookup again.
	 */
	if (!dst_vma) {
		dst_vma = uffd_mfill_lock(dst_mm, dst_start, len);
		if (IS_ERR(dst_vma)) {
			err = PTR_ERR(dst_vma);
			goto out;
		}

		err = -ENOENT;
		if (!is_vm_hugetlb_page(dst_vma))
			goto out_unlock_vma;

		err = -EINVAL;
		if (vma_hpagesize != vma_kernel_pagesize(dst_vma))
			goto out_unlock_vma;

		/*
		 * If memory mappings are changing because of non-cooperative
		 * operation (e.g. mremap) running in parallel, bail out and
		 * request the user to retry later
		 */
		down_read(&ctx->map_changing_lock);
		err = -EAGAIN;
		if (atomic_read(&ctx->mmap_changing))
			goto out_unlock;
	}
	/*
	 * Serialize via vma_lock and hugetlb_fault_mutex.
	 * vma_lock ensures the dst_pte remains valid even
	 * in the case of shared pmds.  fault mutex prevents
	 * races with other faulting threads.
	 */
idx = linear_page_index(dst_vma, dst_addr);
mapping = dst_vma->vm_file->f_mapping;
hash = hugetlb_fault_mutex_hash(mapping, idx);
mutex_lock(&hugetlb_fault_mutex_table[hash]);
hugetlb_vma_lock_read(dst_vma);
	/*
	 * The normal page fault path for a shmem will invoke the
	 * fault, fill the hole in the file and COW it right away. The
	 * result generates plain anonymous memory. So when we are
	 * asked to fill a hole in a MAP_PRIVATE shmem mapping, we'll
	 * generate anonymous memory directly without actually filling
	 * the hole. For the MAP_PRIVATE case the robustness check
	 * only happens in the pagetable (to verify it's still none)
	 * and not in the radix tree.
	 */
	if (!(dst_vma->vm_flags & VM_SHARED)) {
		if (uffd_flags_mode_is(flags, MFILL_ATOMIC_COPY))
			err = mfill_atomic_pte_copy(dst_pmd, dst_vma,
						    dst_addr, src_addr,
						    flags, foliop);
		else
			err = mfill_atomic_pte_zeropage(dst_pmd,
						 dst_vma, dst_addr);
	} else {
		err = shmem_mfill_atomic_pte(dst_pmd, dst_vma,
					     dst_addr, src_addr,
					     flags, foliop);
	}
/* Does the address range wrap, or is the span zero-sized? */
VM_WARN_ON_ONCE(src_start + len <= src_start);
VM_WARN_ON_ONCE(dst_start + len <= dst_start);
src_addr = src_start;
dst_addr = dst_start;
copied = 0;
folio = NULL;
retry:
	/*
	 * Make sure the vma is not shared, that the dst range is
	 * both valid and fully within a single existing vma.
	 */
	dst_vma = uffd_mfill_lock(dst_mm, dst_start, len);
	if (IS_ERR(dst_vma)) {
		err = PTR_ERR(dst_vma);
		goto out;
	}

	/*
	 * If memory mappings are changing because of non-cooperative
	 * operation (e.g. mremap) running in parallel, bail out and
	 * request the user to retry later
	 */
	down_read(&ctx->map_changing_lock);
	err = -EAGAIN;
	if (atomic_read(&ctx->mmap_changing))
		goto out_unlock;

	err = -EINVAL;
	/*
	 * shmem_zero_setup is invoked in mmap for MAP_ANONYMOUS|MAP_SHARED but
	 * it will overwrite vm_ops, so vma_is_anonymous must return false.
	 */
	if (WARN_ON_ONCE(vma_is_anonymous(dst_vma) &&
	    dst_vma->vm_flags & VM_SHARED))
		goto out_unlock;

	/*
	 * validate 'mode' now that we know the dst_vma: don't allow
	 * a wrprotect copy if the userfaultfd didn't register as WP.
	 */
	if ((flags & MFILL_ATOMIC_WP) && !(dst_vma->vm_flags & VM_UFFD_WP))
		goto out_unlock;
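
	/*
	 * Editorial example (not part of the kernel source): the check above is
	 * what rejects, say, a UFFDIO_COPY carrying UFFDIO_COPY_MODE_WP on a
	 * range that was registered without UFFDIO_REGISTER_MODE_WP. A
	 * userspace sketch, with "uffd", "dst" and "buf" assumed:
	 *
	 *	struct uffdio_copy copy = {
	 *		.dst  = dst,
	 *		.src  = (unsigned long)buf,
	 *		.len  = PAGE_SIZE,
	 *		.mode = UFFDIO_COPY_MODE_WP,
	 *	};
	 *	ioctl(uffd, UFFDIO_COPY, &copy);
	 *
	 * fails with EINVAL at this point unless the registration included WP
	 * mode.
	 */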
	/*
	 * If this is a HUGETLB vma, pass off to appropriate routine
	 */
	if (is_vm_hugetlb_page(dst_vma))
		return mfill_atomic_hugetlb(ctx, dst_vma, dst_start,
					    src_start, len, flags);

	if (!vma_is_anonymous(dst_vma) && !vma_is_shmem(dst_vma))
		goto out_unlock;
	if (!vma_is_shmem(dst_vma) &&
	    uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE))
		goto out_unlock;
while (src_addr < src_start + len) {
pmd_t dst_pmdval;
		dst_pmdval = pmdp_get_lockless(dst_pmd);
		if (unlikely(pmd_none(dst_pmdval)) &&
		    unlikely(__pte_alloc(dst_mm, dst_pmd))) {
			err = -ENOMEM;
			break;
		}
		dst_pmdval = pmdp_get_lockless(dst_pmd);
		/*
		 * If the dst_pmd is THP don't override it and just be strict.
		 * (This includes the case where the PMD used to be THP and
		 * changed back to none after __pte_alloc().)
		 */
		if (unlikely(!pmd_present(dst_pmdval) ||
			     pmd_trans_huge(dst_pmdval))) {
			err = -EEXIST;
			break;
		}
		if (unlikely(pmd_bad(dst_pmdval))) {
			err = -EFAULT;
			break;
		}
		/*
		 * For shmem mappings, khugepaged is allowed to remove page
		 * tables under us; pte_offset_map_lock() will deal with that.
		 */

/*
 * A caller might reasonably assume that UFFDIO_CONTINUE contains an
 * smp_wmb() to ensure that any writes to the about-to-be-mapped page by
 * the thread doing the UFFDIO_CONTINUE are guaranteed to be visible to
 * subsequent loads from the page through the newly mapped address range.
 */
smp_wmb();
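
/*
 * Editorial sketch of the pairing the comment above describes (an
 * illustration, not code from this file):
 *
 *	UFFDIO_CONTINUE caller			thread faulting on dst
 *	----------------------			----------------------
 *	write page contents
 *	smp_wmb()
 *	set_pte_at() (via mfill_atomic)
 *						page walk observes the new PTE
 *						loads see the earlier writes
 *
 * i.e. stores made to the page before the ioctl are guaranteed visible to
 * loads performed through the freshly mapped destination range.
 */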
long uffd_wp_range(struct vm_area_struct *dst_vma,
		   unsigned long start, unsigned long len, bool enable_wp)
{
	unsigned int mm_cp_flags;
	struct mmu_gather tlb;
	long ret;

	VM_WARN_ONCE(start < dst_vma->vm_start ||
		     start + len > dst_vma->vm_end,
		     "The address range exceeds VMA boundary.\n");
	if (enable_wp)
		mm_cp_flags = MM_CP_UFFD_WP;
	else
		mm_cp_flags = MM_CP_UFFD_WP_RESOLVE;

	/*
	 * vma->vm_page_prot already reflects that uffd-wp is enabled for this
	 * VMA (see userfaultfd_set_vm_flags()) and that all PTEs are supposed
	 * to be write-protected as default whenever protection changes.
	 * Try upgrading write permissions manually.
	 */
	if (!enable_wp && vma_wants_manual_pte_write_upgrade(dst_vma))
		mm_cp_flags |= MM_CP_TRY_CHANGE_WRITABLE;
	tlb_gather_mmu(&tlb, dst_vma->vm_mm);
	ret = change_protection(&tlb, dst_vma, start, start + len, mm_cp_flags);
	tlb_finish_mmu(&tlb);

	return ret;
}
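
/*
 * Editorial usage sketch (not part of the kernel source): uffd_wp_range() is
 * the engine behind the UFFDIO_WRITEPROTECT ioctl. With field names from
 * include/uapi/linux/userfaultfd.h and "uffd", "start", "len" assumed:
 *
 *	struct uffdio_writeprotect wp = {
 *		.range = { .start = start, .len = len },
 *		.mode  = UFFDIO_WRITEPROTECT_MODE_WP,
 *	};
 *	ioctl(uffd, UFFDIO_WRITEPROTECT, &wp);
 *
 * Passing .mode = 0 instead resolves (removes) the write protection, which is
 * the enable_wp == false path above.
 */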
/* Does the address range wrap, or is the span zero-sized? */
VM_WARN_ON_ONCE(start + len <= start);
mmap_read_lock(dst_mm);
	/*
	 * If memory mappings are changing because of non-cooperative
	 * operation (e.g. mremap) running in parallel, bail out and
	 * request the user to retry later
	 */
down_read(&ctx->map_changing_lock);
	err = -EAGAIN;
	if (atomic_read(&ctx->mmap_changing))
		goto out_unlock;
	orig_src_pte = ptep_clear_flush(src_vma, src_addr, src_pte);
	/* Folio got pinned from under us. Put it back and fail the move. */
	if (folio_maybe_dma_pinned(src_folio)) {
set_pte_at(mm, src_addr, src_pte, orig_src_pte);
		err = -EBUSY;
		goto out;
}
	orig_dst_pte = folio_mk_pte(src_folio, dst_vma->vm_page_prot);
	/* Set soft dirty bit so userspace can notice the pte was moved */
#ifdef CONFIG_MEM_SOFT_DIRTY
	orig_dst_pte = pte_mksoft_dirty(orig_dst_pte);
#endif
	if (pte_dirty(orig_src_pte))
		orig_dst_pte = pte_mkdirty(orig_dst_pte);
	orig_dst_pte = pte_mkwrite(orig_dst_pte, dst_vma);
	/*
	 * The src_folio resides in the swapcache, requiring an update to its
	 * index and mapping to align with the dst_vma, where a swap-in may
	 * occur and hit the swapcache after moving the PTE.
	 */
	if (src_folio) {
folio_move_anon_rmap(src_folio, dst_vma);
src_folio->index = linear_page_index(dst_vma, dst_addr);
	} else {
		/*
		 * Check if the swap entry is cached after acquiring the src_pte
		 * lock. Otherwise, we might miss a newly loaded swap cache folio.
		 *
		 * Check swap_map directly to minimize overhead, READ_ONCE is sufficient.
		 * We are trying to catch newly added swap cache, the only possible case is
		 * when a folio is swapped in and out again staying in swap cache, using the
		 * same entry before the PTE check above. The PTL is acquired and released
		 * twice, each time after updating the swap_map's flag. So holding
		 * the PTL here ensures we see the updated value. False positive is possible,
		 * e.g. SWP_SYNCHRONOUS_IO swapin may set the flag without touching the
		 * cache, or during the tiny synchronization window between swap cache and
		 * swap_map, but it will be gone very quickly, worst result is retry jitters.
		 */
		if (READ_ONCE(si->swap_map[swp_offset(entry)]) & SWAP_HAS_CACHE) {
			double_pt_unlock(dst_ptl, src_ptl);
			return -EAGAIN;
		}
}
}
/*
 * The mmap_lock for reading is held by the caller. Just move the page
 * from src_pmd to dst_pmd if possible, and return true if succeeded
 * in moving the page.
 */
static int move_pages_pte(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd,
			  struct vm_area_struct *dst_vma,
			  struct vm_area_struct *src_vma,
			  unsigned long dst_addr, unsigned long src_addr,
			  __u64 mode)
{
	swp_entry_t entry;
	struct swap_info_struct *si = NULL;
	pte_t orig_src_pte, orig_dst_pte;
	pte_t src_folio_pte;
	spinlock_t *src_ptl, *dst_ptl;
	pte_t *src_pte = NULL;
	pte_t *dst_pte = NULL;
	pmd_t dummy_pmdval;
	pmd_t dst_pmdval;
	struct folio *src_folio = NULL;
	struct anon_vma *src_anon_vma = NULL;
	struct mmu_notifier_range range;
	int err = 0;
flush_cache_range(src_vma, src_addr, src_addr + PAGE_SIZE);
mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm,
src_addr, src_addr + PAGE_SIZE);
mmu_notifier_invalidate_range_start(&range);
retry:
	/*
	 * Use the maywrite version to indicate that dst_pte will be modified,
	 * since dst_pte needs to be none, the subsequent pte_same() check
	 * cannot prevent the dst_pte page from being freed concurrently, so we
	 * also need to obtain dst_pmdval and recheck pmd_same() later.
	 */
dst_pte = pte_offset_map_rw_nolock(mm, dst_pmd, dst_addr, &dst_pmdval,
&dst_ptl);
	/* Retry if a huge pmd materialized from under us */
	if (unlikely(!dst_pte)) {
		err = -EAGAIN;
		goto out;
}
	/*
	 * Unlike dst_pte, the subsequent pte_same() check can ensure the
	 * stability of the src_pte page, so there is no need to get pmdval,
	 * just pass a dummy variable to it.
	 */
src_pte = pte_offset_map_rw_nolock(mm, src_pmd, src_addr, &dummy_pmdval,
&src_ptl);
	/*
	 * We held the mmap_lock for reading so MADV_DONTNEED
	 * can zap transparent huge pages under us, or the
	 * transparent huge page fault can establish new
	 * transparent huge pages under us.
	 */
	if (unlikely(!src_pte)) {
		err = -EAGAIN;
		goto out;
}
	/* Sanity checks before the operation */
	if (pmd_none(*dst_pmd) || pmd_none(*src_pmd) ||
	    pmd_trans_huge(*dst_pmd) || pmd_trans_huge(*src_pmd)) {
		err = -EINVAL;
		goto out;
}
spin_lock(src_ptl);
orig_src_pte = ptep_get(src_pte);
	spin_unlock(src_ptl);
	if (pte_none(orig_src_pte)) {
		if (!(mode & UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES))
			err = -ENOENT;
		else /* nothing to do to move a hole */
			err = 0;
		goto out;
}
	/* If PTE changed after we locked the folio then start over */
	if (src_folio && unlikely(!pte_same(src_folio_pte, orig_src_pte))) {
		err = -EAGAIN;
		goto out;
}
	/*
	 * Pin and lock both source folio and anon_vma. Since we are in
	 * RCU read section, we can't block, so on contention have to
	 * unmap the ptes, obtain the lock and retry.
	 */
	if (!src_folio) {
		struct folio *folio;
		bool locked;

		/*
		 * Pin the page while holding the lock to be sure the
		 * page isn't freed under us
		 */
		spin_lock(src_ptl);
		if (!pte_same(orig_src_pte, ptep_get(src_pte))) {
			spin_unlock(src_ptl);
			err = -EAGAIN;
			goto out;
}
		locked = folio_trylock(folio);
		/*
		 * We avoid waiting for folio lock with a raised
		 * refcount for large folios because extra refcounts
		 * will result in split_folio() failing later and
		 * retrying. If multiple tasks are trying to move a
		 * large folio we can end up livelocking.
		 */
		if (!locked && folio_test_large(folio)) {
			spin_unlock(src_ptl);
			err = -EAGAIN;
			goto out;
}
if (!locked) {
pte_unmap(src_pte);
pte_unmap(dst_pte);
			src_pte = dst_pte = NULL;
			/* now we can block and wait */
			folio_lock(src_folio);
			goto retry;
}
if (WARN_ON_ONCE(!folio_test_anon(src_folio))) {
			err = -EBUSY;
			goto out;
}
}
	/* at this point we have src_folio locked */
	if (folio_test_large(src_folio)) {
		/* split_folio() can block */
pte_unmap(src_pte);
pte_unmap(dst_pte);
src_pte = dst_pte = NULL;
		err = split_folio(src_folio);
		if (err)
			goto out;
		/* have to reacquire the folio after it got split */
folio_unlock(src_folio);
folio_put(src_folio);
		src_folio = NULL;
		goto retry;
}
	if (!src_anon_vma) {
		/*
		 * folio_referenced walks the anon_vma chain
		 * without the folio lock. Serialize against it with
		 * the anon_vma lock, the folio lock is not enough.
		 */
		src_anon_vma = folio_get_anon_vma(src_folio);
		if (!src_anon_vma) {
			/* page was unmapped from under us */
			err = -EAGAIN;
			goto out;
		}
		if (!anon_vma_trylock_write(src_anon_vma)) {
pte_unmap(src_pte);
pte_unmap(dst_pte);
			src_pte = dst_pte = NULL;
			/* now we can block and wait */
			anon_vma_lock_write(src_anon_vma);
			goto retry;
}
}
if (!pte_swp_exclusive(orig_src_pte)) {
			err = -EBUSY;
			goto out;
}
		si = get_swap_device(entry);
		if (unlikely(!si)) {
			err = -EAGAIN;
			goto out;
		}
		/*
		 * Verify the existence of the swapcache. If present, the folio's
		 * index and mapping must be updated even when the PTE is a swap
		 * entry. The anon_vma lock is not taken during this process since
		 * the folio has already been unmapped, and the swap entry is
		 * exclusive, preventing rmap walks.
		 *
		 * For large folios, return -EBUSY immediately, as split_folio()
		 * also returns -EBUSY when attempting to split unmapped large
		 * folios in the swapcache. This issue needs to be resolved
		 * separately to allow proper handling.
		 */
		if (!src_folio)
			folio = filemap_get_folio(swap_address_space(entry),
						  swap_cache_index(entry));
		if (!IS_ERR_OR_NULL(folio)) {
			if (folio_test_large(folio)) {
				err = -EBUSY;
				folio_put(folio);
				goto out;
}
src_folio = folio;
			src_folio_pte = orig_src_pte;
			if (!folio_trylock(src_folio)) {
pte_unmap(src_pte);
pte_unmap(dst_pte);
src_pte = dst_pte = NULL;
put_swap_device(si);
				si = NULL;
				/* now we can block and wait */
				folio_lock(src_folio);
				goto retry;
}
}
err = move_swap_pte(mm, dst_vma, dst_addr, src_addr, dst_pte, src_pte,
orig_dst_pte, orig_src_pte, dst_pmd, dst_pmdval,
dst_ptl, src_ptl, src_folio, si, entry);
}
out:
	if (src_anon_vma) {
anon_vma_unlock_write(src_anon_vma);
put_anon_vma(src_anon_vma);
	}
	if (src_folio) {
folio_unlock(src_folio);
folio_put(src_folio);
	}
	/*
	 * Unmap in reverse order (LIFO) to maintain proper kmap_local
	 * index ordering when CONFIG_HIGHPTE is enabled. We mapped dst_pte
	 * first, then src_pte, so we must unmap src_pte first, then dst_pte.
	 */
	if (src_pte)
		pte_unmap(src_pte);
	if (dst_pte)
pte_unmap(dst_pte);
	mmu_notifier_invalidate_range_end(&range);
	if (si)
put_swap_device(si);
return err;
}
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
static inline bool move_splits_huge_pmd(unsigned long dst_addr,
					unsigned long src_addr,
					unsigned long src_end)
{
	return (src_addr & ~HPAGE_PMD_MASK) || (dst_addr & ~HPAGE_PMD_MASK) ||
		src_end - src_addr < HPAGE_PMD_SIZE;
}
#else
static inline bool move_splits_huge_pmd(unsigned long dst_addr,
					unsigned long src_addr,
					unsigned long src_end)
{
	/* This is unreachable anyway, just to avoid warnings when HPAGE_PMD_SIZE==0 */
	return false;
}
#endif
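
/*
 * Editorial example (not part of the kernel source): with a 2 MiB
 * HPAGE_PMD_SIZE, src = 0x40200000, dst = 0x80200000 and len = 0x200000 are
 * all PMD-aligned, so move_splits_huge_pmd() returns false and a PMD-mapped
 * THP can be moved in one piece. Shift src to 0x40201000, or shrink the
 * remaining span below 2 MiB, and the helper returns true: the huge PMD can
 * no longer be moved as a unit and has to be split before the pages are
 * moved individually.
 */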
static int validate_move_areas(struct userfaultfd_ctx *ctx,
			       struct vm_area_struct *src_vma,
			       struct vm_area_struct *dst_vma)
{
	/* Only allow moving if both have the same access and protection */
	if ((src_vma->vm_flags & VM_ACCESS_FLAGS) != (dst_vma->vm_flags & VM_ACCESS_FLAGS) ||
	    pgprot_val(src_vma->vm_page_prot) != pgprot_val(dst_vma->vm_page_prot))
		return -EINVAL;

	/* Only allow moving if both are mlocked or both aren't */
	if ((src_vma->vm_flags & VM_LOCKED) != (dst_vma->vm_flags & VM_LOCKED))
		return -EINVAL;

	/*
	 * For now, we keep it simple and only move between writable VMAs.
	 * Access flags are equal, therefore checking only the source is enough.
	 */
	if (!(src_vma->vm_flags & VM_WRITE))
		return -EINVAL;

	/* Check if vma flags indicate content which can be moved */
	if (!vma_move_compatible(src_vma) || !vma_move_compatible(dst_vma))
		return -EINVAL;

	/* Ensure dst_vma is registered in uffd we are operating on */
	if (!dst_vma->vm_userfaultfd_ctx.ctx ||
	    dst_vma->vm_userfaultfd_ctx.ctx != ctx)
		return -EINVAL;

	/* Only allow moving across anonymous vmas */
	if (!vma_is_anonymous(src_vma) || !vma_is_anonymous(dst_vma))
		return -EINVAL;

	return 0;
}
	vma = uffd_lock_vma(mm, dst_start);
	if (IS_ERR(vma))
		return PTR_ERR(vma);

	*dst_vmap = vma;
	/*
	 * Skip finding src_vma if src_start is in dst_vma. This also ensures
	 * that we don't lock the same vma twice.
	 */
	if (src_start >= vma->vm_start && src_start < vma->vm_end) {
		*src_vmap = vma;
		return 0;
	}

	/*
	 * Using uffd_lock_vma() to get src_vma can lead to following deadlock:
	 *
	 * Thread1				Thread2
	 * -------				-------
	 * vma_start_read(dst_vma)
	 *					mmap_write_lock(mm)
	 *					vma_start_write(src_vma)
	 * vma_start_read(src_vma)
	 * mmap_read_lock(mm)
	 *					vma_start_write(dst_vma)
	 */
	*src_vmap = lock_vma_under_rcu(mm, src_start);
	if (likely(*src_vmap))
		return 0;
/* Undo any locking and retry in mmap_lock critical section */
vma_end_read(*dst_vmap);
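
/*
 * Editorial usage sketch (not part of the kernel source) for the UFFDIO_MOVE
 * ioctl implemented by move_pages() below. Field names follow
 * include/uapi/linux/userfaultfd.h; "uffd", "src" and "dst" are assumed, and
 * the userfaultfd must have been created with UFFD_FEATURE_MOVE:
 *
 *	struct uffdio_move move = {
 *		.dst  = dst,
 *		.src  = src,
 *		.len  = PAGE_SIZE,
 *		.mode = 0,
 *	};
 *	ioctl(uffd, UFFDIO_MOVE, &move);
 *
 * On return move.move reports the number of bytes actually moved, matching
 * the retval semantics documented below.
 */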
/**
 * move_pages - move arbitrary anonymous pages of an existing vma
 * @ctx: pointer to the userfaultfd context
 * @dst_start: start of the destination virtual memory range
 * @src_start: start of the source virtual memory range
 * @len: length of the virtual memory range
 * @mode: flags from uffdio_move.mode
 *
 * It will either use the mmap_lock in read mode or per-vma locks
 *
 * move_pages() remaps arbitrary anonymous pages atomically in zero
 * copy. It only works on non shared anonymous pages because those can
 * be relocated without generating non linear anon_vmas in the rmap
 * code.
 *
 * It provides a zero copy mechanism to handle userspace page faults.
 * The source vma pages should have mapcount == 1, which can be
 * enforced by using madvise(MADV_DONTFORK) on src vma.
 *
 * The thread receiving the page during the userland page fault
 * will receive the faulting page in the source vma through the network,
 * storage or any other I/O device (MADV_DONTFORK in the source vma
 * avoids move_pages() to fail with -EBUSY if the process forks before
 * move_pages() is called), then it will call move_pages() to map the
 * page in the faulting address in the destination vma.
 *
 * This userfaultfd command works purely via pagetables, so it's the
 * most efficient way to move physical non shared anonymous pages
 * across different virtual addresses. Unlike mremap()/mmap()/munmap()
 * it does not create any new vmas. The mapping in the destination
 * address is atomic.
 *
 * It only works if the vma protection bits are identical from the
 * source and destination vma.
 *
 * It can remap non shared anonymous pages within the same vma too.
 *
 * If the source virtual memory range has any unmapped holes, or if
 * the destination virtual memory range is not a whole unmapped hole,
 * move_pages() will fail respectively with -ENOENT or -EEXIST. This
 * provides a very strict behavior to avoid any chance of memory
 * corruption going unnoticed if there are userland race conditions.
 * Only one thread should resolve the userland page fault at any given
 * time for any given faulting address. This means that if two threads
 * try to both call move_pages() on the same destination address at the
 * same time, the second thread will get an explicit error from this
 * command.
 *
 * The command retval will return "len" if successful. The command
 * however can be interrupted by fatal signals or errors. If
 * interrupted it will return the number of bytes successfully
 * remapped before the interruption if any, or the negative error if
 * none. It will never return zero. Either it will return an error or
 * an amount of bytes successfully moved. If the retval reports a
 * "short" remap, the move_pages() command should be repeated by
 * userland with src+retval, dst+retval, len-retval if it wants to know
 * about the error that interrupted it.
 *
 * The UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES flag can be specified to
 * prevent -ENOENT errors from materializing if there are holes in the
 * source virtual range that is being remapped. The holes will be
 * accounted as successfully remapped in the retval of the
 * command. This is mostly useful to remap hugepage naturally aligned
 * virtual regions without knowing if there are transparent hugepages
 * in the regions or not, but preventing the risk of having to split
 * the hugepmd during the remap.
 *
 * If there's any rmap walk that is taking the anon_vma locks without
 * first obtaining the folio lock (the only current instance is
 * folio_referenced), they will have to verify if the folio->mapping
 * has changed after taking the anon_vma lock. If it changed they
 * should release the lock and retry obtaining a new anon_vma, because
 * it means the anon_vma was changed by move_pages() before the lock
 * could be obtained. This is the only additional complexity added to
 * the rmap code to provide this anonymous page remapping functionality.
 */
ssize_t move_pages(struct userfaultfd_ctx *ctx, unsigned long dst_start,
		   unsigned long src_start, unsigned long len, __u64 mode)
{
	struct mm_struct *mm = ctx->mm;
	struct vm_area_struct *src_vma, *dst_vma;
	unsigned long src_addr, dst_addr;
	pmd_t *src_pmd, *dst_pmd;
	long err = -EINVAL;
ssize_t moved = 0;
/* Does the address range wrap, or is the span zero-sized? */
	VM_WARN_ON_ONCE(src_start + len <= src_start);
	VM_WARN_ON_ONCE(dst_start + len <= dst_start);
/* Re-check after taking map_changing_lock */
err = -EAGAIN;
	down_read(&ctx->map_changing_lock);
	if (likely(atomic_read(&ctx->mmap_changing)))
		goto out_unlock;

	/*
	 * Make sure the vma is not shared, that the src and dst remap
	 * ranges are both valid and fully within a single existing
	 * vma.
	 */
	err = -EINVAL;
	if (src_vma->vm_flags & VM_SHARED)
		goto out_unlock;
	if (src_start + len > src_vma->vm_end)
		goto out_unlock;

	if (dst_vma->vm_flags & VM_SHARED)
		goto out_unlock;
	if (dst_start + len > dst_vma->vm_end)
		goto out_unlock;

	err = validate_move_areas(ctx, src_vma, dst_vma);
	if (err)
		goto out_unlock;
	/*
	 * Below works because anonymous area would not have a
	 * transparent huge PUD. If file-backed support is added,
	 * that case would need to be handled here.
	 */
		src_pmd = mm_find_pmd(mm, src_addr);
		if (unlikely(!src_pmd)) {
			if (!(mode & UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES)) {
				err = -ENOENT;
				break;
			}
			src_pmd = mm_alloc_pmd(mm, src_addr);
			if (unlikely(!src_pmd)) {
				err = -ENOMEM;
				break;
			}
		}

		dst_pmd = mm_alloc_pmd(mm, dst_addr);
		if (unlikely(!dst_pmd)) {
			err = -ENOMEM;
			break;
}
		dst_pmdval = pmdp_get_lockless(dst_pmd);
		/*
		 * If the dst_pmd is mapped as THP don't override it and just
		 * be strict. If dst_pmd changes into THP after this check, the
		 * move_pages_huge_pmd() will detect the change and retry
		 * while move_pages_pte() will detect the change and fail.
		 */
		if (unlikely(pmd_trans_huge(dst_pmdval))) {
			err = -EEXIST;
			break;
}
		ptl = pmd_trans_huge_lock(src_pmd, src_vma);
		if (ptl) {
			/* Check if we can move the pmd without splitting it. */
			if (move_splits_huge_pmd(dst_addr, src_addr, src_start + len) ||
			    !pmd_none(dst_pmdval)) {
				/* Can be a migration entry */
				if (pmd_present(*src_pmd)) {
					struct folio *folio = pmd_folio(*src_pmd);

	/*
	 * If we are modifying only and not splitting, just give up on the merge
	 * if OOM prevents us from merging successfully.
	 */
	if (start == vma->vm_start && end == vma->vm_end)
		give_up_on_oom = true;
	/* Reset ptes for the whole vma range if wr-protected */
	if (userfaultfd_wp(vma))
		uffd_wp_range(vma, start, end - start, false);
	/*
	 * In the vma_merge() successful mprotect-like case 8:
	 * the next vma was merged into the current one and
	 * the current one has not been updated yet.
	 */
	if (!IS_ERR(ret))
		userfaultfd_reset_ctx(ret);

	/*
	 * Nothing to do: this vma is already registered into this
	 * userfaultfd and with the right tracking mode too.
	 */
	if (vma->vm_userfaultfd_ctx.ctx == ctx &&
	    (vma->vm_flags & vm_flags) == vm_flags)
		goto skip;
	/*
	 * In the vma_merge() successful mprotect-like case 8:
	 * the next vma was merged into the current one and
	 * the current one has not been updated yet.
	 */
userfaultfd_set_ctx(vma, ctx, vm_flags);
if (is_vm_hugetlb_page(vma) && uffd_disable_huge_pmd_share(vma))
hugetlb_unshare_all_pmds(vma);
/* the various vma->vm_userfaultfd_ctx still points to it */
mmap_write_lock(mm);
	for_each_vma(vmi, vma) {
		if (vma->vm_userfaultfd_ctx.ctx == ctx)
			userfaultfd_reset_ctx(vma);
}
mmap_write_unlock(mm);
}
	/*
	 * Flush page faults out of all CPUs. NOTE: all page faults
	 * must be retried without returning VM_FAULT_SIGBUS if
	 * userfaultfd_ctx_get() succeeds but vma->vma_userfault_ctx
	 * changes while handle_userfault released the mmap_lock. So
	 * it's critical that released is set to true (above), before
	 * taking the mmap_lock for writing.
	 */
mmap_write_lock(mm);
prev = NULL;
for_each_vma(vmi, vma) {
cond_resched();
VM_WARN_ON_ONCE(!!vma->vm_userfaultfd_ctx.ctx ^
				!!(vma->vm_flags & __VM_UFFD_FLAGS));
		if (vma->vm_userfaultfd_ctx.ctx != ctx) {
			prev = vma;
			continue;
}