/*
 * NOTE(review): the lines below are non-contiguous fragments (a struct
 * interior, the body of an anon_vma_name allocator, the prologue of
 * madvise_update_vma() and an excerpt of a shmem swapin scan).  The
 * enclosing definitions are not visible in this chunk, so the code is
 * kept byte-identical; only comments are added.
 */
/* * Maximum number of attempts we make to install guard pages before we give up * and return -ERESTARTNOINTR to have userspace try again.
*/ #define MAX_MADVISE_GUARD_RETRIES 3
/* Fragment: tail of the madvise_behavior struct definition (header not visible). */
/* * The range over which the behaviour is currently being applied. If * traversing multiple VMAs, this is updated for each.
*/ struct madvise_behavior_range range; /* The VMA and VMA preceding it (if applicable) currently targeted. */ struct vm_area_struct *prev; struct vm_area_struct *vma; bool lock_dropped;
};
/*
 * Fragment: interior of an anon_vma_name allocator.  NOTE(review): this
 * fragment ends in "return 0;" even though the freshly allocated object is
 * anon_name — presumably the complete function returns anon_name (and NULL
 * on allocation failure); confirm against the full source.
 */
/* Add 1 for NUL terminator at the end of the anon_name->name */
count = strlen(name) + 1;
anon_name = kmalloc(struct_size(anon_name, name, count), GFP_KERNEL); if (anon_name) {
kref_init(&anon_name->kref);
memcpy(anon_name->name, name, count);
}
return 0;
} #endif/* CONFIG_ANON_VMA_NAME */ /* * Update the vm_flags or anon_name on region of a vma, splitting it or merging * it as necessary. Must be called with mmap_lock held for writing.
*/ staticint madvise_update_vma(vm_flags_t new_flags, struct madvise_behavior *madv_behavior)
{ struct vm_area_struct *vma = madv_behavior->vma; struct madvise_behavior_range *range = &madv_behavior->range; struct anon_vma_name *anon_name = madv_behavior->anon_name; bool set_new_anon_name = madv_behavior->behavior == __MADV_SET_ANON_VMA_NAME;
VMA_ITERATOR(vmi, madv_behavior->mm, range->start);
/* NOTE(review): madvise_update_vma() is truncated here — the split/merge
 * logic implied by its comment is not visible in this chunk. */
/* vm_flags is protected by the mmap_lock held in write mode. */
vma_start_write(vma);
vm_flags_reset(vma, new_flags); if (set_new_anon_name) return replace_anon_vma_name(vma, anon_name);
/* Fragment: excerpt of a loop scanning a shmem mapping for swap entries. */
if (!xa_is_value(folio)) continue;
entry = radix_to_swp_entry(folio); /* There might be swapin error entries in shmem mapping. */ if (non_swap_entry(entry)) continue;
/* * Schedule all required I/O operations. Do not wait for completion.
*/ staticlong madvise_willneed(struct madvise_behavior *madv_behavior)
{ struct vm_area_struct *vma = madv_behavior->vma; struct mm_struct *mm = madv_behavior->mm; struct file *file = vma->vm_file; unsignedlong start = madv_behavior->range.start; unsignedlong end = madv_behavior->range.end;
loff_t offset;
#ifdef CONFIG_SWAP if (!file) {
walk_page_range_vma(vma, start, end, &swapin_walk_ops, vma);
lru_add_drain(); /* Push any new pages onto the LRU now */ return 0;
}
if (shmem_mapping(file->f_mapping)) {
shmem_swapin_range(vma, start, end, file->f_mapping);
lru_add_drain(); /* Push any new pages onto the LRU now */ return 0;
} #else if (!file) return -EBADF; #endif
if (IS_DAX(file_inode(file))) { /* no bad return value, but ignore advice */ return 0;
}
/* * Filesystem's fadvise may need to take various locks. We need to * explicitly grab a reference because the vma (and hence the * vma's reference to the file) can go away as soon as we drop * mmap_lock.
*/
mark_mmap_lock_dropped(madv_behavior);
get_file(file);
offset = (loff_t)(start - vma->vm_start)
+ ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
mmap_read_unlock(mm);
vfs_fadvise(file, offset, end - start, POSIX_FADV_WILLNEED);
fput(file);
mmap_read_lock(mm); return 0;
}
staticinlinebool can_do_file_pageout(struct vm_area_struct *vma)
{ if (!vma->vm_file) returnfalse; /* * paging out pagecache only for non-anonymous mappings that correspond * to the files the calling process could (if tried) open for writing; * otherwise we'd be including shared non-exclusive mappings, which * opens a side channel.
*/ return inode_owner_or_capable(&nop_mnt_idmap,
file_inode(vma->vm_file)) ||
file_permission(vma->vm_file, MAY_WRITE) == 0;
}
/*
 * NOTE(review): headerless interior of the cold/pageout and lazyfree PTE
 * walkers (the enclosing function signatures and the beginning of the huge
 * PMD path are not visible in this chunk).  Code kept byte-identical;
 * comments only.
 */
/* Huge PMD path tail: age the folio and either isolate for pageout or
 * deactivate it, then release the PMD lock. */
folio_clear_referenced(folio);
folio_test_clear_young(folio); if (folio_test_active(folio))
folio_set_workingset(folio); if (pageout) { if (folio_isolate_lru(folio)) { if (folio_test_unevictable(folio))
folio_putback_lru(folio); else
list_add(&folio->lru, &folio_list);
}
} else
folio_deactivate(folio);
huge_unlock:
spin_unlock(ptl); if (pageout)
reclaim_pages(&folio_list); return 0;
}
/* Regular (PTE-mapped) folio path begins here. */
regular_folio: #endif
tlb_change_page_size(tlb, PAGE_SIZE);
restart:
start_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); if (!start_pte) return 0;
flush_tlb_batched_pending(mm);
arch_enter_lazy_mmu_mode(); for (; addr < end; pte += nr, addr += nr * PAGE_SIZE) {
nr = 1;
ptent = ptep_get(pte);
/* Periodically drop the PTL so we do not hold it across a reschedule. */
if (++batch_count == SWAP_CLUSTER_MAX) {
batch_count = 0; if (need_resched()) {
arch_leave_lazy_mmu_mode();
pte_unmap_unlock(start_pte, ptl);
cond_resched(); goto restart;
}
}
if (pte_none(ptent)) continue;
if (!pte_present(ptent)) continue;
folio = vm_normal_folio(vma, addr, ptent); if (!folio || folio_is_zone_device(folio)) continue;
/* * If we encounter a large folio, only split it if it is not * fully mapped within the range we are operating on. Otherwise * leave it as is so that it can be swapped out whole. If we * fail to split a folio, leave it in place and advance to the * next pte in the range.
*/ if (folio_test_large(folio)) {
nr = madvise_folio_pte_batch(addr, end, folio, pte, &ptent); if (nr < folio_nr_pages(folio)) { int err;
if (folio_maybe_mapped_shared(folio)) continue; if (pageout_anon_only_filter && !folio_test_anon(folio)) continue; if (!folio_trylock(folio)) continue;
folio_get(folio);
/* Drop PTL and lazy-MMU mode: split_folio() can sleep. */
arch_leave_lazy_mmu_mode();
pte_unmap_unlock(start_pte, ptl);
start_pte = NULL;
err = split_folio(folio);
folio_unlock(folio);
folio_put(folio);
start_pte = pte =
pte_offset_map_lock(mm, pmd, addr, &ptl); if (!start_pte) break;
flush_tlb_batched_pending(mm);
arch_enter_lazy_mmu_mode(); if (!err)
nr = 0; continue;
}
}
/* * Do not interfere with other mappings of this folio and * non-LRU folio. If we have a large folio at this point, we * know it is fully mapped so if its mapcount is the same as its * number of pages, it must be exclusive.
*/ if (!folio_test_lru(folio) ||
folio_mapcount(folio) != folio_nr_pages(folio)) continue;
if (pageout_anon_only_filter && !folio_test_anon(folio)) continue;
/* * We are deactivating a folio for accelerating reclaiming. * VM couldn't reclaim the folio unless we clear PG_young. * As a side effect, it makes confuse idle-page tracking * because they will miss recent referenced history.
*/
folio_clear_referenced(folio);
folio_test_clear_young(folio); if (folio_test_active(folio))
folio_set_workingset(folio); if (pageout) { if (folio_isolate_lru(folio)) { if (folio_test_unevictable(folio))
folio_putback_lru(folio); else
list_add(&folio->lru, &folio_list);
}
} else
folio_deactivate(folio);
}
if (start_pte) {
arch_leave_lazy_mmu_mode();
pte_unmap_unlock(start_pte, ptl);
} if (pageout)
reclaim_pages(&folio_list);
cond_resched();
/* Fragment: pageout permission check (interior of madvise_pageout). */
/* * If the VMA belongs to a private file mapping, there can be private * dirty pages which can be paged out if even this process is neither * owner nor write capable of the file. We allow private file mappings * further to pageout dirty anon pages.
*/ if (!vma_is_anonymous(vma) && (!can_do_file_pageout(vma) &&
(vma->vm_flags & VM_MAYSHARE))) return 0;
/* Fragment: interior of the MADV_FREE (lazyfree) PTE walker. */
if (pte_none(ptent)) continue; /* * If the pte has swp_entry, just clear page table to * prevent swap-in which is more expensive rather than * (page allocation + zeroing).
*/ if (!pte_present(ptent)) {
swp_entry_t entry;
folio = vm_normal_folio(vma, addr, ptent); if (!folio || folio_is_zone_device(folio)) continue;
/* * If we encounter a large folio, only split it if it is not * fully mapped within the range we are operating on. Otherwise * leave it as is so that it can be marked as lazyfree. If we * fail to split a folio, leave it in place and advance to the * next pte in the range.
*/ if (folio_test_large(folio)) {
nr = madvise_folio_pte_batch(addr, end, folio, pte, &ptent); if (nr < folio_nr_pages(folio)) { int err;
if (folio_maybe_mapped_shared(folio)) continue; if (!folio_trylock(folio)) continue;
folio_get(folio);
arch_leave_lazy_mmu_mode();
pte_unmap_unlock(start_pte, ptl);
start_pte = NULL;
err = split_folio(folio);
folio_unlock(folio);
folio_put(folio);
pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
start_pte = pte; if (!start_pte) break;
flush_tlb_batched_pending(mm);
arch_enter_lazy_mmu_mode(); if (!err)
nr = 0; continue;
}
}
if (folio_test_swapcache(folio) || folio_test_dirty(folio)) { if (!folio_trylock(folio)) continue; /* * If we have a large folio at this point, we know it is * fully mapped so if its mapcount is the same as its * number of pages, it must be exclusive.
*/ if (folio_mapcount(folio) != folio_nr_pages(folio)) {
folio_unlock(folio); continue;
}
if (folio_test_swapcache(folio) &&
!folio_free_swap(folio)) {
folio_unlock(folio); continue;
}
/*
 * NOTE(review): the lines below are fragments of madvise_dontneed_single_vma,
 * madvise_dontneed_free_valid_vma and madvise_dontneed_free; the function
 * bodies are interleaved and truncated in this chunk.  Code kept
 * byte-identical; comments only.
 */
/* * Application no longer needs these pages. If the pages are dirty, * it's OK to just throw them away. The app will be more careful about * data it wants to keep. Be sure to free swap resources too. The * zap_page_range_single call sets things up for shrink_active_list to actually * free these pages later if no one else has touched them in the meantime, * although we could add these pages to a global reuse list for * shrink_active_list to pick up before reclaiming other pages. * * NB: This interface discards data rather than pushes it out to swap, * as some implementations do. This has performance implications for * applications like large transactional databases which want to discard * pages in anonymous maps after committing to backing store the data * that was kept in them. There is no reason to write this data out to * the swap area if the application is discarding it. * * An interface that causes the system to free clean pages and flush * dirty pages is already available as msync(MS_INVALIDATE).
*/ staticlong madvise_dontneed_single_vma(struct madvise_behavior *madv_behavior)
/* Fragment: VMA validity check — VM_PFNMAP always forbidden, VM_LOCKED
 * forbidden unless MADV_DONTNEED_LOCKED was requested. */
if (!is_vm_hugetlb_page(vma)) { unsignedint forbidden = VM_PFNMAP;
if (behavior != MADV_DONTNEED_LOCKED)
forbidden |= VM_LOCKED;
return !(vma->vm_flags & forbidden);
}
/* Hugetlb VMAs: only DONTNEED variants allowed, start must be huge-page
 * aligned. */
if (behavior != MADV_DONTNEED && behavior != MADV_DONTNEED_LOCKED) returnfalse; if (range->start & ~huge_page_mask(hstate_vma(vma))) returnfalse;
/* * Madvise callers expect the length to be rounded up to PAGE_SIZE * boundaries, and may be unaware that this VMA uses huge pages. * Avoid unexpected data loss by rounding down the number of * huge pages freed.
*/
range->end = ALIGN_DOWN(range->end, huge_page_size(hstate_vma(vma)));
/* Fragment: interior of madvise_dontneed_free — revalidate the VMA after
 * userfaultfd_remove() may have dropped mmap_lock. */
if (!madvise_dontneed_free_valid_vma(madv_behavior)) return -EINVAL;
if (range->start == range->end) return 0;
if (!userfaultfd_remove(madv_behavior->vma, range->start, range->end)) { struct vm_area_struct *vma;
mark_mmap_lock_dropped(madv_behavior);
mmap_read_lock(mm);
madv_behavior->vma = vma = vma_lookup(mm, range->start); if (!vma) return -ENOMEM; /* * Potential end adjustment for hugetlb vma is OK as * the check below keeps end within vma.
*/ if (!madvise_dontneed_free_valid_vma(madv_behavior)) return -EINVAL; if (range->end > vma->vm_end) { /* * Don't fail if end > vma->vm_end. If the old * vma was split while the mmap_lock was * released the effect of the concurrent * operation may not cause madvise() to * have an undefined result. There may be an * adjacent next vma that we'll walk * next. userfaultfd_remove() will generate an * UFFD_EVENT_REMOVE repetition on the * end-vma->vm_end range, but the manager can * handle a repetition fine.
*/
range->end = vma->vm_end;
} /* * If the memory region between start and end was * originally backed by 4kB pages and then remapped to * be backed by hugepages while mmap_lock was dropped, * the adjustment for hugetlb vma above may have rounded * end down to the start address.
*/ if (range->start == range->end) return 0;
VM_WARN_ON(range->start > range->end);
}
staticlong madvise_populate(struct madvise_behavior *madv_behavior)
{ struct mm_struct *mm = madv_behavior->mm; constbool write = madv_behavior->behavior == MADV_POPULATE_WRITE; int locked = 1; unsignedlong start = madv_behavior->range.start; unsignedlong end = madv_behavior->range.end; long pages;
while (start < end) { /* Populate (prefault) page tables readable/writable. */
pages = faultin_page_range(mm, start, end, write, &locked); if (!locked) {
mmap_read_lock(mm);
locked = 1;
} if (pages < 0) { switch (pages) { case -EINTR: return -EINTR; case -EINVAL: /* Incompatible mappings / permissions. */ return -EINVAL; case -EHWPOISON: return -EHWPOISON; case -EFAULT: /* VM_FAULT_SIGBUS or VM_FAULT_SIGSEGV */ return -EFAULT; default:
pr_warn_once("%s: unhandled return value: %ld\n",
__func__, pages);
fallthrough; case -ENOMEM: /* No VMA or out of memory. */ return -ENOMEM;
}
}
start += pages * PAGE_SIZE;
} return 0;
}
/* * Application wants to free up the pages and associated backing store. * This is effectively punching a hole into the middle of a file.
*/ staticlong madvise_remove(struct madvise_behavior *madv_behavior)
{
loff_t offset; int error; struct file *f; struct mm_struct *mm = madv_behavior->mm; struct vm_area_struct *vma = madv_behavior->vma; unsignedlong start = madv_behavior->range.start; unsignedlong end = madv_behavior->range.end;
mark_mmap_lock_dropped(madv_behavior);
if (vma->vm_flags & VM_LOCKED) return -EINVAL;
f = vma->vm_file;
if (!f || !f->f_mapping || !f->f_mapping->host) { return -EINVAL;
}
/* * Filesystem's fallocate may need to take i_rwsem. We need to * explicitly grab a reference because the vma (and hence the * vma's reference to the file) can go away as soon as we drop * mmap_lock.
*/
get_file(f); if (userfaultfd_remove(vma, start, end)) { /* mmap_lock was not released by userfaultfd_remove() */
mmap_read_unlock(mm);
}
error = vfs_fallocate(f,
FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
offset, end - start);
fput(f);
mmap_read_lock(mm); return error;
}
/* NOTE(review): fragment of is_valid_guard_vma() — the function header and
 * the rest of the 'disallowed' mask are not visible in this chunk. */
/* * A user could lock after setting a guard range but that's fine, as * they'd not be able to fault in. The issue arises when we try to zap * existing locked VMAs. We don't want to do that.
*/ if (!allow_locked)
disallowed |= VM_LOCKED;
/*
 * Implement MADV_GUARD_INSTALL: install guard markers across the range,
 * zapping any racing non-guard pages and retrying up to
 * MAX_MADVISE_GUARD_RETRIES times before asking userspace to restart.
 */
static long madvise_guard_install(struct madvise_behavior *madv_behavior)
{
	struct vm_area_struct *vma = madv_behavior->vma;
	struct madvise_behavior_range *range = &madv_behavior->range;
	long err;
	int i;

	if (!is_valid_guard_vma(vma, /* allow_locked = */false))
		return -EINVAL;

	/*
	 * If we install guard markers, then the range is no longer
	 * empty from a page table perspective and therefore it's
	 * appropriate to have an anon_vma.
	 *
	 * This ensures that on fork, we copy page tables correctly.
	 */
	err = anon_vma_prepare(vma);
	if (err)
		return err;

	/*
	 * Optimistically try to install the guard marker pages first. If any
	 * non-guard pages are encountered, give up and zap the range before
	 * trying again.
	 *
	 * We try a few times before giving up and releasing back to userland to
	 * loop around, releasing locks in the process to avoid contention. This
	 * would only happen if there was a great many racing page faults.
	 *
	 * In most cases we should simply install the guard markers immediately
	 * with no zap or looping.
	 */
	for (i = 0; i < MAX_MADVISE_GUARD_RETRIES; i++) {
		unsigned long nr_pages = 0;

		/* Returns < 0 on error, == 0 if success, > 0 if zap needed. */
		err = walk_page_range_mm(vma->vm_mm, range->start, range->end,
					 &guard_install_walk_ops, &nr_pages);
		if (err < 0)
			return err;
		/*
		 * BUG FIX: the success case must terminate the retry loop.
		 * Without this, a fully-successful walk fell through to the
		 * zap below and, after the retries were exhausted, returned
		 * restart_syscall() even though every marker was installed.
		 */
		if (err == 0)
			return 0;

		/*
		 * OK some of the range have non-guard pages mapped, zap
		 * them. This leaves existing guard pages in place.
		 */
		zap_page_range_single(vma, range->start,
				      range->end - range->start, NULL);
	}

	/*
	 * We were unable to install the guard pages due to being raced by page
	 * faults. This should not happen ordinarily. We return to userspace and
	 * immediately retry, relieving lock contention.
	 */
	return restart_syscall();
}
/* NOTE(review): fragment of madvise_guard_remove() — the function header and
 * the remainder of its body are not visible in this chunk. */
/* * We're ok with removing guards in mlock()'d ranges, as this is a * non-destructive action.
*/ if (!is_valid_guard_vma(vma, /* allow_locked = */true)) return -EINVAL;
#ifdef CONFIG_64BIT /* Does the madvise operation result in discarding of mapped data? */ staticbool is_discard(int behavior)
{ switch (behavior) { case MADV_FREE: case MADV_DONTNEED: case MADV_DONTNEED_LOCKED: case MADV_REMOVE: case MADV_DONTFORK: case MADV_WIPEONFORK: case MADV_GUARD_INSTALL: returntrue;
}
returnfalse;
}
/* * We are restricted from madvise()'ing mseal()'d VMAs only in very particular * circumstances - discarding of data from read-only anonymous SEALED mappings. * * This is because users cannot trivally discard data from these VMAs, and may * only do so via an appropriate madvise() call.
*/ staticbool can_madvise_modify(struct madvise_behavior *madv_behavior)
{ struct vm_area_struct *vma = madv_behavior->vma;
/* If the VMA isn't sealed we're good. */ if (!vma_is_sealed(vma)) returntrue;
/* For a sealed VMA, we only care about discard operations. */ if (!is_discard(madv_behavior->behavior)) returntrue;
/* * We explicitly permit all file-backed mappings, whether MAP_SHARED or * MAP_PRIVATE. * * The latter causes some complications. Because now, one can mmap() * read/write a MAP_PRIVATE mapping, write to it, then mprotect() * read-only, mseal() and a discard will be permitted. * * However, in order to avoid issues with potential use of madvise(..., * MADV_DONTNEED) of mseal()'d .text mappings we, for the time being, * permit this.
*/ if (!vma_is_anonymous(vma)) returntrue;
/* If the user could write to the mapping anyway, then this is fine. */ if ((vma->vm_flags & VM_WRITE) &&
arch_vma_access_permitted(vma, /* write= */ true, /* execute= */ false, /* foreign= */ false)) returntrue;
/* Otherwise, we are not permitted to perform this operation. */ returnfalse;
} #else staticbool can_madvise_modify(struct madvise_behavior *madv_behavior)
{ returntrue;
} #endif
/* * Apply an madvise behavior to a region of a vma. madvise_update_vma * will handle splitting a vm area into separate areas, each area with its own * behavior.
*/ staticint madvise_vma_behavior(struct madvise_behavior *madv_behavior)
{ int behavior = madv_behavior->behavior; struct vm_area_struct *vma = madv_behavior->vma;
vm_flags_t new_flags = vma->vm_flags; struct madvise_behavior_range *range = &madv_behavior->range; int error;
if (unlikely(!can_madvise_modify(madv_behavior))) return -EPERM;
switch (behavior) { case MADV_REMOVE: return madvise_remove(madv_behavior); case MADV_WILLNEED: return madvise_willneed(madv_behavior); case MADV_COLD: return madvise_cold(madv_behavior); case MADV_PAGEOUT: return madvise_pageout(madv_behavior); case MADV_FREE: case MADV_DONTNEED: case MADV_DONTNEED_LOCKED: return madvise_dontneed_free(madv_behavior); case MADV_COLLAPSE: return madvise_collapse(vma, range->start, range->end,
&madv_behavior->lock_dropped); case MADV_GUARD_INSTALL: return madvise_guard_install(madv_behavior); case MADV_GUARD_REMOVE: return madvise_guard_remove(madv_behavior);
/* The below behaviours update VMAs via madvise_update_vma(). */
case MADV_NORMAL:
new_flags = new_flags & ~VM_RAND_READ & ~VM_SEQ_READ; break; case MADV_SEQUENTIAL:
new_flags = (new_flags & ~VM_RAND_READ) | VM_SEQ_READ; break; case MADV_RANDOM:
new_flags = (new_flags & ~VM_SEQ_READ) | VM_RAND_READ; break; case MADV_DONTFORK:
new_flags |= VM_DONTCOPY; break; case MADV_DOFORK: if (new_flags & VM_IO) return -EINVAL;
new_flags &= ~VM_DONTCOPY; break; case MADV_WIPEONFORK: /* MADV_WIPEONFORK is only supported on anonymous memory. */ if (vma->vm_file || new_flags & VM_SHARED) return -EINVAL;
new_flags |= VM_WIPEONFORK; break; case MADV_KEEPONFORK: if (new_flags & VM_DROPPABLE) return -EINVAL;
new_flags &= ~VM_WIPEONFORK; break; case MADV_DONTDUMP:
new_flags |= VM_DONTDUMP; break; case MADV_DODUMP: if ((!is_vm_hugetlb_page(vma) && (new_flags & VM_SPECIAL)) ||
(new_flags & VM_DROPPABLE)) return -EINVAL;
new_flags &= ~VM_DONTDUMP; break; case MADV_MERGEABLE: case MADV_UNMERGEABLE:
error = ksm_madvise(vma, range->start, range->end,
behavior, &new_flags); if (error) goto out; break; case MADV_HUGEPAGE: case MADV_NOHUGEPAGE:
error = hugepage_madvise(vma, &new_flags, behavior); if (error) goto out; break; case __MADV_SET_ANON_VMA_NAME: /* Only anonymous mappings can be named */ if (vma->vm_file && !vma_is_anon_shmem(vma)) return -EBADF; break;
}
/* This is a write operation.*/
VM_WARN_ON_ONCE(madv_behavior->lock_mode != MADVISE_MMAP_WRITE_LOCK);
error = madvise_update_vma(new_flags, madv_behavior);
out: /* * madvise() returns EAGAIN if kernel resources, such as * slab, are temporarily unavailable.
*/ if (error == -ENOMEM)
error = -EAGAIN; return error;
}
#ifdef CONFIG_MEMORY_FAILURE
/*
 * Error injection support for memory error handling: implements
 * MADV_HWPOISON and MADV_SOFT_OFFLINE (CAP_SYS_ADMIN only).
 */
static int madvise_inject_error(struct madvise_behavior *madv_behavior)
{
	unsigned long size;
	unsigned long start = madv_behavior->range.start;
	unsigned long end = madv_behavior->range.end;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	for (; start < end; start += size) {
		unsigned long pfn;
		struct page *page;
		int ret;

		ret = get_user_pages_fast(start, 1, 0, &page);
		if (ret != 1)
			return ret;
		pfn = page_to_pfn(page);

		/*
		 * When soft offlining hugepages, after migrating the page
		 * we dissolve it, therefore in the second loop "page" will
		 * no longer be a compound page.
		 */
		size = page_size(compound_head(page));

		if (madv_behavior->behavior == MADV_SOFT_OFFLINE) {
			pr_info("Soft offlining pfn %#lx at process virtual address %#lx\n",
				pfn, start);
			ret = soft_offline_page(pfn, MF_COUNT_INCREASED);
		} else {
			pr_info("Injecting memory failure for pfn %#lx at process virtual address %#lx\n",
				pfn, start);
			/* MF_COUNT_INCREASED: the GUP reference is consumed. */
			ret = memory_failure(pfn, MF_ACTION_REQUIRED |
						  MF_COUNT_INCREASED |
						  MF_SW_SIMULATED);
			if (ret == -EOPNOTSUPP)
				ret = 0;
		}

		if (ret)
			return ret;
	}
	return 0;
}
staticbool is_memory_failure(struct madvise_behavior *madv_behavior)
{ switch (madv_behavior->behavior) { case MADV_HWPOISON: case MADV_SOFT_OFFLINE: returntrue; default: returnfalse;
}
}
staticbool
madvise_behavior_valid(int behavior)
{ switch (behavior) { case MADV_DOFORK: case MADV_DONTFORK: case MADV_NORMAL: case MADV_SEQUENTIAL: case MADV_RANDOM: case MADV_REMOVE: case MADV_WILLNEED: case MADV_DONTNEED: case MADV_DONTNEED_LOCKED: case MADV_FREE: case MADV_COLD: case MADV_PAGEOUT: case MADV_POPULATE_READ: case MADV_POPULATE_WRITE: #ifdef CONFIG_KSM case MADV_MERGEABLE: case MADV_UNMERGEABLE: #endif #ifdef CONFIG_TRANSPARENT_HUGEPAGE case MADV_HUGEPAGE: case MADV_NOHUGEPAGE: case MADV_COLLAPSE: #endif case MADV_DONTDUMP: case MADV_DODUMP: case MADV_WIPEONFORK: case MADV_KEEPONFORK: case MADV_GUARD_INSTALL: case MADV_GUARD_REMOVE: #ifdef CONFIG_MEMORY_FAILURE case MADV_SOFT_OFFLINE: case MADV_HWPOISON: #endif returntrue;
default: returnfalse;
}
}
/* Can we invoke process_madvise() on a remote mm for the specified behavior? */ staticbool process_madvise_remote_valid(int behavior)
{ switch (behavior) { case MADV_COLD: case MADV_PAGEOUT: case MADV_WILLNEED: case MADV_COLLAPSE: returntrue; default: returnfalse;
}
}
/* * Try to acquire a VMA read lock if possible. * * We only support this lock over a single VMA, which the input range must * span either partially or fully. * * This function always returns with an appropriate lock held. If a VMA read * lock could be acquired, we return true and set madv_behavior state * accordingly. * * If a VMA read lock could not be acquired, we return false and expect caller to * fallback to mmap lock behaviour.
*/ staticbool try_vma_read_lock(struct madvise_behavior *madv_behavior)
{ struct mm_struct *mm = madv_behavior->mm; struct vm_area_struct *vma;
vma = lock_vma_under_rcu(mm, madv_behavior->range.start); if (!vma) goto take_mmap_read_lock; /* * Must span only a single VMA; uffd and remote processes are * unsupported.
*/ if (madv_behavior->range.end > vma->vm_end || current->mm != mm ||
userfaultfd_armed(vma)) {
vma_end_read(vma); goto take_mmap_read_lock;
}
madv_behavior->vma = vma; returntrue;
/* NOTE(review): madvise_walk_vmas() is truncated in this chunk — the initial
 * VMA lookup feeding the for(;;) loop and the remainder of the loop body are
 * not visible.  Code kept byte-identical; comments only. */
/* * Walk the vmas in range [start,end), and call the madvise_vma_behavior * function on each one. The function will get start and end parameters that * cover the overlap between the current vma and the original range. Any * unmapped regions in the original range will result in this function returning * -ENOMEM while still calling the madvise_vma_behavior function on all of the * existing vmas in the range. Must be called with the mmap_lock held for * reading or writing.
*/ static int madvise_walk_vmas(struct madvise_behavior *madv_behavior)
{ struct mm_struct *mm = madv_behavior->mm; struct madvise_behavior_range *range = &madv_behavior->range; /* range is updated to span each VMA, so store end of entire range. */ unsignedlong last_end = range->end; int unmapped_error = 0; int error; struct vm_area_struct *prev, *vma;
/* * If VMA read lock is supported, apply madvise to a single VMA * tentatively, avoiding walking VMAs.
*/ if (madv_behavior->lock_mode == MADVISE_VMA_READ_LOCK &&
try_vma_read_lock(madv_behavior)) {
error = madvise_vma_behavior(madv_behavior);
vma_end_read(madv_behavior->vma);
return error;
}
/* NOTE(review): 'vma' is used below but its initialising lookup is missing
 * from this chunk — presumably a find_vma_prev()-style call precedes the
 * loop in the full source. */
for (;;) { /* Still start < end. */ if (!vma) return -ENOMEM;
/* Here start < (last_end|vma->vm_end). */ if (range->start < vma->vm_start) { /* * This indicates a gap between VMAs in the input * range. This does not cause the operation to abort, * rather we simply return -ENOMEM to indicate that this * has happened, but carry on.
*/
unmapped_error = -ENOMEM;
range->start = vma->vm_start; if (range->start >= last_end) break;
}
/* * Any behaviour which results in changes to the vma->vm_flags needs to * take mmap_lock for writing. Others, which simply traverse vmas, need * to only take it for reading.
*/ staticenum madvise_lock_mode get_lock_mode(struct madvise_behavior *madv_behavior)
{ if (is_memory_failure(madv_behavior)) return MADVISE_NO_LOCK;
switch (madv_behavior->behavior) { case MADV_REMOVE: case MADV_WILLNEED: case MADV_COLD: case MADV_PAGEOUT: case MADV_POPULATE_READ: case MADV_POPULATE_WRITE: case MADV_COLLAPSE: case MADV_GUARD_INSTALL: case MADV_GUARD_REMOVE: return MADVISE_MMAP_READ_LOCK; case MADV_DONTNEED: case MADV_DONTNEED_LOCKED: case MADV_FREE: return MADVISE_VMA_READ_LOCK; default: return MADVISE_MMAP_WRITE_LOCK;
}
}
/* NOTE(review): two headerless fragments — the lock-acquisition switch of
 * madvise_lock() and the release switch of madvise_unlock(); the enclosing
 * function signatures are not visible in this chunk. */
switch (lock_mode) { case MADVISE_NO_LOCK: break; case MADVISE_MMAP_WRITE_LOCK: if (mmap_write_lock_killable(mm)) return -EINTR; break; case MADVISE_MMAP_READ_LOCK:
mmap_read_lock(mm); break; case MADVISE_VMA_READ_LOCK: /* We will acquire the lock per-VMA in madvise_walk_vmas(). */ break;
}
/* Fragment: madvise_unlock() — release whichever lock madvise_lock() took. */
switch (madv_behavior->lock_mode) { case MADVISE_NO_LOCK: return; case MADVISE_MMAP_WRITE_LOCK:
mmap_write_unlock(mm); break; case MADVISE_MMAP_READ_LOCK:
mmap_read_unlock(mm); break; case MADVISE_VMA_READ_LOCK: /* We will drop the lock per-VMA in madvise_walk_vmas(). */ break;
}
madv_behavior->lock_mode = MADVISE_NO_LOCK;
}
staticbool madvise_batch_tlb_flush(int behavior)
{ switch (behavior) { case MADV_DONTNEED: case MADV_DONTNEED_LOCKED: case MADV_FREE: returntrue; default: returnfalse;
}
}
/* Start TLB-flush batching for behaviors that zap mappings. */
static void madvise_init_tlb(struct madvise_behavior *madv_behavior)
{
	if (madvise_batch_tlb_flush(madv_behavior->behavior))
		tlb_gather_mmu(madv_behavior->tlb, madv_behavior->mm);
}
/* Flush and tear down the mmu_gather started by madvise_init_tlb(). */
static void madvise_finish_tlb(struct madvise_behavior *madv_behavior)
{
	if (madvise_batch_tlb_flush(madv_behavior->behavior))
		tlb_finish_mmu(madv_behavior->tlb);
}
staticbool is_valid_madvise(unsignedlong start, size_t len_in, int behavior)
{
size_t len;
if (!madvise_behavior_valid(behavior)) returnfalse;
if (!PAGE_ALIGNED(start)) returnfalse;
len = PAGE_ALIGN(len_in);
/* Check to see whether len was rounded up from small -ve to zero */ if (len_in && !len) returnfalse;
if (start + len < start) returnfalse;
returntrue;
}
/* * madvise_should_skip() - Return if the request is invalid or nothing. * @start: Start address of madvise-requested address range. * @len_in: Length of madvise-requested address range. * @behavior: Requested madvise behavor. * @err: Pointer to store an error code from the check. * * If the specified behaviour is invalid or nothing would occur, we skip the * operation. This function returns true in the cases, otherwise false. In * the former case we store an error on @err.
*/ staticbool madvise_should_skip(unsignedlong start, size_t len_in, int behavior, int *err)
{ if (!is_valid_madvise(start, len_in, behavior)) {
*err = -EINVAL; returntrue;
} if (start + PAGE_ALIGN(len_in) == start) {
*err = 0; returntrue;
} returnfalse;
}
staticbool is_madvise_populate(struct madvise_behavior *madv_behavior)
{ switch (madv_behavior->behavior) { case MADV_POPULATE_READ: case MADV_POPULATE_WRITE: returntrue; default: returnfalse;
}
}
/* * untagged_addr_remote() assumes mmap_lock is already held. On * architectures like x86 and RISC-V, tagging is tricky because each * mm may have a different tagging mask. However, we might only hold * the per-VMA lock (currently only local processes are supported), * so untagged_addr is used to avoid the mmap_lock assertion for * local processes.
*/ staticinlineunsignedlong get_untagged_addr(struct mm_struct *mm, unsignedlong start)
{ return current->mm == mm ? untagged_addr(start) :
untagged_addr_remote(mm, start);
}
/* NOTE(review): do_madvise() is truncated in this chunk — only the doc
 * comment and the local-variable initialisation survive; the body ends
 * mid-function.  Code kept byte-identical; comments only. */
/* * The madvise(2) system call. * * Applications can use madvise() to advise the kernel how it should * handle paging I/O in this VM area. The idea is to help the kernel * use appropriate read-ahead and caching techniques. The information * provided is advisory only, and can be safely disregarded by the * kernel without affecting the correct operation of the application. * * behavior values: * MADV_NORMAL - the default behavior is to read clusters. This * results in some read-ahead and read-behind. * MADV_RANDOM - the system should read the minimum amount of data * on any access, since it is unlikely that the appli- * cation will need more than what it asks for. * MADV_SEQUENTIAL - pages in the given range will probably be accessed * once, so they can be aggressively read ahead, and * can be freed soon after they are accessed. * MADV_WILLNEED - the application is notifying the system to read * some pages ahead. * MADV_DONTNEED - the application is finished with the given range, * so the kernel can free resources associated with it. * MADV_FREE - the application marks pages in the given range as lazy free, * where actual purges are postponed until memory pressure happens. * MADV_REMOVE - the application wants to free up the given range of * pages and associated backing store. * MADV_DONTFORK - omit this area from child's address space when forking: * typically, to avoid COWing pages pinned by get_user_pages(). * MADV_DOFORK - cancel MADV_DONTFORK: no longer omit this area when forking. * MADV_WIPEONFORK - present the child process with zero-filled memory in this * range after a fork. * MADV_KEEPONFORK - undo the effect of MADV_WIPEONFORK * MADV_HWPOISON - trigger memory error handler as if the given memory range * were corrupted by unrecoverable hardware memory failure. * MADV_SOFT_OFFLINE - try to soft-offline the given range of memory. 
* MADV_MERGEABLE - the application recommends that KSM try to merge pages in * this area with pages of identical content from other such areas. * MADV_UNMERGEABLE- cancel MADV_MERGEABLE: no longer merge pages with others. * MADV_HUGEPAGE - the application wants to back the given range by transparent * huge pages in the future. Existing pages might be coalesced and * new pages might be allocated as THP. * MADV_NOHUGEPAGE - mark the given range as not worth being backed by * transparent huge pages so the existing pages will not be * coalesced into THP and new pages will not be allocated as THP. * MADV_COLLAPSE - synchronously coalesce pages into new THP. * MADV_DONTDUMP - the application wants to prevent pages in the given range * from being included in its core dump. * MADV_DODUMP - cancel MADV_DONTDUMP: no longer exclude from core dump. * MADV_COLD - the application is not expected to use this memory soon, * deactivate pages in this range so that they can be reclaimed * easily if memory pressure happens. * MADV_PAGEOUT - the application is not expected to use this memory soon, * page out the pages in this range immediately. * MADV_POPULATE_READ - populate (prefault) page tables readable by * triggering read faults if required * MADV_POPULATE_WRITE - populate (prefault) page tables writable by * triggering write faults if required * * return values: * zero - success * -EINVAL - start + len < 0, start is not page-aligned, * "behavior" is not a valid value, or application * is attempting to release locked or shared pages, * or the specified address range includes file, Huge TLB, * MAP_SHARED or VMPFNMAP range. * -ENOMEM - addresses in the specified range are not currently * mapped, or are outside the AS of the process. * -EIO - an I/O error occurred while paging in data. * -EBADF - map exists, but area maps something that isn't a file. * -EAGAIN - a kernel resource was temporarily unavailable. * -EPERM - memory is sealed.
*/ int do_madvise(struct mm_struct *mm, unsignedlong start, size_t len_in, int behavior)
{ int error; struct mmu_gather tlb; struct madvise_behavior madv_behavior = {
.mm = mm,
.behavior = behavior,
.tlb = &tlb,
};
/* NOTE(review): the remainder of do_madvise() (skip check, locking, TLB
 * setup and the call into madvise_do_behavior()) is missing from this
 * chunk. */
/*
 * Perform an madvise operation over a vector of addresses and lengths.
 *
 * Returns the number of bytes spanned by the iovec entries processed so far,
 * or, if nothing was consumed, the first error encountered.
 */
static ssize_t vector_madvise(struct mm_struct *mm, struct iov_iter *iter,
			      int behavior)
{
	ssize_t ret = 0;
	size_t total_len;
	struct mmu_gather tlb;
	struct madvise_behavior madv_behavior = {
		.mm = mm,
		.behavior = behavior,
		.tlb = &tlb,
	};

	total_len = iov_iter_count(iter);

	ret = madvise_lock(&madv_behavior);
	if (ret)
		return ret;
	madvise_init_tlb(&madv_behavior);

	while (iov_iter_count(iter)) {
		unsigned long start = (unsigned long)iter_iov_addr(iter);
		size_t len_in = iter_iov_len(iter);
		int error;

		if (madvise_should_skip(start, len_in, behavior, &error))
			ret = error;
		else
			ret = madvise_do_behavior(start, len_in, &madv_behavior);
		/*
		 * An madvise operation is attempting to restart the syscall,
		 * but we cannot proceed as it would not be correct to repeat
		 * the operation in aggregate, and would be surprising to the
		 * user.
		 *
		 * We drop and reacquire locks so it is safe to just loop and
		 * try again. We check for fatal signals in case we need exit
		 * early anyway.
		 */
		if (ret == -ERESTARTNOINTR) {
			if (fatal_signal_pending(current)) {
				ret = -EINTR;
				break;
			}

			/* Drop and reacquire lock to unwind race. */
			madvise_finish_tlb(&madv_behavior);
			madvise_unlock(&madv_behavior);
			ret = madvise_lock(&madv_behavior);
			if (ret)
				goto out;
			madvise_init_tlb(&madv_behavior);
			continue;
		}
		if (ret < 0)
			break;
		iov_iter_advance(iter, iter_iov_len(iter));
	}
	madvise_finish_tlb(&madv_behavior);
	madvise_unlock(&madv_behavior);

out:
	/* Report bytes consumed; fall back to the error if none were. */
	ret = (total_len - iov_iter_count(iter)) ? : ret;
	/* BUG FIX: the function previously ended without returning. */
	return ret;
}
/* NOTE(review): headerless interior of the process_madvise() syscall body —
 * the function signature, variable declarations and cleanup labels
 * (out/free_iov/release_task/release_mm) are not visible in this chunk.
 * Code kept byte-identical; comments only. */
ret = import_iovec(ITER_DEST, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter); if (ret < 0) goto out;
/* Resolve the pidfd to its task. */
task = pidfd_get_task(pidfd, &f_flags); if (IS_ERR(task)) {
ret = PTR_ERR(task); goto free_iov;
}
/* Require PTRACE_MODE_READ to avoid leaking ASLR metadata. */
mm = mm_access(task, PTRACE_MODE_READ_FSCREDS); if (IS_ERR(mm)) {
ret = PTR_ERR(mm); goto release_task;
}
/* * We need only perform this check if we are attempting to manipulate a * remote process's address space.
*/ if (mm != current->mm && !process_madvise_remote_valid(behavior)) {
ret = -EINVAL; goto release_mm;
}
/* * Require CAP_SYS_NICE for influencing process performance. Note that * only non-destructive hints are currently supported for remote * processes.
*/ if (mm != current->mm && !capable(CAP_SYS_NICE)) {
ret = -EPERM; goto release_mm;
}
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereitgestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.