/*
 * Mask applied to a page offset handed in through a system call.
 *
 * The page offset is later turned into a byte offset held in a loff_t,
 * which is signed.  Any bit set in the upper PAGE_SHIFT + 1 bits of the
 * page offset would therefore be lost or land in the sign bit: PAGE_SHIFT
 * bits are shifted out by the conversion and one more bit accounts for
 * the sign.  This mask selects exactly those upper PAGE_SHIFT + 1 bits.
 */
#define PGOFF_LOFFT_MAX \
	(~(~0UL << (PAGE_SHIFT + 1)) << (BITS_PER_LONG - (PAGE_SHIFT + 1)))
/* * vma address alignment (but not the pgoff alignment) has * already been checked by prepare_hugepage_range. If you add * any error returns here, do so after setting VM_HUGETLB, so * is_vm_hugetlb_page tests below unmap_region go the right * way when do_mmap unwinds (may be important on powerpc * and ia64).
*/
vm_flags_set(vma, VM_HUGETLB | VM_DONTEXPAND);
vma->vm_ops = &hugetlb_vm_ops;
/* * page based offset in vm_pgoff could be sufficiently large to * overflow a loff_t when converted to byte offset. This can * only happen on architectures where sizeof(loff_t) == * sizeof(unsigned long). So, only check in those instances.
*/ if (sizeof(unsignedlong) == sizeof(loff_t)) { if (vma->vm_pgoff & PGOFF_LOFFT_MAX) return -EINVAL;
}
/* must be huge page aligned */ if (vma->vm_pgoff & (~huge_page_mask(h) >> PAGE_SHIFT)) return -EINVAL;
vma_len = (loff_t)(vma->vm_end - vma->vm_start);
len = vma_len + ((loff_t)vma->vm_pgoff << PAGE_SHIFT); /* check for overflow */ if (len < vma_len) return -EINVAL;
inode_lock(inode);
file_accessed(file);
ret = -ENOMEM;
vm_flags = vma->vm_flags; /* * for SHM_HUGETLB, the pages are reserved in the shmget() call so skip * reserving here. Note: only for SHM hugetlbfs file, the inode * flag S_PRIVATE is set.
*/ if (inode->i_flags & S_PRIVATE)
vm_flags |= VM_NORESERVE;
if (hugetlb_reserve_pages(inode,
vma->vm_pgoff >> huge_page_order(h),
len >> huge_page_shift(h), vma,
vm_flags) < 0) goto out;
ret = 0; if (vma->vm_flags & VM_WRITE && inode->i_size < len)
i_size_write(inode, len);
out:
inode_unlock(inode);
/*
 * A reader wants @bytes bytes of a HWPOISON hugetlb @folio, starting at
 * byte @offset within the folio.
 *
 * Returns how many of those bytes can be read before the first raw
 * HWPOISON page would be touched; 0 if the very first page is poisoned.
 *
 * The page-by-page walk mirrors the iteration used by copy_page_to_iter*.
 */
static size_t adjust_range_hwpoison(struct folio *folio, size_t offset,
				    size_t bytes)
{
	/* Page containing @offset, and the position inside that page. */
	struct page *cur = folio_page(folio, offset / PAGE_SIZE);
	size_t in_page = offset % PAGE_SIZE;
	size_t safe = 0;

	while (!is_raw_hwpoison_page_in_hugepage(cur)) {
		/* Everything up to the end of this page is safe to read. */
		size_t chunk = min(bytes, (size_t)PAGE_SIZE - in_page);

		safe += chunk;
		bytes -= chunk;
		if (bytes == 0 || chunk == 0)
			break;

		/* Step to the following page once this one is consumed. */
		in_page += chunk;
		if (in_page == PAGE_SIZE) {
			cur = nth_page(cur, 1);
			in_page = 0;
		}
	}

	return safe;
}
/* * Support for read() - Find the page attached to f_mapping and copy out the * data. This provides functionality similar to filemap_read().
*/ static ssize_t hugetlbfs_read_iter(struct kiocb *iocb, struct iov_iter *to)
{ struct file *file = iocb->ki_filp; struct hstate *h = hstate_file(file); struct address_space *mapping = file->f_mapping; struct inode *inode = mapping->host; unsignedlong index = iocb->ki_pos >> huge_page_shift(h); unsignedlong offset = iocb->ki_pos & ~huge_page_mask(h); unsignedlong end_index;
loff_t isize;
ssize_t retval = 0;
while (iov_iter_count(to)) { struct folio *folio;
size_t nr, copied, want;
/* nr is the maximum number of bytes to copy from this page */
nr = huge_page_size(h);
isize = i_size_read(inode); if (!isize) break;
end_index = (isize - 1) >> huge_page_shift(h); if (index > end_index) break; if (index == end_index) {
nr = ((isize - 1) & ~huge_page_mask(h)) + 1; if (nr <= offset) break;
}
nr = nr - offset;
/* Find the folio */
folio = filemap_lock_hugetlb_folio(h, mapping, index); if (IS_ERR(folio)) { /* * We have a HOLE, zero out the user-buffer for the * length of the hole or request.
*/
copied = iov_iter_zero(nr, to);
} else {
folio_unlock(folio);
if (!folio_test_hwpoison(folio))
want = nr; else { /* * Adjust how many bytes safe to read without * touching the 1st raw HWPOISON page after * offset.
*/
want = adjust_range_hwpoison(folio, offset, nr); if (want == 0) {
folio_put(folio);
retval = -EIO; break;
}
}
/* * We have the folio, copy it to user space buffer.
*/
copied = copy_folio_to_iter(folio, offset, want, to);
folio_put(folio);
}
offset += copied;
retval += copied; if (copied != nr && iov_iter_count(to)) { if (!retval)
retval = -EFAULT; break;
}
index += offset >> huge_page_shift(h);
offset &= ~huge_page_mask(h);
}
iocb->ki_pos = ((loff_t)index << huge_page_shift(h)) + offset; return retval;
}
/* * Called with i_mmap_rwsem held for inode based vma maps. This makes * sure vma (and vm_mm) will not go away. We also hold the hugetlb fault * mutex for the page in the mapping. So, we can not race with page being * faulted into the vma.
*/ staticbool hugetlb_vma_maps_pfn(struct vm_area_struct *vma, unsignedlong addr, unsignedlong pfn)
{
pte_t *ptep, pte;
ptep = hugetlb_walk(vma, addr, huge_page_size(hstate_vma(vma))); if (!ptep) returnfalse;
pte = huge_ptep_get(vma->vm_mm, addr, ptep); if (huge_pte_none(pte) || !pte_present(pte)) returnfalse;
if (pte_pfn(pte) == pfn) returntrue;
returnfalse;
}
/* * Can vma_offset_start/vma_offset_end overflow on 32-bit arches? * No, because the interval tree returns us only those vmas * which overlap the truncated area starting at pgoff, * and no vma on a 32-bit arch can span beyond the 4GB.
*/ staticunsignedlong vma_offset_start(struct vm_area_struct *vma, pgoff_t start)
{ unsignedlong offset = 0;
if (!hugetlb_vma_maps_pfn(vma, v_start, pfn)) continue;
if (!hugetlb_vma_trylock_write(vma)) {
vma_lock = vma->vm_private_data; /* * If we can not get vma lock, we need to drop * immap_sema and take locks in order. First, * take a ref on the vma_lock structure so that * we can be guaranteed it will not go away when * dropping immap_sema.
*/
kref_get(&vma_lock->refs); break;
}
if (vma_lock) { /* * Wait on vma_lock. We know it is still valid as we have * a reference. We must 'open code' vma locking as we do * not know if vma_lock is still attached to vma.
*/
down_write(&vma_lock->rw_sema);
i_mmap_lock_write(mapping);
vma = vma_lock->vma; if (!vma) { /* * If lock is no longer attached to vma, then just * unlock, drop our reference and retry looking for * other vmas.
*/
up_write(&vma_lock->rw_sema);
kref_put(&vma_lock->refs, hugetlb_vma_lock_release); goto retry;
}
/* * vma_lock is still attached to vma. Check to see if vma * still maps page and if so, unmap.
*/
v_start = vma_offset_start(vma, start);
v_end = vma_offset_end(vma, end); if (hugetlb_vma_maps_pfn(vma, v_start, pfn))
unmap_hugepage_range(vma, v_start, v_end, NULL,
ZAP_FLAG_DROP_MARKER);
/* * end == 0 indicates that the entire range after start should be * unmapped. Note, end is exclusive, whereas the interval tree takes * an inclusive "last".
*/
vma_interval_tree_foreach(vma, root, start, end ? end - 1 : ULONG_MAX) { unsignedlong v_start; unsignedlong v_end;
/* * Note that vma lock only exists for shared/non-private * vmas. Therefore, lock is not held when calling * unmap_hugepage_range for private vmas.
*/
hugetlb_vma_unlock_write(vma);
}
}
/*
 * Called with hugetlb fault mutex held.
 *
 * Unmaps @folio if it is still mapped, removes it from the page cache and,
 * for hole punch (!@truncate_op), drops the reservation covering @index.
 *
 * Always returns true in this implementation, i.e. the folio was removed;
 * the bool result lets the caller count freed folios.
 */
static bool remove_inode_single_folio(struct hstate *h, struct inode *inode,
				      struct address_space *mapping,
				      struct folio *folio, pgoff_t index,
				      bool truncate_op)
{
	/*
	 * If folio is mapped, it was faulted in after being
	 * unmapped in caller or hugetlb_vmdelete_list() skips
	 * unmapping it due to fail to grab lock.  Unmap (again)
	 * while holding the fault mutex.  The mutex will prevent
	 * faults until we finish removing the folio.  Hold folio
	 * lock to guarantee no concurrent migration.
	 */
	folio_lock(folio);
	if (unlikely(folio_mapped(folio)))
		hugetlb_unmap_file_folio(h, mapping, folio, index);

	/*
	 * We must remove the folio from page cache before removing
	 * the region/ reserve map (hugetlb_unreserve_pages).  In
	 * rare out of memory conditions, removal of the region/reserve
	 * map could fail.  Correspondingly, the subpool and global
	 * reserve usage count may need to be adjusted.
	 */
	VM_BUG_ON_FOLIO(folio_test_hugetlb_restore_reserve(folio), folio);
	hugetlb_delete_from_page_cache(folio);

	/* Only hole punch drops the reservation for this single index. */
	if (!truncate_op) {
		if (unlikely(hugetlb_unreserve_pages(inode, index,
						     index + 1, 1)))
			hugetlb_fix_reserve_counts(inode);
	}

	folio_unlock(folio);
	return true;
}
/* * remove_inode_hugepages handles two distinct cases: truncation and hole * punch. There are subtle differences in operation for each case. * * truncation is indicated by end of range being LLONG_MAX * In this case, we first scan the range and release found pages. * After releasing pages, hugetlb_unreserve_pages cleans up region/reserve * maps and global counts. Page faults can race with truncation. * During faults, hugetlb_no_page() checks i_size before page allocation, * and again after obtaining page table lock. It will 'back out' * allocations in the truncated range. * hole punch is indicated if end is not LLONG_MAX * In the hole punch case we scan the range and release found pages. * Only when releasing a page is the associated region/reserve map * deleted. The region/reserve map for ranges without associated * pages are not modified. Page faults can race with hole punch. * This is indicated if we find a mapped page. * Note: If the passed end of range value is beyond the end of file, but * not LLONG_MAX this routine still performs a hole punch operation.
*/ staticvoid remove_inode_hugepages(struct inode *inode, loff_t lstart,
loff_t lend)
{ struct hstate *h = hstate_inode(inode); struct address_space *mapping = &inode->i_data; const pgoff_t end = lend >> PAGE_SHIFT; struct folio_batch fbatch;
pgoff_t next, index; int i, freed = 0; bool truncate_op = (lend == LLONG_MAX);
folio_batch_init(&fbatch);
next = lstart >> PAGE_SHIFT; while (filemap_get_folios(mapping, &next, end - 1, &fbatch)) { for (i = 0; i < folio_batch_count(&fbatch); ++i) { struct folio *folio = fbatch.folios[i];
u32 hash = 0;
index = folio->index >> huge_page_order(h);
hash = hugetlb_fault_mutex_hash(mapping, index);
mutex_lock(&hugetlb_fault_mutex_table[hash]);
/* * Remove folio that was part of folio_batch.
*/ if (remove_inode_single_folio(h, inode, mapping, folio,
index, truncate_op))
freed++;
/* * Get the resv_map from the address space embedded in the inode. * This is the address space which points to any resv_map allocated * at inode creation time. If this is a device special inode, * i_mapping may not point to the original address space.
*/
resv_map = (struct resv_map *)(&inode->i_data)->i_private_data; /* Only regular and link inodes have associated reserve maps */ if (resv_map)
resv_map_release(&resv_map->refs);
clear_inode(inode);
}
/* * hole_start and hole_end indicate the full pages within the hole.
*/
hole_start = round_up(offset, hpage_size);
hole_end = round_down(offset + len, hpage_size);
inode_lock(inode);
/* protected by i_rwsem */ if (info->seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE)) {
inode_unlock(inode); return -EPERM;
}
i_mmap_lock_write(mapping);
/* If range starts before first full page, zero partial page. */ if (offset < hole_start)
hugetlbfs_zero_partial_page(h, mapping,
offset, min(offset + len, hole_start));
/* Unmap users of full pages in the hole. */ if (hole_end > hole_start) { if (!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root))
hugetlb_vmdelete_list(&mapping->i_mmap,
hole_start >> PAGE_SHIFT,
hole_end >> PAGE_SHIFT, 0);
}
/* If range extends beyond last full page, zero partial page. */ if ((offset + len) > hole_end && (offset + len) > hole_start)
hugetlbfs_zero_partial_page(h, mapping,
hole_end, offset + len);
i_mmap_unlock_write(mapping);
/* Remove full pages from the file. */ if (hole_end > hole_start)
remove_inode_hugepages(inode, hole_start, hole_end);
/* * Default preallocate case. * For this range, start is rounded down and end is rounded up * as well as being converted to page offsets.
*/
start = offset >> hpage_shift;
end = (offset + len + hpage_size - 1) >> hpage_shift;
inode_lock(inode);
/* We need to check rlimit even when FALLOC_FL_KEEP_SIZE */
error = inode_newsize_ok(inode, offset + len); if (error) goto out;
if ((info->seals & F_SEAL_GROW) && offset + len > inode->i_size) {
error = -EPERM; goto out;
}
/* * Initialize a pseudo vma as this is required by the huge page * allocation routines.
*/
vma_init(&pseudo_vma, mm);
vm_flags_init(&pseudo_vma, VM_HUGETLB | VM_MAYSHARE | VM_SHARED);
pseudo_vma.vm_file = file;
for (index = start; index < end; index++) { /* * This is supposed to be the vaddr where the page is being * faulted in, but we have no vaddr here.
*/ struct folio *folio; unsignedlong addr;
cond_resched();
/* * fallocate(2) manpage permits EINTR; we may have been * interrupted because we are using up too much memory.
*/ if (signal_pending(current)) {
error = -EINTR; break;
}
/* addr is the offset within the file (zero based) */
addr = index * hpage_size;
/* mutex taken here, fault path and hole punch */
hash = hugetlb_fault_mutex_hash(mapping, index);
mutex_lock(&hugetlb_fault_mutex_table[hash]);
/* See if already present in mapping to avoid alloc/free */
folio = filemap_get_folio(mapping, index << huge_page_order(h)); if (!IS_ERR(folio)) {
folio_put(folio);
mutex_unlock(&hugetlb_fault_mutex_table[hash]); continue;
}
/* * Allocate folio without setting the avoid_reserve argument. * There certainly are no reserves associated with the * pseudo_vma. However, there could be shared mappings with * reserves for the file at the inode level. If we fallocate * folios in these areas, we need to consume the reserves * to keep reservation accounting consistent.
*/
folio = alloc_hugetlb_folio(&pseudo_vma, addr, false); if (IS_ERR(folio)) {
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
error = PTR_ERR(folio); goto out;
}
folio_zero_user(folio, addr);
__folio_mark_uptodate(folio);
error = hugetlb_add_to_page_cache(folio, mapping, index); if (unlikely(error)) {
restore_reserve_on_error(h, &pseudo_vma, addr, folio);
folio_put(folio);
mutex_unlock(&hugetlb_fault_mutex_table[hash]); goto out;
}
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
folio_set_hugetlb_migratable(folio); /* * folio_unlock because locked by hugetlb_add_to_page_cache() * folio_put() due to reference from alloc_hugetlb_folio()
*/
folio_unlock(folio);
folio_put(folio);
}
if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size)
i_size_write(inode, offset + len);
inode_set_ctime_current(inode);
out:
inode_unlock(inode);
/* * Hugetlbfs is not reclaimable; therefore its i_mmap_rwsem will never * be taken from reclaim -- unlike regular filesystems. This needs an * annotation because huge_pmd_share() does an allocation under hugetlb's * i_mmap_rwsem.
*/ staticstruct lock_class_key hugetlbfs_i_mmap_rwsem_key;
/* * Reserve maps are only needed for inodes that can have associated * page allocations.
*/ if (S_ISREG(mode) || S_ISLNK(mode)) {
resv_map = resv_map_alloc(); if (!resv_map) return NULL;
}
buf->f_fsid = u64_to_fsid(id);
buf->f_type = HUGETLBFS_MAGIC;
buf->f_bsize = huge_page_size(h); if (sbinfo) {
spin_lock(&sbinfo->stat_lock); /* If no limits set, just report 0 or -1 for max/free/used
* blocks, like simple_statfs() */ if (sbinfo->spool) { long free_pages;
/* * Convert size option passed from command line to number of huge pages * in the pool specified by hstate. Size option could be in bytes * (val_type == SIZE_STD) or percentage of the pool (val_type == SIZE_PERCENT).
*/ staticlong
hugetlbfs_size_to_hpages(struct hstate *h, unsignedlonglong size_opt, enum hugetlbfs_size_type val_type)
{ if (val_type == NO_SIZE) return -1;
switch (opt) { case Opt_uid:
ctx->uid = result.uid; return 0;
case Opt_gid:
ctx->gid = result.gid; return 0;
case Opt_mode:
ctx->mode = result.uint_32 & 01777U; return 0;
case Opt_size: /* memparse() will accept a K/M/G without a digit */ if (!param->string || !isdigit(param->string[0])) goto bad_val;
ctx->max_size_opt = memparse(param->string, &rest);
ctx->max_val_type = SIZE_STD; if (*rest == '%')
ctx->max_val_type = SIZE_PERCENT; return 0;
case Opt_nr_inodes: /* memparse() will accept a K/M/G without a digit */ if (!param->string || !isdigit(param->string[0])) goto bad_val;
ctx->nr_inodes = memparse(param->string, &rest); return 0;
case Opt_pagesize:
ps = memparse(param->string, &rest);
h = size_to_hstate(ps); if (!h) {
pr_err("Unsupported page size %lu MB\n", ps / SZ_1M); return -EINVAL;
}
ctx->hstate = h; return 0;
case Opt_min_size: /* memparse() will accept a K/M/G without a digit */ if (!param->string || !isdigit(param->string[0])) goto bad_val;
ctx->min_size_opt = memparse(param->string, &rest);
ctx->min_val_type = SIZE_STD; if (*rest == '%')
ctx->min_val_type = SIZE_PERCENT; return 0;
default: return -EINVAL;
}
bad_val: return invalfc(fc, "Bad value '%s' for mount option '%s'\n",
param->string, param->key);
}
/* * Use huge page pool size (in hstate) to convert the size * options to number of huge pages. If NO_SIZE, -1 is returned.
*/
ctx->max_hpages = hugetlbfs_size_to_hpages(ctx->hstate,
ctx->max_size_opt,
ctx->max_val_type);
ctx->min_hpages = hugetlbfs_size_to_hpages(ctx->hstate,
ctx->min_size_opt,
ctx->min_val_type);
/* * If max_size was specified, then min_size must be smaller
*/ if (ctx->max_val_type > NO_SIZE &&
ctx->min_hpages > ctx->max_hpages) {
pr_err("Minimum size can not be greater than maximum size\n"); return -EINVAL;
}
/* * Allocate and initialize subpool if maximum or minimum size is * specified. Any needed reservations (for minimum size) are taken * when the subpool is created.
*/ if (ctx->max_hpages != -1 || ctx->min_hpages != -1) {
sbinfo->spool = hugepage_new_subpool(ctx->hstate,
ctx->max_hpages,
ctx->min_hpages); if (!sbinfo->spool) goto out_free;
}
sb->s_maxbytes = MAX_LFS_FILESIZE;
sb->s_blocksize = huge_page_size(ctx->hstate);
sb->s_blocksize_bits = huge_page_shift(ctx->hstate);
sb->s_magic = HUGETLBFS_MAGIC;
sb->s_op = &hugetlbfs_ops;
sb->s_d_flags = DCACHE_DONTCACHE;
sb->s_time_gran = 1;
/* * Due to the special and limited functionality of hugetlbfs, it does * not work well as a stacking filesystem.
*/
sb->s_stack_depth = FILESYSTEM_MAX_STACK_DEPTH;
sb->s_root = d_make_root(hugetlbfs_get_root(sb, ctx)); if (!sb->s_root) goto out_free; return 0;
out_free:
kfree(sbinfo->spool);
kfree(sbinfo); return -ENOMEM;
}
/*
 * Validate the parsed mount context, then build the superblock through
 * get_tree_nodev() with hugetlbfs_fill_super() as the fill callback.
 *
 * Returns the error from hugetlbfs_validate() if it is non-zero, otherwise
 * whatever get_tree_nodev() returns.
 */
static int hugetlbfs_get_tree(struct fs_context *fc)
{
	int err = hugetlbfs_validate(fc);

	if (err)
		return err;
	return get_tree_nodev(fc, hugetlbfs_fill_super);
}
/* * Note that size should be aligned to proper hugepage size in caller side, * otherwise hugetlb_reserve_pages reserves one less hugepages than intended.
*/ struct file *hugetlb_file_setup(constchar *name, size_t size,
vm_flags_t acctflag, int creat_flags, int page_size_log)
{ struct inode *inode; struct vfsmount *mnt; int hstate_idx; struct file *file;
hstate_idx = get_hstate_idx(page_size_log); if (hstate_idx < 0) return ERR_PTR(-ENODEV);
mnt = hugetlbfs_vfsmount[hstate_idx]; if (!mnt) return ERR_PTR(-ENOENT);
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereitgestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.