/*
 * Initialise every per-hash wait queue head in wait_table so DAX entry
 * lock waiters have somewhere to sleep.  Runs once at boot via fs_initcall.
 */
static int __init init_dax_wait_table(void)
{
	int i;

	for (i = 0; i < DAX_WAIT_TABLE_ENTRIES; i++)
		init_waitqueue_head(wait_table + i);
	return 0;
}
fs_initcall(init_dax_wait_table);
/*
 * DAX pagecache entries use XArray value entries so they can't be mistaken
 * for pages.  We use one bit for locking, one bit for the entry size (PMD)
 * and two more to tell us if the entry is a zero page or an empty entry that
 * is just used for locking.  In total four special bits.
 *
 * If the PMD bit isn't set the entry has size PAGE_SIZE, and if the ZERO_PAGE
 * and EMPTY bits aren't set the entry is a normal DAX entry with a filesystem
 * block allocation.
 */
#define DAX_SHIFT	(4)
#define DAX_LOCKED	(1UL << 0)
#define DAX_PMD		(1UL << 1)
#define DAX_ZERO_PAGE	(1UL << 2)
#define DAX_EMPTY	(1UL << 3)
/* * true if the entry that was found is of a smaller order than the entry * we were looking for
*/ staticbool dax_is_conflict(void *entry)
{ return entry == XA_RETRY_ENTRY;
}
/**
 * enum dax_wake_mode: waitqueue wakeup behaviour
 * @WAKE_ALL: wake all waiters in the waitqueue
 * @WAKE_NEXT: wake only the first waiter in the waitqueue
 */
enum dax_wake_mode {
	WAKE_ALL,
	WAKE_NEXT,
};
/* * If 'entry' is a PMD, align the 'index' that we use for the wait * queue to the start of that PMD. This ensures that all offsets in * the range covered by the PMD map to the same bit lock.
*/ if (dax_is_pmd_entry(entry))
index &= ~PG_PMD_COLOUR;
key->xa = xas->xa;
key->entry_start = index;
/*
 * @entry may no longer be the entry at the index in the mapping.
 * The important information it's conveying is whether the entry at
 * this index used to be a PMD entry.
 */
static void dax_wake_entry(struct xa_state *xas, void *entry,
			   enum dax_wake_mode mode)
{
	struct exceptional_entry_key key;
	wait_queue_head_t *wq;

	wq = dax_entry_waitqueue(xas, entry, &key);

	/*
	 * Checking for locked entry and prepare_to_wait_exclusive() happens
	 * under the i_pages lock, ditto for entry handling in our callers.
	 * So at this point all tasks that could have seen our entry locked
	 * must be in the waitqueue and the following check will see them.
	 */
	if (waitqueue_active(wq))
		__wake_up(wq, TASK_NORMAL, mode == WAKE_ALL ? 0 : 1, &key);
}
/* * Look up entry in page cache, wait for it to become unlocked if it * is a DAX entry and return it. The caller must subsequently call * put_unlocked_entry() if it did not lock the entry or dax_unlock_entry() * if it did. The entry returned may have a larger order than @order. * If @order is larger than the order of the entry found in i_pages, this * function returns a dax_is_conflict entry. * * Must be called with the i_pages lock held.
*/ staticvoid *get_next_unlocked_entry(struct xa_state *xas, unsignedint order)
{ void *entry; struct wait_exceptional_entry_queue ewait;
wait_queue_head_t *wq;
/* * Wait for the given entry to become unlocked. Caller must hold the i_pages * lock and call either put_unlocked_entry() if it did not lock the entry or * dax_unlock_entry() if it did. Returns an unlocked entry if still present.
*/ staticvoid *wait_entry_unlocked_exclusive(struct xa_state *xas, void *entry)
{ struct wait_exceptional_entry_queue ewait;
wait_queue_head_t *wq;
/*
 * The only thing keeping the address space around is the i_pages lock
 * (it's cycled in clear_inode() after removing the entries from i_pages)
 * After we call xas_unlock_irq(), we cannot touch xas->xa.
 */
static void wait_entry_unlocked(struct xa_state *xas, void *entry)
{
	struct wait_exceptional_entry_queue ewait;
	wait_queue_head_t *wq;

	/*
	 * NOTE(review): ewait.wait is handed to prepare_to_wait() without any
	 * visible initialisation (no init_wait()/wake-function setup appears
	 * in this copy).  That looks like lines lost in transcription --
	 * confirm against the upstream source before relying on this path.
	 */
	wq = dax_entry_waitqueue(xas, entry, &ewait.key);
	/*
	 * Unlike get_next_unlocked_entry() there is no guarantee that this
	 * path ever successfully retrieves an unlocked entry before an
	 * inode dies. Perform a non-exclusive wait in case this path
	 * never successfully performs its own wake up.
	 */
	prepare_to_wait(wq, &ewait.wait, TASK_UNINTERRUPTIBLE);
	xas_unlock_irq(xas);
	schedule();
	finish_wait(wq, &ewait.wait);
}
/* * We used the xa_state to get the entry, but then we locked the entry and * dropped the xa_lock, so we know the xa_state is stale and must be reset * before use.
*/ staticvoid dax_unlock_entry(struct xa_state *xas, void *entry)
{ void *old;
/*
 * Return: The entry stored at this location before it was locked.
 */
static void *dax_lock_entry(struct xa_state *xas, void *entry)
{
	unsigned long v = xa_to_value(entry);

	/* Store the value back with the lock bit set; xas_store() hands us the old entry. */
	return xas_store(xas, xa_mk_value(v | DAX_LOCKED));
}
/* * A DAX folio is considered shared if it has no mapping set and ->share (which * shares the ->index field) is non-zero. Note this may return false even if the * page is shared between multiple files but has not yet actually been mapped * into multiple address spaces.
*/ staticinlinebool dax_folio_is_shared(struct folio *folio)
{ return !folio->mapping && folio->share;
}
/*
 * When it is called by dax_insert_entry(), the shared flag will indicate
 * whether this entry is shared by multiple files.  If the page has not
 * previously been associated with any mappings the ->mapping and ->index
 * fields will be set.  If it has already been associated with a mapping
 * the mapping will be cleared and the share count set.  It's then up to
 * reverse map users like memory_failure() to call back into the filesystem
 * to recover ->mapping and ->index information.  For example by implementing
 * dax_holder_operations.
 */
static void dax_folio_make_shared(struct folio *folio)
{
	/*
	 * folio is not currently shared so mark it as shared by clearing
	 * folio->mapping.
	 */
	folio->mapping = NULL;

	/*
	 * folio has previously been mapped into one address space so set the
	 * share count.
	 */
	folio->share = 1;
}
/*
 * Drop one use of a DAX folio.  For a shared folio this decrements and
 * returns the remaining ->share count.  When the last use goes away a
 * compound (PMD-sized) folio is split back into independent order-0 pages
 * with their head/compound state and pgmap reset.
 *
 * Return: the remaining share count (0 once the folio is no longer in use).
 */
static inline unsigned long dax_folio_put(struct folio *folio)
{
	unsigned long ref;
	int order, i;

	if (!dax_folio_is_shared(folio))
		ref = 0;
	else
		ref = --folio->share;

	if (ref)
		return ref;

	folio->mapping = NULL;
	order = folio_order(folio);
	if (!order)
		return 0;
	folio_reset_order(folio);

	for (i = 0; i < (1UL << order); i++) {
		struct dev_pagemap *pgmap = page_pgmap(&folio->page);
		struct page *page = folio_page(folio, i);
		struct folio *new_folio = (struct folio *)page;

		ClearPageHead(page);
		clear_compound_head(page);

		new_folio->mapping = NULL;
		/*
		 * Reset pgmap which was over-written by
		 * prep_compound_page().
		 */
		new_folio->pgmap = pgmap;
		new_folio->share = 0;
		WARN_ON_ONCE(folio_ref_count(new_folio));
	}

	return ref;
}
/*
 * Prepare the folio backing @entry for use: PMD-sized entries are rebuilt
 * into a compound page of the entry's order.
 */
static void dax_folio_init(void *entry)
{
	struct folio *folio = dax_to_folio(entry);
	int order = dax_entry_order(entry);

	/*
	 * Folio should have been split back to order-0 pages in
	 * dax_folio_put() when they were removed from their
	 * final mapping.
	 */
	WARN_ON_ONCE(folio_order(folio));

	if (order > 0) {
		prep_compound_page(&folio->page, order);
		if (order > 1)
			INIT_LIST_HEAD(&folio->_deferred_list);
		WARN_ON_ONCE(folio_ref_count(folio));
	}
}
if (dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) return NULL;
if (folio_ref_count(folio) - folio_mapcount(folio)) return &folio->page; else return NULL;
}
/** * dax_lock_folio - Lock the DAX entry corresponding to a folio * @folio: The folio whose entry we want to lock * * Context: Process context. * Return: A cookie to pass to dax_unlock_folio() or 0 if the entry could * not be locked.
*/
dax_entry_t dax_lock_folio(struct folio *folio)
{
XA_STATE(xas, NULL, 0); void *entry;
/* Ensure folio->mapping isn't freed while we look at it */
rcu_read_lock(); for (;;) { struct address_space *mapping = READ_ONCE(folio->mapping);
entry = NULL; if (!mapping || !dax_mapping(mapping)) break;
/* * In the device-dax case there's no need to lock, a * struct dev_pagemap pin is sufficient to keep the * inode alive, and we assume we have dev_pagemap pin * otherwise we would not have a valid pfn_to_page() * translation.
*/
entry = (void *)~0UL; if (S_ISCHR(mapping->host->i_mode)) break;
/*
 * dax_lock_mapping_entry - Lock the DAX entry corresponding to a mapping
 * @mapping: the file's mapping whose entry we want to lock
 * @index: the offset within this file
 * @page: output the dax page corresponding to this dax entry
 *
 * Return: A cookie to pass to dax_unlock_mapping_entry() or 0 if the entry
 * could not be locked.
 */
dax_entry_t dax_lock_mapping_entry(struct address_space *mapping, pgoff_t index,
				   struct page **page)
{
	XA_STATE(xas, NULL, 0);
	void *entry;

	rcu_read_lock();
	for (;;) {
		entry = NULL;
		if (!dax_mapping(mapping))
			break;

		xas.xa = &mapping->i_pages;
		xas_lock_irq(&xas);
		xas_set(&xas, index);
		entry = xas_load(&xas);
		if (dax_is_locked(entry)) {
			/* Someone else holds the entry lock: sleep and retry. */
			rcu_read_unlock();
			wait_entry_unlocked(&xas, entry);
			rcu_read_lock();
			continue;
		}
		if (!entry ||
		    dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) {
			/*
			 * Because we are looking for entry from file's mapping
			 * and index, so the entry may not be inserted for now,
			 * or even a zero/empty entry. We don't think this is
			 * an error case. So, return a special value and do
			 * not output @page.
			 */
			entry = (void *)~0UL;
		} else {
			*page = pfn_to_page(dax_to_pfn(entry));
			dax_lock_entry(&xas, entry);
		}
		xas_unlock_irq(&xas);
		break;
	}
	rcu_read_unlock();
	return (dax_entry_t)entry;
}
/* * Find page cache entry at given index. If it is a DAX entry, return it * with the entry locked. If the page cache doesn't contain an entry at * that index, add a locked empty entry. * * When requesting an entry with size DAX_PMD, grab_mapping_entry() will * either return that locked entry or will return VM_FAULT_FALLBACK. * This will happen if there are any PTE entries within the PMD range * that we are requesting. * * We always favor PTE entries over PMD entries. There isn't a flow where we * evict PTE entries in order to 'upgrade' them to a PMD entry. A PMD * insertion will fail if it finds any PTE entries already in the tree, and a * PTE insertion will cause an existing PMD entry to be unmapped and * downgraded to PTE entries. This happens for both PMD zero pages as * well as PMD empty entries. * * The exception to this downgrade path is for PMD entries that have * real storage backing them. We will leave these real PMD entries in * the tree, and PTE writes will simply dirty the entire PMD entry. * * Note: Unlike filemap_fault() we don't honor FAULT_FLAG_RETRY flags. For * persistent memory the benefit is doubtful. We can add that later if we can * show it helps. * * On error, this function does not return an ERR_PTR. Instead it returns * a VM_FAULT code, encoded as an xarray internal entry. The ERR_PTR values * overlap with xarray value entries.
*/ staticvoid *grab_mapping_entry(struct xa_state *xas, struct address_space *mapping, unsignedint order)
{ unsignedlong index = xas->xa_index; bool pmd_downgrade; /* splitting PMD entry into PTE entries? */ void *entry;
if (entry) { if (dax_is_conflict(entry)) goto fallback; if (!xa_is_value(entry)) {
xas_set_err(xas, -EIO); goto out_unlock;
}
if (order == 0) { if (dax_is_pmd_entry(entry) &&
(dax_is_zero_entry(entry) ||
dax_is_empty_entry(entry))) {
pmd_downgrade = true;
}
}
}
if (pmd_downgrade) { /* * Make sure 'entry' remains valid while we drop * the i_pages lock.
*/
dax_lock_entry(xas, entry);
/* * Besides huge zero pages the only other thing that gets * downgraded are empty entries which don't need to be * unmapped.
*/ if (dax_is_zero_entry(entry)) {
xas_unlock_irq(xas);
unmap_mapping_pages(mapping,
xas->xa_index & ~PG_PMD_COLOUR,
PG_PMD_NR, false);
xas_reset(xas);
xas_lock_irq(xas);
}
out_unlock:
xas_unlock_irq(xas); if (xas_nomem(xas, mapping_gfp_mask(mapping) & ~__GFP_HIGHMEM)) goto retry; if (xas->xa_node == XA_ERROR(-ENOMEM)) return xa_mk_internal(VM_FAULT_OOM); if (xas_error(xas)) return xa_mk_internal(VM_FAULT_SIGBUS); return entry;
fallback:
xas_unlock_irq(xas); return xa_mk_internal(VM_FAULT_FALLBACK);
}
/** * dax_layout_busy_page_range - find first pinned page in @mapping * @mapping: address space to scan for a page with ref count > 1 * @start: Starting offset. Page containing 'start' is included. * @end: End offset. Page containing 'end' is included. If 'end' is LLONG_MAX, * pages from 'start' till the end of file are included. * * DAX requires ZONE_DEVICE mapped pages. These pages are never * 'onlined' to the page allocator so they are considered idle when * page->count == 1. A filesystem uses this interface to determine if * any page in the mapping is busy, i.e. for DMA, or other * get_user_pages() usages. * * It is expected that the filesystem is holding locks to block the * establishment of new mappings in this address_space. I.e. it expects * to be able to run unmap_mapping_range() and subsequently not race * mapping_mapped() becoming true.
*/ struct page *dax_layout_busy_page_range(struct address_space *mapping,
loff_t start, loff_t end)
{ void *entry; unsignedint scanned = 0; struct page *page = NULL;
pgoff_t start_idx = start >> PAGE_SHIFT;
pgoff_t end_idx;
XA_STATE(xas, &mapping->i_pages, start_idx);
if (!dax_mapping(mapping)) return NULL;
/* If end == LLONG_MAX, all pages from start to till end of file */ if (end == LLONG_MAX)
end_idx = ULONG_MAX; else
end_idx = end >> PAGE_SHIFT; /* * If we race get_user_pages_fast() here either we'll see the * elevated page count in the iteration and wait, or * get_user_pages_fast() will see that the page it took a reference * against is no longer mapped in the page tables and bail to the * get_user_pages() slow path. The slow path is protected by * pte_lock() and pmd_lock(). New references are not taken without * holding those locks, and unmap_mapping_pages() will not zero the * pte or pmd without holding the respective lock, so we are * guaranteed to either see new references or prevent new * references from being established.
*/
unmap_mapping_pages(mapping, start_idx, end_idx - start_idx + 1, 0);
xas_lock_irq(&xas);
xas_for_each(&xas, entry, end_idx) { if (WARN_ON_ONCE(!xa_is_value(entry))) continue;
entry = wait_entry_unlocked_exclusive(&xas, entry); if (entry)
page = dax_busy_page(entry);
put_unlocked_entry(&xas, entry, WAKE_NEXT); if (page) break; if (++scanned % XA_CHECK_SCHED) continue;
/*
 * Delete DAX entry at @index from @mapping.  Wait for it
 * to be unlocked before deleting it.
 */
int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index)
{
	int ret = __dax_invalidate_entry(mapping, index, true);

	/*
	 * This gets called from truncate / punch_hole path. As such, the caller
	 * must hold locks protecting against concurrent modifications of the
	 * page cache (usually fs-private i_mmap_sem for writing). Since the
	 * caller has seen a DAX entry for this index, we better find it
	 * at that index as well...
	 */
	WARN_ON_ONCE(!ret);
	return ret;
}
/* * Unmaps the inode and waits for any DMA to complete prior to deleting the * DAX mapping entries for the range. * * For NOWAIT behavior, pass @cb as NULL to early-exit on first found * busy page
*/ int dax_break_layout(struct inode *inode, loff_t start, loff_t end, void (cb)(struct inode *))
{ struct page *page; int error = 0;
if (!dax_mapping(inode->i_mapping)) return 0;
do {
page = dax_layout_busy_page_range(inode->i_mapping, start, end); if (!page) break; if (!cb) {
error = -ERESTARTSYS; break;
}
error = wait_page_idle(page, cb, inode);
} while (error == 0);
if (!page)
dax_delete_mapping_range(inode->i_mapping, start, end);
/* * MAP_SYNC on a dax mapping guarantees dirty metadata is * flushed on write-faults (non-cow), but not read-faults.
*/ staticbool dax_fault_is_synchronous(conststruct iomap_iter *iter, struct vm_area_struct *vma)
{ return (iter->flags & IOMAP_WRITE) && (vma->vm_flags & VM_SYNC) &&
(iter->iomap.flags & IOMAP_F_DIRTY);
}
/* * By this point grab_mapping_entry() has ensured that we have a locked entry * of the appropriate size so we don't have to worry about downgrading PMDs to * PTEs. If we happen to be trying to insert a PTE and there is a PMD * already in the tree, we will skip the insertion and just dirty the PMD as * appropriate.
*/ staticvoid *dax_insert_entry(struct xa_state *xas, struct vm_fault *vmf, conststruct iomap_iter *iter, void *entry, unsignedlong pfn, unsignedlong flags)
{ struct address_space *mapping = vmf->vma->vm_file->f_mapping; void *new_entry = dax_make_entry(pfn, flags); bool write = iter->flags & IOMAP_WRITE; bool dirty = write && !dax_fault_is_synchronous(iter, vmf->vma); bool shared = iter->iomap.flags & IOMAP_F_SHARED;
if (dirty)
__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
if (shared || (dax_is_zero_entry(entry) && !(flags & DAX_ZERO_PAGE))) { unsignedlong index = xas->xa_index; /* we are replacing a zero page with block mapping */ if (dax_is_pmd_entry(entry))
unmap_mapping_pages(mapping, index & ~PG_PMD_COLOUR,
PG_PMD_NR, false); else/* pte entry */
unmap_mapping_pages(mapping, index, 1, false);
}
/* * Only swap our new entry into the page cache if the current * entry is a zero page or an empty entry. If a normal PTE or * PMD entry is already in the cache, we leave it alone. This * means that if we are trying to insert a PTE and the * existing entry is a PMD, we will just leave the PMD in the * tree and dirty it if necessary.
*/
old = dax_lock_entry(xas, new_entry);
WARN_ON_ONCE(old != xa_mk_value(xa_to_value(entry) |
DAX_LOCKED));
entry = new_entry;
} else {
xas_load(xas); /* Walk the xa_state */
}
if (dirty)
xas_set_mark(xas, PAGECACHE_TAG_DIRTY);
if (write && shared)
xas_set_mark(xas, PAGECACHE_TAG_TOWRITE);
/* * A page got tagged dirty in DAX mapping? Something is seriously * wrong.
*/ if (WARN_ON(!xa_is_value(entry))) return -EIO;
if (unlikely(dax_is_locked(entry))) { void *old_entry = entry;
entry = get_next_unlocked_entry(xas, 0);
/* Entry got punched out / reallocated? */ if (!entry || WARN_ON_ONCE(!xa_is_value(entry))) goto put_unlocked; /* * Entry got reallocated elsewhere? No need to writeback. * We have to compare pfns as we must not bail out due to * difference in lockbit or entry type.
*/ if (dax_to_pfn(old_entry) != dax_to_pfn(entry)) goto put_unlocked; if (WARN_ON_ONCE(dax_is_empty_entry(entry) ||
dax_is_zero_entry(entry))) {
ret = -EIO; goto put_unlocked;
}
/* Another fsync thread may have already done this entry */ if (!xas_get_mark(xas, PAGECACHE_TAG_TOWRITE)) goto put_unlocked;
}
/* Lock the entry to serialize with page faults */
dax_lock_entry(xas, entry);
/* * We can clear the tag now but we have to be careful so that concurrent * dax_writeback_one() calls for the same index cannot finish before we * actually flush the caches. This is achieved as the calls will look * at the entry only under the i_pages lock and once they do that * they will see the entry locked and wait for it to unlock.
*/
xas_clear_mark(xas, PAGECACHE_TAG_TOWRITE);
xas_unlock_irq(xas);
/* * If dax_writeback_mapping_range() was given a wbc->range_start * in the middle of a PMD, the 'index' we use needs to be * aligned to the start of the PMD. * This allows us to flush for PMD_SIZE and not have to worry about * partial PMD writebacks.
*/
pfn = dax_to_pfn(entry);
count = 1UL << dax_entry_order(entry);
index = xas->xa_index & ~(count - 1);
end = index + count - 1;
/* Walk all mappings of a given index of a file and writeprotect them */
i_mmap_lock_read(mapping);
vma_interval_tree_foreach(vma, &mapping->i_mmap, index, end) {
pfn_mkclean_range(pfn, count, index, vma);
cond_resched();
}
i_mmap_unlock_read(mapping);
dax_flush(dax_dev, page_address(pfn_to_page(pfn)), count * PAGE_SIZE); /* * After we have flushed the cache, we can clear the dirty tag. There * cannot be new dirty data in the pfn after the flush has completed as * the pfn mappings are writeprotected and fault waits for mapping * entry lock.
*/
xas_reset(xas);
xas_lock_irq(xas);
xas_store(xas, entry);
xas_clear_mark(xas, PAGECACHE_TAG_DIRTY);
dax_wake_entry(xas, entry, WAKE_NEXT);
/* * Flush the mapping to the persistent domain within the byte range of [start, * end]. This is required by data integrity operations to ensure file data is * on persistent storage prior to completion of the operation.
*/ int dax_writeback_mapping_range(struct address_space *mapping, struct dax_device *dax_dev, struct writeback_control *wbc)
{
XA_STATE(xas, &mapping->i_pages, wbc->range_start >> PAGE_SHIFT); struct inode *inode = mapping->host;
pgoff_t end_index = wbc->range_end >> PAGE_SHIFT; void *entry; int ret = 0; unsignedint scanned = 0;
if (WARN_ON_ONCE(inode->i_blkbits != PAGE_SHIFT)) return -EIO;
if (mapping_empty(mapping) || wbc->sync_mode != WB_SYNC_ALL) return 0;
id = dax_read_lock();
length = dax_direct_access(iomap->dax_dev, pgoff, PHYS_PFN(size),
DAX_ACCESS, kaddr, pfnp); if (length < 0) {
rc = length; goto out;
} if (!pfnp) goto out_check_addr;
rc = -EINVAL; if (PFN_PHYS(length) < size) goto out; if (*pfnp & (PHYS_PFN(size)-1)) goto out;
rc = 0;
out_check_addr: if (!kaddr) goto out; if (!*kaddr)
rc = -EFAULT;
out:
dax_read_unlock(id); return rc;
}
/**
 * dax_iomap_copy_around - Prepare for an unaligned write to a shared/cow page
 * @pos: address to do copy from.
 * @length: size of copy operation.
 * @align_size: aligned w.r.t align_size (either PMD_SIZE or PAGE_SIZE)
 * @srcmap: iomap srcmap
 * @daddr: destination address to copy to.
 *
 * This can be called from two places. Either during DAX write fault (page
 * aligned), to copy the length size data to daddr. Or, while doing normal DAX
 * write operation, dax_iomap_iter() might call this to do the copy of either
 * start or end unaligned address. In the latter case the rest of the copy of
 * aligned ranges is taken care by dax_iomap_iter() itself.
 * If the srcmap contains invalid data, such as HOLE and UNWRITTEN, zero the
 * area to make sure no old data remains.
 */
static int dax_iomap_copy_around(loff_t pos, uint64_t length, size_t align_size,
				 const struct iomap *srcmap, void *daddr)
{
	loff_t head_off = pos & (align_size - 1);
	size_t size = ALIGN(head_off + length, align_size);
	loff_t end = pos + length;
	loff_t pg_end = round_up(end, align_size);
	/* copy_all is usually in page fault case */
	bool copy_all = head_off == 0 && end == pg_end;
	/* zero the edges if srcmap is a HOLE or IOMAP_UNWRITTEN */
	bool zero_edge = srcmap->flags & IOMAP_F_SHARED ||
			 srcmap->type == IOMAP_UNWRITTEN;
	void *saddr = NULL;
	int ret = 0;

	if (!zero_edge) {
		ret = dax_iomap_direct_access(srcmap, pos, size, &saddr, NULL);
		if (ret)
			return dax_mem2blk_err(ret);
	}

	if (copy_all) {
		if (zero_edge)
			memset(daddr, 0, size);
		else
			ret = copy_mc_to_kernel(daddr, saddr, length);
		goto out;
	}

	/* Copy the head part of the range */
	if (head_off) {
		if (zero_edge)
			memset(daddr, 0, head_off);
		else {
			ret = copy_mc_to_kernel(daddr, saddr, head_off);
			if (ret)
				return -EIO;
		}
	}

	/* Copy the tail part of the range */
	if (end < pg_end) {
		loff_t tail_off = head_off + length;
		loff_t tail_len = pg_end - end;

		if (zero_edge)
			memset(daddr + tail_off, 0, tail_len);
		else {
			ret = copy_mc_to_kernel(daddr + tail_off,
						saddr + tail_off, tail_len);
			if (ret)
				return -EIO;
		}
	}

out:
	if (zero_edge)
		dax_flush(srcmap->dax_dev, daddr, size);
	return ret ? -EIO : 0;
}
/* * The user has performed a load from a hole in the file. Allocating a new * page in the file would cause excessive storage usage for workloads with * sparse files. Instead we insert a read-only mapping of the 4k zero page. * If this page is ever written to we will re-fault and change the mapping to * point to real DAX storage instead.
*/ static vm_fault_t dax_load_hole(struct xa_state *xas, struct vm_fault *vmf, conststruct iomap_iter *iter, void **entry)
{ struct inode *inode = iter->inode; unsignedlong vaddr = vmf->address; unsignedlong pfn = my_zero_pfn(vaddr);
vm_fault_t ret;
if (!iomap_want_unshare_iter(iter)) return iomap_iter_advance_full(iter);
/* * Extend the file range to be aligned to fsblock/pagesize, because * we need to copy entire blocks, not just the byte range specified. * Invalidate the mapping because we're about to CoW.
*/
mod = offset_in_page(copy_pos); if (mod) {
copy_len += mod;
copy_pos -= mod;
}
mod = offset_in_page(copy_pos + copy_len); if (mod)
copy_len += PAGE_SIZE - mod;
/* * invalidate the pages whose sharing state is to be changed * because of CoW.
*/ if (iomap->flags & IOMAP_F_SHARED)
invalidate_inode_pages2_range(iter->inode->i_mapping,
iter->pos >> PAGE_SHIFT,
(iter->pos + length - 1) >> PAGE_SHIFT);
do {
loff_t pos = iter->pos; unsigned offset = offset_in_page(pos);
pgoff_t pgoff = dax_iomap_pgoff(iomap, pos); int id;
length = min_t(u64, PAGE_SIZE - offset, length);
id = dax_read_lock(); if (IS_ALIGNED(pos, PAGE_SIZE) && length == PAGE_SIZE)
ret = dax_zero_page_range(iomap->dax_dev, pgoff, 1); else
ret = dax_memzero(iter, pos, length);
dax_read_unlock(id);
if (ret < 0) return ret;
ret = iomap_iter_advance(iter, &length); if (ret) return ret;
} while (length > 0);
/* * In DAX mode, enforce either pure overwrites of written extents, or * writes to unwritten extents as part of a copy-on-write operation.
*/ if (WARN_ON_ONCE(iomap->type != IOMAP_MAPPED &&
!(iomap->flags & IOMAP_F_SHARED))) return -EIO;
/* * Write can allocate block for an area which has a hole page mapped * into page tables. We have to tear down these mappings so that data * written by write(2) is visible in mmap.
*/ if (iomap->flags & IOMAP_F_NEW || cow) { /* * Filesystem allows CoW on non-shared extents. The src extents * may have been mmapped with dirty mark before. To be able to * invalidate its dax entries, we need to clear the dirty mark * in advance.
*/ if (cow)
__dax_clear_dirty_range(iomi->inode->i_mapping,
pos >> PAGE_SHIFT,
(end - 1) >> PAGE_SHIFT);
invalidate_inode_pages2_range(iomi->inode->i_mapping,
pos >> PAGE_SHIFT,
(end - 1) >> PAGE_SHIFT);
}
length = xfer;
ret = iomap_iter_advance(iomi, &length); if (!ret && xfer == 0)
ret = -EFAULT; if (xfer < map_len) break;
}
dax_read_unlock(id);
return ret;
}
/** * dax_iomap_rw - Perform I/O to a DAX file * @iocb: The control block for this I/O * @iter: The addresses to do I/O from or to * @ops: iomap ops passed from the file system * * This function performs read and write operations to directly mapped * persistent memory. The callers needs to take care of read/write exclusion * and evicting any page cache pages in the region under I/O.
*/
ssize_t
dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter, conststruct iomap_ops *ops)
{ struct iomap_iter iomi = {
.inode = iocb->ki_filp->f_mapping->host,
.pos = iocb->ki_pos,
.len = iov_iter_count(iter),
.flags = IOMAP_DAX,
};
loff_t done = 0; int ret;
if (WARN_ON_ONCE(iocb->ki_flags & IOCB_ATOMIC)) return -EIO;
/* * When handling a synchronous page fault and the inode need a fsync, we can * insert the PTE/PMD into page tables only after that fsync happened. Skip * insertion for now and return the pfn so that caller can insert it after the * fsync is done.
*/ static vm_fault_t dax_fault_synchronous_pfnp(unsignedlong *pfnp, unsignedlong pfn)
{ if (WARN_ON_ONCE(!pfnp)) return VM_FAULT_SIGBUS;
*pfnp = pfn; return VM_FAULT_NEEDDSYNC;
}
if (write && iomap->flags & IOMAP_F_SHARED) {
err = dax_iomap_copy_around(pos, size, size, srcmap, kaddr); if (err) return dax_fault_return(err);
}
folio = dax_to_folio(*entry); if (dax_fault_is_synchronous(iter, vmf->vma)) return dax_fault_synchronous_pfnp(pfnp, pfn);
folio_ref_inc(folio); if (pmd)
ret = vmf_insert_folio_pmd(vmf, pfn_folio(pfn), write); else
ret = vmf_insert_page_mkwrite(vmf, pfn_to_page(pfn), write);
folio_put(folio);
trace_dax_pte_fault(iter.inode, vmf, ret); /* * Check whether offset isn't beyond end of file now. Caller is supposed * to hold locks serializing us with truncate / punch hole so this is * a reliable test.
*/ if (iter.pos >= i_size_read(iter.inode)) {
ret = VM_FAULT_SIGBUS; goto out;
}
if ((vmf->flags & FAULT_FLAG_WRITE) && !vmf->cow_page)
iter.flags |= IOMAP_WRITE;
entry = grab_mapping_entry(&xas, mapping, 0); if (xa_is_internal(entry)) {
ret = xa_to_internal(entry); goto out;
}
/* * It is possible, particularly with mixed reads & writes to private * mappings, that we have raced with a PMD fault that overlaps with * the PTE we need to set up. If so just return and the fault will be * retried.
*/ if (pmd_trans_huge(*vmf->pmd)) {
ret = VM_FAULT_NOPAGE; goto unlock_entry;
}
/* * Make sure that the faulting address's PMD offset (color) matches * the PMD offset from the start of the file. This is necessary so * that a PMD range in the page table overlaps exactly with a PMD * range in the page cache.
*/ if ((vmf->pgoff & PG_PMD_COLOUR) !=
((vmf->address >> PAGE_SHIFT) & PG_PMD_COLOUR)) returntrue;
/* Fall back to PTEs if we're going to COW */ if (write && !(vmf->vma->vm_flags & VM_SHARED)) returntrue;
/* If the PMD would extend outside the VMA */ if (pmd_addr < vmf->vma->vm_start) returntrue; if ((pmd_addr + PMD_SIZE) > vmf->vma->vm_end) returntrue;
/* If the PMD would extend beyond the file size */ if ((xas->xa_index | PG_PMD_COLOUR) >= max_pgoff) returntrue;
if (vmf->flags & FAULT_FLAG_WRITE)
iter.flags |= IOMAP_WRITE;
/* * Check whether offset isn't beyond end of file now. Caller is * supposed to hold locks serializing us with truncate / punch hole so * this is a reliable test.
*/
max_pgoff = DIV_ROUND_UP(i_size_read(iter.inode), PAGE_SIZE);
if (xas.xa_index >= max_pgoff) {
ret = VM_FAULT_SIGBUS; goto out;
}
if (dax_fault_check_fallback(vmf, &xas, max_pgoff)) goto fallback;
/* * grab_mapping_entry() will make sure we get an empty PMD entry, * a zero PMD entry or a DAX PMD. If it can't (because a PTE * entry is already in the array, for instance), it will return * VM_FAULT_FALLBACK.
*/
entry = grab_mapping_entry(&xas, mapping, PMD_ORDER); if (xa_is_internal(entry)) {
ret = xa_to_internal(entry); goto fallback;
}
/* * It is possible, particularly with mixed reads & writes to private * mappings, that we have raced with a PTE fault that overlaps with * the PMD we need to set up. If so just return and the fault will be * retried.
*/ if (!pmd_none(*vmf->pmd) && !pmd_trans_huge(*vmf->pmd)) {
ret = 0; goto unlock_entry;
}
iter.pos = (loff_t)xas.xa_index << PAGE_SHIFT; while (iomap_iter(&iter, ops) > 0) { if (iomap_length(&iter) < PMD_SIZE) continue; /* actually breaks out of the loop */
/** * dax_iomap_fault - handle a page fault on a DAX file * @vmf: The description of the fault * @order: Order of the page to fault in * @pfnp: PFN to insert for synchronous faults if fsync is required * @iomap_errp: Storage for detailed error code in case of error * @ops: Iomap ops passed from the file system * * When a page fault occurs, filesystems may call this helper in * their fault handler for DAX files. dax_iomap_fault() assumes the caller * has done all the necessary locking for page fault to proceed * successfully.
*/
vm_fault_t dax_iomap_fault(struct vm_fault *vmf, unsignedint order, unsignedlong *pfnp, int *iomap_errp, conststruct iomap_ops *ops)
{ if (order == 0) return dax_iomap_pte_fault(vmf, pfnp, iomap_errp, ops); elseif (order == PMD_ORDER) return dax_iomap_pmd_fault(vmf, pfnp, ops); else return VM_FAULT_FALLBACK;
}
EXPORT_SYMBOL_GPL(dax_iomap_fault);
/* * dax_insert_pfn_mkwrite - insert PTE or PMD entry into page tables * @vmf: The description of the fault * @pfn: PFN to insert * @order: Order of entry to insert. * * This function inserts a writeable PTE or PMD entry into the page tables * for an mmaped DAX file. It also marks the page cache entry as dirty.
*/ static vm_fault_t dax_insert_pfn_mkwrite(struct vm_fault *vmf, unsignedlong pfn, unsignedint order)
{ struct address_space *mapping = vmf->vma->vm_file->f_mapping;
XA_STATE_ORDER(xas, &mapping->i_pages, vmf->pgoff, order); struct folio *folio; void *entry;
vm_fault_t ret;
xas_lock_irq(&xas);
entry = get_next_unlocked_entry(&xas, order); /* Did we race with someone splitting entry or so? */ if (!entry || dax_is_conflict(entry) ||
(order == 0 && !dax_is_pte_entry(entry))) {
put_unlocked_entry(&xas, entry, WAKE_NEXT);
xas_unlock_irq(&xas);
trace_dax_insert_pfn_mkwrite_no_entry(mapping->host, vmf,
VM_FAULT_NOPAGE); return VM_FAULT_NOPAGE;
}
xas_set_mark(&xas, PAGECACHE_TAG_DIRTY);
dax_lock_entry(&xas, entry);
xas_unlock_irq(&xas);
folio = pfn_folio(pfn);
folio_ref_inc(folio); if (order == 0)
ret = vmf_insert_page_mkwrite(vmf, &folio->page, true); #ifdef CONFIG_FS_DAX_PMD elseif (order == PMD_ORDER)
ret = vmf_insert_folio_pmd(vmf, folio, FAULT_FLAG_WRITE); #endif else
ret = VM_FAULT_FALLBACK;
folio_put(folio);
dax_unlock_entry(&xas, entry);
trace_dax_insert_pfn_mkwrite(mapping->host, vmf, ret); return ret;
}
/** * dax_finish_sync_fault - finish synchronous page fault * @vmf: The description of the fault * @order: Order of entry to be inserted * @pfn: PFN to insert * * This function ensures that the file range touched by the page fault is * stored persistently on the media and handles inserting of appropriate page * table entry.
*/
vm_fault_t dax_finish_sync_fault(struct vm_fault *vmf, unsignedint order, unsignedlong pfn)
{ int err;
loff_t start = ((loff_t)vmf->pgoff) << PAGE_SHIFT;
size_t len = PAGE_SIZE << order;
err = vfs_fsync_range(vmf->vma->vm_file, start, start + len - 1, 1); if (err) return VM_FAULT_SIGBUS; return dax_insert_pfn_mkwrite(vmf, pfn, order);
}
EXPORT_SYMBOL_GPL(dax_finish_sync_fault);
/*
 * (Website boilerplate carried over from the page this file was scraped
 * from, translated from German; wrapped in a comment so the file stays
 * valid C:)
 * The information on this web page was compiled carefully and to the best
 * of our knowledge.  However, no guarantee is given as to the completeness,
 * correctness or quality of the information provided.
 * Note: the colour syntax highlighting is still experimental.
 */