/*
 * Default memory range size. A power of 2 so it agrees with common FUSE_INIT
 * map_alignment values 4KB and 64KB.
 */
#define FUSE_DAX_SHIFT	21
#define FUSE_DAX_SZ	(1 << FUSE_DAX_SHIFT)
#define FUSE_DAX_PAGES	(FUSE_DAX_SZ / PAGE_SIZE)
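/*
 * Illustrative note (not part of the original excerpt): with FUSE_DAX_SHIFT
 * of 21 each memory range covers 2MiB, so FUSE_DAX_PAGES is 512 on a 4KiB
 * PAGE_SIZE. The code below converts a file offset to a range index with
 * "pos >> FUSE_DAX_SHIFT" and back with "idx << FUSE_DAX_SHIFT".
 */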
/* Number of ranges reclaimer will try to free in one invocation */
#define FUSE_DAX_RECLAIM_CHUNK		(10)
/*
 * Dax memory reclaim threshold in percentage of total ranges. When the
 * number of free ranges drops below this threshold, reclaim can trigger.
 * Default is 20%.
 */
#define FUSE_DAX_RECLAIM_THRESHOLD	(20)
/** Translation information for file offsets to DAX window offsets */
struct fuse_dax_mapping {
/* Pointer to inode where this memory range is mapped */
struct inode *inode;
/* Will connect in fcd->free_ranges to keep track of free memory */
struct list_head list;
/* For interval tree in file/inode */
struct interval_tree_node itn;
/* Will connect in fcd->busy_ranges to keep track of busy memory */
struct list_head busy_list;
/** Position in DAX window */
u64 window_offset;
/** Length of mapping, in bytes */
loff_t length;
/* Is this mapping read-only or read-write */
bool writable;
/* reference count when the mapping is used by dax iomap. */
refcount_t refcnt;
};
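/*
 * Illustrative note (not part of the original excerpt): the pool holds the
 * initial reference on each range (refcount_set(&range->refcnt, 1) at init
 * time), so a refcount_read() greater than 1 in the reclaim paths below is
 * taken to mean "currently in use by dax iomap".
 */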
/* Per-inode dax map */
struct fuse_inode_dax {
/* Semaphore to protect modifications to the dmap tree */
struct rw_semaphore sem;
/* Sorted rb tree of struct fuse_dax_mapping elements */
struct rb_root_cached tree;
unsigned long nr;
};
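/*
 * Minimal sketch (an assumption, not shown in this excerpt): node_to_dmap(),
 * used throughout this file, is taken to be a thin NULL-tolerant
 * container_of() wrapper from the interval tree node back to the mapping.
 */
static struct fuse_dax_mapping *
node_to_dmap(struct interval_tree_node *node)
{
	if (!node)
		return NULL;

	return container_of(node, struct fuse_dax_mapping, itn);
}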
ptr = remove_one;
list_for_each_entry(dmap, to_remove, list) {
ptr->moffset = dmap->window_offset;
ptr->len = dmap->length;
ptr++;
i++;
num--;
if (i >= nr_alloc || num == 0) {
memset(&inarg, 0, sizeof(inarg));
inarg.count = i;
ret = fuse_send_removemapping(inode, &inarg,
remove_one);
if (ret)
	goto out;
ptr = remove_one;
i = 0;
}
}
out:
kfree(remove_one);
return ret;
}
/*
 * Cleanup dmap entry and add back to free list. This should be called with
 * fcd->lock held.
 */
static void dmap_reinit_add_to_free_pool(struct fuse_conn_dax *fcd,
					 struct fuse_dax_mapping *dmap)
{
pr_debug("fuse: freeing memory range start_idx=0x%lx end_idx=0x%lx window_offset=0x%llx length=0x%llx\n",
dmap->itn.start, dmap->itn.last, dmap->window_offset,
dmap->length);
__dmap_remove_busy_list(fcd, dmap);
dmap->inode = NULL;
dmap->itn.start = dmap->itn.last = 0;
__dmap_add_to_free_pool(fcd, dmap);
}
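/*
 * Minimal sketch (an assumption, not shown in this excerpt):
 * __dmap_add_to_free_pool() is taken to put the range back on the free list,
 * bump the free count and wake anyone waiting for a free range. Callers
 * hold fcd->lock.
 */
static void __dmap_add_to_free_pool(struct fuse_conn_dax *fcd,
				    struct fuse_dax_mapping *dmap)
{
	list_add_tail(&dmap->list, &fcd->free_ranges);
	fcd->nr_free_ranges++;
	wake_up(&fcd->range_waitq);
}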
/*
 * Free inode dmap entries whose range falls inside [start, end].
 * Does not take any locks. At this point of time it should only be
 * called from evict_inode() path where we know all dmap entries can be
 * reclaimed.
 */
static void inode_reclaim_dmap_range(struct fuse_conn_dax *fcd,
				     struct inode *inode,
				     loff_t start, loff_t end)
{
struct fuse_inode *fi = get_fuse_inode(inode);
struct fuse_dax_mapping *dmap, *n;
int err, num = 0;
LIST_HEAD(to_remove);
unsigned long start_idx = start >> FUSE_DAX_SHIFT;
unsigned long end_idx = end >> FUSE_DAX_SHIFT;
struct interval_tree_node *node;
while (1) {
node = interval_tree_iter_first(&fi->dax->tree, start_idx,
end_idx);
if (!node)
	break;
dmap = node_to_dmap(node);
/* inode is going away. There should not be any users of dmap */
WARN_ON(refcount_read(&dmap->refcnt) > 1);
interval_tree_remove(&dmap->itn, &fi->dax->tree);
num++;
list_add(&dmap->list, &to_remove);
}
/* Nothing to remove */
if (list_empty(&to_remove))
	return;
/*
 * It is called from evict_inode() and by that time inode is going away. So
 * this function does not take any locks like fi->dax->sem for traversing
 * that fuse inode interval tree. If that lock is taken then lock validator
 * complains of deadlock situation w.r.t fs_reclaim lock.
 */
void fuse_dax_inode_cleanup(struct inode *inode)
{
struct fuse_conn *fc = get_fuse_conn(inode);
struct fuse_inode *fi = get_fuse_inode(inode);
/*
 * fuse_evict_inode() has already called truncate_inode_pages_final()
 * before we arrive here. So we should not have to worry about any
 * pages/exception entries still associated with inode.
 */
inode_reclaim_dmap_range(fc->dax, inode, 0, -1);
WARN_ON(fi->dax->nr);
}
/* If length is beyond end of file, truncate further */
if (pos + len > i_size)
len = i_size - pos;
if (len > 0) {
iomap->addr = dmap->window_offset + offset;
iomap->length = len;
if (flags & IOMAP_FAULT)
iomap->length = ALIGN(len, PAGE_SIZE);
iomap->type = IOMAP_MAPPED;
/*
 * increase refcnt so that reclaim code knows this dmap is in
 * use. This assumes fi->dax->sem lock is held either
 * shared/exclusive.
*/
refcount_inc(&dmap->refcnt);
/* iomap->private should be NULL */
WARN_ON_ONCE(iomap->private);
iomap->private = dmap;
} else {
	/* Mapping beyond end of file is hole */
fuse_fill_iomap_hole(iomap, length);
}
}
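/*
 * Minimal sketch (an assumption, not shown in this excerpt):
 * fuse_fill_iomap_hole() is taken to simply report a hole of the requested
 * length so that reads past the mapped range see zeroes.
 */
static void fuse_fill_iomap_hole(struct iomap *iomap, loff_t length)
{
	iomap->addr = IOMAP_NULL_ADDR;
	iomap->length = length;
	iomap->type = IOMAP_HOLE;
}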
/*
 * Can't do inline reclaim in fault path. We call
 * dax_layout_busy_page() before we free a range. And
 * fuse_wait_dax_page() drops mapping->invalidate_lock and requires it.
 * In fault path we enter with mapping->invalidate_lock held and can't
 * drop it. Also in fault path we hold mapping->invalidate_lock shared
 * and not exclusive, so that creates further issues with
 * fuse_wait_dax_page(). Hence return -EAGAIN and fuse_dax_fault()
 * will wait for a memory range to become free and retry.
 */
if (flags & IOMAP_FAULT) {
alloc_dmap = alloc_dax_mapping(fcd);
if (!alloc_dmap)
	return -EAGAIN;
} else {
alloc_dmap = alloc_dax_mapping_reclaim(fcd, inode);
if (IS_ERR(alloc_dmap))
	return PTR_ERR(alloc_dmap);
}
/* If we are here, we should have memory allocated */
if (WARN_ON(!alloc_dmap))
	return -EIO;
/*
 * Take write lock so that only one caller can try to setup mapping
 * and others wait.
 */
down_write(&fi->dax->sem);
/*
 * We dropped lock. Check again if somebody else setup
 * mapping already.
*/
node = interval_tree_iter_first(&fi->dax->tree, start_idx, start_idx);
if (node) {
dmap = node_to_dmap(node);
fuse_fill_iomap(inode, pos, length, iomap, dmap, flags);
dmap_add_to_free_pool(fcd, alloc_dmap);
up_write(&fi->dax->sem);
return 0;
}
/*
 * Take exclusive lock so that only one caller can try to setup
 * mapping and others wait.
 */
down_write(&fi->dax->sem);
node = interval_tree_iter_first(&fi->dax->tree, idx, idx);
/*
 * We are holding either inode lock or invalidate_lock, and that should
 * ensure that dmap can't be truncated. We are holding a reference
 * on dmap and that should make sure it can't be reclaimed. So dmap
 * should still be there in tree despite the fact we dropped and
 * re-acquired the fi->dax->sem lock.
 */
ret = -EIO;
if (WARN_ON(!node))
	goto out_err;
dmap = node_to_dmap(node);
/*
 * We took an extra reference on dmap to make sure it's not reclaimed.
 * Now we hold fi->dax->sem lock and that reference is not needed
 * anymore. Drop it.
 */
if (refcount_dec_and_test(&dmap->refcnt)) {
/* refcount should not hit 0. This object only goes
 * away when fuse connection goes away
*/
WARN_ON_ONCE(1);
}
/*
 * Maybe another thread already upgraded mapping while we were not
 * holding lock.
 */
if (dmap->writable) {
ret = 0;
goto out_fill_iomap;
}
/*
 * This is just for DAX and the mapping is ephemeral, do not use it for other
 * purposes since there is no block device with a permanent mapping.
 */
static int fuse_iomap_begin(struct inode *inode, loff_t pos, loff_t length,
			    unsigned int flags, struct iomap *iomap,
			    struct iomap *srcmap)
{
struct fuse_inode *fi = get_fuse_inode(inode);
struct fuse_conn *fc = get_fuse_conn(inode);
struct fuse_dax_mapping *dmap;
bool writable = flags & IOMAP_WRITE;
unsigned long start_idx = pos >> FUSE_DAX_SHIFT;
struct interval_tree_node *node;

/* We don't support FIEMAP */
if (WARN_ON(flags & IOMAP_REPORT))
	return -EIO;
/*
 * Both read/write and mmap path can race here. So we need something
 * to make sure that if we are setting up a mapping, then the other
 * path waits.
 *
 * For now, use a semaphore for this. It probably needs to be
 * optimized later.
 */
down_read(&fi->dax->sem);
node = interval_tree_iter_first(&fi->dax->tree, start_idx, start_idx);
if (node) {
dmap = node_to_dmap(node);
if (writable && !dmap->writable) {
/* Upgrade read-only mapping to read-write. This will
 * require exclusive fi->dax->sem lock as we don't want
 * two threads to be trying to do this simultaneously
 * for the same dmap. So drop shared lock and acquire
 * exclusive lock.
 *
 * Before dropping fi->dax->sem lock, take reference
 * on dmap so that it's not freed by range reclaim.
*/
refcount_inc(&dmap->refcnt);
up_read(&fi->dax->sem);
pr_debug("%s: Upgrading mapping at offset 0x%llx length 0x%llx\n",
__func__, pos, length);
return fuse_upgrade_dax_mapping(inode, pos, length,
flags, iomap);
} else {
fuse_fill_iomap(inode, pos, length, iomap, dmap, flags);
up_read(&fi->dax->sem);
return 0;
}
} else {
up_read(&fi->dax->sem);
pr_debug("%s: no mapping at offset 0x%llx length 0x%llx\n",
__func__, pos, length);
if (pos >= i_size_read(inode))
	goto iomap_hole;
if (dmap) {
if (refcount_dec_and_test(&dmap->refcnt)) {
/* refcount should not hit 0. This object only goes
 * away when fuse connection goes away
*/
WARN_ON_ONCE(1);
}
}
/*
 * DAX writes beyond end-of-file aren't handled using iomap, so the
 * file size is unchanged and there is nothing to do here.
 */
return 0;
}
/* Should be called with mapping->invalidate_lock held exclusively. */
int fuse_dax_break_layouts(struct inode *inode, u64 dmap_start,
u64 dmap_end)
{
return dax_break_layout(inode, dmap_start, dmap_end,
fuse_wait_dax_page);
}
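/*
 * Minimal sketch (an assumption, not shown in this excerpt): the
 * fuse_wait_dax_page() callback passed above is taken to drop
 * mapping->invalidate_lock, give reclaim a chance to run, and re-acquire
 * the lock, which is exactly why the fault path cannot use this helper
 * (see the -EAGAIN comment in the allocation path).
 */
static void fuse_wait_dax_page(struct inode *inode)
{
	filemap_invalidate_unlock(inode->i_mapping);
	schedule();
	filemap_invalidate_lock(inode->i_mapping);
}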
if (iocb->ki_flags & IOCB_NOWAIT) {
if (!inode_trylock(inode))
	return -EAGAIN;
} else {
inode_lock(inode);
}
ret = generic_write_checks(iocb, from);
if (ret <= 0)
	goto out;
ret = file_remove_privs(iocb->ki_filp);
if (ret)
	goto out;
/* TODO file_update_time() but we don't want metadata I/O */
/*
 * Do not use dax for file-extending writes, as the write and the
 * on-disk i_size increase are not atomic otherwise.
 */
if (file_extending_write(iocb, from))
	ret = fuse_dax_direct_write(iocb, from);
else
ret = dax_iomap_rw(iocb, from, &fuse_iomap_ops);
out:
inode_unlock(inode);
if (ret > 0)
	ret = generic_write_sync(iocb, ret);
return ret;
}
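/*
 * Minimal sketch (an assumption, not shown in this excerpt) of the
 * file_extending_write() helper used above: a write is treated as
 * file-extending when it starts at or runs past the current i_size.
 */
static bool file_extending_write(struct kiocb *iocb, struct iov_iter *from)
{
	struct inode *inode = file_inode(iocb->ki_filp);

	return (iov_iter_rw(from) == WRITE &&
		((iocb->ki_pos) >= i_size_read(inode) ||
		 (iocb->ki_pos + iov_iter_count(from) > i_size_read(inode))));
}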
if (write)
sb_start_pagefault(sb);
retry:
if (retry && !(fcd->nr_free_ranges > 0))
wait_event(fcd->range_waitq, (fcd->nr_free_ranges > 0));
/*
 * We need to serialize against not only truncate but also against
 * fuse dax memory range reclaim. While a range is being reclaimed,
 * we do not want any read/write/mmap to make progress and try
 * to populate page cache or access memory we are trying to free.
 */
filemap_invalidate_lock_shared(inode->i_mapping);
ret = dax_iomap_fault(vmf, order, &pfn, &error, &fuse_iomap_ops);
if ((ret & VM_FAULT_ERROR) && error == -EAGAIN) {
error = 0;
retry = true;
filemap_invalidate_unlock_shared(inode->i_mapping);
goto retry;
}
if (ret & VM_FAULT_NEEDDSYNC)
ret = dax_finish_sync_fault(vmf, order, pfn);
filemap_invalidate_unlock_shared(inode->i_mapping);
/*
 * igrab() was done to make sure inode won't go under us, and this
 * further avoids the race with evict().
 */
ret = dmap_writeback_invalidate(inode, dmap);
if (ret)
	return ret;
/* Remove dax mapping from inode interval tree now */
interval_tree_remove(&dmap->itn, &fi->dax->tree);
fi->dax->nr--;
/*
 * It is possible that umount/shutdown has killed the fuse connection
 * and worker thread is trying to reclaim memory in parallel. Don't
 * warn in that case.
 */
ret = dmap_removemapping_one(inode, dmap);
if (ret && ret != -ENOTCONN) {
pr_warn("Failed to remove mapping. offset=0x%llx len=0x%llx ret=%d\n",
dmap->window_offset, dmap->length, ret);
}
return 0;
}
/*
 * Find first mapped dmap for an inode and return file offset. Caller needs
 * to hold fi->dax->sem lock either shared or exclusive.
 */
static struct fuse_dax_mapping *inode_lookup_first_dmap(struct inode *inode)
{
struct fuse_inode *fi = get_fuse_inode(inode);
struct fuse_dax_mapping *dmap;
struct interval_tree_node *node;
for (node = interval_tree_iter_first(&fi->dax->tree, 0, -1); node;
node = interval_tree_iter_next(node, 0, -1)) {
dmap = node_to_dmap(node);
/* still in use. */
if (refcount_read(&dmap->refcnt) > 1)
	continue;
return dmap;
}
return NULL;
}
/*
 * Find first mapping in the tree and free it and return it. Do not add
 * it back to free pool.
 */
static struct fuse_dax_mapping *
inode_inline_reclaim_one_dmap(struct fuse_conn_dax *fcd, struct inode *inode,
			      bool *retry)
{
struct fuse_inode *fi = get_fuse_inode(inode);
struct fuse_dax_mapping *dmap;
u64 dmap_start, dmap_end;
unsigned long start_idx;
int ret;
struct interval_tree_node *node;
filemap_invalidate_lock(inode->i_mapping);
/* Lookup a dmap and corresponding file offset to reclaim. */
down_read(&fi->dax->sem);
dmap = inode_lookup_first_dmap(inode);
if (dmap) {
start_idx = dmap->itn.start;
dmap_start = start_idx << FUSE_DAX_SHIFT;
dmap_end = dmap_start + FUSE_DAX_SZ - 1;
}
up_read(&fi->dax->sem);
if (!dmap)
	goto out_mmap_sem;
/*
 * Make sure there are no references to inode pages using
 * get_user_pages()
*/
ret = fuse_dax_break_layouts(inode, dmap_start, dmap_end);
if (ret) {
pr_debug("fuse: fuse_dax_break_layouts() failed. err=%d\n",
ret);
dmap = ERR_PTR(ret);
goto out_mmap_sem;
}
down_write(&fi->dax->sem);
node = interval_tree_iter_first(&fi->dax->tree, start_idx, start_idx);
/* Range already got reclaimed by somebody else */
if (!node) {
	if (retry)
		*retry = true;
	goto out_write_dmap_sem;
}
dmap = node_to_dmap(node);
/* still in use. */
if (refcount_read(&dmap->refcnt) > 1) {
	dmap = NULL;
	if (retry)
		*retry = true;
	goto out_write_dmap_sem;
}
ret = reclaim_one_dmap_locked(inode, dmap);
if (ret < 0) {
	dmap = ERR_PTR(ret);
	goto out_write_dmap_sem;
}
/* Clean up dmap. Do not add back to free list */
dmap_remove_busy_list(fcd, dmap);
dmap->inode = NULL;
dmap->itn.start = dmap->itn.last = 0;
dmap = alloc_dax_mapping(fcd);
if (dmap)
	return dmap;
dmap = inode_inline_reclaim_one_dmap(fcd, inode, &retry);
/*
 * Either we got a mapping or it is an error, return in both
 * the cases.
 */
if (dmap)
	return dmap;
/*
 * If we could not reclaim a mapping because it had a reference or
 * some other temporary failure, try again. We want to give up inline
 * reclaim only if there is no range assigned to this node. Otherwise
 * a deadlock is possible if we sleep with mapping->invalidate_lock
 * held and the worker freeing memory can't make progress due to
 * unavailability of mapping->invalidate_lock. So sleep only if
 * fi->dax->nr=0.
 */
if (retry)
	continue;
/*
 * There are no mappings which can be reclaimed. Wait for one.
 * We are not holding fi->dax->sem. So it is possible
 * that range gets added now. But as we are not holding
 * mapping->invalidate_lock, worker should still be able to
 * free up a range and wake us up.
 */
if (!fi->dax->nr && !(fcd->nr_free_ranges > 0)) {
	if (wait_event_killable_exclusive(fcd->range_waitq,
					  (fcd->nr_free_ranges > 0))) {
		return ERR_PTR(-EINTR);
}
}
}
}
/* Range already got cleaned up by somebody else */
if (!node)
	return 0;
dmap = node_to_dmap(node);
/* still in use. */
if (refcount_read(&dmap->refcnt) > 1)
	return 0;
ret = reclaim_one_dmap_locked(inode, dmap);
if (ret < 0)
	return ret;
/* Cleanup dmap entry and add back to free list */
spin_lock(&fcd->lock);
dmap_reinit_add_to_free_pool(fcd, dmap);
spin_unlock(&fcd->lock);
return ret;
}
/*
 * Free a range of memory.
 * Locking:
 * 1. Take mapping->invalidate_lock to block dax faults.
 * 2. Take fi->dax->sem to protect interval tree and also to make sure
 *    read/write can not reuse a dmap which we might be freeing.
 */
static int lookup_and_reclaim_dmap(struct fuse_conn_dax *fcd,
				   struct inode *inode,
				   unsigned long start_idx,
				   unsigned long end_idx)
{
int ret;
struct fuse_inode *fi = get_fuse_inode(inode);
loff_t dmap_start = start_idx << FUSE_DAX_SHIFT;
loff_t dmap_end = (dmap_start + FUSE_DAX_SZ) - 1;
filemap_invalidate_lock(inode->i_mapping);
ret = fuse_dax_break_layouts(inode, dmap_start, dmap_end);
if (ret) {
pr_debug("virtio_fs: fuse_dax_break_layouts() failed. err=%d\n",
ret);
goto out_mmap_sem;
}
/* Pick first busy range and free it for now */
while (1) {
if (nr_freed >= nr_to_free)
	break;
dmap = NULL;
spin_lock(&fcd->lock);
if (!fcd->nr_busy_ranges) {
spin_unlock(&fcd->lock);
return 0;
}
list_for_each_entry_safe(pos, temp, &fcd->busy_ranges,
busy_list) {
/* skip this range if it's in use. */
if (refcount_read(&pos->refcnt) > 1)
	continue;

inode = igrab(pos->inode);
/*
 * This inode is going away. That will free
 * up all the ranges anyway, continue to
 * next range.
 */
if (!inode)
	continue;
/*
 * Take this element off the list and add it to the tail. If
 * this element can't be freed, it will help with
 * selecting a new element in the next iteration of the loop.
*/
dmap = pos;
list_move_tail(&dmap->busy_list, &fcd->busy_ranges);
start_idx = end_idx = dmap->itn.start;
break;
}
spin_unlock(&fcd->lock);
if (!dmap)
	return 0;
ret = lookup_and_reclaim_dmap(fcd, inode, start_idx, end_idx);
iput(inode);
if (ret)
	return ret;
nr_freed++;
}
return 0;
}
static void fuse_dax_free_mem_worker(struct work_struct *work)
{
int ret;
struct fuse_conn_dax *fcd = container_of(work, struct fuse_conn_dax,
free_work.work);
ret = try_to_free_dmap_chunks(fcd, FUSE_DAX_RECLAIM_CHUNK);
if (ret) {
pr_debug("fuse: try_to_free_dmap_chunks() failed with err=%d\n",
ret);
}
/* If the number of free ranges is still below the threshold, requeue */
kick_dmap_free_worker(fcd, 1);
}
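/*
 * Minimal sketch (an assumption, not shown in this excerpt) of the
 * kick_dmap_free_worker() call above: the worker is re-queued only while the
 * number of free ranges is below roughly
 * nr_ranges * FUSE_DAX_RECLAIM_THRESHOLD / 100.
 */
static void kick_dmap_free_worker(struct fuse_conn_dax *fcd,
				  unsigned long delay_ms)
{
	unsigned long free_threshold;

	/* If the number of free ranges is below the threshold, start reclaim */
	free_threshold = max_t(unsigned long,
			       fcd->nr_ranges * FUSE_DAX_RECLAIM_THRESHOLD / 100,
			       1);
	if (fcd->nr_free_ranges < free_threshold)
		queue_delayed_work(system_long_wq, &fcd->free_work,
				   msecs_to_jiffies(delay_ms));
}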
for (i = 0; i < nr_ranges; i++) {
range = kzalloc(sizeof(struct fuse_dax_mapping), GFP_KERNEL);
ret = -ENOMEM;
if (!range)
	goto out_err;
/*
 * TODO: This offset only works if the virtio-fs driver does not
 * have some memory hidden at the beginning. This needs
 * better handling.
 */
range->window_offset = i * FUSE_DAX_SZ;
range->length = FUSE_DAX_SZ;
INIT_LIST_HEAD(&range->busy_list);
refcount_set(&range->refcnt, 1);
list_add_tail(&range->list, &fcd->free_ranges);
}
fcd->nr_free_ranges = nr_ranges;
fcd->nr_ranges = nr_ranges;
return 0;
out_err:
/* Free all allocated elements */
fuse_free_dax_mem_ranges(&fcd->free_ranges);
return ret;
}
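/*
 * Minimal sketch (an assumption, not shown in this excerpt) of the
 * fuse_free_dax_mem_ranges() cleanup used on the error path above: walk the
 * list and free each range element.
 */
static void fuse_free_dax_mem_ranges(struct list_head *mem_list)
{
	struct fuse_dax_mapping *range, *temp;

	list_for_each_entry_safe(range, temp, mem_list, list) {
		list_del(&range->list);
		if (!list_empty(&range->busy_list))
			list_del(&range->busy_list);
		kfree(range);
	}
}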
int fuse_dax_conn_alloc(struct fuse_conn *fc, enum fuse_dax_mode dax_mode,
			struct dax_device *dax_dev)
{
struct fuse_conn_dax *fcd;
int err;
fc->dax_mode = dax_mode;
if (!dax_dev)
	return 0;
fcd = kzalloc(sizeof(*fcd), GFP_KERNEL);
if (!fcd)
	return -ENOMEM;
/*
 * fc->dax may be NULL in 'inode' mode when filesystem device doesn't
 * support DAX, in which case it will silently fall back to 'never' mode.
 */
if (!fc->dax)
	return false;