/*
 * Put an open file on its inode's list of write files.
 *
 * The file may be written through mmap, so chain it onto the inode's
 * write_files list.  The entry is only added when ff->write_entry is
 * currently empty, so a file that is already linked is not added again.
 */
static void fuse_link_write_file(struct file *file)
{
	struct fuse_file *ff = file->private_data;
	struct fuse_inode *fi = get_fuse_inode(file_inode(file));

	/* fi->lock covers both the emptiness check and the insertion. */
	spin_lock(&fi->lock);
	if (list_empty(&ff->write_entry))
		list_add(&ff->write_entry, &fi->write_files);
	spin_unlock(&fi->lock);
}
int fuse_finish_open(struct inode *inode, struct file *file)
{ struct fuse_file *ff = file->private_data; struct fuse_conn *fc = get_fuse_conn(inode); int err;
err = fuse_file_io_open(file, inode); if (err) return err;
if (fuse_file_passthrough(ff))
fuse_passthrough_release(ff, fuse_inode_backing(fi));
/* Inode is NULL on error path of fuse_create_open() */ if (likely(fi)) {
spin_lock(&fi->lock);
list_del(&ff->write_entry);
spin_unlock(&fi->lock);
}
spin_lock(&fc->lock); if (!RB_EMPTY_NODE(&ff->polled_node))
rb_erase(&ff->polled_node, &fc->polled_files);
spin_unlock(&fc->lock);
wake_up_interruptible_all(&ff->poll_wait);
if (!ra) return;
/* ff->args was used for open outarg */
memset(ff->args, 0, sizeof(*ff->args));
ra->inarg.fh = ff->fh;
ra->inarg.flags = flags;
ra->args.in_numargs = 1;
ra->args.in_args[0].size = sizeof(struct fuse_release_in);
ra->args.in_args[0].value = &ra->inarg;
ra->args.opcode = opcode;
ra->args.nodeid = ff->nodeid;
ra->args.force = true;
ra->args.nocreds = true;
/* * Hold inode until release is finished. * From fuse_sync_release() the refcount is 1 and everything's * synchronous, so we are fine with not doing igrab() here.
*/
ra->inode = sync ? NULL : igrab(&fi->inode);
}
/* * Normally this will send the RELEASE request, however if * some asynchronous READ or WRITE requests are outstanding, * the sending will be delayed. * * Make the release synchronous if this is a fuseblk mount, * synchronous RELEASE is allowed (and desirable) in this case * because the server can be trusted not to screw up. * * Always use the asynchronous file put because the current thread * might be the fuse server. This can happen if a process starts some * aio and closes the fd before the aio completes. Since aio takes its * own ref to the file, the IO completion has to drop the ref, which is * how the fuse server can end up closing its clients' files.
*/
fuse_file_put(ff, false);
}
/* * Dirty pages might remain despite write_inode_now() call from * fuse_flush() due to writes racing with the close.
*/ if (fc->writeback_cache)
write_inode_now(inode, 1);
/* * Scramble the ID space with XTEA, so that the value of the files_struct * pointer is not exposed to userspace.
*/
u64 fuse_lock_owner_id(struct fuse_conn *fc, fl_owner_t id)
{
u32 *k = fc->scramble_key;
u64 v = (unsignedlong) id;
u32 v0 = v;
u32 v1 = v >> 32;
u32 sum = 0; int i;
/*
 * Wait for all pending writepages on the inode to finish.
 *
 * This is currently done by blocking further writes with FUSE_NOWRITE
 * and waiting for all sent writes to complete.
 *
 * This must be called under i_mutex, otherwise the FUSE_NOWRITE usage
 * could conflict with truncation.
 */
static void fuse_sync_writes(struct inode *inode)
{
	/* Enter the no-write state, then leave it again straight away. */
	fuse_set_nowrite(inode);
	fuse_release_nowrite(inode);
}
inval_attr_out: /* * In memory i_blocks is not maintained by fuse, if writeback cache is * enabled, i_blocks from cached attr may not be accurate.
*/ if (!err && fm->fc->writeback_cache)
fuse_invalidate_attr_mask(inode, STATX_BLOCKS); return err;
}
/* * Start writeback against all dirty pages of the inode, then * wait for all outstanding writes, before sending the FSYNC * request.
*/
err = file_write_and_wait_range(file, start, end); if (err) goto out;
fuse_sync_writes(inode);
/* * Due to implementation of fuse writeback * file_write_and_wait_range() does not catch errors. * We have to do this directly after fuse_sync_writes()
*/
err = file_check_and_advance_wb_err(file); if (err) goto out;
err = sync_inode_metadata(inode, 1); if (err) goto out;
/*
 * Derive the final result of the I/O described by @io.
 *
 * Checked in order:
 *  - a recorded error (io->err) is returned as is;
 *  - a negative io->bytes yields io->size;
 *  - a non-negative io->bytes yields -EIO for a write and io->bytes
 *    otherwise.
 *
 * NOTE(review): presumably a negative io->bytes means that no short
 * transfer was recorded for this I/O -- confirm against the code that
 * updates io->bytes.
 */
static ssize_t fuse_get_res_by_io(struct fuse_io_priv *io)
{
	if (io->err)
		return io->err;

	/* Negative byte count: report the full request size. */
	if (io->bytes < 0)
		return io->size;

	/* Non-negative byte count: an error for writes, the count for reads. */
	if (io->write)
		return -EIO;

	return io->bytes;
}
/* * In case of short read, the caller sets 'pos' to the position of * actual end of fuse request in IO request. Otherwise, if bytes_requested * == bytes_transferred or rw == WRITE, the caller sets 'pos' to -1. * * An example: * User requested DIO read of 64K. It was split into two 32K fuse requests, * both submitted asynchronously. The first of them was ACKed by userspace as * fully completed (req->out.args[0].size == 32K) resulting in pos == -1. The * second request was ACKed as short, e.g. only 1K was read, resulting in * pos == 33K. * * Thus, when all fuse requests are completed, the minimal non-negative 'pos' * will be equal to the length of the longest contiguous fragment of * transferred data starting from the beginning of IO request.
*/ staticvoid fuse_aio_complete(struct fuse_io_priv *io, int err, ssize_t pos)
{ int left;
/* * If writeback_cache is enabled, a short read means there's a hole in * the file. Some data after the hole is in page cache, but has not * reached the client fs yet. So the hole is not present there.
*/ if (!fc->writeback_cache) {
loff_t pos = folio_pos(ap->folios[0]) + num_read;
fuse_read_update_size(inode, pos, attr_ver);
}
}
/* Don't overflow end offset */ if (pos + (desc.length - 1) == LLONG_MAX)
desc.length--;
fuse_read_args_fill(&ia, file, pos, desc.length, FUSE_READ);
res = fuse_simple_request(fm, &ia.ap.args); if (res < 0) return res; /* * Short read means EOF. If file size is larger, truncate it
*/ if (res < desc.length)
fuse_short_read(inode, attr_ver, res, &ia.ap);
for (i = 0; mapping == NULL && i < ap->num_folios; i++)
mapping = ap->folios[i]->mapping;
if (mapping) { struct inode *inode = mapping->host;
/* * Short read means EOF. If file size is larger, truncate it
*/ if (!err && num_read < count)
fuse_short_read(inode, ia->read.attr_ver, num_read, ap);
fuse_invalidate_atime(inode);
}
for (i = 0; i < ap->num_folios; i++) {
folio_end_read(ap->folios[i], !err);
folio_put(ap->folios[i]);
} if (ia->ff)
fuse_file_put(ia->ff, false);
/* * This is only accurate the first time through, since readahead_folio() * doesn't update readahead_count() from the previous folio until the * next call. Grab nr_pages here so we know how many pages we're going * to have to process. This means that we will exit here with * readahead_count() == folio_nr_pages(last_folio), but we will have * consumed all of the folios, and read_pages() will call * readahead_folio() again which will clean up the rac.
*/
nr_pages = readahead_count(rac);
if (fc->num_background >= fc->congestion_threshold &&
rac->ra->async_size >= readahead_count(rac)) /* * Congested and only async pages left, so skip the * rest.
*/ break;
ia = fuse_io_alloc(NULL, cur_pages); if (!ia) break;
ap = &ia->ap;
while (pages < cur_pages) { unsignedint folio_pages;
/* * This returns a folio with a ref held on it. * The ref needs to be held until the request is * completed, since the splice case (see * fuse_try_move_page()) drops the ref after it's * replaced in the page cache.
*/ if (!folio)
folio = __readahead_folio(rac);
folio_pages = folio_nr_pages(folio); if (folio_pages > cur_pages - pages) { /* * Large folios belonging to fuse will never * have more pages than max_pages.
*/
WARN_ON(!pages); break;
}
/* * In auto invalidate mode, always update attributes on read. * Otherwise, only update if we attempt to read past EOF (to ensure * i_size is up to date).
*/ if (fc->auto_inval_data ||
(iocb->ki_pos + iov_iter_count(to) > i_size_read(inode))) { int err;
err = fuse_update_attributes(inode, iocb->ki_filp, STATX_SIZE); if (err) return err;
}
if (!tmp) {
folio_unlock(folio);
folio_put(folio);
/* * Ensure forward progress by faulting in * while not holding the folio lock:
*/ if (fault_in_iov_iter_readable(ii, bytes)) {
err = -EFAULT; break;
}
*exclusive = fuse_dio_wr_exclusive_lock(iocb, from); if (*exclusive) {
inode_lock(inode);
} else {
inode_lock_shared(inode); /* * New parallel dio allowed only if inode is not in caching * mode and denies new opens in caching mode. This check * should be performed only after taking shared inode lock. * Previous past eof check was without inode lock and might * have raced, so check it again.
*/ if (fuse_io_past_eof(iocb, from) ||
fuse_inode_uncached_io_start(fi, NULL) != 0) {
inode_unlock_shared(inode);
inode_lock(inode);
*exclusive = true;
}
}
}
if (exclusive) {
inode_unlock(inode);
} else { /* Allow opens in caching mode after last parallel dio end */
fuse_inode_uncached_io_end(fi);
inode_unlock_shared(inode);
}
}
if (iocb->ki_flags & IOCB_DIRECT) {
written = generic_file_direct_write(iocb, from); if (written < 0 || !iov_iter_count(from)) goto out;
written = direct_write_fallback(iocb, from, written,
fuse_perform_write(iocb, from));
} elseif (writeback) { /* * Use iomap so that we can do granular uptodate reads * and granular dirty tracking for large folios.
*/
written = iomap_file_buffered_write(iocb, from,
&fuse_iomap_ops,
&fuse_iomap_write_ops,
file);
} else {
written = fuse_perform_write(iocb, from);
}
out:
inode_unlock(inode); if (written > 0)
written = generic_write_sync(iocb, written);
/* Special case for kernel I/O: can copy directly into the buffer. * However if the implementation of fuse_conn requires pages instead of * pointer (e.g., virtio-fs), use iov_iter_extract_pages() instead.
*/ if (iov_iter_is_kvec(ii)) { void *user_addr = (void *)fuse_get_user_addr(ii);
if (!use_pages_for_kvec_io) {
size_t frag_size = fuse_get_frag_size(ii, *nbytesp);
if (write)
ap->args.in_args[1].value = user_addr; else
ap->args.out_args[0].value = user_addr;
/* * Until there is support for iov_iter_extract_folios(), we have to * manually extract pages using iov_iter_extract_pages() and then * copy that to a folios array.
*/ struct page **pages = kzalloc(max_pages * sizeof(struct page *),
GFP_KERNEL); if (!pages) {
ret = -ENOMEM; goto out;
}
for (i = 0; i < ap->num_folios; i++) { /* * Benchmarks showed that ending writeback within the * scope of the fi->lock alleviates xarray lock * contention and noticeably improves performance.
*/
iomap_finish_folio_write(inode, ap->folios[i], 1);
dec_wb_stat(&bdi->wb, WB_WRITEBACK);
wb_writeout_inc(&bdi->wb);
}
wake_up(&fi->page_waitq);
}
/* Called under fi->lock, may release and reacquire it */ staticvoid fuse_send_writepage(struct fuse_mount *fm, struct fuse_writepage_args *wpa, loff_t size)
__releases(fi->lock)
__acquires(fi->lock)
{ struct fuse_inode *fi = get_fuse_inode(wpa->inode); struct fuse_args_pages *ap = &wpa->ia.ap; struct fuse_write_in *inarg = &wpa->ia.write.in; struct fuse_args *args = &ap->args;
__u64 data_size = 0; int err, i;
for (i = 0; i < ap->num_folios; i++)
data_size += ap->descs[i].length;
mapping_set_error(inode->i_mapping, error); /* * A writeback finished and this might have updated mtime/ctime on * server making local mtime/ctime stale. Hence invalidate attrs. * Do this only if writeback_cache is not enabled. If writeback_cache * is enabled, we trust local ctime/mtime.
*/ if (!fc->writeback_cache)
fuse_invalidate_attr_mask(inode, FUSE_STATX_MODIFY);
spin_lock(&fi->lock);
fi->writectr--;
fuse_writepage_finish(wpa);
spin_unlock(&fi->lock);
fuse_writepage_free(wpa);
}
rcu_read_lock(); /* Prevent resurrection of dead bucket in unlikely race with syncfs */ do {
wpa->bucket = rcu_dereference(fc->curr_bucket);
} while (unlikely(!atomic_inc_not_zero(&wpa->bucket->count)));
rcu_read_unlock();
}
/* Need to grow the pages array? If so, did the expansion fail? */ if (ap->num_folios == data->max_folios &&
!fuse_pages_realloc(data, fc->max_pages)) returntrue;
if (folio_clear_dirty_for_io(folio)) {
err = iomap_writeback_folio(&wpc, folio);
err = fuse_iomap_writeback_submit(&wpc, err); if (!err)
folio_wait_writeback(folio);
} return err;
}
/* * Write back dirty data/metadata now (there may not be any suitable * open files later for data)
*/ staticvoid fuse_vma_close(struct vm_area_struct *vma)
{ int err;
/* * Wait for writeback against this page to complete before allowing it * to be marked dirty again, and hence written back again, possibly * before the previous writepage completed. * * Block here, instead of in ->writepage(), so that the userspace fs * can only block processes actually operating on the filesystem. * * Otherwise unprivileged userspace fs would be able to block * unrelated: * * - page migration * - sync(2) * - try_to_free_pages() with order > PAGE_ALLOC_COSTLY_ORDER
*/ static vm_fault_t fuse_page_mkwrite(struct vm_fault *vmf)
{ struct folio *folio = page_folio(vmf->page); struct inode *inode = file_inode(vmf->vma->vm_file);
/* DAX mmap is superior to direct_io mmap */ if (FUSE_IS_DAX(inode)) return fuse_dax_mmap(file, vma);
/* * If inode is in passthrough io mode, because it has some file open * in passthrough mode, either mmap to backing file or fail mmap, * because mixing cached mmap and passthrough io mode is not allowed.
*/ if (fuse_file_passthrough(ff)) return fuse_passthrough_mmap(file, vma); elseif (fuse_inode_backing(get_fuse_inode(inode))) return -ENODEV;
/* * FOPEN_DIRECT_IO handling is special compared to O_DIRECT, * as it does not allow MAP_SHARED mmap without FUSE_DIRECT_IO_ALLOW_MMAP.
*/ if (ff->open_flags & FOPEN_DIRECT_IO) { /* * Can't provide the coherency needed for MAP_SHARED * if FUSE_DIRECT_IO_ALLOW_MMAP isn't set.
*/ if ((vma->vm_flags & VM_MAYSHARE) && !fc->direct_io_allow_mmap) return -ENODEV;
/* * First mmap of direct_io file enters caching inode io mode. * Also waits for parallel dio writers to go into serial mode * (exclusive instead of shared lock). * After first mmap, the inode stays in caching io mode until * the direct_io file release.
*/
rc = fuse_file_cached_io_open(inode, ff); if (rc) return rc;
}
if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE))
fuse_link_write_file(file);
case F_RDLCK: case F_WRLCK: if (ffl->start > OFFSET_MAX || ffl->end > OFFSET_MAX ||
ffl->end < ffl->start) return -EIO;
fl->fl_start = ffl->start;
fl->fl_end = ffl->end;
/* * Convert pid into init's pid namespace. The locks API will * translate it into the caller's pid namespace.
*/
rcu_read_lock();
fl->c.flc_pid = pid_nr_ns(find_pid_ns(ffl->pid, fc->pid_ns), &init_pid_ns);
rcu_read_unlock(); break;
switch (whence) { case SEEK_SET: case SEEK_CUR: /* No i_mutex protection necessary for SEEK_CUR and SEEK_SET */
retval = generic_file_llseek(file, offset, whence); break; case SEEK_END:
inode_lock(inode);
retval = fuse_update_attributes(inode, file, STATX_SIZE); if (!retval)
retval = generic_file_llseek(file, offset, whence);
inode_unlock(inode); break; case SEEK_HOLE: case SEEK_DATA:
inode_lock(inode);
retval = fuse_lseek(file, offset, whence);
inode_unlock(inode); break; default:
retval = -EINVAL;
}
return retval;
}
/* * All files which have been polled are linked to RB tree * fuse_conn->polled_files which is indexed by kh. Walk the tree and * find the matching one.
*/ staticstruct rb_node **fuse_find_polled_node(struct fuse_conn *fc, u64 kh, struct rb_node **parent_out)
{ struct rb_node **link = &fc->polled_files.rb_node; struct rb_node *last = NULL;
while (*link) { struct fuse_file *ff;
last = *link;
ff = rb_entry(last, struct fuse_file, polled_node);
if (kh < ff->kh)
link = &last->rb_left; elseif (kh > ff->kh)
link = &last->rb_right; else return link;
}
if (parent_out)
*parent_out = last; return link;
}
/* * The file is about to be polled. Make sure it's on the polled_files * RB tree. Note that files once added to the polled_files tree are * not removed before the file is released. This is because a file * polled once is likely to be polled again.
*/ staticvoid fuse_register_polled_file(struct fuse_conn *fc, struct fuse_file *ff)
{
spin_lock(&fc->lock); if (RB_EMPTY_NODE(&ff->polled_node)) { struct rb_node **link, *parent;
/* * Ask for notification iff there's someone waiting for it. * The client may ignore the flag and always notify.
*/ if (waitqueue_active(&ff->poll_wait)) {
inarg.flags |= FUSE_POLL_SCHEDULE_NOTIFY;
fuse_register_polled_file(fm->fc, ff);
}
if (!err) return demangle_poll(outarg.revents); if (err == -ENOSYS) {
fm->fc->no_poll = 1; return DEFAULT_POLLMASK;
} return EPOLLERR;
}
EXPORT_SYMBOL_GPL(fuse_file_poll);
/* * This is called from fuse_handle_notify() on FUSE_NOTIFY_POLL and * wakes up the poll waiters.
*/ int fuse_notify_poll_wakeup(struct fuse_conn *fc, struct fuse_notify_poll_wakeup_out *outarg)
{
u64 kh = outarg->kh; struct rb_node **link;
spin_lock(&fc->lock);
link = fuse_find_polled_node(fc, kh, NULL); if (*link) { struct fuse_file *ff;
/* * We cannot asynchronously extend the size of a file. * In such case the aio will behave exactly like sync io.
*/ if ((offset + count > i_size) && io->write)
io->blocking = true;
if (io->async && io->blocking) { /* * Additional reference to keep io around after * calling fuse_aio_complete()
*/
kref_get(&io->refcnt);
io->done = &wait;
}
if (iov_iter_rw(iter) == WRITE) {
ret = fuse_direct_io(io, iter, &pos, FUSE_DIO_WRITE);
fuse_invalidate_attr_mask(inode, FUSE_STATX_MODSIZE);
} else {
ret = __fuse_direct_read(io, iter, &pos);
}
iov_iter_reexpand(iter, iov_iter_count(iter) + shortened);
if (io->async) { bool blocking = io->blocking;
fuse_aio_complete(io, ret < 0 ? ret : 0, -1);
/* we have a non-extending, async request, so return */ if (!blocking) return -EIOCBQUEUED;
wait_for_completion(&wait);
ret = fuse_get_res_by_io(io);
}
kref_put(&io->refcnt, fuse_io_release);
if (iov_iter_rw(iter) == WRITE) {
fuse_write_update_attr(inode, pos, ret); /* For extending writes we already hold exclusive lock */ if (ret < 0 && offset + count > i_size)
fuse_do_truncate(file);
}
/* we could have extended the file */ if (!(mode & FALLOC_FL_KEEP_SIZE)) { if (fuse_write_update_attr(inode, offset + length, length))
file_update_time(file);
}
if (file_inode(file_in)->i_sb != file_inode(file_out)->i_sb) return -EXDEV;
inode_lock(inode_in);
err = fuse_writeback_range(inode_in, pos_in, pos_in + len - 1);
inode_unlock(inode_in); if (err) return err;
inode_lock(inode_out);
err = file_modified(file_out); if (err) goto out;
/* * Write out dirty pages in the destination file before sending the COPY * request to userspace. After the request is completed, truncate off * pages (including partial ones) from the cache that have been copied, * since these contain stale data at that point. * * This should be mostly correct, but if the COPY writes to partial * pages (at the start or end) and the parts not covered by the COPY are * written through a memory map after calling fuse_writeback_range(), * then these partial page modifications will be lost on truncation. * * It is unlikely that someone would rely on such mixed style * modifications. Yet this does give less guarantees than if the * copying was performed with write(2). * * To fix this a mapping->invalidate_lock could be used to prevent new * faults while the copy is ongoing.
*/
err = fuse_writeback_range(inode_out, pos_out, pos_out + len - 1); if (err) goto out;
if (is_unstable)
set_bit(FUSE_I_SIZE_UNSTABLE, &fi_out->state);
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereit gestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.