// SPDX-License-Identifier: GPL-2.0-only /* * linux/fs/nfs/direct.c * * Copyright (C) 2003 by Chuck Lever <cel@netapp.com> * * High-performance uncached I/O for the Linux NFS client * * There are important applications whose performance or correctness * depends on uncached access to file data. Database clusters * (multiple copies of the same instance running on separate hosts) * implement their own cache coherency protocol that subsumes file * system cache protocols. Applications that process datasets * considerably larger than the client's memory do not always benefit * from a local cache. A streaming video server, for instance, has no * need to cache the contents of a file. * * When an application requests uncached I/O, all read and write requests * are made directly to the server; data stored or fetched via these * requests is not cached in the Linux page cache. The client does not * correct unaligned requests from applications. All requested bytes are * held on permanent storage before a direct write system call returns to * an application. * * Solaris implements an uncached I/O facility called directio() that * is used for backups and sequential I/O to very large files. Solaris * also supports uncaching whole NFS partitions with "-o forcedirectio," * an undocumented mount option. * * Designed by Jeff Kimmel, Chuck Lever, and Trond Myklebust, with * help from Andrew Morton. * * 18 Dec 2001 Initial implementation for 2.4 --cel * 08 Jul 2002 Version for 2.4.19, with bug fixes --trondmy * 08 Jun 2003 Port to 2.5 APIs --cel * 31 Mar 2004 Handle direct I/O without VFS support --cel * 15 Sep 2004 Parallel async reads --cel * 04 May 2005 support O_DIRECT with aio --cel *
*/
/**
 * nfs_swap_rw - NFS address space operation for swap I/O
 * @iocb: target I/O control block
 * @iter: I/O buffer
 *
 * Perform IO to the swap-file.  This is much like direct IO.
 *
 * Returns 0 on success, or the negative error code from the
 * underlying direct read/write.
 */
int nfs_swap_rw(struct kiocb *iocb, struct iov_iter *iter)
{
	ssize_t status;

	/* Dispatch to the direct-I/O paths, flagging this as swap I/O. */
	status = (iov_iter_rw(iter) == READ) ?
			nfs_file_direct_read(iocb, iter, true) :
			nfs_file_direct_write(iocb, iter, true);

	/* Callers only care about failure vs. success, not byte counts. */
	if (status < 0)
		return status;
	return 0;
}
/*
 * Drop the page references taken when the pages were pinned for
 * direct I/O (e.g. by iov_iter_get_pages_alloc2()).
 *
 * Fixed: the fused tokens "staticvoid" and "unsignedint" in the
 * original would not compile; restored the required whitespace.
 */
static void nfs_direct_release_pages(struct page **pages, unsigned int npages)
{
	unsigned int i;

	for (i = 0; i < npages; i++)
		put_page(pages[i]);
}
/* * Collects and returns the final error value/byte-count.
*/ static ssize_t nfs_direct_wait(struct nfs_direct_req *dreq)
{
ssize_t result = -EIOCBQUEUED;
/* Async requests don't wait here */ if (dreq->iocb) goto out;
result = wait_for_completion_killable(&dreq->completion);
if (!result) {
result = dreq->count;
WARN_ON_ONCE(dreq->count < 0);
} if (!result)
result = dreq->error;
out: return (ssize_t) result;
}
/*
 * Synchronous I/O uses a stack-allocated iocb.  Thus we can't trust
 * the iocb is still valid here if this is a synchronous request.
 *
 * NOTE(review): this fragment appears truncated in this chunk — the
 * function's closing brace is not visible, and the usual signalling of
 * dreq->completion / release of @dreq after the iocb branch is missing.
 * Confirm against the full file before relying on this excerpt.
 */
*/ staticvoid nfs_direct_complete(struct nfs_direct_req *dreq)
{ struct inode *inode = dreq->inode;
/* Direct I/O on this inode is finished; unblocks inode_dio_wait() waiters. */
inode_dio_end(inode);
/* Async (aio) path: prefer reporting bytes transferred over the error. */
if (dreq->iocb) { long res = (long) dreq->error; if (dreq->count != 0) {
res = (long) dreq->count;
WARN_ON_ONCE(dreq->count < 0);
}
/* Deliver the final result to the async caller. */
dreq->iocb->ki_complete(dreq->iocb, res);
}
/* * For each rsize'd chunk of the user's buffer, dispatch an NFS READ * operation. If nfs_readdata_alloc() or get_user_pages() fails, * bail and stop sending more reads. Read length accounting is * handled automatically by nfs_direct_read_result(). Otherwise, if * no requests have been sent, just return an error.
*/
result = iov_iter_get_pages_alloc2(iter, &pagevec,
rsize, &pgbase); if (result < 0) break;
bytes = result;
npages = (result + pgbase + PAGE_SIZE - 1) / PAGE_SIZE; for (i = 0; i < npages; i++) { struct nfs_page *req; unsignedint req_len = min_t(size_t, bytes, PAGE_SIZE - pgbase); /* XXX do we need to do the eof zeroing found in async_filler? */
req = nfs_page_create_from_page(dreq->ctx, pagevec[i],
pgbase, pos, req_len); if (IS_ERR(req)) {
result = PTR_ERR(req); break;
} if (!nfs_pageio_add_request(&desc, req)) {
result = desc.pg_error;
nfs_release_request(req); break;
}
pgbase = 0;
bytes -= req_len;
requested_bytes += req_len;
pos += req_len;
}
nfs_direct_release_pages(pagevec, npages);
kvfree(pagevec); if (result < 0) break;
}
nfs_pageio_complete(&desc);
/* * If no bytes were started, return the error, and let the * generic layer handle the completion.
*/ if (requested_bytes == 0) {
inode_dio_end(inode);
nfs_direct_req_release(dreq); return result < 0 ? result : -EIO;
}
if (put_dreq(dreq))
nfs_direct_complete(dreq); return requested_bytes;
}
/** * nfs_file_direct_read - file direct read operation for NFS files * @iocb: target I/O control block * @iter: vector of user buffers into which to read data * @swap: flag indicating this is swap IO, not O_DIRECT IO * * We use this function for direct reads instead of calling * generic_file_aio_read() in order to avoid gfar's check to see if * the request starts before the end of the file. For that check * to work, we must generate a GETATTR before each direct read, and * even then there is a window between the GETATTR and the subsequent * READ where the file size could change. Our preference is simply * to do all reads the application wants, and the server will take * care of managing the end of file boundary. * * This function also eliminates unnecessarily updating the file's * atime locally, as the NFS server sets the file's atime, and this * client must read the updated atime from the server back into its * cache.
*/
ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter, bool swap)
{ struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; struct inode *inode = mapping->host; struct nfs_direct_req *dreq; struct nfs_lock_context *l_ctx;
ssize_t result, requested;
size_t count = iov_iter_count(iter);
nfs_add_stats(mapping->host, NFSIOS_DIRECTREADBYTES, count);
dfprintk(FILE, "NFS: direct read(%pD2, %zd@%Ld)\n",
file, count, (longlong) iocb->ki_pos);
result = 0; if (!count) goto out;
task_io_account_read(count);
result = -ENOMEM;
dreq = nfs_direct_req_alloc(); if (dreq == NULL) goto out;
if (user_backed_iter(iter))
dreq->flags = NFS_ODIRECT_SHOULD_DIRTY;
if (!swap) {
result = nfs_start_io_direct(inode); if (result) { /* release the reference that would usually be * consumed by nfs_direct_read_schedule_iovec()
*/
nfs_direct_req_release(dreq); goto out_release;
}
}
list_for_each_entry(req, list, wb_list) { if (req->wb_head != req) {
nfs_direct_add_page_head(&req->wb_list, req); continue;
}
subreq = req->wb_this_page; if (subreq == req) continue; do { /* * Remove subrequests from this list before freeing * them in the call to nfs_join_page_group().
*/ if (!list_empty(&subreq->wb_list)) {
nfs_list_remove_request(subreq);
nfs_release_request(subreq);
}
} while ((subreq = subreq->wb_this_page) != req);
nfs_join_page_group(req, cinfo, inode);
}
}
/* * NB: Return the value of the first error return code. Subsequent * errors after the first one are ignored.
*/ /* * For each wsize'd chunk of the user's buffer, dispatch an NFS WRITE * operation. If nfs_writedata_alloc() or get_user_pages() fails, * bail and stop sending more writes. Write length accounting is * handled automatically by nfs_direct_write_result(). Otherwise, if * no requests have been sent, just return an error.
*/ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq, struct iov_iter *iter,
loff_t pos, int ioflags)
{ struct nfs_pageio_descriptor desc; struct inode *inode = dreq->inode; struct nfs_commit_info cinfo;
ssize_t result = 0;
size_t requested_bytes = 0;
size_t wsize = max_t(size_t, NFS_SERVER(inode)->wsize, PAGE_SIZE); bool defer = false;
if (defer) {
nfs_mark_request_commit(req, NULL, &cinfo, 0); continue;
}
nfs_lock_request(req); if (nfs_pageio_add_request(&desc, req)) continue;
/* Exit on hard errors */ if (desc.pg_error < 0 && desc.pg_error != -EAGAIN) {
result = desc.pg_error;
nfs_unlock_and_release_request(req); break;
}
/* If the error is soft, defer remaining requests */
nfs_init_cinfo_from_dreq(&cinfo, dreq);
spin_lock(&dreq->lock);
dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
spin_unlock(&dreq->lock);
nfs_unlock_request(req);
nfs_mark_request_commit(req, NULL, &cinfo, 0);
desc.pg_error = 0;
defer = true;
}
nfs_direct_release_pages(pagevec, npages);
kvfree(pagevec); if (result < 0) break;
}
nfs_pageio_complete(&desc);
/* * If no bytes were started, return the error, and let the * generic layer handle the completion.
*/ if (requested_bytes == 0) {
inode_dio_end(inode);
nfs_direct_req_release(dreq); return result < 0 ? result : -EIO;
}
if (put_dreq(dreq))
nfs_direct_write_complete(dreq); return requested_bytes;
}
/** * nfs_file_direct_write - file direct write operation for NFS files * @iocb: target I/O control block * @iter: vector of user buffers from which to write data * @swap: flag indicating this is swap IO, not O_DIRECT IO * * We use this function for direct writes instead of calling * generic_file_aio_write() in order to avoid taking the inode * semaphore and updating the i_size. The NFS server will set * the new i_size and this client must read the updated size * back into its cache. We let the server do generic write * parameter checking and report problems. * * We eliminate local atime updates, see direct read above. * * We avoid unnecessary page cache invalidations for normal cached * readers of this file. * * Note that O_APPEND is not supported for NFS direct writes, as there * is no atomic O_APPEND write facility in the NFS protocol.
*/
ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter, bool swap)
{
ssize_t result, requested;
size_t count; struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; struct inode *inode = mapping->host; struct nfs_direct_req *dreq; struct nfs_lock_context *l_ctx;
loff_t pos, end;
dfprintk(FILE, "NFS: direct write(%pD2, %zd@%Ld)\n",
file, iov_iter_count(iter), (longlong) iocb->ki_pos);
if (swap) /* bypass generic checks */
result = iov_iter_count(iter); else
result = generic_write_checks(iocb, iter); if (result <= 0) return result;
count = result;
nfs_add_stats(mapping->host, NFSIOS_DIRECTWRITTENBYTES, count);
if (swap) {
requested = nfs_direct_write_schedule_iovec(dreq, iter, pos,
FLUSH_STABLE);
} else {
result = nfs_start_io_direct(inode); if (result) { /* release the reference that would usually be * consumed by nfs_direct_write_schedule_iovec()
*/
nfs_direct_req_release(dreq); goto out_release;
}
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereit gestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.