/* * fault_in_iov_iter_readable - fault in iov iterator for reading * @i: iterator * @size: maximum length * * Fault in one or more iovecs of the given iov_iter, to a maximum length of * @size. For each iovec, fault in each page that constitutes the iovec. * * Returns the number of bytes not faulted in (like copy_to_user() and * copy_from_user()). * * Always returns 0 for non-userspace iterators.
*/
size_t fault_in_iov_iter_readable(conststruct iov_iter *i, size_t size)
{ if (iter_is_ubuf(i)) {
size_t n = min(size, iov_iter_count(i));
n -= fault_in_readable(i->ubuf + i->iov_offset, n); return size - n;
} elseif (iter_is_iovec(i)) {
size_t count = min(size, iov_iter_count(i)); conststruct iovec *p;
size_t skip;
if (unlikely(!len)) continue;
ret = fault_in_readable(p->iov_base + skip, len);
count -= len - ret; if (ret) break;
} return count + size;
} return 0;
}
EXPORT_SYMBOL(fault_in_iov_iter_readable);
/* * fault_in_iov_iter_writeable - fault in iov iterator for writing * @i: iterator * @size: maximum length * * Faults in the iterator using get_user_pages(), i.e., without triggering * hardware page faults. This is primarily useful when we already know that * some or all of the pages in @i aren't in memory. * * Returns the number of bytes not faulted in, like copy_to_user() and * copy_from_user(). * * Always returns 0 for non-user-space iterators.
*/
size_t fault_in_iov_iter_writeable(conststruct iov_iter *i, size_t size)
{ if (iter_is_ubuf(i)) {
size_t n = min(size, iov_iter_count(i));
n -= fault_in_safe_writeable(i->ubuf + i->iov_offset, n); return size - n;
} elseif (iter_is_iovec(i)) {
size_t count = min(size, iov_iter_count(i)); conststruct iovec *p;
size_t skip;
/** * _copy_mc_to_iter - copy to iter with source memory error exception handling * @addr: source kernel address * @bytes: total transfer length * @i: destination iterator * * The pmem driver deploys this for the dax operation * (dax_copy_to_iter()) for dax reads (bypass page-cache and the * block-layer). Upon #MC read(2) aborts and returns EIO or the bytes * successfully copied. * * The main differences between this and typical _copy_to_iter(). * * * Typical tail/residue handling after a fault retries the copy * byte-by-byte until the fault happens again. Re-triggering machine * checks is potentially fatal so the implementation uses source * alignment and poison alignment assumptions to avoid re-triggering * hardware exceptions. * * * ITER_KVEC and ITER_BVEC can return short copies. Compare to * copy_to_iter() where only ITER_IOVEC attempts might return a short copy. * * Return: number of bytes copied (may be %0)
*/
size_t _copy_mc_to_iter(constvoid *addr, size_t bytes, struct iov_iter *i)
{ if (WARN_ON_ONCE(i->data_source)) return 0; if (user_backed_iter(i))
might_fault(); return iterate_and_advance(i, bytes, (void *)addr,
copy_to_user_iter_mc, memcpy_to_iter_mc);
}
EXPORT_SYMBOL_GPL(_copy_mc_to_iter); #endif/* CONFIG_ARCH_HAS_COPY_MC */
/** * _copy_from_iter_flushcache - write destination through cpu cache * @addr: destination kernel address * @bytes: total transfer length * @i: source iterator * * The pmem driver arranges for filesystem-dax to use this facility via * dax_copy_from_iter() for ensuring that writes to persistent memory * are flushed through the CPU cache. It is differentiated from * _copy_from_iter_nocache() in that guarantees all data is flushed for * all iterator types. The _copy_from_iter_nocache() only attempts to * bypass the cache for the ITER_IOVEC case, and on some archs may use * instructions that strand dirty-data in the cache. * * Return: number of bytes copied (may be %0)
*/
size_t _copy_from_iter_flushcache(void *addr, size_t bytes, struct iov_iter *i)
{ if (WARN_ON_ONCE(!i->data_source)) return 0;
/*
 * Sanity-check a copy of @n bytes starting @offset bytes into @page.
 *
 * Returns true when the range fits within PAGE_SIZE, or within the size
 * reported by page_size() for the compound head of @page; otherwise warns
 * and returns false.
 */
static inline bool page_copy_sane(struct page *page, size_t offset, size_t n)
{
	struct page *head;
	size_t v = n + offset;

	/*
	 * The general case needs to access the page order in order
	 * to compute the page size.
	 * However, we mostly deal with order-0 pages and thus can
	 * avoid a possible cache line miss for requests that fit all
	 * page orders.
	 *
	 * "n <= v" also rejects the case where n + offset wrapped around.
	 */
	if (n <= v && v <= PAGE_SIZE)
		return true;

	head = compound_head(page);
	/* Make the end offset relative to the head page. */
	v += (page - head) << PAGE_SHIFT;

	if (WARN_ON(n > v || v > page_size(head)))
		return false;
	return true;
}
/*
 * Copy @bytes bytes, starting @offset bytes into @page, to the iterator @i.
 *
 * The copy proceeds one PAGE_SIZE-bounded chunk at a time, mapping each
 * subpage with kmap_local_page() around the call to _copy_to_iter().
 *
 * Returns the number of bytes copied; 0 if the range fails
 * page_copy_sane() or if @i is marked as a data source.
 */
size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes,
			 struct iov_iter *i)
{
	size_t copied = 0;

	if (!page_copy_sane(page, offset, bytes))
		return 0;
	if (WARN_ON_ONCE(i->data_source))
		return 0;

	/* Step to the subpage containing @offset and make @offset page-relative. */
	page += offset / PAGE_SIZE;
	offset %= PAGE_SIZE;

	for (;;) {
		void *kaddr = kmap_local_page(page);
		size_t chunk = min(bytes, (size_t)PAGE_SIZE - offset);

		chunk = _copy_to_iter(kaddr + offset, chunk, i);
		kunmap_local(kaddr);

		copied += chunk;
		bytes -= chunk;
		/* Finished, or the iterator accepted nothing more. */
		if (!bytes || !chunk)
			break;

		offset += chunk;
		if (offset == PAGE_SIZE) {
			/* Current subpage exhausted; continue with the next one. */
			page++;
			offset = 0;
		}
	}

	return copied;
}
EXPORT_SYMBOL(copy_page_to_iter);
/** * iov_iter_folio_queue - Initialise an I/O iterator to use the folios in a folio queue * @i: The iterator to initialise. * @direction: The direction of the transfer. * @folioq: The starting point in the folio queue. * @first_slot: The first slot in the folio queue to use * @offset: The offset into the folio in the first slot to start at * @count: The size of the I/O buffer in bytes. * * Set up an I/O iterator to either draw data out of the pages attached to an * inode or to inject data into those pages. The pages *must* be prevented * from evaporation, either by taking a ref on them or locking them by the * caller.
*/ void iov_iter_folio_queue(struct iov_iter *i, unsignedint direction, conststruct folio_queue *folioq, unsignedint first_slot, unsignedint offset, size_t count)
{
BUG_ON(direction & ~1);
*i = (struct iov_iter) {
.iter_type = ITER_FOLIOQ,
.data_source = direction,
.folioq = folioq,
.folioq_slot = first_slot,
.count = count,
.iov_offset = offset,
};
}
EXPORT_SYMBOL(iov_iter_folio_queue);
/**
 * iov_iter_xarray - Initialise an I/O iterator to use the pages in an xarray
 * @i: The iterator to initialise.
 * @direction: The direction of the transfer.
 * @xarray: The xarray to access.
 * @start: The start file position.
 * @count: The size of the I/O buffer in bytes.
 *
 * Set up an I/O iterator to either draw data out of the pages attached to an
 * inode or to inject data into those pages.  The pages *must* be prevented
 * from evaporation, either by taking a ref on them or locking them by the
 * caller.
 */
void iov_iter_xarray(struct iov_iter *i, unsigned int direction,
		     struct xarray *xarray, loff_t start, size_t count)
{
	/* Only bit 0 of @direction may be set. */
	BUG_ON(direction & ~1);
	*i = (struct iov_iter) {
		.iter_type = ITER_XARRAY,
		.data_source = direction,
		.xarray = xarray,
		.xarray_start = start,
		.count = count,
		.iov_offset = 0
	};
}
EXPORT_SYMBOL(iov_iter_xarray);
/** * iov_iter_discard - Initialise an I/O iterator that discards data * @i: The iterator to initialise. * @direction: The direction of the transfer. * @count: The size of the I/O buffer in bytes. * * Set up an I/O iterator that just discards everything that's written to it. * It's only available as a READ iterator.
*/ void iov_iter_discard(struct iov_iter *i, unsignedint direction, size_t count)
{
BUG_ON(direction != READ);
*i = (struct iov_iter){
.iter_type = ITER_DISCARD,
.data_source = false,
.count = count,
.iov_offset = 0
};
}
EXPORT_SYMBOL(iov_iter_discard);
if (len > size)
len = size; if (len & len_mask) returnfalse; if ((unsignedlong)(bvec->bv_offset + skip) & addr_mask) returnfalse;
bvec++;
size -= len;
skip = 0;
} while (size);
returntrue;
}
/** * iov_iter_is_aligned() - Check if the addresses and lengths of each segments * are aligned to the parameters. * * @i: &struct iov_iter to restore * @addr_mask: bit mask to check against the iov element's addresses * @len_mask: bit mask to check against the iov element's lengths * * Return: false if any addresses or lengths intersect with the provided masks
*/ bool iov_iter_is_aligned(conststruct iov_iter *i, unsigned addr_mask, unsigned len_mask)
{ if (likely(iter_is_ubuf(i))) { if (i->count & len_mask) returnfalse; if ((unsignedlong)(i->ubuf + i->iov_offset) & addr_mask) returnfalse; returntrue;
}
if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i))) return iov_iter_aligned_iovec(i, addr_mask, len_mask);
if (iov_iter_is_bvec(i)) return iov_iter_aligned_bvec(i, addr_mask, len_mask);
/* With both xarray and folioq types, we're dealing with whole folios. */ if (iov_iter_is_xarray(i)) { if (i->count & len_mask) returnfalse; if ((i->xarray_start + i->iov_offset) & addr_mask) returnfalse;
} if (iov_iter_is_folioq(i)) { if (i->count & len_mask) returnfalse; if (i->iov_offset & addr_mask) returnfalse;
}
do {
size_t len = iov->iov_len - skip; if (len) {
res |= (unsignedlong)iov->iov_base + skip; if (len > size)
len = size;
res |= len;
size -= len;
}
iov++;
skip = 0;
} while (size); return res;
}
do {
size_t len = bvec->bv_len - skip;
res |= (unsignedlong)bvec->bv_offset + skip; if (len > size)
len = size;
res |= len;
bvec++;
size -= len;
skip = 0;
} while (size);
/* iovec and kvec have identical layouts */ if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i))) return iov_iter_alignment_iovec(i);
if (iov_iter_is_bvec(i)) return iov_iter_alignment_bvec(i);
/* With both xarray and folioq types, we're dealing with whole folios. */ if (iov_iter_is_folioq(i)) return i->iov_offset | i->count; if (iov_iter_is_xarray(i)) return (i->xarray_start + i->iov_offset) | i->count;
return 0;
}
EXPORT_SYMBOL(iov_iter_alignment);
unsignedlong iov_iter_gap_alignment(conststruct iov_iter *i)
{ unsignedlong res = 0; unsignedlong v = 0;
size_t size = i->count; unsigned k;
if (iter_is_ubuf(i)) return 0;
if (WARN_ON(!iter_is_iovec(i))) return ~0U;
for (k = 0; k < i->nr_segs; k++) { conststruct iovec *iov = iter_iov(i) + k; if (iov->iov_len) { unsignedlong base = (unsignedlong)iov->iov_base; if (v) // if not the first one
res |= base | v; // this start | previous end
v = base + iov->iov_len; if (size <= iov->iov_len) break;
size -= iov->iov_len;
}
} return res;
}
EXPORT_SYMBOL(iov_iter_gap_alignment);
/* must be done on non-empty ITER_UBUF or ITER_IOVEC one */ staticunsignedlong first_iovec_segment(conststruct iov_iter *i, size_t *size)
{
size_t skip; long k;
if (iter_is_ubuf(i)) return (unsignedlong)i->ubuf + i->iov_offset;
for (k = 0, skip = i->iov_offset; k < i->nr_segs; k++, skip = 0) { conststruct iovec *iov = iter_iov(i) + k;
size_t len = iov->iov_len - skip;
if (unlikely(!len)) continue; if (*size > len)
*size = len; return (unsignedlong)iov->iov_base + skip;
}
BUG(); // if it had been empty, we wouldn't get called
}
/* must be done on non-empty ITER_BVEC one */ staticstruct page *first_bvec_segment(conststruct iov_iter *i,
size_t *size, size_t *start)
{ struct page *page;
size_t skip = i->iov_offset, len;
/* * SuS says "The readv() function *may* fail if the iovcnt argument was * less than or equal to 0, or greater than {IOV_MAX}. Linux has * traditionally returned zero for zero segments, so...
*/ if (nr_segs == 0) return iov; if (nr_segs > UIO_MAXIOV) return ERR_PTR(-EINVAL); if (nr_segs > fast_segs) {
iov = kmalloc_array(nr_segs, sizeof(struct iovec), GFP_KERNEL); if (!iov) return ERR_PTR(-ENOMEM);
}
if (unlikely(compat))
ret = copy_compat_iovec_from_user(iov, uvec, nr_segs); else
ret = copy_iovec_from_user(iov, uvec, nr_segs); if (ret) { if (iov != fast_iov)
kfree(iov); return ERR_PTR(ret);
}
return iov;
}
/* * Single segment iovec supplied by the user, import it as ITER_UBUF.
*/ static ssize_t __import_iovec_ubuf(int type, conststruct iovec __user *uvec, struct iovec **iovp, struct iov_iter *i, bool compat)
{ struct iovec *iov = *iovp;
ssize_t ret;
*iovp = NULL;
if (compat)
ret = copy_compat_iovec_from_user(iov, uvec, 1); else
ret = copy_iovec_from_user(iov, uvec, 1); if (unlikely(ret)) return ret;
ret = import_ubuf(type, iov->iov_base, iov->iov_len, i); if (unlikely(ret)) return ret; return i->count;
}
/* * According to the Single Unix Specification we should return EINVAL if * an element length is < 0 when cast to ssize_t or if the total length * would overflow the ssize_t return value of the system call. * * Linux caps all read/write calls to MAX_RW_COUNT, and avoids the * overflow case.
*/ for (seg = 0; seg < nr_segs; seg++) {
ssize_t len = (ssize_t)iov[seg].iov_len;
if (!access_ok(iov[seg].iov_base, len)) { if (iov != *iovp)
kfree(iov);
*iovp = NULL; return -EFAULT;
}
if (len > MAX_RW_COUNT - total_len) {
len = MAX_RW_COUNT - total_len;
iov[seg].iov_len = len;
}
total_len += len;
}
/** * import_iovec() - Copy an array of &struct iovec from userspace * into the kernel, check that it is valid, and initialize a new * &struct iov_iter iterator to access it. * * @type: One of %READ or %WRITE. * @uvec: Pointer to the userspace array. * @nr_segs: Number of elements in userspace array. * @fast_segs: Number of elements in @iov. * @iovp: (input and output parameter) Pointer to pointer to (usually small * on-stack) kernel array. * @i: Pointer to iterator that will be initialized on success. * * If the array pointed to by *@iov is large enough to hold all @nr_segs, * then this function places %NULL in *@iov on return. Otherwise, a new * array will be allocated and the result placed in *@iov. This means that * the caller may call kfree() on *@iov regardless of whether the small * on-stack array was used or not (and regardless of whether this function * returns an error or not). * * Return: Negative error code on error, bytes imported on success
*/
ssize_t import_iovec(int type, conststruct iovec __user *uvec, unsigned nr_segs, unsigned fast_segs, struct iovec **iovp, struct iov_iter *i)
{ return __import_iovec(type, uvec, nr_segs, fast_segs, iovp, i,
in_compat_syscall());
}
EXPORT_SYMBOL(import_iovec);
int import_ubuf(int rw, void __user *buf, size_t len, struct iov_iter *i)
{ if (len > MAX_RW_COUNT)
len = MAX_RW_COUNT; if (unlikely(!access_ok(buf, len))) return -EFAULT;
/**
 * iov_iter_restore() - Restore a &struct iov_iter to the same state as when
 *     iov_iter_save_state() was called.
 *
 * @i: &struct iov_iter to restore
 * @state: state to restore from
 *
 * Used after iov_iter_save_state() to restore @i, if operations may
 * have advanced it.
 *
 * Note: only works on ITER_IOVEC, ITER_BVEC, ITER_KVEC and ITER_UBUF
 */
void iov_iter_restore(struct iov_iter *i, struct iov_iter_state *state)
{
	/*
	 * All four supported types are tested inside the WARN so that a
	 * kvec iterator, which is restored below, does not raise a warning.
	 */
	if (WARN_ON_ONCE(!iov_iter_is_bvec(i) && !iter_is_iovec(i) &&
			 !iter_is_ubuf(i) && !iov_iter_is_kvec(i)))
		return;
	i->iov_offset = state->iov_offset;
	i->count = state->count;
	/* ITER_UBUF has no segment array to rewind. */
	if (iter_is_ubuf(i))
		return;
	/*
	 * For the *vec iters, nr_segs + iov is constant - if we increment
	 * the vec, then we also decrement the nr_segs count. Hence we don't
	 * need to track both of these, just one is enough and we can deduct
	 * the other from that. ITER_KVEC and ITER_IOVEC are the same struct
	 * size, so we can just increment the iov pointer as they are unionized.
	 * ITER_BVEC _may_ be the same size on some archs, but on others it is
	 * not. Be safe and handle it separately.
	 */
	BUILD_BUG_ON(sizeof(struct iovec) != sizeof(struct kvec));
	if (iov_iter_is_bvec(i))
		i->bvec -= state->nr_segs - i->nr_segs;
	else
		i->__iov -= state->nr_segs - i->nr_segs;
	i->nr_segs = state->nr_segs;
}
/* * Extract a list of contiguous pages from an ITER_FOLIOQ iterator. This does * not get references on the pages, nor does it get a pin on them.
*/ static ssize_t iov_iter_extract_folioq_pages(struct iov_iter *i, struct page ***pages, size_t maxsize, unsignedint maxpages,
iov_iter_extraction_t extraction_flags,
size_t *offset0)
{ conststruct folio_queue *folioq = i->folioq; struct page **p; unsignedint nr = 0;
size_t extracted = 0, offset, slot = i->folioq_slot;
if (slot >= folioq_nr_slots(folioq)) {
folioq = folioq->next;
slot = 0; if (WARN_ON(i->iov_offset != 0)) return -EIO;
}
/* * Extract a list of contiguous pages from an ITER_XARRAY iterator. This does not * get references on the pages, nor does it get a pin on them.
*/ static ssize_t iov_iter_extract_xarray_pages(struct iov_iter *i, struct page ***pages, size_t maxsize, unsignedint maxpages,
iov_iter_extraction_t extraction_flags,
size_t *offset0)
{ struct page **p; struct folio *folio; unsignedint nr = 0, offset;
loff_t pos = i->xarray_start + i->iov_offset;
XA_STATE(xas, i->xarray, pos >> PAGE_SHIFT);
offset = pos & ~PAGE_MASK;
*offset0 = offset;
maxpages = want_pages_array(pages, maxsize, offset, maxpages); if (!maxpages) return -ENOMEM;
p = *pages;
rcu_read_lock(); for (folio = xas_load(&xas); folio; folio = xas_next(&xas)) { if (xas_retry(&xas, folio)) continue;
/* Has the folio moved or been split? */ if (unlikely(folio != xas_reload(&xas))) {
xas_reset(&xas); continue;
}
/* * Extract a list of virtually contiguous pages from an ITER_BVEC iterator. * This does not get references on the pages, nor does it get a pin on them.
*/ static ssize_t iov_iter_extract_bvec_pages(struct iov_iter *i, struct page ***pages, size_t maxsize, unsignedint maxpages,
iov_iter_extraction_t extraction_flags,
size_t *offset0)
{
size_t skip = i->iov_offset, size = 0; struct bvec_iter bi; int k = 0;
/* * The iov_iter_extract_pages interface only allows an offset * into the first page. Break out of the loop if we see an * offset into subsequent pages, the caller will have to call * iov_iter_extract_pages again for the remainder.
*/ if (k) { if (bv.bv_offset) break;
} else {
*offset0 = bv.bv_offset;
}
(*pages)[k++] = bv.bv_page;
size += bv.bv_len;
if (k >= maxpages) break;
/* * We are done when the end of the bvec doesn't align to a page * boundary as that would create a hole in the returned space. * The caller will handle this with another call to * iov_iter_extract_pages.
*/ if (bv.bv_offset + bv.bv_len != PAGE_SIZE) break;
/* * Extract a list of virtually contiguous pages from an ITER_KVEC iterator. * This does not get references on the pages, nor does it get a pin on them.
*/ static ssize_t iov_iter_extract_kvec_pages(struct iov_iter *i, struct page ***pages, size_t maxsize, unsignedint maxpages,
iov_iter_extraction_t extraction_flags,
size_t *offset0)
{ struct page **p, *page; constvoid *kaddr;
size_t skip = i->iov_offset, offset, len, size; int k;
for (;;) { if (i->nr_segs == 0) return 0;
size = min(maxsize, i->kvec->iov_len - skip); if (size) break;
i->iov_offset = 0;
i->nr_segs--;
i->kvec++;
skip = 0;
}
/* * Extract a list of contiguous pages from a user iterator and get a pin on * each of them. This should only be used if the iterator is user-backed * (IOVEC/UBUF). * * It does not get refs on the pages, but the pages must be unpinned by the * caller once the transfer is complete. * * This is safe to be used where background IO/DMA *is* going to be modifying * the buffer; using a pin rather than a ref forces fork() to give the * child a copy of the page.
*/ static ssize_t iov_iter_extract_user_pages(struct iov_iter *i, struct page ***pages,
size_t maxsize, unsignedint maxpages,
iov_iter_extraction_t extraction_flags,
size_t *offset0)
{ unsignedlong addr; unsignedint gup_flags = 0;
size_t offset; int res;
if (i->data_source == ITER_DEST)
gup_flags |= FOLL_WRITE; if (extraction_flags & ITER_ALLOW_P2PDMA)
gup_flags |= FOLL_PCI_P2PDMA; if (i->nofault)
gup_flags |= FOLL_NOFAULT;
/**
 * iov_iter_extract_pages - Extract a list of contiguous pages from an iterator
 * @i: The iterator to extract from
 * @pages: Where to return the list of pages
 * @maxsize: The maximum amount of iterator to extract
 * @maxpages: The maximum size of the list of pages
 * @extraction_flags: Flags to qualify request
 * @offset0: Where to return the starting offset into (*@pages)[0]
 *
 * Extract a list of contiguous pages from the current point of the iterator,
 * advancing the iterator.  The maximum number of pages and the maximum amount
 * of page contents can be set.
 *
 * If *@pages is NULL, a page list will be allocated to the required size and
 * *@pages will be set to its base.  If *@pages is not NULL, it will be assumed
 * that the caller allocated a page list at least @maxpages in size and this
 * will be filled in.
 *
 * @extraction_flags can have ITER_ALLOW_P2PDMA set to request peer-to-peer DMA
 * be allowed on the pages extracted.
 *
 * The iov_iter_extract_will_pin() function can be used to query how cleanup
 * should be performed.
 *
 * Extra refs or pins on the pages may be obtained as follows:
 *
 *  (*) If the iterator is user-backed (ITER_IOVEC/ITER_UBUF), pins will be
 *      added to the pages, but refs will not be taken.
 *      iov_iter_extract_will_pin() will return true.
 *
 *  (*) If the iterator is ITER_KVEC, ITER_BVEC, ITER_FOLIOQ or ITER_XARRAY, the
 *      pages are merely listed; no extra refs or pins are obtained.
 *      iov_iter_extract_will_pin() will return 0.
 *
 * Note also:
 *
 *  (*) Use with ITER_DISCARD is not supported as that has no content.
 *
 * On success, the function sets *@pages to the new pagelist, if allocated, and
 * sets *offset0 to the offset into the first page.
 *
 * It may also return -ENOMEM and -EFAULT.
 */
ssize_t iov_iter_extract_pages(struct iov_iter *i,
			       struct page ***pages,
			       size_t maxsize,
			       unsigned int maxpages,
			       iov_iter_extraction_t extraction_flags,
			       size_t *offset0)
{
	/* Clamp the request to what the iterator holds and to MAX_RW_COUNT. */
	maxsize = min_t(size_t, min_t(size_t, maxsize, i->count), MAX_RW_COUNT);
	if (!maxsize)
		return 0;

	/* Dispatch on the iterator type; unsupported types get -EFAULT. */
	if (likely(user_backed_iter(i)))
		return iov_iter_extract_user_pages(i, pages, maxsize,
						   maxpages, extraction_flags,
						   offset0);
	if (iov_iter_is_kvec(i))
		return iov_iter_extract_kvec_pages(i, pages, maxsize,
						   maxpages, extraction_flags,
						   offset0);
	if (iov_iter_is_bvec(i))
		return iov_iter_extract_bvec_pages(i, pages, maxsize,
						   maxpages, extraction_flags,
						   offset0);
	if (iov_iter_is_folioq(i))
		return iov_iter_extract_folioq_pages(i, pages, maxsize,
						     maxpages, extraction_flags,
						     offset0);
	if (iov_iter_is_xarray(i))
		return iov_iter_extract_xarray_pages(i, pages, maxsize,
						     maxpages, extraction_flags,
						     offset0);
	return -EFAULT;
}
EXPORT_SYMBOL_GPL(iov_iter_extract_pages);
Messung V0.5
¤ Dauer der Verarbeitung: 0.7 Sekunden
(vorverarbeitet)
¤
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereitgestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.