	/* Direct lock must be taken before the extent lock. */
	if (nowait) {
		if (!btrfs_try_lock_dio_extent(io_tree, lockstart, lockend,
					       cached_state))
			return -EAGAIN;
	} else {
		btrfs_lock_dio_extent(io_tree, lockstart, lockend, cached_state);
	}
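	/*
	 * The DIO extent lock is held for the entire duration of the direct
	 * IO, while the regular extent lock below may be dropped and retaken
	 * as we wait for ordered extents to complete.
	 */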
	while (1) {
		if (nowait) {
			if (!btrfs_try_lock_extent(io_tree, lockstart, lockend,
						   cached_state)) {
				ret = -EAGAIN;
				break;
			}
		} else {
			btrfs_lock_extent(io_tree, lockstart, lockend, cached_state);
		}
		/*
		 * We're concerned with the entire range that we're going to be
		 * doing DIO to, so we need to make sure there are no ordered
		 * extents in this range.
		 */
		ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), lockstart,
						     lockend - lockstart + 1);
		/*
		 * We need to make sure there are no buffered pages in this
		 * range either. We could have raced between the invalidate in
		 * generic_file_direct_write() and locking the extent. The
		 * invalidate needs to happen so that reads after a write do
		 * not get stale data.
		 */
		if (!ordered &&
		    (!writing || !filemap_range_has_page(inode->i_mapping,
							 lockstart, lockend)))
			break;
		btrfs_unlock_extent(io_tree, lockstart, lockend, cached_state);

		if (ordered) {
			if (nowait) {
				btrfs_put_ordered_extent(ordered);
				ret = -EAGAIN;
				break;
			}
			/*
			 * If we are doing a DIO read and the ordered extent we
			 * found is for a buffered write, we can not wait for it
			 * to complete and retry, because if we do so we can
			 * deadlock with concurrent buffered writes on page
			 * locks. This happens only if our DIO read covers more
			 * than one extent map, if at this point it has already
			 * created an ordered extent for a previous extent map
			 * and locked its range in the inode's io tree, and a
			 * concurrent write against that previous extent map's
			 * range and this range started (we unlock the ranges
			 * in the io tree only when the bios complete and
			 * buffered writes always lock pages before attempting
			 * to lock a range in the io tree).
			 */
			if (writing ||
			    test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags))
				btrfs_start_ordered_extent(ordered);
			else
				ret = nowait ? -EAGAIN : -ENOTBLK;
			btrfs_put_ordered_extent(ordered);
		} else {
			/*
			 * We could trigger writeback for this range (and wait
			 * for it to complete) and then invalidate the pages for
			 * this range (through invalidate_inode_pages2_range()),
			 * but that can lead us to a deadlock with a concurrent
			 * call to readahead (a buffered read or a defrag call
			 * triggered a readahead) on a page lock due to an
			 * ordered dio extent we created before but did not have
			 * yet a corresponding bio submitted (whence it can not
			 * complete), which makes readahead wait for that
			 * ordered extent to complete while holding a lock on
			 * that page.
			 */
			ret = nowait ? -EAGAIN : -ENOTBLK;
		}

		if (ret)
			break;

		cond_resched();
	}
	if (ret)
		btrfs_unlock_dio_extent(io_tree, lockstart, lockend, cached_state);

	return ret;
}
	/*
	 * We don't allocate a new extent in the following cases:
	 *
	 * 1) The inode is marked as NODATACOW. In this case we'll just use the
	 *    existing extent.
	 * 2) The extent is marked as PREALLOC. We're good to go here and can
	 *    just use the extent.
	 */
	if ((em->flags & EXTENT_FLAG_PREALLOC) ||
	    ((BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
	     em->disk_bytenr != EXTENT_MAP_HOLE)) {
		if (em->flags & EXTENT_FLAG_PREALLOC)
			type = BTRFS_ORDERED_PREALLOC;
		else
			type = BTRFS_ORDERED_NOCOW;
		len = min(len, em->len - (start - em->start));
		block_start = btrfs_extent_map_block_start(em) + (start - em->start);
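		/*
		 * can_nocow_extent() returns 1 when the write can go directly
		 * into the existing extent (possibly trimming *len to the part
		 * that is NOCOW-able), 0 when the range must be COWed, and a
		 * negative errno on failure.
		 */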
		if (can_nocow_extent(BTRFS_I(inode), start, &len, &file_extent, false) == 1) {
			bg = btrfs_inc_nocow_writers(fs_info, block_start);
			if (bg)
				can_nocow = true;
		}
	}
	prev_len = len;
	if (can_nocow) {
		struct extent_map *em2;

		/* We can NOCOW, so only need to reserve metadata space. */
		ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len, len,
						      nowait);
		if (ret < 0) {
			/* Our caller expects us to free the input extent map. */
			btrfs_free_extent_map(em);
			*map = NULL;
			btrfs_dec_nocow_writers(bg);
			if (nowait && (ret == -ENOSPC || ret == -EDQUOT))
				ret = -EAGAIN;
			goto out;
		}
		space_reserved = true;

		/* Create the ordered extent covering this NOCOW/PREALLOC write. */
		em2 = btrfs_create_dio_extent(BTRFS_I(inode), dio_data, start,
					      &file_extent, type);
		btrfs_dec_nocow_writers(bg);
		if (type == BTRFS_ORDERED_PREALLOC) {
			btrfs_free_extent_map(em);
			*map = em2;
			em = em2;
		}

		if (IS_ERR(em2)) {
			ret = PTR_ERR(em2);
			goto out;
		}
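		/*
		 * Tell btrfs_dio_iomap_begin() that any data space reserved up
		 * front can be released, since a NOCOW write does not allocate
		 * new data space.
		 */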
dio_data->nocow_done = true;
	} else {
		/* Our caller expects us to free the input extent map. */
		btrfs_free_extent_map(em);
		*map = NULL;

		if (nowait) {
			ret = -EAGAIN;
			goto out;
		}

		/*
		 * If we could not allocate data space before locking the file
		 * range and we can't do a NOCOW write, then we have to fail.
		 */
		if (!dio_data->data_space_reserved) {
			ret = -ENOSPC;
			goto out;
		}

		/*
		 * We have to COW and we have already reserved data space
		 * before, so now we reserve only metadata.
		 */
		ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len, len,
						      false);
		if (ret < 0)
			goto out;
		space_reserved = true;

		em = btrfs_new_extent_direct(BTRFS_I(inode), dio_data, start, len);
		if (IS_ERR(em)) {
			ret = PTR_ERR(em);
			goto out;
		}
		*map = em;
		len = min(len, em->len - (start - em->start));
		if (len < prev_len)
			btrfs_delalloc_release_metadata(BTRFS_I(inode),
							prev_len - len, true);
	}
	/*
	 * We have created our ordered extent, so we can now release our
	 * reservation for an outstanding extent.
	 */
	btrfs_delalloc_release_extents(BTRFS_I(inode), prev_len);

	/*
	 * Need to update the i_size under the extent lock so buffered
	 * readers will get the updated i_size when we unlock.
	 */
	if (start + len > i_size_read(inode))
		i_size_write(inode, start + len);
out:
	if (ret && space_reserved) {
		btrfs_delalloc_release_extents(BTRFS_I(inode), len);
		btrfs_delalloc_release_metadata(BTRFS_I(inode), len, true);
	}
	*lenp = len;
	return ret;
}
	/*
	 * We could potentially fault if we have a buffer > PAGE_SIZE, and if
	 * we're NOWAIT we may submit a bio for a partial range and return
	 * EIOCBQUEUED, which would result in an errant short read.
	 *
	 * The best way to handle this would be to allow for partial completions
	 * of iocb's, so we could submit the partial bio, return and fault in
	 * the rest of the pages, and then submit the io for the rest of the
	 * range. However we don't have that currently, so simply return
	 * -EAGAIN at this point so that the normal path is used.
	 */
	if (!write && (flags & IOMAP_NOWAIT) && length > PAGE_SIZE)
		return -EAGAIN;

	/*
	 * Cap the size of reads to that usually seen in buffered I/O as we need
	 * to allocate a contiguous array for the checksums.
	 */
	if (!write)
		len = min_t(u64, len, fs_info->sectorsize * BTRFS_MAX_BIO_SECTORS);
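	/* Extent io tree ranges are inclusive, hence the "- 1" for lockend. */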
lockstart = start;
lockend = start + len - 1;
	/*
	 * iomap_dio_rw() only does filemap_write_and_wait_range(), which isn't
	 * enough if we've written compressed pages to this area, so we need to
	 * flush the dirty pages again to make absolutely sure that any
	 * outstanding dirty pages are on disk - the first flush only starts
	 * compression on the data, while keeping the pages locked, so by the
	 * time the second flush returns we know bios for the compressed pages
	 * were submitted and finished, and the pages are no longer under
	 * writeback.
	 *
	 * If we have a NOWAIT request and we have any pages in the range that
	 * are locked, likely due to compression still in progress, we don't want
	 * to block on page locks. We also don't want to block on pages marked as
	 * dirty or under writeback (same as for the non-compression case).
	 * iomap_dio_rw() did the same check, but after that and before we got
	 * here, mmap'ed writes may have happened or buffered reads started
	 * (readpage() and readahead(), which lock pages), as we haven't locked
	 * the file range yet.
	 */
	if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
		     &BTRFS_I(inode)->runtime_flags)) {
		if (flags & IOMAP_NOWAIT) {
			if (filemap_range_needs_writeback(inode->i_mapping,
							  lockstart, lockend))
				return -EAGAIN;
		} else {
			ret = filemap_fdatawrite_range(inode->i_mapping, start,
						       start + length - 1);
			if (ret)
				return ret;
		}
	}
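	/*
	 * dio_data is shared with the other DIO callbacks (bio submission and
	 * iomap_end) through the iomap iterator's private pointer, so start
	 * this iteration from a clean state.
	 */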
memset(dio_data, 0, sizeof(*dio_data));
	/*
	 * We always try to allocate data space and must do it before locking
	 * the file range, to avoid deadlocks with concurrent writes to the same
	 * range if the range has several extents and the writes don't expand the
	 * current i_size (the inode lock is taken in shared mode). If we fail to
	 * allocate data space here we continue and later, after locking the
	 * file range, we fail with ENOSPC only if we figure out we can not do a
	 * NOCOW write.
	 */
	if (write && !(flags & IOMAP_NOWAIT)) {
		ret = btrfs_check_data_free_space(BTRFS_I(inode),
						  &dio_data->data_reserved,
						  start, data_alloc_len, false);
		if (!ret)
			dio_data->data_space_reserved = true;
		else if (!(BTRFS_I(inode)->flags &
			   (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)))
			goto err;
	}
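	/*
	 * Any excess of this up front reservation over what this iteration
	 * actually maps is released in the write branch further below.
	 */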
	/*
	 * If this errors out it's because we couldn't invalidate pagecache for
	 * this range and we need to fall back to buffered IO, or we are doing a
	 * NOWAIT read/write and we need to block.
	 */
	ret = lock_extent_direct(inode, lockstart, lockend, &cached_state, flags);
	if (ret < 0)
		goto err;

	em = btrfs_get_extent(BTRFS_I(inode), NULL, start, len);
	if (IS_ERR(em)) {
		ret = PTR_ERR(em);
		goto unlock_err;
	}
	/*
	 * Ok for INLINE and COMPRESSED extents we need to fall back on buffered
	 * io. INLINE is special, and we could probably kludge it in here, but
	 * it's still buffered so for safety let's just fall back to the generic
	 * buffered path.
	 *
	 * For COMPRESSED we _have_ to read the entire extent in so we can
	 * decompress it, so there will be buffering required no matter what we
	 * do, so go ahead and fall back to buffered.
	 *
	 * We return -ENOTBLK because that's what makes DIO go ahead and go back
	 * to buffered IO. Don't blame me, this is the price we pay for using
	 * the generic code.
	 */
	if (btrfs_extent_map_is_compressed(em) || em->disk_bytenr == EXTENT_MAP_INLINE) {
		btrfs_free_extent_map(em);
		/*
		 * If we are in a NOWAIT context, return -EAGAIN in order to
		 * fall back to buffered IO. This is not only because we can
		 * block with buffered IO (no support for NOWAIT semantics at
		 * the moment) but also to avoid returning short reads to user
		 * space - this happens if we were able to read some data from
		 * previous non-compressed extents and then when we fall back to
		 * buffered IO, at btrfs_file_read_iter() by calling
		 * filemap_read(), we fail to fault in pages for the read buffer,
		 * in which case filemap_read() returns a short read (the number
		 * of bytes previously read is > 0, so it does not return -EFAULT).
		 */
		ret = (flags & IOMAP_NOWAIT) ? -EAGAIN : -ENOTBLK;
		goto unlock_err;
	}
	len = min(len, em->len - (start - em->start));

	/*
	 * If we have a NOWAIT request and the range contains multiple extents
	 * (or a mix of extents and holes), then we return -EAGAIN to make the
	 * caller fall back to a context where it can do a blocking (without
	 * NOWAIT) request. This way we avoid doing partial IO and returning
	 * success to the caller, which is not optimal for writes and for reads
	 * it can result in unexpected behaviour for an application.
	 *
	 * When doing a read, because we use IOMAP_DIO_PARTIAL when calling
	 * iomap_dio_rw(), we can end up returning less data than what the caller
	 * asked for, resulting in an unexpected, and incorrect, short read.
	 * That is, the caller asked to read N bytes and we return less than that,
	 * which is wrong unless we are crossing EOF. This happens if we get a
	 * page fault error when trying to fault in pages for the buffer that is
	 * associated to the struct iov_iter passed to iomap_dio_rw(), and we
	 * have previously submitted bios for other extents in the range, in
	 * which case iomap_dio_rw() may return us EIOCBQUEUED if not all of
	 * those bios have completed by the time we get the page fault error,
	 * which we return back to our caller - we should only return EIOCBQUEUED
	 * after we have submitted bios for all the extents in the range.
	 */
	if ((flags & IOMAP_NOWAIT) && len < length) {
		btrfs_free_extent_map(em);
		ret = -EAGAIN;
		goto unlock_err;
	}
	if (write) {
		ret = btrfs_get_blocks_direct_write(&em, inode, dio_data,
						    start, &len, flags);
		if (ret < 0)
			goto unlock_err;
		/* Recalc len in case the new em is smaller than requested */
		len = min(len, em->len - (start - em->start));
		if (dio_data->data_space_reserved) {
			u64 release_offset;
			u64 release_len = 0;

			if (dio_data->nocow_done) {
				release_offset = start;
				release_len = data_alloc_len;
			} else if (len < data_alloc_len) {
				release_offset = start + len;
				release_len = data_alloc_len - len;
			}

			if (release_len > 0)
				btrfs_free_reserved_data_space(BTRFS_I(inode),
							       dio_data->data_reserved,
							       release_offset,
							       release_len);
		}
	}
	/*
	 * Translate extent map information to iomap.
	 * We trim the extents (and move the addr) even though iomap code does
	 * that, since we have locked only the parts we are performing I/O in.
	 */
	if ((em->disk_bytenr == EXTENT_MAP_HOLE) ||
	    ((em->flags & EXTENT_FLAG_PREALLOC) && !write)) {
		iomap->addr = IOMAP_NULL_ADDR;
		iomap->type = IOMAP_HOLE;
	} else {
		iomap->addr = btrfs_extent_map_block_start(em) + (start - em->start);
		iomap->type = IOMAP_MAPPED;
	}
iomap->offset = start;
iomap->bdev = fs_info->fs_devices->latest_dev->bdev;
iomap->length = len;
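	/*
	 * If len is shorter than the requested length, iomap will call
	 * ->iomap_begin() again for the remainder of the range.
	 */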
btrfs_free_extent_map(em);
	/*
	 * Reads will hold the EXTENT_DIO_LOCKED bit until the io is completed,
	 * writes only hold it for this part. We hold the extent lock until
	 * we're completely done with the extent map to make sure it remains
	 * valid.
	 */
	if (write)
		unlock_bits |= EXTENT_DIO_LOCKED;

	btrfs_clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
			       unlock_bits, &cached_state);

	/* We didn't use everything, unlock the dio extent for the remainder. */
	if (!write && (start + len) < lockend)
		btrfs_unlock_dio_extent(&BTRFS_I(inode)->io_tree, start + len,
					lockend, NULL);

	return 0;
unlock_err:
	/*
	 * Don't use EXTENT_LOCK_BITS here in case we extend it later and forget
	 * to update this, be explicit that we expect EXTENT_LOCKED and
	 * EXTENT_DIO_LOCKED to be set here, and so that's what we're clearing.
	 */
	btrfs_clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
			       EXTENT_LOCKED | EXTENT_DIO_LOCKED, &cached_state);
err:
	if (dio_data->data_space_reserved) {
		btrfs_free_reserved_data_space(BTRFS_I(inode),
					       dio_data->data_reserved,
					       start, data_alloc_len);
		extent_changeset_free(dio_data->data_reserved);
	}

	return ret;
}
	/* Must always be called for the beginning of an ordered extent. */
	if (WARN_ON_ONCE(start != ordered->disk_bytenr))
		return -EINVAL;

	/* No need to split if the ordered extent covers the entire bio. */
	if (ordered->disk_num_bytes == len) {
		refcount_inc(&ordered->refs);
		bbio->ordered = ordered;
		return 0;
	}
	/*
	 * Don't split the extent_map for NOCOW extents, as we're writing into
	 * a pre-existing one.
	 */
	if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) {
		ret = btrfs_split_extent_map(bbio->inode, bbio->file_offset,
					     ordered->num_bytes, len,
					     ordered->disk_bytenr);
		if (ret)
			return ret;
	}

	new = btrfs_split_ordered_extent(ordered, len);
	if (IS_ERR(new))
		return PTR_ERR(new);
	bbio->ordered = new;
	return 0;
}
	/*
	 * Check if we are doing a partial write. If we are, we need to split
	 * the ordered extent to match the submitted bio. Hang on to the
	 * remaining unfinishable ordered_extent in dio_data so that it can be
	 * cancelled in iomap_end to avoid a deadlock wherein faulting the
	 * remaining pages is blocked on the outstanding ordered extent.
	 */
	if (iter->flags & IOMAP_WRITE) {
		int ret;

		ret = btrfs_extract_ordered_extent(bbio, dio_data->ordered);
		if (ret) {
			btrfs_finish_ordered_extent(dio_data->ordered, NULL,
						    file_offset, dip->bytes,
						    !ret);
			bio->bi_status = errno_to_blk_status(ret);
			iomap_dio_bio_end_io(bio);
			return;
		}
	}
	if (iocb->ki_flags & IOCB_NOWAIT)
		ilock_flags |= BTRFS_ILOCK_TRY;

	/*
	 * If the write DIO is within EOF, use a shared lock and also only if
	 * security bits will likely not be dropped by file_remove_privs() called
	 * from btrfs_write_check(). Either will need to be rechecked after the
	 * lock was acquired.
	 */
	if (iocb->ki_pos + iov_iter_count(from) <= i_size_read(inode) && IS_NOSEC(inode))
		ilock_flags |= BTRFS_ILOCK_SHARED;
relock:
	ret = btrfs_inode_lock(BTRFS_I(inode), ilock_flags);
	if (ret < 0)
		return ret;

	/* Shared lock cannot be used with security bits set. */
	if ((ilock_flags & BTRFS_ILOCK_SHARED) && !IS_NOSEC(inode)) {
		btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
		ilock_flags &= ~BTRFS_ILOCK_SHARED;
		goto relock;
	}
	ret = generic_write_checks(iocb, from);
	if (ret <= 0) {
		btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
		return ret;
	}

	ret = btrfs_write_check(iocb, ret);
	if (ret < 0) {
		btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
		goto out;
	}
	pos = iocb->ki_pos;
	/*
	 * Re-check since the file size may have changed just before taking the
	 * lock, or pos may have changed because of O_APPEND in
	 * generic_write_checks().
	 */
	if ((ilock_flags & BTRFS_ILOCK_SHARED) &&
	    pos + iov_iter_count(from) > i_size_read(inode)) {
		btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
		ilock_flags &= ~BTRFS_ILOCK_SHARED;
		goto relock;
	}
	if (check_direct_IO(fs_info, from, pos)) {
		btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
		goto buffered;
	}

	/*
	 * We can't control the folios being passed in, applications can write
	 * to them while a direct IO write is in progress. This means the
	 * content might change after we calculated the data checksum.
	 * Therefore we can end up storing a checksum that doesn't match the
	 * persisted data.
	 *
	 * To be extra safe and avoid a false data checksum mismatch, if the
	 * inode requires data checksums, just fall back to buffered IO.
	 * For buffered IO we have full control of the page cache and can ensure
	 * no one is modifying the content during writeback.
	 */
	if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
		btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
		goto buffered;
	}
	/*
	 * The iov_iter can be mapped to the same file range we are writing to.
	 * If that's the case, then we will deadlock in the iomap code, because
	 * it first calls our callback btrfs_dio_iomap_begin(), which will create
	 * an ordered extent, and after that it will fault in the pages that the
	 * iov_iter refers to. During the fault in we end up in the readahead
	 * pages code (starting at btrfs_readahead()), which will lock the range,
	 * find that ordered extent and then wait for it to complete (at
	 * btrfs_lock_and_flush_ordered_range()), resulting in a deadlock since
	 * obviously the ordered extent can never complete as we didn't submit
	 * yet the respective bio(s). This always happens when the buffer is
	 * memory mapped to the same file range, since the iomap DIO code always
	 * invalidates pages in the target file range (after starting and waiting
	 * for any writeback).
	 *
	 * So here we disable page faults in the iov_iter and then retry if we
	 * got -EFAULT, faulting in the pages before the retry.
	 */
again:
from->nofault = true;
dio = btrfs_dio_write(iocb, from, written);
from->nofault = false;
if (IS_ERR_OR_NULL(dio)) {
ret = PTR_ERR_OR_ZERO(dio);
	} else {
		/*
		 * If we have a synchronous write, we must make sure the fsync
		 * triggered by the iomap_dio_complete() call below doesn't
		 * deadlock on the inode lock - we are already holding it and we
		 * can't call it after unlocking because we may need to complete
		 * partial writes due to the input buffer (or parts of it) not
		 * being already faulted in.
		 */
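		/*
		 * The stub in current->journal_info is meant to be noticed by
		 * the fsync path (btrfs_sync_file()), so that it skips taking
		 * the inode lock we already hold here.
		 */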
ASSERT(current->journal_info == NULL);
current->journal_info = BTRFS_TRANS_DIO_WRITE_STUB;
ret = iomap_dio_complete(dio);
current->journal_info = NULL;
}
	/* No increment (+=) because iomap returns a cumulative value. */
	if (ret > 0)
		written = ret;
	if (iov_iter_count(from) > 0 && (ret == -EFAULT || ret > 0)) {
		const size_t left = iov_iter_count(from);

		/*
		 * We have more data left to write. Try to fault in as many as
		 * possible of the remainder pages and retry. We do this without
		 * releasing and locking again the inode, to prevent races with
		 * truncate.
		 *
		 * Also, in case the iov refers to pages in the file range of the
		 * file we want to write to (due to a mmap), we could enter an
		 * infinite loop if we retry after faulting the pages in, since
		 * iomap will invalidate any pages in the range early on, before
		 * it tries to fault in the pages of the iov. So we keep track of
		 * how much was left of the iov in the previous EFAULT and fall
		 * back to buffered IO in case we haven't made any progress.
		 */
		if (left == prev_left) {
			ret = -ENOTBLK;
		} else {
			fault_in_iov_iter_readable(from, left);
			prev_left = left;
			goto again;
		}
}
btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
	/*
	 * If 'ret' is -ENOTBLK or we have not written all data, then it means
	 * we must fall back to buffered IO.
	 */
	if ((ret < 0 && ret != -ENOTBLK) || !iov_iter_count(from))
		goto out;
buffered:
	/*
	 * If we are in a NOWAIT context, then return -EAGAIN to signal the caller
	 * it must retry the operation in a context where blocking is acceptable,
	 * because even if we end up not blocking during the buffered IO attempt
	 * below, we will block when flushing and waiting for the IO.
	 */
	if (iocb->ki_flags & IOCB_NOWAIT) {
		ret = -EAGAIN;
		goto out;
	}
pos = iocb->ki_pos;
	written_buffered = btrfs_buffered_write(iocb, from);
	if (written_buffered < 0) {
		ret = written_buffered;
		goto out;
	}
	/*
	 * Ensure all data is persisted. We want the next direct IO read to be
	 * able to read what was just written.
	 */
	endbyte = pos + written_buffered - 1;
	ret = btrfs_fdatawrite_range(BTRFS_I(inode), pos, endbyte);
	if (ret)
		goto out;
	ret = filemap_fdatawait_range(inode->i_mapping, pos, endbyte);
	if (ret)
		goto out;
written += written_buffered;
iocb->ki_pos = pos + written_buffered;
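	/*
	 * Drop any clean, unlocked pages we may have cached for this range, so
	 * that a later direct IO read does not see stale page cache data.
	 * invalidate_mapping_pages() is best-effort and cannot fail.
	 */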
	invalidate_mapping_pages(file->f_mapping, pos >> PAGE_SHIFT,
				 endbyte >> PAGE_SHIFT);
out:
	return ret < 0 ? ret : written;
}
static int check_direct_read(struct btrfs_fs_info *fs_info,
			     const struct iov_iter *iter, loff_t offset)
{
	int ret;
	int i, seg;

	ret = check_direct_IO(fs_info, iter, offset);
	if (ret < 0)
		return ret;

	if (!iter_is_iovec(iter))
		return 0;

	for (seg = 0; seg < iter->nr_segs; seg++) {
		for (i = seg + 1; i < iter->nr_segs; i++) {
			const struct iovec *iov1 = iter_iov(iter) + seg;
			const struct iovec *iov2 = iter_iov(iter) + i;

			if (iov1->iov_base == iov2->iov_base)
				return -EINVAL;
		}
	}

	return 0;
}
	if (check_direct_read(inode_to_fs_info(inode), to, iocb->ki_pos))
		return 0;

	btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_SHARED);
again:
	/*
	 * This is similar to what we do for direct IO writes, see the comment
	 * at btrfs_direct_write(), but we also disable page faults in addition
	 * to disabling them only at the iov_iter level. This is because when
	 * reading from a hole or prealloc extent, iomap calls iov_iter_zero(),
	 * which can still trigger page fault ins, despite having set ->nofault
	 * to true on our 'to' iov_iter.
	 *
	 * The difference to direct IO writes is that we deadlock when trying
	 * to lock the extent range in the inode's tree during the page reads
	 * triggered by the fault in (while for writes it is due to waiting for
	 * our own ordered extent). This is because for direct IO reads,
	 * btrfs_dio_iomap_begin() returns with the extent range locked, which
	 * is only unlocked in the endio callback (end_bio_extent_readpage()).
	 */
pagefault_disable();
to->nofault = true;
ret = btrfs_dio_read(iocb, to, read);
to->nofault = false;
pagefault_enable();
	/* No increment (+=) because iomap returns a cumulative value. */
	if (ret > 0)
		read = ret;
	if (iov_iter_count(to) > 0 && (ret == -EFAULT || ret > 0)) {
		const size_t left = iov_iter_count(to);

		if (left == prev_left) {
			/*
			 * We didn't make any progress since the last attempt,
			 * fall back to a buffered read for the remainder of the
			 * range. This is just to avoid any possibility of
			 * looping for too long.
			 */
			ret = read;
		} else {
			/*
			 * We made some progress since the last retry or this is
			 * the first time we are retrying. Fault in as many pages
			 * as possible and retry.
			 */
			fault_in_iov_iter_writeable(to, left);
			prev_left = left;
			goto again;
}
}
	btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_SHARED);

	return ret < 0 ? ret : read;
}
int __init btrfs_init_dio(void)
{
	/*
	 * The offsetof() front padding lets a bio allocated from this set be
	 * converted back to its containing struct btrfs_dio_private.
	 */
	if (bioset_init(&btrfs_dio_bioset, BIO_POOL_SIZE,
			offsetof(struct btrfs_dio_private, bbio.bio),
			BIOSET_NEED_BVECS))
		return -ENOMEM;

	return 0;
}