/* All write I/Os should always be within the file maximum size */ if (WARN_ON_ONCE(offset + length > z->z_capacity)) return -EIO;
/* * Sequential zones can only accept direct writes. This is already * checked when writes are issued, so warn if we see a page writeback * operation.
*/ if (WARN_ON_ONCE(zonefs_zone_is_seq(z) && !(flags & IOMAP_DIRECT))) return -EIO;
/* * For conventional zones, all blocks are always mapped. For sequential * zones, all blocks are always mapped below the inode size (zone * write pointer) and unwritten beyond.
*/
mutex_lock(&zi->i_truncate_mutex);
iomap->bdev = inode->i_sb->s_bdev;
iomap->offset = ALIGN_DOWN(offset, sb->s_blocksize);
iomap->addr = (z->z_sector << SECTOR_SHIFT) + iomap->offset;
isize = i_size_read(inode); if (iomap->offset >= isize) {
iomap->type = IOMAP_UNWRITTEN;
iomap->length = z->z_capacity - iomap->offset;
} else {
iomap->type = IOMAP_MAPPED;
iomap->length = isize - iomap->offset;
}
mutex_unlock(&zi->i_truncate_mutex);
/* * Map blocks for page writeback. This is used only on conventional zone files, * which implies that the page range can only be within the fixed inode size.
*/ static ssize_t zonefs_writeback_range(struct iomap_writepage_ctx *wpc, struct folio *folio, u64 offset, unsigned len, u64 end_pos)
{ struct zonefs_zone *z = zonefs_inode_zone(wpc->inode);
if (WARN_ON_ONCE(zonefs_zone_is_seq(z))) return -EIO; if (WARN_ON_ONCE(offset >= i_size_read(wpc->inode))) return -EIO;
/* If the mapping is already OK, nothing needs to be done */ if (offset < wpc->iomap.offset ||
offset >= wpc->iomap.offset + wpc->iomap.length) { int error;
int zonefs_file_truncate(struct inode *inode, loff_t isize)
{ struct zonefs_inode_info *zi = ZONEFS_I(inode); struct zonefs_zone *z = zonefs_inode_zone(inode);
loff_t old_isize; enum req_op op; int ret = 0;
/* * Only sequential zone files can be truncated and truncation is allowed * only down to a 0 size, which is equivalent to a zone reset, and to * the maximum file size, which is equivalent to a zone finish.
*/ if (!zonefs_zone_is_seq(z)) return -EPERM;
if (!isize)
op = REQ_OP_ZONE_RESET; elseif (isize == z->z_capacity)
op = REQ_OP_ZONE_FINISH; else return -EPERM;
inode_dio_wait(inode);
/* Serialize against page faults */
filemap_invalidate_lock(inode->i_mapping);
/* Serialize against zonefs_iomap_begin() */
mutex_lock(&zi->i_truncate_mutex);
old_isize = i_size_read(inode); if (isize == old_isize) goto unlock;
ret = zonefs_inode_zone_mgmt(inode, op); if (ret) goto unlock;
/* * If the mount option ZONEFS_MNTOPT_EXPLICIT_OPEN is set, * take care of open zones.
*/ if (z->z_flags & ZONEFS_ZONE_OPEN) { /* * Truncating a zone to EMPTY or FULL is the equivalent of * closing the zone. For a truncation to 0, we need to * re-open the zone to ensure new writes can be processed. * For a truncation to the maximum file size, the zone is * closed and writes cannot be accepted anymore, so clear * the open flag.
*/ if (!isize)
ret = zonefs_inode_zone_mgmt(inode, REQ_OP_ZONE_OPEN); else
z->z_flags &= ~ZONEFS_ZONE_OPEN;
}
staticint zonefs_file_fsync(struct file *file, loff_t start, loff_t end, int datasync)
{ struct inode *inode = file_inode(file); int ret = 0;
if (unlikely(IS_IMMUTABLE(inode))) return -EPERM;
/* * Since only direct writes are allowed in sequential files, page cache * flush is needed only for conventional zone files.
*/ if (zonefs_inode_is_cnv(inode))
ret = file_write_and_wait_range(file, start, end); if (!ret)
ret = blkdev_issue_flush(inode->i_sb->s_bdev);
/* * Conventional zones accept random writes, so their files can support * shared writable mappings. For sequential zone files, only read * mappings are possible since there are no guarantees for write * ordering between msync() and page cache writeback.
*/ if (zonefs_inode_is_seq(file_inode(file)) &&
(desc->vm_flags & VM_SHARED) && (desc->vm_flags & VM_MAYWRITE)) return -EINVAL;
/* * Seeks are limited to below the zone size for conventional zones * and below the zone write pointer for sequential zones. In both * cases, this limit is the inode size.
*/ return generic_file_llseek_size(file, offset, whence, isize, isize);
}
if (error) { /* * For Sync IOs, error recovery is called from * zonefs_file_dio_write().
*/ if (!is_sync_kiocb(iocb))
zonefs_io_error(inode, true); return error;
}
if (size && zonefs_inode_is_seq(inode)) { /* * Note that we may be seeing completions out of order, * but that is not a problem since a write completed * successfully necessarily means that all preceding writes * were also successful. So we can safely increase the inode * size to the write end location.
*/
mutex_lock(&zi->i_truncate_mutex); if (i_size_read(inode) < iocb->ki_pos + size) {
zonefs_update_stats(inode, iocb->ki_pos + size);
zonefs_i_size_write(inode, iocb->ki_pos + size);
}
mutex_unlock(&zi->i_truncate_mutex);
}
/* * Do not exceed the LFS limits nor the file zone size. If pos is under the * limit it becomes a short access. If it exceeds the limit, return -EFBIG.
*/ static loff_t zonefs_write_check_limits(struct file *file, loff_t pos,
loff_t count)
{ struct inode *inode = file_inode(file); struct zonefs_zone *z = zonefs_inode_zone(inode);
loff_t limit = rlimit(RLIMIT_FSIZE);
loff_t max_size = z->z_capacity;
/* * Handle direct writes. For sequential zone files, this is the only possible * write path. For these files, check that the user is issuing writes * sequentially from the end of the file. This code assumes that the block layer * delivers write requests to the device in sequential order. This is always the * case if a block IO scheduler implementing the ELEVATOR_F_ZBD_SEQ_WRITE * elevator feature is being used (e.g. mq-deadline). The block layer always * automatically selects such an elevator for zoned block devices during the * device initialization.
*/ static ssize_t zonefs_file_dio_write(struct kiocb *iocb, struct iov_iter *from)
{ struct inode *inode = file_inode(iocb->ki_filp); struct zonefs_inode_info *zi = ZONEFS_I(inode); struct zonefs_zone *z = zonefs_inode_zone(inode); struct super_block *sb = inode->i_sb;
ssize_t ret, count;
/* * For async direct IOs to sequential zone files, refuse IOCB_NOWAIT * as this can cause write reordering (e.g. the first aio gets EAGAIN * on the inode lock but the second goes through but is now unaligned).
*/ if (zonefs_zone_is_seq(z) && !is_sync_kiocb(iocb) &&
(iocb->ki_flags & IOCB_NOWAIT)) return -EOPNOTSUPP;
if (iocb->ki_flags & IOCB_NOWAIT) { if (!inode_trylock(inode)) return -EAGAIN;
} else {
inode_lock(inode);
}
count = zonefs_write_checks(iocb, from); if (count <= 0) {
ret = count; goto inode_unlock;
}
if ((iocb->ki_pos | count) & (sb->s_blocksize - 1)) {
ret = -EINVAL; goto inode_unlock;
}
/* Enforce sequential writes (append only) in sequential zones */ if (zonefs_zone_is_seq(z)) {
mutex_lock(&zi->i_truncate_mutex); if (iocb->ki_pos != z->z_wpoffset) {
mutex_unlock(&zi->i_truncate_mutex);
ret = -EINVAL; goto inode_unlock;
} /* * Advance the zone write pointer offset. This assumes that the * IO will succeed, which is OK to do because we do not allow * partial writes (IOMAP_DIO_PARTIAL is not set) and if the IO * fails, the error path will correct the write pointer offset.
*/
z->z_wpoffset += count;
zonefs_inode_account_active(inode);
mutex_unlock(&zi->i_truncate_mutex);
}
/* * iomap_dio_rw() may return ENOTBLK if there was an issue with * page invalidation. Overwrite that error code with EBUSY so that * the user can make sense of the error.
*/
ret = iomap_dio_rw(iocb, from, &zonefs_write_iomap_ops,
&zonefs_write_dio_ops, 0, NULL, 0); if (ret == -ENOTBLK)
ret = -EBUSY;
/* * For a failed IO or partial completion, trigger error recovery * to update the zone write pointer offset to a correct value. * For asynchronous IOs, zonefs_file_write_dio_end_io() may already * have executed error recovery if the IO already completed when we * reach here. However, we cannot know that and execute error recovery * again (that will not change anything).
*/ if (zonefs_zone_is_seq(z)) { if (ret > 0 && ret != count)
ret = -EIO; if (ret < 0 && ret != -EIOCBQUEUED)
zonefs_io_error(inode, true);
}
/* * Direct IO writes are mandatory for sequential zone files so that the * write IO issuing order is preserved.
*/ if (zonefs_inode_is_seq(inode)) return -EIO;
if (iocb->ki_flags & IOCB_NOWAIT) { if (!inode_trylock(inode)) return -EAGAIN;
} else {
inode_lock(inode);
}
ret = zonefs_write_checks(iocb, from); if (ret <= 0) goto inode_unlock;
ret = iomap_file_buffered_write(iocb, from, &zonefs_write_iomap_ops,
NULL, NULL); if (ret == -EIO)
zonefs_io_error(inode, true);
inode_unlock:
inode_unlock(inode); if (ret > 0)
ret = generic_write_sync(iocb, ret);
/* Offline zones cannot be read */ if (unlikely(IS_IMMUTABLE(inode) && !(inode->i_mode & 0777))) return -EPERM;
if (*ppos >= z->z_capacity) return 0;
inode_lock_shared(inode);
/* Limit read operations to written data */
mutex_lock(&zi->i_truncate_mutex);
isize = i_size_read(inode); if (*ppos >= isize)
len = 0; else
len = min_t(loff_t, len, isize - *ppos);
mutex_unlock(&zi->i_truncate_mutex);
if (len > 0) {
ret = filemap_splice_read(in, ppos, pipe, len, flags); if (ret == -EIO)
zonefs_io_error(inode, false);
}
inode_unlock_shared(inode); return ret;
}
/*
 * Write open accounting is done only for sequential files.
 *
 * Return true only when @inode is not a conventional zone file and @file
 * has FMODE_WRITE set in its f_mode; return false otherwise.
 */
static inline bool zonefs_seq_file_need_wro(struct inode *inode,
					    struct file *file)
{
	/* Conventional zone files never take part in write open accounting. */
	if (zonefs_inode_is_cnv(inode))
		return false;

	/* Files not opened for writing do not need accounting either. */
	if (!(file->f_mode & FMODE_WRITE))
		return false;

	return true;
}
staticint zonefs_seq_file_write_open(struct inode *inode)
{ struct zonefs_inode_info *zi = ZONEFS_I(inode); struct zonefs_zone *z = zonefs_inode_zone(inode); int ret = 0;
zi->i_wr_refcnt--; if (zi->i_wr_refcnt) goto unlock;
/* * The file zone may not be open anymore (e.g. the file was truncated to * its maximum size or it was fully written). For this case, we only * need to decrement the write open count.
*/ if (z->z_flags & ZONEFS_ZONE_OPEN) {
ret = zonefs_inode_zone_mgmt(inode, REQ_OP_ZONE_CLOSE); if (ret) {
__zonefs_io_error(inode, false); /* * Leaving zones explicitly open may lead to a state * where most zones cannot be written (zone resources * exhausted). So take preventive action by remounting * read-only.
*/ if (z->z_flags & ZONEFS_ZONE_OPEN &&
!(sb->s_flags & SB_RDONLY)) {
zonefs_warn(sb, "closing zone at %llu failed %d\n",
z->z_sector, ret);
zonefs_warn(sb, "remounting filesystem read-only\n");
sb->s_flags |= SB_RDONLY;
} goto unlock;
}
staticint zonefs_file_release(struct inode *inode, struct file *file)
{ /* * If we explicitly open a zone we must close it again as well, but the * zone management operation can fail (either due to an IO error or as * the zone has gone offline or read-only). Make sure we don't fail the * close(2) for user-space.
*/ if (zonefs_seq_file_need_wro(inode, file))
zonefs_seq_file_write_close(inode);
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereitgestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.