// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (c) 2020-2024 Oracle. All Rights Reserved.
 * Author: Darrick J. Wong <djwong@kernel.org>
 */
#include "xfs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_defer.h"
#include "xfs_inode.h"
#include "xfs_trans.h"
#include "xfs_quota.h"
#include "xfs_bmap_util.h"
#include "xfs_reflink.h"
#include "xfs_trace.h"
#include "xfs_exchrange.h"
#include "xfs_exchmaps.h"
#include "xfs_sb.h"
#include "xfs_icache.h"
#include "xfs_log.h"
#include "xfs_rtbitmap.h"
#include <linux/fsnotify.h>
#include <linux/overflow.h>
/*
 * Take XFS_ILOCK_EXCL on both inodes of a file range exchange.  When the two
 * pointers refer to the same inode it is locked only once.  If @tp is
 * non-NULL, each distinct inode is also joined to that transaction.
 */
void
xfs_exchrange_ilock(
	struct xfs_trans	*tp,
	struct xfs_inode	*ip1,
	struct xfs_inode	*ip2)
{
	const bool		same_inode = (ip1 == ip2);

	if (same_inode)
		xfs_ilock(ip1, XFS_ILOCK_EXCL);
	else
		xfs_lock_two_inodes(ip1, XFS_ILOCK_EXCL, ip2, XFS_ILOCK_EXCL);

	/* Nothing to join if the caller did not hand us a transaction. */
	if (!tp)
		return;

	xfs_trans_ijoin(tp, ip1, 0);
	if (!same_inode)
		xfs_trans_ijoin(tp, ip2, 0);
}
/*
 * Drop XFS_ILOCK_EXCL on the inodes of a file range exchange.  @ip2 is
 * released first, and only if it is not the same inode as @ip1.
 */
void
xfs_exchrange_iunlock(
	struct xfs_inode	*ip1,
	struct xfs_inode	*ip2)
{
	const bool		distinct = (ip1 != ip2);

	if (distinct)
		xfs_iunlock(ip2, XFS_ILOCK_EXCL);

	xfs_iunlock(ip1, XFS_ILOCK_EXCL);
}
/* * Estimate the resource requirements to exchange file contents between the two * files. The caller is required to hold the IOLOCK and the MMAPLOCK and to * have flushed both inodes' pagecache and active direct-ios.
*/ int
xfs_exchrange_estimate( struct xfs_exchmaps_req *req)
{ int error;
/* * Check that file2's metadata agree with the snapshot that we took for the * range commit request. * * This should be called after the filesystem has locked /all/ inode metadata * against modification.
*/ STATICint
xfs_exchrange_check_freshness( conststruct xfs_exchrange *fxr, struct xfs_inode *ip2)
{ struct inode *inode2 = VFS_I(ip2); struct timespec64 ctime = inode_get_ctime(inode2); struct timespec64 mtime = inode_get_mtime(inode2);
trace_xfs_exchrange_freshness(fxr, ip2);
/* Check that file2 hasn't otherwise been modified. */ if (fxr->file2_ino != ip2->i_ino ||
fxr->file2_gen != inode2->i_generation ||
!timespec64_equal(&fxr->file2_ctime, &ctime) ||
!timespec64_equal(&fxr->file2_mtime, &mtime)) return -EBUSY;
return 0;
}
/*
 * Bits reported through the qretry argument of xfs_exchrange_reserve_quota.
 * Each directive must sit on its own line; otherwise the second #define is
 * absorbed into the replacement list of the first.
 */
#define QRETRY_IP1	(0x1)	/* reservation for ip1 hit EDQUOT or ENOSPC */
#define QRETRY_IP2	(0x2)	/* reservation for ip2 hit EDQUOT or ENOSPC */
/* * Obtain a quota reservation to make sure we don't hit EDQUOT. We can skip * this if quota enforcement is disabled or if both inodes' dquots are the * same. The qretry structure must be initialized to zeroes before the first * call to this function.
*/ STATICint
xfs_exchrange_reserve_quota( struct xfs_trans *tp, conststruct xfs_exchmaps_req *req, unsignedint *qretry)
{
int64_t ddelta, rdelta; int ip1_error = 0; int error;
/* * Don't bother with a quota reservation if we're not enforcing them * or the two inodes have the same dquots.
*/ if (!XFS_IS_QUOTA_ON(tp->t_mountp) || req->ip1 == req->ip2 ||
(req->ip1->i_udquot == req->ip2->i_udquot &&
req->ip1->i_gdquot == req->ip2->i_gdquot &&
req->ip1->i_pdquot == req->ip2->i_pdquot)) return 0;
*qretry = 0;
/* * For each file, compute the net gain in the number of regular blocks * that will be mapped into that file and reserve that much quota. The * quota counts must be able to absorb at least that much space.
*/
ddelta = req->ip2_bcount - req->ip1_bcount;
rdelta = req->ip2_rtbcount - req->ip1_rtbcount; if (ddelta > 0 || rdelta > 0) {
error = xfs_trans_reserve_quota_nblks(tp, req->ip1,
ddelta > 0 ? ddelta : 0,
rdelta > 0 ? rdelta : 0, false); if (error == -EDQUOT || error == -ENOSPC) { /* * Save this error and see what happens if we try to * reserve quota for ip2. Then report both.
*/
*qretry |= QRETRY_IP1;
ip1_error = error;
error = 0;
} if (error) return error;
} if (ddelta < 0 || rdelta < 0) {
error = xfs_trans_reserve_quota_nblks(tp, req->ip2,
ddelta < 0 ? -ddelta : 0,
rdelta < 0 ? -rdelta : 0, false); if (error == -EDQUOT || error == -ENOSPC)
*qretry |= QRETRY_IP2; if (error) return error;
} if (ip1_error) return ip1_error;
/* * For each file, forcibly reserve the gross gain in mapped blocks so * that we don't trip over any quota block reservation assertions. * We must reserve the gross gain because the quota code subtracts from * bcount the number of blocks that we unmap; it does not add that * quantity back to the quota block reservation.
*/
error = xfs_trans_reserve_quota_nblks(tp, req->ip1, req->ip1_bcount,
req->ip1_rtbcount, true); if (error) return error;
if (fxr->flags & XFS_EXCHANGE_RANGE_TO_EOF)
req.flags |= XFS_EXCHMAPS_SET_SIZES; if (fxr->flags & XFS_EXCHANGE_RANGE_FILE1_WRITTEN)
req.flags |= XFS_EXCHMAPS_INO1_WRITTEN;
/* * Round the request length up to the nearest file allocation unit. * The prep function already checked that the request offsets and * length in @fxr are safe to round up.
*/ if (xfs_inode_has_bigrtalloc(ip2))
req.blockcount = xfs_blen_roundup_rtx(mp, req.blockcount);
error = xfs_exchrange_estimate(&req); if (error) return error;
retry: /* Allocate the transaction, lock the inodes, and join them. */
error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, req.resblks, 0,
XFS_TRANS_RES_FDBLKS, &tp); if (error) return error;
error = xfs_exchmaps_check_forks(mp, &req); if (error) goto out_trans_cancel;
/* * Reserve ourselves some quota if any of them are in enforcing mode. * In theory we only need enough to satisfy the change in the number * of blocks between the two ranges being remapped.
*/
error = xfs_exchrange_reserve_quota(tp, &req, &qretry); if ((error == -EDQUOT || error == -ENOSPC) && !retried) {
xfs_trans_cancel(tp);
xfs_exchrange_iunlock(ip1, ip2); if (qretry & QRETRY_IP1)
xfs_blockgc_free_quota(ip1, 0); if (qretry & QRETRY_IP2)
xfs_blockgc_free_quota(ip2, 0);
retried = true; goto retry;
} if (error) goto out_trans_cancel;
/* If we got this far on a dry run, all parameters are ok. */ if (fxr->flags & XFS_EXCHANGE_RANGE_DRY_RUN) goto out_trans_cancel;
/* Update the mtime and ctime of both files. */ if (fxr->flags & __XFS_EXCHANGE_RANGE_UPD_CMTIME1)
xfs_trans_ichgtime(tp, ip1, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); if (fxr->flags & __XFS_EXCHANGE_RANGE_UPD_CMTIME2)
xfs_trans_ichgtime(tp, ip2, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
xfs_exchange_mappings(tp, &req);
/* * Force the log to persist metadata updates if the caller or the * administrator requires this. The generic prep function already * flushed the relevant parts of the page cache.
*/ if (xfs_has_wsync(mp) || (fxr->flags & XFS_EXCHANGE_RANGE_DSYNC))
xfs_trans_set_sync(tp);
/* * If the caller wanted us to exchange the contents of two complete * files of unequal length, exchange the incore sizes now. This should * be safe because we flushed both files' page caches, exchanged all * the mappings, and updated the ondisk sizes.
*/ if (fxr->flags & XFS_EXCHANGE_RANGE_TO_EOF) {
loff_t temp;
/* * Generic code for exchanging ranges of two files via XFS_IOC_EXCHANGE_RANGE. * This part deals with struct file objects and byte ranges and does not deal * with XFS-specific data structures such as xfs_inodes and block ranges. This * separation may some day facilitate porting to another filesystem. * * The goal is to exchange fxr.length bytes starting at fxr.file1_offset in * file1 with the same number of bytes starting at fxr.file2_offset in file2. * Implementations must call xfs_exchange_range_prep to prepare the two * files prior to taking locks; and they must update the inode change and mod * times of both files as part of the metadata update. The timestamp update * and freshness checks must be done atomically as part of the data exchange * operation to ensure correctness of the freshness check. * xfs_exchange_range_finish must be called after the operation completes * successfully but before locks are dropped.
*/
/* * Performs necessary checks before doing a range exchange, having stabilized * mutable inode attributes via i_rwsem.
*/ staticinlineint
xfs_exchange_range_checks( struct xfs_exchrange *fxr, unsignedint alloc_unit)
{ struct inode *inode1 = file_inode(fxr->file1);
loff_t size1 = i_size_read(inode1); struct inode *inode2 = file_inode(fxr->file2);
loff_t size2 = i_size_read(inode2);
uint64_t allocmask = alloc_unit - 1;
int64_t test_len;
uint64_t blen;
loff_t tmp; int error;
/* Don't touch certain kinds of inodes */ if (IS_IMMUTABLE(inode1) || IS_IMMUTABLE(inode2)) return -EPERM; if (IS_SWAPFILE(inode1) || IS_SWAPFILE(inode2)) return -ETXTBSY;
/* Ranges cannot start after EOF. */ if (fxr->file1_offset > size1 || fxr->file2_offset > size2) return -EINVAL;
if (fxr->flags & XFS_EXCHANGE_RANGE_TO_EOF) { /* * If the caller said to exchange to EOF, we set the length of * the request large enough to cover everything to the end of * both files.
*/
fxr->length = max_t(int64_t, size1 - fxr->file1_offset,
size2 - fxr->file2_offset);
} else { /* * Otherwise we require both ranges to end within EOF.
*/ if (fxr->file1_offset + fxr->length > size1 ||
fxr->file2_offset + fxr->length > size2) return -EINVAL;
}
/* * The start of both ranges must be aligned to the file allocation * unit.
*/ if (!IS_ALIGNED(fxr->file1_offset, alloc_unit) ||
!IS_ALIGNED(fxr->file2_offset, alloc_unit)) return -EINVAL;
/* * Make sure we don't hit any file size limits. If we hit any size * limits such that test_length was adjusted, we abort the whole * operation.
*/
test_len = fxr->length;
error = generic_write_check_limits(fxr->file2, fxr->file2_offset,
&test_len); if (error) return error;
error = generic_write_check_limits(fxr->file1, fxr->file1_offset,
&test_len); if (error) return error; if (test_len != fxr->length) return -EINVAL;
/* * If the user wanted us to exchange up to the infile's EOF, round up * to the next allocation unit boundary for this check. Do the same * for the outfile. * * Otherwise, reject the range length if it's not aligned to an * allocation unit.
*/ if (fxr->file1_offset + fxr->length == size1)
blen = ALIGN(size1, alloc_unit) - fxr->file1_offset; elseif (fxr->file2_offset + fxr->length == size2)
blen = ALIGN(size2, alloc_unit) - fxr->file2_offset; elseif (!IS_ALIGNED(fxr->length, alloc_unit)) return -EINVAL; else
blen = fxr->length;
/* Don't allow overlapped exchanges within the same file. */ if (inode1 == inode2 &&
fxr->file2_offset + blen > fxr->file1_offset &&
fxr->file1_offset + blen > fxr->file2_offset) return -EINVAL;
/* * Ensure that we don't exchange a partial EOF block into the middle of * another file.
*/ if ((fxr->length & allocmask) == 0) return 0;
if (fxr->file1_offset + blen < size1)
blen &= ~allocmask;
return blen == fxr->length ? 0 : -EINVAL;
}
/* * Check that the two inodes are eligible for range exchanges, the ranges make * sense, and then flush all dirty data. Caller must ensure that the inodes * have been locked against any other modifications.
*/ staticinlineint
xfs_exchange_range_prep( struct xfs_exchrange *fxr, unsignedint alloc_unit)
{ struct inode *inode1 = file_inode(fxr->file1); struct inode *inode2 = file_inode(fxr->file2); bool same_inode = (inode1 == inode2); int error;
/* Check that we don't violate system file offset limits. */
error = xfs_exchange_range_checks(fxr, alloc_unit); if (error || fxr->length == 0) return error;
/* Wait for the completion of any pending IOs on both files */
inode_dio_wait(inode1); if (!same_inode)
inode_dio_wait(inode2);
/* * If the files or inodes involved require synchronous writes, amend * the request to force the filesystem to flush all data and metadata * to disk after the operation completes.
*/ if (((fxr->file1->f_flags | fxr->file2->f_flags) & O_SYNC) ||
IS_SYNC(inode1) || IS_SYNC(inode2))
fxr->flags |= XFS_EXCHANGE_RANGE_DSYNC;
return 0;
}
/*
 * Finish a successful range exchange by calling file_remove_privs on file1
 * and then, if it is backed by a different inode, on file2.  Caller must
 * ensure that the inodes are still locked against any other modifications.
 *
 * Returns 0 or the first error reported by file_remove_privs.
 */
static inline int
xfs_exchange_range_finish(
	struct xfs_exchrange	*fxr)
{
	int			ret = file_remove_privs(fxr->file1);

	/* Only visit file2 when file1 succeeded and it is a different inode. */
	if (ret == 0 && file_inode(fxr->file1) != file_inode(fxr->file2))
		ret = file_remove_privs(fxr->file2);

	return ret;
}
/* * Check the alignment of an exchange request when the allocation unit size * isn't a power of two. The generic file-level helpers use (fast) * bitmask-based alignment checks, but here we have to use slow long division.
*/ staticint
xfs_exchrange_check_rtalign( conststruct xfs_exchrange *fxr, struct xfs_inode *ip1, struct xfs_inode *ip2, unsignedint alloc_unit)
{
uint64_t length = fxr->length;
uint64_t blen;
loff_t size1, size2;
/* The start of both ranges must be aligned to a rt extent. */ if (!isaligned_64(fxr->file1_offset, alloc_unit) ||
!isaligned_64(fxr->file2_offset, alloc_unit)) return -EINVAL;
/* * If the user wanted us to exchange up to the infile's EOF, round up * to the next rt extent boundary for this check. Do the same for the * outfile. * * Otherwise, reject the range length if it's not rt extent aligned. * We already confirmed the starting offsets' rt extent block * alignment.
*/ if (fxr->file1_offset + length == size1)
blen = roundup_64(size1, alloc_unit) - fxr->file1_offset; elseif (fxr->file2_offset + length == size2)
blen = roundup_64(size2, alloc_unit) - fxr->file2_offset; elseif (!isaligned_64(length, alloc_unit)) return -EINVAL; else
blen = length;
/* Don't allow overlapped exchanges within the same file. */ if (ip1 == ip2 &&
fxr->file2_offset + blen > fxr->file1_offset &&
fxr->file1_offset + blen > fxr->file2_offset) return -EINVAL;
/* * Ensure that we don't exchange a partial EOF rt extent into the * middle of another file.
*/ if (isaligned_64(length, alloc_unit)) return 0;
if (fxr->file1_offset + blen < size1)
blen = rounddown_64(blen, alloc_unit);
return blen == length ? 0 : -EINVAL;
}
/* Prepare two files to have their data exchanged. */ STATICint
xfs_exchrange_prep( struct xfs_exchrange *fxr, struct xfs_inode *ip1, struct xfs_inode *ip2)
{ struct xfs_mount *mp = ip2->i_mount; unsignedint alloc_unit = xfs_inode_alloc_unitsize(ip2); int error;
trace_xfs_exchrange_prep(fxr, ip1, ip2);
/* Verify both files are either real-time or non-realtime */ if (XFS_IS_REALTIME_INODE(ip1) != XFS_IS_REALTIME_INODE(ip2)) return -EINVAL;
/* Check non-power of two alignment issues, if necessary. */ if (!is_power_of_2(alloc_unit)) {
error = xfs_exchrange_check_rtalign(fxr, ip1, ip2, alloc_unit); if (error) return error;
/* * Do the generic file-level checks with the regular block * alignment.
*/
alloc_unit = mp->m_sb.sb_blocksize;
}
if (fxr->flags & __XFS_EXCHANGE_RANGE_CHECK_FRESH2) {
error = xfs_exchrange_check_freshness(fxr, ip2); if (error) return error;
}
/* Attach dquots to both inodes before changing block maps. */
error = xfs_qm_dqattach(ip2); if (error) return error;
error = xfs_qm_dqattach(ip1); if (error) return error;
trace_xfs_exchrange_flush(fxr, ip1, ip2);
/* Flush the relevant ranges of both files. */
error = xfs_flush_unmap_range(ip2, fxr->file2_offset, fxr->length); if (error) return error;
error = xfs_flush_unmap_range(ip1, fxr->file1_offset, fxr->length); if (error) return error;
/* * Cancel CoW fork preallocations for the ranges of both files. The * prep function should have flushed all the dirty data, so the only * CoW mappings remaining should be speculative.
*/ if (xfs_inode_has_cow_data(ip1)) {
error = xfs_reflink_cancel_cow_range(ip1, fxr->file1_offset,
fxr->length, true); if (error) return error;
}
if (xfs_inode_has_cow_data(ip2)) {
error = xfs_reflink_cancel_cow_range(ip2, fxr->file2_offset,
fxr->length, true); if (error) return error;
}
return 0;
}
/* * Exchange contents of files. This is the binding between the generic * file-level concepts and the XFS inode-specific implementation.
*/ STATICint
xfs_exchrange_contents( struct xfs_exchrange *fxr)
{ struct inode *inode1 = file_inode(fxr->file1); struct inode *inode2 = file_inode(fxr->file2); struct xfs_inode *ip1 = XFS_I(inode1); struct xfs_inode *ip2 = XFS_I(inode2); struct xfs_mount *mp = ip1->i_mount; int error;
if (!xfs_has_exchange_range(mp)) return -EOPNOTSUPP;
if (fxr->flags & ~(XFS_EXCHANGE_RANGE_ALL_FLAGS |
XFS_EXCHANGE_RANGE_PRIV_FLAGS)) return -EINVAL;
if (xfs_is_shutdown(mp)) return -EIO;
/* Lock both files against IO */
error = xfs_ilock2_io_mmap(ip1, ip2); if (error) goto out_err;
/* Prepare and then exchange file contents. */
error = xfs_exchrange_prep(fxr, ip1, ip2); if (error) goto out_unlock;
error = xfs_exchrange_mappings(fxr, ip1, ip2); if (error) goto out_unlock;
/* * Finish the exchange by removing special file privileges like any * other file write would do. This may involve turning on support for * logged xattrs if either file has security capabilities.
*/
error = xfs_exchange_range_finish(fxr); if (error) goto out_unlock;
/* Both files must be on the same mount/filesystem. */ if (fxr->file1->f_path.mnt != fxr->file2->f_path.mnt) return -EXDEV;
if (fxr->flags & ~(XFS_EXCHANGE_RANGE_ALL_FLAGS |
__XFS_EXCHANGE_RANGE_CHECK_FRESH2)) return -EINVAL;
/* Userspace requests only honored for regular files. */ if (S_ISDIR(inode1->i_mode) || S_ISDIR(inode2->i_mode)) return -EISDIR; if (!S_ISREG(inode1->i_mode) || !S_ISREG(inode2->i_mode)) return -EINVAL;
/* Both files must be opened for read and write. */ if (!(fxr->file1->f_mode & FMODE_READ) ||
!(fxr->file1->f_mode & FMODE_WRITE) ||
!(fxr->file2->f_mode & FMODE_READ) ||
!(fxr->file2->f_mode & FMODE_WRITE)) return -EBADF;
/* Neither file can be opened append-only. */ if ((fxr->file1->f_flags & O_APPEND) ||
(fxr->file2->f_flags & O_APPEND)) return -EBADF;
/* * If we're exchanging to EOF we can't calculate the length until taking * the iolock. Pass a 0 length to remap_verify_area similar to the * FICLONE and FICLONERANGE ioctls that support cloning to EOF as well.
*/ if (fxr->flags & XFS_EXCHANGE_RANGE_TO_EOF)
check_len = 0;
ret = remap_verify_area(fxr->file1, fxr->file1_offset, check_len, true); if (ret) return ret;
ret = remap_verify_area(fxr->file2, fxr->file2_offset, check_len, true); if (ret) return ret;
/* Update cmtime if the fd/inode don't forbid it. */ if (!(fxr->file1->f_mode & FMODE_NOCMTIME) && !IS_NOCMTIME(inode1))
fxr->flags |= __XFS_EXCHANGE_RANGE_UPD_CMTIME1; if (!(fxr->file2->f_mode & FMODE_NOCMTIME) && !IS_NOCMTIME(inode2))
fxr->flags |= __XFS_EXCHANGE_RANGE_UPD_CMTIME2;
file_start_write(fxr->file2);
ret = xfs_exchrange_contents(fxr);
file_end_write(fxr->file2); if (ret) return ret;
fsnotify_modify(fxr->file1); if (fxr->file2 != fxr->file1)
fsnotify_modify(fxr->file2); return 0;
}
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit noch Richtigkeit
noch Qualität der bereitgestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.