/* * This can be called from log recovery, where the zone_info structure * hasn't been allocated yet. Skip all work as xfs_mount_zones will * add the zones to the right buckets before the file systems becomes * active.
*/ if (!zi) return;
if (!used) { /* * The zone is now empty, remove it from the bottom bucket and * trigger a reset.
*/
trace_xfs_zone_emptied(rtg);
if (!was_full)
xfs_group_clear_mark(xg, XFS_RTG_RECLAIMABLE);
spin_lock(&zi->zi_used_buckets_lock); if (!was_full)
xfs_zone_remove_from_bucket(zi, rgno, from_bucket);
spin_unlock(&zi->zi_used_buckets_lock);
if (zi->zi_gc_thread)
wake_up_process(zi->zi_gc_thread);
} elseif (was_full) { /* * The zone transitioned from full, mark it up as reclaimable * and wake up GC which might be waiting for zones to reclaim.
*/
spin_lock(&zi->zi_used_buckets_lock);
xfs_zone_add_to_bucket(zi, rgno, to_bucket);
spin_unlock(&zi->zi_used_buckets_lock);
xfs_group_set_mark(xg, XFS_RTG_RECLAIMABLE); if (zi->zi_gc_thread && xfs_zoned_need_gc(mp))
wake_up_process(zi->zi_gc_thread);
} elseif (to_bucket != from_bucket) { /* * Move the zone to a new bucket if it dropped below the * threshold.
*/
spin_lock(&zi->zi_used_buckets_lock);
xfs_zone_add_to_bucket(zi, rgno, to_bucket);
xfs_zone_remove_from_bucket(zi, rgno, from_bucket);
spin_unlock(&zi->zi_used_buckets_lock);
}
}
/*
 * NOTE(review): this span is an extraction artifact.  The header below
 * declares xfs_zone_skip_blocks() (and "staticvoid" is a fused "static
 * void"), but from the xfs_bmapi_read() call onward the code belongs to a
 * different function: it references ip, new, data, nmaps, error and
 * old_startblock, none of which are declared in the visible scope, it
 * returns a value from a void-declared function, and the "skip" label is
 * not in view.  Recover both original bodies from the upstream file before
 * making functional changes here.
 */
/* * Called for blocks that have been written to disk, but not actually linked to * an inode, which can happen when garbage collection races with user data * writes to a file.
*/ staticvoid
xfs_zone_skip_blocks( struct xfs_open_zone *oz,
xfs_filblks_t len)
{ struct xfs_rtgroup *rtg = oz->oz_rtg;
/* Grab the corresponding mapping in the data fork. */
error = xfs_bmapi_read(ip, new->br_startoff, new->br_blockcount, &data,
&nmaps, 0); if (error) return error;
/* * Cap the update to the existing extent in the data fork because we can * only overwrite one extent at a time.
*/
ASSERT(new->br_blockcount >= data.br_blockcount);
new->br_blockcount = data.br_blockcount;
/* * If a data write raced with this GC write, keep the existing data in * the data fork, mark our newly written GC extent as reclaimable, then * move on to the next extent.
*/ if (old_startblock != NULLFSBLOCK &&
old_startblock != data.br_startblock) goto skip;
/*
 * "Free" blocks allocated in a zone.
 *
 * Just decrement the used blocks counter and report the space as freed.
 */
int
xfs_zone_free_blocks(
	struct xfs_trans	*tp,
	struct xfs_rtgroup	*rtg,
	xfs_fsblock_t		fsbno,
	xfs_filblks_t		len)
{
	struct xfs_mount	*mp = tp->t_mountp;
	struct xfs_inode	*rmapip = rtg_rmap(rtg);

	xfs_assert_ilocked(rmapip, XFS_ILOCK_EXCL);

	/* A used counter smaller than the freed length means corruption. */
	if (len > rmapip->i_used_blocks) {
		xfs_err(mp,
			"trying to free more blocks (%lld) than used counter (%u).",
			len, rmapip->i_used_blocks);
		ASSERT(len <= rmapip->i_used_blocks);
		xfs_rtginode_mark_sick(rtg, XFS_RTGI_RMAP);
		xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
		return -EFSCORRUPTED;
	}

	rmapip->i_used_blocks -= len;

	/*
	 * Don't add open zones to the reclaimable buckets.  The I/O completion
	 * for writing the last block will take care of accounting for already
	 * unused blocks instead.
	 */
	if (!READ_ONCE(rtg->rtg_open_zone))
		xfs_zone_account_reclaimable(rtg, len);

	xfs_add_frextents(mp, len);
	xfs_trans_log_inode(tp, rmapip, XFS_ILOG_CORE);
	return 0;
}
/*
 * NOTE(review): the next four lines are the tail end of a function whose
 * head is not in view (it publishes a fully initialized open zone via
 * rtg->rtg_open_zone and returns it).  Do not edit in isolation.
 */
/* * All dereferences of rtg->rtg_open_zone hold the ILOCK for the rmap * inode, but we don't really want to take that here because we are * under the zone_list_lock. Ensure the pointer is only set for a fully * initialized open zone structure so that a racy lookup finding it is * fine.
*/
WRITE_ONCE(rtg->rtg_open_zone, oz); return oz;
}
/* * Find a completely free zone, open it, and return a reference.
*/ struct xfs_open_zone *
xfs_open_zone( struct xfs_mount *mp, enum rw_hint write_hint, bool is_gc)
{ struct xfs_zone_info *zi = mp->m_zone_info; struct xfs_group *xg;
/*
 * NOTE(review): from here on the body does not match the header above: it
 * uses an undeclared local 'oz', drops and retakes zi->zi_open_zones_lock
 * that was never visibly acquired, ignores the 'is_gc' parameter, and
 * recursively calls xfs_open_zone(mp, write_hint, false).  This looks like
 * the body of a separate "try to open a zone" helper fused under the
 * xfs_open_zone() header by text extraction; restore both functions from
 * the original source before changing anything here.
 */
xg = xfs_find_free_zone(mp, zi->zi_free_zone_cursor, ULONG_MAX); if (!xg)
xg = xfs_find_free_zone(mp, 0, zi->zi_free_zone_cursor); if (!xg) return NULL;
if (zi->zi_nr_open_zones >= mp->m_max_open_zones - XFS_OPEN_GC_ZONES) return NULL; if (atomic_read(&zi->zi_nr_free_zones) <
XFS_GC_ZONES - XFS_OPEN_GC_ZONES) return NULL;
/* * Increment the open zone count to reserve our slot before dropping * zi_open_zones_lock.
*/
zi->zi_nr_open_zones++;
spin_unlock(&zi->zi_open_zones_lock);
oz = xfs_open_zone(mp, write_hint, false);
spin_lock(&zi->zi_open_zones_lock); if (!oz) {
zi->zi_nr_open_zones--; return NULL;
}
/* * If this was the last free zone, other waiters might be waiting * on us to write to it as well.
*/
wake_up_all(&zi->zi_zone_wait);
if (xfs_zoned_need_gc(mp))
wake_up_process(zi->zi_gc_thread);
trace_xfs_zone_opened(oz->oz_rtg); return oz;
}
/* * For data with short or medium lifetime, try to colocated it into an * already open zone with a matching temperature.
*/ staticbool
xfs_colocate_eagerly( enum rw_hint file_hint)
{ switch (file_hint) { case WRITE_LIFE_MEDIUM: case WRITE_LIFE_SHORT: case WRITE_LIFE_NONE: returntrue; default: returnfalse;
}
}
staticbool
xfs_good_hint_match( struct xfs_open_zone *oz, enum rw_hint file_hint)
{ switch (oz->oz_write_hint) { case WRITE_LIFE_LONG: case WRITE_LIFE_EXTREME: /* colocate long and extreme */ if (file_hint == WRITE_LIFE_LONG ||
file_hint == WRITE_LIFE_EXTREME) returntrue; break; case WRITE_LIFE_MEDIUM: /* colocate medium with medium */ if (file_hint == WRITE_LIFE_MEDIUM) returntrue; break; case WRITE_LIFE_SHORT: case WRITE_LIFE_NONE: case WRITE_LIFE_NOT_SET: /* colocate short and none */ if (file_hint <= WRITE_LIFE_SHORT) returntrue; break;
} returnfalse;
}
/*
 * Try to take a reference on @oz for a new allocation with lifetime hint
 * @file_hint.  Fails if the zone is already fully allocated, is not a
 * suitable temperature match (unless @lowspace forces us to take any zone),
 * or is going away (refcount already dropped to zero).
 *
 * Fixes fused tokens from the mangled original ("staticbool",
 * "returntrue", "returnfalse"), which are not valid C.
 */
static bool
xfs_try_use_zone(
	struct xfs_zone_info	*zi,
	enum rw_hint		file_hint,
	struct xfs_open_zone	*oz,
	bool			lowspace)
{
	if (oz->oz_allocated == rtg_blocks(oz->oz_rtg))
		return false;
	if (!lowspace && !xfs_good_hint_match(oz, file_hint))
		return false;
	if (!atomic_inc_not_zero(&oz->oz_ref))
		return false;

	/*
	 * If we have a hint set for the data, use that for the zone even if
	 * some data was written already without any hint set, but don't change
	 * the temperature after that as that would make little sense without
	 * tracking per-temperature class written block counts, which is
	 * probably overkill anyway.
	 */
	if (file_hint != WRITE_LIFE_NOT_SET &&
	    oz->oz_write_hint == WRITE_LIFE_NOT_SET)
		oz->oz_write_hint = file_hint;

	/*
	 * If we couldn't match by inode or life time we just pick the first
	 * zone with enough space above.  For that we want the least busy zone
	 * for some definition of "least" busy.  For now this simple LRU
	 * algorithm that rotates every zone to the end of the list will do it,
	 * even if it isn't exactly cache friendly.
	 */
	if (!list_is_last(&oz->oz_entry, &zi->zi_open_zones))
		list_move_tail(&oz->oz_entry, &zi->zi_open_zones);
	return true;
}
/* * Try to pack inodes that are written back after they were closed tight instead * of trying to open new zones for them or spread them to the least recently * used zone. This optimizes the data layout for workloads that untar or copy * a lot of small files. Right now this does not separate multiple such * streams.
*/ staticinlinebool xfs_zoned_pack_tight(struct xfs_inode *ip)
{ return !inode_is_open_for_write(VFS_I(ip)) &&
!(ip->i_diflags & XFS_DIFLAG_APPEND);
}
/*
 * NOTE(review): headless fragment.  The function this body belongs to (it
 * uses zi, mp, oz, write_hint and pack_tight, none declared in view, and
 * ends at an out_unlock label) starts before this span — presumably a zone
 * selection routine.  Also note "elseif" below is a fused "else if" and
 * will not compile as-is.  Recover the full function from the upstream
 * source before editing.
 */
/* * Try to fill up open zones with matching temperature if available. It * is better to try to co-locate data when this is favorable, so we can * activate empty zones when it is statistically better to separate * data.
*/
spin_lock(&zi->zi_open_zones_lock); if (xfs_colocate_eagerly(write_hint))
oz = xfs_select_open_zone_lru(zi, write_hint, false); elseif (pack_tight)
oz = xfs_select_open_zone_mru(zi, write_hint); if (oz) goto out_unlock;
/* * See if we can open a new zone and use that so that data for different * files is mixed as little as possible.
*/
oz = xfs_try_open_zone(mp, write_hint); if (oz) goto out_unlock;
/* * Try to colocate cold data with other cold data if we failed to open a * new zone for it.
*/ if (write_hint != WRITE_LIFE_NOT_SET &&
!xfs_colocate_eagerly(write_hint))
oz = xfs_select_open_zone_lru(zi, write_hint, false); if (!oz)
oz = xfs_select_open_zone_lru(zi, WRITE_LIFE_NOT_SET, false); if (!oz)
oz = xfs_select_open_zone_lru(zi, WRITE_LIFE_NOT_SET, true);
out_unlock:
spin_unlock(&zi->zi_open_zones_lock); return oz;
}
/*
 * NOTE(review): the three lines below are the tail of yet another function
 * (it references mp, sector and ioend, none in scope here) — apparently an
 * ioend boundary check.  Its head is not in view.
 */
if (xfs_rtb_to_rgbno(mp, xfs_daddr_to_rtb(mp, sector)) == 0)
ioend->io_flags |= IOMAP_IOEND_BOUNDARY;
}
/*
 * Cache the last zone written to for an inode so that it is considered first
 * for subsequent writes.
 */
struct xfs_zone_cache_item {
	struct xfs_mru_cache_elem	mru;	/* MRU-cache linkage */
	struct xfs_open_zone		*oz;	/* last zone used by the inode */
};
/* * Check if we have a cached last open zone available for the inode and * if yes return a reference to it.
*/ staticstruct xfs_open_zone *
xfs_cached_zone( struct xfs_mount *mp, struct xfs_inode *ip)
{ struct xfs_mru_cache_elem *mru; struct xfs_open_zone *oz;
mru = xfs_mru_cache_lookup(mp->m_zone_cache, ip->i_ino); if (!mru) return NULL;
oz = xfs_zone_cache_item(mru)->oz; if (oz) { /* * GC only steals open zones at mount time, so no GC zones * should end up in the cache.
*/
ASSERT(!oz->oz_is_gc);
ASSERT(atomic_read(&oz->oz_ref) > 0);
atomic_inc(&oz->oz_ref);
}
xfs_mru_cache_done(mp->m_zone_cache); return oz;
}
/*
 * NOTE(review): extraction artifact.  The header below declares
 * xfs_zone_cache_create_association() ("staticvoid" is a fused "static
 * void"), but its body is truncated: only the "association already exists"
 * branch is present, and starting at the "If we don't have a locally
 * cached zone" comment the code dereferences *oz (though oz is a plain
 * pointer parameter here) and uses write_hint, pack_tight, select_zone and
 * out_error, none of which exist in this scope — that part belongs to a
 * different (allocation-path) function.  Recover both original bodies
 * before making functional changes.
 */
/* * Update the last used zone cache for a given inode. * * The caller must have a reference on the open zone.
*/ staticvoid
xfs_zone_cache_create_association( struct xfs_inode *ip, struct xfs_open_zone *oz)
{ struct xfs_mount *mp = ip->i_mount; struct xfs_zone_cache_item *item = NULL; struct xfs_mru_cache_elem *mru;
mru = xfs_mru_cache_lookup(mp->m_zone_cache, ip->i_ino); if (mru) { /* * If we have an association already, update it to point to the * new zone.
*/
item = xfs_zone_cache_item(mru);
xfs_open_zone_put(item->oz);
item->oz = oz;
xfs_mru_cache_done(mp->m_zone_cache); return;
}
/* * If we don't have a locally cached zone in this write context, see if * the inode is still associated with a zone and use that if so.
*/ if (!*oz)
*oz = xfs_cached_zone(mp, ip);
if (!*oz) {
select_zone:
*oz = xfs_select_zone(mp, write_hint, pack_tight); if (!*oz) goto out_error;
/*
 * Wake up all threads waiting for a zoned space allocation when the file system
 * is shut down.
 */
void
xfs_zoned_wake_all(
	struct xfs_mount	*mp)
{
	/*
	 * Don't wake up if there is no m_zone_info.  This is complicated by the
	 * fact that unmount can't atomically clear m_zone_info and thus we need
	 * to check SB_ACTIVE for that, but mount temporarily enables SB_ACTIVE
	 * during log recovery so we can't entirely rely on that either.
	 */
	if ((mp->m_super->s_flags & SB_ACTIVE) && mp->m_zone_info)
		wake_up_all(&mp->m_zone_info->zi_zone_wait);
}
/*
 * NOTE(review): extraction artifact fusing three functions.  The header
 * below is xfs_zone_rgbno_is_valid(), but after the lockdep assertion the
 * body switches to zone-initialization code using zone, write_pointer,
 * used, zi, iz and error — none declared in this scope — and returns
 * negative errnos from a bool-declared function.  "elseif" further down is
 * a fused "else if".  The final block (from the zsbno check onward) is the
 * tail of yet another callback-style function.  Recover all three original
 * functions from upstream before editing.
 */
/* * Check if @rgbno in @rgb is a potentially valid block. It might still be * unused, but that information is only found in the rmap.
*/ bool
xfs_zone_rgbno_is_valid( struct xfs_rtgroup *rtg,
xfs_rgnumber_t rgbno)
{
lockdep_assert_held(&rtg_rmap(rtg)->i_lock);
if (zone && !xfs_zone_validate(zone, rtg, &write_pointer)) return -EFSCORRUPTED;
/* * For sequential write required zones we retrieved the hardware write * pointer above. * * For conventional zones or conventional devices we don't have that * luxury. Instead query the rmap to find the highest recorded block * and set the write pointer to the block after that. In case of a * power loss this misses blocks where the data I/O has completed but * not recorded in the rmap yet, and it also rewrites blocks if the most * recently written ones got deleted again before unmount, but this is * the best we can do without hardware support.
*/ if (!zone || zone->cond == BLK_ZONE_COND_NOT_WP) {
xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP);
highest_rgbno = xfs_rtrmap_highest_rgbno(rtg); if (highest_rgbno == NULLRGBLOCK)
write_pointer = 0; else
write_pointer = highest_rgbno + 1;
xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_RMAP);
}
/* * If there are no used blocks, but the zone is not in empty state yet * we lost power before the zoned reset. In that case finish the work * here.
*/ if (write_pointer == rtg_blocks(rtg) && used == 0) {
error = xfs_zone_gc_reset_sync(rtg); if (error) return error;
write_pointer = 0;
}
if (write_pointer == 0) { /* zone is empty */
atomic_inc(&zi->zi_nr_free_zones);
xfs_group_set_mark(&rtg->rtg_group, XFS_RTG_FREE);
iz->available += rtg_blocks(rtg);
} elseif (write_pointer < rtg_blocks(rtg)) { /* zone is open */ struct xfs_open_zone *oz;
atomic_inc(&rtg_group(rtg)->xg_active_ref);
oz = xfs_init_open_zone(rtg, write_pointer, WRITE_LIFE_NOT_SET, false);
list_add_tail(&oz->oz_entry, &zi->zi_open_zones);
zi->zi_nr_open_zones++;
/*
 * NOTE(review): the remainder is the tail of a different function — it
 * uses mp, zsbno, rgno and rtg in ways inconsistent with the scope above
 * and calls xfs_init_zone(iz, rtg, zone).
 */
if (xfs_rtb_to_rgbno(mp, zsbno) != 0) {
xfs_warn(mp, "mismatched zone start 0x%llx.", zsbno); return -EFSCORRUPTED;
}
rgno = xfs_rtb_to_rgno(mp, zsbno);
rtg = xfs_rtgroup_grab(mp, rgno); if (!rtg) {
xfs_warn(mp, "realtime group not found for zone %u.", rgno); return -EFSCORRUPTED;
}
error = xfs_init_zone(iz, rtg, zone);
xfs_rtgroup_rele(rtg); return error;
}
/* * Calculate the max open zone limit based on the of number of backing zones * available.
*/ staticinline uint32_t
xfs_max_open_zones( struct xfs_mount *mp)
{ unsignedint max_open, max_open_data_zones;
/* * We need two zones for every open data zone, one in reserve as we * don't reclaim open zones. One data zone and its spare is included * in XFS_MIN_ZONES to support at least one user data writer.
*/
max_open_data_zones = (mp->m_sb.sb_rgcount - XFS_MIN_ZONES) / 2 + 1;
max_open = max_open_data_zones + XFS_OPEN_GC_ZONES;
/* * Cap the max open limit to 1/4 of available space. Without this we'd * run out of easy reclaim targets too quickly and storage devices don't * handle huge numbers of concurrent write streams overly well.
*/
max_open = min(max_open, mp->m_sb.sb_rgcount / 4);
return max(XFS_MIN_OPEN_ZONES, max_open);
}
/*
 * NOTE(review): truncated function.  "staticint" below is a fused "static
 * int", and the success path ("return 0;" plus the closing brace) is not
 * in view — the text jumps straight into the interior of the zone-info
 * allocator.  Also note the "hardware limit" xfs_info() message carries a
 * trailing "\n" while the sibling xfs_info() call does not — presumably
 * the trailing newline is spurious; verify against the xfs_info
 * convention in this codebase.
 */
/* * Normally we use the open zone limit that the device reports. If there is * none let the user pick one from the command line. * * If the device doesn't report an open zone limit and there is no override, * allow to hold about a quarter of the zones open. In theory we could allow * all to be open, but at that point we run into GC deadlocks because we can't * reclaim open zones. * * When used on conventional SSDs a lower open limit is advisable as we'll * otherwise overwhelm the FTL just as much as a conventional block allocator. * * Note: To debug the open zone management code, force max_open to 1 here.
*/ staticint
xfs_calc_open_zones( struct xfs_mount *mp)
{ struct block_device *bdev = mp->m_rtdev_targp->bt_bdev; unsignedint bdev_open_zones = bdev_max_open_zones(bdev);
if (!mp->m_max_open_zones) { if (bdev_open_zones)
mp->m_max_open_zones = bdev_open_zones; else
mp->m_max_open_zones = xfs_max_open_zones(mp);
}
if (mp->m_max_open_zones < XFS_MIN_OPEN_ZONES) {
xfs_notice(mp, "need at least %u open zones.",
XFS_MIN_OPEN_ZONES); return -EIO;
}
if (bdev_open_zones && bdev_open_zones < mp->m_max_open_zones) {
mp->m_max_open_zones = bdev_open_zones;
xfs_info(mp, "limiting open zones to %u due to hardware limit.\n",
bdev_open_zones);
}
if (mp->m_max_open_zones > xfs_max_open_zones(mp)) {
mp->m_max_open_zones = xfs_max_open_zones(mp);
xfs_info(mp, "limiting open zones to %u due to total zone count (%u)",
mp->m_max_open_zones, mp->m_sb.sb_rgcount);
}
/*
 * NOTE(review): headless fragment of the zone-info allocator — zi, mp and
 * i are not declared in view.
 *
 * NOTE(review): genuine off-by-one in the error path below:
 * "while (--i > 0)" stops before freeing zi_used_bucket_bitmap[0], so a
 * mid-loop allocation failure leaks the first bucket bitmap.  It should be
 * "while (--i >= 0)" (or an equivalent loop that covers index 0).
 */
zi = kzalloc(sizeof(*zi), GFP_KERNEL); if (!zi) return NULL;
INIT_LIST_HEAD(&zi->zi_open_zones);
INIT_LIST_HEAD(&zi->zi_reclaim_reservations);
spin_lock_init(&zi->zi_reset_list_lock);
spin_lock_init(&zi->zi_open_zones_lock);
spin_lock_init(&zi->zi_reservation_lock);
init_waitqueue_head(&zi->zi_zone_wait);
spin_lock_init(&zi->zi_used_buckets_lock); for (i = 0; i < XFS_ZONE_USED_BUCKETS; i++) {
zi->zi_used_bucket_bitmap[i] = xfs_alloc_bucket_bitmap(mp); if (!zi->zi_used_bucket_bitmap[i]) goto out_free_bitmaps;
} return zi;
out_free_bitmaps: while (--i > 0)
kvfree(zi->zi_used_bucket_bitmap[i]);
kfree(zi); return NULL;
}
/*
 * Tear down a zone info structure: release all open zones, free the
 * per-bucket used-block bitmaps, then free the structure itself.
 *
 * Fixes the fused "staticvoid" token from the mangled original, which is
 * not valid C.
 */
static void
xfs_free_zone_info(
	struct xfs_zone_info	*zi)
{
	int			i;

	xfs_free_open_zones(zi);
	for (i = 0; i < XFS_ZONE_USED_BUCKETS; i++)
		kvfree(zi->zi_used_bucket_bitmap[i]);
	kfree(zi);
}
/*
 * NOTE(review): truncated function.  The body jumps to out_free_zone_info
 * below, but that label, the error-unwind code, and the closing brace are
 * not in view (the visible text ends at "return 0;" followed by non-source
 * residue).  Recover the full function from upstream before editing.
 */
int
xfs_mount_zones( struct xfs_mount *mp)
{ struct xfs_init_zones iz = {
.mp = mp,
}; struct xfs_buftarg *bt = mp->m_rtdev_targp; int error;
if (!bt) {
xfs_notice(mp, "RT device missing."); return -EINVAL;
}
if (!xfs_has_rtgroups(mp) || !xfs_has_rmapbt(mp)) {
xfs_notice(mp, "invalid flag combination."); return -EFSCORRUPTED;
} if (mp->m_sb.sb_rextsize != 1) {
xfs_notice(mp, "zoned file systems do not support rextsize."); return -EFSCORRUPTED;
} if (mp->m_sb.sb_rgcount < XFS_MIN_ZONES) {
xfs_notice(mp, "zoned file systems need to have at least %u zones.", XFS_MIN_ZONES); return -EFSCORRUPTED;
}
error = xfs_calc_open_zones(mp); if (error) return error;
mp->m_zone_info = xfs_alloc_zone_info(mp); if (!mp->m_zone_info) return -ENOMEM;
xfs_info(mp, "%u zones of %u blocks size (%u max open)",
mp->m_sb.sb_rgcount, mp->m_groups[XG_TYPE_RTG].blocks,
mp->m_max_open_zones);
trace_xfs_zones_mount(mp);
/* * The user may configure GC to free up a percentage of unused blocks. * By default this is 0. GC will always trigger at the minimum level * for keeping max_open_zones available for data placement.
*/
mp->m_zonegc_low_space = 0;
error = xfs_zone_gc_mount(mp); if (error) goto out_free_zone_info;
/* * Set up a mru cache to track inode to open zone for data placement * purposes. The magic values for group count and life time is the * same as the defaults for file streams, which seems sane enough.
*/
xfs_mru_cache_create(&mp->m_zone_cache, mp,
5000, 10, xfs_zone_cache_free_func); return 0;
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereit gestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.