/*
 * Free Space Btree Repair
 * =======================
 *
 * The reverse mappings are supposed to record all space usage for the entire
 * AG.  Therefore, we can recreate the free extent records in an AG by looking
 * for gaps in the physical extents recorded in the rmapbt.  These records are
 * staged in @free_records.  Identifying the gaps is more difficult on a
 * reflink filesystem because rmap records are allowed to overlap.
 *
 * Because the final step of building a new index is to free the space used by
 * the old index, repair needs to find that space.  Unfortunately, all
 * structures that live in the free space (bnobt, cntbt, rmapbt, agfl) share
 * the same rmapbt owner code (OWN_AG), so this is not straightforward.
 *
 * The scan of the reverse mapping information records the space used by
 * OWN_AG in @old_allocbt_blocks, which (at this stage) is somewhat misnamed.
 * While walking the rmapbt records, we create a second bitmap
 * @not_allocbt_blocks to record all visited rmap btree blocks and all blocks
 * owned by the AGFL.
 *
 * That is the point where the definition of @old_allocbt_blocks shifts.  This
 * expression identifies possible former bnobt/cntbt blocks:
 *
 *	(OWN_AG blocks) & ~(rmapbt blocks | agfl blocks);
 *
 * Substituting from the above definitions, that becomes:
 *
 *	old_allocbt_blocks & ~not_allocbt_blocks
 *
 * The OWN_AG bitmap itself isn't needed after this point, so what we really
 * do instead is:
 *
 *	old_allocbt_blocks &= ~not_allocbt_blocks;
 *
 * After this point, @old_allocbt_blocks is a bitmap of alleged former
 * bnobt/cntbt blocks.  The xagb_bitmap_disunion operation modifies its first
 * parameter in place to avoid copying records around.
 *
 * Next, some of the space described by @free_records is diverted to the newbt
 * reservation and used to format new btree blocks.  The remaining records are
 * written to the new btree indices.  We reconstruct both bnobt and cntbt at
 * the same time since we've already done all the work.
 *
 * We use the prefix 'xrep_abt' here because we regenerate both free space
 * allocation btrees at the same time.
 */
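/*
 * Worked example of the bitmap algebra above (block numbers are illustrative,
 * not from the original comment): suppose the rmapbt records OWN_AG extents
 * covering agbnos 10-30, and while scanning we observe that agbnos 10-14 are
 * rmapbt blocks and agbnos 15-18 sit on the AGFL.  Then:
 *
 *	old_allocbt_blocks = {10..30}
 *	not_allocbt_blocks = {10..18}
 *
 * and after the disunion, old_allocbt_blocks = {19..30}, which are the only
 * blocks that could have belonged to the old bnobt/cntbt.
 */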
struct xrep_abt {
	/* Blocks owned by the rmapbt or the agfl. */
	struct xagb_bitmap	not_allocbt_blocks;

	/* All OWN_AG blocks. */
	struct xagb_bitmap	old_allocbt_blocks;

	/*
	 * New bnobt information.  All btree block reservations are added to
	 * the reservation list in new_bnobt.
	 */
	struct xrep_newbt	new_bnobt;

	/* New cntbt information. */
	struct xrep_newbt	new_cntbt;

	/* Free space extents. */
	struct xfarray		*free_records;

	struct xfs_scrub	*sc;

	/* Number of non-null records in @free_records. */
	uint64_t		nr_real_records;

	/* get_records()'s position in the free space record array. */
	xfarray_idx_t		array_cur;

	/*
	 * Next block we anticipate seeing in the rmap records.  If the next
	 * rmap record is greater than next_agbno, we have found unused space.
	 */
	xfs_agblock_t		next_agbno;

	/* Number of free blocks in this AG. */
	xfs_agblock_t		nr_blocks;

	/* Longest free extent we found in the AG. */
	xfs_agblock_t		longest;
};
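/*
 * Overview of how these fields flow through the repair (a summary of the
 * functions below, added here for orientation): xrep_abt_find_freespace()
 * fills @free_records, @old_allocbt_blocks, and @nr_blocks;
 * xrep_abt_reserve_space() consumes records off the tail of @free_records to
 * reserve blocks for the new btrees; and the bulk loader streams the
 * remaining records back out through @array_cur.
 */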
/* Set up to repair AG free space btrees. */
int
xrep_setup_ag_allocbt(
	struct xfs_scrub	*sc)
{
	struct xfs_group	*xg = pag_group(sc->sa.pag);
	unsigned int		busy_gen;

	/*
	 * Make sure the busy extent list is clear because we can't put
	 * extents on there twice.
	 */
	if (xfs_extent_busy_list_empty(xg, &busy_gen))
		return 0;

	return xfs_extent_busy_flush(sc->tp, xg, busy_gen, 0);
}
/* Check for any obvious conflicts in the free extent. */
STATIC int
xrep_abt_check_free_ext(
	struct xfs_scrub	*sc,
	const struct xfs_alloc_rec_incore *rec)
{
	enum xbtree_recpacking	outcome;
	int			error;

	if (xfs_alloc_check_irec(sc->sa.pag, rec) != NULL)
		return -EFSCORRUPTED;

	/* Must not be an inode chunk. */
	error = xfs_ialloc_has_inodes_at_extent(sc->sa.ino_cur,
			rec->ar_startblock, rec->ar_blockcount, &outcome);
	if (error)
		return error;
	if (outcome != XBTREE_RECPACKING_EMPTY)
		return -EFSCORRUPTED;

	/* Must not be shared or CoW staging. */
	if (sc->sa.refc_cur) {
		error = xfs_refcount_has_records(sc->sa.refc_cur,
				XFS_REFC_DOMAIN_SHARED, rec->ar_startblock,
				rec->ar_blockcount, &outcome);
		if (error)
			return error;
		if (outcome != XBTREE_RECPACKING_EMPTY)
			return -EFSCORRUPTED;

		error = xfs_refcount_has_records(sc->sa.refc_cur,
				XFS_REFC_DOMAIN_COW, rec->ar_startblock,
				rec->ar_blockcount, &outcome);
		if (error)
			return error;
		if (outcome != XBTREE_RECPACKING_EMPTY)
			return -EFSCORRUPTED;
	}

	return 0;
}
/*
 * Stash a free space record for all the space since the last bno we found
 * all the way up to @end.
 */
static int
xrep_abt_stash(
	struct xrep_abt		*ra,
	xfs_agblock_t		end)
{
	struct xfs_alloc_rec_incore arec = {
		.ar_startblock	= ra->next_agbno,
		.ar_blockcount	= end - ra->next_agbno,
	};
	struct xfs_scrub	*sc = ra->sc;
	int			error = 0;

	if (xchk_should_terminate(sc, &error))
		return error;

	error = xrep_abt_check_free_ext(ra->sc, &arec);
	if (error)
		return error;

	trace_xrep_abt_found(sc->sa.pag, &arec);

	error = xfarray_append(ra->free_records, &arec);
	if (error)
		return error;

	ra->nr_blocks += arec.ar_blockcount;
	return 0;
}
/* Record extents that aren't in use from gaps in the rmap records. */
STATIC int
xrep_abt_walk_rmap(
	struct xfs_btree_cur	*cur,
	const struct xfs_rmap_irec *rec,
	void			*priv)
{
	struct xrep_abt		*ra = priv;
	int			error;

	/* Record all the OWN_AG blocks... */
	if (rec->rm_owner == XFS_RMAP_OWN_AG) {
		error = xagb_bitmap_set(&ra->old_allocbt_blocks,
				rec->rm_startblock, rec->rm_blockcount);
		if (error)
			return error;
	}

	/* ...and all the rmapbt blocks... */
	error = xagb_bitmap_set_btcur_path(&ra->not_allocbt_blocks, cur);
	if (error)
		return error;

	/* ...and all the free space. */
	if (rec->rm_startblock > ra->next_agbno) {
		error = xrep_abt_stash(ra, rec->rm_startblock);
		if (error)
			return error;
	}

	/*
	 * rmap records can overlap on reflink filesystems, so project
	 * next_agbno as far out into the AG space as we currently know about.
	 */
	ra->next_agbno = max_t(xfs_agblock_t, ra->next_agbno,
			rec->rm_startblock + rec->rm_blockcount);
	return 0;
}
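/*
 * Illustrative example of the overlap handling (numbers made up): on a
 * reflink filesystem the walk might see records [agbno 10, len 20] and
 * [agbno 15, len 30] for two owners of a shared extent.  After the first
 * record, next_agbno is 30; the second record starts below next_agbno, so no
 * gap is stashed, and next_agbno advances to max(30, 45) = 45.  Only a
 * record starting beyond agbno 45 would reveal a genuine gap.
 */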
/* Collect an AGFL block for the not-to-release list. */
static int
xrep_abt_walk_agfl(
	struct xfs_mount	*mp,
	xfs_agblock_t		agbno,
	void			*priv)
{
	struct xrep_abt		*ra = priv;

	return xagb_bitmap_set(&ra->not_allocbt_blocks, agbno, 1);
}
/*
 * Compare two free space extents by block number.  We want to sort in order
 * of increasing block number.
 */
static int
xrep_bnobt_extent_cmp(
	const void		*a,
	const void		*b)
{
	const struct xfs_alloc_rec_incore *ap = a;
	const struct xfs_alloc_rec_incore *bp = b;

	if (ap->ar_startblock > bp->ar_startblock)
		return 1;
	else if (ap->ar_startblock < bp->ar_startblock)
		return -1;
	return 0;
}
/*
 * Re-sort the free extents by block number so that we can put the records
 * into the bnobt in the correct order.  Make sure the records do not overlap
 * in physical space.
 */
STATIC int
xrep_bnobt_sort_records(
	struct xrep_abt		*ra)
{
	struct xfs_alloc_rec_incore arec;
	xfarray_idx_t		cur = XFARRAY_CURSOR_INIT;
	xfs_agblock_t		next_agbno = 0;
	int			error;

	error = xfarray_sort(ra->free_records, xrep_bnobt_extent_cmp, 0);
	if (error)
		return error;

	while ((error = xfarray_iter(ra->free_records, &cur, &arec)) == 1) {
		if (arec.ar_startblock < next_agbno)
			return -EFSCORRUPTED;

		next_agbno = arec.ar_startblock + arec.ar_blockcount;
	}

	return error;
}
/*
 * Compare two free space extents by length and then block number.  We want
 * to sort first in order of increasing length and then in order of
 * increasing block number.
 */
static int
xrep_cntbt_extent_cmp(
	const void		*a,
	const void		*b)
{
	const struct xfs_alloc_rec_incore *ap = a;
	const struct xfs_alloc_rec_incore *bp = b;

	if (ap->ar_blockcount > bp->ar_blockcount)
		return 1;
	else if (ap->ar_blockcount < bp->ar_blockcount)
		return -1;

	/* Ties are broken by block number. */
	return xrep_bnobt_extent_cmp(a, b);
}
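/*
 * Example ordering (illustrative): the extents (agbno 50, len 4),
 * (agbno 10, len 4), and (agbno 30, len 2) sort for the cntbt as
 * (30, 2), (10, 4), (50, 4) -- shortest first, ties broken by block number.
 */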
/*
 * Sort the free extents by length so that we can put the records into the
 * cntbt in the correct order.  Don't let userspace kill us if we're resorting
 * after allocating btree blocks.
 */
STATIC int
xrep_cntbt_sort_records(
	struct xrep_abt		*ra,
	bool			is_resort)
{
	return xfarray_sort(ra->free_records, xrep_cntbt_extent_cmp,
			is_resort ? 0 : XFARRAY_SORT_KILLABLE);
}
/*
 * Iterate all reverse mappings to find (1) the gaps between rmap records (all
 * unowned space), (2) the OWN_AG extents (which encompass the free space
 * btrees, the rmapbt, and the agfl), (3) the rmapbt blocks, and (4) the AGFL
 * blocks.  The free space is (1) + (2) - (3) - (4).
 */
STATIC int
xrep_abt_find_freespace(
	struct xrep_abt		*ra)
{
	struct xfs_scrub	*sc = ra->sc;
	struct xfs_mount	*mp = sc->mp;
	struct xfs_agf		*agf = sc->sa.agf_bp->b_addr;
	struct xfs_buf		*agfl_bp;
	xfs_agblock_t		agend;
	int			error;

	xagb_bitmap_init(&ra->not_allocbt_blocks);

	xrep_ag_btcur_init(sc, &sc->sa);

	/*
	 * Iterate all the reverse mappings to find gaps in the physical
	 * mappings, all the OWN_AG blocks, and all the rmapbt extents.
	 */
	error = xfs_rmap_query_all(sc->sa.rmap_cur, xrep_abt_walk_rmap, ra);
	if (error)
		goto err;

	/* Insert a record for space between the last rmap and EOAG. */
	agend = be32_to_cpu(agf->agf_length);
	if (ra->next_agbno < agend) {
		error = xrep_abt_stash(ra, agend);
		if (error)
			goto err;
	}

	/* Collect all the AGFL blocks. */
	error = xfs_alloc_read_agfl(sc->sa.pag, sc->tp, &agfl_bp);
	if (error)
		goto err;

	error = xfs_agfl_walk(mp, agf, agfl_bp, xrep_abt_walk_agfl, ra);
	if (error)
		goto err_agfl;

	/* Compute the old bnobt/cntbt blocks. */
	error = xagb_bitmap_disunion(&ra->old_allocbt_blocks,
			&ra->not_allocbt_blocks);
	if (error)
		goto err_agfl;

	ra->nr_real_records = xfarray_length(ra->free_records);
err_agfl:
	xfs_trans_brelse(sc->tp, agfl_bp);
err:
	xchk_ag_btcur_free(&sc->sa);
	xagb_bitmap_destroy(&ra->not_allocbt_blocks);
	return error;
}
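/*
 * Example of the (1) + (2) - (3) - (4) accounting above (illustrative): in
 * an AG of 100 blocks, suppose the gaps between rmap records cover 60 blocks
 * and the OWN_AG extents cover 12 blocks, of which 4 are rmapbt blocks and 2
 * sit on the AGFL.  Once the old bnobt/cntbt blocks (12 - 4 - 2 = 6) are
 * reaped, the AG's free space will be 60 + 6 = 66 blocks.
 */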
/*
 * We're going to use the observed free space records to reserve blocks for
 * the new free space btrees, so we play an iterative game where we try to
 * converge on the number of blocks we need:
 *
 * 1. Estimate how many blocks we'll need to store the records.
 * 2. If the first free record has more blocks than we need, we're done.
 *    We will have to re-sort the records prior to building the cntbt.
 * 3. If that record has exactly the number of blocks we need, null out the
 *    record.  We're done.
 * 4. Otherwise, we still need more blocks.  Null out the record, subtract its
 *    length from the number of blocks we need, and go back to step 1.
 *
 * Fortunately, we don't have to do any transaction work to play this game, so
 * we don't have to tear down the staging cursors.
 */
STATIC int
xrep_abt_reserve_space(
	struct xrep_abt		*ra,
	struct xfs_btree_cur	*bno_cur,
	struct xfs_btree_cur	*cnt_cur,
	bool			*needs_resort)
{
	struct xfs_scrub	*sc = ra->sc;
	xfarray_idx_t		record_nr;
	unsigned int		allocated = 0;
	int			error = 0;

	record_nr = xfarray_length(ra->free_records) - 1;
	do {
		struct xfs_alloc_rec_incore arec;
		uint64_t	required;
		unsigned int	desired;
		unsigned int	len;

		/* Compute how many blocks we'll need. */
		error = xfs_btree_bload_compute_geometry(cnt_cur,
				&ra->new_cntbt.bload, ra->nr_real_records);
		if (error)
			break;

		error = xfs_btree_bload_compute_geometry(bno_cur,
				&ra->new_bnobt.bload, ra->nr_real_records);
		if (error)
			break;

		/* How many btree blocks do we need to store all records? */
		required = ra->new_bnobt.bload.nr_blocks +
			   ra->new_cntbt.bload.nr_blocks;
		ASSERT(required < INT_MAX);

		/* If we've reserved enough blocks, we're done. */
		if (allocated >= required)
			break;

		desired = required - allocated;

		/* We need space but there's none left; bye! */
		if (ra->nr_real_records == 0) {
			error = -ENOSPC;
			break;
		}

		/* Grab the first record from the list. */
		error = xfarray_load(ra->free_records, record_nr, &arec);
		if (error)
			break;

		ASSERT(arec.ar_blockcount <= UINT_MAX);
		len = min_t(unsigned int, arec.ar_blockcount, desired);

		/* Reserve whatever part of this extent we need. */
		error = xrep_newbt_add_extent(&ra->new_bnobt, sc->sa.pag,
				arec.ar_startblock, len);
		if (error)
			break;

		allocated += len;
		ra->nr_blocks -= len;

		if (arec.ar_blockcount > desired) {
			/*
			 * Record has more space than we need.  The number of
			 * free records doesn't change, so shrink the free
			 * record, inform the caller that the records are no
			 * longer sorted by length, and exit.
			 */
			arec.ar_startblock += desired;
			arec.ar_blockcount -= desired;
			error = xfarray_store(ra->free_records, record_nr,
					&arec);
			if (error)
				break;

			*needs_resort = true;
			return 0;
		}

		/*
		 * We're going to use up the entire record, so unset it and
		 * move on to the next one.  This changes the number of free
		 * records (but doesn't break the sorting order), so we must
		 * go around the loop once more to re-run _bload_init.
		 */
		error = xfarray_unset(ra->free_records, record_nr);
		if (error)
			break;

		ra->nr_real_records--;
		record_nr--;
	} while (1);

	return error;
}
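/*
 * Worked example of the convergence loop (numbers are illustrative): suppose
 * the geometry computation says both new btrees together need 7 blocks, and
 * the record at the tail of the array covers 20 blocks.  Since 20 > 7, we
 * reserve 7 of those blocks, shrink the record to 13 blocks, set
 * *needs_resort, and return.  Had the record covered only 4 blocks, we would
 * consume it entirely, decrement nr_real_records, and recompute the
 * geometry, because removing a record can shrink the btrees we are about to
 * build.
 */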
/*
 * Deal with all the space we reserved.  Blocks that were allocated for the
 * free space btrees need to have a (deferred) rmap added for the OWN_AG
 * allocation, and blocks that didn't get used can be freed via the usual
 * (deferred) means.
 */
STATIC int
xrep_abt_dispose_one(
	struct xrep_abt		*ra,
	struct xrep_newbt_resv	*resv)
{
	struct xfs_scrub	*sc = ra->sc;
	struct xfs_perag	*pag = sc->sa.pag;
	xfs_agblock_t		free_agbno = resv->agbno + resv->used;
	xfs_extlen_t		free_aglen = resv->len - resv->used;
	int			error;

	ASSERT(pag == resv->pag);

	/* Add a deferred rmap for each extent we used. */
	if (resv->used > 0)
		xfs_rmap_alloc_extent(sc->tp, false,
				xfs_agbno_to_fsb(pag, resv->agbno), resv->used,
				XFS_RMAP_OWN_AG);

	/*
	 * For each reserved btree block we didn't use, add it to the free
	 * space btree.  We didn't touch fdblocks when we reserved them, so
	 * we don't touch it now.
	 */
	if (free_aglen == 0)
		return 0;

	error = __xfs_free_extent(sc->tp, resv->pag, free_agbno, free_aglen,
			&ra->new_bnobt.oinfo, XFS_AG_RESV_IGNORE, true);
	if (error)
		return error;

	return xrep_defer_finish(sc);
}
/* Free all the accounting info and disk space we reserved for the new btrees. */
STATIC void
xrep_abt_dispose_reservations(
	struct xrep_abt		*ra,
	int			error)
{
	struct xrep_newbt_resv	*resv, *n;

	if (error)
		goto junkit;

	list_for_each_entry_safe(resv, n, &ra->new_bnobt.resv_list, list) {
		error = xrep_abt_dispose_one(ra, resv);
		if (error)
			goto junkit;
	}

junkit:
	list_for_each_entry_safe(resv, n, &ra->new_bnobt.resv_list, list) {
		xfs_perag_put(resv->pag);
		list_del(&resv->list);
		kfree(resv);
	}

	xrep_newbt_cancel(&ra->new_bnobt);
	xrep_newbt_cancel(&ra->new_cntbt);
}
/* Feed one of the new btree blocks to the bulk loader. */
STATIC int
xrep_abt_claim_block(
	struct xfs_btree_cur	*cur,
	union xfs_btree_ptr	*ptr,
	void			*priv)
{
	struct xrep_abt		*ra = priv;

	return xrep_newbt_claim_block(cur, &ra->new_bnobt, ptr);
}
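/*
 * Feed sorted records from @free_records to the bulk loader.  This callback
 * is referenced by the staging code below; the body here is reconstructed
 * from the upstream kernel sources, so treat it as a sketch whose details
 * may differ between kernel versions.
 */
STATIC int
xrep_abt_get_records(
	struct xfs_btree_cur	*cur,
	unsigned int		idx,
	struct xfs_btree_block	*block,
	unsigned int		nr_wanted,
	void			*priv)
{
	struct xfs_alloc_rec_incore *arec = &cur->bc_rec.a;
	struct xrep_abt		*ra = priv;
	union xfs_btree_rec	*block_rec;
	unsigned int		loaded;
	int			error;

	for (loaded = 0; loaded < nr_wanted; loaded++, idx++) {
		/* Pull the next staged record and track the longest extent. */
		error = xfarray_load_next(ra->free_records, &ra->array_cur,
				arec);
		if (error)
			return error;

		ra->longest = max(ra->longest, arec->ar_blockcount);

		block_rec = xfs_btree_rec_addr(cur, idx, block);
		cur->bc_ops->init_rec_from_cur(cur, block_rec);
	}

	return loaded;
}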
/*
 * Reset the AGF counters to reflect the free space btrees that we just
 * rebuilt, then reinitialize the per-AG data.
 */
STATIC int
xrep_abt_reset_counters(
	struct xrep_abt		*ra)
{
	struct xfs_scrub	*sc = ra->sc;
	struct xfs_perag	*pag = sc->sa.pag;
	struct xfs_agf		*agf = sc->sa.agf_bp->b_addr;
	unsigned int		freesp_btreeblks = 0;

	/*
	 * Compute the contribution to agf_btreeblks for the new free space
	 * btrees.  This is the computed btree size minus anything we didn't
	 * use.
	 */
	freesp_btreeblks += ra->new_bnobt.bload.nr_blocks - 1;
	freesp_btreeblks += ra->new_cntbt.bload.nr_blocks - 1;

	/*
	 * The AGF header contains extra information related to the free space
	 * btrees, so we must update those fields here.
	 */
	agf->agf_btreeblks = cpu_to_be32(freesp_btreeblks +
				(be32_to_cpu(agf->agf_rmap_blocks) - 1));
	agf->agf_freeblks = cpu_to_be32(ra->nr_blocks);
	agf->agf_longest = cpu_to_be32(ra->longest);
	xfs_alloc_log_agf(sc->tp, sc->sa.agf_bp, XFS_AGF_BTREEBLKS |
						 XFS_AGF_LONGEST |
						 XFS_AGF_FREEBLKS);

	/*
	 * After we commit the new btree to disk, it is possible that the
	 * process to reap the old btree blocks will race with the AIL trying
	 * to checkpoint the old btree blocks into the filesystem.  If the new
	 * tree is shorter than the old one, the allocbt write verifier will
	 * fail and the AIL will shut down the filesystem.
	 *
	 * To avoid this, save the old incore btree height values as the alt
	 * height values before re-initializing the perag info from the
	 * updated AGF to capture all the new values.
	 */
	pag->pagf_repair_bno_level = pag->pagf_bno_level;
	pag->pagf_repair_cnt_level = pag->pagf_cnt_level;

	/* Reinitialize with the values we just logged. */
	return xrep_reinit_pagf(sc);
}
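/*
 * Example of the agf_btreeblks arithmetic (illustrative): if the new bnobt
 * was built from 5 blocks, the new cntbt from 3, and agf_rmap_blocks reports
 * 4, then agf_btreeblks = (5 - 1) + (3 - 1) + (4 - 1) = 9.  The "- 1" terms
 * reflect that agf_btreeblks does not count each btree's root block.
 */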
/*
 * Use the collected free space information to stage new free space btrees.
 * If this is successful we'll return with the new btree root information
 * logged to the repair transaction but not yet committed.
 */
STATIC int
xrep_abt_build_new_trees(
	struct xrep_abt		*ra)
{
	struct xfs_scrub	*sc = ra->sc;
	struct xfs_btree_cur	*bno_cur;
	struct xfs_btree_cur	*cnt_cur;
	struct xfs_perag	*pag = sc->sa.pag;
	bool			needs_resort = false;
	int			error;

	/*
	 * Sort the free extents by length so that we can set up the free
	 * space btrees in as few extents as possible.  This reduces the
	 * amount of deferred rmap / free work we have to do at the end.
	 */
	error = xrep_cntbt_sort_records(ra, false);
	if (error)
		return error;

	/*
	 * Prepare to construct the new btree by reserving disk space for the
	 * new btree and setting up all the accounting information we'll need
	 * to root the new btree while it's under construction and before we
	 * attach it to the AG header.
	 */
	xrep_newbt_init_bare(&ra->new_bnobt, sc);
	xrep_newbt_init_bare(&ra->new_cntbt, sc);

	/* Wire up the bulk loader callbacks. */
	ra->new_bnobt.bload.get_records = xrep_abt_get_records;
	ra->new_cntbt.bload.get_records = xrep_abt_get_records;

	ra->new_bnobt.bload.claim_block = xrep_abt_claim_block;
	ra->new_cntbt.bload.claim_block = xrep_abt_claim_block;

	/* Allocate cursors for the staged btrees. */
	bno_cur = xfs_bnobt_init_cursor(sc->mp, NULL, sc->sa.agf_bp, pag);
	xfs_btree_stage_afakeroot(bno_cur, &ra->new_bnobt.afake);

	cnt_cur = xfs_cntbt_init_cursor(sc->mp, NULL, sc->sa.agf_bp, pag);
	xfs_btree_stage_afakeroot(cnt_cur, &ra->new_cntbt.afake);

	/* Last chance to abort before we start committing fixes. */
	if (xchk_should_terminate(sc, &error))
		goto err_cur;

	/* Reserve the space we'll need for the new btrees. */
	error = xrep_abt_reserve_space(ra, bno_cur, cnt_cur, &needs_resort);
	if (error)
		goto err_cur;

	/*
	 * If we need to re-sort the free extents by length, do so so that we
	 * can put the records into the cntbt in the correct order.
	 */
	if (needs_resort) {
		error = xrep_cntbt_sort_records(ra, needs_resort);
		if (error)
			goto err_cur;
	}

	/*
	 * Due to btree slack factors, it's possible for a new btree to be one
	 * level taller than the old btree.  Update the alternate incore btree
	 * height so that we don't trip the verifiers when writing the new
	 * btree blocks to disk.
	 */
	pag->pagf_repair_bno_level = ra->new_bnobt.bload.btree_height;
	pag->pagf_repair_cnt_level = ra->new_cntbt.bload.btree_height;

	/* Load the free space by length tree. */
	ra->array_cur = XFARRAY_CURSOR_INIT;
	ra->longest = 0;
	error = xfs_btree_bload(cnt_cur, &ra->new_cntbt.bload, ra);
	if (error)
		goto err_levels;

	error = xrep_bnobt_sort_records(ra);
	if (error)
		goto err_levels;

	/* Load the free space by block number tree. */
	ra->array_cur = XFARRAY_CURSOR_INIT;
	error = xfs_btree_bload(bno_cur, &ra->new_bnobt.bload, ra);
	if (error)
		goto err_levels;

	/*
	 * Install the new btrees in the AG header.  After this point the old
	 * btrees are no longer accessible and the new trees are live.
	 */
	xfs_allocbt_commit_staged_btree(bno_cur, sc->tp, sc->sa.agf_bp);
	xfs_btree_del_cursor(bno_cur, 0);
	xfs_allocbt_commit_staged_btree(cnt_cur, sc->tp, sc->sa.agf_bp);
	xfs_btree_del_cursor(cnt_cur, 0);

	/* Reset the AGF counters now that we've changed the btree shape. */
	error = xrep_abt_reset_counters(ra);
	if (error)
		goto err_newbt;

	/* Dispose of any unused blocks and the accounting information. */
	xrep_abt_dispose_reservations(ra, error);

	return xrep_roll_ag_trans(sc);

err_levels:
	pag->pagf_repair_bno_level = 0;
	pag->pagf_repair_cnt_level = 0;
err_cur:
	xfs_btree_del_cursor(cnt_cur, error);
	xfs_btree_del_cursor(bno_cur, error);
err_newbt:
	xrep_abt_dispose_reservations(ra, error);
	return error;
}
/*
 * Now that we've logged the roots of the new btrees, invalidate all of the
 * old blocks and free them.
 */
STATIC int
xrep_abt_remove_old_trees(
	struct xrep_abt		*ra)
{
	struct xfs_perag	*pag = ra->sc->sa.pag;
	int			error;

	/* Free the old btree blocks if they're not in use. */
	error = xrep_reap_agblocks(ra->sc, &ra->old_allocbt_blocks,
			&XFS_RMAP_OINFO_AG, XFS_AG_RESV_IGNORE);
	if (error)
		return error;

	/*
	 * Now that we've zapped all the old allocbt blocks we can turn off
	 * the alternate height mechanism.
	 */
	pag->pagf_repair_bno_level = 0;
	pag->pagf_repair_cnt_level = 0;
	return 0;
}
/* Repair the freespace btrees for some AG. */
int
xrep_allocbt(
	struct xfs_scrub	*sc)
{
	struct xrep_abt		*ra;
	struct xfs_mount	*mp = sc->mp;
	unsigned int		busy_gen;
	char			*descr;
	int			error;

	/* We require the rmapbt to rebuild anything. */
	if (!xfs_has_rmapbt(mp))
		return -EOPNOTSUPP;

	ra = kzalloc(sizeof(struct xrep_abt), XCHK_GFP_FLAGS);
	if (!ra)
		return -ENOMEM;
	ra->sc = sc;

	/* We rebuild both data structures. */
	sc->sick_mask = XFS_SICK_AG_BNOBT | XFS_SICK_AG_CNTBT;

	/*
	 * Make sure the busy extent list is clear because we can't put
	 * extents on there twice.  In theory we cleared this before we
	 * started, but let's not risk the filesystem.
	 */
	if (!xfs_extent_busy_list_empty(pag_group(sc->sa.pag), &busy_gen)) {
		error = -EDEADLOCK;
		goto out_ra;
	}

	/* Set up enough storage to handle maximally fragmented free space. */
	descr = xchk_xfile_ag_descr(sc, "free space records");
	error = xfarray_create(descr, mp->m_sb.sb_agblocks / 2,
			sizeof(struct xfs_alloc_rec_incore),
			&ra->free_records);
	kfree(descr);
	if (error)
		goto out_ra;

	/* Collect the free space data and find the old btree blocks. */
	xagb_bitmap_init(&ra->old_allocbt_blocks);
	error = xrep_abt_find_freespace(ra);
	if (error)
		goto out_bitmap;

	/* Rebuild the free space information. */
	error = xrep_abt_build_new_trees(ra);
	if (error)
		goto out_bitmap;

	/* Kill the old trees. */
	error = xrep_abt_remove_old_trees(ra);
	if (error)
		goto out_bitmap;

out_bitmap:
	xagb_bitmap_destroy(&ra->old_allocbt_blocks);
	xfarray_destroy(ra->free_records);
out_ra:
	kfree(ra);
	return error;
}
/* Make sure both btrees are ok after we've rebuilt them. */
int
xrep_revalidate_allocbt(
	struct xfs_scrub	*sc)
{
	__u32			old_type = sc->sm->sm_type;
	int			error;

	/*
	 * We must update sm_type temporarily so that the tree-to-tree cross
	 * reference checks will work in the correct direction, and also so
	 * that tracing will report correctly if there are more errors.
	 */
	sc->sm->sm_type = XFS_SCRUB_TYPE_BNOBT;
	error = xchk_allocbt(sc);
	if (error)
		goto out;

	sc->sm->sm_type = XFS_SCRUB_TYPE_CNTBT;
	error = xchk_allocbt(sc);
out:
	sc->sm->sm_type = old_type;
	return error;
}