/*
 * Rebuilding the Reference Count Btree
 * ====================================
 *
 * This algorithm is "borrowed" from xfs_repair.  Imagine the rmap
 * entries as rectangles representing extents of physical blocks, and
 * that the rectangles can be laid down to allow them to overlap each
 * other; then we know that we must emit a refcnt btree entry wherever
 * the amount of overlap changes, i.e. the emission stimulus is
 * level-triggered:
 *
 *                 -    ---
 *       --      ----- ----   ---        ------
 * --   ----     ----------- ----     ---------
 * -------------------------------- -----------
 * ^ ^  ^^ ^^    ^ ^^ ^^^  ^^^^  ^  ^^ ^  ^  ^
 * 2 1  23 21    3 43 234  2123  1  01 2  3  0
 *
 * For our purposes, a rmap is a tuple (startblock, len, fileoff, owner).
 *
 * Note that in the actual refcnt btree we don't store the refcount < 2
 * cases because the bnobt tells us which blocks are free; single-use
 * blocks aren't recorded in the bnobt or the refcntbt.  If the rmapbt
 * supports storing multiple entries covering a given block we could
 * theoretically dispense with the refcntbt and simply count rmaps, but
 * that's inefficient in the (hot) write path, so we'll take the cost of
 * the extra tree to save time.  Also there's no guarantee that rmap
 * will be enabled.
 *
 * Given an array of rmaps sorted by physical block number, a starting
 * physical block (sp), a bag to hold rmaps that cover sp, and the next
 * physical block where the level changes (np), we can reconstruct the
 * refcount btree as follows:
 *
 * While there are still unprocessed rmaps in the array,
 *  - Set sp to the physical block (pblk) of the next unprocessed rmap.
 *  - Add to the bag all rmaps in the array where startblock == sp.
 *  - Set np to the physical block where the bag size will change.  This
 *    is the minimum of (the pblk of the next unprocessed rmap) and
 *    (startblock + len of each rmap in the bag).
 *  - Record the bag size as old_bag_size.
 *
 *  - While the bag isn't empty,
 *     - Remove from the bag all rmaps where startblock + len == np.
 *     - Add to the bag all rmaps in the array where startblock == np.
 *     - If the bag size isn't old_bag_size, store the refcount entry
 *       (sp, np - sp, bag_size) in the refcnt btree.
 *     - If the bag is empty, break out of the inner loop.
 *     - Set old_bag_size to the bag size.
 *     - Set sp = np.
 *     - Set np to the physical block where the bag size will change.
 *       This is the minimum of (the pblk of the next unprocessed rmap)
 *       and (startblock + len of each rmap in the bag).
 *
 * Like all the other repairers, we make a list of all the refcount
 * records we need, then reinitialize the refcount btree root and
 * insert all the records.
 */
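/*
 * To make the level-triggered sweep concrete, here is a small userspace
 * sketch (not part of this file) of the bag algorithm described above,
 * run over a plain sorted array.  The struct rmap, the fixed-size bag,
 * and the sample data are simplifications invented for illustration;
 * the real code below walks the rmapbt with a btree cursor and tracks
 * the overlap set in an rcbag.  Compiled standalone, the sample data
 * prints the records (1, 2, 3), (3, 1, 2), and (9, 2, 2).
 */
#include <stdint.h>
#include <stdio.h>

struct rmap { uint32_t startblock; uint32_t len; };

/* Sample rmaps, sorted by startblock. */
static const struct rmap rmaps[] = {
	{ 0, 4 }, { 1, 5 }, { 1, 2 }, { 8, 3 }, { 9, 2 },
};
#define NR_RMAPS	(sizeof(rmaps) / sizeof(rmaps[0]))

static struct rmap bag[NR_RMAPS];	/* rmaps covering the current block */
static unsigned int bag_size;

/* Drop every bag member whose extent ends at block @np. */
static void bag_remove_ending_at(uint32_t np)
{
	for (unsigned int i = 0; i < bag_size; ) {
		if (bag[i].startblock + bag[i].len == np)
			bag[i] = bag[--bag_size];
		else
			i++;
	}
}

/*
 * Next block where the overlap level changes: the minimum of the next
 * unprocessed rmap's startblock and the end of each extent in the bag.
 */
static uint32_t next_edge(unsigned int next)
{
	uint32_t np = UINT32_MAX;

	if (next < NR_RMAPS && rmaps[next].startblock < np)
		np = rmaps[next].startblock;
	for (unsigned int i = 0; i < bag_size; i++)
		if (bag[i].startblock + bag[i].len < np)
			np = bag[i].startblock + bag[i].len;
	return np;
}

int main(void)
{
	unsigned int next = 0;	/* index of next unprocessed rmap */

	while (next < NR_RMAPS) {
		uint32_t sp = rmaps[next].startblock;
		uint32_t np;
		unsigned int old_bag_size;

		/* Bag all rmaps starting at sp, then find the next edge. */
		while (next < NR_RMAPS && rmaps[next].startblock == sp)
			bag[bag_size++] = rmaps[next++];
		np = next_edge(next);
		old_bag_size = bag_size;

		while (bag_size > 0) {
			bag_remove_ending_at(np);
			while (next < NR_RMAPS && rmaps[next].startblock == np)
				bag[bag_size++] = rmaps[next++];

			if (bag_size != old_bag_size) {
				/* refcount < 2 is never stored on disk */
				if (old_bag_size > 1)
					printf("(%u, %u, %u)\n", sp,
							np - sp, old_bag_size);
				sp = np;
			}
			if (bag_size == 0)
				break;
			old_bag_size = bag_size;
			np = next_edge(next);
		}
	}
	return 0;
}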
/* Check for any obvious conflicts with this shared/CoW staging extent. */
STATIC int
xrep_refc_check_ext(
	struct xfs_scrub		*sc,
	const struct xfs_refcount_irec	*rec)
{
	enum xbtree_recpacking		outcome;
	int				error;

	if (xfs_refcount_check_irec(sc->sa.pag, rec) != NULL)
		return -EFSCORRUPTED;

	/* Make sure this isn't free space. */
	error = xfs_alloc_has_records(sc->sa.bno_cur, rec->rc_startblock,
			rec->rc_blockcount, &outcome);
	if (error)
		return error;
	if (outcome != XBTREE_RECPACKING_EMPTY)
		return -EFSCORRUPTED;

	/* Must not be an inode chunk. */
	error = xfs_ialloc_has_inodes_at_extent(sc->sa.ino_cur,
			rec->rc_startblock, rec->rc_blockcount, &outcome);
	if (error)
		return error;
	if (outcome != XBTREE_RECPACKING_EMPTY)
		return -EFSCORRUPTED;

	return 0;
}
/* Decide if an rmap could describe a shared extent. */
static inline bool
xrep_refc_rmap_shareable(
	struct xfs_mount		*mp,
	const struct xfs_rmap_irec	*rmap)
{
	/* AG metadata are never shareable */
	if (XFS_RMAP_NON_INODE_OWNER(rmap->rm_owner))
		return false;

	/* Metadata in files are never shareable */
	if (xfs_is_sb_inum(mp, rmap->rm_owner))
		return false;

	/* Metadata and unwritten file blocks are not shareable. */
	if (rmap->rm_flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK |
			      XFS_RMAP_UNWRITTEN))
		return false;

	return true;
}
/*
 * Walk along the reverse mapping records until we find one that could describe
 * a shared extent.
 */
STATIC int
xrep_refc_walk_rmaps(
	struct xrep_refc	*rr,
	struct xfs_rmap_irec	*rmap,
	bool			*have_rec)
{
	struct xfs_btree_cur	*cur = rr->sc->sa.rmap_cur;
	struct xfs_mount	*mp = cur->bc_mp;
	int			have_gt;
	int			error = 0;

	*have_rec = false;

	/*
	 * Loop through the remaining rmaps.  Remember CoW staging
	 * extents and the refcountbt blocks from the old tree for later
	 * disposal.  We can only share written data fork extents, so
	 * keep looping until we find an rmap for one.
	 */
	do {
		if (xchk_should_terminate(rr->sc, &error))
			return error;

		error = xfs_btree_increment(cur, 0, &have_gt);
		if (error)
			return error;
		if (!have_gt)
			return 0;

		error = xfs_rmap_get_rec(cur, rmap, &have_gt);
		if (error)
			return error;
		if (XFS_IS_CORRUPT(mp, !have_gt)) {
			xfs_btree_mark_sick(cur);
			return -EFSCORRUPTED;
		}

		if (rmap->rm_owner == XFS_RMAP_OWN_COW) {
			error = xrep_refc_stash_cow(rr, rmap->rm_startblock,
					rmap->rm_blockcount);
			if (error)
				return error;
		} else if (rmap->rm_owner == XFS_RMAP_OWN_REFC) {
			/* refcountbt block, dump it when we're done. */
			rr->btblocks += rmap->rm_blockcount;
			error = xagb_bitmap_set(&rr->old_refcountbt_blocks,
					rmap->rm_startblock,
					rmap->rm_blockcount);
			if (error)
				return error;
		}
	} while (!xrep_refc_rmap_shareable(mp, rmap));

	*have_rec = true;
	return 0;
}
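/*
 * xrep_refc_encode_startblock() is not shown in this excerpt.  The
 * following is a sketch of what it must do, given how the comparator
 * below and the domain check in xrep_refc_sort_records() use it: fold
 * the record's domain into the high bit of the startblock so that CoW
 * staging records sort after all shared records, matching the ondisk
 * key encoding.  The exact upstream implementation may differ.
 */
static inline uint32_t
xrep_refc_encode_startblock(
	const struct xfs_refcount_irec	*irec)
{
	uint32_t			start;

	/* CoW staging extents sort above all shared extents. */
	start = irec->rc_startblock & ~XFS_REFC_COWFLAG;
	if (irec->rc_domain == XFS_REFC_DOMAIN_COW)
		start |= XFS_REFC_COWFLAG;

	return start;
}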
/* Sort in the same order as the ondisk records. */
static int
xrep_refc_extent_cmp(
	const void			*a,
	const void			*b)
{
	const struct xfs_refcount_irec	*ap = a;
	const struct xfs_refcount_irec	*bp = b;
	uint32_t			sa, sb;

	sa = xrep_refc_encode_startblock(ap);
	sb = xrep_refc_encode_startblock(bp);

	if (sa > sb)
		return 1;
	if (sa < sb)
		return -1;
	return 0;
}
/*
 * Sort the refcount extents by startblock or else the btree records will be in
 * the wrong order.  Make sure the records do not overlap in physical space.
 */
STATIC int
xrep_refc_sort_records(
	struct xrep_refc		*rr)
{
	struct xfs_refcount_irec	irec;
	xfarray_idx_t			cur;
	enum xfs_refc_domain		dom = XFS_REFC_DOMAIN_SHARED;
	xfs_agblock_t			next_agbno = 0;
	int				error;

	error = xfarray_sort(rr->refcount_records, xrep_refc_extent_cmp,
			XFARRAY_SORT_KILLABLE);
	if (error)
		return error;

	foreach_xfarray_idx(rr->refcount_records, cur) {
		if (xchk_should_terminate(rr->sc, &error))
			return error;

		error = xfarray_load(rr->refcount_records, cur, &irec);
		if (error)
			return error;

		if (dom == XFS_REFC_DOMAIN_SHARED &&
		    irec.rc_domain == XFS_REFC_DOMAIN_COW) {
			dom = irec.rc_domain;
			next_agbno = 0;
		}

		if (dom != irec.rc_domain)
			return -EFSCORRUPTED;
		if (irec.rc_startblock < next_agbno)
			return -EFSCORRUPTED;

		next_agbno = irec.rc_startblock + irec.rc_blockcount;
	}

	return error;
}
/*
 * Walk forward through the rmap btree to collect all rmaps starting at
 * @bno in @rmap_bag.  These represent the file(s) that share ownership of
 * the current block.  Upon return, the rmap cursor points to the last record
 * satisfying the startblock constraint.
 */
static int
xrep_refc_push_rmaps_at(
	struct xrep_refc	*rr,
	struct rcbag		*rcstack,
	xfs_agblock_t		bno,
	struct xfs_rmap_irec	*rmap,
	bool			*have)
{
	struct xfs_scrub	*sc = rr->sc;
	int			have_gt;
	int			error;

	while (*have && rmap->rm_startblock == bno) {
		error = rcbag_add(rcstack, rr->sc->tp, rmap);
		if (error)
			return error;

		error = xrep_refc_walk_rmaps(rr, rmap, have);
		if (error)
			return error;
	}

	error = xfs_btree_decrement(sc->sa.rmap_cur, 0, &have_gt);
	if (error)
		return error;
	if (XFS_IS_CORRUPT(sc->mp, !have_gt)) {
		xfs_btree_mark_sick(sc->sa.rmap_cur);
		return -EFSCORRUPTED;
	}

	return 0;
}
/* Iterate all the rmap records to generate reference count data. */
STATIC int
xrep_refc_find_refcounts(
	struct xrep_refc	*rr)
{
	struct xfs_scrub	*sc = rr->sc;
	struct rcbag		*rcstack;
	uint64_t		old_stack_height;
	xfs_agblock_t		sbno;
	xfs_agblock_t		cbno;
	xfs_agblock_t		nbno;
	bool			have;
	int			error;

	xrep_ag_btcur_init(sc, &sc->sa);

	/*
	 * Set up a bag to store all the rmap records that we're tracking to
	 * generate a reference count record.  If the size of the bag exceeds
	 * XFS_REFC_REFCOUNT_MAX, we clamp rc_refcount.
	 */
	error = rcbag_init(sc->mp, sc->xmbtp, &rcstack);
	if (error)
		goto out_cur;

	/* Start the rmapbt cursor to the left of all records. */
	error = xfs_btree_goto_left_edge(sc->sa.rmap_cur);
	if (error)
		goto out_bag;

	/* Process reverse mappings into refcount data. */
	while (xfs_btree_has_more_records(sc->sa.rmap_cur)) {
		struct xfs_rmap_irec	rmap;

		/* Push all rmaps with pblk == sbno onto the stack */
		error = xrep_refc_walk_rmaps(rr, &rmap, &have);
		if (error)
			goto out_bag;
		if (!have)
			break;
		sbno = cbno = rmap.rm_startblock;
		error = xrep_refc_push_rmaps_at(rr, rcstack, sbno, &rmap,
				&have);
		if (error)
			goto out_bag;

		/* Set nbno to the bno of the next refcount change */
		error = rcbag_next_edge(rcstack, sc->tp, &rmap, have, &nbno);
		if (error)
			goto out_bag;

		ASSERT(nbno > sbno);
		old_stack_height = rcbag_count(rcstack);

		/* While stack isn't empty... */
		while (rcbag_count(rcstack) > 0) {
			/* Pop all rmaps that end at nbno */
			error = rcbag_remove_ending_at(rcstack, sc->tp, nbno);
			if (error)
				goto out_bag;

			/* Push array items that start at nbno */
			error = xrep_refc_walk_rmaps(rr, &rmap, &have);
			if (error)
				goto out_bag;
			if (have) {
				error = xrep_refc_push_rmaps_at(rr, rcstack,
						nbno, &rmap, &have);
				if (error)
					goto out_bag;
			}

			/* Emit refcount if necessary */
			ASSERT(nbno > cbno);
			if (rcbag_count(rcstack) != old_stack_height) {
				if (old_stack_height > 1) {
					error = xrep_refc_stash(rr,
							XFS_REFC_DOMAIN_SHARED,
							cbno, nbno - cbno,
							old_stack_height);
					if (error)
						goto out_bag;
				}
				cbno = nbno;
			}

			/* Stack empty, go find the next rmap */
			if (rcbag_count(rcstack) == 0)
				break;
			old_stack_height = rcbag_count(rcstack);
			sbno = nbno;

			/* Set nbno to the bno of the next refcount change */
			error = rcbag_next_edge(rcstack, sc->tp, &rmap, have,
					&nbno);
			if (error)
				goto out_bag;

			ASSERT(nbno > sbno);
		}
	}

	ASSERT(rcbag_count(rcstack) == 0);
out_bag:
	rcbag_free(&rcstack);
out_cur:
	xchk_ag_btcur_free(&sc->sa);
	return error;
}
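/*
 * The bag-size clamp mentioned above happens in the stash path, which
 * is not visible in this excerpt.  As an illustration only: rc_refcount
 * is a 32-bit ondisk field, so a stash helper has to saturate the
 * 64-bit bag count, roughly like this hypothetical snippet (the helper
 * name is invented here):
 */
static inline uint32_t
xrep_refc_clamp_refcount(
	uint64_t		stack_height)
{
	/* Saturate rather than overflow the 32-bit ondisk refcount. */
	return min_t(uint64_t, stack_height, XFS_REFC_REFCOUNT_MAX);
}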
/* Feed one of the new btree blocks to the bulk loader. */
STATIC int
xrep_refc_claim_block(
	struct xfs_btree_cur	*cur,
	union xfs_btree_ptr	*ptr,
	void			*priv)
{
	struct xrep_refc	*rr = priv;

	return xrep_newbt_claim_block(cur, &rr->new_btree, ptr);
}

/* Update the AGF counters. */
STATIC int
xrep_refc_reset_counters(
	struct xrep_refc	*rr)
{
	struct xfs_scrub	*sc = rr->sc;
	struct xfs_perag	*pag = sc->sa.pag;

	/*
	 * After we commit the new btree to disk, it is possible that the
	 * process to reap the old btree blocks will race with the AIL trying
	 * to checkpoint the old btree blocks into the filesystem.  If the new
	 * tree is shorter than the old one, the refcountbt write verifier will
	 * fail and the AIL will shut down the filesystem.
	 *
	 * To avoid this, save the old incore btree height values as the alt
	 * height values before re-initializing the perag info from the updated
	 * AGF to capture all the new values.
	 */
	pag->pagf_repair_refcount_level = pag->pagf_refcount_level;

	/* Reinitialize with the values we just logged. */
	return xrep_reinit_pagf(sc);
}
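/*
 * For illustration, this is roughly how a block verifier can honor the
 * alternate height while blocks from both the old and the new tree are
 * in flight.  The function below is a sketch written for this comment,
 * not the actual refcountbt verifier; only the pagf_refcount_level and
 * pagf_repair_refcount_level fields are taken from the code above.
 */
static inline bool
xrep_refc_level_is_plausible(
	struct xfs_perag	*pag,
	unsigned int		level)
{
	unsigned int		maxlevel = pag->pagf_refcount_level;

	/*
	 * While a repair is running, accept the taller of the two tree
	 * heights so that writeback of either tree passes verification.
	 */
	if (pag->pagf_repair_refcount_level > maxlevel)
		maxlevel = pag->pagf_repair_refcount_level;

	return level < maxlevel;
}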
/*
 * Use the collected refcount information to stage a new refcount btree.  If
 * this is successful we'll return with the new btree root information logged
 * to the repair transaction but not yet committed.
 */
STATIC int
xrep_refc_build_new_tree(
	struct xrep_refc	*rr)
{
	struct xfs_scrub	*sc = rr->sc;
	struct xfs_btree_cur	*refc_cur;
	struct xfs_perag	*pag = sc->sa.pag;
	int			error;

	error = xrep_refc_sort_records(rr);
	if (error)
		return error;

	/*
	 * Prepare to construct the new btree by reserving disk space for the
	 * new btree and setting up all the accounting information we'll need
	 * to root the new btree while it's under construction and before we
	 * attach it to the AG header.
	 */
	xrep_newbt_init_ag(&rr->new_btree, sc, &XFS_RMAP_OINFO_REFC,
			xfs_agbno_to_fsb(pag, xfs_refc_block(sc->mp)),
			XFS_AG_RESV_METADATA);
	rr->new_btree.bload.get_records = xrep_refc_get_records;
	rr->new_btree.bload.claim_block = xrep_refc_claim_block;

	/* Compute how many blocks we'll need. */
	refc_cur = xfs_refcountbt_init_cursor(sc->mp, NULL, NULL, pag);
	xfs_btree_stage_afakeroot(refc_cur, &rr->new_btree.afake);
	error = xfs_btree_bload_compute_geometry(refc_cur,
			&rr->new_btree.bload,
			xfarray_length(rr->refcount_records));
	if (error)
		goto err_cur;

	/* Last chance to abort before we start committing fixes. */
	if (xchk_should_terminate(sc, &error))
		goto err_cur;

	/* Reserve the space we'll need for the new btree. */
	error = xrep_newbt_alloc_blocks(&rr->new_btree,
			rr->new_btree.bload.nr_blocks);
	if (error)
		goto err_cur;

	/*
	 * Due to btree slack factors, it's possible for a new btree to be one
	 * level taller than the old btree.  Update the incore btree height so
	 * that we don't trip the verifiers when writing the new btree blocks
	 * to disk.
	 */
	pag->pagf_repair_refcount_level = rr->new_btree.bload.btree_height;

	/* Add all observed refcount records. */
	rr->array_cur = XFARRAY_CURSOR_INIT;
	error = xfs_btree_bload(refc_cur, &rr->new_btree.bload, rr);
	if (error)
		goto err_level;

	/*
	 * Install the new btree in the AG header.  After this point the old
	 * btree is no longer accessible and the new tree is live.
	 */
	xfs_refcountbt_commit_staged_btree(refc_cur, sc->tp, sc->sa.agf_bp);
	xfs_btree_del_cursor(refc_cur, 0);

	/* Reset the AGF counters now that we've changed the btree shape. */
	error = xrep_refc_reset_counters(rr);
	if (error)
		goto err_newbt;

	/* Dispose of any unused blocks and the accounting information. */
	error = xrep_newbt_commit(&rr->new_btree);
	if (error)
		return error;

	return xrep_roll_ag_trans(sc);

err_level:
	pag->pagf_repair_refcount_level = 0;
err_cur:
	xfs_btree_del_cursor(refc_cur, error);
err_newbt:
	xrep_newbt_cancel(&rr->new_btree);
	return error;
}
/*
 * Now that we've logged the roots of the new btrees, invalidate all of the
 * old blocks and free them.
 */
STATIC int
xrep_refc_remove_old_tree(
	struct xrep_refc	*rr)
{
	struct xfs_scrub	*sc = rr->sc;
	struct xfs_perag	*pag = sc->sa.pag;
	int			error;

	/* Free the old refcountbt blocks if they're not in use. */
	error = xrep_reap_agblocks(sc, &rr->old_refcountbt_blocks,
			&XFS_RMAP_OINFO_REFC, XFS_AG_RESV_METADATA);
	if (error)
		return error;

	/*
	 * Now that we've zapped all the old refcountbt blocks we can turn off
	 * the alternate height mechanism and reset the per-AG space
	 * reservations.
	 */
	pag->pagf_repair_refcount_level = 0;
	sc->flags |= XREP_RESET_PERAG_RESV;
	return 0;
}
/* Rebuild the refcount btree. */
int
xrep_refcountbt(
	struct xfs_scrub	*sc)
{
	struct xrep_refc	*rr;
	struct xfs_mount	*mp = sc->mp;
	char			*descr;
	int			error;

	/* We require the rmapbt to rebuild anything. */
	if (!xfs_has_rmapbt(mp))
		return -EOPNOTSUPP;

	rr = kzalloc(sizeof(struct xrep_refc), XCHK_GFP_FLAGS);
	if (!rr)
		return -ENOMEM;
	rr->sc = sc;

	/* Set up enough storage to handle one refcount record per block. */
	descr = xchk_xfile_ag_descr(sc, "reference count records");
	error = xfarray_create(descr, mp->m_sb.sb_agblocks,
			sizeof(struct xfs_refcount_irec),
			&rr->refcount_records);
	kfree(descr);
	if (error)
		goto out_rr;

	/* Collect all reference counts. */
	xagb_bitmap_init(&rr->old_refcountbt_blocks);
	error = xrep_refc_find_refcounts(rr);
	if (error)
		goto out_bitmap;

	/* Rebuild the refcount information. */
	error = xrep_refc_build_new_tree(rr);
	if (error)
		goto out_bitmap;

	/* Kill the old tree. */
	error = xrep_refc_remove_old_tree(rr);
	if (error)
		goto out_bitmap;

out_bitmap:
	xagb_bitmap_destroy(&rr->old_refcountbt_blocks);
	xfarray_destroy(rr->refcount_records);
out_rr:
	kfree(rr);
	return error;
}