/* * Inode Btree Repair * ================== * * A quick refresher of inode btrees on a v5 filesystem: * * - Inode records are read into memory in units of 'inode clusters'. However * many inodes fit in a cluster buffer is the smallest number of inodes that * can be allocated or freed. Clusters are never smaller than one fs block * though they can span multiple blocks. The size (in fs blocks) is * computed with xfs_icluster_size_fsb(). The fs block alignment of a * cluster is computed with xfs_ialloc_cluster_alignment(). * * - Each inode btree record can describe a single 'inode chunk'. The chunk * size is defined to be 64 inodes. If sparse inodes are enabled, every * inobt record must be aligned to the chunk size; if not, every record must * be aligned to the start of a cluster. It is possible to construct an XFS * geometry where one inobt record maps to multiple inode clusters; it is * also possible to construct a geometry where multiple inobt records map to * different parts of one inode cluster. * * - If sparse inodes are not enabled, the smallest unit of allocation for * inode records is enough to contain one inode chunk's worth of inodes. * * - If sparse inodes are enabled, the holemask field will be active. Each * bit of the holemask represents 4 potential inodes; if set, the * corresponding space does *not* contain inodes and must be left alone. * Clusters cannot be smaller than 4 inodes. The smallest unit of allocation * of inode records is one inode cluster. * * So what's the rebuild algorithm? * * Iterate the reverse mapping records looking for OWN_INODES and OWN_INOBT * records. The OWN_INOBT records are the old inode btree blocks and will be * cleared out after we've rebuilt the tree. Each possible inode cluster * within an OWN_INODES record will be read in; for each possible inobt record * associated with that cluster, compute the freemask calculated from the * i_mode data in the inode chunk. 
For sparse inodes the holemask will be * calculated by creating the properly aligned inobt record and punching out * any chunk that's missing. Inode allocations and frees grab the AGI first, * so repair protects itself from concurrent access by locking the AGI. * * Once we've reconstructed all the inode records, we can create new inode * btree roots and reload the btrees. We rebuild both inode trees at the same * time because they have the same rmap owner and it would be more complex to * figure out if the other tree isn't in need of a rebuild and which OWN_INOBT * blocks it owns. We have all the data we need to build both, so dump * everything and start over. * * We use the prefix 'xrep_ibt' because we rebuild both inode btrees at once.
*/
/* State carried through one rebuild of the inode btrees. */
struct xrep_ibt {
	/* Record under construction. */
	struct xfs_inobt_rec_incore	rie;

	/* new inobt information */
	struct xrep_newbt	new_inobt;

	/* new finobt information */
	struct xrep_newbt	new_finobt;

	/* Old inode btree blocks we found in the rmap. */
	struct xagb_bitmap	old_iallocbt_blocks;

	/* Reconstructed inode records. */
	struct xfarray		*inode_records;

	/* Scrub context this repair runs under. */
	struct xfs_scrub	*sc;

	/* Number of inodes assigned disk space. */
	unsigned int		icount;

	/* Number of inodes in use. */
	unsigned int		iused;

	/* Number of finobt records needed. */
	unsigned int		finobt_recs;

	/* get_records()'s position in the inode record array. */
	xfarray_idx_t		array_cur;
};
/* * Is this inode in use? If the inode is in memory we can tell from i_mode, * otherwise we have to check di_mode in the on-disk buffer. We only care * that the high (i.e. non-permission) bits of _mode are zero. This should be * safe because repair keeps all AG headers locked until the end, and process * trying to perform an inode allocation/free must lock the AGI. * * @cluster_ag_base is the inode offset of the cluster within the AG. * @cluster_bp is the cluster buffer. * @cluster_index is the inode offset within the inode cluster.
*/ STATICint
xrep_ibt_check_ifree( struct xrep_ibt *ri,
xfs_agino_t cluster_ag_base, struct xfs_buf *cluster_bp, unsignedint cluster_index, bool *inuse)
{ struct xfs_scrub *sc = ri->sc; struct xfs_mount *mp = sc->mp; struct xfs_dinode *dip;
xfs_agino_t agino; unsignedint cluster_buf_base; unsignedint offset; int error;
agino = cluster_ag_base + cluster_index;
/* Inode uncached or half assembled, read disk buffer */
cluster_buf_base = XFS_INO_TO_OFFSET(mp, cluster_ag_base);
offset = (cluster_buf_base + cluster_index) * mp->m_sb.sb_inodesize; if (offset >= BBTOB(cluster_bp->b_length)) return -EFSCORRUPTED;
dip = xfs_buf_offset(cluster_bp, offset); if (be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC) return -EFSCORRUPTED;
/* Will the in-core inode tell us if it's in use? */
error = xchk_inode_is_allocated(sc, agino, inuse); if (!error) return 0;
*inuse = dip->di_mode != 0; return 0;
}
/* Stash the accumulated inobt record for rebuilding. */ STATICint
xrep_ibt_stash( struct xrep_ibt *ri)
{ int error = 0;
if (xchk_should_terminate(ri->sc, &error)) return error;
ri->rie.ir_freecount = xfs_inobt_rec_freecount(&ri->rie); if (xfs_inobt_check_irec(ri->sc->sa.pag, &ri->rie) != NULL) return -EFSCORRUPTED;
if (ri->rie.ir_freecount > 0)
ri->finobt_recs++;
trace_xrep_ibt_found(ri->sc->sa.pag, &ri->rie);
error = xfarray_append(ri->inode_records, &ri->rie); if (error) return error;
ri->rie.ir_startino = NULLAGINO; return 0;
}
/*
 * Given an extent of inodes and an inode cluster buffer, calculate the
 * location of the corresponding inobt record (creating it if necessary),
 * then update the parts of the holemask and freemask of that record that
 * correspond to the inode extent we were given.
 *
 * @cluster_ir_startino is the AG inode number of an inobt record that we're
 * proposing to create for this inode cluster.  If sparse inodes are enabled,
 * we must round down to a chunk boundary to find the actual sparse record.
 * @cluster_bp is the buffer of the inode cluster.
 * @nr_inodes is the number of inodes to check from the cluster.
 *
 * NOTE(review): only the opening of this function is present in this copy of
 * the file; the body stops after the stash check below and is never closed.
 * In the visible text ir_startino is compared before anything assigns it, so
 * the statement that computes it appears to have been lost as well.
 * NOTE(review): `STATICint` / `unsignedint` look like `STATIC int` /
 * `unsigned int` with the separating whitespace lost.
 */ STATICint
xrep_ibt_cluster_record( struct xrep_ibt *ri,
xfs_agino_t cluster_ir_startino, struct xfs_buf *cluster_bp, unsignedint nr_inodes)
{ struct xfs_scrub *sc = ri->sc; struct xfs_mount *mp = sc->mp;
xfs_agino_t ir_startino; unsignedint cluster_base; unsignedint cluster_index; int error = 0;
/*
 * If the accumulated inobt record doesn't map this cluster, add it to
 * the list and reset it.
 */ if (ri->rie.ir_startino != NULLAGINO &&
ri->rie.ir_startino + XFS_INODES_PER_CHUNK <= ir_startino) {
error = xrep_ibt_stash(ri); if (error) return error;
}
/*
 * For each inode cluster covering the physical extent recorded by the rmapbt,
 * we must calculate the properly aligned startino of that cluster, then
 * iterate each cluster to fill in used and filled masks appropriately.  We
 * then use the (startino, used, filled) information to construct the
 * appropriate inode records.
 *
 * @cluster_bno is the AG block number at which the inode cluster starts.
 *
 * NOTE(review): this copy of the function is cut off inside the for loop;
 * neither the loop nor the function body is closed.  nr_inodes is passed to
 * xrep_ibt_cluster_record() but is never assigned in the visible text, and
 * nothing visible releases cluster_bp.
 * NOTE(review): `STATICint` / `unsignedint` look like `STATIC int` /
 * `unsigned int` with the separating whitespace lost.
 */ STATICint
xrep_ibt_process_cluster( struct xrep_ibt *ri,
xfs_agblock_t cluster_bno)
{ struct xfs_imap imap; struct xfs_buf *cluster_bp; struct xfs_scrub *sc = ri->sc; struct xfs_mount *mp = sc->mp; struct xfs_ino_geometry *igeo = M_IGEO(mp);
xfs_agino_t cluster_ag_base;
xfs_agino_t irec_index; unsignedint nr_inodes; int error;
/*
 * Grab the inode cluster buffer.  This is safe to do with a broken
 * inobt because imap_to_bp directly maps the buffer without touching
 * either inode btree.
 */
imap.im_blkno = xfs_agbno_to_daddr(sc->sa.pag, cluster_bno);
imap.im_len = XFS_FSB_TO_BB(mp, igeo->blocks_per_cluster);
imap.im_boffset = 0;
error = xfs_imap_to_bp(mp, sc->tp, &imap, &cluster_bp); if (error) return error;
/*
 * Record the contents of each possible inobt record mapping this
 * cluster.  The loop advances one inode chunk at a time.
 */
cluster_ag_base = XFS_AGB_TO_AGINO(mp, cluster_bno); for (irec_index = 0;
irec_index < igeo->inodes_per_cluster;
irec_index += XFS_INODES_PER_CHUNK) {
error = xrep_ibt_cluster_record(ri,
cluster_ag_base + irec_index, cluster_bp,
nr_inodes); if (error) break;
/* Check for any obvious conflicts in the inode chunk extent. */ STATICint
xrep_ibt_check_inode_ext( struct xfs_scrub *sc,
xfs_agblock_t agbno,
xfs_extlen_t len)
{ struct xfs_mount *mp = sc->mp; struct xfs_ino_geometry *igeo = M_IGEO(mp);
xfs_agino_t agino; enum xbtree_recpacking outcome; int error;
/* Inode records must be within the AG. */ if (!xfs_verify_agbext(sc->sa.pag, agbno, len)) return -EFSCORRUPTED;
/* The entire record must align to the inode cluster size. */ if (!IS_ALIGNED(agbno, igeo->blocks_per_cluster) ||
!IS_ALIGNED(agbno + len, igeo->blocks_per_cluster)) return -EFSCORRUPTED;
/* * The entire record must also adhere to the inode cluster alignment * size if sparse inodes are not enabled.
*/ if (!xfs_has_sparseinodes(mp) &&
(!IS_ALIGNED(agbno, igeo->cluster_align) ||
!IS_ALIGNED(agbno + len, igeo->cluster_align))) return -EFSCORRUPTED;
/* * On a sparse inode fs, this cluster could be part of a sparse chunk. * Sparse clusters must be aligned to sparse chunk alignment.
*/ if (xfs_has_sparseinodes(mp) && mp->m_sb.sb_spino_align &&
(!IS_ALIGNED(agbno, mp->m_sb.sb_spino_align) ||
!IS_ALIGNED(agbno + len, mp->m_sb.sb_spino_align))) return -EFSCORRUPTED;
/* Make sure the entire range of blocks are valid AG inodes. */
agino = XFS_AGB_TO_AGINO(mp, agbno); if (!xfs_verify_agino(sc->sa.pag, agino)) return -EFSCORRUPTED;
/* Make sure this isn't free space. */
error = xfs_alloc_has_records(sc->sa.bno_cur, agbno, len, &outcome); if (error) return error; if (outcome != XBTREE_RECPACKING_EMPTY) return -EFSCORRUPTED;
return 0;
}
/*
 * Found a fragment of the old inode btrees; dispose of them later.
 *
 * NOTE(review): only the extent validation is present in this copy of the
 * file.  The body is never closed and nothing visible records the extent
 * anywhere, so the remainder of the function appears to have been lost.
 * NOTE(review): `STATICint` / `conststruct` look like `STATIC int` /
 * `const struct` with the separating whitespace lost.
 */ STATICint
xrep_ibt_record_old_btree_blocks( struct xrep_ibt *ri, conststruct xfs_rmap_irec *rec)
{ if (!xfs_verify_agbext(ri->sc->sa.pag, rec->rm_startblock,
rec->rm_blockcount)) return -EFSCORRUPTED;
/* Record extents that belong to inode cluster blocks. */ STATICint
xrep_ibt_record_inode_blocks( struct xrep_ibt *ri, conststruct xfs_rmap_irec *rec)
{ struct xfs_mount *mp = ri->sc->mp; struct xfs_ino_geometry *igeo = M_IGEO(mp);
xfs_agblock_t cluster_base; int error;
error = xrep_ibt_check_inode_ext(ri->sc, rec->rm_startblock,
rec->rm_blockcount); if (error) return error;
trace_xrep_ibt_walk_rmap(ri->sc->sa.pag, rec);
/* * Record the free/hole masks for each inode cluster that could be * mapped by this rmap record.
*/ for (cluster_base = 0;
cluster_base < rec->rm_blockcount;
cluster_base += igeo->blocks_per_cluster) {
error = xrep_ibt_process_cluster(ri,
rec->rm_startblock + cluster_base); if (error) return error;
}
if (xchk_should_terminate(ri->sc, &error)) return error;
switch (rec->rm_owner) { case XFS_RMAP_OWN_INOBT: return xrep_ibt_record_old_btree_blocks(ri, rec); case XFS_RMAP_OWN_INODES: return xrep_ibt_record_inode_blocks(ri, rec);
} return 0;
}
/* * Iterate all reverse mappings to find the inodes (OWN_INODES) and the inode * btrees (OWN_INOBT). Figure out if we have enough free space to reconstruct * the inode btrees. The caller must clean up the lists if anything goes * wrong.
*/ STATICint
xrep_ibt_find_inodes( struct xrep_ibt *ri)
{ struct xfs_scrub *sc = ri->sc; int error;
ri->rie.ir_startino = NULLAGINO;
/* Collect all reverse mappings for inode blocks. */
xrep_ag_btcur_init(sc, &sc->sa);
error = xfs_rmap_query_all(sc->sa.rmap_cur, xrep_ibt_walk_rmap, ri);
xchk_ag_btcur_free(&sc->sa); if (error) return error;
/* If we have a record ready to go, add it to the array. */ if (ri->rie.ir_startino != NULLAGINO) return xrep_ibt_stash(ri);
/* * The AGI header contains extra information related to the inode * btrees, so we must update those fields here.
*/
agi->agi_count = cpu_to_be32(ri->icount);
agi->agi_freecount = cpu_to_be32(freecount);
xfs_ialloc_log_agi(sc->tp, sc->sa.agi_bp,
XFS_AGI_COUNT | XFS_AGI_FREECOUNT);
/* Reinitialize with the values we just logged. */ return xrep_reinit_pagi(sc);
}
/*
 * Retrieve finobt data for bulk load.
 *
 * For each of the @nr_wanted records, read forward through
 * ri->inode_records (advancing ri->array_cur) until a record with a nonzero
 * free count is loaded into the cursor's record buffer.
 *
 * NOTE(review): this copy of the function is cut off inside the for loop;
 * the code that would place the loaded record into @block and the return
 * statement are missing, and block_rec is unused in the visible text.
 * NOTE(review): `STATICint` / `unsignedint` look like `STATIC int` /
 * `unsigned int` with the separating whitespace lost.
 */ STATICint
xrep_fibt_get_records( struct xfs_btree_cur *cur, unsignedint idx, struct xfs_btree_block *block, unsignedint nr_wanted, void *priv)
{ struct xfs_inobt_rec_incore *irec = &cur->bc_rec.i; struct xrep_ibt *ri = priv; union xfs_btree_rec *block_rec; unsignedint loaded; int error;
/* Skip stashed records that have no free inodes. */
for (loaded = 0; loaded < nr_wanted; loaded++, idx++) { do {
error = xfarray_load(ri->inode_records,
ri->array_cur++, irec);
} while (error == 0 && xfs_inobt_rec_freecount(irec) == 0); if (error) return error;
/*
 * Feed one of the new inobt blocks to the bulk loader.
 *
 * NOTE(review): only the signature and the first declaration are present in
 * this copy of the file; the statement that hands out a block and the
 * closing brace are missing.
 * NOTE(review): `STATICint` looks like `STATIC int` with the separating
 * whitespace lost.
 */ STATICint
xrep_ibt_claim_block( struct xfs_btree_cur *cur, union xfs_btree_ptr *ptr, void *priv)
{ struct xrep_ibt *ri = priv;
/*
 * Feed one of the new finobt blocks to the bulk loader.
 *
 * NOTE(review): only the signature and the first declaration are present in
 * this copy of the file; the statement that hands out a block and the
 * closing brace are missing.
 * NOTE(review): `STATICint` looks like `STATIC int` with the separating
 * whitespace lost.
 */ STATICint
xrep_fibt_claim_block( struct xfs_btree_cur *cur, union xfs_btree_ptr *ptr, void *priv)
{ struct xrep_ibt *ri = priv;
/*
 * Make sure the records do not overlap in inumber address space.
 *
 * Walks ri->inode_records in index order and fails with -EFSCORRUPTED as
 * soon as a record starts below next_agino.
 *
 * NOTE(review): this copy of the function is cut off inside the loop; the
 * statement that advances next_agino past the current record, the end of the
 * loop and the final return are missing, so as shown next_agino stays 0.
 * NOTE(review): `STATICint` looks like `STATIC int` with the separating
 * whitespace lost.
 */ STATICint
xrep_ibt_check_overlap( struct xrep_ibt *ri)
{ struct xfs_inobt_rec_incore irec;
xfarray_idx_t cur;
xfs_agino_t next_agino = 0; int error = 0;
foreach_xfarray_idx(ri->inode_records, cur) { if (xchk_should_terminate(ri->sc, &error)) return error;
error = xfarray_load(ri->inode_records, cur, &irec); if (error) return error;
if (irec.ir_startino < next_agino) return -EFSCORRUPTED;
/*
 * Build new inode btrees and dispose of the old one.
 *
 * Visible flow: verify that the collected records do not overlap, set up the
 * new inobt staging state, reserve blocks for the new tree(s), bulk load
 * them, commit the staged roots through the AGI buffer, reset the AGI
 * counters, commit the new-btree block reservations and roll the
 * transaction.  Returns 0 or a negative errno; the error labels at the
 * bottom unwind cursors first, then the new-btree reservations.
 *
 * NOTE(review): nothing in the visible body assigns ino_cur or fino_cur to a
 * cursor, initialises ri->new_finobt, or computes bload.nr_blocks, and the
 * err_inocur label has no goto targeting it.  The cursor and geometry setup
 * that belongs between the inobt staging setup and the "last chance to
 * abort" check appears to be missing from this copy -- restore it from the
 * upstream file before building this.
 * NOTE(review): `STATICint` looks like `STATIC int` with the separating
 * whitespace lost.
 */ STATICint
xrep_ibt_build_new_trees( struct xrep_ibt *ri)
{ struct xfs_scrub *sc = ri->sc; struct xfs_btree_cur *ino_cur; struct xfs_btree_cur *fino_cur = NULL; bool need_finobt; int error;
need_finobt = xfs_has_finobt(sc->mp);
/*
 * Create new btrees for staging all the inobt records we collected
 * earlier.  The records were collected in order of increasing agino,
 * so we do not have to sort them.  Ensure there are no overlapping
 * records.
 */
error = xrep_ibt_check_overlap(ri); if (error) return error;
/*
 * The new inode btrees will not be rooted in the AGI until we've
 * successfully rebuilt the tree.
 *
 * Start by setting up the inobt staging cursor.
 */
xrep_newbt_init_ag(&ri->new_inobt, sc, &XFS_RMAP_OINFO_INOBT,
xfs_agbno_to_fsb(sc->sa.pag, XFS_IBT_BLOCK(sc->mp)),
XFS_AG_RESV_NONE);
ri->new_inobt.bload.claim_block = xrep_ibt_claim_block;
ri->new_inobt.bload.get_records = xrep_ibt_get_records;
/* NOTE(review): cursor/geometry setup for both trees is expected here. */
/* Last chance to abort before we start committing fixes. */ if (xchk_should_terminate(sc, &error)) goto err_finocur;
/* Reserve all the space we need to build the new btrees. */
error = xrep_newbt_alloc_blocks(&ri->new_inobt,
ri->new_inobt.bload.nr_blocks); if (error) goto err_finocur;
if (need_finobt) {
error = xrep_newbt_alloc_blocks(&ri->new_finobt,
ri->new_finobt.bload.nr_blocks); if (error) goto err_finocur;
}
/* Add all inobt records; get_records() reads from the array start. */
ri->array_cur = XFARRAY_CURSOR_INIT;
error = xfs_btree_bload(ino_cur, &ri->new_inobt.bload, ri); if (error) goto err_finocur;
/* Add all finobt records. */ if (need_finobt) {
ri->array_cur = XFARRAY_CURSOR_INIT;
error = xfs_btree_bload(fino_cur, &ri->new_finobt.bload, ri); if (error) goto err_finocur;
}
/*
 * Install the new btrees in the AG header.  After this point the old
 * btrees are no longer accessible and the new trees are live.
 */
xfs_inobt_commit_staged_btree(ino_cur, sc->tp, sc->sa.agi_bp);
xfs_btree_del_cursor(ino_cur, 0);
if (fino_cur) {
xfs_inobt_commit_staged_btree(fino_cur, sc->tp, sc->sa.agi_bp);
xfs_btree_del_cursor(fino_cur, 0);
}
/*
 * Reset the AGI counters now that we've changed the inode roots.  Both
 * cursors are already deleted, so failures from here on jump past the
 * cursor cleanup labels.
 */
error = xrep_ibt_reset_counters(ri); if (error) goto err_finobt;
/* Free unused blocks and bitmap. */ if (need_finobt) {
error = xrep_newbt_commit(&ri->new_finobt); if (error) goto err_inobt;
}
error = xrep_newbt_commit(&ri->new_inobt); if (error) return error;
return xrep_roll_ag_trans(sc);
err_finocur: if (need_finobt)
xfs_btree_del_cursor(fino_cur, error);
err_inocur:
xfs_btree_del_cursor(ino_cur, error);
err_finobt: if (need_finobt)
xrep_newbt_cancel(&ri->new_finobt);
err_inobt:
xrep_newbt_cancel(&ri->new_inobt); return error;
}
/* * Now that we've logged the roots of the new btrees, invalidate all of the * old blocks and free them.
*/ STATICint
xrep_ibt_remove_old_trees( struct xrep_ibt *ri)
{ struct xfs_scrub *sc = ri->sc; int error;
/* * Free the old inode btree blocks if they're not in use. It's ok to * reap with XFS_AG_RESV_NONE even if the finobt had a per-AG * reservation because we reset the reservation before releasing the * AGI and AGF header buffer locks.
*/
error = xrep_reap_agblocks(sc, &ri->old_iallocbt_blocks,
&XFS_RMAP_OINFO_INOBT, XFS_AG_RESV_NONE); if (error) return error;
/* * If the finobt is enabled and has a per-AG reservation, make sure we * reinitialize the per-AG reservations.
*/ if (xfs_has_finobt(sc->mp) && !sc->mp->m_finobt_nores)
sc->flags |= XREP_RESET_PERAG_RESV;
return 0;
}
/*
 * Repair both inode btrees.
 *
 * Visible flow: refuse to run without the rmapbt, allocate the repair state,
 * mark both inode btrees as the structures being repaired, create the array
 * that holds the reconstructed inode records, then collect the records,
 * build the new trees and remove the old ones.
 *
 * NOTE(review): this copy of the function is cut off after the last step;
 * the out_bitmap and out_ri labels that the gotos below target, whatever
 * cleanup they perform for ri, the record array and the bitmap, and the
 * final return are all missing.
 */ int
xrep_iallocbt( struct xfs_scrub *sc)
{ struct xrep_ibt *ri; struct xfs_mount *mp = sc->mp; char *descr;
xfs_agino_t first_agino, last_agino; int error = 0;
/* We require the rmapbt to rebuild anything. */ if (!xfs_has_rmapbt(mp)) return -EOPNOTSUPP;
ri = kzalloc(sizeof(struct xrep_ibt), XCHK_GFP_FLAGS); if (!ri) return -ENOMEM;
ri->sc = sc;
/* We rebuild both inode btrees. */
sc->sick_mask = XFS_SICK_AG_INOBT | XFS_SICK_AG_FINOBT;
/*
 * Set up enough storage to handle an AG with nothing but inodes: the
 * array capacity is the last AG inode number divided by the chunk size.
 */
xfs_agino_range(mp, pag_agno(sc->sa.pag), &first_agino, &last_agino);
last_agino /= XFS_INODES_PER_CHUNK;
descr = xchk_xfile_ag_descr(sc, "inode index records");
error = xfarray_create(descr, last_agino, sizeof(struct xfs_inobt_rec_incore),
&ri->inode_records);
/* The description string is freed right after the create call. */
kfree(descr); if (error) goto out_ri;
/* Collect the inode data and find the old btree blocks. */
xagb_bitmap_init(&ri->old_iallocbt_blocks);
error = xrep_ibt_find_inodes(ri); if (error) goto out_bitmap;
/* Rebuild the inode indexes. */
error = xrep_ibt_build_new_trees(ri); if (error) goto out_bitmap;
/* Kill the old tree. */
error = xrep_ibt_remove_old_trees(ri); if (error) goto out_bitmap;
/*
 * Re-run the inode btree scrubbers after a rebuild to confirm that both
 * trees are consistent.  Returns the first nonzero scrubber result, or 0 if
 * every check ran cleanly.
 */
int
xrep_revalidate_iallocbt(
	struct xfs_scrub	*sc)
{
	__u32			saved_type = sc->sm->sm_type;
	int			ret;

	/*
	 * The scrub type is switched for the duration of each check so that
	 * the tree-to-tree cross reference checks run in the correct
	 * direction and tracing attributes further errors to the right tree.
	 */
	sc->sm->sm_type = XFS_SCRUB_TYPE_INOBT;
	ret = xchk_iallocbt(sc);

	/* Only look at the finobt if the inobt check itself succeeded. */
	if (ret == 0 && xfs_has_finobt(sc->mp)) {
		sc->sm->sm_type = XFS_SCRUB_TYPE_FINOBT;
		ret = xchk_iallocbt(sc);
	}

	/* Put back whatever scrub type the caller asked for. */
	sc->sm->sm_type = saved_type;
	return ret;
}
Messung V0.5
¤ Dauer der Verarbeitung: 0.12 Sekunden
(vorverarbeitet)
¤
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereitgestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.