/* * bcachefs on disk data structures * * OVERVIEW: * * There are three main types of on disk data structures in bcachefs (this is * reduced from 5 in bcache) * * - superblock * - journal * - btree * * The btree is the primary structure; most metadata exists as keys in the * various btrees. There are only a small number of btrees, they're not * sharded - we have one btree for extents, another for inodes, et cetera. * * SUPERBLOCK: * * The superblock contains the location of the journal, the list of devices in * the filesystem, and in general any metadata we need in order to decide * whether we can start a filesystem or prior to reading the journal/btree * roots. * * The superblock is extensible, and most of the contents of the superblock are * in variable length, type tagged fields; see struct bch_sb_field. * * Backup superblocks do not reside in a fixed location; also, superblocks do * not have a fixed size. To locate backup superblocks we have struct * bch_sb_layout; we store a copy of this inside every superblock, and also * before the first superblock. * * JOURNAL: * * The journal primarily records btree updates in the order they occurred; * journal replay consists of just iterating over all the keys in the open * journal entries and re-inserting them into the btrees. * * The journal also contains entry types for the btree roots, and blacklisted * journal sequence numbers (see journal_seq_blacklist.c). * * BTREE: * * bcachefs btrees are copy on write b+ trees, where nodes are big (typically * 128k-256k) and log structured. We use struct btree_node for writing the first * entry in a given node (offset 0), and struct btree_node_entry for all * subsequent writes. * * After the header, btree node entries contain a list of keys in sorted order. * Values are stored inline with the keys; since values are variable length (and * keys effectively are variable length too, due to packing) we can't do random * access without building up additional in memory tables in the btree node read * path. * * BTREE KEYS (struct bkey): * * The various btrees share a common format for the key - so as to avoid * switching in fastpath lookup/comparison code - but define their own * structures for the key values. * * The size of a key/value pair is stored as a u8 in units of u64s, so the max * size is just under 2k. The common part also contains a type tag for the * value, and a format field indicating whether the key is packed or not (and * also meant to allow adding new key fields in the future, if desired). * * bkeys, when stored within a btree node, may also be packed. In that case, the * bkey_format in that node is used to unpack it. Packed bkeys mean that we can * be generous with field sizes in the common part of the key format (64 bit * inode number, 64 bit offset, 96 bit version field, etc.) for negligible cost.
*/
#define LE16_BITMASK(n, t, f, o, e) LE_BITMASK(16, n, t, f, o, e) #define LE32_BITMASK(n, t, f, o, e) LE_BITMASK(32, n, t, f, o, e) #define LE64_BITMASK(n, t, f, o, e) LE_BITMASK(64, n, t, f, o, e)
struct bkey_format {
__u8 key_u64s;
__u8 nr_fields; /* One unused slot for now: */
__u8 bits_per_field[6];
__le64 field_offset[6];
};
/* Btree keys - all units are in sectors */
struct bpos { /* * Word order matches machine byte order - btree code treats a bpos as a * single large integer, for search/comparison purposes * * Note that wherever a bpos is embedded in another on disk data * structure, it has to be byte swabbed when reading in metadata that * wasn't written in native endian order:
*/ #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
__u32 snapshot;
__u64 offset;
__u64 inode; #elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
__u64 inode;
__u64 offset; /* Points to end of extent - sectors */
__u32 snapshot; #else #error edit for your odd byteorder. #endif
} __packed #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
__aligned(4) #endif
;
struct bkey { /* Size of combined key and value, in u64s */
__u8 u64s;
/* Format of key (0 for format local to btree node) */ #ifdefined(__LITTLE_ENDIAN_BITFIELD)
__u8 format:7,
needs_whiteout:1; #elifdefined (__BIG_ENDIAN_BITFIELD)
__u8 needs_whiteout:1,
format:7; #else #error edit for your odd byteorder. #endif
__u8 pad[1]; #endif
} __packed #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ /* * The big-endian version of bkey can't be compiled by rustc with the "aligned" * attr since it doesn't allow types to have both "packed" and "aligned" attrs. * So for Rust compatibility, don't include this. It can be included in the LE * version because the "packed" attr is redundant in that case. * * History: (quoting Kent) * * Specifically, when i was designing bkey, I wanted the header to be no * bigger than necessary so that bkey_packed could use the rest. That means that * decently offten extent keys will fit into only 8 bytes, instead of spilling over * to 16. * * But packed_bkey treats the part after the header - the packed section - * as a single multi word, variable length integer. And bkey, the unpacked * version, is just a special case version of a bkey_packed; all the packed * bkey code will work on keys in any packed format, the in-memory * representation of an unpacked key also is just one type of packed key... * * So that constrains the key part of a bkig endian bkey to start right * after the header. * * If we ever do a bkey_v2 and need to expand the hedaer by another byte for * some reason - that will clean up this wart.
*/
__aligned(8) #endif
;
struct bkey_packed {
__u64 _data[0];
/* Size of combined key and value, in u64s */
__u8 u64s;
/* Format of key (0 for format local to btree node) */
/* * XXX: next incompat on disk format change, switch format and * needs_whiteout - bkey_packed() will be cheaper if format is the high * bits of the bitfield
*/ #ifdefined(__LITTLE_ENDIAN_BITFIELD)
__u8 format:7,
needs_whiteout:1; #elifdefined (__BIG_ENDIAN_BITFIELD)
__u8 needs_whiteout:1,
format:7; #endif
/* Type of the value */
__u8 type;
__u8 key_start[0];
/* * We copy bkeys with struct assignment in various places, and while * that shouldn't be done with packed bkeys we can't disallow it in C, * and it's legal to cast a bkey to a bkey_packed - so padding it out * to the same size as struct bkey should hopefully be safest.
*/
__u8 pad[sizeof(struct bkey) - 3];
} __packed __aligned(8);
/* * - DELETED keys are used internally to mark keys that should be ignored but * override keys in composition order. Their version number is ignored. * * - DISCARDED keys indicate that the data is all 0s because it has been * discarded. DISCARDs may have a version; if the version is nonzero the key * will be persistent, otherwise the key will be dropped whenever the btree * node is rewritten (like DELETED keys). * * - ERROR: any read of the data returns a read error, as the data was lost due * to a failing device. Like DISCARDED keys, they can be removed (overridden) * by new writes or cluster-wide GC. Node repair can also overwrite them with * the same or a more recent version number, but not with an older version * number. * * - WHITEOUT: for hash table btrees
*/ #define BCH_BKEY_TYPES() \
x(deleted, 0, 0) \
x(whiteout, 1, 0) \
x(error, 2, 0) \
x(cookie, 3, 0) \
x(hash_whiteout, 4, BKEY_TYPE_strict_btree_checks) \
x(btree_ptr, 5, BKEY_TYPE_strict_btree_checks) \
x(extent, 6, BKEY_TYPE_strict_btree_checks) \
x(reservation, 7, BKEY_TYPE_strict_btree_checks) \
x(inode, 8, BKEY_TYPE_strict_btree_checks) \
x(inode_generation, 9, BKEY_TYPE_strict_btree_checks) \
x(dirent, 10, BKEY_TYPE_strict_btree_checks) \
x(xattr, 11, BKEY_TYPE_strict_btree_checks) \
x(alloc, 12, BKEY_TYPE_strict_btree_checks) \
x(quota, 13, BKEY_TYPE_strict_btree_checks) \
x(stripe, 14, BKEY_TYPE_strict_btree_checks) \
x(reflink_p, 15, BKEY_TYPE_strict_btree_checks) \
x(reflink_v, 16, BKEY_TYPE_strict_btree_checks) \
x(inline_data, 17, BKEY_TYPE_strict_btree_checks) \
x(btree_ptr_v2, 18, BKEY_TYPE_strict_btree_checks) \
x(indirect_inline_data, 19, BKEY_TYPE_strict_btree_checks) \
x(alloc_v2, 20, BKEY_TYPE_strict_btree_checks) \
x(subvolume, 21, BKEY_TYPE_strict_btree_checks) \
x(snapshot, 22, BKEY_TYPE_strict_btree_checks) \
x(inode_v2, 23, BKEY_TYPE_strict_btree_checks) \
x(alloc_v3, 24, BKEY_TYPE_strict_btree_checks) \
x(set, 25, 0) \
x(lru, 26, BKEY_TYPE_strict_btree_checks) \
x(alloc_v4, 27, BKEY_TYPE_strict_btree_checks) \
x(backpointer, 28, BKEY_TYPE_strict_btree_checks) \
x(inode_v3, 29, BKEY_TYPE_strict_btree_checks) \
x(bucket_gens, 30, BKEY_TYPE_strict_btree_checks) \
x(snapshot_tree, 31, BKEY_TYPE_strict_btree_checks) \
x(logged_op_truncate, 32, BKEY_TYPE_strict_btree_checks) \
x(logged_op_finsert, 33, BKEY_TYPE_strict_btree_checks) \
x(accounting, 34, BKEY_TYPE_strict_btree_checks) \
x(inode_alloc_cursor, 35, BKEY_TYPE_strict_btree_checks)
/* * Most superblock fields are replicated in all device's superblocks - a few are * not:
*/ #define BCH_SINGLE_DEVICE_SB_FIELDS \
((1U << BCH_SB_FIELD_journal)| \
(1U << BCH_SB_FIELD_journal_v2))
/* * If this field is present in the superblock, it stores an encryption key which * is used encrypt all other data/metadata. The key will normally be encrypted * with the key userspace provides, but if encryption has been turned off we'll * just store the master key unencrypted in the superblock so we can access the * previously encrypted data.
*/ struct bch_sb_field_crypt { struct bch_sb_field field;
/* stored as base 2 log of scrypt params: */
LE64_BITMASK(BCH_KDF_SCRYPT_N, struct bch_sb_field_crypt, kdf_flags, 0, 16);
LE64_BITMASK(BCH_KDF_SCRYPT_R, struct bch_sb_field_crypt, kdf_flags, 16, 32);
LE64_BITMASK(BCH_KDF_SCRYPT_P, struct bch_sb_field_crypt, kdf_flags, 32, 48);
/* * On clean shutdown, store btree roots and current journal sequence number in * the superblock:
*/ struct jset_entry {
__le16 u64s;
__u8 btree_id;
__u8 level;
__u8 type; /* designates what this jset holds */
__u8 pad[3];
/* * New versioning scheme: * One common version number for all on disk data structures - superblock, btree * nodes, journal entries
*/ #define BCH_VERSION_MAJOR(_v) ((__u16) ((_v) >> 10)) #define BCH_VERSION_MINOR(_v) ((__u16) ((_v) & ~(~0U << 10))) #define BCH_VERSION(_major, _minor) (((_major) << 10)|(_minor) << 0)
/* * @offset - sector where this sb was written * @version - on disk format version * @version_min - Oldest metadata version this filesystem contains; so we can * safely drop compatibility code and refuse to mount filesystems * we'd need it for * @magic - identifies as a bcachefs superblock (BCHFS_MAGIC) * @seq - incremented each time superblock is written * @uuid - used for generating various magic numbers and identifying * member devices, never changes * @user_uuid - user visible UUID, may be changed * @label - filesystem label * @seq - identifies most recent superblock, incremented each time * superblock is written * @features - enabled incompatible features
*/ struct bch_sb { struct bch_csum csum;
__le16 version;
__le16 version_min;
__le16 pad[2];
__uuid_t magic;
__uuid_t uuid;
__uuid_t user_uuid;
__u8 label[BCH_SB_LABEL_SIZE];
__le64 offset;
__le64 seq;
/* * Flags: * BCH_SB_INITALIZED - set on first mount * BCH_SB_CLEAN - did we shut down cleanly? Just a hint, doesn't affect * behaviour of mount/recovery path: * BCH_SB_INODE_32BIT - limit inode numbers to 32 bits * BCH_SB_128_BIT_MACS - 128 bit macs instead of 80 * BCH_SB_ENCRYPTION_TYPE - if nonzero encryption is enabled; overrides * DATA/META_CSUM_TYPE. Also indicates encryption * algorithm in use, if/when we get more than one
*/
/* * Max size of an extent that may require bouncing to read or write * (checksummed, compressed): 64k
*/
LE64_BITMASK(BCH_SB_ENCODED_EXTENT_MAX_BITS, struct bch_sb, flags[1], 14, 20);
enum bch_compression_opts { #define x(t, n) BCH_COMPRESSION_OPT_##t = n,
BCH_COMPRESSION_OPTS() #undef x
BCH_COMPRESSION_OPT_NR
};
/* * Magic numbers * * The various other data structures have their own magic numbers, which are * xored with the first part of the cache set's UUID
*/
staticinlinebool jset_entry_is_key(struct jset_entry *e)
{ switch (e->type) { case BCH_JSET_ENTRY_btree_keys: case BCH_JSET_ENTRY_btree_root: case BCH_JSET_ENTRY_write_buffer_keys: returntrue;
}
returnfalse;
}
/* * Journal sequence numbers can be blacklisted: bsets record the max sequence * number of all the journal entries they contain updates for, so that on * recovery we can ignore those bsets that contain index updates newer that what * made it into the journal. * * This means that we can't reuse that journal_seq - we have to skip it, and * then record that we skipped it so that the next time we crash and recover we * don't think there was a missing journal entry.
*/ struct jset_entry_blacklist { struct jset_entry entry;
__le64 seq;
};
/* * On disk format for a journal entry: * seq is monotonically increasing; every journal entry has its own unique * sequence number. * * last_seq is the oldest journal entry that still has keys the btree hasn't * flushed to disk yet. * * version is for on disk format changes.
*/ struct jset { struct bch_csum csum;
/* * Maximum number of btrees that we will _ever_ have under the current scheme, * where we refer to them with 64 bit bitfields - and we also need a bit for * the interior btree node type:
*/ #define BTREE_ID_NR_MAX 63
staticinlinebool btree_id_is_alloc(enum btree_id id)
{ switch (id) { case BTREE_ID_alloc: case BTREE_ID_backpointers: case BTREE_ID_need_discard: case BTREE_ID_freespace: case BTREE_ID_bucket_gens: case BTREE_ID_lru: case BTREE_ID_accounting: returntrue; default: returnfalse;
}
}
#define BTREE_MAX_DEPTH 4U
/* Btree nodes */
/* * Btree nodes * * On disk a btree node is a list/log of these; within each set the keys are * sorted
*/ struct bset {
__le64 seq;
/* * Highest journal entry this bset contains keys for. * If on recovery we don't see that journal entry, this bset is ignored: * this allows us to preserve the order of all index updates after a * crash, since the journal records a total order of all index updates * and anything that didn't make it to the journal doesn't get used.
*/
__le64 journal_seq;
__le32 flags;
__le16 version;
__le16 u64s; /* count of d[] in u64s */
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereit gestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.