/* * This picks a non-stale pointer, preferably from a device other than @avoid. * Avoid can be NULL, meaning pick any. If there are no non-stale pointers to * other devices, it will still pick a pointer from avoid.
 */ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k, struct bch_io_failures *failed, struct extent_ptr_decoded *pick, int dev)
{ bool have_csum_errors = false, have_io_errors = false, have_missing_devs = false; bool have_dirty_ptrs = false, have_pick = false;
/* Error keys carry no readable data: */
if (k.k->type == KEY_TYPE_error) return bch_err_throw(c, key_type_error);
/*
 * NOTE(review): the lines below reference 'p' (extent_ptr_decoded) and
 * 'ca' (struct bch_dev *) with no visible declaration or loop header —
 * the per-pointer iteration this belongs to appears to have been lost
 * from this copy; confirm against upstream.
 */
if (unlikely(!ca && p.ptr.dev != BCH_SB_MEMBER_INVALID)) {
/* Device not present: drop RCU while handling the missing device */
rcu_read_unlock(); int ret = bch2_dev_missing_bkey(c, k, p.ptr.dev); if (ret) return ret;
rcu_read_lock();
}
/* Never read from a stale cached pointer: */
if (p.ptr.cached && (!ca || dev_ptr_stale_rcu(ca, &p.ptr))) continue;
have_missing_devs |= ca && !bch2_dev_is_online(ca);
if (!ca || !bch2_dev_is_online(ca)) { if (!p.has_ec) continue;
/* Device offline but erasure coded: reconstruct from other shards */
p.do_ec_reconstruct = true;
}
if (static_branch_unlikely(&bch2_force_reconstruct_read) && p.has_ec)
p.do_ec_reconstruct = true;
u64 p_latency = dev_latency(ca); /* * Square the latencies, to bias more in favor of the faster * device - we never want to stop issuing reads to the slower * device altogether, so that we can update our latency numbers:
 */
p_latency *= p_latency;
/* Map accumulated state to a return code, most specific error first: */
if (have_pick) return 1; if (!have_dirty_ptrs) return 0; if (have_missing_devs) return bch_err_throw(c, no_device_to_read_from); if (have_csum_errors) return bch_err_throw(c, data_read_csum_err); if (have_io_errors) return bch_err_throw(c, data_read_io_err);
/* * If we get here, we have pointers (bkey_ptrs_validate() ensures that), * but they don't point to valid devices:
 */ return bch_err_throw(c, no_devices_valid);
}
/* KEY_TYPE_btree_ptr: */
int bch2_btree_ptr_validate(struct bch_fs *c, struct bkey_s_c k, struct bkey_validate_context from)
{
	int ret = 0;

	/*
	 * A btree ptr holds at most BCH_REPLICAS_MAX pointers; a larger
	 * value is corrupt.  On failure the macro jumps to fsck_err with
	 * ret set.
	 */
	bkey_fsck_err_on(bkey_val_u64s(k.k) > BCH_REPLICAS_MAX,
			 c, btree_ptr_val_too_big,
			 "value too big (%zu > %u)",
			 bkey_val_u64s(k.k), BCH_REPLICAS_MAX);

	/* Common checks shared by all pointer-carrying key types: */
	ret = bch2_bkey_ptrs_validate(c, k, from);
fsck_err:
	return ret;
}
int bch2_btree_ptr_v2_validate(struct bch_fs *c, struct bkey_s_c k, struct bkey_validate_context from)
{ struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2(k); int ret = 0;
bkey_fsck_err_on(bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX,
c, btree_ptr_v2_val_too_big, "value too big (%zu > %zu)",
bkey_val_u64s(k.k), BKEY_BTREE_PTR_VAL_U64s_MAX);
if (lp.crc.compression_type != rp.crc.compression_type ||
lp.crc.nonce != rp.crc.nonce) returnfalse;
if (lp.crc.offset + lp.crc.live_size + rp.crc.live_size <=
lp.crc.uncompressed_size) { /* can use left extent's crc entry */
} elseif (lp.crc.live_size <= rp.crc.offset) { /* can use right extent's crc entry */
} else { /* check if checksums can be merged: */ if (lp.crc.csum_type != rp.crc.csum_type ||
lp.crc.nonce != rp.crc.nonce ||
crc_is_compressed(lp.crc) ||
!bch2_checksum_mergeable(lp.crc.csum_type)) returnfalse;
if (lp.crc.offset + lp.crc.live_size != lp.crc.compressed_size ||
rp.crc.offset) returnfalse;
int bch2_reservation_validate(struct bch_fs *c, struct bkey_s_c k, struct bkey_validate_context from)
{ struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k); int ret = 0;
bkey_for_each_crc(k.k, ptrs, crc, i) if (can_narrow_crc(crc, n)) returntrue;
returnfalse;
}
/*
 * We're writing another replica for this extent, so while we've got the data in
 * memory we'll be computing a new checksum for the currently live data.
 *
 * If there are other replicas we aren't moving, and they are checksummed but
 * not compressed, we can modify them to point to only the data that is
 * currently live (so that readers won't have to bounce) while we've got the
 * checksum we need:
 */
bool bch2_bkey_narrow_crcs(struct bkey_i *k, struct bch_extent_crc_unpacked n)
{
	struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k));
	struct bch_extent_crc_unpacked u;
	struct extent_ptr_decoded p;
	union bch_extent_entry *i;
	bool ret = false;

	/* Find a checksum entry that covers only live data: */
	if (!n.csum_type) {
		bkey_for_each_crc(&k->k, ptrs, u, i)
			if (!crc_is_compressed(u) &&
			    u.csum_type &&
			    u.live_size == u.uncompressed_size) {
				n = u;
				goto found;
			}
		return false;	/* was fused "returnfalse" */
	}
found:
	BUG_ON(crc_is_compressed(n));
	BUG_ON(n.offset);
	BUG_ON(n.live_size != k->k.size);
	/*
	 * NOTE(review): truncated here in this copy — the narrowing loop and
	 * the return are missing; restore from upstream.
	 */
staticunion bch_extent_entry *extent_entry_prev(struct bkey_ptrs ptrs, union bch_extent_entry *entry)
{ union bch_extent_entry *i = ptrs.start;
if (i == entry) return NULL;
while (extent_entry_next(i) != entry)
i = extent_entry_next(i); return i;
}
/* * Returns pointer to the next entry after the one being dropped:
*/ void bch2_bkey_drop_ptr_noerror(struct bkey_s k, struct bch_extent_ptr *ptr)
{ struct bkey_ptrs ptrs = bch2_bkey_ptrs(k); union bch_extent_entry *entry = to_entry(ptr), *next; bool drop_crc = true;
if (k.k->type == KEY_TYPE_stripe) {
ptr->dev = BCH_SB_MEMBER_INVALID; return;
}
/* * If we deleted all the dirty pointers and there's still cached * pointers, we could set the cached pointers to dirty if they're not * stale - but to do that correctly we'd need to grab an open_bucket * reference so that we don't race with bucket reuse:
*/ if (have_dirty &&
!bch2_bkey_dirty_devs(k.s_c).nr) {
k.k->type = KEY_TYPE_error;
set_bkey_val_u64s(k.k, 0);
} elseif (!bch2_bkey_nr_ptrs(k.s_c)) {
k.k->type = KEY_TYPE_deleted;
set_bkey_val_u64s(k.k, 0);
}
}
/* * Returns true if two extents refer to the same data:
*/ bool bch2_extents_match(struct bkey_s_c k1, struct bkey_s_c k2)
{ if (k1.k->type != k2.k->type) returnfalse;
/* * This checks that the two pointers point * to the same region on disk - adjusting * for the difference in where the extents * start, since one may have been trimmed:
*/
(s64) p1.ptr.offset + p1.crc.offset - bkey_start_offset(k1.k) ==
(s64) p2.ptr.offset + p2.crc.offset - bkey_start_offset(k2.k) &&
/* * This additionally checks that the * extents overlap on disk, since the * previous check may trigger spuriously * when one extent is immediately partially * overwritten with another extent (so that * on disk they are adjacent) and * compression is in use:
*/
((p1.ptr.offset >= p2.ptr.offset &&
p1.ptr.offset < p2.ptr.offset + p2.crc.compressed_size) ||
(p2.ptr.offset >= p1.ptr.offset &&
p2.ptr.offset < p1.ptr.offset + p1.crc.compressed_size))) returntrue;
/* * bch2_extent_normalize - clean up an extent, dropping stale pointers etc. * * Returns true if @k should be dropped entirely * * For existing keys, only called when btree nodes are being rewritten, not when * they're merely being compacted/resorted in memory.
 */ bool bch2_extent_normalize(struct bch_fs *c, struct bkey_s k)
{ struct bch_dev *ca;
/*
 * NOTE(review): the body of this function is missing from this copy —
 * restore from upstream.
 */
/*
 * bch2_extent_normalize_by_opts - clean up an extent, dropping stale pointers etc.
 *
 * Like bch2_extent_normalize(), but also only keeps a single cached pointer on
 * the promote target.
 */
bool bch2_extent_normalize_by_opts(struct bch_fs *c, struct bch_io_opts *opts, struct bkey_s k)
{
	struct bkey_ptrs ptrs;
	bool have_cached_ptr;

	/*
	 * NOTE(review): this copy is garbled — the code below mixes a
	 * to-text routine (the prt_printf/out lines) with a validate routine
	 * (the bkey_fsck_err_on lines); the real body of this function is
	 * missing.  The fused "conststruct" has been repaired; restore the
	 * original functions from upstream.
	 */
	if (c)
		prt_printf(out, "durability: %u ", bch2_bkey_durability_safe(c, k));

	bkey_extent_entry_for_each(ptrs, entry) {
		if (!first)
			prt_printf(out, " ");

		switch (__extent_entry_type(entry)) {
		case BCH_EXTENT_ENTRY_ptr:
			bch2_extent_ptr_to_text(out, c, entry_to_ptr(entry));
			break;
		case BCH_EXTENT_ENTRY_crc32:
		case BCH_EXTENT_ENTRY_crc64:
		case BCH_EXTENT_ENTRY_crc128: {
			struct bch_extent_crc_unpacked crc =
				bch2_extent_crc_unpack(k.k, entry_to_crc(entry));

			size_ondisk = crc.compressed_size;
			break;
		case BCH_EXTENT_ENTRY_stripe_ptr:
			bkey_fsck_err_on(have_ec,
					 c, ptr_stripe_redundant,
					 "redundant stripe entry");
			have_ec = true;
			break;
		case BCH_EXTENT_ENTRY_rebalance: {
			/*
			 * this shouldn't be a fsck error, for forward
			 * compatibility; the rebalance code should just refetch
			 * the compression opt if it's unknown
			 */
#if 0
			const struct bch_extent_rebalance *r = &entry->rebalance;

			if (!bch2_compression_opt_valid(r->compression)) {
				struct bch_compression_opt opt = __bch2_compression_decode(r->compression);

				prt_printf(err, "invalid compression opt %u:%u",
					   opt.type, opt.level);
				return bch_err_throw(c, invalid_bkey);
			}
#endif
			break;
		}
		case BCH_EXTENT_ENTRY_flags:
			bkey_fsck_err_on(entry != ptrs.start,
					 c, extent_flags_not_at_start,
					 "extent flags entry not at start");
			break;
		}
	}
/*
 * NOTE(review): this is the interior of a byte-swap routine whose function
 * header is missing from this copy; 'ptrs', 'd', and 'entry' are declared in
 * the missing part — restore from upstream.
 */
/* First pass: byteswap the raw u64s of the value: */
for (d = (u64 *) ptrs.start;
d != (u64 *) ptrs.end;
d++)
*d = swab64(*d);
/* Second pass: re-swab checksum fields whose layout isn't plain u64s: */
for (entry = ptrs.start;
entry < ptrs.end;
entry = extent_entry_next(entry)) { switch (__extent_entry_type(entry)) { case BCH_EXTENT_ENTRY_ptr: break; case BCH_EXTENT_ENTRY_crc32:
entry->crc32.csum = swab32(entry->crc32.csum); break; case BCH_EXTENT_ENTRY_crc64:
entry->crc64.csum_hi = swab16(entry->crc64.csum_hi);
entry->crc64.csum_lo = swab64(entry->crc64.csum_lo); break; case BCH_EXTENT_ENTRY_crc128:
entry->crc128.csum.hi = (__force __le64)
swab64((__force u64) entry->crc128.csum.hi);
entry->crc128.csum.lo = (__force __le64)
swab64((__force u64) entry->crc128.csum.lo); break; case BCH_EXTENT_ENTRY_stripe_ptr: break; case BCH_EXTENT_ENTRY_rebalance: break; default: /* Bad entry type: will be caught by validate() */ return;
}
}
}
int bch2_bkey_extent_flags_set(struct bch_fs *c, struct bkey_i *k, u64 flags)
{ int ret = bch2_request_incompat_feature(c, bcachefs_metadata_version_extent_flags); if (ret) return ret;
/*
 * NOTE(review): the rest of this function (setting the flags entry and
 * the closing brace) is missing from this copy — restore from upstream.
 */
int bch2_cut_front_s(struct bpos where, struct bkey_s k)
{ unsigned new_val_u64s = bkey_val_u64s(k.k); int val_u64s_delta;
u64 sub;
/* Nothing to trim if @where is at or before the start of the key: */
if (bkey_le(where, bkey_start_pos(k.k))) return 0;
EBUG_ON(bkey_gt(where, k.k->p));
/* Number of sectors being trimmed off the front: */
sub = where.offset - bkey_start_offset(k.k);
k.k->size -= sub;
if (!k.k->size) {
k.k->type = KEY_TYPE_deleted;
new_val_u64s = 0;
}
/* Adjust the value for the trimmed sectors, per key type: */
switch (k.k->type) { case KEY_TYPE_extent: case KEY_TYPE_reflink_v: { struct bkey_ptrs ptrs = bch2_bkey_ptrs(k); union bch_extent_entry *entry; bool seen_crc = false;
/*
 * Pointers before the first crc entry get the offset applied directly;
 * once a crc entry has been seen, the crc entries carry it instead:
 */
bkey_extent_entry_for_each(ptrs, entry) { switch (extent_entry_type(entry)) { case BCH_EXTENT_ENTRY_ptr: if (!seen_crc)
entry->ptr.offset += sub; break; case BCH_EXTENT_ENTRY_crc32:
entry->crc32.offset += sub; break; case BCH_EXTENT_ENTRY_crc64:
entry->crc64.offset += sub; break; case BCH_EXTENT_ENTRY_crc128:
entry->crc128.offset += sub; break; case BCH_EXTENT_ENTRY_stripe_ptr: case BCH_EXTENT_ENTRY_rebalance: case BCH_EXTENT_ENTRY_flags: break;
}
if (extent_entry_is_crc(entry))
seen_crc = true;
}
break;
} case KEY_TYPE_reflink_p: { struct bkey_s_reflink_p p = bkey_s_to_reflink_p(k);
/* Indirect extent: advance the index into the reflink btree: */
SET_REFLINK_P_IDX(p.v, REFLINK_P_IDX(p.v) + sub); break;
} case KEY_TYPE_inline_data: case KEY_TYPE_indirect_inline_data: { void *p = bkey_inline_data_p(k); unsigned bytes = bkey_inline_data_bytes(k.k);
/*
 * NOTE(review): function is truncated here in this copy (the inline-data
 * move, remaining cases, and return are missing) — restore from upstream.
 */
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereit gestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.