/*
 * NOTE(review): fragment — the enclosing function begins before this
 * excerpt (device latency accounting; presumably bch2_latency_acct(),
 * confirm against the full file). ca, latency, io_latency, now, old, new
 * are declared outside this view.
 */
	if (latency_threshold && latency_over > 0) {
		/*
		 * bump up congested by approximately latency_over * 4 /
		 * latency_threshold - we don't need much accuracy here so
		 * don't bother with the divide:
		 */
		if (atomic_read(&ca->congested) < CONGESTED_MAX)
			atomic_add(latency_over >>
				   max_t(int, ilog2(latency_threshold) - 2, 0),
				   &ca->congested);
	} /* NOTE(review): closing brace restored — lost in extraction; verify */

	/* Lockless EWMA update of the running latency estimate: */
	old = atomic64_read(latency);
	do {
		/*
		 * If the io latency was reasonably close to the current
		 * latency, skip doing the update and atomic operation - most
		 * of the time:
		 */
		if (abs((int) (old - io_latency)) < (old >> 1) &&
		    now & ~(~0U << 5))
			break;

		new = ewma_add(old, io_latency, 5);
	} while (!atomic64_try_cmpxchg(latency, &old, new));
/*
 * Look up and prepare this extent's inode for an (optionally unjournalled)
 * i_size/i_sectors update.
 *
 * NOTE(review): fragment — the function body continues past the end of this
 * excerpt. Also, the declaration of @inode used below (presumably
 * struct bkey_i_inode_v3 *inode = bkey_i_to_inode_v3(k_mut);) appears to
 * have been lost between the v3 conversion and its first use — confirm
 * against the full file.
 */
static inline int bch2_extent_update_i_size_sectors(struct btree_trans *trans,
						    struct btree_iter *extent_iter,
						    u64 new_i_size,
						    s64 i_sectors_delta)
{
	/*
	 * Crazy performance optimization:
	 * Every extent update needs to also update the inode: the inode trigger
	 * will set bi->journal_seq to the journal sequence number of this
	 * transaction - for fsync.
	 *
	 * But if that's the only reason we're updating the inode (we're not
	 * updating bi_size or bi_sectors), then we don't need the inode update
	 * to be journalled - if we crash, the bi_journal_seq update will be
	 * lost, but that's fine.
	 */
	unsigned inode_update_flags = BTREE_UPDATE_nojournal;
	struct btree_iter iter;
	struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes,
			      SPOS(0,
				   extent_iter->pos.inode,
				   extent_iter->snapshot),
			      BTREE_ITER_intent|
			      BTREE_ITER_cached);
	int ret = bkey_err(k);
	if (unlikely(ret))
		return ret;

	/*
	 * varint_decode_fast(), in the inode .invalid method, reads up to 7
	 * bytes past the end of the buffer:
	 */
	struct bkey_i *k_mut = bch2_trans_kmalloc_nomemzero(trans, bkey_bytes(k.k) + 8);
	ret = PTR_ERR_OR_ZERO(k_mut);
	if (unlikely(ret))
		goto err;

	bkey_reassemble(k_mut, k);

	/* Code below requires the v3 inode format — convert older inodes: */
	if (unlikely(k_mut->k.type != KEY_TYPE_inode_v3)) {
		k_mut = bch2_inode_to_v3(trans, k_mut);
		ret = PTR_ERR_OR_ZERO(k_mut);
		if (unlikely(ret))
			goto err;
	}

	/*
	 * extents, dirents and xattrs updates require that an inode update also
	 * happens - to ensure that if a key exists in one of those btrees with
	 * a given snapshot ID an inode is also present - so we may have to skip
	 * the nojournal optimization:
	 */
	if (inode->k.p.snapshot != iter.snapshot) {
		inode->k.p.snapshot = iter.snapshot;
		inode_update_flags = 0;
	}
/*
 * NOTE(review): fragment — interior of what looks like bch2_extent_update();
 * locals (next_pos, usage_increasing, i_sectors_delta, disk_sectors_delta,
 * disk_res, check_enospc, new_i_size) are declared outside this excerpt.
 */
	/*
	 * This traverses us the iterator without changing iter->path->pos to
	 * search_key() (which is pos + 1 for extents): we want there to be a
	 * path already traversed at iter->pos because
	 * bch2_trans_extent_update() will use it to attempt extent merging
	 */
	ret = __bch2_btree_iter_traverse(trans, iter);
	if (ret)
		return ret;

	/* Clip the new extent so the whole update fits in one transaction: */
	ret = bch2_extent_trim_atomic(trans, iter, k);
	if (ret)
		return ret;

	next_pos = k->k.p;

	ret = bch2_sum_sector_overwrites(trans, iter, k,
					 &usage_increasing,
					 &i_sectors_delta,
					 &disk_sectors_delta);
	if (ret)
		return ret;

	/* Top up the disk reservation if the overwrite needs more space: */
	if (disk_res &&
	    disk_sectors_delta > (s64) disk_res->sectors) {
		ret = bch2_disk_reservation_add(trans->c, disk_res,
					disk_sectors_delta - disk_res->sectors,
					!check_enospc || !usage_increasing
					? BCH_DISK_RESERVATION_NOFAIL : 0);
		if (ret)
			return ret;
	}

	/*
	 * Note:
	 * We always have to do an inode update - even when i_size/i_sectors
	 * aren't changing - for fsync to work properly; fsync relies on
	 * inode->bi_journal_seq which is updated by the trigger code:
	 */
	ret =   bch2_extent_update_i_size_sectors(trans, iter,
					min(k->k.p.offset << 9, new_i_size),
					i_sectors_delta) ?:
		bch2_trans_update(trans, iter, k, 0) ?:
		bch2_trans_commit(trans, disk_res, NULL,
				  BCH_TRANS_COMMIT_no_check_rw|
				  BCH_TRANS_COMMIT_no_enospc);
	if (unlikely(ret))
		return ret;
	/*
	 * NOTE(review): fragment — loop body from the default index-update
	 * path; the enclosing loop, sk, trans and inum are declared outside
	 * this excerpt.
	 */
	k = bch2_keylist_front(keys);
	bch2_bkey_buf_copy(&sk, c, k);

	/* Refresh the key's snapshot from the subvolume before insertion: */
	ret = bch2_subvolume_get_snapshot(trans, inum.subvol,
					  &sk.k->k.p.snapshot);
	if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
		continue;
	if (ret)
		break;
/**
 * __bch2_write_index - after a write, update index to point to new data
 * @op: bch_write_op to process
 *
 * NOTE(review): fragment — the function body continues past the end of
 * this excerpt (err label, keylist accounting not visible here).
 */
static void __bch2_write_index(struct bch_write_op *op)
{
	struct bch_fs *c = op->c;
	struct keylist *keys = &op->insert_keys;
	unsigned dev;
	int ret = 0;

	/* Drop pointers to devices whose writes failed before indexing: */
	if (unlikely(op->flags & BCH_WRITE_io_error)) {
		ret = bch2_write_drop_io_error_ptrs(op);
		if (ret)
			goto err;
	}

	if (!bch2_keylist_empty(keys)) {
		u64 sectors_start = keylist_sectors(keys);

		/* Move (data update) writes index differently from normal writes: */
		ret = !(op->flags & BCH_WRITE_move)
			? bch2_write_index_default(op)
			: bch2_data_update_index_update(op);
	/*
	 * NOTE(review): fragment — this worker loop appears spliced in from a
	 * different function (a write point's index-update worker); wp is not
	 * declared in the surrounding excerpt. Confirm against the full file.
	 */
	while (1) {
		/* Pop the next pending write op off this write point's list: */
		spin_lock_irq(&wp->writes_lock);
		op = list_pop_entry(&wp->writes, struct bch_write_op, wp_list);
		wp_update_state(wp, op != NULL);
		spin_unlock_irq(&wp->writes_lock);

		if (!op)
			break;

		op->flags |= BCH_WRITE_in_worker;

		__bch2_write_index(op);

		/* Resume the write if it still has IO to submit: */
		if (!(op->flags & BCH_WRITE_submitted))
			__bch2_write(op);
		else
			bch2_write_done(&op->cl);
	}
}
	/*
	 * NOTE(review): fragment — bio bounce-buffer setup; buf, bio, wbio and
	 * output_available are declared outside this excerpt.
	 */
	if (buf) {
		/* Caller supplied a buffer — map it directly, no bounce: */
		bch2_bio_map(bio, buf, output_available);
		return bio;
	}

	wbio->bounce = true;

	/*
	 * We can't use mempool for more than c->sb.encoded_extent_max
	 * worth of pages, but we'd like to allocate more if we can:
	 */
	bch2_bio_alloc_pages_pool(c, bio,
				  min_t(unsigned, output_available,
					c->opts.encoded_extent_max));
	/*
	 * NOTE(review): fragment — pre-encoded-data preparation; wp, csum, bio
	 * and the csum_err label are declared outside this excerpt.
	 */
	/* Can we just write the entire extent as is? */
	if (op->crc.uncompressed_size == op->crc.live_size &&
	    op->crc.uncompressed_size <= c->opts.encoded_extent_max >> 9 &&
	    op->crc.compressed_size <= wp->sectors_free &&
	    (op->crc.compression_type == bch2_compression_opt_to_type(op->compression_opt) ||
	     op->incompressible)) {
		/* Only the checksum type may need fixing up: */
		if (!crc_is_compressed(op->crc) &&
		    op->csum_type != op->crc.csum_type) {
			ret = bch2_write_rechecksum(c, op, op->csum_type);
			if (ret)
				return ret;
		}

		return 1;
	}

	/*
	 * If the data is compressed and we couldn't write the entire extent as
	 * is, we have to decompress it:
	 */
	if (crc_is_compressed(op->crc)) {
		/* Last point we can still verify checksum: */
		struct nonce nonce = extent_nonce(op->version, op->crc);
		csum = bch2_checksum_bio(c, op->crc.csum_type, nonce, bio);
		if (bch2_crc_cmp(op->crc.csum, csum) && !c->opts.no_data_io)
			goto csum_err;

		if (bch2_csum_type_is_encryption(op->crc.csum_type)) {
			ret = bch2_encrypt_bio(c, op->crc.csum_type, nonce, bio);
			if (ret)
				return ret;
			/*
			 * NOTE(review): the decompress below is nested under
			 * the encryption check here — a '}' may have been lost
			 * in extraction (decompress would normally apply to
			 * all compressed data); verify against the full file.
			 */
			ret = bch2_bio_uncompress_inplace(op, bio);
			if (ret)
				return ret;
		}
	/*
	 * No longer have compressed data after this point - data might be
	 * encrypted:
	 */

	/*
	 * If the data is checksummed and we're only writing a subset,
	 * rechecksum and adjust bio to point to currently live data:
	 */
	if (op->crc.live_size != op->crc.uncompressed_size ||
	    op->crc.csum_type != op->csum_type) {
		ret = bch2_write_rechecksum(c, op, op->csum_type);
		if (ret)
			return ret;
	}

	/*
	 * If we want to compress the data, it has to be decrypted:
	 */
	if (bch2_csum_type_is_encryption(op->crc.csum_type) &&
	    (op->compression_opt || op->crc.csum_type != op->csum_type)) {
		struct nonce nonce = extent_nonce(op->version, op->crc);

		/* Verify the checksum before decrypting in place: */
		csum = bch2_checksum_bio(c, op->crc.csum_type, nonce, bio);
		if (bch2_crc_cmp(op->crc.csum, csum) && !c->opts.no_data_io)
			goto csum_err;

		ret = bch2_encrypt_bio(c, op->crc.csum_type, nonce, bio);
		if (ret)
			return ret;
	/*
	 * NOTE(review): fragment — from the extent-write path; src, crc,
	 * version, src_len and the err label are declared outside this
	 * excerpt.
	 */
	if ((op->flags & BCH_WRITE_data_encoded) &&
	    !crc_is_compressed(crc) &&
	    bch2_csum_type_is_encryption(op->crc.csum_type) ==
	    bch2_csum_type_is_encryption(op->csum_type)) {
		u8 compression_type = crc.compression_type;
		u16 nonce = crc.nonce;

		/*
		 * Note: when we're using rechecksum(), we need to be
		 * checksumming @src because it has all the data our
		 * existing checksum covers - if we bounced (because we
		 * were trying to compress), @dst will only have the
		 * part of the data the new checksum will cover.
		 *
		 * But normally we want to be checksumming post bounce,
		 * because part of the reason for bouncing is so the
		 * data can't be modified (by userspace) while it's in
		 * flight.
		 */
		ret = bch2_rechecksum_bio(c, src, version, op->crc,
					  &crc, &op->crc,
					  src_len >> 9,
					  bio_sectors(src) - (src_len >> 9),
					  op->csum_type);
		if (ret)
			goto err;

		/*
		 * rchecksum_bio sets compression_type on crc from op->crc,
		 * this isn't always correct as sometimes we're changing
		 * an extent from uncompressed to incompressible.
		 */
		crc.compression_type = compression_type;
		crc.nonce = nonce;
	} else {
		if ((op->flags & BCH_WRITE_data_encoded) &&
		    (ret = bch2_rechecksum_bio(c, src, version, op->crc,
					NULL, &op->crc,
					src_len >> 9,
					bio_sectors(src) - (src_len >> 9),
					op->crc.csum_type)))
			goto err;
	/*
	 * NOTE(review): fragment — tail of a nocow convert-unwritten helper;
	 * trans, iter, new and new_i_size are declared outside this excerpt.
	 */
	/*
	 * Note that we're not calling bch2_subvol_get_snapshot() in this path -
	 * that was done when we kicked off the write, and here it's important
	 * that we update the extent that we wrote to - even if a snapshot has
	 * since been created. The write is still outstanding, so we're ok
	 * w.r.t. snapshot atomicity:
	 */
	return  bch2_extent_update_i_size_sectors(trans, iter,
				min(new->k.p.offset << 9, new_i_size), 0) ?:
		bch2_trans_update(trans, iter, new,
				  BTREE_UPDATE_internal_snapshot_node);
}
/*
 * Presumably converts this op's unwritten extents to written after a nocow
 * write completes — NOTE(review): fragment, the body continues past this
 * excerpt; confirm against the full file.
 */
static void bch2_nocow_write_convert_unwritten(struct bch_write_op *op)
{
	struct bch_fs *c = op->c;
	struct btree_trans *trans = bch2_trans_get(c);
	int ret = 0;
	/*
	 * NOTE(review): fragment — interior of the nocow write path; ca, i,
	 * buckets, stale, stale_at, k and the out label are declared outside
	 * this excerpt.
	 */
		int gen = bucket_gen_get(ca, i->b.offset);

		/* Negative gen is a lookup error; otherwise compare generations: */
		stale = gen < 0 ? gen : gen_after(gen, i->gen);
		if (unlikely(stale)) {
			stale_at = i;
			goto err_bucket_stale;
		}
	}

	/* fallback to cow write path? */
	if (!(op->flags & BCH_WRITE_submitted)) {
		closure_sync(&op->cl);
		__bch2_nocow_write_done(op);
		op->insert_keys.top = op->insert_keys.keys;
	} else if (op->flags & BCH_WRITE_sync) {
		closure_sync(&op->cl);
		bch2_nocow_write_done(&op->cl.work);
	} else {
		/*
		 * XXX
		 * needs to run out of process context because ei_quota_lock is
		 * a mutex
		 */
		continue_at(&op->cl, bch2_nocow_write_done, index_update_wq(op));
	}
	return;
err_get_ioref:
	/* Drop the per-device write refs we took for this attempt: */
	darray_for_each(buckets, i)
		enumerated_ref_put(&bch2_dev_have_ref(c, i->b.inode)->io_ref[WRITE],
				   BCH_DEV_WRITE_REF_io_write);

	/* Fall back to COW path: */
	goto out;
err_bucket_stale:
	/* Unlock every bucket we locked, up to and including the stale one: */
	darray_for_each(buckets, i) {
		bch2_bucket_nocow_unlock(&c->nocow_locks, i->b, BUCKET_NOCOW_LOCK_UPDATE);
		if (i == stale_at)
			break;
	}

	struct printbuf buf = PRINTBUF;
	if (bch2_fs_inconsistent_on(stale < 0, c,
				    "pointer to invalid bucket in nocow path on device %llu\n %s",
				    stale_at->b.inode,
				    (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
		ret = bch_err_throw(c, data_write_invalid_ptr);
	} else {
		/* We can retry this: */
		ret = bch_err_throw(c, transaction_restart);
	}
	printbuf_exit(&buf);
	/*
	 * NOTE(review): fragment — main open-bucket allocation/write loop of
	 * the core write path; the enclosing do { } while and several locals
	 * (wp, bio, key_to_write, trans) are declared outside this excerpt,
	 * and steps between bch2_write_extent() and the submit below appear
	 * elided by extraction — confirm against the full file.
	 */
		/* +1 for possible cache device: */
		if (op->open_buckets.nr + op->nr_replicas + 1 >
		    ARRAY_SIZE(op->open_buckets.v))
			break;

		if (bch2_keylist_realloc(&op->insert_keys,
					 op->inline_keys,
					 ARRAY_SIZE(op->inline_keys),
					 BKEY_EXTENT_U64s_MAX))
			break;

		/*
		 * The copygc thread is now global, which means it's no longer
		 * freeing up space on specific disks, which means that
		 * allocations for specific disks may hang arbitrarily long:
		 */
		ret = bch2_trans_run(c, lockrestart_do(trans,
			bch2_alloc_sectors_start_trans(trans,
				op->target,
				op->opts.erasure_code && !(op->flags & BCH_WRITE_cached),
				op->write_point,
				&op->devs_have,
				op->nr_replicas,
				op->nr_replicas_required,
				op->watermark,
				op->flags,
				&op->cl, &wp)));
		if (unlikely(ret)) {
			/* Blocked on the allocator — come back later: */
			if (bch2_err_matches(ret, BCH_ERR_operation_blocked))
				break;

			goto err;
		}

		EBUG_ON(!wp);

		bch2_open_bucket_get(c, wp, &op->open_buckets);
		ret = bch2_write_extent(op, wp, &bio);

		bch2_submit_wbio_replicas(to_wbio(bio), c, BCH_DATA_user,
					  key_to_write, false);
	} while (ret);

	/*
	 * Sync or no?
	 *
	 * If we're running asynchronously, we may still want to block
	 * synchronously here if we weren't able to submit all of the IO at
	 * once, as that signals backpressure to the caller.
	 */
	if ((op->flags & BCH_WRITE_sync) ||
	    (!(op->flags & BCH_WRITE_submitted) &&
	     !(op->flags & BCH_WRITE_in_worker))) {
		bch2_wait_on_allocator(c, &op->cl);
/**
 * bch2_write() - handle a write to a cache device or flash only volume
 * @cl: &bch_write_op->cl
 *
 * This is the starting point for any data to end up in a cache device; it could
 * be from a normal write, or a writeback write, or a write to a flash only
 * volume - it's also used by the moving garbage collector to compact data in
 * mostly empty buckets.
 *
 * It first writes the data to the cache, creating a list of keys to be inserted
 * (if the data won't fit in a single open bucket, there will be multiple keys);
 * after the data is written it calls bch_journal, and after the keys have been
 * added to the next journal write they're inserted into the btree.
 *
 * If op->discard is true, instead of inserting the data it invalidates the
 * region of the cache represented by op->bio and op->inode.
 */
CLOSURE_CALLBACK(bch2_write)
{
	closure_type(op, struct bch_write_op, cl);
	struct bio *bio = &op->wbio.bio;
	struct bch_fs *c = op->c;
	unsigned data_len;
	/* NOTE(review): body truncated at the end of this excerpt. */
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereitgestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.