/* arena offsets may be shifted from the base of the device */
offset = adjust_initial_offset(nd_btt, offset); return nvdimm_read_bytes(ndns, offset, buf, n, flags);
}
/* arena offsets may be shifted from the base of the device */
offset = adjust_initial_offset(nd_btt, offset); return nvdimm_write_bytes(ndns, offset, buf, n, flags);
}
/*
 * NOTE(review): this span of the chunk is corrupted. It contains the head of
 * btt_info_write() (the final write to arena->infooff and the closing brace
 * are missing), followed by interior fragments of what appear to be the map
 * write and map read helpers whose signatures and tails were lost during
 * extraction ('mapping', 'ze', 'lba', 'ns_off' are used with no visible
 * declarations). Code left byte-identical pending recovery of the missing
 * lines from the canonical source.
 */
staticint btt_info_write(struct arena_info *arena, struct btt_sb *super)
{ int ret;
/* * infooff and info2off should always be at least 512B aligned. * We rely on that to make sure rw_bytes does error clearing * correctly, so make sure that is the case.
*/
dev_WARN_ONCE(to_dev(arena), !IS_ALIGNED(arena->infooff, 512), "arena->infooff: %#llx is unaligned\n", arena->infooff);
dev_WARN_ONCE(to_dev(arena), !IS_ALIGNED(arena->info2off, 512), "arena->info2off: %#llx is unaligned\n", arena->info2off);
ret = arena_write_bytes(arena, arena->info2off, super, sizeof(struct btt_sb), 0); if (ret) return ret;
/*
 * NOTE(review): the text below jumps into the body of a map-write helper;
 * its function head (signature and local declarations) is missing here.
 */
/* * This 'mapping' is supposed to be just the LBA mapping, without * any flags set, so strip the flag bits.
*/
mapping = ent_lba(mapping);
ze = (z_flag << 1) + e_flag; switch (ze) { case 0: /* * We want to set neither of the Z or E flags, and * in the actual layout, this means setting the bit * positions of both to '1' to indicate a 'normal' * map entry
*/
mapping |= MAP_ENT_NORMAL; break; case 1:
mapping |= (1 << MAP_ERR_SHIFT); break; case 2:
mapping |= (1 << MAP_TRIM_SHIFT); break; default: /* * The case where Z and E are both sent in as '1' could be * construed as a valid 'normal' case, but we decide not to, * to avoid confusion
*/
dev_err_ratelimited(to_dev(arena), "Invalid use of Z and E flags\n"); return -EIO;
}
/*
 * NOTE(review): another jump -- the lines below look like the range check
 * and media read from a map-read helper; its head and tail are missing.
 */
if (unlikely(lba >= arena->external_nlba))
dev_err_ratelimited(to_dev(arena), "%s: lba %#x out of range (max: %#x)\n",
__func__, lba, arena->external_nlba);
ret = arena_read_bytes(arena, ns_off, &in, MAP_ENT_SIZE, rwb_flags); if (ret) return ret;
/*
 * Determine which of the two log slots in @log holds the older entry,
 * by comparing their sequence numbers.
 *
 * Returns the sub-slot index (0 or 1) of the older entry, or -EINVAL
 * if the pair of sequence numbers is inconsistent.
 *
 * TODO The logic feels a bit kludge-y. make it better..
 */
static int btt_log_get_old(struct arena_info *a, struct log_group *log)
{
	int idx0 = a->log_index[0];
	int idx1 = a->log_index[1];
	int old;

	/*
	 * The first ever time this group is seen, the entry goes into [0];
	 * the next time, the logic below works out to put the following
	 * entry into [1].
	 */
	if (log_seq(log, idx0) == 0) {
		log->ent[idx0].seq = cpu_to_le32(1);
		return 0;
	}

	/* equal sequence numbers, or a sum too large to be valid: corrupt */
	if (log_seq(log, idx0) == log_seq(log, idx1))
		return -EINVAL;
	if (log_seq(log, idx0) + log_seq(log, idx1) > 5)
		return -EINVAL;

	if (log_seq(log, idx0) < log_seq(log, idx1))
		old = (log_seq(log, idx1) - log_seq(log, idx0) == 1) ? 0 : 1;
	else
		old = (log_seq(log, idx0) - log_seq(log, idx1) == 1) ? 1 : 0;

	return old;
}
/*
 * Copy the desired (old/new) log entry for @lane into @ent, if @ent is
 * not NULL. Returns the sub-slot number (0 or 1) where the desired log
 * entry was found; negative return values indicate errors.
 */
static int btt_log_read(struct arena_info *arena, u32 lane,
			struct log_entry *ent, int old_flag)
{
	struct log_group log;
	int old_ent, ret_ent;
	int ret;

	ret = btt_log_group_read(arena, lane, &log);
	if (ret)
		return -EIO;

	old_ent = btt_log_get_old(arena, &log);
	if (old_ent < 0 || old_ent > 1) {
		dev_err(to_dev(arena),
			"log corruption (%d): lane %d seq [%d, %d]\n",
			old_ent, lane, log.ent[arena->log_index[0]].seq,
			log.ent[arena->log_index[1]].seq);
		/* TODO set error state? */
		return -EIO;
	}

	/* old_flag selects the older slot; otherwise take the newer one */
	ret_ent = old_flag ? old_ent : 1 - old_ent;

	if (ent)
		memcpy(ent, &log.ent[arena->log_index[ret_ent]], LOG_ENT_SIZE);

	return ret_ent;
}
/*
 * This function commits a log entry to media.
 * It does _not_ prepare the freelist entry for the next write;
 * btt_flog_write() is the wrapper that also updates the freelist elements.
 *
 * NOTE(review): the tail of __btt_log_write() and the head of
 * btt_flog_write() were missing from this chunk (the two bodies had fused
 * into one). Restored to match the upstream driver -- confirm against the
 * canonical source tree.
 */
static int __btt_log_write(struct arena_info *arena, u32 lane,
			u32 sub, struct log_entry *ent, unsigned long flags)
{
	int ret;
	u32 group_slot = arena->log_index[sub];
	unsigned int log_half = LOG_ENT_SIZE / 2;
	void *src = ent;
	u64 ns_off;

	ns_off = arena->logoff + (lane * LOG_GRP_SIZE) +
			(group_slot * LOG_ENT_SIZE);
	/* split the 16B write into atomic, durable halves */
	ret = arena_write_bytes(arena, ns_off, src, log_half, flags);
	if (ret)
		return ret;

	ns_off += log_half;
	src += log_half;
	return arena_write_bytes(arena, ns_off, src, log_half, flags);
}

/*
 * Commit a log entry and then advance this lane's freelist bookkeeping
 * (alternate sub-slot, bump the wrapping sequence number, record the
 * newly-freed block and any error flag carried on the old mapping).
 */
static int btt_flog_write(struct arena_info *arena, u32 lane, u32 sub,
			struct log_entry *ent)
{
	int ret;

	ret = __btt_log_write(arena, lane, sub, ent, NVDIMM_IO_ATOMIC);
	if (ret)
		return ret;

	/* prepare the next free entry */
	arena->freelist[lane].sub = 1 - arena->freelist[lane].sub;
	/* seq cycles 1 -> 2 -> 3 -> 1; it never returns to 0 after first use */
	if (++(arena->freelist[lane].seq) == 4)
		arena->freelist[lane].seq = 1;
	if (ent_e_flag(le32_to_cpu(ent->old_map)))
		arena->freelist[lane].has_err = 1;
	arena->freelist[lane].block = ent_lba(le32_to_cpu(ent->old_map));

	return ret;
}
/*
 * This function initializes the BTT map to the initial state, which is
 * all-zeroes, and indicates an identity mapping.
 *
 * NOTE(review): the loop tail and cleanup of this function were missing
 * from the chunk; restored to match the upstream driver -- confirm
 * against the canonical source tree.
 */
static int btt_map_init(struct arena_info *arena)
{
	int ret = -EINVAL;
	void *zerobuf;
	size_t offset = 0;
	size_t chunk_size = SZ_2M;
	size_t mapsize = arena->logoff - arena->mapoff;

	zerobuf = kzalloc(chunk_size, GFP_KERNEL);
	if (!zerobuf)
		return -ENOMEM;

	/*
	 * mapoff should always be at least 512B aligned. We rely on that to
	 * make sure rw_bytes does error clearing correctly, so make sure that
	 * is the case.
	 */
	dev_WARN_ONCE(to_dev(arena), !IS_ALIGNED(arena->mapoff, 512),
		"arena->mapoff: %#llx is unaligned\n", arena->mapoff);

	while (mapsize) {
		size_t size = min(mapsize, chunk_size);

		dev_WARN_ONCE(to_dev(arena), size < 512,
			"chunk size: %#zx is unaligned\n", size);
		ret = arena_write_bytes(arena, arena->mapoff + offset, zerobuf,
				size, 0);
		if (ret)
			goto free;

		offset += size;
		mapsize -= size;
		cond_resched();
	}

 free:
	kfree(zerobuf);
	return ret;
}
/* * This function initializes the BTT log with 'fake' entries pointing * to the initial reserved set of blocks as being free
*/ staticint btt_log_init(struct arena_info *arena)
{
size_t logsize = arena->info2off - arena->logoff;
size_t chunk_size = SZ_4K, offset = 0; struct log_entry ent; void *zerobuf; int ret;
u32 i;
zerobuf = kzalloc(chunk_size, GFP_KERNEL); if (!zerobuf) return -ENOMEM; /* * logoff should always be at least 512B aligned. We rely on that to * make sure rw_bytes does error clearing correctly, so make sure that * is the case.
*/
dev_WARN_ONCE(to_dev(arena), !IS_ALIGNED(arena->logoff, 512), "arena->logoff: %#llx is unaligned\n", arena->logoff);
while (logsize) {
size_t size = min(logsize, chunk_size);
dev_WARN_ONCE(to_dev(arena), size < 512, "chunk size: %#zx is unaligned\n", size);
ret = arena_write_bytes(arena, arena->logoff + offset, zerobuf,
size, 0); if (ret) goto free;
arena->freelist = kcalloc(arena->nfree, sizeof(struct free_entry),
GFP_KERNEL); if (!arena->freelist) return -ENOMEM;
for (i = 0; i < arena->nfree; i++) { new = btt_log_read(arena, i, &log_new, LOG_NEW_ENT); if (new < 0) returnnew;
/* old and new map entries with any flags stripped out */
log_oldmap = ent_lba(le32_to_cpu(log_new.old_map));
log_newmap = ent_lba(le32_to_cpu(log_new.new_map));
/* sub points to the next one to be overwritten */
arena->freelist[i].sub = 1 - new;
arena->freelist[i].seq = nd_inc_seq(le32_to_cpu(log_new.seq));
arena->freelist[i].block = log_oldmap;
/* * FIXME: if error clearing fails during init, we want to make * the BTT read-only
*/ if (ent_e_flag(le32_to_cpu(log_new.old_map)) &&
!ent_normal(le32_to_cpu(log_new.old_map))) {
arena->freelist[i].has_err = 1;
ret = arena_clear_freelist_error(arena, i); if (ret)
dev_err_ratelimited(to_dev(arena), "Unable to clear known errors\n");
}
/* This implies a newly created or untouched flog entry */ if (log_oldmap == log_newmap) continue;
/* Check if map recovery is needed */
ret = btt_map_read(arena, le32_to_cpu(log_new.lba), &map_entry,
NULL, NULL, 0); if (ret) return ret;
/* * The map_entry from btt_read_map is stripped of any flag bits, * so use the stripped out versions from the log as well for * testing whether recovery is needed. For restoration, use the * 'raw' version of the log entries as that captured what we * were going to write originally.
*/ if ((log_newmap != map_entry) && (log_oldmap == map_entry)) { /* * Last transaction wrote the flog, but wasn't able * to complete the map write. So fix up the map.
*/
ret = btt_map_write(arena, le32_to_cpu(log_new.lba),
le32_to_cpu(log_new.new_map), 0, 0, 0); if (ret) return ret;
}
}
/*
 * Detecting valid log indices: We read a log group (see the comments in btt.h
 * for a description of a 'log_group' and its 'slots'), and iterate over its
 * four slots. We expect that a padding slot will be all-zeroes, and use this
 * to detect a padding slot vs. an actual entry.
 *
 * If a log_group is in the initial state, i.e. hasn't been used since the
 * creation of this BTT layout, it will have three of the four slots with
 * zeroes. We skip over these log_groups for the detection of log_index. If
 * all log_groups are in the initial state (i.e. the BTT has never been
 * written to), it is safe to assume the 'new format' of log entries in slots
 * (0, 1).
 *
 * NOTE(review): the final index assignment and return were missing from
 * this chunk; restored to match the upstream driver -- confirm against
 * the canonical source tree.
 */
static int log_set_indices(struct arena_info *arena)
{
	bool idx_set = false, initial_state = true;
	int ret, log_index[2] = {-1, -1};
	u32 i, j, next_idx = 0;
	struct log_group log;
	u32 pad_count = 0;

	for (i = 0; i < arena->nfree; i++) {
		ret = btt_log_group_read(arena, i, &log);
		if (ret < 0)
			return ret;

		for (j = 0; j < 4; j++) {
			if (!idx_set) {
				if (ent_is_padding(&log.ent[j])) {
					pad_count++;
					continue;
				} else {
					/* Skip if index has been recorded */
					if ((next_idx == 1) &&
						(j == log_index[0]))
						continue;
					/* valid entry, record index */
					log_index[next_idx] = j;
					next_idx++;
				}
				if (next_idx == 2) {
					/* two valid entries found */
					idx_set = true;
				} else if (next_idx > 2) {
					/* too many valid indices */
					return -ENXIO;
				}
			} else {
				/*
				 * once the indices have been set, just verify
				 * that all subsequent log groups are either in
				 * their initial state or follow the same
				 * indices.
				 */
				if (j == log_index[0]) {
					/* entry must be 'valid' */
					if (ent_is_padding(&log.ent[j]))
						return -ENXIO;
				} else if (j == log_index[1]) {
					;
					/*
					 * log_index[1] can be padding if the
					 * lane never got used and it is still
					 * in the initial state (three 'padding'
					 * entries)
					 */
				} else {
					/* entry must be invalid (padding) */
					if (!ent_is_padding(&log.ent[j]))
						return -ENXIO;
				}
			}
		}
		/*
		 * If any of the log_groups have more than one valid,
		 * non-padding entry, then the we are no longer in the
		 * initial_state
		 */
		if (pad_count < 3)
			initial_state = false;
		pad_count = 0;
	}

	if (!initial_state && !idx_set)
		return -ENXIO;

	/*
	 * If all the entries in the log were in the initial state,
	 * assume new padding scheme
	 */
	if (initial_state)
		log_index[1] = 1;

	/*
	 * Only allow the known permutations of log/padding indices,
	 * i.e. (0, 1), and (0, 2)
	 */
	if ((log_index[0] == 0) && ((log_index[1] == 1) || (log_index[1] == 2)))
		; /* known index possibilities */
	else {
		dev_err(to_dev(arena), "Found an unknown padding scheme\n");
		return -ENXIO;
	}

	arena->log_index[0] = log_index[0];
	arena->log_index[1] = log_index[1];

	return 0;
}
/*
 * NOTE(review): only the head of btt_arena_write_layout() survived in this
 * chunk -- the superblock population, checksum, info-block write and the
 * kfree/return tail are missing. Code left byte-identical pending recovery
 * of the missing lines from the canonical source.
 */
/* * This function completes arena initialization by writing * all the metadata. * It is only called for an uninitialized arena when a write * to that arena occurs for the first time.
*/ staticint btt_arena_write_layout(struct arena_info *arena)
{ int ret;
u64 sum; struct btt_sb *super; struct nd_btt *nd_btt = arena->nd_btt; const uuid_t *parent_uuid = nd_dev_to_uuid(&nd_btt->ndns->dev);
ret = btt_map_init(arena); if (ret) return ret;
ret = btt_log_init(arena); if (ret) return ret;
/* GFP_NOIO: this runs on the first write to the arena, i.e. in the IO path */
super = kzalloc(sizeof(*super), GFP_NOIO); if (!super) return -ENOMEM;
/*
 * This function completes the initialization for the BTT namespace
 * such that it is ready to accept IOs.
 *
 * NOTE(review): the unlock tail of this function was missing from the
 * chunk; restored to match the upstream driver -- confirm against the
 * canonical source tree.
 */
static int btt_meta_init(struct btt *btt)
{
	int ret = 0;
	struct arena_info *arena;

	mutex_lock(&btt->init_lock);
	list_for_each_entry(arena, &btt->arena_list, list) {
		ret = btt_arena_write_layout(arena);
		if (ret)
			goto unlock;

		ret = btt_freelist_init(arena);
		if (ret)
			goto unlock;

		ret = btt_rtt_init(arena);
		if (ret)
			goto unlock;

		ret = btt_maplocks_init(arena);
		if (ret)
			goto unlock;
	}

	btt->init_state = INIT_READY;

 unlock:
	mutex_unlock(&btt->init_lock);
	return ret;
}
/*
 * NOTE(review): only the head of lba_to_arena() survived in this chunk --
 * the arena-list walk that subtracts each arena's external_nlba and fills
 * in *premap/*arena is missing, along with the closing brace. Code left
 * byte-identical pending recovery of the missing lines.
 */
/* * This function calculates the arena in which the given LBA lies * by doing a linear walk. This is acceptable since we expect only * a few arenas. If we have backing devices that get much larger, * we can construct a balanced binary tree of arenas at init time * so that this range search becomes faster.
*/ staticint lba_to_arena(struct btt *btt, sector_t sector, __u32 *premap, struct arena_info **arena)
{ struct arena_info *arena_list;
__u64 lba = div_u64(sector << SECTOR_SHIFT, btt->sector_size);
/*
 * The following (lock_map, unlock_map) are mostly just to improve
 * readability, since they index into an array of locks.
 *
 * NOTE(review): the spin_lock call and closing brace were missing from
 * this chunk; restored to match the upstream driver -- confirm against
 * the canonical source tree.
 */
static void lock_map(struct arena_info *arena, u32 premap)
		__acquires(&arena->map_locks[idx].lock)
{
	/* spread premap entries across nfree striped locks, cacheline-wise */
	u32 idx = (premap * MAP_ENT_SIZE / L1_CACHE_BYTES) % arena->nfree;

	spin_lock(&arena->map_locks[idx].lock);
}
/*
 * NOTE(review): this is the interior of an integrity read/write helper
 * (presumably btt_rw_integrity -- TODO confirm); its signature and the
 * declarations of 'len', 'ret', 'rw' and 'meta_nsoff', as well as the
 * final return and closing brace, are missing from this chunk. Code left
 * byte-identical pending recovery of the missing lines.
 */
while (len) { unsignedint cur_len; struct bio_vec bv; void *mem;
bv = bvec_iter_bvec(bip->bip_vec, bip->bip_iter); /* * The 'bv' obtained from bvec_iter_bvec has its .bv_len and * .bv_offset already adjusted for iter->bi_bvec_done, and we * can use those directly
*/
cur_len = min(len, bv.bv_len);
mem = bvec_kmap_local(&bv); if (rw)
ret = arena_write_bytes(arena, meta_nsoff, mem, cur_len,
NVDIMM_IO_ATOMIC); else
ret = arena_read_bytes(arena, meta_nsoff, mem, cur_len,
NVDIMM_IO_ATOMIC);
/* unmap before any early return so the kmap is always balanced */
kunmap_local(mem); if (ret) return ret;
len -= cur_len;
meta_nsoff += cur_len; if (!bvec_iter_advance(bip->bip_vec, &bip->bip_iter, cur_len)) return -EIO;
}
/*
 * NOTE(review): btt_read_pg() is truncated in this chunk -- the tail of
 * the retry loop (RTT invalidation, re-check of the map entry, lane
 * release, len/off/sector advancement, out_rtt/out_lane labels and the
 * final return) is missing. Code left byte-identical pending recovery of
 * the missing lines.
 */
staticint btt_read_pg(struct btt *btt, struct bio_integrity_payload *bip, struct page *page, unsignedint off, sector_t sector, unsignedint len)
{ int ret = 0; int t_flag, e_flag; struct arena_info *arena = NULL;
u32 lane = 0, premap, postmap;
while (len) {
u32 cur_len;
lane = nd_region_acquire_lane(btt->nd_region);
ret = lba_to_arena(btt, sector, &premap, &arena); if (ret) goto out_lane;
cur_len = min(btt->sector_size, len);
ret = btt_map_read(arena, premap, &postmap, &t_flag, &e_flag,
NVDIMM_IO_ATOMIC); if (ret) goto out_lane;
/* * We loop to make sure that the post map LBA didn't change * from under us between writing the RTT and doing the actual * read.
*/ while (1) {
u32 new_map; int new_t, new_e;
/* trim flag set: the block was discarded, return zeroes */
if (t_flag) {
zero_fill_data(page, off, cur_len); goto out_lane;
}
/* error flag set: a known media error is recorded for this block */
if (e_flag) {
ret = -EIO; goto out_lane;
}
arena->rtt[lane] = RTT_VALID | postmap; /* * Barrier to make sure this write is not reordered * to do the verification map_read before the RTT store
*/
barrier();
ret = btt_map_read(arena, premap, &new_map, &new_t,
&new_e, NVDIMM_IO_ATOMIC); if (ret) goto out_rtt;
ret = btt_data_read(arena, page, off, postmap, cur_len); if (ret) { /* Media error - set the e_flag */ if (btt_map_write(arena, premap, postmap, 0, 1, NVDIMM_IO_ATOMIC))
dev_warn_ratelimited(to_dev(arena), "Error persistently tracking bad blocks at %#x\n",
premap); goto out_rtt;
}
if (bip) {
ret = btt_rw_integrity(btt, bip, arena, postmap, READ); if (ret) goto out_rtt;
}
/*
 * NOTE(review): btt_is_badblock() is truncated in this chunk -- the final
 * is_bad_pmem() call and closing brace are missing. Code left
 * byte-identical pending recovery of the missing lines.
 */
/* * Normally, arena_{read,write}_bytes will take care of the initial offset * adjustment, but in the case of btt_is_badblock, where we query is_bad_pmem, * we need the final, raw namespace offset here
*/ staticbool btt_is_badblock(struct btt *btt, struct arena_info *arena,
u32 postmap)
{
u64 nsoff = adjust_initial_offset(arena->nd_btt,
to_namespace_offset(arena, postmap));
/* convert the byte offset to a 512B sector for the badblocks query */
sector_t phys_sector = nsoff >> 9;
/*
 * NOTE(review): only the kernel-doc header and the first lines of
 * btt_init() survived here; the text then jumps into the middle of what
 * appears to be nvdimm_namespace_attach_btt() (its head and tail are
 * missing, and 'ndns', 'btt_sb' and 'size' are used with no visible
 * declarations). Code left byte-identical pending recovery of the
 * missing lines.
 */
/** * btt_init - initialize a block translation table for the given device * @nd_btt: device with BTT geometry and backing device info * @rawsize: raw size in bytes of the backing device * @lbasize: lba size of the backing device * @uuid: A uuid for the backing device - this is stored on media * @nd_region: &struct nd_region for the REGION device * * Initialize a Block Translation Table on a backing device to provide * single sector power fail atomicity. * * Context: * Might sleep. * * Returns: * Pointer to a new struct btt on success, NULL on failure.
*/ staticstruct btt *btt_init(struct nd_btt *nd_btt, unsignedlonglong rawsize,
u32 lbasize, uuid_t *uuid, struct nd_region *nd_region)
{ int ret; struct btt *btt; struct nd_namespace_io *nsio; struct device *dev = &nd_btt->dev;
btt = devm_kzalloc(dev, sizeof(struct btt), GFP_KERNEL); if (!btt) return NULL;
/*
 * NOTE(review): the lines below belong to a different, attach-time
 * function whose signature is missing from this chunk.
 */
/* * If this returns < 0, that is ok as it just means there wasn't * an existing BTT, and we're creating a new one. We still need to * call this as we need the version dependent fields in nd_btt to be * set correctly based on the holder class
*/
nd_btt_version(nd_btt, ndns, btt_sb);
rawsize = size - nd_btt->initial_offset; if (rawsize < ARENA_MIN_SIZE) {
dev_dbg(&nd_btt->dev, "%s must be at least %ld bytes\n",
dev_name(&ndns->dev),
ARENA_MIN_SIZE + nd_btt->initial_offset); return -ENXIO;
}
nd_region = to_nd_region(nd_btt->dev.parent);
btt = btt_init(nd_btt, rawsize, nd_btt->lbasize, nd_btt->uuid,
nd_region); if (!btt) return -ENOMEM;
nd_btt->btt = btt;
/*
 * NOTE(review): the trailing text below is website boilerplate picked up
 * during extraction (a German disclaimer: the information on this web page
 * was compiled to the best of our knowledge, but no guarantee is given for
 * its completeness, correctness, or quality; the colour syntax display and
 * the measurement are still experimental). It is not part of the driver
 * source; kept verbatim but commented out so it cannot be parsed as code.
 *
 * Die Informationen auf dieser Webseite wurden
 * nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
 * noch Qualität der bereit gestellten Informationen zugesichert.
 * Bemerkung:
 * Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.
 */