/*
 * Snapshots in ceph are driven in large part by cooperation from the
 * client.  In contrast to local file systems or file servers that
 * implement snapshots at a single point in the system, ceph's
 * distributed access to storage requires clients to help decide
 * whether a write logically occurs before or after a recently created
 * snapshot.
 *
 * This provides a perfect instantaneous client-wide snapshot.  Between
 * clients, however, snapshots may appear to be applied at slightly
 * different points in time, depending on delays in delivering the
 * snapshot notification.
 *
 * Snapshots are _not_ file system-wide.  Instead, each snapshot
 * applies to the subdirectory nested beneath some directory.  This
 * effectively divides the hierarchy into multiple "realms," where all
 * of the files contained by each realm share the same set of
 * snapshots.  An individual realm's snap set contains snapshots
 * explicitly created on that realm, as well as any snaps in its
 * parent's snap set _after_ the point at which the parent became its
 * parent (due to, say, a rename).  Similarly, snaps from prior parents
 * during the time intervals during which they were the parent are included.
 *
 * The client is spared most of this detail, fortunately... it must only
 * maintain a hierarchy of realms reflecting the current parent/child
 * realm relationship, and for each realm have an explicit list of snaps
 * inherited from prior parents.
 *
 * A snap_realm struct is maintained for realms containing every inode
 * with an open cap in the system.  (The needed snap realm information is
 * provided by the MDS whenever a cap is issued, i.e., on open.)  A 'seq'
 * version number is used to ensure that as realm parameters change (new
 * snapshot, new parent, etc.) the client's realm hierarchy is updated.
 *
 * The realm hierarchy drives the generation of a 'snap context' for each
 * realm, which simply lists the resulting set of snaps for the realm.  This
 * is attached to any writes sent to OSDs.
 */
/*
 * Unfortunately error handling is a bit mixed here.  If we get a snap
 * update, but don't have enough memory to update our realm hierarchy,
 * it's not clear what we can do about it (besides complaining to the
 * console).
 */
/*
 * Take a reference on @realm.
 *
 * The caller must hold snap_rwsem.
 */
void ceph_get_snap_realm(struct ceph_mds_client *mdsc,
			 struct ceph_snap_realm *realm)
{
	lockdep_assert_held(&mdsc->snap_rwsem);

	/*
	 * Common case: the count is already non-zero, so it can be bumped
	 * without touching snap_empty_lock.
	 */
	if (!atomic_inc_not_zero(&realm->nref)) {
		/*
		 * The 0->1 (and 1->0) transitions must be made atomically
		 * with respect to snap_empty_lock, so take the spinlock,
		 * do the increment, and unlink the realm from the list it
		 * is on via empty_item if we were the one to revive it.
		 */
		spin_lock(&mdsc->snap_empty_lock);
		if (atomic_inc_return(&realm->nref) == 1)
			list_del_init(&realm->empty_item);
		spin_unlock(&mdsc->snap_empty_lock);
	}
}
/*
 * Allocate a snap realm rooted at @ino, initialise its list heads and
 * its inodes_with_caps_lock, insert it into mdsc->snap_realms and bump
 * mdsc->num_snap_realms.
 *
 * @mdsc: MDS client that owns the realm tree
 * @ino:  inode number the new realm is rooted at
 *
 * The initial reference count is 1, or 2 for CEPH_INO_GLOBAL_SNAPREALM.
 * Returns ERR_PTR(-ENOMEM) if the allocation fails.
 *
 * caller must hold snap_rwsem for write.
 *
 * NOTE(review): "staticstruct" looks like a fused "static struct".
 * NOTE(review): as it appears here, the body ends by dropping a
 * reference (destroying the realm when the count reaches zero) and has
 * no return statement, although the function returns a pointer.  The
 * tail looks like it belongs to a different helper -- TODO confirm
 * against the upstream file before relying on this block.
 */
staticstruct ceph_snap_realm *ceph_create_snap_realm( struct ceph_mds_client *mdsc,
u64 ino)
{ struct ceph_snap_realm *realm;
lockdep_assert_held_write(&mdsc->snap_rwsem);
realm = kzalloc(sizeof(*realm), GFP_NOFS); if (!realm) return ERR_PTR(-ENOMEM);
/* Do not release the global dummy snaprealm until unmounting */ if (ino == CEPH_INO_GLOBAL_SNAPREALM)
atomic_set(&realm->nref, 2); else
atomic_set(&realm->nref, 1);
realm->ino = ino;
INIT_LIST_HEAD(&realm->children);
INIT_LIST_HEAD(&realm->child_item);
INIT_LIST_HEAD(&realm->empty_item);
INIT_LIST_HEAD(&realm->dirty_item);
INIT_LIST_HEAD(&realm->rebuild_item);
INIT_LIST_HEAD(&realm->inodes_with_caps);
spin_lock_init(&realm->inodes_with_caps_lock);
__insert_snap_realm(&mdsc->snap_realms, realm);
mdsc->num_snap_realms++;
/*
 * We do not require the snap_empty_lock here, as any caller that
 * increments the value must hold the snap_rwsem.
 */ if (atomic_dec_and_test(&realm->nref))
__destroy_snap_realm(mdsc, realm);
}
/* * See comments in ceph_get_snap_realm. Caller needn't hold any locks.
*/ void ceph_put_snap_realm(struct ceph_mds_client *mdsc, struct ceph_snap_realm *realm)
{ if (!atomic_dec_and_lock(&realm->nref, &mdsc->snap_empty_lock)) return;
/* * Clean up any realms whose ref counts have dropped to zero. Note * that this does not include realms who were created but not yet * used. * * Called under snap_rwsem (write)
*/ staticvoid __cleanup_empty_realms(struct ceph_mds_client *mdsc)
{ struct ceph_snap_realm *realm;
/* * adjust the parent realm of a given @realm. adjust child list, and parent * pointers, and ref counts appropriately. * * return true if parent was changed, 0 if unchanged, <0 on error. * * caller must hold snap_rwsem for write.
*/ staticint adjust_snap_realm_parent(struct ceph_mds_client *mdsc, struct ceph_snap_realm *realm,
u64 parentino)
{ struct ceph_client *cl = mdsc->fsc->client; struct ceph_snap_realm *parent;
/*
 * build the snap context for a given realm.
 *
 * @mdsc:         MDS client (used here only to reach the ceph_client for
 *                logging)
 * @realm:        realm whose cached_context is being (re)built
 * @realm_queue:  list to which the parent is added (at the head) when the
 *                parent has no cached_context yet
 * @dirty_realms: not referenced in the visible body
 *
 * Returns 1 if the parent must be built first, 0 if the cached context
 * is already current, and a negative errno via the fail path.
 *
 * NOTE(review): "staticint" and "(unsignedint)" look like fused
 * "static int" / "(unsigned int)".
 * NOTE(review): after the snap vector is filled in, control runs
 * straight into the fail: label, so the new snapc is never installed
 * and err (-ENOMEM) is returned.  Lines appear to be missing between
 * the last "num +=" and "fail:" -- TODO confirm against upstream.
 */
staticint build_snap_context(struct ceph_mds_client *mdsc, struct ceph_snap_realm *realm, struct list_head *realm_queue, struct list_head *dirty_realms)
{ struct ceph_client *cl = mdsc->fsc->client; struct ceph_snap_realm *parent = realm->parent; struct ceph_snap_context *snapc; int err = 0;
/* upper bound on the snap count: our own snaps plus prior-parent snaps */
u32 num = realm->num_prior_parent_snaps + realm->num_snaps;
/*
 * build parent context, if it hasn't been built.
 * conservatively estimate that all parent snaps might be
 * included by us.
 */ if (parent) { if (!parent->cached_context) { /* add to the queue head */
list_add(&parent->rebuild_item, realm_queue); return 1;
}
num += parent->cached_context->num_snaps;
}
/* do i actually need to update?  not if my context seq matches realm seq, and my parent's does too.  (this works because rebuild_snap_realms() works _downward_ in
hierarchy after each update.) */ if (realm->cached_context &&
realm->cached_context->seq == realm->seq &&
(!parent ||
realm->cached_context->seq >= parent->cached_context->seq)) {
doutc(cl, "%llx %p: %p seq %lld (%u snaps) (unchanged)\n",
realm->ino, realm, realm->cached_context,
realm->cached_context->seq,
(unsignedint)realm->cached_context->num_snaps); return 0;
}
/* alloc new snap context; refuse counts whose byte size would overflow */
err = -ENOMEM; if (num > (SIZE_MAX - sizeof(*snapc)) / sizeof(u64)) goto fail;
snapc = ceph_create_snap_context(num, GFP_NOFS); if (!snapc) goto fail;
/* build (reverse sorted) snap vector */
num = 0;
snapc->seq = realm->seq; if (parent) {
u32 i;
/* include any of parent's snaps occurring _after_ my
parent became my parent */ for (i = 0; i < parent->cached_context->num_snaps; i++) if (parent->cached_context->snaps[i] >=
realm->parent_since)
snapc->snaps[num++] =
parent->cached_context->snaps[i]; if (parent->cached_context->seq > snapc->seq)
snapc->seq = parent->cached_context->seq;
}
/* append our own snaps, then the snaps inherited from prior parents */
memcpy(snapc->snaps + num, realm->snaps, sizeof(u64)*realm->num_snaps);
num += realm->num_snaps;
memcpy(snapc->snaps + num, realm->prior_parent_snaps, sizeof(u64)*realm->num_prior_parent_snaps);
num += realm->num_prior_parent_snaps;
/* NOTE(review): no success return is visible before this label -- see header */
fail: /*
 * if we fail, clear old (incorrect) cached_context... hopefully
 * we'll have better luck building it later
 */ if (realm->cached_context) {
ceph_put_snap_context(realm->cached_context);
realm->cached_context = NULL;
}
pr_err_client(cl, "%llx %p fail %d\n", realm->ino, realm, err); return err;
}
/*
 * rebuild snap context for the given realm and all of its children.
 *
 * NOTE(review): "staticvoid" looks like a fused "static void".
 * NOTE(review): this body is incomplete as shown: "_realm" and "child"
 * are used without a visible declaration, "continue" appears with no
 * visible enclosing loop, "last" is never assigned after its
 * initialisation, and there is one more closing brace than opening
 * braces.  The loop header and the call that sets "last" appear to be
 * missing -- TODO confirm against upstream before editing.
 */
staticvoid rebuild_snap_realms(struct ceph_mds_client *mdsc, struct ceph_snap_realm *realm, struct list_head *dirty_realms)
{ struct ceph_client *cl = mdsc->fsc->client;
LIST_HEAD(realm_queue); int last = 0; bool skip = false;
/*
 * If the last build failed due to a memory
 * issue, just empty the realm_queue and return
 * to avoid an infinite loop.
 */ if (last < 0) {
list_del_init(&_realm->rebuild_item); continue;
}
/* is any child in the list ? */
list_for_each_entry(child, &_realm->children, child_item) { if (!list_empty(&child->rebuild_item)) {
skip = true; break;
}
}
/* no child is queued yet: queue every child behind the current entries */
if (!skip) {
list_for_each_entry(child, &_realm->children, child_item)
list_add_tail(&child->rebuild_item, &realm_queue);
}
/* last == 1 means need to build parent first */ if (last <= 0)
list_del_init(&_realm->rebuild_item);
}
}
/*
 * helper to allocate and decode an array of snapids.  free prior
 * instance, if any.
 *
 * @dst: in/out; the array currently stored in *@dst is freed, then
 *       *@dst is pointed at a newly allocated array of @num u64s
 * @src: @num encoded little-endian 64-bit values, read with
 *       get_unaligned_le64()
 * @num: number of elements to decode
 *
 * Returns 0 on success or -ENOMEM if the allocation fails.  *@dst is
 * NULL on return when @num is 0 or when the allocation failed, so the
 * freed pointer is never left behind.
 */
static int dup_array(u64 **dst, __le64 *src, u32 num)
{
	u32 i;

	kfree(*dst);
	if (num) {
		*dst = kcalloc(num, sizeof(u64), GFP_NOFS);
		if (!*dst)
			return -ENOMEM;
		for (i = 0; i < num; i++)
			(*dst)[i] = get_unaligned_le64(src + i);
	} else {
		*dst = NULL;
	}

	return 0;
}
/*
 * Report whether snap context @n holds a snapshot newer than @o.
 *
 * Only the first entry of @n is examined, since the snaps are kept in
 * descending order; it is compared against o->seq.  An empty @n never
 * has new snaps.
 */
static bool has_new_snaps(struct ceph_snap_context *o,
			  struct ceph_snap_context *n)
{
	if (n->num_snaps == 0)
		return false;
	/* snaps are in descending order */
	return n->snaps[0] > o->seq;
}
/* * When a snapshot is applied, the size/mtime inode metadata is queued * in a ceph_cap_snap (one for each snapshot) until writeback * completes and the metadata can be flushed back to the MDS. * * However, if a (sync) write is currently in-progress when we apply * the snapshot, we have to wait until the write succeeds or fails * (and a final size/mtime is known). In this case the * cap_snap->writing = 1, and is said to be "pending." When the write * finishes, we __ceph_finish_cap_snap(). * * Caller must hold snap_rwsem for read (i.e., the realm topology won't * change).
*/ staticvoid ceph_queue_cap_snap(struct ceph_inode_info *ci, struct ceph_cap_snap **pcapsnap)
{ struct inode *inode = &ci->netfs.inode; struct ceph_client *cl = ceph_inode_to_client(inode); struct ceph_snap_context *old_snapc, *new_snapc; struct ceph_cap_snap *capsnap = *pcapsnap; struct ceph_buffer *old_blob = NULL; int used, dirty;
spin_lock(&ci->i_ceph_lock);
used = __ceph_caps_used(ci);
dirty = __ceph_caps_dirty(ci);
/* * If there is a write in progress, treat that as a dirty Fw, * even though it hasn't completed yet; by the time we finish * up this capsnap it will be.
*/ if (used & CEPH_CAP_FILE_WR)
dirty |= CEPH_CAP_FILE_WR;
if (__ceph_have_pending_cap_snap(ci)) { /* there is no point in queuing multiple "pending" cap_snaps, as no new writes are allowed to start when pending, so any writes in progress now were started before the previous
cap_snap. lucky us. */
doutc(cl, "%p %llx.%llx already pending\n", inode,
ceph_vinop(inode)); goto update_snapc;
} if (ci->i_wrbuffer_ref_head == 0 &&
!(dirty & (CEPH_CAP_ANY_EXCL|CEPH_CAP_FILE_WR))) {
doutc(cl, "%p %llx.%llx nothing dirty|writing\n", inode,
ceph_vinop(inode)); goto update_snapc;
}
BUG_ON(!old_snapc);
/* * There is no need to send FLUSHSNAP message to MDS if there is * no new snapshot. But when there is dirty pages or on-going * writes, we still need to create cap_snap. cap_snap is needed * by the write path and page writeback path. * * also see ceph_try_drop_cap_snap()
*/ if (has_new_snaps(old_snapc, new_snapc)) { if (dirty & (CEPH_CAP_ANY_EXCL|CEPH_CAP_FILE_WR))
capsnap->need_flush = true;
} else { if (!(used & CEPH_CAP_FILE_WR) &&
ci->i_wrbuffer_ref_head == 0) {
doutc(cl, "%p %llx.%llx no new_snap|dirty_page|writing\n",
inode, ceph_vinop(inode)); goto update_snapc;
}
}
/* * Finalize the size, mtime for a cap_snap.. that is, settle on final values * to be used for the snapshot, to be flushed back to the mds. * * If capsnap can now be flushed, add to snap_flush list, and return 1. * * Caller must hold i_ceph_lock.
*/ int __ceph_finish_cap_snap(struct ceph_inode_info *ci, struct ceph_cap_snap *capsnap)
{ struct inode *inode = &ci->netfs.inode; struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb); struct ceph_client *cl = mdsc->fsc->client;
/* * Allocate the capsnap memory outside of ceph_queue_cap_snap() * to avoid what would very likely be frequent, unnecessary memory * allocation and freeing in this loop.
*/ if (!capsnap) {
capsnap = kmem_cache_zalloc(ceph_cap_snap_cachep, GFP_NOFS); if (!capsnap) {
pr_err_client(cl, "ENOMEM allocating ceph_cap_snap on %p\n",
inode); return;
}
}
capsnap->cap_flush.is_capsnap = true;
refcount_set(&capsnap->nref, 1);
INIT_LIST_HEAD(&capsnap->cap_flush.i_list);
INIT_LIST_HEAD(&capsnap->cap_flush.g_list);
INIT_LIST_HEAD(&capsnap->ci_item);
/*
 * Parse and apply a snapblob "snap trace" from the MDS.  This specifies
 * the snap realm parameters from a given realm and all of its ancestors,
 * up to the root.
 *
 * @mdsc:      MDS client
 * @p, @e:     start and end of the encoded trace; decoding repeats
 *             (label "more") while p < e
 * @deletion:  only logged in the visible body
 * @realm_ret: if non-NULL, receives first_realm on success; otherwise
 *             the reference on first_realm is dropped
 *
 * Returns 0 on success.  On failure returns err (-EIO when reached via
 * the "bad" label), sets the mount state to CEPH_MOUNT_FENCE_IO, tries
 * to blocklist this client and emits a WARN.
 *
 * Caller must hold snap_rwsem for write.
 *
 * NOTE(review): in the body as shown, "realm" is reset to NULL and
 * "rebuild_snapcs" to 0 at "more:" and nothing visible assigns them
 * again before they are used; "snaps" and "prior_parent_snaps" are set
 * but never read; and no goto targets "fail:".  The realm lookup/update
 * section appears to be missing -- TODO confirm against upstream.
 * NOTE(review): "cl" and "client" are initialised from the same
 * expression.
 */
int ceph_update_snap_trace(struct ceph_mds_client *mdsc, void *p, void *e, bool deletion, struct ceph_snap_realm **realm_ret)
{ struct ceph_client *cl = mdsc->fsc->client; struct ceph_mds_snap_realm *ri; /* encoded */
__le64 *snaps; /* encoded */
__le64 *prior_parent_snaps; /* encoded */ struct ceph_snap_realm *realm; struct ceph_snap_realm *first_realm = NULL; struct ceph_snap_realm *realm_to_rebuild = NULL; struct ceph_client *client = mdsc->fsc->client; int rebuild_snapcs; int err = -ENOMEM; int ret;
LIST_HEAD(dirty_realms);
lockdep_assert_held_write(&mdsc->snap_rwsem);
doutc(cl, "deletion=%d\n", deletion);
more:
realm = NULL;
rebuild_snapcs = 0;
/* make sure a full realm header is available before pointing ri at it */
ceph_decode_need(&p, e, sizeof(*ri), bad);
ri = p;
p += sizeof(*ri);
/* NOTE(review): the two 32-bit counts appear to be summed before being widened -- confirm a wrapped sum cannot slip past this bounds check */
ceph_decode_need(&p, e, sizeof(u64)*(le32_to_cpu(ri->num_snaps) +
le32_to_cpu(ri->num_prior_parent_snaps)), bad);
snaps = p;
p += sizeof(u64) * le32_to_cpu(ri->num_snaps);
prior_parent_snaps = p;
p += sizeof(u64) * le32_to_cpu(ri->num_prior_parent_snaps);
/*
 * this will always track the topmost parent realm from which
 * we need to rebuild the snapshot contexts _downward_ in
 * hierarchy.
 */ if (rebuild_snapcs)
realm_to_rebuild = realm;
/* rebuild_snapcs when we reach the _end_ (root) of the trace */ if (realm_to_rebuild && p >= e)
rebuild_snap_realms(mdsc, realm_to_rebuild, &dirty_realms);
/* keep the reference on the first realm of the trace; drop the others */
if (!first_realm)
first_realm = realm; else
ceph_put_snap_realm(mdsc, realm);
if (p < e) goto more;
/*
 * queue cap snaps _after_ we've built the new snap contexts,
 * so that i_head_snapc can be set appropriately.
 */ while (!list_empty(&dirty_realms)) {
realm = list_first_entry(&dirty_realms, struct ceph_snap_realm,
dirty_item);
list_del_init(&realm->dirty_item);
queue_realm_cap_snaps(mdsc, realm);
}
if (realm_ret)
*realm_ret = first_realm; else
ceph_put_snap_realm(mdsc, first_realm);
__cleanup_empty_realms(mdsc); return 0;
bad:
err = -EIO;
fail: if (realm && !IS_ERR(realm))
ceph_put_snap_realm(mdsc, realm); if (first_realm)
ceph_put_snap_realm(mdsc, first_realm);
pr_err_client(cl, "error %d\n", err);
/*
 * When receiving a corrupted snap trace we don't know what
 * exactly has happened on the MDS side.  And we shouldn't continue
 * writing to OSD, which may corrupt the snapshot contents.
 *
 * Just try to blocklist this kclient and then this kclient
 * must be remounted to continue after the corrupted metadata
 * is fixed on the MDS side.
 */
WRITE_ONCE(mdsc->fsc->mount_state, CEPH_MOUNT_FENCE_IO);
ret = ceph_monc_blocklist_add(&client->monc, &client->msgr.inst.addr); if (ret)
pr_err_client(cl, "failed to blocklist %s: %d\n",
ceph_pr_addr(&client->msgr.inst.addr), ret);
WARN(1, "[client.%lld] %s %s%sdo remount to continue%s",
client->monc.auth->global_id, __func__,
ret ? "" : ceph_pr_addr(&client->msgr.inst.addr),
ret ? "" : " was blocklisted, ",
err == -EIO ? " after corrupted snaptrace is fixed" : "");
return err;
}
/* * Send any cap_snaps that are queued for flush. Try to carry * s_mutex across multiple snap flushes to avoid locking overhead. * * Caller holds no locks.
*/ staticvoid flush_snaps(struct ceph_mds_client *mdsc)
{ struct ceph_client *cl = mdsc->fsc->client; struct ceph_inode_info *ci; struct inode *inode; struct ceph_mds_session *session = NULL;
/**
 * ceph_change_snap_realm - change the snap_realm for an inode
 * @inode: inode to move to new snap realm
 * @realm: new realm to move inode into (may be NULL)
 *
 * Unlink the inode from the snaprealm it currently belongs to (if any),
 * dropping the reference the inode held on it, then link it into @realm
 * (if non-NULL).  The caller's reference on @realm is handed over to the
 * inode; no additional reference is taken here.
 *
 * Must be called with the inode's i_ceph_lock held.
 */
void ceph_change_snap_realm(struct inode *inode, struct ceph_snap_realm *realm)
{
	struct ceph_inode_info *ci = ceph_inode(inode);
	struct ceph_mds_client *mdsc = ceph_inode_to_fs_client(inode)->mdsc;
	struct ceph_snap_realm *prev = ci->i_snap_realm;

	lockdep_assert_held(&ci->i_ceph_lock);

	/* Leave the previous realm, if there was one. */
	if (prev) {
		spin_lock(&prev->inodes_with_caps_lock);
		list_del_init(&ci->i_snap_realm_item);
		/* this inode is the realm's own inode: clear the back pointer */
		if (ci->i_vino.ino == prev->ino)
			prev->inode = NULL;
		spin_unlock(&prev->inodes_with_caps_lock);

		ceph_put_snap_realm(mdsc, prev);
	}

	ci->i_snap_realm = realm;
	if (!realm)
		return;

	/* Join the new realm. */
	spin_lock(&realm->inodes_with_caps_lock);
	list_add(&ci->i_snap_realm_item, &realm->inodes_with_caps);
	/* this inode is the realm's own inode: record the back pointer */
	if (ci->i_vino.ino == realm->ino)
		realm->inode = inode;
	spin_unlock(&realm->inodes_with_caps_lock);
}
/* * Handle a snap notification from the MDS. * * This can take two basic forms: the simplest is just a snap creation * or deletion notification on an existing realm. This should update the * realm and its children. * * The more difficult case is realm creation, due to snap creation at a * new point in the file hierarchy, or due to a rename that moves a file or * directory into another realm.
*/ void ceph_handle_snap(struct ceph_mds_client *mdsc, struct ceph_mds_session *session, struct ceph_msg *msg)
{ struct ceph_client *cl = mdsc->fsc->client; struct super_block *sb = mdsc->fsc->sb; int mds = session->s_mds;
u64 split; int op; int trace_len; struct ceph_snap_realm *realm = NULL; void *p = msg->front.iov_base; void *e = p + msg->front.iov_len; struct ceph_mds_snap_head *h; int num_split_inos, num_split_realms;
__le64 *split_inos = NULL, *split_realms = NULL; int i; int locked_rwsem = 0; bool close_sessions = false;
if (!ceph_inc_mds_stopping_blocker(mdsc, session)) return;
/* decode */ if (msg->front.iov_len < sizeof(*h)) goto bad;
h = p;
op = le32_to_cpu(h->op);
split = le64_to_cpu(h->split); /* non-zero if we are splitting an
* existing realm */
num_split_inos = le32_to_cpu(h->num_split_inos);
num_split_realms = le32_to_cpu(h->num_split_realms);
trace_len = le32_to_cpu(h->trace_len);
p += sizeof(*h);
if (op == CEPH_SNAP_OP_SPLIT) { struct ceph_mds_snap_realm *ri;
/* * A "split" breaks part of an existing realm off into * a new realm. The MDS provides a list of inodes * (with caps) and child realms that belong to the new * child.
*/
split_inos = p;
p += sizeof(u64) * num_split_inos;
split_realms = p;
p += sizeof(u64) * num_split_realms;
ceph_decode_need(&p, e, sizeof(*ri), bad); /* we will peek at realm info here, but will _not_ * advance p, as the realm update will occur below in
* ceph_update_snap_trace. */
ri = p;
realm = ceph_lookup_snap_realm(mdsc, split); if (!realm) {
realm = ceph_create_snap_realm(mdsc, split); if (IS_ERR(realm)) goto out;
}
spin_lock(&ci->i_ceph_lock); if (!ci->i_snap_realm) goto skip_inode; /* * If this inode belongs to a realm that was * created after our new realm, we experienced * a race (due to another split notifications * arriving from a different MDS). So skip * this inode.
*/ if (ci->i_snap_realm->created >
le64_to_cpu(ri->created)) {
doutc(cl, " leaving %p %llx.%llx in newer realm %llx %p\n",
inode, ceph_vinop(inode), ci->i_snap_realm->ino,
ci->i_snap_realm); goto skip_inode;
}
doutc(cl, " will move %p %llx.%llx to split realm %llx %p\n",
inode, ceph_vinop(inode), realm->ino, realm);
/* we may have taken some of the old realm's children. */ for (i = 0; i < num_split_realms; i++) { struct ceph_snap_realm *child =
__lookup_snap_realm(mdsc,
le64_to_cpu(split_realms[i])); if (!child) continue;
adjust_snap_realm_parent(mdsc, child, realm->ino);
}
} else { /* * In the non-split case both 'num_split_inos' and * 'num_split_realms' should be 0, making this a no-op. * However the MDS happens to populate 'split_realms' list * in one of the UPDATE op cases by mistake. * * Skip both lists just in case to ensure that 'p' is * positioned at the start of realm info, as expected by * ceph_update_snap_trace().
*/
p += sizeof(u64) * num_split_inos;
p += sizeof(u64) * num_split_realms;
}
/* * update using the provided snap trace. if we are deleting a * snap, we can avoid queueing cap_snaps.
*/ if (ceph_update_snap_trace(mdsc, p, e,
op == CEPH_SNAP_OP_DESTROY,
NULL)) {
close_sessions = true; goto bad;
}
if (op == CEPH_SNAP_OP_SPLIT) /* we took a reference when we created the realm, above */
ceph_put_snap_realm(mdsc, realm);
while (!list_empty(&mdsc->snapid_map_lru)) {
sm = list_first_entry(&mdsc->snapid_map_lru, struct ceph_snapid_map, lru); if (time_after(sm->last_used + CEPH_SNAPID_MAP_TIMEOUT, now)) break;
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereit gestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.