/* Used on a fresh "drbd_adm_prepare"d reply_skb, this cannot fail: The only
* reason it could fail was no space in skb, and there are 4k available. */ staticint drbd_msg_put_info(struct sk_buff *skb, constchar *info)
{ struct nlattr *nla; int err = -EMSGSIZE;
if (!info || !info[0]) return 0;
nla = nla_nest_start_noflag(skb, DRBD_NLA_CFG_REPLY); if (!nla) return err;
/* maybe: retry with larger reserve, if truncated */
txt->nla_len = nla_attr_size(len+1);
nlmsg_trim(skb, (char*)txt + NLA_ALIGN(txt->nla_len));
nla_nest_end(skb, nla);
return 0;
}
/* This would be a good candidate for a "pre_doit" hook, * and per-family private info->pointers. * But we need to stay compatible with older kernels. * If it returns successfully, adm_ctx members are valid. * * At this point, we still rely on the global genl_lock(). * If we want to avoid that, and allow "genl_family.parallel_ops", we may need * to add additional synchronization against object destruction/modification.
*/ #define DRBD_ADM_NEED_MINOR 1 #define DRBD_ADM_NEED_RESOURCE 2 #define DRBD_ADM_NEED_CONNECTION 4 staticint drbd_adm_prepare(struct drbd_config_context *adm_ctx, struct sk_buff *skb, struct genl_info *info, unsigned flags)
{ struct drbd_genlmsghdr *d_in = genl_info_userhdr(info); const u8 cmd = info->genlhdr->cmd; int err;
memset(adm_ctx, 0, sizeof(*adm_ctx));
/* genl_rcv_msg only checks for CAP_NET_ADMIN on "GENL_ADMIN_PERM" :( */ if (cmd != DRBD_ADM_GET_STATUS && !capable(CAP_NET_ADMIN)) return -EPERM;
adm_ctx->reply_dh = genlmsg_put_reply(adm_ctx->reply_skb,
info, &drbd_genl_family, 0, cmd); /* put of a few bytes into a fresh skb of >= 4k will always succeed.
* but anyways */ if (!adm_ctx->reply_dh) {
err = -ENOMEM; goto fail;
}
adm_ctx->volume = VOLUME_UNSPECIFIED; if (info->attrs[DRBD_NLA_CFG_CONTEXT]) { struct nlattr *nla; /* parse and validate only */
err = drbd_cfg_context_from_attrs(NULL, info); if (err) goto fail;
/* It was present, and valid,
* copy it over to the reply skb. */
err = nla_put_nohdr(adm_ctx->reply_skb,
info->attrs[DRBD_NLA_CFG_CONTEXT]->nla_len,
info->attrs[DRBD_NLA_CFG_CONTEXT]); if (err) goto fail;
/* and assign stuff to the adm_ctx */
nla = nested_attr_tb[__nla_type(T_ctx_volume)]; if (nla)
adm_ctx->volume = nla_get_u32(nla);
nla = nested_attr_tb[__nla_type(T_ctx_resource_name)]; if (nla)
adm_ctx->resource_name = nla_data(nla);
adm_ctx->my_addr = nested_attr_tb[__nla_type(T_ctx_my_addr)];
adm_ctx->peer_addr = nested_attr_tb[__nla_type(T_ctx_peer_addr)]; if ((adm_ctx->my_addr &&
nla_len(adm_ctx->my_addr) > sizeof(adm_ctx->connection->my_addr)) ||
(adm_ctx->peer_addr &&
nla_len(adm_ctx->peer_addr) > sizeof(adm_ctx->connection->peer_addr))) {
err = -EINVAL; goto fail;
}
}
/* We are protected by the global genl_lock(). * But we may explicitly drop it/retake it in drbd_adm_set_role(),
* so make sure this object stays around. */ if (adm_ctx->device)
kref_get(&adm_ctx->device->kref);
if (adm_ctx->resource_name) {
adm_ctx->resource = drbd_find_resource(adm_ctx->resource_name);
}
if (!adm_ctx->device && (flags & DRBD_ADM_NEED_MINOR)) {
drbd_msg_put_info(adm_ctx->reply_skb, "unknown minor"); return ERR_MINOR_INVALID;
} if (!adm_ctx->resource && (flags & DRBD_ADM_NEED_RESOURCE)) {
drbd_msg_put_info(adm_ctx->reply_skb, "unknown resource"); if (adm_ctx->resource_name) return ERR_RES_NOT_KNOWN; return ERR_INVALID_REQUEST;
}
if (flags & DRBD_ADM_NEED_CONNECTION) { if (adm_ctx->resource) {
drbd_msg_put_info(adm_ctx->reply_skb, "no resource name expected"); return ERR_INVALID_REQUEST;
} if (adm_ctx->device) {
drbd_msg_put_info(adm_ctx->reply_skb, "no minor number expected"); return ERR_INVALID_REQUEST;
} if (adm_ctx->my_addr && adm_ctx->peer_addr)
adm_ctx->connection = conn_get_by_addrs(nla_data(adm_ctx->my_addr),
nla_len(adm_ctx->my_addr),
nla_data(adm_ctx->peer_addr),
nla_len(adm_ctx->peer_addr)); if (!adm_ctx->connection) {
drbd_msg_put_info(adm_ctx->reply_skb, "unknown connection"); return ERR_INVALID_REQUEST;
}
}
/* some more paranoia, if the request was over-determined */ if (adm_ctx->device && adm_ctx->resource &&
adm_ctx->device->resource != adm_ctx->resource) {
pr_warn("request: minor=%u, resource=%s; but that minor belongs to resource %s\n",
adm_ctx->minor, adm_ctx->resource->name,
adm_ctx->device->resource->name);
drbd_msg_put_info(adm_ctx->reply_skb, "minor exists in different resource"); return ERR_INVALID_REQUEST;
} if (adm_ctx->device &&
adm_ctx->volume != VOLUME_UNSPECIFIED &&
adm_ctx->volume != adm_ctx->device->vnr) {
pr_warn("request: minor=%u, volume=%u; but that minor is volume %u in %s\n",
adm_ctx->minor, adm_ctx->volume,
adm_ctx->device->vnr, adm_ctx->device->resource->name);
drbd_msg_put_info(adm_ctx->reply_skb, "minor exists as different volume"); return ERR_INVALID_REQUEST;
}
/* still, provide adm_ctx->resource always, if possible. */ if (!adm_ctx->resource) {
adm_ctx->resource = adm_ctx->device ? adm_ctx->device->resource
: adm_ctx->connection ? adm_ctx->connection->resource : NULL; if (adm_ctx->resource)
kref_get(&adm_ctx->resource->kref);
}
fp = highest_fencing_policy(connection); switch (fp) { case FP_NOT_AVAIL:
drbd_warn(connection, "Not fencing peer, I'm not even Consistent myself.\n");
spin_lock_irq(&resource->req_lock); if (connection->cstate < C_WF_REPORT_PARAMS) {
_conn_request_state(connection,
(union drbd_state) { { .susp_fen = 1 } },
(union drbd_state) { { .susp_fen = 0 } },
CS_VERBOSE | CS_HARD | CS_DC_SUSP); /* We are no longer suspended due to the fencing policy. * We may still be suspended due to the on-no-data-accessible policy.
* If that was OND_IO_ERROR, fail pending requests. */ if (!resource_is_supended(resource))
_tl_restart(connection, CONNECTION_LOST_WHILE_PENDING);
} /* Else: in case we raced with a connection handshake, * let the handshake figure out if we maybe can RESEND, * and do not resume/fail pending requests here. * Worst case is we stay suspended for now, which may be * resolved by either re-establishing the replication link, or
* the next link failure, or eventually the administrator. */
spin_unlock_irq(&resource->req_lock); returnfalse;
case FP_DONT_CARE: returntrue; default: ;
}
r = conn_khelper(connection, "fence-peer");
switch ((r>>8) & 0xff) { case P_INCONSISTENT: /* peer is inconsistent */
ex_to_string = "peer is inconsistent or worse";
mask.pdsk = D_MASK;
val.pdsk = D_INCONSISTENT; break; case P_OUTDATED: /* peer got outdated, or was already outdated */
ex_to_string = "peer was fenced";
mask.pdsk = D_MASK;
val.pdsk = D_OUTDATED; break; case P_DOWN: /* peer was down */ if (conn_highest_disk(connection) == D_UP_TO_DATE) { /* we will(have) create(d) a new UUID anyways... */
ex_to_string = "peer is unreachable, assumed to be dead";
mask.pdsk = D_MASK;
val.pdsk = D_OUTDATED;
} else {
ex_to_string = "peer unreachable, doing nothing since disk != UpToDate";
} break; case P_PRIMARY: /* Peer is primary, voluntarily outdate myself. * This is useful when an unconnected R_SECONDARY is asked to
* become R_PRIMARY, but finds the other peer being active. */
ex_to_string = "peer is active";
drbd_warn(connection, "Peer is primary, outdating myself.\n");
mask.disk = D_MASK;
val.disk = D_OUTDATED; break; case P_FENCING: /* THINK: do we need to handle this
* like case 4, or more like case 5? */ if (fp != FP_STONITH)
drbd_err(connection, "fence-peer() = 7 && fencing != Stonith !!!\n");
ex_to_string = "peer was stonithed";
mask.pdsk = D_MASK;
val.pdsk = D_OUTDATED; break; default: /* The script is broken ... */
drbd_err(connection, "fence-peer helper broken, returned %d\n", (r>>8)&0xff); returnfalse; /* Eventually leave IO frozen */
}
drbd_info(connection, "fence-peer helper returned %d (%s)\n",
(r>>8) & 0xff, ex_to_string);
/* Not using conn_request_state(connection, mask, val, CS_VERBOSE); here, because we might were able to re-establish the connection in the
meantime. */
spin_lock_irq(&resource->req_lock); if (connection->cstate < C_WF_REPORT_PARAMS && !test_bit(STATE_SENT, &connection->flags)) { if (connection->connect_cnt != connect_cnt) /* In case the connection was established and droped
while the fence-peer handler was running, ignore it */
drbd_info(connection, "Ignoring fence-peer exit code\n"); else
_conn_request_state(connection, mask, val, CS_VERBOSE);
}
spin_unlock_irq(&resource->req_lock);
kref_get(&connection->kref); /* We may have just sent a signal to this thread * to get it out of some blocking network function. * Clear signals; otherwise kthread_run(), which internally uses * wait_on_completion_killable(), will mistake our pending signal
* for a new fatal signal and fail. */
flush_signals(current);
opa = kthread_run(_try_outdate_peer_async, connection, "drbd_async_h"); if (IS_ERR(opa)) {
drbd_err(connection, "out of mem, failed to invoke fence-peer helper\n");
kref_put(&connection->kref, drbd_destroy_connection);
}
}
/* in case we first succeeded to outdate,
* but now suddenly could establish a connection */ if (rv == SS_CW_FAILED_BY_PEER && mask.pdsk != 0) {
val.pdsk = 0;
mask.pdsk = 0; continue;
}
if (rv == SS_NOTHING_TO_DO) goto out; if (rv == SS_PRIMARY_NOP && mask.pdsk == 0) { if (!conn_try_outdate_peer(connection) && force) {
drbd_warn(device, "Forced into split brain situation!\n");
mask.pdsk = D_MASK;
val.pdsk = D_OUTDATED;
} continue;
} if (rv == SS_TWO_PRIMARIES) { /* Maybe the peer is detected as dead very soon...
retry at most once more in this case. */ if (try < max_tries) { int timeo; try = max_tries - 1;
rcu_read_lock();
nc = rcu_dereference(connection->net_conf);
timeo = nc ? (nc->ping_timeo + 1) * HZ / 10 : 1;
rcu_read_unlock();
schedule_timeout_interruptible(timeo);
} continue;
} if (rv < SS_SUCCESS) {
rv = _drbd_request_state(device, mask, val,
CS_VERBOSE + CS_WAIT_COMPLETE); if (rv < SS_SUCCESS) goto out;
} break;
}
if (rv < SS_SUCCESS) goto out;
if (forced)
drbd_warn(device, "Forced to consider local data as UpToDate!\n");
/* Wait until nothing is on the fly :) */
wait_event(device->misc_wait, atomic_read(&device->ap_pending_cnt) == 0);
/* FIXME also wait for all pending P_BARRIER_ACK? */
if (new_role == R_SECONDARY) { if (get_ldev(device)) {
device->ldev->md.uuid[UI_CURRENT] &= ~(u64)1;
put_ldev(device);
}
} else {
mutex_lock(&device->resource->conf_update);
nc = connection->net_conf; if (nc)
nc->discard_my_data = 0; /* without copy; single bit op is atomic */
mutex_unlock(&device->resource->conf_update);
if (get_ldev(device)) { if (((device->state.conn < C_CONNECTED ||
device->state.pdsk <= D_FAILED)
&& device->ldev->md.uuid[UI_BITMAP] == 0) || forced)
drbd_uuid_new_current(device);
/* writeout of activity log covered areas of the bitmap
* to stable storage done in after state change already */
if (device->state.conn >= C_WF_REPORT_PARAMS) { /* if this was forced, we should consider sync */ if (forced)
drbd_send_uuids(peer_device);
drbd_send_current_state(peer_device);
}
/* Pretty-print a size into buf and return buf.
 * input size is expected to be in KB */
char *ppsize(char *buf, unsigned long long size)
{
	/* Needs 9 bytes at max including trailing NUL:
	 * -1ULL ==> "16384 EB" */
	static char units[] = { 'K', 'M', 'G', 'T', 'P', 'E' };
	int base = 0;
	/* Scale down by 1024 until the number is below 10000,
	 * or we run out of unit letters. */
	while (size >= 10000 && base < sizeof(units)-1) {
		/* shift + round */
		size = (size >> 10) + !!(size & (1<<9));
		base++;
	}
	sprintf(buf, "%u %cB", (unsigned)size, units[base]);
	return buf;
}
/* there is still a theoretical deadlock when called from receiver
 * on an D_INCONSISTENT R_PRIMARY:
 *  remote READ does inc_ap_bio, receiver would need to receive answer
 *  packet from remote to dec_ap_bio again.
 *  receiver receive_sizes(), comes here,
 *  waits for ap_bio_cnt == 0. -> deadlock.
 * but this cannot happen, actually, because:
 *  R_PRIMARY D_INCONSISTENT, and peer's disk is unreachable
 *  (not connected, or bad/no disk on peer):
 *  see drbd_fail_request_early, ap_bio_cnt is zero.
 *  R_PRIMARY D_INCONSISTENT, and C_SYNC_TARGET:
 *  peer may not initiate a resize.
 */

/* Note these are not to be confused with
 * drbd_adm_suspend_io/drbd_adm_resume_io,
 * which are (sub) state changes triggered by admin (drbdsetup),
 * and can be long lived.
 * This changes an device->flag, is triggered by drbd internals,
 * and should be short-lived. */

/* It needs to be a counter, since multiple threads might
 * independently suspend and resume IO. */
void drbd_suspend_io(struct drbd_device *device)
{
	atomic_inc(&device->suspend_cnt);
	/* If IO was already suspended before our increment, we do not wait;
	 * otherwise block until all application bios have drained. */
	if (!drbd_suspended(device))
		wait_event(device->misc_wait, !atomic_read(&device->ap_bio_cnt));
}
/* Counterpart of drbd_suspend_io(): drop one reference on the suspend
 * counter. atomic_dec_and_test() ensures exactly the thread that drops
 * the last reference wakes the waiters on device->misc_wait. */
void drbd_resume_io(struct drbd_device *device)
{ if (atomic_dec_and_test(&device->suspend_cnt))
	wake_up(&device->misc_wait);
}
/* * drbd_determine_dev_size() - Sets the right device size obeying all constraints * @device: DRBD device. * * Returns 0 on success, negative return values indicate errors. * You should call drbd_md_sync() after calling this function.
*/ enum determine_dev_size
drbd_determine_dev_size(struct drbd_device *device, enum dds_flags flags, struct resize_parms *rs) __must_hold(local)
{ struct md_offsets_and_sizes {
u64 last_agreed_sect;
u64 md_offset;
s32 al_offset;
s32 bm_offset;
u32 md_size_sect;
int md_moved, la_size_changed; enum determine_dev_size rv = DS_UNCHANGED;
/* We may change the on-disk offsets of our meta data below. Lock out * anything that may cause meta data IO, to avoid acting on incomplete * layout changes or scribbling over meta data that is in the process * of being moved. * * Move is not exactly correct, btw, currently we have all our meta * data in core memory, to "move" it we just write it all out, there
* are no reads. */
drbd_suspend_io(device);
buffer = drbd_md_get_buffer(device, __func__); /* Lock meta-data IO */ if (!buffer) {
drbd_resume_io(device); return DS_ERROR;
}
if (rs) { /* rs is non NULL if we should change the AL layout only */
md->al_stripes = rs->al_stripes;
md->al_stripe_size_4k = rs->al_stripe_size / 4;
md->al_size_4k = (u64)rs->al_stripes * rs->al_stripe_size / 4;
}
if (size < prev.last_agreed_sect) { if (rs && u_size == 0) { /* Remove "rs &&" later. This check should always be active, but
right now the receiver expects the permissive behavior */
drbd_warn(device, "Implicit shrink not allowed. " "Use --size=%llus for explicit shrink.\n",
(unsignedlonglong)size);
rv = DS_ERROR_SHRINK;
} if (u_size > size)
rv = DS_ERROR_SPACE_MD; if (rv != DS_UNCHANGED) goto err_out;
}
if (get_capacity(device->vdisk) != size ||
drbd_bm_capacity(device) != size) { int err;
err = drbd_bm_resize(device, size, !(flags & DDSF_NO_RESYNC)); if (unlikely(err)) { /* currently there is only one error: ENOMEM! */
size = drbd_bm_capacity(device); if (size == 0) {
drbd_err(device, "OUT OF MEMORY! " "Could not allocate bitmap!\n");
} else {
drbd_err(device, "BM resizing failed. " "Leaving size unchanged\n");
}
rv = DS_ERROR;
} /* racy, see comments above. */
drbd_set_my_capacity(device, size);
md->la_size_sect = size;
} if (rv <= DS_ERROR) goto err_out;
if (la_size_changed || md_moved || rs) {
u32 prev_flags;
/* We do some synchronous IO below, which may take some time. * Clear the timer, to avoid scary "timer expired!" messages,
* "Superblock" is written out at least twice below, anyways. */
timer_delete(&device->md_sync_timer);
/* We won't change the "al-extents" setting, we just may need * to move the on-disk location of the activity log ringbuffer. * Lock for transaction is good enough, it may well be "dirty"
* or even "starving". */
wait_event(device->al_wait, lc_try_lock_for_transaction(device->act_log));
/* mark current on-disk bitmap and activity log as unreliable */
prev_flags = md->flags;
md->flags |= MDF_FULL_SYNC | MDF_AL_DISABLED;
drbd_md_write(device, buffer);
drbd_al_initialize(device, buffer);
drbd_info(device, "Writing the whole bitmap, %s\n",
la_size_changed && md_moved ? "size changed and md moved" :
la_size_changed ? "size changed" : "md moved"); /* next line implicitly does drbd_suspend_io()+drbd_resume_io() */
drbd_bitmap_io(device, md_moved ? &drbd_bm_write_all : &drbd_bm_write, "size changed", BM_LOCKED_MASK, NULL);
/* on-disk bitmap and activity log is authoritative again
* (unless there was an IO error meanwhile...) */
md->flags = prev_flags;
drbd_md_write(device, buffer);
if (rs)
drbd_info(device, "Changed AL layout to al-stripes = %d, al-stripe-size-kB = %d\n",
md->al_stripes, md->al_stripe_size_4k * 4);
}
if (size > prev.last_agreed_sect)
rv = prev.last_agreed_sect ? DS_GREW : DS_GREW_FROM_ZERO; if (size < prev.last_agreed_sect)
rv = DS_SHRUNK;
/* Compute the device size (in sectors) that obeys all constraints:
 * partner's reported size, our backing device capacity, the last agreed
 * size, and an optional user-requested size (u_size).
 * Returns 0 if neither node has a usable disk. */
sector_t
drbd_new_dev_size(struct drbd_device *device, struct drbd_backing_dev *bdev,
		sector_t u_size, int assume_peer_has_space)
{
	sector_t peer_sect = device->p_size;		/* partner's disk size */
	sector_t agreed_sect = bdev->md.la_size_sect;	/* last agreed size */
	sector_t local_sect = drbd_get_max_capacity(bdev); /* my size */
	sector_t result = 0;

	if (assume_peer_has_space && device->state.conn < C_CONNECTED) {
		drbd_warn(device, "Resize while not connected was forced by the user!\n");
		peer_sect = local_sect;
	}

	if (peer_sect && local_sect) {
		/* both sizes known: take the smaller one */
		result = min_t(sector_t, peer_sect, local_sect);
	} else if (agreed_sect) {
		/* fall back to the last agreed size, clamped by whatever
		 * single size we do know */
		result = agreed_sect;
		if (local_sect && local_sect < result)
			result = local_sect;
		if (peer_sect && peer_sect < result)
			result = peer_sect;
	} else {
		/* no agreed size yet: use whichever single size is known */
		if (local_sect)
			result = local_sect;
		if (peer_sect)
			result = peer_sect;
	}

	if (result == 0)
		drbd_err(device, "Both nodes diskless!\n");

	if (u_size) {
		if (u_size > result)
			drbd_err(device, "Requested disk size is too big (%lu > %lu)\n",
				 (unsigned long)u_size>>1, (unsigned long)result>>1);
		else
			result = u_size;
	}

	return result;
}
/*
 * drbd_check_al_size() - Ensures that the AL is of the right size
 * @device:	DRBD device.
 *
 * Returns -EBUSY if current al lru is still used, -ENOMEM when allocation
 * failed, and 0 on success. You should call drbd_md_sync() after you called
 * this function.
 */
static int drbd_check_al_size(struct drbd_device *device, struct disk_conf *dc)
{
	struct lru_cache *new_lc, *old_lc;
	struct lc_element *elem;
	unsigned int busy_refs = 0;
	int idx;

	/* Nothing to do when the current AL already has the requested size. */
	if (device->act_log &&
	    device->act_log->nr_elements == dc->al_extents)
		return 0;

	old_lc = device->act_log;
	new_lc = lc_create("act_log", drbd_al_ext_cache, AL_UPDATES_PER_TRANSACTION,
			   dc->al_extents, sizeof(struct lc_element), 0);
	if (!new_lc) {
		drbd_err(device, "Cannot allocate act_log lru!\n");
		return -ENOMEM;
	}

	/* Swap in the new cache only if no element of the old one is in use. */
	spin_lock_irq(&device->al_lock);
	if (old_lc) {
		for (idx = 0; idx < old_lc->nr_elements; idx++) {
			elem = lc_element_by_index(old_lc, idx);
			if (elem->refcnt)
				drbd_err(device, "refcnt(%d)==%d\n",
					 elem->lc_number, elem->refcnt);
			busy_refs += elem->refcnt;
		}
	}
	if (!busy_refs)
		device->act_log = new_lc;
	spin_unlock_irq(&device->al_lock);

	if (busy_refs) {
		drbd_err(device, "Activity log still in use!\n");
		lc_destroy(new_lc);
		return -EBUSY;
	}
	lc_destroy(old_lc);

	drbd_md_mark_dirty(device); /* we changed device->act_log->nr_elemens */
	return 0;
}
/* Largest bio size the peer can be assumed to handle, depending on the
 * negotiated protocol version. */
static unsigned int drbd_max_peer_bio_size(struct drbd_device *device)
{
	unsigned int apv;

	/*
	 * We may ignore peer limits if the peer is modern enough. From 8.3.8
	 * onwards the peer can use multiple BIOs for a single peer_request.
	 */
	if (device->state.conn < C_WF_REPORT_PARAMS)
		return device->peer_max_bio_size;

	apv = first_peer_device(device)->connection->agreed_pro_version;
	if (apv < 94)
		return min(device->peer_max_bio_size, DRBD_MAX_SIZE_H80_PACKET);
	/*
	 * Correct old drbd (up to 8.3.7) if it believes it can do more than
	 * 32KiB.
	 */
	if (apv == 94)
		return DRBD_MAX_SIZE_H80_PACKET;
	/*
	 * drbd 8.3.8 onwards, before 8.4.0
	 */
	if (apv < 100)
		return DRBD_MAX_BIO_SIZE_P95;
	return DRBD_MAX_BIO_SIZE;
}
static unsigned int drbd_max_discard_sectors(struct drbd_connection *connection)
{
	/* when we introduced REQ_WRITE_SAME support, we also bumped
	 * our maximum supported batch bio size used for discards.
	 * Before, with DRBD <= 8.4.6, we only allowed up to one AL_EXTENT_SIZE. */
	return (connection->agreed_features & DRBD_FF_WSAME)
		? DRBD_MAX_BBIO_SECTORS
		: AL_EXTENT_SIZE >> 9;
}
if (connection->cstate >= C_CONNECTED &&
!(connection->agreed_features & DRBD_FF_TRIM)) {
drbd_info(connection, "peer DRBD too old, does not support TRIM: disabling discards\n"); returnfalse;
}
returntrue;
}
/* This is the workaround for "bio would need to, but cannot, be split" */ staticunsignedint drbd_backing_dev_max_segments(struct drbd_device *device)
{ unsignedint max_segments;
/* * We may later detach and re-attach on a disconnected Primary. Avoid * decreasing the value in this case. * * We want to store what we know the peer DRBD can handle, not what the * peer IO backend can handle.
*/ new = min3(DRBD_MAX_BIO_SIZE, device->local_max_bio_size,
max(drbd_max_peer_bio_size(device), device->peer_max_bio_size)); if (new != now) { if (device->state.role == R_PRIMARY && new < now)
drbd_err(device, "ASSERT FAILED new < now; (%u < %u)\n", new, now);
drbd_info(device, "max BIO size = %u\n", new);
}
lim.max_hw_sectors = new >> SECTOR_SHIFT;
lim.seg_boundary_mask = PAGE_SIZE - 1;
/* * We don't care for the granularity, really. * * Stacking limits below should fix it for the local device. Whether or * not it is a suitable granularity on the remote device is not our * problem, really. If you care, you need to use devices with similar * topology on all peers.
*/ if (drbd_discard_supported(connection, bdev)) {
lim.discard_granularity = 512;
lim.max_hw_discard_sectors =
drbd_max_discard_sectors(connection);
} else {
lim.discard_granularity = 0;
lim.max_hw_discard_sectors = 0;
}
if (bdev)
blk_stack_limits(&lim, &b->limits, 0);
/* * If we can handle "zeroes" efficiently on the protocol, we want to do * that, even if our backend does not announce max_write_zeroes_sectors * itself.
*/ if (connection->agreed_features & DRBD_FF_WZEROES)
lim.max_write_zeroes_sectors = DRBD_MAX_BBIO_SECTORS; else
lim.max_write_zeroes_sectors = 0;
lim.max_hw_wzeroes_unmap_sectors = 0;
/* Upper bound for the configurable number of AL extents, given the
 * on-disk activity-log ring buffer size of this backing device. */
static unsigned int drbd_al_extents_max(struct drbd_backing_dev *bdev)
{
	/* This is limited by 16 bit "slot" numbers,
	 * and by available on-disk context storage.
	 *
	 * Also (u16)~0 is special (denotes a "free" extent).
	 *
	 * One transaction occupies one 4kB on-disk block,
	 * we have n such blocks in the on disk ring buffer,
	 * the "current" transaction may fail (n-1),
	 * and there is 919 slot numbers context information per transaction.
	 *
	 * 72 transaction blocks amounts to more than 2**16 context slots,
	 * so cap there first.
	 */
	const unsigned int max_al_nr = DRBD_AL_EXTENTS_MAX;
	const unsigned int sufficient_on_disk =
		(max_al_nr + AL_CONTEXT_PER_TRANSACTION - 1)
		/ AL_CONTEXT_PER_TRANSACTION;

	unsigned int al_size_4k = bdev->md.al_size_4k;

	if (al_size_4k > sufficient_on_disk)
		return max_al_nr;

	/* NOTE(review): this tail was missing (the function fell off the end
	 * of a non-void return path); restored: with n on-disk transaction
	 * blocks, n-1 are usable, each holding context for
	 * AL_CONTEXT_PER_TRANSACTION slots. */
	return (al_size_4k - 1) * AL_CONTEXT_PER_TRANSACTION;
}
if (device->act_log &&
device->act_log->nr_elements == dc->al_extents) return 0;
drbd_suspend_io(device); /* If IO completion is currently blocked, we would likely wait
* "forever" for the activity log to become unused. So we don't. */ if (atomic_read(&device->ap_bio_cnt)) goto out;
err = disk_opts_check_al_size(device, new_disk_conf); if (err) { /* Could be just "busy". Ignore?
* Introduce dedicated error code? */
drbd_msg_put_info(adm_ctx.reply_skb, "Try again without changing current al-extents setting");
retcode = ERR_NOMEM; goto fail_unlock;
}
/* * meta_dev_idx >= 0: external fixed size, possibly multiple * drbd sharing one meta device. TODO in that case, paranoia * check that [md_bdev, meta_dev_idx] is not yet used by some * other drbd minor! (if you use drbd.conf + drbdadm, that * should check it for you already; but if you don't, or * someone fooled it, we need to double check here)
*/
file = open_backing_dev(device, new_disk_conf->meta_dev, /* claim ptr: device, if claimed exclusively; shared drbd_m_holder,
* if potentially shared with other drbd minors */
(new_disk_conf->meta_dev_idx < 0) ? (void*)device : (void*)drbd_m_holder, /* avoid double bd_claim_by_disk() for the same (source,target) tuple,
* as would happen with internal metadata. */
(new_disk_conf->meta_dev_idx != DRBD_MD_INDEX_FLEX_INT &&
new_disk_conf->meta_dev_idx != DRBD_MD_INDEX_INTERNAL)); if (IS_ERR(file)) return ERR_OPEN_MD_DISK;
nbc->md_bdev = file_bdev(file);
nbc->f_md_bdev = file; return NO_ERROR;
}
/* Release a backing-device file previously opened via open_backing_dev().
 * A NULL bdev_file is tolerated (nothing was opened).
 * do_bd_unlink additionally removes the disk-holder sysfs link. */
static void close_backing_dev(struct drbd_device *device, struct file *bdev_file,
		bool do_bd_unlink)
{
	if (bdev_file) {
		if (do_bd_unlink)
			bd_unlink_disk_holder(file_bdev(bdev_file), device->vdisk);
		fput(bdev_file);
	}
}
/* if you want to reconfigure, please tear down first */ if (device->state.disk > D_DISKLESS) {
retcode = ERR_DISK_CONFIGURED; goto fail;
} /* It may just now have detached because of IO error. Make sure * drbd_ldev_destroy is done already, we may end up here very fast, * e.g. if someone calls attach from the on-io-error handler,
* to realize a "hot spare" feature (not that I'd recommend that) */
wait_event(device->misc_wait, !test_bit(GOING_DISKLESS, &device->flags));
/* make sure there is no leftover from previous force-detach attempts */
clear_bit(FORCE_DETACH, &device->flags);
clear_bit(WAS_IO_ERROR, &device->flags);
clear_bit(WAS_READ_ERROR, &device->flags);
/* and no leftover from previously aborted resync or verify, either */
device->rs_total = 0;
device->rs_failed = 0;
atomic_set(&device->rs_pending_cnt, 0);
/* allocation not in the IO path, drbdsetup context */
nbc = kzalloc(sizeof(struct drbd_backing_dev), GFP_KERNEL); if (!nbc) {
retcode = ERR_NOMEM; goto fail;
}
spin_lock_init(&nbc->md.uuid_lock);
/* Read our meta data super block early.
* This also sets other on-disk offsets. */
retcode = drbd_md_read(device, nbc); if (retcode != NO_ERROR) goto fail;
sanitize_disk_conf(device, new_disk_conf, nbc);
if (drbd_get_max_capacity(nbc) < new_disk_conf->disk_size) {
drbd_err(device, "max capacity %llu smaller than disk size %llu\n",
(unsignedlonglong) drbd_get_max_capacity(nbc),
(unsignedlonglong) new_disk_conf->disk_size);
retcode = ERR_DISK_TOO_SMALL; goto fail;
}
if (new_disk_conf->meta_dev_idx < 0) {
max_possible_sectors = DRBD_MAX_SECTORS_FLEX; /* at least one MB, otherwise it does not make sense */
min_md_device_sectors = (2<<10);
} else {
max_possible_sectors = DRBD_MAX_SECTORS;
min_md_device_sectors = MD_128MB_SECT * (new_disk_conf->meta_dev_idx + 1);
}
if (drbd_get_capacity(nbc->md_bdev) < min_md_device_sectors) {
retcode = ERR_MD_DISK_TOO_SMALL;
drbd_warn(device, "refusing attach: md-device too small, " "at least %llu sectors needed for this meta-disk type\n",
(unsignedlonglong) min_md_device_sectors); goto fail;
}
/* Make sure the new disk is big enough
* (we may currently be R_PRIMARY with no local disk...) */ if (drbd_get_max_capacity(nbc) < get_capacity(device->vdisk)) {
retcode = ERR_DISK_TOO_SMALL; goto fail;
}
if (nbc->known_size > max_possible_sectors) {
drbd_warn(device, "==> truncating very big lower level device " "to currently maximum possible %llu sectors <==\n",
(unsignedlonglong) max_possible_sectors); if (new_disk_conf->meta_dev_idx >= 0)
drbd_warn(device, "==>> using internal or flexible " "meta data may help <<==\n");
}
drbd_suspend_io(device); /* also wait for the last barrier ack. */ /* FIXME see also https://daiquiri.linbit/cgi-bin/bugzilla/show_bug.cgi?id=171 * We need a way to either ignore barrier acks for barriers sent before a device * was attached, or a way to wait for all pending barrier acks to come in. * As barriers are counted per resource, * we'd need to suspend io on all devices of a resource.
*/
wait_event(device->misc_wait, !atomic_read(&device->ap_pending_cnt) || drbd_suspended(device)); /* and for any other previously queued work */
drbd_flush_workqueue(&connection->sender_work);
if (!get_ldev_if_state(device, D_ATTACHING)) goto force_diskless;
if (!device->bitmap) { if (drbd_bm_init(device)) {
retcode = ERR_NOMEM; goto force_diskless_dec;
}
}
if (device->state.pdsk != D_UP_TO_DATE && device->ed_uuid &&
(device->state.role == R_PRIMARY || device->state.peer == R_PRIMARY) &&
(device->ed_uuid & ~((u64)1)) != (nbc->md.uuid[UI_CURRENT] & ~((u64)1))) {
drbd_err(device, "Can only attach to data with current UUID=%016llX\n",
(unsignedlonglong)device->ed_uuid);
retcode = ERR_DATA_NOT_CURRENT; goto force_diskless_dec;
}
/* Since we are diskless, fix the activity log first... */ if (drbd_check_al_size(device, new_disk_conf)) {
retcode = ERR_NOMEM; goto force_diskless_dec;
}
/* Prevent shrinking of consistent devices ! */
{ unsignedlonglong nsz = drbd_new_dev_size(device, nbc, nbc->disk_conf->disk_size, 0); unsignedlonglong eff = nbc->md.la_size_sect; if (drbd_md_test_flag(nbc, MDF_CONSISTENT) && nsz < eff) { if (nsz == nbc->disk_conf->disk_size) {
drbd_warn(device, "truncating a consistent device during attach (%llu < %llu)\n", nsz, eff);
} else {
drbd_warn(device, "refusing to truncate a consistent device (%llu < %llu)\n", nsz, eff);
drbd_msg_sprintf_info(adm_ctx.reply_skb, "To-be-attached device has last effective > current size, and is consistent\n" "(%llu > %llu sectors). Refusing to attach.", eff, nsz);
retcode = ERR_IMPLICIT_SHRINK; goto force_diskless_dec;
}
}
}
/* Reset the "barriers don't work" bits here, then force meta data to
* be written, to ensure we determine if barriers are supported. */ if (new_disk_conf->md_flushes)
clear_bit(MD_NO_FUA, &device->flags); else
set_bit(MD_NO_FUA, &device->flags);
/* Point of no return reached. * Devices and memory are no longer released by error cleanup below. * now device takes over responsibility, and the state engine should
* clean it up somewhere. */
D_ASSERT(device, device->ldev == NULL);
device->ldev = nbc;
device->resync = resync_lru;
device->rs_plan_s = new_plan;
nbc = NULL;
resync_lru = NULL;
new_disk_conf = NULL;
new_plan = NULL;
/* If I am currently not R_PRIMARY, * but meta data primary indicator is set, * I just now recover from a hard crash, * and have been R_PRIMARY before that crash. * * Now, if I had no connection before that crash * (have been degraded R_PRIMARY), chances are that * I won't find my peer now either. * * In that case, and _only_ in that case, * we use the degr-wfc-timeout instead of the default, * so we can automatically recover from a crash of a * degraded but active "cluster" after a certain timeout.
*/
clear_bit(USE_DEGR_WFC_T, &device->flags); if (device->state.role != R_PRIMARY &&
drbd_md_test_flag(device->ldev, MDF_PRIMARY_IND) &&
!drbd_md_test_flag(device->ldev, MDF_CONNECTED_IND))
set_bit(USE_DEGR_WFC_T, &device->flags);
if (drbd_md_test_flag(device->ldev, MDF_FULL_SYNC) ||
(test_bit(CRASHED_PRIMARY, &device->flags) &&
drbd_md_test_flag(device->ldev, MDF_AL_DISABLED))) {
drbd_info(device, "Assuming that all blocks are out of sync " "(aka FullSync)\n"); if (drbd_bitmap_io(device, &drbd_bmio_set_n_write, "set_n_write from attaching", BM_LOCKED_MASK,
NULL)) {
retcode = ERR_IO_MD_DISK; goto force_diskless_dec;
}
} else { if (drbd_bitmap_io(device, &drbd_bm_read, "read from attaching", BM_LOCKED_MASK,
NULL)) {
retcode = ERR_IO_MD_DISK; goto force_diskless_dec;
}
}
if (_drbd_bm_total_weight(device) == drbd_bm_bits(device))
drbd_suspend_al(device); /* IO is still suspended here... */
spin_lock_irq(&device->resource->req_lock);
os = drbd_read_state(device);
ns = os; /* If MDF_CONSISTENT is not set go into inconsistent state, otherwise investigate MDF_WasUpToDate... If MDF_WAS_UP_TO_DATE is not set go into D_OUTDATED disk state, otherwise into D_CONSISTENT state.
*/ if (drbd_md_test_flag(device->ldev, MDF_CONSISTENT)) { if (drbd_md_test_flag(device->ldev, MDF_WAS_UP_TO_DATE))
ns.disk = D_CONSISTENT; else
ns.disk = D_OUTDATED;
} else {
ns.disk = D_INCONSISTENT;
}
if (drbd_md_test_flag(device->ldev, MDF_PEER_OUT_DATED))
ns.pdsk = D_OUTDATED;
/* All tests on MDF_PRIMARY_IND, MDF_CONNECTED_IND, MDF_CONSISTENT and MDF_WAS_UP_TO_DATE must happen before this point, because drbd_request_state() modifies these
flags. */
if (rcu_dereference(device->ldev->disk_conf)->al_updates)
device->ldev->md.flags &= ~MDF_AL_DISABLED; else
device->ldev->md.flags |= MDF_AL_DISABLED;
rcu_read_unlock();
/* In case we are C_CONNECTED postpone any decision on the new disk
state after the negotiation phase. */ if (device->state.conn == C_CONNECTED) {
device->new_state_tmp.i = ns.i;
ns.i = os.i;
ns.disk = D_NEGOTIATING;
/* We expect to receive up-to-date UUIDs soon. To avoid a race in receive_state, free p_uuid while
holding req_lock. I.e. atomic with the state change */
kfree(device->p_uuid);
device->p_uuid = NULL;
}
/*
 * NOTE(review): the following lines are non-code boilerplate (a German
 * website disclaimer) that leaked into this file during extraction.
 * Wrapped in a comment so it cannot break compilation; original text:
 *
 * Die Informationen auf dieser Webseite wurden
 * nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder
 * Vollständigkeit, noch Richtigkeit, noch Qualität der bereit gestellten
 * Informationen zugesichert.
 * Bemerkung:
 * Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.
 *
 * (English: "The information on this website was carefully compiled to the
 * best of our knowledge. However, neither completeness, correctness, nor
 * quality of the provided information is guaranteed. Note: the colored
 * syntax rendering and the measurement are still experimental.")
 */