/* This is used to stop/restart our threads.
 * Cannot use SIGTERM nor SIGKILL, since these
 * are sent out by init on runlevel changes
 * I choose SIGHUP for now.
 */
#define DRBD_SIGKILL SIGHUP
/* for sending/receiving the bitmap,
 * possibly in some encoding scheme */
struct bm_xfer_ctx {
	/* "const"
	 * stores total bits and long words
	 * of the bitmap, so we don't need to
	 * call the accessor functions over and again. */
	unsigned long bm_bits;
	unsigned long bm_words;
	/* during xfer, current position within the bitmap */
	unsigned long bit_offset;
	unsigned long word_offset;
	/* NOTE(review): the closing brace of this struct was missing in the
	 * mangled source; restored after the last visible member — confirm
	 * no trailing members were dropped. */
};
staticinlinevoid bm_xfer_ctx_bit_to_word_offset(struct bm_xfer_ctx *c)
{ /* word_offset counts "native long words" (32 or 64 bit), * aligned at 64 bit. * Encoded packet may end at an unaligned bit offset. * In case a fallback clear text packet is transmitted in * between, we adjust this offset back to the last 64bit * aligned "native long word", which makes coding and decoding
* the plain text bitmap much more convenient. */ #if BITS_PER_LONG == 64
c->word_offset = c->bit_offset >> 6; #elif BITS_PER_LONG == 32
c->word_offset = c->bit_offset >> 5;
c->word_offset &= ~(1UL); #else # error "unsupported BITS_PER_LONG" #endif
}
staticinlineenum drbd_thread_state get_t_state(struct drbd_thread *thi)
{ /* THINK testing the t_state seems to be uncritical in all cases * (but thread_{start,stop}), so we can read it *without* the lock.
* --lge */
smp_rmb(); return thi->t_state;
}
/* generic work item; cb() is the processing callback, invoked with
 * a non-zero "cancel" argument when the work is being cancelled */
struct drbd_work {
	struct list_head list;
	int (*cb)(struct drbd_work *, int cancel);
};
/* if local IO is not allowed, will be NULL. * if local IO _is_ allowed, holds the locally submitted bio clone, * or, after local IO completion, the ERR_PTR(error).
* see drbd_request_endio(). */ struct bio *private_bio;
struct drbd_interval i;
/* epoch: used to check on "completion" whether this req was in * the current epoch, and we therefore have to close it, * causing a p_barrier packet to be send, starting a new epoch. * * This corresponds to "barrier" in struct p_barrier[_ack], * and to "barrier_nr" in struct drbd_epoch (and various * comments/function parameters/local variable names).
*/ unsignedint epoch;
struct list_head tl_requests; /* ring list in the transfer log */ struct bio *master_bio; /* master bio pointer */
/* for generic IO accounting */ unsignedlong start_jif;
/* for DRBD internal statistics */
/* Minimal set of time stamps to determine if we wait for activity log * transactions, local disk or peer. 32 bit "jiffies" are good enough, * we don't expect a DRBD request to be stalled for several month.
*/
/* before actual request processing */ unsignedlong in_actlog_jif;
/* local disk */ unsignedlong pre_submit_jif;
/* per connection */ unsignedlong pre_send_jif; unsignedlong acked_jif; unsignedlong net_done_jif;
/* Possibly even more detail to track each phase: * master_completion_jif * how long did it take to complete the master bio * (application visible latency) * allocated_jif * how long the master bio was blocked until we finally allocated * a tracking struct * in_actlog_jif * how long did we wait for activity log transactions * * net_queued_jif * when did we finally queue it for sending * pre_send_jif * when did we start sending it * post_send_jif * how long did we block in the network stack trying to send it * acked_jif * when did we receive (or fake, in protocol A) a remote ACK * net_done_jif * when did we receive final acknowledgement (P_BARRIER_ACK), * or decide, e.g. on connection loss, that we do no longer expect * anything from this peer for this request. * * pre_submit_jif * post_sub_jif * when did we start submiting to the lower level device, * and how long did we block in that submit function * local_completion_jif * how long did it take the lower level device to complete this request
*/
/* once it hits 0, we may complete the master_bio */
atomic_t completion_ref; /* once it hits 0, we may destroy this drbd_request object */ struct kref kref;
unsigned rq_state; /* see comments above _req_mod() */
};
/* One write epoch on the receiving side; delimited by P_BARRIER packets. */
struct drbd_epoch {
	struct drbd_connection *connection;
	struct list_head list;
	unsigned int barrier_nr;
	atomic_t epoch_size;	/* increased on every request added. */
	atomic_t active;	/* increased on every req. added, and dec on every finished. */
	unsigned long flags;
};
/* drbd_epoch flag bits */
enum {
	DE_HAVE_BARRIER_NUMBER,
};
/* events fed into the epoch state machine */
enum epoch_event {
	EV_PUT,
	EV_GOT_BARRIER_NR,
	EV_BECAME_LAST,
	EV_CLEANUP = 32, /* used as flag */
};
/* a checksum digest carried along with a peer request */
struct digest_info {
	int digest_size;
	void *digest;
};
/* A request received from the peer ("ee", extent entry). */
struct drbd_peer_request {
	struct drbd_work w;
	struct drbd_peer_device *peer_device;
	struct drbd_epoch *epoch; /* for writes */
	struct page *pages;
	blk_opf_t opf;
	atomic_t pending_bios;
	struct drbd_interval i;
	/* see comments on ee flag bits below */
	unsigned long flags;
	unsigned long submit_jif;
	union {
		u64 block_id;
		struct digest_info *digest;
	};
};
/* Equivalent to bio_op and req_op. */
#define peer_req_op(peer_req) \
	((peer_req)->opf & REQ_OP_MASK)
/* ee flag bits.
 * While corresponding bios are in flight, the only modification will be
 * set_bit WAS_ERROR, which has to be atomic.
 * If no bios are in flight yet, or all have been completed,
 * non-atomic modification to ee->flags is ok.
 */
enum {
	__EE_CALL_AL_COMPLETE_IO,
	__EE_MAY_SET_IN_SYNC,

	/* is this a TRIM aka REQ_OP_DISCARD? */
	__EE_TRIM,
	/* explicit zero-out requested, or
	 * our lower level cannot handle trim,
	 * and we want to fall back to zeroout instead */
	__EE_ZEROOUT,

	/* In case a barrier failed,
	 * we need to resubmit without the barrier flag. */
	__EE_RESUBMITTED,

	/* we may have several bios per peer request.
	 * if any of those fail, we set this flag atomically
	 * from the endio callback */
	__EE_WAS_ERROR,

	/* This ee has a pointer to a digest instead of a block id */
	__EE_HAS_DIGEST,

	/* Conflicting local requests need to be restarted after this request */
	__EE_RESTART_REQUESTS,

	/* The peer wants a write ACK for this (wire proto C) */
	__EE_SEND_WRITE_ACK,

	/* Is set when net_conf had two_primaries set while creating this peer_req */
	__EE_IN_INTERVAL_TREE,

	/* for debugfs: */
	/* has this been submitted, or does it still wait for something else? */
	__EE_SUBMITTED,

	/* this is/was a write request */
	__EE_WRITE,

	/* hand back using mempool_free(e, drbd_buffer_page_pool) */
	__EE_RELEASE_TO_MEMPOOL,

	/* this is/was a write same request */
	__EE_WRITE_SAME,

	/* this originates from application on peer
	 * (not some resync or verify or other DRBD internal request) */
	__EE_APPLICATION,
};
/* NOTE(review): the closing brace of this enum was missing in the
 * mangled source; restored after the last visible enumerator —
 * confirm no trailing enumerators or EE_* shift macros were dropped. */
/* flag bits per device */
enum {
	UNPLUG_REMOTE,		/* sending a "UnplugRemote" could help */
	MD_DIRTY,		/* current uuids and flags not yet on disk */
	USE_DEGR_WFC_T,		/* degr-wfc-timeout instead of wfc-timeout. */
	CL_ST_CHG_SUCCESS,
	CL_ST_CHG_FAIL,
	CRASHED_PRIMARY,	/* This node was a crashed primary.
				 * Gets cleared when the state.conn
				 * goes into C_CONNECTED state. */
	CONSIDER_RESYNC,
	MD_NO_FUA,		/* Users wants us to not use FUA/FLUSH on meta data dev */
	BITMAP_IO,		/* suspend application io;
				   once no more io in flight, start bitmap io */
	BITMAP_IO_QUEUED,	/* Started bitmap IO */
	WAS_IO_ERROR,		/* Local disk failed, returned IO error */
	WAS_READ_ERROR,		/* Local disk READ failed (set additionally to the above) */
	FORCE_DETACH,		/* Force-detach from local disk, aborting any pending local IO */
	RESYNC_AFTER_NEG,	/* Resync after online grow after the attach&negotiate finished. */
	RESIZE_PENDING,		/* Size change detected locally, waiting for the response from
				 * the peer, if it changed there as well. */
	NEW_CUR_UUID,		/* Create new current UUID when thawing IO */
	AL_SUSPENDED,		/* Activity logging is currently suspended. */
	AHEAD_TO_SYNC_SOURCE,	/* Ahead -> SyncSource queued */
	B_RS_H_DONE,		/* Before resync handler done (already executed) */
	DISCARD_MY_DATA,	/* discard_my_data flag per volume */
	READ_BALANCE_RR,
	FLUSH_PENDING,		/* if set, device->flush_jif is when we submitted that flush
				 * from drbd_flush_after_epoch() */

	/* cleared only after backing device related structures have been destroyed. */
	GOING_DISKLESS,		/* Disk is being detached, because of io-error, or admin request. */

	/* to be used in drbd_device_post_work() */
	GO_DISKLESS,		/* tell worker to schedule cleanup before detach */
	DESTROY_DISK,		/* tell worker to close backing devices and destroy related structures. */
	MD_SYNC,		/* tell worker to call drbd_md_sync() */
	RS_START,		/* tell worker to start resync/OV */
	RS_PROGRESS,		/* tell worker that resync made significant progress */
	RS_DONE,		/* tell worker that resync is done */
};
struct drbd_bitmap; /* opaque for drbd_device */

/* definition of bits in bm_flags to be used in drbd_bm_lock
 * and drbd_bitmap_io and friends. */
enum bm_flag {
	/* currently locked for bulk operation */
	BM_LOCKED_MASK = 0xf,

	/* in detail, that is: */
	BM_DONT_CLEAR = 0x1,
	BM_DONT_SET   = 0x2,
	BM_DONT_TEST  = 0x4,

	/* so we can mark it locked for bulk operation,
	 * and still allow all non-bulk operations */
	BM_IS_LOCKED  = 0x8,

	/* testing bits, as well as setting new bits allowed, but clearing bits
	 * would be unexpected. Used during bitmap receive. Setting new bits
	 * requires sending of "out-of-sync" information, though. */
	BM_LOCKED_SET_ALLOWED = BM_DONT_CLEAR | BM_IS_LOCKED,

	/* for drbd_bm_write_copy_pages, everything is allowed,
	 * only concurrent bulk operations are locked out. */
	BM_LOCKED_CHANGE_ALLOWED = BM_IS_LOCKED,
};
/* a queue of drbd_work items, with its own lock and wait queue */
struct drbd_work_queue {
	struct list_head q;
	spinlock_t q_lock; /* to protect the list. */
	wait_queue_head_t q_wait;
};
struct drbd_socket {
	struct mutex mutex;
	struct socket *socket;
	/* this way we get our
	 * send/receive buffers off the stack */
	void *sbuf;
	void *rbuf;
};
/* ring buffer of ints, used for the resync controller plan values */
struct fifo_buffer {
	unsigned int head_index;
	unsigned int size;
	int total; /* sum of all values */
	int values[] __counted_by(size);
};
extern struct fifo_buffer *fifo_alloc(unsigned int fifo_size);
/* flag bits per connection */
enum {
	NET_CONGESTED,		/* The data socket is congested */
	RESOLVE_CONFLICTS,	/* Set on one node, cleared on the peer! */
	SEND_PING,
	GOT_PING_ACK,		/* set when we receive a ping_ack packet, ping_wait gets woken */
	CONN_WD_ST_CHG_REQ,	/* A cluster wide state change on the connection is active */
	CONN_WD_ST_CHG_OKAY,
	CONN_WD_ST_CHG_FAIL,
	CONN_DRY_RUN,		/* Expect disconnect after resync handshake. */
	CREATE_BARRIER,		/* next P_DATA is preceded by a P_BARRIER */
	STATE_SENT,		/* Do not change state/UUIDs while this is set */
	CALLBACK_PENDING,	/* Whether we have a call_usermodehelper(, UMH_WAIT_PROC)
				 * pending, from drbd worker context.
				 */
	DISCONNECT_SENT,

	DEVICE_WORK_PENDING,	/* tell worker that some device has pending work */
};
/* NOTE(review): these bit-field members appear detached from their
 * enclosing struct/union in this chunk — presumably part of a state
 * container declared elsewhere; confirm against the original file. */
unsigned susp:1;	/* IO suspended by user */
unsigned susp_nod:1;	/* IO suspended because no data */
unsigned susp_fen:1;	/* IO suspended because fence peer handler runs */
struct drbd_connection { struct list_head connections; struct drbd_resource *resource; #ifdef CONFIG_DEBUG_FS struct dentry *debugfs_conn; struct dentry *debugfs_conn_callback_history; struct dentry *debugfs_conn_oldest_requests; #endif struct kref kref; struct idr peer_devices; /* volume number to peer device mapping */ enum drbd_conns cstate; /* Only C_STANDALONE to C_WF_REPORT_PARAMS */ struct mutex cstate_mutex; /* Protects graceful disconnects */ unsignedint connect_cnt; /* Inc each time a connection is established */
unsignedlong flags; struct net_conf *net_conf; /* content protected by rcu */
wait_queue_head_t ping_wait; /* Woken upon reception of a ping, and a state change */
struct sockaddr_storage my_addr; int my_addr_len; struct sockaddr_storage peer_addr; int peer_addr_len;
struct drbd_socket data; /* data/barrier/cstate/parameter packets */ struct drbd_socket meta; /* ping/ack (metadata) packets */ int agreed_pro_version; /* actually used protocol version */
u32 agreed_features; unsignedlong last_received; /* in jiffies, either socket */ unsignedint ko_count;
struct list_head transfer_log; /* all requests not yet fully processed */
struct crypto_shash *cram_hmac_tfm; struct crypto_shash *integrity_tfm; /* checksums we compute, updates protected by connection->data->mutex */ struct crypto_shash *peer_integrity_tfm; /* checksums we verify, only accessed from receiver thread */ struct crypto_shash *csums_tfm; struct crypto_shash *verify_tfm; void *int_dig_in; void *int_dig_vv;
/* receiver side */ struct drbd_epoch *current_epoch;
spinlock_t epoch_lock; unsignedint epochs;
atomic_t current_tle_nr; /* transfer log epoch number */ unsigned current_tle_writes; /* writes seen within this tl epoch */
unsignedlong last_reconnect_jif; /* empty member on older kernels without blk_start_plug() */ struct blk_plug receiver_plug; struct drbd_thread receiver; struct drbd_thread worker; struct drbd_thread ack_receiver; struct workqueue_struct *ack_sender;
/* cached pointers, * so we can look up the oldest pending requests more quickly.
* protected by resource->req_lock */ struct drbd_request *req_next; /* DRBD 9: todo.req_next */ struct drbd_request *req_ack_pending; struct drbd_request *req_not_net_done;
/* sender side */ struct drbd_work_queue sender_work;
/* whether this sender thread
* has processed a single write yet. */ bool seen_any_write_yet;
/* Which barrier number to send with the next P_BARRIER */ int current_epoch_nr;
/* how many write requests have been sent * with req->epoch == current_epoch_nr.
* If none, no P_BARRIER will be sent. */ unsigned current_epoch_writes;
} send;
};
/* NOTE(review): member list of a larger struct (presumably struct
 * drbd_device) whose opening — and likely leading members — are
 * missing from this chunk; confirm against the original file.
 * Code tokens left untouched (including the mangled "unsignedint"/
 * "unsignedlong" spellings); only comments were reflowed. */
/* Used after attach while negotiating new disk state. */
union drbd_state new_state_tmp;

union drbd_dev_state state;
wait_queue_head_t misc_wait;
wait_queue_head_t state_wait; /* upon each state change. */
unsignedint send_cnt;
unsignedint recv_cnt;
unsignedint read_cnt;
unsignedint writ_cnt;
unsignedint al_writ_cnt;
unsignedint bm_writ_cnt;
atomic_t ap_bio_cnt;	 /* Requests we need to complete */
atomic_t ap_actlog_cnt;	 /* Requests waiting for activity log */
atomic_t ap_pending_cnt; /* AP data packets on the wire, ack expected */
atomic_t rs_pending_cnt; /* RS request/data packets on the wire */
atomic_t unacked_cnt;	 /* Need to send replies for */
atomic_t local_cnt;	 /* Waiting for local completion */
atomic_t suspend_cnt;

/* Interval tree of pending local requests */
struct rb_root read_requests;
struct rb_root write_requests;

/* for statistics and timeouts */
/* [0] read, [1] write */
struct list_head pending_master_completion[2];
struct list_head pending_completion[2];

/* use checksums for *this* resync */
bool use_csums;
/* blocks to resync in this run [unit BM_BLOCK_SIZE] */
unsignedlong rs_total;
/* number of resync blocks that failed in this run */
unsignedlong rs_failed;
/* Syncer's start time [unit jiffies] */
unsignedlong rs_start;
/* cumulated time in PausedSyncX state [unit jiffies] */
unsignedlong rs_paused;
/* skipped because csum was equal [unit BM_BLOCK_SIZE] */
unsignedlong rs_same_csum;
#define DRBD_SYNC_MARKS 8
#define DRBD_SYNC_MARK_STEP (3*HZ)
/* block not up-to-date at mark [unit BM_BLOCK_SIZE] */
unsignedlong rs_mark_left[DRBD_SYNC_MARKS];
/* marks's time [unit jiffies] */
unsignedlong rs_mark_time[DRBD_SYNC_MARKS];
/* current index into rs_mark_{left,time} */
int rs_last_mark;
unsignedlong rs_last_bcast; /* [unit jiffies] */

/* where does the admin want us to start? (sector) */
sector_t ov_start_sector;
sector_t ov_stop_sector;
/* where are we now? (sector) */
sector_t ov_position;
/* Start sector of out of sync range (to merge printk reporting). */
sector_t ov_last_oos_start;
/* size of out-of-sync range in sectors. */
sector_t ov_last_oos_size;
unsignedlong ov_left; /* in bits */

struct drbd_bitmap *bitmap;
unsignedlong bm_resync_fo; /* bit offset for drbd_bm_find_next */

/* Used to track operations of resync... */
struct lru_cache *resync;
/* Number of locked elements in resync LRU */
unsignedint resync_locked;
/* resync extent number waiting for application requests */
unsignedint resync_wenr;

int open_cnt;
u64 *p_uuid;

struct list_head active_ee;	/* IO in progress (P_DATA gets written to disk) */
struct list_head sync_ee;	/* IO in progress (P_RS_DATA_REPLY gets written to disk) */
struct list_head done_ee;	/* need to send P_WRITE_ACK */
struct list_head read_ee;	/* [RS]P_DATA_REQUEST being read */
struct list_head resync_reads;
atomic_t pp_in_use;		/* allocated from page pool */
atomic_t pp_in_use_by_net;	/* sendpage()d, still referenced by tcp */
wait_queue_head_t ee_wait;

struct drbd_md_io md_io;

spinlock_t al_lock;
wait_queue_head_t al_wait;
struct lru_cache *act_log;	/* activity log */
unsignedint al_tr_number;
int al_tr_cycle;

wait_queue_head_t seq_wait;
atomic_t packet_seq;
unsignedint peer_seq;
spinlock_t peer_seq_lock;
unsignedlong comm_bm_set;	/* communicated number of set bits. */
struct bm_io_work bm_io_work;
u64 ed_uuid;			/* UUID of the exposed data */
struct mutex own_state_mutex;
struct mutex *state_mutex;	/* either own_state_mutex or first_peer_device(device)->connection->cstate_mutex */
char congestion_reason;		/* Why we where congested... */
atomic_t rs_sect_in;		/* for incoming resync data rate, SyncTarget */
atomic_t rs_sect_ev;		/* for submitted resync data rate, both */
int rs_last_sect_ev;		/* counter to compare with */
int rs_last_events;		/* counter of read or write "events" (unit sectors)
				 * on the lower level device when we last looked. */
int c_sync_rate;		/* current resync rate after syncer throttle magic */
struct fifo_buffer *rs_plan_s;	/* correction values of resync planer (RCU, connection->conn_update) */
int rs_in_flight;		/* resync sectors in flight (to proxy, in proxy and from proxy) */
atomic_t ap_in_flight;		/* App sectors in flight (waiting for ack) */
unsignedint peer_max_bio_size;
unsignedint local_max_bio_size;

/* any requests that would block in drbd_make_request()
 * are deferred to this single-threaded work queue */
struct submit_worker submit;
};
/* Meta data layout
 *
 * We currently have two possible layouts.
 * Offsets in (512 byte) sectors.
 * external:
 *   |----------- md_size_sect ------------------|
 *   [ 4k superblock ][ activity log ][  Bitmap  ]
 *   | al_offset == 8 |
 *   | bm_offset = al_offset + X      |
 *  ==> bitmap sectors = md_size_sect - bm_offset
 *
 * Variants:
 * old, indexed fixed size meta data:
 *
 * internal:
 *   |----------- md_size_sect ------------------|
 *   [data.....][  Bitmap  ][ activity log ][ 4k superblock ][padding*]
 *   | al_offset < 0 |
 *   | bm_offset = al_offset - Y |
 *  ==> bitmap sectors = Y = al_offset - bm_offset
 *
 *  [padding*] are zero or up to 7 unused 512 Byte sectors to the
 *  end of the device, so that the [4k superblock] will be 4k aligned.
 *
 *  The activity log consists of 4k transaction blocks,
 *  which are written in a ring-buffer, or striped ring-buffer like fashion.
 *  Its on-disk size used to be fixed 32kB,
 *  but is about to become configurable.
 */
/* Our old fixed size meta data layout
 * allows up to about 3.8TB, so if you want more,
 * you need to use the "flexible" meta data format. */
#define MD_128MB_SECT (128LLU << 11)  /* 128 MB, unit sectors */
#define MD_4kB_SECT	 8
#define MD_32kB_SECT	64

/* One activity log extent represents 4M of storage */
#define AL_EXTENT_SHIFT 22
#define AL_EXTENT_SIZE (1<<AL_EXTENT_SHIFT)

/* We could make these currently hardcoded constants configurable
 * variables at create-md time (or even re-configurable at runtime?).
 * Which will require some more changes to the DRBD "super block"
 * and attach code.
 *
 * updates per transaction:
 *	This many changes to the active set can be logged with one transaction.
 *	This number is arbitrary.
 * context per transaction:
 *	This many context extent numbers are logged with each transaction.
 *	This number is resulting from the transaction block size (4k), the layout
 *	of the transaction header, and the number of updates per transaction.
 *	See drbd_actlog.c:struct al_transaction_on_disk
 * */
#define AL_UPDATES_PER_TRANSACTION	 64	// arbitrary
#define AL_CONTEXT_PER_TRANSACTION	919	// (4096 - 36 - 6*64)/4
/* resync bitmap */
/* 16MB sized 'bitmap extent' to track syncer usage */
struct bm_extent {
	int rs_left;	/* number of bits set (out of sync) in this extent. */
	int rs_failed;	/* number of failed resync requests in this extent. */
	unsigned long flags;
	struct lc_element lce;
};
#define BME_NO_WRITES	0  /* bm_extent.flags: no more requests on this one! */
#define BME_LOCKED	1  /* bm_extent.flags: syncer active on this one. */
#define BME_PRIORITY	2  /* finish resync IO on this extent ASAP! App IO waiting! */
/* drbd_bitmap.c */
/*
 * We need to store one bit for a block.
 * Example: 1GB disk @ 4096 byte blocks ==> we need 32 KB bitmap.
 * Bit 0 ==> local node thinks this block is binary identical on both nodes
 * Bit 1 ==> local node thinks this block needs to be synced.
 */

#define SLEEP_TIME (HZ/10)

/* We do bitmap IO in units of 4k blocks.
 * We also still have a hardcoded 4k per bit relation. */
#define BM_BLOCK_SHIFT	12			 /* 4k per bit */
#define BM_BLOCK_SIZE	 (1<<BM_BLOCK_SHIFT)
/* mostly arbitrarily set the represented size of one bitmap extent,
 * aka resync extent, to 16 MiB (which is also 512 Byte worth of bitmap
 * at 4k per bit resolution) */
#define BM_EXT_SHIFT	 24	/* 16 MiB per resync extent */
#define BM_EXT_SIZE	 (1<<BM_EXT_SHIFT)

#if (BM_EXT_SHIFT != 24) || (BM_BLOCK_SHIFT != 12)
#error "HAVE YOU FIXED drbdmeta AS WELL??"
#endif

/* thus many _storage_ sectors are described by one bit */
#define BM_SECT_TO_BIT(x)   ((x)>>(BM_BLOCK_SHIFT-9))
#define BM_BIT_TO_SECT(x)   ((sector_t)(x)<<(BM_BLOCK_SHIFT-9))
#define BM_SECT_PER_BIT     BM_BIT_TO_SECT(1)

/* bit to represented kilo byte conversion */
#define Bit2KB(bits) ((bits)<<(BM_BLOCK_SHIFT-10))

/* in which _bitmap_ extent (resp. sector) the bit for a certain
 * _storage_ sector is located in */
#define BM_SECT_TO_EXT(x)   ((x)>>(BM_EXT_SHIFT-9))
#define BM_BIT_TO_EXT(x)    ((x) >> (BM_EXT_SHIFT - BM_BLOCK_SHIFT))

/* first storage sector a bitmap extent corresponds to */
#define BM_EXT_TO_SECT(x)   ((sector_t)(x) << (BM_EXT_SHIFT-9))
/* how much _storage_ sectors we have per bitmap extent */
#define BM_SECT_PER_EXT     BM_EXT_TO_SECT(1)
/* how many bits are covered by one bitmap extent (resync extent) */
#define BM_BITS_PER_EXT     (1UL << (BM_EXT_SHIFT - BM_BLOCK_SHIFT))

/* in one sector of the bitmap, we have this many activity_log extents. */
#define AL_EXT_PER_BM_SECT  (1 << (BM_EXT_SHIFT - AL_EXTENT_SHIFT))
/* the extent in "PER_EXTENT" below is an activity log extent * we need that many (long words/bytes) to store the bitmap * of one AL_EXTENT_SIZE chunk of storage. * we can store the bitmap for that many AL_EXTENTS within * one sector of the _on_disk_ bitmap: * bit 0 bit 37 bit 38 bit (512*8)-1 * ...|........|........|.. // ..|........| * sect. 0 `296 `304 ^(512*8*8)-1 * #define BM_WORDS_PER_EXT ( (AL_EXT_SIZE/BM_BLOCK_SIZE) / BITS_PER_LONG ) #define BM_BYTES_PER_EXT ( (AL_EXT_SIZE/BM_BLOCK_SIZE) / 8 ) // 128 #define BM_EXT_PER_SECT ( 512 / BM_BYTES_PER_EXTENT ) // 4
*/
#define DRBD_MAX_SECTORS_32 (0xffffffffLU)

/* we have a certain meta data variant that has a fixed on-disk size of 128
 * MiB, of which 4k are our "superblock", and 32k are the fixed size activity
 * log, leaving this many sectors for the bitmap.
 */
#define DRBD_MAX_SECTORS_FIXED_BM \
	((MD_128MB_SECT - MD_32kB_SECT - MD_4kB_SECT) * (1LL<<(BM_EXT_SHIFT-9)))
#define DRBD_MAX_SECTORS      DRBD_MAX_SECTORS_FIXED_BM
/* 16 TB in units of sectors */
#if BITS_PER_LONG == 32
/* adjust by one page worth of bitmap,
 * so we won't wrap around in drbd_bm_find_next_bit.
 * you should use 64bit OS for that much storage, anyways. */
#define DRBD_MAX_SECTORS_FLEX BM_BIT_TO_SECT(0xffff7fff)
#else
/* we allow up to 1 PiB now on 64bit architecture with "flexible" meta data */
#define DRBD_MAX_SECTORS_FLEX (1UL << 51)
/* corresponds to (1UL << 38) bits right now. */
#endif
/* Estimate max bio size as 256 * PAGE_SIZE, * so for typical PAGE_SIZE of 4k, that is (1<<20) Byte. * Since we may live in a mixed-platform cluster, * we limit us to a platform agnostic constant here for now. * A followup commit may allow even bigger BIO sizes,
* once we thought that through. */ #define DRBD_MAX_BIO_SIZE (1U << 20) #if DRBD_MAX_BIO_SIZE > (BIO_MAX_VECS << PAGE_SHIFT) #error Architecture not supported: DRBD_MAX_BIO_SIZE > BIO_MAX_SIZE #endif #define DRBD_MAX_BIO_SIZE_SAFE (1U << 12) /* Works always = 4k */
#define DRBD_MAX_SIZE_H80_PACKET (1U << 15) /* Header 80 only allows packets up to 32KiB data */ #define DRBD_MAX_BIO_SIZE_P95 (1U << 17) /* Protocol 95 to 99 allows bios up to 128KiB */
/* For now, don't allow more than half of what we can "activate" in one * activity log transaction to be discarded in one go. We may need to rework
* drbd_al_begin_io() to allow for even larger discard ranges */ #define DRBD_MAX_BATCH_BIO_SIZE (AL_UPDATES_PER_TRANSACTION/2*AL_EXTENT_SIZE) #define DRBD_MAX_BBIO_SECTORS (DRBD_MAX_BATCH_BIO_SIZE >> 9)
externint drbd_bm_init(struct drbd_device *device); externint drbd_bm_resize(struct drbd_device *device, sector_t sectors, int set_new_bits); externvoid drbd_bm_cleanup(struct drbd_device *device); externvoid drbd_bm_set_all(struct drbd_device *device); externvoid drbd_bm_clear_all(struct drbd_device *device); /* set/clear/test only a few bits at a time */ externint drbd_bm_set_bits( struct drbd_device *device, unsignedlong s, unsignedlong e); externint drbd_bm_clear_bits( struct drbd_device *device, unsignedlong s, unsignedlong e); externint drbd_bm_count_bits( struct drbd_device *device, constunsignedlong s, constunsignedlong e); /* bm_set_bits variant for use while holding drbd_bm_lock,
* may process the whole bitmap in one go */ externvoid _drbd_bm_set_bits(struct drbd_device *device, constunsignedlong s, constunsignedlong e); externint drbd_bm_test_bit(struct drbd_device *device, unsignedlong bitnr); externint drbd_bm_e_weight(struct drbd_device *device, unsignedlong enr); externint drbd_bm_read(struct drbd_device *device, struct drbd_peer_device *peer_device) __must_hold(local); externvoid drbd_bm_mark_for_writeout(struct drbd_device *device, int page_nr); externint drbd_bm_write(struct drbd_device *device, struct drbd_peer_device *peer_device) __must_hold(local); externvoid drbd_bm_reset_al_hints(struct drbd_device *device) __must_hold(local); externint drbd_bm_write_hinted(struct drbd_device *device) __must_hold(local); externint drbd_bm_write_lazy(struct drbd_device *device, unsigned upper_idx) __must_hold(local); externint drbd_bm_write_all(struct drbd_device *device, struct drbd_peer_device *peer_device) __must_hold(local); externint drbd_bm_write_copy_pages(struct drbd_device *device, struct drbd_peer_device *peer_device) __must_hold(local); extern size_t drbd_bm_words(struct drbd_device *device); externunsignedlong drbd_bm_bits(struct drbd_device *device); extern sector_t drbd_bm_capacity(struct drbd_device *device);
#define DRBD_END_OF_BITMAP (~(unsignedlong)0) externunsignedlong drbd_bm_find_next(struct drbd_device *device, unsignedlong bm_fo); /* bm_find_next variants for use while you hold drbd_bm_lock() */ externunsignedlong _drbd_bm_find_next(struct drbd_device *device, unsignedlong bm_fo); externunsignedlong _drbd_bm_find_next_zero(struct drbd_device *device, unsignedlong bm_fo); externunsignedlong _drbd_bm_total_weight(struct drbd_device *device); externunsignedlong drbd_bm_total_weight(struct drbd_device *device); /* for receive_bitmap */ externvoid drbd_bm_merge_lel(struct drbd_device *device, size_t offset,
size_t number, unsignedlong *buffer); /* for _drbd_send_bitmap */ externvoid drbd_bm_get_lel(struct drbd_device *device, size_t offset,
size_t number, unsignedlong *buffer);
/* We also need a standard (emergency-reserve backed) page pool * for meta data IO (activity log, bitmap). * We can keep it global, as long as it is used as "N pages at a time". * 128 should be plenty, currently we probably can get away with as few as 1.
*/ #define DRBD_MIN_POOL_PAGES 128 extern mempool_t drbd_md_io_page_pool; extern mempool_t drbd_buffer_page_pool;
/* We also need to make sure we get a bio
* when we need it for housekeeping purposes */ externstruct bio_set drbd_md_io_bio_set;
/* And a bio_set for cloning */ externstruct bio_set drbd_io_bio_set;
rcu_read_lock();
ep = rcu_dereference(device->ldev->disk_conf)->on_io_error;
rcu_read_unlock(); switch (ep) { case EP_PASS_ON: /* FIXME would this be better named "Ignore"? */ if (df == DRBD_READ_ERROR || df == DRBD_WRITE_ERROR) { if (drbd_ratelimit())
drbd_err(device, "Local IO failed in %s.\n", where); if (device->state.disk > D_INCONSISTENT)
_drbd_set_state(_NS(device, disk, D_INCONSISTENT), CS_HARD, NULL); break;
}
fallthrough; /* for DRBD_META_IO_ERROR or DRBD_FORCE_DETACH */ case EP_DETACH: case EP_CALL_HELPER: /* Remember whether we saw a READ or WRITE error. * * Recovery of the affected area for WRITE failure is covered * by the activity log. * READ errors may fall outside that area though. Certain READ * errors can be "healed" by writing good data to the affected * blocks, which triggers block re-allocation in lower layers. * * If we can not write the bitmap after a READ error, * we may need to trigger a full sync (see w_go_diskless()). * * Force-detach is not really an IO error, but rather a * desperate measure to try to deal with a completely * unresponsive lower level IO stack. * Still it should be treated as a WRITE error. * * Meta IO error is always WRITE error: * we read meta data only once during attach, * which will fail in case of errors.
*/
set_bit(WAS_IO_ERROR, &device->flags); if (df == DRBD_READ_ERROR)
set_bit(WAS_READ_ERROR, &device->flags); if (df == DRBD_FORCE_DETACH)
set_bit(FORCE_DETACH, &device->flags); if (device->state.disk > D_FAILED) {
_drbd_set_state(_NS(device, disk, D_FAILED), CS_HARD, NULL);
drbd_err(device, "Local IO failed in %s. Detaching...\n", where);
} break;
}
}
/** * drbd_chk_io_error: Handle the on_io_error setting, should be called from all io completion handlers * @device: DRBD device. * @error: Error code passed to the IO completion callback * @forcedetach: Force detach. I.e. the error happened while accessing the meta data * * See also drbd_main.c:after_state_ch() if (os.disk > D_FAILED && ns.disk == D_FAILED)
*/ #define drbd_chk_io_error(m,e,f) drbd_chk_io_error_(m,e,f, __func__) staticinlinevoid drbd_chk_io_error_(struct drbd_device *device, int error, enum drbd_force_detach_flags forcedetach, constchar *where)
{ if (error) { unsignedlong flags;
spin_lock_irqsave(&device->resource->req_lock, flags);
__drbd_chk_io_error_(device, forcedetach, where);
spin_unlock_irqrestore(&device->resource->req_lock, flags);
}
}
/** * drbd_md_first_sector() - Returns the first sector number of the meta data area * @bdev: Meta data block device. * * BTW, for internal meta data, this happens to be the maximum capacity * we could agree upon with our peer node.
*/ staticinline sector_t drbd_md_first_sector(struct drbd_backing_dev *bdev)
{ switch (bdev->md.meta_dev_idx) { case DRBD_MD_INDEX_INTERNAL: case DRBD_MD_INDEX_FLEX_INT: return bdev->md.md_offset + bdev->md.bm_offset; case DRBD_MD_INDEX_FLEX_EXT: default: return bdev->md.md_offset;
}
}
/** * drbd_md_last_sector() - Return the last sector number of the meta data area * @bdev: Meta data block device.
*/ staticinline sector_t drbd_md_last_sector(struct drbd_backing_dev *bdev)
{ switch (bdev->md.meta_dev_idx) { case DRBD_MD_INDEX_INTERNAL: case DRBD_MD_INDEX_FLEX_INT: return bdev->md.md_offset + MD_4kB_SECT -1; case DRBD_MD_INDEX_FLEX_EXT: default: return bdev->md.md_offset + bdev->md.md_size_sect -1;
}
}
/* Returns the number of 512 byte sectors of the device */ staticinline sector_t drbd_get_capacity(struct block_device *bdev)
{ return bdev ? bdev_nr_sectors(bdev) : 0;
}
/** * drbd_get_max_capacity() - Returns the capacity we announce to out peer * @bdev: Meta data block device. * * returns the capacity we announce to out peer. we clip ourselves at the * various MAX_SECTORS, because if we don't, current implementation will * oops sooner or later
*/ staticinline sector_t drbd_get_max_capacity(struct drbd_backing_dev *bdev)
{
sector_t s;
switch (bdev->md.meta_dev_idx) { case DRBD_MD_INDEX_INTERNAL: case DRBD_MD_INDEX_FLEX_INT:
s = drbd_get_capacity(bdev->backing_bdev)
? min_t(sector_t, DRBD_MAX_SECTORS_FLEX,
drbd_md_first_sector(bdev))
: 0; break; case DRBD_MD_INDEX_FLEX_EXT:
s = min_t(sector_t, DRBD_MAX_SECTORS_FLEX,
drbd_get_capacity(bdev->backing_bdev)); /* clip at maximum size the meta device can support */
s = min_t(sector_t, s,
BM_EXT_TO_SECT(bdev->md.md_size_sect
- bdev->md.bm_offset)); break; default:
s = min_t(sector_t, DRBD_MAX_SECTORS,
drbd_get_capacity(bdev->backing_bdev));
} return s;
}
/** * drbd_md_ss() - Return the sector number of our meta data super block * @bdev: Meta data block device.
*/ staticinline sector_t drbd_md_ss(struct drbd_backing_dev *bdev)
{ constint meta_dev_idx = bdev->md.meta_dev_idx;
if (meta_dev_idx == DRBD_MD_INDEX_FLEX_EXT) return 0;
/* Since drbd08, internal meta data is always "flexible".
* position: last 4k aligned block of 4k size */ if (meta_dev_idx == DRBD_MD_INDEX_INTERNAL ||
meta_dev_idx == DRBD_MD_INDEX_FLEX_INT) return (drbd_get_capacity(bdev->backing_bdev) & ~7ULL) - 8;
/* external, some index; this is the old fixed size layout */ return MD_128MB_SECT * bdev->md.meta_dev_idx;
}
/* To get the ack_receiver out of the blocking network stack, * so it can change its sk_rcvtimeo from idle- to ping-timeout, * and send a ping, we need to send a signal.
* Which signal we send is irrelevant. */ staticinlinevoid wake_ack_receiver(struct drbd_connection *connection)
{
--> --------------------