/*
 * ocfs2_la_default_mb() - determine a default size, in megabytes, of
 * the local alloc.
 *
 * Generally, we'd like to pick as large a local alloc as
 * possible. Performance on large workloads tends to scale
 * proportionally to la size. In addition to that, the reservations
 * code functions more efficiently as it can reserve more windows for
 * write.
 *
 * Some things work against us when trying to choose a large local alloc:
 *
 * - We need to ensure our sizing is picked to leave enough space in
 *   group descriptors for other allocations (such as block groups,
 *   etc). Picking default sizes which are a multiple of 4 could help
 *   - block groups are allocated in 2mb and 4mb chunks.
 *
 * - Likewise, we don't want to starve other nodes of bits on small
 *   file systems. This can easily be taken care of by limiting our
 *   default to a reasonable size (256M) on larger cluster sizes.
 *
 * - Some file systems can't support very large sizes - 4k and 8k in
 *   particular are limited to less than 128 and 256 megabytes respectively.
 *
 * The following reference table shows group descriptor and local
 * alloc maximums at various cluster sizes (4k blocksize)
 *
 * csize: 4K	group: 126M	la: 121M
 * csize: 8K	group: 252M	la: 243M
 * csize: 16K	group: 504M	la: 486M
 * csize: 32K	group: 1008M	la: 972M
 * csize: 64K	group: 2016M	la: 1944M
 * csize: 128K	group: 4032M	la: 3888M
 * csize: 256K	group: 8064M	la: 7776M
 * csize: 512K	group: 16128M	la: 15552M
 * csize: 1024K	group: 32256M	la: 31104M
*/ #define OCFS2_LA_MAX_DEFAULT_MB 256 #define OCFS2_LA_OLD_DEFAULT 8 unsignedint ocfs2_la_default_mb(struct ocfs2_super *osb)
{ unsignedint la_mb; unsignedint gd_mb; unsignedint la_max_mb; unsignedint megs_per_slot; struct super_block *sb = osb->sb;
/*
 * This takes care of file systems with very small group
 * descriptors - 512 byte blocksize at cluster sizes lower
 * than 16K and also 1k blocksize with 4k cluster size.
*/ if ((sb->s_blocksize == 512 && osb->s_clustersize <= 8192)
|| (sb->s_blocksize == 1024 && osb->s_clustersize == 4096)) return OCFS2_LA_OLD_DEFAULT;
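/*
 * Leave enough room for some block groups and make the final
 * value we work from a multiple of 4.
 *
 * NOTE(review): the mask applied below, 0xFFFFFFFB, clears only
 * bit 2; rounding down to a multiple of 4 would need 0xFFFFFFFC.
 * Confirm which one is intended.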
*/
gd_mb -= 16;
gd_mb &= 0xFFFFFFFB;
la_mb = gd_mb;
/* * Keep window sizes down to a reasonable default
*/ if (la_mb > OCFS2_LA_MAX_DEFAULT_MB) { /* * Some clustersize / blocksize combinations will have * given us a larger than OCFS2_LA_MAX_DEFAULT_MB * default size, but get poor distribution when * limited to exactly 256 megabytes. * * As an example, 16K clustersize at 4K blocksize * gives us a cluster group size of 504M. Paring the * local alloc size down to 256 however, would give us * only one window and around 200MB left in the * cluster group. Instead, find the first size below * 256 which would give us an even distribution. * * Larger cluster group sizes actually work out pretty * well when pared to 256, so we don't have to do this * for any group that fits more than two * OCFS2_LA_MAX_DEFAULT_MB windows.
*/ if (gd_mb > (2 * OCFS2_LA_MAX_DEFAULT_MB))
la_mb = 256; else { unsignedint gd_mult = gd_mb;
while (gd_mult > 256)
gd_mult = gd_mult >> 1;
la_mb = gd_mult;
}
}
megs_per_slot = osb->osb_clusters_at_boot / osb->max_slots;
megs_per_slot = ocfs2_clusters_to_megabytes(osb->sb, megs_per_slot); /* Too many nodes, too few disk clusters. */ if (megs_per_slot < la_mb)
la_mb = megs_per_slot;
/* We can't store more bits than we can in a block. */
la_max_mb = ocfs2_clusters_to_megabytes(osb->sb,
ocfs2_local_alloc_size(sb) * 8); if (la_mb > la_max_mb)
la_mb = la_max_mb;
if (requested_mb == -1) { /* No user request - use defaults */
osb->local_alloc_default_bits =
ocfs2_megabytes_to_clusters(sb, la_default_mb);
} elseif (requested_mb > la_max_mb) { /* Request is too big, we give the maximum available */
osb->local_alloc_default_bits =
ocfs2_megabytes_to_clusters(sb, la_max_mb);
} else {
osb->local_alloc_default_bits =
ocfs2_megabytes_to_clusters(sb, requested_mb);
}
/* * Tell us whether a given allocation should use the local alloc * file. Otherwise, it has to go to the main bitmap. * * This function does semi-dirty reads of local alloc size and state! * This is ok however, as the values are re-checked once under mutex.
*/ int ocfs2_alloc_should_use_local(struct ocfs2_super *osb, u64 bits)
{ int ret = 0; int la_bits;
/* la_bits should be at least twice the size (in clusters) of * a new block group. We want to be sure block group * allocations go through the local alloc, so allow an
* allocation to take up to half the bitmap. */ if (bits > (la_bits / 2)) goto bail;
int ocfs2_load_local_alloc(struct ocfs2_super *osb)
{ int status = 0; struct ocfs2_dinode *alloc = NULL; struct buffer_head *alloc_bh = NULL;
u32 num_used; struct inode *inode = NULL; struct ocfs2_local_alloc *la;
if (osb->local_alloc_bits == 0) goto bail;
if (osb->local_alloc_bits >= osb->bitmap_cpg) {
mlog(ML_NOTICE, "Requested local alloc window %d is larger " "than max possible %u. Using defaults.\n",
osb->local_alloc_bits, (osb->bitmap_cpg - 1));
osb->local_alloc_bits =
ocfs2_megabytes_to_clusters(osb->sb,
ocfs2_la_default_mb(osb));
}
/* read the alloc off disk */
inode = ocfs2_get_system_file_inode(osb, LOCAL_ALLOC_SYSTEM_INODE,
osb->slot_num); if (!inode) {
status = -EINVAL;
mlog_errno(status); goto bail;
}
status = ocfs2_read_inode_block_full(inode, &alloc_bh,
OCFS2_BH_IGNORE_CACHE); if (status < 0) {
mlog_errno(status); goto bail;
}
alloc = (struct ocfs2_dinode *) alloc_bh->b_data;
la = OCFS2_LOCAL_ALLOC(alloc);
if (!(le32_to_cpu(alloc->i_flags) &
(OCFS2_LOCAL_ALLOC_FL|OCFS2_BITMAP_FL))) {
mlog(ML_ERROR, "Invalid local alloc inode, %llu\n",
(unsignedlonglong)OCFS2_I(inode)->ip_blkno);
status = -EINVAL; goto bail;
}
if ((la->la_size == 0) ||
(le16_to_cpu(la->la_size) > ocfs2_local_alloc_size(inode->i_sb))) {
mlog(ML_ERROR, "Local alloc size is invalid (la_size = %u)\n",
le16_to_cpu(la->la_size));
status = -EINVAL; goto bail;
}
/* do a little verification. */
num_used = ocfs2_local_alloc_count_bits(alloc);
/* hopefully the local alloc has always been recovered before
* we load it. */ if (num_used
|| alloc->id1.bitmap1.i_used
|| alloc->id1.bitmap1.i_total
|| la->la_bm_off) {
mlog(ML_ERROR, "inconsistent detected, clean journal with" " unrecovered local alloc, please run fsck.ocfs2!\n" "found = %u, set = %u, taken = %u, off = %u\n",
num_used, le32_to_cpu(alloc->id1.bitmap1.i_used),
le32_to_cpu(alloc->id1.bitmap1.i_total),
le32_to_cpu(OCFS2_LOCAL_ALLOC(alloc)->la_bm_off));
/* * return any unused bits to the bitmap and write out a clean * local_alloc. * * local_alloc_bh is optional. If not passed, we will simply use the * one off osb. If you do pass it however, be warned that it *will* be
* returned brelse'd and NULL'd out.*/ void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb)
{ int status;
handle_t *handle; struct inode *local_alloc_inode = NULL; struct buffer_head *bh = NULL; struct buffer_head *main_bm_bh = NULL; struct inode *main_bm_inode = NULL; struct ocfs2_dinode *alloc_copy = NULL; struct ocfs2_dinode *alloc = NULL;
cancel_delayed_work(&osb->la_enable_wq); if (osb->ocfs2_wq)
flush_workqueue(osb->ocfs2_wq);
if (osb->local_alloc_state == OCFS2_LA_UNUSED) goto out;
local_alloc_inode =
ocfs2_get_system_file_inode(osb,
LOCAL_ALLOC_SYSTEM_INODE,
osb->slot_num); if (!local_alloc_inode) {
status = -ENOENT;
mlog_errno(status); goto out;
}
osb->local_alloc_state = OCFS2_LA_DISABLED;
ocfs2_resmap_uninit(&osb->osb_la_resmap);
main_bm_inode = ocfs2_get_system_file_inode(osb,
GLOBAL_BITMAP_SYSTEM_INODE,
OCFS2_INVALID_SLOT); if (!main_bm_inode) {
status = -EINVAL;
mlog_errno(status); goto out;
}
inode_lock(main_bm_inode);
status = ocfs2_inode_lock(main_bm_inode, &main_bm_bh, 1); if (status < 0) {
mlog_errno(status); goto out_mutex;
}
/* WINDOW_MOVE_CREDITS is a bit heavy... */
handle = ocfs2_start_trans(osb, OCFS2_WINDOW_MOVE_CREDITS); if (IS_ERR(handle)) {
mlog_errno(PTR_ERR(handle));
handle = NULL; goto out_unlock;
}
/*
 * We want to free the bitmap bits outside of any recovery context as
 * we'll need a cluster lock to do so, but we must clear the local
 * alloc before giving up the recovered node's journal. To solve this,
 * we kmalloc a copy of the local alloc before it's changed, for the
 * caller to process with ocfs2_complete_local_alloc_recovery().
*/ int ocfs2_begin_local_alloc_recovery(struct ocfs2_super *osb, int slot_num, struct ocfs2_dinode **alloc_copy)
{ int status = 0; struct buffer_head *alloc_bh = NULL; struct inode *inode = NULL; struct ocfs2_dinode *alloc;
trace_ocfs2_begin_local_alloc_recovery(slot_num);
*alloc_copy = NULL;
inode = ocfs2_get_system_file_inode(osb,
LOCAL_ALLOC_SYSTEM_INODE,
slot_num); if (!inode) {
status = -EINVAL;
mlog_errno(status); goto bail;
}
inode_lock(inode);
status = ocfs2_read_inode_block_full(inode, &alloc_bh,
OCFS2_BH_IGNORE_CACHE); if (status < 0) {
mlog_errno(status); goto bail;
}
*alloc_copy = kmalloc(alloc_bh->b_size, GFP_KERNEL); if (!(*alloc_copy)) {
status = -ENOMEM; goto bail;
}
memcpy((*alloc_copy), alloc_bh->b_data, alloc_bh->b_size);
/*
 * Step 2 of local alloc recovery.
 *
 * By now the journal recovery is complete, a clean local alloc has
 * been stamped on disk and the node has been dropped out of the
 * recovery map. Dlm locks will no longer stall, so let's clear out the
 * main bitmap.
 *
 * @osb:   ocfs2 super block info
 * @alloc: local alloc dinode handed on to ocfs2_sync_local_to_main()
 *
 * Takes inode_lock() and then the cluster lock on the global bitmap
 * inode, syncs @alloc to the main bitmap inside a synchronous
 * transaction and releases everything in reverse order.
 *
 * Returns 0 on success, in which case ocfs2_init_steal_slots() is
 * called, or a non-zero status (logged) on failure.
 */
int ocfs2_complete_local_alloc_recovery(struct ocfs2_super *osb,
					struct ocfs2_dinode *alloc)
{
	struct inode *bitmap_inode;
	struct buffer_head *bitmap_bh = NULL;
	handle_t *trans;
	int ret;

	bitmap_inode = ocfs2_get_system_file_inode(osb,
						   GLOBAL_BITMAP_SYSTEM_INODE,
						   OCFS2_INVALID_SLOT);
	if (bitmap_inode == NULL) {
		ret = -EINVAL;
		mlog_errno(ret);
		goto done;
	}

	inode_lock(bitmap_inode);

	ret = ocfs2_inode_lock(bitmap_inode, &bitmap_bh, 1);
	if (ret < 0) {
		mlog_errno(ret);
		goto drop_inode_lock;
	}

	trans = ocfs2_start_trans(osb, OCFS2_WINDOW_MOVE_CREDITS);
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
		mlog_errno(ret);
		goto drop_cluster_lock;
	}

	/* The bitmap change should hit the disk as soon as possible. */
	trans->h_sync = 1;

	ret = ocfs2_sync_local_to_main(osb, trans, alloc,
				       bitmap_inode, bitmap_bh);
	if (ret < 0)
		mlog_errno(ret);

	/* Commit even when the sync failed; ret carries the sync result. */
	ocfs2_commit_trans(osb, trans);

drop_cluster_lock:
	ocfs2_inode_unlock(bitmap_inode, 1);

drop_inode_lock:
	inode_unlock(bitmap_inode);
	brelse(bitmap_bh);
	iput(bitmap_inode);

done:
	if (ret)
		mlog_errno(ret);
	else
		ocfs2_init_steal_slots(osb);
	return ret;
}
/* * make sure we've got at least bits_wanted contiguous bits in the * local alloc. You lose them when you drop i_rwsem. * * We will add ourselves to the transaction passed in, but may start * our own in order to shift windows.
*/ int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb,
u32 bits_wanted, struct ocfs2_alloc_context *ac)
{ int status; struct ocfs2_dinode *alloc; struct inode *local_alloc_inode; unsignedint free_bits;
BUG_ON(!ac);
local_alloc_inode =
ocfs2_get_system_file_inode(osb,
LOCAL_ALLOC_SYSTEM_INODE,
osb->slot_num); if (!local_alloc_inode) {
status = -ENOENT;
mlog_errno(status); goto bail;
}
inode_lock(local_alloc_inode);
/* * We must double check state and allocator bits because * another process may have changed them while holding i_rwsem.
*/
spin_lock(&osb->osb_lock); if (!ocfs2_la_state_enabled(osb) ||
(bits_wanted > osb->local_alloc_bits)) {
spin_unlock(&osb->osb_lock);
status = -ENOSPC; goto bail;
}
spin_unlock(&osb->osb_lock);
#ifdef CONFIG_OCFS2_DEBUG_FS if (le32_to_cpu(alloc->id1.bitmap1.i_used) !=
ocfs2_local_alloc_count_bits(alloc)) {
status = ocfs2_error(osb->sb, "local alloc inode %llu says it has %u used bits, but a count shows %u\n",
(unsignedlonglong)le64_to_cpu(alloc->i_blkno),
le32_to_cpu(alloc->id1.bitmap1.i_used),
ocfs2_local_alloc_count_bits(alloc)); goto bail;
} #endif
free_bits = le32_to_cpu(alloc->id1.bitmap1.i_total) -
le32_to_cpu(alloc->id1.bitmap1.i_used); if (bits_wanted > free_bits) { /* uhoh, window change time. */
status =
ocfs2_local_alloc_slide_window(osb, local_alloc_inode); if (status < 0) { if (status != -ENOSPC)
mlog_errno(status); goto bail;
}
/* * Under certain conditions, the window slide code * might have reduced the number of bits available or * disabled the local alloc entirely. Re-check * here and return -ENOSPC if necessary.
*/
status = -ENOSPC; if (!ocfs2_la_state_enabled(osb)) goto bail;
ac->ac_inode = local_alloc_inode; /* We should never use localalloc from another slot */
ac->ac_alloc_slot = osb->slot_num;
ac->ac_which = OCFS2_AC_USE_LOCAL;
get_bh(osb->local_alloc_bh);
ac->ac_bh = osb->local_alloc_bh;
status = 0;
bail: if (status < 0 && local_alloc_inode) {
inode_unlock(local_alloc_inode);
iput(local_alloc_inode);
}
/* * Code error. While reservations are enabled, local * allocation should _always_ go through them.
*/
BUG_ON(osb->osb_resv_level != 0);
/* * Reservations are disabled. Handle this the old way.
*/
bitmap = OCFS2_LOCAL_ALLOC(alloc)->la_bitmap;
numfound = bitoff = startoff = 0;
left = le32_to_cpu(alloc->id1.bitmap1.i_total); while ((bitoff = ocfs2_find_next_zero_bit(bitmap, left, startoff)) <
left) { /* Ok, we found a zero bit... is it contig. or do we
* start over?*/ if (bitoff == startoff) { /* we found a zero */
numfound++;
startoff++;
} else { /* got a zero after some ones */
numfound = 1;
startoff = bitoff+1;
} /* we got everything we needed */ if (numfound == *numbits) { /* mlog(0, "Found it all!\n"); */ break;
}
}
#if 0
/*
 * Turn this on and uncomment below to aid debugging window shifts.
 *
 * Walks bits start .. start + count - 1 of @bitmap, highest offset
 * first. For the first bit that ocfs2_test_bit() reports as set it
 * prints the range and the offending bit number, then calls BUG().
 *
 * @bitmap: bitmap to inspect
 * @start:  offset of the first bit to check
 * @count:  number of bits to check
 */
static void ocfs2_verify_zero_bits(unsigned long *bitmap,
				   unsigned int start,
				   unsigned int count)
{
	unsigned int tmp = count;

	while (tmp--) {
		if (ocfs2_test_bit(start + tmp, bitmap)) {
			printk("ocfs2_verify_zero_bits: start = %u, count = "
			       "%u\n", start, count);
			printk("ocfs2_verify_zero_bits: bit %u is set!",
			       start + tmp);
			BUG();
		}
	}
}
#endif
/*
 * sync the local alloc to main bitmap.
 *
 * assumes you've already locked the main bitmap -- the bitmap inode
 * passed is used for caching.
 *
 * Returns status, logging it through mlog_errno() when it is non-zero.
 *
 * NOTE(review): no statements are present between the local
 * declarations and the bail label here, so as written this function
 * only returns 0 and never touches the bitmap. The locals are kept
 * because the syncing logic looks to have been dropped from this copy
 * -- confirm against the full source.
 */
static int ocfs2_sync_local_to_main(struct ocfs2_super *osb,
				    handle_t *handle,
				    struct ocfs2_dinode *alloc,
				    struct inode *main_bm_inode,
				    struct buffer_head *main_bm_bh)
{
	int status = 0;
	int bit_off, left, count, start;
	u64 la_start_blk;
	u64 blkno;
	void *bitmap;
	struct ocfs2_local_alloc *la = OCFS2_LOCAL_ALLOC(alloc);

bail:
	if (status)
		mlog_errno(status);
	return status;
}
/*
 * Events passed to ocfs2_recalc_la_window() when the size of the next
 * local alloc window is being worked out.
 */
enum ocfs2_la_event {
	OCFS2_LA_EVENT_SLIDE,		/* Normal window slide. */
	OCFS2_LA_EVENT_FRAGMENTED,	/* The global bitmap has
					 * enough bits theoretically
					 * free, but a contiguous
					 * allocation could not be
					 * found. */
	OCFS2_LA_EVENT_ENOSPC,		/* Global bitmap doesn't have
					 * enough bits free to satisfy
					 * our request. */
};

/*
 * Delay handed to queue_delayed_work() for la_enable_wq. The directive
 * must start its own line to be seen by the preprocessor.
 */
#define OCFS2_LA_ENABLE_INTERVAL (30 * HZ)

/*
 * Given an event, calculate the size of our next local alloc window.
 *
 * This should always be called under i_rwsem of the local alloc inode
 * so that local alloc disabling doesn't race with processes trying to
 * use the allocator.
 *
 * Returns the state which the local alloc was left in. This value can
 * be ignored by some paths.
*/ staticint ocfs2_recalc_la_window(struct ocfs2_super *osb, enum ocfs2_la_event event)
{ unsignedint bits; int state;
/* * ENOSPC and fragmentation are treated similarly for now.
*/ if (event == OCFS2_LA_EVENT_ENOSPC ||
event == OCFS2_LA_EVENT_FRAGMENTED) { /* * We ran out of contiguous space in the primary * bitmap. Drastically reduce the number of bits used * by local alloc until we have to disable it.
*/
bits = osb->local_alloc_bits >> 1; if (bits > ocfs2_megabytes_to_clusters(osb->sb, 1)) { /* * By setting state to THROTTLED, we'll keep * the number of local alloc bits used down * until an event occurs which would give us * reason to assume the bitmap situation might * have changed.
*/
osb->local_alloc_state = OCFS2_LA_THROTTLED;
osb->local_alloc_bits = bits;
} else {
osb->local_alloc_state = OCFS2_LA_DISABLED;
}
queue_delayed_work(osb->ocfs2_wq, &osb->la_enable_wq,
OCFS2_LA_ENABLE_INTERVAL); goto out_unlock;
}
/* * Don't increase the size of the local alloc window until we * know we might be able to fulfill the request. Otherwise, we * risk bouncing around the global bitmap during periods of * low space.
*/ if (osb->local_alloc_state != OCFS2_LA_THROTTLED)
osb->local_alloc_bits = osb->local_alloc_default_bits;
out_unlock:
state = osb->local_alloc_state;
spin_unlock(&osb->osb_lock);
/* * pass it the bitmap lock in lock_bh if you have it.
*/ staticint ocfs2_local_alloc_new_window(struct ocfs2_super *osb,
handle_t *handle, struct ocfs2_alloc_context *ac)
{ int status = 0;
u32 cluster_off, cluster_count; struct ocfs2_dinode *alloc = NULL; struct ocfs2_local_alloc *la;
alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data;
la = OCFS2_LOCAL_ALLOC(alloc);
/* Instruct the allocation code to try the most recently used * cluster group. We'll re-record the group used this pass
* below. */
ac->ac_last_group = osb->la_last_gd;
/* we used the generic suballoc reserve function, but we set * everything up nicely, so there's no reason why we can't use
* the more specific cluster api to claim bits. */
status = ocfs2_claim_clusters(handle, ac, osb->local_alloc_bits,
&cluster_off, &cluster_count); if (status == -ENOSPC) {
retry_enospc: /* * Note: We could also try syncing the journal here to * allow use of any free bits which the current * transaction can't give us access to. --Mark
*/ if (ocfs2_recalc_la_window(osb, OCFS2_LA_EVENT_FRAGMENTED) ==
OCFS2_LA_DISABLED) goto bail;
ac->ac_bits_wanted = osb->local_alloc_bits;
status = ocfs2_claim_clusters(handle, ac,
osb->local_alloc_bits,
&cluster_off,
&cluster_count); if (status == -ENOSPC) goto retry_enospc; /* * We only shrunk the *minimum* number of in our * request - it's entirely possible that the allocator * might give us more than we asked for.
*/ if (status == 0) {
spin_lock(&osb->osb_lock);
osb->local_alloc_bits = cluster_count;
spin_unlock(&osb->osb_lock);
}
} if (status < 0) { if (status != -ENOSPC)
mlog_errno(status); goto bail;
}
osb->la_last_gd = ac->ac_last_group;
la->la_bm_off = cpu_to_le32(cluster_off);
alloc->id1.bitmap1.i_total = cpu_to_le32(cluster_count); /* just in case... In the future when we find space ourselves, * we don't have to get all contiguous -- but we'll have to * set all previously used bits in bitmap and update
* la_bits_set before setting the bits in the main bitmap. */
alloc->id1.bitmap1.i_used = 0;
memset(OCFS2_LOCAL_ALLOC(alloc)->la_bitmap, 0,
le16_to_cpu(la->la_size));
/* This will lock the main bitmap for us. */
status = ocfs2_local_alloc_reserve_for_window(osb,
&ac,
&main_bm_inode,
&main_bm_bh); if (status < 0) { if (status != -ENOSPC)
mlog_errno(status); goto bail;
}
handle = ocfs2_start_trans(osb, OCFS2_WINDOW_MOVE_CREDITS); if (IS_ERR(handle)) {
status = PTR_ERR(handle);
handle = NULL;
mlog_errno(status); goto bail;
}
/* We want to clear the local alloc before doing anything * else, so that if we error later during this operation, * local alloc shutdown won't try to double free main bitmap * bits. Make a copy so the sync function knows which bits to
* free. */
alloc_copy = kmemdup(alloc, osb->local_alloc_bh->b_size, GFP_NOFS); if (!alloc_copy) {
status = -ENOMEM;
mlog_errno(status); goto bail;
}
status = ocfs2_journal_access_di(handle,
INODE_CACHE(local_alloc_inode),
osb->local_alloc_bh,
OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) {
mlog_errno(status); goto bail;
}