/* * zsmalloc memory allocator * * Copyright (C) 2011 Nitin Gupta * Copyright (C) 2012, 2013 Minchan Kim * * This code is released using a dual license strategy: BSD/GPL * You can choose the license that better fits your requirements. * * Released under the terms of 3-clause BSD License * Released under the terms of GNU General Public License Version 2.0
*/
/* * This must be power of 2 and greater than or equal to sizeof(link_free). * These two conditions ensure that any 'struct link_free' itself doesn't * span more than 1 page which avoids complex case of mapping 2 pages simply * to restore link_free pointer values.
*/ #define ZS_ALIGN 8
/* Size of the handle stored at the head of each allocated object. */
#define ZS_HANDLE_SIZE (sizeof(unsigned long))
/* * Object location (<PFN>, <obj_idx>) is encoded as * a single (unsigned long) handle value. * * Note that object index <obj_idx> starts from 0. * * This is made more complicated by various memory models and PAE.
*/
#ifndef MAX_POSSIBLE_PHYSMEM_BITS #ifdef MAX_PHYSMEM_BITS #define MAX_POSSIBLE_PHYSMEM_BITS MAX_PHYSMEM_BITS #else /* * If this definition of MAX_PHYSMEM_BITS is used, OBJ_INDEX_BITS will just * be PAGE_SHIFT
*/ #define MAX_POSSIBLE_PHYSMEM_BITS BITS_PER_LONG #endif #endif
/* * The head of an allocated object should have OBJ_ALLOCATED_TAG set * to identify whether the object was allocated or not. * It's okay to put the status bit in the least significant bit because * the header keeps a handle, which is a 4-byte-aligned address, so we * have room for at least two bits.
*/ #define OBJ_ALLOCATED_TAG 1
/* ZS_MIN_ALLOC_SIZE must be multiple of ZS_ALIGN */ #define ZS_MIN_ALLOC_SIZE \
MAX(32, (ZS_MAX_PAGES_PER_ZSPAGE << PAGE_SHIFT >> OBJ_INDEX_BITS)) /* each chunk includes extra space to keep handle */ #define ZS_MAX_ALLOC_SIZE PAGE_SIZE
/* * On systems with 4K page size, this gives 255 size classes! There is a * trade-off here: * - Large number of size classes is potentially wasteful as free pages are * spread across these classes * - Small number of size classes causes large internal fragmentation * - Probably it's better to use specific size classes (empirically * determined). NOTE: all those class sizes must be set as multiple of * ZS_ALIGN to make sure link_free itself never has to span 2 pages. * * ZS_MIN_ALLOC_SIZE and ZS_SIZE_CLASS_DELTA must be multiple of ZS_ALIGN * (reason above)
*/ #define ZS_SIZE_CLASS_DELTA (PAGE_SIZE >> CLASS_BITS) #define ZS_SIZE_CLASSES (DIV_ROUND_UP(ZS_MAX_ALLOC_SIZE - ZS_MIN_ALLOC_SIZE, \
ZS_SIZE_CLASS_DELTA) + 1)
/* * Pages are distinguished by the ratio of used memory (that is the ratio * of ->inuse objects to all objects that page can store). For example, * INUSE_RATIO_10 means that the ratio of used objects is > 0% and <= 10%. * * The number of fullness groups is not random. It allows us to keep * difference between the least busy page in the group (minimum permitted * number of ->inuse objects) and the most busy page (maximum permitted * number of ->inuse objects) at a reasonable value.
*/ enum fullness_group {
ZS_INUSE_RATIO_0,
ZS_INUSE_RATIO_10, /* NOTE: 8 more fullness groups here */
ZS_INUSE_RATIO_99 = 10,
ZS_INUSE_RATIO_100,
NR_FULLNESS_GROUPS,
};
enum class_stat_type { /* NOTE: stats for 12 fullness groups here: from inuse 0 to 100 */
ZS_OBJS_ALLOCATED = NR_FULLNESS_GROUPS,
ZS_OBJS_INUSE,
NR_CLASS_STAT_TYPES,
};
struct size_class {
	spinlock_t lock;
	struct list_head fullness_list[NR_FULLNESS_GROUPS];
	/*
	 * Size of objects stored in this class. Must be multiple
	 * of ZS_ALIGN.
	 */
	int size;
	int objs_per_zspage;
	/* Number of PAGE_SIZE sized pages to combine to form a 'zspage' */
	int pages_per_zspage;

	/*
	 * Index in zs_pool->size_class[]; merged classes alias one entry,
	 * so index may differ from the array slot (see can_merge() users).
	 */
	unsigned int index;
	struct zs_size_stat stats;
};
/* * Placed within free objects to form a singly linked list. * For every zspage, zspage->freeobj gives head of this list. * * This must be power of 2 and less than or equal to ZS_ALIGN
*/ struct link_free { union { /* * Free object index; * It's valid for non-allocated object
*/ unsignedlong next; /* * Handle of allocated object.
*/ unsignedlong handle;
};
};
/* * The zspage lock can be held from atomic contexts, but it needs to remain * preemptible when held for reading because it remains held outside of those * atomic contexts, otherwise we unnecessarily lose preemptibility. * * To achieve this, the following rules are enforced on readers and writers: * * - Writers are blocked by both writers and readers, while readers are only * blocked by writers (i.e. normal rwlock semantics). * * - Writers are always atomic (to allow readers to spin waiting for them). * * - Writers always use trylock (as the lock may be held by sleeping readers). * * - Readers may spin on the lock (as they can only wait for atomic writers). * * - Readers may sleep while holding the lock (as writes only use trylock).
*/ staticvoid zspage_read_lock(struct zspage *zspage)
{ struct zspage_lock *zsl = &zspage->zsl;
/* zpool glue: create a zsmalloc pool on behalf of a zpool caller. */
static void *zs_zpool_create(const char *name, gfp_t gfp)
{
	/*
	 * Ignore global gfp flags: zs_malloc() may be invoked from
	 * different contexts and its caller must provide a valid
	 * gfp mask.
	 */
	return zs_create_pool(name);
}
/* * zsmalloc divides the pool into various size classes where each * class maintains a list of zspages where each zspage is divided * into equal sized chunks. Each allocation falls into one of these * classes depending on its size. This function returns index of the * size class which has chunk size big enough to hold the given size.
*/
static int get_size_class_index(int size)
{
	int idx = 0;

	if (likely(size > ZS_MIN_ALLOC_SIZE))
		idx = DIV_ROUND_UP(size - ZS_MIN_ALLOC_SIZE,
				   ZS_SIZE_CLASS_DELTA);

	/*
	 * NOTE(review): this return was missing in this copy (the function
	 * fell off the end). Clamp to the last class so oversized requests
	 * still map to a valid index — restored per upstream; verify.
	 */
	return min_t(int, ZS_SIZE_CLASSES - 1, idx);
}
/* * For each size class, zspages are divided into different groups * depending on their usage ratio. This function returns fullness * status of the given page.
*/
static int get_fullness_group(struct size_class *class, struct zspage *zspage)
{
	int inuse, objs_per_zspage, ratio;

	/*
	 * NOTE(review): these two initializers were missing in this copy,
	 * leaving 'inuse' and 'objs_per_zspage' read uninitialized below
	 * (undefined behavior). Restored per upstream — verify.
	 */
	inuse = get_zspage_inuse(zspage);
	objs_per_zspage = class->objs_per_zspage;

	if (inuse == 0)
		return ZS_INUSE_RATIO_0;
	if (inuse == objs_per_zspage)
		return ZS_INUSE_RATIO_100;

	ratio = 100 * inuse / objs_per_zspage;
	/*
	 * Take integer division into consideration: a page with one inuse
	 * object out of 127 possible, will end up having 0 usage ratio,
	 * which is wrong as it belongs in ZS_INUSE_RATIO_10 fullness group.
	 */
	return ratio / 10 + 1;
}
/* * Each size class maintains various freelists and zspages are assigned * to one of these freelists based on the number of live objects they * have. This functions inserts the given zspage into the freelist * identified by <class, fullness_group>.
*/
static void insert_zspage(struct size_class *class,
			  struct zspage *zspage,
			  int fullness)
{
	/* Account the zspage in its new group, then link it in. */
	class_stat_add(class, fullness, 1);
	list_add(&zspage->list, &class->fullness_list[fullness]);
	zspage->fullness = fullness;
}
/* * This function removes the given zspage from the freelist identified * by <class, fullness_group>.
*/
static void remove_zspage(struct size_class *class, struct zspage *zspage)
{
	int fullness = zspage->fullness;

	/*
	 * NOTE(review): everything after the 'fullness' declaration was
	 * missing in this copy; restored per the upstream implementation —
	 * verify against it.
	 */
	VM_BUG_ON(list_empty(&class->fullness_list[fullness]));

	list_del(&zspage->list);
	class_stat_sub(class, fullness, 1);
}
/* * Each size class maintains zspages in different fullness groups depending * on the number of live objects they contain. When allocating or freeing * objects, the fullness status of the page can change, for instance, from * INUSE_RATIO_80 to INUSE_RATIO_70 when freeing an object. This function * checks if such a status change has occurred for the given page and * accordingly moves the page from the list of the old fullness group to that * of the new fullness group.
*/
static int fix_fullness_group(struct size_class *class, struct zspage *zspage)
{
	int newfg;

	newfg = get_fullness_group(class, zspage);
	if (newfg == zspage->fullness)
		goto out;

	/*
	 * NOTE(review): the move between fullness lists was missing in this
	 * copy; restored per the upstream implementation — verify against it.
	 */
	remove_zspage(class, zspage);
	insert_zspage(class, zspage, newfg);
out:
	return newfg;
}
/* * Since zs_free couldn't be sleepable, this function cannot call * lock_page. The page locks trylock_zspage got will be released * by __free_zspage.
*/ if (!trylock_zspage(zspage)) {
kick_deferred_free(pool); return;
}
vaddr = kmap_local_zpdesc(zpdesc);
link = (struct link_free *)vaddr + off / sizeof(*link);
while ((off += class->size) < PAGE_SIZE) {
link->next = freeobj++ << OBJ_TAG_BITS;
link += class->size / sizeof(*link);
}
/* * We now come to the last (full or partial) object on this * page, which must point to the first object on the next * page (if present)
*/
next_zpdesc = get_next_zpdesc(zpdesc); if (next_zpdesc) {
link->next = freeobj++ << OBJ_TAG_BITS;
} else { /* * Reset OBJ_TAG_BITS bit to last link to tell * whether it's allocated object or not.
*/
link->next = -1UL << OBJ_TAG_BITS;
}
kunmap_local(vaddr);
zpdesc = next_zpdesc;
off %= PAGE_SIZE;
}
/* * Allocate individual pages and link them together as: * 1. all pages are linked together using zpdesc->next * 2. each sub-page point to zspage using zpdesc->zspage * * we set PG_private to identify the first zpdesc (i.e. no other zpdesc * has this flag set).
*/ for (i = 0; i < nr_zpdescs; i++) {
zpdesc = zpdescs[i];
zpdesc->zspage = zspage;
zpdesc->next = NULL; if (i == 0) {
zspage->first_zpdesc = zpdesc;
zpdesc_set_first(zpdesc); if (unlikely(class->objs_per_zspage == 1 &&
class->pages_per_zspage == 1))
SetZsHugePage(zspage);
} else {
prev_zpdesc->next = zpdesc;
}
prev_zpdesc = zpdesc;
}
}
/* * Allocate a zspage for the given size class
*/ staticstruct zspage *alloc_zspage(struct zs_pool *pool, struct size_class *class,
gfp_t gfp, constint nid)
{ int i; struct zpdesc *zpdescs[ZS_MAX_PAGES_PER_ZSPAGE]; struct zspage *zspage = cache_alloc_zspage(pool, gfp);
if (!zspage) return NULL;
if (!IS_ENABLED(CONFIG_COMPACTION))
gfp &= ~__GFP_MOVABLE;
for (i = ZS_INUSE_RATIO_99; i >= ZS_INUSE_RATIO_0; i--) {
zspage = list_first_entry_or_null(&class->fullness_list[i], struct zspage, list); if (zspage) break;
}
return zspage;
}
staticbool can_merge(struct size_class *prev, int pages_per_zspage, int objs_per_zspage)
{ if (prev->pages_per_zspage == pages_per_zspage &&
prev->objs_per_zspage == objs_per_zspage) returntrue;
/** * zs_lookup_class_index() - Returns index of the zsmalloc &size_class * that hold objects of the provided size. * @pool: zsmalloc pool to use * @size: object size * * Context: Any context. * * Return: the index of the zsmalloc &size_class that hold objects of the * provided size.
*/ unsignedint zs_lookup_class_index(struct zs_pool *pool, unsignedint size)
{ struct size_class *class;
class = pool->size_class[get_size_class_index(size)];
/* Guarantee we can get zspage from handle safely */
read_lock(&pool->lock);
obj = handle_to_obj(handle);
obj_to_location(obj, &zpdesc, &obj_idx);
zspage = get_zspage(zpdesc);
/* Make sure migration doesn't move any pages in this zspage */
zspage_read_lock(zspage);
read_unlock(&pool->lock);
class = zspage_class(pool, zspage);
off = offset_in_page(class->size * obj_idx);
if (off + class->size <= PAGE_SIZE) { /* this object is contained entirely within a page */
addr = kmap_local_zpdesc(zpdesc);
addr += off;
} else {
size_t sizes[2];
/* this object spans two pages */
sizes[0] = PAGE_SIZE - off;
sizes[1] = class->size - sizes[0];
addr = local_copy;
/* Guarantee we can get zspage from handle safely */
read_lock(&pool->lock);
obj = handle_to_obj(handle);
obj_to_location(obj, &zpdesc, &obj_idx);
zspage = get_zspage(zpdesc);
/* Make sure migration doesn't move any pages in this zspage */
zspage_read_lock(zspage);
read_unlock(&pool->lock);
class = zspage_class(pool, zspage);
off = offset_in_page(class->size * obj_idx);
if (!ZsHugePage(zspage))
off += ZS_HANDLE_SIZE;
if (off + mem_len <= PAGE_SIZE) { /* this object is contained entirely within a page */ void *dst = kmap_local_zpdesc(zpdesc);
memcpy(dst + off, handle_mem, mem_len);
kunmap_local(dst);
} else { /* this object spans two pages */
size_t sizes[2];
/** * zs_huge_class_size() - Returns the size (in bytes) of the first huge * zsmalloc &size_class. * @pool: zsmalloc pool to use * * The function returns the size of the first huge class - any object of equal * or bigger size will be stored in zspage consisting of a single physical * page. * * Context: Any context. * * Return: the size (in bytes) of the first huge zsmalloc &size_class.
*/
size_t zs_huge_class_size(struct zs_pool *pool)
/*
 * @pool is accepted for API symmetry but not referenced: huge_class_size
 * is a file-scope value computed during pool creation (see zs_create_pool).
 */
{ return huge_class_size;
}
EXPORT_SYMBOL_GPL(zs_huge_class_size);
/** * zs_malloc - Allocate block of given size from pool. * @pool: pool to allocate from * @size: size of block to allocate * @gfp: gfp flags when allocating object * @nid: The preferred node id to allocate new zspage (if needed) * * On success, handle to the allocated object is returned, * otherwise an ERR_PTR(). * Allocation requests with size > ZS_MAX_ALLOC_SIZE will fail.
*/ unsignedlong zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp, constint nid)
{ unsignedlong handle; struct size_class *class; int newfg; struct zspage *zspage;
if (unlikely(!size)) return (unsignedlong)ERR_PTR(-EINVAL);
if (unlikely(size > ZS_MAX_ALLOC_SIZE)) return (unsignedlong)ERR_PTR(-ENOSPC);
handle = cache_alloc_handle(pool, gfp); if (!handle) return (unsignedlong)ERR_PTR(-ENOMEM);
/* extra space in chunk to keep the handle */
size += ZS_HANDLE_SIZE; class = pool->size_class[get_size_class_index(size)];
/* class->lock effectively protects the zpage migration */
spin_lock(&class->lock);
zspage = find_get_zspage(class); if (likely(zspage)) {
obj_malloc(pool, zspage, handle); /* Now move the zspage to another fullness group, if required */
fix_fullness_group(class, zspage);
class_stat_add(class, ZS_OBJS_INUSE, 1);
/* * The pool->lock protects the race with zpage's migration * so it's safe to get the page from handle.
*/
read_lock(&pool->lock);
obj = handle_to_obj(handle);
obj_to_zpdesc(obj, &f_zpdesc);
zspage = get_zspage(f_zpdesc); class = zspage_class(pool, zspage);
spin_lock(&class->lock);
read_unlock(&pool->lock);
/* * Calling kunmap_local(d_addr) is necessary. kunmap_local() * calls must occurs in reverse order of calls to kmap_local_page(). * So, to call kunmap_local(s_addr) we should first call * kunmap_local(d_addr). For more details see * Documentation/mm/highmem.rst.
*/ if (s_off >= PAGE_SIZE) {
kunmap_local(d_addr);
kunmap_local(s_addr);
s_zpdesc = get_next_zpdesc(s_zpdesc);
s_addr = kmap_local_zpdesc(s_zpdesc);
d_addr = kmap_local_zpdesc(d_zpdesc);
s_size = class->size - written;
s_off = 0;
}
#ifdef CONFIG_COMPACTION /* * To prevent zspage destroy during migration, zspage freeing should * hold locks of all pages in the zspage.
*/ staticvoid lock_zspage(struct zspage *zspage)
{ struct zpdesc *curr_zpdesc, *zpdesc;
/* * Pages we haven't locked yet can be migrated off the list while we're * trying to lock them, so we need to be careful and only attempt to * lock each page under zspage_read_lock(). Otherwise, the page we lock * may no longer belong to the zspage. This means that we may wait for * the wrong page to unlock, so we must take a reference to the page * prior to waiting for it to unlock outside zspage_read_lock().
*/ while (1) {
zspage_read_lock(zspage);
zpdesc = get_first_zpdesc(zspage); if (zpdesc_trylock(zpdesc)) break;
zpdesc_get(zpdesc);
zspage_read_unlock(zspage);
zpdesc_wait_locked(zpdesc);
zpdesc_put(zpdesc);
}
staticbool zs_page_isolate(struct page *page, isolate_mode_t mode)
{ /* * Page is locked so zspage can't be destroyed concurrently * (see free_zspage()). But if the page was already destroyed * (see reset_zpdesc()), refuse isolation here.
*/ return page_zpdesc(page)->zspage;
}
/* * TODO: nothing prevents a zspage from getting destroyed while * it is isolated for migration, as the page lock is temporarily * dropped after zs_page_isolate() succeeded: we should rework that * and defer destroying such pages once they are un-isolated (putback) * instead.
*/ if (!zpdesc->zspage) return 0;
/* The page is locked, so this pointer must remain valid */
zspage = get_zspage(zpdesc);
pool = zspage->pool;
/* * The pool migrate_lock protects the race between zpage migration * and zs_free.
*/
write_lock(&pool->lock); class = zspage_class(pool, zspage);
/* * the class lock protects zpage alloc/free in the zspage.
*/
spin_lock(&class->lock); /* the zspage write_lock protects zpage access via zs_obj_read/write() */ if (!zspage_write_trylock(zspage)) {
spin_unlock(&class->lock);
write_unlock(&pool->lock); return -EINVAL;
}
/* We're committed, tell the world that this is a Zsmalloc page. */
__zpdesc_set_zsmalloc(newzpdesc);
/* * Here, any user cannot access all objects in the zspage so let's move.
*/
d_addr = kmap_local_zpdesc(newzpdesc);
copy_page(d_addr, s_addr);
kunmap_local(d_addr);
for (addr = s_addr + offset; addr < s_addr + PAGE_SIZE;
addr += class->size) { if (obj_allocated(zpdesc, addr, &handle)) {
replace_sub_page(class, zspage, newzpdesc, zpdesc); /* * Since we complete the data copy and set up new zspage structure, * it's okay to release migration_lock.
*/
write_unlock(&pool->lock);
spin_unlock(&class->lock);
zspage_write_unlock(zspage);
zpdesc_get(newzpdesc); if (zpdesc_zone(newzpdesc) != zpdesc_zone(zpdesc)) {
zpdesc_dec_zone_page_state(zpdesc);
zpdesc_inc_zone_page_state(newzpdesc);
}
/* * Caller should hold page_lock of all pages in the zspage * In here, we cannot use zspage meta data.
*/ staticvoid async_free_zspage(struct work_struct *work)
{ int i; struct size_class *class; struct zspage *zspage, *tmp;
LIST_HEAD(free_pages); struct zs_pool *pool = container_of(work, struct zs_pool,
free_work);
for (i = 0; i < ZS_SIZE_CLASSES; i++) { class = pool->size_class[i]; if (class->index != i) continue;
/* * * Based on the number of unused allocated objects calculate * and return the number of pages that we can free.
*/ staticunsignedlong zs_can_compact(struct size_class *class)
{ unsignedlong obj_wasted; unsignedlong obj_allocated = class_stat_read(class, ZS_OBJS_ALLOCATED); unsignedlong obj_used = class_stat_read(class, ZS_OBJS_INUSE);
/* * protect the race between zpage migration and zs_free * as well as zpage allocation/free
*/
write_lock(&pool->lock);
spin_lock(&class->lock); while (zs_can_compact(class)) { int fg;
if (!dst_zspage) {
dst_zspage = isolate_dst_zspage(class); if (!dst_zspage) break;
}
src_zspage = isolate_src_zspage(class); if (!src_zspage) break;
/* * Pool compaction is performed under pool->lock so it is basically * single-threaded. Having more than one thread in __zs_compact() * will increase pool->lock contention, which will impact other * zsmalloc operations that need pool->lock.
*/ if (atomic_xchg(&pool->compaction_in_progress, 1)) return 0;
for (i = ZS_SIZE_CLASSES - 1; i >= 0; i--) { class = pool->size_class[i]; if (class->index != i) continue;
pages_freed += __zs_compact(pool, class);
}
atomic_long_add(pages_freed, &pool->stats.pages_compacted);
atomic_set(&pool->compaction_in_progress, 0);
/* * Compact classes and calculate compaction delta. * Can run concurrently with a manually triggered * (by user) compaction.
*/
pages_freed = zs_compact(pool);
/*
 * Choose how many 0-order pages to chain into one zspage for the given
 * object size: pick the chain length i (1..ZS_MAX_PAGES_PER_ZSPAGE) that
 * minimizes the unusable tail, (i * PAGE_SIZE) % class_size.
 */
static int calculate_zspage_chain_size(int class_size)
{
	int i, min_waste = INT_MAX;
	int chain_size = 1;

	/* A power-of-2 size divides PAGE_SIZE evenly: one page, no waste. */
	if (is_power_of_2(class_size))
		return chain_size;

	for (i = 1; i <= ZS_MAX_PAGES_PER_ZSPAGE; i++) {
		int waste;

		waste = (i * PAGE_SIZE) % class_size;
		if (waste < min_waste) {
			min_waste = waste;
			chain_size = i;
		}
	}

	return chain_size;
}
/** * zs_create_pool - Creates an allocation pool to work from. * @name: pool name to be created * * This function must be called before anything when using * the zsmalloc allocator. * * On success, a pointer to the newly created pool is returned, * otherwise NULL.
*/ struct zs_pool *zs_create_pool(constchar *name)
{ int i; struct zs_pool *pool; struct size_class *prev_class = NULL;
pool = kzalloc(sizeof(*pool), GFP_KERNEL); if (!pool) return NULL;
pool->name = kstrdup(name, GFP_KERNEL); if (!pool->name) goto err;
if (create_cache(pool)) goto err;
/* * Iterate reversely, because, size of size_class that we want to use * for merging should be larger or equal to current size.
*/ for (i = ZS_SIZE_CLASSES - 1; i >= 0; i--) { int size; int pages_per_zspage; int objs_per_zspage; struct size_class *class; int fullness;
/* * We iterate from biggest down to smallest classes, * so huge_class_size holds the size of the first huge * class. Any object bigger than or equal to that will * endup in the huge class.
*/ if (pages_per_zspage != 1 && objs_per_zspage != 1 &&
!huge_class_size) {
huge_class_size = size; /* * The object uses ZS_HANDLE_SIZE bytes to store the * handle. We need to subtract it, because zs_malloc() * unconditionally adds handle size before it performs * size class search - so object may be smaller than * huge class size, yet it still can end up in the huge * class because it grows by ZS_HANDLE_SIZE extra bytes * right before class lookup.
*/
huge_class_size -= (ZS_HANDLE_SIZE - 1);
}
/* * size_class is used for normal zsmalloc operation such * as alloc/free for that size. Although it is natural that we * have one size_class for each size, there is a chance that we * can get more memory utilization if we use one size_class for * many different sizes whose size_class have same * characteristics. So, we makes size_class point to * previous size_class if possible.
*/ if (prev_class) { if (can_merge(prev_class, pages_per_zspage, objs_per_zspage)) {
pool->size_class[i] = prev_class; continue;
}
}
class = kzalloc(sizeof(struct size_class), GFP_KERNEL); if (!class) goto err;
/* debug only, don't abort if it fails */
zs_pool_stat_create(pool, name);
/* * Not critical since shrinker is only used to trigger internal * defragmentation of the pool which is pretty optional thing. If * registration fails we still can use the pool normally and user can * trigger compaction manually. Thus, ignore return code.
*/
zs_register_shrinker(pool);
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereit gestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.