/*
 * Maximum loops in mem_cgroup_soft_reclaim(), used for soft
 * limit reclaim to prevent infinite loops, if they ever occur.
 */
#define MEM_CGROUP_MAX_RECLAIM_LOOPS		100
#define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS	2
/*
 * cgroup_event represents events which userspace want to receive.
 */
struct mem_cgroup_event {
	/*
	 * memcg which the event belongs to.
	 */
	struct mem_cgroup *memcg;
	/*
	 * eventfd to signal userspace about the event.
	 */
	struct eventfd_ctx *eventfd;
	/*
	 * Each of these stored in a list by the cgroup.
	 */
	struct list_head list;
	/*
	 * register_event() callback will be used to add new userspace
	 * waiter for changes related to this event. Use eventfd_signal()
	 * on eventfd to send notification to userspace.
	 */
	int (*register_event)(struct mem_cgroup *memcg,
			      struct eventfd_ctx *eventfd, const char *args);
	/*
	 * unregister_event() callback will be called when userspace closes
	 * the eventfd or on cgroup removing. This callback must be set,
	 * if you want provide notification functionality.
	 */
	void (*unregister_event)(struct mem_cgroup *memcg,
				 struct eventfd_ctx *eventfd);
	/*
	 * All fields below needed to unregister event when
	 * userspace closes eventfd.
	 */
	poll_table pt;
	wait_queue_head_t *wqh;
	wait_queue_entry_t wait;
	struct work_struct remove;
};
/*
 * NOTE(review): this span appears to be the body of a soft-limit tree
 * update function — the function header is not visible in this chunk,
 * so the code is kept verbatim.
 */
	if (lru_gen_enabled()) {
		/* MGLRU has its own soft-limit handling; hand off and bail out. */
		if (soft_limit_excess(memcg))
			lru_gen_soft_reclaim(memcg, nid);
		return;
	}

	mctz = soft_limit_tree.rb_tree_per_node[nid];
	if (!mctz)
		return;
	/*
	 * Necessary to update all ancestors when hierarchy is used.
	 * because their event counter is not touched.
	 */
	for (; memcg; memcg = parent_mem_cgroup(memcg)) {
		mz = memcg->nodeinfo[nid];
		excess = soft_limit_excess(memcg);
		/*
		 * We have to update the tree if mz is on RB-tree or
		 * mem is over its softlimit.
		 */
		if (excess || mz->on_tree) {
			unsignedlong flags;	/* NOTE(review): mangled token — should read "unsigned long" */

			spin_lock_irqsave(&mctz->lock, flags);
			/* if on-tree, remove it */
			if (mz->on_tree)
				__mem_cgroup_remove_exceeded(mz, mctz);
			/*
			 * Insert again. mz->usage_in_excess will be updated.
			 * If excess is 0, no tree ops.
			 */
			__mem_cgroup_insert_exceeded(mz, mctz, excess);
			spin_unlock_irqrestore(&mctz->lock, flags);
		}
	}
}
retry:
	/*
	 * NOTE(review): fragment — the function header is not visible in this
	 * chunk; this looks like the rightmost-node lookup of the soft-limit
	 * RB-tree. Code kept verbatim.
	 */
	mz = NULL;
	if (!mctz->rb_rightmost)
		goto done;		/* Nothing to reclaim from */

	mz = rb_entry(mctz->rb_rightmost,
		      struct mem_cgroup_per_node, tree_node);
	/*
	 * Remove the node now but someone else can add it back,
	 * we will to add it back at the end of reclaim to its correct
	 * position in the tree.
	 */
	__mem_cgroup_remove_exceeded(mz, mctz);
	/* skip nodes no longer over their soft limit, or whose css ref cannot be taken */
	if (!soft_limit_excess(mz->memcg) ||
	    !css_tryget(&mz->memcg->css))
		goto retry;
done:
	return mz;
}
/*
 * NOTE(review): this span appears to be the main loop of
 * mem_cgroup_soft_reclaim() — the function header is not visible in this
 * chunk, so the code is kept verbatim.
 */
	while (1) {
		victim = mem_cgroup_iter(root_memcg, victim, &reclaim);
		if (!victim) {
			/* a NULL victim means one full pass over the hierarchy finished */
			loop++;
			if (loop >= 2) {
				/*
				 * If we have not been able to reclaim
				 * anything, it might because there are
				 * no reclaimable pages under this hierarchy
				 */
				if (!total)
					break;
				/*
				 * We want to do more targeted reclaim.
				 * excess >> 2 is not to excessive so as to
				 * reclaim too much, nor too less that we keep
				 * coming back to reclaim from this cgroup
				 */
				if (total >= (excess >> 2) ||
					(loop > MEM_CGROUP_MAX_RECLAIM_LOOPS))
					break;
			}
			continue;
		}
		total += mem_cgroup_shrink_node(victim, gfp_mask, false,
						pgdat, &nr_scanned);
		*total_scanned += nr_scanned;
		/* stop as soon as the root is back under its soft limit */
		if (!soft_limit_excess(root_memcg))
			break;
	}
	mem_cgroup_iter_break(root_memcg, victim);
	return total;
}
/* * Do not even bother to check the largest node if the root * is empty. Do it lockless to prevent lock bouncing. Races * are acceptable as soft limit is best effort anyway.
*/ if (!mctz || RB_EMPTY_ROOT(&mctz->rb_root)) return 0;
/* * This loop can run a while, specially if mem_cgroup's continuously * keep exceeding their soft limit and putting the system under * pressure
*/ do { if (next_mz)
mz = next_mz; else
mz = mem_cgroup_largest_soft_limit_node(mctz); if (!mz) break;
/* * If we failed to reclaim anything from this memory cgroup * it is time to move on to the next cgroup
*/
next_mz = NULL; if (!reclaimed)
next_mz = __mem_cgroup_largest_soft_limit_node(mctz);
excess = soft_limit_excess(mz->memcg); /* * One school of thought says that we should not add * back the node to the tree if reclaim returns 0. * But our reclaim could return 0, simply because due * to priority we are exposing a smaller subset of * memory to reclaim from. Consider this as a longer * term TODO.
*/ /* If excess == 0, no tree ops */
__mem_cgroup_insert_exceeded(mz, mctz, excess);
spin_unlock_irq(&mctz->lock);
css_put(&mz->memcg->css);
loop++; /* * Could not reclaim anything and there are no more * mem cgroups to try or we seem to be looping without * reclaiming anything.
*/ if (!nr_reclaimed &&
(next_mz == NULL ||
loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS)) break;
} while (!nr_reclaimed); if (next_mz)
css_put(&next_mz->memcg->css); return nr_reclaimed;
}
#ifdef CONFIG_MMU staticint mem_cgroup_move_charge_write(struct cgroup_subsys_state *css, struct cftype *cft, u64 val)
{
pr_warn_once("Cgroup memory moving (move_charge_at_immigrate) is deprecated. " "Please report your usecase to linux-mm@kvack.org if you " "depend on this functionality.\n");
rcu_read_lock(); if (!swap)
t = rcu_dereference(memcg->thresholds.primary); else
t = rcu_dereference(memcg->memsw_thresholds.primary);
if (!t) goto unlock;
usage = mem_cgroup_usage(memcg, swap);
/* * current_threshold points to threshold just below or equal to usage. * If it's not true, a threshold was crossed after last * call of __mem_cgroup_threshold().
*/
i = t->current_threshold;
/* * Iterate backward over array of thresholds starting from * current_threshold and check if a threshold is crossed. * If none of thresholds below usage is crossed, we read * only one element of the array here.
*/ for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--)
eventfd_signal(t->entries[i].eventfd);
/* i = current_threshold + 1 */
i++;
/* * Iterate forward over array of thresholds starting from * current_threshold+1 and check if a threshold is crossed. * If none of thresholds above usage is crossed, we read * only one element of the array here.
*/ for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++)
eventfd_signal(t->entries[i].eventfd);
/*
 * Run threshold checks for @memcg and every one of its ancestors, on the
 * memory counter and — when memory+swap accounting is active — on the
 * memsw counter as well.
 */
static void mem_cgroup_threshold(struct mem_cgroup *memcg)
{
	while (memcg) {
		__mem_cgroup_threshold(memcg, false);
		if (do_memsw_account())
			__mem_cgroup_threshold(memcg, true);
		memcg = parent_mem_cgroup(memcg);
	}
}
/* Cgroup1: threshold notifications & softlimit tree updates */
/* * Per memcg event counter is incremented at every pagein/pageout. With THP, * it will be incremented by the number of pages. This counter is used * to trigger some periodic events. This is straightforward and better * than using jiffies etc. to handle periodic memcg event.
*/ enum mem_cgroup_events_target {
MEM_CGROUP_TARGET_THRESH,
MEM_CGROUP_TARGET_SOFTLIMIT,
MEM_CGROUP_NTARGETS,
};
staticvoid memcg1_charge_statistics(struct mem_cgroup *memcg, int nr_pages)
{ /* pagein of a big page is an event. So, ignore page size */ if (nr_pages > 0)
count_memcg_events(memcg, PGPGIN, 1); else {
count_memcg_events(memcg, PGPGOUT, 1);
nr_pages = -nr_pages; /* for event */
}
val = __this_cpu_read(memcg->events_percpu->nr_page_events);
next = __this_cpu_read(memcg->events_percpu->targets[target]); /* from time_after() in jiffies.h */ if ((long)(next - val) < 0) { switch (target) { case MEM_CGROUP_TARGET_THRESH:
next = val + THRESHOLDS_EVENTS_TARGET; break; case MEM_CGROUP_TARGET_SOFTLIMIT:
next = val + SOFTLIMIT_EVENTS_TARGET; break; default: break;
}
__this_cpu_write(memcg->events_percpu->targets[target], next); returntrue;
} returnfalse;
}
/* * Check events in order. *
*/ staticvoid memcg1_check_events(struct mem_cgroup *memcg, int nid)
{ if (IS_ENABLED(CONFIG_PREEMPT_RT)) return;
/* threshold event is triggered in finer grain than soft limit */ if (unlikely(memcg1_event_ratelimit(memcg,
MEM_CGROUP_TARGET_THRESH))) { bool do_softlimit;
/**
 * memcg1_swapout - transfer a memsw charge to swap
 * @folio: folio whose memsw charge to transfer
 * @entry: swap entry to move the charge to
 *
 * Transfer the memsw charge of @folio to @entry.
 */
void memcg1_swapout(struct folio *folio, swp_entry_t entry)
{
	struct mem_cgroup *memcg, *swap_memcg;
	unsignedint nr_entries;	/* NOTE(review): mangled token — should read "unsigned int" */

	/*
	 * NOTE(review): @memcg is read below without a visible assignment —
	 * the lines that look it up from @folio appear to be missing from
	 * this chunk; confirm against the original file.
	 */
	VM_WARN_ON_ONCE_FOLIO(!memcg, folio);
	if (!memcg)
		return;

	/*
	 * In case the memcg owning these pages has been offlined and doesn't
	 * have an ID allocated to it anymore, charge the closest online
	 * ancestor for the swap instead and transfer the memory+swap charge.
	 */
	swap_memcg = mem_cgroup_id_get_online(memcg);
	nr_entries = folio_nr_pages(folio);
	/* Get references for the tail pages, too */
	if (nr_entries > 1)
		mem_cgroup_id_get_many(swap_memcg, nr_entries - 1);
	mod_memcg_state(swap_memcg, MEMCG_SWAP, nr_entries);

	/* release the pages from the owner's memory counter... */
	if (!mem_cgroup_is_root(memcg))
		page_counter_uncharge(&memcg->memory, nr_entries);

	/* ...and move the memsw charge over to the online ancestor */
	if (memcg != swap_memcg) {
		if (!mem_cgroup_is_root(swap_memcg))
			page_counter_charge(&swap_memcg->memsw, nr_entries);
		page_counter_uncharge(&memcg->memsw, nr_entries);
	}

	/*
	 * Interrupts should be disabled here because the caller holds the
	 * i_pages lock which is taken with interrupts-off. It is
	 * important here to have the interrupts disabled because it is the
	 * only synchronisation we have for updating the per-CPU variables.
	 */
	preempt_disable_nested();
	VM_WARN_ON_IRQS_ENABLED();
	memcg1_charge_statistics(memcg, -folio_nr_pages(folio));
	preempt_enable_nested();
	memcg1_check_events(memcg, folio_nid(folio));

	css_put(&memcg->css);
}
/*
 * memcg1_swapin - uncharge swap slot
 * @entry: the first swap entry for which the pages are charged
 * @nr_pages: number of pages which will be uncharged
 *
 * Call this function after successfully adding the charged page to swapcache.
 *
 * Note: This function assumes the page for which swap slot is being uncharged
 * is order 0 page.
 */
void memcg1_swapin(swp_entry_t entry, unsigned int nr_pages)
{
	/*
	 * Cgroup1's unified memory+swap counter has been charged with the
	 * new swapcache page, finish the transfer by uncharging the swap
	 * slot. The swap slot would also get uncharged when it dies, but
	 * it can stick around indefinitely and we'd count the page twice
	 * the entire time.
	 *
	 * Cgroup2 has separate resource counters for memory and swap,
	 * so this is a non-issue here. Memory and swap charge lifetimes
	 * correspond 1:1 to page and swap slot lifetimes: we charge the
	 * page to memory here, and uncharge swap when the slot is freed.
	 */
	if (do_memsw_account()) {
		/*
		 * The swap entry might not get freed for a long time,
		 * let's not wait for it. The page already received a
		 * memory+swap charge, drop the swap entry duplicate.
		 */
		mem_cgroup_uncharge_swap(entry, nr_pages);
	}
}
/* Allocate memory for new array of thresholds */ new = kmalloc(struct_size(new, entries, size), GFP_KERNEL); if (!new) {
ret = -ENOMEM; goto unlock;
}
new->size = size;
/* Copy thresholds (if any) to new array */ if (thresholds->primary)
memcpy(new->entries, thresholds->primary->entries,
flex_array_size(new, entries, size - 1));
/* Sort thresholds. Registering of new threshold isn't time-critical */
sort(new->entries, size, sizeof(*new->entries),
compare_thresholds, NULL);
/* Find current threshold */
new->current_threshold = -1; for (i = 0; i < size; i++) { if (new->entries[i].threshold <= usage) { /* * new->current_threshold will not be used until * rcu_assign_pointer(), so it's safe to increment * it here.
*/
++new->current_threshold;
} else break;
}
/* Free old spare buffer and save old primary buffer as spare */
kfree(thresholds->spare);
thresholds->spare = thresholds->primary;
rcu_assign_pointer(thresholds->primary, new);
/* To be sure that nobody uses thresholds */
synchronize_rcu();
/* Check if a threshold crossed before removing */
__mem_cgroup_threshold(memcg, type == _MEMSWAP);
/* Calculate new number of threshold */
size = entries = 0; for (i = 0; i < thresholds->primary->size; i++) { if (thresholds->primary->entries[i].eventfd != eventfd)
size++; else
entries++;
}
new = thresholds->spare;
/* If no items related to eventfd have been cleared, nothing to do */ if (!entries) goto unlock;
/* Set thresholds array to NULL if we don't have thresholds */ if (!size) {
kfree(new); new = NULL; goto swap_buffers;
}
new->size = size;
/* Copy thresholds and find current threshold */
new->current_threshold = -1; for (i = 0, j = 0; i < thresholds->primary->size; i++) { if (thresholds->primary->entries[i].eventfd == eventfd) continue;
new->entries[j] = thresholds->primary->entries[i]; if (new->entries[j].threshold <= usage) { /* * new->current_threshold will not be used * until rcu_assign_pointer(), so it's safe to increment * it here.
*/
++new->current_threshold;
}
j++;
}
/* To be sure that nobody uses thresholds */
synchronize_rcu();
/* If all events are unregistered, free the spare array */ if (!new) {
kfree(thresholds->spare);
thresholds->spare = NULL;
}
unlock:
mutex_unlock(&memcg->thresholds_lock);
}
/* * DO NOT USE IN NEW FILES. * * "cgroup.event_control" implementation. * * This is way over-engineered. It tries to support fully configurable * events for each user. Such level of flexibility is completely * unnecessary especially in the light of the planned unified hierarchy. * * Please deprecate this and replace with something simpler if at all * possible.
*/
/*
 * Gets called on EPOLLHUP on eventfd when user closes it.
 *
 * Called with wqh->lock held and interrupts disabled.
 */
staticint memcg_event_wake(wait_queue_entry_t *wait, unsignedint mode,
			   int sync, void *key)	/* NOTE(review): "staticint"/"unsignedint" are mangled — should read "static int" / "unsigned int" */
{
	struct mem_cgroup_event *event =
		container_of(wait, struct mem_cgroup_event, wait);
	struct mem_cgroup *memcg = event->memcg;
	__poll_t flags = key_to_poll(key);

	if (flags & EPOLLHUP) {
		/*
		 * If the event has been detached at cgroup removal, we
		 * can simply return knowing the other side will cleanup
		 * for us.
		 *
		 * We can't race against event freeing since the other
		 * side will require wqh->lock via remove_wait_queue(),
		 * which we hold.
		 */
		spin_lock(&memcg->event_list_lock);
		if (!list_empty(&event->list)) {
			list_del_init(&event->list);
			/*
			 * We are in atomic context, but cgroup_event_remove()
			 * may sleep, so we have to call it in workqueue.
			 */
			schedule_work(&event->remove);
		}
		spin_unlock(&memcg->event_list_lock);
	}
	/*
	 * NOTE(review): truncated here — the function's final return statement
	 * and closing brace are not visible in this chunk.
	 */
/* * DO NOT USE IN NEW FILES. * * Parse input and register new cgroup event handler. * * Input must be in format '<event_fd> <control_fd> <args>'. * Interpretation of args is defined by control file implementation.
*/ static ssize_t memcg_write_event_control(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off)
{ struct cgroup_subsys_state *css = of_css(of); struct mem_cgroup *memcg = mem_cgroup_from_css(css); struct mem_cgroup_event *event; struct cgroup_subsys_state *cfile_css; unsignedint efd, cfd; struct dentry *cdentry; constchar *name; char *endp; int ret;
if (IS_ENABLED(CONFIG_PREEMPT_RT)) return -EOPNOTSUPP;
event->eventfd = eventfd_ctx_fileget(fd_file(efile)); if (IS_ERR(event->eventfd)) {
ret = PTR_ERR(event->eventfd); goto out_kfree;
}
if (fd_empty(cfile)) {
ret = -EBADF; goto out_put_eventfd;
}
/* the process need read permission on control file */ /* AV: shouldn't we check that it's been opened for read instead? */
ret = file_permission(fd_file(cfile), MAY_READ); if (ret < 0) goto out_put_eventfd;
/* * The control file must be a regular cgroup1 file. As a regular cgroup * file can't be renamed, it's safe to access its name afterwards.
*/
cdentry = fd_file(cfile)->f_path.dentry; if (cdentry->d_sb->s_type != &cgroup_fs_type || !d_is_reg(cdentry)) {
ret = -EINVAL; goto out_put_eventfd;
}
/* * Determine the event callbacks and set them in @event. This used * to be done via struct cftype but cgroup core no longer knows * about these events. The following is crude but the whole thing * is for compatibility anyway. * * DO NOT ADD NEW FILES.
*/
name = cdentry->d_name.name;
if (!strcmp(name, "memory.usage_in_bytes")) {
event->register_event = mem_cgroup_usage_register_event;
event->unregister_event = mem_cgroup_usage_unregister_event;
} elseif (!strcmp(name, "memory.oom_control")) {
pr_warn_once("oom_control is deprecated and will be removed. " "Please report your usecase to linux-mm-@kvack.org" " if you depend on this functionality.\n");
event->register_event = mem_cgroup_oom_register_event;
event->unregister_event = mem_cgroup_oom_unregister_event;
} elseif (!strcmp(name, "memory.pressure_level")) {
pr_warn_once("pressure_level is deprecated and will be removed. " "Please report your usecase to linux-mm-@kvack.org " "if you depend on this functionality.\n");
event->register_event = vmpressure_register_event;
event->unregister_event = vmpressure_unregister_event;
} elseif (!strcmp(name, "memory.memsw.usage_in_bytes")) {
event->register_event = memsw_cgroup_usage_register_event;
event->unregister_event = memsw_cgroup_usage_unregister_event;
} else {
ret = -EINVAL; goto out_put_eventfd;
}
/* * Verify @cfile should belong to @css. Also, remaining events are * automatically removed on cgroup destruction but the removal is * asynchronous, so take an extra ref on @css.
*/
cfile_css = css_tryget_online_from_dir(cdentry->d_parent,
&memory_cgrp_subsys);
ret = -EINVAL; if (IS_ERR(cfile_css)) goto out_put_eventfd; if (cfile_css != css) goto out_put_css;
ret = event->register_event(memcg, event->eventfd, buf); if (ret) goto out_put_css;
/* * Unregister events and notify userspace. * Notify userspace about cgroup removing only after rmdir of cgroup * directory to avoid race between userspace and kernelspace.
*/
spin_lock_irq(&memcg->event_list_lock);
list_for_each_entry_safe(event, tmp, &memcg->event_list, list) {
list_del_init(&event->list);
schedule_work(&event->remove);
}
spin_unlock_irq(&memcg->event_list_lock);
}
/*
 * Check OOM-Killer is already running under our hierarchy.
 * If someone is running, return false.
 */
staticbool mem_cgroup_oom_trylock(struct mem_cgroup *memcg)	/* NOTE(review): mangled token — should read "static bool" */
{
	struct mem_cgroup *iter, *failed = NULL;

	spin_lock(&memcg_oom_lock);

	/* try to take ->oom_lock on every memcg in the subtree */
	for_each_mem_cgroup_tree(iter, memcg) {
		if (iter->oom_lock) {
			/*
			 * this subtree of our hierarchy is already locked
			 * so we cannot give a lock.
			 */
			failed = iter;
			mem_cgroup_iter_break(memcg, iter);
			break;
		}
		iter->oom_lock = true;
	}

	if (failed) {
		/*
		 * OK, we failed to lock the whole subtree so we have
		 * to clean up what we set up to the failing subtree
		 */
		for_each_mem_cgroup_tree(iter, memcg) {
			if (iter == failed) {
				mem_cgroup_iter_break(memcg, iter);
				break;
			}
			iter->oom_lock = false;
		}
	} else
		mutex_acquire(&memcg_oom_lock_dep_map, 0, 1, _RET_IP_);
	/*
	 * NOTE(review): truncated here — the matching spin_unlock and the
	 * return statement are not visible in this chunk.
	 */
/* * Be careful about under_oom underflows because a child memcg * could have been added after mem_cgroup_mark_under_oom.
*/
spin_lock(&memcg_oom_lock);
for_each_mem_cgroup_tree(iter, memcg) if (iter->under_oom > 0)
iter->under_oom--;
spin_unlock(&memcg_oom_lock);
}
void memcg1_oom_recover(struct mem_cgroup *memcg)
{
	/*
	 * The ->under_oom test below is intentionally lockless. All it has
	 * to guarantee is that it observes the under-OOM state asserted by
	 * the OOM whose notification caused userland to call us — which
	 * holds trivially because mem_cgroup_mark_under_oom() runs before
	 * the notification is sent.
	 */
	if (!memcg || !memcg->under_oom)
		return;

	__wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg);
}
/** * mem_cgroup_oom_synchronize - complete memcg OOM handling * @handle: actually kill/wait or just clean up the OOM state * * This has to be called at the end of a page fault if the memcg OOM * handler was enabled. * * Memcg supports userspace OOM handling where failed allocations must * sleep on a waitqueue until the userspace task resolves the * situation. Sleeping directly in the charge context with all kinds * of locks held is not a good idea, instead we remember an OOM state * in the task and mem_cgroup_oom_synchronize() has to be called at * the end of the page fault to complete the OOM handling. * * Returns %true if an ongoing memcg OOM situation was detected and * completed, %false otherwise.
*/ bool mem_cgroup_oom_synchronize(bool handle)
{ struct mem_cgroup *memcg = current->memcg_in_oom; struct oom_wait_info owait; bool locked;
/* OOM is global, do not handle */ if (!memcg) returnfalse;
if (locked)
mem_cgroup_oom_unlock(memcg);
cleanup:
current->memcg_in_oom = NULL;
css_put(&memcg->css); returntrue;
}
bool memcg1_oom_prepare(struct mem_cgroup *memcg, bool *locked)
{ /* * We are in the middle of the charge context here, so we * don't want to block when potentially sitting on a callstack * that holds all kinds of filesystem and mm locks. * * cgroup1 allows disabling the OOM killer and waiting for outside * handling until the charge can succeed; remember the context and put * the task to sleep at the end of the page fault when all locks are * released. * * On the other hand, in-kernel OOM killer allows for an async victim * memory reclaim (oom_reaper) and that means that we are not solely * relying on the oom victim to make a forward progress and we can * invoke the oom killer here. * * Please note that mem_cgroup_out_of_memory might fail to find a * victim and then we have to bail out from the charge path.
*/ if (READ_ONCE(memcg->oom_kill_disable)) { if (current->in_user_fault) {
css_get(&memcg->css);
current->memcg_in_oom = memcg;
} returnfalse;
}
do { if (signal_pending(current)) {
ret = -EINTR; break;
}
mutex_lock(&memcg_max_mutex); /* * Make sure that the new limit (memsw or memory limit) doesn't * break our basic invariant rule memory.max <= memsw.max.
*/
limits_invariant = memsw ? max >= READ_ONCE(memcg->memory.max) :
max <= memcg->memsw.max; if (!limits_invariant) {
mutex_unlock(&memcg_max_mutex);
ret = -EINVAL; break;
} if (max > counter->max)
enlarge = true;
ret = page_counter_set_max(counter, max);
mutex_unlock(&memcg_max_mutex);
if (!ret) break;
if (!drained) {
drain_all_stock(memcg);
drained = true; continue;
}
if (!try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL,
memsw ? 0 : MEMCG_RECLAIM_MAY_SWAP, NULL)) {
ret = -EBUSY; break;
}
} while (true);
if (!ret && enlarge)
memcg1_oom_recover(memcg);
return ret;
}
/*
 * Reclaims as many pages from the given memcg as possible.
 *
 * Caller is responsible for holding css reference for memcg.
 */
staticint mem_cgroup_force_empty(struct mem_cgroup *memcg)	/* NOTE(review): mangled token — should read "static int" */
{
	int nr_retries = MAX_RECLAIM_RETRIES;

	/* we call try-to-free pages for make this cgroup empty */
	lru_add_drain_all();

	drain_all_stock(memcg);

	/* try to free all pages in this cgroup */
	while (nr_retries && page_counter_read(&memcg->memory)) {
		if (signal_pending(current))
			return -EINTR;

		/* a reclaim pass that frees nothing burns one retry */
		if (!try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL,
						  MEMCG_RECLAIM_MAY_SWAP, NULL))
			nr_retries--;
	}
	/*
	 * NOTE(review): truncated here — the function's final return statement
	 * and closing brace are not visible in this chunk.
	 */
switch (MEMFILE_TYPE(cft->private)) { case _MEM:
counter = &memcg->memory; break; case _MEMSWAP:
counter = &memcg->memsw; break; case _KMEM:
counter = &memcg->kmem; break; case _TCP:
counter = &memcg->tcpmem; break; default:
BUG();
}
switch (MEMFILE_ATTR(cft->private)) { case RES_USAGE: if (counter == &memcg->memory) return (u64)mem_cgroup_usage(memcg, false) * PAGE_SIZE; if (counter == &memcg->memsw) return (u64)mem_cgroup_usage(memcg, true) * PAGE_SIZE; return (u64)page_counter_read(counter) * PAGE_SIZE; case RES_LIMIT: return (u64)counter->max * PAGE_SIZE; case RES_MAX_USAGE: return (u64)counter->watermark * PAGE_SIZE; case RES_FAILCNT: return counter->failcnt; case RES_SOFT_LIMIT: return (u64)READ_ONCE(memcg->soft_limit) * PAGE_SIZE; default:
BUG();
}
}
/*
 * This function doesn't do anything useful. Its only job is to provide a read
 * handler for a file so that cgroup_file_mode() will add read permissions.
 */
static int mem_cgroup_dummy_seq_show(__always_unused struct seq_file *m,
				     __always_unused void *v)
{
	return -EINVAL;
}
staticint memcg_update_tcp_max(struct mem_cgroup *memcg, unsignedlong max)
{ int ret;
mutex_lock(&memcg_max_mutex);
ret = page_counter_set_max(&memcg->tcpmem, max); if (ret) goto out;
if (!memcg->tcpmem_active) { /* * The active flag needs to be written after the static_key * update. This is what guarantees that the socket activation * function is the last one to run. See mem_cgroup_sk_alloc() * for details, and note that we don't mark any socket as * belonging to this memcg until that flag is up. * * We need to do this, because static_keys will span multiple * sites, but we can't control their order. If we mark a socket * as accounted, but the accounting functions are not patched in * yet, we'll lose accounting. * * We never race with the readers in mem_cgroup_sk_alloc(), * because when this value change, the code to process it is not * patched in yet.
*/
static_branch_inc(&memcg_sockets_enabled_key);
memcg->tcpmem_active = true;
}
out:
mutex_unlock(&memcg_max_mutex); return ret;
}
/*
 * The user of this function is...
 * RES_LIMIT.
 *
 * Write handler for the cgroup1 limit files: parses @buf as a byte count
 * ("-1" meaning unlimited) and applies it to the counter selected by the
 * cftype's private MEMFILE_TYPE/MEMFILE_ATTR encoding. Returns @nbytes on
 * success or a negative errno.
 */
static ssize_t mem_cgroup_write(struct kernfs_open_file *of, char *buf,
				size_t nbytes, loff_t off)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
	unsigned long nr_pages;
	int ret;

	buf = strstrip(buf);
	ret = page_counter_memparse(buf, "-1", &nr_pages);
	if (ret)
		return ret;

	switch (MEMFILE_ATTR(of_cft(of)->private)) {
	case RES_LIMIT:
		if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */
			ret = -EINVAL;
			break;
		}
		switch (MEMFILE_TYPE(of_cft(of)->private)) {
		case _MEM:
			ret = mem_cgroup_resize_max(memcg, nr_pages, false);
			break;
		case _MEMSWAP:
			ret = mem_cgroup_resize_max(memcg, nr_pages, true);
			break;
		case _KMEM:
			/* deprecated: accept the write but do nothing */
			pr_warn_once("kmem.limit_in_bytes is deprecated and will be removed. "
				     "Writing any value to this file has no effect. "
				     "Please report your usecase to linux-mm@kvack.org if you "
				     "depend on this functionality.\n");
			ret = 0;
			break;
		case _TCP:
			pr_warn_once("kmem.tcp.limit_in_bytes is deprecated and will be removed. "
				     "Please report your usecase to linux-mm@kvack.org if you "
				     "depend on this functionality.\n");
			ret = memcg_update_tcp_max(memcg, nr_pages);
			break;
		}
		break;
	case RES_SOFT_LIMIT:
		if (IS_ENABLED(CONFIG_PREEMPT_RT)) {
			ret = -EOPNOTSUPP;
		} else {
			pr_warn_once("soft_limit_in_bytes is deprecated and will be removed. "
				     "Please report your usecase to linux-mm@kvack.org if you "
				     "depend on this functionality.\n");
			WRITE_ONCE(memcg->soft_limit, nr_pages);
			ret = 0;
		}
		break;
	}
	return ret ?: nbytes;
}
for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) { unsignedlong nr;
nr = memcg_page_state_local_output(memcg, memcg1_stats[i]);
seq_buf_printf(s, "%s %lu\n", memcg1_stat_names[i], nr);
}
for (i = 0; i < ARRAY_SIZE(memcg1_events); i++)
seq_buf_printf(s, "%s %lu\n", vm_event_name(memcg1_events[i]),
memcg_events_local(memcg, memcg1_events[i]));
for (i = 0; i < NR_LRU_LISTS; i++)
seq_buf_printf(s, "%s %lu\n", lru_list_name(i),
memcg_page_state_local(memcg, NR_LRU_BASE + i) *
PAGE_SIZE);
for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) { unsignedlong nr;
nr = memcg_page_state_output(memcg, memcg1_stats[i]);
seq_buf_printf(s, "total_%s %llu\n", memcg1_stat_names[i],
(u64)nr);
}
for (i = 0; i < ARRAY_SIZE(memcg1_events); i++)
seq_buf_printf(s, "total_%s %llu\n",
vm_event_name(memcg1_events[i]),
(u64)memcg_events(memcg, memcg1_events[i]));
for (i = 0; i < NR_LRU_LISTS; i++)
seq_buf_printf(s, "total_%s %llu\n", lru_list_name(i),
(u64)memcg_page_state(memcg, NR_LRU_BASE + i) *
PAGE_SIZE);
if (!mem_cgroup_is_root(memcg)) {
pr_info_once("Per memcg swappiness does not exist in cgroup v2. " "See memory.reclaim or memory.swap.max there\n ");
WRITE_ONCE(memcg->swappiness, val);
} else
WRITE_ONCE(vm_swappiness, val);
pr_warn_once("oom_control is deprecated and will be removed. " "Please report your usecase to linux-mm-@kvack.org if you " "depend on this functionality.\n");
/* cannot set to root cgroup and only 0 and 1 are allowed */ if (mem_cgroup_is_root(memcg) || !((val == 0) || (val == 1))) return -EINVAL;
WRITE_ONCE(memcg->oom_kill_disable, val); if (!val)
memcg1_oom_recover(memcg);
return 0;
}
#ifdef CONFIG_SLUB_DEBUG
static int mem_cgroup_slab_show(struct seq_file *m, void *p)
{
	/*
	 * Deprecated.
	 * Please, take a look at tools/cgroup/memcg_slabinfo.py .
	 */
	return 0;
}
#endif
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereit gestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.