/* * Generic process-grouping system. * * Based originally on the cpuset system, extracted by Paul Menage * Copyright (C) 2006 Google, Inc * * Notifications support * Copyright (C) 2009 Nokia Corporation * Author: Kirill A. Shutemov * * Copyright notices from the original cpuset code: * -------------------------------------------------- * Copyright (C) 2003 BULL SA. * Copyright (C) 2004-2006 Silicon Graphics, Inc. * * Portions derived from Patrick Mochel's sysfs code. * sysfs is Copyright (c) 2001-3 Patrick Mochel * * 2003-10-10 Written by Simon Derr. * 2003-10-22 Updates by Stephen Hemminger. * 2004 May-July Rework by Paul Jackson. * --------------------------------------------------- * * This file is subject to the terms and conditions of the GNU General Public * License. See the file COPYING in the main directory of the Linux * distribution for more details.
*/
/*
 * Upper bound on the length of a control file name: the two name limits
 * it is built from plus 2.
 */
#define CGROUP_FILE_NAME_MAX		(MAX_CGROUP_TYPE_NAMELEN +	\
					 MAX_CFTYPE_NAME + 2)
/* let's not notify more than 100 times per second */
#define CGROUP_FILE_NOTIFY_MIN_INTV	DIV_ROUND_UP(HZ, 100)
/*
 * Constant expression that is true only when at least one subsystem is
 * configured (CGROUP_SUBSYS_COUNT > 0).  Guarding code with it keeps the
 * compiler from warning about accesses to what would otherwise be a
 * 0-element array when CGROUP_SUBSYS_COUNT == 0.
 */
#define CGROUP_HAS_SUBSYS_CONFIG	(CGROUP_SUBSYS_COUNT > 0)
/*
 * cgroup_mutex is the master lock.  Any modification to cgroup or its
 * hierarchy must be performed while holding it.
 *
 * css_set_lock protects task->cgroups pointer, the list of css_set
 * objects, and the chain of tasks off each css_set.
 *
 * These locks are exported if CONFIG_PROVE_RCU or CONFIG_LOCKDEP is set
 * so that accessors in cgroup.h can use them for lockdep annotations.
 */
DEFINE_MUTEX(cgroup_mutex);
DEFINE_SPINLOCK(css_set_lock);

#if (defined CONFIG_PROVE_RCU || defined CONFIG_LOCKDEP)
EXPORT_SYMBOL_GPL(cgroup_mutex);
EXPORT_SYMBOL_GPL(css_set_lock);
#endif
/*
 * Guards cgroup_idr and css_idr.  Having a dedicated spinlock lets IDs be
 * released without taking cgroup_mutex.
 */
static DEFINE_SPINLOCK(cgroup_idr_lock);

/*
 * Guards cgroup_file->kn for !self csses, serializing notifications
 * against file removal/re-creation across css hiding.
 */
static DEFINE_SPINLOCK(cgroup_file_kn_lock);
/*
 * cgroup destruction makes heavy use of work items and there can be a lot
 * of concurrent destructions.  Use a separate workqueue so that cgroup
 * destruction work items don't end up filling up max_active of system_wq
 * which may lead to deadlock.
 *
 * A cgroup destruction should enqueue work sequentially to:
 *   cgroup_offline_wq: use for css offline work
 *   cgroup_release_wq: use for css release work
 *   cgroup_free_wq:    use for free work
 *
 * Rationale for using separate workqueues:
 * The cgroup root free work may depend on completion of other css offline
 * operations.  If all tasks were enqueued to a single workqueue, this could
 * create a deadlock scenario where:
 * - Free work waits for other css offline work to complete.
 * - But other css offline work is queued after free work in the same queue.
 *
 * Example deadlock scenario with single workqueue (cgroup_destroy_wq):
 * 1. umount net_prio
 * 2. net_prio root destruction enqueues work to cgroup_destroy_wq (CPUx)
 * 3. perf_event CSS A offline enqueues work to same cgroup_destroy_wq (CPUx)
 * 4. net_prio cgroup_destroy_root->cgroup_lock_and_drain_offline.
 * 5. net_prio root destruction blocks waiting for perf_event CSS A offline,
 *    which can never complete as it's behind in the same queue and
 *    workqueue's max_active is 1.
 */
static struct workqueue_struct *cgroup_offline_wq;
static struct workqueue_struct *cgroup_release_wq;
static struct workqueue_struct *cgroup_free_wq;
/*
 * The default hierarchy is always present, but for backward compatibility
 * it stays hidden until the first time it is mounted.
 */
bool cgrp_dfl_visible;

/* controllers which are not supported in the default hierarchy */
static u16 cgrp_dfl_inhibit_ss_mask;

/* controllers which are implicitly enabled on the default hierarchy */
static u16 cgrp_dfl_implicit_ss_mask;

/* controllers which can be threaded on the default hierarchy */
static u16 cgrp_dfl_threaded_ss_mask;
/* The list of hierarchy roots */
LIST_HEAD(cgroup_roots);
/* companion counter declared alongside the cgroup_roots list */
static int cgroup_root_count;
/* allocates and maps hierarchy IDs; protected by cgroup_mutex */
static DEFINE_IDR(cgroup_hierarchy_idr);

/*
 * Next serial number to hand out to a css.  Serial numbers increase
 * monotonically, so a css with a bigger number is newer than one with a
 * smaller number.  Because csses are always appended to the parent's
 * ->children list, siblings on that list are also sorted in ascending
 * serial number order.  Protected by cgroup_mutex.
 */
static u64 css_serial_nr_next = 1;

/*
 * Per-feature subsystem bitmasks.  They identify subsystems with specific
 * features so that iterative checks don't have to be repeated.
 */
static u16 have_fork_callback __read_mostly;
static u16 have_exit_callback __read_mostly;
static u16 have_release_callback __read_mostly;
static u16 have_canfork_callback __read_mostly;
/** * cgroup_ssid_enabled - cgroup subsys enabled test by subsys ID * @ssid: subsys ID of interest * * cgroup_subsys_enabled() can only be used with literal subsys names which * is fine for individual subsystems but unsuitable for cgroup core. This * is slower static_key_enabled() based test indexed by @ssid.
*/ bool cgroup_ssid_enabled(int ssid)
{ if (!CGROUP_HAS_SUBSYS_CONFIG) returnfalse;
/** * cgroup_on_dfl - test whether a cgroup is on the default hierarchy * @cgrp: the cgroup of interest * * The default hierarchy is the v2 interface of cgroup and this function * can be used to test whether a cgroup is on the default hierarchy for * cases where a subsystem should behave differently depending on the * interface version. * * List of changed behaviors: * * - Mount options "noprefix", "xattr", "clone_children", "release_agent" * and "name" are disallowed. * * - When mounting an existing superblock, mount options should match. * * - rename(2) is disallowed. * * - "tasks" is removed. Everything should be at process granularity. Use * "cgroup.procs" instead. * * - "cgroup.procs" is not sorted. pids will be unique unless they got * recycled in-between reads. * * - "release_agent" and "notify_on_release" are removed. Replacement * notification mechanism will be implemented. * * - "cgroup.clone_children" is removed. * * - "cgroup.subtree_populated" is available. Its value is 0 if the cgroup * and its descendants contain no task; otherwise, 1. The file also * generates kernfs notification which can be monitored through poll and * [di]notify when the value of the file changes. * * - cpuset: tasks will be kept in empty cpusets when hotplug happens and * take masks of ancestors with non-empty cpus/mems, instead of being * moved to an ancestor. * * - cpuset: a task can be moved into an empty cpuset, and again it takes * masks of ancestors. * * - blkcg: blk-throttle becomes properly hierarchical.
*/ bool cgroup_on_dfl(conststruct cgroup *cgrp)
{ return cgrp->root == &cgrp_dfl_root;
}
/* IDR wrappers which synchronize using cgroup_idr_lock */ staticint cgroup_idr_alloc(struct idr *idr, void *ptr, int start, int end,
gfp_t gfp_mask)
{ int ret;
/* can @cgrp host both domain and threaded children? */
static bool cgroup_is_mixable(struct cgroup *cgrp)
{
	/*
	 * Root isn't under domain level resource control exempting it from
	 * the no-internal-process constraint, so it can serve as a thread
	 * root and a parent of resource domains at the same time.
	 *
	 * A cgroup without a parent is the root, hence the test below.
	 */
	return !cgroup_parent(cgrp);
}
/* can @cgrp become a thread root? Should always be true for a thread root */
static bool cgroup_can_be_thread_root(struct cgroup *cgrp)
{
	/* mixables don't care */
	if (cgroup_is_mixable(cgrp))
		return true;

	/* domain roots can't be nested under threaded */
	if (cgroup_is_threaded(cgrp))
		return false;

	/* can only have either domain or threaded children */
	if (cgrp->nr_populated_domain_children)
		return false;

	/* and no domain controllers can be enabled */
	if (cgrp->subtree_control & ~cgrp_dfl_threaded_ss_mask)
		return false;

	return true;
}
/* is @cgrp root of a threaded subtree? */
static bool cgroup_is_thread_root(struct cgroup *cgrp)
{
	/* thread root should be a domain */
	if (cgroup_is_threaded(cgrp))
		return false;

	/* a domain w/ threaded children is a thread root */
	if (cgrp->nr_threaded_children)
		return true;

	/*
	 * A domain which has tasks and explicit threaded controllers
	 * enabled is a thread root.
	 */
	if (cgroup_has_tasks(cgrp) &&
	    (cgrp->subtree_control & cgrp_dfl_threaded_ss_mask))
		return true;

	return false;
}
/* a domain which isn't connected to the root w/o brekage can't be used */ staticbool cgroup_is_valid_domain(struct cgroup *cgrp)
{ /* the cgroup itself can be a thread root */ if (cgroup_is_threaded(cgrp)) returnfalse;
/* but the ancestors can't be unless mixable */ while ((cgrp = cgroup_parent(cgrp))) { if (!cgroup_is_mixable(cgrp) && cgroup_is_thread_root(cgrp)) returnfalse; if (cgroup_is_threaded(cgrp)) returnfalse;
}
if (parent) {
u16 ss_mask = parent->subtree_control;
/* threaded cgroups can only have threaded controllers */ if (cgroup_is_threaded(cgrp))
ss_mask &= cgrp_dfl_threaded_ss_mask; return ss_mask;
}
if (cgroup_on_dfl(cgrp))
root_ss_mask &= ~(cgrp_dfl_inhibit_ss_mask |
cgrp_dfl_implicit_ss_mask); return root_ss_mask;
}
/*
 * Subsystems enabled on @cgrp.
 *
 * For a cgroup without a parent this is the root's subsys_mask.  Otherwise
 * it is the parent's subtree_ss_mask, narrowed to the threaded-capable
 * controllers when @cgrp itself is threaded.
 */
static u16 cgroup_ss_mask(struct cgroup *cgrp)
{
	struct cgroup *parent = cgroup_parent(cgrp);
	u16 mask;

	if (!parent)
		return cgrp->root->subsys_mask;

	mask = parent->subtree_ss_mask;

	/* threaded cgroups can only have threaded controllers */
	if (cgroup_is_threaded(cgrp))
		mask &= cgrp_dfl_threaded_ss_mask;

	return mask;
}
/** * cgroup_css - obtain a cgroup's css for the specified subsystem * @cgrp: the cgroup of interest * @ss: the subsystem of interest (%NULL returns @cgrp->self) * * Return @cgrp's css (cgroup_subsys_state) associated with @ss. This * function must be called either under cgroup_mutex or rcu_read_lock() and * the caller is responsible for pinning the returned css if it wants to * keep accessing it outside the said locks. This function may return * %NULL if @cgrp doesn't have @subsys_id enabled.
*/ staticstruct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp, struct cgroup_subsys *ss)
{ if (CGROUP_HAS_SUBSYS_CONFIG && ss) return rcu_dereference_check(cgrp->subsys[ss->id],
lockdep_is_held(&cgroup_mutex)); else return &cgrp->self;
}
/** * cgroup_e_css_by_mask - obtain a cgroup's effective css for the specified ss * @cgrp: the cgroup of interest * @ss: the subsystem of interest (%NULL returns @cgrp->self) * * Similar to cgroup_css() but returns the effective css, which is defined * as the matching css of the nearest ancestor including self which has @ss * enabled. If @ss is associated with the hierarchy @cgrp is on, this * function is guaranteed to return non-NULL css.
*/ staticstruct cgroup_subsys_state *cgroup_e_css_by_mask(struct cgroup *cgrp, struct cgroup_subsys *ss)
{
lockdep_assert_held(&cgroup_mutex);
if (!ss) return &cgrp->self;
/* * This function is used while updating css associations and thus * can't test the csses directly. Test ss_mask.
*/ while (!(cgroup_ss_mask(cgrp) & (1 << ss->id))) {
cgrp = cgroup_parent(cgrp); if (!cgrp) return NULL;
}
return cgroup_css(cgrp, ss);
}
/**
 * cgroup_e_css - obtain a cgroup's effective css for the specified subsystem
 * @cgrp: the cgroup of interest
 * @ss: the subsystem of interest
 *
 * Look up the effective css of @cgrp for @ss, i.e. the matching css of the
 * nearest ancestor, @cgrp included, for which cgroup_css() yields a
 * non-NULL css.  If the walk reaches past the root without finding one,
 * the css recorded for @ss in init_css_set is returned instead.
 *
 * No reference is taken here and the returned css is not guaranteed to be
 * online; it is the caller's responsibility to try to get a reference.
 *
 * Return: the effective css, or %NULL when no subsystem is configured
 * (!CGROUP_HAS_SUBSYS_CONFIG).
 */
struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp,
					 struct cgroup_subsys *ss)
{
	struct cgroup_subsys_state *found;

	if (!CGROUP_HAS_SUBSYS_CONFIG)
		return NULL;

	for (;;) {
		found = cgroup_css(cgrp, ss);
		if (found)
			return found;

		/* nothing at this level, climb one step towards the root */
		cgrp = cgroup_parent(cgrp);
		if (!cgrp)
			break;
	}

	return init_css_set.subsys[ss->id];
}
/** * cgroup_get_e_css - get a cgroup's effective css for the specified subsystem * @cgrp: the cgroup of interest * @ss: the subsystem of interest * * Find and get the effective css of @cgrp for @ss. The effective css is * defined as the matching css of the nearest ancestor including self which * has @ss enabled. If @ss is not mounted on the hierarchy @cgrp is on, * the root css is returned, so this function always returns a valid css. * The returned css must be put using css_put().
*/ struct cgroup_subsys_state *cgroup_get_e_css(struct cgroup *cgrp, struct cgroup_subsys *ss)
{ struct cgroup_subsys_state *css;
if (!CGROUP_HAS_SUBSYS_CONFIG) return NULL;
rcu_read_lock();
do {
css = cgroup_css(cgrp, ss);
if (css && css_tryget_online(css)) goto out_unlock;
cgrp = cgroup_parent(cgrp);
} while (cgrp);
/** * __cgroup_task_count - count the number of tasks in a cgroup. The caller * is responsible for taking the css_set_lock. * @cgrp: the cgroup in question
*/ int __cgroup_task_count(conststruct cgroup *cgrp)
{ int count = 0; struct cgrp_cset_link *link;
/** * cgroup_task_count - count the number of tasks in a cgroup. * @cgrp: the cgroup in question
*/ int cgroup_task_count(conststruct cgroup *cgrp)
{ int count;
staticstruct cgroup *kn_priv(struct kernfs_node *kn)
{ struct kernfs_node *parent; /* * The parent can not be replaced due to KERNFS_ROOT_INVARIANT_PARENT. * Therefore it is always safe to dereference this pointer outside of a * RCU section.
*/
parent = rcu_dereference_check(kn->__parent,
kernfs_root_flags(kn) & KERNFS_ROOT_INVARIANT_PARENT); return parent->priv;
}
/* * This is open and unprotected implementation of cgroup_css(). * seq_css() is only called from a kernfs file operation which has * an active reference on the file. Because all the subsystem * files are drained before a css is disassociated with a cgroup, * the matching css from the cgroup's subsys table is guaranteed to * be and stay valid until the enclosing operation is complete.
*/ if (CGROUP_HAS_SUBSYS_CONFIG && cft->ss) return rcu_dereference_raw(cgrp->subsys[cft->ss->id]); else return &cgrp->self;
}
EXPORT_SYMBOL_GPL(of_css);
/**
 * for_each_css - iterate all css's of a cgroup
 * @css: the iteration cursor
 * @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end
 * @cgrp: the target cgroup to iterate css's of
 *
 * Slots of @cgrp->subsys[] which hold %NULL are skipped; the statement
 * following the macro runs only for non-NULL csses.
 *
 * Should be called under cgroup_mutex.
 */
#define for_each_css(css, ssid, cgrp)					\
	for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++)	\
		if (!((css) = rcu_dereference_check(			\
				(cgrp)->subsys[(ssid)],			\
				lockdep_is_held(&cgroup_mutex)))) { }	\
		else
/**
 * do_each_subsys_mask - filter for_each_subsys with a bitmask
 * @ss: the iteration cursor
 * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end
 * @ss_mask: the bitmask
 *
 * The block will only run for cases where the ssid-th bit (1 << ssid) of
 * @ss_mask is set.
 *
 * The block opened by do_each_subsys_mask() must be closed with
 * while_each_subsys_mask(), which supplies the matching closing braces
 * and the terminating "while (false)".  When no subsystem is configured
 * (!CGROUP_HAS_SUBSYS_CONFIG), @ssid is set to 0 and the block is skipped.
 */
#define do_each_subsys_mask(ss, ssid, ss_mask) do {			\
	unsigned long __ss_mask = (ss_mask);				\
	if (!CGROUP_HAS_SUBSYS_CONFIG) {				\
		(ssid) = 0;						\
		break;							\
	}								\
	for_each_set_bit(ssid, &__ss_mask, CGROUP_SUBSYS_COUNT) {	\
		(ss) = cgroup_subsys[ssid];				\
		{

#define while_each_subsys_mask()					\
		}							\
	}								\
} while (false)
/*
 * Iterate over child cgrps of @cgrp, skipping children for which
 * cgroup_is_dead() is true.  cgroup_mutex should be held throughout the
 * iteration; this is asserted on every step.
 */
#define cgroup_for_each_live_child(child, cgrp)				\
	list_for_each_entry((child), &(cgrp)->self.children, self.sibling) \
		if (({ lockdep_assert_held(&cgroup_mutex);		\
		       cgroup_is_dead(child); }))			\
			;						\
		else
/*
 * Walk live descendants in pre order.  @d_css is the css cursor and @dsct
 * is set to its cgroup on every step; cgroups for which cgroup_is_dead()
 * is true are skipped.  cgroup_mutex must be held (asserted on every step).
 */
#define cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp)		\
	css_for_each_descendant_pre((d_css), cgroup_css((cgrp), NULL))	\
		if (({ lockdep_assert_held(&cgroup_mutex);		\
		       (dsct) = (d_css)->cgroup;			\
		       cgroup_is_dead(dsct); }))			\
			;						\
		else
/*
 * Walk live descendants in postorder.  @d_css is the css cursor and @dsct
 * is set to its cgroup on every step; cgroups for which cgroup_is_dead()
 * is true are skipped.  cgroup_mutex must be held (asserted on every step).
 */
#define cgroup_for_each_live_descendant_post(dsct, d_css, cgrp)	\
	css_for_each_descendant_post((d_css), cgroup_css((cgrp), NULL))	\
		if (({ lockdep_assert_held(&cgroup_mutex);		\
		       (dsct) = (d_css)->cgroup;			\
		       cgroup_is_dead(dsct); }))			\
			;						\
		else
/*
 * init_css_set is the default css_set: init and its children use it until
 * some hierarchy gets mounted.  It holds a pointer to the root state of
 * each subsystem and also anchors the list of css_sets.  It is not
 * reference-counted, which improves performance when no child cgroups
 * have been created.
 */
struct css_set init_css_set = {
	.refcount		= REFCOUNT_INIT(1),
	.dom_cset		= &init_css_set,
	.tasks			= LIST_HEAD_INIT(init_css_set.tasks),
	.mg_tasks		= LIST_HEAD_INIT(init_css_set.mg_tasks),
	.dying_tasks		= LIST_HEAD_INIT(init_css_set.dying_tasks),
	.task_iters		= LIST_HEAD_INIT(init_css_set.task_iters),
	.threaded_csets		= LIST_HEAD_INIT(init_css_set.threaded_csets),
	.cgrp_links		= LIST_HEAD_INIT(init_css_set.cgrp_links),
	.mg_src_preload_node	= LIST_HEAD_INIT(init_css_set.mg_src_preload_node),
	.mg_dst_preload_node	= LIST_HEAD_INIT(init_css_set.mg_dst_preload_node),
	.mg_node		= LIST_HEAD_INIT(init_css_set.mg_node),

	/*
	 * cgroup_init() re-initializes ->dfl_cgrp when this cset gets
	 * linked.  It is nevertheless set statically as well so that the
	 * default cgroup can be accessed safely early during boot.
	 */
	.dfl_cgrp		= &cgrp_dfl_root.cgrp,
};
static int css_set_count	= 1;	/* 1 for init_css_set */
/** * css_set_populated - does a css_set contain any tasks? * @cset: target css_set * * css_set_populated() should be the same as !!cset->nr_tasks at steady * state. However, css_set_populated() can be called while a task is being * added to or removed from the linked list before the nr_tasks is * properly updated. Hence, we can't just look at ->nr_tasks here.
*/ staticbool css_set_populated(struct css_set *cset)
{
lockdep_assert_held(&css_set_lock);
/**
 * cgroup_update_populated - update the populated count of a cgroup
 * @cgrp: the target cgroup
 * @populated: inc or dec populated count
 *
 * One of the css_sets associated with @cgrp is either getting its first
 * task or losing the last.  Adjust @cgrp->nr_populated_csets by one and,
 * as long as the adjustment flips cgroup_is_populated() for the cgroup
 * just updated, walk towards the root adjusting each ancestor's
 * nr_populated_threaded_children or nr_populated_domain_children
 * (selected by whether the child just processed is threaded).  The walk
 * stops at the first cgroup whose populated state did not change.
 *
 * Must be called with css_set_lock held.
 *
 * NOTE(review): earlier documentation states that userland is notified
 * through the "cgroup.populated" interface file when the populated state
 * changes; no notification call is visible in this function - confirm
 * where that happens.
 */
static void cgroup_update_populated(struct cgroup *cgrp, bool populated)
{
	struct cgroup *child = NULL;
	int adj = populated ? 1 : -1;

	lockdep_assert_held(&css_set_lock);

	do {
		bool was_populated = cgroup_is_populated(cgrp);

		if (!child) {
			/* first iteration: @cgrp itself gained/lost a cset */
			cgrp->nr_populated_csets += adj;
		} else {
			if (cgroup_is_threaded(child))
				cgrp->nr_populated_threaded_children += adj;
			else
				cgrp->nr_populated_domain_children += adj;
		}

		/* state unchanged here, so ancestors are unaffected too */
		if (was_populated == cgroup_is_populated(cgrp))
			break;

		child = cgrp;
		cgrp = cgroup_parent(cgrp);
	} while (cgrp);
}
/** * css_set_update_populated - update populated state of a css_set * @cset: target css_set * @populated: whether @cset is populated or depopulated * * @cset is either getting the first task or losing the last. Update the * populated counters of all associated cgroups accordingly.
*/ staticvoid css_set_update_populated(struct css_set *cset, bool populated)
{ struct cgrp_cset_link *link;
/* * @task is leaving, advance task iterators which are pointing to it so * that they can resume at the next position. Advancing an iterator might * remove it from the list, use safe walk. See css_task_iter_skip() for * details.
*/ staticvoid css_set_skip_task_iters(struct css_set *cset, struct task_struct *task)
{ struct css_task_iter *it, *pos;
/** * css_set_move_task - move a task from one css_set to another * @task: task being moved * @from_cset: css_set @task currently belongs to (may be NULL) * @to_cset: new css_set @task is being moved to (may be NULL) * @use_mg_tasks: move to @to_cset->mg_tasks instead of ->tasks * * Move @task from @from_cset to @to_cset. If @task didn't belong to any * css_set, @from_cset can be NULL. If @task is being disassociated * instead of moved, @to_cset can be NULL. * * This function automatically handles populated counter updates and * css_task_iter adjustments but the caller is responsible for managing * @from_cset and @to_cset's reference counts.
*/ staticvoid css_set_move_task(struct task_struct *task, struct css_set *from_cset, struct css_set *to_cset, bool use_mg_tasks)
{
lockdep_assert_held(&css_set_lock);
if (to_cset && !css_set_populated(to_cset))
css_set_update_populated(to_cset, true);
if (from_cset) {
WARN_ON_ONCE(list_empty(&task->cg_list));
if (to_cset) { /* * We are synchronized through cgroup_threadgroup_rwsem * against PF_EXITING setting such that we can't race * against cgroup_exit()/cgroup_free() dropping the css_set.
*/
WARN_ON_ONCE(task->flags & PF_EXITING);
/* * hash table for cgroup groups. This improves the performance to find * an existing css_set. This hash doesn't (currently) take into * account cgroups in empty hierarchies.
*/ #define CSS_SET_HASH_BITS 7 static DEFINE_HASHTABLE(css_set_table, CSS_SET_HASH_BITS);
if (!refcount_dec_and_test(&cset->refcount)) return;
WARN_ON_ONCE(!list_empty(&cset->threaded_csets));
/* This css_set is dead. Unlink it and release cgroup and css refs */
for_each_subsys(ss, ssid) {
list_del(&cset->e_cset_node[ssid]);
css_put(cset->subsys[ssid]);
}
hash_del(&cset->hlist);
css_set_count--;
if (css_set_threaded(cset)) {
list_del(&cset->threaded_csets_node);
put_css_set_locked(cset->dom_cset);
}
kfree_rcu(cset, rcu_head);
}
/**
 * compare_css_sets - helper function for find_existing_css_set().
 * @cset: candidate css_set being tested
 * @old_cset: existing css_set for a task
 * @new_cgrp: cgroup that's being entered by the task
 * @template: desired set of css pointers in css_set (pre-calculated)
 *
 * Returns true if "cset" matches "old_cset" except for the hierarchy
 * which "new_cgrp" belongs to, for which it should match "new_cgrp".
 *
 * The two cgrp_links lists are walked in lockstep; they are required
 * (BUG_ON) to have equal length and to list hierarchies in the same order.
 */
static bool compare_css_sets(struct css_set *cset,
			     struct css_set *old_cset,
			     struct cgroup *new_cgrp,
			     struct cgroup_subsys_state *template[])
{
	struct cgroup *new_dfl_cgrp;
	struct list_head *l1, *l2;

	/*
	 * On the default hierarchy, there can be csets which are
	 * associated with the same set of cgroups but different csses.
	 * Let's first ensure that csses match.
	 */
	if (memcmp(template, cset->subsys, sizeof(cset->subsys)))
		return false;

	/* @cset's domain should match the default cgroup's */
	if (cgroup_on_dfl(new_cgrp))
		new_dfl_cgrp = new_cgrp;
	else
		new_dfl_cgrp = old_cset->dfl_cgrp;

	if (new_dfl_cgrp->dom_cgrp != cset->dom_cset->dfl_cgrp)
		return false;

	/*
	 * Compare cgroup pointers in order to distinguish between
	 * different cgroups in hierarchies.  As different cgroups may
	 * share the same effective css, this comparison is always
	 * necessary.
	 */
	l1 = &cset->cgrp_links;
	l2 = &old_cset->cgrp_links;
	while (1) {
		struct cgrp_cset_link *link1, *link2;
		struct cgroup *cgrp1, *cgrp2;

		l1 = l1->next;
		l2 = l2->next;
		/* See if we reached the end - both lists are equal length. */
		if (l1 == &cset->cgrp_links) {
			BUG_ON(l2 != &old_cset->cgrp_links);
			break;
		} else {
			BUG_ON(l2 == &old_cset->cgrp_links);
		}
		/* Locate the cgroups associated with these links. */
		link1 = list_entry(l1, struct cgrp_cset_link, cgrp_link);
		link2 = list_entry(l2, struct cgrp_cset_link, cgrp_link);
		cgrp1 = link1->cgrp;
		cgrp2 = link2->cgrp;
		/* Hierarchies should be linked in the same order. */
		BUG_ON(cgrp1->root != cgrp2->root);

		/*
		 * If this hierarchy is the hierarchy of the cgroup
		 * that's changing, then we need to check that this
		 * css_set points to the new cgroup; if it's any other
		 * hierarchy, then this css_set should point to the
		 * same cgroup as the old css_set.
		 */
		if (cgrp1->root == new_cgrp->root) {
			if (cgrp1 != new_cgrp)
				return false;
		} else {
			if (cgrp1 != cgrp2)
				return false;
		}
	}
	return true;
}
/** * find_existing_css_set - init css array and find the matching css_set * @old_cset: the css_set that we're using before the cgroup transition * @cgrp: the cgroup that we're moving into * @template: out param for the new set of csses, should be clear on entry
*/ staticstruct css_set *find_existing_css_set(struct css_set *old_cset, struct cgroup *cgrp, struct cgroup_subsys_state **template)
{ struct cgroup_root *root = cgrp->root; struct cgroup_subsys *ss; struct css_set *cset; unsignedlong key; int i;
/* * Build the set of subsystem state objects that we want to see in the * new css_set. While subsystems can change globally, the entries here * won't change, so no need for locking.
*/
for_each_subsys(ss, i) { if (root->subsys_mask & (1UL << i)) { /* * @ss is in this hierarchy, so we want the * effective css from @cgrp.
*/ template[i] = cgroup_e_css_by_mask(cgrp, ss);
} else { /* * @ss is not in this hierarchy, so we don't want * to change the css.
*/ template[i] = old_cset->subsys[i];
}
}
/**
 * allocate_cgrp_cset_links - allocate cgrp_cset_links
 * @count: the number of links to allocate
 * @tmp_links: list_head the allocated links are put on
 *
 * Allocate @count cgrp_cset_link structures and chain them on @tmp_links
 * through ->cset_link.  @tmp_links is initialized here, so it need not be
 * initialized by the caller.  If any allocation fails, the links already
 * put on @tmp_links are released through free_cgrp_cset_links().
 *
 * Returns 0 on success or -ENOMEM.
 */
static int allocate_cgrp_cset_links(int count, struct list_head *tmp_links)
{
	struct cgrp_cset_link *link;
	int i;

	INIT_LIST_HEAD(tmp_links);

	for (i = 0; i < count; i++) {
		link = kzalloc(sizeof(*link), GFP_KERNEL);
		if (!link) {
			free_cgrp_cset_links(tmp_links);
			return -ENOMEM;
		}
		list_add(&link->cset_link, tmp_links);
	}
	return 0;
}
/** * link_css_set - a helper function to link a css_set to a cgroup * @tmp_links: cgrp_cset_link objects allocated by allocate_cgrp_cset_links() * @cset: the css_set to be linked * @cgrp: the destination cgroup
*/ staticvoid link_css_set(struct list_head *tmp_links, struct css_set *cset, struct cgroup *cgrp)
{ struct cgrp_cset_link *link;
/* * Always add links to the tail of the lists so that the lists are * in chronological order.
*/
list_move_tail(&link->cset_link, &cgrp->cset_links);
list_add_tail(&link->cgrp_link, &cset->cgrp_links);
if (cgroup_parent(cgrp))
cgroup_get_live(cgrp);
}
/** * find_css_set - return a new css_set with one cgroup updated * @old_cset: the baseline css_set * @cgrp: the cgroup to be updated * * Return a new css_set that's equivalent to @old_cset, but with @cgrp * substituted into the appropriate hierarchy.
*/ staticstruct css_set *find_css_set(struct css_set *old_cset, struct cgroup *cgrp)
{ struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT] = { }; struct css_set *cset; struct list_head tmp_links; struct cgrp_cset_link *link; struct cgroup_subsys *ss; unsignedlong key; int ssid;
lockdep_assert_held(&cgroup_mutex);
/* First see if we already have a cgroup group that matches
* the desired set */
spin_lock_irq(&css_set_lock);
cset = find_existing_css_set(old_cset, cgrp, template); if (cset)
get_css_set(cset);
spin_unlock_irq(&css_set_lock);
if (cset) return cset;
cset = kzalloc(sizeof(*cset), GFP_KERNEL); if (!cset) return NULL;
/* Allocate all the cgrp_cset_link objects that we'll need */ if (allocate_cgrp_cset_links(cgroup_root_count, &tmp_links) < 0) {
kfree(cset); return NULL;
}
/* * If @cset should be threaded, look up the matching dom_cset and * link them up. We first fully initialize @cset then look for the * dom_cset. It's simpler this way and safe as @cset is guaranteed * to stay empty until we return.
*/ if (cgroup_is_threaded(cset->dfl_cgrp)) { struct css_set *dcset;
/* * Returned cgroup is without refcount but it's valid as long as cset pins it.
*/ staticinlinestruct cgroup *__cset_cgroup_from_root(struct css_set *cset, struct cgroup_root *root)
{ struct cgroup *res_cgroup = NULL;
/* * If cgroup_mutex is not held, the cgrp_cset_link will be freed * before we remove the cgroup root from the root_list. Consequently, * when accessing a cgroup root, the cset_link may have already been * freed, resulting in a NULL res_cgroup. However, by holding the * cgroup_mutex, we ensure that res_cgroup can't be NULL. * If we don't hold cgroup_mutex in the caller, we must do the NULL * check.
*/ return res_cgroup;
}
/* * look up cgroup associated with current task's cgroup namespace on the * specified hierarchy
*/ staticstruct cgroup *
current_cgns_cgroup_from_root(struct cgroup_root *root)
{ struct cgroup *res = NULL; struct css_set *cset;
lockdep_assert_held(&css_set_lock);
rcu_read_lock();
cset = current->nsproxy->cgroup_ns->root_cset;
res = __cset_cgroup_from_root(cset, root);
rcu_read_unlock();
/* * The namespace_sem is held by current, so the root cgroup can't * be umounted. Therefore, we can ensure that the res is non-NULL.
*/
WARN_ON_ONCE(!res); return res;
}
/* * Look up cgroup associated with current task's cgroup namespace on the default * hierarchy. * * Unlike current_cgns_cgroup_from_root(), this doesn't need locks: * - Internal rcu_read_lock is unnecessary because we don't dereference any rcu * pointers. * - css_set_lock is not needed because we just read cset->dfl_cgrp. * - As a bonus returned cgrp is pinned with the current because it cannot * switch cgroup_ns asynchronously.
*/ staticstruct cgroup *current_cgns_cgroup_dfl(void)
{ struct css_set *cset;
if (current->nsproxy) {
cset = current->nsproxy->cgroup_ns->root_cset; return __cset_cgroup_from_root(cset, &cgrp_dfl_root);
} else { /* * NOTE: This function may be called from bpf_cgroup_from_id() * on a task which has already passed exit_task_namespaces() and * nsproxy == NULL. Fall back to cgrp_dfl_root which will make all * cgroups visible for lookups.
*/ return &cgrp_dfl_root.cgrp;
}
}
/* look up cgroup associated with given css_set on the specified hierarchy */ staticstruct cgroup *cset_cgroup_from_root(struct css_set *cset, struct cgroup_root *root)
{
lockdep_assert_held(&css_set_lock);
return __cset_cgroup_from_root(cset, root);
}
/*
 * Look up the cgroup which @task belongs to on the hierarchy @root.
 *
 * css_set_lock must be held by the caller so that the task's groups can't
 * be modified underneath us.  In addition, either cgroup_mutex or the rcu
 * read lock must be held to keep the cgroup root from being destroyed.
 */
struct cgroup *task_cgroup_from_root(struct task_struct *task,
				     struct cgroup_root *root)
{
	/*
	 * @task itself needs no locking: with css_set_lock held it can't
	 * change groups, so its css_set can be sampled directly.
	 */
	struct css_set *cset = task_css_set(task);

	return cset_cgroup_from_root(cset, root);
}
/*
 * A task must hold cgroup_mutex to modify cgroups.
 *
 * Any task can increment and decrement the count field without lock.
 * So in general, code holding cgroup_mutex can't rely on the count
 * field not changing.  However, if the count goes to zero, then only
 * cgroup_attach_task() can increment it again.  Because a count of zero
 * means that no tasks are currently attached, therefore there is no
 * way a task attached to that cgroup can fork (the other way to
 * increment the count).  So code holding cgroup_mutex can safely
 * assume that if the count is zero, it will stay zero.  Similarly, if
 * a task holds cgroup_mutex on a cgroup with zero count, it
 * knows that the cgroup won't be removed, as cgroup_rmdir()
 * needs that mutex.
 *
 * A cgroup can only be deleted if both its 'count' of using tasks
 * is zero, and its list of 'children' cgroups is empty.  Since all
 * tasks in the system use _some_ cgroup, and since there is always at
 * least one task in the system (init, pid == 1), therefore, root cgroup
 * always has either children cgroups and/or using tasks.  So we don't
 * need a special hack to ensure that root cgroup cannot be deleted.
 *
 * P.S.  One more locking exception.  RCU is used to guard the
 * update of a task's cgroup pointer by cgroup_attach_task()
 */
/** * cgroup_file_mode - deduce file mode of a control file * @cft: the control file in question * * S_IRUGO for read, S_IWUSR for write.
*/ static umode_t cgroup_file_mode(conststruct cftype *cft)
{
umode_t mode = 0;
if (cft->read_u64 || cft->read_s64 || cft->seq_show)
mode |= S_IRUGO;
if (cft->write_u64 || cft->write_s64 || cft->write) { if (cft->flags & CFTYPE_WORLD_WRITABLE)
mode |= S_IWUGO; else
mode |= S_IWUSR;
}
return mode;
}
/**
 * cgroup_calc_subtree_ss_mask - calculate subtree_ss_mask
 * @subtree_control: the new subtree_control mask to consider
 * @this_ss_mask: available subsystems
 *
 * On the default hierarchy, a subsystem may request other subsystems to be
 * enabled together through its ->depends_on mask.  In such cases, more
 * subsystems than specified in "cgroup.subtree_control" may be enabled.
 *
 * This function calculates which subsystems need to be enabled if
 * @subtree_control is to be applied while restricted to @this_ss_mask.
 *
 * Return: the final value of cur_ss_mask, i.e. the mask reached once an
 * iteration no longer changes it.
 *
 * NOTE(review): in this copy of the function, new_ss_mask is used without a
 * visible declaration and the "break" below has no visible enclosing loop;
 * the lines that open the loop and seed new_ss_mask appear to be missing.
 * Confirm against the upstream source before relying on this text.
 */
static u16 cgroup_calc_subtree_ss_mask(u16 subtree_control, u16 this_ss_mask)
{
u16 cur_ss_mask = subtree_control; struct cgroup_subsys *ss; int ssid;
/*
 * Mask out subsystems which aren't available.  This can
 * happen only if some depended-upon subsystems were bound
 * to non-default hierarchies.
 */
new_ss_mask &= this_ss_mask;
/* nothing changed in this pass: the mask is stable, stop iterating */
if (new_ss_mask == cur_ss_mask) break;
cur_ss_mask = new_ss_mask;
}
return cur_ss_mask;
}
/** * cgroup_kn_unlock - unlocking helper for cgroup kernfs methods * @kn: the kernfs_node being serviced * * This helper undoes cgroup_kn_lock_live() and should be invoked before * the method finishes if locking succeeded. Note that once this function * returns the cgroup returned by cgroup_kn_lock_live() may become * inaccessible any time. If the caller intends to continue to access the * cgroup, it should pin it before invoking this function.
*/ void cgroup_kn_unlock(struct kernfs_node *kn)
{ struct cgroup *cgrp;
/** * cgroup_kn_lock_live - locking helper for cgroup kernfs methods * @kn: the kernfs_node being serviced * @drain_offline: perform offline draining on the cgroup * * This helper is to be used by a cgroup kernfs method currently servicing * @kn. It breaks the active protection, performs cgroup locking and * verifies that the associated cgroup is alive. Returns the cgroup if * alive; otherwise, %NULL. A successful return should be undone by a * matching cgroup_kn_unlock() invocation. If @drain_offline is %true, the * cgroup is drained of offlining csses before return. * * Any cgroup kernfs method implementation which requires locking the * associated cgroup should use this helper. It avoids nesting cgroup * locking under kernfs active protection and allows all kernfs operations * including self-removal.
*/ struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn, bool drain_offline)
{ struct cgroup *cgrp;
do_each_subsys_mask(ss, ssid, ss_mask) { /* * If @ss has non-root csses attached to it, can't move. * If @ss is an implicit controller, it is exempt from this * rule and can be stolen.
*/ if (css_next_child(NULL, cgroup_css(&ss->root->cgrp, ss)) &&
!ss->implicit_on_dfl) return -EBUSY;
/* can't move between two non-dummy roots either */ if (ss->root != &cgrp_dfl_root && dst_root != &cgrp_dfl_root) return -EBUSY;
/* * Collect ssid's that need to be disabled from default * hierarchy.
*/ if (ss->root == &cgrp_dfl_root)
dfl_disable_ss_mask |= 1 << ssid;
} while_each_subsys_mask();
if (dfl_disable_ss_mask) { struct cgroup *scgrp = &cgrp_dfl_root.cgrp;
/* * Controllers from default hierarchy that need to be rebound * are all disabled together in one go.
*/
cgrp_dfl_root.subsys_mask &= ~dfl_disable_ss_mask;
WARN_ON(cgroup_apply_control(scgrp));
cgroup_finalize_control(scgrp, 0);
}
spin_lock_irq(&css_set_lock);
css->cgroup = dcgrp;
WARN_ON(!list_empty(&dcgrp->e_csets[ss->id]));
list_for_each_entry_safe(cset, cset_pos, &scgrp->e_csets[ss->id],
e_cset_node[ss->id]) {
list_move_tail(&cset->e_cset_node[ss->id],
&dcgrp->e_csets[ss->id]); /* * all css_sets of scgrp together in same order to dcgrp, * patch in-flight iterators to preserve correct iteration. * since the iterator is always advanced right away and * finished when it->cset_pos meets it->cset_head, so only * update it->cset_head is enough here.
*/
list_for_each_entry(it, &cset->task_iters, iters_node) if (it->cset_head == &scgrp->e_csets[ss->id])
it->cset_head = &dcgrp->e_csets[ss->id];
}
spin_unlock_irq(&css_set_lock);
/* DYNMODS must be modified through cgroup_favor_dynmods() */
root->flags = ctx->flags & ~CGRP_ROOT_FAVOR_DYNMODS; if (ctx->release_agent)
strscpy(root->release_agent_path, ctx->release_agent, PATH_MAX); if (ctx->name)
strscpy(root->name, ctx->name, MAX_CGROUP_ROOT_NAMELEN); if (ctx->cpuset_clone_children)
set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags);
}
int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask)
{
LIST_HEAD(tmp_links); struct cgroup *root_cgrp = &root->cgrp; struct kernfs_syscall_ops *kf_sops; struct css_set *cset; int i, ret;
lockdep_assert_held(&cgroup_mutex);
ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release,
0, GFP_KERNEL); if (ret) goto out;
/* * We're accessing css_set_count without locking css_set_lock here, * but that's OK - it can only be increased by someone holding * cgroup_lock, and that's us. Later rebinding may disable * controllers on the default hierarchy and thus create new csets, * which can't be more than the existing ones. Allocate 2x.
*/
ret = allocate_cgrp_cset_links(2 * css_set_count, &tmp_links); if (ret) goto cancel_ref;
ret = cgroup_init_root_id(root); if (ret) goto cancel_ref;
ret = css_populate_dir(&root_cgrp->self); if (ret) goto destroy_root;
ret = css_rstat_init(&root_cgrp->self); if (ret) goto destroy_root;
ret = rebind_subsystems(root, ss_mask); if (ret) goto exit_stats;
ret = blocking_notifier_call_chain(&cgroup_lifetime_notifier,
CGROUP_LIFETIME_ONLINE, root_cgrp);
WARN_ON_ONCE(notifier_to_errno(ret));
trace_cgroup_setup_root(root);
/* * There must be no failure case after here, since rebinding takes * care of subsystems' refcounts, which are explicitly dropped in * the failure exit path.
*/
list_add_rcu(&root->root_list, &cgroup_roots);
cgroup_root_count++;
/* * Link the root cgroup in this hierarchy into all the css_set * objects.
*/
spin_lock_irq(&css_set_lock);
hash_for_each(css_set_table, i, cset, hlist) {
link_css_set(&tmp_links, cset, root_cgrp); if (css_set_populated(cset))
cgroup_update_populated(root_cgrp, true);
}
spin_unlock_irq(&css_set_lock);
/* * If @root doesn't have any children, start killing it. * This prevents new mounts by disabling percpu_ref_tryget_live(). * * And don't kill the default root.
*/ if (list_empty(&root->cgrp.self.children) && root != &cgrp_dfl_root &&
!percpu_ref_is_dying(&root->cgrp.self.refcnt))
percpu_ref_kill(&root->cgrp.self.refcnt);
cgroup_put(&root->cgrp);
kernfs_kill_sb(sb);
}
/* * This is ugly, but preserves the userspace API for existing cpuset * users. If someone tries to mount the "cpuset" filesystem, we * silently switch it to mount "cgroup" instead
*/ staticint cpuset_init_fs_context(struct fs_context *fc)
{ char *agent = kstrdup("/sbin/cpuset_release_agent", GFP_USER); struct cgroup_fs_context *ctx; int err;
err = cgroup_init_fs_context(fc); if (err) {
kfree(agent); return err;
}
/*
 * Locked wrapper around cgroup_path_ns_locked().
 *
 * Takes cgroup_lock() and then css_set_lock (with IRQs disabled) around the
 * call and releases them in reverse order afterwards.  @cgrp, @buf, @buflen
 * and @ns are passed through unchanged and the callee's return value is
 * handed back to the caller as is.
 */
int cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen,
		   struct cgroup_namespace *ns)
{
	int rc;

	cgroup_lock();
	spin_lock_irq(&css_set_lock);

	rc = cgroup_path_ns_locked(cgrp, buf, buflen, ns);

	spin_unlock_irq(&css_set_lock);
	cgroup_unlock();

	return rc;
}
EXPORT_SYMBOL_GPL(cgroup_path_ns);
/**
 * cgroup_attach_lock - Lock for ->attach()
 * @lock_threadgroup: whether to down_write cgroup_threadgroup_rwsem
 *
 * cgroup migration sometimes has to write-lock cgroup_threadgroup_rwsem to
 * keep threadgroups stable against forks and exits.  Some ->attach()
 * implementations (e.g. cpuset) additionally need CPU hotplug disabled, but
 * letting ->attach() take cpus_read_lock() by itself can deadlock.
 *
 * Bringing up a CPU may involve creating and destroying tasks, which
 * read-locks threadgroup_rwsem, so threadgroup_rwsem nests inside
 * cpus_read_lock().  If an ->attach() acquired the cpus lock while
 * threadgroup_rwsem is write-locked, the order would be reversed: we would
 * wait for an on-going CPU hotplug operation which is itself waiting for
 * threadgroup_rwsem to be released so that it can create new tasks.  For
 * more details:
 *
 *   http://lkml.kernel.org/r/20220711174629.uehfmqegcwn2lqzu@wubuntu
 *
 * The situation is resolved by always acquiring cpus_read_lock() first and
 * only then, optionally, write-locking cgroup_threadgroup_rwsem.  ->attach()
 * can therefore assume that CPU hotplug is disabled on entry.
 */
void cgroup_attach_lock(bool lock_threadgroup)
{
	/* always taken, and always taken first */
	cpus_read_lock();

	if (!lock_threadgroup)
		return;

	percpu_down_write(&cgroup_threadgroup_rwsem);
}
/**
 * cgroup_migrate_add_task - add a migration target task to a migration context
 * @task: target task
 * @mgctx: target migration context
 *
 * Add @task, which is a migration target, to @mgctx->tset.  This function
 * becomes noop if @task doesn't need to be migrated.  @task's css_set
 * should have been added as a migration source and @task->cg_list will be
 * moved from the css_set's tasks list to mg_tasks one.
 *
 * Must be called with css_set_lock held (asserted below).
 */
static void cgroup_migrate_add_task(struct task_struct *task,
				    struct cgroup_mgctx *mgctx)
{
	struct css_set *cset;

	lockdep_assert_held(&css_set_lock);

	/* @task either already exited or can't exit until the end */
	if (task->flags & PF_EXITING)
		return;

	/* cgroup_threadgroup_rwsem protects racing against forks */
	WARN_ON_ONCE(list_empty(&task->cg_list));

	/* a css_set without ->mg_src_cgrp was not set up as a migration source */
	cset = task_css_set(task);
	if (!cset->mg_src_cgrp)
		return;

	mgctx->tset.nr_tasks++;

	/*
	 * Queue the task on its css_set's mg_tasks list and make sure both the
	 * source css_set and its destination are on the taskset's lists
	 * exactly once (->mg_node is empty while the css_set is unqueued).
	 */
	list_move_tail(&task->cg_list, &cset->mg_tasks);
	if (list_empty(&cset->mg_node))
		list_add_tail(&cset->mg_node,
			      &mgctx->tset.src_csets);
	if (list_empty(&cset->mg_dst_cset->mg_node))
		list_add_tail(&cset->mg_dst_cset->mg_node,
			      &mgctx->tset.dst_csets);
}
/**
 * cgroup_taskset_first - reset taskset and return the first task
 * @tset: taskset of interest
 * @dst_cssp: output variable for the destination css
 *
 * Rewinds the iteration state of @tset to the first css_set on @tset->csets
 * with no current task, then hands over to cgroup_taskset_next() to produce
 * the first task.
 *
 * Return: whatever cgroup_taskset_next() returns for the rewound @tset.
 */
struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset,
					 struct cgroup_subsys_state **dst_cssp)
{
	/* rewind the cursor: no task yet, first css_set on the list */
	tset->cur_task = NULL;
	tset->cur_cset = list_first_entry(tset->csets, struct css_set, mg_node);

	return cgroup_taskset_next(tset, dst_cssp);
}
/** * cgroup_taskset_next - iterate to the next task in taskset * @tset: taskset of interest * @dst_cssp: output variable for the destination css * * Return the next task in @tset. Iteration must have been initialized * with cgroup_taskset_first().
*/ struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset, struct cgroup_subsys_state **dst_cssp)
{ struct css_set *cset = tset->cur_cset; struct task_struct *task = tset->cur_task;
while (CGROUP_HAS_SUBSYS_CONFIG && &cset->mg_node != tset->csets) { if (!task)
task = list_first_entry(&cset->mg_tasks, struct task_struct, cg_list); else
task = list_next_entry(task, cg_list);
/* * This function may be called both before and * after cgroup_migrate_execute(). The two cases * can be distinguished by looking at whether @cset * has its ->mg_dst_cset set.
*/ if (cset->mg_dst_cset)
*dst_cssp = cset->mg_dst_cset->subsys[tset->ssid]; else
*dst_cssp = cset->subsys[tset->ssid];
/**
 * cgroup_migrate_execute - migrate a taskset
 * @mgctx: migration context
 *
 * Migrate tasks in @mgctx as setup by migration preparation functions.
 * This function fails iff one of the ->can_attach callbacks fails and
 * guarantees that either all or none of the tasks in @mgctx are migrated.
 * @mgctx is consumed regardless of success.
 *
 * Return: 0 on success, otherwise the error returned by the failing
 * ->can_attach() callback.
 */
static int cgroup_migrate_execute(struct cgroup_mgctx *mgctx)
{
	struct cgroup_taskset *tset = &mgctx->tset;
	struct cgroup_subsys *ss;
	struct task_struct *task, *tmp_task;
	struct css_set *cset, *tmp_cset;
	int ssid, failed_ssid, ret;

	/* check that we can legitimately attach to the cgroup */
	if (tset->nr_tasks) {
		do_each_subsys_mask(ss, ssid, mgctx->ss_mask) {
			if (ss->can_attach) {
				tset->ssid = ssid;
				ret = ss->can_attach(tset);
				if (ret) {
					/* remember where to stop the rollback */
					failed_ssid = ssid;
					goto out_cancel_attach;
				}
			}
		} while_each_subsys_mask();
	}

	/*
	 * Now that we're guaranteed success, proceed to move all tasks to
	 * the new cgroup.  There are no failure cases after here, so this
	 * is the commit point.
	 */
	spin_lock_irq(&css_set_lock);
	list_for_each_entry(cset, &tset->src_csets, mg_node) {
		list_for_each_entry_safe(task, tmp_task, &cset->mg_tasks, cg_list) {
			struct css_set *from_cset = task_css_set(task);
			struct css_set *to_cset = cset->mg_dst_cset;

			get_css_set(to_cset);
			to_cset->nr_tasks++;
			css_set_move_task(task, from_cset, to_cset, true);
			from_cset->nr_tasks--;
			/*
			 * If the source or destination cgroup is frozen,
			 * the task might require to change its state.
			 */
			cgroup_freezer_migrate_task(task, from_cset->dfl_cgrp,
						    to_cset->dfl_cgrp);
			put_css_set_locked(from_cset);
		}
	}
	spin_unlock_irq(&css_set_lock);

	/*
	 * Migration is committed, all target tasks are now on dst_csets.
	 * Nothing is sensitive to fork() after this point.  Notify
	 * controllers that migration is complete.
	 */
	tset->csets = &tset->dst_csets;

	if (tset->nr_tasks) {
		do_each_subsys_mask(ss, ssid, mgctx->ss_mask) {
			if (ss->attach) {
				tset->ssid = ssid;
				ss->attach(tset);
			}
		} while_each_subsys_mask();
	}

	/* success: skip the rollback, but still release the taskset */
	ret = 0;
	goto out_release_tset;

out_cancel_attach:
	/*
	 * A ->can_attach() failed.  Undo the ones which already succeeded,
	 * i.e. every subsystem that comes before the failing one.
	 */
	if (tset->nr_tasks) {
		do_each_subsys_mask(ss, ssid, mgctx->ss_mask) {
			if (ssid == failed_ssid)
				break;
			if (ss->cancel_attach) {
				tset->ssid = ssid;
				ss->cancel_attach(tset);
			}
		} while_each_subsys_mask();
	}
out_release_tset:
	/*
	 * Whether we migrated or not, put any task still sitting on a
	 * ->mg_tasks list back on its css_set's ->tasks list and unlink all
	 * css_sets from the taskset so that @mgctx is fully consumed.
	 */
	spin_lock_irq(&css_set_lock);
	list_splice_init(&tset->dst_csets, &tset->src_csets);
	list_for_each_entry_safe(cset, tmp_cset, &tset->src_csets, mg_node) {
		list_splice_tail_init(&cset->mg_tasks, &cset->tasks);
		list_del_init(&cset->mg_node);
	}
	spin_unlock_irq(&css_set_lock);

	/*
	 * Re-initialize the cgroup_taskset structure in case it is reused
	 * again in another cgroup_migrate_add_task()/cgroup_migrate_execute()
	 * iteration.
	 */
	tset->nr_tasks = 0;
	tset->csets = &tset->src_csets;
	return ret;
}
/**
 * cgroup_migrate_vet_dst - verify whether a cgroup can be migration destination
 * @dst_cgrp: destination cgroup to test
 *
 * On the default hierarchy, except for the mixable, (possible) thread root
 * and threaded cgroups, subtree_control must be zero for migration
 * destination cgroups with tasks so that child cgroups don't compete
 * against tasks.
 *
 * Return: 0 if @dst_cgrp may receive tasks, -EOPNOTSUPP if its domain
 * cgroup is not a valid domain, -EBUSY if it has subtree_control enabled.
 */
int cgroup_migrate_vet_dst(struct cgroup *dst_cgrp)
{
	/* v1 doesn't have any restriction */
	if (!cgroup_on_dfl(dst_cgrp))
		return 0;

	/* verify @dst_cgrp can host resources */
	if (!cgroup_is_valid_domain(dst_cgrp->dom_cgrp))
		return -EOPNOTSUPP;

	/*
	 * If @dst_cgrp is already or can become a thread root or is
	 * threaded, it doesn't matter.
	 */
	if (cgroup_can_be_thread_root(dst_cgrp))
		return 0;
	if (cgroup_is_threaded(dst_cgrp))
		return 0;

	/* apply no-internal-process constraint */
	return dst_cgrp->subtree_control ? -EBUSY : 0;
}
/** * cgroup_migrate_finish - cleanup after attach * @mgctx: migration context * * Undo cgroup_migrate_add_src() and cgroup_migrate_prepare_dst(). See * those functions for details.
*/ void cgroup_migrate_finish(struct cgroup_mgctx *mgctx)
{ struct css_set *cset, *tmp_cset;
/** * cgroup_migrate_add_src - add a migration source css_set * @src_cset: the source css_set to add * @dst_cgrp: the destination cgroup * @mgctx: migration context * * Tasks belonging to @src_cset are about to be migrated to @dst_cgrp. Pin * @src_cset and add it to @mgctx->src_csets, which should later be cleaned * up by cgroup_migrate_finish(). * * This function may be called without holding cgroup_threadgroup_rwsem * even if the target is a process. Threads may be created and destroyed * but as long as cgroup_mutex is not dropped, no new css_set can be put * into play and the preloaded css_sets are guaranteed to cover all * migrations.
*/ void cgroup_migrate_add_src(struct css_set *src_cset, struct cgroup *dst_cgrp, struct cgroup_mgctx *mgctx)
{ struct cgroup *src_cgrp;
/* * If ->dead, @src_set is associated with one or more dead cgroups * and doesn't contain any migratable tasks. Ignore it early so * that the rest of migration path doesn't get confused by it.
*/ if (src_cset->dead) return;
if (!list_empty(&src_cset->mg_src_preload_node)) return;
/** * cgroup_migrate_prepare_dst - prepare destination css_sets for migration * @mgctx: migration context * * Tasks are about to be moved and all the source css_sets have been * preloaded to @mgctx->preloaded_src_csets. This function looks up and * pins all destination css_sets, links each to its source, and appends them * to @mgctx->preloaded_dst_csets. * * This function must be called after cgroup_migrate_add_src() has been * called on each migration source css_set. After migration is performed * using cgroup_migrate(), cgroup_migrate_finish() must be called on * @mgctx.
*/ int cgroup_migrate_prepare_dst(struct cgroup_mgctx *mgctx)
{ struct css_set *src_cset, *tmp_cset;
lockdep_assert_held(&cgroup_mutex);
/* look up the dst cset for each src cset and link it to src */
list_for_each_entry_safe(src_cset, tmp_cset, &mgctx->preloaded_src_csets,
mg_src_preload_node) { struct css_set *dst_cset; struct cgroup_subsys *ss; int ssid;
dst_cset = find_css_set(src_cset, src_cset->mg_dst_cgrp); if (!dst_cset) return -ENOMEM;
/* * If src cset equals dst, it's noop. Drop the src. * cgroup_migrate() will skip the cset too. Note that we * can't handle src == dst as some nodes are used by both.
*/ if (src_cset == dst_cset) {
src_cset->mg_src_cgrp = NULL;
src_cset->mg_dst_cgrp = NULL;
list_del_init(&src_cset->mg_src_preload_node);
put_css_set(src_cset);
put_css_set(dst_cset); continue;
}
src_cset->mg_dst_cset = dst_cset;
if (list_empty(&dst_cset->mg_dst_preload_node))
list_add_tail(&dst_cset->mg_dst_preload_node,
&mgctx->preloaded_dst_csets); else
put_css_set(dst_cset);
/**
 * cgroup_migrate - migrate a process or task to a cgroup
 * @leader: the leader of the process or the task to migrate
 * @threadgroup: whether @leader points to the whole process or a single task
 * @mgctx: migration context
 *
 * Migrate a process or task denoted by @leader.  If migrating a process,
 * the caller must be holding cgroup_threadgroup_rwsem.  The caller is also
 * responsible for invoking cgroup_migrate_add_src() and
 * cgroup_migrate_prepare_dst() on the targets before invoking this
 * function and following up with cgroup_migrate_finish().
 *
 * As long as a controller's ->can_attach() doesn't fail, this function is
 * guaranteed to succeed.  This means that, excluding ->can_attach()
 * failure, when migrating multiple targets, the success or failure can be
 * decided for all targets by invoking cgroup_migrate_prepare_dst() before
 * actually starting migrating.
 *
 * Return: the result of cgroup_migrate_execute() on @mgctx.
 */
int cgroup_migrate(struct task_struct *leader, bool threadgroup,
		   struct cgroup_mgctx *mgctx)
{
	struct task_struct *t = leader;

	/*
	 * The thread walk below has to run inside an RCU critical section
	 * so that tasks are not freed while the snapshot is being taken;
	 * spin_lock_irq() implies one here.
	 */
	spin_lock_irq(&css_set_lock);
	do {
		cgroup_migrate_add_task(t, mgctx);
		/* single-task migration: @leader is the only target */
		if (!threadgroup)
			break;
	} while_each_thread(leader, t);
	spin_unlock_irq(&css_set_lock);

	return cgroup_migrate_execute(mgctx);
}
/** * cgroup_attach_task - attach a task or a whole threadgroup to a cgroup * @dst_cgrp: the cgroup to attach to * @leader: the task or the leader of the threadgroup to be attached * @threadgroup: attach the whole threadgroup? * * Call holding cgroup_mutex and cgroup_threadgroup_rwsem.
*/ int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader, bool threadgroup)
{
DEFINE_CGROUP_MGCTX(mgctx); struct task_struct *task; int ret = 0;
/* look up all src csets */
spin_lock_irq(&css_set_lock);
rcu_read_lock();
task = leader; do {
cgroup_migrate_add_src(task_css_set(task), dst_cgrp, &mgctx); if (!threadgroup) break;
} while_each_thread(leader, task);
rcu_read_unlock();
spin_unlock_irq(&css_set_lock);
/* prepare dst csets and commit */
ret = cgroup_migrate_prepare_dst(&mgctx); if (!ret)
ret = cgroup_migrate(leader, threadgroup, &mgctx);
cgroup_migrate_finish(&mgctx);
if (!ret)
TRACE_CGROUP_PATH(attach_task, dst_cgrp, leader, threadgroup);
if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0) return ERR_PTR(-EINVAL);
/* * If we migrate a single thread, we don't care about threadgroup * stability. If the thread is `current`, it won't exit(2) under our * hands or change PID through exec(2). We exclude * cgroup_update_dfl_csses and other cgroup_{proc,thread}s_write * callers by cgroup_mutex. * Therefore, we can skip the global lock.
*/
lockdep_assert_held(&cgroup_mutex);
*threadgroup_locked = pid || threadgroup;
cgroup_attach_lock(*threadgroup_locked);
/* * kthreads may acquire PF_NO_SETAFFINITY during initialization. * If userland migrates such a kthread to a non-root cgroup, it can * become trapped in a cpuset, or RT kthread may be born in a * cgroup with no rt_runtime allocated. Just say no.
*/ if (tsk->no_cgroup_migration || (tsk->flags & PF_NO_SETAFFINITY)) {
tsk = ERR_PTR(-EINVAL); goto out_unlock_threadgroup;
}
/* show controllers which are enabled from the parent */ staticint cgroup_controllers_show(struct seq_file *seq, void *v)
{ struct cgroup *cgrp = seq_css(seq)->cgroup;
/* show controllers which are enabled for a given cgroup's children */ staticint cgroup_subtree_control_show(struct seq_file *seq, void *v)
{ struct cgroup *cgrp = seq_css(seq)->cgroup;
/**
 * cgroup_update_dfl_csses - update css assoc of a subtree in default hierarchy
 * @cgrp: root of the subtree to update csses for
 *
 * @cgrp's control masks have changed and its subtree's css associations
 * need to be updated accordingly.  This function looks up all css_sets
 * which are attached to the subtree, creates the matching updated css_sets
 * and migrates the tasks to the new ones.
 *
 * Must be called with cgroup_mutex held (asserted below).
 *
 * Return: the result of cgroup_migrate_execute(), or the error from
 * cgroup_migrate_prepare_dst() if that step fails.
 *
 * NOTE(review): this copy of the body looks incomplete - "task" and "ntask"
 * are used without a visible declaration, while "link" and "src_cset" are
 * declared but never assigned in the visible text.  The "staticint" on the
 * definition line also looks like a fused "static int".  Verify against the
 * upstream file before relying on this text.
 */
staticint cgroup_update_dfl_csses(struct cgroup *cgrp)
{
DEFINE_CGROUP_MGCTX(mgctx); struct cgroup_subsys_state *d_css; struct cgroup *dsct; struct css_set *src_cset; bool has_tasks; int ret;
lockdep_assert_held(&cgroup_mutex);
/* look up all csses currently attached to @cgrp's subtree */
spin_lock_irq(&css_set_lock);
cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) { struct cgrp_cset_link *link;
/*
 * As cgroup_update_dfl_csses() is only called by
 * cgroup_apply_control(). The csses associated with the
 * given cgrp will not be affected by changes made to
 * its subtree_control file. We can skip them.
 */
if (dsct == cgrp) continue;
/*
 * We need to write-lock threadgroup_rwsem while migrating tasks.
 * However, if there are no source csets for @cgrp, changing its
 * controllers isn't gonna produce any task migrations and the
 * write-locking can be skipped safely.
 */
has_tasks = !list_empty(&mgctx.preloaded_src_csets);
cgroup_attach_lock(has_tasks);
/* NULL dst indicates self on default hierarchy */
ret = cgroup_migrate_prepare_dst(&mgctx); if (ret) goto out_finish;
/* all tasks in src_csets need to be migrated */
list_for_each_entry_safe(task, ntask, &src_cset->tasks, cg_list)
cgroup_migrate_add_task(task, &mgctx);
}
spin_unlock_irq(&css_set_lock);
ret = cgroup_migrate_execute(&mgctx);
/* common exit: undo the migration setup and drop the attach lock */
out_finish:
cgroup_migrate_finish(&mgctx);
cgroup_attach_unlock(has_tasks); return ret;
}
/** * cgroup_lock_and_drain_offline - lock cgroup_mutex and drain offlined csses * @cgrp: root of the target subtree * * Because css offlining is asynchronous, userland may try to re-enable a * controller while the previous css is still around. This function grabs * cgroup_mutex and drains the previous css instances of @cgrp's subtree.
*/ void cgroup_lock_and_drain_offline(struct cgroup *cgrp)
__acquires(&cgroup_mutex)
{ struct cgroup *dsct; struct cgroup_subsys_state *d_css; struct cgroup_subsys *ss; int ssid;
/** * cgroup_save_control - save control masks and dom_cgrp of a subtree * @cgrp: root of the target subtree * * Save ->subtree_control, ->subtree_ss_mask and ->dom_cgrp to the * respective old_ prefixed fields for @cgrp's subtree including @cgrp * itself.
*/ staticvoid cgroup_save_control(struct cgroup *cgrp)
{ struct cgroup *dsct; struct cgroup_subsys_state *d_css;
/** * cgroup_propagate_control - refresh control masks of a subtree * @cgrp: root of the target subtree * * For @cgrp and its subtree, ensure ->subtree_ss_mask matches * ->subtree_control and propagate controller availability through the * subtree so that descendants don't have unavailable controllers enabled.
*/ staticvoid cgroup_propagate_control(struct cgroup *cgrp)
{ struct cgroup *dsct; struct cgroup_subsys_state *d_css;
/** * cgroup_restore_control - restore control masks and dom_cgrp of a subtree * @cgrp: root of the target subtree * * Restore ->subtree_control, ->subtree_ss_mask and ->dom_cgrp from the * respective old_ prefixed fields for @cgrp's subtree including @cgrp * itself.
*/ staticvoid cgroup_restore_control(struct cgroup *cgrp)
{ struct cgroup *dsct; struct cgroup_subsys_state *d_css;
if (cgroup_control(cgrp) & (1 << ss->id)) returntrue; if (!(cgroup_ss_mask(cgrp) & (1 << ss->id))) returnfalse; return cgroup_on_dfl(cgrp) && ss->implicit_on_dfl;
}
/**
 * cgroup_apply_control_enable - enable or show csses according to control
 * @cgrp: root of the target subtree
 *
 * Walk @cgrp's subtree and create new csses or make the existing ones
 * visible.  A css is created invisible if it's being implicitly enabled
 * through dependency.  An invisible css is made visible when the userland
 * explicitly enables it.
 *
 * Returns 0 on success, -errno on failure.  On failure, csses which have
 * been processed already aren't cleaned up.  The caller is responsible for
 * cleaning up with cgroup_apply_control_disable().
 *
 * NOTE(review): this copy of the body looks incomplete - "css" is used
 * without a visible declaration and the two closing braces before
 * "return 0" have no visible loop openers.  The "staticint" on the
 * definition line also looks like a fused "static int".  Verify against
 * the upstream file before relying on this text.
 */
staticint cgroup_apply_control_enable(struct cgroup *cgrp)
{ struct cgroup *dsct; struct cgroup_subsys_state *d_css; struct cgroup_subsys *ss; int ssid, ret;
/* skip subsystems whose bit is not set in cgroup_ss_mask() of @dsct */
if (!(cgroup_ss_mask(dsct) & (1 << ss->id))) continue;
/* no css yet: create one and propagate the error if that fails */
if (!css) {
css = css_create(dsct, ss); if (IS_ERR(css)) return PTR_ERR(css);
}
WARN_ON_ONCE(percpu_ref_is_dying(&css->refcnt));
/* populate the css's directory only when css_visible() says so */
if (css_visible(css)) {
ret = css_populate_dir(css); if (ret) return ret;
}
}
}
return 0;
}
/** * cgroup_apply_control_disable - kill or hide csses according to control * @cgrp: root of the target subtree * * Walk @cgrp's subtree and kill and hide csses so that they match * cgroup_ss_mask() and cgroup_visible_mask(). * * A css is hidden when the userland requests it to be disabled while other * subsystems are still depending on it. The css must not actively control * resources and be in the vanilla state if it's made visible again later. * Controllers which may be depended upon should provide ->css_reset() for * this purpose.
*/ staticvoid cgroup_apply_control_disable(struct cgroup *cgrp)
{ struct cgroup *dsct; struct cgroup_subsys_state *d_css; struct cgroup_subsys *ss; int ssid;
/**
 * cgroup_apply_control - apply control mask updates to the subtree
 * @cgrp: root of the target subtree
 *
 * Subsystems can be enabled and disabled in a subtree using the following
 * steps.
 *
 * 1. Call cgroup_save_control() to stash the current state.
 * 2. Update ->subtree_control masks in the subtree as desired.
 * 3. Call cgroup_apply_control() to apply the changes.
 * 4. Optionally perform other related operations.
 * 5. Call cgroup_finalize_control() to finish up.
 *
 * This function implements step 3 and propagates the mask changes
 * throughout @cgrp's subtree, updates csses accordingly and performs
 * process migrations.
 *
 * Return: 0 on success, otherwise the error from
 * cgroup_apply_control_enable() or cgroup_update_dfl_csses().
 */
static int cgroup_apply_control(struct cgroup *cgrp)
{
	int ret;

	cgroup_propagate_control(cgrp);

	ret = cgroup_apply_control_enable(cgrp);
	if (ret)
		return ret;

	/*
	 * At this point, cgroup_e_css_by_mask() results reflect the new csses
	 * making the following cgroup_update_dfl_csses() properly update
	 * css associations of all tasks in the subtree.
	 */
	return cgroup_update_dfl_csses(cgrp);
}
/** * cgroup_finalize_control - finalize control mask update * @cgrp: root of the target subtree * @ret: the result of the update * * Finalize control mask update. See cgroup_apply_control() for more info.
*/ staticvoid cgroup_finalize_control(struct cgroup *cgrp, int ret)
{ if (ret) {
cgroup_restore_control(cgrp);
cgroup_propagate_control(cgrp);
}
/* if nothing is getting enabled, nothing to worry about */ if (!enable) return 0;
/* can @cgrp host any resources? */ if (!cgroup_is_valid_domain(cgrp->dom_cgrp)) return -EOPNOTSUPP;
/* mixables don't care */ if (cgroup_is_mixable(cgrp)) return 0;
if (domain_enable) { /* can't enable domain controllers inside a thread subtree */ if (cgroup_is_thread_root(cgrp) || cgroup_is_threaded(cgrp)) return -EOPNOTSUPP;
} else { /* * Threaded controllers can handle internal competitions * and are always allowed inside a (prospective) thread * subtree.
*/ if (cgroup_can_be_thread_root(cgrp) || cgroup_is_threaded(cgrp)) return 0;
}
/* * Controllers can't be enabled for a cgroup with tasks to avoid * child cgroups competing against tasks.
*/ if (cgroup_has_tasks(cgrp)) return -EBUSY;
return 0;
}
/*
 * change the enabled child controllers for a cgroup in the default hierarchy
 *
 * @buf is parsed as a space separated list of subsystem names, each prefixed
 * with either + or -.
 *
 * Return: @nbytes if ret is 0 at the end, otherwise ret.
 *
 * NOTE(review): this copy of the body looks incomplete - "cgrp" is used
 * without a visible assignment, the while and do_each_subsys_mask blocks
 * opened by the parser are never visibly closed, and "enable", "disable"
 * and "child" are not used after their declaration.  Verify against the
 * upstream file before relying on this text.
 */
static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, char *buf, size_t nbytes,
loff_t off)
{
u16 enable = 0, disable = 0; struct cgroup *cgrp, *child; struct cgroup_subsys *ss; char *tok; int ssid, ret;
/*
 * Parse input - space separated list of subsystem names prefixed
 * with either + or -.
 */
buf = strstrip(buf); while ((tok = strsep(&buf, " "))) { if (tok[0] == '\0') continue;
/* tok + 1 skips the one-character prefix before comparing names */
do_each_subsys_mask(ss, ssid, ~cgrp_dfl_inhibit_ss_mask) { if (!cgroup_ssid_enabled(ssid) ||
strcmp(tok + 1, ss->name)) continue;
ret = cgroup_apply_control(cgrp);
/* finalize is called with the apply result whether it succeeded or not */
cgroup_finalize_control(cgrp, ret); if (ret) goto out_unlock;
kernfs_activate(cgrp->kn);
out_unlock:
cgroup_kn_unlock(of->kn); return ret ?: nbytes;
}
/** * cgroup_enable_threaded - make @cgrp threaded * @cgrp: the target cgroup * * Called when "threaded" is written to the cgroup.type interface file and * tries to make @cgrp threaded and join the parent's resource domain. * This function is never called on the root cgroup as cgroup.type doesn't * exist on it.
*/ staticint cgroup_enable_threaded(struct cgroup *cgrp)
{ struct cgroup *parent = cgroup_parent(cgrp); struct cgroup *dom_cgrp = parent->dom_cgrp; struct cgroup *dsct; struct cgroup_subsys_state *d_css; int ret;
lockdep_assert_held(&cgroup_mutex);
/* noop if already threaded */ if (cgroup_is_threaded(cgrp)) return 0;
/* * If @cgrp is populated or has domain controllers enabled, it * can't be switched. While the below cgroup_can_be_thread_root() * test can catch the same conditions, that's only when @parent is * not mixable, so let's check it explicitly.
*/ if (cgroup_is_populated(cgrp) ||
cgrp->subtree_control & ~cgrp_dfl_threaded_ss_mask) return -EOPNOTSUPP;
/* we're joining the parent's domain, ensure its validity */ if (!cgroup_is_valid_domain(dom_cgrp) ||
!cgroup_can_be_thread_root(dom_cgrp)) return -EOPNOTSUPP;
/* * The following shouldn't cause actual migrations and should * always succeed.
*/
cgroup_save_control(cgrp);
/* * Show the number of live and dying csses associated with each of * non-inhibited cgroup subsystems that is bound to cgroup v2. * * Without proper lock protection, racing is possible. So the * numbers may not be consistent when that happens.
*/
rcu_read_lock(); for (ssid = 0; ssid < CGROUP_SUBSYS_COUNT; ssid++) {
dying_cnt[ssid] = -1; if ((BIT(ssid) & cgrp_dfl_inhibit_ss_mask) ||
(cgroup_subsys[ssid]->root != &cgrp_dfl_root)) continue;
css = rcu_dereference_raw(cgroup->subsys[ssid]);
dying_cnt[ssid] = cgroup->nr_dying_subsys[ssid];
seq_printf(seq, "nr_subsys_%s %d\n", cgroup_subsys[ssid]->name,
css ? (css->nr_descendants + 1) : 0);
}
#ifdef CONFIG_CGROUP_SCHED /** * cgroup_tryget_css - try to get a cgroup's css for the specified subsystem * @cgrp: the cgroup of interest * @ss: the subsystem of interest * * Find and get @cgrp's css associated with @ss. If the css doesn't exist * or is offline, %NULL is returned.
*/ staticstruct cgroup_subsys_state *cgroup_tryget_css(struct cgroup *cgrp, struct cgroup_subsys *ss)
{ struct cgroup_subsys_state *css;
ret = kstrtoint(strstrip(buf), 0, &kill); if (ret) return ret;
if (kill != 1) return -ERANGE;
cgrp = cgroup_kn_lock_live(of->kn, false); if (!cgrp) return -ENOENT;
/* * Killing is a process directed operation, i.e. the whole thread-group * is taken down so act like we do for cgroup.procs and only make this * writable in non-threaded cgroups.
*/ if (cgroup_is_threaded(cgrp))
ret = -EOPNOTSUPP; else
cgroup_kill(cgrp);
/* * If namespaces are delegation boundaries, disallow writes to * files in a non-init namespace root from inside the namespace * except for the files explicitly marked delegatable - * e.g. cgroup.procs, cgroup.threads and cgroup.subtree_control.
*/ if ((cgrp->root->flags & CGRP_ROOT_NS_DELEGATE) &&
!(cft->flags & CFTYPE_NS_DELEGATABLE) &&
ctx->ns != &init_cgroup_ns && ctx->ns->root_cset->dfl_cgrp == cgrp) return -EPERM;
if (cft->write) return cft->write(of, buf, nbytes, off);
/* * kernfs guarantees that a file isn't deleted with operations in * flight, which means that the matching css is and stays alive and * doesn't need to be pinned. The RCU locking is not necessary * either. It's just for the convenience of using cgroup_css().
*/
rcu_read_lock();
css = cgroup_css(cgrp, cft->ss);
rcu_read_unlock();
if (cft->write_u64) { unsignedlonglong v;
ret = kstrtoull(buf, 0, &v); if (!ret)
ret = cft->write_u64(css, cft, v);
} elseif (cft->write_s64) { longlong v;
ret = kstrtoll(buf, 0, &v); if (!ret)
ret = cft->write_s64(css, cft, v);
} else {
ret = -EINVAL;
}
/** * cgroup_addrm_files - add or remove files to a cgroup directory * @css: the target css * @cgrp: the target cgroup (usually css->cgroup) * @cfts: array of cftypes to be added * @is_add: whether to add or remove * * Depending on @is_add, add or remove files defined by @cfts on @cgrp. * For removals, this function never fails.
*/ staticint cgroup_addrm_files(struct cgroup_subsys_state *css, struct cgroup *cgrp, struct cftype cfts[], bool is_add)
{ struct cftype *cft, *cft_end = NULL; int ret = 0;
lockdep_assert_held(&cgroup_mutex);
restart: for (cft = cfts; cft != cft_end && cft->name[0] != '\0'; cft++) { /* does cft->flags tell us to skip this file on @cgrp? */ if ((cft->flags & __CFTYPE_ONLY_ON_DFL) && !cgroup_on_dfl(cgrp)) continue; if ((cft->flags & __CFTYPE_NOT_ON_DFL) && cgroup_on_dfl(cgrp)) continue; if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgroup_parent(cgrp)) continue; if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgroup_parent(cgrp)) continue; if ((cft->flags & CFTYPE_DEBUG) && !cgroup_debug) continue; if (is_add) {
ret = cgroup_add_file(css, cgrp, cft); if (ret) {
pr_warn("%s: failed to add %s, err=%d\n",
__func__, cft->name, ret);
cft_end = cft;
is_add = false; goto restart;
}
} else {
cgroup_rm_file(cgrp, cft);
}
} return ret;
}
if (cft->flags & __CFTYPE_ADDED) {
ret = -EBUSY; break;
}
if (cft->seq_start)
kf_ops = &cgroup_kf_ops; else
kf_ops = &cgroup_kf_single_ops;
/* * Ugh... if @cft wants a custom max_write_len, we need to * make a copy of kf_ops to set its atomic_write_len.
*/ if (cft->max_write_len && cft->max_write_len != PAGE_SIZE) {
kf_ops = kmemdup(kf_ops, sizeof(*kf_ops), GFP_KERNEL); if (!kf_ops) {
ret = -ENOMEM; break;
}
kf_ops->atomic_write_len = cft->max_write_len;
}
/** * cgroup_rm_cftypes - remove an array of cftypes from a subsystem * @cfts: zero-length name terminated array of cftypes * * Unregister @cfts. Files described by @cfts are removed from all * existing cgroups and all future cgroups won't have them either. This * function can be called anytime whether @cfts' subsys is attached or not. * * Returns 0 on successful unregistration, -ENOENT if @cfts is not * registered.
*/ int cgroup_rm_cftypes(struct cftype *cfts)
{ if (!cfts || cfts[0].name[0] == '\0') return 0;
if (!(cfts[0].flags & __CFTYPE_ADDED)) return -ENOENT;
/**
 * cgroup_add_cftypes - add an array of cftypes to a subsystem
 * @ss: target cgroup subsystem
 * @cfts: zero-length name terminated array of cftypes
 *
 * Register @cfts with @ss: initialize the cftypes, link them on @ss->cfts
 * and apply them under cgroup_mutex.  If applying fails, the registration
 * is rolled back via cgroup_rm_cftypes_locked().
 *
 * Nothing is done, and 0 is returned, when @ss is disabled or @cfts is
 * NULL or empty.
 *
 * Returns 0 on success, -errno on failure.
 */
int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
{
	int err;

	/* disabled subsystem or nothing to register: trivially successful */
	if (!cgroup_ssid_enabled(ss->id) || !cfts || cfts[0].name[0] == '\0')
		return 0;

	err = cgroup_init_cftypes(ss, cfts);
	if (err)
		return err;

	cgroup_lock();

	list_add_tail(&cfts->node, &ss->cfts);
	err = cgroup_apply_cftypes(cfts, true);
	if (!err)
		goto unlock;

	/* applying failed, take the registration back out */
	cgroup_rm_cftypes_locked(cfts);
unlock:
	cgroup_unlock();
	return err;
}
/** * cgroup_add_dfl_cftypes - add an array of cftypes for default hierarchy * @ss: target cgroup subsystem * @cfts: zero-length name terminated array of cftypes * * Similar to cgroup_add_cftypes() but the added files are only used for * the default hierarchy.
*/ int cgroup_add_dfl_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
{ struct cftype *cft;
/** * cgroup_add_legacy_cftypes - add an array of cftypes for legacy hierarchies * @ss: target cgroup subsystem * @cfts: zero-length name terminated array of cftypes * * Similar to cgroup_add_cftypes() but the added files are only used for * the legacy hierarchies.
*/ int cgroup_add_legacy_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
{ struct cftype *cft;
/**
 * cgroup_file_notify - generate a file modified event for a cgroup_file
 * @cfile: target cgroup_file
 *
 * @cfile must have been obtained by setting cftype->file_offset.
 *
 * Notifications are rate limited: if the previous one was sent less than
 * CGROUP_FILE_NOTIFY_MIN_INTV jiffies ago, the notify timer is pulled in
 * to the end of that interval instead of notifying right away.  Nothing
 * happens when @cfile has no kernfs node attached.
 */
void cgroup_file_notify(struct cgroup_file *cfile)
{
	unsigned long irqflags;
	unsigned long prev, earliest;

	spin_lock_irqsave(&cgroup_file_kn_lock, irqflags);

	if (!cfile->kn)
		goto unlock;

	prev = cfile->notified_at;
	earliest = prev + CGROUP_FILE_NOTIFY_MIN_INTV;

	if (!time_in_range(jiffies, prev, earliest)) {
		/* outside the rate-limit window, notify immediately */
		kernfs_notify(cfile->kn);
		cfile->notified_at = jiffies;
	} else {
		/* too soon, defer to the timer */
		timer_reduce(&cfile->notify_timer, earliest);
	}

unlock:
	spin_unlock_irqrestore(&cgroup_file_kn_lock, irqflags);
}
/** * cgroup_file_show - show or hide a hidden cgroup file * @cfile: target cgroup_file obtained by setting cftype->file_offset * @show: whether to show or hide
*/ void cgroup_file_show(struct cgroup_file *cfile, bool show)
{ struct kernfs_node *kn;
/** * css_next_child - find the next child of a given css * @pos: the current position (%NULL to initiate traversal) * @parent: css whose children to walk * * This function returns the next child of @parent and should be called * under either cgroup_mutex or RCU read lock. The only requirement is * that @parent and @pos are accessible. The next sibling is guaranteed to * be returned regardless of their states. * * If a subsystem synchronizes ->css_online() and the start of iteration, a * css which finished ->css_online() is guaranteed to be visible in the * future iterations and will stay visible until the last reference is put. * A css which hasn't finished ->css_online() or already finished * ->css_offline() may show up during traversal. It's each subsystem's * responsibility to synchronize against on/offlining.
*/ struct cgroup_subsys_state *css_next_child(struct cgroup_subsys_state *pos, struct cgroup_subsys_state *parent)
{ struct cgroup_subsys_state *next;
cgroup_assert_mutex_or_rcu_locked();
/* * @pos could already have been unlinked from the sibling list. * Once a cgroup is removed, its ->sibling.next is no longer * updated when its next sibling changes. CSS_RELEASED is set when * @pos is taken off list, at which time its next pointer is valid, * and, as releases are serialized, the one pointed to by the next * pointer is guaranteed to not have started release yet. This * implies that if we observe !CSS_RELEASED on @pos in this RCU * critical section, the one pointed to by its next pointer is * guaranteed to not have finished its RCU grace period even if we * have dropped rcu_read_lock() in-between iterations. * * If @pos has CSS_RELEASED set, its next pointer can't be * dereferenced; however, as each css is given a monotonically * increasing unique serial number and always appended to the * sibling list, the next one can be found by walking the parent's * children until the first css with higher serial number than * @pos's. While this path can be slower, it happens iff iteration * races against release and the race window is very small.
*/ if (!pos) {
next = list_entry_rcu(parent->children.next, struct cgroup_subsys_state, sibling);
} elseif (likely(!(pos->flags & CSS_RELEASED))) {
next = list_entry_rcu(pos->sibling.next, struct cgroup_subsys_state, sibling);
} else {
list_for_each_entry_rcu(next, &parent->children, sibling,
lockdep_is_held(&cgroup_mutex)) if (next->serial_nr > pos->serial_nr) break;
}
/* * @next, if not pointing to the head, can be dereferenced and is * the next sibling.
*/ if (&next->sibling != &parent->children) return next; return NULL;
}
/** * css_next_descendant_pre - find the next descendant for pre-order walk * @pos: the current position (%NULL to initiate traversal) * @root: css whose descendants to walk * * To be used by css_for_each_descendant_pre(). Find the next descendant * to visit for pre-order traversal of @root's descendants. @root is * included in the iteration and the first node to be visited. * * While this function requires cgroup_mutex or RCU read locking, it * doesn't require the whole traversal to be contained in a single critical * section. Additionally, it isn't necessary to hold onto a reference to @pos. * This function will return the correct next descendant as long as both @pos * and @root are accessible and @pos is a descendant of @root. * * If a subsystem synchronizes ->css_online() and the start of iteration, a * css which finished ->css_online() is guaranteed to be visible in the * future iterations and will stay visible until the last reference is put. * A css which hasn't finished ->css_online() or already finished * ->css_offline() may show up during traversal. It's each subsystem's * responsibility to synchronize against on/offlining.
*/ struct cgroup_subsys_state *
css_next_descendant_pre(struct cgroup_subsys_state *pos, struct cgroup_subsys_state *root)
{ struct cgroup_subsys_state *next;
cgroup_assert_mutex_or_rcu_locked();
/* if first iteration, visit @root */ if (!pos) return root;
/* visit the first child if exists */
next = css_next_child(NULL, pos); if (next) return next;
/* no child, visit my or the closest ancestor's next sibling */ while (pos != root) {
next = css_next_child(pos, pos->parent); if (next) return next;
pos = pos->parent;
}
/**
 * css_rightmost_descendant - return the rightmost descendant of a css
 * @pos: css of interest
 *
 * Return the rightmost descendant of @pos.  If there's no descendant, @pos
 * is returned.  This can be used during pre-order traversal to skip
 * subtree of @pos.
 *
 * While this function requires cgroup_mutex or RCU read locking, it
 * doesn't require the whole traversal to be contained in a single critical
 * section. Additionally, it isn't necessary to hold onto a reference to @pos.
 * This function will return the correct rightmost descendant as long as @pos
 * is accessible.
 */
struct cgroup_subsys_state *
css_rightmost_descendant(struct cgroup_subsys_state *pos)
{
	struct cgroup_subsys_state *last, *tmp;

	cgroup_assert_mutex_or_rcu_locked();

	/* descend one level at a time, always into the last child */
	do {
		last = pos;
		/* ->prev isn't RCU safe, walk ->next till the end */
		pos = NULL;
		css_for_each_child(tmp, last)
			pos = tmp;
	} while (pos);

	/*
	 * @pos is NULL here; @last is the answer.  Continuing to walk from
	 * @pos would hand a NULL parent to css_next_child().
	 */
	return last;
}

/*
 * css_leftmost_descendant - return the leftmost descendant of a css
 * @pos: css of interest
 *
 * Follow first-child links from @pos until a css without children is
 * reached and return it.  If @pos has no children, @pos itself is
 * returned.  Uses css_next_child() and therefore needs cgroup_mutex or
 * the RCU read lock held by the caller.
 */
static struct cgroup_subsys_state *
css_leftmost_descendant(struct cgroup_subsys_state *pos)
{
	struct cgroup_subsys_state *last;

	do {
		last = pos;
		pos = css_next_child(NULL, pos);
	} while (pos);

	return last;
}
/**
 * css_next_descendant_post - find the next descendant for post-order walk
 * @pos: the current position (%NULL to initiate traversal)
 * @root: css whose descendants to walk
 *
 * To be used by css_for_each_descendant_post().  Find the next descendant
 * to visit for post-order traversal of @root's descendants.  @root is
 * included in the iteration and the last node to be visited.
 *
 * While this function requires cgroup_mutex or RCU read locking, it
 * doesn't require the whole traversal to be contained in a single critical
 * section.  Additionally, it isn't necessary to hold onto a reference to
 * @pos.  This function will return the correct next descendant as long as
 * both @pos and @root are accessible and @pos is a descendant of @root.
 *
 * If a subsystem synchronizes ->css_online() and the start of iteration, a
 * css which finished ->css_online() is guaranteed to be visible in the
 * future iterations and will stay visible until the last reference is put.
 * A css which hasn't finished ->css_online() or already finished
 * ->css_offline() may show up during traversal.  It's each subsystem's
 * responsibility to synchronize against on/offlining.
 */
struct cgroup_subsys_state *
css_next_descendant_post(struct cgroup_subsys_state *pos,
			 struct cgroup_subsys_state *root)
{
	struct cgroup_subsys_state *sibling;

	cgroup_assert_mutex_or_rcu_locked();

	/* first iteration: start at the leftmost descendant, maybe @root */
	if (!pos)
		return css_leftmost_descendant(root);

	/* @root is visited last, so there is nothing after it */
	if (pos == root)
		return NULL;

	/*
	 * An unvisited sibling means its subtree comes next, beginning at
	 * its leftmost descendant; otherwise all children of the parent
	 * are done and the parent itself is due.
	 */
	sibling = css_next_child(pos, pos->parent);

	return sibling ? css_leftmost_descendant(sibling) : pos->parent;
}
/**
 * css_has_online_children - does a css have online children
 * @css: the target css
 *
 * Scan the direct children of @css under the RCU read lock and report
 * whether at least one of them has CSS_ONLINE set.
 *
 * Returns %true if @css has any online children; otherwise, %false.  This
 * function can be called from any context but the caller is responsible
 * for synchronizing against on/offlining as necessary.
 */
bool css_has_online_children(struct cgroup_subsys_state *css)
{
	struct cgroup_subsys_state *kid;
	bool found = false;

	rcu_read_lock();
	css_for_each_child(kid, css) {
		found = (kid->flags & CSS_ONLINE) != 0;
		if (found)
			break;
	}
	rcu_read_unlock();

	return found;
}
/** * css_task_iter_advance_css_set - advance a task iterator to the next css_set * @it: the iterator to advance * * Advance @it to the next css_set to walk.
*/ staticvoid css_task_iter_advance_css_set(struct css_task_iter *it)
{ struct css_set *cset;
lockdep_assert_held(&css_set_lock);
/* Advance to the next non-empty css_set and find first non-empty tasks list*/ while ((cset = css_task_iter_next_css_set(it))) { if (!list_empty(&cset->tasks)) {
it->cur_tasks_head = &cset->tasks; break;
} elseif (!list_empty(&cset->mg_tasks)) {
it->cur_tasks_head = &cset->mg_tasks; break;
} elseif (!list_empty(&cset->dying_tasks)) {
it->cur_tasks_head = &cset->dying_tasks; break;
}
} if (!cset) {
it->task_pos = NULL; return;
}
it->task_pos = it->cur_tasks_head->next;
/* * We don't keep css_sets locked across iteration steps and thus * need to take steps to ensure that iteration can be resumed after * the lock is re-acquired. Iteration is performed at two levels - * css_sets and tasks in them. * * Once created, a css_set never leaves its cgroup lists, so a * pinned css_set is guaranteed to stay put and we can resume * iteration afterwards. * * Tasks may leave @cset across iteration steps. This is resolved * by registering each iterator with the css_set currently being * walked and making css_set_move_task() advance iterators whose * next task is leaving.
*/ if (it->cur_cset) {
list_del(&it->iters_node);
put_css_set_locked(it->cur_cset);
}
get_css_set(cset);
it->cur_cset = cset;
list_add(&it->iters_node, &cset->task_iters);
}
lockdep_assert_held(&css_set_lock);
repeat: if (it->task_pos) { /* * Advance iterator to find next entry. We go through cset * tasks, mg_tasks and dying_tasks, when consumed we move onto * the next cset.
*/ if (it->flags & CSS_TASK_ITER_SKIPPED)
it->flags &= ~CSS_TASK_ITER_SKIPPED; else
it->task_pos = it->task_pos->next;
if (it->task_pos == &it->cur_cset->tasks) {
it->cur_tasks_head = &it->cur_cset->mg_tasks;
it->task_pos = it->cur_tasks_head->next;
} if (it->task_pos == &it->cur_cset->mg_tasks) {
it->cur_tasks_head = &it->cur_cset->dying_tasks;
it->task_pos = it->cur_tasks_head->next;
} if (it->task_pos == &it->cur_cset->dying_tasks)
css_task_iter_advance_css_set(it);
} else { /* called from start, proceed to the first cset */
css_task_iter_advance_css_set(it);
}
if (it->flags & CSS_TASK_ITER_PROCS) { /* if PROCS, skip over tasks which aren't group leaders */ if (!thread_group_leader(task)) goto repeat;
/* and dying leaders w/o live member threads */ if (it->cur_tasks_head == &it->cur_cset->dying_tasks &&
!atomic_read(&task->signal->live)) goto repeat;
} else { /* skip all dying ones */ if (it->cur_tasks_head == &it->cur_cset->dying_tasks) goto repeat;
}
}
/** * css_task_iter_start - initiate task iteration * @css: the css to walk tasks of * @flags: CSS_TASK_ITER_* flags * @it: the task iterator to use * * Initiate iteration through the tasks of @css. The caller can call * css_task_iter_next() to walk through the tasks until the function * returns NULL. On completion of iteration, css_task_iter_end() must be * called.
*/ void css_task_iter_start(struct cgroup_subsys_state *css, unsignedint flags, struct css_task_iter *it)
{ unsignedlong irqflags;
/** * css_task_iter_next - return the next task for the iterator * @it: the task iterator being iterated * * The "next" function for task iteration. @it should have been * initialized via css_task_iter_start(). Returns NULL when the iteration * reaches the end.
*/ struct task_struct *css_task_iter_next(struct css_task_iter *it)
{ unsignedlong irqflags;
if (it->cur_task) {
put_task_struct(it->cur_task);
it->cur_task = NULL;
}
spin_lock_irqsave(&css_set_lock, irqflags);
/* @it may be half-advanced by skips, finish advancing */ if (it->flags & CSS_TASK_ITER_SKIPPED)
css_task_iter_advance(it);
/* * When a seq_file is seeked, it's always traversed sequentially * from position 0, so we can simply keep iterating on !0 *pos.
*/ if (!ctx->procs.started) { if (WARN_ON_ONCE((*pos))) return ERR_PTR(-EINVAL);
css_task_iter_start(&cgrp->self, iter_flags, it);
ctx->procs.started = true;
} elseif (!(*pos)) {
css_task_iter_end(it);
css_task_iter_start(&cgrp->self, iter_flags, it);
} else return it->cur_task;
/* * All processes of a threaded subtree belong to the domain cgroup * of the subtree. Only threads can be distributed across the * subtree. Reject reads on cgroup.procs in the subtree proper. * They're always empty anyway.
*/ if (cgroup_is_threaded(cgrp)) return ERR_PTR(-EOPNOTSUPP);
/* find the common ancestor */ while (!cgroup_is_descendant(dst_cgrp, com_cgrp))
com_cgrp = cgroup_parent(com_cgrp);
/* %current should be authorized to migrate to the common ancestor */
ret = cgroup_may_write(com_cgrp, sb); if (ret) return ret;
/* * If namespaces are delegation boundaries, %current must be able * to see both source and destination cgroups from its namespace.
*/ if ((cgrp_dfl_root.flags & CGRP_ROOT_NS_DELEGATE) &&
(!cgroup_is_descendant(src_cgrp, ns->root_cset->dfl_cgrp) ||
!cgroup_is_descendant(dst_cgrp, ns->root_cset->dfl_cgrp))) return -ENOENT;
return 0;
}
staticint cgroup_attach_permissions(struct cgroup *src_cgrp, struct cgroup *dst_cgrp, struct super_block *sb, bool threadgroup, struct cgroup_namespace *ns)
{ int ret = 0;
ret = cgroup_procs_write_permission(src_cgrp, dst_cgrp, sb, ns); if (ret) return ret;
ret = cgroup_migrate_vet_dst(dst_cgrp); if (ret) return ret;
if (!threadgroup && (src_cgrp->dom_cgrp != dst_cgrp->dom_cgrp))
ret = -EOPNOTSUPP;
/* * Process and thread migrations follow same delegation rule. Check * permissions using the credentials from file open to protect against * inherited fd attacks.
*/
saved_cred = override_creds(of->file->f_cred);
ret = cgroup_attach_permissions(src_cgrp, dst_cgrp,
of->file->f_path.dentry->d_sb,
threadgroup, ctx->ns);
revert_creds(saved_cred); if (ret) goto out_finish;
ret = cgroup_attach_task(dst_cgrp, task, threadgroup);
/* * css destruction is four-stage process. * * 1. Destruction starts. Killing of the percpu_ref is initiated. * Implemented in kill_css(). * * 2. When the percpu_ref is confirmed to be visible as killed on all CPUs * and thus css_tryget_online() is guaranteed to fail, the css can be * offlined by invoking offline_css(). After offlining, the base ref is * put. Implemented in css_killed_work_fn(). * * 3. When the percpu_ref reaches zero, the only possible remaining * accessors are inside RCU read sections. css_release() schedules the * RCU callback. * * 4. After the grace period, the css can be freed. Implemented in * css_free_rwork_fn(). * * It is actually hairier because both step 2 and 4 require process context * and thus involve punting to css->destroy_work adding two additional * steps to the already complex sequence.
*/ staticvoid css_free_rwork_fn(struct work_struct *work)
{ struct cgroup_subsys_state *css = container_of(to_rcu_work(work), struct cgroup_subsys_state, destroy_rwork); struct cgroup_subsys *ss = css->ss; struct cgroup *cgrp = css->cgroup;
if (parent)
css_put(parent);
} else { /* cgroup free path */
atomic_dec(&cgrp->root->nr_cgrps); if (!cgroup_on_dfl(cgrp))
cgroup1_pidlist_destroy_all(cgrp);
cancel_work_sync(&cgrp->release_agent_work);
bpf_cgrp_storage_free(cgrp);
if (cgroup_parent(cgrp)) { /* * We get a ref to the parent, and put the ref when * this cgroup is being freed, so it's guaranteed * that the parent won't be destroyed before its * children.
*/
cgroup_put(cgroup_parent(cgrp));
kernfs_put(cgrp->kn);
psi_cgroup_free(cgrp);
kfree(cgrp);
} else { /* * This is root cgroup's refcnt reaching zero, * which indicates that the root should be * released.
*/
cgroup_destroy_root(cgrp->root);
}
}
}
if (!css_is_self(css)) { struct cgroup *parent_cgrp;
css_rstat_flush(css);
cgroup_idr_replace(&ss->css_idr, NULL, css->id); if (ss->css_released)
ss->css_released(css);
cgrp->nr_dying_subsys[ss->id]--; /* * When a css is released and ready to be freed, its * nr_descendants must be zero. However, the corresponding * cgrp->nr_dying_subsys[ss->id] may not be 0 if a subsystem * is activated and deactivated multiple times with one or * more of its previous activation leaving behind dying csses.
*/
WARN_ON_ONCE(css->nr_descendants);
parent_cgrp = cgroup_parent(cgrp); while (parent_cgrp) {
parent_cgrp->nr_dying_subsys[ss->id]--;
parent_cgrp = cgroup_parent(parent_cgrp);
}
} else { struct cgroup *tcgrp;
/* * There are two control paths which try to determine * cgroup from dentry without going through kernfs - * cgroupstats_build() and css_tryget_online_from_dir(). * Those are supported by RCU protecting clearing of * cgrp->kn->priv backpointer.
*/ if (cgrp->kn)
RCU_INIT_POINTER(*(void __rcu __force **)&cgrp->kn->priv,
NULL);
}
if (cgroup_parent(cgrp)) {
css->parent = cgroup_css(cgroup_parent(cgrp), ss);
css_get(css->parent);
}
BUG_ON(cgroup_css(cgrp, ss));
}
/*
 * Invoke ->css_online() on a new CSS and mark it online if successful.
 *
 * On success CSS_ONLINE is set, @css is published in its cgroup's subsys[]
 * array and the online counts of @css and its parent as well as
 * nr_descendants of every ancestor are bumped.  A failure reported by
 * ->css_online() is returned as is and leaves @css untouched.
 *
 * Must be called with cgroup_mutex held.
 */
static int online_css(struct cgroup_subsys_state *css)
{
	struct cgroup_subsys *ss = css->ss;
	struct cgroup_subsys_state *anc;
	int ret;

	lockdep_assert_held(&cgroup_mutex);

	if (ss->css_online) {
		ret = ss->css_online(css);
		if (ret)
			return ret;
	}

	css->flags |= CSS_ONLINE;
	rcu_assign_pointer(css->cgroup->subsys[ss->id], css);

	atomic_inc(&css->online_cnt);
	if (css->parent) {
		atomic_inc(&css->parent->online_cnt);

		/* every ancestor gains one descendant */
		for (anc = css->parent; anc; anc = anc->parent)
			anc->nr_descendants++;
	}

	return 0;
}
/* if the CSS is online, invoke ->css_offline() on it and mark it offline */ staticvoid offline_css(struct cgroup_subsys_state *css)
{ struct cgroup_subsys *ss = css->ss;
css->cgroup->nr_dying_subsys[ss->id]++; /* * Parent css and cgroup cannot be freed until after the freeing * of child css, see css_free_rwork_fn().
*/ while ((css = css->parent)) {
css->nr_descendants--;
css->cgroup->nr_dying_subsys[ss->id]++;
}
}
/** * css_create - create a cgroup_subsys_state * @cgrp: the cgroup new css will be associated with * @ss: the subsys of new css * * Create a new css associated with @cgrp - @ss pair. On success, the new * css is online and installed in @cgrp. This function doesn't create the * interface files. Returns the new css on success, an ERR_PTR()-encoded * -errno on failure.
*/ staticstruct cgroup_subsys_state *css_create(struct cgroup *cgrp, struct cgroup_subsys *ss)
{ struct cgroup *parent = cgroup_parent(cgrp); struct cgroup_subsys_state *parent_css = cgroup_css(parent, ss); struct cgroup_subsys_state *css; int err;
lockdep_assert_held(&cgroup_mutex);
css = ss->css_alloc(parent_css); if (!css)
css = ERR_PTR(-ENOMEM); if (IS_ERR(css)) return css;
init_and_link_css(css, ss, cgrp);
err = percpu_ref_init(&css->refcnt, css_release, 0, GFP_KERNEL); if (err) goto err_free_css;
err = css_rstat_init(css); if (err) goto err_free_css;
/* @css is ready to be brought online now, make it visible */
list_add_tail_rcu(&css->sibling, &parent_css->children);
cgroup_idr_replace(&ss->css_idr, css, css->id);
err = online_css(css); if (err) goto err_list_del;
/* * The returned cgroup is fully initialized including its control mask, but * it doesn't have the control mask applied.
*/ staticstruct cgroup *cgroup_create(struct cgroup *parent, constchar *name,
umode_t mode)
{ struct cgroup_root *root = parent->root; struct cgroup *cgrp, *tcgrp; struct kernfs_node *kn; int i, level = parent->level + 1; int ret;
/* allocate the cgroup and its ID, 0 is reserved for the root */
cgrp = kzalloc(struct_size(cgrp, ancestors, (level + 1)), GFP_KERNEL); if (!cgrp) return ERR_PTR(-ENOMEM);
ret = percpu_ref_init(&cgrp->self.refcnt, css_release, 0, GFP_KERNEL); if (ret) goto out_free_cgrp;
/* create the directory */
kn = kernfs_create_dir_ns(parent->kn, name, mode,
current_fsuid(), current_fsgid(),
cgrp, NULL); if (IS_ERR(kn)) {
ret = PTR_ERR(kn); goto out_cancel_ref;
}
cgrp->kn = kn;
/* * Now that init_cgroup_housekeeping() has been called and cgrp->self * is setup, it is safe to perform rstat initialization on it.
*/
ret = css_rstat_init(&cgrp->self); if (ret) goto out_kernfs_remove;
ret = psi_cgroup_alloc(cgrp); if (ret) goto out_stat_exit;
/* * New cgroup inherits effective freeze counter, and * if the parent has to be frozen, the child has too.
*/
cgrp->freezer.e_freeze = parent->freezer.e_freeze; if (cgrp->freezer.e_freeze) { /* * Set the CGRP_FREEZE flag, so when a process will be * attached to the child cgroup, it will become frozen. * At this point the new cgroup is unpopulated, so we can * consider it frozen immediately.
*/
set_bit(CGRP_FREEZE, &cgrp->flags);
set_bit(CGRP_FROZEN, &cgrp->flags);
}
if (notify_on_release(parent))
set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &parent->flags))
set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags);
cgrp->self.serial_nr = css_serial_nr_next++;
ret = blocking_notifier_call_chain_robust(&cgroup_lifetime_notifier,
CGROUP_LIFETIME_ONLINE,
CGROUP_LIFETIME_OFFLINE, cgrp);
ret = notifier_to_errno(ret); if (ret) goto out_psi_free;
/* allocation complete, commit to creation */
spin_lock_irq(&css_set_lock); for (i = 0; i < level; i++) {
tcgrp = cgrp->ancestors[i];
tcgrp->nr_descendants++;
/* * If the new cgroup is frozen, all ancestor cgroups get a new * frozen descendant, but their state can't change because of * this.
*/ if (cgrp->freezer.e_freeze)
tcgrp->freezer.nr_frozen_descendants++;
}
spin_unlock_irq(&css_set_lock);
/* * On the default hierarchy, a child doesn't automatically inherit * subtree_control from the parent. Each is configured manually.
*/ if (!cgroup_on_dfl(cgrp))
cgrp->subtree_control = cgroup_control(cgrp);
/*
 * This is called when the refcnt of a css is confirmed to be killed.
 * css_tryget_online() is now guaranteed to fail.  Tell the subsystem to
 * initiate destruction and put the css ref from kill_css().
 *
 * Runs from the workqueue (queued by css_killed_ref_fn()).  Starting at
 * the killed css, each css is offlined and its reference dropped; the
 * walk continues to the parent for as long as dropping the child's
 * contribution brings the parent's online_cnt to zero.
 */
static void css_killed_work_fn(struct work_struct *work)
{
	struct cgroup_subsys_state *css =
		container_of(work, struct cgroup_subsys_state, destroy_work);

	cgroup_lock();

	do {
		offline_css(css);
		css_put(css);
		/* @css can't go away while we're holding cgroup_mutex */
		css = css->parent;
	} while (css && atomic_dec_and_test(&css->online_cnt));

	/* pairs with cgroup_lock() above; @css may be NULL from here on */
	cgroup_unlock();
}

/*
 * percpu_ref kill confirmation callback installed by kill_css().
 *
 * Offlining is done from css_killed_work_fn() on cgroup_offline_wq, so
 * the work is only queued here once @css's online_cnt drops to zero.
 */
static void css_killed_ref_fn(struct percpu_ref *ref)
{
	struct cgroup_subsys_state *css =
		container_of(ref, struct cgroup_subsys_state, refcnt);

	if (atomic_dec_and_test(&css->online_cnt)) {
		INIT_WORK(&css->destroy_work, css_killed_work_fn);
		queue_work(cgroup_offline_wq, &css->destroy_work);
	}
}
/** * kill_css - destroy a css * @css: css to destroy * * This function initiates destruction of @css by removing cgroup interface * files and putting its base reference. ->css_offline() will be invoked * asynchronously once css_tryget_online() is guaranteed to fail and when * the reference count reaches zero, @css will be released.
*/ staticvoid kill_css(struct cgroup_subsys_state *css)
{
lockdep_assert_held(&cgroup_mutex);
if (css->flags & CSS_DYING) return;
/* * Call css_killed(), if defined, before setting the CSS_DYING flag
*/ if (css->ss->css_killed)
css->ss->css_killed(css);
css->flags |= CSS_DYING;
/* * This must happen before css is disassociated with its cgroup. * See seq_css() for details.
*/
css_clear_dir(css);
/* * Killing would put the base ref, but we need to keep it alive * until after ->css_offline().
*/
css_get(css);
/* * cgroup core guarantees that, by the time ->css_offline() is * invoked, no new css reference will be given out via * css_tryget_online(). We can't simply call percpu_ref_kill() and * proceed to offlining css's because percpu_ref_kill() doesn't * guarantee that the ref is seen as killed on all CPUs on return. * * Use percpu_ref_kill_and_confirm() to get notifications as each * css is confirmed to be seen as killed on all CPUs.
*/
percpu_ref_kill_and_confirm(&css->refcnt, css_killed_ref_fn);
}
/** * cgroup_destroy_locked - the first stage of cgroup destruction * @cgrp: cgroup to be destroyed * * css's make use of percpu refcnts whose killing latency shouldn't be * exposed to userland and are RCU protected. Also, cgroup core needs to * guarantee that css_tryget_online() won't succeed by the time * ->css_offline() is invoked. To satisfy all the requirements, * destruction is implemented in the following two steps. * * s1. Verify @cgrp can be destroyed and mark it dying. Remove all * userland visible parts and start killing the percpu refcnts of * css's. Set up so that the next stage will be kicked off once all * the percpu refcnts are confirmed to be killed. * * s2. Invoke ->css_offline(), mark the cgroup dead and proceed with the * rest of destruction. Once all cgroup references are gone, the * cgroup is RCU-freed. * * This function implements s1. After this step, @cgrp is gone as far as * the userland is concerned and a new cgroup with the same name may be * created. As cgroup doesn't care about the names internally, this * doesn't cause any problem.
*/ staticint cgroup_destroy_locked(struct cgroup *cgrp)
__releases(&cgroup_mutex) __acquires(&cgroup_mutex)
{ struct cgroup *tcgrp, *parent = cgroup_parent(cgrp); struct cgroup_subsys_state *css; struct cgrp_cset_link *link; int ssid, ret;
lockdep_assert_held(&cgroup_mutex);
/* * Only migration can raise populated from zero and we're already * holding cgroup_mutex.
*/ if (cgroup_is_populated(cgrp)) return -EBUSY;
/* * Make sure there's no live children. We can't test emptiness of * ->self.children as dead children linger on it while being * drained; otherwise, "rmdir parent/child parent" may fail.
*/ if (css_has_online_children(&cgrp->self)) return -EBUSY;
/* * Mark @cgrp and the associated csets dead. The former prevents * further task migration and child creation by disabling * cgroup_kn_lock_live(). The latter makes the csets ignored by * the migration path.
*/
cgrp->self.flags &= ~CSS_ONLINE;
/* Create the root cgroup state for this subsystem */
ss->root = &cgrp_dfl_root;
css = ss->css_alloc(NULL); /* We don't handle early failures gracefully */
BUG_ON(IS_ERR(css));
init_and_link_css(css, ss, &cgrp_dfl_root.cgrp);
/* * Root csses are never destroyed and we can't initialize * percpu_ref during early init. Disable refcnting.
*/
css->flags |= CSS_NO_REF;
if (early) { /* allocation can't be done safely during early init */
css->id = 1;
} else {
css->id = cgroup_idr_alloc(&ss->css_idr, css, 1, 2, GFP_KERNEL);
BUG_ON(css->id < 0);
/* Update the init_css_set to contain a subsys * pointer to this state - since the subsystem is * newly registered, all tasks and hence the
* init_css_set is in the subsystem's root cgroup. */
init_css_set.subsys[ss->id] = css;
/* At system boot, before all subsystems have been * registered, no tasks have been forked, so we don't
* need to invoke fork callbacks here. */
BUG_ON(!list_empty(&init_task.tasks));
BUG_ON(online_css(css));
cgroup_unlock();
}
/** * cgroup_init_early - cgroup initialization at system boot * * Initialize cgroups at system boot, and initialize any * subsystems that request early init.
*/ int __init cgroup_init_early(void)
{ staticstruct cgroup_fs_context __initdata ctx; struct cgroup_subsys *ss; int i;
if (ss->early_init)
cgroup_init_subsys(ss, true);
} return 0;
}
/** * cgroup_init - cgroup initialization * * Register cgroup filesystem and /proc file, and initialize * any subsystems that didn't request early init.
*/ int __init cgroup_init(void)
{ struct cgroup_subsys *ss; int ssid;
/* * Add init_css_set to the hash table so that dfl_root can link to * it during init.
*/
hash_add(css_set_table, &init_css_set.hlist,
css_set_hash(init_css_set.subsys));
/* * Setting dfl_root subsys_mask needs to consider the * disabled flag and cftype registration needs kmalloc, * both of which aren't available during early_init.
*/ if (!cgroup_ssid_enabled(ssid)) continue;
if (cgroup1_ssid_disabled(ssid))
pr_info("Disabling %s control group subsystem in v1 mounts\n",
ss->legacy_name);
cgrp_dfl_root.subsys_mask |= 1 << ss->id;
/* implicit controllers must be threaded too */
WARN_ON(ss->implicit_on_dfl && !ss->threaded);
staticint __init cgroup_wq_init(void)
{ /* * There isn't much point in executing destruction path in * parallel. Good chunk is serialized with cgroup_mutex anyway. * Use 1 for @max_active. * * We would prefer to do this in cgroup_init() above, but that * is called before init_workqueues(): so leave this until after.
*/
cgroup_offline_wq = alloc_workqueue("cgroup_offline", 0, 1);
BUG_ON(!cgroup_offline_wq);
/*
 * __cgroup_get_from_id : get the cgroup associated with cgroup id
 * @id: cgroup id
 *
 * Looks up the kernfs node for @id under cgrp_dfl_root.kf_root and returns
 * ERR_PTR(-ENOENT) when no node is found or when the node is not a
 * directory.
 *
 * On success return the cgrp or ERR_PTR on failure.
 * There are no cgroup NS restrictions.
 */ struct cgroup *__cgroup_get_from_id(u64 id)
{ struct kernfs_node *kn; struct cgroup *cgrp;
kn = kernfs_find_and_get_node_by_id(cgrp_dfl_root.kf_root, id); if (!kn) return ERR_PTR(-ENOENT);
/* Only directory nodes are accepted; release the node before bailing out. */
if (kernfs_type(kn) != KERNFS_DIR) {
kernfs_put(kn); return ERR_PTR(-ENOENT);
}
/*
 * NOTE(review): nothing visible in this function assigns cgrp or releases
 * kn on this path; the step that derives cgrp from kn appears to be
 * missing here - confirm against the complete source.
 */
if (!cgrp) return ERR_PTR(-ENOENT); return cgrp;
}
/* * cgroup_get_from_id : get the cgroup associated with cgroup id * @id: cgroup id * On success return the cgrp or ERR_PTR on failure * Only cgroups within current task's cgroup NS are valid.
*/ struct cgroup *cgroup_get_from_id(u64 id)
{ struct cgroup *cgrp, *root_cgrp;
cgrp = __cgroup_get_from_id(id); if (IS_ERR(cgrp)) return cgrp;
/* * proc_cgroup_show() * - Print task's cgroup paths into seq_file, one line for each hierarchy * - Used for /proc/<pid>/cgroup.
*/ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *tsk)
{ char *buf; int retval; struct cgroup_root *root;
if (root == &cgrp_dfl_root && !READ_ONCE(cgrp_dfl_visible)) continue;
cgrp = task_cgroup_from_root(tsk, root); /* The root has already been unmounted. */ if (!cgrp) continue;
seq_printf(m, "%d:", root->hierarchy_id); if (root != &cgrp_dfl_root)
for_each_subsys(ss, ssid) if (root->subsys_mask & (1 << ssid))
seq_printf(m, "%s%s", count++ ? "," : "",
ss->legacy_name); if (strlen(root->name))
seq_printf(m, "%sname=%s", count ? "," : "",
root->name);
seq_putc(m, ':'); /* * On traditional hierarchies, all zombie tasks show up as * belonging to the root cgroup. On the default hierarchy, * while a zombie doesn't show up in "cgroup.procs" and * thus can't be migrated, its /proc/PID/cgroup keeps * reporting the cgroup it belonged to before exiting. If * the cgroup is removed before the zombie is reaped, * " (deleted)" is appended to the cgroup path.
*/ if (cgroup_on_dfl(cgrp) || !(tsk->flags & PF_EXITING)) {
retval = cgroup_path_ns_locked(cgrp, buf, PATH_MAX,
current->nsproxy->cgroup_ns); if (retval == -E2BIG)
retval = -ENAMETOOLONG; if (retval < 0) goto out_unlock;
/**
 * cgroup_fork - set up the cgroup fields of a task during copy_process()
 * @child: the task whose cgroup fields are being initialized
 *
 * Points @child at init_css_set and gives it an empty cg_list.  The task
 * stays associated with init_css_set until cgroup_post_fork() attaches it
 * to the target css_set.
 */
void cgroup_fork(struct task_struct *child)
{
	struct task_struct *tsk = child;

	/* start off unlinked from any css_set task list */
	INIT_LIST_HEAD(&tsk->cg_list);
	/* placeholder association until the real css_set is known */
	RCU_INIT_POINTER(tsk->cgroups, &init_css_set);
}
/** * cgroup_v1v2_get_from_file - get a cgroup pointer from a file pointer * @f: file corresponding to cgroup_dir * * Find the cgroup from a file pointer associated with a cgroup directory. * Returns a pointer to the cgroup on success. ERR_PTR is returned if the * cgroup cannot be found.
*/ staticstruct cgroup *cgroup_v1v2_get_from_file(struct file *f)
{ struct cgroup_subsys_state *css;
css = css_tryget_online_from_dir(f->f_path.dentry, NULL); if (IS_ERR(css)) return ERR_CAST(css);
return css->cgroup;
}
/** * cgroup_get_from_file - same as cgroup_v1v2_get_from_file, but only supports * cgroup2. * @f: file corresponding to cgroup2_dir
*/ staticstruct cgroup *cgroup_get_from_file(struct file *f)
{ struct cgroup *cgrp = cgroup_v1v2_get_from_file(f);
if (IS_ERR(cgrp)) return ERR_CAST(cgrp);
if (!cgroup_on_dfl(cgrp)) {
cgroup_put(cgrp); return ERR_PTR(-EBADF);
}
return cgrp;
}
/**
 * cgroup_css_set_fork - find or create a css_set for a child process
 * @kargs: the arguments passed to create the child process
 *
 * This function finds or creates a new css_set which the child
 * process will be attached to in cgroup_post_fork(). By default,
 * the child process will be given the same css_set as its parent.
 *
 * If CLONE_INTO_CGROUP is specified this function will try to find an
 * existing css_set which includes the requested cgroup and if not create
 * a new css_set that the child will be attached to later. If this function
 * succeeds it will hold cgroup_threadgroup_rwsem on return. If
 * CLONE_INTO_CGROUP is requested this function will grab cgroup mutex
 * before grabbing cgroup_threadgroup_rwsem and will hold a reference
 * to the target cgroup.
 *
 * Returns 0 on success or a negative errno: -EBADF for an empty fd,
 * -ENODEV for a dead target cgroup, -ENOMEM when no css_set can be found
 * or created, or whatever cgroup_get_from_file(), cgroup_may_write() or
 * cgroup_attach_permissions() reported.
 *
 * NOTE(review): as it stands here, @cset is never assigned before it is
 * used, and there is no return between the find_css_set() check and the
 * err: label, so every path runs the error unwinding.  Statements appear
 * to be missing from this copy of the function - confirm against the
 * complete source before relying on it.
 */
static int cgroup_css_set_fork(struct kernel_clone_args *kargs)
	__acquires(&cgroup_mutex) __acquires(&cgroup_threadgroup_rwsem)
{
	int ret;
	struct cgroup *dst_cgrp = NULL;
	struct css_set *cset;
	struct super_block *sb;

	if (kargs->flags & CLONE_INTO_CGROUP)
		cgroup_lock();

	CLASS(fd_raw, f)(kargs->cgroup);
	if (fd_empty(f)) {
		ret = -EBADF;
		goto err;
	}
	sb = fd_file(f)->f_path.dentry->d_sb;

	dst_cgrp = cgroup_get_from_file(fd_file(f));
	if (IS_ERR(dst_cgrp)) {
		ret = PTR_ERR(dst_cgrp);
		/* keep the err: path from calling cgroup_put() on an ERR_PTR */
		dst_cgrp = NULL;
		goto err;
	}

	if (cgroup_is_dead(dst_cgrp)) {
		ret = -ENODEV;
		goto err;
	}

	/*
	 * Verify that the target cgroup is writable for us. This is
	 * usually done by the vfs layer but since we're not going through
	 * the vfs layer here we need to do it "manually".
	 */
	ret = cgroup_may_write(dst_cgrp, sb);
	if (ret)
		goto err;

	/*
	 * Spawning a task directly into a cgroup works by passing a file
	 * descriptor to the target cgroup directory. This can even be an O_PATH
	 * file descriptor. But it can never be a cgroup.procs file descriptor.
	 * This was done on purpose so spawning into a cgroup could be
	 * conceptualized as an atomic
	 *
	 *   fd = openat(dfd_cgroup, "cgroup.procs", ...);
	 *   write(fd, <child-pid>, ...);
	 *
	 * sequence, i.e. it's a shorthand for the caller opening and writing
	 * cgroup.procs of the cgroup indicated by @dfd_cgroup. This allows us
	 * to always use the caller's credentials.
	 */
	ret = cgroup_attach_permissions(cset->dfl_cgrp, dst_cgrp, sb,
					!(kargs->flags & CLONE_THREAD),
					current->nsproxy->cgroup_ns);
	if (ret)
		goto err;

	kargs->cset = find_css_set(cset, dst_cgrp);
	if (!kargs->cset) {
		ret = -ENOMEM;
		goto err;
	}

err:
	cgroup_threadgroup_change_end(current);
	cgroup_unlock();
	if (dst_cgrp)
		cgroup_put(dst_cgrp);
	put_css_set(cset);
	if (kargs->cset)
		put_css_set(kargs->cset);
	return ret;
}
/**
 * cgroup_css_set_put_fork - drop references we took during fork
 * @kargs: the arguments passed to create the child process
 *
 * Ends the threadgroup change section for current, drops the reference to
 * the prepared css_set if there is one and, if CLONE_INTO_CGROUP was
 * requested, unlocks the cgroup lock and drops the reference to the target
 * cgroup.  @kargs->cset and @kargs->cgrp are cleared once released.
 */
static void cgroup_css_set_put_fork(struct kernel_clone_args *kargs)
	__releases(&cgroup_threadgroup_rwsem) __releases(&cgroup_mutex)
{
	struct cgroup *cgrp = kargs->cgrp;
	struct css_set *cset = kargs->cset;

	cgroup_threadgroup_change_end(current);

	if (cset) {
		put_css_set(cset);
		kargs->cset = NULL;
	}

	/* the cgroup lock and target cgroup are only held for CLONE_INTO_CGROUP */
	if (kargs->flags & CLONE_INTO_CGROUP) {
		cgroup_unlock();
		if (cgrp) {
			cgroup_put(cgrp);
			kargs->cgrp = NULL;
		}
	}
}
/**
 * cgroup_can_fork - called on a new task before the process is exposed
 * @child: the child process
 * @kargs: the arguments passed to create the child process
 *
 * Prepares the css_set the child will be attached to in cgroup_post_fork()
 * via cgroup_css_set_fork(), then asks every subsystem in
 * have_canfork_callback whether the fork may proceed.  The first ->can_fork()
 * that returns an error aborts the fork with that error code, which lets a
 * subsystem conditionally allow or deny new forks.
 *
 * On failure, ->cancel_fork() is invoked for the subsystems whose id is below
 * that of the refusing one and the prepared fork state is dropped again.
 *
 * Returns 0 on success, the failing error code otherwise.
 */
int cgroup_can_fork(struct task_struct *child, struct kernel_clone_args *kargs)
{
	struct cgroup_subsys *ss;
	int ssid, undo, err;

	err = cgroup_css_set_fork(kargs);
	if (err)
		return err;

	do_each_subsys_mask(ss, ssid, have_canfork_callback) {
		err = ss->can_fork(child, kargs->cset);
		if (err)
			goto out_revert;
	} while_each_subsys_mask();

	return 0;

out_revert:
	/* unwind only the subsystems that precede the one that refused */
	for_each_subsys(ss, undo) {
		if (undo >= ssid)
			break;
		if (!ss->cancel_fork)
			continue;
		ss->cancel_fork(child, kargs->cset);
	}

	cgroup_css_set_put_fork(kargs);

	return err;
}
/** * cgroup_cancel_fork - called if a fork failed after cgroup_can_fork() * @child: the child process * @kargs: the arguments passed to create the child process * * This calls the cancel_fork() callbacks if a fork failed *after* * cgroup_can_fork() succeeded and cleans up references we took to * prepare a new css_set for the child process in cgroup_can_fork().
*/ void cgroup_cancel_fork(struct task_struct *child, struct kernel_clone_args *kargs)
{ struct cgroup_subsys *ss; int i;
for_each_subsys(ss, i) if (ss->cancel_fork)
ss->cancel_fork(child, kargs->cset);
cgroup_css_set_put_fork(kargs);
}
/** * cgroup_post_fork - finalize cgroup setup for the child process * @child: the child process * @kargs: the arguments passed to create the child process * * Attach the child process to its css_set calling the subsystem fork() * callbacks.
*/ void cgroup_post_fork(struct task_struct *child, struct kernel_clone_args *kargs)
__releases(&cgroup_threadgroup_rwsem) __releases(&cgroup_mutex)
{ unsignedint cgrp_kill_seq = 0; unsignedlong cgrp_flags = 0; bool kill = false; struct cgroup_subsys *ss; struct css_set *cset; int i;
cset = kargs->cset;
kargs->cset = NULL;
spin_lock_irq(&css_set_lock);
/* init tasks are special, only link regular threads */ if (likely(child->pid)) { if (kargs->cgrp) {
cgrp_flags = kargs->cgrp->flags;
cgrp_kill_seq = kargs->cgrp->kill_seq;
} else {
cgrp_flags = cset->dfl_cgrp->flags;
cgrp_kill_seq = cset->dfl_cgrp->kill_seq;
}
if (!(child->flags & PF_KTHREAD)) { if (unlikely(test_bit(CGRP_FREEZE, &cgrp_flags))) { /* * If the cgroup has to be frozen, the new task has * too. Let's set the JOBCTL_TRAP_FREEZE jobctl bit to * get the task into the frozen state.
*/
spin_lock(&child->sighand->siglock);
WARN_ON_ONCE(child->frozen);
child->jobctl |= JOBCTL_TRAP_FREEZE;
spin_unlock(&child->sighand->siglock);
/* * Calling cgroup_update_frozen() isn't required here, * because it will be called anyway a bit later from * do_freezer_trap(). So we avoid cgroup's transient * switch from the frozen state and back.
*/
}
/* * If the cgroup is to be killed notice it now and take the * child down right after we finished preparing it for * userspace.
*/
kill = kargs->kill_seq != cgrp_kill_seq;
}
spin_unlock_irq(&css_set_lock);
/* * Call ss->fork(). This must happen after @child is linked on * css_set; otherwise, @child might change state between ->fork() * and addition to css_set.
*/
do_each_subsys_mask(ss, i, have_fork_callback) {
ss->fork(child);
} while_each_subsys_mask();
/* Make the new cset the root_cset of the new cgroup namespace. */ if (kargs->flags & CLONE_NEWCGROUP) { struct css_set *rcset = child->nsproxy->cgroup_ns->root_cset;
/**
 * css_tryget_online_from_dir - get corresponding css from a cgroup dentry
 * @dentry: directory dentry of interest
 * @ss: subsystem of interest
 *
 * If @dentry is a directory for a cgroup which has @ss enabled on it, try
 * to get the corresponding css and return it.
 *
 * Returns the css on success, ERR_PTR(-EBADF) if @dentry is not a directory
 * on a cgroup or cgroup2 filesystem, and ERR_PTR(-ENOENT) if such css
 * doesn't exist or can't be pinned.
 */
struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry,
							struct cgroup_subsys *ss)
{
	struct kernfs_node *kn = kernfs_node_from_dentry(dentry);
	struct file_system_type *fstype = dentry->d_sb->s_type;
	struct cgroup_subsys_state *css;
	struct cgroup *cgrp;

	/* @dentry must live on one of the cgroup filesystems ... */
	if (fstype != &cgroup_fs_type && fstype != &cgroup2_fs_type)
		return ERR_PTR(-EBADF);
	/* ... and be backed by a kernfs directory node */
	if (!kn || kernfs_type(kn) != KERNFS_DIR)
		return ERR_PTR(-EBADF);

	rcu_read_lock();

	/*
	 * This path doesn't originate from kernfs and @kn could already
	 * have been or be removed at any point.  @kn->priv is RCU
	 * protected for this access.  See css_release_work_fn() for details.
	 */
	cgrp = rcu_dereference(*(void __rcu __force **)&kn->priv);
	css = cgrp ? cgroup_css(cgrp, ss) : NULL;

	if (!css || !css_tryget_online(css))
		css = ERR_PTR(-ENOENT);

	rcu_read_unlock();
	return css;
}
/**
 * css_from_id - lookup css by id
 * @id: the cgroup id
 * @ss: cgroup subsys to be looked into
 *
 * Looks @id up in @ss->css_idr.  Returns the css if there's a valid one
 * with @id, otherwise returns NULL.  Should be called under
 * rcu_read_lock(); a one-time warning is emitted otherwise.
 */
struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss)
{
	struct cgroup_subsys_state *css;

	WARN_ON_ONCE(!rcu_read_lock_held());

	css = idr_find(&ss->css_idr, id);
	return css;
}
/** * cgroup_get_from_path - lookup and get a cgroup from its default hierarchy path * @path: path on the default hierarchy * * Find the cgroup at @path on the default hierarchy, increment its * reference count and return it. Returns pointer to the found cgroup on * success, ERR_PTR(-ENOENT) if @path doesn't exist or if the cgroup has already * been released and ERR_PTR(-ENOTDIR) if @path points to a non-directory.
*/ struct cgroup *cgroup_get_from_path(constchar *path)
{ struct kernfs_node *kn; struct cgroup *cgrp = ERR_PTR(-ENOENT); struct cgroup *root_cgrp;
/**
 * cgroup_v1v2_get_from_fd - get a cgroup pointer from a fd
 * @fd: fd obtained by open(cgroup_dir)
 *
 * Resolve @fd, which should have been obtained by opening a cgroup
 * directory, to its cgroup through cgroup_v1v2_get_from_file().
 *
 * Returns a pointer to the cgroup on success.  ERR_PTR(-EBADF) is returned
 * for an empty fd; otherwise an ERR_PTR is returned if the cgroup cannot be
 * found.
 */
struct cgroup *cgroup_v1v2_get_from_fd(int fd)
{
	CLASS(fd_raw, f)(fd);

	if (!fd_empty(f))
		return cgroup_v1v2_get_from_file(fd_file(f));

	return ERR_PTR(-EBADF);
}
/** * cgroup_get_from_fd - same as cgroup_v1v2_get_from_fd, but only supports * cgroup2. * @fd: fd obtained by open(cgroup2_dir)
*/ struct cgroup *cgroup_get_from_fd(int fd)
{ struct cgroup *cgrp = cgroup_v1v2_get_from_fd(fd);
/*
 * power_of_ten - compute 10 raised to @power
 * @power: exponent; zero or negative values yield 1
 *
 * Returns 10^@power as a u64.  The multiplication is unsigned, so a result
 * that does not fit in 64 bits (@power > 19) wraps around.
 */
static u64 power_of_ten(int power)
{
	u64 v = 1;

	/*
	 * Compare against zero explicitly: a bare "while (power--)" would
	 * iterate through the whole negative range for @power < 0.
	 */
	while (power-- > 0)
		v *= 10;
	return v;
}
/** * cgroup_parse_float - parse a floating number * @input: input string * @dec_shift: number of decimal digits to shift * @v: output * * Parse a decimal floating point number in @input and store the result in * @v with decimal point right shifted @dec_shift times. For example, if * @input is "12.3456" and @dec_shift is 3, *@v will be set to 12345. * Returns 0 on success, -errno otherwise. * * There's nothing cgroup specific about this function except that it's * currently the only user.
*/ int cgroup_parse_float(constchar *input, unsigned dec_shift, s64 *v)
{
s64 whole, frac = 0; int fstart = 0, fend = 0, flen;
if (!sscanf(input, "%lld.%n%lld%n", &whole, &fstart, &frac, &fend)) return -EINVAL; if (frac < 0) return -EINVAL;
/* * We might be cloning a socket which is left in an empty * cgroup and the cgroup might have already been rmdir'd. * Don't use cgroup_get_live().
*/
cgroup_get(cgrp);
cgroup_bpf_get(cgrp);
}
¤ Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereitgestellten Informationen zugesichert. 0.125 Bemerkung:
(vorverarbeitet am 2026-04-26)
¤
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereitgestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.