/* * Generic process-grouping system. * * Based originally on the cpuset system, extracted by Paul Menage * Copyright (C) 2006 Google, Inc * * Notifications support * Copyright (C) 2009 Nokia Corporation * Author: Kirill A. Shutemov * * Copyright notices from the original cpuset code: * -------------------------------------------------- * Copyright (C) 2003 BULL SA. * Copyright (C) 2004-2006 Silicon Graphics, Inc. * * Portions derived from Patrick Mochel's sysfs code. * sysfs is Copyright (c) 2001-3 Patrick Mochel * * 2003-10-10 Written by Simon Derr. * 2003-10-22 Updates by Stephen Hemminger. * 2004 May-July Rework by Paul Jackson. * --------------------------------------------------- * * This file is subject to the terms and conditions of the GNU General Public * License. See the file COPYING in the main directory of the Linux * distribution for more details.
*/
/*
 * Upper bound used for cgroup control file names: the sum of the two
 * name-length limits plus 2 (presumably room for a separator and the
 * terminating NUL - TODO confirm against the users of this macro).
 */
#define CGROUP_FILE_NAME_MAX		(MAX_CGROUP_TYPE_NAMELEN +	\
					 MAX_CFTYPE_NAME + 2)
/* let's not notify more than 100 times per second */
#define CGROUP_FILE_NOTIFY_MIN_INTV	DIV_ROUND_UP(HZ, 100)
/*
 * To avoid confusing the compiler (and generating warnings) with code
 * that attempts to access what would be a 0-element array (i.e. sized
 * to a potentially empty array when CGROUP_SUBSYS_COUNT == 0), this
 * constant expression can be added.
 *
 * Evaluates to non-zero iff CGROUP_SUBSYS_COUNT is greater than zero.
 */
#define CGROUP_HAS_SUBSYS_CONFIG (CGROUP_SUBSYS_COUNT > 0)
/* * cgroup_mutex is the master lock. Any modification to cgroup or its * hierarchy must be performed while holding it. * * css_set_lock protects task->cgroups pointer, the list of css_set * objects, and the chain of tasks off each css_set. * * These locks are exported if CONFIG_PROVE_RCU so that accessors in * cgroup.h can use them for lockdep annotations.
*/
/* The master lock and the css_set lock; both are defined in this file. */
DEFINE_MUTEX(cgroup_mutex);
DEFINE_SPINLOCK(css_set_lock);

/*
 * Export the locks only when lockdep / RCU proving is configured, so that
 * accessors outside this file can reference them in lockdep annotations.
 */
#if (defined CONFIG_PROVE_RCU || defined CONFIG_LOCKDEP)
EXPORT_SYMBOL_GPL(cgroup_mutex);
EXPORT_SYMBOL_GPL(css_set_lock);
#endif
/*
 * Protects cgroup_idr and css_idr so that IDs can be released without
 * grabbing cgroup_mutex.
 */
static DEFINE_SPINLOCK(cgroup_idr_lock);
/*
 * Protects cgroup_file->kn for !self csses.  It synchronizes notifications
 * against file removal/re-creation across css hiding.
 */
static DEFINE_SPINLOCK(cgroup_file_kn_lock);
/*
 * cgroup destruction makes heavy use of work items and there can be a lot
 * of concurrent destructions.  Use a separate workqueue so that cgroup
 * destruction work items don't end up filling up max_active of system_wq
 * which may lead to deadlock.
 *
 * A cgroup destruction should enqueue work sequentially to:
 * cgroup_offline_wq: use for css offline work
 * cgroup_release_wq: use for css release work
 * cgroup_free_wq: use for free work
 *
 * Rationale for using separate workqueues:
 * The cgroup root free work may depend on completion of other css offline
 * operations.  If all tasks were enqueued to a single workqueue, this could
 * create a deadlock scenario where:
 * - Free work waits for other css offline work to complete.
 * - But other css offline work is queued after free work in the same queue.
 *
 * Example deadlock scenario with single workqueue (cgroup_destroy_wq):
 * 1. umount net_prio
 * 2. net_prio root destruction enqueues work to cgroup_destroy_wq (CPUx)
 * 3. perf_event CSS A offline enqueues work to same cgroup_destroy_wq (CPUx)
 * 4. net_prio cgroup_destroy_root->cgroup_lock_and_drain_offline.
 * 5. net_prio root destruction blocks waiting for perf_event CSS A offline,
 *    which can never complete as it's behind in the same queue and
 *    workqueue's max_active is 1.
 */
static struct workqueue_struct *cgroup_offline_wq;
static struct workqueue_struct *cgroup_release_wq;
static struct workqueue_struct *cgroup_free_wq;
/*
 * The default hierarchy always exists but is hidden until mounted for the
 * first time.  This is for backward compatibility.
 */
bool cgrp_dfl_visible;
/* some controllers are not supported in the default hierarchy */
static u16 cgrp_dfl_inhibit_ss_mask;

/* some controllers are implicitly enabled on the default hierarchy */
static u16 cgrp_dfl_implicit_ss_mask;

/* some controllers can be threaded on the default hierarchy */
static u16 cgrp_dfl_threaded_ss_mask;
/* The list of hierarchy roots */
LIST_HEAD(cgroup_roots);
/* number of hierarchy roots; file-local counter kept next to the list */
static int cgroup_root_count;
/* hierarchy ID allocation and mapping, protected by cgroup_mutex */
static DEFINE_IDR(cgroup_hierarchy_idr);
/*
 * Assign a monotonically increasing serial number to csses.  It guarantees
 * cgroups with bigger numbers are newer than those with smaller numbers.
 * Also, as csses are always appended to the parent's ->children list, it
 * guarantees that sibling csses are always sorted in the ascending serial
 * number order on the list.  Protected by cgroup_mutex.
 *
 * The counter starts at 1.
 */
static u64 css_serial_nr_next = 1;
/*
 * These bitmasks identify subsystems with specific features to avoid
 * having to do iterative checks repeatedly.  One u16 mask per callback
 * kind (fork, exit, release, canfork).
 */
static u16 have_fork_callback __read_mostly;
static u16 have_exit_callback __read_mostly;
static u16 have_release_callback __read_mostly;
static u16 have_canfork_callback __read_mostly;
/** * cgroup_ssid_enabled - cgroup subsys enabled test by subsys ID * @ssid: subsys ID of interest * * cgroup_subsys_enabled() can only be used with literal subsys names which * is fine for individual subsystems but unsuitable for cgroup core. This * is slower static_key_enabled() based test indexed by @ssid.
*/ bool cgroup_ssid_enabled(int ssid)
{ if (!CGROUP_HAS_SUBSYS_CONFIG) returnfalse;
/** * cgroup_on_dfl - test whether a cgroup is on the default hierarchy * @cgrp: the cgroup of interest * * The default hierarchy is the v2 interface of cgroup and this function * can be used to test whether a cgroup is on the default hierarchy for * cases where a subsystem should behave differently depending on the * interface version. * * List of changed behaviors: * * - Mount options "noprefix", "xattr", "clone_children", "release_agent" * and "name" are disallowed. * * - When mounting an existing superblock, mount options should match. * * - rename(2) is disallowed. * * - "tasks" is removed. Everything should be at process granularity. Use * "cgroup.procs" instead. * * - "cgroup.procs" is not sorted. pids will be unique unless they got * recycled in-between reads. * * - "release_agent" and "notify_on_release" are removed. Replacement * notification mechanism will be implemented. * * - "cgroup.clone_children" is removed. * * - "cgroup.subtree_populated" is available. Its value is 0 if the cgroup * and its descendants contain no task; otherwise, 1. The file also * generates kernfs notification which can be monitored through poll and * [di]notify when the value of the file changes. * * - cpuset: tasks will be kept in empty cpusets when hotplug happens and * take masks of ancestors with non-empty cpus/mems, instead of being * moved to an ancestor. * * - cpuset: a task can be moved into an empty cpuset, and again it takes * masks of ancestors. * * - blkcg: blk-throttle becomes properly hierarchical.
*/ bool cgroup_on_dfl(conststruct cgroup *cgrp)
{ return cgrp->root == &cgrp_dfl_root;
}
/* IDR wrappers which synchronize using cgroup_idr_lock */ staticint cgroup_idr_alloc(struct idr *idr, void *ptr, int start, int end,
gfp_t gfp_mask)
{ int ret;
/* can @cgrp host both domain and threaded children? */
static bool cgroup_is_mixable(struct cgroup *cgrp)
{
	/*
	 * Root isn't under domain level resource control exempting it from
	 * the no-internal-process constraint, so it can serve as a thread
	 * root and a parent of resource domains at the same time.
	 *
	 * A cgroup without a parent is the root, hence the negated lookup.
	 */
	return !cgroup_parent(cgrp);
}
/* can @cgrp become a thread root? Should always be true for a thread root */
static bool cgroup_can_be_thread_root(struct cgroup *cgrp)
{
	/* mixables don't care */
	if (cgroup_is_mixable(cgrp))
		return true;

	/* domain roots can't be nested under threaded */
	if (cgroup_is_threaded(cgrp))
		return false;

	/* can only have either domain or threaded children */
	if (cgrp->nr_populated_domain_children)
		return false;

	/* and no domain controllers can be enabled */
	if (cgrp->subtree_control & ~cgrp_dfl_threaded_ss_mask)
		return false;

	return true;
}
/* is @cgrp root of a threaded subtree? */
static bool cgroup_is_thread_root(struct cgroup *cgrp)
{
	/* thread root should be a domain */
	if (cgroup_is_threaded(cgrp))
		return false;

	/* a domain w/ threaded children is a thread root */
	if (cgrp->nr_threaded_children)
		return true;

	/*
	 * A domain which has tasks and explicit threaded controllers
	 * enabled is a thread root.
	 */
	if (cgroup_has_tasks(cgrp) &&
	    (cgrp->subtree_control & cgrp_dfl_threaded_ss_mask))
		return true;

	return false;
}
/* a domain which isn't connected to the root w/o brekage can't be used */ staticbool cgroup_is_valid_domain(struct cgroup *cgrp)
{ /* the cgroup itself can be a thread root */ if (cgroup_is_threaded(cgrp)) returnfalse;
/* but the ancestors can't be unless mixable */ while ((cgrp = cgroup_parent(cgrp))) { if (!cgroup_is_mixable(cgrp) && cgroup_is_thread_root(cgrp)) returnfalse; if (cgroup_is_threaded(cgrp)) returnfalse;
}
if (parent) {
u16 ss_mask = parent->subtree_control;
/* threaded cgroups can only have threaded controllers */ if (cgroup_is_threaded(cgrp))
ss_mask &= cgrp_dfl_threaded_ss_mask; return ss_mask;
}
if (cgroup_on_dfl(cgrp))
root_ss_mask &= ~(cgrp_dfl_inhibit_ss_mask |
cgrp_dfl_implicit_ss_mask); return root_ss_mask;
}
/*
 * Subsystems enabled on a cgroup.
 *
 * A cgroup without a parent reports its root's subsys_mask.  Any other
 * cgroup reports the parent's subtree_ss_mask, narrowed to the threaded
 * controllers when the cgroup itself is threaded.
 */
static u16 cgroup_ss_mask(struct cgroup *cgrp)
{
	struct cgroup *up = cgroup_parent(cgrp);
	u16 enabled;

	if (!up)
		return cgrp->root->subsys_mask;

	enabled = up->subtree_ss_mask;

	/* threaded cgroups can only have threaded controllers */
	if (cgroup_is_threaded(cgrp))
		enabled &= cgrp_dfl_threaded_ss_mask;

	return enabled;
}
/** * cgroup_css - obtain a cgroup's css for the specified subsystem * @cgrp: the cgroup of interest * @ss: the subsystem of interest (%NULL returns @cgrp->self) * * Return @cgrp's css (cgroup_subsys_state) associated with @ss. This * function must be called either under cgroup_mutex or rcu_read_lock() and * the caller is responsible for pinning the returned css if it wants to * keep accessing it outside the said locks. This function may return * %NULL if @cgrp doesn't have @subsys_id enabled.
*/ staticstruct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp, struct cgroup_subsys *ss)
{ if (CGROUP_HAS_SUBSYS_CONFIG && ss) return rcu_dereference_check(cgrp->subsys[ss->id],
lockdep_is_held(&cgroup_mutex)); else return &cgrp->self;
}
/** * cgroup_e_css_by_mask - obtain a cgroup's effective css for the specified ss * @cgrp: the cgroup of interest * @ss: the subsystem of interest (%NULL returns @cgrp->self) * * Similar to cgroup_css() but returns the effective css, which is defined * as the matching css of the nearest ancestor including self which has @ss * enabled. If @ss is associated with the hierarchy @cgrp is on, this * function is guaranteed to return non-NULL css.
*/ staticstruct cgroup_subsys_state *cgroup_e_css_by_mask(struct cgroup *cgrp, struct cgroup_subsys *ss)
{
lockdep_assert_held(&cgroup_mutex);
if (!ss) return &cgrp->self;
/* * This function is used while updating css associations and thus * can't test the csses directly. Test ss_mask.
*/ while (!(cgroup_ss_mask(cgrp) & (1 << ss->id))) {
cgrp = cgroup_parent(cgrp); if (!cgrp) return NULL;
}
return cgroup_css(cgrp, ss);
}
/**
 * cgroup_e_css - obtain a cgroup's effective css for the specified subsystem
 * @cgrp: the cgroup of interest
 * @ss: the subsystem of interest
 *
 * Walk from @cgrp towards the root and return the first css that
 * cgroup_css() yields for @ss, i.e. the matching css of the nearest
 * ancestor (including @cgrp itself) which has @ss enabled.  If no cgroup
 * on that path has one, the css recorded for @ss in init_css_set is
 * returned instead.  When no subsystem is configured at all, %NULL is
 * returned.
 *
 * The returned css is not guaranteed to be online, and therefore it is the
 * caller's responsibility to try to get a reference for it.
 */
struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp,
					 struct cgroup_subsys *ss)
{
	if (!CGROUP_HAS_SUBSYS_CONFIG)
		return NULL;

	for (;;) {
		struct cgroup_subsys_state *found = cgroup_css(cgrp, ss);

		if (found)
			return found;

		/* nothing at this level, climb to the parent if there is one */
		cgrp = cgroup_parent(cgrp);
		if (!cgrp)
			break;
	}

	return init_css_set.subsys[ss->id];
}
/** * cgroup_get_e_css - get a cgroup's effective css for the specified subsystem * @cgrp: the cgroup of interest * @ss: the subsystem of interest * * Find and get the effective css of @cgrp for @ss. The effective css is * defined as the matching css of the nearest ancestor including self which * has @ss enabled. If @ss is not mounted on the hierarchy @cgrp is on, * the root css is returned, so this function always returns a valid css. * The returned css must be put using css_put().
*/ struct cgroup_subsys_state *cgroup_get_e_css(struct cgroup *cgrp, struct cgroup_subsys *ss)
{ struct cgroup_subsys_state *css;
if (!CGROUP_HAS_SUBSYS_CONFIG) return NULL;
rcu_read_lock();
do {
css = cgroup_css(cgrp, ss);
if (css && css_tryget_online(css)) goto out_unlock;
cgrp = cgroup_parent(cgrp);
} while (cgrp);
/** * __cgroup_task_count - count the number of tasks in a cgroup. The caller * is responsible for taking the css_set_lock. * @cgrp: the cgroup in question
*/ int __cgroup_task_count(conststruct cgroup *cgrp)
{ int count = 0; struct cgrp_cset_link *link;
/** * cgroup_task_count - count the number of tasks in a cgroup. * @cgrp: the cgroup in question
*/ int cgroup_task_count(conststruct cgroup *cgrp)
{ int count;
staticstruct cgroup *kn_priv(struct kernfs_node *kn)
{ struct kernfs_node *parent; /* * The parent can not be replaced due to KERNFS_ROOT_INVARIANT_PARENT. * Therefore it is always safe to dereference this pointer outside of a * RCU section.
*/
parent = rcu_dereference_check(kn->__parent,
kernfs_root_flags(kn) & KERNFS_ROOT_INVARIANT_PARENT); return parent->priv;
}
/* * This is open and unprotected implementation of cgroup_css(). * seq_css() is only called from a kernfs file operation which has * an active reference on the file. Because all the subsystem * files are drained before a css is disassociated with a cgroup, * the matching css from the cgroup's subsys table is guaranteed to * be and stay valid until the enclosing operation is complete.
*/ if (CGROUP_HAS_SUBSYS_CONFIG && cft->ss) return rcu_dereference_raw(cgrp->subsys[cft->ss->id]); else return &cgrp->self;
}
EXPORT_SYMBOL_GPL(of_css);
/**
 * for_each_css - iterate all css's of a cgroup
 * @css: the iteration cursor
 * @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end
 * @cgrp: the target cgroup to iterate css's of
 *
 * Should be called under cgroup_mutex.
 *
 * Slots whose css pointer is NULL are skipped: the empty "if" arm absorbs
 * them and the caller's statement hangs off the trailing "else".
 */
#define for_each_css(css, ssid, cgrp)					\
	for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++)	\
		if (!((css) = rcu_dereference_check(			\
				(cgrp)->subsys[(ssid)],			\
				lockdep_is_held(&cgroup_mutex)))) { }	\
		else
/**
 * do_each_subsys_mask - filter for_each_subsys with a bitmask
 * @ss: the iteration cursor
 * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end
 * @ss_mask: the bitmask
 *
 * The block will only run for cases where the ssid-th bit (1 << ssid) of
 * @ss_mask is set.
 *
 * This macro opens three scopes (the do-block, the bit loop and an inner
 * block) which must be closed by a matching while_each_subsys_mask().
 * When no subsystem is configured, @ssid is set to 0 and the body is
 * skipped entirely.
 */
#define do_each_subsys_mask(ss, ssid, ss_mask) do {			\
	unsigned long __ss_mask = (ss_mask);				\
	if (!CGROUP_HAS_SUBSYS_CONFIG) {				\
		(ssid) = 0;						\
		break;							\
	}								\
	for_each_set_bit(ssid, &__ss_mask, CGROUP_SUBSYS_COUNT) {	\
		(ss) = cgroup_subsys[ssid];				\
		{

/* closes the scopes opened by do_each_subsys_mask() */
#define while_each_subsys_mask()					\
		}							\
	}								\
} while (false)
/*
 * Iterate over child cgrps, lock should be held throughout iteration.
 * Children for which cgroup_is_dead() is true are skipped; cgroup_mutex
 * is asserted on every step.
 */
#define cgroup_for_each_live_child(child, cgrp)				\
	list_for_each_entry((child), &(cgrp)->self.children, self.sibling) \
		if (({ lockdep_assert_held(&cgroup_mutex);		\
		       cgroup_is_dead(child); }))			\
			;						\
		else
/*
 * Walk live descendants in pre order.  @dsct is set to the cgroup of each
 * visited css; dead cgroups are skipped.  cgroup_mutex is asserted on
 * every step.
 */
#define cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp)		\
	css_for_each_descendant_pre((d_css), cgroup_css((cgrp), NULL))	\
		if (({ lockdep_assert_held(&cgroup_mutex);		\
		       (dsct) = (d_css)->cgroup;			\
		       cgroup_is_dead(dsct); }))			\
			;						\
		else
/*
 * Walk live descendants in postorder.  @dsct is set to the cgroup of each
 * visited css; dead cgroups are skipped.  cgroup_mutex is asserted on
 * every step.
 */
#define cgroup_for_each_live_descendant_post(dsct, d_css, cgrp)		\
	css_for_each_descendant_post((d_css), cgroup_css((cgrp), NULL))	\
		if (({ lockdep_assert_held(&cgroup_mutex);		\
		       (dsct) = (d_css)->cgroup;			\
		       cgroup_is_dead(dsct); }))			\
			;						\
		else
/*
 * The default css_set - used by init and its children prior to any
 * hierarchies being mounted.  It contains a pointer to the root state
 * for each subsystem.  Also used to anchor the list of css_sets.  Not
 * reference-counted, to improve performance when child cgroups
 * haven't been created.
 *
 * Every list_head member below is initialized with LIST_HEAD_INIT on
 * itself (presumably the usual empty-list idiom - see list.h), and the
 * set is its own dom_cset.
 */
struct css_set init_css_set = {
	.refcount		= REFCOUNT_INIT(1),
	.dom_cset		= &init_css_set,
	.tasks			= LIST_HEAD_INIT(init_css_set.tasks),
	.mg_tasks		= LIST_HEAD_INIT(init_css_set.mg_tasks),
	.dying_tasks		= LIST_HEAD_INIT(init_css_set.dying_tasks),
	.task_iters		= LIST_HEAD_INIT(init_css_set.task_iters),
	.threaded_csets		= LIST_HEAD_INIT(init_css_set.threaded_csets),
	.cgrp_links		= LIST_HEAD_INIT(init_css_set.cgrp_links),
	.mg_src_preload_node	= LIST_HEAD_INIT(init_css_set.mg_src_preload_node),
	.mg_dst_preload_node	= LIST_HEAD_INIT(init_css_set.mg_dst_preload_node),
	.mg_node		= LIST_HEAD_INIT(init_css_set.mg_node),

	/*
	 * The following field is re-initialized when this cset gets linked
	 * in cgroup_init().  However, let's initialize the field
	 * statically too so that the default cgroup can be accessed safely
	 * early during boot.
	 */
	.dfl_cgrp		= &cgrp_dfl_root.cgrp,
};
/* number of css_sets in existence; starts at 1 to account for init_css_set */
static int css_set_count	= 1;	/* 1 for init_css_set */
/** * css_set_populated - does a css_set contain any tasks? * @cset: target css_set * * css_set_populated() should be the same as !!cset->nr_tasks at steady * state. However, css_set_populated() can be called while a task is being * added to or removed from the linked list before the nr_tasks is * properly updated. Hence, we can't just look at ->nr_tasks here.
*/ staticbool css_set_populated(struct css_set *cset)
{
lockdep_assert_held(&css_set_lock);
/**
 * cgroup_update_populated - update the populated count of a cgroup
 * @cgrp: the target cgroup
 * @populated: inc or dec populated count
 *
 * One of the css_sets associated with @cgrp is either getting its first
 * task or losing the last.  Update @cgrp->nr_populated_* accordingly.  The
 * count is propagated towards root so that a given cgroup's
 * nr_populated_children is zero iff none of its descendants contain any
 * tasks.
 *
 * @cgrp's interface file "cgroup.populated" is zero if both
 * @cgrp->nr_populated_csets and @cgrp->nr_populated_children are zero and
 * 1 otherwise.  When the sum changes from or to zero, userland is notified
 * that the content of the interface file has changed.  This can be used to
 * detect when @cgrp and its descendants become populated or empty.
 *
 * NOTE(review): the body as it stands here only adjusts the counters and
 * walks up; no notification call is visible - confirm whether one was
 * dropped from this function.
 *
 * Must be called with css_set_lock held.
 */
static void cgroup_update_populated(struct cgroup *cgrp, bool populated)
{
	struct cgroup *child = NULL;
	int adj = populated ? 1 : -1;

	lockdep_assert_held(&css_set_lock);

	do {
		bool was_populated = cgroup_is_populated(cgrp);

		if (!child) {
			/* first iteration: @cgrp itself gained/lost a cset */
			cgrp->nr_populated_csets += adj;
		} else {
			/* ancestor: account the child by its threaded-ness */
			if (cgroup_is_threaded(child))
				cgrp->nr_populated_threaded_children += adj;
			else
				cgrp->nr_populated_domain_children += adj;
		}

		/* stop once the populated state of this level is unchanged */
		if (was_populated == cgroup_is_populated(cgrp))
			break;

		child = cgrp;
		cgrp = cgroup_parent(cgrp);
	} while (cgrp);
}
/** * css_set_update_populated - update populated state of a css_set * @cset: target css_set * @populated: whether @cset is populated or depopulated * * @cset is either getting the first task or losing the last. Update the * populated counters of all associated cgroups accordingly.
*/ staticvoid css_set_update_populated(struct css_set *cset, bool populated)
{ struct cgrp_cset_link *link;
/* * @task is leaving, advance task iterators which are pointing to it so * that they can resume at the next position. Advancing an iterator might * remove it from the list, use safe walk. See css_task_iter_skip() for * details.
*/ staticvoid css_set_skip_task_iters(struct css_set *cset, struct task_struct *task)
{ struct css_task_iter *it, *pos;
/** * css_set_move_task - move a task from one css_set to another * @task: task being moved * @from_cset: css_set @task currently belongs to (may be NULL) * @to_cset: new css_set @task is being moved to (may be NULL) * @use_mg_tasks: move to @to_cset->mg_tasks instead of ->tasks * * Move @task from @from_cset to @to_cset. If @task didn't belong to any * css_set, @from_cset can be NULL. If @task is being disassociated * instead of moved, @to_cset can be NULL. * * This function automatically handles populated counter updates and * css_task_iter adjustments but the caller is responsible for managing * @from_cset and @to_cset's reference counts.
*/ staticvoid css_set_move_task(struct task_struct *task, struct css_set *from_cset, struct css_set *to_cset, bool use_mg_tasks)
{
lockdep_assert_held(&css_set_lock);
if (to_cset && !css_set_populated(to_cset))
css_set_update_populated(to_cset, true);
if (from_cset) {
WARN_ON_ONCE(list_empty(&task->cg_list));
if (to_cset) { /* * We are synchronized through cgroup_threadgroup_rwsem * against PF_EXITING setting such that we can't race * against cgroup_exit()/cgroup_free() dropping the css_set.
*/
WARN_ON_ONCE(task->flags & PF_EXITING);
/* * hash table for cgroup groups. This improves the performance to find * an existing css_set. This hash doesn't (currently) take into * account cgroups in empty hierarchies.
*/ #define CSS_SET_HASH_BITS 7 static DEFINE_HASHTABLE(css_set_table, CSS_SET_HASH_BITS);
if (!refcount_dec_and_test(&cset->refcount)) return;
WARN_ON_ONCE(!list_empty(&cset->threaded_csets));
/* This css_set is dead. Unlink it and release cgroup and css refs */
for_each_subsys(ss, ssid) {
list_del(&cset->e_cset_node[ssid]);
css_put(cset->subsys[ssid]);
}
hash_del(&cset->hlist);
css_set_count--;
if (css_set_threaded(cset)) {
list_del(&cset->threaded_csets_node);
put_css_set_locked(cset->dom_cset);
}
kfree_rcu(cset, rcu_head);
}
/**
 * compare_css_sets - helper function for find_existing_css_set().
 * @cset: candidate css_set being tested
 * @old_cset: existing css_set for a task
 * @new_cgrp: cgroup that's being entered by the task
 * @template: desired set of css pointers in css_set (pre-calculated)
 *
 * Returns true if "cset" matches "old_cset" except for the hierarchy
 * which "new_cgrp" belongs to, for which it should match "new_cgrp".
 */
static bool compare_css_sets(struct css_set *cset,
			     struct css_set *old_cset,
			     struct cgroup *new_cgrp,
			     struct cgroup_subsys_state *template[])
{
	struct cgroup *new_dfl_cgrp;
	struct list_head *l1, *l2;

	/*
	 * On the default hierarchy, there can be csets which are
	 * associated with the same set of cgroups but different csses.
	 * Let's first ensure that csses match.
	 */
	if (memcmp(template, cset->subsys, sizeof(cset->subsys)))
		return false;

	/* @cset's domain should match the default cgroup's */
	if (cgroup_on_dfl(new_cgrp))
		new_dfl_cgrp = new_cgrp;
	else
		new_dfl_cgrp = old_cset->dfl_cgrp;

	if (new_dfl_cgrp->dom_cgrp != cset->dom_cset->dfl_cgrp)
		return false;

	/*
	 * Compare cgroup pointers in order to distinguish between
	 * different cgroups in hierarchies.  As different cgroups may
	 * share the same effective css, this comparison is always
	 * necessary.
	 */
	l1 = &cset->cgrp_links;
	l2 = &old_cset->cgrp_links;
	while (1) {
		struct cgrp_cset_link *link1, *link2;
		struct cgroup *cgrp1, *cgrp2;

		l1 = l1->next;
		l2 = l2->next;
		/* See if we reached the end - both lists are equal length. */
		if (l1 == &cset->cgrp_links) {
			BUG_ON(l2 != &old_cset->cgrp_links);
			break;
		} else {
			BUG_ON(l2 == &old_cset->cgrp_links);
		}
		/* Locate the cgroups associated with these links. */
		link1 = list_entry(l1, struct cgrp_cset_link, cgrp_link);
		link2 = list_entry(l2, struct cgrp_cset_link, cgrp_link);
		cgrp1 = link1->cgrp;
		cgrp2 = link2->cgrp;
		/* Hierarchies should be linked in the same order. */
		BUG_ON(cgrp1->root != cgrp2->root);

		/*
		 * If this hierarchy is the hierarchy of the cgroup
		 * that's changing, then we need to check that this
		 * css_set points to the new cgroup; if it's any other
		 * hierarchy, then this css_set should point to the
		 * same cgroup as the old css_set.
		 */
		if (cgrp1->root == new_cgrp->root) {
			if (cgrp1 != new_cgrp)
				return false;
		} else {
			if (cgrp1 != cgrp2)
				return false;
		}
	}
	return true;
}
/** * find_existing_css_set - init css array and find the matching css_set * @old_cset: the css_set that we're using before the cgroup transition * @cgrp: the cgroup that we're moving into * @template: out param for the new set of csses, should be clear on entry
*/ staticstruct css_set *find_existing_css_set(struct css_set *old_cset, struct cgroup *cgrp, struct cgroup_subsys_state **template)
{ struct cgroup_root *root = cgrp->root; struct cgroup_subsys *ss; struct css_set *cset; unsignedlong key; int i;
/* * Build the set of subsystem state objects that we want to see in the * new css_set. While subsystems can change globally, the entries here * won't change, so no need for locking.
*/
for_each_subsys(ss, i) { if (root->subsys_mask & (1UL << i)) { /* * @ss is in this hierarchy, so we want the * effective css from @cgrp.
*/ template[i] = cgroup_e_css_by_mask(cgrp, ss);
} else { /* * @ss is not in this hierarchy, so we don't want * to change the css.
*/ template[i] = old_cset->subsys[i];
}
}
/**
 * allocate_cgrp_cset_links - allocate cgrp_cset_links
 * @count: the number of links to allocate
 * @tmp_links: list_head the allocated links are put on
 *
 * Allocate @count cgrp_cset_link structures and chain them on @tmp_links
 * through ->cset_link.  Returns 0 on success or -errno.
 *
 * @tmp_links is (re)initialized on entry.  If any allocation fails, the
 * list is handed to free_cgrp_cset_links() and -ENOMEM is returned.
 */
static int allocate_cgrp_cset_links(int count, struct list_head *tmp_links)
{
	struct cgrp_cset_link *link;
	int i;

	INIT_LIST_HEAD(tmp_links);

	for (i = 0; i < count; i++) {
		link = kzalloc(sizeof(*link), GFP_KERNEL);
		if (!link) {
			free_cgrp_cset_links(tmp_links);
			return -ENOMEM;
		}
		list_add(&link->cset_link, tmp_links);
	}
	return 0;
}
/** * link_css_set - a helper function to link a css_set to a cgroup * @tmp_links: cgrp_cset_link objects allocated by allocate_cgrp_cset_links() * @cset: the css_set to be linked * @cgrp: the destination cgroup
*/ staticvoid link_css_set(struct list_head *tmp_links, struct css_set *cset, struct cgroup *cgrp)
{ struct cgrp_cset_link *link;
/* * Always add links to the tail of the lists so that the lists are * in chronological order.
*/
list_move_tail(&link->cset_link, &cgrp->cset_links);
list_add_tail(&link->cgrp_link, &cset->cgrp_links);
if (cgroup_parent(cgrp))
cgroup_get_live(cgrp);
}
/** * find_css_set - return a new css_set with one cgroup updated * @old_cset: the baseline css_set * @cgrp: the cgroup to be updated * * Return a new css_set that's equivalent to @old_cset, but with @cgrp * substituted into the appropriate hierarchy.
*/ staticstruct css_set *find_css_set(struct css_set *old_cset, struct cgroup *cgrp)
{ struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT] = { }; struct css_set *cset; struct list_head tmp_links; struct cgrp_cset_link *link; struct cgroup_subsys *ss; unsignedlong key; int ssid;
lockdep_assert_held(&cgroup_mutex);
/* First see if we already have a cgroup group that matches
* the desired set */
spin_lock_irq(&css_set_lock);
cset = find_existing_css_set(old_cset, cgrp, template); if (cset)
get_css_set(cset);
spin_unlock_irq(&css_set_lock);
if (cset) return cset;
cset = kzalloc(sizeof(*cset), GFP_KERNEL); if (!cset) return NULL;
/* Allocate all the cgrp_cset_link objects that we'll need */ if (allocate_cgrp_cset_links(cgroup_root_count, &tmp_links) < 0) {
kfree(cset); return NULL;
}
/* * If @cset should be threaded, look up the matching dom_cset and * link them up. We first fully initialize @cset then look for the * dom_cset. It's simpler this way and safe as @cset is guaranteed * to stay empty until we return.
*/ if (cgroup_is_threaded(cset->dfl_cgrp)) { struct css_set *dcset;
/* * Returned cgroup is without refcount but it's valid as long as cset pins it.
*/ staticinlinestruct cgroup *__cset_cgroup_from_root(struct css_set *cset, struct cgroup_root *root)
{ struct cgroup *res_cgroup = NULL;
/* * If cgroup_mutex is not held, the cgrp_cset_link will be freed * before we remove the cgroup root from the root_list. Consequently, * when accessing a cgroup root, the cset_link may have already been * freed, resulting in a NULL res_cgroup. However, by holding the * cgroup_mutex, we ensure that res_cgroup can't be NULL. * If we don't hold cgroup_mutex in the caller, we must do the NULL * check.
*/ return res_cgroup;
}
/* * look up cgroup associated with current task's cgroup namespace on the * specified hierarchy
*/ staticstruct cgroup *
current_cgns_cgroup_from_root(struct cgroup_root *root)
{ struct cgroup *res = NULL; struct css_set *cset;
lockdep_assert_held(&css_set_lock);
rcu_read_lock();
cset = current->nsproxy->cgroup_ns->root_cset;
res = __cset_cgroup_from_root(cset, root);
rcu_read_unlock();
/* * The namespace_sem is held by current, so the root cgroup can't * be umounted. Therefore, we can ensure that the res is non-NULL.
*/
WARN_ON_ONCE(!res); return res;
}
/* * Look up cgroup associated with current task's cgroup namespace on the default * hierarchy. * * Unlike current_cgns_cgroup_from_root(), this doesn't need locks: * - Internal rcu_read_lock is unnecessary because we don't dereference any rcu * pointers. * - css_set_lock is not needed because we just read cset->dfl_cgrp. * - As a bonus returned cgrp is pinned with the current because it cannot * switch cgroup_ns asynchronously.
*/ staticstruct cgroup *current_cgns_cgroup_dfl(void)
{ struct css_set *cset;
if (current->nsproxy) {
cset = current->nsproxy->cgroup_ns->root_cset; return __cset_cgroup_from_root(cset, &cgrp_dfl_root);
} else { /* * NOTE: This function may be called from bpf_cgroup_from_id() * on a task which has already passed exit_task_namespaces() and * nsproxy == NULL. Fall back to cgrp_dfl_root which will make all * cgroups visible for lookups.
*/ return &cgrp_dfl_root.cgrp;
}
}
/* look up cgroup associated with given css_set on the specified hierarchy */ staticstruct cgroup *cset_cgroup_from_root(struct css_set *cset, struct cgroup_root *root)
{
lockdep_assert_held(&css_set_lock);
return __cset_cgroup_from_root(cset, root);
}
/*
 * Return the cgroup that "task" belongs to on the hierarchy given by
 * "root".  The caller must hold css_set_lock so that the task's groups
 * cannot be modified, and must additionally hold either cgroup_mutex or
 * the rcu read lock so that the cgroup root cannot be destroyed.
 */
struct cgroup *task_cgroup_from_root(struct task_struct *task,
				     struct cgroup_root *root)
{
	/*
	 * The task itself needs no locking: with css_set_lock held it
	 * cannot change groups underneath us.
	 */
	struct css_set *task_cset = task_css_set(task);

	return cset_cgroup_from_root(task_cset, root);
}
/*
 * A task must hold cgroup_mutex to modify cgroups.
 *
 * Any task can increment and decrement the count field without lock.
 * So in general, code holding cgroup_mutex can't rely on the count
 * field not changing.  However, if the count goes to zero, then only
 * cgroup_attach_task() can increment it again.  A count of zero
 * means that no tasks are currently attached, so there is no
 * way a task attached to that cgroup can fork (the other way to
 * increment the count).  So code holding cgroup_mutex can safely
 * assume that if the count is zero, it will stay zero.  Similarly, if
 * a task holds cgroup_mutex on a cgroup with zero count, it
 * knows that the cgroup won't be removed, as cgroup_rmdir()
 * needs that mutex.
 *
 * A cgroup can only be deleted if both its 'count' of using tasks
 * is zero, and its list of 'children' cgroups is empty.  Since all
 * tasks in the system use _some_ cgroup, and since there is always at
 * least one task in the system (init, pid == 1), the root cgroup
 * always has either children cgroups and/or using tasks.  So we don't
 * need a special hack to ensure that root cgroup cannot be deleted.
 *
 * P.S.  One more locking exception.  RCU is used to guard the
 * update of a task's cgroup pointer by cgroup_attach_task().
 */
/** * cgroup_file_mode - deduce file mode of a control file * @cft: the control file in question * * S_IRUGO for read, S_IWUSR for write.
*/ static umode_t cgroup_file_mode(conststruct cftype *cft)
{
umode_t mode = 0;
if (cft->read_u64 || cft->read_s64 || cft->seq_show)
mode |= S_IRUGO;
if (cft->write_u64 || cft->write_s64 || cft->write) { if (cft->flags & CFTYPE_WORLD_WRITABLE)
mode |= S_IWUGO; else
mode |= S_IWUSR;
}
return mode;
}
/**
 * cgroup_calc_subtree_ss_mask - calculate subtree_ss_mask
 * @subtree_control: the new subtree_control mask to consider
 * @this_ss_mask: available subsystems
 *
 * On the default hierarchy, a subsystem may request other subsystems to be
 * enabled together through its ->depends_on mask.  In such cases, more
 * subsystems than specified in "cgroup.subtree_control" may be enabled.
 *
 * This function calculates which subsystems need to be enabled if
 * @subtree_control is to be applied while restricted to @this_ss_mask.
 *
 * Return: the final value of cur_ss_mask, which starts out as
 * @subtree_control.
 */ static u16 cgroup_calc_subtree_ss_mask(u16 subtree_control, u16 this_ss_mask)
{
u16 cur_ss_mask = subtree_control; struct cgroup_subsys *ss; int ssid;
/*
 * NOTE(review): new_ss_mask is used below without a visible declaration,
 * and the "break" and the first closing brace below have no visible
 * enclosing loop; ss and ssid are declared but not used in what is
 * visible.  Presumably a loop that derives new_ss_mask from cur_ss_mask
 * and ->depends_on belongs here -- TODO confirm against the complete
 * source.
 */
/*
 * Mask out subsystems which aren't available.  This can
 * happen only if some depended-upon subsystems were bound
 * to non-default hierarchies.
 */
new_ss_mask &= this_ss_mask;
/* the new mask equals the current one: nothing changed, so stop */
if (new_ss_mask == cur_ss_mask) break;
cur_ss_mask = new_ss_mask;
}
return cur_ss_mask;
}
/**
 * cgroup_kn_unlock - unlocking helper for cgroup kernfs methods
 * @kn: the kernfs_node being serviced
 *
 * This helper undoes cgroup_kn_lock_live() and should be invoked before
 * the method finishes if locking succeeded.  Note that once this function
 * returns, the cgroup returned by cgroup_kn_lock_live() may become
 * inaccessible at any time.  If the caller intends to continue to access
 * the cgroup, it should pin it before invoking this function.
 */ void cgroup_kn_unlock(struct kernfs_node *kn)
{ struct cgroup *cgrp;
/*
 * NOTE(review): the remainder of this function body, including its
 * closing brace, is not present at this point in the file -- confirm
 * against the complete source.
 */
/**
 * cgroup_kn_lock_live - locking helper for cgroup kernfs methods
 * @kn: the kernfs_node being serviced
 * @drain_offline: perform offline draining on the cgroup
 *
 * This helper is to be used by a cgroup kernfs method currently servicing
 * @kn.  It breaks the active protection, performs cgroup locking and
 * verifies that the associated cgroup is alive.  If @drain_offline is
 * %true, the cgroup is drained of offlining csses before return.
 *
 * Any cgroup kernfs method implementation which requires locking the
 * associated cgroup should use this helper.  It avoids nesting cgroup
 * locking under kernfs active protection and allows all kernfs operations
 * including self-removal.
 *
 * Return: the cgroup if it is alive; otherwise, %NULL.  A successful
 * return should be undone by a matching cgroup_kn_unlock() invocation.
 */ struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn, bool drain_offline)
{ struct cgroup *cgrp;
/*
 * NOTE(review): the remainder of this function body, including its
 * closing brace, is not present at this point in the file -- confirm
 * against the complete source.
 */
/*
 * NOTE(review): the header of the enclosing function and the declarations
 * of ss, ssid, ss_mask, dst_root and dfl_disable_ss_mask are not visible
 * at this point in the file -- confirm against the complete source.
 *
 * First pass over the subsystems selected by ss_mask: reject the move
 * with -EBUSY where it is not allowed, and record which subsystems are
 * currently on the default hierarchy.
 */
do_each_subsys_mask(ss, ssid, ss_mask) { /*
 * If @ss has non-root csses attached to it, it can't be moved.
 * If @ss is an implicit controller, it is exempt from this
 * rule and can be stolen.
 */ if (css_next_child(NULL, cgroup_css(&ss->root->cgrp, ss)) &&
!ss->implicit_on_dfl) return -EBUSY;
/* can't move between two non-default roots either */ if (ss->root != &cgrp_dfl_root && dst_root != &cgrp_dfl_root) return -EBUSY;
/*
 * Collect the ssids that need to be disabled on the default
 * hierarchy.
 */ if (ss->root == &cgrp_dfl_root)
dfl_disable_ss_mask |= 1 << ssid;
} while_each_subsys_mask();
if (dfl_disable_ss_mask) { struct cgroup *scgrp = &cgrp_dfl_root.cgrp;
/*
 * Controllers from the default hierarchy that need to be rebound
 * are all disabled together in one go.
 */
cgrp_dfl_root.subsys_mask &= ~dfl_disable_ss_mask;
WARN_ON(cgroup_apply_control(scgrp));
cgroup_finalize_control(scgrp, 0);
}
/*
 * NOTE(review): the statements below use css, dcgrp, cset, cset_pos and
 * it without visible declarations, and use scgrp outside the block that
 * declares it above; an enclosing per-subsystem loop presumably goes
 * missing here -- TODO confirm against the complete source.
 */
spin_lock_irq(&css_set_lock);
css->cgroup = dcgrp;
/* the destination list is expected to be empty before the move */
WARN_ON(!list_empty(&dcgrp->e_csets[ss->id]));
list_for_each_entry_safe(cset, cset_pos, &scgrp->e_csets[ss->id],
e_cset_node[ss->id]) {
list_move_tail(&cset->e_cset_node[ss->id],
&dcgrp->e_csets[ss->id]); /*
 * All css_sets of scgrp are moved to dcgrp together, in the
 * same order, so patch in-flight iterators to preserve correct
 * iteration.  Since an iterator is always advanced right away
 * and finishes when it->cset_pos meets it->cset_head, updating
 * it->cset_head alone is enough here.
 */
list_for_each_entry(it, &cset->task_iters, iters_node) if (it->cset_head == &scgrp->e_csets[ss->id])
it->cset_head = &dcgrp->e_csets[ss->id];
}
spin_unlock_irq(&css_set_lock);
/*
 * NOTE(review): the header and the start of the enclosing function are
 * not visible at this point in the file -- confirm against the complete
 * source.
 *
 * Take the root flags from @ctx with CGRP_ROOT_FAVOR_DYNMODS masked out:
 * DYNMODS must be modified through cgroup_favor_dynmods().
 */
root->flags = ctx->flags & ~CGRP_ROOT_FAVOR_DYNMODS; if (ctx->release_agent)
/* optional release agent path, copied with a PATH_MAX bound */
strscpy(root->release_agent_path, ctx->release_agent, PATH_MAX); if (ctx->name)
/* optional root name, copied with a MAX_CGROUP_ROOT_NAMELEN bound */
strscpy(root->name, ctx->name, MAX_CGROUP_ROOT_NAMELEN); if (ctx->cpuset_clone_children)
/* carry the clone_children request over to the root cgroup's flags */
set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags);
}
/**
 * cgroup_setup_root - set up a cgroup root and bind subsystems to it
 * @root: the cgroup_root being set up
 * @ss_mask: subsystem mask handed to rebind_subsystems() for @root
 *
 * Must be called with cgroup_mutex held (checked with
 * lockdep_assert_held()).  The steps visible here, in order:
 * percpu_ref_init() on the root cgroup's refcount,
 * allocate_cgrp_cset_links(), cgroup_init_root_id(), css_populate_dir(),
 * css_rstat_init(), rebind_subsystems(), the CGROUP_LIFETIME_ONLINE
 * notifier chain, adding @root to cgroup_roots, and linking the root
 * cgroup into every css_set in css_set_table.
 *
 * NOTE(review): the out, cancel_ref, destroy_root and exit_stats labels
 * targeted by the gotos below, the return statement and the closing brace
 * are not visible at this point in the file -- confirm against the
 * complete source.
 */
int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask)
{
LIST_HEAD(tmp_links); struct cgroup *root_cgrp = &root->cgrp; struct kernfs_syscall_ops *kf_sops; struct css_set *cset; int i, ret;
lockdep_assert_held(&cgroup_mutex);
ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release,
0, GFP_KERNEL); if (ret) goto out;
/*
 * We're accessing css_set_count without locking css_set_lock here,
 * but that's OK - it can only be increased by someone holding
 * cgroup_lock, and that's us.  Later rebinding may disable
 * controllers on the default hierarchy and thus create new csets,
 * which can't be more than the existing ones.  Allocate 2x.
 */
ret = allocate_cgrp_cset_links(2 * css_set_count, &tmp_links); if (ret) goto cancel_ref;
ret = cgroup_init_root_id(root); if (ret) goto cancel_ref;
ret = css_populate_dir(&root_cgrp->self); if (ret) goto destroy_root;
ret = css_rstat_init(&root_cgrp->self); if (ret) goto destroy_root;
ret = rebind_subsystems(root, ss_mask); if (ret) goto exit_stats;
/*
 * Run the CGROUP_LIFETIME_ONLINE notifier chain for the root cgroup.
 * An error from the chain only triggers a one-time warning here; it
 * does not take any of the failure paths.
 */
ret = blocking_notifier_call_chain(&cgroup_lifetime_notifier,
CGROUP_LIFETIME_ONLINE, root_cgrp);
WARN_ON_ONCE(notifier_to_errno(ret));
trace_cgroup_setup_root(root);
/*
 * There must be no failure case after here, since rebinding takes
 * care of subsystems' refcounts, which are explicitly dropped in
 * the failure exit path.
 */
list_add_rcu(&root->root_list, &cgroup_roots);
cgroup_root_count++;
/*
 * Link the root cgroup in this hierarchy into all the css_set
 * objects, using the links preallocated on tmp_links.  Each populated
 * css_set also marks the root cgroup as populated.
 */
spin_lock_irq(&css_set_lock);
hash_for_each(css_set_table, i, cset, hlist) {
link_css_set(&tmp_links, cset, root_cgrp); if (css_set_populated(cset))
cgroup_update_populated(root_cgrp, true);
}
spin_unlock_irq(&css_set_lock);
/*
 * The information on this website has been carefully compiled to the best
 * of our knowledge.  However, neither the completeness, nor the
 * correctness, nor the quality of the information provided is guaranteed.
 */
/*
 * Note: the syntax highlighting and the measurement are still
 * experimental.
 */