/* cgroup bpf destruction makes heavy use of work items and there can be a lot
 * of concurrent destructions.  Use a separate workqueue so that cgroup bpf
 * destruction work items don't end up filling up max_active of system_wq
 * which may lead to deadlock.
 */
static struct workqueue_struct *cgroup_bpf_destroy_wq;

static int __init cgroup_bpf_wq_init(void)
{
	struct workqueue_struct *wq;

	/* max_active == 1: destructions are serialized on their own queue */
	wq = alloc_workqueue("cgroup_bpf_destroy", 0, 1);
	if (!wq)
		panic("Failed to alloc workqueue for cgroup bpf destroy.\n");
	cgroup_bpf_destroy_wq = wq;
	return 0;
}
core_initcall(cgroup_bpf_wq_init);
/* Called when a bpf_cgroup_link is auto-detached from its dying cgroup.
 *
 * Drops the cgroup reference and clears link->cgroup so later operations on
 * the link can tell it has been auto-detached.  The link memory itself is
 * not freed here; that happens in bpf_link's release() callback once the
 * last FD referring to it is closed.
 */
static void bpf_cgroup_link_auto_detach(struct bpf_cgroup_link *link)
{
	struct cgroup *cgrp = link->cgroup;

	link->cgroup = NULL;
	cgroup_put(cgrp);
}
/** * cgroup_bpf_release() - put references of all bpf programs and * release all cgroup bpf data * @work: work structure embedded into the cgroup to modify
*/ staticvoid cgroup_bpf_release(struct work_struct *work)
{ struct cgroup *p, *cgrp = container_of(work, struct cgroup,
bpf.release_work); struct bpf_prog_array *old_array; struct list_head *storages = &cgrp->bpf.storages; struct bpf_cgroup_storage *storage, *stmp;
/* Get underlying bpf_prog of bpf_prog_list entry, regardless if it's through * link or direct prog.
*/ staticstruct bpf_prog *prog_list_prog(struct bpf_prog_list *pl)
{ if (pl->prog) return pl->prog; if (pl->link) return pl->link->link.prog; return NULL;
}
/* Count the live (non-deleted) entries on @head.  Linear, but attach lists
 * are short by construction, so this is cheap.  When @preorder_cnt is
 * non-NULL it is additionally incremented once per BPF_F_PREORDER entry.
 */
static u32 prog_list_length(struct hlist_head *head, int *preorder_cnt)
{
	struct bpf_prog_list *entry;
	u32 n = 0;

	hlist_for_each_entry(entry, head, node) {
		if (!prog_list_prog(entry))
			continue;
		n++;
		if (preorder_cnt && (entry->flags & BPF_F_PREORDER))
			(*preorder_cnt)++;
	}

	return n;
}
/* if parent has non-overridable prog attached, * disallow attaching new programs to the descendent cgroup. * if parent has overridable or multi-prog, allow attaching
*/ staticbool hierarchy_allows_attach(struct cgroup *cgrp, enum cgroup_bpf_attach_type atype)
{ struct cgroup *p;
p = cgroup_parent(cgrp); if (!p) returntrue; do {
u32 flags = p->bpf.flags[atype];
u32 cnt;
if (flags & BPF_F_ALLOW_MULTI) returntrue;
cnt = prog_list_length(&p->bpf.progs[atype], NULL);
WARN_ON_ONCE(cnt > 1); if (cnt == 1) return !!(flags & BPF_F_ALLOW_OVERRIDE);
p = cgroup_parent(p);
} while (p); returntrue;
}
/* compute a chain of effective programs for a given cgroup: * start from the list of programs in this cgroup and add * all parent programs. * Note that parent's F_ALLOW_OVERRIDE-type program is yielding * to programs in this cgroup
*/ staticint compute_effective_progs(struct cgroup *cgrp, enum cgroup_bpf_attach_type atype, struct bpf_prog_array **array)
{ struct bpf_prog_array_item *item; struct bpf_prog_array *progs; struct bpf_prog_list *pl; struct cgroup *p = cgrp; int i, j, cnt = 0, preorder_cnt = 0, fstart, bstart, init_bstart;
/* count number of effective programs by walking parents */ do { if (cnt == 0 || (p->bpf.flags[atype] & BPF_F_ALLOW_MULTI))
cnt += prog_list_length(&p->bpf.progs[atype], &preorder_cnt);
p = cgroup_parent(p);
} while (p);
progs = bpf_prog_array_alloc(cnt, GFP_KERNEL); if (!progs) return -ENOMEM;
/* populate the array with effective progs */
cnt = 0;
p = cgrp;
fstart = preorder_cnt;
bstart = preorder_cnt - 1; do { if (cnt > 0 && !(p->bpf.flags[atype] & BPF_F_ALLOW_MULTI)) continue;
init_bstart = bstart;
hlist_for_each_entry(pl, &p->bpf.progs[atype], node) { if (!prog_list_prog(pl)) continue;
/* reverse pre-ordering progs at this cgroup level */ for (i = bstart + 1, j = init_bstart; i < j; i++, j--)
swap(progs->items[i], progs->items[j]);
} while ((p = cgroup_parent(p)));
*array = progs; return 0;
}
/* Publish a freshly computed effective prog array for @atype on @cgrp.
 *
 * @old_array carries the new array on entry; rcu_replace_pointer() hands
 * back the previously active array, which is freed only after the grace
 * period since __cgroup_bpf_run_*() might still be walking it.
 */
static void activate_effective_progs(struct cgroup *cgrp,
				     enum cgroup_bpf_attach_type atype,
				     struct bpf_prog_array *old_array)
{
	struct bpf_prog_array *prev;

	prev = rcu_replace_pointer(cgrp->bpf.effective[atype], old_array,
				   lockdep_is_held(&cgroup_mutex));
	bpf_prog_array_free(prev);
}
/**
 * cgroup_bpf_inherit() - inherit effective programs from parent
 * @cgrp: the cgroup to modify
 *
 * Initializes @cgrp's bpf state (refcount, per-type prog lists, storages),
 * pins every ancestor with cgroup_bpf_get(), then computes and activates
 * the effective program arrays inherited from the hierarchy.
 *
 * Return: 0 on success, negative errno on failure (-ENOMEM when computing
 * the effective arrays fails; in that case all partial state is unwound).
 */
static int cgroup_bpf_inherit(struct cgroup *cgrp)
{
/* has to use macro instead of const int, since compiler thinks
 * that array below is variable length
 */
#define	NR ARRAY_SIZE(cgrp->bpf.effective)
	struct bpf_prog_array *arrays[NR] = {};
	struct cgroup *p;
	int ret, i;

	ret = percpu_ref_init(&cgrp->bpf.refcnt, cgroup_bpf_release_fn, 0,
			      GFP_KERNEL);
	if (ret)
		return ret;

	/* ancestors must stay alive as long as this cgroup's bpf state does */
	for (p = cgroup_parent(cgrp); p; p = cgroup_parent(p))
		cgroup_bpf_get(p);

	for (i = 0; i < NR; i++)
		INIT_HLIST_HEAD(&cgrp->bpf.progs[i]);

	INIT_LIST_HEAD(&cgrp->bpf.storages);

	for (i = 0; i < NR; i++)
		if (compute_effective_progs(cgrp, i, &arrays[i]))
			goto cleanup;

	for (i = 0; i < NR; i++)
		activate_effective_progs(cgrp, i, arrays[i]);

	return 0;
cleanup:
	/* OOM while computing effective arrays: none were activated, so the
	 * partially computed ones can simply be freed.
	 */
	for (i = 0; i < NR; i++)
		bpf_prog_array_free(arrays[i]);

	for (p = cgroup_parent(cgrp); p; p = cgroup_parent(p))
		cgroup_bpf_put(p);

	percpu_ref_exit(&cgrp->bpf.refcnt);

	return -ENOMEM;
/* don't let the helper macro leak into the rest of the file */
#undef NR
}
staticint cgroup_bpf_lifetime_notify(struct notifier_block *nb, unsignedlong action, void *data)
{ struct cgroup *cgrp = data; int ret = 0;
if (cgrp->root != &cgrp_dfl_root) return NOTIFY_OK;
switch (action) { case CGROUP_LIFETIME_ONLINE:
ret = cgroup_bpf_inherit(cgrp); break; case CGROUP_LIFETIME_OFFLINE:
cgroup_bpf_offline(cgrp); break;
}
cleanup: /* oom while computing effective. Free all computed effective arrays * since they were not activated
*/
css_for_each_descendant_pre(css, &cgrp->self) { struct cgroup *desc = container_of(css, struct cgroup, self);
/* single-attach case */ if (!allow_multi) { if (hlist_empty(progs)) return NULL; return hlist_entry(progs->first, typeof(*pl), node);
}
hlist_for_each_entry(pl, progs, node) { if (prog && pl->prog == prog && prog != replace_prog) /* disallow attaching the same prog twice */ return ERR_PTR(-EINVAL); if (link && pl->link == link) /* disallow attaching the same link twice */ return ERR_PTR(-EINVAL);
}
/* direct prog multi-attach w/ replacement case */ if (replace_prog) {
hlist_for_each_entry(pl, progs, node) { if (pl->prog == replace_prog) /* a match found */ return pl;
} /* prog to replace not found for cgroup */ return ERR_PTR(-ENOENT);
}
is_before = flags & BPF_F_BEFORE;
is_after = flags & BPF_F_AFTER; if (is_link || is_id || id_or_fd) { /* flags must have either BPF_F_BEFORE or BPF_F_AFTER */ if (is_before == is_after) return ERR_PTR(-EINVAL); if ((is_link && !link) || (!is_link && !prog)) return ERR_PTR(-EINVAL);
} elseif (!hlist_empty(progs)) { /* flags cannot have both BPF_F_BEFORE and BPF_F_AFTER */ if (is_before && is_after) return ERR_PTR(-EINVAL);
}
if (is_link) {
anchor_link = bpf_get_anchor_link(flags, id_or_fd); if (IS_ERR(anchor_link)) return ERR_CAST(anchor_link);
} elseif (is_id || id_or_fd) {
anchor_prog = bpf_get_anchor_prog(flags, id_or_fd); if (IS_ERR(anchor_prog)) return ERR_CAST(anchor_prog);
}
if (!anchor_prog && !anchor_link) { /* if there is no anchor_prog/anchor_link, then BPF_F_PREORDER * doesn't matter since either prepend or append to a combined * list of progs will end up with correct result.
*/
hlist_for_each_entry(pltmp, progs, node) { if (is_before) return pltmp; if (pltmp->node.next) continue; return pltmp;
} return NULL;
}
/** * __cgroup_bpf_attach() - Attach the program or the link to a cgroup, and * propagate the change to descendants * @cgrp: The cgroup which descendants to traverse * @prog: A program to attach * @link: A link to attach * @replace_prog: Previously attached program to replace if BPF_F_REPLACE is set * @type: Type of attach operation * @flags: Option flags * @id_or_fd: Relative prog id or fd * @revision: bpf_prog_list revision * * Exactly one of @prog or @link can be non-null. * Must be called with cgroup_mutex held.
*/ staticint __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, struct bpf_prog *replace_prog, struct bpf_cgroup_link *link, enum bpf_attach_type type, u32 flags, u32 id_or_fd,
u64 revision)
{
u32 saved_flags = (flags & (BPF_F_ALLOW_OVERRIDE | BPF_F_ALLOW_MULTI)); struct bpf_prog *old_prog = NULL; struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {}; struct bpf_cgroup_storage *new_storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {}; struct bpf_prog *new_prog = prog ? : link->link.prog; enum cgroup_bpf_attach_type atype; struct bpf_prog_list *pl; struct hlist_head *progs; int err;
if (((flags & BPF_F_ALLOW_OVERRIDE) && (flags & BPF_F_ALLOW_MULTI)) ||
((flags & BPF_F_REPLACE) && !(flags & BPF_F_ALLOW_MULTI))) /* invalid combination */ return -EINVAL; if ((flags & BPF_F_REPLACE) && (flags & (BPF_F_BEFORE | BPF_F_AFTER))) /* only either replace or insertion with before/after */ return -EINVAL; if (link && (prog || replace_prog)) /* only either link or prog/replace_prog can be specified */ return -EINVAL; if (!!replace_prog != !!(flags & BPF_F_REPLACE)) /* replace_prog implies BPF_F_REPLACE, and vice versa */ return -EINVAL;
atype = bpf_cgroup_atype_find(type, new_prog->aux->attach_btf_id); if (atype < 0) return -EINVAL; if (revision && revision != cgrp->bpf.revisions[atype]) return -ESTALE;
progs = &cgrp->bpf.progs[atype];
if (!hierarchy_allows_attach(cgrp, atype)) return -EPERM;
if (!hlist_empty(progs) && cgrp->bpf.flags[atype] != saved_flags) /* Disallow attaching non-overridable on top * of existing overridable in this cgroup. * Disallow attaching multi-prog if overridable or none
*/ return -EPERM;
if (prog_list_length(progs, NULL) >= BPF_CGROUP_MAX_PROGS) return -E2BIG;
/* Swap updated BPF program for given link in effective program arrays across * all descendant cgroups. This function is guaranteed to succeed.
*/ staticvoid replace_effective_prog(struct cgroup *cgrp, enum cgroup_bpf_attach_type atype, struct bpf_cgroup_link *link)
{ struct bpf_prog_array_item *item; struct cgroup_subsys_state *css; struct bpf_prog_array *progs; struct bpf_prog_list *pl; struct hlist_head *head; struct cgroup *cg; int pos;
if (percpu_ref_is_zero(&desc->bpf.refcnt)) continue;
/* find position of link in effective progs array */ for (pos = 0, cg = desc; cg; cg = cgroup_parent(cg)) { if (pos && !(cg->bpf.flags[atype] & BPF_F_ALLOW_MULTI)) continue;
/** * __cgroup_bpf_replace() - Replace link's program and propagate the change * to descendants * @cgrp: The cgroup which descendants to traverse * @link: A link for which to replace BPF program * @new_prog: &struct bpf_prog for the target BPF program with its refcnt * incremented * * Must be called with cgroup_mutex held.
*/ staticint __cgroup_bpf_replace(struct cgroup *cgrp, struct bpf_cgroup_link *link, struct bpf_prog *new_prog)
{ enum cgroup_bpf_attach_type atype; struct bpf_prog *old_prog; struct bpf_prog_list *pl; struct hlist_head *progs; bool found = false;
atype = bpf_cgroup_atype_find(link->link.attach_type, new_prog->aux->attach_btf_id); if (atype < 0) return -EINVAL;
progs = &cgrp->bpf.progs[atype];
if (link->link.prog->type != new_prog->type) return -EINVAL;
hlist_for_each_entry(pl, progs, node) { if (pl->link == link) {
found = true; break;
}
} if (!found) return -ENOENT;
cgroup_lock(); /* link might have been auto-released by dying cgroup, so fail */ if (!cg_link->cgroup) {
ret = -ENOLINK; goto out_unlock;
} if (old_prog && link->prog != old_prog) {
ret = -EPERM; goto out_unlock;
}
ret = __cgroup_bpf_replace(cg_link->cgroup, cg_link, new_prog);
out_unlock:
cgroup_unlock(); return ret;
}
if (!allow_multi) { if (hlist_empty(progs)) /* report error when trying to detach and nothing is attached */ return ERR_PTR(-ENOENT);
/* to maintain backward compatibility NONE and OVERRIDE cgroups * allow detaching with invalid FD (prog==NULL) in legacy mode
*/ return hlist_entry(progs->first, typeof(*pl), node);
}
if (!prog && !link) /* to detach MULTI prog the user has to specify valid FD * of the program or link to be detached
*/ return ERR_PTR(-EINVAL);
/* find the prog or link and detach it */
hlist_for_each_entry(pl, progs, node) { if (pl->prog == prog && pl->link == link) return pl;
} return ERR_PTR(-ENOENT);
}
/**
 * purge_effective_progs() - After compute_effective_progs fails to alloc new
 *			     cgrp->bpf.inactive table we can recover by
 *			     recomputing the array in place.
 *
 * @cgrp: The cgroup which descendants to traverse
 * @prog: A program to detach or NULL
 * @link: A link to detach or NULL
 * @atype: Type of detach operation
 */
static void purge_effective_progs(struct cgroup *cgrp, struct bpf_prog *prog,
				  struct bpf_cgroup_link *link,
				  enum cgroup_bpf_attach_type atype)
{
	struct cgroup_subsys_state *css;

	/* recompute effective prog array in place */
	css_for_each_descendant_pre(css, &cgrp->self) {
		struct cgroup *desc = container_of(css, struct cgroup, self);
		struct bpf_prog_array *effective;
		struct bpf_prog_list *pl;
		struct hlist_head *head;
		struct cgroup *cg;
		int idx = 0;

		if (percpu_ref_is_zero(&desc->bpf.refcnt))
			continue;

		/* find position of link or prog in effective progs array */
		for (cg = desc; cg; cg = cgroup_parent(cg)) {
			if (idx && !(cg->bpf.flags[atype] & BPF_F_ALLOW_MULTI))
				continue;

			head = &cg->bpf.progs[atype];
			hlist_for_each_entry(pl, head, node) {
				if (!prog_list_prog(pl))
					continue;
				if (pl->prog == prog && pl->link == link)
					goto found;
				idx++;
			}
		}

		/* no link or prog match, skip the cgroup of this layer */
		continue;
found:
		effective = rcu_dereference_protected(
				desc->bpf.effective[atype],
				lockdep_is_held(&cgroup_mutex));

		/* Remove the program from the array */
		WARN_ONCE(bpf_prog_array_delete_safe_at(effective, idx),
			  "Failed to purge a prog from array at index %d", idx);
	}
}
/** * __cgroup_bpf_detach() - Detach the program or link from a cgroup, and * propagate the change to descendants * @cgrp: The cgroup which descendants to traverse * @prog: A program to detach or NULL * @link: A link to detach or NULL * @type: Type of detach operation * @revision: bpf_prog_list revision * * At most one of @prog or @link can be non-NULL. * Must be called with cgroup_mutex held.
*/ staticint __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, struct bpf_cgroup_link *link, enum bpf_attach_type type,
u64 revision)
{ enum cgroup_bpf_attach_type atype; struct bpf_prog *old_prog; struct bpf_prog_list *pl; struct hlist_head *progs;
u32 attach_btf_id = 0;
u32 flags;
if (prog)
attach_btf_id = prog->aux->attach_btf_id; if (link)
attach_btf_id = link->link.prog->aux->attach_btf_id;
atype = bpf_cgroup_atype_find(type, attach_btf_id); if (atype < 0) return -EINVAL;
if (revision && revision != cgrp->bpf.revisions[atype]) return -ESTALE;
/* mark it deleted, so it's ignored while recomputing effective */
old_prog = pl->prog;
pl->prog = NULL;
pl->link = NULL;
if (update_effective_progs(cgrp, atype)) { /* if update effective array failed replace the prog with a dummy prog*/
pl->prog = old_prog;
pl->link = link;
purge_effective_progs(cgrp, old_prog, link, atype);
}
/* now can actually delete it from this cgroup list */
hlist_del(&pl->node);
cgrp->bpf.revisions[atype] += 1;
kfree(pl); if (hlist_empty(progs)) /* last program was detached, reset flags to zero */
cgrp->bpf.flags[atype] = 0; if (old_prog) { if (type == BPF_LSM_CGROUP)
bpf_trampoline_unlink_cgroup_shim(old_prog);
bpf_prog_put(old_prog);
}
static_branch_dec(&cgroup_bpf_enabled_key[atype]); return 0;
}
/* Handle the BPF_PROG_QUERY command for a cgroup target: resolve the cgroup
 * from the fd in @attr, delegate to cgroup_bpf_query() and drop the cgroup
 * reference again.  Returns 0 or a negative errno.
 */
int cgroup_bpf_prog_query(const union bpf_attr *attr,
			  union bpf_attr __user *uattr)
{
	struct cgroup *cgrp = cgroup_get_from_fd(attr->query.target_fd);
	int ret;

	if (IS_ERR(cgrp))
		return PTR_ERR(cgrp);

	ret = cgroup_bpf_query(cgrp, attr, uattr);
	cgroup_put(cgrp);

	return ret;
}
/** * __cgroup_bpf_run_filter_skb() - Run a program for packet filtering * @sk: The socket sending or receiving traffic * @skb: The skb that is being sent or received * @atype: The type of program to be executed * * If no socket is passed, or the socket is not of type INET or INET6, * this function does nothing and returns 0. * * The program type passed in via @type must be suitable for network * filtering. No further check is performed to assert that. * * For egress packets, this function can return: * NET_XMIT_SUCCESS (0) - continue with packet output * NET_XMIT_DROP (1) - drop packet and notify TCP to call cwr * NET_XMIT_CN (2) - continue with packet output and notify TCP * to call cwr * -err - drop packet * * For ingress packets, this function will return -EPERM if any * attached program was found and if it returned != 1 during execution. * Otherwise 0 is returned.
*/ int __cgroup_bpf_run_filter_skb(struct sock *sk, struct sk_buff *skb, enum cgroup_bpf_attach_type atype)
{ unsignedint offset = -skb_network_offset(skb); struct sock *save_sk; void *saved_data_end; struct cgroup *cgrp; int ret;
if (sk->sk_family != AF_INET && sk->sk_family != AF_INET6) return 0;
ret = bpf_prog_run_array_cg(&cgrp->bpf, atype, skb,
__bpf_prog_run_save_cb, 0, &flags);
/* Return values of CGROUP EGRESS BPF programs are: * 0: drop packet * 1: keep packet * 2: drop packet and cn * 3: keep packet and cn * * The returned value is then converted to one of the NET_XMIT * or an error code that is then interpreted as drop packet * (and no cn): * 0: NET_XMIT_SUCCESS skb should be transmitted * 1: NET_XMIT_DROP skb should be dropped and cn * 2: NET_XMIT_CN skb should be transmitted and cn * 3: -err skb should be dropped
*/
cn = flags & BPF_RET_SET_CN; if (ret && !IS_ERR_VALUE((long)ret))
ret = -EFAULT; if (!ret)
ret = (cn ? NET_XMIT_CN : NET_XMIT_SUCCESS); else
ret = (cn ? NET_XMIT_DROP : ret);
} else {
ret = bpf_prog_run_array_cg(&cgrp->bpf, atype,
skb, __bpf_prog_run_save_cb, 0,
NULL); if (ret && !IS_ERR_VALUE((long)ret))
ret = -EFAULT;
}
bpf_restore_data_end(skb, saved_data_end);
__skb_pull(skb, offset);
skb->sk = save_sk;
/** * __cgroup_bpf_run_filter_sk() - Run a program on a sock * @sk: sock structure to manipulate * @atype: The type of program to be executed * * socket is passed is expected to be of type INET or INET6. * * The program type passed in via @type must be suitable for sock * filtering. No further check is performed to assert that. * * This function will return %-EPERM if any if an attached program was found * and if it returned != 1 during execution. In all other cases, 0 is returned.
*/ int __cgroup_bpf_run_filter_sk(struct sock *sk, enum cgroup_bpf_attach_type atype)
{ struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
/** * __cgroup_bpf_run_filter_sock_addr() - Run a program on a sock and * provided by user sockaddr * @sk: sock struct that will use sockaddr * @uaddr: sockaddr struct provided by user * @uaddrlen: Pointer to the size of the sockaddr struct provided by user. It is * read-only for AF_INET[6] uaddr but can be modified for AF_UNIX * uaddr. * @atype: The type of program to be executed * @t_ctx: Pointer to attach type specific context * @flags: Pointer to u32 which contains higher bits of BPF program * return value (OR'ed together). * * socket is expected to be of type INET, INET6 or UNIX. * * This function will return %-EPERM if an attached program is found and * returned value != 1 during execution. In all other cases, 0 is returned.
*/ int __cgroup_bpf_run_filter_sock_addr(struct sock *sk, struct sockaddr *uaddr, int *uaddrlen, enum cgroup_bpf_attach_type atype, void *t_ctx,
u32 *flags)
{ struct bpf_sock_addr_kern ctx = {
.sk = sk,
.uaddr = uaddr,
.t_ctx = t_ctx,
}; struct sockaddr_storage unspec; struct cgroup *cgrp; int ret;
/* Check socket family since not all sockets represent network * endpoint (e.g. AF_UNIX).
*/ if (sk->sk_family != AF_INET && sk->sk_family != AF_INET6 &&
sk->sk_family != AF_UNIX) return 0;
/** * __cgroup_bpf_run_filter_sock_ops() - Run a program on a sock * @sk: socket to get cgroup from * @sock_ops: bpf_sock_ops_kern struct to pass to program. Contains * sk with connection information (IP addresses, etc.) May not contain * cgroup info if it is a req sock. * @atype: The type of program to be executed * * socket passed is expected to be of type INET or INET6. * * The program type passed in via @type must be suitable for sock_ops * filtering. No further check is performed to assert that. * * This function will return %-EPERM if any if an attached program was found * and if it returned != 1 during execution. In all other cases, 0 is returned.
*/ int __cgroup_bpf_run_filter_sock_ops(struct sock *sk, struct bpf_sock_ops_kern *sock_ops, enum cgroup_bpf_attach_type atype)
{ struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
BPF_CALL_2(bpf_get_local_storage, struct bpf_map *, map, u64, flags)
{ /* flags argument is not used now, * but provides an ability to extend the API. * verifier checks that its value is correct.
*/ enum bpf_cgroup_storage_type stype = cgroup_storage_type(map); struct bpf_cgroup_storage *storage; struct bpf_cg_run_ctx *ctx; void *ptr;
/* get current cgroup storage from BPF run context */
ctx = container_of(current->bpf_ctx, struct bpf_cg_run_ctx, run_ctx);
storage = ctx->prog_item->cgroup_storage[stype];
/** * __cgroup_bpf_run_filter_sysctl - Run a program on sysctl * * @head: sysctl table header * @table: sysctl table * @write: sysctl is being read (= 0) or written (= 1) * @buf: pointer to buffer (in and out) * @pcount: value-result argument: value is size of buffer pointed to by @buf, * result is size of @new_buf if program set new value, initial value * otherwise * @ppos: value-result argument: value is position at which read from or write * to sysctl is happening, result is new position if program overrode it, * initial value otherwise * @atype: type of program to be executed * * Program is run when sysctl is being accessed, either read or written, and * can allow or deny such access. * * This function will return %-EPERM if an attached program is found and * returned value != 1 during execution. In all other cases 0 is returned.
*/ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head, conststruct ctl_table *table, int write, char **buf, size_t *pcount, loff_t *ppos, enum cgroup_bpf_attach_type atype)
{ struct bpf_sysctl_kern ctx = {
.head = head,
.table = table,
.write = write,
.ppos = ppos,
.cur_val = NULL,
.cur_len = PAGE_SIZE,
.new_val = NULL,
.new_len = 0,
.new_updated = 0,
}; struct cgroup *cgrp;
loff_t pos = 0; int ret;
ctx.cur_val = kmalloc_track_caller(ctx.cur_len, GFP_KERNEL); if (!ctx.cur_val ||
table->proc_handler(table, 0, ctx.cur_val, &ctx.cur_len, &pos)) { /* Let BPF program decide how to proceed. */
ctx.cur_len = 0;
}
if (write && *buf && *pcount) { /* BPF program should be able to override new value with a * buffer bigger than provided by user.
*/
ctx.new_val = kmalloc_track_caller(PAGE_SIZE, GFP_KERNEL);
ctx.new_len = min_t(size_t, PAGE_SIZE, *pcount); if (ctx.new_val) {
memcpy(ctx.new_val, *buf, ctx.new_len);
} else { /* Let BPF program decide how to proceed. */
ctx.new_len = 0;
}
}
int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level, int *optname, sockptr_t optval, int *optlen, char **kernel_optval)
{ struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); struct bpf_sockopt_buf buf = {}; struct bpf_sockopt_kern ctx = {
.sk = sk,
.level = *level,
.optname = *optname,
}; int ret, max_optlen;
/* Allocate a bit more than the initial user buffer for * BPF program. The canonical use case is overriding * TCP_CONGESTION(nv) to TCP_CONGESTION(cubic).
*/
max_optlen = max_t(int, 16, *optlen);
max_optlen = sockopt_alloc_buf(&ctx, max_optlen, &buf); if (max_optlen < 0) return max_optlen;
ctx.optlen = *optlen;
if (copy_from_sockptr(ctx.optval, optval,
min(*optlen, max_optlen))) {
ret = -EFAULT; goto out;
}
lock_sock(sk);
ret = bpf_prog_run_array_cg(&cgrp->bpf, CGROUP_SETSOCKOPT,
&ctx, bpf_prog_run, 0, NULL);
release_sock(sk);
if (ret) goto out;
if (ctx.optlen == -1) { /* optlen set to -1, bypass kernel */
ret = 1;
} elseif (ctx.optlen > max_optlen || ctx.optlen < -1) { /* optlen is out of bounds */ if (*optlen > PAGE_SIZE && ctx.optlen >= 0) {
pr_info_once("bpf setsockopt: ignoring program buffer with optlen=%d (max_optlen=%d)\n",
ctx.optlen, max_optlen);
ret = 0; goto out;
}
ret = -EFAULT;
} else { /* optlen within bounds, run kernel handler */
ret = 0;
/* optlen == 0 from BPF indicates that we should * use original userspace data.
*/ if (ctx.optlen != 0) {
*optlen = ctx.optlen; /* We've used bpf_sockopt_kern->buf as an intermediary * storage, but the BPF program indicates that we need * to pass this data to the kernel setsockopt handler. * No way to export on-stack buf, have to allocate a * new buffer.
*/ if (!sockopt_buf_allocated(&ctx, &buf)) { void *p = kmalloc(ctx.optlen, GFP_USER);
if (!retval) { /* If kernel getsockopt finished successfully, * copy whatever was returned to the user back * into our temporary buffer. Set optlen to the * one that kernel returned as well to let * BPF programs inspect the value.
*/ if (copy_from_sockptr(&ctx.optlen, optlen, sizeof(ctx.optlen))) {
ret = -EFAULT; goto out;
}
if (ctx.optlen < 0) {
ret = -EFAULT; goto out;
}
orig_optlen = ctx.optlen;
if (copy_from_sockptr(ctx.optval, optval,
min(ctx.optlen, max_optlen))) {
ret = -EFAULT; goto out;
}
}
lock_sock(sk);
ret = bpf_prog_run_array_cg(&cgrp->bpf, CGROUP_GETSOCKOPT,
&ctx, bpf_prog_run, retval, NULL);
release_sock(sk);
if (ret < 0) goto out;
if (!sockptr_is_null(optval) &&
(ctx.optlen > max_optlen || ctx.optlen < 0)) { if (orig_optlen > PAGE_SIZE && ctx.optlen >= 0) {
pr_info_once("bpf getsockopt: ignoring program buffer with optlen=%d (max_optlen=%d)\n",
ctx.optlen, max_optlen);
ret = retval; goto out;
}
ret = -EFAULT; goto out;
}
if (ctx.optlen != 0) { if (!sockptr_is_null(optval) &&
copy_to_sockptr(optval, ctx.optval, ctx.optlen)) {
ret = -EFAULT; goto out;
} if (copy_to_sockptr(optlen, &ctx.optlen, sizeof(ctx.optlen))) {
ret = -EFAULT; goto out;
}
}
out:
sockopt_free_buf(&ctx, &buf); return ret;
}
/* Run CGROUP_GETSOCKOPT programs on a kernel-space getsockopt result.
 *
 * Note that __cgroup_bpf_run_filter_getsockopt doesn't copy user data back
 * into the BPF buffer when retval != 0.  This is done as an optimization to
 * avoid an extra copy, assuming the kernel won't populate the data in case
 * of an error.  Here we always pass the data; memset() should be called by
 * the caller if that data shouldn't be "exported".
 *
 * Return: a negative error if a program rejected the call, -EFAULT if a
 * program grew optlen beyond the caller's buffer, otherwise @retval.
 * (NOTE(review): the visible original was truncated before its final
 * return; `return ret;` restored to make the non-void function well-formed.)
 */
int __cgroup_bpf_run_filter_getsockopt_kern(struct sock *sk, int level,
					    int optname, void *optval,
					    int *optlen, int retval)
{
	struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
	struct bpf_sockopt_kern ctx = {
		.sk = sk,
		.level = level,
		.optname = optname,
		.optlen = *optlen,
		.optval = optval,
		.optval_end = optval + *optlen,
		.current_task = current,
	};
	int ret;

	ret = bpf_prog_run_array_cg(&cgrp->bpf, CGROUP_GETSOCKOPT,
				    &ctx, bpf_prog_run, retval, NULL);
	if (ret < 0)
		return ret;

	/* a program must not claim more data than the caller's buffer holds */
	if (ctx.optlen > *optlen)
		return -EFAULT;

	/* BPF programs can shrink the buffer, export the modifications. */
	if (ctx.optlen != 0)
		*optlen = ctx.optlen;

	return ret;
}
switch (si->off) { case offsetof(struct bpf_sysctl, write):
*insn++ = BPF_LDX_MEM(
BPF_SIZE(si->code), si->dst_reg, si->src_reg,
bpf_target_off(struct bpf_sysctl_kern, write,
sizeof_field(struct bpf_sysctl_kern,
write),
target_size)); break; case offsetof(struct bpf_sysctl, file_pos): /* ppos is a pointer so it should be accessed via indirect * loads and stores. Also for stores additional temporary * register is used since neither src_reg nor dst_reg can be * overridden.
*/ if (type == BPF_WRITE) { int treg = BPF_REG_9;
/* NOTE(review): the following German website disclaimer is scraper residue,
 * not part of this source file.  It is preserved verbatim inside a comment
 * so it cannot be parsed as code:
 *
 * Die Informationen auf dieser Webseite wurden nach bestem Wissen
 * sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch
 * Richtigkeit, noch Qualität der bereitgestellten Informationen zugesichert.
 * Bemerkung: Die farbliche Syntaxdarstellung und die Messung sind noch
 * experimentell.
 */