struct bpf_struct_ops_map { struct bpf_map map; conststruct bpf_struct_ops_desc *st_ops_desc; /* protect map_update */ struct mutex lock; /* link has all the bpf_links that is populated * to the func ptr of the kernel's struct * (in kvalue.data).
*/ struct bpf_link **links; /* ksyms for bpf trampolines */ struct bpf_ksym **ksyms;
u32 funcs_cnt;
u32 image_pages_cnt; /* image_pages is an array of pages that has all the trampolines * that stores the func args before calling the bpf_prog.
*/ void *image_pages[MAX_TRAMP_IMAGE_PAGES]; /* The owner moduler's btf. */ struct btf *btf; /* uvalue->data stores the kernel struct * (e.g. tcp_congestion_ops) that is more useful * to userspace than the kvalue. For example, * the bpf_prog's id is stored instead of the kernel * address of a func ptr.
*/ struct bpf_struct_ops_value *uvalue; /* kvalue.data stores the actual kernel's struct * (e.g. tcp_congestion_ops) that will be * registered to the kernel subsystem.
*/ struct bpf_struct_ops_value kvalue;
};
vt = btf_type_by_id(btf, value_id); if (btf_vlen(vt) != 2) {
pr_warn("The number of %s's members should be 2, but we get %d\n",
value_name, btf_vlen(vt)); returnfalse;
}
member = btf_type_member(vt);
mt = btf_type_by_id(btf, member->type);
common_value_type = btf_type_by_id(btf_vmlinux,
st_ops_ids[IDX_ST_OPS_COMMON_VALUE_ID]); if (mt != common_value_type) {
pr_warn("The first member of %s should be bpf_struct_ops_common_value\n",
value_name); returnfalse;
}
member++;
mt = btf_type_by_id(btf, member->type); if (mt != type) {
pr_warn("The second member of %s should be %s\n",
value_name, btf_name_by_offset(btf, type->name_off)); returnfalse;
}
returntrue;
}
staticvoid *bpf_struct_ops_image_alloc(void)
{ void *image; int err;
err = bpf_jit_charge_modmem(PAGE_SIZE); if (err) return ERR_PTR(err);
image = arch_alloc_bpf_trampoline(PAGE_SIZE); if (!image) {
bpf_jit_uncharge_modmem(PAGE_SIZE); return ERR_PTR(-ENOMEM);
}
/* Prepare argument info for every nullable or refcounted argument of a
 * member of a struct_ops type.
 *
 * Initialize a struct bpf_struct_ops_arg_info according to type info of
 * the arguments of a stub function. (Check kCFI for more information about
 * stub functions.)
 *
 * Each member in the struct_ops type has a struct bpf_struct_ops_arg_info
 * to provide an array of struct bpf_ctx_arg_aux, which in turn provides
 * the information that is used by the verifier to check the arguments of
 * the BPF struct_ops program assigned to the member. Here, we only care
 * about the arguments whose names are suffixed with __nullable or __ref.
 *
 * The array of struct bpf_ctx_arg_aux is eventually assigned to
 * prog->aux->ctx_arg_info of BPF struct_ops programs and passed to the
 * verifier. (See check_struct_ops_btf_id())
 *
 * arg_info->info will be the list of struct bpf_ctx_arg_aux on success.
 * On failure, it will be kept untouched.
*/ staticint prepare_arg_info(struct btf *btf, constchar *st_ops_name, constchar *member_name, conststruct btf_type *func_proto, void *stub_func_addr, struct bpf_struct_ops_arg_info *arg_info)
{ conststruct btf_type *stub_func_proto, *pointed_type; bool is_nullable = false, is_refcounted = false; conststruct btf_param *stub_args, *args; struct bpf_ctx_arg_aux *info, *info_buf;
u32 nargs, arg_no, info_cnt = 0; char ksym[KSYM_SYMBOL_LEN]; constchar *stub_fname; constchar *suffix;
s32 stub_func_id;
u32 arg_btf_id; int offset;
stub_fname = kallsyms_lookup((unsignedlong)stub_func_addr, NULL, NULL, NULL, ksym); if (!stub_fname) {
pr_warn("Cannot find the stub function name for the %s in struct %s\n",
member_name, st_ops_name); return -ENOENT;
}
stub_func_id = btf_find_by_name_kind(btf, stub_fname, BTF_KIND_FUNC); if (stub_func_id < 0) {
pr_warn("Cannot find the stub function %s in btf\n", stub_fname); return -ENOENT;
}
/* Check if the number of arguments of the stub function is the same * as the number of arguments of the function pointer.
*/
nargs = btf_type_vlen(func_proto); if (nargs != btf_type_vlen(stub_func_proto)) {
pr_warn("the number of arguments of the stub function %s does not match the number of arguments of the member %s of struct %s\n",
stub_fname, member_name, st_ops_name); return -EINVAL;
}
info_buf = kcalloc(nargs, sizeof(*info_buf), GFP_KERNEL); if (!info_buf) return -ENOMEM;
/* Prepare info for every nullable argument */
info = info_buf; for (arg_no = 0; arg_no < nargs; arg_no++) { /* Skip arguments that is not suffixed with * "__nullable or __ref".
*/
is_nullable = btf_param_match_suffix(btf, &stub_args[arg_no],
MAYBE_NULL_SUFFIX);
is_refcounted = btf_param_match_suffix(btf, &stub_args[arg_no],
REFCOUNTED_SUFFIX);
/* Should be a pointer to struct */
pointed_type = btf_type_resolve_ptr(btf,
args[arg_no].type,
&arg_btf_id); if (!pointed_type ||
!btf_type_is_struct(pointed_type)) {
pr_warn("stub function %s has %s tagging to an unsupported type\n",
stub_fname, suffix); goto err_out;
}
offset = btf_ctx_arg_offset(btf, func_proto, arg_no); if (offset < 0) {
pr_warn("stub function %s has an invalid trampoline ctx offset for arg#%u\n",
stub_fname, arg_no); goto err_out;
}
if (args[arg_no].type != stub_args[arg_no].type) {
pr_warn("arg#%u type in stub function %s does not match with its original func_proto\n",
arg_no, stub_fname); goto err_out;
}
/* Fill the information of the new argument */
info->btf_id = arg_btf_id;
info->btf = btf;
info->offset = offset; if (is_nullable) {
info->reg_type = PTR_TRUSTED | PTR_TO_BTF_ID | PTR_MAYBE_NULL;
} elseif (is_refcounted) {
info->reg_type = PTR_TRUSTED | PTR_TO_BTF_ID;
info->refcounted = true;
}
/* Clean up the arg_info in a struct bpf_struct_ops_desc. */ void bpf_struct_ops_desc_release(struct bpf_struct_ops_desc *st_ops_desc)
{ struct bpf_struct_ops_arg_info *arg_info; int i;
arg_info = st_ops_desc->arg_info; for (i = 0; i < btf_type_vlen(st_ops_desc->type); i++)
kfree(arg_info[i].info);
moff = __btf_member_bit_offset(t, member) / 8;
mname = btf_name_by_offset(btf, member->name_off); if (!*mname) {
pr_warn("anon member in struct %s is not supported\n",
st_ops->name);
err = -EOPNOTSUPP; goto errout;
}
if (__btf_member_bitfield_size(t, member)) {
pr_warn("bit field member %s in struct %s is not supported\n",
mname, st_ops->name);
err = -EOPNOTSUPP; goto errout;
}
if (!st_ops_ids[IDX_MODULE_ID] && is_module_member(btf, member->type)) {
pr_warn("'struct module' btf id not found. Is CONFIG_MODULES enabled? bpf_struct_ops '%s' needs module support.\n",
st_ops->name);
err = -EOPNOTSUPP; goto errout;
}
/* The member is not a function pointer or * the function pointer is not supported.
*/ if (!func_proto || bpf_struct_ops_supported(st_ops, moff)) continue;
if (func_proto->type) {
ret_type = btf_type_resolve_ptr(btf, func_proto->type, NULL); if (ret_type && !__btf_type_is_struct(ret_type)) {
pr_warn("func ptr %s in struct %s returns non-struct pointer, which is not supported\n",
mname, st_ops->name);
err = -EOPNOTSUPP; goto errout;
}
}
if (btf_distill_func_proto(log, btf,
func_proto, mname,
&st_ops->func_models[i])) {
pr_warn("Error in parsing func ptr %s in struct %s\n",
mname, st_ops->name);
err = -EINVAL; goto errout;
}
kvalue = &st_map->kvalue; /* Pair with smp_store_release() during map_update */
state = smp_load_acquire(&kvalue->common.state); if (state == BPF_STRUCT_OPS_STATE_INIT) {
memset(value, 0, map->value_size); return 0;
}
/* No lock is needed. state and refcnt do not need * to be updated together under atomic context.
*/
uvalue = value;
memcpy(uvalue, st_map->uvalue, map->value_size);
uvalue->common.state = state;
/* This value offers the user space a general estimate of how * many sockets are still utilizing this struct_ops for TCP * congestion control. The number might not be exact, but it * should sufficiently meet our present goals.
*/
refcnt = atomic64_read(&map->refcnt) - atomic64_read(&map->usercnt);
refcount_set(&uvalue->common.refcnt, max_t(s64, refcnt, 0));
/* put prog_id to udata */
*(unsignedlong *)(udata + moff) = prog->aux->id;
/* init ksym for this trampoline */
bpf_struct_ops_ksym_init(tname, mname,
image + trampoline_start,
image_off - trampoline_start,
ksym);
}
if (st_ops->validate) {
err = st_ops->validate(kdata); if (err) goto reset_unlock;
} for (i = 0; i < st_map->image_pages_cnt; i++) {
err = arch_protect_bpf_trampoline(st_map->image_pages[i],
PAGE_SIZE); if (err) goto reset_unlock;
}
if (st_map->map.map_flags & BPF_F_LINK) {
err = 0; /* Let bpf_link handle registration & unregistration. * * Pair with smp_load_acquire() during lookup_elem().
*/
smp_store_release(&kvalue->common.state, BPF_STRUCT_OPS_STATE_READY); goto unlock;
}
err = st_ops->reg(kdata, NULL); if (likely(!err)) { /* This refcnt increment on the map here after * 'st_ops->reg()' is secure since the state of the * map must be set to INIT at this moment, and thus * bpf_struct_ops_map_delete_elem() can't unregister * or transition it to TOBEFREE concurrently.
*/
bpf_map_inc(map); /* Pair with smp_load_acquire() during lookup_elem(). * It ensures the above udata updates (e.g. prog->aux->id) * can be seen once BPF_STRUCT_OPS_STATE_INUSE is set.
*/
smp_store_release(&kvalue->common.state, BPF_STRUCT_OPS_STATE_INUSE); goto unlock;
}
/* Error during st_ops->reg(). Can happen if this struct_ops needs to be * verified as a whole, after all init_member() calls. Can also happen if * there was a race in registering the struct_ops (under the same name) to * a sub-system through different struct_ops's maps.
*/
prev_state = cmpxchg(&st_map->kvalue.common.state,
BPF_STRUCT_OPS_STATE_INUSE,
BPF_STRUCT_OPS_STATE_TOBEFREE); switch (prev_state) { case BPF_STRUCT_OPS_STATE_INUSE:
st_map->st_ops_desc->st_ops->unreg(&st_map->kvalue.data, NULL);
bpf_map_put(map); return 0; case BPF_STRUCT_OPS_STATE_TOBEFREE: return -EINPROGRESS; case BPF_STRUCT_OPS_STATE_INIT: return -ENOENT; default:
WARN_ON_ONCE(1); /* Should never happen. Treat it as not found. */ return -ENOENT;
}
}
/* st_ops->owner was acquired during map_alloc to implicitly hold
 * the btf's refcnt. The acquire was only done when btf_is_module().
 * st_map->btf cannot be NULL here.
*/ if (btf_is_module(st_map->btf))
module_put(st_map->st_ops_desc->st_ops->owner);
bpf_struct_ops_map_del_ksyms(st_map);
/* The struct_ops's function may switch to another struct_ops.
 *
 * For example, bpf_tcp_cc_x->init() may switch to
 * another tcp_cc_y by calling
 * setsockopt(TCP_CONGESTION, "tcp_cc_y").
 * During the switch, bpf_struct_ops_put(tcp_cc_x) is called
 * and its refcount may reach 0, which then frees its
 * trampoline image while tcp_cc_x is still running.
 *
 * A vanilla rcu gp is to wait for all bpf-tcp-cc progs
 * to finish. A bpf-tcp-cc prog is non-sleepable.
 * An rcu_tasks gp is to wait for the last few insns
 * in the trampoline image to finish before releasing
 * the trampoline image.
*/
synchronize_rcu_mult(call_rcu, call_rcu_tasks);
if (attr->map_flags & BPF_F_VTYPE_BTF_OBJ_FD) { /* The map holds btf for its whole life time. */
btf = btf_get_by_fd(attr->value_type_btf_obj_fd); if (IS_ERR(btf)) return ERR_CAST(btf); if (!btf_is_module(btf)) {
btf_put(btf); return ERR_PTR(-EINVAL);
}
mod = btf_try_get_module(btf); /* mod holds a refcnt to btf. We don't need an extra refcnt * here.
*/
btf_put(btf); if (!mod) return ERR_PTR(-EINVAL);
} else {
btf = bpf_get_btf_vmlinux(); if (IS_ERR(btf)) return ERR_CAST(btf); if (!btf) return ERR_PTR(-ENOTSUPP);
}
st_ops_desc = bpf_struct_ops_find_value(btf, attr->btf_vmlinux_value_type_id); if (!st_ops_desc) {
ret = -ENOTSUPP; goto errout;
}
vt = st_ops_desc->value_type; if (attr->value_size != vt->size) {
ret = -EINVAL; goto errout;
}
old_st_map = container_of(old_map, struct bpf_struct_ops_map, map); /* The new and old struct_ops must be the same type. */ if (st_map->st_ops_desc != old_st_map->st_ops_desc) {
err = -EINVAL; goto err_out;
}
err = st_map->st_ops_desc->st_ops->update(st_map->kvalue.data, old_st_map->kvalue.data, link); if (err) goto err_out;
RCU_INIT_POINTER(st_link->map, NULL); /* Pair with bpf_map_get() in bpf_struct_ops_link_create() or * bpf_map_inc() in bpf_struct_ops_map_link_update().
*/
bpf_map_put(&st_map->map);
map = bpf_map_get(attr->link_create.map_fd); if (IS_ERR(map)) return PTR_ERR(map);
st_map = (struct bpf_struct_ops_map *)map;
if (!bpf_struct_ops_valid_to_reg(map)) {
err = -EINVAL; goto err_out;
}
link = kzalloc(sizeof(*link), GFP_USER); if (!link) {
err = -ENOMEM; goto err_out;
}
bpf_link_init(&link->link, BPF_LINK_TYPE_STRUCT_OPS, &bpf_struct_ops_map_lops, NULL,
attr->link_create.attach_type);
err = bpf_link_prime(&link->link, &link_primer); if (err) goto err_out;
init_waitqueue_head(&link->wait_hup);
/* Hold the update_mutex such that the subsystem cannot * do link->ops->detach() before the link is fully initialized.
*/
mutex_lock(&update_mutex);
err = st_map->st_ops_desc->st_ops->reg(st_map->kvalue.data, &link->link); if (err) {
mutex_unlock(&update_mutex);
bpf_link_cleanup(&link_primer);
link = NULL; goto err_out;
}
RCU_INIT_POINTER(link->map, map);
mutex_unlock(&update_mutex);
/*
 * The information on this website has been carefully compiled to the best
 * of our knowledge. However, neither completeness, nor correctness, nor
 * quality of the information provided is guaranteed.
 *
 * Note: the colored syntax highlighting and the measurement are still
 * experimental.
 */