if (p) { /* -EAGAIN */ if (task_cpu(p) != smp_processor_id()) return;
/* * Now that we're on right CPU with IRQs disabled, we can test * if we hit the right task without races.
*/
tfc->ret = -ESRCH; /* No such (running) process */ if (p != current) return;
}
tfc->ret = tfc->func(tfc->info);
}
/**
 * task_function_call - call a function on the cpu on which a task runs
 * @p:		the task to evaluate
 * @func:	the function to be called
 * @info:	the function call argument
 *
 * Calls the function @func when the task is currently running. This might
 * be on the current CPU, which just calls the function directly.
 *
 * The attempt is repeated for as long as it reports -EAGAIN. That is the
 * initial value of the result slot, and remote_function() leaves it
 * untouched when it finds that @p is no longer on the CPU the call was sent
 * to. Any other result, including a non-zero error returned by
 * smp_call_function_single() itself, ends the loop and is returned as is.
 *
 * returns @func return value or -ESRCH or -ENXIO when the process isn't running
 */
static int
task_function_call(struct task_struct *p, remote_function_f func, void *info)
{
	struct remote_function_call data = {
		.p	= p,
		.func	= func,
		.info	= info,
		.ret	= -EAGAIN,
	};
	int ret;

	for (;;) {
		/* Last argument is 1: wait for the remote call to finish. */
		ret = smp_call_function_single(task_cpu(p), remote_function,
					       &data, 1);
		/* The call was delivered; use what remote_function() recorded. */
		if (!ret)
			ret = data.ret;

		if (ret != -EAGAIN)
			break;

		/* Give the scheduler a chance before trying again. */
		cond_resched();
	}

	return ret;
}
/** * cpu_function_call - call a function on the cpu * @cpu: target cpu to queue this function * @func: the function to be called * @info: the function call argument * * Calls the function @func on the remote cpu. * * returns: @func return value or -ENXIO when the cpu is offline
*/ staticint cpu_function_call(int cpu, remote_function_f func, void *info)
{ struct remote_function_call data = {
.p = NULL,
.func = func,
.info = info,
.ret = -ENXIO, /* No such CPU */
};
/*
 * Release @ctx->lock, first tidying up the EVENT_FROZEN marker in
 * ctx->is_active if it is set.
 */
static inline void __perf_ctx_unlock(struct perf_event_context *ctx)
{
	/*
	 * If ctx_sched_in() didn't again set any ALL flags, clean up
	 * after ctx_sched_out() by clearing is_active.
	 */
	if (ctx->is_active & EVENT_FROZEN) {
		if (!(ctx->is_active & EVENT_ALL))
			ctx->is_active = 0;
		else
			/* Something is active again: only drop the marker. */
			ctx->is_active &= ~EVENT_FROZEN;
	}
	raw_spin_unlock(&ctx->lock);
}
/* * On task ctx scheduling... * * When !ctx->nr_events a task context will not be scheduled. This means * we can disable the scheduler hooks (for performance) without leaving * pending task ctx state. * * This however results in two special cases: * * - removing the last event from a task ctx; this is relatively straight * forward and is done in __perf_remove_from_context. * * - adding the first event to a task ctx; this is tricky because we cannot * rely on ctx->is_active and therefore cannot use event_function_call(). * See perf_install_in_context(). * * If ctx->nr_events, then ctx->is_active and cpuctx->task_ctx are set.
*/
perf_ctx_lock(cpuctx, task_ctx); /* * Since we do the IPI call without holding ctx->lock things can have * changed, double check we hit the task we set out to hit.
*/ if (ctx->task) { if (ctx->task != current) {
ret = -ESRCH; goto unlock;
}
/* * We only use event_function_call() on established contexts, * and event_function() is only ever called when active (or * rather, we'll have bailed in task_function_call() or the * above ctx->task != current test), therefore we must have * ctx->is_active here.
*/
WARN_ON_ONCE(!ctx->is_active); /* * And since we have ctx->is_active, cpuctx->task_ctx must * match.
*/
WARN_ON_ONCE(task_ctx != ctx);
} else {
WARN_ON_ONCE(&cpuctx->ctx != ctx);
}
if (!event->parent) { /* * If this is a !child event, we must hold ctx::mutex to * stabilize the event->ctx relation. See * perf_event_ctx_lock().
*/
lockdep_assert_held(&ctx->mutex);
}
if (!task) {
cpu_function_call(event->cpu, event_function, &efs); return;
}
if (task == TASK_TOMBSTONE) return;
again: if (!task_function_call(task, event_function, &efs)) return;
local_irq_disable();
cpuctx = this_cpu_ptr(&perf_cpu_context);
perf_ctx_lock(cpuctx, ctx); /* * Reload the task pointer, it might have been changed by * a concurrent perf_event_context_sched_out().
*/
task = ctx->task; if (task == TASK_TOMBSTONE) goto unlock; if (ctx->is_active) {
perf_ctx_unlock(cpuctx, ctx);
local_irq_enable(); goto again;
}
func(event, NULL, ctx, data);
unlock:
perf_ctx_unlock(cpuctx, ctx);
local_irq_enable();
}
/* * Similar to event_function_call() + event_function(), but hard assumes IRQs * are already disabled and we're on the right CPU.
*/ staticvoid event_function_local(struct perf_event *event, event_f func, void *data)
{ struct perf_event_context *ctx = event->ctx; struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); struct task_struct *task = READ_ONCE(ctx->task); struct perf_event_context *task_ctx = NULL;
lockdep_assert_irqs_disabled();
if (task) { if (task == TASK_TOMBSTONE) return;
task_ctx = ctx;
}
perf_ctx_lock(cpuctx, task_ctx);
task = ctx->task; if (task == TASK_TOMBSTONE) goto unlock;
if (task) { /* * We must be either inactive or active and the right task, * otherwise we're screwed, since we cannot IPI to somewhere * else.
*/ if (ctx->is_active) { if (WARN_ON_ONCE(task != current)) goto unlock;
/*
 * perf event paranoia level:
 *  -1 - not paranoid at all
 *   0 - disallow raw tracepoint access for unpriv
 *   1 - disallow cpu events for unpriv
 *   2 - disallow kernel profiling for unpriv
 */
int sysctl_perf_event_paranoid __read_mostly = 2;

/* Minimum for 512 kiB + 1 user control page. 'free' kiB per user. */
static int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024);

/*
 * perf samples are done in some very critical code paths (NMIs).
 * If they take too much CPU time, the system can lock up and not
 * get any real work done.  This will drop the sample rate when
 * we detect that events are taking too long.
 */
/* Divisor used for the decaying per-CPU running sample length. */
#define NR_ACCUMULATED_SAMPLES 128
static DEFINE_PER_CPU(u64, running_sample_length);
/* Decay the counter by 1 average sample. */
running_len = __this_cpu_read(running_sample_length);
running_len -= running_len/NR_ACCUMULATED_SAMPLES;
running_len += sample_len_ns;
__this_cpu_write(running_sample_length, running_len);
/* * Note: this will be biased artificially low until we have * seen NR_ACCUMULATED_SAMPLES. Doing it this way keeps us * from having to maintain a count.
*/
avg_len = running_len/NR_ACCUMULATED_SAMPLES; if (avg_len <= max_len) return;
/* * Compute a throttle threshold 25% below the current duration.
*/
avg_len += avg_len / 4;
max = (TICK_NSEC / 100) * sysctl_perf_cpu_time_max_percent; if (avg_len < max)
max /= (u32)avg_len; else
max = 1;
sysctl_perf_event_sample_rate = max * HZ;
perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
if (!irq_work_queue(&perf_duration_work)) {
early_printk("perf: interrupt took too long (%lld > %lld), lowering " "kernel.perf_event_max_sample_rate to %d\n",
__report_avg, __report_allowed,
sysctl_perf_event_sample_rate);
}
}
/* * State based event timekeeping... * * The basic idea is to use event->state to determine which (if any) time * fields to increment with the current delta. This means we only need to * update timestamps when we change state or when they are explicitly requested * (read). * * Event groups make things a little more complicated, but not terribly so. The * rules for a group are that if the group leader is OFF the entire group is * OFF, irrespective of what the group member states are. This results in * __perf_effective_state(). * * A further ramification is that when a group leader flips between OFF and * !OFF, we need to update all group member times. * * * NOTE: perf_event_time() is based on the (cgroup) context time, and thus we * need to make sure the relevant context time is updated before we try and * update our timestamps.
*/
perf_event_update_time(event); /* * If a group leader gets enabled/disabled all its siblings * are affected too.
*/ if ((event->state < 0) ^ (state < 0))
perf_event_update_sibling_time(event);
WRITE_ONCE(event->state, state);
}
/*
 * UP store-release, load-acquire
 */

/*
 * __store_release - store @val to *@ptr after a compiler barrier
 * @ptr: address of the object to write
 * @val: value to store
 *
 * Expands to barrier() followed by WRITE_ONCE(), wrapped in
 * do { } while (0) so that it behaves as a single statement.
 */
#define __store_release(ptr, val)					\
do {									\
	barrier();							\
	WRITE_ONCE(*(ptr), (val));					\
} while (0)
/* @event doesn't care about cgroup */ if (!event->cgrp) returntrue;
/* wants specific cgroup scope but @cpuctx isn't associated with any */ if (!cpuctx->cgrp) returnfalse;
/* * Cgroup scoping is recursive. An event enabled for a cgroup is * also enabled for all its descendant cgroups. If @cpuctx's * cgroup is a descendant of @event's (the test covers identity * case), it's a match.
*/ return cgroup_is_descendant(cpuctx->cgrp->css.cgroup,
event->cgrp->css.cgroup);
}
/* * ensure we access cgroup data only when needed and * when we know the cgroup is pinned (css_get)
*/ if (!is_cgroup_event(event)) return;
info = this_cpu_ptr(event->cgrp->info); /* * Do not update time when cgroup is not active
*/ if (info->active)
__update_cgrp_time(info, perf_clock(), true);
}
/*
 * Reschedule events based on the cgroup constraint of @task.
 *
 * Nothing is done when this CPU's context tracks no cgroup (cpuctx->cgrp is
 * NULL) or already tracks the cgroup @task belongs to. Otherwise, with the
 * perf_ctx_lock guard held, the CPU context is scheduled out, cpuctx->cgrp
 * is switched to @task's cgroup, and the context is scheduled back in; both
 * scheduling calls pass EVENT_ALL|EVENT_CGROUP.
 */
static void perf_cgroup_switch(struct task_struct *task)
{
	struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
	struct perf_cgroup *cgrp;

	/*
	 * cpuctx->cgrp is set when the first cgroup event is enabled,
	 * and is cleared when the last cgroup event is disabled.
	 */
	if (READ_ONCE(cpuctx->cgrp) == NULL)
		return;

	cgrp = perf_cgroup_from_task(task, NULL);
	if (READ_ONCE(cpuctx->cgrp) == cgrp)
		return;

	/* Scope-based guard: held from here until the function returns. */
	guard(perf_ctx_lock)(cpuctx, cpuctx->task_ctx);
	/*
	 * Re-check, could've raced vs perf_remove_from_context().
	 */
	if (READ_ONCE(cpuctx->cgrp) == NULL)
		return;

	WARN_ON_ONCE(cpuctx->ctx.nr_cgroups == 0);

	perf_ctx_disable(&cpuctx->ctx, true);

	ctx_sched_out(&cpuctx->ctx, NULL, EVENT_ALL|EVENT_CGROUP);
	/*
	 * must not be done before ctxswout due
	 * to update_cgrp_time_from_cpuctx() in
	 * ctx_sched_out()
	 */
	cpuctx->cgrp = cgrp;
	/*
	 * set cgrp before ctxsw in to allow
	 * perf_cgroup_set_timestamp() in ctx_sched_in()
	 * to not have to pass task around
	 */
	ctx_sched_in(&cpuctx->ctx, NULL, EVENT_ALL|EVENT_CGROUP);

	perf_ctx_enable(&cpuctx->ctx, true);
}
staticint perf_cgroup_ensure_storage(struct perf_event *event, struct cgroup_subsys_state *css)
{ struct perf_cpu_context *cpuctx; struct perf_event **storage; int cpu, heap_size, ret = 0;
/* * Allow storage to have sufficient space for an iterator for each * possibly nested cgroup plus an iterator for events with no cgroup.
*/ for (heap_size = 1; css; css = css->parent)
heap_size++;
/* * all events in a group must monitor * the same cgroup because a task belongs * to only one perf cgroup at a time
*/ if (group_leader && group_leader->cgrp != cgrp) {
perf_detach_cgroup(event);
ret = -EINVAL;
} return ret;
}
/* * set default to be dependent on timer tick just * like original code
*/ #define PERF_CPU_HRTIMER (1000 / HZ) /* * function must be called with interrupts disabled
*/ staticenum hrtimer_restart perf_mux_hrtimer_handler(struct hrtimer *hr)
{ struct perf_cpu_pmu_context *cpc; bool rotations;
/* * check default is sane, if not set then force to * default interval (1/tick)
*/
interval = pmu->hrtimer_interval_ms; if (interval < 1)
interval = pmu->hrtimer_interval_ms = PERF_CPU_HRTIMER;
/*
 * Drop one reference on @ctx.
 *
 * When the last reference goes away, the references this context holds on
 * its parent context (if any) and on its task (unless ctx->task is NULL or
 * TASK_TOMBSTONE) are dropped as well, and the context is handed to
 * call_rcu() with free_ctx() as the callback.
 *
 * When other references remain and the context is already marked
 * TASK_TOMBSTONE, waiters on ctx->refcount are woken up.
 */
static void put_ctx(struct perf_event_context *ctx)
{
	if (refcount_dec_and_test(&ctx->refcount)) {
		if (ctx->parent_ctx)
			put_ctx(ctx->parent_ctx);
		if (ctx->task && ctx->task != TASK_TOMBSTONE)
			put_task_struct(ctx->task);
		call_rcu(&ctx->rcu_head, free_ctx);
	} else {
		smp_mb__after_atomic(); /* pairs with wait_var_event() */
		if (ctx->task == TASK_TOMBSTONE)
			wake_up_var(&ctx->refcount);
	}
}
/* * Because of perf_event::ctx migration in sys_perf_event_open::move_group and * perf_pmu_migrate_context() we need some magic. * * Those places that change perf_event::ctx will hold both * perf_event_ctx::mutex of the 'old' and 'new' ctx value. * * Lock ordering is by mutex address. There are two other sites where * perf_event_context::mutex nests and those are: * * - perf_event_exit_task_context() [ child , 0 ] * perf_event_exit_event() * put_event() [ parent, 1 ] * * - perf_event_init_context() [ parent, 0 ] * inherit_task_group() * inherit_group() * inherit_event() * perf_event_alloc() * perf_init_event() * perf_try_init_event() [ child , 1 ] * * While it appears there is an obvious deadlock here -- the parent and child * nesting levels are inverted between the two. This is in fact safe because * life-time rules separate them. That is an exiting task cannot fork, and a * spawning task cannot (yet) exit. * * But remember that these are parent<->child context relations, and * migration does not affect children, therefore these two orderings should not * interact. * * The change in perf_event::ctx does not affect children (as claimed above) * because the sys_perf_event_open() case will install a new event and break * the ctx parent<->child relation, and perf_pmu_migrate_context() is only * concerned with cpuctx and that doesn't have children. * * The places that change perf_event::ctx will issue: * * perf_remove_from_context(); * synchronize_rcu(); * perf_install_in_context(); * * to affect the change. The remove_from_context() + synchronize_rcu() should * quiesce the event, after which we can install it in the new location. This * means that only external vectors (perf_fops, prctl) can perturb the event * while in transit. Therefore all such accessors should also acquire * perf_event_context::mutex to serialize against this. 
* * However; because event->ctx can change while we're waiting to acquire * ctx->mutex we must be careful and use the below perf_event_ctx_lock() * function. * * Lock order: * exec_update_lock * task_struct::perf_event_mutex * perf_event_context::mutex * perf_event::child_mutex; * perf_event_context::lock * mmap_lock * perf_event::mmap_mutex * perf_buffer::aux_mutex * perf_addr_filters_head::lock * * cpu_hotplug_lock * pmus_lock * cpuctx->mutex / perf_event_context::mutex
*/ staticstruct perf_event_context *
perf_event_ctx_lock_nested(struct perf_event *event, int nesting)
{ struct perf_event_context *ctx;
/*
 * Detach @ctx from its parent context and bump its generation count.
 *
 * The caller must hold ctx->lock, which serializes this against
 * context_equiv(). Because ctx->lock nests inside scheduler related locks
 * that put_ctx() may end up taking, the parent reference cannot be dropped
 * here; the former parent (or NULL) is returned instead so the caller can
 * deal with it after unlocking.
 */
static __must_check struct perf_event_context *
unclone_ctx(struct perf_event_context *ctx)
{
	struct perf_event_context *old_parent;

	lockdep_assert_held(&ctx->lock);

	old_parent = ctx->parent_ctx;
	if (old_parent != NULL)
		ctx->parent_ctx = NULL;

	/* The generation advances even when there was no parent. */
	ctx->generation += 1;

	return old_parent;
}
static u32 perf_event_pid_type(struct perf_event *event, struct task_struct *p, enum pid_type type)
{
u32 nr; /* * only top level events have the pid namespace they were created in
*/ if (event->parent)
event = event->parent;
nr = __task_pid_nr_ns(p, type, event->ns); /* avoid -1 if it is idle thread or runs in another ns */ if (!nr && !pid_alive(p))
nr = -1; return nr;
}
/*
 * Return the id to report to userspace for @event: the parent's id for an
 * inherited event, the event's own id otherwise.
 */
static u64 primary_event_id(struct perf_event *event)
{
	if (event->parent)
		return event->parent->id;

	return event->id;
}
/* * Get the perf_event_context for a task and lock it. * * This has to cope with the fact that until it is locked, * the context could get moved to another task.
*/ staticstruct perf_event_context *
perf_lock_task_context(struct task_struct *task, unsignedlong *flags)
{ struct perf_event_context *ctx;
retry: /* * One of the few rules of preemptible RCU is that one cannot do * rcu_read_unlock() while holding a scheduler (or nested) lock when * part of the read side critical section was irqs-enabled -- see * rcu_read_unlock_special(). * * Since ctx->lock nests under rq->lock we must ensure the entire read * side critical section has interrupts disabled.
*/
local_irq_save(*flags);
rcu_read_lock();
ctx = rcu_dereference(task->perf_event_ctxp); if (ctx) { /* * If this context is a clone of another, it might * get swapped for another underneath us by * perf_event_task_sched_out, though the * rcu_read_lock() protects us from any context * getting freed. Lock the context and check if it * got swapped before we could get the lock, and retry * if so. If we locked the right context, then it * can't get swapped on us any more.
*/
raw_spin_lock(&ctx->lock); if (ctx != rcu_dereference(task->perf_event_ctxp)) {
raw_spin_unlock(&ctx->lock);
rcu_read_unlock();
local_irq_restore(*flags); goto retry;
}
/* * Get the context for a task and increment its pin_count so it * can't get swapped to another task. This also increments its * reference count so that the context can't get freed.
*/ staticstruct perf_event_context *
perf_pin_task_context(struct task_struct *task)
{ struct perf_event_context *ctx; unsignedlong flags;
/*
 * Update the record of the current time in a context.
 *
 * @ctx: context to update; ctx->lock must be held.
 * @adv: when true, ctx->time is advanced by the time elapsed since
 *       ctx->timestamp; when false only the timestamp is refreshed.
 */
static void __update_context_time(struct perf_event_context *ctx, bool adv)
{
	u64 now = perf_clock();

	lockdep_assert_held(&ctx->lock);

	if (adv)
		ctx->time += now - ctx->timestamp;
	ctx->timestamp = now;

	/*
	 * The above: time' = time + (now - timestamp), can be re-arranged
	 * into: time` = now + (time - timestamp), which gives a single value
	 * offset to compute future time without locks on.
	 *
	 * See perf_event_time_now(), which can be used from NMI context where
	 * it's (obviously) not possible to acquire ctx->lock in order to read
	 * both the above values in a consistent manner.
	 */
	WRITE_ONCE(ctx->timeoffset, ctx->time - ctx->timestamp);
}
/*
 * Helper function to initialize event group nodes: clears the event's
 * group_node and resets its group_index to 0.
 */
static void init_event_group(struct perf_event *event)
{
	RB_CLEAR_NODE(&event->group_node);
	event->group_index = 0;
}
/* * Extract pinned or flexible groups from the context * based on event attrs bits.
*/ staticstruct perf_event_groups *
get_event_groups(struct perf_event *event, struct perf_event_context *ctx)
{ if (event->attr.pinned) return &ctx->pinned_groups; else return &ctx->flexible_groups;
}
#ifdef CONFIG_CGROUP_PERF if (event->cgrp)
cgroup = event->cgrp->css.cgroup; #endif
return cgroup;
}
/* * Compare function for event groups; * * Implements complex key that first sorts by CPU and then by virtual index * which provides ordering when rotating groups for the same CPU.
*/ static __always_inline int
perf_event_groups_cmp(constint left_cpu, conststruct pmu *left_pmu, conststruct cgroup *left_cgroup, const u64 left_group_index, conststruct perf_event *right)
{ if (left_cpu < right->cpu) return -1; if (left_cpu > right->cpu) return 1;
if (left_pmu) { if (left_pmu < right->pmu_ctx->pmu) return -1; if (left_pmu > right->pmu_ctx->pmu) return 1;
}
if (left_cgroup != right_cgroup) { if (!left_cgroup) { /* * Left has no cgroup but right does, no * cgroups come first.
*/ return -1;
} if (!right_cgroup) { /* * Right has no cgroup but left does, no * cgroups come first.
*/ return 1;
} /* Two dissimilar cgroups, order by id. */ if (cgroup_id(left_cgroup) < cgroup_id(right_cgroup)) return -1;
return 1;
}
} #endif
if (left_group_index < right->group_index) return -1; if (left_group_index > right->group_index) return 1;
/*
 * Helper function to insert event into the pinned or flexible groups.
 * Which of the two is used is decided by get_event_groups().
 */
static void
add_event_to_groups(struct perf_event *event, struct perf_event_context *ctx)
{
	struct perf_event_groups *groups;

	groups = get_event_groups(event, ctx);
	perf_event_groups_insert(groups, event);
}
/* * Delete a group from a tree.
*/ staticvoid
perf_event_groups_delete(struct perf_event_groups *groups, struct perf_event *event)
{
WARN_ON_ONCE(RB_EMPTY_NODE(&event->group_node) ||
RB_EMPTY_ROOT(&groups->tree));
/*
 * Iterate through the whole groups tree.
 *
 * @event:  loop cursor; set to each entry in turn, walking the tree from
 *          rb_first() via rb_next(), and NULL once the walk is finished.
 * @groups: the perf_event_groups whose ->tree is walked.
 *
 * @event is evaluated several times, so it must be a plain variable.
 */
#define perf_event_groups_for_each(event, groups)			\
	for (event = rb_entry_safe(rb_first(&((groups)->tree)),		\
				typeof(*event), group_node); event;	\
		event = rb_entry_safe(rb_next(&event->group_node),	\
				typeof(*event), group_node))
/* * Does the event attribute request inherit with PERF_SAMPLE_READ
*/ staticinlinebool has_inherit_and_sample_read(struct perf_event_attr *attr)
{ return attr->inherit && (attr->sample_type & PERF_SAMPLE_READ);
}
/* * Add an event from the lists for its context. * Must be called with ctx->mutex and ctx->lock held.
*/ staticvoid
list_add_event(struct perf_event *event, struct perf_event_context *ctx)
{
lockdep_assert_held(&ctx->lock);
/* * If we're a stand alone event or group leader, we go to the context * list, group events are kept attached to the group so that * perf_group_detach can, at all times, locate all siblings.
*/ if (event->group_leader == event) {
event->group_caps = event->event_caps;
add_event_to_groups(event, ctx);
}
list_add_rcu(&event->event_entry, &ctx->event_list);
ctx->nr_events++; if (event->hw.flags & PERF_EVENT_FLAG_USER_READ_CNT)
ctx->nr_user++; if (event->attr.inherit_stat)
ctx->nr_stat++; if (has_inherit_and_sample_read(&event->attr))
local_inc(&ctx->nr_no_switch_fast);
if (event->state > PERF_EVENT_STATE_OFF)
perf_cgroup_event_enable(event, ctx);
ctx->generation++;
event->pmu_ctx->nr_events++;
}
/*
 * Initialize event state based on the perf_event_attr::disabled:
 * PERF_EVENT_STATE_OFF when the attribute is set,
 * PERF_EVENT_STATE_INACTIVE otherwise.
 */
static inline void perf_event__state_init(struct perf_event *event)
{
	event->state = event->attr.disabled ? PERF_EVENT_STATE_OFF :
					      PERF_EVENT_STATE_INACTIVE;
}
/*
 * Compute the size in bytes of the data read for an event.
 *
 * @read_format: PERF_FORMAT_* bits selecting which fields are present.
 * @nr_siblings: number of siblings in the event's group; only used when
 *               PERF_FORMAT_GROUP is set.
 *
 * The result is a fixed part plus one entry per counted event:
 *  - fixed part: one u64 each for PERF_FORMAT_TOTAL_TIME_ENABLED,
 *    PERF_FORMAT_TOTAL_TIME_RUNNING and PERF_FORMAT_GROUP;
 *  - entry: one u64 for the value, plus one u64 each for PERF_FORMAT_ID
 *    and PERF_FORMAT_LOST;
 *  - number of entries: 1, or 1 + @nr_siblings with PERF_FORMAT_GROUP.
 */
static int __perf_event_read_size(u64 read_format, int nr_siblings)
{
	int entry = sizeof(u64); /* value */
	int size = 0;
	int nr = 1;

	if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
		size += sizeof(u64);

	if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
		size += sizeof(u64);

	if (read_format & PERF_FORMAT_ID)
		entry += sizeof(u64);

	if (read_format & PERF_FORMAT_LOST)
		entry += sizeof(u64);

	if (read_format & PERF_FORMAT_GROUP) {
		nr += nr_siblings;
		size += sizeof(u64);
	}

	/*
	 * Since perf_event_validate_size() limits this to 16k and inhibits
	 * adding more siblings, this will never overflow.
	 */
	return size + nr * entry;
}
if (sample_type & PERF_SAMPLE_IP)
size += sizeof(data->ip);
if (sample_type & PERF_SAMPLE_ADDR)
size += sizeof(data->addr);
if (sample_type & PERF_SAMPLE_PERIOD)
size += sizeof(data->period);
if (sample_type & PERF_SAMPLE_WEIGHT_TYPE)
size += sizeof(data->weight.full);
if (sample_type & PERF_SAMPLE_READ)
size += event->read_size;
if (sample_type & PERF_SAMPLE_DATA_SRC)
size += sizeof(data->data_src.val);
if (sample_type & PERF_SAMPLE_TRANSACTION)
size += sizeof(data->txn);
if (sample_type & PERF_SAMPLE_PHYS_ADDR)
size += sizeof(data->phys_addr);
if (sample_type & PERF_SAMPLE_CGROUP)
size += sizeof(data->cgroup);
if (sample_type & PERF_SAMPLE_DATA_PAGE_SIZE)
size += sizeof(data->data_page_size);
if (sample_type & PERF_SAMPLE_CODE_PAGE_SIZE)
size += sizeof(data->code_page_size);
event->header_size = size;
}
/*
 * Called at perf_event creation and when events are attached/detached from a
 * group.
 *
 * Recomputes event->read_size from the event's read_format and its group
 * leader's sibling count, then lets __perf_event_header_size() refresh the
 * header size for the event's sample_type. The read size is stored first,
 * before that call.
 */
static void perf_event__header_size(struct perf_event *event)
{
	event->read_size =
		__perf_event_read_size(event->attr.read_format,
				       event->group_leader->nr_siblings);
	__perf_event_header_size(event, event->attr.sample_type);
}
if (sample_type & PERF_SAMPLE_TID)
size += sizeof(data->tid_entry);
if (sample_type & PERF_SAMPLE_TIME)
size += sizeof(data->time);
if (sample_type & PERF_SAMPLE_IDENTIFIER)
size += sizeof(data->id);
if (sample_type & PERF_SAMPLE_ID)
size += sizeof(data->id);
if (sample_type & PERF_SAMPLE_STREAM_ID)
size += sizeof(data->stream_id);
if (sample_type & PERF_SAMPLE_CPU)
size += sizeof(data->cpu_entry);
event->id_header_size = size;
}
/* * Check that adding an event to the group does not result in anybody * overflowing the 64k event limit imposed by the output buffer. * * Specifically, check that the read_size for the event does not exceed 16k, * read_size being the one term that grows with groups size. Since read_size * depends on per-event read_format, also (re)check the existing events. * * This leaves 48k for the constant size fields and things like callchains, * branch stacks and register sets.
*/ staticbool perf_event_validate_size(struct perf_event *event)
{ struct perf_event *sibling, *group_leader = event->group_leader;
if (__perf_event_read_size(event->attr.read_format,
group_leader->nr_siblings + 1) > 16*1024) returnfalse;
if (__perf_event_read_size(group_leader->attr.read_format,
group_leader->nr_siblings + 1) > 16*1024) returnfalse;
/* * When creating a new group leader, group_leader->ctx is initialized * after the size has been validated, but we cannot safely use * for_each_sibling_event() until group_leader->ctx is set. A new group * leader cannot have any siblings yet, so we can safely skip checking * the non-existent siblings.
*/ if (event == group_leader) returntrue;
/* * Remove an event from the lists for its context. * Must be called with ctx->mutex and ctx->lock held.
*/ staticvoid
list_del_event(struct perf_event *event, struct perf_event_context *ctx)
{
WARN_ON_ONCE(event->ctx != ctx);
lockdep_assert_held(&ctx->lock);
/* * We can have double detach due to exit/hot-unplug + close.
*/ if (!(event->attach_state & PERF_ATTACH_CONTEXT)) return;
event->attach_state &= ~PERF_ATTACH_CONTEXT;
ctx->nr_events--; if (event->hw.flags & PERF_EVENT_FLAG_USER_READ_CNT)
ctx->nr_user--; if (event->attr.inherit_stat)
ctx->nr_stat--; if (has_inherit_and_sample_read(&event->attr))
local_dec(&ctx->nr_no_switch_fast);
list_del_rcu(&event->event_entry);
if (event->group_leader == event)
del_event_from_groups(event, ctx);
/* * If event uses aux_event tear down the link
*/ if (event->aux_event) {
iter = event->aux_event;
event->aux_event = NULL;
put_event(iter); return;
}
/* * If the event is an aux_event, tear down all links to * it from other events.
*/
for_each_sibling_event(iter, event) { if (iter->aux_event != event) continue;
iter->aux_event = NULL;
put_event(event);
/* * If it's ACTIVE, schedule it out and put it into ERROR * state so that we don't try to schedule it again. Note * that perf_event_enable() will clear the ERROR status.
*/
__event_disable(iter, ctx, PERF_EVENT_STATE_ERROR);
}
}
staticint perf_get_aux_event(struct perf_event *event, struct perf_event *group_leader)
{ /* * Our group leader must be an aux event if we want to be * an aux_output. This way, the aux event will precede its * aux_output events in the group, and therefore will always * schedule first.
*/ if (!group_leader) return 0;
/* * aux_output and aux_sample_size are mutually exclusive.
*/ if (event->attr.aux_output && event->attr.aux_sample_size) return 0;
if (event->attr.aux_output &&
!perf_aux_output_match(event, group_leader)) return 0;
if ((event->attr.aux_pause || event->attr.aux_resume) &&
!(group_leader->pmu->capabilities & PERF_PMU_CAP_AUX_PAUSE)) return 0;
if (event->attr.aux_sample_size && !group_leader->pmu->snapshot_aux) return 0;
if (!atomic_long_inc_not_zero(&group_leader->refcount)) return 0;
/* * Link aux_outputs to their aux event; this is undone in * perf_group_detach() by perf_put_aux_event(). When the * group is torn down, the aux_output events lose their * link to the aux_event and can't schedule any more.
*/
event->aux_event = group_leader;
/* * We can have double detach due to exit/hot-unplug + close.
*/ if (!(event->attach_state & PERF_ATTACH_GROUP)) return;
event->attach_state &= ~PERF_ATTACH_GROUP;
perf_put_aux_event(event);
/* * If this is a sibling, remove it from its group.
*/ if (leader != event) {
list_del_init(&event->sibling_list);
event->group_leader->nr_siblings--;
event->group_leader->group_generation++; goto out;
}
/* * If this was a group event with sibling events then * upgrade the siblings to singleton events by adding them * to whatever list we are on.
*/
list_for_each_entry_safe(sibling, tmp, &event->sibling_list, sibling_list) {
/* * Events that have PERF_EV_CAP_SIBLING require being part of * a group and cannot exist on their own, schedule them out * and move them into the ERROR state. Also see * _perf_event_enable(), it will not be able to recover this * ERROR state.
*/ if (sibling->event_caps & PERF_EV_CAP_SIBLING)
__event_disable(sibling, ctx, PERF_EVENT_STATE_ERROR);
if (event->state != PERF_EVENT_STATE_ACTIVE) return;
/* * Asymmetry; we only schedule events _IN_ through ctx_sched_in(), but * we can schedule events _OUT_ individually through things like * __perf_remove_from_context().
*/
list_del_init(&event->active_list);
perf_pmu_disable(event->pmu);
event->pmu->del(event, 0);
event->oncpu = -1;
if (event->pending_disable) {
event->pending_disable = 0;
perf_cgroup_event_disable(event, ctx);
state = PERF_EVENT_STATE_OFF;
}
perf_event_set_state(event, state);
if (!is_software_event(event))
cpc->active_oncpu--; if (is_event_in_freq_mode(event)) {
ctx->nr_freq--;
epc->nr_freq--;
} if (event->attr.exclusive || !cpc->active_oncpu)
cpc->exclusive = 0;
/* * Cross CPU call to remove a performance event * * We disable the event on the hardware level first. After that we * remove it from the context list.
*/ staticvoid
__perf_remove_from_context(struct perf_event *event, struct perf_cpu_context *cpuctx, struct perf_event_context *ctx, void *info)
{ struct perf_event_pmu_context *pmu_ctx = event->pmu_ctx; enum perf_event_state state = PERF_EVENT_STATE_OFF; unsignedlong flags = (unsignedlong)info;
ctx_time_update(cpuctx, ctx);
/* * Ensure event_sched_out() switches to OFF, at the very least * this avoids raising perf_pending_task() at this time.
*/ if (flags & DETACH_EXIT)
state = PERF_EVENT_STATE_EXIT; if (flags & DETACH_REVOKE)
state = PERF_EVENT_STATE_REVOKED; if (flags & DETACH_DEAD)
state = PERF_EVENT_STATE_DEAD;
event_sched_out(event, ctx);
if (event->state > PERF_EVENT_STATE_OFF)
perf_cgroup_event_disable(event, ctx);
/* * Remove the event from a task's (or a CPU's) list of events. * * If event->ctx is a cloned context, callers must make sure that * every task struct that event->ctx->task could possibly point to * remains valid. This is OK when called from perf_release since * that only calls us on the top-level context, which can't be a clone. * When called from perf_event_exit_task, it's OK because the * context has been detached from its task.
*/ staticvoid perf_remove_from_context(struct perf_event *event, unsignedlong flags)
{ struct perf_event_context *ctx = event->ctx;
lockdep_assert_held(&ctx->mutex);
/* * Because of perf_event_exit_task(), perf_remove_from_context() ought * to work in the face of TASK_TOMBSTONE, unlike every other * event_function_call() user.
*/
raw_spin_lock_irq(&ctx->lock); if (!ctx->is_active) {
__perf_remove_from_context(event, this_cpu_ptr(&perf_cpu_context),
ctx, (void *)flags);
raw_spin_unlock_irq(&ctx->lock); return;
}
raw_spin_unlock_irq(&ctx->lock);
/* * When disabling a group leader, the whole group becomes ineligible * to run, so schedule out the full group.
*/ if (event == event->group_leader)
group_sched_out(event, ctx);
/* * But only mark the leader OFF; the siblings will remain * INACTIVE.
*/
__event_disable(event, ctx, PERF_EVENT_STATE_OFF);
perf_pmu_enable(event->pmu_ctx->pmu);
}
/* * Disable an event. * * If event->ctx is a cloned context, callers must make sure that * every task struct that event->ctx->task could possibly point to * remains valid. This condition is satisfied when called through * perf_event_for_each_child or perf_event_for_each because they * hold the top-level event's child_mutex, so any descendant that * goes to exit will block in perf_event_exit_event(). * * When called from perf_pending_disable it's OK because event->ctx * is the current context on this CPU and preemption is disabled, * hence we can't get into perf_event_task_sched_out for this context.
*/ staticvoid _perf_event_disable(struct perf_event *event)
{ struct perf_event_context *ctx = event->ctx;
raw_spin_lock_irq(&ctx->lock); if (event->state <= PERF_EVENT_STATE_OFF) {
raw_spin_unlock_irq(&ctx->lock); return;
}
raw_spin_unlock_irq(&ctx->lock);
/* * Strictly speaking kernel users cannot create groups and therefore this * interface does not need the perf_event_ctx_lock() magic.
*/ void perf_event_disable(struct perf_event *event)
{ struct perf_event_context *ctx;
if (event->state <= PERF_EVENT_STATE_OFF) return 0;
WRITE_ONCE(event->oncpu, smp_processor_id()); /* * Order event::oncpu write to happen before the ACTIVE state is * visible. This allows perf_event_{stop,read}() to observe the correct * ->oncpu if it sees ACTIVE.
*/
smp_wmb();
perf_event_set_state(event, PERF_EVENT_STATE_ACTIVE);
/* * Unthrottle events, since we scheduled we might have missed several * ticks already, also for a heavily scheduling task there is little * guarantee it'll get a tick in a timely manner.
*/ if (unlikely(event->hw.interrupts == MAX_INTERRUPTS))
perf_event_unthrottle(event, false);
perf_pmu_disable(event->pmu);
perf_log_itrace_start(event);
if (event->pmu->add(event, PERF_EF_START)) {
perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE);
event->oncpu = -1;
ret = -EAGAIN; goto out;
}
if (!is_software_event(event))
cpc->active_oncpu++; if (is_event_in_freq_mode(event)) {
ctx->nr_freq++;
epc->nr_freq++;
} if (event->attr.exclusive)
cpc->exclusive = 1;
if (group_event->state == PERF_EVENT_STATE_OFF) return 0;
pmu->start_txn(pmu, PERF_PMU_TXN_ADD);
if (event_sched_in(group_event, ctx)) goto error;
/* * Schedule in siblings as one group (if any):
*/
for_each_sibling_event(event, group_event) { if (event_sched_in(event, ctx)) {
partial_group = event; goto group_error;
}
}
if (!pmu->commit_txn(pmu)) return 0;
group_error: /* * Groups can be scheduled in as one unit only, so undo any * partial group before returning: * The events up to the failed event are scheduled out normally.
*/
for_each_sibling_event(event, group_event) { if (event == partial_group) break;
/*
 * Work out whether we can put this event group on the CPU now.
 *
 * @event:      event whose group is being considered
 * @can_add_hw: verdict carried over from the groups considered before this
 *              one; returned unchanged when no rule below decides earlier
 *
 * Returns 1 when the group may go on, 0 when it may not.
 */
static int group_can_go_on(struct perf_event *event, int can_add_hw)
{
	struct perf_event_pmu_context *epc = event->pmu_ctx;
	struct perf_cpu_pmu_context *cpc = this_cpc(epc->pmu);

	/*
	 * Groups consisting entirely of software events can always go on.
	 */
	if (event->group_caps & PERF_EV_CAP_SOFTWARE)
		return 1;

	/*
	 * If an exclusive group is already on, no other hardware
	 * events can go on.
	 */
	if (cpc->exclusive)
		return 0;

	/*
	 * If this group is exclusive and there are already
	 * events on the CPU, it can't go on.
	 */
	if (event->attr.exclusive && !list_empty(get_event_list(event)))
		return 0;

	/*
	 * Otherwise, try to add it if all previous groups were able
	 * to go on.
	 */
	return can_add_hw;
}
/* * We want to maintain the following priority of scheduling: * - CPU pinned (EVENT_CPU | EVENT_PINNED) * - task pinned (EVENT_PINNED) * - CPU flexible (EVENT_CPU | EVENT_FLEXIBLE) * - task flexible (EVENT_FLEXIBLE). * * In order to avoid unscheduling and scheduling back in everything every * time an event is added, only do it for the groups of equal priority and * below. * * This can be called after a batch operation on task events, in which case * event_type is a bit mask of the types of events involved. For CPU events, * event_type is only either EVENT_PINNED or EVENT_FLEXIBLE.
*/ staticvoid ctx_resched(struct perf_cpu_context *cpuctx, struct perf_event_context *task_ctx, struct pmu *pmu, enum event_type_t event_type)
{ bool cpu_event = !!(event_type & EVENT_CPU); struct perf_event_pmu_context *epc;
/* * If pinned groups are involved, flexible groups also need to be * scheduled out.
*/ if (event_type & EVENT_PINNED)
event_type |= EVENT_FLEXIBLE;
if (task_ctx) {
for_each_epc(epc, task_ctx, pmu, false)
perf_pmu_disable(epc->pmu);
task_ctx_sched_out(task_ctx, pmu, event_type);
}
/* * Decide which cpu ctx groups to schedule out based on the types * of events that caused rescheduling: * - EVENT_CPU: schedule out corresponding groups; * - EVENT_PINNED task events: schedule out EVENT_FLEXIBLE groups; * - otherwise, do nothing more.
*/ if (cpu_event)
ctx_sched_out(&cpuctx->ctx, pmu, event_type); elseif (event_type & EVENT_PINNED)
ctx_sched_out(&cpuctx->ctx, pmu, EVENT_FLEXIBLE);
/* * Cross CPU call to install and enable a performance event * * Very similar to remote_function() + event_function() but cannot assume that * things like ctx->is_active and cpuctx->task_ctx are set.
*/ staticint __perf_install_in_context(void *info)
{ struct perf_event *event = info; struct perf_event_context *ctx = event->ctx; struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); struct perf_event_context *task_ctx = cpuctx->task_ctx; bool reprogram = true; int ret = 0;
raw_spin_lock(&cpuctx->ctx.lock); if (ctx->task) {
raw_spin_lock(&ctx->lock);
task_ctx = ctx;
reprogram = (ctx->task == current);
/* * If the task is running, it must be running on this CPU, * otherwise we cannot reprogram things. * * If it's not running, we don't care, ctx->lock will * serialize against it becoming runnable.
*/ if (task_curr(ctx->task) && !reprogram) {
ret = -ESRCH; goto unlock;
}
#ifdef CONFIG_CGROUP_PERF if (event->state > PERF_EVENT_STATE_OFF && is_cgroup_event(event)) { /* * If the current cgroup doesn't match the event's * cgroup, we should not try to schedule it.
*/ struct perf_cgroup *cgrp = perf_cgroup_from_task(current, ctx);
reprogram = cgroup_is_descendant(cgrp->css.cgroup,
event->cgrp->css.cgroup);
} #endif
/*
 * Attach a performance event to a context.
 *
 * Very similar to event_function_call, see comment there.
 *
 * @ctx:   context the event is attached to; a NULL ctx->task selects the
 *         per-CPU install path, otherwise the task path is taken
 * @event: event to install; event->ctx is published from here
 * @cpu:   CPU used for the per-CPU install path; warned about if it
 *         disagrees with a specific (!= -1) event->cpu
 */
static void
perf_install_in_context(struct perf_event_context *ctx,
			struct perf_event *event,
			int cpu)
{
	struct task_struct *task = READ_ONCE(ctx->task);

	if (event->cpu != -1)
		WARN_ON_ONCE(event->cpu != cpu);

	/*
	 * Ensures that if we can observe event->ctx, both the event and ctx
	 * will be 'complete'. See perf_iterate_sb_cpu().
	 */
	smp_store_release(&event->ctx, ctx);

	/*
	 * perf_event_attr::disabled events will not run and can be initialized
	 * without IPI. Except when this is the first event for the context, in
	 * that case we need the magic of the IPI to set ctx->is_active.
	 *
	 * The IOC_ENABLE that is sure to follow the creation of a disabled
	 * event will issue the IPI and reprogram the hardware.
	 */
	if (__perf_effective_state(event) == PERF_EVENT_STATE_OFF &&
	    ctx->nr_events && !is_cgroup_event(event)) {
		raw_spin_lock_irq(&ctx->lock);
		if (ctx->task == TASK_TOMBSTONE) {
			raw_spin_unlock_irq(&ctx->lock);
			return;
		}
		add_event_to_ctx(event, ctx);
		raw_spin_unlock_irq(&ctx->lock);
		return;
	}

	/* No task: this is a CPU context, install via a call on @cpu. */
	if (!task) {
		cpu_function_call(cpu, __perf_install_in_context, event);
		return;
	}

	/*
	 * Should not happen, we validate the ctx is still alive before calling.
	 */
	if (WARN_ON_ONCE(task == TASK_TOMBSTONE))
		return;

	/*
	 * Installing events is tricky because we cannot rely on ctx->is_active
	 * to be set in case this is the nr_events 0 -> 1 transition.
	 *
	 * Instead we use task_curr(), which tells us if the task is running.
	 * However, since we use task_curr() outside of rq::lock, we can race
	 * against the actual state. This means the result can be wrong.
	 *
	 * If we get a false positive, we retry, this is harmless.
	 *
	 * If we get a false negative, things are complicated. If we are after
	 * perf_event_context_sched_in() ctx::lock will serialize us, and the
	 * value must be correct. If we're before, it doesn't matter since
	 * perf_event_context_sched_in() will program the counter.
	 *
	 * However, this hinges on the remote context switch having observed
	 * our task->perf_event_ctxp[] store, such that it will in fact take
	 * ctx::lock in perf_event_context_sched_in().
	 *
	 * We do this by task_function_call(), if the IPI fails to hit the task
	 * we know any future context switch of task must see the
	 * perf_event_ctxp[] store.
	 */

	/*
	 * This smp_mb() orders the task->perf_event_ctxp[] store with the
	 * task_cpu() load, such that if the IPI then does not find the task
	 * running, a future context switch of that task must observe the
	 * store.
	 */
	smp_mb();
again:
	/* A zero return means the install ran on the task's CPU; done. */
	if (!task_function_call(task, __perf_install_in_context, event))
		return;

	raw_spin_lock_irq(&ctx->lock);
	task = ctx->task;
	if (WARN_ON_ONCE(task == TASK_TOMBSTONE)) {
		/*
		 * Cannot happen because we already checked above (which also
		 * cannot happen), and we hold ctx->mutex, which serializes us
		 * against perf_event_exit_task_context().
		 */
		raw_spin_unlock_irq(&ctx->lock);
		return;
	}
	/*
	 * If the task is not running, ctx->lock will avoid it becoming so,
	 * thus we can safely install the event.
	 */
	if (task_curr(task)) {
		raw_spin_unlock_irq(&ctx->lock);
		goto again;
	}
	add_event_to_ctx(event, ctx);
	raw_spin_unlock_irq(&ctx->lock);
}
/* * If the event is in a group and isn't the group leader, * then don't put it on unless the group is on.
*/ if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE) return;
task_ctx = cpuctx->task_ctx; if (ctx->task)
WARN_ON_ONCE(task_ctx != ctx);
/* * Enable an event. * * If event->ctx is a cloned context, callers must make sure that * every task struct that event->ctx->task could possibly point to * remains valid. This condition is satisfied when called through * perf_event_for_each_child or perf_event_for_each as described * for perf_event_disable.
*/ staticvoid _perf_event_enable(struct perf_event *event)
{ struct perf_event_context *ctx = event->ctx;
/* * If the event is in error state, clear that first. * * That way, if we see the event in error state below, we know that it * has gone back into error state, as distinct from the task having * been scheduled away before the cross-call arrived.
*/ if (event->state == PERF_EVENT_STATE_ERROR) { /* * Detached SIBLING events cannot leave ERROR state.
*/ if (event->event_caps & PERF_EV_CAP_SIBLING &&
event->group_leader == event) goto out;
/* if it's already INACTIVE, do nothing */ if (READ_ONCE(event->state) != PERF_EVENT_STATE_ACTIVE) return 0;
/* matches smp_wmb() in event_sched_in() */
smp_rmb();
/* * There is a window with interrupts enabled before we get here, * so we need to check again lest we try to stop another CPU's event.
*/ if (READ_ONCE(event->oncpu) != smp_processor_id()) return -EAGAIN;
event->pmu->stop(event, PERF_EF_UPDATE);
/* * May race with the actual stop (through perf_pmu_output_stop()), * but it is only used for events with AUX ring buffer, and such * events will refuse to restart because of rb::aux_mmap_count==0, * see comments in perf_aux_output_begin(). * * Since this is happening on an event-local CPU, no trace is lost * while restarting.
*/ if (sd->restart)
event->pmu->start(event, 0);
return 0;
}
/*
 * Stop @event on the CPU it is currently active on, optionally restarting it.
 *
 * @event:   event to stop
 * @restart: handed to __perf_event_stop() through struct stop_event_data
 *
 * Returns 0 when the event is not ACTIVE, otherwise the result of the
 * cross-CPU call. The call is repeated for as long as it reports -EAGAIN.
 */
static int perf_event_stop(struct perf_event *event, int restart)
{
	struct stop_event_data sd = {
		.event		= event,
		.restart	= restart,
	};
	int ret = 0;

	do {
		if (READ_ONCE(event->state) != PERF_EVENT_STATE_ACTIVE)
			return 0;

		/* matches smp_wmb() in event_sched_in() */
		smp_rmb();

		/*
		 * We only want to restart ACTIVE events, so if the event goes
		 * inactive here (event->oncpu==-1), there's nothing more to do;
		 * fall through with ret==-ENXIO.
		 */
		ret = cpu_function_call(READ_ONCE(event->oncpu),
					__perf_event_stop, &sd);
	} while (ret == -EAGAIN);

	return ret;
}
/* * In order to contain the amount of racy and tricky in the address filter * configuration management, it is a two part process: * * (p1) when userspace mappings change as a result of (1) or (2) or (3) below, * we update the addresses of corresponding vmas in * event::addr_filter_ranges array and bump the event::addr_filters_gen; * (p2) when an event is scheduled in (pmu::add), it calls * perf_event_addr_filters_sync() which calls pmu::addr_filters_sync() * if the generation has changed since the previous call. * * If (p1) happens while the event is active, we restart it to force (p2). * * (1) perf_addr_filters_apply(): adjusting filters' offsets based on * pre-existing mappings, called once when new filters arrive via SET_FILTER * ioctl; * (2) perf_addr_filters_adjust(): adjusting filters' offsets based on newly * registered mapping, called for every new mmap(), with mm::mmap_lock down * for reading; * (3) perf_event_addr_filters_exec(): clearing filters' offsets in the process * of exec.
*/ void perf_event_addr_filters_sync(struct perf_event *event)
{ struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);
/* * Copy event-type-independent attributes that may be modified.
*/ staticvoid perf_event_modify_copy_attr(struct perf_event_attr *to, conststruct perf_event_attr *from)
{
to->sig_data = from->sig_data;
}
/*
 * Apply a modified attribute set to @event and to every event on its
 * child_list.
 *
 * @event: event to modify
 * @attr:  new attributes; attr->type must equal event->attr.type
 *
 * Returns 0 on success, -EINVAL when the types differ, -EOPNOTSUPP for an
 * event type without a modify handler, or the first error returned by the
 * type-specific handler (remaining children are then left untouched).
 */
static int perf_event_modify_attr(struct perf_event *event,
				  struct perf_event_attr *attr)
{
	int (*func)(struct perf_event *, struct perf_event_attr *);
	struct perf_event *child;
	int err;

	if (event->attr.type != attr->type)
		return -EINVAL;

	switch (event->attr.type) {
	case PERF_TYPE_BREAKPOINT:
		func = perf_event_modify_breakpoint;
		break;
	default:
		/* Place holder for future additions. */
		return -EOPNOTSUPP;
	}

	WARN_ON_ONCE(event->ctx->parent_ctx);

	mutex_lock(&event->child_mutex);
	/*
	 * Event-type-independent attributes must be copied before event-type
	 * modification, which will validate that final attributes match the
	 * source attributes after all relevant attributes have been copied.
	 */
	perf_event_modify_copy_attr(&event->attr, attr);
	err = func(event, attr);
	if (err)
		goto out;

	list_for_each_entry(child, &event->child_list, child_list) {
		perf_event_modify_copy_attr(&child->attr, attr);
		err = func(child, attr);
		if (err)
			goto out;
	}
out:
	mutex_unlock(&event->child_mutex);
	return err;
}
if (event_type & EVENT_FLEXIBLE) {
list_for_each_entry_safe(event, tmp,
&pmu_ctx->flexible_active,
active_list)
group_sched_out(event, ctx); /* * Since we cleared EVENT_FLEXIBLE, also clear * rotate_necessary, it will be reset by * ctx_flexible_sched_in() when needed.
*/
pmu_ctx->rotate_necessary = 0;
}
perf_pmu_enable(pmu);
}
/* * Be very careful with the @pmu argument since this will change ctx state. * The @pmu argument works for ctx_resched(), because that is symmetric in * ctx_sched_out() / ctx_sched_in() usage and the ctx state ends up invariant. * * However, if you were to be asymmetrical, you could end up with messed up * state, eg. ctx->is_active cleared even though most EPCs would still actually * be active.
*/ staticvoid
ctx_sched_out(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t event_type)
{ struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); struct perf_event_pmu_context *pmu_ctx; int is_active = ctx->is_active; bool cgroup = event_type & EVENT_CGROUP;
event_type &= ~EVENT_CGROUP;
lockdep_assert_held(&ctx->lock);
if (likely(!ctx->nr_events)) { /* * See __perf_remove_from_context().
*/
WARN_ON_ONCE(ctx->is_active); if (ctx->task)
WARN_ON_ONCE(cpuctx->task_ctx); return;
}
/* * Always update time if it was set; not only when it changes. * Otherwise we can 'forget' to update time for any but the last * context we sched out. For example: * * ctx_sched_out(.event_type = EVENT_FLEXIBLE) * ctx_sched_out(.event_type = EVENT_PINNED) * * would only update time for the pinned events.
*/
__ctx_time_update(cpuctx, ctx, ctx == &cpuctx->ctx);
/* * CPU-release for the below ->is_active store, * see __load_acquire() in perf_event_time_now()
*/
barrier();
ctx->is_active &= ~event_type;
if (!(ctx->is_active & EVENT_ALL)) { /* * For FROZEN, preserve TIME|FROZEN such that perf_event_time_now() * does not observe a hole. perf_ctx_unlock() will clean up.
*/ if (ctx->is_active & EVENT_FROZEN)
ctx->is_active &= EVENT_TIME_FROZEN; else
ctx->is_active = 0;
}
if (ctx->task) {
WARN_ON_ONCE(cpuctx->task_ctx != ctx); if (!(ctx->is_active & EVENT_ALL))
cpuctx->task_ctx = NULL;
}
/* * Test whether two contexts are equivalent, i.e. whether they have both been * cloned from the same version of the same context. * * Equivalence is measured using a generation number in the context that is * incremented on each modification to it; see unclone_ctx(), list_add_event() * and list_del_event().
*/ staticint context_equiv(struct perf_event_context *ctx1, struct perf_event_context *ctx2)
{
lockdep_assert_held(&ctx1->lock);
lockdep_assert_held(&ctx2->lock);
/* Pinning disables the swap optimization */ if (ctx1->pin_count || ctx2->pin_count) return 0;
/* If ctx1 is the parent of ctx2 */ if (ctx1 == ctx2->parent_ctx && ctx1->generation == ctx2->parent_gen) return 1;
/* If ctx2 is the parent of ctx1 */ if (ctx1->parent_ctx == ctx2 && ctx1->parent_gen == ctx2->generation) return 1;
/* * If ctx1 and ctx2 have the same parent; we flatten the parent * hierarchy, see perf_event_init_context().
*/ if (ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx &&
ctx1->parent_gen == ctx2->parent_gen) return 1;
/* * Update the event value, we cannot use perf_event_read() * because we're in the middle of a context switch and have IRQs * disabled, which upsets smp_call_function_single(), however * we know the event must be on the current CPU, therefore we * don't need to use it.
*/
perf_pmu_read(event);
perf_event_update_time(event);
/* * In order to keep per-task stats reliable we need to flip the event * values when we flip the contexts.
*/
value = local64_read(&next_event->count);
value = local64_xchg(&event->count, value);
local64_set(&next_event->count, value);
/* If neither context have a parent context; they cannot be clones. */ if (!parent && !next_parent) goto unlock;
if (next_parent == ctx || next_ctx == parent || next_parent == parent) { /* * Looks like the two contexts are clones, so we might be * able to optimize the context switch. We lock both * contexts and check that they are clones under the * lock (including re-checking that neither has been * uncloned in the meantime). It doesn't matter which * order we take the locks because no other cpu could * be trying to lock both of these tasks.
*/
raw_spin_lock(&ctx->lock);
raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING); if (context_equiv(ctx, next_ctx)) {
perf_ctx_disable(ctx, false);
/* PMIs are disabled; ctx->nr_no_switch_fast is stable. */ if (local_read(&ctx->nr_no_switch_fast) ||
local_read(&next_ctx->nr_no_switch_fast)) { /* * Must not swap out ctx when there's pending * events that rely on the ctx->task relation. * * Likewise, when a context contains inherit + * SAMPLE_READ events they should be switched * out using the slow path so that they are * treated as if they were distinct contexts.
*/
raw_spin_unlock(&next_ctx->lock);
rcu_read_unlock(); goto inside_switch;
}
/* * RCU_INIT_POINTER here is safe because we've not * modified the ctx and the above modification of * ctx->task is immaterial since this value is * always verified under ctx->lock which we're now * holding.
*/
RCU_INIT_POINTER(task->perf_event_ctxp, next_ctx);
RCU_INIT_POINTER(next->perf_event_ctxp, ctx);
if (!cpc->sched_cb_usage++)
list_add(&cpc->sched_cb_entry, this_cpu_ptr(&sched_cb_list));
barrier();
this_cpu_inc(perf_sched_cb_usages);
}
/* * This function provides the context switch callback to the lower code * layer. It is invoked ONLY when the context switch callback is enabled. * * This callback is relevant even to per-cpu events; for example multi event * PEBS requires this to provide PID/TID information. This requires we flush * all queued PEBS records before we context switch to a new task.
*/ staticvoid __perf_pmu_sched_task(struct perf_cpu_pmu_context *cpc, struct task_struct *task, bool sched_in)
{ struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); struct pmu *pmu;
pmu = cpc->epc.pmu;
/* software PMUs will not have sched_task */ if (WARN_ON_ONCE(!pmu->sched_task)) return;
/*
 * Called from scheduler to remove the events of the current task,
 * with interrupts disabled.
 *
 * We stop each event and update the event value in event->count.
 *
 * This does not protect us against NMI, but disable()
 * sets the disabled bit in the control field of event _before_
 * accessing the event control register. If a NMI hits, then it will
 * not restart the event.
 *
 * @task: task whose events are being removed
 * @next: the other task of the switch; forwarded to the hooks below
 *
 * The hooks run in a fixed order: PMU sched_task callbacks, context-switch
 * records, the task context switch-out itself, then the cgroup switch.
 */
void __perf_event_task_sched_out(struct task_struct *task, struct task_struct *next)
{
	/* Only call into the PMUs when this CPU has sched_task users. */
	if (__this_cpu_read(perf_sched_cb_usages))
		perf_pmu_sched_task(task, next, false);

	/* Only emit switch records while nr_switch_events is non-zero. */
	if (atomic_read(&nr_switch_events))
		perf_event_switch(task, next, false);

	perf_event_context_sched_out(task, next);

	/*
	 * If cgroup events exist on this CPU, then we need
	 * to check if we have to switch out PMU state.
	 * cgroup events are system-wide mode only.
	 */
	perf_cgroup_switch(next);
}
/* * Because the userpage is strictly per-event (there is no concept of context, * so there cannot be a context indirection), every userpage must be updated * when context time starts :-( * * IOW, we must not miss EVENT_TIME edges.
*/ staticinlinebool event_update_userpage(struct perf_event *event)
{ if (likely(!atomic_read(&event->mmap_count))) returnfalse;
if (!(is_active & EVENT_TIME)) { /* start ctx time */
__update_context_time(ctx, false);
perf_cgroup_set_timestamp(cpuctx); /* * CPU-release for the below ->is_active store, * see __load_acquire() in perf_event_time_now()
*/
barrier();
}
ctx->is_active |= (event_type | EVENT_TIME); if (ctx->task) { if (!(is_active & EVENT_ALL))
cpuctx->task_ctx = ctx; else
WARN_ON_ONCE(cpuctx->task_ctx != ctx);
}
is_active ^= ctx->is_active; /* changed bits */
/* * First go through the list and put on any pinned groups * in order to give them the best chance of going on.
*/ if (is_active & EVENT_PINNED) {
for_each_epc(pmu_ctx, ctx, pmu, cgroup)
__pmu_ctx_sched_in(pmu_ctx, EVENT_PINNED);
}
/* Then walk through the lower prio flexible groups */ if (is_active & EVENT_FLEXIBLE) {
for_each_epc(pmu_ctx, ctx, pmu, cgroup)
__pmu_ctx_sched_in(pmu_ctx, EVENT_FLEXIBLE);
}
}
perf_ctx_lock(cpuctx, ctx); /* * We must check ctx->nr_events while holding ctx->lock, such * that we serialize against perf_install_in_context().
*/ if (!ctx->nr_events) goto unlock;
perf_ctx_disable(ctx, false); /* * We want to keep the following priority order: * cpu pinned (that don't need to move), task pinned, * cpu flexible, task flexible. * * However, if task's ctx is not carrying any pinned * events, no need to flip the cpuctx's events around.
*/ if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree)) {
perf_ctx_disable(&cpuctx->ctx, false);
ctx_sched_out(&cpuctx->ctx, NULL, EVENT_FLEXIBLE);
}
/*
 * Called from scheduler to add the events of the current task
 * with interrupts disabled.
 *
 * We restore the event value and then enable it.
 *
 * This does not protect us against NMI, but enable()
 * sets the enabled bit in the control field of event _before_
 * accessing the event control register. If a NMI hits, then it will
 * keep the event running.
 *
 * @prev: the other task of the switch; forwarded to the hooks below
 * @task: task whose events are being added
 *
 * The hooks run in the reverse order of __perf_event_task_sched_out():
 * the task context first, then switch records, then PMU callbacks.
 */
void __perf_event_task_sched_in(struct task_struct *prev, struct task_struct *task)
{
	perf_event_context_sched_in(task);

	/* Only emit switch records while nr_switch_events is non-zero. */
	if (atomic_read(&nr_switch_events))
		perf_event_switch(task, prev, true);

	/* Only call into the PMUs when this CPU has sched_task users. */
	if (__this_cpu_read(perf_sched_cb_usages))
		perf_pmu_sched_task(prev, task, true);
}
/* * We got @count in @nsec, with a target of sample_freq HZ * the target period becomes: * * @count * 10^9 * period = ------------------- * @nsec * sample_freq *
*/
/*
 * Reduce accuracy by one bit such that @a and @b converge
 * to a similar magnitude.
 *
 * @a and @b must be plain identifiers: the macro token-pastes them into
 * a##_fls and b##_fls, so matching <name>_fls variables must be in scope.
 * Whichever operand has the larger _fls value is shifted right by one and
 * its _fls decremented; on a tie it is @b that gets reduced.
 */
#define REDUCE_FLS(a, b)		\
do {					\
	if (a##_fls > b##_fls) {	\
		a >>= 1;		\
		a##_fls--;		\
	} else {			\
		b >>= 1;		\
		b##_fls--;		\
	}				\
} while (0)
/* * Reduce accuracy until either term fits in a u64, then proceed with * the other, so that finally we can do a u64/u64 division.
*/ while (count_fls + sec_fls > 64 && nsec_fls + frequency_fls > 64) {
REDUCE_FLS(nsec, frequency);
REDUCE_FLS(sec, count);
}
list_for_each_entry(event, event_list, active_list) { if (event->state != PERF_EVENT_STATE_ACTIVE) continue;
// XXX use visit thingy to avoid the -1,cpu match if (!event_filter_match(event)) continue;
hwc = &event->hw;
if (hwc->interrupts == MAX_INTERRUPTS)
perf_event_unthrottle_group(event, is_event_in_freq_mode(event));
if (!is_event_in_freq_mode(event)) continue;
/* * stop the event and update event->count
*/
event->pmu->stop(event, PERF_EF_UPDATE);
now = local64_read(&event->count);
delta = now - hwc->freq_count_stamp;
hwc->freq_count_stamp = now;
/* * restart the event * reload only if value has changed * we have stopped the event so tell that * to perf_adjust_period() to avoid stopping it * twice.
*/ if (delta > 0)
perf_adjust_period(event, period, delta, false);
/* * combine freq adjustment with unthrottling to avoid two passes over the * events. At the same time, make sure, having freq events does not change * the rate of unthrottling as that would introduce bias.
*/ staticvoid
perf_adjust_freq_unthr_context(struct perf_event_context *ctx, bool unthrottle)
{ struct perf_event_pmu_context *pmu_ctx;
/* * only need to iterate over all events iff: * - context has events in frequency mode (needs freq adjust) * - there are events to unthrottle on this cpu
*/ if (!(ctx->nr_freq || unthrottle)) return;
raw_spin_lock(&ctx->lock);
list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) { if (!(pmu_ctx->nr_freq || unthrottle)) continue; if (!perf_pmu_ctx_is_active(pmu_ctx)) continue; if (pmu_ctx->pmu->capabilities & PERF_PMU_CAP_NO_INTERRUPT) continue;
/* * Move @event to the tail of the @ctx's eligible events.
*/ staticvoid rotate_ctx(struct perf_event_context *ctx, struct perf_event *event)
{ /* * Rotate the first entry last of non-pinned groups. Rotation might be * disabled by the inheritance code.
*/ if (ctx->rotate_disable) return;
/* pick an event from the flexible_groups to rotate */ staticinlinestruct perf_event *
ctx_event_to_rotate(struct perf_event_pmu_context *pmu_ctx)
{ struct perf_event *event; struct rb_node *node; struct rb_root *tree; struct __group_key key = {
.pmu = pmu_ctx->pmu,
};
/* pick the first active flexible event */
event = list_first_entry_or_null(&pmu_ctx->flexible_active, struct perf_event, active_list); if (event) goto out;
/* if no active flexible event, pick the first event */
tree = &pmu_ctx->ctx->flexible_groups.tree;
if (!pmu_ctx->ctx->task) {
key.cpu = smp_processor_id();
out: /* * Unconditionally clear rotate_necessary; if ctx_flexible_sched_in() * finds there are unschedulable events, it will set it again.
*/
pmu_ctx->rotate_necessary = 0;
if (task_rotate)
task_event = ctx_event_to_rotate(task_epc); if (cpu_rotate)
cpu_event = ctx_event_to_rotate(cpu_epc);
/* * As per the order given at ctx_resched() first 'pop' task flexible * and then, if needed CPU flexible.
*/ if (task_event || (task_epc && cpu_event)) {
update_context_time(task_epc->ctx);
__pmu_ctx_sched_out(task_epc, EVENT_FLEXIBLE);
}
/* * Unclone and reschedule this context if we enabled any event.
*/ if (enabled) {
clone_ctx = unclone_ctx(ctx);
ctx_resched(cpuctx, ctx, NULL, event_type);
}
perf_ctx_unlock(cpuctx, ctx);
/* * Removes all events from the current task that have been marked * remove-on-exec, and feeds their values back to parent events.
*/ staticvoid perf_event_remove_on_exec(struct perf_event_context *ctx)
{ struct perf_event_context *clone_ctx = NULL; struct perf_event *event, *next; unsignedlong flags; bool modified = false;
mutex_lock(&ctx->mutex);
if (WARN_ON_ONCE(ctx->task != current)) goto unlock;
list_for_each_entry_safe(event, next, &ctx->event_list, event_entry) { if (!event->attr.remove_on_exec) continue;
if (!is_kernel_event(event))
perf_remove_from_owner(event);
modified = true;
perf_event_exit_event(event, ctx, false);
}
raw_spin_lock_irqsave(&ctx->lock, flags); if (modified)
clone_ctx = unclone_ctx(ctx);
raw_spin_unlock_irqrestore(&ctx->lock, flags);
/* * If this is a task context, we need to check whether it is * the current task context of this cpu. If not it has been * scheduled out before the smp call arrived. In that case * event->count would have been updated to a recent sample * when the event was scheduled out.
*/ if (ctx->task && cpuctx->task_ctx != ctx) return;
/*
 * NMI-safe method to read a local event, that is an event that
 * is:
 *   - either for the current task, or for this CPU
 *   - does not have inherit set, for inherited task events
 *     will not be local and we cannot read them atomically
 *   - must not have a pmu::count method
 *
 * @event:   event to read
 * @value:   receives the event count
 * @enabled: optional (may be NULL); receives the enabled time
 * @running: optional (may be NULL); receives the running time
 *
 * Returns 0 on success, -EOPNOTSUPP for an inherited event, -EINVAL when a
 * per-task event is not for current or a per-CPU event is not for this CPU,
 * and -EBUSY when a pinned event is not running on this CPU. The output
 * pointers are not written on failure.
 */
int perf_event_read_local(struct perf_event *event, u64 *value,
			  u64 *enabled, u64 *running)
{
	unsigned long flags;
	int event_oncpu;
	int event_cpu;
	int ret = 0;

	/*
	 * Disabling interrupts avoids all counter scheduling (context
	 * switches, timer based rotation and IPIs).
	 */
	local_irq_save(flags);

	/*
	 * It must not be an event with inherit set, we cannot read
	 * all child counters from atomic context.
	 */
	if (event->attr.inherit) {
		ret = -EOPNOTSUPP;
		goto out;
	}

	/* If this is a per-task event, it must be for current */
	if ((event->attach_state & PERF_ATTACH_TASK) &&
	    event->hw.target != current) {
		ret = -EINVAL;
		goto out;
	}

	/*
	 * Get the event CPU numbers, and adjust them to local if the event is
	 * a per-package event that can be read locally
	 */
	event_oncpu = __perf_event_read_cpu(event, event->oncpu);
	event_cpu = __perf_event_read_cpu(event, event->cpu);

	/* If this is a per-CPU event, it must be for this CPU */
	if (!(event->attach_state & PERF_ATTACH_TASK) &&
	    event_cpu != smp_processor_id()) {
		ret = -EINVAL;
		goto out;
	}

	/* If this is a pinned event it must be running on this CPU */
	if (event->attr.pinned && event_oncpu != smp_processor_id()) {
		ret = -EBUSY;
		goto out;
	}

	/*
	 * If the event is currently on this CPU, it's either a per-task event,
	 * or local to this CPU. Furthermore it means it's ACTIVE (otherwise
	 * oncpu == -1).
	 */
	if (event_oncpu == smp_processor_id())
		event->pmu->read(event);

	*value = local64_read(&event->count);

	/* Only compute the times when the caller asked for at least one. */
	if (enabled || running) {
		u64 __enabled, __running, __now;

		calc_timer_values(event, &__now, &__enabled, &__running);
		if (enabled)
			*enabled = __enabled;
		if (running)
			*running = __running;
	}
out:
	local_irq_restore(flags);

	return ret;
}
/*
 * Bring @event's count (and, with @group, its siblings' times) up to date.
 *
 * @event: event to refresh
 * @group: when true, sibling times are updated as well
 *
 * Returns 0 when the recorded ->oncpu is not a valid CPU number, otherwise
 * the ret value reported back through struct perf_read_data.
 *
 * NOTE(review): the body below appears to have lost lines. They are flagged
 * in place rather than reconstructed; confirm against the full file.
 */
static int perf_event_read(struct perf_event *event, bool group)
{
	enum perf_event_state state = READ_ONCE(event->state);
	int event_cpu, ret = 0;

	/*
	 * If event is enabled and currently active on a CPU, update the
	 * value in the event structure:
	 */
again:
	if (state == PERF_EVENT_STATE_ACTIVE) {
		struct perf_read_data data;

		/*
		 * Orders the ->state and ->oncpu loads such that if we see
		 * ACTIVE we must also see the right ->oncpu.
		 *
		 * Matches the smp_wmb() from event_sched_in().
		 */
		smp_rmb();

		event_cpu = READ_ONCE(event->oncpu);
		if ((unsigned)event_cpu >= nr_cpu_ids)
			return 0;

		/*
		 * NOTE(review): 'data' is handed to the remote call without
		 * any visible initialisation, and the preempt_enable() below
		 * has no visible matching preempt_disable().
		 */

		/*
		 * Purposely ignore the smp_call_function_single() return
		 * value.
		 *
		 * If event_cpu isn't a valid CPU it means the event got
		 * scheduled out and that will have updated the event count.
		 *
		 * Therefore, either way, we'll have an up-to-date event count
		 * after this.
		 */
		(void)smp_call_function_single(event_cpu, __perf_event_read, &data, 1);
		preempt_enable();
		ret = data.ret;

		/*
		 * NOTE(review): 'ctx' and 'flags' are not declared anywhere in
		 * the visible body, and the re-check against INACTIVE reads as
		 * if this locked section belongs to a separate branch for
		 * INACTIVE events -- confirm.
		 */
		raw_spin_lock_irqsave(&ctx->lock, flags);
		state = event->state;
		if (state != PERF_EVENT_STATE_INACTIVE) {
			raw_spin_unlock_irqrestore(&ctx->lock, flags);
			goto again;
		}

		/*
		 * May read while context is not active (e.g., thread is
		 * blocked), in that case we cannot update context time
		 */
		ctx_time_update_event(ctx, event);

		perf_event_update_time(event);
		if (group)
			perf_event_update_sibling_time(event);
		raw_spin_unlock_irqrestore(&ctx->lock, flags);
	}

	return ret;
}
/*
 * Initialize the perf_event context in a task_struct:
 *
 * Sets up the lock, the mutex, the PMU-context list, both group trees and
 * the event list, and leaves the context holding a single reference.
 *
 * @ctx: context to initialize
 */
static void __perf_event_init_context(struct perf_event_context *ctx)
{
	raw_spin_lock_init(&ctx->lock);
	mutex_init(&ctx->mutex);
	INIT_LIST_HEAD(&ctx->pmu_ctx_list);
	perf_event_groups_init(&ctx->pinned_groups);
	perf_event_groups_init(&ctx->flexible_groups);
	INIT_LIST_HEAD(&ctx->event_list);
	refcount_set(&ctx->refcount, 1);
}
if (clone_ctx)
put_ctx(clone_ctx);
} else {
ctx = alloc_perf_context(task);
err = -ENOMEM; if (!ctx) goto errout;
err = 0;
mutex_lock(&task->perf_event_mutex); /* * If it has already passed perf_event_exit_task(). * we must see PF_EXITING, it takes this mutex too.
*/ if (task->flags & PF_EXITING)
err = -ESRCH; elseif (task->perf_event_ctxp)
err = -EAGAIN; else {
get_ctx(ctx);
++ctx->pin_count;
rcu_assign_pointer(task->perf_event_ctxp, ctx);
}
mutex_unlock(&task->perf_event_mutex);
if (!ctx->task) { /* * perf_pmu_migrate_context() / __perf_pmu_install_event() * relies on the fact that find_get_pmu_context() cannot fail * for CPU contexts.
*/ struct perf_cpu_pmu_context *cpc;
cpc = *per_cpu_ptr(pmu->cpu_pmu_context, event->cpu);
epc = &cpc->epc;
raw_spin_lock_irq(&ctx->lock); if (!epc->ctx) { /* * One extra reference for the pmu; see perf_pmu_free().
*/
atomic_set(&epc->refcount, 2);
epc->embedded = 1;
list_add(&epc->pmu_ctx_entry, &ctx->pmu_ctx_list);
epc->ctx = ctx;
} else {
WARN_ON_ONCE(epc->ctx != ctx);
atomic_inc(&epc->refcount);
}
raw_spin_unlock_irq(&ctx->lock); return epc;
}
new = kzalloc(sizeof(*epc), GFP_KERNEL); if (!new) return ERR_PTR(-ENOMEM);
__perf_init_event_pmu_context(new, pmu);
/* * XXX * * lockdep_assert_held(&ctx->mutex); * * can't because perf_event_init_task() doesn't actually hold the * child_ctx->mutex.
*/
raw_spin_lock_irq(&ctx->lock);
list_for_each_entry(epc, &ctx->pmu_ctx_list, pmu_ctx_entry) { if (epc->pmu == pmu) {
WARN_ON_ONCE(epc->ctx != ctx);
atomic_inc(&epc->refcount); goto found_epc;
} /* Make sure the pmu_ctx_list is sorted by PMU type: */ if (!pos && epc->pmu->type > pmu->type)
pos = epc;
}
epc = new; new = NULL;
if (!pos)
list_add_tail(&epc->pmu_ctx_entry, &ctx->pmu_ctx_list); else
list_add(&epc->pmu_ctx_entry, pos->pmu_ctx_entry.prev);
/* * XXX * * lockdep_assert_held(&ctx->mutex); * * can't because of the call-site in _free_event()/put_event() * which isn't always called under ctx->mutex.
*/ if (!atomic_dec_and_raw_lock_irqsave(&epc->refcount, &ctx->lock, flags)) return;
scoped_guard (rcu) {
cd = rcu_dereference(p->perf_ctx_data); if (!cd || !refcount_dec_and_test(&cd->refcount)) return;
}
/* * The old ctx_data may be lost because of the race. * Nothing is required to do for the case. * See attach_task_ctx_data().
*/ if (try_cmpxchg((struct perf_ctx_data **)&p->perf_ctx_data, &cd, NULL))
perf_free_ctx_data_rcu(cd);
}
/*
 * Decrement the global per-feature event counters selected by @event's
 * attributes and, when the event is of a kind tracked by perf_sched_count
 * (see the 'dec' flag below), give up that contribution as well.
 *
 * NOTE(review): the function's closing brace is not visible in this span;
 * confirm nothing is missing after the 'if (dec)' block.
 */
staticvoid unaccount_event(struct perf_event *event)
{ bool dec = false;
/* Events that have a parent are skipped entirely. */
if (event->parent) return;
/* 'dec' records whether perf_sched_count must be dropped at the end. */
if (event->attach_state & (PERF_ATTACH_TASK | PERF_ATTACH_SCHED_CB))
dec = true; if (event->attr.mmap || event->attr.mmap_data)
atomic_dec(&nr_mmap_events); if (event->attr.build_id)
atomic_dec(&nr_build_id_events); if (event->attr.comm)
atomic_dec(&nr_comm_events); if (event->attr.namespaces)
atomic_dec(&nr_namespaces_events); if (event->attr.cgroup)
atomic_dec(&nr_cgroup_events); if (event->attr.task)
atomic_dec(&nr_task_events); if (event->attr.freq)
unaccount_freq_event(); if (event->attr.context_switch) {
dec = true;
atomic_dec(&nr_switch_events);
} if (is_cgroup_event(event))
dec = true; if (has_branch_stack(event))
dec = true; if (event->attr.ksymbol)
atomic_dec(&nr_ksymbol_events); if (event->attr.bpf_event)
atomic_dec(&nr_bpf_events); if (event->attr.text_poke)
atomic_dec(&nr_text_poke_events);
/*
 * Subtract one from perf_sched_count unless its value is exactly 1; in
 * that case the decrement is not done here and perf_sched_work is queued
 * with a delay of HZ instead.
 */
if (dec) { if (!atomic_add_unless(&perf_sched_count, -1, 1))
schedule_delayed_work(&perf_sched_work, HZ);
}
/* * The following implement mutual exclusion of events on "exclusive" pmus * (PERF_PMU_CAP_EXCLUSIVE). Such pmus can only have one event scheduled * at a time, so we disallow creating events that might conflict, namely: * * 1) cpu-wide events in the presence of per-task events, * 2) per-task events in the presence of cpu-wide events, * 3) two matching events on the same perf_event_context. * * The former two cases are handled in the allocation path (perf_event_alloc(), * _free_event()), the latter -- before the first perf_install_in_context(). * * exclusive_event_init() returns 0 straight away for a non-exclusive pmu and * -EBUSY when pmu::exclusive_cnt cannot be moved in the direction required * by the event type. * * NOTE(review): the end of this function is not visible in this span.
*/ staticint exclusive_event_init(struct perf_event *event)
{ struct pmu *pmu = event->pmu;
if (!is_exclusive_pmu(pmu)) return 0;
/* * Prevent co-existence of per-task and cpu-wide events on the * same exclusive pmu. * * Negative pmu::exclusive_cnt means there are cpu-wide * events on this "exclusive" pmu, positive means there are * per-task events. * * Since this is called in perf_event_alloc() path, event::ctx * doesn't exist yet; it is, however, safe to use PERF_ATTACH_TASK * to mean "per-task event", because unlike other attach states it * never gets cleared.
*/ if (event->attach_state & PERF_ATTACH_TASK) { if (!atomic_inc_unless_negative(&pmu->exclusive_cnt)) return -EBUSY;
} else { if (!atomic_dec_unless_positive(&pmu->exclusive_cnt)) return -EBUSY;
}
/* see comment in exclusive_event_init() */ if (event->attach_state & PERF_ATTACH_TASK)
atomic_dec(&pmu->exclusive_cnt); else
atomic_inc(&pmu->exclusive_cnt);
if (event->attach_state & PERF_ATTACH_CALLCHAIN)
put_callchain_buffers();
kfree(event->addr_filter_ranges);
if (event->attach_state & PERF_ATTACH_EXCLUSIVE)
exclusive_event_destroy(event);
if (is_cgroup_event(event))
perf_detach_cgroup(event);
if (event->attach_state & PERF_ATTACH_TASK_DATA)
detach_perf_ctx_data(event);
if (event->destroy)
event->destroy(event);
/* * Must be after ->destroy(), due to uprobe_perf_close() using * hw.target.
*/ if (event->hw.target)
put_task_struct(event->hw.target);
if (event->pmu_ctx) { /* * put_pmu_ctx() needs an event->ctx reference, because of * epc->ctx.
*/
WARN_ON_ONCE(!pmu);
WARN_ON_ONCE(!event->ctx);
WARN_ON_ONCE(event->pmu_ctx->ctx != event->ctx);
put_pmu_ctx(event->pmu_ctx);
}
/* * perf_event_free_task() relies on put_ctx() being 'last', in * particular all task references must be cleaned up.
*/ if (event->ctx)
put_ctx(event->ctx);
if (event->rb) { /* * Can happen when we close an event with re-directed output. * * Since we have a 0 refcount, perf_mmap_close() will skip * over us; possibly making our ring_buffer_put() the last.
*/
mutex_lock(&event->mmap_mutex);
ring_buffer_attach(event, NULL);
mutex_unlock(&event->mmap_mutex);
}
/*
 * Used to free events which have a known refcount of 1, such as in error paths
 * of inherited events.
 *
 * The refcount is atomically swapped from 1 to 0. If it held any other value
 * a warning is emitted and the event is deliberately leaked instead of being
 * freed, because somebody else may still be using it.
 */
static void free_event(struct perf_event *event)
{
	if (WARN(atomic_long_cmpxchg(&event->refcount, 1, 0) != 1,
		 "unexpected event refcount: %ld; ptr=%p\n",
		 atomic_long_read(&event->refcount), event)) {
		/* leak to avoid use-after-free */
		return;
	}

	_free_event(event);
}
/*
 * Remove user event from the owner task.
 *
 * Takes a temporary reference on the owning task (if there still is one),
 * then, under the owner's perf_event_mutex, unlinks @event from the owner's
 * list and clears event->owner unless that has already been done.
 */
static void perf_remove_from_owner(struct perf_event *event)
{
	struct task_struct *owner;

	rcu_read_lock();
	/*
	 * Matches the smp_store_release() in perf_event_exit_task(). If we
	 * observe !owner it means the list deletion is complete and we can
	 * indeed free this event, otherwise we need to serialize on
	 * owner->perf_event_mutex.
	 */
	owner = READ_ONCE(event->owner);
	if (owner) {
		/*
		 * Since delayed_put_task_struct() also drops the last
		 * task reference we can safely take a new reference
		 * while holding the rcu_read_lock().
		 */
		get_task_struct(owner);
	}
	rcu_read_unlock();

	if (!owner)
		return;

	/*
	 * If we're here through perf_event_exit_task() we're already
	 * holding ctx->mutex which would be an inversion wrt. the
	 * normal lock order.
	 *
	 * However we can safely take this lock because it's the child
	 * ctx->mutex.
	 */
	mutex_lock_nested(&owner->perf_event_mutex, SINGLE_DEPTH_NESTING);

	/*
	 * We have to re-check the event->owner field, if it is cleared
	 * we raced with perf_event_exit_task(), acquiring the mutex
	 * ensured they're done, and we can proceed with freeing the
	 * event.
	 */
	if (event->owner) {
		list_del_init(&event->owner_entry);
		smp_store_release(&event->owner, NULL);
	}
	mutex_unlock(&owner->perf_event_mutex);

	/* Drop the temporary reference taken under RCU above. */
	put_task_struct(owner);
}
if (!atomic_long_dec_and_test(&event->refcount)) return;
parent = event->parent;
_free_event(event);
/* Matches the refcount bump in inherit_event() */ if (parent)
put_event(parent);
}
/*
 * Kill an event dead; while event:refcount will preserve the event
 * object, it will not preserve its functionality. Once the last 'user'
 * gives up the object, we'll destroy the thing.
 *
 * Non-kernel events are first removed from their owner. The event is then
 * removed from its context (or just marked DEAD), every child on
 * event->child_list is removed from its own context and released, and
 * finally one reference on @event is dropped.
 *
 * Always returns 0.
 */
int perf_event_release_kernel(struct perf_event *event)
{
	struct perf_event_context *ctx = event->ctx;
	struct perf_event *child, *tmp;

	/*
	 * If we got here through err_alloc: free_event(event); we will not
	 * have attached to a context yet.
	 */
	if (!ctx) {
		WARN_ON_ONCE(event->attach_state &
			     (PERF_ATTACH_CONTEXT|PERF_ATTACH_GROUP));
		goto no_ctx;
	}

	if (!is_kernel_event(event))
		perf_remove_from_owner(event);

	/*
	 * Take the context lock that the perf_event_ctx_unlock() below
	 * releases; the state change must happen with it held.
	 */
	ctx = perf_event_ctx_lock(event);

	/*
	 * Mark this event as STATE_DEAD, there is no external reference to it
	 * anymore.
	 *
	 * Anybody acquiring event->child_mutex after the below loop _must_
	 * also see this, most importantly inherit_event() which will avoid
	 * placing more children on the list.
	 *
	 * Thus this guarantees that we will in fact observe and kill _ALL_
	 * child events.
	 */
	if (event->state > PERF_EVENT_STATE_REVOKED) {
		perf_remove_from_context(event, DETACH_GROUP|DETACH_DEAD);
	} else {
		event->state = PERF_EVENT_STATE_DEAD;
	}

	perf_event_ctx_unlock(event, ctx);

again:
	mutex_lock(&event->child_mutex);
	list_for_each_entry(child, &event->child_list, child_list) {
		/*
		 * Cannot change, child events are not migrated, see the
		 * comment with perf_event_ctx_lock_nested().
		 */
		ctx = READ_ONCE(child->ctx);
		/*
		 * Since child_mutex nests inside ctx::mutex, we must jump
		 * through hoops. We start by grabbing a reference on the ctx.
		 *
		 * Since the event cannot get freed while we hold the
		 * child_mutex, the context must also exist and have a !0
		 * reference count.
		 */
		get_ctx(ctx);

		/*
		 * Now that we have a ctx ref, we can drop child_mutex, and
		 * acquire ctx::mutex without fear of it going away. Then we
		 * can re-acquire child_mutex.
		 */
		mutex_unlock(&event->child_mutex);
		mutex_lock(&ctx->mutex);
		mutex_lock(&event->child_mutex);

		/*
		 * Now that we hold ctx::mutex and child_mutex, revalidate our
		 * state, if child is still the first entry, it didn't get freed
		 * and we can continue doing so.
		 */
		tmp = list_first_entry_or_null(&event->child_list,
					       struct perf_event, child_list);
		if (tmp == child) {
			perf_remove_from_context(child, DETACH_GROUP | DETACH_CHILD);
		} else {
			child = NULL;
		}

		/*
		 * Release both mutexes before dropping references and
		 * restarting: the 'again' path re-takes child_mutex, and the
		 * next iteration may re-take this very ctx::mutex.
		 */
		mutex_unlock(&event->child_mutex);
		mutex_unlock(&ctx->mutex);

		if (child) {
			/* Last reference unless ->pending_task work is pending */
			put_event(child);
		}
		put_ctx(ctx);

		goto again;
	}
	mutex_unlock(&event->child_mutex);

no_ctx:
	/*
	 * Last reference unless ->pending_task work is pending on this event
	 * or any of its children.
	 */
	put_event(event);
	return 0;
}
EXPORT_SYMBOL_GPL(perf_event_release_kernel);
/*
 * Called when the last reference to the file is gone.
 *
 * Releases the perf event stored in file->private_data. @inode is unused.
 * Always returns 0.
 */
static int perf_release(struct inode *inode, struct file *file)
{
	perf_event_release_kernel(file->private_data);
	return 0;
}
/*
 * Read @leader (via perf_event_read()) and accumulate its group's read
 * values into @values according to @read_format. Index 0 of @values is
 * reserved for the @nr field, so filling starts at index 1.
 *
 * Returns the error from perf_event_read() if it fails, or -ECHILD when the
 * parent and child groups no longer match.
 *
 * NOTE(review): the remainder of this function (including the 'unlock' label
 * it jumps to) is not visible in this span.
 */
staticint __perf_read_group_add(struct perf_event *leader,
u64 read_format, u64 *values)
{ struct perf_event_context *ctx = leader->ctx; struct perf_event *sub, *parent; unsignedlong flags; int n = 1; /* skip @nr */ int ret;
ret = perf_event_read(leader, true); if (ret) return ret;
raw_spin_lock_irqsave(&ctx->lock, flags); /* * Verify the grouping between the parent and child (inherited) * events is still intact. * * Specifically: * - leader->ctx->lock pins leader->sibling_list * - parent->child_mutex pins parent->child_list * - parent->ctx->mutex pins parent->sibling_list * * Because parent->ctx != leader->ctx (and child_list nests inside * ctx->mutex), group destruction is not atomic between children, also * see perf_event_release_kernel(). Additionally, parent can grow the * group. * * Therefore it is possible to have parent and child groups in a * different configuration and summing over such a beast makes no sense * whatsoever. * * Reject this.
*/
parent = leader->parent; if (parent &&
(parent->group_generation != leader->group_generation ||
parent->nr_siblings != leader->nr_siblings)) {
ret = -ECHILD; goto unlock;
}
/* * Since we co-schedule groups, {enabled,running} times of siblings * will be identical to those of the leader, so we only publish one * set.
*/ if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
values[n++] += leader->total_time_enabled +
atomic64_read(&leader->child_total_time_enabled);
}
/* * Read the performance event - simple non blocking version for now * * Returns 0 (end-of-file) for an event in error state and -ENOSPC when * @count is smaller than event->read_size; otherwise the read is delegated * to perf_read_group() or perf_read_one() depending on PERF_FORMAT_GROUP. * * NOTE(review): the end of this function is not visible in this span.
*/ static ssize_t
__perf_read(struct perf_event *event, char __user *buf, size_t count)
{
u64 read_format = event->attr.read_format; int ret;
/* * Return end-of-file for a read on an event that is in * error state (i.e. because it was pinned but it couldn't be * scheduled on to the CPU at some point).
*/ if (event->state == PERF_EVENT_STATE_ERROR) return 0;
if (count < event->read_size) return -ENOSPC;
WARN_ON_ONCE(event->ctx->parent_ctx); if (read_format & PERF_FORMAT_GROUP)
ret = perf_read_group(event, read_format, buf); else
ret = perf_read_one(event, read_format, buf);
if (event->state <= PERF_EVENT_STATE_REVOKED) return EPOLLERR;
poll_wait(file, &event->waitq, wait);
if (event->state <= PERF_EVENT_STATE_REVOKED) return EPOLLERR;
if (is_event_hup(event)) return events;
if (unlikely(READ_ONCE(event->state) == PERF_EVENT_STATE_ERROR &&
event->attr.pinned)) return EPOLLERR;
/* * Pin the event->rb by taking event->mmap_mutex; otherwise * perf_event_set_output() can swizzle our rb and make us miss wakeups.
*/
mutex_lock(&event->mmap_mutex);
rb = event->rb; if (rb)
events = atomic_xchg(&rb->poll, 0);
mutex_unlock(&event->mmap_mutex); return events;
}
/* * Holding the top-level event's child_mutex means that any * descendant process that has inherited this event will block * in perf_event_exit_event() if it goes to exit, thus satisfying the * task existence requirements of perf_event_enable/disable. * * NOTE(review): the body below never uses @func or 'child' and relies on an * 'active' variable that is not declared in this function; it appears to * belong to a different function (one that force-resets the sampling * period). Confirm against the original file before relying on it.
*/ staticvoid perf_event_for_each_child(struct perf_event *event, void (*func)(struct perf_event *))
{ struct perf_event *child;
/* If the event is currently active, stop it (with the PMU disabled) first. */
active = (event->state == PERF_EVENT_STATE_ACTIVE); if (active) {
perf_pmu_disable(event->pmu);
event->pmu->stop(event, PERF_EF_UPDATE);
}
/* Clear the remaining period so the restart below reloads it. */
local64_set(&event->hw.period_left, 0);
if (active) {
event->pmu->start(event, PERF_EF_RELOAD); /* * Once the period is force-reset, the event starts immediately. * But the event/group could be throttled. Unthrottle the * event/group now to avoid the next tick trying to unthrottle * while we already re-started the event/group.
*/ if (event->hw.interrupts == MAX_INTERRUPTS)
perf_event_unthrottle_group(event, true);
perf_pmu_enable(event->pmu);
}
}
/* * Callers need to ensure there can be no nesting of this function, otherwise * the seqlock logic goes bad. We can not serialize this because the arch * code calls this from NMI context. * * Refreshes the fields of the event's mmap()ed user page; does nothing when * the event has no ring buffer attached. * * NOTE(review): the end of this function (including the 'unlock' label) is * not visible in this span.
*/ void perf_event_update_userpage(struct perf_event *event)
{ struct perf_event_mmap_page *userpg; struct perf_buffer *rb;
u64 enabled, running, now;
rcu_read_lock();
rb = rcu_dereference(event->rb); if (!rb) goto unlock;
/* * compute total_time_enabled, total_time_running * based on snapshot values taken when the event * was last scheduled in. * * we cannot simply call update_context_time() * because of locking issues as we can be called in * NMI context
*/
calc_timer_values(event, &now, &enabled, &running);
userpg = rb->user_page; /* * Disable preemption to guarantee consistent time stamps are stored to * the user page.
*/
preempt_disable();
/* Bump the page's lock counter before touching the fields below. */
++userpg->lock;
barrier();
userpg->index = perf_event_index(event);
/* When an index is published, offset is made relative to hw.prev_count. */
userpg->offset = perf_event_count(event, false); if (userpg->index)
userpg->offset -= local64_read(&event->hw.prev_count);
if (event->rb) { /* * Should be impossible, we set this when removing * event->rb_entry and wait/clear when adding event->rb_entry.
*/
WARN_ON_ONCE(event->rcu_pending);
/* * Avoid racing with perf_mmap_close(AUX): stop the event * before swizzling the event::rb pointer; if it's getting * unmapped, its aux_mmap_count will be 0 and it won't * restart. See the comment in __perf_pmu_output_stop(). * * Data will inevitably be lost when set_output is done in * mid-air, but then again, whoever does it like this is * not in for the data anyway.
*/ if (has_aux(event))
perf_event_stop(event, 0);
rcu_assign_pointer(event->rb, rb);
if (old_rb) {
ring_buffer_put(old_rb); /* * Since we detached before setting the new rb, so that we * could attach the new rb, we could have missed a wakeup. * Provide it now.
*/
wake_up_all(&event->waitq);
}
}
/* * A buffer can be mmap()ed multiple times; either directly through the same * event, or through other events by use of perf_event_set_output(). * * In order to undo the VM accounting done by perf_mmap() we need to destroy * the buffer here, where we still have a VM context. This means we need * to detach all events redirecting to us.
*/ staticvoid perf_mmap_close(struct vm_area_struct *vma)
{ struct perf_event *event = vma->vm_file->private_data;
mapped_f unmapped = get_mapped(event, event_unmapped); struct perf_buffer *rb = ring_buffer_get(event); struct user_struct *mmap_user = rb->mmap_user; int mmap_locked = rb->mmap_locked; unsignedlong size = perf_data_size(rb); bool detach_rest = false;
/* FIXIES vs perf_pmu_unregister() */ if (unmapped)
unmapped(event, vma->vm_mm);
/* * The AUX buffer is strictly a sub-buffer, serialize using aux_mutex * to avoid complications.
*/ if (rb_has_aux(rb) && vma->vm_pgoff == rb->aux_pgoff &&
atomic_dec_and_mutex_lock(&rb->aux_mmap_count, &rb->aux_mutex)) { /* * Stop all AUX events that are writing to this buffer, * so that we can free its AUX pages and corresponding PMU * data. Note that after rb::aux_mmap_count dropped to zero, * they won't start any more (see perf_aux_output_begin()).
*/
perf_pmu_output_stop(event);
/* now it's safe to free the pages */
atomic_long_sub(rb->aux_nr_pages - rb->aux_mmap_locked, &mmap_user->locked_vm);
atomic64_sub(rb->aux_mmap_locked, &vma->vm_mm->pinned_vm);
/* this has to be the last one */
rb_free_aux(rb);
WARN_ON_ONCE(refcount_read(&rb->aux_refcount));
mutex_unlock(&rb->aux_mutex);
}
if (atomic_dec_and_test(&rb->mmap_count))
detach_rest = true;
if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) goto out_put;
/* If there's still other mmap()s of this buffer, we're done. */ if (!detach_rest) goto out_put;
/* * No other mmap()s, detach from all other events that might redirect * into the now unreachable buffer. Somewhat complicated by the * fact that rb::event_lock otherwise nests inside mmap_mutex.
*/
again:
rcu_read_lock();
list_for_each_entry_rcu(event, &rb->event_list, rb_entry) { if (!atomic_long_inc_not_zero(&event->refcount)) { /* * This event is en-route to free_event() which will * detach it and remove it from the list.
*/ continue;
}
rcu_read_unlock();
mutex_lock(&event->mmap_mutex); /* * Check we didn't race with perf_event_set_output() which can * swizzle the rb from under us while we were waiting to * acquire mmap_mutex. * * If we find a different rb; ignore this event, a next * iteration will no longer find it on the list. We have to * still restart the iteration to make sure we're not now * iterating the wrong list.
*/ if (event->rb == rb)
ring_buffer_attach(event, NULL);
/* * Restart the iteration; either we're on the wrong list or * destroyed its integrity by doing a deletion.
*/ goto again;
}
rcu_read_unlock();
/* * It could be there's still a few 0-ref events on the list; they'll * get cleaned up by free_event() -- they'll also still have their * ref on the rb and will free it whenever they are done with it. * * Aside from that, this buffer is 'fully' detached and unmapped, * undo the VM accounting.
*/
out_put:
ring_buffer_put(rb); /* could be last */
}
/*
 * Write-fault handler for the perf mapping: only the user control page at
 * page offset 0 may be made writable; a write fault on any other page
 * yields VM_FAULT_SIGBUS.
 */
static vm_fault_t perf_mmap_pfn_mkwrite(struct vm_fault *vmf)
{
	if (vmf->pgoff != 0)
		return VM_FAULT_SIGBUS;

	return 0;
}
/*
 * Forbid splitting perf mappings to prevent refcount leaks due to
 * the resulting non-matching offsets and sizes. See open()/close().
 *
 * Both arguments are ignored; the result is always -EINVAL.
 */
static int perf_mmap_may_split(struct vm_area_struct *vma, unsigned long addr)
{
	return -EINVAL;
}
/* * We map this as a VM_PFNMAP VMA. * * This is not ideal as this is designed broadly for mappings of PFNs * referencing memory-mapped I/O ranges or non-system RAM i.e. for which * !pfn_valid(pfn). * * We are mapping kernel-allocated memory (memory we manage ourselves) * which would more ideally be mapped using vm_insert_page() or a * similar mechanism, that is as a VM_MIXEDMAP mapping. * * However this won't work here, because: * * 1. It uses vma->vm_page_prot, but this field has not been completely * setup at the point of the f_op->mmap() hook, so we are unable to * indicate that this should be mapped CoW in order that the * mkwrite() hook can be invoked to make the first page R/W and the * rest R/O as desired. * * 2. Anything other than a VM_PFNMAP of valid PFNs will result in * vm_normal_page() returning a struct page * pointer, which means * vm_ops->page_mkwrite() will be invoked rather than * vm_ops->pfn_mkwrite(), and this means we have to set page->mapping * to work around retry logic in the fault handler, however this * field is no longer allowed to be used within struct page. * * 3. Having a struct page * made available in the fault logic also * means that the page gets put on the rmap and becomes * inappropriately accessible and subject to map and ref counting. * * Ideally we would have a mechanism that could explicitly express our * desires, but this is not currently the case, so we instead use * VM_PFNMAP. * * We manage the lifetime of these mappings with internal refcounts (see * perf_mmap_open() and perf_mmap_close()) so we ensure the lifetime of * this mapping is maintained correctly.
*/ for (pagenum = 0; pagenum < nr_pages; pagenum++) { unsignedlong va = vma->vm_start + PAGE_SIZE * pagenum; struct page *page = perf_mmap_to_page(rb, vma->vm_pgoff + pagenum);
if (page == NULL) {
err = -EINVAL; break;
}
/* Map readonly, perf_mmap_pfn_mkwrite() called on write fault. */
err = remap_pfn_range(vma, va, page_to_pfn(page), PAGE_SIZE,
vm_get_page_prot(vma->vm_flags & ~VM_SHARED)); if (err) break;
}
#ifdef CONFIG_MMU /* Clear any partial mappings on error. */ if (err)
zap_page_range_single(vma, vma->vm_start, nr_pages * PAGE_SIZE, NULL); #endif
/* * Don't allow mmap() of inherited per-task counters. This would * create a performance issue due to all children writing to the * same rb.
*/ if (event->cpu == -1 && event->attr.inherit) return -EINVAL;
if (!(vma->vm_flags & VM_SHARED)) return -EINVAL;
ret = security_perf_event_read(event); if (ret) return ret;
if (vma_size != PAGE_SIZE * nr_pages) return -EINVAL;
user_extra = nr_pages;
mutex_lock(&event->mmap_mutex);
ret = -EINVAL;
/* * This relies on __pmu_detach_event() taking mmap_mutex after marking * the event REVOKED. Either we observe the state, or __pmu_detach_event() * will detach the rb created here.
*/ if (event->state <= PERF_EVENT_STATE_REVOKED) {
ret = -ENODEV; goto unlock;
}
if (vma->vm_pgoff == 0) {
nr_pages -= 1;
/* * If we have rb pages ensure they're a power-of-two number, so we * can do bitmasks instead of modulo.
*/ if (nr_pages != 0 && !is_power_of_2(nr_pages)) goto unlock;
WARN_ON_ONCE(event->ctx->parent_ctx);
if (event->rb) { if (data_page_nr(event->rb) != nr_pages) goto unlock;
if (atomic_inc_not_zero(&event->rb->mmap_count)) { /* * Success -- managed to mmap() the same buffer * multiple times.
*/
ret = 0; /* We need the rb to map pages. */
rb = event->rb; goto unlock;
}
/* * Raced against perf_mmap_close()'s * atomic_dec_and_mutex_lock() remove the * event and continue as if !event->rb
*/
ring_buffer_attach(event, NULL);
}
} else { /* * AUX area mapping: if rb->aux_nr_pages != 0, it's already * mapped, all subsequent mappings should have the same size * and offset. Must be above the normal perf buffer.
*/
u64 aux_offset, aux_size;
/* * Increase the limit linearly with more CPUs:
*/
user_lock_limit *= num_online_cpus();
user_locked = atomic_long_read(&user->locked_vm);
/* * sysctl_perf_event_mlock may have changed, so that * user->locked_vm > user_lock_limit
*/ if (user_locked > user_lock_limit)
user_locked = user_lock_limit;
user_locked += user_extra;
if (user_locked > user_lock_limit) { /* * charge locked_vm until it hits user_lock_limit; * charge the rest from pinned_vm
*/
extra = user_locked - user_lock_limit;
user_extra -= extra;
}
perf_event_update_time(event);
perf_event_init_userpage(event);
perf_event_update_userpage(event);
ret = 0;
} else {
ret = rb_alloc_aux(rb, event, vma->vm_pgoff, nr_pages,
event->attr.aux_watermark, flags); if (!ret) {
atomic_set(&rb->aux_mmap_count, 1);
rb->aux_mmap_locked = extra;
}
}
unlock: if (!ret) {
atomic_long_add(user_extra, &user->locked_vm);
atomic64_add(extra, &vma->vm_mm->pinned_vm);
atomic_inc(&event->mmap_count);
} elseif (rb) { /* AUX allocation failed */
atomic_dec(&rb->mmap_count);
}
aux_unlock: if (aux_mutex)
mutex_unlock(aux_mutex);
mutex_unlock(&event->mmap_mutex);
if (ret) return ret;
/* * Since pinned accounting is per vm we cannot allow fork() to copy our * vma.
*/
vm_flags_set(vma, VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP);
vma->vm_ops = &perf_mmap_vmops;
mapped = get_mapped(event, event_mapped); if (mapped)
mapped(event, vma->vm_mm);
/* * Try to map it into the page table. On fail, invoke * perf_mmap_close() to undo the above, as the callsite expects * full cleanup in this case and therefore does not invoke * vmops::close().
*/
ret = map_range(rb, vma); if (ret)
perf_mmap_close(vma);
/*
 * Signal-delivery helper for @event. Bails out when the current task is
 * exiting, or (with a one-time warning) when the event's context no longer
 * belongs to the current task.
 *
 * NOTE(review): the remainder of this function is not visible in this span.
 */
staticvoid perf_sigtrap(struct perf_event *event)
{ /* * Both perf_pending_task() and perf_pending_irq() can race with the * task exiting.
*/ if (current->flags & PF_EXITING) return;
/* * We'd expect this to only occur if the irq_work is delayed and either * ctx->task or current has changed in the meantime. This can be the * case on architectures that do not implement arch_irq_work_raise().
*/ if (WARN_ON_ONCE(event->ctx->task != current)) return;
/* * Deliver the pending work in-event-context or follow the context. * * Reads the CPU the event is on once; returns immediately if the event is * not running, and performs the pending disable locally when that CPU is the * current one. * * NOTE(review): the handling of the "event is on another CPU" case and the * function's closing brace are not visible in this span.
*/ staticvoid __perf_pending_disable(struct perf_event *event)
{ int cpu = READ_ONCE(event->oncpu);
/* * If the event isn't running, we're done. event_sched_out() will have * taken care of things.
*/ if (cpu < 0) return;
/* * Yay, we hit home and are in the context of the event.
*/ if (cpu == smp_processor_id()) { if (event->pending_disable) {
event->pending_disable = 0;
perf_event_disable_local(event);
} return;
}
/* * If we 'fail' here, that's OK, it means recursion is already disabled * and we won't recurse 'further'.
*/
rctx = perf_swevent_get_recursion_context();
__perf_pending_disable(event); if (rctx >= 0)
perf_swevent_put_recursion_context(rctx);
}
/* * If we 'fail' here, that's OK, it means recursion is already disabled * and we won't recurse 'further'.
*/
rctx = perf_swevent_get_recursion_context();
/* * The wakeup isn't bound to the context of the event -- it can happen * irrespective of where the event is.
*/ if (event->pending_wakeup) {
event->pending_wakeup = 0;
perf_event_wakeup(event);
}
if (rctx >= 0)
perf_swevent_put_recursion_context(rctx);
}
/* * If we 'fail' here, that's OK, it means recursion is already disabled * and we won't recurse 'further'.
*/
rctx = perf_swevent_get_recursion_context();
if (event->pending_work) {
event->pending_work = 0;
perf_sigtrap(event);
local_dec(&event->ctx->nr_no_switch_fast);
}
put_event(event);
if (rctx >= 0)
perf_swevent_put_recursion_context(rctx);
}
/* * Get remaining task size from user stack pointer. * * It'd be better to take stack vma map and limit this more * precisely, but there's no way to get it safely under interrupt, * so using TASK_SIZE as limit. * * NOTE(review): the body below uses 'header_size' and 'stack_size', neither * of which is declared in this function, and 'addr' is computed from @regs * before the NULL check yet never used. The body appears to belong to a * different function (one that clamps the user-stack dump size); confirm * against the original file.
*/ static u64 perf_ustack_task_size(struct pt_regs *regs)
{ unsignedlong addr = perf_user_stack_pointer(regs);
/* No regs, no stack pointer, no dump. */ if (!regs) return 0;
/* No mm, no stack, no dump. */ if (!current->mm) return 0;
/* * Check if we fit in with the requested stack size into the: * - TASK_SIZE * If we don't, we limit the size to the TASK_SIZE. * * - remaining sample size * If we don't, we customize the stack size to * fit in to the remaining sample size.
*/
/* Current header size plus static size and dynamic size. */
header_size += 2 * sizeof(u64);
/* Do we fit in with the current stack dump size? */ if ((u16) (header_size + stack_size) < header_size) { /* * If we overflow the maximum size for the sample, * we customize the stack dump size to fit in.
*/
stack_size = USHRT_MAX - header_size - sizeof(u64);
stack_size = round_up(stack_size, sizeof(u64));
}
return stack_size;
}
/*
 * Emit the user-stack portion of a sample through @handle. When @regs is
 * NULL a single zero u64 size is written and nothing else.
 *
 * NOTE(review): the non-NULL @regs path is only partially visible in this
 * span (the function is cut off after its local declarations).
 */
staticvoid
perf_output_sample_ustack(struct perf_output_handle *handle, u64 dump_size, struct pt_regs *regs)
{ /* Case of a kernel thread, nothing to dump */ if (!regs) {
u64 size = 0;
perf_output_put(handle, size);
} else { unsignedlong sp; unsignedint rem;
u64 dyn_size;
/* * We dump: * static size * - the size requested by user or the best one we can fit * in to the sample max size * data * - user stack dump data * dynamic size * - the actual dumped size
*/
if (WARN_ON_ONCE(READ_ONCE(sampler->state) != PERF_EVENT_STATE_ACTIVE)) goto out;
if (WARN_ON_ONCE(READ_ONCE(sampler->oncpu) != smp_processor_id())) goto out;
rb = ring_buffer_get(sampler); if (!rb) goto out;
/* * If this is an NMI hit inside sampling code, don't take * the sample. See also perf_aux_sample_output().
*/ if (READ_ONCE(rb->aux_in_sampling)) {
data->aux_size = 0;
} else {
size = min_t(size_t, size, perf_aux_size(rb));
data->aux_size = ALIGN(size, sizeof(u64));
}
ring_buffer_put(rb);
/* * Normal ->start()/->stop() callbacks run in IRQ mode in scheduler * paths. If we start calling them in NMI context, they may race with * the IRQ ones, that is, for example, re-starting an event that's just * been stopped, which is why we're using a separate callback that * doesn't change the event state. * * IRQs need to be disabled to prevent IPIs from racing with us.
*/
local_irq_save(flags); /* * Guard against NMI hits inside the critical section; * see also perf_prepare_sample_aux().
*/
WRITE_ONCE(rb->aux_in_sampling, 1);
barrier();
ret = event->pmu->snapshot_aux(event, handle, size);
/* * An error here means that perf_output_copy() failed (returned a * non-zero surplus that it didn't copy), which in its current * enlightened implementation is not possible. If that changes, we'd * like to know.
*/ if (WARN_ON_ONCE(size < 0)) goto out_put;
/* * The pad comes from ALIGN()ing data->aux_size up to u64 in * perf_prepare_sample_aux(), so should not be more than that.
*/
pad = data->aux_size - size; if (WARN_ON_ONCE(pad >= sizeof(u64)))
pad = 8;
if (pad) {
u64 zero = 0;
perf_output_copy(handle, &zero, pad);
}
out_put:
ring_buffer_put(rb);
}
/* * A set of common sample data types saved even for non-sample records * when event->attr.sample_id_all is set.
*/ #define PERF_SAMPLE_ID_ALL (PERF_SAMPLE_TID | PERF_SAMPLE_TIME | \
PERF_SAMPLE_ID | PERF_SAMPLE_STREAM_ID | \
PERF_SAMPLE_CPU | PERF_SAMPLE_IDENTIFIER)
/* * XXX PERF_SAMPLE_READ vs inherited events seems difficult. * * The problem is that it's both hard and excessively expensive to iterate the * child list, not to mention that it's impossible to IPI the children running * on another CPU, from interrupt/NMI context. * * Instead the combination of PERF_SAMPLE_READ and inherit will track per-thread * counts rather than attempting to accumulate some value across all children on * all cores. * * NOTE(review): the end of this function is not visible in this span.
*/ staticvoid perf_output_read(struct perf_output_handle *handle, struct perf_event *event)
{
u64 enabled = 0, running = 0, now;
u64 read_format = event->attr.read_format;
/* * compute total_time_enabled, total_time_running * based on snapshot values taken when the event * was last scheduled in. * * we cannot simply call update_context_time() * because of locking issues as we are called in * NMI context
*/ if (read_format & PERF_FORMAT_TOTAL_TIMES)
calc_timer_values(event, &now, &enabled, &running);
perf_output_put(handle, data->br_stack->nr); if (branch_sample_hw_index(event))
perf_output_put(handle, data->br_stack->hw_idx);
perf_output_copy(handle, data->br_stack->entries, size); /* * Add the extension space which is appended * right after the struct perf_branch_stack.
*/ if (data->br_stack_cntr) {
size = data->br_stack->nr * sizeof(u64);
perf_output_copy(handle, data->br_stack_cntr, size);
}
} else { /* * we always store at least the value of nr
*/
u64 nr = 0;
perf_output_put(handle, nr);
}
}
if (sample_type & PERF_SAMPLE_REGS_USER) {
u64 abi = data->regs_user.abi;
/* * If there are no regs to dump, notice it through * first u64 being zero (PERF_SAMPLE_REGS_ABI_NONE).
*/
perf_output_put(handle, abi);
if (sample_type & PERF_SAMPLE_STACK_USER) {
perf_output_sample_ustack(handle,
data->stack_user_size,
data->regs_user.regs);
}
if (sample_type & PERF_SAMPLE_WEIGHT_TYPE)
perf_output_put(handle, data->weight.full);
if (sample_type & PERF_SAMPLE_DATA_SRC)
perf_output_put(handle, data->data_src.val);
if (sample_type & PERF_SAMPLE_TRANSACTION)
perf_output_put(handle, data->txn);
if (sample_type & PERF_SAMPLE_REGS_INTR) {
u64 abi = data->regs_intr.abi; /* * If there are no regs to dump, notice it through * first u64 being zero (PERF_SAMPLE_REGS_ABI_NONE).
*/
perf_output_put(handle, abi);
if (abi) {
u64 mask = event->attr.sample_regs_intr;
if (virt >= TASK_SIZE) { /* If it's vmalloc()d memory, leave phys_addr as 0 */ if (virt_addr_valid((void *)(uintptr_t)virt) &&
!(virt >= VMALLOC_START && virt < VMALLOC_END))
phys_addr = (u64)virt_to_phys((void *)(uintptr_t)virt);
} else { /* * Walking the pages tables for user address. * Interrupts are disabled, so it prevents any tear down * of the page tables. * Try IRQ-safe get_user_page_fast_only first. * If failed, leave phys_addr as 0.
*/ if (!(current->flags & (PF_KTHREAD | PF_USER_WORKER))) { struct page *p;
/* * Add the sample flags that are dependent to others. And clear the * sample flags that have already been done by the PMU driver.
*/
filtered_sample_type = sample_type;
filtered_sample_type |= __cond_set(sample_type, PERF_SAMPLE_CODE_PAGE_SIZE,
PERF_SAMPLE_IP);
filtered_sample_type |= __cond_set(sample_type, PERF_SAMPLE_DATA_PAGE_SIZE |
PERF_SAMPLE_PHYS_ADDR, PERF_SAMPLE_ADDR);
filtered_sample_type |= __cond_set(sample_type, PERF_SAMPLE_STACK_USER,
PERF_SAMPLE_REGS_USER);
filtered_sample_type &= ~data->sample_flags;
if (filtered_sample_type == 0) { /* Make sure it has the correct data->type for output */
data->type = event->attr.sample_type; return;
}
if (filtered_sample_type & PERF_SAMPLE_REGS_USER)
perf_sample_regs_user(&data->regs_user, regs);
/* * It cannot use the filtered_sample_type here as REGS_USER can be set * by STACK_USER (using __cond_set() above) and we don't want to update * the dyn_size if it's not requested by users.
*/ if ((sample_type & ~data->sample_flags) & PERF_SAMPLE_REGS_USER) { /* regs dump ABI info */ int size = sizeof(u64);
if (filtered_sample_type & PERF_SAMPLE_STACK_USER) { /* * Either we need PERF_SAMPLE_STACK_USER bit to be always * processed as the last one or have additional check added * in case new sample type is added, because we could eat * up the rest of the sample size.
*/
u16 stack_size = event->attr.sample_stack_user;
u16 header_size = perf_sample_data_size(data, event);
u16 size = sizeof(u64);
/* * If there is something to dump, add space for the dump * itself and for the field that tells the dynamic size, * which is how many have been actually dumped.
*/ if (stack_size)
size += sizeof(u64) + stack_size;
/* * PERF_DATA_PAGE_SIZE requires PERF_SAMPLE_ADDR. If the user doesn't * require PERF_SAMPLE_ADDR, kernel implicitly retrieve the data->addr, * but the value will not dump to the userspace.
*/ if (filtered_sample_type & PERF_SAMPLE_DATA_PAGE_SIZE) {
data->data_page_size = perf_get_page_size(data->addr);
data->sample_flags |= PERF_SAMPLE_DATA_PAGE_SIZE;
}
/* * Given the 16bit nature of header::size, an AUX sample can * easily overflow it, what with all the preceding sample bits. * Make sure this doesn't happen by using up to U16_MAX bytes * per sample in total (rounded down to 8 byte boundary).
*/
size = min_t(size_t, U16_MAX - header_size,
event->attr.aux_sample_size);
size = rounddown(size, 8);
size = perf_prepare_sample_aux(event, data, size);
/* * If you're adding more sample types here, you likely need to do * something about the overflowing header::size, like repurpose the * lowest 3 bits of size, which should be always zero at the moment. * This raises a more important question, do we really need 512k sized * samples and why, so good argumentation is in order for whatever you * do here next.
*/
WARN_ON_ONCE(header->size & 7);
}
scoped_guard (irqsave) { /* * Guard against self-recursion here. Another event could trip * this same from NMI context.
*/ if (READ_ONCE(rb->aux_in_pause_resume)) break;
list_for_each_entry_rcu(event, &pel->list, sb_list) { /* * Skip events that are not fully formed yet; ensure that * if we observe event->ctx, both event and ctx will be * complete enough. See perf_install_in_context().
*/ if (!smp_load_acquire(&event->ctx)) continue;
if (event->state < PERF_EVENT_STATE_INACTIVE) continue; if (!event_filter_match(event)) continue;
output(event, data);
}
}
/* * Iterate all events that need to receive side-band events. * * For new callers; ensure that account_pmu_sb_event() includes * your event, otherwise it might not get delivered.
*/ staticvoid
perf_iterate_sb(perf_iterate_f output, void *data, struct perf_event_context *task_ctx)
{ struct perf_event_context *ctx;
rcu_read_lock();
preempt_disable();
/* * If we have task_ctx != NULL we only notify the task context itself. * The task_ctx is set only for EXIT events before releasing task * context.
*/ if (task_ctx) {
perf_iterate_ctx(task_ctx, output, data, false); goto done;
}
/* * In case of inheritance, it will be the parent that links to the * ring-buffer, but it will be the child that's actually using it. * * We are using event::rb to determine if the event should be stopped, * however this may race with ring_buffer_attach() (through set_output), * which will make us skip the event that actually needs to be stopped. * So ring_buffer_attach() has to stop an aux event before re-assigning * its rb pointer.
*/ if (rcu_dereference(parent->rb) == rb)
ro->err = __perf_event_stop(&sd);
}
restart:
rcu_read_lock();
list_for_each_entry_rcu(iter, &event->rb->event_list, rb_entry) { /* * For per-CPU events, we need to make sure that neither they * nor their children are running; for cpu==-1 events it's * sufficient to stop the event itself if it's active, since * it can't have children.
*/
cpu = iter->cpu; if (cpu == -1)
cpu = READ_ONCE(iter->oncpu);
/* * Allocate data for a new task when profiling system-wide * events which require PMU specific data
*/ staticvoid
perf_event_alloc_task_data(struct task_struct *child, struct task_struct *parent)
{ struct kmem_cache *ctx_cache = NULL; struct perf_ctx_data *cd;
if (!refcount_read(&global_ctx_data_ref)) return;
scoped_guard (rcu) {
cd = rcu_dereference(parent->perf_ctx_data); if (cd)
ctx_cache = cd->ctx_cache;
}
if (!ctx_cache) return;
guard(percpu_read)(&global_ctx_data_rwsem);
scoped_guard (rcu) {
cd = rcu_dereference(child->perf_ctx_data); if (!cd) { /* * A system-wide event may be unaccount, * when attaching the perf_ctx_data.
*/ if (!refcount_read(&global_ctx_data_ref)) return; goto attach;
}
if (!cd->global) {
cd->global = 1;
refcount_inc(&cd->refcount);
}
}
pathname = kmalloc(PATH_MAX, GFP_KERNEL); if (pathname == NULL) {
cgroup_event.path = path_enomem;
} else { /* just to be sure to have enough space for alignment */
cgroup_path(cgrp, pathname, PATH_MAX - sizeof(u64));
cgroup_event.path = pathname;
}
/* * Since our buffer works in 8 byte units we need to align our string * size to a multiple of 8. However, we must guarantee the tail end is * zero'd out to avoid leaking random bits to userspace.
*/
size = strlen(cgroup_event.path) + 1; while (!IS_ALIGNED(size, sizeof(u64)))
cgroup_event.path[size++] = '\0';
if (vma->vm_flags & VM_LOCKED)
flags |= MAP_LOCKED; if (is_vm_hugetlb_page(vma))
flags |= MAP_HUGETLB;
if (file) { conststruct inode *inode;
dev_t dev;
buf = kmalloc(PATH_MAX, GFP_KERNEL); if (!buf) {
name = "//enomem"; goto cpy_name;
} /* * d_path() works from the end of the rb backwards, so we * need to add enough zero bytes after the string to handle * the 64bit alignment we do later.
*/
name = d_path(file_user_path(file), buf, PATH_MAX - sizeof(u64)); if (IS_ERR(name)) {
name = "//toolong"; goto cpy_name;
}
inode = file_user_inode(vma->vm_file);
dev = inode->i_sb->s_dev;
ino = inode->i_ino;
gen = inode->i_generation;
maj = MAJOR(dev);
min = MINOR(dev);
goto got_name;
} else { if (vma->vm_ops && vma->vm_ops->name)
name = (char *) vma->vm_ops->name(vma); if (!name)
name = (char *)arch_vma_name(vma); if (!name) { if (vma_is_initial_heap(vma))
name = "[heap]"; elseif (vma_is_initial_stack(vma))
name = "[stack]"; else
name = "//anon";
}
}
cpy_name:
strscpy(tmp, name);
name = tmp;
got_name: /* * Since our buffer works in 8 byte units we need to align our string * size to a multiple of 8. However, we must guarantee the tail end is * zero'd out to avoid leaking random bits to userspace.
*/
size = strlen(name)+1; while (!IS_ALIGNED(size, sizeof(u64)))
name[size++] = '\0';
if (restart)
event->addr_filters_gen++;
raw_spin_unlock_irqrestore(&ifh->lock, flags);
if (restart)
perf_event_stop(event, 1);
}
/* * Adjust all task's events' filters to the new vma
*/ staticvoid perf_addr_filters_adjust(struct vm_area_struct *vma)
{ struct perf_event_context *ctx;
/* * Data tracing isn't supported yet and as such there is no need * to keep track of anything that isn't related to executable code:
*/ if (!(vma->vm_flags & VM_EXEC)) return;
switch (type) { case PERF_BPF_EVENT_PROG_LOAD: case PERF_BPF_EVENT_PROG_UNLOAD: if (atomic_read(&nr_ksymbol_events))
perf_event_bpf_emit_ksymbols(prog, type); break; default: return;
}
/*
 * Account an interrupt for @event.
 *
 * Thin wrapper: forwards to __perf_event_account_interrupt() with its
 * second argument fixed to 1 and hands back that helper's return value
 * unchanged. (The overflow path passes its "throttle" parameter in that
 * same position.)
 */
int perf_event_account_interrupt(struct perf_event *event)
{
	const int throttle = 1;

	return __perf_event_account_interrupt(event, throttle);
}
staticinlinebool sample_is_allowed(struct perf_event *event, struct pt_regs *regs)
{ /* * Due to interrupt latency (AKA "skid"), we may enter the * kernel before taking an overflow, even if the PMU is only * counting user events.
*/ if (event->attr.exclude_kernel && !user_mode(regs)) returnfalse;
if (prog->type != BPF_PROG_TYPE_PERF_EVENT) return -EINVAL;
if (event->attr.precise_ip &&
prog->call_get_stack &&
(!(event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) ||
event->attr.exclude_callchain_kernel ||
event->attr.exclude_callchain_user)) { /* * On perf_event with precise_ip, calling bpf_get_stack() * may trigger unwinder warnings and occasional crashes. * bpf_get_[stack|stackid] works around this issue by using * callchain attached to perf_sample_data. If the * perf_event does not have a full (kernel and user) callchain * attached to perf_sample_data, do not allow attaching a BPF * program that calls bpf_get_[stack|stackid].
*/ return -EPROTO;
}
staticint __perf_event_overflow(struct perf_event *event, int throttle, struct perf_sample_data *data, struct pt_regs *regs)
{ int events = atomic_read(&event->event_limit); int ret = 0;
/* * Non-sampling counters might still use the PMI to fold short * hardware counters, ignore those.
*/ if (unlikely(!is_sampling_event(event))) return 0;
ret = __perf_event_account_interrupt(event, throttle);
if (event->attr.aux_pause)
perf_event_aux_pause(event->aux_event, true);
/* * XXX event_limit might not quite work as expected on inherited * events
*/
event->pending_kill = POLL_IN; if (events && atomic_dec_and_test(&event->event_limit)) {
ret = 1;
event->pending_kill = POLL_HUP;
perf_event_disable_inatomic(event);
event->pmu->stop(event, 0);
}
if (event->attr.sigtrap) { /* * The desired behaviour of sigtrap vs invalid samples is a bit * tricky; on the one hand, one should not lose the SIGTRAP if * it is the first event, on the other hand, we should also not * trigger the WARN or override the data address.
*/ bool valid_sample = sample_is_allowed(event, regs); unsignedint pending_id = 1; enum task_work_notify_mode notify_mode;
if (regs)
pending_id = hash32_ptr((void *)instruction_pointer(regs)) ?: 1;
} elseif (event->attr.exclude_kernel && valid_sample) { /* * Should not be able to return to user space without * consuming pending_work; with exceptions: * * 1. Where !exclude_kernel, events can overflow again * in the kernel without returning to user space. * * 2. Events that can overflow again before the IRQ- * work without user space progress (e.g. hrtimer). * To approximate progress (with false negatives), * check 32-bit hash of the current IP.
*/
WARN_ON_ONCE(event->pending_work != pending_id);
}
}
/* * We directly increment event->count and keep a second value in * event->hw.period_left to count intervals. This period event * is kept in the range [-sample_period, 0] so that we can use the * sign as trigger.
*/
/* For the event head insertion and removal in the hlist */ staticinlinestruct hlist_head *
find_swevent_head(struct swevent_htable *swhash, struct perf_event *event)
{ struct swevent_hlist *hlist;
u32 event_id = event->attr.config;
u64 type = event->attr.type;
/* * Event scheduling is always serialized against hlist allocation * and release. Which makes the protected version suitable here. * The context lock guarantees that.
*/
hlist = rcu_dereference_protected(swhash->swevent_hlist,
lockdep_is_held(&event->ctx->lock)); if (!hlist) return NULL;
/* Deref the hlist from the update side */ staticinlinestruct swevent_hlist *
swevent_hlist_deref(struct swevent_htable *swhash)
{ return rcu_dereference_protected(swhash->swevent_hlist,
lockdep_is_held(&swhash->hlist_mutex));
}
hlist_for_each_entry_rcu(event, head, hlist_entry) { if (perf_tp_event_match(event, &raw, regs)) { /* * Here use the same on-stack perf_sample_data, * some members in data are event-specific and * need to be re-computed for different swevents. * Re-initialize data->sample_flags safely to avoid * the problem that next event skips preparing data * because data->sample_flags is set.
*/
perf_sample_data_init(&data, 0, 0);
perf_sample_save_raw_data(&data, event, &raw);
perf_swevent_event(event, count, &data, regs);
}
}
/* * If we got specified a target task, also iterate its context and * deliver this event there too.
*/ if (task && task != current) { struct perf_event_context *ctx;
rcu_read_lock();
ctx = rcu_dereference(task->perf_event_ctxp); if (!ctx) goto unlock;
#ifdefined(CONFIG_KPROBE_EVENTS) || defined(CONFIG_UPROBE_EVENTS) /* * Flags in config, used by dynamic PMU kprobe and uprobe * The flags should match following PMU_FORMAT_ATTR(). * * PERF_PROBE_CONFIG_IS_RETPROBE if set, create kretprobe/uretprobe * if not set, create kprobe/uprobe * * The following values specify a reference counter (or semaphore in the * terminology of tools like dtrace, systemtap, etc.) Userspace Statically * Defined Tracepoints (USDT). Currently, we use 40 bit for the offset. * * PERF_UPROBE_REF_CTR_OFFSET_BITS # of bits in config as th offset * PERF_UPROBE_REF_CTR_OFFSET_SHIFT # of bits to shift left
*/ enum perf_probe_config {
PERF_PROBE_CONFIG_IS_RETPROBE = 1U << 0, /* [k,u]retprobe */
PERF_UPROBE_REF_CTR_OFFSET_BITS = 32,
PERF_UPROBE_REF_CTR_OFFSET_SHIFT = 64 - PERF_UPROBE_REF_CTR_OFFSET_BITS,
};
/*
 * returns true if the event is a tracepoint, or a kprobe/uprobe created
 * with perf_event_open()
 *
 * The decision is made purely by comparing event->pmu against the
 * perf_tracepoint PMU and, when the corresponding config options are
 * enabled, the perf_kprobe and perf_uprobe PMUs.
 */
static inline bool perf_event_is_tracing(struct perf_event *event)
{
	if (event->pmu == &perf_tracepoint)
		return true;
#ifdef CONFIG_KPROBE_EVENTS
	if (event->pmu == &perf_kprobe)
		return true;
#endif
#ifdef CONFIG_UPROBE_EVENTS
	if (event->pmu == &perf_uprobe)
		return true;
#endif
	return false;
}
/*
 * Drop all address filters attached to @event.
 *
 * Does nothing when the event's filter list is already empty; otherwise
 * hands the list to perf_addr_filters_splice() with a NULL replacement.
 */
static void perf_free_addr_filters(struct perf_event *event)
{
	/*
	 * Used during free paths, there is no concurrency.
	 */
	if (list_empty(&event->addr_filters.list))
		return;

	perf_addr_filters_splice(event, NULL);
}
/*
 * Scan through mm's vmas and see if one of them matches the
 * @filter; if so, adjust filter's address range.
 * Called with mm::mmap_lock down for reading.
 *
 * @filter: the address filter to match against
 * @mm:     address space whose vmas are walked, starting at address 0
 * @fr:     range passed through to perf_addr_filter_vma_adjust()
 */
static void perf_addr_filter_apply(struct perf_addr_filter *filter,
				   struct mm_struct *mm,
				   struct perf_addr_filter_range *fr)
{
	struct vm_area_struct *vma;
	VMA_ITERATOR(vmi, mm, 0);

	for_each_vma(vmi, vma) {
		/* Only file-backed mappings are considered. */
		if (!vma->vm_file)
			continue;

		/* Stop at the first vma the helper reports as adjusted. */
		if (perf_addr_filter_vma_adjust(filter, vma, fr))
			return;
	}
}
/* * Update event's address range filters based on the * task's existing mappings, if any.
*/ staticvoid perf_event_addr_filters_apply(struct perf_event *event)
{ struct perf_addr_filters_head *ifh = perf_event_addr_filters(event); struct task_struct *task = READ_ONCE(event->ctx->task); struct perf_addr_filter *filter; struct mm_struct *mm = NULL; unsignedint count = 0; unsignedlong flags;
/* * We may observe TASK_TOMBSTONE, which means that the event tear-down * will stop on the parent's child_mutex that our caller is also holding
*/ if (task == TASK_TOMBSTONE) return;
if (ifh->nr_file_filters) {
mm = get_task_mm(task); if (!mm) goto restart;
mmap_read_lock(mm);
}
raw_spin_lock_irqsave(&ifh->lock, flags);
list_for_each_entry(filter, &ifh->list, entry) { if (filter->path.dentry) { /* * Adjust base offset if the filter is associated to a * binary that needs to be mapped:
*/
event->addr_filter_ranges[count].start = 0;
event->addr_filter_ranges[count].size = 0;
/*
 * Address range filtering: limiting the data to certain
 * instruction address ranges. Filters are ioctl()ed to us from
 * userspace as ascii strings.
 *
 * Filter string format:
 *
 * ACTION RANGE_SPEC
 * where ACTION is one of the
 *  * "filter": limit the trace to this region
 *  * "start": start tracing from this address
 *  * "stop": stop tracing at this address/region;
 * RANGE_SPEC is
 *  * for kernel addresses: <start address>[/<size>]
 *  * for object files:     <start address>[/<size>]@</path/to/object/file>
 *
 * if <size> is not specified or is zero, the range is treated as a single
 * address; not valid for ACTION=="filter".
 *
 * Token identifiers for the filter-string parser. The numeric values are
 * spelled out because the parser indexes a table with the action tokens
 * (actions[token]), so their order must not change.
 */
enum {
	IF_ACT_NONE		= -1,
	IF_ACT_FILTER		= 0,
	IF_ACT_START		= 1,
	IF_ACT_STOP		= 2,
	IF_SRC_FILE		= 3,
	IF_SRC_KERNEL		= 4,
	IF_SRC_FILEADDR		= 5,
	IF_SRC_KERNELADDR	= 6,
};
/* filter definition begins */ if (state == IF_STATE_ACTION) {
filter = perf_addr_filter_new(event, filters); if (!filter) goto fail;
}
token = match_token(start, if_tokens, args); switch (token) { case IF_ACT_FILTER: case IF_ACT_START: case IF_ACT_STOP: if (state != IF_STATE_ACTION) goto fail;
filter->action = actions[token];
state = IF_STATE_SOURCE; break;
case IF_SRC_KERNELADDR: case IF_SRC_KERNEL:
kernel = 1;
fallthrough;
case IF_SRC_FILEADDR: case IF_SRC_FILE: if (state != IF_STATE_SOURCE) goto fail;
*args[0].to = 0;
ret = kstrtoul(args[0].from, 0, &filter->offset); if (ret) goto fail;
if (token == IF_SRC_KERNEL || token == IF_SRC_FILE) {
*args[1].to = 0;
ret = kstrtoul(args[1].from, 0, &filter->size); if (ret) goto fail;
}
if (token == IF_SRC_FILE || token == IF_SRC_FILEADDR) { int fpos = token == IF_SRC_FILE ? 2 : 1;
kfree(filename);
filename = match_strdup(&args[fpos]); if (!filename) {
ret = -ENOMEM; goto fail;
}
}
state = IF_STATE_END; break;
default: goto fail;
}
/* * Filter definition is fully parsed, validate and install it. * Make sure that it doesn't contradict itself or the event's * attribute.
*/ if (state == IF_STATE_END) {
ret = -EINVAL;
/* * ACTION "filter" must have a non-zero length region * specified.
*/ if (filter->action == PERF_ADDR_FILTER_ACTION_FILTER &&
!filter->size) goto fail;
if (!kernel) { if (!filename) goto fail;
/* * For now, we only support file-based filters * in per-task events; doing so for CPU-wide * events requires additional context switching * trickery, since same object code will be * mapped at different virtual addresses in * different processes.
*/
ret = -EOPNOTSUPP; if (!event->ctx->task) goto fail;
/* look up the path and grab its inode */
ret = kern_path(filename, LOOKUP_FOLLOW,
&filter->path); if (ret) goto fail;
ret = -EINVAL; if (!filter->path.dentry ||
!S_ISREG(d_inode(filter->path.dentry)
->i_mode)) goto fail;
event->addr_filters.nr_file_filters++;
}
/* ready to consume more filters */
kfree(filename);
filename = NULL;
state = IF_STATE_ACTION;
filter = NULL;
kernel = 0;
}
}
staticint perf_event_set_filter(struct perf_event *event, void __user *arg)
{ int ret = -EINVAL; char *filter_str;
filter_str = strndup_user(arg, PAGE_SIZE); if (IS_ERR(filter_str)) return PTR_ERR(filter_str);
#ifdef CONFIG_EVENT_TRACING if (perf_event_is_tracing(event)) { struct perf_event_context *ctx = event->ctx;
/* * Beware, here be dragons!! * * the tracepoint muck will deadlock against ctx->mutex, but * the tracepoint stuff does not actually need it. So * temporarily drop ctx->mutex. As per perf_event_ctx_lock() we * already have a reference on ctx. * * This can result in event getting moved to a different ctx, * but that does not affect the tracepoint state.
*/
mutex_unlock(&ctx->mutex);
ret = ftrace_profile_set_filter(event, event->attr.config, filter_str);
mutex_lock(&ctx->mutex);
} else #endif if (has_addr_filter(event))
ret = perf_event_set_addr_filter(event, filter_str);
if (regs && !perf_exclude_event(event, regs)) { if (!(event->attr.exclude_idle && is_idle_task(current))) if (__perf_event_overflow(event, 1, &data, regs))
ret = HRTIMER_NORESTART;
}
period = max_t(u64, 10000, event->hw.sample_period);
hrtimer_forward_now(hrtimer, ns_to_ktime(period));
/* * Careful: this function can be triggered in the hrtimer handler, * for cpu-clock events, so hrtimer_cancel() would cause a * deadlock. * * So use hrtimer_try_to_cancel() to try to stop the hrtimer, * and the cpu-clock handler also sets the PERF_HES_STOPPED flag, * which guarantees that perf_swevent_hrtimer() will stop the * hrtimer once it sees the PERF_HES_STOPPED flag.
*/ if (is_sampling_event(event) && (hwc->interrupts != MAX_INTERRUPTS)) {
ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer);
local64_set(&hwc->period_left, ktime_to_ns(remaining));
/* * Since hrtimers have a fixed rate, we can do a static freq->period * mapping and avoid the whole period adjust feedback stuff.
*/ if (event->attr.freq) { long freq = event->attr.sample_freq;
/* * Now that the PMU is complete, make it visible to perf_try_init_event().
*/ if (!idr_cmpxchg(&pmu_idr, pmu->type, NULL, pmu)) return -EINVAL;
list_add_rcu(&pmu->entry, &pmus);
take_idr_id(pmu_type);
_pmu = no_free_ptr(pmu); // let it rip return 0;
}
EXPORT_SYMBOL_GPL(perf_pmu_register);
staticvoid __pmu_detach_event(struct pmu *pmu, struct perf_event *event, struct perf_event_context *ctx)
{ /* * De-schedule the event and mark it REVOKED.
*/
perf_event_exit_event(event, ctx, true);
/* * All _free_event() bits that rely on event->pmu: * * Notably, perf_mmap() relies on the ordering here.
*/
scoped_guard (mutex, &event->mmap_mutex) {
WARN_ON_ONCE(pmu->event_unmapped); /* * Mostly an empty lock sequence, such that perf_mmap(), which * relies on mmap_mutex, is sure to observe the state change.
*/
}
for (;;) {
event = pmu_get_event(pmu); if (!event) break;
pmu_detach_event(pmu, event);
put_event(event);
}
/* * wait for pending _free_event()s
*/
wait_var_event(pmu, pmu_empty(pmu));
}
int perf_pmu_unregister(struct pmu *pmu)
{
scoped_guard (mutex, &pmus_lock) { if (!idr_cmpxchg(&pmu_idr, pmu->type, pmu, NULL)) return -EINVAL;
list_del_rcu(&pmu->entry);
}
/* * We dereference the pmu list under both SRCU and regular RCU, so * synchronize against both of those. * * Notably, the entirety of event creation, from perf_init_event() * (which will now fail, because of the above) until * perf_install_in_context() should be under SRCU such that * this synchronizes against event creation. This avoids trying to * detach events that are not fully formed.
*/
synchronize_srcu(&pmus_srcu);
synchronize_rcu();
if (pmu->event_unmapped && !pmu_empty(pmu)) { /* * Can't force remove events when pmu::event_unmapped() * is used in perf_mmap_close().
*/
guard(mutex)(&pmus_lock);
idr_cmpxchg(&pmu_idr, pmu->type, NULL, pmu);
list_add_rcu(&pmu->entry, &pmus); return -EBUSY;
}
/* * A number of pmu->event_init() methods iterate the sibling_list to, * for example, validate if the group fits on the PMU. Therefore, * if this is a sibling event, acquire the ctx->mutex to protect * the sibling_list.
*/ if (event->group_leader != event && pmu->task_ctx_nr != perf_sw_context) { /* * This ctx->mutex can nest when we're called through * inheritance. See the perf_event_ctx_lock_nested() comment.
*/
ctx = perf_event_ctx_lock_nested(event->group_leader,
SINGLE_DEPTH_NESTING);
BUG_ON(!ctx);
}
event->pmu = pmu;
ret = pmu->event_init(event);
if (ctx)
perf_event_ctx_unlock(event->group_leader, ctx);
if (ret) goto err_pmu;
if (!(pmu->capabilities & PERF_PMU_CAP_EXTENDED_REGS) &&
has_extended_regs(event)) {
ret = -EOPNOTSUPP; goto err_destroy;
}
if (pmu->capabilities & PERF_PMU_CAP_NO_EXCLUDE &&
event_has_any_exclude_flag(event)) {
ret = -EINVAL; goto err_destroy;
}
if (pmu->scope != PERF_PMU_SCOPE_NONE && event->cpu >= 0) { conststruct cpumask *cpumask; struct cpumask *pmu_cpumask; int cpu;
/* * Save original type before calling pmu->event_init() since certain * pmus overwrites event->attr.type to forward event to another pmu.
*/
event->orig_type = event->attr.type;
/* Try parent's PMU first: */ if (event->parent && event->parent->pmu) {
pmu = event->parent->pmu;
ret = perf_try_init_event(pmu, event); if (!ret) return pmu;
}
/* * PERF_TYPE_HARDWARE and PERF_TYPE_HW_CACHE * are often aliases for PERF_TYPE_RAW.
*/
type = event->attr.type; if (type == PERF_TYPE_HARDWARE || type == PERF_TYPE_HW_CACHE) {
type = event->attr.config >> PERF_PMU_TYPE_SHIFT; if (!type) {
type = PERF_TYPE_RAW;
} else {
extended_type = true;
event->attr.config &= PERF_HW_EVENT_MASK;
}
}
again:
scoped_guard (rcu)
pmu = idr_find(&pmu_idr, type); if (pmu) { if (event->attr.type != type && type != PERF_TYPE_RAW &&
!(pmu->capabilities & PERF_PMU_CAP_EXTENDED_HW_TYPE)) return ERR_PTR(-ENOENT);
ret = perf_try_init_event(pmu, event); if (ret == -ENOENT && event->attr.type != type && !extended_type) {
type = event->attr.type; goto again;
}
if (ret) return ERR_PTR(ret);
return pmu;
}
list_for_each_entry_rcu(pmu, &pmus, entry, lockdep_is_held(&pmus_srcu)) {
ret = perf_try_init_event(pmu, event); if (!ret) return pmu;
/*
 * We keep a list of all !task (and therefore per-cpu) events
 * that need to receive side-band records.
 *
 * This avoids having to scan all the various PMU per-cpu contexts
 * looking for them.
 *
 * Only events for which is_sb_event() is true are attached.
 */
static void account_pmu_sb_event(struct perf_event *event)
{
	if (is_sb_event(event))
		attach_sb_event(event);
}
/*
 * Freq events need the tick to stay alive (see perf_event_task_tick).
 *
 * Bumps nr_freq_events under nr_freq_lock and, on the 0 -> 1 transition,
 * sets the TICK_DEP_BIT_PERF_EVENTS tick dependency. Compiles to an empty
 * function without CONFIG_NO_HZ_FULL.
 */
static void account_freq_event_nohz(void)
{
#ifdef CONFIG_NO_HZ_FULL
	/* Lock so we don't race with concurrent unaccount */
	spin_lock(&nr_freq_lock);
	if (atomic_inc_return(&nr_freq_events) == 1)
		tick_nohz_dep_set(TICK_DEP_BIT_PERF_EVENTS);
	spin_unlock(&nr_freq_lock);
#endif
}
/*
 * Account one more frequency-based event.
 *
 * When tick_nohz_full_enabled() is true, the locked path in
 * account_freq_event_nohz() is used; otherwise a plain atomic
 * increment of nr_freq_events is enough.
 */
static void account_freq_event(void)
{
	if (tick_nohz_full_enabled())
		account_freq_event_nohz();
	else
		atomic_inc(&nr_freq_events);
}
if (event->attach_state & (PERF_ATTACH_TASK | PERF_ATTACH_SCHED_CB))
inc = true; if (event->attr.mmap || event->attr.mmap_data)
atomic_inc(&nr_mmap_events); if (event->attr.build_id)
atomic_inc(&nr_build_id_events); if (event->attr.comm)
atomic_inc(&nr_comm_events); if (event->attr.namespaces)
atomic_inc(&nr_namespaces_events); if (event->attr.cgroup)
atomic_inc(&nr_cgroup_events); if (event->attr.task)
atomic_inc(&nr_task_events); if (event->attr.freq)
account_freq_event(); if (event->attr.context_switch) {
atomic_inc(&nr_switch_events);
inc = true;
} if (has_branch_stack(event))
inc = true; if (is_cgroup_event(event))
inc = true; if (event->attr.ksymbol)
atomic_inc(&nr_ksymbol_events); if (event->attr.bpf_event)
atomic_inc(&nr_bpf_events); if (event->attr.text_poke)
atomic_inc(&nr_text_poke_events);
if (inc) { /* * We need the mutex here because static_branch_enable() * must complete *before* the perf_sched_count increment * becomes visible.
*/ if (atomic_inc_not_zero(&perf_sched_count)) goto enabled;
mutex_lock(&perf_sched_mutex); if (!atomic_read(&perf_sched_count)) {
static_branch_enable(&perf_sched_events); /* * Guarantee that all CPUs observe the key change and * call the perf scheduling hooks before proceeding to * install events that need them.
*/
synchronize_rcu();
} /* * Now that we have waited for the sync_sched(), allow further * increments to by-pass the mutex.
*/
atomic_inc(&perf_sched_count);
mutex_unlock(&perf_sched_mutex);
}
enabled:
account_pmu_sb_event(event);
}
/* * Allocate and initialize an event structure
*/ staticstruct perf_event *
perf_event_alloc(struct perf_event_attr *attr, int cpu, struct task_struct *task, struct perf_event *group_leader, struct perf_event *parent_event,
perf_overflow_handler_t overflow_handler, void *context, int cgroup_fd)
{ struct pmu *pmu; struct hw_perf_event *hwc; long err = -EINVAL; int node;
if ((unsigned)cpu >= nr_cpu_ids) { if (!task || cpu != -1) return ERR_PTR(-EINVAL);
} if (attr->sigtrap && !task) { /* Requires a task: avoid signalling random tasks. */ return ERR_PTR(-EINVAL);
}
if (parent_event)
event->event_caps = parent_event->event_caps;
if (task) {
event->attach_state = PERF_ATTACH_TASK; /* * XXX pmu::event_init needs to know what task to account to * and we cannot use the ctx information because we need the * pmu before we get a ctx.
*/
event->hw.target = get_task_struct(task);
}
event->clock = &local_clock; if (parent_event)
event->clock = parent_event->clock;
/* * We do not support PERF_SAMPLE_READ on inherited events unless * PERF_SAMPLE_TID is also selected, which allows inherited events to * collect per-thread samples. * See perf_output_read().
*/ if (has_inherit_and_sample_read(attr) && !(attr->sample_type & PERF_SAMPLE_TID)) return ERR_PTR(-EINVAL);
if (!has_branch_stack(event))
event->attr.branch_sample_type = 0;
pmu = perf_init_event(event); if (IS_ERR(pmu)) return (void*)pmu;
/* * The PERF_ATTACH_TASK_DATA is set in the event_init()->hw_config(). * The attach should be right after the perf_init_event(). * Otherwise, the __free_event() would mistakenly detach the non-exist * perf_ctx_data because of the other errors between them.
*/ if (event->attach_state & PERF_ATTACH_TASK_DATA) {
err = attach_perf_ctx_data(event); if (err) return ERR_PTR(err);
}
/* * Disallow uncore-task events. Similarly, disallow uncore-cgroup * events (they don't make sense as the cgroup will be different * on other CPUs in the uncore mask).
*/ if (pmu->task_ctx_nr == perf_invalid_context && (task || cgroup_fd != -1)) return ERR_PTR(-EINVAL);
if (event->attr.aux_pause && event->attr.aux_resume) return ERR_PTR(-EINVAL);
if (event->attr.aux_start_paused) { if (!(pmu->capabilities & PERF_PMU_CAP_AUX_PAUSE)) return ERR_PTR(-EOPNOTSUPP);
event->hw.aux_paused = 1;
}
if (cgroup_fd != -1) {
err = perf_cgroup_connect(cgroup_fd, event, attr, group_leader); if (err) return ERR_PTR(err);
}
err = exclusive_event_init(event); if (err) return ERR_PTR(err);
if (has_addr_filter(event)) {
event->addr_filter_ranges = kcalloc(pmu->nr_addr_filters, sizeof(struct perf_addr_filter_range),
GFP_KERNEL); if (!event->addr_filter_ranges) return ERR_PTR(-ENOMEM);
/* * Clone the parent's vma offsets: they are valid until exec() * even if the mm is not shared with the parent.
*/ if (event->parent) { struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);
/* force hw sync on the address filters */
event->addr_filters_gen = 1;
}
if (!event->parent) { if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) {
err = get_callchain_buffers(attr->sample_max_stack); if (err) return ERR_PTR(err);
event->attach_state |= PERF_ATTACH_CALLCHAIN;
}
}
err = security_perf_event_alloc(event); if (err) return ERR_PTR(err);
/* symmetric to unaccount_event() in _free_event() */
account_event(event);
/* * Event creation should be under SRCU, see perf_pmu_unregister().
*/
lockdep_assert_held(&pmus_srcu);
scoped_guard (spinlock, &pmu->events_lock)
list_add(&event->pmu_list, &pmu->events);
/* Zero the full structure, so that a short copy will be nice. */
memset(attr, 0, sizeof(*attr));
ret = get_user(size, &uattr->size); if (ret) return ret;
/* ABI compatibility quirk: */ if (!size)
size = PERF_ATTR_SIZE_VER0; if (size < PERF_ATTR_SIZE_VER0 || size > PAGE_SIZE) goto err_size;
ret = copy_struct_from_user(attr, sizeof(*attr), uattr, size); if (ret) { if (ret == -E2BIG) goto err_size; return ret;
}
attr->size = size;
if (attr->__reserved_1 || attr->__reserved_2 || attr->__reserved_3) return -EINVAL;
if (attr->sample_type & ~(PERF_SAMPLE_MAX-1)) return -EINVAL;
if (attr->read_format & ~(PERF_FORMAT_MAX-1)) return -EINVAL;
if (attr->sample_type & PERF_SAMPLE_BRANCH_STACK) {
u64 mask = attr->branch_sample_type;
/* only using defined bits */ if (mask & ~(PERF_SAMPLE_BRANCH_MAX-1)) return -EINVAL;
/* at least one branch bit must be set */ if (!(mask & ~PERF_SAMPLE_BRANCH_PLM_ALL)) return -EINVAL;
/* propagate priv level, when not set for branch */ if (!(mask & PERF_SAMPLE_BRANCH_PLM_ALL)) {
/* exclude_kernel checked on syscall entry */ if (!attr->exclude_kernel)
mask |= PERF_SAMPLE_BRANCH_KERNEL;
if (!attr->exclude_user)
mask |= PERF_SAMPLE_BRANCH_USER;
if (!attr->exclude_hv)
mask |= PERF_SAMPLE_BRANCH_HV; /* * adjust user setting (for HW filter setup)
*/
attr->branch_sample_type = mask;
} /* privileged levels capture (kernel, hv): check permissions */ if (mask & PERF_SAMPLE_BRANCH_PERM_PLM) {
ret = perf_allow_kernel(); if (ret) return ret;
}
}
if (attr->sample_type & PERF_SAMPLE_REGS_USER) {
ret = perf_reg_validate(attr->sample_regs_user); if (ret) return ret;
}
if (attr->sample_type & PERF_SAMPLE_STACK_USER) { if (!arch_perf_have_user_stack_dump()) return -ENOSYS;
/* * We have __u32 type for the size, but so far * we can only use __u16 as maximum due to the * __u16 sample size limit.
*/ if (attr->sample_stack_user >= USHRT_MAX) return -EINVAL; elseif (!IS_ALIGNED(attr->sample_stack_user, sizeof(u64))) return -EINVAL;
}
if (!attr->sample_max_stack)
attr->sample_max_stack = sysctl_perf_event_max_stack;
if (attr->sample_type & PERF_SAMPLE_REGS_INTR)
ret = perf_reg_validate(attr->sample_regs_intr);
#ifndef CONFIG_CGROUP_PERF if (attr->sample_type & PERF_SAMPLE_CGROUP) return -EINVAL; #endif if ((attr->sample_type & PERF_SAMPLE_WEIGHT) &&
(attr->sample_type & PERF_SAMPLE_WEIGHT_STRUCT)) return -EINVAL;
if (!attr->inherit && attr->inherit_thread) return -EINVAL;
if (attr->remove_on_exec && attr->enable_on_exec) return -EINVAL;
if (attr->sigtrap && !attr->remove_on_exec) return -EINVAL;
out: return ret;
err_size:
put_user(sizeof(*attr), &uattr->size);
ret = -E2BIG; goto out;
}
staticvoid mutex_lock_double(struct mutex *a, struct mutex *b)
{ if (b < a)
swap(a, b);
/* * If its not a per-cpu rb, it must be the same task.
*/ if (output_event->cpu == -1 && output_event->hw.target != event->hw.target) goto out;
/* * Mixing clocks in the same buffer is trouble you don't need.
*/ if (output_event->clock != event->clock) goto out;
/* * Either writing ring buffer from beginning or from end. * Mixing is not allowed.
*/ if (is_write_backward(output_event) != is_write_backward(event)) goto out;
/* * If both events generate aux data, they must be on the same PMU
*/ if (has_aux(event) && has_aux(output_event) &&
event->pmu != output_event->pmu) goto out;
/* * Hold both mmap_mutex to serialize against perf_mmap_close(). Since * output_event is already on rb->event_list, and the list iteration * restarts after every removal, it is guaranteed this new event is * observed *OR* if output_event is already removed, it's guaranteed we * observe !rb->mmap_count.
*/
mutex_lock_double(&event->mmap_mutex, &output_event->mmap_mutex);
set: /* Can't redirect output if we've got an active mmap() */ if (atomic_read(&event->mmap_count)) goto unlock;
if (output_event) { if (output_event->state <= PERF_EVENT_STATE_REVOKED) goto unlock;
/* get the rb we want to redirect to */
rb = ring_buffer_get(output_event); if (!rb) goto unlock;
/* did we race against perf_mmap_close() */ if (!atomic_read(&rb->mmap_count)) {
ring_buffer_put(rb); goto unlock;
}
}
ring_buffer_attach(event, rb);
ret = 0;
unlock:
mutex_unlock(&event->mmap_mutex); if (output_event)
mutex_unlock(&output_event->mmap_mutex);
if (attr->sigtrap) { /* * perf_event_attr::sigtrap sends signals to the other task. * Require the current task to also have CAP_KILL.
*/
rcu_read_lock();
is_capable &= ns_capable(__task_cred(task)->user_ns, CAP_KILL);
rcu_read_unlock();
/* * If the required capabilities aren't available, checks for * ptrace permissions: upgrade to ATTACH, since sending signals * can effectively change the target task.
*/
ptrace_mode = PTRACE_MODE_ATTACH_REALCREDS;
}
/* * Preserve ptrace permission check for backwards compatibility. The * ptrace check also includes checks that the current task and other * task have matching uids, and is therefore not done here explicitly.
*/ return is_capable || ptrace_may_access(task, ptrace_mode);
}
/** * sys_perf_event_open - open a performance event, associate it to a task/cpu * * @attr_uptr: event_id type attributes for monitoring/sampling * @pid: target pid * @cpu: target cpu * @group_fd: group leader event fd * @flags: perf event open flags
*/
SYSCALL_DEFINE5(perf_event_open, struct perf_event_attr __user *, attr_uptr,
pid_t, pid, int, cpu, int, group_fd, unsignedlong, flags)
{ struct perf_event *group_leader = NULL, *output_event = NULL; struct perf_event_pmu_context *pmu_ctx; struct perf_event *event, *sibling; struct perf_event_attr attr; struct perf_event_context *ctx; struct file *event_file = NULL; struct task_struct *task = NULL; struct pmu *pmu; int event_fd; int move_group = 0; int err; int f_flags = O_RDWR; int cgroup_fd = -1;
/* for future expandability... */ if (flags & ~PERF_FLAG_ALL) return -EINVAL;
err = perf_copy_attr(attr_uptr, &attr); if (err) return err;
/* Do we allow access to perf_event_open(2) ? */
err = security_perf_event_open(PERF_SECURITY_OPEN); if (err) return err;
if (!attr.exclude_kernel) {
err = perf_allow_kernel(); if (err) return err;
}
if (attr.namespaces) { if (!perfmon_capable()) return -EACCES;
}
if (attr.freq) { if (attr.sample_freq > sysctl_perf_event_sample_rate) return -EINVAL;
} else { if (attr.sample_period & (1ULL << 63)) return -EINVAL;
}
/* Only privileged users can get physical addresses */ if ((attr.sample_type & PERF_SAMPLE_PHYS_ADDR)) {
err = perf_allow_kernel(); if (err) return err;
}
/* REGS_INTR can leak data, lockdown must prevent this */ if (attr.sample_type & PERF_SAMPLE_REGS_INTR) {
err = security_locked_down(LOCKDOWN_PERF); if (err) return err;
}
/* * In cgroup mode, the pid argument is used to pass the fd * opened to the cgroup directory in cgroupfs. The cpu argument * designates the cpu on which to monitor threads from that * cgroup.
*/ if ((flags & PERF_FLAG_PID_CGROUP) && (pid == -1 || cpu == -1)) return -EINVAL;
if (flags & PERF_FLAG_FD_CLOEXEC)
f_flags |= O_CLOEXEC;
event_fd = get_unused_fd_flags(f_flags); if (event_fd < 0) return event_fd;
/* * Event creation should be under SRCU, see perf_pmu_unregister().
*/
guard(srcu)(&pmus_srcu);
if (is_sampling_event(event)) { if (event->pmu->capabilities & PERF_PMU_CAP_NO_INTERRUPT) {
err = -EOPNOTSUPP; goto err_alloc;
}
}
/* * Special case software events and allow them to be part of * any hardware group.
*/
pmu = event->pmu;
if (attr.use_clockid) {
err = perf_event_set_clock(event, attr.clockid); if (err) goto err_alloc;
}
if (pmu->task_ctx_nr == perf_sw_context)
event->event_caps |= PERF_EV_CAP_SOFTWARE;
if (task) {
err = down_read_interruptible(&task->signal->exec_update_lock); if (err) goto err_alloc;
/* * We must hold exec_update_lock across this and any potential * perf_install_in_context() call for this new event to * serialize against exec() altering our credentials (and the * perf_event_exit_task() that could imply).
*/
err = -EACCES; if (!perf_check_permission(&attr, task)) goto err_cred;
}
/* * Get the target context (task or percpu):
*/
ctx = find_get_context(task, event); if (IS_ERR(ctx)) {
err = PTR_ERR(ctx); goto err_cred;
}
if (!task) { /* * Check if the @cpu we're creating an event for is online. * * We use the perf_cpu_context::ctx::mutex to serialize against * the hotplug notifiers. See perf_event_{init,exit}_cpu().
*/ struct perf_cpu_context *cpuctx = per_cpu_ptr(&perf_cpu_context, event->cpu);
if (!cpuctx->online) {
err = -ENODEV; goto err_locked;
}
}
if (group_leader) {
err = -EINVAL;
/* * Do not allow a recursive hierarchy (this new sibling * becoming part of another group-sibling):
*/ if (group_leader->group_leader != group_leader) goto err_locked;
/* All events in a group should have the same clock */ if (group_leader->clock != event->clock) goto err_locked;
/* * Make sure we're both events for the same CPU; * grouping events for different CPUs is broken; since * you can never concurrently schedule them anyhow.
*/ if (group_leader->cpu != event->cpu) goto err_locked;
/* * Make sure we're both on the same context; either task or cpu.
*/ if (group_leader->ctx != ctx) goto err_locked;
/* * Only a group leader can be exclusive or pinned
*/ if (attr.exclusive || attr.pinned) goto err_locked;
if (is_software_event(event) &&
!in_software_context(group_leader)) { /* * If the event is a sw event, but the group_leader * is on hw context. * * Allow the addition of software events to hw * groups, this is safe because software events * never fail to schedule. * * Note the comment that goes with struct * perf_event_pmu_context.
*/
pmu = group_leader->pmu_ctx->pmu;
} elseif (!is_software_event(event)) { if (is_software_event(group_leader) &&
(group_leader->group_caps & PERF_EV_CAP_SOFTWARE)) { /* * In case the group is a pure software group, and we * try to add a hardware event, move the whole group to * the hardware context.
*/
move_group = 1;
}
/* Don't allow group of multiple hw events from different pmus */ if (!in_software_context(group_leader) &&
group_leader->pmu_ctx->pmu != pmu) goto err_locked;
}
}
/* * Now that we're certain of the pmu; find the pmu_ctx.
*/
pmu_ctx = find_get_pmu_context(pmu, ctx, event); if (IS_ERR(pmu_ctx)) {
err = PTR_ERR(pmu_ctx); goto err_locked;
}
event->pmu_ctx = pmu_ctx;
if (output_event) {
err = perf_event_set_output(event, output_event); if (err) goto err_context;
}
if (!perf_event_validate_size(event)) {
err = -E2BIG; goto err_context;
}
/* * Must be under the same ctx::mutex as perf_install_in_context(), * because we need to serialize with concurrent event creation.
*/ if (!exclusive_event_installable(event, ctx)) {
err = -EBUSY; goto err_context;
}
/* * Install the group siblings before the group leader. * * Because a group leader will try and install the entire group * (through the sibling list, which is still in-tact), we can * end up with siblings installed in the wrong context. * * By installing siblings first we NO-OP because they're not * reachable through the group lists.
*/
for_each_sibling_event(sibling, group_leader) {
sibling->pmu_ctx = pmu_ctx;
get_pmu_ctx(pmu_ctx);
perf_event__state_init(sibling);
perf_install_in_context(ctx, sibling, sibling->cpu);
}
/* * Removing from the context ends up with disabled * event. What we want here is event in the initial * startup state, ready to be add into new context.
*/
group_leader->pmu_ctx = pmu_ctx;
get_pmu_ctx(pmu_ctx);
perf_event__state_init(group_leader);
perf_install_in_context(ctx, group_leader, group_leader->cpu);
}
/* * Precalculate sample_data sizes; do while holding ctx::mutex such * that we're serialized against further additions and before * perf_install_in_context() which is the point the event is active and * can use these values.
*/
perf_event__header_size(event);
perf_event__id_header_size(event);
/* * File reference in group guarantees that group_leader has been * kept alive until we place the new event on the sibling_list. * This ensures destruction of the group leader will find * the pointer to itself in perf_group_detach().
*/
fd_install(event_fd, event_file); return event_fd;
/** * perf_event_create_kernel_counter * * @attr: attributes of the counter to create * @cpu: cpu in which the counter is bound * @task: task to profile (NULL for percpu) * @overflow_handler: callback to trigger when we hit the event * @context: context data could be used in overflow_handler callback
*/ struct perf_event *
perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, struct task_struct *task,
perf_overflow_handler_t overflow_handler, void *context)
{ struct perf_event_pmu_context *pmu_ctx; struct perf_event_context *ctx; struct perf_event *event; struct pmu *pmu; int err;
/* * Grouping is not supported for kernel events, neither is 'AUX', * make sure the caller's intentions are adjusted.
*/ if (attr->aux_output || attr->aux_action) return ERR_PTR(-EINVAL);
/* * Event creation should be under SRCU, see perf_pmu_unregister().
*/
guard(srcu)(&pmus_srcu);
if (!task) { /* * Check if the @cpu we're creating an event for is online. * * We use the perf_cpu_context::ctx::mutex to serialize against * the hotplug notifiers. See perf_event_{init,exit}_cpu().
*/ struct perf_cpu_context *cpuctx =
container_of(ctx, struct perf_cpu_context, ctx); if (!cpuctx->online) {
err = -ENODEV; goto err_pmu_ctx;
}
}
if (!exclusive_event_installable(event, ctx)) {
err = -EBUSY; goto err_pmu_ctx;
}
/* * Re-instate events in 2 passes. * * Skip over group leaders and only install siblings on this first * pass, siblings will not get enabled without a leader, however a * leader will enable its siblings, even if those are still on the old * context.
*/
list_for_each_entry_safe(event, tmp, events, migrate_entry) { if (event->group_leader == event) continue;
/* * Once all the siblings are setup properly, install the group leaders * to make it go.
*/
list_for_each_entry_safe(event, tmp, events, migrate_entry) {
list_del(&event->migrate_entry);
__perf_pmu_install_event(pmu, ctx, cpu, event);
}
}
void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu)
{ struct perf_event_context *src_ctx, *dst_ctx;
LIST_HEAD(events);
/* * Since per-cpu context is persistent, no need to grab an extra * reference.
*/
src_ctx = &per_cpu_ptr(&perf_cpu_context, src_cpu)->ctx;
dst_ctx = &per_cpu_ptr(&perf_cpu_context, dst_cpu)->ctx;
/* * See perf_event_ctx_lock() for comments on the details * of swizzling perf_event::ctx.
*/
mutex_lock_double(&src_ctx->mutex, &dst_ctx->mutex);
if (child_event->attr.inherit_stat) { struct task_struct *task = child_event->ctx->task;
if (task && task != TASK_TOMBSTONE)
perf_event_read_event(child_event, task);
}
child_val = perf_event_count(child_event, false);
/* * Add back the child's count to the parent's count:
*/
atomic64_add(child_val, &parent_event->child_count);
atomic64_add(child_event->total_time_enabled,
&parent_event->child_total_time_enabled);
atomic64_add(child_event->total_time_running,
&parent_event->child_total_time_running);
}
if (parent_event) { /* * Do not destroy the 'original' grouping; because of the * context switch optimization the original events could've * ended up in a random child task. * * If we were to destroy the original group, all group related * operations would cease to function properly after this * random child dies. * * Do destroy all inherited groups, we don't care about those * and being thorough is better.
*/
detach_flags |= DETACH_GROUP | DETACH_CHILD;
mutex_lock(&parent_event->child_mutex); /* PERF_ATTACH_ITRACE might be set concurrently */
attach_state = READ_ONCE(event->attach_state);
}
if (revoke)
detach_flags |= DETACH_GROUP | DETACH_REVOKE;
perf_remove_from_context(event, detach_flags); /* * Child events can be freed.
*/ if (parent_event) {
mutex_unlock(&parent_event->child_mutex);
/* * Match the refcount initialization. Make sure it doesn't happen * twice if pmu_detach_event() calls it on an already exited task.
*/ if (attach_state & PERF_ATTACH_CHILD) { /* * Kick perf_poll() for is_event_hup();
*/
perf_event_wakeup(parent_event); /* * pmu_detach_event() will have an extra refcount. * perf_pending_task() might have one too.
*/
put_event(event);
}
return;
}
/* * Parent events are governed by their filedesc, retain them.
*/
perf_event_wakeup(event);
}
ctx = perf_pin_task_context(task); if (!ctx) return;
/* * In order to reduce the amount of tricky in ctx tear-down, we hold * ctx::mutex over the entire thing. This serializes against almost * everything that wants to access the ctx. * * The exception is sys_perf_event_open() / * perf_event_create_kernel_count() which does find_get_context() * without ctx::mutex (it cannot because of the move_group double mutex * lock thing). See the comments in perf_install_in_context().
*/
mutex_lock(&ctx->mutex);
/* * In a single ctx::lock section, de-schedule the events and detach the * context from the task such that we cannot ever get it scheduled back * in.
*/
raw_spin_lock_irq(&ctx->lock); if (exit)
task_ctx_sched_out(ctx, NULL, EVENT_ALL);
/* * Now that the context is inactive, destroy the task <-> ctx relation * and mark the context dead.
*/
RCU_INIT_POINTER(task->perf_event_ctxp, NULL);
put_ctx(ctx); /* cannot be last */
WRITE_ONCE(ctx->task, TASK_TOMBSTONE);
put_task_struct(task); /* cannot be last */
/* * Report the task dead after unscheduling the events so that we * won't get any samples after PERF_RECORD_EXIT. We can however still * get a few PERF_RECORD_READ events.
*/ if (exit)
perf_event_task(task, ctx, 0);
if (!exit) { /* * perf_event_release_kernel() could still have a reference on * this context. In that case we must wait for these events to * have been freed (in particular all their references to this * task must've been dropped). * * Without this copy_process() will unconditionally free this * task (irrespective of its reference count) and * _free_event()'s put_task_struct(event->hw.target) will be a * use-after-free. * * Wait for all events to drop their context reference.
*/
wait_var_event(&ctx->refcount,
refcount_read(&ctx->refcount) == 1);
}
put_ctx(ctx);
}
/* * When a task exits, feed back event values to parent events. * * Can be called with exec_update_lock held when called from * setup_new_exec().
*/ void perf_event_exit_task(struct task_struct *task)
{ struct perf_event *event, *tmp;
/* * Ensure the list deletion is visible before we clear * the owner, closes a race against perf_release() where * we need to serialize on the owner->perf_event_mutex.
*/
smp_store_release(&event->owner, NULL);
}
mutex_unlock(&task->perf_event_mutex);
perf_event_exit_task_context(task, true);
/* * The perf_event_exit_task_context calls perf_event_task * with task's task_ctx, which generates EXIT events for * task contexts and sets task->perf_event_ctxp[] to NULL. * At this point we need to send EXIT events to cpu contexts.
*/
perf_event_task(task, NULL, 0);
/* * Detach the perf_ctx_data for the system-wide event.
*/
guard(percpu_read)(&global_ctx_data_rwsem);
detach_task_ctx_data(task);
}
/*
 * Tear down a context that perf_event_init_task() created by inheritance;
 * used by fork() when it fails.
 *
 * The task has never lived, but its context and events were already exposed
 * through the child_list, so the teardown still has to be done carefully.
 */
void perf_event_free_task(struct task_struct *task)
{
	/* perf_event_exit_task() passes true here; this path passes false. */
	perf_event_exit_task_context(task, false);
}
/* * Instead of creating recursive hierarchies of events, * we link inherited events back to the original parent, * which has a filp for sure, which we use as the reference * count:
*/ if (parent_event->parent)
parent_event = parent_event->parent;
if (parent_event->state <= PERF_EVENT_STATE_REVOKED) return NULL;
/* * Event creation should be under SRCU, see perf_pmu_unregister().
*/
guard(srcu)(&pmus_srcu);
/* * is_orphaned_event() and list_add_tail(&parent_event->child_list) * must be under the same lock in order to serialize against * perf_event_release_kernel(), such that either we must observe * is_orphaned_event() or they will observe us on the child_list.
*/
mutex_lock(&parent_event->child_mutex); if (is_orphaned_event(parent_event) ||
!atomic_long_inc_not_zero(&parent_event->refcount)) {
mutex_unlock(&parent_event->child_mutex);
free_event(child_event); return NULL;
}
/* * Make the child state follow the state of the parent event, * not its attr.disabled bit. We hold the parent's mutex, * so we won't race with perf_event_{en, dis}able_family.
*/ if (parent_state >= PERF_EVENT_STATE_INACTIVE)
child_event->state = PERF_EVENT_STATE_INACTIVE; else
child_event->state = PERF_EVENT_STATE_OFF;
/* * Link it up in the child's context:
*/
raw_spin_lock_irqsave(&child_ctx->lock, flags);
add_event_to_ctx(child_event, child_ctx);
child_event->attach_state |= PERF_ATTACH_CHILD;
raw_spin_unlock_irqrestore(&child_ctx->lock, flags);
/* * Link this into the parent event's child list
*/
list_add_tail(&child_event->child_list, &parent_event->child_list);
mutex_unlock(&parent_event->child_mutex);
return child_event;
}
/*
 * Inherits an event group.
 *
 * The group leader @parent_event is inherited first, then each of its
 * siblings is inherited with the new leader as its group leader.
 *
 * This will quietly suppress orphaned events; !inherit_event() is not an error.
 * This matches with perf_event_release_kernel() removing all child events.
 *
 * Returns:
 *  - 0 on success
 *  - <0 on error
 */
static int inherit_group(struct perf_event *parent_event,
			 struct task_struct *parent,
			 struct perf_event_context *parent_ctx,
			 struct task_struct *child,
			 struct perf_event_context *child_ctx)
{
	struct perf_event *leader;
	struct perf_event *sub;
	struct perf_event *child_ctr;

	leader = inherit_event(parent_event, parent, parent_ctx,
			       child, NULL, child_ctx);
	if (IS_ERR(leader))
		return PTR_ERR(leader);

	/*
	 * @leader can be NULL here because of is_orphaned_event(). In this
	 * case inherit_event() will create individual events, similar to what
	 * perf_group_detach() would do anyway.
	 */
	for_each_sibling_event(sub, parent_event) {
		child_ctr = inherit_event(sub, parent, parent_ctx,
					  child, leader, child_ctx);
		if (IS_ERR(child_ctr))
			return PTR_ERR(child_ctr);

		/*
		 * If the parent sibling used the parent leader as its AUX
		 * event, the inherited sibling must be linked to the inherited
		 * leader; failing to do so is an error.
		 */
		if (sub->aux_event == parent_event && child_ctr &&
		    !perf_get_aux_event(child_ctr, leader))
			return -EINVAL;
	}

	/* Only a leader that was actually inherited carries the generation. */
	if (leader)
		leader->group_generation = parent_event->group_generation;

	return 0;
}
/*
 * Creates the child task context and tries to inherit the event-group.
 *
 * Clears @inherited_all on !attr.inherit or error. Note that we'll leave
 * inherited_all set when we 'fail' to inherit an orphaned event; this is
 * consistent with perf_event_release_kernel() removing all child events.
 *
 * Returns:
 *  - 0 on success
 *  - <0 on error
 */
static int
inherit_task_group(struct perf_event *event, struct task_struct *parent,
		   struct perf_event_context *parent_ctx,
		   struct task_struct *child,
		   u64 clone_flags, int *inherited_all)
{
	struct perf_event_context *child_ctx;
	int ret;

	if (!event->attr.inherit ||
	    (event->attr.inherit_thread && !(clone_flags & CLONE_THREAD)) ||
	    /* Do not inherit if sigtrap and signal handlers were cleared. */
	    (event->attr.sigtrap && (clone_flags & CLONE_CLEAR_SIGHAND))) {
		/* Skipping an event is not an error, but the clone is partial. */
		*inherited_all = 0;
		return 0;
	}

	child_ctx = child->perf_event_ctxp;
	if (!child_ctx) {
		/*
		 * This is executed from the parent task context, so
		 * inherit events that have been marked for cloning.
		 * First allocate and initialize a context for the
		 * child.
		 */
		child_ctx = alloc_perf_context(child);
		if (!child_ctx)
			return -ENOMEM;

		child->perf_event_ctxp = child_ctx;
	}

	ret = inherit_group(event, parent, parent_ctx, child, child_ctx);
	if (ret)
		*inherited_all = 0;

	return ret;
}
/* * Initialize the perf_event context in task_struct
*/ staticint perf_event_init_context(struct task_struct *child, u64 clone_flags)
{ struct perf_event_context *child_ctx, *parent_ctx; struct perf_event_context *cloned_ctx; struct perf_event *event; struct task_struct *parent = current; int inherited_all = 1; unsignedlong flags; int ret = 0;
if (likely(!parent->perf_event_ctxp)) return 0;
/* * If the parent's context is a clone, pin it so it won't get * swapped under us.
*/
parent_ctx = perf_pin_task_context(parent); if (!parent_ctx) return 0;
/* * No need to check if parent_ctx != NULL here; since we saw * it non-NULL earlier, the only reason for it to become NULL * is if we exit, and since we're currently in the middle of * a fork we can't be exiting at the same time.
*/
/* * Lock the parent list. No need to lock the child - not PID * hashed yet and not running, so nobody can access it.
*/
mutex_lock(&parent_ctx->mutex);
/* * We dont have to disable NMIs - we are only looking at * the list, not manipulating it:
*/
perf_event_groups_for_each(event, &parent_ctx->pinned_groups) {
ret = inherit_task_group(event, parent, parent_ctx,
child, clone_flags, &inherited_all); if (ret) goto out_unlock;
}
/* * We can't hold ctx->lock when iterating the ->flexible_group list due * to allocations, but we need to prevent rotation because * rotate_ctx() will change the list from interrupt context.
*/
raw_spin_lock_irqsave(&parent_ctx->lock, flags);
parent_ctx->rotate_disable = 1;
raw_spin_unlock_irqrestore(&parent_ctx->lock, flags);
perf_event_groups_for_each(event, &parent_ctx->flexible_groups) {
ret = inherit_task_group(event, parent, parent_ctx,
child, clone_flags, &inherited_all); if (ret) goto out_unlock;
}
if (child_ctx && inherited_all) { /* * Mark the child context as a clone of the parent * context, or of whatever the parent is a clone of. * * Note that if the parent is a clone, the holding of * parent_ctx->lock avoids it from being uncloned.
*/
cloned_ctx = parent_ctx->parent_ctx; if (cloned_ctx) {
child_ctx->parent_ctx = cloned_ctx;
child_ctx->parent_gen = parent_ctx->parent_gen;
} else {
child_ctx->parent_ctx = parent_ctx;
child_ctx->parent_gen = parent_ctx->generation;
}
get_ctx(child_ctx->parent_ctx);
}
// XXX simplify cpuctx->online
mutex_lock(&pmus_lock); /* * Clear the cpumasks, and migrate to other CPUs if possible. * Must be invoked before the __perf_event_exit_context.
*/
perf_event_clear_cpumask(cpu);
cpuctx = per_cpu_ptr(&perf_cpu_context, cpu);
ctx = &cpuctx->ctx;
/* * Early boot stage, the cpumask hasn't been set yet. * The perf_online_<domain>_masks includes the first CPU of each domain. * Always unconditionally set the boot CPU for the perf_online_<domain>_masks.
*/ if (cpumask_empty(perf_online_mask)) { for (scope = PERF_PMU_SCOPE_NONE + 1; scope < PERF_PMU_MAX_SCOPE; scope++) {
pmu_cpumask = perf_scope_cpumask(scope); if (WARN_ON_ONCE(!pmu_cpumask)) continue;
cpumask_set_cpu(cpu, pmu_cpumask);
} goto end;
}
/* * Run the perf reboot notifier at the very last possible moment so that * the generic watchdog code runs as long as possible.
*/ staticstruct notifier_block perf_reboot_notifier = {
.notifier_call = perf_reboot,
.priority = INT_MIN,
};
/* * Build time assertion that we keep the data_head at the intended * location. IOW, validation we got the __reserved[] size right.
*/
BUILD_BUG_ON((offsetof(struct perf_event_mmap_page, data_head))
!= 1024);
}
struct cgroup_subsys perf_event_cgrp_subsys = {
.css_alloc = perf_cgroup_css_alloc,
.css_free = perf_cgroup_css_free,
.css_online = perf_cgroup_css_online,
.attach = perf_cgroup_attach, /* * Implicitly enable on dfl hierarchy so that perf events can * always be filtered by cgroup2 path as long as perf_event * controller is not mounted on a legacy hierarchy.
*/
.implicit_on_dfl = true,
.threaded = true,
}; #endif/* CONFIG_CGROUP_PERF */
¤ Diese beiden folgenden Angebotsgruppen bietet das Unternehmen0.538Angebot
(Wie Sie bei der Firma Beratungs- und Dienstleistungen beauftragen können 2026-04-27)
¤
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereitgestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.