if (p) { /* -EAGAIN */ if (task_cpu(p) != smp_processor_id()) return;
/* * Now that we're on right CPU with IRQs disabled, we can test * if we hit the right task without races.
*/
tfc->ret = -ESRCH; /* No such (running) process */ if (p != current) return;
}
tfc->ret = tfc->func(tfc->info);
}
/**
 * task_function_call - call a function on the cpu on which a task runs
 * @p: the task to evaluate
 * @func: the function to be called
 * @info: the function call argument
 *
 * Calls the function @func when the task is currently running. This might
 * be on the current CPU, which just calls the function directly.  This will
 * retry due to any failures in smp_call_function_single(), such as if the
 * task_cpu() goes offline concurrently.
 *
 * returns @func return value or -ESRCH or -ENXIO when the process isn't running
 */
static int
task_function_call(struct task_struct *p, remote_function_f func, void *info)
{
	struct remote_function_call data = {
		.p	= p,
		.func	= func,
		.info	= info,
		.ret	= -EAGAIN,	/* overwritten by remote_function() on success */
	};
	int ret;

	for (;;) {
		ret = smp_call_function_single(task_cpu(p), remote_function,
					       &data, 1);
		if (!ret)
			ret = data.ret;

		/* -EAGAIN: raced with task migration; try again on its new CPU */
		if (ret != -EAGAIN)
			break;

		cond_resched();
	}

	return ret;
}
/** * cpu_function_call - call a function on the cpu * @cpu: target cpu to queue this function * @func: the function to be called * @info: the function call argument * * Calls the function @func on the remote cpu. * * returns: @func return value or -ENXIO when the cpu is offline
*/ staticint cpu_function_call(int cpu, remote_function_f func, void *info)
{ struct remote_function_call data = {
.p = NULL,
.func = func,
.info = info,
.ret = -ENXIO, /* No such CPU */
};
static inline void __perf_ctx_unlock(struct perf_event_context *ctx)
{
	/*
	 * If ctx_sched_in() didn't again set any ALL flags, clean up
	 * after ctx_sched_out() by clearing is_active.
	 */
	if (ctx->is_active & EVENT_FROZEN) {
		if (!(ctx->is_active & EVENT_ALL))
			ctx->is_active = 0;
		else
			ctx->is_active &= ~EVENT_FROZEN;
	}
	raw_spin_unlock(&ctx->lock);
}
/* * On task ctx scheduling... * * When !ctx->nr_events a task context will not be scheduled. This means * we can disable the scheduler hooks (for performance) without leaving * pending task ctx state. * * This however results in two special cases: * * - removing the last event from a task ctx; this is relatively straight * forward and is done in __perf_remove_from_context. * * - adding the first event to a task ctx; this is tricky because we cannot * rely on ctx->is_active and therefore cannot use event_function_call(). * See perf_install_in_context(). * * If ctx->nr_events, then ctx->is_active and cpuctx->task_ctx are set.
*/
perf_ctx_lock(cpuctx, task_ctx); /* * Since we do the IPI call without holding ctx->lock things can have * changed, double check we hit the task we set out to hit.
*/ if (ctx->task) { if (ctx->task != current) {
ret = -ESRCH; goto unlock;
}
/* * We only use event_function_call() on established contexts, * and event_function() is only ever called when active (or * rather, we'll have bailed in task_function_call() or the * above ctx->task != current test), therefore we must have * ctx->is_active here.
*/
WARN_ON_ONCE(!ctx->is_active); /* * And since we have ctx->is_active, cpuctx->task_ctx must * match.
*/
WARN_ON_ONCE(task_ctx != ctx);
} else {
WARN_ON_ONCE(&cpuctx->ctx != ctx);
}
if (!event->parent) { /* * If this is a !child event, we must hold ctx::mutex to * stabilize the event->ctx relation. See * perf_event_ctx_lock().
*/
lockdep_assert_held(&ctx->mutex);
}
if (!task) {
cpu_function_call(event->cpu, event_function, &efs); return;
}
if (task == TASK_TOMBSTONE) return;
again: if (!task_function_call(task, event_function, &efs)) return;
local_irq_disable();
cpuctx = this_cpu_ptr(&perf_cpu_context);
perf_ctx_lock(cpuctx, ctx); /* * Reload the task pointer, it might have been changed by * a concurrent perf_event_context_sched_out().
*/
task = ctx->task; if (task == TASK_TOMBSTONE) goto unlock; if (ctx->is_active) {
perf_ctx_unlock(cpuctx, ctx);
local_irq_enable(); goto again;
}
func(event, NULL, ctx, data);
unlock:
perf_ctx_unlock(cpuctx, ctx);
local_irq_enable();
}
/* * Similar to event_function_call() + event_function(), but hard assumes IRQs * are already disabled and we're on the right CPU.
*/ staticvoid event_function_local(struct perf_event *event, event_f func, void *data)
{ struct perf_event_context *ctx = event->ctx; struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); struct task_struct *task = READ_ONCE(ctx->task); struct perf_event_context *task_ctx = NULL;
lockdep_assert_irqs_disabled();
if (task) { if (task == TASK_TOMBSTONE) return;
task_ctx = ctx;
}
perf_ctx_lock(cpuctx, task_ctx);
task = ctx->task; if (task == TASK_TOMBSTONE) goto unlock;
if (task) { /* * We must be either inactive or active and the right task, * otherwise we're screwed, since we cannot IPI to somewhere * else.
*/ if (ctx->is_active) { if (WARN_ON_ONCE(task != current)) goto unlock;
/*
 * perf event paranoia level:
 *  -1 - not paranoid at all
 *   0 - disallow raw tracepoint access for unpriv
 *   1 - disallow cpu events for unpriv
 *   2 - disallow kernel profiling for unpriv
 */
int sysctl_perf_event_paranoid __read_mostly = 2;

/* Minimum for 512 kiB + 1 user control page. 'free' kiB per user. */
static int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024);

/*
 * perf samples are done in some very critical code paths (NMIs).
 * If they take too much CPU time, the system can lock up and not
 * get any real work done.  This will drop the sample rate when
 * we detect that events are taking too long.
 */
#define NR_ACCUMULATED_SAMPLES 128
static DEFINE_PER_CPU(u64, running_sample_length);
/* Decay the counter by 1 average sample. */
running_len = __this_cpu_read(running_sample_length);
running_len -= running_len/NR_ACCUMULATED_SAMPLES;
running_len += sample_len_ns;
__this_cpu_write(running_sample_length, running_len);
/* * Note: this will be biased artificially low until we have * seen NR_ACCUMULATED_SAMPLES. Doing it this way keeps us * from having to maintain a count.
*/
avg_len = running_len/NR_ACCUMULATED_SAMPLES; if (avg_len <= max_len) return;
/* * Compute a throttle threshold 25% below the current duration.
*/
avg_len += avg_len / 4;
max = (TICK_NSEC / 100) * sysctl_perf_cpu_time_max_percent; if (avg_len < max)
max /= (u32)avg_len; else
max = 1;
sysctl_perf_event_sample_rate = max * HZ;
perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
if (!irq_work_queue(&perf_duration_work)) {
early_printk("perf: interrupt took too long (%lld > %lld), lowering " "kernel.perf_event_max_sample_rate to %d\n",
__report_avg, __report_allowed,
sysctl_perf_event_sample_rate);
}
}
/* * State based event timekeeping... * * The basic idea is to use event->state to determine which (if any) time * fields to increment with the current delta. This means we only need to * update timestamps when we change state or when they are explicitly requested * (read). * * Event groups make things a little more complicated, but not terribly so. The * rules for a group are that if the group leader is OFF the entire group is * OFF, irrespective of what the group member states are. This results in * __perf_effective_state(). * * A further ramification is that when a group leader flips between OFF and * !OFF, we need to update all group member times. * * * NOTE: perf_event_time() is based on the (cgroup) context time, and thus we * need to make sure the relevant context time is updated before we try and * update our timestamps.
*/
perf_event_update_time(event); /* * If a group leader gets enabled/disabled all its siblings * are affected too.
*/ if ((event->state < 0) ^ (state < 0))
perf_event_update_sibling_time(event);
WRITE_ONCE(event->state, state);
}
/*
 * UP store-release, load-acquire
 */

#define __store_release(ptr, val)	\
do {					\
	barrier();			\
	WRITE_ONCE(*(ptr), (val));	\
} while (0)
/* @event doesn't care about cgroup */ if (!event->cgrp) returntrue;
/* wants specific cgroup scope but @cpuctx isn't associated with any */ if (!cpuctx->cgrp) returnfalse;
/* * Cgroup scoping is recursive. An event enabled for a cgroup is * also enabled for all its descendant cgroups. If @cpuctx's * cgroup is a descendant of @event's (the test covers identity * case), it's a match.
*/ return cgroup_is_descendant(cpuctx->cgrp->css.cgroup,
event->cgrp->css.cgroup);
}
/* * ensure we access cgroup data only when needed and * when we know the cgroup is pinned (css_get)
*/ if (!is_cgroup_event(event)) return;
info = this_cpu_ptr(event->cgrp->info); /* * Do not update time when cgroup is not active
*/ if (info->active)
__update_cgrp_time(info, perf_clock(), true);
}
/*
 * reschedule events based on the cgroup constraint of task.
 */
static void perf_cgroup_switch(struct task_struct *task)
{
	struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
	struct perf_cgroup *cgrp;

	/*
	 * cpuctx->cgrp is set when the first cgroup event enabled,
	 * and is cleared when the last cgroup event disabled.
	 */
	if (READ_ONCE(cpuctx->cgrp) == NULL)
		return;

	cgrp = perf_cgroup_from_task(task, NULL);
	if (READ_ONCE(cpuctx->cgrp) == cgrp)
		return;

	guard(perf_ctx_lock)(cpuctx, cpuctx->task_ctx);
	/*
	 * Re-check, could've raced vs perf_remove_from_context().
	 */
	if (READ_ONCE(cpuctx->cgrp) == NULL)
		return;

	WARN_ON_ONCE(cpuctx->ctx.nr_cgroups == 0);

	perf_ctx_disable(&cpuctx->ctx, true);

	ctx_sched_out(&cpuctx->ctx, NULL, EVENT_ALL|EVENT_CGROUP);
	/*
	 * must not be done before ctxswout due
	 * to update_cgrp_time_from_cpuctx() in
	 * ctx_sched_out()
	 */
	cpuctx->cgrp = cgrp;
	/*
	 * set cgrp before ctxsw in to allow
	 * perf_cgroup_set_timestamp() in ctx_sched_in()
	 * to not have to pass task around
	 */
	ctx_sched_in(&cpuctx->ctx, NULL, EVENT_ALL|EVENT_CGROUP);

	perf_ctx_enable(&cpuctx->ctx, true);
}
staticint perf_cgroup_ensure_storage(struct perf_event *event, struct cgroup_subsys_state *css)
{ struct perf_cpu_context *cpuctx; struct perf_event **storage; int cpu, heap_size, ret = 0;
/* * Allow storage to have sufficient space for an iterator for each * possibly nested cgroup plus an iterator for events with no cgroup.
*/ for (heap_size = 1; css; css = css->parent)
heap_size++;
/* * all events in a group must monitor * the same cgroup because a task belongs * to only one perf cgroup at a time
*/ if (group_leader && group_leader->cgrp != cgrp) {
perf_detach_cgroup(event);
ret = -EINVAL;
} return ret;
}
/* * set default to be dependent on timer tick just * like original code
*/ #define PERF_CPU_HRTIMER (1000 / HZ) /* * function must be called with interrupts disabled
*/ staticenum hrtimer_restart perf_mux_hrtimer_handler(struct hrtimer *hr)
{ struct perf_cpu_pmu_context *cpc; bool rotations;
/* * check default is sane, if not set then force to * default interval (1/tick)
*/
interval = pmu->hrtimer_interval_ms; if (interval < 1)
interval = pmu->hrtimer_interval_ms = PERF_CPU_HRTIMER;
/* Drop a reference on @ctx; free it (via RCU) when the last reference goes. */
static void put_ctx(struct perf_event_context *ctx)
{
	if (refcount_dec_and_test(&ctx->refcount)) {
		if (ctx->parent_ctx)
			put_ctx(ctx->parent_ctx);
		if (ctx->task && ctx->task != TASK_TOMBSTONE)
			put_task_struct(ctx->task);
		call_rcu(&ctx->rcu_head, free_ctx);
	} else {
		smp_mb__after_atomic(); /* pairs with wait_var_event() */
		if (ctx->task == TASK_TOMBSTONE)
			wake_up_var(&ctx->refcount);
	}
}
/* * Because of perf_event::ctx migration in sys_perf_event_open::move_group and * perf_pmu_migrate_context() we need some magic. * * Those places that change perf_event::ctx will hold both * perf_event_ctx::mutex of the 'old' and 'new' ctx value. * * Lock ordering is by mutex address. There are two other sites where * perf_event_context::mutex nests and those are: * * - perf_event_exit_task_context() [ child , 0 ] * perf_event_exit_event() * put_event() [ parent, 1 ] * * - perf_event_init_context() [ parent, 0 ] * inherit_task_group() * inherit_group() * inherit_event() * perf_event_alloc() * perf_init_event() * perf_try_init_event() [ child , 1 ] * * While it appears there is an obvious deadlock here -- the parent and child * nesting levels are inverted between the two. This is in fact safe because * life-time rules separate them. That is an exiting task cannot fork, and a * spawning task cannot (yet) exit. * * But remember that these are parent<->child context relations, and * migration does not affect children, therefore these two orderings should not * interact. * * The change in perf_event::ctx does not affect children (as claimed above) * because the sys_perf_event_open() case will install a new event and break * the ctx parent<->child relation, and perf_pmu_migrate_context() is only * concerned with cpuctx and that doesn't have children. * * The places that change perf_event::ctx will issue: * * perf_remove_from_context(); * synchronize_rcu(); * perf_install_in_context(); * * to affect the change. The remove_from_context() + synchronize_rcu() should * quiesce the event, after which we can install it in the new location. This * means that only external vectors (perf_fops, prctl) can perturb the event * while in transit. Therefore all such accessors should also acquire * perf_event_context::mutex to serialize against this. 
* * However; because event->ctx can change while we're waiting to acquire * ctx->mutex we must be careful and use the below perf_event_ctx_lock() * function. * * Lock order: * exec_update_lock * task_struct::perf_event_mutex * perf_event_context::mutex * perf_event::child_mutex; * perf_event_context::lock * mmap_lock * perf_event::mmap_mutex * perf_buffer::aux_mutex * perf_addr_filters_head::lock * * cpu_hotplug_lock * pmus_lock * cpuctx->mutex / perf_event_context::mutex
*/ staticstruct perf_event_context *
perf_event_ctx_lock_nested(struct perf_event *event, int nesting)
{ struct perf_event_context *ctx;
/*
 * This must be done under the ctx->lock, such as to serialize against
 * context_equiv(), therefore we cannot call put_ctx() since that might end up
 * calling scheduler related locks and ctx->lock nests inside those.
 */
static __must_check struct perf_event_context *
unclone_ctx(struct perf_event_context *ctx)
{
	struct perf_event_context *detached_parent;

	lockdep_assert_held(&ctx->lock);

	/* Detach from the parent; hand the old reference back to the caller. */
	detached_parent = ctx->parent_ctx;
	if (detached_parent)
		ctx->parent_ctx = NULL;

	ctx->generation++;

	return detached_parent;
}
/* Resolve the pid of @p, of the given @type, in @event's pid namespace. */
static u32 perf_event_pid_type(struct perf_event *event, struct task_struct *p,
			       enum pid_type type)
{
	u32 pid_nr;

	/*
	 * Only top level events carry the pid namespace they were
	 * created in; resolve children through their parent.
	 */
	if (event->parent)
		event = event->parent;

	pid_nr = __task_pid_nr_ns(p, type, event->ns);

	/* Map 0 (idle thread, or a task living in another ns) to -1. */
	if (!pid_nr && !pid_alive(p))
		pid_nr = -1;

	return pid_nr;
}
/*
 * If we inherit events we want to return the parent event id
 * to userspace.
 */
static u64 primary_event_id(struct perf_event *event)
{
	return event->parent ? event->parent->id : event->id;
}
/* * Get the perf_event_context for a task and lock it. * * This has to cope with the fact that until it is locked, * the context could get moved to another task.
*/ staticstruct perf_event_context *
perf_lock_task_context(struct task_struct *task, unsignedlong *flags)
{ struct perf_event_context *ctx;
retry: /* * One of the few rules of preemptible RCU is that one cannot do * rcu_read_unlock() while holding a scheduler (or nested) lock when * part of the read side critical section was irqs-enabled -- see * rcu_read_unlock_special(). * * Since ctx->lock nests under rq->lock we must ensure the entire read * side critical section has interrupts disabled.
*/
local_irq_save(*flags);
rcu_read_lock();
ctx = rcu_dereference(task->perf_event_ctxp); if (ctx) { /* * If this context is a clone of another, it might * get swapped for another underneath us by * perf_event_task_sched_out, though the * rcu_read_lock() protects us from any context * getting freed. Lock the context and check if it * got swapped before we could get the lock, and retry * if so. If we locked the right context, then it * can't get swapped on us any more.
*/
raw_spin_lock(&ctx->lock); if (ctx != rcu_dereference(task->perf_event_ctxp)) {
raw_spin_unlock(&ctx->lock);
rcu_read_unlock();
local_irq_restore(*flags); goto retry;
}
/* * Get the context for a task and increment its pin_count so it * can't get swapped to another task. This also increments its * reference count so that the context can't get freed.
*/ staticstruct perf_event_context *
perf_pin_task_context(struct task_struct *task)
{ struct perf_event_context *ctx; unsignedlong flags;
/*
 * Update the record of the current time in a context.
 */
static void __update_context_time(struct perf_event_context *ctx, bool adv)
{
	u64 now = perf_clock();

	lockdep_assert_held(&ctx->lock);

	if (adv)
		ctx->time += now - ctx->timestamp;
	ctx->timestamp = now;

	/*
	 * The above: time' = time + (now - timestamp), can be re-arranged
	 * into: time` = now + (time - timestamp), which gives a single value
	 * offset to compute future time without locks on.
	 *
	 * See perf_event_time_now(), which can be used from NMI context where
	 * it's (obviously) not possible to acquire ctx->lock in order to read
	 * both the above values in a consistent manner.
	 */
	WRITE_ONCE(ctx->timeoffset, ctx->time - ctx->timestamp);
}
/*
 * Helper function to initialize event group nodes.
 */
static void init_event_group(struct perf_event *event)
{
	RB_CLEAR_NODE(&event->group_node);
	event->group_index = 0;
}
/* * Extract pinned or flexible groups from the context * based on event attrs bits.
*/ staticstruct perf_event_groups *
get_event_groups(struct perf_event *event, struct perf_event_context *ctx)
{ if (event->attr.pinned) return &ctx->pinned_groups; else return &ctx->flexible_groups;
}
#ifdef CONFIG_CGROUP_PERF if (event->cgrp)
cgroup = event->cgrp->css.cgroup; #endif
return cgroup;
}
/* * Compare function for event groups; * * Implements complex key that first sorts by CPU and then by virtual index * which provides ordering when rotating groups for the same CPU.
*/ static __always_inline int
perf_event_groups_cmp(constint left_cpu, conststruct pmu *left_pmu, conststruct cgroup *left_cgroup, const u64 left_group_index, conststruct perf_event *right)
{ if (left_cpu < right->cpu) return -1; if (left_cpu > right->cpu) return 1;
if (left_pmu) { if (left_pmu < right->pmu_ctx->pmu) return -1; if (left_pmu > right->pmu_ctx->pmu) return 1;
}
if (left_cgroup != right_cgroup) { if (!left_cgroup) { /* * Left has no cgroup but right does, no * cgroups come first.
*/ return -1;
} if (!right_cgroup) { /* * Right has no cgroup but left does, no * cgroups come first.
*/ return 1;
} /* Two dissimilar cgroups, order by id. */ if (cgroup_id(left_cgroup) < cgroup_id(right_cgroup)) return -1;
return 1;
}
} #endif
if (left_group_index < right->group_index) return -1; if (left_group_index > right->group_index) return 1;
/* * Helper function to insert event into the pinned or flexible groups.
*/ staticvoid
add_event_to_groups(struct perf_event *event, struct perf_event_context *ctx)
{ struct perf_event_groups *groups;
groups = get_event_groups(event, ctx);
perf_event_groups_insert(groups, event);
}
/* * Delete a group from a tree.
*/ staticvoid
perf_event_groups_delete(struct perf_event_groups *groups, struct perf_event *event)
{
WARN_ON_ONCE(RB_EMPTY_NODE(&event->group_node) ||
RB_EMPTY_ROOT(&groups->tree));
/*
 * Iterate through the whole groups tree.
 */
#define perf_event_groups_for_each(event, groups)			\
	for (event = rb_entry_safe(rb_first(&((groups)->tree)),		\
				typeof(*event), group_node); event;	\
		event = rb_entry_safe(rb_next(&event->group_node),	\
				typeof(*event), group_node))
/* * Does the event attribute request inherit with PERF_SAMPLE_READ
*/ staticinlinebool has_inherit_and_sample_read(struct perf_event_attr *attr)
{ return attr->inherit && (attr->sample_type & PERF_SAMPLE_READ);
}
/*
 * Add an event to the lists for its context.
 * Must be called with ctx->mutex and ctx->lock held.
 */
static void
list_add_event(struct perf_event *event, struct perf_event_context *ctx)
{
	lockdep_assert_held(&ctx->lock);

	/*
	 * If we're a stand alone event or group leader, we go to the context
	 * list, group events are kept attached to the group so that
	 * perf_group_detach can, at all times, locate all siblings.
	 */
	if (event->group_leader == event) {
		event->group_caps = event->event_caps;
		add_event_to_groups(event, ctx);
	}

	list_add_rcu(&event->event_entry, &ctx->event_list);
	ctx->nr_events++;
	if (event->hw.flags & PERF_EVENT_FLAG_USER_READ_CNT)
		ctx->nr_user++;
	if (event->attr.inherit_stat)
		ctx->nr_stat++;
	if (has_inherit_and_sample_read(&event->attr))
		local_inc(&ctx->nr_no_switch_fast);

	if (event->state > PERF_EVENT_STATE_OFF)
		perf_cgroup_event_enable(event, ctx);

	ctx->generation++;
	event->pmu_ctx->nr_events++;
}
/*
 * Initialize event state based on the perf_event_attr::disabled.
 */
static inline void perf_event__state_init(struct perf_event *event)
{
	event->state = event->attr.disabled ? PERF_EVENT_STATE_OFF :
					      PERF_EVENT_STATE_INACTIVE;
}
/*
 * Compute the size of the read() output for an event with the given
 * read_format and sibling count; the per-event entry grows with ID/LOST,
 * and PERF_FORMAT_GROUP multiplies it by the group size.
 */
static int __perf_event_read_size(u64 read_format, int nr_siblings)
{
	int entry = sizeof(u64); /* value */
	int size = 0;
	int nr = 1;

	if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
		size += sizeof(u64);

	if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
		size += sizeof(u64);

	if (read_format & PERF_FORMAT_ID)
		entry += sizeof(u64);

	if (read_format & PERF_FORMAT_LOST)
		entry += sizeof(u64);

	if (read_format & PERF_FORMAT_GROUP) {
		nr += nr_siblings;
		size += sizeof(u64);
	}

	/*
	 * Since perf_event_validate_size() limits this to 16k and inhibits
	 * adding more siblings, this will never overflow.
	 */
	return size + nr * entry;
}
if (sample_type & PERF_SAMPLE_IP)
size += sizeof(data->ip);
if (sample_type & PERF_SAMPLE_ADDR)
size += sizeof(data->addr);
if (sample_type & PERF_SAMPLE_PERIOD)
size += sizeof(data->period);
if (sample_type & PERF_SAMPLE_WEIGHT_TYPE)
size += sizeof(data->weight.full);
if (sample_type & PERF_SAMPLE_READ)
size += event->read_size;
if (sample_type & PERF_SAMPLE_DATA_SRC)
size += sizeof(data->data_src.val);
if (sample_type & PERF_SAMPLE_TRANSACTION)
size += sizeof(data->txn);
if (sample_type & PERF_SAMPLE_PHYS_ADDR)
size += sizeof(data->phys_addr);
if (sample_type & PERF_SAMPLE_CGROUP)
size += sizeof(data->cgroup);
if (sample_type & PERF_SAMPLE_DATA_PAGE_SIZE)
size += sizeof(data->data_page_size);
if (sample_type & PERF_SAMPLE_CODE_PAGE_SIZE)
size += sizeof(data->code_page_size);
event->header_size = size;
}
/*
 * Called at perf_event creation and when events are attached/detached from a
 * group.
 */
static void perf_event__header_size(struct perf_event *event)
{
	event->read_size =
		__perf_event_read_size(event->attr.read_format,
				       event->group_leader->nr_siblings);
	__perf_event_header_size(event, event->attr.sample_type);
}
if (sample_type & PERF_SAMPLE_TID)
size += sizeof(data->tid_entry);
if (sample_type & PERF_SAMPLE_TIME)
size += sizeof(data->time);
if (sample_type & PERF_SAMPLE_IDENTIFIER)
size += sizeof(data->id);
if (sample_type & PERF_SAMPLE_ID)
size += sizeof(data->id);
if (sample_type & PERF_SAMPLE_STREAM_ID)
size += sizeof(data->stream_id);
if (sample_type & PERF_SAMPLE_CPU)
size += sizeof(data->cpu_entry);
event->id_header_size = size;
}
/* * Check that adding an event to the group does not result in anybody * overflowing the 64k event limit imposed by the output buffer. * * Specifically, check that the read_size for the event does not exceed 16k, * read_size being the one term that grows with groups size. Since read_size * depends on per-event read_format, also (re)check the existing events. * * This leaves 48k for the constant size fields and things like callchains, * branch stacks and register sets.
*/ staticbool perf_event_validate_size(struct perf_event *event)
{ struct perf_event *sibling, *group_leader = event->group_leader;
if (__perf_event_read_size(event->attr.read_format,
group_leader->nr_siblings + 1) > 16*1024) returnfalse;
if (__perf_event_read_size(group_leader->attr.read_format,
group_leader->nr_siblings + 1) > 16*1024) returnfalse;
/* * When creating a new group leader, group_leader->ctx is initialized * after the size has been validated, but we cannot safely use * for_each_sibling_event() until group_leader->ctx is set. A new group * leader cannot have any siblings yet, so we can safely skip checking * the non-existent siblings.
*/ if (event == group_leader) returntrue;
/* * Remove an event from the lists for its context. * Must be called with ctx->mutex and ctx->lock held.
*/ staticvoid
list_del_event(struct perf_event *event, struct perf_event_context *ctx)
{
WARN_ON_ONCE(event->ctx != ctx);
lockdep_assert_held(&ctx->lock);
/* * We can have double detach due to exit/hot-unplug + close.
*/ if (!(event->attach_state & PERF_ATTACH_CONTEXT)) return;
event->attach_state &= ~PERF_ATTACH_CONTEXT;
ctx->nr_events--; if (event->hw.flags & PERF_EVENT_FLAG_USER_READ_CNT)
ctx->nr_user--; if (event->attr.inherit_stat)
ctx->nr_stat--; if (has_inherit_and_sample_read(&event->attr))
local_dec(&ctx->nr_no_switch_fast);
list_del_rcu(&event->event_entry);
if (event->group_leader == event)
del_event_from_groups(event, ctx);
/* * If event uses aux_event tear down the link
*/ if (event->aux_event) {
iter = event->aux_event;
event->aux_event = NULL;
put_event(iter); return;
}
/* * If the event is an aux_event, tear down all links to * it from other events.
*/
for_each_sibling_event(iter, event) { if (iter->aux_event != event) continue;
iter->aux_event = NULL;
put_event(event);
/* * If it's ACTIVE, schedule it out and put it into ERROR * state so that we don't try to schedule it again. Note * that perf_event_enable() will clear the ERROR status.
*/
__event_disable(iter, ctx, PERF_EVENT_STATE_ERROR);
}
}
staticint perf_get_aux_event(struct perf_event *event, struct perf_event *group_leader)
{ /* * Our group leader must be an aux event if we want to be * an aux_output. This way, the aux event will precede its * aux_output events in the group, and therefore will always * schedule first.
*/ if (!group_leader) return 0;
/* * aux_output and aux_sample_size are mutually exclusive.
*/ if (event->attr.aux_output && event->attr.aux_sample_size) return 0;
if (event->attr.aux_output &&
!perf_aux_output_match(event, group_leader)) return 0;
if ((event->attr.aux_pause || event->attr.aux_resume) &&
!(group_leader->pmu->capabilities & PERF_PMU_CAP_AUX_PAUSE)) return 0;
if (event->attr.aux_sample_size && !group_leader->pmu->snapshot_aux) return 0;
if (!atomic_long_inc_not_zero(&group_leader->refcount)) return 0;
/* * Link aux_outputs to their aux event; this is undone in * perf_group_detach() by perf_put_aux_event(). When the * group in torn down, the aux_output events loose their * link to the aux_event and can't schedule any more.
*/
event->aux_event = group_leader;
/* * We can have double detach due to exit/hot-unplug + close.
*/ if (!(event->attach_state & PERF_ATTACH_GROUP)) return;
event->attach_state &= ~PERF_ATTACH_GROUP;
perf_put_aux_event(event);
/* * If this is a sibling, remove it from its group.
*/ if (leader != event) {
list_del_init(&event->sibling_list);
event->group_leader->nr_siblings--;
event->group_leader->group_generation++; goto out;
}
/* * If this was a group event with sibling events then * upgrade the siblings to singleton events by adding them * to whatever list we are on.
*/
list_for_each_entry_safe(sibling, tmp, &event->sibling_list, sibling_list) {
/* * Events that have PERF_EV_CAP_SIBLING require being part of * a group and cannot exist on their own, schedule them out * and move them into the ERROR state. Also see * _perf_event_enable(), it will not be able to recover this * ERROR state.
*/ if (sibling->event_caps & PERF_EV_CAP_SIBLING)
__event_disable(sibling, ctx, PERF_EVENT_STATE_ERROR);
if (event->state != PERF_EVENT_STATE_ACTIVE) return;
/* * Asymmetry; we only schedule events _IN_ through ctx_sched_in(), but * we can schedule events _OUT_ individually through things like * __perf_remove_from_context().
*/
list_del_init(&event->active_list);
perf_pmu_disable(event->pmu);
event->pmu->del(event, 0);
event->oncpu = -1;
if (event->pending_disable) {
event->pending_disable = 0;
perf_cgroup_event_disable(event, ctx);
state = PERF_EVENT_STATE_OFF;
}
perf_event_set_state(event, state);
if (!is_software_event(event))
cpc->active_oncpu--; if (is_event_in_freq_mode(event)) {
ctx->nr_freq--;
epc->nr_freq--;
} if (event->attr.exclusive || !cpc->active_oncpu)
cpc->exclusive = 0;
/* * Cross CPU call to remove a performance event * * We disable the event on the hardware level first. After that we * remove it from the context list.
*/ staticvoid
__perf_remove_from_context(struct perf_event *event, struct perf_cpu_context *cpuctx, struct perf_event_context *ctx, void *info)
{ struct perf_event_pmu_context *pmu_ctx = event->pmu_ctx; enum perf_event_state state = PERF_EVENT_STATE_OFF; unsignedlong flags = (unsignedlong)info;
ctx_time_update(cpuctx, ctx);
/* * Ensure event_sched_out() switches to OFF, at the very least * this avoids raising perf_pending_task() at this time.
*/ if (flags & DETACH_EXIT)
state = PERF_EVENT_STATE_EXIT; if (flags & DETACH_REVOKE)
state = PERF_EVENT_STATE_REVOKED; if (flags & DETACH_DEAD)
state = PERF_EVENT_STATE_DEAD;
event_sched_out(event, ctx);
if (event->state > PERF_EVENT_STATE_OFF)
perf_cgroup_event_disable(event, ctx);
/* * Remove the event from a task's (or a CPU's) list of events. * * If event->ctx is a cloned context, callers must make sure that * every task struct that event->ctx->task could possibly point to * remains valid. This is OK when called from perf_release since * that only calls us on the top-level context, which can't be a clone. * When called from perf_event_exit_task, it's OK because the * context has been detached from its task.
*/ staticvoid perf_remove_from_context(struct perf_event *event, unsignedlong flags)
{ struct perf_event_context *ctx = event->ctx;
lockdep_assert_held(&ctx->mutex);
/* * Because of perf_event_exit_task(), perf_remove_from_context() ought * to work in the face of TASK_TOMBSTONE, unlike every other * event_function_call() user.
*/
raw_spin_lock_irq(&ctx->lock); if (!ctx->is_active) {
__perf_remove_from_context(event, this_cpu_ptr(&perf_cpu_context),
ctx, (void *)flags);
raw_spin_unlock_irq(&ctx->lock); return;
}
raw_spin_unlock_irq(&ctx->lock);
/* * Cross CPU call to disable a performance event
*/ staticvoid __perf_event_disable(struct perf_event *event, struct perf_cpu_context *cpuctx,
--> --------------------
--> maximum size reached
--> --------------------
Messung V0.5
¤ Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereit gestellten Informationen zugesichert.0.68Bemerkung:
(vorverarbeitet)
¤
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereit gestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.