/*
 * Retpoline detection state.
 *
 * RETPOLINE_NONE:         no retpoline detection
 * X86_RETPOLINE_POSSIBLE: an x86 retpoline is possible
 * X86_RETPOLINE_DETECTED: an x86 retpoline has been detected
 *
 * The enumerators are numbered explicitly; the values are the same ones the
 * compiler would assign implicitly (0, 1, 2).
 */
enum retpoline_state_t {
	RETPOLINE_NONE = 0,
	X86_RETPOLINE_POSSIBLE = 1,
	X86_RETPOLINE_DETECTED = 2,
};
/**
 * struct thread_stack_entry - one entry of a thread stack.
 * @ret_addr: return address
 * @timestamp: timestamp (if known)
 * @ref: external reference (e.g. db_id of sample)
 * @branch_count: the branch count when the entry was created
 * @insn_count: the instruction count when the entry was created
 * @cyc_count: the cycle count when the entry was created
 * @db_id: id used for db-export
 * @cp: call path
 * @no_call: a 'call' was not seen
 * @trace_end: a 'call' but trace ended
 * @non_call: a branch but not a 'call' to the start of a different symbol
 */
struct thread_stack_entry {
	u64 ret_addr;
	u64 timestamp;
	u64 ref;
	u64 branch_count;
	u64 insn_count;
	u64 cyc_count;
	u64 db_id;
	struct call_path *cp;
	bool no_call;
	bool trace_end;
	bool non_call;
};
/**
 * struct thread_stack - thread stack constructed from 'call' and 'return'
 *                       branch samples.
 * @stack: array that holds the stack
 * @cnt: number of entries in the stack
 * @sz: current maximum stack size
 * @trace_nr: current trace number
 * @branch_count: running branch count
 * @insn_count: running instruction count
 * @cyc_count: running cycle count
 * @kernel_start: kernel start address
 * @last_time: last timestamp
 * @crp: call/return processor
 * @comm: current comm
 * @arr_sz: size of array if this is the first element of an array
 * @rstate: used to detect retpolines
 * @br_stack_rb: branch stack (ring buffer)
 * @br_stack_sz: maximum branch stack size
 * @br_stack_pos: current position in @br_stack_rb
 * @mispred_all: mark all branches as mispredicted
 */
struct thread_stack {
	struct thread_stack_entry *stack;
	size_t cnt;
	size_t sz;
	u64 trace_nr;
	u64 branch_count;
	u64 insn_count;
	u64 cyc_count;
	u64 kernel_start;
	u64 last_time;
	struct call_return_processor *crp;
	struct comm *comm;
	unsigned int arr_sz;
	enum retpoline_state_t rstate;
	struct branch_stack *br_stack_rb;
	unsigned int br_stack_sz;
	unsigned int br_stack_pos;
	bool mispred_all;
};
/*
 * Assume pid == tid == 0 identifies the idle task as defined by
 * perf_session__register_idle_thread(). The idle task is really 1 task per cpu,
 * and therefore requires a stack for each cpu.
 *
 * Returns true when both the tid and the pid of @thread are zero, false
 * otherwise.
 */
static inline bool thread_stack__per_cpu(struct thread *thread)
{
	return !(thread__tid(thread) || thread__pid(thread));
}
/* * In some cases there may be functions which are not seen to return. * For example when setjmp / longjmp has been used. Or the perf context * switch in the kernel which doesn't stop and start tracing in exactly * the same code path. When that happens the return address will be * further down the stack. If the return address is not found at all, * we assume the opposite (i.e. this is a return for a call that wasn't * seen for some reason) and leave the stack alone.
*/ for (i = ts->cnt; i; ) { if (ts->stack[--i].ret_addr == ret_addr) {
ts->cnt = i; return;
}
}
}
/* * The parent db_id must be assigned before exporting the child. Note * it is not possible to export the parent first because its information * is not yet complete because its 'return' has not yet been processed.
*/
parent_db_id = idx ? &(tse - 1)->db_id : NULL;
if (!ts) {
ts = thread_stack__new(thread, cpu, NULL, callstack, br_stack_sz); if (!ts) {
pr_warning("Out of memory: no thread stack\n"); return -ENOMEM;
}
ts->trace_nr = trace_nr;
ts->mispred_all = mispred_all;
}
/* * When the trace is discontinuous, the trace_nr changes. In that case * the stack might be completely invalid. Better to report nothing than * to report something misleading, so flush the stack.
*/ if (trace_nr != ts->trace_nr) { if (ts->trace_nr)
__thread_stack__flush(thread, ts);
ts->trace_nr = trace_nr;
}
if (br_stack_sz)
thread_stack__update_br_stack(ts, flags, from_ip, to_ip);
/* * Stop here if thread_stack__process() is in use, or not recording call * stack.
*/ if (ts->crp || !callstack) return 0;
if (flags & PERF_IP_FLAG_CALL) {
u64 ret_addr;
if (!to_ip) return 0;
ret_addr = from_ip + insn_len; if (ret_addr == to_ip) return 0; /* Zero-length calls are excluded */ return thread_stack__push(ts, ret_addr,
flags & PERF_IP_FLAG_TRACE_END);
} elseif (flags & PERF_IP_FLAG_TRACE_BEGIN) { /* * If the caller did not change the trace number (which would * have flushed the stack) then try to make sense of the stack. * Possibly, tracing began after returning to the current * address, so try to pop that. Also, do not expect a call made * when the trace ended, to return, so pop that.
*/
thread_stack__pop(ts, to_ip);
thread_stack__pop_trace_end(ts);
} elseif ((flags & PERF_IP_FLAG_RETURN) && from_ip) {
thread_stack__pop(ts, to_ip);
}
for (i = 2, j = 1; i < sz && j <= ts->cnt; i++, j++) {
ip = ts->stack[ts->cnt - j].ret_addr;
context = callchain_context(ip, kernel_start); if (context != last_context) { if (i >= sz - 1) break;
chain->ips[i++] = context;
last_context = context;
}
chain->ips[i] = ip;
}
chain->nr = i;
}
/* * Hardware sample records, created some time after the event occurred, need to * have subsequent addresses removed from the call chain.
*/ void thread_stack__sample_late(struct thread *thread, int cpu, struct ip_callchain *chain, size_t sz,
u64 sample_ip, u64 kernel_start)
{ struct thread_stack *ts = thread__stack(thread, cpu);
u64 sample_context = callchain_context(sample_ip, kernel_start);
u64 last_context, context, ip;
size_t nr = 0, j;
if (sz < 2) {
chain->nr = 0; return;
}
if (!ts) goto out;
/* * When tracing kernel space, kernel addresses occur at the top of the * call chain after the event occurred but before tracing stopped. * Skip them.
*/ for (j = 1; j <= ts->cnt; j++) {
ip = ts->stack[ts->cnt - j].ret_addr;
context = callchain_context(ip, kernel_start); if (context == PERF_CONTEXT_USER ||
(context == sample_context && ip == sample_ip)) break;
}
last_context = sample_ip; /* Use sample_ip as an invalid context */
if (ip < kernel_start) { /* * User space sample: start copying branch entries when the * branch is in user space.
*/ for (s = spos; s < ssz && nr < sz; s++) { if (us_start(s, kernel_start, &start)) {
*d++ = *s;
nr += 1;
}
}
if (src->nr >= ts->br_stack_sz) { for (s = &src->entries[0]; s < spos && nr < sz; s++) { if (us_start(s, kernel_start, &start)) {
*d++ = *s;
nr += 1;
}
}
}
} else { struct branch_entry *nb = NULL;
/* * Kernel space sample: start copying branch entries when the ip * falls in between 2 branches (or the branch is in user space * because then the start must have been missed).
*/ for (s = spos; s < ssz && nr < sz; s++) { if (ks_start(s, ip, kernel_start, &start, nb)) {
*d++ = *s;
nr += 1;
}
nb = s;
}
if (src->nr >= ts->br_stack_sz) { for (s = &src->entries[0]; s < spos && nr < sz; s++) { if (ks_start(s, ip, kernel_start, &start, nb)) {
*d++ = *s;
nr += 1;
}
nb = s;
}
}
}
/* Return to userspace, so pop all kernel addresses */ while (thread_stack__in_kernel(ts)) {
err = thread_stack__call_return(thread, ts, --ts->cnt,
tm, ref, true); if (err) return err;
}
if (ip >= ks && addr < ks) { /* Return to userspace, so pop all kernel addresses */
err = thread_stack__pop_ks(thread, ts, sample, ref); if (err) return err;
/* If the stack is empty, push the userspace address */ if (!ts->cnt) {
cp = call_path__findnew(cpr, root, tsym, addr, ks); return thread_stack__push_cp(ts, 0, tm, ref, cp, true, false);
}
} elseif (thread_stack__in_kernel(ts) && ip < ks) { /* Return to userspace, so pop all kernel addresses */
err = thread_stack__pop_ks(thread, ts, sample, ref); if (err) return err;
}
if (parent->sym == from_al->sym) { /* * At the bottom of the stack, assume the missing 'call' was * before the trace started. So, pop the current symbol and push * the 'to' symbol.
*/ if (ts->cnt == 1) {
err = thread_stack__call_return(thread, ts, --ts->cnt,
tm, ref, false); if (err) return err;
}
if (!ts->cnt) {
cp = call_path__findnew(cpr, root, tsym, addr, ks);
/* * Otherwise assume the 'return' is being used as a jump (e.g. * retpoline) and just push the 'to' symbol.
*/
cp = call_path__findnew(cpr, parent, tsym, addr, ks);
/* * x86 retpoline functions pollute the call graph. This function removes them. * This does not handle function return thunks, nor is there any improvement * for the handling of inline thunks or extern thunks.
*/ staticint thread_stack__x86_retpoline(struct thread_stack *ts, struct perf_sample *sample, struct addr_location *to_al)
{ struct thread_stack_entry *tse = &ts->stack[ts->cnt - 1]; struct call_path_root *cpr = ts->crp->cpr; struct symbol *sym = tse->cp->sym; struct symbol *tsym = to_al->sym; struct call_path *cp;
if (sym && is_x86_retpoline(sym->name)) { /* * This is a x86 retpoline fn. It pollutes the call graph by * showing up everywhere there is an indirect branch, but does * not itself mean anything. Here the top-of-stack is removed, * by decrementing the stack count, and then further down, the * resulting top-of-stack is replaced with the actual target. * The result is that the retpoline functions will no longer * appear in the call graph. Note this only affects the call * graph, since all the original branches are left unchanged.
*/
ts->cnt -= 1;
sym = ts->stack[ts->cnt - 2].cp->sym; if (sym && sym == tsym && to_al->addr != tsym->start) { /* * Target is back to the middle of the symbol we came * from so assume it is an indirect jmp and forget it * altogether.
*/
ts->cnt -= 1; return 0;
}
} elseif (sym && sym == tsym) { /* * Target is back to the symbol we came from so assume it is an * indirect jmp and forget it altogether.
*/
ts->cnt -= 1; return 0;
}
if (!ts) {
ts = thread_stack__new(thread, sample->cpu, crp, true, 0); if (!ts) return -ENOMEM;
ts->comm = comm;
}
rstate = ts->rstate; if (rstate == X86_RETPOLINE_DETECTED)
ts->rstate = X86_RETPOLINE_POSSIBLE;
/* Flush stack on exec */ if (ts->comm != comm && thread__pid(thread) == thread__tid(thread)) {
err = __thread_stack__flush(thread, ts); if (err) return err;
ts->comm = comm;
}
/* If the stack is empty, put the current symbol on the stack */ if (!ts->cnt) {
err = thread_stack__bottom(ts, sample, from_al, to_al, ref); if (err) return err;
}
/* * A call to the same symbol but not the start of the symbol, * may be the start of a x86 retpoline.
*/ if (!err && rstate == X86_RETPOLINE_POSSIBLE && to_al->sym &&
from_al->sym == to_al->sym &&
to_al->addr != to_al->sym->start)
ts->rstate = X86_RETPOLINE_DETECTED;
/* * The compiler might optimize a call/ret combination by making * it a jmp. Make that visible by recording on the stack a * branch to the start of a different symbol. Note, that means * when a ret pops the stack, all jmps must be popped off first.
*/
cp = call_path__findnew(cpr, ts->stack[ts->cnt - 1].cp,
to_al->sym, sample->addr,
ts->kernel_start);
err = thread_stack__push_cp(ts, 0, sample->time, ref, cp, false, false); if (!err)
ts->stack[ts->cnt - 1].non_call = true;
}
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereitgestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.