if (*perf_event_fasync(handle->event) && !handle->event->pending_kill)
handle->event->pending_kill = POLL_IN;
irq_work_queue(&handle->event->pending_irq);
}
/*
 * We need to ensure a later event_id doesn't publish a head when a former
 * event isn't done writing. However since we need to deal with NMIs we
 * cannot fully serialize things.
 *
 * We only publish the head (and generate a wakeup) when the outer-most
 * event completes.
 */
static void perf_output_get_handle(struct perf_output_handle *handle)
{
        struct perf_buffer *rb = handle->rb;
preempt_disable();
        /*
         * Avoid an explicit LOAD/STORE such that architectures with memops
         * can use them.
         */
        (*(volatile unsigned int *)&rb->nest)++;
handle->wakeup = local_read(&rb->wakeup);
}
        /*
         * If this isn't the outermost nesting, we don't have to update
         * @rb->user_page->data_head.
         */
        nest = READ_ONCE(rb->nest);
        if (nest > 1) {
                WRITE_ONCE(rb->nest, nest - 1);
                goto out;
        }
again:
        /*
         * In order to avoid publishing a head value that goes backwards,
         * we must ensure the load of @rb->head happens after we've
         * incremented @rb->nest.
         *
         * Otherwise we can observe a @rb->head value before one published
         * by an IRQ/NMI happening between the load and the increment.
         */
barrier();
head = local_read(&rb->head);
        /*
         * IRQ/NMI can happen here and advance @rb->head, causing our
         * load above to be stale.
         */
        /*
         * Since the mmap() consumer (userspace) can run on a different CPU:
         *
         *   kernel                             user
         *
         *   if (LOAD ->data_tail) {            LOAD ->data_head
         *                      (A)             smp_rmb()       (C)
         *      STORE $data                     LOAD $data
         *      smp_wmb()       (B)             smp_mb()        (D)
         *      STORE ->data_head               STORE ->data_tail
         *   }
         *
         * Where A pairs with D, and B pairs with C.
         *
         * In our case (A) is a control dependency that separates the load of
         * the ->data_tail and the stores of $data. In case ->data_tail
         * indicates there is no room in the buffer to store $data we do not.
         *
         * D needs to be a full barrier since it separates the data READ
         * from the tail WRITE.
         *
         * For B a WMB is sufficient since it separates two WRITEs, and for C
         * an RMB is sufficient since it separates two READs.
         *
         * See perf_output_begin().
         */
smp_wmb(); /* B, matches C */
WRITE_ONCE(rb->user_page->data_head, head);
        /*
         * We must publish the head before decrementing the nest count,
         * otherwise an IRQ/NMI can publish a more recent head value and our
         * write will (temporarily) publish a stale value.
         */
barrier();
WRITE_ONCE(rb->nest, 0);
        /*
         * Ensure we decrement @rb->nest before we validate the @rb->head.
         * Otherwise we cannot be sure we caught the 'last' nested update.
         */
        barrier();
        if (unlikely(head != local_read(&rb->head))) {
                WRITE_ONCE(rb->nest, 1);
                goto again;
        }
if (handle->wakeup != local_read(&rb->wakeup))
perf_output_wakeup(handle);
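/*
 * Illustrative sketch, not part of this file: a minimal userspace consumer
 * for the A/B/C/D pairing described in perf_output_put_handle() above. It
 * assumes the usual perf mmap layout (struct perf_event_mmap_page followed
 * by a power-of-two data area) and a hypothetical consume_record()
 * callback; records that wrap around the buffer edge are ignored for
 * brevity.
 *
 *	#include <linux/perf_event.h>
 *	#include <stdint.h>
 *
 *	static void read_ring(struct perf_event_mmap_page *meta,
 *			      void *data, uint64_t data_size,
 *			      void (*consume_record)(struct perf_event_header *))
 *	{
 *		uint64_t head, tail;
 *
 *		head = __atomic_load_n(&meta->data_head, __ATOMIC_ACQUIRE); // C
 *		tail = meta->data_tail;
 *
 *		while (tail != head) {
 *			struct perf_event_header *hdr;
 *
 *			hdr = data + (tail & (data_size - 1));
 *			consume_record(hdr);		// LOAD $data
 *			tail += hdr->size;
 *		}
 *
 *		// D: full barrier between the data reads and the tail store
 *		__atomic_thread_fence(__ATOMIC_SEQ_CST);
 *		__atomic_store_n(&meta->data_tail, tail, __ATOMIC_RELAXED);
 *	}
 */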
        have_lost = local_read(&rb->lost);
        if (unlikely(have_lost)) {
                size += sizeof(lost_event);
                if (event->attr.sample_id_all)
size += event->id_header_size;
}
perf_output_get_handle(handle);
        offset = local_read(&rb->head);
        do {
head = offset;
                tail = READ_ONCE(rb->user_page->data_tail);
                if (!rb->overwrite) {
                        if (unlikely(!ring_buffer_has_space(head, tail,
                                                            perf_data_size(rb),
                                                            size, backward)))
                                goto fail;
                }
                /*
                 * The above forms a control dependency barrier separating the
                 * @tail load above from the data stores below. Since the @tail
                 * load is required to compute the branch to fail below.
                 *
                 * A, matches D; the full memory barrier userspace SHOULD issue
                 * after reading the data and before storing the new tail
                 * position.
                 *
                 * See perf_output_put_handle().
                 */
if (!backward)
                        head += size;
                else
head -= size;
} while (!local_try_cmpxchg(&rb->head, &offset, head));
if (backward) {
offset = head;
head = (u64)(-head);
}
        /*
         * We rely on the implied barrier() by local_cmpxchg() to ensure
         * none of the data stores below can be lifted up by the compiler.
         */
if (unlikely(head - local_read(&rb->wakeup) > rb->watermark))
local_add(rb->watermark, &rb->wakeup);
        /*
         * perf_output_begin() only checks rb->paused, therefore
         * rb->paused must be true if we have no pages for output.
         */
        if (!rb->nr_pages)
rb->paused = 1;
mutex_init(&rb->aux_mutex);
}
void perf_aux_output_flag(struct perf_output_handle *handle, u64 flags)
{
        /*
         * OVERWRITE is determined by perf_aux_output_end() and can't
         * be passed in directly.
         */
        if (WARN_ON_ONCE(flags & PERF_AUX_FLAG_OVERWRITE))
                return;
/*
 * This is called before hardware starts writing to the AUX area to
 * obtain an output handle and make sure there's room in the buffer.
 * When the capture completes, call perf_aux_output_end() to commit
 * the recorded data to the buffer.
 *
 * The ordering is similar to that of perf_output_{begin,end}, with
 * the exception of (B), which should be taken care of by the pmu
 * driver, since ordering rules will differ depending on hardware.
 *
 * Call this from pmu::start(); see the comment in perf_aux_output_end()
 * about its use in pmu callbacks. Both can also be called from the PMI
 * handler if needed.
 */
void *perf_aux_output_begin(struct perf_output_handle *handle,
                            struct perf_event *event)
{
        struct perf_event *output_event = event;
        unsigned long aux_head, aux_tail;
        struct perf_buffer *rb;
        unsigned int nest;
if (output_event->parent)
output_event = output_event->parent;
        /*
         * Since this will typically be open across pmu::add/pmu::del, we
         * grab ring_buffer's refcount instead of holding rcu read lock
         * to make sure it doesn't disappear under us.
         */
        rb = ring_buffer_get(output_event);
        if (!rb)
                return NULL;

        if (!rb_has_aux(rb))
                goto err;
        /*
         * If aux_mmap_count is zero, the aux buffer is in perf_mmap_close(),
         * about to get freed, so we leave immediately.
         *
         * Checking rb::aux_mmap_count and rb::refcount has to be done in
         * the same order, see perf_mmap_close. Otherwise we end up freeing
         * aux pages in this path, which is a bug, because in_atomic().
         */
        if (!atomic_read(&rb->aux_mmap_count))
                goto err;

        if (!refcount_inc_not_zero(&rb->aux_refcount))
                goto err;
        nest = READ_ONCE(rb->aux_nest);
        /*
         * Nesting is not supported for AUX area, make sure nested
         * writers are caught early
         */
        if (WARN_ON_ONCE(nest))
                goto err_put;
        /*
         * In overwrite mode, AUX data stores do not depend on aux_tail,
         * therefore (A) control dependency barrier does not exist. The
         * (B) <-> (C) ordering is still observed by the pmu driver.
         */
        if (!rb->aux_overwrite) {
aux_tail = READ_ONCE(rb->user_page->aux_tail);
                handle->wakeup = rb->aux_wakeup + rb->aux_watermark;
                if (aux_head - aux_tail < perf_aux_size(rb))
handle->size = CIRC_SPACE(aux_head, aux_tail, perf_aux_size(rb));
                /*
                 * handle->size computation depends on aux_tail load; this forms a
                 * control dependency barrier separating aux_tail load from aux data
                 * store that will be enabled on successful return
                 */
                if (!handle->size) { /* A, matches D */
perf_event_disable_inatomic(handle->event);
perf_output_wakeup(handle);
                        WRITE_ONCE(rb->aux_nest, 0);
                        goto err_put;
}
}
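/*
 * Illustrative sketch, not part of this file: how a hypothetical PMU driver
 * might call perf_aux_output_begin() from its pmu::start() callback, as the
 * comment above suggests. my_handle and my_pmu_hw_enable() are made-up
 * names for the example.
 *
 *	static DEFINE_PER_CPU(struct perf_output_handle, my_handle);
 *
 *	static void my_pmu_start(struct perf_event *event, int flags)
 *	{
 *		struct perf_output_handle *handle = this_cpu_ptr(&my_handle);
 *		void *buf;
 *
 *		buf = perf_aux_output_begin(handle, event);
 *		if (!buf)
 *			return;	// no AUX space, or the buffer is going away
 *
 *		// point the hardware at 'buf'; at most handle->size bytes fit
 *		my_pmu_hw_enable(event, buf, handle->size);
 *	}
 */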
/*
 * Commit the data written by hardware into the ring buffer by adjusting
 * aux_head and posting a PERF_RECORD_AUX into the perf buffer. It is the
 * pmu driver's responsibility to observe ordering rules of the hardware,
 * so that all the data is externally visible before this is called.
 *
 * Note: this has to be called from pmu::stop() callback, as the assumption
 * of the AUX buffer management code is that after pmu::stop(), the AUX
 * transaction must be stopped and therefore drop the AUX reference count.
 */
void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size)
{
        bool wakeup = !!(handle->aux_flags & PERF_AUX_FLAG_TRUNCATED);
        struct perf_buffer *rb = handle->rb;
        unsigned long aux_head;
        /* in overwrite mode, driver provides aux_head via handle */
        if (rb->aux_overwrite) {
handle->aux_flags |= PERF_AUX_FLAG_OVERWRITE;
        /*
         * Only send RECORD_AUX if we have something useful to communicate
         *
         * Note: the OVERWRITE records by themselves are not considered
         * useful, as they don't communicate any *new* information,
         * aside from the short-lived offset, that becomes history at
         * the next event sched-in and therefore isn't useful.
         * The userspace that needs to copy out AUX data in overwrite
         * mode should know to use user_page::aux_head for the actual
         * offset. So, from now on we don't output AUX records that
         * have *only* OVERWRITE flag set.
         */
        if (size || (handle->aux_flags & ~(u64)PERF_AUX_FLAG_OVERWRITE))
perf_event_aux_event(handle->event, aux_head, size,
handle->aux_flags);
        WRITE_ONCE(rb->user_page->aux_head, rb->aux_head);
        if (rb_need_aux_wakeup(rb))
wakeup = true;
        if (wakeup) {
                if (handle->aux_flags & PERF_AUX_FLAG_TRUNCATED)
perf_event_disable_inatomic(handle->event);
perf_output_wakeup(handle);
}
handle->event = NULL;
        WRITE_ONCE(rb->aux_nest, 0);
        /* can't be last */
rb_free_aux(rb);
ring_buffer_put(rb);
}
EXPORT_SYMBOL_GPL(perf_aux_output_end);
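/*
 * Illustrative sketch, not part of this file: the matching pmu::stop() side
 * for the example above. my_pmu_hw_disable() is a made-up helper that stops
 * the hardware and returns the number of bytes it produced; the driver is
 * responsible for making those stores globally visible (ordering rule (B))
 * before calling perf_aux_output_end().
 *
 *	static void my_pmu_stop(struct perf_event *event, int flags)
 *	{
 *		struct perf_output_handle *handle = this_cpu_ptr(&my_handle);
 *		unsigned long written;
 *
 *		written = my_pmu_hw_disable(event);
 *
 *		// only valid if perf_aux_output_begin() succeeded earlier
 *		if (perf_get_aux(handle))
 *			perf_aux_output_end(handle, written);
 *	}
 */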
/*
 * Skip over a given number of bytes in the AUX buffer, due to, for example,
 * hardware's alignment constraints.
 */
int perf_aux_output_skip(struct perf_output_handle *handle, unsigned long size)
{
        struct perf_buffer *rb = handle->rb;
void *perf_get_aux(struct perf_output_handle *handle)
{
        /* this is only valid between perf_aux_output_begin and *_end */
        if (!handle->event)
                return NULL;
/*
 * Copy out AUX data from an AUX handle.
 */
long perf_output_copy_aux(struct perf_output_handle *aux_handle,
                          struct perf_output_handle *handle,
                          unsigned long from, unsigned long to)
{
        struct perf_buffer *rb = aux_handle->rb;
        unsigned long tocopy, remainder, len = 0;
        void *addr;
from &= (rb->aux_nr_pages << PAGE_SHIFT) - 1;
to &= (rb->aux_nr_pages << PAGE_SHIFT) - 1;
do {
                tocopy = PAGE_SIZE - offset_in_page(from);
                if (to > from)
                        tocopy = min(tocopy, to - from);
                if (!tocopy)
                        break;
static struct page *rb_alloc_aux_page(int node, int order)
{
        struct page *page;
if (order > MAX_PAGE_ORDER)
order = MAX_PAGE_ORDER;
do {
page = alloc_pages_node(node, PERF_AUX_GFP, order);
} while (!page && order--);
        if (page && order) {
                /*
                 * Communicate the allocation size to the driver:
                 * if we managed to secure a high-order allocation,
                 * set its first page's private to this order;
                 * !PagePrivate(page) means it's just a normal page.
                 */
split_page(page, order);
SetPagePrivate(page);
set_page_private(page, order);
}
static void __rb_free_aux(struct perf_buffer *rb)
{
        int pg;
        /*
         * Should never happen, the last reference should be dropped from
         * perf_mmap_close() path, which first stops aux transactions (which
         * in turn are the atomic holders of aux_refcount) and then does the
         * last rb_free_aux().
         */
WARN_ON_ONCE(in_atomic());
        if (rb->aux_nr_pages) {
                for (pg = 0; pg < rb->aux_nr_pages; pg++)
rb_free_aux_page(rb, pg);
kfree(rb->aux_pages);
rb->aux_nr_pages = 0;
}
}
int rb_alloc_aux(struct perf_buffer *rb, struct perf_event *event,
pgoff_t pgoff, int nr_pages, long watermark, int flags)
{
        bool overwrite = !(flags & RING_BUFFER_WRITABLE);
        int node = (event->cpu == -1) ? -1 : cpu_to_node(event->cpu);
        bool use_contiguous_pages = event->pmu->capabilities & (
                PERF_PMU_CAP_AUX_NO_SG | PERF_PMU_CAP_AUX_PREFER_LARGE);
        /*
         * Initialize max_order to 0 for page allocation. This allocates single
         * pages to minimize memory fragmentation. This is overridden if the
         * PMU needs or prefers contiguous pages (use_contiguous_pages = true).
         */
        int max_order = 0;
        int ret = -ENOMEM;
        if (!has_aux(event))
                return -EOPNOTSUPP;

        if (nr_pages <= 0)
                return -EINVAL;
        if (!overwrite) {
                /*
                 * Watermark defaults to half the buffer, to aid PMU drivers
                 * in double buffering.
                 */
                if (!watermark)
                        watermark = min_t(unsigned long,
                                          U32_MAX,
                                          (unsigned long)nr_pages << (PAGE_SHIFT - 1));
                /*
                 * If using contiguous pages, use aux_watermark as the basis
                 * for chunking to help PMU drivers honor the watermark.
                 */
                if (use_contiguous_pages)
max_order = get_order(watermark);
        } else {
                /*
                 * If using contiguous pages, we need to start with the
                 * max_order that fits in nr_pages, not the other way around,
                 * hence ilog2() and not get_order.
                 */
                if (use_contiguous_pages)
max_order = ilog2(nr_pages);
watermark = 0;
}
        /*
         * kcalloc_node() is unable to allocate buffer if the size is larger
         * than: PAGE_SIZE << MAX_PAGE_ORDER; directly bail out in this case.
         */
        if (get_order((unsigned long)nr_pages * sizeof(void *)) > MAX_PAGE_ORDER)
                return -ENOMEM;
rb->aux_pages = kcalloc_node(nr_pages, sizeof(void *), GFP_KERNEL,
                                     node);
        if (!rb->aux_pages)
                return -ENOMEM;
        rb->free_aux = event->pmu->free_aux;
        for (rb->aux_nr_pages = 0; rb->aux_nr_pages < nr_pages;) {
                struct page *page;
                int last, order;
order = min(max_order, ilog2(nr_pages - rb->aux_nr_pages));
                page = rb_alloc_aux_page(node, order);
                if (!page)
                        goto out;
for (last = rb->aux_nr_pages + (1 << page_private(page));
last > rb->aux_nr_pages; rb->aux_nr_pages++)
rb->aux_pages[rb->aux_nr_pages] = page_address(page++);
}
        /*
         * In overwrite mode, PMUs that don't support SG may not handle more
         * than one contiguous allocation, since they rely on PMI to do double
         * buffering. In this case, the entire buffer has to be one contiguous
         * chunk.
         */
        if ((event->pmu->capabilities & PERF_PMU_CAP_AUX_NO_SG) &&
            overwrite) {
                struct page *page = virt_to_page(rb->aux_pages[0]);

                if (page_private(page) != max_order)
                        goto out;
}
rb->aux_priv = event->pmu->setup_aux(event, rb->aux_pages, nr_pages,
                                             overwrite);
        if (!rb->aux_priv)
                goto out;
ret = 0;
        /*
         * aux_pages (and pmu driver's private data, aux_priv) will be
         * referenced in both producer's and consumer's contexts, thus
         * we keep a refcount here to make sure either of the two can
         * reference them safely.
         */
refcount_set(&rb->aux_refcount, 1);
        if (order_base_2(size) > PAGE_SHIFT+MAX_PAGE_ORDER)
                goto fail;
node = (cpu == -1) ? cpu : cpu_to_node(cpu);
        rb = kzalloc_node(size, GFP_KERNEL, node);
        if (!rb)
                goto fail;

        rb->user_page = perf_mmap_alloc_page(cpu);
        if (!rb->user_page)
                goto fail_user_page;
for (i = 0; i < nr_pages; i++) {
                rb->data_pages[i] = perf_mmap_alloc_page(cpu);
                if (!rb->data_pages[i])
                        goto fail_data_pages;
}
rb->nr_pages = nr_pages;
ring_buffer_init(rb, watermark, flags);
return rb;
fail_data_pages:
        for (i--; i >= 0; i--)
perf_mmap_free_page(rb->data_pages[i]);
perf_mmap_free_page(rb->user_page);
fail_user_page:
kfree(rb);
fail:
        return NULL;
}
void rb_free(struct perf_buffer *rb)
{
        int i;
        perf_mmap_free_page(rb->user_page);
        for (i = 0; i < rb->nr_pages; i++)
perf_mmap_free_page(rb->data_pages[i]);
kfree(rb);
}
#else

static struct page *
__perf_mmap_to_page(struct perf_buffer *rb, unsigned long pgoff)
{
        /* The '>' counts in the user page. */
        if (pgoff > data_page_nr(rb))
                return NULL;
struct page *
perf_mmap_to_page(struct perf_buffer *rb, unsigned long pgoff)
{
        if (rb->aux_nr_pages) {
                /* above AUX space */
                if (pgoff > rb->aux_pgoff + rb->aux_nr_pages)
                        return NULL;

                /* AUX space */
                if (pgoff >= rb->aux_pgoff) {
                        int aux_pgoff = array_index_nospec(pgoff - rb->aux_pgoff,
                                                           rb->aux_nr_pages);
                        return virt_to_page(rb->aux_pages[aux_pgoff]);
}
}