/* * The Processor Module Information system parameter allows transferring * of certain processor module information from the platform to the OS. * Refer PAPR+ document to get parameter token value as '43'.
*/
/* * read_24x7_sys_info() * Retrieve the number of sockets and chips per socket and cores per * chip details through the get-system-parameter rtas call.
*/ void read_24x7_sys_info(void)
{ struct papr_sysparm_buf *buf;
/* * Making system parameter: chips and sockets and cores per chip * default to 1.
*/
phys_sockets = 1;
phys_chipspersocket = 1;
phys_coresperchip = 1;
buf = papr_sysparm_buf_alloc(); if (!buf) return;
if (!papr_sysparm_get(PAPR_SYSPARM_PROC_MODULE_INFO, buf)) { int ntypes = be16_to_cpup((__be16 *)&buf->val[0]); int len = be16_to_cpu(buf->len);
/* Domains for which more than one result element are returned for each event. */ staticbool domain_needs_aggregation(unsignedint domain)
{ return aggregate_result_elements &&
(domain == HV_PERF_DOMAIN_PHYS_CORE ||
(domain >= HV_PERF_DOMAIN_VCPU_HOME_CORE &&
domain <= HV_PERF_DOMAIN_VCPU_REMOTE_NODE));
}
staticconstchar *domain_name(unsignedint domain)
{ if (!domain_is_valid(domain)) return NULL;
switch (domain) { case HV_PERF_DOMAIN_PHYS_CHIP: return"Physical Chip"; case HV_PERF_DOMAIN_PHYS_CORE: return"Physical Core"; case HV_PERF_DOMAIN_VCPU_HOME_CORE: return"VCPU Home Core"; case HV_PERF_DOMAIN_VCPU_HOME_CHIP: return"VCPU Home Chip"; case HV_PERF_DOMAIN_VCPU_HOME_NODE: return"VCPU Home Node"; case HV_PERF_DOMAIN_VCPU_REMOTE_NODE: return"VCPU Remote Node";
}
/* * TODO: Merging events: * - Think of the hcall as an interface to a 4d array of counters: * - x = domains * - y = indexes in the domain (core, chip, vcpu, node, etc) * - z = offset into the counter space * - w = lpars (guest vms, "logical partitions") * - A single request is: x,y,y_last,z,z_last,w,w_last * - this means we can retrieve a rectangle of counters in y,z for a single x. * * - Things to consider (ignoring w): * - input cost_per_request = 16 * - output cost_per_result(ys,zs) = 8 + 8 * ys + ys * zs * - limited number of requests per hcall (must fit into 4K bytes) * - 4k = 16 [buffer header] - 16 [request size] * request_count * - 255 requests per hcall * - sometimes it will be more efficient to read extra data and discard
*/
/* * Example usage: * perf stat -e 'hv_24x7/domain=2,offset=8,vcpu=0,lpar=0xffffffff/'
*/
/* * request_buffer and result_buffer are not required to be 4k aligned, * but are not allowed to cross any 4k boundary. Aligning them to 4k is * the simplest way to ensure that.
*/ #define H24x7_DATA_BUFFER_SIZE 4096 static DEFINE_PER_CPU(char, hv_24x7_reqb[H24x7_DATA_BUFFER_SIZE]) __aligned(4096); static DEFINE_PER_CPU(char, hv_24x7_resb[H24x7_DATA_BUFFER_SIZE]) __aligned(4096);
/* * Things we don't check: * - padding for desc, name, and long/detailed desc is required to be '\0' * bytes. * * Return NULL if we pass end, * Otherwise return the address of the byte just following the event.
*/ staticvoid *event_end(struct hv_24x7_event_data *ev, void *end)
{ void *start = ev;
__be16 *dl_, *ldl_; unsignedint dl, ldl; unsignedint nl = be16_to_cpu(ev->event_name_len);
if (nl < 2) {
pr_debug("%s: name length too short: %d", __func__, nl); return NULL;
}
ldl_ = (__be16 *)(ev->remainder + nl + dl - 2); if (!IS_ALIGNED((uintptr_t)ldl_, 2))
pr_warn("long desc len not aligned %p", ldl_);
ldl = be16_to_cpu(*ldl_); if (ldl < 2) {
pr_debug("%s: long desc len too short (ldl=%u)",
__func__, ldl); return NULL;
}
/* * Each event we find in the catalog, will have a sysfs entry. Format the * data for this sysfs entry based on the event's domain. * * Events belonging to the Chip domain can only be monitored in that domain. * i.e the domain for these events is a fixed/known value. * * Events belonging to the Core domain can be monitored either in the physical * core or in one of the virtual CPU domains. So the domain value for these * events must be specified by the user (i.e is a required parameter). Format * the Core events with 'domain=?' so the perf-tool can error check required * parameters. * * NOTE: For the Core domain events, rather than making domain a required * parameter we could default it to PHYS_CORE and allow users to * override the domain to one of the VCPU domains. * * However, this can make the interface a little inconsistent. * * If we set domain=2 (PHYS_CHIP) and allow user to override this field * the user may be tempted to also modify the "offset=x" field, which * can lead to confusing usage. Consider the HPM_PCYC (offset=0x18) and * HPM_INST (offset=0x20) events. With: * * perf stat -e hv_24x7/HPM_PCYC,offset=0x20/ * * we end up monitoring HPM_INST, while the command line has HPM_PCYC. * * By not assigning a default value to the domain for the Core events, * we can have simple guidelines: * * - Specifying values for parameters with "=?" is required. * * - Specifying (i.e overriding) values for other parameters * is undefined.
*/ staticchar *event_fmt(struct hv_24x7_event_data *event, unsignedint domain)
{ constchar *sindex; constchar *lpar; constchar *domain_str; char buf[8];
/* * Allocate and initialize strings representing event attributes. * * NOTE: The strings allocated here are never destroyed and continue to * exist till shutdown. This is to allow us to create as many events * from the catalog as possible, even if we encounter errors with some. * In case of changes to error paths in future, these may need to be * freed by the caller.
*/ staticstruct attribute *device_str_attr_create(char *name, int name_max, int name_nonce, char *str, size_t str_max)
{ char *n; char *s = memdup_to_str(str, str_max, GFP_KERNEL); struct attribute *a;
if (!s) return NULL;
if (!name_nonce)
n = kasprintf(GFP_KERNEL, "%.*s", name_max, name); else
n = kasprintf(GFP_KERNEL, "%.*s__%d", name_max, name,
name_nonce); if (!n) goto out_s;
a = device_str_attr_create_(n, s); if (!a) goto out_n;
/* Add new node and rebalance tree. */
rb_link_node(&data->node, parent, new);
rb_insert_color(&data->node, root);
/* data->ct */ return 0;
}
/* Free all nodes of the event-uniqueness rbtree. */
static void event_uniq_destroy(struct rb_root *root)
{
	/*
	 * Only the tree nodes are freed here: the strings they point to
	 * live in the giant block of memory filled by the catalog and
	 * are freed separately.
	 */
	struct event_uniq *cur, *tmp;

	rbtree_postorder_for_each_entry_safe(cur, tmp, root, node)
		kfree(cur);
}
/* * ensure the event structure's sizes are self consistent and don't cause us to * read outside of the event * * On success, return the event length in bytes. * Otherwise, return -1 (and print as appropriate).
*/ static ssize_t catalog_event_len_validate(struct hv_24x7_event_data *event,
size_t event_idx,
size_t event_data_bytes,
size_t event_entry_count,
size_t offset, void *end)
{
ssize_t ev_len; void *ev_end, *calc_ev_end;
if (offset >= event_data_bytes) return -1;
if (event_idx >= event_entry_count) {
pr_devel("catalog event data has %zu bytes of padding after last event\n",
event_data_bytes - offset); return -1;
}
if (!event_fixed_portion_is_within(event, end)) {
pr_warn("event %zu fixed portion is not within range\n",
event_idx); return -1;
}
ev_len = be16_to_cpu(event->length);
if (ev_len % 16)
pr_info("event %zu has length %zu not divisible by 16: event=%p\n",
event_idx, ev_len, event);
if ((MAX_4K < event_data_len)
|| (MAX_4K < event_data_offs)
|| (MAX_4K - event_data_offs < event_data_len)) {
pr_err("invalid event data offs %zu and/or len %zu\n",
event_data_offs, event_data_len);
ret = -EIO; goto e_free;
}
if ((event_data_offs + event_data_len) > catalog_page_len) {
pr_err("event data %zu-%zu does not fit inside catalog 0-%zu\n",
event_data_offs,
event_data_offs + event_data_len,
catalog_page_len);
ret = -EIO; goto e_free;
}
if (SIZE_MAX - 1 < event_entry_count) {
pr_err("event_entry_count %zu is invalid\n", event_entry_count);
ret = -EIO; goto e_free;
}
event_data_bytes = event_data_len * 4096;
/* * event data can span several pages, events can cross between these * pages. Use vmalloc to make this easier.
*/
event_data = vmalloc(event_data_bytes); if (!event_data) {
pr_err("could not allocate event data\n");
ret = -ENOMEM; goto e_free;
}
end = event_data + event_data_bytes;
/* * using vmalloc_to_phys() like this only works if PAGE_SIZE is * divisible by 4096
*/
BUILD_BUG_ON(PAGE_SIZE % 4096);
for (i = 0; i < event_data_len; i++) {
hret = h_get_24x7_catalog_page_(
vmalloc_to_phys(event_data + i * 4096),
catalog_version_num,
i + event_data_offs); if (hret) {
pr_err("Failed to get event data in page %zu: rc=%ld\n",
i + event_data_offs, hret);
ret = -EIO; goto e_event_data;
}
}
/* * scan the catalog to determine the number of attributes we need, and * verify it at the same time.
*/ for (junk_events = 0, event = event_data, event_idx = 0, attr_max = 0;
;
event_idx++, event = (void *)event + ev_len) {
size_t offset = (void *)event - (void *)event_data; char *name; int nl;
if (!catalog_entry_domain_is_valid(event->domain)) {
pr_info("event %zu (%.*s) has invalid domain %d\n",
event_idx, nl, name, event->domain);
junk_events++; continue;
}
attr_max++;
}
event_idx_last = event_idx; if (event_idx_last != event_entry_count)
pr_warn("event buffer ended before listed # of events were parsed (got %zu, wanted %zu, junk %zu)\n",
event_idx_last, event_entry_count, junk_events);
events = kmalloc_array(attr_max + 1, sizeof(*events), GFP_KERNEL); if (!events) {
ret = -ENOMEM; goto e_event_data;
}
event_descs = kmalloc_array(event_idx + 1, sizeof(*event_descs),
GFP_KERNEL); if (!event_descs) {
ret = -ENOMEM; goto e_event_attrs;
}
event_long_descs = kmalloc_array(event_idx + 1, sizeof(*event_long_descs), GFP_KERNEL); if (!event_long_descs) {
ret = -ENOMEM; goto e_event_descs;
}
/* Iterate over the catalog filling in the attribute vector */ for (junk_events = 0, event_attr_ct = 0, desc_ct = 0, long_desc_ct = 0,
event = event_data, event_idx = 0;
event_idx < event_idx_last;
event_idx++, ev_len = be16_to_cpu(event->length),
event = (void *)event + ev_len) { char *name; int nl; int nonce; /* * these are the only "bad" events that are intermixed and that * we can ignore without issue. make sure to skip them here
*/ if (event->event_group_record_len == 0) continue; if (!catalog_entry_domain_is_valid(event->domain)) continue;
name = event_name(event, &nl); if (ignore_event(name)) continue;
/* * Start the process for a new H_GET_24x7_DATA hcall.
*/ staticvoid init_24x7_request(struct hv_24x7_request_buffer *request_buffer, struct hv_24x7_data_result_buffer *result_buffer)
{
request_buffer->interface_version = interface_version; /* memset above set request_buffer->num_requests to 0 */
}
/* * Commit (i.e perform) the H_GET_24x7_DATA hcall using the data collected * by 'init_24x7_request()' and 'add_event_to_24x7_request()'.
*/ staticint make_24x7_request(struct hv_24x7_request_buffer *request_buffer, struct hv_24x7_data_result_buffer *result_buffer)
{ long ret;
/* * NOTE: Due to variable number of array elements in request and * result buffer(s), sizeof() is not reliable. Use the actual * allocated buffer size, H24x7_DATA_BUFFER_SIZE.
*/
ret = plpar_hcall_norets(H_GET_24X7_DATA,
virt_to_phys(request_buffer), H24x7_DATA_BUFFER_SIZE,
virt_to_phys(result_buffer), H24x7_DATA_BUFFER_SIZE);
/* * Add the given @event to the next slot in the 24x7 request_buffer. * * Note that H_GET_24X7_DATA hcall allows reading several counters' * values in a single HCALL. We expect the caller to add events to the * request buffer one by one, make the HCALL and process the results.
*/ staticint add_event_to_24x7_request(struct perf_event *event, struct hv_24x7_request_buffer *request_buffer)
{
u16 idx; int i;
size_t req_size; struct hv_24x7_request *req;
if (request_buffer->num_requests >=
max_num_requests(request_buffer->interface_version)) {
pr_devel("Too many requests for 24x7 HCALL %d\n",
request_buffer->num_requests); return -EINVAL;
}
switch (event_get_domain(event)) { case HV_PERF_DOMAIN_PHYS_CHIP:
idx = event_get_chip(event); break; case HV_PERF_DOMAIN_PHYS_CORE:
idx = event_get_core(event); break; default:
idx = event_get_vcpu(event);
}
/** * get_count_from_result - get event count from all result elements in result * * If the event corresponding to this result needs aggregation of the result * element values, then this function does that. * * @event: Event associated with @res. * @resb: Result buffer containing @res. * @res: Result to work on. * @countp: Output variable containing the event count. * @next: Optional output variable pointing to the next result in @resb.
*/ staticint get_count_from_result(struct perf_event *event, struct hv_24x7_data_result_buffer *resb, struct hv_24x7_result *res, u64 *countp, struct hv_24x7_result **next)
{
u16 num_elements = be16_to_cpu(res->num_elements_returned);
u16 data_size = be16_to_cpu(res->result_element_data_size); unsignedint data_offset; void *element_data; int i;
u64 count;
/* * We can bail out early if the result is empty.
*/ if (!num_elements) {
pr_debug("Result of request %hhu is empty, nothing to do\n",
res->result_ix);
if (next)
*next = (struct hv_24x7_result *) res->elements;
return -ENODATA;
}
/* * Since we always specify 1 as the maximum for the smallest resource * we're requesting, there should be only one element per result. * Except when an event needs aggregation, in which case there are more.
*/ if (num_elements != 1 &&
!domain_needs_aggregation(event_get_domain(event))) {
pr_err("Error: result of request %hhu has %hu elements\n",
res->result_ix, num_elements);
return -EIO;
}
if (data_size != sizeof(u64)) {
pr_debug("Error: result of request %hhu has data of %hu bytes\n",
res->result_ix, data_size);
/* Go through the result elements in the result. */ for (i = count = 0, element_data = res->elements + data_offset;
i < num_elements;
i++, element_data += data_size + data_offset)
count += be64_to_cpu(*((__be64 *)element_data));
*countp = count;
/* The next result is after the last result element. */ if (next)
*next = element_data - data_offset;
/* Not our event */ if (event->attr.type != event->pmu->type) return -ENOENT;
/* Unused areas must be 0 */ if (event_get_reserved1(event) ||
event_get_reserved2(event) ||
event_get_reserved3(event)) {
pr_devel("reserved set when forbidden 0x%llx(0x%llx) 0x%llx(0x%llx) 0x%llx(0x%llx)\n",
event->attr.config,
event_get_reserved1(event),
event->attr.config1,
event_get_reserved2(event),
event->attr.config2,
event_get_reserved3(event)); return -EINVAL;
}
/* no branch sampling */ if (has_branch_stack(event)) return -EOPNOTSUPP;
/* offset must be 8 byte aligned */ if (event_get_offset(event) % 8) {
pr_devel("bad alignment\n"); return -EINVAL;
}
hret = hv_perf_caps_get(&caps); if (hret) {
pr_devel("could not get capabilities: rc=%ld\n", hret); return -EIO;
}
/* Physical domains & other lpars require extra capabilities */ if (!caps.collect_privileged && (is_physical_domain(domain) ||
(event_get_lpar(event) != event_get_lpar_max()))) {
pr_devel("hv permissions disallow: is_physical_domain:%d, lpar=0x%llx\n",
is_physical_domain(domain),
event_get_lpar(event)); return -EACCES;
}
/* Get the initial value of the counter for this event */ if (single_24x7_request(event, &ct)) {
pr_devel("test hcall failed\n"); return -EIO;
}
(void)local64_xchg(&event->hw.prev_count, ct);
/* * If in a READ transaction, add this counter to the list of * counters to read during the next HCALL (i.e commit_txn()). * If not in a READ transaction, go ahead and make the HCALL * to read this counter by itself.
*/
if (txn_flags & PERF_PMU_TXN_READ) { int i; int ret;
ret = add_event_to_24x7_request(event, request_buffer); if (ret) {
__this_cpu_write(hv_24x7_txn_err, ret);
} else { /* * Associate the event with the HCALL request index, * so ->commit_txn() can quickly find/update count.
*/
i = request_buffer->num_requests - 1;
/*
 * Start counting for @event. On a PERF_EF_RELOAD request, re-sample the
 * current hardware value into prev_count so subsequent reads compute the
 * delta from here; otherwise nothing to do (24x7 counters are always
 * counting).
 */
static void h_24x7_event_start(struct perf_event *event, int flags)
{
	if (!(flags & PERF_EF_RELOAD))
		return;

	local64_set(&event->hw.prev_count, h_24x7_get_value(event));
}
/*
 * Stop the event: take one final reading via h_24x7_event_read() so the
 * event count is current. @flags is unused.
 */
static void h_24x7_event_stop(struct perf_event *event, int flags)
{
	h_24x7_event_read(event);
}
/*
 * Add @event to the PMU. 24x7 counters need no per-event programming;
 * only honor an explicit PERF_EF_START request. Always returns 0.
 */
static int h_24x7_event_add(struct perf_event *event, int flags)
{
	if (!(flags & PERF_EF_START))
		return 0;

	h_24x7_event_start(event, flags);
	return 0;
}
/* * 24x7 counters only support READ transactions. They are * always counting and dont need/support ADD transactions. * Cache the flags, but otherwise ignore transactions that * are not PERF_PMU_TXN_READ.
*/ staticvoid h_24x7_event_start_txn(struct pmu *pmu, unsignedint flags)
{ struct hv_24x7_request_buffer *request_buffer; struct hv_24x7_data_result_buffer *result_buffer;
/* We should not be called if we are already in a txn */
WARN_ON_ONCE(__this_cpu_read(hv_24x7_txn_flags));
__this_cpu_write(hv_24x7_txn_flags, flags); if (flags & ~PERF_PMU_TXN_READ) return;
/*
 * Clean up transaction state.
 *
 * NOTE: Ignore state of request and result buffers for now.
 * We will initialize them during the next read/txn.
 */
static void reset_txn(void)
{
	/* Clear the cached transaction flags and any recorded error. */
	__this_cpu_write(hv_24x7_txn_flags, 0);
	__this_cpu_write(hv_24x7_txn_err, 0);
}
/* * 24x7 counters only support READ transactions. They are always counting * and dont need/support ADD transactions. Clear ->txn_flags but otherwise * ignore transactions that are not of type PERF_PMU_TXN_READ. * * For READ transactions, submit all pending 24x7 requests (i.e requests * that were queued by h_24x7_event_read()), to the hypervisor and update * the event counts.
*/ staticint h_24x7_event_commit_txn(struct pmu *pmu)
{ struct hv_24x7_request_buffer *request_buffer; struct hv_24x7_data_result_buffer *result_buffer; struct hv_24x7_result *res, *next_res;
u64 count; int i, ret, txn_flags; struct hv_24x7_hw *h24x7hw;
ret = make_24x7_request(request_buffer, result_buffer); if (ret) goto put_reqb;
h24x7hw = &get_cpu_var(hv_24x7_hw);
/* Go through results in the result buffer to update event counts. */ for (i = 0, res = result_buffer->results;
i < result_buffer->num_results; i++, res = next_res) { struct perf_event *event = h24x7hw->events[res->result_ix];
ret = get_count_from_result(event, result_buffer, res, &count,
&next_res); if (ret) break;
/*
 * 24x7 counters only support READ transactions. They are always counting
 * and dont need/support ADD transactions. However, regardless of type
 * of transaction, all we need to do is cleanup, so we don't have to check
 * the type of transaction.
 */
static void h_24x7_event_cancel_txn(struct pmu *pmu)
{
	/* Canceling with no transaction in flight indicates a caller bug. */
	WARN_ON_ONCE(!__this_cpu_read(hv_24x7_txn_flags));
	reset_txn();
}
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereitgestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.