/*
 * Regular terminal output. The arguments are handed to printf() only while
 * the global state 'g' exists and its show_details level is zero or higher;
 * the original note on this macro says -q suppresses this output.
 */
#define tprintf(x...)						\
	do {							\
		if (g && g->p.show_details >= 0)		\
			printf(x);				\
	} while (0)
/*
 * Debug printf: forwards its arguments to printf() only when the global
 * state 'g' exists and show_details is at least 1.
 *
 * The #undef must sit on a line of its own: it discards any dprintf macro
 * already in effect, and the #define on the following line installs this
 * one. With both directives on a single line the #define is never seen.
 */
#undef dprintf
#define dprintf(x...) do { if (g && g->p.show_details >= 1) printf(x); } while (0)
/*
 * Per-task bookkeeping record. Other code in this file reaches these
 * records through 'td' pointers and through g->threads[task_nr].
 */
struct thread_data {
	/* CPU recorded for this task; a note elsewhere says it may still be -1 */
	int			curr_cpu;
	/* NOTE(review): presumably the CPU/node binding requested for the task - confirm */
	cpu_set_t		*bind_cpumask;
	int			bind_node;
	u8			*process_data;
	/* NOTE(review): presumably process, thread and global task indices - confirm */
	int			process_nr;
	int			thread_nr;
	int			task_nr;
	unsigned int		loops_done;
	u64			val;
	u64			runtime_ns;
	u64			system_time_ns;
	u64			user_time_ns;
	double			speed_gbs;
	/* Mutex the worker loop locks around its sleep and its locked work */
	struct mutex		*process_lock;
};
OPT_UINTEGER('l', "nr_loops" , &p0.nr_loops, "max number of loops to run (default: unlimited)"),
OPT_UINTEGER('s', "nr_secs" , &p0.nr_secs, "max number of seconds to run (default: 5 secs)"),
OPT_UINTEGER('u', "usleep" , &p0.sleep_usecs, "usecs to sleep per loop iteration"),
OPT_BOOLEAN('R', "data_reads" , &p0.data_reads, "access the data via reads (can be mixed with -W)"),
OPT_BOOLEAN('W', "data_writes" , &p0.data_writes, "access the data via writes (can be mixed with -R)"),
OPT_BOOLEAN('B', "data_backwards", &p0.data_backwards, "access the data backwards as well"),
OPT_BOOLEAN('Z', "data_zero_memset", &p0.data_zero_memset,"access the data via glibc bzero only"),
OPT_BOOLEAN('r', "data_rand_walk", &p0.data_rand_walk, "access the data with random (32bit LFSR) walk"),
OPT_BOOLEAN('z', "init_zero" , &p0.init_zero, "bzero the initial allocations"),
OPT_BOOLEAN('I', "init_random" , &p0.init_random, "randomize the contents of the initial allocations"),
OPT_BOOLEAN('0', "init_cpu0" , &p0.init_cpu0, "do the initial allocations on CPU#0"),
OPT_INTEGER('x', "perturb_secs", &p0.perturb_secs, "perturb thread 0/0 every X secs, to test convergence stability"),
OPT_INCR ('d', "show_details" , &p0.show_details, "Show details"),
OPT_INCR ('a', "all" , &p0.run_all, "Run all tests in the suite"),
OPT_INTEGER('H', "thp" , &p0.thp, "MADV_NOHUGEPAGE < 0 < MADV_HUGEPAGE"),
OPT_BOOLEAN('c', "show_convergence", &p0.show_convergence, "show convergence details, " "convergence is reached when each process (all its threads) is running on a single NUMA node."),
OPT_BOOLEAN('m', "measure_convergence", &p0.measure_convergence, "measure convergence latency"),
OPT_BOOLEAN('q', "quiet" , &quiet, "quiet mode (do not show any warnings or messages)"),
OPT_BOOLEAN('S', "serialize-startup", &p0.serialize_startup,"serialize thread startup"),
/* Special option string parsing callbacks: */
OPT_CALLBACK('C', "cpus", NULL, "cpu[,cpu2,...cpuN]", "bind the first N tasks to these specific cpus (the rest is unbound)",
parse_cpus_opt),
OPT_CALLBACK('M', "memnodes", NULL, "node[,node2,...nodeN]", "bind the first N tasks to these specific memory nodes (the rest is unbound)",
parse_nodes_opt),
OPT_END()
};
/*
 * Usage text for "perf bench numa mem"; the list is terminated by NULL.
 *
 * NOTE(review): the empty "[]" in the string looks like a placeholder
 * (for example an options tag) was lost - confirm against the upstream text.
 */
static const char * const numa_usage[] = {
	"perf bench numa mem []",
	NULL
};
/*
 * To get number of numa nodes present.
 *
 * Walks node ids 0 .. g->p.nr_nodes - 1 and counts those whose bit is set
 * in numa_nodes_ptr.
 *
 * Returns the number of set bits found in that range.
 */
static int nr_numa_nodes(void)
{
	int i, nr_nodes = 0;

	for (i = 0; i < g->p.nr_nodes; i++) {
		if (numa_bitmask_isbitset(numa_nodes_ptr, i))
			nr_nodes++;
	}

	return nr_nodes;
}
/*
 * To check if given numa node is present.
 *
 * Returns the result of testing bit 'node' in numa_nodes_ptr:
 * non-zero when the bit is set, zero otherwise.
 */
static int is_node_present(int node)
{
	return numa_bitmask_isbitset(numa_nodes_ptr, node);
}
/* * To check given numa node has cpus.
*/ staticbool node_has_cpus(int node)
{ struct bitmask *cpumask = numa_allocate_cpumask(); bool ret = false; /* fall back to nocpus */ int cpu;
BUG_ON(!cpumask); if (!numa_node_to_cpus(node, cpumask)) { for (cpu = 0; cpu < (int)cpumask->size; cpu++) { if (numa_bitmask_isbitset(cpumask, cpu)) {
ret = true; break;
}
}
}
numa_free_cpumask(cpumask);
ret = set_mempolicy(MPOL_BIND, node_mask->maskp, node_mask->size + 1);
dprintf("binding to node %d, mask: %016lx => %d\n", node, *node_mask->maskp, ret);
numa_bitmask_free(node_mask);
BUG_ON(ret);
}
/* 2 MiB; NOTE(review): presumably the huge-page size used when sizing buffers - confirm */
#define HPSIZE (2 * 1024 * 1024)
/*
 * Name the calling thread via prctl(PR_SET_NAME) using a printf-style
 * format. The text is first formatted into a 20-byte local buffer;
 * snprintf() truncates and NUL-terminates anything longer.
 *
 * Every line of the macro body must end in a backslash for the definition
 * to continue onto the next line.
 */
#define set_taskname(fmt...)				\
do {							\
	char name[20];					\
							\
	snprintf(name, sizeof(name), fmt);		\
	prctl(PR_SET_NAME, name);			\
} while (0)
static u8 *alloc_data(ssize_t bytes0, int map_flags, int init_zero, int init_cpu0, int thp, int init_random)
{
cpu_set_t *orig_mask = NULL;
ssize_t bytes;
u8 *buf; int ret;
if (!bytes0) return NULL;
/* Allocate and initialize all memory on CPU#0: */ if (init_cpu0) { int node = numa_node_of_cpu(0);
if (map_flags == MAP_PRIVATE) { if (thp > 0) {
ret = madvise(buf, bytes, MADV_HUGEPAGE); if (ret && !g->print_once) {
g->print_once = 1;
printf("WARNING: Could not enable THP - do: 'echo madvise > /sys/kernel/mm/transparent_hugepage/enabled'\n");
}
} if (thp < 0) {
ret = madvise(buf, bytes, MADV_NOHUGEPAGE); if (ret && !g->print_once) {
g->print_once = 1;
printf("WARNING: Could not disable THP: run a CONFIG_TRANSPARENT_HUGEPAGE kernel?\n");
}
}
}
if (init_zero) {
bzero(buf, bytes);
} else { /* Initialize random contents, different in each word: */ if (init_random) {
u64 *wbuf = (void *)buf; long off = rand(); long i;
for (i = 0; i < bytes/8; i++)
wbuf[i] = i + off;
}
}
/*
 * Unmap the region of 'bytes' bytes that starts at 'data'.
 *
 * A NULL 'data' is ignored. A failing munmap() is passed to BUG_ON().
 */
static void free_data(void *data, ssize_t bytes)
{
	int ret;

	if (!data)
		return;

	ret = munmap(data, bytes);
	BUG_ON(ret);
}
/*
 * Create a shared memory buffer that can be shared between processes, zeroed.
 *
 * Calls alloc_data() with MAP_SHARED and init_zero set to 1; the init_cpu0,
 * thp and init_random settings are taken from g->p.
 */
static void *zalloc_shared_data(ssize_t bytes)
{
	return alloc_data(bytes, MAP_SHARED, 1, g->p.init_cpu0, g->p.thp, g->p.init_random);
}
/*
 * Create a shared memory buffer that can be shared between processes.
 *
 * Calls alloc_data() with MAP_SHARED and init_zero set to 0; the init_cpu0,
 * thp and init_random settings are taken from g->p.
 */
static void *setup_shared_data(ssize_t bytes)
{
	return alloc_data(bytes, MAP_SHARED, 0, g->p.init_cpu0, g->p.thp, g->p.init_random);
}
/*
 * Allocate process-local memory - this will either be shared between
 * threads of this process, or only be accessed by this thread.
 *
 * Calls alloc_data() with MAP_PRIVATE and init_zero set to 0; the init_cpu0,
 * thp and init_random settings are taken from g->p.
 */
static void *setup_private_data(ssize_t bytes)
{
	return alloc_data(bytes, MAP_PRIVATE, 0, g->p.init_cpu0, g->p.thp, g->p.init_random);
}
/* * Check whether a CPU is online * * Returns: * 1 -> if CPU is online * 0 -> if CPU is offline * -1 -> error case
*/ staticint is_cpu_online(unsignedint cpu)
{ char *str;
size_t strlen; char buf[256]; int status = -1; struct stat statbuf;
/* * Check if /sys/devices/system/cpu/cpux/online file * exists. Some cases cpu0 won't have online file since * it is not expected to be turned off generally. * In kernels without CONFIG_HOTPLUG_CPU, this * file won't exist
*/
snprintf(buf, sizeof(buf), "/sys/devices/system/cpu/cpu%d/online", cpu); if (stat(buf, &statbuf) != 0) return 1;
/* * Read online file using sysfs__read_str. * If read or open fails, return -1. * If read succeeds, return value from file * which gets stored in "str"
*/
snprintf(buf, sizeof(buf), "devices/system/cpu/cpu%d/online", cpu);
if (sysfs__read_str(buf, &str, &strlen) < 0) return status;
if (bind_cpu_0 >= g->p.nr_cpus || bind_cpu_1 >= g->p.nr_cpus) {
printf("\nTest not applicable, system has only %d CPUs.\n", g->p.nr_cpus); return -1;
}
if (is_cpu_online(bind_cpu_0) != 1 || is_cpu_online(bind_cpu_1) != 1) {
printf("\nTest not applicable, bind_cpu_0 or bind_cpu_1 is offline\n"); return -1;
}
if (bind_node_0 >= g->p.nr_nodes || bind_node_1 >= g->p.nr_nodes) {
printf("\nTest not applicable, system has only %d nodes.\n", g->p.nr_nodes); return -1;
}
/* * Make sure there's real data dependency to RAM (when read * accesses are enabled), so the compiler, the CPU and the * kernel (KSM, zero page, etc.) cannot optimize away RAM * accesses:
*/ staticinline u64 access_data(u64 *data, u64 val)
{ if (g->p.data_reads)
val += *data; if (g->p.data_writes)
*data = val + 1; return val;
}
/* * The worker process does two types of work, a forwards going * loop and a backwards going loop. * * We do this so that on multiprocessor systems we do not create * a 'train' of processing, with highly synchronized processes, * skewing the whole benchmark.
*/ static u64 do_work(u8 *__data, long bytes, int nr, int nr_max, int loop, u64 val)
{ long words = bytes/sizeof(u64);
u64 *data = (void *)__data; long chunk_0, chunk_1;
u64 *d0, *d, *d1; long off; long i;
BUG_ON(!data && words);
BUG_ON(data && !words);
if (!data) return val;
/* Very simple memset() work variant: */ if (g->p.data_zero_memset && !g->p.data_rand_walk) {
bzero(data, bytes); return val;
}
/* Spread out by PID/TID nr and by loop nr: */
chunk_0 = words/nr_max;
chunk_1 = words/g->p.nr_loops;
off = nr*chunk_0 + loop*chunk_1;
while (off >= words)
off -= words;
if (g->p.data_rand_walk) {
u32 lfsr = nr + loop + val; long j;
for (i = 0; i < words/1024; i++) { long start, end;
/* * Count the number of nodes a process's threads * are spread out on. * * A count of 1 means that the process is compressed * to a single node. A count of g->p.nr_nodes means it's * spread out on the whole system.
*/ staticint count_process_nodes(int process_nr)
{ char *node_present; int nodes; int n, t;
node = numa_node_of_cpu(td->curr_cpu); if (node < 0) /* curr_cpu was likely still -1 */ {
free(node_present); return 0;
}
node_present[node] = 1;
}
nodes = 0;
for (n = 0; n < g->p.nr_nodes; n++)
nodes += node_present[n];
free(node_present); return nodes;
}
/* * Count the number of distinct process-threads a node contains. * * A count of 1 means that the node contains only a single * process. If all nodes on the system contain at most one * process then we are well-converged.
*/ staticint count_node_processes(int node)
{ int processes = 0; int t, p;
for (p = 0; p < g->p.nr_proc; p++) { for (t = 0; t < g->p.nr_threads; t++) { struct thread_data *td; int task_nr; int n;
/* Strong convergence: all threads compress on a single node: */ if (nodes_min == 1 && nodes_max == 1) {
*strong = 1;
} else {
*strong = 0;
tprintf(" {%d-%d}", nodes_min, nodes_max);
}
}
staticvoid calc_convergence(double runtime_ns_max, double *convergence)
{ unsignedint loops_done_min, loops_done_max; int process_groups; int *nodes; int distance; int nr_min; int nr_max; int strong; int sum; int nr; int node; int cpu; int t;
if (!g->p.show_convergence && !g->p.measure_convergence) return;
for (node = 0; node < g->p.nr_nodes; node++) { if (!is_node_present(node)) continue;
nr = nodes[node];
nr_min = min(nr, nr_min);
nr_max = max(nr, nr_max);
sum += nr;
}
BUG_ON(nr_min > nr_max);
BUG_ON(sum > g->p.nr_tasks);
if (0 && (sum < g->p.nr_tasks)) {
free(nodes); return;
}
/* * Count the number of distinct process groups present * on nodes - when we are converged this will decrease * to g->p.nr_proc:
*/
process_groups = 0;
for (node = 0; node < g->p.nr_nodes; node++) { int processes;
if (!is_node_present(node)) continue;
processes = count_node_processes(node);
nr = nodes[node];
tprintf(" %2d/%-2d", nr, processes);
if (details >= 2) {
printf("# thread %2d / %2d global mem: %p, process mem: %p, thread mem: %p\n",
process_nr, thread_nr, global_data, process_data, thread_data);
}
if (g->p.serialize_startup) {
mutex_lock(&g->startup_mutex);
g->nr_tasks_started++; /* The last thread wakes the main process. */ if (g->nr_tasks_started == g->p.nr_tasks)
cond_signal(&g->startup_cond);
mutex_unlock(&g->startup_mutex);
/* Here we will wait for the main process to start us all at once: */
mutex_lock(&g->start_work_mutex);
g->start_work = false;
g->nr_tasks_working++; while (!g->start_work)
cond_wait(&g->start_work_cond, &g->start_work_mutex);
for (l = 0; l < g->p.nr_loops; l++) {
start = stop;
if (g->stop_work) break;
val += do_work(global_data, g->p.bytes_global, process_nr, g->p.nr_proc, l, val);
val += do_work(process_data, g->p.bytes_process, thread_nr, g->p.nr_threads, l, val);
val += do_work(thread_data, g->p.bytes_thread, 0, 1, l, val);
if (g->p.sleep_usecs) {
mutex_lock(td->process_lock);
usleep(g->p.sleep_usecs);
mutex_unlock(td->process_lock);
} /* * Amount of work to be done under a process-global lock:
*/ if (g->p.bytes_process_locked) {
mutex_lock(td->process_lock);
val += do_work(process_data, g->p.bytes_process_locked, thread_nr, g->p.nr_threads, l, val);
mutex_unlock(td->process_lock);
}
/* Check whether our max runtime timed out: */ if (g->p.nr_secs) {
timersub(&stop, &start0, &diff); if ((u32)diff.tv_sec >= g->p.nr_secs) {
g->stop_work = true; break;
}
}
/* Update the summary at most once per second: */ if (start.tv_sec == stop.tv_sec) continue;
/* * Perturb the first task's equilibrium every g->p.perturb_secs seconds, * by migrating to CPU#0:
*/ if (first_task && g->p.perturb_secs && (int)(stop.tv_sec - last_perturbance) >= g->p.perturb_secs) {
cpu_set_t *orig_mask; int target_cpu; int this_cpu;
last_perturbance = stop.tv_sec;
/* * Depending on where we are running, move into * the other half of the system, to create some * real disturbance:
*/
this_cpu = g->threads[task_nr].curr_cpu; if (this_cpu < g->p.nr_cpus/2)
target_cpu = g->p.nr_cpus-1; else
target_cpu = 0;
orig_mask = bind_to_cpu(target_cpu);
/* Here we are running on the target CPU already */ if (details >= 1)
printf(" (injecting perturbalance, moved to CPU#%d)\n", target_cpu);
/* * A worker process starts a couple of threads:
*/ staticvoid worker_process(int process_nr)
{ struct mutex process_lock; struct thread_data *td;
pthread_t *pthreads;
u8 *process_data; int task_nr; int ret; int t;
/* * Pick up the memory policy and the CPU binding of our first thread, * so that we initialize memory accordingly:
*/
task_nr = process_nr*g->p.nr_threads;
td = g->threads + task_nr;
/* * Print a short or long result, depending on the verbosity setting:
*/ staticvoid print_res(constchar *name, double val, constchar *txt_unit, constchar *txt_short, constchar *txt_long)
{ if (!name)
name = "main,";
if (g->p.serialize_startup) { bool threads_ready = false; double startup_sec;
/* * Wait for all the threads to start up. The last thread will * signal this process.
*/
mutex_lock(&g->startup_mutex); while (g->nr_tasks_started != g->p.nr_tasks)
cond_wait(&g->startup_cond, &g->startup_mutex);
mutex_unlock(&g->startup_mutex);
/* Wait for all threads to be at the start_work_cond. */ while (!threads_ready) {
mutex_lock(&g->start_work_mutex);
threads_ready = (g->nr_tasks_working == g->p.nr_tasks);
mutex_unlock(&g->start_work_mutex); if (!threads_ready)
usleep(1);
}
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereitgestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.