13 #include "../builtin.h" 14 #include "../util/util.h" 15 #include <subcmd/parse-options.h> 16 #include "../util/cloexec.h" 31 #include <sys/resource.h> 33 #include <sys/prctl.h> 34 #include <sys/types.h> 35 #include <linux/kernel.h> 36 #include <linux/time64.h> 44 #define tprintf(x...) do { if (g && g->p.show_details >= 0) printf(x); } while (0) 50 #define dprintf(x...) do { if (g && g->p.show_details >= 1) printf(x); } while (0) 166 OPT_INTEGER(
'p',
"nr_proc" , &
p0.
nr_proc,
"number of processes"),
167 OPT_INTEGER(
't',
"nr_threads" , &
p0.
nr_threads,
"number of threads per process"),
169 OPT_STRING(
'G',
"mb_global" , &
p0.
mb_global_str,
"MB",
"global memory (MBs)"),
170 OPT_STRING(
'P',
"mb_proc" , &
p0.
mb_proc_str,
"MB",
"process memory (MBs)"),
171 OPT_STRING(
'L',
"mb_proc_locked", &
p0.
mb_proc_locked_str,
"MB",
"process serialized/locked memory access (MBs), <= process_memory"),
172 OPT_STRING(
'T',
"mb_thread" , &
p0.
mb_thread_str,
"MB",
"thread memory (MBs)"),
174 OPT_UINTEGER(
'l',
"nr_loops" , &
p0.
nr_loops,
"max number of loops to run (default: unlimited)"),
175 OPT_UINTEGER(
's',
"nr_secs" , &
p0.
nr_secs,
"max number of seconds to run (default: 5 secs)"),
176 OPT_UINTEGER(
'u',
"usleep" , &
p0.
sleep_usecs,
"usecs to sleep per loop iteration"),
178 OPT_BOOLEAN(
'R',
"data_reads" , &
p0.
data_reads,
"access the data via reads (can be mixed with -W)"),
179 OPT_BOOLEAN(
'W',
"data_writes" , &
p0.
data_writes,
"access the data via writes (can be mixed with -R)"),
180 OPT_BOOLEAN(
'B',
"data_backwards", &
p0.
data_backwards,
"access the data backwards as well"),
181 OPT_BOOLEAN(
'Z',
"data_zero_memset", &
p0.
data_zero_memset,
"access the data via glibc bzero only"),
182 OPT_BOOLEAN(
'r',
"data_rand_walk", &
p0.
data_rand_walk,
"access the data with random (32bit LFSR) walk"),
185 OPT_BOOLEAN(
'z',
"init_zero" , &
p0.
init_zero,
"bzero the initial allocations"),
186 OPT_BOOLEAN(
'I',
"init_random" , &
p0.
init_random,
"randomize the contents of the initial allocations"),
187 OPT_BOOLEAN(
'0',
"init_cpu0" , &
p0.
init_cpu0,
"do the initial allocations on CPU#0"),
188 OPT_INTEGER(
'x',
"perturb_secs", &
p0.
perturb_secs,
"perturb thread 0/0 every X secs, to test convergence stability"),
190 OPT_INCR (
'd',
"show_details" , &
p0.
show_details,
"Show details"),
191 OPT_INCR (
'a',
"all" , &
p0.
run_all,
"Run all tests in the suite"),
192 OPT_INTEGER(
'H',
"thp" , &
p0.
thp,
"MADV_NOHUGEPAGE < 0 < MADV_HUGEPAGE"),
193 OPT_BOOLEAN(
'c',
"show_convergence", &
p0.
show_convergence,
"show convergence details, " 194 "convergence is reached when each process (all its threads) is running on a single NUMA node."),
196 OPT_BOOLEAN(
'q',
"quiet" , &
p0.
show_quiet,
"quiet mode"),
200 OPT_CALLBACK(
'C',
"cpus", NULL,
"cpu[,cpu2,...cpuN]",
201 "bind the first N tasks to these specific cpus (the rest is unbound)",
203 OPT_CALLBACK(
'M',
"memnodes", NULL,
"node[,node2,...nodeN]",
204 "bind the first N tasks to these specific memory nodes (the rest is unbound)",
210 "perf bench numa <options>",
215 "perf bench numa mem [<options>]",
227 if (numa_bitmask_isbitset(numa_nodes_ptr, i))
239 return numa_bitmask_isbitset(numa_nodes_ptr, node);
247 struct bitmask *cpu = numa_allocate_cpumask();
250 if (cpu && !numa_node_to_cpus(node, cpu)) {
251 for (i = 0; i < cpu->size; i++) {
252 if (numa_bitmask_isbitset(cpu, i))
262 cpu_set_t orig_mask, mask;
265 ret = sched_getaffinity(0,
sizeof(orig_mask), &orig_mask);
270 if (target_cpu == -1) {
273 for (cpu = 0; cpu < g->
p.
nr_cpus; cpu++)
276 BUG_ON(target_cpu < 0 || target_cpu >= g->
p.
nr_cpus);
277 CPU_SET(target_cpu, &mask);
280 ret = sched_setaffinity(0,
sizeof(mask), &mask);
289 cpu_set_t orig_mask, mask;
294 BUG_ON(!cpus_per_node);
296 ret = sched_getaffinity(0,
sizeof(orig_mask), &orig_mask);
301 if (target_node == -1) {
302 for (cpu = 0; cpu < g->
p.
nr_cpus; cpu++)
305 int cpu_start = (target_node + 0) * cpus_per_node;
306 int cpu_stop = (target_node + 1) * cpus_per_node;
310 for (cpu = cpu_start; cpu < cpu_stop; cpu++)
314 ret = sched_setaffinity(0,
sizeof(mask), &mask);
324 ret = sched_setaffinity(0,
sizeof(mask), &mask);
332 ret = set_mempolicy(MPOL_DEFAULT, NULL, g->
p.
nr_nodes-1);
339 unsigned long nodemask;
345 BUG_ON(g->
p.
nr_nodes > (
int)
sizeof(nodemask)*8);
346 nodemask = 1L << node;
348 ret = set_mempolicy(MPOL_BIND, &nodemask,
sizeof(nodemask)*8);
349 dprintf(
"binding to node %d, mask: %016lx => %d\n", node, nodemask, ret);
354 #define HPSIZE (2*1024*1024) 356 #define set_taskname(fmt...) \ 360 snprintf(name, 20, fmt); \ 361 prctl(PR_SET_NAME, name); \ 365 int init_zero,
int init_cpu0,
int thp,
int init_random)
383 buf = (
void *)mmap(0, bytes, PROT_READ|PROT_WRITE, MAP_ANON|map_flags, -1, 0);
384 BUG_ON(buf == (
void *)-1);
386 if (map_flags == MAP_PRIVATE) {
391 printf(
"WARNING: Could not enable THP - do: 'echo madvise > /sys/kernel/mm/transparent_hugepage/enabled'\n");
398 printf(
"WARNING: Could not disable THP: run a CONFIG_TRANSPARENT_HUGEPAGE kernel?\n");
408 u64 *wbuf = (
void *)buf;
412 for (i = 0; i < bytes/8; i++)
418 buf = (
void *)(((
unsigned long)buf + HPSIZE-1) & ~(HPSIZE-1));
436 ret = munmap(data, bytes);
470 pthread_mutexattr_t
attr;
472 pthread_mutexattr_init(&attr);
473 pthread_mutexattr_setpshared(&attr, PTHREAD_PROCESS_SHARED);
474 pthread_mutex_init(mutex, &attr);
502 tprintf(
"# binding tasks to CPUs:\n");
506 int bind_cpu, bind_cpu_0, bind_cpu_1;
507 char *tok, *tok_end, *tok_step, *tok_len, *tok_mul;
512 tok = strsep(&str,
",");
516 tok_end = strstr(tok,
"-");
518 dprintf(
"\ntoken: {%s}, end: {%s}\n", tok, tok_end);
521 bind_cpu_0 = bind_cpu_1 = atol(tok);
524 bind_cpu_0 = atol(tok);
525 bind_cpu_1 = atol(tok_end + 1);
529 tok_step = strstr(tok,
"#");
531 step = atol(tok_step + 1);
532 BUG_ON(step <= 0 || step >= g->
p.
nr_cpus);
541 tok_len = strstr(tok,
"_");
543 bind_len = atol(tok_len + 1);
544 BUG_ON(bind_len <= 0 || bind_len > g->
p.
nr_cpus);
549 tok_mul = strstr(tok,
"x");
551 mul = atol(tok_mul + 1);
555 dprintf(
"CPUs: %d_%d-%d#%dx%d\n", bind_cpu_0, bind_len, bind_cpu_1, step, mul);
558 printf(
"\nTest not applicable, system has only %d CPUs.\n", g->
p.
nr_cpus);
562 BUG_ON(bind_cpu_0 < 0 || bind_cpu_1 < 0);
563 BUG_ON(bind_cpu_0 > bind_cpu_1);
565 for (bind_cpu = bind_cpu_0; bind_cpu <= bind_cpu_1; bind_cpu += step) {
568 for (i = 0; i < mul; i++) {
572 printf(
"\n# NOTE: ignoring bind CPUs starting at CPU#%d\n #", bind_cpu);
580 tprintf(
"%2d/%d", bind_cpu, bind_len);
586 for (cpu = bind_cpu; cpu < bind_cpu+bind_len; cpu++) {
587 BUG_ON(cpu < 0 || cpu >= g->
p.
nr_cpus);
599 printf(
"# NOTE: %d tasks bound, %d tasks unbound\n", t, g->
p.
nr_tasks - t);
606 const char *arg,
int unset __maybe_unused)
639 tprintf(
"# binding tasks to NODEs:\n");
644 char *tok, *tok_end, *tok_step, *tok_mul;
648 tok = strsep(&str,
",");
652 tok_end = strstr(tok,
"-");
654 dprintf(
"\ntoken: {%s}, end: {%s}\n", tok, tok_end);
657 bind_node_0 = bind_node_1 = atol(tok);
660 bind_node_0 = atol(tok);
661 bind_node_1 = atol(tok_end + 1);
665 tok_step = strstr(tok,
"#");
667 step = atol(tok_step + 1);
668 BUG_ON(step <= 0 || step >= g->
p.
nr_nodes);
673 tok_mul = strstr(tok,
"x");
675 mul = atol(tok_mul + 1);
679 dprintf(
"NODEs: %d-%d #%d\n", bind_node_0, bind_node_1, step);
682 printf(
"\nTest not applicable, system has only %d nodes.\n", g->
p.
nr_nodes);
686 BUG_ON(bind_node_0 < 0 || bind_node_1 < 0);
687 BUG_ON(bind_node_0 > bind_node_1);
689 for (bind_node = bind_node_0; bind_node <= bind_node_1; bind_node += step) {
692 for (i = 0; i < mul; i++) {
694 printf(
"\n# NOTE: ignoring bind NODEs starting at NODE#%d\n", bind_node);
714 printf(
"# NOTE: %d tasks mem-bound, %d tasks unbound\n", t, g->
p.
nr_tasks - t);
721 const char *arg,
int unset __maybe_unused)
731 #define BIT(x) (1ul << x) 735 const uint32_t taps =
BIT(1) |
BIT(5) |
BIT(6) |
BIT(31);
736 return (lfsr>>1) ^ ((0x0u - (lfsr & 0x1u)) & taps);
762 static u64
do_work(u8 *__data,
long bytes,
int nr,
int nr_max,
int loop, u64
val)
764 long words = bytes/
sizeof(u64);
765 u64 *
data = (
void *)__data;
766 long chunk_0, chunk_1;
771 BUG_ON(!data && words);
772 BUG_ON(data && !words);
784 chunk_0 = words/nr_max;
786 off = nr*chunk_0 + loop*chunk_1;
792 u32 lfsr = nr + loop +
val;
795 for (i = 0; i < words/1024; i++) {
800 start = lfsr % words;
801 end =
min(start + 1024, words-1);
804 bzero(data + start, (end-start) *
sizeof(u64));
806 for (j = start; j < end; j++)
818 if (unlikely(d >= d1))
820 if (unlikely(d == d0))
836 if (unlikely(d < data))
838 if (unlikely(d == d0))
857 prctl(0, bytes_worked);
860 #define MAX_NR_NODES 64 884 node = numa_node_of_cpu(td->
curr_cpu);
888 node_present[node] = 1;
894 nodes += node_present[n];
911 for (p = 0; p < g->
p.
nr_proc; p++) {
933 unsigned int nodes_min, nodes_max;
939 for (p = 0; p < g->
p.
nr_proc; p++) {
947 nodes_min =
min(nodes, nodes_min);
948 nodes_max = max(nodes, nodes_max);
952 if (nodes_min == 1 && nodes_max == 1) {
956 tprintf(
" {%d-%d}", nodes_min, nodes_max);
962 unsigned int loops_done_min, loops_done_max;
978 for (node = 0; node < g->
p.
nr_nodes; node++)
994 node = numa_node_of_cpu(cpu);
999 loops_done_min =
min(loops_done, loops_done_min);
1000 loops_done_max = max(loops_done, loops_done_max);
1007 for (node = 0; node < g->
p.
nr_nodes; node++) {
1011 nr_min =
min(nr, nr_min);
1012 nr_max = max(nr, nr_max);
1015 BUG_ON(nr_min > nr_max);
1029 for (node = 0; node < g->
p.
nr_nodes; node++) {
1036 tprintf(
" %2d/%-2d", nr, processes);
1038 process_groups += processes;
1041 distance = nr_max - nr_min;
1043 tprintf(
" [%2d/%-2d]", distance, process_groups);
1046 loops_done_min, loops_done_max, loops_done_max-loops_done_min);
1048 if (loops_done_min && loops_done_max) {
1049 double skew = 1.0 - (double)loops_done_min/loops_done_max;
1051 tprintf(
" [%4.1f%%]", skew * 100.0);
1056 if (strong && process_groups == g->
p.
nr_proc) {
1057 if (!*convergence) {
1058 *convergence = runtime_ns_max;
1074 static void show_summary(
double runtime_ns_max,
int l,
double *convergence)
1076 tprintf(
"\r # %5.1f%% [%.1f mins]",
1088 struct timeval start0, start, stop, diff;
1091 unsigned long last_perturbance;
1094 int first_task, last_task;
1095 double convergence = 0;
1097 double runtime_ns_max;
1104 struct rusage rusage;
1111 global_data = g->
data;
1122 if (process_nr == 0 && thread_nr == 0)
1126 printf(
"# thread %2d / %2d global mem: %p, process mem: %p, thread mem: %p\n",
1127 process_nr, thread_nr, global_data, process_data, thread_data);
1146 gettimeofday(&start0, NULL);
1148 start = stop = start0;
1149 last_perturbance = start.tv_sec;
1179 bytes_done += work_done;
1186 gettimeofday(&stop, NULL);
1190 timersub(&stop, &start0, &diff);
1191 if ((u32)diff.tv_sec >= g->
p.
nr_secs) {
1198 if (start.tv_sec == stop.tv_sec)
1206 cpu_set_t orig_mask;
1210 last_perturbance = stop.tv_sec;
1218 if (this_cpu < g->p.
nr_cpus/2)
1227 printf(
" (injecting perturbalance, moved to CPU#%d)\n", target_cpu);
1233 timersub(&stop, &start, &diff);
1235 runtime_ns_max += diff.tv_usec * NSEC_PER_USEC;
1238 printf(
" #%2d / %2d: %14.2lf nsecs/op [val: %016"PRIx64
"]\n",
1239 process_nr, thread_nr, runtime_ns_max / bytes_done, val);
1246 timersub(&stop, &start0, &diff);
1248 runtime_ns_max += diff.tv_usec * NSEC_PER_USEC;
1253 gettimeofday(&stop, NULL);
1254 timersub(&stop, &start0, &diff);
1256 td->
runtime_ns += diff.tv_usec * NSEC_PER_USEC;
1259 getrusage(RUSAGE_THREAD, &rusage);
1263 td->
user_time_ns += rusage.ru_utime.tv_usec * NSEC_PER_USEC;
1281 pthread_t *pthreads;
1287 pthread_mutex_init(&process_lock, NULL);
1304 printf(
" # process %2d global mem: %p, process mem: %p\n",
1305 process_nr, g->
data, process_data);
1320 ret = pthread_create(pthreads + t, NULL,
worker_thread, td);
1325 ret = pthread_join(pthreads[t], NULL);
1339 printf(
" # %d %s will execute (on %d nodes, %d CPUs):\n",
1341 printf(
" # %5dx %5ldMB global shared mem operations\n",
1343 printf(
" # %5dx %5ldMB process shared mem operations\n",
1345 printf(
" # %5dx %5ldMB thread local mem operations\n",
1350 printf(
"\n ###\n"); fflush(stdout);
1369 for (cpu = 0; cpu < g->
p.
nr_cpus; cpu++)
1383 g = (
void *)
alloc_data(
sizeof(*g), MAP_SHARED, 1, 0, 0 , 0);
1388 g->
p.
nr_cpus = numa_num_configured_cpus();
1468 const char *txt_unit,
const char *txt_short,
const char *txt_long)
1474 printf(
" %-30s %15.3f, %-15s %s\n", name, val, txt_unit, txt_short);
1476 printf(
" %14.3f %s\n", val, txt_long);
1481 struct timeval start, stop, diff;
1482 u64 runtime_ns_min, runtime_ns_sum;
1483 pid_t *pids,
pid, wpid;
1484 double delta_runtime;
1486 double runtime_sec_max;
1487 double runtime_sec_min;
1503 tprintf(
" # Startup synchronization: ..."); fflush(stdout);
1506 gettimeofday(&start, NULL);
1508 for (i = 0; i < g->
p.
nr_proc; i++) {
1510 dprintf(
" # process %2d: PID %d\n", i, pid);
1524 usleep(USEC_PER_MSEC);
1539 gettimeofday(&stop, NULL);
1541 timersub(&stop, &start, &diff);
1544 startup_sec += diff.tv_usec * NSEC_PER_USEC;
1547 tprintf(
" threads initialized in %.6f seconds.\n", startup_sec);
1553 gettimeofday(&start, NULL);
1559 for (i = 0; i < g->
p.
nr_proc; i++) {
1560 wpid = waitpid(pids[i], &wait_stat, 0);
1562 BUG_ON(!WIFEXITED(wait_stat));
1567 runtime_ns_min = -1LL;
1572 runtime_ns_sum += thread_runtime_ns;
1573 runtime_ns_min =
min(thread_runtime_ns, runtime_ns_min);
1576 gettimeofday(&stop, NULL);
1577 timersub(&stop, &start, &diff);
1585 runtime_sec_max += diff.tv_usec * NSEC_PER_USEC;
1595 "secs,",
"NUMA-convergence-latency",
"secs latency to NUMA-converge");
1599 "secs,",
"runtime-max/thread",
"secs slowest (max) thread-runtime");
1602 "secs,",
"runtime-min/thread",
"secs fastest (min) thread-runtime");
1605 "secs,",
"runtime-avg/thread",
"secs average thread-runtime");
1607 delta_runtime = (runtime_sec_max - runtime_sec_min)/2.0;
1608 print_res(name, delta_runtime / runtime_sec_max * 100.0,
1609 "%,",
"spread-runtime/thread",
"% difference between max/avg runtime");
1612 "GB,",
"data/thread",
"GB data processed, per thread");
1615 "GB,",
"data-total",
"GB data processed, total");
1618 "nsecs,",
"runtime/byte/thread",
"nsecs/byte/thread runtime");
1621 "GB/sec,",
"thread-speed",
"GB/sec/thread speed");
1623 print_res(name, bytes / runtime_sec_max / 1e9,
1624 "GB/sec,",
"total-speed",
"GB/sec total speed");
1627 char tname[14 + 2 * 10 + 1];
1629 for (p = 0; p < g->
p.
nr_proc; p++) {
1631 memset(tname, 0,
sizeof(tname));
1633 snprintf(tname,
sizeof(tname),
"process%d:thread%d", p, t);
1635 "GB/sec",
"thread-speed",
"GB/sec/thread speed");
1637 "secs",
"thread-system-time",
"system CPU time/thread");
1639 "secs",
"thread-user-time",
"user CPU time/thread");
1671 printf(
"\n # Running %s \"perf bench numa", name);
1673 for (i = 0; i <
argc; i++)
1674 printf(
" %s", argv[i]);
1678 memset(p, 0,
sizeof(*p));
1714 #define OPT_BW_RAM "-s", "20", "-zZq", "--thp", " 1", "--no-data_rand_walk" 1715 #define OPT_BW_RAM_NOTHP OPT_BW_RAM, "--thp", "-1" 1717 #define OPT_CONV "-s", "100", "-zZ0qcm", "--thp", " 1" 1718 #define OPT_CONV_NOTHP OPT_CONV, "--thp", "-1" 1720 #define OPT_BW "-s", "20", "-zZ0q", "--thp", " 1" 1721 #define OPT_BW_NOTHP OPT_BW, "--thp", "-1" 1730 {
"RAM-bw-local,",
"mem",
"-p",
"1",
"-t",
"1",
"-P",
"1024",
1732 {
"RAM-bw-local-NOTHP,",
1733 "mem",
"-p",
"1",
"-t",
"1",
"-P",
"1024",
1735 {
"RAM-bw-remote,",
"mem",
"-p",
"1",
"-t",
"1",
"-P",
"1024",
1739 {
"RAM-bw-local-2x,",
"mem",
"-p",
"2",
"-t",
"1",
"-P",
"1024",
1741 {
"RAM-bw-remote-2x,",
"mem",
"-p",
"2",
"-t",
"1",
"-P",
"1024",
1745 {
"RAM-bw-cross,",
"mem",
"-p",
"2",
"-t",
"1",
"-P",
"1024",
1749 {
" 1x3-convergence,",
"mem",
"-p",
"1",
"-t",
"3",
"-P",
"512",
OPT_CONV },
1750 {
" 1x4-convergence,",
"mem",
"-p",
"1",
"-t",
"4",
"-P",
"512",
OPT_CONV },
1751 {
" 1x6-convergence,",
"mem",
"-p",
"1",
"-t",
"6",
"-P",
"1020",
OPT_CONV },
1752 {
" 2x3-convergence,",
"mem",
"-p",
"3",
"-t",
"3",
"-P",
"1020",
OPT_CONV },
1753 {
" 3x3-convergence,",
"mem",
"-p",
"3",
"-t",
"3",
"-P",
"1020",
OPT_CONV },
1754 {
" 4x4-convergence,",
"mem",
"-p",
"4",
"-t",
"4",
"-P",
"512",
OPT_CONV },
1755 {
" 4x4-convergence-NOTHP,",
1757 {
" 4x6-convergence,",
"mem",
"-p",
"4",
"-t",
"6",
"-P",
"1020",
OPT_CONV },
1758 {
" 4x8-convergence,",
"mem",
"-p",
"4",
"-t",
"8",
"-P",
"512",
OPT_CONV },
1759 {
" 8x4-convergence,",
"mem",
"-p",
"8",
"-t",
"4",
"-P",
"512",
OPT_CONV },
1760 {
" 8x4-convergence-NOTHP,",
1762 {
" 3x1-convergence,",
"mem",
"-p",
"3",
"-t",
"1",
"-P",
"512",
OPT_CONV },
1763 {
" 4x1-convergence,",
"mem",
"-p",
"4",
"-t",
"1",
"-P",
"512",
OPT_CONV },
1764 {
" 8x1-convergence,",
"mem",
"-p",
"8",
"-t",
"1",
"-P",
"512",
OPT_CONV },
1765 {
"16x1-convergence,",
"mem",
"-p",
"16",
"-t",
"1",
"-P",
"256",
OPT_CONV },
1766 {
"32x1-convergence,",
"mem",
"-p",
"32",
"-t",
"1",
"-P",
"128",
OPT_CONV },
1769 {
" 2x1-bw-process,",
"mem",
"-p",
"2",
"-t",
"1",
"-P",
"1024",
OPT_BW },
1770 {
" 3x1-bw-process,",
"mem",
"-p",
"3",
"-t",
"1",
"-P",
"1024",
OPT_BW },
1771 {
" 4x1-bw-process,",
"mem",
"-p",
"4",
"-t",
"1",
"-P",
"1024",
OPT_BW },
1772 {
" 8x1-bw-process,",
"mem",
"-p",
"8",
"-t",
"1",
"-P",
" 512",
OPT_BW },
1773 {
" 8x1-bw-process-NOTHP,",
1774 "mem",
"-p",
"8",
"-t",
"1",
"-P",
" 512",
OPT_BW_NOTHP },
1775 {
"16x1-bw-process,",
"mem",
"-p",
"16",
"-t",
"1",
"-P",
"256",
OPT_BW },
1777 {
" 4x1-bw-thread,",
"mem",
"-p",
"1",
"-t",
"4",
"-T",
"256",
OPT_BW },
1778 {
" 8x1-bw-thread,",
"mem",
"-p",
"1",
"-t",
"8",
"-T",
"256",
OPT_BW },
1779 {
"16x1-bw-thread,",
"mem",
"-p",
"1",
"-t",
"16",
"-T",
"128",
OPT_BW },
1780 {
"32x1-bw-thread,",
"mem",
"-p",
"1",
"-t",
"32",
"-T",
"64",
OPT_BW },
1782 {
" 2x3-bw-thread,",
"mem",
"-p",
"2",
"-t",
"3",
"-P",
"512",
OPT_BW },
1783 {
" 4x4-bw-thread,",
"mem",
"-p",
"4",
"-t",
"4",
"-P",
"512",
OPT_BW },
1784 {
" 4x6-bw-thread,",
"mem",
"-p",
"4",
"-t",
"6",
"-P",
"512",
OPT_BW },
1785 {
" 4x8-bw-thread,",
"mem",
"-p",
"4",
"-t",
"8",
"-P",
"512",
OPT_BW },
1786 {
" 4x8-bw-thread-NOTHP,",
1787 "mem",
"-p",
"4",
"-t",
"8",
"-P",
"512",
OPT_BW_NOTHP },
1788 {
" 3x3-bw-thread,",
"mem",
"-p",
"3",
"-t",
"3",
"-P",
"512",
OPT_BW },
1789 {
" 5x5-bw-thread,",
"mem",
"-p",
"5",
"-t",
"5",
"-P",
"512",
OPT_BW },
1791 {
"2x16-bw-thread,",
"mem",
"-p",
"2",
"-t",
"16",
"-P",
"512",
OPT_BW },
1792 {
"1x32-bw-thread,",
"mem",
"-p",
"1",
"-t",
"32",
"-P",
"2048",
OPT_BW },
1794 {
"numa02-bw,",
"mem",
"-p",
"1",
"-t",
"32",
"-T",
"32",
OPT_BW },
1795 {
"numa02-bw-NOTHP,",
"mem",
"-p",
"1",
"-t",
"32",
"-T",
"32",
OPT_BW_NOTHP },
1796 {
"numa01-bw-thread,",
"mem",
"-p",
"2",
"-t",
"16",
"-T",
"192",
OPT_BW },
1797 {
"numa01-bw-thread-NOTHP,",
1798 "mem",
"-p",
"2",
"-t",
"16",
"-T",
"192",
OPT_BW_NOTHP },
1807 ret = system(
"echo ' #'; echo ' # Running test on: '$(uname -a); echo ' #'");
1810 for (i = 0; i <
nr; i++) {
static void deinit_thread_data(void)
static int count_node_processes(int node)
static void * zalloc_shared_data(ssize_t bytes)
static int nr_numa_nodes(void)
static void show_summary(double runtime_ns_max, int l, double *convergence)
static void worker_process(int process_nr)
static int parse_cpus_opt(const struct option *opt, const char *arg, int unset)
struct thread_data * threads
static void init_thread_data(void)
static void init_global_mutex(pthread_mutex_t *mutex)
static cpu_set_t bind_to_node(int target_node)
const char * mb_global_str
static void bind_to_memnode(int node)
static int parse_setup_cpu_list(void)
static cpu_set_t bind_to_cpu(int target_cpu)
static u64 access_data(u64 *data, u64 val)
static int bench_all(void)
static bool node_has_cpus(int node)
static int count_process_nodes(int process_nr)
static void calc_convergence_compression(int *strong)
x86 movsq based memset() in arch/x86/lib/memset_64.S") MEMSET_FN(memset_erms
static const char * tests[][MAX_ARGS]
static void print_summary(void)
static int is_node_present(int node)
#define BENCH_FORMAT_DEFAULT
static uint32_t lfsr_32(uint32_t lfsr)
const char * mb_proc_locked_str
static void * setup_private_data(ssize_t bytes)
static struct global_info * g
static int __bench_numa(const char *name)
static u8 * alloc_data(ssize_t bytes0, int map_flags, int init_zero, int init_cpu0, int thp, int init_random)
static void update_curr_cpu(int task_nr, unsigned long bytes_worked)
static int parse_setup_node_list(void)
static void * setup_shared_data(ssize_t bytes)
static int str(yyscan_t scanner, int token)
static void mempol_restore(void)
pthread_mutex_t startup_done_mutex
pthread_mutex_t stop_work_mutex
static void init_params(struct params *p, const char *name, int argc, const char **argv)
static u64 do_work(u8 *__data, long bytes, int nr, int nr_max, int loop, u64 val)
pthread_mutex_t startup_mutex
int bench_numa(int argc, const char **argv)
pthread_mutex_t * process_lock
static void calc_convergence(double runtime_ns_max, double *convergence)
static void * worker_thread(void *__tdata)
static const char *const numa_usage[]
int __weak sched_getcpu(void)
static int run_bench_numa(const char *name, const char **argv)
static void free_data(void *data, ssize_t bytes)
const char * mb_thread_str
static int parse_cpu_list(const char *arg)
static int parse_node_list(const char *arg)
static void bind_to_cpumask(cpu_set_t mask)
static int parse_nodes_opt(const struct option *opt, const char *arg, int unset)
static int command_size(const char **argv)
static void print_res(const char *name, double val, const char *txt_unit, const char *txt_short, const char *txt_long)
static const char *const bench_numa_usage[]
pthread_mutex_t start_work_mutex
long bytes_process_locked
#define set_taskname(fmt...)
void static void * zalloc(size_t size)