HPCToolkit
upc.c
Go to the documentation of this file.
1 // -*-Mode: C++;-*- // technically C99
2 
3 // * BeginRiceCopyright *****************************************************
4 //
5 // $HeadURL$
6 // $Id$
7 //
8 // --------------------------------------------------------------------------
9 // Part of HPCToolkit (hpctoolkit.org)
10 //
11 // Information about sources of support for research and development of
12 // HPCToolkit is at 'hpctoolkit.org' and in 'README.Acknowledgments'.
13 // --------------------------------------------------------------------------
14 //
15 // Copyright (c) 2002-2019, Rice University
16 // All rights reserved.
17 //
18 // Redistribution and use in source and binary forms, with or without
19 // modification, are permitted provided that the following conditions are
20 // met:
21 //
22 // * Redistributions of source code must retain the above copyright
23 // notice, this list of conditions and the following disclaimer.
24 //
25 // * Redistributions in binary form must reproduce the above copyright
26 // notice, this list of conditions and the following disclaimer in the
27 // documentation and/or other materials provided with the distribution.
28 //
29 // * Neither the name of Rice University (RICE) nor the names of its
30 // contributors may be used to endorse or promote products derived from
31 // this software without specific prior written permission.
32 //
33 // This software is provided by RICE and contributors "as is" and any
34 // express or implied warranties, including, but not limited to, the
35 // implied warranties of merchantability and fitness for a particular
36 // purpose are disclaimed. In no event shall RICE or contributors be
37 // liable for any direct, indirect, incidental, special, exemplary, or
38 // consequential damages (including, but not limited to, procurement of
39 // substitute goods or services; loss of use, data, or profits; or
40 // business interruption) however caused and on any theory of liability,
41 // whether in contract, strict liability, or tort (including negligence
42 // or otherwise) arising in any way out of the use of this software, even
43 // if advised of the possibility of such damage.
44 //
45 // ******************************************************* EndRiceCopyright *
46 
47 /*
48  * BG/P's UPC interface for overflow sampling.
49  *
50  * https://wiki.alcf.anl.gov/index.php/Performance_Tools
51  * intrepid: /bgsys/drivers/ppcfloor/arch/include/spi/UPC.h
52  *
53  * At startup:
54  *
55  *   BGP_UPC_Initialize();
56  *   BGP_UPC_Initialize_Counter_Config(BGP_UPC_MODE_0, BGP_UPC_CFG_EDGE_DEFAULT);
57  *   for each event
58  *     BGP_UPC_Set_Counter_Value(event, 0);
59  *     BGP_UPC_Set_Counter_Threshold_Value(event, threshold);
60  *   BGP_UPC_Start(0);
61  *
62  * Inside signal handler for SIGXCPU:
63  *
64  *   BGP_UPC_Stop();
65  *   for each event
66  *     BGP_UPC_Read_Counter_Value(event, BGP_UPC_READ_EXCLUSIVE);
67  *     if (counter >= threshold)
68  *       BGP_UPC_Set_Counter_Value(event, 0);
69  *       BGP_UPC_Set_Counter_Threshold_Value(event, threshold);
70  *   BGP_UPC_Start(0);
71  *
72  * Notes:
73  *
74  * 1. Set counter value to 0 and count up to threshold.
75  * 2. Stop UPC before reading counters.
76  * 3. May sample on any number of events simultaneously.
77  * 4. There is no event for total cycles.
78  * 5. Seems to allow a separate threshold for each event,
79  * although some reports say only one global threshold.
80  * 6. Must run BGP_UPC_Initialize() in every process.
81  * 7. Set every node to Mode 0.
82  * 8. Don't use BGP_UPC_Monitor_Event().
83  *
84  * Note: all UPC interrupts go to core 0, so we sample core 0 and stay
85  * blind to the other cores.
86  */
87 
88 #include <sys/types.h>
89 #include <signal.h>
90 #include <stdio.h>
91 #include <stdlib.h>
92 #include <string.h>
93 
94 #include <spi/UPC.h>
95 #include <spi/UPC_Events.h>
96 #include <spi/kernel_interface.h>
97 #undef INIT
98 
99 #include <monitor.h>
100 
101 #include "simple_oo.h"
102 #include "sample_source_obj.h"
103 #include "common.h"
104 #include "ss-errno.h"
105 
106 #include <hpcrun/hpcrun_options.h>
107 #include <hpcrun/hpcrun_stats.h>
108 #include <hpcrun/metrics.h>
109 #include <hpcrun/safe-sampling.h>
110 #include <hpcrun/sample_event.h>
112 #include <hpcrun/thread_data.h>
113 #include <utilities/tokenize.h>
114 
115 #include <messages/messages.h>
116 #include <lush/lush-backtrace.h>
118 
119 /******************************************************************************
120  * external thread-local variables
121  *****************************************************************************/
122 extern __thread bool hpcrun_thread_suppress_sample;
123 
124 /******************************************************************************
125  * local variables
126  *****************************************************************************/
127 
128 // FIXME: there is no good way for sighandler to find self.
130 
131 #define DEFAULT_THRESHOLD 1000000L
132 
133 //----------------------------------------------------------------------
134 // Helper functions
135 //----------------------------------------------------------------------
136 
137 /*
138  * The UPC layer doesn't provide a name to event code function, so we
139  * do linear search here. There are about 1,000 possible events, but
140  * it's a one-time hit at startup.
141  *
142  * Returns: BGP event code, or -1 on failure.
143  */
144 #define EVENT_NAME_SIZE (BGP_UPC_MAXIMUM_LENGTH_EVENT_NAME + 50)
145 static int
146 bgp_event_name_to_code(const char *name)
147 {
148  char buf[EVENT_NAME_SIZE];
149  int ev;
150 
151  for (ev = BGP_UPC_MINIMUM_EVENT_ID; ev <= BGP_UPC_MAXIMUM_EVENT_ID; ev++) {
152  if (BGP_UPC_Get_Event_Name(ev, EVENT_NAME_SIZE, buf) >= 0
153  && strcmp(name, buf) == 0) {
154  return ev;
155  }
156  }
157 
158  return -1;
159 }
160 
/*
 * Skip the leading category of a BGP event description.  The
 * description starts with a "category:" prefix that largely repeats
 * what the event name already says.  For example:
 *
 *   name                    description
 *   BGP_PU0_JPIPE_ADD_SUB   P0 CPU: Add/Sub in J-pipe
 *
 * desc: NUL-terminated description string (not modified).
 *
 * Returns: pointer into desc just past the first ':' and any spaces
 * or tabs that follow it, or desc itself if it contains no ':'.
 */
static char *
trim_event_desc(char *desc)
{
  char *colon = strchr(desc, ':');

  if (colon != NULL) {
    char *rest = colon + 1;

    // step over the blanks separating the category from the text
    while (*rest == ' ' || *rest == '\t') {
      rest++;
    }
    return rest;
  }

  return desc;
}
182 
183 //----------------------------------------------------------------------
184 // Method functions and signal handler
185 //----------------------------------------------------------------------
186 
187 static int
188 hpcrun_upc_handler(int sig, siginfo_t *info, void *context)
189 {
191 
192  int64_t counter, threshold;
193  int ev, k;
194 
195 
196 
197  // if sampling disabled explicitly for this thread, skip all processing
200 
201  return 0; // tell monitor that the signal has been handled
202  }
203 
204  BGP_UPC_Stop();
205 
206  // If the interrupt came from inside our code, then drop the sample
207  // and return and avoid any MSG.
208  // FIXME: NULL is bogus here and should be the program counter.
209  int safe = hpcrun_safe_enter_async(NULL);
210  if (! safe) {
212  }
213 
214  for (k = 0; k < myself->evl.nevents; k++) {
215  ev = myself->evl.events[k].event;
216  counter = BGP_UPC_Read_Counter_Value(ev, BGP_UPC_READ_EXCLUSIVE);
217  threshold = myself->evl.events[k].thresh;
218  if (counter >= threshold) {
219  if (safe) {
220  hpcrun_sample_callpath(context, myself->evl.events[k].metric_id,
221  1, 0, 0, NULL);
222  }
223  BGP_UPC_Set_Counter_Value(ev, 0);
224  BGP_UPC_Set_Counter_Threshold_Value(ev, threshold);
225  }
226  }
227 
228  if (! hpcrun_is_sampling_disabled()) {
229  BGP_UPC_Start(0);
230  }
231 
232  if (safe) {
234  }
235 
237 
238  return 0; // tell monitor that the signal has been handled
239 }
240 
241 // Note: Must run BGP_UPC_Initialize() in every process,
242 // and set every node to Mode 0 (to count on cores 0 and 1).
243 static void
245 {
246  BGP_UPC_Initialize();
247  if (Kernel_PhysicalProcessorID() == 0) {
248  BGP_UPC_Initialize_Counter_Config(BGP_UPC_MODE_0, BGP_UPC_CFG_EDGE_DEFAULT);
249  }
250  self->state = INIT;
251  TMSG(UPC, "BGP_UPC_Initialize");
252 }
253 
// Per-thread initialization hook: intentionally empty for this
// sample source.
static void
METHOD_FN(thread_init)
{
  // nothing to do
}
258 
// Per-thread init action hook: intentionally empty for this
// sample source.
static void
METHOD_FN(thread_init_action)
{
  // nothing to do
}
263 
264 
265 static bool
266 METHOD_FN(supports_event, const char *ev_str)
267 {
268  char buf[EVENT_NAME_SIZE];
269  long threshold;
270 
271  if (self->state == UNINIT) {
272  METHOD_CALL(self, init);
273  }
274 
276  return bgp_event_name_to_code(buf) != -1;
277 }
278 
279 static void
280 METHOD_FN(process_event_list, int lush_metrics)
281 {
282  char *event;
283  char name[EVENT_NAME_SIZE];
284  long threshold;
285  int k, code, metric_id, nevents;
286 
287  for (event = start_tok(self->evl.evl_spec); more_tok(); event = next_tok()) {
289  code = bgp_event_name_to_code(name);
290  if (code < 0) {
291  EMSG("unexpected failure in UPC process_event_list(): "
292  "unable to find code for event %s", event);
293  hpcrun_ssfail_unsupported("UPC", event);
294  }
295  METHOD_CALL(self, store_event, code, threshold);
296  }
297 
298  nevents = self->evl.nevents;
300 
301  for (k = 0; k < nevents; k++) {
302  metric_id = hpcrun_new_metric();
303  code = self->evl.events[k].event;
304  threshold = self->evl.events[k].thresh;
305  BGP_UPC_Get_Event_Name(code, EVENT_NAME_SIZE, name);
306  hpcrun_set_metric_info_and_period(metric_id, strdup(name),
308  self->evl.events[k].metric_id = metric_id;
309  TMSG(UPC, "add event %s(%d), threshold %ld, metric %d",
310  name, code, threshold, metric_id);
311  }
312 }
313 
314 /*
315  * On BG/P, all UPC interrupts go to core 0, so we sample core 0 and
316  * stay blind to the other cores. We can sample on core 0 of every
317  * node, but leave in the soft failure in case something unexpected
318  * happens.
319  */
320 static void
321 METHOD_FN(gen_event_set, int lush_metrics)
322 {
323  char name[EVENT_NAME_SIZE];
324  int k, ev, ret;
325 
326  if (Kernel_PhysicalProcessorID() != 0) {
327  EMSG("Warning: unable to sample in this process/thread "
328  "due to BlueGene hardware limitations (not core 0).");
329  return;
330  }
331 
332  for (k = 0; k < self->evl.nevents; k++) {
333  ev = self->evl.events[k].event;
334  BGP_UPC_Get_Event_Name(ev, EVENT_NAME_SIZE, name);
335 
336  ret = BGP_UPC_Set_Counter_Value(ev, 0);
337  if (ret < 0) {
338  EMSG("Warning: unable to sample on this node "
339  "due to BlueGene hardware limitations.");
340  return;
341  }
342  ret = BGP_UPC_Set_Counter_Threshold_Value(ev, self->evl.events[k].thresh);
343  if (ret < 0) {
344  EMSG("Warning: unable to sample on this node "
345  "due to BlueGene hardware limitations.");
346  return;
347  }
348  TMSG(UPC, "monitor event %s(%d), threshold %ld",
349  name, ev, self->evl.events[k].thresh);
350  }
351 
352  myself = self;
353  ret = monitor_sigaction(SIGXCPU, &hpcrun_upc_handler, 0, NULL);
354  if (ret < 0) {
355  EEMSG("HPCToolkit fatal error: unable to install signal handler for SIGXCPU");
356  exit(1);
357  }
358  TMSG(UPC, "installed signal handler for SIGXCPU");
359 }
360 
361 static void
362 METHOD_FN(start)
363 {
364  if (Kernel_PhysicalProcessorID() != 0)
365  return;
366 
367  // Need to ignore failure here.
368  BGP_UPC_Start(0);
369 
370  TD_GET(ss_state)[self->sel_idx] = START;
371  TMSG(UPC, "BGP_UPC_Start on core 0");
372 }
373 
// Per-thread finalization hook: intentionally empty for this
// sample source.
static void
METHOD_FN(thread_fini_action)
{
  // nothing to do
}
378 
379 static void
381 {
382  if (Kernel_PhysicalProcessorID() != 0)
383  return;
384 
385  BGP_UPC_Stop();
386  TD_GET(ss_state)[self->sel_idx] = STOP;
387  TMSG(UPC, "BGP_UPC_Stop on core 0");
388 }
389 
390 static void
391 METHOD_FN(shutdown)
392 {
393  if (Kernel_PhysicalProcessorID() != 0)
394  return;
395 
396  METHOD_CALL(self, stop);
397  self->state = UNINIT;
398  TMSG(UPC, "shutdown on core 0");
399 }
400 
401 static void
402 METHOD_FN(display_events)
403 {
404  char name[EVENT_NAME_SIZE];
405  char desc[2048];
406  int ev, num_total;
407 
408  printf("===========================================================================\n");
409  printf("Available BG/P UPC events on core 0\n");
410  printf("===========================================================================\n");
411  printf("Name\t\t\t\t\tDescription\n");
412  printf("---------------------------------------------------------------------------\n");
413 
414  num_total = 0;
415  for (ev = BGP_UPC_MINIMUM_EVENT_ID; ev <= BGP_UPC_MAXIMUM_EVENT_ID; ev++) {
416  if (BGP_UPC_Get_Event_Name(ev, EVENT_NAME_SIZE, name) >= 0
417  && BGP_UPC_Get_Event_Description(ev, 2040, desc) >= 0
418  && strstr(name, "PU0") != NULL) {
419  printf("%-35s\t%s\n", name, trim_event_desc(desc));
420  num_total++;
421  }
422  }
423  printf("UPC events on core 0: %d\n", num_total);
424  printf("\n");
425 
426  printf("===========================================================================\n");
427  printf("Other BG/P UPC events (not all available)\n");
428  printf("===========================================================================\n");
429  printf("Name\t\t\t\t\tDescription\n");
430  printf("---------------------------------------------------------------------------\n");
431 
432  num_total = 0;
433  for (ev = BGP_UPC_MINIMUM_EVENT_ID; ev <= BGP_UPC_MAXIMUM_EVENT_ID; ev++) {
434  if (BGP_UPC_Get_Event_Name(ev, EVENT_NAME_SIZE, name) >= 0
435  && BGP_UPC_Get_Event_Description(ev, 2040, desc) >= 0
436  && strstr(name, "PU0") == NULL) {
437  printf("%-35s\t%s\n", name, trim_event_desc(desc));
438  num_total++;
439  }
440  }
441  printf("Other UPC events: %d\n", num_total);
442  printf("\n");
443 }
444 
445 #define ss_name upc
446 #define ss_cls SS_HARDWARE
447 
448 #include "ss_obj.h"
__thread bool hpcrun_thread_suppress_sample
Definition: main.c:193
static int bgp_event_name_to_code(const char *name)
Definition: upc.c:146
static void hpcrun_safe_exit(void)
sample_val_t hpcrun_sample_callpath(void *context, int metricId, hpcrun_metricVal_t metricIncr, int skipInner, int isSync, sampling_info_t *data)
Definition: sample_event.c:160
#define HPCTOOLKIT_APPLICATION_ERRNO_RESTORE()
Definition: ss-errno.h:64
static sample_source_t * myself
Definition: upc.c:129
Definition: fmt.c:108
#define DEFAULT_THRESHOLD
Definition: upc.c:131
static bool hpcrun_is_sampling_disabled(void)
Definition: sample_event.h:73
static char * trim_event_desc(char *desc)
Definition: upc.c:169
_ev_t events[MAX_EVENTS]
Definition: evlist.h:69
void hpcrun_stats_num_samples_blocked_async_inc(void)
Definition: hpcrun_stats.c:148
char * next_tok(void)
Definition: tokenize.c:87
metric_desc_t * hpcrun_set_metric_info_and_period(int metric_id, const char *name, MetricFlags_ValFmt_t valFmt, size_t period, metric_desc_properties_t prop)
Definition: metrics.c:411
exit
Definition: names.cpp:1
#define EVENT_NAME_SIZE
Definition: upc.c:144
#define EMSG
Definition: messages.h:70
void hpcrun_ssfail_unsupported(char *source, char *event)
Definition: common.c:230
int metric_id
Definition: evlist.h:56
char * start_tok(char *lst)
Definition: tokenize.c:70
int lush_metrics
Definition: main.c:188
static void METHOD_FN(init)
Definition: upc.c:244
long thresh
Definition: evlist.h:55
#define TD_GET(field)
Definition: thread_data.h:256
#define TMSG(f,...)
Definition: messages.h:93
int hpcrun_extract_ev_thresh(const char *in, int evlen, char *ev, long *th, long def)
Definition: tokenize.c:157
static int hpcrun_safe_enter_async(void *pc)
#define METHOD_CALL(obj, meth,...)
Definition: simple_oo.h:87
static int hpcrun_upc_handler(int sig, siginfo_t *info, void *context)
Definition: upc.c:188
#define EEMSG(...)
Definition: messages.h:90
#define NULL
Definition: ElfHelper.cpp:85
int event
Definition: evlist.h:54
static int const threshold
int hpcrun_new_metric(void)
Definition: metrics.c:333
#define HPCTOOLKIT_APPLICATION_ERRNO_SAVE()
Definition: ss-errno.h:63
int nevents
Definition: evlist.h:68
void hpcrun_pre_allocate_metrics(size_t num)
Definition: metrics.c:190
int more_tok(void)
Definition: tokenize.c:78