HPCToolkit
cuda.c
Go to the documentation of this file.
1 // -*-Mode: C++;-*- // technically C99
2 
3 // * BeginRiceCopyright *****************************************************
4 //
5 // $HeadURL: https://outreach.scidac.gov/svn/hpctoolkit/trunk/src/tool/hpcrun/sample-sources/papi.c $
6 // $Id: papi.c 3328 2010-12-23 23:39:09Z tallent $
7 //
8 // --------------------------------------------------------------------------
9 // Part of HPCToolkit (hpctoolkit.org)
10 //
11 // Information about sources of support for research and development of
12 // HPCToolkit is at 'hpctoolkit.org' and in 'README.Acknowledgments'.
13 // --------------------------------------------------------------------------
14 //
15 // Copyright ((c)) 2002-2019, Rice University
16 // All rights reserved.
17 //
18 // Redistribution and use in source and binary forms, with or without
19 // modification, are permitted provided that the following conditions are
20 // met:
21 //
22 // * Redistributions of source code must retain the above copyright
23 // notice, this list of conditions and the following disclaimer.
24 //
25 // * Redistributions in binary form must reproduce the above copyright
26 // notice, this list of conditions and the following disclaimer in the
27 // documentation and/or other materials provided with the distribution.
28 //
29 // * Neither the name of Rice University (RICE) nor the names of its
30 // contributors may be used to endorse or promote products derived from
31 // this software without specific prior written permission.
32 //
33 // This software is provided by RICE and contributors "as is" and any
34 // express or implied warranties, including, but not limited to, the
35 // implied warranties of merchantability and fitness for a particular
36 // purpose are disclaimed. In no event shall RICE or contributors be
37 // liable for any direct, indirect, incidental, special, exemplary, or
38 // consequential damages (including, but not limited to, procurement of
39 // substitute goods or services; loss of use, data, or profits; or
40 // business interruption) however caused and on any theory of liability,
41 // whether in contract, strict liability, or tort (including negligence
42 // or otherwise) arising in any way out of the use of this software, even
43 // if advised of the possibility of such damage.
44 //
45 // ******************************************************* EndRiceCopyright *
46 
47 //
48 // CUPTI synchronous sampling via PAPI sample source simple oo interface
49 //
50 
51 /******************************************************************************
52  * system includes
53  *****************************************************************************/
54 
55 #include <alloca.h>
56 #include <assert.h>
57 #include <ctype.h>
58 #include <papi.h>
59 #include <setjmp.h>
60 #include <stdlib.h>
61 #include <string.h>
62 #include <unistd.h>
63 #include <ucontext.h>
64 #include <stdbool.h>
65 
66 #include <pthread.h>
67 
68 #include <cuda.h>
69 #include <cupti.h>
70 
71 /******************************************************************************
72  * libmonitor
73  *****************************************************************************/
74 
75 #include <monitor.h>
76 
77 
78 /******************************************************************************
79  * local includes
80  *****************************************************************************/
81 
82 #include "simple_oo.h"
83 #include "sample_source_obj.h"
84 #include "common.h"
85 
86 #include <hpcrun/hpcrun_options.h>
87 #include <hpcrun/hpcrun_stats.h>
88 #include <hpcrun/metrics.h>
90 #include <hpcrun/sample_event.h>
91 #include <hpcrun/thread_data.h>
92 #include <utilities/tokenize.h>
93 #include <messages/messages.h>
94 #include <lush/lush-backtrace.h>
96 
97 
98 /******************************************************************************
99  * macros
100  *****************************************************************************/
101 
102 
103 #define OVERFLOW_MODE 0
104 #define NO_THRESHOLD 1L
105 
106 #define PAPI_CUDA_COMPONENT_ID 1
107 #define CUPTI_LAUNCH_CALLBACK_DEPTH 7
108 
109 
110 
111 /******************************************************************************
112  * forward declarations
113  *****************************************************************************/
114 
115 static void hpcrun_cuda_kernel_callback(void *userdata,
116  CUpti_CallbackDomain domain,
117  CUpti_CallbackId cbid,
118  const CUpti_CallbackData *cbInfo);
119 
120 static void check_cupti_error(int err, char *cuptifunc);
121 
122 static void event_fatal_error(int ev_code, int papi_ret);
123 
124 /******************************************************************************
125  * interface operations
126  *****************************************************************************/
127 
128 static void
130 {
131  PAPI_set_debug(0x3ff);
132 
133  // PAPI_library_init spawns threads yields DEADLOCK !!!
134  // --- Ignore new threads for init call ---
135  //
136  monitor_disable_new_threads();
137  int ret = PAPI_library_init(PAPI_VER_CURRENT);
138  monitor_enable_new_threads();
139 
140  TMSG(CUDA,"PAPI_library_init = %d", ret);
141  TMSG(CUDA,"PAPI_VER_CURRENT = %d", PAPI_VER_CURRENT);
142  if (ret != PAPI_VER_CURRENT){
143  STDERR_MSG("Fatal error: PAPI_library_init() failed with version mismatch.\n"
144  "HPCToolkit was compiled with version 0x%x but run on version 0x%x.\n"
145  "Check the HPCToolkit installation and try again.",
146  PAPI_VER_CURRENT, ret);
147  exit(1);
148  }
149  self->state = INIT;
150 }
151 
152 static void
153 METHOD_FN(thread_init)
154 {
155  TMSG(CUDA, "thread init");
156  int retval = PAPI_thread_init(pthread_self);
157  if (retval != PAPI_OK) {
158  EEMSG("PAPI_thread_init NOT ok, retval = %d", retval);
160  }
161  TMSG(CUDA, "thread init OK");
162 }
163 
164 static void
165 METHOD_FN(thread_init_action)
166 {
167  TMSG(CUDA, "register thread");
168  int retval = PAPI_register_thread();
169  if (retval != PAPI_OK) {
170  EEMSG("PAPI_register_thread NOT ok, retval = %d", retval);
172  }
173  TMSG(CUDA, "register thread ok");
174 }
175 
176 static void
177 METHOD_FN(start)
178 {
179  int cuptiErr;
180  CUpti_SubscriberHandle subscriber;
181 
182  TMSG(CUDA,"start called");
183 
184  cuptiErr = cuptiSubscribe(&subscriber,
185  (CUpti_CallbackFunc)hpcrun_cuda_kernel_callback,
186  (void *) NULL);
187  check_cupti_error(cuptiErr, "cuptiSubscribe");
188 
189  cuptiErr = cuptiEnableCallback(1, subscriber, CUPTI_CB_DOMAIN_RUNTIME_API,
190  CUPTI_RUNTIME_TRACE_CBID_cudaLaunch_v3020);
191  check_cupti_error(cuptiErr, "cuptiEnableCallback");
192 
193  TD_GET(ss_state)[self->evset_idx] = START;
194 }
195 
196 static void
197 METHOD_FN(thread_fini_action)
198 {
199  TMSG(CUDA, "unregister thread");
200  int rval = PAPI_unregister_thread();
201  if (rval != PAPI_OK) {
202  TMSG(CUDA, "warning: CUDA PAPI_unregister_thread (%d): %s.",
203  rval, PAPI_strerror(rval));
204  }
205 }
206 
207 static void
209 {
211 
212  int eventSet = td->eventSet[self->evset_idx];
213 
214  source_state_t my_state = TD_GET(ss_state)[self->evset_idx];
215 
216  TMSG(CUDA,"stop called");
217 
218  if (my_state == STOP) {
219  TMSG(CUDA,"PAPI CUDA stop called on an already stopped event set %d",eventSet);
220  return;
221  }
222 
223  if (my_state != START) {
224  TMSG(CUDA,"*WARNING* PAPI CUDA stop called on event set that has not been started");
225  return;
226  }
227 
228  TD_GET(ss_state)[self->evset_idx] = STOP;
229 }
230 
231 static void
232 METHOD_FN(shutdown)
233 {
234  int thr_id = TD_GET(id);
235  EEMSG("CUDA/PAPI shutdown from thread %d", thr_id);
236  if (thr_id != 0) {
237  EMSG("Shutdown op for cuda sample source called from thread %d", thr_id);
238  return;
239  }
240 
241  METHOD_CALL(self, stop); // make sure stop has been called
242 
244  int eventSet = td->eventSet[self->evset_idx];
245 
246  int rval; // for PAPI return codes
247 
248  /* Error need not be fatal -- we've already got our data! */
249  rval = PAPI_cleanup_eventset(eventSet);
250  if (rval != PAPI_OK) {
251  TMSG(CUDA, "warning: CUDA PAPI_cleanup_eventset (%d): %s.",
252  rval, PAPI_strerror(rval));
253  }
254 
255  rval = PAPI_destroy_eventset(&eventSet);
256  if (rval != PAPI_OK) {
257  TMSG(CUDA, "warning: CUDA PAPI_destroy_eventset (%d): %s.",
258  rval, PAPI_strerror(rval));
259  }
260 
261  td->eventSet[self->evset_idx] = PAPI_NULL;
262 
263  PAPI_shutdown();
264 
265  self->state = UNINIT;
266 }
267 
268 
269 #define CUDA_PREFIX "CUDA."
270 
271 // Return true if PAPI recognizes the name, whether supported or not.
272 // We'll handle unsupported events later.
273 static bool
274 METHOD_FN(supports_event,const char *ev_str)
275 {
276  if (self->state == UNINIT){
277  METHOD_CALL(self, init);
278  }
279 
280  char evtmp[1024];
281  int ec;
282  long th;
283 
284  hpcrun_extract_ev_thresh(ev_str, sizeof(evtmp), evtmp, &th,
285  NO_THRESHOLD);
286 
287  // handle only events for the CUDA component
288  if (strncmp(evtmp, CUDA_PREFIX, strlen(CUDA_PREFIX)) == 0) {
289  return PAPI_event_name_to_code(evtmp, &ec) == PAPI_OK;
290  }
291  return 0;
292 }
293 
294 static void
295 METHOD_FN(process_event_list, int lush_metrics)
296 {
297  char *event;
298  int i, ret;
299  int num_lush_metrics = 0;
300 
301  char* evlist = METHOD_CALL(self, get_event_str);
302  for (event = start_tok(evlist); more_tok(); event = next_tok()) {
303  char name[1024];
304  int evcode;
305  long thresh;
306 
307  TMSG(CUDA,"checking event spec = %s",event);
308  if (hpcrun_extract_ev_thresh(event, sizeof(name), name, &thresh,
309  NO_THRESHOLD)) {
310  AMSG("WARNING: %s is specified with a sampling threshold. "
311  "No thresholds supported for CUDA events", name);
312  }
313  ret = PAPI_event_name_to_code(name, &evcode);
314  if (ret != PAPI_OK) {
315  EMSG("unexpected failure in PAPI process_event_list(): "
316  "PAPI_event_name_to_code() returned %s (%d)",
317  PAPI_strerror(ret), ret);
318  hpcrun_ssfail_unsupported("PAPI", name);
319  }
320  if (PAPI_query_event(evcode) != PAPI_OK) {
321  hpcrun_ssfail_unsupported("PAPI", name);
322  }
323 
324  TMSG(CUDA,"got event code = %x, thresh = %ld", evcode, thresh);
325  METHOD_CALL(self, store_event, evcode, NO_THRESHOLD);
326  }
327  int nevents = (self->evl).nevents;
328  TMSG(CUDA,"nevents = %d", nevents);
329 
330  hpcrun_pre_allocate_metrics(nevents + num_lush_metrics);
331 
332  for (i = 0; i < nevents; i++) {
333  char buffer[PAPI_MAX_STR_LEN];
334  int metric_id = hpcrun_new_metric(); /* weight */
335  METHOD_CALL(self, store_metric_id, i, metric_id);
336  PAPI_event_code_to_name(self->evl.events[i].event, buffer);
337  TMSG(CUDA, "metric for event %d = %s", i, buffer);
338  hpcrun_set_metric_info_and_period(metric_id, strdup(buffer),
340  self->evl.events[i].thresh);
341  }
342 }
343 
344 static void
345 METHOD_FN(gen_event_set,int lush_metrics)
346 {
347  int i;
348  int ret;
349  int eventSet;
350 
351  eventSet = PAPI_NULL;
352  TMSG(CUDA,"create event set");
353  ret = PAPI_create_eventset(&eventSet);
354  TMSG(CUDA,"PAPI_create_eventset = %d, eventSet = %d", ret, eventSet);
355  if (ret != PAPI_OK) {
356  hpcrun_abort("Failure: PAPI_create_eventset.Return code = %d ==> %s",
357  ret, PAPI_strerror(ret));
358  }
359 
360  int nevents = (self->evl).nevents;
361  for (i = 0; i < nevents; i++) {
362  int evcode = self->evl.events[i].event;
363  ret = PAPI_add_event(eventSet, evcode);
364  TMSG(CUDA, "PAPI_add_event(eventSet=%d, event_code=%x)", eventSet, evcode);
365  if (ret != PAPI_OK) {
366  EMSG("failure in PAPI gen_event_set(): "
367  "PAPI_add_event() returned: %s (%d)",
368  PAPI_strerror(ret), ret);
369  event_fatal_error(evcode, ret);
370  }
371  }
372 
374  td->eventSet[self->evset_idx] = eventSet;
375 }
376 
377 static void
378 METHOD_FN(display_events)
379 {
380  PAPI_event_info_t info;
381  char name[200];
382  int ev, ret, num_total;
383 
384  printf("===========================================================================\n");
385  printf("Available CUDA events\n");
386  printf("===========================================================================\n");
387  printf("Name\t\t\t\tDescription\n");
388  printf("---------------------------------------------------------------------------\n");
389 
390 #ifdef PAPI_COMPONENT_STUFF_FIGURED_OUT
391  const PAPI_component_info_t *pci = PAPI_get_component_info(1);
392  printf("PAPI component name '%s' '%s' '%s' '%s'\n", pci->name, pci->version,
393  pci->support_version, pci->kernel_version);
394 #endif // PAPI_COMPONENT_STUFF_FIGURED_OUT
395 
396  num_total = 0;
397  ev = PAPI_NATIVE_MASK | PAPI_COMPONENT_MASK(PAPI_CUDA_COMPONENT_ID);
398  ret = PAPI_OK;
399 #ifdef PAPI_ENUM_FIRST
400  ret = PAPI_enum_event(&ev, PAPI_ENUM_FIRST);
401 #endif
402  while (ret == PAPI_OK) {
403  if (PAPI_query_event(ev) == PAPI_OK) {
404  PAPI_event_code_to_name(ev, name);
405  if (strncmp(name, CUDA_PREFIX, strlen(CUDA_PREFIX)) == 0 || 1) {
406  PAPI_get_event_info(ev, &info);
407  num_total++;
408  printf("%-30s\t%s\n", name, info.long_descr);
409  }
410  }
411  ret = PAPI_enum_event(&ev, PAPI_ENUM_EVENTS);
412  }
413  printf("Total CUDA events: %d\n", num_total);
414  printf("\n");
415 }
416 
417 /***************************************************************************
418  * object
419  ***************************************************************************/
420 
421 #define ss_name cuda
422 #define ss_cls SS_HARDWARE
423 
424 #include "ss_obj.h"
425 
426 /******************************************************************************
427  * private operations
428  *****************************************************************************/
429 
430 static void
431 event_fatal_error(int ev_code, int papi_ret)
432 {
433  char name[1024];
434 
435  PAPI_event_code_to_name(ev_code, name);
436  if (PAPI_query_event(ev_code) != PAPI_OK) {
437  hpcrun_ssfail_unsupported("CUDA", name);
438  }
439  if (papi_ret == PAPI_ECNFLCT) {
440  hpcrun_ssfail_conflict("CUDA", name);
441  }
442  hpcrun_ssfail_unsupported("CUDA", name);
443 }
444 
445 
446 static void
447 check_cupti_error(int err, char *cuptifunc)
448 {
449  if (err != CUPTI_SUCCESS) {
450  const char *errstr;
451  cuptiGetResultString(err, &errstr);
452 #ifdef CUPTI_ERRORS_UNMYSTIFIED
453  hpcrun_abort("error: CUDA CUPTI API function '%s' "
454  "failed with message '%s' \n", cuptifunc, errstr);
455 #endif // CUPTI_ERRORS_UNMYSTIFIED
456  }
457 }
458 
459 void CUPTIAPI
461  CUpti_CallbackDomain domain,
462  CUpti_CallbackId cbid,
463  const CUpti_CallbackData *cbInfo)
464 {
466  sample_source_t* self = &obj_name();
467 
468  int nevents = self->evl.nevents;
469  int cudaEventSet = td->eventSet[self->evset_idx];
470 
471  // This callback is enabled only for kernel launch; anything else is an error.
472  if (cbid != CUPTI_RUNTIME_TRACE_CBID_cudaLaunch_v3020) {
473  hpcrun_abort("CUDA CUPTI callback seen for unexpected "
474  "interface operation: callback id %d\n", cbid);
475  }
476 
477  if (cbInfo->callbackSite == CUPTI_API_ENTER) {
478  cudaThreadSynchronize();
479 
480  TMSG(CUDA,"starting CUDA monitoring w event set %d",cudaEventSet);
481  int ret = PAPI_start(cudaEventSet);
482  if (ret != PAPI_OK){
483  EMSG("CUDA monitoring failed to start. PAPI_start failed with %s (%d)",
484  PAPI_strerror(ret), ret);
485  }
486  }
487 
488  if (cbInfo->callbackSite == CUPTI_API_EXIT) {
489  cudaThreadSynchronize();
490  long_long *eventValues =
491  (long_long *) alloca(sizeof(long_long) * (nevents+2));
492 
493  TMSG(CUDA,"stopping CUDA monitoring w event set %d",cudaEventSet);
494  PAPI_stop(cudaEventSet, eventValues);
495  TMSG(CUDA,"stopped CUDA monitoring w event set %d",cudaEventSet);
496 
497  ucontext_t uc;
498  TMSG(CUDA,"getting context in CUDA event handler");
499  getcontext(&uc);
500  TMSG(CUDA,"got context in CUDA event handler");
501  hpcrun_async_block();
502  TMSG(CUDA,"blocked async event in CUDA event handler");
503  {
504  int i;
505  for (i = 0; i < nevents; i++)
506  {
507  int metric_id = hpcrun_event2metric(&_cuda_obj, i);
508 
509  TMSG(CUDA, "sampling call path for metric_id = %d", metric_id);
510  hpcrun_sample_callpath(&uc, metric_id, eventValues[i]/*metricIncr*/,
511  CUPTI_LAUNCH_CALLBACK_DEPTH/*skipInner*/,
512  0/*isSync*/, NULL);
513  TMSG(CUDA, "sampled call path for metric_id = %d", metric_id);
514  }
515  }
516  TMSG(CUDA,"unblocking async event in CUDA event handler");
517  hpcrun_async_unblock();
518  TMSG(CUDA,"unblocked async event in CUDA event handler");
519  }
520 }
#define STDERR_MSG(...)
Definition: messages.h:89
#define obj_name()
Definition: ss_obj.h:71
source_state_t
static void event_fatal_error(int ev_code, int papi_ret)
Definition: cuda.c:431
err
Definition: names.cpp:1
sample_val_t hpcrun_sample_callpath(void *context, int metricId, hpcrun_metricVal_t metricIncr, int skipInner, int isSync, sampling_info_t *data)
Definition: sample_event.c:160
#define PAPI_CUDA_COMPONENT_ID
Definition: cuda.c:106
static void METHOD_FN(init)
Definition: cuda.c:129
#define hpcrun_abort(...)
Definition: messages.h:102
#define NO_THRESHOLD
Definition: cuda.c:104
static CUpti_SubscriberHandle subscriber
Definition: papi-c-cupti.c:244
char * next_tok(void)
Definition: tokenize.c:87
metric_desc_t * hpcrun_set_metric_info_and_period(int metric_id, const char *name, MetricFlags_ValFmt_t valFmt, size_t period, metric_desc_properties_t prop)
Definition: metrics.c:411
static void check_cupti_error(int err, char *cuptifunc)
Definition: cuda.c:447
exit
Definition: names.cpp:1
#define CUPTI_LAUNCH_CALLBACK_DEPTH
Definition: cuda.c:107
#define EMSG
Definition: messages.h:70
void hpcrun_ssfail_unsupported(char *source, char *event)
Definition: common.c:230
char * start_tok(char *lst)
Definition: tokenize.c:70
int lush_metrics
Definition: main.c:188
#define TD_GET(field)
Definition: thread_data.h:256
#define CUDA_PREFIX
Definition: cuda.c:269
#define TMSG(f,...)
Definition: messages.h:93
int hpcrun_extract_ev_thresh(const char *in, int evlen, char *ev, long *th, long def)
Definition: tokenize.c:157
int hpcrun_event2metric(sample_source_t *ss, int event_idx)
Definition: common.c:143
#define AMSG
Definition: messages.h:71
#define METHOD_CALL(obj, meth,...)
Definition: simple_oo.h:87
#define EEMSG(...)
Definition: messages.h:90
#define NULL
Definition: ElfHelper.cpp:85
unsigned char uc
Definition: amd-xop.c:3
int hpcrun_new_metric(void)
Definition: metrics.c:333
void monitor_real_abort(void)
void hpcrun_pre_allocate_metrics(size_t num)
Definition: metrics.c:190
int more_tok(void)
Definition: tokenize.c:78
thread_data_t *(* hpcrun_get_thread_data)(void)
Definition: thread_data.c:168
static void hpcrun_cuda_kernel_callback(void *userdata, CUpti_CallbackDomain domain, CUpti_CallbackId cbid, const CUpti_CallbackData *cbInfo)
Definition: cuda.c:460
static int domain
Definition: monitor.c:149
void hpcrun_ssfail_conflict(char *source, char *event)
Definition: common.c:255