HPCToolkit
papi-c-cupti.c
Go to the documentation of this file.
1 // ******************* System Includes ********************
2 #include <ucontext.h>
3 #include <dlfcn.h>
4 
5 #include <stdbool.h>
6 #include <string.h>
7 #include <stdint.h>
8 // *********************************************************
9 
10 
11 // ******************** PAPI *******************************
12 #include <papi.h>
13 // *********************************************************
14 
15 // ******************** MONITOR *******************************
16 #include <monitor.h>
17 // *********************************************************
18 
19 // ******************** GPU includes ***********************
20 #include <cuda_runtime_api.h>
21 #include <cupti.h>
22 // *********************************************************
23 
24 // ******* HPCToolkit Includes *********************************
25 #include <lib/prof-lean/spinlock.h>
26 
27 #include <hpcrun/thread_data.h>
28 #include <messages/messages.h>
29 #include <hpcrun/sample_event.h>
30 #include <hpcrun/safe-sampling.h>
32 #include <sample-sources/common.h>
34 // *********************************************************
35 
36 // ******** local includes ***********
37 #include "papi-c.h"
38 #include "papi-c-extended-info.h"
39 // ***********************************
40 
41 // ****************** Convenience macros *******************
42 
43 #define CUPTI_LAUNCH_CALLBACK_DEPTH 7
44 
// Invoke a CUDA/CUPTI entry point and abort the run if it does not return
// CUPTI_SUCCESS.
//
//   fn   - function (or function pointer) to call; its spelling is pasted
//          into the diagnostic message
//   ...  - arguments forwarded to fn unchanged
//
// The expansion is wrapped in do { } while (0) so that `Cupti_call(...);`
// is exactly one statement (usable as the body of an un-braced if/else).
// The locals carry macro-private names so that an argument that mentions a
// caller's own `ret` is not captured by the macro's result variable.
#define Cupti_call(fn, ...)                                          \
do {                                                                 \
  int cupti_call_ret_ = fn(__VA_ARGS__);                             \
  if (cupti_call_ret_ != CUPTI_SUCCESS) {                            \
    /* fallback text in case the lookup leaves the string unset */   \
    const char* cupti_call_errstr_ = "<unknown error>";              \
    dcuptiGetResultString(cupti_call_ret_, &cupti_call_errstr_);     \
    hpcrun_abort("error: CUDA/CUPTI API "                            \
                 #fn " failed w error code %d ==> '%s'\n",           \
                 cupti_call_ret_, cupti_call_errstr_);               \
  }                                                                  \
} while (0)
56 
// Call a CUDA/CUPTI entry point and deliberately discard its result.
#define Cupti_call_silent(fn, ...)  \
{                                   \
  (void) fn(__VA_ARGS__);           \
}

// Declare the handle `v` in the enclosing scope and bind it to library `lib`,
// opened through monitor_real_dlopen with `flags`.  If the open fails, a
// message goes to stderr and the *enclosing function* returns; the macro is
// therefore only usable inside functions that return void.
#define Chk_dlopen(v, lib, flags)                    \
  void* v = monitor_real_dlopen(lib, flags);         \
  if (! v) {                                         \
    fprintf(stderr, "gpu dlopen %s failed\n", lib);  \
    return;                                          \
  }

// Look up symbol `fn` in library handle `h` and store the address in the
// pointer named d<fn>.  dlerror() is called first to clear any stale error,
// then again after dlsym to detect failure; on failure a message goes to
// stderr and the *enclosing function* returns.
#define Chk_dlsym(h, fn) {                                      \
  dlerror();                                                    \
  d ## fn = dlsym(h, #fn);                                      \
  char* dl_err = dlerror();                                     \
  if (dl_err) {                                                 \
    fprintf(stderr, "dlsym(%s) fails w '%s'\n", #fn, dl_err);   \
    return;                                                     \
  }                                                             \
}
78 // ***********************************************************
79 
80 typedef struct {
81  int nevents;
82  int event_set;
85 
86 static bool event_set_created = false;
87 static bool event_set_finalized = false;
88 
89 static papi_cuda_data_t local = {};
90 
93 
94 // ******************** cuda/cupti functions ***********************
95 // Some cuda/cupti functions must not be wrapped! So, we fetch them via dlopen.
96 // NOTE: naming convention is to prepend the letter "d" to the actual function
97 // The indirect functions are below.
98 //
99 cudaError_t (*dcudaThreadSynchronize)(void);
100 
101 CUptiResult (*dcuptiGetResultString)(CUptiResult result, const char** str);
102 
103 CUptiResult (*dcuptiSubscribe)(CUpti_SubscriberHandle* subscriber,
104  CUpti_CallbackFunc callback,
105  void* userdata);
106 
107 CUptiResult (*dcuptiEnableCallback)(uint32_t enable,
108  CUpti_SubscriberHandle subscriber,
109  CUpti_CallbackDomain domain,
110  CUpti_CallbackId cbid);
111 
112 CUptiResult (*dcuptiUnsubscribe)(CUpti_SubscriberHandle subscriber);
113 
114 
115 // *****************************************************************
116 typedef struct cuda_callback_t {
120 
121 //
122 // populate the cuda/cupti functions via dlopen
123 //
124 
125 static void
126 dlgpu(void)
127 {
128  // only use dlfunctions in NON static case
129 #ifndef HPCRUN_STATIC_LINK
130  Chk_dlopen(cudart, "libcudart.so", RTLD_NOW | RTLD_GLOBAL);
131  Chk_dlsym(cudart, cudaThreadSynchronize);
132 
133  Chk_dlopen(cupti, "libcupti.so", RTLD_NOW | RTLD_GLOBAL);
134  Chk_dlsym(cupti, cuptiGetResultString);
135  Chk_dlsym(cupti, cuptiSubscribe);
136  Chk_dlsym(cupti, cuptiEnableCallback);
137  Chk_dlsym(cupti, cuptiUnsubscribe);
138 #endif // ! HPCRUN_STATIC_LINK
139 }
140 
//
// noop routine: a callback that intentionally does nothing
//
static void
papi_c_no_action(void)
{
}
149 
//
// Predicate to determine if this component is being referenced:
// returns true iff `name` begins with the prefix "cuda".
//
// A NULL `name` is treated as "not the cuda component".
//
static bool
is_papi_c_cuda(const char* name)
{
  static const char prefix[] = "cuda";

  if (name == NULL) {
    return false;
  }

  // strncmp stops at the first mismatching character, whereas searching
  // with strstr and comparing the result to `name` scans the whole string
  // before a non-matching name can be rejected.
  return strncmp(name, prefix, sizeof(prefix) - 1) == 0;
}
158 
159 static void CUPTIAPI
161  CUpti_CallbackDomain domain,
162  CUpti_CallbackId cbid,
163  const CUpti_CallbackData* cbInfo)
164 {
165  TMSG(CUDA, "Got Kernel Callback");
166 
167  papi_cuda_data_t* cuda_data = userdata;
168  int nevents = cuda_data->nevents;
169  int cudaEventSet = cuda_data->event_set;
170  sample_source_t* self = cuda_data->self;
171 
172 
173  TMSG(CUDA, "nevents = %d, cuda event set = %x", nevents, cudaEventSet);
174 
175  // This callback is enabled only for kernel launch; anything else is an error.
176  if (cbid != CUPTI_RUNTIME_TRACE_CBID_cudaLaunch_v3020) {
177  hpcrun_abort("CUDA CUPTI callback seen for unexpected "
178  "interface operation: callback id %d\n", cbid);
179  }
180 
181  if (cbInfo->callbackSite == CUPTI_API_ENTER) {
182  TMSG(CUDA, "Cupti API -ENTER- portion");
183  // MC recommends FIXME: Unnecessary, but use cudaDeviceSynchronize
184  // exclusive access to launcher
185  spinlock_lock(&cupti_lock);
186  TMSG(CUPTI, "-ACQ-lock");
188 
189  TMSG(CUPTI,"-- PRE launch callback");
190  TMSG(CUDA, "Start monitoring with event set %d", cudaEventSet);
191  int ret = PAPI_start(cudaEventSet);
192  if (ret != PAPI_OK){
193  EMSG("CUDA monitoring failed to start. PAPI_start failed with %s (%d)",
194  PAPI_strerror(ret), ret);
195  }
196  }
197  TMSG(CUDA, "Past (or done with) CUDA -ENTER- portion");
198 
199 
200  if (cbInfo->callbackSite == CUPTI_API_EXIT) {
201  TMSG(CUDA, "Cupti API -EXIT- portion");
202  // MC recommends Use cudaDeviceSynchronize
204  TMSG(CUPTI, "-- POST launch callback");
205  long_long eventValues[nevents+2];
206 
207  TMSG(CUDA,"stopping CUDA monitoring w event set %d",cudaEventSet);
208  int ret = PAPI_stop(cudaEventSet, eventValues);
209  if (ret != PAPI_OK){
210  EMSG("CUDA monitoring failed to -stop-. PAPI_stop failed with %s (%d)",
211  PAPI_strerror(ret), ret);
212  }
213  TMSG(CUDA,"stopped CUDA monitoring w event set %d",cudaEventSet);
214 
215  ucontext_t uc;
216  TMSG(CUDA,"getting context in CUDA event handler");
217  getcontext(&uc);
218  TMSG(CUDA,"got context in CUDA event handler");
219  bool safe = hpcrun_safe_enter();
220  TMSG(CUDA,"blocked async event in CUDA event handler");
221  {
222  int i;
223  for (i = 0; i < nevents; i++)
224  {
225  int metric_id = hpcrun_event2metric(self, i);
226 
227  TMSG(CUDA, "sampling call path for metric_id = %d", metric_id);
228  hpcrun_sample_callpath(&uc, metric_id, eventValues[i]/*metricIncr*/,
229  CUPTI_LAUNCH_CALLBACK_DEPTH/*skipInner*/,
230  0/*isSync*/, NULL);
231  TMSG(CUDA, "sampled call path for metric_id = %d", metric_id);
232  }
233  }
234  TMSG(CUDA,"unblocking async event in CUDA event handler");
235  if (safe) hpcrun_safe_exit();
236  TMSG(CUDA,"unblocked async event in CUDA event handler");
237 
238  spinlock_unlock(&cupti_lock);
239  TMSG(CUPTI,"-REL-lock\n");
240  }
241  TMSG(CUDA, "At end (past -EXIT-)");
242 }
243 
244 static CUpti_SubscriberHandle subscriber;
245 
246 //
247 // sync setup for cuda/cupti
248 //
249 static void
251 {
252  // FIXME: Remove local definition
253  // CUpti_SubscriberHandle subscriber;
254 
255  static bool one_time = false;
256 
257  spinlock_lock(&setup_lock);
258  TMSG(CUDA, "CUPTI setup acquire lock");
259  if (one_time) {
260  spinlock_unlock(&setup_lock);
261  TMSG(CUDA, "CUPTI setup release lock (setup already called)");
262  return;
263  }
264 
265  TMSG(CUDA,"sync setup called");
266 
268  local.self = hpcrun_fetch_source_by_name("papi");
269 
270  local.nevents = local.self->evl.nevents;
271 
272  // get cuda event set
273 
274  int cuda_component_idx;
275  int n_components = PAPI_num_components();
276 
277  for (int i = 0; i < n_components; i++) {
278  if (is_papi_c_cuda(PAPI_get_component_info(i)->name)) {
279  cuda_component_idx = i;
280  break;
281  }
282  }
283 
284  papi_source_info_t* psi = td->ss_info[local.self->sel_idx].ptr;
285  local.event_set = get_component_event_set(psi, cuda_component_idx);
286 
288  (CUpti_CallbackFunc)hpcrun_cuda_kernel_callback,
289  &local);
290 
292  CUPTI_CB_DOMAIN_RUNTIME_API,
293  CUPTI_RUNTIME_TRACE_CBID_cudaLaunch_v3020);
294 
295  one_time = true;
296  spinlock_unlock(&setup_lock);
297  TMSG(CUDA, "CUPTI setup release lock");
298 }
299 
300 //
301 // Get or create a cupti event set --- but only ONCE per process
302 //
303 void
305 {
306  TMSG(CUDA, "Get event set");
307  spinlock_lock(&setup_lock);
308  TMSG(CUDA, "Cupti lock acquired");
309  if (! event_set_created) {
310  TMSG(CUDA, "No event set created, so create one");
311  int ret = PAPI_create_eventset(ev_s);
312  if (ret != PAPI_OK) {
313  hpcrun_abort("Failure: PAPI_create_eventset.Return code = %d ==> %s",
314  ret, PAPI_strerror(ret));
315  }
316  local.event_set = *ev_s;
317  event_set_created = true;
318  TMSG(CUDA, "Event set %d created", local.event_set);
319  }
320  spinlock_unlock(&setup_lock);
321  TMSG(CUDA, "Cupti lock released");
322 }
323 
324 int
325 papi_c_cupti_add_event(int ev_s, int ev)
326 {
327  int rv = PAPI_OK;
328  TMSG(CUDA, "Adding event to cupti event set");
329  spinlock_lock(&setup_lock);
330  TMSG(CUDA, "Cupti lock acquired");
331  if (! event_set_finalized) {
332  TMSG(CUDA, "Really add event %x to cupti event set", ev);
333  rv = PAPI_add_event(local.event_set, ev);
334  TMSG(CUDA, "Check event set passed in = %d, cuda event set = %d", ev_s, local.event_set);
335  }
336  spinlock_unlock(&setup_lock);
337  TMSG(CUDA, "Cupti lock released");
338  return rv;
339 }
340 
341 void
343 {
344  spinlock_lock(&setup_lock);
345  event_set_finalized = true;
346  spinlock_unlock(&setup_lock);
347 }
348 
349 
350 //
351 // sync teardown for cuda/cupti
352 //
353 static void
355 {
356  static bool one_time = false;
357  spinlock_lock(&setup_lock);
358  if (one_time) return;
359 
360  TMSG(CUDA,"sync teardown called (=unsubscribe)");
361 
363  one_time = true;
364  spinlock_unlock(&setup_lock);
365 }
366 
368  .pred = is_papi_c_cuda,
369  .get_event_set = papi_c_cupti_get_event_set,
370  .add_event = papi_c_cupti_add_event,
371  .finalize_event_set = papi_c_cupti_finalize_event_set,
372  .sync_setup = papi_c_cupti_setup,
373  .sync_teardown = papi_c_cupti_teardown,
374  .sync_start = papi_c_no_action,
375  .sync_stop = papi_c_no_action,
376  .process_only = true,
377  .next = NULL,
378 };
379 
380 
381 void
383 {
384  // fetch actual cuda/cupti functions
385  dlgpu();
386  papi_c_sync_register(&cuda_component);
387 }
static bool event_set_finalized
Definition: papi-c-cupti.c:87
static void papi_c_no_action(void)
Definition: papi-c-cupti.c:145
static void papi_c_cupti_teardown(void)
Definition: papi-c-cupti.c:354
static void hpcrun_safe_exit(void)
sample_val_t hpcrun_sample_callpath(void *context, int metricId, hpcrun_metricVal_t metricIncr, int skipInner, int isSync, sampling_info_t *data)
Definition: sample_event.c:160
#define SS_OBJ_CONSTRUCTOR(ssname)
Definition: ss-obj-name.h:58
struct cuda_callback_t cuda_callback_t
static void spinlock_unlock(spinlock_t *l)
Definition: spinlock.h:96
static spinlock_t setup_lock
Definition: papi-c-cupti.c:92
static void papi_c_cupti_setup(void)
Definition: papi-c-cupti.c:250
static spinlock_t cupti_lock
Definition: papi-c-cupti.c:91
static papi_cuda_data_t local
Definition: papi-c-cupti.c:89
void papi_c_cupti_get_event_set(int *ev_s)
Definition: papi-c-cupti.c:304
int papi_c_cupti_add_event(int ev_s, int ev)
Definition: papi-c-cupti.c:325
void SS_OBJ_CONSTRUCTOR() papi_c_cupti(void)
Definition: papi-c-cupti.c:382
static sync_info_list_t cuda_component
Definition: papi-c-cupti.c:367
#define hpcrun_abort(...)
Definition: messages.h:102
CUptiResult(* dcuptiUnsubscribe)(CUpti_SubscriberHandle subscriber)
Definition: papi-c-cupti.c:112
static CUpti_SubscriberHandle subscriber
Definition: papi-c-cupti.c:244
void papi_c_sync_register(sync_info_list_t *info)
static bool event_set_created
Definition: papi-c-cupti.c:86
sample_source_t * ss
Definition: papi-c-cupti.c:117
static void dlgpu(void)
Definition: papi-c-cupti.c:126
#define CUPTI_LAUNCH_CALLBACK_DEPTH
Definition: papi-c-cupti.c:43
sample_source_t * self
Definition: papi-c-cupti.c:83
#define EMSG
Definition: messages.h:70
#define Chk_dlsym(h, fn)
Definition: papi-c-cupti.c:69
static void spinlock_lock(spinlock_t *l)
Definition: spinlock.h:111
#define TMSG(f,...)
Definition: messages.h:93
int hpcrun_event2metric(sample_source_t *ss, int event_idx)
Definition: common.c:143
int get_component_event_set(papi_source_info_t *psi, int cidx)
Definition: papi-c.c:152
#define NULL
Definition: ElfHelper.cpp:85
CUptiResult(* dcuptiEnableCallback)(uint32_t enable, CUpti_SubscriberHandle subscriber, CUpti_CallbackDomain domain, CUpti_CallbackId cbid)
Definition: papi-c-cupti.c:107
CUptiResult(* dcuptiSubscribe)(CUpti_SubscriberHandle *subscriber, CUpti_CallbackFunc callback, void *userdata)
Definition: papi-c-cupti.c:103
unsigned char uc
Definition: amd-xop.c:3
void papi_c_cupti_finalize_event_set(void)
Definition: papi-c-cupti.c:342
sample_source_t * hpcrun_fetch_source_by_name(const char *src)
#define Chk_dlopen(v, lib, flags)
Definition: papi-c-cupti.c:62
static void CUPTIAPI hpcrun_cuda_kernel_callback(void *userdata, CUpti_CallbackDomain domain, CUpti_CallbackId cbid, const CUpti_CallbackData *cbInfo)
Definition: papi-c-cupti.c:160
const pred_proc_t pred
CUptiResult(* dcuptiGetResultString)(CUptiResult result, const char **str)
Definition: papi-c-cupti.c:101
int nevents
Definition: evlist.h:68
cudaError_t(* dcudaThreadSynchronize)(void)
Definition: papi-c-cupti.c:99
source_info_t * ss_info
Definition: thread_data.h:151
#define SPINLOCK_UNLOCKED
Definition: spinlock.h:84
static bool is_papi_c_cuda(const char *name)
Definition: papi-c-cupti.c:154
static int hpcrun_safe_enter(void)
#define Cupti_call(fn,...)
Definition: papi-c-cupti.c:45
thread_data_t *(* hpcrun_get_thread_data)(void)
Definition: thread_data.c:168
static int domain
Definition: monitor.c:149