20 #include <cuda_runtime_api.h> 43 #define CUPTI_LAUNCH_CALLBACK_DEPTH 7 45 #define Cupti_call(fn, ...) \ 47 int ret = fn(__VA_ARGS__); \ 48 if (ret != CUPTI_SUCCESS) { \ 50 dcuptiGetResultString(ret, &errstr); \ 51 hpcrun_abort("error: CUDA/CUPTI API " \ 52 #fn " failed w error code %d ==> '%s'\n", \ 57 #define Cupti_call_silent(fn, ...) \ 59 (void) fn(__VA_ARGS__); \ 62 #define Chk_dlopen(v, lib, flags) \ 63 void* v = monitor_real_dlopen(lib, flags); \ 65 fprintf(stderr, "gpu dlopen %s failed\n", lib); \ 69 #define Chk_dlsym(h, fn) { \ 71 d ## fn = dlsym(h, #fn); \ 72 char* e = dlerror(); \ 74 fprintf(stderr, "dlsym(%s) fails w '%s'\n", #fn, e); \ 104 CUpti_CallbackFunc callback,
109 CUpti_CallbackDomain
domain,
110 CUpti_CallbackId cbid);
129 #ifndef HPCRUN_STATIC_LINK 130 Chk_dlopen(cudart,
"libcudart.so", RTLD_NOW | RTLD_GLOBAL);
131 Chk_dlsym(cudart, cudaThreadSynchronize);
133 Chk_dlopen(cupti,
"libcupti.so", RTLD_NOW | RTLD_GLOBAL);
138 #endif // ! HPCRUN_STATIC_LINK 156 return strstr(name,
"cuda") == name;
161 CUpti_CallbackDomain
domain,
162 CUpti_CallbackId cbid,
163 const CUpti_CallbackData* cbInfo)
165 TMSG(CUDA,
"Got Kernel Callback");
168 int nevents = cuda_data->
nevents;
173 TMSG(CUDA,
"nevents = %d, cuda event set = %x", nevents, cudaEventSet);
176 if (cbid != CUPTI_RUNTIME_TRACE_CBID_cudaLaunch_v3020) {
178 "interface operation: callback id %d\n", cbid);
181 if (cbInfo->callbackSite == CUPTI_API_ENTER) {
182 TMSG(CUDA,
"Cupti API -ENTER- portion");
186 TMSG(CUPTI,
"-ACQ-lock");
189 TMSG(CUPTI,
"-- PRE launch callback");
190 TMSG(CUDA,
"Start monitoring with event set %d", cudaEventSet);
191 int ret = PAPI_start(cudaEventSet);
193 EMSG(
"CUDA monitoring failed to start. PAPI_start failed with %s (%d)",
194 PAPI_strerror(ret), ret);
197 TMSG(CUDA,
"Past (or done with) CUDA -ENTER- portion");
200 if (cbInfo->callbackSite == CUPTI_API_EXIT) {
201 TMSG(CUDA,
"Cupti API -EXIT- portion");
204 TMSG(CUPTI,
"-- POST launch callback");
205 long_long eventValues[nevents+2];
207 TMSG(CUDA,
"stopping CUDA monitoring w event set %d",cudaEventSet);
208 int ret = PAPI_stop(cudaEventSet, eventValues);
210 EMSG(
"CUDA monitoring failed to -stop-. PAPI_stop failed with %s (%d)",
211 PAPI_strerror(ret), ret);
213 TMSG(CUDA,
"stopped CUDA monitoring w event set %d",cudaEventSet);
216 TMSG(CUDA,
"getting context in CUDA event handler");
218 TMSG(CUDA,
"got context in CUDA event handler");
220 TMSG(CUDA,
"blocked async event in CUDA event handler");
223 for (i = 0; i < nevents; i++)
227 TMSG(CUDA,
"sampling call path for metric_id = %d", metric_id);
231 TMSG(CUDA,
"sampled call path for metric_id = %d", metric_id);
234 TMSG(CUDA,
"unblocking async event in CUDA event handler");
236 TMSG(CUDA,
"unblocked async event in CUDA event handler");
239 TMSG(CUPTI,
"-REL-lock\n");
241 TMSG(CUDA,
"At end (past -EXIT-)");
255 static bool one_time =
false;
258 TMSG(CUDA,
"CUPTI setup acquire lock");
261 TMSG(CUDA,
"CUPTI setup release lock (setup already called)");
265 TMSG(CUDA,
"sync setup called");
274 int cuda_component_idx;
275 int n_components = PAPI_num_components();
277 for (
int i = 0; i < n_components; i++) {
279 cuda_component_idx = i;
292 CUPTI_CB_DOMAIN_RUNTIME_API,
293 CUPTI_RUNTIME_TRACE_CBID_cudaLaunch_v3020);
297 TMSG(CUDA,
"CUPTI setup release lock");
306 TMSG(CUDA,
"Get event set");
308 TMSG(CUDA,
"Cupti lock acquired");
310 TMSG(CUDA,
"No event set created, so create one");
311 int ret = PAPI_create_eventset(ev_s);
312 if (ret != PAPI_OK) {
313 hpcrun_abort(
"Failure: PAPI_create_eventset.Return code = %d ==> %s",
314 ret, PAPI_strerror(ret));
321 TMSG(CUDA,
"Cupti lock released");
328 TMSG(CUDA,
"Adding event to cupti event set");
330 TMSG(CUDA,
"Cupti lock acquired");
332 TMSG(CUDA,
"Really add event %x to cupti event set", ev);
333 rv = PAPI_add_event(local.
event_set, ev);
334 TMSG(CUDA,
"Check event set passed in = %d, cuda event set = %d", ev_s, local.
event_set);
337 TMSG(CUDA,
"Cupti lock released");
356 static bool one_time =
false;
358 if (one_time)
return;
360 TMSG(CUDA,
"sync teardown called (=unsubscribe)");
376 .process_only =
true,
static bool event_set_finalized
static void papi_c_no_action(void)
static void papi_c_cupti_teardown(void)
static void hpcrun_safe_exit(void)
sample_val_t hpcrun_sample_callpath(void *context, int metricId, hpcrun_metricVal_t metricIncr, int skipInner, int isSync, sampling_info_t *data)
#define SS_OBJ_CONSTRUCTOR(ssname)
struct cuda_callback_t cuda_callback_t
static void spinlock_unlock(spinlock_t *l)
static spinlock_t setup_lock
static void papi_c_cupti_setup(void)
static spinlock_t cupti_lock
static papi_cuda_data_t local
void papi_c_cupti_get_event_set(int *ev_s)
int papi_c_cupti_add_event(int ev_s, int ev)
void SS_OBJ_CONSTRUCTOR() papi_c_cupti(void)
static sync_info_list_t cuda_component
#define hpcrun_abort(...)
CUptiResult(* dcuptiUnsubscribe)(CUpti_SubscriberHandle subscriber)
static CUpti_SubscriberHandle subscriber
void papi_c_sync_register(sync_info_list_t *info)
static bool event_set_created
#define CUPTI_LAUNCH_CALLBACK_DEPTH
static void spinlock_lock(spinlock_t *l)
int hpcrun_event2metric(sample_source_t *ss, int event_idx)
int get_component_event_set(papi_source_info_t *psi, int cidx)
CUptiResult(* dcuptiEnableCallback)(uint32_t enable, CUpti_SubscriberHandle subscriber, CUpti_CallbackDomain domain, CUpti_CallbackId cbid)
CUptiResult(* dcuptiSubscribe)(CUpti_SubscriberHandle *subscriber, CUpti_CallbackFunc callback, void *userdata)
void papi_c_cupti_finalize_event_set(void)
sample_source_t * hpcrun_fetch_source_by_name(const char *src)
#define Chk_dlopen(v, lib, flags)
static void CUPTIAPI hpcrun_cuda_kernel_callback(void *userdata, CUpti_CallbackDomain domain, CUpti_CallbackId cbid, const CUpti_CallbackData *cbInfo)
CUptiResult(* dcuptiGetResultString)(CUptiResult result, const char **str)
cudaError_t(* dcudaThreadSynchronize)(void)
#define SPINLOCK_UNLOCKED
static bool is_papi_c_cuda(const char *name)
static int hpcrun_safe_enter(void)
#define Cupti_call(fn,...)
thread_data_t *(* hpcrun_get_thread_data)(void)