// -*-Mode: C++;-*- // technically C99

#include <cuda_runtime.h>
#include <lib/prof-lean/atomic.h>

#define CHECK_CU_ERROR(err, cufunc)                                           \
  if (err != CUDA_SUCCESS) {                                                  \
    EEMSG("%s:%d: error %d for CUDA Driver API function '%s'\n",              \
          __FILE__, __LINE__, err, cufunc);                                   \
    monitor_real_abort();                                                     \
  }

#define CHECK_CUPTI_ERROR(err, cuptifunc)                                     \
  if (err != CUPTI_SUCCESS) {                                                 \
    const char *errstr;                                                       \
    cuptiGetResultString(err, &errstr);                                       \
    EEMSG("%s:%d:Error %s for CUPTI API function '%s'.\n",                    \
          __FILE__, __LINE__, errstr, cuptifunc);                             \
    monitor_real_abort();                                                     \
  }

#define CU_SAFE_CALL(call) do {                                               \
  CUresult err = call;                                                        \
  if (CUDA_SUCCESS != err) {                                                  \
    EEMSG("Cuda driver error %d in call at file '%s' in line %i.\n",          \
          err, __FILE__, __LINE__);                                           \
    monitor_real_abort();                                                     \
  }                                                                           \
} while (0)

#define CUDA_SAFE_CALL(call) do {                                             \
  cudaError_t err = call;                                                     \
  if (cudaSuccess != err) {                                                   \
    EMSG("In %s, @ line %d, gives error %d = '%s'\n", __FILE__, __LINE__,     \
         err, cudaGetErrorString(err));                                       \
    monitor_real_abort();                                                     \
  }                                                                           \
} while (0)

#define Cuda_RTcall(fn) cudaRuntimeFunctionPointer[fn ## Enum].fn ## Real

#define GET_STREAM_ID(x) ((x) - g_stream_array)
#define ALL_STREAMS_MASK (0xffffffff)

#define MAX_SHARED_KEY_LENGTH (100)

#define HPCRUN_GPU_SHMSZ (1<<10)

#define SHARED_BLAMING_INITIALISED (ipc_data != NULL)

#define INCR_SHARED_BLAMING_DS(field) \
  do { if (SHARED_BLAMING_INITIALISED) atomic_add_i64(&(ipc_data->field), 1L); } while (0)
#define DECR_SHARED_BLAMING_DS(field) \
  do { if (SHARED_BLAMING_INITIALISED) atomic_add_i64(&(ipc_data->field), -1L); } while (0)

#define ADD_TO_FREE_EVENTS_LIST(node_ptr) do {                                \
  (node_ptr)->next_free_node = g_free_event_nodes_head;                       \
  g_free_event_nodes_head = (node_ptr);                                       \
} while (0)

#define ADD_TO_FREE_TREE_NODE_LIST(node_ptr) do {                             \
  (node_ptr)->next_free_node = g_free_tree_nodes_head;                        \
  g_free_tree_nodes_head = (node_ptr);                                        \
} while (0)

#define ADD_TO_FREE_ACTIVE_KERNEL_NODE_LIST(node_ptr) do {                    \
  (node_ptr)->next_free_node = g_free_active_kernel_nodes_head;               \
  g_free_active_kernel_nodes_head = (node_ptr);                               \
} while (0)

#define HPCRUN_ASYNC_BLOCK_SPIN_LOCK                                          \
  bool safe = false;                                                          \
  do {                                                                        \
    safe = hpcrun_safe_enter();                                               \
    spinlock_lock(&g_gpu_lock);                                               \
  } while (0)

#define HPCRUN_ASYNC_UNBLOCK_SPIN_UNLOCK                                      \
  do {                                                                        \
    spinlock_unlock(&g_gpu_lock);                                             \
    if (safe) hpcrun_safe_exit();                                             \
  } while (0)

#define SYNC_PROLOGUE(ctxt, launch_node, start_time, rec_node)                \
  TD_GET(gpu_data.overload_state) = SYNC_STATE;                               \
  TD_GET(gpu_data.accum_num_sync_threads) = 0;                                \
  TD_GET(gpu_data.accum_num_samples) = 0;                                     \
  hpcrun_safe_enter();                                                        \
  ucontext_t ctxt;                                                            \
  getcontext(&ctxt);                                                          \
  cct_node_t * launch_node = hpcrun_sample_callpath(&ctxt, cpu_idle_metric_id, 0, 0, 1, NULL).sample_node; \
  TD_GET(gpu_data.is_thread_at_cuda_sync) = true;                             \
  spinlock_lock(&g_gpu_lock);                                                 \
  uint64_t start_time;                                                        \
  event_list_node_t * rec_node = enter_cuda_sync(&start_time);                \
  spinlock_unlock(&g_gpu_lock);                                               \
  INCR_SHARED_BLAMING_DS(num_threads_at_sync_all_procs);                      \
  hpcrun_safe_exit();

#define SYNC_EPILOGUE(ctxt, launch_node, start_time, rec_node, mask, end_time) \
  hpcrun_safe_enter();                                                        \
  spinlock_lock(&g_gpu_lock);                                                 \
  uint64_t last_kernel_end_time = leave_cuda_sync(rec_node, start_time, mask); \
  TD_GET(gpu_data.accum_num_sync_threads) = 0;                                \
  TD_GET(gpu_data.accum_num_samples) = 0;                                     \
  spinlock_unlock(&g_gpu_lock);                                               \
  struct timeval tv;                                                          \
  gettimeofday(&tv, NULL);                                                    \
  uint64_t end_time = ((uint64_t)tv.tv_usec + (((uint64_t)tv.tv_sec) * 1000000)); \
  if (last_kernel_end_time > end_time) { last_kernel_end_time = end_time; }   \
  uint64_t cpu_idle_time = last_kernel_end_time == 0 ? 0 : last_kernel_end_time - start_time; \
  uint64_t gpu_idle_time = last_kernel_end_time == 0 ? end_time - start_time : end_time - last_kernel_end_time; \
  cct_metric_data_increment(cpu_idle_metric_id, launch_node, (cct_metric_data_t) {.i = (cpu_idle_time)}); \
  cct_metric_data_increment(gpu_idle_metric_id, launch_node, (cct_metric_data_t) {.i = (gpu_idle_time)}); \
  hpcrun_safe_exit();                                                         \
  DECR_SHARED_BLAMING_DS(num_threads_at_sync_all_procs);                      \
  TD_GET(gpu_data.is_thread_at_cuda_sync) = false;

#define SYNC_MEMCPY_PROLOGUE(ctxt, launch_node, start_time, rec_node)         \
  SYNC_PROLOGUE(ctxt, launch_node, start_time, rec_node)

#define SYNC_MEMCPY_EPILOGUE(ctxt, launch_node, start_time, rec_node, mask, end_time, bytes, direction) \
  hpcrun_safe_enter();                                                        \
  spinlock_lock(&g_gpu_lock);                                                 \
  uint64_t last_kernel_end_time = leave_cuda_sync(rec_node, start_time, mask); \
  TD_GET(gpu_data.accum_num_sync_threads) = 0;                                \
  TD_GET(gpu_data.accum_num_samples) = 0;                                     \
  spinlock_unlock(&g_gpu_lock);                                               \
  struct timeval tv;                                                          \
  gettimeofday(&tv, NULL);                                                    \
  uint64_t end_time = ((uint64_t)tv.tv_usec + (((uint64_t)tv.tv_sec) * 1000000)); \
  if (last_kernel_end_time > end_time) { last_kernel_end_time = end_time; }   \
  uint64_t cpu_idle_time = end_time - start_time;                             \
  uint64_t gpu_idle_time = last_kernel_end_time == 0 ? end_time - start_time : end_time - last_kernel_end_time; \
  cct_metric_data_increment(cpu_idle_metric_id, launch_node, (cct_metric_data_t) {.i = (cpu_idle_time)}); \
  cct_metric_data_increment(gpu_idle_metric_id, launch_node, (cct_metric_data_t) {.i = (gpu_idle_time)}); \
  increment_mem_xfer_metric(bytes, direction, launch_node);                   \
  hpcrun_safe_exit();                                                         \
  DECR_SHARED_BLAMING_DS(num_threads_at_sync_all_procs);                      \
  TD_GET(gpu_data.is_thread_at_cuda_sync) = false

#define ASYNC_KERNEL_PROLOGUE(streamId, event_node, context, cct_node, stream, skip_inner) \
  create_stream0_if_needed(stream);                                           \
  uint32_t streamId = 0;                                                      \
  event_list_node_t *event_node;                                              \
  streamId = splay_get_stream_id(stream);                                     \
  HPCRUN_ASYNC_BLOCK_SPIN_LOCK;                                               \
  TD_GET(gpu_data.is_thread_at_cuda_sync) = true;                             \
  ucontext_t context;                                                         \
  getcontext(&context);                                                       \
  cct_node_t *cct_node = hpcrun_sample_callpath(&context, cpu_idle_metric_id, 0, skip_inner, 1, NULL).sample_node; \
  cct_node_t *stream_cct = stream_duplicate_cpu_node(g_stream_array[streamId].st, &context, cct_node); \
  monitor_disable_new_threads();                                              \
  event_node = create_and_insert_event(streamId, cct_node, stream_cct);       \
  CUDA_SAFE_CALL(cudaRuntimeFunctionPointer[cudaEventRecordEnum].cudaEventRecordReal(event_node->event_start, stream)); \
  INCR_SHARED_BLAMING_DS(outstanding_kernels)

#define ASYNC_KERNEL_EPILOGUE(event_node, stream)                             \
  TD_GET(gpu_data.overload_state) = WORKING_STATE;                            \
  CUDA_SAFE_CALL(cudaRuntimeFunctionPointer[cudaEventRecordEnum].cudaEventRecordReal(event_node->event_end, stream)); \
  monitor_enable_new_threads();                                               \
  TD_GET(gpu_data.is_thread_at_cuda_sync) = false;                            \
  HPCRUN_ASYNC_UNBLOCK_SPIN_UNLOCK

#define ASYNC_MEMCPY_PROLOGUE(streamId, event_node, context, cct_node, stream, skip_inner) \
  ASYNC_KERNEL_PROLOGUE(streamId, event_node, context, cct_node, stream, skip_inner)

#define ASYNC_MEMCPY_EPILOGUE(event_node, cct_node, stream, count, kind)      \
  TD_GET(gpu_data.overload_state) = WORKING_STATE;                            \
  CUDA_SAFE_CALL(cudaRuntimeFunctionPointer[cudaEventRecordEnum].cudaEventRecordReal(event_node->event_end, stream)); \
  monitor_enable_new_threads();                                               \
  increment_mem_xfer_metric(count, kind, cct_node);                           \
  TD_GET(gpu_data.is_thread_at_cuda_sync) = false;                            \
  HPCRUN_ASYNC_UNBLOCK_SPIN_UNLOCK

#define GET_NEW_TREE_NODE(node_ptr) do {                                      \
  if (g_free_tree_nodes_head) {                                               \
    node_ptr = g_free_tree_nodes_head;                                        \
    g_free_tree_nodes_head = g_free_tree_nodes_head->next_free_node;          \
  } else {                                                                    \
    node_ptr = (tree_node *) hpcrun_malloc(sizeof(tree_node));                \
  }                                                                           \
} while (0)

#define GET_NEW_ACTIVE_KERNEL_NODE(node_ptr) do {                             \
  if (g_free_active_kernel_nodes_head) {                                      \
    node_ptr = g_free_active_kernel_nodes_head;                               \
    g_free_active_kernel_nodes_head = g_free_active_kernel_nodes_head->next_free_node; \
  } else {                                                                    \
    node_ptr = (active_kernel_node_t *) hpcrun_malloc(sizeof(active_kernel_node_t)); \
  }                                                                           \
} while (0)

#define SYNCHRONOUS_CLEANUP do {                                              \
  hpcrun_safe_enter();                                                        \
  spinlock_lock(&g_gpu_lock);                                                 \
  cleanup_finished_events();                                                  \
  spinlock_unlock(&g_gpu_lock);                                               \
  hpcrun_safe_exit();                                                         \
} while (0)

#define CUDA_RUNTIME_SYNC_WRAPPER(fn, prologueArgs, epilogueArgs, ...)        \
  VA_FN_DECLARE(cudaError_t, fn, __VA_ARGS__) {                               \
    if (!hpcrun_is_safe_to_sync(__func__))                                    \
      return VA_FN_CALL(cudaRuntimeFunctionPointer[fn##Enum].fn##Real, __VA_ARGS__); \
    SYNC_PROLOGUE prologueArgs;                                               \
    monitor_disable_new_threads();                                            \
    cudaError_t ret = VA_FN_CALL(cudaRuntimeFunctionPointer[fn##Enum].fn##Real, __VA_ARGS__); \
    monitor_enable_new_threads();                                             \
    SYNC_EPILOGUE epilogueArgs;                                               \
    return ret;                                                               \
  }

#define CUDA_RUNTIME_SYNC_ON_STREAM_WRAPPER(fn, prologueArgs, epilogueArgs, ...) \
  VA_FN_DECLARE(cudaError_t, fn, __VA_ARGS__) {                               \
    SYNC_PROLOGUE prologueArgs;                                               \
    monitor_disable_new_threads();                                            \
    cudaError_t ret = VA_FN_CALL(cudaRuntimeFunctionPointer[fn##Enum].fn##Real, __VA_ARGS__); \
    hpcrun_safe_enter();                                                      \
    uint32_t streamId;                                                        \
    streamId = splay_get_stream_id(stream);                                   \
    hpcrun_safe_exit();                                                       \
    monitor_enable_new_threads();                                             \
    SYNC_EPILOGUE epilogueArgs;                                               \
    return ret;                                                               \
  }

#define CUDA_RUNTIME_ASYNC_MEMCPY_WRAPPER(fn, prologueArgs, epilogueArgs, ...) \
  VA_FN_DECLARE(cudaError_t, fn, __VA_ARGS__) {                               \
    if (!hpcrun_is_safe_to_sync(__func__))                                    \
      return VA_FN_CALL(cudaRuntimeFunctionPointer[fn##Enum].fn##Real, __VA_ARGS__); \
    ASYNC_MEMCPY_PROLOGUE prologueArgs;                                       \
    cudaError_t ret = VA_FN_CALL(cudaRuntimeFunctionPointer[fn##Enum].fn##Real, __VA_ARGS__); \
    ASYNC_MEMCPY_EPILOGUE epilogueArgs;                                       \
    return ret;                                                               \
  }

#define CUDA_RUNTIME_SYNC_MEMCPY_WRAPPER(fn, prologueArgs, epilogueArgs, ...) \
  VA_FN_DECLARE(cudaError_t, fn, __VA_ARGS__) {                               \
    if (!hpcrun_is_safe_to_sync(__func__))                                    \
      return VA_FN_CALL(cudaRuntimeFunctionPointer[fn##Enum].fn##Real, __VA_ARGS__); \
    SYNC_MEMCPY_PROLOGUE prologueArgs;                                        \
    monitor_disable_new_threads();                                            \
    cudaError_t ret = VA_FN_CALL(cudaRuntimeFunctionPointer[fn##Enum].fn##Real, __VA_ARGS__); \
    monitor_enable_new_threads();                                             \
    SYNC_MEMCPY_EPILOGUE epilogueArgs;                                        \
    return ret;                                                               \
  }

#define PopulateGPUFunctionPointers(basename, library)                        \
  char *error;                                                                \
  void *dlsym_arg = RTLD_NEXT;                                                \
  void *try = dlsym(dlsym_arg, basename ## FunctionPointer[0].functionName);  \
  if ((error = dlerror()) || (!try)) {                                        \
    if (getenv("DEBUG_HPCRUN_GPU_CONS"))                                      \
      fprintf(stderr, "RTLD_NEXT argument fails for " #basename " (%s)\n",    \
              (!try) ? "trial function pointer = NULL" : "dlerror != NULL");  \
    dlsym_arg = monitor_real_dlopen(#library, RTLD_LAZY);                     \
    if (!dlsym_arg) {                                                         \
      fprintf(stderr, "fallback dlopen of " #library " failed,"               \
              " dlerror message = '%s'\n", dlerror());                        \
      monitor_real_abort();                                                   \
    }                                                                         \
    if (getenv("DEBUG_HPCRUN_GPU_CONS"))                                      \
      fprintf(stderr, "Going forward with " #basename " overrides using " #library "\n"); \
  }                                                                           \
  else if (getenv("DEBUG_HPCRUN_GPU_CONS"))                                   \
    fprintf(stderr, "Going forward with " #basename " overrides using RTLD_NEXT\n"); \
  for (int i = 0; i < sizeof(basename ## FunctionPointer)/sizeof(basename ## FunctionPointer[0]); i++) { \
    basename ## FunctionPointer[i].generic =                                  \
      dlsym(dlsym_arg, basename ## FunctionPointer[i].functionName);          \
    if (getenv("DEBUG_HPCRUN_GPU_CONS"))                                      \
      fprintf(stderr, #basename "Fnptr[%d] @ %p for %s = %p\n",               \
              i, & basename ## FunctionPointer[i].generic,                    \
              basename ## FunctionPointer[i].functionName,                    \
              basename ## FunctionPointer[i].generic);                        \
    if ((error = dlerror()) != NULL) {                                        \
      EEMSG("%s: during dlsym \n", error);                                    \
      monitor_real_abort();                                                   \
    }                                                                         \
  }

enum overloadPotentialState {
  // ... (enumerators elided; SYNC_STATE, WORKING_STATE and OVERLOADABLE_STATE are used below)
};
extern cudaRuntimeFunctionPointer_t cudaRuntimeFunctionPointer[];
extern cuDriverFunctionPointer_t cuDriverFunctionPointer[];

typedef struct event_list_node_t {
  cudaEvent_t event_start;
  cudaEvent_t event_end;
  uint64_t event_start_time;
  uint64_t event_end_time;
  // ... (stream id, launcher CCT and ref-count fields elided)
  struct event_list_node_t *next;
  struct event_list_node_t *next_free_node;
} event_list_node_t;

typedef struct stream_node_t {
  // ... (per-stream profiling state elided)
  struct event_list_node_t *latest_event_node;
  struct event_list_node_t *unfinished_event_node;
  struct stream_node_t *next_unfinished_stream;
  int32_t idle_node_id;
} stream_node_t;

typedef struct active_kernel_node_t {
  // ... (event time, event type, stream id and launcher CCT fields elided)
  struct active_kernel_node_t *start_node;
  struct active_kernel_node_t *next;
  struct active_kernel_node_t *next_free_node;
  struct active_kernel_node_t *next_active_kernel;
  struct active_kernel_node_t *prev;
} active_kernel_node_t;

typedef struct stream_to_id_map_t {
  // ... (stream handle and id fields elided)
  struct stream_to_id_map_t *left;
  struct stream_to_id_map_t *right;
} stream_to_id_map_t;

typedef struct IPC_data_t {
  uint64_t outstanding_kernels;
  uint64_t num_threads_at_sync_all_procs;
} IPC_data_t;
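// Minimal sketch (a hypothetical helper added for illustration, not in the
// original source): every timestamp in this file is "microseconds since the
// Unix epoch" derived from gettimeofday(); the conversion below is exactly
// what the prologue/epilogue macros and the stream-creation paths inline.
static inline uint64_t blame_shift_usec_now(void)
{
  struct timeval tv;
  gettimeofday(&tv, NULL);                                        // wall-clock time
  return (uint64_t) tv.tv_usec + ((uint64_t) tv.tv_sec) * 1000000; // in microseconds
}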
static uint32_t cleanup_finished_events();

static uint32_t g_stream_id = 32;
static uint32_t g_stream_to_id_index = 0;

static uint64_t g_num_threads_at_sync;

static event_list_node_t *g_free_event_nodes_head;
static active_kernel_node_t *g_free_active_kernel_nodes_head;

static struct stream_to_id_map_t *stream_to_id_tree_root;
static stream_to_id_map_t stream_to_id[MAX_STREAMS];
static stream_node_t g_stream_array[MAX_STREAMS];

static stream_node_t *g_unfinished_stream_list_head;

static event_list_node_t *g_finished_event_nodes_tail;

static event_list_node_t dummy_event_node = {
  // ... (other members zero-initialized)
  .stream_launcher_cct = 0
};

static bool g_do_shared_blaming;

static uint32_t g_cuda_launch_skip_inner;

static uint64_t g_start_of_world_time;

static cudaEvent_t g_start_of_world_event;

static bool g_stream0_initialized = false;

static IPC_data_t *ipc_data;
static void
PopulateEntryPointesToWrappedCudaRuntimeCalls()
{
  PopulateGPUFunctionPointers(cudaRuntime, libcudart.so)
}

static void
PopulateEntryPointesToWrappedCuDriverCalls(void)
{
  PopulateGPUFunctionPointers(cuDriver, libcuda.so)
}

static void
InitCpuGpuBlameShiftDataStructs(void)
{
  char *shared_blaming_env;
  char *cuda_launch_skip_inner_env;
  g_unfinished_stream_list_head = NULL;
  g_finished_event_nodes_tail = &dummy_event_node;
  dummy_event_node.next = g_finished_event_nodes_tail;
  shared_blaming_env = getenv("HPCRUN_ENABLE_SHARED_GPU_BLAMING");
  if (shared_blaming_env)
    g_do_shared_blaming = atoi(shared_blaming_env);
  cuda_launch_skip_inner_env = getenv("HPCRUN_CUDA_LAUNCH_SKIP_INNER");
  if (cuda_launch_skip_inner_env)
    g_cuda_launch_skip_inner = atoi(cuda_launch_skip_inner_env);
}

static void PopulateEntryPointesToWrappedCalls() {
  PopulateEntryPointesToWrappedCudaRuntimeCalls();
  PopulateEntryPointesToWrappedCuDriverCalls();
}

// (library constructor; header elided in this extract)
{
  if (getenv("DEBUG_HPCRUN_GPU_CONS"))
    fprintf(stderr, "CPU-GPU blame shift constructor called\n");

  PopulateEntryPointesToWrappedCalls();
  InitCpuGpuBlameShiftDataStructs();
}
static char shared_key[MAX_SHARED_KEY_LENGTH];

static void destroy_shared_memory(void *p) {
  shm_unlink((char *) shared_key);
}

static inline void create_shared_memory() {
  int device_id;
  int fd;
  monitor_disable_new_threads();
  CUDA_SAFE_CALL(cudaRuntimeFunctionPointer[cudaGetDeviceEnum].cudaGetDeviceReal(&device_id));
  monitor_enable_new_threads();
  sprintf(shared_key, "/gpublame%d", device_id);
  if ((fd = shm_open(shared_key, O_RDWR | O_CREAT, 0666)) < 0) {
    EEMSG("Failed to shm_open (%s) on device %d, retval = %d", shared_key, device_id, fd);
    // ... (error handling elided)
  }
  if (ftruncate(fd, sizeof(IPC_data_t)) < 0) {
    EEMSG("Failed to ftruncate() on device %d", device_id);
    // ... (error handling elided)
  }
  if ((ipc_data = mmap(NULL, sizeof(IPC_data_t), PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0)) == MAP_FAILED) {
    EEMSG("Failed to mmap() on device %d", device_id);
    // ... (error handling elided)
  }
}
static struct stream_to_id_map_t *
splay(struct stream_to_id_map_t *root, cudaStream_t key) {
  // ... (standard splay-tree rotation body elided)
  return root;
}

static uint32_t splay_get_stream_id(cudaStream_t key) {
  struct stream_to_id_map_t *root = stream_to_id_tree_root;
  root = splay(root, key);
  assert(root->stream == key);
  stream_to_id_tree_root = root;
  uint32_t ret = stream_to_id_tree_root->id;
  return ret;
}

static stream_to_id_map_t *
splay_insert(cudaStream_t stream_ip)
{
  struct stream_to_id_map_t *node = &stream_to_id[g_stream_to_id_index++];
  node->stream = stream_ip;
  node->left = node->right = NULL;
  node->id = g_stream_id++;
  cudaStream_t stream = node->stream;

  if (stream_to_id_tree_root != NULL) {
    stream_to_id_tree_root = splay(stream_to_id_tree_root, stream);

    if (stream < stream_to_id_tree_root->stream) {
      node->left = stream_to_id_tree_root->left;
      node->right = stream_to_id_tree_root;
      stream_to_id_tree_root->left = NULL;
    } else if (stream > stream_to_id_tree_root->stream) {
      node->left = stream_to_id_tree_root;
      node->right = stream_to_id_tree_root->right;
      stream_to_id_tree_root->right = NULL;
    } else {
      EEMSG("stream_to_id_tree_root splay tree: unable to insert %p (already present)", node->stream);
      monitor_real_abort();
    }
  }
  stream_to_id_tree_root = node;
  return stream_to_id_tree_root;
}
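// Illustrative note (added, not part of the original source): ids handed out
// by splay_insert() start at g_stream_id = 32, and callers reach a stream's
// bookkeeping slot the same way the wrapper macros above do:
//
//   uint32_t id = splay_get_stream_id(stream);
//   stream_node_t *slot = &g_stream_array[id];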
inline void hpcrun_stream_finalize(void *st) {
  // ... (stream trace/profile finalization elided)
}

static struct stream_to_id_map_t *
splay_delete(cudaStream_t stream)
{
  struct stream_to_id_map_t *result = NULL;

  TMSG(CUDA, "Trying to delete %p from stream splay tree", stream);
  if (stream_to_id_tree_root == NULL) {
    TMSG(CUDA, "stream_to_id_map_t splay tree empty: unable to delete %p", stream);
    // ... (error handling elided)
  }

  stream_to_id_tree_root = splay(stream_to_id_tree_root, stream);

  if (stream != stream_to_id_tree_root->stream) {
    TMSG(CUDA, "trying to delete stream %p, but it is not in the splay tree (root = %p)", stream, stream_to_id_tree_root->stream);
    // ... (error handling elided)
  }

  result = stream_to_id_tree_root;

  if (stream_to_id_tree_root->left == NULL) {
    stream_to_id_tree_root = stream_to_id_tree_root->right;
  } else {
    stream_to_id_tree_root->left = splay(stream_to_id_tree_root->left, stream);
    stream_to_id_tree_root->left->right = stream_to_id_tree_root->right;
    stream_to_id_tree_root = stream_to_id_tree_root->left;
  }
  return result;
}
static inline event_list_node_t *enter_cuda_sync(uint64_t *syncStart) {
  cleanup_finished_events();

  struct timeval tv;
  gettimeofday(&tv, NULL);
  *syncStart = ((uint64_t) tv.tv_usec + (((uint64_t) tv.tv_sec) * 1000000));

  event_list_node_t *recorded_node = g_finished_event_nodes_tail;
  if (g_finished_event_nodes_tail != &dummy_event_node)
    g_finished_event_nodes_tail->ref_count++;

  atomic_add_i64(&g_num_threads_at_sync, 1L);
  return recorded_node;
}
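// Descriptive note (added): enter_cuda_sync() pins the current tail of the
// finished-event list and bumps its ref_count; while g_num_threads_at_sync is
// non-zero, cleanup_finished_events() keeps newly finished events on that list
// instead of recycling them, so each thread waiting in a CUDA sync can later
// walk exactly the events that completed during its wait (see
// attribute_shared_blame_on_kernels() below) and apportion idle blame to them.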
static uint64_t attribute_shared_blame_on_kernels(event_list_node_t *recorded_node, uint64_t recorded_time,
                                                  const uint32_t stream_mask, double scaling_factor) {
  if (recorded_node != &dummy_event_node)
    recorded_node->ref_count--;

  uint32_t num_active_kernels = 0;
  active_kernel_node_t *sorted_active_kernels_begin = NULL;

  event_list_node_t *cur = recorded_node->next, *prev = recorded_node;
  while (cur != &dummy_event_node) {
    // ... (per-node ref-count bookkeeping elided)
    if (cur->ref_count == 0) {
      prev->next = cur->next;
      event_list_node_t *to_free = cur;
      cur = cur->next;
      ADD_TO_FREE_EVENTS_LIST(to_free);
      continue;
    }

    if ((cur->event_end_time <= recorded_time) || (cur->stream_id != (cur->stream_id & stream_mask))) {
      if (cur->ref_count == 0) {
        prev->next = cur->next;
        event_list_node_t *to_free = cur;
        cur = cur->next;
        ADD_TO_FREE_EVENTS_LIST(to_free);
      } else {
        prev = cur;
        cur = cur->next;
      }
      continue;
    }

    active_kernel_node_t *start_active_kernel_node;
    active_kernel_node_t *end_active_kernel_node;
    GET_NEW_ACTIVE_KERNEL_NODE(start_active_kernel_node);
    GET_NEW_ACTIVE_KERNEL_NODE(end_active_kernel_node);

    if (cur->event_start_time < recorded_time) {
      start_active_kernel_node->event_time = recorded_time;
    } else {
      start_active_kernel_node->event_time = cur->event_start_time;
    }

    start_active_kernel_node->event_type = KERNEL_START;
    start_active_kernel_node->stream_id = cur->stream_id;
    start_active_kernel_node->launcher_cct = cur->launcher_cct;
    start_active_kernel_node->next_active_kernel = NULL;

    end_active_kernel_node->event_type = KERNEL_END;
    end_active_kernel_node->start_node = start_active_kernel_node;
    end_active_kernel_node->event_time = cur->event_end_time;

    // Drop zero-length intervals.
    if (start_active_kernel_node->event_time == end_active_kernel_node->event_time) {
      ADD_TO_FREE_ACTIVE_KERNEL_NODE_LIST(start_active_kernel_node);
      ADD_TO_FREE_ACTIVE_KERNEL_NODE_LIST(end_active_kernel_node);
      if (cur->ref_count == 0) {
        prev->next = cur->next;
        event_list_node_t *to_free = cur;
        cur = cur->next;
        ADD_TO_FREE_EVENTS_LIST(to_free);
      } else {
        prev = cur;
        cur = cur->next;
      }
      continue;
    }
    assert(start_active_kernel_node->event_time < end_active_kernel_node->event_time);

    if (sorted_active_kernels_begin == NULL) {
      start_active_kernel_node->next = end_active_kernel_node;
      start_active_kernel_node->prev = end_active_kernel_node;
      end_active_kernel_node->prev = start_active_kernel_node;
      end_active_kernel_node->next = start_active_kernel_node;
      sorted_active_kernels_begin = start_active_kernel_node;
    } else {
      active_kernel_node_t *current = sorted_active_kernels_begin->prev;
      bool change_head = 1;
      do {
        if (end_active_kernel_node->event_time > current->event_time) {
          change_head = 0;
          break;
        }
        current = current->prev;
      } while (current != sorted_active_kernels_begin->prev);
      end_active_kernel_node->next = current->next;
      end_active_kernel_node->prev = current;
      current->next->prev = end_active_kernel_node;
      current->next = end_active_kernel_node;
      if (change_head) {
        sorted_active_kernels_begin = end_active_kernel_node;
      }

      current = end_active_kernel_node->prev;
      change_head = 1;
      do {
        if (start_active_kernel_node->event_time > current->event_time) {
          change_head = 0;
          break;
        }
        current = current->prev;
      } while (current != sorted_active_kernels_begin->prev);
      start_active_kernel_node->next = current->next;
      start_active_kernel_node->prev = current;
      current->next->prev = start_active_kernel_node;
      current->next = start_active_kernel_node;
      if (change_head) {
        sorted_active_kernels_begin = start_active_kernel_node;
      }
    }

    if (cur->ref_count == 0) {
      prev->next = cur->next;
      event_list_node_t *to_free = cur;
      cur = cur->next;
      ADD_TO_FREE_EVENTS_LIST(to_free);
    } else {
      prev = cur;
      cur = cur->next;
    }
  }
  g_finished_event_nodes_tail = prev;

  uint64_t last_kernel_end_time = 0;
  if (sorted_active_kernels_begin) {
    // Attach a dummy tail node to the sorted circular list.
    active_kernel_node_t *dummy_kernel_node;
    GET_NEW_ACTIVE_KERNEL_NODE(dummy_kernel_node);
    sorted_active_kernels_begin->prev->next = dummy_kernel_node;
    dummy_kernel_node->prev = sorted_active_kernels_begin->prev;
    sorted_active_kernels_begin->prev = dummy_kernel_node;
    dummy_kernel_node->next = sorted_active_kernels_begin;

    active_kernel_node_t *current = sorted_active_kernels_begin;
    uint64_t last_time = recorded_time;
    do {
      uint64_t new_time = current->event_time;

      assert(new_time >= last_time);
      assert(current != dummy_kernel_node && "should never process dummy_kernel_node");

      if (num_active_kernels && (new_time > last_time)) {
        active_kernel_node_t *blame_node = current->prev;
        do {
          assert(blame_node->event_type == KERNEL_START);
          cct_metric_data_increment(cpu_idle_cause_metric_id, blame_node->launcher_cct, (cct_metric_data_t) {
                                    .r = (new_time - last_time) * (scaling_factor) / num_active_kernels}
                                    );
          blame_node = blame_node->prev;
        } while (blame_node != sorted_active_kernels_begin->prev);
      }

      last_time = new_time;

      if (current->event_type == KERNEL_START) {
        num_active_kernels++;
        current = current->next;
      } else {
        last_kernel_end_time = new_time;
        current->start_node->prev->next = current->start_node->next;
        current->start_node->next->prev = current->start_node->prev;
        if (current->start_node == sorted_active_kernels_begin)
          sorted_active_kernels_begin = current->start_node->next;
        ADD_TO_FREE_ACTIVE_KERNEL_NODE_LIST((current->start_node));

#if 0   // Not a plausible case
        if (current->next == current) {
          ADD_TO_FREE_ACTIVE_KERNEL_NODE_LIST(current);
          break;
        }
#endif
        current->prev->next = current->next;
        current->next->prev = current->prev;
        if (current == sorted_active_kernels_begin)
          sorted_active_kernels_begin = current->next;
        num_active_kernels--;
        active_kernel_node_t *to_free = current;
        current = current->next;
        ADD_TO_FREE_ACTIVE_KERNEL_NODE_LIST(to_free);
      }
    } while (current != sorted_active_kernels_begin->prev);

    ADD_TO_FREE_ACTIVE_KERNEL_NODE_LIST(dummy_kernel_node);
  }

  return last_kernel_end_time;
}
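// Worked example (added for illustration): if a thread was blocked in a sync
// during an interval in which two kernels were simultaneously active for
// 100 us (and scaling_factor == 1.0), the loop above charges
//   (new_time - last_time) * scaling_factor / num_active_kernels
//     = 100 us * 1.0 / 2 = 50 us
// of idle-cause blame to each kernel's launch site.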
static inline uint64_t leave_cuda_sync(event_list_node_t *recorded_node, uint64_t syncStart,
                                       const uint32_t stream_mask) {
  cleanup_finished_events();

  double scaling_factor = 1.0;
  if (SHARED_BLAMING_INITIALISED && TD_GET(gpu_data.accum_num_samples))
    scaling_factor *= ((double) TD_GET(gpu_data.accum_num_sync_threads)) / TD_GET(gpu_data.accum_num_samples);
  uint64_t last_kernel_end_time = attribute_shared_blame_on_kernels(recorded_node, syncStart, stream_mask, scaling_factor);
  atomic_add_i64(&g_num_threads_at_sync, -1L);
  return last_kernel_end_time;
}
static uint32_t cleanup_finished_events() {
  uint32_t num_unfinished_streams = 0;
  stream_node_t *prev_stream = NULL;
  stream_node_t *next_stream = NULL;
  stream_node_t *cur_stream = g_unfinished_stream_list_head;

  while (cur_stream != NULL) {
    assert(cur_stream->unfinished_event_node && " Can't point unfinished stream to null");
    next_stream = cur_stream->next_unfinished_stream;

    event_list_node_t *current_event = cur_stream->unfinished_event_node;
    while (current_event) {
      cudaError_t err_cuda = cudaRuntimeFunctionPointer[cudaEventQueryEnum].cudaEventQueryReal(current_event->event_end);

      if (err_cuda == cudaSuccess) {
        DECR_SHARED_BLAMING_DS(outstanding_kernels);

        float elapsedTime;
        TMSG(CUDA, "BEFORE: EventElapsedRT(%p, %p)\n", g_start_of_world_event, current_event->event_start);
        cudaError_t err1 = Cuda_RTcall(cudaEventElapsedTime)(&elapsedTime,
                                                             g_start_of_world_event,
                                                             current_event->event_start);
        if (err1 != cudaSuccess) {
          EMSG("cudaEventElapsedTime failed");
          // ... (error handling elided)
        }
        assert(elapsedTime > 0);

        uint64_t micro_time_start = (uint64_t) (((double) elapsedTime) * 1000) + g_start_of_world_time;

        CUDA_SAFE_CALL(cudaRuntimeFunctionPointer[cudaEventElapsedTimeEnum].cudaEventElapsedTimeReal(&elapsedTime, g_start_of_world_event, current_event->event_end));
        assert(elapsedTime > 0);
        uint64_t micro_time_end = (uint64_t) (((double) elapsedTime) * 1000) + g_start_of_world_time;

        assert(micro_time_start <= micro_time_end);

        cct_node_t *stream_cct = current_event->stream_launcher_cct;
        hpcrun_cct_persistent_id_trace_mutate(stream_cct);
        // ... (stream trace records and the kernel-time metric increment elided;
        //      the increment's value is .i = (micro_time_end - micro_time_start))

        event_list_node_t *deferred_node = current_event;
        current_event = current_event->next;

        if (g_num_threads_at_sync) {
          // Defer recycling while threads are waiting in a CUDA sync.
          deferred_node->ref_count = g_num_threads_at_sync;
          deferred_node->event_start_time = micro_time_start;
          deferred_node->event_end_time = micro_time_end;
          deferred_node->next = g_finished_event_nodes_tail->next;
          g_finished_event_nodes_tail->next = deferred_node;
          g_finished_event_nodes_tail = deferred_node;
        } else {
          ADD_TO_FREE_EVENTS_LIST(deferred_node);
        }
      } else {
        break;
      }
    }

    cur_stream->unfinished_event_node = current_event;
    if (current_event == NULL) {
      cur_stream->latest_event_node = NULL;
      if (prev_stream == NULL) {
        g_unfinished_stream_list_head = next_stream;
      } else {
        prev_stream->next_unfinished_stream = next_stream;
      }
    } else {
      num_unfinished_streams++;
      prev_stream = cur_stream;
    }
    cur_stream = next_stream;
  }
  return num_unfinished_streams;
}
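// Minimal sketch (hypothetical helper added for illustration, not in the
// original source): the timestamp reconstruction used above, mapping a
// cudaEventElapsedTime() result (float milliseconds relative to
// g_start_of_world_event) onto the microsecond timeline anchored at
// g_start_of_world_time.
static inline uint64_t blame_shift_event_time_us(float elapsed_ms)
{
  return (uint64_t) (((double) elapsed_ms) * 1000) + g_start_of_world_time;
}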
static event_list_node_t *create_and_insert_event(int stream_id,
                                                  cct_node_t *launcher_cct,
                                                  cct_node_t *stream_launcher_cct) {
  event_list_node_t *event_node;
  if (g_free_event_nodes_head) {
    // Reuse a node from the free list; destroy any stale CUDA events it holds.
    event_node = g_free_event_nodes_head;
    g_free_event_nodes_head = g_free_event_nodes_head->next_free_node;
    if (event_node->event_start)
      CUDA_SAFE_CALL(cudaRuntimeFunctionPointer[cudaEventDestroyEnum].cudaEventDestroyReal((event_node->event_start)));
    if (event_node->event_end)
      CUDA_SAFE_CALL(cudaRuntimeFunctionPointer[cudaEventDestroyEnum].cudaEventDestroyReal((event_node->event_end)));
  } else {
    event_node = (event_list_node_t *) hpcrun_malloc(sizeof(event_list_node_t));
  }

  CUDA_SAFE_CALL(cudaRuntimeFunctionPointer[cudaEventCreateEnum].cudaEventCreateReal(&(event_node->event_start)));
  CUDA_SAFE_CALL(cudaRuntimeFunctionPointer[cudaEventCreateEnum].cudaEventCreateReal(&(event_node->event_end)));

  event_node->stream_launcher_cct = stream_launcher_cct;
  event_node->launcher_cct = launcher_cct;
  event_node->next = NULL;
  event_node->stream_id = stream_id;
  if (g_stream_array[stream_id].latest_event_node == NULL) {
    g_stream_array[stream_id].latest_event_node = event_node;
    g_stream_array[stream_id].unfinished_event_node = event_node;
    g_stream_array[stream_id].next_unfinished_stream = g_unfinished_stream_list_head;
    g_unfinished_stream_list_head = &(g_stream_array[stream_id]);
  } else {
    g_stream_array[stream_id].latest_event_node->next = event_node;
    g_stream_array[stream_id].latest_event_node = event_node;
  }
  return event_node;
}
static void close_all_streams(stream_to_id_map_t *root) {
  if (!root)
    return;

  close_all_streams(root->left);
  close_all_streams(root->right);
  uint32_t streamId = root->id;
  hpcrun_stream_finalize(g_stream_array[streamId].st);
  // ... (per-stream trace shutdown elided)
  g_stream_array[streamId].st = NULL;
}
static void create_stream0_if_needed(cudaStream_t stream) {
  HPCRUN_ASYNC_BLOCK_SPIN_LOCK;
  if ((((uint64_t) stream) == 0) && (g_stream0_initialized == false)) {
    uint32_t new_streamId;
    new_streamId = splay_insert(stream)->id;

    if (g_start_of_world_time == 0) {
      monitor_disable_new_threads();

      CUDA_SAFE_CALL(cudaRuntimeFunctionPointer[cudaEventCreateEnum].cudaEventCreateReal(&g_start_of_world_event));

      struct timeval tv;
      gettimeofday(&tv, NULL);
      g_start_of_world_time = ((uint64_t) tv.tv_usec + (((uint64_t) tv.tv_sec) * 1000000));

      CUDA_SAFE_CALL(cudaRuntimeFunctionPointer[cudaEventRecordEnum].cudaEventRecordReal(g_start_of_world_event, 0));

      monitor_enable_new_threads();

      if (g_do_shared_blaming && ipc_data == NULL)
        create_shared_memory();
    }

    struct timeval tv;
    gettimeofday(&tv, NULL);
    g_stream_array[new_streamId].st = hpcrun_stream_data_alloc_init(new_streamId);
    // ... (stream trace setup elided)
    cct_bundle_t *bundle = &(g_stream_array[new_streamId].st->epoch->csdata);
    cct_node_t *idl = hpcrun_cct_bundle_get_idle_node(bundle);
    hpcrun_cct_persistent_id_trace_mutate(idl);
    // ... (initial trace records elided)

    g_stream0_initialized = true;
  }
  HPCRUN_ASYNC_UNBLOCK_SPIN_UNLOCK;
}
CUDA_RUNTIME_SYNC_WRAPPER(cudaThreadSynchronize, (context, launcher_cct,
                          syncStart, recorded_node), (context, launcher_cct, syncStart,
                          recorded_node, ALL_STREAMS_MASK, syncEnd), void)

CUDA_RUNTIME_SYNC_ON_STREAM_WRAPPER(cudaStreamSynchronize, (context,
                          launcher_cct, syncStart, recorded_node), (context, launcher_cct,
                          syncStart, recorded_node, streamId, syncEnd), cudaStream_t, stream)

CUDA_RUNTIME_SYNC_WRAPPER(cudaEventSynchronize, (context, launcher_cct,
                          syncStart, recorded_node), (context, launcher_cct, syncStart,
                          recorded_node, ALL_STREAMS_MASK, syncEnd), cudaEvent_t, event)

CUDA_RUNTIME_SYNC_ON_STREAM_WRAPPER(cudaStreamWaitEvent, (context,
                          launcher_cct, syncStart, recorded_node), (context, launcher_cct,
                          syncStart, recorded_node, streamId, syncEnd), cudaStream_t, stream,
                          cudaEvent_t, event, unsigned int, flags)

CUDA_RUNTIME_SYNC_WRAPPER(cudaDeviceSynchronize, (context, launcher_cct,
                          syncStart, recorded_node), (context, launcher_cct, syncStart,
                          recorded_node, ALL_STREAMS_MASK, syncEnd), void)

CUDA_RUNTIME_SYNC_WRAPPER(cudaMallocArray, (context, launcher_cct,
                          syncStart, recorded_node), (context, launcher_cct, syncStart,
                          recorded_node, ALL_STREAMS_MASK, syncEnd), struct cudaArray **, array,
                          const struct cudaChannelFormatDesc *, desc, size_t, width, size_t,
                          height, unsigned int, flags)

CUDA_RUNTIME_SYNC_WRAPPER(cudaFree, (context, launcher_cct, syncStart,
                          recorded_node), (context, launcher_cct, syncStart, recorded_node,
                          ALL_STREAMS_MASK, syncEnd), void *, devPtr)

CUDA_RUNTIME_SYNC_WRAPPER(cudaFreeArray, (context, launcher_cct,
                          syncStart, recorded_node), (context, launcher_cct, syncStart,
                          recorded_node, ALL_STREAMS_MASK, syncEnd), struct cudaArray *, array)
cudaError_t cudaConfigureCall(dim3 grid, dim3 block, size_t mem, cudaStream_t stream) {
  if (!hpcrun_is_safe_to_sync(__func__))
    return cudaRuntimeFunctionPointer[cudaConfigureCallEnum].cudaConfigureCallReal(grid, block, mem, stream);
  TD_GET(gpu_data.is_thread_at_cuda_sync) = true;
  monitor_disable_new_threads();
  cudaError_t ret = cudaRuntimeFunctionPointer[cudaConfigureCallEnum].cudaConfigureCallReal(grid, block, mem, stream);
  monitor_enable_new_threads();
  TD_GET(gpu_data.is_thread_at_cuda_sync) = false;
  TD_GET(gpu_data.active_stream) = (uint64_t) stream;
  return ret;
}

#if (CUDART_VERSION < 5000)
cudaError_t cudaLaunch(const char *entry) {
#else
cudaError_t cudaLaunch(const void *entry) {
#endif
  if (!hpcrun_is_safe_to_sync(__func__))
    return cudaRuntimeFunctionPointer[cudaLaunchEnum].cudaLaunchReal(entry);
  TMSG(CPU_GPU, "Cuda launch (get spinlock)");
  ASYNC_KERNEL_PROLOGUE(streamId, event_node, context, cct_node, ((cudaStream_t) (TD_GET(gpu_data.active_stream))), g_cuda_launch_skip_inner);

  cudaError_t ret = cudaRuntimeFunctionPointer[cudaLaunchEnum].cudaLaunchReal(entry);

  TMSG(CPU_GPU, "Cuda launch about to release spin lock");
  ASYNC_KERNEL_EPILOGUE(event_node, ((cudaStream_t) (TD_GET(gpu_data.active_stream))));
  TMSG(CPU_GPU, "Cuda launch done !(spin lock released)");
  return ret;
}
cudaError_t cudaStreamDestroy(cudaStream_t stream) {
  SYNCHRONOUS_CLEANUP;

  uint32_t streamId;
  streamId = splay_get_stream_id(stream);

  hpcrun_stream_finalize(g_stream_array[streamId].st);
  // ... (per-stream trace shutdown elided)
  g_stream_array[streamId].st = NULL;

  monitor_disable_new_threads();
  cudaError_t ret = cudaRuntimeFunctionPointer[cudaStreamDestroyEnum].cudaStreamDestroyReal(stream);
  monitor_enable_new_threads();
  // ... (splay-tree removal elided)
  return ret;
}
static void StreamCreateBookKeeper(cudaStream_t *stream) {
  uint32_t new_streamId = splay_insert(*stream)->id;

  HPCRUN_ASYNC_BLOCK_SPIN_LOCK;
  if (g_start_of_world_time == 0) {
    TD_GET(gpu_data.is_thread_at_cuda_sync) = true;

    monitor_disable_new_threads();

    CUDA_SAFE_CALL(cudaRuntimeFunctionPointer[cudaEventCreateEnum].cudaEventCreateReal(&g_start_of_world_event));

    struct timeval tv;
    gettimeofday(&tv, NULL);
    g_start_of_world_time = ((uint64_t) tv.tv_usec + (((uint64_t) tv.tv_sec) * 1000000));

    CUDA_SAFE_CALL(cudaRuntimeFunctionPointer[cudaEventRecordEnum].cudaEventRecordReal(g_start_of_world_event, 0));

    monitor_enable_new_threads();

    if (g_do_shared_blaming && ipc_data == NULL)
      create_shared_memory();

    TD_GET(gpu_data.is_thread_at_cuda_sync) = false;
  }

  g_stream_array[new_streamId].st = hpcrun_stream_data_alloc_init(new_streamId);
  // ... (stream trace setup elided)
  cct_bundle_t *bundle = &(g_stream_array[new_streamId].st->epoch->csdata);
  cct_node_t *idl = hpcrun_cct_bundle_get_idle_node(bundle);
  hpcrun_cct_persistent_id_trace_mutate(idl);
  // ... (initial trace records elided)

  HPCRUN_ASYNC_UNBLOCK_SPIN_UNLOCK;
}
cudaError_t cudaStreamCreate(cudaStream_t *stream) {
  TD_GET(gpu_data.is_thread_at_cuda_sync) = true;
  monitor_disable_new_threads();
  cudaError_t ret = cudaRuntimeFunctionPointer[cudaStreamCreateEnum].cudaStreamCreateReal(stream);
  monitor_enable_new_threads();
  TD_GET(gpu_data.is_thread_at_cuda_sync) = false;

  StreamCreateBookKeeper(stream);
  return ret;
}
inline static void increment_mem_xfer_metric(size_t count, enum cudaMemcpyKind kind, cct_node_t *node) {
  switch (kind) {
    case cudaMemcpyHostToHost:
      cct_metric_data_increment(h_to_h_data_xfer_metric_id, node, (cct_metric_data_t) {.i = (count)});
      break;
    case cudaMemcpyHostToDevice:
      cct_metric_data_increment(h_to_d_data_xfer_metric_id, node, (cct_metric_data_t) {.i = (count)});
      break;
    case cudaMemcpyDeviceToHost:
      cct_metric_data_increment(d_to_h_data_xfer_metric_id, node, (cct_metric_data_t) {.i = (count)});
      break;
    case cudaMemcpyDeviceToDevice:
      cct_metric_data_increment(d_to_d_data_xfer_metric_id, node, (cct_metric_data_t) {.i = (count)});
      break;
    case cudaMemcpyDefault:
      cct_metric_data_increment(uva_data_xfer_metric_id, node, (cct_metric_data_t) {.i = (count)});
      break;
    default:
      break;
  }
}
CUDA_RUNTIME_SYNC_WRAPPER(cudaMalloc, (context, launcher_cct, syncStart,
                          recorded_node), (context, launcher_cct, syncStart, recorded_node,
                          ALL_STREAMS_MASK, syncEnd), void **, devPtr, size_t, size)

CUDA_RUNTIME_SYNC_WRAPPER(cudaMalloc3D, (context, launcher_cct, syncStart,
                          recorded_node), (context, launcher_cct, syncStart, recorded_node,
                          ALL_STREAMS_MASK, syncEnd), struct cudaPitchedPtr*, pitchedDevPtr,
                          struct cudaExtent, extent)

CUDA_RUNTIME_SYNC_WRAPPER(cudaMalloc3DArray, (context, launcher_cct,
                          syncStart, recorded_node), (context, launcher_cct, syncStart,
                          recorded_node, ALL_STREAMS_MASK, syncEnd), struct cudaArray**, array,
                          const struct cudaChannelFormatDesc*, desc, struct cudaExtent, extent,
                          unsigned int, flags)

CUDA_RUNTIME_SYNC_MEMCPY_WRAPPER(cudaMemcpy3D, (context, launcher_cct,
                          syncStart, recorded_node), (context, launcher_cct, syncStart,
                          recorded_node, ALL_STREAMS_MASK, syncEnd, (p->extent.width *
                          p->extent.height * p->extent.depth), (p->kind)),
                          const struct cudaMemcpy3DParms *, p)

CUDA_RUNTIME_SYNC_MEMCPY_WRAPPER(cudaMemcpy3DPeer, (context,
                          launcher_cct, syncStart, recorded_node), (context, launcher_cct,
                          syncStart, recorded_node, ALL_STREAMS_MASK, syncEnd, (p->extent.width *
                          p->extent.height * p->extent.depth), cudaMemcpyDeviceToDevice),
                          const struct cudaMemcpy3DPeerParms *, p)

CUDA_RUNTIME_ASYNC_MEMCPY_WRAPPER(cudaMemcpy3DAsync, (streamId,
                          event_node, context, cct_node, stream, 0), (event_node, cct_node, stream,
                          (p->extent.width * p->extent.height * p->extent.depth), (p->kind)),
                          const struct cudaMemcpy3DParms *, p, cudaStream_t, stream)

CUDA_RUNTIME_ASYNC_MEMCPY_WRAPPER(cudaMemcpy3DPeerAsync, (streamId,
                          event_node, context, cct_node, stream, 0), (event_node, cct_node,
                          stream, (p->extent.width * p->extent.height * p->extent.depth),
                          cudaMemcpyDeviceToDevice), const struct cudaMemcpy3DPeerParms *, p,
                          cudaStream_t, stream)

CUDA_RUNTIME_SYNC_MEMCPY_WRAPPER(cudaMemcpyPeer, (context,
                          launcher_cct, syncStart, recorded_node), (context, launcher_cct,
                          syncStart, recorded_node, ALL_STREAMS_MASK, syncEnd, count,
                          cudaMemcpyDeviceToDevice), void *, dst, int, dstDevice, const void *,
                          src, int, srcDevice, size_t, count)

CUDA_RUNTIME_SYNC_MEMCPY_WRAPPER(cudaMemcpyFromArray, (context,
                          launcher_cct, syncStart, recorded_node), (context, launcher_cct,
                          syncStart, recorded_node, ALL_STREAMS_MASK, syncEnd, count, kind),
                          void *, dst, const struct cudaArray *, src, size_t, wOffset, size_t, hOffset,
                          size_t, count, enum cudaMemcpyKind, kind)

CUDA_RUNTIME_SYNC_MEMCPY_WRAPPER(cudaMemcpyArrayToArray, (context,
                          launcher_cct, syncStart, recorded_node), (context, launcher_cct,
                          syncStart, recorded_node, ALL_STREAMS_MASK, syncEnd, count, kind),
                          struct cudaArray *, dst, size_t, wOffsetDst, size_t, hOffsetDst,
                          const struct cudaArray *, src, size_t, wOffsetSrc, size_t, hOffsetSrc, size_t,
                          count, enum cudaMemcpyKind, kind)

CUDA_RUNTIME_SYNC_MEMCPY_WRAPPER(cudaMemcpy2DToArray, (context,
                          launcher_cct, syncStart, recorded_node), (context, launcher_cct,
                          syncStart, recorded_node, ALL_STREAMS_MASK, syncEnd, (width * height),
                          kind), struct cudaArray *, dst, size_t, wOffset, size_t, hOffset,
                          const void *, src, size_t, spitch, size_t, width, size_t, height,
                          enum cudaMemcpyKind, kind)

CUDA_RUNTIME_SYNC_MEMCPY_WRAPPER(cudaMemcpy2DFromArray, (context,
                          launcher_cct, syncStart, recorded_node), (context, launcher_cct,
                          syncStart, recorded_node, ALL_STREAMS_MASK, syncEnd, (width * height),
                          kind), void *, dst, size_t, dpitch, const struct cudaArray *, src,
                          size_t, wOffset, size_t, hOffset, size_t, width, size_t, height,
                          enum cudaMemcpyKind, kind)

CUDA_RUNTIME_SYNC_MEMCPY_WRAPPER(cudaMemcpy2DArrayToArray, (context,
                          launcher_cct, syncStart, recorded_node), (context, launcher_cct,
                          syncStart, recorded_node, ALL_STREAMS_MASK, syncEnd, (width * height),
                          kind), struct cudaArray *, dst, size_t, wOffsetDst, size_t, hOffsetDst,
                          const struct cudaArray *, src, size_t, wOffsetSrc, size_t, hOffsetSrc,
                          size_t, width, size_t, height, enum cudaMemcpyKind, kind)

#if (CUDART_VERSION < 5000)

CUDA_RUNTIME_SYNC_MEMCPY_WRAPPER(cudaMemcpyToSymbol, (context,
                          launcher_cct, syncStart, recorded_node), (context, launcher_cct,
                          syncStart, recorded_node, ALL_STREAMS_MASK, syncEnd, count,
                          kind), const char *, symbol, const void *, src, size_t,
                          count, size_t, offset, enum cudaMemcpyKind, kind)

#else

CUDA_RUNTIME_SYNC_MEMCPY_WRAPPER(cudaMemcpyToSymbol, (context,
                          launcher_cct, syncStart, recorded_node), (context, launcher_cct,
                          syncStart, recorded_node, ALL_STREAMS_MASK, syncEnd, count, kind),
                          const void *, symbol, const void *, src, size_t, count, size_t, offset,
                          enum cudaMemcpyKind, kind)

#endif

#if (CUDART_VERSION < 5000)

CUDA_RUNTIME_SYNC_MEMCPY_WRAPPER(cudaMemcpyFromSymbol,
                          (context, launcher_cct, syncStart, recorded_node), (context,
                          launcher_cct, syncStart, recorded_node, ALL_STREAMS_MASK,
                          syncEnd, count, kind), void *, dst, const char *, symbol,
                          size_t, count, size_t, offset, enum cudaMemcpyKind, kind)

#else

CUDA_RUNTIME_SYNC_MEMCPY_WRAPPER(cudaMemcpyFromSymbol, (context,
                          launcher_cct, syncStart, recorded_node), (context, launcher_cct,
                          syncStart, recorded_node, ALL_STREAMS_MASK, syncEnd, count, kind),
                          void *, dst, const void *, symbol, size_t, count, size_t, offset,
                          enum cudaMemcpyKind, kind)

#endif

CUDA_RUNTIME_ASYNC_MEMCPY_WRAPPER(cudaMemcpyPeerAsync, (streamId,
                          event_node, context, cct_node, stream, 0), (event_node, cct_node,
                          stream, count, cudaMemcpyDeviceToDevice), void *, dst, int, dstDevice,
                          const void *, src, int, srcDevice, size_t, count, cudaStream_t, stream)

CUDA_RUNTIME_ASYNC_MEMCPY_WRAPPER(cudaMemcpyFromArrayAsync, (streamId,
                          event_node, context, cct_node, stream, 0), (event_node, cct_node,
                          stream, count, kind), void *, dst, const struct cudaArray *, src,
                          size_t, wOffset, size_t, hOffset, size_t, count, enum cudaMemcpyKind,
                          kind, cudaStream_t, stream)

CUDA_RUNTIME_ASYNC_MEMCPY_WRAPPER(cudaMemcpy2DAsync, (streamId,
                          event_node, context, cct_node, stream, 0), (event_node, cct_node, stream,
                          (width * height), kind), void *, dst, size_t, dpitch, const void *,
                          src, size_t, spitch, size_t, width, size_t, height, enum cudaMemcpyKind,
                          kind, cudaStream_t, stream)

CUDA_RUNTIME_ASYNC_MEMCPY_WRAPPER(cudaMemcpy2DToArrayAsync, (streamId,
                          event_node, context, cct_node, stream, 0), (event_node, cct_node, stream,
                          (width * height), kind), struct cudaArray *, dst, size_t, wOffset,
                          size_t, hOffset, const void *, src, size_t, spitch, size_t, width,
                          size_t, height, enum cudaMemcpyKind, kind, cudaStream_t, stream)

CUDA_RUNTIME_ASYNC_MEMCPY_WRAPPER(cudaMemcpy2DFromArrayAsync, (streamId,
                          event_node, context, cct_node, stream, 0), (event_node, cct_node, stream,
                          (width * height), kind), void *, dst, size_t, dpitch,
                          const struct cudaArray *, src, size_t, wOffset, size_t, hOffset,
                          size_t, width, size_t, height, enum cudaMemcpyKind, kind,
                          cudaStream_t, stream)

#if (CUDART_VERSION < 5000)

CUDA_RUNTIME_ASYNC_MEMCPY_WRAPPER(cudaMemcpyToSymbolAsync, (streamId,
                          event_node, context, cct_node, stream, 0), (event_node, cct_node, stream,
                          count, kind), const char *, symbol, const void *, src, size_t, count,
                          size_t, offset, enum cudaMemcpyKind, kind, cudaStream_t, stream)

#else

CUDA_RUNTIME_ASYNC_MEMCPY_WRAPPER(cudaMemcpyToSymbolAsync, (streamId,
                          event_node, context, cct_node, stream, 0), (event_node, cct_node, stream,
                          count, kind), const void *, symbol, const void *, src, size_t, count,
                          size_t, offset, enum cudaMemcpyKind, kind, cudaStream_t, stream)

#endif

#if (CUDART_VERSION < 5000)

CUDA_RUNTIME_ASYNC_MEMCPY_WRAPPER(cudaMemcpyFromSymbolAsync, (streamId,
                          event_node, context, cct_node, stream, 0), (event_node, cct_node, stream,
                          count, kind), void *, dst, const char *, symbol, size_t, count, size_t,
                          offset, enum cudaMemcpyKind, kind, cudaStream_t, stream)

#else

CUDA_RUNTIME_ASYNC_MEMCPY_WRAPPER(cudaMemcpyFromSymbolAsync, (streamId,
                          event_node, context, cct_node, stream, 0), (event_node, cct_node, stream,
                          count, kind), void *, dst, const void *, symbol, size_t, count, size_t,
                          offset, enum cudaMemcpyKind, kind, cudaStream_t, stream)

#endif

CUDA_RUNTIME_SYNC_MEMCPY_WRAPPER(cudaMemset, (context, launcher_cct,
                          syncStart, recorded_node), (context, launcher_cct, syncStart,
                          recorded_node, ALL_STREAMS_MASK, syncEnd, 0, cudaMemcpyHostToDevice),
                          void *, devPtr, int, value, size_t, count)

CUDA_RUNTIME_SYNC_MEMCPY_WRAPPER(cudaMemset2D, (context, launcher_cct,
                          syncStart, recorded_node), (context, launcher_cct, syncStart,
                          recorded_node, ALL_STREAMS_MASK, syncEnd, 0, cudaMemcpyHostToDevice),
                          void *, devPtr, size_t, pitch, int, value, size_t, width, size_t, height)

CUDA_RUNTIME_SYNC_MEMCPY_WRAPPER(cudaMemset3D, (context, launcher_cct,
                          syncStart, recorded_node), (context, launcher_cct, syncStart,
                          recorded_node, ALL_STREAMS_MASK, syncEnd, 0, cudaMemcpyHostToDevice),
                          struct cudaPitchedPtr, pitchedDevPtr, int, value, struct cudaExtent,
                          extent)

CUDA_RUNTIME_ASYNC_MEMCPY_WRAPPER(cudaMemsetAsync, (streamId, event_node,
                          context, cct_node, stream, 0), (event_node, cct_node, stream, 0,
                          cudaMemcpyHostToDevice), void *, devPtr, int, value, size_t, count,
                          cudaStream_t, stream)

CUDA_RUNTIME_ASYNC_MEMCPY_WRAPPER(cudaMemset2DAsync, (streamId,
                          event_node, context, cct_node, stream, 0), (event_node, cct_node, stream,
                          0, cudaMemcpyHostToDevice), void *, devPtr, size_t, pitch, int, value,
                          size_t, width, size_t, height, cudaStream_t, stream)

CUDA_RUNTIME_ASYNC_MEMCPY_WRAPPER(cudaMemset3DAsync, (streamId,
                          event_node, context, cct_node, stream, 0), (event_node, cct_node, stream,
                          0, cudaMemcpyHostToDevice), struct cudaPitchedPtr, pitchedDevPtr, int,
                          value, struct cudaExtent, extent, cudaStream_t, stream)

CUDA_RUNTIME_ASYNC_MEMCPY_WRAPPER(cudaMemcpyAsync, (streamId, event_node,
                          context, cct_node, stream, 0), (event_node, cct_node, stream, count,
                          kind), void *, dst, const void *, src, size_t, count, enum cudaMemcpyKind,
                          kind, cudaStream_t, stream)

CUDA_RUNTIME_ASYNC_MEMCPY_WRAPPER(cudaMemcpyToArrayAsync, (streamId,
                          event_node, context, cct_node, stream, 0), (event_node, cct_node,
                          stream, count, kind), struct cudaArray *, dst, size_t, wOffset, size_t,
                          hOffset, const void *, src, size_t, count, enum cudaMemcpyKind, kind,
                          cudaStream_t, stream)

CUDA_RUNTIME_SYNC_MEMCPY_WRAPPER(cudaMemcpy2D, (context, launcher_cct,
                          syncStart, recorded_node), (context, launcher_cct, syncStart,
                          recorded_node, ALL_STREAMS_MASK, syncEnd, (height * width), kind),
                          void *, dst, size_t, dpitch, const void *, src, size_t, spitch, size_t, width,
                          size_t, height, enum cudaMemcpyKind, kind)

CUDA_RUNTIME_SYNC_MEMCPY_WRAPPER(cudaMemcpy, (context, launcher_cct,
                          syncStart, recorded_node), (context, launcher_cct, syncStart,
                          recorded_node, ALL_STREAMS_MASK, syncEnd, count, kind), void *, dst,
                          const void *, src, size_t, count, enum cudaMemcpyKind, kind)

CUDA_RUNTIME_SYNC_MEMCPY_WRAPPER(cudaMemcpyToArray, (context,
                          launcher_cct, syncStart, recorded_node), (context, launcher_cct,
                          syncStart, recorded_node, ALL_STREAMS_MASK, syncEnd, count, kind),
                          struct cudaArray *, dst, size_t, wOffset, size_t, hOffset, const void *, src,
                          size_t, count, enum cudaMemcpyKind, kind)
CUresult cuStreamSynchronize(CUstream stream) {
  SYNC_PROLOGUE(context, launcher_cct, syncStart, recorded_node);

  monitor_disable_new_threads();
  CUresult ret = cuDriverFunctionPointer[cuStreamSynchronizeEnum].cuStreamSynchronizeReal(stream);
  monitor_enable_new_threads();

  hpcrun_safe_enter();
  uint32_t streamId;
  streamId = splay_get_stream_id((cudaStream_t) stream);
  hpcrun_safe_exit();

  SYNC_EPILOGUE(context, launcher_cct, syncStart, recorded_node, streamId, syncEnd);
  return ret;
}

CUresult cuEventSynchronize(CUevent event) {
  SYNC_PROLOGUE(context, launcher_cct, syncStart, recorded_node);

  monitor_disable_new_threads();
  CUresult ret = cuDriverFunctionPointer[cuEventSynchronizeEnum].cuEventSynchronizeReal(event);
  monitor_enable_new_threads();

  SYNC_EPILOGUE(context, launcher_cct, syncStart, recorded_node, ALL_STREAMS_MASK, syncEnd);
  return ret;
}
CUresult cuLaunchGridAsync(CUfunction f, int grid_width, int grid_height, CUstream hStream) {
  ASYNC_KERNEL_PROLOGUE(streamId, event_node, context, cct_node, ((cudaStream_t) hStream), 0);

  CUresult ret = cuDriverFunctionPointer[cuLaunchGridAsyncEnum].cuLaunchGridAsyncReal(f, grid_width, grid_height, hStream);

  ASYNC_KERNEL_EPILOGUE(event_node, ((cudaStream_t) hStream));
  return ret;
}

CUresult cuLaunchKernel(CUfunction f,
                        unsigned int gridDimX,
                        unsigned int gridDimY,
                        unsigned int gridDimZ,
                        unsigned int blockDimX,
                        unsigned int blockDimY,
                        unsigned int blockDimZ,
                        unsigned int sharedMemBytes,
                        CUstream hStream,
                        void **kernelParams,
                        void **extra) {
  ASYNC_KERNEL_PROLOGUE(streamId, event_node, context, cct_node, ((cudaStream_t) hStream), 0);

  CUresult ret = cuDriverFunctionPointer[cuLaunchKernelEnum].cuLaunchKernelReal(f, gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY, blockDimZ, sharedMemBytes, hStream, kernelParams, extra);

  ASYNC_KERNEL_EPILOGUE(event_node, ((cudaStream_t) hStream));
  return ret;
}
CUresult cuStreamDestroy(CUstream stream) {
  SYNCHRONOUS_CLEANUP;

  uint32_t streamId;
  streamId = splay_get_stream_id((cudaStream_t) stream);

  hpcrun_stream_finalize(g_stream_array[streamId].st);
  // ... (per-stream trace shutdown elided)
  g_stream_array[streamId].st = NULL;

  monitor_disable_new_threads();
  CUresult ret = cuDriverFunctionPointer[cuStreamDestroy_v2Enum].cuStreamDestroy_v2Real(stream);
  monitor_enable_new_threads();
  // ... (splay-tree removal elided)
  return ret;
}
CUresult cuStreamCreate(CUstream *phStream, unsigned int Flags) {
  TD_GET(gpu_data.is_thread_at_cuda_sync) = true;
  monitor_disable_new_threads();
  CUresult ret = cuDriverFunctionPointer[cuStreamCreateEnum].cuStreamCreateReal(phStream, Flags);
  monitor_enable_new_threads();
  TD_GET(gpu_data.is_thread_at_cuda_sync) = false;

  StreamCreateBookKeeper((cudaStream_t *) phStream);
  return ret;
}
static void destroy_all_events_in_free_event_list() {
  event_list_node_t *cur = g_free_event_nodes_head;

  monitor_disable_new_threads();
  while (cur) {
    CUDA_SAFE_CALL(cudaRuntimeFunctionPointer[cudaEventDestroyEnum].cudaEventDestroyReal(cur->event_start));
    CUDA_SAFE_CALL(cudaRuntimeFunctionPointer[cudaEventDestroyEnum].cudaEventDestroyReal(cur->event_end));
    cur->event_start = 0;
    cur->event_end = 0;
    cur = cur->next_free_node;
  }
  monitor_enable_new_threads();
}
CUresult cuCtxCreate_v2(CUcontext *pctx, unsigned int flags, CUdevice dev) {
  if (cuda_ncontexts_incr() > 1) {
    fprintf(stderr, "Too many contexts created\n");
    monitor_real_abort();
  }
  if (!hpcrun_is_safe_to_sync(__func__)) {
    return cuDriverFunctionPointer[cuCtxCreate_v2Enum].cuCtxCreate_v2Real(pctx, flags, dev);
  }
  TD_GET(gpu_data.is_thread_at_cuda_sync) = true;
  monitor_disable_new_threads();
  CUresult ret = cuDriverFunctionPointer[cuCtxCreate_v2Enum].cuCtxCreate_v2Real(pctx, flags, dev);
  monitor_enable_new_threads();
  TD_GET(gpu_data.is_thread_at_cuda_sync) = false;
  return ret;
}
CUresult cuCtxDestroy(CUcontext ctx) {
  SYNCHRONOUS_CLEANUP;

  HPCRUN_ASYNC_BLOCK_SPIN_LOCK;
  if (g_start_of_world_time != 0) {
    TD_GET(gpu_data.is_thread_at_cuda_sync) = true;

    close_all_streams(stream_to_id_tree_root);
    stream_to_id_tree_root = NULL;

    monitor_disable_new_threads();
    CUDA_SAFE_CALL(cudaRuntimeFunctionPointer[cudaEventDestroyEnum].cudaEventDestroyReal(g_start_of_world_event));
    g_start_of_world_time = 0;
    monitor_enable_new_threads();

    destroy_all_events_in_free_event_list();

    TD_GET(gpu_data.is_thread_at_cuda_sync) = false;
  }
  EMSG("Destroying Context!");
  HPCRUN_ASYNC_UNBLOCK_SPIN_UNLOCK;

  monitor_disable_new_threads();
  CUresult ret = cuDriverFunctionPointer[cuCtxDestroy_v2Enum].cuCtxDestroy_v2Real(ctx);
  monitor_enable_new_threads();
  return ret;
}
CUresult cuMemcpyHtoDAsync(CUdeviceptr dstDevice, const void *srcHost, size_t ByteCount, CUstream hStream) {
  ASYNC_MEMCPY_PROLOGUE(streamId, event_node, context, cct_node, ((cudaStream_t) hStream), 0);

  CUresult ret = cuDriverFunctionPointer[cuMemcpyHtoDAsync_v2Enum].cuMemcpyHtoDAsync_v2Real(dstDevice, srcHost, ByteCount, hStream);

  ASYNC_MEMCPY_EPILOGUE(event_node, cct_node, ((cudaStream_t) hStream), ByteCount, cudaMemcpyHostToDevice);
  return ret;
}

CUresult cuMemcpyHtoD(CUdeviceptr dstDevice, const void *srcHost, size_t ByteCount) {
  SYNC_MEMCPY_PROLOGUE(context, launcher_cct, syncStart, recorded_node);

  monitor_disable_new_threads();
  CUresult ret = cuDriverFunctionPointer[cuMemcpyHtoD_v2Enum].cuMemcpyHtoD_v2Real(dstDevice, srcHost, ByteCount);
  monitor_enable_new_threads();

  SYNC_MEMCPY_EPILOGUE(context, launcher_cct, syncStart, recorded_node, ALL_STREAMS_MASK, syncEnd, ByteCount, cudaMemcpyHostToDevice);
  return ret;
}

CUresult cuMemcpyDtoHAsync(void *dstHost, CUdeviceptr srcDevice, size_t ByteCount, CUstream hStream) {
  ASYNC_MEMCPY_PROLOGUE(streamId, event_node, context, cct_node, ((cudaStream_t) hStream), 0);

  CUresult ret = cuDriverFunctionPointer[cuMemcpyDtoHAsync_v2Enum].cuMemcpyDtoHAsync_v2Real(dstHost, srcDevice, ByteCount, hStream);

  ASYNC_MEMCPY_EPILOGUE(event_node, cct_node, ((cudaStream_t) hStream), ByteCount, cudaMemcpyDeviceToHost);
  return ret;
}

CUresult cuMemcpyDtoH(void *dstHost, CUdeviceptr srcDevice, size_t ByteCount) {
  SYNC_MEMCPY_PROLOGUE(context, launcher_cct, syncStart, recorded_node);

  monitor_disable_new_threads();
  CUresult ret = cuDriverFunctionPointer[cuMemcpyDtoH_v2Enum].cuMemcpyDtoH_v2Real(dstHost, srcDevice, ByteCount);
  monitor_enable_new_threads();

  SYNC_MEMCPY_EPILOGUE(context, launcher_cct, syncStart, recorded_node, ALL_STREAMS_MASK, syncEnd, ByteCount, cudaMemcpyDeviceToHost);
  return ret;
}
void
gpu_blame_shifter(void *dc, int metric_id, cct_node_t *node, int metric_dc)
{
  // ... (metric lookup elided)

  uint64_t cur_time_us = 0;
  int ret = time_getTimeReal(&cur_time_us);
  if (ret != 0) {
    EMSG("time_getTimeReal (clock_gettime) failed!");
    monitor_real_abort();
  }
  uint64_t metric_incr = cur_time_us - TD_GET(last_time_us);

  bool is_threads_at_sync = TD_GET(gpu_data.is_thread_at_cuda_sync);

  if (is_threads_at_sync) {
    if (SHARED_BLAMING_INITIALISED) {
      TD_GET(gpu_data.accum_num_sync_threads) += ipc_data->num_threads_at_sync_all_procs;
      TD_GET(gpu_data.accum_num_samples) += 1;
    }
    return;
  }

  uint32_t num_unfinished_streams = 0;
  stream_node_t *unfinished_event_list_head = 0;

  num_unfinished_streams = cleanup_finished_events();
  unfinished_event_list_head = g_unfinished_stream_list_head;

  if (num_unfinished_streams) {
    if (SHARED_BLAMING_INITIALISED && ipc_data->num_threads_at_sync_all_procs && !g_num_threads_at_sync) {
      for (stream_node_t *unfinished_stream = unfinished_event_list_head; unfinished_stream; unfinished_stream = unfinished_stream->next_unfinished_stream) {
        // ... (metric attribution elided; the increment's value is
        //      (cct_metric_data_t) { .r = metric_incr / g_active_threads })
      }
    }
  } else {
    if (TD_GET(gpu_data.overload_state) == WORKING_STATE) {
      TD_GET(gpu_data.overload_state) = OVERLOADABLE_STATE;
    }

    if (TD_GET(gpu_data.overload_state) == OVERLOADABLE_STATE) {
      // ... (GPU overload-potential attribution elided)
    }

    if (g_do_shared_blaming) {
      if (!ipc_data || ipc_data->outstanding_kernels == 0) {
        // ... (GPU idle attribution elided; the extract ends here)
      }
    }
  }
}