HPCToolkit
gpu_blame-overrides.c
1 // -*-Mode: C++;-*- // technically C99
2 
3 // * BeginRiceCopyright *****************************************************
4 //
5 // $HeadURL: https://outreach.scidac.gov/svn/hpctoolkit/branches/hpctoolkit-gpu-blame-shift-proto/src/tool/hpcrun/sample-sources/gpu_blame.c $
6 // $Id: itimer.c 3784 2012-05-10 22:35:51Z mc29 $
7 //
8 // --------------------------------------------------------------------------
9 // Part of HPCToolkit (hpctoolkit.org)
10 //
11 // Information about sources of support for research and development of
12 // HPCToolkit is at 'hpctoolkit.org' and in 'README.Acknowledgments'.
13 // --------------------------------------------------------------------------
14 //
15 // Copyright ((c)) 2002-2019, Rice University
16 // All rights reserved.
17 //
18 // Redistribution and use in source and binary forms, with or without
19 // modification, are permitted provided that the following conditions are
20 // met:
21 //
22 // * Redistributions of source code must retain the above copyright
23 // notice, this list of conditions and the following disclaimer.
24 //
25 // * Redistributions in binary form must reproduce the above copyright
26 // notice, this list of conditions and the following disclaimer in the
27 // documentation and/or other materials provided with the distribution.
28 //
29 // * Neither the name of Rice University (RICE) nor the names of its
30 // contributors may be used to endorse or promote products derived from
31 // this software without specific prior written permission.
32 //
33 // This software is provided by RICE and contributors "as is" and any
34 // express or implied warranties, including, but not limited to, the
35 // implied warranties of merchantability and fitness for a particular
36 // purpose are disclaimed. In no event shall RICE or contributors be
37 // liable for any direct, indirect, incidental, special, exemplary, or
38 // consequential damages (including, but not limited to, procurement of
39 // substitute goods or services; loss of use, data, or profits; or
40 // business interruption) however caused and on any theory of liability,
41 // whether in contract, strict liability, or tort (including negligence
42 // or otherwise) arising in any way out of the use of this software, even
43 // if advised of the possibility of such damage.
44 //
45 // ******************************************************* EndRiceCopyright *
46 
47 #ifdef ENABLE_CUDA
48 
49 //
50 // Blame shifting interface
51 //
52 
53 /******************************************************************************
54  * system includes
55  *****************************************************************************/
56 #include <errno.h>
57 #include <stddef.h>
58 #include <stdlib.h>
59 #include <string.h>
60 #include <assert.h>
61 #include <stdbool.h>
62 #include <sys/ipc.h>
63 #include <sys/shm.h>
64 #include <fcntl.h>
65 #include <sys/mman.h>
66 #include <unistd.h>
67 #include <signal.h>
68 #include <sys/time.h> /* setitimer() */
69 #include <cuda.h>
70 #include <cuda_runtime.h>
71 #include <dlfcn.h>
72 #include <sys/shm.h>
73 #include <ucontext.h> /* struct ucontext */
74 
75 /******************************************************************************
76  * libmonitor
77  *****************************************************************************/
78 
79 #include <monitor.h>
80 
81 /******************************************************************************
82  * local includes
83  *****************************************************************************/
84 #include "common.h"
85 #include <hpcrun/constructors.h>
86 #include "gpu_blame.h"
87 #include "gpu_ctxt_actions.h"
88 
89 #include <hpcrun/main.h>
90 #include <hpcrun/hpcrun_options.h>
91 #include <hpcrun/write_data.h>
92 #include <hpcrun/safe-sampling.h>
93 #include <hpcrun/hpcrun_stats.h>
94 #include <hpcrun/memory/mmap.h>
95 
96 #include <hpcrun/cct/cct.h>
97 #include <hpcrun/metrics.h>
98 #include <hpcrun/sample_event.h>
100 #include <hpcrun/thread_data.h>
101 #include <hpcrun/trace.h>
102 
103 #include <lush/lush-backtrace.h>
104 #include <messages/messages.h>
105 
106 #include <utilities/tokenize.h>
108 
109 #include <unwind/common/unwind.h>
110 
111 #include <lib/support-lean/timer.h>
112 #include <lib/prof-lean/spinlock.h>
113 #include <lib/prof-lean/atomic.h>
116 /******************************************************************************
117  * macros
118  *****************************************************************************/
119 
120 // MACROS for error checking CUDA/CUPTI APIs
121 
122 #define CHECK_CU_ERROR(err, cufunc) \
123 if (err != CUDA_SUCCESS) \
124 { \
125 EEMSG("%s:%d: error %d for CUDA Driver API function '%s'\n", \
126 __FILE__, __LINE__, err, cufunc); \
127 monitor_real_abort(); \
128 }
129 
130 #define CHECK_CUPTI_ERROR(err, cuptifunc) \
131 if (err != CUPTI_SUCCESS) \
132 { \
133 const char *errstr; \
134 cuptiGetResultString(err, &errstr); \
135 EEMSG("%s:%d:Error %s for CUPTI API function '%s'.\n", \
136 __FILE__, __LINE__, errstr, cuptifunc); \
137 monitor_real_abort(); \
138 }
139 
140 #define CU_SAFE_CALL( call ) do { \
141 CUresult err = call; \
142 if( CUDA_SUCCESS != err) { \
143 EEMSG("Cuda driver error %d in call at file '%s' in line %i.\n", \
144 err, __FILE__, __LINE__ ); \
145 monitor_real_abort(); \
146 } } while (0)
147 
148 #define CUDA_SAFE_CALL( call) do { \
149 cudaError_t err = call; \
150 if( cudaSuccess != err) { \
151  EMSG("In %s, @ line %d, gives error %d = '%s'\n", __FILE__, __LINE__, \
152  err, \
153  cudaGetErrorString(err)); \
154  monitor_real_abort(); \
155 } } while (0)
156 
157 #define Cuda_RTcall(fn) cudaRuntimeFunctionPointer[fn ## Enum].fn ## Real
158 
159 #define GET_STREAM_ID(x) ((x) - g_stream_array)
160 #define ALL_STREAMS_MASK (0xffffffff)
161 
162 #define MAX_SHARED_KEY_LENGTH (100)
163 
164 #define HPCRUN_GPU_SHMSZ (1<<10)
165 
166 #define SHARED_BLAMING_INITIALISED (ipc_data != NULL)
167 
168 #define INCR_SHARED_BLAMING_DS(field) do{ if(SHARED_BLAMING_INITIALISED) atomic_add_i64(&(ipc_data->field), 1L); }while(0)
169 #define DECR_SHARED_BLAMING_DS(field) do{ if(SHARED_BLAMING_INITIALISED) atomic_add_i64(&(ipc_data->field), -1L); }while(0)
170 
171 #define ADD_TO_FREE_EVENTS_LIST(node_ptr) do { (node_ptr)->next_free_node = g_free_event_nodes_head; \
172 g_free_event_nodes_head = (node_ptr); }while(0)
173 
174 #define ADD_TO_FREE_TREE_NODE_LIST(node_ptr) do { (node_ptr)->next_free_node = g_free_tree_nodes_head; \
175 g_free_tree_nodes_head = (node_ptr); }while(0)
176 
177 #define ADD_TO_FREE_ACTIVE_KERNEL_NODE_LIST(node_ptr) do { (node_ptr)->next_free_node = g_free_active_kernel_nodes_head; \
178 g_free_active_kernel_nodes_head = (node_ptr); }while(0)
179 
180 #define HPCRUN_ASYNC_BLOCK_SPIN_LOCK bool safe = false; \
181 do {safe = hpcrun_safe_enter(); \
182 spinlock_lock(&g_gpu_lock);} while(0)
183 
184 #define HPCRUN_ASYNC_UNBLOCK_SPIN_UNLOCK do{spinlock_unlock(&g_gpu_lock); \
185  if (safe) hpcrun_safe_exit();} while(0)
186 
187 #define SYNC_PROLOGUE(ctxt, launch_node, start_time, rec_node) \
188 TD_GET(gpu_data.overload_state) = SYNC_STATE; \
189 TD_GET(gpu_data.accum_num_sync_threads) = 0; \
190 TD_GET(gpu_data.accum_num_samples) = 0; \
191 hpcrun_safe_enter(); \
192 ucontext_t ctxt; \
193 getcontext(&ctxt); \
194 cct_node_t * launch_node = hpcrun_sample_callpath(&ctxt, cpu_idle_metric_id, 0 , 0 /*skipInner */ , 1 /*isSync */, NULL ).sample_node; \
195 TD_GET(gpu_data.is_thread_at_cuda_sync) = true; \
196 spinlock_lock(&g_gpu_lock); \
197 uint64_t start_time; \
198 event_list_node_t * rec_node = enter_cuda_sync(& start_time); \
199 spinlock_unlock(&g_gpu_lock); \
200 INCR_SHARED_BLAMING_DS(num_threads_at_sync_all_procs); \
201 hpcrun_safe_exit();
202 
203 #define SYNC_EPILOGUE(ctxt, launch_node, start_time, rec_node, mask, end_time) \
204 hpcrun_safe_enter(); \
205 spinlock_lock(&g_gpu_lock); \
206 uint64_t last_kernel_end_time = leave_cuda_sync(rec_node,start_time,mask); \
207 TD_GET(gpu_data.accum_num_sync_threads) = 0; \
208 TD_GET(gpu_data.accum_num_samples) = 0; \
209 spinlock_unlock(&g_gpu_lock); \
210 struct timeval tv; \
211 gettimeofday(&tv, NULL); \
212 uint64_t end_time = ((uint64_t)tv.tv_usec + (((uint64_t)tv.tv_sec) * 1000000)); \
213 if ( last_kernel_end_time > end_time) {last_kernel_end_time = end_time;} \
214 uint64_t cpu_idle_time = last_kernel_end_time == 0 ? 0: last_kernel_end_time - start_time; \
215 uint64_t gpu_idle_time = last_kernel_end_time == 0 ? end_time - start_time : end_time - last_kernel_end_time; \
216 cct_metric_data_increment(cpu_idle_metric_id, launch_node, (cct_metric_data_t) {.i = (cpu_idle_time)}); \
217 cct_metric_data_increment(gpu_idle_metric_id, launch_node, (cct_metric_data_t) {.i = (gpu_idle_time)}); \
218 hpcrun_safe_exit(); \
219 DECR_SHARED_BLAMING_DS(num_threads_at_sync_all_procs); \
220 TD_GET(gpu_data.is_thread_at_cuda_sync) = false;
221 
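// Worked example (illustrative, not from the original source) of the idle-time
// accounting in SYNC_EPILOGUE above: with start_time = 1000 us,
// last_kernel_end_time = 1600 us and end_time = 2000 us,
//   cpu_idle_time = 1600 - 1000 = 600 us  (CPU waited while kernels were still running)
//   gpu_idle_time = 2000 - 1600 = 400 us  (GPU sat idle after the last kernel finished)
// If no kernel finished during the sync (last_kernel_end_time == 0), then
//   cpu_idle_time = 0 and gpu_idle_time = end_time - start_time.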
222 #define SYNC_MEMCPY_PROLOGUE(ctxt, launch_node, start_time, rec_node) SYNC_PROLOGUE(ctxt, launch_node, start_time, rec_node)
223 
224 #define SYNC_MEMCPY_EPILOGUE(ctxt, launch_node, start_time, rec_node, mask, end_time, bytes, direction) \
225 hpcrun_safe_enter(); \
226 spinlock_lock(&g_gpu_lock); \
227 uint64_t last_kernel_end_time = leave_cuda_sync(rec_node,start_time,mask); \
228 TD_GET(gpu_data.accum_num_sync_threads) = 0; \
229 TD_GET(gpu_data.accum_num_samples) = 0; \
230 spinlock_unlock(&g_gpu_lock); \
231 struct timeval tv; \
232 gettimeofday(&tv, NULL); \
233 uint64_t end_time = ((uint64_t)tv.tv_usec + (((uint64_t)tv.tv_sec) * 1000000)); \
234 if ( last_kernel_end_time > end_time) {last_kernel_end_time = end_time;} \
235 uint64_t cpu_idle_time = end_time - start_time; \
236 uint64_t gpu_idle_time = last_kernel_end_time == 0 ? end_time - start_time : end_time - last_kernel_end_time; \
237 cct_metric_data_increment(cpu_idle_metric_id, launch_node, (cct_metric_data_t) {.i = (cpu_idle_time)}); \
238 cct_metric_data_increment(gpu_idle_metric_id, launch_node, (cct_metric_data_t) {.i = (gpu_idle_time)}); \
239 increment_mem_xfer_metric(bytes, direction, launch_node); \
240 hpcrun_safe_exit(); \
241 DECR_SHARED_BLAMING_DS(num_threads_at_sync_all_procs); \
242 TD_GET(gpu_data.is_thread_at_cuda_sync) = false
243 
244 
245 #define ASYNC_KERNEL_PROLOGUE(streamId, event_node, context, cct_node, stream, skip_inner) \
246 create_stream0_if_needed(stream); \
247 uint32_t streamId = 0; \
248 event_list_node_t *event_node; \
249 streamId = splay_get_stream_id(stream); \
250 HPCRUN_ASYNC_BLOCK_SPIN_LOCK; \
251 TD_GET(gpu_data.is_thread_at_cuda_sync) = true; \
252 ucontext_t context; \
253 getcontext(&context); \
254 cct_node_t *cct_node = hpcrun_sample_callpath(&context, cpu_idle_metric_id, 0, skip_inner /*skipInner */ , 1 /*isSync */, NULL ).sample_node; \
255 cct_node_t *stream_cct = stream_duplicate_cpu_node(g_stream_array[streamId].st, &context, cct_node); \
256 monitor_disable_new_threads(); \
257 event_node = create_and_insert_event(streamId, cct_node, stream_cct); \
258 CUDA_SAFE_CALL(cudaRuntimeFunctionPointer[cudaEventRecordEnum].cudaEventRecordReal(event_node->event_start, stream)); \
259 INCR_SHARED_BLAMING_DS(outstanding_kernels)
260 
261 
262 #define ASYNC_KERNEL_EPILOGUE(event_node, stream) \
263 TD_GET(gpu_data.overload_state) = WORKING_STATE; \
264 CUDA_SAFE_CALL(cudaRuntimeFunctionPointer[cudaEventRecordEnum].cudaEventRecordReal(event_node->event_end, stream)); \
265 monitor_enable_new_threads(); \
266 TD_GET(gpu_data.is_thread_at_cuda_sync) = false; \
267 HPCRUN_ASYNC_UNBLOCK_SPIN_UNLOCK
268 
269 #define ASYNC_MEMCPY_PROLOGUE(streamId, event_node, context, cct_node, stream, skip_inner) \
270  ASYNC_KERNEL_PROLOGUE(streamId, event_node, context, cct_node, stream, skip_inner)
271 
272 #define ASYNC_MEMCPY_EPILOGUE(event_node, cct_node, stream, count, kind) \
273 TD_GET(gpu_data.overload_state) = WORKING_STATE; \
274 CUDA_SAFE_CALL(cudaRuntimeFunctionPointer[cudaEventRecordEnum].cudaEventRecordReal(event_node->event_end, stream)); \
275 monitor_enable_new_threads(); \
276 increment_mem_xfer_metric(count, kind, cct_node); \
277 TD_GET(gpu_data.is_thread_at_cuda_sync) = false; \
278 HPCRUN_ASYNC_UNBLOCK_SPIN_UNLOCK
279 
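// Summary of the bracketing scheme above (descriptive note, not original source text):
// each asynchronous activity is fenced by two CUDA events on its stream,
//   cudaEventRecord(event_node->event_start, stream);   // just before the kernel/memcpy
//   ... asynchronous activity ...
//   cudaEventRecord(event_node->event_end, stream);     // just after it is enqueued
// Later, cleanup_finished_events() polls event_end with cudaEventQuery and, once it has
// completed, converts both events to CPU-clock microseconds using cudaEventElapsedTime()
// relative to g_start_of_world_event / g_start_of_world_time.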
280 
281 #define GET_NEW_TREE_NODE(node_ptr) do { \
282 if (g_free_tree_nodes_head) { \
283 node_ptr = g_free_tree_nodes_head; \
284 g_free_tree_nodes_head = g_free_tree_nodes_head->next_free_node; \
285 } else { \
286 node_ptr = (tree_node *) hpcrun_malloc(sizeof(tree_node)); \
287 } \
288 } while(0)
289 
290 #define GET_NEW_ACTIVE_KERNEL_NODE(node_ptr) do { \
291 if (g_free_active_kernel_nodes_head) { \
292 node_ptr = g_free_active_kernel_nodes_head; \
293 g_free_active_kernel_nodes_head = g_free_active_kernel_nodes_head->next_free_node; \
294 } else { \
295 node_ptr = (active_kernel_node_t *) hpcrun_malloc(sizeof(active_kernel_node_t)); \
296 } \
297 } while(0)
298 
299 
300 #define SYNCHRONOUS_CLEANUP do{ hpcrun_safe_enter(); \
301 spinlock_lock(&g_gpu_lock); \
302 cleanup_finished_events(); \
303 spinlock_unlock(&g_gpu_lock); \
304 hpcrun_safe_exit(); } while(0)
305 
306 
307 
308 #define CUDA_RUNTIME_SYNC_WRAPPER(fn, prologueArgs, epilogueArgs, ...) \
309  VA_FN_DECLARE(cudaError_t, fn, __VA_ARGS__) {\
310  if (! hpcrun_is_safe_to_sync(__func__)) return VA_FN_CALL(cudaRuntimeFunctionPointer[fn##Enum].fn##Real, __VA_ARGS__);\
311  SYNC_PROLOGUE prologueArgs;\
312  monitor_disable_new_threads();\
313  cudaError_t ret = VA_FN_CALL(cudaRuntimeFunctionPointer[fn##Enum].fn##Real, __VA_ARGS__);\
314  monitor_enable_new_threads();\
315  SYNC_EPILOGUE epilogueArgs;\
316  return ret;\
317  }
318 
319 
320 #define CUDA_RUNTIME_SYNC_ON_STREAM_WRAPPER(fn, prologueArgs, epilogueArgs, ...) \
321  VA_FN_DECLARE(cudaError_t, fn, __VA_ARGS__) {\
322  SYNC_PROLOGUE prologueArgs;\
323  monitor_disable_new_threads();\
324  cudaError_t ret = VA_FN_CALL(cudaRuntimeFunctionPointer[fn##Enum].fn##Real, __VA_ARGS__);\
325  hpcrun_safe_enter();\
326  uint32_t streamId;\
327  streamId = splay_get_stream_id(stream);\
328  hpcrun_safe_exit();\
329  monitor_enable_new_threads();\
330  SYNC_EPILOGUE epilogueArgs;\
331  return ret;\
332  }
333 
334 
335 #define CUDA_RUNTIME_ASYNC_MEMCPY_WRAPPER(fn, prologueArgs, epilogueArgs, ...) \
336  VA_FN_DECLARE(cudaError_t, fn, __VA_ARGS__) {\
337  if (! hpcrun_is_safe_to_sync(__func__)) return VA_FN_CALL(cudaRuntimeFunctionPointer[fn##Enum].fn##Real, __VA_ARGS__);\
338  ASYNC_MEMCPY_PROLOGUE prologueArgs;\
339  cudaError_t ret = VA_FN_CALL(cudaRuntimeFunctionPointer[fn##Enum].fn##Real, __VA_ARGS__);\
340  ASYNC_MEMCPY_EPILOGUE epilogueArgs;\
341  return ret;\
342  }
343 
344 #define CUDA_RUNTIME_SYNC_MEMCPY_WRAPPER(fn, prologueArgs, epilogueArgs, ...) \
345  VA_FN_DECLARE(cudaError_t, fn, __VA_ARGS__) {\
346  if (! hpcrun_is_safe_to_sync(__func__)) return VA_FN_CALL(cudaRuntimeFunctionPointer[fn##Enum].fn##Real, __VA_ARGS__);\
347  SYNC_MEMCPY_PROLOGUE prologueArgs;\
348  monitor_disable_new_threads();\
349  cudaError_t ret = VA_FN_CALL(cudaRuntimeFunctionPointer[fn##Enum].fn##Real, __VA_ARGS__);\
350  monitor_enable_new_threads();\
351  SYNC_MEMCPY_EPILOGUE epilogueArgs;\
352  return ret;\
353  }
354 
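#if 0
// Conceptual, hand-written expansion of
//   CUDA_RUNTIME_SYNC_WRAPPER(cudaThreadSynchronize, (context, launcher_cct, syncStart,
//     recorded_node), (context, launcher_cct, syncStart, recorded_node, ALL_STREAMS_MASK,
//     syncEnd), void)
// shown only to illustrate the control flow of these wrapper macros; it is not the
// literal preprocessor output.
cudaError_t cudaThreadSynchronize(void)
{
  if (! hpcrun_is_safe_to_sync(__func__))
    return cudaRuntimeFunctionPointer[cudaThreadSynchronizeEnum].cudaThreadSynchronizeReal();
  SYNC_PROLOGUE(context, launcher_cct, syncStart, recorded_node);
  monitor_disable_new_threads();
  cudaError_t ret = cudaRuntimeFunctionPointer[cudaThreadSynchronizeEnum].cudaThreadSynchronizeReal();
  monitor_enable_new_threads();
  SYNC_EPILOGUE(context, launcher_cct, syncStart, recorded_node, ALL_STREAMS_MASK, syncEnd);
  return ret;
}
#endif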
355 
356 //
357 // Macro to populate a given set of CUDA function pointers:
358 // takes a basename for a function pointer set, and a library
359 // to read from (as a fallback).
360 //
361 // Method:
362 //
363 // Decide on RTLD_NEXT or dlopen of library for the function pointer set
364 // (Abort if neither method succeeds)
365 // fetch all of the symbols using dlsym, aborting on any failure
366 
367 #define PopulateGPUFunctionPointers(basename, library) \
368  char *error; \
369  \
370  dlerror(); \
371  void* dlsym_arg = RTLD_NEXT; \
372  void* try = dlsym(dlsym_arg, basename ## FunctionPointer[0].functionName); \
373  if ((error=dlerror()) || (! try)) { \
374  if (getenv("DEBUG_HPCRUN_GPU_CONS")) \
375  fprintf(stderr, "RTLD_NEXT argument fails for " #basename " (%s)\n", \
376  (! try) ? "trial function pointer = NULL" : "dlerror != NULL"); \
377  dlerror(); \
378  dlsym_arg = monitor_real_dlopen(#library, RTLD_LAZY); \
379  if (! dlsym_arg) { \
380  fprintf(stderr, "fallback dlopen of " #library " failed," \
381  " dlerror message = '%s'\n", dlerror()); \
382  monitor_real_abort(); \
383  } \
384  if (getenv("DEBUG_HPCRUN_GPU_CONS")) \
385  fprintf(stderr, "Going forward with " #basename " overrides using " #library "\n"); \
386  } \
387  else \
388  if (getenv("DEBUG_HPCRUN_GPU_CONS")) \
389  fprintf(stderr, "Going forward with " #basename " overrides using RTLD_NEXT\n"); \
390  for (int i = 0; i < sizeof(basename ## FunctionPointer)/sizeof(basename ## FunctionPointer[0]); i++) { \
391  dlerror(); \
392  basename ## FunctionPointer[i].generic = \
393  dlsym(dlsym_arg, basename ## FunctionPointer[i].functionName); \
394  if (getenv("DEBUG_HPCRUN_GPU_CONS")) \
395  fprintf(stderr, #basename "Fnptr[%d] @ %p for %s = %p\n", \
396  i, & basename ## FunctionPointer[i].generic, \
397  basename ## FunctionPointer[i].functionName, \
398  basename ## FunctionPointer[i].generic); \
399  if ((error = dlerror()) != NULL) { \
400  EEMSG("%s: during dlsym \n", error); \
401  monitor_real_abort(); \
402  } \
403  }
404 
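#if 0
// Minimal sketch of the lookup strategy that PopulateGPUFunctionPointers implements,
// written as a plain helper for illustration only: the real macro uses
// monitor_real_dlopen/monitor_real_abort and iterates over a function-pointer table,
// and the function and library names below are just examples.
static void *
lookup_real_symbol(const char *fn_name, const char *fallback_lib)
{
  dlerror();                                   // clear any stale dlerror state
  void *sym = dlsym(RTLD_NEXT, fn_name);       // first choice: next object in search order
  if (dlerror() != NULL || sym == NULL) {
    void *handle = dlopen(fallback_lib, RTLD_LAZY);  // fallback: open the library directly
    if (handle == NULL) abort();
    dlerror();
    sym = dlsym(handle, fn_name);
    if (sym == NULL || dlerror() != NULL) abort();
  }
  return sym;
}
// e.g.: lookup_real_symbol("cudaThreadSynchronize", "libcudart.so");
#endif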
405 /******************************************************************************
406  * local constants
407  *****************************************************************************/
408 
409 enum _cuda_const {
410  KERNEL_START,
411  KERNEL_END
412 };
413 
414 // states for accounting overload potential
415 enum overloadPotentialState{
416  START_STATE=0,
417  WORKING_STATE,
418  SYNC_STATE,
419  OVERLOADABLE_STATE
420 };
421 
422 
423 /******************************************************************************
424  * externs
425  *****************************************************************************/
426 
427 // function pointers to real cuda runtime functions
428 extern cudaRuntimeFunctionPointer_t cudaRuntimeFunctionPointer[];
429 
430 // function pointers to real cuda driver functions
431 extern cuDriverFunctionPointer_t cuDriverFunctionPointer[];
432 
433 // special papi disable function
434 extern void hpcrun_disable_papi_cuda(void);
435 
436 /******************************************************************************
437  * forward declarations
438  *****************************************************************************/
439 
440 // Each event_list_node_t maintains information about an asynchronous cuda activity (kernel or memcpy)
441 // event_list_node_t is a bit of a misnomer; it should have been activity_list_node_t.
442 
443 typedef struct event_list_node_t {
444  // cudaEvent inserted immediately before and after the activity
445  cudaEvent_t event_start;
446  cudaEvent_t event_end;
447 
448  // start and end times of event_start and event_end
449  uint64_t event_start_time;
450  uint64_t event_end_time;
451 
452  // CCT node of the CPU thread that launched this activity
453  cct_node_t *launcher_cct;
454  // CCT node of the stream
455  cct_node_t *stream_launcher_cct;
456 
457  // Outstanding threads that need to examine this activity
458  uint32_t ref_count;
459 
460  // our internal splay tree id for the corresponding cudaStream for this activity
461  uint32_t stream_id;
462  union {
463  struct event_list_node_t *next;
464  struct event_list_node_t *next_free_node;
465  };
466 } event_list_node_t;
467 
468 
469 // Per GPU stream information
470 typedef struct stream_node_t {
471  // hpcrun profiling and tracing info
472  struct core_profile_trace_data_t *st;
473  // pointer to most recently issued activity
474  struct event_list_node_t *latest_event_node;
475  // pointer to the oldest unfinished activity of this stream
476  struct event_list_node_t *unfinished_event_node;
477  // pointer to the next stream which has activities pending
478  struct stream_node_t *next_unfinished_stream;
479  // used to remove from hpcrun cleanup list if stream is explicitly destroyed
480  hpcrun_aux_cleanup_t * aux_cleanup_info;
481  // IDLE NODE persistent id for this stream
482  int32_t idle_node_id;
483 
484 } stream_node_t;
485 
486 
487 
488 typedef struct active_kernel_node_t {
489  uint64_t event_time;
490  bool event_type;
491  uint32_t stream_id;
492  union {
493  cct_node_t *launcher_cct; // present only in START nodes
494  struct active_kernel_node_t *start_node;
495  };
496  union {
497  struct active_kernel_node_t *next;
498  struct active_kernel_node_t *next_free_node;
499  };
500  struct active_kernel_node_t *next_active_kernel;
501  struct active_kernel_node_t *prev;
502 
503 } active_kernel_node_t;
504 
505 // We map GPU stream ID given by cuda to an internal id and place it in a splay tree.
506 // stream_to_id_map_t is the structure we store as a node in the splay tree
507 
508 typedef struct stream_to_id_map_t {
509  // actual cudaStream
510  cudaStream_t stream;
511  // Id given by us
512  uint32_t id;
513  struct stream_to_id_map_t *left;
514  struct stream_to_id_map_t *right;
515 } stream_to_id_map_t;
516 
517 
518 typedef struct IPC_data_t {
519  uint32_t device_id;
520  uint64_t outstanding_kernels;
521  uint64_t num_threads_at_sync_all_procs;
522 } IPC_data_t;
523 
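// Descriptive note: IPC_data_t lives in a per-device POSIX shared-memory segment
// (named "/gpublame<device id>" in create_shared_memory() below), so that all
// processes sharing a GPU can see each other's outstanding kernels and threads
// blocked at a sync; the INCR/DECR_SHARED_BLAMING_DS macros update these fields
// atomically.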
524 
525 static uint32_t cleanup_finished_events();
526 
527 
528 /******************************************************************************
529  * global variables
530  *****************************************************************************/
531 
532 
533 /******************************************************************************
534  * local variables
535  *****************************************************************************/
536 
537 // TODO: Hack to show streams as threads; we assume a maximum of 32 CPU threads
538 static uint32_t g_stream_id = 32;
539 static uint32_t g_stream_to_id_index = 0;
540 
541 // Lock for stream to id map
542 static spinlock_t g_stream_id_lock = SPINLOCK_UNLOCKED;
543 
544 // lock for GPU activities
545 static spinlock_t g_gpu_lock = SPINLOCK_UNLOCKED;
546 
547 static uint64_t g_num_threads_at_sync;
548 
549 static event_list_node_t *g_free_event_nodes_head;
550 static active_kernel_node_t *g_free_active_kernel_nodes_head;
551 
552 // root of splay tree of stream ids
553 static struct stream_to_id_map_t *stream_to_id_tree_root;
554 static stream_to_id_map_t stream_to_id[MAX_STREAMS];
555 static stream_node_t g_stream_array[MAX_STREAMS];
556 
557 // First stream with pending activities
558 static stream_node_t *g_unfinished_stream_list_head;
559 
560 // Tail of the list of finished activities (event nodes)
561 static event_list_node_t *g_finished_event_nodes_tail;
562 
563 // dummy activity node
564 static event_list_node_t dummy_event_node = {
565  .event_end = 0,
566  .event_start = 0,
567  .event_start_time = 0,
568  .event_end_time = 0,
569  .launcher_cct = 0,
570  .stream_launcher_cct = 0
571 };
572 
573 // is inter-process blaming enabled?
574 static bool g_do_shared_blaming;
575 
576 // What level of nodes to skip in the backtrace
577 static uint32_t g_cuda_launch_skip_inner;
578 
579 static uint64_t g_start_of_world_time;
580 
581 static cudaEvent_t g_start_of_world_event;
582 
583 static bool g_stream0_initialized = false;
584 
585 static IPC_data_t * ipc_data;
586 
587 /******************** Utilities ********************/
588 /******************** CONSTRUCTORS ********************/
589 
590 
591 // obtain function pointers to all real cuda runtime functions
592 
593 static void
594 PopulateEntryPointesToWrappedCudaRuntimeCalls()
595 {
596  PopulateGPUFunctionPointers(cudaRuntime, libcudart.so)
597 }
598 
599 // obtain function pointers to all real cuda driver functions
600 
601 static void
602 PopulateEntryPointesToWrappedCuDriverCalls(void)
603 {
604  PopulateGPUFunctionPointers(cuDriver, libcuda.so)
605 }
606 
607 static void
608 InitCpuGpuBlameShiftDataStructs(void)
609 {
610  char * shared_blaming_env;
611  char * cuda_launch_skip_inner_env;
612  g_unfinished_stream_list_head = NULL;
613  g_finished_event_nodes_tail = &dummy_event_node;
614  dummy_event_node.next = g_finished_event_nodes_tail;
615  shared_blaming_env = getenv("HPCRUN_ENABLE_SHARED_GPU_BLAMING");
616  if(shared_blaming_env)
617  g_do_shared_blaming = atoi(shared_blaming_env);
618 
619  cuda_launch_skip_inner_env = getenv("HPCRUN_CUDA_LAUNCH_SKIP_INNER");
620  if(cuda_launch_skip_inner_env)
621  g_cuda_launch_skip_inner = atoi(cuda_launch_skip_inner_env);
622 }
623 
624 
625 static void PopulateEntryPointesToWrappedCalls() {
626  PopulateEntryPointesToWrappedCudaRuntimeCalls();
627  PopulateEntryPointesToWrappedCuDriverCalls();
628 }
629 
630 HPCRUN_CONSTRUCTOR(CpuGpuBlameShiftInit)(void)
631 {
632  hpcrun_disable_papi_cuda();
633  if (getenv("DEBUG_HPCRUN_GPU_CONS"))
634  fprintf(stderr, "CPU-GPU blame shift constructor called\n");
635  // no dlopen calls in static case
636  // #ifndef HPCRUN_STATIC_LINK
637  PopulateEntryPointesToWrappedCalls();
638  InitCpuGpuBlameShiftDataStructs();
639  // #endif // ! HPCRUN_STATIC_LINK
640 }
641 
642 /******************** END CONSTRUCTORS ****/
643 
644 static char shared_key[MAX_SHARED_KEY_LENGTH];
645 
646 static void destroy_shared_memory(void * p) {
647  // we should munmap here, but we don't, since hpcrun skips this cleanup in many other places
648  // munmap(ipc_data);
649  shm_unlink((char *)shared_key);
650 }
651 
652 static inline void create_shared_memory() {
653 
654  int device_id;
655  int fd ;
656  monitor_disable_new_threads();
657  CUDA_SAFE_CALL(cudaRuntimeFunctionPointer[cudaGetDeviceEnum].cudaGetDeviceReal(&device_id));
658  monitor_enable_new_threads();
659  sprintf(shared_key, "/gpublame%d",device_id);
660  if ( (fd = shm_open(shared_key, O_RDWR | O_CREAT, 0666)) < 0 ) {
661  EEMSG("Failed to shm_open (%s) on device %d, retval = %d", shared_key, device_id, fd);
662  monitor_real_abort();
663  }
664  if ( ftruncate(fd, sizeof(IPC_data_t)) < 0 ) {
665  EEMSG("Failed to ftruncate() on device %d",device_id);
666  monitor_real_abort();
667  }
668 
669  if( (ipc_data = mmap(NULL, sizeof(IPC_data_t), PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0 )) == MAP_FAILED ) {
670  EEMSG("Failed to mmap() on device %d",device_id);
671  monitor_real_abort();
672  }
673 
674  hpcrun_process_aux_cleanup_add(destroy_shared_memory, (void *) shared_key);
675 
676 }
677 
678 // Get the stream id given a cudaStream
679 
680 static struct stream_to_id_map_t *splay(struct stream_to_id_map_t *root, cudaStream_t key) {
681  REGULAR_SPLAY_TREE(stream_to_id_map_t, root, key, stream, left, right);
682  return root;
683 }
684 
685 
686 static uint32_t splay_get_stream_id(cudaStream_t key) {
687  spinlock_lock(&g_stream_id_lock);
688  struct stream_to_id_map_t *root = stream_to_id_tree_root;
689  REGULAR_SPLAY_TREE(stream_to_id_map_t, root, key, stream, left, right);
690  // The stream at the root must match the key; otherwise we are in bad shape.
691  assert(root->stream == key);
692  stream_to_id_tree_root = root;
693  uint32_t ret = stream_to_id_tree_root->id;
694  spinlock_unlock(&g_stream_id_lock);
695  return ret;
696 
697 }
698 
699 
700 // Insert a new cudaStream into the splay tree
701 
702 static stream_to_id_map_t *splay_insert(cudaStream_t stream_ip)
703 {
704 
705  spinlock_lock(&g_stream_id_lock);
706  struct stream_to_id_map_t *node = &stream_to_id[g_stream_to_id_index++];
707  node->stream = stream_ip;
708  node->left = node->right = NULL;
709  node->id = g_stream_id++;
710  cudaStream_t stream = node->stream;
711 
712  if (stream_to_id_tree_root != NULL) {
713  stream_to_id_tree_root = splay(stream_to_id_tree_root, stream);
714 
715  if (stream < stream_to_id_tree_root->stream) {
716  node->left = stream_to_id_tree_root->left;
717  node->right = stream_to_id_tree_root;
718  stream_to_id_tree_root->left = NULL;
719  } else if (stream > stream_to_id_tree_root->stream) {
720  node->left = stream_to_id_tree_root;
721  node->right = stream_to_id_tree_root->right;
722  stream_to_id_tree_root->right = NULL;
723  } else {
724  EEMSG("stream_to_id_tree_root splay tree: unable to insert %p (already present)", node->stream);
725  monitor_real_abort();
726  }
727  }
728  stream_to_id_tree_root = node;
729  spinlock_unlock(&g_stream_id_lock);
730  return stream_to_id_tree_root;
731 }
732 
733 // Initialize hpcrun core_profile_trace_data for a new stream
734 static inline core_profile_trace_data_t *hpcrun_stream_data_alloc_init(int id) {
735  core_profile_trace_data_t *st = hpcrun_malloc(sizeof(core_profile_trace_data_t));
736  // FIXME: revisit to perform this memstore operation appropriately.
737  //memstore = td->memstore;
738  memset(st, 0xfe, sizeof(core_profile_trace_data_t));
739  //td->memstore = memstore;
740  //hpcrun_make_memstore(&td->memstore, is_child);
741  st->id = id;
742  st->epoch = hpcrun_malloc(sizeof(epoch_t));
743  st->epoch->csdata_ctxt = copy_thr_ctxt(TD_GET(core_profile_trace_data.epoch)->csdata.ctxt); //copy_thr_ctxt(thr_ctxt);
744  hpcrun_cct_bundle_init(&(st->epoch->csdata), (st->epoch->csdata).ctxt);
746  st->epoch->next = NULL;
747  hpcrun_cct2metrics_init(&(st->cct2metrics_map)); //this just does st->map = NULL;
748 
749 
750  st->trace_min_time_us = 0;
751  st->trace_max_time_us = 0;
752  st->hpcrun_file = NULL;
753 
754  return st;
755 }
756 
757 
758 
759 static cct_node_t *stream_duplicate_cpu_node(core_profile_trace_data_t *st, ucontext_t *context, cct_node_t *node) {
760  cct_bundle_t* cct= &(st->epoch->csdata);
761  cct_node_t * tmp_root = cct->tree_root;
762  hpcrun_cct_insert_path(&tmp_root, node);
763  return tmp_root;
764 }
765 
766 
767 inline void hpcrun_stream_finalize(void * st) {
768  if(hpcrun_trace_isactive())
769  hpcrun_trace_close(st);
770 
771  hpcrun_write_profile_data((core_profile_trace_data_t *) st);
772 }
773 
774 
775 static struct stream_to_id_map_t *splay_delete(cudaStream_t stream)
776 {
777  struct stream_to_id_map_t *result = NULL;
778 
779  TMSG(CUDA, "Trying to delete %p from stream splay tree", stream);
780  spinlock_lock(&g_stream_id_lock);
781  if (stream_to_id_tree_root == NULL) {
782  spinlock_unlock(&g_stream_id_lock);
783  TMSG(CUDA, "stream_to_id_map_t splay tree empty: unable to delete %p", stream);
784  return NULL;
785  }
786 
787  stream_to_id_tree_root = splay(stream_to_id_tree_root, stream);
788 
789  if (stream != stream_to_id_tree_root->stream) {
790  spinlock_unlock(&g_stream_id_lock);
791  TMSG(CUDA, "trying to delete stream %p, but not in splay tree (root = %p)", stream, stream_to_id_tree_root->stream);
792  // monitor_real_abort();
793  return NULL;
794  }
795 
796  result = stream_to_id_tree_root;
797 
798  if (stream_to_id_tree_root->left == NULL) {
799  stream_to_id_tree_root = stream_to_id_tree_root->right;
800  spinlock_unlock(&g_stream_id_lock);
801  return result;
802  }
803 
804  stream_to_id_tree_root->left = splay(stream_to_id_tree_root->left, stream);
805  stream_to_id_tree_root->left->right = stream_to_id_tree_root->right;
806  stream_to_id_tree_root = stream_to_id_tree_root->left;
807  spinlock_unlock(&g_stream_id_lock);
808  return result;
809 }
810 
811 
812 // Prologue for any cuda synchronization routine
813 static inline event_list_node_t *enter_cuda_sync(uint64_t * syncStart) {
815 
816  // Clean up events so that anything left in the queue when we wait is something we have not yet seen that finished after our timer started.
817  cleanup_finished_events();
818 
819  struct timeval tv;
820  gettimeofday(&tv, NULL);
821  *syncStart = ((uint64_t) tv.tv_usec + (((uint64_t) tv.tv_sec) * 1000000));
822 
823  event_list_node_t *recorded_node = g_finished_event_nodes_tail;
824  if (g_finished_event_nodes_tail != &dummy_event_node)
825  g_finished_event_nodes_tail->ref_count++;
826 
827  atomic_add_i64(&g_num_threads_at_sync, 1L);
828  // caller does HPCRUN_ASYNC_UNBLOCK_SPIN_UNLOCK;
829  return recorded_node;
830 }
831 
832 
833 // blame all kernels finished during the sync time
834 
835 static uint64_t attribute_shared_blame_on_kernels(event_list_node_t * recorded_node, uint64_t recorded_time, const uint32_t stream_mask, double scaling_factor) {
836 
837  // if recorded_node is not dummy_event_node decrement its ref count
838  if (recorded_node != &dummy_event_node)
839  recorded_node->ref_count--;
840 
841  uint32_t num_active_kernels = 0;
842  active_kernel_node_t *sorted_active_kernels_begin = NULL;
843 
844  // Traverse all nodes, inserting them in a sorted list if their end times were past the recorded time
845  // If their start times were before the recorded time, treat them as starting at recorded_time
846 
847  event_list_node_t *cur = recorded_node->next, *prev = recorded_node;
848  while (cur != &dummy_event_node) {
849  // if the node's ref count is already zero, free it; nobody cares about it anymore
850  if (cur->ref_count == 0) {
851  prev->next = cur->next;
852  event_list_node_t *to_free = cur;
853  cur = cur->next;
854  ADD_TO_FREE_EVENTS_LIST(to_free);
855  continue;
856  }
857 
858  cur->ref_count--;
859 
860  if ((cur->event_end_time <= recorded_time) || (cur->stream_id != (cur->stream_id & stream_mask))) {
861  if (cur->ref_count == 0) {
862  prev->next = cur->next;
863  event_list_node_t *to_free = cur;
864  cur = cur->next;
865  ADD_TO_FREE_EVENTS_LIST(to_free);
866  } else {
867  prev = cur;
868  cur = cur->next;
869  }
870  continue;
871  }
872  // Add start and end times in a sorted list (insertion sort)
873 
874  active_kernel_node_t *start_active_kernel_node;
875  active_kernel_node_t *end_active_kernel_node;
876  GET_NEW_ACTIVE_KERNEL_NODE(start_active_kernel_node);
877  GET_NEW_ACTIVE_KERNEL_NODE(end_active_kernel_node);
878 
879  if (cur->event_start_time < recorded_time) {
880  start_active_kernel_node->event_time = recorded_time;
881  } else {
882  start_active_kernel_node->event_time = cur->event_start_time;
883  }
884 
885  start_active_kernel_node->event_type = KERNEL_START;
886  start_active_kernel_node->stream_id = cur->stream_id;
887  start_active_kernel_node->launcher_cct = cur->launcher_cct;
888  start_active_kernel_node->next_active_kernel = NULL;
889 
890  end_active_kernel_node->event_type = KERNEL_END;
891  end_active_kernel_node->start_node = start_active_kernel_node;
892  end_active_kernel_node->event_time = cur->event_end_time;
893 
894  // drop if times are same
895  if (start_active_kernel_node->event_time == end_active_kernel_node->event_time) {
896  ADD_TO_FREE_ACTIVE_KERNEL_NODE_LIST(start_active_kernel_node);
897  ADD_TO_FREE_ACTIVE_KERNEL_NODE_LIST(end_active_kernel_node);
898  if (cur->ref_count == 0) {
899  prev->next = cur->next;
900  event_list_node_t *to_free = cur;
901  cur = cur->next;
902  ADD_TO_FREE_EVENTS_LIST(to_free);
903  } else {
904  prev = cur;
905  cur = cur->next;
906  }
907  continue;
908  }
909  assert(start_active_kernel_node->event_time < end_active_kernel_node->event_time);
910 
911  if (sorted_active_kernels_begin == NULL) {
912  // First entry
913  start_active_kernel_node->next = end_active_kernel_node;
914  start_active_kernel_node->prev = end_active_kernel_node;
915  end_active_kernel_node->prev = start_active_kernel_node;
916  end_active_kernel_node->next = start_active_kernel_node;
917  sorted_active_kernels_begin = start_active_kernel_node;
918  } else {
919  // There are at least 2 entries
920 
921  // current points to the last node in terms of time
922  active_kernel_node_t *current = sorted_active_kernels_begin->prev;
923  bool change_head = 1;
924  do {
925  if (end_active_kernel_node->event_time > current->event_time) {
926  change_head = 0;
927  break;
928  }
929  current = current->prev;
930  } while (current != sorted_active_kernels_begin->prev);
931  end_active_kernel_node->next = current->next;
932  end_active_kernel_node->prev = current;
933  current->next->prev = end_active_kernel_node;
934  current->next = end_active_kernel_node;
935  if (change_head) {
936  sorted_active_kernels_begin = end_active_kernel_node;
937  }
938 
939  current = end_active_kernel_node->prev;
940  change_head = 1;
941  do {
942  if (start_active_kernel_node->event_time > current->event_time) {
943  change_head = 0;
944  break;
945  }
946  current = current->prev;
947  } while (current != sorted_active_kernels_begin->prev);
948  start_active_kernel_node->next = current->next;
949  start_active_kernel_node->prev = current;
950  current->next->prev = start_active_kernel_node;
951  current->next = start_active_kernel_node;
952  if (change_head) {
953  sorted_active_kernels_begin = start_active_kernel_node;
954  }
955  }
956 
957  if (cur->ref_count == 0) {
958  prev->next = cur->next;
959  event_list_node_t *to_free = cur;
960  cur = cur->next;
961  ADD_TO_FREE_EVENTS_LIST(to_free);
962  } else {
963  prev = cur;
964  cur = cur->next;
965  }
966 
967  }
968  g_finished_event_nodes_tail = prev;
969 
970  // now attribute blame on the sorted list
971  uint64_t last_kernel_end_time = 0;
972  if (sorted_active_kernels_begin) {
973 
974  // attach a dummy tail
975  active_kernel_node_t *dummy_kernel_node;
976  GET_NEW_ACTIVE_KERNEL_NODE(dummy_kernel_node);
977  sorted_active_kernels_begin->prev->next = dummy_kernel_node;
978  dummy_kernel_node->prev = sorted_active_kernels_begin->prev;
979  sorted_active_kernels_begin->prev = dummy_kernel_node;
980  dummy_kernel_node->next = sorted_active_kernels_begin;
981 
982  active_kernel_node_t *current = sorted_active_kernels_begin;
983  uint64_t last_time = recorded_time;
984  do {
985  uint64_t new_time = current->event_time;
986 
987  assert(new_time >= last_time);
988  assert(current != dummy_kernel_node && "should never process dummy_kernel_node");
989 
990  if (num_active_kernels && (new_time > last_time)) {
991  //blame all
992  active_kernel_node_t *blame_node = current->prev;
993  do {
994  assert(blame_node->event_type == KERNEL_START);
995 
996  cct_metric_data_increment(cpu_idle_cause_metric_id, blame_node->launcher_cct, (cct_metric_data_t) {
997  .r = (new_time - last_time) * (scaling_factor) / num_active_kernels}
998  );
999  blame_node = blame_node->prev;
1000  } while (blame_node != sorted_active_kernels_begin->prev);
1001  }
1002 
1003  last_time = new_time;
1004 
1005  if (current->event_type == KERNEL_START) {
1006  num_active_kernels++;
1007  current = current->next;
1008  } else {
1009  last_kernel_end_time = new_time;
1010  current->start_node->prev->next = current->start_node->next;
1011  current->start_node->next->prev = current->start_node->prev;
1012  if (current->start_node == sorted_active_kernels_begin)
1013  sorted_active_kernels_begin = current->start_node->next;
1014  ADD_TO_FREE_ACTIVE_KERNEL_NODE_LIST((current->start_node));
1015 
1016 #if 0 // Not a plausible case
1017  // If I am the last one then Just free and break;
1018  if (current->next == current) {
1019  ADD_TO_FREE_ACTIVE_KERNEL_NODE_LIST(current);
1020  break;
1021  }
1022 #endif
1023  current->prev->next = current->next;
1024  current->next->prev = current->prev;
1025  if (current == sorted_active_kernels_begin)
1026  sorted_active_kernels_begin = current->next;
1027  num_active_kernels--;
1028  active_kernel_node_t *to_free = current;
1029  current = current->next;
1030  ADD_TO_FREE_ACTIVE_KERNEL_NODE_LIST(to_free);
1031 
1032  }
1033 
1034  } while (current != sorted_active_kernels_begin->prev);
1035  // Free up the dummy node
1036  ADD_TO_FREE_ACTIVE_KERNEL_NODE_LIST(dummy_kernel_node);
1037 
1038  }
1039 
1040  return last_kernel_end_time;
1041 
1042 }
1043 
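// Illustrative example of the proportional blame computed above: if two kernels are
// active over a 100 us interval of CPU idleness (new_time - last_time == 100) and
// scaling_factor == 1.0, each kernel's launcher_cct is charged 100 * 1.0 / 2 = 50 us.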
1044 // Epilogue for any cuda synchronization routine
1045 static inline uint64_t leave_cuda_sync(event_list_node_t * recorded_node, uint64_t syncStart, const uint32_t stream_mask) {
1046  //caller does HPCRUN_ASYNC_BLOCK_SPIN_LOCK;
1047 
1048  // Clean up events so that anything left in the queue when we wait is something we have not yet seen that finished after our timer started.
1049  cleanup_finished_events();
1050 
1051  double scaling_factor = 1.0;
1052  if(SHARED_BLAMING_INITIALISED && TD_GET(gpu_data.accum_num_samples))
1053  scaling_factor *= ((double)TD_GET(gpu_data.accum_num_sync_threads))/TD_GET(gpu_data.accum_num_samples);
1054  uint64_t last_kernel_end_time = attribute_shared_blame_on_kernels(recorded_node, syncStart, stream_mask, scaling_factor);
1055  atomic_add_i64(&g_num_threads_at_sync, -1L);
1056  return last_kernel_end_time;
1057  //caller does HPCRUN_ASYNC_UNBLOCK_SPIN_UNLOCK;
1058 }
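// Note on scaling_factor above: with shared blaming initialised and samples recorded,
// the blame is scaled by the average number of threads observed waiting at this sync
// per sample (accum_num_sync_threads / accum_num_samples); otherwise it remains 1.0.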
1059 
1060 
1061 // inspect activities finished on each stream and record metrics accordingly
1062 
1063 static uint32_t cleanup_finished_events() {
1064  uint32_t num_unfinished_streams = 0;
1065  stream_node_t *prev_stream = NULL;
1066  stream_node_t *next_stream = NULL;
1067  stream_node_t *cur_stream = g_unfinished_stream_list_head;
1068 
1069  while (cur_stream != NULL) {
1070  assert(cur_stream->unfinished_event_node && " Can't point unfinished stream to null");
1071  next_stream = cur_stream->next_unfinished_stream;
1072 
1073  event_list_node_t *current_event = cur_stream->unfinished_event_node;
1074  while (current_event) {
1075 
1076  cudaError_t err_cuda = cudaRuntimeFunctionPointer[cudaEventQueryEnum].cudaEventQueryReal(current_event->event_end);
1077 
1078  if (err_cuda == cudaSuccess) {
1079 
1080  // Decrement ipc_data->outstanding_kernels
1081  DECR_SHARED_BLAMING_DS(outstanding_kernels);
1082 
1083  // record start time
1084  float elapsedTime; // in millisec with 0.5 microsec resolution as per CUDA
1085 
1086  //FIX ME: deleting Elapsed time to handle context destruction....
1087  //static uint64_t deleteMeTime = 0;
1088  TMSG(CUDA, "BEFORE: EventElapsedRT(%p, %p)\n", g_start_of_world_event, current_event->event_start);
1089  cudaError_t err1 = Cuda_RTcall(cudaEventElapsedTime)(&elapsedTime,
1090  g_start_of_world_event,
1091  current_event->event_start);
1092  // soft failure
1093  if (err1 != cudaSuccess) {
1094  EMSG("cudaEventElapsedTime failed");
1095  break;
1096  }
1097 
1098  assert(elapsedTime > 0);
1099 
1100  uint64_t micro_time_start = (uint64_t) (((double) elapsedTime) * 1000) + g_start_of_world_time;
1101 
1102  CUDA_SAFE_CALL(cudaRuntimeFunctionPointer[cudaEventElapsedTimeEnum].cudaEventElapsedTimeReal(&elapsedTime, g_start_of_world_event, current_event->event_end));
1103 
1104  assert(elapsedTime > 0);
1105  uint64_t micro_time_end = (uint64_t) (((double) elapsedTime) * 1000) + g_start_of_world_time;
1106 
1107  assert(micro_time_start <= micro_time_end);
1108 
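// Conversion note (illustrative): cudaEventElapsedTime() reports milliseconds relative
// to g_start_of_world_event, so e.g. elapsedTime == 2.5 ms maps to
// micro_time = g_start_of_world_time + 2500 us on the CPU clock used elsewhere.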
1109  if(hpcrun_trace_isactive()) {
1110  hpcrun_trace_append_with_time(cur_stream->st, cur_stream->idle_node_id, HPCRUN_FMT_MetricId_NULL /* null metric id */, micro_time_start - 1);
1111 
1112  cct_node_t *stream_cct = current_event->stream_launcher_cct;
1113 
1114  hpcrun_cct_persistent_id_trace_mutate(stream_cct);
1115 
1116  hpcrun_trace_append_with_time(cur_stream->st, hpcrun_cct_persistent_id(stream_cct), HPCRUN_FMT_MetricId_NULL /* null metric id */, micro_time_start);
1117 
1118  hpcrun_trace_append_with_time(cur_stream->st, hpcrun_cct_persistent_id(stream_cct), HPCRUN_FMT_MetricId_NULL /* null metric id */, micro_time_end);
1119 
1120  hpcrun_trace_append_with_time(cur_stream->st, cur_stream->idle_node_id, HPCRUN_FMT_MetricId_NULL /* null metric id */, micro_time_end + 1);
1121  }
1122 
1123 
1124  // Add the kernel execution time to the gpu_time_metric_id
1125  cct_metric_data_increment(gpu_time_metric_id, current_event->launcher_cct, (cct_metric_data_t) {
1126  .i = (micro_time_end - micro_time_start)});
1127 
1128  event_list_node_t *deferred_node = current_event;
1129  current_event = current_event->next;
1130 
1131 
1132 
1133  // Add the node to the free list, or defer it for blame attribution if threads are waiting at a sync
1134  if (g_num_threads_at_sync) {
1135  // some threads are waiting, hence add this kernel for deferred blaming
1136  deferred_node->ref_count = g_num_threads_at_sync;
1137  deferred_node->event_start_time = micro_time_start;
1138  deferred_node->event_end_time = micro_time_end;
1139  deferred_node->next = g_finished_event_nodes_tail->next;
1140  g_finished_event_nodes_tail->next = deferred_node;
1141  g_finished_event_nodes_tail = deferred_node;
1142 
1143  } else {
1144  // It is better not to call destroy from here since we might be in the signal handler
1145  // Events will be destroyed lazily when they need to be reused.
1146  ADD_TO_FREE_EVENTS_LIST(deferred_node);
1147  }
1148 
1149  } else {
1150  break;
1151  }
1152  }
1153 
1154  cur_stream->unfinished_event_node = current_event;
1155  if (current_event == NULL) {
1156  // set oldest and newest pointers to null
1157  cur_stream->latest_event_node = NULL;
1158  if (prev_stream == NULL) {
1159  g_unfinished_stream_list_head = next_stream;
1160  } else {
1161  prev_stream->next_unfinished_stream = next_stream;
1162  }
1163  } else {
1164 
1165  num_unfinished_streams++;
1166  prev_stream = cur_stream;
1167  }
1168  cur_stream = next_stream;
1169  }
1170  return num_unfinished_streams;
1171 }
1172 
1173 
1174 // Insert a new activity in a stream
1175 // Caller is responsible for calling monitor_disable_new_threads()
1176 static event_list_node_t *create_and_insert_event(int stream_id, cct_node_t * launcher_cct, cct_node_t * stream_launcher_cct) {
1177 
1178  event_list_node_t *event_node;
1179  if (g_free_event_nodes_head) {
1180  // get from free list
1181  event_node = g_free_event_nodes_head;
1182  g_free_event_nodes_head = g_free_event_nodes_head->next_free_node;
1183 
1184  // Free the old events if they are alive
1185  if (event_node->event_start)
1186  CUDA_SAFE_CALL(cudaRuntimeFunctionPointer[cudaEventDestroyEnum].cudaEventDestroyReal((event_node->event_start)));
1187  if (event_node->event_end)
1188  CUDA_SAFE_CALL(cudaRuntimeFunctionPointer[cudaEventDestroyEnum].cudaEventDestroyReal((event_node->event_end)));
1189 
1190  } else {
1191  // allocate new node
1192  event_node = (event_list_node_t *) hpcrun_malloc(sizeof(event_list_node_t));
1193  }
1194  //cudaError_t err = cudaEventCreateWithFlags(&(event_node->event_end),cudaEventDisableTiming);
1195 
1196  CUDA_SAFE_CALL(cudaRuntimeFunctionPointer[cudaEventCreateEnum].cudaEventCreateReal(&(event_node->event_start)));
1197  CUDA_SAFE_CALL(cudaRuntimeFunctionPointer[cudaEventCreateEnum].cudaEventCreateReal(&(event_node->event_end)));
1198 
1199  event_node->stream_launcher_cct = stream_launcher_cct;
1200  event_node->launcher_cct = launcher_cct;
1201  event_node->next = NULL;
1202  event_node->stream_id = stream_id;
1203  if (g_stream_array[stream_id].latest_event_node == NULL) {
1204  g_stream_array[stream_id].latest_event_node = event_node;
1205  g_stream_array[stream_id].unfinished_event_node = event_node;
1206  g_stream_array[stream_id].next_unfinished_stream = g_unfinished_stream_list_head;
1207  g_unfinished_stream_list_head = &(g_stream_array[stream_id]);
1208  } else {
1209  g_stream_array[stream_id].latest_event_node->next = event_node;
1210  g_stream_array[stream_id].latest_event_node = event_node;
1211  }
1212 
1213  return event_node;
1214 }
1215 
1216 static void close_all_streams(stream_to_id_map_t *root) {
1217 
1218  if (!root)
1219  return;
1220 
1221  close_all_streams(root->left);
1222  close_all_streams(root->right);
1223  uint32_t streamId;
1224  streamId = root->id;
1225 
1226  hpcrun_stream_finalize(g_stream_array[streamId].st);
1227 
1228  // remove from hpcrun process auxiliary cleanup list
1229  hpcrun_process_aux_cleanup_remove(g_stream_array[streamId].aux_cleanup_info);
1230 
1231  g_stream_array[streamId].st = NULL;
1232 }
1233 
1234 
1235 // Stream #0 is never explicitly created. Hence create it if needed.
1236 // An alternate option is to create it eagerly whether needed or not.
1237 
1238 static void create_stream0_if_needed(cudaStream_t stream) {
1239  HPCRUN_ASYNC_BLOCK_SPIN_LOCK;
1240  if ( (((uint64_t)stream) == 0 )&& (g_stream0_initialized == false)) {
1241  uint32_t new_streamId;
1242  new_streamId = splay_insert(0)->id;
1243  if (g_start_of_world_time == 0) {
1244 
1245 
1246 
1247  // And disable tracking new threads from CUDA
1248  monitor_disable_new_threads();
1249 
1250  // Initialize and Record an event to indicate the start of this stream.
1251  // No need to wait for it since we query only the events posted after this and this will be complete when the later-posted ones are complete.
1252 
1253  CUDA_SAFE_CALL(cudaRuntimeFunctionPointer[cudaEventCreateEnum].cudaEventCreateReal(&g_start_of_world_event));
1254 
1255  // record time
1256 
1257  struct timeval tv;
1258  gettimeofday(&tv, NULL);
1259  g_start_of_world_time = ((uint64_t) tv.tv_usec + (((uint64_t) tv.tv_sec) * 1000000));
1260 
1261  // record in stream 0
1262  CUDA_SAFE_CALL(cudaRuntimeFunctionPointer[cudaEventRecordEnum].cudaEventRecordReal(g_start_of_world_event, 0));
1263 
1264  // enable monitoring new threads
1265  monitor_enable_new_threads();
1266 
1267  // This is a good time to create the shared memory
1268  // FIX ME: DEVICE_ID should be derived
1269  if(g_do_shared_blaming && ipc_data == NULL)
1270  create_shared_memory();
1271 
1272 
1273  }
1274 
1275  struct timeval tv;
1276  gettimeofday(&tv, NULL);
1277  g_stream_array[new_streamId].st = hpcrun_stream_data_alloc_init(new_streamId);
1278 
1279  if(hpcrun_trace_isactive()) {
1280  hpcrun_trace_open(g_stream_array[new_streamId].st);
1281 
1282  /*FIXME: convert below 4 lines to a macro */
1283  cct_bundle_t *bundle = &(g_stream_array[new_streamId].st->epoch->csdata);
1284  cct_node_t *idl = hpcrun_cct_bundle_get_idle_node(bundle);
1285  hpcrun_cct_persistent_id_trace_mutate(idl);
1286  // store the persistent id one time
1287  g_stream_array[new_streamId].idle_node_id = hpcrun_cct_persistent_id(idl);
1288 
1289  hpcrun_trace_append(g_stream_array[new_streamId].st, g_stream_array[new_streamId].idle_node_id, HPCRUN_FMT_MetricId_NULL /* null metric id */);
1290 
1291  }
1292 
1293  g_stream_array[new_streamId].aux_cleanup_info = hpcrun_process_aux_cleanup_add(hpcrun_stream_finalize, g_stream_array[new_streamId].st);
1294  g_stream0_initialized = true;
1295  }
1296  HPCRUN_ASYNC_UNBLOCK_SPIN_UNLOCK;
1297 
1298 }
1299 
1300 
1302 // CUDA Runtime overrides
1304 
1305 
1306 CUDA_RUNTIME_SYNC_WRAPPER(cudaThreadSynchronize, (context, launcher_cct,
1307 syncStart, recorded_node), (context, launcher_cct, syncStart,
1308 recorded_node, ALL_STREAMS_MASK, syncEnd), void)
1309 
1310 CUDA_RUNTIME_SYNC_ON_STREAM_WRAPPER(cudaStreamSynchronize, (context,
1311 launcher_cct, syncStart, recorded_node), (context, launcher_cct,
1312 syncStart, recorded_node, streamId, syncEnd), cudaStream_t, stream)
1313 
1314 CUDA_RUNTIME_SYNC_WRAPPER(cudaEventSynchronize, (context, launcher_cct,
1315 syncStart, recorded_node), (context, launcher_cct, syncStart,
1316 recorded_node, ALL_STREAMS_MASK, syncEnd), cudaEvent_t, event)
1317 
1318 CUDA_RUNTIME_SYNC_ON_STREAM_WRAPPER(cudaStreamWaitEvent, (context,
1319 launcher_cct, syncStart, recorded_node), (context, launcher_cct,
1320 syncStart, recorded_node, streamId, syncEnd), cudaStream_t, stream,
1321 cudaEvent_t, event, unsigned int, flags)
1322 
1323 CUDA_RUNTIME_SYNC_WRAPPER(cudaDeviceSynchronize, (context, launcher_cct,
1324 syncStart, recorded_node), (context, launcher_cct, syncStart,
1325 recorded_node, ALL_STREAMS_MASK, syncEnd), void)
1326 
1327 CUDA_RUNTIME_SYNC_WRAPPER(cudaMallocArray, (context, launcher_cct,
1328 syncStart, recorded_node), (context, launcher_cct, syncStart,
1329 recorded_node, ALL_STREAMS_MASK, syncEnd), struct cudaArray **, array,
1330 const struct cudaChannelFormatDesc *, desc, size_t, width, size_t,
1331 height, unsigned int, flags)
1332 
1333 CUDA_RUNTIME_SYNC_WRAPPER(cudaFree, (context, launcher_cct, syncStart,
1334 recorded_node), (context, launcher_cct, syncStart, recorded_node,
1335 ALL_STREAMS_MASK, syncEnd), void *, devPtr)
1336 
1337 CUDA_RUNTIME_SYNC_WRAPPER(cudaFreeArray, (context, launcher_cct,
1338 syncStart, recorded_node), (context, launcher_cct, syncStart,
1339 recorded_node, ALL_STREAMS_MASK, syncEnd),struct cudaArray *, array)
1340 
1341 
1342 
1343 cudaError_t cudaConfigureCall(dim3 grid, dim3 block, size_t mem, cudaStream_t stream) {
1344 
1345  if (! hpcrun_is_safe_to_sync(__func__))
1346  return cudaRuntimeFunctionPointer[cudaConfigureCallEnum].cudaConfigureCallReal(grid, block, mem, stream);
1347  TD_GET(gpu_data.is_thread_at_cuda_sync) = true;
1348  monitor_disable_new_threads();
1349  cudaError_t ret = cudaRuntimeFunctionPointer[cudaConfigureCallEnum].cudaConfigureCallReal(grid, block, mem, stream);
1350  monitor_enable_new_threads();
1351  TD_GET(gpu_data.is_thread_at_cuda_sync) = false;
1352  TD_GET(gpu_data.active_stream) = (uint64_t) stream;
1353  return ret;
1354 }
1355 
1356 #if (CUDART_VERSION < 5000)
1357  cudaError_t cudaLaunch(const char *entry) {
1358 #else
1359  cudaError_t cudaLaunch(const void *entry) {
1360 #endif
1361 
1362  if (! hpcrun_is_safe_to_sync(__func__))
1363  return cudaRuntimeFunctionPointer[cudaLaunchEnum].cudaLaunchReal(entry);
1364  TMSG(CPU_GPU,"Cuda launch (get spinlock)");
1365  ASYNC_KERNEL_PROLOGUE(streamId, event_node, context, cct_node, ((cudaStream_t) (TD_GET(gpu_data.active_stream))), g_cuda_launch_skip_inner);
1366 
1367  cudaError_t ret = cudaRuntimeFunctionPointer[cudaLaunchEnum].cudaLaunchReal(entry);
1368 
1369  TMSG(CPU_GPU, "Cuda launch about to release spin lock");
1370  ASYNC_KERNEL_EPILOGUE(event_node, ((cudaStream_t) (TD_GET(gpu_data.active_stream))));
1371  TMSG(CPU_GPU, "Cuda launch done !(spin lock released)");
1372 
1373  return ret;
1374 }
1375 
1376 
1377 cudaError_t cudaStreamDestroy(cudaStream_t stream) {
1378 
1379  SYNCHRONOUS_CLEANUP;
1380 
1381  hpcrun_safe_enter();
1382 
1383  uint32_t streamId;
1384 
1385  streamId = splay_get_stream_id(stream);
1386 
1387  hpcrun_stream_finalize(g_stream_array[streamId].st);
1388 
1389  // remove from hpcrun process auxiliary cleanup list
1390  hpcrun_process_aux_cleanup_remove(g_stream_array[streamId].aux_cleanup_info);
1391 
1392  g_stream_array[streamId].st = NULL;
1393 
1394  monitor_disable_new_threads();
1395  cudaError_t ret = cudaRuntimeFunctionPointer[cudaStreamDestroyEnum].cudaStreamDestroyReal(stream);
1396  monitor_enable_new_threads();
1397 
1398  // Delete splay tree entry
1399  splay_delete(stream);
1400  hpcrun_safe_exit();
1401  return ret;
1402 
1403 }
1404 
1405 
1406 static void StreamCreateBookKeeper(cudaStream_t * stream){
1407  uint32_t new_streamId = splay_insert(*stream)->id;
1408  HPCRUN_ASYNC_BLOCK_SPIN_LOCK;
1409  if (g_start_of_world_time == 0) {
1410  // In case cudaLaunch causes a dlopen, async blocking may get enabled; as a safety net, set gpu_data.is_thread_at_cuda_sync so that we don't make any CUDA calls
1411  TD_GET(gpu_data.is_thread_at_cuda_sync) = true;
1412 
1413  // And disable tracking new threads from CUDA
1414  monitor_disable_new_threads();
1415 
1416  // Initialize and Record an event to indicate the start of this stream.
1417  // No need to wait for it since we query only the events posted after this and this will be complete when the later-posted ones are complete.
1418  CUDA_SAFE_CALL(cudaRuntimeFunctionPointer[cudaEventCreateEnum].cudaEventCreateReal(&g_start_of_world_event));
1419 
1420  // record time
1421 
1422  struct timeval tv;
1423  gettimeofday(&tv, NULL);
1424  g_start_of_world_time = ((uint64_t) tv.tv_usec + (((uint64_t) tv.tv_sec) * 1000000));
1425 
1426  // record in stream 0
1427  CUDA_SAFE_CALL(cudaRuntimeFunctionPointer[cudaEventRecordEnum].cudaEventRecordReal(g_start_of_world_event, 0));
1428 
1429  // enable monitoring new threads
1430  monitor_enable_new_threads();
1431 
1432  // This is a good time to create the shared memory
1433  // FIX ME: DEVICE_ID should be derived
1434  if(g_do_shared_blaming && ipc_data == NULL)
1435  create_shared_memory();
1436 
1437  // Ok to call cuda functions from the signal handler
1438  TD_GET(gpu_data.is_thread_at_cuda_sync) = false;
1439 
1440  }
1441 
1442  g_stream_array[new_streamId].st = hpcrun_stream_data_alloc_init(new_streamId);
1443  if(hpcrun_trace_isactive()) {
1444  hpcrun_trace_open(g_stream_array[new_streamId].st);
1445 
1446  /*FIXME: convert below 4 lines to a macro */
1447  cct_bundle_t *bundle = &(g_stream_array[new_streamId].st->epoch->csdata);
1448  cct_node_t *idl = hpcrun_cct_bundle_get_idle_node(bundle);
1449  hpcrun_cct_persistent_id_trace_mutate(idl);
1450  // store the persistent id one time.
1451  g_stream_array[new_streamId].idle_node_id = hpcrun_cct_persistent_id(idl);
1452  hpcrun_trace_append(g_stream_array[new_streamId].st, g_stream_array[new_streamId].idle_node_id, HPCRUN_FMT_MetricId_NULL /* null metric id */);
1453 
1454  }
1455 
1456  g_stream_array[new_streamId].aux_cleanup_info = hpcrun_process_aux_cleanup_add(hpcrun_stream_finalize, g_stream_array[new_streamId].st);
1457  HPCRUN_ASYNC_UNBLOCK_SPIN_UNLOCK;
1458 
1459 }
1460 
1461 cudaError_t cudaStreamCreate(cudaStream_t * stream) {
1462 
1463  TD_GET(gpu_data.is_thread_at_cuda_sync) = true;
1464  monitor_disable_new_threads();
1465  cudaError_t ret = cudaRuntimeFunctionPointer[cudaStreamCreateEnum].cudaStreamCreateReal(stream);
1466  monitor_enable_new_threads();
1467  TD_GET(gpu_data.is_thread_at_cuda_sync) = false;
1468 
1469  StreamCreateBookKeeper(stream);
1470  return ret;
1471 }
1472 
1473 inline static void increment_mem_xfer_metric(size_t count, enum cudaMemcpyKind kind, cct_node_t *node){
1474  switch(kind){
1475  case cudaMemcpyHostToHost:
1476  cct_metric_data_increment(h_to_h_data_xfer_metric_id, node, (cct_metric_data_t) {.i = (count)});
1477  break;
1478 
1479  case cudaMemcpyHostToDevice:
1480  cct_metric_data_increment(h_to_d_data_xfer_metric_id, node, (cct_metric_data_t) {.i = (count)});
1481  break;
1482 
1483 
1484  case cudaMemcpyDeviceToHost:
1485  cct_metric_data_increment(d_to_h_data_xfer_metric_id, node, (cct_metric_data_t) {.i = (count)});
1486  break;
1487 
1488  case cudaMemcpyDeviceToDevice:
1489  cct_metric_data_increment(d_to_d_data_xfer_metric_id, node, (cct_metric_data_t) {.i = (count)});
1490  break;
1491 
1492  case cudaMemcpyDefault:
1493  cct_metric_data_increment(uva_data_xfer_metric_id, node, (cct_metric_data_t) {.i = (count)});
1494  break;
1495 
1496  default: break;
1497 
1498  }
1499 }
1500 
1501 
1502 
1503 CUDA_RUNTIME_SYNC_WRAPPER(cudaMalloc, (context, launcher_cct, syncStart,
1504 recorded_node), (context, launcher_cct, syncStart, recorded_node,
1505 ALL_STREAMS_MASK, syncEnd), void **, devPtr, size_t, size)
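
// Annotation (not part of the original source): a rough sketch of what the
// CUDA_RUNTIME_SYNC_WRAPPER invocation above is assumed to generate for
// cudaMalloc, inferred from the hand-written synchronous wrappers later in this
// file (e.g. cuMemcpyHtoD). The macro definition is not shown here, and the
// cudaMallocEnum / cudaMallocReal names merely follow the naming pattern used
// elsewhere in this file, so the exact expansion may differ:
//
//   cudaError_t cudaMalloc(void **devPtr, size_t size) {
//     SYNC_PROLOGUE(context, launcher_cct, syncStart, recorded_node);
//     monitor_disable_new_threads();
//     cudaError_t ret =
//         cudaRuntimeFunctionPointer[cudaMallocEnum].cudaMallocReal(devPtr, size);
//     monitor_enable_new_threads();
//     SYNC_EPILOGUE(context, launcher_cct, syncStart, recorded_node,
//                   ALL_STREAMS_MASK, syncEnd);
//     return ret;
//   }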
1506 
1507 CUDA_RUNTIME_SYNC_WRAPPER(cudaMalloc3D, (context, launcher_cct, syncStart,
1508 recorded_node), (context, launcher_cct, syncStart, recorded_node,
1509 ALL_STREAMS_MASK, syncEnd), struct cudaPitchedPtr*, pitchedDevPtr,
1510 struct cudaExtent, extent)
1511 
1512 CUDA_RUNTIME_SYNC_WRAPPER(cudaMalloc3DArray, (context, launcher_cct,
1513 syncStart, recorded_node), (context, launcher_cct, syncStart,
1514 recorded_node, ALL_STREAMS_MASK, syncEnd), struct cudaArray**, array,
1515 const struct cudaChannelFormatDesc*, desc, struct cudaExtent, extent,
1516 unsigned int, flags)
1517 
1518 CUDA_RUNTIME_SYNC_MEMCPY_WRAPPER(cudaMemcpy3D, (context, launcher_cct,
1519 syncStart, recorded_node), (context, launcher_cct, syncStart,
1520 recorded_node, ALL_STREAMS_MASK, syncEnd, (p->extent.width *
1521 p->extent.height * p->extent.depth), (p->kind)), const struct
1522 cudaMemcpy3DParms *, p)
1523 
1524 
1525 CUDA_RUNTIME_SYNC_MEMCPY_WRAPPER(cudaMemcpy3DPeer, (context,
1526 launcher_cct, syncStart, recorded_node), (context, launcher_cct,
1527 syncStart, recorded_node, ALL_STREAMS_MASK, syncEnd, (p->extent.width *
1528 p->extent.height * p->extent.depth), cudaMemcpyDeviceToDevice), const
1529 struct cudaMemcpy3DPeerParms *, p)
1530 
1531 CUDA_RUNTIME_ASYNC_MEMCPY_WRAPPER(cudaMemcpy3DAsync, (streamId,
1532 event_node, context, cct_node, stream, 0), (event_node, cct_node, stream,
1533 (p->extent.width * p->extent.height * p->extent.depth), (p->kind)),
1534 const struct cudaMemcpy3DParms *, p, cudaStream_t, stream)
1535 
1536 CUDA_RUNTIME_ASYNC_MEMCPY_WRAPPER(cudaMemcpy3DPeerAsync, (streamId,
1537 event_node, context, cct_node, stream, 0), (event_node, cct_node,
1538 stream, (p->extent.width * p->extent.height * p->extent.depth),
1539 cudaMemcpyDeviceToDevice), const struct cudaMemcpy3DPeerParms *, p,
1540 cudaStream_t, stream)
1541 
1542 CUDA_RUNTIME_SYNC_MEMCPY_WRAPPER(cudaMemcpyPeer, (context,
1543 launcher_cct, syncStart, recorded_node), (context, launcher_cct,
1544 syncStart, recorded_node, ALL_STREAMS_MASK, syncEnd, count,
1545 cudaMemcpyDeviceToDevice), void *, dst, int, dstDevice, const void *,
1546 src, int, srcDevice, size_t, count)
1547 
1548 CUDA_RUNTIME_SYNC_MEMCPY_WRAPPER(cudaMemcpyFromArray, (context,
1549 launcher_cct, syncStart, recorded_node), (context, launcher_cct,
1550 syncStart, recorded_node, ALL_STREAMS_MASK, syncEnd, count, kind), void
1551 *, dst, const struct cudaArray *, src, size_t, wOffset, size_t, hOffset,
1552 size_t, count, enum cudaMemcpyKind, kind)
1553 
1554 CUDA_RUNTIME_SYNC_MEMCPY_WRAPPER(cudaMemcpyArrayToArray, (context,
1555 launcher_cct, syncStart, recorded_node), (context, launcher_cct,
1556 syncStart, recorded_node, ALL_STREAMS_MASK, syncEnd, count, kind),
1557 struct cudaArray *, dst, size_t, wOffsetDst, size_t, hOffsetDst, const
1558 struct cudaArray *, src, size_t, wOffsetSrc, size_t, hOffsetSrc, size_t,
1559 count, enum cudaMemcpyKind, kind)
1560 
1561 CUDA_RUNTIME_SYNC_MEMCPY_WRAPPER(cudaMemcpy2DToArray, (context,
1562 launcher_cct, syncStart, recorded_node), (context, launcher_cct,
1563 syncStart, recorded_node, ALL_STREAMS_MASK, syncEnd, (width * height)
1564 , kind), struct cudaArray *, dst, size_t, wOffset, size_t, hOffset,
1565 const void *, src, size_t, spitch, size_t, width, size_t, height, enum
1566 cudaMemcpyKind, kind)
1567 
1568 CUDA_RUNTIME_SYNC_MEMCPY_WRAPPER(cudaMemcpy2DFromArray, (context,
1569 launcher_cct, syncStart, recorded_node), (context, launcher_cct,
1570 syncStart, recorded_node, ALL_STREAMS_MASK, syncEnd, (width * height),
1571 kind), void *, dst, size_t, dpitch, const struct cudaArray *, src,
1572 size_t, wOffset, size_t, hOffset, size_t, width, size_t, height, enum
1573 cudaMemcpyKind, kind)
1574 
1575 CUDA_RUNTIME_SYNC_MEMCPY_WRAPPER(cudaMemcpy2DArrayToArray, (context,
1576 launcher_cct, syncStart, recorded_node), (context, launcher_cct,
1577 syncStart, recorded_node, ALL_STREAMS_MASK, syncEnd, (width * height),
1578 kind), struct cudaArray *, dst, size_t, wOffsetDst, size_t, hOffsetDst,
1579 const struct cudaArray *, src, size_t, wOffsetSrc, size_t, hOffsetSrc,
1580 size_t, width, size_t, height, enum cudaMemcpyKind, kind )
1581 
1582 
1583 #if (CUDART_VERSION < 5000)
1584 
1585 CUDA_RUNTIME_SYNC_MEMCPY_WRAPPER(cudaMemcpyToSymbol, (context,
1586 launcher_cct, syncStart, recorded_node), (context, launcher_cct,
1587 syncStart, recorded_node, ALL_STREAMS_MASK, syncEnd, count,
1588 kind), const char *, symbol, const void *, src, size_t,
1589 count, size_t, offset , enum cudaMemcpyKind, kind )
1590 
1591 #else
1592 
1593 CUDA_RUNTIME_SYNC_MEMCPY_WRAPPER(cudaMemcpyToSymbol, (context,
1594 launcher_cct, syncStart, recorded_node), (context, launcher_cct,
1595 syncStart, recorded_node, ALL_STREAMS_MASK, syncEnd, count, kind),
1596 const void *, symbol, const void *, src, size_t, count, size_t, offset ,
1597 enum cudaMemcpyKind, kind )
1598 
1599 #endif
1600 
1601 
1602 #if (CUDART_VERSION < 5000)
1603 
1604 CUDA_RUNTIME_SYNC_MEMCPY_WRAPPER(cudaMemcpyFromSymbol,
1605 (context, launcher_cct, syncStart, recorded_node), (context,
1606 launcher_cct, syncStart, recorded_node, ALL_STREAMS_MASK,
1607 syncEnd, count, kind), void *, dst, const char *, symbol,
1608 size_t, count, size_t, offset , enum cudaMemcpyKind, kind)
1609 
1610 #else
1611 
1612 CUDA_RUNTIME_SYNC_MEMCPY_WRAPPER(cudaMemcpyFromSymbol, (context,
1613 launcher_cct, syncStart, recorded_node), (context, launcher_cct,
1614 syncStart, recorded_node, ALL_STREAMS_MASK, syncEnd, count, kind),
1615 void *, dst, const void *, symbol, size_t, count, size_t, offset ,
1616 enum cudaMemcpyKind, kind)
1617 
1618 #endif
1619 
1620 
1621 CUDA_RUNTIME_ASYNC_MEMCPY_WRAPPER(cudaMemcpyPeerAsync, (streamId,
1622 event_node, context, cct_node, stream, 0), (event_node, cct_node,
1623 stream, count, cudaMemcpyDeviceToDevice), void *, dst, int, dstDevice,
1624 const void *, src, int, srcDevice, size_t, count, cudaStream_t, stream)
1625 
1626 CUDA_RUNTIME_ASYNC_MEMCPY_WRAPPER(cudaMemcpyFromArrayAsync, (streamId,
1627 event_node, context, cct_node, stream, 0), (event_node, cct_node,
1628 stream, count, kind), void *, dst, const struct cudaArray *, src,
1629 size_t, wOffset, size_t, hOffset, size_t, count, enum cudaMemcpyKind,
1630 kind, cudaStream_t, stream)
1631 
1632 CUDA_RUNTIME_ASYNC_MEMCPY_WRAPPER(cudaMemcpy2DAsync, (streamId,
1633 event_node, context, cct_node, stream, 0), (event_node, cct_node, stream,
1634 (width * height), kind), void *, dst, size_t, dpitch, const void *,
1635 src, size_t, spitch, size_t, width, size_t, height, enum cudaMemcpyKind,
1636 kind, cudaStream_t, stream)
1637 
1638 CUDA_RUNTIME_ASYNC_MEMCPY_WRAPPER(cudaMemcpy2DToArrayAsync, (streamId,
1639 event_node, context, cct_node, stream, 0), (event_node, cct_node, stream,
1640 (width * height), kind), struct cudaArray *, dst, size_t, wOffset,
1641 size_t, hOffset, const void *, src, size_t, spitch, size_t, width,
1642 size_t, height, enum cudaMemcpyKind, kind, cudaStream_t, stream)
1643 
1644 CUDA_RUNTIME_ASYNC_MEMCPY_WRAPPER(cudaMemcpy2DFromArrayAsync, (streamId,
1645 event_node, context, cct_node, stream, 0), (event_node, cct_node, stream,
1646 (width * height), kind), void *, dst, size_t, dpitch, const struct
1647 cudaArray *, src, size_t, wOffset, size_t, hOffset, size_t, width,
1648 size_t, height, enum cudaMemcpyKind, kind, cudaStream_t, stream)
1649 
1650 
1651 #if (CUDART_VERSION < 5000)
1652 
1653 CUDA_RUNTIME_ASYNC_MEMCPY_WRAPPER(cudaMemcpyToSymbolAsync, (streamId,
1654 event_node, context, cct_node, stream, 0), (event_node, cct_node, stream,
1655 count, kind), const char *, symbol, const void *, src, size_t, count,
1656 size_t, offset, enum cudaMemcpyKind, kind, cudaStream_t, stream)
1657 
1658 #else
1659 
1660 CUDA_RUNTIME_ASYNC_MEMCPY_WRAPPER(cudaMemcpyToSymbolAsync, (streamId,
1661 event_node, context, cct_node, stream, 0), (event_node, cct_node, stream,
1662 count, kind), const void *, symbol, const void *, src, size_t, count,
1663 size_t, offset, enum cudaMemcpyKind, kind, cudaStream_t, stream)
1664 
1665 #endif
1666 
1667 
1668 #if (CUDART_VERSION < 5000)
1669 
1670 CUDA_RUNTIME_ASYNC_MEMCPY_WRAPPER(cudaMemcpyFromSymbolAsync, (streamId,
1671 event_node, context, cct_node, stream, 0), (event_node, cct_node, stream,
1672 count, kind), void *, dst, const char *, symbol, size_t, count, size_t,
1673 offset, enum cudaMemcpyKind, kind, cudaStream_t, stream)
1674 
1675 #else
1676 
1677 CUDA_RUNTIME_ASYNC_MEMCPY_WRAPPER(cudaMemcpyFromSymbolAsync, (streamId,
1678 event_node, context, cct_node, stream, 0), (event_node, cct_node, stream,
1679 count, kind), void *, dst, const void *, symbol, size_t, count, size_t,
1680 offset, enum cudaMemcpyKind, kind, cudaStream_t, stream)
1681 
1682 #endif
1683 
1684 
1685 CUDA_RUNTIME_SYNC_MEMCPY_WRAPPER(cudaMemset, (context, launcher_cct,
1686 syncStart, recorded_node), (context, launcher_cct, syncStart,
1687 recorded_node, ALL_STREAMS_MASK, syncEnd, 0, cudaMemcpyHostToDevice),
1688 void *, devPtr, int, value, size_t, count)
1689 
1690 CUDA_RUNTIME_SYNC_MEMCPY_WRAPPER(cudaMemset2D, (context, launcher_cct,
1691 syncStart, recorded_node), (context, launcher_cct, syncStart,
1692 recorded_node, ALL_STREAMS_MASK, syncEnd, 0, cudaMemcpyHostToDevice),
1693 void *, devPtr, size_t, pitch, int, value, size_t, width, size_t, height)
1694 
1695 CUDA_RUNTIME_SYNC_MEMCPY_WRAPPER(cudaMemset3D, (context, launcher_cct,
1696 syncStart, recorded_node), (context, launcher_cct, syncStart,
1697 recorded_node, ALL_STREAMS_MASK, syncEnd, 0, cudaMemcpyHostToDevice),
1698 struct cudaPitchedPtr, pitchedDevPtr, int, value, struct cudaExtent,
1699 extent)
1700 
1701 CUDA_RUNTIME_ASYNC_MEMCPY_WRAPPER(cudaMemsetAsync, (streamId, event_node,
1702 context, cct_node, stream, 0), (event_node, cct_node, stream, 0,
1703 cudaMemcpyHostToDevice), void *, devPtr, int, value, size_t, count,
1704 cudaStream_t, stream)
1705 
1706 CUDA_RUNTIME_ASYNC_MEMCPY_WRAPPER(cudaMemset2DAsync, (streamId,
1707 event_node, context, cct_node, stream, 0), (event_node, cct_node, stream,
1708 0, cudaMemcpyHostToDevice), void *, devPtr, size_t, pitch, int, value,
1709 size_t, width, size_t, height, cudaStream_t, stream)
1710 
1711 CUDA_RUNTIME_ASYNC_MEMCPY_WRAPPER(cudaMemset3DAsync, (streamId,
1712 event_node, context, cct_node, stream, 0), (event_node, cct_node, stream,
1713 0, cudaMemcpyHostToDevice), struct cudaPitchedPtr, pitchedDevPtr, int,
1714 value, struct cudaExtent, extent, cudaStream_t, stream)
1715 
1716 CUDA_RUNTIME_ASYNC_MEMCPY_WRAPPER(cudaMemcpyAsync, (streamId, event_node,
1717 context, cct_node, stream, 0), (event_node, cct_node, stream, count,
1718 kind), void *, dst, const void *, src, size_t, count, enum cudaMemcpyKind,
1719 kind, cudaStream_t, stream)
1720 
1721 CUDA_RUNTIME_ASYNC_MEMCPY_WRAPPER(cudaMemcpyToArrayAsync, (streamId,
1722 event_node, context, cct_node, stream, 0), (event_node, cct_node,
1723 stream, count, kind), struct cudaArray *, dst, size_t, wOffset, size_t,
1724 hOffset, const void *, src, size_t, count, enum cudaMemcpyKind, kind,
1725 cudaStream_t, stream)
1726 
1727 CUDA_RUNTIME_SYNC_MEMCPY_WRAPPER(cudaMemcpy2D, (context, launcher_cct,
1728 syncStart, recorded_node), (context, launcher_cct, syncStart,
1729 recorded_node, ALL_STREAMS_MASK, syncEnd, (height * width), kind), void *,
1730 dst, size_t, dpitch, const void *, src, size_t, spitch, size_t, width,
1731 size_t, height, enum cudaMemcpyKind, kind)
1732 
1733 CUDA_RUNTIME_SYNC_MEMCPY_WRAPPER(cudaMemcpy, (context, launcher_cct,
1734 syncStart, recorded_node), (context, launcher_cct, syncStart,
1735 recorded_node, ALL_STREAMS_MASK, syncEnd, count, kind), void *, dst,
1736 const void *, src, size_t, count, enum cudaMemcpyKind, kind)
1737 
1738 CUDA_RUNTIME_SYNC_MEMCPY_WRAPPER(cudaMemcpyToArray, (context,
1739 launcher_cct, syncStart, recorded_node), (context, launcher_cct,
1740 syncStart, recorded_node, ALL_STREAMS_MASK, syncEnd, count, kind), struct
1741 cudaArray *, dst, size_t, wOffset, size_t, hOffset, const void *, src,
1742 size_t, count, enum cudaMemcpyKind, kind)
1743 
1744 
1745 // CUDA Driver overrides
1748 
1749 
1750 CUresult cuStreamSynchronize(CUstream stream) {
1751  SYNC_PROLOGUE(context, launcher_cct, syncStart, recorded_node);
1752 
1753  monitor_disable_new_threads();
1754  CUresult ret = cuDriverFunctionPointer[cuStreamSynchronizeEnum].cuStreamSynchronizeReal(stream);
1755  monitor_enable_new_threads();
1756 
1757  hpcrun_safe_enter();
1758  uint32_t streamId;
1759  streamId = splay_get_stream_id((cudaStream_t)stream);
1760  hpcrun_safe_exit();
1761 
1762  SYNC_EPILOGUE(context, launcher_cct, syncStart, recorded_node, streamId, syncEnd);
1763 
1764  return ret;
1765 }
1766 
1767 
1768 CUresult cuEventSynchronize(CUevent event) {
1769  SYNC_PROLOGUE(context, launcher_cct, syncStart, recorded_node);
1770 
1771  monitor_disable_new_threads();
1772  CUresult ret = cuDriverFunctionPointer[cuEventSynchronizeEnum].cuEventSynchronizeReal(event);
1773  monitor_enable_new_threads();
1774 
1775  SYNC_EPILOGUE(context, launcher_cct, syncStart, recorded_node, ALL_STREAMS_MASK, syncEnd);
1776 
1777  return ret;
1778 }
1779 
1780 
1781 CUresult cuLaunchGridAsync(CUfunction f, int grid_width, int grid_height, CUstream hStream) {
1782 
1783  ASYNC_KERNEL_PROLOGUE(streamId, event_node, context, cct_node, ((cudaStream_t)hStream), 0);
1784 
1785  CUresult ret = cuDriverFunctionPointer[cuLaunchGridAsyncEnum].cuLaunchGridAsyncReal(f, grid_width, grid_height, hStream);
1786 
1787  ASYNC_KERNEL_EPILOGUE(event_node, ((cudaStream_t)hStream));
1788 
1789  return ret;
1790 }
1791 
1792 CUresult cuLaunchKernel (CUfunction f,
1793  unsigned int gridDimX,
1794  unsigned int gridDimY,
1795  unsigned int gridDimZ,
1796  unsigned int blockDimX,
1797  unsigned int blockDimY,
1798  unsigned int blockDimZ,
1799  unsigned int sharedMemBytes,
1800  CUstream hStream,
1801  void **kernelParams,
1802  void **extra) {
1803  ASYNC_KERNEL_PROLOGUE(streamId, event_node, context, cct_node, ((cudaStream_t)hStream), 0);
1804 
1805  CUresult ret = cuDriverFunctionPointer[cuLaunchKernelEnum].cuLaunchKernelReal(f, gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY, blockDimZ, sharedMemBytes, hStream, kernelParams, extra);
1806 
1807  ASYNC_KERNEL_EPILOGUE(event_node, ((cudaStream_t)hStream));
1808 
1809  return ret;
1810 }
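
// Annotation (not part of the original source): the ASYNC_KERNEL_PROLOGUE /
// ASYNC_KERNEL_EPILOGUE pair used by cuLaunchGridAsync and cuLaunchKernel above
// is assumed to bracket the real launch with a CUDA event pair, roughly as
// follows: the prologue looks up the stream in the splay tree, obtains an
// event_list_node_t (event_start/event_end, possibly recycled from
// g_free_event_nodes_head), attaches the launching CCT node, and records
// event_start on hStream; the epilogue records event_end on hStream and
// enqueues the node on the stream's unfinished-event list so that
// cleanup_finished_events() can later attribute the elapsed GPU time.
// The actual macro bodies live elsewhere and may differ from this sketch.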
1811 
1812 
1813 CUresult cuStreamDestroy(CUstream stream) {
1814 
1815  SYNCHRONOUS_CLEANUP;
1816  hpcrun_safe_enter();
1817 
1818  uint32_t streamId;
1819  streamId = splay_get_stream_id((cudaStream_t)stream);
1820 
1821 
1822  hpcrun_stream_finalize(g_stream_array[streamId].st);
1823 
1824  // remove from hpcrun process auxiliary cleanup list
1825  hpcrun_process_aux_cleanup_remove(g_stream_array[streamId].aux_cleanup_info);
1826 
1827  g_stream_array[streamId].st = NULL;
1828 
1829  monitor_disable_new_threads();
1830  CUresult ret = cuDriverFunctionPointer[cuStreamDestroy_v2Enum].cuStreamDestroy_v2Real(stream);
1831  monitor_enable_new_threads();
1832 
1833  // Delete splay tree entry
1834  splay_delete((cudaStream_t)stream);
1835  hpcrun_safe_exit();
1836  return ret;
1837 
1838 }
1839 
1840 
1841 CUresult cuStreamCreate(CUstream * phStream, unsigned int Flags) {
1842 
1843  TD_GET(gpu_data.is_thread_at_cuda_sync) = true;
1844  monitor_disable_new_threads();
1845  CUresult ret = cuDriverFunctionPointer[cuStreamCreateEnum].cuStreamCreateReal(phStream, Flags);
1846  monitor_enable_new_threads();
1847  TD_GET(gpu_data.is_thread_at_cuda_sync) = false;
1848 
1849  StreamCreateBookKeeper((cudaStream_t*) phStream);
1850 
1851  return ret;
1852 
1853 }
1854 
1855 
1856 static void destroy_all_events_in_free_event_list(){
1857 
1858  event_list_node_t * cur = g_free_event_nodes_head;
1859 
1860  monitor_disable_new_threads();
1861  while(cur){
1862  CUDA_SAFE_CALL(cudaRuntimeFunctionPointer[cudaEventDestroyEnum].cudaEventDestroyReal(cur->event_start));
1863  CUDA_SAFE_CALL(cudaRuntimeFunctionPointer[cudaEventDestroyEnum].cudaEventDestroyReal(cur->event_end));
1864  cur->event_start = 0;
1865  cur->event_end = 0;
1866  cur = cur->next_free_node;
1867  }
1868  monitor_enable_new_threads();
1869 
1870 }
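
// Annotation (not part of the original source):
// destroy_all_events_in_free_event_list() above walks a recycling list whose
// nodes carry an (event_start, event_end) CUDA event pair. Below is a minimal
// sketch of how such a free list is typically consumed, assuming only the
// fields referenced above (event_start, event_end, next_free_node); the helper
// name is hypothetical and the real allocation path lives elsewhere in this tool.
static inline event_list_node_t *sketch_acquire_event_node(void)
{
  event_list_node_t *node = g_free_event_nodes_head;
  if (node) {
    // Reuse a previously released node (and its CUDA events, if still valid).
    g_free_event_nodes_head = node->next_free_node;
  } else {
    // Otherwise allocate a fresh node from hpcrun's internal allocator.
    node = (event_list_node_t *) hpcrun_malloc(sizeof(event_list_node_t));
    node->event_start = 0;
    node->event_end = 0;
  }
  return node;
}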
1871 
1872 CUresult
1873 cuCtxCreate_v2 (CUcontext *pctx, unsigned int flags, CUdevice dev)
1874 {
1875  if (cuda_ncontexts_incr() > 1) {
1876  fprintf(stderr, "Too many contexts created\n");
1877  monitor_real_abort();
1878  }
1879  if (! hpcrun_is_safe_to_sync(__func__)) { return cuDriverFunctionPointer[cuCtxCreate_v2Enum].cuCtxCreate_v2Real(pctx, flags, dev);
1880  }
1881  TD_GET(gpu_data.is_thread_at_cuda_sync) = true;
1882  monitor_disable_new_threads();
1883  CUresult ret = cuDriverFunctionPointer[cuCtxCreate_v2Enum].cuCtxCreate_v2Real(pctx, flags, dev);
1884  monitor_enable_new_threads();
1885  TD_GET(gpu_data.is_thread_at_cuda_sync) = false;
1886  return ret;
1887 }
1888 
1889 CUresult cuCtxDestroy(CUcontext ctx) {
1890 
1891  SYNCHRONOUS_CLEANUP;
1892 
1893  HPCRUN_ASYNC_BLOCK_SPIN_LOCK;
1894  if (g_start_of_world_time != 0) {
1895 
1896  // In case cudaLaunch causes a dlopen, the async block may get enabled; as a safety net, set gpu_data.is_thread_at_cuda_sync so that we don't make any CUDA calls
1897  TD_GET(gpu_data.is_thread_at_cuda_sync) = true;
1898 
1899  // Walk the stream splay tree and close each trace.
1900  close_all_streams(stream_to_id_tree_root);
1901  stream_to_id_tree_root = NULL;
1902 
1903  // And disable tracking new threads from CUDA
1904  monitor_disable_new_threads();
1905 
1906  CUDA_SAFE_CALL(cudaRuntimeFunctionPointer[cudaEventDestroyEnum].cudaEventDestroyReal(g_start_of_world_event));
1907  g_start_of_world_time = 0;
1908  // enable monitoring new threads
1909  monitor_enable_new_threads();
1910 
1911 
1912  // Destroy all events in g_free_event_nodes_head
1913  destroy_all_events_in_free_event_list();
1914 
1915 
1916  // Ok to call cuda functions from the signal handler
1917  TD_GET(gpu_data.is_thread_at_cuda_sync) = false;
1918 
1919  }
1920  // counted at context creation ==> decrement here
1921  cuda_ncontexts_decr();
1922  EMSG("Destroying Context!");
1923  HPCRUN_ASYNC_UNBLOCK_SPIN_UNLOCK;
1924 
1925  monitor_disable_new_threads();
1926  CUresult ret = cuDriverFunctionPointer[cuCtxDestroy_v2Enum].cuCtxDestroy_v2Real(ctx);
1927  monitor_enable_new_threads();
1928 
1929  return ret;
1930 }
1931 
1932 
1933 
1934 
1935 CUresult cuMemcpyHtoDAsync(CUdeviceptr dstDevice, const void *srcHost, size_t ByteCount, CUstream hStream) {
1936 
1937  ASYNC_MEMCPY_PROLOGUE(streamId, event_node, context, cct_node, ((cudaStream_t)hStream), 0);
1938 
1939  CUresult ret = cuDriverFunctionPointer[cuMemcpyHtoDAsync_v2Enum].cuMemcpyHtoDAsync_v2Real(dstDevice, srcHost, ByteCount, hStream);
1940 
1941  ASYNC_MEMCPY_EPILOGUE(event_node, cct_node, ((cudaStream_t)hStream), ByteCount, cudaMemcpyHostToDevice);
1942 
1943  return ret;
1944 
1945 }
1946 
1947 
1948 CUresult cuMemcpyHtoD(CUdeviceptr dstDevice, const void *srcHost, size_t ByteCount) {
1949 
1950  SYNC_MEMCPY_PROLOGUE(context, launcher_cct, syncStart, recorded_node);
1951 
1952  monitor_disable_new_threads();
1953  CUresult ret = cuDriverFunctionPointer[cuMemcpyHtoD_v2Enum].cuMemcpyHtoD_v2Real(dstDevice, srcHost, ByteCount);
1954  monitor_enable_new_threads();
1955 
1956  SYNC_MEMCPY_EPILOGUE(context, launcher_cct, syncStart, recorded_node, ALL_STREAMS_MASK, syncEnd, ByteCount, cudaMemcpyHostToDevice);
1957  return ret;
1958 
1959 }
1960 
1961 
1962 CUresult cuMemcpyDtoHAsync(void *dstHost, CUdeviceptr srcDevice, size_t ByteCount, CUstream hStream) {
1963 
1964  ASYNC_MEMCPY_PROLOGUE(streamId, event_node, context, cct_node, ((cudaStream_t)hStream), 0);
1965 
1966  CUresult ret = cuDriverFunctionPointer[cuMemcpyDtoHAsync_v2Enum].cuMemcpyDtoHAsync_v2Real(dstHost, srcDevice, ByteCount, hStream);
1967 
1968  ASYNC_MEMCPY_EPILOGUE(event_node, cct_node, ((cudaStream_t)hStream), ByteCount, cudaMemcpyDeviceToHost);
1969 
1970  return ret;
1971 }
1972 
1973 
1974 CUresult cuMemcpyDtoH(void *dstHost, CUdeviceptr srcDevice, size_t ByteCount) {
1975 
1976  SYNC_MEMCPY_PROLOGUE(context, launcher_cct, syncStart, recorded_node);
1977 
1978  monitor_disable_new_threads();
1979  CUresult ret = cuDriverFunctionPointer[cuMemcpyDtoH_v2Enum].cuMemcpyDtoH_v2Real(dstHost, srcDevice, ByteCount);
1980  monitor_enable_new_threads();
1981 
1982  SYNC_MEMCPY_EPILOGUE(context, launcher_cct, syncStart, recorded_node, ALL_STREAMS_MASK, syncEnd, ByteCount, cudaMemcpyDeviceToHost);
1983  return ret;
1984 }
1985 
1986 
1988 // CPU-GPU blame shift interface
1990 
1991 
1992 void
1993 gpu_blame_shifter(void* dc, int metric_id, cct_node_t* node, int metric_dc)
1994 {
1995  metric_desc_t* metric_desc = hpcrun_id2metric(metric_id);
1996 
1997  // Only blame shift idleness for time metric.
1998  if ( !metric_desc->properties.time )
1999  return;
2000 
2001  uint64_t cur_time_us = 0;
2002  int ret = time_getTimeReal(&cur_time_us);
2003  if (ret != 0) {
2004  EMSG("time_getTimeReal (clock_gettime) failed!");
2005  monitor_real_abort();
2006  }
2007  uint64_t metric_incr = cur_time_us - TD_GET(last_time_us);
2008 
2009  // If we are already in a cuda API, then we can't call cleanup_finished_events() since CUDA could have taken the same lock. Hence we just return.
2010 
2011  bool is_threads_at_sync = TD_GET(gpu_data.is_thread_at_cuda_sync);
2012 
2013  if (is_threads_at_sync) {
2014  if(SHARED_BLAMING_INITIALISED) {
2015  TD_GET(gpu_data.accum_num_sync_threads) += ipc_data->num_threads_at_sync_all_procs;
2016  TD_GET(gpu_data.accum_num_samples) += 1;
2017  }
2018  return;
2019  }
2020 
2021  spinlock_lock(&g_gpu_lock);
2022  uint32_t num_unfinshed_streams = 0;
2023  stream_node_t *unfinished_event_list_head = 0;
2024 
2025  num_unfinshed_streams = cleanup_finished_events();
2026  unfinished_event_list_head = g_unfinished_stream_list_head;
2027 
2028  if (num_unfinshed_streams) {
2029 
2030  //SHARED BLAMING: kernels need to be blamed for idleness on other procs/threads.
2031  if(SHARED_BLAMING_INITIALISED && ipc_data->num_threads_at_sync_all_procs && !g_num_threads_at_sync) {
2032  for (stream_node_t * unfinished_stream = unfinished_event_list_head; unfinished_stream; unfinished_stream = unfinished_stream->next_unfinished_stream) {
2033  //TODO: FIXME: the local threads at sync need to be removed, /T has to be done while adding metric
2034  //increment (either one of them).
2035  cct_metric_data_increment(cpu_idle_cause_metric_id, unfinished_stream->unfinished_event_node->launcher_cct, (cct_metric_data_t) {
2036  .r = metric_incr / g_active_threads}
2037  );
2038  }
2039  }
2040  }
2041  else {
2042 
2043  /*** Code to account for Overload factor ***/
2044  if(TD_GET(gpu_data.overload_state) == WORKING_STATE) {
2045  TD_GET(gpu_data.overload_state) = OVERLOADABLE_STATE;
2046  }
2047 
2048  if(TD_GET(gpu_data.overload_state) == OVERLOADABLE_STATE) {
2049  // Increment gpu_overload_potential_metric_id by metric_incr
2050  cct_metric_data_increment(gpu_overload_potential_metric_id, node, (cct_metric_data_t) {
2051  .i = metric_incr});
2052  }
2053 
2054  // GPU is idle iff ipc_data->outstanding_kernels == 0
2055  // If ipc_data is NULL, this process has made no GPU calls, so we are blind and declare the GPU idle without checking the status of other processes.
2056  // There is no better solution yet, since we don't know which GPU card to check for idleness.
2057  if(g_do_shared_blaming){
2058  if ( !ipc_data || ipc_data->outstanding_kernels == 0) { // GPU device is truly idle, i.e. no other process is keeping it busy
2059  // Increment gpu_idle by metric_incr
2060  cct_metric_data_increment(gpu_idle_metric_id, node, (cct_metric_data_t) {
2061  .i = metric_incr});
2062  }
2063  } else {
2064  // Increment gpu_idle by metric_incr
2065  cct_metric_data_increment(gpu_idle_metric_id, node, (cct_metric_data_t) {
2066  .i = metric_incr});
2067  }
2068 
2069  }
2070  spinlock_unlock(&g_gpu_lock);
2071 }
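
// Worked example (annotation, not part of the original source) of the
// apportioning performed above: suppose a sample arrives metric_incr = 5000 us
// after the previous one, g_active_threads = 4, other processes/threads are
// blocked at a CUDA sync, and two local streams still have unfinished kernels.
// Each unfinished kernel's launcher CCT node then receives 5000 / 4 = 1250 us
// of cpu_idle_cause blame (the per-thread normalization flagged in the TODO
// above). If instead no stream had outstanding work, the 5000 us would be
// charged to gpu_idle_metric_id -- and also to gpu_overload_potential_metric_id
// once this thread has transitioned to OVERLOADABLE_STATE.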
2072 
2073 #endif