HPCToolkit
gpu_blame.c
Go to the documentation of this file.
1 
3 // * BeginRiceCopyright *****************************************************
4 //
5 // $HeadURL: https://outreach.scidac.gov/svn/hpctoolkit/branches/hpctoolkit-gpu-blame-shift-proto/src/tool/hpcrun/sample-sources/gpu_blame.c $
6 // $Id: itimer.c 3784 2012-05-10 22:35:51Z mc29 $
7 //
8 // --------------------------------------------------------------------------
9 // Part of HPCToolkit (hpctoolkit.org)
10 //
11 // Information about sources of support for research and development of
12 // HPCToolkit is at 'hpctoolkit.org' and in 'README.Acknowledgments'.
13 // --------------------------------------------------------------------------
14 //
15 // Copyright ((c)) 2002-2019, Rice University
16 // All rights reserved.
17 //
18 // Redistribution and use in source and binary forms, with or without
19 // modification, are permitted provided that the following conditions are
20 // met:
21 //
22 // * Redistributions of source code must retain the above copyright
23 // notice, this list of conditions and the following disclaimer.
24 //
25 // * Redistributions in binary form must reproduce the above copyright
26 // notice, this list of conditions and the following disclaimer in the
27 // documentation and/or other materials provided with the distribution.
28 //
29 // * Neither the name of Rice University (RICE) nor the names of its
30 // contributors may be used to endorse or promote products derived from
31 // this software without specific prior written permission.
32 //
33 // This software is provided by RICE and contributors "as is" and any
34 // express or implied warranties, including, but not limited to, the
35 // implied warranties of merchantability and fitness for a particular
36 // purpose are disclaimed. In no event shall RICE or contributors be
37 // liable for any direct, indirect, incidental, special, exemplary, or
38 // consequential damages (including, but not limited to, procurement of
39 // substitute goods or services; loss of use, data, or profits; or
40 // business interruption) however caused and on any theory of liability,
41 // whether in contract, strict liability, or tort (including negligence
42 // or otherwise) arising in any way out of the use of this software, even
43 // if advised of the possibility of such damage.
44 //
45 // **
46 
47 //
48 // Blame shifting interface
49 //
50 
51 /******************************************************************************
52  * system includes
53  *****************************************************************************/
54 #include <errno.h>
55 #include <stddef.h>
56 #include <stdlib.h>
57 #include <string.h>
58 #include <assert.h>
59 #include <stdbool.h>
60 
61 #include <signal.h>
62 #include <sys/time.h> /* setitimer() */
63 #include <ucontext.h> /* struct ucontext */
64 #include <dlfcn.h>
65 
66 
67 /******************************************************************************
68  * libmonitor
69  *****************************************************************************/
70 
71 #include <monitor.h>
72 
73 
74 /******************************************************************************
75  * local includes
76  *****************************************************************************/
77 #include "sample_source_obj.h"
78 #include "common.h"
79 #include <hpcrun/hpcrun_options.h>
80 #include <hpcrun/hpcrun_stats.h>
81 
82 #include <hpcrun/metrics.h>
83 #include <hpcrun/safe-sampling.h>
84 #include <hpcrun/sample_event.h>
86 #include <hpcrun/thread_data.h>
87 #include <hpcrun/trace.h>
88 
89 #include <lush/lush-backtrace.h>
90 #include <messages/messages.h>
91 
92 #include <utilities/tokenize.h>
94 
95 #include <unwind/common/unwind.h>
96 
97 #include <lib/support-lean/timer.h>
98 #include <lib/prof-lean/spinlock.h>
99 #include <lib/prof-lean/atomic.h>
101 #include "blame-shift/blame-shift.h"
102 
103 #ifdef ENABLE_CUDA
104 #include "gpu_blame.h"
105 #endif // ENABLE_CUDA
106 
107 // ****************** utility macros *********************
// Cuda_RTcall(fn) / Cuda_Dcall(fn): token-pasting accessors into two lookup tables.
// Each macro pastes `Enum` onto `fn` to form the index into the table
// (cudaRuntimeFunctionPointer or cuDriverFunctionPointer) and pastes `Real`
// onto `fn` to form the member selected from that entry.
// NOTE(review): the tables and the fn##Enum / fn##Real names are not declared in
// this listing; presumably they come from gpu_blame.h -- confirm.
108 #define Cuda_RTcall(fn) cudaRuntimeFunctionPointer[fn ## Enum].fn ## Real
109 #define Cuda_Dcall(fn) cuDriverFunctionPointer[fn ## Enum].fn ## Real
110 // ************************************************
111 
112 // ******* Global Variables ***********
// Initially false; set to true by METHOD_FN(start) in this file.
// No code visible in this listing resets it.
114 bool g_cpu_gpu_enabled = false;
115 
116 
117 // Various CPU-GPU metrics
129 
130 
132 
133 // blame shift registration info
135 
136 // ****** UTILITY Functions (public)
137 
140 }
141 
142 // ******* METHOD DEFINITIONS ***********
143 
144 static void METHOD_FN(init)
145 {
146  TMSG(CPU_GPU_BLAME_CTL, "setting up CPU_GPU_BLAME");
147  //active threads represents the total number of threads in the system
148  //including the main thread
149  g_active_threads = 1;
150  self->state = INIT;
151 }
152 
153 static void METHOD_FN(thread_init)
154 {
155  TMSG(CPU_GPU_BLAME_CTL, "thread init");
156  atomic_add_i64(&g_active_threads, 1L);
157 }
158 
159 static void METHOD_FN(thread_init_action)
160 {
161  TMSG(CPU_GPU_BLAME_CTL, "thread action (noop)");
162 }
163 
// start: emits a trace message, sets g_cpu_gpu_enabled to true, and stores START
// into the ss_state slot selected by self->sel_idx (obtained through TD_GET).
//
// NOTE(review): this listing is incomplete. The embedded numbering skips 168 and
// 170, and the `}` on listing line 171 has no visible opening brace. Presumably
// line 168 held a guard `if (...) {` and line 170 an abort call (the page's symbol
// index mentions blame_shift_source_available() and monitor_real_abort()), so the
// EMSG below would run only when the guard fails -- TODO confirm against the
// upstream file and restore both lines before compiling.
164 static void
165 METHOD_FN(start)
166 {
167  TMSG(CPU_GPU_BLAME_CTL,"starting CPU_GPU_BLAME");
169  EMSG("Either pass -e WALLCLOCK or -e REALTIME to enable CPU_GPU_BLAME");
171  }
172  g_cpu_gpu_enabled = true;
173  TD_GET(ss_state)[self->sel_idx] = START;
174 }
175 
176 static void METHOD_FN(thread_fini_action)
177 {
178  TMSG(CPU_GPU_BLAME_CTL, "thread action ");
179  atomic_add_i64(&g_active_threads, -1L);
180 }
181 
182 static void METHOD_FN(stop)
183 {
184  TMSG(CPU_GPU_BLAME_CTL, "stopping CPU_GPU_BLAME");
185  TD_GET(ss_state)[self->sel_idx] = STOP;
186 }
187 
188 static void METHOD_FN(shutdown)
189 {
190  TMSG(CPU_GPU_BLAME_CTL, "shutodown CPU_GPU_BLAME_CTL");
191  METHOD_CALL(self, stop); // make sure stop has been called
192  self->state = UNINIT;
193 }
194 
195 static bool METHOD_FN(supports_event, const char *ev_str)
196 {
197  return hpcrun_ev_is(ev_str, "CPU_GPU_IDLE");
198 }
199 
200 
201 
// process_event_list: emits a trace message with lush_metrics, then (in the lines
// still visible here) resolves the symbol "gpu_blame_shifter" with dlsym, stores
// it in bs_entry.fn, clears bs_entry.next, and passes &bs_entry to
// blame_shift_register().
//
// NOTE(review): this listing is incomplete. The embedded numbering skips the line
// after each metric comment (209, 212, 215, 217, 221, 224, ...) and most of
// 236-249, so none of the statements that assign the *_metric_id variables are
// visible. Presumably they were calls to hpcrun_new_metric() and
// hpcrun_set_metric_info_and_period(), both named in the page's symbol index --
// TODO restore from the upstream file before compiling.
202 static void METHOD_FN(process_event_list, int lush_metrics)
203 {
204 
205  TMSG(CPU_GPU_BLAME_CTL, "process event list, lush_metrics = %d", lush_metrics);
206 
207  // Create metrics for CPU/GPU blame shifting
208  // cpu_idle_metric_id a.k.a CPU_IDLE measures the time when CPU is idle waiting for GPU to finish
210  // cpu_idle_cause_metric_id a.k.a CPU_IDLE_CAUSE blames GPU kernels (CCT nodes which launched them)
211  // that are keeping the CPU idle
213  // gpu_idle_metric_id a.k.a GPU_IDLE_CAUSE measures the time when GPU is idle and blames CPU CCT node
214  // for not creating work
216  // gpu_time_metric_id a.k.a. GPU_ACTIVITY_TIME accounts the absolute running time of a kernel (CCT node which launched it)
218 
219 
220  // h_to_d_data_xfer_metric_id is the number of bytes xfered from CPU to GPU
222 
223  // d_to_h_data_xfer_metric_id is the number of bytes xfered from GPU to CPU
225 
226  // d_to_d_data_xfer_metric_id is the number of bytes xfered from GPU to GPU
228 
229  // h_to_h_data_xfer_metric_id is the number of bytes xfered from CPU to CPU
231 
232  // uva_data_xfer_metric_id is the number of bytes xfered over CUDA unified virtual address
234 
235  // Accumulates the time between last kernel end to current Sync point as a potential GPU overload factor
237 
242 
249 
 // NOTE(review): the dlsym() result is stored without a NULL check, so a failed
 // lookup would register an entry whose fn is NULL -- confirm that
 // blame_shift_register() and its callers tolerate that.
250  bs_entry.fn = dlsym(RTLD_DEFAULT, "gpu_blame_shifter");
251  bs_entry.next = 0;
252  blame_shift_register(&bs_entry);
253 }
254 
255 static void METHOD_FN(gen_event_set, int lush_metrics)
256 {
257  // There is NO signal hander for us, we proxy with itimer or PAPI_TOT_CYC
258 }
259 
// display_events: prints the table of events offered by this sample source
// (a banner, a header row, and the single CPU_GPU_IDLE entry) to stdout.
static void
METHOD_FN(display_events)
{
  // Output lines, in order; each one already carries its trailing newline.
  static const char *const help_lines[] = {
    "===========================================================================\n",
    "Available CPU_GPU_IDLE events\n",
    "===========================================================================\n",
    "Name\t\tDescription\n",
    "---------------------------------------------------------------------------\n",
    "CPU_GPU_IDLE\tCPU GPU idleness\n",
    "\n",
  };
  const size_t line_count = sizeof help_lines / sizeof help_lines[0];

  for (size_t idx = 0; idx < line_count; idx++) {
    printf("%s", help_lines[idx]);
  }
}
270 
271 /***************************************************************************
272  * object
273  ***************************************************************************/
274 
// NOTE(review): ss_name and ss_cls are defined immediately before "ss_obj.h" is
// included; presumably that header reads them to build the sample-source object
// from the METHOD_FN definitions above -- confirm against ss_obj.h.
275 #define ss_name cpu_gpu_idle
276 #define ss_cls SS_HARDWARE
277 
278 #include "ss_obj.h"
279 
280 /******************************************************************************
281  * private operations
282  *****************************************************************************/
283 
static void METHOD_FN(init)
Definition: gpu_blame.c:144
int cpu_idle_cause_metric_id
Definition: gpu_blame.c:120
int h_to_d_data_xfer_metric_id
Definition: gpu_blame.c:124
int gpu_idle_metric_id
Definition: gpu_blame.c:121
struct bs_fn_entry_t * next
Definition: blame-shift.h:9
int d_to_h_data_xfer_metric_id
Definition: gpu_blame.c:125
void hpcrun_set_gpu_proxy_present()
Definition: gpu_blame.c:138
Definition: blame-shift.h:8
metric_desc_t * hpcrun_set_metric_info_and_period(int metric_id, const char *name, MetricFlags_ValFmt_t valFmt, size_t period, metric_desc_properties_t prop)
Definition: metrics.c:411
void blame_shift_register(bs_fn_entry_t *entry)
Definition: blame-shift.c:8
int stream_special_metric_id
Definition: gpu_blame.c:123
#define EMSG
Definition: messages.h:70
int uva_data_xfer_metric_id
Definition: gpu_blame.c:128
int lush_metrics
Definition: main.c:188
bool g_cpu_gpu_enabled
Definition: gpu_blame.c:114
int g_cpu_gpu_proxy_count
Definition: gpu_blame.c:113
bs_fn_t fn
Definition: blame-shift.h:10
#define TD_GET(field)
Definition: thread_data.h:256
static bs_fn_entry_t bs_entry
Definition: gpu_blame.c:134
bool hpcrun_ev_is(const char *candidate, const char *event_name)
Definition: tokenize.c:194
int gpu_time_metric_id
Definition: gpu_blame.c:119
#define TMSG(f,...)
Definition: messages.h:93
#define METHOD_CALL(obj, meth,...)
Definition: simple_oo.h:87
int blame_shift_source_available(bs_type bst)
Definition: blame-shift.c:32
int hpcrun_new_metric(void)
Definition: metrics.c:333
void monitor_real_abort(void)
uint64_t g_active_threads
Definition: gpu_blame.c:131
int d_to_d_data_xfer_metric_id
Definition: gpu_blame.c:127
int gpu_overload_potential_metric_id
Definition: gpu_blame.c:122
int cpu_idle_metric_id
Definition: gpu_blame.c:118
#define metric_property_none
Definition: hpcrun-fmt.h:202
int h_to_h_data_xfer_metric_id
Definition: gpu_blame.c:126