HPCToolkit
kernel_blocking.c
Go to the documentation of this file.
1 // -*-Mode: C++;-*- // technically C99
2 
3 // * BeginRiceCopyright *****************************************************
4 //
5 // --------------------------------------------------------------------------
6 // Part of HPCToolkit (hpctoolkit.org)
7 //
8 // Information about sources of support for research and development of
9 // HPCToolkit is at 'hpctoolkit.org' and in 'README.Acknowledgments'.
10 // --------------------------------------------------------------------------
11 //
12 // Copyright ((c)) 2002-2019, Rice University
13 // All rights reserved.
14 //
15 // Redistribution and use in source and binary forms, with or without
16 // modification, are permitted provided that the following conditions are
17 // met:
18 //
19 // * Redistributions of source code must retain the above copyright
20 // notice, this list of conditions and the following disclaimer.
21 //
22 // * Redistributions in binary form must reproduce the above copyright
23 // notice, this list of conditions and the following disclaimer in the
24 // documentation and/or other materials provided with the distribution.
25 //
26 // * Neither the name of Rice University (RICE) nor the names of its
27 // contributors may be used to endorse or promote products derived from
28 // this software without specific prior written permission.
29 //
30 // This software is provided by RICE and contributors "as is" and any
31 // express or implied warranties, including, but not limited to, the
32 // implied warranties of merchantability and fitness for a particular
33 // purpose are disclaimed. In no event shall RICE or contributors be
34 // liable for any direct, indirect, incidental, special, exemplary, or
35 // consequential damages (including, but not limited to, procurement of
36 // substitute goods or services; loss of use, data, or profits; or
37 // business interruption) however caused and on any theory of liability,
38 // whether in contract, strict liability, or tort (including negligence
39 // or otherwise) arising in any way out of the use of this software, even
40 // if advised of the possibility of such damage.
41 //
42 // ******************************************************* EndRiceCopyright *
43 
44 
45 /******************************************************************************
46  * local includes
47  *****************************************************************************/
48 
56 #include <assert.h>
57 #include <include/linux_info.h>
58 
59 #include "kernel_blocking.h"
60 
61 #include "perf-util.h" // u64, u32 and perf_mmap_data_t
62 #include "perf_mmap.h"
63 #include "event_custom.h"
64 
65 /******************************************************************************
66  * Macros
67  *****************************************************************************/
68 
69 #define KERNEL_BLOCKING_DEBUG 0
70 
71 // -----------------------------------------------------
72 // Predefined events
73 // -----------------------------------------------------
74 
75 #define EVNAME_KERNEL_BLOCK "BLOCKTIME"
76 #define EVNAME_CONTEXT_SWITCHES "CS"
77 
78 
79 //******************************************************************************
80 // forward declaration
81 //******************************************************************************
82 
83 
84 
85 //******************************************************************************
86 // local variables
87 //******************************************************************************
88 
89 // metric index for kernel blocking
90 // usually each thread has the same metric index, so it's safe to make it global
91 // for each thread (I hope).
92 static int metric_blocking_index = -1;
93 
94 static __thread u64 time_cs_out = 0; // time when leaving the application process
95 static __thread cct_node_t *cct_kernel = NULL; // cct of the last access to kernel
96 static __thread u32 cpu = 0; // cpu of the last sample
97 static __thread u32 pid = 0, tid = 0; // last pid/tid
98 
99 /******************************************************************************
100  * private operations
101  *****************************************************************************/
102 
103 static void
105  perf_mmap_data_t *mmap_data)
106 {
107  // make sure the time is is zero or positive
108  if (mmap_data->time < time_cs_out) {
109  TMSG(LINUX_PERF, "old t: %d, c: %d, p: %d, td: %d -- vs -- t: %d, c: %d, p: %d, td: %d",
110  time_cs_out, cpu, pid, tid, mmap_data->time, mmap_data->cpu, mmap_data->pid, mmap_data->tid);
111  return;
112  }
113 
114  uint64_t delta = mmap_data->time - time_cs_out;
115 
116  // ----------------------------------------------------------------
117  // blame the time spent in the kernel to the cct kernel
118  // ----------------------------------------------------------------
119 
121  (cct_metric_data_t){.i = delta});
122 
123  // ----------------------------------------------------------------
124  // it's important to always count the number of samples for debugging purpose
125  // ----------------------------------------------------------------
126 
128  td->core_profile_trace_data.perf_event_info[metric_blocking_index].num_samples++;
129 }
130 
131 /***********************************************************************
132  * this function is called when a cycle or time event occurs, OR a
133  * context switch event occurs.
134  *
135  * to compute kernel blocking time:
136  * time_outside_kernel - time_inside_kernel
137  *
138  * algorithm:
139  * if this is the first time entering kernel (time_cs_out == 0) then
140  * set time_cs_out
141  * store the cct to cct_kernel
142  * else
143  * if we are outside the kernel, then:
144  * compute the blocking time
145  * blame the time to the kernel cct
146  *
147  * else if we are still inside the kernel with different cct_kernel:
148  * compute the time
149  * blame the time to the old cct_kernel
150  * set time_cs_out to the current time
151  * set cct_kernel with the new cct
152  * end if
153  * end if
154  ***********************************************************************/
155 static void
157 {
158  if (metric_blocking_index < 0)
159  return; // not initialized or something wrong happens in the initialization
160 
161  perf_mmap_data_t *mmap_data = args->data;
162 
163  if (mmap_data == NULL) {
164 
165  // somehow, at the end of the execution, a sample event is still triggered
166  // and in this case, the arguments are null. Is this our bug ? or gdb ?
167 
168  return;
169  }
170 
171  if (mmap_data->context_switch_time > 0) {
172  // ----------------------------------------------------------------
173  // when PERF_RECORD_SWITCH (context switch out) occurs:
174  // if this is the first time, (time_cs_out is zero), then freeze the
175  // cs time and the current cct
176  // ----------------------------------------------------------------
177 
178  if (time_cs_out == 0) {
179  // ----------------------------------------------------------------
180  // this is the first time we enter the kernel (leaving the current process)
181  // needs to store the time to compute the blocking time in
182  // the next step when leaving the kernel
183  // ----------------------------------------------------------------
184  time_cs_out = mmap_data->context_switch_time;
185  cpu = mmap_data->cpu;
186  pid = mmap_data->pid;
187  tid = mmap_data->tid;
188  }
189  } else {
190 
191  // ----------------------------------------------------------------
192  // when PERF_SAMPLE_RECORD occurs:
193  // check whether we were previously in the kernel or not (time_cs_out > 0)
194  // if we were in the kernel, then we blame the different time to the
195  // cct kernel
196  // ----------------------------------------------------------------
197 
198  if (args->current->attr.config == PERF_COUNT_SW_CONTEXT_SWITCHES) {
199 
200  if (cct_kernel != NULL && time_cs_out > 0) {
201  // corner case : context switch within a context switch !
202  blame_kernel_time(cct_kernel, mmap_data);
203  time_cs_out = 0;
204  }
205  // context switch inside the kernel: record the new cct
206  cct_kernel = args->sample;
207 
208  } else if (cct_kernel != NULL) {
209  // other event than cs occurs (it can be cycles, clocks or others)
210 
211  if ((mmap_data->time > 0) && (time_cs_out > 0) && (mmap_data->nr==0)) {
212 #if KERNEL_BLOCKING_DEBUG
213  unsigned int cpumode = mmap_data->header_misc & PERF_RECORD_MISC_CPUMODE_MASK;
214  assert(cpumode == PERF_RECORD_MISC_USER);
215 #endif
216 
217  blame_kernel_time(cct_kernel, mmap_data);
218  // important: need to reset the value to inform that we are leaving the kernel
219  cct_kernel = NULL;
220  time_cs_out = 0;
221  }
222  }
223  }
224 }
225 
226 
227 /***************************************************************
228  * Register events to compute blocking time in the kernel
229  * We use perf's sofrware context switch event to compute the
230  * time spent inside the kernel. For this, we need to sample everytime
231  * a context switch occurs, and compute the time when entering the
232  * kernel vs leaving the kernel. See perf_event_handler.
233  * We need two metrics for this:
234  * - blocking time metric to store the time spent in the kernel
235  * - context switch metric to store the number of context switches
236  ****************************************************************/
237 static int
239  event_custom_t *event,
240  struct event_threshold_s *period)
241 {
243  if (event_info == NULL)
244  return -1;
245 
246  memset(event_info, 0, sizeof(event_info_t));
247 
248  // ------------------------------------------
249  // create metric to compute blocking time
250  // ------------------------------------------
251  event_info->metric_custom = event;
252 
257 
258  // ------------------------------------------
259  // create metric to store context switches
260  // ------------------------------------------
261  int metric_cs = hpcrun_new_metric();
263  metric_cs, EVNAME_CONTEXT_SWITCHES,
265 
266  // ------------------------------------------
267  // set context switch event description to be used when creating
268  // perf event of this type on each thread
269  // ------------------------------------------
270  /* PERF_SAMPLE_STACK_USER may also be good to use */
271  u64 sample_type = PERF_SAMPLE_IP | PERF_SAMPLE_TID |
272  PERF_SAMPLE_TIME | PERF_SAMPLE_CALLCHAIN |
273  PERF_SAMPLE_CPU | PERF_SAMPLE_PERIOD;
274 
275  struct perf_event_attr *attr = &(event_info->attr);
276  attr->config = PERF_COUNT_SW_CONTEXT_SWITCHES;
277  attr->type = PERF_TYPE_SOFTWARE;
278 
280  true /* use_period*/,
281  1 /* sample every context switch*/,
282  sample_type /* need additional info for sample type */
283  );
284 
285  event_info->attr.context_switch = 1;
286  event_info->attr.sample_id_all = 1;
287 
288  event_info->metric_custom = event;
289 
290  METHOD_CALL(self, store_event_and_info,
291  attr->config, 1, metric_cs, event_info);;
292 
293  return 1;
294 }
295 
296 
297 /******************************************************************************
298  * interface operations
299  *****************************************************************************/
300 
302 {
303  event_custom_t *event_kernel_blocking = hpcrun_malloc(sizeof(event_custom_t));
304  event_kernel_blocking->name = EVNAME_KERNEL_BLOCK;
305  event_kernel_blocking->desc = "Approximation of a thread's blocking time."
306  " This event requires another event (such as CYCLES) to profile with."
307  " The unit time is hardware-dependent but mostly in microseconds."
308  " This event is only available on Linux kernel 4.3 or newer.";
309  event_kernel_blocking->register_fn = register_blocking; // call backs
310  event_kernel_blocking->handler_fn = kernel_block_handler;
311  event_kernel_blocking->handle_type = INCLUSIVE;// please call me for all events
312 
313  event_custom_register(event_kernel_blocking);
314 }
struct event_info_s * current
Definition: event_custom.h:64
struct perf_event_attr attr
Definition: perf-util.h:138
#define EVNAME_CONTEXT_SWITCHES
static __thread u32 tid
event_handler_t * handler_fn
Definition: event_custom.h:89
struct event_custom_s * metric_custom
Definition: perf-util.h:139
static void cct_metric_data_increment(int metric_id, cct_node_t *x, cct_metric_data_t incr)
Definition: cct2metrics.h:86
static __thread u32 pid
metric_desc_t * hpcrun_set_metric_info_and_period(int metric_id, const char *name, MetricFlags_ValFmt_t valFmt, size_t period, metric_desc_properties_t prop)
Definition: metrics.c:411
static int metric_blocking_index
static void blame_kernel_time(cct_node_t *cct_kernel, perf_mmap_data_t *mmap_data)
static int register_blocking(sample_source_t *self, event_custom_t *event, struct event_threshold_s *period)
u64 context_switch_time
Definition: perf-util.h:122
static void kernel_block_handler(event_handler_arg_t *args)
__u32 u32
struct perf_mmap_data_s * data
Definition: event_custom.h:65
void * hpcrun_malloc(size_t size)
Definition: mem.c:275
static __thread cct_node_t * cct_kernel
const char * desc
Definition: event_custom.h:86
int perf_util_attr_init(const char *event_name, struct perf_event_attr *attr, bool usePeriod, u64 threshold, u64 sampletype)
Definition: perf-util.c:403
#define TMSG(f,...)
Definition: messages.h:93
__u64 u64
#define METHOD_CALL(obj, meth,...)
Definition: simple_oo.h:87
static __thread u32 cpu
register_event_t * register_fn
Definition: event_custom.h:88
#define NULL
Definition: ElfHelper.cpp:85
Definition: cct.c:96
int hpcrun_new_metric(void)
Definition: metrics.c:333
event_handle_type_t handle_type
Definition: event_custom.h:91
static __thread u64 time_cs_out
const char * name
Definition: event_custom.h:85
#define EVNAME_KERNEL_BLOCK
void kernel_blocking_init()
static long period
Definition: itimer.c:194
thread_data_t *(* hpcrun_get_thread_data)(void)
Definition: thread_data.c:168
#define metric_property_none
Definition: hpcrun-fmt.h:202
int event_custom_register(event_custom_t *event)
Definition: event_custom.c:110
sample_val_t * sample
Definition: event_custom.h:62