LLVM OpenMP* Runtime Library
kmp_stats.h
1 #ifndef KMP_STATS_H
2 #define KMP_STATS_H
3 
8 //===----------------------------------------------------------------------===//
9 //
10 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
11 // See https://llvm.org/LICENSE.txt for license information.
12 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
13 //
14 //===----------------------------------------------------------------------===//
15 
16 #include "kmp_config.h"
17 #include "kmp_debug.h"
18 
19 #if KMP_STATS_ENABLED
20 /* Statistics accumulator.
21  Accumulates number of samples and computes min, max, mean, standard deviation
22  on the fly.
23 
24  Online variance calculation algorithm from
25  http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#On-line_algorithm
26  */
27 
28 #include "kmp_stats_timing.h"
29 #include <limits>
30 #include <math.h>
31 #include <new> // placement new
32 #include <stdint.h>
33 #include <string>
34 #include <vector>
35 
36 /* Enable developer statistics here if you want them. They are more detailed
37  than is useful for application characterisation and are intended for the
38  runtime library developer. */
39 #define KMP_DEVELOPER_STATS 0
40 
41 /* Enable/Disable histogram output */
42 #define KMP_STATS_HIST 0
43 
50  noTotal = 1 << 0,
51  onlyInMaster = 1 << 1,
52  noUnits = 1 << 2,
53  notInMaster = 1 << 3,
54  logEvent = 1 << 4
56 };
57 
64  IDLE,
65  SERIAL_REGION,
66  FORK_JOIN_BARRIER,
67  PLAIN_BARRIER,
68  TASKWAIT,
69  TASKYIELD,
70  TASKGROUP,
71  IMPLICIT_TASK,
72  EXPLICIT_TASK,
73  TEAMS_REGION
74 };
75 
// clang-format off
// Table of all event counters, written as an X-macro: `macro` is invoked once
// per counter as macro(name, flags, arg). `flags` is 0 or an OR of
// stats_flags_e values; `arg` is passed through unchanged to every row.
// Row order fixes the enumerator values generated from this table, so new
// counters must not be inserted in a way that callers do not expect.
#define KMP_FOREACH_COUNTER(macro, arg)                                        \
  macro(OMP_PARALLEL,                                                          \
        stats_flags_e::onlyInMaster | stats_flags_e::noTotal, arg)             \
  macro(OMP_NESTED_PARALLEL, 0, arg)                                           \
  macro(OMP_LOOP_STATIC, 0, arg)                                               \
  macro(OMP_LOOP_STATIC_STEAL, 0, arg)                                         \
  macro(OMP_LOOP_DYNAMIC, 0, arg)                                              \
  macro(OMP_DISTRIBUTE, 0, arg)                                                \
  macro(OMP_BARRIER, 0, arg)                                                   \
  macro(OMP_CRITICAL, 0, arg)                                                  \
  macro(OMP_SINGLE, 0, arg)                                                    \
  macro(OMP_MASTER, 0, arg)                                                    \
  macro(OMP_TEAMS, 0, arg)                                                     \
  macro(OMP_set_lock, 0, arg)                                                  \
  macro(OMP_test_lock, 0, arg)                                                 \
  macro(REDUCE_wait, 0, arg)                                                   \
  macro(REDUCE_nowait, 0, arg)                                                 \
  macro(OMP_TASKYIELD, 0, arg)                                                 \
  macro(OMP_TASKLOOP, 0, arg)                                                  \
  macro(TASK_executed, 0, arg)                                                 \
  macro(TASK_cancelled, 0, arg)                                                \
  macro(TASK_stolen, 0, arg)
// clang-format on
117 
// clang-format off
// Table of all timers, written as an X-macro: `macro` is invoked once per
// timer as macro(name, flags, arg). `flags` is 0 or an OR of stats_flags_e
// values; `arg` is passed through unchanged to every row.
// The rows flagged noUnits | noTotal are fed plain values rather than times
// (see the per-timer notes that follow this table). Developer-only rows are
// appended by KMP_FOREACH_DEVELOPER_TIMER. Row order fixes the enumerator
// values generated from this table.
#define KMP_FOREACH_TIMER(macro, arg)                                          \
  macro(OMP_worker_thread_life, stats_flags_e::logEvent, arg)                  \
  macro(OMP_parallel, stats_flags_e::logEvent, arg)                            \
  macro(OMP_parallel_overhead, stats_flags_e::logEvent, arg)                   \
  macro(OMP_teams, stats_flags_e::logEvent, arg)                               \
  macro(OMP_teams_overhead, stats_flags_e::logEvent, arg)                      \
  macro(OMP_loop_static, 0, arg)                                               \
  macro(OMP_loop_static_scheduling, 0, arg)                                    \
  macro(OMP_loop_dynamic, 0, arg)                                              \
  macro(OMP_loop_dynamic_scheduling, 0, arg)                                   \
  macro(OMP_distribute, 0, arg)                                                \
  macro(OMP_distribute_scheduling, 0, arg)                                     \
  macro(OMP_critical, 0, arg)                                                  \
  macro(OMP_critical_wait, 0, arg)                                             \
  macro(OMP_single, 0, arg)                                                    \
  macro(OMP_master, 0, arg)                                                    \
  macro(OMP_task_immediate, 0, arg)                                            \
  macro(OMP_task_taskwait, 0, arg)                                             \
  macro(OMP_task_taskyield, 0, arg)                                            \
  macro(OMP_task_taskgroup, 0, arg)                                            \
  macro(OMP_task_join_bar, 0, arg)                                             \
  macro(OMP_task_plain_bar, 0, arg)                                            \
  macro(OMP_taskloop_scheduling, 0, arg)                                       \
  macro(OMP_plain_barrier, stats_flags_e::logEvent, arg)                       \
  macro(OMP_idle, stats_flags_e::logEvent, arg)                                \
  macro(OMP_fork_barrier, stats_flags_e::logEvent, arg)                        \
  macro(OMP_join_barrier, stats_flags_e::logEvent, arg)                        \
  macro(OMP_serial, stats_flags_e::logEvent, arg)                              \
  macro(OMP_set_numthreads,                                                    \
        stats_flags_e::noUnits | stats_flags_e::noTotal, arg)                  \
  macro(OMP_PARALLEL_args,                                                     \
        stats_flags_e::noUnits | stats_flags_e::noTotal, arg)                  \
  macro(OMP_loop_static_iterations,                                            \
        stats_flags_e::noUnits | stats_flags_e::noTotal, arg)                  \
  macro(OMP_loop_static_total_iterations,                                      \
        stats_flags_e::noUnits | stats_flags_e::noTotal, arg)                  \
  macro(OMP_loop_dynamic_iterations,                                           \
        stats_flags_e::noUnits | stats_flags_e::noTotal, arg)                  \
  macro(OMP_loop_dynamic_total_iterations,                                     \
        stats_flags_e::noUnits | stats_flags_e::noTotal, arg)                  \
  macro(OMP_distribute_iterations,                                             \
        stats_flags_e::noUnits | stats_flags_e::noTotal, arg)                  \
  KMP_FOREACH_DEVELOPER_TIMER(macro, arg)
// clang-format on
181 
182 // OMP_worker_thread_life -- Time from thread becoming an OpenMP thread (either
183 // initializing OpenMP or being created by a master)
184 // until the thread is destroyed
185 // OMP_parallel -- Time thread spends executing work directly
186 // within a #pragma omp parallel
187 // OMP_parallel_overhead -- Time thread spends setting up a parallel region
188 // OMP_loop_static -- Time thread spends executing loop iterations from
189 // a statically scheduled loop
190 // OMP_loop_static_scheduling -- Time thread spends scheduling loop iterations
191 // from a statically scheduled loop
192 // OMP_loop_dynamic -- Time thread spends executing loop iterations from
193 // a dynamically scheduled loop
194 // OMP_loop_dynamic_scheduling -- Time thread spends scheduling loop iterations
195 // from a dynamically scheduled loop
196 // OMP_critical -- Time thread spends executing critical section
197 // OMP_critical_wait -- Time thread spends waiting to enter
198 // a critical section
199 // OMP_single -- Time spent executing a "single" region
200 // OMP_master -- Time spent executing a "master" region
201 // OMP_task_immediate -- Time spent executing non-deferred tasks
202 // OMP_task_taskwait -- Time spent executing tasks inside a taskwait
203 // construct
204 // OMP_task_taskyield -- Time spent executing tasks inside a taskyield
205 // construct
206 // OMP_task_taskgroup -- Time spent executing tasks inside a taskgroup
207 // construct
208 // OMP_task_join_bar -- Time spent executing tasks inside a join barrier
209 // OMP_task_plain_bar -- Time spent executing tasks inside a barrier
210 // construct
211 // OMP_taskloop_scheduling -- Time spent scheduling tasks inside a taskloop
212 // construct
213 // OMP_plain_barrier -- Time spent in a #pragma omp barrier construct or
214 // inside implicit barrier at end of worksharing
215 // construct
216 // OMP_idle -- Time worker threads spend waiting for next
217 // parallel region
218 // OMP_fork_barrier -- Time spent in the fork barrier surrounding a
219 // parallel region
220 // OMP_join_barrier -- Time spent in the join barrier surrounding a
221 // parallel region
222 // OMP_serial -- Time thread zero spends executing serial code
223 // OMP_set_numthreads -- Values passed to omp_set_num_threads
224 // OMP_PARALLEL_args -- Number of arguments passed to a parallel region
225 // OMP_loop_static_iterations -- Number of iterations thread is assigned for
226 // statically scheduled loops
227 // OMP_loop_dynamic_iterations -- Number of iterations thread is assigned for
228 // dynamically scheduled loops
229 
#if (KMP_DEVELOPER_STATS)
// Timers which are of interest to runtime library developers, not end users.
// These have to be explicitly enabled in addition to the other stats.
//
// Rows have the same macro(name, flags, arg) shape as KMP_FOREACH_TIMER rows
// and are appended to that table.
//
// KMP_end_split_barrier -- time in __kmp_end_split_barrier
// KMP_setup_icv_copy    -- time in __kmp_setup_icv_copy
// KMP_linear_gather     -- time in __kmp_linear_barrier_gather
// KMP_linear_release    -- time in __kmp_linear_barrier_release
// KMP_tree_gather       -- time in __kmp_tree_barrier_gather
// KMP_tree_release      -- time in __kmp_tree_barrier_release
// KMP_hyper_gather      -- time in __kmp_hyper_barrier_gather
// KMP_hyper_release     -- time in __kmp_hyper_barrier_release
//
// NOTE(review): earlier revisions of this comment also described
// KMP_fork_barrier, KMP_join_barrier, KMP_barrier and KMP_icv_copy, none of
// which has a row below; conversely KMP_fork_call, KMP_join_call,
// KMP_hier_*, USER_*, KMP_allocate_team and FOR_static_steal_* are
// undocumented here. Presumably each is named after the routine or event it
// measures -- confirm against the call sites.
// clang-format off
#define KMP_FOREACH_DEVELOPER_TIMER(macro, arg)                                \
  macro(KMP_fork_call, 0, arg)                                                 \
  macro(KMP_join_call, 0, arg)                                                 \
  macro(KMP_end_split_barrier, 0, arg)                                         \
  macro(KMP_hier_gather, 0, arg)                                               \
  macro(KMP_hier_release, 0, arg)                                              \
  macro(KMP_hyper_gather, 0, arg)                                              \
  macro(KMP_hyper_release, 0, arg)                                             \
  macro(KMP_linear_gather, 0, arg)                                             \
  macro(KMP_linear_release, 0, arg)                                            \
  macro(KMP_tree_gather, 0, arg)                                               \
  macro(KMP_tree_release, 0, arg)                                              \
  macro(USER_resume, 0, arg)                                                   \
  macro(USER_suspend, 0, arg)                                                  \
  macro(USER_mwait, 0, arg)                                                    \
  macro(KMP_allocate_team, 0, arg)                                             \
  macro(KMP_setup_icv_copy, 0, arg)                                            \
  macro(USER_icv_copy, 0, arg)                                                 \
  macro(FOR_static_steal_stolen,                                               \
        stats_flags_e::noUnits | stats_flags_e::noTotal, arg)                  \
  macro(FOR_static_steal_chunks,                                               \
        stats_flags_e::noUnits | stats_flags_e::noTotal, arg)
#else
// Developer statistics are switched off: contribute no rows at all.
#define KMP_FOREACH_DEVELOPER_TIMER(macro, arg)
#endif
// clang-format on
273 
293 #define KMP_FOREACH_EXPLICIT_TIMER(macro, arg) KMP_FOREACH_TIMER(macro, arg)
294 
295 #define ENUMERATE(name, ignore, prefix) prefix##name,
296 enum timer_e { KMP_FOREACH_TIMER(ENUMERATE, TIMER_) TIMER_LAST };
297 
298 enum explicit_timer_e {
299  KMP_FOREACH_EXPLICIT_TIMER(ENUMERATE, EXPLICIT_TIMER_) EXPLICIT_TIMER_LAST
300 };
301 
302 enum counter_e { KMP_FOREACH_COUNTER(ENUMERATE, COUNTER_) COUNTER_LAST };
303 #undef ENUMERATE
304 
305 /*
306  * A logarithmic histogram. It accumulates the number of values in each power of
307  * ten bin. So 1<=x<10, 10<=x<100, ...
308  * Mostly useful where we have some big outliers and want to see information
309  * about them.
310  */
311 class logHistogram {
312  enum {
313  numBins = 31, /* Number of powers of 10. If this changes you need to change
314  * the initializer for binMax */
315 
316  /*
317  * If you want to use this to analyse values that may be less than 1, (for
318  * instance times in s), then the logOffset gives you negative powers.
319  * In our case here, we're just looking at times in ticks, or counts, so we
320  * can never see values with magnitude < 1 (other than zero), so we can set
321  * it to 0. As above change the initializer if you change this.
322  */
323  logOffset = 0
324  };
325  uint32_t KMP_ALIGN_CACHE zeroCount;
326  struct {
327  uint32_t count;
328  double total;
329  } bins[numBins];
330 
331  static double binMax[numBins];
332 
333 #ifdef KMP_DEBUG
334  uint64_t _total;
335 
336  void check() const {
337  uint64_t t = zeroCount;
338  for (int i = 0; i < numBins; i++)
339  t += bins[i].count;
340  KMP_DEBUG_ASSERT(t == _total);
341  }
342 #else
343  void check() const {}
344 #endif
345 
346 public:
347  logHistogram() { reset(); }
348 
349  logHistogram(logHistogram const &o) {
350  for (int i = 0; i < numBins; i++)
351  bins[i] = o.bins[i];
352 #ifdef KMP_DEBUG
353  _total = o._total;
354 #endif
355  }
356 
357  void reset() {
358  zeroCount = 0;
359  for (int i = 0; i < numBins; i++) {
360  bins[i].count = 0;
361  bins[i].total = 0;
362  }
363 
364 #ifdef KMP_DEBUG
365  _total = 0;
366 #endif
367  }
368  uint32_t count(int b) const { return bins[b + logOffset].count; }
369  double total(int b) const { return bins[b + logOffset].total; }
370  static uint32_t findBin(double sample);
371 
372  logHistogram &operator+=(logHistogram const &o) {
373  zeroCount += o.zeroCount;
374  for (int i = 0; i < numBins; i++) {
375  bins[i].count += o.bins[i].count;
376  bins[i].total += o.bins[i].total;
377  }
378 #ifdef KMP_DEBUG
379  _total += o._total;
380  check();
381 #endif
382 
383  return *this;
384  }
385 
386  void addSample(double sample);
387  int minBin() const;
388  int maxBin() const;
389 
390  std::string format(char) const;
391 };
392 
393 class statistic {
394  double KMP_ALIGN_CACHE minVal;
395  double maxVal;
396  double meanVal;
397  double m2;
398  uint64_t sampleCount;
399  double offset;
400  bool collectingHist;
401  logHistogram hist;
402 
403 public:
404  statistic(bool doHist = bool(KMP_STATS_HIST)) {
405  reset();
406  collectingHist = doHist;
407  }
408  statistic(statistic const &o)
409  : minVal(o.minVal), maxVal(o.maxVal), meanVal(o.meanVal), m2(o.m2),
410  sampleCount(o.sampleCount), offset(o.offset),
411  collectingHist(o.collectingHist), hist(o.hist) {}
412  statistic(double minv, double maxv, double meanv, uint64_t sc, double sd)
413  : minVal(minv), maxVal(maxv), meanVal(meanv), m2(sd * sd * sc),
414  sampleCount(sc), offset(0.0), collectingHist(false) {}
415  bool haveHist() const { return collectingHist; }
416  double getMin() const { return minVal; }
417  double getMean() const { return meanVal; }
418  double getMax() const { return maxVal; }
419  uint64_t getCount() const { return sampleCount; }
420  double getSD() const { return sqrt(m2 / sampleCount); }
421  double getTotal() const { return sampleCount * meanVal; }
422  logHistogram const *getHist() const { return &hist; }
423  void setOffset(double d) { offset = d; }
424 
425  void reset() {
426  minVal = (std::numeric_limits<double>::max)();
427  maxVal = -minVal;
428  meanVal = 0.0;
429  m2 = 0.0;
430  sampleCount = 0;
431  offset = 0.0;
432  hist.reset();
433  }
434  void addSample(double sample);
435  void scale(double factor);
436  void scaleDown(double f) { scale(1. / f); }
437  void forceCount(uint64_t count) { sampleCount = count; }
438  statistic &operator+=(statistic const &other);
439 
440  std::string format(char unit, bool total = false) const;
441  std::string formatHist(char unit) const { return hist.format(unit); }
442 };
443 
// Static description of one timer or counter. Field order is part of the
// layout and must not change.
struct statInfo {
  char const *name; // name returned by timeStat::name() / counter::name()
  uint32_t flags; // tested against stats_flags_e bits by the accessors
};
448 
449 class timeStat : public statistic {
450  static statInfo timerInfo[];
451 
452 public:
453  timeStat() : statistic() {}
454  static const char *name(timer_e e) { return timerInfo[e].name; }
455  static bool noTotal(timer_e e) {
456  return timerInfo[e].flags & stats_flags_e::noTotal;
457  }
458  static bool masterOnly(timer_e e) {
459  return timerInfo[e].flags & stats_flags_e::onlyInMaster;
460  }
461  static bool workerOnly(timer_e e) {
462  return timerInfo[e].flags & stats_flags_e::notInMaster;
463  }
464  static bool noUnits(timer_e e) {
465  return timerInfo[e].flags & stats_flags_e::noUnits;
466  }
467  static bool logEvent(timer_e e) {
468  return timerInfo[e].flags & stats_flags_e::logEvent;
469  }
470  static void clearEventFlags() {
471  for (int i = 0; i < TIMER_LAST; i++) {
472  timerInfo[i].flags &= (~(stats_flags_e::logEvent));
473  }
474  }
475 };
476 
477 // Where we need explicitly to start and end the timer, this version can be used
478 // Since these timers normally aren't nicely scoped, so don't have a good place
479 // to live on the stack of the thread, they're more work to use.
480 class explicitTimer {
481  timeStat *stat;
482  timer_e timerEnumValue;
483  tsc_tick_count startTime;
484  tsc_tick_count pauseStartTime;
485  tsc_tick_count::tsc_interval_t totalPauseTime;
486 
487 public:
488  explicitTimer(timeStat *s, timer_e te)
489  : stat(s), timerEnumValue(te), startTime(), pauseStartTime(0),
490  totalPauseTime() {}
491 
492  // void setStat(timeStat *s) { stat = s; }
493  void start(tsc_tick_count tick);
494  void pause(tsc_tick_count tick) { pauseStartTime = tick; }
495  void resume(tsc_tick_count tick) {
496  totalPauseTime += (tick - pauseStartTime);
497  }
498  void stop(tsc_tick_count tick, kmp_stats_list *stats_ptr = nullptr);
499  void reset() {
500  startTime = 0;
501  pauseStartTime = 0;
502  totalPauseTime = 0;
503  }
504  timer_e get_type() const { return timerEnumValue; }
505 };
506 
507 // Where you need to partition a thread's clock ticks into separate states
508 // e.g., a partitionedTimers class with two timers of EXECUTING_TASK, and
509 // DOING_NOTHING would render these conditions:
510 // time(EXECUTING_TASK) + time(DOING_NOTHING) = total time thread is alive
511 // No clock tick in the EXECUTING_TASK is a member of DOING_NOTHING and vice
512 // versa
513 class partitionedTimers {
514 private:
515  std::vector<explicitTimer> timer_stack;
516 
517 public:
518  partitionedTimers();
519  void init(explicitTimer timer);
520  void exchange(explicitTimer timer);
521  void push(explicitTimer timer);
522  void pop();
523  void windup();
524 };
525 
526 // Special wrapper around the partitioned timers to aid timing code blocks
527 // It avoids the need to have an explicit end, leaving the scope suffices.
528 class blockPartitionedTimer {
529  partitionedTimers *part_timers;
530 
531 public:
532  blockPartitionedTimer(partitionedTimers *pt, explicitTimer timer)
533  : part_timers(pt) {
534  part_timers->push(timer);
535  }
536  ~blockPartitionedTimer() { part_timers->pop(); }
537 };
538 
539 // Special wrapper around the thread state to aid in keeping state in code
540 // blocks. It avoids the need to have an explicit end, leaving the scope
541 // suffices.
542 class blockThreadState {
543  stats_state_e *state_pointer;
544  stats_state_e old_state;
545 
546 public:
547  blockThreadState(stats_state_e *thread_state_pointer, stats_state_e new_state)
548  : state_pointer(thread_state_pointer), old_state(*thread_state_pointer) {
549  *state_pointer = new_state;
550  }
551  ~blockThreadState() { *state_pointer = old_state; }
552 };
553 
554 // If all you want is a count, then you can use this...
555 // The individual per-thread counts will be aggregated into a statistic at
556 // program exit.
557 class counter {
558  uint64_t value;
559  static const statInfo counterInfo[];
560 
561 public:
562  counter() : value(0) {}
563  void increment() { value++; }
564  uint64_t getValue() const { return value; }
565  void reset() { value = 0; }
566  static const char *name(counter_e e) { return counterInfo[e].name; }
567  static bool masterOnly(counter_e e) {
568  return counterInfo[e].flags & stats_flags_e::onlyInMaster;
569  }
570 };
571 
572 /* ****************************************************************
573  Class to implement an event
574 
575  There are four components to an event: start time, stop time
576  nest_level, and timer_name.
577  The start and stop time should be obvious (recorded in clock ticks).
578  The nest_level relates to the bar width in the timeline graph.
579  The timer_name is used to determine which timer event triggered this event.
580 
581  the interface to this class is through four read-only operations:
582  1) getStart() -- returns the start time as 64 bit integer
583  2) getStop() -- returns the stop time as 64 bit integer
584  3) getNestLevel() -- returns the nest level of the event
585  4) getTimerName() -- returns the timer name that triggered event
586 
587  *MORE ON NEST_LEVEL*
588  The nest level is used in the bar graph that represents the timeline.
589  Its main purpose is for showing how events are nested inside each other.
590  For example, say events, A, B, and C are recorded. If the timeline
591  looks like this:
592 
593 Begin -------------------------------------------------------------> Time
594  | | | | | |
595  A B C C B A
596  start start start end end end
597 
598  Then A, B, C will have a nest level of 1, 2, 3 respectively.
599  These values are then used to calculate the barwidth so you can
600  see that inside A, B has occurred, and inside B, C has occurred.
601  Currently, this is shown with A's bar width being larger than B's
602  bar width, and B's bar width being larger than C's bar width.
603 
604 **************************************************************** */
605 class kmp_stats_event {
606  uint64_t start;
607  uint64_t stop;
608  int nest_level;
609  timer_e timer_name;
610 
611 public:
612  kmp_stats_event()
613  : start(0), stop(0), nest_level(0), timer_name(TIMER_LAST) {}
614  kmp_stats_event(uint64_t strt, uint64_t stp, int nst, timer_e nme)
615  : start(strt), stop(stp), nest_level(nst), timer_name(nme) {}
616  inline uint64_t getStart() const { return start; }
617  inline uint64_t getStop() const { return stop; }
618  inline int getNestLevel() const { return nest_level; }
619  inline timer_e getTimerName() const { return timer_name; }
620 };
621 
622 /* ****************************************************************
623  Class to implement a dynamically expandable array of events
624 
625  ---------------------------------------------------------
626  | event 1 | event 2 | event 3 | event 4 | ... | event N |
627  ---------------------------------------------------------
628 
629  An event is pushed onto the back of this array at every
630  explicitTimer->stop() call. The event records the thread #,
631  start time, stop time, and nest level related to the bar width.
632 
633  The event vector starts at size INIT_SIZE and grows (doubles in size)
634  if needed. An implication of this behavior is that log(N)
635  reallocations are needed (where N is number of events). If you want
636  to avoid reallocations, then set INIT_SIZE to a large value.
637 
638  the interface to this class is through six operations:
639  1) reset() -- sets the internal_size back to 0 but does not deallocate any
640  memory
641  2) size() -- returns the number of valid elements in the vector
642  3) push_back(start, stop, nest, timer_name) -- pushes an event onto
643  the back of the array
644  4) deallocate() -- frees all memory associated with the vector
645  5) sort() -- sorts the vector by start time
646  6) operator[index] or at(index) -- returns event reference at that index
647 **************************************************************** */
648 class kmp_stats_event_vector {
649  kmp_stats_event *events;
650  int internal_size;
651  int allocated_size;
652  static const int INIT_SIZE = 1024;
653 
654 public:
655  kmp_stats_event_vector() {
656  events =
657  (kmp_stats_event *)__kmp_allocate(sizeof(kmp_stats_event) * INIT_SIZE);
658  internal_size = 0;
659  allocated_size = INIT_SIZE;
660  }
661  ~kmp_stats_event_vector() {}
662  inline void reset() { internal_size = 0; }
663  inline int size() const { return internal_size; }
664  void push_back(uint64_t start_time, uint64_t stop_time, int nest_level,
665  timer_e name) {
666  int i;
667  if (internal_size == allocated_size) {
668  kmp_stats_event *tmp = (kmp_stats_event *)__kmp_allocate(
669  sizeof(kmp_stats_event) * allocated_size * 2);
670  for (i = 0; i < internal_size; i++)
671  tmp[i] = events[i];
672  __kmp_free(events);
673  events = tmp;
674  allocated_size *= 2;
675  }
676  events[internal_size] =
677  kmp_stats_event(start_time, stop_time, nest_level, name);
678  internal_size++;
679  return;
680  }
681  void deallocate();
682  void sort();
683  const kmp_stats_event &operator[](int index) const { return events[index]; }
684  kmp_stats_event &operator[](int index) { return events[index]; }
685  const kmp_stats_event &at(int index) const { return events[index]; }
686  kmp_stats_event &at(int index) { return events[index]; }
687 };
688 
689 /* ****************************************************************
690  Class to implement a doubly-linked, circular, statistics list
691 
692  |---| ---> |---| ---> |---| ---> |---| ---> ... next
693  | | | | | | | |
694  |---| <--- |---| <--- |---| <--- |---| <--- ... prev
695  Sentinel first second third
696  Node node node node
697 
698  The Sentinel Node is the user handle on the list.
699  The first node corresponds to thread 0's statistics.
700  The second node corresponds to thread 1's statistics and so on...
701 
702  Each node has a _timers, _counters, and _explicitTimers array to hold that
703  thread's statistics. The _explicitTimers point to the correct _timer and
704  update its statistics at every stop() call. The explicitTimers' pointers are
705  set up in the constructor. Each node also has an event vector to hold that
706  thread's timing events. The event vector expands as necessary and records
707  the start-stop times for each timer.
708 
709  The nestLevel variable is for plotting events and is related
710  to the bar width in the timeline graph.
711 
712  Every thread will have a thread local pointer to its node in
713  the list. The sentinel node is used by the master thread to
714  store "dummy" statistics before __kmp_create_worker() is called.
715 **************************************************************** */
716 class kmp_stats_list {
717  int gtid;
718  timeStat _timers[TIMER_LAST + 1];
719  counter _counters[COUNTER_LAST + 1];
720  explicitTimer thread_life_timer;
721  partitionedTimers _partitionedTimers;
722  int _nestLevel; // one per thread
723  kmp_stats_event_vector _event_vector;
724  kmp_stats_list *next;
725  kmp_stats_list *prev;
726  stats_state_e state;
727  int thread_is_idle_flag;
728 
729 public:
730  kmp_stats_list()
731  : thread_life_timer(&_timers[TIMER_OMP_worker_thread_life],
732  TIMER_OMP_worker_thread_life),
733  _nestLevel(0), _event_vector(), next(this), prev(this), state(IDLE),
734  thread_is_idle_flag(0) {}
735  ~kmp_stats_list() {}
736  inline timeStat *getTimer(timer_e idx) { return &_timers[idx]; }
737  inline counter *getCounter(counter_e idx) { return &_counters[idx]; }
738  inline partitionedTimers *getPartitionedTimers() {
739  return &_partitionedTimers;
740  }
741  inline timeStat *getTimers() { return _timers; }
742  inline counter *getCounters() { return _counters; }
743  inline kmp_stats_event_vector &getEventVector() { return _event_vector; }
744  inline void startLife() { thread_life_timer.start(tsc_tick_count::now()); }
745  inline void endLife() { thread_life_timer.stop(tsc_tick_count::now(), this); }
746  inline void resetEventVector() { _event_vector.reset(); }
747  inline void incrementNestValue() { _nestLevel++; }
748  inline int getNestValue() { return _nestLevel; }
749  inline void decrementNestValue() { _nestLevel--; }
750  inline int getGtid() const { return gtid; }
751  inline void setGtid(int newgtid) { gtid = newgtid; }
752  inline void setState(stats_state_e newstate) { state = newstate; }
753  inline stats_state_e getState() const { return state; }
754  inline stats_state_e *getStatePointer() { return &state; }
755  inline bool isIdle() { return thread_is_idle_flag == 1; }
756  inline void setIdleFlag() { thread_is_idle_flag = 1; }
757  inline void resetIdleFlag() { thread_is_idle_flag = 0; }
758  kmp_stats_list *push_back(int gtid); // returns newly created list node
759  inline void push_event(uint64_t start_time, uint64_t stop_time,
760  int nest_level, timer_e name) {
761  _event_vector.push_back(start_time, stop_time, nest_level, name);
762  }
763  void deallocate();
764  class iterator;
765  kmp_stats_list::iterator begin();
766  kmp_stats_list::iterator end();
767  int size();
768  class iterator {
769  kmp_stats_list *ptr;
770  friend kmp_stats_list::iterator kmp_stats_list::begin();
771  friend kmp_stats_list::iterator kmp_stats_list::end();
772 
773  public:
774  iterator();
775  ~iterator();
776  iterator operator++();
777  iterator operator++(int dummy);
778  iterator operator--();
779  iterator operator--(int dummy);
780  bool operator!=(const iterator &rhs);
781  bool operator==(const iterator &rhs);
782  kmp_stats_list *operator*() const; // dereference operator
783  };
784 };
785 
786 /* ****************************************************************
787  Class to encapsulate all output functions and the environment variables
788 
789  This module holds filenames for various outputs (normal stats, events, plot
790  file), as well as coloring information for the plot file.
791 
792  The filenames and flags variables are read from environment variables.
793  These are read once by the constructor of the global variable
794  __kmp_stats_output which calls init().
795 
796  During this init() call, event flags for the timeStat::timerInfo[] global
797  array are cleared if KMP_STATS_EVENTS is not true (on, 1, yes).
798 
799  The only interface function that is public is outputStats(heading). This
800  function should print out everything it needs to, either to files or stderr,
801  depending on the environment variables described below
802 
803  ENVIRONMENT VARIABLES:
804  KMP_STATS_FILE -- if set, all statistics (not events) will be printed to this
805  file, otherwise, print to stderr
806  KMP_STATS_THREADS -- if set to "on", then will print per thread statistics to
807  either KMP_STATS_FILE or stderr
808  KMP_STATS_PLOT_FILE -- if set, print the ploticus plot file to this filename,
809  otherwise, the plot file is sent to "events.plt"
810  KMP_STATS_EVENTS -- if set to "on", then log events, otherwise, don't log
811  events
812  KMP_STATS_EVENTS_FILE -- if set, all events are outputted to this file,
813  otherwise, output is sent to "events.dat"
814 **************************************************************** */
815 class kmp_stats_output_module {
816 
817 public:
818  struct rgb_color {
819  float r;
820  float g;
821  float b;
822  };
823 
824 private:
825  std::string outputFileName;
826  static const char *eventsFileName;
827  static const char *plotFileName;
828  static int printPerThreadFlag;
829  static int printPerThreadEventsFlag;
830  static const rgb_color globalColorArray[];
831  static rgb_color timerColorInfo[];
832 
833  void init();
834  static void setupEventColors();
835  static void printPloticusFile();
836  static void printHeaderInfo(FILE *statsOut);
837  static void printTimerStats(FILE *statsOut, statistic const *theStats,
838  statistic const *totalStats);
839  static void printCounterStats(FILE *statsOut, statistic const *theStats);
840  static void printCounters(FILE *statsOut, counter const *theCounters);
841  static void printEvents(FILE *eventsOut, kmp_stats_event_vector *theEvents,
842  int gtid);
843  static rgb_color getEventColor(timer_e e) { return timerColorInfo[e]; }
844  static void windupExplicitTimers();
845  bool eventPrintingEnabled() const { return printPerThreadEventsFlag; }
846 
847 public:
848  kmp_stats_output_module() { init(); }
849  void outputStats(const char *heading);
850 };
851 
852 #ifdef __cplusplus
853 extern "C" {
854 #endif
855 void __kmp_stats_init();
856 void __kmp_stats_fini();
857 void __kmp_reset_stats();
858 void __kmp_output_stats(const char *);
859 void __kmp_accumulate_stats_at_exit(void);
860 // thread local pointer to stats node within list
861 extern KMP_THREAD_LOCAL kmp_stats_list *__kmp_stats_thread_ptr;
862 // head to stats list.
863 extern kmp_stats_list *__kmp_stats_list;
864 // lock for __kmp_stats_list
865 extern kmp_tas_lock_t __kmp_stats_lock;
866 // reference start time
867 extern tsc_tick_count __kmp_stats_start_time;
868 // interface to output
869 extern kmp_stats_output_module __kmp_stats_output;
870 
871 #ifdef __cplusplus
872 }
873 #endif
874 
875 // Simple, standard interfaces that drop out completely if stats aren't enabled
876 
// Adds `value`, converted to double, as one sample of the timer statistic
// TIMER_<name> of the calling thread.
// `value` is parenthesised so that the cast applies to the whole argument:
// with a bare `(double)value`, an argument such as `a / b` would expand to
// `(double)a / b` and silently turn an integer division into a floating-point
// one.
#define KMP_COUNT_VALUE(name, value)                                           \
  __kmp_stats_thread_ptr->getTimer(TIMER_##name)->addSample((double)(value))
890 
// Increments the calling thread's counter COUNTER_<name> by one.
#define KMP_COUNT_BLOCK(name)                                                  \
  __kmp_stats_thread_ptr                                                       \
      ->getCounter(COUNTER_##name)                                             \
      ->increment()
903 
// Forwards the heading to __kmp_output_stats().
#define KMP_OUTPUT_STATS(heading) __kmp_output_stats(heading)
922 
// Partitioned-timer helpers. Each one that takes a name builds an
// explicitTimer bound to the calling thread's TIMER_<name> statistic and hands
// it to the thread's partitionedTimers object.

// Calls init() with a timer for TIMER_<name>.
#define KMP_INIT_PARTITIONED_TIMERS(name)                                      \
  __kmp_stats_thread_ptr->getPartitionedTimers()->init(                        \
      explicitTimer(__kmp_stats_thread_ptr->getTimer(TIMER_##name),            \
                    TIMER_##name))

// Declares a blockPartitionedTimer named __PBLOCKTIME__ in the current scope;
// it pushes the timer now and pops it when the scope is left.
#define KMP_TIME_PARTITIONED_BLOCK(name)                                       \
  blockPartitionedTimer __PBLOCKTIME__(                                        \
      __kmp_stats_thread_ptr->getPartitionedTimers(),                          \
      explicitTimer(__kmp_stats_thread_ptr->getTimer(TIMER_##name),            \
                    TIMER_##name))

// Calls push() with a timer for TIMER_<name>.
#define KMP_PUSH_PARTITIONED_TIMER(name)                                       \
  __kmp_stats_thread_ptr->getPartitionedTimers()->push(                        \
      explicitTimer(__kmp_stats_thread_ptr->getTimer(TIMER_##name),            \
                    TIMER_##name))

// Calls pop() on the calling thread's partitioned timers.
#define KMP_POP_PARTITIONED_TIMER()                                            \
  __kmp_stats_thread_ptr->getPartitionedTimers()->pop()

// Calls exchange() with a timer for TIMER_<name>.
#define KMP_EXCHANGE_PARTITIONED_TIMER(name)                                   \
  __kmp_stats_thread_ptr->getPartitionedTimers()->exchange(                    \
      explicitTimer(__kmp_stats_thread_ptr->getTimer(TIMER_##name),            \
                    TIMER_##name))
950 
// Thread-state helpers; all of them act on the calling thread's stats node.

// Stores state_name as the thread's state.
#define KMP_SET_THREAD_STATE(state_name)                                       \
  __kmp_stats_thread_ptr->setState(state_name)

// Evaluates to the thread's current state.
#define KMP_GET_THREAD_STATE() __kmp_stats_thread_ptr->getState()

// Declares a blockThreadState named __BTHREADSTATE__ in the current scope: the
// state becomes state_name now and the previous state is written back when the
// scope is left.
#define KMP_SET_THREAD_STATE_BLOCK(state_name)                                 \
  blockThreadState __BTHREADSTATE__(                                           \
      __kmp_stats_thread_ptr->getStatePointer(), state_name)
959 
// Calls __kmp_reset_stats().
#define KMP_RESET_STATS() __kmp_reset_stats()
968 
// Developer-statistics variants of the macros above. With KMP_DEVELOPER_STATS
// enabled they forward to the regular macros; otherwise they expand to a
// no-op expression.
#if (KMP_DEVELOPER_STATS)
#define KMP_COUNT_DEVELOPER_VALUE(n, v) KMP_COUNT_VALUE(n, v)
#define KMP_COUNT_DEVELOPER_BLOCK(n) KMP_COUNT_BLOCK(n)
#define KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(n) KMP_TIME_PARTITIONED_BLOCK(n)
#define KMP_PUSH_DEVELOPER_PARTITIONED_TIMER(n) KMP_PUSH_PARTITIONED_TIMER(n)
// KMP_POP_PARTITIONED_TIMER takes no parameters, so `n` must not be forwarded:
// passing it on is a preprocessing error whenever a timer name is supplied.
// The parameter is kept (and ignored) so existing call sites stay valid.
#define KMP_POP_DEVELOPER_PARTITIONED_TIMER(n) KMP_POP_PARTITIONED_TIMER()
#define KMP_EXCHANGE_DEVELOPER_PARTITIONED_TIMER(n)                            \
  KMP_EXCHANGE_PARTITIONED_TIMER(n)
#else
// Null definitions
#define KMP_COUNT_DEVELOPER_VALUE(n, v) ((void)0)
#define KMP_COUNT_DEVELOPER_BLOCK(n) ((void)0)
#define KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(n) ((void)0)
#define KMP_PUSH_DEVELOPER_PARTITIONED_TIMER(n) ((void)0)
#define KMP_POP_DEVELOPER_PARTITIONED_TIMER(n) ((void)0)
#define KMP_EXCHANGE_DEVELOPER_PARTITIONED_TIMER(n) ((void)0)
#endif
986 
987 #else // KMP_STATS_ENABLED
988 
989 // Null definitions
// No-op versions of every statistics macro, used when statistics are
// disabled. Each expands to ((void)0) so that call sites still form a valid
// expression statement. Arguments are discarded without being evaluated.
#define KMP_COUNT_VALUE(n, v) ((void)0)
#define KMP_COUNT_BLOCK(n) ((void)0)

#define KMP_OUTPUT_STATS(heading_string) ((void)0)
#define KMP_RESET_STATS() ((void)0)

#define KMP_COUNT_DEVELOPER_VALUE(n, v) ((void)0)
#define KMP_COUNT_DEVELOPER_BLOCK(n) ((void)0)
#define KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(n) ((void)0)
#define KMP_PUSH_DEVELOPER_PARTITIONED_TIMER(n) ((void)0)
#define KMP_POP_DEVELOPER_PARTITIONED_TIMER(n) ((void)0)
#define KMP_EXCHANGE_DEVELOPER_PARTITIONED_TIMER(n) ((void)0)
#define KMP_INIT_PARTITIONED_TIMERS(name) ((void)0)
#define KMP_TIME_PARTITIONED_BLOCK(name) ((void)0)
#define KMP_PUSH_PARTITIONED_TIMER(name) ((void)0)
#define KMP_POP_PARTITIONED_TIMER() ((void)0)
// The enabled branch defines KMP_EXCHANGE_PARTITIONED_TIMER, so it needs a
// null definition too; without one, any use outside a stats-only region
// fails to compile when statistics are off.
#define KMP_EXCHANGE_PARTITIONED_TIMER(name) ((void)0)
#define KMP_SET_THREAD_STATE(state_name) ((void)0)
#define KMP_GET_THREAD_STATE() ((void)0)
#define KMP_SET_THREAD_STATE_BLOCK(state_name) ((void)0)
1009 #endif // KMP_STATS_ENABLED
1010 
1011 #endif // KMP_STATS_H
stats_flags_e
flags to describe the statistic (timer or counter)
Definition: kmp_stats.h:49
#define KMP_FOREACH_COUNTER(macro, arg)
Add new counters under KMP_FOREACH_COUNTER() macro in kmp_stats.h.
Definition: kmp_stats.h:95
#define KMP_FOREACH_EXPLICIT_TIMER(macro, arg)
Add new explicit timers under KMP_FOREACH_EXPLICIT_TIMER() macro.
Definition: kmp_stats.h:293
stats_state_e
the states which a thread can be in
Definition: kmp_stats.h:63
@ notInMaster
statistic is valid only for non-master threads
Definition: kmp_stats.h:53
@ noUnits
statistic doesn't need units printed next to it
Definition: kmp_stats.h:52
@ logEvent
Definition: kmp_stats.h:54
@ noTotal
do not show a TOTAL_aggregation for this statistic
Definition: kmp_stats.h:50
@ onlyInMaster
statistic is valid only for master
Definition: kmp_stats.h:51