LLVM OpenMP* Runtime Library
Loading...
Searching...
No Matches
kmp_stats.h
1#ifndef KMP_STATS_H
2#define KMP_STATS_H
3
8//===----------------------------------------------------------------------===//
9//
10// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
11// See https://llvm.org/LICENSE.txt for license information.
12// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
13//
14//===----------------------------------------------------------------------===//
15
16#include "kmp_config.h"
17#include "kmp_debug.h"
18
19#if KMP_STATS_ENABLED
20/* Statistics accumulator.
21 Accumulates number of samples and computes min, max, mean, standard deviation
22 on the fly.
23
24 Online variance calculation algorithm from
25 http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#On-line_algorithm
26 */
27
28#include "kmp_stats_timing.h"
29#include <limits>
30#include <math.h>
31#include <new> // placement new
32#include <stdint.h>
33#include <string>
34#include <vector>
35
36/* Enable developer statistics here if you want them. They are more detailed
37 than is useful for application characterisation and are intended for the
38 runtime library developer. */
39#define KMP_DEVELOPER_STATS 0
40
41/* Enable/Disable histogram output */
42#define KMP_STATS_HIST 0
43
50 noTotal = 1 << 0,
51 onlyInMaster = 1 << 1,
52 noUnits = 1 << 2,
53 notInMaster = 1 << 3,
54 logEvent = 1 << 4
56};
57
64 IDLE,
65 SERIAL_REGION,
66 FORK_JOIN_BARRIER,
67 PLAIN_BARRIER,
68 TASKWAIT,
69 TASKYIELD,
70 TASKGROUP,
71 IMPLICIT_TASK,
72 EXPLICIT_TASK,
73 TEAMS_REGION
74};
75
94// clang-format off
// X-macro list of the statistics counters. Expands to one
// `macro(name, flags, arg)` invocation per counter: `name` is the counter's
// identifier, `flags` is 0 or an OR of stats_flags_e bits, and `arg` is passed
// through unchanged to every invocation.
95#define KMP_FOREACH_COUNTER(macro, arg) \
96 macro(OMP_PARALLEL,stats_flags_e::onlyInMaster|stats_flags_e::noTotal,arg) \
97 macro(OMP_NESTED_PARALLEL, 0, arg) \
98 macro(OMP_LOOP_STATIC, 0, arg) \
99 macro(OMP_LOOP_STATIC_STEAL, 0, arg) \
100 macro(OMP_LOOP_DYNAMIC, 0, arg) \
101 macro(OMP_DISTRIBUTE, 0, arg) \
102 macro(OMP_BARRIER, 0, arg) \
103 macro(OMP_CRITICAL, 0, arg) \
104 macro(OMP_SINGLE, 0, arg) \
105 macro(OMP_MASTER, 0, arg) \
106 macro(OMP_MASKED, 0, arg) \
107 macro(OMP_TEAMS, 0, arg) \
108 macro(OMP_set_lock, 0, arg) \
109 macro(OMP_test_lock, 0, arg) \
110 macro(REDUCE_wait, 0, arg) \
111 macro(REDUCE_nowait, 0, arg) \
112 macro(OMP_TASKYIELD, 0, arg) \
113 macro(OMP_TASKLOOP, 0, arg) \
114 macro(TASK_executed, 0, arg) \
115 macro(TASK_cancelled, 0, arg) \
116 macro(TASK_stolen, 0, arg)
117// clang-format on
118
137// clang-format off
// X-macro list of the statistics timers. Expands to one
// `macro(name, flags, arg)` invocation per timer, followed by the entries of
// KMP_FOREACH_DEVELOPER_TIMER. `flags` is 0 or an OR of stats_flags_e bits and
// `arg` is passed through unchanged. The trailing entries flagged
// noUnits | noTotal are, per the descriptions in this file, plain values
// (thread counts, argument counts, iteration counts) rather than times.
138#define KMP_FOREACH_TIMER(macro, arg) \
139 macro (OMP_worker_thread_life, stats_flags_e::logEvent, arg) \
140 macro (OMP_parallel, stats_flags_e::logEvent, arg) \
141 macro (OMP_parallel_overhead, stats_flags_e::logEvent, arg) \
142 macro (OMP_teams, stats_flags_e::logEvent, arg) \
143 macro (OMP_teams_overhead, stats_flags_e::logEvent, arg) \
144 macro (OMP_loop_static, 0, arg) \
145 macro (OMP_loop_static_scheduling, 0, arg) \
146 macro (OMP_loop_dynamic, 0, arg) \
147 macro (OMP_loop_dynamic_scheduling, 0, arg) \
148 macro (OMP_distribute, 0, arg) \
149 macro (OMP_distribute_scheduling, 0, arg) \
150 macro (OMP_critical, 0, arg) \
151 macro (OMP_critical_wait, 0, arg) \
152 macro (OMP_single, 0, arg) \
153 macro (OMP_master, 0, arg) \
154 macro (OMP_masked, 0, arg) \
155 macro (OMP_task_immediate, 0, arg) \
156 macro (OMP_task_taskwait, 0, arg) \
157 macro (OMP_task_taskyield, 0, arg) \
158 macro (OMP_task_taskgroup, 0, arg) \
159 macro (OMP_task_join_bar, 0, arg) \
160 macro (OMP_task_plain_bar, 0, arg) \
161 macro (OMP_taskloop_scheduling, 0, arg) \
162 macro (OMP_plain_barrier, stats_flags_e::logEvent, arg) \
163 macro (OMP_idle, stats_flags_e::logEvent, arg) \
164 macro (OMP_fork_barrier, stats_flags_e::logEvent, arg) \
165 macro (OMP_join_barrier, stats_flags_e::logEvent, arg) \
166 macro (OMP_serial, stats_flags_e::logEvent, arg) \
167 macro (OMP_set_numthreads, stats_flags_e::noUnits | stats_flags_e::noTotal, \
168 arg) \
169 macro (OMP_PARALLEL_args, stats_flags_e::noUnits | stats_flags_e::noTotal, \
170 arg) \
171 macro (OMP_loop_static_iterations, \
172 stats_flags_e::noUnits | stats_flags_e::noTotal, arg) \
173 macro (OMP_loop_static_total_iterations, \
174 stats_flags_e::noUnits | stats_flags_e::noTotal, arg) \
175 macro (OMP_loop_dynamic_iterations, \
176 stats_flags_e::noUnits | stats_flags_e::noTotal, arg) \
177 macro (OMP_loop_dynamic_total_iterations, \
178 stats_flags_e::noUnits | stats_flags_e::noTotal, arg) \
179 macro (OMP_distribute_iterations, \
180 stats_flags_e::noUnits | stats_flags_e::noTotal, arg) \
181 KMP_FOREACH_DEVELOPER_TIMER(macro, arg)
182// clang-format on
183
184// OMP_worker_thread_life -- Time from thread becoming an OpenMP thread (either
185// initializing OpenMP or being created by a primary
186// thread) until the thread is destroyed
187// OMP_parallel -- Time thread spends executing work directly
188// within a #pragma omp parallel
189// OMP_parallel_overhead -- Time thread spends setting up a parallel region
190// OMP_loop_static -- Time thread spends executing loop iterations from
191// a statically scheduled loop
192// OMP_loop_static_scheduling -- Time thread spends scheduling loop iterations
193// from a statically scheduled loop
194// OMP_loop_dynamic -- Time thread spends executing loop iterations from
195// a dynamically scheduled loop
196// OMP_loop_dynamic_scheduling -- Time thread spends scheduling loop iterations
197// from a dynamically scheduled loop
198// OMP_critical -- Time thread spends executing critical section
199// OMP_critical_wait -- Time thread spends waiting to enter
200// a critical section
201// OMP_single -- Time spent executing a "single" region
202// OMP_master -- Time spent executing a "master" region
203// OMP_masked -- Time spent executing a "masked" region
204// OMP_task_immediate -- Time spent executing non-deferred tasks
205// OMP_task_taskwait -- Time spent executing tasks inside a taskwait
206// construct
207// OMP_task_taskyield -- Time spent executing tasks inside a taskyield
208// construct
209// OMP_task_taskgroup -- Time spent executing tasks inside a taskgroup
210// construct
211// OMP_task_join_bar -- Time spent executing tasks inside a join barrier
212// OMP_task_plain_bar -- Time spent executing tasks inside a barrier
213// construct
214// OMP_taskloop_scheduling -- Time spent scheduling tasks inside a taskloop
215// construct
216// OMP_plain_barrier -- Time spent in a #pragma omp barrier construct or
217// inside implicit barrier at end of worksharing
218// construct
219// OMP_idle -- Time worker threads spend waiting for next
220// parallel region
221// OMP_fork_barrier -- Time spent in the fork barrier surrounding a
222// parallel region
223// OMP_join_barrier -- Time spent in the join barrier surrounding a
224// parallel region
225// OMP_serial -- Time thread zero spends executing serial code
226// OMP_set_numthreads -- Values passed to omp_set_num_threads
227// OMP_PARALLEL_args -- Number of arguments passed to a parallel region
228// OMP_loop_static_iterations -- Number of iterations thread is assigned for
229// statically scheduled loops
230// OMP_loop_dynamic_iterations -- Number of iterations thread is assigned for
231// dynamically scheduled loops
232
233#if (KMP_DEVELOPER_STATS)
234// Timers which are of interest to runtime library developers, not end users.
235// These have to be explicitly enabled in addition to the other stats.
236
237// KMP_fork_barrier -- time in __kmp_fork_barrier
238// KMP_join_barrier -- time in __kmp_join_barrier
239// KMP_barrier -- time in __kmp_barrier
240// KMP_end_split_barrier -- time in __kmp_end_split_barrier
241// KMP_setup_icv_copy -- time in __kmp_setup_icv_copy
242// KMP_icv_copy -- start/stop timer for any ICV copying
243// KMP_linear_gather -- time in __kmp_linear_barrier_gather
244// KMP_linear_release -- time in __kmp_linear_barrier_release
245// KMP_tree_gather -- time in __kmp_tree_barrier_gather
246// KMP_tree_release -- time in __kmp_tree_barrier_release
247// KMP_hyper_gather -- time in __kmp_hyper_barrier_gather
248// KMP_hyper_release -- time in __kmp_hyper_barrier_release
249// KMP_dist_gather -- time in __kmp_dist_barrier_gather
250// KMP_dist_release -- time in __kmp_dist_barrier_release
251// clang-format off
// X-macro list of the developer-only timers; this definition is the one used
// when KMP_DEVELOPER_STATS is non-zero. Expands to one
// `macro(name, flags, arg)` invocation per timer and is appended to
// KMP_FOREACH_TIMER. The two FOR_static_steal_* entries are flagged
// noUnits | noTotal.
252#define KMP_FOREACH_DEVELOPER_TIMER(macro, arg) \
253 macro(KMP_fork_call, 0, arg) \
254 macro(KMP_join_call, 0, arg) \
255 macro(KMP_end_split_barrier, 0, arg) \
256 macro(KMP_hier_gather, 0, arg) \
257 macro(KMP_hier_release, 0, arg) \
258 macro(KMP_hyper_gather, 0, arg) \
259 macro(KMP_hyper_release, 0, arg) \
260 macro(KMP_dist_gather, 0, arg) \
261 macro(KMP_dist_release, 0, arg) \
262 macro(KMP_linear_gather, 0, arg) \
263 macro(KMP_linear_release, 0, arg) \
264 macro(KMP_tree_gather, 0, arg) \
265 macro(KMP_tree_release, 0, arg) \
266 macro(USER_resume, 0, arg) \
267 macro(USER_suspend, 0, arg) \
268 macro(USER_mwait, 0, arg) \
269 macro(KMP_allocate_team, 0, arg) \
270 macro(KMP_setup_icv_copy, 0, arg) \
271 macro(USER_icv_copy, 0, arg) \
272 macro (FOR_static_steal_stolen, \
273 stats_flags_e::noUnits | stats_flags_e::noTotal, arg) \
274 macro (FOR_static_steal_chunks, \
275 stats_flags_e::noUnits | stats_flags_e::noTotal, arg)
276#else
277#define KMP_FOREACH_DEVELOPER_TIMER(macro, arg)
278#endif
279// clang-format on
280
300#define KMP_FOREACH_EXPLICIT_TIMER(macro, arg) KMP_FOREACH_TIMER(macro, arg)
301
302#define ENUMERATE(name, ignore, prefix) prefix##name,
303enum timer_e { KMP_FOREACH_TIMER(ENUMERATE, TIMER_) TIMER_LAST };
304
305enum explicit_timer_e {
306 KMP_FOREACH_EXPLICIT_TIMER(ENUMERATE, EXPLICIT_TIMER_) EXPLICIT_TIMER_LAST
307};
308
309enum counter_e { KMP_FOREACH_COUNTER(ENUMERATE, COUNTER_) COUNTER_LAST };
310#undef ENUMERATE
311
312/*
313 * A logarithmic histogram. It accumulates the number of values in each power of
314 * ten bin. So 1<=x<10, 10<=x<100, ...
315 * Mostly useful where we have some big outliers and want to see information
316 * about them.
317 */
318class logHistogram {
319 enum {
320 numBins = 31, /* Number of powers of 10. If this changes you need to change
321 * the initializer for binMax */
322
323 /*
324 * If you want to use this to analyse values that may be less than 1, (for
325 * instance times in s), then the logOffset gives you negative powers.
326 * In our case here, we're just looking at times in ticks, or counts, so we
327 * can never see values with magnitude < 1 (other than zero), so we can set
328 * it to 0. As above change the initializer if you change this.
329 */
330 logOffset = 0
331 };
332 uint32_t KMP_ALIGN_CACHE zeroCount;
333 struct {
334 uint32_t count;
335 double total;
336 } bins[numBins];
337
338 static double binMax[numBins];
339
340#ifdef KMP_DEBUG
341 uint64_t _total;
342
343 void check() const {
344 uint64_t t = zeroCount;
345 for (int i = 0; i < numBins; i++)
346 t += bins[i].count;
347 KMP_DEBUG_ASSERT(t == _total);
348 }
349#else
350 void check() const {}
351#endif
352
353public:
354 logHistogram() { reset(); }
355
356 logHistogram(logHistogram const &o) {
357 for (int i = 0; i < numBins; i++)
358 bins[i] = o.bins[i];
359#ifdef KMP_DEBUG
360 _total = o._total;
361#endif
362 }
363
364 void reset() {
365 zeroCount = 0;
366 for (int i = 0; i < numBins; i++) {
367 bins[i].count = 0;
368 bins[i].total = 0;
369 }
370
371#ifdef KMP_DEBUG
372 _total = 0;
373#endif
374 }
375 uint32_t count(int b) const { return bins[b + logOffset].count; }
376 double total(int b) const { return bins[b + logOffset].total; }
377 static uint32_t findBin(double sample);
378
379 logHistogram &operator+=(logHistogram const &o) {
380 zeroCount += o.zeroCount;
381 for (int i = 0; i < numBins; i++) {
382 bins[i].count += o.bins[i].count;
383 bins[i].total += o.bins[i].total;
384 }
385#ifdef KMP_DEBUG
386 _total += o._total;
387 check();
388#endif
389
390 return *this;
391 }
392
393 void addSample(double sample);
394 int minBin() const;
395 int maxBin() const;
396
397 std::string format(char) const;
398};
399
400class statistic {
401 double KMP_ALIGN_CACHE minVal;
402 double maxVal;
403 double meanVal;
404 double m2;
405 uint64_t sampleCount;
406 double offset;
407 bool collectingHist;
408 logHistogram hist;
409
410public:
411 statistic(bool doHist = bool(KMP_STATS_HIST)) {
412 reset();
413 collectingHist = doHist;
414 }
415 statistic(statistic const &o)
416 : minVal(o.minVal), maxVal(o.maxVal), meanVal(o.meanVal), m2(o.m2),
417 sampleCount(o.sampleCount), offset(o.offset),
418 collectingHist(o.collectingHist), hist(o.hist) {}
419 statistic(double minv, double maxv, double meanv, uint64_t sc, double sd)
420 : minVal(minv), maxVal(maxv), meanVal(meanv), m2(sd * sd * sc),
421 sampleCount(sc), offset(0.0), collectingHist(false) {}
422 bool haveHist() const { return collectingHist; }
423 double getMin() const { return minVal; }
424 double getMean() const { return meanVal; }
425 double getMax() const { return maxVal; }
426 uint64_t getCount() const { return sampleCount; }
427 double getSD() const { return sqrt(m2 / sampleCount); }
428 double getTotal() const { return sampleCount * meanVal; }
429 logHistogram const *getHist() const { return &hist; }
430 void setOffset(double d) { offset = d; }
431
432 void reset() {
433 minVal = (std::numeric_limits<double>::max)();
434 maxVal = -minVal;
435 meanVal = 0.0;
436 m2 = 0.0;
437 sampleCount = 0;
438 offset = 0.0;
439 hist.reset();
440 }
441 void addSample(double sample);
442 void scale(double factor);
443 void scaleDown(double f) { scale(1. / f); }
444 void forceCount(uint64_t count) { sampleCount = count; }
445 statistic &operator+=(statistic const &other);
446
447 std::string format(char unit, bool total = false) const;
448 std::string formatHist(char unit) const { return hist.format(unit); }
449};
450
// Static description of one timer or counter.
struct statInfo {
  char const *name; // printable name of the statistic
  uint32_t flags; // tested against stats_flags_e bits by timeStat / counter
};
455
456class timeStat : public statistic {
457 static statInfo timerInfo[];
458
459public:
460 timeStat() : statistic() {}
461 static const char *name(timer_e e) { return timerInfo[e].name; }
462 static bool noTotal(timer_e e) {
463 return timerInfo[e].flags & stats_flags_e::noTotal;
464 }
465 static bool masterOnly(timer_e e) {
466 return timerInfo[e].flags & stats_flags_e::onlyInMaster;
467 }
468 static bool workerOnly(timer_e e) {
469 return timerInfo[e].flags & stats_flags_e::notInMaster;
470 }
471 static bool noUnits(timer_e e) {
472 return timerInfo[e].flags & stats_flags_e::noUnits;
473 }
474 static bool logEvent(timer_e e) {
475 return timerInfo[e].flags & stats_flags_e::logEvent;
476 }
477 static void clearEventFlags() {
478 for (int i = 0; i < TIMER_LAST; i++) {
479 timerInfo[i].flags &= (~(stats_flags_e::logEvent));
480 }
481 }
482};
483
484// Where we need to start and end the timer explicitly, this version can be
485// used. Since these timers normally aren't nicely scoped, and so don't have a
486// good place to live on the stack of the thread, they're more work to use.
487class explicitTimer {
488 timeStat *stat;
489 timer_e timerEnumValue;
490 tsc_tick_count startTime;
491 tsc_tick_count pauseStartTime;
492 tsc_tick_count::tsc_interval_t totalPauseTime;
493
494public:
495 explicitTimer(timeStat *s, timer_e te)
496 : stat(s), timerEnumValue(te), startTime(), pauseStartTime(0),
497 totalPauseTime() {}
498
499 // void setStat(timeStat *s) { stat = s; }
500 void start(tsc_tick_count tick);
501 void pause(tsc_tick_count tick) { pauseStartTime = tick; }
502 void resume(tsc_tick_count tick) {
503 totalPauseTime += (tick - pauseStartTime);
504 }
505 void stop(tsc_tick_count tick, kmp_stats_list *stats_ptr = nullptr);
506 void reset() {
507 startTime = 0;
508 pauseStartTime = 0;
509 totalPauseTime = 0;
510 }
511 timer_e get_type() const { return timerEnumValue; }
512};
513
514// Where you need to partition a thread's clock ticks into separate states
515// e.g., a partitionedTimers class with two timers of EXECUTING_TASK, and
516// DOING_NOTHING would render these conditions:
517// time(EXECUTING_TASK) + time(DOING_NOTHING) = total time thread is alive
518// No clock tick in the EXECUTING_TASK is a member of DOING_NOTHING and vice
519// versa
520class partitionedTimers {
521private:
522 std::vector<explicitTimer> timer_stack;
523
524public:
525 partitionedTimers();
526 void init(explicitTimer timer);
527 void exchange(explicitTimer timer);
528 void push(explicitTimer timer);
529 void pop();
530 void windup();
531};
532
533// Special wrapper around the partitioned timers to aid timing code blocks
534// It avoids the need to have an explicit end, leaving the scope suffices.
535class blockPartitionedTimer {
536 partitionedTimers *part_timers;
537
538public:
539 blockPartitionedTimer(partitionedTimers *pt, explicitTimer timer)
540 : part_timers(pt) {
541 part_timers->push(timer);
542 }
543 ~blockPartitionedTimer() { part_timers->pop(); }
544};
545
546// Special wrapper around the thread state to aid in keeping state in code
547// blocks It avoids the need to have an explicit end, leaving the scope
548// suffices.
549class blockThreadState {
550 stats_state_e *state_pointer;
551 stats_state_e old_state;
552
553public:
554 blockThreadState(stats_state_e *thread_state_pointer, stats_state_e new_state)
555 : state_pointer(thread_state_pointer), old_state(*thread_state_pointer) {
556 *state_pointer = new_state;
557 }
558 ~blockThreadState() { *state_pointer = old_state; }
559};
560
561// If all you want is a count, then you can use this...
562// The individual per-thread counts will be aggregated into a statistic at
563// program exit.
564class counter {
565 uint64_t value;
566 static const statInfo counterInfo[];
567
568public:
569 counter() : value(0) {}
570 void increment() { value++; }
571 uint64_t getValue() const { return value; }
572 void reset() { value = 0; }
573 static const char *name(counter_e e) { return counterInfo[e].name; }
574 static bool masterOnly(counter_e e) {
575 return counterInfo[e].flags & stats_flags_e::onlyInMaster;
576 }
577};
578
579/* ****************************************************************
580 Class to implement an event
581
582 There are four components to an event: start time, stop time
583 nest_level, and timer_name.
584 The start and stop time should be obvious (recorded in clock ticks).
585 The nest_level relates to the bar width in the timeline graph.
586 The timer_name is used to determine which timer event triggered this event.
587
588 the interface to this class is through four read-only operations:
589 1) getStart() -- returns the start time as 64 bit integer
590 2) getStop() -- returns the stop time as 64 bit integer
591 3) getNestLevel() -- returns the nest level of the event
592 4) getTimerName() -- returns the timer name that triggered event
593
594 *MORE ON NEST_LEVEL*
595 The nest level is used in the bar graph that represents the timeline.
596 Its main purpose is for showing how events are nested inside each other.
597 For example, say events, A, B, and C are recorded. If the timeline
598 looks like this:
599
600Begin -------------------------------------------------------------> Time
601 | | | | | |
602 A B C C B A
603 start start start end end end
604
605 Then A, B, C will have a nest level of 1, 2, 3 respectively.
606 These values are then used to calculate the barwidth so you can
607 see that inside A, B has occurred, and inside B, C has occurred.
608 Currently, this is shown with A's bar width being larger than B's
609 bar width, and B's bar width being larger than C's bar width.
610
611**************************************************************** */
612class kmp_stats_event {
613 uint64_t start;
614 uint64_t stop;
615 int nest_level;
616 timer_e timer_name;
617
618public:
619 kmp_stats_event()
620 : start(0), stop(0), nest_level(0), timer_name(TIMER_LAST) {}
621 kmp_stats_event(uint64_t strt, uint64_t stp, int nst, timer_e nme)
622 : start(strt), stop(stp), nest_level(nst), timer_name(nme) {}
623 inline uint64_t getStart() const { return start; }
624 inline uint64_t getStop() const { return stop; }
625 inline int getNestLevel() const { return nest_level; }
626 inline timer_e getTimerName() const { return timer_name; }
627};
628
629/* ****************************************************************
630 Class to implement a dynamically expandable array of events
631
632 ---------------------------------------------------------
633 | event 1 | event 2 | event 3 | event 4 | ... | event N |
634 ---------------------------------------------------------
635
636 An event is pushed onto the back of this array at every
637 explicitTimer->stop() call. The event records the start time, stop time,
638 nest level (related to the bar width), and the timer that triggered it.
639
640 The event vector starts at size INIT_SIZE and grows (doubles in size)
641 if needed. An implication of this behavior is that log(N)
642 reallocations are needed (where N is number of events). If you want
643 to avoid reallocations, then set INIT_SIZE to a large value.
644
645 the interface to this class is through six operations:
646 1) reset() -- sets the internal_size back to 0 but does not deallocate any
647 memory
648 2) size() -- returns the number of valid elements in the vector
649 3) push_back(start, stop, nest, timer_name) -- pushes an event onto
650 the back of the array
651 4) deallocate() -- frees all memory associated with the vector
652 5) sort() -- sorts the vector by start time
653 6) operator[index] or at(index) -- returns event reference at that index
654**************************************************************** */
655class kmp_stats_event_vector {
656 kmp_stats_event *events;
657 int internal_size;
658 int allocated_size;
659 static const int INIT_SIZE = 1024;
660
661public:
662 kmp_stats_event_vector() {
663 events =
664 (kmp_stats_event *)__kmp_allocate(sizeof(kmp_stats_event) * INIT_SIZE);
665 internal_size = 0;
666 allocated_size = INIT_SIZE;
667 }
668 ~kmp_stats_event_vector() {}
669 inline void reset() { internal_size = 0; }
670 inline int size() const { return internal_size; }
671 void push_back(uint64_t start_time, uint64_t stop_time, int nest_level,
672 timer_e name) {
673 int i;
674 if (internal_size == allocated_size) {
675 kmp_stats_event *tmp = (kmp_stats_event *)__kmp_allocate(
676 sizeof(kmp_stats_event) * allocated_size * 2);
677 for (i = 0; i < internal_size; i++)
678 tmp[i] = events[i];
679 __kmp_free(events);
680 events = tmp;
681 allocated_size *= 2;
682 }
683 events[internal_size] =
684 kmp_stats_event(start_time, stop_time, nest_level, name);
685 internal_size++;
686 return;
687 }
688 void deallocate();
689 void sort();
690 const kmp_stats_event &operator[](int index) const { return events[index]; }
691 kmp_stats_event &operator[](int index) { return events[index]; }
692 const kmp_stats_event &at(int index) const { return events[index]; }
693 kmp_stats_event &at(int index) { return events[index]; }
694};
695
696/* ****************************************************************
697 Class to implement a doubly-linked, circular, statistics list
698
699 |---| ---> |---| ---> |---| ---> |---| ---> ... next
700 | | | | | | | |
701 |---| <--- |---| <--- |---| <--- |---| <--- ... prev
702 Sentinel first second third
703 Node node node node
704
705 The Sentinel Node is the user handle on the list.
706 The first node corresponds to thread 0's statistics.
707 The second node corresponds to thread 1's statistics and so on...
708
709 Each node has a _timers, _counters, and _explicitTimers array to hold that
710 thread's statistics. The _explicitTimers point to the correct _timer and
711 update its statistics at every stop() call. The explicitTimers' pointers are
712 set up in the constructor. Each node also has an event vector to hold that
713 thread's timing events. The event vector expands as necessary and records
714 the start-stop times for each timer.
715
716 The nestLevel variable is for plotting events and is related
717 to the bar width in the timeline graph.
718
719 Every thread will have a thread local pointer to its node in
720 the list. The sentinel node is used by the primary thread to
721 store "dummy" statistics before __kmp_create_worker() is called.
722**************************************************************** */
723class kmp_stats_list {
724 int gtid;
725 timeStat _timers[TIMER_LAST + 1];
726 counter _counters[COUNTER_LAST + 1];
727 explicitTimer thread_life_timer;
728 partitionedTimers _partitionedTimers;
729 int _nestLevel; // one per thread
730 kmp_stats_event_vector _event_vector;
731 kmp_stats_list *next;
732 kmp_stats_list *prev;
733 stats_state_e state;
734 int thread_is_idle_flag;
735
736public:
737 kmp_stats_list()
738 : thread_life_timer(&_timers[TIMER_OMP_worker_thread_life],
739 TIMER_OMP_worker_thread_life),
740 _nestLevel(0), _event_vector(), next(this), prev(this), state(IDLE),
741 thread_is_idle_flag(0) {}
742 ~kmp_stats_list() {}
743 inline timeStat *getTimer(timer_e idx) { return &_timers[idx]; }
744 inline counter *getCounter(counter_e idx) { return &_counters[idx]; }
745 inline partitionedTimers *getPartitionedTimers() {
746 return &_partitionedTimers;
747 }
748 inline timeStat *getTimers() { return _timers; }
749 inline counter *getCounters() { return _counters; }
750 inline kmp_stats_event_vector &getEventVector() { return _event_vector; }
751 inline void startLife() { thread_life_timer.start(tsc_tick_count::now()); }
752 inline void endLife() { thread_life_timer.stop(tsc_tick_count::now(), this); }
753 inline void resetEventVector() { _event_vector.reset(); }
754 inline void incrementNestValue() { _nestLevel++; }
755 inline int getNestValue() { return _nestLevel; }
756 inline void decrementNestValue() { _nestLevel--; }
757 inline int getGtid() const { return gtid; }
758 inline void setGtid(int newgtid) { gtid = newgtid; }
759 inline void setState(stats_state_e newstate) { state = newstate; }
760 inline stats_state_e getState() const { return state; }
761 inline stats_state_e *getStatePointer() { return &state; }
762 inline bool isIdle() { return thread_is_idle_flag == 1; }
763 inline void setIdleFlag() { thread_is_idle_flag = 1; }
764 inline void resetIdleFlag() { thread_is_idle_flag = 0; }
765 kmp_stats_list *push_back(int gtid); // returns newly created list node
766 inline void push_event(uint64_t start_time, uint64_t stop_time,
767 int nest_level, timer_e name) {
768 _event_vector.push_back(start_time, stop_time, nest_level, name);
769 }
770 void deallocate();
771 class iterator;
772 kmp_stats_list::iterator begin();
773 kmp_stats_list::iterator end();
774 int size();
775 class iterator {
776 kmp_stats_list *ptr;
777 friend kmp_stats_list::iterator kmp_stats_list::begin();
778 friend kmp_stats_list::iterator kmp_stats_list::end();
779
780 public:
781 iterator();
782 ~iterator();
783 iterator operator++();
784 iterator operator++(int dummy);
785 iterator operator--();
786 iterator operator--(int dummy);
787 bool operator!=(const iterator &rhs);
788 bool operator==(const iterator &rhs);
789 kmp_stats_list *operator*() const; // dereference operator
790 };
791};
792
793/* ****************************************************************
794 Class to encapsulate all output functions and the environment variables
795
796 This module holds filenames for various outputs (normal stats, events, plot
797 file), as well as coloring information for the plot file.
798
799 The filenames and flags variables are read from environment variables.
800 These are read once by the constructor of the global variable
801 __kmp_stats_output which calls init().
802
803 During this init() call, event flags for the timeStat::timerInfo[] global
804 array are cleared if KMP_STATS_EVENTS is not true (on, 1, yes).
805
806 The only interface function that is public is outputStats(heading). This
807 function should print out everything it needs to, either to files or stderr,
808 depending on the environment variables described below
809
810 ENVIRONMENT VARIABLES:
811 KMP_STATS_FILE -- if set, all statistics (not events) will be printed to this
812 file, otherwise, print to stderr
813 KMP_STATS_THREADS -- if set to "on", then will print per thread statistics to
814 either KMP_STATS_FILE or stderr
815 KMP_STATS_PLOT_FILE -- if set, print the ploticus plot file to this filename,
816 otherwise, the plot file is sent to "events.plt"
817 KMP_STATS_EVENTS -- if set to "on", then log events, otherwise, don't log
818 events
819 KMP_STATS_EVENTS_FILE -- if set, all events are outputted to this file,
820 otherwise, output is sent to "events.dat"
821**************************************************************** */
822class kmp_stats_output_module {
823
824public:
825 struct rgb_color {
826 float r;
827 float g;
828 float b;
829 };
830
831private:
832 std::string outputFileName;
833 static const char *eventsFileName;
834 static const char *plotFileName;
835 static int printPerThreadFlag;
836 static int printPerThreadEventsFlag;
837 static const rgb_color globalColorArray[];
838 static rgb_color timerColorInfo[];
839
840 void init();
841 static void setupEventColors();
842 static void printPloticusFile();
843 static void printHeaderInfo(FILE *statsOut);
844 static void printTimerStats(FILE *statsOut, statistic const *theStats,
845 statistic const *totalStats);
846 static void printCounterStats(FILE *statsOut, statistic const *theStats);
847 static void printCounters(FILE *statsOut, counter const *theCounters);
848 static void printEvents(FILE *eventsOut, kmp_stats_event_vector *theEvents,
849 int gtid);
850 static rgb_color getEventColor(timer_e e) { return timerColorInfo[e]; }
851 static void windupExplicitTimers();
852 bool eventPrintingEnabled() const { return printPerThreadEventsFlag; }
853
854public:
855 kmp_stats_output_module() { init(); }
856 void outputStats(const char *heading);
857};
858
// Entry points and global objects of the statistics subsystem, declared with
// C linkage when compiled as C++. Only declarations appear here; the
// definitions live outside this header.
859#ifdef __cplusplus
860extern "C" {
861#endif
862void __kmp_stats_init();
863void __kmp_stats_fini();
864void __kmp_reset_stats();
865void __kmp_output_stats(const char *);
866void __kmp_accumulate_stats_at_exit(void);
867// thread local pointer to stats node within list
868extern KMP_THREAD_LOCAL kmp_stats_list *__kmp_stats_thread_ptr;
869// head to stats list.
870extern kmp_stats_list *__kmp_stats_list;
871// lock for __kmp_stats_list
872extern kmp_tas_lock_t __kmp_stats_lock;
873// reference start time
874extern tsc_tick_count __kmp_stats_start_time;
875// interface to output
876extern kmp_stats_output_module __kmp_stats_output;
877
878#ifdef __cplusplus
879}
880#endif
881
882// Simple, standard interfaces that drop out completely if stats aren't enabled
883
// Adds `value`, converted to double, as one sample of the calling thread's
// timer `name`. `value` is parenthesized before the cast so that a compound
// argument such as `a ? b : c` is converted as a whole instead of only its
// first operand.
#define KMP_COUNT_VALUE(name, value)                                           \
  __kmp_stats_thread_ptr->getTimer(TIMER_##name)->addSample((double)(value))

// Increments the calling thread's counter `name` by one.
#define KMP_COUNT_BLOCK(name)                                                  \
  __kmp_stats_thread_ptr->getCounter(COUNTER_##name)->increment()

// Forwards `heading_string` to __kmp_output_stats().
#define KMP_OUTPUT_STATS(heading_string) __kmp_output_stats(heading_string)

// Calls init() on the calling thread's partitioned timers with an
// explicitTimer bound to timer `name`.
#define KMP_INIT_PARTITIONED_TIMERS(name)                                      \
  __kmp_stats_thread_ptr->getPartitionedTimers()->init(explicitTimer(          \
      __kmp_stats_thread_ptr->getTimer(TIMER_##name), TIMER_##name))

// Declares a blockPartitionedTimer that pushes timer `name` now and pops it
// when the enclosing scope ends.
#define KMP_TIME_PARTITIONED_BLOCK(name)                                       \
  blockPartitionedTimer __PBLOCKTIME__(                                        \
      __kmp_stats_thread_ptr->getPartitionedTimers(),                          \
      explicitTimer(__kmp_stats_thread_ptr->getTimer(TIMER_##name),            \
                    TIMER_##name))

// Pushes an explicitTimer for timer `name` onto the calling thread's
// partitioned timers.
#define KMP_PUSH_PARTITIONED_TIMER(name)                                       \
  __kmp_stats_thread_ptr->getPartitionedTimers()->push(explicitTimer(          \
      __kmp_stats_thread_ptr->getTimer(TIMER_##name), TIMER_##name))

// Pops the calling thread's partitioned timers.
#define KMP_POP_PARTITIONED_TIMER()                                            \
  __kmp_stats_thread_ptr->getPartitionedTimers()->pop()

// Calls exchange() on the calling thread's partitioned timers with an
// explicitTimer bound to timer `name`.
#define KMP_EXCHANGE_PARTITIONED_TIMER(name)                                   \
  __kmp_stats_thread_ptr->getPartitionedTimers()->exchange(explicitTimer(      \
      __kmp_stats_thread_ptr->getTimer(TIMER_##name), TIMER_##name))

// Sets / reads the calling thread's stats state.
#define KMP_SET_THREAD_STATE(state_name)                                       \
  __kmp_stats_thread_ptr->setState(state_name)

#define KMP_GET_THREAD_STATE() __kmp_stats_thread_ptr->getState()

// Declares a blockThreadState that switches the calling thread to
// `state_name` now and restores the previous state when the scope ends.
#define KMP_SET_THREAD_STATE_BLOCK(state_name)                                 \
  blockThreadState __BTHREADSTATE__(__kmp_stats_thread_ptr->getStatePointer(), \
                                    state_name)

// Forwards to __kmp_reset_stats().
#define KMP_RESET_STATS() __kmp_reset_stats()
975
// Developer variants of the stats macros. They forward to the ordinary macros
// when KMP_DEVELOPER_STATS is non-zero and otherwise expand to a no-op
// expression.
#if (KMP_DEVELOPER_STATS)
#define KMP_COUNT_DEVELOPER_VALUE(n, v) KMP_COUNT_VALUE(n, v)
#define KMP_COUNT_DEVELOPER_BLOCK(n) KMP_COUNT_BLOCK(n)
#define KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(n) KMP_TIME_PARTITIONED_BLOCK(n)
#define KMP_PUSH_DEVELOPER_PARTITIONED_TIMER(n) KMP_PUSH_PARTITIONED_TIMER(n)
// KMP_POP_PARTITIONED_TIMER takes no arguments, so `n` is not forwarded;
// passing a non-empty `n` through would not preprocess.
#define KMP_POP_DEVELOPER_PARTITIONED_TIMER(n) KMP_POP_PARTITIONED_TIMER()
#define KMP_EXCHANGE_DEVELOPER_PARTITIONED_TIMER(n)                            \
  KMP_EXCHANGE_PARTITIONED_TIMER(n)
#else
// Null definitions
#define KMP_COUNT_DEVELOPER_VALUE(n, v) ((void)0)
#define KMP_COUNT_DEVELOPER_BLOCK(n) ((void)0)
#define KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(n) ((void)0)
#define KMP_PUSH_DEVELOPER_PARTITIONED_TIMER(n) ((void)0)
#define KMP_POP_DEVELOPER_PARTITIONED_TIMER(n) ((void)0)
#define KMP_EXCHANGE_DEVELOPER_PARTITIONED_TIMER(n) ((void)0)
#endif
993
994#else // KMP_STATS_ENABLED
995
996// Null definitions
// This is the #else branch of KMP_STATS_ENABLED: every stats macro expands to
// the no-op expression ((void)0) and its arguments are discarded, so they are
// never evaluated.
997#define KMP_COUNT_VALUE(n, v) ((void)0)
998#define KMP_COUNT_BLOCK(n) ((void)0)
999
1000#define KMP_OUTPUT_STATS(heading_string) ((void)0)
1001#define KMP_RESET_STATS() ((void)0)
1002
1003#define KMP_COUNT_DEVELOPER_VALUE(n, v) ((void)0)
1004#define KMP_COUNT_DEVELOPER_BLOCK(n) ((void)0)
1005#define KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(n) ((void)0)
1006#define KMP_PUSH_DEVELOPER_PARTITIONED_TIMER(n) ((void)0)
1007#define KMP_POP_DEVELOPER_PARTITIONED_TIMER(n) ((void)0)
1008#define KMP_EXCHANGE_DEVELOPER_PARTITIONED_TIMER(n) ((void)0)
1009#define KMP_INIT_PARTITIONED_TIMERS(name) ((void)0)
1010#define KMP_TIME_PARTITIONED_BLOCK(name) ((void)0)
1011#define KMP_PUSH_PARTITIONED_TIMER(name) ((void)0)
1012#define KMP_POP_PARTITIONED_TIMER() ((void)0)
1013#define KMP_SET_THREAD_STATE(state_name) ((void)0)
1014#define KMP_GET_THREAD_STATE() ((void)0)
1015#define KMP_SET_THREAD_STATE_BLOCK(state_name) ((void)0)
1016#endif // KMP_STATS_ENABLED
1017
1018#endif // KMP_STATS_H
stats_flags_e
flags to describe the statistic (timer or counter)
Definition kmp_stats.h:49
#define KMP_FOREACH_COUNTER(macro, arg)
Add new counters under KMP_FOREACH_COUNTER() macro in kmp_stats.h.
Definition kmp_stats.h:95
#define KMP_FOREACH_EXPLICIT_TIMER(macro, arg)
Add new explicit timers under KMP_FOREACH_EXPLICIT_TIMER() macro.
Definition kmp_stats.h:300
stats_state_e
the states which a thread can be in
Definition kmp_stats.h:63
@ notInMaster
statistic is valid only for non-primary threads
Definition kmp_stats.h:53
@ noUnits
statistic doesn't need units printed next to it
Definition kmp_stats.h:52
@ logEvent
Definition kmp_stats.h:54
@ noTotal
do not show a TOTAL_aggregation for this statistic
Definition kmp_stats.h:50
@ onlyInMaster
statistic is valid only for primary thread
Definition kmp_stats.h:51