LLVM OpenMP* Runtime Library
kmp_dispatch.cpp
1/*
2 * kmp_dispatch.cpp: dynamic scheduling - iteration initialization and dispatch.
3 */
4
5//===----------------------------------------------------------------------===//
6//
7// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8// See https://llvm.org/LICENSE.txt for license information.
9// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10//
11//===----------------------------------------------------------------------===//
12
13/* Dynamic scheduling initialization and dispatch.
14 *
15 * NOTE: __kmp_nth is a constant inside any dispatch loop, but it may
16 * change between parallel regions. __kmp_max_nth is the largest value
17 * __kmp_nth may take; 1 is the smallest.
18 */
19
20#include "kmp.h"
21#include "kmp_error.h"
22#include "kmp_i18n.h"
23#include "kmp_itt.h"
24#include "kmp_stats.h"
25#include "kmp_str.h"
26#if KMP_USE_X87CONTROL
27#include <float.h>
28#endif
29#include "kmp_lock.h"
30#include "kmp_dispatch.h"
31#if KMP_USE_HIER_SCHED
32#include "kmp_dispatch_hier.h"
33#endif
34
35#if OMPT_SUPPORT
36#include "ompt-specific.h"
37#endif
38
39/* ------------------------------------------------------------------------ */
40/* ------------------------------------------------------------------------ */
41
42void __kmp_dispatch_deo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
43 kmp_info_t *th;
44
45 KMP_DEBUG_ASSERT(gtid_ref);
46
47 if (__kmp_env_consistency_check) {
48 th = __kmp_threads[*gtid_ref];
49 if (th->th.th_root->r.r_active &&
50 (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none)) {
51#if KMP_USE_DYNAMIC_LOCK
52 __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL, 0);
53#else
54 __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL);
55#endif
56 }
57 }
58}
59
60void __kmp_dispatch_dxo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
61 kmp_info_t *th;
62
63 if (__kmp_env_consistency_check) {
64 th = __kmp_threads[*gtid_ref];
65 if (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none) {
66 __kmp_pop_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref);
67 }
68 }
69}
70
71// Returns either SCHEDULE_MONOTONIC or SCHEDULE_NONMONOTONIC
72static inline int __kmp_get_monotonicity(ident_t *loc, enum sched_type schedule,
73 bool use_hier = false) {
74 // Pick up the nonmonotonic/monotonic bits from the scheduling type
75 // Nonmonotonic as default for dynamic schedule when no modifier is specified
76 int monotonicity = SCHEDULE_NONMONOTONIC;
77
78 // Let the default be monotonic for executables
79 // compiled with OpenMP* 4.5 or earlier compilers
80 if (loc != NULL && loc->get_openmp_version() < 50)
81 monotonicity = SCHEDULE_MONOTONIC;
82
83 if (use_hier || __kmp_force_monotonic)
84 monotonicity = SCHEDULE_MONOTONIC;
85 else if (SCHEDULE_HAS_NONMONOTONIC(schedule))
86 monotonicity = SCHEDULE_NONMONOTONIC;
87 else if (SCHEDULE_HAS_MONOTONIC(schedule))
88 monotonicity = SCHEDULE_MONOTONIC;
89
90 return monotonicity;
91}
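// Summary of the decision above (illustration only, in the order the code
// checks it):
//   - use_hier or __kmp_force_monotonic set     -> SCHEDULE_MONOTONIC (wins)
//   - explicit nonmonotonic modifier            -> SCHEDULE_NONMONOTONIC
//   - explicit monotonic modifier               -> SCHEDULE_MONOTONIC
//   - no modifier, binary built for OpenMP 5.0+ -> SCHEDULE_NONMONOTONIC
//   - no modifier, pre-5.0 binary               -> SCHEDULE_MONOTONIC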
92
93#if KMP_STATIC_STEAL_ENABLED
94enum { // values for steal_flag (possible states of private per-loop buffer)
95 UNUSED = 0,
96 CLAIMED = 1, // owner thread started initialization
97 READY = 2, // available for stealing
98 THIEF = 3 // finished by owner, or claimed by thief
99 // possible state changes:
100 // 0 -> 1 owner only, sync
101 // 0 -> 3 thief only, sync
102 // 1 -> 2 owner only, async
103 // 2 -> 3 owner only, async
104 // 3 -> 2 owner only, async
105 // 3 -> 0 last thread finishing the loop, async
106};
107#endif
108
109// Initialize a dispatch_private_info_template<T> buffer for a particular
110 // type of schedule and chunk. The loop description is found in lb (lower bound),
111// ub (upper bound), and st (stride). nproc is the number of threads relevant
112// to the scheduling (often the number of threads in a team, but not always if
113// hierarchical scheduling is used). tid is the id of the thread calling
114// the function within the group of nproc threads. It will have a value
115// between 0 and nproc - 1. This is often just the thread id within a team, but
116// is not necessarily the case when using hierarchical scheduling.
117// loc is the source file location of the corresponding loop
118// gtid is the global thread id
119template <typename T>
120void __kmp_dispatch_init_algorithm(ident_t *loc, int gtid,
121 dispatch_private_info_template<T> *pr,
122 enum sched_type schedule, T lb, T ub,
123 typename traits_t<T>::signed_t st,
124#if USE_ITT_BUILD
125 kmp_uint64 *cur_chunk,
126#endif
127 typename traits_t<T>::signed_t chunk,
128 T nproc, T tid) {
129 typedef typename traits_t<T>::unsigned_t UT;
130 typedef typename traits_t<T>::floating_t DBL;
131
132 int active;
133 T tc;
134 kmp_info_t *th;
135 kmp_team_t *team;
136 int monotonicity;
137 bool use_hier;
138
139#ifdef KMP_DEBUG
140 typedef typename traits_t<T>::signed_t ST;
141 {
142 char *buff;
143 // create format specifiers before the debug output
144 buff = __kmp_str_format("__kmp_dispatch_init_algorithm: T#%%d called "
145 "pr:%%p lb:%%%s ub:%%%s st:%%%s "
146 "schedule:%%d chunk:%%%s nproc:%%%s tid:%%%s\n",
147 traits_t<T>::spec, traits_t<T>::spec,
148 traits_t<ST>::spec, traits_t<ST>::spec,
149 traits_t<T>::spec, traits_t<T>::spec);
150 KD_TRACE(10, (buff, gtid, pr, lb, ub, st, schedule, chunk, nproc, tid));
151 __kmp_str_free(&buff);
152 }
153#endif
154 /* setup data */
155 th = __kmp_threads[gtid];
156 team = th->th.th_team;
157 active = !team->t.t_serialized;
158
159#if USE_ITT_BUILD
160 int itt_need_metadata_reporting =
161 __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
162 KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
163 team->t.t_active_level == 1;
164#endif
165
166#if KMP_USE_HIER_SCHED
167 use_hier = pr->flags.use_hier;
168#else
169 use_hier = false;
170#endif
171
172 /* Pick up the nonmonotonic/monotonic bits from the scheduling type */
173 monotonicity = __kmp_get_monotonicity(loc, schedule, use_hier);
174 schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
175
176 /* Pick up the nomerge/ordered bits from the scheduling type */
177 if ((schedule >= kmp_nm_lower) && (schedule < kmp_nm_upper)) {
178 pr->flags.nomerge = TRUE;
179 schedule =
180 (enum sched_type)(((int)schedule) - (kmp_nm_lower - kmp_sch_lower));
181 } else {
182 pr->flags.nomerge = FALSE;
183 }
184 pr->type_size = traits_t<T>::type_size; // remember the size of variables
185 if (kmp_ord_lower & schedule) {
186 pr->flags.ordered = TRUE;
187 schedule =
188 (enum sched_type)(((int)schedule) - (kmp_ord_lower - kmp_sch_lower));
189 } else {
190 pr->flags.ordered = FALSE;
191 }
192 // Ordered overrides nonmonotonic
193 if (pr->flags.ordered) {
194 monotonicity = SCHEDULE_MONOTONIC;
195 }
196
197 if (schedule == kmp_sch_static) {
198 schedule = __kmp_static;
199 } else {
200 if (schedule == kmp_sch_runtime) {
201 // Use the scheduling specified by OMP_SCHEDULE (or __kmp_sch_default if
202 // not specified)
203 schedule = team->t.t_sched.r_sched_type;
204 monotonicity = __kmp_get_monotonicity(loc, schedule, use_hier);
205 schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
206 if (pr->flags.ordered) // correct monotonicity for ordered loop if needed
207 monotonicity = SCHEDULE_MONOTONIC;
208 // Detail the schedule if needed (global controls are differentiated
209 // appropriately)
210 if (schedule == kmp_sch_guided_chunked) {
211 schedule = __kmp_guided;
212 } else if (schedule == kmp_sch_static) {
213 schedule = __kmp_static;
214 }
215 // Use the chunk size specified by OMP_SCHEDULE (or default if not
216 // specified)
217 chunk = team->t.t_sched.chunk;
218#if USE_ITT_BUILD
219 if (cur_chunk)
220 *cur_chunk = chunk;
221#endif
222#ifdef KMP_DEBUG
223 {
224 char *buff;
225 // create format specifiers before the debug output
226 buff = __kmp_str_format("__kmp_dispatch_init_algorithm: T#%%d new: "
227 "schedule:%%d chunk:%%%s\n",
228 traits_t<ST>::spec);
229 KD_TRACE(10, (buff, gtid, schedule, chunk));
230 __kmp_str_free(&buff);
231 }
232#endif
233 } else {
234 if (schedule == kmp_sch_guided_chunked) {
235 schedule = __kmp_guided;
236 }
237 if (chunk <= 0) {
238 chunk = KMP_DEFAULT_CHUNK;
239 }
240 }
241
242 if (schedule == kmp_sch_auto) {
243 // mapping and differentiation: in the __kmp_do_serial_initialize()
244 schedule = __kmp_auto;
245#ifdef KMP_DEBUG
246 {
247 char *buff;
248 // create format specifiers before the debug output
249 buff = __kmp_str_format(
250 "__kmp_dispatch_init_algorithm: kmp_sch_auto: T#%%d new: "
251 "schedule:%%d chunk:%%%s\n",
252 traits_t<ST>::spec);
253 KD_TRACE(10, (buff, gtid, schedule, chunk));
254 __kmp_str_free(&buff);
255 }
256#endif
257 }
258#if KMP_STATIC_STEAL_ENABLED
259 // map nonmonotonic:dynamic to static steal
260 if (schedule == kmp_sch_dynamic_chunked) {
261 if (monotonicity == SCHEDULE_NONMONOTONIC)
262 schedule = kmp_sch_static_steal;
263 }
264#endif
265 /* guided analytical not safe for too many threads */
266 if (schedule == kmp_sch_guided_analytical_chunked && nproc > 1 << 20) {
267 schedule = kmp_sch_guided_iterative_chunked;
268 KMP_WARNING(DispatchManyThreads);
269 }
270 if (schedule == kmp_sch_runtime_simd) {
271 // compiler provides simd_width in the chunk parameter
272 schedule = team->t.t_sched.r_sched_type;
273 monotonicity = __kmp_get_monotonicity(loc, schedule, use_hier);
274 schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
275 // Detail the schedule if needed (global controls are differentiated
276 // appropriately)
277 if (schedule == kmp_sch_static || schedule == kmp_sch_auto ||
278 schedule == __kmp_static) {
279 schedule = kmp_sch_static_balanced_chunked;
280 } else {
281 if (schedule == kmp_sch_guided_chunked || schedule == __kmp_guided) {
282 schedule = kmp_sch_guided_simd;
283 }
284 chunk = team->t.t_sched.chunk * chunk;
285 }
286#if USE_ITT_BUILD
287 if (cur_chunk)
288 *cur_chunk = chunk;
289#endif
290#ifdef KMP_DEBUG
291 {
292 char *buff;
293 // create format specifiers before the debug output
294 buff = __kmp_str_format(
295 "__kmp_dispatch_init_algorithm: T#%%d new: schedule:%%d"
296 " chunk:%%%s\n",
297 traits_t<ST>::spec);
298 KD_TRACE(10, (buff, gtid, schedule, chunk));
299 __kmp_str_free(&buff);
300 }
301#endif
302 }
303 pr->u.p.parm1 = chunk;
304 }
305 KMP_ASSERT2((kmp_sch_lower < schedule && schedule < kmp_sch_upper),
306 "unknown scheduling type");
307
308 pr->u.p.count = 0;
309
310 if (__kmp_env_consistency_check) {
311 if (st == 0) {
312 __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited,
313 (pr->flags.ordered ? ct_pdo_ordered : ct_pdo), loc);
314 }
315 }
316 // compute trip count
317 if (st == 1) { // most common case
318 if (ub >= lb) {
319 tc = ub - lb + 1;
320 } else { // ub < lb
321 tc = 0; // zero-trip
322 }
323 } else if (st < 0) {
324 if (lb >= ub) {
325 // AC: cast to unsigned is needed for loops like (i=2B; i>-2B; i-=1B),
326 // where the division needs to be unsigned regardless of the result type
327 tc = (UT)(lb - ub) / (-st) + 1;
328 } else { // lb < ub
329 tc = 0; // zero-trip
330 }
331 } else { // st > 0
332 if (ub >= lb) {
333 // AC: cast to unsigned is needed for loops like (i=-2B; i<2B; i+=1B),
334 // where the division needs to be unsigned regardless of the result type
335 tc = (UT)(ub - lb) / st + 1;
336 } else { // ub < lb
337 tc = 0; // zero-trip
338 }
339 }
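// Worked examples of the trip-count formula above (values for illustration):
//   lb=0,  ub=9, st=2  -> tc = (9-0)/2 + 1 = 5   (i = 0,2,4,6,8)
//   lb=5,  ub=1, st=-2 -> tc = (5-1)/2 + 1 = 3   (i = 5,3,1)
//   lb=10, ub=1, st=1  -> tc = 0 (zero-trip loop)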
340
341#if KMP_STATS_ENABLED
342 if (KMP_MASTER_GTID(gtid)) {
343 KMP_COUNT_VALUE(OMP_loop_dynamic_total_iterations, tc);
344 }
345#endif
346
347 pr->u.p.lb = lb;
348 pr->u.p.ub = ub;
349 pr->u.p.st = st;
350 pr->u.p.tc = tc;
351
352#if KMP_OS_WINDOWS
353 pr->u.p.last_upper = ub + st;
354#endif /* KMP_OS_WINDOWS */
355
356 /* NOTE: only the active parallel regions have active ordered sections */
357
358 if (active) {
359 if (pr->flags.ordered) {
360 pr->ordered_bumped = 0;
361 pr->u.p.ordered_lower = 1;
362 pr->u.p.ordered_upper = 0;
363 }
364 }
365
366 switch (schedule) {
367#if KMP_STATIC_STEAL_ENABLED
368 case kmp_sch_static_steal: {
369 T ntc, init;
370
371 KD_TRACE(100,
372 ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_steal case\n",
373 gtid));
374
375 ntc = (tc % chunk ? 1 : 0) + tc / chunk;
376 if (nproc > 1 && ntc >= nproc) {
377 KMP_COUNT_BLOCK(OMP_LOOP_STATIC_STEAL);
378 T id = tid;
379 T small_chunk, extras;
380 kmp_uint32 old = UNUSED;
381 int claimed = pr->steal_flag.compare_exchange_strong(old, CLAIMED);
382 if (traits_t<T>::type_size > 4) {
383 // AC: TODO: check if 16-byte CAS available and use it to
384 // improve performance (probably wait for explicit request
385 // before spending time on this).
386 // For now use dynamically allocated per-private-buffer lock,
387 // free memory in __kmp_dispatch_next when status==0.
388 pr->u.p.steal_lock = (kmp_lock_t *)__kmp_allocate(sizeof(kmp_lock_t));
389 __kmp_init_lock(pr->u.p.steal_lock);
390 }
391 small_chunk = ntc / nproc;
392 extras = ntc % nproc;
393
394 init = id * small_chunk + (id < extras ? id : extras);
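// Illustration of the split above (numbers chosen arbitrarily): ntc=10 chunks,
// nproc=4 threads -> small_chunk=2, extras=2, so thread 0 owns chunks [0,3),
// thread 1 [3,6), thread 2 [6,8), thread 3 [8,10); the first 'extras' threads
// carry one extra chunk each.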
395 pr->u.p.count = init;
396 if (claimed) { // did we succeed in claiming our own buffer?
397 pr->u.p.ub = init + small_chunk + (id < extras ? 1 : 0);
398 // Other threads will inspect steal_flag when searching for a victim.
399 // READY means other threads may steal from this thread from now on.
400 KMP_ATOMIC_ST_REL(&pr->steal_flag, READY);
401 } else {
402 // another thread has stolen our whole range
403 KMP_DEBUG_ASSERT(pr->steal_flag == THIEF);
404 pr->u.p.ub = init; // mark that there are no iterations to work on
405 }
406 pr->u.p.parm2 = ntc; // save number of chunks
407 // parm3 is the number of times to attempt stealing, which is
408 // nproc (just a heuristic; could be optimized later on).
409 pr->u.p.parm3 = nproc;
410 pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid
411 break;
412 } else {
413 /* too few chunks: switching to kmp_sch_dynamic_chunked */
414 schedule = kmp_sch_dynamic_chunked;
415 KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d switching to "
416 "kmp_sch_dynamic_chunked\n",
417 gtid));
418 goto dynamic_init;
419 break;
420 } // if
421 } // case
422#endif
423 case kmp_sch_static_balanced: {
424 T init, limit;
425
426 KD_TRACE(
427 100,
428 ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_balanced case\n",
429 gtid));
430
431 if (nproc > 1) {
432 T id = tid;
433
434 if (tc < nproc) {
435 if (id < tc) {
436 init = id;
437 limit = id;
438 pr->u.p.parm1 = (id == tc - 1); /* parm1 stores *plastiter */
439 } else {
440 pr->u.p.count = 1; /* means no more chunks to execute */
441 pr->u.p.parm1 = FALSE;
442 break;
443 }
444 } else {
445 T small_chunk = tc / nproc;
446 T extras = tc % nproc;
447 init = id * small_chunk + (id < extras ? id : extras);
448 limit = init + small_chunk - (id < extras ? 0 : 1);
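// Illustration (arbitrary numbers): tc=10 iterations, nproc=4 threads gives
// small_chunk=2, extras=2; thread 0 takes [0,2], thread 1 [3,5], thread 2
// [6,7], thread 3 [8,9], i.e. at most a one-iteration imbalance.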
449 pr->u.p.parm1 = (id == nproc - 1);
450 }
451 } else {
452 if (tc > 0) {
453 init = 0;
454 limit = tc - 1;
455 pr->u.p.parm1 = TRUE;
456 } else {
457 // zero trip count
458 pr->u.p.count = 1; /* means no more chunks to execute */
459 pr->u.p.parm1 = FALSE;
460 break;
461 }
462 }
463#if USE_ITT_BUILD
464 // Calculate chunk for metadata report
465 if (itt_need_metadata_reporting)
466 if (cur_chunk)
467 *cur_chunk = limit - init + 1;
468#endif
469 if (st == 1) {
470 pr->u.p.lb = lb + init;
471 pr->u.p.ub = lb + limit;
472 } else {
473 // calculated upper bound, "ub" is user-defined upper bound
474 T ub_tmp = lb + limit * st;
475 pr->u.p.lb = lb + init * st;
476 // adjust upper bound to "ub" if needed, so that MS lastprivate will match
477 // it exactly
478 if (st > 0) {
479 pr->u.p.ub = (ub_tmp + st > ub ? ub : ub_tmp);
480 } else {
481 pr->u.p.ub = (ub_tmp + st < ub ? ub : ub_tmp);
482 }
483 }
484 if (pr->flags.ordered) {
485 pr->u.p.ordered_lower = init;
486 pr->u.p.ordered_upper = limit;
487 }
488 break;
489 } // case
490 case kmp_sch_static_balanced_chunked: {
491 // similar to balanced, but chunk adjusted to multiple of simd width
492 T nth = nproc;
493 KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d runtime(simd:static)"
494 " -> falling-through to static_greedy\n",
495 gtid));
496 schedule = kmp_sch_static_greedy;
497 if (nth > 1)
498 pr->u.p.parm1 = ((tc + nth - 1) / nth + chunk - 1) & ~(chunk - 1);
499 else
500 pr->u.p.parm1 = tc;
501 break;
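// Illustration of the parm1 rounding above (arbitrary numbers; chunk is the
// simd width and assumed to be a power of two so that "& ~(chunk - 1)" works):
// tc=100, nth=4, chunk=8 -> ceil(100/4) = 25, 25+7 = 32, 32 & ~7 = 32, so each
// thread's block is padded up to a multiple of the simd width.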
502 } // case
503 case kmp_sch_guided_simd:
504 case kmp_sch_guided_iterative_chunked: {
505 KD_TRACE(
506 100,
507 ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_guided_iterative_chunked"
508 " case\n",
509 gtid));
510
511 if (nproc > 1) {
512 if ((2L * chunk + 1) * nproc >= tc) {
513 /* chunk size too large, switch to dynamic */
514 schedule = kmp_sch_dynamic_chunked;
515 goto dynamic_init;
516 } else {
517 // when remaining iters become less than parm2 - switch to dynamic
518 pr->u.p.parm2 = guided_int_param * nproc * (chunk + 1);
519 *(double *)&pr->u.p.parm3 =
520 guided_flt_param / (double)nproc; // may occupy parm3 and parm4
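// Illustration (assuming the defaults guided_int_param == 2 and
// guided_flt_param == 0.5, i.e. the "K=2 by default" mentioned in the
// next-chunk code): nproc=8, chunk=4 gives parm2 = 2*8*5 = 80, so the loop
// degrades to plain dynamic once fewer than 80 iterations remain, and
// parm3 = 0.5/8 = 0.0625, so each successful grab takes roughly 1/16 of the
// remaining iterations.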
521 }
522 } else {
523 KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to "
524 "kmp_sch_static_greedy\n",
525 gtid));
526 schedule = kmp_sch_static_greedy;
527 /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
528 KD_TRACE(
529 100,
530 ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n",
531 gtid));
532 pr->u.p.parm1 = tc;
533 } // if
534 } // case
535 break;
536 case kmp_sch_guided_analytical_chunked: {
537 KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d "
538 "kmp_sch_guided_analytical_chunked case\n",
539 gtid));
540
541 if (nproc > 1) {
542 if ((2L * chunk + 1) * nproc >= tc) {
543 /* chunk size too large, switch to dynamic */
544 schedule = kmp_sch_dynamic_chunked;
545 goto dynamic_init;
546 } else {
547 /* commonly used term: (2 nproc - 1)/(2 nproc) */
548 DBL x;
549
550#if KMP_USE_X87CONTROL
551 /* Linux* OS already has 64-bit computation by default for long double,
552 and on Windows* OS on Intel(R) 64, /Qlong_double doesn't work. On
553 Windows* OS on IA-32 architecture, we need to set precision to 64-bit
554 instead of the default 53-bit. Even though long double doesn't work
555 on Windows* OS on Intel(R) 64, the resulting lack of precision is not
556 expected to impact the correctness of the algorithm, but this has not
557 been mathematically proven. */
558 // save original FPCW and set precision to 64-bit, as
559 // Windows* OS on IA-32 architecture defaults to 53-bit
560 unsigned int oldFpcw = _control87(0, 0);
561 _control87(_PC_64, _MCW_PC); // 0,0x30000
562#endif
563 /* value used for comparison in solver for cross-over point */
564 KMP_ASSERT(tc > 0);
565 long double target = ((long double)chunk * 2 + 1) * nproc / tc;
566
567 /* crossover point--chunk indexes equal to or greater than
568 this point switch to dynamic-style scheduling */
569 UT cross;
570
571 /* commonly used term: (2 nproc - 1)/(2 nproc) */
572 x = 1.0 - 0.5 / (double)nproc;
573
574#ifdef KMP_DEBUG
575 { // test natural alignment
576 struct _test_a {
577 char a;
578 union {
579 char b;
580 DBL d;
581 };
582 } t;
583 ptrdiff_t natural_alignment =
584 (ptrdiff_t)&t.b - (ptrdiff_t)&t - (ptrdiff_t)1;
585 //__kmp_warn( " %llx %llx %lld", (long long)&t.d, (long long)&t, (long
586 // long)natural_alignment );
587 KMP_DEBUG_ASSERT(
588 (((ptrdiff_t)&pr->u.p.parm3) & (natural_alignment)) == 0);
589 }
590#endif // KMP_DEBUG
591
592 /* save the term in thread private dispatch structure */
593 *(DBL *)&pr->u.p.parm3 = x;
594
595 /* solve for the crossover point to the nearest integer i for which C_i
596 <= chunk */
597 {
598 UT left, right, mid;
599 long double p;
600
601 /* estimate initial upper and lower bound */
602
603 /* doesn't matter what value right is as long as it is positive, but
604 it affects performance of the solver */
605 right = 229;
606 p = __kmp_pow<UT>(x, right);
607 if (p > target) {
608 do {
609 p *= p;
610 right <<= 1;
611 } while (p > target && right < (1 << 27));
612 /* lower bound is previous (failed) estimate of upper bound */
613 left = right >> 1;
614 } else {
615 left = 0;
616 }
617
618 /* bisection root-finding method */
619 while (left + 1 < right) {
620 mid = (left + right) / 2;
621 if (__kmp_pow<UT>(x, mid) > target) {
622 left = mid;
623 } else {
624 right = mid;
625 }
626 } // while
627 cross = right;
628 }
629 /* assert sanity of computed crossover point */
630 KMP_ASSERT(cross && __kmp_pow<UT>(x, cross - 1) > target &&
631 __kmp_pow<UT>(x, cross) <= target);
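// Illustration of the solver above (arbitrary numbers): nproc=4 gives
// x = 1 - 0.5/4 = 0.875; with chunk=10 and tc=10000, target = 21*4/10000 =
// 0.0084, and the bisection finds cross = 36 (x^35 ~ 0.0093 > target,
// x^36 ~ 0.0082 <= target). cross is roughly the first chunk index at which
// the geometrically shrinking guided chunk would fall below the specified
// chunk size, so later chunks use fixed-size dynamic-style scheduling.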
632
633 /* save the crossover point in thread private dispatch structure */
634 pr->u.p.parm2 = cross;
635
636// C75803
637#if ((KMP_OS_LINUX || KMP_OS_WINDOWS) && KMP_ARCH_X86) && (!defined(KMP_I8))
638#define GUIDED_ANALYTICAL_WORKAROUND (*(DBL *)&pr->u.p.parm3)
639#else
640#define GUIDED_ANALYTICAL_WORKAROUND (x)
641#endif
642 /* dynamic-style scheduling offset */
643 pr->u.p.count = tc -
644 __kmp_dispatch_guided_remaining(
645 tc, GUIDED_ANALYTICAL_WORKAROUND, cross) -
646 cross * chunk;
647#if KMP_USE_X87CONTROL
648 // restore FPCW
649 _control87(oldFpcw, _MCW_PC);
650#endif
651 } // if
652 } else {
653 KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to "
654 "kmp_sch_static_greedy\n",
655 gtid));
656 schedule = kmp_sch_static_greedy;
657 /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
658 pr->u.p.parm1 = tc;
659 } // if
660 } // case
661 break;
662 case kmp_sch_static_greedy:
663 KD_TRACE(
664 100,
665 ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n",
666 gtid));
667 pr->u.p.parm1 = (nproc > 1) ? (tc + nproc - 1) / nproc : tc;
668 break;
669 case kmp_sch_static_chunked:
670 case kmp_sch_dynamic_chunked:
671 dynamic_init:
672 if (tc == 0)
673 break;
674 if (pr->u.p.parm1 <= 0)
675 pr->u.p.parm1 = KMP_DEFAULT_CHUNK;
676 else if (pr->u.p.parm1 > tc)
677 pr->u.p.parm1 = tc;
678 // Store the total number of chunks to prevent integer overflow during
679 // bounds calculations in the get next chunk routine.
680 pr->u.p.parm2 = (tc / pr->u.p.parm1) + (tc % pr->u.p.parm1 ? 1 : 0);
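// Illustration (arbitrary numbers): tc=100 and chunk parm1=7 give
// parm2 = 100/7 + 1 = 15 chunks (14 full chunks plus a 2-iteration tail);
// the shared counter in the next-chunk routine then ranges over chunk
// numbers 0..14 only, which is what avoids the overflow mentioned above.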
681 KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d "
682 "kmp_sch_static_chunked/kmp_sch_dynamic_chunked cases\n",
683 gtid));
684 break;
685 case kmp_sch_trapezoidal: {
686 /* TSS: trapezoid self-scheduling, minimum chunk_size = parm1 */
687
688 T parm1, parm2, parm3, parm4;
689 KD_TRACE(100,
690 ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_trapezoidal case\n",
691 gtid));
692
693 parm1 = chunk;
694
695 /* F : size of the first cycle */
696 parm2 = (tc / (2 * nproc));
697
698 if (parm2 < 1) {
699 parm2 = 1;
700 }
701
702 /* L : size of the last cycle. Make sure the last cycle is not larger
703 than the first cycle. */
704 if (parm1 < 1) {
705 parm1 = 1;
706 } else if (parm1 > parm2) {
707 parm1 = parm2;
708 }
709
710 /* N : number of cycles */
711 parm3 = (parm2 + parm1);
712 parm3 = (2 * tc + parm3 - 1) / parm3;
713
714 if (parm3 < 2) {
715 parm3 = 2;
716 }
717
718 /* sigma : decreasing incr of the trapezoid */
719 parm4 = (parm3 - 1);
720 parm4 = (parm2 - parm1) / parm4;
721
722 // pointless check, because parm4 >= 0 always
723 // if ( parm4 < 0 ) {
724 // parm4 = 0;
725 //}
726
727 pr->u.p.parm1 = parm1;
728 pr->u.p.parm2 = parm2;
729 pr->u.p.parm3 = parm3;
730 pr->u.p.parm4 = parm4;
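// Worked example of the TSS parameters above (arbitrary numbers):
// tc=1000, nproc=4, chunk=10 gives
//   parm2 (first chunk size)    = 1000/8 = 125
//   parm1 (last chunk size)     = 10
//   parm3 (number of chunks)    = (2000 + 135 - 1)/135 = 15
//   parm4 (per-chunk decrement) = (125 - 10)/14 = 8
// so successive chunks are handed out as 125, 117, 109, ... shrinking by 8.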
731 } // case
732 break;
733
734 default: {
735 __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message
736 KMP_HNT(GetNewerLibrary), // Hint
737 __kmp_msg_null // Variadic argument list terminator
738 );
739 } break;
740 } // switch
741 pr->schedule = schedule;
742}
743
744#if KMP_USE_HIER_SCHED
745template <typename T>
746inline void __kmp_dispatch_init_hier_runtime(ident_t *loc, T lb, T ub,
747 typename traits_t<T>::signed_t st);
748template <>
749inline void
750__kmp_dispatch_init_hier_runtime<kmp_int32>(ident_t *loc, kmp_int32 lb,
751 kmp_int32 ub, kmp_int32 st) {
752 __kmp_dispatch_init_hierarchy<kmp_int32>(
753 loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
754 __kmp_hier_scheds.scheds, __kmp_hier_scheds.small_chunks, lb, ub, st);
755}
756template <>
757inline void
758__kmp_dispatch_init_hier_runtime<kmp_uint32>(ident_t *loc, kmp_uint32 lb,
759 kmp_uint32 ub, kmp_int32 st) {
760 __kmp_dispatch_init_hierarchy<kmp_uint32>(
761 loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
762 __kmp_hier_scheds.scheds, __kmp_hier_scheds.small_chunks, lb, ub, st);
763}
764template <>
765inline void
766__kmp_dispatch_init_hier_runtime<kmp_int64>(ident_t *loc, kmp_int64 lb,
767 kmp_int64 ub, kmp_int64 st) {
768 __kmp_dispatch_init_hierarchy<kmp_int64>(
769 loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
770 __kmp_hier_scheds.scheds, __kmp_hier_scheds.large_chunks, lb, ub, st);
771}
772template <>
773inline void
774__kmp_dispatch_init_hier_runtime<kmp_uint64>(ident_t *loc, kmp_uint64 lb,
775 kmp_uint64 ub, kmp_int64 st) {
776 __kmp_dispatch_init_hierarchy<kmp_uint64>(
777 loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
778 __kmp_hier_scheds.scheds, __kmp_hier_scheds.large_chunks, lb, ub, st);
779}
780
781// free all the hierarchy scheduling memory associated with the team
782void __kmp_dispatch_free_hierarchies(kmp_team_t *team) {
783 int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
784 for (int i = 0; i < num_disp_buff; ++i) {
785 // type does not matter here so use kmp_int32
786 auto sh =
787 reinterpret_cast<dispatch_shared_info_template<kmp_int32> volatile *>(
788 &team->t.t_disp_buffer[i]);
789 if (sh->hier) {
790 sh->hier->deallocate();
791 __kmp_free(sh->hier);
792 }
793 }
794}
795#endif
796
797// UT - unsigned flavor of T, ST - signed flavor of T,
798// DBL - double if sizeof(T)==4, or long double if sizeof(T)==8
799template <typename T>
800static void
801__kmp_dispatch_init(ident_t *loc, int gtid, enum sched_type schedule, T lb,
802 T ub, typename traits_t<T>::signed_t st,
803 typename traits_t<T>::signed_t chunk, int push_ws) {
804 typedef typename traits_t<T>::unsigned_t UT;
805
806 int active;
807 kmp_info_t *th;
808 kmp_team_t *team;
809 kmp_uint32 my_buffer_index;
810 dispatch_private_info_template<T> *pr;
811 dispatch_shared_info_template<T> volatile *sh;
812
813 KMP_BUILD_ASSERT(sizeof(dispatch_private_info_template<T>) ==
814 sizeof(dispatch_private_info));
815 KMP_BUILD_ASSERT(sizeof(dispatch_shared_info_template<UT>) ==
816 sizeof(dispatch_shared_info));
817 __kmp_assert_valid_gtid(gtid);
818
819 if (!TCR_4(__kmp_init_parallel))
820 __kmp_parallel_initialize();
821
822 __kmp_resume_if_soft_paused();
823
824#if INCLUDE_SSC_MARKS
825 SSC_MARK_DISPATCH_INIT();
826#endif
827#ifdef KMP_DEBUG
828 typedef typename traits_t<T>::signed_t ST;
829 {
830 char *buff;
831 // create format specifiers before the debug output
832 buff = __kmp_str_format("__kmp_dispatch_init: T#%%d called: schedule:%%d "
833 "chunk:%%%s lb:%%%s ub:%%%s st:%%%s\n",
834 traits_t<ST>::spec, traits_t<T>::spec,
835 traits_t<T>::spec, traits_t<ST>::spec);
836 KD_TRACE(10, (buff, gtid, schedule, chunk, lb, ub, st));
837 __kmp_str_free(&buff);
838 }
839#endif
840 /* setup data */
841 th = __kmp_threads[gtid];
842 team = th->th.th_team;
843 active = !team->t.t_serialized;
844 th->th.th_ident = loc;
845
846 // Any half-decent optimizer will remove this test when the blocks are empty
847 // since the macros expand to nothing
848 // when statistics are disabled.
849 if (schedule == __kmp_static) {
850 KMP_COUNT_BLOCK(OMP_LOOP_STATIC);
851 } else {
852 KMP_COUNT_BLOCK(OMP_LOOP_DYNAMIC);
853 }
854
855#if KMP_USE_HIER_SCHED
856 // Initialize the scheduling hierarchy if requested in the OMP_SCHEDULE
857 // environment variable. Hierarchical scheduling does not work with ordered,
858 // so if ordered is detected, revert to standard threaded scheduling.
859 bool ordered;
860 enum sched_type my_sched = schedule;
861 my_buffer_index = th->th.th_dispatch->th_disp_index;
862 pr = reinterpret_cast<dispatch_private_info_template<T> *>(
863 &th->th.th_dispatch
864 ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
865 my_sched = SCHEDULE_WITHOUT_MODIFIERS(my_sched);
866 if ((my_sched >= kmp_nm_lower) && (my_sched < kmp_nm_upper))
867 my_sched =
868 (enum sched_type)(((int)my_sched) - (kmp_nm_lower - kmp_sch_lower));
869 ordered = (kmp_ord_lower & my_sched);
870 if (pr->flags.use_hier) {
871 if (ordered) {
872 KD_TRACE(100, ("__kmp_dispatch_init: T#%d ordered loop detected. "
873 "Disabling hierarchical scheduling.\n",
874 gtid));
875 pr->flags.use_hier = FALSE;
876 }
877 }
878 if (schedule == kmp_sch_runtime && __kmp_hier_scheds.size > 0) {
879 // Don't use hierarchical for ordered parallel loops and don't
880 // use the runtime hierarchy if one was specified in the program
881 if (!ordered && !pr->flags.use_hier)
882 __kmp_dispatch_init_hier_runtime<T>(loc, lb, ub, st);
883 }
884#endif // KMP_USE_HIER_SCHED
885
886#if USE_ITT_BUILD
887 kmp_uint64 cur_chunk = chunk;
888 int itt_need_metadata_reporting =
889 __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
890 KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
891 team->t.t_active_level == 1;
892#endif
893 if (!active) {
894 pr = reinterpret_cast<dispatch_private_info_template<T> *>(
895 th->th.th_dispatch->th_disp_buffer); /* top of the stack */
896 } else {
897 KMP_DEBUG_ASSERT(th->th.th_dispatch ==
898 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
899
900 my_buffer_index = th->th.th_dispatch->th_disp_index++;
901
902 /* What happens when number of threads changes, need to resize buffer? */
903 pr = reinterpret_cast<dispatch_private_info_template<T> *>(
904 &th->th.th_dispatch
905 ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
906 sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>(
907 &team->t.t_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
908 KD_TRACE(10, ("__kmp_dispatch_init: T#%d my_buffer_index:%d\n", gtid,
909 my_buffer_index));
910 if (sh->buffer_index != my_buffer_index) { // too many loops in progress?
911 KD_TRACE(100, ("__kmp_dispatch_init: T#%d before wait: my_buffer_index:%d"
912 " sh->buffer_index:%d\n",
913 gtid, my_buffer_index, sh->buffer_index));
914 __kmp_wait<kmp_uint32>(&sh->buffer_index, my_buffer_index,
915 __kmp_eq<kmp_uint32> USE_ITT_BUILD_ARG(NULL));
916 // Note: KMP_WAIT() cannot be used there: buffer index and
917 // my_buffer_index are *always* 32-bit integers.
918 KD_TRACE(100, ("__kmp_dispatch_init: T#%d after wait: my_buffer_index:%d "
919 "sh->buffer_index:%d\n",
920 gtid, my_buffer_index, sh->buffer_index));
921 }
922 }
923
924 __kmp_dispatch_init_algorithm(loc, gtid, pr, schedule, lb, ub, st,
925#if USE_ITT_BUILD
926 &cur_chunk,
927#endif
928 chunk, (T)th->th.th_team_nproc,
929 (T)th->th.th_info.ds.ds_tid);
930 if (active) {
931 if (pr->flags.ordered == 0) {
932 th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo_error;
933 th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo_error;
934 } else {
935 th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo<UT>;
936 th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo<UT>;
937 }
938 th->th.th_dispatch->th_dispatch_pr_current = (dispatch_private_info_t *)pr;
939 th->th.th_dispatch->th_dispatch_sh_current =
940 CCAST(dispatch_shared_info_t *, (volatile dispatch_shared_info_t *)sh);
941#if USE_ITT_BUILD
942 if (pr->flags.ordered) {
943 __kmp_itt_ordered_init(gtid);
944 }
945 // Report loop metadata
946 if (itt_need_metadata_reporting) {
947 // Only report metadata by primary thread of active team at level 1
948 kmp_uint64 schedtype = 0;
949 switch (schedule) {
950 case kmp_sch_static_chunked:
951 case kmp_sch_static_balanced: // Chunk is calculated in the switch above
952 break;
953 case kmp_sch_static_greedy:
954 cur_chunk = pr->u.p.parm1;
955 break;
956 case kmp_sch_dynamic_chunked:
957 schedtype = 1;
958 break;
959 case kmp_sch_guided_iterative_chunked:
960 case kmp_sch_guided_analytical_chunked:
961 case kmp_sch_guided_simd:
962 schedtype = 2;
963 break;
964 default:
965 // Should we put this case under "static"?
966 // case kmp_sch_static_steal:
967 schedtype = 3;
968 break;
969 }
970 __kmp_itt_metadata_loop(loc, schedtype, pr->u.p.tc, cur_chunk);
971 }
972#if KMP_USE_HIER_SCHED
973 if (pr->flags.use_hier) {
974 pr->u.p.count = 0;
975 pr->u.p.ub = pr->u.p.lb = pr->u.p.st = pr->u.p.tc = 0;
976 }
977#endif // KMP_USE_HIER_SCHED
978#endif /* USE_ITT_BUILD */
979 }
980
981#ifdef KMP_DEBUG
982 {
983 char *buff;
984 // create format specifiers before the debug output
985 buff = __kmp_str_format(
986 "__kmp_dispatch_init: T#%%d returning: schedule:%%d ordered:%%%s "
987 "lb:%%%s ub:%%%s"
988 " st:%%%s tc:%%%s count:%%%s\n\tordered_lower:%%%s ordered_upper:%%%s"
989 " parm1:%%%s parm2:%%%s parm3:%%%s parm4:%%%s\n",
990 traits_t<UT>::spec, traits_t<T>::spec, traits_t<T>::spec,
991 traits_t<ST>::spec, traits_t<UT>::spec, traits_t<UT>::spec,
992 traits_t<UT>::spec, traits_t<UT>::spec, traits_t<T>::spec,
993 traits_t<T>::spec, traits_t<T>::spec, traits_t<T>::spec);
994 KD_TRACE(10, (buff, gtid, pr->schedule, pr->flags.ordered, pr->u.p.lb,
995 pr->u.p.ub, pr->u.p.st, pr->u.p.tc, pr->u.p.count,
996 pr->u.p.ordered_lower, pr->u.p.ordered_upper, pr->u.p.parm1,
997 pr->u.p.parm2, pr->u.p.parm3, pr->u.p.parm4));
998 __kmp_str_free(&buff);
999 }
1000#endif
1001#if OMPT_SUPPORT && OMPT_OPTIONAL
1002 if (ompt_enabled.ompt_callback_work) {
1003 ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
1004 ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
1005 ompt_callbacks.ompt_callback(ompt_callback_work)(
1006 ompt_work_loop, ompt_scope_begin, &(team_info->parallel_data),
1007 &(task_info->task_data), pr->u.p.tc, OMPT_LOAD_RETURN_ADDRESS(gtid));
1008 }
1009#endif
1010 KMP_PUSH_PARTITIONED_TIMER(OMP_loop_dynamic);
1011}
1012
1013/* For ordered loops, either __kmp_dispatch_finish() should be called after
1014 * every iteration, or __kmp_dispatch_finish_chunk() should be called after
1015 * every chunk of iterations. If the ordered section(s) were not executed
1016 * for this iteration (or every iteration in this chunk), we need to set the
1017 * ordered iteration counters so that the next thread can proceed. */
1018template <typename UT>
1019static void __kmp_dispatch_finish(int gtid, ident_t *loc) {
1020 typedef typename traits_t<UT>::signed_t ST;
1021 __kmp_assert_valid_gtid(gtid);
1022 kmp_info_t *th = __kmp_threads[gtid];
1023
1024 KD_TRACE(100, ("__kmp_dispatch_finish: T#%d called\n", gtid));
1025 if (!th->th.th_team->t.t_serialized) {
1026
1027 dispatch_private_info_template<UT> *pr =
1028 reinterpret_cast<dispatch_private_info_template<UT> *>(
1029 th->th.th_dispatch->th_dispatch_pr_current);
1030 dispatch_shared_info_template<UT> volatile *sh =
1031 reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
1032 th->th.th_dispatch->th_dispatch_sh_current);
1033 KMP_DEBUG_ASSERT(pr);
1034 KMP_DEBUG_ASSERT(sh);
1035 KMP_DEBUG_ASSERT(th->th.th_dispatch ==
1036 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
1037
1038 if (pr->ordered_bumped) {
1039 KD_TRACE(
1040 1000,
1041 ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
1042 gtid));
1043 pr->ordered_bumped = 0;
1044 } else {
1045 UT lower = pr->u.p.ordered_lower;
1046
1047#ifdef KMP_DEBUG
1048 {
1049 char *buff;
1050 // create format specifiers before the debug output
1051 buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d before wait: "
1052 "ordered_iteration:%%%s lower:%%%s\n",
1053 traits_t<UT>::spec, traits_t<UT>::spec);
1054 KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
1055 __kmp_str_free(&buff);
1056 }
1057#endif
1058
1059 __kmp_wait<UT>(&sh->u.s.ordered_iteration, lower,
1060 __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
1061 KMP_MB(); /* is this necessary? */
1062#ifdef KMP_DEBUG
1063 {
1064 char *buff;
1065 // create format specifiers before the debug output
1066 buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d after wait: "
1067 "ordered_iteration:%%%s lower:%%%s\n",
1068 traits_t<UT>::spec, traits_t<UT>::spec);
1069 KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
1070 __kmp_str_free(&buff);
1071 }
1072#endif
1073
1074 test_then_inc<ST>((volatile ST *)&sh->u.s.ordered_iteration);
1075 } // if
1076 } // if
1077 KD_TRACE(100, ("__kmp_dispatch_finish: T#%d returned\n", gtid));
1078}
1079
1080#ifdef KMP_GOMP_COMPAT
1081
1082template <typename UT>
1083static void __kmp_dispatch_finish_chunk(int gtid, ident_t *loc) {
1084 typedef typename traits_t<UT>::signed_t ST;
1085 __kmp_assert_valid_gtid(gtid);
1086 kmp_info_t *th = __kmp_threads[gtid];
1087
1088 KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d called\n", gtid));
1089 if (!th->th.th_team->t.t_serialized) {
1090 dispatch_private_info_template<UT> *pr =
1091 reinterpret_cast<dispatch_private_info_template<UT> *>(
1092 th->th.th_dispatch->th_dispatch_pr_current);
1093 dispatch_shared_info_template<UT> volatile *sh =
1094 reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
1095 th->th.th_dispatch->th_dispatch_sh_current);
1096 KMP_DEBUG_ASSERT(pr);
1097 KMP_DEBUG_ASSERT(sh);
1098 KMP_DEBUG_ASSERT(th->th.th_dispatch ==
1099 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
1100
1101 UT lower = pr->u.p.ordered_lower;
1102 UT upper = pr->u.p.ordered_upper;
1103 UT inc = upper - lower + 1;
1104
1105 if (pr->ordered_bumped == inc) {
1106 KD_TRACE(
1107 1000,
1108 ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
1109 gtid));
1110 pr->ordered_bumped = 0;
1111 } else {
1112 inc -= pr->ordered_bumped;
1113
1114#ifdef KMP_DEBUG
1115 {
1116 char *buff;
1117 // create format specifiers before the debug output
1118 buff = __kmp_str_format(
1119 "__kmp_dispatch_finish_chunk: T#%%d before wait: "
1120 "ordered_iteration:%%%s lower:%%%s upper:%%%s\n",
1121 traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec);
1122 KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower, upper));
1123 __kmp_str_free(&buff);
1124 }
1125#endif
1126
1127 __kmp_wait<UT>(&sh->u.s.ordered_iteration, lower,
1128 __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
1129
1130 KMP_MB(); /* is this necessary? */
1131 KD_TRACE(1000, ("__kmp_dispatch_finish_chunk: T#%d resetting "
1132 "ordered_bumped to zero\n",
1133 gtid));
1134 pr->ordered_bumped = 0;
1136#ifdef KMP_DEBUG
1137 {
1138 char *buff;
1139 // create format specifiers before the debug output
1140 buff = __kmp_str_format(
1141 "__kmp_dispatch_finish_chunk: T#%%d after wait: "
1142 "ordered_iteration:%%%s inc:%%%s lower:%%%s upper:%%%s\n",
1143 traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec,
1144 traits_t<UT>::spec);
1145 KD_TRACE(1000,
1146 (buff, gtid, sh->u.s.ordered_iteration, inc, lower, upper));
1147 __kmp_str_free(&buff);
1148 }
1149#endif
1150
1151 test_then_add<ST>((volatile ST *)&sh->u.s.ordered_iteration, inc);
1152 }
1153 // }
1154 }
1155 KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d returned\n", gtid));
1156}
1157
1158#endif /* KMP_GOMP_COMPAT */
1159
1160template <typename T>
1161int __kmp_dispatch_next_algorithm(int gtid,
1162 dispatch_private_info_template<T> *pr,
1163 dispatch_shared_info_template<T> volatile *sh,
1164 kmp_int32 *p_last, T *p_lb, T *p_ub,
1165 typename traits_t<T>::signed_t *p_st, T nproc,
1166 T tid) {
1167 typedef typename traits_t<T>::unsigned_t UT;
1168 typedef typename traits_t<T>::signed_t ST;
1169 typedef typename traits_t<T>::floating_t DBL;
1170 int status = 0;
1171 bool last = false;
1172 T start;
1173 ST incr;
1174 UT limit, trip, init;
1175 kmp_info_t *th = __kmp_threads[gtid];
1176 kmp_team_t *team = th->th.th_team;
1177
1178 KMP_DEBUG_ASSERT(th->th.th_dispatch ==
1179 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
1180 KMP_DEBUG_ASSERT(pr);
1181 KMP_DEBUG_ASSERT(sh);
1182 KMP_DEBUG_ASSERT(tid >= 0 && tid < nproc);
1183#ifdef KMP_DEBUG
1184 {
1185 char *buff;
1186 // create format specifiers before the debug output
1187 buff =
1188 __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d called pr:%%p "
1189 "sh:%%p nproc:%%%s tid:%%%s\n",
1190 traits_t<T>::spec, traits_t<T>::spec);
1191 KD_TRACE(10, (buff, gtid, pr, sh, nproc, tid));
1192 __kmp_str_free(&buff);
1193 }
1194#endif
1195
1196 // zero trip count
1197 if (pr->u.p.tc == 0) {
1198 KD_TRACE(10,
1199 ("__kmp_dispatch_next_algorithm: T#%d early exit trip count is "
1200 "zero status:%d\n",
1201 gtid, status));
1202 return 0;
1203 }
1204
1205 switch (pr->schedule) {
1206#if KMP_STATIC_STEAL_ENABLED
1207 case kmp_sch_static_steal: {
1208 T chunk = pr->u.p.parm1;
1209 UT nchunks = pr->u.p.parm2;
1210 KD_TRACE(100,
1211 ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_steal case\n",
1212 gtid));
1213
1214 trip = pr->u.p.tc - 1;
1215
1216 if (traits_t<T>::type_size > 4) {
1217 // use lock for 8-byte induction variable.
1218 // TODO (optional): check presence and use 16-byte CAS
1219 kmp_lock_t *lck = pr->u.p.steal_lock;
1220 KMP_DEBUG_ASSERT(lck != NULL);
1221 if (pr->u.p.count < (UT)pr->u.p.ub) {
1222 KMP_DEBUG_ASSERT(pr->steal_flag == READY);
1223 __kmp_acquire_lock(lck, gtid);
1224 // try to get own chunk of iterations
1225 init = (pr->u.p.count)++;
1226 status = (init < (UT)pr->u.p.ub);
1227 __kmp_release_lock(lck, gtid);
1228 } else {
1229 status = 0; // no own chunks
1230 }
1231 if (!status) { // try to steal
1232 kmp_lock_t *lckv; // victim buffer's lock
1233 T while_limit = pr->u.p.parm3;
1234 T while_index = 0;
1235 int idx = (th->th.th_dispatch->th_disp_index - 1) %
1236 __kmp_dispatch_num_buffers; // current loop index
1237 // note: victim thread can potentially execute another loop
1238 KMP_ATOMIC_ST_REL(&pr->steal_flag, THIEF); // mark self buffer inactive
1239 while ((!status) && (while_limit != ++while_index)) {
1240 dispatch_private_info_template<T> *v;
1241 T remaining;
1242 T victimId = pr->u.p.parm4;
1243 T oldVictimId = victimId ? victimId - 1 : nproc - 1;
1244 v = reinterpret_cast<dispatch_private_info_template<T> *>(
1245 &team->t.t_dispatch[victimId].th_disp_buffer[idx]);
1246 KMP_DEBUG_ASSERT(v);
1247 while ((v == pr || KMP_ATOMIC_LD_RLX(&v->steal_flag) == THIEF) &&
1248 oldVictimId != victimId) {
1249 victimId = (victimId + 1) % nproc;
1250 v = reinterpret_cast<dispatch_private_info_template<T> *>(
1251 &team->t.t_dispatch[victimId].th_disp_buffer[idx]);
1252 KMP_DEBUG_ASSERT(v);
1253 }
1254 if (v == pr || KMP_ATOMIC_LD_RLX(&v->steal_flag) == THIEF) {
1255 continue; // try once more (nproc attempts in total)
1256 }
1257 if (KMP_ATOMIC_LD_RLX(&v->steal_flag) == UNUSED) {
1258 kmp_uint32 old = UNUSED;
1259 // try to steal whole range from inactive victim
1260 status = v->steal_flag.compare_exchange_strong(old, THIEF);
1261 if (status) {
1262 // initialize self buffer with victim's whole range of chunks
1263 T id = victimId;
1264 T small_chunk, extras;
1265 small_chunk = nchunks / nproc; // chunks per thread
1266 extras = nchunks % nproc;
1267 init = id * small_chunk + (id < extras ? id : extras);
1268 __kmp_acquire_lock(lck, gtid);
1269 pr->u.p.count = init + 1; // exclude one we execute immediately
1270 pr->u.p.ub = init + small_chunk + (id < extras ? 1 : 0);
1271 __kmp_release_lock(lck, gtid);
1272 pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid
1273 // no need to reinitialize other thread invariants: lb, st, etc.
1274#ifdef KMP_DEBUG
1275 {
1276 char *buff;
1277 // create format specifiers before the debug output
1278 buff = __kmp_str_format(
1279 "__kmp_dispatch_next: T#%%d stolen chunks from T#%%d, "
1280 "count:%%%s ub:%%%s\n",
1281 traits_t<UT>::spec, traits_t<T>::spec);
1282 KD_TRACE(10, (buff, gtid, id, pr->u.p.count, pr->u.p.ub));
1283 __kmp_str_free(&buff);
1284 }
1285#endif
1286 // activate non-empty buffer and let others steal from us
1287 if (pr->u.p.count < (UT)pr->u.p.ub)
1288 KMP_ATOMIC_ST_REL(&pr->steal_flag, READY);
1289 break;
1290 }
1291 }
1292 if (KMP_ATOMIC_LD_RLX(&v->steal_flag) != READY ||
1293 v->u.p.count >= (UT)v->u.p.ub) {
1294 pr->u.p.parm4 = (victimId + 1) % nproc; // shift start victim tid
1295 continue; // no chunks to steal, try next victim
1296 }
1297 lckv = v->u.p.steal_lock;
1298 KMP_ASSERT(lckv != NULL);
1299 __kmp_acquire_lock(lckv, gtid);
1300 limit = v->u.p.ub; // keep initial ub
1301 if (v->u.p.count >= limit) {
1302 __kmp_release_lock(lckv, gtid);
1303 pr->u.p.parm4 = (victimId + 1) % nproc; // shift start victim tid
1304 continue; // no chunks to steal, try next victim
1305 }
1306
1307 // stealing succeeded, reduce victim's ub by 1/4 of undone chunks
1308 // TODO: is this heuristic good enough?
1309 remaining = limit - v->u.p.count;
1310 if (remaining > 7) {
1311 // steal 1/4 of remaining
1312 KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, remaining >> 2);
1313 init = (v->u.p.ub -= (remaining >> 2));
1314 } else {
1315 // steal 1 chunk of 1..7 remaining
1316 KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, 1);
1317 init = (v->u.p.ub -= 1);
1318 }
1319 __kmp_release_lock(lckv, gtid);
1320#ifdef KMP_DEBUG
1321 {
1322 char *buff;
1323 // create format specifiers before the debug output
1324 buff = __kmp_str_format(
1325 "__kmp_dispatch_next: T#%%d stolen chunks from T#%%d, "
1326 "count:%%%s ub:%%%s\n",
1327 traits_t<UT>::spec, traits_t<UT>::spec);
1328 KD_TRACE(10, (buff, gtid, victimId, init, limit));
1329 __kmp_str_free(&buff);
1330 }
1331#endif
1332 KMP_DEBUG_ASSERT(init + 1 <= limit);
1333 pr->u.p.parm4 = victimId; // remember victim to steal from
1334 status = 1;
1335 // now update own count and ub with stolen range excluding init chunk
1336 __kmp_acquire_lock(lck, gtid);
1337 pr->u.p.count = init + 1;
1338 pr->u.p.ub = limit;
1339 __kmp_release_lock(lck, gtid);
1340 // activate non-empty buffer and let others steal from us
1341 if (init + 1 < limit)
1342 KMP_ATOMIC_ST_REL(&pr->steal_flag, READY);
1343 } // while (search for victim)
1344 } // if (try to find victim and steal)
1345 } else {
1346 // 4-byte induction variable, use 8-byte CAS for pair (count, ub)
1347 // as all operations on pair (count, ub) must be done atomically
1348 typedef union {
1349 struct {
1350 UT count;
1351 T ub;
1352 } p;
1353 kmp_int64 b;
1354 } union_i4;
1355 union_i4 vold, vnew;
1356 if (pr->u.p.count < (UT)pr->u.p.ub) {
1357 KMP_DEBUG_ASSERT(pr->steal_flag == READY);
1358 vold.b = *(volatile kmp_int64 *)(&pr->u.p.count);
1359 vnew.b = vold.b;
1360 vnew.p.count++; // get chunk from head of self range
1361 while (!KMP_COMPARE_AND_STORE_REL64(
1362 (volatile kmp_int64 *)&pr->u.p.count,
1363 *VOLATILE_CAST(kmp_int64 *) & vold.b,
1364 *VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
1365 KMP_CPU_PAUSE();
1366 vold.b = *(volatile kmp_int64 *)(&pr->u.p.count);
1367 vnew.b = vold.b;
1368 vnew.p.count++;
1369 }
1370 init = vold.p.count;
1371 status = (init < (UT)vold.p.ub);
1372 } else {
1373 status = 0; // no own chunks
1374 }
1375 if (!status) { // try to steal
1376 T while_limit = pr->u.p.parm3;
1377 T while_index = 0;
1378 int idx = (th->th.th_dispatch->th_disp_index - 1) %
1379 __kmp_dispatch_num_buffers; // current loop index
1380 // note: victim thread can potentially execute another loop
1381 KMP_ATOMIC_ST_REL(&pr->steal_flag, THIEF); // mark self buffer inactive
1382 while ((!status) && (while_limit != ++while_index)) {
1383 dispatch_private_info_template<T> *v;
1384 T remaining;
1385 T victimId = pr->u.p.parm4;
1386 T oldVictimId = victimId ? victimId - 1 : nproc - 1;
1387 v = reinterpret_cast<dispatch_private_info_template<T> *>(
1388 &team->t.t_dispatch[victimId].th_disp_buffer[idx]);
1389 KMP_DEBUG_ASSERT(v);
1390 while ((v == pr || KMP_ATOMIC_LD_RLX(&v->steal_flag) == THIEF) &&
1391 oldVictimId != victimId) {
1392 victimId = (victimId + 1) % nproc;
1393 v = reinterpret_cast<dispatch_private_info_template<T> *>(
1394 &team->t.t_dispatch[victimId].th_disp_buffer[idx]);
1395 KMP_DEBUG_ASSERT(v);
1396 }
1397 if (v == pr || KMP_ATOMIC_LD_RLX(&v->steal_flag) == THIEF) {
1398 continue; // try once more (nproc attempts in total)
1399 }
1400 if (KMP_ATOMIC_LD_RLX(&v->steal_flag) == UNUSED) {
1401 kmp_uint32 old = UNUSED;
1402 // try to steal whole range from inactive victim
1403 status = v->steal_flag.compare_exchange_strong(old, THIEF);
1404 if (status) {
1405 // initialize self buffer with victim's whole range of chunks
1406 T id = victimId;
1407 T small_chunk, extras;
1408 small_chunk = nchunks / nproc; // chunks per thread
1409 extras = nchunks % nproc;
1410 init = id * small_chunk + (id < extras ? id : extras);
1411 vnew.p.count = init + 1;
1412 vnew.p.ub = init + small_chunk + (id < extras ? 1 : 0);
1413 // write pair (count, ub) at once atomically
1414#if KMP_ARCH_X86
1415 KMP_XCHG_FIXED64((volatile kmp_int64 *)(&pr->u.p.count), vnew.b);
1416#else
1417 *(volatile kmp_int64 *)(&pr->u.p.count) = vnew.b;
1418#endif
1419 pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid
1420 // no need to initialize other thread invariants: lb, st, etc.
1421#ifdef KMP_DEBUG
1422 {
1423 char *buff;
1424 // create format specifiers before the debug output
1425 buff = __kmp_str_format(
1426 "__kmp_dispatch_next: T#%%d stolen chunks from T#%%d, "
1427 "count:%%%s ub:%%%s\n",
1428 traits_t<UT>::spec, traits_t<T>::spec);
1429 KD_TRACE(10, (buff, gtid, id, pr->u.p.count, pr->u.p.ub));
1430 __kmp_str_free(&buff);
1431 }
1432#endif
1433 // activate non-empty buffer and let others steal from us
1434 if (pr->u.p.count < (UT)pr->u.p.ub)
1435 KMP_ATOMIC_ST_REL(&pr->steal_flag, READY);
1436 break;
1437 }
1438 }
1439 while (1) { // CAS loop with check if victim still has enough chunks
1440 // many threads may be stealing concurrently from same victim
1441 vold.b = *(volatile kmp_int64 *)(&v->u.p.count);
1442 if (KMP_ATOMIC_LD_ACQ(&v->steal_flag) != READY ||
1443 vold.p.count >= (UT)vold.p.ub) {
1444 pr->u.p.parm4 = (victimId + 1) % nproc; // shift start victim id
1445 break; // no chunks to steal, try next victim
1446 }
1447 vnew.b = vold.b;
1448 remaining = vold.p.ub - vold.p.count;
1449 // try to steal 1/4 of remaining
1450 // TODO: is this heuristic good enough?
1451 if (remaining > 7) {
1452 vnew.p.ub -= remaining >> 2; // steal from tail of victim's range
1453 } else {
1454 vnew.p.ub -= 1; // steal 1 chunk of 1..7 remaining
1455 }
1456 KMP_DEBUG_ASSERT(vnew.p.ub * (UT)chunk <= trip);
1457 if (KMP_COMPARE_AND_STORE_REL64(
1458 (volatile kmp_int64 *)&v->u.p.count,
1459 *VOLATILE_CAST(kmp_int64 *) & vold.b,
1460 *VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
1461 // stealing succeeded
1462#ifdef KMP_DEBUG
1463 {
1464 char *buff;
1465 // create format specifiers before the debug output
1466 buff = __kmp_str_format(
1467 "__kmp_dispatch_next: T#%%d stolen chunks from T#%%d, "
1468 "count:%%%s ub:%%%s\n",
1469 traits_t<T>::spec, traits_t<T>::spec);
1470 KD_TRACE(10, (buff, gtid, victimId, vnew.p.ub, vold.p.ub));
1471 __kmp_str_free(&buff);
1472 }
1473#endif
1474 KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen,
1475 vold.p.ub - vnew.p.ub);
1476 status = 1;
1477 pr->u.p.parm4 = victimId; // keep victim id
1478 // now update own count and ub
1479 init = vnew.p.ub;
1480 vold.p.count = init + 1;
1481#if KMP_ARCH_X86
1482 KMP_XCHG_FIXED64((volatile kmp_int64 *)(&pr->u.p.count), vold.b);
1483#else
1484 *(volatile kmp_int64 *)(&pr->u.p.count) = vold.b;
1485#endif
1486 // activate non-empty buffer and let others steal from us
1487 if (vold.p.count < (UT)vold.p.ub)
1488 KMP_ATOMIC_ST_REL(&pr->steal_flag, READY);
1489 break;
1490 } // if (check CAS result)
1491 KMP_CPU_PAUSE(); // CAS failed, retry
1492 } // while (try to steal from particular victim)
1493 } // while (search for victim)
1494 } // if (try to find victim and steal)
1495 } // if (4-byte induction variable)
1496 if (!status) {
1497 *p_lb = 0;
1498 *p_ub = 0;
1499 if (p_st != NULL)
1500 *p_st = 0;
1501 } else {
1502 start = pr->u.p.lb;
1503 init *= chunk;
1504 limit = chunk + init - 1;
1505 incr = pr->u.p.st;
1506 KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_chunks, 1);
1507
1508 KMP_DEBUG_ASSERT(init <= trip);
1509 // keep track of done chunks for possible early exit from stealing
1510 // TODO: count executed chunks locally with rare update of shared location
1511 // test_then_inc<ST>((volatile ST *)&sh->u.s.iteration);
1512 if ((last = (limit >= trip)) != 0)
1513 limit = trip;
1514 if (p_st != NULL)
1515 *p_st = incr;
1516
1517 if (incr == 1) {
1518 *p_lb = start + init;
1519 *p_ub = start + limit;
1520 } else {
1521 *p_lb = start + init * incr;
1522 *p_ub = start + limit * incr;
1523 }
1524 } // if
1525 break;
1526 } // case
1527#endif // KMP_STATIC_STEAL_ENABLED
1528 case kmp_sch_static_balanced: {
1529 KD_TRACE(
1530 10,
1531 ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_balanced case\n",
1532 gtid));
1533 /* check if thread has any iteration to do */
1534 if ((status = !pr->u.p.count) != 0) {
1535 pr->u.p.count = 1;
1536 *p_lb = pr->u.p.lb;
1537 *p_ub = pr->u.p.ub;
1538 last = (pr->u.p.parm1 != 0);
1539 if (p_st != NULL)
1540 *p_st = pr->u.p.st;
1541 } else { /* no iterations to do */
1542 pr->u.p.lb = pr->u.p.ub + pr->u.p.st;
1543 }
1544 } // case
1545 break;
1546 case kmp_sch_static_greedy: /* original code for kmp_sch_static_greedy was
1547 merged here */
1548 case kmp_sch_static_chunked: {
1549 T parm1;
1550
1551 KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d "
1552 "kmp_sch_static_[affinity|chunked] case\n",
1553 gtid));
1554 parm1 = pr->u.p.parm1;
1555
1556 trip = pr->u.p.tc - 1;
1557 init = parm1 * (pr->u.p.count + tid);
1558
1559 if ((status = (init <= trip)) != 0) {
1560 start = pr->u.p.lb;
1561 incr = pr->u.p.st;
1562 limit = parm1 + init - 1;
1563
1564 if ((last = (limit >= trip)) != 0)
1565 limit = trip;
1566
1567 if (p_st != NULL)
1568 *p_st = incr;
1569
1570 pr->u.p.count += nproc;
1571
1572 if (incr == 1) {
1573 *p_lb = start + init;
1574 *p_ub = start + limit;
1575 } else {
1576 *p_lb = start + init * incr;
1577 *p_ub = start + limit * incr;
1578 }
1579
1580 if (pr->flags.ordered) {
1581 pr->u.p.ordered_lower = init;
1582 pr->u.p.ordered_upper = limit;
1583 } // if
1584 } // if
1585 } // case
1586 break;
1587
1588 case kmp_sch_dynamic_chunked: {
1589 UT chunk_number;
1590 UT chunk_size = pr->u.p.parm1;
1591 UT nchunks = pr->u.p.parm2;
1592
1593 KD_TRACE(
1594 100,
1595 ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_dynamic_chunked case\n",
1596 gtid));
1597
1598 chunk_number = test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration);
1599 status = (chunk_number < nchunks);
1600 if (!status) {
1601 *p_lb = 0;
1602 *p_ub = 0;
1603 if (p_st != NULL)
1604 *p_st = 0;
1605 } else {
1606 init = chunk_size * chunk_number;
1607 trip = pr->u.p.tc - 1;
1608 start = pr->u.p.lb;
1609 incr = pr->u.p.st;
1610
1611 if ((last = (trip - init < (UT)chunk_size)))
1612 limit = trip;
1613 else
1614 limit = chunk_size + init - 1;
1615
1616 if (p_st != NULL)
1617 *p_st = incr;
1618
1619 if (incr == 1) {
1620 *p_lb = start + init;
1621 *p_ub = start + limit;
1622 } else {
1623 *p_lb = start + init * incr;
1624 *p_ub = start + limit * incr;
1625 }
1626
1627 if (pr->flags.ordered) {
1628 pr->u.p.ordered_lower = init;
1629 pr->u.p.ordered_upper = limit;
1630 } // if
1631 } // if
1632 } // case
1633 break;
1634
1635 case kmp_sch_guided_iterative_chunked: {
1636 T chunkspec = pr->u.p.parm1;
1637 KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_chunked "
1638 "iterative case\n",
1639 gtid));
1640 trip = pr->u.p.tc;
1641 // Start atomic part of calculations
1642 while (1) {
1643 ST remaining; // signed, because can be < 0
1644 init = sh->u.s.iteration; // shared value
1645 remaining = trip - init;
1646 if (remaining <= 0) { // AC: need to compare with 0 first
1647 // nothing to do, don't try atomic op
1648 status = 0;
1649 break;
1650 }
1651 if ((T)remaining <
1652 pr->u.p.parm2) { // compare with K*nproc*(chunk+1), K=2 by default
1653 // use dynamic-style schedule
1654 // atomically increment iterations, get old value
1655 init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1656 (ST)chunkspec);
1657 remaining = trip - init;
1658 if (remaining <= 0) {
1659 status = 0; // all iterations taken by other threads
1660 } else {
1661 // got some iterations to work on
1662 status = 1;
1663 if ((T)remaining > chunkspec) {
1664 limit = init + chunkspec - 1;
1665 } else {
1666 last = true; // the last chunk
1667 limit = init + remaining - 1;
1668 } // if
1669 } // if
1670 break;
1671 } // if
1672 limit = init + (UT)((double)remaining *
1673 *(double *)&pr->u.p.parm3); // divide by K*nproc
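// Illustration (arbitrary numbers): with parm3 = 0.0625 (nproc=8, K=2) and
// remaining = 800, the CAS below tries to claim 800 * 0.0625 = 50 iterations;
// as 'remaining' shrinks, so does each claimed block, giving the usual
// guided-schedule decay.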
1674 if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1675 (ST)init, (ST)limit)) {
1676 // CAS was successful, chunk obtained
1677 status = 1;
1678 --limit;
1679 break;
1680 } // if
1681 } // while
1682 if (status != 0) {
1683 start = pr->u.p.lb;
1684 incr = pr->u.p.st;
1685 if (p_st != NULL)
1686 *p_st = incr;
1687 *p_lb = start + init * incr;
1688 *p_ub = start + limit * incr;
1689 if (pr->flags.ordered) {
1690 pr->u.p.ordered_lower = init;
1691 pr->u.p.ordered_upper = limit;
1692 } // if
1693 } else {
1694 *p_lb = 0;
1695 *p_ub = 0;
1696 if (p_st != NULL)
1697 *p_st = 0;
1698 } // if
1699 } // case
1700 break;
1701
1702 case kmp_sch_guided_simd: {
1703 // same as iterative but curr-chunk adjusted to be multiple of given
1704 // chunk
1705 T chunk = pr->u.p.parm1;
1706 KD_TRACE(100,
1707 ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_simd case\n",
1708 gtid));
1709 trip = pr->u.p.tc;
1710 // Start atomic part of calculations
1711 while (1) {
1712 ST remaining; // signed, because can be < 0
1713 init = sh->u.s.iteration; // shared value
1714 remaining = trip - init;
1715 if (remaining <= 0) { // AC: need to compare with 0 first
1716 status = 0; // nothing to do, don't try atomic op
1717 break;
1718 }
1719 KMP_DEBUG_ASSERT(chunk && init % chunk == 0);
1720 // compare with K*nproc*(chunk+1), K=2 by default
1721 if ((T)remaining < pr->u.p.parm2) {
1722 // use dynamic-style schedule
1723 // atomically increment iterations, get old value
1724 init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1725 (ST)chunk);
1726 remaining = trip - init;
1727 if (remaining <= 0) {
1728 status = 0; // all iterations already taken by other threads
1729 } else {
1730 // got some iterations to work on
1731 status = 1;
1732 if ((T)remaining > chunk) {
1733 limit = init + chunk - 1;
1734 } else {
1735 last = true; // the last chunk
1736 limit = init + remaining - 1;
1737 } // if
1738 } // if
1739 break;
1740 } // if
1741 // divide by K*nproc
1742 UT span;
1743 __kmp_type_convert((double)remaining * (*(double *)&pr->u.p.parm3),
1744 &span);
1745 UT rem = span % chunk;
1746 if (rem) // adjust so that span%chunk == 0
1747 span += chunk - rem;
1748 limit = init + span;
1749 if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1750 (ST)init, (ST)limit)) {
1751 // CAS was successful, chunk obtained
1752 status = 1;
1753 --limit;
1754 break;
1755 } // if
1756 } // while
1757 if (status != 0) {
1758 start = pr->u.p.lb;
1759 incr = pr->u.p.st;
1760 if (p_st != NULL)
1761 *p_st = incr;
1762 *p_lb = start + init * incr;
1763 *p_ub = start + limit * incr;
1764 if (pr->flags.ordered) {
1765 pr->u.p.ordered_lower = init;
1766 pr->u.p.ordered_upper = limit;
1767 } // if
1768 } else {
1769 *p_lb = 0;
1770 *p_ub = 0;
1771 if (p_st != NULL)
1772 *p_st = 0;
1773 } // if
1774 } // case
1775 break;
1776
1777 case kmp_sch_guided_analytical_chunked: {
1778 T chunkspec = pr->u.p.parm1;
1779 UT chunkIdx;
1780#if KMP_USE_X87CONTROL
1781 /* for storing original FPCW value for Windows* OS on
1782 IA-32 architecture 8-byte version */
1783 unsigned int oldFpcw;
1784 unsigned int fpcwSet = 0;
1785#endif
1786 KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d "
1787 "kmp_sch_guided_analytical_chunked case\n",
1788 gtid));
1789
1790 trip = pr->u.p.tc;
1791
1792 KMP_DEBUG_ASSERT(nproc > 1);
1793 KMP_DEBUG_ASSERT((2UL * chunkspec + 1) * (UT)nproc < trip);
1794
1795 while (1) { /* this while loop is a safeguard against unexpected zero
1796 chunk sizes */
1797 chunkIdx = test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration);
1798 if (chunkIdx >= (UT)pr->u.p.parm2) {
1799 --trip;
1800 /* use dynamic-style scheduling */
1801 init = chunkIdx * chunkspec + pr->u.p.count;
1802 /* need to verify init > 0 in case of overflow in the above
1803 * calculation */
1804 if ((status = (init > 0 && init <= trip)) != 0) {
1805 limit = init + chunkspec - 1;
1806
1807 if ((last = (limit >= trip)) != 0)
1808 limit = trip;
1809 }
1810 break;
1811 } else {
1812/* use exponential-style scheduling */
1813/* The following check works around the lack of long double precision on
1814 Windows* OS, which could otherwise cause the computed init to be nonzero
1815 for chunkIdx == 0.
1816 */
1817#if KMP_USE_X87CONTROL
1818 /* If we haven't already done so, save original
1819 FPCW and set precision to 64-bit, as Windows* OS
1820 on IA-32 architecture defaults to 53-bit */
1821 if (!fpcwSet) {
1822 oldFpcw = _control87(0, 0);
1823 _control87(_PC_64, _MCW_PC);
1824 fpcwSet = 0x30000;
1825 }
1826#endif
1827 if (chunkIdx) {
1828 init = __kmp_dispatch_guided_remaining<T>(
1829 trip, *(DBL *)&pr->u.p.parm3, chunkIdx);
1830 KMP_DEBUG_ASSERT(init);
1831 init = trip - init;
1832 } else
1833 init = 0;
1834 limit = trip - __kmp_dispatch_guided_remaining<T>(
1835 trip, *(DBL *)&pr->u.p.parm3, chunkIdx + 1);
1836 KMP_ASSERT(init <= limit);
1837 if (init < limit) {
1838 KMP_DEBUG_ASSERT(limit <= trip);
1839 --limit;
1840 status = 1;
1841 break;
1842 } // if
1843 } // if
1844 } // while (1)
1845#if KMP_USE_X87CONTROL
1846 /* restore FPCW if necessary
1847 AC: check fpcwSet flag first because oldFpcw can be uninitialized here
1848 */
1849 if (fpcwSet && (oldFpcw & fpcwSet))
1850 _control87(oldFpcw, _MCW_PC);
1851#endif
1852 if (status != 0) {
1853 start = pr->u.p.lb;
1854 incr = pr->u.p.st;
1855 if (p_st != NULL)
1856 *p_st = incr;
1857 *p_lb = start + init * incr;
1858 *p_ub = start + limit * incr;
1859 if (pr->flags.ordered) {
1860 pr->u.p.ordered_lower = init;
1861 pr->u.p.ordered_upper = limit;
1862 }
1863 } else {
1864 *p_lb = 0;
1865 *p_ub = 0;
1866 if (p_st != NULL)
1867 *p_st = 0;
1868 }
1869 } // case
1870 break;
1871
1872 case kmp_sch_trapezoidal: {
1873 UT index;
1874 T parm2 = pr->u.p.parm2;
1875 T parm3 = pr->u.p.parm3;
1876 T parm4 = pr->u.p.parm4;
1877 KD_TRACE(100,
1878 ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_trapezoidal case\n",
1879 gtid));
1880
1881 index = test_then_inc<ST>((volatile ST *)&sh->u.s.iteration);
1882
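    // Trapezoid self-scheduling: parm2 is the size of the first chunk, parm4
    // the per-chunk decrement and parm3 the number of chunks, so chunk k spans
    // (parm2 - k*parm4) iterations. init below is the partial sum of the first
    // `index` chunk sizes (an arithmetic series), i.e. the first iteration of
    // chunk `index`.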
1883 init = (index * ((2 * parm2) - (index - 1) * parm4)) / 2;
1884 trip = pr->u.p.tc - 1;
1885
1886 if ((status = ((T)index < parm3 && init <= trip)) == 0) {
1887 *p_lb = 0;
1888 *p_ub = 0;
1889 if (p_st != NULL)
1890 *p_st = 0;
1891 } else {
1892 start = pr->u.p.lb;
1893 limit = ((index + 1) * (2 * parm2 - index * parm4)) / 2 - 1;
1894 incr = pr->u.p.st;
1895
1896 if ((last = (limit >= trip)) != 0)
1897 limit = trip;
1898
1899 if (p_st != NULL)
1900 *p_st = incr;
1901
1902 if (incr == 1) {
1903 *p_lb = start + init;
1904 *p_ub = start + limit;
1905 } else {
1906 *p_lb = start + init * incr;
1907 *p_ub = start + limit * incr;
1908 }
1909
1910 if (pr->flags.ordered) {
1911 pr->u.p.ordered_lower = init;
1912 pr->u.p.ordered_upper = limit;
1913 } // if
1914 } // if
1915 } // case
1916 break;
1917 default: {
1918 status = 0; // to avoid complaints about uninitialized variable use
1919 __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message
1920 KMP_HNT(GetNewerLibrary), // Hint
1921 __kmp_msg_null // Variadic argument list terminator
1922 );
1923 } break;
1924 } // switch
1925 if (p_last)
1926 *p_last = last;
1927#ifdef KMP_DEBUG
1928 if (pr->flags.ordered) {
1929 char *buff;
1930 // create format specifiers before the debug output
1931 buff = __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d "
1932 "ordered_lower:%%%s ordered_upper:%%%s\n",
1933 traits_t<UT>::spec, traits_t<UT>::spec);
1934 KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper));
1935 __kmp_str_free(&buff);
1936 }
1937 {
1938 char *buff;
1939 // create format specifiers before the debug output
1940 buff = __kmp_str_format(
1941 "__kmp_dispatch_next_algorithm: T#%%d exit status:%%d p_last:%%d "
1942 "p_lb:%%%s p_ub:%%%s p_st:%%%s\n",
1943 traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
1944 KMP_DEBUG_ASSERT(p_last);
1945 KMP_DEBUG_ASSERT(p_st);
1946 KD_TRACE(10, (buff, gtid, status, *p_last, *p_lb, *p_ub, *p_st));
1947 __kmp_str_free(&buff);
1948 }
1949#endif
1950 return status;
1951}
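The cases above differ mainly in how a chunk is claimed from the shared counter sh->u.s.iteration. Below is a condensed, illustrative sketch of the two main schemes, dynamic (fixed-size chunks) and guided (geometrically shrinking chunks), with the dispatch bookkeeping stripped away; sh_iteration stands in for &sh->u.s.iteration, the kmp_int64 specialization is assumed, and the atomic helpers are the same templates used above.

// Dynamic: claim the next fixed-size chunk via fetch-and-increment on a
// shared chunk counter, then map it to inclusive loop bounds.
static bool sketch_next_dynamic_chunk(volatile kmp_int64 *sh_iteration,
                                      kmp_int64 tc, kmp_int64 lb, kmp_int64 st,
                                      kmp_int64 chunk, kmp_int64 *p_lb,
                                      kmp_int64 *p_ub) {
  kmp_int64 nchunks = (tc + chunk - 1) / chunk; // number of chunks (the
                                                // runtime caches this as parm2)
  kmp_int64 chunk_number = test_then_inc_acq<kmp_int64>(sh_iteration);
  if (chunk_number >= nchunks)
    return false; // every chunk has already been claimed
  kmp_int64 init = chunk_number * chunk; // first iteration of this chunk
  kmp_int64 limit = init + chunk - 1;    // last iteration of this chunk
  if (limit > tc - 1)
    limit = tc - 1; // clip the final, possibly partial, chunk
  *p_lb = lb + init * st; // map iteration space to loop bounds
  *p_ub = lb + limit * st;
  return true;
}

// Guided: each claim takes roughly remaining/(K*nproc) iterations (K = 2 by
// default) and publishes the new position with a CAS, falling back to
// dynamic-style chunks of size `chunk` for the tail of the loop.
static bool sketch_next_guided_chunk(volatile kmp_int64 *sh_iteration,
                                     kmp_int64 tc, kmp_int64 nproc,
                                     kmp_int64 chunk, kmp_int64 *p_init,
                                     kmp_int64 *p_limit) {
  while (1) {
    kmp_int64 init = *sh_iteration;
    kmp_int64 remaining = tc - init;
    if (remaining <= 0)
      return false; // the loop is fully consumed
    if (remaining < 2 * nproc * (chunk + 1)) { // few iterations left
      init = test_then_add<kmp_int64>(sh_iteration, chunk);
      remaining = tc - init;
      if (remaining <= 0)
        return false; // another thread took the rest
      *p_init = init;
      *p_limit = init + (remaining > chunk ? chunk : remaining) - 1;
      return true;
    }
    kmp_int64 limit = init + remaining / (2 * nproc); // guided step
    if (compare_and_swap<kmp_int64>(sh_iteration, init, limit)) {
      *p_init = init; // chunk is [init, limit - 1] in iteration space
      *p_limit = limit - 1;
      return true;
    } // CAS lost the race; re-read and retry
  }
}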
1952
1953/* Define a macro for exiting __kmp_dispatch_next(). If status is 0 (no more
1954 work), then tell OMPT the loop is over. In some cases kmp_dispatch_fini()
1955 is not called. */
1956#if OMPT_SUPPORT && OMPT_OPTIONAL
1957#define OMPT_LOOP_END \
1958 if (status == 0) { \
1959 if (ompt_enabled.ompt_callback_work) { \
1960 ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL); \
1961 ompt_task_info_t *task_info = __ompt_get_task_info_object(0); \
1962 ompt_callbacks.ompt_callback(ompt_callback_work)( \
1963 ompt_work_loop, ompt_scope_end, &(team_info->parallel_data), \
1964 &(task_info->task_data), 0, codeptr); \
1965 } \
1966 }
1967// TODO: implement count
1968#else
1969#define OMPT_LOOP_END // no-op
1970#endif
1971
1972#if KMP_STATS_ENABLED
1973#define KMP_STATS_LOOP_END \
1974 { \
1975 kmp_int64 u, l, t, i; \
1976 l = (kmp_int64)(*p_lb); \
1977 u = (kmp_int64)(*p_ub); \
1978 i = (kmp_int64)(pr->u.p.st); \
1979 if (status == 0) { \
1980 t = 0; \
1981 KMP_POP_PARTITIONED_TIMER(); \
1982 } else if (i == 1) { \
1983 if (u >= l) \
1984 t = u - l + 1; \
1985 else \
1986 t = 0; \
1987 } else if (i < 0) { \
1988 if (l >= u) \
1989 t = (l - u) / (-i) + 1; \
1990 else \
1991 t = 0; \
1992 } else { \
1993 if (u >= l) \
1994 t = (u - l) / i + 1; \
1995 else \
1996 t = 0; \
1997 } \
1998 KMP_COUNT_VALUE(OMP_loop_dynamic_iterations, t); \
1999 }
2000#else
2001#define KMP_STATS_LOOP_END /* Nothing */
2002#endif
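The arithmetic inside KMP_STATS_LOOP_END is the usual trip-count formula for an inclusive range [l, u] with a signed stride i. As a plain helper it reads as follows (a sketch for illustration only; the macro additionally handles the status == 0 case and pops the partitioned timer):

static kmp_int64 sketch_trip_count(kmp_int64 l, kmp_int64 u, kmp_int64 i) {
  if (i == 1)
    return u >= l ? u - l + 1 : 0; // unit stride
  if (i < 0)
    return l >= u ? (l - u) / (-i) + 1 : 0; // counting down
  return u >= l ? (u - l) / i + 1 : 0; // counting up with stride i
}
// For example, sketch_trip_count(0, 9, 3) == 4 (iterations 0, 3, 6 and 9).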
2003
2004template <typename T>
2005static int __kmp_dispatch_next(ident_t *loc, int gtid, kmp_int32 *p_last,
2006 T *p_lb, T *p_ub,
2007 typename traits_t<T>::signed_t *p_st
2008#if OMPT_SUPPORT && OMPT_OPTIONAL
2009 ,
2010 void *codeptr
2011#endif
2012) {
2013
2014 typedef typename traits_t<T>::unsigned_t UT;
2015 typedef typename traits_t<T>::signed_t ST;
2016 // This is potentially slightly misleading: schedule(runtime) will appear here
2017 // even if the actual runtime schedule is static. (Which points out a
2018 // disadvantage of schedule(runtime): even when static scheduling is used, it
2019 // costs more than a compile-time choice to use static scheduling would.)
2020 KMP_TIME_PARTITIONED_BLOCK(OMP_loop_dynamic_scheduling);
2021
2022 int status;
2023 dispatch_private_info_template<T> *pr;
2024 __kmp_assert_valid_gtid(gtid);
2025 kmp_info_t *th = __kmp_threads[gtid];
2026 kmp_team_t *team = th->th.th_team;
2027
2028 KMP_DEBUG_ASSERT(p_lb && p_ub && p_st); // AC: these cannot be NULL
2029 KD_TRACE(
2030 1000,
2031 ("__kmp_dispatch_next: T#%d called p_lb:%p p_ub:%p p_st:%p p_last: %p\n",
2032 gtid, p_lb, p_ub, p_st, p_last));
2033
2034 if (team->t.t_serialized) {
2035 /* NOTE: serialize this dispatch because we are not at the active level */
2036 pr = reinterpret_cast<dispatch_private_info_template<T> *>(
2037 th->th.th_dispatch->th_disp_buffer); /* top of the stack */
2038 KMP_DEBUG_ASSERT(pr);
2039
2040 if ((status = (pr->u.p.tc != 0)) == 0) {
2041 *p_lb = 0;
2042 *p_ub = 0;
2043 // if ( p_last != NULL )
2044 // *p_last = 0;
2045 if (p_st != NULL)
2046 *p_st = 0;
2047 if (__kmp_env_consistency_check) {
2048 if (pr->pushed_ws != ct_none) {
2049 pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
2050 }
2051 }
2052 } else if (pr->flags.nomerge) {
2053 kmp_int32 last;
2054 T start;
2055 UT limit, trip, init;
2056 ST incr;
2057 T chunk = pr->u.p.parm1;
2058
2059 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n",
2060 gtid));
2061
2062 init = chunk * pr->u.p.count++;
2063 trip = pr->u.p.tc - 1;
2064
2065 if ((status = (init <= trip)) == 0) {
2066 *p_lb = 0;
2067 *p_ub = 0;
2068 // if ( p_last != NULL )
2069 // *p_last = 0;
2070 if (p_st != NULL)
2071 *p_st = 0;
2072 if (__kmp_env_consistency_check) {
2073 if (pr->pushed_ws != ct_none) {
2074 pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
2075 }
2076 }
2077 } else {
2078 start = pr->u.p.lb;
2079 limit = chunk + init - 1;
2080 incr = pr->u.p.st;
2081
2082 if ((last = (limit >= trip)) != 0) {
2083 limit = trip;
2084#if KMP_OS_WINDOWS
2085 pr->u.p.last_upper = pr->u.p.ub;
2086#endif /* KMP_OS_WINDOWS */
2087 }
2088 if (p_last != NULL)
2089 *p_last = last;
2090 if (p_st != NULL)
2091 *p_st = incr;
2092 if (incr == 1) {
2093 *p_lb = start + init;
2094 *p_ub = start + limit;
2095 } else {
2096 *p_lb = start + init * incr;
2097 *p_ub = start + limit * incr;
2098 }
2099
2100 if (pr->flags.ordered) {
2101 pr->u.p.ordered_lower = init;
2102 pr->u.p.ordered_upper = limit;
2103#ifdef KMP_DEBUG
2104 {
2105 char *buff;
2106 // create format specifiers before the debug output
2107 buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
2108 "ordered_lower:%%%s ordered_upper:%%%s\n",
2109 traits_t<UT>::spec, traits_t<UT>::spec);
2110 KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
2111 pr->u.p.ordered_upper));
2112 __kmp_str_free(&buff);
2113 }
2114#endif
2115 } // if
2116 } // if
2117 } else {
2118 pr->u.p.tc = 0;
2119 *p_lb = pr->u.p.lb;
2120 *p_ub = pr->u.p.ub;
2121#if KMP_OS_WINDOWS
2122 pr->u.p.last_upper = *p_ub;
2123#endif /* KMP_OS_WINDOWS */
2124 if (p_last != NULL)
2125 *p_last = TRUE;
2126 if (p_st != NULL)
2127 *p_st = pr->u.p.st;
2128 } // if
2129#ifdef KMP_DEBUG
2130 {
2131 char *buff;
2132 // create format specifiers before the debug output
2133 buff = __kmp_str_format(
2134 "__kmp_dispatch_next: T#%%d serialized case: p_lb:%%%s "
2135 "p_ub:%%%s p_st:%%%s p_last:%%p %%d returning:%%d\n",
2136 traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
2137 KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, *p_st, p_last,
2138 (p_last ? *p_last : 0), status));
2139 __kmp_str_free(&buff);
2140 }
2141#endif
2142#if INCLUDE_SSC_MARKS
2143 SSC_MARK_DISPATCH_NEXT();
2144#endif
2145 OMPT_LOOP_END;
2146 KMP_STATS_LOOP_END;
2147 return status;
2148 } else {
2149 kmp_int32 last = 0;
2150 dispatch_shared_info_template<T> volatile *sh;
2151
2152 KMP_DEBUG_ASSERT(th->th.th_dispatch ==
2153 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
2154
2155 pr = reinterpret_cast<dispatch_private_info_template<T> *>(
2156 th->th.th_dispatch->th_dispatch_pr_current);
2157 KMP_DEBUG_ASSERT(pr);
2158 sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>(
2159 th->th.th_dispatch->th_dispatch_sh_current);
2160 KMP_DEBUG_ASSERT(sh);
2161
2162#if KMP_USE_HIER_SCHED
2163 if (pr->flags.use_hier)
2164 status = sh->hier->next(loc, gtid, pr, &last, p_lb, p_ub, p_st);
2165 else
2166#endif // KMP_USE_HIER_SCHED
2167 status = __kmp_dispatch_next_algorithm<T>(gtid, pr, sh, &last, p_lb, p_ub,
2168 p_st, th->th.th_team_nproc,
2169 th->th.th_info.ds.ds_tid);
2170 // status == 0: no more iterations to execute
2171 if (status == 0) {
2172 ST num_done;
2173 num_done = test_then_inc<ST>(&sh->u.s.num_done);
2174#ifdef KMP_DEBUG
2175 {
2176 char *buff;
2177 // create format specifiers before the debug output
2178 buff = __kmp_str_format(
2179 "__kmp_dispatch_next: T#%%d increment num_done:%%%s\n",
2180 traits_t<ST>::spec);
2181 KD_TRACE(10, (buff, gtid, sh->u.s.num_done));
2182 __kmp_str_free(&buff);
2183 }
2184#endif
2185
2186#if KMP_USE_HIER_SCHED
2187 pr->flags.use_hier = FALSE;
2188#endif
2189 if (num_done == th->th.th_team_nproc - 1) {
2190#if KMP_STATIC_STEAL_ENABLED
2191 if (pr->schedule == kmp_sch_static_steal) {
2192 int i;
2193 int idx = (th->th.th_dispatch->th_disp_index - 1) %
2194 __kmp_dispatch_num_buffers; // current loop index
2195 // loop complete, safe to destroy locks used for stealing
2196 for (i = 0; i < th->th.th_team_nproc; ++i) {
2197 dispatch_private_info_template<T> *buf =
2198 reinterpret_cast<dispatch_private_info_template<T> *>(
2199 &team->t.t_dispatch[i].th_disp_buffer[idx]);
2200 KMP_ASSERT(buf->steal_flag == THIEF); // buffer must be inactive
2201 KMP_ATOMIC_ST_RLX(&buf->steal_flag, UNUSED);
2202 if (traits_t<T>::type_size > 4) {
2203 // destroy locks used for stealing
2204 kmp_lock_t *lck = buf->u.p.steal_lock;
2205 KMP_ASSERT(lck != NULL);
2206 __kmp_destroy_lock(lck);
2207 __kmp_free(lck);
2208 buf->u.p.steal_lock = NULL;
2209 }
2210 }
2211 }
2212#endif
2213 /* NOTE: release shared buffer to be reused */
2214
2215 KMP_MB(); /* Flush all pending memory write invalidates. */
2216
2217 sh->u.s.num_done = 0;
2218 sh->u.s.iteration = 0;
2219
2220 /* TODO replace with general release procedure? */
2221 if (pr->flags.ordered) {
2222 sh->u.s.ordered_iteration = 0;
2223 }
2224
2225 sh->buffer_index += __kmp_dispatch_num_buffers;
2226 KD_TRACE(100, ("__kmp_dispatch_next: T#%d change buffer_index:%d\n",
2227 gtid, sh->buffer_index));
2228
2229 KMP_MB(); /* Flush all pending memory write invalidates. */
2230
2231 } // if
2232 if (__kmp_env_consistency_check) {
2233 if (pr->pushed_ws != ct_none) {
2234 pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
2235 }
2236 }
2237
2238 th->th.th_dispatch->th_deo_fcn = NULL;
2239 th->th.th_dispatch->th_dxo_fcn = NULL;
2240 th->th.th_dispatch->th_dispatch_sh_current = NULL;
2241 th->th.th_dispatch->th_dispatch_pr_current = NULL;
2242 } // if (status == 0)
2243#if KMP_OS_WINDOWS
2244 else if (last) {
2245 pr->u.p.last_upper = pr->u.p.ub;
2246 }
2247#endif /* KMP_OS_WINDOWS */
2248 if (p_last != NULL && status != 0)
2249 *p_last = last;
2250 } // if
2251
2252#ifdef KMP_DEBUG
2253 {
2254 char *buff;
2255 // create format specifiers before the debug output
2256 buff = __kmp_str_format(
2257 "__kmp_dispatch_next: T#%%d normal case: "
2258 "p_lb:%%%s p_ub:%%%s p_st:%%%s p_last:%%p (%%d) returning:%%d\n",
2259 traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
2260 KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last,
2261 (p_last ? *p_last : 0), status));
2262 __kmp_str_free(&buff);
2263 }
2264#endif
2265#if INCLUDE_SSC_MARKS
2266 SSC_MARK_DISPATCH_NEXT();
2267#endif
2268 OMPT_LOOP_END;
2269 KMP_STATS_LOOP_END;
2270 return status;
2271}
2272
2273template <typename T>
2274static void __kmp_dist_get_bounds(ident_t *loc, kmp_int32 gtid,
2275 kmp_int32 *plastiter, T *plower, T *pupper,
2276 typename traits_t<T>::signed_t incr) {
2277 typedef typename traits_t<T>::unsigned_t UT;
2278 kmp_uint32 team_id;
2279 kmp_uint32 nteams;
2280 UT trip_count;
2281 kmp_team_t *team;
2282 kmp_info_t *th;
2283
2284 KMP_DEBUG_ASSERT(plastiter && plower && pupper);
2285 KE_TRACE(10, ("__kmpc_dist_get_bounds called (%d)\n", gtid));
2286#ifdef KMP_DEBUG
2287 typedef typename traits_t<T>::signed_t ST;
2288 {
2289 char *buff;
2290 // create format specifiers before the debug output
2291 buff = __kmp_str_format("__kmpc_dist_get_bounds: T#%%d liter=%%d "
2292 "iter=(%%%s, %%%s, %%%s) signed?<%s>\n",
2293 traits_t<T>::spec, traits_t<T>::spec,
2294 traits_t<ST>::spec, traits_t<T>::spec);
2295 KD_TRACE(100, (buff, gtid, *plastiter, *plower, *pupper, incr));
2296 __kmp_str_free(&buff);
2297 }
2298#endif
2299
2300 if (__kmp_env_consistency_check) {
2301 if (incr == 0) {
2302 __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo,
2303 loc);
2304 }
2305 if (incr > 0 ? (*pupper < *plower) : (*plower < *pupper)) {
2306 // The loop is illegal.
2307 // Some zero-trip loops are kept by the compiler, e.g.:
2308 // for(i=10;i<0;++i) // lower >= upper - run-time check
2309 // for(i=0;i>10;--i) // lower <= upper - run-time check
2310 // for(i=0;i>10;++i) // incr > 0 - compile-time check
2311 // for(i=10;i<0;--i) // incr < 0 - compile-time check
2312 // Compiler does not check the following illegal loops:
2313 // for(i=0;i<10;i+=incr) // where incr<0
2314 // for(i=10;i>0;i-=incr) // where incr<0
2315 __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrIllegal, ct_pdo, loc);
2316 }
2317 }
2318 __kmp_assert_valid_gtid(gtid);
2319 th = __kmp_threads[gtid];
2320 team = th->th.th_team;
2321 KMP_DEBUG_ASSERT(th->th.th_teams_microtask); // we are in the teams construct
2322 nteams = th->th.th_teams_size.nteams;
2323 team_id = team->t.t_master_tid;
2324 KMP_DEBUG_ASSERT(nteams == (kmp_uint32)team->t.t_parent->t.t_nproc);
2325
2326 // compute global trip count
2327 if (incr == 1) {
2328 trip_count = *pupper - *plower + 1;
2329 } else if (incr == -1) {
2330 trip_count = *plower - *pupper + 1;
2331 } else if (incr > 0) {
2332 // upper-lower can exceed the limit of signed type
2333 trip_count = (UT)(*pupper - *plower) / incr + 1;
2334 } else {
2335 trip_count = (UT)(*plower - *pupper) / (-incr) + 1;
2336 }
2337
2338 if (trip_count <= nteams) {
2339 KMP_DEBUG_ASSERT(
2340 __kmp_static == kmp_sch_static_greedy ||
2341 __kmp_static ==
2342 kmp_sch_static_balanced); // Unknown static scheduling type.
2343 // only some teams get a single iteration, the others get nothing
2344 if (team_id < trip_count) {
2345 *pupper = *plower = *plower + team_id * incr;
2346 } else {
2347 *plower = *pupper + incr; // zero-trip loop
2348 }
2349 if (plastiter != NULL)
2350 *plastiter = (team_id == trip_count - 1);
2351 } else {
2352 if (__kmp_static == kmp_sch_static_balanced) {
2353 UT chunk = trip_count / nteams;
2354 UT extras = trip_count % nteams;
2355 *plower +=
2356 incr * (team_id * chunk + (team_id < extras ? team_id : extras));
2357 *pupper = *plower + chunk * incr - (team_id < extras ? 0 : incr);
2358 if (plastiter != NULL)
2359 *plastiter = (team_id == nteams - 1);
2360 } else {
2361 T chunk_inc_count =
2362 (trip_count / nteams + ((trip_count % nteams) ? 1 : 0)) * incr;
2363 T upper = *pupper;
2364 KMP_DEBUG_ASSERT(__kmp_static == kmp_sch_static_greedy);
2365 // Unknown static scheduling type.
2366 *plower += team_id * chunk_inc_count;
2367 *pupper = *plower + chunk_inc_count - incr;
2368 // Check/correct bounds if needed
2369 if (incr > 0) {
2370 if (*pupper < *plower)
2371 *pupper = traits_t<T>::max_value;
2372 if (plastiter != NULL)
2373 *plastiter = *plower <= upper && *pupper > upper - incr;
2374 if (*pupper > upper)
2375 *pupper = upper; // tracker C73258
2376 } else {
2377 if (*pupper > *plower)
2378 *pupper = traits_t<T>::min_value;
2379 if (plastiter != NULL)
2380 *plastiter = *plower >= upper && *pupper < upper - incr;
2381 if (*pupper < upper)
2382 *pupper = upper; // tracker C73258
2383 }
2384 }
2385 }
2386}
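For the common case of a unit stride and more iterations than teams, the kmp_sch_static_balanced branch above reduces to the following bounds computation (a sketch under those assumptions only; the real code also handles arbitrary strides, overflow, and the plastiter flag):

static void sketch_balanced_team_bounds(kmp_uint32 team_id, kmp_uint32 nteams,
                                        kmp_int64 *plower, kmp_int64 *pupper) {
  kmp_uint64 trip_count = (kmp_uint64)(*pupper - *plower) + 1;
  kmp_uint64 chunk = trip_count / nteams;  // base iterations per team
  kmp_uint64 extras = trip_count % nteams; // first `extras` teams get one more
  *plower +=
      (kmp_int64)(team_id * chunk + (team_id < extras ? team_id : extras));
  *pupper = *plower + (kmp_int64)chunk - (team_id < extras ? 0 : 1);
}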
2387
2388//-----------------------------------------------------------------------------
2389// Dispatch routines
2390// Transfer call to template< type T >
2391// __kmp_dispatch_init( ident_t *loc, int gtid, enum sched_type schedule,
2392// T lb, T ub, ST st, ST chunk )
2393extern "C" {
2394
2411void __kmpc_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
2412 enum sched_type schedule, kmp_int32 lb,
2413 kmp_int32 ub, kmp_int32 st, kmp_int32 chunk) {
2414 KMP_DEBUG_ASSERT(__kmp_init_serial);
2415#if OMPT_SUPPORT && OMPT_OPTIONAL
2416 OMPT_STORE_RETURN_ADDRESS(gtid);
2417#endif
2418 __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2419}
2423void __kmpc_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
2424 enum sched_type schedule, kmp_uint32 lb,
2425 kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk) {
2426 KMP_DEBUG_ASSERT(__kmp_init_serial);
2427#if OMPT_SUPPORT && OMPT_OPTIONAL
2428 OMPT_STORE_RETURN_ADDRESS(gtid);
2429#endif
2430 __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2431}
2432
2436void __kmpc_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
2437 enum sched_type schedule, kmp_int64 lb,
2438 kmp_int64 ub, kmp_int64 st, kmp_int64 chunk) {
2439 KMP_DEBUG_ASSERT(__kmp_init_serial);
2440#if OMPT_SUPPORT && OMPT_OPTIONAL
2441 OMPT_STORE_RETURN_ADDRESS(gtid);
2442#endif
2443 __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2444}
2445
2449void __kmpc_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
2450 enum sched_type schedule, kmp_uint64 lb,
2451 kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk) {
2452 KMP_DEBUG_ASSERT(__kmp_init_serial);
2453#if OMPT_SUPPORT && OMPT_OPTIONAL
2454 OMPT_STORE_RETURN_ADDRESS(gtid);
2455#endif
2456 __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2457}
2458
2468void __kmpc_dist_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
2469 enum sched_type schedule, kmp_int32 *p_last,
2470 kmp_int32 lb, kmp_int32 ub, kmp_int32 st,
2471 kmp_int32 chunk) {
2472 KMP_DEBUG_ASSERT(__kmp_init_serial);
2473#if OMPT_SUPPORT && OMPT_OPTIONAL
2474 OMPT_STORE_RETURN_ADDRESS(gtid);
2475#endif
2476 __kmp_dist_get_bounds<kmp_int32>(loc, gtid, p_last, &lb, &ub, st);
2477 __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2478}
2479
2480void __kmpc_dist_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
2481 enum sched_type schedule, kmp_int32 *p_last,
2482 kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st,
2483 kmp_int32 chunk) {
2484 KMP_DEBUG_ASSERT(__kmp_init_serial);
2485#if OMPT_SUPPORT && OMPT_OPTIONAL
2486 OMPT_STORE_RETURN_ADDRESS(gtid);
2487#endif
2488 __kmp_dist_get_bounds<kmp_uint32>(loc, gtid, p_last, &lb, &ub, st);
2489 __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2490}
2491
2492void __kmpc_dist_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
2493 enum sched_type schedule, kmp_int32 *p_last,
2494 kmp_int64 lb, kmp_int64 ub, kmp_int64 st,
2495 kmp_int64 chunk) {
2496 KMP_DEBUG_ASSERT(__kmp_init_serial);
2497#if OMPT_SUPPORT && OMPT_OPTIONAL
2498 OMPT_STORE_RETURN_ADDRESS(gtid);
2499#endif
2500 __kmp_dist_get_bounds<kmp_int64>(loc, gtid, p_last, &lb, &ub, st);
2501 __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2502}
2503
2504void __kmpc_dist_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
2505 enum sched_type schedule, kmp_int32 *p_last,
2506 kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st,
2507 kmp_int64 chunk) {
2508 KMP_DEBUG_ASSERT(__kmp_init_serial);
2509#if OMPT_SUPPORT && OMPT_OPTIONAL
2510 OMPT_STORE_RETURN_ADDRESS(gtid);
2511#endif
2512 __kmp_dist_get_bounds<kmp_uint64>(loc, gtid, p_last, &lb, &ub, st);
2513 __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2514}
2515
2529int __kmpc_dispatch_next_4(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2530 kmp_int32 *p_lb, kmp_int32 *p_ub, kmp_int32 *p_st) {
2531#if OMPT_SUPPORT && OMPT_OPTIONAL
2532 OMPT_STORE_RETURN_ADDRESS(gtid);
2533#endif
2534 return __kmp_dispatch_next<kmp_int32>(loc, gtid, p_last, p_lb, p_ub, p_st
2535#if OMPT_SUPPORT && OMPT_OPTIONAL
2536 ,
2537 OMPT_LOAD_RETURN_ADDRESS(gtid)
2538#endif
2539 );
2540}
2541
2545int __kmpc_dispatch_next_4u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2546 kmp_uint32 *p_lb, kmp_uint32 *p_ub,
2547 kmp_int32 *p_st) {
2548#if OMPT_SUPPORT && OMPT_OPTIONAL
2549 OMPT_STORE_RETURN_ADDRESS(gtid);
2550#endif
2551 return __kmp_dispatch_next<kmp_uint32>(loc, gtid, p_last, p_lb, p_ub, p_st
2552#if OMPT_SUPPORT && OMPT_OPTIONAL
2553 ,
2554 OMPT_LOAD_RETURN_ADDRESS(gtid)
2555#endif
2556 );
2557}
2558
2562int __kmpc_dispatch_next_8(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2563 kmp_int64 *p_lb, kmp_int64 *p_ub, kmp_int64 *p_st) {
2564#if OMPT_SUPPORT && OMPT_OPTIONAL
2565 OMPT_STORE_RETURN_ADDRESS(gtid);
2566#endif
2567 return __kmp_dispatch_next<kmp_int64>(loc, gtid, p_last, p_lb, p_ub, p_st
2568#if OMPT_SUPPORT && OMPT_OPTIONAL
2569 ,
2570 OMPT_LOAD_RETURN_ADDRESS(gtid)
2571#endif
2572 );
2573}
2574
2578int __kmpc_dispatch_next_8u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2579 kmp_uint64 *p_lb, kmp_uint64 *p_ub,
2580 kmp_int64 *p_st) {
2581#if OMPT_SUPPORT && OMPT_OPTIONAL
2582 OMPT_STORE_RETURN_ADDRESS(gtid);
2583#endif
2584 return __kmp_dispatch_next<kmp_uint64>(loc, gtid, p_last, p_lb, p_ub, p_st
2585#if OMPT_SUPPORT && OMPT_OPTIONAL
2586 ,
2587 OMPT_LOAD_RETURN_ADDRESS(gtid)
2588#endif
2589 );
2590}
2591
2598void __kmpc_dispatch_fini_4(ident_t *loc, kmp_int32 gtid) {
2599 __kmp_dispatch_finish<kmp_uint32>(gtid, loc);
2600}
2601
2605void __kmpc_dispatch_fini_8(ident_t *loc, kmp_int32 gtid) {
2606 __kmp_dispatch_finish<kmp_uint64>(gtid, loc);
2607}
2608
2612void __kmpc_dispatch_fini_4u(ident_t *loc, kmp_int32 gtid) {
2613 __kmp_dispatch_finish<kmp_uint32>(gtid, loc);
2614}
2615
2619void __kmpc_dispatch_fini_8u(ident_t *loc, kmp_int32 gtid) {
2620 __kmp_dispatch_finish<kmp_uint64>(gtid, loc);
2621}
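Taken together, the entry points above form the init/next protocol that a compiler targets when lowering a dynamically scheduled worksharing loop. A sketch of how "#pragma omp for schedule(dynamic, 4)" over i = 0..N-1 might map onto them (illustrative only; loc is assumed to be a compiler-generated ident_t, and ordered loops additionally involve the __kmpc_dispatch_fini_* entry points):

static void sketch_lowered_dynamic_loop(ident_t *loc, kmp_int32 N) {
  kmp_int32 gtid = __kmpc_global_thread_num(loc);
  kmp_int32 last = 0, p_lb = 0, p_ub = 0, p_st = 0;
  __kmpc_dispatch_init_4(loc, gtid, kmp_sch_dynamic_chunked,
                         /*lb=*/0, /*ub=*/N - 1, /*st=*/1, /*chunk=*/4);
  while (__kmpc_dispatch_next_4(loc, gtid, &last, &p_lb, &p_ub, &p_st)) {
    for (kmp_int32 i = p_lb; i <= p_ub; i += p_st) {
      // ... loop body for iteration i ...
    }
    if (last) {
      // this chunk contained the sequentially last iteration (lastprivate)
    }
  }
  // a return value of 0 from __kmpc_dispatch_next_4 also finalizes the loop
}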
2624//-----------------------------------------------------------------------------
2625// Non-template routines from kmp_dispatch.cpp used in other sources
2626
2627kmp_uint32 __kmp_eq_4(kmp_uint32 value, kmp_uint32 checker) {
2628 return value == checker;
2629}
2630
2631kmp_uint32 __kmp_neq_4(kmp_uint32 value, kmp_uint32 checker) {
2632 return value != checker;
2633}
2634
2635kmp_uint32 __kmp_lt_4(kmp_uint32 value, kmp_uint32 checker) {
2636 return value < checker;
2637}
2638
2639kmp_uint32 __kmp_ge_4(kmp_uint32 value, kmp_uint32 checker) {
2640 return value >= checker;
2641}
2642
2643kmp_uint32 __kmp_le_4(kmp_uint32 value, kmp_uint32 checker) {
2644 return value <= checker;
2645}
2646
2647kmp_uint32
2648__kmp_wait_4(volatile kmp_uint32 *spinner, kmp_uint32 checker,
2649 kmp_uint32 (*pred)(kmp_uint32, kmp_uint32),
2650 void *obj // Higher-level synchronization object, or NULL.
2651) {
2652 // note: we may not belong to a team at this point
2653 volatile kmp_uint32 *spin = spinner;
2654 kmp_uint32 check = checker;
2655 kmp_uint32 spins;
2656 kmp_uint32 (*f)(kmp_uint32, kmp_uint32) = pred;
2657 kmp_uint32 r;
2658 kmp_uint64 time;
2659
2660 KMP_FSYNC_SPIN_INIT(obj, CCAST(kmp_uint32 *, spin));
2661 KMP_INIT_YIELD(spins);
2662 KMP_INIT_BACKOFF(time);
2663 // main wait spin loop
2664 while (!f(r = TCR_4(*spin), check)) {
2665 KMP_FSYNC_SPIN_PREPARE(obj);
2666 /* GEH - remove this since it was accidentally introduced when kmp_wait was
2667 split. It causes problems with infinite recursion because of exit lock */
2668 /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
2669 __kmp_abort_thread(); */
2670 KMP_YIELD_OVERSUB_ELSE_SPIN(spins, time);
2671 }
2672 KMP_FSYNC_SPIN_ACQUIRED(obj);
2673 return r;
2674}
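The comparison helpers above plug into __kmp_wait_4 as predicates; for example, spinning until a shared counter reaches an expected value is simply (a sketch; the NULL obj means there is no higher-level synchronization object to report):

static kmp_uint32 sketch_wait_for_value(volatile kmp_uint32 *counter,
                                        kmp_uint32 expected) {
  // returns the observed value once __kmp_eq_4(value, expected) holds
  return __kmp_wait_4(counter, expected, __kmp_eq_4, NULL);
}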
2675
2676void __kmp_wait_4_ptr(void *spinner, kmp_uint32 checker,
2677 kmp_uint32 (*pred)(void *, kmp_uint32),
2678 void *obj // Higher-level synchronization object, or NULL.
2679) {
2680 // note: we may not belong to a team at this point
2681 void *spin = spinner;
2682 kmp_uint32 check = checker;
2683 kmp_uint32 spins;
2684 kmp_uint32 (*f)(void *, kmp_uint32) = pred;
2685 kmp_uint64 time;
2686
2687 KMP_FSYNC_SPIN_INIT(obj, spin);
2688 KMP_INIT_YIELD(spins);
2689 KMP_INIT_BACKOFF(time);
2690 // main wait spin loop
2691 while (!f(spin, check)) {
2692 KMP_FSYNC_SPIN_PREPARE(obj);
2693 /* if we have waited a bit, or are oversubscribed, yield */
2694 /* pause is in the following code */
2695 KMP_YIELD_OVERSUB_ELSE_SPIN(spins, time);
2696 }
2697 KMP_FSYNC_SPIN_ACQUIRED(obj);
2698}
2699
2700} // extern "C"
2701
2702#ifdef KMP_GOMP_COMPAT
2703
2704void __kmp_aux_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
2705 enum sched_type schedule, kmp_int32 lb,
2706 kmp_int32 ub, kmp_int32 st, kmp_int32 chunk,
2707 int push_ws) {
2708 __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk,
2709 push_ws);
2710}
2711
2712void __kmp_aux_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
2713 enum sched_type schedule, kmp_uint32 lb,
2714 kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk,
2715 int push_ws) {
2716 __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk,
2717 push_ws);
2718}
2719
2720void __kmp_aux_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
2721 enum sched_type schedule, kmp_int64 lb,
2722 kmp_int64 ub, kmp_int64 st, kmp_int64 chunk,
2723 int push_ws) {
2724 __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk,
2725 push_ws);
2726}
2727
2728void __kmp_aux_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
2729 enum sched_type schedule, kmp_uint64 lb,
2730 kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk,
2731 int push_ws) {
2732 __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk,
2733 push_ws);
2734}
2735
2736void __kmp_aux_dispatch_fini_chunk_4(ident_t *loc, kmp_int32 gtid) {
2737 __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc);
2738}
2739
2740void __kmp_aux_dispatch_fini_chunk_8(ident_t *loc, kmp_int32 gtid) {
2741 __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc);
2742}
2743
2744void __kmp_aux_dispatch_fini_chunk_4u(ident_t *loc, kmp_int32 gtid) {
2745 __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc);
2746}
2747
2748void __kmp_aux_dispatch_fini_chunk_8u(ident_t *loc, kmp_int32 gtid) {
2749 __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc);
2750}
2751
2752#endif /* KMP_GOMP_COMPAT */
2753
2754/* ------------------------------------------------------------------------ */