LLVM OpenMP* Runtime Library
kmp_runtime.cpp
1/*
2 * kmp_runtime.cpp -- KPTS runtime support library
3 */
4
5//===----------------------------------------------------------------------===//
6//
7// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8// See https://llvm.org/LICENSE.txt for license information.
9// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10//
11//===----------------------------------------------------------------------===//
12
13#include "kmp.h"
14#include "kmp_affinity.h"
15#include "kmp_atomic.h"
16#include "kmp_environment.h"
17#include "kmp_error.h"
18#include "kmp_i18n.h"
19#include "kmp_io.h"
20#include "kmp_itt.h"
21#include "kmp_settings.h"
22#include "kmp_stats.h"
23#include "kmp_str.h"
24#include "kmp_wait_release.h"
25#include "kmp_wrapper_getpid.h"
26#include "kmp_dispatch.h"
27#if KMP_USE_HIER_SCHED
28#include "kmp_dispatch_hier.h"
29#endif
30
31#if OMPT_SUPPORT
32#include "ompt-specific.h"
33#endif
34#if OMPD_SUPPORT
35#include "ompd-specific.h"
36#endif
37
38#if OMP_PROFILING_SUPPORT
39#include "llvm/Support/TimeProfiler.h"
40static char *ProfileTraceFile = nullptr;
41#endif
42
43/* these are temporary issues to be dealt with */
44#define KMP_USE_PRCTL 0
45
46#if KMP_OS_WINDOWS
47#include <process.h>
48#endif
49
50#if KMP_OS_WINDOWS
51// Windows does not need these include files because it doesn't use shared memory
52#else
53#include <sys/mman.h>
54#include <sys/stat.h>
55#include <fcntl.h>
56#define SHM_SIZE 1024
57#endif
58
59#if defined(KMP_GOMP_COMPAT)
60char const __kmp_version_alt_comp[] =
61 KMP_VERSION_PREFIX "alternative compiler support: yes";
62#endif /* defined(KMP_GOMP_COMPAT) */
63
64char const __kmp_version_omp_api[] =
65 KMP_VERSION_PREFIX "API version: 5.0 (201611)";
66
67#ifdef KMP_DEBUG
68char const __kmp_version_lock[] =
69 KMP_VERSION_PREFIX "lock type: run time selectable";
70#endif /* KMP_DEBUG */
71
72#define KMP_MIN(x, y) ((x) < (y) ? (x) : (y))
73
74/* ------------------------------------------------------------------------ */
75
76#if KMP_USE_MONITOR
77kmp_info_t __kmp_monitor;
78#endif
79
80/* Forward declarations */
81
82void __kmp_cleanup(void);
83
84static void __kmp_initialize_info(kmp_info_t *, kmp_team_t *, int tid,
85 int gtid);
86static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
87 kmp_internal_control_t *new_icvs,
88 ident_t *loc);
89#if KMP_AFFINITY_SUPPORTED
90static void __kmp_partition_places(kmp_team_t *team,
91 int update_master_only = 0);
92#endif
93static void __kmp_do_serial_initialize(void);
94void __kmp_fork_barrier(int gtid, int tid);
95void __kmp_join_barrier(int gtid);
96void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc,
97 kmp_internal_control_t *new_icvs, ident_t *loc);
98
99#ifdef USE_LOAD_BALANCE
100static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc);
101#endif
102
103static int __kmp_expand_threads(int nNeed);
104#if KMP_OS_WINDOWS
105static int __kmp_unregister_root_other_thread(int gtid);
106#endif
107static void __kmp_reap_thread(kmp_info_t *thread, int is_root);
108kmp_info_t *__kmp_thread_pool_insert_pt = NULL;
109
110void __kmp_resize_dist_barrier(kmp_team_t *team, int old_nthreads,
111 int new_nthreads);
112void __kmp_add_threads_to_team(kmp_team_t *team, int new_nthreads);
113
114/* Calculate the identifier of the current thread */
115/* fast (and somewhat portable) way to get unique identifier of executing
116 thread. Returns KMP_GTID_DNE if we haven't been assigned a gtid. */
117int __kmp_get_global_thread_id() {
118 int i;
119 kmp_info_t **other_threads;
120 size_t stack_data;
121 char *stack_addr;
122 size_t stack_size;
123 char *stack_base;
124
125 KA_TRACE(
126 1000,
127 ("*** __kmp_get_global_thread_id: entering, nproc=%d all_nproc=%d\n",
128 __kmp_nth, __kmp_all_nth));
129
130 /* JPH - to handle the case where __kmpc_end(0) is called immediately prior to
131 a parallel region, made it return KMP_GTID_DNE to force serial_initialize
132 by caller. Had to handle KMP_GTID_DNE at all call-sites, or else guarantee
133 __kmp_init_gtid for this to work. */
134
135 if (!TCR_4(__kmp_init_gtid))
136 return KMP_GTID_DNE;
137
138#ifdef KMP_TDATA_GTID
139 if (TCR_4(__kmp_gtid_mode) >= 3) {
140 KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using TDATA\n"));
141 return __kmp_gtid;
142 }
143#endif
144 if (TCR_4(__kmp_gtid_mode) >= 2) {
145 KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using keyed TLS\n"));
146 return __kmp_gtid_get_specific();
147 }
148 KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using internal alg.\n"));
149
150 stack_addr = (char *)&stack_data;
151 other_threads = __kmp_threads;
152
153 /* ATT: The code below is a source of potential bugs due to unsynchronized
154 access to __kmp_threads array. For example:
155 1. Current thread loads other_threads[i] to thr and checks it, it is
156 non-NULL.
157 2. Current thread is suspended by OS.
158 3. Another thread unregisters and finishes (debug versions of free()
159 may fill memory with something like 0xEF).
160 4. Current thread is resumed.
161 5. Current thread reads junk from *thr.
162 TODO: Fix it. --ln */
163
164 for (i = 0; i < __kmp_threads_capacity; i++) {
165
166 kmp_info_t *thr = (kmp_info_t *)TCR_SYNC_PTR(other_threads[i]);
167 if (!thr)
168 continue;
169
170 stack_size = (size_t)TCR_PTR(thr->th.th_info.ds.ds_stacksize);
171 stack_base = (char *)TCR_PTR(thr->th.th_info.ds.ds_stackbase);
172
173 /* stack grows down -- search through all of the active threads */
174
175 if (stack_addr <= stack_base) {
176 size_t stack_diff = stack_base - stack_addr;
177
178 if (stack_diff <= stack_size) {
179 /* The only way we can be closer than the allocated */
180 /* stack size is if we are running on this thread. */
181 KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == i);
182 return i;
183 }
184 }
185 }
186
187 /* get specific to try and determine our gtid */
188 KA_TRACE(1000,
189 ("*** __kmp_get_global_thread_id: internal alg. failed to find "
190 "thread, using TLS\n"));
191 i = __kmp_gtid_get_specific();
192
193 /*fprintf( stderr, "=== %d\n", i ); */ /* GROO */
194
195 /* if we haven't been assigned a gtid, then return that code */
196 if (i < 0)
197 return i;
198
199 /* dynamically updated stack window for uber threads to avoid get_specific
200 call */
201 if (!TCR_4(other_threads[i]->th.th_info.ds.ds_stackgrow)) {
202 KMP_FATAL(StackOverflow, i);
203 }
204
205 stack_base = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
206 if (stack_addr > stack_base) {
207 TCW_PTR(other_threads[i]->th.th_info.ds.ds_stackbase, stack_addr);
208 TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
209 other_threads[i]->th.th_info.ds.ds_stacksize + stack_addr -
210 stack_base);
211 } else {
212 TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
213 stack_base - stack_addr);
214 }
215
216 /* Reprint stack bounds for ubermaster since they have been refined */
217 if (__kmp_storage_map) {
218 char *stack_end = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
219 char *stack_beg = stack_end - other_threads[i]->th.th_info.ds.ds_stacksize;
220 __kmp_print_storage_map_gtid(i, stack_beg, stack_end,
221 other_threads[i]->th.th_info.ds.ds_stacksize,
222 "th_%d stack (refinement)", i);
223 }
224 return i;
225}
226
227int __kmp_get_global_thread_id_reg() {
228 int gtid;
229
230 if (!__kmp_init_serial) {
231 gtid = KMP_GTID_DNE;
232 } else
233#ifdef KMP_TDATA_GTID
234 if (TCR_4(__kmp_gtid_mode) >= 3) {
235 KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using TDATA\n"));
236 gtid = __kmp_gtid;
237 } else
238#endif
239 if (TCR_4(__kmp_gtid_mode) >= 2) {
240 KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using keyed TLS\n"));
241 gtid = __kmp_gtid_get_specific();
242 } else {
243 KA_TRACE(1000,
244 ("*** __kmp_get_global_thread_id_reg: using internal alg.\n"));
245 gtid = __kmp_get_global_thread_id();
246 }
247
248 /* we must be a new uber master sibling thread */
249 if (gtid == KMP_GTID_DNE) {
250 KA_TRACE(10,
251 ("__kmp_get_global_thread_id_reg: Encountered new root thread. "
252 "Registering a new gtid.\n"));
253 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
254 if (!__kmp_init_serial) {
255 __kmp_do_serial_initialize();
256 gtid = __kmp_gtid_get_specific();
257 } else {
258 gtid = __kmp_register_root(FALSE);
259 }
260 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
261 /*__kmp_printf( "+++ %d\n", gtid ); */ /* GROO */
262 }
263
264 KMP_DEBUG_ASSERT(gtid >= 0);
265
266 return gtid;
267}
268
269/* caller must hold forkjoin_lock */
270void __kmp_check_stack_overlap(kmp_info_t *th) {
271 int f;
272 char *stack_beg = NULL;
273 char *stack_end = NULL;
274 int gtid;
275
276 KA_TRACE(10, ("__kmp_check_stack_overlap: called\n"));
277 if (__kmp_storage_map) {
278 stack_end = (char *)th->th.th_info.ds.ds_stackbase;
279 stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
280
281 gtid = __kmp_gtid_from_thread(th);
282
283 if (gtid == KMP_GTID_MONITOR) {
284 __kmp_print_storage_map_gtid(
285 gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
286 "th_%s stack (%s)", "mon",
287 (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
288 } else {
289 __kmp_print_storage_map_gtid(
290 gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
291 "th_%d stack (%s)", gtid,
292 (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
293 }
294 }
295
296 /* No point in checking ubermaster threads since they use refinement and
297 * cannot overlap */
298 gtid = __kmp_gtid_from_thread(th);
299 if (__kmp_env_checks == TRUE && !KMP_UBER_GTID(gtid)) {
300 KA_TRACE(10,
301 ("__kmp_check_stack_overlap: performing extensive checking\n"));
302 if (stack_beg == NULL) {
303 stack_end = (char *)th->th.th_info.ds.ds_stackbase;
304 stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
305 }
306
307 for (f = 0; f < __kmp_threads_capacity; f++) {
308 kmp_info_t *f_th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[f]);
309
310 if (f_th && f_th != th) {
311 char *other_stack_end =
312 (char *)TCR_PTR(f_th->th.th_info.ds.ds_stackbase);
313 char *other_stack_beg =
314 other_stack_end - (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize);
315 if ((stack_beg > other_stack_beg && stack_beg < other_stack_end) ||
316 (stack_end > other_stack_beg && stack_end < other_stack_end)) {
317
318 /* Print the other stack values before the abort */
319 if (__kmp_storage_map)
320 __kmp_print_storage_map_gtid(
321 -1, other_stack_beg, other_stack_end,
322 (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize),
323 "th_%d stack (overlapped)", __kmp_gtid_from_thread(f_th));
324
325 __kmp_fatal(KMP_MSG(StackOverlap), KMP_HNT(ChangeStackLimit),
326 __kmp_msg_null);
327 }
328 }
329 }
330 }
331 KA_TRACE(10, ("__kmp_check_stack_overlap: returning\n"));
332}
333
334/* ------------------------------------------------------------------------ */
335
336void __kmp_infinite_loop(void) {
337 static int done = FALSE;
338
339 while (!done) {
340 KMP_YIELD(TRUE);
341 }
342}
343
344#define MAX_MESSAGE 512
345
346void __kmp_print_storage_map_gtid(int gtid, void *p1, void *p2, size_t size,
347 char const *format, ...) {
348 char buffer[MAX_MESSAGE];
349 va_list ap;
350
351 va_start(ap, format);
352 KMP_SNPRINTF(buffer, sizeof(buffer), "OMP storage map: %p %p%8lu %s\n", p1,
353 p2, (unsigned long)size, format);
354 __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
355 __kmp_vprintf(kmp_err, buffer, ap);
356#if KMP_PRINT_DATA_PLACEMENT
357 int node;
358 if (gtid >= 0) {
359 if (p1 <= p2 && (char *)p2 - (char *)p1 == size) {
360 if (__kmp_storage_map_verbose) {
361 node = __kmp_get_host_node(p1);
362 if (node < 0) /* doesn't work, so don't try this next time */
363 __kmp_storage_map_verbose = FALSE;
364 else {
365 char *last;
366 int lastNode;
367 int localProc = __kmp_get_cpu_from_gtid(gtid);
368
369 const int page_size = KMP_GET_PAGE_SIZE();
370
371 p1 = (void *)((size_t)p1 & ~((size_t)page_size - 1));
372 p2 = (void *)(((size_t)p2 - 1) & ~((size_t)page_size - 1));
373 if (localProc >= 0)
374 __kmp_printf_no_lock(" GTID %d localNode %d\n", gtid,
375 localProc >> 1);
376 else
377 __kmp_printf_no_lock(" GTID %d\n", gtid);
378#if KMP_USE_PRCTL
379 /* The more elaborate format is disabled for now because of the prctl
380 * hanging bug. */
381 do {
382 last = p1;
383 lastNode = node;
384 /* This loop collates adjacent pages with the same host node. */
385 do {
386 (char *)p1 += page_size;
387 } while (p1 <= p2 && (node = __kmp_get_host_node(p1)) == lastNode);
388 __kmp_printf_no_lock(" %p-%p memNode %d\n", last, (char *)p1 - 1,
389 lastNode);
390 } while (p1 <= p2);
391#else
392 __kmp_printf_no_lock(" %p-%p memNode %d\n", p1,
393 (char *)p1 + (page_size - 1),
394 __kmp_get_host_node(p1));
395 if (p1 < p2) {
396 __kmp_printf_no_lock(" %p-%p memNode %d\n", p2,
397 (char *)p2 + (page_size - 1),
398 __kmp_get_host_node(p2));
399 }
400#endif
401 }
402 }
403 } else
404 __kmp_printf_no_lock(" %s\n", KMP_I18N_STR(StorageMapWarning));
405 }
406#endif /* KMP_PRINT_DATA_PLACEMENT */
407 __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
408}
409
410void __kmp_warn(char const *format, ...) {
411 char buffer[MAX_MESSAGE];
412 va_list ap;
413
414 if (__kmp_generate_warnings == kmp_warnings_off) {
415 return;
416 }
417
418 va_start(ap, format);
419
420 KMP_SNPRINTF(buffer, sizeof(buffer), "OMP warning: %s\n", format);
421 __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
422 __kmp_vprintf(kmp_err, buffer, ap);
423 __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
424
425 va_end(ap);
426}
427
428void __kmp_abort_process() {
429 // Later threads may stall here, but that's ok because abort() will kill them.
430 __kmp_acquire_bootstrap_lock(&__kmp_exit_lock);
431
432 if (__kmp_debug_buf) {
433 __kmp_dump_debug_buffer();
434 }
435
436 if (KMP_OS_WINDOWS) {
437 // Let other threads know of abnormal termination and prevent deadlock
438 // if abort happened during library initialization or shutdown
439 __kmp_global.g.g_abort = SIGABRT;
440
441 /* On Windows* OS by default abort() causes pop-up error box, which stalls
442 nightly testing. Unfortunately, we cannot reliably suppress pop-up error
443 boxes. _set_abort_behavior() works well, but this function is not
444 available in VS7 (this is not a problem for the DLL, but it is a problem for the
445 static OpenMP RTL). SetErrorMode (and so, timelimit utility) does not
446 help, at least in some versions of MS C RTL.
447
448 It seems the following sequence is the only way to simulate abort() and
449 avoid pop-up error box. */
450 raise(SIGABRT);
451 _exit(3); // Just in case, if signal ignored, exit anyway.
452 } else {
453 __kmp_unregister_library();
454 abort();
455 }
456
457 __kmp_infinite_loop();
458 __kmp_release_bootstrap_lock(&__kmp_exit_lock);
459
460} // __kmp_abort_process
461
462void __kmp_abort_thread(void) {
463 // TODO: Eliminate g_abort global variable and this function.
464 // In case of abort just call abort(), it will kill all the threads.
465 __kmp_infinite_loop();
466} // __kmp_abort_thread
467
468/* Print out the storage map for the major kmp_info_t thread data structures
469 that are allocated together. */
470
471static void __kmp_print_thread_storage_map(kmp_info_t *thr, int gtid) {
472 __kmp_print_storage_map_gtid(gtid, thr, thr + 1, sizeof(kmp_info_t), "th_%d",
473 gtid);
474
475 __kmp_print_storage_map_gtid(gtid, &thr->th.th_info, &thr->th.th_team,
476 sizeof(kmp_desc_t), "th_%d.th_info", gtid);
477
478 __kmp_print_storage_map_gtid(gtid, &thr->th.th_local, &thr->th.th_pri_head,
479 sizeof(kmp_local_t), "th_%d.th_local", gtid);
480
481 __kmp_print_storage_map_gtid(
482 gtid, &thr->th.th_bar[0], &thr->th.th_bar[bs_last_barrier],
483 sizeof(kmp_balign_t) * bs_last_barrier, "th_%d.th_bar", gtid);
484
485 __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_plain_barrier],
486 &thr->th.th_bar[bs_plain_barrier + 1],
487 sizeof(kmp_balign_t), "th_%d.th_bar[plain]",
488 gtid);
489
490 __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_forkjoin_barrier],
491 &thr->th.th_bar[bs_forkjoin_barrier + 1],
492 sizeof(kmp_balign_t), "th_%d.th_bar[forkjoin]",
493 gtid);
494
495#if KMP_FAST_REDUCTION_BARRIER
496 __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_reduction_barrier],
497 &thr->th.th_bar[bs_reduction_barrier + 1],
498 sizeof(kmp_balign_t), "th_%d.th_bar[reduction]",
499 gtid);
500#endif // KMP_FAST_REDUCTION_BARRIER
501}
502
503/* Print out the storage map for the major kmp_team_t team data structures
504 that are allocated together. */
505
506static void __kmp_print_team_storage_map(const char *header, kmp_team_t *team,
507 int team_id, int num_thr) {
508 int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
509 __kmp_print_storage_map_gtid(-1, team, team + 1, sizeof(kmp_team_t), "%s_%d",
510 header, team_id);
511
512 __kmp_print_storage_map_gtid(-1, &team->t.t_bar[0],
513 &team->t.t_bar[bs_last_barrier],
514 sizeof(kmp_balign_team_t) * bs_last_barrier,
515 "%s_%d.t_bar", header, team_id);
516
517 __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_plain_barrier],
518 &team->t.t_bar[bs_plain_barrier + 1],
519 sizeof(kmp_balign_team_t), "%s_%d.t_bar[plain]",
520 header, team_id);
521
522 __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_forkjoin_barrier],
523 &team->t.t_bar[bs_forkjoin_barrier + 1],
524 sizeof(kmp_balign_team_t),
525 "%s_%d.t_bar[forkjoin]", header, team_id);
526
527#if KMP_FAST_REDUCTION_BARRIER
528 __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_reduction_barrier],
529 &team->t.t_bar[bs_reduction_barrier + 1],
530 sizeof(kmp_balign_team_t),
531 "%s_%d.t_bar[reduction]", header, team_id);
532#endif // KMP_FAST_REDUCTION_BARRIER
533
534 __kmp_print_storage_map_gtid(
535 -1, &team->t.t_dispatch[0], &team->t.t_dispatch[num_thr],
536 sizeof(kmp_disp_t) * num_thr, "%s_%d.t_dispatch", header, team_id);
537
538 __kmp_print_storage_map_gtid(
539 -1, &team->t.t_threads[0], &team->t.t_threads[num_thr],
540 sizeof(kmp_info_t *) * num_thr, "%s_%d.t_threads", header, team_id);
541
542 __kmp_print_storage_map_gtid(-1, &team->t.t_disp_buffer[0],
543 &team->t.t_disp_buffer[num_disp_buff],
544 sizeof(dispatch_shared_info_t) * num_disp_buff,
545 "%s_%d.t_disp_buffer", header, team_id);
546}
547
548static void __kmp_init_allocator() {
549 __kmp_init_memkind();
550 __kmp_init_target_mem();
551}
552static void __kmp_fini_allocator() { __kmp_fini_memkind(); }
553
554/* ------------------------------------------------------------------------ */
555
556#if KMP_DYNAMIC_LIB
557#if KMP_OS_WINDOWS
558
559BOOL WINAPI DllMain(HINSTANCE hInstDLL, DWORD fdwReason, LPVOID lpReserved) {
560 //__kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
561
562 switch (fdwReason) {
563
564 case DLL_PROCESS_ATTACH:
565 KA_TRACE(10, ("DllMain: PROCESS_ATTACH\n"));
566
567 return TRUE;
568
569 case DLL_PROCESS_DETACH:
570 KA_TRACE(10, ("DllMain: PROCESS_DETACH T#%d\n", __kmp_gtid_get_specific()));
571
572 // According to Windows* documentation for DllMain entry point:
573 // for DLL_PROCESS_DETACH, lpReserved is used for telling the difference:
574 // lpReserved == NULL when FreeLibrary() is called,
575 // lpReserved != NULL when the process is terminated.
576 // When FreeLibrary() is called, worker threads remain alive. So the
577 // runtime's state is consistent and executing proper shutdown is OK.
578 // When the process is terminated, worker threads have exited or been
579 // forcefully terminated by the OS and only the shutdown thread remains.
580 // This can leave the runtime in an inconsistent state.
581 // Hence, only attempt proper cleanup when FreeLibrary() is called.
582 // Otherwise, rely on OS to reclaim resources.
583 if (lpReserved == NULL)
584 __kmp_internal_end_library(__kmp_gtid_get_specific());
585
586 return TRUE;
587
588 case DLL_THREAD_ATTACH:
589 KA_TRACE(10, ("DllMain: THREAD_ATTACH\n"));
590
591 /* if we want to register new siblings all the time here call
592 * __kmp_get_gtid(); */
593 return TRUE;
594
595 case DLL_THREAD_DETACH:
596 KA_TRACE(10, ("DllMain: THREAD_DETACH T#%d\n", __kmp_gtid_get_specific()));
597
598 __kmp_internal_end_thread(__kmp_gtid_get_specific());
599 return TRUE;
600 }
601
602 return TRUE;
603}
604
605#endif /* KMP_OS_WINDOWS */
606#endif /* KMP_DYNAMIC_LIB */
607
608/* __kmp_parallel_deo -- Wait until it's our turn. */
609void __kmp_parallel_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
610 int gtid = *gtid_ref;
611#ifdef BUILD_PARALLEL_ORDERED
612 kmp_team_t *team = __kmp_team_from_gtid(gtid);
613#endif /* BUILD_PARALLEL_ORDERED */
614
615 if (__kmp_env_consistency_check) {
616 if (__kmp_threads[gtid]->th.th_root->r.r_active)
617#if KMP_USE_DYNAMIC_LOCK
618 __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL, 0);
619#else
620 __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL);
621#endif
622 }
623#ifdef BUILD_PARALLEL_ORDERED
624 if (!team->t.t_serialized) {
625 KMP_MB();
626 KMP_WAIT(&team->t.t_ordered.dt.t_value, __kmp_tid_from_gtid(gtid), KMP_EQ,
627 NULL);
628 KMP_MB();
629 }
630#endif /* BUILD_PARALLEL_ORDERED */
631}
632
633/* __kmp_parallel_dxo -- Signal the next task. */
634void __kmp_parallel_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
635 int gtid = *gtid_ref;
636#ifdef BUILD_PARALLEL_ORDERED
637 int tid = __kmp_tid_from_gtid(gtid);
638 kmp_team_t *team = __kmp_team_from_gtid(gtid);
639#endif /* BUILD_PARALLEL_ORDERED */
640
641 if (__kmp_env_consistency_check) {
642 if (__kmp_threads[gtid]->th.th_root->r.r_active)
643 __kmp_pop_sync(gtid, ct_ordered_in_parallel, loc_ref);
644 }
645#ifdef BUILD_PARALLEL_ORDERED
646 if (!team->t.t_serialized) {
647 KMP_MB(); /* Flush all pending memory write invalidates. */
648
649 /* use the tid of the next thread in this team */
650 /* TODO replace with general release procedure */
651 team->t.t_ordered.dt.t_value = ((tid + 1) % team->t.t_nproc);
652
653 KMP_MB(); /* Flush all pending memory write invalidates. */
654 }
655#endif /* BUILD_PARALLEL_ORDERED */
656}
657
658/* ------------------------------------------------------------------------ */
659/* The BARRIER for a SINGLE process section is always explicit */
660
661int __kmp_enter_single(int gtid, ident_t *id_ref, int push_ws) {
662 int status;
663 kmp_info_t *th;
664 kmp_team_t *team;
665
666 if (!TCR_4(__kmp_init_parallel))
667 __kmp_parallel_initialize();
668 __kmp_resume_if_soft_paused();
669
670 th = __kmp_threads[gtid];
671 team = th->th.th_team;
672 status = 0;
673
674 th->th.th_ident = id_ref;
675
676 if (team->t.t_serialized) {
677 status = 1;
678 } else {
679 kmp_int32 old_this = th->th.th_local.this_construct;
680
681 ++th->th.th_local.this_construct;
682 /* try to set team count to thread count--success means thread got the
683 single block */
684 /* TODO: Should this be acquire or release? */
685 if (team->t.t_construct == old_this) {
686 status = __kmp_atomic_compare_store_acq(&team->t.t_construct, old_this,
687 th->th.th_local.this_construct);
688 }
689#if USE_ITT_BUILD
690 if (__itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
691 KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
692 team->t.t_active_level == 1) {
693 // Only report metadata by primary thread of active team at level 1
694 __kmp_itt_metadata_single(id_ref);
695 }
696#endif /* USE_ITT_BUILD */
697 }
698
699 if (__kmp_env_consistency_check) {
700 if (status && push_ws) {
701 __kmp_push_workshare(gtid, ct_psingle, id_ref);
702 } else {
703 __kmp_check_workshare(gtid, ct_psingle, id_ref);
704 }
705 }
706#if USE_ITT_BUILD
707 if (status) {
708 __kmp_itt_single_start(gtid);
709 }
710#endif /* USE_ITT_BUILD */
711 return status;
712}
713
714void __kmp_exit_single(int gtid) {
715#if USE_ITT_BUILD
716 __kmp_itt_single_end(gtid);
717#endif /* USE_ITT_BUILD */
718 if (__kmp_env_consistency_check)
719 __kmp_pop_workshare(gtid, ct_psingle, NULL);
720}
721
722/* determine if we can go parallel or must use a serialized parallel region and
723 * how many threads we can use
724 * set_nthreads is the number of threads requested for the team
725 * returns 1 if we should serialize or only use one thread,
726 * otherwise the number of threads to use
727 * The forkjoin lock is held by the caller. */
728static int __kmp_reserve_threads(kmp_root_t *root, kmp_team_t *parent_team,
729 int master_tid, int set_nthreads,
730 int enter_teams) {
731 int capacity;
732 int new_nthreads;
733 KMP_DEBUG_ASSERT(__kmp_init_serial);
734 KMP_DEBUG_ASSERT(root && parent_team);
735 kmp_info_t *this_thr = parent_team->t.t_threads[master_tid];
736
737 // If dyn-var is set, dynamically adjust the number of desired threads,
738 // according to the method specified by dynamic_mode.
739 new_nthreads = set_nthreads;
740 if (!get__dynamic_2(parent_team, master_tid)) {
741 ;
742 }
743#ifdef USE_LOAD_BALANCE
744 else if (__kmp_global.g.g_dynamic_mode == dynamic_load_balance) {
745 new_nthreads = __kmp_load_balance_nproc(root, set_nthreads);
746 if (new_nthreads == 1) {
747 KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
748 "reservation to 1 thread\n",
749 master_tid));
750 return 1;
751 }
752 if (new_nthreads < set_nthreads) {
753 KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
754 "reservation to %d threads\n",
755 master_tid, new_nthreads));
756 }
757 }
758#endif /* USE_LOAD_BALANCE */
759 else if (__kmp_global.g.g_dynamic_mode == dynamic_thread_limit) {
760 new_nthreads = __kmp_avail_proc - __kmp_nth +
761 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
762 if (new_nthreads <= 1) {
763 KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
764 "reservation to 1 thread\n",
765 master_tid));
766 return 1;
767 }
768 if (new_nthreads < set_nthreads) {
769 KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
770 "reservation to %d threads\n",
771 master_tid, new_nthreads));
772 } else {
773 new_nthreads = set_nthreads;
774 }
775 } else if (__kmp_global.g.g_dynamic_mode == dynamic_random) {
776 if (set_nthreads > 2) {
777 new_nthreads = __kmp_get_random(parent_team->t.t_threads[master_tid]);
778 new_nthreads = (new_nthreads % set_nthreads) + 1;
779 if (new_nthreads == 1) {
780 KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
781 "reservation to 1 thread\n",
782 master_tid));
783 return 1;
784 }
785 if (new_nthreads < set_nthreads) {
786 KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
787 "reservation to %d threads\n",
788 master_tid, new_nthreads));
789 }
790 }
791 } else {
792 KMP_ASSERT(0);
793 }
794
795 // Respect KMP_ALL_THREADS/KMP_DEVICE_THREAD_LIMIT.
796 if (__kmp_nth + new_nthreads -
797 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
798 __kmp_max_nth) {
799 int tl_nthreads = __kmp_max_nth - __kmp_nth +
800 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
801 if (tl_nthreads <= 0) {
802 tl_nthreads = 1;
803 }
804
805 // If dyn-var is false, emit a 1-time warning.
806 if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
807 __kmp_reserve_warn = 1;
808 __kmp_msg(kmp_ms_warning,
809 KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
810 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
811 }
812 if (tl_nthreads == 1) {
813 KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT "
814 "reduced reservation to 1 thread\n",
815 master_tid));
816 return 1;
817 }
818 KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT reduced "
819 "reservation to %d threads\n",
820 master_tid, tl_nthreads));
821 new_nthreads = tl_nthreads;
822 }
823
824 // Respect OMP_THREAD_LIMIT
825 int cg_nthreads = this_thr->th.th_cg_roots->cg_nthreads;
826 int max_cg_threads = this_thr->th.th_cg_roots->cg_thread_limit;
827 if (cg_nthreads + new_nthreads -
828 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
829 max_cg_threads) {
830 int tl_nthreads = max_cg_threads - cg_nthreads +
831 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
832 if (tl_nthreads <= 0) {
833 tl_nthreads = 1;
834 }
835
836 // If dyn-var is false, emit a 1-time warning.
837 if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
838 __kmp_reserve_warn = 1;
839 __kmp_msg(kmp_ms_warning,
840 KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
841 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
842 }
843 if (tl_nthreads == 1) {
844 KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT "
845 "reduced reservation to 1 thread\n",
846 master_tid));
847 return 1;
848 }
849 KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT reduced "
850 "reservation to %d threads\n",
851 master_tid, tl_nthreads));
852 new_nthreads = tl_nthreads;
853 }
854
855 // Check if the threads array is large enough, or needs expanding.
856 // See comment in __kmp_register_root() about the adjustment if
857 // __kmp_threads[0] == NULL.
858 capacity = __kmp_threads_capacity;
859 if (TCR_PTR(__kmp_threads[0]) == NULL) {
860 --capacity;
861 }
862 // If it is not for initializing the hidden helper team, we need to take
863 // __kmp_hidden_helper_threads_num out of the capacity because it is included
864 // in __kmp_threads_capacity.
865 if (__kmp_enable_hidden_helper && !TCR_4(__kmp_init_hidden_helper_threads)) {
866 capacity -= __kmp_hidden_helper_threads_num;
867 }
868 if (__kmp_nth + new_nthreads -
869 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
870 capacity) {
871 // Expand the threads array.
872 int slotsRequired = __kmp_nth + new_nthreads -
873 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) -
874 capacity;
875 int slotsAdded = __kmp_expand_threads(slotsRequired);
876 if (slotsAdded < slotsRequired) {
877 // The threads array was not expanded enough.
878 new_nthreads -= (slotsRequired - slotsAdded);
879 KMP_ASSERT(new_nthreads >= 1);
880
881 // If dyn-var is false, emit a 1-time warning.
882 if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
883 __kmp_reserve_warn = 1;
884 if (__kmp_tp_cached) {
885 __kmp_msg(kmp_ms_warning,
886 KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
887 KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
888 KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
889 } else {
890 __kmp_msg(kmp_ms_warning,
891 KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
892 KMP_HNT(SystemLimitOnThreads), __kmp_msg_null);
893 }
894 }
895 }
896 }
897
898#ifdef KMP_DEBUG
899 if (new_nthreads == 1) {
900 KC_TRACE(10,
901 ("__kmp_reserve_threads: T#%d serializing team after reclaiming "
902 "dead roots and rechecking; requested %d threads\n",
903 __kmp_get_gtid(), set_nthreads));
904 } else {
905 KC_TRACE(10, ("__kmp_reserve_threads: T#%d allocating %d threads; requested"
906 " %d threads\n",
907 __kmp_get_gtid(), new_nthreads, set_nthreads));
908 }
909#endif // KMP_DEBUG
910 return new_nthreads;
911}
912
913/* Allocate threads from the thread pool and assign them to the new team. We are
914 assured that there are enough threads available, because we checked on that
915 earlier while holding the forkjoin lock. */
916static void __kmp_fork_team_threads(kmp_root_t *root, kmp_team_t *team,
917 kmp_info_t *master_th, int master_gtid,
918 int fork_teams_workers) {
919 int i;
920 int use_hot_team;
921
922 KA_TRACE(10, ("__kmp_fork_team_threads: new_nprocs = %d\n", team->t.t_nproc));
923 KMP_DEBUG_ASSERT(master_gtid == __kmp_get_gtid());
924 KMP_MB();
925
926 /* first, let's setup the primary thread */
927 master_th->th.th_info.ds.ds_tid = 0;
928 master_th->th.th_team = team;
929 master_th->th.th_team_nproc = team->t.t_nproc;
930 master_th->th.th_team_master = master_th;
931 master_th->th.th_team_serialized = FALSE;
932 master_th->th.th_dispatch = &team->t.t_dispatch[0];
933
934/* make sure we are not the optimized hot team */
935#if KMP_NESTED_HOT_TEAMS
936 use_hot_team = 0;
937 kmp_hot_team_ptr_t *hot_teams = master_th->th.th_hot_teams;
938 if (hot_teams) { // hot teams array is not allocated if
939 // KMP_HOT_TEAMS_MAX_LEVEL=0
940 int level = team->t.t_active_level - 1; // index in array of hot teams
941 if (master_th->th.th_teams_microtask) { // are we inside the teams?
942 if (master_th->th.th_teams_size.nteams > 1) {
943 ++level; // level was not increased in teams construct for
944 // team_of_masters
945 }
946 if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
947 master_th->th.th_teams_level == team->t.t_level) {
948 ++level; // level was not increased in teams construct for
949 // team_of_workers before the parallel
950 } // team->t.t_level will be increased inside parallel
951 }
952 if (level < __kmp_hot_teams_max_level) {
953 if (hot_teams[level].hot_team) {
954 // hot team has already been allocated for given level
955 KMP_DEBUG_ASSERT(hot_teams[level].hot_team == team);
956 use_hot_team = 1; // the team is ready to use
957 } else {
958 use_hot_team = 0; // AC: threads are not allocated yet
959 hot_teams[level].hot_team = team; // remember new hot team
960 hot_teams[level].hot_team_nth = team->t.t_nproc;
961 }
962 } else {
963 use_hot_team = 0;
964 }
965 }
966#else
967 use_hot_team = team == root->r.r_hot_team;
968#endif
969 if (!use_hot_team) {
970
971 /* install the primary thread */
972 team->t.t_threads[0] = master_th;
973 __kmp_initialize_info(master_th, team, 0, master_gtid);
974
975 /* now, install the worker threads */
976 for (i = 1; i < team->t.t_nproc; i++) {
977
978 /* fork or reallocate a new thread and install it in team */
979 kmp_info_t *thr = __kmp_allocate_thread(root, team, i);
980 team->t.t_threads[i] = thr;
981 KMP_DEBUG_ASSERT(thr);
982 KMP_DEBUG_ASSERT(thr->th.th_team == team);
983 /* align team and thread arrived states */
984 KA_TRACE(20, ("__kmp_fork_team_threads: T#%d(%d:%d) init arrived "
985 "T#%d(%d:%d) join =%llu, plain=%llu\n",
986 __kmp_gtid_from_tid(0, team), team->t.t_id, 0,
987 __kmp_gtid_from_tid(i, team), team->t.t_id, i,
988 team->t.t_bar[bs_forkjoin_barrier].b_arrived,
989 team->t.t_bar[bs_plain_barrier].b_arrived));
990 thr->th.th_teams_microtask = master_th->th.th_teams_microtask;
991 thr->th.th_teams_level = master_th->th.th_teams_level;
992 thr->th.th_teams_size = master_th->th.th_teams_size;
993 { // Initialize threads' barrier data.
994 int b;
995 kmp_balign_t *balign = team->t.t_threads[i]->th.th_bar;
996 for (b = 0; b < bs_last_barrier; ++b) {
997 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
998 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
999#if USE_DEBUGGER
1000 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
1001#endif
1002 }
1003 }
1004 }
1005
1006#if KMP_AFFINITY_SUPPORTED
1007 // Do not partition the places list for teams construct workers who
1008 // haven't actually been forked to do real work yet. This partitioning
1009 // will take place in the parallel region nested within the teams construct.
1010 if (!fork_teams_workers) {
1011 __kmp_partition_places(team);
1012 }
1013#endif
1014 }
1015
1016 if (__kmp_display_affinity && team->t.t_display_affinity != 1) {
1017 for (i = 0; i < team->t.t_nproc; i++) {
1018 kmp_info_t *thr = team->t.t_threads[i];
1019 if (thr->th.th_prev_num_threads != team->t.t_nproc ||
1020 thr->th.th_prev_level != team->t.t_level) {
1021 team->t.t_display_affinity = 1;
1022 break;
1023 }
1024 }
1025 }
1026
1027 KMP_MB();
1028}
1029
1030#if KMP_ARCH_X86 || KMP_ARCH_X86_64
1031// Propagate any changes to the floating point control registers out to the team
1032// We try to avoid unnecessary writes to the relevant cache line in the team
1033// structure, so we don't make changes unless they are needed.
1034inline static void propagateFPControl(kmp_team_t *team) {
1035 if (__kmp_inherit_fp_control) {
1036 kmp_int16 x87_fpu_control_word;
1037 kmp_uint32 mxcsr;
1038
1039 // Get primary thread's values of FPU control flags (both X87 and vector)
1040 __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1041 __kmp_store_mxcsr(&mxcsr);
1042 mxcsr &= KMP_X86_MXCSR_MASK;
1043
1044 // There is no point looking at t_fp_control_saved here.
1045 // If it is TRUE, we still have to update the values if they are different
1046 // from those we now have. If it is FALSE we didn't save anything yet, but
1047 // our objective is the same. We have to ensure that the values in the team
1048 // are the same as those we have.
1049 // So, this code achieves what we need whether or not t_fp_control_saved is
1050 // true. By checking whether the value needs updating we avoid unnecessary
1051 // writes that would put the cache-line into a written state, causing all
1052 // threads in the team to have to read it again.
1053 KMP_CHECK_UPDATE(team->t.t_x87_fpu_control_word, x87_fpu_control_word);
1054 KMP_CHECK_UPDATE(team->t.t_mxcsr, mxcsr);
1055 // Although we don't use this value, other code in the runtime wants to know
1056 // whether it should restore them. So we must ensure it is correct.
1057 KMP_CHECK_UPDATE(team->t.t_fp_control_saved, TRUE);
1058 } else {
1059 // Similarly here. Don't write to this cache-line in the team structure
1060 // unless we have to.
1061 KMP_CHECK_UPDATE(team->t.t_fp_control_saved, FALSE);
1062 }
1063}
1064
1065// Do the opposite, setting the hardware registers to the updated values from
1066// the team.
1067inline static void updateHWFPControl(kmp_team_t *team) {
1068 if (__kmp_inherit_fp_control && team->t.t_fp_control_saved) {
1069 // Only reset the fp control regs if they have been changed in the team,
1070 // i.e. during the parallel region that we are exiting.
1071 kmp_int16 x87_fpu_control_word;
1072 kmp_uint32 mxcsr;
1073 __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1074 __kmp_store_mxcsr(&mxcsr);
1075 mxcsr &= KMP_X86_MXCSR_MASK;
1076
1077 if (team->t.t_x87_fpu_control_word != x87_fpu_control_word) {
1078 __kmp_clear_x87_fpu_status_word();
1079 __kmp_load_x87_fpu_control_word(&team->t.t_x87_fpu_control_word);
1080 }
1081
1082 if (team->t.t_mxcsr != mxcsr) {
1083 __kmp_load_mxcsr(&team->t.t_mxcsr);
1084 }
1085 }
1086}
1087#else
1088#define propagateFPControl(x) ((void)0)
1089#define updateHWFPControl(x) ((void)0)
1090#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
1091
1092static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team,
1093 int realloc); // forward declaration
1094
1095/* Run a parallel region that has been serialized, so runs only in a team of the
1096 single primary thread. */
1097void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
1098 kmp_info_t *this_thr;
1099 kmp_team_t *serial_team;
1100
1101 KC_TRACE(10, ("__kmpc_serialized_parallel: called by T#%d\n", global_tid));
1102
1103 /* Skip all this code for autopar serialized loops since it results in
1104 unacceptable overhead */
1105 if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR))
1106 return;
1107
1108 if (!TCR_4(__kmp_init_parallel))
1109 __kmp_parallel_initialize();
1110 __kmp_resume_if_soft_paused();
1111
1112 this_thr = __kmp_threads[global_tid];
1113 serial_team = this_thr->th.th_serial_team;
1114
1115 /* utilize the serialized team held by this thread */
1116 KMP_DEBUG_ASSERT(serial_team);
1117 KMP_MB();
1118
1119 if (__kmp_tasking_mode != tskm_immediate_exec) {
1120 KMP_DEBUG_ASSERT(
1121 this_thr->th.th_task_team ==
1122 this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state]);
1123 KMP_DEBUG_ASSERT(serial_team->t.t_task_team[this_thr->th.th_task_state] ==
1124 NULL);
1125 KA_TRACE(20, ("__kmpc_serialized_parallel: T#%d pushing task_team %p / "
1126 "team %p, new task_team = NULL\n",
1127 global_tid, this_thr->th.th_task_team, this_thr->th.th_team));
1128 this_thr->th.th_task_team = NULL;
1129 }
1130
1131 kmp_proc_bind_t proc_bind = this_thr->th.th_set_proc_bind;
1132 if (this_thr->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1133 proc_bind = proc_bind_false;
1134 } else if (proc_bind == proc_bind_default) {
1135 // No proc_bind clause was specified, so use the current value
1136 // of proc-bind-var for this parallel region.
1137 proc_bind = this_thr->th.th_current_task->td_icvs.proc_bind;
1138 }
1139 // Reset for next parallel region
1140 this_thr->th.th_set_proc_bind = proc_bind_default;
1141
1142#if OMPT_SUPPORT
1143 ompt_data_t ompt_parallel_data = ompt_data_none;
1144 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid);
1145 if (ompt_enabled.enabled &&
1146 this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
1147
1148 ompt_task_info_t *parent_task_info;
1149 parent_task_info = OMPT_CUR_TASK_INFO(this_thr);
1150
1151 parent_task_info->frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1152 if (ompt_enabled.ompt_callback_parallel_begin) {
1153 int team_size = 1;
1154
1155 ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
1156 &(parent_task_info->task_data), &(parent_task_info->frame),
1157 &ompt_parallel_data, team_size,
1158 ompt_parallel_invoker_program | ompt_parallel_team, codeptr);
1159 }
1160 }
1161#endif // OMPT_SUPPORT
1162
1163 if (this_thr->th.th_team != serial_team) {
1164 // Nested level will be an index in the nested nthreads array
1165 int level = this_thr->th.th_team->t.t_level;
1166
1167 if (serial_team->t.t_serialized) {
1168 /* this serial team was already used
1169 TODO increase performance by making these locks more specific */
1170 kmp_team_t *new_team;
1171
1172 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
1173
1174 new_team =
1175 __kmp_allocate_team(this_thr->th.th_root, 1, 1,
1176#if OMPT_SUPPORT
1177 ompt_parallel_data,
1178#endif
1179 proc_bind, &this_thr->th.th_current_task->td_icvs,
1180 0 USE_NESTED_HOT_ARG(NULL));
1181 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
1182 KMP_ASSERT(new_team);
1183
1184 /* setup new serialized team and install it */
1185 new_team->t.t_threads[0] = this_thr;
1186 new_team->t.t_parent = this_thr->th.th_team;
1187 serial_team = new_team;
1188 this_thr->th.th_serial_team = serial_team;
1189
1190 KF_TRACE(
1191 10,
1192 ("__kmpc_serialized_parallel: T#%d allocated new serial team %p\n",
1193 global_tid, serial_team));
1194
1195 /* TODO the above breaks the requirement that if we run out of resources,
1196 then we can still guarantee that serialized teams are ok, since we may
1197 need to allocate a new one */
1198 } else {
1199 KF_TRACE(
1200 10,
1201 ("__kmpc_serialized_parallel: T#%d reusing cached serial team %p\n",
1202 global_tid, serial_team));
1203 }
1204
1205 /* we have to initialize this serial team */
1206 KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1207 KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1208 KMP_DEBUG_ASSERT(this_thr->th.th_team != serial_team);
1209 serial_team->t.t_ident = loc;
1210 serial_team->t.t_serialized = 1;
1211 serial_team->t.t_nproc = 1;
1212 serial_team->t.t_parent = this_thr->th.th_team;
1213 serial_team->t.t_sched.sched = this_thr->th.th_team->t.t_sched.sched;
1214 this_thr->th.th_team = serial_team;
1215 serial_team->t.t_master_tid = this_thr->th.th_info.ds.ds_tid;
1216
1217 KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d curtask=%p\n", global_tid,
1218 this_thr->th.th_current_task));
1219 KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 1);
1220 this_thr->th.th_current_task->td_flags.executing = 0;
1221
1222 __kmp_push_current_task_to_thread(this_thr, serial_team, 0);
1223
1224 /* TODO: GEH: do ICVs work for nested serialized teams? Don't we need an
1225 implicit task for each serialized task represented by
1226 team->t.t_serialized? */
1227 copy_icvs(&this_thr->th.th_current_task->td_icvs,
1228 &this_thr->th.th_current_task->td_parent->td_icvs);
1229
1230 // Thread value exists in the nested nthreads array for the next nested
1231 // level
1232 if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
1233 this_thr->th.th_current_task->td_icvs.nproc =
1234 __kmp_nested_nth.nth[level + 1];
1235 }
1236
1237 if (__kmp_nested_proc_bind.used &&
1238 (level + 1 < __kmp_nested_proc_bind.used)) {
1239 this_thr->th.th_current_task->td_icvs.proc_bind =
1240 __kmp_nested_proc_bind.bind_types[level + 1];
1241 }
1242
1243#if USE_DEBUGGER
1244 serial_team->t.t_pkfn = (microtask_t)(~0); // For the debugger.
1245#endif
1246 this_thr->th.th_info.ds.ds_tid = 0;
1247
1248 /* set thread cache values */
1249 this_thr->th.th_team_nproc = 1;
1250 this_thr->th.th_team_master = this_thr;
1251 this_thr->th.th_team_serialized = 1;
1252
1253 serial_team->t.t_level = serial_team->t.t_parent->t.t_level + 1;
1254 serial_team->t.t_active_level = serial_team->t.t_parent->t.t_active_level;
1255 serial_team->t.t_def_allocator = this_thr->th.th_def_allocator; // save
1256
1257 propagateFPControl(serial_team);
1258
1259 /* check if we need to allocate dispatch buffers stack */
1260 KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1261 if (!serial_team->t.t_dispatch->th_disp_buffer) {
1262 serial_team->t.t_dispatch->th_disp_buffer =
1263 (dispatch_private_info_t *)__kmp_allocate(
1264 sizeof(dispatch_private_info_t));
1265 }
1266 this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1267
1268 KMP_MB();
1269
1270 } else {
1271 /* this serialized team is already being used,
1272 * that's fine, just add another nested level */
1273 KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team);
1274 KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1275 KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1276 ++serial_team->t.t_serialized;
1277 this_thr->th.th_team_serialized = serial_team->t.t_serialized;
1278
1279 // Nested level will be an index in the nested nthreads array
1280 int level = this_thr->th.th_team->t.t_level;
1281 // Thread value exists in the nested nthreads array for the next nested
1282 // level
1283 if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
1284 this_thr->th.th_current_task->td_icvs.nproc =
1285 __kmp_nested_nth.nth[level + 1];
1286 }
1287 serial_team->t.t_level++;
1288 KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d increasing nesting level "
1289 "of serial team %p to %d\n",
1290 global_tid, serial_team, serial_team->t.t_level));
1291
1292 /* allocate/push dispatch buffers stack */
1293 KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1294 {
1295 dispatch_private_info_t *disp_buffer =
1296 (dispatch_private_info_t *)__kmp_allocate(
1297 sizeof(dispatch_private_info_t));
1298 disp_buffer->next = serial_team->t.t_dispatch->th_disp_buffer;
1299 serial_team->t.t_dispatch->th_disp_buffer = disp_buffer;
1300 }
1301 this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1302
1303 KMP_MB();
1304 }
1305 KMP_CHECK_UPDATE(serial_team->t.t_cancel_request, cancel_noreq);
1306
1307 // Perform the display affinity functionality for
1308 // serialized parallel regions
1309 if (__kmp_display_affinity) {
1310 if (this_thr->th.th_prev_level != serial_team->t.t_level ||
1311 this_thr->th.th_prev_num_threads != 1) {
1312 // NULL means use the affinity-format-var ICV
1313 __kmp_aux_display_affinity(global_tid, NULL);
1314 this_thr->th.th_prev_level = serial_team->t.t_level;
1315 this_thr->th.th_prev_num_threads = 1;
1316 }
1317 }
1318
1319 if (__kmp_env_consistency_check)
1320 __kmp_push_parallel(global_tid, NULL);
1321#if OMPT_SUPPORT
1322 serial_team->t.ompt_team_info.master_return_address = codeptr;
1323 if (ompt_enabled.enabled &&
1324 this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
1325 OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr =
1326 OMPT_GET_FRAME_ADDRESS(0);
1327
1328 ompt_lw_taskteam_t lw_taskteam;
1329 __ompt_lw_taskteam_init(&lw_taskteam, this_thr, global_tid,
1330 &ompt_parallel_data, codeptr);
1331
1332 __ompt_lw_taskteam_link(&lw_taskteam, this_thr, 1);
1333 // don't use lw_taskteam after linking. content was swapped
1334
1335 /* OMPT implicit task begin */
1336 if (ompt_enabled.ompt_callback_implicit_task) {
1337 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1338 ompt_scope_begin, OMPT_CUR_TEAM_DATA(this_thr),
1339 OMPT_CUR_TASK_DATA(this_thr), 1, __kmp_tid_from_gtid(global_tid),
1340 ompt_task_implicit); // TODO: Can this be ompt_task_initial?
1341 OMPT_CUR_TASK_INFO(this_thr)->thread_num =
1342 __kmp_tid_from_gtid(global_tid);
1343 }
1344
1345 /* OMPT state */
1346 this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
1347 OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr =
1348 OMPT_GET_FRAME_ADDRESS(0);
1349 }
1350#endif
1351}
1352
1353/* most of the work for a fork */
1354/* return true if we really went parallel, false if serialized */
1355int __kmp_fork_call(ident_t *loc, int gtid,
1356 enum fork_context_e call_context, // Intel, GNU, ...
1357 kmp_int32 argc, microtask_t microtask, launch_t invoker,
1358 kmp_va_list ap) {
1359 void **argv;
1360 int i;
1361 int master_tid;
1362 int master_this_cons;
1363 kmp_team_t *team;
1364 kmp_team_t *parent_team;
1365 kmp_info_t *master_th;
1366 kmp_root_t *root;
1367 int nthreads;
1368 int master_active;
1369 int master_set_numthreads;
1370 int level;
1371 int active_level;
1372 int teams_level;
1373#if KMP_NESTED_HOT_TEAMS
1374 kmp_hot_team_ptr_t **p_hot_teams;
1375#endif
1376 { // KMP_TIME_BLOCK
1377 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_fork_call);
1378 KMP_COUNT_VALUE(OMP_PARALLEL_args, argc);
1379
1380 KA_TRACE(20, ("__kmp_fork_call: enter T#%d\n", gtid));
1381 if (__kmp_stkpadding > 0 && __kmp_root[gtid] != NULL) {
1382 /* Some systems prefer the stack for the root thread(s) to start with */
1383 /* some gap from the parent stack to prevent false sharing. */
1384 void *dummy = KMP_ALLOCA(__kmp_stkpadding);
1385 /* These 2 lines below are so this does not get optimized out */
1386 if (__kmp_stkpadding > KMP_MAX_STKPADDING)
1387 __kmp_stkpadding += (short)((kmp_int64)dummy);
1388 }
1389
1390 /* initialize if needed */
1391 KMP_DEBUG_ASSERT(
1392 __kmp_init_serial); // AC: potentially unsafe, not in sync with shutdown
1393 if (!TCR_4(__kmp_init_parallel))
1394 __kmp_parallel_initialize();
1395 __kmp_resume_if_soft_paused();
1396
1397 /* setup current data */
1398 master_th = __kmp_threads[gtid]; // AC: potentially unsafe, not in sync with
1399 // shutdown
1400 parent_team = master_th->th.th_team;
1401 master_tid = master_th->th.th_info.ds.ds_tid;
1402 master_this_cons = master_th->th.th_local.this_construct;
1403 root = master_th->th.th_root;
1404 master_active = root->r.r_active;
1405 master_set_numthreads = master_th->th.th_set_nproc;
1406
1407#if OMPT_SUPPORT
1408 ompt_data_t ompt_parallel_data = ompt_data_none;
1409 ompt_data_t *parent_task_data;
1410 ompt_frame_t *ompt_frame;
1411 ompt_data_t *implicit_task_data;
1412 void *return_address = NULL;
1413
1414 if (ompt_enabled.enabled) {
1415 __ompt_get_task_info_internal(0, NULL, &parent_task_data, &ompt_frame,
1416 NULL, NULL);
1417 return_address = OMPT_LOAD_RETURN_ADDRESS(gtid);
1418 }
1419#endif
1420
1421 // Assign affinity to root thread if it hasn't happened yet
1422 __kmp_assign_root_init_mask();
1423
1424 // Nested level will be an index in the nested nthreads array
1425 level = parent_team->t.t_level;
1426 // used to launch non-serial teams even if nested is not allowed
1427 active_level = parent_team->t.t_active_level;
1428 // needed to check nesting inside the teams
1429 teams_level = master_th->th.th_teams_level;
1430#if KMP_NESTED_HOT_TEAMS
1431 p_hot_teams = &master_th->th.th_hot_teams;
1432 if (*p_hot_teams == NULL && __kmp_hot_teams_max_level > 0) {
1433 *p_hot_teams = (kmp_hot_team_ptr_t *)__kmp_allocate(
1434 sizeof(kmp_hot_team_ptr_t) * __kmp_hot_teams_max_level);
1435 (*p_hot_teams)[0].hot_team = root->r.r_hot_team;
1436 // it is either actual or not needed (when active_level > 0)
1437 (*p_hot_teams)[0].hot_team_nth = 1;
1438 }
1439#endif
1440
1441#if OMPT_SUPPORT
1442 if (ompt_enabled.enabled) {
1443 if (ompt_enabled.ompt_callback_parallel_begin) {
1444 int team_size = master_set_numthreads
1445 ? master_set_numthreads
1446 : get__nproc_2(parent_team, master_tid);
1447 int flags = OMPT_INVOKER(call_context) |
1448 ((microtask == (microtask_t)__kmp_teams_master)
1449 ? ompt_parallel_league
1450 : ompt_parallel_team);
1451 ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
1452 parent_task_data, ompt_frame, &ompt_parallel_data, team_size, flags,
1453 return_address);
1454 }
1455 master_th->th.ompt_thread_info.state = ompt_state_overhead;
1456 }
1457#endif
1458
1459 master_th->th.th_ident = loc;
1460
1461 if (master_th->th.th_teams_microtask && ap &&
1462 microtask != (microtask_t)__kmp_teams_master && level == teams_level) {
1463 // AC: This is start of parallel that is nested inside teams construct.
1464 // The team is actual (hot), all workers are ready at the fork barrier.
1465 // No lock needed to initialize the team a bit, then free workers.
1466 parent_team->t.t_ident = loc;
1467 __kmp_alloc_argv_entries(argc, parent_team, TRUE);
1468 parent_team->t.t_argc = argc;
1469 argv = (void **)parent_team->t.t_argv;
1470 for (i = argc - 1; i >= 0; --i)
1471 *argv++ = va_arg(kmp_va_deref(ap), void *);
1472 // Increment our nested depth level, but do not increase the serialization
1473 if (parent_team == master_th->th.th_serial_team) {
1474 // AC: we are in serialized parallel
1475 __kmpc_serialized_parallel(loc, gtid);
1476 KMP_DEBUG_ASSERT(parent_team->t.t_serialized > 1);
1477
1478 if (call_context == fork_context_gnu) {
1479 // AC: need to decrement t_serialized for enquiry functions to work
1480 // correctly, will restore at join time
1481 parent_team->t.t_serialized--;
1482 return TRUE;
1483 }
1484
1485#if OMPD_SUPPORT
1486 parent_team->t.t_pkfn = microtask;
1487#endif
1488
1489#if OMPT_SUPPORT
1490 void *dummy;
1491 void **exit_frame_p;
1492
1493 ompt_lw_taskteam_t lw_taskteam;
1494
1495 if (ompt_enabled.enabled) {
1496 __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1497 &ompt_parallel_data, return_address);
1498 exit_frame_p = &(lw_taskteam.ompt_task_info.frame.exit_frame.ptr);
1499
1500 __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1501 // don't use lw_taskteam after linking. content was swapped
1502
1503 /* OMPT implicit task begin */
1504 implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1505 if (ompt_enabled.ompt_callback_implicit_task) {
1506 OMPT_CUR_TASK_INFO(master_th)->thread_num =
1507 __kmp_tid_from_gtid(gtid);
1508 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1509 ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1510 implicit_task_data, 1,
1511 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1512 }
1513
1514 /* OMPT state */
1515 master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1516 } else {
1517 exit_frame_p = &dummy;
1518 }
1519#endif
1520 // AC: need to decrement t_serialized for enquiry functions to work
1521 // correctly, will restore at join time
1522 parent_team->t.t_serialized--;
1523
1524 {
1525 KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1526 KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1527 __kmp_invoke_microtask(microtask, gtid, 0, argc, parent_team->t.t_argv
1528#if OMPT_SUPPORT
1529 ,
1530 exit_frame_p
1531#endif
1532 );
1533 }
1534
1535#if OMPT_SUPPORT
1536 if (ompt_enabled.enabled) {
1537 *exit_frame_p = NULL;
1538 OMPT_CUR_TASK_INFO(master_th)->frame.exit_frame = ompt_data_none;
1539 if (ompt_enabled.ompt_callback_implicit_task) {
1540 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1541 ompt_scope_end, NULL, implicit_task_data, 1,
1542 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1543 }
1544 ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1545 __ompt_lw_taskteam_unlink(master_th);
1546 if (ompt_enabled.ompt_callback_parallel_end) {
1547 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1548 &ompt_parallel_data, OMPT_CUR_TASK_DATA(master_th),
1549 OMPT_INVOKER(call_context) | ompt_parallel_team,
1550 return_address);
1551 }
1552 master_th->th.ompt_thread_info.state = ompt_state_overhead;
1553 }
1554#endif
1555 return TRUE;
1556 }
1557
1558 parent_team->t.t_pkfn = microtask;
1559 parent_team->t.t_invoke = invoker;
1560 KMP_ATOMIC_INC(&root->r.r_in_parallel);
1561 parent_team->t.t_active_level++;
1562 parent_team->t.t_level++;
1563 parent_team->t.t_def_allocator = master_th->th.th_def_allocator; // save
1564
1565#if OMPT_SUPPORT
1566 if (ompt_enabled.enabled) {
1567 ompt_lw_taskteam_t lw_taskteam;
1568 __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1569 &ompt_parallel_data, return_address);
1570 __ompt_lw_taskteam_link(&lw_taskteam, master_th, 1, true);
1571 }
1572#endif
1573
1574 /* Change number of threads in the team if requested */
1575 if (master_set_numthreads) { // The parallel has num_threads clause
1576 if (master_set_numthreads <= master_th->th.th_teams_size.nth) {
1577 // AC: we can only reduce the number of threads dynamically, not increase it
1578 kmp_info_t **other_threads = parent_team->t.t_threads;
1579 // NOTE: if using distributed barrier, we need to run this code block
1580 // even when the team size appears not to have changed from the max.
1581 int old_proc = master_th->th.th_teams_size.nth;
1582 if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] ==
1583 bp_dist_bar) {
1584 __kmp_resize_dist_barrier(parent_team, old_proc,
1585 master_set_numthreads);
1586 __kmp_add_threads_to_team(parent_team, master_set_numthreads);
1587 }
1588 parent_team->t.t_nproc = master_set_numthreads;
1589 for (i = 0; i < master_set_numthreads; ++i) {
1590 other_threads[i]->th.th_team_nproc = master_set_numthreads;
1591 }
1592 }
1593 // Keep extra threads hot in the team for possible next parallels
1594 master_th->th.th_set_nproc = 0;
1595 }
1596
1597#if USE_DEBUGGER
1598 if (__kmp_debugging) { // Let debugger override number of threads.
1599 int nth = __kmp_omp_num_threads(loc);
1600 if (nth > 0) { // 0 means debugger doesn't want to change num threads
1601 master_set_numthreads = nth;
1602 }
1603 }
1604#endif
1605
1606 // Figure out the proc_bind policy for the nested parallel within teams
1607 kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind;
1608 // proc_bind_default means don't update
1609 kmp_proc_bind_t proc_bind_icv = proc_bind_default;
1610 if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1611 proc_bind = proc_bind_false;
1612 } else {
1613 // No proc_bind clause specified; use current proc-bind-var
1614 if (proc_bind == proc_bind_default) {
1615 proc_bind = master_th->th.th_current_task->td_icvs.proc_bind;
1616 }
1617 /* else: The proc_bind policy was specified explicitly on parallel
1618 clause.
1619 This overrides proc-bind-var for this parallel region, but does not
1620 change proc-bind-var. */
1621 // Figure the value of proc-bind-var for the child threads.
1622 if ((level + 1 < __kmp_nested_proc_bind.used) &&
1623 (__kmp_nested_proc_bind.bind_types[level + 1] !=
1624 master_th->th.th_current_task->td_icvs.proc_bind)) {
1625 proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1];
1626 }
1627 }
1628 KMP_CHECK_UPDATE(parent_team->t.t_proc_bind, proc_bind);
1629 // Need to change the bind-var ICV to the correct value for each implicit task
1630 if (proc_bind_icv != proc_bind_default &&
1631 master_th->th.th_current_task->td_icvs.proc_bind != proc_bind_icv) {
1632 kmp_info_t **other_threads = parent_team->t.t_threads;
1633 for (i = 0; i < master_th->th.th_team_nproc; ++i) {
1634 other_threads[i]->th.th_current_task->td_icvs.proc_bind =
1635 proc_bind_icv;
1636 }
1637 }
1638 // Reset for next parallel region
1639 master_th->th.th_set_proc_bind = proc_bind_default;
1640
1641#if USE_ITT_BUILD && USE_ITT_NOTIFY
1642 if (((__itt_frame_submit_v3_ptr && __itt_get_timestamp_ptr) ||
1643 KMP_ITT_DEBUG) &&
1644 __kmp_forkjoin_frames_mode == 3 &&
1645 parent_team->t.t_active_level == 1 // only report frames at level 1
1646 && master_th->th.th_teams_size.nteams == 1) {
1647 kmp_uint64 tmp_time = __itt_get_timestamp();
1648 master_th->th.th_frame_time = tmp_time;
1649 parent_team->t.t_region_time = tmp_time;
1650 }
1651 if (__itt_stack_caller_create_ptr) {
1652 KMP_DEBUG_ASSERT(parent_team->t.t_stack_id == NULL);
1653 // create new stack stitching id before entering fork barrier
1654 parent_team->t.t_stack_id = __kmp_itt_stack_caller_create();
1655 }
1656#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
1657#if KMP_AFFINITY_SUPPORTED
1658 __kmp_partition_places(parent_team);
1659#endif
1660
1661 KF_TRACE(10, ("__kmp_fork_call: before internal fork: root=%p, team=%p, "
1662 "master_th=%p, gtid=%d\n",
1663 root, parent_team, master_th, gtid));
1664 __kmp_internal_fork(loc, gtid, parent_team);
1665 KF_TRACE(10, ("__kmp_fork_call: after internal fork: root=%p, team=%p, "
1666 "master_th=%p, gtid=%d\n",
1667 root, parent_team, master_th, gtid));
1668
1669 if (call_context == fork_context_gnu)
1670 return TRUE;
1671
1672 /* Invoke microtask for PRIMARY thread */
1673 KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
1674 parent_team->t.t_id, parent_team->t.t_pkfn));
1675
1676 if (!parent_team->t.t_invoke(gtid)) {
1677 KMP_ASSERT2(0, "cannot invoke microtask for PRIMARY thread");
1678 }
1679 KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
1680 parent_team->t.t_id, parent_team->t.t_pkfn));
1681 KMP_MB(); /* Flush all pending memory write invalidates. */
1682
1683 KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
1684
1685 return TRUE;
1686 } // Parallel closely nested in teams construct
1687
1688#if KMP_DEBUG
1689 if (__kmp_tasking_mode != tskm_immediate_exec) {
1690 KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
1691 parent_team->t.t_task_team[master_th->th.th_task_state]);
1692 }
1693#endif
1694
1695 // Need this to happen before we determine the number of threads, not while
1696 // we are allocating the team
1697 //__kmp_push_current_task_to_thread(master_th, parent_team, 0);
1698 int enter_teams = 0;
1699 if (parent_team->t.t_active_level >=
1700 master_th->th.th_current_task->td_icvs.max_active_levels) {
1701 nthreads = 1;
1702 } else {
1703 enter_teams = ((ap == NULL && active_level == 0) ||
1704 (ap && teams_level > 0 && teams_level == level));
1705 nthreads = master_set_numthreads
1706 ? master_set_numthreads
1707 // TODO: get nproc directly from current task
1708 : get__nproc_2(parent_team, master_tid);
1709 // Check if we need to take the forkjoin lock (no need for a serialized
1710 // parallel outside of a teams construct). This code was moved here from
1711 // __kmp_reserve_threads() to speed up nested serialized parallels.
1712 if (nthreads > 1) {
1713 if ((get__max_active_levels(master_th) == 1 &&
1714 (root->r.r_in_parallel && !enter_teams)) ||
1715 (__kmp_library == library_serial)) {
1716 KC_TRACE(10, ("__kmp_fork_call: T#%d serializing team; requested %d"
1717 " threads\n",
1718 gtid, nthreads));
1719 nthreads = 1;
1720 }
1721 }
1722 if (nthreads > 1) {
1723 /* determine how many new threads we can use */
1724 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
1725 /* AC: If we execute teams from a parallel region (on the host), then the
1726 teams should be created, but each can have only 1 thread if nesting is
1727 disabled. If teams is called from a serial region, then the teams and
1728 their threads should be created regardless of the nesting setting. */
1729 nthreads = __kmp_reserve_threads(root, parent_team, master_tid,
1730 nthreads, enter_teams);
1731 if (nthreads == 1) {
1732 // Free the lock for single-thread execution here; for multi-thread
1733 // execution it will be freed later, after the team of threads has been
1734 // created and initialized
1735 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
1736 }
1737 }
1738 }
1739 KMP_DEBUG_ASSERT(nthreads > 0);
1740
1741 // If we temporarily changed the set number of threads then restore it now
1742 master_th->th.th_set_nproc = 0;
1743
1744 /* create a serialized parallel region? */
1745 if (nthreads == 1) {
1746/* josh todo: hypothetical question: what do we do for OS X*? */
1747#if KMP_OS_LINUX && \
1748 (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
1749 void *args[argc];
1750#else
1751 void **args = (void **)KMP_ALLOCA(argc * sizeof(void *));
1752#endif /* KMP_OS_LINUX && ( KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || \
1753 KMP_ARCH_AARCH64) */
1754
1755 KA_TRACE(20,
1756 ("__kmp_fork_call: T#%d serializing parallel region\n", gtid));
1757
1758 __kmpc_serialized_parallel(loc, gtid);
1759
1760#if OMPD_SUPPORT
1761 master_th->th.th_serial_team->t.t_pkfn = microtask;
1762#endif
1763
1764 if (call_context == fork_context_intel) {
1765 /* TODO this sucks, use the compiler itself to pass args! :) */
1766 master_th->th.th_serial_team->t.t_ident = loc;
1767 if (!ap) {
1768 // revert change made in __kmpc_serialized_parallel()
1769 master_th->th.th_serial_team->t.t_level--;
1770 // Get args from parent team for teams construct
1771
1772#if OMPT_SUPPORT
1773 void *dummy;
1774 void **exit_frame_p;
1775 ompt_task_info_t *task_info;
1776
1777 ompt_lw_taskteam_t lw_taskteam;
1778
1779 if (ompt_enabled.enabled) {
1780 __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1781 &ompt_parallel_data, return_address);
1782
1783 __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1784 // don't use lw_taskteam after linking. content was swapped
1785
1786 task_info = OMPT_CUR_TASK_INFO(master_th);
1787 exit_frame_p = &(task_info->frame.exit_frame.ptr);
1788 if (ompt_enabled.ompt_callback_implicit_task) {
1789 OMPT_CUR_TASK_INFO(master_th)->thread_num =
1790 __kmp_tid_from_gtid(gtid);
1791 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1792 ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1793 &(task_info->task_data), 1,
1794 OMPT_CUR_TASK_INFO(master_th)->thread_num,
1795 ompt_task_implicit);
1796 }
1797
1798 /* OMPT state */
1799 master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1800 } else {
1801 exit_frame_p = &dummy;
1802 }
1803#endif
1804
1805 {
1806 KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1807 KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1808 __kmp_invoke_microtask(microtask, gtid, 0, argc,
1809 parent_team->t.t_argv
1810#if OMPT_SUPPORT
1811 ,
1812 exit_frame_p
1813#endif
1814 );
1815 }
1816
1817#if OMPT_SUPPORT
1818 if (ompt_enabled.enabled) {
1819 *exit_frame_p = NULL;
1820 if (ompt_enabled.ompt_callback_implicit_task) {
1821 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1822 ompt_scope_end, NULL, &(task_info->task_data), 1,
1823 OMPT_CUR_TASK_INFO(master_th)->thread_num,
1824 ompt_task_implicit);
1825 }
1826 ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1827 __ompt_lw_taskteam_unlink(master_th);
1828 if (ompt_enabled.ompt_callback_parallel_end) {
1829 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1830 &ompt_parallel_data, parent_task_data,
1831 OMPT_INVOKER(call_context) | ompt_parallel_team,
1832 return_address);
1833 }
1834 master_th->th.ompt_thread_info.state = ompt_state_overhead;
1835 }
1836#endif
1837 } else if (microtask == (microtask_t)__kmp_teams_master) {
1838 KMP_DEBUG_ASSERT(master_th->th.th_team ==
1839 master_th->th.th_serial_team);
1840 team = master_th->th.th_team;
1841 // team->t.t_pkfn = microtask;
1842 team->t.t_invoke = invoker;
1843 __kmp_alloc_argv_entries(argc, team, TRUE);
1844 team->t.t_argc = argc;
1845 argv = (void **)team->t.t_argv;
1846 if (ap) {
1847 for (i = argc - 1; i >= 0; --i)
1848 *argv++ = va_arg(kmp_va_deref(ap), void *);
1849 } else {
1850 for (i = 0; i < argc; ++i)
1851 // Get args from parent team for teams construct
1852 argv[i] = parent_team->t.t_argv[i];
1853 }
1854 // AC: revert change made in __kmpc_serialized_parallel()
1855 // because initial code in teams should have level=0
1856 team->t.t_level--;
1857 // AC: call special invoker for outer "parallel" of teams construct
1858 invoker(gtid);
1859#if OMPT_SUPPORT
1860 if (ompt_enabled.enabled) {
1861 ompt_task_info_t *task_info = OMPT_CUR_TASK_INFO(master_th);
1862 if (ompt_enabled.ompt_callback_implicit_task) {
1863 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1864 ompt_scope_end, NULL, &(task_info->task_data), 0,
1865 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_initial);
1866 }
1867 if (ompt_enabled.ompt_callback_parallel_end) {
1868 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1869 &ompt_parallel_data, parent_task_data,
1870 OMPT_INVOKER(call_context) | ompt_parallel_league,
1871 return_address);
1872 }
1873 master_th->th.ompt_thread_info.state = ompt_state_overhead;
1874 }
1875#endif
1876 } else {
1877 argv = args;
1878 for (i = argc - 1; i >= 0; --i)
1879 *argv++ = va_arg(kmp_va_deref(ap), void *);
1880 KMP_MB();
1881
1882#if OMPT_SUPPORT
1883 void *dummy;
1884 void **exit_frame_p;
1885 ompt_task_info_t *task_info;
1886
1887 ompt_lw_taskteam_t lw_taskteam;
1888
1889 if (ompt_enabled.enabled) {
1890 __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1891 &ompt_parallel_data, return_address);
1892 __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1893 // don't use lw_taskteam after linking. content was swapped
1894 task_info = OMPT_CUR_TASK_INFO(master_th);
1895 exit_frame_p = &(task_info->frame.exit_frame.ptr);
1896
1897 /* OMPT implicit task begin */
1898 implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1899 if (ompt_enabled.ompt_callback_implicit_task) {
1900 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1901 ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1902 implicit_task_data, 1, __kmp_tid_from_gtid(gtid),
1903 ompt_task_implicit);
1904 OMPT_CUR_TASK_INFO(master_th)->thread_num =
1905 __kmp_tid_from_gtid(gtid);
1906 }
1907
1908 /* OMPT state */
1909 master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1910 } else {
1911 exit_frame_p = &dummy;
1912 }
1913#endif
1914
1915 {
1916 KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1917 KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1918 __kmp_invoke_microtask(microtask, gtid, 0, argc, args
1919#if OMPT_SUPPORT
1920 ,
1921 exit_frame_p
1922#endif
1923 );
1924 }
1925
1926#if OMPT_SUPPORT
1927 if (ompt_enabled.enabled) {
1928 *exit_frame_p = NULL;
1929 if (ompt_enabled.ompt_callback_implicit_task) {
1930 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1931 ompt_scope_end, NULL, &(task_info->task_data), 1,
1932 OMPT_CUR_TASK_INFO(master_th)->thread_num,
1933 ompt_task_implicit);
1934 }
1935
1936 ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1937 __ompt_lw_taskteam_unlink(master_th);
1938 if (ompt_enabled.ompt_callback_parallel_end) {
1939 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1940 &ompt_parallel_data, parent_task_data,
1941 OMPT_INVOKER(call_context) | ompt_parallel_team,
1942 return_address);
1943 }
1944 master_th->th.ompt_thread_info.state = ompt_state_overhead;
1945 }
1946#endif
1947 }
1948 } else if (call_context == fork_context_gnu) {
1949#if OMPT_SUPPORT
1950 ompt_lw_taskteam_t lwt;
1951 __ompt_lw_taskteam_init(&lwt, master_th, gtid, &ompt_parallel_data,
1952 return_address);
1953
1954 lwt.ompt_task_info.frame.exit_frame = ompt_data_none;
1955 __ompt_lw_taskteam_link(&lwt, master_th, 1);
1956// don't use lw_taskteam after linking. content was swapped
1957#endif
1958
1959 // we were called from GNU native code
1960 KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid));
1961 return FALSE;
1962 } else {
1963 KMP_ASSERT2(call_context < fork_context_last,
1964 "__kmp_fork_call: unknown fork_context parameter");
1965 }
1966
1967 KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid));
1968 KMP_MB();
1969 return FALSE;
1970 } // if (nthreads == 1)
1971
1972 // GEH: only modify the executing flag in the case when not serialized;
1973 // the serialized case is handled in __kmpc_serialized_parallel
1974 KF_TRACE(10, ("__kmp_fork_call: parent_team_aclevel=%d, master_th=%p, "
1975 "curtask=%p, curtask_max_aclevel=%d\n",
1976 parent_team->t.t_active_level, master_th,
1977 master_th->th.th_current_task,
1978 master_th->th.th_current_task->td_icvs.max_active_levels));
1979 // TODO: GEH - cannot do this assertion because root thread not set up as
1980 // executing
1981 // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 1 );
1982 master_th->th.th_current_task->td_flags.executing = 0;
1983
1984 if (!master_th->th.th_teams_microtask || level > teams_level) {
1985 /* Increment our nested depth level */
1986 KMP_ATOMIC_INC(&root->r.r_in_parallel);
1987 }
1988
1989 // See if we need to make a copy of the ICVs.
1990 int nthreads_icv = master_th->th.th_current_task->td_icvs.nproc;
1991 if ((level + 1 < __kmp_nested_nth.used) &&
1992 (__kmp_nested_nth.nth[level + 1] != nthreads_icv)) {
1993 nthreads_icv = __kmp_nested_nth.nth[level + 1];
1994 } else {
1995 nthreads_icv = 0; // don't update
1996 }
1997
1998 // Figure out the proc_bind_policy for the new team.
1999 kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind;
2000 // proc_bind_default means don't update
2001 kmp_proc_bind_t proc_bind_icv = proc_bind_default;
2002 if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
2003 proc_bind = proc_bind_false;
2004 } else {
2005 // No proc_bind clause specified; use current proc-bind-var for this
2006 // parallel region
2007 if (proc_bind == proc_bind_default) {
2008 proc_bind = master_th->th.th_current_task->td_icvs.proc_bind;
2009 }
2010 // Have teams construct take proc_bind value from KMP_TEAMS_PROC_BIND
2011 if (master_th->th.th_teams_microtask &&
2012 microtask == (microtask_t)__kmp_teams_master) {
2013 proc_bind = __kmp_teams_proc_bind;
2014 }
2015 /* else: The proc_bind policy was specified explicitly on parallel clause.
2016 This overrides proc-bind-var for this parallel region, but does not
2017 change proc-bind-var. */
2018 // Figure the value of proc-bind-var for the child threads.
2019 if ((level + 1 < __kmp_nested_proc_bind.used) &&
2020 (__kmp_nested_proc_bind.bind_types[level + 1] !=
2021 master_th->th.th_current_task->td_icvs.proc_bind)) {
2022 // Do not modify the proc-bind ICV for the two teams construct forks;
2023 // they just let the proc-bind ICV pass through
2024 if (!master_th->th.th_teams_microtask ||
2025 !(microtask == (microtask_t)__kmp_teams_master || ap == NULL))
2026 proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1];
2027 }
2028 }
2029
2030 // Reset for next parallel region
2031 master_th->th.th_set_proc_bind = proc_bind_default;
2032
2033 if ((nthreads_icv > 0) || (proc_bind_icv != proc_bind_default)) {
2034 kmp_internal_control_t new_icvs;
2035 copy_icvs(&new_icvs, &master_th->th.th_current_task->td_icvs);
2036 new_icvs.next = NULL;
2037 if (nthreads_icv > 0) {
2038 new_icvs.nproc = nthreads_icv;
2039 }
2040 if (proc_bind_icv != proc_bind_default) {
2041 new_icvs.proc_bind = proc_bind_icv;
2042 }
2043
2044 /* allocate a new parallel team */
2045 KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
2046 team = __kmp_allocate_team(root, nthreads, nthreads,
2047#if OMPT_SUPPORT
2048 ompt_parallel_data,
2049#endif
2050 proc_bind, &new_icvs,
2051 argc USE_NESTED_HOT_ARG(master_th));
2052 if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar)
2053 copy_icvs((kmp_internal_control_t *)team->t.b->team_icvs, &new_icvs);
2054 } else {
2055 /* allocate a new parallel team */
2056 KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
2057 team = __kmp_allocate_team(root, nthreads, nthreads,
2058#if OMPT_SUPPORT
2059 ompt_parallel_data,
2060#endif
2061 proc_bind,
2062 &master_th->th.th_current_task->td_icvs,
2063 argc USE_NESTED_HOT_ARG(master_th));
2064 if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar)
2065 copy_icvs((kmp_internal_control_t *)team->t.b->team_icvs,
2066 &master_th->th.th_current_task->td_icvs);
2067 }
2068 KF_TRACE(
2069 10, ("__kmp_fork_call: after __kmp_allocate_team - team = %p\n", team));
2070
2071 /* setup the new team */
2072 KMP_CHECK_UPDATE(team->t.t_master_tid, master_tid);
2073 KMP_CHECK_UPDATE(team->t.t_master_this_cons, master_this_cons);
2074 KMP_CHECK_UPDATE(team->t.t_ident, loc);
2075 KMP_CHECK_UPDATE(team->t.t_parent, parent_team);
2076 KMP_CHECK_UPDATE_SYNC(team->t.t_pkfn, microtask);
2077#if OMPT_SUPPORT
2078 KMP_CHECK_UPDATE_SYNC(team->t.ompt_team_info.master_return_address,
2079 return_address);
2080#endif
2081 KMP_CHECK_UPDATE(team->t.t_invoke, invoker); // TODO move to root, maybe
2082 // TODO: parent_team->t.t_level == INT_MAX ???
2083 if (!master_th->th.th_teams_microtask || level > teams_level) {
2084 int new_level = parent_team->t.t_level + 1;
2085 KMP_CHECK_UPDATE(team->t.t_level, new_level);
2086 new_level = parent_team->t.t_active_level + 1;
2087 KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2088 } else {
2089 // AC: Do not increase parallel level at start of the teams construct
2090 int new_level = parent_team->t.t_level;
2091 KMP_CHECK_UPDATE(team->t.t_level, new_level);
2092 new_level = parent_team->t.t_active_level;
2093 KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2094 }
2095 kmp_r_sched_t new_sched = get__sched_2(parent_team, master_tid);
2096 // set primary thread's schedule as new run-time schedule
2097 KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
2098
2099 KMP_CHECK_UPDATE(team->t.t_cancel_request, cancel_noreq);
2100 KMP_CHECK_UPDATE(team->t.t_def_allocator, master_th->th.th_def_allocator);
2101
2102 // Update the floating point rounding in the team if required.
2103 propagateFPControl(team);
2104#if OMPD_SUPPORT
2105 if (ompd_state & OMPD_ENABLE_BP)
2106 ompd_bp_parallel_begin();
2107#endif
2108
2109 if (__kmp_tasking_mode != tskm_immediate_exec) {
2110 // Set primary thread's task team to team's task team. Unless this is hot
2111 // team, it should be NULL.
2112 KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
2113 parent_team->t.t_task_team[master_th->th.th_task_state]);
2114 KA_TRACE(20, ("__kmp_fork_call: Primary T#%d pushing task_team %p / team "
2115 "%p, new task_team %p / team %p\n",
2116 __kmp_gtid_from_thread(master_th),
2117 master_th->th.th_task_team, parent_team,
2118 team->t.t_task_team[master_th->th.th_task_state], team));
2119
2120 if (active_level || master_th->th.th_task_team) {
2121 // Save the primary thread's task_state
2122 KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2123 if (master_th->th.th_task_state_top >=
2124 master_th->th.th_task_state_stack_sz) { // increase size
2125 kmp_uint32 new_size = 2 * master_th->th.th_task_state_stack_sz;
2126 kmp_uint8 *old_stack, *new_stack;
2127 kmp_uint32 i;
2128 new_stack = (kmp_uint8 *)__kmp_allocate(new_size);
2129 for (i = 0; i < master_th->th.th_task_state_stack_sz; ++i) {
2130 new_stack[i] = master_th->th.th_task_state_memo_stack[i];
2131 }
2132 for (i = master_th->th.th_task_state_stack_sz; i < new_size;
2133 ++i) { // zero-init rest of stack
2134 new_stack[i] = 0;
2135 }
2136 old_stack = master_th->th.th_task_state_memo_stack;
2137 master_th->th.th_task_state_memo_stack = new_stack;
2138 master_th->th.th_task_state_stack_sz = new_size;
2139 __kmp_free(old_stack);
2140 }
2141 // Store primary thread's task_state on stack
2142 master_th->th
2143 .th_task_state_memo_stack[master_th->th.th_task_state_top] =
2144 master_th->th.th_task_state;
2145 master_th->th.th_task_state_top++;
2146#if KMP_NESTED_HOT_TEAMS
2147 if (master_th->th.th_hot_teams &&
2148 active_level < __kmp_hot_teams_max_level &&
2149 team == master_th->th.th_hot_teams[active_level].hot_team) {
2150 // Restore primary thread's nested state if nested hot team
2151 master_th->th.th_task_state =
2152 master_th->th
2153 .th_task_state_memo_stack[master_th->th.th_task_state_top];
2154 } else {
2155#endif
2156 master_th->th.th_task_state = 0;
2157#if KMP_NESTED_HOT_TEAMS
2158 }
2159#endif
2160 }
2161#if !KMP_NESTED_HOT_TEAMS
2162 KMP_DEBUG_ASSERT((master_th->th.th_task_team == NULL) ||
2163 (team == root->r.r_hot_team));
2164#endif
2165 }
2166
2167 KA_TRACE(
2168 20,
2169 ("__kmp_fork_call: T#%d(%d:%d)->(%d:0) created a team of %d threads\n",
2170 gtid, parent_team->t.t_id, team->t.t_master_tid, team->t.t_id,
2171 team->t.t_nproc));
2172 KMP_DEBUG_ASSERT(team != root->r.r_hot_team ||
2173 (team->t.t_master_tid == 0 &&
2174 (team->t.t_parent == root->r.r_root_team ||
2175 team->t.t_parent->t.t_serialized)));
2176 KMP_MB();
2177
2178 /* now, setup the arguments */
2179 argv = (void **)team->t.t_argv;
2180 if (ap) {
2181 for (i = argc - 1; i >= 0; --i) {
2182 void *new_argv = va_arg(kmp_va_deref(ap), void *);
2183 KMP_CHECK_UPDATE(*argv, new_argv);
2184 argv++;
2185 }
2186 } else {
2187 for (i = 0; i < argc; ++i) {
2188 // Get args from parent team for teams construct
2189 KMP_CHECK_UPDATE(argv[i], team->t.t_parent->t.t_argv[i]);
2190 }
2191 }
2192
2193 /* now actually fork the threads */
2194 KMP_CHECK_UPDATE(team->t.t_master_active, master_active);
2195 if (!root->r.r_active) // Only do assignment if it prevents cache ping-pong
2196 root->r.r_active = TRUE;
2197
2198 __kmp_fork_team_threads(root, team, master_th, gtid, !ap);
2199 __kmp_setup_icv_copy(team, nthreads,
2200 &master_th->th.th_current_task->td_icvs, loc);
2201
2202#if OMPT_SUPPORT
2203 master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
2204#endif
2205
2206 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2207
2208#if USE_ITT_BUILD
2209 if (team->t.t_active_level == 1 // only report frames at level 1
2210 && !master_th->th.th_teams_microtask) { // not in teams construct
2211#if USE_ITT_NOTIFY
2212 if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2213 (__kmp_forkjoin_frames_mode == 3 ||
2214 __kmp_forkjoin_frames_mode == 1)) {
2215 kmp_uint64 tmp_time = 0;
2216 if (__itt_get_timestamp_ptr)
2217 tmp_time = __itt_get_timestamp();
2218 // Internal fork - report frame begin
2219 master_th->th.th_frame_time = tmp_time;
2220 if (__kmp_forkjoin_frames_mode == 3)
2221 team->t.t_region_time = tmp_time;
2222 } else
2223// only one notification scheme (either "submit" or "forking/joined", not both)
2224#endif /* USE_ITT_NOTIFY */
2225 if ((__itt_frame_begin_v3_ptr || KMP_ITT_DEBUG) &&
2226 __kmp_forkjoin_frames && !__kmp_forkjoin_frames_mode) {
2227 // Mark start of "parallel" region for Intel(R) VTune(TM) analyzer.
2228 __kmp_itt_region_forking(gtid, team->t.t_nproc, 0);
2229 }
2230 }
2231#endif /* USE_ITT_BUILD */
2232
2233 /* now go on and do the work */
2234 KMP_DEBUG_ASSERT(team == __kmp_threads[gtid]->th.th_team);
2235 KMP_MB();
2236 KF_TRACE(10,
2237 ("__kmp_internal_fork : root=%p, team=%p, master_th=%p, gtid=%d\n",
2238 root, team, master_th, gtid));
2239
2240#if USE_ITT_BUILD
2241 if (__itt_stack_caller_create_ptr) {
2242 // create new stack stitching id before entering fork barrier
2243 if (!enter_teams) {
2244 KMP_DEBUG_ASSERT(team->t.t_stack_id == NULL);
2245 team->t.t_stack_id = __kmp_itt_stack_caller_create();
2246 } else if (parent_team->t.t_serialized) {
2247 // keep stack stitching id in the serialized parent_team;
2248 // current team will be used for parallel inside the teams;
2249 // if parent_team is active, then it already keeps stack stitching id
2250 // for the league of teams
2251 KMP_DEBUG_ASSERT(parent_team->t.t_stack_id == NULL);
2252 parent_team->t.t_stack_id = __kmp_itt_stack_caller_create();
2253 }
2254 }
2255#endif /* USE_ITT_BUILD */
2256
2257 // AC: skip __kmp_internal_fork for the teams construct; let only the
2258 // primary threads execute
2259 if (ap) {
2260 __kmp_internal_fork(loc, gtid, team);
2261 KF_TRACE(10, ("__kmp_internal_fork : after : root=%p, team=%p, "
2262 "master_th=%p, gtid=%d\n",
2263 root, team, master_th, gtid));
2264 }
2265
2266 if (call_context == fork_context_gnu) {
2267 KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2268 return TRUE;
2269 }
2270
2271 /* Invoke microtask for PRIMARY thread */
2272 KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
2273 team->t.t_id, team->t.t_pkfn));
2274 } // END of timer KMP_fork_call block
2275
2276#if KMP_STATS_ENABLED
2277 // If beginning a teams construct, then change thread state
2278 stats_state_e previous_state = KMP_GET_THREAD_STATE();
2279 if (!ap) {
2280 KMP_SET_THREAD_STATE(stats_state_e::TEAMS_REGION);
2281 }
2282#endif
2283
2284 if (!team->t.t_invoke(gtid)) {
2285 KMP_ASSERT2(0, "cannot invoke microtask for PRIMARY thread");
2286 }
2287
2288#if KMP_STATS_ENABLED
2289 // If this was the beginning of a teams construct, then reset the thread state
2290 if (!ap) {
2291 KMP_SET_THREAD_STATE(previous_state);
2292 }
2293#endif
2294
2295 KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
2296 team->t.t_id, team->t.t_pkfn));
2297 KMP_MB(); /* Flush all pending memory write invalidates. */
2298
2299 KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2300#if OMPT_SUPPORT
2301 if (ompt_enabled.enabled) {
2302 master_th->th.ompt_thread_info.state = ompt_state_overhead;
2303 }
2304#endif
2305
2306 return TRUE;
2307}
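// Illustrative sketch (not part of kmp_runtime.cpp): how a user-level
// "#pragma omp parallel" typically reaches the fork path above. The compiler
// outlines the parallel body into a microtask and emits a call to the
// __kmpc_fork_call() entry point (kmp_csupport.cpp), roughly
// __kmpc_fork_call(&loc, /*argc=*/1, outlined_body, &x) for the example
// below; the exact lowering and argument passing are compiler details and
// are shown here only as an assumption.
//
//   #include <omp.h>
//   #include <stdio.h>
//
//   int main(void) {
//     int x = 42;
//   #pragma omp parallel firstprivate(x)
//     printf("T#%d sees x=%d\n", omp_get_thread_num(), x);
//     return 0;
//   }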
2308
2309#if OMPT_SUPPORT
2310static inline void __kmp_join_restore_state(kmp_info_t *thread,
2311 kmp_team_t *team) {
2312 // restore state outside the region
2313 thread->th.ompt_thread_info.state =
2314 ((team->t.t_serialized) ? ompt_state_work_serial
2315 : ompt_state_work_parallel);
2316}
2317
2318static inline void __kmp_join_ompt(int gtid, kmp_info_t *thread,
2319 kmp_team_t *team, ompt_data_t *parallel_data,
2320 int flags, void *codeptr) {
2321 ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2322 if (ompt_enabled.ompt_callback_parallel_end) {
2323 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
2324 parallel_data, &(task_info->task_data), flags, codeptr);
2325 }
2326
2327 task_info->frame.enter_frame = ompt_data_none;
2328 __kmp_join_restore_state(thread, team);
2329}
2330#endif
2331
2332void __kmp_join_call(ident_t *loc, int gtid
2333#if OMPT_SUPPORT
2334 ,
2335 enum fork_context_e fork_context
2336#endif
2337 ,
2338 int exit_teams) {
2339 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_join_call);
2340 kmp_team_t *team;
2341 kmp_team_t *parent_team;
2342 kmp_info_t *master_th;
2343 kmp_root_t *root;
2344 int master_active;
2345
2346 KA_TRACE(20, ("__kmp_join_call: enter T#%d\n", gtid));
2347
2348 /* setup current data */
2349 master_th = __kmp_threads[gtid];
2350 root = master_th->th.th_root;
2351 team = master_th->th.th_team;
2352 parent_team = team->t.t_parent;
2353
2354 master_th->th.th_ident = loc;
2355
2356#if OMPT_SUPPORT
2357 void *team_microtask = (void *)team->t.t_pkfn;
2358 // For the GOMP interface with a serialized parallel, we need
2359 // __kmpc_end_serialized_parallel to call the hooks for the OMPT
2360 // end-implicit-task and end-parallel events.
2361 if (ompt_enabled.enabled &&
2362 !(team->t.t_serialized && fork_context == fork_context_gnu)) {
2363 master_th->th.ompt_thread_info.state = ompt_state_overhead;
2364 }
2365#endif
2366
2367#if KMP_DEBUG
2368 if (__kmp_tasking_mode != tskm_immediate_exec && !exit_teams) {
2369 KA_TRACE(20, ("__kmp_join_call: T#%d, old team = %p old task_team = %p, "
2370 "th_task_team = %p\n",
2371 __kmp_gtid_from_thread(master_th), team,
2372 team->t.t_task_team[master_th->th.th_task_state],
2373 master_th->th.th_task_team));
2374 KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
2375 team->t.t_task_team[master_th->th.th_task_state]);
2376 }
2377#endif
2378
2379 if (team->t.t_serialized) {
2380 if (master_th->th.th_teams_microtask) {
2381 // We are in teams construct
2382 int level = team->t.t_level;
2383 int tlevel = master_th->th.th_teams_level;
2384 if (level == tlevel) {
2385 // AC: we haven't incremented it earlier at start of teams construct,
2386 // so do it here - at the end of teams construct
2387 team->t.t_level++;
2388 } else if (level == tlevel + 1) {
2389 // AC: we are exiting parallel inside teams, need to increment
2390 // serialization in order to restore it in the next call to
2391 // __kmpc_end_serialized_parallel
2392 team->t.t_serialized++;
2393 }
2394 }
2395 __kmpc_end_serialized_parallel(loc, gtid);
2396
2397#if OMPT_SUPPORT
2398 if (ompt_enabled.enabled) {
2399 __kmp_join_restore_state(master_th, parent_team);
2400 }
2401#endif
2402
2403 return;
2404 }
2405
2406 master_active = team->t.t_master_active;
2407
2408 if (!exit_teams) {
2409 // AC: No barrier for internal teams at exit from teams construct.
2410 // But there is barrier for external team (league).
2411 __kmp_internal_join(loc, gtid, team);
2412#if USE_ITT_BUILD
2413 if (__itt_stack_caller_create_ptr) {
2414 KMP_DEBUG_ASSERT(team->t.t_stack_id != NULL);
2415 // destroy the stack stitching id after join barrier
2416 __kmp_itt_stack_caller_destroy((__itt_caller)team->t.t_stack_id);
2417 team->t.t_stack_id = NULL;
2418 }
2419#endif
2420 } else {
2421 master_th->th.th_task_state =
2422 0; // AC: no tasking in teams (out of any parallel)
2423#if USE_ITT_BUILD
2424 if (__itt_stack_caller_create_ptr && parent_team->t.t_serialized) {
2425 KMP_DEBUG_ASSERT(parent_team->t.t_stack_id != NULL);
2426 // destroy the stack stitching id on exit from the teams construct
2427 // if parent_team is active, then the id will be destroyed later on
2428 // by master of the league of teams
2429 __kmp_itt_stack_caller_destroy((__itt_caller)parent_team->t.t_stack_id);
2430 parent_team->t.t_stack_id = NULL;
2431 }
2432#endif
2433
2434 if (team->t.t_nproc > 1 &&
2435 __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
2436 team->t.b->update_num_threads(team->t.t_nproc);
2437 __kmp_add_threads_to_team(team, team->t.t_nproc);
2438 }
2439 }
2440
2441 KMP_MB();
2442
2443#if OMPT_SUPPORT
2444 ompt_data_t *parallel_data = &(team->t.ompt_team_info.parallel_data);
2445 void *codeptr = team->t.ompt_team_info.master_return_address;
2446#endif
2447
2448#if USE_ITT_BUILD
2449 // Mark end of "parallel" region for Intel(R) VTune(TM) analyzer.
2450 if (team->t.t_active_level == 1 &&
2451 (!master_th->th.th_teams_microtask || /* not in teams construct */
2452 master_th->th.th_teams_size.nteams == 1)) {
2453 master_th->th.th_ident = loc;
2454 // only one notification scheme (either "submit" or "forking/joined", not
2455 // both)
2456 if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2457 __kmp_forkjoin_frames_mode == 3)
2458 __kmp_itt_frame_submit(gtid, team->t.t_region_time,
2459 master_th->th.th_frame_time, 0, loc,
2460 master_th->th.th_team_nproc, 1);
2461 else if ((__itt_frame_end_v3_ptr || KMP_ITT_DEBUG) &&
2462 !__kmp_forkjoin_frames_mode && __kmp_forkjoin_frames)
2463 __kmp_itt_region_joined(gtid);
2464 } // active_level == 1
2465#endif /* USE_ITT_BUILD */
2466
2467#if KMP_AFFINITY_SUPPORTED
2468 if (!exit_teams) {
2469 // Restore master thread's partition.
2470 master_th->th.th_first_place = team->t.t_first_place;
2471 master_th->th.th_last_place = team->t.t_last_place;
2472 }
2473#endif // KMP_AFFINITY_SUPPORTED
2474
2475 if (master_th->th.th_teams_microtask && !exit_teams &&
2476 team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
2477 team->t.t_level == master_th->th.th_teams_level + 1) {
2478// AC: We need to leave the team structure intact at the end of a parallel
2479// inside the teams construct, so that the same (hot) team works at the next
2480// parallel; only adjust the nesting levels
2481#if OMPT_SUPPORT
2482 ompt_data_t ompt_parallel_data = ompt_data_none;
2483 if (ompt_enabled.enabled) {
2484 ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2485 if (ompt_enabled.ompt_callback_implicit_task) {
2486 int ompt_team_size = team->t.t_nproc;
2487 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2488 ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size,
2489 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
2490 }
2491 task_info->frame.exit_frame = ompt_data_none;
2492 task_info->task_data = ompt_data_none;
2493 ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
2494 __ompt_lw_taskteam_unlink(master_th);
2495 }
2496#endif
2497 /* Decrement our nested depth level */
2498 team->t.t_level--;
2499 team->t.t_active_level--;
2500 KMP_ATOMIC_DEC(&root->r.r_in_parallel);
2501
2502 // Restore number of threads in the team if needed. This code relies on
2503 // the proper adjustment of th_teams_size.nth after the fork in
2504 // __kmp_teams_master on each teams primary thread in the case that
2505 // __kmp_reserve_threads reduced it.
2506 if (master_th->th.th_team_nproc < master_th->th.th_teams_size.nth) {
2507 int old_num = master_th->th.th_team_nproc;
2508 int new_num = master_th->th.th_teams_size.nth;
2509 kmp_info_t **other_threads = team->t.t_threads;
2510 team->t.t_nproc = new_num;
2511 for (int i = 0; i < old_num; ++i) {
2512 other_threads[i]->th.th_team_nproc = new_num;
2513 }
2514 // Adjust the state of the unused threads of the team
2515 for (int i = old_num; i < new_num; ++i) {
2516 // Re-initialize thread's barrier data.
2517 KMP_DEBUG_ASSERT(other_threads[i]);
2518 kmp_balign_t *balign = other_threads[i]->th.th_bar;
2519 for (int b = 0; b < bs_last_barrier; ++b) {
2520 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
2521 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
2522#if USE_DEBUGGER
2523 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
2524#endif
2525 }
2526 if (__kmp_tasking_mode != tskm_immediate_exec) {
2527 // Synchronize thread's task state
2528 other_threads[i]->th.th_task_state = master_th->th.th_task_state;
2529 }
2530 }
2531 }
2532
2533#if OMPT_SUPPORT
2534 if (ompt_enabled.enabled) {
2535 __kmp_join_ompt(gtid, master_th, parent_team, &ompt_parallel_data,
2536 OMPT_INVOKER(fork_context) | ompt_parallel_team, codeptr);
2537 }
2538#endif
2539
2540 return;
2541 }
2542
2543 /* do cleanup and restore the parent team */
2544 master_th->th.th_info.ds.ds_tid = team->t.t_master_tid;
2545 master_th->th.th_local.this_construct = team->t.t_master_this_cons;
2546
2547 master_th->th.th_dispatch = &parent_team->t.t_dispatch[team->t.t_master_tid];
2548
2549 /* jc: The following lock has instructions with REL and ACQ semantics,
2550 separating the parallel user code called in this parallel region
2551 from the serial user code called after this function returns. */
2552 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2553
2554 if (!master_th->th.th_teams_microtask ||
2555 team->t.t_level > master_th->th.th_teams_level) {
2556 /* Decrement our nested depth level */
2557 KMP_ATOMIC_DEC(&root->r.r_in_parallel);
2558 }
2559 KMP_DEBUG_ASSERT(root->r.r_in_parallel >= 0);
2560
2561#if OMPT_SUPPORT
2562 if (ompt_enabled.enabled) {
2563 ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2564 if (ompt_enabled.ompt_callback_implicit_task) {
2565 int flags = (team_microtask == (void *)__kmp_teams_master)
2566 ? ompt_task_initial
2567 : ompt_task_implicit;
2568 int ompt_team_size = (flags == ompt_task_initial) ? 0 : team->t.t_nproc;
2569 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2570 ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size,
2571 OMPT_CUR_TASK_INFO(master_th)->thread_num, flags);
2572 }
2573 task_info->frame.exit_frame = ompt_data_none;
2574 task_info->task_data = ompt_data_none;
2575 }
2576#endif
2577
2578 KF_TRACE(10, ("__kmp_join_call1: T#%d, this_thread=%p team=%p\n", 0,
2579 master_th, team));
2580 __kmp_pop_current_task_from_thread(master_th);
2581
2582 master_th->th.th_def_allocator = team->t.t_def_allocator;
2583
2584#if OMPD_SUPPORT
2585 if (ompd_state & OMPD_ENABLE_BP)
2586 ompd_bp_parallel_end();
2587#endif
2588 updateHWFPControl(team);
2589
2590 if (root->r.r_active != master_active)
2591 root->r.r_active = master_active;
2592
2593 __kmp_free_team(root, team USE_NESTED_HOT_ARG(
2594 master_th)); // this will free worker threads
2595
2596 /* this race was fun to find. make sure the following is in the critical
2597 region otherwise assertions may fail occasionally since the old team may be
2598 reallocated and the hierarchy appears inconsistent. it is actually safe to
2599 run and won't cause any bugs, but will cause those assertion failures. it's
2600 only one deref&assign so might as well put this in the critical region */
2601 master_th->th.th_team = parent_team;
2602 master_th->th.th_team_nproc = parent_team->t.t_nproc;
2603 master_th->th.th_team_master = parent_team->t.t_threads[0];
2604 master_th->th.th_team_serialized = parent_team->t.t_serialized;
2605
2606 /* restore serialized team, if need be */
2607 if (parent_team->t.t_serialized &&
2608 parent_team != master_th->th.th_serial_team &&
2609 parent_team != root->r.r_root_team) {
2610 __kmp_free_team(root,
2611 master_th->th.th_serial_team USE_NESTED_HOT_ARG(NULL));
2612 master_th->th.th_serial_team = parent_team;
2613 }
2614
2615 if (__kmp_tasking_mode != tskm_immediate_exec) {
2616 if (master_th->th.th_task_state_top >
2617 0) { // Restore task state from memo stack
2618 KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2619 // Remember primary thread's state if we re-use this nested hot team
2620 master_th->th.th_task_state_memo_stack[master_th->th.th_task_state_top] =
2621 master_th->th.th_task_state;
2622 --master_th->th.th_task_state_top; // pop
2623 // Now restore state at this level
2624 master_th->th.th_task_state =
2625 master_th->th
2626 .th_task_state_memo_stack[master_th->th.th_task_state_top];
2627 }
2628 // Copy the task team from the parent team to the primary thread
2629 master_th->th.th_task_team =
2630 parent_team->t.t_task_team[master_th->th.th_task_state];
2631 KA_TRACE(20,
2632 ("__kmp_join_call: Primary T#%d restoring task_team %p, team %p\n",
2633 __kmp_gtid_from_thread(master_th), master_th->th.th_task_team,
2634 parent_team));
2635 }
2636
2637 // TODO: GEH - cannot do this assertion because root thread not set up as
2638 // executing
2639 // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 0 );
2640 master_th->th.th_current_task->td_flags.executing = 1;
2641
2642 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2643
2644#if OMPT_SUPPORT
2645 int flags =
2646 OMPT_INVOKER(fork_context) |
2647 ((team_microtask == (void *)__kmp_teams_master) ? ompt_parallel_league
2648 : ompt_parallel_team);
2649 if (ompt_enabled.enabled) {
2650 __kmp_join_ompt(gtid, master_th, parent_team, parallel_data, flags,
2651 codeptr);
2652 }
2653#endif
2654
2655 KMP_MB();
2656 KA_TRACE(20, ("__kmp_join_call: exit T#%d\n", gtid));
2657}
2658
2659/* Check whether we should push an internal control record onto the
2660 serial team stack. If so, do it. */
2661void __kmp_save_internal_controls(kmp_info_t *thread) {
2662
2663 if (thread->th.th_team != thread->th.th_serial_team) {
2664 return;
2665 }
2666 if (thread->th.th_team->t.t_serialized > 1) {
2667 int push = 0;
2668
2669 if (thread->th.th_team->t.t_control_stack_top == NULL) {
2670 push = 1;
2671 } else {
2672 if (thread->th.th_team->t.t_control_stack_top->serial_nesting_level !=
2673 thread->th.th_team->t.t_serialized) {
2674 push = 1;
2675 }
2676 }
2677 if (push) { /* push a record on the serial team's stack */
2678 kmp_internal_control_t *control =
2679 (kmp_internal_control_t *)__kmp_allocate(
2680 sizeof(kmp_internal_control_t));
2681
2682 copy_icvs(control, &thread->th.th_current_task->td_icvs);
2683
2684 control->serial_nesting_level = thread->th.th_team->t.t_serialized;
2685
2686 control->next = thread->th.th_team->t.t_control_stack_top;
2687 thread->th.th_team->t.t_control_stack_top = control;
2688 }
2689 }
2690}
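// Usage sketch (illustrative, not part of this file): the control-stack push
// above matters when ICVs are modified inside nested *serialized* parallel
// regions, so each serialized level can restore the caller's values on exit
// (the records are popped in __kmpc_end_serialized_parallel). Assuming the
// standard OpenMP API; exactly which nesting depth triggers the push depends
// on t_serialized as checked above.
//
//   #include <omp.h>
//   #include <stdio.h>
//
//   int main(void) {
//     omp_set_max_active_levels(1);    // nested parallels become serialized
//   #pragma omp parallel num_threads(2)
//   #pragma omp parallel               // serialized
//   #pragma omp parallel               // serialized, deeper nesting
//     {
//       omp_set_num_threads(8);        // may push a control record (see above)
//     }
//     printf("done\n");
//     return 0;
//   }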
2691
2692/* Changes set_nproc */
2693void __kmp_set_num_threads(int new_nth, int gtid) {
2694 kmp_info_t *thread;
2695 kmp_root_t *root;
2696
2697 KF_TRACE(10, ("__kmp_set_num_threads: new __kmp_nth = %d\n", new_nth));
2698 KMP_DEBUG_ASSERT(__kmp_init_serial);
2699
2700 if (new_nth < 1)
2701 new_nth = 1;
2702 else if (new_nth > __kmp_max_nth)
2703 new_nth = __kmp_max_nth;
2704
2705 KMP_COUNT_VALUE(OMP_set_numthreads, new_nth);
2706 thread = __kmp_threads[gtid];
2707 if (thread->th.th_current_task->td_icvs.nproc == new_nth)
2708 return; // nothing to do
2709
2710 __kmp_save_internal_controls(thread);
2711
2712 set__nproc(thread, new_nth);
2713
2714 // If this omp_set_num_threads() call will cause the hot team size to be
2715 // reduced (in the absence of a num_threads clause), then reduce it now,
2716 // rather than waiting for the next parallel region.
2717 root = thread->th.th_root;
2718 if (__kmp_init_parallel && (!root->r.r_active) &&
2719 (root->r.r_hot_team->t.t_nproc > new_nth)
2720#if KMP_NESTED_HOT_TEAMS
2721 && __kmp_hot_teams_max_level && !__kmp_hot_teams_mode
2722#endif
2723 ) {
2724 kmp_team_t *hot_team = root->r.r_hot_team;
2725 int f;
2726
2727 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2728
2729 if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
2730 __kmp_resize_dist_barrier(hot_team, hot_team->t.t_nproc, new_nth);
2731 }
2732 // Release the extra threads we don't need any more.
2733 for (f = new_nth; f < hot_team->t.t_nproc; f++) {
2734 KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2735 if (__kmp_tasking_mode != tskm_immediate_exec) {
2736 // When decreasing the team size, threads no longer in the team should
2737 // unreference the task team.
2738 hot_team->t.t_threads[f]->th.th_task_team = NULL;
2739 }
2740 __kmp_free_thread(hot_team->t.t_threads[f]);
2741 hot_team->t.t_threads[f] = NULL;
2742 }
2743 hot_team->t.t_nproc = new_nth;
2744#if KMP_NESTED_HOT_TEAMS
2745 if (thread->th.th_hot_teams) {
2746 KMP_DEBUG_ASSERT(hot_team == thread->th.th_hot_teams[0].hot_team);
2747 thread->th.th_hot_teams[0].hot_team_nth = new_nth;
2748 }
2749#endif
2750
2751 if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
2752 hot_team->t.b->update_num_threads(new_nth);
2753 __kmp_add_threads_to_team(hot_team, new_nth);
2754 }
2755
2756 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2757
2758 // Update the t_nproc field in the threads that are still active.
2759 for (f = 0; f < new_nth; f++) {
2760 KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2761 hot_team->t.t_threads[f]->th.th_team_nproc = new_nth;
2762 }
2763 // Special flag marking that the team size was changed by an omp_set_num_threads() call
2764 hot_team->t.t_size_changed = -1;
2765 }
2766}
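// Usage sketch (illustrative, not part of this file): __kmp_set_num_threads()
// backs the user-level omp_set_num_threads() entry point; note that the
// hot-team trimming above happens eagerly, before the next parallel region is
// forked.
//
//   #include <omp.h>
//   #include <stdio.h>
//
//   int main(void) {
//   #pragma omp parallel
//     { /* hot team created with the default size */ }
//     omp_set_num_threads(2);   // updates the nproc ICV; may shrink the hot team now
//   #pragma omp parallel
//     printf("T#%d of %d\n", omp_get_thread_num(), omp_get_num_threads());
//     return 0;
//   }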
2767
2768/* Changes max_active_levels */
2769void __kmp_set_max_active_levels(int gtid, int max_active_levels) {
2770 kmp_info_t *thread;
2771
2772 KF_TRACE(10, ("__kmp_set_max_active_levels: new max_active_levels for thread "
2773 "%d = (%d)\n",
2774 gtid, max_active_levels));
2775 KMP_DEBUG_ASSERT(__kmp_init_serial);
2776
2777 // validate max_active_levels
2778 if (max_active_levels < 0) {
2779 KMP_WARNING(ActiveLevelsNegative, max_active_levels);
2780 // We ignore this call if the user has specified a negative value.
2781 // The current setting won't be changed. The last valid setting will be
2782 // used. A warning will be issued (if warnings are allowed as controlled by
2783 // the KMP_WARNINGS env var).
2784 KF_TRACE(10, ("__kmp_set_max_active_levels: the call is ignored: new "
2785 "max_active_levels for thread %d = (%d)\n",
2786 gtid, max_active_levels));
2787 return;
2788 }
2789 if (max_active_levels <= KMP_MAX_ACTIVE_LEVELS_LIMIT) {
2790 // it's OK, the max_active_levels is within the valid range: [ 0;
2791 // KMP_MAX_ACTIVE_LEVELS_LIMIT ]
2792 // We allow a zero value. (implementation defined behavior)
2793 } else {
2794 KMP_WARNING(ActiveLevelsExceedLimit, max_active_levels,
2795 KMP_MAX_ACTIVE_LEVELS_LIMIT);
2796 max_active_levels = KMP_MAX_ACTIVE_LEVELS_LIMIT;
2797 // Current upper limit is MAX_INT. (implementation defined behavior)
2798 // If the input exceeds the upper limit, we correct the input to be the
2799 // upper limit. (implementation defined behavior)
2800 // In practice, this branch is never reached while the limit is MAX_INT.
2801 }
2802 KF_TRACE(10, ("__kmp_set_max_active_levels: after validation: new "
2803 "max_active_levels for thread %d = (%d)\n",
2804 gtid, max_active_levels));
2805
2806 thread = __kmp_threads[gtid];
2807
2808 __kmp_save_internal_controls(thread);
2809
2810 set__max_active_levels(thread, max_active_levels);
2811}
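// Usage sketch (illustrative): this routine backs omp_set_max_active_levels();
// negative values are ignored with a warning and values above
// KMP_MAX_ACTIVE_LEVELS_LIMIT are clamped, as handled above.
//
//   #include <omp.h>
//   int main(void) {
//     omp_set_max_active_levels(2);    // allow two nested levels of active parallelism
//     omp_set_max_active_levels(-1);   // ignored; the previous setting is kept
//     return omp_get_max_active_levels();   // 2
//   }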
2812
2813/* Gets max_active_levels */
2814int __kmp_get_max_active_levels(int gtid) {
2815 kmp_info_t *thread;
2816
2817 KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d\n", gtid));
2818 KMP_DEBUG_ASSERT(__kmp_init_serial);
2819
2820 thread = __kmp_threads[gtid];
2821 KMP_DEBUG_ASSERT(thread->th.th_current_task);
2822 KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d, curtask=%p, "
2823 "curtask_maxaclevel=%d\n",
2824 gtid, thread->th.th_current_task,
2825 thread->th.th_current_task->td_icvs.max_active_levels));
2826 return thread->th.th_current_task->td_icvs.max_active_levels;
2827}
2828
2829// nteams-var per-device ICV
2830void __kmp_set_num_teams(int num_teams) {
2831 if (num_teams > 0)
2832 __kmp_nteams = num_teams;
2833}
2834int __kmp_get_max_teams(void) { return __kmp_nteams; }
2835// teams-thread-limit-var per-device ICV
2836void __kmp_set_teams_thread_limit(int limit) {
2837 if (limit > 0)
2838 __kmp_teams_thread_limit = limit;
2839}
2840int __kmp_get_teams_thread_limit(void) { return __kmp_teams_thread_limit; }
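// Usage sketch (illustrative): these per-device ICVs back the OpenMP 5.1
// routines omp_set_num_teams()/omp_get_max_teams() and
// omp_set_teams_thread_limit()/omp_get_teams_thread_limit(), and are also
// settable via the OMP_NUM_TEAMS and OMP_TEAMS_THREAD_LIMIT environment
// variables.
//
//   #include <omp.h>
//   int main(void) {
//     omp_set_num_teams(4);            // nteams-var: bound on teams created by
//                                      // a teams construct without num_teams
//     omp_set_teams_thread_limit(8);   // teams-thread-limit-var
//     return omp_get_max_teams();      // 4
//   }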
2841
2842KMP_BUILD_ASSERT(sizeof(kmp_sched_t) == sizeof(int));
2843KMP_BUILD_ASSERT(sizeof(enum sched_type) == sizeof(int));
2844
2845/* Changes def_sched_var ICV values (run-time schedule kind and chunk) */
2846void __kmp_set_schedule(int gtid, kmp_sched_t kind, int chunk) {
2847 kmp_info_t *thread;
2848 kmp_sched_t orig_kind;
2849 // kmp_team_t *team;
2850
2851 KF_TRACE(10, ("__kmp_set_schedule: new schedule for thread %d = (%d, %d)\n",
2852 gtid, (int)kind, chunk));
2853 KMP_DEBUG_ASSERT(__kmp_init_serial);
2854
2855 // Check if the kind parameter is valid, correct if needed.
2856 // Valid parameters should fit in one of two intervals - standard or extended:
2857 // <lower>, <valid>, <upper_std>, <lower_ext>, <valid>, <upper>
2858 // 2008-01-25: 0, 1 - 4, 5, 100, 101 - 102, 103
2859 orig_kind = kind;
2860 kind = __kmp_sched_without_mods(kind);
2861
2862 if (kind <= kmp_sched_lower || kind >= kmp_sched_upper ||
2863 (kind <= kmp_sched_lower_ext && kind >= kmp_sched_upper_std)) {
2864 // TODO: Hint needs attention in case we change the default schedule.
2865 __kmp_msg(kmp_ms_warning, KMP_MSG(ScheduleKindOutOfRange, kind),
2866 KMP_HNT(DefaultScheduleKindUsed, "static, no chunk"),
2867 __kmp_msg_null);
2868 kind = kmp_sched_default;
2869 chunk = 0; // ignore chunk value in case of bad kind
2870 }
2871
2872 thread = __kmp_threads[gtid];
2873
2874 __kmp_save_internal_controls(thread);
2875
2876 if (kind < kmp_sched_upper_std) {
2877 if (kind == kmp_sched_static && chunk < KMP_DEFAULT_CHUNK) {
2878 // distinguish static chunked vs. unchunked: chunk should be invalid to
2879 // indicate an unchunked schedule (which is the default)
2880 thread->th.th_current_task->td_icvs.sched.r_sched_type = kmp_sch_static;
2881 } else {
2882 thread->th.th_current_task->td_icvs.sched.r_sched_type =
2883 __kmp_sch_map[kind - kmp_sched_lower - 1];
2884 }
2885 } else {
2886 // __kmp_sch_map[ kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2887 // kmp_sched_lower - 2 ];
2888 thread->th.th_current_task->td_icvs.sched.r_sched_type =
2889 __kmp_sch_map[kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2890 kmp_sched_lower - 2];
2891 }
2892 __kmp_sched_apply_mods_intkind(
2893 orig_kind, &(thread->th.th_current_task->td_icvs.sched.r_sched_type));
2894 if (kind == kmp_sched_auto || chunk < 1) {
2895 // ignore parameter chunk for schedule auto
2896 thread->th.th_current_task->td_icvs.sched.chunk = KMP_DEFAULT_CHUNK;
2897 } else {
2898 thread->th.th_current_task->td_icvs.sched.chunk = chunk;
2899 }
2900}
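// Usage sketch (illustrative): __kmp_set_schedule() implements the runtime
// side of omp_set_schedule(); the public omp_sched_t values map onto
// kmp_sched_t, with schedule modifier bits (e.g. monotonic) handled by
// __kmp_sched_apply_mods_intkind() above.
//
//   #include <omp.h>
//   int main(void) {
//     omp_set_schedule(omp_sched_dynamic, 4);   // run-sched-var = dynamic,4
//     omp_set_schedule(omp_sched_auto, 7);      // chunk ignored for auto (see above)
//     return 0;
//   }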
2901
2902/* Gets def_sched_var ICV values */
2903void __kmp_get_schedule(int gtid, kmp_sched_t *kind, int *chunk) {
2904 kmp_info_t *thread;
2905 enum sched_type th_type;
2906
2907 KF_TRACE(10, ("__kmp_get_schedule: thread %d\n", gtid));
2908 KMP_DEBUG_ASSERT(__kmp_init_serial);
2909
2910 thread = __kmp_threads[gtid];
2911
2912 th_type = thread->th.th_current_task->td_icvs.sched.r_sched_type;
2913 switch (SCHEDULE_WITHOUT_MODIFIERS(th_type)) {
2914 case kmp_sch_static:
2915 case kmp_sch_static_greedy:
2916 case kmp_sch_static_balanced:
2917 *kind = kmp_sched_static;
2918 __kmp_sched_apply_mods_stdkind(kind, th_type);
2919 *chunk = 0; // chunk was not set; indicate this fact with a zero value
2920 return;
2921 case kmp_sch_static_chunked:
2922 *kind = kmp_sched_static;
2923 break;
2924 case kmp_sch_dynamic_chunked:
2925 *kind = kmp_sched_dynamic;
2926 break;
2927 case kmp_sch_guided_chunked:
2928 case kmp_sch_guided_iterative_chunked:
2929 case kmp_sch_guided_analytical_chunked:
2930 *kind = kmp_sched_guided;
2931 break;
2932 case kmp_sch_auto:
2933 *kind = kmp_sched_auto;
2934 break;
2935 case kmp_sch_trapezoidal:
2936 *kind = kmp_sched_trapezoidal;
2937 break;
2938#if KMP_STATIC_STEAL_ENABLED
2939 case kmp_sch_static_steal:
2940 *kind = kmp_sched_static_steal;
2941 break;
2942#endif
2943 default:
2944 KMP_FATAL(UnknownSchedulingType, th_type);
2945 }
2946
2947 __kmp_sched_apply_mods_stdkind(kind, th_type);
2948 *chunk = thread->th.th_current_task->td_icvs.sched.chunk;
2949}
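// Usage sketch (illustrative): the reverse mapping above feeds
// omp_get_schedule(); note that an unchunked static schedule is reported with
// chunk == 0.
//
//   #include <omp.h>
//   #include <stdio.h>
//   int main(void) {
//     omp_sched_t kind;
//     int chunk;
//     omp_get_schedule(&kind, &chunk);
//     printf("kind=%d chunk=%d\n", (int)kind, chunk);
//     return 0;
//   }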
2950
2951int __kmp_get_ancestor_thread_num(int gtid, int level) {
2952
2953 int ii, dd;
2954 kmp_team_t *team;
2955 kmp_info_t *thr;
2956
2957 KF_TRACE(10, ("__kmp_get_ancestor_thread_num: thread %d %d\n", gtid, level));
2958 KMP_DEBUG_ASSERT(__kmp_init_serial);
2959
2960 // validate level
2961 if (level == 0)
2962 return 0;
2963 if (level < 0)
2964 return -1;
2965 thr = __kmp_threads[gtid];
2966 team = thr->th.th_team;
2967 ii = team->t.t_level;
2968 if (level > ii)
2969 return -1;
2970
2971 if (thr->th.th_teams_microtask) {
2972 // AC: we are in teams region where multiple nested teams have same level
2973 int tlevel = thr->th.th_teams_level; // the level of the teams construct
2974 if (level <=
2975 tlevel) { // otherwise usual algorithm works (will not touch the teams)
2976 KMP_DEBUG_ASSERT(ii >= tlevel);
2977 // AC: As we need to pass through the teams league, we need to
2978 // artificially increase ii
2979 if (ii == tlevel) {
2980 ii += 2; // three teams have same level
2981 } else {
2982 ii++; // two teams have same level
2983 }
2984 }
2985 }
2986
2987 if (ii == level)
2988 return __kmp_tid_from_gtid(gtid);
2989
2990 dd = team->t.t_serialized;
2991 level++;
2992 while (ii > level) {
2993 for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
2994 }
2995 if ((team->t.t_serialized) && (!dd)) {
2996 team = team->t.t_parent;
2997 continue;
2998 }
2999 if (ii > level) {
3000 team = team->t.t_parent;
3001 dd = team->t.t_serialized;
3002 ii--;
3003 }
3004 }
3005
3006 return (dd > 1) ? (0) : (team->t.t_master_tid);
3007}
3008
3009int __kmp_get_team_size(int gtid, int level) {
3010
3011 int ii, dd;
3012 kmp_team_t *team;
3013 kmp_info_t *thr;
3014
3015 KF_TRACE(10, ("__kmp_get_team_size: thread %d %d\n", gtid, level));
3016 KMP_DEBUG_ASSERT(__kmp_init_serial);
3017
3018 // validate level
3019 if (level == 0)
3020 return 1;
3021 if (level < 0)
3022 return -1;
3023 thr = __kmp_threads[gtid];
3024 team = thr->th.th_team;
3025 ii = team->t.t_level;
3026 if (level > ii)
3027 return -1;
3028
3029 if (thr->th.th_teams_microtask) {
3030 // AC: we are in teams region where multiple nested teams have same level
3031 int tlevel = thr->th.th_teams_level; // the level of the teams construct
3032 if (level <=
3033 tlevel) { // otherwise usual algorithm works (will not touch the teams)
3034 KMP_DEBUG_ASSERT(ii >= tlevel);
3035 // AC: As we need to pass through the teams league, we need to
3036 // artificially increase ii
3037 if (ii == tlevel) {
3038 ii += 2; // three teams have same level
3039 } else {
3040 ii++; // two teams have same level
3041 }
3042 }
3043 }
3044
3045 while (ii > level) {
3046 for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
3047 }
3048 if (team->t.t_serialized && (!dd)) {
3049 team = team->t.t_parent;
3050 continue;
3051 }
3052 if (ii > level) {
3053 team = team->t.t_parent;
3054 ii--;
3055 }
3056 }
3057
3058 return team->t.t_nproc;
3059}
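// Usage sketch (illustrative): __kmp_get_ancestor_thread_num() and
// __kmp_get_team_size() above back omp_get_ancestor_thread_num() and
// omp_get_team_size(); the extra `ii` adjustment accounts for the additional
// levels a teams league introduces.
//
//   #include <omp.h>
//   #include <stdio.h>
//   int main(void) {
//     omp_set_max_active_levels(2);
//   #pragma omp parallel num_threads(2)
//   #pragma omp parallel num_threads(3)
//     {
//       // In the inner region: ancestor at level 1 is the enclosing thread's
//       // id, and the level-1 team size is 2.
//       printf("outer tid=%d, outer team size=%d\n",
//              omp_get_ancestor_thread_num(1), omp_get_team_size(1));
//     }
//     return 0;
//   }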
3060
3061kmp_r_sched_t __kmp_get_schedule_global() {
3062 // This routine was created because the pairs (__kmp_sched, __kmp_chunk) and
3063 // (__kmp_static, __kmp_guided) may be changed by kmp_set_defaults
3064 // independently, so one can get the updated schedule here.
3065
3066 kmp_r_sched_t r_sched;
3067
3068 // create schedule from 4 globals: __kmp_sched, __kmp_chunk, __kmp_static,
3069 // __kmp_guided. __kmp_sched should keep original value, so that user can set
3070 // KMP_SCHEDULE multiple times, and thus have different run-time schedules in
3071 // different roots (even in OMP 2.5)
3072 enum sched_type s = SCHEDULE_WITHOUT_MODIFIERS(__kmp_sched);
3073 enum sched_type sched_modifiers = SCHEDULE_GET_MODIFIERS(__kmp_sched);
3074 if (s == kmp_sch_static) {
3075 // replace STATIC with more detailed schedule (balanced or greedy)
3076 r_sched.r_sched_type = __kmp_static;
3077 } else if (s == kmp_sch_guided_chunked) {
3078 // replace GUIDED with more detailed schedule (iterative or analytical)
3079 r_sched.r_sched_type = __kmp_guided;
3080 } else { // (STATIC_CHUNKED), or (DYNAMIC_CHUNKED), or other
3081 r_sched.r_sched_type = __kmp_sched;
3082 }
3083 SCHEDULE_SET_MODIFIERS(r_sched.r_sched_type, sched_modifiers);
3084
3085 if (__kmp_chunk < KMP_DEFAULT_CHUNK) {
3086 // __kmp_chunk may be wrong here (if it was not ever set)
3087 r_sched.chunk = KMP_DEFAULT_CHUNK;
3088 } else {
3089 r_sched.chunk = __kmp_chunk;
3090 }
3091
3092 return r_sched;
3093}
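// Illustrative note (an assumption about the env-var mapping; the parsing
// lives in kmp_settings.cpp): OMP_SCHEDULE sets __kmp_sched/__kmp_chunk for
// the run-sched-var, while KMP_SCHEDULE can select the detailed variant used
// when the run-time schedule is static or guided, e.g.
//
//   OMP_SCHEDULE="dynamic,4"         // r_sched_type = dynamic, chunk = 4
//   KMP_SCHEDULE="static,balanced"   // __kmp_static = kmp_sch_static_balanced
//
// which is why a plain "static" run-time schedule is replaced here by the
// more detailed balanced/greedy variant.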
3094
3095/* Allocate (realloc == FALSE) or reallocate (realloc == TRUE)
3096 at least argc *t_argv entries for the requested team. */
3097static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team, int realloc) {
3098
3099 KMP_DEBUG_ASSERT(team);
3100 if (!realloc || argc > team->t.t_max_argc) {
3101
3102 KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: needed entries=%d, "
3103 "current entries=%d\n",
3104 team->t.t_id, argc, (realloc) ? team->t.t_max_argc : 0));
3105 /* if heap space was previously allocated for args, free it */
3106 if (realloc && team->t.t_argv != &team->t.t_inline_argv[0])
3107 __kmp_free((void *)team->t.t_argv);
3108
3109 if (argc <= KMP_INLINE_ARGV_ENTRIES) {
3110 /* use unused space in the cache line for arguments */
3111 team->t.t_max_argc = KMP_INLINE_ARGV_ENTRIES;
3112 KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: inline allocate %d "
3113 "argv entries\n",
3114 team->t.t_id, team->t.t_max_argc));
3115 team->t.t_argv = &team->t.t_inline_argv[0];
3116 if (__kmp_storage_map) {
3117 __kmp_print_storage_map_gtid(
3118 -1, &team->t.t_inline_argv[0],
3119 &team->t.t_inline_argv[KMP_INLINE_ARGV_ENTRIES],
3120 (sizeof(void *) * KMP_INLINE_ARGV_ENTRIES), "team_%d.t_inline_argv",
3121 team->t.t_id);
3122 }
3123 } else {
3124 /* allocate space for arguments in the heap */
3125 team->t.t_max_argc = (argc <= (KMP_MIN_MALLOC_ARGV_ENTRIES >> 1))
3126 ? KMP_MIN_MALLOC_ARGV_ENTRIES
3127 : 2 * argc;
3128 KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: dynamic allocate %d "
3129 "argv entries\n",
3130 team->t.t_id, team->t.t_max_argc));
3131 team->t.t_argv =
3132 (void **)__kmp_page_allocate(sizeof(void *) * team->t.t_max_argc);
3133 if (__kmp_storage_map) {
3134 __kmp_print_storage_map_gtid(-1, &team->t.t_argv[0],
3135 &team->t.t_argv[team->t.t_max_argc],
3136 sizeof(void *) * team->t.t_max_argc,
3137 "team_%d.t_argv", team->t.t_id);
3138 }
3139 }
3140 }
3141}
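// Editor's note: a minimal sketch of the sizing policy above, with made-up
// constants (SKETCH_INLINE_ARGV, SKETCH_MIN_MALLOC_ARGV) standing in for
// KMP_INLINE_ARGV_ENTRIES and KMP_MIN_MALLOC_ARGV_ENTRIES. Small argument
// lists reuse the inline space in the team structure; larger ones get a heap
// block that is at least the minimum size and otherwise twice the request, so
// the array does not have to be reallocated on every small increase of argc.
enum { SKETCH_INLINE_ARGV = 4, SKETCH_MIN_MALLOC_ARGV = 16 };
static int sketch_argv_capacity(int argc) {
  if (argc <= SKETCH_INLINE_ARGV)
    return SKETCH_INLINE_ARGV; // inline storage, no heap allocation
  if (argc <= (SKETCH_MIN_MALLOC_ARGV >> 1))
    return SKETCH_MIN_MALLOC_ARGV; // small request: use the fixed floor
  return 2 * argc; // otherwise double the request to amortize future growth
}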
3142
3143static void __kmp_allocate_team_arrays(kmp_team_t *team, int max_nth) {
3144 int i;
3145 int num_disp_buff = max_nth > 1 ? __kmp_dispatch_num_buffers : 2;
3146 team->t.t_threads =
3147 (kmp_info_t **)__kmp_allocate(sizeof(kmp_info_t *) * max_nth);
3148 team->t.t_disp_buffer = (dispatch_shared_info_t *)__kmp_allocate(
3149 sizeof(dispatch_shared_info_t) * num_disp_buff);
3150 team->t.t_dispatch =
3151 (kmp_disp_t *)__kmp_allocate(sizeof(kmp_disp_t) * max_nth);
3152 team->t.t_implicit_task_taskdata =
3153 (kmp_taskdata_t *)__kmp_allocate(sizeof(kmp_taskdata_t) * max_nth);
3154 team->t.t_max_nproc = max_nth;
3155
3156 /* setup dispatch buffers */
3157 for (i = 0; i < num_disp_buff; ++i) {
3158 team->t.t_disp_buffer[i].buffer_index = i;
3159 team->t.t_disp_buffer[i].doacross_buf_idx = i;
3160 }
3161}
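// Editor's note: the shared dispatch buffers initialized above are consumed
// round-robin by successive worksharing constructs (the dispatch code selects
// a buffer by taking a running index modulo the buffer count). A minimal
// sketch of that selection, using a hypothetical per-thread counter:
static int sketch_disp_buffer_index(unsigned constructs_seen,
                                    int num_disp_buff) {
  return (int)(constructs_seen % (unsigned)num_disp_buff); // wraps the ring
}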
3162
3163static void __kmp_free_team_arrays(kmp_team_t *team) {
3164 /* Note: this does not free the threads in t_threads (__kmp_free_threads) */
3165 int i;
3166 for (i = 0; i < team->t.t_max_nproc; ++i) {
3167 if (team->t.t_dispatch[i].th_disp_buffer != NULL) {
3168 __kmp_free(team->t.t_dispatch[i].th_disp_buffer);
3169 team->t.t_dispatch[i].th_disp_buffer = NULL;
3170 }
3171 }
3172#if KMP_USE_HIER_SCHED
3173 __kmp_dispatch_free_hierarchies(team);
3174#endif
3175 __kmp_free(team->t.t_threads);
3176 __kmp_free(team->t.t_disp_buffer);
3177 __kmp_free(team->t.t_dispatch);
3178 __kmp_free(team->t.t_implicit_task_taskdata);
3179 team->t.t_threads = NULL;
3180 team->t.t_disp_buffer = NULL;
3181 team->t.t_dispatch = NULL;
3182 team->t.t_implicit_task_taskdata = 0;
3183}
3184
3185static void __kmp_reallocate_team_arrays(kmp_team_t *team, int max_nth) {
3186 kmp_info_t **oldThreads = team->t.t_threads;
3187
3188 __kmp_free(team->t.t_disp_buffer);
3189 __kmp_free(team->t.t_dispatch);
3190 __kmp_free(team->t.t_implicit_task_taskdata);
3191 __kmp_allocate_team_arrays(team, max_nth);
3192
3193 KMP_MEMCPY(team->t.t_threads, oldThreads,
3194 team->t.t_nproc * sizeof(kmp_info_t *));
3195
3196 __kmp_free(oldThreads);
3197}
3198
3199static kmp_internal_control_t __kmp_get_global_icvs(void) {
3200
3201 kmp_r_sched_t r_sched =
3202 __kmp_get_schedule_global(); // get current state of scheduling globals
3203
3204 KMP_DEBUG_ASSERT(__kmp_nested_proc_bind.used > 0);
3205
3206 kmp_internal_control_t g_icvs = {
3207 0, // int serial_nesting_level; //corresponds to value of th_team_serialized
3208 (kmp_int8)__kmp_global.g.g_dynamic, // internal control for dynamic
3209 // adjustment of threads (per thread)
3210 (kmp_int8)__kmp_env_blocktime, // int bt_set; //internal control for
3211 // whether blocktime is explicitly set
3212 __kmp_dflt_blocktime, // int blocktime; //internal control for blocktime
3213#if KMP_USE_MONITOR
3214 __kmp_bt_intervals, // int bt_intervals; //internal control for blocktime
3215// intervals
3216#endif
3217 __kmp_dflt_team_nth, // int nproc; //internal control for # of threads for
3218 // next parallel region (per thread)
3219 // (use a max ub on value if __kmp_parallel_initialize not called yet)
3220 __kmp_cg_max_nth, // int thread_limit;
3221 __kmp_dflt_max_active_levels, // int max_active_levels; //internal control
3222 // for max_active_levels
3223 r_sched, // kmp_r_sched_t sched; //internal control for runtime schedule
3224 // {sched,chunk} pair
3225 __kmp_nested_proc_bind.bind_types[0],
3226 __kmp_default_device,
3227 NULL // struct kmp_internal_control *next;
3228 };
3229
3230 return g_icvs;
3231}
3232
3233static kmp_internal_control_t __kmp_get_x_global_icvs(const kmp_team_t *team) {
3234
3235 kmp_internal_control_t gx_icvs;
3236 gx_icvs.serial_nesting_level =
3237 0; // probably =team->t.t_serial like in save_inter_controls
3238 copy_icvs(&gx_icvs, &team->t.t_threads[0]->th.th_current_task->td_icvs);
3239 gx_icvs.next = NULL;
3240
3241 return gx_icvs;
3242}
3243
3244static void __kmp_initialize_root(kmp_root_t *root) {
3245 int f;
3246 kmp_team_t *root_team;
3247 kmp_team_t *hot_team;
3248 int hot_team_max_nth;
3249 kmp_r_sched_t r_sched =
3250 __kmp_get_schedule_global(); // get current state of scheduling globals
3251 kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3252 KMP_DEBUG_ASSERT(root);
3253 KMP_ASSERT(!root->r.r_begin);
3254
3255 /* setup the root state structure */
3256 __kmp_init_lock(&root->r.r_begin_lock);
3257 root->r.r_begin = FALSE;
3258 root->r.r_active = FALSE;
3259 root->r.r_in_parallel = 0;
3260 root->r.r_blocktime = __kmp_dflt_blocktime;
3261#if KMP_AFFINITY_SUPPORTED
3262 root->r.r_affinity_assigned = FALSE;
3263#endif
3264
3265 /* setup the root team for this task */
3266 /* allocate the root team structure */
3267 KF_TRACE(10, ("__kmp_initialize_root: before root_team\n"));
3268
3269 root_team =
3270 __kmp_allocate_team(root,
3271 1, // new_nproc
3272 1, // max_nproc
3273#if OMPT_SUPPORT
3274 ompt_data_none, // root parallel id
3275#endif
3276 __kmp_nested_proc_bind.bind_types[0], &r_icvs,
3277 0 // argc
3278 USE_NESTED_HOT_ARG(NULL) // primary thread is unknown
3279 );
3280#if USE_DEBUGGER
3281 // Non-NULL value should be assigned to make the debugger display the root
3282 // team.
3283 TCW_SYNC_PTR(root_team->t.t_pkfn, (microtask_t)(~0));
3284#endif
3285
3286 KF_TRACE(10, ("__kmp_initialize_root: after root_team = %p\n", root_team));
3287
3288 root->r.r_root_team = root_team;
3289 root_team->t.t_control_stack_top = NULL;
3290
3291 /* initialize root team */
3292 root_team->t.t_threads[0] = NULL;
3293 root_team->t.t_nproc = 1;
3294 root_team->t.t_serialized = 1;
3295 // TODO???: root_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3296 root_team->t.t_sched.sched = r_sched.sched;
3297 KA_TRACE(
3298 20,
3299 ("__kmp_initialize_root: init root team %d arrived: join=%u, plain=%u\n",
3300 root_team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
3301
3302 /* setup the hot team for this task */
3303 /* allocate the hot team structure */
3304 KF_TRACE(10, ("__kmp_initialize_root: before hot_team\n"));
3305
3306 hot_team =
3307 __kmp_allocate_team(root,
3308 1, // new_nproc
3309 __kmp_dflt_team_nth_ub * 2, // max_nproc
3310#if OMPT_SUPPORT
3311 ompt_data_none, // root parallel id
3312#endif
3313 __kmp_nested_proc_bind.bind_types[0], &r_icvs,
3314 0 // argc
3315 USE_NESTED_HOT_ARG(NULL) // primary thread is unknown
3316 );
3317 KF_TRACE(10, ("__kmp_initialize_root: after hot_team = %p\n", hot_team));
3318
3319 root->r.r_hot_team = hot_team;
3320 root_team->t.t_control_stack_top = NULL;
3321
3322 /* first-time initialization */
3323 hot_team->t.t_parent = root_team;
3324
3325 /* initialize hot team */
3326 hot_team_max_nth = hot_team->t.t_max_nproc;
3327 for (f = 0; f < hot_team_max_nth; ++f) {
3328 hot_team->t.t_threads[f] = NULL;
3329 }
3330 hot_team->t.t_nproc = 1;
3331 // TODO???: hot_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3332 hot_team->t.t_sched.sched = r_sched.sched;
3333 hot_team->t.t_size_changed = 0;
3334}
3335
3336#ifdef KMP_DEBUG
3337
3338typedef struct kmp_team_list_item {
3339 kmp_team_p const *entry;
3340 struct kmp_team_list_item *next;
3341} kmp_team_list_item_t;
3342typedef kmp_team_list_item_t *kmp_team_list_t;
3343
3344static void __kmp_print_structure_team_accum( // Add team to list of teams.
3345 kmp_team_list_t list, // List of teams.
3346 kmp_team_p const *team // Team to add.
3347) {
3348
3349 // List must terminate with item where both entry and next are NULL.
3350 // Team is added to the list only once.
3351 // List is sorted in ascending order by team id.
3352 // Team id is *not* a key.
3353
3354 kmp_team_list_t l;
3355
3356 KMP_DEBUG_ASSERT(list != NULL);
3357 if (team == NULL) {
3358 return;
3359 }
3360
3361 __kmp_print_structure_team_accum(list, team->t.t_parent);
3362 __kmp_print_structure_team_accum(list, team->t.t_next_pool);
3363
3364 // Search list for the team.
3365 l = list;
3366 while (l->next != NULL && l->entry != team) {
3367 l = l->next;
3368 }
3369 if (l->next != NULL) {
3370 return; // Team has been added before, exit.
3371 }
3372
3373 // Team is not found. Search list again for insertion point.
3374 l = list;
3375 while (l->next != NULL && l->entry->t.t_id <= team->t.t_id) {
3376 l = l->next;
3377 }
3378
3379 // Insert team.
3380 {
3381 kmp_team_list_item_t *item = (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(
3382 sizeof(kmp_team_list_item_t));
3383 *item = *l;
3384 l->entry = team;
3385 l->next = item;
3386 }
3387}
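// Editor's note: a standalone sketch of the insertion trick used above. The
// list ends in a sentinel item (entry == NULL, next == NULL), and to insert a
// new element *before* node `pos` with only a forward pointer, the contents of
// `pos` are copied into a freshly allocated item and `pos` itself is rewritten
// in place. Types and names are illustrative; malloc() stands in for
// KMP_INTERNAL_MALLOC and error handling is omitted.
#include <stdlib.h>
typedef struct sketch_item {
  const void *entry;
  struct sketch_item *next;
} sketch_item_t;
static void sketch_insert_before(sketch_item_t *pos, const void *entry) {
  sketch_item_t *copy = (sketch_item_t *)malloc(sizeof(*copy));
  *copy = *pos;       // old contents (possibly the sentinel) move one node down
  pos->entry = entry; // `pos` now holds the new element in its sorted position
  pos->next = copy;
}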
3388
3389static void __kmp_print_structure_team(char const *title, kmp_team_p const *team
3390
3391) {
3392 __kmp_printf("%s", title);
3393 if (team != NULL) {
3394 __kmp_printf("%2x %p\n", team->t.t_id, team);
3395 } else {
3396 __kmp_printf(" - (nil)\n");
3397 }
3398}
3399
3400static void __kmp_print_structure_thread(char const *title,
3401 kmp_info_p const *thread) {
3402 __kmp_printf("%s", title);
3403 if (thread != NULL) {
3404 __kmp_printf("%2d %p\n", thread->th.th_info.ds.ds_gtid, thread);
3405 } else {
3406 __kmp_printf(" - (nil)\n");
3407 }
3408}
3409
3410void __kmp_print_structure(void) {
3411
3412 kmp_team_list_t list;
3413
3414 // Initialize list of teams.
3415 list =
3416 (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(sizeof(kmp_team_list_item_t));
3417 list->entry = NULL;
3418 list->next = NULL;
3419
3420 __kmp_printf("\n------------------------------\nGlobal Thread "
3421 "Table\n------------------------------\n");
3422 {
3423 int gtid;
3424 for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3425 __kmp_printf("%2d", gtid);
3426 if (__kmp_threads != NULL) {
3427 __kmp_printf(" %p", __kmp_threads[gtid]);
3428 }
3429 if (__kmp_root != NULL) {
3430 __kmp_printf(" %p", __kmp_root[gtid]);
3431 }
3432 __kmp_printf("\n");
3433 }
3434 }
3435
3436 // Print out __kmp_threads array.
3437 __kmp_printf("\n------------------------------\nThreads\n--------------------"
3438 "----------\n");
3439 if (__kmp_threads != NULL) {
3440 int gtid;
3441 for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3442 kmp_info_t const *thread = __kmp_threads[gtid];
3443 if (thread != NULL) {
3444 __kmp_printf("GTID %2d %p:\n", gtid, thread);
3445 __kmp_printf(" Our Root: %p\n", thread->th.th_root);
3446 __kmp_print_structure_team(" Our Team: ", thread->th.th_team);
3447 __kmp_print_structure_team(" Serial Team: ",
3448 thread->th.th_serial_team);
3449 __kmp_printf(" Threads: %2d\n", thread->th.th_team_nproc);
3450 __kmp_print_structure_thread(" Primary: ",
3451 thread->th.th_team_master);
3452 __kmp_printf(" Serialized?: %2d\n", thread->th.th_team_serialized);
3453 __kmp_printf(" Set NProc: %2d\n", thread->th.th_set_nproc);
3454 __kmp_printf(" Set Proc Bind: %2d\n", thread->th.th_set_proc_bind);
3455 __kmp_print_structure_thread(" Next in pool: ",
3456 thread->th.th_next_pool);
3457 __kmp_printf("\n");
3458 __kmp_print_structure_team_accum(list, thread->th.th_team);
3459 __kmp_print_structure_team_accum(list, thread->th.th_serial_team);
3460 }
3461 }
3462 } else {
3463 __kmp_printf("Threads array is not allocated.\n");
3464 }
3465
3466 // Print out __kmp_root array.
3467 __kmp_printf("\n------------------------------\nUbers\n----------------------"
3468 "--------\n");
3469 if (__kmp_root != NULL) {
3470 int gtid;
3471 for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3472 kmp_root_t const *root = __kmp_root[gtid];
3473 if (root != NULL) {
3474 __kmp_printf("GTID %2d %p:\n", gtid, root);
3475 __kmp_print_structure_team(" Root Team: ", root->r.r_root_team);
3476 __kmp_print_structure_team(" Hot Team: ", root->r.r_hot_team);
3477 __kmp_print_structure_thread(" Uber Thread: ",
3478 root->r.r_uber_thread);
3479 __kmp_printf(" Active?: %2d\n", root->r.r_active);
3480 __kmp_printf(" In Parallel: %2d\n",
3481 KMP_ATOMIC_LD_RLX(&root->r.r_in_parallel));
3482 __kmp_printf("\n");
3483 __kmp_print_structure_team_accum(list, root->r.r_root_team);
3484 __kmp_print_structure_team_accum(list, root->r.r_hot_team);
3485 }
3486 }
3487 } else {
3488 __kmp_printf("Ubers array is not allocated.\n");
3489 }
3490
3491 __kmp_printf("\n------------------------------\nTeams\n----------------------"
3492 "--------\n");
3493 while (list->next != NULL) {
3494 kmp_team_p const *team = list->entry;
3495 int i;
3496 __kmp_printf("Team %2x %p:\n", team->t.t_id, team);
3497 __kmp_print_structure_team(" Parent Team: ", team->t.t_parent);
3498 __kmp_printf(" Primary TID: %2d\n", team->t.t_master_tid);
3499 __kmp_printf(" Max threads: %2d\n", team->t.t_max_nproc);
3500 __kmp_printf(" Levels of serial: %2d\n", team->t.t_serialized);
3501 __kmp_printf(" Number threads: %2d\n", team->t.t_nproc);
3502 for (i = 0; i < team->t.t_nproc; ++i) {
3503 __kmp_printf(" Thread %2d: ", i);
3504 __kmp_print_structure_thread("", team->t.t_threads[i]);
3505 }
3506 __kmp_print_structure_team(" Next in pool: ", team->t.t_next_pool);
3507 __kmp_printf("\n");
3508 list = list->next;
3509 }
3510
3511 // Print out __kmp_thread_pool and __kmp_team_pool.
3512 __kmp_printf("\n------------------------------\nPools\n----------------------"
3513 "--------\n");
3514 __kmp_print_structure_thread("Thread pool: ",
3515 CCAST(kmp_info_t *, __kmp_thread_pool));
3516 __kmp_print_structure_team("Team pool: ",
3517 CCAST(kmp_team_t *, __kmp_team_pool));
3518 __kmp_printf("\n");
3519
3520 // Free team list.
3521 while (list != NULL) {
3522 kmp_team_list_item_t *item = list;
3523 list = list->next;
3524 KMP_INTERNAL_FREE(item);
3525 }
3526}
3527
3528#endif
3529
3530//---------------------------------------------------------------------------
3531// Stuff for per-thread fast random number generator
3532// Table of primes
3533static const unsigned __kmp_primes[] = {
3534 0x9e3779b1, 0xffe6cc59, 0x2109f6dd, 0x43977ab5, 0xba5703f5, 0xb495a877,
3535 0xe1626741, 0x79695e6b, 0xbc98c09f, 0xd5bee2b3, 0x287488f9, 0x3af18231,
3536 0x9677cd4d, 0xbe3a6929, 0xadc6a877, 0xdcf0674b, 0xbe4d6fe9, 0x5f15e201,
3537 0x99afc3fd, 0xf3f16801, 0xe222cfff, 0x24ba5fdb, 0x0620452d, 0x79f149e3,
3538 0xc8b93f49, 0x972702cd, 0xb07dd827, 0x6c97d5ed, 0x085a3d61, 0x46eb5ea7,
3539 0x3d9910ed, 0x2e687b5b, 0x29609227, 0x6eb081f1, 0x0954c4e1, 0x9d114db9,
3540 0x542acfa9, 0xb3e6bd7b, 0x0742d917, 0xe9f3ffa7, 0x54581edb, 0xf2480f45,
3541 0x0bb9288f, 0xef1affc7, 0x85fa0ca7, 0x3ccc14db, 0xe6baf34b, 0x343377f7,
3542 0x5ca19031, 0xe6d9293b, 0xf0a9f391, 0x5d2e980b, 0xfc411073, 0xc3749363,
3543 0xb892d829, 0x3549366b, 0x629750ad, 0xb98294e5, 0x892d9483, 0xc235baf3,
3544 0x3d2402a3, 0x6bdef3c9, 0xbec333cd, 0x40c9520f};
3545
3546//---------------------------------------------------------------------------
3547// __kmp_get_random: Get a random number using a linear congruential method.
3548unsigned short __kmp_get_random(kmp_info_t *thread) {
3549 unsigned x = thread->th.th_x;
3550 unsigned short r = (unsigned short)(x >> 16);
3551
3552 thread->th.th_x = x * thread->th.th_a + 1;
3553
3554 KA_TRACE(30, ("__kmp_get_random: THREAD: %d, RETURN: %u\n",
3555 thread->th.th_info.ds.ds_tid, r));
3556
3557 return r;
3558}
3559//--------------------------------------------------------
3560// __kmp_init_random: Initialize a random number generator
3561void __kmp_init_random(kmp_info_t *thread) {
3562 unsigned seed = thread->th.th_info.ds.ds_tid;
3563
3564 thread->th.th_a =
3565 __kmp_primes[seed % (sizeof(__kmp_primes) / sizeof(__kmp_primes[0]))];
3566 thread->th.th_x = (seed + 1) * thread->th.th_a + 1;
3567 KA_TRACE(30,
3568 ("__kmp_init_random: THREAD: %u; A: %u\n", seed, thread->th.th_a));
3569}
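// Editor's note: a self-contained restatement of the generator above, for
// clarity only. Every thread gets its own multiplier `a` picked from the prime
// table by its tid, the state advances as x = x * a + 1 (modulo 2^32 through
// unsigned wrap-around), and the high 16 bits are returned because the low
// bits of a linear congruential generator are the least random.
typedef struct {
  unsigned x; // current state
  unsigned a; // per-thread multiplier, a prime from the table above
} sketch_rng_t;
static void sketch_rng_init(sketch_rng_t *rng, unsigned tid,
                            const unsigned *primes, unsigned nprimes) {
  rng->a = primes[tid % nprimes];
  rng->x = (tid + 1) * rng->a + 1;
}
static unsigned short sketch_rng_next(sketch_rng_t *rng) {
  unsigned short r = (unsigned short)(rng->x >> 16);
  rng->x = rng->x * rng->a + 1;
  return r;
}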
3570
3571#if KMP_OS_WINDOWS
3572/* reclaim array entries for root threads that are already dead, returns number
3573 * reclaimed */
3574static int __kmp_reclaim_dead_roots(void) {
3575 int i, r = 0;
3576
3577 for (i = 0; i < __kmp_threads_capacity; ++i) {
3578 if (KMP_UBER_GTID(i) &&
3579 !__kmp_still_running((kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[i])) &&
3580 !__kmp_root[i]
3581 ->r.r_active) { // AC: reclaim only roots died in non-active state
3582 r += __kmp_unregister_root_other_thread(i);
3583 }
3584 }
3585 return r;
3586}
3587#endif
3588
3589/* This function attempts to create free entries in __kmp_threads and
3590 __kmp_root, and returns the number of free entries generated.
3591
3592 For Windows* OS static library, the first mechanism used is to reclaim array
3593 entries for root threads that are already dead.
3594
3595 On all platforms, expansion is attempted on the arrays __kmp_threads and
3596 __kmp_root, with appropriate update to __kmp_threads_capacity. Array
3597 capacity is increased by doubling with clipping to __kmp_tp_capacity, if
3598 threadprivate cache array has been created. Synchronization with
3599 __kmpc_threadprivate_cached is done using __kmp_tp_cached_lock.
3600
3601 After any dead root reclamation, if the clipping value allows array expansion
3602 to result in the generation of a total of nNeed free slots, the function does
3603 that expansion. If not, nothing is done beyond the possible initial root
3604 thread reclamation.
3605
3606 If any argument is negative, the behavior is undefined. */
3607static int __kmp_expand_threads(int nNeed) {
3608 int added = 0;
3609 int minimumRequiredCapacity;
3610 int newCapacity;
3611 kmp_info_t **newThreads;
3612 kmp_root_t **newRoot;
3613
3614 // All calls to __kmp_expand_threads should be under __kmp_forkjoin_lock, so
3615 // resizing __kmp_threads does not need additional protection if foreign
3616 // threads are present
3617
3618#if KMP_OS_WINDOWS && !KMP_DYNAMIC_LIB
3619 /* only for Windows static library */
3620 /* reclaim array entries for root threads that are already dead */
3621 added = __kmp_reclaim_dead_roots();
3622
3623 if (nNeed) {
3624 nNeed -= added;
3625 if (nNeed < 0)
3626 nNeed = 0;
3627 }
3628#endif
3629 if (nNeed <= 0)
3630 return added;
3631
3632 // Note that __kmp_threads_capacity is not bounded by __kmp_max_nth. If
3633 // __kmp_max_nth is set to some value less than __kmp_sys_max_nth by the
3634 // user via KMP_DEVICE_THREAD_LIMIT, then __kmp_threads_capacity may become
3635 // > __kmp_max_nth in one of two ways:
3636 //
3637 // 1) The initialization thread (gtid = 0) exits. __kmp_threads[0]
3638 // may not be reused by another thread, so we may need to increase
3639 // __kmp_threads_capacity to __kmp_max_nth + 1.
3640 //
3641 // 2) New foreign root(s) are encountered. We always register new foreign
3642 // roots. This may cause a smaller # of threads to be allocated at
3643 // subsequent parallel regions, but the worker threads hang around (and
3644 // eventually go to sleep) and need slots in the __kmp_threads[] array.
3645 //
3646 // Anyway, that is the reason for moving the check to see if
3647 // __kmp_max_nth was exceeded into __kmp_reserve_threads()
3648 // instead of having it performed here. -BB
3649
3650 KMP_DEBUG_ASSERT(__kmp_sys_max_nth >= __kmp_threads_capacity);
3651
3652 /* compute expansion headroom to check if we can expand */
3653 if (__kmp_sys_max_nth - __kmp_threads_capacity < nNeed) {
3654 /* possible expansion too small -- give up */
3655 return added;
3656 }
3657 minimumRequiredCapacity = __kmp_threads_capacity + nNeed;
3658
3659 newCapacity = __kmp_threads_capacity;
3660 do {
3661 newCapacity = newCapacity <= (__kmp_sys_max_nth >> 1) ? (newCapacity << 1)
3662 : __kmp_sys_max_nth;
3663 } while (newCapacity < minimumRequiredCapacity);
3664 newThreads = (kmp_info_t **)__kmp_allocate(
3665 (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * newCapacity + CACHE_LINE);
3666 newRoot =
3667 (kmp_root_t **)((char *)newThreads + sizeof(kmp_info_t *) * newCapacity);
3668 KMP_MEMCPY(newThreads, __kmp_threads,
3669 __kmp_threads_capacity * sizeof(kmp_info_t *));
3670 KMP_MEMCPY(newRoot, __kmp_root,
3671 __kmp_threads_capacity * sizeof(kmp_root_t *));
3672
3673 kmp_info_t **temp_threads = __kmp_threads;
3674 *(kmp_info_t * *volatile *)&__kmp_threads = newThreads;
3675 *(kmp_root_t * *volatile *)&__kmp_root = newRoot;
3676 __kmp_free(temp_threads);
3677 added += newCapacity - __kmp_threads_capacity;
3678 *(volatile int *)&__kmp_threads_capacity = newCapacity;
3679
3680 if (newCapacity > __kmp_tp_capacity) {
3681 __kmp_acquire_bootstrap_lock(&__kmp_tp_cached_lock);
3682 if (__kmp_tp_cached && newCapacity > __kmp_tp_capacity) {
3683 __kmp_threadprivate_resize_cache(newCapacity);
3684 } else { // increase __kmp_tp_capacity to correspond with kmp_threads size
3685 *(volatile int *)&__kmp_tp_capacity = newCapacity;
3686 }
3687 __kmp_release_bootstrap_lock(&__kmp_tp_cached_lock);
3688 }
3689
3690 return added;
3691}
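// Editor's note: a minimal sketch of the growth policy above, with illustrative
// names and `cur >= 1` assumed. Capacity is doubled until the requested
// minimum is covered, but never past the system cap, and the caller gives up
// early when even growing to the cap cannot provide the needed headroom.
static int sketch_grow_capacity(int cur, int need, int sys_max) {
  if (sys_max - cur < need)
    return cur; // not enough headroom even at the cap -- give up, keep old size
  int min_required = cur + need;
  int cap = cur;
  do {
    cap = (cap <= (sys_max >> 1)) ? (cap << 1) : sys_max; // double, clip at cap
  } while (cap < min_required);
  return cap;
}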
3692
3693/* Register the current thread as a root thread and obtain our gtid. We must
3694 have the __kmp_initz_lock held at this point. Argument TRUE only if are the
3695 thread that calls from __kmp_do_serial_initialize() */
3696int __kmp_register_root(int initial_thread) {
3697 kmp_info_t *root_thread;
3698 kmp_root_t *root;
3699 int gtid;
3700 int capacity;
3701 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
3702 KA_TRACE(20, ("__kmp_register_root: entered\n"));
3703 KMP_MB();
3704
3705 /* 2007-03-02:
3706 If initial thread did not invoke OpenMP RTL yet, and this thread is not an
3707 initial one, "__kmp_all_nth >= __kmp_threads_capacity" condition does not
3708 work as expected -- it may return false (that means there is at least one
3709 empty slot in __kmp_threads array), but it is possible the only free slot
3710 is #0, which is reserved for initial thread and so cannot be used for this
3711 one. The following code works around this bug.
3712
3713 However, the right solution seems to be not reserving slot #0 for the
3714 initial thread because:
3715 (1) there is no magic in slot #0,
3716 (2) we cannot detect the initial thread reliably (the first thread that
3717 does serial initialization may not be a real initial thread).
3718 */
3719 capacity = __kmp_threads_capacity;
3720 if (!initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) {
3721 --capacity;
3722 }
3723
3724 // If it is not for initializing the hidden helper team, we need to take
3725 // __kmp_hidden_helper_threads_num out of the capacity because it is included
3726 // in __kmp_threads_capacity.
3727 if (__kmp_enable_hidden_helper && !TCR_4(__kmp_init_hidden_helper_threads)) {
3728 capacity -= __kmp_hidden_helper_threads_num;
3729 }
3730
3731 /* see if there are too many threads */
3732 if (__kmp_all_nth >= capacity && !__kmp_expand_threads(1)) {
3733 if (__kmp_tp_cached) {
3734 __kmp_fatal(KMP_MSG(CantRegisterNewThread),
3735 KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
3736 KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
3737 } else {
3738 __kmp_fatal(KMP_MSG(CantRegisterNewThread), KMP_HNT(SystemLimitOnThreads),
3739 __kmp_msg_null);
3740 }
3741 }
3742
3743 // When hidden helper task is enabled, __kmp_threads is organized as follows:
3744 // 0: initial thread, also a regular OpenMP thread.
3745 // [1, __kmp_hidden_helper_threads_num]: slots for hidden helper threads.
3746 // [__kmp_hidden_helper_threads_num + 1, __kmp_threads_capacity): slots for
3747 // regular OpenMP threads.
3748 if (TCR_4(__kmp_init_hidden_helper_threads)) {
3749 // Find an available thread slot for hidden helper thread. Slots for hidden
3750 // helper threads start from 1 to __kmp_hidden_helper_threads_num.
3751 for (gtid = 1; TCR_PTR(__kmp_threads[gtid]) != NULL &&
3752 gtid <= __kmp_hidden_helper_threads_num;
3753 gtid++)
3754 ;
3755 KMP_ASSERT(gtid <= __kmp_hidden_helper_threads_num);
3756 KA_TRACE(1, ("__kmp_register_root: found slot in threads array for "
3757 "hidden helper thread: T#%d\n",
3758 gtid));
3759 } else {
3760 /* find an available thread slot */
3761 // Don't reassign the zero slot since we need that to only be used by
3762 // initial thread. Slots for hidden helper threads should also be skipped.
3763 if (initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) {
3764 gtid = 0;
3765 } else {
3766 for (gtid = __kmp_hidden_helper_threads_num + 1;
3767 TCR_PTR(__kmp_threads[gtid]) != NULL; gtid++)
3768 ;
3769 }
3770 KA_TRACE(
3771 1, ("__kmp_register_root: found slot in threads array: T#%d\n", gtid));
3772 KMP_ASSERT(gtid < __kmp_threads_capacity);
3773 }
3774
3775 /* update global accounting */
3776 __kmp_all_nth++;
3777 TCW_4(__kmp_nth, __kmp_nth + 1);
3778
3779 // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
3780 // numbers of procs, and method #2 (keyed API call) for higher numbers.
3781 if (__kmp_adjust_gtid_mode) {
3782 if (__kmp_all_nth >= __kmp_tls_gtid_min) {
3783 if (TCR_4(__kmp_gtid_mode) != 2) {
3784 TCW_4(__kmp_gtid_mode, 2);
3785 }
3786 } else {
3787 if (TCR_4(__kmp_gtid_mode) != 1) {
3788 TCW_4(__kmp_gtid_mode, 1);
3789 }
3790 }
3791 }
3792
3793#ifdef KMP_ADJUST_BLOCKTIME
3794 /* Adjust blocktime to zero if necessary */
3795 /* Middle initialization might not have occurred yet */
3796 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
3797 if (__kmp_nth > __kmp_avail_proc) {
3798 __kmp_zero_bt = TRUE;
3799 }
3800 }
3801#endif /* KMP_ADJUST_BLOCKTIME */
3802
3803 /* setup this new hierarchy */
3804 if (!(root = __kmp_root[gtid])) {
3805 root = __kmp_root[gtid] = (kmp_root_t *)__kmp_allocate(sizeof(kmp_root_t));
3806 KMP_DEBUG_ASSERT(!root->r.r_root_team);
3807 }
3808
3809#if KMP_STATS_ENABLED
3810 // Initialize stats as soon as possible (right after gtid assignment).
3811 __kmp_stats_thread_ptr = __kmp_stats_list->push_back(gtid);
3812 __kmp_stats_thread_ptr->startLife();
3813 KMP_SET_THREAD_STATE(SERIAL_REGION);
3814 KMP_INIT_PARTITIONED_TIMERS(OMP_serial);
3815#endif
3816 __kmp_initialize_root(root);
3817
3818 /* setup new root thread structure */
3819 if (root->r.r_uber_thread) {
3820 root_thread = root->r.r_uber_thread;
3821 } else {
3822 root_thread = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
3823 if (__kmp_storage_map) {
3824 __kmp_print_thread_storage_map(root_thread, gtid);
3825 }
3826 root_thread->th.th_info.ds.ds_gtid = gtid;
3827#if OMPT_SUPPORT
3828 root_thread->th.ompt_thread_info.thread_data = ompt_data_none;
3829#endif
3830 root_thread->th.th_root = root;
3831 if (__kmp_env_consistency_check) {
3832 root_thread->th.th_cons = __kmp_allocate_cons_stack(gtid);
3833 }
3834#if USE_FAST_MEMORY
3835 __kmp_initialize_fast_memory(root_thread);
3836#endif /* USE_FAST_MEMORY */
3837
3838#if KMP_USE_BGET
3839 KMP_DEBUG_ASSERT(root_thread->th.th_local.bget_data == NULL);
3840 __kmp_initialize_bget(root_thread);
3841#endif
3842 __kmp_init_random(root_thread); // Initialize random number generator
3843 }
3844
3845 /* setup the serial team held in reserve by the root thread */
3846 if (!root_thread->th.th_serial_team) {
3847 kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3848 KF_TRACE(10, ("__kmp_register_root: before serial_team\n"));
3849 root_thread->th.th_serial_team = __kmp_allocate_team(
3850 root, 1, 1,
3851#if OMPT_SUPPORT
3852 ompt_data_none, // root parallel id
3853#endif
3854 proc_bind_default, &r_icvs, 0 USE_NESTED_HOT_ARG(NULL));
3855 }
3856 KMP_ASSERT(root_thread->th.th_serial_team);
3857 KF_TRACE(10, ("__kmp_register_root: after serial_team = %p\n",
3858 root_thread->th.th_serial_team));
3859
3860 /* drop root_thread into place */
3861 TCW_SYNC_PTR(__kmp_threads[gtid], root_thread);
3862
3863 root->r.r_root_team->t.t_threads[0] = root_thread;
3864 root->r.r_hot_team->t.t_threads[0] = root_thread;
3865 root_thread->th.th_serial_team->t.t_threads[0] = root_thread;
3866 // AC: this team is created in reserve, not for execution (it is unused for now).
3867 root_thread->th.th_serial_team->t.t_serialized = 0;
3868 root->r.r_uber_thread = root_thread;
3869
3870 /* initialize the thread, get it ready to go */
3871 __kmp_initialize_info(root_thread, root->r.r_root_team, 0, gtid);
3872 TCW_4(__kmp_init_gtid, TRUE);
3873
3874 /* prepare the primary thread for get_gtid() */
3875 __kmp_gtid_set_specific(gtid);
3876
3877#if USE_ITT_BUILD
3878 __kmp_itt_thread_name(gtid);
3879#endif /* USE_ITT_BUILD */
3880
3881#ifdef KMP_TDATA_GTID
3882 __kmp_gtid = gtid;
3883#endif
3884 __kmp_create_worker(gtid, root_thread, __kmp_stksize);
3885 KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == gtid);
3886
3887 KA_TRACE(20, ("__kmp_register_root: T#%d init T#%d(%d:%d) arrived: join=%u, "
3888 "plain=%u\n",
3889 gtid, __kmp_gtid_from_tid(0, root->r.r_hot_team),
3890 root->r.r_hot_team->t.t_id, 0, KMP_INIT_BARRIER_STATE,
3891 KMP_INIT_BARRIER_STATE));
3892 { // Initialize barrier data.
3893 int b;
3894 for (b = 0; b < bs_last_barrier; ++b) {
3895 root_thread->th.th_bar[b].bb.b_arrived = KMP_INIT_BARRIER_STATE;
3896#if USE_DEBUGGER
3897 root_thread->th.th_bar[b].bb.b_worker_arrived = 0;
3898#endif
3899 }
3900 }
3901 KMP_DEBUG_ASSERT(root->r.r_hot_team->t.t_bar[bs_forkjoin_barrier].b_arrived ==
3902 KMP_INIT_BARRIER_STATE);
3903
3904#if KMP_AFFINITY_SUPPORTED
3905 root_thread->th.th_current_place = KMP_PLACE_UNDEFINED;
3906 root_thread->th.th_new_place = KMP_PLACE_UNDEFINED;
3907 root_thread->th.th_first_place = KMP_PLACE_UNDEFINED;
3908 root_thread->th.th_last_place = KMP_PLACE_UNDEFINED;
3909#endif /* KMP_AFFINITY_SUPPORTED */
3910 root_thread->th.th_def_allocator = __kmp_def_allocator;
3911 root_thread->th.th_prev_level = 0;
3912 root_thread->th.th_prev_num_threads = 1;
3913
3914 kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t));
3915 tmp->cg_root = root_thread;
3916 tmp->cg_thread_limit = __kmp_cg_max_nth;
3917 tmp->cg_nthreads = 1;
3918 KA_TRACE(100, ("__kmp_register_root: Thread %p created node %p with"
3919 " cg_nthreads init to 1\n",
3920 root_thread, tmp));
3921 tmp->up = NULL;
3922 root_thread->th.th_cg_roots = tmp;
3923
3924 __kmp_root_counter++;
3925
3926#if OMPT_SUPPORT
3927 if (!initial_thread && ompt_enabled.enabled) {
3928
3929 kmp_info_t *root_thread = ompt_get_thread();
3930
3931 ompt_set_thread_state(root_thread, ompt_state_overhead);
3932
3933 if (ompt_enabled.ompt_callback_thread_begin) {
3934 ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
3935 ompt_thread_initial, __ompt_get_thread_data_internal());
3936 }
3937 ompt_data_t *task_data;
3938 ompt_data_t *parallel_data;
3939 __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data,
3940 NULL);
3941 if (ompt_enabled.ompt_callback_implicit_task) {
3942 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
3943 ompt_scope_begin, parallel_data, task_data, 1, 1, ompt_task_initial);
3944 }
3945
3946 ompt_set_thread_state(root_thread, ompt_state_work_serial);
3947 }
3948#endif
3949#if OMPD_SUPPORT
3950 if (ompd_state & OMPD_ENABLE_BP)
3951 ompd_bp_thread_begin();
3952#endif
3953
3954 KMP_MB();
3955 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
3956
3957 return gtid;
3958}
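// Editor's note: a simplified sketch of the slot search performed above, under
// the layout described in the comment (slot 0 = initial thread, slots
// [1, num_hidden] = hidden helper threads, the remaining slots = regular
// OpenMP threads). `slots` is a hypothetical occupancy array; the real code
// scans __kmp_threads while holding __kmp_forkjoin_lock.
static int sketch_find_root_slot(void *const *slots, int capacity,
                                 int num_hidden, int registering_hidden,
                                 int is_initial_thread) {
  int gtid;
  if (registering_hidden) {
    for (gtid = 1; gtid <= num_hidden; ++gtid) // hidden helpers live in [1, H]
      if (slots[gtid] == 0)
        return gtid;
    return -1; // no hidden helper slot free
  }
  if (is_initial_thread && slots[0] == 0)
    return 0; // slot 0 is reserved for the initial thread
  for (gtid = num_hidden + 1; gtid < capacity; ++gtid) // regular threads
    if (slots[gtid] == 0)
      return gtid;
  return -1; // caller must expand the arrays first
}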
3959
3960#if KMP_NESTED_HOT_TEAMS
3961static int __kmp_free_hot_teams(kmp_root_t *root, kmp_info_t *thr, int level,
3962 const int max_level) {
3963 int i, n, nth;
3964 kmp_hot_team_ptr_t *hot_teams = thr->th.th_hot_teams;
3965 if (!hot_teams || !hot_teams[level].hot_team) {
3966 return 0;
3967 }
3968 KMP_DEBUG_ASSERT(level < max_level);
3969 kmp_team_t *team = hot_teams[level].hot_team;
3970 nth = hot_teams[level].hot_team_nth;
3971 n = nth - 1; // primary thread is not freed
3972 if (level < max_level - 1) {
3973 for (i = 0; i < nth; ++i) {
3974 kmp_info_t *th = team->t.t_threads[i];
3975 n += __kmp_free_hot_teams(root, th, level + 1, max_level);
3976 if (i > 0 && th->th.th_hot_teams) {
3977 __kmp_free(th->th.th_hot_teams);
3978 th->th.th_hot_teams = NULL;
3979 }
3980 }
3981 }
3982 __kmp_free_team(root, team, NULL);
3983 return n;
3984}
3985#endif
3986
3987// Resets a root thread and clears its root and hot teams.
3988// Returns the number of __kmp_threads entries directly and indirectly freed.
3989static int __kmp_reset_root(int gtid, kmp_root_t *root) {
3990 kmp_team_t *root_team = root->r.r_root_team;
3991 kmp_team_t *hot_team = root->r.r_hot_team;
3992 int n = hot_team->t.t_nproc;
3993 int i;
3994
3995 KMP_DEBUG_ASSERT(!root->r.r_active);
3996
3997 root->r.r_root_team = NULL;
3998 root->r.r_hot_team = NULL;
3999 // __kmp_free_team() does not free hot teams, so we have to clear r_hot_team
4000 // before call to __kmp_free_team().
4001 __kmp_free_team(root, root_team USE_NESTED_HOT_ARG(NULL));
4002#if KMP_NESTED_HOT_TEAMS
4003 if (__kmp_hot_teams_max_level >
4004 0) { // need to free nested hot teams and their threads if any
4005 for (i = 0; i < hot_team->t.t_nproc; ++i) {
4006 kmp_info_t *th = hot_team->t.t_threads[i];
4007 if (__kmp_hot_teams_max_level > 1) {
4008 n += __kmp_free_hot_teams(root, th, 1, __kmp_hot_teams_max_level);
4009 }
4010 if (th->th.th_hot_teams) {
4011 __kmp_free(th->th.th_hot_teams);
4012 th->th.th_hot_teams = NULL;
4013 }
4014 }
4015 }
4016#endif
4017 __kmp_free_team(root, hot_team USE_NESTED_HOT_ARG(NULL));
4018
4019 // Before we can reap the thread, we need to make certain that all other
4020 // threads in the teams that had this root as ancestor have stopped trying to
4021 // steal tasks.
4022 if (__kmp_tasking_mode != tskm_immediate_exec) {
4023 __kmp_wait_to_unref_task_teams();
4024 }
4025
4026#if KMP_OS_WINDOWS
4027 /* Close Handle of root duplicated in __kmp_create_worker (tr #62919) */
4028 KA_TRACE(
4029 10, ("__kmp_reset_root: free handle, th = %p, handle = %" KMP_UINTPTR_SPEC
4030 "\n",
4031 (LPVOID) & (root->r.r_uber_thread->th),
4032 root->r.r_uber_thread->th.th_info.ds.ds_thread));
4033 __kmp_free_handle(root->r.r_uber_thread->th.th_info.ds.ds_thread);
4034#endif /* KMP_OS_WINDOWS */
4035
4036#if OMPD_SUPPORT
4037 if (ompd_state & OMPD_ENABLE_BP)
4038 ompd_bp_thread_end();
4039#endif
4040
4041#if OMPT_SUPPORT
4042 ompt_data_t *task_data;
4043 ompt_data_t *parallel_data;
4044 __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data,
4045 NULL);
4046 if (ompt_enabled.ompt_callback_implicit_task) {
4047 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
4048 ompt_scope_end, parallel_data, task_data, 0, 1, ompt_task_initial);
4049 }
4050 if (ompt_enabled.ompt_callback_thread_end) {
4051 ompt_callbacks.ompt_callback(ompt_callback_thread_end)(
4052 &(root->r.r_uber_thread->th.ompt_thread_info.thread_data));
4053 }
4054#endif
4055
4056 TCW_4(__kmp_nth,
4057 __kmp_nth - 1); // __kmp_reap_thread will decrement __kmp_all_nth.
4058 i = root->r.r_uber_thread->th.th_cg_roots->cg_nthreads--;
4059 KA_TRACE(100, ("__kmp_reset_root: Thread %p decrement cg_nthreads on node %p"
4060 " to %d\n",
4061 root->r.r_uber_thread, root->r.r_uber_thread->th.th_cg_roots,
4062 root->r.r_uber_thread->th.th_cg_roots->cg_nthreads));
4063 if (i == 1) {
4064 // need to free contention group structure
4065 KMP_DEBUG_ASSERT(root->r.r_uber_thread ==
4066 root->r.r_uber_thread->th.th_cg_roots->cg_root);
4067 KMP_DEBUG_ASSERT(root->r.r_uber_thread->th.th_cg_roots->up == NULL);
4068 __kmp_free(root->r.r_uber_thread->th.th_cg_roots);
4069 root->r.r_uber_thread->th.th_cg_roots = NULL;
4070 }
4071 __kmp_reap_thread(root->r.r_uber_thread, 1);
4072
4073 // We cannot put the root thread into __kmp_thread_pool, so we have to reap
4074 // it instead of freeing it.
4075 root->r.r_uber_thread = NULL;
4076 /* mark root as no longer in use */
4077 root->r.r_begin = FALSE;
4078
4079 return n;
4080}
4081
4082void __kmp_unregister_root_current_thread(int gtid) {
4083 KA_TRACE(1, ("__kmp_unregister_root_current_thread: enter T#%d\n", gtid));
4084 /* this lock should be ok, since unregister_root_current_thread is never
4085 called during an abort, only during a normal close. furthermore, if you
4086 have the forkjoin lock, you should never try to get the initz lock */
4087 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
4088 if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
4089 KC_TRACE(10, ("__kmp_unregister_root_current_thread: already finished, "
4090 "exiting T#%d\n",
4091 gtid));
4092 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
4093 return;
4094 }
4095 kmp_root_t *root = __kmp_root[gtid];
4096
4097 KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
4098 KMP_ASSERT(KMP_UBER_GTID(gtid));
4099 KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
4100 KMP_ASSERT(root->r.r_active == FALSE);
4101
4102 KMP_MB();
4103
4104 kmp_info_t *thread = __kmp_threads[gtid];
4105 kmp_team_t *team = thread->th.th_team;
4106 kmp_task_team_t *task_team = thread->th.th_task_team;
4107
4108 // we need to wait for the proxy tasks before finishing the thread
4109 if (task_team != NULL && (task_team->tt.tt_found_proxy_tasks ||
4110 task_team->tt.tt_hidden_helper_task_encountered)) {
4111#if OMPT_SUPPORT
4112 // the runtime is shutting down so we won't report any events
4113 thread->th.ompt_thread_info.state = ompt_state_undefined;
4114#endif
4115 __kmp_task_team_wait(thread, team USE_ITT_BUILD_ARG(NULL));
4116 }
4117
4118 __kmp_reset_root(gtid, root);
4119
4120 KMP_MB();
4121 KC_TRACE(10,
4122 ("__kmp_unregister_root_current_thread: T#%d unregistered\n", gtid));
4123
4124 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
4125}
4126
4127#if KMP_OS_WINDOWS
4128/* __kmp_forkjoin_lock must be already held
4129 Unregisters a root thread that is not the current thread. Returns the number
4130 of __kmp_threads entries freed as a result. */
4131static int __kmp_unregister_root_other_thread(int gtid) {
4132 kmp_root_t *root = __kmp_root[gtid];
4133 int r;
4134
4135 KA_TRACE(1, ("__kmp_unregister_root_other_thread: enter T#%d\n", gtid));
4136 KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
4137 KMP_ASSERT(KMP_UBER_GTID(gtid));
4138 KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
4139 KMP_ASSERT(root->r.r_active == FALSE);
4140
4141 r = __kmp_reset_root(gtid, root);
4142 KC_TRACE(10,
4143 ("__kmp_unregister_root_other_thread: T#%d unregistered\n", gtid));
4144 return r;
4145}
4146#endif
4147
4148#if KMP_DEBUG
4149void __kmp_task_info() {
4150
4151 kmp_int32 gtid = __kmp_entry_gtid();
4152 kmp_int32 tid = __kmp_tid_from_gtid(gtid);
4153 kmp_info_t *this_thr = __kmp_threads[gtid];
4154 kmp_team_t *steam = this_thr->th.th_serial_team;
4155 kmp_team_t *team = this_thr->th.th_team;
4156
4157 __kmp_printf(
4158 "__kmp_task_info: gtid=%d tid=%d t_thread=%p team=%p steam=%p curtask=%p "
4159 "ptask=%p\n",
4160 gtid, tid, this_thr, team, steam, this_thr->th.th_current_task,
4161 team->t.t_implicit_task_taskdata[tid].td_parent);
4162}
4163#endif // KMP_DEBUG
4164
4165/* TODO optimize with one big memclr, take out what isn't needed, split
4166 responsibility to workers as much as possible, and delay initialization of
4167 features as much as possible */
4168static void __kmp_initialize_info(kmp_info_t *this_thr, kmp_team_t *team,
4169 int tid, int gtid) {
4170 /* this_thr->th.th_info.ds.ds_gtid is setup in
4171 kmp_allocate_thread/create_worker.
4172 this_thr->th.th_serial_team is setup in __kmp_allocate_thread */
4173 KMP_DEBUG_ASSERT(this_thr != NULL);
4174 KMP_DEBUG_ASSERT(this_thr->th.th_serial_team);
4175 KMP_DEBUG_ASSERT(team);
4176 KMP_DEBUG_ASSERT(team->t.t_threads);
4177 KMP_DEBUG_ASSERT(team->t.t_dispatch);
4178 kmp_info_t *master = team->t.t_threads[0];
4179 KMP_DEBUG_ASSERT(master);
4180 KMP_DEBUG_ASSERT(master->th.th_root);
4181
4182 KMP_MB();
4183
4184 TCW_SYNC_PTR(this_thr->th.th_team, team);
4185
4186 this_thr->th.th_info.ds.ds_tid = tid;
4187 this_thr->th.th_set_nproc = 0;
4188 if (__kmp_tasking_mode != tskm_immediate_exec)
4189 // When tasking is possible, threads are not safe to reap until they are
4190 // done tasking; this will be set when tasking code is exited in wait
4191 this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
4192 else // no tasking --> always safe to reap
4193 this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
4194 this_thr->th.th_set_proc_bind = proc_bind_default;
4195#if KMP_AFFINITY_SUPPORTED
4196 this_thr->th.th_new_place = this_thr->th.th_current_place;
4197#endif
4198 this_thr->th.th_root = master->th.th_root;
4199
4200 /* setup the thread's cache of the team structure */
4201 this_thr->th.th_team_nproc = team->t.t_nproc;
4202 this_thr->th.th_team_master = master;
4203 this_thr->th.th_team_serialized = team->t.t_serialized;
4204
4205 KMP_DEBUG_ASSERT(team->t.t_implicit_task_taskdata);
4206
4207 KF_TRACE(10, ("__kmp_initialize_info1: T#%d:%d this_thread=%p curtask=%p\n",
4208 tid, gtid, this_thr, this_thr->th.th_current_task));
4209
4210 __kmp_init_implicit_task(this_thr->th.th_team_master->th.th_ident, this_thr,
4211 team, tid, TRUE);
4212
4213 KF_TRACE(10, ("__kmp_initialize_info2: T#%d:%d this_thread=%p curtask=%p\n",
4214 tid, gtid, this_thr, this_thr->th.th_current_task));
4215 // TODO: Initialize ICVs from parent; GEH - isn't that already done in
4216 // __kmp_initialize_team()?
4217
4218 /* TODO no worksharing in speculative threads */
4219 this_thr->th.th_dispatch = &team->t.t_dispatch[tid];
4220
4221 this_thr->th.th_local.this_construct = 0;
4222
4223 if (!this_thr->th.th_pri_common) {
4224 this_thr->th.th_pri_common =
4225 (struct common_table *)__kmp_allocate(sizeof(struct common_table));
4226 if (__kmp_storage_map) {
4227 __kmp_print_storage_map_gtid(
4228 gtid, this_thr->th.th_pri_common, this_thr->th.th_pri_common + 1,
4229 sizeof(struct common_table), "th_%d.th_pri_common\n", gtid);
4230 }
4231 this_thr->th.th_pri_head = NULL;
4232 }
4233
4234 if (this_thr != master && // Primary thread's CG root is initialized elsewhere
4235 this_thr->th.th_cg_roots != master->th.th_cg_roots) { // CG root not set
4236 // Make new thread's CG root same as primary thread's
4237 KMP_DEBUG_ASSERT(master->th.th_cg_roots);
4238 kmp_cg_root_t *tmp = this_thr->th.th_cg_roots;
4239 if (tmp) {
4240 // worker changes CG, need to check if old CG should be freed
4241 int i = tmp->cg_nthreads--;
4242 KA_TRACE(100, ("__kmp_initialize_info: Thread %p decrement cg_nthreads"
4243 " on node %p of thread %p to %d\n",
4244 this_thr, tmp, tmp->cg_root, tmp->cg_nthreads));
4245 if (i == 1) {
4246 __kmp_free(tmp); // last thread left CG --> free it
4247 }
4248 }
4249 this_thr->th.th_cg_roots = master->th.th_cg_roots;
4250 // Increment new thread's CG root's counter to add the new thread
4251 this_thr->th.th_cg_roots->cg_nthreads++;
4252 KA_TRACE(100, ("__kmp_initialize_info: Thread %p increment cg_nthreads on"
4253 " node %p of thread %p to %d\n",
4254 this_thr, this_thr->th.th_cg_roots,
4255 this_thr->th.th_cg_roots->cg_root,
4256 this_thr->th.th_cg_roots->cg_nthreads));
4257 this_thr->th.th_current_task->td_icvs.thread_limit =
4258 this_thr->th.th_cg_roots->cg_thread_limit;
4259 }
4260
4261 /* Initialize dynamic dispatch */
4262 {
4263 volatile kmp_disp_t *dispatch = this_thr->th.th_dispatch;
4264 // Use team max_nproc since this will never change for the team.
4265 size_t disp_size =
4266 sizeof(dispatch_private_info_t) *
4267 (team->t.t_max_nproc == 1 ? 1 : __kmp_dispatch_num_buffers);
4268 KD_TRACE(10, ("__kmp_initialize_info: T#%d max_nproc: %d\n", gtid,
4269 team->t.t_max_nproc));
4270 KMP_ASSERT(dispatch);
4271 KMP_DEBUG_ASSERT(team->t.t_dispatch);
4272 KMP_DEBUG_ASSERT(dispatch == &team->t.t_dispatch[tid]);
4273
4274 dispatch->th_disp_index = 0;
4275 dispatch->th_doacross_buf_idx = 0;
4276 if (!dispatch->th_disp_buffer) {
4277 dispatch->th_disp_buffer =
4278 (dispatch_private_info_t *)__kmp_allocate(disp_size);
4279
4280 if (__kmp_storage_map) {
4281 __kmp_print_storage_map_gtid(
4282 gtid, &dispatch->th_disp_buffer[0],
4283 &dispatch->th_disp_buffer[team->t.t_max_nproc == 1
4284 ? 1
4285 : __kmp_dispatch_num_buffers],
4286 disp_size,
4287 "th_%d.th_dispatch.th_disp_buffer "
4288 "(team_%d.t_dispatch[%d].th_disp_buffer)",
4289 gtid, team->t.t_id, gtid);
4290 }
4291 } else {
4292 memset(&dispatch->th_disp_buffer[0], '\0', disp_size);
4293 }
4294
4295 dispatch->th_dispatch_pr_current = 0;
4296 dispatch->th_dispatch_sh_current = 0;
4297
4298 dispatch->th_deo_fcn = 0; /* ORDERED */
4299 dispatch->th_dxo_fcn = 0; /* END ORDERED */
4300 }
4301
4302 this_thr->th.th_next_pool = NULL;
4303
4304 if (!this_thr->th.th_task_state_memo_stack) {
4305 size_t i;
4306 this_thr->th.th_task_state_memo_stack =
4307 (kmp_uint8 *)__kmp_allocate(4 * sizeof(kmp_uint8));
4308 this_thr->th.th_task_state_top = 0;
4309 this_thr->th.th_task_state_stack_sz = 4;
4310 for (i = 0; i < this_thr->th.th_task_state_stack_sz;
4311 ++i) // zero init the stack
4312 this_thr->th.th_task_state_memo_stack[i] = 0;
4313 }
4314
4315 KMP_DEBUG_ASSERT(!this_thr->th.th_spin_here);
4316 KMP_DEBUG_ASSERT(this_thr->th.th_next_waiting == 0);
4317
4318 KMP_MB();
4319}
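// Editor's note: a sketch of the contention-group handover done above (the
// same counting discipline appears when a root is reset): a thread leaving a
// CG root decrements its thread count and frees the node if it was the last
// user, then attaches to the primary thread's CG root and bumps its count.
// Types and names are illustrative; the real code runs under the fork/join
// lock, so no extra synchronization is shown here.
#include <stdlib.h>
typedef struct sketch_cg_root {
  int nthreads;     // threads currently attached to this CG root
  int thread_limit; // thread-limit ICV inherited by attached threads
} sketch_cg_root_t;
static void sketch_cg_move(sketch_cg_root_t **current,
                           sketch_cg_root_t *target) {
  sketch_cg_root_t *old = *current;
  if (old == target)
    return; // already in the target contention group, nothing to do
  if (old && --old->nthreads == 0)
    free(old); // this thread was the last user of the old contention group
  target->nthreads++; // account for this thread in the new group
  *current = target;
}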
4320
4321/* Allocate a new thread for the requesting team. This is only called from
4322 within a fork/join critical section. We will first try to get an available
4323 thread from the thread pool. If none is available, we will fork a new one,
4324 assuming we are able to create one; this should be assured, as the
4325 caller should have checked on this first. */
4326kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team,
4327 int new_tid) {
4328 kmp_team_t *serial_team;
4329 kmp_info_t *new_thr;
4330 int new_gtid;
4331
4332 KA_TRACE(20, ("__kmp_allocate_thread: T#%d\n", __kmp_get_gtid()));
4333 KMP_DEBUG_ASSERT(root && team);
4334#if !KMP_NESTED_HOT_TEAMS
4335 KMP_DEBUG_ASSERT(KMP_MASTER_GTID(__kmp_get_gtid()));
4336#endif
4337 KMP_MB();
4338
4339 /* first, try to get one from the thread pool */
4340 if (__kmp_thread_pool) {
4341 new_thr = CCAST(kmp_info_t *, __kmp_thread_pool);
4342 __kmp_thread_pool = (volatile kmp_info_t *)new_thr->th.th_next_pool;
4343 if (new_thr == __kmp_thread_pool_insert_pt) {
4344 __kmp_thread_pool_insert_pt = NULL;
4345 }
4346 TCW_4(new_thr->th.th_in_pool, FALSE);
4347 __kmp_suspend_initialize_thread(new_thr);
4348 __kmp_lock_suspend_mx(new_thr);
4349 if (new_thr->th.th_active_in_pool == TRUE) {
4350 KMP_DEBUG_ASSERT(new_thr->th.th_active == TRUE);
4351 KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
4352 new_thr->th.th_active_in_pool = FALSE;
4353 }
4354 __kmp_unlock_suspend_mx(new_thr);
4355
4356 KA_TRACE(20, ("__kmp_allocate_thread: T#%d using thread T#%d\n",
4357 __kmp_get_gtid(), new_thr->th.th_info.ds.ds_gtid));
4358 KMP_ASSERT(!new_thr->th.th_team);
4359 KMP_DEBUG_ASSERT(__kmp_nth < __kmp_threads_capacity);
4360
4361 /* setup the thread structure */
4362 __kmp_initialize_info(new_thr, team, new_tid,
4363 new_thr->th.th_info.ds.ds_gtid);
4364 KMP_DEBUG_ASSERT(new_thr->th.th_serial_team);
4365
4366 TCW_4(__kmp_nth, __kmp_nth + 1);
4367
4368 new_thr->th.th_task_state = 0;
4369 new_thr->th.th_task_state_top = 0;
4370 new_thr->th.th_task_state_stack_sz = 4;
4371
4372 if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
4373 // Make sure pool thread has transitioned to waiting on own thread struct
4374 KMP_DEBUG_ASSERT(new_thr->th.th_used_in_team.load() == 0);
4375 // Thread activated in __kmp_allocate_team when increasing team size
4376 }
4377
4378#ifdef KMP_ADJUST_BLOCKTIME
4379 /* Adjust blocktime back to zero if necessary */
4380 /* Middle initialization might not have occurred yet */
4381 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4382 if (__kmp_nth > __kmp_avail_proc) {
4383 __kmp_zero_bt = TRUE;
4384 }
4385 }
4386#endif /* KMP_ADJUST_BLOCKTIME */
4387
4388#if KMP_DEBUG
4389    // If the thread entered the pool via __kmp_free_thread, wait_flag should
4390    // not equal KMP_BARRIER_PARENT_FLAG.
4391 int b;
4392 kmp_balign_t *balign = new_thr->th.th_bar;
4393 for (b = 0; b < bs_last_barrier; ++b)
4394 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
4395#endif
4396
4397 KF_TRACE(10, ("__kmp_allocate_thread: T#%d using thread %p T#%d\n",
4398 __kmp_get_gtid(), new_thr, new_thr->th.th_info.ds.ds_gtid));
4399
4400 KMP_MB();
4401 return new_thr;
4402 }
4403
4404  /* no, we'll fork a new one */
4405 KMP_ASSERT(__kmp_nth == __kmp_all_nth);
4406 KMP_ASSERT(__kmp_all_nth < __kmp_threads_capacity);
4407
4408#if KMP_USE_MONITOR
4409 // If this is the first worker thread the RTL is creating, then also
4410 // launch the monitor thread. We try to do this as early as possible.
4411 if (!TCR_4(__kmp_init_monitor)) {
4412 __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
4413 if (!TCR_4(__kmp_init_monitor)) {
4414 KF_TRACE(10, ("before __kmp_create_monitor\n"));
4415 TCW_4(__kmp_init_monitor, 1);
4416 __kmp_create_monitor(&__kmp_monitor);
4417 KF_TRACE(10, ("after __kmp_create_monitor\n"));
4418#if KMP_OS_WINDOWS
4419 // AC: wait until monitor has started. This is a fix for CQ232808.
4420 // The reason is that if the library is loaded/unloaded in a loop with
4421 // small (parallel) work in between, then there is a high probability that
4422 // the monitor thread starts after the library has shut down. At shutdown it
4423 // is too late to cope with the problem, because when the primary thread is
4424 // in DllMain (process detach) the monitor has no chance to start (it is
4425 // blocked), and the primary thread has no way to inform the monitor that
4426 // the library has gone, because all the memory the monitor can
4427 // access is about to be released/reset.
4428 while (TCR_4(__kmp_init_monitor) < 2) {
4429 KMP_YIELD(TRUE);
4430 }
4431 KF_TRACE(10, ("after monitor thread has started\n"));
4432#endif
4433 }
4434 __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
4435 }
4436#endif
4437
4438 KMP_MB();
4439
4440 {
4441 int new_start_gtid = TCR_4(__kmp_init_hidden_helper_threads)
4442 ? 1
4443 : __kmp_hidden_helper_threads_num + 1;
4444
4445 for (new_gtid = new_start_gtid; TCR_PTR(__kmp_threads[new_gtid]) != NULL;
4446 ++new_gtid) {
4447 KMP_DEBUG_ASSERT(new_gtid < __kmp_threads_capacity);
4448 }
4449
4450 if (TCR_4(__kmp_init_hidden_helper_threads)) {
4451 KMP_DEBUG_ASSERT(new_gtid <= __kmp_hidden_helper_threads_num);
4452 }
4453 }
4454
4455 /* allocate space for it. */
4456 new_thr = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
4457
4458 TCW_SYNC_PTR(__kmp_threads[new_gtid], new_thr);
4459
4460#if USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG
4461 // suppress race conditions detection on synchronization flags in debug mode
4462 // this helps to analyze library internals eliminating false positives
4463 __itt_suppress_mark_range(
4464 __itt_suppress_range, __itt_suppress_threading_errors,
4465 &new_thr->th.th_sleep_loc, sizeof(new_thr->th.th_sleep_loc));
4466 __itt_suppress_mark_range(
4467 __itt_suppress_range, __itt_suppress_threading_errors,
4468 &new_thr->th.th_reap_state, sizeof(new_thr->th.th_reap_state));
4469#if KMP_OS_WINDOWS
4470 __itt_suppress_mark_range(
4471 __itt_suppress_range, __itt_suppress_threading_errors,
4472 &new_thr->th.th_suspend_init, sizeof(new_thr->th.th_suspend_init));
4473#else
4474 __itt_suppress_mark_range(__itt_suppress_range,
4475 __itt_suppress_threading_errors,
4476 &new_thr->th.th_suspend_init_count,
4477 sizeof(new_thr->th.th_suspend_init_count));
4478#endif
4479 // TODO: check if we need to also suppress b_arrived flags
4480 __itt_suppress_mark_range(__itt_suppress_range,
4481 __itt_suppress_threading_errors,
4482 CCAST(kmp_uint64 *, &new_thr->th.th_bar[0].bb.b_go),
4483 sizeof(new_thr->th.th_bar[0].bb.b_go));
4484 __itt_suppress_mark_range(__itt_suppress_range,
4485 __itt_suppress_threading_errors,
4486 CCAST(kmp_uint64 *, &new_thr->th.th_bar[1].bb.b_go),
4487 sizeof(new_thr->th.th_bar[1].bb.b_go));
4488 __itt_suppress_mark_range(__itt_suppress_range,
4489 __itt_suppress_threading_errors,
4490 CCAST(kmp_uint64 *, &new_thr->th.th_bar[2].bb.b_go),
4491 sizeof(new_thr->th.th_bar[2].bb.b_go));
4492#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG */
4493 if (__kmp_storage_map) {
4494 __kmp_print_thread_storage_map(new_thr, new_gtid);
4495 }
4496
4497 // add the reserve serialized team, initialized from the team's primary thread
4498 {
4499 kmp_internal_control_t r_icvs = __kmp_get_x_global_icvs(team);
4500 KF_TRACE(10, ("__kmp_allocate_thread: before th_serial/serial_team\n"));
4501 new_thr->th.th_serial_team = serial_team =
4502 (kmp_team_t *)__kmp_allocate_team(root, 1, 1,
4503#if OMPT_SUPPORT
4504 ompt_data_none, // root parallel id
4505#endif
4506 proc_bind_default, &r_icvs,
4507 0 USE_NESTED_HOT_ARG(NULL));
4508 }
4509 KMP_ASSERT(serial_team);
4510 serial_team->t.t_serialized = 0; // AC: the team created in reserve, not for
4511 // execution (it is unused for now).
4512 serial_team->t.t_threads[0] = new_thr;
4513 KF_TRACE(10,
4514 ("__kmp_allocate_thread: after th_serial/serial_team : new_thr=%p\n",
4515 new_thr));
4516
4517 /* setup the thread structures */
4518 __kmp_initialize_info(new_thr, team, new_tid, new_gtid);
4519
4520#if USE_FAST_MEMORY
4521 __kmp_initialize_fast_memory(new_thr);
4522#endif /* USE_FAST_MEMORY */
4523
4524#if KMP_USE_BGET
4525 KMP_DEBUG_ASSERT(new_thr->th.th_local.bget_data == NULL);
4526 __kmp_initialize_bget(new_thr);
4527#endif
4528
4529 __kmp_init_random(new_thr); // Initialize random number generator
4530
4531 /* Initialize these only once when thread is grabbed for a team allocation */
4532 KA_TRACE(20,
4533 ("__kmp_allocate_thread: T#%d init go fork=%u, plain=%u\n",
4534 __kmp_get_gtid(), KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
4535
4536 int b;
4537 kmp_balign_t *balign = new_thr->th.th_bar;
4538 for (b = 0; b < bs_last_barrier; ++b) {
4539 balign[b].bb.b_go = KMP_INIT_BARRIER_STATE;
4540 balign[b].bb.team = NULL;
4541 balign[b].bb.wait_flag = KMP_BARRIER_NOT_WAITING;
4542 balign[b].bb.use_oncore_barrier = 0;
4543 }
4544
4545 TCW_PTR(new_thr->th.th_sleep_loc, NULL);
4546 new_thr->th.th_sleep_loc_type = flag_unset;
4547
4548 new_thr->th.th_spin_here = FALSE;
4549 new_thr->th.th_next_waiting = 0;
4550#if KMP_OS_UNIX
4551 new_thr->th.th_blocking = false;
4552#endif
4553
4554#if KMP_AFFINITY_SUPPORTED
4555 new_thr->th.th_current_place = KMP_PLACE_UNDEFINED;
4556 new_thr->th.th_new_place = KMP_PLACE_UNDEFINED;
4557 new_thr->th.th_first_place = KMP_PLACE_UNDEFINED;
4558 new_thr->th.th_last_place = KMP_PLACE_UNDEFINED;
4559#endif
4560 new_thr->th.th_def_allocator = __kmp_def_allocator;
4561 new_thr->th.th_prev_level = 0;
4562 new_thr->th.th_prev_num_threads = 1;
4563
4564 TCW_4(new_thr->th.th_in_pool, FALSE);
4565 new_thr->th.th_active_in_pool = FALSE;
4566 TCW_4(new_thr->th.th_active, TRUE);
4567
4568 /* adjust the global counters */
4569 __kmp_all_nth++;
4570 __kmp_nth++;
4571
4572 // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
4573 // numbers of procs, and method #2 (keyed API call) for higher numbers.
4574 if (__kmp_adjust_gtid_mode) {
4575 if (__kmp_all_nth >= __kmp_tls_gtid_min) {
4576 if (TCR_4(__kmp_gtid_mode) != 2) {
4577 TCW_4(__kmp_gtid_mode, 2);
4578 }
4579 } else {
4580 if (TCR_4(__kmp_gtid_mode) != 1) {
4581 TCW_4(__kmp_gtid_mode, 1);
4582 }
4583 }
4584 }
4585
4586#ifdef KMP_ADJUST_BLOCKTIME
4587 /* Adjust blocktime back to zero if necessary */
4588 /* Middle initialization might not have occurred yet */
4589 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4590 if (__kmp_nth > __kmp_avail_proc) {
4591 __kmp_zero_bt = TRUE;
4592 }
4593 }
4594#endif /* KMP_ADJUST_BLOCKTIME */
4595
4596 /* actually fork it and create the new worker thread */
4597 KF_TRACE(
4598 10, ("__kmp_allocate_thread: before __kmp_create_worker: %p\n", new_thr));
4599 __kmp_create_worker(new_gtid, new_thr, __kmp_stksize);
4600 KF_TRACE(10,
4601 ("__kmp_allocate_thread: after __kmp_create_worker: %p\n", new_thr));
4602
4603 KA_TRACE(20, ("__kmp_allocate_thread: T#%d forked T#%d\n", __kmp_get_gtid(),
4604 new_gtid));
4605 KMP_MB();
4606 return new_thr;
4607}
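// Editor's note: a sketch of the pool fast path at the top of the routine
// above: the free-thread pool is a singly linked LIFO, so acquiring a worker
// is a pop of the head (the insert-hint and active-in-pool bookkeeping is
// omitted). Names are illustrative.
typedef struct sketch_pool_thr {
  struct sketch_pool_thr *next_pool;
} sketch_pool_thr_t;
static sketch_pool_thr_t *sketch_pool_pop(sketch_pool_thr_t **pool_head) {
  sketch_pool_thr_t *thr = *pool_head;
  if (thr) {
    *pool_head = thr->next_pool; // unlink the head; caller reinitializes it
    thr->next_pool = 0;
  }
  return thr; // NULL means the caller has to fork a brand-new worker instead
}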
4608
4609/* Reinitialize team for reuse.
4610 The hot team code calls this routine at every fork barrier, so the EPCC
4611 barrier tests are extremely sensitive to changes in it, esp. writes to the
4612 team struct, which cause a cache invalidation in all threads.
4613 IF YOU TOUCH THIS ROUTINE, RUN EPCC C SYNCBENCH ON A BIG-IRON MACHINE!!! */
4614static void __kmp_reinitialize_team(kmp_team_t *team,
4615 kmp_internal_control_t *new_icvs,
4616 ident_t *loc) {
4617 KF_TRACE(10, ("__kmp_reinitialize_team: enter this_thread=%p team=%p\n",
4618 team->t.t_threads[0], team));
4619 KMP_DEBUG_ASSERT(team && new_icvs);
4620 KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);
4621 KMP_CHECK_UPDATE(team->t.t_ident, loc);
4622
4623 KMP_CHECK_UPDATE(team->t.t_id, KMP_GEN_TEAM_ID());
4624 // Copy ICVs to the primary thread's implicit taskdata
4625 __kmp_init_implicit_task(loc, team->t.t_threads[0], team, 0, FALSE);
4626 copy_icvs(&team->t.t_implicit_task_taskdata[0].td_icvs, new_icvs);
4627
4628 KF_TRACE(10, ("__kmp_reinitialize_team: exit this_thread=%p team=%p\n",
4629 team->t.t_threads[0], team));
4630}
4631
4632/* Initialize the team data structure.
4633 This assumes the t_threads and t_max_nproc are already set.
4634 Also, we don't touch the arguments */
4635static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
4636 kmp_internal_control_t *new_icvs,
4637 ident_t *loc) {
4638 KF_TRACE(10, ("__kmp_initialize_team: enter: team=%p\n", team));
4639
4640 /* verify */
4641 KMP_DEBUG_ASSERT(team);
4642 KMP_DEBUG_ASSERT(new_nproc <= team->t.t_max_nproc);
4643 KMP_DEBUG_ASSERT(team->t.t_threads);
4644 KMP_MB();
4645
4646 team->t.t_master_tid = 0; /* not needed */
4647 /* team->t.t_master_bar; not needed */
4648 team->t.t_serialized = new_nproc > 1 ? 0 : 1;
4649 team->t.t_nproc = new_nproc;
4650
4651 /* team->t.t_parent = NULL; TODO not needed & would mess up hot team */
4652 team->t.t_next_pool = NULL;
4653 /* memset( team->t.t_threads, 0, sizeof(kmp_info_t*)*new_nproc ); would mess
4654 * up hot team */
4655
4656 TCW_SYNC_PTR(team->t.t_pkfn, NULL); /* not needed */
4657 team->t.t_invoke = NULL; /* not needed */
4658
4659 // TODO???: team->t.t_max_active_levels = new_max_active_levels;
4660 team->t.t_sched.sched = new_icvs->sched.sched;
4661
4662#if KMP_ARCH_X86 || KMP_ARCH_X86_64
4663 team->t.t_fp_control_saved = FALSE; /* not needed */
4664 team->t.t_x87_fpu_control_word = 0; /* not needed */
4665 team->t.t_mxcsr = 0; /* not needed */
4666#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
4667
4668 team->t.t_construct = 0;
4669
4670 team->t.t_ordered.dt.t_value = 0;
4671 team->t.t_master_active = FALSE;
4672
4673#ifdef KMP_DEBUG
4674 team->t.t_copypriv_data = NULL; /* not necessary, but nice for debugging */
4675#endif
4676#if KMP_OS_WINDOWS
4677 team->t.t_copyin_counter = 0; /* for barrier-free copyin implementation */
4678#endif
4679
4680 team->t.t_control_stack_top = NULL;
4681
4682 __kmp_reinitialize_team(team, new_icvs, loc);
4683
4684 KMP_MB();
4685 KF_TRACE(10, ("__kmp_initialize_team: exit: team=%p\n", team));
4686}
4687
4688#if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
4689/* Sets full mask for thread and returns old mask, no changes to structures. */
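// Typical usage in this file (see __kmp_allocate_team below): the caller saves
// the previous affinity into old_mask, creates the worker threads (which then
// inherit the temporarily widened full mask), and afterwards restores and
// frees the saved mask via __kmp_set_system_affinity() and KMP_CPU_FREE().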
4690static void
4691__kmp_set_thread_affinity_mask_full_tmp(kmp_affin_mask_t *old_mask) {
4692 if (KMP_AFFINITY_CAPABLE()) {
4693 int status;
4694 if (old_mask != NULL) {
4695 status = __kmp_get_system_affinity(old_mask, TRUE);
4696 int error = errno;
4697 if (status != 0) {
4698 __kmp_fatal(KMP_MSG(ChangeThreadAffMaskError), KMP_ERR(error),
4699 __kmp_msg_null);
4700 }
4701 }
4702 __kmp_set_system_affinity(__kmp_affin_fullMask, TRUE);
4703 }
4704}
4705#endif
4706
4707#if KMP_AFFINITY_SUPPORTED
4708
4709// __kmp_partition_places() is the heart of the OpenMP 4.0 affinity mechanism.
4710// It calculates the worker + primary thread's partition based upon the parent
4711 // thread's partition, and binds each worker to a place in its partition.
4712// The primary thread's partition should already include its current binding.
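// A minimal usage sketch (assuming affinity is enabled, e.g. OMP_PLACES=cores
// on a machine exposing eight places): a program such as
//
//   #pragma omp parallel num_threads(4) proc_bind(spread)
//   { /* parallel work */ }
//
// reaches this routine with proc_bind_spread; each of the four threads then
// receives a disjoint two-place sub-partition of the primary thread's place
// partition and is bound to the first place of that sub-partition.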
4713static void __kmp_partition_places(kmp_team_t *team, int update_master_only) {
4714 // Do not partition places for the hidden helper team
4715 if (KMP_HIDDEN_HELPER_TEAM(team))
4716 return;
4717 // Copy the primary thread's place partition to the team struct
4718 kmp_info_t *master_th = team->t.t_threads[0];
4719 KMP_DEBUG_ASSERT(master_th != NULL);
4720 kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
4721 int first_place = master_th->th.th_first_place;
4722 int last_place = master_th->th.th_last_place;
4723 int masters_place = master_th->th.th_current_place;
4724 team->t.t_first_place = first_place;
4725 team->t.t_last_place = last_place;
4726
4727 KA_TRACE(20, ("__kmp_partition_places: enter: proc_bind = %d T#%d(%d:0) "
4728 "bound to place %d partition = [%d,%d]\n",
4729 proc_bind, __kmp_gtid_from_thread(team->t.t_threads[0]),
4730 team->t.t_id, masters_place, first_place, last_place));
4731
4732 switch (proc_bind) {
4733
4734 case proc_bind_default:
4735 // Serial teams might have the proc_bind policy set to proc_bind_default.
4736 // Not an issue -- we don't rebind primary thread for any proc_bind policy.
4737 KMP_DEBUG_ASSERT(team->t.t_nproc == 1);
4738 break;
4739
4740 case proc_bind_primary: {
4741 int f;
4742 int n_th = team->t.t_nproc;
4743 for (f = 1; f < n_th; f++) {
4744 kmp_info_t *th = team->t.t_threads[f];
4745 KMP_DEBUG_ASSERT(th != NULL);
4746 th->th.th_first_place = first_place;
4747 th->th.th_last_place = last_place;
4748 th->th.th_new_place = masters_place;
4749 if (__kmp_display_affinity && masters_place != th->th.th_current_place &&
4750 team->t.t_display_affinity != 1) {
4751 team->t.t_display_affinity = 1;
4752 }
4753
4754 KA_TRACE(100, ("__kmp_partition_places: primary: T#%d(%d:%d) place %d "
4755 "partition = [%d,%d]\n",
4756 __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
4757 f, masters_place, first_place, last_place));
4758 }
4759 } break;
4760
4761 case proc_bind_close: {
4762 int f;
4763 int n_th = team->t.t_nproc;
4764 int n_places;
4765 if (first_place <= last_place) {
4766 n_places = last_place - first_place + 1;
4767 } else {
4768 n_places = __kmp_affinity_num_masks - first_place + last_place + 1;
4769 }
4770 if (n_th <= n_places) {
4771 int place = masters_place;
4772 for (f = 1; f < n_th; f++) {
4773 kmp_info_t *th = team->t.t_threads[f];
4774 KMP_DEBUG_ASSERT(th != NULL);
4775
4776 if (place == last_place) {
4777 place = first_place;
4778 } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4779 place = 0;
4780 } else {
4781 place++;
4782 }
4783 th->th.th_first_place = first_place;
4784 th->th.th_last_place = last_place;
4785 th->th.th_new_place = place;
4786 if (__kmp_display_affinity && place != th->th.th_current_place &&
4787 team->t.t_display_affinity != 1) {
4788 team->t.t_display_affinity = 1;
4789 }
4790
4791 KA_TRACE(100, ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4792 "partition = [%d,%d]\n",
4793 __kmp_gtid_from_thread(team->t.t_threads[f]),
4794 team->t.t_id, f, place, first_place, last_place));
4795 }
4796 } else {
4797 int S, rem, gap, s_count;
4798 S = n_th / n_places;
4799 s_count = 0;
4800 rem = n_th - (S * n_places);
4801 gap = rem > 0 ? n_places / rem : n_places;
4802 int place = masters_place;
4803 int gap_ct = gap;
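// Worked example (illustrative values only): with n_th = 10 threads and
// n_places = 4 places, S = 2, rem = 2 and gap = 2, so the loop below puts
// 3, 2, 3, 2 threads onto the four places (starting from the primary
// thread's place); every "gap"-th place absorbs one of the "rem" leftover
// threads.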
4804 for (f = 0; f < n_th; f++) {
4805 kmp_info_t *th = team->t.t_threads[f];
4806 KMP_DEBUG_ASSERT(th != NULL);
4807
4808 th->th.th_first_place = first_place;
4809 th->th.th_last_place = last_place;
4810 th->th.th_new_place = place;
4811 if (__kmp_display_affinity && place != th->th.th_current_place &&
4812 team->t.t_display_affinity != 1) {
4813 team->t.t_display_affinity = 1;
4814 }
4815 s_count++;
4816
4817 if ((s_count == S) && rem && (gap_ct == gap)) {
4818 // do nothing; an extra thread is added to this place on the next iteration
4819 } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
4820 // we added an extra thread to this place; move to next place
4821 if (place == last_place) {
4822 place = first_place;
4823 } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4824 place = 0;
4825 } else {
4826 place++;
4827 }
4828 s_count = 0;
4829 gap_ct = 1;
4830 rem--;
4831 } else if (s_count == S) { // place full; don't add extra
4832 if (place == last_place) {
4833 place = first_place;
4834 } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4835 place = 0;
4836 } else {
4837 place++;
4838 }
4839 gap_ct++;
4840 s_count = 0;
4841 }
4842
4843 KA_TRACE(100,
4844 ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4845 "partition = [%d,%d]\n",
4846 __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id, f,
4847 th->th.th_new_place, first_place, last_place));
4848 }
4849 KMP_DEBUG_ASSERT(place == masters_place);
4850 }
4851 } break;
4852
4853 case proc_bind_spread: {
4854 int f;
4855 int n_th = team->t.t_nproc;
4856 int n_places;
4857 int thidx;
4858 if (first_place <= last_place) {
4859 n_places = last_place - first_place + 1;
4860 } else {
4861 n_places = __kmp_affinity_num_masks - first_place + last_place + 1;
4862 }
4863 if (n_th <= n_places) {
4864 int place = -1;
4865
4866 if (n_places != static_cast<int>(__kmp_affinity_num_masks)) {
4867 int S = n_places / n_th;
4868 int s_count, rem, gap, gap_ct;
4869
4870 place = masters_place;
4871 rem = n_places - n_th * S;
4872 gap = rem ? n_th / rem : 1;
4873 gap_ct = gap;
4874 thidx = n_th;
4875 if (update_master_only == 1)
4876 thidx = 1;
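// Worked example (illustrative values only): with n_th = 4 threads and a
// primary partition of n_places = 10 places, S = 2, rem = 2 and gap = 2, so
// the loop below gives the threads sub-partitions of 3, 2, 3, 2 places;
// every "gap"-th thread absorbs one of the "rem" leftover places.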
4877 for (f = 0; f < thidx; f++) {
4878 kmp_info_t *th = team->t.t_threads[f];
4879 KMP_DEBUG_ASSERT(th != NULL);
4880
4881 th->th.th_first_place = place;
4882 th->th.th_new_place = place;
4883 if (__kmp_display_affinity && place != th->th.th_current_place &&
4884 team->t.t_display_affinity != 1) {
4885 team->t.t_display_affinity = 1;
4886 }
4887 s_count = 1;
4888 while (s_count < S) {
4889 if (place == last_place) {
4890 place = first_place;
4891 } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4892 place = 0;
4893 } else {
4894 place++;
4895 }
4896 s_count++;
4897 }
4898 if (rem && (gap_ct == gap)) {
4899 if (place == last_place) {
4900 place = first_place;
4901 } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4902 place = 0;
4903 } else {
4904 place++;
4905 }
4906 rem--;
4907 gap_ct = 0;
4908 }
4909 th->th.th_last_place = place;
4910 gap_ct++;
4911
4912 if (place == last_place) {
4913 place = first_place;
4914 } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4915 place = 0;
4916 } else {
4917 place++;
4918 }
4919
4920 KA_TRACE(100,
4921 ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4922 "partition = [%d,%d], __kmp_affinity_num_masks: %u\n",
4923 __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
4924 f, th->th.th_new_place, th->th.th_first_place,
4925 th->th.th_last_place, __kmp_affinity_num_masks));
4926 }
4927 } else {
4928 /* Given a uniform space of available computation places, we can create
4929 T partitions of roughly P/T places each and put each thread into the
4930 first place of its partition. */
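// Worked example (illustrative values only): with masters_place = 0,
// n_places = 8 and n_th = 4, spacing = 9/4 = 2.25, so the loop below yields
// the partitions [0,1], [2,3], [4,5] and [6,7], with each thread bound to
// the first place of its partition.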
4931 double current = static_cast<double>(masters_place);
4932 double spacing =
4933 (static_cast<double>(n_places + 1) / static_cast<double>(n_th));
4934 int first, last;
4935 kmp_info_t *th;
4936
4937 thidx = n_th + 1;
4938 if (update_master_only == 1)
4939 thidx = 1;
4940 for (f = 0; f < thidx; f++) {
4941 first = static_cast<int>(current);
4942 last = static_cast<int>(current + spacing) - 1;
4943 KMP_DEBUG_ASSERT(last >= first);
4944 if (first >= n_places) {
4945 if (masters_place) {
4946 first -= n_places;
4947 last -= n_places;
4948 if (first == (masters_place + 1)) {
4949 KMP_DEBUG_ASSERT(f == n_th);
4950 first--;
4951 }
4952 if (last == masters_place) {
4953 KMP_DEBUG_ASSERT(f == (n_th - 1));
4954 last--;
4955 }
4956 } else {
4957 KMP_DEBUG_ASSERT(f == n_th);
4958 first = 0;
4959 last = 0;
4960 }
4961 }
4962 if (last >= n_places) {
4963 last = (n_places - 1);
4964 }
4965 place = first;
4966 current += spacing;
4967 if (f < n_th) {
4968 KMP_DEBUG_ASSERT(0 <= first);
4969 KMP_DEBUG_ASSERT(n_places > first);
4970 KMP_DEBUG_ASSERT(0 <= last);
4971 KMP_DEBUG_ASSERT(n_places > last);
4972 KMP_DEBUG_ASSERT(last_place >= first_place);
4973 th = team->t.t_threads[f];
4974 KMP_DEBUG_ASSERT(th);
4975 th->th.th_first_place = first;
4976 th->th.th_new_place = place;
4977 th->th.th_last_place = last;
4978 if (__kmp_display_affinity && place != th->th.th_current_place &&
4979 team->t.t_display_affinity != 1) {
4980 team->t.t_display_affinity = 1;
4981 }
4982 KA_TRACE(100,
4983 ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4984 "partition = [%d,%d], spacing = %.4f\n",
4985 __kmp_gtid_from_thread(team->t.t_threads[f]),
4986 team->t.t_id, f, th->th.th_new_place,
4987 th->th.th_first_place, th->th.th_last_place, spacing));
4988 }
4989 }
4990 }
4991 KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
4992 } else {
4993 int S, rem, gap, s_count;
4994 S = n_th / n_places;
4995 s_count = 0;
4996 rem = n_th - (S * n_places);
4997 gap = rem > 0 ? n_places / rem : n_places;
4998 int place = masters_place;
4999 int gap_ct = gap;
5000 thidx = n_th;
5001 if (update_master_only == 1)
5002 thidx = 1;
5003 for (f = 0; f < thidx; f++) {
5004 kmp_info_t *th = team->t.t_threads[f];
5005 KMP_DEBUG_ASSERT(th != NULL);
5006
5007 th->th.th_first_place = place;
5008 th->th.th_last_place = place;
5009 th->th.th_new_place = place;
5010 if (__kmp_display_affinity && place != th->th.th_current_place &&
5011 team->t.t_display_affinity != 1) {
5012 team->t.t_display_affinity = 1;
5013 }
5014 s_count++;
5015
5016 if ((s_count == S) && rem && (gap_ct == gap)) {
5017 // do nothing; an extra thread is added to this place on the next iteration
5018 } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
5019 // we added an extra thread to this place; move on to next place
5020 if (place == last_place) {
5021 place = first_place;
5022 } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
5023 place = 0;
5024 } else {
5025 place++;
5026 }
5027 s_count = 0;
5028 gap_ct = 1;
5029 rem--;
5030 } else if (s_count == S) { // place is full; don't add extra thread
5031 if (place == last_place) {
5032 place = first_place;
5033 } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
5034 place = 0;
5035 } else {
5036 place++;
5037 }
5038 gap_ct++;
5039 s_count = 0;
5040 }
5041
5042 KA_TRACE(100, ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
5043 "partition = [%d,%d]\n",
5044 __kmp_gtid_from_thread(team->t.t_threads[f]),
5045 team->t.t_id, f, th->th.th_new_place,
5046 th->th.th_first_place, th->th.th_last_place));
5047 }
5048 KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
5049 }
5050 } break;
5051
5052 default:
5053 break;
5054 }
5055
5056 KA_TRACE(20, ("__kmp_partition_places: exit T#%d\n", team->t.t_id));
5057}
5058
5059#endif // KMP_AFFINITY_SUPPORTED
5060
5061/* allocate a new team data structure to use. take one off of the free pool if
5062 available */
5063kmp_team_t *
5064__kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc,
5065#if OMPT_SUPPORT
5066 ompt_data_t ompt_parallel_data,
5067#endif
5068 kmp_proc_bind_t new_proc_bind,
5069 kmp_internal_control_t *new_icvs,
5070 int argc USE_NESTED_HOT_ARG(kmp_info_t *master)) {
5071 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_allocate_team);
5072 int f;
5073 kmp_team_t *team;
5074 int use_hot_team = !root->r.r_active;
5075 int level = 0;
5076 int do_place_partition = 1;
5077
5078 KA_TRACE(20, ("__kmp_allocate_team: called\n"));
5079 KMP_DEBUG_ASSERT(new_nproc >= 1 && argc >= 0);
5080 KMP_DEBUG_ASSERT(max_nproc >= new_nproc);
5081 KMP_MB();
5082
5083#if KMP_NESTED_HOT_TEAMS
5084 kmp_hot_team_ptr_t *hot_teams;
5085 if (master) {
5086 team = master->th.th_team;
5087 level = team->t.t_active_level;
5088 if (master->th.th_teams_microtask) { // in teams construct?
5089 if (master->th.th_teams_size.nteams > 1 &&
5090 ( // #teams > 1
5091 team->t.t_pkfn ==
5092 (microtask_t)__kmp_teams_master || // inner fork of the teams
5093 master->th.th_teams_level <
5094 team->t.t_level)) { // or nested parallel inside the teams
5095 ++level; // do not increment if #teams==1 or for the outer fork of the
5096 // teams; increment otherwise
5097 }
5098 // Do not perform the place partition for the inner fork of the teams;
5099 // wait until a nested parallel region is encountered inside the teams construct
5100 if ((master->th.th_teams_size.nteams == 1 &&
5101 master->th.th_teams_level >= team->t.t_level) ||
5102 (team->t.t_pkfn == (microtask_t)__kmp_teams_master))
5103 do_place_partition = 0;
5104 }
5105 hot_teams = master->th.th_hot_teams;
5106 if (level < __kmp_hot_teams_max_level && hot_teams &&
5107 hot_teams[level].hot_team) {
5108 // hot team has already been allocated for given level
5109 use_hot_team = 1;
5110 } else {
5111 use_hot_team = 0;
5112 }
5113 } else {
5114 // check we won't access uninitialized hot_teams, just in case
5115 KMP_DEBUG_ASSERT(new_nproc == 1);
5116 }
5117#endif
5118 // Optimization to use a "hot" team
5119 if (use_hot_team && new_nproc > 1) {
5120 KMP_DEBUG_ASSERT(new_nproc <= max_nproc);
5121#if KMP_NESTED_HOT_TEAMS
5122 team = hot_teams[level].hot_team;
5123#else
5124 team = root->r.r_hot_team;
5125#endif
5126#if KMP_DEBUG
5127 if (__kmp_tasking_mode != tskm_immediate_exec) {
5128 KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
5129 "task_team[1] = %p before reinit\n",
5130 team->t.t_task_team[0], team->t.t_task_team[1]));
5131 }
5132#endif
5133
5134 if (team->t.t_nproc != new_nproc &&
5135 __kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5136 // Distributed barrier may need a resize
5137 int old_nthr = team->t.t_nproc;
5138 __kmp_resize_dist_barrier(team, old_nthr, new_nproc);
5139 }
5140
5141 // If not doing the place partition, then reset the team's proc bind
5142 // to indicate that partitioning of all threads still needs to take place
5143 if (do_place_partition == 0)
5144 team->t.t_proc_bind = proc_bind_default;
5145 // Has the number of threads changed?
5146 /* Let's assume the most common case is that the number of threads is
5147 unchanged, and put that case first. */
5148 if (team->t.t_nproc == new_nproc) { // Check changes in number of threads
5149 KA_TRACE(20, ("__kmp_allocate_team: reusing hot team\n"));
5150 // This case can mean that omp_set_num_threads() was called and the hot
5151 // team size was already reduced, so we check the special flag
5152 if (team->t.t_size_changed == -1) {
5153 team->t.t_size_changed = 1;
5154 } else {
5155 KMP_CHECK_UPDATE(team->t.t_size_changed, 0);
5156 }
5157
5158 // TODO???: team->t.t_max_active_levels = new_max_active_levels;
5159 kmp_r_sched_t new_sched = new_icvs->sched;
5160 // set primary thread's schedule as new run-time schedule
5161 KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
5162
5163 __kmp_reinitialize_team(team, new_icvs,
5164 root->r.r_uber_thread->th.th_ident);
5165
5166 KF_TRACE(10, ("__kmp_allocate_team2: T#%d, this_thread=%p team=%p\n", 0,
5167 team->t.t_threads[0], team));
5168 __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
5169
5170#if KMP_AFFINITY_SUPPORTED
5171 if ((team->t.t_size_changed == 0) &&
5172 (team->t.t_proc_bind == new_proc_bind)) {
5173 if (new_proc_bind == proc_bind_spread) {
5174 if (do_place_partition) {
5175 // add flag to update only master for spread
5176 __kmp_partition_places(team, 1);
5177 }
5178 }
5179 KA_TRACE(200, ("__kmp_allocate_team: reusing hot team #%d bindings: "
5180 "proc_bind = %d, partition = [%d,%d]\n",
5181 team->t.t_id, new_proc_bind, team->t.t_first_place,
5182 team->t.t_last_place));
5183 } else {
5184 if (do_place_partition) {
5185 KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5186 __kmp_partition_places(team);
5187 }
5188 }
5189#else
5190 KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5191#endif /* KMP_AFFINITY_SUPPORTED */
5192 } else if (team->t.t_nproc > new_nproc) {
5193 KA_TRACE(20,
5194 ("__kmp_allocate_team: decreasing hot team thread count to %d\n",
5195 new_nproc));
5196
5197 team->t.t_size_changed = 1;
5198 if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5199 // Barrier size already reduced earlier in this function
5200 // Activate team threads via th_used_in_team
5201 __kmp_add_threads_to_team(team, new_nproc);
5202 }
5203#if KMP_NESTED_HOT_TEAMS
5204 if (__kmp_hot_teams_mode == 0) {
5205 // AC: the saved number of threads should correspond to the team's value in
5206 // this mode; it can be bigger in mode 1, when the hot team keeps threads in reserve
5207 KMP_DEBUG_ASSERT(hot_teams[level].hot_team_nth == team->t.t_nproc);
5208 hot_teams[level].hot_team_nth = new_nproc;
5209#endif // KMP_NESTED_HOT_TEAMS
5210 /* release the extra threads we don't need any more */
5211 for (f = new_nproc; f < team->t.t_nproc; f++) {
5212 KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5213 if (__kmp_tasking_mode != tskm_immediate_exec) {
5214 // When decreasing team size, threads no longer in the team should
5215 // unref task team.
5216 team->t.t_threads[f]->th.th_task_team = NULL;
5217 }
5218 __kmp_free_thread(team->t.t_threads[f]);
5219 team->t.t_threads[f] = NULL;
5220 }
5221#if KMP_NESTED_HOT_TEAMS
5222 } // (__kmp_hot_teams_mode == 0)
5223 else {
5224 // When keeping extra threads in team, switch threads to wait on own
5225 // b_go flag
5226 for (f = new_nproc; f < team->t.t_nproc; ++f) {
5227 KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5228 kmp_balign_t *balign = team->t.t_threads[f]->th.th_bar;
5229 for (int b = 0; b < bs_last_barrier; ++b) {
5230 if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG) {
5231 balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5232 }
5233 KMP_CHECK_UPDATE(balign[b].bb.leaf_kids, 0);
5234 }
5235 }
5236 }
5237#endif // KMP_NESTED_HOT_TEAMS
5238 team->t.t_nproc = new_nproc;
5239 // TODO???: team->t.t_max_active_levels = new_max_active_levels;
5240 KMP_CHECK_UPDATE(team->t.t_sched.sched, new_icvs->sched.sched);
5241 __kmp_reinitialize_team(team, new_icvs,
5242 root->r.r_uber_thread->th.th_ident);
5243
5244 // Update remaining threads
5245 for (f = 0; f < new_nproc; ++f) {
5246 team->t.t_threads[f]->th.th_team_nproc = new_nproc;
5247 }
5248
5249 // restore the current task state of the primary thread: should be the
5250 // implicit task
5251 KF_TRACE(10, ("__kmp_allocate_team: T#%d, this_thread=%p team=%p\n", 0,
5252 team->t.t_threads[0], team));
5253
5254 __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
5255
5256#ifdef KMP_DEBUG
5257 for (f = 0; f < team->t.t_nproc; f++) {
5258 KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5259 team->t.t_threads[f]->th.th_team_nproc ==
5260 team->t.t_nproc);
5261 }
5262#endif
5263
5264 if (do_place_partition) {
5265 KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5266#if KMP_AFFINITY_SUPPORTED
5267 __kmp_partition_places(team);
5268#endif
5269 }
5270 } else { // team->t.t_nproc < new_nproc
5271#if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
5272 kmp_affin_mask_t *old_mask;
5273 if (KMP_AFFINITY_CAPABLE()) {
5274 KMP_CPU_ALLOC(old_mask);
5275 }
5276#endif
5277
5278 KA_TRACE(20,
5279 ("__kmp_allocate_team: increasing hot team thread count to %d\n",
5280 new_nproc));
5281 int old_nproc = team->t.t_nproc; // save old value; used later to update only the new threads
5282 team->t.t_size_changed = 1;
5283
5284#if KMP_NESTED_HOT_TEAMS
5285 int avail_threads = hot_teams[level].hot_team_nth;
5286 if (new_nproc < avail_threads)
5287 avail_threads = new_nproc;
5288 kmp_info_t **other_threads = team->t.t_threads;
5289 for (f = team->t.t_nproc; f < avail_threads; ++f) {
5290 // Adjust barrier data of reserved threads (if any) of the team
5291 // Other data will be set in __kmp_initialize_info() below.
5292 int b;
5293 kmp_balign_t *balign = other_threads[f]->th.th_bar;
5294 for (b = 0; b < bs_last_barrier; ++b) {
5295 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5296 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5297#if USE_DEBUGGER
5298 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5299#endif
5300 }
5301 }
5302 if (hot_teams[level].hot_team_nth >= new_nproc) {
5303 // we have all needed threads in reserve, no need to allocate any
5304 // this is only possible in mode 1; there cannot be reserved threads in mode 0
5305 KMP_DEBUG_ASSERT(__kmp_hot_teams_mode == 1);
5306 team->t.t_nproc = new_nproc; // just get reserved threads involved
5307 } else {
5308 // We may have some threads in reserve, but not enough;
5309 // get reserved threads involved if any.
5310 team->t.t_nproc = hot_teams[level].hot_team_nth;
5311 hot_teams[level].hot_team_nth = new_nproc; // adjust hot team max size
5312#endif // KMP_NESTED_HOT_TEAMS
5313 if (team->t.t_max_nproc < new_nproc) {
5314 /* reallocate larger arrays */
5315 __kmp_reallocate_team_arrays(team, new_nproc);
5316 __kmp_reinitialize_team(team, new_icvs, NULL);
5317 }
5318
5319#if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
5320 /* Temporarily set full mask for primary thread before creation of
5321 workers. The reason is that workers inherit the affinity from the
5322 primary thread, so if a lot of workers are created on a single
5323 core quickly, they don't get a chance to set their own affinity for
5324 a long time. */
5325 __kmp_set_thread_affinity_mask_full_tmp(old_mask);
5326#endif
5327
5328 /* allocate new threads for the hot team */
5329 for (f = team->t.t_nproc; f < new_nproc; f++) {
5330 kmp_info_t *new_worker = __kmp_allocate_thread(root, team, f);
5331 KMP_DEBUG_ASSERT(new_worker);
5332 team->t.t_threads[f] = new_worker;
5333
5334 KA_TRACE(20,
5335 ("__kmp_allocate_team: team %d init T#%d arrived: "
5336 "join=%llu, plain=%llu\n",
5337 team->t.t_id, __kmp_gtid_from_tid(f, team), team->t.t_id, f,
5338 team->t.t_bar[bs_forkjoin_barrier].b_arrived,
5339 team->t.t_bar[bs_plain_barrier].b_arrived));
5340
5341 { // Initialize barrier data for new threads.
5342 int b;
5343 kmp_balign_t *balign = new_worker->th.th_bar;
5344 for (b = 0; b < bs_last_barrier; ++b) {
5345 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5346 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag !=
5347 KMP_BARRIER_PARENT_FLAG);
5348#if USE_DEBUGGER
5349 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5350#endif
5351 }
5352 }
5353 }
5354
5355#if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
5356 if (KMP_AFFINITY_CAPABLE()) {
5357 /* Restore initial primary thread's affinity mask */
5358 __kmp_set_system_affinity(old_mask, TRUE);
5359 KMP_CPU_FREE(old_mask);
5360 }
5361#endif
5362#if KMP_NESTED_HOT_TEAMS
5363 } // end of check of t_nproc vs. new_nproc vs. hot_team_nth
5364#endif // KMP_NESTED_HOT_TEAMS
5365 if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5366 // Barrier size already increased earlier in this function
5367 // Activate team threads via th_used_in_team
5368 __kmp_add_threads_to_team(team, new_nproc);
5369 }
5370 /* make sure everyone is synchronized */
5371 // the threads (including the new ones) are initialized below
5372 __kmp_initialize_team(team, new_nproc, new_icvs,
5373 root->r.r_uber_thread->th.th_ident);
5374
5375 /* reinitialize the threads */
5376 KMP_DEBUG_ASSERT(team->t.t_nproc == new_nproc);
5377 for (f = 0; f < team->t.t_nproc; ++f)
5378 __kmp_initialize_info(team->t.t_threads[f], team, f,
5379 __kmp_gtid_from_tid(f, team));
5380
5381 if (level) { // set th_task_state for new threads in nested hot team
5382 // __kmp_initialize_info() no longer zeroes th_task_state, so we should
5383 // only need to set the th_task_state for the new threads. th_task_state
5384 // for primary thread will not be accurate until after this in
5385 // __kmp_fork_call(), so we look to the primary thread's memo_stack to
5386 // get the correct value.
5387 for (f = old_nproc; f < team->t.t_nproc; ++f)
5388 team->t.t_threads[f]->th.th_task_state =
5389 team->t.t_threads[0]->th.th_task_state_memo_stack[level];
5390 } else { // set th_task_state for new threads in non-nested hot team
5391 // copy primary thread's state
5392 kmp_uint8 old_state = team->t.t_threads[0]->th.th_task_state;
5393 for (f = old_nproc; f < team->t.t_nproc; ++f)
5394 team->t.t_threads[f]->th.th_task_state = old_state;
5395 }
5396
5397#ifdef KMP_DEBUG
5398 for (f = 0; f < team->t.t_nproc; ++f) {
5399 KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5400 team->t.t_threads[f]->th.th_team_nproc ==
5401 team->t.t_nproc);
5402 }
5403#endif
5404
5405 if (do_place_partition) {
5406 KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5407#if KMP_AFFINITY_SUPPORTED
5408 __kmp_partition_places(team);
5409#endif
5410 }
5411 } // Check changes in number of threads
5412
5413 kmp_info_t *master = team->t.t_threads[0];
5414 if (master->th.th_teams_microtask) {
5415 for (f = 1; f < new_nproc; ++f) {
5416 // propagate teams construct specific info to workers
5417 kmp_info_t *thr = team->t.t_threads[f];
5418 thr->th.th_teams_microtask = master->th.th_teams_microtask;
5419 thr->th.th_teams_level = master->th.th_teams_level;
5420 thr->th.th_teams_size = master->th.th_teams_size;
5421 }
5422 }
5423#if KMP_NESTED_HOT_TEAMS
5424 if (level) {
5425 // Sync barrier state for nested hot teams, not needed for outermost hot
5426 // team.
5427 for (f = 1; f < new_nproc; ++f) {
5428 kmp_info_t *thr = team->t.t_threads[f];
5429 int b;
5430 kmp_balign_t *balign = thr->th.th_bar;
5431 for (b = 0; b < bs_last_barrier; ++b) {
5432 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5433 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5434#if USE_DEBUGGER
5435 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5436#endif
5437 }
5438 }
5439 }
5440#endif // KMP_NESTED_HOT_TEAMS
5441
5442 /* reallocate space for arguments if necessary */
5443 __kmp_alloc_argv_entries(argc, team, TRUE);
5444 KMP_CHECK_UPDATE(team->t.t_argc, argc);
5445 // The hot team re-uses the previous task team,
5446 // if untouched during the previous release->gather phase.
5447
5448 KF_TRACE(10, (" hot_team = %p\n", team));
5449
5450#if KMP_DEBUG
5451 if (__kmp_tasking_mode != tskm_immediate_exec) {
5452 KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
5453 "task_team[1] = %p after reinit\n",
5454 team->t.t_task_team[0], team->t.t_task_team[1]));
5455 }
5456#endif
5457
5458#if OMPT_SUPPORT
5459 __ompt_team_assign_id(team, ompt_parallel_data);
5460#endif
5461
5462 KMP_MB();
5463
5464 return team;
5465 }
5466
5467 /* next, let's try to take one from the team pool */
5468 KMP_MB();
5469 for (team = CCAST(kmp_team_t *, __kmp_team_pool); (team);) {
5470 /* TODO: consider resizing undersized teams instead of reaping them, now
5471 that we have a resizing mechanism */
5472 if (team->t.t_max_nproc >= max_nproc) {
5473 /* take this team from the team pool */
5474 __kmp_team_pool = team->t.t_next_pool;
5475
5476 if (max_nproc > 1 &&
5477 __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5478 if (!team->t.b) { // Allocate barrier structure
5479 team->t.b = distributedBarrier::allocate(__kmp_dflt_team_nth_ub);
5480 }
5481 }
5482
5483 /* setup the team for fresh use */
5484 __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5485
5486 KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and "
5487 "task_team[1] %p to NULL\n",
5488 &team->t.t_task_team[0], &team->t.t_task_team[1]));
5489 team->t.t_task_team[0] = NULL;
5490 team->t.t_task_team[1] = NULL;
5491
5492 /* reallocate space for arguments if necessary */
5493 __kmp_alloc_argv_entries(argc, team, TRUE);
5494 KMP_CHECK_UPDATE(team->t.t_argc, argc);
5495
5496 KA_TRACE(
5497 20, ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5498 team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5499 { // Initialize barrier data.
5500 int b;
5501 for (b = 0; b < bs_last_barrier; ++b) {
5502 team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5503#if USE_DEBUGGER
5504 team->t.t_bar[b].b_master_arrived = 0;
5505 team->t.t_bar[b].b_team_arrived = 0;
5506#endif
5507 }
5508 }
5509
5510 team->t.t_proc_bind = new_proc_bind;
5511
5512 KA_TRACE(20, ("__kmp_allocate_team: using team from pool %d.\n",
5513 team->t.t_id));
5514
5515#if OMPT_SUPPORT
5516 __ompt_team_assign_id(team, ompt_parallel_data);
5517#endif
5518
5519 KMP_MB();
5520
5521 return team;
5522 }
5523
5524 /* reap team if it is too small, then loop back and check the next one */
5525 // not sure if this is wise, but it will be redone during the hot-teams
5526 // rewrite.
5527 /* TODO: Use technique to find the right size hot-team, don't reap them */
5528 team = __kmp_reap_team(team);
5529 __kmp_team_pool = team;
5530 }
5531
5532 /* nothing available in the pool, no matter, make a new team! */
5533 KMP_MB();
5534 team = (kmp_team_t *)__kmp_allocate(sizeof(kmp_team_t));
5535
5536 /* and set it up */
5537 team->t.t_max_nproc = max_nproc;
5538 if (max_nproc > 1 &&
5539 __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5540 // Allocate barrier structure
5541 team->t.b = distributedBarrier::allocate(__kmp_dflt_team_nth_ub);
5542 }
5543
5544 /* NOTE: for some reason, allocating one big buffer and dividing it up
5545 seems to hurt performance a lot on the P4, so let's not use this approach */
5546 __kmp_allocate_team_arrays(team, max_nproc);
5547
5548 KA_TRACE(20, ("__kmp_allocate_team: making a new team\n"));
5549 __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5550
5551 KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and task_team[1] "
5552 "%p to NULL\n",
5553 &team->t.t_task_team[0], &team->t.t_task_team[1]));
5554 team->t.t_task_team[0] = NULL; // to be removed, as __kmp_allocate zeroes
5555 // memory, no need to duplicate
5556 team->t.t_task_team[1] = NULL; // to be removed, as __kmp_allocate zeroes
5557 // memory, no need to duplicate
5558
5559 if (__kmp_storage_map) {
5560 __kmp_print_team_storage_map("team", team, team->t.t_id, new_nproc);
5561 }
5562
5563 /* allocate space for arguments */
5564 __kmp_alloc_argv_entries(argc, team, FALSE);
5565 team->t.t_argc = argc;
5566
5567 KA_TRACE(20,
5568 ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5569 team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5570 { // Initialize barrier data.
5571 int b;
5572 for (b = 0; b < bs_last_barrier; ++b) {
5573 team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5574#if USE_DEBUGGER
5575 team->t.t_bar[b].b_master_arrived = 0;
5576 team->t.t_bar[b].b_team_arrived = 0;
5577#endif
5578 }
5579 }
5580
5581 team->t.t_proc_bind = new_proc_bind;
5582
5583#if OMPT_SUPPORT
5584 __ompt_team_assign_id(team, ompt_parallel_data);
5585 team->t.ompt_serialized_team_info = NULL;
5586#endif
5587
5588 KMP_MB();
5589
5590 KA_TRACE(20, ("__kmp_allocate_team: done creating a new team %d.\n",
5591 team->t.t_id));
5592
5593 return team;
5594}
5595
5596/* TODO implement hot-teams at all levels */
5597/* TODO implement lazy thread release on demand (disband request) */
5598
5599/* free the team. return it to the team pool. release all the threads
5600 * associated with it */
5601void __kmp_free_team(kmp_root_t *root,
5602 kmp_team_t *team USE_NESTED_HOT_ARG(kmp_info_t *master)) {
5603 int f;
5604 KA_TRACE(20, ("__kmp_free_team: T#%d freeing team %d\n", __kmp_get_gtid(),
5605 team->t.t_id));
5606
5607 /* verify state */
5608 KMP_DEBUG_ASSERT(root);
5609 KMP_DEBUG_ASSERT(team);
5610 KMP_DEBUG_ASSERT(team->t.t_nproc <= team->t.t_max_nproc);
5611 KMP_DEBUG_ASSERT(team->t.t_threads);
5612
5613 int use_hot_team = team == root->r.r_hot_team;
5614#if KMP_NESTED_HOT_TEAMS
5615 int level;
5616 if (master) {
5617 level = team->t.t_active_level - 1;
5618 if (master->th.th_teams_microtask) { // in teams construct?
5619 if (master->th.th_teams_size.nteams > 1) {
5620 ++level; // level was not increased in teams construct for
5621 // team_of_masters
5622 }
5623 if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
5624 master->th.th_teams_level == team->t.t_level) {
5625 ++level; // level was not increased in teams construct for
5626 // team_of_workers before the parallel
5627 } // team->t.t_level will be increased inside parallel
5628 }
5629#if KMP_DEBUG
5630 kmp_hot_team_ptr_t *hot_teams = master->th.th_hot_teams;
5631#endif
5632 if (level < __kmp_hot_teams_max_level) {
5633 KMP_DEBUG_ASSERT(team == hot_teams[level].hot_team);
5634 use_hot_team = 1;
5635 }
5636 }
5637#endif // KMP_NESTED_HOT_TEAMS
5638
5639 /* team is done working */
5640 TCW_SYNC_PTR(team->t.t_pkfn,
5641 NULL); // Important for Debugging Support Library.
5642#if KMP_OS_WINDOWS
5643 team->t.t_copyin_counter = 0; // init counter for possible reuse
5644#endif
5645 // Do not reset pointer to parent team to NULL for hot teams.
5646
5647 /* if this is not a hot team, release its threads */
5648 if (!use_hot_team) {
5649 if (__kmp_tasking_mode != tskm_immediate_exec) {
5650 // Wait for threads to reach reapable state
5651 for (f = 1; f < team->t.t_nproc; ++f) {
5652 KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5653 kmp_info_t *th = team->t.t_threads[f];
5654 volatile kmp_uint32 *state = &th->th.th_reap_state;
5655 while (*state != KMP_SAFE_TO_REAP) {
5656#if KMP_OS_WINDOWS
5657 // On Windows a thread can be killed at any time, check this
5658 DWORD ecode;
5659 if (!__kmp_is_thread_alive(th, &ecode)) {
5660 *state = KMP_SAFE_TO_REAP; // reset the flag for dead thread
5661 break;
5662 }
5663#endif
5664 // first check if thread is sleeping
5665 kmp_flag_64<> fl(&th->th.th_bar[bs_forkjoin_barrier].bb.b_go, th);
5666 if (fl.is_sleeping())
5667 fl.resume(__kmp_gtid_from_thread(th));
5668 KMP_CPU_PAUSE();
5669 }
5670 }
5671
5672 // Delete task teams
5673 int tt_idx;
5674 for (tt_idx = 0; tt_idx < 2; ++tt_idx) {
5675 kmp_task_team_t *task_team = team->t.t_task_team[tt_idx];
5676 if (task_team != NULL) {
5677 for (f = 0; f < team->t.t_nproc; ++f) { // threads unref task teams
5678 KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5679 team->t.t_threads[f]->th.th_task_team = NULL;
5680 }
5681 KA_TRACE(
5682 20,
5683 ("__kmp_free_team: T#%d deactivating task_team %p on team %d\n",
5684 __kmp_get_gtid(), task_team, team->t.t_id));
5685#if KMP_NESTED_HOT_TEAMS
5686 __kmp_free_task_team(master, task_team);
5687#endif
5688 team->t.t_task_team[tt_idx] = NULL;
5689 }
5690 }
5691 }
5692
5693 // Reset pointer to parent team only for non-hot teams.
5694 team->t.t_parent = NULL;
5695 team->t.t_level = 0;
5696 team->t.t_active_level = 0;
5697
5698 /* free the worker threads */
5699 for (f = 1; f < team->t.t_nproc; ++f) {
5700 KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5701 if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5702 KMP_COMPARE_AND_STORE_ACQ32(&(team->t.t_threads[f]->th.th_used_in_team),
5703 1, 2);
5704 }
5705 __kmp_free_thread(team->t.t_threads[f]);
5706 }
5707
5708 if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5709 if (team->t.b) {
5710 // wake up thread at old location
5711 team->t.b->go_release();
5712 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
5713 for (f = 1; f < team->t.t_nproc; ++f) {
5714 if (team->t.b->sleep[f].sleep) {
5715 __kmp_atomic_resume_64(
5716 team->t.t_threads[f]->th.th_info.ds.ds_gtid,
5717 (kmp_atomic_flag_64<> *)NULL);
5718 }
5719 }
5720 }
5721 // Wait for threads to be removed from team
5722 for (int f = 1; f < team->t.t_nproc; ++f) {
5723 while (team->t.t_threads[f]->th.th_used_in_team.load() != 0)
5724 KMP_CPU_PAUSE();
5725 }
5726 }
5727 }
5728
5729 for (f = 1; f < team->t.t_nproc; ++f) {
5730 team->t.t_threads[f] = NULL;
5731 }
5732
5733 if (team->t.t_max_nproc > 1 &&
5734 __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5735 distributedBarrier::deallocate(team->t.b);
5736 team->t.b = NULL;
5737 }
5738 /* put the team back in the team pool */
5739 /* TODO limit size of team pool, call reap_team if pool too large */
5740 team->t.t_next_pool = CCAST(kmp_team_t *, __kmp_team_pool);
5741 __kmp_team_pool = (volatile kmp_team_t *)team;
5742 } else { // Check if team was created for primary threads in teams construct
5743 // See if first worker is a CG root
5744 KMP_DEBUG_ASSERT(team->t.t_threads[1] &&
5745 team->t.t_threads[1]->th.th_cg_roots);
5746 if (team->t.t_threads[1]->th.th_cg_roots->cg_root == team->t.t_threads[1]) {
5747 // Clean up the CG root nodes on workers so that this team can be re-used
5748 for (f = 1; f < team->t.t_nproc; ++f) {
5749 kmp_info_t *thr = team->t.t_threads[f];
5750 KMP_DEBUG_ASSERT(thr && thr->th.th_cg_roots &&
5751 thr->th.th_cg_roots->cg_root == thr);
5752 // Pop current CG root off list
5753 kmp_cg_root_t *tmp = thr->th.th_cg_roots;
5754 thr->th.th_cg_roots = tmp->up;
5755 KA_TRACE(100, ("__kmp_free_team: Thread %p popping node %p and moving"
5756 " up to node %p. cg_nthreads was %d\n",
5757 thr, tmp, thr->th.th_cg_roots, tmp->cg_nthreads));
5758 int i = tmp->cg_nthreads--;
5759 if (i == 1) {
5760 __kmp_free(tmp); // free CG if we are the last thread in it
5761 }
5762 // Restore current task's thread_limit from CG root
5763 if (thr->th.th_cg_roots)
5764 thr->th.th_current_task->td_icvs.thread_limit =
5765 thr->th.th_cg_roots->cg_thread_limit;
5766 }
5767 }
5768 }
5769
5770 KMP_MB();
5771}
5772
5773/* reap the team. destroy it, reclaim all its resources and free its memory */
5774kmp_team_t *__kmp_reap_team(kmp_team_t *team) {
5775 kmp_team_t *next_pool = team->t.t_next_pool;
5776
5777 KMP_DEBUG_ASSERT(team);
5778 KMP_DEBUG_ASSERT(team->t.t_dispatch);
5779 KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
5780 KMP_DEBUG_ASSERT(team->t.t_threads);
5781 KMP_DEBUG_ASSERT(team->t.t_argv);
5782
5783 /* TODO clean the threads that are a part of this? */
5784
5785 /* free stuff */
5786 __kmp_free_team_arrays(team);
5787 if (team->t.t_argv != &team->t.t_inline_argv[0])
5788 __kmp_free((void *)team->t.t_argv);
5789 __kmp_free(team);
5790
5791 KMP_MB();
5792 return next_pool;
5793}
5794
5795// Free the thread. Don't reap it, just place it on the pool of available
5796// threads.
5797//
5798// Changes for Quad issue 527845: We need a predictable OMP tid <-> gtid
5799// binding for the affinity mechanism to be useful.
5800//
5801// Now, we always keep the free list (__kmp_thread_pool) sorted by gtid.
5802// However, we want to avoid a potential performance problem by always
5803// scanning through the list to find the correct point at which to insert
5804// the thread (potential N**2 behavior). To do this we keep track of the
5805// last place a thread struct was inserted (__kmp_thread_pool_insert_pt).
5806// With single-level parallelism, threads will always be added to the tail
5807// of the list, kept track of by __kmp_thread_pool_insert_pt. With nested
5808// parallelism, all bets are off and we may need to scan through the entire
5809// free list.
5810//
5811// This change also has a potentially large performance benefit, for some
5812// applications. Previously, as threads were freed from the hot team, they
5813// would be placed back on the free list in inverse order. If the hot team
5814 // grew back to its original size, then the freed threads would be placed
5815// back on the hot team in reverse order. This could cause bad cache
5816// locality problems on programs where the size of the hot team regularly
5817// grew and shrunk.
5818//
5819// Now, for single-level parallelism, the OMP tid is always == gtid.
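// For example (illustrative gtids only): if workers with gtids 2, 3 and 4 are
// freed in that order, each one is linked in right after the cached insert
// point in O(1). If gtid 3 is freed after gtid 4 (possible with nested
// parallelism), the cached insert point already lies past the new element, so
// it is reset and the list is re-scanned from the head of __kmp_thread_pool.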
5820void __kmp_free_thread(kmp_info_t *this_th) {
5821 int gtid;
5822 kmp_info_t **scan;
5823
5824 KA_TRACE(20, ("__kmp_free_thread: T#%d putting T#%d back on free pool.\n",
5825 __kmp_get_gtid(), this_th->th.th_info.ds.ds_gtid));
5826
5827 KMP_DEBUG_ASSERT(this_th);
5828
5829 // When moving a thread to the pool, switch it to wait on its own b_go flag,
5830 // and leave it with no team (NULL team pointer).
5831 int b;
5832 kmp_balign_t *balign = this_th->th.th_bar;
5833 for (b = 0; b < bs_last_barrier; ++b) {
5834 if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG)
5835 balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5836 balign[b].bb.team = NULL;
5837 balign[b].bb.leaf_kids = 0;
5838 }
5839 this_th->th.th_task_state = 0;
5840 this_th->th.th_reap_state = KMP_SAFE_TO_REAP;
5841
5842 /* put thread back on the free pool */
5843 TCW_PTR(this_th->th.th_team, NULL);
5844 TCW_PTR(this_th->th.th_root, NULL);
5845 TCW_PTR(this_th->th.th_dispatch, NULL); /* NOT NEEDED */
5846
5847 while (this_th->th.th_cg_roots) {
5848 this_th->th.th_cg_roots->cg_nthreads--;
5849 KA_TRACE(100, ("__kmp_free_thread: Thread %p decrement cg_nthreads on node"
5850 " %p of thread %p to %d\n",
5851 this_th, this_th->th.th_cg_roots,
5852 this_th->th.th_cg_roots->cg_root,
5853 this_th->th.th_cg_roots->cg_nthreads));
5854 kmp_cg_root_t *tmp = this_th->th.th_cg_roots;
5855 if (tmp->cg_root == this_th) { // Thread is a cg_root
5856 KMP_DEBUG_ASSERT(tmp->cg_nthreads == 0);
5857 KA_TRACE(
5858 5, ("__kmp_free_thread: Thread %p freeing node %p\n", this_th, tmp));
5859 this_th->th.th_cg_roots = tmp->up;
5860 __kmp_free(tmp);
5861 } else { // Worker thread
5862 if (tmp->cg_nthreads == 0) { // last thread leaves contention group
5863 __kmp_free(tmp);
5864 }
5865 this_th->th.th_cg_roots = NULL;
5866 break;
5867 }
5868 }
5869
5870 /* If the implicit task assigned to this thread can be used by other threads,
5871 * multiple threads can share the data and try to free the task in
5872 * __kmp_reap_thread at exit. This duplicate use of the task data can happen
5873 * with higher probability when the hot team is disabled, but it can occur
5874 * even when the hot team is enabled */
5875 __kmp_free_implicit_task(this_th);
5876 this_th->th.th_current_task = NULL;
5877
5878 // If the __kmp_thread_pool_insert_pt is already past the new insert
5879 // point, then we need to re-scan the entire list.
5880 gtid = this_th->th.th_info.ds.ds_gtid;
5881 if (__kmp_thread_pool_insert_pt != NULL) {
5882 KMP_DEBUG_ASSERT(__kmp_thread_pool != NULL);
5883 if (__kmp_thread_pool_insert_pt->th.th_info.ds.ds_gtid > gtid) {
5884 __kmp_thread_pool_insert_pt = NULL;
5885 }
5886 }
5887
5888 // Scan down the list to find the place to insert the thread.
5889 // scan is the address of a link in the list, possibly the address of
5890 // __kmp_thread_pool itself.
5891 //
5892 // In the absence of nested parallelism, the for loop will have 0 iterations.
5893 if (__kmp_thread_pool_insert_pt != NULL) {
5894 scan = &(__kmp_thread_pool_insert_pt->th.th_next_pool);
5895 } else {
5896 scan = CCAST(kmp_info_t **, &__kmp_thread_pool);
5897 }
5898 for (; (*scan != NULL) && ((*scan)->th.th_info.ds.ds_gtid < gtid);
5899 scan = &((*scan)->th.th_next_pool))
5900 ;
5901
5902 // Insert the new element on the list, and set __kmp_thread_pool_insert_pt
5903 // to its address.
5904 TCW_PTR(this_th->th.th_next_pool, *scan);
5905 __kmp_thread_pool_insert_pt = *scan = this_th;
5906 KMP_DEBUG_ASSERT((this_th->th.th_next_pool == NULL) ||
5907 (this_th->th.th_info.ds.ds_gtid <
5908 this_th->th.th_next_pool->th.th_info.ds.ds_gtid));
5909 TCW_4(this_th->th.th_in_pool, TRUE);
5910 __kmp_suspend_initialize_thread(this_th);
5911 __kmp_lock_suspend_mx(this_th);
5912 if (this_th->th.th_active == TRUE) {
5913 KMP_ATOMIC_INC(&__kmp_thread_pool_active_nth);
5914 this_th->th.th_active_in_pool = TRUE;
5915 }
5916#if KMP_DEBUG
5917 else {
5918 KMP_DEBUG_ASSERT(this_th->th.th_active_in_pool == FALSE);
5919 }
5920#endif
5921 __kmp_unlock_suspend_mx(this_th);
5922
5923 TCW_4(__kmp_nth, __kmp_nth - 1);
5924
5925#ifdef KMP_ADJUST_BLOCKTIME
5926 /* Adjust blocktime back to user setting or default if necessary */
5927 /* Middle initialization might never have occurred */
5928 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
5929 KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
5930 if (__kmp_nth <= __kmp_avail_proc) {
5931 __kmp_zero_bt = FALSE;
5932 }
5933 }
5934#endif /* KMP_ADJUST_BLOCKTIME */
5935
5936 KMP_MB();
5937}
5938
5939/* ------------------------------------------------------------------------ */
5940
5941void *__kmp_launch_thread(kmp_info_t *this_thr) {
5942#if OMP_PROFILING_SUPPORT
5943 ProfileTraceFile = getenv("LIBOMPTARGET_PROFILE");
5944 // TODO: add a configuration option for time granularity
5945 if (ProfileTraceFile)
5946 llvm::timeTraceProfilerInitialize(500 /* us */, "libomptarget");
5947#endif
5948
5949 int gtid = this_thr->th.th_info.ds.ds_gtid;
5950 /* void *stack_data;*/
5951 kmp_team_t **volatile pteam;
5952
5953 KMP_MB();
5954 KA_TRACE(10, ("__kmp_launch_thread: T#%d start\n", gtid));
5955
5956 if (__kmp_env_consistency_check) {
5957 this_thr->th.th_cons = __kmp_allocate_cons_stack(gtid); // ATT: Memory leak?
5958 }
5959
5960#if OMPD_SUPPORT
5961 if (ompd_state & OMPD_ENABLE_BP)
5962 ompd_bp_thread_begin();
5963#endif
5964
5965#if OMPT_SUPPORT
5966 ompt_data_t *thread_data = nullptr;
5967 if (ompt_enabled.enabled) {
5968 thread_data = &(this_thr->th.ompt_thread_info.thread_data);
5969 *thread_data = ompt_data_none;
5970
5971 this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5972 this_thr->th.ompt_thread_info.wait_id = 0;
5973 this_thr->th.ompt_thread_info.idle_frame = OMPT_GET_FRAME_ADDRESS(0);
5974 this_thr->th.ompt_thread_info.parallel_flags = 0;
5975 if (ompt_enabled.ompt_callback_thread_begin) {
5976 ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
5977 ompt_thread_worker, thread_data);
5978 }
5979 this_thr->th.ompt_thread_info.state = ompt_state_idle;
5980 }
5981#endif
5982
5983 /* This is the place where threads wait for work */
5984 while (!TCR_4(__kmp_global.g.g_done)) {
5985 KMP_DEBUG_ASSERT(this_thr == __kmp_threads[gtid]);
5986 KMP_MB();
5987
5988 /* wait for work to do */
5989 KA_TRACE(20, ("__kmp_launch_thread: T#%d waiting for work\n", gtid));
5990
5991 /* No tid yet since not part of a team */
5992 __kmp_fork_barrier(gtid, KMP_GTID_DNE);
5993
5994#if OMPT_SUPPORT
5995 if (ompt_enabled.enabled) {
5996 this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5997 }
5998#endif
5999
6000 pteam = &this_thr->th.th_team;
6001
6002 /* have we been allocated? */
6003 if (TCR_SYNC_PTR(*pteam) && !TCR_4(__kmp_global.g.g_done)) {
6004 /* we were just woken up, so run our new task */
6005 if (TCR_SYNC_PTR((*pteam)->t.t_pkfn) != NULL) {
6006 int rc;
6007 KA_TRACE(20,
6008 ("__kmp_launch_thread: T#%d(%d:%d) invoke microtask = %p\n",
6009 gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
6010 (*pteam)->t.t_pkfn));
6011
6012 updateHWFPControl(*pteam);
6013
6014#if OMPT_SUPPORT
6015 if (ompt_enabled.enabled) {
6016 this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
6017 }
6018#endif
6019
6020 rc = (*pteam)->t.t_invoke(gtid);
6021 KMP_ASSERT(rc);
6022
6023 KMP_MB();
6024 KA_TRACE(20, ("__kmp_launch_thread: T#%d(%d:%d) done microtask = %p\n",
6025 gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
6026 (*pteam)->t.t_pkfn));
6027 }
6028#if OMPT_SUPPORT
6029 if (ompt_enabled.enabled) {
6030 /* no frame set while outside task */
6031 __ompt_get_task_info_object(0)->frame.exit_frame = ompt_data_none;
6032
6033 this_thr->th.ompt_thread_info.state = ompt_state_overhead;
6034 }
6035#endif
6036 /* join barrier after parallel region */
6037 __kmp_join_barrier(gtid);
6038 }
6039 }
6040 TCR_SYNC_PTR((intptr_t)__kmp_global.g.g_done);
6041
6042#if OMPD_SUPPORT
6043 if (ompd_state & OMPD_ENABLE_BP)
6044 ompd_bp_thread_end();
6045#endif
6046
6047#if OMPT_SUPPORT
6048 if (ompt_enabled.ompt_callback_thread_end) {
6049 ompt_callbacks.ompt_callback(ompt_callback_thread_end)(thread_data);
6050 }
6051#endif
6052
6053 this_thr->th.th_task_team = NULL;
6054 /* run the destructors for the threadprivate data for this thread */
6055 __kmp_common_destroy_gtid(gtid);
6056
6057 KA_TRACE(10, ("__kmp_launch_thread: T#%d done\n", gtid));
6058 KMP_MB();
6059
6060#if OMP_PROFILING_SUPPORT
6061 llvm::timeTraceProfilerFinishThread();
6062#endif
6063 return this_thr;
6064}
6065
6066/* ------------------------------------------------------------------------ */
6067
6068void __kmp_internal_end_dest(void *specific_gtid) {
6069 // Make sure no significant bits are lost
6070 int gtid;
6071 __kmp_type_convert((kmp_intptr_t)specific_gtid - 1, &gtid);
6072
6073 KA_TRACE(30, ("__kmp_internal_end_dest: T#%d\n", gtid));
6074 /* NOTE: the gtid is stored as gtid+1 in the thread-local-storage;
6075 * this is because 0 is reserved for the nothing-stored case */
6076
6077 __kmp_internal_end_thread(gtid);
6078}
6079
6080#if KMP_OS_UNIX && KMP_DYNAMIC_LIB
6081
6082__attribute__((destructor)) void __kmp_internal_end_dtor(void) {
6083 __kmp_internal_end_atexit();
6084}
6085
6086#endif
6087
6088/* [Windows] josh: when the atexit handler is called, there may still be more
6089 than one thread alive */
6090void __kmp_internal_end_atexit(void) {
6091 KA_TRACE(30, ("__kmp_internal_end_atexit\n"));
6092 /* [Windows]
6093 josh: ideally, we want to completely shutdown the library in this atexit
6094 handler, but stat code that depends on thread specific data for gtid fails
6095 because that data becomes unavailable at some point during the shutdown, so
6096 we call __kmp_internal_end_thread instead. We should eventually remove the
6097 dependency on __kmp_get_specific_gtid in the stat code and use
6098 __kmp_internal_end_library to cleanly shutdown the library.
6099
6100 // TODO: Can some of this comment about GVS be removed?
6101 I suspect that the offending stat code is executed when the calling thread
6102 tries to clean up a dead root thread's data structures, resulting in GVS
6103 code trying to close the GVS structures for that thread, but since the stat
6104 code uses __kmp_get_specific_gtid to get the gtid with the assumption that
6105 the calling thread is cleaning up itself instead of another thread, it gets
6106 confused. This happens because allowing a thread to unregister and clean up
6107 another thread is a recent modification for addressing an issue.
6108 Based on the current design (20050722), a thread may end up
6109 trying to unregister another thread only if thread death does not trigger
6110 the calling of __kmp_internal_end_thread. For Linux* OS, there is the
6111 thread specific data destructor function to detect thread death. For
6112 Windows dynamic, there is DllMain(THREAD_DETACH). For Windows static, there
6113 is nothing. Thus, the workaround is applicable only to the Windows static
6114 stat library. */
6115 __kmp_internal_end_library(-1);
6116#if KMP_OS_WINDOWS
6117 __kmp_close_console();
6118#endif
6119}
6120
6121static void __kmp_reap_thread(kmp_info_t *thread, int is_root) {
6122 // It is assumed __kmp_forkjoin_lock is acquired.
6123
6124 int gtid;
6125
6126 KMP_DEBUG_ASSERT(thread != NULL);
6127
6128 gtid = thread->th.th_info.ds.ds_gtid;
6129
6130 if (!is_root) {
6131 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
6132 /* Assume the threads are at the fork barrier here */
6133 KA_TRACE(
6134 20, ("__kmp_reap_thread: releasing T#%d from fork barrier for reap\n",
6135 gtid));
6136 if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
6137 while (
6138 !KMP_COMPARE_AND_STORE_ACQ32(&(thread->th.th_used_in_team), 0, 3))
6139 KMP_CPU_PAUSE();
6140 __kmp_resume_32(gtid, (kmp_flag_32<false, false> *)NULL);
6141 } else {
6142 /* Need release fence here to prevent seg faults for tree forkjoin
6143 barrier (GEH) */
6144 kmp_flag_64<> flag(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go,
6145 thread);
6146 __kmp_release_64(&flag);
6147 }
6148 }
6149
6150 // Terminate OS thread.
6151 __kmp_reap_worker(thread);
6152
6153 // The thread was killed asynchronously. If it was actively
6154 // spinning in the thread pool, decrement the global count.
6155 //
6156 // There is a small timing hole here - if the worker thread was just waking
6157 // up after sleeping in the pool and had reset its th_active_in_pool flag but
6158 // not yet decremented the global counter __kmp_thread_pool_active_nth, then
6159 // the global counter might not get updated.
6160 //
6161 // Currently, this can only happen as the library is unloaded,
6162 // so there are no harmful side effects.
6163 if (thread->th.th_active_in_pool) {
6164 thread->th.th_active_in_pool = FALSE;
6165 KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
6166 KMP_DEBUG_ASSERT(__kmp_thread_pool_active_nth >= 0);
6167 }
6168 }
6169
6170 __kmp_free_implicit_task(thread);
6171
6172// Free the fast memory for tasking
6173#if USE_FAST_MEMORY
6174 __kmp_free_fast_memory(thread);
6175#endif /* USE_FAST_MEMORY */
6176
6177 __kmp_suspend_uninitialize_thread(thread);
6178
6179 KMP_DEBUG_ASSERT(__kmp_threads[gtid] == thread);
6180 TCW_SYNC_PTR(__kmp_threads[gtid], NULL);
6181
6182 --__kmp_all_nth;
6183 // __kmp_nth was decremented when thread is added to the pool.
6184
6185#ifdef KMP_ADJUST_BLOCKTIME
6186 /* Adjust blocktime back to user setting or default if necessary */
6187 /* Middle initialization might never have occurred */
6188 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
6189 KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
6190 if (__kmp_nth <= __kmp_avail_proc) {
6191 __kmp_zero_bt = FALSE;
6192 }
6193 }
6194#endif /* KMP_ADJUST_BLOCKTIME */
6195
6196 /* free the memory being used */
6197 if (__kmp_env_consistency_check) {
6198 if (thread->th.th_cons) {
6199 __kmp_free_cons_stack(thread->th.th_cons);
6200 thread->th.th_cons = NULL;
6201 }
6202 }
6203
6204 if (thread->th.th_pri_common != NULL) {
6205 __kmp_free(thread->th.th_pri_common);
6206 thread->th.th_pri_common = NULL;
6207 }
6208
6209 if (thread->th.th_task_state_memo_stack != NULL) {
6210 __kmp_free(thread->th.th_task_state_memo_stack);
6211 thread->th.th_task_state_memo_stack = NULL;
6212 }
6213
6214#if KMP_USE_BGET
6215 if (thread->th.th_local.bget_data != NULL) {
6216 __kmp_finalize_bget(thread);
6217 }
6218#endif
6219
6220#if KMP_AFFINITY_SUPPORTED
6221 if (thread->th.th_affin_mask != NULL) {
6222 KMP_CPU_FREE(thread->th.th_affin_mask);
6223 thread->th.th_affin_mask = NULL;
6224 }
6225#endif /* KMP_AFFINITY_SUPPORTED */
6226
6227#if KMP_USE_HIER_SCHED
6228 if (thread->th.th_hier_bar_data != NULL) {
6229 __kmp_free(thread->th.th_hier_bar_data);
6230 thread->th.th_hier_bar_data = NULL;
6231 }
6232#endif
6233
6234 __kmp_reap_team(thread->th.th_serial_team);
6235 thread->th.th_serial_team = NULL;
6236 __kmp_free(thread);
6237
6238 KMP_MB();
6239
6240} // __kmp_reap_thread
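// An illustrative sketch (not compiled into the runtime) of the spin-until-CAS
// pattern used above to move a worker's th_used_in_team flag from 0 to 3
// before waking it for reaping. It is written with std::atomic instead of the
// KMP_COMPARE_AND_STORE_ACQ32 / KMP_CPU_PAUSE macros; names are illustrative.
#if 0
#include <atomic>
#include <thread>

static void mark_for_reap(std::atomic<int> &used_in_team) {
  int expected = 0;
  // Retry until the flag is observed as 0 and is atomically set to 3.
  while (!used_in_team.compare_exchange_weak(expected, 3,
                                             std::memory_order_acquire)) {
    expected = 0; // compare_exchange_weak rewrites 'expected' on failure
    std::this_thread::yield(); // stand-in for KMP_CPU_PAUSE()
  }
  // The owner can now be resumed so it observes the new state and exits.
}
#endif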
6241
6242static void __kmp_itthash_clean(kmp_info_t *th) {
6243#if USE_ITT_NOTIFY
6244 if (__kmp_itt_region_domains.count > 0) {
6245 for (int i = 0; i < KMP_MAX_FRAME_DOMAINS; ++i) {
6246 kmp_itthash_entry_t *bucket = __kmp_itt_region_domains.buckets[i];
6247 while (bucket) {
6248 kmp_itthash_entry_t *next = bucket->next_in_bucket;
6249 __kmp_thread_free(th, bucket);
6250 bucket = next;
6251 }
6252 }
6253 }
6254 if (__kmp_itt_barrier_domains.count > 0) {
6255 for (int i = 0; i < KMP_MAX_FRAME_DOMAINS; ++i) {
6256 kmp_itthash_entry_t *bucket = __kmp_itt_barrier_domains.buckets[i];
6257 while (bucket) {
6258 kmp_itthash_entry_t *next = bucket->next_in_bucket;
6259 __kmp_thread_free(th, bucket);
6260 bucket = next;
6261 }
6262 }
6263 }
6264#endif
6265}
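// An illustrative sketch (not compiled) of the bucket-draining pattern used in
// __kmp_itthash_clean above: for each bucket, remember the next link before
// freeing the current entry so the chain is never read after it is released.
// The node type and std::free() call are generic stand-ins for the runtime's
// kmp_itthash_entry_t and __kmp_thread_free().
#if 0
#include <cstdlib>

struct chain_node_t {
  chain_node_t *next_in_bucket;
};

static void drain_buckets(chain_node_t **buckets, int nbuckets) {
  for (int i = 0; i < nbuckets; ++i) {
    chain_node_t *entry = buckets[i];
    while (entry) {
      chain_node_t *next = entry->next_in_bucket; // save the link first
      std::free(entry);
      entry = next;
    }
    buckets[i] = nullptr;
  }
}
#endif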
6266
6267static void __kmp_internal_end(void) {
6268 int i;
6269
6270 /* First, unregister the library */
6271 __kmp_unregister_library();
6272
6273#if KMP_OS_WINDOWS
6274 /* In a Windows static library, we can't tell when a root actually dies, so
6275 we reclaim the data structures for any root threads that have died but not
6276 unregistered themselves, in order to shut down cleanly.
6277 In a Windows dynamic library we also can't tell when a thread dies. */
6278 __kmp_reclaim_dead_roots(); // AC: moved here to always clean resources of
6279// dead roots
6280#endif
6281
6282 for (i = 0; i < __kmp_threads_capacity; i++)
6283 if (__kmp_root[i])
6284 if (__kmp_root[i]->r.r_active)
6285 break;
6286 KMP_MB(); /* Flush all pending memory write invalidates. */
6287 TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6288
6289 if (i < __kmp_threads_capacity) {
6290#if KMP_USE_MONITOR
6291 // 2009-09-08 (lev): Other alive roots found. Why do we kill the monitor??
6292 KMP_MB(); /* Flush all pending memory write invalidates. */
6293
6294 // Need to check that monitor was initialized before reaping it. If we are
6295 // called from __kmp_atfork_child (which sets __kmp_init_parallel = 0), then
6296 // __kmp_monitor will appear to contain valid data, but it is only valid in
6297 // the parent process, not the child.
6298 // New behavior (201008): instead of keying off of the flag
6299 // __kmp_init_parallel, the monitor thread creation is keyed off
6300 // of the new flag __kmp_init_monitor.
6301 __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
6302 if (TCR_4(__kmp_init_monitor)) {
6303 __kmp_reap_monitor(&__kmp_monitor);
6304 TCW_4(__kmp_init_monitor, 0);
6305 }
6306 __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
6307 KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
6308#endif // KMP_USE_MONITOR
6309 } else {
6310/* TODO move this to cleanup code */
6311#ifdef KMP_DEBUG
6312 /* make sure that everything has properly ended */
6313 for (i = 0; i < __kmp_threads_capacity; i++) {
6314 if (__kmp_root[i]) {
6315 // KMP_ASSERT( ! KMP_UBER_GTID( i ) ); // AC:
6316 // there can be uber threads alive here
6317 KMP_ASSERT(!__kmp_root[i]->r.r_active); // TODO: can they be active?
6318 }
6319 }
6320#endif
6321
6322 KMP_MB();
6323
6324 // Reap the worker threads.
6325 // This is valid for now, but be careful if threads are reaped sooner.
6326 while (__kmp_thread_pool != NULL) { // Loop through all threads in the pool.
6327 // Get the next thread from the pool.
6328 kmp_info_t *thread = CCAST(kmp_info_t *, __kmp_thread_pool);
6329 __kmp_thread_pool = thread->th.th_next_pool;
6330 // Reap it.
6331 KMP_DEBUG_ASSERT(thread->th.th_reap_state == KMP_SAFE_TO_REAP);
6332 thread->th.th_next_pool = NULL;
6333 thread->th.th_in_pool = FALSE;
6334 __kmp_reap_thread(thread, 0);
6335 }
6336 __kmp_thread_pool_insert_pt = NULL;
6337
6338 // Reap teams.
6339 while (__kmp_team_pool != NULL) { // Loop through all teams in the pool.
6340 // Get the next team from the pool.
6341 kmp_team_t *team = CCAST(kmp_team_t *, __kmp_team_pool);
6342 __kmp_team_pool = team->t.t_next_pool;
6343 // Reap it.
6344 team->t.t_next_pool = NULL;
6345 __kmp_reap_team(team);
6346 }
6347
6348 __kmp_reap_task_teams();
6349
6350#if KMP_OS_UNIX
6351 // Threads that are not reaped should not access any resources since they
6352 // are going to be deallocated soon, so the shutdown sequence should wait
6353 // until all threads either exit the final spin-waiting loop or begin
6354 // sleeping after the given blocktime.
6355 for (i = 0; i < __kmp_threads_capacity; i++) {
6356 kmp_info_t *thr = __kmp_threads[i];
6357 while (thr && KMP_ATOMIC_LD_ACQ(&thr->th.th_blocking))
6358 KMP_CPU_PAUSE();
6359 }
6360#endif
6361
6362 for (i = 0; i < __kmp_threads_capacity; ++i) {
6363 // TBD: Add some checking...
6364 // Something like KMP_DEBUG_ASSERT( __kmp_thread[ i ] == NULL );
6365 }
6366
6367 /* Make sure all threadprivate destructors get run by joining with all
6368 worker threads before resetting this flag */
6369 TCW_SYNC_4(__kmp_init_common, FALSE);
6370
6371 KA_TRACE(10, ("__kmp_internal_end: all workers reaped\n"));
6372 KMP_MB();
6373
6374#if KMP_USE_MONITOR
6375 // See note above: One of the possible fixes for CQ138434 / CQ140126
6376 //
6377 // FIXME: push both code fragments down and CSE them?
6378 // push them into __kmp_cleanup() ?
6379 __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
6380 if (TCR_4(__kmp_init_monitor)) {
6381 __kmp_reap_monitor(&__kmp_monitor);
6382 TCW_4(__kmp_init_monitor, 0);
6383 }
6384 __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
6385 KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
6386#endif
6387 } /* else !__kmp_global.t_active */
6388 TCW_4(__kmp_init_gtid, FALSE);
6389 KMP_MB(); /* Flush all pending memory write invalidates. */
6390
6391 __kmp_cleanup();
6392#if OMPT_SUPPORT
6393 ompt_fini();
6394#endif
6395}
6396
6397void __kmp_internal_end_library(int gtid_req) {
6398 /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6399 /* this shouldn't be a race condition because __kmp_internal_end() is the
6400 only place to clear __kmp_serial_init */
6401 /* we'll check this later too, after we get the lock */
6402 // 2009-09-06: We do not set g_abort without setting g_done. This check looks
6403 // redundant, because the next check will work in any case.
6404 if (__kmp_global.g.g_abort) {
6405 KA_TRACE(11, ("__kmp_internal_end_library: abort, exiting\n"));
6406 /* TODO abort? */
6407 return;
6408 }
6409 if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6410 KA_TRACE(10, ("__kmp_internal_end_library: already finished\n"));
6411 return;
6412 }
6413
6414 // If hidden helper team has been initialized, we need to deinit it
6415 if (TCR_4(__kmp_init_hidden_helper) &&
6416 !TCR_4(__kmp_hidden_helper_team_done)) {
6417 TCW_SYNC_4(__kmp_hidden_helper_team_done, TRUE);
6418 // First release the main thread to let it continue its work
6419 __kmp_hidden_helper_main_thread_release();
6420 // Wait until the hidden helper team has been destroyed
6421 __kmp_hidden_helper_threads_deinitz_wait();
6422 }
6423
6424 KMP_MB(); /* Flush all pending memory write invalidates. */
6425 /* find out who we are and what we should do */
6426 {
6427 int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6428 KA_TRACE(
6429 10, ("__kmp_internal_end_library: enter T#%d (%d)\n", gtid, gtid_req));
6430 if (gtid == KMP_GTID_SHUTDOWN) {
6431 KA_TRACE(10, ("__kmp_internal_end_library: !__kmp_init_runtime, system "
6432 "already shutdown\n"));
6433 return;
6434 } else if (gtid == KMP_GTID_MONITOR) {
6435 KA_TRACE(10, ("__kmp_internal_end_library: monitor thread, gtid not "
6436 "registered, or system shutdown\n"));
6437 return;
6438 } else if (gtid == KMP_GTID_DNE) {
6439 KA_TRACE(10, ("__kmp_internal_end_library: gtid not registered or system "
6440 "shutdown\n"));
6441 /* we don't know who we are, but we may still shut down the library */
6442 } else if (KMP_UBER_GTID(gtid)) {
6443 /* unregister ourselves as an uber thread. gtid is no longer valid */
6444 if (__kmp_root[gtid]->r.r_active) {
6445 __kmp_global.g.g_abort = -1;
6446 TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6447 __kmp_unregister_library();
6448 KA_TRACE(10,
6449 ("__kmp_internal_end_library: root still active, abort T#%d\n",
6450 gtid));
6451 return;
6452 } else {
6453 __kmp_itthash_clean(__kmp_threads[gtid]);
6454 KA_TRACE(
6455 10,
6456 ("__kmp_internal_end_library: unregistering sibling T#%d\n", gtid));
6457 __kmp_unregister_root_current_thread(gtid);
6458 }
6459 } else {
6460/* worker threads may call this function through the atexit handler, if they
6461 * call exit() */
6462/* For now, skip the usual subsequent processing and just dump the debug buffer.
6463 TODO: do a thorough shutdown instead */
6464#ifdef DUMP_DEBUG_ON_EXIT
6465 if (__kmp_debug_buf)
6466 __kmp_dump_debug_buffer();
6467#endif
6468 // Unregister the library here since we switched to shared memory on Linux;
6469 // if we don't, stale registration files will pile up in /dev/shm.
6470 // Clean up the shared memory file before exiting.
6471 __kmp_unregister_library();
6472 return;
6473 }
6474 }
6475 /* synchronize the termination process */
6476 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6477
6478 /* have we already finished */
6479 if (__kmp_global.g.g_abort) {
6480 KA_TRACE(10, ("__kmp_internal_end_library: abort, exiting\n"));
6481 /* TODO abort? */
6482 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6483 return;
6484 }
6485 if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6486 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6487 return;
6488 }
6489
6490 /* We need this lock to enforce mutual exclusion between this read of
6491 __kmp_threads_capacity and the write by __kmp_register_root.
6492 Alternatively, we can use a counter of roots that is atomically updated by
6493 __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6494 __kmp_internal_end_*. */
6495 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6496
6497 /* now we can safely conduct the actual termination */
6498 __kmp_internal_end();
6499
6500 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6501 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6502
6503 KA_TRACE(10, ("__kmp_internal_end_library: exit\n"));
6504
6505#ifdef DUMP_DEBUG_ON_EXIT
6506 if (__kmp_debug_buf)
6507 __kmp_dump_debug_buffer();
6508#endif
6509
6510#if KMP_OS_WINDOWS
6511 __kmp_close_console();
6512#endif
6513
6514 __kmp_fini_allocator();
6515
6516} // __kmp_internal_end_library
6517
6518void __kmp_internal_end_thread(int gtid_req) {
6519 int i;
6520
6521 /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6522 /* this shouldn't be a race condition because __kmp_internal_end() is the
6523 * only place to clear __kmp_serial_init */
6524 /* we'll check this later too, after we get the lock */
6525 // 2009-09-06: We do not set g_abort without setting g_done. This check looks
6526 // redundant, because the next check will work in any case.
6527 if (__kmp_global.g.g_abort) {
6528 KA_TRACE(11, ("__kmp_internal_end_thread: abort, exiting\n"));
6529 /* TODO abort? */
6530 return;
6531 }
6532 if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6533 KA_TRACE(10, ("__kmp_internal_end_thread: already finished\n"));
6534 return;
6535 }
6536
6537 // If hidden helper team has been initialized, we need to deinit it
6538 if (TCR_4(__kmp_init_hidden_helper) &&
6539 !TCR_4(__kmp_hidden_helper_team_done)) {
6540 TCW_SYNC_4(__kmp_hidden_helper_team_done, TRUE);
6541 // First release the main thread to let it continue its work
6542 __kmp_hidden_helper_main_thread_release();
6543 // Wait until the hidden helper team has been destroyed
6544 __kmp_hidden_helper_threads_deinitz_wait();
6545 }
6546
6547 KMP_MB(); /* Flush all pending memory write invalidates. */
6548
6549 /* find out who we are and what we should do */
6550 {
6551 int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6552 KA_TRACE(10,
6553 ("__kmp_internal_end_thread: enter T#%d (%d)\n", gtid, gtid_req));
6554 if (gtid == KMP_GTID_SHUTDOWN) {
6555 KA_TRACE(10, ("__kmp_internal_end_thread: !__kmp_init_runtime, system "
6556 "already shutdown\n"));
6557 return;
6558 } else if (gtid == KMP_GTID_MONITOR) {
6559 KA_TRACE(10, ("__kmp_internal_end_thread: monitor thread, gtid not "
6560 "registered, or system shutdown\n"));
6561 return;
6562 } else if (gtid == KMP_GTID_DNE) {
6563 KA_TRACE(10, ("__kmp_internal_end_thread: gtid not registered or system "
6564 "shutdown\n"));
6565 return;
6566 /* we don't know who we are */
6567 } else if (KMP_UBER_GTID(gtid)) {
6568 /* unregister ourselves as an uber thread. gtid is no longer valid */
6569 if (__kmp_root[gtid]->r.r_active) {
6570 __kmp_global.g.g_abort = -1;
6571 TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6572 KA_TRACE(10,
6573 ("__kmp_internal_end_thread: root still active, abort T#%d\n",
6574 gtid));
6575 return;
6576 } else {
6577 KA_TRACE(10, ("__kmp_internal_end_thread: unregistering sibling T#%d\n",
6578 gtid));
6579 __kmp_unregister_root_current_thread(gtid);
6580 }
6581 } else {
6582 /* just a worker thread, let's leave */
6583 KA_TRACE(10, ("__kmp_internal_end_thread: worker thread T#%d\n", gtid));
6584
6585 if (gtid >= 0) {
6586 __kmp_threads[gtid]->th.th_task_team = NULL;
6587 }
6588
6589 KA_TRACE(10,
6590 ("__kmp_internal_end_thread: worker thread done, exiting T#%d\n",
6591 gtid));
6592 return;
6593 }
6594 }
6595#if KMP_DYNAMIC_LIB
6596 if (__kmp_pause_status != kmp_hard_paused)
6597 // AC: let's not shut down the dynamic library at the exit of an uber thread;
6598 // it is better to shut down later, in the library destructor.
6599 {
6600 KA_TRACE(10, ("__kmp_internal_end_thread: exiting T#%d\n", gtid_req));
6601 return;
6602 }
6603#endif
6604 /* synchronize the termination process */
6605 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6606
6607 /* have we already finished */
6608 if (__kmp_global.g.g_abort) {
6609 KA_TRACE(10, ("__kmp_internal_end_thread: abort, exiting\n"));
6610 /* TODO abort? */
6611 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6612 return;
6613 }
6614 if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6615 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6616 return;
6617 }
6618
6619 /* We need this lock to enforce mutual exclusion between this read of
6620 __kmp_threads_capacity and the write by __kmp_register_root.
6621 Alternatively, we can use a counter of roots that is atomically updated by
6622 __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6623 __kmp_internal_end_*. */
6624
6625 /* should we finish the run-time? are all siblings done? */
6626 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6627
6628 for (i = 0; i < __kmp_threads_capacity; ++i) {
6629 if (KMP_UBER_GTID(i)) {
6630 KA_TRACE(
6631 10,
6632 ("__kmp_internal_end_thread: remaining sibling task: gtid==%d\n", i));
6633 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6634 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6635 return;
6636 }
6637 }
6638
6639 /* now we can safely conduct the actual termination */
6640
6641 __kmp_internal_end();
6642
6643 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6644 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6645
6646 KA_TRACE(10, ("__kmp_internal_end_thread: exit T#%d\n", gtid_req));
6647
6648#ifdef DUMP_DEBUG_ON_EXIT
6649 if (__kmp_debug_buf)
6650 __kmp_dump_debug_buffer();
6651#endif
6652} // __kmp_internal_end_thread
6653
6654// -----------------------------------------------------------------------------
6655// Library registration stuff.
6656
6657static long __kmp_registration_flag = 0;
6658// Random value used to indicate library initialization.
6659static char *__kmp_registration_str = NULL;
6660// Value to be saved in env var __KMP_REGISTERED_LIB_<pid>.
6661
6662static inline char *__kmp_reg_status_name() {
6663/* On RHEL 3u5, if linked statically, getpid() returns different values in
6664 each thread. If registration and unregistration happen in different threads
6665 (omp_misc_other_root_exit.cpp test case), the registered_lib_env environment
6666 variable cannot be found, because its name will contain a different pid. */
6667// macOS* complains that the name is too long when getuid() is also appended
6668#if KMP_OS_UNIX && !KMP_OS_DARWIN && KMP_DYNAMIC_LIB
6669 return __kmp_str_format("__KMP_REGISTERED_LIB_%d_%d", (int)getpid(),
6670 (int)getuid());
6671#else
6672 return __kmp_str_format("__KMP_REGISTERED_LIB_%d", (int)getpid());
6673#endif
6674} // __kmp_reg_status_name
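// An illustrative sketch (not compiled) of the same per-process name
// construction using plain snprintf()/getpid()/getuid() instead of
// __kmp_str_format(). The buffer handling and the __linux__ guard are
// simplifying assumptions, not the runtime's exact platform test.
#if 0
#include <cstddef>
#include <cstdio>
#include <unistd.h>

static void make_reg_status_name(char *buf, std::size_t len) {
#if defined(__linux__)
  // Dynamic library on non-macOS Unix: append the uid as well.
  std::snprintf(buf, len, "__KMP_REGISTERED_LIB_%d_%d", (int)getpid(),
                (int)getuid());
#else
  std::snprintf(buf, len, "__KMP_REGISTERED_LIB_%d", (int)getpid());
#endif
}
#endif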
6675
6676void __kmp_register_library_startup(void) {
6677
6678 char *name = __kmp_reg_status_name(); // Name of the environment variable.
6679 int done = 0;
6680 union {
6681 double dtime;
6682 long ltime;
6683 } time;
6684#if KMP_ARCH_X86 || KMP_ARCH_X86_64
6685 __kmp_initialize_system_tick();
6686#endif
6687 __kmp_read_system_time(&time.dtime);
6688 __kmp_registration_flag = 0xCAFE0000L | (time.ltime & 0x0000FFFFL);
6689 __kmp_registration_str =
6690 __kmp_str_format("%p-%lx-%s", &__kmp_registration_flag,
6691 __kmp_registration_flag, KMP_LIBRARY_FILE);
6692
6693 KA_TRACE(50, ("__kmp_register_library_startup: %s=\"%s\"\n", name,
6694 __kmp_registration_str));
6695
6696 while (!done) {
6697
6698 char *value = NULL; // Actual value of the environment variable.
6699
6700#if defined(KMP_USE_SHM)
6701 char *shm_name = __kmp_str_format("/%s", name);
6702 int shm_preexist = 0;
6703 char *data1;
6704 int fd1 = shm_open(shm_name, O_CREAT | O_EXCL | O_RDWR, 0666);
6705 if ((fd1 == -1) && (errno == EEXIST)) {
6706 // file didn't open because it already exists.
6707 // try opening existing file
6708 fd1 = shm_open(shm_name, O_RDWR, 0666);
6709 if (fd1 == -1) { // file didn't open
6710 // error out here
6711 __kmp_fatal(KMP_MSG(FunctionError, "Can't open SHM"), KMP_ERR(0),
6712 __kmp_msg_null);
6713 } else {
6714 // able to open existing file
6715 shm_preexist = 1;
6716 }
6717 } else if (fd1 == -1) { // SHM didn't open due to an error other than
6718 // "already exists".
6719 // Error out here.
6720 __kmp_fatal(KMP_MSG(FunctionError, "Can't open SHM2"), KMP_ERR(errno),
6721 __kmp_msg_null);
6722 }
6723 if (shm_preexist == 0) {
6724 // we created the SHM; now set its size
6725 if (ftruncate(fd1, SHM_SIZE) == -1) {
6726 // error occurred setting the size
6727 __kmp_fatal(KMP_MSG(FunctionError, "Can't set size of SHM"),
6728 KMP_ERR(errno), __kmp_msg_null);
6729 }
6730 }
6731 data1 =
6732 (char *)mmap(0, SHM_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED, fd1, 0);
6733 if (data1 == MAP_FAILED) {
6734 // failed to map shared memory
6735 __kmp_fatal(KMP_MSG(FunctionError, "Can't map SHM"), KMP_ERR(errno),
6736 __kmp_msg_null);
6737 }
6738 if (shm_preexist == 0) { // set data to SHM, set value
6739 KMP_STRCPY_S(data1, SHM_SIZE, __kmp_registration_str);
6740 }
6741 // Read the value from either what we just wrote or the existing file.
6742 value = __kmp_str_format("%s", data1); // read value from SHM
6743 munmap(data1, SHM_SIZE);
6744 close(fd1);
6745#else // Windows and Unix with static library
6746 // Set the environment variable, but do not overwrite it if it already exists.
6747 __kmp_env_set(name, __kmp_registration_str, 0);
6748 // read value to see if it got set
6749 value = __kmp_env_get(name);
6750#endif
6751
6752 if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6753 done = 1; // Ok, environment variable set successfully, exit the loop.
6754 } else {
6755 // Oops. Write failed. Another copy of the OpenMP RTL is in memory.
6756 // Check whether it is alive or dead.
6757 int neighbor = 0; // 0 -- unknown status, 1 -- alive, 2 -- dead.
6758 char *tail = value;
6759 char *flag_addr_str = NULL;
6760 char *flag_val_str = NULL;
6761 char const *file_name = NULL;
6762 __kmp_str_split(tail, '-', &flag_addr_str, &tail);
6763 __kmp_str_split(tail, '-', &flag_val_str, &tail);
6764 file_name = tail;
6765 if (tail != NULL) {
6766 unsigned long *flag_addr = 0;
6767 unsigned long flag_val = 0;
6768 KMP_SSCANF(flag_addr_str, "%p", RCAST(void **, &flag_addr));
6769 KMP_SSCANF(flag_val_str, "%lx", &flag_val);
6770 if (flag_addr != 0 && flag_val != 0 && strcmp(file_name, "") != 0) {
6771 // First, check whether environment-encoded address is mapped into
6772 // addr space.
6773 // If so, dereference it to see if it still has the right value.
6774 if (__kmp_is_address_mapped(flag_addr) && *flag_addr == flag_val) {
6775 neighbor = 1;
6776 } else {
6777 // If not, then we know the other copy of the library is no longer
6778 // running.
6779 neighbor = 2;
6780 }
6781 }
6782 }
6783 switch (neighbor) {
6784 case 0: // Cannot parse environment variable -- neighbor status unknown.
6785 // Assume it is an incompatible format from a future version of the
6786 // library, and assume the other library is alive.
6787 // WARN( ... ); // TODO: Issue a warning.
6788 file_name = "unknown library";
6789 KMP_FALLTHROUGH();
6790 // Attention! Falling to the next case. That's intentional.
6791 case 1: { // Neighbor is alive.
6792 // Check it is allowed.
6793 char *duplicate_ok = __kmp_env_get("KMP_DUPLICATE_LIB_OK");
6794 if (!__kmp_str_match_true(duplicate_ok)) {
6795 // That's not allowed. Issue fatal error.
6796 __kmp_fatal(KMP_MSG(DuplicateLibrary, KMP_LIBRARY_FILE, file_name),
6797 KMP_HNT(DuplicateLibrary), __kmp_msg_null);
6798 }
6799 KMP_INTERNAL_FREE(duplicate_ok);
6800 __kmp_duplicate_library_ok = 1;
6801 done = 1; // Exit the loop.
6802 } break;
6803 case 2: { // Neighbor is dead.
6804
6805#if defined(KMP_USE_SHM)
6806 // Unlink the shared memory segment.
6807 shm_unlink(shm_name); // this removes the file in /dev/shm
6808#else
6809 // Clear the variable and try to register library again.
6810 __kmp_env_unset(name);
6811#endif
6812 } break;
6813 default: {
6814 KMP_DEBUG_ASSERT(0);
6815 } break;
6816 }
6817 }
6818 KMP_INTERNAL_FREE((void *)value);
6819#if defined(KMP_USE_SHM)
6820 KMP_INTERNAL_FREE((void *)shm_name);
6821#endif
6822 } // while
6823 KMP_INTERNAL_FREE((void *)name);
6824
6825} // func __kmp_register_library_startup
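// An illustrative sketch (not compiled) of the registration-string round trip
// performed above. A starting copy publishes "<address>-<value>-<library file>";
// a second copy parses it back and decides whether the first copy is still
// alive by checking that the address is mapped and still holds the value.
// is_address_mapped() is a hypothetical stand-in for the runtime's
// platform-specific __kmp_is_address_mapped(); the rest is standard C.
#if 0
#include <cstddef>
#include <cstdio>

extern bool is_address_mapped(void *addr); // hypothetical platform check

static long registration_flag; // lives as long as this copy stays loaded

static void encode_registration(char *buf, std::size_t len,
                                const char *lib_file) {
  std::snprintf(buf, len, "%p-%lx-%s", (void *)&registration_flag,
                (unsigned long)registration_flag, lib_file);
}

// Returns 1 if the neighbor looks alive, 2 if it is known dead, 0 if unknown.
static int check_neighbor(const char *value) {
  void *flag_addr = nullptr;
  unsigned long flag_val = 0;
  char file_name[256] = "";
  if (std::sscanf(value, "%p-%lx-%255s", &flag_addr, &flag_val, file_name) != 3)
    return 0; // unparsable: possibly a future format, status unknown
  if (flag_addr == nullptr || flag_val == 0 || file_name[0] == '\0')
    return 0;
  if (is_address_mapped(flag_addr) && *(unsigned long *)flag_addr == flag_val)
    return 1; // the flag still holds the advertised value: neighbor is alive
  return 2;   // stale registration left behind by a dead process
}
#endif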
6826
6827void __kmp_unregister_library(void) {
6828
6829 char *name = __kmp_reg_status_name();
6830 char *value = NULL;
6831
6832#if defined(KMP_USE_SHM)
6833 char *shm_name = __kmp_str_format("/%s", name);
6834 int fd1 = shm_open(shm_name, O_RDONLY, 0666);
6835 if (fd1 == -1) {
6836 // file did not open. return.
6837 return;
6838 }
6839 char *data1 = (char *)mmap(0, SHM_SIZE, PROT_READ, MAP_SHARED, fd1, 0);
6840 if (data1 != MAP_FAILED) {
6841 value = __kmp_str_format("%s", data1); // read value from SHM
6842 munmap(data1, SHM_SIZE);
6843 }
6844 close(fd1);
6845#else
6846 value = __kmp_env_get(name);
6847#endif
6848
6849 KMP_DEBUG_ASSERT(__kmp_registration_flag != 0);
6850 KMP_DEBUG_ASSERT(__kmp_registration_str != NULL);
6851 if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6852// Ok, this is our variable. Delete it.
6853#if defined(KMP_USE_SHM)
6854 shm_unlink(shm_name); // this removes file in /dev/shm
6855#else
6856 __kmp_env_unset(name);
6857#endif
6858 }
6859
6860#if defined(KMP_USE_SHM)
6861 KMP_INTERNAL_FREE(shm_name);
6862#endif
6863
6864 KMP_INTERNAL_FREE(__kmp_registration_str);
6865 KMP_INTERNAL_FREE(value);
6866 KMP_INTERNAL_FREE(name);
6867
6868 __kmp_registration_flag = 0;
6869 __kmp_registration_str = NULL;
6870
6871} // __kmp_unregister_library
6872
6873// End of Library registration stuff.
6874// -----------------------------------------------------------------------------
6875
6876#if KMP_MIC_SUPPORTED
6877
6878static void __kmp_check_mic_type() {
6879 kmp_cpuid_t cpuid_state = {0};
6880 kmp_cpuid_t *cs_p = &cpuid_state;
6881 __kmp_x86_cpuid(1, 0, cs_p);
6882 // We don't support mic1 at the moment
6883 if ((cs_p->eax & 0xff0) == 0xB10) {
6884 __kmp_mic_type = mic2;
6885 } else if ((cs_p->eax & 0xf0ff0) == 0x50670) {
6886 __kmp_mic_type = mic3;
6887 } else {
6888 __kmp_mic_type = non_mic;
6889 }
6890}
6891
6892#endif /* KMP_MIC_SUPPORTED */
6893
6894#if KMP_HAVE_UMWAIT
6895static void __kmp_user_level_mwait_init() {
6896 struct kmp_cpuid buf;
6897 __kmp_x86_cpuid(7, 0, &buf);
6898 __kmp_waitpkg_enabled = ((buf.ecx >> 5) & 1);
6899 __kmp_umwait_enabled = __kmp_waitpkg_enabled && __kmp_user_level_mwait;
6900 __kmp_tpause_enabled = __kmp_waitpkg_enabled && (__kmp_tpause_state > 0);
6901 KF_TRACE(30, ("__kmp_user_level_mwait_init: __kmp_umwait_enabled = %d\n",
6902 __kmp_umwait_enabled));
6903}
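// An illustrative sketch (not compiled) of the same WAITPKG feature test,
// written with the GCC/Clang <cpuid.h> helper instead of __kmp_x86_cpuid().
// CPUID.(EAX=07H, ECX=0):ECX[bit 5] reports umwait/tpause support. This
// assumes a recent GCC or Clang on x86; it is not how the runtime itself
// queries CPUID.
#if 0
#include <cpuid.h>

static bool has_waitpkg() {
  unsigned int eax = 0, ebx = 0, ecx = 0, edx = 0;
  if (!__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx))
    return false; // CPUID leaf 7 is not supported on this CPU
  return ((ecx >> 5) & 1) != 0;
}
#endif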
6904#elif KMP_HAVE_MWAIT
6905#ifndef AT_INTELPHIUSERMWAIT
6906// Spurious, non-existent value that should always fail to return anything.
6907// Will be replaced with the correct value once it is known.
6908#define AT_INTELPHIUSERMWAIT 10000
6909#endif
6910// getauxval() function is available in RHEL7 and SLES12. If a system with an
6911// earlier OS is used to build the RTL, we'll use the following internal
6912// function when the entry is not found.
6913unsigned long getauxval(unsigned long) KMP_WEAK_ATTRIBUTE_EXTERNAL;
6914unsigned long getauxval(unsigned long) { return 0; }
6915
6916static void __kmp_user_level_mwait_init() {
6917 // When getauxval() and the correct value of AT_INTELPHIUSERMWAIT are available,
6918 // use them to determine whether user-level mwait is enabled. Otherwise, forcibly
6919 // set __kmp_mwait_enabled=TRUE on Intel MIC if the environment variable
6920 // KMP_USER_LEVEL_MWAIT was set to TRUE.
6921 if (__kmp_mic_type == mic3) {
6922 unsigned long res = getauxval(AT_INTELPHIUSERMWAIT);
6923 if ((res & 0x1) || __kmp_user_level_mwait) {
6924 __kmp_mwait_enabled = TRUE;
6925 if (__kmp_user_level_mwait) {
6926 KMP_INFORM(EnvMwaitWarn);
6927 }
6928 } else {
6929 __kmp_mwait_enabled = FALSE;
6930 }
6931 }
6932 KF_TRACE(30, ("__kmp_user_level_mwait_init: __kmp_mic_type = %d, "
6933 "__kmp_mwait_enabled = %d\n",
6934 __kmp_mic_type, __kmp_mwait_enabled));
6935}
6936#endif /* KMP_HAVE_UMWAIT */
6937
6938static void __kmp_do_serial_initialize(void) {
6939 int i, gtid;
6940 size_t size;
6941
6942 KA_TRACE(10, ("__kmp_do_serial_initialize: enter\n"));
6943
6944 KMP_DEBUG_ASSERT(sizeof(kmp_int32) == 4);
6945 KMP_DEBUG_ASSERT(sizeof(kmp_uint32) == 4);
6946 KMP_DEBUG_ASSERT(sizeof(kmp_int64) == 8);
6947 KMP_DEBUG_ASSERT(sizeof(kmp_uint64) == 8);
6948 KMP_DEBUG_ASSERT(sizeof(kmp_intptr_t) == sizeof(void *));
6949
6950#if OMPT_SUPPORT
6951 ompt_pre_init();
6952#endif
6953#if OMPD_SUPPORT
6954 __kmp_env_dump();
6955 ompd_init();
6956#endif
6957
6958 __kmp_validate_locks();
6959
6960 /* Initialize internal memory allocator */
6961 __kmp_init_allocator();
6962
6963 /* Register the library startup via an environment variable and check to see
6964 whether another copy of the library is already registered. */
6965
6966 __kmp_register_library_startup();
6967
6968 /* TODO reinitialization of library */
6969 if (TCR_4(__kmp_global.g.g_done)) {
6970 KA_TRACE(10, ("__kmp_do_serial_initialize: reinitialization of library\n"));
6971 }
6972
6973 __kmp_global.g.g_abort = 0;
6974 TCW_SYNC_4(__kmp_global.g.g_done, FALSE);
6975
6976/* initialize the locks */
6977#if KMP_USE_ADAPTIVE_LOCKS
6978#if KMP_DEBUG_ADAPTIVE_LOCKS
6979 __kmp_init_speculative_stats();
6980#endif
6981#endif
6982#if KMP_STATS_ENABLED
6983 __kmp_stats_init();
6984#endif
6985 __kmp_init_lock(&__kmp_global_lock);
6986 __kmp_init_queuing_lock(&__kmp_dispatch_lock);
6987 __kmp_init_lock(&__kmp_debug_lock);
6988 __kmp_init_atomic_lock(&__kmp_atomic_lock);
6989 __kmp_init_atomic_lock(&__kmp_atomic_lock_1i);
6990 __kmp_init_atomic_lock(&__kmp_atomic_lock_2i);
6991 __kmp_init_atomic_lock(&__kmp_atomic_lock_4i);
6992 __kmp_init_atomic_lock(&__kmp_atomic_lock_4r);
6993 __kmp_init_atomic_lock(&__kmp_atomic_lock_8i);
6994 __kmp_init_atomic_lock(&__kmp_atomic_lock_8r);
6995 __kmp_init_atomic_lock(&__kmp_atomic_lock_8c);
6996 __kmp_init_atomic_lock(&__kmp_atomic_lock_10r);
6997 __kmp_init_atomic_lock(&__kmp_atomic_lock_16r);
6998 __kmp_init_atomic_lock(&__kmp_atomic_lock_16c);
6999 __kmp_init_atomic_lock(&__kmp_atomic_lock_20c);
7000 __kmp_init_atomic_lock(&__kmp_atomic_lock_32c);
7001 __kmp_init_bootstrap_lock(&__kmp_forkjoin_lock);
7002 __kmp_init_bootstrap_lock(&__kmp_exit_lock);
7003#if KMP_USE_MONITOR
7004 __kmp_init_bootstrap_lock(&__kmp_monitor_lock);
7005#endif
7006 __kmp_init_bootstrap_lock(&__kmp_tp_cached_lock);
7007
7008 /* conduct initialization and initial setup of configuration */
7009
7010 __kmp_runtime_initialize();
7011
7012#if KMP_MIC_SUPPORTED
7013 __kmp_check_mic_type();
7014#endif
7015
7016// Some global variable initialization moved here from kmp_env_initialize()
7017#ifdef KMP_DEBUG
7018 kmp_diag = 0;
7019#endif
7020 __kmp_abort_delay = 0;
7021
7022 // From __kmp_init_dflt_team_nth()
7023 /* assume the entire machine will be used */
7024 __kmp_dflt_team_nth_ub = __kmp_xproc;
7025 if (__kmp_dflt_team_nth_ub < KMP_MIN_NTH) {
7026 __kmp_dflt_team_nth_ub = KMP_MIN_NTH;
7027 }
7028 if (__kmp_dflt_team_nth_ub > __kmp_sys_max_nth) {
7029 __kmp_dflt_team_nth_ub = __kmp_sys_max_nth;
7030 }
7031 __kmp_max_nth = __kmp_sys_max_nth;
7032 __kmp_cg_max_nth = __kmp_sys_max_nth;
7033 __kmp_teams_max_nth = __kmp_xproc; // set a "reasonable" default
7034 if (__kmp_teams_max_nth > __kmp_sys_max_nth) {
7035 __kmp_teams_max_nth = __kmp_sys_max_nth;
7036 }
7037
7038 // Three vars below moved here from __kmp_env_initialize() "KMP_BLOCKTIME"
7039 // part
7040 __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME;
7041#if KMP_USE_MONITOR
7042 __kmp_monitor_wakeups =
7043 KMP_WAKEUPS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
7044 __kmp_bt_intervals =
7045 KMP_INTERVALS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
7046#endif
7047 // From "KMP_LIBRARY" part of __kmp_env_initialize()
7048 __kmp_library = library_throughput;
7049 // From KMP_SCHEDULE initialization
7050 __kmp_static = kmp_sch_static_balanced;
7051// AC: do not use the analytical variant here, because it is non-monotonic
7052//__kmp_guided = kmp_sch_guided_iterative_chunked;
7053//__kmp_auto = kmp_sch_guided_analytical_chunked; // AC: it is the default, no
7054// need to repeat assignment
7055// Barrier initialization. Moved here from __kmp_env_initialize() Barrier branch
7056// bit control and barrier method control parts
7057#if KMP_FAST_REDUCTION_BARRIER
7058#define kmp_reduction_barrier_gather_bb ((int)1)
7059#define kmp_reduction_barrier_release_bb ((int)1)
7060#define kmp_reduction_barrier_gather_pat __kmp_barrier_gather_pat_dflt
7061#define kmp_reduction_barrier_release_pat __kmp_barrier_release_pat_dflt
7062#endif // KMP_FAST_REDUCTION_BARRIER
7063 for (i = bs_plain_barrier; i < bs_last_barrier; i++) {
7064 __kmp_barrier_gather_branch_bits[i] = __kmp_barrier_gather_bb_dflt;
7065 __kmp_barrier_release_branch_bits[i] = __kmp_barrier_release_bb_dflt;
7066 __kmp_barrier_gather_pattern[i] = __kmp_barrier_gather_pat_dflt;
7067 __kmp_barrier_release_pattern[i] = __kmp_barrier_release_pat_dflt;
7068#if KMP_FAST_REDUCTION_BARRIER
7069 if (i == bs_reduction_barrier) { // tested and confirmed on ALTIX only (
7070 // lin_64 ): hyper,1
7071 __kmp_barrier_gather_branch_bits[i] = kmp_reduction_barrier_gather_bb;
7072 __kmp_barrier_release_branch_bits[i] = kmp_reduction_barrier_release_bb;
7073 __kmp_barrier_gather_pattern[i] = kmp_reduction_barrier_gather_pat;
7074 __kmp_barrier_release_pattern[i] = kmp_reduction_barrier_release_pat;
7075 }
7076#endif // KMP_FAST_REDUCTION_BARRIER
7077 }
7078#if KMP_FAST_REDUCTION_BARRIER
7079#undef kmp_reduction_barrier_release_pat
7080#undef kmp_reduction_barrier_gather_pat
7081#undef kmp_reduction_barrier_release_bb
7082#undef kmp_reduction_barrier_gather_bb
7083#endif // KMP_FAST_REDUCTION_BARRIER
7084#if KMP_MIC_SUPPORTED
7085 if (__kmp_mic_type == mic2) { // KNC
7086 // AC: plane=3,2, forkjoin=2,1 are optimal for 240 threads on KNC
7087 __kmp_barrier_gather_branch_bits[bs_plain_barrier] = 3; // plain gather
7088 __kmp_barrier_release_branch_bits[bs_forkjoin_barrier] =
7089 1; // forkjoin release
7090 __kmp_barrier_gather_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
7091 __kmp_barrier_release_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
7092 }
7093#if KMP_FAST_REDUCTION_BARRIER
7094 if (__kmp_mic_type == mic2) { // KNC
7095 __kmp_barrier_gather_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
7096 __kmp_barrier_release_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
7097 }
7098#endif // KMP_FAST_REDUCTION_BARRIER
7099#endif // KMP_MIC_SUPPORTED
7100
7101// From KMP_CHECKS initialization
7102#ifdef KMP_DEBUG
7103 __kmp_env_checks = TRUE; /* development versions have the extra checks */
7104#else
7105 __kmp_env_checks = FALSE; /* port versions do not have the extra checks */
7106#endif
7107
7108 // From "KMP_FOREIGN_THREADS_THREADPRIVATE" initialization
7109 __kmp_foreign_tp = TRUE;
7110
7111 __kmp_global.g.g_dynamic = FALSE;
7112 __kmp_global.g.g_dynamic_mode = dynamic_default;
7113
7114 __kmp_init_nesting_mode();
7115
7116 __kmp_env_initialize(NULL);
7117
7118#if KMP_HAVE_MWAIT || KMP_HAVE_UMWAIT
7119 __kmp_user_level_mwait_init();
7120#endif
7121// Print all messages in message catalog for testing purposes.
7122#ifdef KMP_DEBUG
7123 char const *val = __kmp_env_get("KMP_DUMP_CATALOG");
7124 if (__kmp_str_match_true(val)) {
7125 kmp_str_buf_t buffer;
7126 __kmp_str_buf_init(&buffer);
7127 __kmp_i18n_dump_catalog(&buffer);
7128 __kmp_printf("%s", buffer.str);
7129 __kmp_str_buf_free(&buffer);
7130 }
7131 __kmp_env_free(&val);
7132#endif
7133
7134 __kmp_threads_capacity =
7135 __kmp_initial_threads_capacity(__kmp_dflt_team_nth_ub);
7136 // Moved here from __kmp_env_initialize() "KMP_ALL_THREADPRIVATE" part
7137 __kmp_tp_capacity = __kmp_default_tp_capacity(
7138 __kmp_dflt_team_nth_ub, __kmp_max_nth, __kmp_allThreadsSpecified);
7139
7140 // If the library is shut down properly, both pools must be NULL. Just in
7141 // case, set them to NULL -- some memory may leak, but subsequent code will
7142 // work even if pools are not freed.
7143 KMP_DEBUG_ASSERT(__kmp_thread_pool == NULL);
7144 KMP_DEBUG_ASSERT(__kmp_thread_pool_insert_pt == NULL);
7145 KMP_DEBUG_ASSERT(__kmp_team_pool == NULL);
7146 __kmp_thread_pool = NULL;
7147 __kmp_thread_pool_insert_pt = NULL;
7148 __kmp_team_pool = NULL;
7149
7150 /* Allocate all of the variable sized records */
7151 /* NOTE: __kmp_threads_capacity entries are allocated, but the arrays are
7152 * expandable */
7153 /* Since allocation is cache-aligned, just add extra padding at the end */
7154 size =
7155 (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * __kmp_threads_capacity +
7156 CACHE_LINE;
7157 __kmp_threads = (kmp_info_t **)__kmp_allocate(size);
7158 __kmp_root = (kmp_root_t **)((char *)__kmp_threads +
7159 sizeof(kmp_info_t *) * __kmp_threads_capacity);
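  // An illustrative sketch (not compiled): the layout of the single
  // cache-aligned block allocated above. Both pointer arrays share one
  // allocation; __kmp_root simply starts right after the last __kmp_threads
  // slot. For a hypothetical capacity of 4 with 8-byte pointers:
  //   offset  0 : __kmp_threads[0..3]  (4 * 8 = 32 bytes)
  //   offset 32 : __kmp_root[0..3]     (4 * 8 = 32 bytes)
  //   offset 64 : up to CACHE_LINE bytes of padding
#if 0
  {
    size_t sketch_capacity = 4; // hypothetical capacity, for illustration only
    size_t sketch_size = (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) *
                             sketch_capacity +
                         CACHE_LINE;
    char *block = (char *)__kmp_allocate(sketch_size);
    kmp_info_t **threads_part = (kmp_info_t **)block;
    kmp_root_t **root_part =
        (kmp_root_t **)(block + sizeof(kmp_info_t *) * sketch_capacity);
    (void)threads_part;
    (void)root_part;
    __kmp_free(block);
  }
#endif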
7160
7161 /* init thread counts */
7162 KMP_DEBUG_ASSERT(__kmp_all_nth ==
7163 0); // Asserts fail if the library is reinitializing and
7164 KMP_DEBUG_ASSERT(__kmp_nth == 0); // something was wrong in termination.
7165 __kmp_all_nth = 0;
7166 __kmp_nth = 0;
7167
7168 /* setup the uber master thread and hierarchy */
7169 gtid = __kmp_register_root(TRUE);
7170 KA_TRACE(10, ("__kmp_do_serial_initialize T#%d\n", gtid));
7171 KMP_ASSERT(KMP_UBER_GTID(gtid));
7172 KMP_ASSERT(KMP_INITIAL_GTID(gtid));
7173
7174 KMP_MB(); /* Flush all pending memory write invalidates. */
7175
7176 __kmp_common_initialize();
7177
7178#if KMP_OS_UNIX
7179 /* invoke the child fork handler */
7180 __kmp_register_atfork();
7181#endif
7182
7183#if !KMP_DYNAMIC_LIB
7184 {
7185 /* Invoke the exit handler when the program finishes, only for static
7186 library. For dynamic library, we already have _fini and DllMain. */
7187 int rc = atexit(__kmp_internal_end_atexit);
7188 if (rc != 0) {
7189 __kmp_fatal(KMP_MSG(FunctionError, "atexit()"), KMP_ERR(rc),
7190 __kmp_msg_null);
7191 }
7192 }
7193#endif
7194
7195#if KMP_HANDLE_SIGNALS
7196#if KMP_OS_UNIX
7197 /* NOTE: make sure that this is called before the user installs their own
7198 signal handlers so that the user handlers are called first. this way they
7199 can return false, not call our handler, avoid terminating the library, and
7200 continue execution where they left off. */
7201 __kmp_install_signals(FALSE);
7202#endif /* KMP_OS_UNIX */
7203#if KMP_OS_WINDOWS
7204 __kmp_install_signals(TRUE);
7205#endif /* KMP_OS_WINDOWS */
7206#endif
7207
7208 /* we have finished the serial initialization */
7209 __kmp_init_counter++;
7210
7211 __kmp_init_serial = TRUE;
7212
7213 if (__kmp_settings) {
7214 __kmp_env_print();
7215 }
7216
7217 if (__kmp_display_env || __kmp_display_env_verbose) {
7218 __kmp_env_print_2();
7219 }
7220
7221#if OMPT_SUPPORT
7222 ompt_post_init();
7223#endif
7224
7225 KMP_MB();
7226
7227 KA_TRACE(10, ("__kmp_do_serial_initialize: exit\n"));
7228}
7229
7230void __kmp_serial_initialize(void) {
7231 if (__kmp_init_serial) {
7232 return;
7233 }
7234 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7235 if (__kmp_init_serial) {
7236 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7237 return;
7238 }
7239 __kmp_do_serial_initialize();
7240 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7241}
7242
7243static void __kmp_do_middle_initialize(void) {
7244 int i, j;
7245 int prev_dflt_team_nth;
7246
7247 if (!__kmp_init_serial) {
7248 __kmp_do_serial_initialize();
7249 }
7250
7251 KA_TRACE(10, ("__kmp_middle_initialize: enter\n"));
7252
7253 // Save the previous value for the __kmp_dflt_team_nth so that
7254 // we can avoid some reinitialization if it hasn't changed.
7255 prev_dflt_team_nth = __kmp_dflt_team_nth;
7256
7257#if KMP_AFFINITY_SUPPORTED
7258 // __kmp_affinity_initialize() will try to set __kmp_ncores to the
7259 // number of cores on the machine.
7260 __kmp_affinity_initialize();
7261
7262#endif /* KMP_AFFINITY_SUPPORTED */
7263
7264 KMP_ASSERT(__kmp_xproc > 0);
7265 if (__kmp_avail_proc == 0) {
7266 __kmp_avail_proc = __kmp_xproc;
7267 }
7268
7269 // If there were empty places in num_threads list (OMP_NUM_THREADS=,,2,3),
7270 // correct them now
7271 j = 0;
7272 while ((j < __kmp_nested_nth.used) && !__kmp_nested_nth.nth[j]) {
7273 __kmp_nested_nth.nth[j] = __kmp_dflt_team_nth = __kmp_dflt_team_nth_ub =
7274 __kmp_avail_proc;
7275 j++;
7276 }
7277
7278 if (__kmp_dflt_team_nth == 0) {
7279#ifdef KMP_DFLT_NTH_CORES
7280 // Default #threads = #cores
7281 __kmp_dflt_team_nth = __kmp_ncores;
7282 KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
7283 "__kmp_ncores (%d)\n",
7284 __kmp_dflt_team_nth));
7285#else
7286 // Default #threads = #available OS procs
7287 __kmp_dflt_team_nth = __kmp_avail_proc;
7288 KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
7289 "__kmp_avail_proc(%d)\n",
7290 __kmp_dflt_team_nth));
7291#endif /* KMP_DFLT_NTH_CORES */
7292 }
7293
7294 if (__kmp_dflt_team_nth < KMP_MIN_NTH) {
7295 __kmp_dflt_team_nth = KMP_MIN_NTH;
7296 }
7297 if (__kmp_dflt_team_nth > __kmp_sys_max_nth) {
7298 __kmp_dflt_team_nth = __kmp_sys_max_nth;
7299 }
7300
7301 if (__kmp_nesting_mode > 0)
7302 __kmp_set_nesting_mode_threads();
7303
7304 // There's no harm in continuing if the following check fails,
7305 // but it indicates an error in the previous logic.
7306 KMP_DEBUG_ASSERT(__kmp_dflt_team_nth <= __kmp_dflt_team_nth_ub);
7307
7308 if (__kmp_dflt_team_nth != prev_dflt_team_nth) {
7309 // Run through the __kmp_threads array and set the num threads icv for each
7310 // root thread that is currently registered with the RTL (which has not
7311 // already explicitly set its nthreads-var with a call to
7312 // omp_set_num_threads()).
7313 for (i = 0; i < __kmp_threads_capacity; i++) {
7314 kmp_info_t *thread = __kmp_threads[i];
7315 if (thread == NULL)
7316 continue;
7317 if (thread->th.th_current_task->td_icvs.nproc != 0)
7318 continue;
7319
7320 set__nproc(__kmp_threads[i], __kmp_dflt_team_nth);
7321 }
7322 }
7323 KA_TRACE(
7324 20,
7325 ("__kmp_middle_initialize: final value for __kmp_dflt_team_nth = %d\n",
7326 __kmp_dflt_team_nth));
7327
7328#ifdef KMP_ADJUST_BLOCKTIME
7329 /* Adjust blocktime to zero if necessary now that __kmp_avail_proc is set */
7330 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
7331 KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
7332 if (__kmp_nth > __kmp_avail_proc) {
7333 __kmp_zero_bt = TRUE;
7334 }
7335 }
7336#endif /* KMP_ADJUST_BLOCKTIME */
7337
7338 /* we have finished middle initialization */
7339 TCW_SYNC_4(__kmp_init_middle, TRUE);
7340
7341 KA_TRACE(10, ("__kmp_do_middle_initialize: exit\n"));
7342}
7343
7344void __kmp_middle_initialize(void) {
7345 if (__kmp_init_middle) {
7346 return;
7347 }
7348 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7349 if (__kmp_init_middle) {
7350 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7351 return;
7352 }
7353 __kmp_do_middle_initialize();
7354 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7355}
7356
7357void __kmp_parallel_initialize(void) {
7358 int gtid = __kmp_entry_gtid(); // this might be a new root
7359
7360 /* synchronize parallel initialization (for sibling) */
7361 if (TCR_4(__kmp_init_parallel))
7362 return;
7363 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7364 if (TCR_4(__kmp_init_parallel)) {
7365 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7366 return;
7367 }
7368
7369 /* TODO reinitialization after we have already shut down */
7370 if (TCR_4(__kmp_global.g.g_done)) {
7371 KA_TRACE(
7372 10,
7373 ("__kmp_parallel_initialize: attempt to init while shutting down\n"));
7374 __kmp_infinite_loop();
7375 }
7376
7377 /* jc: The lock __kmp_initz_lock is already held, so calling
7378 __kmp_serial_initialize would cause a deadlock. So we call
7379 __kmp_do_serial_initialize directly. */
7380 if (!__kmp_init_middle) {
7381 __kmp_do_middle_initialize();
7382 }
7383 __kmp_assign_root_init_mask();
7384 __kmp_resume_if_hard_paused();
7385
7386 /* begin initialization */
7387 KA_TRACE(10, ("__kmp_parallel_initialize: enter\n"));
7388 KMP_ASSERT(KMP_UBER_GTID(gtid));
7389
7390#if KMP_ARCH_X86 || KMP_ARCH_X86_64
7391 // Save the FP control regs.
7392 // Worker threads will set theirs to these values at thread startup.
7393 __kmp_store_x87_fpu_control_word(&__kmp_init_x87_fpu_control_word);
7394 __kmp_store_mxcsr(&__kmp_init_mxcsr);
7395 __kmp_init_mxcsr &= KMP_X86_MXCSR_MASK;
7396#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
7397
7398#if KMP_OS_UNIX
7399#if KMP_HANDLE_SIGNALS
7400 /* must be after __kmp_serial_initialize */
7401 __kmp_install_signals(TRUE);
7402#endif
7403#endif
7404
7405 __kmp_suspend_initialize();
7406
7407#if defined(USE_LOAD_BALANCE)
7408 if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
7409 __kmp_global.g.g_dynamic_mode = dynamic_load_balance;
7410 }
7411#else
7412 if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
7413 __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
7414 }
7415#endif
7416
7417 if (__kmp_version) {
7418 __kmp_print_version_2();
7419 }
7420
7421 /* we have finished parallel initialization */
7422 TCW_SYNC_4(__kmp_init_parallel, TRUE);
7423
7424 KMP_MB();
7425 KA_TRACE(10, ("__kmp_parallel_initialize: exit\n"));
7426
7427 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7428}
7429
7430void __kmp_hidden_helper_initialize() {
7431 if (TCR_4(__kmp_init_hidden_helper))
7432 return;
7433
7434 // __kmp_parallel_initialize is required before we initialize hidden helper
7435 if (!TCR_4(__kmp_init_parallel))
7436 __kmp_parallel_initialize();
7437
7438 // Double check. Note that this double check should not be placed before
7439 // __kmp_parallel_initialize, as that would cause a deadlock.
7440 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7441 if (TCR_4(__kmp_init_hidden_helper)) {
7442 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7443 return;
7444 }
7445
7446 // Set the count of hidden helper tasks to be executed to zero
7447 KMP_ATOMIC_ST_REL(&__kmp_unexecuted_hidden_helper_tasks, 0);
7448
7449 // Set the global variable indicating that we're initializing hidden helper
7450 // team/threads
7451 TCW_SYNC_4(__kmp_init_hidden_helper_threads, TRUE);
7452
7453 // Platform independent initialization
7454 __kmp_do_initialize_hidden_helper_threads();
7455
7456 // Wait here for the finish of initialization of hidden helper teams
7457 __kmp_hidden_helper_threads_initz_wait();
7458
7459 // We have finished hidden helper initialization
7460 TCW_SYNC_4(__kmp_init_hidden_helper, TRUE);
7461
7462 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7463}
7464
7465/* ------------------------------------------------------------------------ */
7466
7467void __kmp_run_before_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
7468 kmp_team_t *team) {
7469 kmp_disp_t *dispatch;
7470
7471 KMP_MB();
7472
7473 /* none of the threads have encountered any constructs, yet. */
7474 this_thr->th.th_local.this_construct = 0;
7475#if KMP_CACHE_MANAGE
7476 KMP_CACHE_PREFETCH(&this_thr->th.th_bar[bs_forkjoin_barrier].bb.b_arrived);
7477#endif /* KMP_CACHE_MANAGE */
7478 dispatch = (kmp_disp_t *)TCR_PTR(this_thr->th.th_dispatch);
7479 KMP_DEBUG_ASSERT(dispatch);
7480 KMP_DEBUG_ASSERT(team->t.t_dispatch);
7481 // KMP_DEBUG_ASSERT( this_thr->th.th_dispatch == &team->t.t_dispatch[
7482 // this_thr->th.th_info.ds.ds_tid ] );
7483
7484 dispatch->th_disp_index = 0; /* reset the dispatch buffer counter */
7485 dispatch->th_doacross_buf_idx = 0; // reset doacross dispatch buffer counter
7486 if (__kmp_env_consistency_check)
7487 __kmp_push_parallel(gtid, team->t.t_ident);
7488
7489 KMP_MB(); /* Flush all pending memory write invalidates. */
7490}
7491
7492void __kmp_run_after_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
7493 kmp_team_t *team) {
7494 if (__kmp_env_consistency_check)
7495 __kmp_pop_parallel(gtid, team->t.t_ident);
7496
7497 __kmp_finish_implicit_task(this_thr);
7498}
7499
7500int __kmp_invoke_task_func(int gtid) {
7501 int rc;
7502 int tid = __kmp_tid_from_gtid(gtid);
7503 kmp_info_t *this_thr = __kmp_threads[gtid];
7504 kmp_team_t *team = this_thr->th.th_team;
7505
7506 __kmp_run_before_invoked_task(gtid, tid, this_thr, team);
7507#if USE_ITT_BUILD
7508 if (__itt_stack_caller_create_ptr) {
7509 // inform ittnotify about entering user's code
7510 if (team->t.t_stack_id != NULL) {
7511 __kmp_itt_stack_callee_enter((__itt_caller)team->t.t_stack_id);
7512 } else {
7513 KMP_DEBUG_ASSERT(team->t.t_parent->t.t_stack_id != NULL);
7514 __kmp_itt_stack_callee_enter(
7515 (__itt_caller)team->t.t_parent->t.t_stack_id);
7516 }
7517 }
7518#endif /* USE_ITT_BUILD */
7519#if INCLUDE_SSC_MARKS
7520 SSC_MARK_INVOKING();
7521#endif
7522
7523#if OMPT_SUPPORT
7524 void *dummy;
7525 void **exit_frame_p;
7526 ompt_data_t *my_task_data;
7527 ompt_data_t *my_parallel_data;
7528 int ompt_team_size;
7529
7530 if (ompt_enabled.enabled) {
7531 exit_frame_p = &(team->t.t_implicit_task_taskdata[tid]
7532 .ompt_task_info.frame.exit_frame.ptr);
7533 } else {
7534 exit_frame_p = &dummy;
7535 }
7536
7537 my_task_data =
7538 &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data);
7539 my_parallel_data = &(team->t.ompt_team_info.parallel_data);
7540 if (ompt_enabled.ompt_callback_implicit_task) {
7541 ompt_team_size = team->t.t_nproc;
7542 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7543 ompt_scope_begin, my_parallel_data, my_task_data, ompt_team_size,
7544 __kmp_tid_from_gtid(gtid), ompt_task_implicit);
7545 OMPT_CUR_TASK_INFO(this_thr)->thread_num = __kmp_tid_from_gtid(gtid);
7546 }
7547#endif
7548
7549#if KMP_STATS_ENABLED
7550 stats_state_e previous_state = KMP_GET_THREAD_STATE();
7551 if (previous_state == stats_state_e::TEAMS_REGION) {
7552 KMP_PUSH_PARTITIONED_TIMER(OMP_teams);
7553 } else {
7554 KMP_PUSH_PARTITIONED_TIMER(OMP_parallel);
7555 }
7556 KMP_SET_THREAD_STATE(IMPLICIT_TASK);
7557#endif
7558
7559 rc = __kmp_invoke_microtask((microtask_t)TCR_SYNC_PTR(team->t.t_pkfn), gtid,
7560 tid, (int)team->t.t_argc, (void **)team->t.t_argv
7561#if OMPT_SUPPORT
7562 ,
7563 exit_frame_p
7564#endif
7565 );
7566#if OMPT_SUPPORT
7567 *exit_frame_p = NULL;
7568 this_thr->th.ompt_thread_info.parallel_flags |= ompt_parallel_team;
7569#endif
7570
7571#if KMP_STATS_ENABLED
7572 if (previous_state == stats_state_e::TEAMS_REGION) {
7573 KMP_SET_THREAD_STATE(previous_state);
7574 }
7575 KMP_POP_PARTITIONED_TIMER();
7576#endif
7577
7578#if USE_ITT_BUILD
7579 if (__itt_stack_caller_create_ptr) {
7580 // inform ittnotify about leaving user's code
7581 if (team->t.t_stack_id != NULL) {
7582 __kmp_itt_stack_callee_leave((__itt_caller)team->t.t_stack_id);
7583 } else {
7584 KMP_DEBUG_ASSERT(team->t.t_parent->t.t_stack_id != NULL);
7585 __kmp_itt_stack_callee_leave(
7586 (__itt_caller)team->t.t_parent->t.t_stack_id);
7587 }
7588 }
7589#endif /* USE_ITT_BUILD */
7590 __kmp_run_after_invoked_task(gtid, tid, this_thr, team);
7591
7592 return rc;
7593}
7594
7595void __kmp_teams_master(int gtid) {
7596 // This routine is called by all primary threads in teams construct
7597 kmp_info_t *thr = __kmp_threads[gtid];
7598 kmp_team_t *team = thr->th.th_team;
7599 ident_t *loc = team->t.t_ident;
7600 thr->th.th_set_nproc = thr->th.th_teams_size.nth;
7601 KMP_DEBUG_ASSERT(thr->th.th_teams_microtask);
7602 KMP_DEBUG_ASSERT(thr->th.th_set_nproc);
7603 KA_TRACE(20, ("__kmp_teams_master: T#%d, Tid %d, microtask %p\n", gtid,
7604 __kmp_tid_from_gtid(gtid), thr->th.th_teams_microtask));
7605
7606 // This thread is a new CG root. Set up the proper variables.
7607 kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t));
7608 tmp->cg_root = thr; // Make thr the CG root
7609 // Init to thread limit stored when league primary threads were forked
7610 tmp->cg_thread_limit = thr->th.th_current_task->td_icvs.thread_limit;
7611 tmp->cg_nthreads = 1; // Init counter to one active thread, this one
7612 KA_TRACE(100, ("__kmp_teams_master: Thread %p created node %p and init"
7613 " cg_nthreads to 1\n",
7614 thr, tmp));
7615 tmp->up = thr->th.th_cg_roots;
7616 thr->th.th_cg_roots = tmp;
7617
7618// Launch the league of teams now, but do not let workers execute
7619// (they wait on the fork barrier until the next parallel region)
7620#if INCLUDE_SSC_MARKS
7621 SSC_MARK_FORKING();
7622#endif
7623 __kmp_fork_call(loc, gtid, fork_context_intel, team->t.t_argc,
7624 (microtask_t)thr->th.th_teams_microtask, // "wrapped" task
7625 VOLATILE_CAST(launch_t) __kmp_invoke_task_func, NULL);
7626#if INCLUDE_SSC_MARKS
7627 SSC_MARK_JOINING();
7628#endif
7629 // If the team size was reduced from the limit, set it to the new size
7630 if (thr->th.th_team_nproc < thr->th.th_teams_size.nth)
7631 thr->th.th_teams_size.nth = thr->th.th_team_nproc;
7632 // AC: last parameter "1" eliminates join barrier which won't work because
7633 // worker threads are in a fork barrier waiting for more parallel regions
7634 __kmp_join_call(loc, gtid
7635#if OMPT_SUPPORT
7636 ,
7637 fork_context_intel
7638#endif
7639 ,
7640 1);
7641}
7642
7643int __kmp_invoke_teams_master(int gtid) {
7644 kmp_info_t *this_thr = __kmp_threads[gtid];
7645 kmp_team_t *team = this_thr->th.th_team;
7646#if KMP_DEBUG
7647 if (!__kmp_threads[gtid]->th.th_team->t.t_serialized)
7648 KMP_DEBUG_ASSERT((void *)__kmp_threads[gtid]->th.th_team->t.t_pkfn ==
7649 (void *)__kmp_teams_master);
7650#endif
7651 __kmp_run_before_invoked_task(gtid, 0, this_thr, team);
7652#if OMPT_SUPPORT
7653 int tid = __kmp_tid_from_gtid(gtid);
7654 ompt_data_t *task_data =
7655 &team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data;
7656 ompt_data_t *parallel_data = &team->t.ompt_team_info.parallel_data;
7657 if (ompt_enabled.ompt_callback_implicit_task) {
7658 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7659 ompt_scope_begin, parallel_data, task_data, team->t.t_nproc, tid,
7660 ompt_task_initial);
7661 OMPT_CUR_TASK_INFO(this_thr)->thread_num = tid;
7662 }
7663#endif
7664 __kmp_teams_master(gtid);
7665#if OMPT_SUPPORT
7666 this_thr->th.ompt_thread_info.parallel_flags |= ompt_parallel_league;
7667#endif
7668 __kmp_run_after_invoked_task(gtid, 0, this_thr, team);
7669 return 1;
7670}
7671
7672/* This sets the requested number of threads for the next parallel region
7673 encountered by this team. Since this should be enclosed in the forkjoin
7674 critical section, it should avoid race conditions with asymmetrical nested
7675 parallelism. */
7676
7677void __kmp_push_num_threads(ident_t *id, int gtid, int num_threads) {
7678 kmp_info_t *thr = __kmp_threads[gtid];
7679
7680 if (num_threads > 0)
7681 thr->th.th_set_nproc = num_threads;
7682}
7683
7684static void __kmp_push_thread_limit(kmp_info_t *thr, int num_teams,
7685 int num_threads) {
7686 KMP_DEBUG_ASSERT(thr);
7687 // Remember the number of threads for inner parallel regions
7688 if (!TCR_4(__kmp_init_middle))
7689 __kmp_middle_initialize(); // get internal globals calculated
7690 __kmp_assign_root_init_mask();
7691 KMP_DEBUG_ASSERT(__kmp_avail_proc);
7692 KMP_DEBUG_ASSERT(__kmp_dflt_team_nth);
7693
7694 if (num_threads == 0) {
7695 if (__kmp_teams_thread_limit > 0) {
7696 num_threads = __kmp_teams_thread_limit;
7697 } else {
7698 num_threads = __kmp_avail_proc / num_teams;
7699 }
7700 // adjust num_threads without a warning as it is not a user setting
7701 // num_threads = min(num_threads, nthreads-var, thread-limit-var)
7702 // no thread_limit clause specified - do not change the thread-limit-var ICV
7703 if (num_threads > __kmp_dflt_team_nth) {
7704 num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV
7705 }
7706 if (num_threads > thr->th.th_current_task->td_icvs.thread_limit) {
7707 num_threads = thr->th.th_current_task->td_icvs.thread_limit;
7708 } // prevent the team size from exceeding thread-limit-var
7709 if (num_teams * num_threads > __kmp_teams_max_nth) {
7710 num_threads = __kmp_teams_max_nth / num_teams;
7711 }
7712 if (num_threads == 0) {
7713 num_threads = 1;
7714 }
7715 } else {
7716 if (num_threads < 0) {
7717 __kmp_msg(kmp_ms_warning, KMP_MSG(CantFormThrTeam, num_threads, 1),
7718 __kmp_msg_null);
7719 num_threads = 1;
7720 }
7721 // This thread will be the primary thread of the league's primary threads.
7722 // Store the new thread limit; the old limit is saved in the th_cg_roots list.
7723 thr->th.th_current_task->td_icvs.thread_limit = num_threads;
7724 // num_threads = min(num_threads, nthreads-var)
7725 if (num_threads > __kmp_dflt_team_nth) {
7726 num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV
7727 }
7728 if (num_teams * num_threads > __kmp_teams_max_nth) {
7729 int new_threads = __kmp_teams_max_nth / num_teams;
7730 if (new_threads == 0) {
7731 new_threads = 1;
7732 }
7733 if (new_threads != num_threads) {
7734 if (!__kmp_reserve_warn) { // user asked for too many threads
7735 __kmp_reserve_warn = 1; // conflicts with KMP_TEAMS_THREAD_LIMIT
7736 __kmp_msg(kmp_ms_warning,
7737 KMP_MSG(CantFormThrTeam, num_threads, new_threads),
7738 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7739 }
7740 }
7741 num_threads = new_threads;
7742 }
7743 }
7744 thr->th.th_teams_size.nth = num_threads;
7745}
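// An illustrative sketch (not compiled) of the clamping cascade applied above
// when no thread_limit clause is present. The parameter names are generic
// stand-ins for the ICVs used in __kmp_push_thread_limit. Worked example:
// avail_proc = 16, num_teams = 4, nthreads_var = 8, thread_limit_var = 6 and
// teams_max_nth = 20 gives 16 / 4 = 4 threads per team, and none of the later
// clamps change it.
#if 0
static int clamp_threads_per_team(int avail_proc, int num_teams,
                                  int nthreads_var, int thread_limit_var,
                                  int teams_max_nth) {
  int num_threads = avail_proc / num_teams; // start from an even split
  if (num_threads > nthreads_var)
    num_threads = nthreads_var; // honor nthreads-var
  if (num_threads > thread_limit_var)
    num_threads = thread_limit_var; // honor thread-limit-var
  if (num_teams * num_threads > teams_max_nth)
    num_threads = teams_max_nth / num_teams; // cap the league as a whole
  if (num_threads == 0)
    num_threads = 1; // always keep at least one thread per team
  return num_threads;
}
#endif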
7746
7747/* this sets the requested number of teams for the teams region and/or
7748 the number of threads for the next parallel region encountered */
7749void __kmp_push_num_teams(ident_t *id, int gtid, int num_teams,
7750 int num_threads) {
7751 kmp_info_t *thr = __kmp_threads[gtid];
7752 if (num_teams < 0) {
7753 // The OpenMP specification requires requested values to be positive,
7754 // but users can pass us any value, so we'd better check.
7755 __kmp_msg(kmp_ms_warning, KMP_MSG(NumTeamsNotPositive, num_teams, 1),
7756 __kmp_msg_null);
7757 num_teams = 1;
7758 }
7759 if (num_teams == 0) {
7760 if (__kmp_nteams > 0) {
7761 num_teams = __kmp_nteams;
7762 } else {
7763 num_teams = 1; // default number of teams is 1.
7764 }
7765 }
7766 if (num_teams > __kmp_teams_max_nth) { // too many teams requested?
7767 if (!__kmp_reserve_warn) {
7768 __kmp_reserve_warn = 1;
7769 __kmp_msg(kmp_ms_warning,
7770 KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth),
7771 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7772 }
7773 num_teams = __kmp_teams_max_nth;
7774 }
7775 // Set number of teams (number of threads in the outer "parallel" of the
7776 // teams)
7777 thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams;
7778
7779 __kmp_push_thread_limit(thr, num_teams, num_threads);
7780}
7781
7782/* This sets the requested number of teams for the teams region and/or
7783 the number of threads for the next parallel region encountered */
7784void __kmp_push_num_teams_51(ident_t *id, int gtid, int num_teams_lb,
7785 int num_teams_ub, int num_threads) {
7786 kmp_info_t *thr = __kmp_threads[gtid];
7787 KMP_DEBUG_ASSERT(num_teams_lb >= 0 && num_teams_ub >= 0);
7788 KMP_DEBUG_ASSERT(num_teams_ub >= num_teams_lb);
7789 KMP_DEBUG_ASSERT(num_threads >= 0);
7790
7791 if (num_teams_lb > num_teams_ub) {
7792 __kmp_fatal(KMP_MSG(FailedToCreateTeam, num_teams_lb, num_teams_ub),
7793 KMP_HNT(SetNewBound, __kmp_teams_max_nth), __kmp_msg_null);
7794 }
7795
7796 int num_teams = 1; // default number of teams is 1.
7797
7798 if (num_teams_lb == 0 && num_teams_ub > 0)
7799 num_teams_lb = num_teams_ub;
7800
7801 if (num_teams_lb == 0 && num_teams_ub == 0) { // no num_teams clause
7802 num_teams = (__kmp_nteams > 0) ? __kmp_nteams : num_teams;
7803 if (num_teams > __kmp_teams_max_nth) {
7804 if (!__kmp_reserve_warn) {
7805 __kmp_reserve_warn = 1;
7806 __kmp_msg(kmp_ms_warning,
7807 KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth),
7808 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7809 }
7810 num_teams = __kmp_teams_max_nth;
7811 }
7812 } else if (num_teams_lb == num_teams_ub) { // requires exact number of teams
7813 num_teams = num_teams_ub;
7814 } else { // num_teams_lb <= num_teams <= num_teams_ub
7815 if (num_threads <= 0) {
7816 if (num_teams_ub > __kmp_teams_max_nth) {
7817 num_teams = num_teams_lb;
7818 } else {
7819 num_teams = num_teams_ub;
7820 }
7821 } else {
7822 num_teams = (num_threads > __kmp_teams_max_nth)
7823 ? num_teams
7824 : __kmp_teams_max_nth / num_threads;
7825 if (num_teams < num_teams_lb) {
7826 num_teams = num_teams_lb;
7827 } else if (num_teams > num_teams_ub) {
7828 num_teams = num_teams_ub;
7829 }
7830 }
7831 }
7832 // Set number of teams (number of threads in the outer "parallel" of the
7833 // teams)
7834 thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams;
7835
7836 __kmp_push_thread_limit(thr, num_teams, num_threads);
7837}
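/* This entry serves the lower:upper form of num_teams added in OpenMP 5.1
   (the matching compiler entry point is assumed to be
   __kmpc_push_num_teams_51()). A user-side sketch, assuming a 5.1-capable
   compiler; the runtime picks a value in [lb, ub] subject to
   __kmp_teams_max_nth and the requested thread count, as the logic above
   shows.

     #include <omp.h>
     #include <stdio.h>
     int main(void) {
     #pragma omp teams num_teams(2 : 8) thread_limit(4)
       if (omp_get_team_num() == 0)
         printf("runtime chose %d teams\n", omp_get_num_teams());
       return 0;
     }
*/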
7838
7839// Set the proc_bind var to use in the following parallel region.
7840void __kmp_push_proc_bind(ident_t *id, int gtid, kmp_proc_bind_t proc_bind) {
7841 kmp_info_t *thr = __kmp_threads[gtid];
7842 thr->th.th_set_proc_bind = proc_bind;
7843}
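/* The proc_bind clause is the usual source of this value (the compiler is
   assumed to emit a call to __kmpc_push_proc_bind() ahead of the fork);
   OMP_PROC_BIND provides the same control from the environment. A sketch:

     #include <omp.h>
     #include <stdio.h>
     int main(void) {
     #pragma omp parallel proc_bind(close) num_threads(4)
       printf("thread %d placed close to the primary thread\n",
              omp_get_thread_num());
       return 0;
     }
*/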
7844
7845/* Launch the worker threads into the microtask. */
7846
7847void __kmp_internal_fork(ident_t *id, int gtid, kmp_team_t *team) {
7848 kmp_info_t *this_thr = __kmp_threads[gtid];
7849
7850#ifdef KMP_DEBUG
7851 int f;
7852#endif /* KMP_DEBUG */
7853
7854 KMP_DEBUG_ASSERT(team);
7855 KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
7856 KMP_ASSERT(KMP_MASTER_GTID(gtid));
7857 KMP_MB(); /* Flush all pending memory write invalidates. */
7858
7859 team->t.t_construct = 0; /* no single directives seen yet */
7860 team->t.t_ordered.dt.t_value =
7861 0; /* thread 0 enters the ordered section first */
7862
7863 /* Reset the identifiers on the dispatch buffer */
7864 KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
7865 if (team->t.t_max_nproc > 1) {
7866 int i;
7867 for (i = 0; i < __kmp_dispatch_num_buffers; ++i) {
7868 team->t.t_disp_buffer[i].buffer_index = i;
7869 team->t.t_disp_buffer[i].doacross_buf_idx = i;
7870 }
7871 } else {
7872 team->t.t_disp_buffer[0].buffer_index = 0;
7873 team->t.t_disp_buffer[0].doacross_buf_idx = 0;
7874 }
7875
7876 KMP_MB(); /* Flush all pending memory write invalidates. */
7877 KMP_ASSERT(this_thr->th.th_team == team);
7878
7879#ifdef KMP_DEBUG
7880 for (f = 0; f < team->t.t_nproc; f++) {
7881 KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
7882 team->t.t_threads[f]->th.th_team_nproc == team->t.t_nproc);
7883 }
7884#endif /* KMP_DEBUG */
7885
7886 /* release the worker threads so they may begin working */
7887 __kmp_fork_barrier(gtid, 0);
7888}
7889
7890void __kmp_internal_join(ident_t *id, int gtid, kmp_team_t *team) {
7891 kmp_info_t *this_thr = __kmp_threads[gtid];
7892
7893 KMP_DEBUG_ASSERT(team);
7894 KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
7895 KMP_ASSERT(KMP_MASTER_GTID(gtid));
7896 KMP_MB(); /* Flush all pending memory write invalidates. */
7897
7898 /* Join barrier after fork */
7899
7900#ifdef KMP_DEBUG
7901 if (__kmp_threads[gtid] &&
7902 __kmp_threads[gtid]->th.th_team_nproc != team->t.t_nproc) {
7903 __kmp_printf("GTID: %d, __kmp_threads[%d]=%p\n", gtid, gtid,
7904 __kmp_threads[gtid]);
7905 __kmp_printf("__kmp_threads[%d]->th.th_team_nproc=%d, TEAM: %p, "
7906 "team->t.t_nproc=%d\n",
7907 gtid, __kmp_threads[gtid]->th.th_team_nproc, team,
7908 team->t.t_nproc);
7909 __kmp_print_structure();
7910 }
7911 KMP_DEBUG_ASSERT(__kmp_threads[gtid] &&
7912 __kmp_threads[gtid]->th.th_team_nproc == team->t.t_nproc);
7913#endif /* KMP_DEBUG */
7914
7915 __kmp_join_barrier(gtid); /* wait for everyone */
7916#if OMPT_SUPPORT
7917 if (ompt_enabled.enabled &&
7918 this_thr->th.ompt_thread_info.state == ompt_state_wait_barrier_implicit) {
7919 int ds_tid = this_thr->th.th_info.ds.ds_tid;
7920 ompt_data_t *task_data = OMPT_CUR_TASK_DATA(this_thr);
7921 this_thr->th.ompt_thread_info.state = ompt_state_overhead;
7922#if OMPT_OPTIONAL
7923 void *codeptr = NULL;
7924 if (KMP_MASTER_TID(ds_tid) &&
7925 (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
7926 ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
7927 codeptr = OMPT_CUR_TEAM_INFO(this_thr)->master_return_address;
7928
7929 if (ompt_enabled.ompt_callback_sync_region_wait) {
7930 ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
7931 ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
7932 codeptr);
7933 }
7934 if (ompt_enabled.ompt_callback_sync_region) {
7935 ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
7936 ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
7937 codeptr);
7938 }
7939#endif
7940 if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) {
7941 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7942 ompt_scope_end, NULL, task_data, 0, ds_tid,
7943 ompt_task_implicit); // TODO: Can this be ompt_task_initial?
7944 }
7945 }
7946#endif
7947
7948 KMP_MB(); /* Flush all pending memory write invalidates. */
7949 KMP_ASSERT(this_thr->th.th_team == team);
7950}
7951
7952/* ------------------------------------------------------------------------ */
7953
7954#ifdef USE_LOAD_BALANCE
7955
7956// Return the number of worker threads actively spinning in the hot team,
7957// if we are at the outermost level of parallelism. Otherwise, return 0.
7958static int __kmp_active_hot_team_nproc(kmp_root_t *root) {
7959 int i;
7960 int retval;
7961 kmp_team_t *hot_team;
7962
7963 if (root->r.r_active) {
7964 return 0;
7965 }
7966 hot_team = root->r.r_hot_team;
7967 if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME) {
7968 return hot_team->t.t_nproc - 1; // Don't count primary thread
7969 }
7970
7971 // Skip the primary thread - it is accounted for elsewhere.
7972 retval = 0;
7973 for (i = 1; i < hot_team->t.t_nproc; i++) {
7974 if (hot_team->t.t_threads[i]->th.th_active) {
7975 retval++;
7976 }
7977 }
7978 return retval;
7979}
7980
7981// Perform an automatic adjustment to the number of
7982// threads used by the next parallel region.
7983static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc) {
7984 int retval;
7985 int pool_active;
7986 int hot_team_active;
7987 int team_curr_active;
7988 int system_active;
7989
7990 KB_TRACE(20, ("__kmp_load_balance_nproc: called root:%p set_nproc:%d\n", root,
7991 set_nproc));
7992 KMP_DEBUG_ASSERT(root);
7993 KMP_DEBUG_ASSERT(root->r.r_root_team->t.t_threads[0]
7994 ->th.th_current_task->td_icvs.dynamic == TRUE);
7995 KMP_DEBUG_ASSERT(set_nproc > 1);
7996
7997 if (set_nproc == 1) {
7998 KB_TRACE(20, ("__kmp_load_balance_nproc: serial execution.\n"));
7999 return 1;
8000 }
8001
8002 // Threads that are active in the thread pool, active in the hot team for this
8003 // particular root (if we are at the outer par level), and the currently
8004 // executing thread (to become the primary thread) are available to add to the
8005 // new team, but are currently contributing to the system load, and must be
8006 // accounted for.
8007 pool_active = __kmp_thread_pool_active_nth;
8008 hot_team_active = __kmp_active_hot_team_nproc(root);
8009 team_curr_active = pool_active + hot_team_active + 1;
8010
8011 // Check the system load.
8012 system_active = __kmp_get_load_balance(__kmp_avail_proc + team_curr_active);
8013 KB_TRACE(30, ("__kmp_load_balance_nproc: system active = %d pool active = %d "
8014 "hot team active = %d\n",
8015 system_active, pool_active, hot_team_active));
8016
8017 if (system_active < 0) {
8018 // There was an error reading the necessary info from /proc, so use the
8019 // thread limit algorithm instead. Once we set __kmp_global.g.g_dynamic_mode
8020 // = dynamic_thread_limit, we shouldn't wind up getting back here.
8021 __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
8022 KMP_WARNING(CantLoadBalUsing, "KMP_DYNAMIC_MODE=thread limit");
8023
8024 // Make this call behave like the thread limit algorithm.
8025 retval = __kmp_avail_proc - __kmp_nth +
8026 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
8027 if (retval > set_nproc) {
8028 retval = set_nproc;
8029 }
8030 if (retval < KMP_MIN_NTH) {
8031 retval = KMP_MIN_NTH;
8032 }
8033
8034 KB_TRACE(20, ("__kmp_load_balance_nproc: thread limit exit. retval:%d\n",
8035 retval));
8036 return retval;
8037 }
8038
8039 // There is a slight delay in the load balance algorithm in detecting new
8040 // running procs. The real system load at this instant should be at least as
8041 // large as the number of active OMP threads available to add to the team.
8042 if (system_active < team_curr_active) {
8043 system_active = team_curr_active;
8044 }
8045 retval = __kmp_avail_proc - system_active + team_curr_active;
8046 if (retval > set_nproc) {
8047 retval = set_nproc;
8048 }
8049 if (retval < KMP_MIN_NTH) {
8050 retval = KMP_MIN_NTH;
8051 }
8052
8053 KB_TRACE(20, ("__kmp_load_balance_nproc: exit. retval:%d\n", retval));
8054 return retval;
8055} // __kmp_load_balance_nproc()
8056
8057#endif /* USE_LOAD_BALANCE */
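/* A sketch of how this path is exercised from user code: dynamic adjustment
   must be enabled (OMP_DYNAMIC=true or omp_set_dynamic(1)) and the library
   must have been built with USE_LOAD_BALANCE and switched into the
   load-balance dynamic mode (the KMP_DYNAMIC_MODE environment variable is
   assumed to be the selector; see kmp_settings.cpp for the accepted values).

     #include <omp.h>
     #include <stdio.h>
     int main(void) {
       omp_set_dynamic(1); // allow the runtime to shrink the team under load
     #pragma omp parallel
       {
     #pragma omp single
         printf("runtime granted %d threads\n", omp_get_num_threads());
       }
       return 0;
     }
*/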
8058
8059/* ------------------------------------------------------------------------ */
8060
8061/* NOTE: this is called with the __kmp_init_lock held */
8062void __kmp_cleanup(void) {
8063 int f;
8064
8065 KA_TRACE(10, ("__kmp_cleanup: enter\n"));
8066
8067 if (TCR_4(__kmp_init_parallel)) {
8068#if KMP_HANDLE_SIGNALS
8069 __kmp_remove_signals();
8070#endif
8071 TCW_4(__kmp_init_parallel, FALSE);
8072 }
8073
8074 if (TCR_4(__kmp_init_middle)) {
8075#if KMP_AFFINITY_SUPPORTED
8076 __kmp_affinity_uninitialize();
8077#endif /* KMP_AFFINITY_SUPPORTED */
8078 __kmp_cleanup_hierarchy();
8079 TCW_4(__kmp_init_middle, FALSE);
8080 }
8081
8082 KA_TRACE(10, ("__kmp_cleanup: go serial cleanup\n"));
8083
8084 if (__kmp_init_serial) {
8085 __kmp_runtime_destroy();
8086 __kmp_init_serial = FALSE;
8087 }
8088
8089 __kmp_cleanup_threadprivate_caches();
8090
8091 for (f = 0; f < __kmp_threads_capacity; f++) {
8092 if (__kmp_root[f] != NULL) {
8093 __kmp_free(__kmp_root[f]);
8094 __kmp_root[f] = NULL;
8095 }
8096 }
8097 __kmp_free(__kmp_threads);
8098// __kmp_threads and __kmp_root were allocated at once, as a single block, so
8099// there is no need to free __kmp_root separately.
8100 __kmp_threads = NULL;
8101 __kmp_root = NULL;
8102 __kmp_threads_capacity = 0;
8103
8104#if KMP_USE_DYNAMIC_LOCK
8105 __kmp_cleanup_indirect_user_locks();
8106#else
8107 __kmp_cleanup_user_locks();
8108#endif
8109#if OMPD_SUPPORT
8110 if (ompd_state) {
8111 __kmp_free(ompd_env_block);
8112 ompd_env_block = NULL;
8113 ompd_env_block_size = 0;
8114 }
8115#endif
8116
8117#if KMP_AFFINITY_SUPPORTED
8118 KMP_INTERNAL_FREE(CCAST(char *, __kmp_cpuinfo_file));
8119 __kmp_cpuinfo_file = NULL;
8120#endif /* KMP_AFFINITY_SUPPORTED */
8121
8122#if KMP_USE_ADAPTIVE_LOCKS
8123#if KMP_DEBUG_ADAPTIVE_LOCKS
8124 __kmp_print_speculative_stats();
8125#endif
8126#endif
8127 KMP_INTERNAL_FREE(__kmp_nested_nth.nth);
8128 __kmp_nested_nth.nth = NULL;
8129 __kmp_nested_nth.size = 0;
8130 __kmp_nested_nth.used = 0;
8131 KMP_INTERNAL_FREE(__kmp_nested_proc_bind.bind_types);
8132 __kmp_nested_proc_bind.bind_types = NULL;
8133 __kmp_nested_proc_bind.size = 0;
8134 __kmp_nested_proc_bind.used = 0;
8135 if (__kmp_affinity_format) {
8136 KMP_INTERNAL_FREE(__kmp_affinity_format);
8137 __kmp_affinity_format = NULL;
8138 }
8139
8140 __kmp_i18n_catclose();
8141
8142#if KMP_USE_HIER_SCHED
8143 __kmp_hier_scheds.deallocate();
8144#endif
8145
8146#if KMP_STATS_ENABLED
8147 __kmp_stats_fini();
8148#endif
8149
8150 KA_TRACE(10, ("__kmp_cleanup: exit\n"));
8151}
8152
8153/* ------------------------------------------------------------------------ */
8154
8155int __kmp_ignore_mppbeg(void) {
8156 char *env;
8157
8158 if ((env = getenv("KMP_IGNORE_MPPBEG")) != NULL) {
8159 if (__kmp_str_match_false(env))
8160 return FALSE;
8161 }
8162 // By default __kmpc_begin() is a no-op.
8163 return TRUE;
8164}
8165
8166int __kmp_ignore_mppend(void) {
8167 char *env;
8168
8169 if ((env = getenv("KMP_IGNORE_MPPEND")) != NULL) {
8170 if (__kmp_str_match_false(env))
8171 return FALSE;
8172 }
8173 // By default __kmpc_end() is a no-op.
8174 return TRUE;
8175}
8176
8177void __kmp_internal_begin(void) {
8178 int gtid;
8179 kmp_root_t *root;
8180
8181 /* this is a very important step as it will register new sibling threads
8182 and assign these new uber threads a new gtid */
8183 gtid = __kmp_entry_gtid();
8184 root = __kmp_threads[gtid]->th.th_root;
8185 KMP_ASSERT(KMP_UBER_GTID(gtid));
8186
8187 if (root->r.r_begin)
8188 return;
8189 __kmp_acquire_lock(&root->r.r_begin_lock, gtid);
8190 if (root->r.r_begin) {
8191 __kmp_release_lock(&root->r.r_begin_lock, gtid);
8192 return;
8193 }
8194
8195 root->r.r_begin = TRUE;
8196
8197 __kmp_release_lock(&root->r.r_begin_lock, gtid);
8198}
8199
8200/* ------------------------------------------------------------------------ */
8201
8202void __kmp_user_set_library(enum library_type arg) {
8203 int gtid;
8204 kmp_root_t *root;
8205 kmp_info_t *thread;
8206
8207 /* first, make sure we are initialized so we can get our gtid */
8208
8209 gtid = __kmp_entry_gtid();
8210 thread = __kmp_threads[gtid];
8211
8212 root = thread->th.th_root;
8213
8214 KA_TRACE(20, ("__kmp_user_set_library: enter T#%d, arg: %d, %d\n", gtid, arg,
8215 library_serial));
8216 if (root->r.r_in_parallel) { /* Must be called in serial section of top-level
8217 thread */
8218 KMP_WARNING(SetLibraryIncorrectCall);
8219 return;
8220 }
8221
8222 switch (arg) {
8223 case library_serial:
8224 thread->th.th_set_nproc = 0;
8225 set__nproc(thread, 1);
8226 break;
8227 case library_turnaround:
8228 thread->th.th_set_nproc = 0;
8229 set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
8230 : __kmp_dflt_team_nth_ub);
8231 break;
8232 case library_throughput:
8233 thread->th.th_set_nproc = 0;
8234 set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
8235 : __kmp_dflt_team_nth_ub);
8236 break;
8237 default:
8238 KMP_FATAL(UnknownLibraryType, arg);
8239 }
8240
8241 __kmp_aux_set_library(arg);
8242}
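/* These modes are normally chosen with the KMP_LIBRARY environment variable
   (serial, turnaround, throughput) or through the kmp_* service routines
   declared in this runtime's omp.h extensions. A minimal sketch:

     #include <omp.h>
     int main(void) {
       kmp_set_library_throughput(); // same effect as KMP_LIBRARY=throughput
     #pragma omp parallel
       { } // workers yield/sleep according to the throughput policy
       return 0;
     }
*/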
8243
8244void __kmp_aux_set_stacksize(size_t arg) {
8245 if (!__kmp_init_serial)
8246 __kmp_serial_initialize();
8247
8248#if KMP_OS_DARWIN
8249 if (arg & (0x1000 - 1)) {
8250 arg &= ~(0x1000 - 1);
8251 if (arg + 0x1000) /* check for overflow if we round up */
8252 arg += 0x1000;
8253 }
8254#endif
8255 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
8256
8257 /* only change the default stacksize before the first parallel region */
8258 if (!TCR_4(__kmp_init_parallel)) {
8259 size_t value = arg; /* argument is in bytes */
8260
8261 if (value < __kmp_sys_min_stksize)
8262 value = __kmp_sys_min_stksize;
8263 else if (value > KMP_MAX_STKSIZE)
8264 value = KMP_MAX_STKSIZE;
8265
8266 __kmp_stksize = value;
8267
8268 __kmp_env_stksize = TRUE; /* was KMP_STACKSIZE specified? */
8269 }
8270
8271 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
8272}
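/* The matching user-facing controls are KMP_STACKSIZE (read at startup) and
   the kmp_set_stacksize_s() service routine; as noted above, the value only
   takes effect when changed before the first parallel region. A sketch:

     #include <omp.h>
     #include <stdio.h>
     int main(void) {
       kmp_set_stacksize_s(8 * 1024 * 1024); // request 8 MiB worker stacks
     #pragma omp parallel
       {
     #pragma omp single
         printf("worker stack size is now %zu bytes\n", kmp_get_stacksize_s());
       }
       return 0;
     }
*/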
8273
8274/* set the behaviour of the runtime library */
8275/* TODO this can cause some odd behaviour with sibling parallelism... */
8276void __kmp_aux_set_library(enum library_type arg) {
8277 __kmp_library = arg;
8278
8279 switch (__kmp_library) {
8280 case library_serial: {
8281 KMP_INFORM(LibraryIsSerial);
8282 } break;
8283 case library_turnaround:
8284 if (__kmp_use_yield == 1 && !__kmp_use_yield_exp_set)
8285 __kmp_use_yield = 2; // only yield when oversubscribed
8286 break;
8287 case library_throughput:
8288 if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME)
8289 __kmp_dflt_blocktime = 200;
8290 break;
8291 default:
8292 KMP_FATAL(UnknownLibraryType, arg);
8293 }
8294}
8295
8296/* Getting team information common for all team API */
8297// Returns NULL if not in teams construct
8298static kmp_team_t *__kmp_aux_get_team_info(int &teams_serialized) {
8299 kmp_info_t *thr = __kmp_entry_thread();
8300 teams_serialized = 0;
8301 if (thr->th.th_teams_microtask) {
8302 kmp_team_t *team = thr->th.th_team;
8303 int tlevel = thr->th.th_teams_level; // the level of the teams construct
8304 int ii = team->t.t_level;
8305 teams_serialized = team->t.t_serialized;
8306 int level = tlevel + 1;
8307 KMP_DEBUG_ASSERT(ii >= tlevel);
8308 while (ii > level) {
8309 for (teams_serialized = team->t.t_serialized;
8310 (teams_serialized > 0) && (ii > level); teams_serialized--, ii--) {
8311 }
8312 if (team->t.t_serialized && (!teams_serialized)) {
8313 team = team->t.t_parent;
8314 continue;
8315 }
8316 if (ii > level) {
8317 team = team->t.t_parent;
8318 ii--;
8319 }
8320 }
8321 return team;
8322 }
8323 return NULL;
8324}
8325
8326int __kmp_aux_get_team_num() {
8327 int serialized;
8328 kmp_team_t *team = __kmp_aux_get_team_info(serialized);
8329 if (team) {
8330 if (serialized > 1) {
8331 return 0; // teams region is serialized ( 1 team of 1 thread ).
8332 } else {
8333 return team->t.t_master_tid;
8334 }
8335 }
8336 return 0;
8337}
8338
8339int __kmp_aux_get_num_teams() {
8340 int serialized;
8341 kmp_team_t *team = __kmp_aux_get_team_info(serialized);
8342 if (team) {
8343 if (serialized > 1) {
8344 return 1;
8345 } else {
8346 return team->t.t_parent->t.t_nproc;
8347 }
8348 }
8349 return 1;
8350}
8351
8352/* ------------------------------------------------------------------------ */
8353
8354/*
8355 * Affinity Format Parser
8356 *
8357 * Field is in form of: %[[[0].]size]type
8358 * % and type are required (%% means print a literal '%')
8359 * type is either single char or long name surrounded by {},
8360 * e.g., N or {num_threads}
8361 * 0 => leading zeros
8362 * . => right justified when size is specified
8363 * by default output is left justified
8364 * size is the *minimum* field length
8365 * All other characters are printed as is
8366 *
8367 * Available field types:
8368 * L {nesting_level} - omp_get_level()
8369 * n {thread_num} - omp_get_thread_num()
8370 * H {host} - name of host machine
8371 * P {process_id} - process id (integer)
8372 * i {native_thread_id} - native thread identifier (integer)
8373 * N {num_threads} - omp_get_num_threads()
8374 * a {ancestor_tnum} - omp_get_ancestor_thread_num(omp_get_level()-1)
8375 * A {thread_affinity} - comma separated list of integers or integer ranges
8376 * (values of affinity mask)
8377 *
8378 * Implementation-specific field types can be added
8379 * If a type is unknown, print "undefined"
8380 */
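/* The table that follows also defines the team fields t {team_num} and
   T {num_teams}. A user-side sketch of driving this parser through the
   OpenMP 5.0 affinity-display API (OMP_AFFINITY_FORMAT and
   OMP_DISPLAY_AFFINITY are the related environment controls):

     #include <omp.h>
     int main(void) {
       omp_set_affinity_format("host=%H pid=%P thread=%0.3n binds to %A");
     #pragma omp parallel num_threads(2)
       omp_display_affinity(NULL); // NULL means use affinity-format-var
       return 0;
     }
*/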
8381
8382// Structure holding the short name, long name, and corresponding data type
8383// for snprintf. A table of these will represent the entire valid keyword
8384// field types.
8385typedef struct kmp_affinity_format_field_t {
8386 char short_name; // from spec e.g., L -> thread level
8387 const char *long_name; // from spec thread_level -> thread level
8388 char field_format; // data type for snprintf (typically 'd' or 's'
8389 // for integer or string)
8390} kmp_affinity_format_field_t;
8391
8392static const kmp_affinity_format_field_t __kmp_affinity_format_table[] = {
8393#if KMP_AFFINITY_SUPPORTED
8394 {'A', "thread_affinity", 's'},
8395#endif
8396 {'t', "team_num", 'd'},
8397 {'T', "num_teams", 'd'},
8398 {'L', "nesting_level", 'd'},
8399 {'n', "thread_num", 'd'},
8400 {'N', "num_threads", 'd'},
8401 {'a', "ancestor_tnum", 'd'},
8402 {'H', "host", 's'},
8403 {'P', "process_id", 'd'},
8404 {'i', "native_thread_id", 'd'}};
8405
8406// Return the number of characters it takes to hold the field
8407static int __kmp_aux_capture_affinity_field(int gtid, const kmp_info_t *th,
8408 const char **ptr,
8409 kmp_str_buf_t *field_buffer) {
8410 int rc, format_index, field_value;
8411 const char *width_left, *width_right;
8412 bool pad_zeros, right_justify, parse_long_name, found_valid_name;
8413 static const int FORMAT_SIZE = 20;
8414 char format[FORMAT_SIZE] = {0};
8415 char absolute_short_name = 0;
8416
8417 KMP_DEBUG_ASSERT(gtid >= 0);
8418 KMP_DEBUG_ASSERT(th);
8419 KMP_DEBUG_ASSERT(**ptr == '%');
8420 KMP_DEBUG_ASSERT(field_buffer);
8421
8422 __kmp_str_buf_clear(field_buffer);
8423
8424 // Skip the initial %
8425 (*ptr)++;
8426
8427 // Check for %% first
8428 if (**ptr == '%') {
8429 __kmp_str_buf_cat(field_buffer, "%", 1);
8430 (*ptr)++; // skip over the second %
8431 return 1;
8432 }
8433
8434 // Parse field modifiers if they are present
8435 pad_zeros = false;
8436 if (**ptr == '0') {
8437 pad_zeros = true;
8438 (*ptr)++; // skip over 0
8439 }
8440 right_justify = false;
8441 if (**ptr == '.') {
8442 right_justify = true;
8443 (*ptr)++; // skip over .
8444 }
8445 // Parse width of field: [width_left, width_right)
8446 width_left = width_right = NULL;
8447 if (**ptr >= '0' && **ptr <= '9') {
8448 width_left = *ptr;
8449 SKIP_DIGITS(*ptr);
8450 width_right = *ptr;
8451 }
8452
8453 // Create the format for KMP_SNPRINTF based on flags parsed above
8454 format_index = 0;
8455 format[format_index++] = '%';
8456 if (!right_justify)
8457 format[format_index++] = '-';
8458 if (pad_zeros)
8459 format[format_index++] = '0';
8460 if (width_left && width_right) {
8461 int i = 0;
8462 // Only allow widths of up to 8 digits.
8463 // This also prevents overflowing the format buffer.
8464 while (i < 8 && width_left < width_right) {
8465 format[format_index++] = *width_left;
8466 width_left++;
8467 i++;
8468 }
8469 }
8470
8471 // Parse a name (long or short)
8472 // Canonicalize the name into absolute_short_name
8473 found_valid_name = false;
8474 parse_long_name = (**ptr == '{');
8475 if (parse_long_name)
8476 (*ptr)++; // skip initial left brace
8477 for (size_t i = 0; i < sizeof(__kmp_affinity_format_table) /
8478 sizeof(__kmp_affinity_format_table[0]);
8479 ++i) {
8480 char short_name = __kmp_affinity_format_table[i].short_name;
8481 const char *long_name = __kmp_affinity_format_table[i].long_name;
8482 char field_format = __kmp_affinity_format_table[i].field_format;
8483 if (parse_long_name) {
8484 size_t length = KMP_STRLEN(long_name);
8485 if (strncmp(*ptr, long_name, length) == 0) {
8486 found_valid_name = true;
8487 (*ptr) += length; // skip the long name
8488 }
8489 } else if (**ptr == short_name) {
8490 found_valid_name = true;
8491 (*ptr)++; // skip the short name
8492 }
8493 if (found_valid_name) {
8494 format[format_index++] = field_format;
8495 format[format_index++] = '\0';
8496 absolute_short_name = short_name;
8497 break;
8498 }
8499 }
8500 if (parse_long_name) {
8501 if (**ptr != '}') {
8502 absolute_short_name = 0;
8503 } else {
8504 (*ptr)++; // skip over the right brace
8505 }
8506 }
8507
8508 // Attempt to fill the buffer with the requested
8509 // value using snprintf within __kmp_str_buf_print()
8510 switch (absolute_short_name) {
8511 case 't':
8512 rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_team_num());
8513 break;
8514 case 'T':
8515 rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_num_teams());
8516 break;
8517 case 'L':
8518 rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_level);
8519 break;
8520 case 'n':
8521 rc = __kmp_str_buf_print(field_buffer, format, __kmp_tid_from_gtid(gtid));
8522 break;
8523 case 'H': {
8524 static const int BUFFER_SIZE = 256;
8525 char buf[BUFFER_SIZE];
8526 __kmp_expand_host_name(buf, BUFFER_SIZE);
8527 rc = __kmp_str_buf_print(field_buffer, format, buf);
8528 } break;
8529 case 'P':
8530 rc = __kmp_str_buf_print(field_buffer, format, getpid());
8531 break;
8532 case 'i':
8533 rc = __kmp_str_buf_print(field_buffer, format, __kmp_gettid());
8534 break;
8535 case 'N':
8536 rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_nproc);
8537 break;
8538 case 'a':
8539 field_value =
8540 __kmp_get_ancestor_thread_num(gtid, th->th.th_team->t.t_level - 1);
8541 rc = __kmp_str_buf_print(field_buffer, format, field_value);
8542 break;
8543#if KMP_AFFINITY_SUPPORTED
8544 case 'A': {
8545 kmp_str_buf_t buf;
8546 __kmp_str_buf_init(&buf);
8547 __kmp_affinity_str_buf_mask(&buf, th->th.th_affin_mask);
8548 rc = __kmp_str_buf_print(field_buffer, format, buf.str);
8549 __kmp_str_buf_free(&buf);
8550 } break;
8551#endif
8552 default:
8553 // According to the spec, if an implementation does not have info for a field
8554 // type, then "undefined" is printed
8555 rc = __kmp_str_buf_print(field_buffer, "%s", "undefined");
8556 // Skip the field
8557 if (parse_long_name) {
8558 SKIP_TOKEN(*ptr);
8559 if (**ptr == '}')
8560 (*ptr)++;
8561 } else {
8562 (*ptr)++;
8563 }
8564 }
8565
8566 KMP_ASSERT(format_index <= FORMAT_SIZE);
8567 return rc;
8568}
8569
8570/*
8571 * Return the number of characters needed to hold the affinity string
8572 * (not including the terminating null byte).
8573 * The resulting string is printed to buffer, which the caller can then
8574 * handle afterwards.
8575 */
8576size_t __kmp_aux_capture_affinity(int gtid, const char *format,
8577 kmp_str_buf_t *buffer) {
8578 const char *parse_ptr;
8579 size_t retval;
8580 const kmp_info_t *th;
8581 kmp_str_buf_t field;
8582
8583 KMP_DEBUG_ASSERT(buffer);
8584 KMP_DEBUG_ASSERT(gtid >= 0);
8585
8586 __kmp_str_buf_init(&field);
8587 __kmp_str_buf_clear(buffer);
8588
8589 th = __kmp_threads[gtid];
8590 retval = 0;
8591
8592 // If format is NULL or zero-length string, then we use
8593 // affinity-format-var ICV
8594 parse_ptr = format;
8595 if (parse_ptr == NULL || *parse_ptr == '\0') {
8596 parse_ptr = __kmp_affinity_format;
8597 }
8598 KMP_DEBUG_ASSERT(parse_ptr);
8599
8600 while (*parse_ptr != '\0') {
8601 // Parse a field
8602 if (*parse_ptr == '%') {
8603 // Put field in the buffer
8604 int rc = __kmp_aux_capture_affinity_field(gtid, th, &parse_ptr, &field);
8605 __kmp_str_buf_catbuf(buffer, &field);
8606 retval += rc;
8607 } else {
8608 // Put literal character in buffer
8609 __kmp_str_buf_cat(buffer, parse_ptr, 1);
8610 retval++;
8611 parse_ptr++;
8612 }
8613 }
8614 __kmp_str_buf_free(&field);
8615 return retval;
8616}
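/* omp_capture_affinity() is the standard API that this routine backs; per the
   comment above, the return value is the number of characters needed, so a
   truncated result can be detected. A sketch:

     #include <omp.h>
     #include <stdio.h>
     int main(void) {
     #pragma omp parallel num_threads(2)
       {
         char buf[64];
         size_t n =
             omp_capture_affinity(buf, sizeof(buf), "%{thread_num} on %{host}");
         if (n < sizeof(buf)) // otherwise the output was truncated
           printf("%s\n", buf);
       }
       return 0;
     }
*/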
8617
8618// Displays the affinity string to stdout
8619void __kmp_aux_display_affinity(int gtid, const char *format) {
8620 kmp_str_buf_t buf;
8621 __kmp_str_buf_init(&buf);
8622 __kmp_aux_capture_affinity(gtid, format, &buf);
8623 __kmp_fprintf(kmp_out, "%s" KMP_END_OF_LINE, buf.str);
8624 __kmp_str_buf_free(&buf);
8625}
8626
8627/* ------------------------------------------------------------------------ */
8628
8629void __kmp_aux_set_blocktime(int arg, kmp_info_t *thread, int tid) {
8630 int blocktime = arg; /* argument is in milliseconds */
8631#if KMP_USE_MONITOR
8632 int bt_intervals;
8633#endif
8634 kmp_int8 bt_set;
8635
8636 __kmp_save_internal_controls(thread);
8637
8638 /* Normalize and set blocktime for the teams */
8639 if (blocktime < KMP_MIN_BLOCKTIME)
8640 blocktime = KMP_MIN_BLOCKTIME;
8641 else if (blocktime > KMP_MAX_BLOCKTIME)
8642 blocktime = KMP_MAX_BLOCKTIME;
8643
8644 set__blocktime_team(thread->th.th_team, tid, blocktime);
8645 set__blocktime_team(thread->th.th_serial_team, 0, blocktime);
8646
8647#if KMP_USE_MONITOR
8648 /* Calculate and set blocktime intervals for the teams */
8649 bt_intervals = KMP_INTERVALS_FROM_BLOCKTIME(blocktime, __kmp_monitor_wakeups);
8650
8651 set__bt_intervals_team(thread->th.th_team, tid, bt_intervals);
8652 set__bt_intervals_team(thread->th.th_serial_team, 0, bt_intervals);
8653#endif
8654
8655 /* Set whether blocktime has been set to "TRUE" */
8656 bt_set = TRUE;
8657
8658 set__bt_set_team(thread->th.th_team, tid, bt_set);
8659 set__bt_set_team(thread->th.th_serial_team, 0, bt_set);
8660#if KMP_USE_MONITOR
8661 KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d, "
8662 "bt_intervals=%d, monitor_updates=%d\n",
8663 __kmp_gtid_from_tid(tid, thread->th.th_team),
8664 thread->th.th_team->t.t_id, tid, blocktime, bt_intervals,
8665 __kmp_monitor_wakeups));
8666#else
8667 KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d\n",
8668 __kmp_gtid_from_tid(tid, thread->th.th_team),
8669 thread->th.th_team->t.t_id, tid, blocktime));
8670#endif
8671}
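/* The user-facing controls for the value normalized above are the
   KMP_BLOCKTIME environment variable and the kmp_set_blocktime() service
   routine (declared in this runtime's omp.h extensions). A sketch:

     #include <omp.h>
     int main(void) {
       kmp_set_blocktime(0); // workers sleep right after finishing a region
     #pragma omp parallel
       { } // ... parallel work ...
       return 0;
     }
*/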
8672
8673void __kmp_aux_set_defaults(char const *str, size_t len) {
8674 if (!__kmp_init_serial) {
8675 __kmp_serial_initialize();
8676 }
8677 __kmp_env_initialize(str);
8678
8679 if (__kmp_settings || __kmp_display_env || __kmp_display_env_verbose) {
8680 __kmp_env_print();
8681 }
8682} // __kmp_aux_set_defaults
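/* kmp_set_defaults() is the service routine expected to reach this helper; it
   takes a "NAME=value" string in the same form as the corresponding
   environment variables and, like them, is best issued before the first
   parallel region (the specific setting below is only an example).

     #include <omp.h>
     int main(void) {
       kmp_set_defaults("KMP_BLOCKTIME=0");
     #pragma omp parallel
       { }
       return 0;
     }
*/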
8683
8684/* ------------------------------------------------------------------------ */
8685/* internal fast reduction routines */
8686
8687PACKED_REDUCTION_METHOD_T
8688__kmp_determine_reduction_method(
8689 ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars, size_t reduce_size,
8690 void *reduce_data, void (*reduce_func)(void *lhs_data, void *rhs_data),
8691 kmp_critical_name *lck) {
8692
8693 // Default reduction method: critical construct ( lck != NULL, like in current
8694 // PAROPT )
8695 // If ( reduce_data!=NULL && reduce_func!=NULL ): the tree-reduction method
8696 // can be selected by RTL
8697 // If loc->flags contains KMP_IDENT_ATOMIC_REDUCE, the atomic reduce method
8698 // can be selected by RTL
8699 // Finally, it's up to OpenMP RTL to make a decision on which method to select
8700 // among those generated by PAROPT.
8701
8702 PACKED_REDUCTION_METHOD_T retval;
8703
8704 int team_size;
8705
8706 KMP_DEBUG_ASSERT(loc); // it would be nice to test ( loc != 0 )
8707 KMP_DEBUG_ASSERT(lck); // it would be nice to test ( lck != 0 )
8708
8709#define FAST_REDUCTION_ATOMIC_METHOD_GENERATED \
8710 ((loc->flags & (KMP_IDENT_ATOMIC_REDUCE)) == (KMP_IDENT_ATOMIC_REDUCE))
8711#define FAST_REDUCTION_TREE_METHOD_GENERATED ((reduce_data) && (reduce_func))
8712
8713 retval = critical_reduce_block;
8714
8715 // another way of getting the team size (with one dynamic dereference) is slower
8716 team_size = __kmp_get_team_num_threads(global_tid);
8717 if (team_size == 1) {
8718
8719 retval = empty_reduce_block;
8720
8721 } else {
8722
8723 int atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
8724
8725#if KMP_ARCH_X86_64 || KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 || \
8726 KMP_ARCH_MIPS64 || KMP_ARCH_RISCV64
8727
8728#if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD || \
8729 KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD
8730
8731 int teamsize_cutoff = 4;
8732
8733#if KMP_MIC_SUPPORTED
8734 if (__kmp_mic_type != non_mic) {
8735 teamsize_cutoff = 8;
8736 }
8737#endif
8738 int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8739 if (tree_available) {
8740 if (team_size <= teamsize_cutoff) {
8741 if (atomic_available) {
8742 retval = atomic_reduce_block;
8743 }
8744 } else {
8745 retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
8746 }
8747 } else if (atomic_available) {
8748 retval = atomic_reduce_block;
8749 }
8750#else
8751#error "Unknown or unsupported OS"
8752#endif // KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD ||
8753 // KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD
8754
8755#elif KMP_ARCH_X86 || KMP_ARCH_ARM || KMP_ARCH_AARCH || KMP_ARCH_MIPS
8756
8757#if KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_WINDOWS || KMP_OS_HURD
8758
8759 // basic tuning
8760
8761 if (atomic_available) {
8762 if (num_vars <= 2) { // && ( team_size <= 8 ) due to false-sharing ???
8763 retval = atomic_reduce_block;
8764 }
8765 } // otherwise: use critical section
8766
8767#elif KMP_OS_DARWIN
8768
8769 int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8770 if (atomic_available && (num_vars <= 3)) {
8771 retval = atomic_reduce_block;
8772 } else if (tree_available) {
8773 if ((reduce_size > (9 * sizeof(kmp_real64))) &&
8774 (reduce_size < (2000 * sizeof(kmp_real64)))) {
8775 retval = TREE_REDUCE_BLOCK_WITH_PLAIN_BARRIER;
8776 }
8777 } // otherwise: use critical section
8778
8779#else
8780#error "Unknown or unsupported OS"
8781#endif
8782
8783#else
8784#error "Unknown or unsupported architecture"
8785#endif
8786 }
8787
8788 // KMP_FORCE_REDUCTION
8789
8790 // If the team is serialized (team_size == 1), ignore the forced reduction
8791 // method and stay with the unsynchronized method (empty_reduce_block)
8792 if (__kmp_force_reduction_method != reduction_method_not_defined &&
8793 team_size != 1) {
8794
8795 PACKED_REDUCTION_METHOD_T forced_retval = critical_reduce_block;
8796
8797 int atomic_available, tree_available;
8798
8799 switch ((forced_retval = __kmp_force_reduction_method)) {
8800 case critical_reduce_block:
8801 KMP_ASSERT(lck); // lck should be != 0
8802 break;
8803
8804 case atomic_reduce_block:
8805 atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
8806 if (!atomic_available) {
8807 KMP_WARNING(RedMethodNotSupported, "atomic");
8808 forced_retval = critical_reduce_block;
8809 }
8810 break;
8811
8812 case tree_reduce_block:
8813 tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8814 if (!tree_available) {
8815 KMP_WARNING(RedMethodNotSupported, "tree");
8816 forced_retval = critical_reduce_block;
8817 } else {
8818#if KMP_FAST_REDUCTION_BARRIER
8819 forced_retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
8820#endif
8821 }
8822 break;
8823
8824 default:
8825 KMP_ASSERT(0); // "unsupported method specified"
8826 }
8827
8828 retval = forced_retval;
8829 }
8830
8831 KA_TRACE(10, ("reduction method selected=%08x\n", retval));
8832
8833#undef FAST_REDUCTION_TREE_METHOD_GENERATED
8834#undef FAST_REDUCTION_ATOMIC_METHOD_GENERATED
8835
8836 return (retval);
8837}
8838// this function is for testing set/get/determine reduce method
8839kmp_int32 __kmp_get_reduce_method(void) {
8840 return ((__kmp_entry_thread()->th.th_local.packed_reduction_method) >> 8);
8841}
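/* The selection above is driven by the compiler-generated
   __kmpc_reduce()/__kmpc_reduce_nowait() calls that a reduction clause lowers
   to; whether the atomic or tree flavor is even a candidate depends on what
   the compiler emitted (reduce_data/reduce_func, KMP_IDENT_ATOMIC_REDUCE). A
   user-side sketch that exercises it:

     #include <stdio.h>
     int main(void) {
       long sum = 0;
     #pragma omp parallel for reduction(+ : sum)
       for (int i = 0; i < 1000; ++i)
         sum += i;
       printf("%ld\n", sum); // prints 499500
       return 0;
     }
*/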
8842
8843// Soft pause sets up threads to ignore blocktime and just go to sleep.
8844// Spin-wait code checks __kmp_pause_status and reacts accordingly.
8845void __kmp_soft_pause() { __kmp_pause_status = kmp_soft_paused; }
8846
8847// Hard pause shuts down the runtime completely. Resume happens naturally when
8848// OpenMP is used subsequently.
8849void __kmp_hard_pause() {
8850 __kmp_pause_status = kmp_hard_paused;
8851 __kmp_internal_end_thread(-1);
8852}
8853
8854// Soft resume sets __kmp_pause_status, and wakes up all threads.
8855void __kmp_resume_if_soft_paused() {
8856 if (__kmp_pause_status == kmp_soft_paused) {
8857 __kmp_pause_status = kmp_not_paused;
8858
8859 for (int gtid = 1; gtid < __kmp_threads_capacity; ++gtid) {
8860 kmp_info_t *thread = __kmp_threads[gtid];
8861 if (thread) { // Wake it if sleeping
8862 kmp_flag_64<> fl(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go,
8863 thread);
8864 if (fl.is_sleeping())
8865 fl.resume(gtid);
8866 else if (__kmp_try_suspend_mx(thread)) { // got suspend lock
8867 __kmp_unlock_suspend_mx(thread); // unlock it; it won't sleep
8868 } else { // thread holds the lock and may sleep soon
8869 do { // until either the thread sleeps, or we can get the lock
8870 if (fl.is_sleeping()) {
8871 fl.resume(gtid);
8872 break;
8873 } else if (__kmp_try_suspend_mx(thread)) {
8874 __kmp_unlock_suspend_mx(thread);
8875 break;
8876 }
8877 } while (1);
8878 }
8879 }
8880 }
8881 }
8882}
8883
8884// This function is called via __kmpc_pause_resource. Returns 0 if successful.
8885// TODO: add warning messages
8886int __kmp_pause_resource(kmp_pause_status_t level) {
8887 if (level == kmp_not_paused) { // requesting resume
8888 if (__kmp_pause_status == kmp_not_paused) {
8889 // error message about runtime not being paused, so can't resume
8890 return 1;
8891 } else {
8892 KMP_DEBUG_ASSERT(__kmp_pause_status == kmp_soft_paused ||
8893 __kmp_pause_status == kmp_hard_paused);
8894 __kmp_pause_status = kmp_not_paused;
8895 return 0;
8896 }
8897 } else if (level == kmp_soft_paused) { // requesting soft pause
8898 if (__kmp_pause_status != kmp_not_paused) {
8899 // error message about already being paused
8900 return 1;
8901 } else {
8902 __kmp_soft_pause();
8903 return 0;
8904 }
8905 } else if (level == kmp_hard_paused) { // requesting hard pause
8906 if (__kmp_pause_status != kmp_not_paused) {
8907 // error message about already being paused
8908 return 1;
8909 } else {
8910 __kmp_hard_pause();
8911 return 0;
8912 }
8913 } else {
8914 // error message about invalid level
8915 return 1;
8916 }
8917}
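/* The OpenMP 5.0 pause API is the public face of these levels:
   omp_pause_resource_all(omp_pause_soft) requests the soft pause,
   omp_pause_hard the hard one, and a nonzero return means the request was
   rejected, mirroring the checks above. A sketch:

     #include <omp.h>
     #include <stdio.h>
     int main(void) {
     #pragma omp parallel
       { } // warm up the thread pool
       if (omp_pause_resource_all(omp_pause_soft) != 0)
         printf("soft pause rejected\n");
     #pragma omp parallel
       { } // using OpenMP again resumes the runtime
       return 0;
     }
*/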
8918
8919void __kmp_omp_display_env(int verbose) {
8920 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
8921 if (__kmp_init_serial == 0)
8922 __kmp_do_serial_initialize();
8923 __kmp_display_env_impl(!verbose, verbose);
8924 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
8925}
8926
8927// The team size is changing, so the distributed barrier must be modified
8928void __kmp_resize_dist_barrier(kmp_team_t *team, int old_nthreads,
8929 int new_nthreads) {
8930 KMP_DEBUG_ASSERT(__kmp_barrier_release_pattern[bs_forkjoin_barrier] ==
8931 bp_dist_bar);
8932 kmp_info_t **other_threads = team->t.t_threads;
8933
8934 // We want all the workers to stop waiting on the barrier while we adjust the
8935 // size of the team.
8936 for (int f = 1; f < old_nthreads; ++f) {
8937 KMP_DEBUG_ASSERT(other_threads[f] != NULL);
8938 // Ignore threads that are already inactive or not present in the team
8939 if (team->t.t_threads[f]->th.th_used_in_team.load() == 0) {
8940 // teams construct causes thread_limit to get passed in, and some of
8941 // those could be inactive; just ignore them
8942 continue;
8943 }
8944 // If thread is transitioning still to in_use state, wait for it
8945 if (team->t.t_threads[f]->th.th_used_in_team.load() == 3) {
8946 while (team->t.t_threads[f]->th.th_used_in_team.load() == 3)
8947 KMP_CPU_PAUSE();
8948 }
8949 // The thread should be in_use now
8950 KMP_DEBUG_ASSERT(team->t.t_threads[f]->th.th_used_in_team.load() == 1);
8951 // Transition to unused state
8952 team->t.t_threads[f]->th.th_used_in_team.store(2);
8953 KMP_DEBUG_ASSERT(team->t.t_threads[f]->th.th_used_in_team.load() == 2);
8954 }
8955 // Release all the workers
8956 kmp_uint64 new_value; // new value for go
8957 new_value = team->t.b->go_release();
8958
8959 KMP_MFENCE();
8960
8961 // Workers should see transition status 2 and move to 0; but may need to be
8962 // woken up first
8963 size_t my_go_index;
8964 int count = old_nthreads - 1;
8965 while (count > 0) {
8966 count = old_nthreads - 1;
8967 for (int f = 1; f < old_nthreads; ++f) {
8968 my_go_index = f / team->t.b->threads_per_go;
8969 if (other_threads[f]->th.th_used_in_team.load() != 0) {
8970 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { // Wake up the workers
8971 kmp_atomic_flag_64<> *flag = (kmp_atomic_flag_64<> *)CCAST(
8972 void *, other_threads[f]->th.th_sleep_loc);
8973 __kmp_atomic_resume_64(other_threads[f]->th.th_info.ds.ds_gtid, flag);
8974 }
8975 } else {
8976 KMP_DEBUG_ASSERT(team->t.t_threads[f]->th.th_used_in_team.load() == 0);
8977 count--;
8978 }
8979 }
8980 }
8981 // Now update the barrier size
8982 team->t.b->update_num_threads(new_nthreads);
8983 team->t.b->go_reset();
8984}
8985
8986void __kmp_add_threads_to_team(kmp_team_t *team, int new_nthreads) {
8987 // Add the threads back to the team
8988 KMP_DEBUG_ASSERT(team);
8989 // Threads were paused and pointed at th_used_in_team temporarily during a
8990 // resize of the team. We're going to set th_used_in_team to 3 to indicate to
8991 // the thread that it should transition itself back into the team. Then, if
8992 // blocktime isn't infinite, the thread could be sleeping, so we send a resume
8993 // to wake it up.
8994 for (int f = 1; f < new_nthreads; ++f) {
8995 KMP_DEBUG_ASSERT(team->t.t_threads[f]);
8996 KMP_COMPARE_AND_STORE_ACQ32(&(team->t.t_threads[f]->th.th_used_in_team), 0,
8997 3);
8998 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { // Wake up sleeping threads
8999 __kmp_resume_32(team->t.t_threads[f]->th.th_info.ds.ds_gtid,
9000 (kmp_flag_32<false, false> *)NULL);
9001 }
9002 }
9003 // The threads should be transitioning to the team; when they are done, they
9004 // should have set th_used_in_team to 1. This loop forces the primary thread to
9005 // wait until all threads have moved into the team and are waiting in the barrier.
9006 int count = new_nthreads - 1;
9007 while (count > 0) {
9008 count = new_nthreads - 1;
9009 for (int f = 1; f < new_nthreads; ++f) {
9010 if (team->t.t_threads[f]->th.th_used_in_team.load() == 1) {
9011 count--;
9012 }
9013 }
9014 }
9015}
9016
9017// Globals and functions for hidden helper task
9018kmp_info_t **__kmp_hidden_helper_threads;
9019kmp_info_t *__kmp_hidden_helper_main_thread;
9020std::atomic<kmp_int32> __kmp_unexecuted_hidden_helper_tasks;
9021#if KMP_OS_LINUX
9022kmp_int32 __kmp_hidden_helper_threads_num = 8;
9023kmp_int32 __kmp_enable_hidden_helper = TRUE;
9024#else
9025kmp_int32 __kmp_hidden_helper_threads_num = 0;
9026kmp_int32 __kmp_enable_hidden_helper = FALSE;
9027#endif
9028
9029namespace {
9030std::atomic<kmp_int32> __kmp_hit_hidden_helper_threads_num;
9031
9032void __kmp_hidden_helper_wrapper_fn(int *gtid, int *, ...) {
9033 // This is an explicit synchronization of all hidden helper threads, in case
9034 // a regular thread pushes a hidden helper task to a hidden helper thread
9035 // that has not yet been woken since being released by the main thread after
9036 // creating the team.
9037 KMP_ATOMIC_INC(&__kmp_hit_hidden_helper_threads_num);
9038 while (KMP_ATOMIC_LD_ACQ(&__kmp_hit_hidden_helper_threads_num) !=
9039 __kmp_hidden_helper_threads_num)
9040 ;
9041
9042 // If main thread, then wait for signal
9043 if (__kmpc_master(nullptr, *gtid)) {
9044 // First, unset the initial state and release the initial thread
9045 TCW_4(__kmp_init_hidden_helper_threads, FALSE);
9046 __kmp_hidden_helper_initz_release();
9047 __kmp_hidden_helper_main_thread_wait();
9048 // Now wake up all worker threads
9049 for (int i = 1; i < __kmp_hit_hidden_helper_threads_num; ++i) {
9050 __kmp_hidden_helper_worker_thread_signal();
9051 }
9052 }
9053}
9054} // namespace
9055
9056void __kmp_hidden_helper_threads_initz_routine() {
9057 // Create a new root for hidden helper team/threads
9058 const int gtid = __kmp_register_root(TRUE);
9059 __kmp_hidden_helper_main_thread = __kmp_threads[gtid];
9060 __kmp_hidden_helper_threads = &__kmp_threads[gtid];
9061 __kmp_hidden_helper_main_thread->th.th_set_nproc =
9062 __kmp_hidden_helper_threads_num;
9063
9064 KMP_ATOMIC_ST_REL(&__kmp_hit_hidden_helper_threads_num, 0);
9065
9066 __kmpc_fork_call(nullptr, 0, __kmp_hidden_helper_wrapper_fn);
9067
9068 // Set the initialization flag to FALSE
9069 TCW_SYNC_4(__kmp_init_hidden_helper, FALSE);
9070
9071 __kmp_hidden_helper_threads_deinitz_release();
9072}
9073
9074/* Nesting Mode:
9075 Set via KMP_NESTING_MODE, which takes an integer.
9076 Note: we skip duplicate topology levels, and skip levels with only
9077 one entity.
9078 KMP_NESTING_MODE=0 is the default, and doesn't use nesting mode.
9079 KMP_NESTING_MODE=1 sets as many nesting levels as there are distinct levels
9080 in the topology, and initializes the number of threads at each of those
9081 levels to the number of entities at each level, respectively, below the
9082 entity at the parent level.
9083 KMP_NESTING_MODE=N, where N>1, attempts to create up to N nesting levels,
9084 but starts with nesting OFF -- max-active-levels-var is 1 -- and requires
9085 the user to turn nesting on explicitly. This is an even more experimental
9086 option to this experimental feature, and may change or go away in the
9087 future.
9088*/
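/* A sketch of using this mode: with KMP_NESTING_MODE=1 set in the environment
   before the program starts, the runtime sizes each nesting level from the
   machine topology, so the nested parallel region below picks up those
   per-level thread counts instead of the usual single-threaded inner teams.

     #include <omp.h>
     #include <stdio.h>
     int main(void) { // run with KMP_NESTING_MODE=1
     #pragma omp parallel
       {
     #pragma omp parallel
         {
     #pragma omp single
           printf("level %d has %d threads\n", omp_get_level(),
                  omp_get_num_threads());
         }
       }
       return 0;
     }
*/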
9089
9090// Allocate space to store nesting levels
9091void __kmp_init_nesting_mode() {
9092 int levels = KMP_HW_LAST;
9093 __kmp_nesting_mode_nlevels = levels;
9094 __kmp_nesting_nth_level = (int *)KMP_INTERNAL_MALLOC(levels * sizeof(int));
9095 for (int i = 0; i < levels; ++i)
9096 __kmp_nesting_nth_level[i] = 0;
9097 if (__kmp_nested_nth.size < levels) {
9098 __kmp_nested_nth.nth =
9099 (int *)KMP_INTERNAL_REALLOC(__kmp_nested_nth.nth, levels * sizeof(int));
9100 __kmp_nested_nth.size = levels;
9101 }
9102}
9103
9104// Set # threads for top levels of nesting; must be called after topology set
9105void __kmp_set_nesting_mode_threads() {
9106 kmp_info_t *thread = __kmp_threads[__kmp_entry_gtid()];
9107
9108 if (__kmp_nesting_mode == 1)
9109 __kmp_nesting_mode_nlevels = KMP_MAX_ACTIVE_LEVELS_LIMIT;
9110 else if (__kmp_nesting_mode > 1)
9111 __kmp_nesting_mode_nlevels = __kmp_nesting_mode;
9112
9113 if (__kmp_topology) { // use topology info
9114 int loc, hw_level;
9115 for (loc = 0, hw_level = 0; hw_level < __kmp_topology->get_depth() &&
9116 loc < __kmp_nesting_mode_nlevels;
9117 loc++, hw_level++) {
9118 __kmp_nesting_nth_level[loc] = __kmp_topology->get_ratio(hw_level);
9119 if (__kmp_nesting_nth_level[loc] == 1)
9120 loc--;
9121 }
9122 // Make sure all cores are used
9123 if (__kmp_nesting_mode > 1 && loc > 1) {
9124 int core_level = __kmp_topology->get_level(KMP_HW_CORE);
9125 int num_cores = __kmp_topology->get_count(core_level);
9126 int upper_levels = 1;
9127 for (int level = 0; level < loc - 1; ++level)
9128 upper_levels *= __kmp_nesting_nth_level[level];
9129 if (upper_levels * __kmp_nesting_nth_level[loc - 1] < num_cores)
9130 __kmp_nesting_nth_level[loc - 1] =
9131 num_cores / __kmp_nesting_nth_level[loc - 2];
9132 }
9133 __kmp_nesting_mode_nlevels = loc;
9134 __kmp_nested_nth.used = __kmp_nesting_mode_nlevels;
9135 } else { // no topology info available; provide a reasonable guesstimation
9136 if (__kmp_avail_proc >= 4) {
9137 __kmp_nesting_nth_level[0] = __kmp_avail_proc / 2;
9138 __kmp_nesting_nth_level[1] = 2;
9139 __kmp_nesting_mode_nlevels = 2;
9140 } else {
9141 __kmp_nesting_nth_level[0] = __kmp_avail_proc;
9142 __kmp_nesting_mode_nlevels = 1;
9143 }
9144 __kmp_nested_nth.used = __kmp_nesting_mode_nlevels;
9145 }
9146 for (int i = 0; i < __kmp_nesting_mode_nlevels; ++i) {
9147 __kmp_nested_nth.nth[i] = __kmp_nesting_nth_level[i];
9148 }
9149 set__nproc(thread, __kmp_nesting_nth_level[0]);
9150 if (__kmp_nesting_mode > 1 && __kmp_nesting_mode_nlevels > __kmp_nesting_mode)
9151 __kmp_nesting_mode_nlevels = __kmp_nesting_mode;
9152 if (get__max_active_levels(thread) > 1) {
9153 // if max levels was set, set nesting mode levels to same
9154 __kmp_nesting_mode_nlevels = get__max_active_levels(thread);
9155 }
9156 if (__kmp_nesting_mode == 1) // turn on nesting for this case only
9157 set__max_active_levels(thread, __kmp_nesting_mode_nlevels);
9158}