1/*
2 * kmp_runtime.cpp -- KPTS runtime support library
3 */
4
5//===----------------------------------------------------------------------===//
6//
7// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8// See https://llvm.org/LICENSE.txt for license information.
9// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10//
11//===----------------------------------------------------------------------===//
12
13#include "kmp.h"
14#include "kmp_affinity.h"
15#include "kmp_atomic.h"
16#include "kmp_environment.h"
17#include "kmp_error.h"
18#include "kmp_i18n.h"
19#include "kmp_io.h"
20#include "kmp_itt.h"
21#include "kmp_settings.h"
22#include "kmp_stats.h"
23#include "kmp_str.h"
24#include "kmp_wait_release.h"
25#include "kmp_wrapper_getpid.h"
26#include "kmp_dispatch.h"
27#include <cstdio>
28#if KMP_USE_HIER_SCHED
29#include "kmp_dispatch_hier.h"
30#endif
31
32#if OMPT_SUPPORT
33#include "ompt-specific.h"
34#endif
35#if OMPD_SUPPORT
36#include "ompd-specific.h"
37#endif
38
39#if OMP_PROFILING_SUPPORT
40#include "llvm/Support/TimeProfiler.h"
41static char *ProfileTraceFile = nullptr;
42#endif
43
44/* these are temporary issues to be dealt with */
45#define KMP_USE_PRCTL 0
46
47#if KMP_OS_WINDOWS
48#include <process.h>
49#endif
50
51#if KMP_OS_WINDOWS
52// Windows does not need these include files because it does not use shared memory
53#else
54#include <sys/mman.h>
55#include <sys/stat.h>
56#include <fcntl.h>
57#define SHM_SIZE 1024
58#endif
59
60#if defined(KMP_GOMP_COMPAT)
61char const __kmp_version_alt_comp[] =
62 KMP_VERSION_PREFIX "alternative compiler support: yes";
63#endif /* defined(KMP_GOMP_COMPAT) */
64
65char const __kmp_version_omp_api[] =
66 KMP_VERSION_PREFIX "API version: 5.0 (201611)";
67
68#ifdef KMP_DEBUG
69char const __kmp_version_lock[] =
70 KMP_VERSION_PREFIX "lock type: run time selectable";
71#endif /* KMP_DEBUG */
72
73#define KMP_MIN(x, y) ((x) < (y) ? (x) : (y))
74
75/* ------------------------------------------------------------------------ */
76
77#if KMP_USE_MONITOR
78kmp_info_t __kmp_monitor;
79#endif
80
81/* Forward declarations */
82
83void __kmp_cleanup(void);
84
85static void __kmp_initialize_info(kmp_info_t *, kmp_team_t *, int tid,
86 int gtid);
87static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
88 kmp_internal_control_t *new_icvs,
89 ident_t *loc);
90#if KMP_AFFINITY_SUPPORTED
91static void __kmp_partition_places(kmp_team_t *team,
92 int update_master_only = 0);
93#endif
94static void __kmp_do_serial_initialize(void);
95void __kmp_fork_barrier(int gtid, int tid);
96void __kmp_join_barrier(int gtid);
97void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc,
98 kmp_internal_control_t *new_icvs, ident_t *loc);
99
100#ifdef USE_LOAD_BALANCE
101static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc);
102#endif
103
104static int __kmp_expand_threads(int nNeed);
105#if KMP_OS_WINDOWS
106static int __kmp_unregister_root_other_thread(int gtid);
107#endif
108static void __kmp_reap_thread(kmp_info_t *thread, int is_root);
109kmp_info_t *__kmp_thread_pool_insert_pt = NULL;
110
111void __kmp_resize_dist_barrier(kmp_team_t *team, int old_nthreads,
112 int new_nthreads);
113void __kmp_add_threads_to_team(kmp_team_t *team, int new_nthreads);
114
115/* Calculate the identifier of the current thread */
116/* A fast (and somewhat portable) way to get a unique identifier for the
117 executing thread. Returns KMP_GTID_DNE if we haven't been assigned a gtid. */
118int __kmp_get_global_thread_id() {
119 int i;
120 kmp_info_t **other_threads;
121 size_t stack_data;
122 char *stack_addr;
123 size_t stack_size;
124 char *stack_base;
125
126 KA_TRACE(
127 1000,
128 ("*** __kmp_get_global_thread_id: entering, nproc=%d all_nproc=%d\n",
129 __kmp_nth, __kmp_all_nth));
130
131 /* JPH - to handle the case where __kmpc_end(0) is called immediately prior to
132 a parallel region, made it return KMP_GTID_DNE to force serial_initialize
133 by caller. Had to handle KMP_GTID_DNE at all call-sites, or else guarantee
134 __kmp_init_gtid for this to work. */
135
136 if (!TCR_4(__kmp_init_gtid))
137 return KMP_GTID_DNE;
138
139#ifdef KMP_TDATA_GTID
140 if (TCR_4(__kmp_gtid_mode) >= 3) {
141 KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using TDATA\n"));
142 return __kmp_gtid;
143 }
144#endif
145 if (TCR_4(__kmp_gtid_mode) >= 2) {
146 KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using keyed TLS\n"));
147 return __kmp_gtid_get_specific();
148 }
149 KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using internal alg.\n"));
150
151 stack_addr = (char *)&stack_data;
152 other_threads = __kmp_threads;
153
154 /* ATT: The code below is a source of potential bugs due to unsynchronized
155 access to __kmp_threads array. For example:
156 1. Current thread loads other_threads[i] to thr and checks it, it is
157 non-NULL.
158 2. Current thread is suspended by OS.
159 3. Another thread unregisters and finishes (debug versions of free()
160 may fill memory with something like 0xEF).
161 4. Current thread is resumed.
162 5. Current thread reads junk from *thr.
163 TODO: Fix it. --ln */
164
165 for (i = 0; i < __kmp_threads_capacity; i++) {
166
167 kmp_info_t *thr = (kmp_info_t *)TCR_SYNC_PTR(other_threads[i]);
168 if (!thr)
169 continue;
170
171 stack_size = (size_t)TCR_PTR(thr->th.th_info.ds.ds_stacksize);
172 stack_base = (char *)TCR_PTR(thr->th.th_info.ds.ds_stackbase);
173
174 /* stack grows down -- search through all of the active threads */
175
176 if (stack_addr <= stack_base) {
177 size_t stack_diff = stack_base - stack_addr;
178
179 if (stack_diff <= stack_size) {
180 /* The only way we can be closer than the allocated */
181 /* stack size is if we are running on this thread. */
182 KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == i);
183 return i;
184 }
185 }
186 }
187
188 /* use __kmp_gtid_get_specific to try to determine our gtid */
189 KA_TRACE(1000,
190 ("*** __kmp_get_global_thread_id: internal alg. failed to find "
191 "thread, using TLS\n"));
192 i = __kmp_gtid_get_specific();
193
194 /*fprintf( stderr, "=== %d\n", i ); */ /* GROO */
195
196 /* if we haven't been assigned a gtid, then return that code as-is */
197 if (i < 0)
198 return i;
199
200 /* dynamically updated stack window for uber threads to avoid get_specific
201 call */
202 if (!TCR_4(other_threads[i]->th.th_info.ds.ds_stackgrow)) {
203 KMP_FATAL(StackOverflow, i);
204 }
205
206 stack_base = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
207 if (stack_addr > stack_base) {
208 TCW_PTR(other_threads[i]->th.th_info.ds.ds_stackbase, stack_addr);
209 TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
210 other_threads[i]->th.th_info.ds.ds_stacksize + stack_addr -
211 stack_base);
212 } else {
213 TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
214 stack_base - stack_addr);
215 }
216
217 /* Reprint stack bounds for ubermaster since they have been refined */
218 if (__kmp_storage_map) {
219 char *stack_end = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
220 char *stack_beg = stack_end - other_threads[i]->th.th_info.ds.ds_stacksize;
221 __kmp_print_storage_map_gtid(i, stack_beg, stack_end,
222 other_threads[i]->th.th_info.ds.ds_stacksize,
223 "th_%d stack (refinement)", i);
224 }
225 return i;
226}
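// The internal algorithm above identifies the owning thread by checking
// whether the current stack address falls within the recorded, downward-
// growing stack of each registered thread. A minimal standalone sketch of
// that containment test (illustrative names, not the runtime's; guarded out
// of the build):
#if 0
#include <cstddef>

// True if 'addr' lies inside a downward-growing stack whose highest address
// is 'stack_base' and whose extent is 'stack_size' bytes.
static bool addr_in_stack(const char *addr, const char *stack_base,
                          std::size_t stack_size) {
  if (addr > stack_base)
    return false; // above the recorded top of the stack
  return static_cast<std::size_t>(stack_base - addr) <= stack_size;
}
#endif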
227
228int __kmp_get_global_thread_id_reg() {
229 int gtid;
230
231 if (!__kmp_init_serial) {
232 gtid = KMP_GTID_DNE;
233 } else
234#ifdef KMP_TDATA_GTID
235 if (TCR_4(__kmp_gtid_mode) >= 3) {
236 KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using TDATA\n"));
237 gtid = __kmp_gtid;
238 } else
239#endif
240 if (TCR_4(__kmp_gtid_mode) >= 2) {
241 KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using keyed TLS\n"));
242 gtid = __kmp_gtid_get_specific();
243 } else {
244 KA_TRACE(1000,
245 ("*** __kmp_get_global_thread_id_reg: using internal alg.\n"));
246 gtid = __kmp_get_global_thread_id();
247 }
248
249 /* we must be a new uber master sibling thread */
250 if (gtid == KMP_GTID_DNE) {
251 KA_TRACE(10,
252 ("__kmp_get_global_thread_id_reg: Encountered new root thread. "
253 "Registering a new gtid.\n"));
254 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
255 if (!__kmp_init_serial) {
256 __kmp_do_serial_initialize();
257 gtid = __kmp_gtid_get_specific();
258 } else {
259 gtid = __kmp_register_root(FALSE);
260 }
261 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
262 /*__kmp_printf( "+++ %d\n", gtid ); */ /* GROO */
263 }
264
265 KMP_DEBUG_ASSERT(gtid >= 0);
266
267 return gtid;
268}
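// __kmp_get_global_thread_id_reg() is also the path by which a thread not
// created by the runtime (a "foreign" thread) gets registered as a new root
// on its first OpenMP call. A small user-level sketch of that situation
// (guarded out of the build; assumes an OpenMP-enabled compiler):
#if 0
#include <cstdio>
#include <omp.h>
#include <thread>

static void work_on_foreign_thread() {
  // First OpenMP call on this foreign thread: the runtime registers it as a
  // new root and assigns it a fresh gtid (see __kmp_register_root).
#pragma omp parallel num_threads(2)
  std::printf("hello from thread %d\n", omp_get_thread_num());
}

int main() {
  std::thread t(work_on_foreign_thread);
  t.join();
  return 0;
}
#endif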
269
270/* caller must hold forkjoin_lock */
271void __kmp_check_stack_overlap(kmp_info_t *th) {
272 int f;
273 char *stack_beg = NULL;
274 char *stack_end = NULL;
275 int gtid;
276
277 KA_TRACE(10, ("__kmp_check_stack_overlap: called\n"));
278 if (__kmp_storage_map) {
279 stack_end = (char *)th->th.th_info.ds.ds_stackbase;
280 stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
281
282 gtid = __kmp_gtid_from_thread(th);
283
284 if (gtid == KMP_GTID_MONITOR) {
285 __kmp_print_storage_map_gtid(
286 gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
287 "th_%s stack (%s)", "mon",
288 (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
289 } else {
290 __kmp_print_storage_map_gtid(
291 gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
292 "th_%d stack (%s)", gtid,
293 (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
294 }
295 }
296
297 /* No point in checking ubermaster threads since they use refinement and
298 * cannot overlap */
299 gtid = __kmp_gtid_from_thread(th);
300 if (__kmp_env_checks == TRUE && !KMP_UBER_GTID(gtid)) {
301 KA_TRACE(10,
302 ("__kmp_check_stack_overlap: performing extensive checking\n"));
303 if (stack_beg == NULL) {
304 stack_end = (char *)th->th.th_info.ds.ds_stackbase;
305 stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
306 }
307
308 for (f = 0; f < __kmp_threads_capacity; f++) {
309 kmp_info_t *f_th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[f]);
310
311 if (f_th && f_th != th) {
312 char *other_stack_end =
313 (char *)TCR_PTR(f_th->th.th_info.ds.ds_stackbase);
314 char *other_stack_beg =
315 other_stack_end - (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize);
316 if ((stack_beg > other_stack_beg && stack_beg < other_stack_end) ||
317 (stack_end > other_stack_beg && stack_end < other_stack_end)) {
318
319 /* Print the other stack values before the abort */
320 if (__kmp_storage_map)
321 __kmp_print_storage_map_gtid(
322 -1, other_stack_beg, other_stack_end,
323 (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize),
324 "th_%d stack (overlapped)", __kmp_gtid_from_thread(f_th));
325
326 __kmp_fatal(KMP_MSG(StackOverlap), KMP_HNT(ChangeStackLimit),
327 __kmp_msg_null);
328 }
329 }
330 }
331 }
332 KA_TRACE(10, ("__kmp_check_stack_overlap: returning\n"));
333}
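// The check above treats two stacks as overlapping when either endpoint of
// this thread's stack falls strictly inside the other thread's range. A
// self-contained sketch of that interval test (illustrative names; guarded
// out of the build):
#if 0
static bool stacks_overlap(const char *beg, const char *end,
                           const char *other_beg, const char *other_end) {
  // Mirrors the condition used in __kmp_check_stack_overlap().
  return (beg > other_beg && beg < other_end) ||
         (end > other_beg && end < other_end);
}
#endif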
334
335/* ------------------------------------------------------------------------ */
336
337void __kmp_infinite_loop(void) {
338 static int done = FALSE;
339
340 while (!done) {
341 KMP_YIELD(TRUE);
342 }
343}
344
345#define MAX_MESSAGE 512
346
347void __kmp_print_storage_map_gtid(int gtid, void *p1, void *p2, size_t size,
348 char const *format, ...) {
349 char buffer[MAX_MESSAGE];
350 va_list ap;
351
352 va_start(ap, format);
353 KMP_SNPRINTF(buffer, sizeof(buffer), "OMP storage map: %p %p%8lu %s\n", p1,
354 p2, (unsigned long)size, format);
355 __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
356 __kmp_vprintf(kmp_err, buffer, ap);
357#if KMP_PRINT_DATA_PLACEMENT
358 int node;
359 if (gtid >= 0) {
360 if (p1 <= p2 && (char *)p2 - (char *)p1 == size) {
361 if (__kmp_storage_map_verbose) {
362 node = __kmp_get_host_node(p1);
363 if (node < 0) /* doesn't work, so don't try this next time */
364 __kmp_storage_map_verbose = FALSE;
365 else {
366 char *last;
367 int lastNode;
368 int localProc = __kmp_get_cpu_from_gtid(gtid);
369
370 const int page_size = KMP_GET_PAGE_SIZE();
371
372 p1 = (void *)((size_t)p1 & ~((size_t)page_size - 1));
373 p2 = (void *)(((size_t)p2 - 1) & ~((size_t)page_size - 1));
374 if (localProc >= 0)
375 __kmp_printf_no_lock(" GTID %d localNode %d\n", gtid,
376 localProc >> 1);
377 else
378 __kmp_printf_no_lock(" GTID %d\n", gtid);
379#if KMP_USE_PRCTL
380 /* The more elaborate format is disabled for now because of the prctl
381 * hanging bug. */
382 do {
383 last = p1;
384 lastNode = node;
385 /* This loop collates adjacent pages with the same host node. */
386 do {
387 (char *)p1 += page_size;
388 } while (p1 <= p2 && (node = __kmp_get_host_node(p1)) == lastNode);
389 __kmp_printf_no_lock(" %p-%p memNode %d\n", last, (char *)p1 - 1,
390 lastNode);
391 } while (p1 <= p2);
392#else
393 __kmp_printf_no_lock(" %p-%p memNode %d\n", p1,
394 (char *)p1 + (page_size - 1),
395 __kmp_get_host_node(p1));
396 if (p1 < p2) {
397 __kmp_printf_no_lock(" %p-%p memNode %d\n", p2,
398 (char *)p2 + (page_size - 1),
399 __kmp_get_host_node(p2));
400 }
401#endif
402 }
403 }
404 } else
405 __kmp_printf_no_lock(" %s\n", KMP_I18N_STR(StorageMapWarning));
406 }
407#endif /* KMP_PRINT_DATA_PLACEMENT */
408 __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
409}
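// __kmp_print_storage_map_gtid() uses a two-stage formatting pattern: the
// caller's format string is first spliced into a larger prefix format with
// KMP_SNPRINTF, and the caller's variadic arguments are then expanded against
// the combined string in a single vprintf-style call. A standalone sketch of
// the same pattern using only the standard C library (illustrative; guarded
// out of the build):
#if 0
#include <cstdarg>
#include <cstdio>

static void print_with_prefix(const void *p1, const void *p2,
                              unsigned long size, const char *format, ...) {
  char buffer[512];
  // Stage 1: embed the caller's format string after the fixed prefix fields.
  std::snprintf(buffer, sizeof(buffer), "map: %p %p %8lu %s\n", p1, p2, size,
                format);
  // Stage 2: expand the caller's variadic arguments against the new format.
  va_list ap;
  va_start(ap, format);
  std::vfprintf(stderr, buffer, ap);
  va_end(ap);
}
#endif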
410
411void __kmp_warn(char const *format, ...) {
412 char buffer[MAX_MESSAGE];
413 va_list ap;
414
415 if (__kmp_generate_warnings == kmp_warnings_off) {
416 return;
417 }
418
419 va_start(ap, format);
420
421 KMP_SNPRINTF(buffer, sizeof(buffer), "OMP warning: %s\n", format);
422 __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
423 __kmp_vprintf(kmp_err, buffer, ap);
424 __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
425
426 va_end(ap);
427}
428
429void __kmp_abort_process() {
430 // Later threads may stall here, but that's ok because abort() will kill them.
431 __kmp_acquire_bootstrap_lock(&__kmp_exit_lock);
432
433 if (__kmp_debug_buf) {
434 __kmp_dump_debug_buffer();
435 }
436
437 if (KMP_OS_WINDOWS) {
438 // Let other threads know of abnormal termination and prevent deadlock
439 // if abort happened during library initialization or shutdown
440 __kmp_global.g.g_abort = SIGABRT;
441
442 /* On Windows* OS, abort() by default causes a pop-up error box, which
443 stalls nightly testing. Unfortunately, we cannot reliably suppress the
444 pop-up error boxes. _set_abort_behavior() works well, but this function
445 is not available in VS7 (this is not a problem for the DLL, but it is a
446 problem for the static OpenMP RTL). SetErrorMode (and so the timelimit
447 utility) does not help, at least in some versions of the MS C RTL.
448
449 It seems the following sequence is the only way to simulate abort() and
450 avoid the pop-up error box. */
451 raise(SIGABRT);
452 _exit(3); // Just in case, if signal ignored, exit anyway.
453 } else {
454 __kmp_unregister_library();
455 abort();
456 }
457
458 __kmp_infinite_loop();
459 __kmp_release_bootstrap_lock(&__kmp_exit_lock);
460
461} // __kmp_abort_process
462
463void __kmp_abort_thread(void) {
464 // TODO: Eliminate g_abort global variable and this function.
465 // In case of abort just call abort(), it will kill all the threads.
466 __kmp_infinite_loop();
467} // __kmp_abort_thread
468
469/* Print out the storage map for the major kmp_info_t thread data structures
470 that are allocated together. */
471
472static void __kmp_print_thread_storage_map(kmp_info_t *thr, int gtid) {
473 __kmp_print_storage_map_gtid(gtid, thr, thr + 1, sizeof(kmp_info_t), "th_%d",
474 gtid);
475
476 __kmp_print_storage_map_gtid(gtid, &thr->th.th_info, &thr->th.th_team,
477 sizeof(kmp_desc_t), "th_%d.th_info", gtid);
478
479 __kmp_print_storage_map_gtid(gtid, &thr->th.th_local, &thr->th.th_pri_head,
480 sizeof(kmp_local_t), "th_%d.th_local", gtid);
481
482 __kmp_print_storage_map_gtid(
483 gtid, &thr->th.th_bar[0], &thr->th.th_bar[bs_last_barrier],
484 sizeof(kmp_balign_t) * bs_last_barrier, "th_%d.th_bar", gtid);
485
486 __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_plain_barrier],
487 &thr->th.th_bar[bs_plain_barrier + 1],
488 sizeof(kmp_balign_t), "th_%d.th_bar[plain]",
489 gtid);
490
491 __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_forkjoin_barrier],
492 &thr->th.th_bar[bs_forkjoin_barrier + 1],
493 sizeof(kmp_balign_t), "th_%d.th_bar[forkjoin]",
494 gtid);
495
496#if KMP_FAST_REDUCTION_BARRIER
497 __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_reduction_barrier],
498 &thr->th.th_bar[bs_reduction_barrier + 1],
499 sizeof(kmp_balign_t), "th_%d.th_bar[reduction]",
500 gtid);
501#endif // KMP_FAST_REDUCTION_BARRIER
502}
503
504/* Print out the storage map for the major kmp_team_t team data structures
505 that are allocated together. */
506
507static void __kmp_print_team_storage_map(const char *header, kmp_team_t *team,
508 int team_id, int num_thr) {
509 int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
510 __kmp_print_storage_map_gtid(-1, team, team + 1, sizeof(kmp_team_t), "%s_%d",
511 header, team_id);
512
513 __kmp_print_storage_map_gtid(-1, &team->t.t_bar[0],
514 &team->t.t_bar[bs_last_barrier],
515 sizeof(kmp_balign_team_t) * bs_last_barrier,
516 "%s_%d.t_bar", header, team_id);
517
518 __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_plain_barrier],
519 &team->t.t_bar[bs_plain_barrier + 1],
520 sizeof(kmp_balign_team_t), "%s_%d.t_bar[plain]",
521 header, team_id);
522
523 __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_forkjoin_barrier],
524 &team->t.t_bar[bs_forkjoin_barrier + 1],
525 sizeof(kmp_balign_team_t),
526 "%s_%d.t_bar[forkjoin]", header, team_id);
527
528#if KMP_FAST_REDUCTION_BARRIER
529 __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_reduction_barrier],
530 &team->t.t_bar[bs_reduction_barrier + 1],
531 sizeof(kmp_balign_team_t),
532 "%s_%d.t_bar[reduction]", header, team_id);
533#endif // KMP_FAST_REDUCTION_BARRIER
534
535 __kmp_print_storage_map_gtid(
536 -1, &team->t.t_dispatch[0], &team->t.t_dispatch[num_thr],
537 sizeof(kmp_disp_t) * num_thr, "%s_%d.t_dispatch", header, team_id);
538
539 __kmp_print_storage_map_gtid(
540 -1, &team->t.t_threads[0], &team->t.t_threads[num_thr],
541 sizeof(kmp_info_t *) * num_thr, "%s_%d.t_threads", header, team_id);
542
543 __kmp_print_storage_map_gtid(-1, &team->t.t_disp_buffer[0],
544 &team->t.t_disp_buffer[num_disp_buff],
545 sizeof(dispatch_shared_info_t) * num_disp_buff,
546 "%s_%d.t_disp_buffer", header, team_id);
547}
548
549static void __kmp_init_allocator() {
550 __kmp_init_memkind();
551 __kmp_init_target_mem();
552}
553static void __kmp_fini_allocator() { __kmp_fini_memkind(); }
554
555/* ------------------------------------------------------------------------ */
556
557#if KMP_DYNAMIC_LIB
558#if KMP_OS_WINDOWS
559
560BOOL WINAPI DllMain(HINSTANCE hInstDLL, DWORD fdwReason, LPVOID lpReserved) {
561 //__kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
562
563 switch (fdwReason) {
564
565 case DLL_PROCESS_ATTACH:
566 KA_TRACE(10, ("DllMain: PROCESS_ATTACH\n"));
567
568 return TRUE;
569
570 case DLL_PROCESS_DETACH:
571 KA_TRACE(10, ("DllMain: PROCESS_DETACH T#%d\n", __kmp_gtid_get_specific()));
572
573 // According to Windows* documentation for DllMain entry point:
574 // for DLL_PROCESS_DETACH, lpReserved is used for telling the difference:
575 // lpReserved == NULL when FreeLibrary() is called,
576 // lpReserved != NULL when the process is terminated.
577 // When FreeLibrary() is called, worker threads remain alive. So the
578 // runtime's state is consistent and executing proper shutdown is OK.
579 // When the process is terminated, worker threads have exited or been
580 // forcefully terminated by the OS and only the shutdown thread remains.
581 // This can leave the runtime in an inconsistent state.
582 // Hence, only attempt proper cleanup when FreeLibrary() is called.
583 // Otherwise, rely on OS to reclaim resources.
584 if (lpReserved == NULL)
585 __kmp_internal_end_library(__kmp_gtid_get_specific());
586
587 return TRUE;
588
589 case DLL_THREAD_ATTACH:
590 KA_TRACE(10, ("DllMain: THREAD_ATTACH\n"));
591
592 /* if we wanted to register new sibling threads every time, we would call
593 * __kmp_get_gtid() here */
594 return TRUE;
595
596 case DLL_THREAD_DETACH:
597 KA_TRACE(10, ("DllMain: THREAD_DETACH T#%d\n", __kmp_gtid_get_specific()));
598
599 __kmp_internal_end_thread(__kmp_gtid_get_specific());
600 return TRUE;
601 }
602
603 return TRUE;
604}
605
606#endif /* KMP_OS_WINDOWS */
607#endif /* KMP_DYNAMIC_LIB */
608
609/* __kmp_parallel_deo -- Wait until it's our turn. */
610void __kmp_parallel_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
611 int gtid = *gtid_ref;
612#ifdef BUILD_PARALLEL_ORDERED
613 kmp_team_t *team = __kmp_team_from_gtid(gtid);
614#endif /* BUILD_PARALLEL_ORDERED */
615
616 if (__kmp_env_consistency_check) {
617 if (__kmp_threads[gtid]->th.th_root->r.r_active)
618#if KMP_USE_DYNAMIC_LOCK
619 __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL, 0);
620#else
621 __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL);
622#endif
623 }
624#ifdef BUILD_PARALLEL_ORDERED
625 if (!team->t.t_serialized) {
626 KMP_MB();
627 KMP_WAIT(&team->t.t_ordered.dt.t_value, __kmp_tid_from_gtid(gtid), KMP_EQ,
628 NULL);
629 KMP_MB();
630 }
631#endif /* BUILD_PARALLEL_ORDERED */
632}
633
634/* __kmp_parallel_dxo -- Signal the next task. */
635void __kmp_parallel_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
636 int gtid = *gtid_ref;
637#ifdef BUILD_PARALLEL_ORDERED
638 int tid = __kmp_tid_from_gtid(gtid);
639 kmp_team_t *team = __kmp_team_from_gtid(gtid);
640#endif /* BUILD_PARALLEL_ORDERED */
641
642 if (__kmp_env_consistency_check) {
643 if (__kmp_threads[gtid]->th.th_root->r.r_active)
644 __kmp_pop_sync(gtid, ct_ordered_in_parallel, loc_ref);
645 }
646#ifdef BUILD_PARALLEL_ORDERED
647 if (!team->t.t_serialized) {
648 KMP_MB(); /* Flush all pending memory write invalidates. */
649
650 /* use the tid of the next thread in this team */
651 /* TODO replace with general release procedure */
652 team->t.t_ordered.dt.t_value = ((tid + 1) % team->t.t_nproc);
653
654 KMP_MB(); /* Flush all pending memory write invalidates. */
655 }
656#endif /* BUILD_PARALLEL_ORDERED */
657}
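// __kmp_parallel_deo/__kmp_parallel_dxo implement the turn-taking used for
// the OpenMP 'ordered' construct: a thread waits until t_ordered.dt.t_value
// equals its own tid, then hands the token to (tid + 1) % nproc. A minimal
// user program that exercises this protocol (guarded out of the build;
// assumes an OpenMP-enabled compiler):
#if 0
#include <cstdio>
#include <omp.h>

int main() {
  // Iterations print in loop order even though they run on different threads;
  // the runtime enforces the handoff shown in the two functions above.
#pragma omp parallel for ordered schedule(static, 1)
  for (int i = 0; i < 8; ++i) {
#pragma omp ordered
    std::printf("iteration %d on thread %d\n", i, omp_get_thread_num());
  }
  return 0;
}
#endif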
658
659/* ------------------------------------------------------------------------ */
660/* The BARRIER for a SINGLE process section is always explicit */
661
662int __kmp_enter_single(int gtid, ident_t *id_ref, int push_ws) {
663 int status;
664 kmp_info_t *th;
665 kmp_team_t *team;
666
667 if (!TCR_4(__kmp_init_parallel))
668 __kmp_parallel_initialize();
669 __kmp_resume_if_soft_paused();
670
671 th = __kmp_threads[gtid];
672 team = th->th.th_team;
673 status = 0;
674
675 th->th.th_ident = id_ref;
676
677 if (team->t.t_serialized) {
678 status = 1;
679 } else {
680 kmp_int32 old_this = th->th.th_local.this_construct;
681
682 ++th->th.th_local.this_construct;
683 /* try to set team count to thread count--success means thread got the
684 single block */
685 /* TODO: Should this be acquire or release? */
686 if (team->t.t_construct == old_this) {
687 status = __kmp_atomic_compare_store_acq(&team->t.t_construct, old_this,
688 th->th.th_local.this_construct);
689 }
690#if USE_ITT_BUILD
691 if (__itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
692 KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
693 team->t.t_active_level == 1) {
694 // Only report metadata by primary thread of active team at level 1
695 __kmp_itt_metadata_single(id_ref);
696 }
697#endif /* USE_ITT_BUILD */
698 }
699
700 if (__kmp_env_consistency_check) {
701 if (status && push_ws) {
702 __kmp_push_workshare(gtid, ct_psingle, id_ref);
703 } else {
704 __kmp_check_workshare(gtid, ct_psingle, id_ref);
705 }
706 }
707#if USE_ITT_BUILD
708 if (status) {
709 __kmp_itt_single_start(gtid);
710 }
711#endif /* USE_ITT_BUILD */
712 return status;
713}
714
715void __kmp_exit_single(int gtid) {
716#if USE_ITT_BUILD
717 __kmp_itt_single_end(gtid);
718#endif /* USE_ITT_BUILD */
719 if (__kmp_env_consistency_check)
720 __kmp_pop_workshare(gtid, ct_psingle, NULL);
721}
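// __kmp_enter_single() returns nonzero for exactly one thread of the team
// (the winner of the compare-and-swap on t_construct); that thread executes
// the body of an OpenMP 'single' region. A minimal user-level counterpart
// (guarded out of the build; assumes an OpenMP-enabled compiler):
#if 0
#include <cstdio>
#include <omp.h>

int main() {
#pragma omp parallel
  {
    // Exactly one thread of the team executes this block; the others skip it
    // and wait at the implicit barrier at the end of the single region.
#pragma omp single
    std::printf("single executed by thread %d\n", omp_get_thread_num());
  }
  return 0;
}
#endif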
722
723/* Determine if we can go parallel or must use a serialized parallel region and
724 * how many threads we can use.
725 * set_nthreads is the number of threads requested for the team.
726 * Returns 1 if we should serialize or only use one thread,
727 * otherwise the number of threads to use.
728 * The forkjoin lock is held by the caller. */
729static int __kmp_reserve_threads(kmp_root_t *root, kmp_team_t *parent_team,
730 int master_tid, int set_nthreads,
731 int enter_teams) {
732 int capacity;
733 int new_nthreads;
734 KMP_DEBUG_ASSERT(__kmp_init_serial);
735 KMP_DEBUG_ASSERT(root && parent_team);
736 kmp_info_t *this_thr = parent_team->t.t_threads[master_tid];
737
738 // If dyn-var is set, dynamically adjust the number of desired threads,
739 // according to the method specified by dynamic_mode.
740 new_nthreads = set_nthreads;
741 if (!get__dynamic_2(parent_team, master_tid)) {
742 ;
743 }
744#ifdef USE_LOAD_BALANCE
745 else if (__kmp_global.g.g_dynamic_mode == dynamic_load_balance) {
746 new_nthreads = __kmp_load_balance_nproc(root, set_nthreads);
747 if (new_nthreads == 1) {
748 KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
749 "reservation to 1 thread\n",
750 master_tid));
751 return 1;
752 }
753 if (new_nthreads < set_nthreads) {
754 KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
755 "reservation to %d threads\n",
756 master_tid, new_nthreads));
757 }
758 }
759#endif /* USE_LOAD_BALANCE */
760 else if (__kmp_global.g.g_dynamic_mode == dynamic_thread_limit) {
761 new_nthreads = __kmp_avail_proc - __kmp_nth +
762 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
763 if (new_nthreads <= 1) {
764 KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
765 "reservation to 1 thread\n",
766 master_tid));
767 return 1;
768 }
769 if (new_nthreads < set_nthreads) {
770 KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
771 "reservation to %d threads\n",
772 master_tid, new_nthreads));
773 } else {
774 new_nthreads = set_nthreads;
775 }
776 } else if (__kmp_global.g.g_dynamic_mode == dynamic_random) {
777 if (set_nthreads > 2) {
778 new_nthreads = __kmp_get_random(parent_team->t.t_threads[master_tid]);
779 new_nthreads = (new_nthreads % set_nthreads) + 1;
780 if (new_nthreads == 1) {
781 KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
782 "reservation to 1 thread\n",
783 master_tid));
784 return 1;
785 }
786 if (new_nthreads < set_nthreads) {
787 KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
788 "reservation to %d threads\n",
789 master_tid, new_nthreads));
790 }
791 }
792 } else {
793 KMP_ASSERT(0);
794 }
795
796 // Respect KMP_ALL_THREADS/KMP_DEVICE_THREAD_LIMIT.
797 if (__kmp_nth + new_nthreads -
798 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
799 __kmp_max_nth) {
800 int tl_nthreads = __kmp_max_nth - __kmp_nth +
801 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
802 if (tl_nthreads <= 0) {
803 tl_nthreads = 1;
804 }
805
806 // If dyn-var is false, emit a 1-time warning.
807 if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
808 __kmp_reserve_warn = 1;
809 __kmp_msg(kmp_ms_warning,
810 KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
811 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
812 }
813 if (tl_nthreads == 1) {
814 KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT "
815 "reduced reservation to 1 thread\n",
816 master_tid));
817 return 1;
818 }
819 KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT reduced "
820 "reservation to %d threads\n",
821 master_tid, tl_nthreads));
822 new_nthreads = tl_nthreads;
823 }
824
825 // Respect OMP_THREAD_LIMIT
826 int cg_nthreads = this_thr->th.th_cg_roots->cg_nthreads;
827 int max_cg_threads = this_thr->th.th_cg_roots->cg_thread_limit;
828 if (cg_nthreads + new_nthreads -
829 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
830 max_cg_threads) {
831 int tl_nthreads = max_cg_threads - cg_nthreads +
832 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
833 if (tl_nthreads <= 0) {
834 tl_nthreads = 1;
835 }
836
837 // If dyn-var is false, emit a 1-time warning.
838 if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
839 __kmp_reserve_warn = 1;
840 __kmp_msg(kmp_ms_warning,
841 KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
842 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
843 }
844 if (tl_nthreads == 1) {
845 KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT "
846 "reduced reservation to 1 thread\n",
847 master_tid));
848 return 1;
849 }
850 KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT reduced "
851 "reservation to %d threads\n",
852 master_tid, tl_nthreads));
853 new_nthreads = tl_nthreads;
854 }
855
856 // Check if the threads array is large enough, or needs expanding.
857 // See comment in __kmp_register_root() about the adjustment if
858 // __kmp_threads[0] == NULL.
859 capacity = __kmp_threads_capacity;
860 if (TCR_PTR(__kmp_threads[0]) == NULL) {
861 --capacity;
862 }
863 // If it is not for initializing the hidden helper team, we need to take
864 // __kmp_hidden_helper_threads_num out of the capacity because it is included
865 // in __kmp_threads_capacity.
866 if (__kmp_enable_hidden_helper && !TCR_4(__kmp_init_hidden_helper_threads)) {
867 capacity -= __kmp_hidden_helper_threads_num;
868 }
869 if (__kmp_nth + new_nthreads -
870 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
871 capacity) {
872 // Expand the threads array.
873 int slotsRequired = __kmp_nth + new_nthreads -
874 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) -
875 capacity;
876 int slotsAdded = __kmp_expand_threads(slotsRequired);
877 if (slotsAdded < slotsRequired) {
878 // The threads array was not expanded enough.
879 new_nthreads -= (slotsRequired - slotsAdded);
880 KMP_ASSERT(new_nthreads >= 1);
881
882 // If dyn-var is false, emit a 1-time warning.
883 if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
884 __kmp_reserve_warn = 1;
885 if (__kmp_tp_cached) {
886 __kmp_msg(kmp_ms_warning,
887 KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
888 KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
889 KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
890 } else {
891 __kmp_msg(kmp_ms_warning,
892 KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
893 KMP_HNT(SystemLimitOnThreads), __kmp_msg_null);
894 }
895 }
896 }
897 }
898
899#ifdef KMP_DEBUG
900 if (new_nthreads == 1) {
901 KC_TRACE(10,
902 ("__kmp_reserve_threads: T#%d serializing team after reclaiming "
903 "dead roots and rechecking; requested %d threads\n",
904 __kmp_get_gtid(), set_nthreads));
905 } else {
906 KC_TRACE(10, ("__kmp_reserve_threads: T#%d allocating %d threads; requested"
907 " %d threads\n",
908 __kmp_get_gtid(), new_nthreads, set_nthreads));
909 }
910#endif // KMP_DEBUG
911 return new_nthreads;
912}
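// Each limit applied above clamps the request against a value of the form
// "limit - threads currently in use + threads this root can reuse from the
// hot team". A sketch of that clamp in isolation (illustrative names; the
// real code also handles warnings and thread-array expansion; guarded out of
// the build):
#if 0
static int clamp_request(int requested, int limit, int in_use,
                         int reusable_from_hot_team) {
  int available = limit - in_use + reusable_from_hot_team;
  if (available <= 1)
    return 1; // serialize: only the requesting thread runs
  return requested < available ? requested : available;
}
#endif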
913
914/* Allocate threads from the thread pool and assign them to the new team. We are
915 assured that there are enough threads available, because we checked that
916 earlier while holding the forkjoin lock. */
917static void __kmp_fork_team_threads(kmp_root_t *root, kmp_team_t *team,
918 kmp_info_t *master_th, int master_gtid,
919 int fork_teams_workers) {
920 int i;
921 int use_hot_team;
922
923 KA_TRACE(10, ("__kmp_fork_team_threads: new_nprocs = %d\n", team->t.t_nproc));
924 KMP_DEBUG_ASSERT(master_gtid == __kmp_get_gtid());
925 KMP_MB();
926
927 /* first, let's setup the primary thread */
928 master_th->th.th_info.ds.ds_tid = 0;
929 master_th->th.th_team = team;
930 master_th->th.th_team_nproc = team->t.t_nproc;
931 master_th->th.th_team_master = master_th;
932 master_th->th.th_team_serialized = FALSE;
933 master_th->th.th_dispatch = &team->t.t_dispatch[0];
934
935/* make sure we are not the optimized hot team */
936#if KMP_NESTED_HOT_TEAMS
937 use_hot_team = 0;
938 kmp_hot_team_ptr_t *hot_teams = master_th->th.th_hot_teams;
939 if (hot_teams) { // hot teams array is not allocated if
940 // KMP_HOT_TEAMS_MAX_LEVEL=0
941 int level = team->t.t_active_level - 1; // index in array of hot teams
942 if (master_th->th.th_teams_microtask) { // are we inside the teams?
943 if (master_th->th.th_teams_size.nteams > 1) {
944 ++level; // level was not increased in teams construct for
945 // team_of_masters
946 }
947 if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
948 master_th->th.th_teams_level == team->t.t_level) {
949 ++level; // level was not increased in teams construct for
950 // team_of_workers before the parallel
951 } // team->t.t_level will be increased inside parallel
952 }
953 if (level < __kmp_hot_teams_max_level) {
954 if (hot_teams[level].hot_team) {
955 // hot team has already been allocated for given level
956 KMP_DEBUG_ASSERT(hot_teams[level].hot_team == team);
957 use_hot_team = 1; // the team is ready to use
958 } else {
959 use_hot_team = 0; // AC: threads are not allocated yet
960 hot_teams[level].hot_team = team; // remember new hot team
961 hot_teams[level].hot_team_nth = team->t.t_nproc;
962 }
963 } else {
964 use_hot_team = 0;
965 }
966 }
967#else
968 use_hot_team = team == root->r.r_hot_team;
969#endif
970 if (!use_hot_team) {
971
972 /* install the primary thread */
973 team->t.t_threads[0] = master_th;
974 __kmp_initialize_info(master_th, team, 0, master_gtid);
975
976 /* now, install the worker threads */
977 for (i = 1; i < team->t.t_nproc; i++) {
978
979 /* fork or reallocate a new thread and install it in team */
980 kmp_info_t *thr = __kmp_allocate_thread(root, team, i);
981 team->t.t_threads[i] = thr;
982 KMP_DEBUG_ASSERT(thr);
983 KMP_DEBUG_ASSERT(thr->th.th_team == team);
984 /* align team and thread arrived states */
985 KA_TRACE(20, ("__kmp_fork_team_threads: T#%d(%d:%d) init arrived "
986 "T#%d(%d:%d) join =%llu, plain=%llu\n",
987 __kmp_gtid_from_tid(0, team), team->t.t_id, 0,
988 __kmp_gtid_from_tid(i, team), team->t.t_id, i,
989 team->t.t_bar[bs_forkjoin_barrier].b_arrived,
990 team->t.t_bar[bs_plain_barrier].b_arrived));
991 thr->th.th_teams_microtask = master_th->th.th_teams_microtask;
992 thr->th.th_teams_level = master_th->th.th_teams_level;
993 thr->th.th_teams_size = master_th->th.th_teams_size;
994 { // Initialize threads' barrier data.
995 int b;
996 kmp_balign_t *balign = team->t.t_threads[i]->th.th_bar;
997 for (b = 0; b < bs_last_barrier; ++b) {
998 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
999 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
1000#if USE_DEBUGGER
1001 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
1002#endif
1003 }
1004 }
1005 }
1006
1007#if KMP_AFFINITY_SUPPORTED
1008 // Do not partition the places list for teams construct workers who
1009 // haven't actually been forked to do real work yet. This partitioning
1010 // will take place in the parallel region nested within the teams construct.
1011 if (!fork_teams_workers) {
1012 __kmp_partition_places(team);
1013 }
1014#endif
1015
1016 if (team->t.t_nproc > 1 &&
1017 __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
1018 team->t.b->update_num_threads(team->t.t_nproc);
1019 __kmp_add_threads_to_team(team, team->t.t_nproc);
1020 }
1021 }
1022
1023 if (__kmp_display_affinity && team->t.t_display_affinity != 1) {
1024 for (i = 0; i < team->t.t_nproc; i++) {
1025 kmp_info_t *thr = team->t.t_threads[i];
1026 if (thr->th.th_prev_num_threads != team->t.t_nproc ||
1027 thr->th.th_prev_level != team->t.t_level) {
1028 team->t.t_display_affinity = 1;
1029 break;
1030 }
1031 }
1032 }
1033
1034 KMP_MB();
1035}
1036
1037#if KMP_ARCH_X86 || KMP_ARCH_X86_64
1038// Propagate any changes to the floating point control registers out to the team.
1039// We try to avoid unnecessary writes to the relevant cache line in the team
1040// structure, so we don't make changes unless they are needed.
1041inline static void propagateFPControl(kmp_team_t *team) {
1042 if (__kmp_inherit_fp_control) {
1043 kmp_int16 x87_fpu_control_word;
1044 kmp_uint32 mxcsr;
1045
1046 // Get primary thread's values of FPU control flags (both X87 and vector)
1047 __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1048 __kmp_store_mxcsr(&mxcsr);
1049 mxcsr &= KMP_X86_MXCSR_MASK;
1050
1051 // There is no point looking at t_fp_control_saved here.
1052 // If it is TRUE, we still have to update the values if they are different
1053 // from those we now have. If it is FALSE we didn't save anything yet, but
1054 // our objective is the same. We have to ensure that the values in the team
1055 // are the same as those we have.
1056 // So, this code achieves what we need whether or not t_fp_control_saved is
1057 // true. By checking whether the value needs updating we avoid unnecessary
1058 // writes that would put the cache-line into a written state, causing all
1059 // threads in the team to have to read it again.
1060 KMP_CHECK_UPDATE(team->t.t_x87_fpu_control_word, x87_fpu_control_word);
1061 KMP_CHECK_UPDATE(team->t.t_mxcsr, mxcsr);
1062 // Although we don't use this value, other code in the runtime wants to know
1063 // whether it should restore them. So we must ensure it is correct.
1064 KMP_CHECK_UPDATE(team->t.t_fp_control_saved, TRUE);
1065 } else {
1066 // Similarly here. Don't write to this cache-line in the team structure
1067 // unless we have to.
1068 KMP_CHECK_UPDATE(team->t.t_fp_control_saved, FALSE);
1069 }
1070}
1071
1072// Do the opposite, setting the hardware registers to the updated values from
1073// the team.
1074inline static void updateHWFPControl(kmp_team_t *team) {
1075 if (__kmp_inherit_fp_control && team->t.t_fp_control_saved) {
1076 // Only reset the fp control regs if they have been changed in the team
1077 // during the parallel region that we are exiting.
1078 kmp_int16 x87_fpu_control_word;
1079 kmp_uint32 mxcsr;
1080 __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1081 __kmp_store_mxcsr(&mxcsr);
1082 mxcsr &= KMP_X86_MXCSR_MASK;
1083
1084 if (team->t.t_x87_fpu_control_word != x87_fpu_control_word) {
1085 __kmp_clear_x87_fpu_status_word();
1086 __kmp_load_x87_fpu_control_word(&team->t.t_x87_fpu_control_word);
1087 }
1088
1089 if (team->t.t_mxcsr != mxcsr) {
1090 __kmp_load_mxcsr(&team->t.t_mxcsr);
1091 }
1092 }
1093}
1094#else
1095#define propagateFPControl(x) ((void)0)
1096#define updateHWFPControl(x) ((void)0)
1097#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
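// propagateFPControl() avoids dirtying the shared team cache line by storing
// the x87/MXCSR values only when they actually differ from what the team
// already holds; that is the idea behind KMP_CHECK_UPDATE. A generic sketch
// of the same check-before-write idiom (not the runtime's macro; guarded out
// of the build):
#if 0
// Skip the store when the value is already correct, so the cache line is not
// pulled into modified state on every fork.
template <typename T> static inline void check_update(T &dst, const T &src) {
  if (dst != src)
    dst = src;
}
#endif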
1098
1099static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team,
1100 int realloc); // forward declaration
1101
1102/* Run a parallel region that has been serialized, so it runs only in a team of
1103 the single primary thread. */
1104void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
1105 kmp_info_t *this_thr;
1106 kmp_team_t *serial_team;
1107
1108 KC_TRACE(10, ("__kmpc_serialized_parallel: called by T#%d\n", global_tid));
1109
1110 /* Skip all this code for autopar serialized loops since it results in
1111 unacceptable overhead */
1112 if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR))
1113 return;
1114
1115 if (!TCR_4(__kmp_init_parallel))
1116 __kmp_parallel_initialize();
1117 __kmp_resume_if_soft_paused();
1118
1119 this_thr = __kmp_threads[global_tid];
1120 serial_team = this_thr->th.th_serial_team;
1121
1122 /* utilize the serialized team held by this thread */
1123 KMP_DEBUG_ASSERT(serial_team);
1124 KMP_MB();
1125
1126 if (__kmp_tasking_mode != tskm_immediate_exec) {
1127 KMP_DEBUG_ASSERT(
1128 this_thr->th.th_task_team ==
1129 this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state]);
1130 KMP_DEBUG_ASSERT(serial_team->t.t_task_team[this_thr->th.th_task_state] ==
1131 NULL);
1132 KA_TRACE(20, ("__kmpc_serialized_parallel: T#%d pushing task_team %p / "
1133 "team %p, new task_team = NULL\n",
1134 global_tid, this_thr->th.th_task_team, this_thr->th.th_team));
1135 this_thr->th.th_task_team = NULL;
1136 }
1137
1138 kmp_proc_bind_t proc_bind = this_thr->th.th_set_proc_bind;
1139 if (this_thr->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1140 proc_bind = proc_bind_false;
1141 } else if (proc_bind == proc_bind_default) {
1142 // No proc_bind clause was specified, so use the current value
1143 // of proc-bind-var for this parallel region.
1144 proc_bind = this_thr->th.th_current_task->td_icvs.proc_bind;
1145 }
1146 // Reset for next parallel region
1147 this_thr->th.th_set_proc_bind = proc_bind_default;
1148
1149#if OMPT_SUPPORT
1150 ompt_data_t ompt_parallel_data = ompt_data_none;
1151 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid);
1152 if (ompt_enabled.enabled &&
1153 this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
1154
1155 ompt_task_info_t *parent_task_info;
1156 parent_task_info = OMPT_CUR_TASK_INFO(this_thr);
1157
1158 parent_task_info->frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1159 if (ompt_enabled.ompt_callback_parallel_begin) {
1160 int team_size = 1;
1161
1162 ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
1163 &(parent_task_info->task_data), &(parent_task_info->frame),
1164 &ompt_parallel_data, team_size,
1165 ompt_parallel_invoker_program | ompt_parallel_team, codeptr);
1166 }
1167 }
1168#endif // OMPT_SUPPORT
1169
1170 if (this_thr->th.th_team != serial_team) {
1171 // Nested level will be an index in the nested nthreads array
1172 int level = this_thr->th.th_team->t.t_level;
1173
1174 if (serial_team->t.t_serialized) {
1175 /* this serial team was already used
1176 TODO: increase performance by making these locks more specific */
1177 kmp_team_t *new_team;
1178
1179 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
1180
1181 new_team =
1182 __kmp_allocate_team(this_thr->th.th_root, 1, 1,
1183#if OMPT_SUPPORT
1184 ompt_parallel_data,
1185#endif
1186 proc_bind, &this_thr->th.th_current_task->td_icvs,
1187 0 USE_NESTED_HOT_ARG(NULL));
1188 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
1189 KMP_ASSERT(new_team);
1190
1191 /* setup new serialized team and install it */
1192 new_team->t.t_threads[0] = this_thr;
1193 new_team->t.t_parent = this_thr->th.th_team;
1194 serial_team = new_team;
1195 this_thr->th.th_serial_team = serial_team;
1196
1197 KF_TRACE(
1198 10,
1199 ("__kmpc_serialized_parallel: T#%d allocated new serial team %p\n",
1200 global_tid, serial_team));
1201
1202 /* TODO the above breaks the requirement that if we run out of resources,
1203 then we can still guarantee that serialized teams are ok, since we may
1204 need to allocate a new one */
1205 } else {
1206 KF_TRACE(
1207 10,
1208 ("__kmpc_serialized_parallel: T#%d reusing cached serial team %p\n",
1209 global_tid, serial_team));
1210 }
1211
1212 /* we have to initialize this serial team */
1213 KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1214 KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1215 KMP_DEBUG_ASSERT(this_thr->th.th_team != serial_team);
1216 serial_team->t.t_ident = loc;
1217 serial_team->t.t_serialized = 1;
1218 serial_team->t.t_nproc = 1;
1219 serial_team->t.t_parent = this_thr->th.th_team;
1220 serial_team->t.t_sched.sched = this_thr->th.th_team->t.t_sched.sched;
1221 this_thr->th.th_team = serial_team;
1222 serial_team->t.t_master_tid = this_thr->th.th_info.ds.ds_tid;
1223
1224 KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d curtask=%p\n", global_tid,
1225 this_thr->th.th_current_task));
1226 KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 1);
1227 this_thr->th.th_current_task->td_flags.executing = 0;
1228
1229 __kmp_push_current_task_to_thread(this_thr, serial_team, 0);
1230
1231 /* TODO: GEH: do ICVs work for nested serialized teams? Don't we need an
1232 implicit task for each serialized task represented by
1233 team->t.t_serialized? */
1234 copy_icvs(&this_thr->th.th_current_task->td_icvs,
1235 &this_thr->th.th_current_task->td_parent->td_icvs);
1236
1237 // Thread value exists in the nested nthreads array for the next nested
1238 // level
1239 if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
1240 this_thr->th.th_current_task->td_icvs.nproc =
1241 __kmp_nested_nth.nth[level + 1];
1242 }
1243
1244 if (__kmp_nested_proc_bind.used &&
1245 (level + 1 < __kmp_nested_proc_bind.used)) {
1246 this_thr->th.th_current_task->td_icvs.proc_bind =
1247 __kmp_nested_proc_bind.bind_types[level + 1];
1248 }
1249
1250#if USE_DEBUGGER
1251 serial_team->t.t_pkfn = (microtask_t)(~0); // For the debugger.
1252#endif
1253 this_thr->th.th_info.ds.ds_tid = 0;
1254
1255 /* set thread cache values */
1256 this_thr->th.th_team_nproc = 1;
1257 this_thr->th.th_team_master = this_thr;
1258 this_thr->th.th_team_serialized = 1;
1259
1260 serial_team->t.t_level = serial_team->t.t_parent->t.t_level + 1;
1261 serial_team->t.t_active_level = serial_team->t.t_parent->t.t_active_level;
1262 serial_team->t.t_def_allocator = this_thr->th.th_def_allocator; // save
1263
1264 propagateFPControl(serial_team);
1265
1266 /* check if we need to allocate dispatch buffers stack */
1267 KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1268 if (!serial_team->t.t_dispatch->th_disp_buffer) {
1269 serial_team->t.t_dispatch->th_disp_buffer =
1270 (dispatch_private_info_t *)__kmp_allocate(
1271 sizeof(dispatch_private_info_t));
1272 }
1273 this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1274
1275 KMP_MB();
1276
1277 } else {
1278 /* this serialized team is already being used,
1279 * that's fine, just add another nested level */
1280 KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team);
1281 KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1282 KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1283 ++serial_team->t.t_serialized;
1284 this_thr->th.th_team_serialized = serial_team->t.t_serialized;
1285
1286 // Nested level will be an index in the nested nthreads array
1287 int level = this_thr->th.th_team->t.t_level;
1288 // Thread value exists in the nested nthreads array for the next nested
1289 // level
1290 if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
1291 this_thr->th.th_current_task->td_icvs.nproc =
1292 __kmp_nested_nth.nth[level + 1];
1293 }
1294 serial_team->t.t_level++;
1295 KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d increasing nesting level "
1296 "of serial team %p to %d\n",
1297 global_tid, serial_team, serial_team->t.t_level));
1298
1299 /* allocate/push dispatch buffers stack */
1300 KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1301 {
1302 dispatch_private_info_t *disp_buffer =
1303 (dispatch_private_info_t *)__kmp_allocate(
1304 sizeof(dispatch_private_info_t));
1305 disp_buffer->next = serial_team->t.t_dispatch->th_disp_buffer;
1306 serial_team->t.t_dispatch->th_disp_buffer = disp_buffer;
1307 }
1308 this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1309
1310 KMP_MB();
1311 }
1312 KMP_CHECK_UPDATE(serial_team->t.t_cancel_request, cancel_noreq);
1313
1314 // Perform the display affinity functionality for
1315 // serialized parallel regions
1316 if (__kmp_display_affinity) {
1317 if (this_thr->th.th_prev_level != serial_team->t.t_level ||
1318 this_thr->th.th_prev_num_threads != 1) {
1319 // NULL means use the affinity-format-var ICV
1320 __kmp_aux_display_affinity(global_tid, NULL);
1321 this_thr->th.th_prev_level = serial_team->t.t_level;
1322 this_thr->th.th_prev_num_threads = 1;
1323 }
1324 }
1325
1326 if (__kmp_env_consistency_check)
1327 __kmp_push_parallel(global_tid, NULL);
1328#if OMPT_SUPPORT
1329 serial_team->t.ompt_team_info.master_return_address = codeptr;
1330 if (ompt_enabled.enabled &&
1331 this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
1332 OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr =
1333 OMPT_GET_FRAME_ADDRESS(0);
1334
1335 ompt_lw_taskteam_t lw_taskteam;
1336 __ompt_lw_taskteam_init(&lw_taskteam, this_thr, global_tid,
1337 &ompt_parallel_data, codeptr);
1338
1339 __ompt_lw_taskteam_link(&lw_taskteam, this_thr, 1);
1340 // Don't use lw_taskteam after linking; its content was swapped.
1341
1342 /* OMPT implicit task begin */
1343 if (ompt_enabled.ompt_callback_implicit_task) {
1344 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1345 ompt_scope_begin, OMPT_CUR_TEAM_DATA(this_thr),
1346 OMPT_CUR_TASK_DATA(this_thr), 1, __kmp_tid_from_gtid(global_tid),
1347 ompt_task_implicit); // TODO: Can this be ompt_task_initial?
1348 OMPT_CUR_TASK_INFO(this_thr)->thread_num =
1349 __kmp_tid_from_gtid(global_tid);
1350 }
1351
1352 /* OMPT state */
1353 this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
1354 OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr =
1355 OMPT_GET_FRAME_ADDRESS(0);
1356 }
1357#endif
1358}
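// __kmp_serialized_parallel() is the path taken when a parallel region runs
// with a team of one, most commonly a nested parallel region while nesting is
// inactive. A user-level program whose inner region takes this path (guarded
// out of the build; assumes an OpenMP-enabled compiler and default settings):
#if 0
#include <cstdio>
#include <omp.h>

int main() {
  omp_set_max_active_levels(1); // inner regions become serialized teams of 1
#pragma omp parallel num_threads(4)
  {
#pragma omp parallel num_threads(4) // executes on the serialized path
    std::printf("outer thread %d, inner team size %d\n",
                omp_get_ancestor_thread_num(1), omp_get_num_threads());
  }
  return 0;
}
#endif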
1359
1360// Test if this fork is for a team closely nested in a teams construct
1361static inline bool __kmp_is_fork_in_teams(kmp_info_t *master_th,
1362 microtask_t microtask, int level,
1363 int teams_level, kmp_va_list ap) {
1364 return (master_th->th.th_teams_microtask && ap &&
1365 microtask != (microtask_t)__kmp_teams_master && level == teams_level);
1366}
1367
1368// Test if this fork is for the teams construct, i.e. to form the outer league
1369// of teams
1370static inline bool __kmp_is_entering_teams(int active_level, int level,
1371 int teams_level, kmp_va_list ap) {
1372 return ((ap == NULL && active_level == 0) ||
1373 (ap && teams_level > 0 && teams_level == level));
1374}
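// The two predicates above distinguish the fork that forms the league of
// teams from the fork for a parallel region nested directly inside a teams
// construct. A user-level shape that produces both cases (guarded out of the
// build; assumes a compiler with OpenMP 5.x host teams support):
#if 0
#include <cstdio>
#include <omp.h>

int main() {
  // The teams directive takes the "entering teams" fork; the parallel region
  // nested directly inside it takes the "fork in teams" path.
#pragma omp teams num_teams(2)
#pragma omp parallel
  std::printf("team %d, thread %d\n", omp_get_team_num(),
              omp_get_thread_num());
  return 0;
}
#endif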
1375
1376// AC: This is the start of a parallel region nested inside a teams construct.
1377// The team is actual (hot); all workers are ready at the fork barrier.
1378// No lock is needed to initialize the team a bit, then release the workers.
1379static inline int
1380__kmp_fork_in_teams(ident_t *loc, int gtid, kmp_team_t *parent_team,
1381 kmp_int32 argc, kmp_info_t *master_th, kmp_root_t *root,
1382 enum fork_context_e call_context, microtask_t microtask,
1383 launch_t invoker, int master_set_numthreads, int level,
1384#if OMPT_SUPPORT
1385 ompt_data_t ompt_parallel_data, void *return_address,
1386#endif
1387 kmp_va_list ap) {
1388 void **argv;
1389 int i;
1390
1391 parent_team->t.t_ident = loc;
1392 __kmp_alloc_argv_entries(argc, parent_team, TRUE);
1393 parent_team->t.t_argc = argc;
1394 argv = (void **)parent_team->t.t_argv;
1395 for (i = argc - 1; i >= 0; --i) {
1396 *argv++ = va_arg(kmp_va_deref(ap), void *);
1397 }
1398 // Increment our nested depth levels, but not increase the serialization
1399 if (parent_team == master_th->th.th_serial_team) {
1400 // AC: we are in serialized parallel
1401 __kmpc_serialized_parallel(loc, gtid);
1402 KMP_DEBUG_ASSERT(parent_team->t.t_serialized > 1);
1403
1404 if (call_context == fork_context_gnu) {
1405 // AC: need to decrement t_serialized for enquiry functions to work
1406 // correctly, will restore at join time
1407 parent_team->t.t_serialized--;
1408 return TRUE;
1409 }
1410
1411#if OMPD_SUPPORT
1412 parent_team->t.t_pkfn = microtask;
1413#endif
1414
1415#if OMPT_SUPPORT
1416 void *dummy;
1417 void **exit_frame_p;
1418 ompt_data_t *implicit_task_data;
1419 ompt_lw_taskteam_t lw_taskteam;
1420
1421 if (ompt_enabled.enabled) {
1422 __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1423 &ompt_parallel_data, return_address);
1424 exit_frame_p = &(lw_taskteam.ompt_task_info.frame.exit_frame.ptr);
1425
1426 __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1427 // Don't use lw_taskteam after linking. Content was swapped.
1428
1429 /* OMPT implicit task begin */
1430 implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1431 if (ompt_enabled.ompt_callback_implicit_task) {
1432 OMPT_CUR_TASK_INFO(master_th)->thread_num = __kmp_tid_from_gtid(gtid);
1433 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1434 ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th), implicit_task_data,
1435 1, OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1436 }
1437
1438 /* OMPT state */
1439 master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1440 } else {
1441 exit_frame_p = &dummy;
1442 }
1443#endif
1444
1445 // AC: need to decrement t_serialized for enquiry functions to work
1446 // correctly, will restore at join time
1447 parent_team->t.t_serialized--;
1448
1449 {
1450 KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1451 KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1452 __kmp_invoke_microtask(microtask, gtid, 0, argc, parent_team->t.t_argv
1453#if OMPT_SUPPORT
1454 ,
1455 exit_frame_p
1456#endif
1457 );
1458 }
1459
1460#if OMPT_SUPPORT
1461 if (ompt_enabled.enabled) {
1462 *exit_frame_p = NULL;
1463 OMPT_CUR_TASK_INFO(master_th)->frame.exit_frame = ompt_data_none;
1464 if (ompt_enabled.ompt_callback_implicit_task) {
1465 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1466 ompt_scope_end, NULL, implicit_task_data, 1,
1467 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1468 }
1469 ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1470 __ompt_lw_taskteam_unlink(master_th);
1471 if (ompt_enabled.ompt_callback_parallel_end) {
1472 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1473 &ompt_parallel_data, OMPT_CUR_TASK_DATA(master_th),
1474 OMPT_INVOKER(call_context) | ompt_parallel_team, return_address);
1475 }
1476 master_th->th.ompt_thread_info.state = ompt_state_overhead;
1477 }
1478#endif
1479 return TRUE;
1480 }
1481
1482 parent_team->t.t_pkfn = microtask;
1483 parent_team->t.t_invoke = invoker;
1484 KMP_ATOMIC_INC(&root->r.r_in_parallel);
1485 parent_team->t.t_active_level++;
1486 parent_team->t.t_level++;
1487 parent_team->t.t_def_allocator = master_th->th.th_def_allocator; // save
1488
1489 // If the threads allocated to the team are less than the thread limit, update
1490 // the thread limit here. th_teams_size.nth is specific to this team nested
1491 // in a teams construct, the team is fully created, and we're about to do
1492 // the actual fork. Best to do this here so that the subsequent uses below
1493 // and in the join have the correct value.
1494 master_th->th.th_teams_size.nth = parent_team->t.t_nproc;
1495
1496#if OMPT_SUPPORT
1497 if (ompt_enabled.enabled) {
1498 ompt_lw_taskteam_t lw_taskteam;
1499 __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid, &ompt_parallel_data,
1500 return_address);
1501 __ompt_lw_taskteam_link(&lw_taskteam, master_th, 1, true);
1502 }
1503#endif
1504
1505 /* Change number of threads in the team if requested */
1506 if (master_set_numthreads) { // The parallel has num_threads clause
1507 if (master_set_numthreads <= master_th->th.th_teams_size.nth) {
1508 // AC: only can reduce number of threads dynamically, can't increase
1509 kmp_info_t **other_threads = parent_team->t.t_threads;
1510 // NOTE: if using distributed barrier, we need to run this code block
1511 // even when the team size appears not to have changed from the max.
1512 int old_proc = master_th->th.th_teams_size.nth;
1513 if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
1514 __kmp_resize_dist_barrier(parent_team, old_proc, master_set_numthreads);
1515 __kmp_add_threads_to_team(parent_team, master_set_numthreads);
1516 }
1517 parent_team->t.t_nproc = master_set_numthreads;
1518 for (i = 0; i < master_set_numthreads; ++i) {
1519 other_threads[i]->th.th_team_nproc = master_set_numthreads;
1520 }
1521 }
1522 // Keep extra threads hot in the team for possible next parallels
1523 master_th->th.th_set_nproc = 0;
1524 }
1525
1526#if USE_DEBUGGER
1527 if (__kmp_debugging) { // Let debugger override number of threads.
1528 int nth = __kmp_omp_num_threads(loc);
1529 if (nth > 0) { // 0 means debugger doesn't want to change num threads
1530 master_set_numthreads = nth;
1531 }
1532 }
1533#endif
1534
1535 // Figure out the proc_bind policy for the nested parallel within teams
1536 kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind;
1537 // proc_bind_default means don't update
1538 kmp_proc_bind_t proc_bind_icv = proc_bind_default;
1539 if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1540 proc_bind = proc_bind_false;
1541 } else {
1542 // No proc_bind clause specified; use current proc-bind-var
1543 if (proc_bind == proc_bind_default) {
1544 proc_bind = master_th->th.th_current_task->td_icvs.proc_bind;
1545 }
1546 /* else: The proc_bind policy was specified explicitly on parallel clause.
1547 This overrides proc-bind-var for this parallel region, but does not
1548 change proc-bind-var. */
1549 // Figure the value of proc-bind-var for the child threads.
1550 if ((level + 1 < __kmp_nested_proc_bind.used) &&
1551 (__kmp_nested_proc_bind.bind_types[level + 1] !=
1552 master_th->th.th_current_task->td_icvs.proc_bind)) {
1553 proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1];
1554 }
1555 }
1556 KMP_CHECK_UPDATE(parent_team->t.t_proc_bind, proc_bind);
1557 // Need to change the bind-var ICV to correct value for each implicit task
1558 if (proc_bind_icv != proc_bind_default &&
1559 master_th->th.th_current_task->td_icvs.proc_bind != proc_bind_icv) {
1560 kmp_info_t **other_threads = parent_team->t.t_threads;
1561 for (i = 0; i < master_th->th.th_team_nproc; ++i) {
1562 other_threads[i]->th.th_current_task->td_icvs.proc_bind = proc_bind_icv;
1563 }
1564 }
1565 // Reset for next parallel region
1566 master_th->th.th_set_proc_bind = proc_bind_default;
1567
1568#if USE_ITT_BUILD && USE_ITT_NOTIFY
1569 if (((__itt_frame_submit_v3_ptr && __itt_get_timestamp_ptr) ||
1570 KMP_ITT_DEBUG) &&
1571 __kmp_forkjoin_frames_mode == 3 &&
1572 parent_team->t.t_active_level == 1 // only report frames at level 1
1573 && master_th->th.th_teams_size.nteams == 1) {
1574 kmp_uint64 tmp_time = __itt_get_timestamp();
1575 master_th->th.th_frame_time = tmp_time;
1576 parent_team->t.t_region_time = tmp_time;
1577 }
1578 if (__itt_stack_caller_create_ptr) {
1579 KMP_DEBUG_ASSERT(parent_team->t.t_stack_id == NULL);
1580 // create new stack stitching id before entering fork barrier
1581 parent_team->t.t_stack_id = __kmp_itt_stack_caller_create();
1582 }
1583#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
1584#if KMP_AFFINITY_SUPPORTED
1585 __kmp_partition_places(parent_team);
1586#endif
1587
1588 KF_TRACE(10, ("__kmp_fork_in_teams: before internal fork: root=%p, team=%p, "
1589 "master_th=%p, gtid=%d\n",
1590 root, parent_team, master_th, gtid));
1591 __kmp_internal_fork(loc, gtid, parent_team);
1592 KF_TRACE(10, ("__kmp_fork_in_teams: after internal fork: root=%p, team=%p, "
1593 "master_th=%p, gtid=%d\n",
1594 root, parent_team, master_th, gtid));
1595
1596 if (call_context == fork_context_gnu)
1597 return TRUE;
1598
1599 /* Invoke microtask for PRIMARY thread */
1600 KA_TRACE(20, ("__kmp_fork_in_teams: T#%d(%d:0) invoke microtask = %p\n", gtid,
1601 parent_team->t.t_id, parent_team->t.t_pkfn));
1602
1603 if (!parent_team->t.t_invoke(gtid)) {
1604 KMP_ASSERT2(0, "cannot invoke microtask for PRIMARY thread");
1605 }
1606 KA_TRACE(20, ("__kmp_fork_in_teams: T#%d(%d:0) done microtask = %p\n", gtid,
1607 parent_team->t.t_id, parent_team->t.t_pkfn));
1608 KMP_MB(); /* Flush all pending memory write invalidates. */
1609
1610 KA_TRACE(20, ("__kmp_fork_in_teams: parallel exit T#%d\n", gtid));
1611
1612 return TRUE;
1613}
1614
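// Editorial overview of the helper below: it implements the nthreads == 1 path
// of __kmp_fork_call and distinguishes three cases -- the Intel entry point
// with an explicit vararg list, the teams-master microtask (arguments copied
// from the parent team), and the GNU entry point, which returns FALSE and
// leaves invoking the outlined function to the GOMP wrapper.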
1615// Create a serialized parallel region
1616static inline int
1617__kmp_serial_fork_call(ident_t *loc, int gtid, enum fork_context_e call_context,
1618 kmp_int32 argc, microtask_t microtask, launch_t invoker,
1619 kmp_info_t *master_th, kmp_team_t *parent_team,
1620#if OMPT_SUPPORT
1621 ompt_data_t *ompt_parallel_data, void **return_address,
1622 ompt_data_t **parent_task_data,
1623#endif
1624 kmp_va_list ap) {
1625 kmp_team_t *team;
1626 int i;
1627 void **argv;
1628
1629/* josh todo: hypothetical question: what do we do for OS X*? */
1630#if KMP_OS_LINUX && \
1631 (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
1632 void *args[argc];
1633#else
1634 void **args = (void **)KMP_ALLOCA(argc * sizeof(void *));
1635#endif /* KMP_OS_LINUX && ( KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || \
1636 KMP_ARCH_AARCH64) */
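  // Editorial note: on the Linux targets listed above the argument staging
  // buffer is a C99-style variable-length array; other targets fall back to
  // KMP_ALLOCA, so the storage lives on the stack either way.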
1637
1638 KA_TRACE(
1639 20, ("__kmp_serial_fork_call: T#%d serializing parallel region\n", gtid));
1640
1641 __kmpc_serialized_parallel(loc, gtid);
1642
1643#if OMPD_SUPPORT
1644 master_th->th.th_serial_team->t.t_pkfn = microtask;
1645#endif
1646
1647 if (call_context == fork_context_intel) {
1648    /* TODO: this is clumsy; have the compiler pass the arguments directly. */
1649 master_th->th.th_serial_team->t.t_ident = loc;
1650 if (!ap) {
1651 // revert change made in __kmpc_serialized_parallel()
1652 master_th->th.th_serial_team->t.t_level--;
1653// Get args from parent team for teams construct
1654
1655#if OMPT_SUPPORT
1656 void *dummy;
1657 void **exit_frame_p;
1658 ompt_task_info_t *task_info;
1659 ompt_lw_taskteam_t lw_taskteam;
1660
1661 if (ompt_enabled.enabled) {
1662 __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1663 ompt_parallel_data, *return_address);
1664
1665 __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1666        // don't use lw_taskteam after linking. content was swapped
1667 task_info = OMPT_CUR_TASK_INFO(master_th);
1668 exit_frame_p = &(task_info->frame.exit_frame.ptr);
1669 if (ompt_enabled.ompt_callback_implicit_task) {
1670 OMPT_CUR_TASK_INFO(master_th)->thread_num = __kmp_tid_from_gtid(gtid);
1671 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1672 ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1673 &(task_info->task_data), 1,
1674 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1675 }
1676
1677 /* OMPT state */
1678 master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1679 } else {
1680 exit_frame_p = &dummy;
1681 }
1682#endif
1683
1684 {
1685 KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1686 KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1687 __kmp_invoke_microtask(microtask, gtid, 0, argc, parent_team->t.t_argv
1688#if OMPT_SUPPORT
1689 ,
1690 exit_frame_p
1691#endif
1692 );
1693 }
1694
1695#if OMPT_SUPPORT
1696 if (ompt_enabled.enabled) {
1697 *exit_frame_p = NULL;
1698 if (ompt_enabled.ompt_callback_implicit_task) {
1699 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1700 ompt_scope_end, NULL, &(task_info->task_data), 1,
1701 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1702 }
1703 *ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1704 __ompt_lw_taskteam_unlink(master_th);
1705 if (ompt_enabled.ompt_callback_parallel_end) {
1706 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1707 ompt_parallel_data, *parent_task_data,
1708 OMPT_INVOKER(call_context) | ompt_parallel_team, *return_address);
1709 }
1710 master_th->th.ompt_thread_info.state = ompt_state_overhead;
1711 }
1712#endif
1713 } else if (microtask == (microtask_t)__kmp_teams_master) {
1714 KMP_DEBUG_ASSERT(master_th->th.th_team == master_th->th.th_serial_team);
1715 team = master_th->th.th_team;
1716 // team->t.t_pkfn = microtask;
1717 team->t.t_invoke = invoker;
1718 __kmp_alloc_argv_entries(argc, team, TRUE);
1719 team->t.t_argc = argc;
1720 argv = (void **)team->t.t_argv;
1721 if (ap) {
1722 for (i = argc - 1; i >= 0; --i)
1723 *argv++ = va_arg(kmp_va_deref(ap), void *);
1724 } else {
1725 for (i = 0; i < argc; ++i)
1726 // Get args from parent team for teams construct
1727 argv[i] = parent_team->t.t_argv[i];
1728 }
1729 // AC: revert change made in __kmpc_serialized_parallel()
1730 // because initial code in teams should have level=0
1731 team->t.t_level--;
1732 // AC: call special invoker for outer "parallel" of teams construct
1733 invoker(gtid);
1734#if OMPT_SUPPORT
1735 if (ompt_enabled.enabled) {
1736 ompt_task_info_t *task_info = OMPT_CUR_TASK_INFO(master_th);
1737 if (ompt_enabled.ompt_callback_implicit_task) {
1738 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1739 ompt_scope_end, NULL, &(task_info->task_data), 0,
1740 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_initial);
1741 }
1742 if (ompt_enabled.ompt_callback_parallel_end) {
1743 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1744 ompt_parallel_data, *parent_task_data,
1745 OMPT_INVOKER(call_context) | ompt_parallel_league,
1746 *return_address);
1747 }
1748 master_th->th.ompt_thread_info.state = ompt_state_overhead;
1749 }
1750#endif
1751 } else {
1752 argv = args;
1753 for (i = argc - 1; i >= 0; --i)
1754 *argv++ = va_arg(kmp_va_deref(ap), void *);
1755 KMP_MB();
1756
1757#if OMPT_SUPPORT
1758 void *dummy;
1759 void **exit_frame_p;
1760 ompt_task_info_t *task_info;
1761 ompt_lw_taskteam_t lw_taskteam;
1762 ompt_data_t *implicit_task_data;
1763
1764 if (ompt_enabled.enabled) {
1765 __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1766 ompt_parallel_data, *return_address);
1767 __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1768        // don't use lw_taskteam after linking. content was swapped
1769 task_info = OMPT_CUR_TASK_INFO(master_th);
1770 exit_frame_p = &(task_info->frame.exit_frame.ptr);
1771
1772 /* OMPT implicit task begin */
1773 implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1774 if (ompt_enabled.ompt_callback_implicit_task) {
1775 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1776 ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1777 implicit_task_data, 1, __kmp_tid_from_gtid(gtid),
1778 ompt_task_implicit);
1779 OMPT_CUR_TASK_INFO(master_th)->thread_num = __kmp_tid_from_gtid(gtid);
1780 }
1781
1782 /* OMPT state */
1783 master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1784 } else {
1785 exit_frame_p = &dummy;
1786 }
1787#endif
1788
1789 {
1790 KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1791 KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1792 __kmp_invoke_microtask(microtask, gtid, 0, argc, args
1793#if OMPT_SUPPORT
1794 ,
1795 exit_frame_p
1796#endif
1797 );
1798 }
1799
1800#if OMPT_SUPPORT
1801 if (ompt_enabled.enabled) {
1802 *exit_frame_p = NULL;
1803 if (ompt_enabled.ompt_callback_implicit_task) {
1804 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1805 ompt_scope_end, NULL, &(task_info->task_data), 1,
1806 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1807 }
1808
1809 *ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1810 __ompt_lw_taskteam_unlink(master_th);
1811 if (ompt_enabled.ompt_callback_parallel_end) {
1812 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1813 ompt_parallel_data, *parent_task_data,
1814 OMPT_INVOKER(call_context) | ompt_parallel_team, *return_address);
1815 }
1816 master_th->th.ompt_thread_info.state = ompt_state_overhead;
1817 }
1818#endif
1819 }
1820 } else if (call_context == fork_context_gnu) {
1821#if OMPT_SUPPORT
1822 if (ompt_enabled.enabled) {
1823 ompt_lw_taskteam_t lwt;
1824 __ompt_lw_taskteam_init(&lwt, master_th, gtid, ompt_parallel_data,
1825 *return_address);
1826
1827 lwt.ompt_task_info.frame.exit_frame = ompt_data_none;
1828 __ompt_lw_taskteam_link(&lwt, master_th, 1);
1829 }
1830// don't use lw_taskteam after linking. content was swapped
1831#endif
1832
1833 // we were called from GNU native code
1834 KA_TRACE(20, ("__kmp_serial_fork_call: T#%d serial exit\n", gtid));
1835 return FALSE;
1836 } else {
1837 KMP_ASSERT2(call_context < fork_context_last,
1838 "__kmp_serial_fork_call: unknown fork_context parameter");
1839 }
1840
1841 KA_TRACE(20, ("__kmp_serial_fork_call: T#%d serial exit\n", gtid));
1842 KMP_MB();
1843 return FALSE;
1844}
1845
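/* Editorial summary of the flow below: check for a parallel nested directly in
   a teams construct, reserve threads under the forkjoin lock, fall back to
   __kmp_serial_fork_call when only one thread is available, and otherwise
   allocate and set up a team, fork the workers, and invoke the microtask on
   the primary thread. */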
1846/* most of the work for a fork */
1847/* return true if we really went parallel, false if serialized */
1848int __kmp_fork_call(ident_t *loc, int gtid,
1849 enum fork_context_e call_context, // Intel, GNU, ...
1850 kmp_int32 argc, microtask_t microtask, launch_t invoker,
1851 kmp_va_list ap) {
1852 void **argv;
1853 int i;
1854 int master_tid;
1855 int master_this_cons;
1856 kmp_team_t *team;
1857 kmp_team_t *parent_team;
1858 kmp_info_t *master_th;
1859 kmp_root_t *root;
1860 int nthreads;
1861 int master_active;
1862 int master_set_numthreads;
1863 int level;
1864 int active_level;
1865 int teams_level;
1866#if KMP_NESTED_HOT_TEAMS
1867 kmp_hot_team_ptr_t **p_hot_teams;
1868#endif
1869 { // KMP_TIME_BLOCK
1870 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_fork_call);
1871 KMP_COUNT_VALUE(OMP_PARALLEL_args, argc);
1872
1873 KA_TRACE(20, ("__kmp_fork_call: enter T#%d\n", gtid));
1874 if (__kmp_stkpadding > 0 && __kmp_root[gtid] != NULL) {
1875 /* Some systems prefer the stack for the root thread(s) to start with */
1876 /* some gap from the parent stack to prevent false sharing. */
1877 void *dummy = KMP_ALLOCA(__kmp_stkpadding);
1878 /* These 2 lines below are so this does not get optimized out */
1879 if (__kmp_stkpadding > KMP_MAX_STKPADDING)
1880 __kmp_stkpadding += (short)((kmp_int64)dummy);
1881 }
1882
1883 /* initialize if needed */
1884 KMP_DEBUG_ASSERT(
1885 __kmp_init_serial); // AC: potentially unsafe, not in sync with shutdown
1886 if (!TCR_4(__kmp_init_parallel))
1887 __kmp_parallel_initialize();
1888 __kmp_resume_if_soft_paused();
1889
1890 /* setup current data */
1891 // AC: potentially unsafe, not in sync with library shutdown,
1892 // __kmp_threads can be freed
1893 master_th = __kmp_threads[gtid];
1894
1895 parent_team = master_th->th.th_team;
1896 master_tid = master_th->th.th_info.ds.ds_tid;
1897 master_this_cons = master_th->th.th_local.this_construct;
1898 root = master_th->th.th_root;
1899 master_active = root->r.r_active;
1900 master_set_numthreads = master_th->th.th_set_nproc;
1901
1902#if OMPT_SUPPORT
1903 ompt_data_t ompt_parallel_data = ompt_data_none;
1904 ompt_data_t *parent_task_data;
1905 ompt_frame_t *ompt_frame;
1906 void *return_address = NULL;
1907
1908 if (ompt_enabled.enabled) {
1909 __ompt_get_task_info_internal(0, NULL, &parent_task_data, &ompt_frame,
1910 NULL, NULL);
1911 return_address = OMPT_LOAD_RETURN_ADDRESS(gtid);
1912 }
1913#endif
1914
1915 // Assign affinity to root thread if it hasn't happened yet
1916 __kmp_assign_root_init_mask();
1917
1918 // Nested level will be an index in the nested nthreads array
1919 level = parent_team->t.t_level;
1920 // used to launch non-serial teams even if nested is not allowed
1921 active_level = parent_team->t.t_active_level;
1922 // needed to check nesting inside the teams
1923 teams_level = master_th->th.th_teams_level;
1924#if KMP_NESTED_HOT_TEAMS
1925 p_hot_teams = &master_th->th.th_hot_teams;
1926 if (*p_hot_teams == NULL && __kmp_hot_teams_max_level > 0) {
1927 *p_hot_teams = (kmp_hot_team_ptr_t *)__kmp_allocate(
1928 sizeof(kmp_hot_team_ptr_t) * __kmp_hot_teams_max_level);
1929 (*p_hot_teams)[0].hot_team = root->r.r_hot_team;
1930 // it is either actual or not needed (when active_level > 0)
1931 (*p_hot_teams)[0].hot_team_nth = 1;
1932 }
1933#endif
1934
1935#if OMPT_SUPPORT
1936 if (ompt_enabled.enabled) {
1937 if (ompt_enabled.ompt_callback_parallel_begin) {
1938 int team_size = master_set_numthreads
1939 ? master_set_numthreads
1940 : get__nproc_2(parent_team, master_tid);
1941 int flags = OMPT_INVOKER(call_context) |
1942 ((microtask == (microtask_t)__kmp_teams_master)
1943 ? ompt_parallel_league
1944 : ompt_parallel_team);
1945 ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
1946 parent_task_data, ompt_frame, &ompt_parallel_data, team_size, flags,
1947 return_address);
1948 }
1949 master_th->th.ompt_thread_info.state = ompt_state_overhead;
1950 }
1951#endif
1952
1953 master_th->th.th_ident = loc;
1954
1955 // Parallel closely nested in teams construct:
1956 if (__kmp_is_fork_in_teams(master_th, microtask, level, teams_level, ap)) {
1957 return __kmp_fork_in_teams(loc, gtid, parent_team, argc, master_th, root,
1958 call_context, microtask, invoker,
1959 master_set_numthreads, level,
1960#if OMPT_SUPPORT
1961 ompt_parallel_data, return_address,
1962#endif
1963 ap);
1964 } // End parallel closely nested in teams construct
1965
1966#if KMP_DEBUG
1967 if (__kmp_tasking_mode != tskm_immediate_exec) {
1968 KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
1969 parent_team->t.t_task_team[master_th->th.th_task_state]);
1970 }
1971#endif
1972
1973 // Need this to happen before we determine the number of threads, not while
1974 // we are allocating the team
1975 //__kmp_push_current_task_to_thread(master_th, parent_team, 0);
1976
1977 // Determine the number of threads
1978 int enter_teams =
1979 __kmp_is_entering_teams(active_level, level, teams_level, ap);
1980 if ((!enter_teams &&
1981 (parent_team->t.t_active_level >=
1982 master_th->th.th_current_task->td_icvs.max_active_levels)) ||
1983 (__kmp_library == library_serial)) {
1984 KC_TRACE(10, ("__kmp_fork_call: T#%d serializing team\n", gtid));
1985 nthreads = 1;
1986 } else {
1987 nthreads = master_set_numthreads
1988 ? master_set_numthreads
1989 // TODO: get nproc directly from current task
1990 : get__nproc_2(parent_team, master_tid);
1991      // Check whether we need to take the forkjoin lock (not needed for a
1992      // serialized parallel outside of a teams construct).
1993 if (nthreads > 1) {
1994 /* determine how many new threads we can use */
1995 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
1996 /* AC: If we execute teams from parallel region (on host), then teams
1997 should be created but each can only have 1 thread if nesting is
1998 disabled. If teams called from serial region, then teams and their
1999 threads should be created regardless of the nesting setting. */
2000 nthreads = __kmp_reserve_threads(root, parent_team, master_tid,
2001 nthreads, enter_teams);
2002 if (nthreads == 1) {
2003        // Free the lock for single-thread execution here; for multi-thread
2004        // execution it will be freed later, after the team of threads has
2005        // been created and initialized.
2006 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2007 }
2008 }
2009 }
2010 KMP_DEBUG_ASSERT(nthreads > 0);
2011
2012 // If we temporarily changed the set number of threads then restore it now
2013 master_th->th.th_set_nproc = 0;
2014
2015 if (nthreads == 1) {
2016 return __kmp_serial_fork_call(loc, gtid, call_context, argc, microtask,
2017 invoker, master_th, parent_team,
2018#if OMPT_SUPPORT
2019 &ompt_parallel_data, &return_address,
2020 &parent_task_data,
2021#endif
2022 ap);
2023 } // if (nthreads == 1)
2024
2025 // GEH: only modify the executing flag in the case when not serialized
2026 // serialized case is handled in kmpc_serialized_parallel
2027 KF_TRACE(10, ("__kmp_fork_call: parent_team_aclevel=%d, master_th=%p, "
2028 "curtask=%p, curtask_max_aclevel=%d\n",
2029 parent_team->t.t_active_level, master_th,
2030 master_th->th.th_current_task,
2031 master_th->th.th_current_task->td_icvs.max_active_levels));
2032 // TODO: GEH - cannot do this assertion because root thread not set up as
2033 // executing
2034 // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 1 );
2035 master_th->th.th_current_task->td_flags.executing = 0;
2036
2037 if (!master_th->th.th_teams_microtask || level > teams_level) {
2038 /* Increment our nested depth level */
2039 KMP_ATOMIC_INC(&root->r.r_in_parallel);
2040 }
2041
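    // Editorial example (illustrative, not from the original source): with
    // OMP_NUM_THREADS="4,2", a level-0 fork finds __kmp_nested_nth.nth[1] == 2,
    // so nthreads_icv becomes 2 and the children's nproc ICV is overridden;
    // when the nested value equals the current ICV, nthreads_icv stays 0 and
    // no ICV copy is made below.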
2042 // See if we need to make a copy of the ICVs.
2043 int nthreads_icv = master_th->th.th_current_task->td_icvs.nproc;
2044 if ((level + 1 < __kmp_nested_nth.used) &&
2045 (__kmp_nested_nth.nth[level + 1] != nthreads_icv)) {
2046 nthreads_icv = __kmp_nested_nth.nth[level + 1];
2047 } else {
2048 nthreads_icv = 0; // don't update
2049 }
2050
2051 // Figure out the proc_bind_policy for the new team.
2052 kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind;
2053 // proc_bind_default means don't update
2054 kmp_proc_bind_t proc_bind_icv = proc_bind_default;
2055 if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
2056 proc_bind = proc_bind_false;
2057 } else {
2058 // No proc_bind clause specified; use current proc-bind-var for this
2059 // parallel region
2060 if (proc_bind == proc_bind_default) {
2061 proc_bind = master_th->th.th_current_task->td_icvs.proc_bind;
2062 }
2063 // Have teams construct take proc_bind value from KMP_TEAMS_PROC_BIND
2064 if (master_th->th.th_teams_microtask &&
2065 microtask == (microtask_t)__kmp_teams_master) {
2066 proc_bind = __kmp_teams_proc_bind;
2067 }
2068 /* else: The proc_bind policy was specified explicitly on parallel clause.
2069 This overrides proc-bind-var for this parallel region, but does not
2070 change proc-bind-var. */
2071 // Figure the value of proc-bind-var for the child threads.
2072 if ((level + 1 < __kmp_nested_proc_bind.used) &&
2073 (__kmp_nested_proc_bind.bind_types[level + 1] !=
2074 master_th->th.th_current_task->td_icvs.proc_bind)) {
2075 // Do not modify the proc bind icv for the two teams construct forks
2076 // They just let the proc bind icv pass through
2077 if (!master_th->th.th_teams_microtask ||
2078 !(microtask == (microtask_t)__kmp_teams_master || ap == NULL))
2079 proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1];
2080 }
2081 }
2082
2083 // Reset for next parallel region
2084 master_th->th.th_set_proc_bind = proc_bind_default;
2085
2086 if ((nthreads_icv > 0) || (proc_bind_icv != proc_bind_default)) {
2087 kmp_internal_control_t new_icvs;
2088 copy_icvs(&new_icvs, &master_th->th.th_current_task->td_icvs);
2089 new_icvs.next = NULL;
2090 if (nthreads_icv > 0) {
2091 new_icvs.nproc = nthreads_icv;
2092 }
2093 if (proc_bind_icv != proc_bind_default) {
2094 new_icvs.proc_bind = proc_bind_icv;
2095 }
2096
2097 /* allocate a new parallel team */
2098 KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
2099 team = __kmp_allocate_team(root, nthreads, nthreads,
2100#if OMPT_SUPPORT
2101 ompt_parallel_data,
2102#endif
2103 proc_bind, &new_icvs,
2104 argc USE_NESTED_HOT_ARG(master_th));
2105 if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar)
2106 copy_icvs((kmp_internal_control_t *)team->t.b->team_icvs, &new_icvs);
2107 } else {
2108 /* allocate a new parallel team */
2109 KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
2110 team = __kmp_allocate_team(root, nthreads, nthreads,
2111#if OMPT_SUPPORT
2112 ompt_parallel_data,
2113#endif
2114 proc_bind,
2115 &master_th->th.th_current_task->td_icvs,
2116 argc USE_NESTED_HOT_ARG(master_th));
2117 if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar)
2118 copy_icvs((kmp_internal_control_t *)team->t.b->team_icvs,
2119 &master_th->th.th_current_task->td_icvs);
2120 }
2121 KF_TRACE(
2122 10, ("__kmp_fork_call: after __kmp_allocate_team - team = %p\n", team));
2123
2124 /* setup the new team */
2125 KMP_CHECK_UPDATE(team->t.t_master_tid, master_tid);
2126 KMP_CHECK_UPDATE(team->t.t_master_this_cons, master_this_cons);
2127 KMP_CHECK_UPDATE(team->t.t_ident, loc);
2128 KMP_CHECK_UPDATE(team->t.t_parent, parent_team);
2129 KMP_CHECK_UPDATE_SYNC(team->t.t_pkfn, microtask);
2130#if OMPT_SUPPORT
2131 KMP_CHECK_UPDATE_SYNC(team->t.ompt_team_info.master_return_address,
2132 return_address);
2133#endif
2134 KMP_CHECK_UPDATE(team->t.t_invoke, invoker); // TODO move to root, maybe
2135 // TODO: parent_team->t.t_level == INT_MAX ???
2136 if (!master_th->th.th_teams_microtask || level > teams_level) {
2137 int new_level = parent_team->t.t_level + 1;
2138 KMP_CHECK_UPDATE(team->t.t_level, new_level);
2139 new_level = parent_team->t.t_active_level + 1;
2140 KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2141 } else {
2142 // AC: Do not increase parallel level at start of the teams construct
2143 int new_level = parent_team->t.t_level;
2144 KMP_CHECK_UPDATE(team->t.t_level, new_level);
2145 new_level = parent_team->t.t_active_level;
2146 KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2147 }
2148 kmp_r_sched_t new_sched = get__sched_2(parent_team, master_tid);
2149 // set primary thread's schedule as new run-time schedule
2150 KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
2151
2152 KMP_CHECK_UPDATE(team->t.t_cancel_request, cancel_noreq);
2153 KMP_CHECK_UPDATE(team->t.t_def_allocator, master_th->th.th_def_allocator);
2154
2155 // Update the floating point rounding in the team if required.
2156 propagateFPControl(team);
2157#if OMPD_SUPPORT
2158 if (ompd_state & OMPD_ENABLE_BP)
2159 ompd_bp_parallel_begin();
2160#endif
2161
2162 if (__kmp_tasking_mode != tskm_immediate_exec) {
2163      // Set the primary thread's task team to the team's task team. Unless
2164      // this is a hot team, it should be NULL.
2165 KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
2166 parent_team->t.t_task_team[master_th->th.th_task_state]);
2167 KA_TRACE(20, ("__kmp_fork_call: Primary T#%d pushing task_team %p / team "
2168 "%p, new task_team %p / team %p\n",
2169 __kmp_gtid_from_thread(master_th),
2170 master_th->th.th_task_team, parent_team,
2171 team->t.t_task_team[master_th->th.th_task_state], team));
2172
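      // Editorial note: the memo stack below records the primary thread's
      // th_task_state per nesting level; it grows geometrically (doubling)
      // whenever th_task_state_top reaches th_task_state_stack_sz.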
2173 if (active_level || master_th->th.th_task_team) {
2174 // Take a memo of primary thread's task_state
2175 KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2176 if (master_th->th.th_task_state_top >=
2177 master_th->th.th_task_state_stack_sz) { // increase size
2178 kmp_uint32 new_size = 2 * master_th->th.th_task_state_stack_sz;
2179 kmp_uint8 *old_stack, *new_stack;
2180 kmp_uint32 i;
2181 new_stack = (kmp_uint8 *)__kmp_allocate(new_size);
2182 for (i = 0; i < master_th->th.th_task_state_stack_sz; ++i) {
2183 new_stack[i] = master_th->th.th_task_state_memo_stack[i];
2184 }
2185 for (i = master_th->th.th_task_state_stack_sz; i < new_size;
2186 ++i) { // zero-init rest of stack
2187 new_stack[i] = 0;
2188 }
2189 old_stack = master_th->th.th_task_state_memo_stack;
2190 master_th->th.th_task_state_memo_stack = new_stack;
2191 master_th->th.th_task_state_stack_sz = new_size;
2192 __kmp_free(old_stack);
2193 }
2194 // Store primary thread's task_state on stack
2195 master_th->th
2196 .th_task_state_memo_stack[master_th->th.th_task_state_top] =
2197 master_th->th.th_task_state;
2198 master_th->th.th_task_state_top++;
2199#if KMP_NESTED_HOT_TEAMS
2200 if (master_th->th.th_hot_teams &&
2201 active_level < __kmp_hot_teams_max_level &&
2202 team == master_th->th.th_hot_teams[active_level].hot_team) {
2203 // Restore primary thread's nested state if nested hot team
2204 master_th->th.th_task_state =
2205 master_th->th
2206 .th_task_state_memo_stack[master_th->th.th_task_state_top];
2207 } else {
2208#endif
2209 master_th->th.th_task_state = 0;
2210#if KMP_NESTED_HOT_TEAMS
2211 }
2212#endif
2213 }
2214#if !KMP_NESTED_HOT_TEAMS
2215 KMP_DEBUG_ASSERT((master_th->th.th_task_team == NULL) ||
2216 (team == root->r.r_hot_team));
2217#endif
2218 }
2219
2220 KA_TRACE(
2221 20,
2222 ("__kmp_fork_call: T#%d(%d:%d)->(%d:0) created a team of %d threads\n",
2223 gtid, parent_team->t.t_id, team->t.t_master_tid, team->t.t_id,
2224 team->t.t_nproc));
2225 KMP_DEBUG_ASSERT(team != root->r.r_hot_team ||
2226 (team->t.t_master_tid == 0 &&
2227 (team->t.t_parent == root->r.r_root_team ||
2228 team->t.t_parent->t.t_serialized)));
2229 KMP_MB();
2230
2231 /* now, setup the arguments */
2232 argv = (void **)team->t.t_argv;
2233 if (ap) {
2234 for (i = argc - 1; i >= 0; --i) {
2235 void *new_argv = va_arg(kmp_va_deref(ap), void *);
2236 KMP_CHECK_UPDATE(*argv, new_argv);
2237 argv++;
2238 }
2239 } else {
2240 for (i = 0; i < argc; ++i) {
2241 // Get args from parent team for teams construct
2242 KMP_CHECK_UPDATE(argv[i], team->t.t_parent->t.t_argv[i]);
2243 }
2244 }
2245
2246 /* now actually fork the threads */
2247 KMP_CHECK_UPDATE(team->t.t_master_active, master_active);
2248 if (!root->r.r_active) // Only do assignment if it prevents cache ping-pong
2249 root->r.r_active = TRUE;
2250
2251 __kmp_fork_team_threads(root, team, master_th, gtid, !ap);
2252 __kmp_setup_icv_copy(team, nthreads,
2253 &master_th->th.th_current_task->td_icvs, loc);
2254
2255#if OMPT_SUPPORT
2256 master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
2257#endif
2258
2259 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2260
2261#if USE_ITT_BUILD
2262 if (team->t.t_active_level == 1 // only report frames at level 1
2263 && !master_th->th.th_teams_microtask) { // not in teams construct
2264#if USE_ITT_NOTIFY
2265 if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2266 (__kmp_forkjoin_frames_mode == 3 ||
2267 __kmp_forkjoin_frames_mode == 1)) {
2268 kmp_uint64 tmp_time = 0;
2269 if (__itt_get_timestamp_ptr)
2270 tmp_time = __itt_get_timestamp();
2271 // Internal fork - report frame begin
2272 master_th->th.th_frame_time = tmp_time;
2273 if (__kmp_forkjoin_frames_mode == 3)
2274 team->t.t_region_time = tmp_time;
2275 } else
2276// only one notification scheme (either "submit" or "forking/joined", not both)
2277#endif /* USE_ITT_NOTIFY */
2278 if ((__itt_frame_begin_v3_ptr || KMP_ITT_DEBUG) &&
2279 __kmp_forkjoin_frames && !__kmp_forkjoin_frames_mode) {
2280 // Mark start of "parallel" region for Intel(R) VTune(TM) analyzer.
2281 __kmp_itt_region_forking(gtid, team->t.t_nproc, 0);
2282 }
2283 }
2284#endif /* USE_ITT_BUILD */
2285
2286 /* now go on and do the work */
2287 KMP_DEBUG_ASSERT(team == __kmp_threads[gtid]->th.th_team);
2288 KMP_MB();
2289 KF_TRACE(10,
2290 ("__kmp_internal_fork : root=%p, team=%p, master_th=%p, gtid=%d\n",
2291 root, team, master_th, gtid));
2292
2293#if USE_ITT_BUILD
2294 if (__itt_stack_caller_create_ptr) {
2295 // create new stack stitching id before entering fork barrier
2296 if (!enter_teams) {
2297 KMP_DEBUG_ASSERT(team->t.t_stack_id == NULL);
2298 team->t.t_stack_id = __kmp_itt_stack_caller_create();
2299 } else if (parent_team->t.t_serialized) {
2300 // keep stack stitching id in the serialized parent_team;
2301 // current team will be used for parallel inside the teams;
2302 // if parent_team is active, then it already keeps stack stitching id
2303 // for the league of teams
2304 KMP_DEBUG_ASSERT(parent_team->t.t_stack_id == NULL);
2305 parent_team->t.t_stack_id = __kmp_itt_stack_caller_create();
2306 }
2307 }
2308#endif /* USE_ITT_BUILD */
2309
2310 // AC: skip __kmp_internal_fork at teams construct, let only primary
2311 // threads execute
2312 if (ap) {
2313 __kmp_internal_fork(loc, gtid, team);
2314 KF_TRACE(10, ("__kmp_internal_fork : after : root=%p, team=%p, "
2315 "master_th=%p, gtid=%d\n",
2316 root, team, master_th, gtid));
2317 }
2318
2319 if (call_context == fork_context_gnu) {
2320 KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2321 return TRUE;
2322 }
2323
2324 /* Invoke microtask for PRIMARY thread */
2325 KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
2326 team->t.t_id, team->t.t_pkfn));
2327 } // END of timer KMP_fork_call block
2328
2329#if KMP_STATS_ENABLED
2330 // If beginning a teams construct, then change thread state
2331 stats_state_e previous_state = KMP_GET_THREAD_STATE();
2332 if (!ap) {
2333 KMP_SET_THREAD_STATE(stats_state_e::TEAMS_REGION);
2334 }
2335#endif
2336
2337 if (!team->t.t_invoke(gtid)) {
2338 KMP_ASSERT2(0, "cannot invoke microtask for PRIMARY thread");
2339 }
2340
2341#if KMP_STATS_ENABLED
2342 // If was beginning of a teams construct, then reset thread state
2343 if (!ap) {
2344 KMP_SET_THREAD_STATE(previous_state);
2345 }
2346#endif
2347
2348 KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
2349 team->t.t_id, team->t.t_pkfn));
2350 KMP_MB(); /* Flush all pending memory write invalidates. */
2351
2352 KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2353#if OMPT_SUPPORT
2354 if (ompt_enabled.enabled) {
2355 master_th->th.ompt_thread_info.state = ompt_state_overhead;
2356 }
2357#endif
2358
2359 return TRUE;
2360}
2361
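/* Editorial sketch (simplified and hedged; see kmp_csupport.cpp for the actual
   entry point): a compiler-outlined parallel region typically reaches the code
   above roughly as

     __kmpc_fork_call(loc, argc, microtask, ...)
         -> __kmp_fork_call(loc, gtid, fork_context_intel, argc, microtask,
                            __kmp_invoke_task_func, ap);

   and, per the comment on __kmp_fork_call, the return value is TRUE if the
   region really went parallel and FALSE if it was serialized. */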
2362#if OMPT_SUPPORT
2363static inline void __kmp_join_restore_state(kmp_info_t *thread,
2364 kmp_team_t *team) {
2365 // restore state outside the region
2366 thread->th.ompt_thread_info.state =
2367 ((team->t.t_serialized) ? ompt_state_work_serial
2368 : ompt_state_work_parallel);
2369}
2370
2371static inline void __kmp_join_ompt(int gtid, kmp_info_t *thread,
2372 kmp_team_t *team, ompt_data_t *parallel_data,
2373 int flags, void *codeptr) {
2374 ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2375 if (ompt_enabled.ompt_callback_parallel_end) {
2376 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
2377 parallel_data, &(task_info->task_data), flags, codeptr);
2378 }
2379
2380 task_info->frame.enter_frame = ompt_data_none;
2381 __kmp_join_restore_state(thread, team);
2382}
2383#endif
2384
2385void __kmp_join_call(ident_t *loc, int gtid
2386#if OMPT_SUPPORT
2387 ,
2388 enum fork_context_e fork_context
2389#endif
2390 ,
2391 int exit_teams) {
2392 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_join_call);
2393 kmp_team_t *team;
2394 kmp_team_t *parent_team;
2395 kmp_info_t *master_th;
2396 kmp_root_t *root;
2397 int master_active;
2398
2399 KA_TRACE(20, ("__kmp_join_call: enter T#%d\n", gtid));
2400
2401 /* setup current data */
2402 master_th = __kmp_threads[gtid];
2403 root = master_th->th.th_root;
2404 team = master_th->th.th_team;
2405 parent_team = team->t.t_parent;
2406
2407 master_th->th.th_ident = loc;
2408
2409#if OMPT_SUPPORT
2410 void *team_microtask = (void *)team->t.t_pkfn;
2411 // For GOMP interface with serialized parallel, need the
2412 // __kmpc_end_serialized_parallel to call hooks for OMPT end-implicit-task
2413 // and end-parallel events.
2414 if (ompt_enabled.enabled &&
2415 !(team->t.t_serialized && fork_context == fork_context_gnu)) {
2416 master_th->th.ompt_thread_info.state = ompt_state_overhead;
2417 }
2418#endif
2419
2420#if KMP_DEBUG
2421 if (__kmp_tasking_mode != tskm_immediate_exec && !exit_teams) {
2422 KA_TRACE(20, ("__kmp_join_call: T#%d, old team = %p old task_team = %p, "
2423 "th_task_team = %p\n",
2424 __kmp_gtid_from_thread(master_th), team,
2425 team->t.t_task_team[master_th->th.th_task_state],
2426 master_th->th.th_task_team));
2427 KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
2428 team->t.t_task_team[master_th->th.th_task_state]);
2429 }
2430#endif
2431
2432 if (team->t.t_serialized) {
2433 if (master_th->th.th_teams_microtask) {
2434 // We are in teams construct
2435 int level = team->t.t_level;
2436 int tlevel = master_th->th.th_teams_level;
2437 if (level == tlevel) {
2438 // AC: we haven't incremented it earlier at start of teams construct,
2439 // so do it here - at the end of teams construct
2440 team->t.t_level++;
2441 } else if (level == tlevel + 1) {
2442 // AC: we are exiting parallel inside teams, need to increment
2443 // serialization in order to restore it in the next call to
2444 // __kmpc_end_serialized_parallel
2445 team->t.t_serialized++;
2446 }
2447 }
2448    __kmpc_end_serialized_parallel(loc, gtid);
2449
2450#if OMPT_SUPPORT
2451 if (ompt_enabled.enabled) {
2452 if (fork_context == fork_context_gnu) {
2453 __ompt_lw_taskteam_unlink(master_th);
2454 }
2455 __kmp_join_restore_state(master_th, parent_team);
2456 }
2457#endif
2458
2459 return;
2460 }
2461
2462 master_active = team->t.t_master_active;
2463
2464 if (!exit_teams) {
2465 // AC: No barrier for internal teams at exit from teams construct.
2466    // But there is a barrier for the external team (league).
2467 __kmp_internal_join(loc, gtid, team);
2468#if USE_ITT_BUILD
2469 if (__itt_stack_caller_create_ptr) {
2470 KMP_DEBUG_ASSERT(team->t.t_stack_id != NULL);
2471 // destroy the stack stitching id after join barrier
2472 __kmp_itt_stack_caller_destroy((__itt_caller)team->t.t_stack_id);
2473 team->t.t_stack_id = NULL;
2474 }
2475#endif
2476 } else {
2477 master_th->th.th_task_state =
2478 0; // AC: no tasking in teams (out of any parallel)
2479#if USE_ITT_BUILD
2480 if (__itt_stack_caller_create_ptr && parent_team->t.t_serialized) {
2481 KMP_DEBUG_ASSERT(parent_team->t.t_stack_id != NULL);
2482 // destroy the stack stitching id on exit from the teams construct
2483 // if parent_team is active, then the id will be destroyed later on
2484 // by master of the league of teams
2485 __kmp_itt_stack_caller_destroy((__itt_caller)parent_team->t.t_stack_id);
2486 parent_team->t.t_stack_id = NULL;
2487 }
2488#endif
2489 }
2490
2491 KMP_MB();
2492
2493#if OMPT_SUPPORT
2494 ompt_data_t *parallel_data = &(team->t.ompt_team_info.parallel_data);
2495 void *codeptr = team->t.ompt_team_info.master_return_address;
2496#endif
2497
2498#if USE_ITT_BUILD
2499 // Mark end of "parallel" region for Intel(R) VTune(TM) analyzer.
2500 if (team->t.t_active_level == 1 &&
2501 (!master_th->th.th_teams_microtask || /* not in teams construct */
2502 master_th->th.th_teams_size.nteams == 1)) {
2503 master_th->th.th_ident = loc;
2504 // only one notification scheme (either "submit" or "forking/joined", not
2505 // both)
2506 if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2507 __kmp_forkjoin_frames_mode == 3)
2508 __kmp_itt_frame_submit(gtid, team->t.t_region_time,
2509 master_th->th.th_frame_time, 0, loc,
2510 master_th->th.th_team_nproc, 1);
2511 else if ((__itt_frame_end_v3_ptr || KMP_ITT_DEBUG) &&
2512 !__kmp_forkjoin_frames_mode && __kmp_forkjoin_frames)
2513 __kmp_itt_region_joined(gtid);
2514 } // active_level == 1
2515#endif /* USE_ITT_BUILD */
2516
2517#if KMP_AFFINITY_SUPPORTED
2518 if (!exit_teams) {
2519 // Restore master thread's partition.
2520 master_th->th.th_first_place = team->t.t_first_place;
2521 master_th->th.th_last_place = team->t.t_last_place;
2522 }
2523#endif // KMP_AFFINITY_SUPPORTED
2524
2525 if (master_th->th.th_teams_microtask && !exit_teams &&
2526 team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
2527 team->t.t_level == master_th->th.th_teams_level + 1) {
2528// AC: We need to leave the team structure intact at the end of a parallel
2529// inside the teams construct, so that the same (hot) team is reused by the
2530// next parallel; only the nesting levels are adjusted.
2531#if OMPT_SUPPORT
2532 ompt_data_t ompt_parallel_data = ompt_data_none;
2533 if (ompt_enabled.enabled) {
2534 ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2535 if (ompt_enabled.ompt_callback_implicit_task) {
2536 int ompt_team_size = team->t.t_nproc;
2537 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2538 ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size,
2539 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
2540 }
2541 task_info->frame.exit_frame = ompt_data_none;
2542 task_info->task_data = ompt_data_none;
2543 ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
2544 __ompt_lw_taskteam_unlink(master_th);
2545 }
2546#endif
2547 /* Decrement our nested depth level */
2548 team->t.t_level--;
2549 team->t.t_active_level--;
2550 KMP_ATOMIC_DEC(&root->r.r_in_parallel);
2551
2552 // Restore number of threads in the team if needed. This code relies on
2553 // the proper adjustment of th_teams_size.nth after the fork in
2554 // __kmp_teams_master on each teams primary thread in the case that
2555 // __kmp_reserve_threads reduced it.
2556 if (master_th->th.th_team_nproc < master_th->th.th_teams_size.nth) {
2557 int old_num = master_th->th.th_team_nproc;
2558 int new_num = master_th->th.th_teams_size.nth;
2559 kmp_info_t **other_threads = team->t.t_threads;
2560 team->t.t_nproc = new_num;
2561 for (int i = 0; i < old_num; ++i) {
2562 other_threads[i]->th.th_team_nproc = new_num;
2563 }
2564 // Adjust states of non-used threads of the team
2565 for (int i = old_num; i < new_num; ++i) {
2566 // Re-initialize thread's barrier data.
2567 KMP_DEBUG_ASSERT(other_threads[i]);
2568 kmp_balign_t *balign = other_threads[i]->th.th_bar;
2569 for (int b = 0; b < bs_last_barrier; ++b) {
2570 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
2571 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
2572#if USE_DEBUGGER
2573 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
2574#endif
2575 }
2576 if (__kmp_tasking_mode != tskm_immediate_exec) {
2577 // Synchronize thread's task state
2578 other_threads[i]->th.th_task_state = master_th->th.th_task_state;
2579 }
2580 }
2581 }
2582
2583#if OMPT_SUPPORT
2584 if (ompt_enabled.enabled) {
2585 __kmp_join_ompt(gtid, master_th, parent_team, &ompt_parallel_data,
2586 OMPT_INVOKER(fork_context) | ompt_parallel_team, codeptr);
2587 }
2588#endif
2589
2590 return;
2591 }
2592
2593 /* do cleanup and restore the parent team */
2594 master_th->th.th_info.ds.ds_tid = team->t.t_master_tid;
2595 master_th->th.th_local.this_construct = team->t.t_master_this_cons;
2596
2597 master_th->th.th_dispatch = &parent_team->t.t_dispatch[team->t.t_master_tid];
2598
2599 /* jc: The following lock has instructions with REL and ACQ semantics,
2600 separating the parallel user code called in this parallel region
2601 from the serial user code called after this function returns. */
2602 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2603
2604 if (!master_th->th.th_teams_microtask ||
2605 team->t.t_level > master_th->th.th_teams_level) {
2606 /* Decrement our nested depth level */
2607 KMP_ATOMIC_DEC(&root->r.r_in_parallel);
2608 }
2609 KMP_DEBUG_ASSERT(root->r.r_in_parallel >= 0);
2610
2611#if OMPT_SUPPORT
2612 if (ompt_enabled.enabled) {
2613 ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2614 if (ompt_enabled.ompt_callback_implicit_task) {
2615 int flags = (team_microtask == (void *)__kmp_teams_master)
2616 ? ompt_task_initial
2617 : ompt_task_implicit;
2618 int ompt_team_size = (flags == ompt_task_initial) ? 0 : team->t.t_nproc;
2619 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2620 ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size,
2621 OMPT_CUR_TASK_INFO(master_th)->thread_num, flags);
2622 }
2623 task_info->frame.exit_frame = ompt_data_none;
2624 task_info->task_data = ompt_data_none;
2625 }
2626#endif
2627
2628 KF_TRACE(10, ("__kmp_join_call1: T#%d, this_thread=%p team=%p\n", 0,
2629 master_th, team));
2630 __kmp_pop_current_task_from_thread(master_th);
2631
2632 master_th->th.th_def_allocator = team->t.t_def_allocator;
2633
2634#if OMPD_SUPPORT
2635 if (ompd_state & OMPD_ENABLE_BP)
2636 ompd_bp_parallel_end();
2637#endif
2638 updateHWFPControl(team);
2639
2640 if (root->r.r_active != master_active)
2641 root->r.r_active = master_active;
2642
2643 __kmp_free_team(root, team USE_NESTED_HOT_ARG(
2644 master_th)); // this will free worker threads
2645
2646  /* This race was fun to find. Make sure the following stays in the critical
2647     region; otherwise assertions may fail occasionally, since the old team may
2648     be reallocated and the hierarchy can appear inconsistent. The race is
2649     actually safe to run and causes no functional bugs, only those assertion
2650     failures. It is just one dereference and assignment, so keep it here. */
2651 master_th->th.th_team = parent_team;
2652 master_th->th.th_team_nproc = parent_team->t.t_nproc;
2653 master_th->th.th_team_master = parent_team->t.t_threads[0];
2654 master_th->th.th_team_serialized = parent_team->t.t_serialized;
2655
2656 /* restore serialized team, if need be */
2657 if (parent_team->t.t_serialized &&
2658 parent_team != master_th->th.th_serial_team &&
2659 parent_team != root->r.r_root_team) {
2660 __kmp_free_team(root,
2661 master_th->th.th_serial_team USE_NESTED_HOT_ARG(NULL));
2662 master_th->th.th_serial_team = parent_team;
2663 }
2664
2665 if (__kmp_tasking_mode != tskm_immediate_exec) {
2666 if (master_th->th.th_task_state_top >
2667 0) { // Restore task state from memo stack
2668 KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2669 // Remember primary thread's state if we re-use this nested hot team
2670 master_th->th.th_task_state_memo_stack[master_th->th.th_task_state_top] =
2671 master_th->th.th_task_state;
2672 --master_th->th.th_task_state_top; // pop
2673 // Now restore state at this level
2674 master_th->th.th_task_state =
2675 master_th->th
2676 .th_task_state_memo_stack[master_th->th.th_task_state_top];
2677 } else if (team != root->r.r_hot_team) {
2678      // Reset the task state of the primary thread if we are not in a hot team,
2679      // because in this case all the worker threads will be freed and their task
2680      // state reset. If the primary thread's state is not reset as well, the
2681      // task states become inconsistent.
2682 master_th->th.th_task_state = 0;
2683 }
2684 // Copy the task team from the parent team to the primary thread
2685 master_th->th.th_task_team =
2686 parent_team->t.t_task_team[master_th->th.th_task_state];
2687 KA_TRACE(20,
2688 ("__kmp_join_call: Primary T#%d restoring task_team %p, team %p\n",
2689 __kmp_gtid_from_thread(master_th), master_th->th.th_task_team,
2690 parent_team));
2691 }
2692
2693 // TODO: GEH - cannot do this assertion because root thread not set up as
2694 // executing
2695 // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 0 );
2696 master_th->th.th_current_task->td_flags.executing = 1;
2697
2698 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2699
2700#if KMP_AFFINITY_SUPPORTED
2701 if (master_th->th.th_team->t.t_level == 0 && __kmp_affinity.flags.reset) {
2702 __kmp_reset_root_init_mask(gtid);
2703 }
2704#endif
2705#if OMPT_SUPPORT
2706 int flags =
2707 OMPT_INVOKER(fork_context) |
2708 ((team_microtask == (void *)__kmp_teams_master) ? ompt_parallel_league
2709 : ompt_parallel_team);
2710 if (ompt_enabled.enabled) {
2711 __kmp_join_ompt(gtid, master_th, parent_team, parallel_data, flags,
2712 codeptr);
2713 }
2714#endif
2715
2716 KMP_MB();
2717 KA_TRACE(20, ("__kmp_join_call: exit T#%d\n", gtid));
2718}
2719
2720/* Check whether we should push an internal control record onto the
2721 serial team stack. If so, do it. */
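/* Editorial note: the ICV setters below (__kmp_set_num_threads,
   __kmp_set_max_active_levels, __kmp_set_schedule) call this first, so a value
   changed inside a nested serialized region can be restored when the control
   stack is popped at the end of that serialized region. */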
2722void __kmp_save_internal_controls(kmp_info_t *thread) {
2723
2724 if (thread->th.th_team != thread->th.th_serial_team) {
2725 return;
2726 }
2727 if (thread->th.th_team->t.t_serialized > 1) {
2728 int push = 0;
2729
2730 if (thread->th.th_team->t.t_control_stack_top == NULL) {
2731 push = 1;
2732 } else {
2733 if (thread->th.th_team->t.t_control_stack_top->serial_nesting_level !=
2734 thread->th.th_team->t.t_serialized) {
2735 push = 1;
2736 }
2737 }
2738 if (push) { /* push a record on the serial team's stack */
2739 kmp_internal_control_t *control =
2740 (kmp_internal_control_t *)__kmp_allocate(
2741 sizeof(kmp_internal_control_t));
2742
2743 copy_icvs(control, &thread->th.th_current_task->td_icvs);
2744
2745 control->serial_nesting_level = thread->th.th_team->t.t_serialized;
2746
2747 control->next = thread->th.th_team->t.t_control_stack_top;
2748 thread->th.th_team->t.t_control_stack_top = control;
2749 }
2750 }
2751}
2752
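/* Editorial note: this routine backs omp_set_num_threads(), reached through the
   library's entry-point layer. A hedged usage sketch, assuming enough resources
   and no num_threads clause on the next region:

     omp_set_num_threads(3);   // nproc ICV := 3 for this thread
     #pragma omp parallel      // next region starts with (at most) 3 threads
     { ... }

   Besides updating the ICV, the routine may immediately shrink the root's hot
   team (see below) so the released workers become reusable. */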
2753/* Changes set_nproc */
2754void __kmp_set_num_threads(int new_nth, int gtid) {
2755 kmp_info_t *thread;
2756 kmp_root_t *root;
2757
2758 KF_TRACE(10, ("__kmp_set_num_threads: new __kmp_nth = %d\n", new_nth));
2759 KMP_DEBUG_ASSERT(__kmp_init_serial);
2760
2761 if (new_nth < 1)
2762 new_nth = 1;
2763 else if (new_nth > __kmp_max_nth)
2764 new_nth = __kmp_max_nth;
2765
2766 KMP_COUNT_VALUE(OMP_set_numthreads, new_nth);
2767 thread = __kmp_threads[gtid];
2768 if (thread->th.th_current_task->td_icvs.nproc == new_nth)
2769 return; // nothing to do
2770
2771 __kmp_save_internal_controls(thread);
2772
2773 set__nproc(thread, new_nth);
2774
2775 // If this omp_set_num_threads() call will cause the hot team size to be
2776 // reduced (in the absence of a num_threads clause), then reduce it now,
2777 // rather than waiting for the next parallel region.
2778 root = thread->th.th_root;
2779 if (__kmp_init_parallel && (!root->r.r_active) &&
2780 (root->r.r_hot_team->t.t_nproc > new_nth)
2781#if KMP_NESTED_HOT_TEAMS
2782 && __kmp_hot_teams_max_level && !__kmp_hot_teams_mode
2783#endif
2784 ) {
2785 kmp_team_t *hot_team = root->r.r_hot_team;
2786 int f;
2787
2788 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2789
2790 if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
2791 __kmp_resize_dist_barrier(hot_team, hot_team->t.t_nproc, new_nth);
2792 }
2793 // Release the extra threads we don't need any more.
2794 for (f = new_nth; f < hot_team->t.t_nproc; f++) {
2795 KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2796 if (__kmp_tasking_mode != tskm_immediate_exec) {
2797 // When decreasing team size, threads no longer in the team should unref
2798 // task team.
2799 hot_team->t.t_threads[f]->th.th_task_team = NULL;
2800 }
2801 __kmp_free_thread(hot_team->t.t_threads[f]);
2802 hot_team->t.t_threads[f] = NULL;
2803 }
2804 hot_team->t.t_nproc = new_nth;
2805#if KMP_NESTED_HOT_TEAMS
2806 if (thread->th.th_hot_teams) {
2807 KMP_DEBUG_ASSERT(hot_team == thread->th.th_hot_teams[0].hot_team);
2808 thread->th.th_hot_teams[0].hot_team_nth = new_nth;
2809 }
2810#endif
2811
2812 if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
2813 hot_team->t.b->update_num_threads(new_nth);
2814 __kmp_add_threads_to_team(hot_team, new_nth);
2815 }
2816
2817 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2818
2819 // Update the t_nproc field in the threads that are still active.
2820 for (f = 0; f < new_nth; f++) {
2821 KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2822 hot_team->t.t_threads[f]->th.th_team_nproc = new_nth;
2823 }
2824 // Special flag in case omp_set_num_threads() call
2825 hot_team->t.t_size_changed = -1;
2826 }
2827}
2828
2829/* Changes max_active_levels */
2830void __kmp_set_max_active_levels(int gtid, int max_active_levels) {
2831 kmp_info_t *thread;
2832
2833 KF_TRACE(10, ("__kmp_set_max_active_levels: new max_active_levels for thread "
2834 "%d = (%d)\n",
2835 gtid, max_active_levels));
2836 KMP_DEBUG_ASSERT(__kmp_init_serial);
2837
2838 // validate max_active_levels
2839 if (max_active_levels < 0) {
2840 KMP_WARNING(ActiveLevelsNegative, max_active_levels);
2841 // We ignore this call if the user has specified a negative value.
2842 // The current setting won't be changed. The last valid setting will be
2843 // used. A warning will be issued (if warnings are allowed as controlled by
2844 // the KMP_WARNINGS env var).
2845 KF_TRACE(10, ("__kmp_set_max_active_levels: the call is ignored: new "
2846 "max_active_levels for thread %d = (%d)\n",
2847 gtid, max_active_levels));
2848 return;
2849 }
2850 if (max_active_levels <= KMP_MAX_ACTIVE_LEVELS_LIMIT) {
2851 // it's OK, the max_active_levels is within the valid range: [ 0;
2852 // KMP_MAX_ACTIVE_LEVELS_LIMIT ]
2853 // We allow a zero value. (implementation defined behavior)
2854 } else {
2855 KMP_WARNING(ActiveLevelsExceedLimit, max_active_levels,
2856 KMP_MAX_ACTIVE_LEVELS_LIMIT);
2857 max_active_levels = KMP_MAX_ACTIVE_LEVELS_LIMIT;
2858 // Current upper limit is MAX_INT. (implementation defined behavior)
2859 // If the input exceeds the upper limit, we correct the input to be the
2860 // upper limit. (implementation defined behavior)
2861  // In practice, this branch is never reached while the upper limit is MAX_INT.
2862 }
2863 KF_TRACE(10, ("__kmp_set_max_active_levels: after validation: new "
2864 "max_active_levels for thread %d = (%d)\n",
2865 gtid, max_active_levels));
2866
2867 thread = __kmp_threads[gtid];
2868
2869 __kmp_save_internal_controls(thread);
2870
2871 set__max_active_levels(thread, max_active_levels);
2872}
2873
2874/* Gets max_active_levels */
2875int __kmp_get_max_active_levels(int gtid) {
2876 kmp_info_t *thread;
2877
2878 KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d\n", gtid));
2879 KMP_DEBUG_ASSERT(__kmp_init_serial);
2880
2881 thread = __kmp_threads[gtid];
2882 KMP_DEBUG_ASSERT(thread->th.th_current_task);
2883 KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d, curtask=%p, "
2884 "curtask_maxaclevel=%d\n",
2885 gtid, thread->th.th_current_task,
2886 thread->th.th_current_task->td_icvs.max_active_levels));
2887 return thread->th.th_current_task->td_icvs.max_active_levels;
2888}
2889
2890// nteams-var per-device ICV
2891void __kmp_set_num_teams(int num_teams) {
2892 if (num_teams > 0)
2893 __kmp_nteams = num_teams;
2894}
2895int __kmp_get_max_teams(void) { return __kmp_nteams; }
2896// teams-thread-limit-var per-device ICV
2897void __kmp_set_teams_thread_limit(int limit) {
2898 if (limit > 0)
2899 __kmp_teams_thread_limit = limit;
2900}
2901int __kmp_get_teams_thread_limit(void) { return __kmp_teams_thread_limit; }
2902
2903KMP_BUILD_ASSERT(sizeof(kmp_sched_t) == sizeof(int));
2904KMP_BUILD_ASSERT(sizeof(enum sched_type) == sizeof(int));
2905
2906/* Changes def_sched_var ICV values (run-time schedule kind and chunk) */
2907void __kmp_set_schedule(int gtid, kmp_sched_t kind, int chunk) {
2908 kmp_info_t *thread;
2909 kmp_sched_t orig_kind;
2910 // kmp_team_t *team;
2911
2912 KF_TRACE(10, ("__kmp_set_schedule: new schedule for thread %d = (%d, %d)\n",
2913 gtid, (int)kind, chunk));
2914 KMP_DEBUG_ASSERT(__kmp_init_serial);
2915
2916 // Check if the kind parameter is valid, correct if needed.
2917 // Valid parameters should fit in one of two intervals - standard or extended:
2918 // <lower>, <valid>, <upper_std>, <lower_ext>, <valid>, <upper>
2919 // 2008-01-25: 0, 1 - 4, 5, 100, 101 - 102, 103
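  // Editorial gloss: valid kinds lie strictly inside (kmp_sched_lower,
  // kmp_sched_upper_std) -- the standard static/dynamic/guided/auto kinds --
  // or strictly inside (kmp_sched_lower_ext, kmp_sched_upper) for extensions
  // such as trapezoidal. Anything else takes the warning branch below and is
  // replaced by kmp_sched_default with chunk 0.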
2920 orig_kind = kind;
2921 kind = __kmp_sched_without_mods(kind);
2922
2923 if (kind <= kmp_sched_lower || kind >= kmp_sched_upper ||
2924 (kind <= kmp_sched_lower_ext && kind >= kmp_sched_upper_std)) {
2925 // TODO: Hint needs attention in case we change the default schedule.
2926 __kmp_msg(kmp_ms_warning, KMP_MSG(ScheduleKindOutOfRange, kind),
2927 KMP_HNT(DefaultScheduleKindUsed, "static, no chunk"),
2928 __kmp_msg_null);
2929 kind = kmp_sched_default;
2930 chunk = 0; // ignore chunk value in case of bad kind
2931 }
2932
2933 thread = __kmp_threads[gtid];
2934
2935 __kmp_save_internal_controls(thread);
2936
2937 if (kind < kmp_sched_upper_std) {
2938 if (kind == kmp_sched_static && chunk < KMP_DEFAULT_CHUNK) {
2939      // differentiate static chunked vs. unchunked: an invalid chunk indicates
2940      // the unchunked schedule (which is the default)
2941 thread->th.th_current_task->td_icvs.sched.r_sched_type = kmp_sch_static;
2942 } else {
2943 thread->th.th_current_task->td_icvs.sched.r_sched_type =
2944 __kmp_sch_map[kind - kmp_sched_lower - 1];
2945 }
2946 } else {
2947 // __kmp_sch_map[ kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2948 // kmp_sched_lower - 2 ];
2949 thread->th.th_current_task->td_icvs.sched.r_sched_type =
2950 __kmp_sch_map[kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2951 kmp_sched_lower - 2];
2952 }
2953 __kmp_sched_apply_mods_intkind(
2954 orig_kind, &(thread->th.th_current_task->td_icvs.sched.r_sched_type));
2955 if (kind == kmp_sched_auto || chunk < 1) {
2956 // ignore parameter chunk for schedule auto
2957 thread->th.th_current_task->td_icvs.sched.chunk = KMP_DEFAULT_CHUNK;
2958 } else {
2959 thread->th.th_current_task->td_icvs.sched.chunk = chunk;
2960 }
2961}
2962
2963/* Gets def_sched_var ICV values */
2964void __kmp_get_schedule(int gtid, kmp_sched_t *kind, int *chunk) {
2965 kmp_info_t *thread;
2966 enum sched_type th_type;
2967
2968 KF_TRACE(10, ("__kmp_get_schedule: thread %d\n", gtid));
2969 KMP_DEBUG_ASSERT(__kmp_init_serial);
2970
2971 thread = __kmp_threads[gtid];
2972
2973 th_type = thread->th.th_current_task->td_icvs.sched.r_sched_type;
2974 switch (SCHEDULE_WITHOUT_MODIFIERS(th_type)) {
2975 case kmp_sch_static:
2976 case kmp_sch_static_greedy:
2977 case kmp_sch_static_balanced:
2978 *kind = kmp_sched_static;
2979 __kmp_sched_apply_mods_stdkind(kind, th_type);
2980    *chunk = 0; // chunk was not set; signal this with a zero value
2981 return;
2982 case kmp_sch_static_chunked:
2983 *kind = kmp_sched_static;
2984 break;
2985 case kmp_sch_dynamic_chunked:
2986 *kind = kmp_sched_dynamic;
2987 break;
2988  case kmp_sch_guided_chunked:
2989  case kmp_sch_guided_iterative_chunked:
2990 case kmp_sch_guided_analytical_chunked:
2991 *kind = kmp_sched_guided;
2992 break;
2993 case kmp_sch_auto:
2994 *kind = kmp_sched_auto;
2995 break;
2996 case kmp_sch_trapezoidal:
2997 *kind = kmp_sched_trapezoidal;
2998 break;
2999#if KMP_STATIC_STEAL_ENABLED
3000 case kmp_sch_static_steal:
3001 *kind = kmp_sched_static_steal;
3002 break;
3003#endif
3004 default:
3005 KMP_FATAL(UnknownSchedulingType, th_type);
3006 }
3007
3008 __kmp_sched_apply_mods_stdkind(kind, th_type);
3009 *chunk = thread->th.th_current_task->td_icvs.sched.chunk;
3010}
3011
3012int __kmp_get_ancestor_thread_num(int gtid, int level) {
3013
3014 int ii, dd;
3015 kmp_team_t *team;
3016 kmp_info_t *thr;
3017
3018 KF_TRACE(10, ("__kmp_get_ancestor_thread_num: thread %d %d\n", gtid, level));
3019 KMP_DEBUG_ASSERT(__kmp_init_serial);
3020
3021 // validate level
3022 if (level == 0)
3023 return 0;
3024 if (level < 0)
3025 return -1;
3026 thr = __kmp_threads[gtid];
3027 team = thr->th.th_team;
3028 ii = team->t.t_level;
3029 if (level > ii)
3030 return -1;
3031
3032 if (thr->th.th_teams_microtask) {
3033 // AC: we are in teams region where multiple nested teams have same level
3034 int tlevel = thr->th.th_teams_level; // the level of the teams construct
3035 if (level <=
3036 tlevel) { // otherwise usual algorithm works (will not touch the teams)
3037 KMP_DEBUG_ASSERT(ii >= tlevel);
3038 // AC: As we need to pass by the teams league, we need to artificially
3039 // increase ii
3040 if (ii == tlevel) {
3041 ii += 2; // three teams have same level
3042 } else {
3043 ii++; // two teams have same level
3044 }
3045 }
3046 }
3047
3048 if (ii == level)
3049 return __kmp_tid_from_gtid(gtid);
3050
3051 dd = team->t.t_serialized;
3052 level++;
3053 while (ii > level) {
3054 for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
3055 }
3056 if ((team->t.t_serialized) && (!dd)) {
3057 team = team->t.t_parent;
3058 continue;
3059 }
3060 if (ii > level) {
3061 team = team->t.t_parent;
3062 dd = team->t.t_serialized;
3063 ii--;
3064 }
3065 }
3066
3067 return (dd > 1) ? (0) : (team->t.t_master_tid);
3068}
3069
3070int __kmp_get_team_size(int gtid, int level) {
3071
3072 int ii, dd;
3073 kmp_team_t *team;
3074 kmp_info_t *thr;
3075
3076 KF_TRACE(10, ("__kmp_get_team_size: thread %d %d\n", gtid, level));
3077 KMP_DEBUG_ASSERT(__kmp_init_serial);
3078
3079 // validate level
3080 if (level == 0)
3081 return 1;
3082 if (level < 0)
3083 return -1;
3084 thr = __kmp_threads[gtid];
3085 team = thr->th.th_team;
3086 ii = team->t.t_level;
3087 if (level > ii)
3088 return -1;
3089
3090 if (thr->th.th_teams_microtask) {
3091 // AC: we are in teams region where multiple nested teams have same level
3092 int tlevel = thr->th.th_teams_level; // the level of the teams construct
3093 if (level <=
3094 tlevel) { // otherwise usual algorithm works (will not touch the teams)
3095 KMP_DEBUG_ASSERT(ii >= tlevel);
3096 // AC: As we need to pass by the teams league, we need to artificially
3097 // increase ii
3098 if (ii == tlevel) {
3099 ii += 2; // three teams have same level
3100 } else {
3101 ii++; // two teams have same level
3102 }
3103 }
3104 }
3105
3106 while (ii > level) {
3107 for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
3108 }
3109 if (team->t.t_serialized && (!dd)) {
3110 team = team->t.t_parent;
3111 continue;
3112 }
3113 if (ii > level) {
3114 team = team->t.t_parent;
3115 ii--;
3116 }
3117 }
3118
3119 return team->t.t_nproc;
3120}
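// For example (same nesting as sketched above): from the inner region,
// omp_get_team_size(1) reports the outer team's size,
// omp_get_team_size(omp_get_level()) equals omp_get_num_threads(), and
// level 0 always reports a team size of 1, matching the early return above.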
3121
3122kmp_r_sched_t __kmp_get_schedule_global() {
3123 // This routine was created because the pairs (__kmp_sched, __kmp_chunk) and
3124 // (__kmp_static, __kmp_guided) may be changed by kmp_set_defaults
3125 // independently, so the up-to-date schedule can be obtained here.
3126
3127 kmp_r_sched_t r_sched;
3128
3129 // create schedule from 4 globals: __kmp_sched, __kmp_chunk, __kmp_static,
3130 // __kmp_guided. __kmp_sched should keep original value, so that user can set
3131 // KMP_SCHEDULE multiple times, and thus have different run-time schedules in
3132 // different roots (even in OMP 2.5)
3133 enum sched_type s = SCHEDULE_WITHOUT_MODIFIERS(__kmp_sched);
3134 enum sched_type sched_modifiers = SCHEDULE_GET_MODIFIERS(__kmp_sched);
3135 if (s == kmp_sch_static) {
3136 // replace STATIC with more detailed schedule (balanced or greedy)
3137 r_sched.r_sched_type = __kmp_static;
3138 } else if (s == kmp_sch_guided_chunked) {
3139 // replace GUIDED with more detailed schedule (iterative or analytical)
3140 r_sched.r_sched_type = __kmp_guided;
3141 } else { // (STATIC_CHUNKED), or (DYNAMIC_CHUNKED), or other
3142 r_sched.r_sched_type = __kmp_sched;
3143 }
3144 SCHEDULE_SET_MODIFIERS(r_sched.r_sched_type, sched_modifiers);
3145
3146 if (__kmp_chunk < KMP_DEFAULT_CHUNK) {
3147 // __kmp_chunk may be wrong here (if it was not ever set)
3148 r_sched.chunk = KMP_DEFAULT_CHUNK;
3149 } else {
3150 r_sched.chunk = __kmp_chunk;
3151 }
3152
3153 return r_sched;
3154}
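// Rough example (assuming KMP_SCHEDULE is parsed as in kmp_settings.cpp):
// with KMP_SCHEDULE="static,greedy" setting __kmp_static to
// kmp_sch_static_greedy and OMP_SCHEDULE="static" leaving no chunk, this
// routine returns r_sched_type == kmp_sch_static_greedy and
// r_sched.chunk == KMP_DEFAULT_CHUNK.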
3155
3156/* Allocate (realloc == FALSE) or reallocate (realloc == TRUE)
3157   at least argc *t_argv entries for the requested team. */
3158static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team, int realloc) {
3159
3160 KMP_DEBUG_ASSERT(team);
3161 if (!realloc || argc > team->t.t_max_argc) {
3162
3163 KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: needed entries=%d, "
3164 "current entries=%d\n",
3165 team->t.t_id, argc, (realloc) ? team->t.t_max_argc : 0));
3166 /* if previously allocated heap space for args, free them */
3167 if (realloc && team->t.t_argv != &team->t.t_inline_argv[0])
3168 __kmp_free((void *)team->t.t_argv);
3169
3170 if (argc <= KMP_INLINE_ARGV_ENTRIES) {
3171 /* use unused space in the cache line for arguments */
3172 team->t.t_max_argc = KMP_INLINE_ARGV_ENTRIES;
3173 KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: inline allocate %d "
3174 "argv entries\n",
3175 team->t.t_id, team->t.t_max_argc));
3176 team->t.t_argv = &team->t.t_inline_argv[0];
3177 if (__kmp_storage_map) {
3178 __kmp_print_storage_map_gtid(
3179 -1, &team->t.t_inline_argv[0],
3180 &team->t.t_inline_argv[KMP_INLINE_ARGV_ENTRIES],
3181 (sizeof(void *) * KMP_INLINE_ARGV_ENTRIES), "team_%d.t_inline_argv",
3182 team->t.t_id);
3183 }
3184 } else {
3185 /* allocate space for arguments in the heap */
3186 team->t.t_max_argc = (argc <= (KMP_MIN_MALLOC_ARGV_ENTRIES >> 1))
3187 ? KMP_MIN_MALLOC_ARGV_ENTRIES
3188 : 2 * argc;
3189 KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: dynamic allocate %d "
3190 "argv entries\n",
3191 team->t.t_id, team->t.t_max_argc));
3192 team->t.t_argv =
3193 (void **)__kmp_page_allocate(sizeof(void *) * team->t.t_max_argc);
3194 if (__kmp_storage_map) {
3195 __kmp_print_storage_map_gtid(-1, &team->t.t_argv[0],
3196 &team->t.t_argv[team->t.t_max_argc],
3197 sizeof(void *) * team->t.t_max_argc,
3198 "team_%d.t_argv", team->t.t_id);
3199 }
3200 }
3201 }
3202}
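// Sizing sketch: for argc <= KMP_INLINE_ARGV_ENTRIES the slots embedded in the
// team structure are used and no heap allocation happens; beyond that the heap
// array holds at least KMP_MIN_MALLOC_ARGV_ENTRIES entries, or 2 * argc for
// larger requests. E.g. argc == 3 * KMP_MIN_MALLOC_ARGV_ENTRIES would allocate
// 6 * KMP_MIN_MALLOC_ARGV_ENTRIES entries, leaving headroom so a slightly
// larger argc on the next fork does not force an immediate reallocation.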
3203
3204static void __kmp_allocate_team_arrays(kmp_team_t *team, int max_nth) {
3205 int i;
3206 int num_disp_buff = max_nth > 1 ? __kmp_dispatch_num_buffers : 2;
3207 team->t.t_threads =
3208 (kmp_info_t **)__kmp_allocate(sizeof(kmp_info_t *) * max_nth);
3209 team->t.t_disp_buffer = (dispatch_shared_info_t *)__kmp_allocate(
3210 sizeof(dispatch_shared_info_t) * num_disp_buff);
3211 team->t.t_dispatch =
3212 (kmp_disp_t *)__kmp_allocate(sizeof(kmp_disp_t) * max_nth);
3213 team->t.t_implicit_task_taskdata =
3214 (kmp_taskdata_t *)__kmp_allocate(sizeof(kmp_taskdata_t) * max_nth);
3215 team->t.t_max_nproc = max_nth;
3216
3217 /* setup dispatch buffers */
3218 for (i = 0; i < num_disp_buff; ++i) {
3219 team->t.t_disp_buffer[i].buffer_index = i;
3220 team->t.t_disp_buffer[i].doacross_buf_idx = i;
3221 }
3222}
3223
3224static void __kmp_free_team_arrays(kmp_team_t *team) {
3225 /* Note: this does not free the threads in t_threads (__kmp_free_threads) */
3226 int i;
3227 for (i = 0; i < team->t.t_max_nproc; ++i) {
3228 if (team->t.t_dispatch[i].th_disp_buffer != NULL) {
3229 __kmp_free(team->t.t_dispatch[i].th_disp_buffer);
3230 team->t.t_dispatch[i].th_disp_buffer = NULL;
3231 }
3232 }
3233#if KMP_USE_HIER_SCHED
3234 __kmp_dispatch_free_hierarchies(team);
3235#endif
3236 __kmp_free(team->t.t_threads);
3237 __kmp_free(team->t.t_disp_buffer);
3238 __kmp_free(team->t.t_dispatch);
3239 __kmp_free(team->t.t_implicit_task_taskdata);
3240 team->t.t_threads = NULL;
3241 team->t.t_disp_buffer = NULL;
3242 team->t.t_dispatch = NULL;
3243 team->t.t_implicit_task_taskdata = 0;
3244}
3245
3246static void __kmp_reallocate_team_arrays(kmp_team_t *team, int max_nth) {
3247 kmp_info_t **oldThreads = team->t.t_threads;
3248
3249 __kmp_free(team->t.t_disp_buffer);
3250 __kmp_free(team->t.t_dispatch);
3251 __kmp_free(team->t.t_implicit_task_taskdata);
3252 __kmp_allocate_team_arrays(team, max_nth);
3253
3254 KMP_MEMCPY(team->t.t_threads, oldThreads,
3255 team->t.t_nproc * sizeof(kmp_info_t *));
3256
3257 __kmp_free(oldThreads);
3258}
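// Note that only the t_threads pointers are copied above; t_disp_buffer,
// t_dispatch and t_implicit_task_taskdata come back freshly allocated from
// __kmp_allocate_team_arrays and presumably get re-initialized by the callers
// that grow a team.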
3259
3260static kmp_internal_control_t __kmp_get_global_icvs(void) {
3261
3262 kmp_r_sched_t r_sched =
3263 __kmp_get_schedule_global(); // get current state of scheduling globals
3264
3265 KMP_DEBUG_ASSERT(__kmp_nested_proc_bind.used > 0);
3266
3267 kmp_internal_control_t g_icvs = {
3268 0, // int serial_nesting_level; //corresponds to value of th_team_serialized
3269 (kmp_int8)__kmp_global.g.g_dynamic, // internal control for dynamic
3270 // adjustment of threads (per thread)
3271 (kmp_int8)__kmp_env_blocktime, // int bt_set; //internal control for
3272 // whether blocktime is explicitly set
3273 __kmp_dflt_blocktime, // int blocktime; //internal control for blocktime
3274#if KMP_USE_MONITOR
3275 __kmp_bt_intervals, // int bt_intervals; //internal control for blocktime
3276// intervals
3277#endif
3278 __kmp_dflt_team_nth, // int nproc; //internal control for # of threads for
3279 // next parallel region (per thread)
3280 // (use a max ub on value if __kmp_parallel_initialize not called yet)
3281 __kmp_cg_max_nth, // int thread_limit;
3282 __kmp_dflt_max_active_levels, // int max_active_levels; //internal control
3283 // for max_active_levels
3284 r_sched, // kmp_r_sched_t sched; //internal control for runtime schedule
3285 // {sched,chunk} pair
3286 __kmp_nested_proc_bind.bind_types[0],
3287 __kmp_default_device,
3288 NULL // struct kmp_internal_control *next;
3289 };
3290
3291 return g_icvs;
3292}
3293
3294static kmp_internal_control_t __kmp_get_x_global_icvs(const kmp_team_t *team) {
3295
3296 kmp_internal_control_t gx_icvs;
3297 gx_icvs.serial_nesting_level =
3298 0; // probably =team->t.t_serial like in save_inter_controls
3299 copy_icvs(&gx_icvs, &team->t.t_threads[0]->th.th_current_task->td_icvs);
3300 gx_icvs.next = NULL;
3301
3302 return gx_icvs;
3303}
3304
3305static void __kmp_initialize_root(kmp_root_t *root) {
3306 int f;
3307 kmp_team_t *root_team;
3308 kmp_team_t *hot_team;
3309 int hot_team_max_nth;
3310 kmp_r_sched_t r_sched =
3311 __kmp_get_schedule_global(); // get current state of scheduling globals
3312 kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3313 KMP_DEBUG_ASSERT(root);
3314 KMP_ASSERT(!root->r.r_begin);
3315
3316 /* setup the root state structure */
3317 __kmp_init_lock(&root->r.r_begin_lock);
3318 root->r.r_begin = FALSE;
3319 root->r.r_active = FALSE;
3320 root->r.r_in_parallel = 0;
3321 root->r.r_blocktime = __kmp_dflt_blocktime;
3322#if KMP_AFFINITY_SUPPORTED
3323 root->r.r_affinity_assigned = FALSE;
3324#endif
3325
3326 /* setup the root team for this task */
3327 /* allocate the root team structure */
3328 KF_TRACE(10, ("__kmp_initialize_root: before root_team\n"));
3329
3330 root_team =
3331 __kmp_allocate_team(root,
3332 1, // new_nproc
3333 1, // max_nproc
3334#if OMPT_SUPPORT
3335 ompt_data_none, // root parallel id
3336#endif
3337 __kmp_nested_proc_bind.bind_types[0], &r_icvs,
3338 0 // argc
3339 USE_NESTED_HOT_ARG(NULL) // primary thread is unknown
3340 );
3341#if USE_DEBUGGER
3342 // Non-NULL value should be assigned to make the debugger display the root
3343 // team.
3344 TCW_SYNC_PTR(root_team->t.t_pkfn, (microtask_t)(~0));
3345#endif
3346
3347 KF_TRACE(10, ("__kmp_initialize_root: after root_team = %p\n", root_team));
3348
3349 root->r.r_root_team = root_team;
3350 root_team->t.t_control_stack_top = NULL;
3351
3352 /* initialize root team */
3353 root_team->t.t_threads[0] = NULL;
3354 root_team->t.t_nproc = 1;
3355 root_team->t.t_serialized = 1;
3356 // TODO???: root_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3357 root_team->t.t_sched.sched = r_sched.sched;
3358 KA_TRACE(
3359 20,
3360 ("__kmp_initialize_root: init root team %d arrived: join=%u, plain=%u\n",
3361 root_team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
3362
3363 /* setup the hot team for this task */
3364 /* allocate the hot team structure */
3365 KF_TRACE(10, ("__kmp_initialize_root: before hot_team\n"));
3366
3367 hot_team =
3368 __kmp_allocate_team(root,
3369 1, // new_nproc
3370 __kmp_dflt_team_nth_ub * 2, // max_nproc
3371#if OMPT_SUPPORT
3372 ompt_data_none, // root parallel id
3373#endif
3374 __kmp_nested_proc_bind.bind_types[0], &r_icvs,
3375 0 // argc
3376 USE_NESTED_HOT_ARG(NULL) // primary thread is unknown
3377 );
3378 KF_TRACE(10, ("__kmp_initialize_root: after hot_team = %p\n", hot_team));
3379
3380 root->r.r_hot_team = hot_team;
3381 root_team->t.t_control_stack_top = NULL;
3382
3383 /* first-time initialization */
3384 hot_team->t.t_parent = root_team;
3385
3386 /* initialize hot team */
3387 hot_team_max_nth = hot_team->t.t_max_nproc;
3388 for (f = 0; f < hot_team_max_nth; ++f) {
3389 hot_team->t.t_threads[f] = NULL;
3390 }
3391 hot_team->t.t_nproc = 1;
3392 // TODO???: hot_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3393 hot_team->t.t_sched.sched = r_sched.sched;
3394 hot_team->t.t_size_changed = 0;
3395}
3396
3397#ifdef KMP_DEBUG
3398
3399typedef struct kmp_team_list_item {
3400 kmp_team_p const *entry;
3401 struct kmp_team_list_item *next;
3402} kmp_team_list_item_t;
3403typedef kmp_team_list_item_t *kmp_team_list_t;
3404
3405static void __kmp_print_structure_team_accum( // Add team to list of teams.
3406 kmp_team_list_t list, // List of teams.
3407 kmp_team_p const *team // Team to add.
3408) {
3409
3410 // List must terminate with item where both entry and next are NULL.
3411 // Team is added to the list only once.
3412 // List is sorted in ascending order by team id.
3413 // Team id is *not* a key.
3414
3415 kmp_team_list_t l;
3416
3417 KMP_DEBUG_ASSERT(list != NULL);
3418 if (team == NULL) {
3419 return;
3420 }
3421
3422 __kmp_print_structure_team_accum(list, team->t.t_parent);
3423 __kmp_print_structure_team_accum(list, team->t.t_next_pool);
3424
3425 // Search list for the team.
3426 l = list;
3427 while (l->next != NULL && l->entry != team) {
3428 l = l->next;
3429 }
3430 if (l->next != NULL) {
3431 return; // Team has been added before, exit.
3432 }
3433
3434 // Team is not found. Search list again for insertion point.
3435 l = list;
3436 while (l->next != NULL && l->entry->t.t_id <= team->t.t_id) {
3437 l = l->next;
3438 }
3439
3440 // Insert team.
3441 {
3442 kmp_team_list_item_t *item = (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(
3443 sizeof(kmp_team_list_item_t));
3444 *item = *l;
3445 l->entry = team;
3446 l->next = item;
3447 }
3448}
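// Insertion sketch: the list ends with a sentinel item whose entry and next
// are both NULL, so inserting before node l amounts to copying l into a new
// item and overwriting l in place:
//   *item = *l;        // old contents (and tail) move into the new node
//   l->entry = team;   // l now holds the new team
//   l->next = item;    // and links to the displaced contents
// This keeps the list sorted by t_id without tracking a previous pointer.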
3449
3450static void __kmp_print_structure_team(char const *title,
3451                                       kmp_team_p const *team) {
3453 __kmp_printf("%s", title);
3454 if (team != NULL) {
3455 __kmp_printf("%2x %p\n", team->t.t_id, team);
3456 } else {
3457 __kmp_printf(" - (nil)\n");
3458 }
3459}
3460
3461static void __kmp_print_structure_thread(char const *title,
3462 kmp_info_p const *thread) {
3463 __kmp_printf("%s", title);
3464 if (thread != NULL) {
3465 __kmp_printf("%2d %p\n", thread->th.th_info.ds.ds_gtid, thread);
3466 } else {
3467 __kmp_printf(" - (nil)\n");
3468 }
3469}
3470
3471void __kmp_print_structure(void) {
3472
3473 kmp_team_list_t list;
3474
3475 // Initialize list of teams.
3476 list =
3477 (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(sizeof(kmp_team_list_item_t));
3478 list->entry = NULL;
3479 list->next = NULL;
3480
3481 __kmp_printf("\n------------------------------\nGlobal Thread "
3482 "Table\n------------------------------\n");
3483 {
3484 int gtid;
3485 for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3486 __kmp_printf("%2d", gtid);
3487 if (__kmp_threads != NULL) {
3488 __kmp_printf(" %p", __kmp_threads[gtid]);
3489 }
3490 if (__kmp_root != NULL) {
3491 __kmp_printf(" %p", __kmp_root[gtid]);
3492 }
3493 __kmp_printf("\n");
3494 }
3495 }
3496
3497 // Print out __kmp_threads array.
3498 __kmp_printf("\n------------------------------\nThreads\n--------------------"
3499 "----------\n");
3500 if (__kmp_threads != NULL) {
3501 int gtid;
3502 for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3503 kmp_info_t const *thread = __kmp_threads[gtid];
3504 if (thread != NULL) {
3505 __kmp_printf("GTID %2d %p:\n", gtid, thread);
3506 __kmp_printf(" Our Root: %p\n", thread->th.th_root);
3507 __kmp_print_structure_team(" Our Team: ", thread->th.th_team);
3508 __kmp_print_structure_team(" Serial Team: ",
3509 thread->th.th_serial_team);
3510 __kmp_printf(" Threads: %2d\n", thread->th.th_team_nproc);
3511 __kmp_print_structure_thread(" Primary: ",
3512 thread->th.th_team_master);
3513 __kmp_printf(" Serialized?: %2d\n", thread->th.th_team_serialized);
3514 __kmp_printf(" Set NProc: %2d\n", thread->th.th_set_nproc);
3515 __kmp_printf(" Set Proc Bind: %2d\n", thread->th.th_set_proc_bind);
3516 __kmp_print_structure_thread(" Next in pool: ",
3517 thread->th.th_next_pool);
3518 __kmp_printf("\n");
3519 __kmp_print_structure_team_accum(list, thread->th.th_team);
3520 __kmp_print_structure_team_accum(list, thread->th.th_serial_team);
3521 }
3522 }
3523 } else {
3524 __kmp_printf("Threads array is not allocated.\n");
3525 }
3526
3527 // Print out __kmp_root array.
3528 __kmp_printf("\n------------------------------\nUbers\n----------------------"
3529 "--------\n");
3530 if (__kmp_root != NULL) {
3531 int gtid;
3532 for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3533 kmp_root_t const *root = __kmp_root[gtid];
3534 if (root != NULL) {
3535 __kmp_printf("GTID %2d %p:\n", gtid, root);
3536 __kmp_print_structure_team(" Root Team: ", root->r.r_root_team);
3537 __kmp_print_structure_team(" Hot Team: ", root->r.r_hot_team);
3538 __kmp_print_structure_thread(" Uber Thread: ",
3539 root->r.r_uber_thread);
3540 __kmp_printf(" Active?: %2d\n", root->r.r_active);
3541 __kmp_printf(" In Parallel: %2d\n",
3542 KMP_ATOMIC_LD_RLX(&root->r.r_in_parallel));
3543 __kmp_printf("\n");
3544 __kmp_print_structure_team_accum(list, root->r.r_root_team);
3545 __kmp_print_structure_team_accum(list, root->r.r_hot_team);
3546 }
3547 }
3548 } else {
3549 __kmp_printf("Ubers array is not allocated.\n");
3550 }
3551
3552 __kmp_printf("\n------------------------------\nTeams\n----------------------"
3553 "--------\n");
3554 while (list->next != NULL) {
3555 kmp_team_p const *team = list->entry;
3556 int i;
3557 __kmp_printf("Team %2x %p:\n", team->t.t_id, team);
3558 __kmp_print_structure_team(" Parent Team: ", team->t.t_parent);
3559 __kmp_printf(" Primary TID: %2d\n", team->t.t_master_tid);
3560 __kmp_printf(" Max threads: %2d\n", team->t.t_max_nproc);
3561 __kmp_printf(" Levels of serial: %2d\n", team->t.t_serialized);
3562 __kmp_printf(" Number threads: %2d\n", team->t.t_nproc);
3563 for (i = 0; i < team->t.t_nproc; ++i) {
3564 __kmp_printf(" Thread %2d: ", i);
3565 __kmp_print_structure_thread("", team->t.t_threads[i]);
3566 }
3567 __kmp_print_structure_team(" Next in pool: ", team->t.t_next_pool);
3568 __kmp_printf("\n");
3569 list = list->next;
3570 }
3571
3572 // Print out __kmp_thread_pool and __kmp_team_pool.
3573 __kmp_printf("\n------------------------------\nPools\n----------------------"
3574 "--------\n");
3575 __kmp_print_structure_thread("Thread pool: ",
3576 CCAST(kmp_info_t *, __kmp_thread_pool));
3577 __kmp_print_structure_team("Team pool: ",
3578 CCAST(kmp_team_t *, __kmp_team_pool));
3579 __kmp_printf("\n");
3580
3581 // Free team list.
3582 while (list != NULL) {
3583 kmp_team_list_item_t *item = list;
3584 list = list->next;
3585 KMP_INTERNAL_FREE(item);
3586 }
3587}
3588
3589#endif
3590
3591//---------------------------------------------------------------------------
3592// Stuff for per-thread fast random number generator
3593// Table of primes
3594static const unsigned __kmp_primes[] = {
3595 0x9e3779b1, 0xffe6cc59, 0x2109f6dd, 0x43977ab5, 0xba5703f5, 0xb495a877,
3596 0xe1626741, 0x79695e6b, 0xbc98c09f, 0xd5bee2b3, 0x287488f9, 0x3af18231,
3597 0x9677cd4d, 0xbe3a6929, 0xadc6a877, 0xdcf0674b, 0xbe4d6fe9, 0x5f15e201,
3598 0x99afc3fd, 0xf3f16801, 0xe222cfff, 0x24ba5fdb, 0x0620452d, 0x79f149e3,
3599 0xc8b93f49, 0x972702cd, 0xb07dd827, 0x6c97d5ed, 0x085a3d61, 0x46eb5ea7,
3600 0x3d9910ed, 0x2e687b5b, 0x29609227, 0x6eb081f1, 0x0954c4e1, 0x9d114db9,
3601 0x542acfa9, 0xb3e6bd7b, 0x0742d917, 0xe9f3ffa7, 0x54581edb, 0xf2480f45,
3602 0x0bb9288f, 0xef1affc7, 0x85fa0ca7, 0x3ccc14db, 0xe6baf34b, 0x343377f7,
3603 0x5ca19031, 0xe6d9293b, 0xf0a9f391, 0x5d2e980b, 0xfc411073, 0xc3749363,
3604 0xb892d829, 0x3549366b, 0x629750ad, 0xb98294e5, 0x892d9483, 0xc235baf3,
3605 0x3d2402a3, 0x6bdef3c9, 0xbec333cd, 0x40c9520f};
3606
3607//---------------------------------------------------------------------------
3608// __kmp_get_random: Get a random number using a linear congruential method.
3609unsigned short __kmp_get_random(kmp_info_t *thread) {
3610 unsigned x = thread->th.th_x;
3611 unsigned short r = (unsigned short)(x >> 16);
3612
3613 thread->th.th_x = x * thread->th.th_a + 1;
3614
3615 KA_TRACE(30, ("__kmp_get_random: THREAD: %d, RETURN: %u\n",
3616 thread->th.th_info.ds.ds_tid, r));
3617
3618 return r;
3619}
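// In other words, a per-thread linear congruential generator of the form
//   x_{n+1} = a * x_n + 1 (mod 2^32 for the usual 32-bit unsigned),
// with a drawn from __kmp_primes, returning the upper 16 bits of the state
// before it is advanced. It is only meant to be a cheap source of
// pseudo-randomness (e.g. for picking steal victims), not a statistically
// strong generator.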
3620//--------------------------------------------------------
3621// __kmp_init_random: Initialize a random number generator
3622void __kmp_init_random(kmp_info_t *thread) {
3623 unsigned seed = thread->th.th_info.ds.ds_tid;
3624
3625 thread->th.th_a =
3626 __kmp_primes[seed % (sizeof(__kmp_primes) / sizeof(__kmp_primes[0]))];
3627 thread->th.th_x = (seed + 1) * thread->th.th_a + 1;
3628 KA_TRACE(30,
3629 ("__kmp_init_random: THREAD: %u; A: %u\n", seed, thread->th.th_a));
3630}
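// Seeding sketch: the tid picks the multiplier, so e.g. tid 0 gets
// a = 0x9e3779b1 and starts from x = a + 1, tid 1 gets a = 0xffe6cc59, and so
// on; threads whose tids differ by the table size (64) share a multiplier but
// still start from different states.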
3631
3632#if KMP_OS_WINDOWS
3633/* reclaim array entries for root threads that are already dead, returns number
3634 * reclaimed */
3635static int __kmp_reclaim_dead_roots(void) {
3636 int i, r = 0;
3637
3638 for (i = 0; i < __kmp_threads_capacity; ++i) {
3639 if (KMP_UBER_GTID(i) &&
3640 !__kmp_still_running((kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[i])) &&
3641 !__kmp_root[i]
3642 ->r.r_active) { // AC: reclaim only roots died in non-active state
3643 r += __kmp_unregister_root_other_thread(i);
3644 }
3645 }
3646 return r;
3647}
3648#endif
3649
3650/* This function attempts to create free entries in __kmp_threads and
3651 __kmp_root, and returns the number of free entries generated.
3652
3653 For Windows* OS static library, the first mechanism used is to reclaim array
3654 entries for root threads that are already dead.
3655
3656 On all platforms, expansion is attempted on the arrays __kmp_threads and
3657 __kmp_root, with appropriate update to __kmp_threads_capacity. Array
3658 capacity is increased by doubling with clipping to __kmp_tp_capacity, if
3659 threadprivate cache array has been created. Synchronization with
3660 __kmpc_threadprivate_cached is done using __kmp_tp_cached_lock.
3661
3662 After any dead root reclamation, if the clipping value allows array expansion
3663 to result in the generation of a total of nNeed free slots, the function does
3664 that expansion. If not, nothing is done beyond the possible initial root
3665 thread reclamation.
3666
3667 If any argument is negative, the behavior is undefined. */
3668static int __kmp_expand_threads(int nNeed) {
3669 int added = 0;
3670 int minimumRequiredCapacity;
3671 int newCapacity;
3672 kmp_info_t **newThreads;
3673 kmp_root_t **newRoot;
3674
3675 // All calls to __kmp_expand_threads should be under __kmp_forkjoin_lock, so
3676 // resizing __kmp_threads does not need additional protection if foreign
3677 // threads are present
3678
3679#if KMP_OS_WINDOWS && !KMP_DYNAMIC_LIB
3680 /* only for Windows static library */
3681 /* reclaim array entries for root threads that are already dead */
3682 added = __kmp_reclaim_dead_roots();
3683
3684 if (nNeed) {
3685 nNeed -= added;
3686 if (nNeed < 0)
3687 nNeed = 0;
3688 }
3689#endif
3690 if (nNeed <= 0)
3691 return added;
3692
3693 // Note that __kmp_threads_capacity is not bounded by __kmp_max_nth. If
3694 // __kmp_max_nth is set to some value less than __kmp_sys_max_nth by the
3695 // user via KMP_DEVICE_THREAD_LIMIT, then __kmp_threads_capacity may become
3696 // > __kmp_max_nth in one of two ways:
3697 //
3698 // 1) The initialization thread (gtid = 0) exits. __kmp_threads[0]
3699 // may not be reused by another thread, so we may need to increase
3700 // __kmp_threads_capacity to __kmp_max_nth + 1.
3701 //
3702 // 2) New foreign root(s) are encountered. We always register new foreign
3703 // roots. This may cause a smaller # of threads to be allocated at
3704 // subsequent parallel regions, but the worker threads hang around (and
3705 // eventually go to sleep) and need slots in the __kmp_threads[] array.
3706 //
3707 // Anyway, that is the reason for moving the check to see if
3708 // __kmp_max_nth was exceeded into __kmp_reserve_threads()
3709 // instead of having it performed here. -BB
3710
3711 KMP_DEBUG_ASSERT(__kmp_sys_max_nth >= __kmp_threads_capacity);
3712
3713 /* compute expansion headroom to check if we can expand */
3714 if (__kmp_sys_max_nth - __kmp_threads_capacity < nNeed) {
3715 /* possible expansion too small -- give up */
3716 return added;
3717 }
3718 minimumRequiredCapacity = __kmp_threads_capacity + nNeed;
3719
3720 newCapacity = __kmp_threads_capacity;
3721 do {
3722 newCapacity = newCapacity <= (__kmp_sys_max_nth >> 1) ? (newCapacity << 1)
3723 : __kmp_sys_max_nth;
3724 } while (newCapacity < minimumRequiredCapacity);
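  // Doubling sketch: e.g. with __kmp_threads_capacity == 64 and nNeed == 70,
  // minimumRequiredCapacity is 134 and newCapacity grows 64 -> 128 -> 256
  // (assuming __kmp_sys_max_nth is large enough not to clip); if clipping does
  // kick in, the headroom check above already guaranteed that __kmp_sys_max_nth
  // itself covers the request.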
3725 newThreads = (kmp_info_t **)__kmp_allocate(
3726 (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * newCapacity + CACHE_LINE);
3727 newRoot =
3728 (kmp_root_t **)((char *)newThreads + sizeof(kmp_info_t *) * newCapacity);
3729 KMP_MEMCPY(newThreads, __kmp_threads,
3730 __kmp_threads_capacity * sizeof(kmp_info_t *));
3731 KMP_MEMCPY(newRoot, __kmp_root,
3732 __kmp_threads_capacity * sizeof(kmp_root_t *));
3733 // Put the old __kmp_threads array on a list. Any ongoing references to the
3734 // old array remain valid. This list is cleaned up at library shutdown.
3735 kmp_old_threads_list_t *node =
3736 (kmp_old_threads_list_t *)__kmp_allocate(sizeof(kmp_old_threads_list_t));
3737 node->threads = __kmp_threads;
3738 node->next = __kmp_old_threads_list;
3739 __kmp_old_threads_list = node;
3740
3741 *(kmp_info_t * *volatile *)&__kmp_threads = newThreads;
3742 *(kmp_root_t * *volatile *)&__kmp_root = newRoot;
3743 added += newCapacity - __kmp_threads_capacity;
3744 *(volatile int *)&__kmp_threads_capacity = newCapacity;
3745
3746 if (newCapacity > __kmp_tp_capacity) {
3747 __kmp_acquire_bootstrap_lock(&__kmp_tp_cached_lock);
3748 if (__kmp_tp_cached && newCapacity > __kmp_tp_capacity) {
3749 __kmp_threadprivate_resize_cache(newCapacity);
3750 } else { // increase __kmp_tp_capacity to correspond with kmp_threads size
3751 *(volatile int *)&__kmp_tp_capacity = newCapacity;
3752 }
3753 __kmp_release_bootstrap_lock(&__kmp_tp_cached_lock);
3754 }
3755
3756 return added;
3757}
3758
3759/* Register the current thread as a root thread and obtain our gtid. We must
3760 have the __kmp_initz_lock held at this point. The argument is TRUE only if
3761 we are the thread that calls from __kmp_do_serial_initialize(). */
3762int __kmp_register_root(int initial_thread) {
3763 kmp_info_t *root_thread;
3764 kmp_root_t *root;
3765 int gtid;
3766 int capacity;
3767 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
3768 KA_TRACE(20, ("__kmp_register_root: entered\n"));
3769 KMP_MB();
3770
3771 /* 2007-03-02:
3772 If the initial thread has not invoked the OpenMP RTL yet, and this thread
3773 is not the initial one, the "__kmp_all_nth >= __kmp_threads_capacity"
3774 condition does not work as expected -- it may return false (meaning there
3775 is at least one empty slot in the __kmp_threads array), yet the only free
3776 slot may be #0, which is reserved for the initial thread and so cannot be
3777 used for this one. The following code works around this problem.
3778
3779 However, the right solution seems to be not to reserve slot #0 for the
3780 initial thread, because:
3781 (1) there is no magic in slot #0,
3782 (2) we cannot detect the initial thread reliably (the first thread that
3783 performs serial initialization may not be the real initial thread).
3784 */
3785 capacity = __kmp_threads_capacity;
3786 if (!initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) {
3787 --capacity;
3788 }
3789
3790 // If we are not initializing the hidden helper team, we need to take
3791 // __kmp_hidden_helper_threads_num out of the capacity because it is included
3792 // in __kmp_threads_capacity.
3793 if (__kmp_enable_hidden_helper && !TCR_4(__kmp_init_hidden_helper_threads)) {
3794 capacity -= __kmp_hidden_helper_threads_num;
3795 }
3796
3797 /* see if there are too many threads */
3798 if (__kmp_all_nth >= capacity && !__kmp_expand_threads(1)) {
3799 if (__kmp_tp_cached) {
3800 __kmp_fatal(KMP_MSG(CantRegisterNewThread),
3801 KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
3802 KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
3803 } else {
3804 __kmp_fatal(KMP_MSG(CantRegisterNewThread), KMP_HNT(SystemLimitOnThreads),
3805 __kmp_msg_null);
3806 }
3807 }
3808
3809 // When hidden helper task is enabled, __kmp_threads is organized as follows:
3810 // 0: initial thread, also a regular OpenMP thread.
3811 // [1, __kmp_hidden_helper_threads_num]: slots for hidden helper threads.
3812 // [__kmp_hidden_helper_threads_num + 1, __kmp_threads_capacity): slots for
3813 // regular OpenMP threads.
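  // For example, assuming the usual default of 8 hidden helper threads, the
  // layout is: slot 0 = initial thread, slots 1..8 = hidden helper threads,
  // slots 9 and up = regular OpenMP threads and foreign roots.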
3814 if (TCR_4(__kmp_init_hidden_helper_threads)) {
3815 // Find an available thread slot for a hidden helper thread. Slots for hidden
3816 // helper threads range from 1 to __kmp_hidden_helper_threads_num.
3817 for (gtid = 1; TCR_PTR(__kmp_threads[gtid]) != NULL &&
3818 gtid <= __kmp_hidden_helper_threads_num;
3819 gtid++)
3820 ;
3821 KMP_ASSERT(gtid <= __kmp_hidden_helper_threads_num);
3822 KA_TRACE(1, ("__kmp_register_root: found slot in threads array for "
3823 "hidden helper thread: T#%d\n",
3824 gtid));
3825 } else {
3826 /* find an available thread slot */
3827 // Don't reassign the zero slot since we need that to only be used by
3828 // initial thread. Slots for hidden helper threads should also be skipped.
3829 if (initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) {
3830 gtid = 0;
3831 } else {
3832 for (gtid = __kmp_hidden_helper_threads_num + 1;
3833 TCR_PTR(__kmp_threads[gtid]) != NULL; gtid++)
3834 ;
3835 }
3836 KA_TRACE(
3837 1, ("__kmp_register_root: found slot in threads array: T#%d\n", gtid));
3838 KMP_ASSERT(gtid < __kmp_threads_capacity);
3839 }
3840
3841 /* update global accounting */
3842 __kmp_all_nth++;
3843 TCW_4(__kmp_nth, __kmp_nth + 1);
3844
3845 // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
3846 // numbers of procs, and method #2 (keyed API call) for higher numbers.
3847 if (__kmp_adjust_gtid_mode) {
3848 if (__kmp_all_nth >= __kmp_tls_gtid_min) {
3849 if (TCR_4(__kmp_gtid_mode) != 2) {
3850 TCW_4(__kmp_gtid_mode, 2);
3851 }
3852 } else {
3853 if (TCR_4(__kmp_gtid_mode) != 1) {
3854 TCW_4(__kmp_gtid_mode, 1);
3855 }
3856 }
3857 }
3858
3859#ifdef KMP_ADJUST_BLOCKTIME
3860 /* Adjust blocktime to zero if necessary */
3861 /* Middle initialization might not have occurred yet */
3862 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
3863 if (__kmp_nth > __kmp_avail_proc) {
3864 __kmp_zero_bt = TRUE;
3865 }
3866 }
3867#endif /* KMP_ADJUST_BLOCKTIME */
3868
3869 /* setup this new hierarchy */
3870 if (!(root = __kmp_root[gtid])) {
3871 root = __kmp_root[gtid] = (kmp_root_t *)__kmp_allocate(sizeof(kmp_root_t));
3872 KMP_DEBUG_ASSERT(!root->r.r_root_team);
3873 }
3874
3875#if KMP_STATS_ENABLED
3876 // Initialize stats as soon as possible (right after gtid assignment).
3877 __kmp_stats_thread_ptr = __kmp_stats_list->push_back(gtid);
3878 __kmp_stats_thread_ptr->startLife();
3879 KMP_SET_THREAD_STATE(SERIAL_REGION);
3880 KMP_INIT_PARTITIONED_TIMERS(OMP_serial);
3881#endif
3882 __kmp_initialize_root(root);
3883
3884 /* setup new root thread structure */
3885 if (root->r.r_uber_thread) {
3886 root_thread = root->r.r_uber_thread;
3887 } else {
3888 root_thread = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
3889 if (__kmp_storage_map) {
3890 __kmp_print_thread_storage_map(root_thread, gtid);
3891 }
3892 root_thread->th.th_info.ds.ds_gtid = gtid;
3893#if OMPT_SUPPORT
3894 root_thread->th.ompt_thread_info.thread_data = ompt_data_none;
3895#endif
3896 root_thread->th.th_root = root;
3897 if (__kmp_env_consistency_check) {
3898 root_thread->th.th_cons = __kmp_allocate_cons_stack(gtid);
3899 }
3900#if USE_FAST_MEMORY
3901 __kmp_initialize_fast_memory(root_thread);
3902#endif /* USE_FAST_MEMORY */
3903
3904#if KMP_USE_BGET
3905 KMP_DEBUG_ASSERT(root_thread->th.th_local.bget_data == NULL);
3906 __kmp_initialize_bget(root_thread);
3907#endif
3908 __kmp_init_random(root_thread); // Initialize random number generator
3909 }
3910
3911 /* setup the serial team held in reserve by the root thread */
3912 if (!root_thread->th.th_serial_team) {
3913 kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3914 KF_TRACE(10, ("__kmp_register_root: before serial_team\n"));
3915 root_thread->th.th_serial_team = __kmp_allocate_team(
3916 root, 1, 1,
3917#if OMPT_SUPPORT
3918 ompt_data_none, // root parallel id
3919#endif
3920 proc_bind_default, &r_icvs, 0 USE_NESTED_HOT_ARG(NULL));
3921 }
3922 KMP_ASSERT(root_thread->th.th_serial_team);
3923 KF_TRACE(10, ("__kmp_register_root: after serial_team = %p\n",
3924 root_thread->th.th_serial_team));
3925
3926 /* drop root_thread into place */
3927 TCW_SYNC_PTR(__kmp_threads[gtid], root_thread);
3928
3929 root->r.r_root_team->t.t_threads[0] = root_thread;
3930 root->r.r_hot_team->t.t_threads[0] = root_thread;
3931 root_thread->th.th_serial_team->t.t_threads[0] = root_thread;
3932 // AC: the team is created in reserve, not for execution (it is unused for now).
3933 root_thread->th.th_serial_team->t.t_serialized = 0;
3934 root->r.r_uber_thread = root_thread;
3935
3936 /* initialize the thread, get it ready to go */
3937 __kmp_initialize_info(root_thread, root->r.r_root_team, 0, gtid);
3938 TCW_4(__kmp_init_gtid, TRUE);
3939
3940 /* prepare the primary thread for get_gtid() */
3941 __kmp_gtid_set_specific(gtid);
3942
3943#if USE_ITT_BUILD
3944 __kmp_itt_thread_name(gtid);
3945#endif /* USE_ITT_BUILD */
3946
3947#ifdef KMP_TDATA_GTID
3948 __kmp_gtid = gtid;
3949#endif
3950 __kmp_create_worker(gtid, root_thread, __kmp_stksize);
3951 KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == gtid);
3952
3953 KA_TRACE(20, ("__kmp_register_root: T#%d init T#%d(%d:%d) arrived: join=%u, "
3954 "plain=%u\n",
3955 gtid, __kmp_gtid_from_tid(0, root->r.r_hot_team),
3956 root->r.r_hot_team->t.t_id, 0, KMP_INIT_BARRIER_STATE,
3957 KMP_INIT_BARRIER_STATE));
3958 { // Initialize barrier data.
3959 int b;
3960 for (b = 0; b < bs_last_barrier; ++b) {
3961 root_thread->th.th_bar[b].bb.b_arrived = KMP_INIT_BARRIER_STATE;
3962#if USE_DEBUGGER
3963 root_thread->th.th_bar[b].bb.b_worker_arrived = 0;
3964#endif
3965 }
3966 }
3967 KMP_DEBUG_ASSERT(root->r.r_hot_team->t.t_bar[bs_forkjoin_barrier].b_arrived ==
3968 KMP_INIT_BARRIER_STATE);
3969
3970#if KMP_AFFINITY_SUPPORTED
3971 root_thread->th.th_current_place = KMP_PLACE_UNDEFINED;
3972 root_thread->th.th_new_place = KMP_PLACE_UNDEFINED;
3973 root_thread->th.th_first_place = KMP_PLACE_UNDEFINED;
3974 root_thread->th.th_last_place = KMP_PLACE_UNDEFINED;
3975#endif /* KMP_AFFINITY_SUPPORTED */
3976 root_thread->th.th_def_allocator = __kmp_def_allocator;
3977 root_thread->th.th_prev_level = 0;
3978 root_thread->th.th_prev_num_threads = 1;
3979
3980 kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t));
3981 tmp->cg_root = root_thread;
3982 tmp->cg_thread_limit = __kmp_cg_max_nth;
3983 tmp->cg_nthreads = 1;
3984 KA_TRACE(100, ("__kmp_register_root: Thread %p created node %p with"
3985 " cg_nthreads init to 1\n",
3986 root_thread, tmp));
3987 tmp->up = NULL;
3988 root_thread->th.th_cg_roots = tmp;
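  // In OpenMP terms a contention group is an initial thread plus all of its
  // descendants, so every root starts its own cg chain here; cg_thread_limit
  // is what later seeds the thread-limit ICV that workers inherit (see the
  // th_cg_roots handling in __kmp_initialize_info).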
3989
3990 __kmp_root_counter++;
3991
3992#if OMPT_SUPPORT
3993 if (!initial_thread && ompt_enabled.enabled) {
3994
3995 kmp_info_t *root_thread = ompt_get_thread();
3996
3997 ompt_set_thread_state(root_thread, ompt_state_overhead);
3998
3999 if (ompt_enabled.ompt_callback_thread_begin) {
4000 ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
4001 ompt_thread_initial, __ompt_get_thread_data_internal());
4002 }
4003 ompt_data_t *task_data;
4004 ompt_data_t *parallel_data;
4005 __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data,
4006 NULL);
4007 if (ompt_enabled.ompt_callback_implicit_task) {
4008 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
4009 ompt_scope_begin, parallel_data, task_data, 1, 1, ompt_task_initial);
4010 }
4011
4012 ompt_set_thread_state(root_thread, ompt_state_work_serial);
4013 }
4014#endif
4015#if OMPD_SUPPORT
4016 if (ompd_state & OMPD_ENABLE_BP)
4017 ompd_bp_thread_begin();
4018#endif
4019
4020 KMP_MB();
4021 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
4022
4023 return gtid;
4024}
4025
4026#if KMP_NESTED_HOT_TEAMS
4027static int __kmp_free_hot_teams(kmp_root_t *root, kmp_info_t *thr, int level,
4028 const int max_level) {
4029 int i, n, nth;
4030 kmp_hot_team_ptr_t *hot_teams = thr->th.th_hot_teams;
4031 if (!hot_teams || !hot_teams[level].hot_team) {
4032 return 0;
4033 }
4034 KMP_DEBUG_ASSERT(level < max_level);
4035 kmp_team_t *team = hot_teams[level].hot_team;
4036 nth = hot_teams[level].hot_team_nth;
4037 n = nth - 1; // primary thread is not freed
4038 if (level < max_level - 1) {
4039 for (i = 0; i < nth; ++i) {
4040 kmp_info_t *th = team->t.t_threads[i];
4041 n += __kmp_free_hot_teams(root, th, level + 1, max_level);
4042 if (i > 0 && th->th.th_hot_teams) {
4043 __kmp_free(th->th.th_hot_teams);
4044 th->th.th_hot_teams = NULL;
4045 }
4046 }
4047 }
4048 __kmp_free_team(root, team, NULL);
4049 return n;
4050}
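// Sketch of the recursion: called at nesting level `level` for a worker thr,
// the routine first descends into the workers of thr's hot team to release
// deeper hot teams (when level < max_level - 1), then frees the hot team at
// this level; the returned count is the number of thread slots given back,
// excluding the primary thread at each level, which is not freed here.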
4051#endif
4052
4053// Resets a root thread and clears its root and hot teams.
4054// Returns the number of __kmp_threads entries directly and indirectly freed.
4055static int __kmp_reset_root(int gtid, kmp_root_t *root) {
4056 kmp_team_t *root_team = root->r.r_root_team;
4057 kmp_team_t *hot_team = root->r.r_hot_team;
4058 int n = hot_team->t.t_nproc;
4059 int i;
4060
4061 KMP_DEBUG_ASSERT(!root->r.r_active);
4062
4063 root->r.r_root_team = NULL;
4064 root->r.r_hot_team = NULL;
4065 // __kmp_free_team() does not free hot teams, so we have to clear r_hot_team
4066 // before call to __kmp_free_team().
4067 __kmp_free_team(root, root_team USE_NESTED_HOT_ARG(NULL));
4068#if KMP_NESTED_HOT_TEAMS
4069 if (__kmp_hot_teams_max_level >
4070 0) { // need to free nested hot teams and their threads if any
4071 for (i = 0; i < hot_team->t.t_nproc; ++i) {
4072 kmp_info_t *th = hot_team->t.t_threads[i];
4073 if (__kmp_hot_teams_max_level > 1) {
4074 n += __kmp_free_hot_teams(root, th, 1, __kmp_hot_teams_max_level);
4075 }
4076 if (th->th.th_hot_teams) {
4077 __kmp_free(th->th.th_hot_teams);
4078 th->th.th_hot_teams = NULL;
4079 }
4080 }
4081 }
4082#endif
4083 __kmp_free_team(root, hot_team USE_NESTED_HOT_ARG(NULL));
4084
4085 // Before we can reap the thread, we need to make certain that all other
4086 // threads in the teams that had this root as ancestor have stopped trying to
4087 // steal tasks.
4088 if (__kmp_tasking_mode != tskm_immediate_exec) {
4089 __kmp_wait_to_unref_task_teams();
4090 }
4091
4092#if KMP_OS_WINDOWS
4093 /* Close Handle of root duplicated in __kmp_create_worker (tr #62919) */
4094 KA_TRACE(
4095 10, ("__kmp_reset_root: free handle, th = %p, handle = %" KMP_UINTPTR_SPEC
4096 "\n",
4097 (LPVOID) & (root->r.r_uber_thread->th),
4098 root->r.r_uber_thread->th.th_info.ds.ds_thread));
4099 __kmp_free_handle(root->r.r_uber_thread->th.th_info.ds.ds_thread);
4100#endif /* KMP_OS_WINDOWS */
4101
4102#if OMPD_SUPPORT
4103 if (ompd_state & OMPD_ENABLE_BP)
4104 ompd_bp_thread_end();
4105#endif
4106
4107#if OMPT_SUPPORT
4108 ompt_data_t *task_data;
4109 ompt_data_t *parallel_data;
4110 __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data,
4111 NULL);
4112 if (ompt_enabled.ompt_callback_implicit_task) {
4113 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
4114 ompt_scope_end, parallel_data, task_data, 0, 1, ompt_task_initial);
4115 }
4116 if (ompt_enabled.ompt_callback_thread_end) {
4117 ompt_callbacks.ompt_callback(ompt_callback_thread_end)(
4118 &(root->r.r_uber_thread->th.ompt_thread_info.thread_data));
4119 }
4120#endif
4121
4122 TCW_4(__kmp_nth,
4123 __kmp_nth - 1); // __kmp_reap_thread will decrement __kmp_all_nth.
4124 i = root->r.r_uber_thread->th.th_cg_roots->cg_nthreads--;
4125 KA_TRACE(100, ("__kmp_reset_root: Thread %p decrement cg_nthreads on node %p"
4126 " to %d\n",
4127 root->r.r_uber_thread, root->r.r_uber_thread->th.th_cg_roots,
4128 root->r.r_uber_thread->th.th_cg_roots->cg_nthreads));
4129 if (i == 1) {
4130 // need to free contention group structure
4131 KMP_DEBUG_ASSERT(root->r.r_uber_thread ==
4132 root->r.r_uber_thread->th.th_cg_roots->cg_root);
4133 KMP_DEBUG_ASSERT(root->r.r_uber_thread->th.th_cg_roots->up == NULL);
4134 __kmp_free(root->r.r_uber_thread->th.th_cg_roots);
4135 root->r.r_uber_thread->th.th_cg_roots = NULL;
4136 }
4137 __kmp_reap_thread(root->r.r_uber_thread, 1);
4138
4139 // We cannot put the root thread into __kmp_thread_pool, so we have to reap
4140 // it instead of freeing it.
4141 root->r.r_uber_thread = NULL;
4142 /* mark root as no longer in use */
4143 root->r.r_begin = FALSE;
4144
4145 return n;
4146}
4147
4148void __kmp_unregister_root_current_thread(int gtid) {
4149 KA_TRACE(1, ("__kmp_unregister_root_current_thread: enter T#%d\n", gtid));
4150 /* this lock should be ok, since unregister_root_current_thread is never
4151 called during an abort, only during a normal close. furthermore, if you
4152 have the forkjoin lock, you should never try to get the initz lock */
4153 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
4154 if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
4155 KC_TRACE(10, ("__kmp_unregister_root_current_thread: already finished, "
4156 "exiting T#%d\n",
4157 gtid));
4158 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
4159 return;
4160 }
4161 kmp_root_t *root = __kmp_root[gtid];
4162
4163 KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
4164 KMP_ASSERT(KMP_UBER_GTID(gtid));
4165 KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
4166 KMP_ASSERT(root->r.r_active == FALSE);
4167
4168 KMP_MB();
4169
4170 kmp_info_t *thread = __kmp_threads[gtid];
4171 kmp_team_t *team = thread->th.th_team;
4172 kmp_task_team_t *task_team = thread->th.th_task_team;
4173
4174 // we need to wait for the proxy tasks before finishing the thread
4175 if (task_team != NULL && (task_team->tt.tt_found_proxy_tasks ||
4176 task_team->tt.tt_hidden_helper_task_encountered)) {
4177#if OMPT_SUPPORT
4178 // the runtime is shutting down so we won't report any events
4179 thread->th.ompt_thread_info.state = ompt_state_undefined;
4180#endif
4181 __kmp_task_team_wait(thread, team USE_ITT_BUILD_ARG(NULL));
4182 }
4183
4184 __kmp_reset_root(gtid, root);
4185
4186 KMP_MB();
4187 KC_TRACE(10,
4188 ("__kmp_unregister_root_current_thread: T#%d unregistered\n", gtid));
4189
4190 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
4191}
4192
4193#if KMP_OS_WINDOWS
4194/* __kmp_forkjoin_lock must be already held
4195 Unregisters a root thread that is not the current thread. Returns the number
4196 of __kmp_threads entries freed as a result. */
4197static int __kmp_unregister_root_other_thread(int gtid) {
4198 kmp_root_t *root = __kmp_root[gtid];
4199 int r;
4200
4201 KA_TRACE(1, ("__kmp_unregister_root_other_thread: enter T#%d\n", gtid));
4202 KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
4203 KMP_ASSERT(KMP_UBER_GTID(gtid));
4204 KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
4205 KMP_ASSERT(root->r.r_active == FALSE);
4206
4207 r = __kmp_reset_root(gtid, root);
4208 KC_TRACE(10,
4209 ("__kmp_unregister_root_other_thread: T#%d unregistered\n", gtid));
4210 return r;
4211}
4212#endif
4213
4214#if KMP_DEBUG
4215void __kmp_task_info() {
4216
4217 kmp_int32 gtid = __kmp_entry_gtid();
4218 kmp_int32 tid = __kmp_tid_from_gtid(gtid);
4219 kmp_info_t *this_thr = __kmp_threads[gtid];
4220 kmp_team_t *steam = this_thr->th.th_serial_team;
4221 kmp_team_t *team = this_thr->th.th_team;
4222
4223 __kmp_printf(
4224 "__kmp_task_info: gtid=%d tid=%d t_thread=%p team=%p steam=%p curtask=%p "
4225 "ptask=%p\n",
4226 gtid, tid, this_thr, team, steam, this_thr->th.th_current_task,
4227 team->t.t_implicit_task_taskdata[tid].td_parent);
4228}
4229#endif // KMP_DEBUG
4230
4231/* TODO optimize with one big memclr, take out what isn't needed, split
4232 responsibility to workers as much as possible, and delay initialization of
4233 features as much as possible */
4234static void __kmp_initialize_info(kmp_info_t *this_thr, kmp_team_t *team,
4235 int tid, int gtid) {
4236 /* this_thr->th.th_info.ds.ds_gtid is setup in
4237 kmp_allocate_thread/create_worker.
4238 this_thr->th.th_serial_team is setup in __kmp_allocate_thread */
4239 KMP_DEBUG_ASSERT(this_thr != NULL);
4240 KMP_DEBUG_ASSERT(this_thr->th.th_serial_team);
4241 KMP_DEBUG_ASSERT(team);
4242 KMP_DEBUG_ASSERT(team->t.t_threads);
4243 KMP_DEBUG_ASSERT(team->t.t_dispatch);
4244 kmp_info_t *master = team->t.t_threads[0];
4245 KMP_DEBUG_ASSERT(master);
4246 KMP_DEBUG_ASSERT(master->th.th_root);
4247
4248 KMP_MB();
4249
4250 TCW_SYNC_PTR(this_thr->th.th_team, team);
4251
4252 this_thr->th.th_info.ds.ds_tid = tid;
4253 this_thr->th.th_set_nproc = 0;
4254 if (__kmp_tasking_mode != tskm_immediate_exec)
4255 // When tasking is possible, threads are not safe to reap until they are
4256 // done tasking; this will be set when tasking code is exited in wait
4257 this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
4258 else // no tasking --> always safe to reap
4259 this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
4260 this_thr->th.th_set_proc_bind = proc_bind_default;
4261#if KMP_AFFINITY_SUPPORTED
4262 this_thr->th.th_new_place = this_thr->th.th_current_place;
4263#endif
4264 this_thr->th.th_root = master->th.th_root;
4265
4266 /* setup the thread's cache of the team structure */
4267 this_thr->th.th_team_nproc = team->t.t_nproc;
4268 this_thr->th.th_team_master = master;
4269 this_thr->th.th_team_serialized = team->t.t_serialized;
4270
4271 KMP_DEBUG_ASSERT(team->t.t_implicit_task_taskdata);
4272
4273 KF_TRACE(10, ("__kmp_initialize_info1: T#%d:%d this_thread=%p curtask=%p\n",
4274 tid, gtid, this_thr, this_thr->th.th_current_task));
4275
4276 __kmp_init_implicit_task(this_thr->th.th_team_master->th.th_ident, this_thr,
4277 team, tid, TRUE);
4278
4279 KF_TRACE(10, ("__kmp_initialize_info2: T#%d:%d this_thread=%p curtask=%p\n",
4280 tid, gtid, this_thr, this_thr->th.th_current_task));
4281 // TODO: Initialize ICVs from parent; GEH - isn't that already done in
4282 // __kmp_initialize_team()?
4283
4284 /* TODO no worksharing in speculative threads */
4285 this_thr->th.th_dispatch = &team->t.t_dispatch[tid];
4286
4287 this_thr->th.th_local.this_construct = 0;
4288
4289 if (!this_thr->th.th_pri_common) {
4290 this_thr->th.th_pri_common =
4291 (struct common_table *)__kmp_allocate(sizeof(struct common_table));
4292 if (__kmp_storage_map) {
4293 __kmp_print_storage_map_gtid(
4294 gtid, this_thr->th.th_pri_common, this_thr->th.th_pri_common + 1,
4295 sizeof(struct common_table), "th_%d.th_pri_common\n", gtid);
4296 }
4297 this_thr->th.th_pri_head = NULL;
4298 }
4299
4300 if (this_thr != master && // Primary thread's CG root is initialized elsewhere
4301 this_thr->th.th_cg_roots != master->th.th_cg_roots) { // CG root not set
4302 // Make new thread's CG root same as primary thread's
4303 KMP_DEBUG_ASSERT(master->th.th_cg_roots);
4304 kmp_cg_root_t *tmp = this_thr->th.th_cg_roots;
4305 if (tmp) {
4306 // worker changes CG, need to check if old CG should be freed
4307 int i = tmp->cg_nthreads--;
4308 KA_TRACE(100, ("__kmp_initialize_info: Thread %p decrement cg_nthreads"
4309 " on node %p of thread %p to %d\n",
4310 this_thr, tmp, tmp->cg_root, tmp->cg_nthreads));
4311 if (i == 1) {
4312 __kmp_free(tmp); // last thread left CG --> free it
4313 }
4314 }
4315 this_thr->th.th_cg_roots = master->th.th_cg_roots;
4316 // Increment new thread's CG root's counter to add the new thread
4317 this_thr->th.th_cg_roots->cg_nthreads++;
4318 KA_TRACE(100, ("__kmp_initialize_info: Thread %p increment cg_nthreads on"
4319 " node %p of thread %p to %d\n",
4320 this_thr, this_thr->th.th_cg_roots,
4321 this_thr->th.th_cg_roots->cg_root,
4322 this_thr->th.th_cg_roots->cg_nthreads));
4323 this_thr->th.th_current_task->td_icvs.thread_limit =
4324 this_thr->th.th_cg_roots->cg_thread_limit;
4325 }
4326
4327 /* Initialize dynamic dispatch */
4328 {
4329 volatile kmp_disp_t *dispatch = this_thr->th.th_dispatch;
4330 // Use team max_nproc since this will never change for the team.
4331 size_t disp_size =
4332 sizeof(dispatch_private_info_t) *
4333 (team->t.t_max_nproc == 1 ? 1 : __kmp_dispatch_num_buffers);
4334 KD_TRACE(10, ("__kmp_initialize_info: T#%d max_nproc: %d\n", gtid,
4335 team->t.t_max_nproc));
4336 KMP_ASSERT(dispatch);
4337 KMP_DEBUG_ASSERT(team->t.t_dispatch);
4338 KMP_DEBUG_ASSERT(dispatch == &team->t.t_dispatch[tid]);
4339
4340 dispatch->th_disp_index = 0;
4341 dispatch->th_doacross_buf_idx = 0;
4342 if (!dispatch->th_disp_buffer) {
4343 dispatch->th_disp_buffer =
4344 (dispatch_private_info_t *)__kmp_allocate(disp_size);
4345
4346 if (__kmp_storage_map) {
4347 __kmp_print_storage_map_gtid(
4348 gtid, &dispatch->th_disp_buffer[0],
4349 &dispatch->th_disp_buffer[team->t.t_max_nproc == 1
4350 ? 1
4351 : __kmp_dispatch_num_buffers],
4352 disp_size,
4353 "th_%d.th_dispatch.th_disp_buffer "
4354 "(team_%d.t_dispatch[%d].th_disp_buffer)",
4355 gtid, team->t.t_id, gtid);
4356 }
4357 } else {
4358 memset(&dispatch->th_disp_buffer[0], '\0', disp_size);
4359 }
4360
4361 dispatch->th_dispatch_pr_current = 0;
4362 dispatch->th_dispatch_sh_current = 0;
4363
4364 dispatch->th_deo_fcn = 0; /* ORDERED */
4365 dispatch->th_dxo_fcn = 0; /* END ORDERED */
4366 }
4367
4368 this_thr->th.th_next_pool = NULL;
4369
4370 if (!this_thr->th.th_task_state_memo_stack) {
4371 size_t i;
4372 this_thr->th.th_task_state_memo_stack =
4373 (kmp_uint8 *)__kmp_allocate(4 * sizeof(kmp_uint8));
4374 this_thr->th.th_task_state_top = 0;
4375 this_thr->th.th_task_state_stack_sz = 4;
4376 for (i = 0; i < this_thr->th.th_task_state_stack_sz;
4377 ++i) // zero init the stack
4378 this_thr->th.th_task_state_memo_stack[i] = 0;
4379 }
4380
4381 KMP_DEBUG_ASSERT(!this_thr->th.th_spin_here);
4382 KMP_DEBUG_ASSERT(this_thr->th.th_next_waiting == 0);
4383
4384 KMP_MB();
4385}
4386
4387/* Allocate a new thread for the requesting team. This is only called from
4388 within a forkjoin critical section. We will first try to get an available
4389 thread from the thread pool; if none is available, we will fork a new one,
4390 assuming we are able to create a new one. This should be assured, as the
4391 caller should check on this first. */
4392kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team,
4393 int new_tid) {
4394 kmp_team_t *serial_team;
4395 kmp_info_t *new_thr;
4396 int new_gtid;
4397
4398 KA_TRACE(20, ("__kmp_allocate_thread: T#%d\n", __kmp_get_gtid()));
4399 KMP_DEBUG_ASSERT(root && team);
4400#if !KMP_NESTED_HOT_TEAMS
4401 KMP_DEBUG_ASSERT(KMP_MASTER_GTID(__kmp_get_gtid()));
4402#endif
4403 KMP_MB();
4404
4405 /* first, try to get one from the thread pool */
4406 if (__kmp_thread_pool) {
4407 new_thr = CCAST(kmp_info_t *, __kmp_thread_pool);
4408 __kmp_thread_pool = (volatile kmp_info_t *)new_thr->th.th_next_pool;
4409 if (new_thr == __kmp_thread_pool_insert_pt) {
4410 __kmp_thread_pool_insert_pt = NULL;
4411 }
4412 TCW_4(new_thr->th.th_in_pool, FALSE);
4413 __kmp_suspend_initialize_thread(new_thr);
4414 __kmp_lock_suspend_mx(new_thr);
4415 if (new_thr->th.th_active_in_pool == TRUE) {
4416 KMP_DEBUG_ASSERT(new_thr->th.th_active == TRUE);
4417 KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
4418 new_thr->th.th_active_in_pool = FALSE;
4419 }
4420 __kmp_unlock_suspend_mx(new_thr);
4421
4422 KA_TRACE(20, ("__kmp_allocate_thread: T#%d using thread T#%d\n",
4423 __kmp_get_gtid(), new_thr->th.th_info.ds.ds_gtid));
4424 KMP_ASSERT(!new_thr->th.th_team);
4425 KMP_DEBUG_ASSERT(__kmp_nth < __kmp_threads_capacity);
4426
4427 /* setup the thread structure */
4428 __kmp_initialize_info(new_thr, team, new_tid,
4429 new_thr->th.th_info.ds.ds_gtid);
4430 KMP_DEBUG_ASSERT(new_thr->th.th_serial_team);
4431
4432 TCW_4(__kmp_nth, __kmp_nth + 1);
4433
4434 new_thr->th.th_task_state = 0;
4435 new_thr->th.th_task_state_top = 0;
4436 new_thr->th.th_task_state_stack_sz = 4;
4437
4438 if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
4439 // Make sure pool thread has transitioned to waiting on own thread struct
4440 KMP_DEBUG_ASSERT(new_thr->th.th_used_in_team.load() == 0);
4441 // Thread activated in __kmp_allocate_team when increasing team size
4442 }
4443
4444#ifdef KMP_ADJUST_BLOCKTIME
4445 /* Adjust blocktime back to zero if necessary */
4446 /* Middle initialization might not have occurred yet */
4447 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4448 if (__kmp_nth > __kmp_avail_proc) {
4449 __kmp_zero_bt = TRUE;
4450 }
4451 }
4452#endif /* KMP_ADJUST_BLOCKTIME */
4453
4454#if KMP_DEBUG
4455 // If the thread entered the pool via __kmp_free_thread, wait_flag should not
4456 // be KMP_BARRIER_PARENT_FLAG.
4457 int b;
4458 kmp_balign_t *balign = new_thr->th.th_bar;
4459 for (b = 0; b < bs_last_barrier; ++b)
4460 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
4461#endif
4462
4463 KF_TRACE(10, ("__kmp_allocate_thread: T#%d using thread %p T#%d\n",
4464 __kmp_get_gtid(), new_thr, new_thr->th.th_info.ds.ds_gtid));
4465
4466 KMP_MB();
4467 return new_thr;
4468 }
4469
4470 /* no, we'll fork a new one */
4471 KMP_ASSERT(__kmp_nth == __kmp_all_nth);
4472 KMP_ASSERT(__kmp_all_nth < __kmp_threads_capacity);
4473
4474#if KMP_USE_MONITOR
4475 // If this is the first worker thread the RTL is creating, then also
4476 // launch the monitor thread. We try to do this as early as possible.
4477 if (!TCR_4(__kmp_init_monitor)) {
4478 __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
4479 if (!TCR_4(__kmp_init_monitor)) {
4480 KF_TRACE(10, ("before __kmp_create_monitor\n"));
4481 TCW_4(__kmp_init_monitor, 1);
4482 __kmp_create_monitor(&__kmp_monitor);
4483 KF_TRACE(10, ("after __kmp_create_monitor\n"));
4484#if KMP_OS_WINDOWS
4485 // AC: wait until monitor has started. This is a fix for CQ232808.
4486 // The reason is that if the library is loaded/unloaded in a loop with
4487 // small (parallel) work in between, there is a high probability that the
4488 // monitor thread starts after the library shutdown. At shutdown it is
4489 // too late to cope with the problem, because when the primary thread is
4490 // in DllMain (process detach) the monitor has no chance to start (it is
4491 // blocked), and the primary thread has no means to inform the monitor
4492 // that the library has gone, because all the memory that the monitor can
4493 // access is going to be released/reset.
4494 while (TCR_4(__kmp_init_monitor) < 2) {
4495 KMP_YIELD(TRUE);
4496 }
4497 KF_TRACE(10, ("after monitor thread has started\n"));
4498#endif
4499 }
4500 __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
4501 }
4502#endif
4503
4504 KMP_MB();
4505
4506 {
4507 int new_start_gtid = TCR_4(__kmp_init_hidden_helper_threads)
4508 ? 1
4509 : __kmp_hidden_helper_threads_num + 1;
4510
4511 for (new_gtid = new_start_gtid; TCR_PTR(__kmp_threads[new_gtid]) != NULL;
4512 ++new_gtid) {
4513 KMP_DEBUG_ASSERT(new_gtid < __kmp_threads_capacity);
4514 }
4515
4516 if (TCR_4(__kmp_init_hidden_helper_threads)) {
4517 KMP_DEBUG_ASSERT(new_gtid <= __kmp_hidden_helper_threads_num);
4518 }
4519 }
4520
4521 /* allocate space for it. */
4522 new_thr = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
4523
4524 TCW_SYNC_PTR(__kmp_threads[new_gtid], new_thr);
4525
4526#if USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG
4527 // suppress race condition detection on synchronization flags in debug mode;
4528 // this helps to analyze library internals by eliminating false positives
4529 __itt_suppress_mark_range(
4530 __itt_suppress_range, __itt_suppress_threading_errors,
4531 &new_thr->th.th_sleep_loc, sizeof(new_thr->th.th_sleep_loc));
4532 __itt_suppress_mark_range(
4533 __itt_suppress_range, __itt_suppress_threading_errors,
4534 &new_thr->th.th_reap_state, sizeof(new_thr->th.th_reap_state));
4535#if KMP_OS_WINDOWS
4536 __itt_suppress_mark_range(
4537 __itt_suppress_range, __itt_suppress_threading_errors,
4538 &new_thr->th.th_suspend_init, sizeof(new_thr->th.th_suspend_init));
4539#else
4540 __itt_suppress_mark_range(__itt_suppress_range,
4541 __itt_suppress_threading_errors,
4542 &new_thr->th.th_suspend_init_count,
4543 sizeof(new_thr->th.th_suspend_init_count));
4544#endif
4545 // TODO: check if we need to also suppress b_arrived flags
4546 __itt_suppress_mark_range(__itt_suppress_range,
4547 __itt_suppress_threading_errors,
4548 CCAST(kmp_uint64 *, &new_thr->th.th_bar[0].bb.b_go),
4549 sizeof(new_thr->th.th_bar[0].bb.b_go));
4550 __itt_suppress_mark_range(__itt_suppress_range,
4551 __itt_suppress_threading_errors,
4552 CCAST(kmp_uint64 *, &new_thr->th.th_bar[1].bb.b_go),
4553 sizeof(new_thr->th.th_bar[1].bb.b_go));
4554 __itt_suppress_mark_range(__itt_suppress_range,
4555 __itt_suppress_threading_errors,
4556 CCAST(kmp_uint64 *, &new_thr->th.th_bar[2].bb.b_go),
4557 sizeof(new_thr->th.th_bar[2].bb.b_go));
4558#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG */
4559 if (__kmp_storage_map) {
4560 __kmp_print_thread_storage_map(new_thr, new_gtid);
4561 }
4562
4563 // add the reserve serialized team, initialized from the team's primary thread
4564 {
4565 kmp_internal_control_t r_icvs = __kmp_get_x_global_icvs(team);
4566 KF_TRACE(10, ("__kmp_allocate_thread: before th_serial/serial_team\n"));
4567 new_thr->th.th_serial_team = serial_team =
4568 (kmp_team_t *)__kmp_allocate_team(root, 1, 1,
4569#if OMPT_SUPPORT
4570 ompt_data_none, // root parallel id
4571#endif
4572 proc_bind_default, &r_icvs,
4573 0 USE_NESTED_HOT_ARG(NULL));
4574 }
4575 KMP_ASSERT(serial_team);
4576 serial_team->t.t_serialized = 0; // AC: the team is created in reserve, not
4577 // for execution (it is unused for now).
4578 serial_team->t.t_threads[0] = new_thr;
4579 KF_TRACE(10,
4580 ("__kmp_allocate_thread: after th_serial/serial_team : new_thr=%p\n",
4581 new_thr));
4582
4583 /* setup the thread structures */
4584 __kmp_initialize_info(new_thr, team, new_tid, new_gtid);
4585
4586#if USE_FAST_MEMORY
4587 __kmp_initialize_fast_memory(new_thr);
4588#endif /* USE_FAST_MEMORY */
4589
4590#if KMP_USE_BGET
4591 KMP_DEBUG_ASSERT(new_thr->th.th_local.bget_data == NULL);
4592 __kmp_initialize_bget(new_thr);
4593#endif
4594
4595 __kmp_init_random(new_thr); // Initialize random number generator
4596
4597 /* Initialize these only once when thread is grabbed for a team allocation */
4598 KA_TRACE(20,
4599 ("__kmp_allocate_thread: T#%d init go fork=%u, plain=%u\n",
4600 __kmp_get_gtid(), KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
4601
4602 int b;
4603 kmp_balign_t *balign = new_thr->th.th_bar;
4604 for (b = 0; b < bs_last_barrier; ++b) {
4605 balign[b].bb.b_go = KMP_INIT_BARRIER_STATE;
4606 balign[b].bb.team = NULL;
4607 balign[b].bb.wait_flag = KMP_BARRIER_NOT_WAITING;
4608 balign[b].bb.use_oncore_barrier = 0;
4609 }
4610
4611 TCW_PTR(new_thr->th.th_sleep_loc, NULL);
4612 new_thr->th.th_sleep_loc_type = flag_unset;
4613
4614 new_thr->th.th_spin_here = FALSE;
4615 new_thr->th.th_next_waiting = 0;
4616#if KMP_OS_UNIX
4617 new_thr->th.th_blocking = false;
4618#endif
4619
4620#if KMP_AFFINITY_SUPPORTED
4621 new_thr->th.th_current_place = KMP_PLACE_UNDEFINED;
4622 new_thr->th.th_new_place = KMP_PLACE_UNDEFINED;
4623 new_thr->th.th_first_place = KMP_PLACE_UNDEFINED;
4624 new_thr->th.th_last_place = KMP_PLACE_UNDEFINED;
4625#endif
4626 new_thr->th.th_def_allocator = __kmp_def_allocator;
4627 new_thr->th.th_prev_level = 0;
4628 new_thr->th.th_prev_num_threads = 1;
4629
4630 TCW_4(new_thr->th.th_in_pool, FALSE);
4631 new_thr->th.th_active_in_pool = FALSE;
4632 TCW_4(new_thr->th.th_active, TRUE);
4633
4634 /* adjust the global counters */
4635 __kmp_all_nth++;
4636 __kmp_nth++;
4637
4638 // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
4639 // numbers of procs, and method #2 (keyed API call) for higher numbers.
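  // Illustration (the threshold value here is hypothetical): with
  // __kmp_tls_gtid_min == 20, the runtime keeps method #1 while fewer than 20
  // threads exist and switches __kmp_gtid_mode to 2 once the 20th thread is
  // allocated.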
4640 if (__kmp_adjust_gtid_mode) {
4641 if (__kmp_all_nth >= __kmp_tls_gtid_min) {
4642 if (TCR_4(__kmp_gtid_mode) != 2) {
4643 TCW_4(__kmp_gtid_mode, 2);
4644 }
4645 } else {
4646 if (TCR_4(__kmp_gtid_mode) != 1) {
4647 TCW_4(__kmp_gtid_mode, 1);
4648 }
4649 }
4650 }
4651
4652#ifdef KMP_ADJUST_BLOCKTIME
4653 /* Adjust blocktime back to zero if necessary */
4654 /* Middle initialization might not have occurred yet */
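  /* (When the process becomes oversubscribed -- more runtime threads than
     available processors -- spinning for the block time only wastes cycles,
     so __kmp_zero_bt below requests an effective block time of zero.) */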
4655 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4656 if (__kmp_nth > __kmp_avail_proc) {
4657 __kmp_zero_bt = TRUE;
4658 }
4659 }
4660#endif /* KMP_ADJUST_BLOCKTIME */
4661
4662 /* actually fork it and create the new worker thread */
4663 KF_TRACE(
4664 10, ("__kmp_allocate_thread: before __kmp_create_worker: %p\n", new_thr));
4665 __kmp_create_worker(new_gtid, new_thr, __kmp_stksize);
4666 KF_TRACE(10,
4667 ("__kmp_allocate_thread: after __kmp_create_worker: %p\n", new_thr));
4668
4669 KA_TRACE(20, ("__kmp_allocate_thread: T#%d forked T#%d\n", __kmp_get_gtid(),
4670 new_gtid));
4671 KMP_MB();
4672 return new_thr;
4673}
4674
4675/* Reinitialize team for reuse.
4676 The hot team code calls this routine at every fork barrier, so EPCC barrier
4677 tests are extremely sensitive to changes in it, esp. writes to the team
4678 struct, which cause a cache invalidation in all threads.
4679 IF YOU TOUCH THIS ROUTINE, RUN EPCC C SYNCBENCH ON A BIG-IRON MACHINE!!! */
4680static void __kmp_reinitialize_team(kmp_team_t *team,
4681 kmp_internal_control_t *new_icvs,
4682 ident_t *loc) {
4683 KF_TRACE(10, ("__kmp_reinitialize_team: enter this_thread=%p team=%p\n",
4684 team->t.t_threads[0], team));
4685 KMP_DEBUG_ASSERT(team && new_icvs);
4686 KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);
4687 KMP_CHECK_UPDATE(team->t.t_ident, loc);
4688
4689 KMP_CHECK_UPDATE(team->t.t_id, KMP_GEN_TEAM_ID());
4690 // Copy ICVs to the primary thread's implicit taskdata
4691 __kmp_init_implicit_task(loc, team->t.t_threads[0], team, 0, FALSE);
4692 copy_icvs(&team->t.t_implicit_task_taskdata[0].td_icvs, new_icvs);
4693
4694 KF_TRACE(10, ("__kmp_reinitialize_team: exit this_thread=%p team=%p\n",
4695 team->t.t_threads[0], team));
4696}
4697
4698/* Initialize the team data structure.
4699 This assumes the t_threads and t_max_nproc are already set.
4700 Also, we don't touch the arguments */
4701static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
4702 kmp_internal_control_t *new_icvs,
4703 ident_t *loc) {
4704 KF_TRACE(10, ("__kmp_initialize_team: enter: team=%p\n", team));
4705
4706 /* verify */
4707 KMP_DEBUG_ASSERT(team);
4708 KMP_DEBUG_ASSERT(new_nproc <= team->t.t_max_nproc);
4709 KMP_DEBUG_ASSERT(team->t.t_threads);
4710 KMP_MB();
4711
4712 team->t.t_master_tid = 0; /* not needed */
4713 /* team->t.t_master_bar; not needed */
4714 team->t.t_serialized = new_nproc > 1 ? 0 : 1;
4715 team->t.t_nproc = new_nproc;
4716
4717 /* team->t.t_parent = NULL; TODO not needed & would mess up hot team */
4718 team->t.t_next_pool = NULL;
4719 /* memset( team->t.t_threads, 0, sizeof(kmp_info_t*)*new_nproc ); would mess
4720 * up hot team */
4721
4722 TCW_SYNC_PTR(team->t.t_pkfn, NULL); /* not needed */
4723 team->t.t_invoke = NULL; /* not needed */
4724
4725 // TODO???: team->t.t_max_active_levels = new_max_active_levels;
4726 team->t.t_sched.sched = new_icvs->sched.sched;
4727
4728#if KMP_ARCH_X86 || KMP_ARCH_X86_64
4729 team->t.t_fp_control_saved = FALSE; /* not needed */
4730 team->t.t_x87_fpu_control_word = 0; /* not needed */
4731 team->t.t_mxcsr = 0; /* not needed */
4732#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
4733
4734 team->t.t_construct = 0;
4735
4736 team->t.t_ordered.dt.t_value = 0;
4737 team->t.t_master_active = FALSE;
4738
4739#ifdef KMP_DEBUG
4740 team->t.t_copypriv_data = NULL; /* not necessary, but nice for debugging */
4741#endif
4742#if KMP_OS_WINDOWS
4743 team->t.t_copyin_counter = 0; /* for barrier-free copyin implementation */
4744#endif
4745
4746 team->t.t_control_stack_top = NULL;
4747
4748 __kmp_reinitialize_team(team, new_icvs, loc);
4749
4750 KMP_MB();
4751 KF_TRACE(10, ("__kmp_initialize_team: exit: team=%p\n", team));
4752}
4753
4754#if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
4755/* Sets full mask for thread and returns old mask, no changes to structures. */
4756static void
4757__kmp_set_thread_affinity_mask_full_tmp(kmp_affin_mask_t *old_mask) {
4758 if (KMP_AFFINITY_CAPABLE()) {
4759 int status;
4760 if (old_mask != NULL) {
4761 status = __kmp_get_system_affinity(old_mask, TRUE);
4762 int error = errno;
4763 if (status != 0) {
4764 __kmp_fatal(KMP_MSG(ChangeThreadAffMaskError), KMP_ERR(error),
4765 __kmp_msg_null);
4766 }
4767 }
4768 __kmp_set_system_affinity(__kmp_affin_fullMask, TRUE);
4769 }
4770}
4771#endif
4772
4773#if KMP_AFFINITY_SUPPORTED
4774
4775// __kmp_partition_places() is the heart of the OpenMP 4.0 affinity mechanism.
4776// It calculates the worker + primary thread's partition based upon the parent
4777// thread's partition, and binds each worker to a thread in their partition.
4778// The primary thread's partition should already include its current binding.
4779static void __kmp_partition_places(kmp_team_t *team, int update_master_only) {
4780 // Do not partition places for the hidden helper team
4781 if (KMP_HIDDEN_HELPER_TEAM(team))
4782 return;
4783 // Copy the primary thread's place partition to the team struct
4784 kmp_info_t *master_th = team->t.t_threads[0];
4785 KMP_DEBUG_ASSERT(master_th != NULL);
4786 kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
4787 int first_place = master_th->th.th_first_place;
4788 int last_place = master_th->th.th_last_place;
4789 int masters_place = master_th->th.th_current_place;
4790 int num_masks = __kmp_affinity.num_masks;
4791 team->t.t_first_place = first_place;
4792 team->t.t_last_place = last_place;
4793
4794 KA_TRACE(20, ("__kmp_partition_places: enter: proc_bind = %d T#%d(%d:0) "
4795 "bound to place %d partition = [%d,%d]\n",
4796 proc_bind, __kmp_gtid_from_thread(team->t.t_threads[0]),
4797 team->t.t_id, masters_place, first_place, last_place));
4798
4799 switch (proc_bind) {
4800
4801 case proc_bind_default:
4802 // Serial teams might have the proc_bind policy set to proc_bind_default.
4803 // Not an issue -- we don't rebind primary thread for any proc_bind policy.
4804 KMP_DEBUG_ASSERT(team->t.t_nproc == 1);
4805 break;
4806
4807 case proc_bind_primary: {
4808 int f;
4809 int n_th = team->t.t_nproc;
4810 for (f = 1; f < n_th; f++) {
4811 kmp_info_t *th = team->t.t_threads[f];
4812 KMP_DEBUG_ASSERT(th != NULL);
4813 th->th.th_first_place = first_place;
4814 th->th.th_last_place = last_place;
4815 th->th.th_new_place = masters_place;
4816 if (__kmp_display_affinity && masters_place != th->th.th_current_place &&
4817 team->t.t_display_affinity != 1) {
4818 team->t.t_display_affinity = 1;
4819 }
4820
4821 KA_TRACE(100, ("__kmp_partition_places: primary: T#%d(%d:%d) place %d "
4822 "partition = [%d,%d]\n",
4823 __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
4824 f, masters_place, first_place, last_place));
4825 }
4826 } break;
4827
4828 case proc_bind_close: {
4829 int f;
4830 int n_th = team->t.t_nproc;
4831 int n_places;
4832 if (first_place <= last_place) {
4833 n_places = last_place - first_place + 1;
4834 } else {
4835 n_places = num_masks - first_place + last_place + 1;
4836 }
4837 if (n_th <= n_places) {
4838 int place = masters_place;
4839 for (f = 1; f < n_th; f++) {
4840 kmp_info_t *th = team->t.t_threads[f];
4841 KMP_DEBUG_ASSERT(th != NULL);
4842
4843 if (place == last_place) {
4844 place = first_place;
4845 } else if (place == (num_masks - 1)) {
4846 place = 0;
4847 } else {
4848 place++;
4849 }
4850 th->th.th_first_place = first_place;
4851 th->th.th_last_place = last_place;
4852 th->th.th_new_place = place;
4853 if (__kmp_display_affinity && place != th->th.th_current_place &&
4854 team->t.t_display_affinity != 1) {
4855 team->t.t_display_affinity = 1;
4856 }
4857
4858 KA_TRACE(100, ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4859 "partition = [%d,%d]\n",
4860 __kmp_gtid_from_thread(team->t.t_threads[f]),
4861 team->t.t_id, f, place, first_place, last_place));
4862 }
4863 } else {
4864 int S, rem, gap, s_count;
4865 S = n_th / n_places;
4866 s_count = 0;
4867 rem = n_th - (S * n_places);
4868 gap = rem > 0 ? n_places / rem : n_places;
4869 int place = masters_place;
4870 int gap_ct = gap;
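      // Worked example with illustrative values: n_th = 7, n_places = 3 gives
      // S = 2, rem = 1, gap = 3; the loop below assigns 3 threads to the first
      // place visited and 2 threads to each of the other two places, and
      // 'place' wraps back to masters_place when the loop finishes.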
4871 for (f = 0; f < n_th; f++) {
4872 kmp_info_t *th = team->t.t_threads[f];
4873 KMP_DEBUG_ASSERT(th != NULL);
4874
4875 th->th.th_first_place = first_place;
4876 th->th.th_last_place = last_place;
4877 th->th.th_new_place = place;
4878 if (__kmp_display_affinity && place != th->th.th_current_place &&
4879 team->t.t_display_affinity != 1) {
4880 team->t.t_display_affinity = 1;
4881 }
4882 s_count++;
4883
4884 if ((s_count == S) && rem && (gap_ct == gap)) {
4885 // do nothing; an extra thread is added to this place on the next iteration
4886 } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
4887 // we added an extra thread to this place; move to next place
4888 if (place == last_place) {
4889 place = first_place;
4890 } else if (place == (num_masks - 1)) {
4891 place = 0;
4892 } else {
4893 place++;
4894 }
4895 s_count = 0;
4896 gap_ct = 1;
4897 rem--;
4898 } else if (s_count == S) { // place full; don't add extra
4899 if (place == last_place) {
4900 place = first_place;
4901 } else if (place == (num_masks - 1)) {
4902 place = 0;
4903 } else {
4904 place++;
4905 }
4906 gap_ct++;
4907 s_count = 0;
4908 }
4909
4910 KA_TRACE(100,
4911 ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4912 "partition = [%d,%d]\n",
4913 __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id, f,
4914 th->th.th_new_place, first_place, last_place));
4915 }
4916 KMP_DEBUG_ASSERT(place == masters_place);
4917 }
4918 } break;
4919
4920 case proc_bind_spread: {
4921 int f;
4922 int n_th = team->t.t_nproc;
4923 int n_places;
4924 int thidx;
4925 if (first_place <= last_place) {
4926 n_places = last_place - first_place + 1;
4927 } else {
4928 n_places = num_masks - first_place + last_place + 1;
4929 }
4930 if (n_th <= n_places) {
4931 int place = -1;
4932
4933 if (n_places != num_masks) {
4934 int S = n_places / n_th;
4935 int s_count, rem, gap, gap_ct;
4936
4937 place = masters_place;
4938 rem = n_places - n_th * S;
4939 gap = rem ? n_th / rem : 1;
4940 gap_ct = gap;
4941 thidx = n_th;
4942 if (update_master_only == 1)
4943 thidx = 1;
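        // Worked example with illustrative values (assuming the partition is a
        // strict subset of all places): n_places = 8, n_th = 3 gives S = 2,
        // rem = 2, gap = 1; the loop below hands out sub-partitions of 3, 3,
        // and 2 places and binds each thread to the first place of its
        // sub-partition.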
4944 for (f = 0; f < thidx; f++) {
4945 kmp_info_t *th = team->t.t_threads[f];
4946 KMP_DEBUG_ASSERT(th != NULL);
4947
4948 th->th.th_first_place = place;
4949 th->th.th_new_place = place;
4950 if (__kmp_display_affinity && place != th->th.th_current_place &&
4951 team->t.t_display_affinity != 1) {
4952 team->t.t_display_affinity = 1;
4953 }
4954 s_count = 1;
4955 while (s_count < S) {
4956 if (place == last_place) {
4957 place = first_place;
4958 } else if (place == (num_masks - 1)) {
4959 place = 0;
4960 } else {
4961 place++;
4962 }
4963 s_count++;
4964 }
4965 if (rem && (gap_ct == gap)) {
4966 if (place == last_place) {
4967 place = first_place;
4968 } else if (place == (num_masks - 1)) {
4969 place = 0;
4970 } else {
4971 place++;
4972 }
4973 rem--;
4974 gap_ct = 0;
4975 }
4976 th->th.th_last_place = place;
4977 gap_ct++;
4978
4979 if (place == last_place) {
4980 place = first_place;
4981 } else if (place == (num_masks - 1)) {
4982 place = 0;
4983 } else {
4984 place++;
4985 }
4986
4987 KA_TRACE(100,
4988 ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4989 "partition = [%d,%d], num_masks: %u\n",
4990 __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
4991 f, th->th.th_new_place, th->th.th_first_place,
4992 th->th.th_last_place, num_masks));
4993 }
4994 } else {
4995 /* With a uniform space of available computation places, we can create
4996 T partitions of roughly round(P/T) size and put threads into the first
4997 place of each partition. */
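          // Worked example with illustrative values: num_masks = n_places = 8,
          // n_th = 3, masters_place = 0 gives spacing = 3.0; the threads get
          // partitions [0,2], [3,5] and [6,7] and are bound to places 0, 3 and
          // 6 respectively.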
4998 double current = static_cast<double>(masters_place);
4999 double spacing =
5000 (static_cast<double>(n_places + 1) / static_cast<double>(n_th));
5001 int first, last;
5002 kmp_info_t *th;
5003
5004 thidx = n_th + 1;
5005 if (update_master_only == 1)
5006 thidx = 1;
5007 for (f = 0; f < thidx; f++) {
5008 first = static_cast<int>(current);
5009 last = static_cast<int>(current + spacing) - 1;
5010 KMP_DEBUG_ASSERT(last >= first);
5011 if (first >= n_places) {
5012 if (masters_place) {
5013 first -= n_places;
5014 last -= n_places;
5015 if (first == (masters_place + 1)) {
5016 KMP_DEBUG_ASSERT(f == n_th);
5017 first--;
5018 }
5019 if (last == masters_place) {
5020 KMP_DEBUG_ASSERT(f == (n_th - 1));
5021 last--;
5022 }
5023 } else {
5024 KMP_DEBUG_ASSERT(f == n_th);
5025 first = 0;
5026 last = 0;
5027 }
5028 }
5029 if (last >= n_places) {
5030 last = (n_places - 1);
5031 }
5032 place = first;
5033 current += spacing;
5034 if (f < n_th) {
5035 KMP_DEBUG_ASSERT(0 <= first);
5036 KMP_DEBUG_ASSERT(n_places > first);
5037 KMP_DEBUG_ASSERT(0 <= last);
5038 KMP_DEBUG_ASSERT(n_places > last);
5039 KMP_DEBUG_ASSERT(last_place >= first_place);
5040 th = team->t.t_threads[f];
5041 KMP_DEBUG_ASSERT(th);
5042 th->th.th_first_place = first;
5043 th->th.th_new_place = place;
5044 th->th.th_last_place = last;
5045 if (__kmp_display_affinity && place != th->th.th_current_place &&
5046 team->t.t_display_affinity != 1) {
5047 team->t.t_display_affinity = 1;
5048 }
5049 KA_TRACE(100,
5050 ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
5051 "partition = [%d,%d], spacing = %.4f\n",
5052 __kmp_gtid_from_thread(team->t.t_threads[f]),
5053 team->t.t_id, f, th->th.th_new_place,
5054 th->th.th_first_place, th->th.th_last_place, spacing));
5055 }
5056 }
5057 }
5058 KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
5059 } else {
5060 int S, rem, gap, s_count;
5061 S = n_th / n_places;
5062 s_count = 0;
5063 rem = n_th - (S * n_places);
5064 gap = rem > 0 ? n_places / rem : n_places;
5065 int place = masters_place;
5066 int gap_ct = gap;
5067 thidx = n_th;
5068 if (update_master_only == 1)
5069 thidx = 1;
5070 for (f = 0; f < thidx; f++) {
5071 kmp_info_t *th = team->t.t_threads[f];
5072 KMP_DEBUG_ASSERT(th != NULL);
5073
5074 th->th.th_first_place = place;
5075 th->th.th_last_place = place;
5076 th->th.th_new_place = place;
5077 if (__kmp_display_affinity && place != th->th.th_current_place &&
5078 team->t.t_display_affinity != 1) {
5079 team->t.t_display_affinity = 1;
5080 }
5081 s_count++;
5082
5083 if ((s_count == S) && rem && (gap_ct == gap)) {
5084 // do nothing; an extra thread is added to this place on the next iteration
5085 } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
5086 // we added an extra thread to this place; move on to next place
5087 if (place == last_place) {
5088 place = first_place;
5089 } else if (place == (num_masks - 1)) {
5090 place = 0;
5091 } else {
5092 place++;
5093 }
5094 s_count = 0;
5095 gap_ct = 1;
5096 rem--;
5097 } else if (s_count == S) { // place is full; don't add extra thread
5098 if (place == last_place) {
5099 place = first_place;
5100 } else if (place == (num_masks - 1)) {
5101 place = 0;
5102 } else {
5103 place++;
5104 }
5105 gap_ct++;
5106 s_count = 0;
5107 }
5108
5109 KA_TRACE(100, ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
5110 "partition = [%d,%d]\n",
5111 __kmp_gtid_from_thread(team->t.t_threads[f]),
5112 team->t.t_id, f, th->th.th_new_place,
5113 th->th.th_first_place, th->th.th_last_place));
5114 }
5115 KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
5116 }
5117 } break;
5118
5119 default:
5120 break;
5121 }
5122
5123 KA_TRACE(20, ("__kmp_partition_places: exit T#%d\n", team->t.t_id));
5124}
5125
5126#endif // KMP_AFFINITY_SUPPORTED
5127
5128/* allocate a new team data structure to use. take one off of the free pool if
5129 available */
5130kmp_team_t *
5131__kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc,
5132#if OMPT_SUPPORT
5133 ompt_data_t ompt_parallel_data,
5134#endif
5135 kmp_proc_bind_t new_proc_bind,
5136 kmp_internal_control_t *new_icvs,
5137 int argc USE_NESTED_HOT_ARG(kmp_info_t *master)) {
5138 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_allocate_team);
5139 int f;
5140 kmp_team_t *team;
5141 int use_hot_team = !root->r.r_active;
5142 int level = 0;
5143 int do_place_partition = 1;
5144
5145 KA_TRACE(20, ("__kmp_allocate_team: called\n"));
5146 KMP_DEBUG_ASSERT(new_nproc >= 1 && argc >= 0);
5147 KMP_DEBUG_ASSERT(max_nproc >= new_nproc);
5148 KMP_MB();
5149
5150#if KMP_NESTED_HOT_TEAMS
5151 kmp_hot_team_ptr_t *hot_teams;
5152 if (master) {
5153 team = master->th.th_team;
5154 level = team->t.t_active_level;
5155 if (master->th.th_teams_microtask) { // in teams construct?
5156 if (master->th.th_teams_size.nteams > 1 &&
5157 ( // #teams > 1
5158 team->t.t_pkfn ==
5159 (microtask_t)__kmp_teams_master || // inner fork of the teams
5160 master->th.th_teams_level <
5161 team->t.t_level)) { // or nested parallel inside the teams
5162 ++level; // not increment if #teams==1, or for outer fork of the teams;
5163 // increment otherwise
5164 }
5165 // Do not perform the place partition if inner fork of the teams
5166 // Wait until nested parallel region encountered inside teams construct
5167 if ((master->th.th_teams_size.nteams == 1 &&
5168 master->th.th_teams_level >= team->t.t_level) ||
5169 (team->t.t_pkfn == (microtask_t)__kmp_teams_master))
5170 do_place_partition = 0;
5171 }
5172 hot_teams = master->th.th_hot_teams;
5173 if (level < __kmp_hot_teams_max_level && hot_teams &&
5174 hot_teams[level].hot_team) {
5175 // hot team has already been allocated for given level
5176 use_hot_team = 1;
5177 } else {
5178 use_hot_team = 0;
5179 }
5180 } else {
5181 // check we won't access uninitialized hot_teams, just in case
5182 KMP_DEBUG_ASSERT(new_nproc == 1);
5183 }
5184#endif
5185 // Optimization to use a "hot" team
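  // (A "hot" team is the team kept alive between parallel regions -- the root's
  //  r_hot_team, or a per-level entry in th_hot_teams when nested hot teams are
  //  enabled -- so that repeated forks can reuse its threads and storage
  //  instead of allocating them every time.)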
5186 if (use_hot_team && new_nproc > 1) {
5187 KMP_DEBUG_ASSERT(new_nproc <= max_nproc);
5188#if KMP_NESTED_HOT_TEAMS
5189 team = hot_teams[level].hot_team;
5190#else
5191 team = root->r.r_hot_team;
5192#endif
5193#if KMP_DEBUG
5194 if (__kmp_tasking_mode != tskm_immediate_exec) {
5195 KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
5196 "task_team[1] = %p before reinit\n",
5197 team->t.t_task_team[0], team->t.t_task_team[1]));
5198 }
5199#endif
5200
5201 if (team->t.t_nproc != new_nproc &&
5202 __kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5203 // Distributed barrier may need a resize
5204 int old_nthr = team->t.t_nproc;
5205 __kmp_resize_dist_barrier(team, old_nthr, new_nproc);
5206 }
5207
5208 // If not doing the place partition, then reset the team's proc bind
5209 // to indicate that partitioning of all threads still needs to take place
5210 if (do_place_partition == 0)
5211 team->t.t_proc_bind = proc_bind_default;
5212 // Has the number of threads changed?
5213 /* Let's assume the most common case is that the number of threads is
5214 unchanged, and put that case first. */
5215 if (team->t.t_nproc == new_nproc) { // Check changes in number of threads
5216 KA_TRACE(20, ("__kmp_allocate_team: reusing hot team\n"));
5217 // This case can mean that omp_set_num_threads() was called and the hot
5218 // team size was already reduced, so we check the special flag
5219 if (team->t.t_size_changed == -1) {
5220 team->t.t_size_changed = 1;
5221 } else {
5222 KMP_CHECK_UPDATE(team->t.t_size_changed, 0);
5223 }
5224
5225 // TODO???: team->t.t_max_active_levels = new_max_active_levels;
5226 kmp_r_sched_t new_sched = new_icvs->sched;
5227 // set primary thread's schedule as new run-time schedule
5228 KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
5229
5230 __kmp_reinitialize_team(team, new_icvs,
5231 root->r.r_uber_thread->th.th_ident);
5232
5233 KF_TRACE(10, ("__kmp_allocate_team2: T#%d, this_thread=%p team=%p\n", 0,
5234 team->t.t_threads[0], team));
5235 __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
5236
5237#if KMP_AFFINITY_SUPPORTED
5238 if ((team->t.t_size_changed == 0) &&
5239 (team->t.t_proc_bind == new_proc_bind)) {
5240 if (new_proc_bind == proc_bind_spread) {
5241 if (do_place_partition) {
5242 // add flag to update only master for spread
5243 __kmp_partition_places(team, 1);
5244 }
5245 }
5246 KA_TRACE(200, ("__kmp_allocate_team: reusing hot team #%d bindings: "
5247 "proc_bind = %d, partition = [%d,%d]\n",
5248 team->t.t_id, new_proc_bind, team->t.t_first_place,
5249 team->t.t_last_place));
5250 } else {
5251 if (do_place_partition) {
5252 KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5253 __kmp_partition_places(team);
5254 }
5255 }
5256#else
5257 KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5258#endif /* KMP_AFFINITY_SUPPORTED */
5259 } else if (team->t.t_nproc > new_nproc) {
5260 KA_TRACE(20,
5261 ("__kmp_allocate_team: decreasing hot team thread count to %d\n",
5262 new_nproc));
5263
5264 team->t.t_size_changed = 1;
5265 if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5266 // Barrier size already reduced earlier in this function
5267 // Activate team threads via th_used_in_team
5268 __kmp_add_threads_to_team(team, new_nproc);
5269 }
5270#if KMP_NESTED_HOT_TEAMS
5271 if (__kmp_hot_teams_mode == 0) {
5272 // AC: the saved number of threads should correspond to the team's value in
5273 // this mode; it can be bigger in mode 1, when the hot team has threads in reserve
5274 KMP_DEBUG_ASSERT(hot_teams[level].hot_team_nth == team->t.t_nproc);
5275 hot_teams[level].hot_team_nth = new_nproc;
5276#endif // KMP_NESTED_HOT_TEAMS
5277 /* release the extra threads we don't need any more */
5278 for (f = new_nproc; f < team->t.t_nproc; f++) {
5279 KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5280 if (__kmp_tasking_mode != tskm_immediate_exec) {
5281 // When decreasing team size, threads no longer in the team should
5282 // unref task team.
5283 team->t.t_threads[f]->th.th_task_team = NULL;
5284 }
5285 __kmp_free_thread(team->t.t_threads[f]);
5286 team->t.t_threads[f] = NULL;
5287 }
5288#if KMP_NESTED_HOT_TEAMS
5289 } // (__kmp_hot_teams_mode == 0)
5290 else {
5291 // When keeping extra threads in team, switch threads to wait on own
5292 // b_go flag
5293 for (f = new_nproc; f < team->t.t_nproc; ++f) {
5294 KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5295 kmp_balign_t *balign = team->t.t_threads[f]->th.th_bar;
5296 for (int b = 0; b < bs_last_barrier; ++b) {
5297 if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG) {
5298 balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5299 }
5300 KMP_CHECK_UPDATE(balign[b].bb.leaf_kids, 0);
5301 }
5302 }
5303 }
5304#endif // KMP_NESTED_HOT_TEAMS
5305 team->t.t_nproc = new_nproc;
5306 // TODO???: team->t.t_max_active_levels = new_max_active_levels;
5307 KMP_CHECK_UPDATE(team->t.t_sched.sched, new_icvs->sched.sched);
5308 __kmp_reinitialize_team(team, new_icvs,
5309 root->r.r_uber_thread->th.th_ident);
5310
5311 // Update remaining threads
5312 for (f = 0; f < new_nproc; ++f) {
5313 team->t.t_threads[f]->th.th_team_nproc = new_nproc;
5314 }
5315
5316 // restore the current task state of the primary thread: should be the
5317 // implicit task
5318 KF_TRACE(10, ("__kmp_allocate_team: T#%d, this_thread=%p team=%p\n", 0,
5319 team->t.t_threads[0], team));
5320
5321 __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
5322
5323#ifdef KMP_DEBUG
5324 for (f = 0; f < team->t.t_nproc; f++) {
5325 KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5326 team->t.t_threads[f]->th.th_team_nproc ==
5327 team->t.t_nproc);
5328 }
5329#endif
5330
5331 if (do_place_partition) {
5332 KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5333#if KMP_AFFINITY_SUPPORTED
5334 __kmp_partition_places(team);
5335#endif
5336 }
5337 } else { // team->t.t_nproc < new_nproc
5338#if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
5339 kmp_affin_mask_t *old_mask;
5340 if (KMP_AFFINITY_CAPABLE()) {
5341 KMP_CPU_ALLOC(old_mask);
5342 }
5343#endif
5344
5345 KA_TRACE(20,
5346 ("__kmp_allocate_team: increasing hot team thread count to %d\n",
5347 new_nproc));
5348 int old_nproc = team->t.t_nproc; // save old value; used below to update only the new threads
5349 team->t.t_size_changed = 1;
5350
5351#if KMP_NESTED_HOT_TEAMS
5352 int avail_threads = hot_teams[level].hot_team_nth;
5353 if (new_nproc < avail_threads)
5354 avail_threads = new_nproc;
5355 kmp_info_t **other_threads = team->t.t_threads;
5356 for (f = team->t.t_nproc; f < avail_threads; ++f) {
5357 // Adjust barrier data of reserved threads (if any) of the team
5358 // Other data will be set in __kmp_initialize_info() below.
5359 int b;
5360 kmp_balign_t *balign = other_threads[f]->th.th_bar;
5361 for (b = 0; b < bs_last_barrier; ++b) {
5362 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5363 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5364#if USE_DEBUGGER
5365 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5366#endif
5367 }
5368 }
5369 if (hot_teams[level].hot_team_nth >= new_nproc) {
5370 // we have all needed threads in reserve, no need to allocate any
5371 // this is only possible in mode 1; there cannot be reserved threads in mode 0
5372 KMP_DEBUG_ASSERT(__kmp_hot_teams_mode == 1);
5373 team->t.t_nproc = new_nproc; // just get reserved threads involved
5374 } else {
5375 // We may have some threads in reserve, but not enough;
5376 // get reserved threads involved if any.
5377 team->t.t_nproc = hot_teams[level].hot_team_nth;
5378 hot_teams[level].hot_team_nth = new_nproc; // adjust hot team max size
5379#endif // KMP_NESTED_HOT_TEAMS
5380 if (team->t.t_max_nproc < new_nproc) {
5381 /* reallocate larger arrays */
5382 __kmp_reallocate_team_arrays(team, new_nproc);
5383 __kmp_reinitialize_team(team, new_icvs, NULL);
5384 }
5385
5386#if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
5387 /* Temporarily set full mask for primary thread before creation of
5388 workers. The reason is that workers inherit the affinity from the
5389 primary thread, so if a lot of workers are created on the single
5390 core quickly, they don't get a chance to set their own affinity for
5391 a long time. */
5392 __kmp_set_thread_affinity_mask_full_tmp(old_mask);
5393#endif
5394
5395 /* allocate new threads for the hot team */
5396 for (f = team->t.t_nproc; f < new_nproc; f++) {
5397 kmp_info_t *new_worker = __kmp_allocate_thread(root, team, f);
5398 KMP_DEBUG_ASSERT(new_worker);
5399 team->t.t_threads[f] = new_worker;
5400
5401 KA_TRACE(20,
5402 ("__kmp_allocate_team: team %d init T#%d arrived: "
5403 "join=%llu, plain=%llu\n",
5404 team->t.t_id, __kmp_gtid_from_tid(f, team), team->t.t_id, f,
5405 team->t.t_bar[bs_forkjoin_barrier].b_arrived,
5406 team->t.t_bar[bs_plain_barrier].b_arrived));
5407
5408 { // Initialize barrier data for new threads.
5409 int b;
5410 kmp_balign_t *balign = new_worker->th.th_bar;
5411 for (b = 0; b < bs_last_barrier; ++b) {
5412 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5413 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag !=
5414 KMP_BARRIER_PARENT_FLAG);
5415#if USE_DEBUGGER
5416 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5417#endif
5418 }
5419 }
5420 }
5421
5422#if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
5423 if (KMP_AFFINITY_CAPABLE()) {
5424 /* Restore initial primary thread's affinity mask */
5425 __kmp_set_system_affinity(old_mask, TRUE);
5426 KMP_CPU_FREE(old_mask);
5427 }
5428#endif
5429#if KMP_NESTED_HOT_TEAMS
5430 } // end of check of t_nproc vs. new_nproc vs. hot_team_nth
5431#endif // KMP_NESTED_HOT_TEAMS
5432 if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5433 // Barrier size already increased earlier in this function
5434 // Activate team threads via th_used_in_team
5435 __kmp_add_threads_to_team(team, new_nproc);
5436 }
5437 /* make sure everyone is synchronized; the new threads are initialized below */
5439 __kmp_initialize_team(team, new_nproc, new_icvs,
5440 root->r.r_uber_thread->th.th_ident);
5441
5442 /* reinitialize the threads */
5443 KMP_DEBUG_ASSERT(team->t.t_nproc == new_nproc);
5444 for (f = 0; f < team->t.t_nproc; ++f)
5445 __kmp_initialize_info(team->t.t_threads[f], team, f,
5446 __kmp_gtid_from_tid(f, team));
5447
5448 if (level) { // set th_task_state for new threads in nested hot team
5449 // __kmp_initialize_info() no longer zeroes th_task_state, so we should
5450 // only need to set the th_task_state for the new threads. th_task_state
5451 // for primary thread will not be accurate until after this in
5452 // __kmp_fork_call(), so we look to the primary thread's memo_stack to
5453 // get the correct value.
5454 for (f = old_nproc; f < team->t.t_nproc; ++f)
5455 team->t.t_threads[f]->th.th_task_state =
5456 team->t.t_threads[0]->th.th_task_state_memo_stack[level];
5457 } else { // set th_task_state for new threads in non-nested hot team
5458 // copy primary thread's state
5459 kmp_uint8 old_state = team->t.t_threads[0]->th.th_task_state;
5460 for (f = old_nproc; f < team->t.t_nproc; ++f)
5461 team->t.t_threads[f]->th.th_task_state = old_state;
5462 }
5463
5464#ifdef KMP_DEBUG
5465 for (f = 0; f < team->t.t_nproc; ++f) {
5466 KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5467 team->t.t_threads[f]->th.th_team_nproc ==
5468 team->t.t_nproc);
5469 }
5470#endif
5471
5472 if (do_place_partition) {
5473 KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5474#if KMP_AFFINITY_SUPPORTED
5475 __kmp_partition_places(team);
5476#endif
5477 }
5478 } // Check changes in number of threads
5479
5480 kmp_info_t *master = team->t.t_threads[0];
5481 if (master->th.th_teams_microtask) {
5482 for (f = 1; f < new_nproc; ++f) {
5483 // propagate teams construct specific info to workers
5484 kmp_info_t *thr = team->t.t_threads[f];
5485 thr->th.th_teams_microtask = master->th.th_teams_microtask;
5486 thr->th.th_teams_level = master->th.th_teams_level;
5487 thr->th.th_teams_size = master->th.th_teams_size;
5488 }
5489 }
5490#if KMP_NESTED_HOT_TEAMS
5491 if (level) {
5492 // Sync barrier state for nested hot teams, not needed for outermost hot
5493 // team.
5494 for (f = 1; f < new_nproc; ++f) {
5495 kmp_info_t *thr = team->t.t_threads[f];
5496 int b;
5497 kmp_balign_t *balign = thr->th.th_bar;
5498 for (b = 0; b < bs_last_barrier; ++b) {
5499 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5500 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5501#if USE_DEBUGGER
5502 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5503#endif
5504 }
5505 }
5506 }
5507#endif // KMP_NESTED_HOT_TEAMS
5508
5509 /* reallocate space for arguments if necessary */
5510 __kmp_alloc_argv_entries(argc, team, TRUE);
5511 KMP_CHECK_UPDATE(team->t.t_argc, argc);
5512 // The hot team re-uses the previous task team,
5513 // if untouched during the previous release->gather phase.
5514
5515 KF_TRACE(10, (" hot_team = %p\n", team));
5516
5517#if KMP_DEBUG
5518 if (__kmp_tasking_mode != tskm_immediate_exec) {
5519 KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
5520 "task_team[1] = %p after reinit\n",
5521 team->t.t_task_team[0], team->t.t_task_team[1]));
5522 }
5523#endif
5524
5525#if OMPT_SUPPORT
5526 __ompt_team_assign_id(team, ompt_parallel_data);
5527#endif
5528
5529 KMP_MB();
5530
5531 return team;
5532 }
5533
5534 /* next, let's try to take one from the team pool */
5535 KMP_MB();
5536 for (team = CCAST(kmp_team_t *, __kmp_team_pool); (team);) {
5537 /* TODO: consider resizing undersized teams instead of reaping them, now
5538 that we have a resizing mechanism */
5539 if (team->t.t_max_nproc >= max_nproc) {
5540 /* take this team from the team pool */
5541 __kmp_team_pool = team->t.t_next_pool;
5542
5543 if (max_nproc > 1 &&
5544 __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5545 if (!team->t.b) { // Allocate barrier structure
5546 team->t.b = distributedBarrier::allocate(__kmp_dflt_team_nth_ub);
5547 }
5548 }
5549
5550 /* setup the team for fresh use */
5551 __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5552
5553 KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and "
5554 "task_team[1] %p to NULL\n",
5555 &team->t.t_task_team[0], &team->t.t_task_team[1]));
5556 team->t.t_task_team[0] = NULL;
5557 team->t.t_task_team[1] = NULL;
5558
5559 /* reallocate space for arguments if necessary */
5560 __kmp_alloc_argv_entries(argc, team, TRUE);
5561 KMP_CHECK_UPDATE(team->t.t_argc, argc);
5562
5563 KA_TRACE(
5564 20, ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5565 team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5566 { // Initialize barrier data.
5567 int b;
5568 for (b = 0; b < bs_last_barrier; ++b) {
5569 team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5570#if USE_DEBUGGER
5571 team->t.t_bar[b].b_master_arrived = 0;
5572 team->t.t_bar[b].b_team_arrived = 0;
5573#endif
5574 }
5575 }
5576
5577 team->t.t_proc_bind = new_proc_bind;
5578
5579 KA_TRACE(20, ("__kmp_allocate_team: using team from pool %d.\n",
5580 team->t.t_id));
5581
5582#if OMPT_SUPPORT
5583 __ompt_team_assign_id(team, ompt_parallel_data);
5584#endif
5585
5586 KMP_MB();
5587
5588 return team;
5589 }
5590
5591 /* reap team if it is too small, then loop back and check the next one */
5592 // not sure if this is wise, but it will be redone during the hot-teams
5593 // rewrite.
5594 /* TODO: Use technique to find the right size hot-team, don't reap them */
5595 team = __kmp_reap_team(team);
5596 __kmp_team_pool = team;
5597 }
5598
5599 /* nothing available in the pool, no matter, make a new team! */
5600 KMP_MB();
5601 team = (kmp_team_t *)__kmp_allocate(sizeof(kmp_team_t));
5602
5603 /* and set it up */
5604 team->t.t_max_nproc = max_nproc;
5605 if (max_nproc > 1 &&
5606 __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5607 // Allocate barrier structure
5608 team->t.b = distributedBarrier::allocate(__kmp_dflt_team_nth_ub);
5609 }
5610
5611 /* NOTE well: for some reason allocating one big buffer and dividing it up
5612 seems to really hurt performance a lot on the P4, so let's not use this. */
5613 __kmp_allocate_team_arrays(team, max_nproc);
5614
5615 KA_TRACE(20, ("__kmp_allocate_team: making a new team\n"));
5616 __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5617
5618 KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and task_team[1] "
5619 "%p to NULL\n",
5620 &team->t.t_task_team[0], &team->t.t_task_team[1]));
5621 team->t.t_task_team[0] = NULL; // to be removed, as __kmp_allocate zeroes
5622 // memory, no need to duplicate
5623 team->t.t_task_team[1] = NULL; // to be removed, as __kmp_allocate zeroes
5624 // memory, no need to duplicate
5625
5626 if (__kmp_storage_map) {
5627 __kmp_print_team_storage_map("team", team, team->t.t_id, new_nproc);
5628 }
5629
5630 /* allocate space for arguments */
5631 __kmp_alloc_argv_entries(argc, team, FALSE);
5632 team->t.t_argc = argc;
5633
5634 KA_TRACE(20,
5635 ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5636 team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5637 { // Initialize barrier data.
5638 int b;
5639 for (b = 0; b < bs_last_barrier; ++b) {
5640 team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5641#if USE_DEBUGGER
5642 team->t.t_bar[b].b_master_arrived = 0;
5643 team->t.t_bar[b].b_team_arrived = 0;
5644#endif
5645 }
5646 }
5647
5648 team->t.t_proc_bind = new_proc_bind;
5649
5650#if OMPT_SUPPORT
5651 __ompt_team_assign_id(team, ompt_parallel_data);
5652 team->t.ompt_serialized_team_info = NULL;
5653#endif
5654
5655 KMP_MB();
5656
5657 KA_TRACE(20, ("__kmp_allocate_team: done creating a new team %d.\n",
5658 team->t.t_id));
5659
5660 return team;
5661}
5662
5663/* TODO implement hot-teams at all levels */
5664/* TODO implement lazy thread release on demand (disband request) */
5665
5666/* free the team. return it to the team pool. release all the threads
5667 * associated with it */
5668void __kmp_free_team(kmp_root_t *root,
5669 kmp_team_t *team USE_NESTED_HOT_ARG(kmp_info_t *master)) {
5670 int f;
5671 KA_TRACE(20, ("__kmp_free_team: T#%d freeing team %d\n", __kmp_get_gtid(),
5672 team->t.t_id));
5673
5674 /* verify state */
5675 KMP_DEBUG_ASSERT(root);
5676 KMP_DEBUG_ASSERT(team);
5677 KMP_DEBUG_ASSERT(team->t.t_nproc <= team->t.t_max_nproc);
5678 KMP_DEBUG_ASSERT(team->t.t_threads);
5679
5680 int use_hot_team = team == root->r.r_hot_team;
5681#if KMP_NESTED_HOT_TEAMS
5682 int level;
5683 if (master) {
5684 level = team->t.t_active_level - 1;
5685 if (master->th.th_teams_microtask) { // in teams construct?
5686 if (master->th.th_teams_size.nteams > 1) {
5687 ++level; // level was not increased in teams construct for
5688 // team_of_masters
5689 }
5690 if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
5691 master->th.th_teams_level == team->t.t_level) {
5692 ++level; // level was not increased in teams construct for
5693 // team_of_workers before the parallel
5694 } // team->t.t_level will be increased inside parallel
5695 }
5696#if KMP_DEBUG
5697 kmp_hot_team_ptr_t *hot_teams = master->th.th_hot_teams;
5698#endif
5699 if (level < __kmp_hot_teams_max_level) {
5700 KMP_DEBUG_ASSERT(team == hot_teams[level].hot_team);
5701 use_hot_team = 1;
5702 }
5703 }
5704#endif // KMP_NESTED_HOT_TEAMS
5705
5706 /* team is done working */
5707 TCW_SYNC_PTR(team->t.t_pkfn,
5708 NULL); // Important for Debugging Support Library.
5709#if KMP_OS_WINDOWS
5710 team->t.t_copyin_counter = 0; // init counter for possible reuse
5711#endif
5712 // Do not reset pointer to parent team to NULL for hot teams.
5713
5714 /* if we are non-hot team, release our threads */
5715 if (!use_hot_team) {
5716 if (__kmp_tasking_mode != tskm_immediate_exec) {
5717 // Wait for threads to reach reapable state
5718 for (f = 1; f < team->t.t_nproc; ++f) {
5719 KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5720 kmp_info_t *th = team->t.t_threads[f];
5721 volatile kmp_uint32 *state = &th->th.th_reap_state;
5722 while (*state != KMP_SAFE_TO_REAP) {
5723#if KMP_OS_WINDOWS
5724 // On Windows a thread can be killed at any time, check this
5725 DWORD ecode;
5726 if (!__kmp_is_thread_alive(th, &ecode)) {
5727 *state = KMP_SAFE_TO_REAP; // reset the flag for dead thread
5728 break;
5729 }
5730#endif
5731 // first check if thread is sleeping
5732 kmp_flag_64<> fl(&th->th.th_bar[bs_forkjoin_barrier].bb.b_go, th);
5733 if (fl.is_sleeping())
5734 fl.resume(__kmp_gtid_from_thread(th));
5735 KMP_CPU_PAUSE();
5736 }
5737 }
5738
5739 // Delete task teams
5740 int tt_idx;
5741 for (tt_idx = 0; tt_idx < 2; ++tt_idx) {
5742 kmp_task_team_t *task_team = team->t.t_task_team[tt_idx];
5743 if (task_team != NULL) {
5744 for (f = 0; f < team->t.t_nproc; ++f) { // threads unref task teams
5745 KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5746 team->t.t_threads[f]->th.th_task_team = NULL;
5747 }
5748 KA_TRACE(
5749 20,
5750 ("__kmp_free_team: T#%d deactivating task_team %p on team %d\n",
5751 __kmp_get_gtid(), task_team, team->t.t_id));
5752#if KMP_NESTED_HOT_TEAMS
5753 __kmp_free_task_team(master, task_team);
5754#endif
5755 team->t.t_task_team[tt_idx] = NULL;
5756 }
5757 }
5758 }
5759
5760 // Reset pointer to parent team only for non-hot teams.
5761 team->t.t_parent = NULL;
5762 team->t.t_level = 0;
5763 team->t.t_active_level = 0;
5764
5765 /* free the worker threads */
5766 for (f = 1; f < team->t.t_nproc; ++f) {
5767 KMP_DEBUG_ASSERT(team->t.t_threads[f]);
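      // Distributed-barrier bookkeeping: flipping th_used_in_team from 1 to 2
      // marks this worker as being removed from the team; the loop further
      // below waits for the worker to drop the flag to 0 before the team is
      // recycled.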
5768 if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5769 KMP_COMPARE_AND_STORE_ACQ32(&(team->t.t_threads[f]->th.th_used_in_team),
5770 1, 2);
5771 }
5772 __kmp_free_thread(team->t.t_threads[f]);
5773 }
5774
5775 if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5776 if (team->t.b) {
5777 // wake up thread at old location
5778 team->t.b->go_release();
5779 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
5780 for (f = 1; f < team->t.t_nproc; ++f) {
5781 if (team->t.b->sleep[f].sleep) {
5782 __kmp_atomic_resume_64(
5783 team->t.t_threads[f]->th.th_info.ds.ds_gtid,
5784 (kmp_atomic_flag_64<> *)NULL);
5785 }
5786 }
5787 }
5788 // Wait for threads to be removed from team
5789 for (int f = 1; f < team->t.t_nproc; ++f) {
5790 while (team->t.t_threads[f]->th.th_used_in_team.load() != 0)
5791 KMP_CPU_PAUSE();
5792 }
5793 }
5794 }
5795
5796 for (f = 1; f < team->t.t_nproc; ++f) {
5797 team->t.t_threads[f] = NULL;
5798 }
5799
5800 if (team->t.t_max_nproc > 1 &&
5801 __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5802 distributedBarrier::deallocate(team->t.b);
5803 team->t.b = NULL;
5804 }
5805 /* put the team back in the team pool */
5806 /* TODO limit size of team pool, call reap_team if pool too large */
5807 team->t.t_next_pool = CCAST(kmp_team_t *, __kmp_team_pool);
5808 __kmp_team_pool = (volatile kmp_team_t *)team;
5809 } else { // Check if team was created for primary threads in teams construct
5810 // See if first worker is a CG root
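    // (A CG root -- contention group root -- is the head node of the
    //  th_cg_roots list; it carries the cg_thread_limit that is restored into
    //  the implicit task below.)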
5811 KMP_DEBUG_ASSERT(team->t.t_threads[1] &&
5812 team->t.t_threads[1]->th.th_cg_roots);
5813 if (team->t.t_threads[1]->th.th_cg_roots->cg_root == team->t.t_threads[1]) {
5814 // Clean up the CG root nodes on workers so that this team can be re-used
5815 for (f = 1; f < team->t.t_nproc; ++f) {
5816 kmp_info_t *thr = team->t.t_threads[f];
5817 KMP_DEBUG_ASSERT(thr && thr->th.th_cg_roots &&
5818 thr->th.th_cg_roots->cg_root == thr);
5819 // Pop current CG root off list
5820 kmp_cg_root_t *tmp = thr->th.th_cg_roots;
5821 thr->th.th_cg_roots = tmp->up;
5822 KA_TRACE(100, ("__kmp_free_team: Thread %p popping node %p and moving"
5823 " up to node %p. cg_nthreads was %d\n",
5824 thr, tmp, thr->th.th_cg_roots, tmp->cg_nthreads));
5825 int i = tmp->cg_nthreads--;
5826 if (i == 1) {
5827 __kmp_free(tmp); // free CG if we are the last thread in it
5828 }
5829 // Restore current task's thread_limit from CG root
5830 if (thr->th.th_cg_roots)
5831 thr->th.th_current_task->td_icvs.thread_limit =
5832 thr->th.th_cg_roots->cg_thread_limit;
5833 }
5834 }
5835 }
5836
5837 KMP_MB();
5838}
5839
5840/* reap the team. destroy it, reclaim all its resources and free its memory */
5841kmp_team_t *__kmp_reap_team(kmp_team_t *team) {
5842 kmp_team_t *next_pool = team->t.t_next_pool;
5843
5844 KMP_DEBUG_ASSERT(team);
5845 KMP_DEBUG_ASSERT(team->t.t_dispatch);
5846 KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
5847 KMP_DEBUG_ASSERT(team->t.t_threads);
5848 KMP_DEBUG_ASSERT(team->t.t_argv);
5849
5850 /* TODO clean the threads that are a part of this? */
5851
5852 /* free stuff */
5853 __kmp_free_team_arrays(team);
5854 if (team->t.t_argv != &team->t.t_inline_argv[0])
5855 __kmp_free((void *)team->t.t_argv);
5856 __kmp_free(team);
5857
5858 KMP_MB();
5859 return next_pool;
5860}
5861
5862// Free the thread. Don't reap it, just place it on the pool of available
5863// threads.
5864//
5865// Changes for Quad issue 527845: We need a predictable OMP tid <-> gtid
5866// binding for the affinity mechanism to be useful.
5867//
5868// Now, we always keep the free list (__kmp_thread_pool) sorted by gtid.
5869// However, we want to avoid a potential performance problem by always
5870// scanning through the list to find the correct point at which to insert
5871// the thread (potential N**2 behavior). To do this we keep track of the
5872// last place a thread struct was inserted (__kmp_thread_pool_insert_pt).
5873// With single-level parallelism, threads will always be added to the tail
5874// of the list, kept track of by __kmp_thread_pool_insert_pt. With nested
5875// parallelism, all bets are off and we may need to scan through the entire
5876// free list.
5877//
5878// This change also has a potentially large performance benefit, for some
5879// applications. Previously, as threads were freed from the hot team, they
5880// would be placed back on the free list in inverse order. If the hot team
5881// grew back to its original size, then the freed threads would be placed
5882// back on the hot team in reverse order. This could cause bad cache
5883// locality problems on programs where the size of the hot team regularly
5884// grew and shrank.
5885//
5886// Now, for single-level parallelism, the OMP tid is always == gtid.
5887void __kmp_free_thread(kmp_info_t *this_th) {
5888 int gtid;
5889 kmp_info_t **scan;
5890
5891 KA_TRACE(20, ("__kmp_free_thread: T#%d putting T#%d back on free pool.\n",
5892 __kmp_get_gtid(), this_th->th.th_info.ds.ds_gtid));
5893
5894 KMP_DEBUG_ASSERT(this_th);
5895
5896 // When moving a thread to the pool, switch it to wait on its own b_go flag
5897 // and leave it with no team (NULL team pointer).
5898 int b;
5899 kmp_balign_t *balign = this_th->th.th_bar;
5900 for (b = 0; b < bs_last_barrier; ++b) {
5901 if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG)
5902 balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5903 balign[b].bb.team = NULL;
5904 balign[b].bb.leaf_kids = 0;
5905 }
5906 this_th->th.th_task_state = 0;
5907 this_th->th.th_reap_state = KMP_SAFE_TO_REAP;
5908
5909 /* put thread back on the free pool */
5910 TCW_PTR(this_th->th.th_team, NULL);
5911 TCW_PTR(this_th->th.th_root, NULL);
5912 TCW_PTR(this_th->th.th_dispatch, NULL); /* NOT NEEDED */
5913
5914 while (this_th->th.th_cg_roots) {
5915 this_th->th.th_cg_roots->cg_nthreads--;
5916 KA_TRACE(100, ("__kmp_free_thread: Thread %p decrement cg_nthreads on node"
5917 " %p of thread %p to %d\n",
5918 this_th, this_th->th.th_cg_roots,
5919 this_th->th.th_cg_roots->cg_root,
5920 this_th->th.th_cg_roots->cg_nthreads));
5921 kmp_cg_root_t *tmp = this_th->th.th_cg_roots;
5922 if (tmp->cg_root == this_th) { // Thread is a cg_root
5923 KMP_DEBUG_ASSERT(tmp->cg_nthreads == 0);
5924 KA_TRACE(
5925 5, ("__kmp_free_thread: Thread %p freeing node %p\n", this_th, tmp));
5926 this_th->th.th_cg_roots = tmp->up;
5927 __kmp_free(tmp);
5928 } else { // Worker thread
5929 if (tmp->cg_nthreads == 0) { // last thread leaves contention group
5930 __kmp_free(tmp);
5931 }
5932 this_th->th.th_cg_roots = NULL;
5933 break;
5934 }
5935 }
5936
5937 /* If the implicit task assigned to this thread can be used by other threads,
5938 * multiple threads may share the data and try to free the task at
5939 * __kmp_reap_thread at exit. This duplicate use of the task data can happen
5940 * with higher probability when the hot team is disabled, but it can occur even
5941 * when the hot team is enabled. */
5942 __kmp_free_implicit_task(this_th);
5943 this_th->th.th_current_task = NULL;
5944
5945 // If the __kmp_thread_pool_insert_pt is already past the new insert
5946 // point, then we need to re-scan the entire list.
5947 gtid = this_th->th.th_info.ds.ds_gtid;
5948 if (__kmp_thread_pool_insert_pt != NULL) {
5949 KMP_DEBUG_ASSERT(__kmp_thread_pool != NULL);
5950 if (__kmp_thread_pool_insert_pt->th.th_info.ds.ds_gtid > gtid) {
5951 __kmp_thread_pool_insert_pt = NULL;
5952 }
5953 }
5954
5955 // Scan down the list to find the place to insert the thread.
5956 // scan is the address of a link in the list, possibly the address of
5957 // __kmp_thread_pool itself.
5958 //
5959 // In the absence of nested parallelism, the for loop will have 0 iterations.
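  // (Illustrative example: if the pool currently holds gtids 2, 5 and 9 and the
  //  freed thread has gtid 7, the scan stops at the link pointing to gtid 9 and
  //  the thread is spliced in between 5 and 9, keeping the pool sorted.)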
5960 if (__kmp_thread_pool_insert_pt != NULL) {
5961 scan = &(__kmp_thread_pool_insert_pt->th.th_next_pool);
5962 } else {
5963 scan = CCAST(kmp_info_t **, &__kmp_thread_pool);
5964 }
5965 for (; (*scan != NULL) && ((*scan)->th.th_info.ds.ds_gtid < gtid);
5966 scan = &((*scan)->th.th_next_pool))
5967 ;
5968
5969 // Insert the new element on the list, and set __kmp_thread_pool_insert_pt
5970 // to its address.
5971 TCW_PTR(this_th->th.th_next_pool, *scan);
5972 __kmp_thread_pool_insert_pt = *scan = this_th;
5973 KMP_DEBUG_ASSERT((this_th->th.th_next_pool == NULL) ||
5974 (this_th->th.th_info.ds.ds_gtid <
5975 this_th->th.th_next_pool->th.th_info.ds.ds_gtid));
5976 TCW_4(this_th->th.th_in_pool, TRUE);
5977 __kmp_suspend_initialize_thread(this_th);
5978 __kmp_lock_suspend_mx(this_th);
5979 if (this_th->th.th_active == TRUE) {
5980 KMP_ATOMIC_INC(&__kmp_thread_pool_active_nth);
5981 this_th->th.th_active_in_pool = TRUE;
5982 }
5983#if KMP_DEBUG
5984 else {
5985 KMP_DEBUG_ASSERT(this_th->th.th_active_in_pool == FALSE);
5986 }
5987#endif
5988 __kmp_unlock_suspend_mx(this_th);
5989
5990 TCW_4(__kmp_nth, __kmp_nth - 1);
5991
5992#ifdef KMP_ADJUST_BLOCKTIME
5993 /* Adjust blocktime back to user setting or default if necessary */
5994 /* Middle initialization might never have occurred */
5995 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
5996 KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
5997 if (__kmp_nth <= __kmp_avail_proc) {
5998 __kmp_zero_bt = FALSE;
5999 }
6000 }
6001#endif /* KMP_ADJUST_BLOCKTIME */
6002
6003 KMP_MB();
6004}
6005
6006/* ------------------------------------------------------------------------ */
6007
6008void *__kmp_launch_thread(kmp_info_t *this_thr) {
6009#if OMP_PROFILING_SUPPORT
6010 ProfileTraceFile = getenv("LIBOMPTARGET_PROFILE");
6011 // TODO: add a configuration option for time granularity
6012 if (ProfileTraceFile)
6013 llvm::timeTraceProfilerInitialize(500 /* us */, "libomptarget");
6014#endif
6015
6016 int gtid = this_thr->th.th_info.ds.ds_gtid;
6017 /* void *stack_data;*/
6018 kmp_team_t **volatile pteam;
6019
6020 KMP_MB();
6021 KA_TRACE(10, ("__kmp_launch_thread: T#%d start\n", gtid));
6022
6023 if (__kmp_env_consistency_check) {
6024 this_thr->th.th_cons = __kmp_allocate_cons_stack(gtid); // ATT: Memory leak?
6025 }
6026
6027#if OMPD_SUPPORT
6028 if (ompd_state & OMPD_ENABLE_BP)
6029 ompd_bp_thread_begin();
6030#endif
6031
6032#if OMPT_SUPPORT
6033 ompt_data_t *thread_data = nullptr;
6034 if (ompt_enabled.enabled) {
6035 thread_data = &(this_thr->th.ompt_thread_info.thread_data);
6036 *thread_data = ompt_data_none;
6037
6038 this_thr->th.ompt_thread_info.state = ompt_state_overhead;
6039 this_thr->th.ompt_thread_info.wait_id = 0;
6040 this_thr->th.ompt_thread_info.idle_frame = OMPT_GET_FRAME_ADDRESS(0);
6041 this_thr->th.ompt_thread_info.parallel_flags = 0;
6042 if (ompt_enabled.ompt_callback_thread_begin) {
6043 ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
6044 ompt_thread_worker, thread_data);
6045 }
6046 this_thr->th.ompt_thread_info.state = ompt_state_idle;
6047 }
6048#endif
6049
6050 /* This is the place where threads wait for work */
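  // (Worker lifecycle: sleep/spin in the fork barrier until a team hands this
  //  thread work, run the microtask via t_invoke, pass through the join
  //  barrier, and repeat until library shutdown sets __kmp_global.g.g_done.)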
6051 while (!TCR_4(__kmp_global.g.g_done)) {
6052 KMP_DEBUG_ASSERT(this_thr == __kmp_threads[gtid]);
6053 KMP_MB();
6054
6055 /* wait for work to do */
6056 KA_TRACE(20, ("__kmp_launch_thread: T#%d waiting for work\n", gtid));
6057
6058 /* No tid yet since not part of a team */
6059 __kmp_fork_barrier(gtid, KMP_GTID_DNE);
6060
6061#if OMPT_SUPPORT
6062 if (ompt_enabled.enabled) {
6063 this_thr->th.ompt_thread_info.state = ompt_state_overhead;
6064 }
6065#endif
6066
6067 pteam = &this_thr->th.th_team;
6068
6069 /* have we been allocated? */
6070 if (TCR_SYNC_PTR(*pteam) && !TCR_4(__kmp_global.g.g_done)) {
6071 /* we were just woken up, so run our new task */
6072 if (TCR_SYNC_PTR((*pteam)->t.t_pkfn) != NULL) {
6073 int rc;
6074 KA_TRACE(20,
6075 ("__kmp_launch_thread: T#%d(%d:%d) invoke microtask = %p\n",
6076 gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
6077 (*pteam)->t.t_pkfn));
6078
6079 updateHWFPControl(*pteam);
6080
6081#if OMPT_SUPPORT
6082 if (ompt_enabled.enabled) {
6083 this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
6084 }
6085#endif
6086
6087 rc = (*pteam)->t.t_invoke(gtid);
6088 KMP_ASSERT(rc);
6089
6090 KMP_MB();
6091 KA_TRACE(20, ("__kmp_launch_thread: T#%d(%d:%d) done microtask = %p\n",
6092 gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
6093 (*pteam)->t.t_pkfn));
6094 }
6095#if OMPT_SUPPORT
6096 if (ompt_enabled.enabled) {
6097 /* no frame set while outside task */
6098 __ompt_get_task_info_object(0)->frame.exit_frame = ompt_data_none;
6099
6100 this_thr->th.ompt_thread_info.state = ompt_state_overhead;
6101 }
6102#endif
6103 /* join barrier after parallel region */
6104 __kmp_join_barrier(gtid);
6105 }
6106 }
6107 TCR_SYNC_PTR((intptr_t)__kmp_global.g.g_done);
6108
6109#if OMPD_SUPPORT
6110 if (ompd_state & OMPD_ENABLE_BP)
6111 ompd_bp_thread_end();
6112#endif
6113
6114#if OMPT_SUPPORT
6115 if (ompt_enabled.ompt_callback_thread_end) {
6116 ompt_callbacks.ompt_callback(ompt_callback_thread_end)(thread_data);
6117 }
6118#endif
6119
6120 this_thr->th.th_task_team = NULL;
6121 /* run the destructors for the threadprivate data for this thread */
6122 __kmp_common_destroy_gtid(gtid);
6123
6124 KA_TRACE(10, ("__kmp_launch_thread: T#%d done\n", gtid));
6125 KMP_MB();
6126
6127#if OMP_PROFILING_SUPPORT
6128 llvm::timeTraceProfilerFinishThread();
6129#endif
6130 return this_thr;
6131}
6132
6133/* ------------------------------------------------------------------------ */
6134
6135void __kmp_internal_end_dest(void *specific_gtid) {
6136 // Make sure no significant bits are lost
6137 int gtid;
6138 __kmp_type_convert((kmp_intptr_t)specific_gtid - 1, &gtid);
6139
6140 KA_TRACE(30, ("__kmp_internal_end_dest: T#%d\n", gtid));
6141 /* NOTE: the gtid is stored as gtid+1 in the thread-local storage
6142 * because 0 is reserved for the nothing-stored case */
6143
6144 __kmp_internal_end_thread(gtid);
6145}
6146
6147#if KMP_OS_UNIX && KMP_DYNAMIC_LIB
6148
6149__attribute__((destructor)) void __kmp_internal_end_dtor(void) {
6150 __kmp_internal_end_atexit();
6151}
6152
6153#endif
6154
6155/* [Windows] josh: when the atexit handler is called, there may still be more
6156 than one thread alive */
6157void __kmp_internal_end_atexit(void) {
6158 KA_TRACE(30, ("__kmp_internal_end_atexit\n"));
6159 /* [Windows]
6160 josh: ideally, we want to completely shut down the library in this atexit
6161 handler, but stat code that depends on thread specific data for gtid fails
6162 because that data becomes unavailable at some point during the shutdown, so
6163 we call __kmp_internal_end_thread instead. We should eventually remove the
6164 dependency on __kmp_get_specific_gtid in the stat code and use
6165 __kmp_internal_end_library to cleanly shut down the library.
6166
6167 // TODO: Can some of this comment about GVS be removed?
6168 I suspect that the offending stat code is executed when the calling thread
6169 tries to clean up a dead root thread's data structures, resulting in GVS
6170 code trying to close the GVS structures for that thread, but since the stat
6171 code uses __kmp_get_specific_gtid to get the gtid with the assumption that
6172 the calling thread is cleaning up itself instead of another thread, it gets
6173 confused. This happens because allowing a thread to unregister and clean up
6174 another thread is a recent modification for addressing an issue.
6175 Based on the current design (20050722), a thread may end up
6176 trying to unregister another thread only if thread death does not trigger
6177 the calling of __kmp_internal_end_thread. For Linux* OS, there is the
6178 thread specific data destructor function to detect thread death. For
6179 Windows dynamic, there is DllMain(THREAD_DETACH). For Windows static, there
6180 is nothing. Thus, the workaround is applicable only for Windows static
6181 stat library. */
6182 __kmp_internal_end_library(-1);
6183#if KMP_OS_WINDOWS
6184 __kmp_close_console();
6185#endif
6186}
6187
6188static void __kmp_reap_thread(kmp_info_t *thread, int is_root) {
6189 // It is assumed __kmp_forkjoin_lock is acquired.
6190
6191 int gtid;
6192
6193 KMP_DEBUG_ASSERT(thread != NULL);
6194
6195 gtid = thread->th.th_info.ds.ds_gtid;
6196
6197 if (!is_root) {
6198 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
6199 /* Assume the threads are at the fork barrier here */
6200 KA_TRACE(
6201 20, ("__kmp_reap_thread: releasing T#%d from fork barrier for reap\n",
6202 gtid));
6203 if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
6204 while (
6205 !KMP_COMPARE_AND_STORE_ACQ32(&(thread->th.th_used_in_team), 0, 3))
6206 KMP_CPU_PAUSE();
6207 __kmp_resume_32(gtid, (kmp_flag_32<false, false> *)NULL);
6208 } else {
6209 /* Need release fence here to prevent seg faults for tree forkjoin
6210 barrier (GEH) */
6211 kmp_flag_64<> flag(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go,
6212 thread);
6213 __kmp_release_64(&flag);
6214 }
6215 }
6216
6217 // Terminate OS thread.
6218 __kmp_reap_worker(thread);
6219
6220 // The thread was killed asynchronously. If it was actively
6221 // spinning in the thread pool, decrement the global count.
6222 //
6223 // There is a small timing hole here - if the worker thread was just waking
6224 // up after sleeping in the pool, had reset its th_active_in_pool flag but
6225 // not decremented the global counter __kmp_thread_pool_active_nth yet, then
6226 // the global counter might not get updated.
6227 //
6228 // Currently, this can only happen as the library is unloaded,
6229 // so there are no harmful side effects.
6230 if (thread->th.th_active_in_pool) {
6231 thread->th.th_active_in_pool = FALSE;
6232 KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
6233 KMP_DEBUG_ASSERT(__kmp_thread_pool_active_nth >= 0);
6234 }
6235 }
6236
6237 __kmp_free_implicit_task(thread);
6238
6239// Free the fast memory for tasking
6240#if USE_FAST_MEMORY
6241 __kmp_free_fast_memory(thread);
6242#endif /* USE_FAST_MEMORY */
6243
6244 __kmp_suspend_uninitialize_thread(thread);
6245
6246 KMP_DEBUG_ASSERT(__kmp_threads[gtid] == thread);
6247 TCW_SYNC_PTR(__kmp_threads[gtid], NULL);
6248
6249 --__kmp_all_nth;
6250 // __kmp_nth was decremented when thread is added to the pool.
6251
6252#ifdef KMP_ADJUST_BLOCKTIME
6253 /* Adjust blocktime back to user setting or default if necessary */
6254 /* Middle initialization might never have occurred */
6255 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
6256 KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
6257 if (__kmp_nth <= __kmp_avail_proc) {
6258 __kmp_zero_bt = FALSE;
6259 }
6260 }
6261#endif /* KMP_ADJUST_BLOCKTIME */
6262
6263 /* free the memory being used */
6264 if (__kmp_env_consistency_check) {
6265 if (thread->th.th_cons) {
6266 __kmp_free_cons_stack(thread->th.th_cons);
6267 thread->th.th_cons = NULL;
6268 }
6269 }
6270
6271 if (thread->th.th_pri_common != NULL) {
6272 __kmp_free(thread->th.th_pri_common);
6273 thread->th.th_pri_common = NULL;
6274 }
6275
6276 if (thread->th.th_task_state_memo_stack != NULL) {
6277 __kmp_free(thread->th.th_task_state_memo_stack);
6278 thread->th.th_task_state_memo_stack = NULL;
6279 }
6280
6281#if KMP_USE_BGET
6282 if (thread->th.th_local.bget_data != NULL) {
6283 __kmp_finalize_bget(thread);
6284 }
6285#endif
6286
6287#if KMP_AFFINITY_SUPPORTED
6288 if (thread->th.th_affin_mask != NULL) {
6289 KMP_CPU_FREE(thread->th.th_affin_mask);
6290 thread->th.th_affin_mask = NULL;
6291 }
6292#endif /* KMP_AFFINITY_SUPPORTED */
6293
6294#if KMP_USE_HIER_SCHED
6295 if (thread->th.th_hier_bar_data != NULL) {
6296 __kmp_free(thread->th.th_hier_bar_data);
6297 thread->th.th_hier_bar_data = NULL;
6298 }
6299#endif
6300
6301 __kmp_reap_team(thread->th.th_serial_team);
6302 thread->th.th_serial_team = NULL;
6303 __kmp_free(thread);
6304
6305 KMP_MB();
6306
6307} // __kmp_reap_thread
6308
6309static void __kmp_itthash_clean(kmp_info_t *th) {
6310#if USE_ITT_NOTIFY
6311 if (__kmp_itt_region_domains.count > 0) {
6312 for (int i = 0; i < KMP_MAX_FRAME_DOMAINS; ++i) {
6313 kmp_itthash_entry_t *bucket = __kmp_itt_region_domains.buckets[i];
6314 while (bucket) {
6315 kmp_itthash_entry_t *next = bucket->next_in_bucket;
6316 __kmp_thread_free(th, bucket);
6317 bucket = next;
6318 }
6319 }
6320 }
6321 if (__kmp_itt_barrier_domains.count > 0) {
6322 for (int i = 0; i < KMP_MAX_FRAME_DOMAINS; ++i) {
6323 kmp_itthash_entry_t *bucket = __kmp_itt_barrier_domains.buckets[i];
6324 while (bucket) {
6325 kmp_itthash_entry_t *next = bucket->next_in_bucket;
6326 __kmp_thread_free(th, bucket);
6327 bucket = next;
6328 }
6329 }
6330 }
6331#endif
6332}
6333
6334static void __kmp_internal_end(void) {
6335 int i;
6336
6337 /* First, unregister the library */
6338 __kmp_unregister_library();
6339
6340#if KMP_OS_WINDOWS
6341 /* In Win static library, we can't tell when a root actually dies, so we
6342 reclaim the data structures for any root threads that have died but not
6343 unregistered themselves, in order to shut down cleanly.
6344 In Win dynamic library we also can't tell when a thread dies. */
6345 __kmp_reclaim_dead_roots(); // AC: moved here to always clean resources of
6346// dead roots
6347#endif
6348
6349 for (i = 0; i < __kmp_threads_capacity; i++)
6350 if (__kmp_root[i])
6351 if (__kmp_root[i]->r.r_active)
6352 break;
6353 KMP_MB(); /* Flush all pending memory write invalidates. */
6354 TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6355
6356 if (i < __kmp_threads_capacity) {
6357#if KMP_USE_MONITOR
6358 // 2009-09-08 (lev): Other alive roots found. Why do we kill the monitor??
6359 KMP_MB(); /* Flush all pending memory write invalidates. */
6360
6361 // Need to check that monitor was initialized before reaping it. If we are
6362 // called from __kmp_atfork_child (which sets __kmp_init_parallel = 0), then
6363 // __kmp_monitor will appear to contain valid data, but it is only valid in
6364 // the parent process, not the child.
6365 // New behavior (201008): instead of keying off of the flag
6366 // __kmp_init_parallel, the monitor thread creation is keyed off
6367 // of the new flag __kmp_init_monitor.
6368 __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
6369 if (TCR_4(__kmp_init_monitor)) {
6370 __kmp_reap_monitor(&__kmp_monitor);
6371 TCW_4(__kmp_init_monitor, 0);
6372 }
6373 __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
6374 KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
6375#endif // KMP_USE_MONITOR
6376 } else {
6377/* TODO move this to cleanup code */
6378#ifdef KMP_DEBUG
6379 /* make sure that everything has properly ended */
6380 for (i = 0; i < __kmp_threads_capacity; i++) {
6381 if (__kmp_root[i]) {
6382 // KMP_ASSERT( ! KMP_UBER_GTID( i ) ); // AC:
6383 // there can be uber threads alive here
6384 KMP_ASSERT(!__kmp_root[i]->r.r_active); // TODO: can they be active?
6385 }
6386 }
6387#endif
6388
6389 KMP_MB();
6390
6391 // Reap the worker threads.
6392 // This is valid for now, but be careful if threads are reaped sooner.
6393 while (__kmp_thread_pool != NULL) { // Loop through all threads in the pool.
6394 // Get the next thread from the pool.
6395 kmp_info_t *thread = CCAST(kmp_info_t *, __kmp_thread_pool);
6396 __kmp_thread_pool = thread->th.th_next_pool;
6397 // Reap it.
6398 KMP_DEBUG_ASSERT(thread->th.th_reap_state == KMP_SAFE_TO_REAP);
6399 thread->th.th_next_pool = NULL;
6400 thread->th.th_in_pool = FALSE;
6401 __kmp_reap_thread(thread, 0);
6402 }
6403 __kmp_thread_pool_insert_pt = NULL;
6404
6405 // Reap teams.
6406 while (__kmp_team_pool != NULL) { // Loop through all teams in the pool.
6407 // Get the next team from the pool.
6408 kmp_team_t *team = CCAST(kmp_team_t *, __kmp_team_pool);
6409 __kmp_team_pool = team->t.t_next_pool;
6410 // Reap it.
6411 team->t.t_next_pool = NULL;
6412 __kmp_reap_team(team);
6413 }
6414
6415 __kmp_reap_task_teams();
6416
6417#if KMP_OS_UNIX
6418 // Threads that are not reaped should not access any resources since they
6419 // are going to be deallocated soon, so the shutdown sequence should wait
6420 // until all threads either exit the final spin-waiting loop or begin
6421 // sleeping after the given blocktime.
6422 for (i = 0; i < __kmp_threads_capacity; i++) {
6423 kmp_info_t *thr = __kmp_threads[i];
6424 while (thr && KMP_ATOMIC_LD_ACQ(&thr->th.th_blocking))
6425 KMP_CPU_PAUSE();
6426 }
6427#endif
6428
6429 for (i = 0; i < __kmp_threads_capacity; ++i) {
6430 // TBD: Add some checking...
6431 // Something like KMP_DEBUG_ASSERT( __kmp_thread[ i ] == NULL );
6432 }
6433
6434 /* Make sure all threadprivate destructors get run by joining with all
6435 worker threads before resetting this flag */
6436 TCW_SYNC_4(__kmp_init_common, FALSE);
6437
6438 KA_TRACE(10, ("__kmp_internal_end: all workers reaped\n"));
6439 KMP_MB();
6440
6441#if KMP_USE_MONITOR
6442 // See note above: One of the possible fixes for CQ138434 / CQ140126
6443 //
6444 // FIXME: push both code fragments down and CSE them?
6445 // push them into __kmp_cleanup() ?
6446 __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
6447 if (TCR_4(__kmp_init_monitor)) {
6448 __kmp_reap_monitor(&__kmp_monitor);
6449 TCW_4(__kmp_init_monitor, 0);
6450 }
6451 __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
6452 KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
6453#endif
6454 } /* else !__kmp_global.t_active */
6455 TCW_4(__kmp_init_gtid, FALSE);
6456 KMP_MB(); /* Flush all pending memory write invalidates. */
6457
6458 __kmp_cleanup();
6459#if OMPT_SUPPORT
6460 ompt_fini();
6461#endif
6462}
6463
6464void __kmp_internal_end_library(int gtid_req) {
6465 /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6466 /* this shouldn't be a race condition because __kmp_internal_end() is the
6467 only place to clear __kmp_serial_init */
6468 /* we'll check this later too, after we get the lock */
6469 // 2009-09-06: We do not set g_abort without setting g_done. This check looks
6470 // redundant, because the next check will work in any case.
6471 if (__kmp_global.g.g_abort) {
6472 KA_TRACE(11, ("__kmp_internal_end_library: abort, exiting\n"));
6473 /* TODO abort? */
6474 return;
6475 }
6476 if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6477 KA_TRACE(10, ("__kmp_internal_end_library: already finished\n"));
6478 return;
6479 }
6480
6481 // If hidden helper team has been initialized, we need to deinit it
6482 if (TCR_4(__kmp_init_hidden_helper) &&
6483 !TCR_4(__kmp_hidden_helper_team_done)) {
6484 TCW_SYNC_4(__kmp_hidden_helper_team_done, TRUE);
6485 // First release the main thread to let it continue its work
6486 __kmp_hidden_helper_main_thread_release();
6487 // Wait until the hidden helper team has been destroyed
6488 __kmp_hidden_helper_threads_deinitz_wait();
6489 }
6490
6491 KMP_MB(); /* Flush all pending memory write invalidates. */
6492 /* find out who we are and what we should do */
6493 {
6494 int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6495 KA_TRACE(
6496 10, ("__kmp_internal_end_library: enter T#%d (%d)\n", gtid, gtid_req));
6497 if (gtid == KMP_GTID_SHUTDOWN) {
6498 KA_TRACE(10, ("__kmp_internal_end_library: !__kmp_init_runtime, system "
6499 "already shutdown\n"));
6500 return;
6501 } else if (gtid == KMP_GTID_MONITOR) {
6502 KA_TRACE(10, ("__kmp_internal_end_library: monitor thread, gtid not "
6503 "registered, or system shutdown\n"));
6504 return;
6505 } else if (gtid == KMP_GTID_DNE) {
6506 KA_TRACE(10, ("__kmp_internal_end_library: gtid not registered or system "
6507 "shutdown\n"));
6508 /* we don't know who we are, but we may still shut down the library */
6509 } else if (KMP_UBER_GTID(gtid)) {
6510 /* unregister ourselves as an uber thread. gtid is no longer valid */
6511 if (__kmp_root[gtid]->r.r_active) {
6512 __kmp_global.g.g_abort = -1;
6513 TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6514 __kmp_unregister_library();
6515 KA_TRACE(10,
6516 ("__kmp_internal_end_library: root still active, abort T#%d\n",
6517 gtid));
6518 return;
6519 } else {
6520 __kmp_itthash_clean(__kmp_threads[gtid]);
6521 KA_TRACE(
6522 10,
6523 ("__kmp_internal_end_library: unregistering sibling T#%d\n", gtid));
6524 __kmp_unregister_root_current_thread(gtid);
6525 }
6526 } else {
6527/* worker threads may call this function through the atexit handler, if they
6528 * call exit() */
6529/* For now, skip the usual subsequent processing and just dump the debug buffer.
6530 TODO: do a thorough shutdown instead */
6531#ifdef DUMP_DEBUG_ON_EXIT
6532 if (__kmp_debug_buf)
6533 __kmp_dump_debug_buffer();
6534#endif
6535 // The unregister-library call was added here when we switched to shared
6536 // memory on Linux; without it, lots of files would be left in /dev/shm.
6537 // Clean up the shared memory file before exiting.
6538 __kmp_unregister_library();
6539 return;
6540 }
6541 }
6542 /* synchronize the termination process */
6543 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6544
6545 /* have we already finished */
6546 if (__kmp_global.g.g_abort) {
6547 KA_TRACE(10, ("__kmp_internal_end_library: abort, exiting\n"));
6548 /* TODO abort? */
6549 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6550 return;
6551 }
6552 if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6553 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6554 return;
6555 }
6556
6557 /* We need this lock to enforce mutex between this reading of
6558 __kmp_threads_capacity and the writing by __kmp_register_root.
6559 Alternatively, we can use a counter of roots that is atomically updated by
6560 __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6561 __kmp_internal_end_*. */
6562 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6563
6564 /* now we can safely conduct the actual termination */
6565 __kmp_internal_end();
6566
6567 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6568 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6569
6570 KA_TRACE(10, ("__kmp_internal_end_library: exit\n"));
6571
6572#ifdef DUMP_DEBUG_ON_EXIT
6573 if (__kmp_debug_buf)
6574 __kmp_dump_debug_buffer();
6575#endif
6576
6577#if KMP_OS_WINDOWS
6578 __kmp_close_console();
6579#endif
6580
6581 __kmp_fini_allocator();
6582
6583} // __kmp_internal_end_library
6584
6585void __kmp_internal_end_thread(int gtid_req) {
6586 int i;
6587
6588 /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6589 /* this shouldn't be a race condition because __kmp_internal_end() is the
6590 * only place to clear __kmp_serial_init */
6591 /* we'll check this later too, after we get the lock */
6592 // 2009-09-06: We do not set g_abort without setting g_done. This check looks
6593 // redundant, because the next check will work in any case.
6594 if (__kmp_global.g.g_abort) {
6595 KA_TRACE(11, ("__kmp_internal_end_thread: abort, exiting\n"));
6596 /* TODO abort? */
6597 return;
6598 }
6599 if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6600 KA_TRACE(10, ("__kmp_internal_end_thread: already finished\n"));
6601 return;
6602 }
6603
6604 // If hidden helper team has been initialized, we need to deinit it
6605 if (TCR_4(__kmp_init_hidden_helper) &&
6606 !TCR_4(__kmp_hidden_helper_team_done)) {
6607 TCW_SYNC_4(__kmp_hidden_helper_team_done, TRUE);
6608 // First release the main thread to let it continue its work
6609 __kmp_hidden_helper_main_thread_release();
6610 // Wait until the hidden helper team has been destroyed
6611 __kmp_hidden_helper_threads_deinitz_wait();
6612 }
6613
6614 KMP_MB(); /* Flush all pending memory write invalidates. */
6615
6616 /* find out who we are and what we should do */
6617 {
6618 int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6619 KA_TRACE(10,
6620 ("__kmp_internal_end_thread: enter T#%d (%d)\n", gtid, gtid_req));
6621 if (gtid == KMP_GTID_SHUTDOWN) {
6622 KA_TRACE(10, ("__kmp_internal_end_thread: !__kmp_init_runtime, system "
6623 "already shutdown\n"));
6624 return;
6625 } else if (gtid == KMP_GTID_MONITOR) {
6626 KA_TRACE(10, ("__kmp_internal_end_thread: monitor thread, gtid not "
6627 "registered, or system shutdown\n"));
6628 return;
6629 } else if (gtid == KMP_GTID_DNE) {
6630 KA_TRACE(10, ("__kmp_internal_end_thread: gtid not registered or system "
6631 "shutdown\n"));
6632 return;
6633 /* we don't know who we are */
6634 } else if (KMP_UBER_GTID(gtid)) {
6635 /* unregister ourselves as an uber thread. gtid is no longer valid */
6636 if (__kmp_root[gtid]->r.r_active) {
6637 __kmp_global.g.g_abort = -1;
6638 TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6639 KA_TRACE(10,
6640 ("__kmp_internal_end_thread: root still active, abort T#%d\n",
6641 gtid));
6642 return;
6643 } else {
6644 KA_TRACE(10, ("__kmp_internal_end_thread: unregistering sibling T#%d\n",
6645 gtid));
6646 __kmp_unregister_root_current_thread(gtid);
6647 }
6648 } else {
6649 /* just a worker thread, let's leave */
6650 KA_TRACE(10, ("__kmp_internal_end_thread: worker thread T#%d\n", gtid));
6651
6652 if (gtid >= 0) {
6653 __kmp_threads[gtid]->th.th_task_team = NULL;
6654 }
6655
6656 KA_TRACE(10,
6657 ("__kmp_internal_end_thread: worker thread done, exiting T#%d\n",
6658 gtid));
6659 return;
6660 }
6661 }
6662#if KMP_DYNAMIC_LIB
6663 if (__kmp_pause_status != kmp_hard_paused)
6664 // AC: let's not shut down the dynamic library at the exit of an uber thread,
6665 // because it is better to shut down later in the library destructor.
6666 {
6667 KA_TRACE(10, ("__kmp_internal_end_thread: exiting T#%d\n", gtid_req));
6668 return;
6669 }
6670#endif
6671 /* synchronize the termination process */
6672 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6673
6674 /* have we already finished */
6675 if (__kmp_global.g.g_abort) {
6676 KA_TRACE(10, ("__kmp_internal_end_thread: abort, exiting\n"));
6677 /* TODO abort? */
6678 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6679 return;
6680 }
6681 if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6682 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6683 return;
6684 }
6685
6686 /* We need this lock to enforce mutex between this reading of
6687 __kmp_threads_capacity and the writing by __kmp_register_root.
6688 Alternatively, we can use a counter of roots that is atomically updated by
6689 __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6690 __kmp_internal_end_*. */
6691
6692 /* should we finish the run-time? are all siblings done? */
6693 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6694
6695 for (i = 0; i < __kmp_threads_capacity; ++i) {
6696 if (KMP_UBER_GTID(i)) {
6697 KA_TRACE(
6698 10,
6699 ("__kmp_internal_end_thread: remaining sibling task: gtid==%d\n", i));
6700 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6701 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6702 return;
6703 }
6704 }
6705
6706 /* now we can safely conduct the actual termination */
6707
6708 __kmp_internal_end();
6709
6710 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6711 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6712
6713 KA_TRACE(10, ("__kmp_internal_end_thread: exit T#%d\n", gtid_req));
6714
6715#ifdef DUMP_DEBUG_ON_EXIT
6716 if (__kmp_debug_buf)
6717 __kmp_dump_debug_buffer();
6718#endif
6719} // __kmp_internal_end_thread
6720
6721// -----------------------------------------------------------------------------
6722// Library registration stuff.
6723
6724static long __kmp_registration_flag = 0;
6725// Random value used to indicate library initialization.
6726static char *__kmp_registration_str = NULL;
6727// Value to be saved in env var __KMP_REGISTERED_LIB_<pid>.
6728
6729static inline char *__kmp_reg_status_name() {
6730/* On RHEL 3u5 if linked statically, getpid() returns different values in
6731 each thread. If registration and unregistration happen in different threads
6732 (omp_misc_other_root_exit.cpp test case), the registered_lib_env env var
6733 cannot be found, because its name will contain a different pid. */
6734// macOS* complains about name being too long with additional getuid()
6735#if KMP_OS_UNIX && !KMP_OS_DARWIN && KMP_DYNAMIC_LIB
6736 return __kmp_str_format("__KMP_REGISTERED_LIB_%d_%d", (int)getpid(),
6737 (int)getuid());
6738#else
6739 return __kmp_str_format("__KMP_REGISTERED_LIB_%d", (int)getpid());
6740#endif
6741} // __kmp_reg_status_name
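// For illustration (hypothetical values): with pid 12345 and uid 1000, the
// dynamic-library Unix branch above yields "__KMP_REGISTERED_LIB_12345_1000",
// while the fallback branch yields "__KMP_REGISTERED_LIB_12345".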
6742
6743#if defined(KMP_USE_SHM)
6744// If /dev/shm is not accessible, we will create a temporary file under /tmp.
6745char *temp_reg_status_file_name = nullptr;
6746#endif
6747
6748void __kmp_register_library_startup(void) {
6749
6750 char *name = __kmp_reg_status_name(); // Name of the environment variable.
6751 int done = 0;
6752 union {
6753 double dtime;
6754 long ltime;
6755 } time;
6756#if KMP_ARCH_X86 || KMP_ARCH_X86_64
6757 __kmp_initialize_system_tick();
6758#endif
6759 __kmp_read_system_time(&time.dtime);
6760 __kmp_registration_flag = 0xCAFE0000L | (time.ltime & 0x0000FFFFL);
6761 __kmp_registration_str =
6762 __kmp_str_format("%p-%lx-%s", &__kmp_registration_flag,
6763 __kmp_registration_flag, KMP_LIBRARY_FILE);
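 // For illustration (hypothetical values): if the low 16 bits of the time are
 // 0x1234 and the flag resides at 0x7f0012345678, the registration string is
 // "0x7f0012345678-cafe1234-libomp.so" (flag address, flag value, library file).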
6764
6765 KA_TRACE(50, ("__kmp_register_library_startup: %s=\"%s\"\n", name,
6766 __kmp_registration_str));
6767
6768 while (!done) {
6769
6770 char *value = NULL; // Actual value of the environment variable.
6771
6772#if defined(KMP_USE_SHM)
6773 char *shm_name = __kmp_str_format("/%s", name);
6774 int shm_preexist = 0;
6775 char *data1;
6776 int fd1 = shm_open(shm_name, O_CREAT | O_EXCL | O_RDWR, 0666);
6777 if ((fd1 == -1) && (errno == EEXIST)) {
6778 // file didn't open because it already exists.
6779 // try opening existing file
6780 fd1 = shm_open(shm_name, O_RDWR, 0666);
6781 if (fd1 == -1) { // file didn't open
6782 // error out here
6783 __kmp_fatal(KMP_MSG(FunctionError, "Can't open SHM"), KMP_ERR(0),
6784 __kmp_msg_null);
6785 } else {
6786 // able to open existing file
6787 shm_preexist = 1;
6788 }
6789 } else if (fd1 == -1) {
6790 // SHM didn't open; it was due to error other than already exists. Try to
6791 // create a temp file under /tmp.
6792 // TODO: /tmp might not always be the temporary directory. For now we will
6793 // not consider TMPDIR. If /tmp is not accessible, we simply error out.
6794 char *temp_file_name = __kmp_str_format("/tmp/%sXXXXXX", name);
6795 fd1 = mkstemp(temp_file_name);
6796 if (fd1 == -1) {
6797 // error out here.
6798 __kmp_fatal(KMP_MSG(FunctionError, "Can't open TEMP"), KMP_ERR(errno),
6799 __kmp_msg_null);
6800 }
6801 temp_reg_status_file_name = temp_file_name;
6802 }
6803 if (shm_preexist == 0) {
6804 // we created SHM now set size
6805 if (ftruncate(fd1, SHM_SIZE) == -1) {
6806 // error occurred while setting the size
6807 __kmp_fatal(KMP_MSG(FunctionError, "Can't set size of SHM"),
6808 KMP_ERR(errno), __kmp_msg_null);
6809 }
6810 }
6811 data1 =
6812 (char *)mmap(0, SHM_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED, fd1, 0);
6813 if (data1 == MAP_FAILED) {
6814 // failed to map shared memory
6815 __kmp_fatal(KMP_MSG(FunctionError, "Can't map SHM"), KMP_ERR(errno),
6816 __kmp_msg_null);
6817 }
6818 if (shm_preexist == 0) { // set data to SHM, set value
6819 KMP_STRCPY_S(data1, SHM_SIZE, __kmp_registration_str);
6820 }
6821 // Read value from either what we just wrote or existing file.
6822 value = __kmp_str_format("%s", data1); // read value from SHM
6823 munmap(data1, SHM_SIZE);
6824 close(fd1);
6825#else // Windows and Unix with a static library
6826 // Set the environment variable, but do not overwrite it if it already exists.
6827 __kmp_env_set(name, __kmp_registration_str, 0);
6828 // read value to see if it got set
6829 value = __kmp_env_get(name);
6830#endif
6831
6832 if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6833 done = 1; // Ok, environment variable set successfully, exit the loop.
6834 } else {
6835 // Oops. The write failed. Another copy of the OpenMP RTL is in memory.
6836 // Check whether it is alive or dead.
6837 int neighbor = 0; // 0 -- unknown status, 1 -- alive, 2 -- dead.
6838 char *tail = value;
6839 char *flag_addr_str = NULL;
6840 char *flag_val_str = NULL;
6841 char const *file_name = NULL;
6842 __kmp_str_split(tail, '-', &flag_addr_str, &tail);
6843 __kmp_str_split(tail, '-', &flag_val_str, &tail);
6844 file_name = tail;
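 // For illustration (hypothetical value): "0x7f0012345678-cafe1234-libomp.so"
 // splits into flag_addr_str "0x7f0012345678", flag_val_str "cafe1234", and
 // file_name "libomp.so".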
6845 if (tail != NULL) {
6846 unsigned long *flag_addr = 0;
6847 unsigned long flag_val = 0;
6848 KMP_SSCANF(flag_addr_str, "%p", RCAST(void **, &flag_addr));
6849 KMP_SSCANF(flag_val_str, "%lx", &flag_val);
6850 if (flag_addr != 0 && flag_val != 0 && strcmp(file_name, "") != 0) {
6851 // First, check whether environment-encoded address is mapped into
6852 // addr space.
6853 // If so, dereference it to see if it still has the right value.
6854 if (__kmp_is_address_mapped(flag_addr) && *flag_addr == flag_val) {
6855 neighbor = 1;
6856 } else {
6857 // If not, then we know the other copy of the library is no longer
6858 // running.
6859 neighbor = 2;
6860 }
6861 }
6862 }
6863 switch (neighbor) {
6864 case 0: // Cannot parse environment variable -- neighbor status unknown.
6865 // Assume it is the incompatible format of a future version of the
6866 // library, and assume the other library is alive.
6867 // WARN( ... ); // TODO: Issue a warning.
6868 file_name = "unknown library";
6869 KMP_FALLTHROUGH();
6870 // Attention! Falling through to the next case. That's intentional.
6871 case 1: { // Neighbor is alive.
6872 // Check it is allowed.
6873 char *duplicate_ok = __kmp_env_get("KMP_DUPLICATE_LIB_OK");
6874 if (!__kmp_str_match_true(duplicate_ok)) {
6875 // That's not allowed. Issue fatal error.
6876 __kmp_fatal(KMP_MSG(DuplicateLibrary, KMP_LIBRARY_FILE, file_name),
6877 KMP_HNT(DuplicateLibrary), __kmp_msg_null);
6878 }
6879 KMP_INTERNAL_FREE(duplicate_ok);
6880 __kmp_duplicate_library_ok = 1;
6881 done = 1; // Exit the loop.
6882 } break;
6883 case 2: { // Neighbor is dead.
6884
6885#if defined(KMP_USE_SHM)
6886 // close shared memory.
6887 shm_unlink(shm_name); // this removes file in /dev/shm
6888#else
6889 // Clear the variable and try to register library again.
6890 __kmp_env_unset(name);
6891#endif
6892 } break;
6893 default: {
6894 KMP_DEBUG_ASSERT(0);
6895 } break;
6896 }
6897 }
6898 KMP_INTERNAL_FREE((void *)value);
6899#if defined(KMP_USE_SHM)
6900 KMP_INTERNAL_FREE((void *)shm_name);
6901#endif
6902 } // while
6903 KMP_INTERNAL_FREE((void *)name);
6904
6905} // func __kmp_register_library_startup
6906
6907void __kmp_unregister_library(void) {
6908
6909 char *name = __kmp_reg_status_name();
6910 char *value = NULL;
6911
6912#if defined(KMP_USE_SHM)
6913 bool use_shm = true;
6914 char *shm_name = __kmp_str_format("/%s", name);
6915 int fd1 = shm_open(shm_name, O_RDONLY, 0666);
6916 if (fd1 == -1) {
6917 // File did not open. Try the temporary file.
6918 use_shm = false;
6919 KMP_DEBUG_ASSERT(temp_reg_status_file_name);
6920 FILE *tf = fopen(temp_reg_status_file_name, "r"); // fopen takes a mode string
6921 if (!tf) {
6922 // give up now.
6923 return;
6924 }
6925 fd1 = fileno(tf);
6926 }
6927 char *data1 = (char *)mmap(0, SHM_SIZE, PROT_READ, MAP_SHARED, fd1, 0);
6928 if (data1 != MAP_FAILED) {
6929 value = __kmp_str_format("%s", data1); // read value from SHM
6930 munmap(data1, SHM_SIZE);
6931 }
6932 close(fd1);
6933#else
6934 value = __kmp_env_get(name);
6935#endif
6936
6937 KMP_DEBUG_ASSERT(__kmp_registration_flag != 0);
6938 KMP_DEBUG_ASSERT(__kmp_registration_str != NULL);
6939 if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6940// Ok, this is our variable. Delete it.
6941#if defined(KMP_USE_SHM)
6942 if (use_shm) {
6943 shm_unlink(shm_name); // this removes file in /dev/shm
6944 } else {
6945 KMP_DEBUG_ASSERT(temp_reg_status_file_name);
6946 unlink(temp_reg_status_file_name); // this removes the temp file
6947 }
6948#else
6949 __kmp_env_unset(name);
6950#endif
6951 }
6952
6953#if defined(KMP_USE_SHM)
6954 KMP_INTERNAL_FREE(shm_name);
6955 if (!use_shm) {
6956 KMP_DEBUG_ASSERT(temp_reg_status_file_name);
6957 KMP_INTERNAL_FREE(temp_reg_status_file_name);
6958 }
6959#endif
6960
6961 KMP_INTERNAL_FREE(__kmp_registration_str);
6962 KMP_INTERNAL_FREE(value);
6963 KMP_INTERNAL_FREE(name);
6964
6965 __kmp_registration_flag = 0;
6966 __kmp_registration_str = NULL;
6967
6968} // __kmp_unregister_library
6969
6970// End of Library registration stuff.
6971// -----------------------------------------------------------------------------
6972
6973#if KMP_MIC_SUPPORTED
6974
6975static void __kmp_check_mic_type() {
6976 kmp_cpuid_t cpuid_state = {0};
6977 kmp_cpuid_t *cs_p = &cpuid_state;
6978 __kmp_x86_cpuid(1, 0, cs_p);
6979 // We don't support mic1 at the moment
6980 if ((cs_p->eax & 0xff0) == 0xB10) {
6981 __kmp_mic_type = mic2;
6982 } else if ((cs_p->eax & 0xf0ff0) == 0x50670) {
6983 __kmp_mic_type = mic3;
6984 } else {
6985 __kmp_mic_type = non_mic;
6986 }
6987}
6988
6989#endif /* KMP_MIC_SUPPORTED */
6990
6991#if KMP_HAVE_UMWAIT
6992static void __kmp_user_level_mwait_init() {
6993 struct kmp_cpuid buf;
6994 __kmp_x86_cpuid(7, 0, &buf);
6995 __kmp_waitpkg_enabled = ((buf.ecx >> 5) & 1);
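 // CPUID.(EAX=07H, ECX=0):ECX bit 5 is the WAITPKG feature flag, indicating
 // support for the umonitor/umwait/tpause instructions.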
6996 __kmp_umwait_enabled = __kmp_waitpkg_enabled && __kmp_user_level_mwait;
6997 __kmp_tpause_enabled = __kmp_waitpkg_enabled && (__kmp_tpause_state > 0);
6998 KF_TRACE(30, ("__kmp_user_level_mwait_init: __kmp_umwait_enabled = %d\n",
6999 __kmp_umwait_enabled));
7000}
7001#elif KMP_HAVE_MWAIT
7002#ifndef AT_INTELPHIUSERMWAIT
7003// Spurious, non-existent value that should always fail to return anything.
7004// Will be replaced with the correct value once it is known.
7005#define AT_INTELPHIUSERMWAIT 10000
7006#endif
7007// getauxval() function is available in RHEL7 and SLES12. If a system with an
7008// earlier OS is used to build the RTL, we'll use the following internal
7009// function when the entry is not found.
7010unsigned long getauxval(unsigned long) KMP_WEAK_ATTRIBUTE_EXTERNAL;
7011unsigned long getauxval(unsigned long) { return 0; }
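// Because the definition above is weak, a real getauxval() from libc takes
// precedence when it is available at link time; otherwise this stub returns 0
// and user-level mwait stays off unless KMP_USER_LEVEL_MWAIT forces it on.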
7012
7013static void __kmp_user_level_mwait_init() {
7014 // When getauxval() and correct value of AT_INTELPHIUSERMWAIT are available
7015 // use them to find if the user-level mwait is enabled. Otherwise, forcibly
7016 // set __kmp_mwait_enabled=TRUE on Intel MIC if the environment variable
7017 // KMP_USER_LEVEL_MWAIT was set to TRUE.
7018 if (__kmp_mic_type == mic3) {
7019 unsigned long res = getauxval(AT_INTELPHIUSERMWAIT);
7020 if ((res & 0x1) || __kmp_user_level_mwait) {
7021 __kmp_mwait_enabled = TRUE;
7022 if (__kmp_user_level_mwait) {
7023 KMP_INFORM(EnvMwaitWarn);
7024 }
7025 } else {
7026 __kmp_mwait_enabled = FALSE;
7027 }
7028 }
7029 KF_TRACE(30, ("__kmp_user_level_mwait_init: __kmp_mic_type = %d, "
7030 "__kmp_mwait_enabled = %d\n",
7031 __kmp_mic_type, __kmp_mwait_enabled));
7032}
7033#endif /* KMP_HAVE_UMWAIT */
7034
7035static void __kmp_do_serial_initialize(void) {
7036 int i, gtid;
7037 size_t size;
7038
7039 KA_TRACE(10, ("__kmp_do_serial_initialize: enter\n"));
7040
7041 KMP_DEBUG_ASSERT(sizeof(kmp_int32) == 4);
7042 KMP_DEBUG_ASSERT(sizeof(kmp_uint32) == 4);
7043 KMP_DEBUG_ASSERT(sizeof(kmp_int64) == 8);
7044 KMP_DEBUG_ASSERT(sizeof(kmp_uint64) == 8);
7045 KMP_DEBUG_ASSERT(sizeof(kmp_intptr_t) == sizeof(void *));
7046
7047#if OMPT_SUPPORT
7048 ompt_pre_init();
7049#endif
7050#if OMPD_SUPPORT
7051 __kmp_env_dump();
7052 ompd_init();
7053#endif
7054
7055 __kmp_validate_locks();
7056
7057 /* Initialize internal memory allocator */
7058 __kmp_init_allocator();
7059
7060 /* Register the library startup via an environment variable or via mapped
7061 shared memory file and check to see whether another copy of the library is
7062 already registered. Since a forked child process is often terminated, we
7063 postpone the registration until middle initialization in the child. */
7064 if (__kmp_need_register_serial)
7065 __kmp_register_library_startup();
7066
7067 /* TODO reinitialization of library */
7068 if (TCR_4(__kmp_global.g.g_done)) {
7069 KA_TRACE(10, ("__kmp_do_serial_initialize: reinitialization of library\n"));
7070 }
7071
7072 __kmp_global.g.g_abort = 0;
7073 TCW_SYNC_4(__kmp_global.g.g_done, FALSE);
7074
7075/* initialize the locks */
7076#if KMP_USE_ADAPTIVE_LOCKS
7077#if KMP_DEBUG_ADAPTIVE_LOCKS
7078 __kmp_init_speculative_stats();
7079#endif
7080#endif
7081#if KMP_STATS_ENABLED
7082 __kmp_stats_init();
7083#endif
7084 __kmp_init_lock(&__kmp_global_lock);
7085 __kmp_init_queuing_lock(&__kmp_dispatch_lock);
7086 __kmp_init_lock(&__kmp_debug_lock);
7087 __kmp_init_atomic_lock(&__kmp_atomic_lock);
7088 __kmp_init_atomic_lock(&__kmp_atomic_lock_1i);
7089 __kmp_init_atomic_lock(&__kmp_atomic_lock_2i);
7090 __kmp_init_atomic_lock(&__kmp_atomic_lock_4i);
7091 __kmp_init_atomic_lock(&__kmp_atomic_lock_4r);
7092 __kmp_init_atomic_lock(&__kmp_atomic_lock_8i);
7093 __kmp_init_atomic_lock(&__kmp_atomic_lock_8r);
7094 __kmp_init_atomic_lock(&__kmp_atomic_lock_8c);
7095 __kmp_init_atomic_lock(&__kmp_atomic_lock_10r);
7096 __kmp_init_atomic_lock(&__kmp_atomic_lock_16r);
7097 __kmp_init_atomic_lock(&__kmp_atomic_lock_16c);
7098 __kmp_init_atomic_lock(&__kmp_atomic_lock_20c);
7099 __kmp_init_atomic_lock(&__kmp_atomic_lock_32c);
7100 __kmp_init_bootstrap_lock(&__kmp_forkjoin_lock);
7101 __kmp_init_bootstrap_lock(&__kmp_exit_lock);
7102#if KMP_USE_MONITOR
7103 __kmp_init_bootstrap_lock(&__kmp_monitor_lock);
7104#endif
7105 __kmp_init_bootstrap_lock(&__kmp_tp_cached_lock);
7106
7107 /* conduct initialization and initial setup of configuration */
7108
7109 __kmp_runtime_initialize();
7110
7111#if KMP_MIC_SUPPORTED
7112 __kmp_check_mic_type();
7113#endif
7114
7115// Some global variable initialization moved here from kmp_env_initialize()
7116#ifdef KMP_DEBUG
7117 kmp_diag = 0;
7118#endif
7119 __kmp_abort_delay = 0;
7120
7121 // From __kmp_init_dflt_team_nth()
7122 /* assume the entire machine will be used */
7123 __kmp_dflt_team_nth_ub = __kmp_xproc;
7124 if (__kmp_dflt_team_nth_ub < KMP_MIN_NTH) {
7125 __kmp_dflt_team_nth_ub = KMP_MIN_NTH;
7126 }
7127 if (__kmp_dflt_team_nth_ub > __kmp_sys_max_nth) {
7128 __kmp_dflt_team_nth_ub = __kmp_sys_max_nth;
7129 }
7130 __kmp_max_nth = __kmp_sys_max_nth;
7131 __kmp_cg_max_nth = __kmp_sys_max_nth;
7132 __kmp_teams_max_nth = __kmp_xproc; // set a "reasonable" default
7133 if (__kmp_teams_max_nth > __kmp_sys_max_nth) {
7134 __kmp_teams_max_nth = __kmp_sys_max_nth;
7135 }
7136
7137 // Three vars below moved here from __kmp_env_initialize() "KMP_BLOCKTIME"
7138 // part
7139 __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME;
7140#if KMP_USE_MONITOR
7141 __kmp_monitor_wakeups =
7142 KMP_WAKEUPS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
7143 __kmp_bt_intervals =
7144 KMP_INTERVALS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
7145#endif
7146 // From "KMP_LIBRARY" part of __kmp_env_initialize()
7147 __kmp_library = library_throughput;
7148 // From KMP_SCHEDULE initialization
7149 __kmp_static = kmp_sch_static_balanced;
7150// AC: do not use analytical here, because it is non-monotonic
7151//__kmp_guided = kmp_sch_guided_iterative_chunked;
7152//__kmp_auto = kmp_sch_guided_analytical_chunked; // AC: it is the default, no
7153// need to repeat assignment
7154// Barrier initialization. Moved here from __kmp_env_initialize() Barrier branch
7155// bit control and barrier method control parts
7156#if KMP_FAST_REDUCTION_BARRIER
7157#define kmp_reduction_barrier_gather_bb ((int)1)
7158#define kmp_reduction_barrier_release_bb ((int)1)
7159#define kmp_reduction_barrier_gather_pat __kmp_barrier_gather_pat_dflt
7160#define kmp_reduction_barrier_release_pat __kmp_barrier_release_pat_dflt
7161#endif // KMP_FAST_REDUCTION_BARRIER
7162 for (i = bs_plain_barrier; i < bs_last_barrier; i++) {
7163 __kmp_barrier_gather_branch_bits[i] = __kmp_barrier_gather_bb_dflt;
7164 __kmp_barrier_release_branch_bits[i] = __kmp_barrier_release_bb_dflt;
7165 __kmp_barrier_gather_pattern[i] = __kmp_barrier_gather_pat_dflt;
7166 __kmp_barrier_release_pattern[i] = __kmp_barrier_release_pat_dflt;
7167#if KMP_FAST_REDUCTION_BARRIER
7168 if (i == bs_reduction_barrier) { // tested and confirmed on ALTIX only (
7169 // lin_64 ): hyper,1
7170 __kmp_barrier_gather_branch_bits[i] = kmp_reduction_barrier_gather_bb;
7171 __kmp_barrier_release_branch_bits[i] = kmp_reduction_barrier_release_bb;
7172 __kmp_barrier_gather_pattern[i] = kmp_reduction_barrier_gather_pat;
7173 __kmp_barrier_release_pattern[i] = kmp_reduction_barrier_release_pat;
7174 }
7175#endif // KMP_FAST_REDUCTION_BARRIER
7176 }
7177#if KMP_FAST_REDUCTION_BARRIER
7178#undef kmp_reduction_barrier_release_pat
7179#undef kmp_reduction_barrier_gather_pat
7180#undef kmp_reduction_barrier_release_bb
7181#undef kmp_reduction_barrier_gather_bb
7182#endif // KMP_FAST_REDUCTION_BARRIER
7183#if KMP_MIC_SUPPORTED
7184 if (__kmp_mic_type == mic2) { // KNC
7185 // AC: plane=3,2, forkjoin=2,1 are optimal for 240 threads on KNC
7186 __kmp_barrier_gather_branch_bits[bs_plain_barrier] = 3; // plain gather
7187 __kmp_barrier_release_branch_bits[bs_forkjoin_barrier] =
7188 1; // forkjoin release
7189 __kmp_barrier_gather_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
7190 __kmp_barrier_release_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
7191 }
7192#if KMP_FAST_REDUCTION_BARRIER
7193 if (__kmp_mic_type == mic2) { // KNC
7194 __kmp_barrier_gather_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
7195 __kmp_barrier_release_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
7196 }
7197#endif // KMP_FAST_REDUCTION_BARRIER
7198#endif // KMP_MIC_SUPPORTED
7199
7200// From KMP_CHECKS initialization
7201#ifdef KMP_DEBUG
7202 __kmp_env_checks = TRUE; /* development versions have the extra checks */
7203#else
7204 __kmp_env_checks = FALSE; /* port versions do not have the extra checks */
7205#endif
7206
7207 // From "KMP_FOREIGN_THREADS_THREADPRIVATE" initialization
7208 __kmp_foreign_tp = TRUE;
7209
7210 __kmp_global.g.g_dynamic = FALSE;
7211 __kmp_global.g.g_dynamic_mode = dynamic_default;
7212
7213 __kmp_init_nesting_mode();
7214
7215 __kmp_env_initialize(NULL);
7216
7217#if KMP_HAVE_MWAIT || KMP_HAVE_UMWAIT
7218 __kmp_user_level_mwait_init();
7219#endif
7220// Print all messages in message catalog for testing purposes.
7221#ifdef KMP_DEBUG
7222 char const *val = __kmp_env_get("KMP_DUMP_CATALOG");
7223 if (__kmp_str_match_true(val)) {
7224 kmp_str_buf_t buffer;
7225 __kmp_str_buf_init(&buffer);
7226 __kmp_i18n_dump_catalog(&buffer);
7227 __kmp_printf("%s", buffer.str);
7228 __kmp_str_buf_free(&buffer);
7229 }
7230 __kmp_env_free(&val);
7231#endif
7232
7233 __kmp_threads_capacity =
7234 __kmp_initial_threads_capacity(__kmp_dflt_team_nth_ub);
7235 // Moved here from __kmp_env_initialize() "KMP_ALL_THREADPRIVATE" part
7236 __kmp_tp_capacity = __kmp_default_tp_capacity(
7237 __kmp_dflt_team_nth_ub, __kmp_max_nth, __kmp_allThreadsSpecified);
7238
7239 // If the library is shut down properly, both pools must be NULL. Just in
7240 // case, set them to NULL -- some memory may leak, but subsequent code will
7241 // work even if pools are not freed.
7242 KMP_DEBUG_ASSERT(__kmp_thread_pool == NULL);
7243 KMP_DEBUG_ASSERT(__kmp_thread_pool_insert_pt == NULL);
7244 KMP_DEBUG_ASSERT(__kmp_team_pool == NULL);
7245 __kmp_thread_pool = NULL;
7246 __kmp_thread_pool_insert_pt = NULL;
7247 __kmp_team_pool = NULL;
7248
7249 /* Allocate all of the variable sized records */
7250 /* NOTE: __kmp_threads_capacity entries are allocated, but the arrays are
7251 * expandable */
7252 /* Since allocation is cache-aligned, just add extra padding at the end */
7253 size =
7254 (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * __kmp_threads_capacity +
7255 CACHE_LINE;
7256 __kmp_threads = (kmp_info_t **)__kmp_allocate(size);
7257 __kmp_root = (kmp_root_t **)((char *)__kmp_threads +
7258 sizeof(kmp_info_t *) * __kmp_threads_capacity);
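 // Layout of the single allocation: __kmp_threads_capacity kmp_info_t*
 // pointers (__kmp_threads) immediately followed by __kmp_threads_capacity
 // kmp_root_t* pointers (__kmp_root), plus CACHE_LINE bytes of padding.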
7259
7260 /* init thread counts */
7261 KMP_DEBUG_ASSERT(__kmp_all_nth ==
7262 0); // Asserts fail if the library is reinitializing and
7263 KMP_DEBUG_ASSERT(__kmp_nth == 0); // something was wrong in termination.
7264 __kmp_all_nth = 0;
7265 __kmp_nth = 0;
7266
7267 /* setup the uber master thread and hierarchy */
7268 gtid = __kmp_register_root(TRUE);
7269 KA_TRACE(10, ("__kmp_do_serial_initialize T#%d\n", gtid));
7270 KMP_ASSERT(KMP_UBER_GTID(gtid));
7271 KMP_ASSERT(KMP_INITIAL_GTID(gtid));
7272
7273 KMP_MB(); /* Flush all pending memory write invalidates. */
7274
7275 __kmp_common_initialize();
7276
7277#if KMP_OS_UNIX
7278 /* invoke the child fork handler */
7279 __kmp_register_atfork();
7280#endif
7281
7282#if !KMP_DYNAMIC_LIB || \
7283 ((KMP_COMPILER_ICC || KMP_COMPILER_ICX) && KMP_OS_DARWIN)
7284 {
7285 /* Invoke the exit handler when the program finishes, only for static
7286 library and macOS* dynamic. For other dynamic libraries, we already
7287 have _fini and DllMain. */
7288 int rc = atexit(__kmp_internal_end_atexit);
7289 if (rc != 0) {
7290 __kmp_fatal(KMP_MSG(FunctionError, "atexit()"), KMP_ERR(rc),
7291 __kmp_msg_null);
7292 }
7293 }
7294#endif
7295
7296#if KMP_HANDLE_SIGNALS
7297#if KMP_OS_UNIX
7298 /* NOTE: make sure that this is called before the user installs their own
7299 signal handlers so that the user handlers are called first. this way they
7300 can return false, not call our handler, avoid terminating the library, and
7301 continue execution where they left off. */
7302 __kmp_install_signals(FALSE);
7303#endif /* KMP_OS_UNIX */
7304#if KMP_OS_WINDOWS
7305 __kmp_install_signals(TRUE);
7306#endif /* KMP_OS_WINDOWS */
7307#endif
7308
7309 /* we have finished the serial initialization */
7310 __kmp_init_counter++;
7311
7312 __kmp_init_serial = TRUE;
7313
7314 if (__kmp_settings) {
7315 __kmp_env_print();
7316 }
7317
7318 if (__kmp_display_env || __kmp_display_env_verbose) {
7319 __kmp_env_print_2();
7320 }
7321
7322#if OMPT_SUPPORT
7323 ompt_post_init();
7324#endif
7325
7326 KMP_MB();
7327
7328 KA_TRACE(10, ("__kmp_do_serial_initialize: exit\n"));
7329}
7330
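// The __kmp_*_initialize entry points below all follow the same double-checked
// pattern: test the init flag without the lock, acquire __kmp_initz_lock, then
// re-test the flag before doing the real work, so that concurrent callers are
// serialized and initialization runs at most once.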
7331void __kmp_serial_initialize(void) {
7332 if (__kmp_init_serial) {
7333 return;
7334 }
7335 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7336 if (__kmp_init_serial) {
7337 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7338 return;
7339 }
7340 __kmp_do_serial_initialize();
7341 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7342}
7343
7344static void __kmp_do_middle_initialize(void) {
7345 int i, j;
7346 int prev_dflt_team_nth;
7347
7348 if (!__kmp_init_serial) {
7349 __kmp_do_serial_initialize();
7350 }
7351
7352 KA_TRACE(10, ("__kmp_middle_initialize: enter\n"));
7353
7354 if (UNLIKELY(!__kmp_need_register_serial)) {
7355 // We are in a forked child process. The registration was skipped during
7356 // serial initialization in __kmp_atfork_child handler. Do it here.
7357 __kmp_register_library_startup();
7358 }
7359
7360 // Save the previous value for the __kmp_dflt_team_nth so that
7361 // we can avoid some reinitialization if it hasn't changed.
7362 prev_dflt_team_nth = __kmp_dflt_team_nth;
7363
7364#if KMP_AFFINITY_SUPPORTED
7365 // __kmp_affinity_initialize() will try to set __kmp_ncores to the
7366 // number of cores on the machine.
7367 __kmp_affinity_initialize(__kmp_affinity);
7368
7369#endif /* KMP_AFFINITY_SUPPORTED */
7370
7371 KMP_ASSERT(__kmp_xproc > 0);
7372 if (__kmp_avail_proc == 0) {
7373 __kmp_avail_proc = __kmp_xproc;
7374 }
7375
7376 // If there were empty places in num_threads list (OMP_NUM_THREADS=,,2,3),
7377 // correct them now
7378 j = 0;
7379 while ((j < __kmp_nested_nth.used) && !__kmp_nested_nth.nth[j]) {
7380 __kmp_nested_nth.nth[j] = __kmp_dflt_team_nth = __kmp_dflt_team_nth_ub =
7381 __kmp_avail_proc;
7382 j++;
7383 }
7384
7385 if (__kmp_dflt_team_nth == 0) {
7386#ifdef KMP_DFLT_NTH_CORES
7387 // Default #threads = #cores
7388 __kmp_dflt_team_nth = __kmp_ncores;
7389 KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
7390 "__kmp_ncores (%d)\n",
7391 __kmp_dflt_team_nth));
7392#else
7393 // Default #threads = #available OS procs
7394 __kmp_dflt_team_nth = __kmp_avail_proc;
7395 KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
7396 "__kmp_avail_proc(%d)\n",
7397 __kmp_dflt_team_nth));
7398#endif /* KMP_DFLT_NTH_CORES */
7399 }
7400
7401 if (__kmp_dflt_team_nth < KMP_MIN_NTH) {
7402 __kmp_dflt_team_nth = KMP_MIN_NTH;
7403 }
7404 if (__kmp_dflt_team_nth > __kmp_sys_max_nth) {
7405 __kmp_dflt_team_nth = __kmp_sys_max_nth;
7406 }
7407
7408 if (__kmp_nesting_mode > 0)
7409 __kmp_set_nesting_mode_threads();
7410
7411 // There's no harm in continuing if the following check fails,
7412 // but it indicates an error in the previous logic.
7413 KMP_DEBUG_ASSERT(__kmp_dflt_team_nth <= __kmp_dflt_team_nth_ub);
7414
7415 if (__kmp_dflt_team_nth != prev_dflt_team_nth) {
7416 // Run through the __kmp_threads array and set the num threads icv for each
7417 // root thread that is currently registered with the RTL (which has not
7418 // already explicitly set its nthreads-var with a call to
7419 // omp_set_num_threads()).
7420 for (i = 0; i < __kmp_threads_capacity; i++) {
7421 kmp_info_t *thread = __kmp_threads[i];
7422 if (thread == NULL)
7423 continue;
7424 if (thread->th.th_current_task->td_icvs.nproc != 0)
7425 continue;
7426
7427 set__nproc(__kmp_threads[i], __kmp_dflt_team_nth);
7428 }
7429 }
7430 KA_TRACE(
7431 20,
7432 ("__kmp_middle_initialize: final value for __kmp_dflt_team_nth = %d\n",
7433 __kmp_dflt_team_nth));
7434
7435#ifdef KMP_ADJUST_BLOCKTIME
7436 /* Adjust blocktime to zero if necessary now that __kmp_avail_proc is set */
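 /* When KMP_BLOCKTIME was not set by the user and __kmp_nth now exceeds
 __kmp_avail_proc (oversubscription), force a zero blocktime so idle
 threads sleep instead of spin-waiting. */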
7437 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
7438 KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
7439 if (__kmp_nth > __kmp_avail_proc) {
7440 __kmp_zero_bt = TRUE;
7441 }
7442 }
7443#endif /* KMP_ADJUST_BLOCKTIME */
7444
7445 /* we have finished middle initialization */
7446 TCW_SYNC_4(__kmp_init_middle, TRUE);
7447
7448 KA_TRACE(10, ("__kmp_do_middle_initialize: exit\n"));
7449}
7450
7451void __kmp_middle_initialize(void) {
7452 if (__kmp_init_middle) {
7453 return;
7454 }
7455 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7456 if (__kmp_init_middle) {
7457 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7458 return;
7459 }
7460 __kmp_do_middle_initialize();
7461 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7462}
7463
7464void __kmp_parallel_initialize(void) {
7465 int gtid = __kmp_entry_gtid(); // this might be a new root
7466
7467 /* synchronize parallel initialization (for sibling) */
7468 if (TCR_4(__kmp_init_parallel))
7469 return;
7470 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7471 if (TCR_4(__kmp_init_parallel)) {
7472 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7473 return;
7474 }
7475
7476 /* TODO reinitialization after we have already shut down */
7477 if (TCR_4(__kmp_global.g.g_done)) {
7478 KA_TRACE(
7479 10,
7480 ("__kmp_parallel_initialize: attempt to init while shutting down\n"));
7481 __kmp_infinite_loop();
7482 }
7483
7484 /* jc: The lock __kmp_initz_lock is already held, so calling
7485 __kmp_serial_initialize would cause a deadlock. So we call
7486 __kmp_do_serial_initialize directly. */
7487 if (!__kmp_init_middle) {
7488 __kmp_do_middle_initialize();
7489 }
7490 __kmp_assign_root_init_mask();
7491 __kmp_resume_if_hard_paused();
7492
7493 /* begin initialization */
7494 KA_TRACE(10, ("__kmp_parallel_initialize: enter\n"));
7495 KMP_ASSERT(KMP_UBER_GTID(gtid));
7496
7497#if KMP_ARCH_X86 || KMP_ARCH_X86_64
7498 // Save the FP control regs.
7499 // Worker threads will set theirs to these values at thread startup.
7500 __kmp_store_x87_fpu_control_word(&__kmp_init_x87_fpu_control_word);
7501 __kmp_store_mxcsr(&__kmp_init_mxcsr);
7502 __kmp_init_mxcsr &= KMP_X86_MXCSR_MASK;
7503#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
7504
7505#if KMP_OS_UNIX
7506#if KMP_HANDLE_SIGNALS
7507 /* must be after __kmp_serial_initialize */
7508 __kmp_install_signals(TRUE);
7509#endif
7510#endif
7511
7512 __kmp_suspend_initialize();
7513
7514#if defined(USE_LOAD_BALANCE)
7515 if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
7516 __kmp_global.g.g_dynamic_mode = dynamic_load_balance;
7517 }
7518#else
7519 if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
7520 __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
7521 }
7522#endif
7523
7524 if (__kmp_version) {
7525 __kmp_print_version_2();
7526 }
7527
7528 /* we have finished parallel initialization */
7529 TCW_SYNC_4(__kmp_init_parallel, TRUE);
7530
7531 KMP_MB();
7532 KA_TRACE(10, ("__kmp_parallel_initialize: exit\n"));
7533
7534 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7535}
7536
7537void __kmp_hidden_helper_initialize() {
7538 if (TCR_4(__kmp_init_hidden_helper))
7539 return;
7540
7541 // __kmp_parallel_initialize is required before we initialize hidden helper
7542 if (!TCR_4(__kmp_init_parallel))
7543 __kmp_parallel_initialize();
7544
7545 // Double check. Note that this double check should not be placed before
7546 // __kmp_parallel_initialize, as that would cause a deadlock.
7547 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7548 if (TCR_4(__kmp_init_hidden_helper)) {
7549 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7550 return;
7551 }
7552
7553#if KMP_AFFINITY_SUPPORTED
7554 // Initialize hidden helper affinity settings.
7555 // The above __kmp_parallel_initialize() will initialize
7556 // regular affinity (and topology) if not already done.
7557 if (!__kmp_hh_affinity.flags.initialized)
7558 __kmp_affinity_initialize(__kmp_hh_affinity);
7559#endif
7560
7561 // Set the count of hidden helper tasks to be executed to zero
7562 KMP_ATOMIC_ST_REL(&__kmp_unexecuted_hidden_helper_tasks, 0);
7563
7564 // Set the global variable indicating that we're initializing hidden helper
7565 // team/threads
7566 TCW_SYNC_4(__kmp_init_hidden_helper_threads, TRUE);
7567
7568 // Platform independent initialization
7569 __kmp_do_initialize_hidden_helper_threads();
7570
7571 // Wait here for the finish of initialization of hidden helper teams
7572 __kmp_hidden_helper_threads_initz_wait();
7573
7574 // We have finished hidden helper initialization
7575 TCW_SYNC_4(__kmp_init_hidden_helper, TRUE);
7576
7577 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7578}
7579
7580/* ------------------------------------------------------------------------ */
7581
7582void __kmp_run_before_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
7583 kmp_team_t *team) {
7584 kmp_disp_t *dispatch;
7585
7586 KMP_MB();
7587
7588 /* none of the threads have encountered any constructs, yet. */
7589 this_thr->th.th_local.this_construct = 0;
7590#if KMP_CACHE_MANAGE
7591 KMP_CACHE_PREFETCH(&this_thr->th.th_bar[bs_forkjoin_barrier].bb.b_arrived);
7592#endif /* KMP_CACHE_MANAGE */
7593 dispatch = (kmp_disp_t *)TCR_PTR(this_thr->th.th_dispatch);
7594 KMP_DEBUG_ASSERT(dispatch);
7595 KMP_DEBUG_ASSERT(team->t.t_dispatch);
7596 // KMP_DEBUG_ASSERT( this_thr->th.th_dispatch == &team->t.t_dispatch[
7597 // this_thr->th.th_info.ds.ds_tid ] );
7598
7599 dispatch->th_disp_index = 0; /* reset the dispatch buffer counter */
7600 dispatch->th_doacross_buf_idx = 0; // reset doacross dispatch buffer counter
7601 if (__kmp_env_consistency_check)
7602 __kmp_push_parallel(gtid, team->t.t_ident);
7603
7604 KMP_MB(); /* Flush all pending memory write invalidates. */
7605}
7606
7607void __kmp_run_after_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
7608 kmp_team_t *team) {
7609 if (__kmp_env_consistency_check)
7610 __kmp_pop_parallel(gtid, team->t.t_ident);
7611
7612 __kmp_finish_implicit_task(this_thr);
7613}
7614
7615int __kmp_invoke_task_func(int gtid) {
7616 int rc;
7617 int tid = __kmp_tid_from_gtid(gtid);
7618 kmp_info_t *this_thr = __kmp_threads[gtid];
7619 kmp_team_t *team = this_thr->th.th_team;
7620
7621 __kmp_run_before_invoked_task(gtid, tid, this_thr, team);
7622#if USE_ITT_BUILD
7623 if (__itt_stack_caller_create_ptr) {
7624 // inform ittnotify about entering user's code
7625 if (team->t.t_stack_id != NULL) {
7626 __kmp_itt_stack_callee_enter((__itt_caller)team->t.t_stack_id);
7627 } else {
7628 KMP_DEBUG_ASSERT(team->t.t_parent->t.t_stack_id != NULL);
7629 __kmp_itt_stack_callee_enter(
7630 (__itt_caller)team->t.t_parent->t.t_stack_id);
7631 }
7632 }
7633#endif /* USE_ITT_BUILD */
7634#if INCLUDE_SSC_MARKS
7635 SSC_MARK_INVOKING();
7636#endif
7637
7638#if OMPT_SUPPORT
7639 void *dummy;
7640 void **exit_frame_p;
7641 ompt_data_t *my_task_data;
7642 ompt_data_t *my_parallel_data;
7643 int ompt_team_size;
7644
7645 if (ompt_enabled.enabled) {
7646 exit_frame_p = &(team->t.t_implicit_task_taskdata[tid]
7647 .ompt_task_info.frame.exit_frame.ptr);
7648 } else {
7649 exit_frame_p = &dummy;
7650 }
7651
7652 my_task_data =
7653 &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data);
7654 my_parallel_data = &(team->t.ompt_team_info.parallel_data);
7655 if (ompt_enabled.ompt_callback_implicit_task) {
7656 ompt_team_size = team->t.t_nproc;
7657 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7658 ompt_scope_begin, my_parallel_data, my_task_data, ompt_team_size,
7659 __kmp_tid_from_gtid(gtid), ompt_task_implicit);
7660 OMPT_CUR_TASK_INFO(this_thr)->thread_num = __kmp_tid_from_gtid(gtid);
7661 }
7662#endif
7663
7664#if KMP_STATS_ENABLED
7665 stats_state_e previous_state = KMP_GET_THREAD_STATE();
7666 if (previous_state == stats_state_e::TEAMS_REGION) {
7667 KMP_PUSH_PARTITIONED_TIMER(OMP_teams);
7668 } else {
7669 KMP_PUSH_PARTITIONED_TIMER(OMP_parallel);
7670 }
7671 KMP_SET_THREAD_STATE(IMPLICIT_TASK);
7672#endif
7673
7674 rc = __kmp_invoke_microtask((microtask_t)TCR_SYNC_PTR(team->t.t_pkfn), gtid,
7675 tid, (int)team->t.t_argc, (void **)team->t.t_argv
7676#if OMPT_SUPPORT
7677 ,
7678 exit_frame_p
7679#endif
7680 );
7681#if OMPT_SUPPORT
7682 *exit_frame_p = NULL;
7683 this_thr->th.ompt_thread_info.parallel_flags |= ompt_parallel_team;
7684#endif
7685
7686#if KMP_STATS_ENABLED
7687 if (previous_state == stats_state_e::TEAMS_REGION) {
7688 KMP_SET_THREAD_STATE(previous_state);
7689 }
7690 KMP_POP_PARTITIONED_TIMER();
7691#endif
7692
7693#if USE_ITT_BUILD
7694 if (__itt_stack_caller_create_ptr) {
7695 // inform ittnotify about leaving user's code
7696 if (team->t.t_stack_id != NULL) {
7697 __kmp_itt_stack_callee_leave((__itt_caller)team->t.t_stack_id);
7698 } else {
7699 KMP_DEBUG_ASSERT(team->t.t_parent->t.t_stack_id != NULL);
7700 __kmp_itt_stack_callee_leave(
7701 (__itt_caller)team->t.t_parent->t.t_stack_id);
7702 }
7703 }
7704#endif /* USE_ITT_BUILD */
7705 __kmp_run_after_invoked_task(gtid, tid, this_thr, team);
7706
7707 return rc;
7708}
7709
7710void __kmp_teams_master(int gtid) {
7711 // This routine is called by all primary threads in teams construct
7712 kmp_info_t *thr = __kmp_threads[gtid];
7713 kmp_team_t *team = thr->th.th_team;
7714 ident_t *loc = team->t.t_ident;
7715 thr->th.th_set_nproc = thr->th.th_teams_size.nth;
7716 KMP_DEBUG_ASSERT(thr->th.th_teams_microtask);
7717 KMP_DEBUG_ASSERT(thr->th.th_set_nproc);
7718 KA_TRACE(20, ("__kmp_teams_master: T#%d, Tid %d, microtask %p\n", gtid,
7719 __kmp_tid_from_gtid(gtid), thr->th.th_teams_microtask));
7720
7721 // This thread is a new CG root. Set up the proper variables.
7722 kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t));
7723 tmp->cg_root = thr; // Make thr the CG root
7724 // Init to thread limit stored when league primary threads were forked
7725 tmp->cg_thread_limit = thr->th.th_current_task->td_icvs.thread_limit;
7726 tmp->cg_nthreads = 1; // Init counter to one active thread, this one
7727 KA_TRACE(100, ("__kmp_teams_master: Thread %p created node %p and init"
7728 " cg_nthreads to 1\n",
7729 thr, tmp));
7730 tmp->up = thr->th.th_cg_roots;
7731 thr->th.th_cg_roots = tmp;
7732
7733// Launch the league of teams now, but do not let the workers execute
7734// (they hang on the fork barrier until the next parallel region)
7735#if INCLUDE_SSC_MARKS
7736 SSC_MARK_FORKING();
7737#endif
7738 __kmp_fork_call(loc, gtid, fork_context_intel, team->t.t_argc,
7739 (microtask_t)thr->th.th_teams_microtask, // "wrapped" task
7740 VOLATILE_CAST(launch_t) __kmp_invoke_task_func, NULL);
7741#if INCLUDE_SSC_MARKS
7742 SSC_MARK_JOINING();
7743#endif
7744 // If the team size was reduced from the limit, set it to the new size
7745 if (thr->th.th_team_nproc < thr->th.th_teams_size.nth)
7746 thr->th.th_teams_size.nth = thr->th.th_team_nproc;
7747 // AC: last parameter "1" eliminates join barrier which won't work because
7748 // worker threads are in a fork barrier waiting for more parallel regions
7749 __kmp_join_call(loc, gtid
7750#if OMPT_SUPPORT
7751 ,
7752 fork_context_intel
7753#endif
7754 ,
7755 1);
7756}
7757
7758int __kmp_invoke_teams_master(int gtid) {
7759 kmp_info_t *this_thr = __kmp_threads[gtid];
7760 kmp_team_t *team = this_thr->th.th_team;
7761#if KMP_DEBUG
7762 if (!__kmp_threads[gtid]->th.th_team->t.t_serialized)
7763 KMP_DEBUG_ASSERT((void *)__kmp_threads[gtid]->th.th_team->t.t_pkfn ==
7764 (void *)__kmp_teams_master);
7765#endif
7766 __kmp_run_before_invoked_task(gtid, 0, this_thr, team);
7767#if OMPT_SUPPORT
7768 int tid = __kmp_tid_from_gtid(gtid);
7769 ompt_data_t *task_data =
7770 &team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data;
7771 ompt_data_t *parallel_data = &team->t.ompt_team_info.parallel_data;
7772 if (ompt_enabled.ompt_callback_implicit_task) {
7773 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7774 ompt_scope_begin, parallel_data, task_data, team->t.t_nproc, tid,
7775 ompt_task_initial);
7776 OMPT_CUR_TASK_INFO(this_thr)->thread_num = tid;
7777 }
7778#endif
7779 __kmp_teams_master(gtid);
7780#if OMPT_SUPPORT
7781 this_thr->th.ompt_thread_info.parallel_flags |= ompt_parallel_league;
7782#endif
7783 __kmp_run_after_invoked_task(gtid, 0, this_thr, team);
7784 return 1;
7785}
7786
7787/* this sets the requested number of threads for the next parallel region
7788 encountered by this team. since this should be enclosed in the forkjoin
7789 critical section it should avoid race conditions with asymmetrical nested
7790 parallelism */
7791
7792void __kmp_push_num_threads(ident_t *id, int gtid, int num_threads) {
7793 kmp_info_t *thr = __kmp_threads[gtid];
7794
7795 if (num_threads > 0)
7796 thr->th.th_set_nproc = num_threads;
7797}
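/* Illustrative sketch (hypothetical outlined function and caller; the
   __kmpc_* entry points are the public ones declared in kmp.h): a compiler
   typically lowers `#pragma omp parallel num_threads(4)` as

     static void outlined_body(kmp_int32 *gtid, kmp_int32 *btid) {
       // ... body of the parallel region ...
     }

     void lower_parallel(ident_t *loc) {
       kmp_int32 gtid = __kmpc_global_thread_num(loc);
       __kmpc_push_num_threads(loc, gtid, 4); // forwards to __kmp_push_num_threads
       __kmpc_fork_call(loc, 0, (kmpc_micro)outlined_body);
     }

   so the th_set_nproc value stored here is consumed by the subsequent fork. */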
7798
7799static void __kmp_push_thread_limit(kmp_info_t *thr, int num_teams,
7800 int num_threads) {
7801 KMP_DEBUG_ASSERT(thr);
7802 // Remember the number of threads for inner parallel regions
7803 if (!TCR_4(__kmp_init_middle))
7804 __kmp_middle_initialize(); // get internal globals calculated
7805 __kmp_assign_root_init_mask();
7806 KMP_DEBUG_ASSERT(__kmp_avail_proc);
7807 KMP_DEBUG_ASSERT(__kmp_dflt_team_nth);
7808
7809 if (num_threads == 0) {
7810 if (__kmp_teams_thread_limit > 0) {
7811 num_threads = __kmp_teams_thread_limit;
7812 } else {
7813 num_threads = __kmp_avail_proc / num_teams;
7814 }
7815 // adjust num_threads without a warning, as it is not a user setting
7816 // num_threads = min(num_threads, nthreads-var, thread-limit-var)
7817 // no thread_limit clause specified - do not change thread-limit-var ICV
7818 if (num_threads > __kmp_dflt_team_nth) {
7819 num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV
7820 }
7821 if (num_threads > thr->th.th_current_task->td_icvs.thread_limit) {
7822 num_threads = thr->th.th_current_task->td_icvs.thread_limit;
7823 } // prevent the team size from exceeding thread-limit-var
7824 if (num_teams * num_threads > __kmp_teams_max_nth) {
7825 num_threads = __kmp_teams_max_nth / num_teams;
7826 }
7827 if (num_threads == 0) {
7828 num_threads = 1;
7829 }
7830 } else {
7831 if (num_threads < 0) {
7832 __kmp_msg(kmp_ms_warning, KMP_MSG(CantFormThrTeam, num_threads, 1),
7833 __kmp_msg_null);
7834 num_threads = 1;
7835 }
7836 // This thread will be the primary thread of the league's primary threads
7837 // Store new thread limit; old limit is saved in th_cg_roots list
7838 thr->th.th_current_task->td_icvs.thread_limit = num_threads;
7839 // num_threads = min(num_threads, nthreads-var)
7840 if (num_threads > __kmp_dflt_team_nth) {
7841 num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV
7842 }
7843 if (num_teams * num_threads > __kmp_teams_max_nth) {
7844 int new_threads = __kmp_teams_max_nth / num_teams;
7845 if (new_threads == 0) {
7846 new_threads = 1;
7847 }
7848 if (new_threads != num_threads) {
7849 if (!__kmp_reserve_warn) { // user asked for too many threads
7850 __kmp_reserve_warn = 1; // conflicts with KMP_TEAMS_THREAD_LIMIT
7851 __kmp_msg(kmp_ms_warning,
7852 KMP_MSG(CantFormThrTeam, num_threads, new_threads),
7853 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7854 }
7855 }
7856 num_threads = new_threads;
7857 }
7858 }
7859 thr->th.th_teams_size.nth = num_threads;
7860}
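/* Worked example of the clamping above (hypothetical values): with
   num_threads == 0 (no thread_limit clause), __kmp_teams_thread_limit == 0,
   __kmp_avail_proc = 64, num_teams = 4, __kmp_dflt_team_nth = 64,
   thread-limit-var = 64 and __kmp_teams_max_nth = 256:
     num_threads = 64 / 4 = 16   // split available procs across the teams
     16 <= 64 and 16 <= 64       // nthreads-var and thread-limit-var hold
     4 * 16 = 64 <= 256          // within __kmp_teams_max_nth
   so th_teams_size.nth becomes 16. If __kmp_teams_max_nth were 32 instead,
   num_threads would be clamped to 32 / 4 = 8. */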
7861
7862/* this sets the requested number of teams for the teams region and/or
7863 the number of threads for the next parallel region encountered */
7864void __kmp_push_num_teams(ident_t *id, int gtid, int num_teams,
7865 int num_threads) {
7866 kmp_info_t *thr = __kmp_threads[gtid];
7867 if (num_teams < 0) {
7868 // OpenMP specification requires requested values to be positive,
7869 // but people can send us any value, so we'd better check
7870 __kmp_msg(kmp_ms_warning, KMP_MSG(NumTeamsNotPositive, num_teams, 1),
7871 __kmp_msg_null);
7872 num_teams = 1;
7873 }
7874 if (num_teams == 0) {
7875 if (__kmp_nteams > 0) {
7876 num_teams = __kmp_nteams;
7877 } else {
7878 num_teams = 1; // default number of teams is 1.
7879 }
7880 }
7881 if (num_teams > __kmp_teams_max_nth) { // too many teams requested
7882 if (!__kmp_reserve_warn) {
7883 __kmp_reserve_warn = 1;
7884 __kmp_msg(kmp_ms_warning,
7885 KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth),
7886 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7887 }
7888 num_teams = __kmp_teams_max_nth;
7889 }
7890 // Set number of teams (number of threads in the outer "parallel" of the
7891 // teams)
7892 thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams;
7893
7894 __kmp_push_thread_limit(thr, num_teams, num_threads);
7895}
7896
7897/* This sets the requested number of teams for the teams region and/or
7898 the number of threads for the next parallel region encountered */
7899void __kmp_push_num_teams_51(ident_t *id, int gtid, int num_teams_lb,
7900 int num_teams_ub, int num_threads) {
7901 kmp_info_t *thr = __kmp_threads[gtid];
7902 KMP_DEBUG_ASSERT(num_teams_lb >= 0 && num_teams_ub >= 0);
7903 KMP_DEBUG_ASSERT(num_teams_ub >= num_teams_lb);
7904 KMP_DEBUG_ASSERT(num_threads >= 0);
7905
7906 if (num_teams_lb > num_teams_ub) {
7907 __kmp_fatal(KMP_MSG(FailedToCreateTeam, num_teams_lb, num_teams_ub),
7908 KMP_HNT(SetNewBound, __kmp_teams_max_nth), __kmp_msg_null);
7909 }
7910
7911 int num_teams = 1; // default number of teams is 1.
7912
7913 if (num_teams_lb == 0 && num_teams_ub > 0)
7914 num_teams_lb = num_teams_ub;
7915
7916 if (num_teams_lb == 0 && num_teams_ub == 0) { // no num_teams clause
7917 num_teams = (__kmp_nteams > 0) ? __kmp_nteams : num_teams;
7918 if (num_teams > __kmp_teams_max_nth) {
7919 if (!__kmp_reserve_warn) {
7920 __kmp_reserve_warn = 1;
7921 __kmp_msg(kmp_ms_warning,
7922 KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth),
7923 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7924 }
7925 num_teams = __kmp_teams_max_nth;
7926 }
7927 } else if (num_teams_lb == num_teams_ub) { // requires exact number of teams
7928 num_teams = num_teams_ub;
7929 } else { // num_teams_lb <= num_teams <= num_teams_ub
7930 if (num_threads <= 0) {
7931 if (num_teams_ub > __kmp_teams_max_nth) {
7932 num_teams = num_teams_lb;
7933 } else {
7934 num_teams = num_teams_ub;
7935 }
7936 } else {
7937 num_teams = (num_threads > __kmp_teams_max_nth)
7938 ? num_teams
7939 : __kmp_teams_max_nth / num_threads;
7940 if (num_teams < num_teams_lb) {
7941 num_teams = num_teams_lb;
7942 } else if (num_teams > num_teams_ub) {
7943 num_teams = num_teams_ub;
7944 }
7945 }
7946 }
7947 // Set number of teams (number of threads in the outer "parallel" of the
7948 // teams)
7949 thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams;
7950
7951 __kmp_push_thread_limit(thr, num_teams, num_threads);
7952}
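/* Illustrative sketch (hypothetical outlined function; the __kmpc_* entry
   points are the ones in kmp_csupport.cpp): an OpenMP 5.1
   `#pragma omp teams num_teams(4:8) thread_limit(16)` is typically lowered as

     kmp_int32 gtid = __kmpc_global_thread_num(loc);
     __kmpc_push_num_teams_51(loc, gtid, 4, 8, 16); // lb, ub, thread_limit
     __kmpc_fork_teams(loc, 0, (kmpc_micro)teams_outlined);

   which routes through __kmp_push_num_teams_51 above before the league of
   teams is forked. */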
7953
7954// Set the proc_bind var to use in the following parallel region.
7955void __kmp_push_proc_bind(ident_t *id, int gtid, kmp_proc_bind_t proc_bind) {
7956 kmp_info_t *thr = __kmp_threads[gtid];
7957 thr->th.th_set_proc_bind = proc_bind;
7958}
7959
7960/* Launch the worker threads into the microtask. */
7961
7962void __kmp_internal_fork(ident_t *id, int gtid, kmp_team_t *team) {
7963 kmp_info_t *this_thr = __kmp_threads[gtid];
7964
7965#ifdef KMP_DEBUG
7966 int f;
7967#endif /* KMP_DEBUG */
7968
7969 KMP_DEBUG_ASSERT(team);
7970 KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
7971 KMP_ASSERT(KMP_MASTER_GTID(gtid));
7972 KMP_MB(); /* Flush all pending memory write invalidates. */
7973
7974 team->t.t_construct = 0; /* no single directives seen yet */
7975 team->t.t_ordered.dt.t_value =
7976 0; /* thread 0 enters the ordered section first */
7977
7978 /* Reset the identifiers on the dispatch buffer */
7979 KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
7980 if (team->t.t_max_nproc > 1) {
7981 int i;
7982 for (i = 0; i < __kmp_dispatch_num_buffers; ++i) {
7983 team->t.t_disp_buffer[i].buffer_index = i;
7984 team->t.t_disp_buffer[i].doacross_buf_idx = i;
7985 }
7986 } else {
7987 team->t.t_disp_buffer[0].buffer_index = 0;
7988 team->t.t_disp_buffer[0].doacross_buf_idx = 0;
7989 }
7990
7991 KMP_MB(); /* Flush all pending memory write invalidates. */
7992 KMP_ASSERT(this_thr->th.th_team == team);
7993
7994#ifdef KMP_DEBUG
7995 for (f = 0; f < team->t.t_nproc; f++) {
7996 KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
7997 team->t.t_threads[f]->th.th_team_nproc == team->t.t_nproc);
7998 }
7999#endif /* KMP_DEBUG */
8000
8001 /* release the worker threads so they may begin working */
8002 __kmp_fork_barrier(gtid, 0);
8003}
8004
8005void __kmp_internal_join(ident_t *id, int gtid, kmp_team_t *team) {
8006 kmp_info_t *this_thr = __kmp_threads[gtid];
8007
8008 KMP_DEBUG_ASSERT(team);
8009 KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
8010 KMP_ASSERT(KMP_MASTER_GTID(gtid));
8011 KMP_MB(); /* Flush all pending memory write invalidates. */
8012
8013 /* Join barrier after fork */
8014
8015#ifdef KMP_DEBUG
8016 if (__kmp_threads[gtid] &&
8017 __kmp_threads[gtid]->th.th_team_nproc != team->t.t_nproc) {
8018 __kmp_printf("GTID: %d, __kmp_threads[%d]=%p\n", gtid, gtid,
8019 __kmp_threads[gtid]);
8020 __kmp_printf("__kmp_threads[%d]->th.th_team_nproc=%d, TEAM: %p, "
8021 "team->t.t_nproc=%d\n",
8022 gtid, __kmp_threads[gtid]->th.th_team_nproc, team,
8023 team->t.t_nproc);
8024 __kmp_print_structure();
8025 }
8026 KMP_DEBUG_ASSERT(__kmp_threads[gtid] &&
8027 __kmp_threads[gtid]->th.th_team_nproc == team->t.t_nproc);
8028#endif /* KMP_DEBUG */
8029
8030 __kmp_join_barrier(gtid); /* wait for everyone */
8031#if OMPT_SUPPORT
8032 if (ompt_enabled.enabled &&
8033 this_thr->th.ompt_thread_info.state == ompt_state_wait_barrier_implicit) {
8034 int ds_tid = this_thr->th.th_info.ds.ds_tid;
8035 ompt_data_t *task_data = OMPT_CUR_TASK_DATA(this_thr);
8036 this_thr->th.ompt_thread_info.state = ompt_state_overhead;
8037#if OMPT_OPTIONAL
8038 void *codeptr = NULL;
8039 if (KMP_MASTER_TID(ds_tid) &&
8040 (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
8041 ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
8042 codeptr = OMPT_CUR_TEAM_INFO(this_thr)->master_return_address;
8043
8044 if (ompt_enabled.ompt_callback_sync_region_wait) {
8045 ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
8046 ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
8047 codeptr);
8048 }
8049 if (ompt_enabled.ompt_callback_sync_region) {
8050 ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
8051 ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
8052 codeptr);
8053 }
8054#endif
8055 if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) {
8056 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
8057 ompt_scope_end, NULL, task_data, 0, ds_tid,
8058 ompt_task_implicit); // TODO: Can this be ompt_task_initial?
8059 }
8060 }
8061#endif
8062
8063 KMP_MB(); /* Flush all pending memory write invalidates. */
8064 KMP_ASSERT(this_thr->th.th_team == team);
8065}
8066
8067/* ------------------------------------------------------------------------ */
8068
8069#ifdef USE_LOAD_BALANCE
8070
8071// Return the number of worker threads actively spinning in the hot team if we
8072// are at the outermost level of parallelism. Otherwise, return 0.
8073static int __kmp_active_hot_team_nproc(kmp_root_t *root) {
8074 int i;
8075 int retval;
8076 kmp_team_t *hot_team;
8077
8078 if (root->r.r_active) {
8079 return 0;
8080 }
8081 hot_team = root->r.r_hot_team;
8082 if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME) {
8083 return hot_team->t.t_nproc - 1; // Don't count primary thread
8084 }
8085
8086 // Skip the primary thread - it is accounted for elsewhere.
8087 retval = 0;
8088 for (i = 1; i < hot_team->t.t_nproc; i++) {
8089 if (hot_team->t.t_threads[i]->th.th_active) {
8090 retval++;
8091 }
8092 }
8093 return retval;
8094}
8095
8096// Perform an automatic adjustment to the number of
8097// threads used by the next parallel region.
8098static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc) {
8099 int retval;
8100 int pool_active;
8101 int hot_team_active;
8102 int team_curr_active;
8103 int system_active;
8104
8105 KB_TRACE(20, ("__kmp_load_balance_nproc: called root:%p set_nproc:%d\n", root,
8106 set_nproc));
8107 KMP_DEBUG_ASSERT(root);
8108 KMP_DEBUG_ASSERT(root->r.r_root_team->t.t_threads[0]
8109 ->th.th_current_task->td_icvs.dynamic == TRUE);
8110 KMP_DEBUG_ASSERT(set_nproc > 1);
8111
8112 if (set_nproc == 1) {
8113 KB_TRACE(20, ("__kmp_load_balance_nproc: serial execution.\n"));
8114 return 1;
8115 }
8116
8117 // Threads that are active in the thread pool, active in the hot team for this
8118 // particular root (if we are at the outer par level), and the currently
8119 // executing thread (to become the primary thread) are available to add to the
8120 // new team, but are currently contributing to the system load, and must be
8121 // accounted for.
8122 pool_active = __kmp_thread_pool_active_nth;
8123 hot_team_active = __kmp_active_hot_team_nproc(root);
8124 team_curr_active = pool_active + hot_team_active + 1;
8125
8126 // Check the system load.
8127 system_active = __kmp_get_load_balance(__kmp_avail_proc + team_curr_active);
8128 KB_TRACE(30, ("__kmp_load_balance_nproc: system active = %d pool active = %d "
8129 "hot team active = %d\n",
8130 system_active, pool_active, hot_team_active));
8131
8132 if (system_active < 0) {
8133 // There was an error reading the necessary info from /proc, so use the
8134 // thread limit algorithm instead. Once we set __kmp_global.g.g_dynamic_mode
8135 // = dynamic_thread_limit, we shouldn't wind up getting back here.
8136 __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
8137 KMP_WARNING(CantLoadBalUsing, "KMP_DYNAMIC_MODE=thread limit");
8138
8139 // Make this call behave like the thread limit algorithm.
8140 retval = __kmp_avail_proc - __kmp_nth +
8141 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
8142 if (retval > set_nproc) {
8143 retval = set_nproc;
8144 }
8145 if (retval < KMP_MIN_NTH) {
8146 retval = KMP_MIN_NTH;
8147 }
8148
8149 KB_TRACE(20, ("__kmp_load_balance_nproc: thread limit exit. retval:%d\n",
8150 retval));
8151 return retval;
8152 }
8153
8154 // There is a slight delay in the load balance algorithm in detecting new
8155 // running procs. The real system load at this instant should be at least as
8156 // large as the number of active OpenMP threads that are available to add to the team.
8157 if (system_active < team_curr_active) {
8158 system_active = team_curr_active;
8159 }
8160 retval = __kmp_avail_proc - system_active + team_curr_active;
8161 if (retval > set_nproc) {
8162 retval = set_nproc;
8163 }
8164 if (retval < KMP_MIN_NTH) {
8165 retval = KMP_MIN_NTH;
8166 }
8167
8168 KB_TRACE(20, ("__kmp_load_balance_nproc: exit. retval:%d\n", retval));
8169 return retval;
8170} // __kmp_load_balance_nproc()
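/* Worked example of the formula above (hypothetical numbers): with
   __kmp_avail_proc = 16, pool_active = 2 and hot_team_active = 3,
   team_curr_active = 2 + 3 + 1 = 6; if the measured system_active = 10,
     retval = 16 - 10 + 6 = 12
   i.e. roughly 12 threads can run without oversubscribing the machine, and
   the result is then clamped to the [KMP_MIN_NTH, set_nproc] range. */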
8171
8172#endif /* USE_LOAD_BALANCE */
8173
8174/* ------------------------------------------------------------------------ */
8175
8176/* NOTE: this is called with the __kmp_init_lock held */
8177void __kmp_cleanup(void) {
8178 int f;
8179
8180 KA_TRACE(10, ("__kmp_cleanup: enter\n"));
8181
8182 if (TCR_4(__kmp_init_parallel)) {
8183#if KMP_HANDLE_SIGNALS
8184 __kmp_remove_signals();
8185#endif
8186 TCW_4(__kmp_init_parallel, FALSE);
8187 }
8188
8189 if (TCR_4(__kmp_init_middle)) {
8190#if KMP_AFFINITY_SUPPORTED
8191 __kmp_affinity_uninitialize();
8192#endif /* KMP_AFFINITY_SUPPORTED */
8193 __kmp_cleanup_hierarchy();
8194 TCW_4(__kmp_init_middle, FALSE);
8195 }
8196
8197 KA_TRACE(10, ("__kmp_cleanup: go serial cleanup\n"));
8198
8199 if (__kmp_init_serial) {
8200 __kmp_runtime_destroy();
8201 __kmp_init_serial = FALSE;
8202 }
8203
8204 __kmp_cleanup_threadprivate_caches();
8205
8206 for (f = 0; f < __kmp_threads_capacity; f++) {
8207 if (__kmp_root[f] != NULL) {
8208 __kmp_free(__kmp_root[f]);
8209 __kmp_root[f] = NULL;
8210 }
8211 }
8212 __kmp_free(__kmp_threads);
8213 // __kmp_threads and __kmp_root were allocated at once, as a single block, so
8214 // there is no need to free __kmp_root separately.
8215 __kmp_threads = NULL;
8216 __kmp_root = NULL;
8217 __kmp_threads_capacity = 0;
8218
8219 // Free old __kmp_threads arrays if they exist.
8220 kmp_old_threads_list_t *ptr = __kmp_old_threads_list;
8221 while (ptr) {
8222 kmp_old_threads_list_t *next = ptr->next;
8223 __kmp_free(ptr->threads);
8224 __kmp_free(ptr);
8225 ptr = next;
8226 }
8227
8228#if KMP_USE_DYNAMIC_LOCK
8229 __kmp_cleanup_indirect_user_locks();
8230#else
8231 __kmp_cleanup_user_locks();
8232#endif
8233#if OMPD_SUPPORT
8234 if (ompd_state) {
8235 __kmp_free(ompd_env_block);
8236 ompd_env_block = NULL;
8237 ompd_env_block_size = 0;
8238 }
8239#endif
8240
8241#if KMP_AFFINITY_SUPPORTED
8242 KMP_INTERNAL_FREE(CCAST(char *, __kmp_cpuinfo_file));
8243 __kmp_cpuinfo_file = NULL;
8244#endif /* KMP_AFFINITY_SUPPORTED */
8245
8246#if KMP_USE_ADAPTIVE_LOCKS
8247#if KMP_DEBUG_ADAPTIVE_LOCKS
8248 __kmp_print_speculative_stats();
8249#endif
8250#endif
8251 KMP_INTERNAL_FREE(__kmp_nested_nth.nth);
8252 __kmp_nested_nth.nth = NULL;
8253 __kmp_nested_nth.size = 0;
8254 __kmp_nested_nth.used = 0;
8255 KMP_INTERNAL_FREE(__kmp_nested_proc_bind.bind_types);
8256 __kmp_nested_proc_bind.bind_types = NULL;
8257 __kmp_nested_proc_bind.size = 0;
8258 __kmp_nested_proc_bind.used = 0;
8259 if (__kmp_affinity_format) {
8260 KMP_INTERNAL_FREE(__kmp_affinity_format);
8261 __kmp_affinity_format = NULL;
8262 }
8263
8264 __kmp_i18n_catclose();
8265
8266#if KMP_USE_HIER_SCHED
8267 __kmp_hier_scheds.deallocate();
8268#endif
8269
8270#if KMP_STATS_ENABLED
8271 __kmp_stats_fini();
8272#endif
8273
8274 KA_TRACE(10, ("__kmp_cleanup: exit\n"));
8275}
8276
8277/* ------------------------------------------------------------------------ */
8278
8279int __kmp_ignore_mppbeg(void) {
8280 char *env;
8281
8282 if ((env = getenv("KMP_IGNORE_MPPBEG")) != NULL) {
8283 if (__kmp_str_match_false(env))
8284 return FALSE;
8285 }
8286 // By default __kmpc_begin() is no-op.
8287 return TRUE;
8288}
8289
8290int __kmp_ignore_mppend(void) {
8291 char *env;
8292
8293 if ((env = getenv("KMP_IGNORE_MPPEND")) != NULL) {
8294 if (__kmp_str_match_false(env))
8295 return FALSE;
8296 }
8297 // By default __kmpc_end() is no-op.
8298 return TRUE;
8299}
8300
8301void __kmp_internal_begin(void) {
8302 int gtid;
8303 kmp_root_t *root;
8304
8305 /* this is a very important step as it will register new sibling threads
8306 and assign these new uber threads a new gtid */
8307 gtid = __kmp_entry_gtid();
8308 root = __kmp_threads[gtid]->th.th_root;
8309 KMP_ASSERT(KMP_UBER_GTID(gtid));
8310
8311 if (root->r.r_begin)
8312 return;
8313 __kmp_acquire_lock(&root->r.r_begin_lock, gtid);
8314 if (root->r.r_begin) {
8315 __kmp_release_lock(&root->r.r_begin_lock, gtid);
8316 return;
8317 }
8318
8319 root->r.r_begin = TRUE;
8320
8321 __kmp_release_lock(&root->r.r_begin_lock, gtid);
8322}
8323
8324/* ------------------------------------------------------------------------ */
8325
8326void __kmp_user_set_library(enum library_type arg) {
8327 int gtid;
8328 kmp_root_t *root;
8329 kmp_info_t *thread;
8330
8331 /* first, make sure we are initialized so we can get our gtid */
8332
8333 gtid = __kmp_entry_gtid();
8334 thread = __kmp_threads[gtid];
8335
8336 root = thread->th.th_root;
8337
8338 KA_TRACE(20, ("__kmp_user_set_library: enter T#%d, arg: %d, %d\n", gtid, arg,
8339 library_serial));
8340 if (root->r.r_in_parallel) { /* Must be called in serial section of top-level
8341 thread */
8342 KMP_WARNING(SetLibraryIncorrectCall);
8343 return;
8344 }
8345
8346 switch (arg) {
8347 case library_serial:
8348 thread->th.th_set_nproc = 0;
8349 set__nproc(thread, 1);
8350 break;
8351 case library_turnaround:
8352 thread->th.th_set_nproc = 0;
8353 set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
8354 : __kmp_dflt_team_nth_ub);
8355 break;
8356 case library_throughput:
8357 thread->th.th_set_nproc = 0;
8358 set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
8359 : __kmp_dflt_team_nth_ub);
8360 break;
8361 default:
8362 KMP_FATAL(UnknownLibraryType, arg);
8363 }
8364
8365 __kmp_aux_set_library(arg);
8366}
8367
8368void __kmp_aux_set_stacksize(size_t arg) {
8369 if (!__kmp_init_serial)
8370 __kmp_serial_initialize();
8371
8372#if KMP_OS_DARWIN
8373 if (arg & (0x1000 - 1)) {
8374 arg &= ~(0x1000 - 1);
8375 if (arg + 0x1000) /* check for overflow if we round up */
8376 arg += 0x1000;
8377 }
8378#endif
8379 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
8380
8381 /* only change the default stacksize before the first parallel region */
8382 if (!TCR_4(__kmp_init_parallel)) {
8383 size_t value = arg; /* argument is in bytes */
8384
8385 if (value < __kmp_sys_min_stksize)
8386 value = __kmp_sys_min_stksize;
8387 else if (value > KMP_MAX_STKSIZE)
8388 value = KMP_MAX_STKSIZE;
8389
8390 __kmp_stksize = value;
8391
8392 __kmp_env_stksize = TRUE; /* was KMP_STACKSIZE specified? */
8393 }
8394
8395 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
8396}
8397
8398/* set the behaviour of the runtime library */
8399/* TODO this can cause some odd behaviour with sibling parallelism... */
8400void __kmp_aux_set_library(enum library_type arg) {
8401 __kmp_library = arg;
8402
8403 switch (__kmp_library) {
8404 case library_serial: {
8405 KMP_INFORM(LibraryIsSerial);
8406 } break;
8407 case library_turnaround:
8408 if (__kmp_use_yield == 1 && !__kmp_use_yield_exp_set)
8409 __kmp_use_yield = 2; // only yield when oversubscribed
8410 break;
8411 case library_throughput:
8412 if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME)
8413 __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME;
8414 break;
8415 default:
8416 KMP_FATAL(UnknownLibraryType, arg);
8417 }
8418}
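/* Usage sketch (the kmp_set_library_* extensions declared in the libomp
   <omp.h> and the KMP_LIBRARY environment variable funnel into
   __kmp_user_set_library / __kmp_aux_set_library above):

     #include <omp.h>
     kmp_set_library_throughput(); // like KMP_LIBRARY=throughput: sleep after
                                   // blocktime expires
     kmp_set_library_turnaround(); // like KMP_LIBRARY=turnaround: yield only
                                   // when oversubscribed
     kmp_set_library_serial();     // like KMP_LIBRARY=serial: one thread
*/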
8419
8420/* Getting team information common for all team API */
8421// Returns NULL if not in teams construct
8422static kmp_team_t *__kmp_aux_get_team_info(int &teams_serialized) {
8423 kmp_info_t *thr = __kmp_entry_thread();
8424 teams_serialized = 0;
8425 if (thr->th.th_teams_microtask) {
8426 kmp_team_t *team = thr->th.th_team;
8427 int tlevel = thr->th.th_teams_level; // the level of the teams construct
8428 int ii = team->t.t_level;
8429 teams_serialized = team->t.t_serialized;
8430 int level = tlevel + 1;
8431 KMP_DEBUG_ASSERT(ii >= tlevel);
8432 while (ii > level) {
8433 for (teams_serialized = team->t.t_serialized;
8434 (teams_serialized > 0) && (ii > level); teams_serialized--, ii--) {
8435 }
8436 if (team->t.t_serialized && (!teams_serialized)) {
8437 team = team->t.t_parent;
8438 continue;
8439 }
8440 if (ii > level) {
8441 team = team->t.t_parent;
8442 ii--;
8443 }
8444 }
8445 return team;
8446 }
8447 return NULL;
8448}
8449
8450int __kmp_aux_get_team_num() {
8451 int serialized;
8452 kmp_team_t *team = __kmp_aux_get_team_info(serialized);
8453 if (team) {
8454 if (serialized > 1) {
8455 return 0; // teams region is serialized ( 1 team of 1 thread ).
8456 } else {
8457 return team->t.t_master_tid;
8458 }
8459 }
8460 return 0;
8461}
8462
8463int __kmp_aux_get_num_teams() {
8464 int serialized;
8465 kmp_team_t *team = __kmp_aux_get_team_info(serialized);
8466 if (team) {
8467 if (serialized > 1) {
8468 return 1;
8469 } else {
8470 return team->t.t_parent->t.t_nproc;
8471 }
8472 }
8473 return 1;
8474}
8475
8476/* ------------------------------------------------------------------------ */
8477
8478/*
8479 * Affinity Format Parser
8480 *
8481 * Field is in form of: %[[[0].]size]type
8482 * % and type are required (%% means print a literal '%')
8483 * type is either single char or long name surrounded by {},
8484 * e.g., N or {num_threads}
8485 * 0 => leading zeros
8486 * . => right justified when size is specified
8487 * by default output is left justified
8488 * size is the *minimum* field length
8489 * All other characters are printed as is
8490 *
8491 * Available field types:
8492 * t {team_num} - omp_get_team_num()
8493 * T {num_teams} - omp_get_num_teams()
8494 * L {nesting_level} - omp_get_level()
8495 * n {thread_num} - omp_get_thread_num()
8496 * N {num_threads} - omp_get_num_threads()
8497 * a {ancestor_tnum} - omp_get_ancestor_thread_num(omp_get_level()-1)
8498 * H {host} - name of host machine; P {process_id} - process id (integer)
8499 * i {native_thread_id} - native thread identifier (integer)
8500 * A {thread_affinity} - comma separated list of integers/ranges (affinity mask)
8501 *
8502 * Implementation-specific field types can be added
8503 * If a type is unknown, print "undefined"
8504 */
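/* Examples of format strings (the host name, pid, thread number and mask
   shown are hypothetical, purely to illustrate the syntax above):
     "host=%H pid=%P tid=%n"        -> "host=nodeA pid=12345 tid=3"
     "OMP: %0.4n of %N on {%A}"     -> "OMP: 0003 of 8 on {0-3}"
     "%{host} L%{nesting_level} %%" -> "nodeA L1 %"
*/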
8505
8506// Structure holding the short name, long name, and corresponding data type
8507// for snprintf. A table of these will represent the entire valid keyword
8508// field types.
8509typedef struct kmp_affinity_format_field_t {
8510 char short_name; // from spec e.g., L -> thread level
8511 const char *long_name; // from spec thread_level -> thread level
8512 char field_format; // data type for snprintf (typically 'd' or 's'
8513 // for integer or string)
8514} kmp_affinity_format_field_t;
8515
8516static const kmp_affinity_format_field_t __kmp_affinity_format_table[] = {
8517#if KMP_AFFINITY_SUPPORTED
8518 {'A', "thread_affinity", 's'},
8519#endif
8520 {'t', "team_num", 'd'},
8521 {'T', "num_teams", 'd'},
8522 {'L', "nesting_level", 'd'},
8523 {'n', "thread_num", 'd'},
8524 {'N', "num_threads", 'd'},
8525 {'a', "ancestor_tnum", 'd'},
8526 {'H', "host", 's'},
8527 {'P', "process_id", 'd'},
8528 {'i', "native_thread_id", 'd'}};
8529
8530// Return the number of characters it takes to hold the field
8531static int __kmp_aux_capture_affinity_field(int gtid, const kmp_info_t *th,
8532 const char **ptr,
8533 kmp_str_buf_t *field_buffer) {
8534 int rc, format_index, field_value;
8535 const char *width_left, *width_right;
8536 bool pad_zeros, right_justify, parse_long_name, found_valid_name;
8537 static const int FORMAT_SIZE = 20;
8538 char format[FORMAT_SIZE] = {0};
8539 char absolute_short_name = 0;
8540
8541 KMP_DEBUG_ASSERT(gtid >= 0);
8542 KMP_DEBUG_ASSERT(th);
8543 KMP_DEBUG_ASSERT(**ptr == '%');
8544 KMP_DEBUG_ASSERT(field_buffer);
8545
8546 __kmp_str_buf_clear(field_buffer);
8547
8548 // Skip the initial %
8549 (*ptr)++;
8550
8551 // Check for %% first
8552 if (**ptr == '%') {
8553 __kmp_str_buf_cat(field_buffer, "%", 1);
8554 (*ptr)++; // skip over the second %
8555 return 1;
8556 }
8557
8558 // Parse field modifiers if they are present
8559 pad_zeros = false;
8560 if (**ptr == '0') {
8561 pad_zeros = true;
8562 (*ptr)++; // skip over 0
8563 }
8564 right_justify = false;
8565 if (**ptr == '.') {
8566 right_justify = true;
8567 (*ptr)++; // skip over .
8568 }
8569 // Parse width of field: [width_left, width_right)
8570 width_left = width_right = NULL;
8571 if (**ptr >= '0' && **ptr <= '9') {
8572 width_left = *ptr;
8573 SKIP_DIGITS(*ptr);
8574 width_right = *ptr;
8575 }
8576
8577 // Create the format for KMP_SNPRINTF based on flags parsed above
8578 format_index = 0;
8579 format[format_index++] = '%';
8580 if (!right_justify)
8581 format[format_index++] = '-';
8582 if (pad_zeros)
8583 format[format_index++] = '0';
8584 if (width_left && width_right) {
8585 int i = 0;
8586 // Only allow 8 digit number widths.
8587 // This also prevents overflowing format variable
8588 while (i < 8 && width_left < width_right) {
8589 format[format_index++] = *width_left;
8590 width_left++;
8591 i++;
8592 }
8593 }
8594
8595 // Parse a name (long or short)
8596 // Canonicalize the name into absolute_short_name
8597 found_valid_name = false;
8598 parse_long_name = (**ptr == '{');
8599 if (parse_long_name)
8600 (*ptr)++; // skip initial left brace
8601 for (size_t i = 0; i < sizeof(__kmp_affinity_format_table) /
8602 sizeof(__kmp_affinity_format_table[0]);
8603 ++i) {
8604 char short_name = __kmp_affinity_format_table[i].short_name;
8605 const char *long_name = __kmp_affinity_format_table[i].long_name;
8606 char field_format = __kmp_affinity_format_table[i].field_format;
8607 if (parse_long_name) {
8608 size_t length = KMP_STRLEN(long_name);
8609 if (strncmp(*ptr, long_name, length) == 0) {
8610 found_valid_name = true;
8611 (*ptr) += length; // skip the long name
8612 }
8613 } else if (**ptr == short_name) {
8614 found_valid_name = true;
8615 (*ptr)++; // skip the short name
8616 }
8617 if (found_valid_name) {
8618 format[format_index++] = field_format;
8619 format[format_index++] = '\0';
8620 absolute_short_name = short_name;
8621 break;
8622 }
8623 }
8624 if (parse_long_name) {
8625 if (**ptr != '}') {
8626 absolute_short_name = 0;
8627 } else {
8628 (*ptr)++; // skip over the right brace
8629 }
8630 }
8631
8632 // Attempt to fill the buffer with the requested
8633 // value using snprintf within __kmp_str_buf_print()
8634 switch (absolute_short_name) {
8635 case 't':
8636 rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_team_num());
8637 break;
8638 case 'T':
8639 rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_num_teams());
8640 break;
8641 case 'L':
8642 rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_level);
8643 break;
8644 case 'n':
8645 rc = __kmp_str_buf_print(field_buffer, format, __kmp_tid_from_gtid(gtid));
8646 break;
8647 case 'H': {
8648 static const int BUFFER_SIZE = 256;
8649 char buf[BUFFER_SIZE];
8650 __kmp_expand_host_name(buf, BUFFER_SIZE);
8651 rc = __kmp_str_buf_print(field_buffer, format, buf);
8652 } break;
8653 case 'P':
8654 rc = __kmp_str_buf_print(field_buffer, format, getpid());
8655 break;
8656 case 'i':
8657 rc = __kmp_str_buf_print(field_buffer, format, __kmp_gettid());
8658 break;
8659 case 'N':
8660 rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_nproc);
8661 break;
8662 case 'a':
8663 field_value =
8664 __kmp_get_ancestor_thread_num(gtid, th->th.th_team->t.t_level - 1);
8665 rc = __kmp_str_buf_print(field_buffer, format, field_value);
8666 break;
8667#if KMP_AFFINITY_SUPPORTED
8668 case 'A': {
8669 kmp_str_buf_t buf;
8670 __kmp_str_buf_init(&buf);
8671 __kmp_affinity_str_buf_mask(&buf, th->th.th_affin_mask);
8672 rc = __kmp_str_buf_print(field_buffer, format, buf.str);
8673 __kmp_str_buf_free(&buf);
8674 } break;
8675#endif
8676 default:
8677 // According to the spec, if an implementation does not have info for a field
8678 // type, then "undefined" is printed
8679 rc = __kmp_str_buf_print(field_buffer, "%s", "undefined");
8680 // Skip the field
8681 if (parse_long_name) {
8682 SKIP_TOKEN(*ptr);
8683 if (**ptr == '}')
8684 (*ptr)++;
8685 } else {
8686 (*ptr)++;
8687 }
8688 }
8689
8690 KMP_ASSERT(format_index <= FORMAT_SIZE);
8691 return rc;
8692}
8693
8694/*
8695 * Return the number of characters needed to hold the affinity string
8696 * (not including the terminating null character).
8697 * The resulting string is printed to buffer, which the caller can then
8698 * handle afterwards.
8699 */
8700size_t __kmp_aux_capture_affinity(int gtid, const char *format,
8701 kmp_str_buf_t *buffer) {
8702 const char *parse_ptr;
8703 size_t retval;
8704 const kmp_info_t *th;
8705 kmp_str_buf_t field;
8706
8707 KMP_DEBUG_ASSERT(buffer);
8708 KMP_DEBUG_ASSERT(gtid >= 0);
8709
8710 __kmp_str_buf_init(&field);
8711 __kmp_str_buf_clear(buffer);
8712
8713 th = __kmp_threads[gtid];
8714 retval = 0;
8715
8716 // If format is NULL or zero-length string, then we use
8717 // affinity-format-var ICV
8718 parse_ptr = format;
8719 if (parse_ptr == NULL || *parse_ptr == '\0') {
8720 parse_ptr = __kmp_affinity_format;
8721 }
8722 KMP_DEBUG_ASSERT(parse_ptr);
8723
8724 while (*parse_ptr != '\0') {
8725 // Parse a field
8726 if (*parse_ptr == '%') {
8727 // Put field in the buffer
8728 int rc = __kmp_aux_capture_affinity_field(gtid, th, &parse_ptr, &field);
8729 __kmp_str_buf_catbuf(buffer, &field);
8730 retval += rc;
8731 } else {
8732 // Put literal character in buffer
8733 __kmp_str_buf_cat(buffer, parse_ptr, 1);
8734 retval++;
8735 parse_ptr++;
8736 }
8737 }
8738 __kmp_str_buf_free(&field);
8739 return retval;
8740}
8741
8742// Displays the affinity string to stdout
8743void __kmp_aux_display_affinity(int gtid, const char *format) {
8744 kmp_str_buf_t buf;
8745 __kmp_str_buf_init(&buf);
8746 __kmp_aux_capture_affinity(gtid, format, &buf);
8747 __kmp_fprintf(kmp_out, "%s" KMP_END_OF_LINE, buf.str);
8748 __kmp_str_buf_free(&buf);
8749}
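/* Usage sketch (standard OpenMP 5.0 affinity-display API from <omp.h>, which
   funnels into the two routines above; the buffer size is arbitrary):

     #include <omp.h>
     #include <stdio.h>

     void show_binding(void) {
       char buf[256];
       size_t needed =
           omp_capture_affinity(buf, sizeof(buf), "host=%H tid=%0.4n aff={%A}");
       if (needed >= sizeof(buf))
         printf("affinity string truncated (%zu bytes needed)\n", needed);
       omp_display_affinity(NULL); // NULL/"" selects affinity-format-var
     }
*/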
8750
8751/* ------------------------------------------------------------------------ */
8752
8753void __kmp_aux_set_blocktime(int arg, kmp_info_t *thread, int tid) {
8754 int blocktime = arg; /* argument is in milliseconds */
8755#if KMP_USE_MONITOR
8756 int bt_intervals;
8757#endif
8758 kmp_int8 bt_set;
8759
8760 __kmp_save_internal_controls(thread);
8761
8762 /* Normalize and set blocktime for the teams */
8763 if (blocktime < KMP_MIN_BLOCKTIME)
8764 blocktime = KMP_MIN_BLOCKTIME;
8765 else if (blocktime > KMP_MAX_BLOCKTIME)
8766 blocktime = KMP_MAX_BLOCKTIME;
8767
8768 set__blocktime_team(thread->th.th_team, tid, blocktime);
8769 set__blocktime_team(thread->th.th_serial_team, 0, blocktime);
8770
8771#if KMP_USE_MONITOR
8772 /* Calculate and set blocktime intervals for the teams */
8773 bt_intervals = KMP_INTERVALS_FROM_BLOCKTIME(blocktime, __kmp_monitor_wakeups);
8774
8775 set__bt_intervals_team(thread->th.th_team, tid, bt_intervals);
8776 set__bt_intervals_team(thread->th.th_serial_team, 0, bt_intervals);
8777#endif
8778
8779 /* Set whether blocktime has been set to "TRUE" */
8780 bt_set = TRUE;
8781
8782 set__bt_set_team(thread->th.th_team, tid, bt_set);
8783 set__bt_set_team(thread->th.th_serial_team, 0, bt_set);
8784#if KMP_USE_MONITOR
8785 KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d, "
8786 "bt_intervals=%d, monitor_updates=%d\n",
8787 __kmp_gtid_from_tid(tid, thread->th.th_team),
8788 thread->th.th_team->t.t_id, tid, blocktime, bt_intervals,
8789 __kmp_monitor_wakeups));
8790#else
8791 KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d\n",
8792 __kmp_gtid_from_tid(tid, thread->th.th_team),
8793 thread->th.th_team->t.t_id, tid, blocktime));
8794#endif
8795}
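/* Usage sketch (the kmp_set_blocktime() extension declared in the libomp
   <omp.h> reaches this routine; the KMP_BLOCKTIME environment variable sets
   the initial default instead; the values shown are arbitrary):

     #include <omp.h>
     kmp_set_blocktime(200); // spin for ~200 ms before sleeping
     kmp_set_blocktime(0);   // go to sleep immediately after a region
*/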
8796
8797void __kmp_aux_set_defaults(char const *str, size_t len) {
8798 if (!__kmp_init_serial) {
8799 __kmp_serial_initialize();
8800 }
8801 __kmp_env_initialize(str);
8802
8803 if (__kmp_settings || __kmp_display_env || __kmp_display_env_verbose) {
8804 __kmp_env_print();
8805 }
8806} // __kmp_aux_set_defaults
8807
8808/* ------------------------------------------------------------------------ */
8809/* internal fast reduction routines */
8810
8811PACKED_REDUCTION_METHOD_T
8812__kmp_determine_reduction_method(
8813 ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars, size_t reduce_size,
8814 void *reduce_data, void (*reduce_func)(void *lhs_data, void *rhs_data),
8815 kmp_critical_name *lck) {
8816
8817 // Default reduction method: critical construct ( lck != NULL, like in current
8818 // PAROPT )
8819 // If ( reduce_data!=NULL && reduce_func!=NULL ): the tree-reduction method
8820 // can be selected by RTL
8821 // If loc->flags contains KMP_IDENT_ATOMIC_REDUCE, the atomic reduce method
8822 // can be selected by RTL
8823 // Finally, it's up to OpenMP RTL to make a decision on which method to select
8824 // among generated by PAROPT.
8825
8826 PACKED_REDUCTION_METHOD_T retval;
8827
8828 int team_size;
8829
8830 KMP_DEBUG_ASSERT(loc); // it would be nice to test ( loc != 0 )
8831 KMP_DEBUG_ASSERT(lck); // it would be nice to test ( lck != 0 )
8832
8833#define FAST_REDUCTION_ATOMIC_METHOD_GENERATED \
8834 (loc && \
8835 ((loc->flags & (KMP_IDENT_ATOMIC_REDUCE)) == (KMP_IDENT_ATOMIC_REDUCE)))
8836#define FAST_REDUCTION_TREE_METHOD_GENERATED ((reduce_data) && (reduce_func))
8837
8838 retval = critical_reduce_block;
8839
8840 // another way of getting the team size (with one dynamic dereference) is slower
8841 team_size = __kmp_get_team_num_threads(global_tid);
8842 if (team_size == 1) {
8843
8844 retval = empty_reduce_block;
8845
8846 } else {
8847
8848 int atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
8849
8850#if KMP_ARCH_X86_64 || KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 || \
8851 KMP_ARCH_MIPS64 || KMP_ARCH_RISCV64 || KMP_ARCH_LOONGARCH64
8852
8853#if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD || \
8854 KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD
8855
8856 int teamsize_cutoff = 4;
8857
8858#if KMP_MIC_SUPPORTED
8859 if (__kmp_mic_type != non_mic) {
8860 teamsize_cutoff = 8;
8861 }
8862#endif
8863 int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8864 if (tree_available) {
8865 if (team_size <= teamsize_cutoff) {
8866 if (atomic_available) {
8867 retval = atomic_reduce_block;
8868 }
8869 } else {
8870 retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
8871 }
8872 } else if (atomic_available) {
8873 retval = atomic_reduce_block;
8874 }
8875#else
8876#error "Unknown or unsupported OS"
8877#endif // KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD ||
8878 // KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD
8879
8880#elif KMP_ARCH_X86 || KMP_ARCH_ARM || KMP_ARCH_AARCH || KMP_ARCH_MIPS
8881
8882#if KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_WINDOWS || KMP_OS_HURD
8883
8884 // basic tuning
8885
8886 if (atomic_available) {
8887 if (num_vars <= 2) { // && ( team_size <= 8 ) due to false-sharing ???
8888 retval = atomic_reduce_block;
8889 }
8890 } // otherwise: use critical section
8891
8892#elif KMP_OS_DARWIN
8893
8894 int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8895 if (atomic_available && (num_vars <= 3)) {
8896 retval = atomic_reduce_block;
8897 } else if (tree_available) {
8898 if ((reduce_size > (9 * sizeof(kmp_real64))) &&
8899 (reduce_size < (2000 * sizeof(kmp_real64)))) {
8900 retval = TREE_REDUCE_BLOCK_WITH_PLAIN_BARRIER;
8901 }
8902 } // otherwise: use critical section
8903
8904#else
8905#error "Unknown or unsupported OS"
8906#endif
8907
8908#else
8909#error "Unknown or unsupported architecture"
8910#endif
8911 }
8912
8913 // KMP_FORCE_REDUCTION
8914
8915 // If the team is serialized (team_size == 1), ignore the forced reduction
8916 // method and stay with the unsynchronized method (empty_reduce_block)
8917 if (__kmp_force_reduction_method != reduction_method_not_defined &&
8918 team_size != 1) {
8919
8920 PACKED_REDUCTION_METHOD_T forced_retval = critical_reduce_block;
8921
8922 int atomic_available, tree_available;
8923
8924 switch ((forced_retval = __kmp_force_reduction_method)) {
8925 case critical_reduce_block:
8926 KMP_ASSERT(lck); // lck should be != 0
8927 break;
8928
8929 case atomic_reduce_block:
8930 atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
8931 if (!atomic_available) {
8932 KMP_WARNING(RedMethodNotSupported, "atomic");
8933 forced_retval = critical_reduce_block;
8934 }
8935 break;
8936
8937 case tree_reduce_block:
8938 tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8939 if (!tree_available) {
8940 KMP_WARNING(RedMethodNotSupported, "tree");
8941 forced_retval = critical_reduce_block;
8942 } else {
8943#if KMP_FAST_REDUCTION_BARRIER
8944 forced_retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
8945#endif
8946 }
8947 break;
8948
8949 default:
8950 KMP_ASSERT(0); // "unsupported method specified"
8951 }
8952
8953 retval = forced_retval;
8954 }
8955
8956 KA_TRACE(10, ("reduction method selected=%08x\n", retval));
8957
8958#undef FAST_REDUCTION_TREE_METHOD_GENERATED
8959#undef FAST_REDUCTION_ATOMIC_METHOD_GENERATED
8960
8961 return (retval);
8962}
8963// this function is for testing set/get/determine reduce method
8964kmp_int32 __kmp_get_reduce_method(void) {
8965 return ((__kmp_entry_thread()->th.th_local.packed_reduction_method) >> 8);
8966}
8967
8968// Soft pause sets up threads to ignore blocktime and just go to sleep.
8969// Spin-wait code checks __kmp_pause_status and reacts accordingly.
8970void __kmp_soft_pause() { __kmp_pause_status = kmp_soft_paused; }
8971
8972// Hard pause shuts down the runtime completely. Resume happens naturally when
8973// OpenMP is used subsequently.
8974void __kmp_hard_pause() {
8975 __kmp_pause_status = kmp_hard_paused;
8976 __kmp_internal_end_thread(-1);
8977}
8978
8979// Soft resume sets __kmp_pause_status, and wakes up all threads.
8980void __kmp_resume_if_soft_paused() {
8981 if (__kmp_pause_status == kmp_soft_paused) {
8982 __kmp_pause_status = kmp_not_paused;
8983
8984 for (int gtid = 1; gtid < __kmp_threads_capacity; ++gtid) {
8985 kmp_info_t *thread = __kmp_threads[gtid];
8986 if (thread) { // Wake it if sleeping
8987 kmp_flag_64<> fl(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go,
8988 thread);
8989 if (fl.is_sleeping())
8990 fl.resume(gtid);
8991 else if (__kmp_try_suspend_mx(thread)) { // got suspend lock
8992 __kmp_unlock_suspend_mx(thread); // unlock it; it won't sleep
8993 } else { // thread holds the lock and may sleep soon
8994 do { // until either the thread sleeps, or we can get the lock
8995 if (fl.is_sleeping()) {
8996 fl.resume(gtid);
8997 break;
8998 } else if (__kmp_try_suspend_mx(thread)) {
8999 __kmp_unlock_suspend_mx(thread);
9000 break;
9001 }
9002 } while (1);
9003 }
9004 }
9005 }
9006 }
9007}
9008
9009// This function is called via __kmpc_pause_resource. Returns 0 if successful.
9010// TODO: add warning messages
9011int __kmp_pause_resource(kmp_pause_status_t level) {
9012 if (level == kmp_not_paused) { // requesting resume
9013 if (__kmp_pause_status == kmp_not_paused) {
9014 // error message about runtime not being paused, so can't resume
9015 return 1;
9016 } else {
9017 KMP_DEBUG_ASSERT(__kmp_pause_status == kmp_soft_paused ||
9018 __kmp_pause_status == kmp_hard_paused);
9019 __kmp_pause_status = kmp_not_paused;
9020 return 0;
9021 }
9022 } else if (level == kmp_soft_paused) { // requesting soft pause
9023 if (__kmp_pause_status != kmp_not_paused) {
9024 // error message about already being paused
9025 return 1;
9026 } else {
9027 __kmp_soft_pause();
9028 return 0;
9029 }
9030 } else if (level == kmp_hard_paused) { // requesting hard pause
9031 if (__kmp_pause_status != kmp_not_paused) {
9032 // error message about already being paused
9033 return 1;
9034 } else {
9035 __kmp_hard_pause();
9036 return 0;
9037 }
9038 } else {
9039 // error message about invalid level
9040 return 1;
9041 }
9042}
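/* Usage sketch (standard OpenMP 5.0 pause API from <omp.h>; per the comment
   above, these calls arrive here through __kmpc_pause_resource):

     #include <omp.h>

     void quiesce_runtime(void) {
       // Soft pause: workers go to sleep, runtime state is retained.
       if (omp_pause_resource_all(omp_pause_soft) != 0) {
         // non-zero return: the request could not be honored
       }
       // Hard pause: the runtime shuts down; it re-initializes on next use.
       omp_pause_resource(omp_pause_hard, omp_get_initial_device());
     }
*/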
9043
9044void __kmp_omp_display_env(int verbose) {
9045 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
9046 if (__kmp_init_serial == 0)
9047 __kmp_do_serial_initialize();
9048 __kmp_display_env_impl(!verbose, verbose);
9049 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
9050}
9051
9052// The team size is changing, so distributed barrier must be modified
9053void __kmp_resize_dist_barrier(kmp_team_t *team, int old_nthreads,
9054 int new_nthreads) {
9055 KMP_DEBUG_ASSERT(__kmp_barrier_release_pattern[bs_forkjoin_barrier] ==
9056 bp_dist_bar);
9057 kmp_info_t **other_threads = team->t.t_threads;
9058
9059 // We want all the workers to stop waiting on the barrier while we adjust the
9060 // size of the team.
9061 for (int f = 1; f < old_nthreads; ++f) {
9062 KMP_DEBUG_ASSERT(other_threads[f] != NULL);
9063 // Ignore threads that are already inactive or not present in the team
9064 if (team->t.t_threads[f]->th.th_used_in_team.load() == 0) {
9065 // teams construct causes thread_limit to get passed in, and some of
9066 // those could be inactive; just ignore them
9067 continue;
9068 }
9069 // If thread is transitioning still to in_use state, wait for it
9070 if (team->t.t_threads[f]->th.th_used_in_team.load() == 3) {
9071 while (team->t.t_threads[f]->th.th_used_in_team.load() == 3)
9072 KMP_CPU_PAUSE();
9073 }
9074 // The thread should be in_use now
9075 KMP_DEBUG_ASSERT(team->t.t_threads[f]->th.th_used_in_team.load() == 1);
9076 // Transition to unused state
9077 team->t.t_threads[f]->th.th_used_in_team.store(2);
9078 KMP_DEBUG_ASSERT(team->t.t_threads[f]->th.th_used_in_team.load() == 2);
9079 }
9080 // Release all the workers
9081 team->t.b->go_release();
9082
9083 KMP_MFENCE();
9084
9085 // Workers should see transition status 2 and move to 0; but may need to be
9086 // woken up first
9087 int count = old_nthreads - 1;
9088 while (count > 0) {
9089 count = old_nthreads - 1;
9090 for (int f = 1; f < old_nthreads; ++f) {
9091 if (other_threads[f]->th.th_used_in_team.load() != 0) {
9092 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { // Wake up the workers
9093 kmp_atomic_flag_64<> *flag = (kmp_atomic_flag_64<> *)CCAST(
9094 void *, other_threads[f]->th.th_sleep_loc);
9095 __kmp_atomic_resume_64(other_threads[f]->th.th_info.ds.ds_gtid, flag);
9096 }
9097 } else {
9098 KMP_DEBUG_ASSERT(team->t.t_threads[f]->th.th_used_in_team.load() == 0);
9099 count--;
9100 }
9101 }
9102 }
9103 // Now update the barrier size
9104 team->t.b->update_num_threads(new_nthreads);
9105 team->t.b->go_reset();
9106}
9107
9108void __kmp_add_threads_to_team(kmp_team_t *team, int new_nthreads) {
9109 // Add the threads back to the team
9110 KMP_DEBUG_ASSERT(team);
9111 // Threads were paused and pointed at th_used_in_team temporarily during a
9112 // resize of the team. We're going to set th_used_in_team to 3 to indicate to
9113 // the thread that it should transition itself back into the team. Then, if
9114 // blocktime isn't infinite, the thread could be sleeping, so we send a resume
9115 // to wake it up.
9116 for (int f = 1; f < new_nthreads; ++f) {
9117 KMP_DEBUG_ASSERT(team->t.t_threads[f]);
9118 KMP_COMPARE_AND_STORE_ACQ32(&(team->t.t_threads[f]->th.th_used_in_team), 0,
9119 3);
9120 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { // Wake up sleeping threads
9121 __kmp_resume_32(team->t.t_threads[f]->th.th_info.ds.ds_gtid,
9122 (kmp_flag_32<false, false> *)NULL);
9123 }
9124 }
9125 // The threads should be transitioning to the team; when they are done, they
9126 // should have set th_used_in_team to 1. This loop forces the primary thread to
9127 // wait until all threads have moved into the team and are waiting in the barrier.
9128 int count = new_nthreads - 1;
9129 while (count > 0) {
9130 count = new_nthreads - 1;
9131 for (int f = 1; f < new_nthreads; ++f) {
9132 if (team->t.t_threads[f]->th.th_used_in_team.load() == 1) {
9133 count--;
9134 }
9135 }
9136 }
9137}
9138
9139// Globals and functions for hidden helper task
9140kmp_info_t **__kmp_hidden_helper_threads;
9141kmp_info_t *__kmp_hidden_helper_main_thread;
9142std::atomic<kmp_int32> __kmp_unexecuted_hidden_helper_tasks;
9143#if KMP_OS_LINUX
9144kmp_int32 __kmp_hidden_helper_threads_num = 8;
9145kmp_int32 __kmp_enable_hidden_helper = TRUE;
9146#else
9147kmp_int32 __kmp_hidden_helper_threads_num = 0;
9148kmp_int32 __kmp_enable_hidden_helper = FALSE;
9149#endif
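// Configuration sketch (the environment variable names below are assumptions
// based on the settings parser, not verified here): the defaults above can
// typically be overridden at startup, e.g.
//   LIBOMP_USE_HIDDEN_HELPER_TASK=0      disable hidden helper threads
//   LIBOMP_NUM_HIDDEN_HELPER_THREADS=4   request 4 hidden helper threads
// before __kmp_hidden_helper_threads_initz_routine() creates the helper team.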
9150
9151namespace {
9152std::atomic<kmp_int32> __kmp_hit_hidden_helper_threads_num;
9153
9154void __kmp_hidden_helper_wrapper_fn(int *gtid, int *, ...) {
9155 // This is an explicit synchronization on all hidden helper threads, in case
9156 // a regular thread pushes a hidden helper task to a hidden helper thread
9157 // that has not yet been awakened since being released by the main thread
9158 // after creating the team.
9159 KMP_ATOMIC_INC(&__kmp_hit_hidden_helper_threads_num);
9160 while (KMP_ATOMIC_LD_ACQ(&__kmp_hit_hidden_helper_threads_num) !=
9161 __kmp_hidden_helper_threads_num)
9162 ;
9163
9164 // If main thread, then wait for signal
9165 if (__kmpc_master(nullptr, *gtid)) {
9166 // First, unset the initial state and release the initial thread
9167 TCW_4(__kmp_init_hidden_helper_threads, FALSE);
9168 __kmp_hidden_helper_initz_release();
9169 __kmp_hidden_helper_main_thread_wait();
9170 // Now wake up all worker threads
9171 for (int i = 1; i < __kmp_hit_hidden_helper_threads_num; ++i) {
9172 __kmp_hidden_helper_worker_thread_signal();
9173 }
9174 }
9175}
9176} // namespace
9177
9178void __kmp_hidden_helper_threads_initz_routine() {
9179 // Create a new root for hidden helper team/threads
9180 const int gtid = __kmp_register_root(TRUE);
9181 __kmp_hidden_helper_main_thread = __kmp_threads[gtid];
9182 __kmp_hidden_helper_threads = &__kmp_threads[gtid];
9183 __kmp_hidden_helper_main_thread->th.th_set_nproc =
9184 __kmp_hidden_helper_threads_num;
9185
9186 KMP_ATOMIC_ST_REL(&__kmp_hit_hidden_helper_threads_num, 0);
9187
9188 __kmpc_fork_call(nullptr, 0, __kmp_hidden_helper_wrapper_fn);
9189
9190 // Set the initialization flag to FALSE
9191 TCW_SYNC_4(__kmp_init_hidden_helper, FALSE);
9192
9193 __kmp_hidden_helper_threads_deinitz_release();
9194}
9195
9196/* Nesting Mode:
9197 Set via KMP_NESTING_MODE, which takes an integer.
9198 Note: we skip duplicate topology levels, and skip levels with only
9199 one entity.
9200 KMP_NESTING_MODE=0 is the default, and doesn't use nesting mode.
9201 KMP_NESTING_MODE=1 sets as many nesting levels as there are distinct levels
9202 in the topology, and initializes the number of threads at each of those
9203 levels to the number of entities at each level, respectively, below the
9204 entity at the parent level.
9205 KMP_NESTING_MODE=N, where N>1, attempts to create up to N nesting levels,
9206 but starts with nesting OFF -- max-active-levels-var is 1 -- and requires
9207 the user to turn nesting on explicitly. This is an even more experimental
9208 option to this experimental feature, and may change or go away in the
9209 future.
9210*/
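// Worked example (hypothetical topology): on a machine with 2 sockets,
// 16 cores per socket and 2 hardware threads per core, KMP_NESTING_MODE=1
// would typically produce three nesting levels with thread counts
//   level 0: 2 (sockets), level 1: 16 (cores/socket), level 2: 2 (HW threads)
// so __kmp_set_nesting_mode_threads() below seeds __kmp_nested_nth with
// {2, 16, 2} and, for mode 1, turns nesting on to that depth.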
9211
9212// Allocate space to store nesting levels
9213void __kmp_init_nesting_mode() {
9214 int levels = KMP_HW_LAST;
9215 __kmp_nesting_mode_nlevels = levels;
9216 __kmp_nesting_nth_level = (int *)KMP_INTERNAL_MALLOC(levels * sizeof(int));
9217 for (int i = 0; i < levels; ++i)
9218 __kmp_nesting_nth_level[i] = 0;
9219 if (__kmp_nested_nth.size < levels) {
9220 __kmp_nested_nth.nth =
9221 (int *)KMP_INTERNAL_REALLOC(__kmp_nested_nth.nth, levels * sizeof(int));
9222 __kmp_nested_nth.size = levels;
9223 }
9224}
9225
9226// Set # threads for top levels of nesting; must be called after topology set
9227void __kmp_set_nesting_mode_threads() {
9228 kmp_info_t *thread = __kmp_threads[__kmp_entry_gtid()];
9229
9230 if (__kmp_nesting_mode == 1)
9231 __kmp_nesting_mode_nlevels = KMP_MAX_ACTIVE_LEVELS_LIMIT;
9232 else if (__kmp_nesting_mode > 1)
9233 __kmp_nesting_mode_nlevels = __kmp_nesting_mode;
9234
9235 if (__kmp_topology) { // use topology info
9236 int loc, hw_level;
9237 for (loc = 0, hw_level = 0; hw_level < __kmp_topology->get_depth() &&
9238 loc < __kmp_nesting_mode_nlevels;
9239 loc++, hw_level++) {
9240 __kmp_nesting_nth_level[loc] = __kmp_topology->get_ratio(hw_level);
9241 if (__kmp_nesting_nth_level[loc] == 1)
9242 loc--;
9243 }
9244 // Make sure all cores are used
9245 if (__kmp_nesting_mode > 1 && loc > 1) {
9246 int core_level = __kmp_topology->get_level(KMP_HW_CORE);
9247 int num_cores = __kmp_topology->get_count(core_level);
9248 int upper_levels = 1;
9249 for (int level = 0; level < loc - 1; ++level)
9250 upper_levels *= __kmp_nesting_nth_level[level];
9251 if (upper_levels * __kmp_nesting_nth_level[loc - 1] < num_cores)
9252 __kmp_nesting_nth_level[loc - 1] =
9253 num_cores / __kmp_nesting_nth_level[loc - 2];
9254 }
9255 __kmp_nesting_mode_nlevels = loc;
9256 __kmp_nested_nth.used = __kmp_nesting_mode_nlevels;
9257 } else { // no topology info available; provide a reasonable estimate
9258 if (__kmp_avail_proc >= 4) {
9259 __kmp_nesting_nth_level[0] = __kmp_avail_proc / 2;
9260 __kmp_nesting_nth_level[1] = 2;
9261 __kmp_nesting_mode_nlevels = 2;
9262 } else {
9263 __kmp_nesting_nth_level[0] = __kmp_avail_proc;
9264 __kmp_nesting_mode_nlevels = 1;
9265 }
9266 __kmp_nested_nth.used = __kmp_nesting_mode_nlevels;
9267 }
9268 for (int i = 0; i < __kmp_nesting_mode_nlevels; ++i) {
9269 __kmp_nested_nth.nth[i] = __kmp_nesting_nth_level[i];
9270 }
9271 set__nproc(thread, __kmp_nesting_nth_level[0]);
9272 if (__kmp_nesting_mode > 1 && __kmp_nesting_mode_nlevels > __kmp_nesting_mode)
9273 __kmp_nesting_mode_nlevels = __kmp_nesting_mode;
9274 if (get__max_active_levels(thread) > 1) {
9275 // if max levels was set, set nesting mode levels to same
9276 __kmp_nesting_mode_nlevels = get__max_active_levels(thread);
9277 }
9278 if (__kmp_nesting_mode == 1) // turn on nesting for this case only
9279 set__max_active_levels(thread, __kmp_nesting_mode_nlevels);
9280}
9281
9282// Empty symbols to export (see exports_so.txt) when feature is disabled
9283extern "C" {
9284#if !KMP_STATS_ENABLED
9285void __kmp_reset_stats() {}
9286#endif
9287#if !USE_DEBUGGER
9288int __kmp_omp_debug_struct_info = FALSE;
9289int __kmp_debugging = FALSE;
9290#endif
9291#if !USE_ITT_BUILD || !USE_ITT_NOTIFY
9292void __kmp_itt_fini_ittlib() {}
9293void __kmp_itt_init_ittlib() {}
9294#endif
9295}
9296
9297// end of file