LLVM OpenMP* Runtime Library
kmp_runtime.cpp
1 /*
2  * kmp_runtime.cpp -- KPTS runtime support library
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "kmp.h"
14 #include "kmp_affinity.h"
15 #include "kmp_atomic.h"
16 #include "kmp_environment.h"
17 #include "kmp_error.h"
18 #include "kmp_i18n.h"
19 #include "kmp_io.h"
20 #include "kmp_itt.h"
21 #include "kmp_settings.h"
22 #include "kmp_stats.h"
23 #include "kmp_str.h"
24 #include "kmp_wait_release.h"
25 #include "kmp_wrapper_getpid.h"
26 #include "kmp_dispatch.h"
27 #if KMP_USE_HIER_SCHED
28 #include "kmp_dispatch_hier.h"
29 #endif
30 
31 #if OMPT_SUPPORT
32 #include "ompt-specific.h"
33 #endif
34 #if OMPD_SUPPORT
35 #include "ompd-specific.h"
36 #endif
37 
38 #if OMP_PROFILING_SUPPORT
39 #include "llvm/Support/TimeProfiler.h"
40 static char *ProfileTraceFile = nullptr;
41 #endif
42 
43 /* these are temporary issues to be dealt with */
44 #define KMP_USE_PRCTL 0
45 
46 #if KMP_OS_WINDOWS
47 #include <process.h>
48 #endif
49 
50 #if KMP_OS_WINDOWS
51 // Windows does not need these include files because it does not use shared memory
52 #else
53 #include <sys/mman.h>
54 #include <sys/stat.h>
55 #include <fcntl.h>
56 #define SHM_SIZE 1024
57 #endif
58 
59 #if defined(KMP_GOMP_COMPAT)
60 char const __kmp_version_alt_comp[] =
61  KMP_VERSION_PREFIX "alternative compiler support: yes";
62 #endif /* defined(KMP_GOMP_COMPAT) */
63 
64 char const __kmp_version_omp_api[] =
65  KMP_VERSION_PREFIX "API version: 5.0 (201611)";
66 
67 #ifdef KMP_DEBUG
68 char const __kmp_version_lock[] =
69  KMP_VERSION_PREFIX "lock type: run time selectable";
70 #endif /* KMP_DEBUG */
71 
72 #define KMP_MIN(x, y) ((x) < (y) ? (x) : (y))
73 
74 /* ------------------------------------------------------------------------ */
75 
76 #if KMP_USE_MONITOR
77 kmp_info_t __kmp_monitor;
78 #endif
79 
80 /* Forward declarations */
81 
82 void __kmp_cleanup(void);
83 
84 static void __kmp_initialize_info(kmp_info_t *, kmp_team_t *, int tid,
85  int gtid);
86 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
87  kmp_internal_control_t *new_icvs,
88  ident_t *loc);
89 #if KMP_AFFINITY_SUPPORTED
90 static void __kmp_partition_places(kmp_team_t *team,
91  int update_master_only = 0);
92 #endif
93 static void __kmp_do_serial_initialize(void);
94 void __kmp_fork_barrier(int gtid, int tid);
95 void __kmp_join_barrier(int gtid);
96 void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc,
97  kmp_internal_control_t *new_icvs, ident_t *loc);
98 
99 #ifdef USE_LOAD_BALANCE
100 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc);
101 #endif
102 
103 static int __kmp_expand_threads(int nNeed);
104 #if KMP_OS_WINDOWS
105 static int __kmp_unregister_root_other_thread(int gtid);
106 #endif
107 static void __kmp_reap_thread(kmp_info_t *thread, int is_root);
108 kmp_info_t *__kmp_thread_pool_insert_pt = NULL;
109 
110 void __kmp_resize_dist_barrier(kmp_team_t *team, int old_nthreads,
111  int new_nthreads);
112 void __kmp_add_threads_to_team(kmp_team_t *team, int new_nthreads);
113 
114 /* Calculate the identifier of the current thread */
115 /* fast (and somewhat portable) way to get unique identifier of executing
116  thread. Returns KMP_GTID_DNE if we haven't been assigned a gtid. */
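/* Summary of the lookup below: three strategies are tried in order --
   (1) __kmp_gtid_mode >= 3: read the gtid from native thread-local storage
       (__kmp_gtid, only when KMP_TDATA_GTID is defined);
   (2) __kmp_gtid_mode >= 2: read it from keyed TLS via __kmp_gtid_get_specific();
   (3) otherwise: scan __kmp_threads[] for the thread whose registered stack
       range [stackbase - stacksize, stackbase] contains the address of a local
       variable, falling back to keyed TLS (and refining the uber thread's
       stack bounds) if the scan fails. */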
117 int __kmp_get_global_thread_id() {
118  int i;
119  kmp_info_t **other_threads;
120  size_t stack_data;
121  char *stack_addr;
122  size_t stack_size;
123  char *stack_base;
124 
125  KA_TRACE(
126  1000,
127  ("*** __kmp_get_global_thread_id: entering, nproc=%d all_nproc=%d\n",
128  __kmp_nth, __kmp_all_nth));
129 
130  /* JPH - to handle the case where __kmpc_end(0) is called immediately prior to
131  a parallel region, made it return KMP_GTID_DNE to force serial_initialize
132  by caller. Had to handle KMP_GTID_DNE at all call-sites, or else guarantee
133  __kmp_init_gtid for this to work. */
134 
135  if (!TCR_4(__kmp_init_gtid))
136  return KMP_GTID_DNE;
137 
138 #ifdef KMP_TDATA_GTID
139  if (TCR_4(__kmp_gtid_mode) >= 3) {
140  KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using TDATA\n"));
141  return __kmp_gtid;
142  }
143 #endif
144  if (TCR_4(__kmp_gtid_mode) >= 2) {
145  KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using keyed TLS\n"));
146  return __kmp_gtid_get_specific();
147  }
148  KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using internal alg.\n"));
149 
150  stack_addr = (char *)&stack_data;
151  other_threads = __kmp_threads;
152 
153  /* ATT: The code below is a source of potential bugs due to unsynchronized
154  access to __kmp_threads array. For example:
155  1. Current thread loads other_threads[i] to thr and checks it, it is
156  non-NULL.
157  2. Current thread is suspended by OS.
158  3. Another thread unregisters and finishes (debug versions of free()
159  may fill memory with something like 0xEF).
160  4. Current thread is resumed.
161  5. Current thread reads junk from *thr.
162  TODO: Fix it. --ln */
163 
164  for (i = 0; i < __kmp_threads_capacity; i++) {
165 
166  kmp_info_t *thr = (kmp_info_t *)TCR_SYNC_PTR(other_threads[i]);
167  if (!thr)
168  continue;
169 
170  stack_size = (size_t)TCR_PTR(thr->th.th_info.ds.ds_stacksize);
171  stack_base = (char *)TCR_PTR(thr->th.th_info.ds.ds_stackbase);
172 
173  /* stack grows down -- search through all of the active threads */
174 
175  if (stack_addr <= stack_base) {
176  size_t stack_diff = stack_base - stack_addr;
177 
178  if (stack_diff <= stack_size) {
179  /* The only way we can be closer than the allocated */
180  /* stack size is if we are running on this thread. */
181  KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == i);
182  return i;
183  }
184  }
185  }
186 
187  /* get specific to try and determine our gtid */
188  KA_TRACE(1000,
189  ("*** __kmp_get_global_thread_id: internal alg. failed to find "
190  "thread, using TLS\n"));
191  i = __kmp_gtid_get_specific();
192 
193  /*fprintf( stderr, "=== %d\n", i ); */ /* GROO */
194 
195  /* if we haven't been assigned a gtid, then return that code */
196  if (i < 0)
197  return i;
198 
199  /* dynamically updated stack window for uber threads to avoid get_specific
200  call */
201  if (!TCR_4(other_threads[i]->th.th_info.ds.ds_stackgrow)) {
202  KMP_FATAL(StackOverflow, i);
203  }
204 
205  stack_base = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
206  if (stack_addr > stack_base) {
207  TCW_PTR(other_threads[i]->th.th_info.ds.ds_stackbase, stack_addr);
208  TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
209  other_threads[i]->th.th_info.ds.ds_stacksize + stack_addr -
210  stack_base);
211  } else {
212  TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
213  stack_base - stack_addr);
214  }
215 
216  /* Reprint stack bounds for ubermaster since they have been refined */
217  if (__kmp_storage_map) {
218  char *stack_end = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
219  char *stack_beg = stack_end - other_threads[i]->th.th_info.ds.ds_stacksize;
220  __kmp_print_storage_map_gtid(i, stack_beg, stack_end,
221  other_threads[i]->th.th_info.ds.ds_stacksize,
222  "th_%d stack (refinement)", i);
223  }
224  return i;
225 }
226 
227 int __kmp_get_global_thread_id_reg() {
228  int gtid;
229 
230  if (!__kmp_init_serial) {
231  gtid = KMP_GTID_DNE;
232  } else
233 #ifdef KMP_TDATA_GTID
234  if (TCR_4(__kmp_gtid_mode) >= 3) {
235  KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using TDATA\n"));
236  gtid = __kmp_gtid;
237  } else
238 #endif
239  if (TCR_4(__kmp_gtid_mode) >= 2) {
240  KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using keyed TLS\n"));
241  gtid = __kmp_gtid_get_specific();
242  } else {
243  KA_TRACE(1000,
244  ("*** __kmp_get_global_thread_id_reg: using internal alg.\n"));
245  gtid = __kmp_get_global_thread_id();
246  }
247 
248  /* we must be a new uber master sibling thread */
249  if (gtid == KMP_GTID_DNE) {
250  KA_TRACE(10,
251  ("__kmp_get_global_thread_id_reg: Encountered new root thread. "
252  "Registering a new gtid.\n"));
253  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
254  if (!__kmp_init_serial) {
255  __kmp_do_serial_initialize();
256  gtid = __kmp_gtid_get_specific();
257  } else {
258  gtid = __kmp_register_root(FALSE);
259  }
260  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
261  /*__kmp_printf( "+++ %d\n", gtid ); */ /* GROO */
262  }
263 
264  KMP_DEBUG_ASSERT(gtid >= 0);
265 
266  return gtid;
267 }
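/* Illustrative usage -- a sketch, not code from this file: entry points that
   can be reached from a thread the library has not registered yet use the
   _reg variant, which registers a new root on demand and therefore never
   returns KMP_GTID_DNE:

     int gtid = __kmp_get_global_thread_id_reg();
     kmp_info_t *this_thr = __kmp_threads[gtid]; // per-thread descriptor

   The non-_reg variant above may return KMP_GTID_DNE and leaves registration
   to the caller. */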
268 
269 /* caller must hold forkjoin_lock */
270 void __kmp_check_stack_overlap(kmp_info_t *th) {
271  int f;
272  char *stack_beg = NULL;
273  char *stack_end = NULL;
274  int gtid;
275 
276  KA_TRACE(10, ("__kmp_check_stack_overlap: called\n"));
277  if (__kmp_storage_map) {
278  stack_end = (char *)th->th.th_info.ds.ds_stackbase;
279  stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
280 
281  gtid = __kmp_gtid_from_thread(th);
282 
283  if (gtid == KMP_GTID_MONITOR) {
284  __kmp_print_storage_map_gtid(
285  gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
286  "th_%s stack (%s)", "mon",
287  (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
288  } else {
289  __kmp_print_storage_map_gtid(
290  gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
291  "th_%d stack (%s)", gtid,
292  (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
293  }
294  }
295 
296  /* No point in checking ubermaster threads since they use refinement and
297  * cannot overlap */
298  gtid = __kmp_gtid_from_thread(th);
299  if (__kmp_env_checks == TRUE && !KMP_UBER_GTID(gtid)) {
300  KA_TRACE(10,
301  ("__kmp_check_stack_overlap: performing extensive checking\n"));
302  if (stack_beg == NULL) {
303  stack_end = (char *)th->th.th_info.ds.ds_stackbase;
304  stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
305  }
306 
307  for (f = 0; f < __kmp_threads_capacity; f++) {
308  kmp_info_t *f_th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[f]);
309 
310  if (f_th && f_th != th) {
311  char *other_stack_end =
312  (char *)TCR_PTR(f_th->th.th_info.ds.ds_stackbase);
313  char *other_stack_beg =
314  other_stack_end - (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize);
315  if ((stack_beg > other_stack_beg && stack_beg < other_stack_end) ||
316  (stack_end > other_stack_beg && stack_end < other_stack_end)) {
317 
318  /* Print the other stack values before the abort */
319  if (__kmp_storage_map)
320  __kmp_print_storage_map_gtid(
321  -1, other_stack_beg, other_stack_end,
322  (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize),
323  "th_%d stack (overlapped)", __kmp_gtid_from_thread(f_th));
324 
325  __kmp_fatal(KMP_MSG(StackOverlap), KMP_HNT(ChangeStackLimit),
326  __kmp_msg_null);
327  }
328  }
329  }
330  }
331  KA_TRACE(10, ("__kmp_check_stack_overlap: returning\n"));
332 }
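/* The check above treats each stack as the range
   [stackbase - stacksize, stackbase) and reports an overlap when either end
   of this thread's range falls strictly inside another registered thread's
   range; on overlap the offending map entry is printed (when __kmp_storage_map
   is set) and the runtime aborts with StackOverlap / ChangeStackLimit.
   Uber (root) threads are skipped because they use stack-bound refinement and
   cannot overlap. */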
333 
334 /* ------------------------------------------------------------------------ */
335 
336 void __kmp_infinite_loop(void) {
337  static int done = FALSE;
338 
339  while (!done) {
340  KMP_YIELD(TRUE);
341  }
342 }
343 
344 #define MAX_MESSAGE 512
345 
346 void __kmp_print_storage_map_gtid(int gtid, void *p1, void *p2, size_t size,
347  char const *format, ...) {
348  char buffer[MAX_MESSAGE];
349  va_list ap;
350 
351  va_start(ap, format);
352  KMP_SNPRINTF(buffer, sizeof(buffer), "OMP storage map: %p %p%8lu %s\n", p1,
353  p2, (unsigned long)size, format);
354  __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
355  __kmp_vprintf(kmp_err, buffer, ap);
356 #if KMP_PRINT_DATA_PLACEMENT
357  int node;
358  if (gtid >= 0) {
359  if (p1 <= p2 && (char *)p2 - (char *)p1 == size) {
360  if (__kmp_storage_map_verbose) {
361  node = __kmp_get_host_node(p1);
362  if (node < 0) /* doesn't work, so don't try this next time */
363  __kmp_storage_map_verbose = FALSE;
364  else {
365  char *last;
366  int lastNode;
367  int localProc = __kmp_get_cpu_from_gtid(gtid);
368 
369  const int page_size = KMP_GET_PAGE_SIZE();
370 
371  p1 = (void *)((size_t)p1 & ~((size_t)page_size - 1));
372  p2 = (void *)(((size_t)p2 - 1) & ~((size_t)page_size - 1));
373  if (localProc >= 0)
374  __kmp_printf_no_lock(" GTID %d localNode %d\n", gtid,
375  localProc >> 1);
376  else
377  __kmp_printf_no_lock(" GTID %d\n", gtid);
378 #if KMP_USE_PRCTL
379  /* The more elaborate format is disabled for now because of the prctl
380  * hanging bug. */
381  do {
382  last = p1;
383  lastNode = node;
384  /* This loop collates adjacent pages with the same host node. */
385  do {
386  (char *)p1 += page_size;
387  } while (p1 <= p2 && (node = __kmp_get_host_node(p1)) == lastNode);
388  __kmp_printf_no_lock(" %p-%p memNode %d\n", last, (char *)p1 - 1,
389  lastNode);
390  } while (p1 <= p2);
391 #else
392  __kmp_printf_no_lock(" %p-%p memNode %d\n", p1,
393  (char *)p1 + (page_size - 1),
394  __kmp_get_host_node(p1));
395  if (p1 < p2) {
396  __kmp_printf_no_lock(" %p-%p memNode %d\n", p2,
397  (char *)p2 + (page_size - 1),
398  __kmp_get_host_node(p2));
399  }
400 #endif
401  }
402  }
403  } else
404  __kmp_printf_no_lock(" %s\n", KMP_I18N_STR(StorageMapWarning));
405  }
406 #endif /* KMP_PRINT_DATA_PLACEMENT */
407  __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
408 }
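/* For reference, derived from the format string above rather than captured
   output: each entry goes to the runtime's error stream (kmp_err) as

     OMP storage map: <p1> <p2>   <size> <caller-supplied description>

   with the description formatted from the trailing varargs, e.g.
   "th_0 stack (initial)". In KMP_PRINT_DATA_PLACEMENT builds with
   __kmp_storage_map_verbose set, per-page memory-node information is appended
   when __kmp_get_host_node() works on this system. */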
409 
410 void __kmp_warn(char const *format, ...) {
411  char buffer[MAX_MESSAGE];
412  va_list ap;
413 
414  if (__kmp_generate_warnings == kmp_warnings_off) {
415  return;
416  }
417 
418  va_start(ap, format);
419 
420  KMP_SNPRINTF(buffer, sizeof(buffer), "OMP warning: %s\n", format);
421  __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
422  __kmp_vprintf(kmp_err, buffer, ap);
423  __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
424 
425  va_end(ap);
426 }
427 
428 void __kmp_abort_process() {
429  // Later threads may stall here, but that's ok because abort() will kill them.
430  __kmp_acquire_bootstrap_lock(&__kmp_exit_lock);
431 
432  if (__kmp_debug_buf) {
433  __kmp_dump_debug_buffer();
434  }
435 
436  if (KMP_OS_WINDOWS) {
437  // Let other threads know of abnormal termination and prevent deadlock
438  // if abort happened during library initialization or shutdown
439  __kmp_global.g.g_abort = SIGABRT;
440 
441  /* On Windows* OS by default abort() causes pop-up error box, which stalls
442  nightly testing. Unfortunately, we cannot reliably suppress pop-up error
443  boxes. _set_abort_behavior() works well, but this function is not
444  available in VS7 (this is not a problem for DLL, but it is a problem for
445  static OpenMP RTL). SetErrorMode (and so, timelimit utility) does not
446  help, at least in some versions of MS C RTL.
447 
448  It seems the following sequence is the only way to simulate abort() and
449  avoid pop-up error box. */
450  raise(SIGABRT);
451  _exit(3); // Just in case, if signal ignored, exit anyway.
452  } else {
453  __kmp_unregister_library();
454  abort();
455  }
456 
457  __kmp_infinite_loop();
458  __kmp_release_bootstrap_lock(&__kmp_exit_lock);
459 
460 } // __kmp_abort_process
461 
462 void __kmp_abort_thread(void) {
463  // TODO: Eliminate g_abort global variable and this function.
464  // In case of abort just call abort(), it will kill all the threads.
465  __kmp_infinite_loop();
466 } // __kmp_abort_thread
467 
468 /* Print out the storage map for the major kmp_info_t thread data structures
469  that are allocated together. */
470 
471 static void __kmp_print_thread_storage_map(kmp_info_t *thr, int gtid) {
472  __kmp_print_storage_map_gtid(gtid, thr, thr + 1, sizeof(kmp_info_t), "th_%d",
473  gtid);
474 
475  __kmp_print_storage_map_gtid(gtid, &thr->th.th_info, &thr->th.th_team,
476  sizeof(kmp_desc_t), "th_%d.th_info", gtid);
477 
478  __kmp_print_storage_map_gtid(gtid, &thr->th.th_local, &thr->th.th_pri_head,
479  sizeof(kmp_local_t), "th_%d.th_local", gtid);
480 
481  __kmp_print_storage_map_gtid(
482  gtid, &thr->th.th_bar[0], &thr->th.th_bar[bs_last_barrier],
483  sizeof(kmp_balign_t) * bs_last_barrier, "th_%d.th_bar", gtid);
484 
485  __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_plain_barrier],
486  &thr->th.th_bar[bs_plain_barrier + 1],
487  sizeof(kmp_balign_t), "th_%d.th_bar[plain]",
488  gtid);
489 
490  __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_forkjoin_barrier],
491  &thr->th.th_bar[bs_forkjoin_barrier + 1],
492  sizeof(kmp_balign_t), "th_%d.th_bar[forkjoin]",
493  gtid);
494 
495 #if KMP_FAST_REDUCTION_BARRIER
496  __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_reduction_barrier],
497  &thr->th.th_bar[bs_reduction_barrier + 1],
498  sizeof(kmp_balign_t), "th_%d.th_bar[reduction]",
499  gtid);
500 #endif // KMP_FAST_REDUCTION_BARRIER
501 }
502 
503 /* Print out the storage map for the major kmp_team_t team data structures
504  that are allocated together. */
505 
506 static void __kmp_print_team_storage_map(const char *header, kmp_team_t *team,
507  int team_id, int num_thr) {
508  int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
509  __kmp_print_storage_map_gtid(-1, team, team + 1, sizeof(kmp_team_t), "%s_%d",
510  header, team_id);
511 
512  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[0],
513  &team->t.t_bar[bs_last_barrier],
514  sizeof(kmp_balign_team_t) * bs_last_barrier,
515  "%s_%d.t_bar", header, team_id);
516 
517  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_plain_barrier],
518  &team->t.t_bar[bs_plain_barrier + 1],
519  sizeof(kmp_balign_team_t), "%s_%d.t_bar[plain]",
520  header, team_id);
521 
522  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_forkjoin_barrier],
523  &team->t.t_bar[bs_forkjoin_barrier + 1],
524  sizeof(kmp_balign_team_t),
525  "%s_%d.t_bar[forkjoin]", header, team_id);
526 
527 #if KMP_FAST_REDUCTION_BARRIER
528  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_reduction_barrier],
529  &team->t.t_bar[bs_reduction_barrier + 1],
530  sizeof(kmp_balign_team_t),
531  "%s_%d.t_bar[reduction]", header, team_id);
532 #endif // KMP_FAST_REDUCTION_BARRIER
533 
534  __kmp_print_storage_map_gtid(
535  -1, &team->t.t_dispatch[0], &team->t.t_dispatch[num_thr],
536  sizeof(kmp_disp_t) * num_thr, "%s_%d.t_dispatch", header, team_id);
537 
538  __kmp_print_storage_map_gtid(
539  -1, &team->t.t_threads[0], &team->t.t_threads[num_thr],
540  sizeof(kmp_info_t *) * num_thr, "%s_%d.t_threads", header, team_id);
541 
542  __kmp_print_storage_map_gtid(-1, &team->t.t_disp_buffer[0],
543  &team->t.t_disp_buffer[num_disp_buff],
544  sizeof(dispatch_shared_info_t) * num_disp_buff,
545  "%s_%d.t_disp_buffer", header, team_id);
546 }
547 
548 static void __kmp_init_allocator() {
549  __kmp_init_memkind();
550  __kmp_init_target_mem();
551 }
552 static void __kmp_fini_allocator() { __kmp_fini_memkind(); }
553 
554 /* ------------------------------------------------------------------------ */
555 
556 #if KMP_DYNAMIC_LIB
557 #if KMP_OS_WINDOWS
558 
559 BOOL WINAPI DllMain(HINSTANCE hInstDLL, DWORD fdwReason, LPVOID lpReserved) {
560  //__kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
561 
562  switch (fdwReason) {
563 
564  case DLL_PROCESS_ATTACH:
565  KA_TRACE(10, ("DllMain: PROCESS_ATTACH\n"));
566 
567  return TRUE;
568 
569  case DLL_PROCESS_DETACH:
570  KA_TRACE(10, ("DllMain: PROCESS_DETACH T#%d\n", __kmp_gtid_get_specific()));
571 
572  // According to Windows* documentation for DllMain entry point:
573  // for DLL_PROCESS_DETACH, lpReserved is used for telling the difference:
574  // lpReserved == NULL when FreeLibrary() is called,
575  // lpReserved != NULL when the process is terminated.
576  // When FreeLibrary() is called, worker threads remain alive. So the
577  // runtime's state is consistent and executing proper shutdown is OK.
578  // When the process is terminated, worker threads have exited or been
579  // forcefully terminated by the OS and only the shutdown thread remains.
580  // This can leave the runtime in an inconsistent state.
581  // Hence, only attempt proper cleanup when FreeLibrary() is called.
582  // Otherwise, rely on OS to reclaim resources.
583  if (lpReserved == NULL)
584  __kmp_internal_end_library(__kmp_gtid_get_specific());
585 
586  return TRUE;
587 
588  case DLL_THREAD_ATTACH:
589  KA_TRACE(10, ("DllMain: THREAD_ATTACH\n"));
590 
591  /* if we want to register new siblings all the time here call
592  * __kmp_get_gtid(); */
593  return TRUE;
594 
595  case DLL_THREAD_DETACH:
596  KA_TRACE(10, ("DllMain: THREAD_DETACH T#%d\n", __kmp_gtid_get_specific()));
597 
598  __kmp_internal_end_thread(__kmp_gtid_get_specific());
599  return TRUE;
600  }
601 
602  return TRUE;
603 }
604 
605 #endif /* KMP_OS_WINDOWS */
606 #endif /* KMP_DYNAMIC_LIB */
607 
608 /* __kmp_parallel_deo -- Wait until it's our turn. */
609 void __kmp_parallel_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
610  int gtid = *gtid_ref;
611 #ifdef BUILD_PARALLEL_ORDERED
612  kmp_team_t *team = __kmp_team_from_gtid(gtid);
613 #endif /* BUILD_PARALLEL_ORDERED */
614 
615  if (__kmp_env_consistency_check) {
616  if (__kmp_threads[gtid]->th.th_root->r.r_active)
617 #if KMP_USE_DYNAMIC_LOCK
618  __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL, 0);
619 #else
620  __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL);
621 #endif
622  }
623 #ifdef BUILD_PARALLEL_ORDERED
624  if (!team->t.t_serialized) {
625  KMP_MB();
626  KMP_WAIT(&team->t.t_ordered.dt.t_value, __kmp_tid_from_gtid(gtid), KMP_EQ,
627  NULL);
628  KMP_MB();
629  }
630 #endif /* BUILD_PARALLEL_ORDERED */
631 }
632 
633 /* __kmp_parallel_dxo -- Signal the next task. */
634 void __kmp_parallel_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
635  int gtid = *gtid_ref;
636 #ifdef BUILD_PARALLEL_ORDERED
637  int tid = __kmp_tid_from_gtid(gtid);
638  kmp_team_t *team = __kmp_team_from_gtid(gtid);
639 #endif /* BUILD_PARALLEL_ORDERED */
640 
641  if (__kmp_env_consistency_check) {
642  if (__kmp_threads[gtid]->th.th_root->r.r_active)
643  __kmp_pop_sync(gtid, ct_ordered_in_parallel, loc_ref);
644  }
645 #ifdef BUILD_PARALLEL_ORDERED
646  if (!team->t.t_serialized) {
647  KMP_MB(); /* Flush all pending memory write invalidates. */
648 
649  /* use the tid of the next thread in this team */
650  /* TODO replace with general release procedure */
651  team->t.t_ordered.dt.t_value = ((tid + 1) % team->t.t_nproc);
652 
653  KMP_MB(); /* Flush all pending memory write invalidates. */
654  }
655 #endif /* BUILD_PARALLEL_ORDERED */
656 }
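/* Taken together, __kmp_parallel_deo / __kmp_parallel_dxo implement ordered
   sections as a ticket hand-off on team->t.t_ordered.dt.t_value: deo spins
   (KMP_WAIT) until that value equals the calling thread's tid, the ordered
   body runs, and dxo releases the next thread by storing (tid + 1) % t_nproc.
   Serialized teams skip the hand-off entirely. */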
657 
658 /* ------------------------------------------------------------------------ */
659 /* The BARRIER for a SINGLE process section is always explicit */
660 
661 int __kmp_enter_single(int gtid, ident_t *id_ref, int push_ws) {
662  int status;
663  kmp_info_t *th;
664  kmp_team_t *team;
665 
666  if (!TCR_4(__kmp_init_parallel))
667  __kmp_parallel_initialize();
668  __kmp_resume_if_soft_paused();
669 
670  th = __kmp_threads[gtid];
671  team = th->th.th_team;
672  status = 0;
673 
674  th->th.th_ident = id_ref;
675 
676  if (team->t.t_serialized) {
677  status = 1;
678  } else {
679  kmp_int32 old_this = th->th.th_local.this_construct;
680 
681  ++th->th.th_local.this_construct;
682  /* try to set team count to thread count--success means thread got the
683  single block */
684  /* TODO: Should this be acquire or release? */
685  if (team->t.t_construct == old_this) {
686  status = __kmp_atomic_compare_store_acq(&team->t.t_construct, old_this,
687  th->th.th_local.this_construct);
688  }
689 #if USE_ITT_BUILD
690  if (__itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
691  KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
692  team->t.t_active_level == 1) {
693  // Only report metadata by primary thread of active team at level 1
694  __kmp_itt_metadata_single(id_ref);
695  }
696 #endif /* USE_ITT_BUILD */
697  }
698 
699  if (__kmp_env_consistency_check) {
700  if (status && push_ws) {
701  __kmp_push_workshare(gtid, ct_psingle, id_ref);
702  } else {
703  __kmp_check_workshare(gtid, ct_psingle, id_ref);
704  }
705  }
706 #if USE_ITT_BUILD
707  if (status) {
708  __kmp_itt_single_start(gtid);
709  }
710 #endif /* USE_ITT_BUILD */
711  return status;
712 }
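/* The winner of the single construct is decided above by an atomic
   compare-and-store on team->t.t_construct: every thread advances its private
   this_construct counter, and the one thread that successfully moves the
   shared counter forward from the old value executes the single block
   (status == 1); all other threads skip it. */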
713 
714 void __kmp_exit_single(int gtid) {
715 #if USE_ITT_BUILD
716  __kmp_itt_single_end(gtid);
717 #endif /* USE_ITT_BUILD */
718  if (__kmp_env_consistency_check)
719  __kmp_pop_workshare(gtid, ct_psingle, NULL);
720 }
721 
722 /* Determine whether we can go parallel or must use a serialized parallel region,
723  * and how many threads we can use.
724  * set_nthreads is the number of threads requested for the team.
725  * Returns 1 if we should serialize or only use one thread,
726  * otherwise the number of threads to use.
727  * The forkjoin lock is held by the caller. */
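/* The limits below are applied in order, each possibly shrinking new_nthreads:
   1. the dyn-var adjustment selected by __kmp_global.g.g_dynamic_mode
      (load balance, thread limit, or random);
   2. the device-wide cap __kmp_max_nth (KMP_ALL_THREADS / KMP_DEVICE_THREAD_LIMIT);
   3. the contention-group cap th_cg_roots->cg_thread_limit (OMP_THREAD_LIMIT);
   4. the remaining capacity of the __kmp_threads[] array, which is expanded
      if possible.
   A one-time CantFormThrTeam warning is issued when dyn-var is false and the
   request had to be reduced. */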
728 static int __kmp_reserve_threads(kmp_root_t *root, kmp_team_t *parent_team,
729  int master_tid, int set_nthreads,
730  int enter_teams) {
731  int capacity;
732  int new_nthreads;
733  KMP_DEBUG_ASSERT(__kmp_init_serial);
734  KMP_DEBUG_ASSERT(root && parent_team);
735  kmp_info_t *this_thr = parent_team->t.t_threads[master_tid];
736 
737  // If dyn-var is set, dynamically adjust the number of desired threads,
738  // according to the method specified by dynamic_mode.
739  new_nthreads = set_nthreads;
740  if (!get__dynamic_2(parent_team, master_tid)) {
741  ;
742  }
743 #ifdef USE_LOAD_BALANCE
744  else if (__kmp_global.g.g_dynamic_mode == dynamic_load_balance) {
745  new_nthreads = __kmp_load_balance_nproc(root, set_nthreads);
746  if (new_nthreads == 1) {
747  KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
748  "reservation to 1 thread\n",
749  master_tid));
750  return 1;
751  }
752  if (new_nthreads < set_nthreads) {
753  KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
754  "reservation to %d threads\n",
755  master_tid, new_nthreads));
756  }
757  }
758 #endif /* USE_LOAD_BALANCE */
759  else if (__kmp_global.g.g_dynamic_mode == dynamic_thread_limit) {
760  new_nthreads = __kmp_avail_proc - __kmp_nth +
761  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
762  if (new_nthreads <= 1) {
763  KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
764  "reservation to 1 thread\n",
765  master_tid));
766  return 1;
767  }
768  if (new_nthreads < set_nthreads) {
769  KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
770  "reservation to %d threads\n",
771  master_tid, new_nthreads));
772  } else {
773  new_nthreads = set_nthreads;
774  }
775  } else if (__kmp_global.g.g_dynamic_mode == dynamic_random) {
776  if (set_nthreads > 2) {
777  new_nthreads = __kmp_get_random(parent_team->t.t_threads[master_tid]);
778  new_nthreads = (new_nthreads % set_nthreads) + 1;
779  if (new_nthreads == 1) {
780  KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
781  "reservation to 1 thread\n",
782  master_tid));
783  return 1;
784  }
785  if (new_nthreads < set_nthreads) {
786  KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
787  "reservation to %d threads\n",
788  master_tid, new_nthreads));
789  }
790  }
791  } else {
792  KMP_ASSERT(0);
793  }
794 
795  // Respect KMP_ALL_THREADS/KMP_DEVICE_THREAD_LIMIT.
796  if (__kmp_nth + new_nthreads -
797  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
798  __kmp_max_nth) {
799  int tl_nthreads = __kmp_max_nth - __kmp_nth +
800  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
801  if (tl_nthreads <= 0) {
802  tl_nthreads = 1;
803  }
804 
805  // If dyn-var is false, emit a 1-time warning.
806  if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
807  __kmp_reserve_warn = 1;
808  __kmp_msg(kmp_ms_warning,
809  KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
810  KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
811  }
812  if (tl_nthreads == 1) {
813  KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT "
814  "reduced reservation to 1 thread\n",
815  master_tid));
816  return 1;
817  }
818  KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT reduced "
819  "reservation to %d threads\n",
820  master_tid, tl_nthreads));
821  new_nthreads = tl_nthreads;
822  }
823 
824  // Respect OMP_THREAD_LIMIT
825  int cg_nthreads = this_thr->th.th_cg_roots->cg_nthreads;
826  int max_cg_threads = this_thr->th.th_cg_roots->cg_thread_limit;
827  if (cg_nthreads + new_nthreads -
828  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
829  max_cg_threads) {
830  int tl_nthreads = max_cg_threads - cg_nthreads +
831  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
832  if (tl_nthreads <= 0) {
833  tl_nthreads = 1;
834  }
835 
836  // If dyn-var is false, emit a 1-time warning.
837  if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
838  __kmp_reserve_warn = 1;
839  __kmp_msg(kmp_ms_warning,
840  KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
841  KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
842  }
843  if (tl_nthreads == 1) {
844  KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT "
845  "reduced reservation to 1 thread\n",
846  master_tid));
847  return 1;
848  }
849  KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT reduced "
850  "reservation to %d threads\n",
851  master_tid, tl_nthreads));
852  new_nthreads = tl_nthreads;
853  }
854 
855  // Check if the threads array is large enough, or needs expanding.
856  // See comment in __kmp_register_root() about the adjustment if
857  // __kmp_threads[0] == NULL.
858  capacity = __kmp_threads_capacity;
859  if (TCR_PTR(__kmp_threads[0]) == NULL) {
860  --capacity;
861  }
862  // If it is not for initializing the hidden helper team, we need to take
863  // __kmp_hidden_helper_threads_num out of the capacity because it is included
864  // in __kmp_threads_capacity.
865  if (__kmp_enable_hidden_helper && !TCR_4(__kmp_init_hidden_helper_threads)) {
866  capacity -= __kmp_hidden_helper_threads_num;
867  }
868  if (__kmp_nth + new_nthreads -
869  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
870  capacity) {
871  // Expand the threads array.
872  int slotsRequired = __kmp_nth + new_nthreads -
873  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) -
874  capacity;
875  int slotsAdded = __kmp_expand_threads(slotsRequired);
876  if (slotsAdded < slotsRequired) {
877  // The threads array was not expanded enough.
878  new_nthreads -= (slotsRequired - slotsAdded);
879  KMP_ASSERT(new_nthreads >= 1);
880 
881  // If dyn-var is false, emit a 1-time warning.
882  if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
883  __kmp_reserve_warn = 1;
884  if (__kmp_tp_cached) {
885  __kmp_msg(kmp_ms_warning,
886  KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
887  KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
888  KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
889  } else {
890  __kmp_msg(kmp_ms_warning,
891  KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
892  KMP_HNT(SystemLimitOnThreads), __kmp_msg_null);
893  }
894  }
895  }
896  }
897 
898 #ifdef KMP_DEBUG
899  if (new_nthreads == 1) {
900  KC_TRACE(10,
901  ("__kmp_reserve_threads: T#%d serializing team after reclaiming "
902  "dead roots and rechecking; requested %d threads\n",
903  __kmp_get_gtid(), set_nthreads));
904  } else {
905  KC_TRACE(10, ("__kmp_reserve_threads: T#%d allocating %d threads; requested"
906  " %d threads\n",
907  __kmp_get_gtid(), new_nthreads, set_nthreads));
908  }
909 #endif // KMP_DEBUG
910  return new_nthreads;
911 }
912 
913 /* Allocate threads from the thread pool and assign them to the new team. We are
914  assured that there are enough threads available, because we checked on that
915  earlier within the forkjoin critical section. */
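/* When the target team is the root's hot team (or a nested hot team with
   KMP_NESTED_HOT_TEAMS), its workers were kept alive from a previous parallel
   region and are reused as-is; only in the non-hot case does the loop below
   fork or reuse workers via __kmp_allocate_thread() and re-seed their
   per-thread barrier arrived counters from the team. */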
916 static void __kmp_fork_team_threads(kmp_root_t *root, kmp_team_t *team,
917  kmp_info_t *master_th, int master_gtid,
918  int fork_teams_workers) {
919  int i;
920  int use_hot_team;
921 
922  KA_TRACE(10, ("__kmp_fork_team_threads: new_nprocs = %d\n", team->t.t_nproc));
923  KMP_DEBUG_ASSERT(master_gtid == __kmp_get_gtid());
924  KMP_MB();
925 
926  /* first, let's setup the primary thread */
927  master_th->th.th_info.ds.ds_tid = 0;
928  master_th->th.th_team = team;
929  master_th->th.th_team_nproc = team->t.t_nproc;
930  master_th->th.th_team_master = master_th;
931  master_th->th.th_team_serialized = FALSE;
932  master_th->th.th_dispatch = &team->t.t_dispatch[0];
933 
934 /* make sure we are not the optimized hot team */
935 #if KMP_NESTED_HOT_TEAMS
936  use_hot_team = 0;
937  kmp_hot_team_ptr_t *hot_teams = master_th->th.th_hot_teams;
938  if (hot_teams) { // hot teams array is not allocated if
939  // KMP_HOT_TEAMS_MAX_LEVEL=0
940  int level = team->t.t_active_level - 1; // index in array of hot teams
941  if (master_th->th.th_teams_microtask) { // are we inside the teams?
942  if (master_th->th.th_teams_size.nteams > 1) {
943  ++level; // level was not increased in teams construct for
944  // team_of_masters
945  }
946  if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
947  master_th->th.th_teams_level == team->t.t_level) {
948  ++level; // level was not increased in teams construct for
949  // team_of_workers before the parallel
950  } // team->t.t_level will be increased inside parallel
951  }
952  if (level < __kmp_hot_teams_max_level) {
953  if (hot_teams[level].hot_team) {
954  // hot team has already been allocated for given level
955  KMP_DEBUG_ASSERT(hot_teams[level].hot_team == team);
956  use_hot_team = 1; // the team is ready to use
957  } else {
958  use_hot_team = 0; // AC: threads are not allocated yet
959  hot_teams[level].hot_team = team; // remember new hot team
960  hot_teams[level].hot_team_nth = team->t.t_nproc;
961  }
962  } else {
963  use_hot_team = 0;
964  }
965  }
966 #else
967  use_hot_team = team == root->r.r_hot_team;
968 #endif
969  if (!use_hot_team) {
970 
971  /* install the primary thread */
972  team->t.t_threads[0] = master_th;
973  __kmp_initialize_info(master_th, team, 0, master_gtid);
974 
975  /* now, install the worker threads */
976  for (i = 1; i < team->t.t_nproc; i++) {
977 
978  /* fork or reallocate a new thread and install it in team */
979  kmp_info_t *thr = __kmp_allocate_thread(root, team, i);
980  team->t.t_threads[i] = thr;
981  KMP_DEBUG_ASSERT(thr);
982  KMP_DEBUG_ASSERT(thr->th.th_team == team);
983  /* align team and thread arrived states */
984  KA_TRACE(20, ("__kmp_fork_team_threads: T#%d(%d:%d) init arrived "
985  "T#%d(%d:%d) join =%llu, plain=%llu\n",
986  __kmp_gtid_from_tid(0, team), team->t.t_id, 0,
987  __kmp_gtid_from_tid(i, team), team->t.t_id, i,
988  team->t.t_bar[bs_forkjoin_barrier].b_arrived,
989  team->t.t_bar[bs_plain_barrier].b_arrived));
990  thr->th.th_teams_microtask = master_th->th.th_teams_microtask;
991  thr->th.th_teams_level = master_th->th.th_teams_level;
992  thr->th.th_teams_size = master_th->th.th_teams_size;
993  { // Initialize threads' barrier data.
994  int b;
995  kmp_balign_t *balign = team->t.t_threads[i]->th.th_bar;
996  for (b = 0; b < bs_last_barrier; ++b) {
997  balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
998  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
999 #if USE_DEBUGGER
1000  balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
1001 #endif
1002  }
1003  }
1004  }
1005 
1006 #if KMP_AFFINITY_SUPPORTED
1007  // Do not partition the places list for teams construct workers who
1008  // haven't actually been forked to do real work yet. This partitioning
1009  // will take place in the parallel region nested within the teams construct.
1010  if (!fork_teams_workers) {
1011  __kmp_partition_places(team);
1012  }
1013 #endif
1014  }
1015 
1016  if (__kmp_display_affinity && team->t.t_display_affinity != 1) {
1017  for (i = 0; i < team->t.t_nproc; i++) {
1018  kmp_info_t *thr = team->t.t_threads[i];
1019  if (thr->th.th_prev_num_threads != team->t.t_nproc ||
1020  thr->th.th_prev_level != team->t.t_level) {
1021  team->t.t_display_affinity = 1;
1022  break;
1023  }
1024  }
1025  }
1026 
1027  KMP_MB();
1028 }
1029 
1030 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
1031 // Propagate any changes to the floating point control registers out to the team
1032 // We try to avoid unnecessary writes to the relevant cache line in the team
1033 // structure, so we don't make changes unless they are needed.
1034 inline static void propagateFPControl(kmp_team_t *team) {
1035  if (__kmp_inherit_fp_control) {
1036  kmp_int16 x87_fpu_control_word;
1037  kmp_uint32 mxcsr;
1038 
1039  // Get primary thread's values of FPU control flags (both X87 and vector)
1040  __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1041  __kmp_store_mxcsr(&mxcsr);
1042  mxcsr &= KMP_X86_MXCSR_MASK;
1043 
1044  // There is no point looking at t_fp_control_saved here.
1045  // If it is TRUE, we still have to update the values if they are different
1046  // from those we now have. If it is FALSE we didn't save anything yet, but
1047  // our objective is the same. We have to ensure that the values in the team
1048  // are the same as those we have.
1049  // So, this code achieves what we need whether or not t_fp_control_saved is
1050  // true. By checking whether the value needs updating we avoid unnecessary
1051  // writes that would put the cache-line into a written state, causing all
1052  // threads in the team to have to read it again.
1053  KMP_CHECK_UPDATE(team->t.t_x87_fpu_control_word, x87_fpu_control_word);
1054  KMP_CHECK_UPDATE(team->t.t_mxcsr, mxcsr);
1055  // Although we don't use this value, other code in the runtime wants to know
1056  // whether it should restore them. So we must ensure it is correct.
1057  KMP_CHECK_UPDATE(team->t.t_fp_control_saved, TRUE);
1058  } else {
1059  // Similarly here. Don't write to this cache-line in the team structure
1060  // unless we have to.
1061  KMP_CHECK_UPDATE(team->t.t_fp_control_saved, FALSE);
1062  }
1063 }
1064 
1065 // Do the opposite, setting the hardware registers to the updated values from
1066 // the team.
1067 inline static void updateHWFPControl(kmp_team_t *team) {
1068  if (__kmp_inherit_fp_control && team->t.t_fp_control_saved) {
1069  // Only reset the fp control regs if they have been changed in the team,
1070  // i.e., in the parallel region that we are exiting.
1071  kmp_int16 x87_fpu_control_word;
1072  kmp_uint32 mxcsr;
1073  __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1074  __kmp_store_mxcsr(&mxcsr);
1075  mxcsr &= KMP_X86_MXCSR_MASK;
1076 
1077  if (team->t.t_x87_fpu_control_word != x87_fpu_control_word) {
1078  __kmp_clear_x87_fpu_status_word();
1079  __kmp_load_x87_fpu_control_word(&team->t.t_x87_fpu_control_word);
1080  }
1081 
1082  if (team->t.t_mxcsr != mxcsr) {
1083  __kmp_load_mxcsr(&team->t.t_mxcsr);
1084  }
1085  }
1086 }
1087 #else
1088 #define propagateFPControl(x) ((void)0)
1089 #define updateHWFPControl(x) ((void)0)
1090 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
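/* The "write only if different" pattern above relies on KMP_CHECK_UPDATE from
   kmp.h, which is essentially a conditional assignment -- roughly, and not the
   authoritative definition:

     // if ((dst) != (src)) (dst) = (src);

   so an unchanged FP control word never dirties the team's cache line and
   never forces the worker threads to re-read it. */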
1091 
1092 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team,
1093  int realloc); // forward declaration
1094 
1095 /* Run a parallel region that has been serialized, so runs only in a team of the
1096  single primary thread. */
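/* "Serialized" still goes through the regular team machinery: the thread
   switches to (or allocates) its th_serial_team, bumps t_serialized and
   t_level, pushes a fresh dispatch buffer, and installs an implicit task with
   ICVs copied from the parent, so enquiry functions and nested constructs see
   a proper one-thread team. */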
1097 void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
1098  kmp_info_t *this_thr;
1099  kmp_team_t *serial_team;
1100 
1101  KC_TRACE(10, ("__kmpc_serialized_parallel: called by T#%d\n", global_tid));
1102 
1103  /* Skip all this code for autopar serialized loops since it results in
1104  unacceptable overhead */
1105  if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR))
1106  return;
1107 
1108  if (!TCR_4(__kmp_init_parallel))
1109  __kmp_parallel_initialize();
1110  __kmp_resume_if_soft_paused();
1111 
1112  this_thr = __kmp_threads[global_tid];
1113  serial_team = this_thr->th.th_serial_team;
1114 
1115  /* utilize the serialized team held by this thread */
1116  KMP_DEBUG_ASSERT(serial_team);
1117  KMP_MB();
1118 
1119  if (__kmp_tasking_mode != tskm_immediate_exec) {
1120  KMP_DEBUG_ASSERT(
1121  this_thr->th.th_task_team ==
1122  this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state]);
1123  KMP_DEBUG_ASSERT(serial_team->t.t_task_team[this_thr->th.th_task_state] ==
1124  NULL);
1125  KA_TRACE(20, ("__kmpc_serialized_parallel: T#%d pushing task_team %p / "
1126  "team %p, new task_team = NULL\n",
1127  global_tid, this_thr->th.th_task_team, this_thr->th.th_team));
1128  this_thr->th.th_task_team = NULL;
1129  }
1130 
1131  kmp_proc_bind_t proc_bind = this_thr->th.th_set_proc_bind;
1132  if (this_thr->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1133  proc_bind = proc_bind_false;
1134  } else if (proc_bind == proc_bind_default) {
1135  // No proc_bind clause was specified, so use the current value
1136  // of proc-bind-var for this parallel region.
1137  proc_bind = this_thr->th.th_current_task->td_icvs.proc_bind;
1138  }
1139  // Reset for next parallel region
1140  this_thr->th.th_set_proc_bind = proc_bind_default;
1141 
1142 #if OMPT_SUPPORT
1143  ompt_data_t ompt_parallel_data = ompt_data_none;
1144  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid);
1145  if (ompt_enabled.enabled &&
1146  this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
1147 
1148  ompt_task_info_t *parent_task_info;
1149  parent_task_info = OMPT_CUR_TASK_INFO(this_thr);
1150 
1151  parent_task_info->frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1152  if (ompt_enabled.ompt_callback_parallel_begin) {
1153  int team_size = 1;
1154 
1155  ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
1156  &(parent_task_info->task_data), &(parent_task_info->frame),
1157  &ompt_parallel_data, team_size,
1158  ompt_parallel_invoker_program | ompt_parallel_team, codeptr);
1159  }
1160  }
1161 #endif // OMPT_SUPPORT
1162 
1163  if (this_thr->th.th_team != serial_team) {
1164  // Nested level will be an index in the nested nthreads array
1165  int level = this_thr->th.th_team->t.t_level;
1166 
1167  if (serial_team->t.t_serialized) {
1168  /* this serial team was already used
1169  TODO increase performance by making these locks more specific */
1170  kmp_team_t *new_team;
1171 
1172  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
1173 
1174  new_team =
1175  __kmp_allocate_team(this_thr->th.th_root, 1, 1,
1176 #if OMPT_SUPPORT
1177  ompt_parallel_data,
1178 #endif
1179  proc_bind, &this_thr->th.th_current_task->td_icvs,
1180  0 USE_NESTED_HOT_ARG(NULL));
1181  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
1182  KMP_ASSERT(new_team);
1183 
1184  /* setup new serialized team and install it */
1185  new_team->t.t_threads[0] = this_thr;
1186  new_team->t.t_parent = this_thr->th.th_team;
1187  serial_team = new_team;
1188  this_thr->th.th_serial_team = serial_team;
1189 
1190  KF_TRACE(
1191  10,
1192  ("__kmpc_serialized_parallel: T#%d allocated new serial team %p\n",
1193  global_tid, serial_team));
1194 
1195  /* TODO the above breaks the requirement that if we run out of resources,
1196  then we can still guarantee that serialized teams are ok, since we may
1197  need to allocate a new one */
1198  } else {
1199  KF_TRACE(
1200  10,
1201  ("__kmpc_serialized_parallel: T#%d reusing cached serial team %p\n",
1202  global_tid, serial_team));
1203  }
1204 
1205  /* we have to initialize this serial team */
1206  KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1207  KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1208  KMP_DEBUG_ASSERT(this_thr->th.th_team != serial_team);
1209  serial_team->t.t_ident = loc;
1210  serial_team->t.t_serialized = 1;
1211  serial_team->t.t_nproc = 1;
1212  serial_team->t.t_parent = this_thr->th.th_team;
1213  serial_team->t.t_sched.sched = this_thr->th.th_team->t.t_sched.sched;
1214  this_thr->th.th_team = serial_team;
1215  serial_team->t.t_master_tid = this_thr->th.th_info.ds.ds_tid;
1216 
1217  KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d curtask=%p\n", global_tid,
1218  this_thr->th.th_current_task));
1219  KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 1);
1220  this_thr->th.th_current_task->td_flags.executing = 0;
1221 
1222  __kmp_push_current_task_to_thread(this_thr, serial_team, 0);
1223 
1224  /* TODO: GEH: do ICVs work for nested serialized teams? Don't we need an
1225  implicit task for each serialized task represented by
1226  team->t.t_serialized? */
1227  copy_icvs(&this_thr->th.th_current_task->td_icvs,
1228  &this_thr->th.th_current_task->td_parent->td_icvs);
1229 
1230  // Thread value exists in the nested nthreads array for the next nested
1231  // level
1232  if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
1233  this_thr->th.th_current_task->td_icvs.nproc =
1234  __kmp_nested_nth.nth[level + 1];
1235  }
1236 
1237  if (__kmp_nested_proc_bind.used &&
1238  (level + 1 < __kmp_nested_proc_bind.used)) {
1239  this_thr->th.th_current_task->td_icvs.proc_bind =
1240  __kmp_nested_proc_bind.bind_types[level + 1];
1241  }
1242 
1243 #if USE_DEBUGGER
1244  serial_team->t.t_pkfn = (microtask_t)(~0); // For the debugger.
1245 #endif
1246  this_thr->th.th_info.ds.ds_tid = 0;
1247 
1248  /* set thread cache values */
1249  this_thr->th.th_team_nproc = 1;
1250  this_thr->th.th_team_master = this_thr;
1251  this_thr->th.th_team_serialized = 1;
1252 
1253  serial_team->t.t_level = serial_team->t.t_parent->t.t_level + 1;
1254  serial_team->t.t_active_level = serial_team->t.t_parent->t.t_active_level;
1255  serial_team->t.t_def_allocator = this_thr->th.th_def_allocator; // save
1256 
1257  propagateFPControl(serial_team);
1258 
1259  /* check if we need to allocate dispatch buffers stack */
1260  KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1261  if (!serial_team->t.t_dispatch->th_disp_buffer) {
1262  serial_team->t.t_dispatch->th_disp_buffer =
1263  (dispatch_private_info_t *)__kmp_allocate(
1264  sizeof(dispatch_private_info_t));
1265  }
1266  this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1267 
1268  KMP_MB();
1269 
1270  } else {
1271  /* this serialized team is already being used,
1272  * that's fine, just add another nested level */
1273  KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team);
1274  KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1275  KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1276  ++serial_team->t.t_serialized;
1277  this_thr->th.th_team_serialized = serial_team->t.t_serialized;
1278 
1279  // Nested level will be an index in the nested nthreads array
1280  int level = this_thr->th.th_team->t.t_level;
1281  // Thread value exists in the nested nthreads array for the next nested
1282  // level
1283  if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
1284  this_thr->th.th_current_task->td_icvs.nproc =
1285  __kmp_nested_nth.nth[level + 1];
1286  }
1287  serial_team->t.t_level++;
1288  KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d increasing nesting level "
1289  "of serial team %p to %d\n",
1290  global_tid, serial_team, serial_team->t.t_level));
1291 
1292  /* allocate/push dispatch buffers stack */
1293  KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1294  {
1295  dispatch_private_info_t *disp_buffer =
1296  (dispatch_private_info_t *)__kmp_allocate(
1297  sizeof(dispatch_private_info_t));
1298  disp_buffer->next = serial_team->t.t_dispatch->th_disp_buffer;
1299  serial_team->t.t_dispatch->th_disp_buffer = disp_buffer;
1300  }
1301  this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1302 
1303  KMP_MB();
1304  }
1305  KMP_CHECK_UPDATE(serial_team->t.t_cancel_request, cancel_noreq);
1306 
1307  // Perform the display affinity functionality for
1308  // serialized parallel regions
1309  if (__kmp_display_affinity) {
1310  if (this_thr->th.th_prev_level != serial_team->t.t_level ||
1311  this_thr->th.th_prev_num_threads != 1) {
1312  // NULL means use the affinity-format-var ICV
1313  __kmp_aux_display_affinity(global_tid, NULL);
1314  this_thr->th.th_prev_level = serial_team->t.t_level;
1315  this_thr->th.th_prev_num_threads = 1;
1316  }
1317  }
1318 
1319  if (__kmp_env_consistency_check)
1320  __kmp_push_parallel(global_tid, NULL);
1321 #if OMPT_SUPPORT
1322  serial_team->t.ompt_team_info.master_return_address = codeptr;
1323  if (ompt_enabled.enabled &&
1324  this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
1325  OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr =
1326  OMPT_GET_FRAME_ADDRESS(0);
1327 
1328  ompt_lw_taskteam_t lw_taskteam;
1329  __ompt_lw_taskteam_init(&lw_taskteam, this_thr, global_tid,
1330  &ompt_parallel_data, codeptr);
1331 
1332  __ompt_lw_taskteam_link(&lw_taskteam, this_thr, 1);
1333  // don't use lw_taskteam after linking. content was swapped
1334 
1335  /* OMPT implicit task begin */
1336  if (ompt_enabled.ompt_callback_implicit_task) {
1337  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1338  ompt_scope_begin, OMPT_CUR_TEAM_DATA(this_thr),
1339  OMPT_CUR_TASK_DATA(this_thr), 1, __kmp_tid_from_gtid(global_tid),
1340  ompt_task_implicit); // TODO: Can this be ompt_task_initial?
1341  OMPT_CUR_TASK_INFO(this_thr)->thread_num =
1342  __kmp_tid_from_gtid(global_tid);
1343  }
1344 
1345  /* OMPT state */
1346  this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
1347  OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr =
1348  OMPT_GET_FRAME_ADDRESS(0);
1349  }
1350 #endif
1351 }
1352 
1353 /* most of the work for a fork */
1354 /* return true if we really went parallel, false if serialized */
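/* Overview of the code below: __kmp_fork_call finishes runtime initialization
   if needed, captures the parent team's nesting state, fires the OMPT
   parallel-begin callback when enabled, and handles the special case of a
   parallel region nested directly inside a teams construct (reusing the
   already-forked hot team) before deciding whether to serialize or to reserve
   threads and fork a new team. */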
1355 int __kmp_fork_call(ident_t *loc, int gtid,
1356  enum fork_context_e call_context, // Intel, GNU, ...
1357  kmp_int32 argc, microtask_t microtask, launch_t invoker,
1358  kmp_va_list ap) {
1359  void **argv;
1360  int i;
1361  int master_tid;
1362  int master_this_cons;
1363  kmp_team_t *team;
1364  kmp_team_t *parent_team;
1365  kmp_info_t *master_th;
1366  kmp_root_t *root;
1367  int nthreads;
1368  int master_active;
1369  int master_set_numthreads;
1370  int level;
1371  int active_level;
1372  int teams_level;
1373 #if KMP_NESTED_HOT_TEAMS
1374  kmp_hot_team_ptr_t **p_hot_teams;
1375 #endif
1376  { // KMP_TIME_BLOCK
1377  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_fork_call);
1378  KMP_COUNT_VALUE(OMP_PARALLEL_args, argc);
1379 
1380  KA_TRACE(20, ("__kmp_fork_call: enter T#%d\n", gtid));
1381  if (__kmp_stkpadding > 0 && __kmp_root[gtid] != NULL) {
1382  /* Some systems prefer the stack for the root thread(s) to start with */
1383  /* some gap from the parent stack to prevent false sharing. */
1384  void *dummy = KMP_ALLOCA(__kmp_stkpadding);
1385  /* These 2 lines below are so this does not get optimized out */
1386  if (__kmp_stkpadding > KMP_MAX_STKPADDING)
1387  __kmp_stkpadding += (short)((kmp_int64)dummy);
1388  }
1389 
1390  /* initialize if needed */
1391  KMP_DEBUG_ASSERT(
1392  __kmp_init_serial); // AC: potentially unsafe, not in sync with shutdown
1393  if (!TCR_4(__kmp_init_parallel))
1394  __kmp_parallel_initialize();
1395  __kmp_resume_if_soft_paused();
1396 
1397  /* setup current data */
1398  master_th = __kmp_threads[gtid]; // AC: potentially unsafe, not in sync with
1399  // shutdown
1400  parent_team = master_th->th.th_team;
1401  master_tid = master_th->th.th_info.ds.ds_tid;
1402  master_this_cons = master_th->th.th_local.this_construct;
1403  root = master_th->th.th_root;
1404  master_active = root->r.r_active;
1405  master_set_numthreads = master_th->th.th_set_nproc;
1406 
1407 #if OMPT_SUPPORT
1408  ompt_data_t ompt_parallel_data = ompt_data_none;
1409  ompt_data_t *parent_task_data;
1410  ompt_frame_t *ompt_frame;
1411  ompt_data_t *implicit_task_data;
1412  void *return_address = NULL;
1413 
1414  if (ompt_enabled.enabled) {
1415  __ompt_get_task_info_internal(0, NULL, &parent_task_data, &ompt_frame,
1416  NULL, NULL);
1417  return_address = OMPT_LOAD_RETURN_ADDRESS(gtid);
1418  }
1419 #endif
1420 
1421  // Assign affinity to root thread if it hasn't happened yet
1422  __kmp_assign_root_init_mask();
1423 
1424  // Nested level will be an index in the nested nthreads array
1425  level = parent_team->t.t_level;
1426  // used to launch non-serial teams even if nested is not allowed
1427  active_level = parent_team->t.t_active_level;
1428  // needed to check nesting inside the teams
1429  teams_level = master_th->th.th_teams_level;
1430 #if KMP_NESTED_HOT_TEAMS
1431  p_hot_teams = &master_th->th.th_hot_teams;
1432  if (*p_hot_teams == NULL && __kmp_hot_teams_max_level > 0) {
1433  *p_hot_teams = (kmp_hot_team_ptr_t *)__kmp_allocate(
1434  sizeof(kmp_hot_team_ptr_t) * __kmp_hot_teams_max_level);
1435  (*p_hot_teams)[0].hot_team = root->r.r_hot_team;
1436  // it is either actual or not needed (when active_level > 0)
1437  (*p_hot_teams)[0].hot_team_nth = 1;
1438  }
1439 #endif
1440 
1441 #if OMPT_SUPPORT
1442  if (ompt_enabled.enabled) {
1443  if (ompt_enabled.ompt_callback_parallel_begin) {
1444  int team_size = master_set_numthreads
1445  ? master_set_numthreads
1446  : get__nproc_2(parent_team, master_tid);
1447  int flags = OMPT_INVOKER(call_context) |
1448  ((microtask == (microtask_t)__kmp_teams_master)
1449  ? ompt_parallel_league
1450  : ompt_parallel_team);
1451  ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
1452  parent_task_data, ompt_frame, &ompt_parallel_data, team_size, flags,
1453  return_address);
1454  }
1455  master_th->th.ompt_thread_info.state = ompt_state_overhead;
1456  }
1457 #endif
1458 
1459  master_th->th.th_ident = loc;
1460 
1461  if (master_th->th.th_teams_microtask && ap &&
1462  microtask != (microtask_t)__kmp_teams_master && level == teams_level) {
1463  // AC: This is start of parallel that is nested inside teams construct.
1464  // The team is actual (hot), all workers are ready at the fork barrier.
1465  // No lock needed to initialize the team a bit, then free workers.
1466  parent_team->t.t_ident = loc;
1467  __kmp_alloc_argv_entries(argc, parent_team, TRUE);
1468  parent_team->t.t_argc = argc;
1469  argv = (void **)parent_team->t.t_argv;
1470  for (i = argc - 1; i >= 0; --i)
1471  *argv++ = va_arg(kmp_va_deref(ap), void *);
1472  // Increment our nested depth levels, but do not increase the serialization
1473  if (parent_team == master_th->th.th_serial_team) {
1474  // AC: we are in serialized parallel
1475  __kmpc_serialized_parallel(loc, gtid);
1476  KMP_DEBUG_ASSERT(parent_team->t.t_serialized > 1);
1477 
1478  if (call_context == fork_context_gnu) {
1479  // AC: need to decrement t_serialized for enquiry functions to work
1480  // correctly, will restore at join time
1481  parent_team->t.t_serialized--;
1482  return TRUE;
1483  }
1484 
1485 #if OMPD_SUPPORT
1486  parent_team->t.t_pkfn = microtask;
1487 #endif
1488 
1489 #if OMPT_SUPPORT
1490  void *dummy;
1491  void **exit_frame_p;
1492 
1493  ompt_lw_taskteam_t lw_taskteam;
1494 
1495  if (ompt_enabled.enabled) {
1496  __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1497  &ompt_parallel_data, return_address);
1498  exit_frame_p = &(lw_taskteam.ompt_task_info.frame.exit_frame.ptr);
1499 
1500  __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1501  // don't use lw_taskteam after linking. content was swapped
1502 
1503  /* OMPT implicit task begin */
1504  implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1505  if (ompt_enabled.ompt_callback_implicit_task) {
1506  OMPT_CUR_TASK_INFO(master_th)->thread_num =
1507  __kmp_tid_from_gtid(gtid);
1508  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1509  ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1510  implicit_task_data, 1,
1511  OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1512  }
1513 
1514  /* OMPT state */
1515  master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1516  } else {
1517  exit_frame_p = &dummy;
1518  }
1519 #endif
1520  // AC: need to decrement t_serialized for enquiry functions to work
1521  // correctly, will restore at join time
1522  parent_team->t.t_serialized--;
1523 
1524  {
1525  KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1526  KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1527  __kmp_invoke_microtask(microtask, gtid, 0, argc, parent_team->t.t_argv
1528 #if OMPT_SUPPORT
1529  ,
1530  exit_frame_p
1531 #endif
1532  );
1533  }
1534 
1535 #if OMPT_SUPPORT
1536  if (ompt_enabled.enabled) {
1537  *exit_frame_p = NULL;
1538  OMPT_CUR_TASK_INFO(master_th)->frame.exit_frame = ompt_data_none;
1539  if (ompt_enabled.ompt_callback_implicit_task) {
1540  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1541  ompt_scope_end, NULL, implicit_task_data, 1,
1542  OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1543  }
1544  ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1545  __ompt_lw_taskteam_unlink(master_th);
1546  if (ompt_enabled.ompt_callback_parallel_end) {
1547  ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1548  &ompt_parallel_data, OMPT_CUR_TASK_DATA(master_th),
1549  OMPT_INVOKER(call_context) | ompt_parallel_team,
1550  return_address);
1551  }
1552  master_th->th.ompt_thread_info.state = ompt_state_overhead;
1553  }
1554 #endif
1555  return TRUE;
1556  }
1557 
1558  parent_team->t.t_pkfn = microtask;
1559  parent_team->t.t_invoke = invoker;
1560  KMP_ATOMIC_INC(&root->r.r_in_parallel);
1561  parent_team->t.t_active_level++;
1562  parent_team->t.t_level++;
1563  parent_team->t.t_def_allocator = master_th->th.th_def_allocator; // save
1564 
1565 #if OMPT_SUPPORT
1566  if (ompt_enabled.enabled) {
1567  ompt_lw_taskteam_t lw_taskteam;
1568  __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1569  &ompt_parallel_data, return_address);
1570  __ompt_lw_taskteam_link(&lw_taskteam, master_th, 1, true);
1571  }
1572 #endif
1573 
1574  /* Change number of threads in the team if requested */
1575  if (master_set_numthreads) { // The parallel has num_threads clause
1576  if (master_set_numthreads <= master_th->th.th_teams_size.nth) {
1577  // AC: we can only reduce the number of threads dynamically, not increase it
1578  kmp_info_t **other_threads = parent_team->t.t_threads;
1579  // NOTE: if using distributed barrier, we need to run this code block
1580  // even when the team size appears not to have changed from the max.
1581  int old_proc = master_th->th.th_teams_size.nth;
1582  if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] ==
1583  bp_dist_bar) {
1584  __kmp_resize_dist_barrier(parent_team, old_proc,
1585  master_set_numthreads);
1586  __kmp_add_threads_to_team(parent_team, master_set_numthreads);
1587  }
1588  parent_team->t.t_nproc = master_set_numthreads;
1589  for (i = 0; i < master_set_numthreads; ++i) {
1590  other_threads[i]->th.th_team_nproc = master_set_numthreads;
1591  }
1592  }
1593  // Keep extra threads hot in the team for possible next parallels
1594  master_th->th.th_set_nproc = 0;
1595  }
1596 
1597 #if USE_DEBUGGER
1598  if (__kmp_debugging) { // Let debugger override number of threads.
1599  int nth = __kmp_omp_num_threads(loc);
1600  if (nth > 0) { // 0 means debugger doesn't want to change num threads
1601  master_set_numthreads = nth;
1602  }
1603  }
1604 #endif
1605 
1606  // Figure out the proc_bind policy for the nested parallel within teams
1607  kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind;
1608  // proc_bind_default means don't update
1609  kmp_proc_bind_t proc_bind_icv = proc_bind_default;
1610  if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1611  proc_bind = proc_bind_false;
1612  } else {
1613  // No proc_bind clause specified; use current proc-bind-var
1614  if (proc_bind == proc_bind_default) {
1615  proc_bind = master_th->th.th_current_task->td_icvs.proc_bind;
1616  }
1617  /* else: The proc_bind policy was specified explicitly on the parallel
1618  clause.
1619  This overrides proc-bind-var for this parallel region, but does not
1620  change proc-bind-var. */
1621  // Figure the value of proc-bind-var for the child threads.
1622  if ((level + 1 < __kmp_nested_proc_bind.used) &&
1623  (__kmp_nested_proc_bind.bind_types[level + 1] !=
1624  master_th->th.th_current_task->td_icvs.proc_bind)) {
1625  proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1];
1626  }
1627  }
1628  KMP_CHECK_UPDATE(parent_team->t.t_proc_bind, proc_bind);
1629  // Need to change the bind-var ICV to correct value for each implicit task
1630  if (proc_bind_icv != proc_bind_default &&
1631  master_th->th.th_current_task->td_icvs.proc_bind != proc_bind_icv) {
1632  kmp_info_t **other_threads = parent_team->t.t_threads;
1633  for (i = 0; i < master_th->th.th_team_nproc; ++i) {
1634  other_threads[i]->th.th_current_task->td_icvs.proc_bind =
1635  proc_bind_icv;
1636  }
1637  }
1638  // Reset for next parallel region
1639  master_th->th.th_set_proc_bind = proc_bind_default;
1640 
1641 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1642  if (((__itt_frame_submit_v3_ptr && __itt_get_timestamp_ptr) ||
1643  KMP_ITT_DEBUG) &&
1644  __kmp_forkjoin_frames_mode == 3 &&
1645  parent_team->t.t_active_level == 1 // only report frames at level 1
1646  && master_th->th.th_teams_size.nteams == 1) {
1647  kmp_uint64 tmp_time = __itt_get_timestamp();
1648  master_th->th.th_frame_time = tmp_time;
1649  parent_team->t.t_region_time = tmp_time;
1650  }
1651  if (__itt_stack_caller_create_ptr) {
1652  KMP_DEBUG_ASSERT(parent_team->t.t_stack_id == NULL);
1653  // create new stack stitching id before entering fork barrier
1654  parent_team->t.t_stack_id = __kmp_itt_stack_caller_create();
1655  }
1656 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
1657 #if KMP_AFFINITY_SUPPORTED
1658  __kmp_partition_places(parent_team);
1659 #endif
1660 
1661  KF_TRACE(10, ("__kmp_fork_call: before internal fork: root=%p, team=%p, "
1662  "master_th=%p, gtid=%d\n",
1663  root, parent_team, master_th, gtid));
1664  __kmp_internal_fork(loc, gtid, parent_team);
1665  KF_TRACE(10, ("__kmp_fork_call: after internal fork: root=%p, team=%p, "
1666  "master_th=%p, gtid=%d\n",
1667  root, parent_team, master_th, gtid));
1668 
1669  if (call_context == fork_context_gnu)
1670  return TRUE;
1671 
1672  /* Invoke microtask for PRIMARY thread */
1673  KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
1674  parent_team->t.t_id, parent_team->t.t_pkfn));
1675 
1676  if (!parent_team->t.t_invoke(gtid)) {
1677  KMP_ASSERT2(0, "cannot invoke microtask for PRIMARY thread");
1678  }
1679  KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
1680  parent_team->t.t_id, parent_team->t.t_pkfn));
1681  KMP_MB(); /* Flush all pending memory write invalidates. */
1682 
1683  KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
1684 
1685  return TRUE;
1686  } // Parallel closely nested in teams construct
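  // Descriptive note (a summary of the branch above; no behavior change): when
  // a parallel region is closely nested inside a teams construct, the runtime
  // either (a) invokes the microtask directly on the primary thread if the
  // parent (teams) team is serialized, emitting the OMPT implicit-task and
  // parallel-end events itself, or (b) reuses the existing hot parent team,
  // optionally shrinking it to the num_threads request, and forks it via
  // __kmp_internal_fork() without taking the forkjoin lock.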
1687 
1688 #if KMP_DEBUG
1689  if (__kmp_tasking_mode != tskm_immediate_exec) {
1690  KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
1691  parent_team->t.t_task_team[master_th->th.th_task_state]);
1692  }
1693 #endif
1694 
1695  // Need this to happen before we determine the number of threads, not while
1696  // we are allocating the team
1697  //__kmp_push_current_task_to_thread(master_th, parent_team, 0);
1698  int enter_teams = 0;
1699  if (parent_team->t.t_active_level >=
1700  master_th->th.th_current_task->td_icvs.max_active_levels) {
1701  nthreads = 1;
1702  } else {
1703  enter_teams = ((ap == NULL && active_level == 0) ||
1704  (ap && teams_level > 0 && teams_level == level));
1705  nthreads = master_set_numthreads
1706  ? master_set_numthreads
1707  // TODO: get nproc directly from current task
1708  : get__nproc_2(parent_team, master_tid);
1709  // Check whether we need to take the forkjoin lock (there is no need for a
1710  // serialized parallel outside of a teams construct). This code was moved
1711  // here from __kmp_reserve_threads() to speed up nested serialized parallels.
1712  if (nthreads > 1) {
1713  if ((get__max_active_levels(master_th) == 1 &&
1714  (root->r.r_in_parallel && !enter_teams)) ||
1715  (__kmp_library == library_serial)) {
1716  KC_TRACE(10, ("__kmp_fork_call: T#%d serializing team; requested %d"
1717  " threads\n",
1718  gtid, nthreads));
1719  nthreads = 1;
1720  }
1721  }
1722  if (nthreads > 1) {
1723  /* determine how many new threads we can use */
1724  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
1725  /* AC: If we execute teams from parallel region (on host), then teams
1726  should be created but each can only have 1 thread if nesting is
1727  disabled. If teams called from serial region, then teams and their
1728  threads should be created regardless of the nesting setting. */
1729  nthreads = __kmp_reserve_threads(root, parent_team, master_tid,
1730  nthreads, enter_teams);
1731  if (nthreads == 1) {
1732  // Free lock for single thread execution here; for multi-thread
1733  // execution it will be freed later after team of threads created
1734  // and initialized
1735  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
1736  }
1737  }
1738  }
1739  KMP_DEBUG_ASSERT(nthreads > 0);
1740 
1741  // If we temporarily changed the set number of threads then restore it now
1742  master_th->th.th_set_nproc = 0;
1743 
1744  /* create a serialized parallel region? */
1745  if (nthreads == 1) {
1746 /* josh todo: hypothetical question: what do we do for OS X*? */
1747 #if KMP_OS_LINUX && \
1748  (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
1749  void *args[argc];
1750 #else
1751  void **args = (void **)KMP_ALLOCA(argc * sizeof(void *));
1752 #endif /* KMP_OS_LINUX && ( KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || \
1753  KMP_ARCH_AARCH64) */
1754 
1755  KA_TRACE(20,
1756  ("__kmp_fork_call: T#%d serializing parallel region\n", gtid));
1757 
1758  __kmpc_serialized_parallel(loc, gtid);
1759 
1760 #if OMPD_SUPPORT
1761  master_th->th.th_serial_team->t.t_pkfn = microtask;
1762 #endif
1763 
1764  if (call_context == fork_context_intel) {
1765  /* TODO this sucks, use the compiler itself to pass args! :) */
1766  master_th->th.th_serial_team->t.t_ident = loc;
1767  if (!ap) {
1768  // revert change made in __kmpc_serialized_parallel()
1769  master_th->th.th_serial_team->t.t_level--;
1770  // Get args from parent team for teams construct
1771 
1772 #if OMPT_SUPPORT
1773  void *dummy;
1774  void **exit_frame_p;
1775  ompt_task_info_t *task_info;
1776 
1777  ompt_lw_taskteam_t lw_taskteam;
1778 
1779  if (ompt_enabled.enabled) {
1780  __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1781  &ompt_parallel_data, return_address);
1782 
1783  __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1784  // don't use lw_taskteam after linking. Its content was swapped.
1785 
1786  task_info = OMPT_CUR_TASK_INFO(master_th);
1787  exit_frame_p = &(task_info->frame.exit_frame.ptr);
1788  if (ompt_enabled.ompt_callback_implicit_task) {
1789  OMPT_CUR_TASK_INFO(master_th)->thread_num =
1790  __kmp_tid_from_gtid(gtid);
1791  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1792  ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1793  &(task_info->task_data), 1,
1794  OMPT_CUR_TASK_INFO(master_th)->thread_num,
1795  ompt_task_implicit);
1796  }
1797 
1798  /* OMPT state */
1799  master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1800  } else {
1801  exit_frame_p = &dummy;
1802  }
1803 #endif
1804 
1805  {
1806  KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1807  KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1808  __kmp_invoke_microtask(microtask, gtid, 0, argc,
1809  parent_team->t.t_argv
1810 #if OMPT_SUPPORT
1811  ,
1812  exit_frame_p
1813 #endif
1814  );
1815  }
1816 
1817 #if OMPT_SUPPORT
1818  if (ompt_enabled.enabled) {
1819  *exit_frame_p = NULL;
1820  if (ompt_enabled.ompt_callback_implicit_task) {
1821  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1822  ompt_scope_end, NULL, &(task_info->task_data), 1,
1823  OMPT_CUR_TASK_INFO(master_th)->thread_num,
1824  ompt_task_implicit);
1825  }
1826  ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1827  __ompt_lw_taskteam_unlink(master_th);
1828  if (ompt_enabled.ompt_callback_parallel_end) {
1829  ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1830  &ompt_parallel_data, parent_task_data,
1831  OMPT_INVOKER(call_context) | ompt_parallel_team,
1832  return_address);
1833  }
1834  master_th->th.ompt_thread_info.state = ompt_state_overhead;
1835  }
1836 #endif
1837  } else if (microtask == (microtask_t)__kmp_teams_master) {
1838  KMP_DEBUG_ASSERT(master_th->th.th_team ==
1839  master_th->th.th_serial_team);
1840  team = master_th->th.th_team;
1841  // team->t.t_pkfn = microtask;
1842  team->t.t_invoke = invoker;
1843  __kmp_alloc_argv_entries(argc, team, TRUE);
1844  team->t.t_argc = argc;
1845  argv = (void **)team->t.t_argv;
1846  if (ap) {
1847  for (i = argc - 1; i >= 0; --i)
1848  *argv++ = va_arg(kmp_va_deref(ap), void *);
1849  } else {
1850  for (i = 0; i < argc; ++i)
1851  // Get args from parent team for teams construct
1852  argv[i] = parent_team->t.t_argv[i];
1853  }
1854  // AC: revert change made in __kmpc_serialized_parallel()
1855  // because initial code in teams should have level=0
1856  team->t.t_level--;
1857  // AC: call special invoker for outer "parallel" of teams construct
1858  invoker(gtid);
1859 #if OMPT_SUPPORT
1860  if (ompt_enabled.enabled) {
1861  ompt_task_info_t *task_info = OMPT_CUR_TASK_INFO(master_th);
1862  if (ompt_enabled.ompt_callback_implicit_task) {
1863  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1864  ompt_scope_end, NULL, &(task_info->task_data), 0,
1865  OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_initial);
1866  }
1867  if (ompt_enabled.ompt_callback_parallel_end) {
1868  ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1869  &ompt_parallel_data, parent_task_data,
1870  OMPT_INVOKER(call_context) | ompt_parallel_league,
1871  return_address);
1872  }
1873  master_th->th.ompt_thread_info.state = ompt_state_overhead;
1874  }
1875 #endif
1876  } else {
1877  argv = args;
1878  for (i = argc - 1; i >= 0; --i)
1879  *argv++ = va_arg(kmp_va_deref(ap), void *);
1880  KMP_MB();
1881 
1882 #if OMPT_SUPPORT
1883  void *dummy;
1884  void **exit_frame_p;
1885  ompt_task_info_t *task_info;
1886 
1887  ompt_lw_taskteam_t lw_taskteam;
1888 
1889  if (ompt_enabled.enabled) {
1890  __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1891  &ompt_parallel_data, return_address);
1892  __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1893  // don't use lw_taskteam after linking. Its content was swapped.
1894  task_info = OMPT_CUR_TASK_INFO(master_th);
1895  exit_frame_p = &(task_info->frame.exit_frame.ptr);
1896 
1897  /* OMPT implicit task begin */
1898  implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1899  if (ompt_enabled.ompt_callback_implicit_task) {
1900  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1901  ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1902  implicit_task_data, 1, __kmp_tid_from_gtid(gtid),
1903  ompt_task_implicit);
1904  OMPT_CUR_TASK_INFO(master_th)->thread_num =
1905  __kmp_tid_from_gtid(gtid);
1906  }
1907 
1908  /* OMPT state */
1909  master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1910  } else {
1911  exit_frame_p = &dummy;
1912  }
1913 #endif
1914 
1915  {
1916  KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1917  KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1918  __kmp_invoke_microtask(microtask, gtid, 0, argc, args
1919 #if OMPT_SUPPORT
1920  ,
1921  exit_frame_p
1922 #endif
1923  );
1924  }
1925 
1926 #if OMPT_SUPPORT
1927  if (ompt_enabled.enabled) {
1928  *exit_frame_p = NULL;
1929  if (ompt_enabled.ompt_callback_implicit_task) {
1930  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1931  ompt_scope_end, NULL, &(task_info->task_data), 1,
1932  OMPT_CUR_TASK_INFO(master_th)->thread_num,
1933  ompt_task_implicit);
1934  }
1935 
1936  ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1937  __ompt_lw_taskteam_unlink(master_th);
1938  if (ompt_enabled.ompt_callback_parallel_end) {
1939  ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1940  &ompt_parallel_data, parent_task_data,
1941  OMPT_INVOKER(call_context) | ompt_parallel_team,
1942  return_address);
1943  }
1944  master_th->th.ompt_thread_info.state = ompt_state_overhead;
1945  }
1946 #endif
1947  }
1948  } else if (call_context == fork_context_gnu) {
1949 #if OMPT_SUPPORT
1950  ompt_lw_taskteam_t lwt;
1951  __ompt_lw_taskteam_init(&lwt, master_th, gtid, &ompt_parallel_data,
1952  return_address);
1953 
1954  lwt.ompt_task_info.frame.exit_frame = ompt_data_none;
1955  __ompt_lw_taskteam_link(&lwt, master_th, 1);
1956 // don't use lw_taskteam after linking. Its content was swapped.
1957 #endif
1958 
1959  // we were called from GNU native code
1960  KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid));
1961  return FALSE;
1962  } else {
1963  KMP_ASSERT2(call_context < fork_context_last,
1964  "__kmp_fork_call: unknown fork_context parameter");
1965  }
1966 
1967  KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid));
1968  KMP_MB();
1969  return FALSE;
1970  } // if (nthreads == 1)
1971 
1972  // GEH: only modify the executing flag in the non-serialized case;
1973  // the serialized case is handled in kmpc_serialized_parallel
1974  KF_TRACE(10, ("__kmp_fork_call: parent_team_aclevel=%d, master_th=%p, "
1975  "curtask=%p, curtask_max_aclevel=%d\n",
1976  parent_team->t.t_active_level, master_th,
1977  master_th->th.th_current_task,
1978  master_th->th.th_current_task->td_icvs.max_active_levels));
1979  // TODO: GEH - cannot do this assertion because root thread not set up as
1980  // executing
1981  // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 1 );
1982  master_th->th.th_current_task->td_flags.executing = 0;
1983 
1984  if (!master_th->th.th_teams_microtask || level > teams_level) {
1985  /* Increment our nested depth level */
1986  KMP_ATOMIC_INC(&root->r.r_in_parallel);
1987  }
1988 
1989  // See if we need to make a copy of the ICVs.
1990  int nthreads_icv = master_th->th.th_current_task->td_icvs.nproc;
1991  if ((level + 1 < __kmp_nested_nth.used) &&
1992  (__kmp_nested_nth.nth[level + 1] != nthreads_icv)) {
1993  nthreads_icv = __kmp_nested_nth.nth[level + 1];
1994  } else {
1995  nthreads_icv = 0; // don't update
1996  }
1997 
1998  // Figure out the proc_bind_policy for the new team.
1999  kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind;
2000  // proc_bind_default means don't update
2001  kmp_proc_bind_t proc_bind_icv = proc_bind_default;
2002  if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
2003  proc_bind = proc_bind_false;
2004  } else {
2005  // No proc_bind clause specified; use current proc-bind-var for this
2006  // parallel region
2007  if (proc_bind == proc_bind_default) {
2008  proc_bind = master_th->th.th_current_task->td_icvs.proc_bind;
2009  }
2010  // Have teams construct take proc_bind value from KMP_TEAMS_PROC_BIND
2011  if (master_th->th.th_teams_microtask &&
2012  microtask == (microtask_t)__kmp_teams_master) {
2013  proc_bind = __kmp_teams_proc_bind;
2014  }
2015  /* else: The proc_bind policy was specified explicitly on the parallel clause.
2016  This overrides proc-bind-var for this parallel region, but does not
2017  change proc-bind-var. */
2018  // Figure the value of proc-bind-var for the child threads.
2019  if ((level + 1 < __kmp_nested_proc_bind.used) &&
2020  (__kmp_nested_proc_bind.bind_types[level + 1] !=
2021  master_th->th.th_current_task->td_icvs.proc_bind)) {
2022  // Do not modify the proc-bind ICV for the two teams-construct forks;
2023  // they just let the proc-bind ICV pass through.
2024  if (!master_th->th.th_teams_microtask ||
2025  !(microtask == (microtask_t)__kmp_teams_master || ap == NULL))
2026  proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1];
2027  }
2028  }
2029 
2030  // Reset for next parallel region
2031  master_th->th.th_set_proc_bind = proc_bind_default;
2032 
2033  if ((nthreads_icv > 0) || (proc_bind_icv != proc_bind_default)) {
2034  kmp_internal_control_t new_icvs;
2035  copy_icvs(&new_icvs, &master_th->th.th_current_task->td_icvs);
2036  new_icvs.next = NULL;
2037  if (nthreads_icv > 0) {
2038  new_icvs.nproc = nthreads_icv;
2039  }
2040  if (proc_bind_icv != proc_bind_default) {
2041  new_icvs.proc_bind = proc_bind_icv;
2042  }
2043 
2044  /* allocate a new parallel team */
2045  KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
2046  team = __kmp_allocate_team(root, nthreads, nthreads,
2047 #if OMPT_SUPPORT
2048  ompt_parallel_data,
2049 #endif
2050  proc_bind, &new_icvs,
2051  argc USE_NESTED_HOT_ARG(master_th));
2052  if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar)
2053  copy_icvs((kmp_internal_control_t *)team->t.b->team_icvs, &new_icvs);
2054  } else {
2055  /* allocate a new parallel team */
2056  KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
2057  team = __kmp_allocate_team(root, nthreads, nthreads,
2058 #if OMPT_SUPPORT
2059  ompt_parallel_data,
2060 #endif
2061  proc_bind,
2062  &master_th->th.th_current_task->td_icvs,
2063  argc USE_NESTED_HOT_ARG(master_th));
2064  if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar)
2065  copy_icvs((kmp_internal_control_t *)team->t.b->team_icvs,
2066  &master_th->th.th_current_task->td_icvs);
2067  }
2068  KF_TRACE(
2069  10, ("__kmp_fork_call: after __kmp_allocate_team - team = %p\n", team));
2070 
2071  /* setup the new team */
2072  KMP_CHECK_UPDATE(team->t.t_master_tid, master_tid);
2073  KMP_CHECK_UPDATE(team->t.t_master_this_cons, master_this_cons);
2074  KMP_CHECK_UPDATE(team->t.t_ident, loc);
2075  KMP_CHECK_UPDATE(team->t.t_parent, parent_team);
2076  KMP_CHECK_UPDATE_SYNC(team->t.t_pkfn, microtask);
2077 #if OMPT_SUPPORT
2078  KMP_CHECK_UPDATE_SYNC(team->t.ompt_team_info.master_return_address,
2079  return_address);
2080 #endif
2081  KMP_CHECK_UPDATE(team->t.t_invoke, invoker); // TODO move to root, maybe
2082  // TODO: parent_team->t.t_level == INT_MAX ???
2083  if (!master_th->th.th_teams_microtask || level > teams_level) {
2084  int new_level = parent_team->t.t_level + 1;
2085  KMP_CHECK_UPDATE(team->t.t_level, new_level);
2086  new_level = parent_team->t.t_active_level + 1;
2087  KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2088  } else {
2089  // AC: Do not increase parallel level at start of the teams construct
2090  int new_level = parent_team->t.t_level;
2091  KMP_CHECK_UPDATE(team->t.t_level, new_level);
2092  new_level = parent_team->t.t_active_level;
2093  KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2094  }
2095  kmp_r_sched_t new_sched = get__sched_2(parent_team, master_tid);
2096  // set primary thread's schedule as new run-time schedule
2097  KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
2098 
2099  KMP_CHECK_UPDATE(team->t.t_cancel_request, cancel_noreq);
2100  KMP_CHECK_UPDATE(team->t.t_def_allocator, master_th->th.th_def_allocator);
2101 
2102  // Update the floating point rounding in the team if required.
2103  propagateFPControl(team);
2104 #if OMPD_SUPPORT
2105  if (ompd_state & OMPD_ENABLE_BP)
2106  ompd_bp_parallel_begin();
2107 #endif
2108 
2109  if (__kmp_tasking_mode != tskm_immediate_exec) {
2110  // Set primary thread's task team to the team's task team. Unless this is
2111  // a hot team, it should be NULL.
2112  KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
2113  parent_team->t.t_task_team[master_th->th.th_task_state]);
2114  KA_TRACE(20, ("__kmp_fork_call: Primary T#%d pushing task_team %p / team "
2115  "%p, new task_team %p / team %p\n",
2116  __kmp_gtid_from_thread(master_th),
2117  master_th->th.th_task_team, parent_team,
2118  team->t.t_task_team[master_th->th.th_task_state], team));
2119 
2120  if (active_level || master_th->th.th_task_team) {
2121  // Take a memo of primary thread's task_state
2122  KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2123  if (master_th->th.th_task_state_top >=
2124  master_th->th.th_task_state_stack_sz) { // increase size
2125  kmp_uint32 new_size = 2 * master_th->th.th_task_state_stack_sz;
2126  kmp_uint8 *old_stack, *new_stack;
2127  kmp_uint32 i;
2128  new_stack = (kmp_uint8 *)__kmp_allocate(new_size);
2129  for (i = 0; i < master_th->th.th_task_state_stack_sz; ++i) {
2130  new_stack[i] = master_th->th.th_task_state_memo_stack[i];
2131  }
2132  for (i = master_th->th.th_task_state_stack_sz; i < new_size;
2133  ++i) { // zero-init rest of stack
2134  new_stack[i] = 0;
2135  }
2136  old_stack = master_th->th.th_task_state_memo_stack;
2137  master_th->th.th_task_state_memo_stack = new_stack;
2138  master_th->th.th_task_state_stack_sz = new_size;
2139  __kmp_free(old_stack);
2140  }
2141  // Store primary thread's task_state on stack
2142  master_th->th
2143  .th_task_state_memo_stack[master_th->th.th_task_state_top] =
2144  master_th->th.th_task_state;
2145  master_th->th.th_task_state_top++;
2146 #if KMP_NESTED_HOT_TEAMS
2147  if (master_th->th.th_hot_teams &&
2148  active_level < __kmp_hot_teams_max_level &&
2149  team == master_th->th.th_hot_teams[active_level].hot_team) {
2150  // Restore primary thread's nested state if nested hot team
2151  master_th->th.th_task_state =
2152  master_th->th
2153  .th_task_state_memo_stack[master_th->th.th_task_state_top];
2154  } else {
2155 #endif
2156  master_th->th.th_task_state = 0;
2157 #if KMP_NESTED_HOT_TEAMS
2158  }
2159 #endif
2160  }
2161 #if !KMP_NESTED_HOT_TEAMS
2162  KMP_DEBUG_ASSERT((master_th->th.th_task_team == NULL) ||
2163  (team == root->r.r_hot_team));
2164 #endif
2165  }
2166 
2167  KA_TRACE(
2168  20,
2169  ("__kmp_fork_call: T#%d(%d:%d)->(%d:0) created a team of %d threads\n",
2170  gtid, parent_team->t.t_id, team->t.t_master_tid, team->t.t_id,
2171  team->t.t_nproc));
2172  KMP_DEBUG_ASSERT(team != root->r.r_hot_team ||
2173  (team->t.t_master_tid == 0 &&
2174  (team->t.t_parent == root->r.r_root_team ||
2175  team->t.t_parent->t.t_serialized)));
2176  KMP_MB();
2177 
2178  /* now, setup the arguments */
2179  argv = (void **)team->t.t_argv;
2180  if (ap) {
2181  for (i = argc - 1; i >= 0; --i) {
2182  void *new_argv = va_arg(kmp_va_deref(ap), void *);
2183  KMP_CHECK_UPDATE(*argv, new_argv);
2184  argv++;
2185  }
2186  } else {
2187  for (i = 0; i < argc; ++i) {
2188  // Get args from parent team for teams construct
2189  KMP_CHECK_UPDATE(argv[i], team->t.t_parent->t.t_argv[i]);
2190  }
2191  }
2192 
2193  /* now actually fork the threads */
2194  KMP_CHECK_UPDATE(team->t.t_master_active, master_active);
2195  if (!root->r.r_active) // Only do assignment if it prevents cache ping-pong
2196  root->r.r_active = TRUE;
2197 
2198  __kmp_fork_team_threads(root, team, master_th, gtid, !ap);
2199  __kmp_setup_icv_copy(team, nthreads,
2200  &master_th->th.th_current_task->td_icvs, loc);
2201 
2202 #if OMPT_SUPPORT
2203  master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
2204 #endif
2205 
2206  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2207 
2208 #if USE_ITT_BUILD
2209  if (team->t.t_active_level == 1 // only report frames at level 1
2210  && !master_th->th.th_teams_microtask) { // not in teams construct
2211 #if USE_ITT_NOTIFY
2212  if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2213  (__kmp_forkjoin_frames_mode == 3 ||
2214  __kmp_forkjoin_frames_mode == 1)) {
2215  kmp_uint64 tmp_time = 0;
2216  if (__itt_get_timestamp_ptr)
2217  tmp_time = __itt_get_timestamp();
2218  // Internal fork - report frame begin
2219  master_th->th.th_frame_time = tmp_time;
2220  if (__kmp_forkjoin_frames_mode == 3)
2221  team->t.t_region_time = tmp_time;
2222  } else
2223 // only one notification scheme (either "submit" or "forking/joined", not both)
2224 #endif /* USE_ITT_NOTIFY */
2225  if ((__itt_frame_begin_v3_ptr || KMP_ITT_DEBUG) &&
2226  __kmp_forkjoin_frames && !__kmp_forkjoin_frames_mode) {
2227  // Mark start of "parallel" region for Intel(R) VTune(TM) analyzer.
2228  __kmp_itt_region_forking(gtid, team->t.t_nproc, 0);
2229  }
2230  }
2231 #endif /* USE_ITT_BUILD */
2232 
2233  /* now go on and do the work */
2234  KMP_DEBUG_ASSERT(team == __kmp_threads[gtid]->th.th_team);
2235  KMP_MB();
2236  KF_TRACE(10,
2237  ("__kmp_internal_fork : root=%p, team=%p, master_th=%p, gtid=%d\n",
2238  root, team, master_th, gtid));
2239 
2240 #if USE_ITT_BUILD
2241  if (__itt_stack_caller_create_ptr) {
2242  // create new stack stitching id before entering fork barrier
2243  if (!enter_teams) {
2244  KMP_DEBUG_ASSERT(team->t.t_stack_id == NULL);
2245  team->t.t_stack_id = __kmp_itt_stack_caller_create();
2246  } else if (parent_team->t.t_serialized) {
2247  // keep stack stitching id in the serialized parent_team;
2248  // current team will be used for parallel inside the teams;
2249  // if parent_team is active, then it already keeps stack stitching id
2250  // for the league of teams
2251  KMP_DEBUG_ASSERT(parent_team->t.t_stack_id == NULL);
2252  parent_team->t.t_stack_id = __kmp_itt_stack_caller_create();
2253  }
2254  }
2255 #endif /* USE_ITT_BUILD */
2256 
2257  // AC: skip __kmp_internal_fork at teams construct, let only primary
2258  // threads execute
2259  if (ap) {
2260  __kmp_internal_fork(loc, gtid, team);
2261  KF_TRACE(10, ("__kmp_internal_fork : after : root=%p, team=%p, "
2262  "master_th=%p, gtid=%d\n",
2263  root, team, master_th, gtid));
2264  }
2265 
2266  if (call_context == fork_context_gnu) {
2267  KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2268  return TRUE;
2269  }
2270 
2271  /* Invoke microtask for PRIMARY thread */
2272  KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
2273  team->t.t_id, team->t.t_pkfn));
2274  } // END of timer KMP_fork_call block
2275 
2276 #if KMP_STATS_ENABLED
2277  // If beginning a teams construct, then change thread state
2278  stats_state_e previous_state = KMP_GET_THREAD_STATE();
2279  if (!ap) {
2280  KMP_SET_THREAD_STATE(stats_state_e::TEAMS_REGION);
2281  }
2282 #endif
2283 
2284  if (!team->t.t_invoke(gtid)) {
2285  KMP_ASSERT2(0, "cannot invoke microtask for PRIMARY thread");
2286  }
2287 
2288 #if KMP_STATS_ENABLED
2289  // If was beginning of a teams construct, then reset thread state
2290  if (!ap) {
2291  KMP_SET_THREAD_STATE(previous_state);
2292  }
2293 #endif
2294 
2295  KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
2296  team->t.t_id, team->t.t_pkfn));
2297  KMP_MB(); /* Flush all pending memory write invalidates. */
2298 
2299  KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2300 #if OMPT_SUPPORT
2301  if (ompt_enabled.enabled) {
2302  master_th->th.ompt_thread_info.state = ompt_state_overhead;
2303  }
2304 #endif
2305 
2306  return TRUE;
2307 }
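// A minimal sketch of how __kmp_fork_call() is reached (an assumption about
// the typical compiler lowering, not part of this file): for user code such as
//
//   #pragma omp parallel
//   { do_work(); }
//
// the compiler outlines the body into a microtask and emits a call to the
// __kmpc_fork_call() entry point, which forwards to __kmp_fork_call() with
// call_context == fork_context_intel; GNU-style (libgomp-compatible) entry
// points use fork_context_gnu instead, as handled above.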
2308 
2309 #if OMPT_SUPPORT
2310 static inline void __kmp_join_restore_state(kmp_info_t *thread,
2311  kmp_team_t *team) {
2312  // restore state outside the region
2313  thread->th.ompt_thread_info.state =
2314  ((team->t.t_serialized) ? ompt_state_work_serial
2315  : ompt_state_work_parallel);
2316 }
2317 
2318 static inline void __kmp_join_ompt(int gtid, kmp_info_t *thread,
2319  kmp_team_t *team, ompt_data_t *parallel_data,
2320  int flags, void *codeptr) {
2321  ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2322  if (ompt_enabled.ompt_callback_parallel_end) {
2323  ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
2324  parallel_data, &(task_info->task_data), flags, codeptr);
2325  }
2326 
2327  task_info->frame.enter_frame = ompt_data_none;
2328  __kmp_join_restore_state(thread, team);
2329 }
2330 #endif
2331 
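// Descriptive note for __kmp_join_call() below (no behavior change): this is
// the join counterpart of __kmp_fork_call(). Unless the region was serialized,
// it runs the join barrier via __kmp_internal_join(), emits the OMPT/ITT
// end-of-region events, frees or shrinks the team, and restores the parent
// team, task team, and related state on the primary thread.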
2332 void __kmp_join_call(ident_t *loc, int gtid
2333 #if OMPT_SUPPORT
2334  ,
2335  enum fork_context_e fork_context
2336 #endif
2337  ,
2338  int exit_teams) {
2339  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_join_call);
2340  kmp_team_t *team;
2341  kmp_team_t *parent_team;
2342  kmp_info_t *master_th;
2343  kmp_root_t *root;
2344  int master_active;
2345 
2346  KA_TRACE(20, ("__kmp_join_call: enter T#%d\n", gtid));
2347 
2348  /* setup current data */
2349  master_th = __kmp_threads[gtid];
2350  root = master_th->th.th_root;
2351  team = master_th->th.th_team;
2352  parent_team = team->t.t_parent;
2353 
2354  master_th->th.th_ident = loc;
2355 
2356 #if OMPT_SUPPORT
2357  void *team_microtask = (void *)team->t.t_pkfn;
2358  // For the GOMP interface with a serialized parallel, we need
2359  // __kmpc_end_serialized_parallel to invoke the hooks for the OMPT
2360  // end-implicit-task and end-parallel events.
2361  if (ompt_enabled.enabled &&
2362  !(team->t.t_serialized && fork_context == fork_context_gnu)) {
2363  master_th->th.ompt_thread_info.state = ompt_state_overhead;
2364  }
2365 #endif
2366 
2367 #if KMP_DEBUG
2368  if (__kmp_tasking_mode != tskm_immediate_exec && !exit_teams) {
2369  KA_TRACE(20, ("__kmp_join_call: T#%d, old team = %p old task_team = %p, "
2370  "th_task_team = %p\n",
2371  __kmp_gtid_from_thread(master_th), team,
2372  team->t.t_task_team[master_th->th.th_task_state],
2373  master_th->th.th_task_team));
2374  KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
2375  team->t.t_task_team[master_th->th.th_task_state]);
2376  }
2377 #endif
2378 
2379  if (team->t.t_serialized) {
2380  if (master_th->th.th_teams_microtask) {
2381  // We are in teams construct
2382  int level = team->t.t_level;
2383  int tlevel = master_th->th.th_teams_level;
2384  if (level == tlevel) {
2385  // AC: we haven't incremented it earlier, at the start of the teams
2386  // construct, so do it here, at the end of the teams construct
2387  team->t.t_level++;
2388  } else if (level == tlevel + 1) {
2389  // AC: we are exiting parallel inside teams, need to increment
2390  // serialization in order to restore it in the next call to
2391  // __kmpc_end_serialized_parallel
2392  team->t.t_serialized++;
2393  }
2394  }
2395  __kmpc_end_serialized_parallel(loc, gtid);
2396 
2397 #if OMPT_SUPPORT
2398  if (ompt_enabled.enabled) {
2399  __kmp_join_restore_state(master_th, parent_team);
2400  }
2401 #endif
2402 
2403  return;
2404  }
2405 
2406  master_active = team->t.t_master_active;
2407 
2408  if (!exit_teams) {
2409  // AC: No barrier for internal teams at exit from teams construct.
2410  // But there is barrier for external team (league).
2411  __kmp_internal_join(loc, gtid, team);
2412 #if USE_ITT_BUILD
2413  if (__itt_stack_caller_create_ptr) {
2414  KMP_DEBUG_ASSERT(team->t.t_stack_id != NULL);
2415  // destroy the stack stitching id after join barrier
2416  __kmp_itt_stack_caller_destroy((__itt_caller)team->t.t_stack_id);
2417  team->t.t_stack_id = NULL;
2418  }
2419 #endif
2420  } else {
2421  master_th->th.th_task_state =
2422  0; // AC: no tasking in teams (out of any parallel)
2423 #if USE_ITT_BUILD
2424  if (__itt_stack_caller_create_ptr && parent_team->t.t_serialized) {
2425  KMP_DEBUG_ASSERT(parent_team->t.t_stack_id != NULL);
2426  // destroy the stack stitching id on exit from the teams construct
2427  // if parent_team is active, then the id will be destroyed later on
2428  // by master of the league of teams
2429  __kmp_itt_stack_caller_destroy((__itt_caller)parent_team->t.t_stack_id);
2430  parent_team->t.t_stack_id = NULL;
2431  }
2432 #endif
2433 
2434  if (team->t.t_nproc > 1 &&
2435  __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
2436  team->t.b->update_num_threads(team->t.t_nproc);
2437  __kmp_add_threads_to_team(team, team->t.t_nproc);
2438  }
2439  }
2440 
2441  KMP_MB();
2442 
2443 #if OMPT_SUPPORT
2444  ompt_data_t *parallel_data = &(team->t.ompt_team_info.parallel_data);
2445  void *codeptr = team->t.ompt_team_info.master_return_address;
2446 #endif
2447 
2448 #if USE_ITT_BUILD
2449  // Mark end of "parallel" region for Intel(R) VTune(TM) analyzer.
2450  if (team->t.t_active_level == 1 &&
2451  (!master_th->th.th_teams_microtask || /* not in teams construct */
2452  master_th->th.th_teams_size.nteams == 1)) {
2453  master_th->th.th_ident = loc;
2454  // only one notification scheme (either "submit" or "forking/joined", not
2455  // both)
2456  if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2457  __kmp_forkjoin_frames_mode == 3)
2458  __kmp_itt_frame_submit(gtid, team->t.t_region_time,
2459  master_th->th.th_frame_time, 0, loc,
2460  master_th->th.th_team_nproc, 1);
2461  else if ((__itt_frame_end_v3_ptr || KMP_ITT_DEBUG) &&
2462  !__kmp_forkjoin_frames_mode && __kmp_forkjoin_frames)
2463  __kmp_itt_region_joined(gtid);
2464  } // active_level == 1
2465 #endif /* USE_ITT_BUILD */
2466 
2467 #if KMP_AFFINITY_SUPPORTED
2468  if (!exit_teams) {
2469  // Restore master thread's partition.
2470  master_th->th.th_first_place = team->t.t_first_place;
2471  master_th->th.th_last_place = team->t.t_last_place;
2472  }
2473 #endif // KMP_AFFINITY_SUPPORTED
2474 
2475  if (master_th->th.th_teams_microtask && !exit_teams &&
2476  team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
2477  team->t.t_level == master_th->th.th_teams_level + 1) {
2478 // AC: We need to leave the team structure intact at the end of a parallel
2479 // inside the teams construct, so that the next parallel reuses the same
2480 // (hot) team; only adjust the nesting levels
2481 #if OMPT_SUPPORT
2482  ompt_data_t ompt_parallel_data = ompt_data_none;
2483  if (ompt_enabled.enabled) {
2484  ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2485  if (ompt_enabled.ompt_callback_implicit_task) {
2486  int ompt_team_size = team->t.t_nproc;
2487  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2488  ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size,
2489  OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
2490  }
2491  task_info->frame.exit_frame = ompt_data_none;
2492  task_info->task_data = ompt_data_none;
2493  ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
2494  __ompt_lw_taskteam_unlink(master_th);
2495  }
2496 #endif
2497  /* Decrement our nested depth level */
2498  team->t.t_level--;
2499  team->t.t_active_level--;
2500  KMP_ATOMIC_DEC(&root->r.r_in_parallel);
2501 
2502  // Restore number of threads in the team if needed. This code relies on
2503  // the proper adjustment of th_teams_size.nth after the fork in
2504  // __kmp_teams_master on each teams primary thread in the case that
2505  // __kmp_reserve_threads reduced it.
2506  if (master_th->th.th_team_nproc < master_th->th.th_teams_size.nth) {
2507  int old_num = master_th->th.th_team_nproc;
2508  int new_num = master_th->th.th_teams_size.nth;
2509  kmp_info_t **other_threads = team->t.t_threads;
2510  team->t.t_nproc = new_num;
2511  for (int i = 0; i < old_num; ++i) {
2512  other_threads[i]->th.th_team_nproc = new_num;
2513  }
2514  // Adjust the state of the unused threads of the team
2515  for (int i = old_num; i < new_num; ++i) {
2516  // Re-initialize thread's barrier data.
2517  KMP_DEBUG_ASSERT(other_threads[i]);
2518  kmp_balign_t *balign = other_threads[i]->th.th_bar;
2519  for (int b = 0; b < bs_last_barrier; ++b) {
2520  balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
2521  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
2522 #if USE_DEBUGGER
2523  balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
2524 #endif
2525  }
2526  if (__kmp_tasking_mode != tskm_immediate_exec) {
2527  // Synchronize thread's task state
2528  other_threads[i]->th.th_task_state = master_th->th.th_task_state;
2529  }
2530  }
2531  }
2532 
2533 #if OMPT_SUPPORT
2534  if (ompt_enabled.enabled) {
2535  __kmp_join_ompt(gtid, master_th, parent_team, &ompt_parallel_data,
2536  OMPT_INVOKER(fork_context) | ompt_parallel_team, codeptr);
2537  }
2538 #endif
2539 
2540  return;
2541  }
2542 
2543  /* do cleanup and restore the parent team */
2544  master_th->th.th_info.ds.ds_tid = team->t.t_master_tid;
2545  master_th->th.th_local.this_construct = team->t.t_master_this_cons;
2546 
2547  master_th->th.th_dispatch = &parent_team->t.t_dispatch[team->t.t_master_tid];
2548 
2549  /* jc: The following lock has instructions with REL and ACQ semantics,
2550  separating the parallel user code called in this parallel region
2551  from the serial user code called after this function returns. */
2552  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2553 
2554  if (!master_th->th.th_teams_microtask ||
2555  team->t.t_level > master_th->th.th_teams_level) {
2556  /* Decrement our nested depth level */
2557  KMP_ATOMIC_DEC(&root->r.r_in_parallel);
2558  }
2559  KMP_DEBUG_ASSERT(root->r.r_in_parallel >= 0);
2560 
2561 #if OMPT_SUPPORT
2562  if (ompt_enabled.enabled) {
2563  ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2564  if (ompt_enabled.ompt_callback_implicit_task) {
2565  int flags = (team_microtask == (void *)__kmp_teams_master)
2566  ? ompt_task_initial
2567  : ompt_task_implicit;
2568  int ompt_team_size = (flags == ompt_task_initial) ? 0 : team->t.t_nproc;
2569  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2570  ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size,
2571  OMPT_CUR_TASK_INFO(master_th)->thread_num, flags);
2572  }
2573  task_info->frame.exit_frame = ompt_data_none;
2574  task_info->task_data = ompt_data_none;
2575  }
2576 #endif
2577 
2578  KF_TRACE(10, ("__kmp_join_call1: T#%d, this_thread=%p team=%p\n", 0,
2579  master_th, team));
2580  __kmp_pop_current_task_from_thread(master_th);
2581 
2582  master_th->th.th_def_allocator = team->t.t_def_allocator;
2583 
2584 #if OMPD_SUPPORT
2585  if (ompd_state & OMPD_ENABLE_BP)
2586  ompd_bp_parallel_end();
2587 #endif
2588  updateHWFPControl(team);
2589 
2590  if (root->r.r_active != master_active)
2591  root->r.r_active = master_active;
2592 
2593  __kmp_free_team(root, team USE_NESTED_HOT_ARG(
2594  master_th)); // this will free worker threads
2595 
2596  /* This race was fun to find. Make sure the following stays inside the
2597  critical region; otherwise assertions may fail occasionally because the old
2598  team may be reallocated and the hierarchy then appears inconsistent. Running
2599  it outside is actually safe and causes no bugs, only those assertion
2600  failures; it is just one deref & assign, so keep it in the critical region. */
2601  master_th->th.th_team = parent_team;
2602  master_th->th.th_team_nproc = parent_team->t.t_nproc;
2603  master_th->th.th_team_master = parent_team->t.t_threads[0];
2604  master_th->th.th_team_serialized = parent_team->t.t_serialized;
2605 
2606  /* restore serialized team, if need be */
2607  if (parent_team->t.t_serialized &&
2608  parent_team != master_th->th.th_serial_team &&
2609  parent_team != root->r.r_root_team) {
2610  __kmp_free_team(root,
2611  master_th->th.th_serial_team USE_NESTED_HOT_ARG(NULL));
2612  master_th->th.th_serial_team = parent_team;
2613  }
2614 
2615  if (__kmp_tasking_mode != tskm_immediate_exec) {
2616  if (master_th->th.th_task_state_top >
2617  0) { // Restore task state from memo stack
2618  KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2619  // Remember primary thread's state if we re-use this nested hot team
2620  master_th->th.th_task_state_memo_stack[master_th->th.th_task_state_top] =
2621  master_th->th.th_task_state;
2622  --master_th->th.th_task_state_top; // pop
2623  // Now restore state at this level
2624  master_th->th.th_task_state =
2625  master_th->th
2626  .th_task_state_memo_stack[master_th->th.th_task_state_top];
2627  }
2628  // Copy the task team from the parent team to the primary thread
2629  master_th->th.th_task_team =
2630  parent_team->t.t_task_team[master_th->th.th_task_state];
2631  KA_TRACE(20,
2632  ("__kmp_join_call: Primary T#%d restoring task_team %p, team %p\n",
2633  __kmp_gtid_from_thread(master_th), master_th->th.th_task_team,
2634  parent_team));
2635  }
2636 
2637  // TODO: GEH - cannot do this assertion because root thread not set up as
2638  // executing
2639  // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 0 );
2640  master_th->th.th_current_task->td_flags.executing = 1;
2641 
2642  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2643 
2644 #if OMPT_SUPPORT
2645  int flags =
2646  OMPT_INVOKER(fork_context) |
2647  ((team_microtask == (void *)__kmp_teams_master) ? ompt_parallel_league
2648  : ompt_parallel_team);
2649  if (ompt_enabled.enabled) {
2650  __kmp_join_ompt(gtid, master_th, parent_team, parallel_data, flags,
2651  codeptr);
2652  }
2653 #endif
2654 
2655  KMP_MB();
2656  KA_TRACE(20, ("__kmp_join_call: exit T#%d\n", gtid));
2657 }
2658 
2659 /* Check whether we should push an internal control record onto the
2660  serial team stack. If so, do it. */
2661 void __kmp_save_internal_controls(kmp_info_t *thread) {
2662 
2663  if (thread->th.th_team != thread->th.th_serial_team) {
2664  return;
2665  }
2666  if (thread->th.th_team->t.t_serialized > 1) {
2667  int push = 0;
2668 
2669  if (thread->th.th_team->t.t_control_stack_top == NULL) {
2670  push = 1;
2671  } else {
2672  if (thread->th.th_team->t.t_control_stack_top->serial_nesting_level !=
2673  thread->th.th_team->t.t_serialized) {
2674  push = 1;
2675  }
2676  }
2677  if (push) { /* push a record on the serial team's stack */
2678  kmp_internal_control_t *control =
2679  (kmp_internal_control_t *)__kmp_allocate(
2680  sizeof(kmp_internal_control_t));
2681 
2682  copy_icvs(control, &thread->th.th_current_task->td_icvs);
2683 
2684  control->serial_nesting_level = thread->th.th_team->t.t_serialized;
2685 
2686  control->next = thread->th.th_team->t.t_control_stack_top;
2687  thread->th.th_team->t.t_control_stack_top = control;
2688  }
2689  }
2690 }
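// Descriptive note (no behavior change): ICV-modifying entry points such as
// __kmp_set_num_threads() below call this helper first; a record is pushed
// only when the calling thread is inside a nested serialized parallel region,
// so that the saved ICVs can be restored when that serialized region ends
// (see __kmpc_end_serialized_parallel).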
2691 
2692 /* Changes set_nproc */
2693 void __kmp_set_num_threads(int new_nth, int gtid) {
2694  kmp_info_t *thread;
2695  kmp_root_t *root;
2696 
2697  KF_TRACE(10, ("__kmp_set_num_threads: new __kmp_nth = %d\n", new_nth));
2698  KMP_DEBUG_ASSERT(__kmp_init_serial);
2699 
2700  if (new_nth < 1)
2701  new_nth = 1;
2702  else if (new_nth > __kmp_max_nth)
2703  new_nth = __kmp_max_nth;
2704 
2705  KMP_COUNT_VALUE(OMP_set_numthreads, new_nth);
2706  thread = __kmp_threads[gtid];
2707  if (thread->th.th_current_task->td_icvs.nproc == new_nth)
2708  return; // nothing to do
2709 
2710  __kmp_save_internal_controls(thread);
2711 
2712  set__nproc(thread, new_nth);
2713 
2714  // If this omp_set_num_threads() call will cause the hot team size to be
2715  // reduced (in the absence of a num_threads clause), then reduce it now,
2716  // rather than waiting for the next parallel region.
2717  root = thread->th.th_root;
2718  if (__kmp_init_parallel && (!root->r.r_active) &&
2719  (root->r.r_hot_team->t.t_nproc > new_nth)
2720 #if KMP_NESTED_HOT_TEAMS
2721  && __kmp_hot_teams_max_level && !__kmp_hot_teams_mode
2722 #endif
2723  ) {
2724  kmp_team_t *hot_team = root->r.r_hot_team;
2725  int f;
2726 
2727  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2728 
2729  if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
2730  __kmp_resize_dist_barrier(hot_team, hot_team->t.t_nproc, new_nth);
2731  }
2732  // Release the extra threads we don't need any more.
2733  for (f = new_nth; f < hot_team->t.t_nproc; f++) {
2734  KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2735  if (__kmp_tasking_mode != tskm_immediate_exec) {
2736  // When decreasing team size, threads no longer in the team should unref
2737  // task team.
2738  hot_team->t.t_threads[f]->th.th_task_team = NULL;
2739  }
2740  __kmp_free_thread(hot_team->t.t_threads[f]);
2741  hot_team->t.t_threads[f] = NULL;
2742  }
2743  hot_team->t.t_nproc = new_nth;
2744 #if KMP_NESTED_HOT_TEAMS
2745  if (thread->th.th_hot_teams) {
2746  KMP_DEBUG_ASSERT(hot_team == thread->th.th_hot_teams[0].hot_team);
2747  thread->th.th_hot_teams[0].hot_team_nth = new_nth;
2748  }
2749 #endif
2750 
2751  if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
2752  hot_team->t.b->update_num_threads(new_nth);
2753  __kmp_add_threads_to_team(hot_team, new_nth);
2754  }
2755 
2756  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2757 
2758  // Update the t_nproc field in the threads that are still active.
2759  for (f = 0; f < new_nth; f++) {
2760  KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2761  hot_team->t.t_threads[f]->th.th_team_nproc = new_nth;
2762  }
2763  // Special flag to mark that the size was changed by an omp_set_num_threads() call
2764  hot_team->t.t_size_changed = -1;
2765  }
2766 }
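// A minimal user-level sketch (an assumption: this routine backs the
// omp_set_num_threads() API entry point; the example is illustrative only):
//
//   #include <omp.h>
//   int main() {
//     omp_set_num_threads(4);   // updates the nproc ICV of the calling thread
//     #pragma omp parallel      // the next region requests at most 4 threads
//     { /* ... */ }
//   }
//
// Note the eager shrink of the hot team above: when the new value is smaller,
// surplus workers are released here rather than at the next fork.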
2767 
2768 /* Changes max_active_levels */
2769 void __kmp_set_max_active_levels(int gtid, int max_active_levels) {
2770  kmp_info_t *thread;
2771 
2772  KF_TRACE(10, ("__kmp_set_max_active_levels: new max_active_levels for thread "
2773  "%d = (%d)\n",
2774  gtid, max_active_levels));
2775  KMP_DEBUG_ASSERT(__kmp_init_serial);
2776 
2777  // validate max_active_levels
2778  if (max_active_levels < 0) {
2779  KMP_WARNING(ActiveLevelsNegative, max_active_levels);
2780  // We ignore this call if the user has specified a negative value.
2781  // The current setting won't be changed. The last valid setting will be
2782  // used. A warning will be issued (if warnings are allowed as controlled by
2783  // the KMP_WARNINGS env var).
2784  KF_TRACE(10, ("__kmp_set_max_active_levels: the call is ignored: new "
2785  "max_active_levels for thread %d = (%d)\n",
2786  gtid, max_active_levels));
2787  return;
2788  }
2789  if (max_active_levels <= KMP_MAX_ACTIVE_LEVELS_LIMIT) {
2790  // it's OK, the max_active_levels is within the valid range: [ 0;
2791  // KMP_MAX_ACTIVE_LEVELS_LIMIT ]
2792  // We allow a zero value. (implementation defined behavior)
2793  } else {
2794  KMP_WARNING(ActiveLevelsExceedLimit, max_active_levels,
2795  KMP_MAX_ACTIVE_LEVELS_LIMIT);
2796  max_active_levels = KMP_MAX_ACTIVE_LEVELS_LIMIT;
2797  // Current upper limit is MAX_INT. (implementation defined behavior)
2798  // If the input exceeds the upper limit, we correct the input to be the
2799  // upper limit. (implementation defined behavior)
2800  // Actually, while the limit is MAX_INT, control should never reach here.
2801  }
2802  KF_TRACE(10, ("__kmp_set_max_active_levels: after validation: new "
2803  "max_active_levels for thread %d = (%d)\n",
2804  gtid, max_active_levels));
2805 
2806  thread = __kmp_threads[gtid];
2807 
2808  __kmp_save_internal_controls(thread);
2809 
2810  set__max_active_levels(thread, max_active_levels);
2811 }
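// A minimal sketch (an assumption: this backs the omp_set_max_active_levels()
// API routine; illustrative only):
//
//   omp_set_max_active_levels(2);         // allow two nested active levels
//   int n = omp_get_max_active_levels();  // reads the ICV back (getter below)
//
// Negative inputs are ignored with a warning and values above
// KMP_MAX_ACTIVE_LEVELS_LIMIT are clamped, as validated above.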
2812 
2813 /* Gets max_active_levels */
2814 int __kmp_get_max_active_levels(int gtid) {
2815  kmp_info_t *thread;
2816 
2817  KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d\n", gtid));
2818  KMP_DEBUG_ASSERT(__kmp_init_serial);
2819 
2820  thread = __kmp_threads[gtid];
2821  KMP_DEBUG_ASSERT(thread->th.th_current_task);
2822  KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d, curtask=%p, "
2823  "curtask_maxaclevel=%d\n",
2824  gtid, thread->th.th_current_task,
2825  thread->th.th_current_task->td_icvs.max_active_levels));
2826  return thread->th.th_current_task->td_icvs.max_active_levels;
2827 }
2828 
2829 // nteams-var per-device ICV
2830 void __kmp_set_num_teams(int num_teams) {
2831  if (num_teams > 0)
2832  __kmp_nteams = num_teams;
2833 }
2834 int __kmp_get_max_teams(void) { return __kmp_nteams; }
2835 // teams-thread-limit-var per-device ICV
2836 void __kmp_set_teams_thread_limit(int limit) {
2837  if (limit > 0)
2838  __kmp_teams_thread_limit = limit;
2839 }
2840 int __kmp_get_teams_thread_limit(void) { return __kmp_teams_thread_limit; }
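// A hedged usage sketch (an assumption: these per-device ICV setters/getters
// back the OpenMP 5.1 omp_set_num_teams()/omp_get_max_teams() and
// omp_set_teams_thread_limit()/omp_get_teams_thread_limit() API routines):
//
//   omp_set_num_teams(8);
//   omp_set_teams_thread_limit(16);
//   #pragma omp teams        // league of up to 8 teams, at most 16 threads each
//   { /* ... */ }
//
// Non-positive arguments are silently ignored by the setters above.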
2841 
2842 KMP_BUILD_ASSERT(sizeof(kmp_sched_t) == sizeof(int));
2843 KMP_BUILD_ASSERT(sizeof(enum sched_type) == sizeof(int));
2844 
2845 /* Changes def_sched_var ICV values (run-time schedule kind and chunk) */
2846 void __kmp_set_schedule(int gtid, kmp_sched_t kind, int chunk) {
2847  kmp_info_t *thread;
2848  kmp_sched_t orig_kind;
2849  // kmp_team_t *team;
2850 
2851  KF_TRACE(10, ("__kmp_set_schedule: new schedule for thread %d = (%d, %d)\n",
2852  gtid, (int)kind, chunk));
2853  KMP_DEBUG_ASSERT(__kmp_init_serial);
2854 
2855  // Check if the kind parameter is valid, correct if needed.
2856  // Valid parameters should fit in one of two intervals - standard or extended:
2857  // <lower>, <valid>, <upper_std>, <lower_ext>, <valid>, <upper>
2858  // 2008-01-25: 0, 1 - 4, 5, 100, 101 - 102, 103
2859  orig_kind = kind;
2860  kind = __kmp_sched_without_mods(kind);
2861 
2862  if (kind <= kmp_sched_lower || kind >= kmp_sched_upper ||
2863  (kind <= kmp_sched_lower_ext && kind >= kmp_sched_upper_std)) {
2864  // TODO: Hint needs attention in case we change the default schedule.
2865  __kmp_msg(kmp_ms_warning, KMP_MSG(ScheduleKindOutOfRange, kind),
2866  KMP_HNT(DefaultScheduleKindUsed, "static, no chunk"),
2867  __kmp_msg_null);
2868  kind = kmp_sched_default;
2869  chunk = 0; // ignore chunk value in case of bad kind
2870  }
2871 
2872  thread = __kmp_threads[gtid];
2873 
2874  __kmp_save_internal_controls(thread);
2875 
2876  if (kind < kmp_sched_upper_std) {
2877  if (kind == kmp_sched_static && chunk < KMP_DEFAULT_CHUNK) {
2878  // differentiate static chunked vs. unchunked: the chunk should be invalid
2879  // to indicate an unchunked schedule (which is the default)
2880  thread->th.th_current_task->td_icvs.sched.r_sched_type = kmp_sch_static;
2881  } else {
2882  thread->th.th_current_task->td_icvs.sched.r_sched_type =
2883  __kmp_sch_map[kind - kmp_sched_lower - 1];
2884  }
2885  } else {
2886  // __kmp_sch_map[ kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2887  // kmp_sched_lower - 2 ];
2888  thread->th.th_current_task->td_icvs.sched.r_sched_type =
2889  __kmp_sch_map[kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2890  kmp_sched_lower - 2];
2891  }
2892  __kmp_sched_apply_mods_intkind(
2893  orig_kind, &(thread->th.th_current_task->td_icvs.sched.r_sched_type));
2894  if (kind == kmp_sched_auto || chunk < 1) {
2895  // ignore parameter chunk for schedule auto
2896  thread->th.th_current_task->td_icvs.sched.chunk = KMP_DEFAULT_CHUNK;
2897  } else {
2898  thread->th.th_current_task->td_icvs.sched.chunk = chunk;
2899  }
2900 }
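// A minimal sketch (an assumption: this backs the omp_set_schedule() API
// routine; illustrative only):
//
//   omp_set_schedule(omp_sched_dynamic, 4);   // run-sched-var = dynamic,4
//   #pragma omp parallel for schedule(runtime)
//   for (int i = 0; i < n; ++i) { /* ... */ }
//
// An out-of-range kind falls back to the default schedule with a warning, and
// the chunk is ignored for auto or non-positive values, as handled above.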
2901 
2902 /* Gets def_sched_var ICV values */
2903 void __kmp_get_schedule(int gtid, kmp_sched_t *kind, int *chunk) {
2904  kmp_info_t *thread;
2905  enum sched_type th_type;
2906 
2907  KF_TRACE(10, ("__kmp_get_schedule: thread %d\n", gtid));
2908  KMP_DEBUG_ASSERT(__kmp_init_serial);
2909 
2910  thread = __kmp_threads[gtid];
2911 
2912  th_type = thread->th.th_current_task->td_icvs.sched.r_sched_type;
2913  switch (SCHEDULE_WITHOUT_MODIFIERS(th_type)) {
2914  case kmp_sch_static:
2915  case kmp_sch_static_greedy:
2916  case kmp_sch_static_balanced:
2917  *kind = kmp_sched_static;
2918  __kmp_sched_apply_mods_stdkind(kind, th_type);
2919  *chunk = 0; // chunk was not set; indicate this with a zero value
2920  return;
2921  case kmp_sch_static_chunked:
2922  *kind = kmp_sched_static;
2923  break;
2924  case kmp_sch_dynamic_chunked:
2925  *kind = kmp_sched_dynamic;
2926  break;
2927  case kmp_sch_guided_chunked:
2928  case kmp_sch_guided_iterative_chunked:
2929  case kmp_sch_guided_analytical_chunked:
2930  *kind = kmp_sched_guided;
2931  break;
2932  case kmp_sch_auto:
2933  *kind = kmp_sched_auto;
2934  break;
2935  case kmp_sch_trapezoidal:
2936  *kind = kmp_sched_trapezoidal;
2937  break;
2938 #if KMP_STATIC_STEAL_ENABLED
2939  case kmp_sch_static_steal:
2940  *kind = kmp_sched_static_steal;
2941  break;
2942 #endif
2943  default:
2944  KMP_FATAL(UnknownSchedulingType, th_type);
2945  }
2946 
2947  __kmp_sched_apply_mods_stdkind(kind, th_type);
2948  *chunk = thread->th.th_current_task->td_icvs.sched.chunk;
2949 }
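// A matching query sketch (an assumption: this backs omp_get_schedule()):
//
//   omp_sched_t kind;
//   int chunk;
//   omp_get_schedule(&kind, &chunk);  // e.g. kind == omp_sched_dynamic, chunk == 4
//
// A zero chunk reported for plain static means no chunk was specified.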
2950 
2951 int __kmp_get_ancestor_thread_num(int gtid, int level) {
2952 
2953  int ii, dd;
2954  kmp_team_t *team;
2955  kmp_info_t *thr;
2956 
2957  KF_TRACE(10, ("__kmp_get_ancestor_thread_num: thread %d %d\n", gtid, level));
2958  KMP_DEBUG_ASSERT(__kmp_init_serial);
2959 
2960  // validate level
2961  if (level == 0)
2962  return 0;
2963  if (level < 0)
2964  return -1;
2965  thr = __kmp_threads[gtid];
2966  team = thr->th.th_team;
2967  ii = team->t.t_level;
2968  if (level > ii)
2969  return -1;
2970 
2971  if (thr->th.th_teams_microtask) {
2972  // AC: we are in teams region where multiple nested teams have same level
2973  int tlevel = thr->th.th_teams_level; // the level of the teams construct
2974  if (level <=
2975  tlevel) { // otherwise usual algorithm works (will not touch the teams)
2976  KMP_DEBUG_ASSERT(ii >= tlevel);
2977  // AC: As we need to pass by the teams league, we need to artificially
2978  // increase ii
2979  if (ii == tlevel) {
2980  ii += 2; // three teams have same level
2981  } else {
2982  ii++; // two teams have same level
2983  }
2984  }
2985  }
2986 
2987  if (ii == level)
2988  return __kmp_tid_from_gtid(gtid);
2989 
2990  dd = team->t.t_serialized;
2991  level++;
2992  while (ii > level) {
2993  for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
2994  }
2995  if ((team->t.t_serialized) && (!dd)) {
2996  team = team->t.t_parent;
2997  continue;
2998  }
2999  if (ii > level) {
3000  team = team->t.t_parent;
3001  dd = team->t.t_serialized;
3002  ii--;
3003  }
3004  }
3005 
3006  return (dd > 1) ? (0) : (team->t.t_master_tid);
3007 }
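// A minimal sketch (an assumption: this backs omp_get_ancestor_thread_num()):
//
//   #pragma omp parallel num_threads(2)
//   #pragma omp parallel num_threads(2)
//   {
//     int outer = omp_get_ancestor_thread_num(1); // tid within the level-1 team
//     int self  = omp_get_ancestor_thread_num(2); // == omp_get_thread_num()
//   }
//
// Level 0 always yields 0 and out-of-range levels yield -1, as handled above.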
3008 
3009 int __kmp_get_team_size(int gtid, int level) {
3010 
3011  int ii, dd;
3012  kmp_team_t *team;
3013  kmp_info_t *thr;
3014 
3015  KF_TRACE(10, ("__kmp_get_team_size: thread %d %d\n", gtid, level));
3016  KMP_DEBUG_ASSERT(__kmp_init_serial);
3017 
3018  // validate level
3019  if (level == 0)
3020  return 1;
3021  if (level < 0)
3022  return -1;
3023  thr = __kmp_threads[gtid];
3024  team = thr->th.th_team;
3025  ii = team->t.t_level;
3026  if (level > ii)
3027  return -1;
3028 
3029  if (thr->th.th_teams_microtask) {
3030  // AC: we are in teams region where multiple nested teams have same level
3031  int tlevel = thr->th.th_teams_level; // the level of the teams construct
3032  if (level <=
3033  tlevel) { // otherwise usual algorithm works (will not touch the teams)
3034  KMP_DEBUG_ASSERT(ii >= tlevel);
3035  // AC: As we need to pass by the teams league, we need to artificially
3036  // increase ii
3037  if (ii == tlevel) {
3038  ii += 2; // three teams have same level
3039  } else {
3040  ii++; // two teams have same level
3041  }
3042  }
3043  }
3044 
3045  while (ii > level) {
3046  for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
3047  }
3048  if (team->t.t_serialized && (!dd)) {
3049  team = team->t.t_parent;
3050  continue;
3051  }
3052  if (ii > level) {
3053  team = team->t.t_parent;
3054  ii--;
3055  }
3056  }
3057 
3058  return team->t.t_nproc;
3059 }
3060 
3061 kmp_r_sched_t __kmp_get_schedule_global() {
3062  // This routine was created because the pairs (__kmp_sched, __kmp_chunk) and
3063  // (__kmp_static, __kmp_guided) may be changed by kmp_set_defaults
3064  // independently. So one can get the updated schedule here.
3065 
3066  kmp_r_sched_t r_sched;
3067 
3068  // create schedule from 4 globals: __kmp_sched, __kmp_chunk, __kmp_static,
3069  // __kmp_guided. __kmp_sched should keep its original value, so the user can set
3070  // KMP_SCHEDULE multiple times, and thus have different run-time schedules in
3071  // different roots (even in OMP 2.5)
3072  enum sched_type s = SCHEDULE_WITHOUT_MODIFIERS(__kmp_sched);
3073  enum sched_type sched_modifiers = SCHEDULE_GET_MODIFIERS(__kmp_sched);
3074  if (s == kmp_sch_static) {
3075  // replace STATIC with more detailed schedule (balanced or greedy)
3076  r_sched.r_sched_type = __kmp_static;
3077  } else if (s == kmp_sch_guided_chunked) {
3078  // replace GUIDED with more detailed schedule (iterative or analytical)
3079  r_sched.r_sched_type = __kmp_guided;
3080  } else { // (STATIC_CHUNKED), or (DYNAMIC_CHUNKED), or other
3081  r_sched.r_sched_type = __kmp_sched;
3082  }
3083  SCHEDULE_SET_MODIFIERS(r_sched.r_sched_type, sched_modifiers);
3084 
3085  if (__kmp_chunk < KMP_DEFAULT_CHUNK) {
3086  // __kmp_chunk may be wrong here (if it was not ever set)
3087  r_sched.chunk = KMP_DEFAULT_CHUNK;
3088  } else {
3089  r_sched.chunk = __kmp_chunk;
3090  }
3091 
3092  return r_sched;
3093 }
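// Editorial note: an illustrative trace of the mapping above, assuming the
// usual OMP_SCHEDULE parsing in kmp_settings.cpp populates these globals:
//   OMP_SCHEDULE="guided,4" -> __kmp_sched = kmp_sch_guided_chunked,
//                              __kmp_chunk = 4
//   r_sched.r_sched_type    -> __kmp_guided (iterative or analytical variant)
//   r_sched.chunk           -> 4
// If no chunk was given, __kmp_chunk stays below KMP_DEFAULT_CHUNK and the
// default chunk is reported instead.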
3094 
3095 /* Allocate (realloc == FALSE) or reallocate (realloc == TRUE)
3096  at least argc number of *t_argv entries for the requested team. */
3097 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team, int realloc) {
3098 
3099  KMP_DEBUG_ASSERT(team);
3100  if (!realloc || argc > team->t.t_max_argc) {
3101 
3102  KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: needed entries=%d, "
3103  "current entries=%d\n",
3104  team->t.t_id, argc, (realloc) ? team->t.t_max_argc : 0));
3105  /* if previously allocated heap space for args, free them */
3106  if (realloc && team->t.t_argv != &team->t.t_inline_argv[0])
3107  __kmp_free((void *)team->t.t_argv);
3108 
3109  if (argc <= KMP_INLINE_ARGV_ENTRIES) {
3110  /* use unused space in the cache line for arguments */
3111  team->t.t_max_argc = KMP_INLINE_ARGV_ENTRIES;
3112  KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: inline allocate %d "
3113  "argv entries\n",
3114  team->t.t_id, team->t.t_max_argc));
3115  team->t.t_argv = &team->t.t_inline_argv[0];
3116  if (__kmp_storage_map) {
3117  __kmp_print_storage_map_gtid(
3118  -1, &team->t.t_inline_argv[0],
3119  &team->t.t_inline_argv[KMP_INLINE_ARGV_ENTRIES],
3120  (sizeof(void *) * KMP_INLINE_ARGV_ENTRIES), "team_%d.t_inline_argv",
3121  team->t.t_id);
3122  }
3123  } else {
3124  /* allocate space for arguments in the heap */
3125  team->t.t_max_argc = (argc <= (KMP_MIN_MALLOC_ARGV_ENTRIES >> 1))
3126  ? KMP_MIN_MALLOC_ARGV_ENTRIES
3127  : 2 * argc;
3128  KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: dynamic allocate %d "
3129  "argv entries\n",
3130  team->t.t_id, team->t.t_max_argc));
3131  team->t.t_argv =
3132  (void **)__kmp_page_allocate(sizeof(void *) * team->t.t_max_argc);
3133  if (__kmp_storage_map) {
3134  __kmp_print_storage_map_gtid(-1, &team->t.t_argv[0],
3135  &team->t.t_argv[team->t.t_max_argc],
3136  sizeof(void *) * team->t.t_max_argc,
3137  "team_%d.t_argv", team->t.t_id);
3138  }
3139  }
3140  }
3141 }
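// Editorial note: a summary of the sizing policy above (illustrative only):
//   argc <= KMP_INLINE_ARGV_ENTRIES          -> use t_inline_argv, no heap
//   argc <= KMP_MIN_MALLOC_ARGV_ENTRIES / 2  -> allocate KMP_MIN_MALLOC_ARGV_ENTRIES
//   otherwise                                -> allocate 2 * argc entries
// so that small, repeated growth in argc does not cause repeated reallocation.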
3142 
3143 static void __kmp_allocate_team_arrays(kmp_team_t *team, int max_nth) {
3144  int i;
3145  int num_disp_buff = max_nth > 1 ? __kmp_dispatch_num_buffers : 2;
3146  team->t.t_threads =
3147  (kmp_info_t **)__kmp_allocate(sizeof(kmp_info_t *) * max_nth);
3148  team->t.t_disp_buffer = (dispatch_shared_info_t *)__kmp_allocate(
3149  sizeof(dispatch_shared_info_t) * num_disp_buff);
3150  team->t.t_dispatch =
3151  (kmp_disp_t *)__kmp_allocate(sizeof(kmp_disp_t) * max_nth);
3152  team->t.t_implicit_task_taskdata =
3153  (kmp_taskdata_t *)__kmp_allocate(sizeof(kmp_taskdata_t) * max_nth);
3154  team->t.t_max_nproc = max_nth;
3155 
3156  /* setup dispatch buffers */
3157  for (i = 0; i < num_disp_buff; ++i) {
3158  team->t.t_disp_buffer[i].buffer_index = i;
3159  team->t.t_disp_buffer[i].doacross_buf_idx = i;
3160  }
3161 }
3162 
3163 static void __kmp_free_team_arrays(kmp_team_t *team) {
3164  /* Note: this does not free the threads in t_threads (__kmp_free_threads) */
3165  int i;
3166  for (i = 0; i < team->t.t_max_nproc; ++i) {
3167  if (team->t.t_dispatch[i].th_disp_buffer != NULL) {
3168  __kmp_free(team->t.t_dispatch[i].th_disp_buffer);
3169  team->t.t_dispatch[i].th_disp_buffer = NULL;
3170  }
3171  }
3172 #if KMP_USE_HIER_SCHED
3173  __kmp_dispatch_free_hierarchies(team);
3174 #endif
3175  __kmp_free(team->t.t_threads);
3176  __kmp_free(team->t.t_disp_buffer);
3177  __kmp_free(team->t.t_dispatch);
3178  __kmp_free(team->t.t_implicit_task_taskdata);
3179  team->t.t_threads = NULL;
3180  team->t.t_disp_buffer = NULL;
3181  team->t.t_dispatch = NULL;
3182  team->t.t_implicit_task_taskdata = 0;
3183 }
3184 
3185 static void __kmp_reallocate_team_arrays(kmp_team_t *team, int max_nth) {
3186  kmp_info_t **oldThreads = team->t.t_threads;
3187 
3188  __kmp_free(team->t.t_disp_buffer);
3189  __kmp_free(team->t.t_dispatch);
3190  __kmp_free(team->t.t_implicit_task_taskdata);
3191  __kmp_allocate_team_arrays(team, max_nth);
3192 
3193  KMP_MEMCPY(team->t.t_threads, oldThreads,
3194  team->t.t_nproc * sizeof(kmp_info_t *));
3195 
3196  __kmp_free(oldThreads);
3197 }
3198 
3199 static kmp_internal_control_t __kmp_get_global_icvs(void) {
3200 
3201  kmp_r_sched_t r_sched =
3202  __kmp_get_schedule_global(); // get current state of scheduling globals
3203 
3204  KMP_DEBUG_ASSERT(__kmp_nested_proc_bind.used > 0);
3205 
3206  kmp_internal_control_t g_icvs = {
3207  0, // int serial_nesting_level; //corresponds to value of th_team_serialized
3208  (kmp_int8)__kmp_global.g.g_dynamic, // internal control for dynamic
3209  // adjustment of threads (per thread)
3210  (kmp_int8)__kmp_env_blocktime, // int bt_set; //internal control for
3211  // whether blocktime is explicitly set
3212  __kmp_dflt_blocktime, // int blocktime; //internal control for blocktime
3213 #if KMP_USE_MONITOR
3214  __kmp_bt_intervals, // int bt_intervals; //internal control for blocktime
3215 // intervals
3216 #endif
3217  __kmp_dflt_team_nth, // int nproc; //internal control for # of threads for
3218  // next parallel region (per thread)
3219  // (use a max ub on value if __kmp_parallel_initialize not called yet)
3220  __kmp_cg_max_nth, // int thread_limit;
3221  __kmp_dflt_max_active_levels, // int max_active_levels; //internal control
3222  // for max_active_levels
3223  r_sched, // kmp_r_sched_t sched; //internal control for runtime schedule
3224  // {sched,chunk} pair
3225  __kmp_nested_proc_bind.bind_types[0],
3226  __kmp_default_device,
3227  NULL // struct kmp_internal_control *next;
3228  };
3229 
3230  return g_icvs;
3231 }
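// Editorial note: for orientation only -- the globals gathered above are
// normally filled in during settings parsing (assumed mapping, see
// kmp_settings.cpp): OMP_NUM_THREADS -> __kmp_dflt_team_nth, OMP_DYNAMIC ->
// __kmp_global.g.g_dynamic, KMP_BLOCKTIME -> __kmp_dflt_blocktime,
// OMP_MAX_ACTIVE_LEVELS -> __kmp_dflt_max_active_levels, and OMP_PROC_BIND ->
// __kmp_nested_proc_bind.bind_types[0].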
3232 
3233 static kmp_internal_control_t __kmp_get_x_global_icvs(const kmp_team_t *team) {
3234 
3235  kmp_internal_control_t gx_icvs;
3236  gx_icvs.serial_nesting_level =
3237  0; // probably =team->t.t_serial like in save_inter_controls
3238  copy_icvs(&gx_icvs, &team->t.t_threads[0]->th.th_current_task->td_icvs);
3239  gx_icvs.next = NULL;
3240 
3241  return gx_icvs;
3242 }
3243 
3244 static void __kmp_initialize_root(kmp_root_t *root) {
3245  int f;
3246  kmp_team_t *root_team;
3247  kmp_team_t *hot_team;
3248  int hot_team_max_nth;
3249  kmp_r_sched_t r_sched =
3250  __kmp_get_schedule_global(); // get current state of scheduling globals
3251  kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3252  KMP_DEBUG_ASSERT(root);
3253  KMP_ASSERT(!root->r.r_begin);
3254 
3255  /* setup the root state structure */
3256  __kmp_init_lock(&root->r.r_begin_lock);
3257  root->r.r_begin = FALSE;
3258  root->r.r_active = FALSE;
3259  root->r.r_in_parallel = 0;
3260  root->r.r_blocktime = __kmp_dflt_blocktime;
3261 #if KMP_AFFINITY_SUPPORTED
3262  root->r.r_affinity_assigned = FALSE;
3263 #endif
3264 
3265  /* setup the root team for this task */
3266  /* allocate the root team structure */
3267  KF_TRACE(10, ("__kmp_initialize_root: before root_team\n"));
3268 
3269  root_team =
3270  __kmp_allocate_team(root,
3271  1, // new_nproc
3272  1, // max_nproc
3273 #if OMPT_SUPPORT
3274  ompt_data_none, // root parallel id
3275 #endif
3276  __kmp_nested_proc_bind.bind_types[0], &r_icvs,
3277  0 // argc
3278  USE_NESTED_HOT_ARG(NULL) // primary thread is unknown
3279  );
3280 #if USE_DEBUGGER
3281  // Non-NULL value should be assigned to make the debugger display the root
3282  // team.
3283  TCW_SYNC_PTR(root_team->t.t_pkfn, (microtask_t)(~0));
3284 #endif
3285 
3286  KF_TRACE(10, ("__kmp_initialize_root: after root_team = %p\n", root_team));
3287 
3288  root->r.r_root_team = root_team;
3289  root_team->t.t_control_stack_top = NULL;
3290 
3291  /* initialize root team */
3292  root_team->t.t_threads[0] = NULL;
3293  root_team->t.t_nproc = 1;
3294  root_team->t.t_serialized = 1;
3295  // TODO???: root_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3296  root_team->t.t_sched.sched = r_sched.sched;
3297  KA_TRACE(
3298  20,
3299  ("__kmp_initialize_root: init root team %d arrived: join=%u, plain=%u\n",
3300  root_team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
3301 
3302  /* setup the hot team for this task */
3303  /* allocate the hot team structure */
3304  KF_TRACE(10, ("__kmp_initialize_root: before hot_team\n"));
3305 
3306  hot_team =
3307  __kmp_allocate_team(root,
3308  1, // new_nproc
3309  __kmp_dflt_team_nth_ub * 2, // max_nproc
3310 #if OMPT_SUPPORT
3311  ompt_data_none, // root parallel id
3312 #endif
3313  __kmp_nested_proc_bind.bind_types[0], &r_icvs,
3314  0 // argc
3315  USE_NESTED_HOT_ARG(NULL) // primary thread is unknown
3316  );
3317  KF_TRACE(10, ("__kmp_initialize_root: after hot_team = %p\n", hot_team));
3318 
3319  root->r.r_hot_team = hot_team;
3320  root_team->t.t_control_stack_top = NULL;
3321 
3322  /* first-time initialization */
3323  hot_team->t.t_parent = root_team;
3324 
3325  /* initialize hot team */
3326  hot_team_max_nth = hot_team->t.t_max_nproc;
3327  for (f = 0; f < hot_team_max_nth; ++f) {
3328  hot_team->t.t_threads[f] = NULL;
3329  }
3330  hot_team->t.t_nproc = 1;
3331  // TODO???: hot_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3332  hot_team->t.t_sched.sched = r_sched.sched;
3333  hot_team->t.t_size_changed = 0;
3334 }
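// Editorial note: the root team above is the serialized, single-thread team
// the root runs in outside any parallel region, while the hot team is kept
// around and reused as the worker team for outer-level parallel regions,
// which is why it is sized up front to __kmp_dflt_team_nth_ub * 2.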
3335 
3336 #ifdef KMP_DEBUG
3337 
3338 typedef struct kmp_team_list_item {
3339  kmp_team_p const *entry;
3340  struct kmp_team_list_item *next;
3341 } kmp_team_list_item_t;
3342 typedef kmp_team_list_item_t *kmp_team_list_t;
3343 
3344 static void __kmp_print_structure_team_accum( // Add team to list of teams.
3345  kmp_team_list_t list, // List of teams.
3346  kmp_team_p const *team // Team to add.
3347 ) {
3348 
3349  // List must terminate with item where both entry and next are NULL.
3350  // Team is added to the list only once.
3351  // List is sorted in ascending order by team id.
3352  // Team id is *not* a key.
3353 
3354  kmp_team_list_t l;
3355 
3356  KMP_DEBUG_ASSERT(list != NULL);
3357  if (team == NULL) {
3358  return;
3359  }
3360 
3361  __kmp_print_structure_team_accum(list, team->t.t_parent);
3362  __kmp_print_structure_team_accum(list, team->t.t_next_pool);
3363 
3364  // Search list for the team.
3365  l = list;
3366  while (l->next != NULL && l->entry != team) {
3367  l = l->next;
3368  }
3369  if (l->next != NULL) {
3370  return; // Team has been added before, exit.
3371  }
3372 
3373  // Team is not found. Search list again for insertion point.
3374  l = list;
3375  while (l->next != NULL && l->entry->t.t_id <= team->t.t_id) {
3376  l = l->next;
3377  }
3378 
3379  // Insert team.
3380  {
3381  kmp_team_list_item_t *item = (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(
3382  sizeof(kmp_team_list_item_t));
3383  *item = *l;
3384  l->entry = team;
3385  l->next = item;
3386  }
3387 }
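// Editorial note: an illustrative trace of the sorted insert above. Starting
// from the terminator list {(NULL,NULL)} and accumulating teams with ids
// 3, 1, 3 (the same team object twice) yields {1 -> 3 -> (NULL,NULL)}: the
// first scan skips the duplicate, and the second scan finds the insertion
// point that keeps ids in ascending order.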
3388 
3389 static void __kmp_print_structure_team(char const *title,
3390                                        kmp_team_p const *team) {
3392  __kmp_printf("%s", title);
3393  if (team != NULL) {
3394  __kmp_printf("%2x %p\n", team->t.t_id, team);
3395  } else {
3396  __kmp_printf(" - (nil)\n");
3397  }
3398 }
3399 
3400 static void __kmp_print_structure_thread(char const *title,
3401  kmp_info_p const *thread) {
3402  __kmp_printf("%s", title);
3403  if (thread != NULL) {
3404  __kmp_printf("%2d %p\n", thread->th.th_info.ds.ds_gtid, thread);
3405  } else {
3406  __kmp_printf(" - (nil)\n");
3407  }
3408 }
3409 
3410 void __kmp_print_structure(void) {
3411 
3412  kmp_team_list_t list;
3413 
3414  // Initialize list of teams.
3415  list =
3416  (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(sizeof(kmp_team_list_item_t));
3417  list->entry = NULL;
3418  list->next = NULL;
3419 
3420  __kmp_printf("\n------------------------------\nGlobal Thread "
3421  "Table\n------------------------------\n");
3422  {
3423  int gtid;
3424  for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3425  __kmp_printf("%2d", gtid);
3426  if (__kmp_threads != NULL) {
3427  __kmp_printf(" %p", __kmp_threads[gtid]);
3428  }
3429  if (__kmp_root != NULL) {
3430  __kmp_printf(" %p", __kmp_root[gtid]);
3431  }
3432  __kmp_printf("\n");
3433  }
3434  }
3435 
3436  // Print out __kmp_threads array.
3437  __kmp_printf("\n------------------------------\nThreads\n--------------------"
3438  "----------\n");
3439  if (__kmp_threads != NULL) {
3440  int gtid;
3441  for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3442  kmp_info_t const *thread = __kmp_threads[gtid];
3443  if (thread != NULL) {
3444  __kmp_printf("GTID %2d %p:\n", gtid, thread);
3445  __kmp_printf(" Our Root: %p\n", thread->th.th_root);
3446  __kmp_print_structure_team(" Our Team: ", thread->th.th_team);
3447  __kmp_print_structure_team(" Serial Team: ",
3448  thread->th.th_serial_team);
3449  __kmp_printf(" Threads: %2d\n", thread->th.th_team_nproc);
3450  __kmp_print_structure_thread(" Primary: ",
3451  thread->th.th_team_master);
3452  __kmp_printf(" Serialized?: %2d\n", thread->th.th_team_serialized);
3453  __kmp_printf(" Set NProc: %2d\n", thread->th.th_set_nproc);
3454  __kmp_printf(" Set Proc Bind: %2d\n", thread->th.th_set_proc_bind);
3455  __kmp_print_structure_thread(" Next in pool: ",
3456  thread->th.th_next_pool);
3457  __kmp_printf("\n");
3458  __kmp_print_structure_team_accum(list, thread->th.th_team);
3459  __kmp_print_structure_team_accum(list, thread->th.th_serial_team);
3460  }
3461  }
3462  } else {
3463  __kmp_printf("Threads array is not allocated.\n");
3464  }
3465 
3466  // Print out __kmp_root array.
3467  __kmp_printf("\n------------------------------\nUbers\n----------------------"
3468  "--------\n");
3469  if (__kmp_root != NULL) {
3470  int gtid;
3471  for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3472  kmp_root_t const *root = __kmp_root[gtid];
3473  if (root != NULL) {
3474  __kmp_printf("GTID %2d %p:\n", gtid, root);
3475  __kmp_print_structure_team(" Root Team: ", root->r.r_root_team);
3476  __kmp_print_structure_team(" Hot Team: ", root->r.r_hot_team);
3477  __kmp_print_structure_thread(" Uber Thread: ",
3478  root->r.r_uber_thread);
3479  __kmp_printf(" Active?: %2d\n", root->r.r_active);
3480  __kmp_printf(" In Parallel: %2d\n",
3481  KMP_ATOMIC_LD_RLX(&root->r.r_in_parallel));
3482  __kmp_printf("\n");
3483  __kmp_print_structure_team_accum(list, root->r.r_root_team);
3484  __kmp_print_structure_team_accum(list, root->r.r_hot_team);
3485  }
3486  }
3487  } else {
3488  __kmp_printf("Ubers array is not allocated.\n");
3489  }
3490 
3491  __kmp_printf("\n------------------------------\nTeams\n----------------------"
3492  "--------\n");
3493  while (list->next != NULL) {
3494  kmp_team_p const *team = list->entry;
3495  int i;
3496  __kmp_printf("Team %2x %p:\n", team->t.t_id, team);
3497  __kmp_print_structure_team(" Parent Team: ", team->t.t_parent);
3498  __kmp_printf(" Primary TID: %2d\n", team->t.t_master_tid);
3499  __kmp_printf(" Max threads: %2d\n", team->t.t_max_nproc);
3500  __kmp_printf(" Levels of serial: %2d\n", team->t.t_serialized);
3501  __kmp_printf(" Number threads: %2d\n", team->t.t_nproc);
3502  for (i = 0; i < team->t.t_nproc; ++i) {
3503  __kmp_printf(" Thread %2d: ", i);
3504  __kmp_print_structure_thread("", team->t.t_threads[i]);
3505  }
3506  __kmp_print_structure_team(" Next in pool: ", team->t.t_next_pool);
3507  __kmp_printf("\n");
3508  list = list->next;
3509  }
3510 
3511  // Print out __kmp_thread_pool and __kmp_team_pool.
3512  __kmp_printf("\n------------------------------\nPools\n----------------------"
3513  "--------\n");
3514  __kmp_print_structure_thread("Thread pool: ",
3515  CCAST(kmp_info_t *, __kmp_thread_pool));
3516  __kmp_print_structure_team("Team pool: ",
3517  CCAST(kmp_team_t *, __kmp_team_pool));
3518  __kmp_printf("\n");
3519 
3520  // Free team list.
3521  while (list != NULL) {
3522  kmp_team_list_item_t *item = list;
3523  list = list->next;
3524  KMP_INTERNAL_FREE(item);
3525  }
3526 }
3527 
3528 #endif
3529 
3530 //---------------------------------------------------------------------------
3531 // Stuff for per-thread fast random number generator
3532 // Table of primes
3533 static const unsigned __kmp_primes[] = {
3534  0x9e3779b1, 0xffe6cc59, 0x2109f6dd, 0x43977ab5, 0xba5703f5, 0xb495a877,
3535  0xe1626741, 0x79695e6b, 0xbc98c09f, 0xd5bee2b3, 0x287488f9, 0x3af18231,
3536  0x9677cd4d, 0xbe3a6929, 0xadc6a877, 0xdcf0674b, 0xbe4d6fe9, 0x5f15e201,
3537  0x99afc3fd, 0xf3f16801, 0xe222cfff, 0x24ba5fdb, 0x0620452d, 0x79f149e3,
3538  0xc8b93f49, 0x972702cd, 0xb07dd827, 0x6c97d5ed, 0x085a3d61, 0x46eb5ea7,
3539  0x3d9910ed, 0x2e687b5b, 0x29609227, 0x6eb081f1, 0x0954c4e1, 0x9d114db9,
3540  0x542acfa9, 0xb3e6bd7b, 0x0742d917, 0xe9f3ffa7, 0x54581edb, 0xf2480f45,
3541  0x0bb9288f, 0xef1affc7, 0x85fa0ca7, 0x3ccc14db, 0xe6baf34b, 0x343377f7,
3542  0x5ca19031, 0xe6d9293b, 0xf0a9f391, 0x5d2e980b, 0xfc411073, 0xc3749363,
3543  0xb892d829, 0x3549366b, 0x629750ad, 0xb98294e5, 0x892d9483, 0xc235baf3,
3544  0x3d2402a3, 0x6bdef3c9, 0xbec333cd, 0x40c9520f};
3545 
3546 //---------------------------------------------------------------------------
3547 // __kmp_get_random: Get a random number using a linear congruential method.
3548 unsigned short __kmp_get_random(kmp_info_t *thread) {
3549  unsigned x = thread->th.th_x;
3550  unsigned short r = (unsigned short)(x >> 16);
3551 
3552  thread->th.th_x = x * thread->th.th_a + 1;
3553 
3554  KA_TRACE(30, ("__kmp_get_random: THREAD: %d, RETURN: %u\n",
3555  thread->th.th_info.ds.ds_tid, r));
3556 
3557  return r;
3558 }
3559 //--------------------------------------------------------
3560 // __kmp_init_random: Initialize a random number generator
3561 void __kmp_init_random(kmp_info_t *thread) {
3562  unsigned seed = thread->th.th_info.ds.ds_tid;
3563 
3564  thread->th.th_a =
3565  __kmp_primes[seed % (sizeof(__kmp_primes) / sizeof(__kmp_primes[0]))];
3566  thread->th.th_x = (seed + 1) * thread->th.th_a + 1;
3567  KA_TRACE(30,
3568  ("__kmp_init_random: THREAD: %u; A: %u\n", seed, thread->th.th_a));
3569 }
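// Editorial note: the two routines above implement a per-thread 32-bit linear
// congruential generator, x_{n+1} = a * x_n + 1 (mod 2^32), returning the high
// 16 bits of the state; 'a' is picked from __kmp_primes by thread id. A
// standalone sketch of the same scheme (illustrative, assumes 32-bit unsigned):
//
//   typedef struct { unsigned a, x; } lcg_t;
//   static void lcg_init(lcg_t *g, unsigned seed, unsigned a) {
//     g->a = a;                  // multiplier, e.g. drawn from __kmp_primes
//     g->x = (seed + 1) * a + 1; // same seeding as __kmp_init_random
//   }
//   static unsigned short lcg_next(lcg_t *g) {
//     unsigned short r = (unsigned short)(g->x >> 16);
//     g->x = g->x * g->a + 1;    // x_{n+1} = a * x_n + 1 (mod 2^32)
//     return r;
//   }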
3570 
3571 #if KMP_OS_WINDOWS
3572 /* reclaim array entries for root threads that are already dead, returns number
3573  * reclaimed */
3574 static int __kmp_reclaim_dead_roots(void) {
3575  int i, r = 0;
3576 
3577  for (i = 0; i < __kmp_threads_capacity; ++i) {
3578  if (KMP_UBER_GTID(i) &&
3579  !__kmp_still_running((kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[i])) &&
3580  !__kmp_root[i]
3581  ->r.r_active) { // AC: reclaim only roots died in non-active state
3582  r += __kmp_unregister_root_other_thread(i);
3583  }
3584  }
3585  return r;
3586 }
3587 #endif
3588 
3589 /* This function attempts to create free entries in __kmp_threads and
3590  __kmp_root, and returns the number of free entries generated.
3591 
3592  For Windows* OS static library, the first mechanism used is to reclaim array
3593  entries for root threads that are already dead.
3594 
3595  On all platforms, expansion is attempted on the arrays __kmp_threads and
3596  __kmp_root, with appropriate update to __kmp_threads_capacity. Array
3597  capacity is increased by doubling with clipping to __kmp_tp_capacity, if
3598  threadprivate cache array has been created. Synchronization with
3599  __kmpc_threadprivate_cached is done using __kmp_tp_cached_lock.
3600 
3601  After any dead root reclamation, if the clipping value allows the array
3602  expansion to yield a total of nNeed free slots, the function performs that
3603  expansion. Otherwise, nothing is done beyond the possible initial root
3604  thread reclamation.
3605 
3606  If any argument is negative, the behavior is undefined. */
3607 static int __kmp_expand_threads(int nNeed) {
3608  int added = 0;
3609  int minimumRequiredCapacity;
3610  int newCapacity;
3611  kmp_info_t **newThreads;
3612  kmp_root_t **newRoot;
3613 
3614  // All calls to __kmp_expand_threads should be under __kmp_forkjoin_lock, so
3615  // resizing __kmp_threads does not need additional protection if foreign
3616  // threads are present
3617 
3618 #if KMP_OS_WINDOWS && !KMP_DYNAMIC_LIB
3619  /* only for Windows static library */
3620  /* reclaim array entries for root threads that are already dead */
3621  added = __kmp_reclaim_dead_roots();
3622 
3623  if (nNeed) {
3624  nNeed -= added;
3625  if (nNeed < 0)
3626  nNeed = 0;
3627  }
3628 #endif
3629  if (nNeed <= 0)
3630  return added;
3631 
3632  // Note that __kmp_threads_capacity is not bounded by __kmp_max_nth. If
3633  // __kmp_max_nth is set to some value less than __kmp_sys_max_nth by the
3634  // user via KMP_DEVICE_THREAD_LIMIT, then __kmp_threads_capacity may become
3635  // > __kmp_max_nth in one of two ways:
3636  //
3637  // 1) The initialization thread (gtid = 0) exits. __kmp_threads[0]
3638  // may not be reused by another thread, so we may need to increase
3639  // __kmp_threads_capacity to __kmp_max_nth + 1.
3640  //
3641  // 2) New foreign root(s) are encountered. We always register new foreign
3642  // roots. This may cause a smaller # of threads to be allocated at
3643  // subsequent parallel regions, but the worker threads hang around (and
3644  // eventually go to sleep) and need slots in the __kmp_threads[] array.
3645  //
3646  // Anyway, that is the reason for moving the check to see if
3647  // __kmp_max_nth was exceeded into __kmp_reserve_threads()
3648  // instead of having it performed here. -BB
3649 
3650  KMP_DEBUG_ASSERT(__kmp_sys_max_nth >= __kmp_threads_capacity);
3651 
3652  /* compute expansion headroom to check if we can expand */
3653  if (__kmp_sys_max_nth - __kmp_threads_capacity < nNeed) {
3654  /* possible expansion too small -- give up */
3655  return added;
3656  }
3657  minimumRequiredCapacity = __kmp_threads_capacity + nNeed;
3658 
3659  newCapacity = __kmp_threads_capacity;
3660  do {
3661  newCapacity = newCapacity <= (__kmp_sys_max_nth >> 1) ? (newCapacity << 1)
3662  : __kmp_sys_max_nth;
3663  } while (newCapacity < minimumRequiredCapacity);
3664  newThreads = (kmp_info_t **)__kmp_allocate(
3665  (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * newCapacity + CACHE_LINE);
3666  newRoot =
3667  (kmp_root_t **)((char *)newThreads + sizeof(kmp_info_t *) * newCapacity);
3668  KMP_MEMCPY(newThreads, __kmp_threads,
3669  __kmp_threads_capacity * sizeof(kmp_info_t *));
3670  KMP_MEMCPY(newRoot, __kmp_root,
3671  __kmp_threads_capacity * sizeof(kmp_root_t *));
3672 
3673  kmp_info_t **temp_threads = __kmp_threads;
3674  *(kmp_info_t * *volatile *)&__kmp_threads = newThreads;
3675  *(kmp_root_t * *volatile *)&__kmp_root = newRoot;
3676  __kmp_free(temp_threads);
3677  added += newCapacity - __kmp_threads_capacity;
3678  *(volatile int *)&__kmp_threads_capacity = newCapacity;
3679 
3680  if (newCapacity > __kmp_tp_capacity) {
3681  __kmp_acquire_bootstrap_lock(&__kmp_tp_cached_lock);
3682  if (__kmp_tp_cached && newCapacity > __kmp_tp_capacity) {
3683  __kmp_threadprivate_resize_cache(newCapacity);
3684  } else { // increase __kmp_tp_capacity to correspond with kmp_threads size
3685  *(volatile int *)&__kmp_tp_capacity = newCapacity;
3686  }
3687  __kmp_release_bootstrap_lock(&__kmp_tp_cached_lock);
3688  }
3689 
3690  return added;
3691 }
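// Editorial note: a worked example of the doubling loop above (illustrative
// values). With __kmp_threads_capacity == 64, nNeed == 100, and a large
// __kmp_sys_max_nth, minimumRequiredCapacity is 164 and newCapacity doubles
// 64 -> 128 -> 256, so 192 slots are added; the doubling is clipped to
// __kmp_sys_max_nth when it would otherwise overshoot that bound.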
3692 
3693 /* Register the current thread as a root thread and obtain our gtid. We must
3694  have the __kmp_initz_lock held at this point. The argument is TRUE only if
3695  we are the thread that calls from __kmp_do_serial_initialize(). */
3696 int __kmp_register_root(int initial_thread) {
3697  kmp_info_t *root_thread;
3698  kmp_root_t *root;
3699  int gtid;
3700  int capacity;
3701  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
3702  KA_TRACE(20, ("__kmp_register_root: entered\n"));
3703  KMP_MB();
3704 
3705  /* 2007-03-02:
3706  If the initial thread has not invoked the OpenMP RTL yet, and this thread
3707  is not an initial one, the "__kmp_all_nth >= __kmp_threads_capacity" check
3708  does not work as expected -- it may return false (meaning there is at
3709  least one empty slot in the __kmp_threads array), but it is possible that
3710  the only free slot is #0, which is reserved for the initial thread and so
3711  cannot be used for this one. The following code works around this bug.
3712 
3713  However, the right solution seems to be not to reserve slot #0 for the
3714  initial thread, because:
3715  (1) there is no magic in slot #0,
3716  (2) we cannot detect the initial thread reliably (the first thread that
3717  performs serial initialization may not be a real initial thread).
3718  */
3719  capacity = __kmp_threads_capacity;
3720  if (!initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) {
3721  --capacity;
3722  }
3723 
3724  // If it is not for initializing the hidden helper team, we need to take
3725  // __kmp_hidden_helper_threads_num out of the capacity because it is included
3726  // in __kmp_threads_capacity.
3727  if (__kmp_enable_hidden_helper && !TCR_4(__kmp_init_hidden_helper_threads)) {
3728  capacity -= __kmp_hidden_helper_threads_num;
3729  }
3730 
3731  /* see if there are too many threads */
3732  if (__kmp_all_nth >= capacity && !__kmp_expand_threads(1)) {
3733  if (__kmp_tp_cached) {
3734  __kmp_fatal(KMP_MSG(CantRegisterNewThread),
3735  KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
3736  KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
3737  } else {
3738  __kmp_fatal(KMP_MSG(CantRegisterNewThread), KMP_HNT(SystemLimitOnThreads),
3739  __kmp_msg_null);
3740  }
3741  }
3742 
3743  // When hidden helper tasks are enabled, __kmp_threads is organized as follows:
3744  // 0: initial thread, also a regular OpenMP thread.
3745  // [1, __kmp_hidden_helper_threads_num]: slots for hidden helper threads.
3746  // [__kmp_hidden_helper_threads_num + 1, __kmp_threads_capacity): slots for
3747  // regular OpenMP threads.
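  // Editorial note (example layout): with __kmp_hidden_helper_threads_num == 8,
  // gtid 0 is the initial thread, gtids 1..8 are hidden helper threads, and
  // gtids 9..__kmp_threads_capacity-1 are regular OpenMP threads.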
3748  if (TCR_4(__kmp_init_hidden_helper_threads)) {
3749  // Find an available thread slot for a hidden helper thread. Slots for
3750  // hidden helper threads range from 1 to __kmp_hidden_helper_threads_num.
3751  for (gtid = 1; TCR_PTR(__kmp_threads[gtid]) != NULL &&
3752  gtid <= __kmp_hidden_helper_threads_num;
3753  gtid++)
3754  ;
3755  KMP_ASSERT(gtid <= __kmp_hidden_helper_threads_num);
3756  KA_TRACE(1, ("__kmp_register_root: found slot in threads array for "
3757  "hidden helper thread: T#%d\n",
3758  gtid));
3759  } else {
3760  /* find an available thread slot */
3761  // Don't reassign the zero slot since we need that to only be used by
3762  // initial thread. Slots for hidden helper threads should also be skipped.
3763  if (initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) {
3764  gtid = 0;
3765  } else {
3766  for (gtid = __kmp_hidden_helper_threads_num + 1;
3767  TCR_PTR(__kmp_threads[gtid]) != NULL; gtid++)
3768  ;
3769  }
3770  KA_TRACE(
3771  1, ("__kmp_register_root: found slot in threads array: T#%d\n", gtid));
3772  KMP_ASSERT(gtid < __kmp_threads_capacity);
3773  }
3774 
3775  /* update global accounting */
3776  __kmp_all_nth++;
3777  TCW_4(__kmp_nth, __kmp_nth + 1);
3778 
3779  // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
3780  // numbers of procs, and method #2 (keyed API call) for higher numbers.
3781  if (__kmp_adjust_gtid_mode) {
3782  if (__kmp_all_nth >= __kmp_tls_gtid_min) {
3783  if (TCR_4(__kmp_gtid_mode) != 2) {
3784  TCW_4(__kmp_gtid_mode, 2);
3785  }
3786  } else {
3787  if (TCR_4(__kmp_gtid_mode) != 1) {
3788  TCW_4(__kmp_gtid_mode, 1);
3789  }
3790  }
3791  }
3792 
3793 #ifdef KMP_ADJUST_BLOCKTIME
3794  /* Adjust blocktime to zero if necessary */
3795  /* Middle initialization might not have occurred yet */
3796  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
3797  if (__kmp_nth > __kmp_avail_proc) {
3798  __kmp_zero_bt = TRUE;
3799  }
3800  }
3801 #endif /* KMP_ADJUST_BLOCKTIME */
3802 
3803  /* setup this new hierarchy */
3804  if (!(root = __kmp_root[gtid])) {
3805  root = __kmp_root[gtid] = (kmp_root_t *)__kmp_allocate(sizeof(kmp_root_t));
3806  KMP_DEBUG_ASSERT(!root->r.r_root_team);
3807  }
3808 
3809 #if KMP_STATS_ENABLED
3810  // Initialize stats as soon as possible (right after gtid assignment).
3811  __kmp_stats_thread_ptr = __kmp_stats_list->push_back(gtid);
3812  __kmp_stats_thread_ptr->startLife();
3813  KMP_SET_THREAD_STATE(SERIAL_REGION);
3814  KMP_INIT_PARTITIONED_TIMERS(OMP_serial);
3815 #endif
3816  __kmp_initialize_root(root);
3817 
3818  /* setup new root thread structure */
3819  if (root->r.r_uber_thread) {
3820  root_thread = root->r.r_uber_thread;
3821  } else {
3822  root_thread = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
3823  if (__kmp_storage_map) {
3824  __kmp_print_thread_storage_map(root_thread, gtid);
3825  }
3826  root_thread->th.th_info.ds.ds_gtid = gtid;
3827 #if OMPT_SUPPORT
3828  root_thread->th.ompt_thread_info.thread_data = ompt_data_none;
3829 #endif
3830  root_thread->th.th_root = root;
3831  if (__kmp_env_consistency_check) {
3832  root_thread->th.th_cons = __kmp_allocate_cons_stack(gtid);
3833  }
3834 #if USE_FAST_MEMORY
3835  __kmp_initialize_fast_memory(root_thread);
3836 #endif /* USE_FAST_MEMORY */
3837 
3838 #if KMP_USE_BGET
3839  KMP_DEBUG_ASSERT(root_thread->th.th_local.bget_data == NULL);
3840  __kmp_initialize_bget(root_thread);
3841 #endif
3842  __kmp_init_random(root_thread); // Initialize random number generator
3843  }
3844 
3845  /* setup the serial team held in reserve by the root thread */
3846  if (!root_thread->th.th_serial_team) {
3847  kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3848  KF_TRACE(10, ("__kmp_register_root: before serial_team\n"));
3849  root_thread->th.th_serial_team = __kmp_allocate_team(
3850  root, 1, 1,
3851 #if OMPT_SUPPORT
3852  ompt_data_none, // root parallel id
3853 #endif
3854  proc_bind_default, &r_icvs, 0 USE_NESTED_HOT_ARG(NULL));
3855  }
3856  KMP_ASSERT(root_thread->th.th_serial_team);
3857  KF_TRACE(10, ("__kmp_register_root: after serial_team = %p\n",
3858  root_thread->th.th_serial_team));
3859 
3860  /* drop root_thread into place */
3861  TCW_SYNC_PTR(__kmp_threads[gtid], root_thread);
3862 
3863  root->r.r_root_team->t.t_threads[0] = root_thread;
3864  root->r.r_hot_team->t.t_threads[0] = root_thread;
3865  root_thread->th.th_serial_team->t.t_threads[0] = root_thread;
3866  // AC: the team was created in reserve, not for execution (unused for now).
3867  root_thread->th.th_serial_team->t.t_serialized = 0;
3868  root->r.r_uber_thread = root_thread;
3869 
3870  /* initialize the thread, get it ready to go */
3871  __kmp_initialize_info(root_thread, root->r.r_root_team, 0, gtid);
3872  TCW_4(__kmp_init_gtid, TRUE);
3873 
3874  /* prepare the primary thread for get_gtid() */
3875  __kmp_gtid_set_specific(gtid);
3876 
3877 #if USE_ITT_BUILD
3878  __kmp_itt_thread_name(gtid);
3879 #endif /* USE_ITT_BUILD */
3880 
3881 #ifdef KMP_TDATA_GTID
3882  __kmp_gtid = gtid;
3883 #endif
3884  __kmp_create_worker(gtid, root_thread, __kmp_stksize);
3885  KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == gtid);
3886 
3887  KA_TRACE(20, ("__kmp_register_root: T#%d init T#%d(%d:%d) arrived: join=%u, "
3888  "plain=%u\n",
3889  gtid, __kmp_gtid_from_tid(0, root->r.r_hot_team),
3890  root->r.r_hot_team->t.t_id, 0, KMP_INIT_BARRIER_STATE,
3891  KMP_INIT_BARRIER_STATE));
3892  { // Initialize barrier data.
3893  int b;
3894  for (b = 0; b < bs_last_barrier; ++b) {
3895  root_thread->th.th_bar[b].bb.b_arrived = KMP_INIT_BARRIER_STATE;
3896 #if USE_DEBUGGER
3897  root_thread->th.th_bar[b].bb.b_worker_arrived = 0;
3898 #endif
3899  }
3900  }
3901  KMP_DEBUG_ASSERT(root->r.r_hot_team->t.t_bar[bs_forkjoin_barrier].b_arrived ==
3902  KMP_INIT_BARRIER_STATE);
3903 
3904 #if KMP_AFFINITY_SUPPORTED
3905  root_thread->th.th_current_place = KMP_PLACE_UNDEFINED;
3906  root_thread->th.th_new_place = KMP_PLACE_UNDEFINED;
3907  root_thread->th.th_first_place = KMP_PLACE_UNDEFINED;
3908  root_thread->th.th_last_place = KMP_PLACE_UNDEFINED;
3909 #endif /* KMP_AFFINITY_SUPPORTED */
3910  root_thread->th.th_def_allocator = __kmp_def_allocator;
3911  root_thread->th.th_prev_level = 0;
3912  root_thread->th.th_prev_num_threads = 1;
3913 
3914  kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t));
3915  tmp->cg_root = root_thread;
3916  tmp->cg_thread_limit = __kmp_cg_max_nth;
3917  tmp->cg_nthreads = 1;
3918  KA_TRACE(100, ("__kmp_register_root: Thread %p created node %p with"
3919  " cg_nthreads init to 1\n",
3920  root_thread, tmp));
3921  tmp->up = NULL;
3922  root_thread->th.th_cg_roots = tmp;
3923 
3924  __kmp_root_counter++;
3925 
3926 #if OMPT_SUPPORT
3927  if (!initial_thread && ompt_enabled.enabled) {
3928 
3929  kmp_info_t *root_thread = ompt_get_thread();
3930 
3931  ompt_set_thread_state(root_thread, ompt_state_overhead);
3932 
3933  if (ompt_enabled.ompt_callback_thread_begin) {
3934  ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
3935  ompt_thread_initial, __ompt_get_thread_data_internal());
3936  }
3937  ompt_data_t *task_data;
3938  ompt_data_t *parallel_data;
3939  __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data,
3940  NULL);
3941  if (ompt_enabled.ompt_callback_implicit_task) {
3942  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
3943  ompt_scope_begin, parallel_data, task_data, 1, 1, ompt_task_initial);
3944  }
3945 
3946  ompt_set_thread_state(root_thread, ompt_state_work_serial);
3947  }
3948 #endif
3949 #if OMPD_SUPPORT
3950  if (ompd_state & OMPD_ENABLE_BP)
3951  ompd_bp_thread_begin();
3952 #endif
3953 
3954  KMP_MB();
3955  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
3956 
3957  return gtid;
3958 }
3959 
3960 #if KMP_NESTED_HOT_TEAMS
3961 static int __kmp_free_hot_teams(kmp_root_t *root, kmp_info_t *thr, int level,
3962  const int max_level) {
3963  int i, n, nth;
3964  kmp_hot_team_ptr_t *hot_teams = thr->th.th_hot_teams;
3965  if (!hot_teams || !hot_teams[level].hot_team) {
3966  return 0;
3967  }
3968  KMP_DEBUG_ASSERT(level < max_level);
3969  kmp_team_t *team = hot_teams[level].hot_team;
3970  nth = hot_teams[level].hot_team_nth;
3971  n = nth - 1; // primary thread is not freed
3972  if (level < max_level - 1) {
3973  for (i = 0; i < nth; ++i) {
3974  kmp_info_t *th = team->t.t_threads[i];
3975  n += __kmp_free_hot_teams(root, th, level + 1, max_level);
3976  if (i > 0 && th->th.th_hot_teams) {
3977  __kmp_free(th->th.th_hot_teams);
3978  th->th.th_hot_teams = NULL;
3979  }
3980  }
3981  }
3982  __kmp_free_team(root, team, NULL);
3983  return n;
3984 }
3985 #endif
3986 
3987 // Resets a root thread and clears its root and hot teams.
3988 // Returns the number of __kmp_threads entries directly and indirectly freed.
3989 static int __kmp_reset_root(int gtid, kmp_root_t *root) {
3990  kmp_team_t *root_team = root->r.r_root_team;
3991  kmp_team_t *hot_team = root->r.r_hot_team;
3992  int n = hot_team->t.t_nproc;
3993  int i;
3994 
3995  KMP_DEBUG_ASSERT(!root->r.r_active);
3996 
3997  root->r.r_root_team = NULL;
3998  root->r.r_hot_team = NULL;
3999  // __kmp_free_team() does not free hot teams, so we have to clear r_hot_team
4000  // before call to __kmp_free_team().
4001  __kmp_free_team(root, root_team USE_NESTED_HOT_ARG(NULL));
4002 #if KMP_NESTED_HOT_TEAMS
4003  if (__kmp_hot_teams_max_level >
4004  0) { // need to free nested hot teams and their threads if any
4005  for (i = 0; i < hot_team->t.t_nproc; ++i) {
4006  kmp_info_t *th = hot_team->t.t_threads[i];
4007  if (__kmp_hot_teams_max_level > 1) {
4008  n += __kmp_free_hot_teams(root, th, 1, __kmp_hot_teams_max_level);
4009  }
4010  if (th->th.th_hot_teams) {
4011  __kmp_free(th->th.th_hot_teams);
4012  th->th.th_hot_teams = NULL;
4013  }
4014  }
4015  }
4016 #endif
4017  __kmp_free_team(root, hot_team USE_NESTED_HOT_ARG(NULL));
4018 
4019  // Before we can reap the thread, we need to make certain that all other
4020  // threads in the teams that had this root as ancestor have stopped trying to
4021  // steal tasks.
4022  if (__kmp_tasking_mode != tskm_immediate_exec) {
4023  __kmp_wait_to_unref_task_teams();
4024  }
4025 
4026 #if KMP_OS_WINDOWS
4027  /* Close Handle of root duplicated in __kmp_create_worker (tr #62919) */
4028  KA_TRACE(
4029  10, ("__kmp_reset_root: free handle, th = %p, handle = %" KMP_UINTPTR_SPEC
4030  "\n",
4031  (LPVOID) & (root->r.r_uber_thread->th),
4032  root->r.r_uber_thread->th.th_info.ds.ds_thread));
4033  __kmp_free_handle(root->r.r_uber_thread->th.th_info.ds.ds_thread);
4034 #endif /* KMP_OS_WINDOWS */
4035 
4036 #if OMPD_SUPPORT
4037  if (ompd_state & OMPD_ENABLE_BP)
4038  ompd_bp_thread_end();
4039 #endif
4040 
4041 #if OMPT_SUPPORT
4042  ompt_data_t *task_data;
4043  ompt_data_t *parallel_data;
4044  __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data,
4045  NULL);
4046  if (ompt_enabled.ompt_callback_implicit_task) {
4047  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
4048  ompt_scope_end, parallel_data, task_data, 0, 1, ompt_task_initial);
4049  }
4050  if (ompt_enabled.ompt_callback_thread_end) {
4051  ompt_callbacks.ompt_callback(ompt_callback_thread_end)(
4052  &(root->r.r_uber_thread->th.ompt_thread_info.thread_data));
4053  }
4054 #endif
4055 
4056  TCW_4(__kmp_nth,
4057  __kmp_nth - 1); // __kmp_reap_thread will decrement __kmp_all_nth.
4058  i = root->r.r_uber_thread->th.th_cg_roots->cg_nthreads--;
4059  KA_TRACE(100, ("__kmp_reset_root: Thread %p decrement cg_nthreads on node %p"
4060  " to %d\n",
4061  root->r.r_uber_thread, root->r.r_uber_thread->th.th_cg_roots,
4062  root->r.r_uber_thread->th.th_cg_roots->cg_nthreads));
4063  if (i == 1) {
4064  // need to free contention group structure
4065  KMP_DEBUG_ASSERT(root->r.r_uber_thread ==
4066  root->r.r_uber_thread->th.th_cg_roots->cg_root);
4067  KMP_DEBUG_ASSERT(root->r.r_uber_thread->th.th_cg_roots->up == NULL);
4068  __kmp_free(root->r.r_uber_thread->th.th_cg_roots);
4069  root->r.r_uber_thread->th.th_cg_roots = NULL;
4070  }
4071  __kmp_reap_thread(root->r.r_uber_thread, 1);
4072 
4073  // We cannot put the root thread into __kmp_thread_pool, so we have to reap
4074  // it instead of freeing it.
4075  root->r.r_uber_thread = NULL;
4076  /* mark root as no longer in use */
4077  root->r.r_begin = FALSE;
4078 
4079  return n;
4080 }
4081 
4082 void __kmp_unregister_root_current_thread(int gtid) {
4083  KA_TRACE(1, ("__kmp_unregister_root_current_thread: enter T#%d\n", gtid));
4084  /* this lock should be ok, since unregister_root_current_thread is never
4085  called during an abort, only during a normal close. furthermore, if you
4086  have the forkjoin lock, you should never try to get the initz lock */
4087  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
4088  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
4089  KC_TRACE(10, ("__kmp_unregister_root_current_thread: already finished, "
4090  "exiting T#%d\n",
4091  gtid));
4092  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
4093  return;
4094  }
4095  kmp_root_t *root = __kmp_root[gtid];
4096 
4097  KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
4098  KMP_ASSERT(KMP_UBER_GTID(gtid));
4099  KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
4100  KMP_ASSERT(root->r.r_active == FALSE);
4101 
4102  KMP_MB();
4103 
4104  kmp_info_t *thread = __kmp_threads[gtid];
4105  kmp_team_t *team = thread->th.th_team;
4106  kmp_task_team_t *task_team = thread->th.th_task_team;
4107 
4108  // we need to wait for the proxy tasks before finishing the thread
4109  if (task_team != NULL && (task_team->tt.tt_found_proxy_tasks ||
4110  task_team->tt.tt_hidden_helper_task_encountered)) {
4111 #if OMPT_SUPPORT
4112  // the runtime is shutting down so we won't report any events
4113  thread->th.ompt_thread_info.state = ompt_state_undefined;
4114 #endif
4115  __kmp_task_team_wait(thread, team USE_ITT_BUILD_ARG(NULL));
4116  }
4117 
4118  __kmp_reset_root(gtid, root);
4119 
4120  KMP_MB();
4121  KC_TRACE(10,
4122  ("__kmp_unregister_root_current_thread: T#%d unregistered\n", gtid));
4123 
4124  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
4125 }
4126 
4127 #if KMP_OS_WINDOWS
4128 /* __kmp_forkjoin_lock must be already held
4129  Unregisters a root thread that is not the current thread. Returns the number
4130  of __kmp_threads entries freed as a result. */
4131 static int __kmp_unregister_root_other_thread(int gtid) {
4132  kmp_root_t *root = __kmp_root[gtid];
4133  int r;
4134 
4135  KA_TRACE(1, ("__kmp_unregister_root_other_thread: enter T#%d\n", gtid));
4136  KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
4137  KMP_ASSERT(KMP_UBER_GTID(gtid));
4138  KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
4139  KMP_ASSERT(root->r.r_active == FALSE);
4140 
4141  r = __kmp_reset_root(gtid, root);
4142  KC_TRACE(10,
4143  ("__kmp_unregister_root_other_thread: T#%d unregistered\n", gtid));
4144  return r;
4145 }
4146 #endif
4147 
4148 #if KMP_DEBUG
4149 void __kmp_task_info() {
4150 
4151  kmp_int32 gtid = __kmp_entry_gtid();
4152  kmp_int32 tid = __kmp_tid_from_gtid(gtid);
4153  kmp_info_t *this_thr = __kmp_threads[gtid];
4154  kmp_team_t *steam = this_thr->th.th_serial_team;
4155  kmp_team_t *team = this_thr->th.th_team;
4156 
4157  __kmp_printf(
4158  "__kmp_task_info: gtid=%d tid=%d t_thread=%p team=%p steam=%p curtask=%p "
4159  "ptask=%p\n",
4160  gtid, tid, this_thr, team, steam, this_thr->th.th_current_task,
4161  team->t.t_implicit_task_taskdata[tid].td_parent);
4162 }
4163 #endif // KMP_DEBUG
4164 
4165 /* TODO optimize with one big memclr, take out what isn't needed, split
4166  responsibility to workers as much as possible, and delay initialization of
4167  features as much as possible */
4168 static void __kmp_initialize_info(kmp_info_t *this_thr, kmp_team_t *team,
4169  int tid, int gtid) {
4170  /* this_thr->th.th_info.ds.ds_gtid is setup in
4171  kmp_allocate_thread/create_worker.
4172  this_thr->th.th_serial_team is setup in __kmp_allocate_thread */
4173  KMP_DEBUG_ASSERT(this_thr != NULL);
4174  KMP_DEBUG_ASSERT(this_thr->th.th_serial_team);
4175  KMP_DEBUG_ASSERT(team);
4176  KMP_DEBUG_ASSERT(team->t.t_threads);
4177  KMP_DEBUG_ASSERT(team->t.t_dispatch);
4178  kmp_info_t *master = team->t.t_threads[0];
4179  KMP_DEBUG_ASSERT(master);
4180  KMP_DEBUG_ASSERT(master->th.th_root);
4181 
4182  KMP_MB();
4183 
4184  TCW_SYNC_PTR(this_thr->th.th_team, team);
4185 
4186  this_thr->th.th_info.ds.ds_tid = tid;
4187  this_thr->th.th_set_nproc = 0;
4188  if (__kmp_tasking_mode != tskm_immediate_exec)
4189  // When tasking is possible, threads are not safe to reap until they are
4190  // done tasking; this will be set when tasking code is exited in wait
4191  this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
4192  else // no tasking --> always safe to reap
4193  this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
4194  this_thr->th.th_set_proc_bind = proc_bind_default;
4195 #if KMP_AFFINITY_SUPPORTED
4196  this_thr->th.th_new_place = this_thr->th.th_current_place;
4197 #endif
4198  this_thr->th.th_root = master->th.th_root;
4199 
4200  /* setup the thread's cache of the team structure */
4201  this_thr->th.th_team_nproc = team->t.t_nproc;
4202  this_thr->th.th_team_master = master;
4203  this_thr->th.th_team_serialized = team->t.t_serialized;
4204 
4205  KMP_DEBUG_ASSERT(team->t.t_implicit_task_taskdata);
4206 
4207  KF_TRACE(10, ("__kmp_initialize_info1: T#%d:%d this_thread=%p curtask=%p\n",
4208  tid, gtid, this_thr, this_thr->th.th_current_task));
4209 
4210  __kmp_init_implicit_task(this_thr->th.th_team_master->th.th_ident, this_thr,
4211  team, tid, TRUE);
4212 
4213  KF_TRACE(10, ("__kmp_initialize_info2: T#%d:%d this_thread=%p curtask=%p\n",
4214  tid, gtid, this_thr, this_thr->th.th_current_task));
4215  // TODO: Initialize ICVs from parent; GEH - isn't that already done in
4216  // __kmp_initialize_team()?
4217 
4218  /* TODO no worksharing in speculative threads */
4219  this_thr->th.th_dispatch = &team->t.t_dispatch[tid];
4220 
4221  this_thr->th.th_local.this_construct = 0;
4222 
4223  if (!this_thr->th.th_pri_common) {
4224  this_thr->th.th_pri_common =
4225  (struct common_table *)__kmp_allocate(sizeof(struct common_table));
4226  if (__kmp_storage_map) {
4227  __kmp_print_storage_map_gtid(
4228  gtid, this_thr->th.th_pri_common, this_thr->th.th_pri_common + 1,
4229  sizeof(struct common_table), "th_%d.th_pri_common\n", gtid);
4230  }
4231  this_thr->th.th_pri_head = NULL;
4232  }
4233 
4234  if (this_thr != master && // Primary thread's CG root is initialized elsewhere
4235  this_thr->th.th_cg_roots != master->th.th_cg_roots) { // CG root not set
4236  // Make new thread's CG root same as primary thread's
4237  KMP_DEBUG_ASSERT(master->th.th_cg_roots);
4238  kmp_cg_root_t *tmp = this_thr->th.th_cg_roots;
4239  if (tmp) {
4240  // worker changes CG, need to check if old CG should be freed
4241  int i = tmp->cg_nthreads--;
4242  KA_TRACE(100, ("__kmp_initialize_info: Thread %p decrement cg_nthreads"
4243  " on node %p of thread %p to %d\n",
4244  this_thr, tmp, tmp->cg_root, tmp->cg_nthreads));
4245  if (i == 1) {
4246  __kmp_free(tmp); // last thread left CG --> free it
4247  }
4248  }
4249  this_thr->th.th_cg_roots = master->th.th_cg_roots;
4250  // Increment new thread's CG root's counter to add the new thread
4251  this_thr->th.th_cg_roots->cg_nthreads++;
4252  KA_TRACE(100, ("__kmp_initialize_info: Thread %p increment cg_nthreads on"
4253  " node %p of thread %p to %d\n",
4254  this_thr, this_thr->th.th_cg_roots,
4255  this_thr->th.th_cg_roots->cg_root,
4256  this_thr->th.th_cg_roots->cg_nthreads));
4257  this_thr->th.th_current_task->td_icvs.thread_limit =
4258  this_thr->th.th_cg_roots->cg_thread_limit;
4259  }
4260 
4261  /* Initialize dynamic dispatch */
4262  {
4263  volatile kmp_disp_t *dispatch = this_thr->th.th_dispatch;
4264  // Use team max_nproc since this will never change for the team.
4265  size_t disp_size =
4266  sizeof(dispatch_private_info_t) *
4267  (team->t.t_max_nproc == 1 ? 1 : __kmp_dispatch_num_buffers);
4268  KD_TRACE(10, ("__kmp_initialize_info: T#%d max_nproc: %d\n", gtid,
4269  team->t.t_max_nproc));
4270  KMP_ASSERT(dispatch);
4271  KMP_DEBUG_ASSERT(team->t.t_dispatch);
4272  KMP_DEBUG_ASSERT(dispatch == &team->t.t_dispatch[tid]);
4273 
4274  dispatch->th_disp_index = 0;
4275  dispatch->th_doacross_buf_idx = 0;
4276  if (!dispatch->th_disp_buffer) {
4277  dispatch->th_disp_buffer =
4278  (dispatch_private_info_t *)__kmp_allocate(disp_size);
4279 
4280  if (__kmp_storage_map) {
4281  __kmp_print_storage_map_gtid(
4282  gtid, &dispatch->th_disp_buffer[0],
4283  &dispatch->th_disp_buffer[team->t.t_max_nproc == 1
4284  ? 1
4285  : __kmp_dispatch_num_buffers],
4286  disp_size,
4287  "th_%d.th_dispatch.th_disp_buffer "
4288  "(team_%d.t_dispatch[%d].th_disp_buffer)",
4289  gtid, team->t.t_id, gtid);
4290  }
4291  } else {
4292  memset(&dispatch->th_disp_buffer[0], '\0', disp_size);
4293  }
4294 
4295  dispatch->th_dispatch_pr_current = 0;
4296  dispatch->th_dispatch_sh_current = 0;
4297 
4298  dispatch->th_deo_fcn = 0; /* ORDERED */
4299  dispatch->th_dxo_fcn = 0; /* END ORDERED */
4300  }
4301 
4302  this_thr->th.th_next_pool = NULL;
4303 
4304  if (!this_thr->th.th_task_state_memo_stack) {
4305  size_t i;
4306  this_thr->th.th_task_state_memo_stack =
4307  (kmp_uint8 *)__kmp_allocate(4 * sizeof(kmp_uint8));
4308  this_thr->th.th_task_state_top = 0;
4309  this_thr->th.th_task_state_stack_sz = 4;
4310  for (i = 0; i < this_thr->th.th_task_state_stack_sz;
4311  ++i) // zero init the stack
4312  this_thr->th.th_task_state_memo_stack[i] = 0;
4313  }
4314 
4315  KMP_DEBUG_ASSERT(!this_thr->th.th_spin_here);
4316  KMP_DEBUG_ASSERT(this_thr->th.th_next_waiting == 0);
4317 
4318  KMP_MB();
4319 }
4320 
4321 /* Allocate a new thread for the requesting team. This is only called from
4322  within a forkjoin critical section. We first try to get an available
4323  thread from the thread pool; if none is available, we fork a new one,
4324  assuming we are able to create one. This should be assured, as the
4325  caller should have checked on this first. */
4326 kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team,
4327  int new_tid) {
4328  kmp_team_t *serial_team;
4329  kmp_info_t *new_thr;
4330  int new_gtid;
4331 
4332  KA_TRACE(20, ("__kmp_allocate_thread: T#%d\n", __kmp_get_gtid()));
4333  KMP_DEBUG_ASSERT(root && team);
4334 #if !KMP_NESTED_HOT_TEAMS
4335  KMP_DEBUG_ASSERT(KMP_MASTER_GTID(__kmp_get_gtid()));
4336 #endif
4337  KMP_MB();
4338 
4339  /* first, try to get one from the thread pool */
4340  if (__kmp_thread_pool) {
4341  new_thr = CCAST(kmp_info_t *, __kmp_thread_pool);
4342  __kmp_thread_pool = (volatile kmp_info_t *)new_thr->th.th_next_pool;
4343  if (new_thr == __kmp_thread_pool_insert_pt) {
4344  __kmp_thread_pool_insert_pt = NULL;
4345  }
4346  TCW_4(new_thr->th.th_in_pool, FALSE);
4347  __kmp_suspend_initialize_thread(new_thr);
4348  __kmp_lock_suspend_mx(new_thr);
4349  if (new_thr->th.th_active_in_pool == TRUE) {
4350  KMP_DEBUG_ASSERT(new_thr->th.th_active == TRUE);
4351  KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
4352  new_thr->th.th_active_in_pool = FALSE;
4353  }
4354  __kmp_unlock_suspend_mx(new_thr);
4355 
4356  KA_TRACE(20, ("__kmp_allocate_thread: T#%d using thread T#%d\n",
4357  __kmp_get_gtid(), new_thr->th.th_info.ds.ds_gtid));
4358  KMP_ASSERT(!new_thr->th.th_team);
4359  KMP_DEBUG_ASSERT(__kmp_nth < __kmp_threads_capacity);
4360 
4361  /* setup the thread structure */
4362  __kmp_initialize_info(new_thr, team, new_tid,
4363  new_thr->th.th_info.ds.ds_gtid);
4364  KMP_DEBUG_ASSERT(new_thr->th.th_serial_team);
4365 
4366  TCW_4(__kmp_nth, __kmp_nth + 1);
4367 
4368  new_thr->th.th_task_state = 0;
4369  new_thr->th.th_task_state_top = 0;
4370  new_thr->th.th_task_state_stack_sz = 4;
4371 
4372  if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
4373  // Make sure pool thread has transitioned to waiting on own thread struct
4374  KMP_DEBUG_ASSERT(new_thr->th.th_used_in_team.load() == 0);
4375  // Thread activated in __kmp_allocate_team when increasing team size
4376  }
4377 
4378 #ifdef KMP_ADJUST_BLOCKTIME
4379  /* Adjust blocktime back to zero if necessary */
4380  /* Middle initialization might not have occurred yet */
4381  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4382  if (__kmp_nth > __kmp_avail_proc) {
4383  __kmp_zero_bt = TRUE;
4384  }
4385  }
4386 #endif /* KMP_ADJUST_BLOCKTIME */
4387 
4388 #if KMP_DEBUG
4389  // If the thread entered the pool via __kmp_free_thread, wait_flag should
4390  // not equal KMP_BARRIER_PARENT_FLAG.
4391  int b;
4392  kmp_balign_t *balign = new_thr->th.th_bar;
4393  for (b = 0; b < bs_last_barrier; ++b)
4394  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
4395 #endif
4396 
4397  KF_TRACE(10, ("__kmp_allocate_thread: T#%d using thread %p T#%d\n",
4398  __kmp_get_gtid(), new_thr, new_thr->th.th_info.ds.ds_gtid));
4399 
4400  KMP_MB();
4401  return new_thr;
4402  }
4403 
4404  /* no, we'll fork a new one */
4405  KMP_ASSERT(__kmp_nth == __kmp_all_nth);
4406  KMP_ASSERT(__kmp_all_nth < __kmp_threads_capacity);
4407 
4408 #if KMP_USE_MONITOR
4409  // If this is the first worker thread the RTL is creating, then also
4410  // launch the monitor thread. We try to do this as early as possible.
4411  if (!TCR_4(__kmp_init_monitor)) {
4412  __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
4413  if (!TCR_4(__kmp_init_monitor)) {
4414  KF_TRACE(10, ("before __kmp_create_monitor\n"));
4415  TCW_4(__kmp_init_monitor, 1);
4416  __kmp_create_monitor(&__kmp_monitor);
4417  KF_TRACE(10, ("after __kmp_create_monitor\n"));
4418 #if KMP_OS_WINDOWS
4419  // AC: wait until monitor has started. This is a fix for CQ232808.
4420  // The reason is that if the library is loaded/unloaded in a loop with
4421  // small (parallel) work in between, then there is high probability that
4422  // monitor thread started after the library shutdown. At shutdown it is
4423  // too late to cope with the problem, because when the primary thread is
4424  // in DllMain (process detach) the monitor has no chances to start (it is
4425  // blocked), and primary thread has no means to inform the monitor that
4426  // the library has gone, because all the memory which the monitor can
4427  // access is going to be released/reset.
4428  while (TCR_4(__kmp_init_monitor) < 2) {
4429  KMP_YIELD(TRUE);
4430  }
4431  KF_TRACE(10, ("after monitor thread has started\n"));
4432 #endif
4433  }
4434  __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
4435  }
4436 #endif
4437 
4438  KMP_MB();
4439 
4440  {
4441  int new_start_gtid = TCR_4(__kmp_init_hidden_helper_threads)
4442  ? 1
4443  : __kmp_hidden_helper_threads_num + 1;
4444 
4445  for (new_gtid = new_start_gtid; TCR_PTR(__kmp_threads[new_gtid]) != NULL;
4446  ++new_gtid) {
4447  KMP_DEBUG_ASSERT(new_gtid < __kmp_threads_capacity);
4448  }
4449 
4450  if (TCR_4(__kmp_init_hidden_helper_threads)) {
4451  KMP_DEBUG_ASSERT(new_gtid <= __kmp_hidden_helper_threads_num);
4452  }
4453  }
4454 
4455  /* allocate space for it. */
4456  new_thr = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
4457 
4458  TCW_SYNC_PTR(__kmp_threads[new_gtid], new_thr);
4459 
4460 #if USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG
4461  // suppress race conditions detection on synchronization flags in debug mode
4462  // this helps to analyze library internals eliminating false positives
4463  __itt_suppress_mark_range(
4464  __itt_suppress_range, __itt_suppress_threading_errors,
4465  &new_thr->th.th_sleep_loc, sizeof(new_thr->th.th_sleep_loc));
4466  __itt_suppress_mark_range(
4467  __itt_suppress_range, __itt_suppress_threading_errors,
4468  &new_thr->th.th_reap_state, sizeof(new_thr->th.th_reap_state));
4469 #if KMP_OS_WINDOWS
4470  __itt_suppress_mark_range(
4471  __itt_suppress_range, __itt_suppress_threading_errors,
4472  &new_thr->th.th_suspend_init, sizeof(new_thr->th.th_suspend_init));
4473 #else
4474  __itt_suppress_mark_range(__itt_suppress_range,
4475  __itt_suppress_threading_errors,
4476  &new_thr->th.th_suspend_init_count,
4477  sizeof(new_thr->th.th_suspend_init_count));
4478 #endif
4479  // TODO: check if we need to also suppress b_arrived flags
4480  __itt_suppress_mark_range(__itt_suppress_range,
4481  __itt_suppress_threading_errors,
4482  CCAST(kmp_uint64 *, &new_thr->th.th_bar[0].bb.b_go),
4483  sizeof(new_thr->th.th_bar[0].bb.b_go));
4484  __itt_suppress_mark_range(__itt_suppress_range,
4485  __itt_suppress_threading_errors,
4486  CCAST(kmp_uint64 *, &new_thr->th.th_bar[1].bb.b_go),
4487  sizeof(new_thr->th.th_bar[1].bb.b_go));
4488  __itt_suppress_mark_range(__itt_suppress_range,
4489  __itt_suppress_threading_errors,
4490  CCAST(kmp_uint64 *, &new_thr->th.th_bar[2].bb.b_go),
4491  sizeof(new_thr->th.th_bar[2].bb.b_go));
4492 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG */
4493  if (__kmp_storage_map) {
4494  __kmp_print_thread_storage_map(new_thr, new_gtid);
4495  }
4496 
4497  // add the reserve serialized team, initialized from the team's primary thread
4498  {
4499  kmp_internal_control_t r_icvs = __kmp_get_x_global_icvs(team);
4500  KF_TRACE(10, ("__kmp_allocate_thread: before th_serial/serial_team\n"));
4501  new_thr->th.th_serial_team = serial_team =
4502  (kmp_team_t *)__kmp_allocate_team(root, 1, 1,
4503 #if OMPT_SUPPORT
4504  ompt_data_none, // root parallel id
4505 #endif
4506  proc_bind_default, &r_icvs,
4507  0 USE_NESTED_HOT_ARG(NULL));
4508  }
4509  KMP_ASSERT(serial_team);
4510  serial_team->t.t_serialized = 0; // AC: the team is created in reserve, not
4511  // for execution (it is unused for now).
4512  serial_team->t.t_threads[0] = new_thr;
4513  KF_TRACE(10,
4514  ("__kmp_allocate_thread: after th_serial/serial_team : new_thr=%p\n",
4515  new_thr));
4516 
4517  /* setup the thread structures */
4518  __kmp_initialize_info(new_thr, team, new_tid, new_gtid);
4519 
4520 #if USE_FAST_MEMORY
4521  __kmp_initialize_fast_memory(new_thr);
4522 #endif /* USE_FAST_MEMORY */
4523 
4524 #if KMP_USE_BGET
4525  KMP_DEBUG_ASSERT(new_thr->th.th_local.bget_data == NULL);
4526  __kmp_initialize_bget(new_thr);
4527 #endif
4528 
4529  __kmp_init_random(new_thr); // Initialize random number generator
4530 
4531  /* Initialize these only once when thread is grabbed for a team allocation */
4532  KA_TRACE(20,
4533  ("__kmp_allocate_thread: T#%d init go fork=%u, plain=%u\n",
4534  __kmp_get_gtid(), KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
4535 
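  // Reset per-barrier state so the new thread starts every barrier type from
  // the initial fork/join state: go flag at its initial value, no team
  // attached, not waiting, on-core barrier disabled.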
4536  int b;
4537  kmp_balign_t *balign = new_thr->th.th_bar;
4538  for (b = 0; b < bs_last_barrier; ++b) {
4539  balign[b].bb.b_go = KMP_INIT_BARRIER_STATE;
4540  balign[b].bb.team = NULL;
4541  balign[b].bb.wait_flag = KMP_BARRIER_NOT_WAITING;
4542  balign[b].bb.use_oncore_barrier = 0;
4543  }
4544 
4545  TCW_PTR(new_thr->th.th_sleep_loc, NULL);
4546  new_thr->th.th_sleep_loc_type = flag_unset;
4547 
4548  new_thr->th.th_spin_here = FALSE;
4549  new_thr->th.th_next_waiting = 0;
4550 #if KMP_OS_UNIX
4551  new_thr->th.th_blocking = false;
4552 #endif
4553 
4554 #if KMP_AFFINITY_SUPPORTED
4555  new_thr->th.th_current_place = KMP_PLACE_UNDEFINED;
4556  new_thr->th.th_new_place = KMP_PLACE_UNDEFINED;
4557  new_thr->th.th_first_place = KMP_PLACE_UNDEFINED;
4558  new_thr->th.th_last_place = KMP_PLACE_UNDEFINED;
4559 #endif
4560  new_thr->th.th_def_allocator = __kmp_def_allocator;
4561  new_thr->th.th_prev_level = 0;
4562  new_thr->th.th_prev_num_threads = 1;
4563 
4564  TCW_4(new_thr->th.th_in_pool, FALSE);
4565  new_thr->th.th_active_in_pool = FALSE;
4566  TCW_4(new_thr->th.th_active, TRUE);
4567 
4568  /* adjust the global counters */
4569  __kmp_all_nth++;
4570  __kmp_nth++;
4571 
4572  // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
4573  // numbers of procs, and method #2 (keyed API call) for higher numbers.
4574  if (__kmp_adjust_gtid_mode) {
4575  if (__kmp_all_nth >= __kmp_tls_gtid_min) {
4576  if (TCR_4(__kmp_gtid_mode) != 2) {
4577  TCW_4(__kmp_gtid_mode, 2);
4578  }
4579  } else {
4580  if (TCR_4(__kmp_gtid_mode) != 1) {
4581  TCW_4(__kmp_gtid_mode, 1);
4582  }
4583  }
4584  }
4585 
4586 #ifdef KMP_ADJUST_BLOCKTIME
4587  /* Adjust blocktime back to zero if necessary */
4588  /* Middle initialization might not have occurred yet */
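  // If the process is now oversubscribed (more OpenMP threads than available
  // processors), force the effective blocktime to zero so idle threads go to
  // sleep instead of spin-waiting.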
4589  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4590  if (__kmp_nth > __kmp_avail_proc) {
4591  __kmp_zero_bt = TRUE;
4592  }
4593  }
4594 #endif /* KMP_ADJUST_BLOCKTIME */
4595 
4596  /* actually fork it and create the new worker thread */
4597  KF_TRACE(
4598  10, ("__kmp_allocate_thread: before __kmp_create_worker: %p\n", new_thr));
4599  __kmp_create_worker(new_gtid, new_thr, __kmp_stksize);
4600  KF_TRACE(10,
4601  ("__kmp_allocate_thread: after __kmp_create_worker: %p\n", new_thr));
4602 
4603  KA_TRACE(20, ("__kmp_allocate_thread: T#%d forked T#%d\n", __kmp_get_gtid(),
4604  new_gtid));
4605  KMP_MB();
4606  return new_thr;
4607 }
4608 
4609 /* Reinitialize team for reuse.
4610  The hot team code calls this routine at every fork barrier, so EPCC barrier
4611  tests are extremely sensitive to changes in it, esp. writes to the team
4612  struct, which cause a cache invalidation in all threads.
4613  IF YOU TOUCH THIS ROUTINE, RUN EPCC C SYNCBENCH ON A BIG-IRON MACHINE!!! */
4614 static void __kmp_reinitialize_team(kmp_team_t *team,
4615  kmp_internal_control_t *new_icvs,
4616  ident_t *loc) {
4617  KF_TRACE(10, ("__kmp_reinitialize_team: enter this_thread=%p team=%p\n",
4618  team->t.t_threads[0], team));
4619  KMP_DEBUG_ASSERT(team && new_icvs);
4620  KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);
4621  KMP_CHECK_UPDATE(team->t.t_ident, loc);
4622 
4623  KMP_CHECK_UPDATE(team->t.t_id, KMP_GEN_TEAM_ID());
4624  // Copy ICVs to the primary thread's implicit taskdata
4625  __kmp_init_implicit_task(loc, team->t.t_threads[0], team, 0, FALSE);
4626  copy_icvs(&team->t.t_implicit_task_taskdata[0].td_icvs, new_icvs);
4627 
4628  KF_TRACE(10, ("__kmp_reinitialize_team: exit this_thread=%p team=%p\n",
4629  team->t.t_threads[0], team));
4630 }
4631 
4632 /* Initialize the team data structure.
4633  This assumes the t_threads and t_max_nproc are already set.
4634  Also, we don't touch the arguments */
4635 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
4636  kmp_internal_control_t *new_icvs,
4637  ident_t *loc) {
4638  KF_TRACE(10, ("__kmp_initialize_team: enter: team=%p\n", team));
4639 
4640  /* verify */
4641  KMP_DEBUG_ASSERT(team);
4642  KMP_DEBUG_ASSERT(new_nproc <= team->t.t_max_nproc);
4643  KMP_DEBUG_ASSERT(team->t.t_threads);
4644  KMP_MB();
4645 
4646  team->t.t_master_tid = 0; /* not needed */
4647  /* team->t.t_master_bar; not needed */
4648  team->t.t_serialized = new_nproc > 1 ? 0 : 1;
4649  team->t.t_nproc = new_nproc;
4650 
4651  /* team->t.t_parent = NULL; TODO not needed & would mess up hot team */
4652  team->t.t_next_pool = NULL;
4653  /* memset( team->t.t_threads, 0, sizeof(kmp_info_t*)*new_nproc ); would mess
4654  * up hot team */
4655 
4656  TCW_SYNC_PTR(team->t.t_pkfn, NULL); /* not needed */
4657  team->t.t_invoke = NULL; /* not needed */
4658 
4659  // TODO???: team->t.t_max_active_levels = new_max_active_levels;
4660  team->t.t_sched.sched = new_icvs->sched.sched;
4661 
4662 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
4663  team->t.t_fp_control_saved = FALSE; /* not needed */
4664  team->t.t_x87_fpu_control_word = 0; /* not needed */
4665  team->t.t_mxcsr = 0; /* not needed */
4666 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
4667 
4668  team->t.t_construct = 0;
4669 
4670  team->t.t_ordered.dt.t_value = 0;
4671  team->t.t_master_active = FALSE;
4672 
4673 #ifdef KMP_DEBUG
4674  team->t.t_copypriv_data = NULL; /* not necessary, but nice for debugging */
4675 #endif
4676 #if KMP_OS_WINDOWS
4677  team->t.t_copyin_counter = 0; /* for barrier-free copyin implementation */
4678 #endif
4679 
4680  team->t.t_control_stack_top = NULL;
4681 
4682  __kmp_reinitialize_team(team, new_icvs, loc);
4683 
4684  KMP_MB();
4685  KF_TRACE(10, ("__kmp_initialize_team: exit: team=%p\n", team));
4686 }
4687 
4688 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
4689 /* Sets full mask for the thread and returns the old mask; no changes to structures. */
4690 static void
4691 __kmp_set_thread_affinity_mask_full_tmp(kmp_affin_mask_t *old_mask) {
4692  if (KMP_AFFINITY_CAPABLE()) {
4693  int status;
4694  if (old_mask != NULL) {
4695  status = __kmp_get_system_affinity(old_mask, TRUE);
4696  int error = errno;
4697  if (status != 0) {
4698  __kmp_fatal(KMP_MSG(ChangeThreadAffMaskError), KMP_ERR(error),
4699  __kmp_msg_null);
4700  }
4701  }
4702  __kmp_set_system_affinity(__kmp_affin_fullMask, TRUE);
4703  }
4704 }
4705 #endif
4706 
4707 #if KMP_AFFINITY_SUPPORTED
4708 
4709 // __kmp_partition_places() is the heart of the OpenMP 4.0 affinity mechanism.
4710 // It calculates the worker + primary thread's partition based upon the parent
4711 // thread's partition, and binds each worker to a thread in their partition.
4712 // The primary thread's partition should already include its current binding.
4713 static void __kmp_partition_places(kmp_team_t *team, int update_master_only) {
4714  // Do not partition places for the hidden helper team
4715  if (KMP_HIDDEN_HELPER_TEAM(team))
4716  return;
4717  // Copy the primary thread's place partition to the team struct
4718  kmp_info_t *master_th = team->t.t_threads[0];
4719  KMP_DEBUG_ASSERT(master_th != NULL);
4720  kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
4721  int first_place = master_th->th.th_first_place;
4722  int last_place = master_th->th.th_last_place;
4723  int masters_place = master_th->th.th_current_place;
4724  team->t.t_first_place = first_place;
4725  team->t.t_last_place = last_place;
4726 
4727  KA_TRACE(20, ("__kmp_partition_places: enter: proc_bind = %d T#%d(%d:0) "
4728  "bound to place %d partition = [%d,%d]\n",
4729  proc_bind, __kmp_gtid_from_thread(team->t.t_threads[0]),
4730  team->t.t_id, masters_place, first_place, last_place));
4731 
4732  switch (proc_bind) {
4733 
4734  case proc_bind_default:
4735  // Serial teams might have the proc_bind policy set to proc_bind_default.
4736  // Not an issue -- we don't rebind primary thread for any proc_bind policy.
4737  KMP_DEBUG_ASSERT(team->t.t_nproc == 1);
4738  break;
4739 
4740  case proc_bind_primary: {
4741  int f;
4742  int n_th = team->t.t_nproc;
4743  for (f = 1; f < n_th; f++) {
4744  kmp_info_t *th = team->t.t_threads[f];
4745  KMP_DEBUG_ASSERT(th != NULL);
4746  th->th.th_first_place = first_place;
4747  th->th.th_last_place = last_place;
4748  th->th.th_new_place = masters_place;
4749  if (__kmp_display_affinity && masters_place != th->th.th_current_place &&
4750  team->t.t_display_affinity != 1) {
4751  team->t.t_display_affinity = 1;
4752  }
4753 
4754  KA_TRACE(100, ("__kmp_partition_places: primary: T#%d(%d:%d) place %d "
4755  "partition = [%d,%d]\n",
4756  __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
4757  f, masters_place, first_place, last_place));
4758  }
4759  } break;
4760 
4761  case proc_bind_close: {
4762  int f;
4763  int n_th = team->t.t_nproc;
4764  int n_places;
4765  if (first_place <= last_place) {
4766  n_places = last_place - first_place + 1;
4767  } else {
4768  n_places = __kmp_affinity_num_masks - first_place + last_place + 1;
4769  }
4770  if (n_th <= n_places) {
4771  int place = masters_place;
4772  for (f = 1; f < n_th; f++) {
4773  kmp_info_t *th = team->t.t_threads[f];
4774  KMP_DEBUG_ASSERT(th != NULL);
4775 
4776  if (place == last_place) {
4777  place = first_place;
4778  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4779  place = 0;
4780  } else {
4781  place++;
4782  }
4783  th->th.th_first_place = first_place;
4784  th->th.th_last_place = last_place;
4785  th->th.th_new_place = place;
4786  if (__kmp_display_affinity && place != th->th.th_current_place &&
4787  team->t.t_display_affinity != 1) {
4788  team->t.t_display_affinity = 1;
4789  }
4790 
4791  KA_TRACE(100, ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4792  "partition = [%d,%d]\n",
4793  __kmp_gtid_from_thread(team->t.t_threads[f]),
4794  team->t.t_id, f, place, first_place, last_place));
4795  }
4796  } else {
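      // More threads than places: give each place S = n_th / n_places threads;
      // the rem leftover threads are handed out one extra per place, spaced
      // every 'gap' places, starting from the primary thread's place.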
4797  int S, rem, gap, s_count;
4798  S = n_th / n_places;
4799  s_count = 0;
4800  rem = n_th - (S * n_places);
4801  gap = rem > 0 ? n_places / rem : n_places;
4802  int place = masters_place;
4803  int gap_ct = gap;
4804  for (f = 0; f < n_th; f++) {
4805  kmp_info_t *th = team->t.t_threads[f];
4806  KMP_DEBUG_ASSERT(th != NULL);
4807 
4808  th->th.th_first_place = first_place;
4809  th->th.th_last_place = last_place;
4810  th->th.th_new_place = place;
4811  if (__kmp_display_affinity && place != th->th.th_current_place &&
4812  team->t.t_display_affinity != 1) {
4813  team->t.t_display_affinity = 1;
4814  }
4815  s_count++;
4816 
4817  if ((s_count == S) && rem && (gap_ct == gap)) {
4818  // do nothing, add an extra thread to place on next iteration
4819  } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
4820  // we added an extra thread to this place; move to next place
4821  if (place == last_place) {
4822  place = first_place;
4823  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4824  place = 0;
4825  } else {
4826  place++;
4827  }
4828  s_count = 0;
4829  gap_ct = 1;
4830  rem--;
4831  } else if (s_count == S) { // place full; don't add extra
4832  if (place == last_place) {
4833  place = first_place;
4834  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4835  place = 0;
4836  } else {
4837  place++;
4838  }
4839  gap_ct++;
4840  s_count = 0;
4841  }
4842 
4843  KA_TRACE(100,
4844  ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4845  "partition = [%d,%d]\n",
4846  __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id, f,
4847  th->th.th_new_place, first_place, last_place));
4848  }
4849  KMP_DEBUG_ASSERT(place == masters_place);
4850  }
4851  } break;
4852 
4853  case proc_bind_spread: {
4854  int f;
4855  int n_th = team->t.t_nproc;
4856  int n_places;
4857  int thidx;
4858  if (first_place <= last_place) {
4859  n_places = last_place - first_place + 1;
4860  } else {
4861  n_places = __kmp_affinity_num_masks - first_place + last_place + 1;
4862  }
4863  if (n_th <= n_places) {
4864  int place = -1;
4865 
4866  if (n_places != static_cast<int>(__kmp_affinity_num_masks)) {
4867  int S = n_places / n_th;
4868  int s_count, rem, gap, gap_ct;
4869 
4870  place = masters_place;
4871  rem = n_places - n_th * S;
4872  gap = rem ? n_th / rem : 1;
4873  gap_ct = gap;
4874  thidx = n_th;
4875  if (update_master_only == 1)
4876  thidx = 1;
4877  for (f = 0; f < thidx; f++) {
4878  kmp_info_t *th = team->t.t_threads[f];
4879  KMP_DEBUG_ASSERT(th != NULL);
4880 
4881  th->th.th_first_place = place;
4882  th->th.th_new_place = place;
4883  if (__kmp_display_affinity && place != th->th.th_current_place &&
4884  team->t.t_display_affinity != 1) {
4885  team->t.t_display_affinity = 1;
4886  }
4887  s_count = 1;
4888  while (s_count < S) {
4889  if (place == last_place) {
4890  place = first_place;
4891  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4892  place = 0;
4893  } else {
4894  place++;
4895  }
4896  s_count++;
4897  }
4898  if (rem && (gap_ct == gap)) {
4899  if (place == last_place) {
4900  place = first_place;
4901  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4902  place = 0;
4903  } else {
4904  place++;
4905  }
4906  rem--;
4907  gap_ct = 0;
4908  }
4909  th->th.th_last_place = place;
4910  gap_ct++;
4911 
4912  if (place == last_place) {
4913  place = first_place;
4914  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4915  place = 0;
4916  } else {
4917  place++;
4918  }
4919 
4920  KA_TRACE(100,
4921  ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4922  "partition = [%d,%d], __kmp_affinity_num_masks: %u\n",
4923  __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
4924  f, th->th.th_new_place, th->th.th_first_place,
4925  th->th.th_last_place, __kmp_affinity_num_masks));
4926  }
4927  } else {
4928  /* Given a uniform space of available computation places, we can create
4929  T partitions of roughly P/T places each and put each thread into the
4930  first place of its partition. */
4931  double current = static_cast<double>(masters_place);
4932  double spacing =
4933  (static_cast<double>(n_places + 1) / static_cast<double>(n_th));
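        // For example, assuming masters_place == 0, n_places == 10 and
        // n_th == 4: spacing = 11/4 = 2.75, giving partitions [0,1], [2,4],
        // [5,7] and [8,9], with each thread placed at the start of its
        // partition.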
4934  int first, last;
4935  kmp_info_t *th;
4936 
4937  thidx = n_th + 1;
4938  if (update_master_only == 1)
4939  thidx = 1;
4940  for (f = 0; f < thidx; f++) {
4941  first = static_cast<int>(current);
4942  last = static_cast<int>(current + spacing) - 1;
4943  KMP_DEBUG_ASSERT(last >= first);
4944  if (first >= n_places) {
4945  if (masters_place) {
4946  first -= n_places;
4947  last -= n_places;
4948  if (first == (masters_place + 1)) {
4949  KMP_DEBUG_ASSERT(f == n_th);
4950  first--;
4951  }
4952  if (last == masters_place) {
4953  KMP_DEBUG_ASSERT(f == (n_th - 1));
4954  last--;
4955  }
4956  } else {
4957  KMP_DEBUG_ASSERT(f == n_th);
4958  first = 0;
4959  last = 0;
4960  }
4961  }
4962  if (last >= n_places) {
4963  last = (n_places - 1);
4964  }
4965  place = first;
4966  current += spacing;
4967  if (f < n_th) {
4968  KMP_DEBUG_ASSERT(0 <= first);
4969  KMP_DEBUG_ASSERT(n_places > first);
4970  KMP_DEBUG_ASSERT(0 <= last);
4971  KMP_DEBUG_ASSERT(n_places > last);
4972  KMP_DEBUG_ASSERT(last_place >= first_place);
4973  th = team->t.t_threads[f];
4974  KMP_DEBUG_ASSERT(th);
4975  th->th.th_first_place = first;
4976  th->th.th_new_place = place;
4977  th->th.th_last_place = last;
4978  if (__kmp_display_affinity && place != th->th.th_current_place &&
4979  team->t.t_display_affinity != 1) {
4980  team->t.t_display_affinity = 1;
4981  }
4982  KA_TRACE(100,
4983  ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4984  "partition = [%d,%d], spacing = %.4f\n",
4985  __kmp_gtid_from_thread(team->t.t_threads[f]),
4986  team->t.t_id, f, th->th.th_new_place,
4987  th->th.th_first_place, th->th.th_last_place, spacing));
4988  }
4989  }
4990  }
4991  KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
4992  } else {
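      // More threads than places: as in the 'close' policy, each place gets
      // S = n_th / n_places threads and the rem leftover threads are added one
      // extra per place every 'gap' places; here each thread's partition is
      // collapsed to its single place.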
4993  int S, rem, gap, s_count;
4994  S = n_th / n_places;
4995  s_count = 0;
4996  rem = n_th - (S * n_places);
4997  gap = rem > 0 ? n_places / rem : n_places;
4998  int place = masters_place;
4999  int gap_ct = gap;
5000  thidx = n_th;
5001  if (update_master_only == 1)
5002  thidx = 1;
5003  for (f = 0; f < thidx; f++) {
5004  kmp_info_t *th = team->t.t_threads[f];
5005  KMP_DEBUG_ASSERT(th != NULL);
5006 
5007  th->th.th_first_place = place;
5008  th->th.th_last_place = place;
5009  th->th.th_new_place = place;
5010  if (__kmp_display_affinity && place != th->th.th_current_place &&
5011  team->t.t_display_affinity != 1) {
5012  team->t.t_display_affinity = 1;
5013  }
5014  s_count++;
5015 
5016  if ((s_count == S) && rem && (gap_ct == gap)) {
5017  // do nothing, add an extra thread to place on next iteration
5018  } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
5019  // we added an extra thread to this place; move on to next place
5020  if (place == last_place) {
5021  place = first_place;
5022  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
5023  place = 0;
5024  } else {
5025  place++;
5026  }
5027  s_count = 0;
5028  gap_ct = 1;
5029  rem--;
5030  } else if (s_count == S) { // place is full; don't add extra thread
5031  if (place == last_place) {
5032  place = first_place;
5033  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
5034  place = 0;
5035  } else {
5036  place++;
5037  }
5038  gap_ct++;
5039  s_count = 0;
5040  }
5041 
5042  KA_TRACE(100, ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
5043  "partition = [%d,%d]\n",
5044  __kmp_gtid_from_thread(team->t.t_threads[f]),
5045  team->t.t_id, f, th->th.th_new_place,
5046  th->th.th_first_place, th->th.th_last_place));
5047  }
5048  KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
5049  }
5050  } break;
5051 
5052  default:
5053  break;
5054  }
5055 
5056  KA_TRACE(20, ("__kmp_partition_places: exit T#%d\n", team->t.t_id));
5057 }
5058 
5059 #endif // KMP_AFFINITY_SUPPORTED
5060 
5061 /* Allocate a new team data structure to use. Take one off the free pool if
5062  available. */
5063 kmp_team_t *
5064 __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc,
5065 #if OMPT_SUPPORT
5066  ompt_data_t ompt_parallel_data,
5067 #endif
5068  kmp_proc_bind_t new_proc_bind,
5069  kmp_internal_control_t *new_icvs,
5070  int argc USE_NESTED_HOT_ARG(kmp_info_t *master)) {
5071  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_allocate_team);
5072  int f;
5073  kmp_team_t *team;
5074  int use_hot_team = !root->r.r_active;
5075  int level = 0;
5076  int do_place_partition = 1;
5077 
5078  KA_TRACE(20, ("__kmp_allocate_team: called\n"));
5079  KMP_DEBUG_ASSERT(new_nproc >= 1 && argc >= 0);
5080  KMP_DEBUG_ASSERT(max_nproc >= new_nproc);
5081  KMP_MB();
5082 
5083 #if KMP_NESTED_HOT_TEAMS
5084  kmp_hot_team_ptr_t *hot_teams;
5085  if (master) {
5086  team = master->th.th_team;
5087  level = team->t.t_active_level;
5088  if (master->th.th_teams_microtask) { // in teams construct?
5089  if (master->th.th_teams_size.nteams > 1 &&
5090  ( // #teams > 1
5091  team->t.t_pkfn ==
5092  (microtask_t)__kmp_teams_master || // inner fork of the teams
5093  master->th.th_teams_level <
5094  team->t.t_level)) { // or nested parallel inside the teams
5095  ++level; // do not increment if #teams==1 or for the outer fork of the
5096  // teams; increment otherwise
5097  }
5098  // Do not perform the place partition for the inner fork of the teams;
5099  // wait until a nested parallel region is encountered inside the teams construct
5100  if ((master->th.th_teams_size.nteams == 1 &&
5101  master->th.th_teams_level >= team->t.t_level) ||
5102  (team->t.t_pkfn == (microtask_t)__kmp_teams_master))
5103  do_place_partition = 0;
5104  }
5105  hot_teams = master->th.th_hot_teams;
5106  if (level < __kmp_hot_teams_max_level && hot_teams &&
5107  hot_teams[level].hot_team) {
5108  // hot team has already been allocated for given level
5109  use_hot_team = 1;
5110  } else {
5111  use_hot_team = 0;
5112  }
5113  } else {
5114  // check we won't access uninitialized hot_teams, just in case
5115  KMP_DEBUG_ASSERT(new_nproc == 1);
5116  }
5117 #endif
5118  // Optimization to use a "hot" team
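  // (A "hot" team is kept alive between parallel regions so its threads and
  // data structures can be reused without reallocation.)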
5119  if (use_hot_team && new_nproc > 1) {
5120  KMP_DEBUG_ASSERT(new_nproc <= max_nproc);
5121 #if KMP_NESTED_HOT_TEAMS
5122  team = hot_teams[level].hot_team;
5123 #else
5124  team = root->r.r_hot_team;
5125 #endif
5126 #if KMP_DEBUG
5127  if (__kmp_tasking_mode != tskm_immediate_exec) {
5128  KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
5129  "task_team[1] = %p before reinit\n",
5130  team->t.t_task_team[0], team->t.t_task_team[1]));
5131  }
5132 #endif
5133 
5134  if (team->t.t_nproc != new_nproc &&
5135  __kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5136  // Distributed barrier may need a resize
5137  int old_nthr = team->t.t_nproc;
5138  __kmp_resize_dist_barrier(team, old_nthr, new_nproc);
5139  }
5140 
5141  // If not doing the place partition, then reset the team's proc bind
5142  // to indicate that partitioning of all threads still needs to take place
5143  if (do_place_partition == 0)
5144  team->t.t_proc_bind = proc_bind_default;
5145  // Has the number of threads changed?
5146  /* Let's assume the most common case is that the number of threads is
5147  unchanged, and put that case first. */
5148  if (team->t.t_nproc == new_nproc) { // Check changes in number of threads
5149  KA_TRACE(20, ("__kmp_allocate_team: reusing hot team\n"));
5150  // This case can mean that omp_set_num_threads() was called and the hot
5151  // team size was already reduced, so we check the special flag
5152  if (team->t.t_size_changed == -1) {
5153  team->t.t_size_changed = 1;
5154  } else {
5155  KMP_CHECK_UPDATE(team->t.t_size_changed, 0);
5156  }
5157 
5158  // TODO???: team->t.t_max_active_levels = new_max_active_levels;
5159  kmp_r_sched_t new_sched = new_icvs->sched;
5160  // set primary thread's schedule as new run-time schedule
5161  KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
5162 
5163  __kmp_reinitialize_team(team, new_icvs,
5164  root->r.r_uber_thread->th.th_ident);
5165 
5166  KF_TRACE(10, ("__kmp_allocate_team2: T#%d, this_thread=%p team=%p\n", 0,
5167  team->t.t_threads[0], team));
5168  __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
5169 
5170 #if KMP_AFFINITY_SUPPORTED
5171  if ((team->t.t_size_changed == 0) &&
5172  (team->t.t_proc_bind == new_proc_bind)) {
5173  if (new_proc_bind == proc_bind_spread) {
5174  if (do_place_partition) {
5175  // add flag to update only master for spread
5176  __kmp_partition_places(team, 1);
5177  }
5178  }
5179  KA_TRACE(200, ("__kmp_allocate_team: reusing hot team #%d bindings: "
5180  "proc_bind = %d, partition = [%d,%d]\n",
5181  team->t.t_id, new_proc_bind, team->t.t_first_place,
5182  team->t.t_last_place));
5183  } else {
5184  if (do_place_partition) {
5185  KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5186  __kmp_partition_places(team);
5187  }
5188  }
5189 #else
5190  KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5191 #endif /* KMP_AFFINITY_SUPPORTED */
5192  } else if (team->t.t_nproc > new_nproc) {
5193  KA_TRACE(20,
5194  ("__kmp_allocate_team: decreasing hot team thread count to %d\n",
5195  new_nproc));
5196 
5197  team->t.t_size_changed = 1;
5198  if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5199  // Barrier size already reduced earlier in this function
5200  // Activate team threads via th_used_in_team
5201  __kmp_add_threads_to_team(team, new_nproc);
5202  }
5203 #if KMP_NESTED_HOT_TEAMS
5204  if (__kmp_hot_teams_mode == 0) {
5205  // AC: the saved number of threads should correspond to the team's value in
5206  // this mode; it can be bigger in mode 1, when the hot team has threads in reserve
5207  KMP_DEBUG_ASSERT(hot_teams[level].hot_team_nth == team->t.t_nproc);
5208  hot_teams[level].hot_team_nth = new_nproc;
5209 #endif // KMP_NESTED_HOT_TEAMS
5210  /* release the extra threads we don't need any more */
5211  for (f = new_nproc; f < team->t.t_nproc; f++) {
5212  KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5213  if (__kmp_tasking_mode != tskm_immediate_exec) {
5214  // When decreasing team size, threads no longer in the team should
5215  // unref task team.
5216  team->t.t_threads[f]->th.th_task_team = NULL;
5217  }
5218  __kmp_free_thread(team->t.t_threads[f]);
5219  team->t.t_threads[f] = NULL;
5220  }
5221 #if KMP_NESTED_HOT_TEAMS
5222  } // (__kmp_hot_teams_mode == 0)
5223  else {
5224  // When keeping extra threads in team, switch threads to wait on own
5225  // b_go flag
5226  for (f = new_nproc; f < team->t.t_nproc; ++f) {
5227  KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5228  kmp_balign_t *balign = team->t.t_threads[f]->th.th_bar;
5229  for (int b = 0; b < bs_last_barrier; ++b) {
5230  if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG) {
5231  balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5232  }
5233  KMP_CHECK_UPDATE(balign[b].bb.leaf_kids, 0);
5234  }
5235  }
5236  }
5237 #endif // KMP_NESTED_HOT_TEAMS
5238  team->t.t_nproc = new_nproc;
5239  // TODO???: team->t.t_max_active_levels = new_max_active_levels;
5240  KMP_CHECK_UPDATE(team->t.t_sched.sched, new_icvs->sched.sched);
5241  __kmp_reinitialize_team(team, new_icvs,
5242  root->r.r_uber_thread->th.th_ident);
5243 
5244  // Update remaining threads
5245  for (f = 0; f < new_nproc; ++f) {
5246  team->t.t_threads[f]->th.th_team_nproc = new_nproc;
5247  }
5248 
5249  // restore the current task state of the primary thread: should be the
5250  // implicit task
5251  KF_TRACE(10, ("__kmp_allocate_team: T#%d, this_thread=%p team=%p\n", 0,
5252  team->t.t_threads[0], team));
5253 
5254  __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
5255 
5256 #ifdef KMP_DEBUG
5257  for (f = 0; f < team->t.t_nproc; f++) {
5258  KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5259  team->t.t_threads[f]->th.th_team_nproc ==
5260  team->t.t_nproc);
5261  }
5262 #endif
5263 
5264  if (do_place_partition) {
5265  KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5266 #if KMP_AFFINITY_SUPPORTED
5267  __kmp_partition_places(team);
5268 #endif
5269  }
5270  } else { // team->t.t_nproc < new_nproc
5271 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
5272  kmp_affin_mask_t *old_mask;
5273  if (KMP_AFFINITY_CAPABLE()) {
5274  KMP_CPU_ALLOC(old_mask);
5275  }
5276 #endif
5277 
5278  KA_TRACE(20,
5279  ("__kmp_allocate_team: increasing hot team thread count to %d\n",
5280  new_nproc));
5281  int old_nproc = team->t.t_nproc; // save old value and use to update only
5282  team->t.t_size_changed = 1;
5283 
5284 #if KMP_NESTED_HOT_TEAMS
5285  int avail_threads = hot_teams[level].hot_team_nth;
5286  if (new_nproc < avail_threads)
5287  avail_threads = new_nproc;
5288  kmp_info_t **other_threads = team->t.t_threads;
5289  for (f = team->t.t_nproc; f < avail_threads; ++f) {
5290  // Adjust barrier data of reserved threads (if any) of the team
5291  // Other data will be set in __kmp_initialize_info() below.
5292  int b;
5293  kmp_balign_t *balign = other_threads[f]->th.th_bar;
5294  for (b = 0; b < bs_last_barrier; ++b) {
5295  balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5296  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5297 #if USE_DEBUGGER
5298  balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5299 #endif
5300  }
5301  }
5302  if (hot_teams[level].hot_team_nth >= new_nproc) {
5303  // we have all needed threads in reserve, no need to allocate any;
5304  // this is only possible in mode 1, as we cannot have reserved threads in mode 0
5305  KMP_DEBUG_ASSERT(__kmp_hot_teams_mode == 1);
5306  team->t.t_nproc = new_nproc; // just get reserved threads involved
5307  } else {
5308  // We may have some threads in reserve, but not enough;
5309  // get reserved threads involved if any.
5310  team->t.t_nproc = hot_teams[level].hot_team_nth;
5311  hot_teams[level].hot_team_nth = new_nproc; // adjust hot team max size
5312 #endif // KMP_NESTED_HOT_TEAMS
5313  if (team->t.t_max_nproc < new_nproc) {
5314  /* reallocate larger arrays */
5315  __kmp_reallocate_team_arrays(team, new_nproc);
5316  __kmp_reinitialize_team(team, new_icvs, NULL);
5317  }
5318 
5319 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
5320  /* Temporarily set full mask for primary thread before creation of
5321  workers. The reason is that workers inherit the affinity from the
5322  primary thread, so if a lot of workers are created on a single
5323  core quickly, they don't get a chance to set their own affinity for
5324  a long time. */
5325  __kmp_set_thread_affinity_mask_full_tmp(old_mask);
5326 #endif
5327 
5328  /* allocate new threads for the hot team */
5329  for (f = team->t.t_nproc; f < new_nproc; f++) {
5330  kmp_info_t *new_worker = __kmp_allocate_thread(root, team, f);
5331  KMP_DEBUG_ASSERT(new_worker);
5332  team->t.t_threads[f] = new_worker;
5333 
5334  KA_TRACE(20,
5335  ("__kmp_allocate_team: team %d init T#%d arrived: "
5336  "join=%llu, plain=%llu\n",
5337  team->t.t_id, __kmp_gtid_from_tid(f, team), team->t.t_id, f,
5338  team->t.t_bar[bs_forkjoin_barrier].b_arrived,
5339  team->t.t_bar[bs_plain_barrier].b_arrived));
5340 
5341  { // Initialize barrier data for new threads.
5342  int b;
5343  kmp_balign_t *balign = new_worker->th.th_bar;
5344  for (b = 0; b < bs_last_barrier; ++b) {
5345  balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5346  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag !=
5347  KMP_BARRIER_PARENT_FLAG);
5348 #if USE_DEBUGGER
5349  balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5350 #endif
5351  }
5352  }
5353  }
5354 
5355 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
5356  if (KMP_AFFINITY_CAPABLE()) {
5357  /* Restore initial primary thread's affinity mask */
5358  __kmp_set_system_affinity(old_mask, TRUE);
5359  KMP_CPU_FREE(old_mask);
5360  }
5361 #endif
5362 #if KMP_NESTED_HOT_TEAMS
5363  } // end of check of t_nproc vs. new_nproc vs. hot_team_nth
5364 #endif // KMP_NESTED_HOT_TEAMS
5365  if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5366  // Barrier size already increased earlier in this function
5367  // Activate team threads via th_used_in_team
5368  __kmp_add_threads_to_team(team, new_nproc);
5369  }
5370  /* make sure everyone is synchronized */
5371  // new threads below
5372  __kmp_initialize_team(team, new_nproc, new_icvs,
5373  root->r.r_uber_thread->th.th_ident);
5374 
5375  /* reinitialize the threads */
5376  KMP_DEBUG_ASSERT(team->t.t_nproc == new_nproc);
5377  for (f = 0; f < team->t.t_nproc; ++f)
5378  __kmp_initialize_info(team->t.t_threads[f], team, f,
5379  __kmp_gtid_from_tid(f, team));
5380 
5381  if (level) { // set th_task_state for new threads in nested hot team
5382  // __kmp_initialize_info() no longer zeroes th_task_state, so we should
5383  // only need to set the th_task_state for the new threads. th_task_state
5384  // for primary thread will not be accurate until after this in
5385  // __kmp_fork_call(), so we look to the primary thread's memo_stack to
5386  // get the correct value.
5387  for (f = old_nproc; f < team->t.t_nproc; ++f)
5388  team->t.t_threads[f]->th.th_task_state =
5389  team->t.t_threads[0]->th.th_task_state_memo_stack[level];
5390  } else { // set th_task_state for new threads in non-nested hot team
5391  // copy primary thread's state
5392  kmp_uint8 old_state = team->t.t_threads[0]->th.th_task_state;
5393  for (f = old_nproc; f < team->t.t_nproc; ++f)
5394  team->t.t_threads[f]->th.th_task_state = old_state;
5395  }
5396 
5397 #ifdef KMP_DEBUG
5398  for (f = 0; f < team->t.t_nproc; ++f) {
5399  KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5400  team->t.t_threads[f]->th.th_team_nproc ==
5401  team->t.t_nproc);
5402  }
5403 #endif
5404 
5405  if (do_place_partition) {
5406  KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5407 #if KMP_AFFINITY_SUPPORTED
5408  __kmp_partition_places(team);
5409 #endif
5410  }
5411  } // Check changes in number of threads
5412 
5413  kmp_info_t *master = team->t.t_threads[0];
5414  if (master->th.th_teams_microtask) {
5415  for (f = 1; f < new_nproc; ++f) {
5416  // propagate teams construct specific info to workers
5417  kmp_info_t *thr = team->t.t_threads[f];
5418  thr->th.th_teams_microtask = master->th.th_teams_microtask;
5419  thr->th.th_teams_level = master->th.th_teams_level;
5420  thr->th.th_teams_size = master->th.th_teams_size;
5421  }
5422  }
5423 #if KMP_NESTED_HOT_TEAMS
5424  if (level) {
5425  // Sync barrier state for nested hot teams, not needed for outermost hot
5426  // team.
5427  for (f = 1; f < new_nproc; ++f) {
5428  kmp_info_t *thr = team->t.t_threads[f];
5429  int b;
5430  kmp_balign_t *balign = thr->th.th_bar;
5431  for (b = 0; b < bs_last_barrier; ++b) {
5432  balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5433  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5434 #if USE_DEBUGGER
5435  balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5436 #endif
5437  }
5438  }
5439  }
5440 #endif // KMP_NESTED_HOT_TEAMS
5441 
5442  /* reallocate space for arguments if necessary */
5443  __kmp_alloc_argv_entries(argc, team, TRUE);
5444  KMP_CHECK_UPDATE(team->t.t_argc, argc);
5445  // The hot team re-uses the previous task team,
5446  // if untouched during the previous release->gather phase.
5447 
5448  KF_TRACE(10, (" hot_team = %p\n", team));
5449 
5450 #if KMP_DEBUG
5451  if (__kmp_tasking_mode != tskm_immediate_exec) {
5452  KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
5453  "task_team[1] = %p after reinit\n",
5454  team->t.t_task_team[0], team->t.t_task_team[1]));
5455  }
5456 #endif
5457 
5458 #if OMPT_SUPPORT
5459  __ompt_team_assign_id(team, ompt_parallel_data);
5460 #endif
5461 
5462  KMP_MB();
5463 
5464  return team;
5465  }
5466 
5467  /* next, let's try to take one from the team pool */
5468  KMP_MB();
5469  for (team = CCAST(kmp_team_t *, __kmp_team_pool); (team);) {
5470  /* TODO: consider resizing undersized teams instead of reaping them, now
5471  that we have a resizing mechanism */
5472  if (team->t.t_max_nproc >= max_nproc) {
5473  /* take this team from the team pool */
5474  __kmp_team_pool = team->t.t_next_pool;
5475 
5476  if (max_nproc > 1 &&
5477  __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5478  if (!team->t.b) { // Allocate barrier structure
5479  team->t.b = distributedBarrier::allocate(__kmp_dflt_team_nth_ub);
5480  }
5481  }
5482 
5483  /* setup the team for fresh use */
5484  __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5485 
5486  KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and "
5487  "task_team[1] %p to NULL\n",
5488  &team->t.t_task_team[0], &team->t.t_task_team[1]));
5489  team->t.t_task_team[0] = NULL;
5490  team->t.t_task_team[1] = NULL;
5491 
5492  /* reallocate space for arguments if necessary */
5493  __kmp_alloc_argv_entries(argc, team, TRUE);
5494  KMP_CHECK_UPDATE(team->t.t_argc, argc);
5495 
5496  KA_TRACE(
5497  20, ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5498  team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5499  { // Initialize barrier data.
5500  int b;
5501  for (b = 0; b < bs_last_barrier; ++b) {
5502  team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5503 #if USE_DEBUGGER
5504  team->t.t_bar[b].b_master_arrived = 0;
5505  team->t.t_bar[b].b_team_arrived = 0;
5506 #endif
5507  }
5508  }
5509 
5510  team->t.t_proc_bind = new_proc_bind;
5511 
5512  KA_TRACE(20, ("__kmp_allocate_team: using team from pool %d.\n",
5513  team->t.t_id));
5514 
5515 #if OMPT_SUPPORT
5516  __ompt_team_assign_id(team, ompt_parallel_data);
5517 #endif
5518 
5519  KMP_MB();
5520 
5521  return team;
5522  }
5523 
5524  /* reap team if it is too small, then loop back and check the next one */
5525  // not sure if this is wise, but it will be redone during the hot-teams
5526  // rewrite.
5527  /* TODO: Use technique to find the right size hot-team, don't reap them */
5528  team = __kmp_reap_team(team);
5529  __kmp_team_pool = team;
5530  }
5531 
5532  /* nothing available in the pool, no matter, make a new team! */
5533  KMP_MB();
5534  team = (kmp_team_t *)__kmp_allocate(sizeof(kmp_team_t));
5535 
5536  /* and set it up */
5537  team->t.t_max_nproc = max_nproc;
5538  if (max_nproc > 1 &&
5539  __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5540  // Allocate barrier structure
5541  team->t.b = distributedBarrier::allocate(__kmp_dflt_team_nth_ub);
5542  }
5543 
5544  /* NOTE well, for some reason allocating one big buffer and dividing it up
5545  seems to really hurt performance a lot on the P4, so let's not use this */
5546  __kmp_allocate_team_arrays(team, max_nproc);
5547 
5548  KA_TRACE(20, ("__kmp_allocate_team: making a new team\n"));
5549  __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5550 
5551  KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and task_team[1] "
5552  "%p to NULL\n",
5553  &team->t.t_task_team[0], &team->t.t_task_team[1]));
5554  team->t.t_task_team[0] = NULL; // to be removed, as __kmp_allocate zeroes
5555  // memory, no need to duplicate
5556  team->t.t_task_team[1] = NULL; // to be removed, as __kmp_allocate zeroes
5557  // memory, no need to duplicate
5558 
5559  if (__kmp_storage_map) {
5560  __kmp_print_team_storage_map("team", team, team->t.t_id, new_nproc);
5561  }
5562 
5563  /* allocate space for arguments */
5564  __kmp_alloc_argv_entries(argc, team, FALSE);
5565  team->t.t_argc = argc;
5566 
5567  KA_TRACE(20,
5568  ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5569  team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5570  { // Initialize barrier data.
5571  int b;
5572  for (b = 0; b < bs_last_barrier; ++b) {
5573  team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5574 #if USE_DEBUGGER
5575  team->t.t_bar[b].b_master_arrived = 0;
5576  team->t.t_bar[b].b_team_arrived = 0;
5577 #endif
5578  }
5579  }
5580 
5581  team->t.t_proc_bind = new_proc_bind;
5582 
5583 #if OMPT_SUPPORT
5584  __ompt_team_assign_id(team, ompt_parallel_data);
5585  team->t.ompt_serialized_team_info = NULL;
5586 #endif
5587 
5588  KMP_MB();
5589 
5590  KA_TRACE(20, ("__kmp_allocate_team: done creating a new team %d.\n",
5591  team->t.t_id));
5592 
5593  return team;
5594 }
5595 
5596 /* TODO implement hot-teams at all levels */
5597 /* TODO implement lazy thread release on demand (disband request) */
5598 
5599 /* free the team. return it to the team pool. release all the threads
5600  * associated with it */
5601 void __kmp_free_team(kmp_root_t *root,
5602  kmp_team_t *team USE_NESTED_HOT_ARG(kmp_info_t *master)) {
5603  int f;
5604  KA_TRACE(20, ("__kmp_free_team: T#%d freeing team %d\n", __kmp_get_gtid(),
5605  team->t.t_id));
5606 
5607  /* verify state */
5608  KMP_DEBUG_ASSERT(root);
5609  KMP_DEBUG_ASSERT(team);
5610  KMP_DEBUG_ASSERT(team->t.t_nproc <= team->t.t_max_nproc);
5611  KMP_DEBUG_ASSERT(team->t.t_threads);
5612 
5613  int use_hot_team = team == root->r.r_hot_team;
5614 #if KMP_NESTED_HOT_TEAMS
5615  int level;
5616  if (master) {
5617  level = team->t.t_active_level - 1;
5618  if (master->th.th_teams_microtask) { // in teams construct?
5619  if (master->th.th_teams_size.nteams > 1) {
5620  ++level; // level was not increased in teams construct for
5621  // team_of_masters
5622  }
5623  if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
5624  master->th.th_teams_level == team->t.t_level) {
5625  ++level; // level was not increased in teams construct for
5626  // team_of_workers before the parallel
5627  } // team->t.t_level will be increased inside parallel
5628  }
5629 #if KMP_DEBUG
5630  kmp_hot_team_ptr_t *hot_teams = master->th.th_hot_teams;
5631 #endif
5632  if (level < __kmp_hot_teams_max_level) {
5633  KMP_DEBUG_ASSERT(team == hot_teams[level].hot_team);
5634  use_hot_team = 1;
5635  }
5636  }
5637 #endif // KMP_NESTED_HOT_TEAMS
5638 
5639  /* team is done working */
5640  TCW_SYNC_PTR(team->t.t_pkfn,
5641  NULL); // Important for Debugging Support Library.
5642 #if KMP_OS_WINDOWS
5643  team->t.t_copyin_counter = 0; // init counter for possible reuse
5644 #endif
5645  // Do not reset pointer to parent team to NULL for hot teams.
5646 
5647  /* if we are non-hot team, release our threads */
5648  if (!use_hot_team) {
5649  if (__kmp_tasking_mode != tskm_immediate_exec) {
5650  // Wait for threads to reach reapable state
5651  for (f = 1; f < team->t.t_nproc; ++f) {
5652  KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5653  kmp_info_t *th = team->t.t_threads[f];
5654  volatile kmp_uint32 *state = &th->th.th_reap_state;
5655  while (*state != KMP_SAFE_TO_REAP) {
5656 #if KMP_OS_WINDOWS
5657  // On Windows a thread can be killed at any time, check this
5658  DWORD ecode;
5659  if (!__kmp_is_thread_alive(th, &ecode)) {
5660  *state = KMP_SAFE_TO_REAP; // reset the flag for dead thread
5661  break;
5662  }
5663 #endif
5664  // first check if thread is sleeping
5665  kmp_flag_64<> fl(&th->th.th_bar[bs_forkjoin_barrier].bb.b_go, th);
5666  if (fl.is_sleeping())
5667  fl.resume(__kmp_gtid_from_thread(th));
5668  KMP_CPU_PAUSE();
5669  }
5670  }
5671 
5672  // Delete task teams
5673  int tt_idx;
5674  for (tt_idx = 0; tt_idx < 2; ++tt_idx) {
5675  kmp_task_team_t *task_team = team->t.t_task_team[tt_idx];
5676  if (task_team != NULL) {
5677  for (f = 0; f < team->t.t_nproc; ++f) { // threads unref task teams
5678  KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5679  team->t.t_threads[f]->th.th_task_team = NULL;
5680  }
5681  KA_TRACE(
5682  20,
5683  ("__kmp_free_team: T#%d deactivating task_team %p on team %d\n",
5684  __kmp_get_gtid(), task_team, team->t.t_id));
5685 #if KMP_NESTED_HOT_TEAMS
5686  __kmp_free_task_team(master, task_team);
5687 #endif
5688  team->t.t_task_team[tt_idx] = NULL;
5689  }
5690  }
5691  }
5692 
5693  // Reset pointer to parent team only for non-hot teams.
5694  team->t.t_parent = NULL;
5695  team->t.t_level = 0;
5696  team->t.t_active_level = 0;
5697 
5698  /* free the worker threads */
5699  for (f = 1; f < team->t.t_nproc; ++f) {
5700  KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5701  if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5702  KMP_COMPARE_AND_STORE_ACQ32(&(team->t.t_threads[f]->th.th_used_in_team),
5703  1, 2);
5704  }
5705  __kmp_free_thread(team->t.t_threads[f]);
5706  }
5707 
5708  if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5709  if (team->t.b) {
5710  // wake up thread at old location
5711  team->t.b->go_release();
5712  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
5713  for (f = 1; f < team->t.t_nproc; ++f) {
5714  if (team->t.b->sleep[f].sleep) {
5715  __kmp_atomic_resume_64(
5716  team->t.t_threads[f]->th.th_info.ds.ds_gtid,
5717  (kmp_atomic_flag_64<> *)NULL);
5718  }
5719  }
5720  }
5721  // Wait for threads to be removed from team
5722  for (int f = 1; f < team->t.t_nproc; ++f) {
5723  while (team->t.t_threads[f]->th.th_used_in_team.load() != 0)
5724  KMP_CPU_PAUSE();
5725  }
5726  }
5727  }
5728 
5729  for (f = 1; f < team->t.t_nproc; ++f) {
5730  team->t.t_threads[f] = NULL;
5731  }
5732 
5733  if (team->t.t_max_nproc > 1 &&
5734  __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5735  distributedBarrier::deallocate(team->t.b);
5736  team->t.b = NULL;
5737  }
5738  /* put the team back in the team pool */
5739  /* TODO limit size of team pool, call reap_team if pool too large */
5740  team->t.t_next_pool = CCAST(kmp_team_t *, __kmp_team_pool);
5741  __kmp_team_pool = (volatile kmp_team_t *)team;
5742  } else { // Check if team was created for primary threads in teams construct
5743  // See if first worker is a CG root
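    // (CG = contention group; each primary thread of a teams construct is the
    // root of its own contention group, recorded in th_cg_roots.)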
5744  KMP_DEBUG_ASSERT(team->t.t_threads[1] &&
5745  team->t.t_threads[1]->th.th_cg_roots);
5746  if (team->t.t_threads[1]->th.th_cg_roots->cg_root == team->t.t_threads[1]) {
5747  // Clean up the CG root nodes on workers so that this team can be re-used
5748  for (f = 1; f < team->t.t_nproc; ++f) {
5749  kmp_info_t *thr = team->t.t_threads[f];
5750  KMP_DEBUG_ASSERT(thr && thr->th.th_cg_roots &&
5751  thr->th.th_cg_roots->cg_root == thr);
5752  // Pop current CG root off list
5753  kmp_cg_root_t *tmp = thr->th.th_cg_roots;
5754  thr->th.th_cg_roots = tmp->up;
5755  KA_TRACE(100, ("__kmp_free_team: Thread %p popping node %p and moving"
5756  " up to node %p. cg_nthreads was %d\n",
5757  thr, tmp, thr->th.th_cg_roots, tmp->cg_nthreads));
5758  int i = tmp->cg_nthreads--;
5759  if (i == 1) {
5760  __kmp_free(tmp); // free CG if we are the last thread in it
5761  }
5762  // Restore current task's thread_limit from CG root
5763  if (thr->th.th_cg_roots)
5764  thr->th.th_current_task->td_icvs.thread_limit =
5765  thr->th.th_cg_roots->cg_thread_limit;
5766  }
5767  }
5768  }
5769 
5770  KMP_MB();
5771 }
5772 
5773 /* reap the team. destroy it, reclaim all its resources and free its memory */
5774 kmp_team_t *__kmp_reap_team(kmp_team_t *team) {
5775  kmp_team_t *next_pool = team->t.t_next_pool;
5776 
5777  KMP_DEBUG_ASSERT(team);
5778  KMP_DEBUG_ASSERT(team->t.t_dispatch);
5779  KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
5780  KMP_DEBUG_ASSERT(team->t.t_threads);
5781  KMP_DEBUG_ASSERT(team->t.t_argv);
5782 
5783  /* TODO clean the threads that are a part of this? */
5784 
5785  /* free stuff */
5786  __kmp_free_team_arrays(team);
5787  if (team->t.t_argv != &team->t.t_inline_argv[0])
5788  __kmp_free((void *)team->t.t_argv);
5789  __kmp_free(team);
5790 
5791  KMP_MB();
5792  return next_pool;
5793 }
5794 
5795 // Free the thread. Don't reap it, just place it on the pool of available
5796 // threads.
5797 //
5798 // Changes for Quad issue 527845: We need a predictable OMP tid <-> gtid
5799 // binding for the affinity mechanism to be useful.
5800 //
5801 // Now, we always keep the free list (__kmp_thread_pool) sorted by gtid.
5802 // However, we want to avoid a potential performance problem by always
5803 // scanning through the list to find the correct point at which to insert
5804 // the thread (potential N**2 behavior). To do this we keep track of the
5805 // last place a thread struct was inserted (__kmp_thread_pool_insert_pt).
5806 // With single-level parallelism, threads will always be added to the tail
5807 // of the list, kept track of by __kmp_thread_pool_insert_pt. With nested
5808 // parallelism, all bets are off and we may need to scan through the entire
5809 // free list.
5810 //
5811 // This change also has a potentially large performance benefit, for some
5812 // applications. Previously, as threads were freed from the hot team, they
5813 // would be placed back on the free list in inverse order. If the hot team
5814 // grew back to its original size, then the freed threads would be placed
5815 // back on the hot team in reverse order. This could cause bad cache
5816 // locality problems on programs where the size of the hot team regularly
5817 // grew and shrank.
5818 //
5819 // Now, for single-level parallelism, the OMP tid is always == gtid.
5820 void __kmp_free_thread(kmp_info_t *this_th) {
5821  int gtid;
5822  kmp_info_t **scan;
5823 
5824  KA_TRACE(20, ("__kmp_free_thread: T#%d putting T#%d back on free pool.\n",
5825  __kmp_get_gtid(), this_th->th.th_info.ds.ds_gtid));
5826 
5827  KMP_DEBUG_ASSERT(this_th);
5828 
5829  // When moving thread to pool, switch thread to wait on own b_go flag, and
5830  // uninitialized (NULL team).
5831  int b;
5832  kmp_balign_t *balign = this_th->th.th_bar;
5833  for (b = 0; b < bs_last_barrier; ++b) {
5834  if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG)
5835  balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5836  balign[b].bb.team = NULL;
5837  balign[b].bb.leaf_kids = 0;
5838  }
5839  this_th->th.th_task_state = 0;
5840  this_th->th.th_reap_state = KMP_SAFE_TO_REAP;
5841 
5842  /* put thread back on the free pool */
5843  TCW_PTR(this_th->th.th_team, NULL);
5844  TCW_PTR(this_th->th.th_root, NULL);
5845  TCW_PTR(this_th->th.th_dispatch, NULL); /* NOT NEEDED */
5846 
5847  while (this_th->th.th_cg_roots) {
5848  this_th->th.th_cg_roots->cg_nthreads--;
5849  KA_TRACE(100, ("__kmp_free_thread: Thread %p decrement cg_nthreads on node"
5850  " %p of thread %p to %d\n",
5851  this_th, this_th->th.th_cg_roots,
5852  this_th->th.th_cg_roots->cg_root,
5853  this_th->th.th_cg_roots->cg_nthreads));
5854  kmp_cg_root_t *tmp = this_th->th.th_cg_roots;
5855  if (tmp->cg_root == this_th) { // Thread is a cg_root
5856  KMP_DEBUG_ASSERT(tmp->cg_nthreads == 0);
5857  KA_TRACE(
5858  5, ("__kmp_free_thread: Thread %p freeing node %p\n", this_th, tmp));
5859  this_th->th.th_cg_roots = tmp->up;
5860  __kmp_free(tmp);
5861  } else { // Worker thread
5862  if (tmp->cg_nthreads == 0) { // last thread leaves contention group
5863  __kmp_free(tmp);
5864  }
5865  this_th->th.th_cg_roots = NULL;
5866  break;
5867  }
5868  }
5869 
5870  /* If the implicit task assigned to this thread can be used by other threads
5871  * -> multiple threads can share the data and try to free the task at
5872  * __kmp_reap_thread at exit. This duplicate use of the task data can happen
5873  * with higher probability when hot team is disabled but can occurs even when
5874  * the hot team is enabled */
5875  __kmp_free_implicit_task(this_th);
5876  this_th->th.th_current_task = NULL;
5877 
5878  // If the __kmp_thread_pool_insert_pt is already past the new insert
5879  // point, then we need to re-scan the entire list.
5880  gtid = this_th->th.th_info.ds.ds_gtid;
5881  if (__kmp_thread_pool_insert_pt != NULL) {
5882  KMP_DEBUG_ASSERT(__kmp_thread_pool != NULL);
5883  if (__kmp_thread_pool_insert_pt->th.th_info.ds.ds_gtid > gtid) {
5884  __kmp_thread_pool_insert_pt = NULL;
5885  }
5886  }
5887 
5888  // Scan down the list to find the place to insert the thread.
5889  // scan is the address of a link in the list, possibly the address of
5890  // __kmp_thread_pool itself.
5891  //
5892  // In the absence of nested parallelism, the for loop will have 0 iterations.
5893  if (__kmp_thread_pool_insert_pt != NULL) {
5894  scan = &(__kmp_thread_pool_insert_pt->th.th_next_pool);
5895  } else {
5896  scan = CCAST(kmp_info_t **, &__kmp_thread_pool);
5897  }
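  // Walk the gtid-sorted pool forward to the first entry with a larger gtid;
  // the freed thread is inserted just before it, keeping the pool sorted, and
  // the insertion hint is updated to point at the new element.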
5898  for (; (*scan != NULL) && ((*scan)->th.th_info.ds.ds_gtid < gtid);
5899  scan = &((*scan)->th.th_next_pool))
5900  ;
5901 
5902  // Insert the new element on the list, and set __kmp_thread_pool_insert_pt
5903  // to its address.
5904  TCW_PTR(this_th->th.th_next_pool, *scan);
5905  __kmp_thread_pool_insert_pt = *scan = this_th;
5906  KMP_DEBUG_ASSERT((this_th->th.th_next_pool == NULL) ||
5907  (this_th->th.th_info.ds.ds_gtid <
5908  this_th->th.th_next_pool->th.th_info.ds.ds_gtid));
5909  TCW_4(this_th->th.th_in_pool, TRUE);
5910  __kmp_suspend_initialize_thread(this_th);
5911  __kmp_lock_suspend_mx(this_th);
5912  if (this_th->th.th_active == TRUE) {
5913  KMP_ATOMIC_INC(&__kmp_thread_pool_active_nth);
5914  this_th->th.th_active_in_pool = TRUE;
5915  }
5916 #if KMP_DEBUG
5917  else {
5918  KMP_DEBUG_ASSERT(this_th->th.th_active_in_pool == FALSE);
5919  }
5920 #endif
5921  __kmp_unlock_suspend_mx(this_th);
5922 
5923  TCW_4(__kmp_nth, __kmp_nth - 1);
5924 
5925 #ifdef KMP_ADJUST_BLOCKTIME
5926  /* Adjust blocktime back to user setting or default if necessary */
5927  /* Middle initialization might never have occurred */
5928  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
5929  KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
5930  if (__kmp_nth <= __kmp_avail_proc) {
5931  __kmp_zero_bt = FALSE;
5932  }
5933  }
5934 #endif /* KMP_ADJUST_BLOCKTIME */
5935 
5936  KMP_MB();
5937 }
5938 
5939 /* ------------------------------------------------------------------------ */
5940 
5941 void *__kmp_launch_thread(kmp_info_t *this_thr) {
5942 #if OMP_PROFILING_SUPPORT
5943  ProfileTraceFile = getenv("LIBOMPTARGET_PROFILE");
5944  // TODO: add a configuration option for time granularity
5945  if (ProfileTraceFile)
5946  llvm::timeTraceProfilerInitialize(500 /* us */, "libomptarget");
5947 #endif
5948 
5949  int gtid = this_thr->th.th_info.ds.ds_gtid;
5950  /* void *stack_data;*/
5951  kmp_team_t **volatile pteam;
5952 
5953  KMP_MB();
5954  KA_TRACE(10, ("__kmp_launch_thread: T#%d start\n", gtid));
5955 
5956  if (__kmp_env_consistency_check) {
5957  this_thr->th.th_cons = __kmp_allocate_cons_stack(gtid); // ATT: Memory leak?
5958  }
5959 
5960 #if OMPD_SUPPORT
5961  if (ompd_state & OMPD_ENABLE_BP)
5962  ompd_bp_thread_begin();
5963 #endif
5964 
5965 #if OMPT_SUPPORT
5966  ompt_data_t *thread_data = nullptr;
5967  if (ompt_enabled.enabled) {
5968  thread_data = &(this_thr->th.ompt_thread_info.thread_data);
5969  *thread_data = ompt_data_none;
5970 
5971  this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5972  this_thr->th.ompt_thread_info.wait_id = 0;
5973  this_thr->th.ompt_thread_info.idle_frame = OMPT_GET_FRAME_ADDRESS(0);
5974  this_thr->th.ompt_thread_info.parallel_flags = 0;
5975  if (ompt_enabled.ompt_callback_thread_begin) {
5976  ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
5977  ompt_thread_worker, thread_data);
5978  }
5979  this_thr->th.ompt_thread_info.state = ompt_state_idle;
5980  }
5981 #endif
5982 
5983  /* This is the place where threads wait for work */
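  // Worker loop: block at the fork barrier until the primary thread releases
  // this thread with a new parallel region, run the microtask, pass the join
  // barrier, and repeat until global shutdown (__kmp_global.g.g_done).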
5984  while (!TCR_4(__kmp_global.g.g_done)) {
5985  KMP_DEBUG_ASSERT(this_thr == __kmp_threads[gtid]);
5986  KMP_MB();
5987 
5988  /* wait for work to do */
5989  KA_TRACE(20, ("__kmp_launch_thread: T#%d waiting for work\n", gtid));
5990 
5991  /* No tid yet since not part of a team */
5992  __kmp_fork_barrier(gtid, KMP_GTID_DNE);
5993 
5994 #if OMPT_SUPPORT
5995  if (ompt_enabled.enabled) {
5996  this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5997  }
5998 #endif
5999 
6000  pteam = &this_thr->th.th_team;
6001 
6002  /* have we been allocated? */
6003  if (TCR_SYNC_PTR(*pteam) && !TCR_4(__kmp_global.g.g_done)) {
6004  /* we were just woken up, so run our new task */
6005  if (TCR_SYNC_PTR((*pteam)->t.t_pkfn) != NULL) {
6006  int rc;
6007  KA_TRACE(20,
6008  ("__kmp_launch_thread: T#%d(%d:%d) invoke microtask = %p\n",
6009  gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
6010  (*pteam)->t.t_pkfn));
6011 
6012  updateHWFPControl(*pteam);
6013 
6014 #if OMPT_SUPPORT
6015  if (ompt_enabled.enabled) {
6016  this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
6017  }
6018 #endif
6019 
6020  rc = (*pteam)->t.t_invoke(gtid);
6021  KMP_ASSERT(rc);
6022 
6023  KMP_MB();
6024  KA_TRACE(20, ("__kmp_launch_thread: T#%d(%d:%d) done microtask = %p\n",
6025  gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
6026  (*pteam)->t.t_pkfn));
6027  }
6028 #if OMPT_SUPPORT
6029  if (ompt_enabled.enabled) {
6030  /* no frame set while outside task */
6031  __ompt_get_task_info_object(0)->frame.exit_frame = ompt_data_none;
6032 
6033  this_thr->th.ompt_thread_info.state = ompt_state_overhead;
6034  }
6035 #endif
6036  /* join barrier after parallel region */
6037  __kmp_join_barrier(gtid);
6038  }
6039  }
6040  TCR_SYNC_PTR((intptr_t)__kmp_global.g.g_done);
6041 
6042 #if OMPD_SUPPORT
6043  if (ompd_state & OMPD_ENABLE_BP)
6044  ompd_bp_thread_end();
6045 #endif
6046 
6047 #if OMPT_SUPPORT
6048  if (ompt_enabled.ompt_callback_thread_end) {
6049  ompt_callbacks.ompt_callback(ompt_callback_thread_end)(thread_data);
6050  }
6051 #endif
6052 
6053  this_thr->th.th_task_team = NULL;
6054  /* run the destructors for the threadprivate data for this thread */
6055  __kmp_common_destroy_gtid(gtid);
6056 
6057  KA_TRACE(10, ("__kmp_launch_thread: T#%d done\n", gtid));
6058  KMP_MB();
6059 
6060 #if OMP_PROFILING_SUPPORT
6061  llvm::timeTraceProfilerFinishThread();
6062 #endif
6063  return this_thr;
6064 }
6065 
6066 /* ------------------------------------------------------------------------ */
6067 
6068 void __kmp_internal_end_dest(void *specific_gtid) {
6069  // Make sure no significant bits are lost
6070  int gtid;
6071  __kmp_type_convert((kmp_intptr_t)specific_gtid - 1, &gtid);
6072 
6073  KA_TRACE(30, ("__kmp_internal_end_dest: T#%d\n", gtid));
6074  /* NOTE: the gtid is stored as gtid+1 in thread-local storage,
6075  * because 0 is reserved for the nothing-stored case */
6076 
6077  __kmp_internal_end_thread(gtid);
6078 }
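// Illustrative sketch, not compiled: the thread-local gtid uses a simple +1
// bias so that a zero/NULL TLS value can mean "no gtid stored". The helper
// names below are hypothetical; only the bias itself mirrors the code above.
#if 0
static void *__example_encode_gtid_for_tls(int gtid) {
  return (void *)(kmp_intptr_t)(gtid + 1); // 0 remains the "nothing stored" case
}
static int __example_decode_gtid_from_tls(void *specific) {
  int gtid;
  __kmp_type_convert((kmp_intptr_t)specific - 1, &gtid); // undo the +1 bias
  return gtid;
}
#endif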
6079 
6080 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB
6081 
6082 __attribute__((destructor)) void __kmp_internal_end_dtor(void) {
6083  __kmp_internal_end_atexit();
6084 }
6085 
6086 #endif
6087 
6088 /* [Windows] josh: when the atexit handler is called, there may still be more
6089  than one thread alive */
6090 void __kmp_internal_end_atexit(void) {
6091  KA_TRACE(30, ("__kmp_internal_end_atexit\n"));
6092  /* [Windows]
6093  josh: ideally, we want to completely shut down the library in this atexit
6094  handler, but stat code that depends on thread specific data for gtid fails
6095  because that data becomes unavailable at some point during the shutdown, so
6096  we call __kmp_internal_end_thread instead. We should eventually remove the
6097  dependency on __kmp_get_specific_gtid in the stat code and use
6098  __kmp_internal_end_library to cleanly shut down the library.
6099 
6100  // TODO: Can some of this comment about GVS be removed?
6101  I suspect that the offending stat code is executed when the calling thread
6102  tries to clean up a dead root thread's data structures, resulting in GVS
6103  code trying to close the GVS structures for that thread, but since the stat
6104  code uses __kmp_get_specific_gtid to get the gtid with the assumption that
6105  the calling thread is cleaning up itself instead of another thread, it gets
6106  confused. This happens because allowing a thread to unregister and clean up
6107  another thread is a recent modification for addressing an issue.
6108  Based on the current design (20050722), a thread may end up
6109  trying to unregister another thread only if thread death does not trigger
6110  the calling of __kmp_internal_end_thread. For Linux* OS, there is the
6111  thread specific data destructor function to detect thread death. For
6112  Windows dynamic, there is DllMain(THREAD_DETACH). For Windows static, there
6113  is nothing. Thus, the workaround is applicable only for Windows static
6114  stat library. */
6115  __kmp_internal_end_library(-1);
6116 #if KMP_OS_WINDOWS
6117  __kmp_close_console();
6118 #endif
6119 }
6120 
6121 static void __kmp_reap_thread(kmp_info_t *thread, int is_root) {
6122  // It is assumed __kmp_forkjoin_lock is acquired.
6123 
6124  int gtid;
6125 
6126  KMP_DEBUG_ASSERT(thread != NULL);
6127 
6128  gtid = thread->th.th_info.ds.ds_gtid;
6129 
6130  if (!is_root) {
6131  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
6132  /* Assume the threads are at the fork barrier here */
6133  KA_TRACE(
6134  20, ("__kmp_reap_thread: releasing T#%d from fork barrier for reap\n",
6135  gtid));
6136  if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
6137  while (
6138  !KMP_COMPARE_AND_STORE_ACQ32(&(thread->th.th_used_in_team), 0, 3))
6139  KMP_CPU_PAUSE();
6140  __kmp_resume_32(gtid, (kmp_flag_32<false, false> *)NULL);
6141  } else {
6142  /* Need release fence here to prevent seg faults for tree forkjoin
6143  barrier (GEH) */
6144  kmp_flag_64<> flag(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go,
6145  thread);
6146  __kmp_release_64(&flag);
6147  }
6148  }
6149 
6150  // Terminate OS thread.
6151  __kmp_reap_worker(thread);
6152 
6153  // The thread was killed asynchronously. If it was actively
6154  // spinning in the thread pool, decrement the global count.
6155  //
6156  // There is a small timing hole here - if the worker thread was just waking
6157  up after sleeping in the pool, had reset its th_active_in_pool flag but
6158  // not decremented the global counter __kmp_thread_pool_active_nth yet, then
6159  // the global counter might not get updated.
6160  //
6161  // Currently, this can only happen as the library is unloaded,
6162  // so there are no harmful side effects.
6163  if (thread->th.th_active_in_pool) {
6164  thread->th.th_active_in_pool = FALSE;
6165  KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
6166  KMP_DEBUG_ASSERT(__kmp_thread_pool_active_nth >= 0);
6167  }
6168  }
6169 
6170  __kmp_free_implicit_task(thread);
6171 
6172 // Free the fast memory for tasking
6173 #if USE_FAST_MEMORY
6174  __kmp_free_fast_memory(thread);
6175 #endif /* USE_FAST_MEMORY */
6176 
6177  __kmp_suspend_uninitialize_thread(thread);
6178 
6179  KMP_DEBUG_ASSERT(__kmp_threads[gtid] == thread);
6180  TCW_SYNC_PTR(__kmp_threads[gtid], NULL);
6181 
6182  --__kmp_all_nth;
6183  // __kmp_nth was decremented when the thread was added to the pool.
6184 
6185 #ifdef KMP_ADJUST_BLOCKTIME
6186  /* Adjust blocktime back to user setting or default if necessary */
6187  /* Middle initialization might never have occurred */
6188  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
6189  KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
6190  if (__kmp_nth <= __kmp_avail_proc) {
6191  __kmp_zero_bt = FALSE;
6192  }
6193  }
6194 #endif /* KMP_ADJUST_BLOCKTIME */
6195 
6196  /* free the memory being used */
6197  if (__kmp_env_consistency_check) {
6198  if (thread->th.th_cons) {
6199  __kmp_free_cons_stack(thread->th.th_cons);
6200  thread->th.th_cons = NULL;
6201  }
6202  }
6203 
6204  if (thread->th.th_pri_common != NULL) {
6205  __kmp_free(thread->th.th_pri_common);
6206  thread->th.th_pri_common = NULL;
6207  }
6208 
6209  if (thread->th.th_task_state_memo_stack != NULL) {
6210  __kmp_free(thread->th.th_task_state_memo_stack);
6211  thread->th.th_task_state_memo_stack = NULL;
6212  }
6213 
6214 #if KMP_USE_BGET
6215  if (thread->th.th_local.bget_data != NULL) {
6216  __kmp_finalize_bget(thread);
6217  }
6218 #endif
6219 
6220 #if KMP_AFFINITY_SUPPORTED
6221  if (thread->th.th_affin_mask != NULL) {
6222  KMP_CPU_FREE(thread->th.th_affin_mask);
6223  thread->th.th_affin_mask = NULL;
6224  }
6225 #endif /* KMP_AFFINITY_SUPPORTED */
6226 
6227 #if KMP_USE_HIER_SCHED
6228  if (thread->th.th_hier_bar_data != NULL) {
6229  __kmp_free(thread->th.th_hier_bar_data);
6230  thread->th.th_hier_bar_data = NULL;
6231  }
6232 #endif
6233 
6234  __kmp_reap_team(thread->th.th_serial_team);
6235  thread->th.th_serial_team = NULL;
6236  __kmp_free(thread);
6237 
6238  KMP_MB();
6239 
6240 } // __kmp_reap_thread
6241 
6242 static void __kmp_itthash_clean(kmp_info_t *th) {
6243 #if USE_ITT_NOTIFY
6244  if (__kmp_itt_region_domains.count > 0) {
6245  for (int i = 0; i < KMP_MAX_FRAME_DOMAINS; ++i) {
6246  kmp_itthash_entry_t *bucket = __kmp_itt_region_domains.buckets[i];
6247  while (bucket) {
6248  kmp_itthash_entry_t *next = bucket->next_in_bucket;
6249  __kmp_thread_free(th, bucket);
6250  bucket = next;
6251  }
6252  }
6253  }
6254  if (__kmp_itt_barrier_domains.count > 0) {
6255  for (int i = 0; i < KMP_MAX_FRAME_DOMAINS; ++i) {
6256  kmp_itthash_entry_t *bucket = __kmp_itt_barrier_domains.buckets[i];
6257  while (bucket) {
6258  kmp_itthash_entry_t *next = bucket->next_in_bucket;
6259  __kmp_thread_free(th, bucket);
6260  bucket = next;
6261  }
6262  }
6263  }
6264 #endif
6265 }
6266 
6267 static void __kmp_internal_end(void) {
6268  int i;
6269 
6270  /* First, unregister the library */
6271  __kmp_unregister_library();
6272 
6273 #if KMP_OS_WINDOWS
6274  /* In Win static library, we can't tell when a root actually dies, so we
6275  reclaim the data structures for any root threads that have died but not
6276  unregistered themselves, in order to shut down cleanly.
6277  In Win dynamic library we also can't tell when a thread dies. */
6278  __kmp_reclaim_dead_roots(); // AC: moved here to always clean resources of
6279 // dead roots
6280 #endif
6281 
6282  for (i = 0; i < __kmp_threads_capacity; i++)
6283  if (__kmp_root[i])
6284  if (__kmp_root[i]->r.r_active)
6285  break;
6286  KMP_MB(); /* Flush all pending memory write invalidates. */
6287  TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6288 
6289  if (i < __kmp_threads_capacity) {
6290 #if KMP_USE_MONITOR
6291  // 2009-09-08 (lev): Other alive roots found. Why do we kill the monitor??
6292  KMP_MB(); /* Flush all pending memory write invalidates. */
6293 
6294  // Need to check that monitor was initialized before reaping it. If we are
6295  // called from __kmp_atfork_child (which sets __kmp_init_parallel = 0), then
6296  // __kmp_monitor will appear to contain valid data, but it is only valid in
6297  // the parent process, not the child.
6298  // New behavior (201008): instead of keying off of the flag
6299  // __kmp_init_parallel, the monitor thread creation is keyed off
6300  // of the new flag __kmp_init_monitor.
6301  __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
6302  if (TCR_4(__kmp_init_monitor)) {
6303  __kmp_reap_monitor(&__kmp_monitor);
6304  TCW_4(__kmp_init_monitor, 0);
6305  }
6306  __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
6307  KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
6308 #endif // KMP_USE_MONITOR
6309  } else {
6310 /* TODO move this to cleanup code */
6311 #ifdef KMP_DEBUG
6312  /* make sure that everything has properly ended */
6313  for (i = 0; i < __kmp_threads_capacity; i++) {
6314  if (__kmp_root[i]) {
6315  // KMP_ASSERT( ! KMP_UBER_GTID( i ) ); // AC:
6316  // there can be uber threads alive here
6317  KMP_ASSERT(!__kmp_root[i]->r.r_active); // TODO: can they be active?
6318  }
6319  }
6320 #endif
6321 
6322  KMP_MB();
6323 
6324  // Reap the worker threads.
6325  // This is valid for now, but be careful if threads are reaped sooner.
6326  while (__kmp_thread_pool != NULL) { // Loop thru all the threads in the pool.
6327  // Get the next thread from the pool.
6328  kmp_info_t *thread = CCAST(kmp_info_t *, __kmp_thread_pool);
6329  __kmp_thread_pool = thread->th.th_next_pool;
6330  // Reap it.
6331  KMP_DEBUG_ASSERT(thread->th.th_reap_state == KMP_SAFE_TO_REAP);
6332  thread->th.th_next_pool = NULL;
6333  thread->th.th_in_pool = FALSE;
6334  __kmp_reap_thread(thread, 0);
6335  }
6336  __kmp_thread_pool_insert_pt = NULL;
6337 
6338  // Reap teams.
6339  while (__kmp_team_pool != NULL) { // Loop thru all the teams in the pool.
6340  // Get the next team from the pool.
6341  kmp_team_t *team = CCAST(kmp_team_t *, __kmp_team_pool);
6342  __kmp_team_pool = team->t.t_next_pool;
6343  // Reap it.
6344  team->t.t_next_pool = NULL;
6345  __kmp_reap_team(team);
6346  }
6347 
6348  __kmp_reap_task_teams();
6349 
6350 #if KMP_OS_UNIX
6351  // Threads that are not reaped should not access any resources since they
6352  // are going to be deallocated soon, so the shutdown sequence should wait
6353  // until all threads either exit the final spin-waiting loop or begin
6354  // sleeping after the given blocktime.
6355  for (i = 0; i < __kmp_threads_capacity; i++) {
6356  kmp_info_t *thr = __kmp_threads[i];
6357  while (thr && KMP_ATOMIC_LD_ACQ(&thr->th.th_blocking))
6358  KMP_CPU_PAUSE();
6359  }
6360 #endif
6361 
6362  for (i = 0; i < __kmp_threads_capacity; ++i) {
6363  // TBD: Add some checking...
6364  // Something like KMP_DEBUG_ASSERT( __kmp_thread[ i ] == NULL );
6365  }
6366 
6367  /* Make sure all threadprivate destructors get run by joining with all
6368  worker threads before resetting this flag */
6369  TCW_SYNC_4(__kmp_init_common, FALSE);
6370 
6371  KA_TRACE(10, ("__kmp_internal_end: all workers reaped\n"));
6372  KMP_MB();
6373 
6374 #if KMP_USE_MONITOR
6375  // See note above: One of the possible fixes for CQ138434 / CQ140126
6376  //
6377  // FIXME: push both code fragments down and CSE them?
6378  // push them into __kmp_cleanup() ?
6379  __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
6380  if (TCR_4(__kmp_init_monitor)) {
6381  __kmp_reap_monitor(&__kmp_monitor);
6382  TCW_4(__kmp_init_monitor, 0);
6383  }
6384  __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
6385  KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
6386 #endif
6387  } /* else !__kmp_global.t_active */
6388  TCW_4(__kmp_init_gtid, FALSE);
6389  KMP_MB(); /* Flush all pending memory write invalidates. */
6390 
6391  __kmp_cleanup();
6392 #if OMPT_SUPPORT
6393  ompt_fini();
6394 #endif
6395 }
6396 
6397 void __kmp_internal_end_library(int gtid_req) {
6398  /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6399  /* this shouldn't be a race condition because __kmp_internal_end() is the
6400  only place to clear __kmp_serial_init */
6401  /* we'll check this later too, after we get the lock */
6402  // 2009-09-06: We do not set g_abort without setting g_done. This check looks
6403  // redundant, because the next check will work in any case.
6404  if (__kmp_global.g.g_abort) {
6405  KA_TRACE(11, ("__kmp_internal_end_library: abort, exiting\n"));
6406  /* TODO abort? */
6407  return;
6408  }
6409  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6410  KA_TRACE(10, ("__kmp_internal_end_library: already finished\n"));
6411  return;
6412  }
6413 
6414  // If hidden helper team has been initialized, we need to deinit it
6415  if (TCR_4(__kmp_init_hidden_helper) &&
6416  !TCR_4(__kmp_hidden_helper_team_done)) {
6417  TCW_SYNC_4(__kmp_hidden_helper_team_done, TRUE);
6418  // First release the main thread to let it continue its work
6419  __kmp_hidden_helper_main_thread_release();
6420  // Wait until the hidden helper team has been destroyed
6421  __kmp_hidden_helper_threads_deinitz_wait();
6422  }
6423 
6424  KMP_MB(); /* Flush all pending memory write invalidates. */
6425  /* find out who we are and what we should do */
6426  {
6427  int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6428  KA_TRACE(
6429  10, ("__kmp_internal_end_library: enter T#%d (%d)\n", gtid, gtid_req));
6430  if (gtid == KMP_GTID_SHUTDOWN) {
6431  KA_TRACE(10, ("__kmp_internal_end_library: !__kmp_init_runtime, system "
6432  "already shutdown\n"));
6433  return;
6434  } else if (gtid == KMP_GTID_MONITOR) {
6435  KA_TRACE(10, ("__kmp_internal_end_library: monitor thread, gtid not "
6436  "registered, or system shutdown\n"));
6437  return;
6438  } else if (gtid == KMP_GTID_DNE) {
6439  KA_TRACE(10, ("__kmp_internal_end_library: gtid not registered or system "
6440  "shutdown\n"));
6441  /* we don't know who we are, but we may still shut down the library */
6442  } else if (KMP_UBER_GTID(gtid)) {
6443  /* unregister ourselves as an uber thread. gtid is no longer valid */
6444  if (__kmp_root[gtid]->r.r_active) {
6445  __kmp_global.g.g_abort = -1;
6446  TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6447  __kmp_unregister_library();
6448  KA_TRACE(10,
6449  ("__kmp_internal_end_library: root still active, abort T#%d\n",
6450  gtid));
6451  return;
6452  } else {
6453  __kmp_itthash_clean(__kmp_threads[gtid]);
6454  KA_TRACE(
6455  10,
6456  ("__kmp_internal_end_library: unregistering sibling T#%d\n", gtid));
6457  __kmp_unregister_root_current_thread(gtid);
6458  }
6459  } else {
6460 /* worker threads may call this function through the atexit handler, if they
6461  * call exit() */
6462 /* For now, skip the usual subsequent processing and just dump the debug buffer.
6463  TODO: do a thorough shutdown instead */
6464 #ifdef DUMP_DEBUG_ON_EXIT
6465  if (__kmp_debug_buf)
6466  __kmp_dump_debug_buffer();
6467 #endif
6468  // An unregister-library call was added here for the shm-based Linux path;
6469  // without it, lots of stale files would be left in /dev/shm.
6470  // Clean up the shared memory file before exiting.
6471  __kmp_unregister_library();
6472  return;
6473  }
6474  }
6475  /* synchronize the termination process */
6476  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6477 
6478  /* have we already finished */
6479  if (__kmp_global.g.g_abort) {
6480  KA_TRACE(10, ("__kmp_internal_end_library: abort, exiting\n"));
6481  /* TODO abort? */
6482  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6483  return;
6484  }
6485  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6486  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6487  return;
6488  }
6489 
6490  /* We need this lock to enforce mutex between this reading of
6491  __kmp_threads_capacity and the writing by __kmp_register_root.
6492  Alternatively, we can use a counter of roots that is atomically updated by
6493  __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6494  __kmp_internal_end_*. */
6495  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6496 
6497  /* now we can safely conduct the actual termination */
6498  __kmp_internal_end();
6499 
6500  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6501  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6502 
6503  KA_TRACE(10, ("__kmp_internal_end_library: exit\n"));
6504 
6505 #ifdef DUMP_DEBUG_ON_EXIT
6506  if (__kmp_debug_buf)
6507  __kmp_dump_debug_buffer();
6508 #endif
6509 
6510 #if KMP_OS_WINDOWS
6511  __kmp_close_console();
6512 #endif
6513 
6514  __kmp_fini_allocator();
6515 
6516 } // __kmp_internal_end_library
6517 
6518 void __kmp_internal_end_thread(int gtid_req) {
6519  int i;
6520 
6521  /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6522  /* this shouldn't be a race condition because __kmp_internal_end() is the
6523  * only place to clear __kmp_serial_init */
6524  /* we'll check this later too, after we get the lock */
6525  // 2009-09-06: We do not set g_abort without setting g_done. This check looks
6526  // redundant, because the next check will work in any case.
6527  if (__kmp_global.g.g_abort) {
6528  KA_TRACE(11, ("__kmp_internal_end_thread: abort, exiting\n"));
6529  /* TODO abort? */
6530  return;
6531  }
6532  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6533  KA_TRACE(10, ("__kmp_internal_end_thread: already finished\n"));
6534  return;
6535  }
6536 
6537  // If hidden helper team has been initialized, we need to deinit it
6538  if (TCR_4(__kmp_init_hidden_helper) &&
6539  !TCR_4(__kmp_hidden_helper_team_done)) {
6540  TCW_SYNC_4(__kmp_hidden_helper_team_done, TRUE);
6541  // First release the main thread to let it continue its work
6542  __kmp_hidden_helper_main_thread_release();
6543  // Wait until the hidden helper team has been destroyed
6544  __kmp_hidden_helper_threads_deinitz_wait();
6545  }
6546 
6547  KMP_MB(); /* Flush all pending memory write invalidates. */
6548 
6549  /* find out who we are and what we should do */
6550  {
6551  int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6552  KA_TRACE(10,
6553  ("__kmp_internal_end_thread: enter T#%d (%d)\n", gtid, gtid_req));
6554  if (gtid == KMP_GTID_SHUTDOWN) {
6555  KA_TRACE(10, ("__kmp_internal_end_thread: !__kmp_init_runtime, system "
6556  "already shutdown\n"));
6557  return;
6558  } else if (gtid == KMP_GTID_MONITOR) {
6559  KA_TRACE(10, ("__kmp_internal_end_thread: monitor thread, gtid not "
6560  "registered, or system shutdown\n"));
6561  return;
6562  } else if (gtid == KMP_GTID_DNE) {
6563  KA_TRACE(10, ("__kmp_internal_end_thread: gtid not registered or system "
6564  "shutdown\n"));
6565  return;
6566  /* we don't know who we are */
6567  } else if (KMP_UBER_GTID(gtid)) {
6568  /* unregister ourselves as an uber thread. gtid is no longer valid */
6569  if (__kmp_root[gtid]->r.r_active) {
6570  __kmp_global.g.g_abort = -1;
6571  TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6572  KA_TRACE(10,
6573  ("__kmp_internal_end_thread: root still active, abort T#%d\n",
6574  gtid));
6575  return;
6576  } else {
6577  KA_TRACE(10, ("__kmp_internal_end_thread: unregistering sibling T#%d\n",
6578  gtid));
6579  __kmp_unregister_root_current_thread(gtid);
6580  }
6581  } else {
6582  /* just a worker thread, let's leave */
6583  KA_TRACE(10, ("__kmp_internal_end_thread: worker thread T#%d\n", gtid));
6584 
6585  if (gtid >= 0) {
6586  __kmp_threads[gtid]->th.th_task_team = NULL;
6587  }
6588 
6589  KA_TRACE(10,
6590  ("__kmp_internal_end_thread: worker thread done, exiting T#%d\n",
6591  gtid));
6592  return;
6593  }
6594  }
6595 #if KMP_DYNAMIC_LIB
6596  if (__kmp_pause_status != kmp_hard_paused)
6597  // AC: let's not shut down the dynamic library at the exit of an uber thread,
6598  // because it is better to shut down later in the library destructor.
6599  {
6600  KA_TRACE(10, ("__kmp_internal_end_thread: exiting T#%d\n", gtid_req));
6601  return;
6602  }
6603 #endif
6604  /* synchronize the termination process */
6605  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6606 
6607  /* have we already finished */
6608  if (__kmp_global.g.g_abort) {
6609  KA_TRACE(10, ("__kmp_internal_end_thread: abort, exiting\n"));
6610  /* TODO abort? */
6611  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6612  return;
6613  }
6614  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6615  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6616  return;
6617  }
6618 
6619  /* We need this lock to enforce mutex between this reading of
6620  __kmp_threads_capacity and the writing by __kmp_register_root.
6621  Alternatively, we can use a counter of roots that is atomically updated by
6622  __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6623  __kmp_internal_end_*. */
6624 
6625  /* should we finish the run-time? are all siblings done? */
6626  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6627 
6628  for (i = 0; i < __kmp_threads_capacity; ++i) {
6629  if (KMP_UBER_GTID(i)) {
6630  KA_TRACE(
6631  10,
6632  ("__kmp_internal_end_thread: remaining sibling task: gtid==%d\n", i));
6633  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6634  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6635  return;
6636  }
6637  }
6638 
6639  /* now we can safely conduct the actual termination */
6640 
6641  __kmp_internal_end();
6642 
6643  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6644  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6645 
6646  KA_TRACE(10, ("__kmp_internal_end_thread: exit T#%d\n", gtid_req));
6647 
6648 #ifdef DUMP_DEBUG_ON_EXIT
6649  if (__kmp_debug_buf)
6650  __kmp_dump_debug_buffer();
6651 #endif
6652 } // __kmp_internal_end_thread
6653 
6654 // -----------------------------------------------------------------------------
6655 // Library registration stuff.
6656 
6657 static long __kmp_registration_flag = 0;
6658 // Random value used to indicate library initialization.
6659 static char *__kmp_registration_str = NULL;
6660 // Value to be saved in env var __KMP_REGISTERED_LIB_<pid>.
6661 
6662 static inline char *__kmp_reg_status_name() {
6663 /* On RHEL 3u5 if linked statically, getpid() returns different values in
6664  each thread. If registration and unregistration go in different threads
6665  (omp_misc_other_root_exit.cpp test case), the name of registered_lib_env
6666  env var cannot be found, because the name will contain a different pid. */
6667 // macOS* complains that the name is too long when getuid() is also appended.
6668 #if KMP_OS_UNIX && !KMP_OS_DARWIN && KMP_DYNAMIC_LIB
6669  return __kmp_str_format("__KMP_REGISTERED_LIB_%d_%d", (int)getpid(),
6670  (int)getuid());
6671 #else
6672  return __kmp_str_format("__KMP_REGISTERED_LIB_%d", (int)getpid());
6673 #endif
6674 } // __kmp_reg_status_name
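// Example with illustrative values: for a dynamic-library build on Linux* OS,
// a process with pid 12345 running as uid 1000 would use the name
// "__KMP_REGISTERED_LIB_12345_1000"; on the other paths the name would be
// "__KMP_REGISTERED_LIB_12345".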
6675 
6676 void __kmp_register_library_startup(void) {
6677 
6678  char *name = __kmp_reg_status_name(); // Name of the environment variable.
6679  int done = 0;
6680  union {
6681  double dtime;
6682  long ltime;
6683  } time;
6684 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
6685  __kmp_initialize_system_tick();
6686 #endif
6687  __kmp_read_system_time(&time.dtime);
6688  __kmp_registration_flag = 0xCAFE0000L | (time.ltime & 0x0000FFFFL);
6689  __kmp_registration_str =
6690  __kmp_str_format("%p-%lx-%s", &__kmp_registration_flag,
6691  __kmp_registration_flag, KMP_LIBRARY_FILE);
6692 
6693  KA_TRACE(50, ("__kmp_register_library_startup: %s=\"%s\"\n", name,
6694  __kmp_registration_str));
6695 
6696  while (!done) {
6697 
6698  char *value = NULL; // Actual value of the environment variable.
6699 
6700 #if defined(KMP_USE_SHM)
6701  char *shm_name = __kmp_str_format("/%s", name);
6702  int shm_preexist = 0;
6703  char *data1;
6704  int fd1 = shm_open(shm_name, O_CREAT | O_EXCL | O_RDWR, 0666);
6705  if ((fd1 == -1) && (errno == EEXIST)) {
6706  // file didn't open because it already exists.
6707  // try opening existing file
6708  fd1 = shm_open(shm_name, O_RDWR, 0666);
6709  if (fd1 == -1) { // file didn't open
6710  // error out here
6711  __kmp_fatal(KMP_MSG(FunctionError, "Can't open SHM"), KMP_ERR(0),
6712  __kmp_msg_null);
6713  } else {
6714  // able to open existing file
6715  shm_preexist = 1;
6716  }
6717  } else if (fd1 == -1) { // SHM didn't open due to an error other than
6718  // the file already existing.
6719  // error out here.
6720  __kmp_fatal(KMP_MSG(FunctionError, "Can't open SHM2"), KMP_ERR(errno),
6721  __kmp_msg_null);
6722  }
6723  if (shm_preexist == 0) {
6724  // we created the SHM; now set its size
6725  if (ftruncate(fd1, SHM_SIZE) == -1) {
6726  // error occurred setting the size
6727  __kmp_fatal(KMP_MSG(FunctionError, "Can't set size of SHM"),
6728  KMP_ERR(errno), __kmp_msg_null);
6729  }
6730  }
6731  data1 =
6732  (char *)mmap(0, SHM_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED, fd1, 0);
6733  if (data1 == MAP_FAILED) {
6734  // failed to map shared memory
6735  __kmp_fatal(KMP_MSG(FunctionError, "Can't map SHM"), KMP_ERR(errno),
6736  __kmp_msg_null);
6737  }
6738  if (shm_preexist == 0) { // set data to SHM, set value
6739  KMP_STRCPY_S(data1, SHM_SIZE, __kmp_registration_str);
6740  }
6741  // Read value from either what we just wrote or existing file.
6742  value = __kmp_str_format("%s", data1); // read value from SHM
6743  munmap(data1, SHM_SIZE);
6744  close(fd1);
6745 #else // Windows and unix with static library
6746  // Set the environment variable, but do not overwrite it if it already exists.
6747  __kmp_env_set(name, __kmp_registration_str, 0);
6748  // read value to see if it got set
6749  value = __kmp_env_get(name);
6750 #endif
6751 
6752  if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6753  done = 1; // Ok, environment variable set successfully, exit the loop.
6754  } else {
6755  // Oops. Write failed. Another copy of OpenMP RTL is in memory.
6756  // Check whether it is alive or dead.
6757  int neighbor = 0; // 0 -- unknown status, 1 -- alive, 2 -- dead.
6758  char *tail = value;
6759  char *flag_addr_str = NULL;
6760  char *flag_val_str = NULL;
6761  char const *file_name = NULL;
6762  __kmp_str_split(tail, '-', &flag_addr_str, &tail);
6763  __kmp_str_split(tail, '-', &flag_val_str, &tail);
6764  file_name = tail;
6765  if (tail != NULL) {
6766  unsigned long *flag_addr = 0;
6767  unsigned long flag_val = 0;
6768  KMP_SSCANF(flag_addr_str, "%p", RCAST(void **, &flag_addr));
6769  KMP_SSCANF(flag_val_str, "%lx", &flag_val);
6770  if (flag_addr != 0 && flag_val != 0 && strcmp(file_name, "") != 0) {
6771  // First, check whether environment-encoded address is mapped into
6772  // addr space.
6773  // If so, dereference it to see if it still has the right value.
6774  if (__kmp_is_address_mapped(flag_addr) && *flag_addr == flag_val) {
6775  neighbor = 1;
6776  } else {
6777  // If not, then we know the other copy of the library is no longer
6778  // running.
6779  neighbor = 2;
6780  }
6781  }
6782  }
6783  switch (neighbor) {
6784  case 0: // Cannot parse environment variable -- neighbor status unknown.
6785  // Assume it is the incompatible format of a future version of the
6786  // library. Assume the other library is alive.
6787  // WARN( ... ); // TODO: Issue a warning.
6788  file_name = "unknown library";
6789  KMP_FALLTHROUGH();
6790  // Attention! Falling through to the next case. That's intentional.
6791  case 1: { // Neighbor is alive.
6792  // Check it is allowed.
6793  char *duplicate_ok = __kmp_env_get("KMP_DUPLICATE_LIB_OK");
6794  if (!__kmp_str_match_true(duplicate_ok)) {
6795  // That's not allowed. Issue fatal error.
6796  __kmp_fatal(KMP_MSG(DuplicateLibrary, KMP_LIBRARY_FILE, file_name),
6797  KMP_HNT(DuplicateLibrary), __kmp_msg_null);
6798  }
6799  KMP_INTERNAL_FREE(duplicate_ok);
6800  __kmp_duplicate_library_ok = 1;
6801  done = 1; // Exit the loop.
6802  } break;
6803  case 2: { // Neighbor is dead.
6804 
6805 #if defined(KMP_USE_SHM)
6806  // close shared memory.
6807  shm_unlink(shm_name); // this removes file in /dev/shm
6808 #else
6809  // Clear the variable and try to register library again.
6810  __kmp_env_unset(name);
6811 #endif
6812  } break;
6813  default: {
6814  KMP_DEBUG_ASSERT(0);
6815  } break;
6816  }
6817  }
6818  KMP_INTERNAL_FREE((void *)value);
6819 #if defined(KMP_USE_SHM)
6820  KMP_INTERNAL_FREE((void *)shm_name);
6821 #endif
6822  } // while
6823  KMP_INTERNAL_FREE((void *)name);
6824 
6825 } // func __kmp_register_library_startup
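// Illustrative sketch, not compiled: how the registration value written above
// is checked by another copy of the runtime. The value has the form
// "%p-%lx-%s" (flag address, flag value, library file); a neighbor is treated
// as alive only if the encoded address is still mapped and still holds the
// encoded value. The function name is hypothetical; the calls restate the
// parsing done in __kmp_register_library_startup.
#if 0
static int __example_neighbor_alive(char *value) {
  char *tail = value, *addr_str = NULL, *val_str = NULL;
  __kmp_str_split(tail, '-', &addr_str, &tail); // "%p" part
  __kmp_str_split(tail, '-', &val_str, &tail); // "%lx" part; tail = file name
  unsigned long *flag_addr = 0;
  unsigned long flag_val = 0;
  KMP_SSCANF(addr_str, "%p", RCAST(void **, &flag_addr));
  KMP_SSCANF(val_str, "%lx", &flag_val);
  return flag_addr != 0 && flag_val != 0 &&
         __kmp_is_address_mapped(flag_addr) && *flag_addr == flag_val;
}
#endif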
6826 
6827 void __kmp_unregister_library(void) {
6828 
6829  char *name = __kmp_reg_status_name();
6830  char *value = NULL;
6831 
6832 #if defined(KMP_USE_SHM)
6833  char *shm_name = __kmp_str_format("/%s", name);
6834  int fd1 = shm_open(shm_name, O_RDONLY, 0666);
6835  if (fd1 == -1) {
6836  // file did not open. return.
6837  return;
6838  }
6839  char *data1 = (char *)mmap(0, SHM_SIZE, PROT_READ, MAP_SHARED, fd1, 0);
6840  if (data1 != MAP_FAILED) {
6841  value = __kmp_str_format("%s", data1); // read value from SHM
6842  munmap(data1, SHM_SIZE);
6843  }
6844  close(fd1);
6845 #else
6846  value = __kmp_env_get(name);
6847 #endif
6848 
6849  KMP_DEBUG_ASSERT(__kmp_registration_flag != 0);
6850  KMP_DEBUG_ASSERT(__kmp_registration_str != NULL);
6851  if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6852 // Ok, this is our variable. Delete it.
6853 #if defined(KMP_USE_SHM)
6854  shm_unlink(shm_name); // this removes file in /dev/shm
6855 #else
6856  __kmp_env_unset(name);
6857 #endif
6858  }
6859 
6860 #if defined(KMP_USE_SHM)
6861  KMP_INTERNAL_FREE(shm_name);
6862 #endif
6863 
6864  KMP_INTERNAL_FREE(__kmp_registration_str);
6865  KMP_INTERNAL_FREE(value);
6866  KMP_INTERNAL_FREE(name);
6867 
6868  __kmp_registration_flag = 0;
6869  __kmp_registration_str = NULL;
6870 
6871 } // __kmp_unregister_library
6872 
6873 // End of Library registration stuff.
6874 // -----------------------------------------------------------------------------
6875 
6876 #if KMP_MIC_SUPPORTED
6877 
6878 static void __kmp_check_mic_type() {
6879  kmp_cpuid_t cpuid_state = {0};
6880  kmp_cpuid_t *cs_p = &cpuid_state;
6881  __kmp_x86_cpuid(1, 0, cs_p);
6882  // We don't support mic1 at the moment
6883  if ((cs_p->eax & 0xff0) == 0xB10) {
6884  __kmp_mic_type = mic2;
6885  } else if ((cs_p->eax & 0xf0ff0) == 0x50670) {
6886  __kmp_mic_type = mic3;
6887  } else {
6888  __kmp_mic_type = non_mic;
6889  }
6890 }
6891 
6892 #endif /* KMP_MIC_SUPPORTED */
6893 
6894 #if KMP_HAVE_UMWAIT
6895 static void __kmp_user_level_mwait_init() {
6896  struct kmp_cpuid buf;
6897  __kmp_x86_cpuid(7, 0, &buf);
6898  __kmp_waitpkg_enabled = ((buf.ecx >> 5) & 1);
6899  __kmp_umwait_enabled = __kmp_waitpkg_enabled && __kmp_user_level_mwait;
6900  __kmp_tpause_enabled = __kmp_waitpkg_enabled && (__kmp_tpause_state > 0);
6901  KF_TRACE(30, ("__kmp_user_level_mwait_init: __kmp_umwait_enabled = %d\n",
6902  __kmp_umwait_enabled));
6903 }
6904 #elif KMP_HAVE_MWAIT
6905 #ifndef AT_INTELPHIUSERMWAIT
6906 // Spurious, non-existent value that should always fail to return anything.
6907  // Will be replaced with the correct value once it is known.
6908 #define AT_INTELPHIUSERMWAIT 10000
6909 #endif
6910 // getauxval() function is available in RHEL7 and SLES12. If a system with an
6911 // earlier OS is used to build the RTL, we'll use the following internal
6912 // function when the entry is not found.
6913 unsigned long getauxval(unsigned long) KMP_WEAK_ATTRIBUTE_EXTERNAL;
6914 unsigned long getauxval(unsigned long) { return 0; }
6915 
6916 static void __kmp_user_level_mwait_init() {
6917  // When getauxval() and correct value of AT_INTELPHIUSERMWAIT are available
6918  // use them to find if the user-level mwait is enabled. Otherwise, forcibly
6919  // set __kmp_mwait_enabled=TRUE on Intel MIC if the environment variable
6920  // KMP_USER_LEVEL_MWAIT was set to TRUE.
6921  if (__kmp_mic_type == mic3) {
6922  unsigned long res = getauxval(AT_INTELPHIUSERMWAIT);
6923  if ((res & 0x1) || __kmp_user_level_mwait) {
6924  __kmp_mwait_enabled = TRUE;
6925  if (__kmp_user_level_mwait) {
6926  KMP_INFORM(EnvMwaitWarn);
6927  }
6928  } else {
6929  __kmp_mwait_enabled = FALSE;
6930  }
6931  }
6932  KF_TRACE(30, ("__kmp_user_level_mwait_init: __kmp_mic_type = %d, "
6933  "__kmp_mwait_enabled = %d\n",
6934  __kmp_mic_type, __kmp_mwait_enabled));
6935 }
6936 #endif /* KMP_HAVE_UMWAIT */
6937 
6938 static void __kmp_do_serial_initialize(void) {
6939  int i, gtid;
6940  size_t size;
6941 
6942  KA_TRACE(10, ("__kmp_do_serial_initialize: enter\n"));
6943 
6944  KMP_DEBUG_ASSERT(sizeof(kmp_int32) == 4);
6945  KMP_DEBUG_ASSERT(sizeof(kmp_uint32) == 4);
6946  KMP_DEBUG_ASSERT(sizeof(kmp_int64) == 8);
6947  KMP_DEBUG_ASSERT(sizeof(kmp_uint64) == 8);
6948  KMP_DEBUG_ASSERT(sizeof(kmp_intptr_t) == sizeof(void *));
6949 
6950 #if OMPT_SUPPORT
6951  ompt_pre_init();
6952 #endif
6953 #if OMPD_SUPPORT
6954  __kmp_env_dump();
6955  ompd_init();
6956 #endif
6957 
6958  __kmp_validate_locks();
6959 
6960  /* Initialize internal memory allocator */
6961  __kmp_init_allocator();
6962 
6963  /* Register the library startup via an environment variable and check to see
6964  whether another copy of the library is already registered. */
6965 
6966  __kmp_register_library_startup();
6967 
6968  /* TODO reinitialization of library */
6969  if (TCR_4(__kmp_global.g.g_done)) {
6970  KA_TRACE(10, ("__kmp_do_serial_initialize: reinitialization of library\n"));
6971  }
6972 
6973  __kmp_global.g.g_abort = 0;
6974  TCW_SYNC_4(__kmp_global.g.g_done, FALSE);
6975 
6976 /* initialize the locks */
6977 #if KMP_USE_ADAPTIVE_LOCKS
6978 #if KMP_DEBUG_ADAPTIVE_LOCKS
6979  __kmp_init_speculative_stats();
6980 #endif
6981 #endif
6982 #if KMP_STATS_ENABLED
6983  __kmp_stats_init();
6984 #endif
6985  __kmp_init_lock(&__kmp_global_lock);
6986  __kmp_init_queuing_lock(&__kmp_dispatch_lock);
6987  __kmp_init_lock(&__kmp_debug_lock);
6988  __kmp_init_atomic_lock(&__kmp_atomic_lock);
6989  __kmp_init_atomic_lock(&__kmp_atomic_lock_1i);
6990  __kmp_init_atomic_lock(&__kmp_atomic_lock_2i);
6991  __kmp_init_atomic_lock(&__kmp_atomic_lock_4i);
6992  __kmp_init_atomic_lock(&__kmp_atomic_lock_4r);
6993  __kmp_init_atomic_lock(&__kmp_atomic_lock_8i);
6994  __kmp_init_atomic_lock(&__kmp_atomic_lock_8r);
6995  __kmp_init_atomic_lock(&__kmp_atomic_lock_8c);
6996  __kmp_init_atomic_lock(&__kmp_atomic_lock_10r);
6997  __kmp_init_atomic_lock(&__kmp_atomic_lock_16r);
6998  __kmp_init_atomic_lock(&__kmp_atomic_lock_16c);
6999  __kmp_init_atomic_lock(&__kmp_atomic_lock_20c);
7000  __kmp_init_atomic_lock(&__kmp_atomic_lock_32c);
7001  __kmp_init_bootstrap_lock(&__kmp_forkjoin_lock);
7002  __kmp_init_bootstrap_lock(&__kmp_exit_lock);
7003 #if KMP_USE_MONITOR
7004  __kmp_init_bootstrap_lock(&__kmp_monitor_lock);
7005 #endif
7006  __kmp_init_bootstrap_lock(&__kmp_tp_cached_lock);
7007 
7008  /* conduct initialization and initial setup of configuration */
7009 
7010  __kmp_runtime_initialize();
7011 
7012 #if KMP_MIC_SUPPORTED
7013  __kmp_check_mic_type();
7014 #endif
7015 
7016 // Some global variable initialization moved here from kmp_env_initialize()
7017 #ifdef KMP_DEBUG
7018  kmp_diag = 0;
7019 #endif
7020  __kmp_abort_delay = 0;
7021 
7022  // From __kmp_init_dflt_team_nth()
7023  /* assume the entire machine will be used */
7024  __kmp_dflt_team_nth_ub = __kmp_xproc;
7025  if (__kmp_dflt_team_nth_ub < KMP_MIN_NTH) {
7026  __kmp_dflt_team_nth_ub = KMP_MIN_NTH;
7027  }
7028  if (__kmp_dflt_team_nth_ub > __kmp_sys_max_nth) {
7029  __kmp_dflt_team_nth_ub = __kmp_sys_max_nth;
7030  }
7031  __kmp_max_nth = __kmp_sys_max_nth;
7032  __kmp_cg_max_nth = __kmp_sys_max_nth;
7033  __kmp_teams_max_nth = __kmp_xproc; // set a "reasonable" default
7034  if (__kmp_teams_max_nth > __kmp_sys_max_nth) {
7035  __kmp_teams_max_nth = __kmp_sys_max_nth;
7036  }
7037 
7038  // Three vars below moved here from __kmp_env_initialize() "KMP_BLOCKTIME"
7039  // part
7040  __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME;
7041 #if KMP_USE_MONITOR
7042  __kmp_monitor_wakeups =
7043  KMP_WAKEUPS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
7044  __kmp_bt_intervals =
7045  KMP_INTERVALS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
7046 #endif
7047  // From "KMP_LIBRARY" part of __kmp_env_initialize()
7048  __kmp_library = library_throughput;
7049  // From KMP_SCHEDULE initialization
7050  __kmp_static = kmp_sch_static_balanced;
7051 // AC: do not use analytical here, because it is non-monotonic
7052 //__kmp_guided = kmp_sch_guided_iterative_chunked;
7053 //__kmp_auto = kmp_sch_guided_analytical_chunked; // AC: it is the default, no
7054 // need to repeat assignment
7055 // Barrier initialization. Moved here from __kmp_env_initialize() Barrier branch
7056 // bit control and barrier method control parts
7057 #if KMP_FAST_REDUCTION_BARRIER
7058 #define kmp_reduction_barrier_gather_bb ((int)1)
7059 #define kmp_reduction_barrier_release_bb ((int)1)
7060 #define kmp_reduction_barrier_gather_pat __kmp_barrier_gather_pat_dflt
7061 #define kmp_reduction_barrier_release_pat __kmp_barrier_release_pat_dflt
7062 #endif // KMP_FAST_REDUCTION_BARRIER
7063  for (i = bs_plain_barrier; i < bs_last_barrier; i++) {
7064  __kmp_barrier_gather_branch_bits[i] = __kmp_barrier_gather_bb_dflt;
7065  __kmp_barrier_release_branch_bits[i] = __kmp_barrier_release_bb_dflt;
7066  __kmp_barrier_gather_pattern[i] = __kmp_barrier_gather_pat_dflt;
7067  __kmp_barrier_release_pattern[i] = __kmp_barrier_release_pat_dflt;
7068 #if KMP_FAST_REDUCTION_BARRIER
7069  if (i == bs_reduction_barrier) { // tested and confirmed on ALTIX only (
7070  // lin_64 ): hyper,1
7071  __kmp_barrier_gather_branch_bits[i] = kmp_reduction_barrier_gather_bb;
7072  __kmp_barrier_release_branch_bits[i] = kmp_reduction_barrier_release_bb;
7073  __kmp_barrier_gather_pattern[i] = kmp_reduction_barrier_gather_pat;
7074  __kmp_barrier_release_pattern[i] = kmp_reduction_barrier_release_pat;
7075  }
7076 #endif // KMP_FAST_REDUCTION_BARRIER
7077  }
7078 #if KMP_FAST_REDUCTION_BARRIER
7079 #undef kmp_reduction_barrier_release_pat
7080 #undef kmp_reduction_barrier_gather_pat
7081 #undef kmp_reduction_barrier_release_bb
7082 #undef kmp_reduction_barrier_gather_bb
7083 #endif // KMP_FAST_REDUCTION_BARRIER
7084 #if KMP_MIC_SUPPORTED
7085  if (__kmp_mic_type == mic2) { // KNC
7086  // AC: plane=3,2, forkjoin=2,1 are optimal for 240 threads on KNC
7087  __kmp_barrier_gather_branch_bits[bs_plain_barrier] = 3; // plain gather
7088  __kmp_barrier_release_branch_bits[bs_forkjoin_barrier] =
7089  1; // forkjoin release
7090  __kmp_barrier_gather_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
7091  __kmp_barrier_release_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
7092  }
7093 #if KMP_FAST_REDUCTION_BARRIER
7094  if (__kmp_mic_type == mic2) { // KNC
7095  __kmp_barrier_gather_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
7096  __kmp_barrier_release_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
7097  }
7098 #endif // KMP_FAST_REDUCTION_BARRIER
7099 #endif // KMP_MIC_SUPPORTED
7100 
7101 // From KMP_CHECKS initialization
7102 #ifdef KMP_DEBUG
7103  __kmp_env_checks = TRUE; /* development versions have the extra checks */
7104 #else
7105  __kmp_env_checks = FALSE; /* port versions do not have the extra checks */
7106 #endif
7107 
7108  // From "KMP_FOREIGN_THREADS_THREADPRIVATE" initialization
7109  __kmp_foreign_tp = TRUE;
7110 
7111  __kmp_global.g.g_dynamic = FALSE;
7112  __kmp_global.g.g_dynamic_mode = dynamic_default;
7113 
7114  __kmp_init_nesting_mode();
7115 
7116  __kmp_env_initialize(NULL);
7117 
7118 #if KMP_HAVE_MWAIT || KMP_HAVE_UMWAIT
7119  __kmp_user_level_mwait_init();
7120 #endif
7121 // Print all messages in message catalog for testing purposes.
7122 #ifdef KMP_DEBUG
7123  char const *val = __kmp_env_get("KMP_DUMP_CATALOG");
7124  if (__kmp_str_match_true(val)) {
7125  kmp_str_buf_t buffer;
7126  __kmp_str_buf_init(&buffer);
7127  __kmp_i18n_dump_catalog(&buffer);
7128  __kmp_printf("%s", buffer.str);
7129  __kmp_str_buf_free(&buffer);
7130  }
7131  __kmp_env_free(&val);
7132 #endif
7133 
7134  __kmp_threads_capacity =
7135  __kmp_initial_threads_capacity(__kmp_dflt_team_nth_ub);
7136  // Moved here from __kmp_env_initialize() "KMP_ALL_THREADPRIVATE" part
7137  __kmp_tp_capacity = __kmp_default_tp_capacity(
7138  __kmp_dflt_team_nth_ub, __kmp_max_nth, __kmp_allThreadsSpecified);
7139 
7140  // If the library is shut down properly, both pools must be NULL. Just in
7141  // case, set them to NULL -- some memory may leak, but subsequent code will
7142  // work even if pools are not freed.
7143  KMP_DEBUG_ASSERT(__kmp_thread_pool == NULL);
7144  KMP_DEBUG_ASSERT(__kmp_thread_pool_insert_pt == NULL);
7145  KMP_DEBUG_ASSERT(__kmp_team_pool == NULL);
7146  __kmp_thread_pool = NULL;
7147  __kmp_thread_pool_insert_pt = NULL;
7148  __kmp_team_pool = NULL;
7149 
7150  /* Allocate all of the variable sized records */
7151  /* NOTE: __kmp_threads_capacity entries are allocated, but the arrays are
7152  * expandable */
7153  /* Since allocation is cache-aligned, just add extra padding at the end */
7154  size =
7155  (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * __kmp_threads_capacity +
7156  CACHE_LINE;
7157  __kmp_threads = (kmp_info_t **)__kmp_allocate(size);
7158  __kmp_root = (kmp_root_t **)((char *)__kmp_threads +
7159  sizeof(kmp_info_t *) * __kmp_threads_capacity);
7160 
7161  /* init thread counts */
7162  KMP_DEBUG_ASSERT(__kmp_all_nth ==
7163  0); // Asserts fail if the library is reinitializing and
7164  KMP_DEBUG_ASSERT(__kmp_nth == 0); // something was wrong in termination.
7165  __kmp_all_nth = 0;
7166  __kmp_nth = 0;
7167 
7168  /* setup the uber master thread and hierarchy */
7169  gtid = __kmp_register_root(TRUE);
7170  KA_TRACE(10, ("__kmp_do_serial_initialize T#%d\n", gtid));
7171  KMP_ASSERT(KMP_UBER_GTID(gtid));
7172  KMP_ASSERT(KMP_INITIAL_GTID(gtid));
7173 
7174  KMP_MB(); /* Flush all pending memory write invalidates. */
7175 
7176  __kmp_common_initialize();
7177 
7178 #if KMP_OS_UNIX
7179  /* invoke the child fork handler */
7180  __kmp_register_atfork();
7181 #endif
7182 
7183 #if !KMP_DYNAMIC_LIB
7184  {
7185  /* Invoke the exit handler when the program finishes, only for static
7186  library. For dynamic library, we already have _fini and DllMain. */
7187  int rc = atexit(__kmp_internal_end_atexit);
7188  if (rc != 0) {
7189  __kmp_fatal(KMP_MSG(FunctionError, "atexit()"), KMP_ERR(rc),
7190  __kmp_msg_null);
7191  }
7192  }
7193 #endif
7194 
7195 #if KMP_HANDLE_SIGNALS
7196 #if KMP_OS_UNIX
7197  /* NOTE: make sure that this is called before the user installs their own
7198  signal handlers so that the user handlers are called first. This way they
7199  can return false, not call our handler, avoid terminating the library, and
7200  continue execution where they left off. */
7201  __kmp_install_signals(FALSE);
7202 #endif /* KMP_OS_UNIX */
7203 #if KMP_OS_WINDOWS
7204  __kmp_install_signals(TRUE);
7205 #endif /* KMP_OS_WINDOWS */
7206 #endif
7207 
7208  /* we have finished the serial initialization */
7209  __kmp_init_counter++;
7210 
7211  __kmp_init_serial = TRUE;
7212 
7213  if (__kmp_settings) {
7214  __kmp_env_print();
7215  }
7216 
7217  if (__kmp_display_env || __kmp_display_env_verbose) {
7218  __kmp_env_print_2();
7219  }
7220 
7221 #if OMPT_SUPPORT
7222  ompt_post_init();
7223 #endif
7224 
7225  KMP_MB();
7226 
7227  KA_TRACE(10, ("__kmp_do_serial_initialize: exit\n"));
7228 }
7229 
7230 void __kmp_serial_initialize(void) {
7231  if (__kmp_init_serial) {
7232  return;
7233  }
7234  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7235  if (__kmp_init_serial) {
7236  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7237  return;
7238  }
7239  __kmp_do_serial_initialize();
7240  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7241 }
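// Illustrative sketch, not compiled: __kmp_serial_initialize,
// __kmp_middle_initialize and __kmp_hidden_helper_initialize all follow the
// same double-checked pattern: an unsynchronized fast-path check, then the
// bootstrap lock, then a re-check before doing the real work. The function
// below is hypothetical and only restates that shape.
#if 0
static void __example_checked_init(volatile int *init_flag,
                                   void (*do_initialize)(void)) {
  if (*init_flag) // fast path: already initialized
    return;
  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
  if (!*init_flag) // re-check while holding the lock
    do_initialize(); // the __kmp_do_*_initialize routine sets the flag
  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
}
#endif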
7242 
7243 static void __kmp_do_middle_initialize(void) {
7244  int i, j;
7245  int prev_dflt_team_nth;
7246 
7247  if (!__kmp_init_serial) {
7248  __kmp_do_serial_initialize();
7249  }
7250 
7251  KA_TRACE(10, ("__kmp_middle_initialize: enter\n"));
7252 
7253  // Save the previous value for the __kmp_dflt_team_nth so that
7254  // we can avoid some reinitialization if it hasn't changed.
7255  prev_dflt_team_nth = __kmp_dflt_team_nth;
7256 
7257 #if KMP_AFFINITY_SUPPORTED
7258  // __kmp_affinity_initialize() will try to set __kmp_ncores to the
7259  // number of cores on the machine.
7260  __kmp_affinity_initialize();
7261 
7262 #endif /* KMP_AFFINITY_SUPPORTED */
7263 
7264  KMP_ASSERT(__kmp_xproc > 0);
7265  if (__kmp_avail_proc == 0) {
7266  __kmp_avail_proc = __kmp_xproc;
7267  }
7268 
7269  // If there were empty places in num_threads list (OMP_NUM_THREADS=,,2,3),
7270  // correct them now
7271  j = 0;
7272  while ((j < __kmp_nested_nth.used) && !__kmp_nested_nth.nth[j]) {
7273  __kmp_nested_nth.nth[j] = __kmp_dflt_team_nth = __kmp_dflt_team_nth_ub =
7274  __kmp_avail_proc;
7275  j++;
7276  }
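  // For example (illustrative numbers): with OMP_NUM_THREADS=",,2,3" and
  // __kmp_avail_proc == 8, the two empty leading entries become 8, so the
  // nesting list is effectively 8,8,2,3 and __kmp_dflt_team_nth becomes 8.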
7277 
7278  if (__kmp_dflt_team_nth == 0) {
7279 #ifdef KMP_DFLT_NTH_CORES
7280  // Default #threads = #cores
7281  __kmp_dflt_team_nth = __kmp_ncores;
7282  KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
7283  "__kmp_ncores (%d)\n",
7284  __kmp_dflt_team_nth));
7285 #else
7286  // Default #threads = #available OS procs
7287  __kmp_dflt_team_nth = __kmp_avail_proc;
7288  KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
7289  "__kmp_avail_proc(%d)\n",
7290  __kmp_dflt_team_nth));
7291 #endif /* KMP_DFLT_NTH_CORES */
7292  }
7293 
7294  if (__kmp_dflt_team_nth < KMP_MIN_NTH) {
7295  __kmp_dflt_team_nth = KMP_MIN_NTH;
7296  }
7297  if (__kmp_dflt_team_nth > __kmp_sys_max_nth) {
7298  __kmp_dflt_team_nth = __kmp_sys_max_nth;
7299  }
7300 
7301  if (__kmp_nesting_mode > 0)
7302  __kmp_set_nesting_mode_threads();
7303 
7304  // There's no harm in continuing if the following check fails,
7305  // but it indicates an error in the previous logic.
7306  KMP_DEBUG_ASSERT(__kmp_dflt_team_nth <= __kmp_dflt_team_nth_ub);
7307 
7308  if (__kmp_dflt_team_nth != prev_dflt_team_nth) {
7309  // Run through the __kmp_threads array and set the num threads icv for each
7310  // root thread that is currently registered with the RTL (which has not
7311  // already explicitly set its nthreads-var with a call to
7312  // omp_set_num_threads()).
7313  for (i = 0; i < __kmp_threads_capacity; i++) {
7314  kmp_info_t *thread = __kmp_threads[i];
7315  if (thread == NULL)
7316  continue;
7317  if (thread->th.th_current_task->td_icvs.nproc != 0)
7318  continue;
7319 
7320  set__nproc(__kmp_threads[i], __kmp_dflt_team_nth);
7321  }
7322  }
7323  KA_TRACE(
7324  20,
7325  ("__kmp_middle_initialize: final value for __kmp_dflt_team_nth = %d\n",
7326  __kmp_dflt_team_nth));
7327 
7328 #ifdef KMP_ADJUST_BLOCKTIME
7329  /* Adjust blocktime to zero if necessary now that __kmp_avail_proc is set */
7330  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
7331  KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
7332  if (__kmp_nth > __kmp_avail_proc) {
7333  __kmp_zero_bt = TRUE;
7334  }
7335  }
7336 #endif /* KMP_ADJUST_BLOCKTIME */
7337 
7338  /* we have finished middle initialization */
7339  TCW_SYNC_4(__kmp_init_middle, TRUE);
7340 
7341  KA_TRACE(10, ("__kmp_do_middle_initialize: exit\n"));
7342 }
7343 
7344 void __kmp_middle_initialize(void) {
7345  if (__kmp_init_middle) {
7346  return;
7347  }
7348  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7349  if (__kmp_init_middle) {
7350  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7351  return;
7352  }
7353  __kmp_do_middle_initialize();
7354  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7355 }
7356 
7357 void __kmp_parallel_initialize(void) {
7358  int gtid = __kmp_entry_gtid(); // this might be a new root
7359 
7360  /* synchronize parallel initialization (for sibling) */
7361  if (TCR_4(__kmp_init_parallel))
7362  return;
7363  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7364  if (TCR_4(__kmp_init_parallel)) {
7365  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7366  return;
7367  }
7368 
7369  /* TODO reinitialization after we have already shut down */
7370  if (TCR_4(__kmp_global.g.g_done)) {
7371  KA_TRACE(
7372  10,
7373  ("__kmp_parallel_initialize: attempt to init while shutting down\n"));
7374  __kmp_infinite_loop();
7375  }
7376 
7377  /* jc: The lock __kmp_initz_lock is already held, so calling
7378  __kmp_serial_initialize would cause a deadlock. So we call
7379  __kmp_do_serial_initialize directly. */
7380  if (!__kmp_init_middle) {
7381  __kmp_do_middle_initialize();
7382  }
7383  __kmp_assign_root_init_mask();
7384  __kmp_resume_if_hard_paused();
7385 
7386  /* begin initialization */
7387  KA_TRACE(10, ("__kmp_parallel_initialize: enter\n"));
7388  KMP_ASSERT(KMP_UBER_GTID(gtid));
7389 
7390 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
7391  // Save the FP control regs.
7392  // Worker threads will set theirs to these values at thread startup.
7393  __kmp_store_x87_fpu_control_word(&__kmp_init_x87_fpu_control_word);
7394  __kmp_store_mxcsr(&__kmp_init_mxcsr);
7395  __kmp_init_mxcsr &= KMP_X86_MXCSR_MASK;
7396 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
7397 
7398 #if KMP_OS_UNIX
7399 #if KMP_HANDLE_SIGNALS
7400  /* must be after __kmp_serial_initialize */
7401  __kmp_install_signals(TRUE);
7402 #endif
7403 #endif
7404 
7405  __kmp_suspend_initialize();
7406 
7407 #if defined(USE_LOAD_BALANCE)
7408  if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
7409  __kmp_global.g.g_dynamic_mode = dynamic_load_balance;
7410  }
7411 #else
7412  if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
7413  __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
7414  }
7415 #endif
7416 
7417  if (__kmp_version) {
7418  __kmp_print_version_2();
7419  }
7420 
7421  /* we have finished parallel initialization */
7422  TCW_SYNC_4(__kmp_init_parallel, TRUE);
7423 
7424  KMP_MB();
7425  KA_TRACE(10, ("__kmp_parallel_initialize: exit\n"));
7426 
7427  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7428 }
7429 
7430 void __kmp_hidden_helper_initialize() {
7431  if (TCR_4(__kmp_init_hidden_helper))
7432  return;
7433 
7434  // __kmp_parallel_initialize is required before we initialize hidden helper
7435  if (!TCR_4(__kmp_init_parallel))
7436  __kmp_parallel_initialize();
7437 
7438  // Double check. Note that this double check should not be placed before
7439  // __kmp_parallel_initialize as it would cause a deadlock.
7440  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7441  if (TCR_4(__kmp_init_hidden_helper)) {
7442  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7443  return;
7444  }
7445 
7446  // Set the count of hidden helper tasks to be executed to zero
7447  KMP_ATOMIC_ST_REL(&__kmp_unexecuted_hidden_helper_tasks, 0);
7448 
7449  // Set the global variable indicating that we're initializing hidden helper
7450  // team/threads
7451  TCW_SYNC_4(__kmp_init_hidden_helper_threads, TRUE);
7452 
7453  // Platform independent initialization
7454  __kmp_do_initialize_hidden_helper_threads();
7455 
7456  // Wait here for the hidden helper teams to finish initialization
7457  __kmp_hidden_helper_threads_initz_wait();
7458 
7459  // We have finished hidden helper initialization
7460  TCW_SYNC_4(__kmp_init_hidden_helper, TRUE);
7461 
7462  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7463 }
7464 
7465 /* ------------------------------------------------------------------------ */
7466 
7467 void __kmp_run_before_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
7468  kmp_team_t *team) {
7469  kmp_disp_t *dispatch;
7470 
7471  KMP_MB();
7472 
7473  /* none of the threads have encountered any constructs, yet. */
7474  this_thr->th.th_local.this_construct = 0;
7475 #if KMP_CACHE_MANAGE
7476  KMP_CACHE_PREFETCH(&this_thr->th.th_bar[bs_forkjoin_barrier].bb.b_arrived);
7477 #endif /* KMP_CACHE_MANAGE */
7478  dispatch = (kmp_disp_t *)TCR_PTR(this_thr->th.th_dispatch);
7479  KMP_DEBUG_ASSERT(dispatch);
7480  KMP_DEBUG_ASSERT(team->t.t_dispatch);
7481  // KMP_DEBUG_ASSERT( this_thr->th.th_dispatch == &team->t.t_dispatch[
7482  // this_thr->th.th_info.ds.ds_tid ] );
7483 
7484  dispatch->th_disp_index = 0; /* reset the dispatch buffer counter */
7485  dispatch->th_doacross_buf_idx = 0; // reset doacross dispatch buffer counter
7486  if (__kmp_env_consistency_check)
7487  __kmp_push_parallel(gtid, team->t.t_ident);
7488 
7489  KMP_MB(); /* Flush all pending memory write invalidates. */
7490 }
7491 
7492 void __kmp_run_after_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
7493  kmp_team_t *team) {
7494  if (__kmp_env_consistency_check)
7495  __kmp_pop_parallel(gtid, team->t.t_ident);
7496 
7497  __kmp_finish_implicit_task(this_thr);
7498 }
7499 
7500 int __kmp_invoke_task_func(int gtid) {
7501  int rc;
7502  int tid = __kmp_tid_from_gtid(gtid);
7503  kmp_info_t *this_thr = __kmp_threads[gtid];
7504  kmp_team_t *team = this_thr->th.th_team;
7505 
7506  __kmp_run_before_invoked_task(gtid, tid, this_thr, team);
7507 #if USE_ITT_BUILD
7508  if (__itt_stack_caller_create_ptr) {
7509  // inform ittnotify about entering user's code
7510  if (team->t.t_stack_id != NULL) {
7511  __kmp_itt_stack_callee_enter((__itt_caller)team->t.t_stack_id);
7512  } else {
7513  KMP_DEBUG_ASSERT(team->t.t_parent->t.t_stack_id != NULL);
7514  __kmp_itt_stack_callee_enter(
7515  (__itt_caller)team->t.t_parent->t.t_stack_id);
7516  }
7517  }
7518 #endif /* USE_ITT_BUILD */
7519 #if INCLUDE_SSC_MARKS
7520  SSC_MARK_INVOKING();
7521 #endif
7522 
7523 #if OMPT_SUPPORT
7524  void *dummy;
7525  void **exit_frame_p;
7526  ompt_data_t *my_task_data;
7527  ompt_data_t *my_parallel_data;
7528  int ompt_team_size;
7529 
7530  if (ompt_enabled.enabled) {
7531  exit_frame_p = &(team->t.t_implicit_task_taskdata[tid]
7532  .ompt_task_info.frame.exit_frame.ptr);
7533  } else {
7534  exit_frame_p = &dummy;
7535  }
7536 
7537  my_task_data =
7538  &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data);
7539  my_parallel_data = &(team->t.ompt_team_info.parallel_data);
7540  if (ompt_enabled.ompt_callback_implicit_task) {
7541  ompt_team_size = team->t.t_nproc;
7542  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7543  ompt_scope_begin, my_parallel_data, my_task_data, ompt_team_size,
7544  __kmp_tid_from_gtid(gtid), ompt_task_implicit);
7545  OMPT_CUR_TASK_INFO(this_thr)->thread_num = __kmp_tid_from_gtid(gtid);
7546  }
7547 #endif
7548 
7549 #if KMP_STATS_ENABLED
7550  stats_state_e previous_state = KMP_GET_THREAD_STATE();
7551  if (previous_state == stats_state_e::TEAMS_REGION) {
7552  KMP_PUSH_PARTITIONED_TIMER(OMP_teams);
7553  } else {
7554  KMP_PUSH_PARTITIONED_TIMER(OMP_parallel);
7555  }
7556  KMP_SET_THREAD_STATE(IMPLICIT_TASK);
7557 #endif
7558 
7559  rc = __kmp_invoke_microtask((microtask_t)TCR_SYNC_PTR(team->t.t_pkfn), gtid,
7560  tid, (int)team->t.t_argc, (void **)team->t.t_argv
7561 #if OMPT_SUPPORT
7562  ,
7563  exit_frame_p
7564 #endif
7565  );
7566 #if OMPT_SUPPORT
7567  *exit_frame_p = NULL;
7568  this_thr->th.ompt_thread_info.parallel_flags |= ompt_parallel_team;
7569 #endif
7570 
7571 #if KMP_STATS_ENABLED
7572  if (previous_state == stats_state_e::TEAMS_REGION) {
7573  KMP_SET_THREAD_STATE(previous_state);
7574  }
7575  KMP_POP_PARTITIONED_TIMER();
7576 #endif
7577 
7578 #if USE_ITT_BUILD
7579  if (__itt_stack_caller_create_ptr) {
7580  // inform ittnotify about leaving user's code
7581  if (team->t.t_stack_id != NULL) {
7582  __kmp_itt_stack_callee_leave((__itt_caller)team->t.t_stack_id);
7583  } else {
7584  KMP_DEBUG_ASSERT(team->t.t_parent->t.t_stack_id != NULL);
7585  __kmp_itt_stack_callee_leave(
7586  (__itt_caller)team->t.t_parent->t.t_stack_id);
7587  }
7588  }
7589 #endif /* USE_ITT_BUILD */
7590  __kmp_run_after_invoked_task(gtid, tid, this_thr, team);
7591 
7592  return rc;
7593 }
7594 
7595 void __kmp_teams_master(int gtid) {
7596  // This routine is called by all primary threads in teams construct
7597  kmp_info_t *thr = __kmp_threads[gtid];
7598  kmp_team_t *team = thr->th.th_team;
7599  ident_t *loc = team->t.t_ident;
7600  thr->th.th_set_nproc = thr->th.th_teams_size.nth;
7601  KMP_DEBUG_ASSERT(thr->th.th_teams_microtask);
7602  KMP_DEBUG_ASSERT(thr->th.th_set_nproc);
7603  KA_TRACE(20, ("__kmp_teams_master: T#%d, Tid %d, microtask %p\n", gtid,
7604  __kmp_tid_from_gtid(gtid), thr->th.th_teams_microtask));
7605 
7606  // This thread is a new CG root. Set up the proper variables.
7607  kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t));
7608  tmp->cg_root = thr; // Make thr the CG root
7609  // Init to thread limit stored when league primary threads were forked
7610  tmp->cg_thread_limit = thr->th.th_current_task->td_icvs.thread_limit;
7611  tmp->cg_nthreads = 1; // Init counter to one active thread, this one
7612  KA_TRACE(100, ("__kmp_teams_master: Thread %p created node %p and init"
7613  " cg_nthreads to 1\n",
7614  thr, tmp));
7615  tmp->up = thr->th.th_cg_roots;
7616  thr->th.th_cg_roots = tmp;
7617 
7618 // Launch the league of teams now, but do not let workers execute
7619 // (they hang on the fork barrier until the next parallel region)
7620 #if INCLUDE_SSC_MARKS
7621  SSC_MARK_FORKING();
7622 #endif
7623  __kmp_fork_call(loc, gtid, fork_context_intel, team->t.t_argc,
7624  (microtask_t)thr->th.th_teams_microtask, // "wrapped" task
7625  VOLATILE_CAST(launch_t) __kmp_invoke_task_func, NULL);
7626 #if INCLUDE_SSC_MARKS
7627  SSC_MARK_JOINING();
7628 #endif
7629  // If the team size was reduced from the limit, set it to the new size
7630  if (thr->th.th_team_nproc < thr->th.th_teams_size.nth)
7631  thr->th.th_teams_size.nth = thr->th.th_team_nproc;
7632  // AC: last parameter "1" eliminates join barrier which won't work because
7633  // worker threads are in a fork barrier waiting for more parallel regions
7634  __kmp_join_call(loc, gtid
7635 #if OMPT_SUPPORT
7636  ,
7637  fork_context_intel
7638 #endif
7639  ,
7640  1);
7641 }
7642 
7643 int __kmp_invoke_teams_master(int gtid) {
7644  kmp_info_t *this_thr = __kmp_threads[gtid];
7645  kmp_team_t *team = this_thr->th.th_team;
7646 #if KMP_DEBUG
7647  if (!__kmp_threads[gtid]->th.th_team->t.t_serialized)
7648  KMP_DEBUG_ASSERT((void *)__kmp_threads[gtid]->th.th_team->t.t_pkfn ==
7649  (void *)__kmp_teams_master);
7650 #endif
7651  __kmp_run_before_invoked_task(gtid, 0, this_thr, team);
7652 #if OMPT_SUPPORT
7653  int tid = __kmp_tid_from_gtid(gtid);
7654  ompt_data_t *task_data =
7655  &team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data;
7656  ompt_data_t *parallel_data = &team->t.ompt_team_info.parallel_data;
7657  if (ompt_enabled.ompt_callback_implicit_task) {
7658  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7659  ompt_scope_begin, parallel_data, task_data, team->t.t_nproc, tid,
7660  ompt_task_initial);
7661  OMPT_CUR_TASK_INFO(this_thr)->thread_num = tid;
7662  }
7663 #endif
7664  __kmp_teams_master(gtid);
7665 #if OMPT_SUPPORT
7666  this_thr->th.ompt_thread_info.parallel_flags |= ompt_parallel_league;
7667 #endif
7668  __kmp_run_after_invoked_task(gtid, 0, this_thr, team);
7669  return 1;
7670 }
7671 
7672 /* This sets the requested number of threads for the next parallel region
7673  encountered by this team. Since this should be enclosed in the fork/join
7674  critical section, it should avoid race conditions with asymmetrical nested
7675  parallelism. */
7676 
7677 void __kmp_push_num_threads(ident_t *id, int gtid, int num_threads) {
7678  kmp_info_t *thr = __kmp_threads[gtid];
7679 
7680  if (num_threads > 0)
7681  thr->th.th_set_nproc = num_threads;
7682 }
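// Illustrative sketch (not part of the runtime, assuming the compiler-facing
// wrapper __kmpc_push_num_threads forwards to __kmp_push_num_threads): a
// num_threads(4) clause on a parallel construct is typically lowered into a
// push of the requested value followed by the fork, roughly as below. The
// exact lowering is compiler-specific; this only shows how th_set_nproc gets
// primed for the next fork.
#if 0
void example_lowering(ident_t *loc, kmpc_micro outlined_fn) {
  kmp_int32 gtid = __kmpc_global_thread_num(loc);
  __kmpc_push_num_threads(loc, gtid, 4); // sets th_set_nproc for this thread
  __kmpc_fork_call(loc, 0, outlined_fn); // fork consumes th_set_nproc
}
#endif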
7683 
7684 static void __kmp_push_thread_limit(kmp_info_t *thr, int num_teams,
7685  int num_threads) {
7686  KMP_DEBUG_ASSERT(thr);
7687  // Remember the number of threads for inner parallel regions
7688  if (!TCR_4(__kmp_init_middle))
7689  __kmp_middle_initialize(); // get internal globals calculated
7690  __kmp_assign_root_init_mask();
7691  KMP_DEBUG_ASSERT(__kmp_avail_proc);
7692  KMP_DEBUG_ASSERT(__kmp_dflt_team_nth);
7693 
7694  if (num_threads == 0) {
7695  if (__kmp_teams_thread_limit > 0) {
7696  num_threads = __kmp_teams_thread_limit;
7697  } else {
7698  num_threads = __kmp_avail_proc / num_teams;
7699  }
7700  // adjust num_threads without a warning since it is not a user setting
7701  // num_threads = min(num_threads, nthreads-var, thread-limit-var)
7702  // no thread_limit clause specified - do not change thread-limit-var ICV
7703  if (num_threads > __kmp_dflt_team_nth) {
7704  num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV
7705  }
7706  if (num_threads > thr->th.th_current_task->td_icvs.thread_limit) {
7707  num_threads = thr->th.th_current_task->td_icvs.thread_limit;
7708  } // prevent the team size from exceeding thread-limit-var
7709  if (num_teams * num_threads > __kmp_teams_max_nth) {
7710  num_threads = __kmp_teams_max_nth / num_teams;
7711  }
7712  if (num_threads == 0) {
7713  num_threads = 1;
7714  }
7715  } else {
7716  if (num_threads < 0) {
7717  __kmp_msg(kmp_ms_warning, KMP_MSG(CantFormThrTeam, num_threads, 1),
7718  __kmp_msg_null);
7719  num_threads = 1;
7720  }
7721  // This thread will be the primary thread of the league's primary threads
7722  // Store new thread limit; old limit is saved in th_cg_roots list
7723  thr->th.th_current_task->td_icvs.thread_limit = num_threads;
7724  // num_threads = min(num_threads, nthreads-var)
7725  if (num_threads > __kmp_dflt_team_nth) {
7726  num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV
7727  }
7728  if (num_teams * num_threads > __kmp_teams_max_nth) {
7729  int new_threads = __kmp_teams_max_nth / num_teams;
7730  if (new_threads == 0) {
7731  new_threads = 1;
7732  }
7733  if (new_threads != num_threads) {
7734  if (!__kmp_reserve_warn) { // user asked for too many threads
7735  __kmp_reserve_warn = 1; // conflicts with KMP_TEAMS_THREAD_LIMIT
7736  __kmp_msg(kmp_ms_warning,
7737  KMP_MSG(CantFormThrTeam, num_threads, new_threads),
7738  KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7739  }
7740  }
7741  num_threads = new_threads;
7742  }
7743  }
7744  thr->th.th_teams_size.nth = num_threads;
7745 }
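// Worked example (assumed values, for illustration only): with no thread_limit
// clause (num_threads == 0), __kmp_avail_proc == 16, num_teams == 4, no
// KMP_TEAMS_THREAD_LIMIT, nthreads-var == 8 and thread-limit-var == 8, the code
// above computes 16 / 4 = 4 threads per team; 4 does not exceed either ICV and
// 4 * 4 = 16 stays under __kmp_teams_max_nth, so th_teams_size.nth becomes 4.
// With an explicit thread_limit(32) and num_teams == 4 on the same machine,
// 32 is first capped to nthreads-var (8); only if 4 * 8 exceeded
// __kmp_teams_max_nth would it be reduced further, with a one-time
// CantFormThrTeam warning.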
7746 
7747 /* This sets the requested number of teams for the teams region and/or
7748  the number of threads for the next parallel region encountered */
7749 void __kmp_push_num_teams(ident_t *id, int gtid, int num_teams,
7750  int num_threads) {
7751  kmp_info_t *thr = __kmp_threads[gtid];
7752  if (num_teams < 0) {
7753  // OpenMP specification requires requested values to be positive,
7754  // but people can send us any value, so we'd better check
7755  __kmp_msg(kmp_ms_warning, KMP_MSG(NumTeamsNotPositive, num_teams, 1),
7756  __kmp_msg_null);
7757  num_teams = 1;
7758  }
7759  if (num_teams == 0) {
7760  if (__kmp_nteams > 0) {
7761  num_teams = __kmp_nteams;
7762  } else {
7763  num_teams = 1; // default number of teams is 1.
7764  }
7765  }
7766  if (num_teams > __kmp_teams_max_nth) { // too many teams requested?
7767  if (!__kmp_reserve_warn) {
7768  __kmp_reserve_warn = 1;
7769  __kmp_msg(kmp_ms_warning,
7770  KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth),
7771  KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7772  }
7773  num_teams = __kmp_teams_max_nth;
7774  }
7775  // Set number of teams (number of threads in the outer "parallel" of the
7776  // teams)
7777  thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams;
7778 
7779  __kmp_push_thread_limit(thr, num_teams, num_threads);
7780 }
7781 
7782 /* This sets the requested number of teams for the teams region and/or
7783  the number of threads for the next parallel region encountered */
7784 void __kmp_push_num_teams_51(ident_t *id, int gtid, int num_teams_lb,
7785  int num_teams_ub, int num_threads) {
7786  kmp_info_t *thr = __kmp_threads[gtid];
7787  KMP_DEBUG_ASSERT(num_teams_lb >= 0 && num_teams_ub >= 0);
7788  KMP_DEBUG_ASSERT(num_teams_ub >= num_teams_lb);
7789  KMP_DEBUG_ASSERT(num_threads >= 0);
7790 
7791  if (num_teams_lb > num_teams_ub) {
7792  __kmp_fatal(KMP_MSG(FailedToCreateTeam, num_teams_lb, num_teams_ub),
7793  KMP_HNT(SetNewBound, __kmp_teams_max_nth), __kmp_msg_null);
7794  }
7795 
7796  int num_teams = 1; // default number of teams is 1.
7797 
7798  if (num_teams_lb == 0 && num_teams_ub > 0)
7799  num_teams_lb = num_teams_ub;
7800 
7801  if (num_teams_lb == 0 && num_teams_ub == 0) { // no num_teams clause
7802  num_teams = (__kmp_nteams > 0) ? __kmp_nteams : num_teams;
7803  if (num_teams > __kmp_teams_max_nth) {
7804  if (!__kmp_reserve_warn) {
7805  __kmp_reserve_warn = 1;
7806  __kmp_msg(kmp_ms_warning,
7807  KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth),
7808  KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7809  }
7810  num_teams = __kmp_teams_max_nth;
7811  }
7812  } else if (num_teams_lb == num_teams_ub) { // requires exact number of teams
7813  num_teams = num_teams_ub;
7814  } else { // num_teams_lb <= num_teams <= num_teams_ub
7815  if (num_threads <= 0) {
7816  if (num_teams_ub > __kmp_teams_max_nth) {
7817  num_teams = num_teams_lb;
7818  } else {
7819  num_teams = num_teams_ub;
7820  }
7821  } else {
7822  num_teams = (num_threads > __kmp_teams_max_nth)
7823  ? num_teams
7824  : __kmp_teams_max_nth / num_threads;
7825  if (num_teams < num_teams_lb) {
7826  num_teams = num_teams_lb;
7827  } else if (num_teams > num_teams_ub) {
7828  num_teams = num_teams_ub;
7829  }
7830  }
7831  }
7832  // Set number of teams (number of threads in the outer "parallel" of the
7833  // teams)
7834  thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams;
7835 
7836  __kmp_push_thread_limit(thr, num_teams, num_threads);
7837 }
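// Worked example (assumed values): for num_teams(4:8) with no thread_limit
// (num_threads <= 0), the upper bound is used as long as it does not exceed
// __kmp_teams_max_nth, so num_teams == 8. If instead num_threads == 16 and
// __kmp_teams_max_nth == 64, the code above computes 64 / 16 = 4 teams, which
// is then clamped into the requested [4, 8] range, yielding num_teams == 4.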
7838 
7839 // Set the proc_bind var to use in the following parallel region.
7840 void __kmp_push_proc_bind(ident_t *id, int gtid, kmp_proc_bind_t proc_bind) {
7841  kmp_info_t *thr = __kmp_threads[gtid];
7842  thr->th.th_set_proc_bind = proc_bind;
7843 }
7844 
7845 /* Launch the worker threads into the microtask. */
7846 
7847 void __kmp_internal_fork(ident_t *id, int gtid, kmp_team_t *team) {
7848  kmp_info_t *this_thr = __kmp_threads[gtid];
7849 
7850 #ifdef KMP_DEBUG
7851  int f;
7852 #endif /* KMP_DEBUG */
7853 
7854  KMP_DEBUG_ASSERT(team);
7855  KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
7856  KMP_ASSERT(KMP_MASTER_GTID(gtid));
7857  KMP_MB(); /* Flush all pending memory write invalidates. */
7858 
7859  team->t.t_construct = 0; /* no single directives seen yet */
7860  team->t.t_ordered.dt.t_value =
7861  0; /* thread 0 enters the ordered section first */
7862 
7863  /* Reset the identifiers on the dispatch buffer */
7864  KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
7865  if (team->t.t_max_nproc > 1) {
7866  int i;
7867  for (i = 0; i < __kmp_dispatch_num_buffers; ++i) {
7868  team->t.t_disp_buffer[i].buffer_index = i;
7869  team->t.t_disp_buffer[i].doacross_buf_idx = i;
7870  }
7871  } else {
7872  team->t.t_disp_buffer[0].buffer_index = 0;
7873  team->t.t_disp_buffer[0].doacross_buf_idx = 0;
7874  }
7875 
7876  KMP_MB(); /* Flush all pending memory write invalidates. */
7877  KMP_ASSERT(this_thr->th.th_team == team);
7878 
7879 #ifdef KMP_DEBUG
7880  for (f = 0; f < team->t.t_nproc; f++) {
7881  KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
7882  team->t.t_threads[f]->th.th_team_nproc == team->t.t_nproc);
7883  }
7884 #endif /* KMP_DEBUG */
7885 
7886  /* release the worker threads so they may begin working */
7887  __kmp_fork_barrier(gtid, 0);
7888 }
7889 
7890 void __kmp_internal_join(ident_t *id, int gtid, kmp_team_t *team) {
7891  kmp_info_t *this_thr = __kmp_threads[gtid];
7892 
7893  KMP_DEBUG_ASSERT(team);
7894  KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
7895  KMP_ASSERT(KMP_MASTER_GTID(gtid));
7896  KMP_MB(); /* Flush all pending memory write invalidates. */
7897 
7898  /* Join barrier after fork */
7899 
7900 #ifdef KMP_DEBUG
7901  if (__kmp_threads[gtid] &&
7902  __kmp_threads[gtid]->th.th_team_nproc != team->t.t_nproc) {
7903  __kmp_printf("GTID: %d, __kmp_threads[%d]=%p\n", gtid, gtid,
7904  __kmp_threads[gtid]);
7905  __kmp_printf("__kmp_threads[%d]->th.th_team_nproc=%d, TEAM: %p, "
7906  "team->t.t_nproc=%d\n",
7907  gtid, __kmp_threads[gtid]->th.th_team_nproc, team,
7908  team->t.t_nproc);
7909  __kmp_print_structure();
7910  }
7911  KMP_DEBUG_ASSERT(__kmp_threads[gtid] &&
7912  __kmp_threads[gtid]->th.th_team_nproc == team->t.t_nproc);
7913 #endif /* KMP_DEBUG */
7914 
7915  __kmp_join_barrier(gtid); /* wait for everyone */
7916 #if OMPT_SUPPORT
7917  if (ompt_enabled.enabled &&
7918  this_thr->th.ompt_thread_info.state == ompt_state_wait_barrier_implicit) {
7919  int ds_tid = this_thr->th.th_info.ds.ds_tid;
7920  ompt_data_t *task_data = OMPT_CUR_TASK_DATA(this_thr);
7921  this_thr->th.ompt_thread_info.state = ompt_state_overhead;
7922 #if OMPT_OPTIONAL
7923  void *codeptr = NULL;
7924  if (KMP_MASTER_TID(ds_tid) &&
7925  (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
7926  ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
7927  codeptr = OMPT_CUR_TEAM_INFO(this_thr)->master_return_address;
7928 
7929  if (ompt_enabled.ompt_callback_sync_region_wait) {
7930  ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
7931  ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
7932  codeptr);
7933  }
7934  if (ompt_enabled.ompt_callback_sync_region) {
7935  ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
7936  ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
7937  codeptr);
7938  }
7939 #endif
7940  if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) {
7941  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7942  ompt_scope_end, NULL, task_data, 0, ds_tid,
7943  ompt_task_implicit); // TODO: Can this be ompt_task_initial?
7944  }
7945  }
7946 #endif
7947 
7948  KMP_MB(); /* Flush all pending memory write invalidates. */
7949  KMP_ASSERT(this_thr->th.th_team == team);
7950 }
7951 
7952 /* ------------------------------------------------------------------------ */
7953 
7954 #ifdef USE_LOAD_BALANCE
7955 
7956 // Return the number of worker threads actively spinning in the hot team if
7957 // we are at the outermost level of parallelism. Otherwise, return 0.
7958 static int __kmp_active_hot_team_nproc(kmp_root_t *root) {
7959  int i;
7960  int retval;
7961  kmp_team_t *hot_team;
7962 
7963  if (root->r.r_active) {
7964  return 0;
7965  }
7966  hot_team = root->r.r_hot_team;
7967  if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME) {
7968  return hot_team->t.t_nproc - 1; // Don't count primary thread
7969  }
7970 
7971  // Skip the primary thread - it is accounted for elsewhere.
7972  retval = 0;
7973  for (i = 1; i < hot_team->t.t_nproc; i++) {
7974  if (hot_team->t.t_threads[i]->th.th_active) {
7975  retval++;
7976  }
7977  }
7978  return retval;
7979 }
7980 
7981 // Perform an automatic adjustment to the number of
7982 // threads used by the next parallel region.
7983 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc) {
7984  int retval;
7985  int pool_active;
7986  int hot_team_active;
7987  int team_curr_active;
7988  int system_active;
7989 
7990  KB_TRACE(20, ("__kmp_load_balance_nproc: called root:%p set_nproc:%d\n", root,
7991  set_nproc));
7992  KMP_DEBUG_ASSERT(root);
7993  KMP_DEBUG_ASSERT(root->r.r_root_team->t.t_threads[0]
7994  ->th.th_current_task->td_icvs.dynamic == TRUE);
7995  KMP_DEBUG_ASSERT(set_nproc > 1);
7996 
7997  if (set_nproc == 1) {
7998  KB_TRACE(20, ("__kmp_load_balance_nproc: serial execution.\n"));
7999  return 1;
8000  }
8001 
8002  // Threads that are active in the thread pool, active in the hot team for this
8003  // particular root (if we are at the outer par level), and the currently
8004  // executing thread (to become the primary thread) are available to add to the
8005  // new team, but are currently contributing to the system load, and must be
8006  // accounted for.
8007  pool_active = __kmp_thread_pool_active_nth;
8008  hot_team_active = __kmp_active_hot_team_nproc(root);
8009  team_curr_active = pool_active + hot_team_active + 1;
8010 
8011  // Check the system load.
8012  system_active = __kmp_get_load_balance(__kmp_avail_proc + team_curr_active);
8013  KB_TRACE(30, ("__kmp_load_balance_nproc: system active = %d pool active = %d "
8014  "hot team active = %d\n",
8015  system_active, pool_active, hot_team_active));
8016 
8017  if (system_active < 0) {
8018  // There was an error reading the necessary info from /proc, so use the
8019  // thread limit algorithm instead. Once we set __kmp_global.g.g_dynamic_mode
8020  // = dynamic_thread_limit, we shouldn't wind up getting back here.
8021  __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
8022  KMP_WARNING(CantLoadBalUsing, "KMP_DYNAMIC_MODE=thread limit");
8023 
8024  // Make this call behave like the thread limit algorithm.
8025  retval = __kmp_avail_proc - __kmp_nth +
8026  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
8027  if (retval > set_nproc) {
8028  retval = set_nproc;
8029  }
8030  if (retval < KMP_MIN_NTH) {
8031  retval = KMP_MIN_NTH;
8032  }
8033 
8034  KB_TRACE(20, ("__kmp_load_balance_nproc: thread limit exit. retval:%d\n",
8035  retval));
8036  return retval;
8037  }
8038 
8039  // There is a slight delay in the load balance algorithm in detecting newly
8040  // running procs. The real system load at this instant should be at least as
8041  // large as the number of active OMP threads available to add to the team.
8042  if (system_active < team_curr_active) {
8043  system_active = team_curr_active;
8044  }
8045  retval = __kmp_avail_proc - system_active + team_curr_active;
8046  if (retval > set_nproc) {
8047  retval = set_nproc;
8048  }
8049  if (retval < KMP_MIN_NTH) {
8050  retval = KMP_MIN_NTH;
8051  }
8052 
8053  KB_TRACE(20, ("__kmp_load_balance_nproc: exit. retval:%d\n", retval));
8054  return retval;
8055 } // __kmp_load_balance_nproc()
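// Worked example (assumed values): with __kmp_avail_proc == 16, two threads
// active in the pool, three workers active in the hot team and the current
// thread, team_curr_active == 2 + 3 + 1 == 6. If /proc reports
// system_active == 10, the code above allows 16 - 10 + 6 == 12 threads, which
// is then capped at set_nproc and floored at KMP_MIN_NTH.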
8056 
8057 #endif /* USE_LOAD_BALANCE */
8058 
8059 /* ------------------------------------------------------------------------ */
8060 
8061 /* NOTE: this is called with the __kmp_init_lock held */
8062 void __kmp_cleanup(void) {
8063  int f;
8064 
8065  KA_TRACE(10, ("__kmp_cleanup: enter\n"));
8066 
8067  if (TCR_4(__kmp_init_parallel)) {
8068 #if KMP_HANDLE_SIGNALS
8069  __kmp_remove_signals();
8070 #endif
8071  TCW_4(__kmp_init_parallel, FALSE);
8072  }
8073 
8074  if (TCR_4(__kmp_init_middle)) {
8075 #if KMP_AFFINITY_SUPPORTED
8076  __kmp_affinity_uninitialize();
8077 #endif /* KMP_AFFINITY_SUPPORTED */
8078  __kmp_cleanup_hierarchy();
8079  TCW_4(__kmp_init_middle, FALSE);
8080  }
8081 
8082  KA_TRACE(10, ("__kmp_cleanup: go serial cleanup\n"));
8083 
8084  if (__kmp_init_serial) {
8085  __kmp_runtime_destroy();
8086  __kmp_init_serial = FALSE;
8087  }
8088 
8089  __kmp_cleanup_threadprivate_caches();
8090 
8091  for (f = 0; f < __kmp_threads_capacity; f++) {
8092  if (__kmp_root[f] != NULL) {
8093  __kmp_free(__kmp_root[f]);
8094  __kmp_root[f] = NULL;
8095  }
8096  }
8097  __kmp_free(__kmp_threads);
8098  // __kmp_threads and __kmp_root were allocated at once, as a single block, so
8099  // there is no need to free __kmp_root separately.
8100  __kmp_threads = NULL;
8101  __kmp_root = NULL;
8102  __kmp_threads_capacity = 0;
8103 
8104 #if KMP_USE_DYNAMIC_LOCK
8105  __kmp_cleanup_indirect_user_locks();
8106 #else
8107  __kmp_cleanup_user_locks();
8108 #endif
8109 #if OMPD_SUPPORT
8110  if (ompd_state) {
8111  __kmp_free(ompd_env_block);
8112  ompd_env_block = NULL;
8113  ompd_env_block_size = 0;
8114  }
8115 #endif
8116 
8117 #if KMP_AFFINITY_SUPPORTED
8118  KMP_INTERNAL_FREE(CCAST(char *, __kmp_cpuinfo_file));
8119  __kmp_cpuinfo_file = NULL;
8120 #endif /* KMP_AFFINITY_SUPPORTED */
8121 
8122 #if KMP_USE_ADAPTIVE_LOCKS
8123 #if KMP_DEBUG_ADAPTIVE_LOCKS
8124  __kmp_print_speculative_stats();
8125 #endif
8126 #endif
8127  KMP_INTERNAL_FREE(__kmp_nested_nth.nth);
8128  __kmp_nested_nth.nth = NULL;
8129  __kmp_nested_nth.size = 0;
8130  __kmp_nested_nth.used = 0;
8131  KMP_INTERNAL_FREE(__kmp_nested_proc_bind.bind_types);
8132  __kmp_nested_proc_bind.bind_types = NULL;
8133  __kmp_nested_proc_bind.size = 0;
8134  __kmp_nested_proc_bind.used = 0;
8135  if (__kmp_affinity_format) {
8136  KMP_INTERNAL_FREE(__kmp_affinity_format);
8137  __kmp_affinity_format = NULL;
8138  }
8139 
8140  __kmp_i18n_catclose();
8141 
8142 #if KMP_USE_HIER_SCHED
8143  __kmp_hier_scheds.deallocate();
8144 #endif
8145 
8146 #if KMP_STATS_ENABLED
8147  __kmp_stats_fini();
8148 #endif
8149 
8150  KA_TRACE(10, ("__kmp_cleanup: exit\n"));
8151 }
8152 
8153 /* ------------------------------------------------------------------------ */
8154 
8155 int __kmp_ignore_mppbeg(void) {
8156  char *env;
8157 
8158  if ((env = getenv("KMP_IGNORE_MPPBEG")) != NULL) {
8159  if (__kmp_str_match_false(env))
8160  return FALSE;
8161  }
8162  // By default __kmpc_begin() is no-op.
8163  return TRUE;
8164 }
8165 
8166 int __kmp_ignore_mppend(void) {
8167  char *env;
8168 
8169  if ((env = getenv("KMP_IGNORE_MPPEND")) != NULL) {
8170  if (__kmp_str_match_false(env))
8171  return FALSE;
8172  }
8173  // By default __kmpc_end() is no-op.
8174  return TRUE;
8175 }
8176 
8177 void __kmp_internal_begin(void) {
8178  int gtid;
8179  kmp_root_t *root;
8180 
8181  /* This is a very important step, as it will register new sibling threads
8182  and assign these new uber threads a new gtid */
8183  gtid = __kmp_entry_gtid();
8184  root = __kmp_threads[gtid]->th.th_root;
8185  KMP_ASSERT(KMP_UBER_GTID(gtid));
8186 
8187  if (root->r.r_begin)
8188  return;
8189  __kmp_acquire_lock(&root->r.r_begin_lock, gtid);
8190  if (root->r.r_begin) {
8191  __kmp_release_lock(&root->r.r_begin_lock, gtid);
8192  return;
8193  }
8194 
8195  root->r.r_begin = TRUE;
8196 
8197  __kmp_release_lock(&root->r.r_begin_lock, gtid);
8198 }
8199 
8200 /* ------------------------------------------------------------------------ */
8201 
8202 void __kmp_user_set_library(enum library_type arg) {
8203  int gtid;
8204  kmp_root_t *root;
8205  kmp_info_t *thread;
8206 
8207  /* first, make sure we are initialized so we can get our gtid */
8208 
8209  gtid = __kmp_entry_gtid();
8210  thread = __kmp_threads[gtid];
8211 
8212  root = thread->th.th_root;
8213 
8214  KA_TRACE(20, ("__kmp_user_set_library: enter T#%d, arg: %d, %d\n", gtid, arg,
8215  library_serial));
8216  if (root->r.r_in_parallel) { /* Must be called in serial section of top-level
8217  thread */
8218  KMP_WARNING(SetLibraryIncorrectCall);
8219  return;
8220  }
8221 
8222  switch (arg) {
8223  case library_serial:
8224  thread->th.th_set_nproc = 0;
8225  set__nproc(thread, 1);
8226  break;
8227  case library_turnaround:
8228  thread->th.th_set_nproc = 0;
8229  set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
8230  : __kmp_dflt_team_nth_ub);
8231  break;
8232  case library_throughput:
8233  thread->th.th_set_nproc = 0;
8234  set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
8235  : __kmp_dflt_team_nth_ub);
8236  break;
8237  default:
8238  KMP_FATAL(UnknownLibraryType, arg);
8239  }
8240 
8241  __kmp_aux_set_library(arg);
8242 }
8243 
8244 void __kmp_aux_set_stacksize(size_t arg) {
8245  if (!__kmp_init_serial)
8246  __kmp_serial_initialize();
8247 
8248 #if KMP_OS_DARWIN
8249  if (arg & (0x1000 - 1)) {
8250  arg &= ~(0x1000 - 1);
8251  if (arg + 0x1000) /* check for overflow if we round up */
8252  arg += 0x1000;
8253  }
8254 #endif
8255  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
8256 
8257  /* only change the default stacksize before the first parallel region */
8258  if (!TCR_4(__kmp_init_parallel)) {
8259  size_t value = arg; /* argument is in bytes */
8260 
8261  if (value < __kmp_sys_min_stksize)
8262  value = __kmp_sys_min_stksize;
8263  else if (value > KMP_MAX_STKSIZE)
8264  value = KMP_MAX_STKSIZE;
8265 
8266  __kmp_stksize = value;
8267 
8268  __kmp_env_stksize = TRUE; /* was KMP_STACKSIZE specified? */
8269  }
8270 
8271  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
8272 }
8273 
8274 /* set the behaviour of the runtime library */
8275 /* TODO this can cause some odd behaviour with sibling parallelism... */
8276 void __kmp_aux_set_library(enum library_type arg) {
8277  __kmp_library = arg;
8278 
8279  switch (__kmp_library) {
8280  case library_serial: {
8281  KMP_INFORM(LibraryIsSerial);
8282  } break;
8283  case library_turnaround:
8284  if (__kmp_use_yield == 1 && !__kmp_use_yield_exp_set)
8285  __kmp_use_yield = 2; // only yield when oversubscribed
8286  break;
8287  case library_throughput:
8288  if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME)
8289  __kmp_dflt_blocktime = 200;
8290  break;
8291  default:
8292  KMP_FATAL(UnknownLibraryType, arg);
8293  }
8294 }
8295 
8296 /* Get team information common to all team API calls */
8297 // Returns NULL if not in teams construct
8298 static kmp_team_t *__kmp_aux_get_team_info(int &teams_serialized) {
8299  kmp_info_t *thr = __kmp_entry_thread();
8300  teams_serialized = 0;
8301  if (thr->th.th_teams_microtask) {
8302  kmp_team_t *team = thr->th.th_team;
8303  int tlevel = thr->th.th_teams_level; // the level of the teams construct
8304  int ii = team->t.t_level;
8305  teams_serialized = team->t.t_serialized;
8306  int level = tlevel + 1;
8307  KMP_DEBUG_ASSERT(ii >= tlevel);
8308  while (ii > level) {
8309  for (teams_serialized = team->t.t_serialized;
8310  (teams_serialized > 0) && (ii > level); teams_serialized--, ii--) {
8311  }
8312  if (team->t.t_serialized && (!teams_serialized)) {
8313  team = team->t.t_parent;
8314  continue;
8315  }
8316  if (ii > level) {
8317  team = team->t.t_parent;
8318  ii--;
8319  }
8320  }
8321  return team;
8322  }
8323  return NULL;
8324 }
8325 
8326 int __kmp_aux_get_team_num() {
8327  int serialized;
8328  kmp_team_t *team = __kmp_aux_get_team_info(serialized);
8329  if (team) {
8330  if (serialized > 1) {
8331  return 0; // teams region is serialized ( 1 team of 1 thread ).
8332  } else {
8333  return team->t.t_master_tid;
8334  }
8335  }
8336  return 0;
8337 }
8338 
8339 int __kmp_aux_get_num_teams() {
8340  int serialized;
8341  kmp_team_t *team = __kmp_aux_get_team_info(serialized);
8342  if (team) {
8343  if (serialized > 1) {
8344  return 1;
8345  } else {
8346  return team->t.t_parent->t.t_nproc;
8347  }
8348  }
8349  return 1;
8350 }
8351 
8352 /* ------------------------------------------------------------------------ */
8353 
8354 /*
8355  * Affinity Format Parser
8356  *
8357  * Field is in form of: %[[[0].]size]type
8358  * % and type are required (%% means print a literal '%')
8359  * type is either single char or long name surrounded by {},
8360  * e.g., N or {num_threads}
8361  * 0 => leading zeros
8362  * . => right justified when size is specified
8363  * by default output is left justified
8364  * size is the *minimum* field length
8365  * All other characters are printed as is
8366  *
8367  * Available field types (matching __kmp_affinity_format_table below):
8368  * t {team_num} - omp_get_team_num()
8369  * T {num_teams} - omp_get_num_teams()
8370  * L {nesting_level} - omp_get_level()
8371  * n {thread_num} - omp_get_thread_num()
8372  * N {num_threads} - omp_get_num_threads()
8373  * a {ancestor_tnum} - omp_get_ancestor_thread_num(omp_get_level()-1)
8374  * H {host} - name of host machine
8375  * P {process_id} - process id (integer)
8376  * i {native_thread_id} - native thread identifier (integer)
8377  * A {thread_affinity} - comma-separated integers/ranges (affinity mask values)
8378  * Implementation-specific field types can be added
8379  * If a type is unknown, print "undefined"
8380  */
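// Examples of the syntax described above (illustrative only): "Thread %0.4n on
// host %{host}" mixes a short name with a long name and literal text, printing
// the thread number right justified in a zero-padded field of width 4 followed
// by the host name. An unknown type such as "%{bogus_field}" prints
// "undefined" as required by the spec.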
8381 
8382 // Structure holding the short name, long name, and corresponding data type
8383 // for snprintf. A table of these will represent the entire valid keyword
8384 // field types.
8385 typedef struct kmp_affinity_format_field_t {
8386  char short_name; // from spec e.g., L -> thread level
8387  const char *long_name; // from spec thread_level -> thread level
8388  char field_format; // data type for snprintf (typically 'd' or 's'
8389  // for integer or string)
8390 } kmp_affinity_format_field_t;
8391 
8392 static const kmp_affinity_format_field_t __kmp_affinity_format_table[] = {
8393 #if KMP_AFFINITY_SUPPORTED
8394  {'A', "thread_affinity", 's'},
8395 #endif
8396  {'t', "team_num", 'd'},
8397  {'T', "num_teams", 'd'},
8398  {'L', "nesting_level", 'd'},
8399  {'n', "thread_num", 'd'},
8400  {'N', "num_threads", 'd'},
8401  {'a', "ancestor_tnum", 'd'},
8402  {'H', "host", 's'},
8403  {'P', "process_id", 'd'},
8404  {'i', "native_thread_id", 'd'}};
8405 
8406 // Return the number of characters it takes to hold the field
8407 static int __kmp_aux_capture_affinity_field(int gtid, const kmp_info_t *th,
8408  const char **ptr,
8409  kmp_str_buf_t *field_buffer) {
8410  int rc, format_index, field_value;
8411  const char *width_left, *width_right;
8412  bool pad_zeros, right_justify, parse_long_name, found_valid_name;
8413  static const int FORMAT_SIZE = 20;
8414  char format[FORMAT_SIZE] = {0};
8415  char absolute_short_name = 0;
8416 
8417  KMP_DEBUG_ASSERT(gtid >= 0);
8418  KMP_DEBUG_ASSERT(th);
8419  KMP_DEBUG_ASSERT(**ptr == '%');
8420  KMP_DEBUG_ASSERT(field_buffer);
8421 
8422  __kmp_str_buf_clear(field_buffer);
8423 
8424  // Skip the initial %
8425  (*ptr)++;
8426 
8427  // Check for %% first
8428  if (**ptr == '%') {
8429  __kmp_str_buf_cat(field_buffer, "%", 1);
8430  (*ptr)++; // skip over the second %
8431  return 1;
8432  }
8433 
8434  // Parse field modifiers if they are present
8435  pad_zeros = false;
8436  if (**ptr == '0') {
8437  pad_zeros = true;
8438  (*ptr)++; // skip over 0
8439  }
8440  right_justify = false;
8441  if (**ptr == '.') {
8442  right_justify = true;
8443  (*ptr)++; // skip over .
8444  }
8445  // Parse width of field: [width_left, width_right)
8446  width_left = width_right = NULL;
8447  if (**ptr >= '0' && **ptr <= '9') {
8448  width_left = *ptr;
8449  SKIP_DIGITS(*ptr);
8450  width_right = *ptr;
8451  }
8452 
8453  // Create the format for KMP_SNPRINTF based on flags parsed above
8454  format_index = 0;
8455  format[format_index++] = '%';
8456  if (!right_justify)
8457  format[format_index++] = '-';
8458  if (pad_zeros)
8459  format[format_index++] = '0';
8460  if (width_left && width_right) {
8461  int i = 0;
8462  // Only allow 8 digit number widths.
8463  // This also prevents overflowing the format variable
8464  while (i < 8 && width_left < width_right) {
8465  format[format_index++] = *width_left;
8466  width_left++;
8467  i++;
8468  }
8469  }
8470 
8471  // Parse a name (long or short)
8472  // Canonicalize the name into absolute_short_name
8473  found_valid_name = false;
8474  parse_long_name = (**ptr == '{');
8475  if (parse_long_name)
8476  (*ptr)++; // skip initial left brace
8477  for (size_t i = 0; i < sizeof(__kmp_affinity_format_table) /
8478  sizeof(__kmp_affinity_format_table[0]);
8479  ++i) {
8480  char short_name = __kmp_affinity_format_table[i].short_name;
8481  const char *long_name = __kmp_affinity_format_table[i].long_name;
8482  char field_format = __kmp_affinity_format_table[i].field_format;
8483  if (parse_long_name) {
8484  size_t length = KMP_STRLEN(long_name);
8485  if (strncmp(*ptr, long_name, length) == 0) {
8486  found_valid_name = true;
8487  (*ptr) += length; // skip the long name
8488  }
8489  } else if (**ptr == short_name) {
8490  found_valid_name = true;
8491  (*ptr)++; // skip the short name
8492  }
8493  if (found_valid_name) {
8494  format[format_index++] = field_format;
8495  format[format_index++] = '\0';
8496  absolute_short_name = short_name;
8497  break;
8498  }
8499  }
8500  if (parse_long_name) {
8501  if (**ptr != '}') {
8502  absolute_short_name = 0;
8503  } else {
8504  (*ptr)++; // skip over the right brace
8505  }
8506  }
8507 
8508  // Attempt to fill the buffer with the requested
8509  // value using snprintf within __kmp_str_buf_print()
8510  switch (absolute_short_name) {
8511  case 't':
8512  rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_team_num());
8513  break;
8514  case 'T':
8515  rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_num_teams());
8516  break;
8517  case 'L':
8518  rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_level);
8519  break;
8520  case 'n':
8521  rc = __kmp_str_buf_print(field_buffer, format, __kmp_tid_from_gtid(gtid));
8522  break;
8523  case 'H': {
8524  static const int BUFFER_SIZE = 256;
8525  char buf[BUFFER_SIZE];
8526  __kmp_expand_host_name(buf, BUFFER_SIZE);
8527  rc = __kmp_str_buf_print(field_buffer, format, buf);
8528  } break;
8529  case 'P':
8530  rc = __kmp_str_buf_print(field_buffer, format, getpid());
8531  break;
8532  case 'i':
8533  rc = __kmp_str_buf_print(field_buffer, format, __kmp_gettid());
8534  break;
8535  case 'N':
8536  rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_nproc);
8537  break;
8538  case 'a':
8539  field_value =
8540  __kmp_get_ancestor_thread_num(gtid, th->th.th_team->t.t_level - 1);
8541  rc = __kmp_str_buf_print(field_buffer, format, field_value);
8542  break;
8543 #if KMP_AFFINITY_SUPPORTED
8544  case 'A': {
8545  kmp_str_buf_t buf;
8546  __kmp_str_buf_init(&buf);
8547  __kmp_affinity_str_buf_mask(&buf, th->th.th_affin_mask);
8548  rc = __kmp_str_buf_print(field_buffer, format, buf.str);
8549  __kmp_str_buf_free(&buf);
8550  } break;
8551 #endif
8552  default:
8553  // According to the spec, if an implementation does not have info for the
8554  // field type, then "undefined" is printed
8555  rc = __kmp_str_buf_print(field_buffer, "%s", "undefined");
8556  // Skip the field
8557  if (parse_long_name) {
8558  SKIP_TOKEN(*ptr);
8559  if (**ptr == '}')
8560  (*ptr)++;
8561  } else {
8562  (*ptr)++;
8563  }
8564  }
8565 
8566  KMP_ASSERT(format_index <= FORMAT_SIZE);
8567  return rc;
8568 }
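// Worked example (illustrative): for a field such as "%0.4n", the parser above
// sets pad_zeros and right_justify, records width "4" and canonicalizes the
// short name 'n', so the snprintf format becomes "%04d" and the buffer
// receives the thread number zero padded to four digits. For "%{host}" the
// long-name path matches the table entry {'H', "host", 's'} and builds "%-s"
// (left justified, no width).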
8569 
8570 /*
8571  * Return the number of characters needed to hold the affinity string
8572  * (not including the terminating null byte).
8573  * The resulting string is printed to buffer, which the caller can then
8574  * handle afterwards.
8575  */
8576 size_t __kmp_aux_capture_affinity(int gtid, const char *format,
8577  kmp_str_buf_t *buffer) {
8578  const char *parse_ptr;
8579  size_t retval;
8580  const kmp_info_t *th;
8581  kmp_str_buf_t field;
8582 
8583  KMP_DEBUG_ASSERT(buffer);
8584  KMP_DEBUG_ASSERT(gtid >= 0);
8585 
8586  __kmp_str_buf_init(&field);
8587  __kmp_str_buf_clear(buffer);
8588 
8589  th = __kmp_threads[gtid];
8590  retval = 0;
8591 
8592  // If format is NULL or zero-length string, then we use
8593  // affinity-format-var ICV
8594  parse_ptr = format;
8595  if (parse_ptr == NULL || *parse_ptr == '\0') {
8596  parse_ptr = __kmp_affinity_format;
8597  }
8598  KMP_DEBUG_ASSERT(parse_ptr);
8599 
8600  while (*parse_ptr != '\0') {
8601  // Parse a field
8602  if (*parse_ptr == '%') {
8603  // Put field in the buffer
8604  int rc = __kmp_aux_capture_affinity_field(gtid, th, &parse_ptr, &field);
8605  __kmp_str_buf_catbuf(buffer, &field);
8606  retval += rc;
8607  } else {
8608  // Put literal character in buffer
8609  __kmp_str_buf_cat(buffer, parse_ptr, 1);
8610  retval++;
8611  parse_ptr++;
8612  }
8613  }
8614  __kmp_str_buf_free(&field);
8615  return retval;
8616 }
8617 
8618 // Displays the affinity string to stdout
8619 void __kmp_aux_display_affinity(int gtid, const char *format) {
8620  kmp_str_buf_t buf;
8621  __kmp_str_buf_init(&buf);
8622  __kmp_aux_capture_affinity(gtid, format, &buf);
8623  __kmp_fprintf(kmp_out, "%s" KMP_END_OF_LINE, buf.str);
8624  __kmp_str_buf_free(&buf);
8625 }
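// Illustrative sketch (not part of the runtime, assuming the standard
// omp_display_affinity() entry point forwards to the routines above): a
// hypothetical format such as "OMP: pid %P tid %n affinity %A" could produce a
// line like "OMP: pid 12345 tid 0 affinity 0-3" on a 4-core box; the exact
// output depends on the format string and the current affinity mask.
#if 0
#include <omp.h>
void show_binding() {
#pragma omp parallel
  omp_display_affinity("OMP: pid %P tid %n affinity %A"); // one line per thread
}
#endif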
8626 
8627 /* ------------------------------------------------------------------------ */
8628 
8629 void __kmp_aux_set_blocktime(int arg, kmp_info_t *thread, int tid) {
8630  int blocktime = arg; /* argument is in milliseconds */
8631 #if KMP_USE_MONITOR
8632  int bt_intervals;
8633 #endif
8634  kmp_int8 bt_set;
8635 
8636  __kmp_save_internal_controls(thread);
8637 
8638  /* Normalize and set blocktime for the teams */
8639  if (blocktime < KMP_MIN_BLOCKTIME)
8640  blocktime = KMP_MIN_BLOCKTIME;
8641  else if (blocktime > KMP_MAX_BLOCKTIME)
8642  blocktime = KMP_MAX_BLOCKTIME;
8643 
8644  set__blocktime_team(thread->th.th_team, tid, blocktime);
8645  set__blocktime_team(thread->th.th_serial_team, 0, blocktime);
8646 
8647 #if KMP_USE_MONITOR
8648  /* Calculate and set blocktime intervals for the teams */
8649  bt_intervals = KMP_INTERVALS_FROM_BLOCKTIME(blocktime, __kmp_monitor_wakeups);
8650 
8651  set__bt_intervals_team(thread->th.th_team, tid, bt_intervals);
8652  set__bt_intervals_team(thread->th.th_serial_team, 0, bt_intervals);
8653 #endif
8654 
8655  /* Set whether blocktime has been set to "TRUE" */
8656  bt_set = TRUE;
8657 
8658  set__bt_set_team(thread->th.th_team, tid, bt_set);
8659  set__bt_set_team(thread->th.th_serial_team, 0, bt_set);
8660 #if KMP_USE_MONITOR
8661  KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d, "
8662  "bt_intervals=%d, monitor_updates=%d\n",
8663  __kmp_gtid_from_tid(tid, thread->th.th_team),
8664  thread->th.th_team->t.t_id, tid, blocktime, bt_intervals,
8665  __kmp_monitor_wakeups));
8666 #else
8667  KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d\n",
8668  __kmp_gtid_from_tid(tid, thread->th.th_team),
8669  thread->th.th_team->t.t_id, tid, blocktime));
8670 #endif
8671 }
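// Illustrative sketch (assumption: the user-visible kmp_set_blocktime() entry
// point in the wrapper layer forwards here). The requested value is clamped to
// [KMP_MIN_BLOCKTIME, KMP_MAX_BLOCKTIME] before being stored for both the
// current team and the serial team.
#if 0
#include <omp.h>
void sleep_immediately_at_barriers() {
  kmp_set_blocktime(0); // idle threads stop spinning right away
}
#endif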
8672 
8673 void __kmp_aux_set_defaults(char const *str, size_t len) {
8674  if (!__kmp_init_serial) {
8675  __kmp_serial_initialize();
8676  }
8677  __kmp_env_initialize(str);
8678 
8679  if (__kmp_settings || __kmp_display_env || __kmp_display_env_verbose) {
8680  __kmp_env_print();
8681  }
8682 } // __kmp_aux_set_defaults
8683 
8684 /* ------------------------------------------------------------------------ */
8685 /* internal fast reduction routines */
8686 
8687 PACKED_REDUCTION_METHOD_T
8688 __kmp_determine_reduction_method(
8689  ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars, size_t reduce_size,
8690  void *reduce_data, void (*reduce_func)(void *lhs_data, void *rhs_data),
8691  kmp_critical_name *lck) {
8692 
8693  // Default reduction method: critical construct ( lck != NULL, like in current
8694  // PAROPT )
8695  // If ( reduce_data!=NULL && reduce_func!=NULL ): the tree-reduction method
8696  // can be selected by RTL
8697  // If loc->flags contains KMP_IDENT_ATOMIC_REDUCE, the atomic reduce method
8698  // can be selected by RTL
8699  // Finally, it's up to the OpenMP RTL to decide which method to select among
8700  // those generated by PAROPT.
8701 
8702  PACKED_REDUCTION_METHOD_T retval;
8703 
8704  int team_size;
8705 
8706  KMP_DEBUG_ASSERT(loc); // it would be nice to test ( loc != 0 )
8707  KMP_DEBUG_ASSERT(lck); // it would be nice to test ( lck != 0 )
8708 
8709 #define FAST_REDUCTION_ATOMIC_METHOD_GENERATED \
8710  ((loc->flags & (KMP_IDENT_ATOMIC_REDUCE)) == (KMP_IDENT_ATOMIC_REDUCE))
8711 #define FAST_REDUCTION_TREE_METHOD_GENERATED ((reduce_data) && (reduce_func))
8712 
8713  retval = critical_reduce_block;
8714 
8715  // another choice of getting a team size (with 1 dynamic dereference) is slower
8716  team_size = __kmp_get_team_num_threads(global_tid);
8717  if (team_size == 1) {
8718 
8719  retval = empty_reduce_block;
8720 
8721  } else {
8722 
8723  int atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
8724 
8725 #if KMP_ARCH_X86_64 || KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 || \
8726  KMP_ARCH_MIPS64 || KMP_ARCH_RISCV64
8727 
8728 #if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD || \
8729  KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD
8730 
8731  int teamsize_cutoff = 4;
8732 
8733 #if KMP_MIC_SUPPORTED
8734  if (__kmp_mic_type != non_mic) {
8735  teamsize_cutoff = 8;
8736  }
8737 #endif
8738  int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8739  if (tree_available) {
8740  if (team_size <= teamsize_cutoff) {
8741  if (atomic_available) {
8742  retval = atomic_reduce_block;
8743  }
8744  } else {
8745  retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
8746  }
8747  } else if (atomic_available) {
8748  retval = atomic_reduce_block;
8749  }
8750 #else
8751 #error "Unknown or unsupported OS"
8752 #endif // KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD ||
8753  // KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD
8754 
8755 #elif KMP_ARCH_X86 || KMP_ARCH_ARM || KMP_ARCH_AARCH || KMP_ARCH_MIPS
8756 
8757 #if KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_WINDOWS || KMP_OS_HURD
8758 
8759  // basic tuning
8760 
8761  if (atomic_available) {
8762  if (num_vars <= 2) { // && ( team_size <= 8 ) due to false-sharing ???
8763  retval = atomic_reduce_block;
8764  }
8765  } // otherwise: use critical section
8766 
8767 #elif KMP_OS_DARWIN
8768 
8769  int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8770  if (atomic_available && (num_vars <= 3)) {
8771  retval = atomic_reduce_block;
8772  } else if (tree_available) {
8773  if ((reduce_size > (9 * sizeof(kmp_real64))) &&
8774  (reduce_size < (2000 * sizeof(kmp_real64)))) {
8775  retval = TREE_REDUCE_BLOCK_WITH_PLAIN_BARRIER;
8776  }
8777  } // otherwise: use critical section
8778 
8779 #else
8780 #error "Unknown or unsupported OS"
8781 #endif
8782 
8783 #else
8784 #error "Unknown or unsupported architecture"
8785 #endif
8786  }
8787 
8788  // KMP_FORCE_REDUCTION
8789 
8790  // If the team is serialized (team_size == 1), ignore the forced reduction
8791  // method and stay with the unsynchronized method (empty_reduce_block)
8792  if (__kmp_force_reduction_method != reduction_method_not_defined &&
8793  team_size != 1) {
8794 
8795  PACKED_REDUCTION_METHOD_T forced_retval = critical_reduce_block;
8796 
8797  int atomic_available, tree_available;
8798 
8799  switch ((forced_retval = __kmp_force_reduction_method)) {
8800  case critical_reduce_block:
8801  KMP_ASSERT(lck); // lck should be != 0
8802  break;
8803 
8804  case atomic_reduce_block:
8805  atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
8806  if (!atomic_available) {
8807  KMP_WARNING(RedMethodNotSupported, "atomic");
8808  forced_retval = critical_reduce_block;
8809  }
8810  break;
8811 
8812  case tree_reduce_block:
8813  tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8814  if (!tree_available) {
8815  KMP_WARNING(RedMethodNotSupported, "tree");
8816  forced_retval = critical_reduce_block;
8817  } else {
8818 #if KMP_FAST_REDUCTION_BARRIER
8819  forced_retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
8820 #endif
8821  }
8822  break;
8823 
8824  default:
8825  KMP_ASSERT(0); // "unsupported method specified"
8826  }
8827 
8828  retval = forced_retval;
8829  }
8830 
8831  KA_TRACE(10, ("reduction method selected=%08x\n", retval));
8832 
8833 #undef FAST_REDUCTION_TREE_METHOD_GENERATED
8834 #undef FAST_REDUCTION_ATOMIC_METHOD_GENERATED
8835 
8836  return (retval);
8837 }
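// Worked example (assumed x86_64 Linux, no KMP_FORCE_REDUCTION override): with
// a team of 4 threads and both tree and atomic variants generated, the cutoff
// above is 4, so atomic_reduce_block is chosen; with a team of 16 threads the
// tree method (TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER) is chosen instead. A
// serialized team (team_size == 1) always gets empty_reduce_block.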
8838 // this function is for testing set/get/determine reduce method
8839 kmp_int32 __kmp_get_reduce_method(void) {
8840  return ((__kmp_entry_thread()->th.th_local.packed_reduction_method) >> 8);
8841 }
8842 
8843 // Soft pause sets up threads to ignore blocktime and just go to sleep.
8844 // Spin-wait code checks __kmp_pause_status and reacts accordingly.
8845 void __kmp_soft_pause() { __kmp_pause_status = kmp_soft_paused; }
8846 
8847 // Hard pause shuts down the runtime completely. Resume happens naturally when
8848 // OpenMP is used subsequently.
8849 void __kmp_hard_pause() {
8850  __kmp_pause_status = kmp_hard_paused;
8851  __kmp_internal_end_thread(-1);
8852 }
8853 
8854 // Soft resume sets __kmp_pause_status, and wakes up all threads.
8855 void __kmp_resume_if_soft_paused() {
8856  if (__kmp_pause_status == kmp_soft_paused) {
8857  __kmp_pause_status = kmp_not_paused;
8858 
8859  for (int gtid = 1; gtid < __kmp_threads_capacity; ++gtid) {
8860  kmp_info_t *thread = __kmp_threads[gtid];
8861  if (thread) { // Wake it if sleeping
8862  kmp_flag_64<> fl(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go,
8863  thread);
8864  if (fl.is_sleeping())
8865  fl.resume(gtid);
8866  else if (__kmp_try_suspend_mx(thread)) { // got suspend lock
8867  __kmp_unlock_suspend_mx(thread); // unlock it; it won't sleep
8868  } else { // thread holds the lock and may sleep soon
8869  do { // until either the thread sleeps, or we can get the lock
8870  if (fl.is_sleeping()) {
8871  fl.resume(gtid);
8872  break;
8873  } else if (__kmp_try_suspend_mx(thread)) {
8874  __kmp_unlock_suspend_mx(thread);
8875  break;
8876  }
8877  } while (1);
8878  }
8879  }
8880  }
8881  }
8882 }
8883 
8884 // This function is called via __kmpc_pause_resource. Returns 0 if successful.
8885 // TODO: add warning messages
8886 int __kmp_pause_resource(kmp_pause_status_t level) {
8887  if (level == kmp_not_paused) { // requesting resume
8888  if (__kmp_pause_status == kmp_not_paused) {
8889  // error message about runtime not being paused, so can't resume
8890  return 1;
8891  } else {
8892  KMP_DEBUG_ASSERT(__kmp_pause_status == kmp_soft_paused ||
8893  __kmp_pause_status == kmp_hard_paused);
8894  __kmp_pause_status = kmp_not_paused;
8895  return 0;
8896  }
8897  } else if (level == kmp_soft_paused) { // requesting soft pause
8898  if (__kmp_pause_status != kmp_not_paused) {
8899  // error message about already being paused
8900  return 1;
8901  } else {
8902  __kmp_soft_pause();
8903  return 0;
8904  }
8905  } else if (level == kmp_hard_paused) { // requesting hard pause
8906  if (__kmp_pause_status != kmp_not_paused) {
8907  // error message about already being paused
8908  return 1;
8909  } else {
8910  __kmp_hard_pause();
8911  return 0;
8912  }
8913  } else {
8914  // error message about invalid level
8915  return 1;
8916  }
8917 }
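// Illustrative sketch (not part of the runtime): the standard entry point
// omp_pause_resource_all() is expected to reach this routine via
// __kmpc_pause_resource, so an application can release OpenMP resources
// between phases roughly as below.
#if 0
#include <omp.h>
void between_phases() {
  // Soft pause: worker threads go to sleep but the runtime stays initialized.
  if (omp_pause_resource_all(omp_pause_soft) != 0) {
    /* pause was rejected, e.g. the runtime was already paused */
  }
  // The next parallel region resumes the runtime automatically.
#pragma omp parallel
  { /* ... */ }
}
#endif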
8918 
8919 void __kmp_omp_display_env(int verbose) {
8920  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
8921  if (__kmp_init_serial == 0)
8922  __kmp_do_serial_initialize();
8923  __kmp_display_env_impl(!verbose, verbose);
8924  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
8925 }
8926 
8927 // The team size is changing, so distributed barrier must be modified
8928 void __kmp_resize_dist_barrier(kmp_team_t *team, int old_nthreads,
8929  int new_nthreads) {
8930  KMP_DEBUG_ASSERT(__kmp_barrier_release_pattern[bs_forkjoin_barrier] ==
8931  bp_dist_bar);
8932  kmp_info_t **other_threads = team->t.t_threads;
8933 
8934  // We want all the workers to stop waiting on the barrier while we adjust the
8935  // size of the team.
8936  for (int f = 1; f < old_nthreads; ++f) {
8937  KMP_DEBUG_ASSERT(other_threads[f] != NULL);
8938  // Ignore threads that are already inactive or not present in the team
8939  if (team->t.t_threads[f]->th.th_used_in_team.load() == 0) {
8940  // teams construct causes thread_limit to get passed in, and some of
8941  // those could be inactive; just ignore them
8942  continue;
8943  }
8944  // If the thread is still transitioning to the in_use state, wait for it
8945  if (team->t.t_threads[f]->th.th_used_in_team.load() == 3) {
8946  while (team->t.t_threads[f]->th.th_used_in_team.load() == 3)
8947  KMP_CPU_PAUSE();
8948  }
8949  // The thread should be in_use now
8950  KMP_DEBUG_ASSERT(team->t.t_threads[f]->th.th_used_in_team.load() == 1);
8951  // Transition to unused state
8952  team->t.t_threads[f]->th.th_used_in_team.store(2);
8953  KMP_DEBUG_ASSERT(team->t.t_threads[f]->th.th_used_in_team.load() == 2);
8954  }
8955  // Release all the workers
8956  kmp_uint64 new_value; // new value for go
8957  new_value = team->t.b->go_release();
8958 
8959  KMP_MFENCE();
8960 
8961  // Workers should see transition status 2 and move to 0, but may need to be
8962  // woken up first
8963  size_t my_go_index;
8964  int count = old_nthreads - 1;
8965  while (count > 0) {
8966  count = old_nthreads - 1;
8967  for (int f = 1; f < old_nthreads; ++f) {
8968  my_go_index = f / team->t.b->threads_per_go;
8969  if (other_threads[f]->th.th_used_in_team.load() != 0) {
8970  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { // Wake up the workers
8971  kmp_atomic_flag_64<> *flag = (kmp_atomic_flag_64<> *)CCAST(
8972  void *, other_threads[f]->th.th_sleep_loc);
8973  __kmp_atomic_resume_64(other_threads[f]->th.th_info.ds.ds_gtid, flag);
8974  }
8975  } else {
8976  KMP_DEBUG_ASSERT(team->t.t_threads[f]->th.th_used_in_team.load() == 0);
8977  count--;
8978  }
8979  }
8980  }
8981  // Now update the barrier size
8982  team->t.b->update_num_threads(new_nthreads);
8983  team->t.b->go_reset();
8984 }
8985 
8986 void __kmp_add_threads_to_team(kmp_team_t *team, int new_nthreads) {
8987  // Add the threads back to the team
8988  KMP_DEBUG_ASSERT(team);
8989  // Threads were paused and pointed at th_used_in_team temporarily during a
8990  // resize of the team. We're going to set th_used_in_team to 3 to indicate to
8991  // the thread that it should transition itself back into the team. Then, if
8992  // blocktime isn't infinite, the thread could be sleeping, so we send a resume
8993  // to wake it up.
8994  for (int f = 1; f < new_nthreads; ++f) {
8995  KMP_DEBUG_ASSERT(team->t.t_threads[f]);
8996  KMP_COMPARE_AND_STORE_ACQ32(&(team->t.t_threads[f]->th.th_used_in_team), 0,
8997  3);
8998  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { // Wake up sleeping threads
8999  __kmp_resume_32(team->t.t_threads[f]->th.th_info.ds.ds_gtid,
9000  (kmp_flag_32<false, false> *)NULL);
9001  }
9002  }
9003  // The threads should be transitioning to the team; when they are done, they
9004  // should have set th_used_in_team to 1. This loop forces the primary thread
9005  // to wait until all threads have moved into the team and wait in the barrier.
9006  int count = new_nthreads - 1;
9007  while (count > 0) {
9008  count = new_nthreads - 1;
9009  for (int f = 1; f < new_nthreads; ++f) {
9010  if (team->t.t_threads[f]->th.th_used_in_team.load() == 1) {
9011  count--;
9012  }
9013  }
9014  }
9015 }
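// Summary of the th_used_in_team states used by the two routines above
// (inferred from the code): 0 = not part of the team, 1 = in use, 2 = set by
// __kmp_resize_dist_barrier to tell a worker to leave (the worker moves
// 2 -> 0), and 3 = set by __kmp_add_threads_to_team to tell a worker to rejoin
// (the worker moves 3 -> 1 once it is back in the team and waiting in the
// barrier).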
9016 
9017 // Globals and functions for hidden helper task
9018 kmp_info_t **__kmp_hidden_helper_threads;
9019 kmp_info_t *__kmp_hidden_helper_main_thread;
9020 std::atomic<kmp_int32> __kmp_unexecuted_hidden_helper_tasks;
9021 #if KMP_OS_LINUX
9022 kmp_int32 __kmp_hidden_helper_threads_num = 8;
9023 kmp_int32 __kmp_enable_hidden_helper = TRUE;
9024 #else
9025 kmp_int32 __kmp_hidden_helper_threads_num = 0;
9026 kmp_int32 __kmp_enable_hidden_helper = FALSE;
9027 #endif
9028 
9029 namespace {
9030 std::atomic<kmp_int32> __kmp_hit_hidden_helper_threads_num;
9031 
9032 void __kmp_hidden_helper_wrapper_fn(int *gtid, int *, ...) {
9033  // This is an explicit synchronization of all hidden helper threads, in case
9034  // a regular thread pushes a hidden helper task to a hidden helper thread
9035  // that has not been awakened even once since the threads were released by
9036  // the main thread after the team was created.
9037  KMP_ATOMIC_INC(&__kmp_hit_hidden_helper_threads_num);
9038  while (KMP_ATOMIC_LD_ACQ(&__kmp_hit_hidden_helper_threads_num) !=
9039  __kmp_hidden_helper_threads_num)
9040  ;
9041 
9042  // If main thread, then wait for signal
9043  if (__kmpc_master(nullptr, *gtid)) {
9044  // First, unset the initial state and release the initial thread
9045  TCW_4(__kmp_init_hidden_helper_threads, FALSE);
9046  __kmp_hidden_helper_initz_release();
9047  __kmp_hidden_helper_main_thread_wait();
9048  // Now wake up all worker threads
9049  for (int i = 1; i < __kmp_hit_hidden_helper_threads_num; ++i) {
9050  __kmp_hidden_helper_worker_thread_signal();
9051  }
9052  }
9053 }
9054 } // namespace
9055 
9056 void __kmp_hidden_helper_threads_initz_routine() {
9057  // Create a new root for hidden helper team/threads
9058  const int gtid = __kmp_register_root(TRUE);
9059  __kmp_hidden_helper_main_thread = __kmp_threads[gtid];
9060  __kmp_hidden_helper_threads = &__kmp_threads[gtid];
9061  __kmp_hidden_helper_main_thread->th.th_set_nproc =
9062  __kmp_hidden_helper_threads_num;
9063 
9064  KMP_ATOMIC_ST_REL(&__kmp_hit_hidden_helper_threads_num, 0);
9065 
9066  __kmpc_fork_call(nullptr, 0, __kmp_hidden_helper_wrapper_fn);
9067 
9068  // Set the initialization flag to FALSE
9069  TCW_SYNC_4(__kmp_init_hidden_helper, FALSE);
9070 
9071  __kmp_hidden_helper_threads_deinitz_release();
9072 }
9073 
9074 /* Nesting Mode:
9075  Set via KMP_NESTING_MODE, which takes an integer.
9076  Note: we skip duplicate topology levels, and skip levels with only
9077  one entity.
9078  KMP_NESTING_MODE=0 is the default, and doesn't use nesting mode.
9079  KMP_NESTING_MODE=1 sets as many nesting levels as there are distinct levels
9080  in the topology, and initializes the number of threads at each of those
9081  levels to the number of entities at each level, respectively, below the
9082  entity at the parent level.
9083  KMP_NESTING_MODE=N, where N>1, attempts to create up to N nesting levels,
9084  but starts with nesting OFF -- max-active-levels-var is 1 -- and requires
9085  the user to turn nesting on explicitly. This is an even more experimental
9086  option to this experimental feature, and may change or go away in the
9087  future.
9088 */
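// Hypothetical example of KMP_NESTING_MODE=1 (topology values assumed for
// illustration): on a machine with 2 sockets, 8 cores per socket and one
// hardware thread per core, the distinct topology levels yield nesting levels
// of 2 and 8, so the outer parallel region runs 2 threads (one per socket) and
// each of them can run an inner parallel region of 8 threads (one per core),
// with max-active-levels set to that depth.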
9089 
9090 // Allocate space to store nesting levels
9091 void __kmp_init_nesting_mode() {
9092  int levels = KMP_HW_LAST;
9093  __kmp_nesting_mode_nlevels = levels;
9094  __kmp_nesting_nth_level = (int *)KMP_INTERNAL_MALLOC(levels * sizeof(int));
9095  for (int i = 0; i < levels; ++i)
9096  __kmp_nesting_nth_level[i] = 0;
9097  if (__kmp_nested_nth.size < levels) {
9098  __kmp_nested_nth.nth =
9099  (int *)KMP_INTERNAL_REALLOC(__kmp_nested_nth.nth, levels * sizeof(int));
9100  __kmp_nested_nth.size = levels;
9101  }
9102 }
9103 
9104 // Set the number of threads for the top levels of nesting; must be called
9104 // after the topology is set
9105 void __kmp_set_nesting_mode_threads() {
9106  kmp_info_t *thread = __kmp_threads[__kmp_entry_gtid()];
9107 
9108  if (__kmp_nesting_mode == 1)
9109  __kmp_nesting_mode_nlevels = KMP_MAX_ACTIVE_LEVELS_LIMIT;
9110  else if (__kmp_nesting_mode > 1)
9111  __kmp_nesting_mode_nlevels = __kmp_nesting_mode;
9112 
9113  if (__kmp_topology) { // use topology info
9114  int loc, hw_level;
9115  for (loc = 0, hw_level = 0; hw_level < __kmp_topology->get_depth() &&
9116  loc < __kmp_nesting_mode_nlevels;
9117  loc++, hw_level++) {
9118  __kmp_nesting_nth_level[loc] = __kmp_topology->get_ratio(hw_level);
9119  if (__kmp_nesting_nth_level[loc] == 1)
9120  loc--;
9121  }
9122  // Make sure all cores are used
9123  if (__kmp_nesting_mode > 1 && loc > 1) {
9124  int core_level = __kmp_topology->get_level(KMP_HW_CORE);
9125  int num_cores = __kmp_topology->get_count(core_level);
9126  int upper_levels = 1;
9127  for (int level = 0; level < loc - 1; ++level)
9128  upper_levels *= __kmp_nesting_nth_level[level];
9129  if (upper_levels * __kmp_nesting_nth_level[loc - 1] < num_cores)
9130  __kmp_nesting_nth_level[loc - 1] =
9131  num_cores / __kmp_nesting_nth_level[loc - 2];
9132  }
9133  __kmp_nesting_mode_nlevels = loc;
9134  __kmp_nested_nth.used = __kmp_nesting_mode_nlevels;
9135  } else { // no topology info available; provide a reasonable guess
9136  if (__kmp_avail_proc >= 4) {
9137  __kmp_nesting_nth_level[0] = __kmp_avail_proc / 2;
9138  __kmp_nesting_nth_level[1] = 2;
9139  __kmp_nesting_mode_nlevels = 2;
9140  } else {
9141  __kmp_nesting_nth_level[0] = __kmp_avail_proc;
9142  __kmp_nesting_mode_nlevels = 1;
9143  }
9144  __kmp_nested_nth.used = __kmp_nesting_mode_nlevels;
9145  }
9146  for (int i = 0; i < __kmp_nesting_mode_nlevels; ++i) {
9147  __kmp_nested_nth.nth[i] = __kmp_nesting_nth_level[i];
9148  }
9149  set__nproc(thread, __kmp_nesting_nth_level[0]);
9150  if (__kmp_nesting_mode > 1 && __kmp_nesting_mode_nlevels > __kmp_nesting_mode)
9151  __kmp_nesting_mode_nlevels = __kmp_nesting_mode;
9152  if (get__max_active_levels(thread) > 1) {
9153  // if max levels was set, set nesting mode levels to same
9154  __kmp_nesting_mode_nlevels = get__max_active_levels(thread);
9155  }
9156  if (__kmp_nesting_mode == 1) // turn on nesting for this case only
9157  set__max_active_levels(thread, __kmp_nesting_mode_nlevels);
9158 }