LLVM OpenMP* Runtime Library
kmp_barrier.cpp
1 /*
2  * kmp_barrier.cpp
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "kmp.h"
14 #include "kmp_wait_release.h"
15 #include "kmp_itt.h"
16 #include "kmp_os.h"
17 #include "kmp_stats.h"
18 #include "ompt-specific.h"
19 
20 #if KMP_MIC
21 #include <immintrin.h>
22 #define USE_NGO_STORES 1
23 #endif // KMP_MIC
24 
25 #include "tsan_annotations.h"
26 
27 #if KMP_MIC && USE_NGO_STORES
28 // ICV copying
29 #define ngo_load(src) __m512d Vt = _mm512_load_pd((void *)(src))
30 #define ngo_store_icvs(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt)
31 #define ngo_store_go(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt)
32 #define ngo_sync() __asm__ volatile("lock; addl $0,0(%%rsp)" ::: "memory")
33 #else
34 #define ngo_load(src) ((void)0)
35 #define ngo_store_icvs(dst, src) copy_icvs((dst), (src))
36 #define ngo_store_go(dst, src) KMP_MEMCPY((dst), (src), CACHE_LINE)
37 #define ngo_sync() ((void)0)
38 #endif /* KMP_MIC && USE_NGO_STORES */
39 
40 void __kmp_print_structure(void); // Forward declaration
41 
42 // ---------------------------- Barrier Algorithms ----------------------------
43 
44 // Linear Barrier
45 template <bool cancellable = false>
46 static bool __kmp_linear_barrier_gather_template(
47  enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
48  void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
49  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_linear_gather);
50  kmp_team_t *team = this_thr->th.th_team;
51  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
52  kmp_info_t **other_threads = team->t.t_threads;
53 
54  KA_TRACE(
55  20,
56  ("__kmp_linear_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
57  gtid, team->t.t_id, tid, bt));
58  KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
59 
60 #if USE_ITT_BUILD && USE_ITT_NOTIFY
61  // Barrier imbalance - save arrive time to the thread
62  if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
63  this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
64  __itt_get_timestamp();
65  }
66 #endif
67  // We now perform a linear reduction to signal that all of the threads have
68  // arrived.
69  if (!KMP_MASTER_TID(tid)) {
70  KA_TRACE(20,
71  ("__kmp_linear_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d)"
72  "arrived(%p): %llu => %llu\n",
73  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(0, team),
74  team->t.t_id, 0, &thr_bar->b_arrived, thr_bar->b_arrived,
75  thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
76  // Mark arrival to master thread
77  /* After performing this write, a worker thread may not assume that the team
78  is valid any more - it could be deallocated by the master thread at any
79  time. */
80  ANNOTATE_BARRIER_BEGIN(this_thr);
81  kmp_flag_64<> flag(&thr_bar->b_arrived, other_threads[0]);
82  flag.release();
83  } else {
84  kmp_balign_team_t *team_bar = &team->t.t_bar[bt];
85  int nproc = this_thr->th.th_team_nproc;
86  int i;
87  // Don't have to worry about sleep bit here or atomic since team setting
88  kmp_uint64 new_state = team_bar->b_arrived + KMP_BARRIER_STATE_BUMP;
89 
90  // Collect all the worker team member threads.
91  for (i = 1; i < nproc; ++i) {
92 #if KMP_CACHE_MANAGE
93  // Prefetch next thread's arrived count
94  if (i + 1 < nproc)
95  KMP_CACHE_PREFETCH(&other_threads[i + 1]->th.th_bar[bt].bb.b_arrived);
96 #endif /* KMP_CACHE_MANAGE */
97  KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%d) "
98  "arrived(%p) == %llu\n",
99  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(i, team),
100  team->t.t_id, i,
101  &other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state));
102 
103  // Wait for worker thread to arrive
104  if (cancellable) {
105  kmp_flag_64<true, false> flag(
106  &other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state);
107  if (flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj)))
108  return true;
109  } else {
110  kmp_flag_64<> flag(&other_threads[i]->th.th_bar[bt].bb.b_arrived,
111  new_state);
112  flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
113  }
114  ANNOTATE_BARRIER_END(other_threads[i]);
115 #if USE_ITT_BUILD && USE_ITT_NOTIFY
116  // Barrier imbalance - write min of the thread time and the other thread
117  // time to the thread.
118  if (__kmp_forkjoin_frames_mode == 2) {
119  this_thr->th.th_bar_min_time = KMP_MIN(
120  this_thr->th.th_bar_min_time, other_threads[i]->th.th_bar_min_time);
121  }
122 #endif
123  if (reduce) {
124  KA_TRACE(100,
125  ("__kmp_linear_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n",
126  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(i, team),
127  team->t.t_id, i));
128  ANNOTATE_REDUCE_AFTER(reduce);
129  OMPT_REDUCTION_DECL(this_thr, gtid);
130  OMPT_REDUCTION_BEGIN;
131  (*reduce)(this_thr->th.th_local.reduce_data,
132  other_threads[i]->th.th_local.reduce_data);
133  OMPT_REDUCTION_END;
134  ANNOTATE_REDUCE_BEFORE(reduce);
135  ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
136  }
137  }
138  // Don't have to worry about sleep bit here or atomic since team setting
139  team_bar->b_arrived = new_state;
140  KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) set team %d "
141  "arrived(%p) = %llu\n",
142  gtid, team->t.t_id, tid, team->t.t_id, &team_bar->b_arrived,
143  new_state));
144  }
145  KA_TRACE(
146  20,
147  ("__kmp_linear_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
148  gtid, team->t.t_id, tid, bt));
149  return false;
150 }
151 
152 template <bool cancellable = false>
153 static bool __kmp_linear_barrier_release_template(
154  enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
155  int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
156  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_linear_release);
157  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
158  kmp_team_t *team;
159 
160  if (KMP_MASTER_TID(tid)) {
161  unsigned int i;
162  kmp_uint32 nproc = this_thr->th.th_team_nproc;
163  kmp_info_t **other_threads;
164 
165  team = __kmp_threads[gtid]->th.th_team;
166  KMP_DEBUG_ASSERT(team != NULL);
167  other_threads = team->t.t_threads;
168 
169  KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) master enter for "
170  "barrier type %d\n",
171  gtid, team->t.t_id, tid, bt));
172 
173  if (nproc > 1) {
174 #if KMP_BARRIER_ICV_PUSH
175  {
176  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
177  if (propagate_icvs) {
178  ngo_load(&team->t.t_implicit_task_taskdata[0].td_icvs);
179  for (i = 1; i < nproc; ++i) {
180  __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[i],
181  team, i, FALSE);
182  ngo_store_icvs(&team->t.t_implicit_task_taskdata[i].td_icvs,
183  &team->t.t_implicit_task_taskdata[0].td_icvs);
184  }
185  ngo_sync();
186  }
187  }
188 #endif // KMP_BARRIER_ICV_PUSH
189 
190  // Now, release all of the worker threads
191  for (i = 1; i < nproc; ++i) {
192 #if KMP_CACHE_MANAGE
193  // Prefetch next thread's go flag
194  if (i + 1 < nproc)
195  KMP_CACHE_PREFETCH(&other_threads[i + 1]->th.th_bar[bt].bb.b_go);
196 #endif /* KMP_CACHE_MANAGE */
197  KA_TRACE(
198  20,
199  ("__kmp_linear_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%d) "
200  "go(%p): %u => %u\n",
201  gtid, team->t.t_id, tid, other_threads[i]->th.th_info.ds.ds_gtid,
202  team->t.t_id, i, &other_threads[i]->th.th_bar[bt].bb.b_go,
203  other_threads[i]->th.th_bar[bt].bb.b_go,
204  other_threads[i]->th.th_bar[bt].bb.b_go + KMP_BARRIER_STATE_BUMP));
205  ANNOTATE_BARRIER_BEGIN(other_threads[i]);
206  kmp_flag_64<> flag(&other_threads[i]->th.th_bar[bt].bb.b_go,
207  other_threads[i]);
208  flag.release();
209  }
210  }
211  } else { // Wait for the MASTER thread to release us
212  KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d wait go(%p) == %u\n",
213  gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
214  if (cancellable) {
215  kmp_flag_64<true, false> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
216  if (flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj)))
217  return true;
218  } else {
219  kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
220  flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
221  }
222  ANNOTATE_BARRIER_END(this_thr);
223 #if USE_ITT_BUILD && USE_ITT_NOTIFY
224  if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
225  // In a fork barrier; cannot get the object reliably (or ITTNOTIFY is
226  // disabled)
227  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
228  // Cancel wait on previous parallel region...
229  __kmp_itt_task_starting(itt_sync_obj);
230 
231  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
232  return false;
233 
234  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
235  if (itt_sync_obj != NULL)
236  // Call prepare as early as possible for "new" barrier
237  __kmp_itt_task_finished(itt_sync_obj);
238  } else
239 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
240  // Early exit for reaping threads releasing forkjoin barrier
241  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
242  return false;
243 // The worker thread may now assume that the team is valid.
244 #ifdef KMP_DEBUG
245  tid = __kmp_tid_from_gtid(gtid);
246  team = __kmp_threads[gtid]->th.th_team;
247 #endif
248  KMP_DEBUG_ASSERT(team != NULL);
249  TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
250  KA_TRACE(20,
251  ("__kmp_linear_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
252  gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
253  KMP_MB(); // Flush all pending memory write invalidates.
254  }
255  KA_TRACE(
256  20,
257  ("__kmp_linear_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
258  gtid, team->t.t_id, tid, bt));
259  return false;
260 }
261 
262 static void __kmp_linear_barrier_gather(
263  enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
264  void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
265  __kmp_linear_barrier_gather_template<false>(
266  bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
267 }
268 
269 static bool __kmp_linear_barrier_gather_cancellable(
270  enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
271  void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
272  return __kmp_linear_barrier_gather_template<true>(
273  bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
274 }
275 
276 static void __kmp_linear_barrier_release(
277  enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
278  int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
279  __kmp_linear_barrier_release_template<false>(
280  bt, this_thr, gtid, tid, propagate_icvs USE_ITT_BUILD_ARG(itt_sync_obj));
281 }
282 
283 static bool __kmp_linear_barrier_release_cancellable(
284  enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
285  int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
286  return __kmp_linear_barrier_release_template<true>(
287  bt, this_thr, gtid, tid, propagate_icvs USE_ITT_BUILD_ARG(itt_sync_obj));
288 }
289 
290 // Tree barrier
291 static void
292 __kmp_tree_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid,
293  int tid, void (*reduce)(void *, void *)
294  USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
295  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_tree_gather);
296  kmp_team_t *team = this_thr->th.th_team;
297  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
298  kmp_info_t **other_threads = team->t.t_threads;
299  kmp_uint32 nproc = this_thr->th.th_team_nproc;
300  kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
301  kmp_uint32 branch_factor = 1 << branch_bits;
302  kmp_uint32 child;
303  kmp_uint32 child_tid;
304  kmp_uint64 new_state;
305 
306  KA_TRACE(
307  20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
308  gtid, team->t.t_id, tid, bt));
309  KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
310 
311 #if USE_ITT_BUILD && USE_ITT_NOTIFY
312  // Barrier imbalance - save arrive time to the thread
313  if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
314  this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
315  __itt_get_timestamp();
316  }
317 #endif
318  // Perform tree gather to wait until all threads have arrived; reduce any
319  // required data as we go
320  child_tid = (tid << branch_bits) + 1;
321  if (child_tid < nproc) {
322  // Parent threads wait for all their children to arrive
323  new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
324  child = 1;
325  do {
326  kmp_info_t *child_thr = other_threads[child_tid];
327  kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
328 #if KMP_CACHE_MANAGE
329  // Prefetch next thread's arrived count
330  if (child + 1 <= branch_factor && child_tid + 1 < nproc)
331  KMP_CACHE_PREFETCH(
332  &other_threads[child_tid + 1]->th.th_bar[bt].bb.b_arrived);
333 #endif /* KMP_CACHE_MANAGE */
334  KA_TRACE(20,
335  ("__kmp_tree_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
336  "arrived(%p) == %llu\n",
337  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
338  team->t.t_id, child_tid, &child_bar->b_arrived, new_state));
339  // Wait for child to arrive
340  kmp_flag_64<> flag(&child_bar->b_arrived, new_state);
341  flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
342  ANNOTATE_BARRIER_END(child_thr);
343 #if USE_ITT_BUILD && USE_ITT_NOTIFY
344  // Barrier imbalance - write min of the thread time and a child time to
345  // the thread.
346  if (__kmp_forkjoin_frames_mode == 2) {
347  this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
348  child_thr->th.th_bar_min_time);
349  }
350 #endif
351  if (reduce) {
352  KA_TRACE(100,
353  ("__kmp_tree_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
354  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
355  team->t.t_id, child_tid));
356  ANNOTATE_REDUCE_AFTER(reduce);
357  OMPT_REDUCTION_DECL(this_thr, gtid);
358  OMPT_REDUCTION_BEGIN;
359  (*reduce)(this_thr->th.th_local.reduce_data,
360  child_thr->th.th_local.reduce_data);
361  OMPT_REDUCTION_END;
362  ANNOTATE_REDUCE_BEFORE(reduce);
363  ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
364  }
365  child++;
366  child_tid++;
367  } while (child <= branch_factor && child_tid < nproc);
368  }
369 
370  if (!KMP_MASTER_TID(tid)) { // Worker threads
371  kmp_int32 parent_tid = (tid - 1) >> branch_bits;
372 
373  KA_TRACE(20,
374  ("__kmp_tree_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
375  "arrived(%p): %llu => %llu\n",
376  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(parent_tid, team),
377  team->t.t_id, parent_tid, &thr_bar->b_arrived, thr_bar->b_arrived,
378  thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
379 
380  // Mark arrival to parent thread
381  /* After performing this write, a worker thread may not assume that the team
382  is valid any more - it could be deallocated by the master thread at any
383  time. */
384  ANNOTATE_BARRIER_BEGIN(this_thr);
385  kmp_flag_64<> flag(&thr_bar->b_arrived, other_threads[parent_tid]);
386  flag.release();
387  } else {
388  // Need to update the team arrived pointer if we are the master thread
389  if (nproc > 1) // New value was already computed above
390  team->t.t_bar[bt].b_arrived = new_state;
391  else
392  team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
393  KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) set team %d "
394  "arrived(%p) = %llu\n",
395  gtid, team->t.t_id, tid, team->t.t_id,
396  &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
397  }
398  KA_TRACE(20,
399  ("__kmp_tree_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
400  gtid, team->t.t_id, tid, bt));
401 }
402 
403 static void __kmp_tree_barrier_release(
404  enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
405  int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
406  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_tree_release);
407  kmp_team_t *team;
408  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
409  kmp_uint32 nproc;
410  kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt];
411  kmp_uint32 branch_factor = 1 << branch_bits;
412  kmp_uint32 child;
413  kmp_uint32 child_tid;
414 
415  // Perform a tree release for all of the threads that have been gathered
416  if (!KMP_MASTER_TID(
417  tid)) { // Handle fork barrier workers who aren't part of a team yet
418  KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d wait go(%p) == %u\n", gtid,
419  &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
420  // Wait for parent thread to release us
421  kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
422  flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
423  ANNOTATE_BARRIER_END(this_thr);
424 #if USE_ITT_BUILD && USE_ITT_NOTIFY
425  if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
426  // In fork barrier where we could not get the object reliably (or
427  // ITTNOTIFY is disabled)
428  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
429  // Cancel wait on previous parallel region...
430  __kmp_itt_task_starting(itt_sync_obj);
431 
432  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
433  return;
434 
435  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
436  if (itt_sync_obj != NULL)
437  // Call prepare as early as possible for "new" barrier
438  __kmp_itt_task_finished(itt_sync_obj);
439  } else
440 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
441  // Early exit for reaping threads releasing forkjoin barrier
442  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
443  return;
444 
445  // The worker thread may now assume that the team is valid.
446  team = __kmp_threads[gtid]->th.th_team;
447  KMP_DEBUG_ASSERT(team != NULL);
448  tid = __kmp_tid_from_gtid(gtid);
449 
450  TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
451  KA_TRACE(20,
452  ("__kmp_tree_barrier_release: T#%d(%d:%d) set go(%p) = %u\n", gtid,
453  team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
454  KMP_MB(); // Flush all pending memory write invalidates.
455  } else {
456  team = __kmp_threads[gtid]->th.th_team;
457  KMP_DEBUG_ASSERT(team != NULL);
458  KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) master enter for "
459  "barrier type %d\n",
460  gtid, team->t.t_id, tid, bt));
461  }
462  nproc = this_thr->th.th_team_nproc;
463  child_tid = (tid << branch_bits) + 1;
464 
465  if (child_tid < nproc) {
466  kmp_info_t **other_threads = team->t.t_threads;
467  child = 1;
468  // Parent threads release all their children
469  do {
470  kmp_info_t *child_thr = other_threads[child_tid];
471  kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
472 #if KMP_CACHE_MANAGE
473  // Prefetch next thread's go count
474  if (child + 1 <= branch_factor && child_tid + 1 < nproc)
475  KMP_CACHE_PREFETCH(
476  &other_threads[child_tid + 1]->th.th_bar[bt].bb.b_go);
477 #endif /* KMP_CACHE_MANAGE */
478 
479 #if KMP_BARRIER_ICV_PUSH
480  {
481  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
482  if (propagate_icvs) {
483  __kmp_init_implicit_task(team->t.t_ident,
484  team->t.t_threads[child_tid], team,
485  child_tid, FALSE);
486  copy_icvs(&team->t.t_implicit_task_taskdata[child_tid].td_icvs,
487  &team->t.t_implicit_task_taskdata[0].td_icvs);
488  }
489  }
490 #endif // KMP_BARRIER_ICV_PUSH
491  KA_TRACE(20,
492  ("__kmp_tree_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u)"
493  "go(%p): %u => %u\n",
494  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
495  team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
496  child_bar->b_go + KMP_BARRIER_STATE_BUMP));
497  // Release child from barrier
498  ANNOTATE_BARRIER_BEGIN(child_thr);
499  kmp_flag_64<> flag(&child_bar->b_go, child_thr);
500  flag.release();
501  child++;
502  child_tid++;
503  } while (child <= branch_factor && child_tid < nproc);
504  }
505  KA_TRACE(
506  20, ("__kmp_tree_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
507  gtid, team->t.t_id, tid, bt));
508 }
509 
510 // Hyper Barrier
511 static void
512 __kmp_hyper_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid,
513  int tid, void (*reduce)(void *, void *)
514  USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
515  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hyper_gather);
516  kmp_team_t *team = this_thr->th.th_team;
517  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
518  kmp_info_t **other_threads = team->t.t_threads;
519  kmp_uint64 new_state = KMP_BARRIER_UNUSED_STATE;
520  kmp_uint32 num_threads = this_thr->th.th_team_nproc;
521  kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
522  kmp_uint32 branch_factor = 1 << branch_bits;
523  kmp_uint32 offset;
524  kmp_uint32 level;
525 
526  KA_TRACE(
527  20,
528  ("__kmp_hyper_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
529  gtid, team->t.t_id, tid, bt));
530  KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
531 
532 #if USE_ITT_BUILD && USE_ITT_NOTIFY
533  // Barrier imbalance - save arrive time to the thread
534  if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
535  this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
536  __itt_get_timestamp();
537  }
538 #endif
539  /* Perform a hypercube-embedded tree gather to wait until all of the threads
540  have arrived, and reduce any required data as we go. */
541  kmp_flag_64<> p_flag(&thr_bar->b_arrived);
542  for (level = 0, offset = 1; offset < num_threads;
543  level += branch_bits, offset <<= branch_bits) {
544  kmp_uint32 child;
545  kmp_uint32 child_tid;
546 
547  if (((tid >> level) & (branch_factor - 1)) != 0) {
548  kmp_int32 parent_tid = tid & ~((1 << (level + branch_bits)) - 1);
549 
550  KMP_MB(); // Synchronize parent and child threads.
551  KA_TRACE(20,
552  ("__kmp_hyper_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
553  "arrived(%p): %llu => %llu\n",
554  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(parent_tid, team),
555  team->t.t_id, parent_tid, &thr_bar->b_arrived,
556  thr_bar->b_arrived,
557  thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
558  // Mark arrival to parent thread
559  /* After performing this write (in the last iteration of the enclosing for
560  loop), a worker thread may not assume that the team is valid any more
561  - it could be deallocated by the master thread at any time. */
562  ANNOTATE_BARRIER_BEGIN(this_thr);
563  p_flag.set_waiter(other_threads[parent_tid]);
564  p_flag.release();
565  break;
566  }
567 
568  // Parent threads wait for children to arrive
569  if (new_state == KMP_BARRIER_UNUSED_STATE)
570  new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
571  for (child = 1, child_tid = tid + (1 << level);
572  child < branch_factor && child_tid < num_threads;
573  child++, child_tid += (1 << level)) {
574  kmp_info_t *child_thr = other_threads[child_tid];
575  kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
576 #if KMP_CACHE_MANAGE
577  kmp_uint32 next_child_tid = child_tid + (1 << level);
578  // Prefetch next thread's arrived count
579  if (child + 1 < branch_factor && next_child_tid < num_threads)
580  KMP_CACHE_PREFETCH(
581  &other_threads[next_child_tid]->th.th_bar[bt].bb.b_arrived);
582 #endif /* KMP_CACHE_MANAGE */
583  KA_TRACE(20,
584  ("__kmp_hyper_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
585  "arrived(%p) == %llu\n",
586  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
587  team->t.t_id, child_tid, &child_bar->b_arrived, new_state));
588  // Wait for child to arrive
589  kmp_flag_64<> c_flag(&child_bar->b_arrived, new_state);
590  c_flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
591  ANNOTATE_BARRIER_END(child_thr);
592  KMP_MB(); // Synchronize parent and child threads.
593 #if USE_ITT_BUILD && USE_ITT_NOTIFY
594  // Barrier imbalance - write min of the thread time and a child time to
595  // the thread.
596  if (__kmp_forkjoin_frames_mode == 2) {
597  this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
598  child_thr->th.th_bar_min_time);
599  }
600 #endif
601  if (reduce) {
602  KA_TRACE(100,
603  ("__kmp_hyper_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
604  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
605  team->t.t_id, child_tid));
606  ANNOTATE_REDUCE_AFTER(reduce);
607  OMPT_REDUCTION_DECL(this_thr, gtid);
608  OMPT_REDUCTION_BEGIN;
609  (*reduce)(this_thr->th.th_local.reduce_data,
610  child_thr->th.th_local.reduce_data);
611  OMPT_REDUCTION_END;
612  ANNOTATE_REDUCE_BEFORE(reduce);
613  ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
614  }
615  }
616  }
617 
618  if (KMP_MASTER_TID(tid)) {
619  // Need to update the team arrived pointer if we are the master thread
620  if (new_state == KMP_BARRIER_UNUSED_STATE)
621  team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
622  else
623  team->t.t_bar[bt].b_arrived = new_state;
624  KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) set team %d "
625  "arrived(%p) = %llu\n",
626  gtid, team->t.t_id, tid, team->t.t_id,
627  &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
628  }
629  KA_TRACE(
630  20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
631  gtid, team->t.t_id, tid, bt));
632 }
633 
634 // The reverse versions seem to beat the forward versions overall
635 #define KMP_REVERSE_HYPER_BAR
636 static void __kmp_hyper_barrier_release(
637  enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
638  int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
639  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hyper_release);
640  kmp_team_t *team;
641  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
642  kmp_info_t **other_threads;
643  kmp_uint32 num_threads;
644  kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt];
645  kmp_uint32 branch_factor = 1 << branch_bits;
646  kmp_uint32 child;
647  kmp_uint32 child_tid;
648  kmp_uint32 offset;
649  kmp_uint32 level;
650 
651  /* Perform a hypercube-embedded tree release for all of the threads that have
652  been gathered. If KMP_REVERSE_HYPER_BAR is defined (default) the threads
653  are released in the reverse order of the corresponding gather, otherwise
654  threads are released in the same order. */
655  if (KMP_MASTER_TID(tid)) { // master
656  team = __kmp_threads[gtid]->th.th_team;
657  KMP_DEBUG_ASSERT(team != NULL);
658  KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) master enter for "
659  "barrier type %d\n",
660  gtid, team->t.t_id, tid, bt));
661 #if KMP_BARRIER_ICV_PUSH
662  if (propagate_icvs) { // master already has ICVs in final destination; copy
663  copy_icvs(&thr_bar->th_fixed_icvs,
664  &team->t.t_implicit_task_taskdata[tid].td_icvs);
665  }
666 #endif
667  } else { // Handle fork barrier workers who aren't part of a team yet
668  KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d wait go(%p) == %u\n", gtid,
669  &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
670  // Wait for parent thread to release us
671  kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
672  flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
673  ANNOTATE_BARRIER_END(this_thr);
674 #if USE_ITT_BUILD && USE_ITT_NOTIFY
675  if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
676  // In fork barrier where we could not get the object reliably
677  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
678  // Cancel wait on previous parallel region...
679  __kmp_itt_task_starting(itt_sync_obj);
680 
681  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
682  return;
683 
684  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
685  if (itt_sync_obj != NULL)
686  // Call prepare as early as possible for "new" barrier
687  __kmp_itt_task_finished(itt_sync_obj);
688  } else
689 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
690  // Early exit for reaping threads releasing forkjoin barrier
691  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
692  return;
693 
694  // The worker thread may now assume that the team is valid.
695  team = __kmp_threads[gtid]->th.th_team;
696  KMP_DEBUG_ASSERT(team != NULL);
697  tid = __kmp_tid_from_gtid(gtid);
698 
699  TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
700  KA_TRACE(20,
701  ("__kmp_hyper_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
702  gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
703  KMP_MB(); // Flush all pending memory write invalidates.
704  }
705  num_threads = this_thr->th.th_team_nproc;
706  other_threads = team->t.t_threads;
707 
708 #ifdef KMP_REVERSE_HYPER_BAR
709  // Count up to correct level for parent
710  for (level = 0, offset = 1;
711  offset < num_threads && (((tid >> level) & (branch_factor - 1)) == 0);
712  level += branch_bits, offset <<= branch_bits)
713  ;
714 
715  // Now go down from there
716  for (level -= branch_bits, offset >>= branch_bits; offset != 0;
717  level -= branch_bits, offset >>= branch_bits)
718 #else
719  // Go down the tree, level by level
720  for (level = 0, offset = 1; offset < num_threads;
721  level += branch_bits, offset <<= branch_bits)
722 #endif // KMP_REVERSE_HYPER_BAR
723  {
724 #ifdef KMP_REVERSE_HYPER_BAR
725  /* Now go in reverse order through the children, highest to lowest.
726  Initial setting of child is conservative here. */
727  child = num_threads >> ((level == 0) ? level : level - 1);
728  for (child = (child < branch_factor - 1) ? child : branch_factor - 1,
729  child_tid = tid + (child << level);
730  child >= 1; child--, child_tid -= (1 << level))
731 #else
732  if (((tid >> level) & (branch_factor - 1)) != 0)
733  // No need to go lower than this, since this is the level parent would be
734  // notified
735  break;
736  // Iterate through children on this level of the tree
737  for (child = 1, child_tid = tid + (1 << level);
738  child < branch_factor && child_tid < num_threads;
739  child++, child_tid += (1 << level))
740 #endif // KMP_REVERSE_HYPER_BAR
741  {
742  if (child_tid >= num_threads)
743  continue; // Child doesn't exist so keep going
744  else {
745  kmp_info_t *child_thr = other_threads[child_tid];
746  kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
747 #if KMP_CACHE_MANAGE
748  kmp_uint32 next_child_tid = child_tid - (1 << level);
749 // Prefetch next thread's go count
750 #ifdef KMP_REVERSE_HYPER_BAR
751  if (child - 1 >= 1 && next_child_tid < num_threads)
752 #else
753  if (child + 1 < branch_factor && next_child_tid < num_threads)
754 #endif // KMP_REVERSE_HYPER_BAR
755  KMP_CACHE_PREFETCH(
756  &other_threads[next_child_tid]->th.th_bar[bt].bb.b_go);
757 #endif /* KMP_CACHE_MANAGE */
758 
759 #if KMP_BARRIER_ICV_PUSH
760  if (propagate_icvs) // push my fixed ICVs to my child
761  copy_icvs(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
762 #endif // KMP_BARRIER_ICV_PUSH
763 
764  KA_TRACE(
765  20,
766  ("__kmp_hyper_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u)"
767  "go(%p): %u => %u\n",
768  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
769  team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
770  child_bar->b_go + KMP_BARRIER_STATE_BUMP));
771  // Release child from barrier
772  ANNOTATE_BARRIER_BEGIN(child_thr);
773  kmp_flag_64<> flag(&child_bar->b_go, child_thr);
774  flag.release();
775  }
776  }
777  }
778 #if KMP_BARRIER_ICV_PUSH
779  if (propagate_icvs &&
780  !KMP_MASTER_TID(tid)) { // copy ICVs locally to final dest
781  __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid,
782  FALSE);
783  copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
784  &thr_bar->th_fixed_icvs);
785  }
786 #endif
787  KA_TRACE(
788  20,
789  ("__kmp_hyper_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
790  gtid, team->t.t_id, tid, bt));
791 }
792 
793 // Hierarchical Barrier
794 
795 // Initialize thread barrier data
796 /* Initializes/re-initializes the hierarchical barrier data stored on a thread.
797  Performs the minimum amount of initialization required based on how the team
798  has changed. Returns true if leaf children will require both on-core and
799  traditional wake-up mechanisms. For example, if the team size increases,
800  threads already in the team will respond to on-core wakeup on their parent
801  thread, but threads newly added to the team will only be listening on the
802  their local b_go. */
803 static bool __kmp_init_hierarchical_barrier_thread(enum barrier_type bt,
804  kmp_bstate_t *thr_bar,
805  kmp_uint32 nproc, int gtid,
806  int tid, kmp_team_t *team) {
807  // Checks to determine if (re-)initialization is needed
808  bool uninitialized = thr_bar->team == NULL;
809  bool team_changed = team != thr_bar->team;
810  bool team_sz_changed = nproc != thr_bar->nproc;
811  bool tid_changed = tid != thr_bar->old_tid;
812  bool retval = false;
813 
814  if (uninitialized || team_sz_changed) {
815  __kmp_get_hierarchy(nproc, thr_bar);
816  }
817 
818  if (uninitialized || team_sz_changed || tid_changed) {
819  thr_bar->my_level = thr_bar->depth - 1; // default for master
820  thr_bar->parent_tid = -1; // default for master
821  if (!KMP_MASTER_TID(
822  tid)) { // if not master, find parent thread in hierarchy
823  kmp_uint32 d = 0;
824  while (d < thr_bar->depth) { // find parent based on level of thread in
825  // hierarchy, and note level
826  kmp_uint32 rem;
827  if (d == thr_bar->depth - 2) { // reached level right below the master
828  thr_bar->parent_tid = 0;
829  thr_bar->my_level = d;
830  break;
831  } else if ((rem = tid % thr_bar->skip_per_level[d + 1]) != 0) {
832  // TODO: can we make the above op faster?
833  // thread is not a subtree root at next level, so this is max
834  thr_bar->parent_tid = tid - rem;
835  thr_bar->my_level = d;
836  break;
837  }
838  ++d;
839  }
840  }
841  __kmp_type_convert(7 - ((tid - thr_bar->parent_tid) /
842  (thr_bar->skip_per_level[thr_bar->my_level])),
843  &(thr_bar->offset));
844  thr_bar->old_tid = tid;
845  thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
846  thr_bar->team = team;
847  thr_bar->parent_bar =
848  &team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb;
849  }
850  if (uninitialized || team_changed || tid_changed) {
851  thr_bar->team = team;
852  thr_bar->parent_bar =
853  &team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb;
854  retval = true;
855  }
856  if (uninitialized || team_sz_changed || tid_changed) {
857  thr_bar->nproc = nproc;
858  thr_bar->leaf_kids = thr_bar->base_leaf_kids;
859  if (thr_bar->my_level == 0)
860  thr_bar->leaf_kids = 0;
861  if (thr_bar->leaf_kids && (kmp_uint32)tid + thr_bar->leaf_kids + 1 > nproc)
862  __kmp_type_convert(nproc - tid - 1, &(thr_bar->leaf_kids));
863  thr_bar->leaf_state = 0;
864  for (int i = 0; i < thr_bar->leaf_kids; ++i)
865  ((char *)&(thr_bar->leaf_state))[7 - i] = 1;
866  }
867  return retval;
868 }
869 
870 static void __kmp_hierarchical_barrier_gather(
871  enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
872  void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
873  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hier_gather);
874  kmp_team_t *team = this_thr->th.th_team;
875  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
876  kmp_uint32 nproc = this_thr->th.th_team_nproc;
877  kmp_info_t **other_threads = team->t.t_threads;
878  kmp_uint64 new_state;
879 
880  int level = team->t.t_level;
881  if (other_threads[0]
882  ->th.th_teams_microtask) // are we inside the teams construct?
883  if (this_thr->th.th_teams_size.nteams > 1)
884  ++level; // level was not increased in teams construct for team_of_masters
885  if (level == 1)
886  thr_bar->use_oncore_barrier = 1;
887  else
888  thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested
889 
890  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) enter for "
891  "barrier type %d\n",
892  gtid, team->t.t_id, tid, bt));
893  KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
894 
895 #if USE_ITT_BUILD && USE_ITT_NOTIFY
896  // Barrier imbalance - save arrive time to the thread
897  if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
898  this_thr->th.th_bar_arrive_time = __itt_get_timestamp();
899  }
900 #endif
901 
902  (void)__kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid, tid,
903  team);
904 
905  if (thr_bar->my_level) { // not a leaf (my_level==0 means leaf)
906  kmp_int32 child_tid;
907  new_state =
908  (kmp_uint64)team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
909  if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
910  thr_bar->use_oncore_barrier) {
911  if (thr_bar->leaf_kids) {
912  // First, wait for leaf children to check-in on my b_arrived flag
913  kmp_uint64 leaf_state =
914  KMP_MASTER_TID(tid)
915  ? thr_bar->b_arrived | thr_bar->leaf_state
916  : team->t.t_bar[bt].b_arrived | thr_bar->leaf_state;
917  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) waiting "
918  "for leaf kids\n",
919  gtid, team->t.t_id, tid));
920  kmp_flag_64<> flag(&thr_bar->b_arrived, leaf_state);
921  flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
922  if (reduce) {
923  ANNOTATE_REDUCE_AFTER(reduce);
924  OMPT_REDUCTION_DECL(this_thr, gtid);
925  OMPT_REDUCTION_BEGIN;
926  for (child_tid = tid + 1; child_tid <= tid + thr_bar->leaf_kids;
927  ++child_tid) {
928  KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
929  "T#%d(%d:%d)\n",
930  gtid, team->t.t_id, tid,
931  __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
932  child_tid));
933  ANNOTATE_BARRIER_END(other_threads[child_tid]);
934  (*reduce)(this_thr->th.th_local.reduce_data,
935  other_threads[child_tid]->th.th_local.reduce_data);
936  }
937  OMPT_REDUCTION_END;
938  ANNOTATE_REDUCE_BEFORE(reduce);
939  ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
940  }
941  // clear leaf_state bits
942  KMP_TEST_THEN_AND64(&thr_bar->b_arrived, ~(thr_bar->leaf_state));
943  }
944  // Next, wait for higher level children on each child's b_arrived flag
945  for (kmp_uint32 d = 1; d < thr_bar->my_level;
946  ++d) { // gather lowest level threads first, but skip 0
947  kmp_uint32 last = tid + thr_bar->skip_per_level[d + 1],
948  skip = thr_bar->skip_per_level[d];
949  if (last > nproc)
950  last = nproc;
951  for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) {
952  kmp_info_t *child_thr = other_threads[child_tid];
953  kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
954  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait "
955  "T#%d(%d:%d) "
956  "arrived(%p) == %llu\n",
957  gtid, team->t.t_id, tid,
958  __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
959  child_tid, &child_bar->b_arrived, new_state));
960  kmp_flag_64<> flag(&child_bar->b_arrived, new_state);
961  flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
962  ANNOTATE_BARRIER_END(child_thr);
963  if (reduce) {
964  KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
965  "T#%d(%d:%d)\n",
966  gtid, team->t.t_id, tid,
967  __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
968  child_tid));
969  ANNOTATE_REDUCE_AFTER(reduce);
970  (*reduce)(this_thr->th.th_local.reduce_data,
971  child_thr->th.th_local.reduce_data);
972  ANNOTATE_REDUCE_BEFORE(reduce);
973  ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
974  }
975  }
976  }
977  } else { // Blocktime is not infinite
978  for (kmp_uint32 d = 0; d < thr_bar->my_level;
979  ++d) { // Gather lowest level threads first
980  kmp_uint32 last = tid + thr_bar->skip_per_level[d + 1],
981  skip = thr_bar->skip_per_level[d];
982  if (last > nproc)
983  last = nproc;
984  for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) {
985  kmp_info_t *child_thr = other_threads[child_tid];
986  kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
987  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait "
988  "T#%d(%d:%d) "
989  "arrived(%p) == %llu\n",
990  gtid, team->t.t_id, tid,
991  __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
992  child_tid, &child_bar->b_arrived, new_state));
993  kmp_flag_64<> flag(&child_bar->b_arrived, new_state);
994  flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
995  ANNOTATE_BARRIER_END(child_thr);
996  if (reduce) {
997  KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
998  "T#%d(%d:%d)\n",
999  gtid, team->t.t_id, tid,
1000  __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
1001  child_tid));
1002  ANNOTATE_REDUCE_AFTER(reduce);
1003  (*reduce)(this_thr->th.th_local.reduce_data,
1004  child_thr->th.th_local.reduce_data);
1005  ANNOTATE_REDUCE_BEFORE(reduce);
1006  ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
1007  }
1008  }
1009  }
1010  }
1011  }
1012  // All subordinates are gathered; now release parent if not master thread
1013 
1014  if (!KMP_MASTER_TID(tid)) { // worker threads release parent in hierarchy
1015  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) releasing"
1016  " T#%d(%d:%d) arrived(%p): %llu => %llu\n",
1017  gtid, team->t.t_id, tid,
1018  __kmp_gtid_from_tid(thr_bar->parent_tid, team), team->t.t_id,
1019  thr_bar->parent_tid, &thr_bar->b_arrived, thr_bar->b_arrived,
1020  thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
1021  /* Mark arrival to parent: After performing this write, a worker thread may
1022  not assume that the team is valid any more - it could be deallocated by
1023  the master thread at any time. */
1024  if (thr_bar->my_level || __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME ||
1025  !thr_bar->use_oncore_barrier) { // Parent is waiting on my b_arrived
1026  // flag; release it
1027  ANNOTATE_BARRIER_BEGIN(this_thr);
1028  kmp_flag_64<> flag(&thr_bar->b_arrived,
1029  other_threads[thr_bar->parent_tid]);
1030  flag.release();
1031  } else {
1032  // Leaf does special release on "offset" bits of parent's b_arrived flag
1033  thr_bar->b_arrived = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
1034  kmp_flag_oncore flag(&thr_bar->parent_bar->b_arrived,
1035  thr_bar->offset + 1);
1036  flag.set_waiter(other_threads[thr_bar->parent_tid]);
1037  flag.release();
1038  }
1039  } else { // Master thread needs to update the team's b_arrived value
1040  team->t.t_bar[bt].b_arrived = new_state;
1041  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) set team %d "
1042  "arrived(%p) = %llu\n",
1043  gtid, team->t.t_id, tid, team->t.t_id,
1044  &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
1045  }
1046  // Is the team access below unsafe or just technically invalid?
1047  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) exit for "
1048  "barrier type %d\n",
1049  gtid, team->t.t_id, tid, bt));
1050 }
1051 
1052 static void __kmp_hierarchical_barrier_release(
1053  enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
1054  int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
1055  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hier_release);
1056  kmp_team_t *team;
1057  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
1058  kmp_uint32 nproc;
1059  bool team_change = false; // indicates on-core barrier shouldn't be used
1060 
1061  if (KMP_MASTER_TID(tid)) {
1062  team = __kmp_threads[gtid]->th.th_team;
1063  KMP_DEBUG_ASSERT(team != NULL);
1064  KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) master "
1065  "entered barrier type %d\n",
1066  gtid, team->t.t_id, tid, bt));
1067  } else { // Worker threads
1068  // Wait for parent thread to release me
1069  if (!thr_bar->use_oncore_barrier ||
1070  __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME || thr_bar->my_level != 0 ||
1071  thr_bar->team == NULL) {
1072  // Use traditional method of waiting on my own b_go flag
1073  thr_bar->wait_flag = KMP_BARRIER_OWN_FLAG;
1074  kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
1075  flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
1076  ANNOTATE_BARRIER_END(this_thr);
1077  TCW_8(thr_bar->b_go,
1078  KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
1079  } else { // Thread barrier data is initialized, this is a leaf, blocktime is
1080  // infinite, not nested
1081  // Wait on my "offset" bits on parent's b_go flag
1082  thr_bar->wait_flag = KMP_BARRIER_PARENT_FLAG;
1083  kmp_flag_oncore flag(&thr_bar->parent_bar->b_go, KMP_BARRIER_STATE_BUMP,
1084  thr_bar->offset + 1, bt,
1085  this_thr USE_ITT_BUILD_ARG(itt_sync_obj));
1086  flag.wait(this_thr, TRUE);
1087  if (thr_bar->wait_flag ==
1088  KMP_BARRIER_SWITCHING) { // Thread was switched to own b_go
1089  TCW_8(thr_bar->b_go,
1090  KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
1091  } else { // Reset my bits on parent's b_go flag
1092  (RCAST(volatile char *,
1093  &(thr_bar->parent_bar->b_go)))[thr_bar->offset + 1] = 0;
1094  }
1095  }
1096  thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
1097  // Early exit for reaping threads releasing forkjoin barrier
1098  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
1099  return;
1100  // The worker thread may now assume that the team is valid.
1101  team = __kmp_threads[gtid]->th.th_team;
1102  KMP_DEBUG_ASSERT(team != NULL);
1103  tid = __kmp_tid_from_gtid(gtid);
1104 
1105  KA_TRACE(
1106  20,
1107  ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
1108  gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
1109  KMP_MB(); // Flush all pending memory write invalidates.
1110  }
1111 
1112  nproc = this_thr->th.th_team_nproc;
1113  int level = team->t.t_level;
1114  if (team->t.t_threads[0]
1115  ->th.th_teams_microtask) { // are we inside the teams construct?
1116  if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
1117  this_thr->th.th_teams_level == level)
1118  ++level; // level was not increased in teams construct for team_of_workers
1119  if (this_thr->th.th_teams_size.nteams > 1)
1120  ++level; // level was not increased in teams construct for team_of_masters
1121  }
1122  if (level == 1)
1123  thr_bar->use_oncore_barrier = 1;
1124  else
1125  thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested
1126 
1127  // If the team size has increased, we still communicate with old leaves via
1128  // oncore barrier.
1129  unsigned short int old_leaf_kids = thr_bar->leaf_kids;
1130  kmp_uint64 old_leaf_state = thr_bar->leaf_state;
1131  team_change = __kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid,
1132  tid, team);
1133  // But if the entire team changes, we won't use oncore barrier at all
1134  if (team_change)
1135  old_leaf_kids = 0;
1136 
1137 #if KMP_BARRIER_ICV_PUSH
1138  if (propagate_icvs) {
1139  __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid,
1140  FALSE);
1141  if (KMP_MASTER_TID(
1142  tid)) { // master already has copy in final destination; copy
1143  copy_icvs(&thr_bar->th_fixed_icvs,
1144  &team->t.t_implicit_task_taskdata[tid].td_icvs);
1145  } else if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
1146  thr_bar->use_oncore_barrier) { // optimization for inf blocktime
1147  if (!thr_bar->my_level) // I'm a leaf in the hierarchy (my_level==0)
1148  // leaves (on-core children) pull parent's fixed ICVs directly to local
1149  // ICV store
1150  copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
1151  &thr_bar->parent_bar->th_fixed_icvs);
1152  // non-leaves will get ICVs piggybacked with b_go via NGO store
1153  } else { // blocktime is not infinite; pull ICVs from parent's fixed ICVs
1154  if (thr_bar->my_level) // not a leaf; copy ICVs to my fixed ICVs child can
1155  // access
1156  copy_icvs(&thr_bar->th_fixed_icvs, &thr_bar->parent_bar->th_fixed_icvs);
1157  else // leaves copy parent's fixed ICVs directly to local ICV store
1158  copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
1159  &thr_bar->parent_bar->th_fixed_icvs);
1160  }
1161  }
1162 #endif // KMP_BARRIER_ICV_PUSH
1163 
1164  // Now, release my children
1165  if (thr_bar->my_level) { // not a leaf
1166  kmp_int32 child_tid;
1167  kmp_uint32 last;
1168  if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
1169  thr_bar->use_oncore_barrier) {
1170  if (KMP_MASTER_TID(tid)) { // do a flat release
1171  // Set local b_go to bump children via NGO store of the cache line
1172  // containing IVCs and b_go.
1173  thr_bar->b_go = KMP_BARRIER_STATE_BUMP;
1174  // Use ngo stores if available; b_go piggybacks in the last 8 bytes of
1175  // the cache line
1176  ngo_load(&thr_bar->th_fixed_icvs);
1177  // This loops over all the threads skipping only the leaf nodes in the
1178  // hierarchy
1179  for (child_tid = thr_bar->skip_per_level[1]; child_tid < (int)nproc;
1180  child_tid += thr_bar->skip_per_level[1]) {
1181  kmp_bstate_t *child_bar =
1182  &team->t.t_threads[child_tid]->th.th_bar[bt].bb;
1183  KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) "
1184  "releasing T#%d(%d:%d)"
1185  " go(%p): %u => %u\n",
1186  gtid, team->t.t_id, tid,
1187  __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
1188  child_tid, &child_bar->b_go, child_bar->b_go,
1189  child_bar->b_go + KMP_BARRIER_STATE_BUMP));
1190  // Use ngo store (if available) to both store ICVs and release child
1191  // via child's b_go
1192  ngo_store_go(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
1193  }
1194  ngo_sync();
1195  }
1196  TCW_8(thr_bar->b_go,
1197  KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
1198  // Now, release leaf children
1199  if (thr_bar->leaf_kids) { // if there are any
1200  // We test team_change on the off-chance that the level 1 team changed.
1201  if (team_change ||
1202  old_leaf_kids < thr_bar->leaf_kids) { // some old, some new
1203  if (old_leaf_kids) { // release old leaf kids
1204  thr_bar->b_go |= old_leaf_state;
1205  }
1206  // Release new leaf kids
1207  last = tid + thr_bar->skip_per_level[1];
1208  if (last > nproc)
1209  last = nproc;
1210  for (child_tid = tid + 1 + old_leaf_kids; child_tid < (int)last;
1211  ++child_tid) { // skip_per_level[0]=1
1212  kmp_info_t *child_thr = team->t.t_threads[child_tid];
1213  kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
1214  KA_TRACE(
1215  20,
1216  ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) releasing"
1217  " T#%d(%d:%d) go(%p): %u => %u\n",
1218  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
1219  team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
1220  child_bar->b_go + KMP_BARRIER_STATE_BUMP));
1221  // Release child using child's b_go flag
1222  ANNOTATE_BARRIER_BEGIN(child_thr);
1223  kmp_flag_64<> flag(&child_bar->b_go, child_thr);
1224  flag.release();
1225  }
1226  } else { // Release all children at once with leaf_state bits on my own
1227  // b_go flag
1228  thr_bar->b_go |= thr_bar->leaf_state;
1229  }
1230  }
1231  } else { // Blocktime is not infinite; do a simple hierarchical release
1232  for (int d = thr_bar->my_level - 1; d >= 0;
1233  --d) { // Release highest level threads first
1234  last = tid + thr_bar->skip_per_level[d + 1];
1235  kmp_uint32 skip = thr_bar->skip_per_level[d];
1236  if (last > nproc)
1237  last = nproc;
1238  for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) {
1239  kmp_info_t *child_thr = team->t.t_threads[child_tid];
1240  kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
1241  KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) "
1242  "releasing T#%d(%d:%d) go(%p): %u => %u\n",
1243  gtid, team->t.t_id, tid,
1244  __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
1245  child_tid, &child_bar->b_go, child_bar->b_go,
1246  child_bar->b_go + KMP_BARRIER_STATE_BUMP));
1247  // Release child using child's b_go flag
1248  ANNOTATE_BARRIER_BEGIN(child_thr);
1249  kmp_flag_64<> flag(&child_bar->b_go, child_thr);
1250  flag.release();
1251  }
1252  }
1253  }
1254 #if KMP_BARRIER_ICV_PUSH
1255  if (propagate_icvs && !KMP_MASTER_TID(tid))
1256  // non-leaves copy ICVs from fixed ICVs to local dest
1257  copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
1258  &thr_bar->th_fixed_icvs);
1259 #endif // KMP_BARRIER_ICV_PUSH
1260  }
1261  KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) exit for "
1262  "barrier type %d\n",
1263  gtid, team->t.t_id, tid, bt));
1264 }
1265 
1266 // End of Barrier Algorithms
1267 
1268 // type traits for cancellable value
1269 // if cancellable is true, then is_cancellable is a normal boolean variable
1270 // if cancellable is false, then is_cancellable is a compile time constant
1271 template <bool cancellable> struct is_cancellable {};
1272 template <> struct is_cancellable<true> {
1273  bool value;
1274  is_cancellable() : value(false) {}
1275  is_cancellable(bool b) : value(b) {}
1276  is_cancellable &operator=(bool b) {
1277  value = b;
1278  return *this;
1279  }
1280  operator bool() const { return value; }
1281 };
1282 template <> struct is_cancellable<false> {
1283  is_cancellable &operator=(bool b) { return *this; }
1284  constexpr operator bool() const { return false; }
1285 };
1286 
1287 // Internal function to do a barrier.
1288 /* If is_split is true, do a split barrier, otherwise, do a plain barrier
1289  If reduce is non-NULL, do a split reduction barrier, otherwise, do a split
1290  barrier
1291  When cancellable = false,
1292  Returns 0 if master thread, 1 if worker thread.
1293  When cancellable = true
1294  Returns 0 if not cancelled, 1 if cancelled. */
1295 template <bool cancellable = false>
1296 static int __kmp_barrier_template(enum barrier_type bt, int gtid, int is_split,
1297  size_t reduce_size, void *reduce_data,
1298  void (*reduce)(void *, void *)) {
1299  KMP_TIME_PARTITIONED_BLOCK(OMP_plain_barrier);
1300  KMP_SET_THREAD_STATE_BLOCK(PLAIN_BARRIER);
1301  int tid = __kmp_tid_from_gtid(gtid);
1302  kmp_info_t *this_thr = __kmp_threads[gtid];
1303  kmp_team_t *team = this_thr->th.th_team;
1304  int status = 0;
1305  is_cancellable<cancellable> cancelled;
1306 #if OMPT_SUPPORT && OMPT_OPTIONAL
1307  ompt_data_t *my_task_data;
1308  ompt_data_t *my_parallel_data;
1309  void *return_address;
1310  ompt_sync_region_t barrier_kind;
1311 #endif
1312 
1313  KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) has arrived\n", gtid,
1314  __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));
1315 
1316  ANNOTATE_BARRIER_BEGIN(&team->t.t_bar);
1317 #if OMPT_SUPPORT
1318  if (ompt_enabled.enabled) {
1319 #if OMPT_OPTIONAL
1320  my_task_data = OMPT_CUR_TASK_DATA(this_thr);
1321  my_parallel_data = OMPT_CUR_TEAM_DATA(this_thr);
1322  return_address = OMPT_LOAD_RETURN_ADDRESS(gtid);
1323  barrier_kind = __ompt_get_barrier_kind(bt, this_thr);
1324  if (ompt_enabled.ompt_callback_sync_region) {
1325  ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
1326  barrier_kind, ompt_scope_begin, my_parallel_data, my_task_data,
1327  return_address);
1328  }
1329  if (ompt_enabled.ompt_callback_sync_region_wait) {
1330  ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
1331  barrier_kind, ompt_scope_begin, my_parallel_data, my_task_data,
1332  return_address);
1333  }
1334 #endif
1335  // It is OK to report the barrier state after the barrier begin callback.
1336  // According to the OMPT specification, a compliant implementation may
1337  // even delay reporting this state until the barrier begins to wait.
1338  this_thr->th.ompt_thread_info.state = ompt_state_wait_barrier;
1339  }
1340 #endif
1341 
1342  if (!team->t.t_serialized) {
1343 #if USE_ITT_BUILD
1344  // This value will be used in itt notify events below.
1345  void *itt_sync_obj = NULL;
1346 #if USE_ITT_NOTIFY
1347  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1348  itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1);
1349 #endif
1350 #endif /* USE_ITT_BUILD */
1351  if (__kmp_tasking_mode == tskm_extra_barrier) {
1352  __kmp_tasking_barrier(team, this_thr, gtid);
1353  KA_TRACE(15,
1354  ("__kmp_barrier: T#%d(%d:%d) past tasking barrier\n", gtid,
1355  __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));
1356  }
1357 
1358  /* Copy the blocktime info to the thread, where __kmp_wait_template() can
1359  access it when the team struct is not guaranteed to exist. */
1360  // See note about the corresponding code in __kmp_join_barrier() being
1361  // performance-critical.
1362  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
1363 #if KMP_USE_MONITOR
1364  this_thr->th.th_team_bt_intervals =
1365  team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
1366  this_thr->th.th_team_bt_set =
1367  team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
1368 #else
1369  this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid);
1370 #endif
1371  }
1372 
1373 #if USE_ITT_BUILD
1374  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1375  __kmp_itt_barrier_starting(gtid, itt_sync_obj);
1376 #endif /* USE_ITT_BUILD */
1377 #if USE_DEBUGGER
1378  // Let the debugger know: the thread has arrived at the barrier and is waiting.
1379  if (KMP_MASTER_TID(tid)) { // Master counter is stored in team structure.
1380  team->t.t_bar[bt].b_master_arrived += 1;
1381  } else {
1382  this_thr->th.th_bar[bt].bb.b_worker_arrived += 1;
1383  } // if
1384 #endif /* USE_DEBUGGER */
1385  if (reduce != NULL) {
1386  // KMP_DEBUG_ASSERT( is_split == TRUE ); // #C69956
1387  this_thr->th.th_local.reduce_data = reduce_data;
1388  }
1389 
1390  if (KMP_MASTER_TID(tid) && __kmp_tasking_mode != tskm_immediate_exec)
1391  // use 0 to set up only the current team if nthreads > 1
1392  __kmp_task_team_setup(this_thr, team, 0);
1393 
1394  if (cancellable) {
1395  cancelled = __kmp_linear_barrier_gather_cancellable(
1396  bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1397  } else {
1398  switch (__kmp_barrier_gather_pattern[bt]) {
1399  case bp_hyper_bar: {
1400  // don't set branch bits to 0; use linear
1401  KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]);
1402  __kmp_hyper_barrier_gather(bt, this_thr, gtid, tid,
1403  reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1404  break;
1405  }
1406  case bp_hierarchical_bar: {
1407  __kmp_hierarchical_barrier_gather(
1408  bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1409  break;
1410  }
1411  case bp_tree_bar: {
1412  // don't set branch bits to 0; use linear
1413  KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]);
1414  __kmp_tree_barrier_gather(bt, this_thr, gtid, tid,
1415  reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1416  break;
1417  }
1418  default: {
1419  __kmp_linear_barrier_gather(bt, this_thr, gtid, tid,
1420  reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1421  }
1422  }
1423  }
1424 
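  // Full memory fence after the gather phase so its effects (including any
  // reduction into reduce_data) are visible before proceeding.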
1425  KMP_MB();
1426 
1427  if (KMP_MASTER_TID(tid)) {
1428  status = 0;
1429  if (__kmp_tasking_mode != tskm_immediate_exec && !cancelled) {
1430  __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
1431  }
1432 #if USE_DEBUGGER
1433  // Let the debugger know: all threads have arrived and are starting to
1434  // leave the barrier.
1435  team->t.t_bar[bt].b_team_arrived += 1;
1436 #endif
1437 
1438  if (__kmp_omp_cancellation) {
1439  kmp_int32 cancel_request = KMP_ATOMIC_LD_RLX(&team->t.t_cancel_request);
1440  // Reset cancellation flag for worksharing constructs
1441  if (cancel_request == cancel_loop ||
1442  cancel_request == cancel_sections) {
1443  KMP_ATOMIC_ST_RLX(&team->t.t_cancel_request, cancel_noreq);
1444  }
1445  }
1446 #if USE_ITT_BUILD
1447  /* TODO: In case of split reduction barrier, master thread may send
1448  acquired event early, before the final summation into the shared
1449  variable is done (final summation can be a long operation for array
1450  reductions). */
1451  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1452  __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1453 #endif /* USE_ITT_BUILD */
1454 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1455  // Barrier - report frame end (only if active_level == 1)
1456  if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
1457  __kmp_forkjoin_frames_mode &&
1458  (this_thr->th.th_teams_microtask == NULL || // either not in teams
1459  this_thr->th.th_teams_size.nteams == 1) && // or inside single team
1460  team->t.t_active_level == 1) {
1461  ident_t *loc = __kmp_threads[gtid]->th.th_ident;
1462  kmp_uint64 cur_time = __itt_get_timestamp();
1463  kmp_info_t **other_threads = team->t.t_threads;
1464  int nproc = this_thr->th.th_team_nproc;
1465  int i;
1466  switch (__kmp_forkjoin_frames_mode) {
1467  case 1:
1468  __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
1469  loc, nproc);
1470  this_thr->th.th_frame_time = cur_time;
1471  break;
1472  case 2: // AC 2015-01-19: currently does not work for hierarchical (to
1473  // be fixed)
1474  __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time,
1475  1, loc, nproc);
1476  break;
1477  case 3:
1478  if (__itt_metadata_add_ptr) {
1479  // Initialize with master's wait time
1480  kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
1481  // Set arrive time to zero to be able to check it in
1482  // __kmp_invoke_task(); the same is done inside the loop below
1483  this_thr->th.th_bar_arrive_time = 0;
1484  for (i = 1; i < nproc; ++i) {
1485  delta += (cur_time - other_threads[i]->th.th_bar_arrive_time);
1486  other_threads[i]->th.th_bar_arrive_time = 0;
1487  }
1488  __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time,
1489  cur_time, delta,
1490  (kmp_uint64)(reduce != NULL));
1491  }
1492  __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
1493  loc, nproc);
1494  this_thr->th.th_frame_time = cur_time;
1495  break;
1496  }
1497  }
1498 #endif /* USE_ITT_BUILD */
1499  } else {
1500  status = 1;
1501 #if USE_ITT_BUILD
1502  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1503  __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1504 #endif /* USE_ITT_BUILD */
1505  }
1506  if ((status == 1 || !is_split) && !cancelled) {
1507  if (cancellable) {
1508  cancelled = __kmp_linear_barrier_release_cancellable(
1509  bt, this_thr, gtid, tid, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1510  } else {
1511  switch (__kmp_barrier_release_pattern[bt]) {
1512  case bp_hyper_bar: {
1513  KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1514  __kmp_hyper_barrier_release(bt, this_thr, gtid, tid,
1515  FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1516  break;
1517  }
1518  case bp_hierarchical_bar: {
1519  __kmp_hierarchical_barrier_release(
1520  bt, this_thr, gtid, tid, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1521  break;
1522  }
1523  case bp_tree_bar: {
1524  KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1525  __kmp_tree_barrier_release(bt, this_thr, gtid, tid,
1526  FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1527  break;
1528  }
1529  default: {
1530  __kmp_linear_barrier_release(bt, this_thr, gtid, tid,
1531  FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1532  }
1533  }
1534  }
1535  if (__kmp_tasking_mode != tskm_immediate_exec && !cancelled) {
1536  __kmp_task_team_sync(this_thr, team);
1537  }
1538  }
1539 
1540 #if USE_ITT_BUILD
1541  /* GEH: TODO: Move this under if-condition above and also include in
1542  __kmp_end_split_barrier(). This will more accurately represent the actual
1543  release time of the threads for split barriers. */
1544  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1545  __kmp_itt_barrier_finished(gtid, itt_sync_obj);
1546 #endif /* USE_ITT_BUILD */
1547  } else { // Team is serialized.
1548  status = 0;
1549  if (__kmp_tasking_mode != tskm_immediate_exec) {
1550  if (this_thr->th.th_task_team != NULL) {
1551 #if USE_ITT_NOTIFY
1552  void *itt_sync_obj = NULL;
1553  if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
1554  itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1);
1555  __kmp_itt_barrier_starting(gtid, itt_sync_obj);
1556  }
1557 #endif
1558 
1559  KMP_DEBUG_ASSERT(this_thr->th.th_task_team->tt.tt_found_proxy_tasks ==
1560  TRUE);
1561  __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
1562  __kmp_task_team_setup(this_thr, team, 0);
1563 
1564 #if USE_ITT_BUILD
1565  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1566  __kmp_itt_barrier_finished(gtid, itt_sync_obj);
1567 #endif /* USE_ITT_BUILD */
1568  }
1569  }
1570  }
1571  KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) is leaving with return value %d\n",
1572  gtid, __kmp_team_from_gtid(gtid)->t.t_id,
1573  __kmp_tid_from_gtid(gtid), status));
1574 
1575 #if OMPT_SUPPORT
1576  if (ompt_enabled.enabled) {
1577 #if OMPT_OPTIONAL
1578  if (ompt_enabled.ompt_callback_sync_region_wait) {
1579  ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
1580  barrier_kind, ompt_scope_end, my_parallel_data, my_task_data,
1581  return_address);
1582  }
1583  if (ompt_enabled.ompt_callback_sync_region) {
1584  ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
1585  barrier_kind, ompt_scope_end, my_parallel_data, my_task_data,
1586  return_address);
1587  }
1588 #endif
1589  this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
1590  }
1591 #endif
1592  ANNOTATE_BARRIER_END(&team->t.t_bar);
1593 
1594  if (cancellable)
1595  return (int)cancelled;
1596  return status;
1597 }
1598 
1599 // Returns 0 if master thread, 1 if worker thread.
1600 int __kmp_barrier(enum barrier_type bt, int gtid, int is_split,
1601  size_t reduce_size, void *reduce_data,
1602  void (*reduce)(void *, void *)) {
1603  return __kmp_barrier_template<>(bt, gtid, is_split, reduce_size, reduce_data,
1604  reduce);
1605 }
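/* A minimal sketch (assumed, simplified) of how this entry point is usually
   reached: an explicit barrier in user code ends up as a call such as
       __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
   i.e. a non-split plain barrier with no reduction callback, which
   instantiates the non-cancellable template above. */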
1606 
1607 #if defined(KMP_GOMP_COMPAT)
1608 // Returns 1 if cancelled, 0 otherwise
1609 int __kmp_barrier_gomp_cancel(int gtid) {
1610  if (__kmp_omp_cancellation) {
1611  int cancelled = __kmp_barrier_template<true>(bs_plain_barrier, gtid, FALSE,
1612  0, NULL, NULL);
1613  if (cancelled) {
1614  int tid = __kmp_tid_from_gtid(gtid);
1615  kmp_info_t *this_thr = __kmp_threads[gtid];
1616  if (KMP_MASTER_TID(tid)) {
1617  // Master does not need to revert anything
1618  } else {
1619  // Workers need to revert their private b_arrived flag
1620  this_thr->th.th_bar[bs_plain_barrier].bb.b_arrived -=
1621  KMP_BARRIER_STATE_BUMP;
1622  }
1623  }
1624  return cancelled;
1625  }
1626  __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
1627  return FALSE;
1628 }
1629 #endif
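/* Note (an assumption about the callers, not enforced here): the GOMP
   compatibility shims for cancellable barriers and worksharing-end points are
   expected to route through __kmp_barrier_gomp_cancel() so that a pending
   'cancel parallel' request turns the barrier into an early exit while
   workers roll back their b_arrived state as above. */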
1630 
1631 void __kmp_end_split_barrier(enum barrier_type bt, int gtid) {
1632  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_end_split_barrier);
1633  KMP_SET_THREAD_STATE_BLOCK(PLAIN_BARRIER);
1634  int tid = __kmp_tid_from_gtid(gtid);
1635  kmp_info_t *this_thr = __kmp_threads[gtid];
1636  kmp_team_t *team = this_thr->th.th_team;
1637 
1638  ANNOTATE_BARRIER_BEGIN(&team->t.t_bar);
1639  if (!team->t.t_serialized) {
1640  if (KMP_MASTER_GTID(gtid)) {
1641  switch (__kmp_barrier_release_pattern[bt]) {
1642  case bp_hyper_bar: {
1643  KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1644  __kmp_hyper_barrier_release(bt, this_thr, gtid, tid,
1645  FALSE USE_ITT_BUILD_ARG(NULL));
1646  break;
1647  }
1648  case bp_hierarchical_bar: {
1649  __kmp_hierarchical_barrier_release(bt, this_thr, gtid, tid,
1650  FALSE USE_ITT_BUILD_ARG(NULL));
1651  break;
1652  }
1653  case bp_tree_bar: {
1654  KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1655  __kmp_tree_barrier_release(bt, this_thr, gtid, tid,
1656  FALSE USE_ITT_BUILD_ARG(NULL));
1657  break;
1658  }
1659  default: {
1660  __kmp_linear_barrier_release(bt, this_thr, gtid, tid,
1661  FALSE USE_ITT_BUILD_ARG(NULL));
1662  }
1663  }
1664  if (__kmp_tasking_mode != tskm_immediate_exec) {
1665  __kmp_task_team_sync(this_thr, team);
1666  } // if
1667  }
1668  }
1669  ANNOTATE_BARRIER_END(&team->t.t_bar);
1670 }
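/* Sketch of the intended split-barrier pairing (assumed usage, simplified):
       __kmp_barrier(bt, gtid, TRUE, size, data, reducer); // gather phase
       // master only: finish the reduction or other post-gather work here
       __kmp_end_split_barrier(bt, gtid); // now release the workers
   With is_split == TRUE the master skips the release inside __kmp_barrier()
   (see the 'status == 1 || !is_split' check there), so workers are not
   released until this function runs. 'size', 'data' and 'reducer' are
   placeholder names used only for this sketch. */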
1671 
1672 void __kmp_join_barrier(int gtid) {
1673  KMP_TIME_PARTITIONED_BLOCK(OMP_join_barrier);
1674  KMP_SET_THREAD_STATE_BLOCK(FORK_JOIN_BARRIER);
1675  kmp_info_t *this_thr = __kmp_threads[gtid];
1676  kmp_team_t *team;
1677  kmp_uint nproc;
1678  kmp_info_t *master_thread;
1679  int tid;
1680 #ifdef KMP_DEBUG
1681  int team_id;
1682 #endif /* KMP_DEBUG */
1683 #if USE_ITT_BUILD
1684  void *itt_sync_obj = NULL;
1685 #if USE_ITT_NOTIFY
1686  if (__itt_sync_create_ptr || KMP_ITT_DEBUG) // Don't call the routine unless needed
1687  // Get object created at fork_barrier
1688  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
1689 #endif
1690 #endif /* USE_ITT_BUILD */
1691  KMP_MB();
1692 
1693  // Get current info
1694  team = this_thr->th.th_team;
1695  nproc = this_thr->th.th_team_nproc;
1696  KMP_DEBUG_ASSERT((int)nproc == team->t.t_nproc);
1697  tid = __kmp_tid_from_gtid(gtid);
1698 #ifdef KMP_DEBUG
1699  team_id = team->t.t_id;
1700 #endif /* KMP_DEBUG */
1701  master_thread = this_thr->th.th_team_master;
1702 #ifdef KMP_DEBUG
1703  if (master_thread != team->t.t_threads[0]) {
1704  __kmp_print_structure();
1705  }
1706 #endif /* KMP_DEBUG */
1707  KMP_DEBUG_ASSERT(master_thread == team->t.t_threads[0]);
1708  KMP_MB();
1709 
1710  // Verify state
1711  KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
1712  KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_team));
1713  KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_root));
1714  KMP_DEBUG_ASSERT(this_thr == team->t.t_threads[tid]);
1715  KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) arrived at join barrier\n",
1716  gtid, team_id, tid));
1717 
1718  ANNOTATE_BARRIER_BEGIN(&team->t.t_bar);
1719 #if OMPT_SUPPORT
1720  if (ompt_enabled.enabled) {
1721 #if OMPT_OPTIONAL
1722  ompt_data_t *my_task_data;
1723  ompt_data_t *my_parallel_data;
1724  void *codeptr = NULL;
1725  int ds_tid = this_thr->th.th_info.ds.ds_tid;
1726  if (KMP_MASTER_TID(ds_tid) &&
1727  (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
1728  ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
1729  codeptr = team->t.ompt_team_info.master_return_address;
1730  my_task_data = OMPT_CUR_TASK_DATA(this_thr);
1731  my_parallel_data = OMPT_CUR_TEAM_DATA(this_thr);
1732  if (ompt_enabled.ompt_callback_sync_region) {
1733  ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
1734  ompt_sync_region_barrier_implicit, ompt_scope_begin, my_parallel_data,
1735  my_task_data, codeptr);
1736  }
1737  if (ompt_enabled.ompt_callback_sync_region_wait) {
1738  ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
1739  ompt_sync_region_barrier_implicit, ompt_scope_begin, my_parallel_data,
1740  my_task_data, codeptr);
1741  }
1742  if (!KMP_MASTER_TID(ds_tid))
1743  this_thr->th.ompt_thread_info.task_data = *OMPT_CUR_TASK_DATA(this_thr);
1744 #endif
1745  this_thr->th.ompt_thread_info.state = ompt_state_wait_barrier_implicit;
1746  }
1747 #endif
1748 
1749  if (__kmp_tasking_mode == tskm_extra_barrier) {
1750  __kmp_tasking_barrier(team, this_thr, gtid);
1751  KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) past tasking barrier\n", gtid,
1752  team_id, tid));
1753  }
1754 #ifdef KMP_DEBUG
1755  if (__kmp_tasking_mode != tskm_immediate_exec) {
1756  KA_TRACE(20, ("__kmp_join_barrier: T#%d, old team = %d, old task_team = "
1757  "%p, th_task_team = %p\n",
1758  __kmp_gtid_from_thread(this_thr), team_id,
1759  team->t.t_task_team[this_thr->th.th_task_state],
1760  this_thr->th.th_task_team));
1761  KMP_DEBUG_ASSERT(this_thr->th.th_task_team ==
1762  team->t.t_task_team[this_thr->th.th_task_state]);
1763  }
1764 #endif /* KMP_DEBUG */
1765 
1766  /* Copy the blocktime info to the thread, where __kmp_wait_template() can
1767  access it when the team struct is not guaranteed to exist. Doing these
1768  loads causes a cache miss that slows down EPCC parallel by 2x. As a workaround,
1769  we do not perform the copy if blocktime=infinite, since the values are not
1770  used by __kmp_wait_template() in that case. */
1771  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
1772 #if KMP_USE_MONITOR
1773  this_thr->th.th_team_bt_intervals =
1774  team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
1775  this_thr->th.th_team_bt_set =
1776  team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
1777 #else
1778  this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid);
1779 #endif
1780  }
1781 
1782 #if USE_ITT_BUILD
1783  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1784  __kmp_itt_barrier_starting(gtid, itt_sync_obj);
1785 #endif /* USE_ITT_BUILD */
1786 
1787  switch (__kmp_barrier_gather_pattern[bs_forkjoin_barrier]) {
1788  case bp_hyper_bar: {
1789  KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]);
1790  __kmp_hyper_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
1791  NULL USE_ITT_BUILD_ARG(itt_sync_obj));
1792  break;
1793  }
1794  case bp_hierarchical_bar: {
1795  __kmp_hierarchical_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
1796  NULL USE_ITT_BUILD_ARG(itt_sync_obj));
1797  break;
1798  }
1799  case bp_tree_bar: {
1800  KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]);
1801  __kmp_tree_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
1802  NULL USE_ITT_BUILD_ARG(itt_sync_obj));
1803  break;
1804  }
1805  default: {
1806  __kmp_linear_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
1807  NULL USE_ITT_BUILD_ARG(itt_sync_obj));
1808  }
1809  }
1810 
1811  /* From this point on, the team data structure may be deallocated at any time
1812  by the master thread - it is unsafe to reference it in any of the worker
1813  threads. Any per-team data items that need to be referenced before the
1814  end of the barrier should be moved to the kmp_task_team_t structs. */
1815  if (KMP_MASTER_TID(tid)) {
1816  if (__kmp_tasking_mode != tskm_immediate_exec) {
1817  __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
1818  }
1819  if (__kmp_display_affinity) {
1820  KMP_CHECK_UPDATE(team->t.t_display_affinity, 0);
1821  }
1822 #if KMP_STATS_ENABLED
1823  // Have the master thread flag the workers to indicate they are now
1824  // waiting for the next parallel region. Also wake them up so they switch
1825  // their timers to idle.
1826  for (int i = 0; i < team->t.t_nproc; ++i) {
1827  kmp_info_t *team_thread = team->t.t_threads[i];
1828  if (team_thread == this_thr)
1829  continue;
1830  team_thread->th.th_stats->setIdleFlag();
1831  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME &&
1832  team_thread->th.th_sleep_loc != NULL)
1833  __kmp_null_resume_wrapper(__kmp_gtid_from_thread(team_thread),
1834  team_thread->th.th_sleep_loc);
1835  }
1836 #endif
1837 #if USE_ITT_BUILD
1838  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1839  __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1840 #endif /* USE_ITT_BUILD */
1841 
1842 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1843  // Join barrier - report frame end
1844  if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
1845  __kmp_forkjoin_frames_mode &&
1846  (this_thr->th.th_teams_microtask == NULL || // either not in teams
1847  this_thr->th.th_teams_size.nteams == 1) && // or inside single team
1848  team->t.t_active_level == 1) {
1849  kmp_uint64 cur_time = __itt_get_timestamp();
1850  ident_t *loc = team->t.t_ident;
1851  kmp_info_t **other_threads = team->t.t_threads;
1852  int nproc = this_thr->th.th_team_nproc;
1853  int i;
1854  switch (__kmp_forkjoin_frames_mode) {
1855  case 1:
1856  __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
1857  loc, nproc);
1858  break;
1859  case 2:
1860  __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time, 1,
1861  loc, nproc);
1862  break;
1863  case 3:
1864  if (__itt_metadata_add_ptr) {
1865  // Initialize with master's wait time
1866  kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
1867  // Set arrive time to zero to be able to check it in
1868  // __kmp_invoke_task(); the same is done inside the loop below
1869  this_thr->th.th_bar_arrive_time = 0;
1870  for (i = 1; i < nproc; ++i) {
1871  delta += (cur_time - other_threads[i]->th.th_bar_arrive_time);
1872  other_threads[i]->th.th_bar_arrive_time = 0;
1873  }
1874  __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time,
1875  cur_time, delta, 0);
1876  }
1877  __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
1878  loc, nproc);
1879  this_thr->th.th_frame_time = cur_time;
1880  break;
1881  }
1882  }
1883 #endif /* USE_ITT_BUILD */
1884  }
1885 #if USE_ITT_BUILD
1886  else {
1887  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1888  __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1889  }
1890 #endif /* USE_ITT_BUILD */
1891 
1892 #if KMP_DEBUG
1893  if (KMP_MASTER_TID(tid)) {
1894  KA_TRACE(
1895  15,
1896  ("__kmp_join_barrier: T#%d(%d:%d) says all %d team threads arrived\n",
1897  gtid, team_id, tid, nproc));
1898  }
1899 #endif /* KMP_DEBUG */
1900 
1901  // TODO now, mark worker threads as done so they may be disbanded
1902  KMP_MB(); // Flush all pending memory write invalidates.
1903  KA_TRACE(10,
1904  ("__kmp_join_barrier: T#%d(%d:%d) leaving\n", gtid, team_id, tid));
1905 
1906  ANNOTATE_BARRIER_END(&team->t.t_bar);
1907 }
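/* Note on the surrounding control flow (an assumption about the callers, not
   enforced here): a worker returning from the join barrier is parked by its
   launch loop in __kmp_fork_barrier() below until the next parallel region is
   forked, while the master returns to finish the join processing. */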
1908 
1909 // TODO release worker threads' fork barriers as we are ready instead of all at
1910 // once
1911 void __kmp_fork_barrier(int gtid, int tid) {
1912  KMP_TIME_PARTITIONED_BLOCK(OMP_fork_barrier);
1913  KMP_SET_THREAD_STATE_BLOCK(FORK_JOIN_BARRIER);
1914  kmp_info_t *this_thr = __kmp_threads[gtid];
1915  kmp_team_t *team = (tid == 0) ? this_thr->th.th_team : NULL;
1916 #if USE_ITT_BUILD
1917  void *itt_sync_obj = NULL;
1918 #endif /* USE_ITT_BUILD */
1919  if (team)
1920  ANNOTATE_BARRIER_END(&team->t.t_bar);
1921 
1922  KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) has arrived\n", gtid,
1923  (team != NULL) ? team->t.t_id : -1, tid));
1924 
1925  // th_team pointer only valid for master thread here
1926  if (KMP_MASTER_TID(tid)) {
1927 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1928  if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
1929  // Create itt barrier object
1930  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 1);
1931  __kmp_itt_barrier_middle(gtid, itt_sync_obj); // Call acquired/releasing
1932  }
1933 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
1934 
1935 #ifdef KMP_DEBUG
1936  kmp_info_t **other_threads = team->t.t_threads;
1937  int i;
1938 
1939  // Verify state
1940  KMP_MB();
1941 
1942  for (i = 1; i < team->t.t_nproc; ++i) {
1943  KA_TRACE(500,
1944  ("__kmp_fork_barrier: T#%d(%d:0) checking T#%d(%d:%d) fork go "
1945  "== %u.\n",
1946  gtid, team->t.t_id, other_threads[i]->th.th_info.ds.ds_gtid,
1947  team->t.t_id, other_threads[i]->th.th_info.ds.ds_tid,
1948  other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go));
1949  KMP_DEBUG_ASSERT(
1950  (TCR_4(other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go) &
1951  ~(KMP_BARRIER_SLEEP_STATE)) == KMP_INIT_BARRIER_STATE);
1952  KMP_DEBUG_ASSERT(other_threads[i]->th.th_team == team);
1953  }
1954 #endif
1955 
1956  if (__kmp_tasking_mode != tskm_immediate_exec) {
1957  // 0 indicates: set up the current task team if nthreads > 1
1958  __kmp_task_team_setup(this_thr, team, 0);
1959  }
1960 
1961  /* The master thread may have changed its blocktime between the join barrier
1962  and the fork barrier. Copy the blocktime info to the thread, where
1963  __kmp_wait_template() can access it when the team struct is not
1964  guaranteed to exist. */
1965  // See note about the corresponding code in __kmp_join_barrier() being
1966  // performance-critical
1967  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
1968 #if KMP_USE_MONITOR
1969  this_thr->th.th_team_bt_intervals =
1970  team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
1971  this_thr->th.th_team_bt_set =
1972  team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
1973 #else
1974  this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid);
1975 #endif
1976  }
1977  } // master
1978 
1979  switch (__kmp_barrier_release_pattern[bs_forkjoin_barrier]) {
1980  case bp_hyper_bar: {
1981  KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);
1982  __kmp_hyper_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
1983  TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
1984  break;
1985  }
1986  case bp_hierarchical_bar: {
1987  __kmp_hierarchical_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
1988  TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
1989  break;
1990  }
1991  case bp_tree_bar: {
1992  KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);
1993  __kmp_tree_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
1994  TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
1995  break;
1996  }
1997  default: {
1998  __kmp_linear_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
1999  TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
2000  }
2001  }
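  // On exit from the release routine: the master has signaled the workers'
  // b_go flags and falls through immediately; each worker returns here only
  // once it has been woken (it was waiting on its own forkjoin b_go flag).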
2002 
2003 #if OMPT_SUPPORT
2004  if (ompt_enabled.enabled &&
2005  this_thr->th.ompt_thread_info.state == ompt_state_wait_barrier_implicit) {
2006  int ds_tid = this_thr->th.th_info.ds.ds_tid;
2007  ompt_data_t *task_data = (team)
2008  ? OMPT_CUR_TASK_DATA(this_thr)
2009  : &(this_thr->th.ompt_thread_info.task_data);
2010  this_thr->th.ompt_thread_info.state = ompt_state_overhead;
2011 #if OMPT_OPTIONAL
2012  void *codeptr = NULL;
2013  if (KMP_MASTER_TID(ds_tid) &&
2014  (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
2015  ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
2016  codeptr = team->t.ompt_team_info.master_return_address;
2017  if (ompt_enabled.ompt_callback_sync_region_wait) {
2018  ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
2019  ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
2020  codeptr);
2021  }
2022  if (ompt_enabled.ompt_callback_sync_region) {
2023  ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
2024  ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
2025  codeptr);
2026  }
2027 #endif
2028  if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) {
2029  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2030  ompt_scope_end, NULL, task_data, 0, ds_tid, ompt_task_implicit); // TODO: Can this be ompt_task_initial?
2031  }
2032  }
2033 #endif
2034 
2035  // Early exit for reaping threads releasing forkjoin barrier
2036  if (TCR_4(__kmp_global.g.g_done)) {
2037  this_thr->th.th_task_team = NULL;
2038 
2039 #if USE_ITT_BUILD && USE_ITT_NOTIFY
2040  if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
2041  if (!KMP_MASTER_TID(tid)) {
2042  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
2043  if (itt_sync_obj)
2044  __kmp_itt_barrier_finished(gtid, itt_sync_obj);
2045  }
2046  }
2047 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
2048  KA_TRACE(10, ("__kmp_fork_barrier: T#%d is leaving early\n", gtid));
2049  return;
2050  }
2051 
2052  /* We can now assume that a valid team structure has been allocated by the
2053  master and propagated to all worker threads. The current thread, however,
2054  may not be part of the team, so we can't blindly assume that the team
2055  pointer is non-null. */
2056  team = (kmp_team_t *)TCR_PTR(this_thr->th.th_team);
2057  KMP_DEBUG_ASSERT(team != NULL);
2058  tid = __kmp_tid_from_gtid(gtid);
2059 
2060 #if KMP_BARRIER_ICV_PULL
2061  /* Master thread's copy of the ICVs was set up on the implicit taskdata in
2062  __kmp_reinitialize_team. __kmp_fork_call() assumes the master thread's
2063  implicit task has this data before this function is called. We cannot
2064  modify __kmp_fork_call() to look at the fixed ICVs in the master's thread
2065  struct, because it is not always the case that the threads arrays have
2066  been allocated when __kmp_fork_call() is executed. */
2067  {
2068  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
2069  if (!KMP_MASTER_TID(tid)) { // master thread already has ICVs
2070  // Copy the initial ICVs from the master's thread struct to the implicit
2071  // task for this tid.
2072  KA_TRACE(10,
2073  ("__kmp_fork_barrier: T#%d(%d) is PULLing ICVs\n", gtid, tid));
2074  __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team,
2075  tid, FALSE);
2076  copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
2077  &team->t.t_threads[0]
2078  ->th.th_bar[bs_forkjoin_barrier]
2079  .bb.th_fixed_icvs);
2080  }
2081  }
2082 #endif // KMP_BARRIER_ICV_PULL
2083 
2084  if (__kmp_tasking_mode != tskm_immediate_exec) {
2085  __kmp_task_team_sync(this_thr, team);
2086  }
2087 
2088 #if KMP_AFFINITY_SUPPORTED
2089  kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
2090  if (proc_bind == proc_bind_intel) {
2091  // Call dynamic affinity settings
2092  if (__kmp_affinity_type == affinity_balanced && team->t.t_size_changed) {
2093  __kmp_balanced_affinity(this_thr, team->t.t_nproc);
2094  }
2095  } else if (proc_bind != proc_bind_false) {
2096  if (this_thr->th.th_new_place == this_thr->th.th_current_place) {
2097  KA_TRACE(100, ("__kmp_fork_barrier: T#%d already in correct place %d\n",
2098  __kmp_gtid_from_thread(this_thr),
2099  this_thr->th.th_current_place));
2100  } else {
2101  __kmp_affinity_set_place(gtid);
2102  }
2103  }
2104 #endif // KMP_AFFINITY_SUPPORTED
2105  // Perform the display affinity functionality
2106  if (__kmp_display_affinity) {
2107  if (team->t.t_display_affinity
2108 #if KMP_AFFINITY_SUPPORTED
2109  || (__kmp_affinity_type == affinity_balanced && team->t.t_size_changed)
2110 #endif
2111  ) {
2112  // NULL means use the affinity-format-var ICV
2113  __kmp_aux_display_affinity(gtid, NULL);
2114  this_thr->th.th_prev_num_threads = team->t.t_nproc;
2115  this_thr->th.th_prev_level = team->t.t_level;
2116  }
2117  }
2118  if (!KMP_MASTER_TID(tid))
2119  KMP_CHECK_UPDATE(this_thr->th.th_def_allocator, team->t.t_def_allocator);
2120 
2121 #if USE_ITT_BUILD && USE_ITT_NOTIFY
2122  if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
2123  if (!KMP_MASTER_TID(tid)) {
2124  // Get correct barrier object
2125  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
2126  __kmp_itt_barrier_finished(gtid, itt_sync_obj); // Workers call acquired
2127  } // (prepare called inside barrier_release)
2128  }
2129 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
2130  ANNOTATE_BARRIER_END(&team->t.t_bar);
2131  KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) is leaving\n", gtid,
2132  team->t.t_id, tid));
2133 }
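/* Sketch of the surrounding control flow (assumed, simplified): the master
   reaches this fork barrier while launching a parallel region and uses the
   release phase to wake the team; each worker reaches it from its launch loop
   and blocks in the release phase until the next region's work is ready. */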
2134 
2135 void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc,
2136  kmp_internal_control_t *new_icvs, ident_t *loc) {
2137  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_setup_icv_copy);
2138 
2139  KMP_DEBUG_ASSERT(team && new_nproc && new_icvs);
2140  KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);
2141 
2142 /* Master thread's copy of the ICVs was set up on the implicit taskdata in
2143  __kmp_reinitialize_team. __kmp_fork_call() assumes the master thread's
2144  implicit task has this data before this function is called. */
2145 #if KMP_BARRIER_ICV_PULL
2146 /* Copy the ICVs into the master thread's th_fixed_icvs (which remains
2147  untouched), where all of the worker threads can access them and make
2148  their own copies after the barrier. */
2149  KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be
2150  // allocated at this point
2151  copy_icvs(
2152  &team->t.t_threads[0]->th.th_bar[bs_forkjoin_barrier].bb.th_fixed_icvs,
2153  new_icvs);
2154  KF_TRACE(10, ("__kmp_setup_icv_copy: PULL: T#%d this_thread=%p team=%p\n", 0,
2155  team->t.t_threads[0], team));
2156 #elif KMP_BARRIER_ICV_PUSH
2157  // The ICVs will be propagated in the fork barrier, so nothing needs to be
2158  // done here.
2159  KF_TRACE(10, ("__kmp_setup_icv_copy: PUSH: T#%d this_thread=%p team=%p\n", 0,
2160  team->t.t_threads[0], team));
2161 #else
2162  // Copy the ICVs to each of the non-master threads. This takes O(nthreads)
2163  // time.
2164  ngo_load(new_icvs);
2165  KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be
2166  // allocated at this point
2167  for (int f = 1; f < new_nproc; ++f) { // Skip the master thread
2168  // TODO: GEH - pass in better source location info since usually NULL here
2169  KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
2170  f, team->t.t_threads[f], team));
2171  __kmp_init_implicit_task(loc, team->t.t_threads[f], team, f, FALSE);
2172  ngo_store_icvs(&team->t.t_implicit_task_taskdata[f].td_icvs, new_icvs);
2173  KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
2174  f, team->t.t_threads[f], team));
2175  }
2176  ngo_sync();
2177 #endif // KMP_BARRIER_ICV_PULL
2178 }
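/* Informal summary of the three ICV-propagation strategies selected above
   (derived from the code in this file):
   - KMP_BARRIER_ICV_PULL: the forking master stages the ICVs in its
     th_fixed_icvs here; each worker pulls its own copy in __kmp_fork_barrier().
   - KMP_BARRIER_ICV_PUSH: the ICVs are pushed to the workers during the
     fork-barrier release (propagate_icvs == TRUE), so nothing is done here.
   - otherwise: the forking thread copies the ICVs into every worker's
     implicit task right here, an O(nthreads) loop. */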