LLVM OpenMP* Runtime Library
kmp_dispatch.h
/*
 * kmp_dispatch.h: dynamic scheduling - iteration initialization and dispatch.
 */

//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#ifndef KMP_DISPATCH_H
#define KMP_DISPATCH_H

/* ------------------------------------------------------------------------ */
/* ------------------------------------------------------------------------ */

#include "kmp.h"
#include "kmp_error.h"
#include "kmp_i18n.h"
#include "kmp_itt.h"
#include "kmp_stats.h"
#include "kmp_str.h"
#if KMP_OS_WINDOWS && KMP_ARCH_X86
#include <float.h>
#endif

#if OMPT_SUPPORT
#include "ompt-internal.h"
#include "ompt-specific.h"
#endif

/* ------------------------------------------------------------------------ */
/* ------------------------------------------------------------------------ */
#if KMP_USE_HIER_SCHED
// Forward declarations of some hierarchical scheduling data structures
template <typename T> struct kmp_hier_t;
template <typename T> struct kmp_hier_top_unit_t;
#endif // KMP_USE_HIER_SCHED

template <typename T> struct dispatch_shared_info_template;
template <typename T> struct dispatch_private_info_template;

template <typename T>
extern void __kmp_dispatch_init_algorithm(ident_t *loc, int gtid,
                                          dispatch_private_info_template<T> *pr,
                                          enum sched_type schedule, T lb, T ub,
                                          typename traits_t<T>::signed_t st,
#if USE_ITT_BUILD
                                          kmp_uint64 *cur_chunk,
#endif
                                          typename traits_t<T>::signed_t chunk,
                                          T nproc, T unit_id);
template <typename T>
extern int __kmp_dispatch_next_algorithm(
    int gtid, dispatch_private_info_template<T> *pr,
    dispatch_shared_info_template<T> volatile *sh, kmp_int32 *p_last, T *p_lb,
    T *p_ub, typename traits_t<T>::signed_t *p_st, T nproc, T unit_id);

void __kmp_dispatch_dxo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref);
void __kmp_dispatch_deo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref);

#if KMP_STATIC_STEAL_ENABLED

// replaces dispatch_private_info{32,64} structures and
// dispatch_private_info{32,64}_t types
template <typename T> struct dispatch_private_infoXX_template {
  typedef typename traits_t<T>::unsigned_t UT;
  typedef typename traits_t<T>::signed_t ST;
  UT count; // unsigned
  T ub;
  /* Adding KMP_ALIGN_CACHE here doesn't help / can hurt performance */
  T lb;
  ST st; // signed
  UT tc; // unsigned
  kmp_lock_t *steal_lock; // lock used for chunk stealing
  /* parm[1-4] are used in different ways by different scheduling algorithms */

  // KMP_ALIGN(32) ensures (if the KMP_ALIGN macro is turned on)
  // a) parm3 is properly aligned and
  // b) all parm1-4 are in the same cache line.
  // Because parm1-4 are used together, performance seems to be better
  // if they are in the same line (not measured though).

  struct KMP_ALIGN(32) { // compiler does not accept sizeof(T)*4
    T parm1;
    T parm2;
    T parm3;
    T parm4;
  };

  UT ordered_lower; // unsigned
  UT ordered_upper; // unsigned
#if KMP_OS_WINDOWS
  T last_upper;
#endif /* KMP_OS_WINDOWS */
};

#else /* KMP_STATIC_STEAL_ENABLED */

// replaces dispatch_private_info{32,64} structures and
// dispatch_private_info{32,64}_t types
template <typename T> struct dispatch_private_infoXX_template {
  typedef typename traits_t<T>::unsigned_t UT;
  typedef typename traits_t<T>::signed_t ST;
  T lb;
  T ub;
  ST st; // signed
  UT tc; // unsigned

  T parm1;
  T parm2;
  T parm3;
  T parm4;

  UT count; // unsigned

  UT ordered_lower; // unsigned
  UT ordered_upper; // unsigned
#if KMP_OS_WINDOWS
  T last_upper;
#endif /* KMP_OS_WINDOWS */
};
#endif /* KMP_STATIC_STEAL_ENABLED */
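
// Editorial note: traits_t (defined in kmp.h) maps the loop index type T to
// matching signed/unsigned widths, so one template body stands in for the
// hand-written 32- and 64-bit variants it replaces. A sketch of an
// instantiation (illustrative only; the variable is hypothetical):
//
//   dispatch_private_infoXX_template<kmp_int32> pi;
//   // pi.tc has type traits_t<kmp_int32>::unsigned_t (32-bit unsigned count)
//   // pi.st has type traits_t<kmp_int32>::signed_t   (32-bit signed stride)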

template <typename T> struct KMP_ALIGN_CACHE dispatch_private_info_template {
  // duplicate alignment here, otherwise size of structure is not correct in our
  // compiler
  union KMP_ALIGN_CACHE private_info_tmpl {
    dispatch_private_infoXX_template<T> p;
    dispatch_private_info64_t p64;
  } u;
  enum sched_type schedule; /* scheduling algorithm */
  kmp_sched_flags_t flags; /* flags (e.g., ordered, nomerge, etc.) */
  std::atomic<kmp_uint32> steal_flag; // static_steal only, state of a buffer
  kmp_uint32 ordered_bumped;
  dispatch_private_info *next; /* stack of buffers for nest of serial regions */
  kmp_uint32 type_size;
#if KMP_USE_HIER_SCHED
  kmp_int32 hier_id;
  kmp_hier_top_unit_t<T> *hier_parent;
  // member functions
  kmp_int32 get_hier_id() const { return hier_id; }
  kmp_hier_top_unit_t<T> *get_parent() { return hier_parent; }
#endif
  enum cons_type pushed_ws;
};

// replaces dispatch_shared_info{32,64} structures and
// dispatch_shared_info{32,64}_t types
template <typename T> struct dispatch_shared_infoXX_template {
  typedef typename traits_t<T>::unsigned_t UT;
  typedef typename traits_t<T>::signed_t ST;
  /* chunk index under dynamic, number of idle threads under static-steal;
     iteration index otherwise */
  volatile UT iteration;
  volatile ST num_done;
  volatile UT ordered_iteration;
  // dummy padding that retains the structure size after ordered_iteration
  // became a scalar
  UT ordered_dummy[KMP_MAX_ORDERED - 3];
};
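
// Editorial note (derivable from the declaration above): iteration, num_done,
// and ordered_iteration occupy three UT/ST-sized slots, and ordered_dummy
// pads out KMP_MAX_ORDERED - 3 more, so the struct always spans
// KMP_MAX_ORDERED such slots in total, preserving the historical layout from
// when ordered_iteration was not a single scalar.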

// replaces dispatch_shared_info structure and dispatch_shared_info_t type
template <typename T> struct dispatch_shared_info_template {
  typedef typename traits_t<T>::unsigned_t UT;
  // we need union here to keep the structure size
  union shared_info_tmpl {
    dispatch_shared_infoXX_template<UT> s;
    dispatch_shared_info64_t s64;
  } u;
  volatile kmp_uint32 buffer_index;
  volatile kmp_int32 doacross_buf_idx; // teamwise index
  kmp_uint32 *doacross_flags; // array of iteration flags (0/1)
  kmp_int32 doacross_num_done; // count finished threads
#if KMP_USE_HIER_SCHED
  kmp_hier_t<T> *hier;
#endif
#if KMP_USE_HWLOC
  // When linking with libhwloc, the ORDERED EPCC test slows down on big
  // machines (> 48 cores). Performance analysis showed that a cache thrash
  // was occurring and this padding helps alleviate the problem.
  char padding[64];
#endif
};

/* ------------------------------------------------------------------------ */
/* ------------------------------------------------------------------------ */

#undef USE_TEST_LOCKS

// test_then_add template (general template should NOT be used)
template <typename T> static __forceinline T test_then_add(volatile T *p, T d);

template <>
__forceinline kmp_int32 test_then_add<kmp_int32>(volatile kmp_int32 *p,
                                                 kmp_int32 d) {
  kmp_int32 r;
  r = KMP_TEST_THEN_ADD32(p, d);
  return r;
}

template <>
__forceinline kmp_int64 test_then_add<kmp_int64>(volatile kmp_int64 *p,
                                                 kmp_int64 d) {
  kmp_int64 r;
  r = KMP_TEST_THEN_ADD64(p, d);
  return r;
}
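
// Editorial note: KMP_TEST_THEN_ADD32/64 are atomic fetch-and-add macros, so
// test_then_add returns the value *p held *before* the addition. A behavioral
// sketch in portable C++ (illustration only, not the runtime's implementation,
// which uses platform-specific atomics; the helper name is hypothetical):
//
//   #include <atomic>
//   kmp_int32 test_then_add_sketch(std::atomic<kmp_int32> *p, kmp_int32 d) {
//     return p->fetch_add(d); // returns the pre-add value
//   }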

// test_then_inc_acq template (general template should NOT be used)
template <typename T> static __forceinline T test_then_inc_acq(volatile T *p);

template <>
__forceinline kmp_int32 test_then_inc_acq<kmp_int32>(volatile kmp_int32 *p) {
  kmp_int32 r;
  r = KMP_TEST_THEN_INC_ACQ32(p);
  return r;
}

template <>
__forceinline kmp_int64 test_then_inc_acq<kmp_int64>(volatile kmp_int64 *p) {
  kmp_int64 r;
  r = KMP_TEST_THEN_INC_ACQ64(p);
  return r;
}

// test_then_inc template (general template should NOT be used)
template <typename T> static __forceinline T test_then_inc(volatile T *p);

template <>
__forceinline kmp_int32 test_then_inc<kmp_int32>(volatile kmp_int32 *p) {
  kmp_int32 r;
  r = KMP_TEST_THEN_INC32(p);
  return r;
}

template <>
__forceinline kmp_int64 test_then_inc<kmp_int64>(volatile kmp_int64 *p) {
  kmp_int64 r;
  r = KMP_TEST_THEN_INC64(p);
  return r;
}

// compare_and_swap template (general template should NOT be used)
template <typename T>
static __forceinline kmp_int32 compare_and_swap(volatile T *p, T c, T s);

template <>
__forceinline kmp_int32 compare_and_swap<kmp_int32>(volatile kmp_int32 *p,
                                                    kmp_int32 c, kmp_int32 s) {
  return KMP_COMPARE_AND_STORE_REL32(p, c, s);
}

template <>
__forceinline kmp_int32 compare_and_swap<kmp_int64>(volatile kmp_int64 *p,
                                                    kmp_int64 c, kmp_int64 s) {
  return KMP_COMPARE_AND_STORE_REL64(p, c, s);
}
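
// Editorial note: unlike a classic compare-and-swap that returns the old
// value, KMP_COMPARE_AND_STORE_REL32/64 return a success flag, so
// compare_and_swap yields nonzero iff *p equaled c and was replaced by s.
// A behavioral sketch in portable C++ (illustration only; the helper name is
// hypothetical):
//
//   #include <atomic>
//   kmp_int32 compare_and_swap_sketch(std::atomic<kmp_int32> *p, kmp_int32 c,
//                                     kmp_int32 s) {
//     return p->compare_exchange_strong(c, s) ? 1 : 0;
//   }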

template <typename T> kmp_uint32 __kmp_ge(T value, T checker) {
  return value >= checker;
}
template <typename T> kmp_uint32 __kmp_eq(T value, T checker) {
  return value == checker;
}

/*
   Spin wait loop that pauses between checks.
   Waits until the predicate returns non-zero when called with *spinner and
   check. Does NOT put threads to sleep.
   Arguments:
     UT is unsigned 4- or 8-byte type
     spinner - memory location whose value is being checked
     checker - value which spinner is compared against (>, <, ==, etc.)
     pred - predicate function that performs the binary comparison
#if USE_ITT_BUILD
     obj -- higher-level synchronization object to report to ittnotify. It
     is used to report locks consistently. For example, if a lock is acquired
     immediately, its address is reported to ittnotify via
     KMP_FSYNC_ACQUIRED(). However, if the lock cannot be acquired immediately
     and the lock routine calls KMP_WAIT(), the latter should report the
     same address, not the address of the low-level spinner.
#endif // USE_ITT_BUILD
   TODO: make inline function (move to header file for icl)
*/
template <typename UT>
static UT __kmp_wait(volatile UT *spinner, UT checker,
                     kmp_uint32 (*pred)(UT, UT) USE_ITT_BUILD_ARG(void *obj)) {
  // note: we may not belong to a team at this point
  volatile UT *spin = spinner;
  UT check = checker;
  kmp_uint32 spins;
  kmp_uint32 (*f)(UT, UT) = pred;
  kmp_uint64 time;
  UT r;

  KMP_FSYNC_SPIN_INIT(obj, CCAST(UT *, spin));
  KMP_INIT_YIELD(spins);
  KMP_INIT_BACKOFF(time);
  // main wait spin loop
  while (!f(r = *spin, check)) {
    KMP_FSYNC_SPIN_PREPARE(obj);
    /* GEH - remove this since it was accidentally introduced when kmp_wait was
       split.
       It causes problems with infinite recursion because of exit lock */
    /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
       __kmp_abort_thread(); */
    // If oversubscribed, or have waited a bit then yield.
    KMP_YIELD_OVERSUB_ELSE_SPIN(spins, time);
  }
  KMP_FSYNC_SPIN_ACQUIRED(obj);
  return r;
}
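
// Usage example (taken from __kmp_dispatch_deo below): block until the
// team-shared ordered iteration counter reaches this thread's lower bound,
// using __kmp_ge as the predicate:
//
//   __kmp_wait<UT>(&sh->u.s.ordered_iteration, lower,
//                  __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));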

/* ------------------------------------------------------------------------ */
/* ------------------------------------------------------------------------ */

template <typename UT>
void __kmp_dispatch_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
  dispatch_private_info_template<UT> *pr;

  int gtid = *gtid_ref;
  // int cid = *cid_ref;
  kmp_info_t *th = __kmp_threads[gtid];
  KMP_DEBUG_ASSERT(th->th.th_dispatch);

  KD_TRACE(100, ("__kmp_dispatch_deo: T#%d called\n", gtid));
  if (__kmp_env_consistency_check) {
    pr = reinterpret_cast<dispatch_private_info_template<UT> *>(
        th->th.th_dispatch->th_dispatch_pr_current);
    if (pr->pushed_ws != ct_none) {
#if KMP_USE_DYNAMIC_LOCK
      __kmp_push_sync(gtid, ct_ordered_in_pdo, loc_ref, NULL, 0);
#else
      __kmp_push_sync(gtid, ct_ordered_in_pdo, loc_ref, NULL);
#endif
    }
  }

  if (!th->th.th_team->t.t_serialized) {
    dispatch_shared_info_template<UT> *sh =
        reinterpret_cast<dispatch_shared_info_template<UT> *>(
            th->th.th_dispatch->th_dispatch_sh_current);
    UT lower;

    if (!__kmp_env_consistency_check) {
      pr = reinterpret_cast<dispatch_private_info_template<UT> *>(
          th->th.th_dispatch->th_dispatch_pr_current);
    }
    lower = pr->u.p.ordered_lower;

#if !defined(KMP_GOMP_COMPAT)
    if (__kmp_env_consistency_check) {
      if (pr->ordered_bumped) {
        struct cons_header *p = __kmp_threads[gtid]->th.th_cons;
        __kmp_error_construct2(kmp_i18n_msg_CnsMultipleNesting,
                               ct_ordered_in_pdo, loc_ref,
                               &p->stack_data[p->w_top]);
      }
    }
#endif /* !defined(KMP_GOMP_COMPAT) */

    KMP_MB();
#ifdef KMP_DEBUG
    {
      char *buff;
      // create format specifiers before the debug output
      buff = __kmp_str_format("__kmp_dispatch_deo: T#%%d before wait: "
                              "ordered_iter:%%%s lower:%%%s\n",
                              traits_t<UT>::spec, traits_t<UT>::spec);
      KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
      __kmp_str_free(&buff);
    }
#endif
    __kmp_wait<UT>(&sh->u.s.ordered_iteration, lower,
                   __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
    KMP_MB(); /* is this necessary? */
#ifdef KMP_DEBUG
    {
      char *buff;
      // create format specifiers before the debug output
      buff = __kmp_str_format("__kmp_dispatch_deo: T#%%d after wait: "
                              "ordered_iter:%%%s lower:%%%s\n",
                              traits_t<UT>::spec, traits_t<UT>::spec);
      KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
      __kmp_str_free(&buff);
    }
#endif
  }
  KD_TRACE(100, ("__kmp_dispatch_deo: T#%d returned\n", gtid));
}

template <typename UT>
void __kmp_dispatch_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
  typedef typename traits_t<UT>::signed_t ST;
  dispatch_private_info_template<UT> *pr;

  int gtid = *gtid_ref;
  // int cid = *cid_ref;
  kmp_info_t *th = __kmp_threads[gtid];
  KMP_DEBUG_ASSERT(th->th.th_dispatch);

  KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d called\n", gtid));
  if (__kmp_env_consistency_check) {
    pr = reinterpret_cast<dispatch_private_info_template<UT> *>(
        th->th.th_dispatch->th_dispatch_pr_current);
    if (pr->pushed_ws != ct_none) {
      __kmp_pop_sync(gtid, ct_ordered_in_pdo, loc_ref);
    }
  }

  if (!th->th.th_team->t.t_serialized) {
    dispatch_shared_info_template<UT> *sh =
        reinterpret_cast<dispatch_shared_info_template<UT> *>(
            th->th.th_dispatch->th_dispatch_sh_current);

    if (!__kmp_env_consistency_check) {
      pr = reinterpret_cast<dispatch_private_info_template<UT> *>(
          th->th.th_dispatch->th_dispatch_pr_current);
    }

    KMP_FSYNC_RELEASING(CCAST(UT *, &sh->u.s.ordered_iteration));
#if !defined(KMP_GOMP_COMPAT)
    if (__kmp_env_consistency_check) {
      if (pr->ordered_bumped != 0) {
        struct cons_header *p = __kmp_threads[gtid]->th.th_cons;
        /* How to test it? - OM */
        __kmp_error_construct2(kmp_i18n_msg_CnsMultipleNesting,
                               ct_ordered_in_pdo, loc_ref,
                               &p->stack_data[p->w_top]);
      }
    }
#endif /* !defined(KMP_GOMP_COMPAT) */

    KMP_MB(); /* Flush all pending memory write invalidates. */

    pr->ordered_bumped += 1;

    KD_TRACE(1000,
             ("__kmp_dispatch_dxo: T#%d bumping ordered ordered_bumped=%d\n",
              gtid, pr->ordered_bumped));

    KMP_MB(); /* Flush all pending memory write invalidates. */

    /* TODO use general release procedure? */
    test_then_inc<ST>((volatile ST *)&sh->u.s.ordered_iteration);

    KMP_MB(); /* Flush all pending memory write invalidates. */
  }
  KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d returned\n", gtid));
}

/* Computes and returns x to the power of y, where y must be a non-negative
   integer */
template <typename UT>
static __forceinline long double __kmp_pow(long double x, UT y) {
  long double s = 1.0L;

  KMP_DEBUG_ASSERT(x > 0.0 && x < 1.0);
  // KMP_DEBUG_ASSERT(y >= 0); // y is unsigned
  while (y) {
    if (y & 1)
      s *= x;
    x *= x;
    y >>= 1;
  }
  return s;
}
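
// Editorial note: this is binary exponentiation (square-and-multiply), using
// O(log2 y) multiplications. A worked trace of __kmp_pow(0.5L, 3):
//   s = 1.0
//   y = 3 (binary 11): low bit set -> s = 0.5;   then x = 0.25,   y = 1
//   y = 1 (binary 01): low bit set -> s = 0.125; then x = 0.0625, y = 0
//   result: 0.125 == 0.5^3
// The assertion restricts x to (0, 1), matching the guided-schedule bases
// passed in by the callers.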

/* Computes and returns the number of unassigned iterations after idx chunks
   have been assigned
   (the total number of unassigned iterations in chunks with index greater than
   or equal to idx).
   __forceinline seems to be broken so that if we __forceinline this function,
   the behavior is wrong
   (one of the unit tests, sch_guided_analytical_basic.cpp, fails)
*/
template <typename T>
static __inline typename traits_t<T>::unsigned_t
__kmp_dispatch_guided_remaining(T tc, typename traits_t<T>::floating_t base,
                                typename traits_t<T>::unsigned_t idx) {
  /* Note: On Windows* OS on IA-32 architecture and Intel(R) 64, at
     least for ICL 8.1, long double arithmetic may not really have
     long double precision, even with /Qlong_double. Currently, we
     workaround that in the caller code, by manipulating the FPCW for
     Windows* OS on IA-32 architecture. The lack of precision is not
     expected to be a correctness issue, though.
  */
  typedef typename traits_t<T>::unsigned_t UT;

  long double x = tc * __kmp_pow<UT>(base, idx);
  UT r = (UT)x;
  if (x == r)
    return r;
  return r + 1;
}
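
// Editorial note: the code above computes ceil(tc * base^idx). A worked
// example with hypothetical values tc = 1000 and base = 0.875 (a base of the
// form 1 - 0.5/nproc, here with nproc = 4, stays inside the (0, 1) range that
// __kmp_pow asserts):
//   idx = 0: ceil(1000 * 1.0)      = 1000 iterations remain
//   idx = 1: ceil(1000 * 0.875)    =  875
//   idx = 2: ceil(1000 * 0.765625) =  766
// i.e., the unassigned work shrinks geometrically with the chunk index.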

// Parameters of the guided-iterative algorithm:
//   p2 = n * nproc * ( chunk + 1 ) // point of switching to dynamic
//   p3 = 1 / ( n * nproc )         // remaining iterations multiplier
// By default n = 2. For example, with n = 3 the chunk distribution will be
// flatter.
// With n = 1 the first chunk is the same as for a static schedule, i.e.
// trip_count / nproc.
static const int guided_int_param = 2;
static const double guided_flt_param = 0.5; // = 1.0 / guided_int_param;
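
// Editorial note: a worked instance of the formulas above, with hypothetical
// values nproc = 4 and chunk = 7 and the default n = guided_int_param = 2:
//   p2 = 2 * 4 * (7 + 1) = 64    // switch to dynamic chunking once few
//                                // enough iterations remain
//   p3 = 1.0 / (2 * 4)   = 0.125 // each grab takes ~1/8 of the remaining
//                                // iterations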
#endif // KMP_DISPATCH_H