#include "kmp_error.h"
#include "kmp_stats.h"
#if KMP_USE_X87CONTROL
#include "kmp_dispatch.h"
#if KMP_USE_HIER_SCHED
#include "kmp_dispatch_hier.h"
#include "ompt-specific.h"
void __kmp_dispatch_deo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
  KMP_DEBUG_ASSERT(gtid_ref);
  if (__kmp_env_consistency_check) {
    th = __kmp_threads[*gtid_ref];
    if (th->th.th_root->r.r_active &&
        (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none)) {
#if KMP_USE_DYNAMIC_LOCK
      __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL, 0);
      __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL);
void __kmp_dispatch_dxo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
  if (__kmp_env_consistency_check) {
    th = __kmp_threads[*gtid_ref];
    if (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none) {
      __kmp_pop_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref);
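// __kmp_get_monotonicity (fragment below): decides between SCHEDULE_MONOTONIC
// and SCHEDULE_NONMONOTONIC for a schedule. Loops compiled for OpenMP versions
// before 5.0 default to monotonic; otherwise an explicit monotonic or
// nonmonotonic schedule modifier decides.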
bool use_hier = false) {
  int monotonicity = SCHEDULE_MONOTONIC;
  if (loc->get_openmp_version() < 50)
    monotonicity = SCHEDULE_MONOTONIC;
    monotonicity = SCHEDULE_MONOTONIC;
  else if (SCHEDULE_HAS_NONMONOTONIC(schedule))
    monotonicity = SCHEDULE_NONMONOTONIC;
  else if (SCHEDULE_HAS_MONOTONIC(schedule))
    monotonicity = SCHEDULE_MONOTONIC;
template <typename T>
void __kmp_dispatch_init_algorithm(ident_t *loc, int gtid,
                                   dispatch_private_info_template<T> *pr,
                                   typename traits_t<T>::signed_t st,
                                   kmp_uint64 *cur_chunk,
                                   typename traits_t<T>::signed_t chunk,
  typedef typename traits_t<T>::unsigned_t UT;
  typedef typename traits_t<T>::floating_t DBL;
    typedef typename traits_t<T>::signed_t ST;
    buff = __kmp_str_format("__kmp_dispatch_init_algorithm: T#%%d called "
                            "pr:%%p lb:%%%s ub:%%%s st:%%%s "
                            "schedule:%%d chunk:%%%s nproc:%%%s tid:%%%s\n",
                            traits_t<T>::spec, traits_t<T>::spec,
                            traits_t<ST>::spec, traits_t<ST>::spec,
                            traits_t<T>::spec, traits_t<T>::spec);
    KD_TRACE(10, (buff, gtid, pr, lb, ub, st, schedule, chunk, nproc, tid));
    __kmp_str_free(&buff);
  th = __kmp_threads[gtid];
  team = th->th.th_team;
  active = !team->t.t_serialized;
  int itt_need_metadata_reporting =
      __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
      KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
      team->t.t_active_level == 1;
#if KMP_USE_HIER_SCHED
  use_hier = pr->flags.use_hier;
  monotonicity = __kmp_get_monotonicity(loc, schedule, use_hier);
  schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
    pr->flags.nomerge = TRUE;
    pr->flags.nomerge = FALSE;
  pr->type_size = traits_t<T>::type_size;
    pr->flags.ordered = TRUE;
    pr->flags.ordered = FALSE;
  if (pr->flags.ordered) {
    monotonicity = SCHEDULE_MONOTONIC;
      schedule = __kmp_static;
  if (schedule == kmp_sch_runtime) {
    schedule = team->t.t_sched.r_sched_type;
    monotonicity = __kmp_get_monotonicity(loc, schedule, use_hier);
    schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
      schedule = __kmp_guided;
      schedule = __kmp_static;
    chunk = team->t.t_sched.chunk;
      buff = __kmp_str_format("__kmp_dispatch_init_algorithm: T#%%d new: "
                              "schedule:%%d chunk:%%%s\n",
      KD_TRACE(10, (buff, gtid, schedule, chunk));
      __kmp_str_free(&buff);
      schedule = __kmp_guided;
      chunk = KMP_DEFAULT_CHUNK;
      schedule = __kmp_auto;
        buff = __kmp_str_format(
            "__kmp_dispatch_init_algorithm: kmp_sch_auto: T#%%d new: "
            "schedule:%%d chunk:%%%s\n",
        KD_TRACE(10, (buff, gtid, schedule, chunk));
        __kmp_str_free(&buff);
#if KMP_STATIC_STEAL_ENABLED
    if (schedule == kmp_sch_dynamic_chunked) {
      if (monotonicity == SCHEDULE_NONMONOTONIC)
        schedule = kmp_sch_static_steal;
    if (schedule == kmp_sch_guided_analytical_chunked && nproc > 1 << 20) {
      schedule = kmp_sch_guided_iterative_chunked;
      KMP_WARNING(DispatchManyThreads);
      schedule = team->t.t_sched.r_sched_type;
      monotonicity = __kmp_get_monotonicity(loc, schedule, use_hier);
      schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
        schedule == __kmp_static) {
      schedule = kmp_sch_static_balanced_chunked;
        chunk = team->t.t_sched.chunk * chunk;
      buff = __kmp_str_format(
          "__kmp_dispatch_init_algorithm: T#%%d new: schedule:%%d"
      KD_TRACE(10, (buff, gtid, schedule, chunk));
      __kmp_str_free(&buff);
    pr->u.p.parm1 = chunk;
              "unknown scheduling type");
  if (__kmp_env_consistency_check) {
      __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited,
                            (pr->flags.ordered ? ct_pdo_ordered : ct_pdo), loc);
    tc = (UT)(lb - ub) / (-st) + 1;
    tc = (UT)(ub - lb) / st + 1;
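  // Trip count: for a positive stride the loop runs (ub - lb) / st + 1 times.
  // Worked example: lb = 0, ub = 9, st = 2 gives tc = 9 / 2 + 1 = 5
  // iterations (0, 2, 4, 6, 8); the negative-stride branch mirrors this.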
#if KMP_STATS_ENABLED
  if (KMP_MASTER_GTID(gtid)) {
  pr->u.p.last_upper = ub + st;
  if (pr->flags.ordered) {
    pr->ordered_bumped = 0;
    pr->u.p.ordered_lower = 1;
    pr->u.p.ordered_upper = 0;
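  // Schedule-specific initialization follows. Each case precomputes the
  // per-thread parameters (pr->u.p.parm1..parm4 and the count/ub fields) that
  // __kmp_dispatch_next_algorithm later uses to carve out chunks.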
#if (KMP_STATIC_STEAL_ENABLED)
  case kmp_sch_static_steal: {
    ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_steal case\n",
    ntc = (tc % chunk ? 1 : 0) + tc / chunk;
    if (nproc > 1 && ntc >= nproc) {
      T small_chunk, extras;
      small_chunk = ntc / nproc;
      extras = ntc % nproc;
      init = id * small_chunk + (id < extras ? id : extras);
      pr->u.p.count = init;
      pr->u.p.ub = init + small_chunk + (id < extras ? 1 : 0);
      pr->u.p.parm3 = KMP_MIN(small_chunk + extras, nproc);
      pr->u.p.parm4 = (id + 1) % nproc;
      if (traits_t<T>::type_size > 4) {
        KMP_DEBUG_ASSERT(pr->u.p.th_steal_lock == NULL);
        pr->u.p.th_steal_lock =
            (kmp_lock_t *)__kmp_allocate(sizeof(kmp_lock_t));
        __kmp_init_lock(pr->u.p.th_steal_lock);
      schedule = kmp_sch_dynamic_chunked;
      KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d switching to "
                     "kmp_sch_dynamic_chunked\n",
      if (pr->u.p.parm1 <= 0)
        pr->u.p.parm1 = KMP_DEFAULT_CHUNK;
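      // Stealing bookkeeping: parm3 bounds how many victims a thread will
      // probe before giving up, parm4 names the next victim to try, and
      // 64-bit loops guard their (count, ub) pair with a per-thread
      // th_steal_lock. With fewer chunks than threads the code above falls
      // back to kmp_sch_dynamic_chunked.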
  case kmp_sch_static_balanced: {
    ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_balanced case\n",
        pr->u.p.parm1 = (id == tc - 1);
        pr->u.p.parm1 = FALSE;
        T small_chunk = tc / nproc;
        T extras = tc % nproc;
        init = id * small_chunk + (id < extras ? id : extras);
        limit = init + small_chunk - (id < extras ? 0 : 1);
        pr->u.p.parm1 = (id == nproc - 1);
        pr->u.p.parm1 = TRUE;
        pr->u.p.parm1 = FALSE;
    if (itt_need_metadata_reporting)
        *cur_chunk = limit - init + 1;
      pr->u.p.lb = lb + init;
      pr->u.p.ub = lb + limit;
      T ub_tmp = lb + limit * st;
      pr->u.p.lb = lb + init * st;
        pr->u.p.ub = (ub_tmp + st > ub ? ub : ub_tmp);
        pr->u.p.ub = (ub_tmp + st < ub ? ub : ub_tmp);
    if (pr->flags.ordered) {
      pr->u.p.ordered_lower = init;
      pr->u.p.ordered_upper = limit;
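    // Balanced split: each thread owns tc / nproc iterations, the first
    // tc % nproc threads take one extra, and parm1 records whether this
    // thread executes the last iteration (it feeds p_last in
    // __kmp_dispatch_next_algorithm).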
  case kmp_sch_static_balanced_chunked: {
    KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d runtime(simd:static)"
                   " -> falling-through to static_greedy\n",
    schedule = kmp_sch_static_greedy;
      pr->u.p.parm1 = ((tc + nth - 1) / nth + chunk - 1) & ~(chunk - 1);
  case kmp_sch_guided_iterative_chunked: {
    ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_guided_iterative_chunked"
    if ((2L * chunk + 1) * nproc >= tc) {
      schedule = kmp_sch_dynamic_chunked;
      pr->u.p.parm2 = guided_int_param * nproc * (chunk + 1);
      *(double *)&pr->u.p.parm3 = guided_flt_param / (double)nproc;
      KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to "
                     "kmp_sch_static_greedy\n",
      schedule = kmp_sch_static_greedy;
    ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n",
  case kmp_sch_guided_analytical_chunked: {
    KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d "
                   "kmp_sch_guided_analytical_chunked case\n",
    if ((2L * chunk + 1) * nproc >= tc) {
      schedule = kmp_sch_dynamic_chunked;
#if KMP_USE_X87CONTROL
      unsigned int oldFpcw = _control87(0, 0);
      _control87(_PC_64, _MCW_PC);
      long double target = ((long double)chunk * 2 + 1) * nproc / tc;
      x = 1.0 - 0.5 / (double)nproc;
      ptrdiff_t natural_alignment =
          (ptrdiff_t)&t.b - (ptrdiff_t)&t - (ptrdiff_t)1;
          (((ptrdiff_t)&pr->u.p.parm3) & (natural_alignment)) == 0);
      *(DBL *)&pr->u.p.parm3 = x;
        p = __kmp_pow<UT>(x, right);
      } while (p > target && right < (1 << 27));
      while (left + 1 < right) {
        mid = (left + right) / 2;
        if (__kmp_pow<UT>(x, mid) > target) {
      KMP_ASSERT(cross && __kmp_pow<UT>(x, cross - 1) > target &&
                 __kmp_pow<UT>(x, cross) <= target);
      pr->u.p.parm2 = cross;
#if ((KMP_OS_LINUX || KMP_OS_WINDOWS) && KMP_ARCH_X86) && (!defined(KMP_I8))
#define GUIDED_ANALYTICAL_WORKAROUND (*(DBL *)&pr->u.p.parm3)
#define GUIDED_ANALYTICAL_WORKAROUND (x)
      pr->u.p.count = tc - __kmp_dispatch_guided_remaining(
                               tc, GUIDED_ANALYTICAL_WORKAROUND, cross) -
#if KMP_USE_X87CONTROL
      _control87(oldFpcw, _MCW_PC);
      KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to "
                     "kmp_sch_static_greedy\n",
      schedule = kmp_sch_static_greedy;
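      // Analytical guided: with x = 1 - 1/(2*nproc), the probe-then-bisection
      // search above finds the smallest chunk index "cross" at which x^cross
      // falls to the target remaining fraction ((2*chunk+1)*nproc/tc); beyond
      // that point the dispatcher appears to switch to plain chunkspec-sized
      // pieces.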
  case kmp_sch_static_greedy:
    ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n",
    pr->u.p.parm1 = (nproc > 1) ? (tc + nproc - 1) / nproc : tc;
  case kmp_sch_static_chunked:
  case kmp_sch_dynamic_chunked:
    if (pr->u.p.parm1 <= 0) {
      pr->u.p.parm1 = KMP_DEFAULT_CHUNK;
    KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d "
                   "kmp_sch_static_chunked/kmp_sch_dynamic_chunked cases\n",
  case kmp_sch_trapezoidal: {
    T parm1, parm2, parm3, parm4;
    ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_trapezoidal case\n",
    parm2 = (tc / (2 * nproc));
    } else if (parm1 > parm2) {
      parm3 = (parm2 + parm1);
      parm3 = (2 * tc + parm3 - 1) / parm3;
      parm4 = (parm2 - parm1) / parm4;
    pr->u.p.parm1 = parm1;
    pr->u.p.parm2 = parm2;
    pr->u.p.parm3 = parm3;
    pr->u.p.parm4 = parm4;
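    // Trapezoid scheduling: chunk sizes shrink linearly from parm2 (the first,
    // largest chunk, roughly tc/(2*nproc)) down to parm1 (the smallest
    // allowed chunk); parm3 is the resulting number of chunks and parm4 the
    // per-chunk decrement, and the dispatch code later indexes into that
    // arithmetic series.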
    __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected),
                KMP_HNT(GetNewerLibrary),
  pr->schedule = schedule;
#if KMP_USE_HIER_SCHED
template <typename T>
inline void __kmp_dispatch_init_hier_runtime(ident_t *loc, T lb, T ub,
                                             typename traits_t<T>::signed_t st);
__kmp_dispatch_init_hier_runtime<kmp_int32>(ident_t *loc, kmp_int32 lb,
                                            kmp_int32 ub, kmp_int32 st) {
  __kmp_dispatch_init_hierarchy<kmp_int32>(
      loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
      __kmp_hier_scheds.scheds, __kmp_hier_scheds.small_chunks, lb, ub, st);
__kmp_dispatch_init_hier_runtime<kmp_uint32>(ident_t *loc, kmp_uint32 lb,
                                             kmp_uint32 ub, kmp_int32 st) {
  __kmp_dispatch_init_hierarchy<kmp_uint32>(
      loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
      __kmp_hier_scheds.scheds, __kmp_hier_scheds.small_chunks, lb, ub, st);
__kmp_dispatch_init_hier_runtime<kmp_int64>(ident_t *loc, kmp_int64 lb,
                                            kmp_int64 ub, kmp_int64 st) {
  __kmp_dispatch_init_hierarchy<kmp_int64>(
      loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
      __kmp_hier_scheds.scheds, __kmp_hier_scheds.large_chunks, lb, ub, st);
__kmp_dispatch_init_hier_runtime<kmp_uint64>(ident_t *loc, kmp_uint64 lb,
                                             kmp_uint64 ub, kmp_int64 st) {
  __kmp_dispatch_init_hierarchy<kmp_uint64>(
      loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
      __kmp_hier_scheds.scheds, __kmp_hier_scheds.large_chunks, lb, ub, st);
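// The 32-bit specializations pass __kmp_hier_scheds.small_chunks to the
// hierarchy initializer while the 64-bit ones pass large_chunks, presumably
// so that chunk sizes are stored at the width matching the loop's iteration
// type.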
void __kmp_dispatch_free_hierarchies(kmp_team_t *team) {
  int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
  for (int i = 0; i < num_disp_buff; ++i) {
        reinterpret_cast<dispatch_shared_info_template<kmp_int32> volatile *>(
            &team->t.t_disp_buffer[i]);
      sh->hier->deallocate();
      __kmp_free(sh->hier);
template <typename T>
                            T ub, typename traits_t<T>::signed_t st,
                            typename traits_t<T>::signed_t chunk, int push_ws) {
  typedef typename traits_t<T>::unsigned_t UT;
  kmp_uint32 my_buffer_index;
  dispatch_private_info_template<T> *pr;
  dispatch_shared_info_template<T> volatile *sh;
  KMP_BUILD_ASSERT(sizeof(dispatch_private_info_template<T>) ==
                   sizeof(dispatch_private_info));
  KMP_BUILD_ASSERT(sizeof(dispatch_shared_info_template<UT>) ==
                   sizeof(dispatch_shared_info));
  __kmp_assert_valid_gtid(gtid);
  if (!TCR_4(__kmp_init_parallel))
    __kmp_parallel_initialize();
  __kmp_resume_if_soft_paused();
#if INCLUDE_SSC_MARKS
  SSC_MARK_DISPATCH_INIT();
    typedef typename traits_t<T>::signed_t ST;
    buff = __kmp_str_format("__kmp_dispatch_init: T#%%d called: schedule:%%d "
                            "chunk:%%%s lb:%%%s ub:%%%s st:%%%s\n",
                            traits_t<ST>::spec, traits_t<T>::spec,
                            traits_t<T>::spec, traits_t<ST>::spec);
    KD_TRACE(10, (buff, gtid, schedule, chunk, lb, ub, st));
    __kmp_str_free(&buff);
  th = __kmp_threads[gtid];
  team = th->th.th_team;
  active = !team->t.t_serialized;
  th->th.th_ident = loc;
  if (schedule == __kmp_static) {
#if KMP_USE_HIER_SCHED
  my_buffer_index = th->th.th_dispatch->th_disp_index;
  pr = reinterpret_cast<dispatch_private_info_template<T> *>(
      ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
  my_sched = SCHEDULE_WITHOUT_MODIFIERS(my_sched);
  if (pr->flags.use_hier) {
    KD_TRACE(100, ("__kmp_dispatch_init: T#%d ordered loop detected. "
                   "Disabling hierarchical scheduling.\n",
    pr->flags.use_hier = FALSE;
  if (schedule == kmp_sch_runtime && __kmp_hier_scheds.size > 0) {
    if (!ordered && !pr->flags.use_hier)
      __kmp_dispatch_init_hier_runtime<T>(loc, lb, ub, st);
  kmp_uint64 cur_chunk = chunk;
  int itt_need_metadata_reporting =
      __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
      KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
      team->t.t_active_level == 1;
    pr = reinterpret_cast<dispatch_private_info_template<T> *>(
        th->th.th_dispatch->th_disp_buffer);
    KMP_DEBUG_ASSERT(th->th.th_dispatch ==
                     &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
    my_buffer_index = th->th.th_dispatch->th_disp_index++;
    pr = reinterpret_cast<dispatch_private_info_template<T> *>(
        ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
    sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>(
        &team->t.t_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
    KD_TRACE(10, ("__kmp_dispatch_init: T#%d my_buffer_index:%d\n", gtid,
  __kmp_dispatch_init_algorithm(loc, gtid, pr, schedule, lb, ub, st,
                                chunk, (T)th->th.th_team_nproc,
                                (T)th->th.th_info.ds.ds_tid);
  if (pr->flags.ordered == 0) {
    th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo_error;
    th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo_error;
    th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo<UT>;
    th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo<UT>;
    KD_TRACE(100, ("__kmp_dispatch_init: T#%d before wait: my_buffer_index:%d "
                   "sh->buffer_index:%d\n",
                   gtid, my_buffer_index, sh->buffer_index));
    __kmp_wait<kmp_uint32>(&sh->buffer_index, my_buffer_index,
                           __kmp_eq<kmp_uint32> USE_ITT_BUILD_ARG(NULL));
    KD_TRACE(100, ("__kmp_dispatch_init: T#%d after wait: my_buffer_index:%d "
                   "sh->buffer_index:%d\n",
                   gtid, my_buffer_index, sh->buffer_index));
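    // Dispatch buffers are recycled round-robin (my_buffer_index modulo
    // __kmp_dispatch_num_buffers), so before touching the shared buffer the
    // thread waits until sh->buffer_index has caught up with its own index,
    // i.e. until the previous loop that used this slot has been retired.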
    th->th.th_dispatch->th_dispatch_pr_current = (dispatch_private_info_t *)pr;
    th->th.th_dispatch->th_dispatch_sh_current =
        CCAST(dispatch_shared_info_t *, (volatile dispatch_shared_info_t *)sh);
    if (pr->flags.ordered) {
      __kmp_itt_ordered_init(gtid);
    if (itt_need_metadata_reporting) {
      kmp_uint64 schedtype = 0;
      case kmp_sch_static_chunked:
      case kmp_sch_static_balanced:
      case kmp_sch_static_greedy:
        cur_chunk = pr->u.p.parm1;
      case kmp_sch_dynamic_chunked:
      case kmp_sch_guided_iterative_chunked:
      case kmp_sch_guided_analytical_chunked:
      __kmp_itt_metadata_loop(loc, schedtype, pr->u.p.tc, cur_chunk);
#if KMP_USE_HIER_SCHED
    if (pr->flags.use_hier) {
      pr->u.p.ub = pr->u.p.lb = pr->u.p.st = pr->u.p.tc = 0;
    buff = __kmp_str_format(
        "__kmp_dispatch_init: T#%%d returning: schedule:%%d ordered:%%%s "
        " st:%%%s tc:%%%s count:%%%s\n\tordered_lower:%%%s ordered_upper:%%%s"
        " parm1:%%%s parm2:%%%s parm3:%%%s parm4:%%%s\n",
        traits_t<UT>::spec, traits_t<T>::spec, traits_t<T>::spec,
        traits_t<ST>::spec, traits_t<UT>::spec, traits_t<UT>::spec,
        traits_t<UT>::spec, traits_t<UT>::spec, traits_t<T>::spec,
        traits_t<T>::spec, traits_t<T>::spec, traits_t<T>::spec);
    KD_TRACE(10, (buff, gtid, pr->schedule, pr->flags.ordered, pr->u.p.lb,
                  pr->u.p.ub, pr->u.p.st, pr->u.p.tc, pr->u.p.count,
                  pr->u.p.ordered_lower, pr->u.p.ordered_upper, pr->u.p.parm1,
                  pr->u.p.parm2, pr->u.p.parm3, pr->u.p.parm4));
    __kmp_str_free(&buff);
#if (KMP_STATIC_STEAL_ENABLED)
  if (pr->schedule == kmp_sch_static_steal) {
    volatile T *p = &pr->u.p.static_steal_counter;
#if OMPT_SUPPORT && OMPT_OPTIONAL
  if (ompt_enabled.ompt_callback_work) {
    ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
    ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
    ompt_callbacks.ompt_callback(ompt_callback_work)(
        ompt_work_loop, ompt_scope_begin, &(team_info->parallel_data),
        &(task_info->task_data), pr->u.p.tc, OMPT_LOAD_RETURN_ADDRESS(gtid));
  KMP_PUSH_PARTITIONED_TIMER(OMP_loop_dynamic);
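// Ordered-loop bookkeeping: __kmp_dispatch_finish waits until the shared
// ordered_iteration counter reaches this chunk's ordered_lower bound and then
// bumps the counter so the next thread in iteration order may proceed.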
template <typename UT>
static void __kmp_dispatch_finish(int gtid, ident_t *loc) {
  typedef typename traits_t<UT>::signed_t ST;
  __kmp_assert_valid_gtid(gtid);
  kmp_info_t *th = __kmp_threads[gtid];
  KD_TRACE(100, ("__kmp_dispatch_finish: T#%d called\n", gtid));
  if (!th->th.th_team->t.t_serialized) {
    dispatch_private_info_template<UT> *pr =
        reinterpret_cast<dispatch_private_info_template<UT> *>(
            th->th.th_dispatch->th_dispatch_pr_current);
    dispatch_shared_info_template<UT> volatile *sh =
        reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
            th->th.th_dispatch->th_dispatch_sh_current);
    KMP_DEBUG_ASSERT(pr);
    KMP_DEBUG_ASSERT(sh);
    KMP_DEBUG_ASSERT(th->th.th_dispatch ==
                     &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
    if (pr->ordered_bumped) {
      ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
      pr->ordered_bumped = 0;
      UT lower = pr->u.p.ordered_lower;
        buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d before wait: "
                                "ordered_iteration:%%%s lower:%%%s\n",
                                traits_t<UT>::spec, traits_t<UT>::spec);
        KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
        __kmp_str_free(&buff);
      __kmp_wait<UT>(&sh->u.s.ordered_iteration, lower,
                     __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
        buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d after wait: "
                                "ordered_iteration:%%%s lower:%%%s\n",
                                traits_t<UT>::spec, traits_t<UT>::spec);
        KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
        __kmp_str_free(&buff);
      test_then_inc<ST>((volatile ST *)&sh->u.s.ordered_iteration);
  KD_TRACE(100, ("__kmp_dispatch_finish: T#%d returned\n", gtid));
#ifdef KMP_GOMP_COMPAT
template <typename UT>
static void __kmp_dispatch_finish_chunk(int gtid, ident_t *loc) {
  typedef typename traits_t<UT>::signed_t ST;
  __kmp_assert_valid_gtid(gtid);
  kmp_info_t *th = __kmp_threads[gtid];
  KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d called\n", gtid));
  if (!th->th.th_team->t.t_serialized) {
    dispatch_private_info_template<UT> *pr =
        reinterpret_cast<dispatch_private_info_template<UT> *>(
            th->th.th_dispatch->th_dispatch_pr_current);
    dispatch_shared_info_template<UT> volatile *sh =
        reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
            th->th.th_dispatch->th_dispatch_sh_current);
    KMP_DEBUG_ASSERT(pr);
    KMP_DEBUG_ASSERT(sh);
    KMP_DEBUG_ASSERT(th->th.th_dispatch ==
                     &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
    UT lower = pr->u.p.ordered_lower;
    UT upper = pr->u.p.ordered_upper;
    UT inc = upper - lower + 1;
    if (pr->ordered_bumped == inc) {
      ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
      pr->ordered_bumped = 0;
      inc -= pr->ordered_bumped;
        buff = __kmp_str_format(
            "__kmp_dispatch_finish_chunk: T#%%d before wait: "
            "ordered_iteration:%%%s lower:%%%s upper:%%%s\n",
            traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec);
        KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower, upper));
        __kmp_str_free(&buff);
      __kmp_wait<UT>(&sh->u.s.ordered_iteration, lower,
                     __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
      KD_TRACE(1000, ("__kmp_dispatch_finish_chunk: T#%d resetting "
                      "ordered_bumped to zero\n",
      pr->ordered_bumped = 0;
        buff = __kmp_str_format(
            "__kmp_dispatch_finish_chunk: T#%%d after wait: "
            "ordered_iteration:%%%s inc:%%%s lower:%%%s upper:%%%s\n",
            traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec,
            traits_t<UT>::spec);
            (buff, gtid, sh->u.s.ordered_iteration, inc, lower, upper));
        __kmp_str_free(&buff);
      test_then_add<ST>((volatile ST *)&sh->u.s.ordered_iteration, inc);
  KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d returned\n", gtid));
template <typename T>
int __kmp_dispatch_next_algorithm(int gtid,
                                  dispatch_private_info_template<T> *pr,
                                  dispatch_shared_info_template<T> volatile *sh,
                                  kmp_int32 *p_last, T *p_lb, T *p_ub,
                                  typename traits_t<T>::signed_t *p_st, T nproc,
  typedef typename traits_t<T>::unsigned_t UT;
  typedef typename traits_t<T>::signed_t ST;
  typedef typename traits_t<T>::floating_t DBL;
  UT limit, trip, init;
  kmp_info_t *th = __kmp_threads[gtid];
  kmp_team_t *team = th->th.th_team;
  KMP_DEBUG_ASSERT(th->th.th_dispatch ==
                   &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
  KMP_DEBUG_ASSERT(pr);
  KMP_DEBUG_ASSERT(sh);
  KMP_DEBUG_ASSERT(tid >= 0 && tid < nproc);
        __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d called pr:%%p "
                         "sh:%%p nproc:%%%s tid:%%%s\n",
                         traits_t<T>::spec, traits_t<T>::spec);
    KD_TRACE(10, (buff, gtid, pr, sh, nproc, tid));
    __kmp_str_free(&buff);
  if (pr->u.p.tc == 0) {
    ("__kmp_dispatch_next_algorithm: T#%d early exit trip count is "
  switch (pr->schedule) {
#if (KMP_STATIC_STEAL_ENABLED)
  case kmp_sch_static_steal: {
    T chunk = pr->u.p.parm1;
    ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_steal case\n",
    trip = pr->u.p.tc - 1;
    if (traits_t<T>::type_size > 4) {
      kmp_lock_t *lck = pr->u.p.th_steal_lock;
      KMP_DEBUG_ASSERT(lck != NULL);
      if (pr->u.p.count < (UT)pr->u.p.ub) {
        __kmp_acquire_lock(lck, gtid);
        init = (pr->u.p.count)++;
        status = (init < (UT)pr->u.p.ub);
        __kmp_release_lock(lck, gtid);
        kmp_info_t **other_threads = team->t.t_threads;
        T while_limit = pr->u.p.parm3;
        T id = pr->u.p.static_steal_counter;
        int idx = (th->th.th_dispatch->th_disp_index - 1) %
                  __kmp_dispatch_num_buffers;
        while ((!status) && (while_limit != ++while_index)) {
          dispatch_private_info_template<T> *victim;
          T victimIdx = pr->u.p.parm4;
          T oldVictimIdx = victimIdx ? victimIdx - 1 : nproc - 1;
          victim = reinterpret_cast<dispatch_private_info_template<T> *>(
              &other_threads[victimIdx]->th.th_dispatch->th_disp_buffer[idx]);
          KMP_DEBUG_ASSERT(victim);
          while ((victim == pr || id != victim->u.p.static_steal_counter) &&
                 oldVictimIdx != victimIdx) {
            victimIdx = (victimIdx + 1) % nproc;
            victim = reinterpret_cast<dispatch_private_info_template<T> *>(
                &other_threads[victimIdx]->th.th_dispatch->th_disp_buffer[idx]);
            KMP_DEBUG_ASSERT(victim);
          if (victim == pr || id != victim->u.p.static_steal_counter) {
          if (victim->u.p.count + 2 > (UT)victim->u.p.ub) {
            pr->u.p.parm4 = (victimIdx + 1) % nproc;
          lck = victim->u.p.th_steal_lock;
          KMP_ASSERT(lck != NULL);
          __kmp_acquire_lock(lck, gtid);
          limit = victim->u.p.ub;
          if (victim->u.p.count >= limit ||
              (remaining = limit - victim->u.p.count) < 2) {
            __kmp_release_lock(lck, gtid);
            pr->u.p.parm4 = (victimIdx + 1) % nproc;
          if (remaining > 3) {
            KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, remaining >> 2);
            init = (victim->u.p.ub -= (remaining >> 2));
            KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, 1);
            init = (victim->u.p.ub -= 1);
          __kmp_release_lock(lck, gtid);
          KMP_DEBUG_ASSERT(init + 1 <= limit);
          pr->u.p.parm4 = victimIdx;
          __kmp_acquire_lock(pr->u.p.th_steal_lock, gtid);
          pr->u.p.count = init + 1;
          __kmp_release_lock(pr->u.p.th_steal_lock, gtid);
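      // 32-bit path below: the (count, ub) pair fits into a single 64-bit
      // word (union_i4), so claiming local work and stealing from a victim
      // are done with KMP_COMPARE_AND_STORE_ACQ64 rather than with the
      // per-thread th_steal_lock used by the wider types above; a successful
      // steal takes roughly a quarter of the victim's remaining chunks.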
      union_i4 vold, vnew;
      vold.b = *(volatile kmp_int64 *)(&pr->u.p.count);
      while (!KMP_COMPARE_AND_STORE_ACQ64(
          (volatile kmp_int64 *)&pr->u.p.count,
          *VOLATILE_CAST(kmp_int64 *) & vold.b,
          *VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
        vold.b = *(volatile kmp_int64 *)(&pr->u.p.count);
      init = vnew.p.count;
      status = (init < (UT)vnew.p.ub);
        kmp_info_t **other_threads = team->t.t_threads;
        T while_limit = pr->u.p.parm3;
        T id = pr->u.p.static_steal_counter;
        int idx = (th->th.th_dispatch->th_disp_index - 1) %
                  __kmp_dispatch_num_buffers;
        while ((!status) && (while_limit != ++while_index)) {
          dispatch_private_info_template<T> *victim;
          union_i4 vold, vnew;
          T victimIdx = pr->u.p.parm4;
          T oldVictimIdx = victimIdx ? victimIdx - 1 : nproc - 1;
          victim = reinterpret_cast<dispatch_private_info_template<T> *>(
              &other_threads[victimIdx]->th.th_dispatch->th_disp_buffer[idx]);
          KMP_DEBUG_ASSERT(victim);
          while ((victim == pr || id != victim->u.p.static_steal_counter) &&
                 oldVictimIdx != victimIdx) {
            victimIdx = (victimIdx + 1) % nproc;
            victim = reinterpret_cast<dispatch_private_info_template<T> *>(
                &other_threads[victimIdx]->th.th_dispatch->th_disp_buffer[idx]);
            KMP_DEBUG_ASSERT(victim);
          if (victim == pr || id != victim->u.p.static_steal_counter) {
          pr->u.p.parm4 = victimIdx;
          vold.b = *(volatile kmp_int64 *)(&victim->u.p.count);
          KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip);
          if (vnew.p.count >= (UT)vnew.p.ub ||
              (remaining = vnew.p.ub - vnew.p.count) < 2) {
            pr->u.p.parm4 = (victimIdx + 1) % nproc;
          if (remaining > 3) {
            vnew.p.ub -= remaining >> 2;
          KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip);
          if (KMP_COMPARE_AND_STORE_ACQ64(
                  (volatile kmp_int64 *)&victim->u.p.count,
                  *VOLATILE_CAST(kmp_int64 *) & vold.b,
                  *VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
            KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen,
                                      vold.p.ub - vnew.p.ub);
            vold.p.count = init + 1;
            KMP_XCHG_FIXED64((volatile kmp_int64 *)(&pr->u.p.count), vold.b);
            *(volatile kmp_int64 *)(&pr->u.p.count) = vold.b;
    start = pr->u.p.parm2;
    limit = chunk + init - 1;
    KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_chunks, 1);
    KMP_DEBUG_ASSERT(init <= trip);
    if ((last = (limit >= trip)) != 0)
      *p_lb = start + init;
      *p_ub = start + limit;
      *p_lb = start + init * incr;
      *p_ub = start + limit * incr;
    if (pr->flags.ordered) {
      pr->u.p.ordered_lower = init;
      pr->u.p.ordered_upper = limit;
  case kmp_sch_static_balanced: {
    ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_balanced case\n",
    if ((status = !pr->u.p.count) != 0) {
      last = (pr->u.p.parm1 != 0);
      pr->u.p.lb = pr->u.p.ub + pr->u.p.st;
  case kmp_sch_static_greedy:
  case kmp_sch_static_chunked: {
    KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d "
                   "kmp_sch_static_[affinity|chunked] case\n",
    parm1 = pr->u.p.parm1;
    trip = pr->u.p.tc - 1;
    init = parm1 * (pr->u.p.count + tid);
    if ((status = (init <= trip)) != 0) {
      limit = parm1 + init - 1;
      if ((last = (limit >= trip)) != 0)
      pr->u.p.count += nproc;
        *p_lb = start + init;
        *p_ub = start + limit;
        *p_lb = start + init * incr;
        *p_ub = start + limit * incr;
      if (pr->flags.ordered) {
        pr->u.p.ordered_lower = init;
        pr->u.p.ordered_upper = limit;
  case kmp_sch_dynamic_chunked: {
    T chunk = pr->u.p.parm1;
    ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_dynamic_chunked case\n",
    init = chunk * test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration);
    trip = pr->u.p.tc - 1;
    if ((status = (init <= trip)) == 0) {
      limit = chunk + init - 1;
      if ((last = (limit >= trip)) != 0)
        *p_lb = start + init;
        *p_ub = start + limit;
        *p_lb = start + init * incr;
        *p_ub = start + limit * incr;
      if (pr->flags.ordered) {
        pr->u.p.ordered_lower = init;
        pr->u.p.ordered_upper = limit;
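    // Dynamic chunking in one line: test_then_inc_acq atomically grabs the
    // next chunk index from the shared iteration counter, and the chunk's
    // bounds fall out as [init, init + chunk - 1], scaled by the stride.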
  case kmp_sch_guided_iterative_chunked: {
    T chunkspec = pr->u.p.parm1;
    KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_chunked "
      init = sh->u.s.iteration;
      remaining = trip - init;
      if (remaining <= 0) {
        init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
        remaining = trip - init;
        if (remaining <= 0) {
        if ((T)remaining > chunkspec) {
          limit = init + chunkspec - 1;
          limit = init + remaining - 1;
      limit = init + (UT)((double)remaining * *(double *)&pr->u.p.parm3);
      if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
                               (ST)init, (ST)limit)) {
    *p_lb = start + init * incr;
    *p_ub = start + limit * incr;
    if (pr->flags.ordered) {
      pr->u.p.ordered_lower = init;
      pr->u.p.ordered_upper = limit;
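    // Guided (iterative): each thread proposes taking a fixed fraction of the
    // remaining iterations (the double stored in parm3) and publishes the new
    // shared iteration value with compare_and_swap; when few iterations
    // remain it degrades to plain chunkspec-sized pieces.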
    T chunk = pr->u.p.parm1;
    ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_simd case\n",
      init = sh->u.s.iteration;
      remaining = trip - init;
      if (remaining <= 0) {
      KMP_DEBUG_ASSERT(init % chunk == 0);
      if ((T)remaining < pr->u.p.parm2) {
        init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
        remaining = trip - init;
        if (remaining <= 0) {
        if ((T)remaining > chunk) {
          limit = init + chunk - 1;
          limit = init + remaining - 1;
      __kmp_type_convert((double)remaining * (*(double *)&pr->u.p.parm3),
      UT rem = span % chunk;
        span += chunk - rem;
      limit = init + span;
      if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
                               (ST)init, (ST)limit)) {
    *p_lb = start + init * incr;
    *p_ub = start + limit * incr;
    if (pr->flags.ordered) {
      pr->u.p.ordered_lower = init;
      pr->u.p.ordered_upper = limit;
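    // Same guided scheme as above, but the span is rounded up to a multiple
    // of the chunk so every piece keeps the chunk (SIMD-width) alignment the
    // kmp_sch_guided_simd case was asked for.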
  case kmp_sch_guided_analytical_chunked: {
    T chunkspec = pr->u.p.parm1;
#if KMP_USE_X87CONTROL
    unsigned int oldFpcw;
    unsigned int fpcwSet = 0;
    KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d "
                   "kmp_sch_guided_analytical_chunked case\n",
    KMP_DEBUG_ASSERT(nproc > 1);
    KMP_DEBUG_ASSERT((2UL * chunkspec + 1) * (UT)nproc < trip);
      chunkIdx = test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration);
      if (chunkIdx >= (UT)pr->u.p.parm2) {
        init = chunkIdx * chunkspec + pr->u.p.count;
        if ((status = (init > 0 && init <= trip)) != 0) {
          limit = init + chunkspec - 1;
          if ((last = (limit >= trip)) != 0)
#if KMP_USE_X87CONTROL
        oldFpcw = _control87(0, 0);
        _control87(_PC_64, _MCW_PC);
        init = __kmp_dispatch_guided_remaining<T>(
            trip, *(DBL *)&pr->u.p.parm3, chunkIdx);
        KMP_DEBUG_ASSERT(init);
        limit = trip - __kmp_dispatch_guided_remaining<T>(
                           trip, *(DBL *)&pr->u.p.parm3, chunkIdx + 1);
        KMP_ASSERT(init <= limit);
        KMP_DEBUG_ASSERT(limit <= trip);
#if KMP_USE_X87CONTROL
    if (fpcwSet && (oldFpcw & fpcwSet))
      _control87(oldFpcw, _MCW_PC);
    *p_lb = start + init * incr;
    *p_ub = start + limit * incr;
    if (pr->flags.ordered) {
      pr->u.p.ordered_lower = init;
      pr->u.p.ordered_upper = limit;
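    // On KMP_USE_X87CONTROL builds the FPU control word is switched to 64-bit
    // precision (_PC_64) around the __kmp_dispatch_guided_remaining
    // computation and restored afterwards, presumably so every thread
    // evaluates the chunk boundaries with the same floating-point precision.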
  case kmp_sch_trapezoidal: {
    T parm2 = pr->u.p.parm2;
    T parm3 = pr->u.p.parm3;
    T parm4 = pr->u.p.parm4;
    ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_trapezoidal case\n",
    index = test_then_inc<ST>((volatile ST *)&sh->u.s.iteration);
    init = (index * ((2 * parm2) - (index - 1) * parm4)) / 2;
    trip = pr->u.p.tc - 1;
    if ((status = ((T)index < parm3 && init <= trip)) == 0) {
      limit = ((index + 1) * (2 * parm2 - index * parm4)) / 2 - 1;
      if ((last = (limit >= trip)) != 0)
        *p_lb = start + init;
        *p_ub = start + limit;
        *p_lb = start + init * incr;
        *p_ub = start + limit * incr;
      if (pr->flags.ordered) {
        pr->u.p.ordered_lower = init;
        pr->u.p.ordered_upper = limit;
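      // The trapezoid formulas are prefix sums of the arithmetic series of
      // chunk sizes: chunk i has size parm2 - i*parm4, so the offset of chunk
      // "index" works out to index*(2*parm2 - (index-1)*parm4)/2.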
    __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected),
                KMP_HNT(GetNewerLibrary),
  if (pr->flags.ordered) {
      buff = __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d "
                              "ordered_lower:%%%s ordered_upper:%%%s\n",
                              traits_t<UT>::spec, traits_t<UT>::spec);
      KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper));
      __kmp_str_free(&buff);
    buff = __kmp_str_format(
        "__kmp_dispatch_next_algorithm: T#%%d exit status:%%d p_last:%%d "
        "p_lb:%%%s p_ub:%%%s p_st:%%%s\n",
        traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
    KD_TRACE(10, (buff, gtid, status, *p_last, *p_lb, *p_ub, *p_st));
    __kmp_str_free(&buff);
#if OMPT_SUPPORT && OMPT_OPTIONAL
#define OMPT_LOOP_END                                                          \
  if (status == 0) {                                                           \
    if (ompt_enabled.ompt_callback_work) {                                     \
      ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);              \
      ompt_task_info_t *task_info = __ompt_get_task_info_object(0);            \
      ompt_callbacks.ompt_callback(ompt_callback_work)(                        \
          ompt_work_loop, ompt_scope_end, &(team_info->parallel_data),         \
          &(task_info->task_data), 0, codeptr);                                \
#define OMPT_LOOP_END
#if KMP_STATS_ENABLED
#define KMP_STATS_LOOP_END                                                     \
    kmp_int64 u, l, t, i;                                                      \
    l = (kmp_int64)(*p_lb);                                                    \
    u = (kmp_int64)(*p_ub);                                                    \
    i = (kmp_int64)(pr->u.p.st);                                               \
    if (status == 0) {                                                         \
      KMP_POP_PARTITIONED_TIMER();                                             \
    } else if (i == 1) {                                                       \
    } else if (i < 0) {                                                        \
      t = (l - u) / (-i) + 1;                                                  \
      t = (u - l) / i + 1;                                                     \
    KMP_COUNT_VALUE(OMP_loop_dynamic_iterations, t);                           \
#define KMP_STATS_LOOP_END
template <typename T>
static int __kmp_dispatch_next(ident_t *loc, int gtid, kmp_int32 *p_last,
                               typename traits_t<T>::signed_t *p_st
#if OMPT_SUPPORT && OMPT_OPTIONAL
  typedef typename traits_t<T>::unsigned_t UT;
  typedef typename traits_t<T>::signed_t ST;
  KMP_TIME_PARTITIONED_BLOCK(OMP_loop_dynamic_scheduling);
  dispatch_private_info_template<T> *pr;
  __kmp_assert_valid_gtid(gtid);
  kmp_info_t *th = __kmp_threads[gtid];
  kmp_team_t *team = th->th.th_team;
  KMP_DEBUG_ASSERT(p_lb && p_ub && p_st);
      ("__kmp_dispatch_next: T#%d called p_lb:%p p_ub:%p p_st:%p p_last: %p\n",
       gtid, p_lb, p_ub, p_st, p_last));
  if (team->t.t_serialized) {
    pr = reinterpret_cast<dispatch_private_info_template<T> *>(
        th->th.th_dispatch->th_disp_buffer);
    KMP_DEBUG_ASSERT(pr);
    if ((status = (pr->u.p.tc != 0)) == 0) {
      if (__kmp_env_consistency_check) {
        if (pr->pushed_ws != ct_none) {
          pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
    } else if (pr->flags.nomerge) {
      UT limit, trip, init;
      T chunk = pr->u.p.parm1;
      KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n",
      init = chunk * pr->u.p.count++;
      trip = pr->u.p.tc - 1;
      if ((status = (init <= trip)) == 0) {
        if (__kmp_env_consistency_check) {
          if (pr->pushed_ws != ct_none) {
            pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
        limit = chunk + init - 1;
        if ((last = (limit >= trip)) != 0) {
          pr->u.p.last_upper = pr->u.p.ub;
          *p_lb = start + init;
          *p_ub = start + limit;
          *p_lb = start + init * incr;
          *p_ub = start + limit * incr;
        if (pr->flags.ordered) {
          pr->u.p.ordered_lower = init;
          pr->u.p.ordered_upper = limit;
            buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
                                    "ordered_lower:%%%s ordered_upper:%%%s\n",
                                    traits_t<UT>::spec, traits_t<UT>::spec);
            KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
                            pr->u.p.ordered_upper));
            __kmp_str_free(&buff);
      pr->u.p.last_upper = *p_ub;
      buff = __kmp_str_format(
          "__kmp_dispatch_next: T#%%d serialized case: p_lb:%%%s "
          "p_ub:%%%s p_st:%%%s p_last:%%p %%d returning:%%d\n",
          traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
      KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, *p_st, p_last,
                    (p_last ? *p_last : 0), status));
      __kmp_str_free(&buff);
#if INCLUDE_SSC_MARKS
    SSC_MARK_DISPATCH_NEXT();
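  // Non-serialized path: the thread consults its current private/shared
  // dispatch records and delegates chunk selection to
  // __kmp_dispatch_next_algorithm (or to the hierarchical scheduler).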
    dispatch_shared_info_template<T> volatile *sh;
    KMP_DEBUG_ASSERT(th->th.th_dispatch ==
                     &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
    pr = reinterpret_cast<dispatch_private_info_template<T> *>(
        th->th.th_dispatch->th_dispatch_pr_current);
    KMP_DEBUG_ASSERT(pr);
    sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>(
        th->th.th_dispatch->th_dispatch_sh_current);
    KMP_DEBUG_ASSERT(sh);
#if KMP_USE_HIER_SCHED
    if (pr->flags.use_hier)
      status = sh->hier->next(loc, gtid, pr, &last, p_lb, p_ub, p_st);
      status = __kmp_dispatch_next_algorithm<T>(gtid, pr, sh, &last, p_lb, p_ub,
                                                p_st, th->th.th_team_nproc,
                                                th->th.th_info.ds.ds_tid);
      num_done = test_then_inc<ST>((volatile ST *)&sh->u.s.num_done);
        buff = __kmp_str_format(
            "__kmp_dispatch_next: T#%%d increment num_done:%%%s\n",
            traits_t<UT>::spec);
        KD_TRACE(10, (buff, gtid, sh->u.s.num_done));
        __kmp_str_free(&buff);
#if KMP_USE_HIER_SCHED
      pr->flags.use_hier = FALSE;
      if ((ST)num_done == th->th.th_team_nproc - 1) {
#if (KMP_STATIC_STEAL_ENABLED)
        if (pr->schedule == kmp_sch_static_steal &&
            traits_t<T>::type_size > 4) {
          int idx = (th->th.th_dispatch->th_disp_index - 1) %
                    __kmp_dispatch_num_buffers;
          kmp_info_t **other_threads = team->t.t_threads;
          for (i = 0; i < th->th.th_team_nproc; ++i) {
            dispatch_private_info_template<T> *buf =
                reinterpret_cast<dispatch_private_info_template<T> *>(
                    &other_threads[i]->th.th_dispatch->th_disp_buffer[idx]);
            kmp_lock_t *lck = buf->u.p.th_steal_lock;
            KMP_ASSERT(lck != NULL);
            __kmp_destroy_lock(lck);
            buf->u.p.th_steal_lock = NULL;
        sh->u.s.num_done = 0;
        sh->u.s.iteration = 0;
        if (pr->flags.ordered) {
          sh->u.s.ordered_iteration = 0;
        sh->buffer_index += __kmp_dispatch_num_buffers;
        KD_TRACE(100, ("__kmp_dispatch_next: T#%d change buffer_index:%d\n",
                       gtid, sh->buffer_index));
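        // The last thread to drain the loop (num_done == nproc - 1) tears the
        // shared state down: any steal locks are destroyed, the shared
        // counters are zeroed, and buffer_index is advanced by
        // __kmp_dispatch_num_buffers so this slot can serve a later loop.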
      if (__kmp_env_consistency_check) {
        if (pr->pushed_ws != ct_none) {
          pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
      th->th.th_dispatch->th_deo_fcn = NULL;
      th->th.th_dispatch->th_dxo_fcn = NULL;
      th->th.th_dispatch->th_dispatch_sh_current = NULL;
      th->th.th_dispatch->th_dispatch_pr_current = NULL;
      pr->u.p.last_upper = pr->u.p.ub;
    if (p_last != NULL && status != 0)
      buff = __kmp_str_format(
          "__kmp_dispatch_next: T#%%d normal case: "
          "p_lb:%%%s p_ub:%%%s p_st:%%%s p_last:%%p (%%d) returning:%%d\n",
          traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
      KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last,
                    (p_last ? *p_last : 0), status));
      __kmp_str_free(&buff);
#if INCLUDE_SSC_MARKS
  SSC_MARK_DISPATCH_NEXT();
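// __kmp_dist_get_bounds computes this team's slice of a distribute loop:
// given the global bounds it narrows *plower/*pupper to the portion owned by
// the calling team (team_id of nteams) and sets *plastiter for the team that
// owns the last iteration.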
template <typename T>
static void __kmp_dist_get_bounds(ident_t *loc, kmp_int32 gtid,
                                  kmp_int32 *plastiter, T *plower, T *pupper,
                                  typename traits_t<T>::signed_t incr) {
  typedef typename traits_t<T>::unsigned_t UT;
  KMP_DEBUG_ASSERT(plastiter && plower && pupper);
  KE_TRACE(10, ("__kmpc_dist_get_bounds called (%d)\n", gtid));
    typedef typename traits_t<T>::signed_t ST;
    buff = __kmp_str_format("__kmpc_dist_get_bounds: T#%%d liter=%%d "
                            "iter=(%%%s, %%%s, %%%s) signed?<%s>\n",
                            traits_t<T>::spec, traits_t<T>::spec,
                            traits_t<ST>::spec, traits_t<T>::spec);
    KD_TRACE(100, (buff, gtid, *plastiter, *plower, *pupper, incr));
    __kmp_str_free(&buff);
  if (__kmp_env_consistency_check) {
      __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo,
    if (incr > 0 ? (*pupper < *plower) : (*plower < *pupper)) {
      __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrIllegal, ct_pdo, loc);
  __kmp_assert_valid_gtid(gtid);
  th = __kmp_threads[gtid];
  team = th->th.th_team;
  KMP_DEBUG_ASSERT(th->th.th_teams_microtask);
  nteams = th->th.th_teams_size.nteams;
  team_id = team->t.t_master_tid;
  KMP_DEBUG_ASSERT(nteams == (kmp_uint32)team->t.t_parent->t.t_nproc);
    trip_count = *pupper - *plower + 1;
  } else if (incr == -1) {
    trip_count = *plower - *pupper + 1;
  } else if (incr > 0) {
    trip_count = (UT)(*pupper - *plower) / incr + 1;
    trip_count = (UT)(*plower - *pupper) / (-incr) + 1;
  if (trip_count <= nteams) {
        __kmp_static == kmp_sch_static_greedy ||
            kmp_sch_static_balanced);
    if (team_id < trip_count) {
      *pupper = *plower = *plower + team_id * incr;
      *plower = *pupper + incr;
    if (plastiter != NULL)
      *plastiter = (team_id == trip_count - 1);
    if (__kmp_static == kmp_sch_static_balanced) {
      UT chunk = trip_count / nteams;
      UT extras = trip_count % nteams;
          incr * (team_id * chunk + (team_id < extras ? team_id : extras));
      *pupper = *plower + chunk * incr - (team_id < extras ? 0 : incr);
      if (plastiter != NULL)
        *plastiter = (team_id == nteams - 1);
          (trip_count / nteams + ((trip_count % nteams) ? 1 : 0)) * incr;
      KMP_DEBUG_ASSERT(__kmp_static == kmp_sch_static_greedy);
      *plower += team_id * chunk_inc_count;
      *pupper = *plower + chunk_inc_count - incr;
        if (*pupper < *plower)
          *pupper = traits_t<T>::max_value;
        if (plastiter != NULL)
          *plastiter = *plower <= upper && *pupper > upper - incr;
        if (*pupper > upper)
        if (*pupper > *plower)
          *pupper = traits_t<T>::min_value;
        if (plastiter != NULL)
          *plastiter = *plower >= upper && *pupper < upper - incr;
        if (*pupper < upper)
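  // Example (balanced case): trip_count = 10, nteams = 4, incr = 1 gives
  // chunk = 2 and extras = 2, so teams 0..3 receive 3, 3, 2 and 2 iterations
  // respectively, and only team 3 (nteams - 1) sets *plastiter.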
                             kmp_int32 ub, kmp_int32 st, kmp_int32 chunk) {
  KMP_DEBUG_ASSERT(__kmp_init_serial);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
  __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true);
                              kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk) {
  KMP_DEBUG_ASSERT(__kmp_init_serial);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
  __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true);
                             kmp_int64 ub, kmp_int64 st, kmp_int64 chunk) {
  KMP_DEBUG_ASSERT(__kmp_init_serial);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
  __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true);
                              kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk) {
  KMP_DEBUG_ASSERT(__kmp_init_serial);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
  __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true);
                                  kmp_int32 lb, kmp_int32 ub, kmp_int32 st,
  KMP_DEBUG_ASSERT(__kmp_init_serial);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
  __kmp_dist_get_bounds<kmp_int32>(loc, gtid, p_last, &lb, &ub, st);
  __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true);
void __kmpc_dist_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
                                  kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st,
  KMP_DEBUG_ASSERT(__kmp_init_serial);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
  __kmp_dist_get_bounds<kmp_uint32>(loc, gtid, p_last, &lb, &ub, st);
  __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true);
void __kmpc_dist_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
                                 kmp_int64 lb, kmp_int64 ub, kmp_int64 st,
  KMP_DEBUG_ASSERT(__kmp_init_serial);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
  __kmp_dist_get_bounds<kmp_int64>(loc, gtid, p_last, &lb, &ub, st);
  __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true);
void __kmpc_dist_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
                                  kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st,
  KMP_DEBUG_ASSERT(__kmp_init_serial);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
  __kmp_dist_get_bounds<kmp_uint64>(loc, gtid, p_last, &lb, &ub, st);
  __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true);
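// Usage sketch (hypothetical caller, not part of this file): compiled code for
// "#pragma omp for schedule(dynamic, 4)" over 0..N-1 typically drives these
// entry points roughly as follows, with body() standing in for the loop body:
//
//   __kmpc_dispatch_init_4(loc, gtid, kmp_sch_dynamic_chunked, 0, N - 1, 1, 4);
//   kmp_int32 lb, ub, st, last;
//   while (__kmpc_dispatch_next_4(loc, gtid, &last, &lb, &ub, &st)) {
//     for (kmp_int32 i = lb; i <= ub; i += st)
//       body(i);
//   }
//
// Ordered loops additionally call __kmpc_dispatch_fini_4 after each chunk.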
                            kmp_int32 *p_lb, kmp_int32 *p_ub, kmp_int32 *p_st) {
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
  return __kmp_dispatch_next<kmp_int32>(loc, gtid, p_last, p_lb, p_ub, p_st
#if OMPT_SUPPORT && OMPT_OPTIONAL
                                        OMPT_LOAD_RETURN_ADDRESS(gtid)
                             kmp_uint32 *p_lb, kmp_uint32 *p_ub,
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
  return __kmp_dispatch_next<kmp_uint32>(loc, gtid, p_last, p_lb, p_ub, p_st
#if OMPT_SUPPORT && OMPT_OPTIONAL
                                         OMPT_LOAD_RETURN_ADDRESS(gtid)
                            kmp_int64 *p_lb, kmp_int64 *p_ub, kmp_int64 *p_st) {
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
  return __kmp_dispatch_next<kmp_int64>(loc, gtid, p_last, p_lb, p_ub, p_st
#if OMPT_SUPPORT && OMPT_OPTIONAL
                                        OMPT_LOAD_RETURN_ADDRESS(gtid)
                             kmp_uint64 *p_lb, kmp_uint64 *p_ub,
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
  return __kmp_dispatch_next<kmp_uint64>(loc, gtid, p_last, p_lb, p_ub, p_st
#if OMPT_SUPPORT && OMPT_OPTIONAL
                                         OMPT_LOAD_RETURN_ADDRESS(gtid)
  __kmp_dispatch_finish<kmp_uint32>(gtid, loc);
  __kmp_dispatch_finish<kmp_uint64>(gtid, loc);
  __kmp_dispatch_finish<kmp_uint32>(gtid, loc);
  __kmp_dispatch_finish<kmp_uint64>(gtid, loc);
kmp_uint32 __kmp_eq_4(kmp_uint32 value, kmp_uint32 checker) {
  return value == checker;
kmp_uint32 __kmp_neq_4(kmp_uint32 value, kmp_uint32 checker) {
  return value != checker;
kmp_uint32 __kmp_lt_4(kmp_uint32 value, kmp_uint32 checker) {
  return value < checker;
kmp_uint32 __kmp_ge_4(kmp_uint32 value, kmp_uint32 checker) {
  return value >= checker;
kmp_uint32 __kmp_le_4(kmp_uint32 value, kmp_uint32 checker) {
  return value <= checker;
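// __kmp_wait_4 spins (yielding when oversubscribed) on a 32-bit location
// until the supplied predicate, e.g. __kmp_eq_4 or __kmp_ge_4 above, holds
// for the current value; __kmp_wait_4_ptr does the same for an opaque pointer
// checked by a void*-taking predicate.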
__kmp_wait_4(volatile kmp_uint32 *spinner, kmp_uint32 checker,
             kmp_uint32 (*pred)(kmp_uint32, kmp_uint32),
  volatile kmp_uint32 *spin = spinner;
  kmp_uint32 check = checker;
  kmp_uint32 (*f)(kmp_uint32, kmp_uint32) = pred;
  KMP_FSYNC_SPIN_INIT(obj, CCAST(kmp_uint32 *, spin));
  KMP_INIT_YIELD(spins);
  while (!f(r = TCR_4(*spin), check)) {
    KMP_FSYNC_SPIN_PREPARE(obj);
    KMP_YIELD_OVERSUB_ELSE_SPIN(spins);
  KMP_FSYNC_SPIN_ACQUIRED(obj);
void __kmp_wait_4_ptr(void *spinner, kmp_uint32 checker,
                      kmp_uint32 (*pred)(void *, kmp_uint32),
  void *spin = spinner;
  kmp_uint32 check = checker;
  kmp_uint32 (*f)(void *, kmp_uint32) = pred;
  KMP_FSYNC_SPIN_INIT(obj, spin);
  KMP_INIT_YIELD(spins);
  while (!f(spin, check)) {
    KMP_FSYNC_SPIN_PREPARE(obj);
    KMP_YIELD_OVERSUB_ELSE_SPIN(spins);
  KMP_FSYNC_SPIN_ACQUIRED(obj);
#ifdef KMP_GOMP_COMPAT

void __kmp_aux_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
                               kmp_int32 ub, kmp_int32 st, kmp_int32 chunk,
  __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk,
void __kmp_aux_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
                                kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk,
  __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk,
void __kmp_aux_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
                               kmp_int64 ub, kmp_int64 st, kmp_int64 chunk,
  __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk,
void __kmp_aux_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
                                kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk,
  __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk,
void __kmp_aux_dispatch_fini_chunk_4(ident_t *loc, kmp_int32 gtid) {
  __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc);
void __kmp_aux_dispatch_fini_chunk_8(ident_t *loc, kmp_int32 gtid) {
  __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc);
void __kmp_aux_dispatch_fini_chunk_4u(ident_t *loc, kmp_int32 gtid) {
  __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc);
void __kmp_aux_dispatch_fini_chunk_8u(ident_t *loc, kmp_int32 gtid) {
  __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc);