#ifndef INCLUDED_MDDS_MULTI_TYPE_VECTOR_DIR_SOA_BLOCK_UTIL_HPP
#define INCLUDED_MDDS_MULTI_TYPE_VECTOR_DIR_SOA_BLOCK_UTIL_HPP
#include "mdds/global.hpp"
#include "../types.hpp"

#if defined(__SSE2__)
#include <emmintrin.h>
#endif
#if defined(__AVX2__)
#include <immintrin.h>
#endif

namespace mdds {
namespace mtv {
namespace soa {

template<typename Blks, lu_factor_t F>
struct adjust_block_positions
{
    void operator()(Blks& block_store, int64_t start_block_index, int64_t delta) const
    {
        static_assert(
            invalid_static_int<F>, "The loop-unrolling factor must be one of 0, 4, 8, 16, or 32.");
    }
};

template<typename Blks>
struct adjust_block_positions<Blks, lu_factor_t::none>
{
    void operator()(Blks& block_store, int64_t start_block_index, int64_t delta) const
    {
        int64_t n = block_store.positions.size();

        if (start_block_index >= n)
            return;

        // Plain loop, parallelized with OpenMP when enabled.
#pragma omp parallel for
        for (int64_t i = start_block_index; i < n; ++i)
            block_store.positions[i] += delta;
    }
};

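// The unrolled variants below all follow the same pattern: mask off the
// remainder so the unrolled loop runs over a multiple of the unroll factor,
// then finish the leftover elements in a scalar tail loop.  For example, with
// start_block_index = 3, n = 10 and an unroll factor of 4: len = 7, rem = 3,
// the unrolled loop covers indices 3..6 and the tail loop covers 7..9.
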
template<typename Blks>
struct adjust_block_positions<Blks, lu_factor_t::lu4>
{
    void operator()(Blks& block_store, int64_t start_block_index, int64_t delta) const
    {
        int64_t n = block_store.positions.size();

        if (start_block_index >= n)
            return;

        // Ensure the unrolled loop length is divisible by 4.
        int64_t len = n - start_block_index;
        int64_t rem = len & 3;
        len -= rem;
        len += start_block_index;

#pragma omp parallel for
        for (int64_t i = start_block_index; i < len; i += 4)
        {
            block_store.positions[i+0] += delta;
            block_store.positions[i+1] += delta;
            block_store.positions[i+2] += delta;
            block_store.positions[i+3] += delta;
        }

        // Handle the remainder scalar-wise.
        rem += len;
        for (int64_t i = len; i < rem; ++i)
            block_store.positions[i] += delta;
    }
};

template<typename Blks>
struct adjust_block_positions<Blks, lu_factor_t::lu8>
{
    void operator()(Blks& block_store, int64_t start_block_index, int64_t delta) const
    {
        int64_t n = block_store.positions.size();

        if (start_block_index >= n)
            return;

        // Ensure the unrolled loop length is divisible by 8.
        int64_t len = n - start_block_index;
        int64_t rem = len & 7;
        len -= rem;
        len += start_block_index;

#pragma omp parallel for
        for (int64_t i = start_block_index; i < len; i += 8)
        {
            block_store.positions[i+0] += delta;
            block_store.positions[i+1] += delta;
            block_store.positions[i+2] += delta;
            block_store.positions[i+3] += delta;
            block_store.positions[i+4] += delta;
            block_store.positions[i+5] += delta;
            block_store.positions[i+6] += delta;
            block_store.positions[i+7] += delta;
        }

        rem += len;
        for (int64_t i = len; i < rem; ++i)
            block_store.positions[i] += delta;
    }
};

template<typename Blks>
struct adjust_block_positions<Blks, lu_factor_t::lu16>
{
    void operator()(Blks& block_store, int64_t start_block_index, int64_t delta) const
    {
        int64_t n = block_store.positions.size();

        if (start_block_index >= n)
            return;

        // Ensure the unrolled loop length is divisible by 16.
        int64_t len = n - start_block_index;
        int64_t rem = len & 15;
        len -= rem;
        len += start_block_index;

#pragma omp parallel for
        for (int64_t i = start_block_index; i < len; i += 16)
        {
            block_store.positions[i+0] += delta;
            block_store.positions[i+1] += delta;
            block_store.positions[i+2] += delta;
            block_store.positions[i+3] += delta;
            block_store.positions[i+4] += delta;
            block_store.positions[i+5] += delta;
            block_store.positions[i+6] += delta;
            block_store.positions[i+7] += delta;
            block_store.positions[i+8] += delta;
            block_store.positions[i+9] += delta;
            block_store.positions[i+10] += delta;
            block_store.positions[i+11] += delta;
            block_store.positions[i+12] += delta;
            block_store.positions[i+13] += delta;
            block_store.positions[i+14] += delta;
            block_store.positions[i+15] += delta;
        }

        rem += len;
        for (int64_t i = len; i < rem; ++i)
            block_store.positions[i] += delta;
    }
};

template<typename Blks>
struct adjust_block_positions<Blks, lu_factor_t::lu32>
{
    void operator()(Blks& block_store, int64_t start_block_index, int64_t delta) const
    {
        int64_t n = block_store.positions.size();

        if (start_block_index >= n)
            return;

        // Ensure the unrolled loop length is divisible by 32.
        int64_t len = n - start_block_index;
        int64_t rem = len & 31;
        len -= rem;
        len += start_block_index;

#pragma omp parallel for
        for (int64_t i = start_block_index; i < len; i += 32)
        {
            block_store.positions[i+0] += delta;
            block_store.positions[i+1] += delta;
            block_store.positions[i+2] += delta;
            block_store.positions[i+3] += delta;
            block_store.positions[i+4] += delta;
            block_store.positions[i+5] += delta;
            block_store.positions[i+6] += delta;
            block_store.positions[i+7] += delta;
            block_store.positions[i+8] += delta;
            block_store.positions[i+9] += delta;
            block_store.positions[i+10] += delta;
            block_store.positions[i+11] += delta;
            block_store.positions[i+12] += delta;
            block_store.positions[i+13] += delta;
            block_store.positions[i+14] += delta;
            block_store.positions[i+15] += delta;
            block_store.positions[i+16] += delta;
            block_store.positions[i+17] += delta;
            block_store.positions[i+18] += delta;
            block_store.positions[i+19] += delta;
            block_store.positions[i+20] += delta;
            block_store.positions[i+21] += delta;
            block_store.positions[i+22] += delta;
            block_store.positions[i+23] += delta;
            block_store.positions[i+24] += delta;
            block_store.positions[i+25] += delta;
            block_store.positions[i+26] += delta;
            block_store.positions[i+27] += delta;
            block_store.positions[i+28] += delta;
            block_store.positions[i+29] += delta;
            block_store.positions[i+30] += delta;
            block_store.positions[i+31] += delta;
        }

        rem += len;
        for (int64_t i = len; i < rem; ++i)
            block_store.positions[i] += delta;
    }
};

#if defined(__SSE2__)
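
// The SSE2 specializations process two 64-bit positions per 128-bit register.
// Unaligned loads/stores (_mm_loadu_si128 / _mm_storeu_si128) are used because
// the backing container gives no 16-byte alignment guarantee; the static_asserts
// below make the 64-bit element-width assumption explicit.
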
template<typename Blks>
struct adjust_block_positions<Blks, lu_factor_t::sse2_x64>
{
    void operator()(Blks& block_store, int64_t start_block_index, int64_t delta) const
    {
        static_assert(
            sizeof(typename decltype(block_store.positions)::value_type) == 8,
            "This code works only when the position values are 64-bit wide.");

        int64_t n = block_store.positions.size();

        if (start_block_index >= n)
            return;
        // Make the SIMD loop length even; an odd leftover element is handled at the end.
        int64_t len = n - start_block_index;
        bool odd = len & 1;
        if (odd)
            len -= 1;
        len += start_block_index;

        __m128i right = _mm_set_epi64x(delta, delta);

#pragma omp parallel for
        for (int64_t i = start_block_index; i < len; i += 2)
        {
            __m128i* dst = (__m128i*)&block_store.positions[i];
            _mm_storeu_si128(dst, _mm_add_epi64(_mm_loadu_si128(dst), right));
        }

        if (odd)
            block_store.positions[len] += delta;
    }
};

template<typename Blks>
struct adjust_block_positions<Blks, lu_factor_t::sse2_x64_lu4>
{
    void operator()(Blks& block_store, int64_t start_block_index, int64_t delta) const
    {
        static_assert(
            sizeof(typename decltype(block_store.positions)::value_type) == 8,
            "This code works only when the position values are 64-bit wide.");

        int64_t n = block_store.positions.size();

        if (start_block_index >= n)
            return;

        // Each iteration handles 8 elements (4 SSE2 ops x 2 elements each).
        int64_t len = n - start_block_index;
        int64_t rem = len & 7;
        len -= rem;
        len += start_block_index;

        __m128i right = _mm_set_epi64x(delta, delta);

#pragma omp parallel for
        for (int64_t i = start_block_index; i < len; i += 8)
        {
            __m128i* dst0 = (__m128i*)&block_store.positions[i];
            _mm_storeu_si128(dst0, _mm_add_epi64(_mm_loadu_si128(dst0), right));

            __m128i* dst2 = (__m128i*)&block_store.positions[i+2];
            _mm_storeu_si128(dst2, _mm_add_epi64(_mm_loadu_si128(dst2), right));

            __m128i* dst4 = (__m128i*)&block_store.positions[i+4];
            _mm_storeu_si128(dst4, _mm_add_epi64(_mm_loadu_si128(dst4), right));

            __m128i* dst6 = (__m128i*)&block_store.positions[i+6];
            _mm_storeu_si128(dst6, _mm_add_epi64(_mm_loadu_si128(dst6), right));
        }

        rem += len;
        for (int64_t i = len; i < rem; ++i)
            block_store.positions[i] += delta;
    }
};

template<typename Blks>
struct adjust_block_positions<Blks, lu_factor_t::sse2_x64_lu8>
{
    void operator()(Blks& block_store, int64_t start_block_index, int64_t delta) const
    {
        static_assert(
            sizeof(typename decltype(block_store.positions)::value_type) == 8,
            "This code works only when the position values are 64-bit wide.");

        int64_t n = block_store.positions.size();

        if (start_block_index >= n)
            return;

        // Each iteration handles 16 elements (8 SSE2 ops x 2 elements each).
        int64_t len = n - start_block_index;
        int64_t rem = len & 15;
        len -= rem;
        len += start_block_index;

        __m128i right = _mm_set_epi64x(delta, delta);

#pragma omp parallel for
        for (int64_t i = start_block_index; i < len; i += 16)
        {
            __m128i* dst0 = (__m128i*)&block_store.positions[i];
            _mm_storeu_si128(dst0, _mm_add_epi64(_mm_loadu_si128(dst0), right));

            __m128i* dst2 = (__m128i*)&block_store.positions[i+2];
            _mm_storeu_si128(dst2, _mm_add_epi64(_mm_loadu_si128(dst2), right));

            __m128i* dst4 = (__m128i*)&block_store.positions[i+4];
            _mm_storeu_si128(dst4, _mm_add_epi64(_mm_loadu_si128(dst4), right));

            __m128i* dst6 = (__m128i*)&block_store.positions[i+6];
            _mm_storeu_si128(dst6, _mm_add_epi64(_mm_loadu_si128(dst6), right));

            __m128i* dst8 = (__m128i*)&block_store.positions[i+8];
            _mm_storeu_si128(dst8, _mm_add_epi64(_mm_loadu_si128(dst8), right));

            __m128i* dst10 = (__m128i*)&block_store.positions[i+10];
            _mm_storeu_si128(dst10, _mm_add_epi64(_mm_loadu_si128(dst10), right));

            __m128i* dst12 = (__m128i*)&block_store.positions[i+12];
            _mm_storeu_si128(dst12, _mm_add_epi64(_mm_loadu_si128(dst12), right));

            __m128i* dst14 = (__m128i*)&block_store.positions[i+14];
            _mm_storeu_si128(dst14, _mm_add_epi64(_mm_loadu_si128(dst14), right));
        }

        rem += len;
        for (int64_t i = len; i < rem; ++i)
            block_store.positions[i] += delta;
    }
};

template<typename Blks>
struct adjust_block_positions<Blks, lu_factor_t::sse2_x64_lu16>
{
    void operator()(Blks& block_store, int64_t start_block_index, int64_t delta) const
    {
        static_assert(
            sizeof(typename decltype(block_store.positions)::value_type) == 8,
            "This code works only when the position values are 64-bit wide.");

        int64_t n = block_store.positions.size();

        if (start_block_index >= n)
            return;

        // Each iteration handles 32 elements (16 SSE2 ops x 2 elements each).
        int64_t len = n - start_block_index;
        int64_t rem = len & 31;
        len -= rem;
        len += start_block_index;

        __m128i right = _mm_set_epi64x(delta, delta);

#pragma omp parallel for
        for (int64_t i = start_block_index; i < len; i += 32)
        {
            __m128i* dst0 = (__m128i*)&block_store.positions[i];
            _mm_storeu_si128(dst0, _mm_add_epi64(_mm_loadu_si128(dst0), right));

            __m128i* dst2 = (__m128i*)&block_store.positions[i+2];
            _mm_storeu_si128(dst2, _mm_add_epi64(_mm_loadu_si128(dst2), right));

            __m128i* dst4 = (__m128i*)&block_store.positions[i+4];
            _mm_storeu_si128(dst4, _mm_add_epi64(_mm_loadu_si128(dst4), right));

            __m128i* dst6 = (__m128i*)&block_store.positions[i+6];
            _mm_storeu_si128(dst6, _mm_add_epi64(_mm_loadu_si128(dst6), right));

            __m128i* dst8 = (__m128i*)&block_store.positions[i+8];
            _mm_storeu_si128(dst8, _mm_add_epi64(_mm_loadu_si128(dst8), right));

            __m128i* dst10 = (__m128i*)&block_store.positions[i+10];
            _mm_storeu_si128(dst10, _mm_add_epi64(_mm_loadu_si128(dst10), right));

            __m128i* dst12 = (__m128i*)&block_store.positions[i+12];
            _mm_storeu_si128(dst12, _mm_add_epi64(_mm_loadu_si128(dst12), right));

            __m128i* dst14 = (__m128i*)&block_store.positions[i+14];
            _mm_storeu_si128(dst14, _mm_add_epi64(_mm_loadu_si128(dst14), right));

            __m128i* dst16 = (__m128i*)&block_store.positions[i+16];
            _mm_storeu_si128(dst16, _mm_add_epi64(_mm_loadu_si128(dst16), right));

            __m128i* dst18 = (__m128i*)&block_store.positions[i+18];
            _mm_storeu_si128(dst18, _mm_add_epi64(_mm_loadu_si128(dst18), right));

            __m128i* dst20 = (__m128i*)&block_store.positions[i+20];
            _mm_storeu_si128(dst20, _mm_add_epi64(_mm_loadu_si128(dst20), right));

            __m128i* dst22 = (__m128i*)&block_store.positions[i+22];
            _mm_storeu_si128(dst22, _mm_add_epi64(_mm_loadu_si128(dst22), right));

            __m128i* dst24 = (__m128i*)&block_store.positions[i+24];
            _mm_storeu_si128(dst24, _mm_add_epi64(_mm_loadu_si128(dst24), right));

            __m128i* dst26 = (__m128i*)&block_store.positions[i+26];
            _mm_storeu_si128(dst26, _mm_add_epi64(_mm_loadu_si128(dst26), right));

            __m128i* dst28 = (__m128i*)&block_store.positions[i+28];
            _mm_storeu_si128(dst28, _mm_add_epi64(_mm_loadu_si128(dst28), right));

            __m128i* dst30 = (__m128i*)&block_store.positions[i+30];
            _mm_storeu_si128(dst30, _mm_add_epi64(_mm_loadu_si128(dst30), right));
        }

        rem += len;
        for (int64_t i = len; i < rem; ++i)
            block_store.positions[i] += delta;
    }
};

#endif // __SSE2__

#if defined(__AVX2__)
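
// The AVX2 specializations widen the same idea to 256-bit registers: four
// 64-bit positions per _mm256_add_epi64, with _mm256_set1_epi64x broadcasting
// delta into every lane and unaligned 256-bit loads/stores.
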
template<typename Blks>
struct adjust_block_positions<Blks, lu_factor_t::avx2_x64>
{
    void operator()(Blks& block_store, int64_t start_block_index, int64_t delta) const
    {
        static_assert(
            sizeof(typename decltype(block_store.positions)::value_type) == 8,
            "This code works only when the position values are 64-bit wide.");

        int64_t n = block_store.positions.size();

        if (start_block_index >= n)
            return;

        // Each iteration handles 4 elements in one 256-bit register.
        int64_t len = n - start_block_index;
        int64_t rem = len & 3;
        len -= rem;
        len += start_block_index;

        __m256i right = _mm256_set1_epi64x(delta);

#pragma omp parallel for
        for (int64_t i = start_block_index; i < len; i += 4)
        {
            __m256i* dst = (__m256i*)&block_store.positions[i];
            _mm256_storeu_si256(dst, _mm256_add_epi64(_mm256_loadu_si256(dst), right));
        }

        rem += len;
        for (int64_t i = len; i < rem; ++i)
            block_store.positions[i] += delta;
    }
};

template<typename Blks>
struct adjust_block_positions<Blks, lu_factor_t::avx2_x64_lu4>
{
    void operator()(Blks& block_store, int64_t start_block_index, int64_t delta) const
    {
        static_assert(
            sizeof(typename decltype(block_store.positions)::value_type) == 8,
            "This code works only when the position values are 64-bit wide.");

        int64_t n = block_store.positions.size();

        if (start_block_index >= n)
            return;

        // Each iteration handles 16 elements (4 AVX2 ops x 4 elements each).
        int64_t len = n - start_block_index;
        int64_t rem = len & 15;
        len -= rem;
        len += start_block_index;

        __m256i right = _mm256_set1_epi64x(delta);

#pragma omp parallel for
        for (int64_t i = start_block_index; i < len; i += 16)
        {
            __m256i* dst = (__m256i*)&block_store.positions[i];
            _mm256_storeu_si256(dst, _mm256_add_epi64(_mm256_loadu_si256(dst), right));

            __m256i* dst4 = (__m256i*)&block_store.positions[i+4];
            _mm256_storeu_si256(dst4, _mm256_add_epi64(_mm256_loadu_si256(dst4), right));

            __m256i* dst8 = (__m256i*)&block_store.positions[i+8];
            _mm256_storeu_si256(dst8, _mm256_add_epi64(_mm256_loadu_si256(dst8), right));

            __m256i* dst12 = (__m256i*)&block_store.positions[i+12];
            _mm256_storeu_si256(dst12, _mm256_add_epi64(_mm256_loadu_si256(dst12), right));
        }

        rem += len;
        for (int64_t i = len; i < rem; ++i)
            block_store.positions[i] += delta;
    }
};

template<typename Blks>
struct adjust_block_positions<Blks, lu_factor_t::avx2_x64_lu8>
{
    void operator()(Blks& block_store, int64_t start_block_index, int64_t delta) const
    {
        static_assert(
            sizeof(typename decltype(block_store.positions)::value_type) == 8,
            "This code works only when the position values are 64-bit wide.");

        int64_t n = block_store.positions.size();

        if (start_block_index >= n)
            return;

        // Each iteration handles 32 elements (8 AVX2 ops x 4 elements each).
        int64_t len = n - start_block_index;
        int64_t rem = len & 31;
        len -= rem;
        len += start_block_index;

        __m256i right = _mm256_set1_epi64x(delta);

#pragma omp parallel for
        for (int64_t i = start_block_index; i < len; i += 32)
        {
            __m256i* dst = (__m256i*)&block_store.positions[i];
            _mm256_storeu_si256(dst, _mm256_add_epi64(_mm256_loadu_si256(dst), right));

            __m256i* dst4 = (__m256i*)&block_store.positions[i+4];
            _mm256_storeu_si256(dst4, _mm256_add_epi64(_mm256_loadu_si256(dst4), right));

            __m256i* dst8 = (__m256i*)&block_store.positions[i+8];
            _mm256_storeu_si256(dst8, _mm256_add_epi64(_mm256_loadu_si256(dst8), right));

            __m256i* dst12 = (__m256i*)&block_store.positions[i+12];
            _mm256_storeu_si256(dst12, _mm256_add_epi64(_mm256_loadu_si256(dst12), right));

            __m256i* dst16 = (__m256i*)&block_store.positions[i+16];
            _mm256_storeu_si256(dst16, _mm256_add_epi64(_mm256_loadu_si256(dst16), right));

            __m256i* dst20 = (__m256i*)&block_store.positions[i+20];
            _mm256_storeu_si256(dst20, _mm256_add_epi64(_mm256_loadu_si256(dst20), right));

            __m256i* dst24 = (__m256i*)&block_store.positions[i+24];
            _mm256_storeu_si256(dst24, _mm256_add_epi64(_mm256_loadu_si256(dst24), right));

            __m256i* dst28 = (__m256i*)&block_store.positions[i+28];
            _mm256_storeu_si256(dst28, _mm256_add_epi64(_mm256_loadu_si256(dst28), right));
        }

        rem += len;
        for (int64_t i = len; i < rem; ++i)
            block_store.positions[i] += delta;
    }
};

#endif // __AVX2__

} // namespace soa
} // namespace mtv
} // namespace mdds

#endif