#if defined(_WIN32) || defined(__i386__)
#define BT_USE_SSE_IN_API
#endif

#if defined BT_USE_SIMD_VECTOR3

#if defined BT_USE_SSE || defined _WIN32

#define LOG2_ARRAY_SIZE 6
#define STACK_ARRAY_COUNT (1UL << LOG2_ARRAY_SIZE)

#include <emmintrin.h>
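// The kernels below scan an array of 16-byte xyzw vertices and return the index of the
// vertex with the largest (or smallest) dot product against `vec`, writing the winning
// value to *dotResult. A scalar sketch of that contract (illustrative only, assuming the
// same arguments and FLT_MAX from <cfloat>):
//
//   long best = -1; float bestDot = -FLT_MAX;
//   for (unsigned long k = 0; k < count; k++) {
//       float d = vv[4 * k] * vec[0] + vv[4 * k + 1] * vec[1] + vv[4 * k + 2] * vec[2];
//       if (d > bestDot) { bestDot = d; best = (long)k; }
//   }
//   *dotResult = bestDot; return best;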
long _maxdot_large(const float *vv, const float *vec, unsigned long count, float *dotResult);
long _maxdot_large(const float *vv, const float *vec, unsigned long count, float *dotResult)

static const unsigned char indexTable[16] = {(unsigned char)-1, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0};

float4 vvec = _mm_loadu_ps(vec);
float4 vHi = btCastiTo128f(_mm_shuffle_epi32(btCastfTo128i(vvec), 0xaa));
float4 vLo = _mm_movelh_ps(vvec, vvec);

float4 stack_array[STACK_ARRAY_COUNT];
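// Batching strategy (summary of the code that follows): vertices are pulled four at a
// time, their xy halves multiplied by vLo and the gathered z lane by vHi, and the four
// per-vertex sums are buffered in stack_array. Buffering every result lets the hot loop
// track only the running maximum; the winning index is recovered afterwards with
// _mm_cmpeq_ps / _mm_movemask_ps and the indexTable lookup.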
for (index = 0; index < STACK_ARRAY_COUNT; index += 4)

float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);

stack_array[index] = x;

lo0 = _mm_movelh_ps(v0, v1);
hi0 = _mm_movehl_ps(v1, v0);
lo1 = _mm_movelh_ps(v2, v3);
hi1 = _mm_movehl_ps(v3, v2);

z = _mm_shuffle_ps(hi0, hi1, 0x88);
x = _mm_shuffle_ps(lo0, lo1, 0x88);
y = _mm_shuffle_ps(lo0, lo1, 0xdd);

stack_array[index + 1] = x;

lo0 = _mm_movelh_ps(v0, v1);
hi0 = _mm_movehl_ps(v1, v0);
lo1 = _mm_movelh_ps(v2, v3);
hi1 = _mm_movehl_ps(v3, v2);

z = _mm_shuffle_ps(hi0, hi1, 0x88);
x = _mm_shuffle_ps(lo0, lo1, 0x88);
y = _mm_shuffle_ps(lo0, lo1, 0xdd);

stack_array[index + 2] = x;

lo0 = _mm_movelh_ps(v0, v1);
hi0 = _mm_movehl_ps(v1, v0);
lo1 = _mm_movelh_ps(v2, v3);
hi1 = _mm_movehl_ps(v3, v2);

z = _mm_shuffle_ps(hi0, hi1, 0x88);
x = _mm_shuffle_ps(lo0, lo1, 0x88);
y = _mm_shuffle_ps(lo0, lo1, 0xdd);

stack_array[index + 3] = x;
if (0xf != _mm_movemask_ps((float4)_mm_cmpeq_ps(max, dotMax)))

for (index = 0; 0 == (test = _mm_movemask_ps(_mm_cmpeq_ps(stack_array[index], max))); index++)

maxIndex = 4 * index + segment + indexTable[test];
for (; index + 4 <= count / 4; index += 4)

float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);

stack_array[index] = x;

lo0 = _mm_movelh_ps(v0, v1);
hi0 = _mm_movehl_ps(v1, v0);
lo1 = _mm_movelh_ps(v2, v3);
hi1 = _mm_movehl_ps(v3, v2);

z = _mm_shuffle_ps(hi0, hi1, 0x88);
x = _mm_shuffle_ps(lo0, lo1, 0x88);
y = _mm_shuffle_ps(lo0, lo1, 0xdd);

stack_array[index + 1] = x;

lo0 = _mm_movelh_ps(v0, v1);
hi0 = _mm_movehl_ps(v1, v0);
lo1 = _mm_movelh_ps(v2, v3);
hi1 = _mm_movehl_ps(v3, v2);

z = _mm_shuffle_ps(hi0, hi1, 0x88);
x = _mm_shuffle_ps(lo0, lo1, 0x88);
y = _mm_shuffle_ps(lo0, lo1, 0xdd);

stack_array[index + 2] = x;

lo0 = _mm_movelh_ps(v0, v1);
hi0 = _mm_movehl_ps(v1, v0);
lo1 = _mm_movelh_ps(v2, v3);
hi1 = _mm_movehl_ps(v3, v2);

z = _mm_shuffle_ps(hi0, hi1, 0x88);
x = _mm_shuffle_ps(lo0, lo1, 0x88);
y = _mm_shuffle_ps(lo0, lo1, 0xdd);

stack_array[index + 3] = x;
size_t localCount = (count & -4L) - 4 * index;

float4 t0, t1, t2, t3, t4;
float4 *sap = &stack_array[index + localCount / 4];
vertices += localCount;
size_t byteIndex = -(localCount) * sizeof(float);
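// The inline-assembly loop below is the same four-dot-products-per-iteration kernel,
// hand scheduled. byteIndex starts negative and counts up toward zero, so a single add
// both advances the cursor and drives the loop condition; each iteration stores its four
// dot products through sap for the later index search and folds them into max.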
0: movaps %[max], %[t2] // move max out of the way to avoid propagating NaNs in max \n\
movaps (%[vertices], %[byteIndex], 4), %[t0] // vertices[0] \n\
movaps 16(%[vertices], %[byteIndex], 4), %[t1] // vertices[1] \n\
movaps %[t0], %[max] // vertices[0] \n\
movlhps %[t1], %[max] // x0y0x1y1 \n\
movaps 32(%[vertices], %[byteIndex], 4), %[t3] // vertices[2] \n\
movaps 48(%[vertices], %[byteIndex], 4), %[t4] // vertices[3] \n\
mulps %[vLo], %[max] // x0y0x1y1 * vLo \n\
movhlps %[t0], %[t1] // z0w0z1w1 \n\
movaps %[t3], %[t0] // vertices[2] \n\
movlhps %[t4], %[t0] // x2y2x3y3 \n\
mulps %[vLo], %[t0] // x2y2x3y3 * vLo \n\
movhlps %[t3], %[t4] // z2w2z3w3 \n\
shufps $0x88, %[t4], %[t1] // z0z1z2z3 \n\
mulps %[vHi], %[t1] // z0z1z2z3 * vHi \n\
movaps %[max], %[t3] // x0y0x1y1 * vLo \n\
shufps $0x88, %[t0], %[max] // x0x1x2x3 * vLo.x \n\
shufps $0xdd, %[t0], %[t3] // y0y1y2y3 * vLo.y \n\
addps %[t3], %[max] // x + y \n\
addps %[t1], %[max] // x + y + z \n\
movaps %[max], (%[sap], %[byteIndex]) // record result for later scrutiny \n\
maxps %[t2], %[max] // record max, restore max \n\
add $16, %[byteIndex] // advance loop counter\n\
: [max] "+x"(max), [t0] "=&x"(t0), [t1] "=&x"(t1), [t2] "=&x"(t2), [t3] "=&x"(t3), [t4] "=&x"(t4), [byteIndex] "+r"(byteIndex)
: [vLo] "x"(vLo), [vHi] "x"(vHi), [vertices] "r"(vertices), [sap] "r"(sap)
index += localCount / 4;

for (unsigned int i = 0; i < localCount / 4; i++, index++)
float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);

stack_array[index] = x;

z = _mm_shuffle_ps(hi0, v2, 0xa8);

x = _mm_shuffle_ps(lo0, lo1, 0x88);
y = _mm_shuffle_ps(lo0, lo1, 0xdd);

z = _mm_movehl_ps(v1, v0);

z = _mm_shuffle_ps(z, z, 0xa8);
x = _mm_shuffle_ps(xy, xy, 0xa8);
y = _mm_shuffle_ps(xy, xy, 0xfd);

z = _mm_shuffle_ps(xy, xy, 0xaa);

x = _mm_shuffle_ps(xy, xy, 0);
y = _mm_shuffle_ps(xy, xy, 0x55);

stack_array[index] = x;
if (0 == segment || 0xf != _mm_movemask_ps((float4)_mm_cmpeq_ps(max, dotMax)))

for (index = 0; 0 == (test = _mm_movemask_ps(_mm_cmpeq_ps(stack_array[index], max))); index++)

maxIndex = 4 * index + segment + indexTable[test];

_mm_store_ss(dotResult, dotMax);
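// Illustrative call site (hypothetical names): btVector3's maxDot() forwards to this
// kernel for sufficiently large arrays, roughly:
//
//   float bestDot;
//   long bestIndex = _maxdot_large((const float *)verts, (const float *)&dir, n, &bestDot);
//
// where `verts` is 16-byte-aligned xyzw data and `dir` is the direction to test.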
long _mindot_large(const float *vv, const float *vec, unsigned long count, float *dotResult);
long _mindot_large(const float *vv, const float *vec, unsigned long count, float *dotResult)

static const unsigned char indexTable[16] = {(unsigned char)-1, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0};

float4 vvec = _mm_loadu_ps(vec);
float4 vHi = btCastiTo128f(_mm_shuffle_epi32(btCastfTo128i(vvec), 0xaa));
float4 vLo = _mm_movelh_ps(vvec, vvec);

float4 stack_array[STACK_ARRAY_COUNT];
for (index = 0; index < STACK_ARRAY_COUNT; index += 4)

float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);

stack_array[index] = x;

lo0 = _mm_movelh_ps(v0, v1);
hi0 = _mm_movehl_ps(v1, v0);
lo1 = _mm_movelh_ps(v2, v3);
hi1 = _mm_movehl_ps(v3, v2);

z = _mm_shuffle_ps(hi0, hi1, 0x88);
x = _mm_shuffle_ps(lo0, lo1, 0x88);
y = _mm_shuffle_ps(lo0, lo1, 0xdd);

stack_array[index + 1] = x;

lo0 = _mm_movelh_ps(v0, v1);
hi0 = _mm_movehl_ps(v1, v0);
lo1 = _mm_movelh_ps(v2, v3);
hi1 = _mm_movehl_ps(v3, v2);

z = _mm_shuffle_ps(hi0, hi1, 0x88);
x = _mm_shuffle_ps(lo0, lo1, 0x88);
y = _mm_shuffle_ps(lo0, lo1, 0xdd);

stack_array[index + 2] = x;

lo0 = _mm_movelh_ps(v0, v1);
hi0 = _mm_movehl_ps(v1, v0);
lo1 = _mm_movelh_ps(v2, v3);
hi1 = _mm_movehl_ps(v3, v2);

z = _mm_shuffle_ps(hi0, hi1, 0x88);
x = _mm_shuffle_ps(lo0, lo1, 0x88);
y = _mm_shuffle_ps(lo0, lo1, 0xdd);

stack_array[index + 3] = x;
if (0xf != _mm_movemask_ps((float4)_mm_cmpeq_ps(min, dotmin)))

for (index = 0; 0 == (test = _mm_movemask_ps(_mm_cmpeq_ps(stack_array[index], min))); index++)

minIndex = 4 * index + segment + indexTable[test];
for (; index + 4 <= count / 4; index += 4)

float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);

stack_array[index] = x;

lo0 = _mm_movelh_ps(v0, v1);
hi0 = _mm_movehl_ps(v1, v0);
lo1 = _mm_movelh_ps(v2, v3);
hi1 = _mm_movehl_ps(v3, v2);

z = _mm_shuffle_ps(hi0, hi1, 0x88);
x = _mm_shuffle_ps(lo0, lo1, 0x88);
y = _mm_shuffle_ps(lo0, lo1, 0xdd);

stack_array[index + 1] = x;

lo0 = _mm_movelh_ps(v0, v1);
hi0 = _mm_movehl_ps(v1, v0);
lo1 = _mm_movelh_ps(v2, v3);
hi1 = _mm_movehl_ps(v3, v2);

z = _mm_shuffle_ps(hi0, hi1, 0x88);
x = _mm_shuffle_ps(lo0, lo1, 0x88);
y = _mm_shuffle_ps(lo0, lo1, 0xdd);

stack_array[index + 2] = x;

lo0 = _mm_movelh_ps(v0, v1);
hi0 = _mm_movehl_ps(v1, v0);
lo1 = _mm_movelh_ps(v2, v3);
hi1 = _mm_movehl_ps(v3, v2);

z = _mm_shuffle_ps(hi0, hi1, 0x88);
x = _mm_shuffle_ps(lo0, lo1, 0x88);
y = _mm_shuffle_ps(lo0, lo1, 0xdd);

stack_array[index + 3] = x;
size_t localCount = (count & -4L) - 4 * index;

vertices += localCount;
float4 t0, t1, t2, t3, t4;
size_t byteIndex = -(localCount) * sizeof(float);
float4 *sap = &stack_array[index + localCount / 4];
0: movaps %[min], %[t2] // move min out of the way to avoid propagating NaNs in min \n\
movaps (%[vertices], %[byteIndex], 4), %[t0] // vertices[0] \n\
movaps 16(%[vertices], %[byteIndex], 4), %[t1] // vertices[1] \n\
movaps %[t0], %[min] // vertices[0] \n\
movlhps %[t1], %[min] // x0y0x1y1 \n\
movaps 32(%[vertices], %[byteIndex], 4), %[t3] // vertices[2] \n\
movaps 48(%[vertices], %[byteIndex], 4), %[t4] // vertices[3] \n\
mulps %[vLo], %[min] // x0y0x1y1 * vLo \n\
movhlps %[t0], %[t1] // z0w0z1w1 \n\
movaps %[t3], %[t0] // vertices[2] \n\
movlhps %[t4], %[t0] // x2y2x3y3 \n\
movhlps %[t3], %[t4] // z2w2z3w3 \n\
mulps %[vLo], %[t0] // x2y2x3y3 * vLo \n\
shufps $0x88, %[t4], %[t1] // z0z1z2z3 \n\
mulps %[vHi], %[t1] // z0z1z2z3 * vHi \n\
movaps %[min], %[t3] // x0y0x1y1 * vLo \n\
shufps $0x88, %[t0], %[min] // x0x1x2x3 * vLo.x \n\
shufps $0xdd, %[t0], %[t3] // y0y1y2y3 * vLo.y \n\
addps %[t3], %[min] // x + y \n\
addps %[t1], %[min] // x + y + z \n\
movaps %[min], (%[sap], %[byteIndex]) // record result for later scrutiny \n\
minps %[t2], %[min] // record min, restore min \n\
add $16, %[byteIndex] // advance loop counter\n\
: [min] "+x"(min), [t0] "=&x"(t0), [t1] "=&x"(t1), [t2] "=&x"(t2), [t3] "=&x"(t3), [t4] "=&x"(t4), [byteIndex] "+r"(byteIndex)
: [vLo] "x"(vLo), [vHi] "x"(vHi), [vertices] "r"(vertices), [sap] "r"(sap)
index += localCount / 4;

for (unsigned int i = 0; i < localCount / 4; i++, index++)
float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);

stack_array[index] = x;

z = _mm_shuffle_ps(hi0, v2, 0xa8);

x = _mm_shuffle_ps(lo0, lo1, 0x88);
y = _mm_shuffle_ps(lo0, lo1, 0xdd);

z = _mm_movehl_ps(v1, v0);

z = _mm_shuffle_ps(z, z, 0xa8);
x = _mm_shuffle_ps(xy, xy, 0xa8);
y = _mm_shuffle_ps(xy, xy, 0xfd);

z = _mm_shuffle_ps(xy, xy, 0xaa);

x = _mm_shuffle_ps(xy, xy, 0);
y = _mm_shuffle_ps(xy, xy, 0x55);

stack_array[index] = x;
if (0 == segment || 0xf != _mm_movemask_ps((float4)_mm_cmpeq_ps(min, dotmin)))

for (index = 0; 0 == (test = _mm_movemask_ps(_mm_cmpeq_ps(stack_array[index], min))); index++)

minIndex = 4 * index + segment + indexTable[test];

_mm_store_ss(dotResult, dotmin);
#elif defined BT_USE_NEON

#define ARM_NEON_GCC_COMPATIBILITY 1
#include <arm_neon.h>
#include <sys/types.h>
#include <sys/sysctl.h>
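// Runtime dispatch (summary): _maxdot_large and _mindot_large start out pointing at the
// *_sel trampolines below. On first use those probe "hw.optional.neon_hpfp" via
// sysctlbyname, rebind the function pointer to the _v1 variant on CPUs that report the
// feature (otherwise _v0), and then call through the chosen implementation.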
static long _maxdot_large_v0(const float *vv, const float *vec, unsigned long count, float *dotResult);
static long _maxdot_large_v1(const float *vv, const float *vec, unsigned long count, float *dotResult);
static long _maxdot_large_sel(const float *vv, const float *vec, unsigned long count, float *dotResult);
static long _mindot_large_v0(const float *vv, const float *vec, unsigned long count, float *dotResult);
static long _mindot_large_v1(const float *vv, const float *vec, unsigned long count, float *dotResult);
static long _mindot_large_sel(const float *vv, const float *vec, unsigned long count, float *dotResult);

long (*_maxdot_large)(const float *vv, const float *vec, unsigned long count, float *dotResult) = _maxdot_large_sel;
long (*_mindot_large)(const float *vv, const float *vec, unsigned long count, float *dotResult) = _mindot_large_sel;
static inline uint32_t btGetCpuCapabilities(void)

static bool testedCapabilities = false;

if (0 == testedCapabilities)

size_t featureSize = sizeof(hasFeature);
int err = sysctlbyname("hw.optional.neon_hpfp", &hasFeature, &featureSize, NULL, 0);

if (0 == err && hasFeature)
capabilities |= 0x2000;

testedCapabilities = true;
static long _maxdot_large_sel(const float *vv, const float *vec, unsigned long count, float *dotResult)

if (btGetCpuCapabilities() & 0x2000)
	_maxdot_large = _maxdot_large_v1;
else
	_maxdot_large = _maxdot_large_v0;

return _maxdot_large(vv, vec, count, dotResult);
static long _mindot_large_sel(const float *vv, const float *vec, unsigned long count, float *dotResult)

if (btGetCpuCapabilities() & 0x2000)
	_mindot_large = _mindot_large_v1;
else
	_mindot_large = _mindot_large_v0;

return _mindot_large(vv, vec, count, dotResult);
#define vld1q_f32_aligned_postincrement(_ptr) ({ float32x4_t _r; asm( "vld1.f32 {%0}, [%1, :128]!\n" : "=w" (_r), "+r" (_ptr) ); _r; })

#define vld1q_f32_aligned_postincrement(_ptr) ({ float32x4_t _r = ((float32x4_t*)(_ptr))[0]; (_ptr) = (const float*) ((const char*)(_ptr) + 16L); _r; })
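// vld1q_f32_aligned_postincrement(p) is a statement-expression macro: it loads one
// 16-byte float32x4_t from p and advances p by 16 bytes, mimicking a post-incremented
// aligned vld1.f32. The inline-asm form (with the ":128" alignment hint) and the plain-C
// fallback above are alternative definitions for different target configurations.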
long _maxdot_large_v0(const float *vv, const float *vec, unsigned long count, float *dotResult)

float32x4_t vvec = vld1q_f32_aligned_postincrement(vec);
float32x2_t vLo = vget_low_f32(vvec);
float32x2_t vHi = vdup_lane_f32(vget_high_f32(vvec), 0);

uint32x2_t indexLo = (uint32x2_t){0, 1};
uint32x2_t indexHi = (uint32x2_t){2, 3};
uint32x2_t iLo = (uint32x2_t){static_cast<uint32_t>(-1), static_cast<uint32_t>(-1)};
uint32x2_t iHi = (uint32x2_t){static_cast<uint32_t>(-1), static_cast<uint32_t>(-1)};
const uint32x2_t four = (uint32x2_t){4, 4};
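// Index bookkeeping in the v0 variants (summary): indexLo/indexHi carry the vertex
// indices of the lanes currently being tested ({0,1} and {2,3}, advanced by four per
// group), while iLo/iHi latch, via vbsl on the compare mask, the indices of the best dot
// products seen so far. A final cross-lane reduction picks the overall winner.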
for (; i + 8 <= count; i += 8)

float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
float32x4_t v2 = vld1q_f32_aligned_postincrement(vv);
float32x4_t v3 = vld1q_f32_aligned_postincrement(vv);

float32x2_t xy0 = vmul_f32(vget_low_f32(v0), vLo);
float32x2_t xy1 = vmul_f32(vget_low_f32(v1), vLo);
float32x2_t xy2 = vmul_f32(vget_low_f32(v2), vLo);
float32x2_t xy3 = vmul_f32(vget_low_f32(v3), vLo);

float32x2x2_t z0 = vtrn_f32(vget_high_f32(v0), vget_high_f32(v1));
float32x2x2_t z1 = vtrn_f32(vget_high_f32(v2), vget_high_f32(v3));
float32x2_t zLo = vmul_f32(z0.val[0], vHi);
float32x2_t zHi = vmul_f32(z1.val[0], vHi);

float32x2_t rLo = vpadd_f32(xy0, xy1);
float32x2_t rHi = vpadd_f32(xy2, xy3);
rLo = vadd_f32(rLo, zLo);
rHi = vadd_f32(rHi, zHi);

uint32x2_t maskLo = vcgt_f32(rLo, dotMaxLo);
uint32x2_t maskHi = vcgt_f32(rHi, dotMaxHi);
dotMaxLo = vbsl_f32(maskLo, rLo, dotMaxLo);
dotMaxHi = vbsl_f32(maskHi, rHi, dotMaxHi);
iLo = vbsl_u32(maskLo, indexLo, iLo);
iHi = vbsl_u32(maskHi, indexHi, iHi);
indexLo = vadd_u32(indexLo, four);
indexHi = vadd_u32(indexHi, four);

v0 = vld1q_f32_aligned_postincrement(vv);
v1 = vld1q_f32_aligned_postincrement(vv);
v2 = vld1q_f32_aligned_postincrement(vv);
v3 = vld1q_f32_aligned_postincrement(vv);

xy0 = vmul_f32(vget_low_f32(v0), vLo);
xy1 = vmul_f32(vget_low_f32(v1), vLo);
xy2 = vmul_f32(vget_low_f32(v2), vLo);
xy3 = vmul_f32(vget_low_f32(v3), vLo);

z0 = vtrn_f32(vget_high_f32(v0), vget_high_f32(v1));
z1 = vtrn_f32(vget_high_f32(v2), vget_high_f32(v3));
zLo = vmul_f32(z0.val[0], vHi);
zHi = vmul_f32(z1.val[0], vHi);

rLo = vpadd_f32(xy0, xy1);
rHi = vpadd_f32(xy2, xy3);
rLo = vadd_f32(rLo, zLo);
rHi = vadd_f32(rHi, zHi);

maskLo = vcgt_f32(rLo, dotMaxLo);
maskHi = vcgt_f32(rHi, dotMaxHi);
dotMaxLo = vbsl_f32(maskLo, rLo, dotMaxLo);
dotMaxHi = vbsl_f32(maskHi, rHi, dotMaxHi);
iLo = vbsl_u32(maskLo, indexLo, iLo);
iHi = vbsl_u32(maskHi, indexHi, iHi);
indexLo = vadd_u32(indexLo, four);
indexHi = vadd_u32(indexHi, four);
for (; i + 4 <= count; i += 4)

float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
float32x4_t v2 = vld1q_f32_aligned_postincrement(vv);
float32x4_t v3 = vld1q_f32_aligned_postincrement(vv);

float32x2_t xy0 = vmul_f32(vget_low_f32(v0), vLo);
float32x2_t xy1 = vmul_f32(vget_low_f32(v1), vLo);
float32x2_t xy2 = vmul_f32(vget_low_f32(v2), vLo);
float32x2_t xy3 = vmul_f32(vget_low_f32(v3), vLo);

float32x2x2_t z0 = vtrn_f32(vget_high_f32(v0), vget_high_f32(v1));
float32x2x2_t z1 = vtrn_f32(vget_high_f32(v2), vget_high_f32(v3));
float32x2_t zLo = vmul_f32(z0.val[0], vHi);
float32x2_t zHi = vmul_f32(z1.val[0], vHi);

float32x2_t rLo = vpadd_f32(xy0, xy1);
float32x2_t rHi = vpadd_f32(xy2, xy3);
rLo = vadd_f32(rLo, zLo);
rHi = vadd_f32(rHi, zHi);

uint32x2_t maskLo = vcgt_f32(rLo, dotMaxLo);
uint32x2_t maskHi = vcgt_f32(rHi, dotMaxHi);
dotMaxLo = vbsl_f32(maskLo, rLo, dotMaxLo);
dotMaxHi = vbsl_f32(maskHi, rHi, dotMaxHi);
iLo = vbsl_u32(maskLo, indexLo, iLo);
iHi = vbsl_u32(maskHi, indexHi, iHi);
indexLo = vadd_u32(indexLo, four);
indexHi = vadd_u32(indexHi, four);
float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
float32x4_t v2 = vld1q_f32_aligned_postincrement(vv);

float32x2_t xy0 = vmul_f32(vget_low_f32(v0), vLo);
float32x2_t xy1 = vmul_f32(vget_low_f32(v1), vLo);
float32x2_t xy2 = vmul_f32(vget_low_f32(v2), vLo);

float32x2x2_t z0 = vtrn_f32(vget_high_f32(v0), vget_high_f32(v1));
float32x2_t zLo = vmul_f32(z0.val[0], vHi);
float32x2_t zHi = vmul_f32(vdup_lane_f32(vget_high_f32(v2), 0), vHi);

float32x2_t rLo = vpadd_f32(xy0, xy1);
float32x2_t rHi = vpadd_f32(xy2, xy2);
rLo = vadd_f32(rLo, zLo);
rHi = vadd_f32(rHi, zHi);

uint32x2_t maskLo = vcgt_f32(rLo, dotMaxLo);
uint32x2_t maskHi = vcgt_f32(rHi, dotMaxHi);
dotMaxLo = vbsl_f32(maskLo, rLo, dotMaxLo);
dotMaxHi = vbsl_f32(maskHi, rHi, dotMaxHi);
iLo = vbsl_u32(maskLo, indexLo, iLo);
iHi = vbsl_u32(maskHi, indexHi, iHi);
float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);

float32x2_t xy0 = vmul_f32(vget_low_f32(v0), vLo);
float32x2_t xy1 = vmul_f32(vget_low_f32(v1), vLo);

float32x2x2_t z0 = vtrn_f32(vget_high_f32(v0), vget_high_f32(v1));
float32x2_t zLo = vmul_f32(z0.val[0], vHi);

float32x2_t rLo = vpadd_f32(xy0, xy1);
rLo = vadd_f32(rLo, zLo);

uint32x2_t maskLo = vcgt_f32(rLo, dotMaxLo);
dotMaxLo = vbsl_f32(maskLo, rLo, dotMaxLo);
iLo = vbsl_u32(maskLo, indexLo, iLo);
float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
float32x2_t xy0 = vmul_f32(vget_low_f32(v0), vLo);
float32x2_t z0 = vdup_lane_f32(vget_high_f32(v0), 0);
float32x2_t zLo = vmul_f32(z0, vHi);
float32x2_t rLo = vpadd_f32(xy0, xy0);
rLo = vadd_f32(rLo, zLo);
uint32x2_t maskLo = vcgt_f32(rLo, dotMaxLo);
dotMaxLo = vbsl_f32(maskLo, rLo, dotMaxLo);
iLo = vbsl_u32(maskLo, indexLo, iLo);
uint32x2_t mask = vcgt_f32(dotMaxHi, dotMaxLo);
dotMaxLo = vbsl_f32(mask, dotMaxHi, dotMaxLo);
iLo = vbsl_u32(mask, iHi, iLo);

dotMaxHi = vdup_lane_f32(dotMaxLo, 1);
iHi = vdup_lane_u32(iLo, 1);
mask = vcgt_f32(dotMaxHi, dotMaxLo);
dotMaxLo = vbsl_f32(mask, dotMaxHi, dotMaxLo);
iLo = vbsl_u32(mask, iHi, iLo);

*dotResult = vget_lane_f32(dotMaxLo, 0);
return vget_lane_u32(iLo, 0);
long _maxdot_large_v1(const float *vv, const float *vec, unsigned long count, float *dotResult)

float32x4_t vvec = vld1q_f32_aligned_postincrement(vec);
float32x4_t vLo = vcombine_f32(vget_low_f32(vvec), vget_low_f32(vvec));
float32x4_t vHi = vdupq_lane_f32(vget_high_f32(vvec), 0);
const uint32x4_t four = (uint32x4_t){4, 4, 4, 4};
uint32x4_t local_index = (uint32x4_t){0, 1, 2, 3};
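// The v1 variants work on four dot products per step instead of two pairs: vcombine and
// vuzpq rebuild packed x, y and z vectors from four loaded vertices, a single 4-wide
// compare mask updates both the running extreme and the packed `index` register via
// vbslq, and local_index advances by four each pass.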
unsigned long i = 0;
for (; i + 8 <= count; i += 8)

float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
float32x4_t v2 = vld1q_f32_aligned_postincrement(vv);
float32x4_t v3 = vld1q_f32_aligned_postincrement(vv);

float32x4_t xy0 = vcombine_f32(vget_low_f32(v0), vget_low_f32(v1));
float32x4_t xy1 = vcombine_f32(vget_low_f32(v2), vget_low_f32(v3));

float32x4_t z0 = vcombine_f32(vget_high_f32(v0), vget_high_f32(v1));
float32x4_t z1 = vcombine_f32(vget_high_f32(v2), vget_high_f32(v3));

xy0 = vmulq_f32(xy0, vLo);
xy1 = vmulq_f32(xy1, vLo);

float32x4x2_t zb = vuzpq_f32(z0, z1);
float32x4_t z = vmulq_f32(zb.val[0], vHi);
float32x4x2_t xy = vuzpq_f32(xy0, xy1);
float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
x = vaddq_f32(x, z);

index = vbslq_u32(mask, local_index, index);
local_index = vaddq_u32(local_index, four);

v0 = vld1q_f32_aligned_postincrement(vv);
v1 = vld1q_f32_aligned_postincrement(vv);
v2 = vld1q_f32_aligned_postincrement(vv);
v3 = vld1q_f32_aligned_postincrement(vv);

xy0 = vcombine_f32(vget_low_f32(v0), vget_low_f32(v1));
xy1 = vcombine_f32(vget_low_f32(v2), vget_low_f32(v3));

z0 = vcombine_f32(vget_high_f32(v0), vget_high_f32(v1));
z1 = vcombine_f32(vget_high_f32(v2), vget_high_f32(v3));

xy0 = vmulq_f32(xy0, vLo);
xy1 = vmulq_f32(xy1, vLo);

zb = vuzpq_f32(z0, z1);
z = vmulq_f32(zb.val[0], vHi);
xy = vuzpq_f32(xy0, xy1);
x = vaddq_f32(xy.val[0], xy.val[1]);
x = vaddq_f32(x, z);

index = vbslq_u32(mask, local_index, index);
local_index = vaddq_u32(local_index, four);
for (; i + 4 <= count; i += 4)

float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
float32x4_t v2 = vld1q_f32_aligned_postincrement(vv);
float32x4_t v3 = vld1q_f32_aligned_postincrement(vv);

float32x4_t xy0 = vcombine_f32(vget_low_f32(v0), vget_low_f32(v1));
float32x4_t xy1 = vcombine_f32(vget_low_f32(v2), vget_low_f32(v3));

float32x4_t z0 = vcombine_f32(vget_high_f32(v0), vget_high_f32(v1));
float32x4_t z1 = vcombine_f32(vget_high_f32(v2), vget_high_f32(v3));

xy0 = vmulq_f32(xy0, vLo);
xy1 = vmulq_f32(xy1, vLo);

float32x4x2_t zb = vuzpq_f32(z0, z1);
float32x4_t z = vmulq_f32(zb.val[0], vHi);
float32x4x2_t xy = vuzpq_f32(xy0, xy1);
float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
x = vaddq_f32(x, z);

index = vbslq_u32(mask, local_index, index);
local_index = vaddq_u32(local_index, four);
float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
float32x4_t v2 = vld1q_f32_aligned_postincrement(vv);

float32x4_t xy0 = vcombine_f32(vget_low_f32(v0), vget_low_f32(v1));
float32x4_t xy1 = vcombine_f32(vget_low_f32(v2), vget_low_f32(v2));

float32x4_t z0 = vcombine_f32(vget_high_f32(v0), vget_high_f32(v1));
float32x4_t z1 = vcombine_f32(vget_high_f32(v2), vget_high_f32(v2));

xy0 = vmulq_f32(xy0, vLo);
xy1 = vmulq_f32(xy1, vLo);

float32x4x2_t zb = vuzpq_f32(z0, z1);
float32x4_t z = vmulq_f32(zb.val[0], vHi);
float32x4x2_t xy = vuzpq_f32(xy0, xy1);
float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
x = vaddq_f32(x, z);

index = vbslq_u32(mask, local_index, index);
local_index = vaddq_u32(local_index, four);
float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);

float32x4_t xy0 = vcombine_f32(vget_low_f32(v0), vget_low_f32(v1));

float32x4_t z0 = vcombine_f32(vget_high_f32(v0), vget_high_f32(v1));

xy0 = vmulq_f32(xy0, vLo);

float32x4x2_t zb = vuzpq_f32(z0, z0);
float32x4_t z = vmulq_f32(zb.val[0], vHi);
float32x4x2_t xy = vuzpq_f32(xy0, xy0);
float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
x = vaddq_f32(x, z);

index = vbslq_u32(mask, local_index, index);
local_index = vaddq_u32(local_index, four);
float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);

float32x4_t xy0 = vcombine_f32(vget_low_f32(v0), vget_low_f32(v0));

float32x4_t z = vdupq_lane_f32(vget_high_f32(v0), 0);

xy0 = vmulq_f32(xy0, vLo);

z = vmulq_f32(z, vHi);
float32x4x2_t xy = vuzpq_f32(xy0, xy0);
float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
x = vaddq_f32(x, z);

index = vbslq_u32(mask, local_index, index);
local_index = vaddq_u32(local_index, four);
float32x2_t maxDot2 = vbsl_f32(mask, vget_high_f32(maxDot), vget_low_f32(maxDot));
uint32x2_t index2 = vbsl_u32(mask, vget_high_u32(index), vget_low_u32(index));

float32x2_t maxDotO = vdup_lane_f32(maxDot2, 1);
uint32x2_t indexHi = vdup_lane_u32(index2, 1);
mask = vcgt_f32(maxDotO, maxDot2);
maxDot2 = vbsl_f32(mask, maxDotO, maxDot2);
index2 = vbsl_u32(mask, indexHi, index2);

*dotResult = vget_lane_f32(maxDot2, 0);
return vget_lane_u32(index2, 0);
long _mindot_large_v0(const float *vv, const float *vec, unsigned long count, float *dotResult)

unsigned long i = 0;
float32x4_t vvec = vld1q_f32_aligned_postincrement(vec);
float32x2_t vLo = vget_low_f32(vvec);
float32x2_t vHi = vdup_lane_f32(vget_high_f32(vvec), 0);

uint32x2_t indexLo = (uint32x2_t){0, 1};
uint32x2_t indexHi = (uint32x2_t){2, 3};
uint32x2_t iLo = (uint32x2_t){static_cast<uint32_t>(-1), static_cast<uint32_t>(-1)};
uint32x2_t iHi = (uint32x2_t){static_cast<uint32_t>(-1), static_cast<uint32_t>(-1)};
const uint32x2_t four = (uint32x2_t){4, 4};
for (; i + 8 <= count; i += 8)

float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
float32x4_t v2 = vld1q_f32_aligned_postincrement(vv);
float32x4_t v3 = vld1q_f32_aligned_postincrement(vv);

float32x2_t xy0 = vmul_f32(vget_low_f32(v0), vLo);
float32x2_t xy1 = vmul_f32(vget_low_f32(v1), vLo);
float32x2_t xy2 = vmul_f32(vget_low_f32(v2), vLo);
float32x2_t xy3 = vmul_f32(vget_low_f32(v3), vLo);

float32x2x2_t z0 = vtrn_f32(vget_high_f32(v0), vget_high_f32(v1));
float32x2x2_t z1 = vtrn_f32(vget_high_f32(v2), vget_high_f32(v3));
float32x2_t zLo = vmul_f32(z0.val[0], vHi);
float32x2_t zHi = vmul_f32(z1.val[0], vHi);

float32x2_t rLo = vpadd_f32(xy0, xy1);
float32x2_t rHi = vpadd_f32(xy2, xy3);
rLo = vadd_f32(rLo, zLo);
rHi = vadd_f32(rHi, zHi);

uint32x2_t maskLo = vclt_f32(rLo, dotMinLo);
uint32x2_t maskHi = vclt_f32(rHi, dotMinHi);
dotMinLo = vbsl_f32(maskLo, rLo, dotMinLo);
dotMinHi = vbsl_f32(maskHi, rHi, dotMinHi);
iLo = vbsl_u32(maskLo, indexLo, iLo);
iHi = vbsl_u32(maskHi, indexHi, iHi);
indexLo = vadd_u32(indexLo, four);
indexHi = vadd_u32(indexHi, four);

v0 = vld1q_f32_aligned_postincrement(vv);
v1 = vld1q_f32_aligned_postincrement(vv);
v2 = vld1q_f32_aligned_postincrement(vv);
v3 = vld1q_f32_aligned_postincrement(vv);

xy0 = vmul_f32(vget_low_f32(v0), vLo);
xy1 = vmul_f32(vget_low_f32(v1), vLo);
xy2 = vmul_f32(vget_low_f32(v2), vLo);
xy3 = vmul_f32(vget_low_f32(v3), vLo);

z0 = vtrn_f32(vget_high_f32(v0), vget_high_f32(v1));
z1 = vtrn_f32(vget_high_f32(v2), vget_high_f32(v3));
zLo = vmul_f32(z0.val[0], vHi);
zHi = vmul_f32(z1.val[0], vHi);

rLo = vpadd_f32(xy0, xy1);
rHi = vpadd_f32(xy2, xy3);
rLo = vadd_f32(rLo, zLo);
rHi = vadd_f32(rHi, zHi);

maskLo = vclt_f32(rLo, dotMinLo);
maskHi = vclt_f32(rHi, dotMinHi);
dotMinLo = vbsl_f32(maskLo, rLo, dotMinLo);
dotMinHi = vbsl_f32(maskHi, rHi, dotMinHi);
iLo = vbsl_u32(maskLo, indexLo, iLo);
iHi = vbsl_u32(maskHi, indexHi, iHi);
indexLo = vadd_u32(indexLo, four);
indexHi = vadd_u32(indexHi, four);
for (; i + 4 <= count; i += 4)

float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
float32x4_t v2 = vld1q_f32_aligned_postincrement(vv);
float32x4_t v3 = vld1q_f32_aligned_postincrement(vv);

float32x2_t xy0 = vmul_f32(vget_low_f32(v0), vLo);
float32x2_t xy1 = vmul_f32(vget_low_f32(v1), vLo);
float32x2_t xy2 = vmul_f32(vget_low_f32(v2), vLo);
float32x2_t xy3 = vmul_f32(vget_low_f32(v3), vLo);

float32x2x2_t z0 = vtrn_f32(vget_high_f32(v0), vget_high_f32(v1));
float32x2x2_t z1 = vtrn_f32(vget_high_f32(v2), vget_high_f32(v3));
float32x2_t zLo = vmul_f32(z0.val[0], vHi);
float32x2_t zHi = vmul_f32(z1.val[0], vHi);

float32x2_t rLo = vpadd_f32(xy0, xy1);
float32x2_t rHi = vpadd_f32(xy2, xy3);
rLo = vadd_f32(rLo, zLo);
rHi = vadd_f32(rHi, zHi);

uint32x2_t maskLo = vclt_f32(rLo, dotMinLo);
uint32x2_t maskHi = vclt_f32(rHi, dotMinHi);
dotMinLo = vbsl_f32(maskLo, rLo, dotMinLo);
dotMinHi = vbsl_f32(maskHi, rHi, dotMinHi);
iLo = vbsl_u32(maskLo, indexLo, iLo);
iHi = vbsl_u32(maskHi, indexHi, iHi);
indexLo = vadd_u32(indexLo, four);
indexHi = vadd_u32(indexHi, four);
float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
float32x4_t v2 = vld1q_f32_aligned_postincrement(vv);

float32x2_t xy0 = vmul_f32(vget_low_f32(v0), vLo);
float32x2_t xy1 = vmul_f32(vget_low_f32(v1), vLo);
float32x2_t xy2 = vmul_f32(vget_low_f32(v2), vLo);

float32x2x2_t z0 = vtrn_f32(vget_high_f32(v0), vget_high_f32(v1));
float32x2_t zLo = vmul_f32(z0.val[0], vHi);
float32x2_t zHi = vmul_f32(vdup_lane_f32(vget_high_f32(v2), 0), vHi);

float32x2_t rLo = vpadd_f32(xy0, xy1);
float32x2_t rHi = vpadd_f32(xy2, xy2);
rLo = vadd_f32(rLo, zLo);
rHi = vadd_f32(rHi, zHi);

uint32x2_t maskLo = vclt_f32(rLo, dotMinLo);
uint32x2_t maskHi = vclt_f32(rHi, dotMinHi);
dotMinLo = vbsl_f32(maskLo, rLo, dotMinLo);
dotMinHi = vbsl_f32(maskHi, rHi, dotMinHi);
iLo = vbsl_u32(maskLo, indexLo, iLo);
iHi = vbsl_u32(maskHi, indexHi, iHi);
float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);

float32x2_t xy0 = vmul_f32(vget_low_f32(v0), vLo);
float32x2_t xy1 = vmul_f32(vget_low_f32(v1), vLo);

float32x2x2_t z0 = vtrn_f32(vget_high_f32(v0), vget_high_f32(v1));
float32x2_t zLo = vmul_f32(z0.val[0], vHi);

float32x2_t rLo = vpadd_f32(xy0, xy1);
rLo = vadd_f32(rLo, zLo);

uint32x2_t maskLo = vclt_f32(rLo, dotMinLo);
dotMinLo = vbsl_f32(maskLo, rLo, dotMinLo);
iLo = vbsl_u32(maskLo, indexLo, iLo);
float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
float32x2_t xy0 = vmul_f32(vget_low_f32(v0), vLo);
float32x2_t z0 = vdup_lane_f32(vget_high_f32(v0), 0);
float32x2_t zLo = vmul_f32(z0, vHi);
float32x2_t rLo = vpadd_f32(xy0, xy0);
rLo = vadd_f32(rLo, zLo);
uint32x2_t maskLo = vclt_f32(rLo, dotMinLo);
dotMinLo = vbsl_f32(maskLo, rLo, dotMinLo);
iLo = vbsl_u32(maskLo, indexLo, iLo);
uint32x2_t mask = vclt_f32(dotMinHi, dotMinLo);
dotMinLo = vbsl_f32(mask, dotMinHi, dotMinLo);
iLo = vbsl_u32(mask, iHi, iLo);

dotMinHi = vdup_lane_f32(dotMinLo, 1);
iHi = vdup_lane_u32(iLo, 1);
mask = vclt_f32(dotMinHi, dotMinLo);
dotMinLo = vbsl_f32(mask, dotMinHi, dotMinLo);
iLo = vbsl_u32(mask, iHi, iLo);

*dotResult = vget_lane_f32(dotMinLo, 0);
return vget_lane_u32(iLo, 0);
long _mindot_large_v1(const float *vv, const float *vec, unsigned long count, float *dotResult)

float32x4_t vvec = vld1q_f32_aligned_postincrement(vec);
float32x4_t vLo = vcombine_f32(vget_low_f32(vvec), vget_low_f32(vvec));
float32x4_t vHi = vdupq_lane_f32(vget_high_f32(vvec), 0);
const uint32x4_t four = (uint32x4_t){4, 4, 4, 4};
uint32x4_t local_index = (uint32x4_t){0, 1, 2, 3};
unsigned long i = 0;
for (; i + 8 <= count; i += 8)

float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
float32x4_t v2 = vld1q_f32_aligned_postincrement(vv);
float32x4_t v3 = vld1q_f32_aligned_postincrement(vv);

float32x4_t xy0 = vcombine_f32(vget_low_f32(v0), vget_low_f32(v1));
float32x4_t xy1 = vcombine_f32(vget_low_f32(v2), vget_low_f32(v3));

float32x4_t z0 = vcombine_f32(vget_high_f32(v0), vget_high_f32(v1));
float32x4_t z1 = vcombine_f32(vget_high_f32(v2), vget_high_f32(v3));

xy0 = vmulq_f32(xy0, vLo);
xy1 = vmulq_f32(xy1, vLo);

float32x4x2_t zb = vuzpq_f32(z0, z1);
float32x4_t z = vmulq_f32(zb.val[0], vHi);
float32x4x2_t xy = vuzpq_f32(xy0, xy1);
float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
x = vaddq_f32(x, z);

index = vbslq_u32(mask, local_index, index);
local_index = vaddq_u32(local_index, four);

v0 = vld1q_f32_aligned_postincrement(vv);
v1 = vld1q_f32_aligned_postincrement(vv);
v2 = vld1q_f32_aligned_postincrement(vv);
v3 = vld1q_f32_aligned_postincrement(vv);

xy0 = vcombine_f32(vget_low_f32(v0), vget_low_f32(v1));
xy1 = vcombine_f32(vget_low_f32(v2), vget_low_f32(v3));

z0 = vcombine_f32(vget_high_f32(v0), vget_high_f32(v1));
z1 = vcombine_f32(vget_high_f32(v2), vget_high_f32(v3));

xy0 = vmulq_f32(xy0, vLo);
xy1 = vmulq_f32(xy1, vLo);

zb = vuzpq_f32(z0, z1);
z = vmulq_f32(zb.val[0], vHi);
xy = vuzpq_f32(xy0, xy1);
x = vaddq_f32(xy.val[0], xy.val[1]);
x = vaddq_f32(x, z);

index = vbslq_u32(mask, local_index, index);
local_index = vaddq_u32(local_index, four);
for (; i + 4 <= count; i += 4)

float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
float32x4_t v2 = vld1q_f32_aligned_postincrement(vv);
float32x4_t v3 = vld1q_f32_aligned_postincrement(vv);

float32x4_t xy0 = vcombine_f32(vget_low_f32(v0), vget_low_f32(v1));
float32x4_t xy1 = vcombine_f32(vget_low_f32(v2), vget_low_f32(v3));

float32x4_t z0 = vcombine_f32(vget_high_f32(v0), vget_high_f32(v1));
float32x4_t z1 = vcombine_f32(vget_high_f32(v2), vget_high_f32(v3));

xy0 = vmulq_f32(xy0, vLo);
xy1 = vmulq_f32(xy1, vLo);

float32x4x2_t zb = vuzpq_f32(z0, z1);
float32x4_t z = vmulq_f32(zb.val[0], vHi);
float32x4x2_t xy = vuzpq_f32(xy0, xy1);
float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
x = vaddq_f32(x, z);

index = vbslq_u32(mask, local_index, index);
local_index = vaddq_u32(local_index, four);
float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
float32x4_t v2 = vld1q_f32_aligned_postincrement(vv);

float32x4_t xy0 = vcombine_f32(vget_low_f32(v0), vget_low_f32(v1));
float32x4_t xy1 = vcombine_f32(vget_low_f32(v2), vget_low_f32(v2));

float32x4_t z0 = vcombine_f32(vget_high_f32(v0), vget_high_f32(v1));
float32x4_t z1 = vcombine_f32(vget_high_f32(v2), vget_high_f32(v2));

xy0 = vmulq_f32(xy0, vLo);
xy1 = vmulq_f32(xy1, vLo);

float32x4x2_t zb = vuzpq_f32(z0, z1);
float32x4_t z = vmulq_f32(zb.val[0], vHi);
float32x4x2_t xy = vuzpq_f32(xy0, xy1);
float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
x = vaddq_f32(x, z);

index = vbslq_u32(mask, local_index, index);
local_index = vaddq_u32(local_index, four);
float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);

float32x4_t xy0 = vcombine_f32(vget_low_f32(v0), vget_low_f32(v1));

float32x4_t z0 = vcombine_f32(vget_high_f32(v0), vget_high_f32(v1));

xy0 = vmulq_f32(xy0, vLo);

float32x4x2_t zb = vuzpq_f32(z0, z0);
float32x4_t z = vmulq_f32(zb.val[0], vHi);
float32x4x2_t xy = vuzpq_f32(xy0, xy0);
float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
x = vaddq_f32(x, z);

index = vbslq_u32(mask, local_index, index);
local_index = vaddq_u32(local_index, four);
float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);

float32x4_t xy0 = vcombine_f32(vget_low_f32(v0), vget_low_f32(v0));

float32x4_t z = vdupq_lane_f32(vget_high_f32(v0), 0);

xy0 = vmulq_f32(xy0, vLo);

z = vmulq_f32(z, vHi);
float32x4x2_t xy = vuzpq_f32(xy0, xy0);
float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
x = vaddq_f32(x, z);

index = vbslq_u32(mask, local_index, index);
local_index = vaddq_u32(local_index, four);
float32x2_t minDot2 = vbsl_f32(mask, vget_high_f32(minDot), vget_low_f32(minDot));
uint32x2_t index2 = vbsl_u32(mask, vget_high_u32(index), vget_low_u32(index));

float32x2_t minDotO = vdup_lane_f32(minDot2, 1);
uint32x2_t indexHi = vdup_lane_u32(index2, 1);
mask = vclt_f32(minDotO, minDot2);
minDot2 = vbsl_f32(mask, minDotO, minDot2);
index2 = vbsl_u32(mask, indexHi, index2);

*dotResult = vget_lane_f32(minDot2, 0);
return vget_lane_u32(index2, 0);
#error Unhandled __APPLE__ arch