Blender V3.3
btVector3.cpp
1 /*
2  Copyright (c) 2011 Apple Inc.
3  http://continuousphysics.com/Bullet/
4 
5  This software is provided 'as-is', without any express or implied warranty.
6  In no event will the authors be held liable for any damages arising from the use of this software.
7  Permission is granted to anyone to use this software for any purpose,
8  including commercial applications, and to alter it and redistribute it freely,
9  subject to the following restrictions:
10 
11  1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
12  2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
13  3. This notice may not be removed or altered from any source distribution.
14 
15  This source version has been altered.
16  */
17 
18 #if defined(_WIN32) || defined(__i386__)
19 #define BT_USE_SSE_IN_API
20 #endif
21 
22 #include "btVector3.h"
23 
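// The remainder of this file implements two batch kernels, _maxdot_large and
// _mindot_large, which scan an array of 4-float vertices for the one with the
// largest (respectively smallest) dot product against a reference vector.
// They back btVector3's maxDot()/minDot() queries when BT_USE_SIMD_VECTOR3 is
// defined, with an SSE path for x86/Windows and a NEON path for ARM below.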
24 #if defined BT_USE_SIMD_VECTOR3
25 
26 #if DEBUG
27 #include <string.h> //for memset
28 #endif
29 
30 #ifdef __APPLE__
31 #include <stdint.h>
32 typedef float float4 __attribute__((vector_size(16)));
33 #else
34 #define float4 __m128
35 #endif
36 //typedef uint32_t uint4 __attribute__ ((vector_size(16)));
37 
38 #if defined BT_USE_SSE || defined _WIN32
39 
40 #define LOG2_ARRAY_SIZE 6
41 #define STACK_ARRAY_COUNT (1UL << LOG2_ARRAY_SIZE)
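// Dot products are computed in tiles: each tile stores STACK_ARRAY_COUNT (64)
// float4 results, i.e. 256 dot products, in a stack buffer so that when a tile
// produces a new extreme value its position can be recovered by rescanning the
// tile instead of tracking an index on every iteration.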
42 
43 #include <emmintrin.h>
44 
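// _maxdot_large: computes the dot product of the x/y/z components of vec with
// each of the `count` vertices in vv (one vertex per 16 bytes, w ignored),
// writes the largest dot product to *dotResult and returns the index of the
// first vertex that attains it. vv is expected to be 16-byte aligned.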
45 long _maxdot_large(const float *vv, const float *vec, unsigned long count, float *dotResult);
46 long _maxdot_large(const float *vv, const float *vec, unsigned long count, float *dotResult)
47 {
48  const float4 *vertices = (const float4 *)vv;
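 // indexTable maps a non-zero 4-bit _mm_movemask_ps() result to the index of
 // its lowest set bit, i.e. the first SSE lane whose comparison succeeded.
 // Entry 0 is unused (the scans below only consult it for non-zero masks).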
49  static const unsigned char indexTable[16] = {(unsigned char)-1, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0};
50  float4 dotMax = btAssign128(-BT_INFINITY, -BT_INFINITY, -BT_INFINITY, -BT_INFINITY);
51  float4 vvec = _mm_loadu_ps(vec);
52  float4 vHi = btCastiTo128f(_mm_shuffle_epi32(btCastfTo128i(vvec), 0xaa));
53  float4 vLo = _mm_movelh_ps(vvec, vvec);
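 // vLo = (vec.x, vec.y, vec.x, vec.y) and vHi = (vec.z, vec.z, vec.z, vec.z):
 // each 4-wide step below multiplies packed x/y pairs by vLo and z lanes by
 // vHi, then shuffles the x, y and z partial products into separate registers
 // so two vector adds yield four dot products at once.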
54 
55  long maxIndex = -1L;
56 
57  size_t segment = 0;
58  float4 stack_array[STACK_ARRAY_COUNT];
59 
60 #if DEBUG
61  //memset( stack_array, -1, STACK_ARRAY_COUNT * sizeof(stack_array[0]) );
62 #endif
63 
64  size_t index;
65  float4 max;
66  // Faster loop without cleanup code for full tiles
67  for (segment = 0; segment + STACK_ARRAY_COUNT * 4 <= count; segment += STACK_ARRAY_COUNT * 4)
68  {
69  max = dotMax;
70 
71  for (index = 0; index < STACK_ARRAY_COUNT; index += 4)
72  { // do four dot products at a time. Carefully avoid touching the w element.
73  float4 v0 = vertices[0];
74  float4 v1 = vertices[1];
75  float4 v2 = vertices[2];
76  float4 v3 = vertices[3];
77  vertices += 4;
78 
79  float4 lo0 = _mm_movelh_ps(v0, v1); // x0y0x1y1
80  float4 hi0 = _mm_movehl_ps(v1, v0); // z0?0z1?1
81  float4 lo1 = _mm_movelh_ps(v2, v3); // x2y2x3y3
82  float4 hi1 = _mm_movehl_ps(v3, v2); // z2?2z3?3
83 
84  lo0 = lo0 * vLo;
85  lo1 = lo1 * vLo;
86  float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
87  float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
88  float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
89  z = z * vHi;
90  x = x + y;
91  x = x + z;
92  stack_array[index] = x;
93  max = _mm_max_ps(x, max); // control the order here so that max is never NaN even if x is nan
94 
95  v0 = vertices[0];
96  v1 = vertices[1];
97  v2 = vertices[2];
98  v3 = vertices[3];
99  vertices += 4;
100 
101  lo0 = _mm_movelh_ps(v0, v1); // x0y0x1y1
102  hi0 = _mm_movehl_ps(v1, v0); // z0?0z1?1
103  lo1 = _mm_movelh_ps(v2, v3); // x2y2x3y3
104  hi1 = _mm_movehl_ps(v3, v2); // z2?2z3?3
105 
106  lo0 = lo0 * vLo;
107  lo1 = lo1 * vLo;
108  z = _mm_shuffle_ps(hi0, hi1, 0x88);
109  x = _mm_shuffle_ps(lo0, lo1, 0x88);
110  y = _mm_shuffle_ps(lo0, lo1, 0xdd);
111  z = z * vHi;
112  x = x + y;
113  x = x + z;
114  stack_array[index + 1] = x;
115  max = _mm_max_ps(x, max); // control the order here so that max is never NaN even if x is nan
116 
117  v0 = vertices[0];
118  v1 = vertices[1];
119  v2 = vertices[2];
120  v3 = vertices[3];
121  vertices += 4;
122 
123  lo0 = _mm_movelh_ps(v0, v1); // x0y0x1y1
124  hi0 = _mm_movehl_ps(v1, v0); // z0?0z1?1
125  lo1 = _mm_movelh_ps(v2, v3); // x2y2x3y3
126  hi1 = _mm_movehl_ps(v3, v2); // z2?2z3?3
127 
128  lo0 = lo0 * vLo;
129  lo1 = lo1 * vLo;
130  z = _mm_shuffle_ps(hi0, hi1, 0x88);
131  x = _mm_shuffle_ps(lo0, lo1, 0x88);
132  y = _mm_shuffle_ps(lo0, lo1, 0xdd);
133  z = z * vHi;
134  x = x + y;
135  x = x + z;
136  stack_array[index + 2] = x;
137  max = _mm_max_ps(x, max); // control the order here so that max is never NaN even if x is nan
138 
139  v0 = vertices[0];
140  v1 = vertices[1];
141  v2 = vertices[2];
142  v3 = vertices[3];
143  vertices += 4;
144 
145  lo0 = _mm_movelh_ps(v0, v1); // x0y0x1y1
146  hi0 = _mm_movehl_ps(v1, v0); // z0?0z1?1
147  lo1 = _mm_movelh_ps(v2, v3); // x2y2x3y3
148  hi1 = _mm_movehl_ps(v3, v2); // z2?2z3?3
149 
150  lo0 = lo0 * vLo;
151  lo1 = lo1 * vLo;
152  z = _mm_shuffle_ps(hi0, hi1, 0x88);
153  x = _mm_shuffle_ps(lo0, lo1, 0x88);
154  y = _mm_shuffle_ps(lo0, lo1, 0xdd);
155  z = z * vHi;
156  x = x + y;
157  x = x + z;
158  stack_array[index + 3] = x;
159  max = _mm_max_ps(x, max); // control the order here so that max is never NaN even if x is nan
160 
161  // It is too costly to keep the index of the max here. We will look for it again later. We save a lot of work this way.
162  }
163 
164  // If we found a new max
165  if (0xf != _mm_movemask_ps((float4)_mm_cmpeq_ps(max, dotMax)))
166  {
167  // copy the new max across all lanes of our max accumulator
168  max = _mm_max_ps(max, (float4)_mm_shuffle_ps(max, max, 0x4e));
169  max = _mm_max_ps(max, (float4)_mm_shuffle_ps(max, max, 0xb1));
170 
171  dotMax = max;
172 
173  // find first occurrence of that max
174  size_t test;
175  for (index = 0; 0 == (test = _mm_movemask_ps(_mm_cmpeq_ps(stack_array[index], max))); index++) // local_count must be a multiple of 4
176  {
177  }
178  // record where it is.
179  maxIndex = 4 * index + segment + indexTable[test];
180  }
181  }
182 
183  // account for work we've already done
184  count -= segment;
185 
186  // Deal with the last < STACK_ARRAY_COUNT * 4 vectors
187  max = dotMax;
188  index = 0;
189 
190  if (btUnlikely(count > 16))
191  {
192  for (; index + 4 <= count / 4; index += 4)
193  { // do four dot products at a time. Carefully avoid touching the w element.
194  float4 v0 = vertices[0];
195  float4 v1 = vertices[1];
196  float4 v2 = vertices[2];
197  float4 v3 = vertices[3];
198  vertices += 4;
199 
200  float4 lo0 = _mm_movelh_ps(v0, v1); // x0y0x1y1
201  float4 hi0 = _mm_movehl_ps(v1, v0); // z0?0z1?1
202  float4 lo1 = _mm_movelh_ps(v2, v3); // x2y2x3y3
203  float4 hi1 = _mm_movehl_ps(v3, v2); // z2?2z3?3
204 
205  lo0 = lo0 * vLo;
206  lo1 = lo1 * vLo;
207  float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
208  float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
209  float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
210  z = z * vHi;
211  x = x + y;
212  x = x + z;
213  stack_array[index] = x;
214  max = _mm_max_ps(x, max); // control the order here so that max is never NaN even if x is nan
215 
216  v0 = vertices[0];
217  v1 = vertices[1];
218  v2 = vertices[2];
219  v3 = vertices[3];
220  vertices += 4;
221 
222  lo0 = _mm_movelh_ps(v0, v1); // x0y0x1y1
223  hi0 = _mm_movehl_ps(v1, v0); // z0?0z1?1
224  lo1 = _mm_movelh_ps(v2, v3); // x2y2x3y3
225  hi1 = _mm_movehl_ps(v3, v2); // z2?2z3?3
226 
227  lo0 = lo0 * vLo;
228  lo1 = lo1 * vLo;
229  z = _mm_shuffle_ps(hi0, hi1, 0x88);
230  x = _mm_shuffle_ps(lo0, lo1, 0x88);
231  y = _mm_shuffle_ps(lo0, lo1, 0xdd);
232  z = z * vHi;
233  x = x + y;
234  x = x + z;
235  stack_array[index + 1] = x;
236  max = _mm_max_ps(x, max); // control the order here so that max is never NaN even if x is nan
237 
238  v0 = vertices[0];
239  v1 = vertices[1];
240  v2 = vertices[2];
241  v3 = vertices[3];
242  vertices += 4;
243 
244  lo0 = _mm_movelh_ps(v0, v1); // x0y0x1y1
245  hi0 = _mm_movehl_ps(v1, v0); // z0?0z1?1
246  lo1 = _mm_movelh_ps(v2, v3); // x2y2x3y3
247  hi1 = _mm_movehl_ps(v3, v2); // z2?2z3?3
248 
249  lo0 = lo0 * vLo;
250  lo1 = lo1 * vLo;
251  z = _mm_shuffle_ps(hi0, hi1, 0x88);
252  x = _mm_shuffle_ps(lo0, lo1, 0x88);
253  y = _mm_shuffle_ps(lo0, lo1, 0xdd);
254  z = z * vHi;
255  x = x + y;
256  x = x + z;
257  stack_array[index + 2] = x;
258  max = _mm_max_ps(x, max); // control the order here so that max is never NaN even if x is nan
259 
260  v0 = vertices[0];
261  v1 = vertices[1];
262  v2 = vertices[2];
263  v3 = vertices[3];
264  vertices += 4;
265 
266  lo0 = _mm_movelh_ps(v0, v1); // x0y0x1y1
267  hi0 = _mm_movehl_ps(v1, v0); // z0?0z1?1
268  lo1 = _mm_movelh_ps(v2, v3); // x2y2x3y3
269  hi1 = _mm_movehl_ps(v3, v2); // z2?2z3?3
270 
271  lo0 = lo0 * vLo;
272  lo1 = lo1 * vLo;
273  z = _mm_shuffle_ps(hi0, hi1, 0x88);
274  x = _mm_shuffle_ps(lo0, lo1, 0x88);
275  y = _mm_shuffle_ps(lo0, lo1, 0xdd);
276  z = z * vHi;
277  x = x + y;
278  x = x + z;
279  stack_array[index + 3] = x;
280  max = _mm_max_ps(x, max); // control the order here so that max is never NaN even if x is nan
281 
282  // It is too costly to keep the index of the max here. We will look for it again later. We save a lot of work this way.
283  }
284  }
285 
286  size_t localCount = (count & -4L) - 4 * index;
287  if (localCount)
288  {
289 #ifdef __APPLE__
290  float4 t0, t1, t2, t3, t4;
291  float4 *sap = &stack_array[index + localCount / 4];
292  vertices += localCount; // counter the offset
293  size_t byteIndex = -(localCount) * sizeof(float);
294  //AT&T Code style assembly
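 // byteIndex starts negative and counts up to zero. The vertex loads use
 // scale-4 addressing, so the +16 per iteration steps through four 16-byte
 // vertices (vertices was pre-advanced by localCount above so the negative
 // offsets land at the start of the remaining data), while the unscaled store
 // address walks stack_array one float4 of results per iteration up to sap.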
295  asm volatile(
296  ".align 4 \n\
297  0: movaps %[max], %[t2] // move max out of the way to avoid propagating NaNs in max \n\
298  movaps (%[vertices], %[byteIndex], 4), %[t0] // vertices[0] \n\
299  movaps 16(%[vertices], %[byteIndex], 4), %[t1] // vertices[1] \n\
300  movaps %[t0], %[max] // vertices[0] \n\
301  movlhps %[t1], %[max] // x0y0x1y1 \n\
302  movaps 32(%[vertices], %[byteIndex], 4), %[t3] // vertices[2] \n\
303  movaps 48(%[vertices], %[byteIndex], 4), %[t4] // vertices[3] \n\
304  mulps %[vLo], %[max] // x0y0x1y1 * vLo \n\
305  movhlps %[t0], %[t1] // z0w0z1w1 \n\
306  movaps %[t3], %[t0] // vertices[2] \n\
307  movlhps %[t4], %[t0] // x2y2x3y3 \n\
308  mulps %[vLo], %[t0] // x2y2x3y3 * vLo \n\
309  movhlps %[t3], %[t4] // z2w2z3w3 \n\
310  shufps $0x88, %[t4], %[t1] // z0z1z2z3 \n\
311  mulps %[vHi], %[t1] // z0z1z2z3 * vHi \n\
312  movaps %[max], %[t3] // x0y0x1y1 * vLo \n\
313  shufps $0x88, %[t0], %[max] // x0x1x2x3 * vLo.x \n\
314  shufps $0xdd, %[t0], %[t3] // y0y1y2y3 * vLo.y \n\
315  addps %[t3], %[max] // x + y \n\
316  addps %[t1], %[max] // x + y + z \n\
317  movaps %[max], (%[sap], %[byteIndex]) // record result for later scrutiny \n\
318  maxps %[t2], %[max] // record max, restore max \n\
319  add $16, %[byteIndex] // advance loop counter\n\
320  jnz 0b \n\
321  "
322  : [max] "+x"(max), [t0] "=&x"(t0), [t1] "=&x"(t1), [t2] "=&x"(t2), [t3] "=&x"(t3), [t4] "=&x"(t4), [byteIndex] "+r"(byteIndex)
323  : [vLo] "x"(vLo), [vHi] "x"(vHi), [vertices] "r"(vertices), [sap] "r"(sap)
324  : "memory", "cc");
325  index += localCount / 4;
326 #else
327  {
328  for (unsigned int i = 0; i < localCount / 4; i++, index++)
329  { // do four dot products at a time. Carefully avoid touching the w element.
330  float4 v0 = vertices[0];
331  float4 v1 = vertices[1];
332  float4 v2 = vertices[2];
333  float4 v3 = vertices[3];
334  vertices += 4;
335 
336  float4 lo0 = _mm_movelh_ps(v0, v1); // x0y0x1y1
337  float4 hi0 = _mm_movehl_ps(v1, v0); // z0?0z1?1
338  float4 lo1 = _mm_movelh_ps(v2, v3); // x2y2x3y3
339  float4 hi1 = _mm_movehl_ps(v3, v2); // z2?2z3?3
340 
341  lo0 = lo0 * vLo;
342  lo1 = lo1 * vLo;
343  float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
344  float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
345  float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
346  z = z * vHi;
347  x = x + y;
348  x = x + z;
349  stack_array[index] = x;
350  max = _mm_max_ps(x, max); // control the order here so that max is never NaN even if x is nan
351  }
352  }
353 #endif //__APPLE__
354  }
355 
356  // process the last few points
357  if (count & 3)
358  {
359  float4 v0, v1, v2, x, y, z;
360  switch (count & 3)
361  {
362  case 3:
363  {
364  v0 = vertices[0];
365  v1 = vertices[1];
366  v2 = vertices[2];
367 
368  // Calculate 3 dot products, transpose, duplicate v2
369  float4 lo0 = _mm_movelh_ps(v0, v1); // xyxy.lo
370  float4 hi0 = _mm_movehl_ps(v1, v0); // z?z?.lo
371  lo0 = lo0 * vLo;
372  z = _mm_shuffle_ps(hi0, v2, 0xa8); // z0z1z2z2
373  z = z * vHi;
374  float4 lo1 = _mm_movelh_ps(v2, v2); // xyxy
375  lo1 = lo1 * vLo;
376  x = _mm_shuffle_ps(lo0, lo1, 0x88);
377  y = _mm_shuffle_ps(lo0, lo1, 0xdd);
378  }
379  break;
380  case 2:
381  {
382  v0 = vertices[0];
383  v1 = vertices[1];
384  float4 xy = _mm_movelh_ps(v0, v1);
385  z = _mm_movehl_ps(v1, v0);
386  xy = xy * vLo;
387  z = _mm_shuffle_ps(z, z, 0xa8);
388  x = _mm_shuffle_ps(xy, xy, 0xa8);
389  y = _mm_shuffle_ps(xy, xy, 0xfd);
390  z = z * vHi;
391  }
392  break;
393  case 1:
394  {
395  float4 xy = vertices[0];
396  z = _mm_shuffle_ps(xy, xy, 0xaa);
397  xy = xy * vLo;
398  z = z * vHi;
399  x = _mm_shuffle_ps(xy, xy, 0);
400  y = _mm_shuffle_ps(xy, xy, 0x55);
401  }
402  break;
403  }
404  x = x + y;
405  x = x + z;
406  stack_array[index] = x;
407  max = _mm_max_ps(x, max); // control the order here so that max is never NaN even if x is nan
408  index++;
409  }
410 
411  // if we found a new max.
412  if (0 == segment || 0xf != _mm_movemask_ps((float4)_mm_cmpeq_ps(max, dotMax)))
413  { // we found a new max. Search for it
414  // find max across the max vector, place in all elements of max -- big latency hit here
415  max = _mm_max_ps(max, (float4)_mm_shuffle_ps(max, max, 0x4e));
416  max = _mm_max_ps(max, (float4)_mm_shuffle_ps(max, max, 0xb1));
417 
418  // It is slightly faster to do this part in scalar code when count < 8. However, the common case for
419  // this where it actually makes a difference is handled in the early out at the top of the function,
420  // so it is less than a 1% difference here. I opted for improved code size, fewer branches and reduced
421  // complexity, and removed it.
422 
423  dotMax = max;
424 
425  // scan for the first occurrence of max in the array
426  size_t test;
427  for (index = 0; 0 == (test = _mm_movemask_ps(_mm_cmpeq_ps(stack_array[index], max))); index++) // local_count must be a multiple of 4
428  {
429  }
430  maxIndex = 4 * index + segment + indexTable[test];
431  }
432 
433  _mm_store_ss(dotResult, dotMax);
434  return maxIndex;
435 }
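// For reference, a minimal scalar sketch (not part of the original source) of
// the computation _maxdot_large performs; the helper name below is illustrative
// only. It assumes the same layout as above: one vertex per four floats with w
// ignored, and ties resolved to the first occurrence.
static long maxdot_scalar_reference(const float *vv, const float *vec, unsigned long count, float *dotResult)
{
	long bestIndex = -1L;
	float best = -BT_INFINITY;
	for (unsigned long i = 0; i < count; i++)
	{
		const float *v = vv + 4 * i;                               // x, y, z, (unused w)
		float dot = v[0] * vec[0] + v[1] * vec[1] + v[2] * vec[2]; // 3-component dot product
		if (dot > best)                                            // strict '>' keeps the first of equal maxima
		{
			best = dot;
			bestIndex = (long)i;
		}
	}
	*dotResult = best;
	return bestIndex;
}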
436 
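// _mindot_large: same contract as _maxdot_large above, but returns the index
// of the vertex with the smallest dot product and writes that value to
// *dotResult.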
437 long _mindot_large(const float *vv, const float *vec, unsigned long count, float *dotResult);
438 
439 long _mindot_large(const float *vv, const float *vec, unsigned long count, float *dotResult)
440 {
441  const float4 *vertices = (const float4 *)vv;
442  static const unsigned char indexTable[16] = {(unsigned char)-1, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0};
443  float4 dotmin = btAssign128(BT_INFINITY, BT_INFINITY, BT_INFINITY, BT_INFINITY);
444  float4 vvec = _mm_loadu_ps(vec);
445  float4 vHi = btCastiTo128f(_mm_shuffle_epi32(btCastfTo128i(vvec), 0xaa));
446  float4 vLo = _mm_movelh_ps(vvec, vvec);
447 
448  long minIndex = -1L;
449 
450  size_t segment = 0;
451  float4 stack_array[STACK_ARRAY_COUNT];
452 
453 #if DEBUG
454  //memset( stack_array, -1, STACK_ARRAY_COUNT * sizeof(stack_array[0]) );
455 #endif
456 
457  size_t index;
458  float4 min;
459  // Faster loop without cleanup code for full tiles
460  for (segment = 0; segment + STACK_ARRAY_COUNT * 4 <= count; segment += STACK_ARRAY_COUNT * 4)
461  {
462  min = dotmin;
463 
464  for (index = 0; index < STACK_ARRAY_COUNT; index += 4)
465  { // do four dot products at a time. Carefully avoid touching the w element.
466  float4 v0 = vertices[0];
467  float4 v1 = vertices[1];
468  float4 v2 = vertices[2];
469  float4 v3 = vertices[3];
470  vertices += 4;
471 
472  float4 lo0 = _mm_movelh_ps(v0, v1); // x0y0x1y1
473  float4 hi0 = _mm_movehl_ps(v1, v0); // z0?0z1?1
474  float4 lo1 = _mm_movelh_ps(v2, v3); // x2y2x3y3
475  float4 hi1 = _mm_movehl_ps(v3, v2); // z2?2z3?3
476 
477  lo0 = lo0 * vLo;
478  lo1 = lo1 * vLo;
479  float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
480  float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
481  float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
482  z = z * vHi;
483  x = x + y;
484  x = x + z;
485  stack_array[index] = x;
486  min = _mm_min_ps(x, min); // control the order here so that min is never NaN even if x is nan
487 
488  v0 = vertices[0];
489  v1 = vertices[1];
490  v2 = vertices[2];
491  v3 = vertices[3];
492  vertices += 4;
493 
494  lo0 = _mm_movelh_ps(v0, v1); // x0y0x1y1
495  hi0 = _mm_movehl_ps(v1, v0); // z0?0z1?1
496  lo1 = _mm_movelh_ps(v2, v3); // x2y2x3y3
497  hi1 = _mm_movehl_ps(v3, v2); // z2?2z3?3
498 
499  lo0 = lo0 * vLo;
500  lo1 = lo1 * vLo;
501  z = _mm_shuffle_ps(hi0, hi1, 0x88);
502  x = _mm_shuffle_ps(lo0, lo1, 0x88);
503  y = _mm_shuffle_ps(lo0, lo1, 0xdd);
504  z = z * vHi;
505  x = x + y;
506  x = x + z;
507  stack_array[index + 1] = x;
508  min = _mm_min_ps(x, min); // control the order here so that min is never NaN even if x is nan
509 
510  v0 = vertices[0];
511  v1 = vertices[1];
512  v2 = vertices[2];
513  v3 = vertices[3];
514  vertices += 4;
515 
516  lo0 = _mm_movelh_ps(v0, v1); // x0y0x1y1
517  hi0 = _mm_movehl_ps(v1, v0); // z0?0z1?1
518  lo1 = _mm_movelh_ps(v2, v3); // x2y2x3y3
519  hi1 = _mm_movehl_ps(v3, v2); // z2?2z3?3
520 
521  lo0 = lo0 * vLo;
522  lo1 = lo1 * vLo;
523  z = _mm_shuffle_ps(hi0, hi1, 0x88);
524  x = _mm_shuffle_ps(lo0, lo1, 0x88);
525  y = _mm_shuffle_ps(lo0, lo1, 0xdd);
526  z = z * vHi;
527  x = x + y;
528  x = x + z;
529  stack_array[index + 2] = x;
530  min = _mm_min_ps(x, min); // control the order here so that min is never NaN even if x is nan
531 
532  v0 = vertices[0];
533  v1 = vertices[1];
534  v2 = vertices[2];
535  v3 = vertices[3];
536  vertices += 4;
537 
538  lo0 = _mm_movelh_ps(v0, v1); // x0y0x1y1
539  hi0 = _mm_movehl_ps(v1, v0); // z0?0z1?1
540  lo1 = _mm_movelh_ps(v2, v3); // x2y2x3y3
541  hi1 = _mm_movehl_ps(v3, v2); // z2?2z3?3
542 
543  lo0 = lo0 * vLo;
544  lo1 = lo1 * vLo;
545  z = _mm_shuffle_ps(hi0, hi1, 0x88);
546  x = _mm_shuffle_ps(lo0, lo1, 0x88);
547  y = _mm_shuffle_ps(lo0, lo1, 0xdd);
548  z = z * vHi;
549  x = x + y;
550  x = x + z;
551  stack_array[index + 3] = x;
552  min = _mm_min_ps(x, min); // control the order here so that min is never NaN even if x is nan
553 
554  // It is too costly to keep the index of the min here. We will look for it again later. We save a lot of work this way.
555  }
556 
557  // If we found a new min
558  if (0xf != _mm_movemask_ps((float4)_mm_cmpeq_ps(min, dotmin)))
559  {
560  // copy the new min across all lanes of our min accumulator
561  min = _mm_min_ps(min, (float4)_mm_shuffle_ps(min, min, 0x4e));
562  min = _mm_min_ps(min, (float4)_mm_shuffle_ps(min, min, 0xb1));
563 
564  dotmin = min;
565 
566  // find first occurrence of that min
567  size_t test;
568  for (index = 0; 0 == (test = _mm_movemask_ps(_mm_cmpeq_ps(stack_array[index], min))); index++) // local_count must be a multiple of 4
569  {
570  }
571  // record where it is.
572  minIndex = 4 * index + segment + indexTable[test];
573  }
574  }
575 
576  // account for work we've already done
577  count -= segment;
578 
579  // Deal with the last < STACK_ARRAY_COUNT * 4 vectors
580  min = dotmin;
581  index = 0;
582 
583  if (btUnlikely(count > 16))
584  {
585  for (; index + 4 <= count / 4; index += 4)
586  { // do four dot products at a time. Carefully avoid touching the w element.
587  float4 v0 = vertices[0];
588  float4 v1 = vertices[1];
589  float4 v2 = vertices[2];
590  float4 v3 = vertices[3];
591  vertices += 4;
592 
593  float4 lo0 = _mm_movelh_ps(v0, v1); // x0y0x1y1
594  float4 hi0 = _mm_movehl_ps(v1, v0); // z0?0z1?1
595  float4 lo1 = _mm_movelh_ps(v2, v3); // x2y2x3y3
596  float4 hi1 = _mm_movehl_ps(v3, v2); // z2?2z3?3
597 
598  lo0 = lo0 * vLo;
599  lo1 = lo1 * vLo;
600  float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
601  float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
602  float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
603  z = z * vHi;
604  x = x + y;
605  x = x + z;
606  stack_array[index] = x;
607  min = _mm_min_ps(x, min); // control the order here so that min is never NaN even if x is nan
608 
609  v0 = vertices[0];
610  v1 = vertices[1];
611  v2 = vertices[2];
612  v3 = vertices[3];
613  vertices += 4;
614 
615  lo0 = _mm_movelh_ps(v0, v1); // x0y0x1y1
616  hi0 = _mm_movehl_ps(v1, v0); // z0?0z1?1
617  lo1 = _mm_movelh_ps(v2, v3); // x2y2x3y3
618  hi1 = _mm_movehl_ps(v3, v2); // z2?2z3?3
619 
620  lo0 = lo0 * vLo;
621  lo1 = lo1 * vLo;
622  z = _mm_shuffle_ps(hi0, hi1, 0x88);
623  x = _mm_shuffle_ps(lo0, lo1, 0x88);
624  y = _mm_shuffle_ps(lo0, lo1, 0xdd);
625  z = z * vHi;
626  x = x + y;
627  x = x + z;
628  stack_array[index + 1] = x;
629  min = _mm_min_ps(x, min); // control the order here so that min is never NaN even if x is nan
630 
631  v0 = vertices[0];
632  v1 = vertices[1];
633  v2 = vertices[2];
634  v3 = vertices[3];
635  vertices += 4;
636 
637  lo0 = _mm_movelh_ps(v0, v1); // x0y0x1y1
638  hi0 = _mm_movehl_ps(v1, v0); // z0?0z1?1
639  lo1 = _mm_movelh_ps(v2, v3); // x2y2x3y3
640  hi1 = _mm_movehl_ps(v3, v2); // z2?2z3?3
641 
642  lo0 = lo0 * vLo;
643  lo1 = lo1 * vLo;
644  z = _mm_shuffle_ps(hi0, hi1, 0x88);
645  x = _mm_shuffle_ps(lo0, lo1, 0x88);
646  y = _mm_shuffle_ps(lo0, lo1, 0xdd);
647  z = z * vHi;
648  x = x + y;
649  x = x + z;
650  stack_array[index + 2] = x;
651  min = _mm_min_ps(x, min); // control the order here so that min is never NaN even if x is nan
652 
653  v0 = vertices[0];
654  v1 = vertices[1];
655  v2 = vertices[2];
656  v3 = vertices[3];
657  vertices += 4;
658 
659  lo0 = _mm_movelh_ps(v0, v1); // x0y0x1y1
660  hi0 = _mm_movehl_ps(v1, v0); // z0?0z1?1
661  lo1 = _mm_movelh_ps(v2, v3); // x2y2x3y3
662  hi1 = _mm_movehl_ps(v3, v2); // z2?2z3?3
663 
664  lo0 = lo0 * vLo;
665  lo1 = lo1 * vLo;
666  z = _mm_shuffle_ps(hi0, hi1, 0x88);
667  x = _mm_shuffle_ps(lo0, lo1, 0x88);
668  y = _mm_shuffle_ps(lo0, lo1, 0xdd);
669  z = z * vHi;
670  x = x + y;
671  x = x + z;
672  stack_array[index + 3] = x;
673  min = _mm_min_ps(x, min); // control the order here so that min is never NaN even if x is nan
674 
675  // It is too costly to keep the index of the min here. We will look for it again later. We save a lot of work this way.
676  }
677  }
678 
679  size_t localCount = (count & -4L) - 4 * index;
680  if (localCount)
681  {
682 #ifdef __APPLE__
683  vertices += localCount; // counter the offset
684  float4 t0, t1, t2, t3, t4;
685  size_t byteIndex = -(localCount) * sizeof(float);
686  float4 *sap = &stack_array[index + localCount / 4];
687 
688  asm volatile(
689  ".align 4 \n\
690  0: movaps %[min], %[t2] // move min out of the way to avoid propagating NaNs in min \n\
691  movaps (%[vertices], %[byteIndex], 4), %[t0] // vertices[0] \n\
692  movaps 16(%[vertices], %[byteIndex], 4), %[t1] // vertices[1] \n\
693  movaps %[t0], %[min] // vertices[0] \n\
694  movlhps %[t1], %[min] // x0y0x1y1 \n\
695  movaps 32(%[vertices], %[byteIndex], 4), %[t3] // vertices[2] \n\
696  movaps 48(%[vertices], %[byteIndex], 4), %[t4] // vertices[3] \n\
697  mulps %[vLo], %[min] // x0y0x1y1 * vLo \n\
698  movhlps %[t0], %[t1] // z0w0z1w1 \n\
699  movaps %[t3], %[t0] // vertices[2] \n\
700  movlhps %[t4], %[t0] // x2y2x3y3 \n\
701  movhlps %[t3], %[t4] // z2w2z3w3 \n\
702  mulps %[vLo], %[t0] // x2y2x3y3 * vLo \n\
703  shufps $0x88, %[t4], %[t1] // z0z1z2z3 \n\
704  mulps %[vHi], %[t1] // z0z1z2z3 * vHi \n\
705  movaps %[min], %[t3] // x0y0x1y1 * vLo \n\
706  shufps $0x88, %[t0], %[min] // x0x1x2x3 * vLo.x \n\
707  shufps $0xdd, %[t0], %[t3] // y0y1y2y3 * vLo.y \n\
708  addps %[t3], %[min] // x + y \n\
709  addps %[t1], %[min] // x + y + z \n\
710  movaps %[min], (%[sap], %[byteIndex]) // record result for later scrutiny \n\
711  minps %[t2], %[min] // record min, restore min \n\
712  add $16, %[byteIndex] // advance loop counter\n\
713  jnz 0b \n\
714  "
715  : [min] "+x"(min), [t0] "=&x"(t0), [t1] "=&x"(t1), [t2] "=&x"(t2), [t3] "=&x"(t3), [t4] "=&x"(t4), [byteIndex] "+r"(byteIndex)
716  : [vLo] "x"(vLo), [vHi] "x"(vHi), [vertices] "r"(vertices), [sap] "r"(sap)
717  : "memory", "cc");
718  index += localCount / 4;
719 #else
720  {
721  for (unsigned int i = 0; i < localCount / 4; i++, index++)
722  { // do four dot products at a time. Carefully avoid touching the w element.
723  float4 v0 = vertices[0];
724  float4 v1 = vertices[1];
725  float4 v2 = vertices[2];
726  float4 v3 = vertices[3];
727  vertices += 4;
728 
729  float4 lo0 = _mm_movelh_ps(v0, v1); // x0y0x1y1
730  float4 hi0 = _mm_movehl_ps(v1, v0); // z0?0z1?1
731  float4 lo1 = _mm_movelh_ps(v2, v3); // x2y2x3y3
732  float4 hi1 = _mm_movehl_ps(v3, v2); // z2?2z3?3
733 
734  lo0 = lo0 * vLo;
735  lo1 = lo1 * vLo;
736  float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
737  float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
738  float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
739  z = z * vHi;
740  x = x + y;
741  x = x + z;
742  stack_array[index] = x;
743  min = _mm_min_ps(x, min); // control the order here so that min is never NaN even if x is nan
744  }
745  }
746 
747 #endif
748  }
749 
750  // process the last few points
751  if (count & 3)
752  {
753  float4 v0, v1, v2, x, y, z;
754  switch (count & 3)
755  {
756  case 3:
757  {
758  v0 = vertices[0];
759  v1 = vertices[1];
760  v2 = vertices[2];
761 
762  // Calculate 3 dot products, transpose, duplicate v2
763  float4 lo0 = _mm_movelh_ps(v0, v1); // xyxy.lo
764  float4 hi0 = _mm_movehl_ps(v1, v0); // z?z?.lo
765  lo0 = lo0 * vLo;
766  z = _mm_shuffle_ps(hi0, v2, 0xa8); // z0z1z2z2
767  z = z * vHi;
768  float4 lo1 = _mm_movelh_ps(v2, v2); // xyxy
769  lo1 = lo1 * vLo;
770  x = _mm_shuffle_ps(lo0, lo1, 0x88);
771  y = _mm_shuffle_ps(lo0, lo1, 0xdd);
772  }
773  break;
774  case 2:
775  {
776  v0 = vertices[0];
777  v1 = vertices[1];
778  float4 xy = _mm_movelh_ps(v0, v1);
779  z = _mm_movehl_ps(v1, v0);
780  xy = xy * vLo;
781  z = _mm_shuffle_ps(z, z, 0xa8);
782  x = _mm_shuffle_ps(xy, xy, 0xa8);
783  y = _mm_shuffle_ps(xy, xy, 0xfd);
784  z = z * vHi;
785  }
786  break;
787  case 1:
788  {
789  float4 xy = vertices[0];
790  z = _mm_shuffle_ps(xy, xy, 0xaa);
791  xy = xy * vLo;
792  z = z * vHi;
793  x = _mm_shuffle_ps(xy, xy, 0);
794  y = _mm_shuffle_ps(xy, xy, 0x55);
795  }
796  break;
797  }
798  x = x + y;
799  x = x + z;
800  stack_array[index] = x;
801  min = _mm_min_ps(x, min); // control the order here so that min is never NaN even if x is nan
802  index++;
803  }
804 
805  // if we found a new min.
806  if (0 == segment || 0xf != _mm_movemask_ps((float4)_mm_cmpeq_ps(min, dotmin)))
807  { // we found a new min. Search for it
808  // find min across the min vector, place in all elements of min -- big latency hit here
809  min = _mm_min_ps(min, (float4)_mm_shuffle_ps(min, min, 0x4e));
810  min = _mm_min_ps(min, (float4)_mm_shuffle_ps(min, min, 0xb1));
811 
812  // It is slightly faster to do this part in scalar code when count < 8. However, the common case for
813  // this where it actually makes a difference is handled in the early out at the top of the function,
814  // so it is less than a 1% difference here. I opted for improved code size, fewer branches and reduced
815  // complexity, and removed it.
816 
817  dotmin = min;
818 
819  // scan for the first occurrence of min in the array
820  size_t test;
821  for (index = 0; 0 == (test = _mm_movemask_ps(_mm_cmpeq_ps(stack_array[index], min))); index++) // local_count must be a multiple of 4
822  {
823  }
824  minIndex = 4 * index + segment + indexTable[test];
825  }
826 
827  _mm_store_ss(dotResult, dotmin);
828  return minIndex;
829 }
830 
831 #elif defined BT_USE_NEON
832 
833 #define ARM_NEON_GCC_COMPATIBILITY 1
834 #include <arm_neon.h>
835 #include <sys/types.h>
836 #include <sys/sysctl.h> //for sysctlbyname
837 
838 static long _maxdot_large_v0(const float *vv, const float *vec, unsigned long count, float *dotResult);
839 static long _maxdot_large_v1(const float *vv, const float *vec, unsigned long count, float *dotResult);
840 static long _maxdot_large_sel(const float *vv, const float *vec, unsigned long count, float *dotResult);
841 static long _mindot_large_v0(const float *vv, const float *vec, unsigned long count, float *dotResult);
842 static long _mindot_large_v1(const float *vv, const float *vec, unsigned long count, float *dotResult);
843 static long _mindot_large_sel(const float *vv, const float *vec, unsigned long count, float *dotResult);
844 
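// _maxdot_large and _mindot_large are exposed as function pointers here. They
// initially point at the _sel variants, which probe the CPU once (via the
// hw.optional.neon_hpfp sysctl) and then rebind the pointer to either the
// D-register (_v0) or Q-register (_v1) implementation before forwarding the
// first call, so later calls dispatch directly.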
845 long (*_maxdot_large)(const float *vv, const float *vec, unsigned long count, float *dotResult) = _maxdot_large_sel;
846 long (*_mindot_large)(const float *vv, const float *vec, unsigned long count, float *dotResult) = _mindot_large_sel;
847 
848 static inline uint32_t btGetCpuCapabilities(void)
849 {
850  static uint32_t capabilities = 0;
851  static bool testedCapabilities = false;
852 
853  if (0 == testedCapabilities)
854  {
855  uint32_t hasFeature = 0;
856  size_t featureSize = sizeof(hasFeature);
857  int err = sysctlbyname("hw.optional.neon_hpfp", &hasFeature, &featureSize, NULL, 0);
858 
859  if (0 == err && hasFeature)
860  capabilities |= 0x2000;
861 
862  testedCapabilities = true;
863  }
864 
865  return capabilities;
866 }
867 
868 static long _maxdot_large_sel(const float *vv, const float *vec, unsigned long count, float *dotResult)
869 {
870  if (btGetCpuCapabilities() & 0x2000)
871  _maxdot_large = _maxdot_large_v1;
872  else
873  _maxdot_large = _maxdot_large_v0;
874 
875  return _maxdot_large(vv, vec, count, dotResult);
876 }
877 
878 static long _mindot_large_sel(const float *vv, const float *vec, unsigned long count, float *dotResult)
879 {
880  if (btGetCpuCapabilities() & 0x2000)
881  _mindot_large = _mindot_large_v1;
882  else
883  _mindot_large = _mindot_large_v0;
884 
885  return _mindot_large(vv, vec, count, dotResult);
886 }
887 
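// vld1q_f32_aligned_postincrement loads one 16-byte-aligned float32x4_t and
// advances the pointer past it: 32-bit ARM uses the post-incrementing
// vld1.f32 [...,:128]! form in inline assembly, while other targets (64-bit
// ARM) fall back to a plain load followed by a pointer bump.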
888 #if defined __arm__
889 #define vld1q_f32_aligned_postincrement(_ptr) ({ float32x4_t _r; asm( "vld1.f32 {%0}, [%1, :128]!\n" : "=w" (_r), "+r" (_ptr) ); /*return*/ _r; })
890 #else
891 //support 64bit arm
892 #define vld1q_f32_aligned_postincrement(_ptr) ({ float32x4_t _r = ((float32x4_t*)(_ptr))[0]; (_ptr) = (const float*) ((const char*)(_ptr) + 16L); /*return*/ _r; })
893 #endif
894 
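// _maxdot_large_v0: 64-bit (D register) NEON path. Each step forms two dot
// products per half by pair-adding the x/y products and adding the z term,
// and tracks the per-lane running maximum together with its vertex index via
// vbsl selects; the two lane pairs are reduced to a single result at the end.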
895 long _maxdot_large_v0(const float *vv, const float *vec, unsigned long count, float *dotResult)
896 {
897  unsigned long i = 0;
898  float32x4_t vvec = vld1q_f32_aligned_postincrement(vec);
899  float32x2_t vLo = vget_low_f32(vvec);
900  float32x2_t vHi = vdup_lane_f32(vget_high_f32(vvec), 0);
901  float32x2_t dotMaxLo = (float32x2_t){-BT_INFINITY, -BT_INFINITY};
902  float32x2_t dotMaxHi = (float32x2_t){-BT_INFINITY, -BT_INFINITY};
903  uint32x2_t indexLo = (uint32x2_t){0, 1};
904  uint32x2_t indexHi = (uint32x2_t){2, 3};
905  uint32x2_t iLo = (uint32x2_t){static_cast<uint32_t>(-1), static_cast<uint32_t>(-1)};
906  uint32x2_t iHi = (uint32x2_t){static_cast<uint32_t>(-1), static_cast<uint32_t>(-1)};
907  const uint32x2_t four = (uint32x2_t){4, 4};
908 
909  for (; i + 8 <= count; i += 8)
910  {
911  float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
912  float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
913  float32x4_t v2 = vld1q_f32_aligned_postincrement(vv);
914  float32x4_t v3 = vld1q_f32_aligned_postincrement(vv);
915 
916  float32x2_t xy0 = vmul_f32(vget_low_f32(v0), vLo);
917  float32x2_t xy1 = vmul_f32(vget_low_f32(v1), vLo);
918  float32x2_t xy2 = vmul_f32(vget_low_f32(v2), vLo);
919  float32x2_t xy3 = vmul_f32(vget_low_f32(v3), vLo);
920 
921  float32x2x2_t z0 = vtrn_f32(vget_high_f32(v0), vget_high_f32(v1));
922  float32x2x2_t z1 = vtrn_f32(vget_high_f32(v2), vget_high_f32(v3));
923  float32x2_t zLo = vmul_f32(z0.val[0], vHi);
924  float32x2_t zHi = vmul_f32(z1.val[0], vHi);
925 
926  float32x2_t rLo = vpadd_f32(xy0, xy1);
927  float32x2_t rHi = vpadd_f32(xy2, xy3);
928  rLo = vadd_f32(rLo, zLo);
929  rHi = vadd_f32(rHi, zHi);
930 
931  uint32x2_t maskLo = vcgt_f32(rLo, dotMaxLo);
932  uint32x2_t maskHi = vcgt_f32(rHi, dotMaxHi);
933  dotMaxLo = vbsl_f32(maskLo, rLo, dotMaxLo);
934  dotMaxHi = vbsl_f32(maskHi, rHi, dotMaxHi);
935  iLo = vbsl_u32(maskLo, indexLo, iLo);
936  iHi = vbsl_u32(maskHi, indexHi, iHi);
937  indexLo = vadd_u32(indexLo, four);
938  indexHi = vadd_u32(indexHi, four);
939 
940  v0 = vld1q_f32_aligned_postincrement(vv);
941  v1 = vld1q_f32_aligned_postincrement(vv);
942  v2 = vld1q_f32_aligned_postincrement(vv);
943  v3 = vld1q_f32_aligned_postincrement(vv);
944 
945  xy0 = vmul_f32(vget_low_f32(v0), vLo);
946  xy1 = vmul_f32(vget_low_f32(v1), vLo);
947  xy2 = vmul_f32(vget_low_f32(v2), vLo);
948  xy3 = vmul_f32(vget_low_f32(v3), vLo);
949 
950  z0 = vtrn_f32(vget_high_f32(v0), vget_high_f32(v1));
951  z1 = vtrn_f32(vget_high_f32(v2), vget_high_f32(v3));
952  zLo = vmul_f32(z0.val[0], vHi);
953  zHi = vmul_f32(z1.val[0], vHi);
954 
955  rLo = vpadd_f32(xy0, xy1);
956  rHi = vpadd_f32(xy2, xy3);
957  rLo = vadd_f32(rLo, zLo);
958  rHi = vadd_f32(rHi, zHi);
959 
960  maskLo = vcgt_f32(rLo, dotMaxLo);
961  maskHi = vcgt_f32(rHi, dotMaxHi);
962  dotMaxLo = vbsl_f32(maskLo, rLo, dotMaxLo);
963  dotMaxHi = vbsl_f32(maskHi, rHi, dotMaxHi);
964  iLo = vbsl_u32(maskLo, indexLo, iLo);
965  iHi = vbsl_u32(maskHi, indexHi, iHi);
966  indexLo = vadd_u32(indexLo, four);
967  indexHi = vadd_u32(indexHi, four);
968  }
969 
970  for (; i + 4 <= count; i += 4)
971  {
972  float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
973  float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
974  float32x4_t v2 = vld1q_f32_aligned_postincrement(vv);
975  float32x4_t v3 = vld1q_f32_aligned_postincrement(vv);
976 
977  float32x2_t xy0 = vmul_f32(vget_low_f32(v0), vLo);
978  float32x2_t xy1 = vmul_f32(vget_low_f32(v1), vLo);
979  float32x2_t xy2 = vmul_f32(vget_low_f32(v2), vLo);
980  float32x2_t xy3 = vmul_f32(vget_low_f32(v3), vLo);
981 
982  float32x2x2_t z0 = vtrn_f32(vget_high_f32(v0), vget_high_f32(v1));
983  float32x2x2_t z1 = vtrn_f32(vget_high_f32(v2), vget_high_f32(v3));
984  float32x2_t zLo = vmul_f32(z0.val[0], vHi);
985  float32x2_t zHi = vmul_f32(z1.val[0], vHi);
986 
987  float32x2_t rLo = vpadd_f32(xy0, xy1);
988  float32x2_t rHi = vpadd_f32(xy2, xy3);
989  rLo = vadd_f32(rLo, zLo);
990  rHi = vadd_f32(rHi, zHi);
991 
992  uint32x2_t maskLo = vcgt_f32(rLo, dotMaxLo);
993  uint32x2_t maskHi = vcgt_f32(rHi, dotMaxHi);
994  dotMaxLo = vbsl_f32(maskLo, rLo, dotMaxLo);
995  dotMaxHi = vbsl_f32(maskHi, rHi, dotMaxHi);
996  iLo = vbsl_u32(maskLo, indexLo, iLo);
997  iHi = vbsl_u32(maskHi, indexHi, iHi);
998  indexLo = vadd_u32(indexLo, four);
999  indexHi = vadd_u32(indexHi, four);
1000  }
1001 
1002  switch (count & 3)
1003  {
1004  case 3:
1005  {
1006  float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
1007  float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
1008  float32x4_t v2 = vld1q_f32_aligned_postincrement(vv);
1009 
1010  float32x2_t xy0 = vmul_f32(vget_low_f32(v0), vLo);
1011  float32x2_t xy1 = vmul_f32(vget_low_f32(v1), vLo);
1012  float32x2_t xy2 = vmul_f32(vget_low_f32(v2), vLo);
1013 
1014  float32x2x2_t z0 = vtrn_f32(vget_high_f32(v0), vget_high_f32(v1));
1015  float32x2_t zLo = vmul_f32(z0.val[0], vHi);
1016  float32x2_t zHi = vmul_f32(vdup_lane_f32(vget_high_f32(v2), 0), vHi);
1017 
1018  float32x2_t rLo = vpadd_f32(xy0, xy1);
1019  float32x2_t rHi = vpadd_f32(xy2, xy2);
1020  rLo = vadd_f32(rLo, zLo);
1021  rHi = vadd_f32(rHi, zHi);
1022 
1023  uint32x2_t maskLo = vcgt_f32(rLo, dotMaxLo);
1024  uint32x2_t maskHi = vcgt_f32(rHi, dotMaxHi);
1025  dotMaxLo = vbsl_f32(maskLo, rLo, dotMaxLo);
1026  dotMaxHi = vbsl_f32(maskHi, rHi, dotMaxHi);
1027  iLo = vbsl_u32(maskLo, indexLo, iLo);
1028  iHi = vbsl_u32(maskHi, indexHi, iHi);
1029  }
1030  break;
1031  case 2:
1032  {
1033  float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
1034  float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
1035 
1036  float32x2_t xy0 = vmul_f32(vget_low_f32(v0), vLo);
1037  float32x2_t xy1 = vmul_f32(vget_low_f32(v1), vLo);
1038 
1039  float32x2x2_t z0 = vtrn_f32(vget_high_f32(v0), vget_high_f32(v1));
1040  float32x2_t zLo = vmul_f32(z0.val[0], vHi);
1041 
1042  float32x2_t rLo = vpadd_f32(xy0, xy1);
1043  rLo = vadd_f32(rLo, zLo);
1044 
1045  uint32x2_t maskLo = vcgt_f32(rLo, dotMaxLo);
1046  dotMaxLo = vbsl_f32(maskLo, rLo, dotMaxLo);
1047  iLo = vbsl_u32(maskLo, indexLo, iLo);
1048  }
1049  break;
1050  case 1:
1051  {
1052  float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
1053  float32x2_t xy0 = vmul_f32(vget_low_f32(v0), vLo);
1054  float32x2_t z0 = vdup_lane_f32(vget_high_f32(v0), 0);
1055  float32x2_t zLo = vmul_f32(z0, vHi);
1056  float32x2_t rLo = vpadd_f32(xy0, xy0);
1057  rLo = vadd_f32(rLo, zLo);
1058  uint32x2_t maskLo = vcgt_f32(rLo, dotMaxLo);
1059  dotMaxLo = vbsl_f32(maskLo, rLo, dotMaxLo);
1060  iLo = vbsl_u32(maskLo, indexLo, iLo);
1061  }
1062  break;
1063 
1064  default:
1065  break;
1066  }
1067 
1068  // select best answer between hi and lo results
1069  uint32x2_t mask = vcgt_f32(dotMaxHi, dotMaxLo);
1070  dotMaxLo = vbsl_f32(mask, dotMaxHi, dotMaxLo);
1071  iLo = vbsl_u32(mask, iHi, iLo);
1072 
1073  // select best answer between even and odd results
1074  dotMaxHi = vdup_lane_f32(dotMaxLo, 1);
1075  iHi = vdup_lane_u32(iLo, 1);
1076  mask = vcgt_f32(dotMaxHi, dotMaxLo);
1077  dotMaxLo = vbsl_f32(mask, dotMaxHi, dotMaxLo);
1078  iLo = vbsl_u32(mask, iHi, iLo);
1079 
1080  *dotResult = vget_lane_f32(dotMaxLo, 0);
1081  return vget_lane_u32(iLo, 0);
1082 }
1083 
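// _maxdot_large_v1: 128-bit (Q register) NEON path. Four vertices are
// processed per step by unzipping the x, y and z terms across quad registers,
// with a running four-lane maximum and matching index vector that are reduced
// to a scalar result at the end.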
1084 long _maxdot_large_v1(const float *vv, const float *vec, unsigned long count, float *dotResult)
1085 {
1086  float32x4_t vvec = vld1q_f32_aligned_postincrement(vec);
1087  float32x4_t vLo = vcombine_f32(vget_low_f32(vvec), vget_low_f32(vvec));
1088  float32x4_t vHi = vdupq_lane_f32(vget_high_f32(vvec), 0);
1089  const uint32x4_t four = (uint32x4_t){4, 4, 4, 4};
1090  uint32x4_t local_index = (uint32x4_t){0, 1, 2, 3};
1091  uint32x4_t index = (uint32x4_t){static_cast<uint32_t>(-1), static_cast<uint32_t>(-1), static_cast<uint32_t>(-1), static_cast<uint32_t>(-1)};
1092  float32x4_t maxDot = (float32x4_t){-BT_INFINITY, -BT_INFINITY, -BT_INFINITY, -BT_INFINITY};
1093 
1094  unsigned long i = 0;
1095  for (; i + 8 <= count; i += 8)
1096  {
1097  float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
1098  float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
1099  float32x4_t v2 = vld1q_f32_aligned_postincrement(vv);
1100  float32x4_t v3 = vld1q_f32_aligned_postincrement(vv);
1101 
1102  // the next two lines should resolve to a single vswp d, d
1103  float32x4_t xy0 = vcombine_f32(vget_low_f32(v0), vget_low_f32(v1));
1104  float32x4_t xy1 = vcombine_f32(vget_low_f32(v2), vget_low_f32(v3));
1105  // the next two lines should resolve to a single vswp d, d
1106  float32x4_t z0 = vcombine_f32(vget_high_f32(v0), vget_high_f32(v1));
1107  float32x4_t z1 = vcombine_f32(vget_high_f32(v2), vget_high_f32(v3));
1108 
1109  xy0 = vmulq_f32(xy0, vLo);
1110  xy1 = vmulq_f32(xy1, vLo);
1111 
1112  float32x4x2_t zb = vuzpq_f32(z0, z1);
1113  float32x4_t z = vmulq_f32(zb.val[0], vHi);
1114  float32x4x2_t xy = vuzpq_f32(xy0, xy1);
1115  float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
1116  x = vaddq_f32(x, z);
1117 
1118  uint32x4_t mask = vcgtq_f32(x, maxDot);
1119  maxDot = vbslq_f32(mask, x, maxDot);
1120  index = vbslq_u32(mask, local_index, index);
1121  local_index = vaddq_u32(local_index, four);
1122 
1123  v0 = vld1q_f32_aligned_postincrement(vv);
1124  v1 = vld1q_f32_aligned_postincrement(vv);
1125  v2 = vld1q_f32_aligned_postincrement(vv);
1126  v3 = vld1q_f32_aligned_postincrement(vv);
1127 
1128  // the next two lines should resolve to a single vswp d, d
1129  xy0 = vcombine_f32(vget_low_f32(v0), vget_low_f32(v1));
1130  xy1 = vcombine_f32(vget_low_f32(v2), vget_low_f32(v3));
1131  // the next two lines should resolve to a single vswp d, d
1132  z0 = vcombine_f32(vget_high_f32(v0), vget_high_f32(v1));
1133  z1 = vcombine_f32(vget_high_f32(v2), vget_high_f32(v3));
1134 
1135  xy0 = vmulq_f32(xy0, vLo);
1136  xy1 = vmulq_f32(xy1, vLo);
1137 
1138  zb = vuzpq_f32(z0, z1);
1139  z = vmulq_f32(zb.val[0], vHi);
1140  xy = vuzpq_f32(xy0, xy1);
1141  x = vaddq_f32(xy.val[0], xy.val[1]);
1142  x = vaddq_f32(x, z);
1143 
1144  mask = vcgtq_f32(x, maxDot);
1145  maxDot = vbslq_f32(mask, x, maxDot);
1146  index = vbslq_u32(mask, local_index, index);
1147  local_index = vaddq_u32(local_index, four);
1148  }
1149 
1150  for (; i + 4 <= count; i += 4)
1151  {
1152  float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
1153  float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
1154  float32x4_t v2 = vld1q_f32_aligned_postincrement(vv);
1155  float32x4_t v3 = vld1q_f32_aligned_postincrement(vv);
1156 
1157  // the next two lines should resolve to a single vswp d, d
1158  float32x4_t xy0 = vcombine_f32(vget_low_f32(v0), vget_low_f32(v1));
1159  float32x4_t xy1 = vcombine_f32(vget_low_f32(v2), vget_low_f32(v3));
1160  // the next two lines should resolve to a single vswp d, d
1161  float32x4_t z0 = vcombine_f32(vget_high_f32(v0), vget_high_f32(v1));
1162  float32x4_t z1 = vcombine_f32(vget_high_f32(v2), vget_high_f32(v3));
1163 
1164  xy0 = vmulq_f32(xy0, vLo);
1165  xy1 = vmulq_f32(xy1, vLo);
1166 
1167  float32x4x2_t zb = vuzpq_f32(z0, z1);
1168  float32x4_t z = vmulq_f32(zb.val[0], vHi);
1169  float32x4x2_t xy = vuzpq_f32(xy0, xy1);
1170  float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
1171  x = vaddq_f32(x, z);
1172 
1173  uint32x4_t mask = vcgtq_f32(x, maxDot);
1174  maxDot = vbslq_f32(mask, x, maxDot);
1175  index = vbslq_u32(mask, local_index, index);
1176  local_index = vaddq_u32(local_index, four);
1177  }
1178 
1179  switch (count & 3)
1180  {
1181  case 3:
1182  {
1183  float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
1184  float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
1185  float32x4_t v2 = vld1q_f32_aligned_postincrement(vv);
1186 
1187  // the next two lines should resolve to a single vswp d, d
1188  float32x4_t xy0 = vcombine_f32(vget_low_f32(v0), vget_low_f32(v1));
1189  float32x4_t xy1 = vcombine_f32(vget_low_f32(v2), vget_low_f32(v2));
1190  // the next two lines should resolve to a single vswp d, d
1191  float32x4_t z0 = vcombine_f32(vget_high_f32(v0), vget_high_f32(v1));
1192  float32x4_t z1 = vcombine_f32(vget_high_f32(v2), vget_high_f32(v2));
1193 
1194  xy0 = vmulq_f32(xy0, vLo);
1195  xy1 = vmulq_f32(xy1, vLo);
1196 
1197  float32x4x2_t zb = vuzpq_f32(z0, z1);
1198  float32x4_t z = vmulq_f32(zb.val[0], vHi);
1199  float32x4x2_t xy = vuzpq_f32(xy0, xy1);
1200  float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
1201  x = vaddq_f32(x, z);
1202 
1203  uint32x4_t mask = vcgtq_f32(x, maxDot);
1204  maxDot = vbslq_f32(mask, x, maxDot);
1205  index = vbslq_u32(mask, local_index, index);
1206  local_index = vaddq_u32(local_index, four);
1207  }
1208  break;
1209 
1210  case 2:
1211  {
1212  float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
1213  float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
1214 
1215  // the next two lines should resolve to a single vswp d, d
1216  float32x4_t xy0 = vcombine_f32(vget_low_f32(v0), vget_low_f32(v1));
1217  // the next two lines should resolve to a single vswp d, d
1218  float32x4_t z0 = vcombine_f32(vget_high_f32(v0), vget_high_f32(v1));
1219 
1220  xy0 = vmulq_f32(xy0, vLo);
1221 
1222  float32x4x2_t zb = vuzpq_f32(z0, z0);
1223  float32x4_t z = vmulq_f32(zb.val[0], vHi);
1224  float32x4x2_t xy = vuzpq_f32(xy0, xy0);
1225  float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
1226  x = vaddq_f32(x, z);
1227 
1228  uint32x4_t mask = vcgtq_f32(x, maxDot);
1229  maxDot = vbslq_f32(mask, x, maxDot);
1230  index = vbslq_u32(mask, local_index, index);
1231  local_index = vaddq_u32(local_index, four);
1232  }
1233  break;
1234 
1235  case 1:
1236  {
1237  float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
1238 
1239  // the next two lines should resolve to a single vswp d, d
1240  float32x4_t xy0 = vcombine_f32(vget_low_f32(v0), vget_low_f32(v0));
1241  // the next two lines should resolve to a single vswp d, d
1242  float32x4_t z = vdupq_lane_f32(vget_high_f32(v0), 0);
1243 
1244  xy0 = vmulq_f32(xy0, vLo);
1245 
1246  z = vmulq_f32(z, vHi);
1247  float32x4x2_t xy = vuzpq_f32(xy0, xy0);
1248  float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
1249  x = vaddq_f32(x, z);
1250 
1251  uint32x4_t mask = vcgtq_f32(x, maxDot);
1252  maxDot = vbslq_f32(mask, x, maxDot);
1253  index = vbslq_u32(mask, local_index, index);
1254  local_index = vaddq_u32(local_index, four);
1255  }
1256  break;
1257 
1258  default:
1259  break;
1260  }
1261 
1262  // select best answer between hi and lo results
1263  uint32x2_t mask = vcgt_f32(vget_high_f32(maxDot), vget_low_f32(maxDot));
1264  float32x2_t maxDot2 = vbsl_f32(mask, vget_high_f32(maxDot), vget_low_f32(maxDot));
1265  uint32x2_t index2 = vbsl_u32(mask, vget_high_u32(index), vget_low_u32(index));
1266 
1267  // select best answer between even and odd results
1268  float32x2_t maxDotO = vdup_lane_f32(maxDot2, 1);
1269  uint32x2_t indexHi = vdup_lane_u32(index2, 1);
1270  mask = vcgt_f32(maxDotO, maxDot2);
1271  maxDot2 = vbsl_f32(mask, maxDotO, maxDot2);
1272  index2 = vbsl_u32(mask, indexHi, index2);
1273 
1274  *dotResult = vget_lane_f32(maxDot2, 0);
1275  return vget_lane_u32(index2, 0);
1276 }
1277 
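// _mindot_large_v0: mirror of _maxdot_large_v0, using vclt comparisons and a
// +BT_INFINITY starting value to track the smallest dot product instead.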
1278 long _mindot_large_v0(const float *vv, const float *vec, unsigned long count, float *dotResult)
1279 {
1280  unsigned long i = 0;
1281  float32x4_t vvec = vld1q_f32_aligned_postincrement(vec);
1282  float32x2_t vLo = vget_low_f32(vvec);
1283  float32x2_t vHi = vdup_lane_f32(vget_high_f32(vvec), 0);
1284  float32x2_t dotMinLo = (float32x2_t){BT_INFINITY, BT_INFINITY};
1285  float32x2_t dotMinHi = (float32x2_t){BT_INFINITY, BT_INFINITY};
1286  uint32x2_t indexLo = (uint32x2_t){0, 1};
1287  uint32x2_t indexHi = (uint32x2_t){2, 3};
1288  uint32x2_t iLo = (uint32x2_t){static_cast<uint32_t>(-1), static_cast<uint32_t>(-1)};
1289  uint32x2_t iHi = (uint32x2_t){static_cast<uint32_t>(-1), static_cast<uint32_t>(-1)};
1290  const uint32x2_t four = (uint32x2_t){4, 4};
1291 
1292  for (; i + 8 <= count; i += 8)
1293  {
1294  float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
1295  float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
1296  float32x4_t v2 = vld1q_f32_aligned_postincrement(vv);
1297  float32x4_t v3 = vld1q_f32_aligned_postincrement(vv);
1298 
1299  float32x2_t xy0 = vmul_f32(vget_low_f32(v0), vLo);
1300  float32x2_t xy1 = vmul_f32(vget_low_f32(v1), vLo);
1301  float32x2_t xy2 = vmul_f32(vget_low_f32(v2), vLo);
1302  float32x2_t xy3 = vmul_f32(vget_low_f32(v3), vLo);
1303 
1304  float32x2x2_t z0 = vtrn_f32(vget_high_f32(v0), vget_high_f32(v1));
1305  float32x2x2_t z1 = vtrn_f32(vget_high_f32(v2), vget_high_f32(v3));
1306  float32x2_t zLo = vmul_f32(z0.val[0], vHi);
1307  float32x2_t zHi = vmul_f32(z1.val[0], vHi);
1308 
1309  float32x2_t rLo = vpadd_f32(xy0, xy1);
1310  float32x2_t rHi = vpadd_f32(xy2, xy3);
1311  rLo = vadd_f32(rLo, zLo);
1312  rHi = vadd_f32(rHi, zHi);
1313 
1314  uint32x2_t maskLo = vclt_f32(rLo, dotMinLo);
1315  uint32x2_t maskHi = vclt_f32(rHi, dotMinHi);
1316  dotMinLo = vbsl_f32(maskLo, rLo, dotMinLo);
1317  dotMinHi = vbsl_f32(maskHi, rHi, dotMinHi);
1318  iLo = vbsl_u32(maskLo, indexLo, iLo);
1319  iHi = vbsl_u32(maskHi, indexHi, iHi);
1320  indexLo = vadd_u32(indexLo, four);
1321  indexHi = vadd_u32(indexHi, four);
1322 
1323  v0 = vld1q_f32_aligned_postincrement(vv);
1324  v1 = vld1q_f32_aligned_postincrement(vv);
1325  v2 = vld1q_f32_aligned_postincrement(vv);
1326  v3 = vld1q_f32_aligned_postincrement(vv);
1327 
1328  xy0 = vmul_f32(vget_low_f32(v0), vLo);
1329  xy1 = vmul_f32(vget_low_f32(v1), vLo);
1330  xy2 = vmul_f32(vget_low_f32(v2), vLo);
1331  xy3 = vmul_f32(vget_low_f32(v3), vLo);
1332 
1333  z0 = vtrn_f32(vget_high_f32(v0), vget_high_f32(v1));
1334  z1 = vtrn_f32(vget_high_f32(v2), vget_high_f32(v3));
1335  zLo = vmul_f32(z0.val[0], vHi);
1336  zHi = vmul_f32(z1.val[0], vHi);
1337 
1338  rLo = vpadd_f32(xy0, xy1);
1339  rHi = vpadd_f32(xy2, xy3);
1340  rLo = vadd_f32(rLo, zLo);
1341  rHi = vadd_f32(rHi, zHi);
1342 
1343  maskLo = vclt_f32(rLo, dotMinLo);
1344  maskHi = vclt_f32(rHi, dotMinHi);
1345  dotMinLo = vbsl_f32(maskLo, rLo, dotMinLo);
1346  dotMinHi = vbsl_f32(maskHi, rHi, dotMinHi);
1347  iLo = vbsl_u32(maskLo, indexLo, iLo);
1348  iHi = vbsl_u32(maskHi, indexHi, iHi);
1349  indexLo = vadd_u32(indexLo, four);
1350  indexHi = vadd_u32(indexHi, four);
1351  }
1352 
1353  for (; i + 4 <= count; i += 4)
1354  {
1355  float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
1356  float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
1357  float32x4_t v2 = vld1q_f32_aligned_postincrement(vv);
1358  float32x4_t v3 = vld1q_f32_aligned_postincrement(vv);
1359 
1360  float32x2_t xy0 = vmul_f32(vget_low_f32(v0), vLo);
1361  float32x2_t xy1 = vmul_f32(vget_low_f32(v1), vLo);
1362  float32x2_t xy2 = vmul_f32(vget_low_f32(v2), vLo);
1363  float32x2_t xy3 = vmul_f32(vget_low_f32(v3), vLo);
1364 
1365  float32x2x2_t z0 = vtrn_f32(vget_high_f32(v0), vget_high_f32(v1));
1366  float32x2x2_t z1 = vtrn_f32(vget_high_f32(v2), vget_high_f32(v3));
1367  float32x2_t zLo = vmul_f32(z0.val[0], vHi);
1368  float32x2_t zHi = vmul_f32(z1.val[0], vHi);
1369 
1370  float32x2_t rLo = vpadd_f32(xy0, xy1);
1371  float32x2_t rHi = vpadd_f32(xy2, xy3);
1372  rLo = vadd_f32(rLo, zLo);
1373  rHi = vadd_f32(rHi, zHi);
1374 
1375  uint32x2_t maskLo = vclt_f32(rLo, dotMinLo);
1376  uint32x2_t maskHi = vclt_f32(rHi, dotMinHi);
1377  dotMinLo = vbsl_f32(maskLo, rLo, dotMinLo);
1378  dotMinHi = vbsl_f32(maskHi, rHi, dotMinHi);
1379  iLo = vbsl_u32(maskLo, indexLo, iLo);
1380  iHi = vbsl_u32(maskHi, indexHi, iHi);
1381  indexLo = vadd_u32(indexLo, four);
1382  indexHi = vadd_u32(indexHi, four);
1383  }
1384  switch (count & 3)
1385  {
1386  case 3:
1387  {
1388  float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
1389  float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
1390  float32x4_t v2 = vld1q_f32_aligned_postincrement(vv);
1391 
1392  float32x2_t xy0 = vmul_f32(vget_low_f32(v0), vLo);
1393  float32x2_t xy1 = vmul_f32(vget_low_f32(v1), vLo);
1394  float32x2_t xy2 = vmul_f32(vget_low_f32(v2), vLo);
1395 
1396  float32x2x2_t z0 = vtrn_f32(vget_high_f32(v0), vget_high_f32(v1));
1397  float32x2_t zLo = vmul_f32(z0.val[0], vHi);
1398  float32x2_t zHi = vmul_f32(vdup_lane_f32(vget_high_f32(v2), 0), vHi);
1399 
1400  float32x2_t rLo = vpadd_f32(xy0, xy1);
1401  float32x2_t rHi = vpadd_f32(xy2, xy2);
1402  rLo = vadd_f32(rLo, zLo);
1403  rHi = vadd_f32(rHi, zHi);
1404 
1405  uint32x2_t maskLo = vclt_f32(rLo, dotMinLo);
1406  uint32x2_t maskHi = vclt_f32(rHi, dotMinHi);
1407  dotMinLo = vbsl_f32(maskLo, rLo, dotMinLo);
1408  dotMinHi = vbsl_f32(maskHi, rHi, dotMinHi);
1409  iLo = vbsl_u32(maskLo, indexLo, iLo);
1410  iHi = vbsl_u32(maskHi, indexHi, iHi);
1411  }
1412  break;
1413  case 2:
1414  {
1415  float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
1416  float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
1417 
1418  float32x2_t xy0 = vmul_f32(vget_low_f32(v0), vLo);
1419  float32x2_t xy1 = vmul_f32(vget_low_f32(v1), vLo);
1420 
1421  float32x2x2_t z0 = vtrn_f32(vget_high_f32(v0), vget_high_f32(v1));
1422  float32x2_t zLo = vmul_f32(z0.val[0], vHi);
1423 
1424  float32x2_t rLo = vpadd_f32(xy0, xy1);
1425  rLo = vadd_f32(rLo, zLo);
1426 
1427  uint32x2_t maskLo = vclt_f32(rLo, dotMinLo);
1428  dotMinLo = vbsl_f32(maskLo, rLo, dotMinLo);
1429  iLo = vbsl_u32(maskLo, indexLo, iLo);
1430  }
1431  break;
1432  case 1:
1433  {
1434  float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
1435  float32x2_t xy0 = vmul_f32(vget_low_f32(v0), vLo);
1436  float32x2_t z0 = vdup_lane_f32(vget_high_f32(v0), 0);
1437  float32x2_t zLo = vmul_f32(z0, vHi);
1438  float32x2_t rLo = vpadd_f32(xy0, xy0);
1439  rLo = vadd_f32(rLo, zLo);
1440  uint32x2_t maskLo = vclt_f32(rLo, dotMinLo);
1441  dotMinLo = vbsl_f32(maskLo, rLo, dotMinLo);
1442  iLo = vbsl_u32(maskLo, indexLo, iLo);
1443  }
1444  break;
1445 
1446  default:
1447  break;
1448  }
1449 
1450  // select best answer between hi and lo results
1451  uint32x2_t mask = vclt_f32(dotMinHi, dotMinLo);
1452  dotMinLo = vbsl_f32(mask, dotMinHi, dotMinLo);
1453  iLo = vbsl_u32(mask, iHi, iLo);
1454 
1455  // select best answer between even and odd results
1456  dotMinHi = vdup_lane_f32(dotMinLo, 1);
1457  iHi = vdup_lane_u32(iLo, 1);
1458  mask = vclt_f32(dotMinHi, dotMinLo);
1459  dotMinLo = vbsl_f32(mask, dotMinHi, dotMinLo);
1460  iLo = vbsl_u32(mask, iHi, iLo);
1461 
1462  *dotResult = vget_lane_f32(dotMinLo, 0);
1463  return vget_lane_u32(iLo, 0);
1464 }
1465 
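// _mindot_large_v1: mirror of _maxdot_large_v1, tracking the smallest dot
// product with vclt comparisons.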
1466 long _mindot_large_v1(const float *vv, const float *vec, unsigned long count, float *dotResult)
1467 {
1468  float32x4_t vvec = vld1q_f32_aligned_postincrement(vec);
1469  float32x4_t vLo = vcombine_f32(vget_low_f32(vvec), vget_low_f32(vvec));
1470  float32x4_t vHi = vdupq_lane_f32(vget_high_f32(vvec), 0);
1471  const uint32x4_t four = (uint32x4_t){4, 4, 4, 4};
1472  uint32x4_t local_index = (uint32x4_t){0, 1, 2, 3};
1473  uint32x4_t index = (uint32x4_t){static_cast<uint32_t>(-1), static_cast<uint32_t>(-1), static_cast<uint32_t>(-1), static_cast<uint32_t>(-1)};
1474  float32x4_t minDot = (float32x4_t){BT_INFINITY, BT_INFINITY, BT_INFINITY, BT_INFINITY};
1475 
1476  unsigned long i = 0;
1477  for (; i + 8 <= count; i += 8)
1478  {
1479  float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
1480  float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
1481  float32x4_t v2 = vld1q_f32_aligned_postincrement(vv);
1482  float32x4_t v3 = vld1q_f32_aligned_postincrement(vv);
1483 
1484  // the next two lines should resolve to a single vswp d, d
1485  float32x4_t xy0 = vcombine_f32(vget_low_f32(v0), vget_low_f32(v1));
1486  float32x4_t xy1 = vcombine_f32(vget_low_f32(v2), vget_low_f32(v3));
1487  // the next two lines should resolve to a single vswp d, d
1488  float32x4_t z0 = vcombine_f32(vget_high_f32(v0), vget_high_f32(v1));
1489  float32x4_t z1 = vcombine_f32(vget_high_f32(v2), vget_high_f32(v3));
1490 
1491  xy0 = vmulq_f32(xy0, vLo);
1492  xy1 = vmulq_f32(xy1, vLo);
1493 
1494  float32x4x2_t zb = vuzpq_f32(z0, z1);
1495  float32x4_t z = vmulq_f32(zb.val[0], vHi);
1496  float32x4x2_t xy = vuzpq_f32(xy0, xy1);
1497  float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
1498  x = vaddq_f32(x, z);
1499 
1500  uint32x4_t mask = vcltq_f32(x, minDot);
1501  minDot = vbslq_f32(mask, x, minDot);
1502  index = vbslq_u32(mask, local_index, index);
1503  local_index = vaddq_u32(local_index, four);
1504 
1505  v0 = vld1q_f32_aligned_postincrement(vv);
1506  v1 = vld1q_f32_aligned_postincrement(vv);
1507  v2 = vld1q_f32_aligned_postincrement(vv);
1508  v3 = vld1q_f32_aligned_postincrement(vv);
1509 
1510  // the next two lines should resolve to a single vswp d, d
1511  xy0 = vcombine_f32(vget_low_f32(v0), vget_low_f32(v1));
1512  xy1 = vcombine_f32(vget_low_f32(v2), vget_low_f32(v3));
1513  // the next two lines should resolve to a single vswp d, d
1514  z0 = vcombine_f32(vget_high_f32(v0), vget_high_f32(v1));
1515  z1 = vcombine_f32(vget_high_f32(v2), vget_high_f32(v3));
1516 
1517  xy0 = vmulq_f32(xy0, vLo);
1518  xy1 = vmulq_f32(xy1, vLo);
1519 
1520  zb = vuzpq_f32(z0, z1);
1521  z = vmulq_f32(zb.val[0], vHi);
1522  xy = vuzpq_f32(xy0, xy1);
1523  x = vaddq_f32(xy.val[0], xy.val[1]);
1524  x = vaddq_f32(x, z);
1525 
1526  mask = vcltq_f32(x, minDot);
1527  minDot = vbslq_f32(mask, x, minDot);
1528  index = vbslq_u32(mask, local_index, index);
1529  local_index = vaddq_u32(local_index, four);
1530  }
1531 
1532  for (; i + 4 <= count; i += 4)
1533  {
1534  float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
1535  float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
1536  float32x4_t v2 = vld1q_f32_aligned_postincrement(vv);
1537  float32x4_t v3 = vld1q_f32_aligned_postincrement(vv);
1538 
1539  // the next two lines should resolve to a single vswp d, d
1540  float32x4_t xy0 = vcombine_f32(vget_low_f32(v0), vget_low_f32(v1));
1541  float32x4_t xy1 = vcombine_f32(vget_low_f32(v2), vget_low_f32(v3));
1542  // the next two lines should resolve to a single vswp d, d
1543  float32x4_t z0 = vcombine_f32(vget_high_f32(v0), vget_high_f32(v1));
1544  float32x4_t z1 = vcombine_f32(vget_high_f32(v2), vget_high_f32(v3));
1545 
1546  xy0 = vmulq_f32(xy0, vLo);
1547  xy1 = vmulq_f32(xy1, vLo);
1548 
1549  float32x4x2_t zb = vuzpq_f32(z0, z1);
1550  float32x4_t z = vmulq_f32(zb.val[0], vHi);
1551  float32x4x2_t xy = vuzpq_f32(xy0, xy1);
1552  float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
1553  x = vaddq_f32(x, z);
1554 
1555  uint32x4_t mask = vcltq_f32(x, minDot);
1556  minDot = vbslq_f32(mask, x, minDot);
1557  index = vbslq_u32(mask, local_index, index);
1558  local_index = vaddq_u32(local_index, four);
1559  }
1560 
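 // tail: 1-3 vertices remain; the last loaded vertex is replicated into the
 // unused lanes so the same 4-wide compare-and-select update can be reused.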
1561  switch (count & 3)
1562  {
1563  case 3:
1564  {
1565  float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
1566  float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
1567  float32x4_t v2 = vld1q_f32_aligned_postincrement(vv);
1568 
1569  // the next two lines should resolve to a single vswp d, d
1570  float32x4_t xy0 = vcombine_f32(vget_low_f32(v0), vget_low_f32(v1));
1571  float32x4_t xy1 = vcombine_f32(vget_low_f32(v2), vget_low_f32(v2));
1572  // the next two lines should resolve to a single vswp d, d
1573  float32x4_t z0 = vcombine_f32(vget_high_f32(v0), vget_high_f32(v1));
1574  float32x4_t z1 = vcombine_f32(vget_high_f32(v2), vget_high_f32(v2));
1575 
1576  xy0 = vmulq_f32(xy0, vLo);
1577  xy1 = vmulq_f32(xy1, vLo);
1578 
1579  float32x4x2_t zb = vuzpq_f32(z0, z1);
1580  float32x4_t z = vmulq_f32(zb.val[0], vHi);
1581  float32x4x2_t xy = vuzpq_f32(xy0, xy1);
1582  float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
1583  x = vaddq_f32(x, z);
1584 
1585  uint32x4_t mask = vcltq_f32(x, minDot);
1586  minDot = vbslq_f32(mask, x, minDot);
1587  index = vbslq_u32(mask, local_index, index);
1588  local_index = vaddq_u32(local_index, four);
1589  }
1590  break;
1591 
1592  case 2:
1593  {
1594  float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
1595  float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
1596 
1597  // together, these two combines should resolve to a single vswp d, d
1598  float32x4_t xy0 = vcombine_f32(vget_low_f32(v0), vget_low_f32(v1));
1599  // gather z0,w0,z1,w1 of v0 and v1
1600  float32x4_t z0 = vcombine_f32(vget_high_f32(v0), vget_high_f32(v1));
1601 
1602  xy0 = vmulq_f32(xy0, vLo);
1603 
1604  float32x4x2_t zb = vuzpq_f32(z0, z0);
1605  float32x4_t z = vmulq_f32(zb.val[0], vHi);
1606  float32x4x2_t xy = vuzpq_f32(xy0, xy0);
1607  float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
1608  x = vaddq_f32(x, z);
1609 
1610  uint32x4_t mask = vcltq_f32(x, minDot);
1611  minDot = vbslq_f32(mask, x, minDot);
1612  index = vbslq_u32(mask, local_index, index);
1613  local_index = vaddq_u32(local_index, four);
1614  }
1615  break;
1616 
1617  case 1:
1618  {
1619  float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
1620 
1621  // duplicate the x,y pair of v0 into both halves
1622  float32x4_t xy0 = vcombine_f32(vget_low_f32(v0), vget_low_f32(v0));
1623  // broadcast the z component of v0 to all four lanes
1624  float32x4_t z = vdupq_lane_f32(vget_high_f32(v0), 0);
1625 
1626  xy0 = vmulq_f32(xy0, vLo);
1627 
1628  z = vmulq_f32(z, vHi);
1629  float32x4x2_t xy = vuzpq_f32(xy0, xy0);
1630  float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
1631  x = vaddq_f32(x, z);
1632 
1633  uint32x4_t mask = vcltq_f32(x, minDot);
1634  minDot = vbslq_f32(mask, x, minDot);
1635  index = vbslq_u32(mask, local_index, index);
1636  local_index = vaddq_u32(local_index, four);
1637  }
1638  break;
1639 
1640  default:
1641  break;
1642  }
1643 
1644  // select best answer between hi and lo results
1645  uint32x2_t mask = vclt_f32(vget_high_f32(minDot), vget_low_f32(minDot));
1646  float32x2_t minDot2 = vbsl_f32(mask, vget_high_f32(minDot), vget_low_f32(minDot));
1647  uint32x2_t index2 = vbsl_u32(mask, vget_high_u32(index), vget_low_u32(index));
1648 
1649  // select best answer between even and odd results
1650  float32x2_t minDotO = vdup_lane_f32(minDot2, 1);
1651  uint32x2_t indexHi = vdup_lane_u32(index2, 1);
1652  mask = vclt_f32(minDotO, minDot2);
1653  minDot2 = vbsl_f32(mask, minDotO, minDot2);
1654  index2 = vbsl_u32(mask, indexHi, index2);
1655 
1656  *dotResult = vget_lane_f32(minDot2, 0);
1657  return vget_lane_u32(index2, 0);
1658 }
1659 
1660 #else
1661 #error Unhandled __APPLE__ arch
1662 #endif
1663 
1664 #endif /* __APPLE__ */
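For reference, the NEON min-dot kernel above is equivalent to the following scalar sketch. This is a hypothetical illustration, not part of btVector3.cpp; the function name mindot_scalar_reference is invented, and the behavior (4-float vertex stride, w ignored, index of the smallest dot product returned, value written through dotResult) is inferred from the SIMD code above.

#include <limits>

// Hypothetical scalar reference for the min-dot kernel above (illustration only).
// vv: count vertices stored as x,y,z,w with a 4-float stride; vec: query x,y,z.
static long mindot_scalar_reference(const float *vv, const float *vec,
                                    unsigned long count, float *dotResult)
{
    long minIndex = -1;
    float best = std::numeric_limits<float>::infinity();  // the SIMD path starts from BT_INFINITY
    for (unsigned long i = 0; i < count; ++i)
    {
        const float *v = vv + 4 * i;
        float dot = v[0] * vec[0] + v[1] * vec[1] + v[2] * vec[2];  // w never contributes
        if (dot < best)  // strict '<', matching the vcltq_f32 comparison used above
        {
            best = dot;
            minIndex = (long)i;
        }
    }
    *dotResult = best;
    return minIndex;
}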