5 #ifndef __UTIL_SIMD_TYPES_H__
6 #define __UTIL_SIMD_TYPES_H__
19 #if defined(FREE_WINDOWS64)
21 #elif defined(_MSC_VER)
23 #elif (defined(__x86_64__) || defined(__i386__))
24 # include <x86intrin.h>
25 #elif defined(__KERNEL_NEON__)
26 # define SSE2NEON_PRECISE_MINMAX 1
27 # include <sse2neon.h>
31 #if defined(__x86_64__) || defined(_M_X64)
32 # define SIMD_SET_FLUSH_TO_ZERO \
33 _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON); \
34 _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
35 #elif defined(__aarch64__) || defined(_M_ARM64)
36 # define _MM_FLUSH_ZERO_ON 24
37 # define __get_fpcr(__fpcr) __asm__ __volatile__("mrs %0,fpcr" : "=r"(__fpcr))
38 # define __set_fpcr(__fpcr) __asm__ __volatile__("msr fpcr,%0" : : "ri"(__fpcr))
39 # define SIMD_SET_FLUSH_TO_ZERO set_fz(_MM_FLUSH_ZERO_ON);
40 # define SIMD_GET_FLUSH_TO_ZERO get_fz(_MM_FLUSH_ZERO_ON)
42 # define SIMD_SET_FLUSH_TO_ZERO
48 #ifdef __KERNEL_SSE2__
50 extern const __m128 _mm_lookupmask_ps[16];
52 static struct TrueTy {
59 static struct FalseTy {
66 static struct ZeroTy {
88 static struct NegInfTy {
91 return -std::numeric_limits<float>::infinity();
99 static struct PosInfTy {
102 return std::numeric_limits<float>::infinity();
110 static struct StepTy {
114 #if defined(__aarch64__) || defined(_M_ARM64)
118 __get_fpcr(old_fpcr);
119 new_fpcr = old_fpcr | (1ULL << flag);
120 __set_fpcr(new_fpcr);
121 __get_fpcr(old_fpcr);
122 return old_fpcr == new_fpcr;
127 __get_fpcr(cur_fpcr);
128 return (cur_fpcr & (1ULL << flag)) > 0 ? 1 : 0;
133 #if defined(__KERNEL_NEON__)
134 template<
class type,
int i0,
int i1,
int i2,
int i3>
type shuffle_neon(
const type &
a)
136 if (i0 ==
i1 && i0 == i2 && i0 == i3) {
137 return type(vdupq_laneq_s32(int32x4_t(
a), i0));
139 static const uint8_t tbl[16] = {(i0 * 4) + 0,
156 return type(vqtbl1q_s8(int8x16_t(
a), *(uint8x16_t *)tbl));
159 template<
class type,
int i0,
int i1,
int i2,
int i3>
163 static const uint8_t tbl[16] = {(i0 * 4) + 0,
180 return type(vqtbl1q_s8(int8x16_t(
b), *(uint8x16_t *)tbl));
184 static const uint8_t tbl[16] = {(i0 * 4) + 0,
201 return type(vqtbl2q_s8((int8x16x2_t){int8x16_t(
a), int8x16_t(
b)}, *(uint8x16_t *)tbl));
210 #if defined(__BMI__) && defined(__GNUC__)
212 # define _tzcnt_u32 __tzcnt_u32
215 # define _tzcnt_u64 __tzcnt_u64
219 #if defined(__LZCNT__)
220 # define _lzcnt_u32 __lzcnt32
221 # define _lzcnt_u64 __lzcnt64
224 #if defined(_WIN32) && !defined(__MINGW32__) && !defined(__clang__)
228 # if defined(__KERNEL_AVX2__)
229 return _tzcnt_u32(
v);
232 _BitScanForward(&
r,
v);
240 _BitScanReverse(&
r,
v);
247 _bittestandcomplement(&
r, i);
253 # if defined(__KERNEL_AVX2__)
254 return _tzcnt_u32(
v);
260 # if defined(__KERNEL_64_BIT__)
264 # if defined(__KERNEL_AVX2__)
265 return _tzcnt_u64(
v);
268 _BitScanForward64(&
r,
v);
276 _BitScanReverse64(&
r,
v);
283 _bittestandcomplement64((__int64 *)&
r, i);
289 # if defined(__KERNEL_AVX2__)
290 # if defined(__KERNEL_64_BIT__)
291 return _tzcnt_u64(
v);
293 return _tzcnt_u32(
v);
302 #elif (defined(__x86_64__) || defined(__i386__)) && defined(__KERNEL_SSE2__)
308 asm(
"bsf %1,%0" :
"=r"(
r) :
"r"(
v));
315 asm(
"bsr %1,%0" :
"=r"(
r) :
"r"(
v));
322 asm(
"btc %1,%0" :
"=r"(
r) :
"r"(i),
"0"(
v) :
"flags");
326 # if (defined(__KERNEL_64_BIT__) || defined(__APPLE__)) && \
327 !(defined(__ILP32__) && defined(__x86_64__))
331 asm(
"bsf %1,%0" :
"=r"(
r) :
"r"(
v));
339 asm(
"bsr %1,%0" :
"=r"(
r) :
"r"(
v));
346 asm(
"btc %1,%0" :
"=r"(
r) :
"r"(i),
"0"(
v) :
"flags");
352 # if defined(__KERNEL_AVX2__)
353 return _tzcnt_u32(
v);
359 # if (defined(__KERNEL_64_BIT__) || defined(__APPLE__)) && \
360 !(defined(__ILP32__) && defined(__x86_64__))
363 # if defined(__KERNEL_AVX2__)
364 # if defined(__KERNEL_64_BIT__)
365 return _tzcnt_u64(
v);
367 return _tzcnt_u32(
v);
379 for (
int i = 0; i < 32; i++) {
388 for (
int i = 0; i < 32; i++) {
389 if (
x & (1U << (31 - i)))
403 for (
int i = 0; i < 64; i++) {
412 for (
int i = 0; i < 64; i++) {
413 if (
x & (1UL << (63 - i)))
429 while ((value & (1 << bit)) == 0) {
439 while ((value & (1 << bit)) == 0) {
451 #ifdef __KERNEL_SSE2__
456 # if !(defined(__KERNEL_SSE41__) || defined(__SSE4_1__) || defined(__SSE4_2__))
460 # define _MM_FROUND_TO_NEAREST_INT 0x00
461 # define _MM_FROUND_TO_NEG_INF 0x01
462 # define _MM_FROUND_TO_POS_INF 0x02
463 # define _MM_FROUND_TO_ZERO 0x03
464 # define _MM_FROUND_CUR_DIRECTION 0x04
466 # undef _mm_blendv_ps
467 # define _mm_blendv_ps _mm_blendv_ps_emu
470 __m128i isignmask = _mm_set1_epi32(0x80000000);
471 __m128 signmask = _mm_castsi128_ps(isignmask);
472 __m128i iandsign = _mm_castps_si128(_mm_and_ps(
mask, signmask));
473 __m128i icmpmask = _mm_cmpeq_epi32(iandsign, isignmask);
474 __m128 cmpmask = _mm_castsi128_ps(icmpmask);
475 return _mm_or_ps(_mm_and_ps(cmpmask,
input), _mm_andnot_ps(cmpmask, value));
479 # define _mm_blend_ps _mm_blend_ps_emu
483 return _mm_blendv_ps(value,
input, _mm_lookupmask_ps[
mask]);
486 # undef _mm_blendv_epi8
487 # define _mm_blendv_epi8 _mm_blendv_epi8_emu
490 return _mm_or_si128(_mm_and_si128(
mask,
input), _mm_andnot_si128(
mask, value));
493 # undef _mm_min_epi32
494 # define _mm_min_epi32 _mm_min_epi32_emu
497 return _mm_blendv_epi8(
input, value, _mm_cmplt_epi32(value,
input));
500 # undef _mm_max_epi32
501 # define _mm_max_epi32 _mm_max_epi32_emu
504 return _mm_blendv_epi8(value,
input, _mm_cmplt_epi32(value,
input));
507 # ifndef __KERNEL_NEON__
508 # undef _mm_extract_epi32
509 # define _mm_extract_epi32 _mm_extract_epi32_emu
514 return _mm_cvtsi128_si32(
input);
516 return _mm_cvtsi128_si32(_mm_shuffle_epi32(
input, _MM_SHUFFLE(1, 1, 1, 1)));
518 return _mm_cvtsi128_si32(_mm_shuffle_epi32(
input, _MM_SHUFFLE(2, 2, 2, 2)));
520 return _mm_cvtsi128_si32(_mm_shuffle_epi32(
input, _MM_SHUFFLE(3, 3, 3, 3)));
528 # undef _mm_insert_epi32
529 # define _mm_insert_epi32 _mm_insert_epi32_emu
532 assert(index >= 0 && index < 4);
533 ((
int *)&value)[index] =
input;
537 # undef _mm_insert_ps
538 # define _mm_insert_ps _mm_insert_ps_emu
541 assert(index < 0x100);
542 ((
float *)&value)[(index >> 4) & 0x3] = ((
float *)&
input)[index >> 6];
543 return _mm_andnot_ps(_mm_lookupmask_ps[index & 0xf], value);
547 # define _mm_round_ps _mm_round_ps_emu
548 __forceinline __m128 _mm_round_ps_emu(__m128 value,
const int flags)
551 case _MM_FROUND_TO_NEAREST_INT:
552 return _mm_cvtepi32_ps(_mm_cvtps_epi32(value));
553 case _MM_FROUND_TO_NEG_INF:
554 return _mm_cvtepi32_ps(_mm_cvtps_epi32(_mm_add_ps(value, _mm_set1_ps(-0.5f))));
555 case _MM_FROUND_TO_POS_INF:
556 return _mm_cvtepi32_ps(_mm_cvtps_epi32(_mm_add_ps(value, _mm_set1_ps(0.5f))));
557 case _MM_FROUND_TO_ZERO:
558 return _mm_cvtepi32_ps(_mm_cvttps_epi32(value));
567 # if defined(__KERNEL_AVX__) || defined(__KERNEL_AVX2__)
568 # undef _mm256_cvtss_f32
569 # define _mm256_cvtss_f32(a) (_mm_cvtss_f32(_mm256_castps256_ps128(a)))
575 #if defined(__KERNEL_SSE2__) || defined(__KERNEL_SSE3__) || defined(__KERNEL_SSSE3__) || \
576 defined(__KERNEL_SSE41__) || defined(__KERNEL_AVX__) || defined(__KERNEL_AVX2__)
typedef float(TangentPoint)[2]
_GL_VOID GLfloat value _GL_VOID_RET _GL_VOID const GLuint GLboolean *residences _GL_BOOL_RET _GL_VOID GLsizei GLfloat GLfloat GLfloat GLfloat const GLubyte *bitmap _GL_VOID_RET _GL_VOID GLenum const void *lists _GL_VOID_RET _GL_VOID const GLdouble *equation _GL_VOID_RET _GL_VOID GLdouble GLdouble blue _GL_VOID_RET _GL_VOID GLfloat GLfloat blue _GL_VOID_RET _GL_VOID GLint GLint blue _GL_VOID_RET _GL_VOID GLshort GLshort blue _GL_VOID_RET _GL_VOID GLubyte GLubyte blue _GL_VOID_RET _GL_VOID GLuint GLuint blue _GL_VOID_RET _GL_VOID GLushort GLushort blue _GL_VOID_RET _GL_VOID GLbyte GLbyte GLbyte alpha _GL_VOID_RET _GL_VOID GLdouble GLdouble GLdouble alpha _GL_VOID_RET _GL_VOID GLfloat GLfloat GLfloat alpha _GL_VOID_RET _GL_VOID GLint GLint GLint alpha _GL_VOID_RET _GL_VOID GLshort GLshort GLshort alpha _GL_VOID_RET _GL_VOID GLubyte GLubyte GLubyte alpha _GL_VOID_RET _GL_VOID GLuint GLuint GLuint alpha _GL_VOID_RET _GL_VOID GLushort GLushort GLushort alpha _GL_VOID_RET _GL_VOID GLenum mode _GL_VOID_RET _GL_VOID GLint GLsizei GLsizei GLenum type _GL_VOID_RET _GL_VOID GLsizei GLenum GLenum const void *pixels _GL_VOID_RET _GL_VOID const void *pointer _GL_VOID_RET _GL_VOID GLdouble v _GL_VOID_RET _GL_VOID GLfloat v _GL_VOID_RET _GL_VOID GLint GLint i2 _GL_VOID_RET _GL_VOID GLint j _GL_VOID_RET _GL_VOID GLfloat param _GL_VOID_RET _GL_VOID GLint param _GL_VOID_RET _GL_VOID GLdouble GLdouble GLdouble GLdouble GLdouble zFar _GL_VOID_RET _GL_UINT GLdouble *equation _GL_VOID_RET _GL_VOID GLenum GLint *params _GL_VOID_RET _GL_VOID GLenum GLfloat *v _GL_VOID_RET _GL_VOID GLenum GLfloat *params _GL_VOID_RET _GL_VOID GLfloat *values _GL_VOID_RET _GL_VOID GLushort *values _GL_VOID_RET _GL_VOID GLenum GLfloat *params _GL_VOID_RET _GL_VOID GLenum GLdouble *params _GL_VOID_RET _GL_VOID GLenum GLint *params _GL_VOID_RET _GL_VOID GLsizei const void *pointer _GL_VOID_RET _GL_VOID GLsizei const void *pointer _GL_VOID_RET _GL_BOOL GLfloat param _GL_VOID_RET _GL_VOID GLint param _GL_VOID_RET _GL_VOID GLenum GLfloat param _GL_VOID_RET _GL_VOID GLenum GLint param _GL_VOID_RET _GL_VOID GLushort pattern _GL_VOID_RET _GL_VOID GLdouble GLdouble GLint GLint const GLdouble *points _GL_VOID_RET _GL_VOID GLdouble GLdouble GLint GLint GLdouble GLdouble GLint GLint const GLdouble *points _GL_VOID_RET _GL_VOID GLdouble GLdouble u2 _GL_VOID_RET _GL_VOID GLdouble GLdouble GLint GLdouble GLdouble v2 _GL_VOID_RET _GL_VOID GLenum GLfloat param _GL_VOID_RET _GL_VOID GLenum GLint param _GL_VOID_RET _GL_VOID GLenum mode _GL_VOID_RET _GL_VOID GLdouble GLdouble nz _GL_VOID_RET _GL_VOID GLfloat GLfloat nz _GL_VOID_RET _GL_VOID GLint GLint nz _GL_VOID_RET _GL_VOID GLshort GLshort nz _GL_VOID_RET _GL_VOID GLsizei const void *pointer _GL_VOID_RET _GL_VOID GLsizei const GLfloat *values _GL_VOID_RET _GL_VOID GLsizei const GLushort *values _GL_VOID_RET _GL_VOID GLint param _GL_VOID_RET _GL_VOID const GLuint const GLclampf *priorities _GL_VOID_RET _GL_VOID GLdouble y _GL_VOID_RET _GL_VOID GLfloat y _GL_VOID_RET _GL_VOID GLint y _GL_VOID_RET _GL_VOID GLshort y _GL_VOID_RET _GL_VOID GLdouble GLdouble z _GL_VOID_RET _GL_VOID GLfloat GLfloat z _GL_VOID_RET _GL_VOID GLint GLint z _GL_VOID_RET _GL_VOID GLshort GLshort z _GL_VOID_RET _GL_VOID GLdouble GLdouble GLdouble w _GL_VOID_RET _GL_VOID GLfloat GLfloat GLfloat w _GL_VOID_RET _GL_VOID GLint GLint GLint w _GL_VOID_RET _GL_VOID GLshort GLshort GLshort w _GL_VOID_RET _GL_VOID GLdouble GLdouble GLdouble y2 _GL_VOID_RET _GL_VOID GLfloat GLfloat GLfloat y2 _GL_VOID_RET _GL_VOID GLint GLint GLint y2 _GL_VOID_RET _GL_VOID GLshort GLshort GLshort y2 _GL_VOID_RET _GL_VOID GLdouble GLdouble GLdouble z _GL_VOID_RET _GL_VOID GLdouble GLdouble z _GL_VOID_RET _GL_VOID GLuint *buffer _GL_VOID_RET _GL_VOID GLdouble t _GL_VOID_RET _GL_VOID GLfloat t _GL_VOID_RET _GL_VOID GLint t _GL_VOID_RET _GL_VOID GLshort t _GL_VOID_RET _GL_VOID GLdouble GLdouble r _GL_VOID_RET _GL_VOID GLfloat GLfloat r _GL_VOID_RET _GL_VOID GLint GLint r _GL_VOID_RET _GL_VOID GLshort GLshort r _GL_VOID_RET _GL_VOID GLdouble GLdouble r
_GL_VOID GLfloat value _GL_VOID_RET _GL_VOID const GLuint GLboolean *residences _GL_BOOL_RET _GL_VOID GLsizei GLfloat GLfloat GLfloat GLfloat const GLubyte *bitmap _GL_VOID_RET _GL_VOID GLenum type
_GL_VOID GLfloat value _GL_VOID_RET _GL_VOID const GLuint GLboolean *residences _GL_BOOL_RET _GL_VOID GLsizei GLfloat GLfloat GLfloat GLfloat const GLubyte *bitmap _GL_VOID_RET _GL_VOID GLenum const void *lists _GL_VOID_RET _GL_VOID const GLdouble *equation _GL_VOID_RET _GL_VOID GLdouble GLdouble blue _GL_VOID_RET _GL_VOID GLfloat GLfloat blue _GL_VOID_RET _GL_VOID GLint GLint blue _GL_VOID_RET _GL_VOID GLshort GLshort blue _GL_VOID_RET _GL_VOID GLubyte GLubyte blue _GL_VOID_RET _GL_VOID GLuint GLuint blue _GL_VOID_RET _GL_VOID GLushort GLushort blue _GL_VOID_RET _GL_VOID GLbyte GLbyte GLbyte alpha _GL_VOID_RET _GL_VOID GLdouble GLdouble GLdouble alpha _GL_VOID_RET _GL_VOID GLfloat GLfloat GLfloat alpha _GL_VOID_RET _GL_VOID GLint GLint GLint alpha _GL_VOID_RET _GL_VOID GLshort GLshort GLshort alpha _GL_VOID_RET _GL_VOID GLubyte GLubyte GLubyte alpha _GL_VOID_RET _GL_VOID GLuint GLuint GLuint alpha _GL_VOID_RET _GL_VOID GLushort GLushort GLushort alpha _GL_VOID_RET _GL_VOID GLenum mode _GL_VOID_RET _GL_VOID GLint GLsizei GLsizei GLenum type _GL_VOID_RET _GL_VOID GLsizei GLenum GLenum const void *pixels _GL_VOID_RET _GL_VOID const void *pointer _GL_VOID_RET _GL_VOID GLdouble v _GL_VOID_RET _GL_VOID GLfloat v _GL_VOID_RET _GL_VOID GLint i1
ATTR_WARN_UNUSED_RESULT const BMVert * v
#define CCL_NAMESPACE_END
#define ccl_attr_maybe_unused
ccl_global KernelShaderEvalInput * input
ccl_device_inline float4 mask(const int4 &mask, const float4 &a)
static const pxr::TfToken b("b", pxr::TfToken::Immortal)
CCL_NAMESPACE_BEGIN __forceinline uint32_t __bsf(const uint32_t x)
__forceinline uint32_t __bsr(const uint32_t x)
__forceinline uint32_t __btc(const uint32_t x, const uint32_t bit)
__forceinline uint32_t bitscan(uint32_t value)
unsigned __int64 uint64_t