2 #include "AL/maya/Common.h"
5 # define ALIGN16(X) __declspec(align(16)) X
6 # define ALIGN32(X) __declspec(align(32)) X
8 # define ALIGN16(X) X __attribute__((aligned(16)))
9 # define ALIGN32(X) X __attribute__((aligned(32)))
12 #if AL_MAYA_ENABLE_SIMD
15 # include <immintrin.h>
19 # include <pmmintrin.h>
23 # include <smmintrin.h>
27 # define AL_DLL_HIDDEN __attribute__ ((visibility ("hidden")))
29 # define AL_DLL_HIDDEN
34 #if (__GNUC__ <= 4) && (__GNUC_MINOR__ <= 8)
35 # define ENABLE_SOME_AVX_ROUTINES 0
37 # define ENABLE_SOME_AVX_ROUTINES 1
// Byte-wise shift of a whole 128-bit integer register to the left (forwards to _mm_slli_si128).
46 #define shiftBytesLeft(reg, count) _mm_slli_si128(reg, count)
// Byte-wise shift of a whole 128-bit integer register to the right (forwards to _mm_srli_si128).
47 #define shiftBytesRight(reg, count) _mm_srli_si128(reg, count)
// Shuffles floats from a and b with _mm_shuffle_ps; W, Z, Y, X are packed into the
// immediate by _MM_SHUFFLE in that order.
48 #define shuffle4f(a, b, W, Z, Y, X) _mm_shuffle_ps(a, b, _MM_SHUFFLE(W, Z, Y, X))
// Shifts each 64-bit lane of X left by N bits (forwards to _mm_slli_epi64).
50 #define lshift64(X, N) _mm_slli_epi64(X, N)
52 AL_DLL_HIDDEN
inline f128 zero4f() {
return _mm_setzero_ps(); }
53 AL_DLL_HIDDEN
inline i128 zero4i() {
return _mm_setzero_si128(); }
54 AL_DLL_HIDDEN
inline d128 zero2d() {
return _mm_setzero_pd(); }
56 AL_DLL_HIDDEN
inline f128 cast4f(
const d128 reg) {
return _mm_castpd_ps(reg); }
57 AL_DLL_HIDDEN
inline f128 cast4f(
const i128 reg) {
return _mm_castsi128_ps(reg); }
58 AL_DLL_HIDDEN
inline i128 cast4i(
const d128 reg) {
return _mm_castpd_si128(reg); }
59 AL_DLL_HIDDEN
inline i128 cast4i(
const f128 reg) {
return _mm_castps_si128(reg); }
60 AL_DLL_HIDDEN
inline d128 cast2d(
const f128 reg) {
return _mm_castps_pd(reg); }
61 AL_DLL_HIDDEN
inline d128 cast2d(
const i128 reg) {
return _mm_castsi128_pd(reg); }
63 AL_DLL_HIDDEN
inline f128 load1f(
const float*
const ptr) {
return _mm_load_ss(ptr); }
64 AL_DLL_HIDDEN
inline f128 load2f(
const float*
const ptr) {
return cast4f(_mm_load_sd((
const double*)ptr)); }
66 AL_DLL_HIDDEN
inline int32_t movemask16i8(
const i128 reg) {
return _mm_movemask_epi8(reg); }
67 AL_DLL_HIDDEN
inline int32_t movemask4i(
const i128 reg) {
return _mm_movemask_ps(cast4f(reg)); }
68 AL_DLL_HIDDEN
inline int32_t movemask4f(
const f128 reg) {
return _mm_movemask_ps(reg); }
69 AL_DLL_HIDDEN
inline int32_t movemask2d(
const d128 reg) {
return _mm_movemask_pd(reg); }
71 AL_DLL_HIDDEN
inline i128 cmpeq4i(
const i128 a,
const i128 b) {
return _mm_cmpeq_epi32(a, b); }
72 AL_DLL_HIDDEN
inline i128 cmpeq16i8(
const i128 a,
const i128 b) {
return _mm_cmpeq_epi8(a, b); }
73 AL_DLL_HIDDEN
inline i128 cmplt16i8(
const i128 a,
const i128 b) {
return _mm_cmplt_epi8(a, b); }
74 AL_DLL_HIDDEN
inline i128 cmpgt16i8(
const i128 a,
const i128 b) {
return _mm_cmpgt_epi8(a, b); }
76 AL_DLL_HIDDEN
inline f128 set4f(
const float a,
const float b,
const float c,
const float d) {
return _mm_setr_ps(a, b, c, d); }
77 AL_DLL_HIDDEN
inline i128 set4i(
const int32_t a,
const int32_t b,
const int32_t c,
const int32_t d) {
return _mm_setr_epi32(a, b, c, d); }
78 AL_DLL_HIDDEN
inline d128 set2d(
const double a,
const double b) {
return _mm_setr_pd(a, b); }
80 AL_DLL_HIDDEN
inline i128 set16i8(
81 const int8_t a0,
const int8_t b0,
const int8_t c0,
const int8_t d0,
82 const int8_t a1,
const int8_t b1,
const int8_t c1,
const int8_t d1,
83 const int8_t a2,
const int8_t b2,
const int8_t c2,
const int8_t d2,
84 const int8_t a3,
const int8_t b3,
const int8_t c3,
const int8_t d3)
85 {
return _mm_setr_epi8(a0, b0, c0, d0, a1, b1, c1, d1, a2, b2, c2, d2, a3, b3, c3, d3); }
87 AL_DLL_HIDDEN
inline f128 loadu4f(
const void*
const ptr) {
return _mm_loadu_ps((
const float*)ptr); }
88 AL_DLL_HIDDEN
inline i128 loadu4i(
const void*
const ptr) {
return _mm_loadu_si128((
const i128*)ptr); }
89 AL_DLL_HIDDEN
inline d128 loadu2d(
const void*
const ptr) {
return _mm_loadu_pd((
const double*)ptr); }
91 AL_DLL_HIDDEN
inline f128 load4f(
const void*
const ptr) {
return _mm_load_ps((
const float*)ptr); }
92 AL_DLL_HIDDEN
inline i128 load4i(
const void*
const ptr) {
return _mm_load_si128((
const i128*)ptr); }
93 AL_DLL_HIDDEN
inline d128 load2d(
const void*
const ptr) {
return _mm_load_pd((
const double*)ptr); }
95 AL_DLL_HIDDEN
inline void storeu4f(
void*
const ptr,
const f128 reg) { _mm_storeu_ps((
float*)ptr, reg); }
96 AL_DLL_HIDDEN
inline void storeu4i(
void*
const ptr,
const i128 reg) { _mm_storeu_si128((i128*)ptr, reg); }
97 AL_DLL_HIDDEN
inline void storeu2d(
void*
const ptr,
const d128 reg) { _mm_storeu_pd((
double*)ptr, reg); }
99 AL_DLL_HIDDEN
inline void store4f(
void*
const ptr,
const f128 reg) { _mm_store_ps((
float*)ptr, reg); }
100 AL_DLL_HIDDEN
inline void store4i(
void*
const ptr,
const i128 reg) { _mm_store_si128((i128*)ptr, reg); }
101 AL_DLL_HIDDEN
inline void store2d(
void*
const ptr,
const d128 reg) { _mm_store_pd((
double*)ptr, reg); }
103 AL_DLL_HIDDEN
inline d128 cvt2f_to_2d(
const f128 reg) {
return _mm_cvtps_pd(reg); }
104 AL_DLL_HIDDEN
inline f128 cvt2d_to_2f(
const d128 reg) {
return _mm_cvtpd_ps(reg); }
106 AL_DLL_HIDDEN
inline f128 movehl4f(
const f128 a,
const f128 b) {
return _mm_movehl_ps(a, b); }
107 AL_DLL_HIDDEN
inline f128 movelh4f(
const f128 a,
const f128 b) {
return _mm_movelh_ps(a, b); }
108 AL_DLL_HIDDEN
inline i128 movehl4i(
const i128 a,
const i128 b) {
return cast4i(_mm_movehl_ps(cast4f(a), cast4f(b))); }
109 AL_DLL_HIDDEN
inline i128 movelh4i(
const i128 a,
const i128 b) {
return cast4i(_mm_movelh_ps(cast4f(a), cast4f(b))); }
111 AL_DLL_HIDDEN
inline f128 or4f(
const f128 a,
const f128 b) {
return _mm_or_ps(a, b); }
112 AL_DLL_HIDDEN
inline f128 and4f(
const f128 a,
const f128 b) {
return _mm_and_ps(a, b); }
113 AL_DLL_HIDDEN
inline f128 andnot4f(
const f128 a,
const f128 b) {
return _mm_andnot_ps(a, b); }
115 AL_DLL_HIDDEN
inline i128 or4i(
const i128 a,
const i128 b) {
return _mm_or_si128(a, b); }
116 AL_DLL_HIDDEN
inline i128 and4i(
const i128 a,
const i128 b) {
return _mm_and_si128(a, b); }
117 AL_DLL_HIDDEN
inline i128 andnot4i(
const i128 a,
const i128 b) {
return _mm_andnot_si128(a, b); }
119 AL_DLL_HIDDEN
inline f128 mul4f(
const f128 a,
const f128 b) {
return _mm_mul_ps(a, b); }
120 AL_DLL_HIDDEN
inline d128 mul2d(
const d128 a,
const d128 b) {
return _mm_mul_pd(a, b); }
122 AL_DLL_HIDDEN
inline f128 add4f(
const f128 a,
const f128 b) {
return _mm_add_ps(a, b); }
123 AL_DLL_HIDDEN
inline i128 add4i(
const i128 a,
const i128 b) {
return _mm_add_epi32(a, b); }
124 AL_DLL_HIDDEN
inline d128 add2d(
const d128 a,
const d128 b) {
return _mm_add_pd(a, b); }
125 AL_DLL_HIDDEN
inline i128 add2i64(
const i128 a,
const i128 b) {
return _mm_add_epi64(a, b); }
127 AL_DLL_HIDDEN
inline f128 sub4f(
const f128 a,
const f128 b) {
return _mm_sub_ps(a, b); }
128 AL_DLL_HIDDEN
inline i128 sub4i(
const i128 a,
const i128 b) {
return _mm_sub_epi32(a, b); }
129 AL_DLL_HIDDEN
inline d128 sub2d(
const d128 a,
const d128 b) {
return _mm_sub_pd(a, b); }
130 AL_DLL_HIDDEN
inline i128 sub2i64(
const i128 a,
const i128 b) {
return _mm_sub_epi64(a, b); }
132 AL_DLL_HIDDEN
inline f128 splat4f(
float f) {
return _mm_set1_ps(f); }
133 AL_DLL_HIDDEN
inline d128 splat2d(
double f) {
return _mm_set1_pd(f); }
134 AL_DLL_HIDDEN
inline i128 splat4i(int32_t f) {
return _mm_set1_epi32(f); }
135 AL_DLL_HIDDEN
inline i128 splat2i64(
const int64_t f) {
return _mm_set1_epi64x(f); }
137 AL_DLL_HIDDEN
inline f128 unpacklo4f(
const f128 a,
const f128 b) {
return _mm_unpacklo_ps(a, b); }
138 AL_DLL_HIDDEN
inline f128 unpackhi4f(
const f128 a,
const f128 b) {
return _mm_unpackhi_ps(a, b); }
140 #if !defined(__SSE4__) && !defined(__SSE4_1__) && !defined(__SSE4_2__) && !defined(__AVX__) && !defined(__AVX2__)
/// \brief  Stand-in for the _mm_blendv_ps intrinsic, compiled only when the preprocessor test
///         directly above finds none of the SSE4/AVX feature macros defined.
///         Computes (c & b) | (~c & a): bits set in `c` take the value from `b`, cleared bits
///         take the value from `a`.
///         NOTE(review): this selects bit-by-bit across the whole mask, so it matches a
///         sign-bit-driven blend only when each lane of `c` is all ones or all zeros
///         (e.g. a comparison result) - confirm callers only pass such masks.
/// \param  a value chosen where the mask bits are clear
/// \param  b value chosen where the mask bits are set
/// \param  c the selection mask
141 AL_DLL_HIDDEN
inline __m128 _mm_blendv_ps(__m128 a, __m128 b, __m128 c)
143 return _mm_or_ps(_mm_and_ps(c, b), _mm_andnot_ps(c, a));
146 AL_DLL_HIDDEN
inline i128 cvt2i32_to_2i64(
const i128 reg) {
return _mm_cvtepi32_epi64(reg); }
149 AL_DLL_HIDDEN
inline f128 select4f(
const f128 falseResult,
const f128 trueResult,
const f128 cmp) {
return _mm_blendv_ps(falseResult, trueResult, cmp); }
// Whole-register byte shifts on a 128-bit integer register.
151 #define shiftBytesLeft128(reg, count) _mm_slli_si128(reg, count)
152 #define shiftBytesRight128(reg, count) _mm_srli_si128(reg, count)
// Logical bit shifts applied independently to each of the four 32-bit lanes.
153 #define shiftBitsLeft4i32(reg, count) _mm_slli_epi32(reg, count)
154 #define shiftBitsRight4i32(reg, count) _mm_srli_epi32(reg, count)
// Logical bit shifts applied independently to each of the two 64-bit lanes.
155 #define shiftBitsLeft2i64(reg, count) _mm_slli_epi64(reg, count)
156 #define shiftBitsRight2i64(reg, count) _mm_srli_epi64(reg, count)
// Extracts the 64-bit integer at lane `index` from a 128-bit register.
158 #define extract128i64(reg, index) _mm_extract_epi64(reg, index)
// Short alias for the 256-bit integer register type.
164 typedef __m256i i256;
// Short alias for the 256-bit register type holding four doubles.
165 typedef __m256d d256;
// 256-bit counterpart of shuffle4f: forwards to _mm256_shuffle_ps with W, Z, Y, X packed into
// the immediate by _MM_SHUFFLE in that order.
167 #define shuffle8f(a, b, W, Z, Y, X) _mm256_shuffle_ps(a, b, _MM_SHUFFLE(W, Z, Y, X))
169 AL_DLL_HIDDEN
inline f256 zero8f() {
return _mm256_setzero_ps(); }
170 AL_DLL_HIDDEN
inline i256 zero8i() {
return _mm256_setzero_si256(); }
171 AL_DLL_HIDDEN
inline d256 zero4d() {
return _mm256_setzero_pd(); }
173 AL_DLL_HIDDEN
inline f256 cast8f(
const d256 reg) {
return _mm256_castpd_ps(reg); }
174 AL_DLL_HIDDEN
inline f256 cast8f(
const i256 reg) {
return _mm256_castsi256_ps(reg); }
175 AL_DLL_HIDDEN
inline i256 cast8i(
const d256 reg) {
return _mm256_castpd_si256(reg); }
176 AL_DLL_HIDDEN
inline i256 cast8i(
const f256 reg) {
return _mm256_castps_si256(reg); }
177 AL_DLL_HIDDEN
inline d256 cast4d(
const f256 reg) {
return _mm256_castps_pd(reg); }
178 AL_DLL_HIDDEN
inline d256 cast4d(
const i256 reg) {
return _mm256_castsi256_pd(reg); }
180 AL_DLL_HIDDEN
inline int32_t movemask8i(
const i256 reg) {
return _mm256_movemask_ps(cast8f(reg)); }
181 AL_DLL_HIDDEN
inline int32_t movemask8f(
const f256 reg) {
return _mm256_movemask_ps(reg); }
182 AL_DLL_HIDDEN
inline int32_t movemask4d(
const d256 reg) {
return _mm256_movemask_pd(reg); }
184 AL_DLL_HIDDEN
inline i256 cmpeq8i(
const i256 a,
const i256 b) {
return _mm256_cmpeq_epi32(a, b); }
// Selects/combines 128-bit halves of a and b under control of the immediate `mask`
// (forwards to _mm256_permute2f128_ps).
186 #define permute2f128(a, b, mask) _mm256_permute2f128_ps(a, b, mask)
188 AL_DLL_HIDDEN
inline f256 set8f(
const float a,
const float b,
const float c,
const float d,
189 const float e,
const float f,
const float g,
const float h)
190 {
return _mm256_setr_ps(a,b,c,d,e,f,g,h); }
191 AL_DLL_HIDDEN
inline i256 set8i(
const int32_t a,
const int32_t b,
const int32_t c,
const int32_t d,
192 const int32_t e,
const int32_t f,
const int32_t g,
const int32_t h)
193 {
return _mm256_setr_epi32(a,b,c,d,e,f,g,h); }
194 AL_DLL_HIDDEN
inline d256 set4f(
const double a,
const double b,
const double c,
const double d)
195 {
return _mm256_setr_pd(a, b, c, d); }
197 AL_DLL_HIDDEN
inline f256 loadu8f(
const void*
const ptr) {
return _mm256_loadu_ps((
const float*)ptr); }
198 AL_DLL_HIDDEN
inline i256 loadu8i(
const void*
const ptr) {
return _mm256_loadu_si256((
const i256*)ptr); }
199 AL_DLL_HIDDEN
inline d256 loadu4d(
const void*
const ptr) {
return _mm256_loadu_pd((
const double*)ptr); }
201 AL_DLL_HIDDEN
inline f256 load8f(
const void*
const ptr) {
return _mm256_load_ps((
const float*)ptr); }
202 AL_DLL_HIDDEN
inline i256 load8i(
const void*
const ptr) {
return _mm256_load_si256((
const i256*)ptr); }
203 AL_DLL_HIDDEN
inline d256 load4d(
const void*
const ptr) {
return _mm256_load_pd((
const double*)ptr); }
205 AL_DLL_HIDDEN
inline void storeu8f(
void*
const ptr,
const f256 reg) { _mm256_storeu_ps((
float*)ptr, reg); }
206 AL_DLL_HIDDEN
inline void storeu8i(
void*
const ptr,
const i256 reg) { _mm256_storeu_si256((i256*)ptr, reg); }
207 AL_DLL_HIDDEN
inline void storeu4d(
void*
const ptr,
const d256 reg) { _mm256_storeu_pd((
double*)ptr, reg); }
209 AL_DLL_HIDDEN
inline void store8f(
void*
const ptr,
const f256 reg) { _mm256_store_ps((
float*)ptr, reg); }
210 AL_DLL_HIDDEN
inline void store8i(
void*
const ptr,
const i256 reg) { _mm256_store_si256((i256*)ptr, reg); }
211 AL_DLL_HIDDEN
inline void store4d(
void*
const ptr,
const d256 reg) { _mm256_store_pd((
double*)ptr, reg); }
213 AL_DLL_HIDDEN
inline d256 cvt4f_to_4d(
const f128 reg) {
return _mm256_cvtps_pd(reg); }
214 AL_DLL_HIDDEN
inline f128 cvt4d_to_4f(
const d256 reg) {
return _mm256_cvtpd_ps(reg); }
215 AL_DLL_HIDDEN
inline i256 cvt4i32_to_4i64(
const i128 reg) {
return _mm256_cvtepi32_epi64(reg); }
217 AL_DLL_HIDDEN
inline f256 or8f(
const f256 a,
const f256 b) {
return _mm256_or_ps(a, b); }
218 AL_DLL_HIDDEN
inline f256 and8f(
const f256 a,
const f256 b) {
return _mm256_and_ps(a, b); }
219 AL_DLL_HIDDEN
inline f256 andnot8f(
const f256 a,
const f256 b) {
return _mm256_andnot_ps(a, b); }
221 AL_DLL_HIDDEN
inline i256 or8i(
const i256 a,
const i256 b) {
return _mm256_or_si256(a, b); }
222 AL_DLL_HIDDEN
inline i256 and8i(
const i256 a,
const i256 b) {
return _mm256_and_si256(a, b); }
223 AL_DLL_HIDDEN
inline i256 andnot8i(
const i256 a,
const i256 b) {
return _mm256_andnot_si256(a, b); }
225 AL_DLL_HIDDEN
inline f256 mul8f(
const f256 a,
const f256 b) {
return _mm256_mul_ps(a, b); }
226 AL_DLL_HIDDEN
inline d256 mul4d(
const d256 a,
const d256 b) {
return _mm256_mul_pd(a, b); }
228 AL_DLL_HIDDEN
inline f256 add8f(
const f256 a,
const f256 b) {
return _mm256_add_ps(a, b); }
229 AL_DLL_HIDDEN
inline i256 add8i(
const i256 a,
const i256 b) {
return _mm256_add_epi32(a, b); }
230 AL_DLL_HIDDEN
inline d256 add4d(
const d256 a,
const d256 b) {
return _mm256_add_pd(a, b); }
231 AL_DLL_HIDDEN
inline i256 add4i64(
const i256 a,
const i256 b) {
return _mm256_add_epi64(a, b); }
233 AL_DLL_HIDDEN
inline f256 select8f(
const f256 falseResult,
const f256 trueResult,
const f256 cmp) {
return _mm256_blendv_ps(falseResult, trueResult, cmp); }
235 AL_DLL_HIDDEN
inline f256 permutevar8x32f(
const f256 a,
const i256 b) {
return _mm256_permutevar8x32_ps(a, b); }
237 AL_DLL_HIDDEN
inline f256 unpacklo8f(
const f256 a,
const f256 b) {
return _mm256_unpacklo_ps(a, b); }
238 AL_DLL_HIDDEN
inline f256 unpackhi8f(
const f256 a,
const f256 b) {
return _mm256_unpackhi_ps(a, b); }
// Extracts the 128-bit float half selected by `index` from a 256-bit float register.
240 #define extract4f(reg, index) _mm256_extractf128_ps(reg, index)
// Extracts the 64-bit integer at lane `index` from a 256-bit integer register.
241 #define extract256i64(reg, index) _mm256_extract_epi64(reg, index)
243 AL_DLL_HIDDEN
inline f256 splat8f(
const float f) {
return _mm256_set1_ps(f); }
244 AL_DLL_HIDDEN
inline d256 splat4d(
const double f) {
return _mm256_set1_pd(f); }
245 AL_DLL_HIDDEN
inline i256 splat8i(
const int32_t f) {
return _mm256_set1_epi32(f); }
246 AL_DLL_HIDDEN
inline i256 splat4i64(
const int64_t f) {
return _mm256_set1_epi64x(f); }
248 AL_DLL_HIDDEN
inline f128 i32gather4f(
const float*
const ptr,
const i128 indices) {
return _mm_i32gather_ps(ptr, indices, 4); }
249 AL_DLL_HIDDEN
inline f256 i32gather8f(
const float*
const ptr,
const i256 indices) {
return _mm256_i32gather_ps(ptr, indices, 4); }
250 AL_DLL_HIDDEN
inline i128 i32gather4i(
const int32_t*
const ptr,
const i128 indices) {
return _mm_i32gather_epi32(ptr, indices, 4); }
251 AL_DLL_HIDDEN
inline i256 i32gather8i(
const int32_t*
const ptr,
const i256 indices) {
return _mm256_i32gather_epi32(ptr, indices, 4); }
253 AL_DLL_HIDDEN
inline f256 set2f128(
const f128 lo,
const f128 hi) {
return _mm256_insertf128_ps(_mm256_castps128_ps256(lo), hi, 1); }
// Byte shifts on a 256-bit integer register (forward to _mm256_slli_si256 / _mm256_srli_si256).
// NOTE(review): per the Intel documentation these shift each 128-bit half independently rather
// than the full 256 bits - confirm callers expect that.
255 #define shiftBytesLeft256(reg, count) _mm256_slli_si256(reg, count)
256 #define shiftBytesRight256(reg, count) _mm256_srli_si256(reg, count)
// Logical bit shifts applied independently to each of the eight 32-bit lanes.
257 #define shiftBitsLeft8i32(reg, count) _mm256_slli_epi32(reg, count)
258 #define shiftBitsRight8i32(reg, count) _mm256_srli_epi32(reg, count)
// Logical bit shifts applied independently to each of the four 64-bit lanes.
259 #define shiftBitsLeft4i64(reg, count) _mm256_slli_epi64(reg, count)
260 #define shiftBitsRight4i64(reg, count) _mm256_srli_epi64(reg, count)