AL_USDMaya  0.16.6
USD to Maya Bridge
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros Groups Pages
SIMD.h
1 #pragma once
2 #include "AL/maya/Common.h"
3 
// Alignment helpers. ALIGN16(X) / ALIGN32(X) wrap the declaration X so that it is
// aligned to a 16 / 32 byte boundary. Windows builds use the __declspec(align(N))
// prefix form, every other build uses the __attribute__((aligned(N))) suffix form.
#if defined(_WIN32)
# define ALIGN16(X) __declspec(align(16)) X
# define ALIGN32(X) __declspec(align(32)) X
#else
# define ALIGN16(X) X __attribute__((aligned(16)))
# define ALIGN32(X) X __attribute__((aligned(32)))
#endif
11 
12 #if AL_MAYA_ENABLE_SIMD
13 
14 # ifdef __AVX2__
15 # include <immintrin.h>
16 # endif
17 
18 # ifdef __SSE3__
19 # include <pmmintrin.h>
20 # endif
21 
22 # ifdef __SSE4_1__
23 # include <smmintrin.h>
24 # endif
25 
// AL_DLL_HIDDEN expands to the "hidden" visibility attribute when the compiler reports
// itself as GCC 4 or newer, and to nothing for every other compiler.
#if defined(__GNUC__) && (__GNUC__ >= 4)
# define AL_DLL_HIDDEN __attribute__ ((visibility ("hidden")))
#else
# define AL_DLL_HIDDEN
#endif
31 
// GCC releases up to and including 4.8 fail to correctly assemble certain AVX2 instructions.
// This is a known issue that was fixed in GCC 4.9, so the affected routines are switched off
// for those compilers only.
//
// The test is restricted to genuine GCC: clang and the Intel compiler also define __GNUC__
// (clang reports itself as 4.2) but do not have the bug, and an undefined __GNUC__ evaluates
// to 0 inside #if, which would otherwise disable the routines on every non-GNU compiler.
// Major and minor are compared together so that "older than 4.9" is evaluated correctly.
#if defined(__GNUC__) && !defined(__clang__) && !defined(__INTEL_COMPILER) && \
    ((__GNUC__ < 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ <= 8)))
# define ENABLE_SOME_AVX_ROUTINES 0
#else
# define ENABLE_SOME_AVX_ROUTINES 1
#endif
39 
40 namespace AL {
41 # if __SSE3__
42 typedef __m128 f128;
43 typedef __m128i i128;
44 typedef __m128d d128;
45 
46 #define shiftBytesLeft(reg, count) _mm_slli_si128(reg, count)
47 #define shiftBytesRight(reg, count) _mm_srli_si128(reg, count)
48 #define shuffle4f(a, b, W, Z, Y, X) _mm_shuffle_ps(a, b, _MM_SHUFFLE(W, Z, Y, X))
49 
50 #define lshift64(X, N) _mm_slli_epi64(X, N)
51 
52 AL_DLL_HIDDEN inline f128 zero4f() { return _mm_setzero_ps(); }
53 AL_DLL_HIDDEN inline i128 zero4i() { return _mm_setzero_si128(); }
54 AL_DLL_HIDDEN inline d128 zero2d() { return _mm_setzero_pd(); }
55 
56 AL_DLL_HIDDEN inline f128 cast4f(const d128 reg) { return _mm_castpd_ps(reg); }
57 AL_DLL_HIDDEN inline f128 cast4f(const i128 reg) { return _mm_castsi128_ps(reg); }
58 AL_DLL_HIDDEN inline i128 cast4i(const d128 reg) { return _mm_castpd_si128(reg); }
59 AL_DLL_HIDDEN inline i128 cast4i(const f128 reg) { return _mm_castps_si128(reg); }
60 AL_DLL_HIDDEN inline d128 cast2d(const f128 reg) { return _mm_castps_pd(reg); }
61 AL_DLL_HIDDEN inline d128 cast2d(const i128 reg) { return _mm_castsi128_pd(reg); }
62 
63 AL_DLL_HIDDEN inline f128 load1f(const float* const ptr) { return _mm_load_ss(ptr); }
64 AL_DLL_HIDDEN inline f128 load2f(const float* const ptr) { return cast4f(_mm_load_sd((const double*)ptr)); }
65 
66 AL_DLL_HIDDEN inline int32_t movemask16i8(const i128 reg) { return _mm_movemask_epi8(reg); }
67 AL_DLL_HIDDEN inline int32_t movemask4i(const i128 reg) { return _mm_movemask_ps(cast4f(reg)); }
68 AL_DLL_HIDDEN inline int32_t movemask4f(const f128 reg) { return _mm_movemask_ps(reg); }
69 AL_DLL_HIDDEN inline int32_t movemask2d(const d128 reg) { return _mm_movemask_pd(reg); }
70 
71 AL_DLL_HIDDEN inline i128 cmpeq4i(const i128 a, const i128 b) { return _mm_cmpeq_epi32(a, b); }
72 AL_DLL_HIDDEN inline i128 cmpeq16i8(const i128 a, const i128 b) { return _mm_cmpeq_epi8(a, b); }
73 AL_DLL_HIDDEN inline i128 cmplt16i8(const i128 a, const i128 b) { return _mm_cmplt_epi8(a, b); }
74 AL_DLL_HIDDEN inline i128 cmpgt16i8(const i128 a, const i128 b) { return _mm_cmpgt_epi8(a, b); }
75 
76 AL_DLL_HIDDEN inline f128 set4f(const float a, const float b, const float c, const float d) {return _mm_setr_ps(a, b, c, d); }
77 AL_DLL_HIDDEN inline i128 set4i(const int32_t a, const int32_t b, const int32_t c, const int32_t d) {return _mm_setr_epi32(a, b, c, d); }
78 AL_DLL_HIDDEN inline d128 set2d(const double a, const double b) {return _mm_setr_pd(a, b); }
79 
80 AL_DLL_HIDDEN inline i128 set16i8(
81  const int8_t a0, const int8_t b0, const int8_t c0, const int8_t d0,
82  const int8_t a1, const int8_t b1, const int8_t c1, const int8_t d1,
83  const int8_t a2, const int8_t b2, const int8_t c2, const int8_t d2,
84  const int8_t a3, const int8_t b3, const int8_t c3, const int8_t d3)
85 {return _mm_setr_epi8(a0, b0, c0, d0, a1, b1, c1, d1, a2, b2, c2, d2, a3, b3, c3, d3); }
86 
87 AL_DLL_HIDDEN inline f128 loadu4f(const void* const ptr) { return _mm_loadu_ps((const float*)ptr); }
88 AL_DLL_HIDDEN inline i128 loadu4i(const void* const ptr) { return _mm_loadu_si128((const i128*)ptr); }
89 AL_DLL_HIDDEN inline d128 loadu2d(const void* const ptr) { return _mm_loadu_pd((const double*)ptr); }
90 
91 AL_DLL_HIDDEN inline f128 load4f(const void* const ptr) { return _mm_load_ps((const float*)ptr); }
92 AL_DLL_HIDDEN inline i128 load4i(const void* const ptr) { return _mm_load_si128((const i128*)ptr); }
93 AL_DLL_HIDDEN inline d128 load2d(const void* const ptr) { return _mm_load_pd((const double*)ptr); }
94 
95 AL_DLL_HIDDEN inline void storeu4f(void* const ptr, const f128 reg) { _mm_storeu_ps((float*)ptr, reg); }
96 AL_DLL_HIDDEN inline void storeu4i(void* const ptr, const i128 reg) { _mm_storeu_si128((i128*)ptr, reg); }
97 AL_DLL_HIDDEN inline void storeu2d(void* const ptr, const d128 reg) { _mm_storeu_pd((double*)ptr, reg); }
98 
99 AL_DLL_HIDDEN inline void store4f(void* const ptr, const f128 reg) { _mm_store_ps((float*)ptr, reg); }
100 AL_DLL_HIDDEN inline void store4i(void* const ptr, const i128 reg) { _mm_store_si128((i128*)ptr, reg); }
101 AL_DLL_HIDDEN inline void store2d(void* const ptr, const d128 reg) { _mm_store_pd((double*)ptr, reg); }
102 
103 AL_DLL_HIDDEN inline d128 cvt2f_to_2d(const f128 reg) { return _mm_cvtps_pd(reg); }
104 AL_DLL_HIDDEN inline f128 cvt2d_to_2f(const d128 reg) { return _mm_cvtpd_ps(reg); }
105 
106 AL_DLL_HIDDEN inline f128 movehl4f(const f128 a, const f128 b) { return _mm_movehl_ps(a, b); }
107 AL_DLL_HIDDEN inline f128 movelh4f(const f128 a, const f128 b) { return _mm_movelh_ps(a, b); }
108 AL_DLL_HIDDEN inline i128 movehl4i(const i128 a, const i128 b) { return cast4i(_mm_movehl_ps(cast4f(a), cast4f(b))); }
109 AL_DLL_HIDDEN inline i128 movelh4i(const i128 a, const i128 b) { return cast4i(_mm_movelh_ps(cast4f(a), cast4f(b))); }
110 
111 AL_DLL_HIDDEN inline f128 or4f(const f128 a, const f128 b) { return _mm_or_ps(a, b); }
112 AL_DLL_HIDDEN inline f128 and4f(const f128 a, const f128 b) { return _mm_and_ps(a, b); }
113 AL_DLL_HIDDEN inline f128 andnot4f(const f128 a, const f128 b) { return _mm_andnot_ps(a, b); }
114 
115 AL_DLL_HIDDEN inline i128 or4i(const i128 a, const i128 b) { return _mm_or_si128(a, b); }
116 AL_DLL_HIDDEN inline i128 and4i(const i128 a, const i128 b) { return _mm_and_si128(a, b); }
117 AL_DLL_HIDDEN inline i128 andnot4i(const i128 a, const i128 b) { return _mm_andnot_si128(a, b); }
118 
119 AL_DLL_HIDDEN inline f128 mul4f(const f128 a, const f128 b) { return _mm_mul_ps(a, b); }
120 AL_DLL_HIDDEN inline d128 mul2d(const d128 a, const d128 b) { return _mm_mul_pd(a, b); }
121 
122 AL_DLL_HIDDEN inline f128 add4f(const f128 a, const f128 b) { return _mm_add_ps(a, b); }
123 AL_DLL_HIDDEN inline i128 add4i(const i128 a, const i128 b) { return _mm_add_epi32(a, b); }
124 AL_DLL_HIDDEN inline d128 add2d(const d128 a, const d128 b) { return _mm_add_pd(a, b); }
125 AL_DLL_HIDDEN inline i128 add2i64(const i128 a, const i128 b) { return _mm_add_epi64(a, b); }
126 
127 AL_DLL_HIDDEN inline f128 sub4f(const f128 a, const f128 b) { return _mm_sub_ps(a, b); }
128 AL_DLL_HIDDEN inline i128 sub4i(const i128 a, const i128 b) { return _mm_sub_epi32(a, b); }
129 AL_DLL_HIDDEN inline d128 sub2d(const d128 a, const d128 b) { return _mm_sub_pd(a, b); }
130 AL_DLL_HIDDEN inline i128 sub2i64(const i128 a, const i128 b) { return _mm_sub_epi64(a, b); }
131 
132 AL_DLL_HIDDEN inline f128 splat4f(float f) { return _mm_set1_ps(f); }
133 AL_DLL_HIDDEN inline d128 splat2d(double f) { return _mm_set1_pd(f); }
134 AL_DLL_HIDDEN inline i128 splat4i(int32_t f) { return _mm_set1_epi32(f); }
135 AL_DLL_HIDDEN inline i128 splat2i64(const int64_t f) { return _mm_set1_epi64x(f); }
136 
137 AL_DLL_HIDDEN inline f128 unpacklo4f(const f128 a, const f128 b) { return _mm_unpacklo_ps(a, b); }
138 AL_DLL_HIDDEN inline f128 unpackhi4f(const f128 a, const f128 b) { return _mm_unpackhi_ps(a, b); }
139 
140 #if !defined(__SSE4__) && !defined(__SSE4_1__) && !defined(__SSE4_2__) && !defined(__AVX__) && !defined(__AVX2__)
141 AL_DLL_HIDDEN inline __m128 _mm_blendv_ps(__m128 a, __m128 b, __m128 c)
142 {
143  return _mm_or_ps(_mm_and_ps(c, b), _mm_andnot_ps(c, a));
144 }
145 #else
146 AL_DLL_HIDDEN inline i128 cvt2i32_to_2i64(const i128 reg) { return _mm_cvtepi32_epi64(reg); }
147 #endif
148 
149 AL_DLL_HIDDEN inline f128 select4f(const f128 falseResult, const f128 trueResult, const f128 cmp) { return _mm_blendv_ps(falseResult, trueResult, cmp); }
150 
// Whole-register shifts, measured in bytes.
#define shiftBytesLeft128(reg, count) _mm_slli_si128((reg), (count))
#define shiftBytesRight128(reg, count) _mm_srli_si128((reg), (count))
// Per-lane logical shifts, measured in bits, on 4 x 32 bit lanes.
#define shiftBitsLeft4i32(reg, count) _mm_slli_epi32((reg), (count))
#define shiftBitsRight4i32(reg, count) _mm_srli_epi32((reg), (count))
// Per-lane logical shifts, measured in bits, on 2 x 64 bit lanes.
#define shiftBitsLeft2i64(reg, count) _mm_slli_epi64((reg), (count))
#define shiftBitsRight2i64(reg, count) _mm_srli_epi64((reg), (count))

// Extracts the 64 bit integer lane 'index' from reg. Expands to _mm_extract_epi64, which is
// an SSE4.1 intrinsic, so this can only be used when SSE4.1 is enabled.
#define extract128i64(reg, index) _mm_extract_epi64((reg), (index))
159 
160 #endif
161 
162 # if __AVX2__
163 typedef __m256 f256;
164 typedef __m256i i256;
165 typedef __m256d d256;
166 
167 #define shuffle8f(a, b, W, Z, Y, X) _mm256_shuffle_ps(a, b, _MM_SHUFFLE(W, Z, Y, X))
168 
169 AL_DLL_HIDDEN inline f256 zero8f() { return _mm256_setzero_ps(); }
170 AL_DLL_HIDDEN inline i256 zero8i() { return _mm256_setzero_si256(); }
171 AL_DLL_HIDDEN inline d256 zero4d() { return _mm256_setzero_pd(); }
172 
173 AL_DLL_HIDDEN inline f256 cast8f(const d256 reg) { return _mm256_castpd_ps(reg); }
174 AL_DLL_HIDDEN inline f256 cast8f(const i256 reg) { return _mm256_castsi256_ps(reg); }
175 AL_DLL_HIDDEN inline i256 cast8i(const d256 reg) { return _mm256_castpd_si256(reg); }
176 AL_DLL_HIDDEN inline i256 cast8i(const f256 reg) { return _mm256_castps_si256(reg); }
177 AL_DLL_HIDDEN inline d256 cast4d(const f256 reg) { return _mm256_castps_pd(reg); }
178 AL_DLL_HIDDEN inline d256 cast4d(const i256 reg) { return _mm256_castsi256_pd(reg); }
179 
180 AL_DLL_HIDDEN inline int32_t movemask8i(const i256 reg) { return _mm256_movemask_ps(cast8f(reg)); }
181 AL_DLL_HIDDEN inline int32_t movemask8f(const f256 reg) { return _mm256_movemask_ps(reg); }
182 AL_DLL_HIDDEN inline int32_t movemask4d(const d256 reg) { return _mm256_movemask_pd(reg); }
183 
184 AL_DLL_HIDDEN inline i256 cmpeq8i(const i256 a, const i256 b) { return _mm256_cmpeq_epi32(a, b); }
185 
186 #define permute2f128(a, b, mask) _mm256_permute2f128_ps(a, b, mask)
187 
188 AL_DLL_HIDDEN inline f256 set8f(const float a, const float b, const float c, const float d,
189  const float e, const float f, const float g, const float h)
190  {return _mm256_setr_ps(a,b,c,d,e,f,g,h); }
191 AL_DLL_HIDDEN inline i256 set8i(const int32_t a, const int32_t b, const int32_t c, const int32_t d,
192  const int32_t e, const int32_t f, const int32_t g, const int32_t h)
193  {return _mm256_setr_epi32(a,b,c,d,e,f,g,h); }
194 AL_DLL_HIDDEN inline d256 set4f(const double a, const double b, const double c, const double d)
195  {return _mm256_setr_pd(a, b, c, d); }
196 
197 AL_DLL_HIDDEN inline f256 loadu8f(const void* const ptr) { return _mm256_loadu_ps((const float*)ptr); }
198 AL_DLL_HIDDEN inline i256 loadu8i(const void* const ptr) { return _mm256_loadu_si256((const i256*)ptr); }
199 AL_DLL_HIDDEN inline d256 loadu4d(const void* const ptr) { return _mm256_loadu_pd((const double*)ptr); }
200 
201 AL_DLL_HIDDEN inline f256 load8f(const void* const ptr) { return _mm256_load_ps((const float*)ptr); }
202 AL_DLL_HIDDEN inline i256 load8i(const void* const ptr) { return _mm256_load_si256((const i256*)ptr); }
203 AL_DLL_HIDDEN inline d256 load4d(const void* const ptr) { return _mm256_load_pd((const double*)ptr); }
204 
205 AL_DLL_HIDDEN inline void storeu8f(void* const ptr, const f256 reg) { _mm256_storeu_ps((float*)ptr, reg); }
206 AL_DLL_HIDDEN inline void storeu8i(void* const ptr, const i256 reg) { _mm256_storeu_si256((i256*)ptr, reg); }
207 AL_DLL_HIDDEN inline void storeu4d(void* const ptr, const d256 reg) { _mm256_storeu_pd((double*)ptr, reg); }
208 
209 AL_DLL_HIDDEN inline void store8f(void* const ptr, const f256 reg) { _mm256_store_ps((float*)ptr, reg); }
210 AL_DLL_HIDDEN inline void store8i(void* const ptr, const i256 reg) { _mm256_store_si256((i256*)ptr, reg); }
211 AL_DLL_HIDDEN inline void store4d(void* const ptr, const d256 reg) { _mm256_store_pd((double*)ptr, reg); }
212 
213 AL_DLL_HIDDEN inline d256 cvt4f_to_4d(const f128 reg) { return _mm256_cvtps_pd(reg); }
214 AL_DLL_HIDDEN inline f128 cvt4d_to_4f(const d256 reg) { return _mm256_cvtpd_ps(reg); }
215 AL_DLL_HIDDEN inline i256 cvt4i32_to_4i64(const i128 reg) { return _mm256_cvtepi32_epi64(reg); }
216 
217 AL_DLL_HIDDEN inline f256 or8f(const f256 a, const f256 b) { return _mm256_or_ps(a, b); }
218 AL_DLL_HIDDEN inline f256 and8f(const f256 a, const f256 b) { return _mm256_and_ps(a, b); }
219 AL_DLL_HIDDEN inline f256 andnot8f(const f256 a, const f256 b) { return _mm256_andnot_ps(a, b); }
220 
221 AL_DLL_HIDDEN inline i256 or8i(const i256 a, const i256 b) { return _mm256_or_si256(a, b); }
222 AL_DLL_HIDDEN inline i256 and8i(const i256 a, const i256 b) { return _mm256_and_si256(a, b); }
223 AL_DLL_HIDDEN inline i256 andnot8i(const i256 a, const i256 b) { return _mm256_andnot_si256(a, b); }
224 
225 AL_DLL_HIDDEN inline f256 mul8f(const f256 a, const f256 b) { return _mm256_mul_ps(a, b); }
226 AL_DLL_HIDDEN inline d256 mul4d(const d256 a, const d256 b) { return _mm256_mul_pd(a, b); }
227 
228 AL_DLL_HIDDEN inline f256 add8f(const f256 a, const f256 b) { return _mm256_add_ps(a, b); }
229 AL_DLL_HIDDEN inline i256 add8i(const i256 a, const i256 b) { return _mm256_add_epi32(a, b); }
230 AL_DLL_HIDDEN inline d256 add4d(const d256 a, const d256 b) { return _mm256_add_pd(a, b); }
231 AL_DLL_HIDDEN inline i256 add4i64(const i256 a, const i256 b) { return _mm256_add_epi64(a, b); }
232 
233 AL_DLL_HIDDEN inline f256 select8f(const f256 falseResult, const f256 trueResult, const f256 cmp) { return _mm256_blendv_ps(falseResult, trueResult, cmp); }
234 
235 AL_DLL_HIDDEN inline f256 permutevar8x32f(const f256 a, const i256 b) { return _mm256_permutevar8x32_ps(a, b); }
236 
237 AL_DLL_HIDDEN inline f256 unpacklo8f(const f256 a, const f256 b) { return _mm256_unpacklo_ps(a, b); }
238 AL_DLL_HIDDEN inline f256 unpackhi8f(const f256 a, const f256 b) { return _mm256_unpackhi_ps(a, b); }
239 
240 #define extract4f(reg, index) _mm256_extractf128_ps(reg, index)
241 #define extract256i64(reg, index) _mm256_extract_epi64(reg, index)
242 
243 AL_DLL_HIDDEN inline f256 splat8f(const float f) { return _mm256_set1_ps(f); }
244 AL_DLL_HIDDEN inline d256 splat4d(const double f) { return _mm256_set1_pd(f); }
245 AL_DLL_HIDDEN inline i256 splat8i(const int32_t f) { return _mm256_set1_epi32(f); }
246 AL_DLL_HIDDEN inline i256 splat4i64(const int64_t f) { return _mm256_set1_epi64x(f); }
247 
248 AL_DLL_HIDDEN inline f128 i32gather4f(const float* const ptr, const i128 indices) { return _mm_i32gather_ps(ptr, indices, 4); }
249 AL_DLL_HIDDEN inline f256 i32gather8f(const float* const ptr, const i256 indices) { return _mm256_i32gather_ps(ptr, indices, 4); }
250 AL_DLL_HIDDEN inline i128 i32gather4i(const int32_t* const ptr, const i128 indices) { return _mm_i32gather_epi32(ptr, indices, 4); }
251 AL_DLL_HIDDEN inline i256 i32gather8i(const int32_t* const ptr, const i256 indices) { return _mm256_i32gather_epi32(ptr, indices, 4); }
252 
253 AL_DLL_HIDDEN inline f256 set2f128(const f128 lo, const f128 hi) { return _mm256_insertf128_ps(_mm256_castps128_ps256(lo), hi, 1); }
254 
// Byte shifts on a 256 bit register (these expand to _mm256_slli_si256 / _mm256_srli_si256).
#define shiftBytesLeft256(reg, count) _mm256_slli_si256((reg), (count))
#define shiftBytesRight256(reg, count) _mm256_srli_si256((reg), (count))
// Per-lane logical shifts, measured in bits, on 8 x 32 bit lanes.
#define shiftBitsLeft8i32(reg, count) _mm256_slli_epi32((reg), (count))
#define shiftBitsRight8i32(reg, count) _mm256_srli_epi32((reg), (count))
// Per-lane logical shifts, measured in bits, on 4 x 64 bit lanes.
#define shiftBitsLeft4i64(reg, count) _mm256_slli_epi64((reg), (count))
#define shiftBitsRight4i64(reg, count) _mm256_srli_epi64((reg), (count))
261 
262 # endif
263 } // AL
264 #endif
265