OpenShot Audio Library | OpenShotAudio 0.4.0
 
Loading...
Searching...
No Matches
juce_SIMDNativeOps_avx.h
1/*
2 ==============================================================================
3
4 This file is part of the JUCE library.
5 Copyright (c) 2022 - Raw Material Software Limited
6
7 JUCE is an open source library subject to commercial or open-source
8 licensing.
9
10 By using JUCE, you agree to the terms of both the JUCE 7 End-User License
11 Agreement and JUCE Privacy Policy.
12
13 End User License Agreement: www.juce.com/juce-7-licence
14 Privacy Policy: www.juce.com/juce-privacy-policy
15
16 Or: You may also use this code under the terms of the GPL v3 (see
17 www.gnu.org/licenses).
18
19 JUCE IS PROVIDED "AS IS" WITHOUT ANY WARRANTY, AND ALL WARRANTIES, WHETHER
20 EXPRESSED OR IMPLIED, INCLUDING MERCHANTABILITY AND FITNESS FOR PURPOSE, ARE
21 DISCLAIMED.
22
23 ==============================================================================
24*/
25
26namespace juce::dsp
27{
28
29#ifndef DOXYGEN
30
31JUCE_BEGIN_IGNORE_WARNINGS_GCC_LIKE ("-Wignored-attributes")
32
33#ifdef _MSC_VER
34 #define DECLARE_AVX_SIMD_CONST(type, name) \
35 static __declspec (align (32)) const type name[32 / sizeof (type)]
36
37 #define DEFINE_AVX_SIMD_CONST(type, class_type, name) \
38 __declspec (align (32)) const type SIMDNativeOps<class_type>:: name[32 / sizeof (type)]
39
40#else
41 #define DECLARE_AVX_SIMD_CONST(type, name) \
42 static const type name[32 / sizeof (type)] __attribute__ ((aligned (32)))
43
44 #define DEFINE_AVX_SIMD_CONST(type, class_type, name) \
45 const type SIMDNativeOps<class_type>:: name[32 / sizeof (type)] __attribute__ ((aligned (32)))
46
47#endif
48
49template <typename type>
50struct SIMDNativeOps;
51
52//==============================================================================
56*/
57template <>
58struct SIMDNativeOps<float>
59{
60 using vSIMDType = __m256;
61
62 //==============================================================================
63 DECLARE_AVX_SIMD_CONST (int32_t, kAllBitsSet);
64 DECLARE_AVX_SIMD_CONST (int32_t, kEvenHighBit);
65 DECLARE_AVX_SIMD_CONST (float, kOne);
66
67 //==============================================================================
68 static forcedinline __m256 JUCE_VECTOR_CALLTYPE vconst (const float* a) noexcept { return load (a); }
69 static forcedinline __m256 JUCE_VECTOR_CALLTYPE vconst (const int32_t* a) noexcept { return _mm256_castsi256_ps (_mm256_load_si256 (reinterpret_cast <const __m256i*> (a))); }
70 static forcedinline __m256 JUCE_VECTOR_CALLTYPE expand (float s) noexcept { return _mm256_broadcast_ss (&s); }
71 static forcedinline __m256 JUCE_VECTOR_CALLTYPE load (const float* a) noexcept { return _mm256_load_ps (a); }
72 static forcedinline void JUCE_VECTOR_CALLTYPE store (__m256 value, float* dest) noexcept { _mm256_store_ps (dest, value); }
73 static forcedinline __m256 JUCE_VECTOR_CALLTYPE add (__m256 a, __m256 b) noexcept { return _mm256_add_ps (a, b); }
74 static forcedinline __m256 JUCE_VECTOR_CALLTYPE sub (__m256 a, __m256 b) noexcept { return _mm256_sub_ps (a, b); }
75 static forcedinline __m256 JUCE_VECTOR_CALLTYPE mul (__m256 a, __m256 b) noexcept { return _mm256_mul_ps (a, b); }
76 static forcedinline __m256 JUCE_VECTOR_CALLTYPE bit_and (__m256 a, __m256 b) noexcept { return _mm256_and_ps (a, b); }
77 static forcedinline __m256 JUCE_VECTOR_CALLTYPE bit_or (__m256 a, __m256 b) noexcept { return _mm256_or_ps (a, b); }
78 static forcedinline __m256 JUCE_VECTOR_CALLTYPE bit_xor (__m256 a, __m256 b) noexcept { return _mm256_xor_ps (a, b); }
79 static forcedinline __m256 JUCE_VECTOR_CALLTYPE bit_notand (__m256 a, __m256 b) noexcept { return _mm256_andnot_ps (a, b); }
80 static forcedinline __m256 JUCE_VECTOR_CALLTYPE bit_not (__m256 a) noexcept { return bit_notand (a, vconst (kAllBitsSet)); }
81 static forcedinline __m256 JUCE_VECTOR_CALLTYPE min (__m256 a, __m256 b) noexcept { return _mm256_min_ps (a, b); }
82 static forcedinline __m256 JUCE_VECTOR_CALLTYPE max (__m256 a, __m256 b) noexcept { return _mm256_max_ps (a, b); }
83 static forcedinline __m256 JUCE_VECTOR_CALLTYPE equal (__m256 a, __m256 b) noexcept { return _mm256_cmp_ps (a, b, _CMP_EQ_OQ); }
84 static forcedinline __m256 JUCE_VECTOR_CALLTYPE notEqual (__m256 a, __m256 b) noexcept { return _mm256_cmp_ps (a, b, _CMP_NEQ_OQ); }
85 static forcedinline __m256 JUCE_VECTOR_CALLTYPE greaterThan (__m256 a, __m256 b) noexcept { return _mm256_cmp_ps (a, b, _CMP_GT_OQ); }
86 static forcedinline __m256 JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m256 a, __m256 b) noexcept { return _mm256_cmp_ps (a, b, _CMP_GE_OQ); }
87 static forcedinline bool JUCE_VECTOR_CALLTYPE allEqual (__m256 a, __m256 b) noexcept { return (_mm256_movemask_ps (equal (a, b)) == 0xff); }
88 static forcedinline __m256 JUCE_VECTOR_CALLTYPE dupeven (__m256 a) noexcept { return _mm256_shuffle_ps (a, a, _MM_SHUFFLE (2, 2, 0, 0)); }
89 static forcedinline __m256 JUCE_VECTOR_CALLTYPE dupodd (__m256 a) noexcept { return _mm256_shuffle_ps (a, a, _MM_SHUFFLE (3, 3, 1, 1)); }
90 static forcedinline __m256 JUCE_VECTOR_CALLTYPE swapevenodd (__m256 a) noexcept { return _mm256_shuffle_ps (a, a, _MM_SHUFFLE (2, 3, 0, 1)); }
91 static forcedinline float JUCE_VECTOR_CALLTYPE get (__m256 v, size_t i) noexcept { return SIMDFallbackOps<float, __m256>::get (v, i); }
92 static forcedinline __m256 JUCE_VECTOR_CALLTYPE set (__m256 v, size_t i, float s) noexcept { return SIMDFallbackOps<float, __m256>::set (v, i, s); }
93 static forcedinline __m256 JUCE_VECTOR_CALLTYPE truncate (__m256 a) noexcept { return _mm256_cvtepi32_ps (_mm256_cvttps_epi32 (a)); }
94
95 static forcedinline __m256 JUCE_VECTOR_CALLTYPE multiplyAdd (__m256 a, __m256 b, __m256 c) noexcept
96 {
97 #if __FMA__
98 return _mm256_fmadd_ps (b, c, a);
99 #else
100 return add (a, mul (b, c));
101 #endif
102 }
103
104 static forcedinline __m256 JUCE_VECTOR_CALLTYPE oddevensum (__m256 a) noexcept
105 {
106 a = _mm256_add_ps (_mm256_shuffle_ps (a, a, _MM_SHUFFLE (1, 0, 3, 2)), a);
107 return add (_mm256_permute2f128_ps (a, a, 1), a);
108 }
109
110 //==============================================================================
111 static forcedinline __m256 JUCE_VECTOR_CALLTYPE cmplxmul (__m256 a, __m256 b) noexcept
112 {
113 __m256 rr_ir = mul (a, dupeven (b));
114 __m256 ii_ri = mul (swapevenodd (a), dupodd (b));
115 return add (rr_ir, bit_xor (ii_ri, vconst (kEvenHighBit)));
116 }
117
118 static forcedinline float JUCE_VECTOR_CALLTYPE sum (__m256 a) noexcept
119 {
120 __m256 retval = _mm256_dp_ps (a, vconst (kOne), 0xff);
121 __m256 tmp = _mm256_permute2f128_ps (retval, retval, 1);
122 retval = _mm256_add_ps (retval, tmp);
123
124 #if JUCE_GCC
125 return retval[0];
126 #else
127 return _mm256_cvtss_f32 (retval);
128 #endif
129 }
130};
131
132//==============================================================================
137template <>
138struct SIMDNativeOps<double>
139{
140 using vSIMDType = __m256d;
141
142 //==============================================================================
143 DECLARE_AVX_SIMD_CONST (int64_t, kAllBitsSet);
144 DECLARE_AVX_SIMD_CONST (int64_t, kEvenHighBit);
145 DECLARE_AVX_SIMD_CONST (double, kOne);
146
147 //==============================================================================
148 static forcedinline __m256d JUCE_VECTOR_CALLTYPE vconst (const double* a) noexcept { return load (a); }
149 static forcedinline __m256d JUCE_VECTOR_CALLTYPE vconst (const int64_t* a) noexcept { return _mm256_castsi256_pd (_mm256_load_si256 (reinterpret_cast <const __m256i*> (a))); }
150 static forcedinline __m256d JUCE_VECTOR_CALLTYPE expand (double s) noexcept { return _mm256_broadcast_sd (&s); }
151 static forcedinline __m256d JUCE_VECTOR_CALLTYPE load (const double* a) noexcept { return _mm256_load_pd (a); }
152 static forcedinline void JUCE_VECTOR_CALLTYPE store (__m256d value, double* dest) noexcept { _mm256_store_pd (dest, value); }
153 static forcedinline __m256d JUCE_VECTOR_CALLTYPE add (__m256d a, __m256d b) noexcept { return _mm256_add_pd (a, b); }
154 static forcedinline __m256d JUCE_VECTOR_CALLTYPE sub (__m256d a, __m256d b) noexcept { return _mm256_sub_pd (a, b); }
155 static forcedinline __m256d JUCE_VECTOR_CALLTYPE mul (__m256d a, __m256d b) noexcept { return _mm256_mul_pd (a, b); }
156 static forcedinline __m256d JUCE_VECTOR_CALLTYPE bit_and (__m256d a, __m256d b) noexcept { return _mm256_and_pd (a, b); }
157 static forcedinline __m256d JUCE_VECTOR_CALLTYPE bit_or (__m256d a, __m256d b) noexcept { return _mm256_or_pd (a, b); }
158 static forcedinline __m256d JUCE_VECTOR_CALLTYPE bit_xor (__m256d a, __m256d b) noexcept { return _mm256_xor_pd (a, b); }
159 static forcedinline __m256d JUCE_VECTOR_CALLTYPE bit_notand (__m256d a, __m256d b) noexcept { return _mm256_andnot_pd (a, b); }
160 static forcedinline __m256d JUCE_VECTOR_CALLTYPE bit_not (__m256d a) noexcept { return bit_notand (a, vconst (kAllBitsSet)); }
161 static forcedinline __m256d JUCE_VECTOR_CALLTYPE min (__m256d a, __m256d b) noexcept { return _mm256_min_pd (a, b); }
162 static forcedinline __m256d JUCE_VECTOR_CALLTYPE max (__m256d a, __m256d b) noexcept { return _mm256_max_pd (a, b); }
163 static forcedinline __m256d JUCE_VECTOR_CALLTYPE equal (__m256d a, __m256d b) noexcept { return _mm256_cmp_pd (a, b, _CMP_EQ_OQ); }
164 static forcedinline __m256d JUCE_VECTOR_CALLTYPE notEqual (__m256d a, __m256d b) noexcept { return _mm256_cmp_pd (a, b, _CMP_NEQ_OQ); }
165 static forcedinline __m256d JUCE_VECTOR_CALLTYPE greaterThan (__m256d a, __m256d b) noexcept { return _mm256_cmp_pd (a, b, _CMP_GT_OQ); }
166 static forcedinline __m256d JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m256d a, __m256d b) noexcept { return _mm256_cmp_pd (a, b, _CMP_GE_OQ); }
167 static forcedinline bool JUCE_VECTOR_CALLTYPE allEqual (__m256d a, __m256d b) noexcept { return (_mm256_movemask_pd (equal (a, b)) == 0xf); }
168 static forcedinline __m256d JUCE_VECTOR_CALLTYPE multiplyAdd (__m256d a, __m256d b, __m256d c) noexcept { return _mm256_add_pd (a, _mm256_mul_pd (b, c)); }
169 static forcedinline __m256d JUCE_VECTOR_CALLTYPE dupeven (__m256d a) noexcept { return _mm256_shuffle_pd (a, a, 0); }
170 static forcedinline __m256d JUCE_VECTOR_CALLTYPE dupodd (__m256d a) noexcept { return _mm256_shuffle_pd (a, a, (1 << 0) | (1 << 1) | (1 << 2) | (1 << 3)); }
171 static forcedinline __m256d JUCE_VECTOR_CALLTYPE swapevenodd (__m256d a) noexcept { return _mm256_shuffle_pd (a, a, (1 << 0) | (0 << 1) | (1 << 2) | (0 << 3)); }
172 static forcedinline __m256d JUCE_VECTOR_CALLTYPE oddevensum (__m256d a) noexcept { return _mm256_add_pd (_mm256_permute2f128_pd (a, a, 1), a); }
173 static forcedinline double JUCE_VECTOR_CALLTYPE get (__m256d v, size_t i) noexcept { return SIMDFallbackOps<double, __m256d>::get (v, i); }
174 static forcedinline __m256d JUCE_VECTOR_CALLTYPE set (__m256d v, size_t i, double s) noexcept { return SIMDFallbackOps<double, __m256d>::set (v, i, s); }
175 static forcedinline __m256d JUCE_VECTOR_CALLTYPE truncate (__m256d a) noexcept { return _mm256_cvtepi32_pd (_mm256_cvttpd_epi32 (a)); }
176
177 //==============================================================================
178 static forcedinline __m256d JUCE_VECTOR_CALLTYPE cmplxmul (__m256d a, __m256d b) noexcept
179 {
180 __m256d rr_ir = mul (a, dupeven (b));
181 __m256d ii_ri = mul (swapevenodd (a), dupodd (b));
182 return add (rr_ir, bit_xor (ii_ri, vconst (kEvenHighBit)));
183 }
184
185 static forcedinline double JUCE_VECTOR_CALLTYPE sum (__m256d a) noexcept
186 {
187 __m256d retval = _mm256_hadd_pd (a, a);
188 __m256d tmp = _mm256_permute2f128_pd (retval, retval, 1);
189 retval = _mm256_add_pd (retval, tmp);
190
191 #if JUCE_GCC
192 return retval[0];
193 #else
194 return _mm256_cvtsd_f64 (retval);
195 #endif
196 }
197};
198
199//==============================================================================
204template <>
205struct SIMDNativeOps<int8_t>
206{
207 using vSIMDType = __m256i;
208
209 //==============================================================================
210 DECLARE_AVX_SIMD_CONST (int8_t, kAllBitsSet);
211
212 static forcedinline __m256i JUCE_VECTOR_CALLTYPE expand (int8_t s) noexcept { return _mm256_set1_epi8 (s); }
213 static forcedinline __m256i JUCE_VECTOR_CALLTYPE load (const int8_t* p) noexcept { return _mm256_load_si256 (reinterpret_cast<const __m256i*> (p)); }
214 static forcedinline void JUCE_VECTOR_CALLTYPE store (__m256i value, int8_t* dest) noexcept { _mm256_store_si256 (reinterpret_cast<__m256i*> (dest), value); }
215 static forcedinline __m256i JUCE_VECTOR_CALLTYPE add (__m256i a, __m256i b) noexcept { return _mm256_add_epi8 (a, b); }
216 static forcedinline __m256i JUCE_VECTOR_CALLTYPE sub (__m256i a, __m256i b) noexcept { return _mm256_sub_epi8 (a, b); }
217 static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_and (__m256i a, __m256i b) noexcept { return _mm256_and_si256 (a, b); }
218 static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_or (__m256i a, __m256i b) noexcept { return _mm256_or_si256 (a, b); }
219 static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_xor (__m256i a, __m256i b) noexcept { return _mm256_xor_si256 (a, b); }
220 static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_andnot (__m256i a, __m256i b) noexcept { return _mm256_andnot_si256 (a, b); }
221 static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_not (__m256i a) noexcept { return _mm256_andnot_si256 (a, load (kAllBitsSet)); }
222 static forcedinline __m256i JUCE_VECTOR_CALLTYPE min (__m256i a, __m256i b) noexcept { return _mm256_min_epi8 (a, b); }
223 static forcedinline __m256i JUCE_VECTOR_CALLTYPE max (__m256i a, __m256i b) noexcept { return _mm256_max_epi8 (a, b); }
224 static forcedinline __m256i JUCE_VECTOR_CALLTYPE equal (__m256i a, __m256i b) noexcept { return _mm256_cmpeq_epi8 (a, b); }
225 static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThan (__m256i a, __m256i b) noexcept { return _mm256_cmpgt_epi8 (a, b); }
226 static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m256i a, __m256i b) noexcept { return bit_or (greaterThan (a, b), equal (a,b)); }
227 static forcedinline bool JUCE_VECTOR_CALLTYPE allEqual (__m256i a, __m256i b) noexcept { return _mm256_movemask_epi8 (equal (a, b)) == -1; }
228 static forcedinline __m256i JUCE_VECTOR_CALLTYPE multiplyAdd (__m256i a, __m256i b, __m256i c) noexcept { return add (a, mul (b, c)); }
229 static forcedinline __m256i JUCE_VECTOR_CALLTYPE notEqual (__m256i a, __m256i b) noexcept { return bit_not (equal (a, b)); }
230 static forcedinline int8_t JUCE_VECTOR_CALLTYPE get (__m256i v, size_t i) noexcept { return SIMDFallbackOps<int8_t, __m256i>::get (v, i); }
231 static forcedinline __m256i JUCE_VECTOR_CALLTYPE set (__m256i v, size_t i, int8_t s) noexcept { return SIMDFallbackOps<int8_t, __m256i>::set (v, i, s); }
232 static forcedinline __m256i JUCE_VECTOR_CALLTYPE truncate (__m256i a) noexcept { return a; }
233
234 //==============================================================================
235 static forcedinline int8_t JUCE_VECTOR_CALLTYPE sum (__m256i a) noexcept
236 {
237 __m256i lo = _mm256_unpacklo_epi8 (a, _mm256_setzero_si256());
238 __m256i hi = _mm256_unpackhi_epi8 (a, _mm256_setzero_si256());
239
240 for (int i = 0; i < 3; ++i)
241 {
242 lo = _mm256_hadd_epi16 (lo, lo);
243 hi = _mm256_hadd_epi16 (hi, hi);
244 }
245
246 #if JUCE_GCC
247 return (int8_t) ((lo[0] & 0xff) +
248 (hi[0] & 0xff) +
249 (lo[2] & 0xff) +
250 (hi[2] & 0xff));
251 #else
252 constexpr int mask = (2 << 0) | (3 << 2) | (0 << 4) | (1 << 6);
253
254 return (int8_t) ((_mm256_cvtsi256_si32 (lo) & 0xff) +
255 (_mm256_cvtsi256_si32 (hi) & 0xff) +
256 (_mm256_cvtsi256_si32 (_mm256_permute4x64_epi64 (lo, mask)) & 0xff) +
257 (_mm256_cvtsi256_si32 (_mm256_permute4x64_epi64 (hi, mask)) & 0xff));
258 #endif
259 }
260
261 static forcedinline __m256i JUCE_VECTOR_CALLTYPE mul (__m256i a, __m256i b)
262 {
263 // unpack and multiply
264 __m256i even = _mm256_mullo_epi16 (a, b);
265 __m256i odd = _mm256_mullo_epi16 (_mm256_srli_epi16 (a, 8), _mm256_srli_epi16 (b, 8));
266
267 return _mm256_or_si256 (_mm256_slli_epi16 (odd, 8),
268 _mm256_srli_epi16 (_mm256_slli_epi16 (even, 8), 8));
269 }
270};
271
272//==============================================================================
277template <>
278struct SIMDNativeOps<uint8_t>
279{
280 //==============================================================================
281 using vSIMDType = __m256i;
282
283 //==============================================================================
284 DECLARE_AVX_SIMD_CONST (uint8_t, kHighBit);
285 DECLARE_AVX_SIMD_CONST (uint8_t, kAllBitsSet);
286
287 static forcedinline __m256i JUCE_VECTOR_CALLTYPE ssign (__m256i a) noexcept { return _mm256_xor_si256 (a, load (kHighBit)); }
288 static forcedinline __m256i JUCE_VECTOR_CALLTYPE expand (uint8_t s) noexcept { return _mm256_set1_epi8 ((int8_t) s); }
289 static forcedinline __m256i JUCE_VECTOR_CALLTYPE load (const uint8_t* p) noexcept { return _mm256_load_si256 (reinterpret_cast<const __m256i*> (p)); }
290 static forcedinline void JUCE_VECTOR_CALLTYPE store (__m256i value, uint8_t* dest) noexcept { _mm256_store_si256 (reinterpret_cast<__m256i*> (dest), value); }
291 static forcedinline __m256i JUCE_VECTOR_CALLTYPE add (__m256i a, __m256i b) noexcept { return _mm256_add_epi8 (a, b); }
292 static forcedinline __m256i JUCE_VECTOR_CALLTYPE sub (__m256i a, __m256i b) noexcept { return _mm256_sub_epi8 (a, b); }
293 static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_and (__m256i a, __m256i b) noexcept { return _mm256_and_si256 (a, b); }
294 static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_or (__m256i a, __m256i b) noexcept { return _mm256_or_si256 (a, b); }
295 static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_xor (__m256i a, __m256i b) noexcept { return _mm256_xor_si256 (a, b); }
296 static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_andnot (__m256i a, __m256i b) noexcept { return _mm256_andnot_si256 (a, b); }
297 static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_not (__m256i a) noexcept { return _mm256_andnot_si256 (a, load (kAllBitsSet)); }
298 static forcedinline __m256i JUCE_VECTOR_CALLTYPE min (__m256i a, __m256i b) noexcept { return _mm256_min_epu8 (a, b); }
299 static forcedinline __m256i JUCE_VECTOR_CALLTYPE max (__m256i a, __m256i b) noexcept { return _mm256_max_epu8 (a, b); }
300 static forcedinline __m256i JUCE_VECTOR_CALLTYPE equal (__m256i a, __m256i b) noexcept { return _mm256_cmpeq_epi8 (a, b); }
301 static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThan (__m256i a, __m256i b) noexcept { return _mm256_cmpgt_epi8 (ssign (a), ssign (b)); }
302 static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m256i a, __m256i b) noexcept { return bit_or (greaterThan (a, b), equal (a,b)); }
303 static forcedinline bool JUCE_VECTOR_CALLTYPE allEqual (__m256i a, __m256i b) noexcept { return (_mm256_movemask_epi8 (equal (a, b)) == -1); }
304 static forcedinline __m256i JUCE_VECTOR_CALLTYPE multiplyAdd (__m256i a, __m256i b, __m256i c) noexcept { return add (a, mul (b, c)); }
305 static forcedinline __m256i JUCE_VECTOR_CALLTYPE notEqual (__m256i a, __m256i b) noexcept { return bit_not (equal (a, b)); }
306 static forcedinline uint8_t JUCE_VECTOR_CALLTYPE get (__m256i v, size_t i) noexcept { return SIMDFallbackOps<uint8_t, __m256i>::get (v, i); }
307 static forcedinline __m256i JUCE_VECTOR_CALLTYPE set (__m256i v, size_t i, uint8_t s) noexcept { return SIMDFallbackOps<uint8_t, __m256i>::set (v, i, s); }
308 static forcedinline __m256i JUCE_VECTOR_CALLTYPE truncate (__m256i a) noexcept { return a; }
309
310 //==============================================================================
311 static forcedinline uint8_t JUCE_VECTOR_CALLTYPE sum (__m256i a) noexcept
312 {
313 __m256i lo = _mm256_unpacklo_epi8 (a, _mm256_setzero_si256());
314 __m256i hi = _mm256_unpackhi_epi8 (a, _mm256_setzero_si256());
315
316 for (int i = 0; i < 3; ++i)
317 {
318 lo = _mm256_hadd_epi16 (lo, lo);
319 hi = _mm256_hadd_epi16 (hi, hi);
320 }
321
322 #if JUCE_GCC
323 return (uint8_t) ((static_cast<uint32_t> (lo[0]) & 0xffu) +
324 (static_cast<uint32_t> (hi[0]) & 0xffu) +
325 (static_cast<uint32_t> (lo[2]) & 0xffu) +
326 (static_cast<uint32_t> (hi[2]) & 0xffu));
327 #else
328 constexpr int mask = (2 << 0) | (3 << 2) | (0 << 4) | (1 << 6);
329
330 return (uint8_t) ((static_cast<uint32_t> (_mm256_cvtsi256_si32 (lo)) & 0xffu) +
331 (static_cast<uint32_t> (_mm256_cvtsi256_si32 (hi)) & 0xffu) +
332 (static_cast<uint32_t> (_mm256_cvtsi256_si32 (_mm256_permute4x64_epi64 (lo, mask))) & 0xffu) +
333 (static_cast<uint32_t> (_mm256_cvtsi256_si32 (_mm256_permute4x64_epi64 (hi, mask))) & 0xffu));
334 #endif
335 }
336
337 static forcedinline __m256i JUCE_VECTOR_CALLTYPE mul (__m256i a, __m256i b)
338 {
339 // unpack and multiply
340 __m256i even = _mm256_mullo_epi16 (a, b);
341 __m256i odd = _mm256_mullo_epi16 (_mm256_srli_epi16 (a, 8), _mm256_srli_epi16 (b, 8));
342
343 return _mm256_or_si256 (_mm256_slli_epi16 (odd, 8),
344 _mm256_srli_epi16 (_mm256_slli_epi16 (even, 8), 8));
345 }
346};
347
348//==============================================================================
353template <>
354struct SIMDNativeOps<int16_t>
355{
356 //==============================================================================
357 using vSIMDType = __m256i;
358
359 //==============================================================================
360 DECLARE_AVX_SIMD_CONST (int16_t, kAllBitsSet);
361
362 //==============================================================================
363 static forcedinline __m256i JUCE_VECTOR_CALLTYPE expand (int16_t s) noexcept { return _mm256_set1_epi16 (s); }
364 static forcedinline __m256i JUCE_VECTOR_CALLTYPE load (const int16_t* p) noexcept { return _mm256_load_si256 (reinterpret_cast<const __m256i*> (p)); }
365 static forcedinline void JUCE_VECTOR_CALLTYPE store (__m256i value, int16_t* dest) noexcept { _mm256_store_si256 (reinterpret_cast<__m256i*> (dest), value); }
366 static forcedinline __m256i JUCE_VECTOR_CALLTYPE add (__m256i a, __m256i b) noexcept { return _mm256_add_epi16 (a, b); }
367 static forcedinline __m256i JUCE_VECTOR_CALLTYPE sub (__m256i a, __m256i b) noexcept { return _mm256_sub_epi16 (a, b); }
368 static forcedinline __m256i JUCE_VECTOR_CALLTYPE mul (__m256i a, __m256i b) noexcept { return _mm256_mullo_epi16 (a, b); }
369 static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_and (__m256i a, __m256i b) noexcept { return _mm256_and_si256 (a, b); }
370 static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_or (__m256i a, __m256i b) noexcept { return _mm256_or_si256 (a, b); }
371 static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_xor (__m256i a, __m256i b) noexcept { return _mm256_xor_si256 (a, b); }
372 static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_andnot (__m256i a, __m256i b) noexcept { return _mm256_andnot_si256 (a, b); }
373 static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_not (__m256i a) noexcept { return _mm256_andnot_si256 (a, load (kAllBitsSet)); }
374 static forcedinline __m256i JUCE_VECTOR_CALLTYPE min (__m256i a, __m256i b) noexcept { return _mm256_min_epi16 (a, b); }
375 static forcedinline __m256i JUCE_VECTOR_CALLTYPE max (__m256i a, __m256i b) noexcept { return _mm256_max_epi16 (a, b); }
376 static forcedinline __m256i JUCE_VECTOR_CALLTYPE equal (__m256i a, __m256i b) noexcept { return _mm256_cmpeq_epi16 (a, b); }
377 static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThan (__m256i a, __m256i b) noexcept { return _mm256_cmpgt_epi16 (a, b); }
378 static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m256i a, __m256i b) noexcept { return bit_or (greaterThan (a, b), equal (a,b)); }
379 static forcedinline __m256i JUCE_VECTOR_CALLTYPE multiplyAdd (__m256i a, __m256i b, __m256i c) noexcept { return add (a, mul (b, c)); }
380 static forcedinline __m256i JUCE_VECTOR_CALLTYPE notEqual (__m256i a, __m256i b) noexcept { return bit_not (equal (a, b)); }
381 static forcedinline bool JUCE_VECTOR_CALLTYPE allEqual (__m256i a, __m256i b) noexcept { return (_mm256_movemask_epi8 (equal (a, b)) == -1); }
382 static forcedinline int16_t JUCE_VECTOR_CALLTYPE get (__m256i v, size_t i) noexcept { return SIMDFallbackOps<int16_t, __m256i>::get (v, i); }
383 static forcedinline __m256i JUCE_VECTOR_CALLTYPE set (__m256i v, size_t i, int16_t s) noexcept { return SIMDFallbackOps<int16_t, __m256i>::set (v, i, s); }
384 static forcedinline __m256i JUCE_VECTOR_CALLTYPE truncate (__m256i a) noexcept { return a; }
385
386 //==============================================================================
387 static forcedinline int16_t JUCE_VECTOR_CALLTYPE sum (__m256i a) noexcept
388 {
389 __m256i tmp = _mm256_hadd_epi16 (a, a);
390 tmp = _mm256_hadd_epi16 (tmp, tmp);
391 tmp = _mm256_hadd_epi16 (tmp, tmp);
392
393 #if JUCE_GCC
394 return (int16_t) ((tmp[0] & 0xffff) + (tmp[2] & 0xffff));
395 #else
396 constexpr int mask = (2 << 0) | (3 << 2) | (0 << 4) | (1 << 6);
397
398 return (int16_t) ((_mm256_cvtsi256_si32 (tmp) & 0xffff) +
399 (_mm256_cvtsi256_si32 (_mm256_permute4x64_epi64 (tmp, mask)) & 0xffff));
400 #endif
401 }
402};
403
404//==============================================================================
409template <>
410struct SIMDNativeOps<uint16_t>
411{
412 //==============================================================================
413 using vSIMDType = __m256i;
414
415 //==============================================================================
416 DECLARE_AVX_SIMD_CONST (uint16_t, kHighBit);
417 DECLARE_AVX_SIMD_CONST (uint16_t, kAllBitsSet);
418
419 //==============================================================================
420 static forcedinline __m256i JUCE_VECTOR_CALLTYPE ssign (__m256i a) noexcept { return _mm256_xor_si256 (a, load (kHighBit)); }
421 static forcedinline __m256i JUCE_VECTOR_CALLTYPE expand (uint16_t s) noexcept { return _mm256_set1_epi16 ((int16_t) s); }
422 static forcedinline __m256i JUCE_VECTOR_CALLTYPE load (const uint16_t* p) noexcept { return _mm256_load_si256 (reinterpret_cast<const __m256i*> (p)); }
423 static forcedinline void JUCE_VECTOR_CALLTYPE store (__m256i value, uint16_t* dest) noexcept { _mm256_store_si256 (reinterpret_cast<__m256i*> (dest), value); }
424 static forcedinline __m256i JUCE_VECTOR_CALLTYPE add (__m256i a, __m256i b) noexcept { return _mm256_add_epi16 (a, b); }
425 static forcedinline __m256i JUCE_VECTOR_CALLTYPE sub (__m256i a, __m256i b) noexcept { return _mm256_sub_epi16 (a, b); }
426 static forcedinline __m256i JUCE_VECTOR_CALLTYPE mul (__m256i a, __m256i b) noexcept { return _mm256_mullo_epi16 (a, b); }
427 static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_and (__m256i a, __m256i b) noexcept { return _mm256_and_si256 (a, b); }
428 static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_or (__m256i a, __m256i b) noexcept { return _mm256_or_si256 (a, b); }
429 static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_xor (__m256i a, __m256i b) noexcept { return _mm256_xor_si256 (a, b); }
430 static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_andnot (__m256i a, __m256i b) noexcept { return _mm256_andnot_si256 (a, b); }
431 static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_not (__m256i a) noexcept { return _mm256_andnot_si256 (a, load (kAllBitsSet)); }
432 static forcedinline __m256i JUCE_VECTOR_CALLTYPE min (__m256i a, __m256i b) noexcept { return _mm256_min_epu16 (a, b); }
433 static forcedinline __m256i JUCE_VECTOR_CALLTYPE max (__m256i a, __m256i b) noexcept { return _mm256_max_epu16 (a, b); }
434 static forcedinline __m256i JUCE_VECTOR_CALLTYPE equal (__m256i a, __m256i b) noexcept { return _mm256_cmpeq_epi16 (a, b); }
435 static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThan (__m256i a, __m256i b) noexcept { return _mm256_cmpgt_epi16 (ssign (a), ssign (b)); }
436 static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m256i a, __m256i b) noexcept { return bit_or (greaterThan (a, b), equal (a,b)); }
437 static forcedinline __m256i JUCE_VECTOR_CALLTYPE multiplyAdd (__m256i a, __m256i b, __m256i c) noexcept { return add (a, mul (b, c)); }
438 static forcedinline __m256i JUCE_VECTOR_CALLTYPE notEqual (__m256i a, __m256i b) noexcept { return bit_not (equal (a, b)); }
439 static forcedinline bool JUCE_VECTOR_CALLTYPE allEqual (__m256i a, __m256i b) noexcept { return (_mm256_movemask_epi8 (equal (a, b)) == -1); }
440 static forcedinline uint16_t JUCE_VECTOR_CALLTYPE get (__m256i v, size_t i) noexcept { return SIMDFallbackOps<uint16_t, __m256i>::get (v, i); }
441 static forcedinline __m256i JUCE_VECTOR_CALLTYPE set (__m256i v, size_t i, uint16_t s) noexcept { return SIMDFallbackOps<uint16_t, __m256i>::set (v, i, s); }
442 static forcedinline __m256i JUCE_VECTOR_CALLTYPE truncate (__m256i a) noexcept { return a; }
443
444 //==============================================================================
445 static forcedinline uint16_t JUCE_VECTOR_CALLTYPE sum (__m256i a) noexcept
446 {
447 __m256i tmp = _mm256_hadd_epi16 (a, a);
448 tmp = _mm256_hadd_epi16 (tmp, tmp);
449 tmp = _mm256_hadd_epi16 (tmp, tmp);
450
451 #if JUCE_GCC
452 return (uint16_t) ((static_cast<uint32_t> (tmp[0]) & 0xffffu) +
453 (static_cast<uint32_t> (tmp[2]) & 0xffffu));
454 #else
455 constexpr int mask = (2 << 0) | (3 << 2) | (0 << 4) | (1 << 6);
456
457 return (uint16_t) ((static_cast<uint32_t> (_mm256_cvtsi256_si32 (tmp)) & 0xffffu) +
458 (static_cast<uint32_t> (_mm256_cvtsi256_si32 (_mm256_permute4x64_epi64 (tmp, mask))) & 0xffffu));
459 #endif
460 }
461};
462
463//==============================================================================
468template <>
469struct SIMDNativeOps<int32_t>
470{
471 //==============================================================================
472 using vSIMDType = __m256i;
473
474 //==============================================================================
475 DECLARE_AVX_SIMD_CONST (int32_t, kAllBitsSet);
476
477 //==============================================================================
478 static forcedinline __m256i JUCE_VECTOR_CALLTYPE expand (int32_t s) noexcept { return _mm256_set1_epi32 (s); }
479 static forcedinline __m256i JUCE_VECTOR_CALLTYPE load (const int32_t* p) noexcept { return _mm256_load_si256 (reinterpret_cast<const __m256i*> (p)); }
480 static forcedinline void JUCE_VECTOR_CALLTYPE store (__m256i value, int32_t* dest) noexcept { _mm256_store_si256 (reinterpret_cast<__m256i*> (dest), value); }
481 static forcedinline __m256i JUCE_VECTOR_CALLTYPE add (__m256i a, __m256i b) noexcept { return _mm256_add_epi32 (a, b); }
482 static forcedinline __m256i JUCE_VECTOR_CALLTYPE sub (__m256i a, __m256i b) noexcept { return _mm256_sub_epi32 (a, b); }
483 static forcedinline __m256i JUCE_VECTOR_CALLTYPE mul (__m256i a, __m256i b) noexcept { return _mm256_mullo_epi32 (a, b); }
484 static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_and (__m256i a, __m256i b) noexcept { return _mm256_and_si256 (a, b); }
485 static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_or (__m256i a, __m256i b) noexcept { return _mm256_or_si256 (a, b); }
486 static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_xor (__m256i a, __m256i b) noexcept { return _mm256_xor_si256 (a, b); }
487 static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_andnot (__m256i a, __m256i b) noexcept { return _mm256_andnot_si256 (a, b); }
488 static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_not (__m256i a) noexcept { return _mm256_andnot_si256 (a, load (kAllBitsSet)); }
489 static forcedinline __m256i JUCE_VECTOR_CALLTYPE min (__m256i a, __m256i b) noexcept { return _mm256_min_epi32 (a, b); }
490 static forcedinline __m256i JUCE_VECTOR_CALLTYPE max (__m256i a, __m256i b) noexcept { return _mm256_max_epi32 (a, b); }
491 static forcedinline __m256i JUCE_VECTOR_CALLTYPE equal (__m256i a, __m256i b) noexcept { return _mm256_cmpeq_epi32 (a, b); }
492 static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThan (__m256i a, __m256i b) noexcept { return _mm256_cmpgt_epi32 (a, b); }
493 static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m256i a, __m256i b) noexcept { return bit_or (greaterThan (a, b), equal (a,b)); }
494 static forcedinline __m256i JUCE_VECTOR_CALLTYPE multiplyAdd (__m256i a, __m256i b, __m256i c) noexcept { return add (a, mul (b, c)); }
495 static forcedinline __m256i JUCE_VECTOR_CALLTYPE notEqual (__m256i a, __m256i b) noexcept { return bit_not (equal (a, b)); }
496 static forcedinline bool JUCE_VECTOR_CALLTYPE allEqual (__m256i a, __m256i b) noexcept { return (_mm256_movemask_epi8 (equal (a, b)) == -1); }
497 static forcedinline int32_t JUCE_VECTOR_CALLTYPE get (__m256i v, size_t i) noexcept { return SIMDFallbackOps<int32_t, __m256i>::get (v, i); }
498 static forcedinline __m256i JUCE_VECTOR_CALLTYPE set (__m256i v, size_t i, int32_t s) noexcept { return SIMDFallbackOps<int32_t, __m256i>::set (v, i, s); }
499 static forcedinline __m256i JUCE_VECTOR_CALLTYPE truncate (__m256i a) noexcept { return a; }
500
501 //==============================================================================
502 static forcedinline int32_t JUCE_VECTOR_CALLTYPE sum (__m256i a) noexcept
503 {
504 __m256i tmp = _mm256_hadd_epi32 (a, a);
505 tmp = _mm256_hadd_epi32 (tmp, tmp);
506
507 #if JUCE_GCC
508 return (int32_t) (tmp[0] + tmp[2]);
509 #else
510 constexpr int mask = (2 << 0) | (3 << 2) | (0 << 4) | (1 << 6);
511
512 return _mm256_cvtsi256_si32 (tmp) + _mm256_cvtsi256_si32 (_mm256_permute4x64_epi64 (tmp, mask));
513 #endif
514 }
515};
516
517//==============================================================================
522template <>
523struct SIMDNativeOps<uint32_t>
524{
525 //==============================================================================
526 using vSIMDType = __m256i;
527
528 //==============================================================================
529 DECLARE_AVX_SIMD_CONST (uint32_t, kAllBitsSet);
530 DECLARE_AVX_SIMD_CONST (uint32_t, kHighBit);
531
532 //==============================================================================
533 static forcedinline __m256i JUCE_VECTOR_CALLTYPE ssign (__m256i a) noexcept { return _mm256_xor_si256 (a, load (kHighBit)); }
534 static forcedinline __m256i JUCE_VECTOR_CALLTYPE expand (uint32_t s) noexcept { return _mm256_set1_epi32 ((int32_t) s); }
535 static forcedinline __m256i JUCE_VECTOR_CALLTYPE load (const uint32_t* p) noexcept { return _mm256_load_si256 (reinterpret_cast<const __m256i*> (p)); }
536 static forcedinline void JUCE_VECTOR_CALLTYPE store (__m256i value, uint32_t* dest) noexcept { _mm256_store_si256 (reinterpret_cast<__m256i*> (dest), value); }
537 static forcedinline __m256i JUCE_VECTOR_CALLTYPE add (__m256i a, __m256i b) noexcept { return _mm256_add_epi32 (a, b); }
538 static forcedinline __m256i JUCE_VECTOR_CALLTYPE sub (__m256i a, __m256i b) noexcept { return _mm256_sub_epi32 (a, b); }
539 static forcedinline __m256i JUCE_VECTOR_CALLTYPE mul (__m256i a, __m256i b) noexcept { return _mm256_mullo_epi32 (a, b); }
540 static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_and (__m256i a, __m256i b) noexcept { return _mm256_and_si256 (a, b); }
541 static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_or (__m256i a, __m256i b) noexcept { return _mm256_or_si256 (a, b); }
542 static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_xor (__m256i a, __m256i b) noexcept { return _mm256_xor_si256 (a, b); }
543 static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_andnot (__m256i a, __m256i b) noexcept { return _mm256_andnot_si256 (a, b); }
544 static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_not (__m256i a) noexcept { return _mm256_andnot_si256 (a, load (kAllBitsSet)); }
545 static forcedinline __m256i JUCE_VECTOR_CALLTYPE min (__m256i a, __m256i b) noexcept { return _mm256_min_epu32 (a, b); }
546 static forcedinline __m256i JUCE_VECTOR_CALLTYPE max (__m256i a, __m256i b) noexcept { return _mm256_max_epu32 (a, b); }
547 static forcedinline __m256i JUCE_VECTOR_CALLTYPE equal (__m256i a, __m256i b) noexcept { return _mm256_cmpeq_epi32 (a, b); }
548 static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThan (__m256i a, __m256i b) noexcept { return _mm256_cmpgt_epi32 (ssign (a), ssign (b)); }
549 static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m256i a, __m256i b) noexcept { return bit_or (greaterThan (a, b), equal (a,b)); }
550 static forcedinline __m256i JUCE_VECTOR_CALLTYPE multiplyAdd (__m256i a, __m256i b, __m256i c) noexcept { return add (a, mul (b, c)); }
551 static forcedinline __m256i JUCE_VECTOR_CALLTYPE notEqual (__m256i a, __m256i b) noexcept { return bit_not (equal (a, b)); }
552 static forcedinline bool JUCE_VECTOR_CALLTYPE allEqual (__m256i a, __m256i b) noexcept { return (_mm256_movemask_epi8 (equal (a, b)) == -1); }
553 static forcedinline uint32_t JUCE_VECTOR_CALLTYPE get (__m256i v, size_t i) noexcept { return SIMDFallbackOps<uint32_t, __m256i>::get (v, i); }
554 static forcedinline __m256i JUCE_VECTOR_CALLTYPE set (__m256i v, size_t i, uint32_t s) noexcept { return SIMDFallbackOps<uint32_t, __m256i>::set (v, i, s); }
555 static forcedinline __m256i JUCE_VECTOR_CALLTYPE truncate (__m256i a) noexcept { return a; }
556
557 //==============================================================================
558 static forcedinline uint32_t JUCE_VECTOR_CALLTYPE sum (__m256i a) noexcept
559 {
560 __m256i tmp = _mm256_hadd_epi32 (a, a);
561 tmp = _mm256_hadd_epi32 (tmp, tmp);
562
563 #if JUCE_GCC
564 return static_cast<uint32_t> (tmp[0]) + static_cast<uint32_t> (tmp[2]);
565 #else
566 constexpr int mask = (2 << 0) | (3 << 2) | (0 << 4) | (1 << 6);
567
568 return static_cast<uint32_t> (_mm256_cvtsi256_si32 (tmp))
569 + static_cast<uint32_t> (_mm256_cvtsi256_si32 (_mm256_permute4x64_epi64 (tmp, mask)));
570 #endif
571 }
572};
573
574//==============================================================================
579template <>
580struct SIMDNativeOps<int64_t>
581{
582 //==============================================================================
583 using vSIMDType = __m256i;
584
585 //==============================================================================
586 DECLARE_AVX_SIMD_CONST (int64_t, kAllBitsSet);
587
588 static forcedinline __m256i JUCE_VECTOR_CALLTYPE expand (int64_t s) noexcept { return _mm256_set1_epi64x ((int64_t) s); }
589 static forcedinline __m256i JUCE_VECTOR_CALLTYPE load (const int64_t* p) noexcept { return _mm256_load_si256 (reinterpret_cast<const __m256i*> (p)); }
590 static forcedinline void JUCE_VECTOR_CALLTYPE store (__m256i value, int64_t* dest) noexcept { _mm256_store_si256 (reinterpret_cast<__m256i*> (dest), value); }
591 static forcedinline __m256i JUCE_VECTOR_CALLTYPE add (__m256i a, __m256i b) noexcept { return _mm256_add_epi64 (a, b); }
592 static forcedinline __m256i JUCE_VECTOR_CALLTYPE sub (__m256i a, __m256i b) noexcept { return _mm256_sub_epi64 (a, b); }
593 static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_and (__m256i a, __m256i b) noexcept { return _mm256_and_si256 (a, b); }
594 static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_or (__m256i a, __m256i b) noexcept { return _mm256_or_si256 (a, b); }
595 static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_xor (__m256i a, __m256i b) noexcept { return _mm256_xor_si256 (a, b); }
596 static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_andnot (__m256i a, __m256i b) noexcept { return _mm256_andnot_si256 (a, b); }
597 static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_not (__m256i a) noexcept { return _mm256_andnot_si256 (a, load (kAllBitsSet)); }
598 static forcedinline __m256i JUCE_VECTOR_CALLTYPE min (__m256i a, __m256i b) noexcept { __m256i lt = greaterThan (b, a); return bit_or (bit_and (lt, a), bit_andnot (lt, b)); }
599 static forcedinline __m256i JUCE_VECTOR_CALLTYPE max (__m256i a, __m256i b) noexcept { __m256i gt = greaterThan (a, b); return bit_or (bit_and (gt, a), bit_andnot (gt, b)); }
600 static forcedinline __m256i JUCE_VECTOR_CALLTYPE equal (__m256i a, __m256i b) noexcept { return _mm256_cmpeq_epi64 (a, b); }
601 static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThan (__m256i a, __m256i b) noexcept { return _mm256_cmpgt_epi64 (a, b); }
602 static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m256i a, __m256i b) noexcept { return bit_or (greaterThan (a, b), equal (a,b)); }
603 static forcedinline __m256i JUCE_VECTOR_CALLTYPE multiplyAdd (__m256i a, __m256i b, __m256i c) noexcept { return add (a, mul (b, c)); }
604 static forcedinline __m256i JUCE_VECTOR_CALLTYPE notEqual (__m256i a, __m256i b) noexcept { return bit_not (equal (a, b)); }
605 static forcedinline bool JUCE_VECTOR_CALLTYPE allEqual (__m256i a, __m256i b) noexcept { return (_mm256_movemask_epi8 (equal (a, b)) == -1); }
606 static forcedinline int64_t JUCE_VECTOR_CALLTYPE get (__m256i v, size_t i) noexcept { return SIMDFallbackOps<int64_t, __m256i>::get (v, i); }
607 static forcedinline __m256i JUCE_VECTOR_CALLTYPE set (__m256i v, size_t i, int64_t s) noexcept { return SIMDFallbackOps<int64_t, __m256i>::set (v, i, s); }
608 static forcedinline int64_t JUCE_VECTOR_CALLTYPE sum (__m256i a) noexcept { return SIMDFallbackOps<int64_t, __m256i>::sum (a); }
609 static forcedinline __m256i JUCE_VECTOR_CALLTYPE mul (__m256i a, __m256i b) noexcept { return SIMDFallbackOps<int64_t, __m256i>::mul (a, b); }
610 static forcedinline __m256i JUCE_VECTOR_CALLTYPE truncate (__m256i a) noexcept { return a; }
611};
612
613//==============================================================================
618template <>
619struct SIMDNativeOps<uint64_t>
620{
621 //==============================================================================
622 using vSIMDType = __m256i;
623
624 //==============================================================================
625 DECLARE_AVX_SIMD_CONST (uint64_t, kAllBitsSet);
626 DECLARE_AVX_SIMD_CONST (uint64_t, kHighBit);
627
628 static forcedinline __m256i JUCE_VECTOR_CALLTYPE expand (uint64_t s) noexcept { return _mm256_set1_epi64x ((int64_t) s); }
629 static forcedinline __m256i JUCE_VECTOR_CALLTYPE load (const uint64_t* p) noexcept { return _mm256_load_si256 (reinterpret_cast<const __m256i*> (p)); }
630 static forcedinline void JUCE_VECTOR_CALLTYPE store (__m256i value, uint64_t* dest) noexcept { _mm256_store_si256 (reinterpret_cast<__m256i*> (dest), value); }
631 static forcedinline __m256i JUCE_VECTOR_CALLTYPE ssign (__m256i a) noexcept { return _mm256_xor_si256 (a, load (kHighBit)); }
632 static forcedinline __m256i JUCE_VECTOR_CALLTYPE add (__m256i a, __m256i b) noexcept { return _mm256_add_epi64 (a, b); }
633 static forcedinline __m256i JUCE_VECTOR_CALLTYPE sub (__m256i a, __m256i b) noexcept { return _mm256_sub_epi64 (a, b); }
634 static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_and (__m256i a, __m256i b) noexcept { return _mm256_and_si256 (a, b); }
635 static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_or (__m256i a, __m256i b) noexcept { return _mm256_or_si256 (a, b); }
636 static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_xor (__m256i a, __m256i b) noexcept { return _mm256_xor_si256 (a, b); }
637 static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_andnot (__m256i a, __m256i b) noexcept { return _mm256_andnot_si256 (a, b); }
638 static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_not (__m256i a) noexcept { return _mm256_andnot_si256 (a, load (kAllBitsSet)); }
639 static forcedinline __m256i JUCE_VECTOR_CALLTYPE min (__m256i a, __m256i b) noexcept { __m256i lt = greaterThan (b, a); return bit_or (bit_and (lt, a), bit_andnot (lt, b)); }
640 static forcedinline __m256i JUCE_VECTOR_CALLTYPE max (__m256i a, __m256i b) noexcept { __m256i gt = greaterThan (a, b); return bit_or (bit_and (gt, a), bit_andnot (gt, b)); }
641 static forcedinline __m256i JUCE_VECTOR_CALLTYPE equal (__m256i a, __m256i b) noexcept { return _mm256_cmpeq_epi64 (a, b); }
642 static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThan (__m256i a, __m256i b) noexcept { return _mm256_cmpgt_epi64 (ssign (a), ssign (b)); }
643 static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m256i a, __m256i b) noexcept { return bit_or (greaterThan (a, b), equal (a,b)); }
644 static forcedinline __m256i JUCE_VECTOR_CALLTYPE multiplyAdd (__m256i a, __m256i b, __m256i c) noexcept { return add (a, mul (b, c)); }
645 static forcedinline __m256i JUCE_VECTOR_CALLTYPE notEqual (__m256i a, __m256i b) noexcept { return bit_not (equal (a, b)); }
646 static forcedinline bool JUCE_VECTOR_CALLTYPE allEqual (__m256i a, __m256i b) noexcept { return (_mm256_movemask_epi8 (equal (a, b)) == -1); }
647 static forcedinline uint64_t JUCE_VECTOR_CALLTYPE get (__m256i v, size_t i) noexcept { return SIMDFallbackOps<uint64_t, __m256i>::get (v, i); }
648 static forcedinline __m256i JUCE_VECTOR_CALLTYPE set (__m256i v, size_t i, uint64_t s) noexcept { return SIMDFallbackOps<uint64_t, __m256i>::set (v, i, s); }
649 static forcedinline uint64_t JUCE_VECTOR_CALLTYPE sum (__m256i a) noexcept { return SIMDFallbackOps<uint64_t, __m256i>::sum (a); }
650 static forcedinline __m256i JUCE_VECTOR_CALLTYPE mul (__m256i a, __m256i b) noexcept { return SIMDFallbackOps<uint64_t, __m256i>::mul (a, b); }
651 static forcedinline __m256i JUCE_VECTOR_CALLTYPE truncate (__m256i a) noexcept { return a; }
652};
653
654#endif
655
656JUCE_END_IGNORE_WARNINGS_GCC_LIKE
657
658} // namespace juce::dsp