mul(avx_simd_complex_float<T> lhs, avx_simd_complex_float<T> rhs) {
    //lhs = [x1.real, x1.imag, x2.real, x2.imag, ...]
    //rhs = [y1.real, y1.imag, y2.real, y2.imag, ...]

    //ymm1 = [y1.real, y1.real, y2.real, y2.real, ...]
    __m256 ymm1 = _mm256_moveldup_ps(rhs.value);

    //ymm2 = [x1.imag, x1.real, x2.imag, x2.real, ...]
    __m256 ymm2 = _mm256_permute_ps(lhs.value, 0b10110001);

    //ymm3 = [y1.imag, y1.imag, y2.imag, y2.imag, ...]
    __m256 ymm3 = _mm256_movehdup_ps(rhs.value);

    //ymm4 = ymm2 * ymm3
    __m256 ymm4 = _mm256_mul_ps(ymm2, ymm3);

    //result = (lhs * ymm1) -+ ymm4
#ifdef __FMA__
    return _mm256_fmaddsub_ps(lhs.value, ymm1, ymm4);
#elif defined(__FMA4__)
    return _mm256_maddsub_ps(lhs.value, ymm1, ymm4);
#else
    __m256 tmp = _mm256_mul_ps(lhs.value, ymm1);
    return _mm256_addsub_ps(tmp, ymm4);
#endif
}
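/* The sequence above implements (a+bi)(c+di) = (ac - bd) + (ad + bc)i on the
 * interleaved [re, im, re, im, ...] layout. The standalone sketch below repeats
 * the same moveldup/permute/movehdup/addsub steps on a raw __m256 and checks
 * them against std::complex; mul_cplx_ps and the driver are illustrative and
 * not part of the original library. */
#include <immintrin.h>
#include <complex>
#include <cstdio>

static __m256 mul_cplx_ps(__m256 lhs, __m256 rhs) {
    __m256 ymm1 = _mm256_moveldup_ps(rhs);            // [y.re, y.re, ...]
    __m256 ymm2 = _mm256_permute_ps(lhs, 0b10110001); // [x.im, x.re, ...]
    __m256 ymm3 = _mm256_movehdup_ps(rhs);            // [y.im, y.im, ...]
    __m256 ymm4 = _mm256_mul_ps(ymm2, ymm3);          // [x.im*y.im, x.re*y.im, ...]
    __m256 tmp  = _mm256_mul_ps(lhs, ymm1);           // [x.re*y.re, x.im*y.re, ...]
    return _mm256_addsub_ps(tmp, ymm4);               // even lanes: sub, odd lanes: add
}

int main() {
    alignas(32) float x[8] = {1, 2, 3, 4, 5, 6, 7, 8};
    alignas(32) float y[8] = {8, 7, 6, 5, 4, 3, 2, 1};
    alignas(32) float out[8];
    _mm256_store_ps(out, mul_cplx_ps(_mm256_load_ps(x), _mm256_load_ps(y)));
    for (int k = 0; k < 4; ++k) {
        std::complex<float> ref = std::complex<float>(x[2*k], x[2*k+1])
                                * std::complex<float>(y[2*k], y[2*k+1]);
        std::printf("lane %d: got (%g, %g), expected (%g, %g)\n",
                    k, out[2*k], out[2*k+1], ref.real(), ref.imag());
    }
    return 0;
}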
/****************************************************************
 * This technique for efficient SIMD complex-complex multiplication was found at
 * https://software.intel.com/file/1000
 *****************************************************************/
inline __m256 avx_multiply_float_complex_(const __m256& vecA, const __m256& vecB) {
    __m256 vec1 = _mm256_moveldup_ps(vecB);   // [b.re, b.re, ...]
    __m256 vec2 = _mm256_movehdup_ps(vecB);   // [b.im, b.im, ...]
    vec1 = _mm256_mul_ps(vecA, vec1);         // [a.re*b.re, a.im*b.re, ...]
    vec2 = _mm256_mul_ps(vecA, vec2);         // [a.re*b.im, a.im*b.im, ...]
    vec2 = _mm256_permute_ps(vec2, 0xB1);     // [a.im*b.im, a.re*b.im, ...]
    return _mm256_addsub_ps(vec1, vec2);      // even: re difference, odd: im sum
}
// =============================================================
// ====================== RGBX2BGRX_32F ========================
// =============================================================
void _rgbx2bgrx_32f(const float* _src, float* _dest,
                    unsigned int _width, unsigned int _pitchs, unsigned int _pitchd,
                    unsigned int _start, unsigned int _stop) {
#ifdef USE_SSE
    const unsigned int widthz = (_pitchs/8);

    // Get start positions for buffers
    const float* tsrc;
    float* tdest;

    for( unsigned int y=_start; y<=_stop; ++y ) {
        tsrc = _src+(y*_pitchs);
        tdest = _dest+(y*_pitchd);

        for( unsigned int x=0; x<widthz; ++x ) {
#ifdef USE_AVX1
            // Swap the R and B channels of two RGBX pixels at once.
            const __m256 v0 = _mm256_load_ps(tsrc); tsrc+=8;
            __m256 r0 = _mm256_permute_ps(v0,0xc6);
            _mm256_store_ps(tdest, r0 ); tdest+=8;
#else
            // NOT TESTED
            const __m128 v0 = _mm_load_ps(tsrc); tsrc+=4;
            const __m128 v1 = _mm_load_ps(tsrc); tsrc+=4;
            __m128 r0 = _mm_shuffle_ps(v0,v0,0xc6);
            __m128 r1 = _mm_shuffle_ps(v1,v1,0xc6);
            _mm_store_ps(tdest, r0 ); tdest+=4;
            _mm_store_ps(tdest, r1 ); tdest+=4;
#endif
        }
    }
#else
    const float* tsrc;
    float* tdest;

    for( unsigned int y=_start; y<=_stop; ++y ) {
        tsrc = _src+(y*_pitchs);
        tdest = _dest+(y*_pitchd);

        for( unsigned int x=0; x<_width; ++x ) {
            // Swap R and B; also copy G and X so the output matches the SIMD
            // path when _src and _dest are different buffers.
            float t = tsrc[4*x];
            tdest[4*x]   = tsrc[4*x+2];
            tdest[4*x+1] = tsrc[4*x+1];
            tdest[4*x+2] = t;
            tdest[4*x+3] = tsrc[4*x+3];
        }
    }
#endif
}
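/* Hypothetical driver (sketch) for the routine above: swap R/B on a single
 * 4-pixel RGBX row. The pitches are counted in floats and the buffers are
 * 32-byte aligned so the AVX path's aligned loads/stores are legal; the
 * scalar fallback produces the same output. */
#include <cstdio>

int main() {
    alignas(32) float src[16], dst[16];
    for (int p = 0; p < 4; ++p) {
        src[4*p+0] = 1.f;  // R
        src[4*p+1] = 2.f;  // G
        src[4*p+2] = 3.f;  // B
        src[4*p+3] = 4.f;  // X
    }
    _rgbx2bgrx_32f(src, dst, /*width*/4, /*pitchs*/16, /*pitchd*/16,
                   /*start*/0, /*stop*/0);
    std::printf("%g %g %g %g\n", dst[0], dst[1], dst[2], dst[3]);  // 3 2 1 4
    return 0;
}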
bool PaTriList2(PA_STATE &pa, UINT slot, simdvector tri[3]) {
    simdvector &a = PaGetSimdVector(pa, 0, slot);
    simdvector &b = PaGetSimdVector(pa, 1, slot);
    simdvector &c = PaGetSimdVector(pa, 2, slot);
    simdscalar s;

    for (int i = 0; i < 4; ++i) {
        simdvector &v0 = tri[0];
        v0[i] = _simd_blend_ps(a[i], b[i], 0x92);
        v0[i] = _simd_blend_ps(v0[i], c[i], 0x24);
        v0[i] = _mm256_permute_ps(v0[i], 0x6C);
        s = _mm256_permute2f128_ps(v0[i], v0[i], 0x21);
        v0[i] = _simd_blend_ps(v0[i], s, 0x44);

        simdvector &v1 = tri[1];
        v1[i] = _simd_blend_ps(a[i], b[i], 0x24);
        v1[i] = _simd_blend_ps(v1[i], c[i], 0x49);
        v1[i] = _mm256_permute_ps(v1[i], 0xB1);
        s = _mm256_permute2f128_ps(v1[i], v1[i], 0x21);
        v1[i] = _simd_blend_ps(v1[i], s, 0x66);

        simdvector &v2 = tri[2];
        v2[i] = _simd_blend_ps(a[i], b[i], 0x49);
        v2[i] = _simd_blend_ps(v2[i], c[i], 0x92);
        v2[i] = _mm256_permute_ps(v2[i], 0xC6);
        s = _mm256_permute2f128_ps(v2[i], v2[i], 0x21);
        v2[i] = _simd_blend_ps(v2[i], s, 0x22);
    }

    SetNextPaState(pa, PaTriList0, PaTriListSingle0);
    pa.reset = true;
    pa.numPrimsComplete += KNOB_VS_SIMD_WIDTH;
    return true;
}
div(avx_simd_complex_float<T> lhs, avx_simd_complex_float<T> rhs) {
    //lhs = [x1.real, x1.imag, x2.real, x2.imag, ...]
    //rhs = [y1.real, y1.imag, y2.real, y2.imag, ...]

    //ymm0 = [y1.real, y1.real, y2.real, y2.real, ...]
    __m256 ymm0 = _mm256_moveldup_ps(rhs.value);

    //ymm1 = [y1.imag, y1.imag, y2.imag, y2.imag, ...]
    __m256 ymm1 = _mm256_movehdup_ps(rhs.value);

    //ymm2 = [x1.imag, x1.real, x2.imag, x2.real, ...]
    __m256 ymm2 = _mm256_permute_ps(lhs.value, 0b10110001);

    //ymm4 = [x.imag * y.imag, x.real * y.imag, ...]
    __m256 ymm4 = _mm256_mul_ps(ymm2, ymm1);

    //ymm5 = subadd((lhs * ymm0), ymm4)
#ifdef __FMA__
    __m256 ymm5 = _mm256_fmsubadd_ps(lhs.value, ymm0, ymm4);
#else
    __m256 t1 = _mm256_mul_ps(lhs.value, ymm0);
    __m256 t2 = _mm256_sub_ps(_mm256_set1_ps(0.0), ymm4);
    __m256 ymm5 = _mm256_addsub_ps(t1, t2);
#endif

    //ymm3 = [y.imag^2, y.imag^2, ...]
    __m256 ymm3 = _mm256_mul_ps(ymm1, ymm1);

    //ymm0 = ymm0 * ymm0 + ymm3
#ifdef __FMA__
    ymm0 = _mm256_fmadd_ps(ymm0, ymm0, ymm3);
#else
    __m256 t3 = _mm256_mul_ps(ymm0, ymm0);
    ymm0 = _mm256_add_ps(t3, ymm3);
#endif

    //result = ymm5 / ymm0
    return _mm256_div_ps(ymm5, ymm0);
}
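/* Scalar reference (sketch, illustrative only) for the division above: it is
 * the usual multiply-by-the-conjugate form x/y = x*conj(y) / (y.re^2 + y.im^2),
 * which is exactly what the fmsubadd / fmadd / div sequence computes per lane. */
static inline void complex_div_reference(const float x[2], const float y[2],
                                         float out[2]) {
    float denom = y[0] * y[0] + y[1] * y[1];       // y.re^2 + y.im^2
    out[0] = (x[0] * y[0] + x[1] * y[1]) / denom;  // real part
    out[1] = (x[1] * y[0] - x[0] * y[1]) / denom;  // imag part
}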
void test8bit (void)
{
  i1 = _mm_cmpistrm (i2, i3, k4);          /* { dg-error "the third argument must be an 8-bit immediate" } */
  k1 = _mm_cmpistri (i2, i3, k4);          /* { dg-error "the third argument must be an 8-bit immediate" } */
  k1 = _mm_cmpistra (i2, i3, k4);          /* { dg-error "the third argument must be an 8-bit immediate" } */
  k1 = _mm_cmpistrc (i2, i3, k4);          /* { dg-error "the third argument must be an 8-bit immediate" } */
  k1 = _mm_cmpistro (i2, i3, k4);          /* { dg-error "the third argument must be an 8-bit immediate" } */
  k1 = _mm_cmpistrs (i2, i3, k4);          /* { dg-error "the third argument must be an 8-bit immediate" } */
  k1 = _mm_cmpistrz (i2, i3, k4);          /* { dg-error "the third argument must be an 8-bit immediate" } */
  i1 = _mm_cmpestrm (i2, k2, i3, k3, k4);  /* { dg-error "the fifth argument must be an 8-bit immediate" } */
  k1 = _mm_cmpestri (i2, k2, i3, k3, k4);  /* { dg-error "the fifth argument must be an 8-bit immediate" } */
  k1 = _mm_cmpestra (i2, k2, i3, k3, k4);  /* { dg-error "the fifth argument must be an 8-bit immediate" } */
  k1 = _mm_cmpestrc (i2, k2, i3, k3, k4);  /* { dg-error "the fifth argument must be an 8-bit immediate" } */
  k1 = _mm_cmpestro (i2, k2, i3, k3, k4);  /* { dg-error "the fifth argument must be an 8-bit immediate" } */
  k1 = _mm_cmpestrs (i2, k2, i3, k3, k4);  /* { dg-error "the fifth argument must be an 8-bit immediate" } */
  k1 = _mm_cmpestrz (i2, k2, i3, k3, k4);  /* { dg-error "the fifth argument must be an 8-bit immediate" } */
  b1 = _mm256_blend_ps (b2, b3, k4);       /* { dg-error "the last argument must be an 8-bit immediate" } */
  k1 = _cvtss_sh (f1, k4);                 /* { dg-error "the last argument must be an 8-bit immediate" } */
  i1 = _mm256_cvtps_ph (b2, k4);           /* { dg-error "the last argument must be an 8-bit immediate" } */
  b1 = _mm256_dp_ps (b2, b3, k4);          /* { dg-error "the last argument must be an 8-bit immediate" } */
  e1 = _mm256_permute2f128_pd (e2, e3, k4);    /* { dg-error "the last argument must be an 8-bit immediate" } */
  b1 = _mm256_permute2f128_ps (b2, b3, k4);    /* { dg-error "the last argument must be an 8-bit immediate" } */
  l1 = _mm256_permute2f128_si256 (l2, l3, k4); /* { dg-error "the last argument must be an 8-bit immediate" } */
  b1 = _mm256_permute_ps (b2, k4);         /* { dg-error "the last argument must be an 8-bit immediate" } */
  i1 = _mm_aeskeygenassist_si128 (i2, k4); /* { dg-error "the last argument must be an 8-bit immediate" } */
  i1 = _mm_blend_epi16 (i2, i3, k4);       /* { dg-error "the last argument must be an 8-bit immediate" } */
  i1 = _mm_clmulepi64_si128 (i2, i3, k4);  /* { dg-error "the last argument must be an 8-bit immediate" } */
  i1 = _mm_cvtps_ph (a1, k4);              /* { dg-error "the last argument must be an 8-bit immediate" } */
  d1 = _mm_dp_pd (d2, d3, k4);             /* { dg-error "the last argument must be an 8-bit immediate" } */
  a1 = _mm_dp_ps (a2, a3, k4);             /* { dg-error "the last argument must be an 8-bit immediate" } */
  a1 = _mm_insert_ps (a2, a3, k4);         /* { dg-error "the last argument must be an 8-bit immediate" } */
  i1 = _mm_mpsadbw_epu8 (i2, i3, k4);      /* { dg-error "the last argument must be an 8-bit immediate" } */
  a1 = _mm_permute_ps (a2, k4);            /* { dg-error "the last argument must be an 8-bit immediate" } */
  i1 = _mm_slli_si128 (i2, k4);            /* { dg-error "the last argument must be an 8-bit immediate" } */
  i1 = _mm_srli_si128 (i2, k4);            /* { dg-error "the last argument must be an 8-bit immediate" } */
}
template<size_t i0, size_t i1, size_t i2, size_t i3>
INLINE avxb shuffle(const avxb& a) {
    return _mm256_permute_ps(a, _MM_SHUFFLE(i3, i2, i1, i0));
}
template<index_t index_0, index_t index_1, index_t index_2, index_t index_3>
INLINE const avxi shuffle( const avxi& a ) {
    return _mm256_castps_si256(
        _mm256_permute_ps(_mm256_castsi256_ps(a),
                          _MM_SHUFFLE(index_3, index_2, index_1, index_0)));
}
M_ALWAYS_INLINE static void bar(float (& input)[8]) {
    /*
    static constexpr uint_fast8_t idx[][2] = {
        {0, 1}, {3, 2}, {4, 5}, {7, 6}, // (1)
        {0, 2}, {1, 3}, {6, 4}, {7, 5}, // (2)
        {0, 1}, {2, 3}, {5, 4}, {7, 6}, // (3)
        {0, 4}, {1, 5}, {2, 6}, {3, 7}, // (4)
        {0, 2}, {1, 3}, {4, 6}, {5, 7}, // (5)
        {0, 1}, {2, 3}, {4, 5}, {6, 7}  // (6)
    };
    */

    // The indices have to be expressed in a form that is more convenient for
    // the AVX instructions. A loop that later gets unrolled makes no sense here,
    // because stage (4) is a special case... Better to spell it out by hand.
    static constexpr int blend_mask_1 = 0b10011001;
    static constexpr int blend_mask_2 = 0b11000011;
    static constexpr int blend_mask_3 = 0b10100101;
    static constexpr int blend_mask_4 = 0b00001111;
    static constexpr int blend_mask_5 = 0b00110011;
    static constexpr int blend_mask_6 = 0b01010101;

    // Corresponds to (1), (3) and (6)
    static constexpr int permute_mask_1 = 0b10110001;
    // Corresponds to (2) and (5)
    static constexpr int permute_mask_2 = 0b01001110;

    __m256 result = _mm256_load_ps(input);

    // (1)
    __m256 mapped = _mm256_permute_ps(result, permute_mask_1);
    __m256 min = _mm256_min_ps(result, mapped);
    __m256 max = _mm256_max_ps(result, mapped);
    result = _mm256_blend_ps(max, min, blend_mask_1);

    // (2)
    mapped = _mm256_permute_ps(result, permute_mask_2);
    min = _mm256_min_ps(result, mapped);
    max = _mm256_max_ps(result, mapped);
    result = _mm256_blend_ps(max, min, blend_mask_2);

    // (3)
    mapped = _mm256_permute_ps(result, permute_mask_1);
    min = _mm256_min_ps(result, mapped);
    max = _mm256_max_ps(result, mapped);
    result = _mm256_blend_ps(max, min, blend_mask_3);

    // (4) The special case: here we have to permute between the two
    // halves of the YMM register.
    mapped = _mm256_permute2f128_ps(result, result, 1);
    min = _mm256_min_ps(result, mapped);
    max = _mm256_max_ps(result, mapped);
    result = _mm256_blend_ps(max, min, blend_mask_4);

    // (5)
    mapped = _mm256_permute_ps(result, permute_mask_2);
    min = _mm256_min_ps(result, mapped);
    max = _mm256_max_ps(result, mapped);
    result = _mm256_blend_ps(max, min, blend_mask_5);

    // (6)
    mapped = _mm256_permute_ps(result, permute_mask_1);
    min = _mm256_min_ps(result, mapped);
    max = _mm256_max_ps(result, mapped);
    result = _mm256_blend_ps(max, min, blend_mask_6);

    _mm256_store_ps(input, result);
}
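/* The six stages above form a bitonic sorting network over the eight lanes:
 * (1)-(3) build a bitonic sequence (low half ascending, high half descending)
 * and (4)-(6) merge it. Minimal driver (sketch; assumes bar is callable from
 * this scope and the file is built with AVX enabled): */
#include <cstdio>

int main() {
    alignas(32) float data[8] = {5.f, 1.f, 7.f, 3.f, 8.f, 2.f, 6.f, 4.f};
    bar(data);                                    // in-place ascending sort
    for (float v : data) std::printf("%g ", v);   // expected: 1 2 3 4 5 6 7 8
    std::printf("\n");
    return 0;
}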
    }
    nl = (1 << log);
    *newLength = nl;
    float *ret = mallocf(nl + additionalLength);
    memcpy(ret, ptr, length * sizeof(float));
    memsetf(ret + length, 0.f, nl - length);
    return ret;
}

float *rmemcpyf(float *__restrict dest, const float *__restrict src, size_t length) {
#ifdef __AVX__
    for (int i = 0; i < (int)length - 7; i += 8) {
        __m256 vec = _mm256_loadu_ps(src + i);
        vec = _mm256_permute2f128_ps(vec, vec, 1);  // swap the 128-bit halves
        vec = _mm256_permute_ps(vec, 0x1B);         // reverse within each half
        _mm256_storeu_ps(dest + length - i - 8, vec);
    }
    for (size_t i = (length & ~0x7); i < length; i++) {
        dest[length - i - 1] = src[i];
    }
#elif defined(__ARM_NEON__)
    for (int i = 0; i < (int)length - 3; i += 4) {
        float32x4_t vec = vld1q_f32(src + i);
        vec = vrev64q_f32(vec);
        vec = vcombine_f32(vget_high_f32(vec), vget_low_f32(vec));
        vst1q_f32(dest + length - i - 4, vec);
    }
    for (size_t i = (length & ~0x3); i < length; i++) {
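/* Hypothetical driver (sketch): rmemcpyf reverse-copies a buffer, with the AVX
 * loop reversing eight floats at a time and the scalar tail covering the
 * remainder, so a length that is not a multiple of 8 exercises both paths. */
#include <stdio.h>

int main(void) {
    float src[10] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
    float dst[10];
    rmemcpyf(dst, src, 10);
    for (int i = 0; i < 10; i++) {
        printf("%g ", dst[i]);  /* 9 8 7 6 5 4 3 2 1 0 */
    }
    printf("\n");
    return 0;
}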
// PRE:  all vectors aligned,
//       imag_c = [i1,i1,...,i4,i4]
//       vec    = [v1r,v1i,...,v4r,v4i]
// component-wise multiplication
// POST: returns [-i1*v1i,i1*v1r,...,-i4*v4i,i4*v4r]
inline __m256 avx_multiply_float_imag_(const __m256& imag_c, const __m256& vec) {
    static const __m256 zero = _mm256_setzero_ps();
    __m256 vec1 = _mm256_mul_ps(imag_c, vec);
    vec1 = _mm256_permute_ps(vec1, 0xB1);
    return _mm256_addsub_ps(zero, vec1);
}
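/* Scalar reference (sketch, illustrative only): multiplying v = vr + vi*i by a
 * purely imaginary c*i gives -c*vi + c*vr*i, which is what the mul / permute /
 * addsub-with-zero sequence above produces in each lane pair. */
static inline void imag_mul_reference(float c, const float v[2], float out[2]) {
    out[0] = -c * v[1];  // real part
    out[1] =  c * v[0];  // imag part
}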
inline __m256 _mm256_broadcast_3_ss(__m256 a) {
    // Broadcast element 3 (from the low 128-bit lane) to all eight lanes.
    __m256 b = _mm256_permute_ps(a, _MM_SHUFFLE(3, 3, 3, 3));
    return _mm256_blend_ps(b, _mm256_permute2f128_ps(b, b, 1), 0xF0);
}
inline __m256 _mm256_broadcast_lo_ss(__m256 a) {
    // Broadcast element 0 (from the low 128-bit lane) to all eight lanes.
    __m256 b = _mm256_permute_ps(a, _MM_SHUFFLE(0, 0, 0, 0));
    return _mm256_blend_ps(b, _mm256_permute2f128_ps(b, b, 1), 0xF0);
}
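/* Both broadcast helpers use the same pattern: an in-lane permute replicates
 * the chosen element within each 128-bit half, then a lane swap plus blend
 * copies the low half's value across the whole register. Quick check (sketch;
 * the driver itself is illustrative): */
#include <immintrin.h>
#include <cstdio>

int main() {
    alignas(32) float in[8] = {10, 11, 12, 13, 14, 15, 16, 17};
    alignas(32) float out[8];
    _mm256_store_ps(out, _mm256_broadcast_3_ss(_mm256_load_ps(in)));
    for (float v : out) std::printf("%g ", v);  // 13 repeated eight times
    std::printf("\n");
    return 0;
}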
__m256 test_mm256_permute_ps(__m256 a) {
  // Check if the mask is correct
  // CHECK: shufflevector{{.*}}<i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
  return _mm256_permute_ps(a, 0x1b);
}