void simd_complex_mult(complex float *a, complex float *b, complex float *c, complex float *r)
{
    __m128 a_reg, b_reg, c_reg, t_reg1, t_reg2, r_reg;

    a_reg = _mm_loadu_ps((float *) b);
    b_reg = _mm_loadu_ps((float *) a);
    c_reg = _mm_loadu_ps((float *) c);

    /* t = a*b (two complex products per vector) */
    t_reg1 = _mm_moveldup_ps(b_reg);              /* duplicate real parts            */
    t_reg2 = _mm_mul_ps(t_reg1, a_reg);
    a_reg  = _mm_shuffle_ps(a_reg, a_reg, 0xb1);  /* swap re/im within each pair     */
    t_reg1 = _mm_movehdup_ps(b_reg);              /* duplicate imaginary parts       */
    t_reg1 = _mm_mul_ps(t_reg1, a_reg);
    r_reg  = _mm_addsub_ps(t_reg2, t_reg1);

    /* r = (a*b)*c */
    t_reg1 = _mm_moveldup_ps(r_reg);
    t_reg2 = _mm_mul_ps(t_reg1, c_reg);
    c_reg  = _mm_shuffle_ps(c_reg, c_reg, 0xb1);
    t_reg1 = _mm_movehdup_ps(r_reg);
    t_reg1 = _mm_mul_ps(t_reg1, c_reg);
    r_reg  = _mm_addsub_ps(t_reg2, t_reg1);

    _mm_storeu_ps((float *) r, r_reg);
}
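// Editorial addition (not part of the original source): a minimal scalar reference, useful
// to check that simd_complex_mult() computes r[i] = a[i]*b[i]*c[i] for both complex lanes.
#include <complex.h>

static void scalar_complex_mult(const complex float *a, const complex float *b,
                                const complex float *c, complex float *r)
{
    for (int i = 0; i < 2; i++)
        r[i] = a[i] * b[i] * c[i];   /* same per-lane product as the SSE3 version */
}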
static __inline __m128 ZMUL2(__m128 a, __m128 b, __m128 sign)
{
#ifdef SSE3_
    // a = a1.r a1.i a2.r a2.i
    // b = b1.r b1.i b2.r b2.i
    __m128 ar;
    ar = _mm_moveldup_ps(a);                            // ar = a1.r a1.r a2.r a2.r
    a  = _mm_movehdup_ps(a);                            // a  = a1.i a1.i a2.i a2.i
    ar = _mm_mul_ps(ar, b);                             // ar = a1.r*b1.r a1.r*b1.i a2.r*b2.r a2.r*b2.i
    b  = _mm_shuffle_ps(b, b, _MM_SHUFFLE(2, 3, 0, 1)); // b  = b1.i b1.r b2.i b2.r
    a  = _mm_mul_ps(a, b);                              // ai = a1.i*b1.i a1.i*b1.r a2.i*b2.i a2.i*b2.r
    return _mm_addsub_ps(ar, a);                        // a1.r*b1.r-a1.i*b1.i  a1.r*b1.i+a1.i*b1.r  a2.r*b2.r-a2.i*b2.i  a2.r*b2.i+a2.i*b2.r
#else
    // a = a1.r a1.i a2.r a2.i
    // b = b1.r b1.i b2.r b2.i
    __m128 ar;
    ar = _mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 2, 0, 0)); // ar = a1.r a1.r a2.r a2.r
    a  = _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 3, 1, 1)); // ai = a1.i a1.i a2.i a2.i
    ar = _mm_mul_ps(ar, b);                             // ar = +a1.r*b1.r +a1.r*b1.i +a2.r*b2.r +a2.r*b2.i
    a  = _mm_xor_ps(a, sign);                           // ai = a1.i -a1.i a2.i -a2.i
    a  = _mm_mul_ps(a, b);                              // ai = a1.i*b1.r -a1.i*b1.i a2.i*b2.r -a2.i*b2.i
    a  = _mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 3, 0, 1)); // ai = -a1.i*b1.i +a1.i*b1.r -a2.i*b2.i +a2.i*b2.r
    return _mm_add_ps(ar, a);                           // a1.r*b1.r-a1.i*b1.i  a1.r*b1.i+a1.i*b1.r  a2.r*b2.r-a2.i*b2.i  a2.r*b2.i+a2.i*b2.r
#endif
}
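// Hedged usage sketch (not from the original source): the non-SSE3 path of ZMUL2 expects a
// sign mask that flips the sign bit of the odd lanes, so a caller would typically pass:
//   const __m128 sign = _mm_set_ps(-0.0f, 0.0f, -0.0f, 0.0f);  /* lanes {+, -, +, -} in memory order */
//   __m128 z = ZMUL2(a, b, sign);
// On the SSE3 path the mask is unused because _mm_addsub_ps supplies the alternating signs.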
/* SSE implementation for complex reciprocal */
inline __m128 srslte_mat_cf_recip_sse(__m128 a)
{
    __m128 conj  = _MM_CONJ_PS(a);
    __m128 sqabs = _mm_mul_ps(a, a);
    sqabs = _mm_add_ps(_mm_movehdup_ps(sqabs), _mm_moveldup_ps(sqabs));

    __m128 recp = _mm_rcp_ps(sqabs);

    return _mm_mul_ps(recp, conj);
}
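// The routine above implements 1/z = conj(z)/|z|^2: _MM_CONJ_PS negates the imaginary lanes,
// the movehdup/moveldup pair sums re^2 + im^2 into both lanes of each complex slot, and
// _mm_rcp_ps supplies an approximate (~12-bit) reciprocal of |z|^2. A scalar sketch of the
// same idea (editorial addition, full precision, not srsLTE code):
#include <complex.h>

static inline float complex cf_recip_scalar(float complex z)
{
    float sqabs = crealf(z) * crealf(z) + cimagf(z) * cimagf(z);
    return conjf(z) / sqabs;     /* conj(z) / |z|^2 */
}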
#include <stdio.h>
#include <pmmintrin.h>   /* SSE3 intrinsics */

int main(void)
{
    __m128 A1, A2, A, B, C, B1, B2, D;
    float a[4] __attribute__((aligned(16))) = {1.0, 2.0, 3.0, 4.0};
    float b[4] __attribute__((aligned(16))) = {0.1, 0.2, 0.3, 0.4};

    A = _mm_load_ps(a);
    B = _mm_load_ps(b);

    A1 = _mm_moveldup_ps(A);                              /* duplicate real parts      */
    A2 = _mm_movehdup_ps(A);                              /* duplicate imaginary parts */
    B1 = _mm_mul_ps(A1, B);
    B2 = _mm_mul_ps(A2, B);
    C  = _mm_shuffle_ps(B2, B2, _MM_SHUFFLE(2, 3, 0, 1)); /* swap re/im within each pair */
    D  = _mm_addsub_ps(B1, C);

    _mm_store_ps(a, D);
    printf("(%f, %f) (%f, %f)\n", a[0], a[1], a[2], a[3]);

    return 0;
}
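// Interpreting a[] and b[] as two complex numbers each, the program computes
// (1+2j)*(0.1+0.2j) = -0.3+0.4j and (3+4j)*(0.3+0.4j) = -0.7+2.4j, so it should print
// approximately "(-0.300000, 0.400000) (-0.700000, 2.400000)".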
__m128 test_mm_movehdup_ps(__m128 A) {
  // CHECK-LABEL: test_mm_movehdup_ps
  // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
  return _mm_movehdup_ps(A);
}
//-----------------------------------------------------------------------------------------
// SSE3 complex multiplication with different kernel sizes
//-----------------------------------------------------------------------------------------
// SSE3 2x complex multiplication (for details see example 6-9 in the Intel 64 and IA-32 Architectures Optimization Reference Manual).
// Complex multiplication is defined as: (a+jb)*(c+jd) = a*c - b*d + j(a*d + b*c)
// z1 = a1*c1 - b1*d1 + j(a1*d1 + b1*c1)
// z2 = a2*c2 - b2*d2 + j(a2*d2 + b2*c2)
// A = { a1, jb1, a2, jb2 }
// B = { c1, jd1, c2, jd2 }
// C = { Re{z1}, Im{z1}, Re{z2}, Im{z2} } = { a1*c1 - b1*d1, j(a1*d1 + b1*c1), a2*c2 - b2*d2, j(a2*d2 + b2*c2) }
LXC_ERROR_CODE LXC_SSE3CpxMul_K2(uint Size, void *X, void *H, void *Z)
{
  if(!X || !H || !Z)
  {
    return LXC_ERR_INVALID_INPUT;
  }

  Size = Size*2;
  float *m_X = (float*)X;
  float *m_H = (float*)H;
  float *m_Z = (float*)Z;

  for(uint ii=0; ii < Size; ii+=4)
  {
    // local variables
    __m128 val1;
    __m128 val2;

    // load values into __m128
    val1 = _mm_load_ps(&m_X[ii]); // _mm_load_ps: src{ a1, b1, a2, b2 } --> val1 { a1, b1, a2, b2 }
    val2 = _mm_load_ps(&m_H[ii]); // _mm_load_ps: src{ c1, d1, c2, d2 } --> val2 { c1, d1, c2, d2 }

    // duplicate, multiply, add/subtract and store in one expression:
    //   _mm_moveldup_ps: src{ a1, b1, a2, b2 } --> { a1, a1, a2, a2 }
    //   _mm_movehdup_ps: src{ a1, b1, a2, b2 } --> { b1, b1, b2, b2 }
    //   first product:   { a1*c1, a1*d1, a2*c2, a2*d2 }
    //   second product:  reorder { c1, d1, c2, d2 } --> { d1, c1, d2, c2 } and multiply: { b1*d1, b1*c1, b2*d2, b2*c2 }
    //   _mm_addsub_ps:   { a1*c1 - b1*d1, a1*d1 + b1*c1, a2*c2 - b2*d2, a2*d2 + b2*c2 }
    //   _mm_store_ps:    Z[ii..ii+3] = result
    _mm_store_ps(&m_Z[ii],
                 _mm_addsub_ps(_mm_mul_ps(_mm_moveldup_ps(val1), val2),
                               _mm_mul_ps(_mm_movehdup_ps(val1),
                                          _mm_shuffle_ps(val2, val2, _MM_SHUFFLE(2,3,0,1)))));

    // old loop
    //// local variables
    //__m128 val1;
    //__m128 val2;
    //__m128 val3;
    //__m128 val4;

    //// load values into __m128
    //val1 = _mm_load_ps(&m_X[ii]); // _mm_load_ps: src{ a1, b1, a2, b2 } --> val1 { a1, b1, a2, b2 }
    //val2 = _mm_load_ps(&m_H[ii]); // _mm_load_ps: src{ c1, d1, c2, d2 } --> val2 { c1, d1, c2, d2 }

    //// duplicate values
    //val3 = _mm_moveldup_ps(val1); // _mm_moveldup_ps: src{ a1, b1, a2, b2 } --> val3 { a1, a1, a2, a2 }
    //val4 = _mm_movehdup_ps(val1); // _mm_movehdup_ps: src{ a1, b1, a2, b2 } --> val4 { b1, b1, b2, b2 }

    //// sse3 multiply
    //val1 = _mm_mul_ps(val3, val2); // calc { a1*c1, a1*d1, a2*c2, a2*d2 }

    //// reorder im and re numbers { c1, d1, c2, d2 } --> { d1, c1, d2, c2 } and multiply { b1*d1, b1*c1, b2*d2, b2*c2 }
    //val3 = _mm_mul_ps(val4, _mm_shuffle_ps(val2, val2, _MM_SHUFFLE(2,3,0,1)));

    //// add/subtract and store operations
    //val3 = _mm_addsub_ps(val1, val3); // _mm_addsub_ps: ret { a1*c1 - b1*d1, a1*d1 + b1*c1, a2*c2 - b2*d2, a2*d2 + b2*c2 }
    //_mm_store_ps(&m_Z[ii], val3);     // _mm_store_ps: Z[ii..ii+3] = result
  }

  return LXC_NO_ERR;
}
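// Usage note (assumptions, not stated in the original): X, H and Z are interleaved re/im
// float buffers holding `Size` complex samples each, `Size` is even, and all three buffers
// are 16-byte aligned since the kernel uses _mm_load_ps/_mm_store_ps. A scalar reference
// for verification might look like:
static void cpx_mul_ref(unsigned int Size, const float *X, const float *H, float *Z)
{
    for (unsigned int i = 0; i < 2*Size; i += 2) {
        float a = X[i], b = X[i+1], c = H[i], d = H[i+1];
        Z[i]   = a*c - b*d;      /* real part      */
        Z[i+1] = a*d + b*c;      /* imaginary part */
    }
}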
void gemm(complex float* A, complex float* B, complex float* C,
          int m, int n, int k, complex float alpha, complex float beta)
{
    __m128 c_reg, a_reg, b_reg, alpha_reg, beta_reg, t, t2, t3;

    /* broadcast beta and alpha into both complex lanes of an __m128 */
    complex float *beta_reg_value = malloc(sizeof(complex float)*2);
    beta_reg_value[0] = beta;
    beta_reg_value[1] = beta;
    beta_reg = _mm_loadu_ps((float*)beta_reg_value);

    complex float *alpha_reg_value = malloc(sizeof(complex float)*2);
    alpha_reg_value[0] = alpha;
    alpha_reg_value[1] = alpha;
    alpha_reg = _mm_loadu_ps((float*)alpha_reg_value);

    complex float *a_value = malloc(sizeof(complex float)*2);

    for(int x = 0; x < n; x += 2){               /* two complex columns of C per iteration */
        for(int y = 0; y < m; y++){
            t3 = _mm_setzero_ps();               /* accumulator for A[y][.]*B[.][x..x+1] */
            for(int z = 0; z < k; z++){
                /* A[y*k+z]*B[z*n+x] and A[y*k+z]*B[z*n+x+1] */
                a_value[0] = A[y*k + z];
                a_value[1] = a_value[0];
                a_reg = _mm_loadu_ps((float*)a_value);
                b_reg = _mm_loadu_ps((float*)&B[z*n + x]);

                t  = _mm_moveldup_ps(a_reg);                 /* duplicate real parts        */
                t2 = _mm_mul_ps(t, b_reg);
                b_reg = _mm_shuffle_ps(b_reg, b_reg, 0xb1);  /* swap re/im within each pair */
                t  = _mm_movehdup_ps(a_reg);                 /* duplicate imaginary parts   */
                t  = _mm_mul_ps(t, b_reg);
                a_reg = _mm_addsub_ps(t2, t);

                t3 = _mm_add_ps(t3, a_reg);
            }

            /* c_reg = beta * C[y][x..x+1] */
            c_reg = _mm_loadu_ps((float*)&C[y*n + x]);
            t  = _mm_moveldup_ps(c_reg);
            t2 = _mm_mul_ps(t, beta_reg);
            beta_reg = _mm_shuffle_ps(beta_reg, beta_reg, 0xb1);
            t  = _mm_movehdup_ps(c_reg);
            t  = _mm_mul_ps(t, beta_reg);
            c_reg = _mm_addsub_ps(t2, t);
            beta_reg = _mm_shuffle_ps(beta_reg, beta_reg, 0xb1);    /* restore lane order */

            /* b_reg = alpha * sum_z(A[y][z]*B[z][x..x+1]) */
            t  = _mm_moveldup_ps(t3);
            t2 = _mm_mul_ps(t, alpha_reg);
            alpha_reg = _mm_shuffle_ps(alpha_reg, alpha_reg, 0xb1);
            t  = _mm_movehdup_ps(t3);
            t  = _mm_mul_ps(t, alpha_reg);
            b_reg = _mm_addsub_ps(t2, t);
            alpha_reg = _mm_shuffle_ps(alpha_reg, alpha_reg, 0xb1); /* restore lane order */

            c_reg = _mm_add_ps(b_reg, c_reg);
            _mm_storeu_ps((float*)&C[y*n + x], c_reg);
        }
    }

    free(beta_reg_value);
    free(alpha_reg_value);
    free(a_value);
}
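// Hedged usage sketch (not from the original source): the kernel assumes row-major A (m x k),
// B (k x n) and C (m x n) with an even n, and computes C = alpha*A*B + beta*C two columns at
// a time. A call might look like:
//   complex float A[4] = {1+1*I, 2, 3, 4*I};
//   complex float B[4] = {1, 0, 0, 1};          /* 2x2 identity */
//   complex float C[4] = {0};
//   gemm(A, B, C, 2, 2, 2, 1.0f, 0.0f);         /* C should equal A */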
int sse3_ChirpData_ak8(
    sah_complex * cx_DataArray,
    sah_complex * cx_ChirpDataArray,
    int chirp_rate_ind,
    double chirp_rate,
    int ul_NumDataPoints,
    double sample_rate
  ) {
#ifdef USE_MANUAL_CALLSTACK
    call_stack.enter("sse3_ChirpData_ak8()");
#endif
    int i;

    if (chirp_rate_ind == 0) {
      memcpy(cx_ChirpDataArray, cx_DataArray, (int)ul_NumDataPoints * sizeof(sah_complex));
#ifdef USE_MANUAL_CALLSTACK
      call_stack.exit();
#endif
      return 0;
    }

    int vEnd;
    double srate = chirp_rate * 0.5 / (sample_rate * sample_rate);
    __m128d rate = _mm_set1_pd(chirp_rate * 0.5 / (sample_rate * sample_rate));
    __m128d roundVal = _mm_set1_pd(srate >= 0.0 ? TWO_TO_52 : -TWO_TO_52);
    __m128d DFOUR = _mm_set_pd(4.0, 4.0);

    // main vectorised loop
    vEnd = ul_NumDataPoints - (ul_NumDataPoints & 3);
    __m128d di1 = _mm_set_pd(2.0, 0.0);   // set time patterns for eventual moveldup/movehdup
    __m128d di2 = _mm_set_pd(3.0, 1.0);

    for (i = 0; i < vEnd; i += 4) {
      const float *d = (const float *) (cx_DataArray + i);
      float *cd = (float *) (cx_ChirpDataArray + i);

      __m128d a1, a2;
      __m128 d1, d2;
      __m128 cd1, cd2;
      __m128 td1, td2;
      __m128 x;
      __m128 y;
      __m128 z;
      __m128 s;
      __m128 c;
      __m128 m;

      // load the signal to be chirped
      d1 = _mm_load_ps(d);
      d2 = _mm_load_ps(d+4);

      // calculate the input angle
      a1 = _mm_mul_pd(_mm_mul_pd(di1, di1), rate);
      a2 = _mm_mul_pd(_mm_mul_pd(di2, di2), rate);

      // update times for the next iteration
      di1 = _mm_add_pd(di1, DFOUR);
      di2 = _mm_add_pd(di2, DFOUR);

      // reduce the angle to the range (-0.5, 0.5)
      a1 = _mm_sub_pd(a1, _mm_sub_pd(_mm_add_pd(a1, roundVal), roundVal));
      a2 = _mm_sub_pd(a2, _mm_sub_pd(_mm_add_pd(a2, roundVal), roundVal));

      // convert the two pairs of packed doubles into packed singles
      x = _mm_movelh_ps(_mm_cvtpd_ps(a1), _mm_cvtpd_ps(a2));    // 3 1 2 0

      // square to the range [0, 0.25)
      y = _mm_mul_ps(x, x);

      // perform the initial polynomial approximations, Estrin's method
      z = _mm_mul_ps(y, y);
      s = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(y, SS4F), SS3F), z),
                                _mm_add_ps(_mm_mul_ps(y, SS2F), SS1F)),
                     x);
      c = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(y, CC3F), CC2F), z),
                     _mm_add_ps(_mm_mul_ps(y, CC1F), ONE));

      // perform the first angle doubling
      x = _mm_sub_ps(_mm_mul_ps(c, c), _mm_mul_ps(s, s));
      y = _mm_mul_ps(_mm_mul_ps(s, c), TWO);

      // calculate the scaling factor to correct the magnitude
      m = _mm_sub_ps(_mm_sub_ps(TWO, _mm_mul_ps(x, x)), _mm_mul_ps(y, y));

      // perform the second angle doubling
      c = _mm_sub_ps(_mm_mul_ps(x, x), _mm_mul_ps(y, y));
      s = _mm_mul_ps(_mm_mul_ps(y, x), TWO);

      // correct the magnitude (final sine / cosine approximations)
      c = _mm_mul_ps(c, m);                 // c3 c1 c2 c0
      s = _mm_mul_ps(s, m);

      // chirp the data
      cd1 = _mm_moveldup_ps(c);             // c1 c1 c0 c0
      cd2 = _mm_movehdup_ps(c);             // c3 c3 c2 c2
      cd1 = _mm_mul_ps(cd1, d1);            // c1.i1 c1.r1 c0.i0 c0.r0
      cd2 = _mm_mul_ps(cd2, d2);            // c3.i3 c3.r3 c2.i2 c2.r2
      d1 = _mm_shuffle_ps(d1, d1, 0xb1);
      d2 = _mm_shuffle_ps(d2, d2, 0xb1);
      td1 = _mm_moveldup_ps(s);
      td2 = _mm_movehdup_ps(s);
      td1 = _mm_mul_ps(td1, d1);
      td2 = _mm_mul_ps(td2, d2);
      cd1 = _mm_addsub_ps(cd1, td1);
      cd2 = _mm_addsub_ps(cd2, td2);

      // store the chirped values
      _mm_stream_ps(cd, cd1);
      _mm_stream_ps(cd+4, cd2);
    }

    // handle tail elements with scalar code
    for (; i < ul_NumDataPoints; ++i) {
      double angle = srate * i * i * 0.5;
      double s = sin(angle);
      double c = cos(angle);
      float re = cx_DataArray[i][0];
      float im = cx_DataArray[i][1];

      cx_ChirpDataArray[i][0] = re * c - im * s;
      cx_ChirpDataArray[i][1] = re * s + im * c;
    }

    analysis_state.FLOP_counter += 12.0 * ul_NumDataPoints;

#ifdef USE_MANUAL_CALLSTACK
    call_stack.exit();
#endif
    return 0;
}
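// Editorial notes on the routine above (hedged): adding and subtracting roundVal (+/-2^52)
// rounds each angle to the nearest integer via the double-precision mantissa trick, so the
// final subtraction leaves only the fractional part in (-0.5, 0.5); that fraction feeds the
// short sine/cosine polynomials, whose outputs are angle-doubled twice with m correcting the
// magnitude drift the doublings introduce. Both arrays must be 16-byte aligned because
// _mm_load_ps/_mm_stream_ps are used, and since _mm_stream_ps performs non-temporal stores,
// an _mm_sfence() is typically required before the chirped data is consumed elsewhere.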