void simd_complex_mult(complex float *a, complex float *b, complex float *c, complex float *r)
{
    __m128 a_reg, b_reg, c_reg, t_reg1, t_reg2, r_reg;

    /* Note: the loads are swapped (a_reg holds b, b_reg holds a), which is
       harmless because complex multiplication commutes. */
    a_reg = _mm_loadu_ps((float *) b);
    b_reg = _mm_loadu_ps((float *) a);
    c_reg = _mm_loadu_ps((float *) c);

    /* r = a * b for two packed complex floats */
    t_reg1 = _mm_moveldup_ps(b_reg);
    t_reg2 = t_reg1 * a_reg;
    a_reg  = _mm_shuffle_ps(a_reg, a_reg, 0xb1);  /* swap re/im in each pair */
    t_reg1 = _mm_movehdup_ps(b_reg);
    t_reg1 = t_reg1 * a_reg;
    r_reg  = _mm_addsub_ps(t_reg2, t_reg1);

    /* r = (a * b) * c */
    t_reg1 = _mm_moveldup_ps(r_reg);
    t_reg2 = t_reg1 * c_reg;
    c_reg  = _mm_shuffle_ps(c_reg, c_reg, 0xb1);
    t_reg1 = _mm_movehdup_ps(r_reg);
    t_reg1 = t_reg1 * c_reg;
    r_reg  = _mm_addsub_ps(t_reg2, t_reg1);

    _mm_storeu_ps((float *) r, r_reg);
}
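// Added usage sketch (not from the original source): drives simd_complex_mult()
// above with two packed complex lanes and cross-checks against plain C99 complex
// arithmetic. Assumes <pmmintrin.h> and a GCC-style compiler, as the code above does.
#include <stdio.h>
#include <complex.h>
#include <pmmintrin.h>

int main(void)
{
    complex float a[2] = {1.0f + 2.0f*I, 3.0f + 4.0f*I};
    complex float b[2] = {5.0f + 6.0f*I, 7.0f + 8.0f*I};
    complex float c[2] = {1.0f + 0.0f*I, 0.0f + 1.0f*I};
    complex float r[2];

    simd_complex_mult(a, b, c, r);  // r[i] = a[i]*b[i]*c[i]; expect -7+16i and -52-11i

    for (int i = 0; i < 2; i++)
        printf("lane %d: simd=%.1f%+.1fi scalar=%.1f%+.1fi\n", i,
               crealf(r[i]), cimagf(r[i]),
               crealf(a[i]*b[i]*c[i]), cimagf(a[i]*b[i]*c[i]));
    return 0;
}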
LXC_ERROR_CODE LXC_SSE3FreqCombine2Ch(uint Size, void *X, void *Y, void *Z)
{
    if(!Size || !X || !Y || !Z)
    {
        return LXC_ERR_INVALID_INPUT;
    }

    float *m_X = (float*)X;
    float *m_Y = (float*)Y;
    float *m_Z = (float*)Z;

#if defined(TARGET_WINDOWS)
    const __declspec(align(LXC_SSE3_ALIGN)) float scaleFactor = 1.0f / ((float)Size);
#else
    const float scaleFactor = 1.0f / ((float)Size);
#endif

    Size = Size*2;
    __m128 _scale = _mm_load1_ps(&scaleFactor);
    for(uint ii = 0; ii < Size; ii+=4)
    {
        // m_Z[ii][0]   = (m_X[ii][0]   - m_Y[ii][1])*scaleFactor;
        // m_Z[ii][1]   = (m_X[ii][1]   + m_Y[ii][0])*scaleFactor;
        // m_Z[ii+1][0] = (m_X[ii+1][0] - m_Y[ii+1][1])*scaleFactor;
        // m_Z[ii+1][1] = (m_X[ii+1][1] + m_Y[ii+1][0])*scaleFactor;
        //__m128 A = _mm_load_ps(&m_X[ii]);
        __m128 B = _mm_load_ps(&m_Y[ii]);
        B = _mm_shuffle_ps(B, B, LXC_MM_SHUFFLE(1,0,3,2));
        __m128 addRes = _mm_addsub_ps(_mm_load_ps(&m_X[ii]), B);
        _mm_store_ps(&m_Z[ii], _mm_mul_ps(addRes, _scale));
    }

    return LXC_NO_ERR;
}
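// Added for reference (not part of the original library): a scalar version of the
// combine step above, Z[k] = (X[k] + j*Y[k]) / Size in the same interleaved float
// layout. The uint / LXC_* types and codes are assumed from the surrounding code.
LXC_ERROR_CODE LXC_ScalarFreqCombine2Ch_Ref(uint Size, void *X, void *Y, void *Z)
{
    if(!Size || !X || !Y || !Z)
    {
        return LXC_ERR_INVALID_INPUT;
    }

    float *m_X = (float*)X;
    float *m_Y = (float*)Y;
    float *m_Z = (float*)Z;
    const float scaleFactor = 1.0f / ((float)Size);

    for(uint ii = 0; ii < 2*Size; ii += 2)
    {
        m_Z[ii]   = (m_X[ii]   - m_Y[ii+1])*scaleFactor;  // real part
        m_Z[ii+1] = (m_X[ii+1] + m_Y[ii])*scaleFactor;    // imaginary part
    }

    return LXC_NO_ERR;
}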
static __inline __m128 ZMUL2(__m128 a, __m128 b, __m128 sign)
{
#ifdef SSE3_
    // a = a1.r a1.i a2.r a2.i
    // b = b1.r b1.i b2.r b2.i
    __m128 ar;
    ar = _mm_moveldup_ps(a);                              // ar = a1.r a1.r a2.r a2.r
    a  = _mm_movehdup_ps(a);                              // a  = a1.i a1.i a2.i a2.i
    ar = _mm_mul_ps(ar, b);                               // ar = a1.r*b1.r a1.r*b1.i a2.r*b2.r a2.r*b2.i
    b  = _mm_shuffle_ps(b, b, _MM_SHUFFLE(2, 3, 0, 1));   // b  = b1.i b1.r b2.i b2.r
    a  = _mm_mul_ps(a, b);                                // ai = a1.i*b1.i a1.i*b1.r a2.i*b2.i a2.i*b2.r
    return _mm_addsub_ps(ar, a);                          // a1.r*b1.r-a1.i*b1.i a1.r*b1.i+a1.i*b1.r a2.r*b2.r-a2.i*b2.i a2.r*b2.i+a2.i*b2.r
#else
    // a = a1.r a1.i a2.r a2.i
    // b = b1.r b1.i b2.r b2.i
    __m128 ar;
    ar = _mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 2, 0, 0));   // ar = a1.r a1.r a2.r a2.r
    a  = _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 3, 1, 1));   // ai = a1.i a1.i a2.i a2.i
    ar = _mm_mul_ps(ar, b);                               // ar = +a1.r*b1.r +a1.r*b1.i +a2.r*b2.r +a2.r*b2.i
    a  = _mm_xor_ps(a, sign);                             // ai = a1.i -a1.i a2.i -a2.i
    a  = _mm_mul_ps(a, b);                                // ai = a1.i*b1.r -a1.i*b1.i a2.i*b2.r -a2.i*b2.i
    a  = _mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 3, 0, 1));   // ai = -a1.i*b1.i +a1.i*b1.r -a2.i*b2.i +a2.i*b2.r
    return _mm_add_ps(ar, a);                             // a1.r*b1.r-a1.i*b1.i a1.r*b1.i+a1.i*b1.r a2.r*b2.r-a2.i*b2.i a2.r*b2.i+a2.i*b2.r
#endif
}
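// Added usage sketch (not from the original source): shows how the sign operand of
// ZMUL2() can be built for the non-SSE3 fallback path; XOR with -0.0f flips only the
// sign bit, giving the "a1.i -a1.i a2.i -a2.i" pattern annotated above. On the SSE3
// path the argument is ignored. SSE3_ is assumed to be the project's feature macro.
#include <stdio.h>
#include <pmmintrin.h>

int main(void)
{
    __m128 sign = _mm_set_ps(-0.0f, 0.0f, -0.0f, 0.0f);  // negates lanes 1 and 3
    __m128 a = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);      // (1+2i), (3+4i)
    __m128 b = _mm_setr_ps(5.0f, 6.0f, 7.0f, 8.0f);      // (5+6i), (7+8i)
    float out[4];

    _mm_storeu_ps(out, ZMUL2(a, b, sign));
    printf("%.0f%+.0fi %.0f%+.0fi\n", out[0], out[1], out[2], out[3]);
    // expected: (1+2i)(5+6i) = -7+16i and (3+4i)(7+8i) = -11+52i
    return 0;
}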
// internal simd using sse3
void LLMDCTOpt(const float* x, float* y)
{
    float t4,t5,t6,t7;
    float c0,c1,c2,c3;
    float* r = dct_tbl;
    const float invsqrt2  = 0.707107f;  // (float)(1.0f / M_SQRT2);
    const float invsqrt2h = 0.353554f;  // invsqrt2*0.5f;

    {
        __m128 mc1 = _mm_load_ps(x);
        __m128 mc2 = _mm_loadr_ps(x+4);
        __m128 mt1 = _mm_add_ps(mc1,mc2);
        __m128 mt2 = _mm_sub_ps(mc1,mc2);  // rev
        mc1 = _mm_addsub_ps(_mm_shuffle_ps(mt1,mt1,_MM_SHUFFLE(1,1,0,0)),
                            _mm_shuffle_ps(mt1,mt1,_MM_SHUFFLE(2,2,3,3)));
        mc1 = _mm_shuffle_ps(mc1,mc1,_MM_SHUFFLE(0,2,3,1));
        _mm_store_ps(y,mc1);
        _mm_store_ps(y+4,mt2);
    }

    c0=y[0]; c1=y[1]; c2=y[2]; c3=y[3];
    /*c3=y[0]; c0=y[1]; c2=y[2]; c1=y[3];*/
    t7=y[4]; t6=y[5]; t5=y[6]; t4=y[7];

    y[0] = c0 + c1;
    y[4] = c0 - c1;
    y[2] = c2 * r[6] + c3 * r[2];
    y[6] = c3 * r[6] - c2 * r[2];

    c3 = t4 * r[3] + t7 * r[5];
    c0 = t7 * r[3] - t4 * r[5];
    c2 = t5 * r[1] + t6 * r[7];
    c1 = t6 * r[1] - t5 * r[7];

    y[5] = c3 - c1;
    y[3] = c0 - c2;
    c0 = (c0 + c2) * invsqrt2;
    c3 = (c3 + c1) * invsqrt2;
    y[1] = c0 + c3;
    y[7] = c0 - c3;

    const __m128 invsqh = _mm_set_ps1(invsqrt2h);
    __m128 my = _mm_load_ps(y);
    _mm_store_ps(y,_mm_mul_ps(my,invsqh));
    my = _mm_load_ps(y+4);
    _mm_store_ps(y+4,_mm_mul_ps(my,invsqh));
}
void M_MatrixRotateAxis44_SSE(M_Matrix44 *M, M_Real theta, M_Vector3 A)
{
    float s = sinf((float)theta);
    float c = cosf((float)theta);
    float t = 1.0f - c;
    M_Matrix44 R;
    __m128 a = A.m128, r1;
#ifdef HAVE_SSE3
    __m128 rC1 = _mm_set_ps(-c,   s*A.z,  0.0f,  +c);      /* 1,3 1,2 3 2 */
    __m128 rC2 = _mm_set_ps(0.0f, -s*A.y, s*A.y, -s*A.x);  /* 1,2 3 1 2,3 */
#endif

    /* m1: [t*AxAx + c, t*AxAy + sAz, t*AxAz - sAy, 0] */
    r1 = _mm_mul_ps(_mm_set1_ps(t), a);
    r1 = _mm_mul_ps(r1, _mm_shuffle_ps(a,a,_MM_SHUFFLE(0,0,0,0)));
#ifdef HAVE_SSE3
    R.m1 = _mm_addsub_ps(r1, _mm_shuffle_ps(rC1,rC2,_MM_SHUFFLE(3,1,2,3)));
#else
    R.m1 = _mm_add_ps(r1, _mm_set_ps(0.0f, -s*A.y, s*A.z, c));
#endif

    /* m2: [t*AxAy - sAz, t*AyAy + c, t*AyAz + sAx, 0] */
    r1 = _mm_mul_ps(_mm_set1_ps(t), _mm_shuffle_ps(a,a,_MM_SHUFFLE(3,1,1,0)));
    r1 = _mm_mul_ps(r1, _mm_shuffle_ps(a,a,_MM_SHUFFLE(3,2,1,1)));
#ifdef HAVE_SSE3
    R.m2 = _mm_addsub_ps(r1, _mm_shuffle_ps(rC1,rC2,_MM_SHUFFLE(3,0,0,2)));
#else
    R.m2 = _mm_add_ps(r1, _mm_set_ps(0.0f, +s*A.x, c, -s*A.z));
#endif

    /* m3: [t*AxAz + sAy, t*AyAz - sAx, t*AzAz + c, 0] */
    r1 = _mm_mul_ps(_mm_set1_ps(t), a);
    r1 = _mm_mul_ps(r1, _mm_shuffle_ps(a,a,_MM_SHUFFLE(0,2,2,2)));
#ifdef HAVE_SSE3
    R.m3 = _mm_addsub_ps(r1, _mm_shuffle_ps(rC2,rC1,_MM_SHUFFLE(1,3,0,2)));
#else
    R.m3 = _mm_add_ps(r1, _mm_set_ps(0.0f, c, -s*A.x, +s*A.y));
#endif

    /* m4: [0, 0, 0, 1] */
    R.m4 = _mm_set_ps(1.0f, 0.0f, 0.0f, 0.0f);

    M_MatrixMult44v_SSE(M, &R);
}
int sse3(){
    __ma128f v1;
    __ma128f v2;
    for (int i = 3; i >= 0; i--){  // fill all four float lanes
        v1.f32[i] = -i*5.1;
        v2.f32[i] = i*10.1;
    }

    __ma128f vo;
    vo.f = _mm_addsub_ps(v1.f, v2.f);

    // lane 3 is an odd lane, so addsub adds there: vo[3] == v1[3] + v2[3]
    if (fabsf(vo.f32[3] - (v1.f32[3]+v2.f32[3])) < FLT_EPSILON){
        return 0;
    }else{
        printf("Correct: %f result: %f\n", v1.f32[3]+v2.f32[3], vo.f32[3]);
        return -1;
    }
}
#include <stdio.h>
#include <pmmintrin.h>

int main(){
    __m128 A1, A2, A, B, C, B1, B2, D;
    float a[4] __attribute__((aligned(16))) = {1.0, 2.0, 3.0, 4.0};
    float b[4] __attribute__((aligned(16))) = {0.1, 0.2, 0.3, 0.4};

    A = _mm_load_ps(a);                                   // two complex numbers: 1+2i, 3+4i
    B = _mm_load_ps(b);                                   // two complex numbers: 0.1+0.2i, 0.3+0.4i
    A1 = _mm_moveldup_ps(A);                              // duplicate the real parts
    A2 = _mm_movehdup_ps(A);                              // duplicate the imaginary parts
    B1 = _mm_mul_ps(A1, B);
    B2 = _mm_mul_ps(A2, B);
    C = _mm_shuffle_ps(B2, B2, _MM_SHUFFLE(2, 3, 0, 1));  // swap re/im in each pair
    D = _mm_addsub_ps(B1, C);                             // packed complex products
    _mm_store_ps(a, D);
    printf("(%f, %f) (%f, %f)\n", a[0], a[1], a[2], a[3]);
    // prints (-0.300000, 0.400000) (-0.700000, 2.400000)
    return 0;
}
void parallel_vectorised_matmul(struct complex ** A, struct complex ** B, struct complex ** C,
                                int a_rows, int a_cols, int b_cols)
{
    // Assumes rows of B and C are 16-byte aligned (aligned loads/stores) and b_cols is even.
    #pragma omp parallel for
    for (int i = 0; i < a_rows; i++) {
        for (int k = 0; k < a_cols; k++) {
            struct complex r = A[i][k];
            __m128 a_real = _mm_set1_ps(r.real);
            __m128 a_imag = _mm_set1_ps(r.imag);
            for (int j = 0; j < b_cols; j += 2) {
                __m128 b_complex = _mm_load_ps((float*) &B[k][j]);
                __m128 real_times_b = _mm_mul_ps(a_real, b_complex);
                __m128 imag_times_b = _mm_mul_ps(a_imag, b_complex);
                imag_times_b = _mm_shuffle_ps(imag_times_b, imag_times_b, _MM_SHUFFLE(2, 3, 0, 1));
                __m128 addsub = _mm_addsub_ps(real_times_b, imag_times_b);
                __m128 current_c = _mm_load_ps((float*) &C[i][j]);
                _mm_store_ps((float*) &C[i][j], _mm_add_ps(current_c, addsub));
            }
        }
    }
}
inline __m128 foo4 (__m128 x, __m128 y) { return _mm_addsub_ps (x, y); }
__m128 test_mm_addsub_ps(__m128 A, __m128 B) {
  // CHECK-LABEL: test_mm_addsub_ps
  // CHECK: call <4 x float> @llvm.x86.sse3.addsub.ps(<4 x float> %{{.*}}, <4 x float> %{{.*}})
  return _mm_addsub_ps(A, B);
}
LXC_ERROR_CODE LXC_SSE3FreqSplit2Ch(uint Size, void *Z, void *X, void *Y)
{
    if(!Size || !Z || !X || !Y)
    {
        return LXC_ERR_INVALID_INPUT;
    }

    float *m_X = (float*)X;
    float *m_Y = (float*)Y;
    float *m_Z = (float*)Z;
    Size = Size*2;

#if defined(TARGET_WINDOWS)
    const __declspec(align(LXC_SSE3_ALIGN)) float scaleFactor = 0.5f;
#else
    const float scaleFactor = 0.5f;
#endif

    __m128 scale_05 = _mm_load1_ps(&scaleFactor);

    __m128 XY0 = _mm_setr_ps(m_Z[0], 0.0f, m_Z[1], 0.0f);
    // [0]=Z[0][0], [1]=0.0f, [2]=Z[0][1], [3]=0.0f
    __m128 _m128Z = _mm_load_ps(&m_Z[0]);
    __m128 _m128Z_Size = _mm_loadl_pi(_m128Z, (__m64*)&m_Z[Size-2]);
    // [0]=Z[Size-1][0], [1]=Z[Size-1][1], [2]=Z[1][0], [3]=Z[1][1]
    __m128 leftNumbers = _mm_shuffle_ps(_m128Z_Size, _m128Z_Size, LXC_MM_SHUFFLE(3,2,0,3));
    // [0]=Z[1][1], [1]=Z[1][0], [2]=Z[Size-1][0], [3]=Z[1][1]
    __m128 rightNumbers = _mm_shuffle_ps(_m128Z_Size, _m128Z_Size, LXC_MM_SHUFFLE(1,0,2,1));
    // [0]=Z[Size-1][1], [1]=Z[Size-1][0], [2]=Z[1][0], [3]=Z[Size-1][1]
    __m128 mulAddSubRes = _mm_mul_ps(_mm_addsub_ps(leftNumbers, rightNumbers), scale_05);
    // [0]=(Z[1][1] - Z[Size-1][1])*0.5f=X[1][1]
    // [1]=(Z[1][0] + Z[Size-1][0])*0.5f=X[1][0]
    // [2]=(Z[Size-1][0] - Z[1][0])*0.5f=Y[1][1]
    // [3]=(Z[1][1] + Z[Size-1][1])*0.5f=Y[1][0]
    _mm_store_ps(&m_X[0], _mm_shuffle_ps(XY0, mulAddSubRes, LXC_MM_SHUFFLE(0,1,1,0)));
    // [0]=X[0][0]=Z[0][0]
    // [1]=X[0][1]=0.0f
    // [2]=X[1][0]=(Z[1][0] + Z[Size-1][0])*0.5f
    // [3]=X[1][1]=(Z[1][1] - Z[Size-1][1])*0.5f
    _mm_store_ps(&m_Y[0], _mm_shuffle_ps(XY0, mulAddSubRes, LXC_MM_SHUFFLE(2,3,3,2)));
    // [0]=Y[0][0]=Z[0][1]
    // [1]=Y[0][1]=0.0f
    // [2]=Y[1][0]=(Z[1][1] + Z[Size-1][1])*0.5f
    // [3]=Y[1][1]=(Z[Size-1][0] - Z[1][0])*0.5f

    for(uint kk = 4; kk < Size; kk+=4)
    {
        //__m128 _Z = {0.0f,1.0f,2.0f,3.0f};
        //__m128 L = {4.0f,5.0f,6.0f,7.0f};
        __m128 _Z = _mm_load_ps(&m_Z[kk]);
        // [0]=Z[kk][0], [1]=Z[kk][1], [2]=Z[kk+1][0], [3]=Z[kk+1][1]
        __m128 _ZShuffle = _mm_shuffle_ps(_Z, _Z, LXC_MM_SHUFFLE(1,0,3,2));
        // [0]=Z[kk][1], [1]=Z[kk][0], [2]=Z[kk+1][1], [3]=Z[kk+1][0]
        __m128 _ZSize = _mm_loadu_ps(&(m_Z[Size - kk - 2]));
        // [0]=Z[Size-kk-1][0], [1]=Z[Size-kk-1][1], [2]=Z[Size-kk][0], [3]=Z[Size-kk][1]

        // calculate X signal
        __m128 _ZSizeShuffle = _mm_shuffle_ps(_ZSize, _ZSize, LXC_MM_SHUFFLE(3,2,1,0));
        // [0]=Z[Size-kk][1], [1]=Z[Size-kk][0], [2]=Z[Size-kk-1][1], [3]=Z[Size-kk-1][0]
        __m128 result = _mm_mul_ps(_mm_addsub_ps(_ZShuffle, _ZSizeShuffle), scale_05);
        // [0]=(Z[kk][1]   - Z[Size-kk][1])*0.5f  =X[kk][1]
        // [1]=(Z[kk][0]   + Z[Size-kk][0])*0.5f  =X[kk][0]
        // [2]=(Z[kk+1][1] - Z[Size-kk-1][1])*0.5f=X[kk+1][1]
        // [3]=(Z[kk+1][0] + Z[Size-kk-1][0])*0.5f=X[kk+1][0]
        _mm_store_ps(&m_X[kk], _mm_shuffle_ps(result, result, LXC_MM_SHUFFLE(1,0,3,2)));
        // [0]=X[kk][0]  =(Z[kk][0]   + Z[Size-kk][0])*0.5f
        // [1]=X[kk][1]  =(Z[kk][1]   - Z[Size-kk][1])*0.5f
        // [2]=X[kk+1][0]=(Z[kk+1][0] + Z[Size-kk-1][0])*0.5f
        // [3]=X[kk+1][1]=(Z[kk+1][1] - Z[Size-kk-1][1])*0.5f

        // calculate Y signal
        __m128 left = _mm_shuffle_ps(_Z, _ZSize, LXC_MM_SHUFFLE(1,3,2,0));
        // [0]=Z[kk][1], [1]=Z[kk+1][1], [2]=Z[Size-kk][0], [3]=Z[Size-kk-1][0]
        left = _mm_shuffle_ps(left, left, LXC_MM_SHUFFLE(2,0,3,1));
        // [0]=Z[Size-kk][0], [1]=Z[kk][1], [2]=Z[Size-kk-1][0], [3]=Z[kk+1][1]
        __m128 right = _mm_shuffle_ps(_Z, _ZSize, LXC_MM_SHUFFLE(0,2,3,1));
        // [0]=Z[kk][0], [1]=Z[kk+1][0], [2]=Z[Size-kk][1], [3]=Z[Size-kk-1][1]
        right = _mm_shuffle_ps(right, right, LXC_MM_SHUFFLE(0,2,1,3));
        // [0]=Z[kk][0], [1]=Z[Size-kk][1], [2]=Z[kk+1][0], [3]=Z[Size-kk-1][1]
        result = _mm_mul_ps(_mm_addsub_ps(left, right), scale_05);
        // [0]=Y[kk][1]  = 0.5f*(m_Z[Size-kk][0]   - m_Z[kk][0]);
        // [1]=Y[kk][0]  = 0.5f*(m_Z[kk][1]        + m_Z[Size-kk][1]);
        // [2]=Y[kk+1][1]= 0.5f*(m_Z[Size-kk-1][0] - m_Z[kk+1][0]);
        // [3]=Y[kk+1][0]= 0.5f*(m_Z[kk+1][1]      + m_Z[Size-kk-1][1]);
        _mm_store_ps(&m_Y[kk], _mm_shuffle_ps(result, result, LXC_MM_SHUFFLE(1,0,3,2)));
        // [0]=Y[kk][0]  = 0.5f*(m_Z[kk][1]        + m_Z[Size-kk][1]);
        // [1]=Y[kk][1]  = 0.5f*(m_Z[Size-kk][0]   - m_Z[kk][0]);
        // [2]=Y[kk+1][0]= 0.5f*(m_Z[kk+1][1]      + m_Z[Size-kk-1][1]);
        // [3]=Y[kk+1][1]= 0.5f*(m_Z[Size-kk-1][0] - m_Z[kk+1][0]);
    }

    return LXC_NO_ERR;
}
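// Added for cross-checking (not part of the original library): a scalar reference for
// the split above, in the same interleaved float layout; each bin pairs with its mirror
// bin, as in the lane annotations. The uint / LXC_* symbols are assumed from above.
LXC_ERROR_CODE LXC_ScalarFreqSplit2Ch_Ref(uint Size, void *Z, void *X, void *Y)
{
    if(!Size || !Z || !X || !Y)
    {
        return LXC_ERR_INVALID_INPUT;
    }

    float *m_X = (float*)X;
    float *m_Y = (float*)Y;
    float *m_Z = (float*)Z;
    Size = Size*2;

    m_X[0] = m_Z[0];  // DC bins carry the two real spectra
    m_X[1] = 0.0f;
    m_Y[0] = m_Z[1];
    m_Y[1] = 0.0f;

    for(uint kk = 2; kk < Size; kk += 2)
    {
        uint mk = Size - kk;  // mirrored bin
        m_X[kk]   = 0.5f*(m_Z[kk]   + m_Z[mk]);
        m_X[kk+1] = 0.5f*(m_Z[kk+1] - m_Z[mk+1]);
        m_Y[kk]   = 0.5f*(m_Z[kk+1] + m_Z[mk+1]);
        m_Y[kk+1] = 0.5f*(m_Z[mk]   - m_Z[kk]);
    }

    return LXC_NO_ERR;
}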
//-----------------------------------------------------------------------------------------
// SSE3 complex multiplication with different kernel sizes
//-----------------------------------------------------------------------------------------
// SSE3 2x complex multiplication (for details see example 6-9 in the Intel 64 and IA-32
// Architectures Optimization Reference Manual)
// complex multiplication is defined as: (a+jb)*(c+jd) = a*c - b*d + j(a*d + b*c)
// z1 = a1*c1 - b1*d1 + j(a1*d1 + b1*c1)
// z2 = a2*c2 - b2*d2 + j(a2*d2 + b2*c2)
// A = { a1, jb1, a2, jb2 }
// B = { c1, jd1, c2, jd2 }
// C = { Re{z1}, Im{z1}, Re{z2}, Im{z2} } = { a1*c1 - b1*d1, j(a1*d1 + b1*c1), a2*c2 - b2*d2, j(a2*d2 + b2*c2) }
LXC_ERROR_CODE LXC_SSE3CpxMul_K2(uint Size, void *X, void *H, void *Z)
{
    if(!X || !H || !Z)
    {
        return LXC_ERR_INVALID_INPUT;
    }

    Size = Size*2;
    float *m_X = (float*)X;
    float *m_H = (float*)H;
    float *m_Z = (float*)Z;
    for(uint ii=0; ii < Size; ii+=4)
    {
        // local variables
        __m128 val1;
        __m128 val2;
        //__m128 val3;
        //__m128 val4;

        // load values into __m128
        val1 = _mm_load_ps(&m_X[ii]);  // _mm_load_ps: src{ a1, b1, a2, b2 } --> val1 { a1, b1, a2, b2 }
        val2 = _mm_load_ps(&m_H[ii]);  // _mm_load_ps: src{ c1, d1, c2, d2 } --> val2 { c1, d1, c2, d2 }

        // add/subtract, scale and store operations
        // duplicate values
        //   _A1 = _mm_moveldup_ps: src{ a1, b1, a2, b2 } --> { a1, a1, a2, a2 }
        //   _A2 = _mm_movehdup_ps: src{ a1, b1, a2, b2 } --> { b1, b1, b2, b2 }
        // a = calc { a1*c1, a1*d1, a2*c2, a2*d2 } --> sse3 multiply
        // b = reorder im and re numbers { c1, d1, c2, d2 } --> { d1, c1, d2, c2 } and multiply { b1*d1, b1*c1, b2*d2, b2*c2 }
        // A = _mm_addsub_ps: ret { a1*c1 - b1*d1, j(a1*d1 + b1*c1), a2*c2 - b2*d2, j(a2*d2 + b2*c2) }
        // _mm_store_ps: C[0] = result0, C[1] = result1, C[2] = result2, C[3] = result3
        _mm_store_ps(&m_Z[ii],
                     _mm_addsub_ps(_mm_mul_ps(_mm_moveldup_ps(val1), val2),
                                   _mm_mul_ps(_mm_movehdup_ps(val1),
                                              _mm_shuffle_ps(val2, val2, _MM_SHUFFLE(2,3,0,1)))));

        // old loop
        //// local variables
        //__m128 val1;
        //__m128 val2;
        //__m128 val3;
        //__m128 val4;

        //// load values into __m128
        //val1 = _mm_load_ps(&m_X[ii]); // _mm_load_ps: src{ a1, b1, a2, b2 } --> val1 { a1, b1, a2, b2 }
        //val2 = _mm_load_ps(&m_H[ii]); // _mm_load_ps: src{ c1, d1, c2, d2 } --> val2 { c1, d1, c2, d2 }

        //// duplicate values
        //val3 = _mm_moveldup_ps(val1); // _mm_moveldup_ps: src{ a1, b1, a2, b2 } --> val3 { a1, a1, a2, a2 }
        //val4 = _mm_movehdup_ps(val1); // _mm_movehdup_ps: src{ a1, b1, a2, b2 } --> val4 { b1, b1, b2, b2 }

        //// sse3 multiply
        //val1 = _mm_mul_ps(val3, val2); // calc { a1*c1, a1*d1, a2*c2, a2*d2 }

        //// reorder im and re numbers { c1, d1, c2, d2 } --> { d1, c1, d2, c2 } and multiply { b1*d1, b1*c1, b2*d2, b2*c2 }
        //val3 = _mm_mul_ps(val4, _mm_shuffle_ps(val2, val2, _MM_SHUFFLE(2,3,0,1)));

        //// add/subtract, scale and store operations
        //val3 = _mm_addsub_ps(val1, val3); // _mm_addsub_ps: ret { a1*c1 - b1*d1, j(a1*d1 + b1*c1), a2*c2 - b2*d2, j(a2*d2 + b2*c2) }
        //_mm_store_ps(&m_Z[ii], val3); // _mm_store_ps: C[0] = result0, C[1] = result1, C[2] = result2, C[3] = result3
    }

    return LXC_NO_ERR;
}
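// Added for cross-checking (not part of the original library): a scalar reference with
// the same signature and interleaved re/im layout as LXC_SSE3CpxMul_K2() above,
// computing (a+jb)*(c+jd) = a*c - b*d + j(a*d + b*c) per element.
LXC_ERROR_CODE LXC_ScalarCpxMul_Ref(uint Size, void *X, void *H, void *Z)
{
    if(!X || !H || !Z)
    {
        return LXC_ERR_INVALID_INPUT;
    }

    float *m_X = (float*)X;
    float *m_H = (float*)H;
    float *m_Z = (float*)Z;

    for(uint ii = 0; ii < 2*Size; ii += 2)
    {
        float a = m_X[ii], b = m_X[ii+1];  // a + jb
        float c = m_H[ii], d = m_H[ii+1];  // c + jd
        m_Z[ii]   = a*c - b*d;             // real part
        m_Z[ii+1] = a*d + b*c;             // imaginary part
    }

    return LXC_NO_ERR;
}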
void gemm(complex float* A, complex float* B, complex float* C,
          int m, int n, int k, complex float alpha, complex float beta)
{
    __m128 c_reg, a_reg, b_reg, alpha_reg, beta_reg, t, t2, t3;

    complex float *beta_reg_value = malloc(sizeof(complex float)*2);
    beta_reg_value[0] = beta;
    beta_reg_value[1] = beta;
    beta_reg = _mm_loadu_ps((float*)beta_reg_value);

    complex float *alpha_reg_value = malloc(sizeof(complex float)*2);
    alpha_reg_value[0] = alpha;
    alpha_reg_value[1] = alpha;
    alpha_reg = _mm_loadu_ps((float*)alpha_reg_value);

    complex float *a_value = malloc(sizeof(complex float)*2);

    for(int x = 0; x < n; x += 2){
        for(int y = 0; y < m; y++){
            t3 = _mm_setzero_ps();
            for(int z = 0; z < k; z++){
                // t3 += A[y*k+z]*B[z*n + x] for two columns at once
                a_value[0] = A[y*k + z];
                a_value[1] = *a_value;
                a_reg = _mm_loadu_ps((float*)a_value);
                b_reg = _mm_loadu_ps((float*)&B[z*n + x]);
                t = _mm_moveldup_ps(a_reg);
                t2 = t * b_reg;
                b_reg = _mm_shuffle_ps(b_reg, b_reg, 0xb1);
                t = _mm_movehdup_ps(a_reg);
                t = t * b_reg;
                a_reg = _mm_addsub_ps(t2, t);
                t3 = t3 + a_reg;
            }

            // c_reg = beta * C[y][x..x+1]
            c_reg = _mm_loadu_ps((float*)&C[y*n + x]);
            t = _mm_moveldup_ps(c_reg);
            t2 = t * beta_reg;
            beta_reg = _mm_shuffle_ps(beta_reg, beta_reg, 0xb1);
            t = _mm_movehdup_ps(c_reg);
            t = t * beta_reg;
            c_reg = _mm_addsub_ps(t2, t);
            beta_reg = _mm_shuffle_ps(beta_reg, beta_reg, 0xb1);

            // b_reg = alpha * t3
            t = _mm_moveldup_ps(t3);
            t2 = t * alpha_reg;
            alpha_reg = _mm_shuffle_ps(alpha_reg, alpha_reg, 0xb1);
            t = _mm_movehdup_ps(t3);
            t = t * alpha_reg;
            b_reg = _mm_addsub_ps(t2, t);
            alpha_reg = _mm_shuffle_ps(alpha_reg, alpha_reg, 0xb1);

            c_reg = b_reg + c_reg;
            _mm_storeu_ps((float*)&C[y*n + x], c_reg);
        }
    }

    free(beta_reg_value);
    free(alpha_reg_value);
    free(a_value);
}
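// Added usage sketch (not from the original source): calls gemm() above on 2x2
// row-major complex matrices, computing C = alpha*A*B + beta*C as the kernel is
// structured; n must be even because the column loop consumes two columns at a time.
#include <stdio.h>
#include <complex.h>

int main(void)
{
    complex float A[4] = {1.0f + 1.0f*I, 2.0f + 0.0f*I,
                          0.0f + 1.0f*I, 1.0f - 1.0f*I};
    complex float B[4] = {1.0f + 0.0f*I, 0.0f + 1.0f*I,
                          1.0f + 1.0f*I, 2.0f + 0.0f*I};
    complex float C[4] = {0};  // beta = 0, so C only receives alpha*A*B

    gemm(A, B, C, 2, 2, 2, 1.0f + 0.0f*I, 0.0f + 0.0f*I);

    for (int i = 0; i < 4; i++)
        printf("C[%d] = %.1f%+.1fi\n", i, crealf(C[i]), cimagf(C[i]));
    return 0;
}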
__m128 test_mm_addsub_ps(__m128 A, __m128 B) {
  // CHECK-LABEL: test_mm_addsub_ps
  // CHECK: call <4 x float> @llvm.x86.sse3.addsub.ps
  // CHECK-ASM: addsubps %xmm{{.*}}, %xmm{{.*}}
  return _mm_addsub_ps(A, B);
}
// use MMX/SSE extensions
//
// (a + jb)(c + jd) = (ac - bd) + j(ad + bc)
//
// mm_x  = { x[0].real, x[0].imag, x[1].real, x[1].imag }
// mm_hi = { h[0].real, h[0].real, h[1].real, h[1].real }
// mm_hq = { h[0].imag, h[0].imag, h[1].imag, h[1].imag }
//
// mm_y0 = mm_x * mm_hi
//       = { x[0].real * h[0].real,
//           x[0].imag * h[0].real,
//           x[1].real * h[1].real,
//           x[1].imag * h[1].real };
//
// mm_y1 = mm_x * mm_hq
//       = { x[0].real * h[0].imag,
//           x[0].imag * h[0].imag,
//           x[1].real * h[1].imag,
//           x[1].imag * h[1].imag };
//
void dotprod_cccf_execute_mmx(dotprod_cccf _q, float complex * _x, float complex * _y)
{
    // type cast input as floating point array
    float * x = (float*) _x;

    // double effective length
    unsigned int n = 2*_q->n;

    // temporary buffers
    __m128 v;   // input vector
    __m128 hi;  // coefficients vector (real)
    __m128 hq;  // coefficients vector (imag)
    __m128 ci;  // output multiplication (v * hi)
    __m128 cq;  // output multiplication (v * hq)

    // aligned output array
    float w[4] __attribute__((aligned(16))) = {0,0,0,0};

#if HAVE_PMMINTRIN_H
    // SSE3
    __m128 s;                       // dot product
    __m128 sum = _mm_setzero_ps();  // load zeros into sum register
#else
    // no SSE3
    float wi[4] __attribute__((aligned(16)));
    float wq[4] __attribute__((aligned(16)));
#endif

    // t = 4*(floor(n/4))
    unsigned int t = (n >> 2) << 2;

    unsigned int i;
    for (i=0; i<t; i+=4) {
        // load inputs into register (unaligned)
        // {x[0].real, x[0].imag, x[1].real, x[1].imag}
        v = _mm_loadu_ps(&x[i]);

        // load coefficients into register (aligned)
        hi = _mm_load_ps(&_q->hi[i]);
        hq = _mm_load_ps(&_q->hq[i]);

        // compute parallel multiplications
        ci = _mm_mul_ps(v, hi);
        cq = _mm_mul_ps(v, hq);

        // shuffle values
        cq = _mm_shuffle_ps( cq, cq, _MM_SHUFFLE(2,3,0,1) );

#if HAVE_PMMINTRIN_H
        // SSE3: combine using addsub_ps()
        s = _mm_addsub_ps( ci, cq );

        // accumulate
        sum = _mm_add_ps(sum, s);
#else
        // no SSE3: combine using slow method
        // FIXME: implement slow method

        // unload values
        _mm_store_ps(wi, ci);
        _mm_store_ps(wq, cq);

        // accumulate
        w[0] += wi[0] - wq[0];
        w[1] += wi[1] + wq[1];
        w[2] += wi[2] - wq[2];
        w[3] += wi[3] + wq[3];
#endif
    }

#if HAVE_PMMINTRIN_H
    // unload packed array
    _mm_store_ps(w, sum);
#endif

    // add in-phase and quadrature components
    w[0] += w[2];  // I
    w[1] += w[3];  // Q

    //float complex total = *((float complex*)w);
    float complex total = w[0] + w[1] * _Complex_I;

    // cleanup
    for (i=t/2; i<_q->n; i++)
        total += _x[i] * ( _q->hi[2*i] + _q->hq[2*i]*_Complex_I );

    // set return value
    *_y = total;
}
// =============================================================================
//
// sse3_vChirpData
// version by: Alex Kan
//   http://tbp.berkeley.edu/~alexkan/seti/
//
int sse3_ChirpData_ak(
    sah_complex * cx_DataArray,
    sah_complex * cx_ChirpDataArray,
    int chirp_rate_ind,
    double chirp_rate,
    int ul_NumDataPoints,
    double sample_rate
) {
    int i;

    if (chirp_rate_ind == 0) {
        memcpy(cx_ChirpDataArray, cx_DataArray, (int)ul_NumDataPoints * sizeof(sah_complex));
        return 0;
    }

    int vEnd;
    double srate = chirp_rate * 0.5 / (sample_rate * sample_rate);
    __m128d rate = _mm_set1_pd(chirp_rate * 0.5 / (sample_rate * sample_rate));
    __m128d roundVal = _mm_set1_pd(srate >= 0.0 ? TWO_TO_52 : -TWO_TO_52);

    // main vectorised loop
    vEnd = ul_NumDataPoints - (ul_NumDataPoints & 3);
    for (i = 0; i < vEnd; i += 4) {
        const float *data = (const float *) (cx_DataArray + i);
        float *chirped = (float *) (cx_ChirpDataArray + i);
        __m128d di = _mm_set1_pd(i);
        __m128d a1 = _mm_add_pd(_mm_set_pd(1.0, 0.0), di);
        __m128d a2 = _mm_add_pd(_mm_set_pd(3.0, 2.0), di);
        __m128 d1, d2;
        __m128 cd1, cd2;
        __m128 td1, td2;
        __m128 x;
        __m128 y;
        __m128 s;
        __m128 c;
        __m128 m;

        // load the signal to be chirped
        prefetchnta((const void *)( data+32 ));
        d1 = _mm_load_ps(data);
        d2 = _mm_load_ps(data+4);

        // calculate the input angle
        a1 = _mm_mul_pd(_mm_mul_pd(a1, a1), rate);
        a2 = _mm_mul_pd(_mm_mul_pd(a2, a2), rate);

        // reduce the angle to the range (-0.5, 0.5)
        a1 = _mm_sub_pd(a1, _mm_sub_pd(_mm_add_pd(a1, roundVal), roundVal));
        a2 = _mm_sub_pd(a2, _mm_sub_pd(_mm_add_pd(a2, roundVal), roundVal));

        // convert pair of packed double into packed single
        x = _mm_movelh_ps(_mm_cvtpd_ps(a1), _mm_cvtpd_ps(a2));

        // square to the range [0, 0.25)
        y = _mm_mul_ps(x, x);

        // perform the initial polynomial approximations
        s = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(y, SS4),
                                                                              SS3),
                                                                   y),
                                                        SS2),
                                             y),
                                  SS1),
                       x);
        c = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(y, CC3),
                                                                   CC2),
                                                        y),
                                             CC1),
                                  y),
                       ONE);

        // perform first angle doubling
        x = _mm_sub_ps(_mm_mul_ps(c, c), _mm_mul_ps(s, s));
        y = _mm_mul_ps(_mm_mul_ps(s, c), TWO);

        // calculate scaling factor to correct the magnitude
        //  m1 = vec_nmsub(y1, y1, vec_nmsub(x1, x1, TWO));
        //  m2 = vec_nmsub(y2, y2, vec_nmsub(x2, x2, TWO));
        m = vec_recip3(_mm_add_ps(_mm_mul_ps(x, x), _mm_mul_ps(y, y)));

        // perform second angle doubling
        c = _mm_sub_ps(_mm_mul_ps(x, x), _mm_mul_ps(y, y));
        s = _mm_mul_ps(_mm_mul_ps(y, x), TWO);

        // correct the magnitude (final sine / cosine approximations)
        s = _mm_mul_ps(s, m);
        c = _mm_mul_ps(c, m);

        // chirp the data
        cd1 = _mm_shuffle_ps(c, c, 0x50);
        cd2 = _mm_shuffle_ps(c, c, 0xfa);
        cd1 = _mm_mul_ps(cd1, d1);
        cd2 = _mm_mul_ps(cd2, d2);
        d1 = _mm_shuffle_ps(d1, d1, 0xb1);
        d2 = _mm_shuffle_ps(d2, d2, 0xb1);
        td1 = _mm_shuffle_ps(s, s, 0x50);
        td2 = _mm_shuffle_ps(s, s, 0xfa);
        td1 = _mm_mul_ps(td1, d1);
        td2 = _mm_mul_ps(td2, d2);
        cd1 = _mm_addsub_ps(cd1, td1);
        cd2 = _mm_addsub_ps(cd2, td2);

        // store chirped values
        _mm_stream_ps(chirped, cd1);
        _mm_stream_ps(chirped+4, cd2);
    }
    _mm_sfence();

    // handle tail elements with scalar code
    for ( ; i < ul_NumDataPoints; ++i) {
        double angle = srate * i * i * 0.5;
        double s = sin(angle);
        double c = cos(angle);
        float re = cx_DataArray[i][0];
        float im = cx_DataArray[i][1];

        cx_ChirpDataArray[i][0] = re * c - im * s;
        cx_ChirpDataArray[i][1] = re * s + im * c;
    }

    analysis_state.FLOP_counter += 12.0 * ul_NumDataPoints;

    return 0;
}
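// Added demonstration (not from the original source): the roundVal add/subtract in the
// loop above is the classic 2^52 trick; adding and then subtracting 2^52 forces a
// double to the nearest integer under the default round-to-nearest mode, so the
// remainder is the angle reduced to (-0.5, 0.5).
#include <stdio.h>

int main(void)
{
    const double TWO_TO_52 = 4503599627370496.0;   // 2^52, as in the code above
    double a = 3.7;
    double nearest = (a + TWO_TO_52) - TWO_TO_52;  // 4.0
    double reduced = a - nearest;                  // -0.3, inside (-0.5, 0.5)
    printf("%f -> nearest %f, reduced %f\n", a, nearest, reduced);
    return 0;
}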
int sse3_ChirpData_ak8(
    sah_complex * cx_DataArray,
    sah_complex * cx_ChirpDataArray,
    int chirp_rate_ind,
    double chirp_rate,
    int ul_NumDataPoints,
    double sample_rate
) {
#ifdef USE_MANUAL_CALLSTACK
    call_stack.enter("sse3_ChirpData_ak8()");
#endif
    int i;

    if (chirp_rate_ind == 0) {
        memcpy(cx_ChirpDataArray, cx_DataArray, (int)ul_NumDataPoints * sizeof(sah_complex));
#ifdef USE_MANUAL_CALLSTACK
        call_stack.exit();
#endif
        return 0;
    }

    int vEnd;
    double srate = chirp_rate * 0.5 / (sample_rate * sample_rate);
    __m128d rate = _mm_set1_pd(chirp_rate * 0.5 / (sample_rate * sample_rate));
    __m128d roundVal = _mm_set1_pd(srate >= 0.0 ? TWO_TO_52 : -TWO_TO_52);
    __m128d DFOUR = _mm_set_pd(4.0, 4.0);

    // main vectorised loop
    vEnd = ul_NumDataPoints - (ul_NumDataPoints & 3);
    __m128d di1 = _mm_set_pd(2.0, 0.0);  // set time patterns for eventual moveldup/movehdup
    __m128d di2 = _mm_set_pd(3.0, 1.0);

    for (i = 0; i < vEnd; i += 4) {
        const float *d = (const float *) (cx_DataArray + i);
        float *cd = (float *) (cx_ChirpDataArray + i);
        __m128d a1, a2;
        __m128 d1, d2;
        __m128 cd1, cd2;
        __m128 td1, td2;
        __m128 x;
        __m128 y;
        __m128 z;
        __m128 s;
        __m128 c;
        __m128 m;

        // load the signal to be chirped
        d1 = _mm_load_ps(d);
        d2 = _mm_load_ps(d+4);

        // calculate the input angle
        a1 = _mm_mul_pd(_mm_mul_pd(di1, di1), rate);
        a2 = _mm_mul_pd(_mm_mul_pd(di2, di2), rate);

        // update times for next
        di1 = _mm_add_pd(di1, DFOUR);
        di2 = _mm_add_pd(di2, DFOUR);

        // reduce the angle to the range (-0.5, 0.5)
        a1 = _mm_sub_pd(a1, _mm_sub_pd(_mm_add_pd(a1, roundVal), roundVal));
        a2 = _mm_sub_pd(a2, _mm_sub_pd(_mm_add_pd(a2, roundVal), roundVal));

        // convert pair of packed double into packed single
        x = _mm_movelh_ps(_mm_cvtpd_ps(a1), _mm_cvtpd_ps(a2));  // 3 1 2 0

        // square to the range [0, 0.25)
        y = _mm_mul_ps(x, x);

        // perform the initial polynomial approximations, Estrin's method
        z = _mm_mul_ps(y, y);
        s = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(y, SS4F),
                                                        SS3F),
                                             z),
                                  _mm_add_ps(_mm_mul_ps(y, SS2F),
                                             SS1F)),
                       x);
        c = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(y, CC3F),
                                             CC2F),
                                  z),
                       _mm_add_ps(_mm_mul_ps(y, CC1F),
                                  ONE));

        // perform first angle doubling
        x = _mm_sub_ps(_mm_mul_ps(c, c), _mm_mul_ps(s, s));
        y = _mm_mul_ps(_mm_mul_ps(s, c), TWO);

        // calculate scaling factor to correct the magnitude
        m = _mm_sub_ps(_mm_sub_ps(TWO, _mm_mul_ps(x, x)), _mm_mul_ps(y, y));

        // perform second angle doubling
        c = _mm_sub_ps(_mm_mul_ps(x, x), _mm_mul_ps(y, y));
        s = _mm_mul_ps(_mm_mul_ps(y, x), TWO);

        // correct the magnitude (final sine / cosine approximations)
        c = _mm_mul_ps(c, m);  // c3 c1 c2 c0
        s = _mm_mul_ps(s, m);

        // chirp the data
        cd1 = _mm_moveldup_ps(c);   // c1 c1 c0 c0
        cd2 = _mm_movehdup_ps(c);   // c3 c3 c2 c2
        cd1 = _mm_mul_ps(cd1, d1);  // c1.i1 c1.r1 c0.i0 c0.r0
        cd2 = _mm_mul_ps(cd2, d2);  // c3.i3 c3.r3 c2.i2 c2.r2
        d1 = _mm_shuffle_ps(d1, d1, 0xb1);
        d2 = _mm_shuffle_ps(d2, d2, 0xb1);
        td1 = _mm_moveldup_ps(s);
        td2 = _mm_movehdup_ps(s);
        td1 = _mm_mul_ps(td1, d1);
        td2 = _mm_mul_ps(td2, d2);
        cd1 = _mm_addsub_ps(cd1, td1);
        cd2 = _mm_addsub_ps(cd2, td2);

        // store chirped values
        _mm_stream_ps(cd, cd1);
        _mm_stream_ps(cd+4, cd2);
    }

    // handle tail elements with scalar code
    for (; i < ul_NumDataPoints; ++i) {
        double angle = srate * i * i * 0.5;
        double s = sin(angle);
        double c = cos(angle);
        float re = cx_DataArray[i][0];
        float im = cx_DataArray[i][1];

        cx_ChirpDataArray[i][0] = re * c - im * s;
        cx_ChirpDataArray[i][1] = re * s + im * c;
    }
    analysis_state.FLOP_counter += 12.0 * ul_NumDataPoints;
#ifdef USE_MANUAL_CALLSTACK
    call_stack.exit();
#endif
    return 0;
}