Example #1
File: fast.c Project: sondrele/NTNU
void simd_complex_mult(complex float *a, complex float *b, complex float *c, complex float *r) {
    __m128 a_reg,
           b_reg,
           c_reg,
           t_reg1,
           t_reg2,
           r_reg;

    /* Each __m128 holds two packed complex floats: { re0, im0, re1, im1 }. */
    a_reg = _mm_loadu_ps((float *) a);
    b_reg = _mm_loadu_ps((float *) b);
    c_reg = _mm_loadu_ps((float *) c);

    /* t = a * b via the SSE3 moveldup/movehdup/addsub idiom: addsub
     * subtracts in the even (real) lanes and adds in the odd (imaginary)
     * lanes, yielding { re*re - im*im, re*im + im*re, ... }. */
    t_reg1 = _mm_moveldup_ps(a_reg);            /* { a0.re, a0.re, a1.re, a1.re } */
    t_reg2 = t_reg1 * b_reg;

    b_reg = _mm_shuffle_ps(b_reg, b_reg, 0xb1); /* swap re/im within each pair */
    t_reg1 = _mm_movehdup_ps(a_reg);            /* { a0.im, a0.im, a1.im, a1.im } */
    t_reg1 = t_reg1 * b_reg;

    r_reg = _mm_addsub_ps(t_reg2, t_reg1);      /* r = a * b */

    /* r = (a * b) * c, same idiom again. */
    t_reg1 = _mm_moveldup_ps(r_reg);
    t_reg2 = t_reg1 * c_reg;

    c_reg = _mm_shuffle_ps(c_reg, c_reg, 0xb1);
    t_reg1 = _mm_movehdup_ps(r_reg);
    t_reg1 = t_reg1 * c_reg;

    r_reg = _mm_addsub_ps(t_reg2, t_reg1);
    _mm_storeu_ps((float *) r, r_reg);
}
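A minimal driver (not from the project) that exercises simd_complex_mult and checks it against scalar complex arithmetic; it assumes the function above is in the same translation unit and the file is compiled with SSE3 enabled (e.g. -msse3):

#include <complex.h>
#include <pmmintrin.h>
#include <stdio.h>

int main(void) {
    /* two complex values per operand, since the routine processes a pair */
    complex float a[2] = { 1.0f + 2.0f*I, 3.0f - 1.0f*I };
    complex float b[2] = { 0.5f - 0.5f*I, 2.0f + 0.0f*I };
    complex float c[2] = { 1.0f + 1.0f*I, -1.0f + 2.0f*I };
    complex float r[2];

    simd_complex_mult(a, b, c, r);

    for (int i = 0; i < 2; i++) {
        complex float want = a[i] * b[i] * c[i];  /* scalar reference */
        printf("r[%d] = %.3f%+.3fi (expect %.3f%+.3fi)\n", i,
               crealf(r[i]), cimagf(r[i]), crealf(want), cimagf(want));
    }
    return 0;
}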
Example #2
LXC_ERROR_CODE LXC_SSE3FreqCombine2Ch(uint Size, void *X, void *Y, void *Z)
{
    if(!Size || !X || !Y || !Z)
    {
        return LXC_ERR_INVALID_INPUT;
    }

    float *m_X = (float*)X;
    float *m_Y = (float*)Y;
    float *m_Z = (float*)Z;

#if defined(TARGET_WINDOWS)
    const __declspec(align(LXC_SSE3_ALIGN)) float  scaleFactor = 1.0f / ((float)Size);
#else
    const float  scaleFactor = 1.0f / ((float)Size);
#endif
    Size = Size*2;
    __m128 _scale = _mm_load1_ps(&scaleFactor);

    for(uint ii = 0; ii < Size; ii+=4)
    {
        //m_Z[ii][0]   = (m_X[ii][0]   - m_Y[ii][1])*scaleFactor;
        //m_Z[ii][1]   = (m_X[ii][1]   + m_Y[ii][0])*scaleFactor;
        //m_Z[ii+1][0] = (m_X[ii+1][0] - m_Y[ii+1][1])*scaleFactor;
        //m_Z[ii+1][1] = (m_X[ii+1][1] + m_Y[ii+1][0])*scaleFactor;
        __m128 B = _mm_load_ps(&m_Y[ii]);
        B = _mm_shuffle_ps(B, B, LXC_MM_SHUFFLE(1,0,3,2));
        __m128 addRes = _mm_addsub_ps (_mm_load_ps(&m_X[ii]), B);
        _mm_store_ps(&m_Z[ii], _mm_mul_ps(addRes, _scale));
    }


    return LXC_NO_ERR;
}
Example #3
static __inline __m128 ZMUL2(__m128 a, __m128 b, __m128 sign)
{
#ifdef SSE3_
    // a = a1.r  a1.i  a2.r  a2.i
    // b = b1.r  b1.i  b2.r  b2.i
    __m128 ar;

    ar = _mm_moveldup_ps(a);        // ar = a1.r  a1.r  a2.r  a2.r
    a = _mm_movehdup_ps(a);         // a  = a1.i  a1.i  a2.i  a2.i
    ar = _mm_mul_ps(ar, b);         // ar = a1.r*b1.r  a1.r*b1.i  a2.r*b2.r  a2.r*b2.i
    
    b  = _mm_shuffle_ps(b, b, _MM_SHUFFLE(2, 3, 0, 1)); // b  = b1.i  b1.r  b2.i  b2.r
    a = _mm_mul_ps(a, b);           // ai = a1.i*b1.i  a1.i*b1.r  a2.i*b2.i  a2.i*b2.r

    return _mm_addsub_ps(ar, a);    // a1.r*b1.r-a1.i*b1.i  a1.r*b1.i+a1.i*b1.r  a2.r*b2.r-a2.i*b2.i  a2.r*b2.i+a2.i*b2.r
#else
    // a = a1.r  a1.i  a2.r  a2.i
    // b = b1.r  b1.i  b2.r  b2.i
    __m128 ar;

    ar = _mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 2, 0, 0));     // ar = a1.r  a1.r  a2.r  a2.r
    a  = _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 3, 1, 1));     // ai = a1.i  a1.i  a2.i  a2.i
    ar = _mm_mul_ps(ar, b);                                 // ar = +a1.r*b1.r  +a1.r*b1.i  +a2.r*b2.r  +a2.r*b2.i
    
    a  = _mm_xor_ps(a, sign);                             // ai = a1.i  -a1.i  a2.i  -a2.i
    a  = _mm_mul_ps(a, b);                                // ai = a1.i*b1.r  -a1.i*b1.i  a2.i*b2.r  -a2.i*b2.i
    a  = _mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 3, 0, 1));  // ai = -a1.i*b1.i  +a1.i*b1.r  -a2.i*b2.i  +a2.i*b2.r

    return _mm_add_ps(ar, a);   // a1.r*b1.r-a1.i*b1.i  a1.r*b1.i+a1.i*b1.r  a2.r*b2.r-a2.i*b2.i  a2.r*b2.i+a2.i*b2.r
#endif
}
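The fallback path above relies on a caller-supplied sign mask; a minimal sketch of what that argument would look like, inferred from the lane comments rather than taken from the project:

#include <pmmintrin.h>

/* Hypothetical helper: builds the sign mask ZMUL2's non-SSE3 path expects.
 * XORing with -0.0f flips the sign bit, so lanes 1 and 3 are negated and
 * { a1.i, a1.i, a2.i, a2.i } becomes { a1.i, -a1.i, a2.i, -a2.i }. */
static inline __m128 zmul2_sign_mask(void)
{
    return _mm_setr_ps(0.0f, -0.0f, 0.0f, -0.0f);
}

/* usage: __m128 z = ZMUL2(a, b, zmul2_sign_mask());  -- two complex products per call */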
Example #4
// internal SIMD implementation using SSE3
void LLMDCTOpt(const float* x, float* y)
{
	float t4,t5,t6,t7; float c0,c1,c2,c3; 
	float* r = dct_tbl;

	const float invsqrt2= 0.707107f;//(float)(1.0f / M_SQRT2);
	const float invsqrt2h=0.353554f;//invsqrt2*0.5f;

	{
		__m128 mc1 = _mm_load_ps(x);
		__m128 mc2 = _mm_loadr_ps(x+4);

		__m128 mt1 = _mm_add_ps(mc1,mc2);
		__m128 mt2 = _mm_sub_ps(mc1,mc2);//rev

		mc1 = _mm_addsub_ps(_mm_shuffle_ps(mt1,mt1,_MM_SHUFFLE(1,1,0,0)),_mm_shuffle_ps(mt1,mt1,_MM_SHUFFLE(2,2,3,3)));
		mc1 = _mm_shuffle_ps(mc1,mc1,_MM_SHUFFLE(0,2,3,1));

		_mm_store_ps(y,mc1);
		_mm_store_ps(y+4,mt2);

	}
	c0=y[0];
	c1=y[1];
	c2=y[2];
	c3=y[3];
	/*c3=y[0];
	c0=y[1];
	c2=y[2];
	c1=y[3];*/

	t7=y[4];
	t6=y[5];
	t5=y[6];
	t4=y[7];

	y[0] = c0 + c1;
	y[4] = c0 - c1;
	y[2] = c2 * r[6] + c3 * r[2];
	y[6] = c3 * r[6] - c2 * r[2];

	c3 = t4 * r[3] + t7 * r[5];
	c0 = t7 * r[3] - t4 * r[5];
	c2 = t5 * r[1] + t6 * r[7];
	c1 = t6 * r[1] - t5 * r[7];

	y[5] = c3 - c1; y[3] = c0 - c2;
	c0 = (c0 + c2) * invsqrt2;
	c3 = (c3 + c1) * invsqrt2;
	y[1] = c0 + c3; y[7] = c0 - c3;

	const __m128 invsqh = _mm_set_ps1(invsqrt2h);
	__m128 my = _mm_load_ps(y);
	_mm_store_ps(y,_mm_mul_ps(my,invsqh));

	my = _mm_load_ps(y+4);
	_mm_store_ps(y+4,_mm_mul_ps(my,invsqh));
}
Example #5
void
M_MatrixRotateAxis44_SSE(M_Matrix44 *M, M_Real theta, M_Vector3 A)
{
	float s = sinf((float)theta);
	float c = cosf((float)theta);
	float t = 1.0f - c;
	M_Matrix44 R;
	__m128 a = A.m128, r1;
#ifdef HAVE_SSE3
	__m128 rC1 = _mm_set_ps(-c,    s*A.z, 0.0f,  +c);	/* 1,3 1,2 3 2 */
	__m128 rC2 = _mm_set_ps(0.0f, -s*A.y, s*A.y, -s*A.x);	/* 1,2 3 1 2,3 */
#endif
	
	/* m1: [t*AxAx + c,    t*AxAy + sAz,    t*AxAz - sAy,    0] */
	r1 = _mm_mul_ps(_mm_set1_ps(t), a);
	r1 = _mm_mul_ps(r1, _mm_shuffle_ps(a,a,_MM_SHUFFLE(0,0,0,0)));
#ifdef HAVE_SSE3
	R.m1 = _mm_addsub_ps(r1, _mm_shuffle_ps(rC1,rC2,_MM_SHUFFLE(3,1,2,3)));
#else
	R.m1 = _mm_add_ps(r1, _mm_set_ps(0.0f, -s*A.y, s*A.z, c));
#endif

	/* m2: [t*AxAy - sAz,    t*AyAy + c,    t*AyAz + sAx,    0] */
	r1 = _mm_mul_ps(_mm_set1_ps(t), _mm_shuffle_ps(a,a,_MM_SHUFFLE(3,1,1,0)));
	r1 = _mm_mul_ps(r1, _mm_shuffle_ps(a,a,_MM_SHUFFLE(3,2,1,1)));
#ifdef HAVE_SSE3
	R.m2 = _mm_addsub_ps(r1, _mm_shuffle_ps(rC1,rC2,_MM_SHUFFLE(3,0,0,2)));
#else
	R.m2 = _mm_add_ps(r1, _mm_set_ps(0.0f, +s*A.x, c, -s*A.z));
#endif

	/* m3: [t*AxAz + sAy,    t*AyAz - sAx,    t*AzAz + c,    0] */
	r1 = _mm_mul_ps(_mm_set1_ps(t), a);
	r1 = _mm_mul_ps(r1, _mm_shuffle_ps(a,a,_MM_SHUFFLE(0,2,2,2)));
#ifdef HAVE_SSE3
	R.m3 = _mm_addsub_ps(r1, _mm_shuffle_ps(rC2,rC1,_MM_SHUFFLE(1,3,0,2)));
#else
	R.m3 = _mm_add_ps(r1, _mm_set_ps(0.0f, c, -s*A.x, +s*A.y));
#endif
	/* m4: [0,    0,    0,    1] */
	R.m4 = _mm_set_ps(1.0f, 0.0f, 0.0f, 0.0f);

	M_MatrixMult44v_SSE(M, &R);
}
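Lane-by-lane trace of the SSE3 path for R.m1, derived from the constants above (_mm_addsub_ps subtracts in even lanes and adds in odd lanes; the final lane assumes A's w component is 0):

/* rC1 = { +c, 0, s*A.z, -c }             (lanes 0..3, low lane first)
 * rC2 = { -s*A.x, s*A.y, -s*A.y, 0 }
 * _mm_shuffle_ps(rC1, rC2, _MM_SHUFFLE(3,1,2,3)) = { -c, s*A.z, s*A.y, 0 }
 * _mm_addsub_ps(r1, m) = { r1[0]-(-c), r1[1]+s*A.z, r1[2]-s*A.y, r1[3]+0 }
 *                      = { t*Ax*Ax + c, t*Ax*Ay + s*Az, t*Ax*Az - s*Ay, 0 } */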
Example #6
int sse3(){
	__ma128f v1;
	__ma128f v2;
	// fill the four float lanes (indices 3..0; the original started at 4,
	// writing one element past the end of f32)
	for (int i = 3; i >= 0; i--){
		v1.f32[i] = -i*5.1;
		v2.f32[i] = i*10.1;
	}
	__ma128f vo;
	// _mm_addsub_ps subtracts in even lanes and adds in odd lanes,
	// so lane 3 must equal v1.f32[3] + v2.f32[3]
	vo.f = _mm_addsub_ps(v1.f,v2.f);
	if (fabsf(vo.f32[3] - (v1.f32[3]+v2.f32[3])) < FLT_EPSILON){
		return 0;
	}else{
		printf("Correct: %f result: %f\n",v1.f32[3]+v2.f32[3], vo.f32[3]);
		return -1;
	}
}
Example #7
#include <stdio.h>
#include <pmmintrin.h>

int main(){
	__m128 A1, A2, A, B, C, B1, B2, D;
	float a[4] __attribute__((aligned(16))) = {1.0, 2.0, 3.0, 4.0};
	float b[4] __attribute__((aligned(16))) = {0.1, 0.2, 0.3, 0.4};
	
	A = _mm_load_ps(a);
	B = _mm_load_ps(b);
	A1 = _mm_moveldup_ps(A);
	A2 = _mm_movehdup_ps(A);
	B1 = _mm_mul_ps(A1, B);
	B2 = _mm_mul_ps(A2, B);
	
	C = _mm_shuffle_ps(B2, B2, _MM_SHUFFLE(2, 3, 0, 1));
	D = _mm_addsub_ps(B1, C);
	_mm_store_ps(a, D);
	
	printf("(%f, %f) (%f, %f)\n", a[0], a[1], a[2], a[3]);
}
Example #8
void parallel_vectorised_matmul(struct complex ** A, struct complex ** B, struct complex ** C, int a_rows, int a_cols, int b_cols) {
  #pragma omp parallel for
  for (int i = 0; i < a_rows; i++) {
    for (int k = 0; k < a_cols; k++) {
      struct complex r = A[i][k];

      __m128 a_real = _mm_set1_ps(r.real);
      __m128 a_imag = _mm_set1_ps(r.imag);

      for (int j = 0; j < b_cols; j += 2) {
        __m128 b_complex = _mm_load_ps((float*) &B[k][j]);

        __m128 real_times_b = _mm_mul_ps(a_real, b_complex);
        __m128 imag_times_b = _mm_mul_ps(a_imag, b_complex);
        imag_times_b = _mm_shuffle_ps(imag_times_b, imag_times_b, _MM_SHUFFLE(2, 3, 0, 1));

        __m128 addsub = _mm_addsub_ps(real_times_b, imag_times_b);

        __m128 current_c = _mm_load_ps((float*) &C[i][j]);
        _mm_store_ps((float*) &C[i][j], _mm_add_ps(current_c, addsub));
      }
    }
  }
}
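For reference, a scalar sketch of the same inner update (a hypothetical helper; it assumes struct complex is a pair of floats laid out real-then-imag, which the 16-byte vector loads above already require):

/* Assumed layout of the element type used above. */
struct complex { float real, imag; };

/* Scalar equivalent of one (i, k) iteration of the vectorised loop:
 * C[i][j] += A[i][k] * B[k][j] for all j. */
static void scalar_row_update(struct complex r, const struct complex *Bk,
                              struct complex *Ci, int b_cols)
{
    for (int j = 0; j < b_cols; j++) {
        Ci[j].real += r.real * Bk[j].real - r.imag * Bk[j].imag;
        Ci[j].imag += r.real * Bk[j].imag + r.imag * Bk[j].real;
    }
}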
Example #9
inline __m128 foo4 (__m128 x, __m128 y) {
    return _mm_addsub_ps (x, y);
}
Example #10
__m128 test_mm_addsub_ps(__m128 A, __m128 B) {
  // CHECK-LABEL: test_mm_addsub_ps
  // CHECK: call <4 x float> @llvm.x86.sse3.addsub.ps(<4 x float> %{{.*}}, <4 x float> %{{.*}})
  return _mm_addsub_ps(A, B);
}
Example #11
LXC_ERROR_CODE LXC_SSE3FreqSplit2Ch(uint Size, void *Z, void *X, void *Y)
{
    if(!Size || !Z || !X || !Y)
    {
        return LXC_ERR_INVALID_INPUT;
    }

    float *m_X = (float*)X;
    float *m_Y = (float*)Y;
    float *m_Z = (float*)Z;

    Size = Size*2;
#if defined(TARGET_WINDOWS)
    const __declspec(align(LXC_SSE3_ALIGN)) float  scaleFactor = 0.5f;
#else
    const float  scaleFactor = 0.5f;
#endif
    __m128 scale_05 = _mm_load1_ps(&scaleFactor);
    __m128 XY0 = _mm_setr_ps(m_Z[0], 0.0f, m_Z[1], 0.0f);
    // [0]=Z[0][0], [1]=0.0f, [2]=Z[0][1], [3]=0.0f

    __m128 _m128Z = _mm_load_ps(&m_Z[0]);
    __m128 _m128Z_Size = _mm_loadl_pi(_m128Z, (__m64*)&m_Z[Size-2]);
    // [0]=Z[Size-1][0], [1]=Z[Size-1][1], [2]=Z[1][0], [3]=Z[1][1]

    __m128 leftNumbers = _mm_shuffle_ps(_m128Z_Size, _m128Z_Size, LXC_MM_SHUFFLE(3,2,0,3));
    // [0]=Z[1][1], [1]=Z[1][0], [2]=Z[Size-1][0], [3]=Z[1][1]

    __m128 rightNumbers = _mm_shuffle_ps(_m128Z_Size, _m128Z_Size, LXC_MM_SHUFFLE(1,0,2,1));
    // [0]=Z[Size-1][1], [1]=Z[Size-1][0], [2]=Z[1][0], [3]=Z[Size-1][1]

    __m128 mulAddSubRes = _mm_mul_ps(_mm_addsub_ps(leftNumbers, rightNumbers), scale_05);
    // [0]=(Z[1][1] - Z[Size-1][1])*0.5f=X[1][1]
    // [1]=(Z[1][0] + Z[Size-1][0])*0.5f=X[1][0]
    // [2]=(Z[Size-1][0] - Z[1][0])*0.5f=Y[1][1]
    // [3]=(Z[1][1] + Z[Size-1][1])*0.5f=Y[1][0]

    _mm_store_ps(&m_X[0], _mm_shuffle_ps(XY0, mulAddSubRes, LXC_MM_SHUFFLE(0,1,1,0)));
    // [0]=X[0][0]=Z[0][0]
    // [1]=X[0][1]=0.0f
    // [2]=X[1][0]=(m_Z[kk][0] + m_Z[L_minus_K][0])*0.5f
    // [3]=X[1][1]=(m_Z[kk][1] - m_Z[L_minus_K][1])*0.5f

    _mm_store_ps(&m_Y[0], _mm_shuffle_ps(XY0, mulAddSubRes, LXC_MM_SHUFFLE(2,3,3,2)));
    // [0]=Y[0][0]=Z[0][1]
    // [1]=Y[0][1]=0.0f
    // [2]=Y[1][0]=(m_Z[kk][0] + m_Z[L_minus_K][0])*0.5f
    // [3]=Y[1][1]=(m_Z[kk][1] - m_Z[L_minus_K][1])*0.5f

    for(uint kk = 4; kk < Size; kk+=4)
    {
        //__m128 _Z = {0.0f,1.0f,2.0f,3.0f};
        //__m128 L = {4.0f,5.0f,6.0f,7.0f};

        __m128 _Z = _mm_load_ps(&m_Z[kk]);
        // [0]=Z[kk][0], [1]=Z[kk][1], [2]=Z[kk+1][0], [3]=Z[kk+1][1]
        __m128 _ZShuffle = _mm_shuffle_ps(_Z, _Z, LXC_MM_SHUFFLE(1,0,3,2));
        // [0]=Z[kk][1], [1]=Z[kk][0], [2]=Z[kk+1][1], [3]=Z[kk+1][0]
        __m128 _ZSize = _mm_loadu_ps(&(m_Z[Size - kk - 2]));
        // [0]=Z[Size-kk-1][0], [1]=Z[Size-kk-1][1], [2]=Z[Size-kk][0], [3]=Z[Size-kk][1]

        // calculate X signal
        __m128 _ZSizeShuffle = _mm_shuffle_ps(_ZSize, _ZSize, LXC_MM_SHUFFLE(3,2,1,0));
        // [0]=Z[Size-kk][1], [1]=Z[Size-kk][0], [2]=Z[Size-kk-1][1], [3]=Z[Size-kk-1][0]
        __m128 result = _mm_mul_ps(_mm_addsub_ps(_ZShuffle, _ZSizeShuffle), scale_05);
        // [0]=(Z[kk][1] - Z[Size-kk][1])*0.5f=X[kk][1]
        // [1]=(Z[kk][0] + Z[Size-kk][0])*0.5f=X[kk][0]
        // [2]=(Z[kk+1][1] - Z[Size-kk-1][1])*0.5f=X[kk+1][1]
        // [3]=(Z[kk+1][0] + Z[Size-kk-1][0])*0.5f=X[kk+1][0]
        _mm_store_ps(&m_X[kk], _mm_shuffle_ps(result, result, LXC_MM_SHUFFLE(1,0,3,2)));
        // [0]=X[kk][0]  =(Z[kk][0] + Z[Size-kk][0])*0.5f
        // [1]=X[kk][1]  =(Z[kk][1] - Z[Size-kk][1])*0.5f
        // [2]=X[kk+1][0]=(Z[kk+1][0] + Z[Size-kk-1][0])*0.5f
        // [3]=X[kk+1][1]=(Z[kk+1][1] - Z[Size-kk-1][1])*0.5f

        // calculate Y signal
        __m128 left = _mm_shuffle_ps(_Z, _ZSize, LXC_MM_SHUFFLE(1,3,2,0));
        // [0]=Z[kk][1], [1]=Z[kk+1][1], [2]=Z[Size-kk][0], [3]=Z[Size-kk-1][0]
        left = _mm_shuffle_ps(left, left, LXC_MM_SHUFFLE(2,0,3,1));
        // [0]=Z[Size-kk][0], [1]=Z[kk][1], [2]=Z[Size-kk-1][0], [3]=Z[kk+1][1]

        __m128 right = _mm_shuffle_ps(_Z, _ZSize, LXC_MM_SHUFFLE(0,2,3,1));
        // [0]=Z[kk][0], [1]=Z[kk+1][0], [2]=Z[Size-kk][1], [3]=Z[Size-kk-1][1]
        right = _mm_shuffle_ps(right, right, LXC_MM_SHUFFLE(0,2,1,3));
        // [0]=Z[kk][0], [1]=Z[Size-kk][1], [2]=Z[kk+1][0], [3]=Z[Size-kk-1][1]

        result = _mm_mul_ps(_mm_addsub_ps(left, right), scale_05);
        // [0]=Y[kk][1]  = 0.5f*(m_Z[Size-kk][0] - m_Z[kk][0]);
        // [1]=Y[kk][0]  = 0.5f*(m_Z[kk][1] + m_Z[Size-kk][1]);
        // [2]=Y[kk+1][1]= 0.5f*(m_Z[Size-kk-1][0] - m_Z[kk+1][0]);
        // [3]=Y[kk+1][0]= 0.5f*(m_Z[kk+1][1] + m_Z[Size-kk-1][1]);

        _mm_store_ps(&m_Y[kk], _mm_shuffle_ps(result, result, LXC_MM_SHUFFLE(1,0,3,2)));
        // [0]=Y[kk][0]  = 0.5f*(m_Z[kk][1] + m_Z[Size-kk][1]);
        // [1]=Y[kk][1]  = 0.5f*(m_Z[Size-kk][0] - m_Z[kk][0]);
        // [2]=Y[kk+1][0]= 0.5f*(m_Z[kk+1][1] + m_Z[Size-kk-1][1]);
        // [3]=Y[kk+1][1]= 0.5f*(m_Z[Size-kk-1][0] - m_Z[kk+1][0]);
    }

    return LXC_NO_ERR;
}
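A scalar sketch of the per-bin arithmetic the shuffles above implement (a hypothetical helper, not from the library; N counts complex bins, and Z, X, Y are interleaved re/im pairs as in the comments, with bin 0 handled specially as in the prologue):

/* Split a combined spectrum Z of two real channels into X and Y. */
static void freq_split_scalar(uint N, const float (*Z)[2],
                              float (*X)[2], float (*Y)[2])
{
    /* bin 0: Z[0] packs both channels' purely real DC bins */
    X[0][0] = Z[0][0]; X[0][1] = 0.0f;
    Y[0][0] = Z[0][1]; Y[0][1] = 0.0f;
    for (uint k = 1; k < N; k++) {
        X[k][0] = 0.5f*(Z[k][0] + Z[N-k][0]);
        X[k][1] = 0.5f*(Z[k][1] - Z[N-k][1]);
        Y[k][0] = 0.5f*(Z[k][1] + Z[N-k][1]);
        Y[k][1] = 0.5f*(Z[N-k][0] - Z[k][0]);
    }
}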
Example #12
//-----------------------------------------------------------------------------------------
// SSE3 complex multiplication with different kernel sizes
//-----------------------------------------------------------------------------------------
// SSE3 2x complex multiplication (for details see example 6-9 in Intel 64 and IA-32 Architectures Optimization Reference Manual)
// complex multiplication is defined as: (a+jb)*(c+jd) = a*c - b*d + j(a*d + b*c)
// z1 = a1*c1 - b1*d1 + j(a1*d1 + b1*c1)
// z2 = a2*c2 - b2*d2 + j(a2*d2 + b2*c2)
// X = { a1, b1, a2, b2 }
// H = { c1, d1, c2, d2 }
// Z = { Re{z1}, Im{z1}, Re{z2}, Im{z2} } = { a1*c1 - b1*d1, a1*d1 + b1*c1, a2*c2 - b2*d2, a2*d2 + b2*c2 }
LXC_ERROR_CODE LXC_SSE3CpxMul_K2(uint Size, void *X, void *H, void *Z)
{
    if(!X || !H || !Z)
    {
        return LXC_ERR_INVALID_INPUT;
    }

    Size = Size*2;
    float *m_X = (float*)X;
    float *m_H = (float*)H;
    float *m_Z = (float*)Z;
    for(uint ii=0; ii < Size; ii+=4)
    {
        // local variables
        __m128 val1;
        __m128 val2;
        //__m128 val3;
        //__m128 val4;

        // load values into __m128
        val1 = _mm_load_ps(&m_X[ii]);			// _mm_load_ps:		src{ a1, b1, a2, b2 } --> val1 { a1, b1, a2, b2 }
        val2 = _mm_load_ps(&m_H[ii]);			// _mm_load_ps:		src{ c1, d1, c2, d2 } --> val2 { c1, d1, c2, d2 }

        // fused duplicate/multiply/shuffle/addsub/store:
        // _mm_moveldup_ps: { a1, b1, a2, b2 } --> { a1, a1, a2, a2 }
        // _mm_movehdup_ps: { a1, b1, a2, b2 } --> { b1, b1, b2, b2 }
        // multiply:        { a1*c1, a1*d1, a2*c2, a2*d2 }
        // reorder re/im of H: { c1, d1, c2, d2 } --> { d1, c1, d2, c2 }, multiply: { b1*d1, b1*c1, b2*d2, b2*c2 }
        // _mm_addsub_ps:   { a1*c1 - b1*d1, a1*d1 + b1*c1, a2*c2 - b2*d2, a2*d2 + b2*c2 }
        // _mm_store_ps:    Z[ii..ii+3] = result
        _mm_store_ps(&m_Z[ii], _mm_addsub_ps(_mm_mul_ps(_mm_moveldup_ps(val1), val2), _mm_mul_ps(_mm_movehdup_ps(val1), _mm_shuffle_ps(val2, val2, _MM_SHUFFLE(2,3,0,1)))));


        // old loop
        //// local variables
        //__m128 val1;
        //__m128 val2;
        //__m128 val3;
        //__m128 val4;

        //// load values into __m128
        //val1 = _mm_load_ps(&m_X[ii]);			// _mm_load_ps:		src{ a1, b1, a2, b2 } --> val1 { a1, b1, a2, b2 }
        //val2 = _mm_load_ps(&m_H[ii]);			// _mm_load_ps:		src{ c1, d1, c2, d2 } --> val2 { c1, d1, c2, d2 }

        //// duplicate values
        //val3 = _mm_moveldup_ps(val1);			// _mm_moveldup_ps: src{ a1, b1, a2, b2 } --> val2 { a1, a1, a2, a2 }
        //val4 = _mm_movehdup_ps(val1);			// _mm_movehdup_ps:	src{ a1, b1, a2, b2 } --> val3 { b1, b1, b2, b2 }

        //// sse3 multiply
        //val1 = _mm_mul_ps(val3, val2);			// calc { a1*c1, a1*d1, a2*c2, a2*d2 }
        //// reorder im and re numbers { c1, d1, c2, d2 } --> { d1, c1, d2, c2 } and multiply { b1*d1, b1*c1, b2*d2, b2*c2 }
        //val3 = _mm_mul_ps(val4, _mm_shuffle_ps(val2, val2, _MM_SHUFFLE(2,3,0,1)));

        //// add/subtract, scale and store operations
        //val3 = _mm_addsub_ps(val1, val3);		// _mm_addsub_ps: ret { a1*c1 - b1*d1, j(a1*d1 + b1*c1), a2*c2 - b2*d2, j(a2*d2 + b2*c2) }
        //_mm_store_ps(&m_Z[ii], val3);			// _mm_store_ps: C[0] = result0, C[1] = result1, C[2] = result2, C[3] = result3
    }

    return LXC_NO_ERR;
}
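A scalar reference for the kernel above (a hypothetical helper, not part of the library; Size counts complex values, matching the Size*2 float doubling inside the SSE3 version):

LXC_ERROR_CODE LXC_CpxMul_K2_Scalar(uint Size, void *X, void *H, void *Z)
{
    if (!X || !H || !Z)
        return LXC_ERR_INVALID_INPUT;

    float *x = (float*)X, *h = (float*)H, *z = (float*)Z;
    for (uint ii = 0; ii < 2*Size; ii += 2)
    {
        // (a+jb)*(c+jd) = (a*c - b*d) + j(a*d + b*c)
        float re = x[ii]*h[ii]   - x[ii+1]*h[ii+1];
        float im = x[ii]*h[ii+1] + x[ii+1]*h[ii];
        z[ii]   = re;
        z[ii+1] = im;
    }
    return LXC_NO_ERR;
}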
Example #13
void gemm(complex float* A,
        complex float* B,
        complex float* C,
        int m,
        int n,
        int k,
        complex float alpha,
        complex float beta){

    __m128  c_reg, 
            a_reg,
            b_reg,
            alpha_reg,
            beta_reg,
            t,
            t2,
            t3;

    complex float *beta_reg_value = malloc(sizeof(complex float)*2);
    beta_reg_value[0] = beta;
    beta_reg_value[1] = beta;
    beta_reg = _mm_loadu_ps((float*)beta_reg_value);
    
    complex float *alpha_reg_value = malloc(sizeof(complex float)*2);
    alpha_reg_value[0] = alpha;
    alpha_reg_value[1] = alpha;
    alpha_reg = _mm_loadu_ps((float*)alpha_reg_value);
    

    complex float *a_value = malloc(sizeof(complex float)*2);



    for(int x = 0; x < n; x += 2){
        for(int y = 0; y < m; y++){
            t3 = _mm_setzero_ps();
            for(int z = 0; z < k; z++){
                // A[y*k+z]*B[z*n + x]
                a_value[0] = A[y*k + z];
                a_value[1] = a_value[0];    // broadcast to both complex lanes

                a_reg = _mm_loadu_ps((float*)a_value);
                b_reg = _mm_loadu_ps((float*)&B[z*n + x]);
                
                t = _mm_moveldup_ps(a_reg);
                t2 = t * b_reg;
                b_reg = _mm_shuffle_ps(b_reg, b_reg, 0xb1);
                t = _mm_movehdup_ps(a_reg);
                t = t * b_reg;
                a_reg = _mm_addsub_ps(t2, t);


                t3 = t3 + a_reg;
                
            }
            c_reg = _mm_loadu_ps((float*)&C[y*n + x]);
            t = _mm_moveldup_ps(c_reg);
            t2 = t * beta_reg;
            beta_reg = _mm_shuffle_ps(beta_reg, beta_reg, 0xb1);
            t = _mm_movehdup_ps(c_reg);
            t = t * beta_reg;
            c_reg = _mm_addsub_ps(t2, t);
            beta_reg = _mm_shuffle_ps(beta_reg, beta_reg, 0xb1);

            t = _mm_moveldup_ps(t3);
            t2 = t * alpha_reg;
            alpha_reg = _mm_shuffle_ps(alpha_reg, alpha_reg, 0xb1);
            t = _mm_movehdup_ps(t3);
            t = t * alpha_reg;
            b_reg = _mm_addsub_ps(t2, t);
            alpha_reg = _mm_shuffle_ps(alpha_reg, alpha_reg, 0xb1);
            
            c_reg = b_reg + c_reg;
            _mm_storeu_ps((float*)&C[y*n + x], c_reg);
        }
    }
    free(beta_reg_value);
    free(alpha_reg_value);
    free(a_value);
}
Example #14
__m128 test_mm_addsub_ps(__m128 A, __m128 B) {
  // CHECK-LABEL: test_mm_addsub_ps
  // CHECK: call <4 x float> @llvm.x86.sse3.addsub.ps
  // CHECK-ASM: addsubps %xmm{{.*}}, %xmm{{.*}}
  return _mm_addsub_ps(A, B);
}
Example #15
// use MMX/SSE extensions
//
// (a + jb)(c + jd) = (ac - bd) + j(ad + bc)
//
// mm_x  = { x[0].real, x[0].imag, x[1].real, x[1].imag }
// mm_hi = { h[0].real, h[0].real, h[1].real, h[1].real }
// mm_hq = { h[0].imag, h[0].imag, h[1].imag, h[1].imag }
//
// mm_y0 = mm_x * mm_hi
//       = { x[0].real * h[0].real,
//           x[0].imag * h[0].real,
//           x[1].real * h[1].real,
//           x[1].imag * h[1].real };
//
// mm_y1 = mm_x * mm_hq
//       = { x[0].real * h[0].imag,
//           x[0].imag * h[0].imag,
//           x[1].real * h[1].imag,
//           x[1].imag * h[1].imag };
//
void dotprod_cccf_execute_mmx(dotprod_cccf _q,
                              float complex * _x,
                              float complex * _y)
{
    // type cast input as floating point array
    float * x = (float*) _x;

    // double effective length
    unsigned int n = 2*_q->n;

    // temporary buffers
    __m128 v;   // input vector
    __m128 hi;  // coefficients vector (real)
    __m128 hq;  // coefficients vector (imag)
    __m128 ci;  // output multiplication (v * hi)
    __m128 cq;  // output multiplication (v * hq)

    // aligned output array
    float w[4] __attribute__((aligned(16))) = {0,0,0,0};

#if HAVE_PMMINTRIN_H
    // SSE3
    __m128 s;   // dot product
    __m128 sum = _mm_setzero_ps(); // load zeros into sum register
#else
    // no SSE3
    float wi[4] __attribute__((aligned(16)));
    float wq[4] __attribute__((aligned(16)));
#endif

    // t = 4*floor(n/4)
    unsigned int t = (n >> 2) << 2;

    //
    unsigned int i;
    for (i=0; i<t; i+=4) {
        // load inputs into register (unaligned)
        // {x[0].real, x[0].imag, x[1].real, x[1].imag}
        v = _mm_loadu_ps(&x[i]);

        // load coefficients into register (aligned)
        hi = _mm_load_ps(&_q->hi[i]);
        hq = _mm_load_ps(&_q->hq[i]);

        // compute parallel multiplications
        ci = _mm_mul_ps(v, hi);
        cq = _mm_mul_ps(v, hq);

        // shuffle values
        cq = _mm_shuffle_ps( cq, cq, _MM_SHUFFLE(2,3,0,1) );
        
#if HAVE_PMMINTRIN_H
        // SSE3: combine using addsub_ps()
        s = _mm_addsub_ps( ci, cq );

        // accumulate
        sum = _mm_add_ps(sum, s);
#else
        // no SSE3: combine using slow method
        // FIXME: implement slow method
        // unload values
        _mm_store_ps(wi, ci);
        _mm_store_ps(wq, cq);

        // accumulate
        w[0] += wi[0] - wq[0];
        w[1] += wi[1] + wq[1];
        w[2] += wi[2] - wq[2];
        w[3] += wi[3] + wq[3];
#endif
    }

#if HAVE_PMMINTRIN_H
    // unload packed array
    _mm_store_ps(w, sum);
#endif

    // add in-phase and quadrature components
    w[0] += w[2];   // I
    w[1] += w[3];   // Q

    //float complex total = *((float complex*)w);
    float complex total = w[0] + w[1] * _Complex_I;

    // cleanup
    for (i=t/2; i<_q->n; i++)
        total += _x[i] * ( _q->hi[2*i] + _q->hq[2*i]*_Complex_I );

    // set return value
    *_y = total;
}
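The hi/hq layout in the header comment duplicates each coefficient's real and imaginary parts lane-by-lane; a sketch of how such arrays could be filled from an array h of float complex coefficients (hypothetical setup code, not part of the library shown above):

#include <complex.h>

/* Hypothetical setup sketch: spread n complex coefficients h[] into the
 * duplicated hi/hq layout described in the header comment above.
 * hi and hq must each hold 2*n floats, 16-byte aligned. */
static void fill_hi_hq(const float complex *h, float *hi, float *hq,
                       unsigned int n)
{
    for (unsigned int k = 0; k < n; k++) {
        hi[2*k+0] = crealf(h[k]);  hi[2*k+1] = crealf(h[k]);
        hq[2*k+0] = cimagf(h[k]);  hq[2*k+1] = cimagf(h[k]);
    }
}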
// =============================================================================
//
// sse3_vChirpData
// version by: Alex Kan
//   http://tbp.berkeley.edu/~alexkan/seti/
//
int sse3_ChirpData_ak(
  sah_complex * cx_DataArray,
  sah_complex * cx_ChirpDataArray,
  int chirp_rate_ind,
  double chirp_rate,
  int  ul_NumDataPoints,
  double sample_rate
) {
  int i;

  if (chirp_rate_ind == 0) {
    memcpy(cx_ChirpDataArray, cx_DataArray,  (int)ul_NumDataPoints * sizeof(sah_complex)  );
    return 0;
  }

  int vEnd;  
  double srate = chirp_rate * 0.5 / (sample_rate * sample_rate);
  __m128d rate = _mm_set1_pd(chirp_rate * 0.5 / (sample_rate * sample_rate));
  __m128d roundVal = _mm_set1_pd(srate >= 0.0 ? TWO_TO_52 : -TWO_TO_52);

  // main vectorised loop
  vEnd = ul_NumDataPoints - (ul_NumDataPoints & 3);
  for (i = 0; i < vEnd; i += 4) {
    const float *data = (const float *) (cx_DataArray + i);
    float *chirped = (float *) (cx_ChirpDataArray + i);
    __m128d di = _mm_set1_pd(i);
    __m128d a1 = _mm_add_pd(_mm_set_pd(1.0, 0.0), di);
    __m128d a2 = _mm_add_pd(_mm_set_pd(3.0, 2.0), di);

    __m128 d1, d2;
    __m128 cd1, cd2;
    __m128 td1, td2;
    __m128 x;
    __m128 y;
    __m128 s;
    __m128 c;
    __m128 m;

    // load the signal to be chirped
    prefetchnta((const void *)( data+32 ));
    d1 = _mm_load_ps(data);
    d2 = _mm_load_ps(data+4);

    // calculate the input angle
    a1 = _mm_mul_pd(_mm_mul_pd(a1, a1), rate);
    a2 = _mm_mul_pd(_mm_mul_pd(a2, a2), rate);

    // reduce the angle to the range (-0.5, 0.5)
    a1 = _mm_sub_pd(a1, _mm_sub_pd(_mm_add_pd(a1, roundVal), roundVal));
    a2 = _mm_sub_pd(a2, _mm_sub_pd(_mm_add_pd(a2, roundVal), roundVal));

    // convert pair of packed double into packed single
    x = _mm_movelh_ps(_mm_cvtpd_ps(a1), _mm_cvtpd_ps(a2));

    // square to the range [0, 0.25)
    y = _mm_mul_ps(x, x);

    // perform the initial polynomial approximations
    s = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(y, SS4),
                                    SS3),
                                y),
                          SS2),
                    y),
              SS1),
          x);
    c = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(y, CC3),
                                CC2),
                          y),
                    CC1),
              y),
          ONE);

    // perform first angle doubling
    x = _mm_sub_ps(_mm_mul_ps(c, c), _mm_mul_ps(s, s));
    y = _mm_mul_ps(_mm_mul_ps(s, c), TWO);

    // calculate scaling factor to correct the magnitude
    //      m1 = vec_nmsub(y1, y1, vec_nmsub(x1, x1, TWO));
    //      m2 = vec_nmsub(y2, y2, vec_nmsub(x2, x2, TWO));
    m = vec_recip3(_mm_add_ps(_mm_mul_ps(x, x), _mm_mul_ps(y, y)));

    // perform second angle doubling
    c = _mm_sub_ps(_mm_mul_ps(x, x), _mm_mul_ps(y, y));
    s = _mm_mul_ps(_mm_mul_ps(y, x), TWO);

    // correct the magnitude (final sine / cosine approximations)
    s = _mm_mul_ps(s, m);
    c = _mm_mul_ps(c, m);

    // chirp the data
    cd1 = _mm_shuffle_ps(c, c, 0x50);
    cd2 = _mm_shuffle_ps(c, c, 0xfa);
    cd1 = _mm_mul_ps(cd1, d1);
    cd2 = _mm_mul_ps(cd2, d2);
    d1 = _mm_shuffle_ps(d1, d1, 0xb1);
    d2 = _mm_shuffle_ps(d2, d2, 0xb1);
    td1 = _mm_shuffle_ps(s, s, 0x50);
    td2 = _mm_shuffle_ps(s, s, 0xfa);
    td1 = _mm_mul_ps(td1, d1);
    td2 = _mm_mul_ps(td2, d2);
    cd1 = _mm_addsub_ps(cd1, td1);
    cd2 = _mm_addsub_ps(cd2, td2);

    // store chirped values
    _mm_stream_ps(chirped, cd1);
    _mm_stream_ps(chirped+4, cd2);
  }
  _mm_sfence();

  // handle tail elements with scalar code
  for (   ; i < ul_NumDataPoints; ++i) {
    double angle = srate * i * i * 0.5;
    double s = sin(angle);
    double c = cos(angle);
    float re = cx_DataArray[i][0];
    float im = cx_DataArray[i][1];

    cx_ChirpDataArray[i][0] = re * c - im * s;
    cx_ChirpDataArray[i][1] = re * s + im * c;
  }
  analysis_state.FLOP_counter+=12.0*ul_NumDataPoints;

  return 0;
}
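Both chirp kernels reduce the angle with the classic 2^52 trick: under round-to-nearest, adding and then subtracting TWO_TO_52 leaves the nearest integer to a, so subtracting that from a lands in [-0.5, 0.5] (the code picks +TWO_TO_52 or -TWO_TO_52 to match the sign of the angle term). A scalar sketch of the idea (assumes the default round-to-nearest FPU mode and |a| < 2^51):

/* Scalar sketch of the range reduction used above. */
static inline double reduce_angle(double a)
{
    const double two52 = 4503599627370496.0;   /* 2^52 */
    double rounded = (a + two52) - two52;      /* nearest integer to a */
    return a - rounded;                        /* in [-0.5, 0.5] */
}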
int sse3_ChirpData_ak8(
    sah_complex * cx_DataArray,
    sah_complex * cx_ChirpDataArray,
    int chirp_rate_ind,
    double chirp_rate,
    int  ul_NumDataPoints,
    double sample_rate
) {
#ifdef USE_MANUAL_CALLSTACK
    call_stack.enter("sse3_ChirpData_ak8()");
#endif 
    int i;

    if (chirp_rate_ind == 0) {
      memcpy(cx_ChirpDataArray, cx_DataArray,  (int)ul_NumDataPoints * sizeof(sah_complex)  );
#ifdef USE_MANUAL_CALLSTACK
      call_stack.exit();
#endif 
      return 0;
    }

    int vEnd;
    double srate = chirp_rate * 0.5 / (sample_rate * sample_rate);
    __m128d rate = _mm_set1_pd(chirp_rate * 0.5 / (sample_rate * sample_rate));
    __m128d roundVal = _mm_set1_pd(srate >= 0.0 ? TWO_TO_52 : -TWO_TO_52);
    __m128d DFOUR = _mm_set_pd(4.0, 4.0);


    // main vectorised loop
    vEnd = ul_NumDataPoints - (ul_NumDataPoints & 3);
    __m128d di1 = _mm_set_pd(2.0, 0.0);                 // set time patterns for eventual moveldup/movehdup
    __m128d di2 = _mm_set_pd(3.0, 1.0);

    for (i = 0; i < vEnd; i += 4) {
      const float *d = (const float *) (cx_DataArray + i);
      float *cd = (float *) (cx_ChirpDataArray + i);

      __m128d a1, a2;

      __m128 d1, d2;
      __m128 cd1, cd2;
      __m128 td1, td2;

      __m128 x;
      __m128 y;
      __m128 z;
      __m128 s;
      __m128 c;
      __m128 m;

      // load the signal to be chirped
      d1 = _mm_load_ps(d);
      d2 = _mm_load_ps(d+4);

      // calculate the input angle
      a1 = _mm_mul_pd(_mm_mul_pd(di1, di1), rate);
      a2 = _mm_mul_pd(_mm_mul_pd(di2, di2), rate);

      // update times for next
      di1 = _mm_add_pd(di1, DFOUR);
      di2 = _mm_add_pd(di2, DFOUR);

      // reduce the angle to the range (-0.5, 0.5)
      a1 = _mm_sub_pd(a1, _mm_sub_pd(_mm_add_pd(a1, roundVal), roundVal));
      a2 = _mm_sub_pd(a2, _mm_sub_pd(_mm_add_pd(a2, roundVal), roundVal));

      // convert pair of packed double into packed single
      x = _mm_movelh_ps(_mm_cvtpd_ps(a1), _mm_cvtpd_ps(a2));               // 3   1   2   0

      // square to the range [0, 0.25)
      y = _mm_mul_ps(x, x);

      // perform the initial polynomial approximations, Estrin's method
      z = _mm_mul_ps(y, y);

      s = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(y, SS4F),
                                                      SS3F),
                                           z),
                                _mm_add_ps(_mm_mul_ps(y, SS2F),
                                           SS1F)),
                     x);
      c = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(y, CC3F),
                                           CC2F),
                                z),
                     _mm_add_ps(_mm_mul_ps(y, CC1F),
                                ONE));

      // perform first angle doubling
      x = _mm_sub_ps(_mm_mul_ps(c, c), _mm_mul_ps(s, s));
      y = _mm_mul_ps(_mm_mul_ps(s, c), TWO);

      // calculate scaling factor to correct the magnitude
      m = _mm_sub_ps(_mm_sub_ps(TWO, _mm_mul_ps(x, x)), _mm_mul_ps(y, y));

      // perform second angle doubling
      c = _mm_sub_ps(_mm_mul_ps(x, x), _mm_mul_ps(y, y));
      s = _mm_mul_ps(_mm_mul_ps(y, x), TWO);

      // correct the magnitude (final sine / cosine approximations)
      c = _mm_mul_ps(c, m);                                       // c3    c1    c2    c0
      s = _mm_mul_ps(s, m);

      // chirp the data
      cd1 = _mm_moveldup_ps(c);                                   // c1    c1    c0    c0
      cd2 = _mm_movehdup_ps(c);                                   // c3    c3    c2    c2
      cd1 = _mm_mul_ps(cd1, d1);                                  // c1.i1 c1.r1 c0.i0 c0.r0
      cd2 = _mm_mul_ps(cd2, d2);                                  // c3.i3 c3.r3 c2.i2 c2.r2
      d1 = _mm_shuffle_ps(d1, d1, 0xb1);
      d2 = _mm_shuffle_ps(d2, d2, 0xb1);
      td1 = _mm_moveldup_ps(s);
      td2 = _mm_movehdup_ps(s);
      td1 = _mm_mul_ps(td1, d1);
      td2 = _mm_mul_ps(td2, d2);
      cd1 = _mm_addsub_ps(cd1, td1);
      cd2 = _mm_addsub_ps(cd2, td2);

      // store chirped values
      _mm_stream_ps(cd, cd1);
      _mm_stream_ps(cd+4, cd2);
    }

    // handle tail elements with scalar code
    for (; i < ul_NumDataPoints; ++i) {
      double angle = srate * i * i * 0.5;
      double s = sin(angle);
      double c = cos(angle);

      float re = cx_DataArray[i][0];
      float im = cx_DataArray[i][1];

      cx_ChirpDataArray[i][0] = re * c - im * s;
      cx_ChirpDataArray[i][1] = re * s + im * c;
    }
    analysis_state.FLOP_counter+=12.0*ul_NumDataPoints;
#ifdef USE_MANUAL_CALLSTACK
    call_stack.exit();
#endif 
    return 0;
}