Example #1
File: fast.c  Project: sondrele/NTNU
#include <complex.h>
#include <pmmintrin.h>   /* SSE3 intrinsics */

/* Computes r = (a * b) * c, two complex floats per call. */
void simd_complex_mult(complex float *a, complex float *b, complex float *c, complex float *r) {
    __m128 a_reg,
           b_reg,
           c_reg,
           t_reg1,
           t_reg2,
           r_reg;

    /* Note the swapped loads: complex multiplication is commutative,
     * so exchanging the operands does not change the result. */
    a_reg = _mm_loadu_ps((float *) b);
    b_reg = _mm_loadu_ps((float *) a);
    c_reg = _mm_loadu_ps((float *) c);

    /* a * b: duplicate real parts, multiply, then combine with the
     * imaginary-part products via addsub. */
    t_reg1 = _mm_moveldup_ps(b_reg);
    t_reg2 = _mm_mul_ps(t_reg1, a_reg);

    a_reg = _mm_shuffle_ps(a_reg, a_reg, 0xb1);   /* swap re/im in each pair */
    t_reg1 = _mm_movehdup_ps(b_reg);
    t_reg1 = _mm_mul_ps(t_reg1, a_reg);

    r_reg = _mm_addsub_ps(t_reg2, t_reg1);        /* r = a * b */

    /* (a * b) * c: the same pattern applied once more. */
    t_reg1 = _mm_moveldup_ps(r_reg);
    t_reg2 = _mm_mul_ps(t_reg1, c_reg);

    c_reg = _mm_shuffle_ps(c_reg, c_reg, 0xb1);
    t_reg1 = _mm_movehdup_ps(r_reg);
    t_reg1 = _mm_mul_ps(t_reg1, c_reg);

    r_reg = _mm_addsub_ps(t_reg2, t_reg1);
    _mm_storeu_ps((float *) r, r_reg);
}
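A minimal driver for the routine above (hypothetical values; assumes a GCC/Clang build with SSE3 enabled, e.g. `-msse3`). Each pointer must reference at least two complex floats, since the routine processes 128 bits per call:

#include <stdio.h>
#include <complex.h>

int main(void) {
    complex float a[2] = {1.0f + 2.0f*I, 3.0f + 4.0f*I};
    complex float b[2] = {0.5f + 0.5f*I, 1.0f - 1.0f*I};
    complex float c[2] = {2.0f, 1.0f*I};
    complex float r[2];

    simd_complex_mult(a, b, c, r);   /* r[k] = a[k]*b[k]*c[k] */
    printf("(%f, %f) (%f, %f)\n",
           crealf(r[0]), cimagf(r[0]), crealf(r[1]), cimagf(r[1]));
    return 0;
}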
Example #2
static __inline __m128 ZMUL2(__m128 a, __m128 b, __m128 sign)
{
#ifdef SSE3_
    // a = a1.r  a1.i  a2.r  a2.i
    // b = b1.r  b1.i  b2.r  b2.i
    __m128 ar;

    ar = _mm_moveldup_ps(a);        // ar = a1.r  a1.r  a2.r  a2.r
    a = _mm_movehdup_ps(a);         // a  = a1.i  a1.i  a2.i  a2.i
    ar = _mm_mul_ps(ar, b);         // ar = a1.r*b1.r  a1.r*b1.i  a2.r*b2.r  a2.r*b2.i
    
    b  = _mm_shuffle_ps(b, b, _MM_SHUFFLE(2, 3, 0, 1)); // b  = b1.i  b1.r  b2.i  b2.r
    a = _mm_mul_ps(a, b);           // ai = a1.i*b1.i  a1.i*b1.r  a2.i*b2.i  a2.i*b2.r

    return _mm_addsub_ps(ar, a);    // a1.r*b1.r-a1.i*b1.i  a1.r*b1.i+a1.i*b1.r  a2.r*b2.r-a2.i*b2.i  a2.r*b2.i+a2.i*b2.r
#else
    // a = a1.r  a1.i  a2.r  a2.i
    // b = b1.r  b1.i  b2.r  b2.i
    __m128 ar;

    ar = _mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 2, 0, 0));     // ar = a1.r  a1.r  a2.r  a2.r
    a  = _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 3, 1, 1));     // ai = a1.i  a1.i  a2.i  a2.i
    ar = _mm_mul_ps(ar, b);                                 // ar = +a1.r*b1.r  +a1.r*b1.i  +a2.r*b2.r  +a2.r*b2.i
    
    a  = _mm_xor_ps(a, sign);                             // ai = a1.i  -a1.i  a2.i  -a2.i
    a  = _mm_mul_ps(a, b);                                // ai = a1.i*b1.r  -a1.i*b1.i  a2.i*b2.r  -a2.i*b2.i
    a  = _mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 3, 0, 1));  // ai = -a1.i*b1.i  +a1.i*b1.r  -a2.i*b2.i  +a2.i*b2.r

    return _mm_add_ps(ar, a);   // a1.r*b1.r-a1.i*b1.i  a1.r*b1.i+a1.i*b1.r  a2.r*b2.r-a2.i*b2.i  a2.r*b2.i+a2.i*b2.r
#endif
}
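The `sign` mask is not constructed in this snippet. For the non-SSE3 path to negate the odd lanes as the comments show, it would plausibly be built as follows (an assumption; `zmul2_demo` is a hypothetical wrapper, not part of the original source):

#include <pmmintrin.h>

/* Hypothetical wrapper: builds the sign mask ZMUL2 expects.
 * XOR with -0.0f flips only the sign bit; _mm_set_ps lists lanes
 * from highest (3) to lowest (0), so this negates lanes 1 and 3. */
static __m128 zmul2_demo(__m128 a, __m128 b) {
    __m128 sign = _mm_set_ps(-0.0f, 0.0f, -0.0f, 0.0f);
    return ZMUL2(a, b, sign);   /* two complex products at once */
}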
Example #3
/* SSE implementation of the complex reciprocal: 1/a = conj(a) / |a|^2 */
inline __m128 srslte_mat_cf_recip_sse(__m128 a) {
  __m128 conj = _MM_CONJ_PS(a);
  /* a*a puts re^2 and im^2 in adjacent lanes; movehdup + moveldup
   * sums each pair, leaving |a|^2 duplicated into both lanes. */
  __m128 sqabs = _mm_mul_ps(a, a);
  sqabs = _mm_add_ps(_mm_movehdup_ps(sqabs), _mm_moveldup_ps(sqabs));

  /* approximate reciprocal (~12-bit precision) */
  __m128 recp = _mm_rcp_ps(sqabs);

  return _mm_mul_ps(recp, conj);
}
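`_MM_CONJ_PS` is a srsLTE macro whose definition is not shown here; a plausible stand-in (an assumption, not the library's verbatim code) conjugates both packed complex floats by flipping the sign bit of the imaginary lanes:

#include <pmmintrin.h>

/* Hypothetical definition: XOR the odd (imaginary) lanes with -0.0f. */
#define _MM_CONJ_PS(a) _mm_xor_ps((a), _mm_set_ps(-0.0f, 0.0f, -0.0f, 0.0f))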
Example #4
#include <stdio.h>
#include <pmmintrin.h>

int main(){
	__m128 A1, A2, A, B, C, B1, B2, D;
	/* two complex numbers per array: a = (1+2i, 3+4i), b = (0.1+0.2i, 0.3+0.4i) */
	float a[4] __attribute__((aligned(16))) = {1.0, 2.0, 3.0, 4.0};
	float b[4] __attribute__((aligned(16))) = {0.1, 0.2, 0.3, 0.4};
	
	A = _mm_load_ps(a);
	B = _mm_load_ps(b);
	A1 = _mm_moveldup_ps(A);	/* real parts duplicated */
	A2 = _mm_movehdup_ps(A);	/* imaginary parts duplicated */
	B1 = _mm_mul_ps(A1, B);
	B2 = _mm_mul_ps(A2, B);
	
	C = _mm_shuffle_ps(B2, B2, _MM_SHUFFLE(2, 3, 0, 1));	/* swap re/im pairs */
	D = _mm_addsub_ps(B1, C);	/* two complex products */
	_mm_store_ps(a, D);
	
	/* prints: (-0.300000, 0.400000) (-0.700000, 2.400000) */
	printf("(%f, %f) (%f, %f)\n", a[0], a[1], a[2], a[3]);
	return 0;
}
Example #5
__m128 test_mm_movehdup_ps(__m128 A) {
  // CHECK-LABEL: test_mm_movehdup_ps
  // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
  return _mm_movehdup_ps(A);
}
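The `shufflevector` mask `<1, 1, 3, 3>` fully specifies the operation; for illustration, a scalar equivalent of `_mm_movehdup_ps` (hypothetical helper, not part of the test file) is:

/* Scalar sketch of movehdup's <1, 1, 3, 3> lane selection. */
void movehdup_scalar(const float a[4], float r[4]) {
  r[0] = a[1];
  r[1] = a[1];
  r[2] = a[3];
  r[3] = a[3];
}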
Example #6
//-----------------------------------------------------------------------------------------
// SSE3 complex multiplication with different kernel sizes
//-----------------------------------------------------------------------------------------
// SSE3 2x complex multiplication (for details see example 6-9 in the Intel 64 and IA-32 Architectures Optimization Reference Manual)
// complex multiplication is defined as: (a+jb)*(c+jd) = a*c - b*d + j(a*d + b*c)
// z1 = a1*c1 - b1*d1 + j(a1*d1 + b1*c1)
// z2 = a2*c2 - b2*d2 + j(a2*d2 + b2*c2)
// X = { a1, b1, a2, b2 }   (two complex inputs, interleaved re/im)
// H = { c1, d1, c2, d2 }
// Z = { Re{z1}, Im{z1}, Re{z2}, Im{z2} } = { a1*c1 - b1*d1, a1*d1 + b1*c1, a2*c2 - b2*d2, a2*d2 + b2*c2 }
LXC_ERROR_CODE LXC_SSE3CpxMul_K2(uint Size, void *X, void *H, void *Z)
{
    if(!X || !H || !Z)
    {
        return LXC_ERR_INVALID_INPUT;
    }

    Size = Size*2;
    float *m_X = (float*)X;
    float *m_H = (float*)H;
    float *m_Z = (float*)Z;
    for(uint ii=0; ii < Size; ii+=4)
    {
        // load two complex numbers from each input
        __m128 val1 = _mm_load_ps(&m_X[ii]);    // val1 = { a1, b1, a2, b2 }
        __m128 val2 = _mm_load_ps(&m_H[ii]);    // val2 = { c1, d1, c2, d2 }

        // duplicate the real and the imaginary parts of X
        __m128 val3 = _mm_moveldup_ps(val1);    // val3 = { a1, a1, a2, a2 }
        __m128 val4 = _mm_movehdup_ps(val1);    // val4 = { b1, b1, b2, b2 }

        // multiply by H, once as loaded and once with re/im swapped
        val3 = _mm_mul_ps(val3, val2);          // { a1*c1, a1*d1, a2*c2, a2*d2 }
        val4 = _mm_mul_ps(val4, _mm_shuffle_ps(val2, val2, _MM_SHUFFLE(2,3,0,1)));
                                                // { b1*d1, b1*c1, b2*d2, b2*c2 }

        // addsub and store: { a1*c1 - b1*d1, a1*d1 + b1*c1, a2*c2 - b2*d2, a2*d2 + b2*c2 }
        _mm_store_ps(&m_Z[ii], _mm_addsub_ps(val3, val4));
    }

    return LXC_NO_ERR;
}
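A scalar reference for checking the kernel's output (hypothetical helper, assuming the same interleaved re/im layout):

// Hypothetical scalar reference: z[i] = x[i] * h[i] for n complex floats,
// each stored as { re, im } pairs, matching the SSE3 kernel above.
static void cpx_mul_ref(unsigned n, const float *x, const float *h, float *z)
{
    for (unsigned i = 0; i < 2*n; i += 2) {
        z[i]   = x[i]*h[i]   - x[i+1]*h[i+1];  // real part: a*c - b*d
        z[i+1] = x[i]*h[i+1] + x[i+1]*h[i];    // imag part: a*d + b*c
    }
}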
Example #7
File: fast.c  Project: CheeKinTANG/Skole
/* Complex single-precision GEMM: C = alpha*A*B + beta*C,
 * with A m x k, B k x n, C m x n, all row-major.
 * The column loop steps by 2, so n is assumed to be even. */
void gemm(complex float* A,
        complex float* B,
        complex float* C,
        int m,
        int n,
        int k,
        complex float alpha,
        complex float beta){

    __m128  c_reg, 
            a_reg,
            b_reg,
            alpha_reg,
            beta_reg,
            t,
            t2,
            t3;

    complex float *beta_reg_value = malloc(sizeof(complex float)*2);
    beta_reg_value[0] = beta;
    beta_reg_value[1] = beta;
    beta_reg = _mm_loadu_ps((float*)beta_reg_value);
    
    complex float *alpha_reg_value = malloc(sizeof(complex float)*2);
    alpha_reg_value[0] = alpha;
    alpha_reg_value[1] = alpha;
    alpha_reg = _mm_loadu_ps((float*)alpha_reg_value);
    

    complex float *a_value = malloc(sizeof(complex float)*2);



    for(int x = 0; x < n; x += 2){
        for(int y = 0; y < m; y++){
            t3 = _mm_setzero_ps();
            for(int z = 0; z < k; z++){
                // t3 += A[y*k+z] * B[z*n + x .. x+1]
                a_value[0] = A[y*k + z];
                a_value[1] = a_value[0];                    // duplicate into both lanes

                a_reg = _mm_loadu_ps((float*)a_value);
                b_reg = _mm_loadu_ps((float*)&B[z*n + x]);

                t = _mm_moveldup_ps(a_reg);                 // real parts of A
                t2 = _mm_mul_ps(t, b_reg);
                b_reg = _mm_shuffle_ps(b_reg, b_reg, 0xb1); // swap re/im pairs
                t = _mm_movehdup_ps(a_reg);                 // imaginary parts of A
                t = _mm_mul_ps(t, b_reg);
                a_reg = _mm_addsub_ps(t2, t);               // two complex products

                t3 = _mm_add_ps(t3, a_reg);                 // accumulate the dot product
            }
            // c_reg = beta * C[y*n + x .. x+1]
            c_reg = _mm_loadu_ps((float*)&C[y*n + x]);
            t = _mm_moveldup_ps(c_reg);
            t2 = _mm_mul_ps(t, beta_reg);
            beta_reg = _mm_shuffle_ps(beta_reg, beta_reg, 0xb1);
            t = _mm_movehdup_ps(c_reg);
            t = _mm_mul_ps(t, beta_reg);
            c_reg = _mm_addsub_ps(t2, t);
            beta_reg = _mm_shuffle_ps(beta_reg, beta_reg, 0xb1);    // restore lane order

            // b_reg = alpha * (A*B dot product)
            t = _mm_moveldup_ps(t3);
            t2 = _mm_mul_ps(t, alpha_reg);
            alpha_reg = _mm_shuffle_ps(alpha_reg, alpha_reg, 0xb1);
            t = _mm_movehdup_ps(t3);
            t = _mm_mul_ps(t, alpha_reg);
            b_reg = _mm_addsub_ps(t2, t);
            alpha_reg = _mm_shuffle_ps(alpha_reg, alpha_reg, 0xb1); // restore lane order

            c_reg = _mm_add_ps(b_reg, c_reg);               // alpha*A*B + beta*C
            _mm_storeu_ps((float*)&C[y*n + x], c_reg);
        }
    }
    free(beta_reg_value);
    free(alpha_reg_value);
    free(a_value);
}
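A minimal call sketch (hypothetical sizes and values; alpha = 1 and beta = 0 reduce it to C = A*B):

#include <complex.h>

int main(void) {
    enum { M = 2, K = 2, N = 2 };
    complex float A[M*K] = { 1.0f + 1.0f*I, 2.0f, 0.0f, 1.0f - 1.0f*I };
    complex float B[K*N] = { 1.0f, 0.0f, 0.0f, 1.0f };   /* identity */
    complex float C[M*N] = { 0.0f };

    gemm(A, B, C, M, N, K, 1.0f, 0.0f);   /* C = A * B = A */
    return 0;
}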
Example #8
int sse3_ChirpData_ak8(
    sah_complex * cx_DataArray,
    sah_complex * cx_ChirpDataArray,
    int chirp_rate_ind,
    double chirp_rate,
    int  ul_NumDataPoints,
    double sample_rate
) {
#ifdef USE_MANUAL_CALLSTACK
    call_stack.enter("sse3_ChirpData_ak8()");
#endif 
    int i;

    if (chirp_rate_ind == 0) {
      memcpy(cx_ChirpDataArray, cx_DataArray,  (int)ul_NumDataPoints * sizeof(sah_complex)  );
#ifdef USE_MANUAL_CALLSTACK
      call_stack.exit();
#endif 
      return 0;
    }

    int vEnd;
    double srate = chirp_rate * 0.5 / (sample_rate * sample_rate);
    __m128d rate = _mm_set1_pd(srate);
    __m128d roundVal = _mm_set1_pd(srate >= 0.0 ? TWO_TO_52 : -TWO_TO_52);
    __m128d DFOUR = _mm_set1_pd(4.0);


    // main vectorised loop
    vEnd = ul_NumDataPoints - (ul_NumDataPoints & 3);
    __m128d di1 = _mm_set_pd(2.0, 0.0);                 // set time patterns for eventual moveldup/movehdup
    __m128d di2 = _mm_set_pd(3.0, 1.0);

    for (i = 0; i < vEnd; i += 4) {
      const float *d = (const float *) (cx_DataArray + i);
      float *cd = (float *) (cx_ChirpDataArray + i);

      __m128d a1, a2;

      __m128 d1, d2;
      __m128 cd1, cd2;
      __m128 td1, td2;

      __m128 x;
      __m128 y;
      __m128 z;
      __m128 s;
      __m128 c;
      __m128 m;

      // load the signal to be chirped
      d1 = _mm_load_ps(d);
      d2 = _mm_load_ps(d+4);

      // calculate the input angle
      a1 = _mm_mul_pd(_mm_mul_pd(di1, di1), rate);
      a2 = _mm_mul_pd(_mm_mul_pd(di2, di2), rate);

      // update times for next
      di1 = _mm_add_pd(di1, DFOUR);
      di2 = _mm_add_pd(di2, DFOUR);

      // reduce the angle to the range (-0.5, 0.5)
      a1 = _mm_sub_pd(a1, _mm_sub_pd(_mm_add_pd(a1, roundVal), roundVal));
      a2 = _mm_sub_pd(a2, _mm_sub_pd(_mm_add_pd(a2, roundVal), roundVal));

      // convert pair of packed double into packed single
      x = _mm_movelh_ps(_mm_cvtpd_ps(a1), _mm_cvtpd_ps(a2));               // 3   1   2   0

      // square to the range [0, 0.25)
      y = _mm_mul_ps(x, x);

      // perform the initial polynomial approximations, Estrin's method
      z = _mm_mul_ps(y, y);

      s = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(y, SS4F),
                                                      SS3F),
                                           z),
                                _mm_add_ps(_mm_mul_ps(y, SS2F),
                                           SS1F)),
                     x);
      c = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(y, CC3F),
                                           CC2F),
                                z),
                     _mm_add_ps(_mm_mul_ps(y, CC1F),
                                ONE));

      // perform first angle doubling
      x = _mm_sub_ps(_mm_mul_ps(c, c), _mm_mul_ps(s, s));
      y = _mm_mul_ps(_mm_mul_ps(s, c), TWO);

      // calculate scaling factor to correct the magnitude
      m = _mm_sub_ps(_mm_sub_ps(TWO, _mm_mul_ps(x, x)), _mm_mul_ps(y, y));

      // perform second angle doubling
      c = _mm_sub_ps(_mm_mul_ps(x, x), _mm_mul_ps(y, y));
      s = _mm_mul_ps(_mm_mul_ps(y, x), TWO);

      // correct the magnitude (final sine / cosine approximations)
      c = _mm_mul_ps(c, m);                                       // c3    c1    c2    c0
      s = _mm_mul_ps(s, m);

      // chirp the data
      cd1 = _mm_moveldup_ps(c);                                   // c1    c1    c0    c0
      cd2 = _mm_movehdup_ps(c);                                   // c3    c3    c2    c2
      cd1 = _mm_mul_ps(cd1, d1);                                  // c1.i1 c1.r1 c0.i0 c0.r0
      cd2 = _mm_mul_ps(cd2, d2);                                  // c3.i3 c3.r3 c2.i2 c2.r2
      d1 = _mm_shuffle_ps(d1, d1, 0xb1);
      d2 = _mm_shuffle_ps(d2, d2, 0xb1);
      td1 = _mm_moveldup_ps(s);
      td2 = _mm_movehdup_ps(s);
      td1 = _mm_mul_ps(td1, d1);
      td2 = _mm_mul_ps(td2, d2);
      cd1 = _mm_addsub_ps(cd1, td1);
      cd2 = _mm_addsub_ps(cd2, td2);

      // store chirped values
      _mm_stream_ps(cd, cd1);
      _mm_stream_ps(cd+4, cd2);
    }

    // handle tail elements with scalar code
    for (; i < ul_NumDataPoints; ++i) {
      double angle = srate * i * i;   // angle in turns, matching the vector loop
      angle -= floor(angle);          // reduce to [0, 1)
      angle *= M_PI * 2.0;            // convert turns to radians
      double s = sin(angle);
      double c = cos(angle);

      float re = cx_DataArray[i][0];
      float im = cx_DataArray[i][1];

      cx_ChirpDataArray[i][0] = re * c - im * s;
      cx_ChirpDataArray[i][1] = re * s + im * c;
    }
    analysis_state.FLOP_counter+=12.0*ul_NumDataPoints;
#ifdef USE_MANUAL_CALLSTACK
    call_stack.exit();
#endif 
    return 0;
}
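The `roundVal` reduction in the vectorised loop relies on the classic 2^52 add-and-subtract trick: adding a sign-matched 2^52 pushes the fraction bits out of the double, so `(a + roundVal) - roundVal` is `a` rounded to the nearest integer, and subtracting that from `a` leaves the fractional part in [-0.5, 0.5]. A standalone sketch (assumes the default round-to-nearest FP mode; `round_via_2to52` is a hypothetical helper):

#include <stdio.h>

/* Round to the nearest integer via the 2^52 trick used by roundVal above.
 * Requires |a| < 2^51 and round-to-nearest mode; the constant must carry
 * the same sign as 'a' so the sum lands where the spacing (ulp) is 1.0. */
static double round_via_2to52(double a) {
    const double TWO_TO_52 = 4503599627370496.0;   /* 2^52 */
    double roundVal = (a >= 0.0) ? TWO_TO_52 : -TWO_TO_52;
    volatile double t = a + roundVal;   /* fraction bits are rounded away here */
    return t - roundVal;
}

int main(void) {
    printf("%.1f %.1f %.1f\n",
           round_via_2to52(2.3),     /* 2.0 */
           round_via_2to52(2.7),     /* 3.0 */
           round_via_2to52(-2.7));   /* -3.0 */
    return 0;
}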