Example #1
// Multiply 2 matrices together
Mat44 Mat44::Mult(const Mat44 &m) const
{
	Mat44 a = Transpose();
	Mat44 b = m.Transpose();
	Mat44 out;

	__m128 a_line, b_line, r_line;
	for(c8 i=0;i<16;i+=4)
	{
		// First column
		a_line = _mm_load_ps(a.mat);
		b_line = _mm_load1_ps(&b.mat[i]);
		r_line = _mm_mul_ps(a_line, b_line);

		// Second column
		a_line = _mm_load_ps(&a.mat[4]);
		b_line = _mm_load1_ps(&b.mat[i+1]);
		r_line = _mm_add_ps(_mm_mul_ps(a_line, b_line), r_line);

		// Third column
		a_line = _mm_load_ps(&a.mat[8]);
		b_line = _mm_load1_ps(&b.mat[i+2]);
		r_line = _mm_add_ps(_mm_mul_ps(a_line, b_line), r_line);

		// Last column
		a_line = _mm_load_ps(&a.mat[12]);
		b_line = _mm_load1_ps(&b.mat[i+3]);
		r_line = _mm_add_ps(_mm_mul_ps(a_line, b_line), r_line);

		_mm_store_ps(&out.mat[i], r_line);
	}

	return out.Transpose();
};
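Example #1 transposes both operands so every result element can be formed by broadcasting one scalar with _mm_load1_ps and accumulating it against a packed line. A minimal self-contained sketch of the same broadcast-and-accumulate idiom on plain row-major float[16] arrays (a hypothetical helper, not part of the Mat44 class above):

#include <xmmintrin.h>

// Sketch: one row of out = a * b for 4x4 row-major matrices, formed by
// broadcasting each a[r][k] and accumulating it against row k of b.
static void mat4_mul_row(const float a[16], const float b[16], float out[16], int r)
{
	__m128 acc = _mm_mul_ps(_mm_load1_ps(&a[r*4 + 0]), _mm_loadu_ps(&b[0]));
	acc = _mm_add_ps(acc, _mm_mul_ps(_mm_load1_ps(&a[r*4 + 1]), _mm_loadu_ps(&b[4])));
	acc = _mm_add_ps(acc, _mm_mul_ps(_mm_load1_ps(&a[r*4 + 2]), _mm_loadu_ps(&b[8])));
	acc = _mm_add_ps(acc, _mm_mul_ps(_mm_load1_ps(&a[r*4 + 3]), _mm_loadu_ps(&b[12])));
	_mm_storeu_ps(&out[r*4], acc);
}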
Example #2
void xcorr_kernel_sse(const opus_val16 *x, const opus_val16 *y, opus_val32 sum[4], int len)
{
   int j;
   __m128 xsum1, xsum2;
   xsum1 = _mm_loadu_ps(sum);
   xsum2 = _mm_setzero_ps();

   for (j = 0; j < len-3; j += 4)
   {
      __m128 x0 = _mm_loadu_ps(x+j);
      __m128 yj = _mm_loadu_ps(y+j);
      __m128 y3 = _mm_loadu_ps(y+j+3);

      xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(_mm_shuffle_ps(x0,x0,0x00),yj));
      xsum2 = _mm_add_ps(xsum2,_mm_mul_ps(_mm_shuffle_ps(x0,x0,0x55),
                                          _mm_shuffle_ps(yj,y3,0x49)));
      xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(_mm_shuffle_ps(x0,x0,0xaa),
                                          _mm_shuffle_ps(yj,y3,0x9e)));
      xsum2 = _mm_add_ps(xsum2,_mm_mul_ps(_mm_shuffle_ps(x0,x0,0xff),y3));
   }
   if (j < len)
   {
      xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(_mm_load1_ps(x+j),_mm_loadu_ps(y+j)));
      if (++j < len)
      {
         xsum2 = _mm_add_ps(xsum2,_mm_mul_ps(_mm_load1_ps(x+j),_mm_loadu_ps(y+j)));
         if (++j < len)
         {
            xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(_mm_load1_ps(x+j),_mm_loadu_ps(y+j)));
         }
      }
   }
   _mm_storeu_ps(sum,_mm_add_ps(xsum1,xsum2));
}
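For reference, a scalar sketch of what the vectorized loop above accumulates (assumed semantics, matching the four partial sums): sum[k] gathers the correlation of x against y shifted by k, for the four lags k = 0..3.

/* Scalar reference (assumed semantics of the SSE kernel above). */
static void xcorr_kernel_scalar(const float *x, const float *y, float sum[4], int len)
{
   for (int j = 0; j < len; j++)
   {
      sum[0] += x[j]*y[j];
      sum[1] += x[j]*y[j+1];
      sum[2] += x[j]*y[j+2];
      sum[3] += x[j]*y[j+3];
   }
}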
Example #3
void comb_filter_const_sse(opus_val32 *y, opus_val32 *x, int T, int N,
      opus_val16 g10, opus_val16 g11, opus_val16 g12)
{
   int i;
   __m128 x0v;
   __m128 g10v, g11v, g12v;
   g10v = _mm_load1_ps(&g10);
   g11v = _mm_load1_ps(&g11);
   g12v = _mm_load1_ps(&g12);
   x0v = _mm_loadu_ps(&x[-T-2]);
   for (i=0;i<N-3;i+=4)
   {
      __m128 yi, yi2, x1v, x2v, x3v, x4v;
      const opus_val32 *xp = &x[i-T-2];
      yi = _mm_loadu_ps(x+i);
      x4v = _mm_loadu_ps(xp+4);
#if 0
      /* Slower version with all loads */
      x1v = _mm_loadu_ps(xp+1);
      x2v = _mm_loadu_ps(xp+2);
      x3v = _mm_loadu_ps(xp+3);
#else
      x2v = _mm_shuffle_ps(x0v, x4v, 0x4e);
      x1v = _mm_shuffle_ps(x0v, x2v, 0x99);
      x3v = _mm_shuffle_ps(x2v, x4v, 0x99);
#endif

      yi = _mm_add_ps(yi, _mm_mul_ps(g10v,x2v));
#if 0 /* Set to 1 to make it bit-exact with the non-SSE version */
      yi = _mm_add_ps(yi, _mm_mul_ps(g11v,_mm_add_ps(x3v,x1v)));
      yi = _mm_add_ps(yi, _mm_mul_ps(g12v,_mm_add_ps(x4v,x0v)));
#else
      /* Use partial sums */
      yi2 = _mm_add_ps(_mm_mul_ps(g11v,_mm_add_ps(x3v,x1v)),
                       _mm_mul_ps(g12v,_mm_add_ps(x4v,x0v)));
      yi = _mm_add_ps(yi, yi2);
#endif
      x0v=x4v;
      _mm_storeu_ps(y+i, yi);
   }
#ifdef CUSTOM_MODES
   for (;i<N;i++)
   {
      y[i] = x[i]
               + MULT16_32_Q15(g10,x[i-T])
               + MULT16_32_Q15(g11,ADD32(x[i-T+1],x[i-T-1]))
               + MULT16_32_Q15(g12,ADD32(x[i-T+2],x[i-T-2]));
   }
#endif
}
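The #else branch above builds the three overlapping four-sample windows from registers that are already live instead of issuing three extra unaligned loads. A small self-contained sketch of what each shuffle assembles (illustrative values; lane 0 listed first):

#include <assert.h>
#include <xmmintrin.h>

static void shuffle_window_demo(void)
{
   float x[8] = {0, 1, 2, 3, 4, 5, 6, 7}, r[4];
   __m128 x0v = _mm_loadu_ps(x), x4v = _mm_loadu_ps(x+4);
   __m128 x2v = _mm_shuffle_ps(x0v, x4v, 0x4e);   /* {2,3,4,5}: window at offset +2 */
   __m128 x1v = _mm_shuffle_ps(x0v, x2v, 0x99);   /* {1,2,3,4}: window at offset +1 */
   __m128 x3v = _mm_shuffle_ps(x2v, x4v, 0x99);   /* {3,4,5,6}: window at offset +3 */
   _mm_storeu_ps(r, x1v); assert(r[0] == 1 && r[3] == 4);
   _mm_storeu_ps(r, x3v); assert(r[0] == 3 && r[3] == 6);
}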
Example #4
void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_4_old(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[])
{
	__m128 xmm0, xmm2, xmm5;

	(void) lag;
	FLAC__ASSERT(lag > 0);
	FLAC__ASSERT(lag <= 4);
	FLAC__ASSERT(lag <= data_len);
	FLAC__ASSERT(data_len > 0);

	xmm5 = _mm_setzero_ps();

	xmm0 = _mm_load_ss(data++);
	xmm2 = xmm0;
	xmm0 = _mm_shuffle_ps(xmm0, xmm0, 0);

	xmm0 = _mm_mul_ps(xmm0, xmm2);
	xmm5 = _mm_add_ps(xmm5, xmm0);

	data_len--;

	while(data_len)
	{
		xmm0 = _mm_load1_ps(data++);

		xmm2 = _mm_shuffle_ps(xmm2, xmm2, _MM_SHUFFLE(2,1,0,3));
		xmm2 = _mm_move_ss(xmm2, xmm0);
		xmm0 = _mm_mul_ps(xmm0, xmm2);
		xmm5 = _mm_add_ps(xmm5, xmm0);

		data_len--;
	}

	_mm_storeu_ps(autoc, xmm5);
}
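A scalar sketch of the quantity the rotate-and-insert loop above accumulates, assuming the usual autocorrelation definition (terms with i - k < 0 treated as zero, which matches the zero-initialized lanes) and taking FLAC__real as float as in the example:

/* Scalar reference (assumed semantics): autoc[k] = sum over i of data[i]*data[i-k], k = 0..3. */
static void autocorrelation_lag4_scalar(const float data[], unsigned data_len, float autoc[4])
{
	for (unsigned k = 0; k < 4; k++) {
		float acc = 0.0f;
		for (unsigned i = k; i < data_len; i++)
			acc += data[i]*data[i-k];
		autoc[k] = acc;
	}
}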
Example #5
void sgemm( int m, int n, int d, float *A, float *C )
{
    #pragma omp parallel
    {
    #pragma vectorize unroll optimize("", on)
    for( int j = 0; j < n; j++ ) {
        #pragma vectorize unroll optimize("", on)
        for( int k = 0; k < m; k++ ) {
            __m128 a1;
            a1 = _mm_load1_ps(A+j*(n+1)+k*(n));
            int i;
            int ntmp = n/8*8;
            #pragma vectorize optimize("", on)
			#pragma omp for private(i)
            for(i = 0; i < ntmp; i += 8 ) {
                float *ikn = A+i+k*(n);
                float *ijn = C+i+j*n;
                __m128 a2 = _mm_loadu_ps(ikn);
                __m128 a3 = _mm_loadu_ps(ikn+4);
                __m128 mulres = _mm_mul_ps(a1, a2);
                __m128 mulres2 = _mm_mul_ps(a1, a3);
                __m128 sum = _mm_add_ps(_mm_loadu_ps(ijn), mulres);
                __m128 sum3 = _mm_add_ps(_mm_loadu_ps(ijn+4), mulres2);

                _mm_storeu_ps(ijn, sum);
                _mm_storeu_ps(ijn+4, sum3);
            }
			// #pragma omp for
            for (i = i; i < n; i++) {
                C[i+j*n] += A[i+k*n] * A[j*(n+1)+k*n];
            }
      }
    }
}
}
Example #6
LXC_ERROR_CODE LXC_SSE3FreqCombine2Ch(uint Size, void *X, void *Y, void *Z)
{
    if(!Size || !X || !Y || !Z)
    {
        return LXC_ERR_INVALID_INPUT;
    }

    float *m_X = (float*)X;
    float *m_Y = (float*)Y;
    float *m_Z = (float*)Z;

#if defined(TARGET_WINDOWS)
    const __declspec(align(LXC_SSE3_ALIGN)) float  scaleFactor = 1.0f / ((float)Size);
#else
    const float  scaleFactor = 1.0f / ((float)Size);
#endif
    Size = Size*2;
    __m128 _scale = _mm_load1_ps(&scaleFactor);

    for(uint ii = 0; ii < Size; ii+=4)
    {
        //m_Z[ii][0] = (m_X[ii][0] - m_Y[ii][1])*scaleFactor;
        //m_Z[ii][1] = (m_X[ii][1] + m_Y[ii][0])*scaleFactor;
        //m_Z[ii][0] = (m_X[ii+1][0] - m_Y[ii+1][1])*scaleFactor;
        //m_Z[ii][1] = (m_X[ii+1][1] + m_Y[ii+1][0])*scaleFactor;
        //__m128 A = _mm_load_ps(&m_X[ii]);
        __m128 B = _mm_load_ps(&m_Y[ii]);
        B = _mm_shuffle_ps(B, B, LXC_MM_SHUFFLE(1,0,3,2));
        __m128 addRes = _mm_addsub_ps (_mm_load_ps(&m_X[ii]), B);
        _mm_store_ps(&m_Z[ii], _mm_mul_ps(addRes, _scale));
    }


    return LXC_NO_ERR;
}
Example #7
void shz::math::matrix<shz::math::f32, 4, 4>::mul(const shz::math::f32* left, const shz::math::f32 value, shz::math::f32 *target){
	__m128 b = _mm_load1_ps(&value);
	for(size_t i=0; i < 4; ++i){
		__m128 a = _mm_load_ps(left);
		__m128 r = _mm_mul_ps(a, b);
		_mm_store_ps(target, r);
		left+=4; target+=4;
	}
}
Example #8
static void 
mexsoftmax(float* y, float* shift, mwSize m, mwSize n) {
  __m128 i1, i2;
  __m128 o1, o2;
 
  while (m>0)
    {
      mwSize curn = n;
      float sum = 0.0f;
      declconst128(zero, 0.0f);
      
      while (curn>0 && ((unsigned long)(y+curn) & 15) != 0)
        {
          --curn;
          y[curn]=fastexp(y[curn]-*shift);
          sum += y[curn];
        }

      __m128 s1 = _mm_load1_ps (shift);
      __m128 sum1 = zero;

      while (curn>7) {
        i1 = _mm_load_ps (y+curn-4);
        i2 = _mm_load_ps (y+curn-8);
        i1 = _mm_sub_ps (i1, s1);
        i2 = _mm_sub_ps (i2, s1);
        o1 = vfastexp(i1);
        o2 = vfastexp(i2);
        _mm_store_ps (y+curn-4, o1);
        sum1 = _mm_add_ps (sum1, o1);
        _mm_store_ps (y+curn-8, o2);
        sum1 = _mm_add_ps (sum1, o2);
        curn-=8;
      }

      sum1 = _mm_hadd_ps (sum1, sum1);
      sum1 = _mm_hadd_ps (sum1, sum1);
      sum += _mm_cvtss_f32 (sum1);
     
      while(curn>0) {
        --curn;
        y[curn]=fastexp(y[curn]-*shift);
        sum += y[curn];
      }

      sum = 1.0f / sum;

      ptrdiff_t n_pdt = n;
      ptrdiff_t one_pdt = 1;

      sscal (&n_pdt, &sum, y, &one_pdt);

      ++shift;
      y+=n;
      --m;
    }
}
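The routine above uses the common peel/vector/tail structure: scalar-process elements until the pointer is 16-byte aligned, run the aligned SSE body, then finish the remainder in scalar code. A minimal sketch of the same alignment-peeling idiom on a simpler in-place scaling operation (independent of the fastexp/sscal/declconst128 helpers above; the peel here runs from the front, whereas the example walks backwards):

#include <stddef.h>
#include <stdint.h>
#include <xmmintrin.h>

static void scale_inplace(float* p, size_t n, float s)
{
  size_t i = 0;
  while (i < n && (((uintptr_t)(p + i)) & 15) != 0)   /* peel until 16-byte aligned */
    p[i++] *= s;
  __m128 vs = _mm_load1_ps(&s);
  for (; i + 4 <= n; i += 4)                          /* aligned SSE body */
    _mm_store_ps(p + i, _mm_mul_ps(_mm_load_ps(p + i), vs));
  for (; i < n; i++)                                  /* scalar tail */
    p[i] *= s;
}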
Example #9
void fast(element_t * const elements, const int num_elts, const float a) {
    element_t * elts = elements;
    float logf_a = logf(a);
    float logf_1_a = logf(1.0/a);
    v4sf log_a = _mm_load1_ps(&logf_a);
    v4sf log_1_a = _mm_load1_ps(&logf_1_a);
    assert(num_elts % 3 == 0); // operates on 3 elements at a time

    // elts->re = powf((powf(elts->x, a) + powf(elts->y, a) + powf(elts->z, a)), 1.0/a);
    for (int i = 0; i < num_elts; i += 3) {
        // transpose
        // we save one operation over _MM_TRANSPOSE4_PS by skipping the last row of output
        v4sf r0 = _mm_load_ps(&elts[0].x); // x1,y1,z1,0
        v4sf r1 = _mm_load_ps(&elts[1].x); // x2,y2,z2,0
        v4sf r2 = _mm_load_ps(&elts[2].x); // x3,y3,z3,0
        v4sf r3 = _mm_setzero_ps();        // 0, 0, 0, 0
        v4sf t0 = _mm_unpacklo_ps(r0, r1); //  x1,x2,y1,y2
        v4sf t1 = _mm_unpacklo_ps(r2, r3); //  x3,0, y3,0
        v4sf t2 = _mm_unpackhi_ps(r0, r1); //  z1,z2,0, 0
        v4sf t3 = _mm_unpackhi_ps(r2, r3); //  z3,0, 0, 0
        r0 = _mm_movelh_ps(t0, t1);        // x1,x2,x3,0
        r1 = _mm_movehl_ps(t1, t0);        // y1,y2,y3,0
        r2 = _mm_movelh_ps(t2, t3);        // z1,z2,z3,0
        // perform pow(x,a),.. using the fact that pow(x,a) = exp(x * log(a))
        v4sf r0a = _mm_mul_ps(r0, log_a); // x1*log(a), x2*log(a), x3*log(a), 0
        v4sf r1a = _mm_mul_ps(r1, log_a); // y1*log(a), y2*log(a), y3*log(a), 0
        v4sf r2a = _mm_mul_ps(r2, log_a); // z1*log(a), z2*log(a), z3*log(a), 0
        v4sf ex0 = exp_ps(r0a); // pow(x1, a), ..., 0
        v4sf ex1 = exp_ps(r1a); // pow(y1, a), ..., 0
        v4sf ex2 = exp_ps(r2a); // pow(z1, a), ..., 0
        // sum
        v4sf s1 = _mm_add_ps(ex0, ex1);
        v4sf s2 = _mm_add_ps(s1, ex2);
        // pow(sum, 1/a) = exp(sum * log(1/a))
        v4sf ps = _mm_mul_ps(s2, log_1_a);
        v4sf es = exp_ps(ps);
        ALIGN16_BEG float re[4] ALIGN16_END;
        _mm_store_ps(re, es);
        elts[0].re = re[0];
        elts[1].re = re[1];
        elts[2].re = re[2];
        elts += 3;
    }
}
Example #10
// Multiply matrix by scaling factor
Mat44 Mat44::Mult(const f32 scale) const
{
	__m128 mat1rows[] = { _mm_load_ps(mat), _mm_load_ps(mat+4), _mm_load_ps(mat+8), _mm_load_ps(mat+12) };
	__m128 splatMult = _mm_load1_ps(&scale);

	Mat44 res;
	_mm_store_ps(res.mat   , _mm_mul_ps(mat1rows[0], splatMult));
	_mm_store_ps(res.mat+4 , _mm_mul_ps(mat1rows[1], splatMult));
	_mm_store_ps(res.mat+8 , _mm_mul_ps(mat1rows[2], splatMult));
	_mm_store_ps(res.mat+12, _mm_mul_ps(mat1rows[3], splatMult));
	return res;
};
Example #11
// Add v to "this"
Mat44 Mat44::Add(const f32 v) const
{
	__m128 mat1rows[] = { _mm_load_ps(mat), _mm_load_ps(mat+4), _mm_load_ps(mat+8), _mm_load_ps(mat+12) };
	__m128 splatAdd = _mm_load1_ps(&v);

	Mat44 res;
	_mm_store_ps(res.mat   , _mm_add_ps(mat1rows[0], splatAdd));
	_mm_store_ps(res.mat+4 , _mm_add_ps(mat1rows[1], splatAdd));
	_mm_store_ps(res.mat+8 , _mm_add_ps(mat1rows[2], splatAdd));
	_mm_store_ps(res.mat+12, _mm_add_ps(mat1rows[3], splatAdd));
	return res;
};
Example #12
void R_LocalPointToGlobal( const float modelMatrix[16], const idVec3 &in, idVec3 &out ) {
#if defined(MACOS_X) && defined(__i386__)
	__m128 m0, m1, m2, m3;
	__m128 in0, in1, in2;
	float i0,i1,i2;
	i0 = in[0];
	i1 = in[1];
	i2 = in[2];
	
	m0 = _mm_loadu_ps(&modelMatrix[0]);
	m1 = _mm_loadu_ps(&modelMatrix[4]);
	m2 = _mm_loadu_ps(&modelMatrix[8]);
	m3 = _mm_loadu_ps(&modelMatrix[12]);
	
	in0 = _mm_load1_ps(&i0);
	in1 = _mm_load1_ps(&i1);
	in2 = _mm_load1_ps(&i2);
	
	m0 = _mm_mul_ps(m0, in0);
	m1 = _mm_mul_ps(m1, in1);
	m2 = _mm_mul_ps(m2, in2);

	m0 = _mm_add_ps(m0, m1);
	m0 = _mm_add_ps(m0, m2);
	m0 = _mm_add_ps(m0, m3);
	
	_mm_store_ss(&out[0], m0);
	m1 = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(m0), 0x55));
	_mm_store_ss(&out[1], m1);
	m2 = _mm_movehl_ps(m2, m0);
	_mm_store_ss(&out[2], m2);
#else	
	out[0] = in[0] * modelMatrix[0] + in[1] * modelMatrix[4]
		+ in[2] * modelMatrix[8] + modelMatrix[12];
	out[1] = in[0] * modelMatrix[1] + in[1] * modelMatrix[5]
		+ in[2] * modelMatrix[9] + modelMatrix[13];
	out[2] = in[0] * modelMatrix[2] + in[1] * modelMatrix[6]
		+ in[2] * modelMatrix[10] + modelMatrix[14];
#endif
}
Example #13
LXC_ERROR_CODE LXC_SSE3CpxAdd(LXC_BUFFER *ResultBuffer, float ScaleFactor)
{
    if(!ResultBuffer)
    {
        return LXC_ERR_INVALID_INPUT;
    }

    const uint partSize = ResultBuffer->maxFilterPartLength;
    const uint maxParts = ResultBuffer->maxFilterParts;
    const uint size = partSize*2;

    float *Z0 = (float*)LXC_SSE3Buffer_getPart(ResultBuffer, 0);
    for(uint part=1; part < maxParts; part++)
    {
        float *Zi = (float*)LXC_SSE3Buffer_getPart(ResultBuffer, part);
        for(uint ii=0; ii < size; ii+=4)
        {
            //Z0[ii][0] += Zi[ii][0];
            //Z0[ii][1] += Zi[ii][1];
            //__m128 _Zi = _mm_load_ps(&Zi[ii]);
            //__m128 _Z0 = _mm_load_ps(&Z0[ii]);
            //_mm_store_ps(&Z0[ii], _mm_add_ps(_Z0, _Zi));

            //__m128 _Zi = _mm_load_ps(&Zi[ii]);
            //__m128 _Z0 = _mm_load_ps(&Z0[ii]);
            _mm_store_ps(&Z0[ii], _mm_add_ps(_mm_load_ps(&Z0[ii]), _mm_load_ps(&Zi[ii])));
        }
    }

    if (ScaleFactor != 1.0f)
    {
        //const LXC_SSE3Float scaleFactor = 1.0f / ((float)partSize);
        const LXC_SSE3Float scaleFactor = ScaleFactor;
        __m128 _scale = _mm_load1_ps(&scaleFactor);
        for(uint ii=0; ii < size; ii+=4)
        {
            //Z0[ii][0] *= scaleFactor;
            //Z0[ii][1] *= scaleFactor;
            __m128 _Z0 = _mm_load_ps(&Z0[ii]);
            _mm_store_ps(&Z0[ii], _mm_mul_ps(_Z0, _scale));
        }
    }


    return LXC_NO_ERR;
}
Example #14
void LOADERDECL TexCoord_ReadIndex_Short2_SSE4()
{
	static_assert(!std::numeric_limits<I>::is_signed, "Only unsigned I is sane!");

	// Heavy in ZWW
	auto const index = DataRead<I>();
	const s32 *pData = (const s32*)(cached_arraybases[ARRAY_TEXCOORD0+tcIndex] + (index * g_main_cp_state.array_strides[ARRAY_TEXCOORD0+tcIndex]));
	const __m128i a = _mm_cvtsi32_si128(*pData);
	const __m128i b = _mm_shuffle_epi8(a, kMaskSwap16_2);
	const __m128i c = _mm_cvtepi16_epi32(b);
	const __m128 d = _mm_cvtepi32_ps(c);
	const __m128 e = _mm_load1_ps(&tcScale[tcIndex]);
	const __m128 f = _mm_mul_ps(d, e);
	_mm_storeu_ps((float*)VertexManager::s_pCurBufferPointer, f);
	VertexManager::s_pCurBufferPointer += sizeof(float) * 2;
	LOG_TEX<2>();
	tcIndex++;
}
Example #15
double CHellingerKernel<float>::Evaluate(float* x, float* y) {


#ifndef __SSE4_1__

    float result = 0;

    for(size_t i=0; i<m_n; i++)
        result += sqrt(x[i]*y[i]);

    return static_cast<double>(result);

#else

    __m128* px = (__m128*)x;
    __m128* py = (__m128*)y;

    float zero = 0;
    __m128 sum = _mm_load1_ps(&zero);

    for(int i=0; i<m_offset/4; i++) {

        __m128 temp = _mm_mul_ps(px[i],py[i]);
        temp = _mm_sqrt_ps(temp);
        sum = _mm_add_ps(sum,temp);

    }

    float result[4] = {0,0,0,0};
    _mm_storeu_ps(result,sum);

    float fresult  = result[0] + result[1] + result[2] + result[3];

    // add offset
    for(size_t i=m_offset; i<m_n; i++)
        fresult += sqrt(x[i]*y[i]);

    return static_cast<double>(fresult);

#endif

}
Example #16
double CIntersectionKernel<float>::Evaluate(float* x, float* y) {

#ifndef __SSE4_1__

    float result = 0;

    for(size_t i=0; i<m_n; i++)
        result += min<float>(x[i],y[i]);

    return static_cast<double>(result);

#else

    __m128* px = (__m128*)x;
    __m128* py = (__m128*)y;

    float zero = 0;
    __m128 sum = _mm_load1_ps(&zero);

    const int mask = 255;

    for(size_t i=0; i<m_offset/4; i++) {

        __m128 temp = _mm_min_ps(px[i],py[i]);
        sum = _mm_add_ps(sum,temp);

    }

    float result[4] = {0,0,0,0};
    _mm_storeu_ps(result,sum);

    float fresult  = result[0] + result[1] + result[2] + result[3];

    // add offset
    for(size_t i=m_offset; i<m_n; i++)
        fresult += min<float>(x[i],y[i]);

    return static_cast<double>(fresult);

#endif

}
Example #17
void	matrix_CpAAt_float (float* C,const float* A,size_t n,size_t p)
{
    size_t i,j,k;
    size_t q = n / 8;
    size_t r = n % 8;

    for (k=0;k<p;k++) {
        float* pC = C;
        for (j=0;j<n;j++) {
            __m128 w = _mm_load1_ps (A+j+k*n);
            const float* pA = A+k*n;
            if (ALGEBRA_IS_ALIGNED(pA) && ALGEBRA_IS_ALIGNED(pC)) {
                for (i=0;i<q;i++) {
                    __m128 i1 = _mm_load_ps(pA);
                    __m128 i2 = _mm_load_ps(pA+4);
                    __m128 o1 = _mm_load_ps(pC);
                    __m128 o2 = _mm_load_ps(pC+4);
                    _mm_store_ps(pC+0,_mm_add_ps(o1,_mm_mul_ps(i1,w)));
                    _mm_store_ps(pC+4,_mm_add_ps(o2,_mm_mul_ps(i2,w)));
                    pA += 8;
                    pC += 8;
                }
            }
            else {
                for (i=0;i<q;i++) {
                    __m128 i1 = _mm_loadu_ps(pA);
                    __m128 i2 = _mm_loadu_ps(pA+4);
                    __m128 o1 = _mm_loadu_ps(pC);
                    __m128 o2 = _mm_loadu_ps(pC+4);
                    _mm_storeu_ps(pC+0,_mm_add_ps(o1,_mm_mul_ps(i1,w)));
                    _mm_storeu_ps(pC+4,_mm_add_ps(o2,_mm_mul_ps(i2,w)));
                    pA += 8;
                    pC += 8;
                }
            }
            for (i=0;i<r;i++) {
                (*pC++) += A[j+k*n]*(*pA++);
            }
        }
    }
}
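A scalar sketch of what the routine above computes, assuming the layout implied by its indexing (A stores p columns of length n contiguously, and C is written as n consecutive runs of n floats): C accumulates the outer product of each column of A with itself, i.e. C += A*A^T.

#include <stddef.h>

/* Scalar reference for the SSE routine above (assumed layout, see note). */
void matrix_CpAAt_float_scalar(float* C, const float* A, size_t n, size_t p)
{
    size_t i, j, k;
    for (k = 0; k < p; k++)
        for (j = 0; j < n; j++)
            for (i = 0; i < n; i++)
                C[j*n + i] += A[k*n + j]*A[k*n + i];
}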
Example #18
double CMercerKernel<float>::Evaluate(float* x, float* y) {

#ifndef __SSE4_1__

    float result = 0;

    for(size_t i=0; i<m_n; i++)
        result += x[i]*y[i];

    return static_cast<double>(result);

#else
    __m128* px = reinterpret_cast<__m128*>(x);
    __m128* py = reinterpret_cast<__m128*>(y);

    float zero = 0.0;
    __m128 sum = _mm_load1_ps(&zero);

    const int mask = 241;       // 4 MSB mask input, 4 LSB mask output

    for(size_t i=0; i<m_offset/4; i++) {

        __m128 temp = _mm_dp_ps(px[i],py[i],mask);
        sum = _mm_add_ss(sum,temp);                         // accumulate result in first register

    }

    float result[4] = {0.0,0.0,0.0,0.0};
    _mm_storeu_ps(result,sum);

    // add offset
    for(size_t i=m_offset; i<m_n; i++)
        result[0] += x[i]*y[i];

    return static_cast<double>(result[0]);
#endif

}
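The mask 241 (0xF1) passed to _mm_dp_ps above means: multiply all four lane pairs (high nibble 0xF) and write the resulting sum into lane 0 only (low nibble 0x1), which is why the loop accumulates with _mm_add_ss. A sketch of an equivalent accumulation that needs only SSE1 rather than SSE4.1 (the function name and the n4 parameter are illustrative, not part of the kernel class above):

#include <stddef.h>
#include <xmmintrin.h>

/* Dot product of the first n4 elements, n4 assumed to be a multiple of 4:
   accumulate per-lane products and reduce horizontally at the end. */
static float dot_sse(const float* x, const float* y, size_t n4)
{
    __m128 sum = _mm_setzero_ps();
    for (size_t i = 0; i < n4; i += 4)
        sum = _mm_add_ps(sum, _mm_mul_ps(_mm_loadu_ps(x+i), _mm_loadu_ps(y+i)));
    float r[4];
    _mm_storeu_ps(r, sum);
    return r[0] + r[1] + r[2] + r[3];
}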
Example #19
void AudioBufferAddWithScale_SSE(const float* aInput, float aScale,
                                 float* aOutput, uint32_t aSize) {
  __m128 vin0, vin1, vin2, vin3, vscaled0, vscaled1, vscaled2, vscaled3, vout0,
      vout1, vout2, vout3, vgain;

  ASSERT_ALIGNED16(aInput);
  ASSERT_ALIGNED16(aOutput);
  ASSERT_MULTIPLE16(aSize);

  vgain = _mm_load1_ps(&aScale);

  for (unsigned i = 0; i < aSize; i += 16) {
    vin0 = _mm_load_ps(&aInput[i]);
    vin1 = _mm_load_ps(&aInput[i + 4]);
    vin2 = _mm_load_ps(&aInput[i + 8]);
    vin3 = _mm_load_ps(&aInput[i + 12]);

    vscaled0 = _mm_mul_ps(vin0, vgain);
    vscaled1 = _mm_mul_ps(vin1, vgain);
    vscaled2 = _mm_mul_ps(vin2, vgain);
    vscaled3 = _mm_mul_ps(vin3, vgain);

    vin0 = _mm_load_ps(&aOutput[i]);
    vin1 = _mm_load_ps(&aOutput[i + 4]);
    vin2 = _mm_load_ps(&aOutput[i + 8]);
    vin3 = _mm_load_ps(&aOutput[i + 12]);

    vout0 = _mm_add_ps(vin0, vscaled0);
    vout1 = _mm_add_ps(vin1, vscaled1);
    vout2 = _mm_add_ps(vin2, vscaled2);
    vout3 = _mm_add_ps(vin3, vscaled3);

    _mm_store_ps(&aOutput[i], vout0);
    _mm_store_ps(&aOutput[i + 4], vout1);
    _mm_store_ps(&aOutput[i + 8], vout2);
    _mm_store_ps(&aOutput[i + 12], vout3);
  }
}
Example #20
void AudioBufferInPlaceScale_SSE(float* aBlock, float aScale, uint32_t aSize) {
  __m128 vout0, vout1, vout2, vout3, vin0, vin1, vin2, vin3;

  ASSERT_ALIGNED16(aBlock);
  ASSERT_MULTIPLE16(aSize);

  __m128 vgain = _mm_load1_ps(&aScale);

  for (unsigned i = 0; i < aSize; i += 16) {
    vin0 = _mm_load_ps(&aBlock[i]);
    vin1 = _mm_load_ps(&aBlock[i + 4]);
    vin2 = _mm_load_ps(&aBlock[i + 8]);
    vin3 = _mm_load_ps(&aBlock[i + 12]);
    vout0 = _mm_mul_ps(vin0, vgain);
    vout1 = _mm_mul_ps(vin1, vgain);
    vout2 = _mm_mul_ps(vin2, vgain);
    vout3 = _mm_mul_ps(vin3, vgain);
    _mm_store_ps(&aBlock[i], vout0);
    _mm_store_ps(&aBlock[i + 4], vout1);
    _mm_store_ps(&aBlock[i + 8], vout2);
    _mm_store_ps(&aBlock[i + 12], vout3);
  }
}
Example #21
void AudioBlockCopyChannelWithScale_SSE(const float* aInput, float aScale,
                                        float* aOutput) {
  __m128 vin0, vin1, vin2, vin3, vout0, vout1, vout2, vout3;

  ASSERT_ALIGNED16(aInput);
  ASSERT_ALIGNED16(aOutput);

  __m128 vgain = _mm_load1_ps(&aScale);

  for (unsigned i = 0; i < WEBAUDIO_BLOCK_SIZE; i += 16) {
    vin0 = _mm_load_ps(&aInput[i]);
    vin1 = _mm_load_ps(&aInput[i + 4]);
    vin2 = _mm_load_ps(&aInput[i + 8]);
    vin3 = _mm_load_ps(&aInput[i + 12]);
    vout0 = _mm_mul_ps(vin0, vgain);
    vout1 = _mm_mul_ps(vin1, vgain);
    vout2 = _mm_mul_ps(vin2, vgain);
    vout3 = _mm_mul_ps(vin3, vgain);
    _mm_store_ps(&aOutput[i], vout0);
    _mm_store_ps(&aOutput[i + 4], vout1);
    _mm_store_ps(&aOutput[i + 8], vout2);
    _mm_store_ps(&aOutput[i + 12], vout3);
  }
}
Example #22
int conv2D(float* in, float* out, int data_size_X, int data_size_Y,
                    float* kernel)
{
    // the x coordinate of the kernel's center
    int kern_cent_X = (KERNX - 1)/2;
    // the y coordinate of the kernel's center
    int kern_cent_Y = (KERNY - 1)/2;

    float kernelObj[KERNX*KERNY];
    for (int i = 0; i < KERNX * KERNY; i++) {
        kernelObj[i] = kernel[i];
    }
    // main convolution loop

    #pragma omp parallel for firstprivate(data_size_X, data_size_Y, kern_cent_X, kern_cent_Y)
	for(int y = 1; y < data_size_Y - 1; y++){ // the y coordinate of the output location we're focusing on
		int x =kern_cent_X; //Initialize x to be 1
		int t = data_size_X -28 - kern_cent_X;

		for(x; x < t; x+=28){ // the x coordinate of the output location we're focusing on
			
			__m128 output = _mm_setzero_ps();
			__m128 output1 = _mm_setzero_ps();
			__m128 output2 = _mm_setzero_ps();
			__m128 output3 = _mm_setzero_ps();
			__m128 output4 = _mm_setzero_ps();
			__m128 output5 = _mm_setzero_ps();
			__m128 output6 = _mm_setzero_ps();

			//7 SSE loop unrolls (28 outputs) per x step
			for(int j = -kern_cent_Y; j <= kern_cent_Y; j++){ // kernel unflipped y coordinate 
				for(int i = -kern_cent_X; i <= kern_cent_X; i++){ // kernel unflipped x coordinate
					
					__m128 ker = _mm_load1_ps(kernelObj+kern_cent_X-i+(kern_cent_Y-j)*KERNX); //Loading kernelObj value into 128-bit ker

					//Performing 2d Convolution calculations...
					__m128 tmp = _mm_loadu_ps(in+x+i+(y+j)*data_size_X);
					__m128 mult = _mm_mul_ps(ker, tmp);
					output = _mm_add_ps(output, mult); 

					tmp = _mm_loadu_ps(in+x+4+i+(y+j)*data_size_X);
					mult = _mm_mul_ps(ker, tmp);
					output1 = _mm_add_ps(output1, mult); 

					tmp = _mm_loadu_ps(in+x+8+i+(y+j)*data_size_X);
					mult = _mm_mul_ps(ker, tmp);
					output2 = _mm_add_ps(output2, mult); 

					tmp = _mm_loadu_ps(in+x+12+i+(y+j)*data_size_X);
					mult = _mm_mul_ps(ker, tmp);
					output3 = _mm_add_ps(output3, mult); 
	
					tmp = _mm_loadu_ps(in+x+16+i+(y+j)*data_size_X);
					mult = _mm_mul_ps(ker, tmp);
					output4 = _mm_add_ps(output4, mult); 

					tmp = _mm_loadu_ps(in+x+20+i+(y+j)*data_size_X);
					mult = _mm_mul_ps(ker, tmp);
					output5 = _mm_add_ps(output5, mult); 

					tmp = _mm_loadu_ps(in+x+24+i+(y+j)*data_size_X);
					mult = _mm_mul_ps(ker, tmp);
					output6 = _mm_add_ps(output6, mult); 
/*
					output = _mm_add_ps(output, _mm_mul_ps(ker, _mm_loadu_ps(in+x+i+(y+j)*data_size_X))); 
					output1 = _mm_add_ps(output1, _mm_mul_ps(ker, _mm_loadu_ps(in+x+4+i+(y+j)*data_size_X)));
					output2 = _mm_add_ps(output2, _mm_mul_ps(ker, _mm_loadu_ps(in+x+8+i+(y+j)*data_size_X)));
*/
				}
			}
					//Storing the outputs into out matrix
					_mm_storeu_ps(out+x+y*data_size_X, output); 
					_mm_storeu_ps(out+x+4+y*data_size_X, output1);					
					_mm_storeu_ps(out+x+8+y*data_size_X, output2);
					_mm_storeu_ps(out+x+12+y*data_size_X, output3);
					_mm_storeu_ps(out+x+16+y*data_size_X, output4);
					_mm_storeu_ps(out+x+20+y*data_size_X, output5);
					_mm_storeu_ps(out+x+24+y*data_size_X, output6);

		}
		//edge cases non four
		for(x; x < data_size_X - kern_cent_X; x+=1){ // the x coordinate of the output location we're focusing on 
			for(int j = -kern_cent_Y; j <= kern_cent_Y; j++){ // kernelObj unflipped y coordinate
				for(int i = -kern_cent_X; i <= kern_cent_X; i++){ // kernelObj unflipped x coordinate

					out[x+y*data_size_X] += kernelObj[(kern_cent_X-i)+(kern_cent_Y-j)*KERNX] * in[(x+i) + (y+j)*data_size_X];
				}
			}
		}
	}

	/* For loops for different edge cases: top, bottom, left, right and corners. */
	int i, j, x, y;

	//kernel's first row and first column are excluded
    
	for(i = -kern_cent_X+1; i <= kern_cent_X; i++){ // kernelObj unflipped x coordinate
		x = 0; 
		y = 0;
		for(j = -kern_cent_Y+1; j <= kern_cent_Y; j++){ // kernelObj unflipped y coordinate
			out[x+y*data_size_X] += kernelObj[(kern_cent_X-i)+(kern_cent_Y-j)*KERNX] * in[(x+i) + (y+j)*data_size_X];
		}
	}
 
	//kernel's first row and last column are excluded
    
   	for(i = -kern_cent_X; i <= kern_cent_X-1; i++){ // kernelObj unflipped x coordinate
		x = data_size_X-1; 
		y = 0; 
		for(j = -kern_cent_Y+1; j <= kern_cent_Y; j++){ // kernelObj unflipped y coordinate
			out[x+y*data_size_X] += kernelObj[(kern_cent_X-i)+(kern_cent_Y-j)*KERNX] * in[(x+i) + (y+j)*data_size_X];
		}
	}

	//kernel's last row and first column are excluded
    
   	for(i = -kern_cent_X+1; i <= kern_cent_X; i++){ // kernelObj unflipped x coordinate
		x = 0; 
		y = data_size_Y-1;
		for(j = -kern_cent_Y; j <= kern_cent_Y-1; j++){ // kernelObj unflipped y coordinate
			out[x+y*data_size_X] += kernelObj[(kern_cent_X-i)+(kern_cent_Y-j)*KERNX] * in[(x+i) + (y+j)*data_size_X];
		}
	}

	//kernel's last row and last column are excluded
    
	for(i = -kern_cent_X; i <= kern_cent_X-1; i++){ // kernelObj unflipped x coordinate
		x = data_size_X-1; 
		y = data_size_Y-1; 
		for(j = -kern_cent_Y; j <= kern_cent_Y-1; j++){ // kernelObj unflipped y coordinate
			out[x+y*data_size_X] += kernelObj[(kern_cent_X-i)+(kern_cent_Y-j)*KERNX] * in[(x+i) + (y+j)*data_size_X];
		}
	}

	//kernel's first column is excluded
    
	for (y = 1; y < data_size_Y-1; y++){
		x = 0; // only x remains constant to access the first column of x's
		for(i = -kern_cent_X+1; i <= kern_cent_X; i++){ // kernelObj unflipped x coordinate stops before last column
			for(j = -kern_cent_Y; j <= kern_cent_Y; j++){ // kernelObj unflipped y coordinate
				out[x+y*data_size_X] += kernelObj[(kern_cent_X-i)+(kern_cent_Y-j)*KERNX] * in[(x+i) + (y+j)*data_size_X];
			}
		}
	}

	//kernel's last column is excluded
    
	for (y = 1; y < data_size_Y-1; y++){
		x = data_size_X-1; // only x remains constant to access the last column of x's
		for(i = -kern_cent_X; i <= kern_cent_X-1; i++){ // kernelObj unflipped x coordinate stops before last column
			for(j = -kern_cent_Y; j <= kern_cent_Y; j++){ // kernelObj unflipped y coordinate
				out[x+y*data_size_X] += kernelObj[(kern_cent_X-i)+(kern_cent_Y-j)*KERNX] * in[(x+i) + (y+j)*data_size_X];
			}
		}
	}

	//kernel's first row is excluded
    
   	for (x = 1; x < data_size_X-1; x++){
		y = 0; // only y remains constant to access the first row of y's
		for(i = -kern_cent_X; i <= kern_cent_X; i++){ // kernelObj unflipped x coordinate
			for(j = -kern_cent_Y+1; j <= kern_cent_Y; j++){ // kernelObj unflipped y coordinate skips first row
				out[x+y*data_size_X] += kernelObj[(kern_cent_X-i)+(kern_cent_Y-j)*KERNX] * in[(x+i) + (y+j)*data_size_X];
			}
		}
	}

	//kernel's last row is excluded
    
   	for (x = 1; x < data_size_X-1; x++){
		y = data_size_Y-1; // only y remains constant to access the last row of y's
		for(i = -kern_cent_X; i <= kern_cent_X; i++){ // kernelObj unflipped x coordinate
			for(j = -kern_cent_Y; j <= kern_cent_Y-1; j++){ // kernelObj unflipped y coordinate stops before last row
				out[x+y*data_size_X] += kernelObj[(kern_cent_X-i)+(kern_cent_Y-j)*KERNX] * in[(x+i) + (y+j)*data_size_X];
			}
		}
	}
	
	return 1;
}
Example #23
mlib_status
mlib_ImageColorConvert2_F32(
    const mlib_f32 *src,
    mlib_s32 slb,
    mlib_f32 *dst,
    mlib_s32 dlb,
    mlib_s32 xsize,
    mlib_s32 ysize,
    const mlib_d64 *fmat,
    const mlib_d64 *offset)
{
	/* pointers for pixel and line of source */
	mlib_f32 *sa, *sl;

	/* pointers for pixel and line of destination */
	mlib_f32 *da, *dl;

	/* indices */
	mlib_s32 i, j;

	/* intermediate */
	__m128 p0, p1, p2, t0, t1, t2, s0, s1, q;

	/* packed kernel */
	__m128 k0, k1, k2;

	/* packed offset */
	__m128 off;

	/* load transposed kernel */
	k0 = _mm_set_ps(0.0f,
			(mlib_f32)fmat[6],
			(mlib_f32)fmat[3],
			(mlib_f32)fmat[0]);
	k1 = _mm_set_ps(0.0f,
			(mlib_f32)fmat[7],
			(mlib_f32)fmat[4],
			(mlib_f32)fmat[1]);
	k2 = _mm_set_ps(0.0f,
			(mlib_f32)fmat[8],
			(mlib_f32)fmat[5],
			(mlib_f32)fmat[2]);

	/* load offset */
	off = _mm_set_ps(0.0f,
			(mlib_f32)offset[2],
			(mlib_f32)offset[1],
			(mlib_f32)offset[0]);

	sa = sl = (mlib_f32 *)src;
	da = dl = dst;

	for (j = 0; j < ysize; j++) {

#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
		for (i = 0; i < (xsize - 1); i ++) {
			p0 = _mm_load1_ps(sa);
			sa ++;
			p1 = _mm_load1_ps(sa);
			sa ++;
			p2 = _mm_load1_ps(sa);
			sa ++;

			t0 = _mm_mul_ps(p0, k0);
			t1 = _mm_mul_ps(p1, k1);
			t2 = _mm_mul_ps(p2, k2);

			s0 = _mm_add_ps(t0, t1);
			s1 = _mm_add_ps(t2, off);
			q = _mm_add_ps(s0, s1);

			_mm_storeu_ps(da, q);
			da += 3;
		}

		/*
		 * process the last pixel of each row separately
		 * to avoid out of bound write
		 */
		p0 = _mm_load1_ps(sa);
		sa ++;
		p1 = _mm_load1_ps(sa);
		sa ++;
		p2 = _mm_load1_ps(sa);
		sa ++;

		t0 = _mm_mul_ps(p0, k0);
		t1 = _mm_mul_ps(p1, k1);
		t2 = _mm_mul_ps(p2, k2);

		s0 = _mm_add_ps(t0, t1);
		s1 = _mm_add_ps(t2, off);
		q = _mm_add_ps(s0, s1);

		_mm_storel_pi((__m64 *)da, q);
		da += 2;
		q = _mm_shuffle_ps(q, q, 0xaa);
		_mm_store_ss(da, q);

		/* set src pointer to next row */
		sa = sl = sl + slb;
		/* set dst pointer to next row */
		da = dl = dl + dlb;
	}

	return (MLIB_SUCCESS);
}
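The kernel is stored transposed in k0/k1/k2 so that each input channel can be broadcast with _mm_load1_ps and multiplied against one packed column. Written out in scalar form (taking mlib_f32/mlib_d64 as plain float/double), the per-pixel block above evaluates dst = fmat (3x3, row-major) * src + offset:

/* Scalar equivalent of one pixel of the conversion above. */
static void color_convert_pixel_scalar(const float src[3], float dst[3],
	const double fmat[9], const double offset[3])
{
	int r;
	for (r = 0; r < 3; r++)
		dst[r] = (float)(fmat[3*r + 0]*src[0] + fmat[3*r + 1]*src[1] +
		    fmat[3*r + 2]*src[2] + offset[r]);
}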
Example #24
void sgemm( int m, int n, int d, float *A, float *C )
{
    int n1 = n+1, nEnd = n/VERTICAL_ROLL*VERTICAL_ROLL;
    float *B = A, *D = C;
	#pragma omp parallel for
	 for (int j = 0; j < n; j++) {
		int jn1 = j*(n+1), jn = j*n; float *Cjn = D+jn;
		// for (int b = 0; b < m; b+= BLOCKSIZE) {
			for (int i = 0; i < nEnd; i+=VERTICAL_ROLL) {
			    float *Cjni = Cjn+i;
			    float *Cjni1 = Cjni + 4;
			    float *Cjni2 = Cjni + 8;
			    float *Cjni3 = Cjni + 12;
			    float *Cjni4 = Cjni + 16;
			    float *Cjni5 = Cjni + 20;
			    float *Cjni6 = Cjni + 24;
			    float *Cjni7 = Cjni + 28;

			    int i1 = i+4;
			    int i2 = i+8;
			    int i3 = i+12;
			    int i4 = i+16;
			    int i5 = i+20;
			    int i6 = i+24;
			    int i7 = i+28;

			    __m128 Cij = _mm_loadu_ps(Cjni);
			    __m128 Cij1 = _mm_loadu_ps(Cjni1);
			    __m128 Cij2 = _mm_loadu_ps(Cjni2);
			    __m128 Cij3 = _mm_loadu_ps(Cjni3);
			    __m128 Cij4 = _mm_loadu_ps(Cjni4);
			    __m128 Cij5 = _mm_loadu_ps(Cjni5);
			    __m128 Cij6 = _mm_loadu_ps(Cjni6);
			    __m128 Cij7 = _mm_loadu_ps(Cjni7);


			    // for (int k = b; k < b+BLOCKSIZE && k < m; k++) {
			    for (int k = 0; k < m; k++) {
					int k1 = k + 1; float *Akn = B+k*n;
					__m128 Ajk = _mm_load1_ps(Akn+jn1);

					__m128 Aik = _mm_loadu_ps(Akn+i);
					__m128 Ai1k = _mm_loadu_ps(Akn+i1);
					__m128 Ai2k = _mm_loadu_ps(Akn+i2);
					__m128 Ai3k = _mm_loadu_ps(Akn+i3);
					__m128 Ai4k = _mm_loadu_ps(Akn+i4);
					__m128 Ai5k = _mm_loadu_ps(Akn+i5);
					__m128 Ai6k = _mm_loadu_ps(Akn+i6);
					__m128 Ai7k = _mm_loadu_ps(Akn+i7);

					Cij = _mm_add_ps(Cij, _mm_mul_ps(Ajk, Aik));
					Cij1 = _mm_add_ps(Cij1, _mm_mul_ps(Ajk, Ai1k));
					Cij2 = _mm_add_ps(Cij2, _mm_mul_ps(Ajk, Ai2k));
					Cij3 = _mm_add_ps(Cij3, _mm_mul_ps(Ajk, Ai3k));
					Cij4 = _mm_add_ps(Cij4, _mm_mul_ps(Ajk, Ai4k));
					Cij5 = _mm_add_ps(Cij5, _mm_mul_ps(Ajk, Ai5k));
					Cij6 = _mm_add_ps(Cij6, _mm_mul_ps(Ajk, Ai6k));
					Cij7 = _mm_add_ps(Cij7, _mm_mul_ps(Ajk, Ai7k));
			    }
			    _mm_storeu_ps(Cjni, Cij);
			    _mm_storeu_ps(Cjni1, Cij1);
			    _mm_storeu_ps(Cjni2, Cij2);
			    _mm_storeu_ps(Cjni3, Cij3);
			    _mm_storeu_ps(Cjni4, Cij4);
			    _mm_storeu_ps(Cjni5, Cij5);
			    _mm_storeu_ps(Cjni6, Cij6);
			    _mm_storeu_ps(Cjni7, Cij7);
			}
		// }
    }
    if (n % VERTICAL_ROLL != 0 && (n - (nEnd) >= 4)) {
		#pragma omp parallel for
		for (int j = 0; j < n; j++) {
			for (int i = nEnd; i < n/4*4; i+=4) {
				float *addrCij = D+i+j*n;
				float *Ajn1 = B+j*n1;
				float *Ai = A+i;
				__m128 Cij = _mm_loadu_ps(addrCij);
				for (int k = 0; k < m; k++) {
				    int kn = k*n;				    
				    __m128 Ajk = _mm_load1_ps(Ajn1+k*n);
				    __m128 Aik = _mm_loadu_ps(Ai+k*n);
				    Cij = _mm_add_ps(Cij, _mm_mul_ps(Ajk, Aik));
				}
				_mm_storeu_ps(addrCij, Cij);
			}
		}
    }
    if ((n - nEnd) % 4 != 0) {
		#pragma omp parallel for
		for (int j = 0; j < n; j++) {
		    float *Ajn1 = B+j*n1;
		    for (int i = n/4*4; i < n; i++) {
			float *addrCij = D+i+j*n;
			float *Ajn1 = B+j*n1;
			float *Ai = B+i;
			__m128 Cij = _mm_loadu_ps(addrCij);
			for (int k = 0; k < m; k++) {
			    int kn = k*n;
			    __m128 Ajk = _mm_load1_ps(Ajn1+kn);
			    __m128 Aik = _mm_loadu_ps(Ai+kn);
			    Cij = _mm_add_ps(Cij, _mm_mul_ps(Ajk, Aik));
			}
			_mm_store_ss(addrCij, Cij);
		    }
		}	
	}	
}	
Example #25
/// Integrates all particles.
void PrecipitationSystem::ProcessParticles(float & timeInSeconds)
{
	Timer timer;
	timer.Start();
#ifdef USE_SSE
	__m128 sseTime = _mm_load1_ps(&timeInSeconds);
#endif
	/// Move/Process all alive particles
	const Vector3f wind = weather->globalWind;
	for (int i = 0; i < aliveParticles; ++i)
	{
#ifdef SSE_PARTICLES
		positionsSSE[i].data = _mm_add_ps(positionsSSE[i].data, _mm_mul_ps(sseTime, _mm_add_ps(velocitiesSSE[i].data, wind.data)));
#else // Not SSE_PARTICLES
		// Using SSE commands straight away reduced computation time to like 1 ms from 150ms when many particles were around (towards 500k somewhere)
#ifdef USE_SSE
		positions[i].data = _mm_add_ps(positions[i].data, _mm_mul_ps(sseTime, _mm_add_ps(velocities[i].data, weather->globalWind.data)));
#else
		positions[i] += (velocities[i] + weather->globalWind)* timeInSeconds;
#endif // USE_SSE
#endif // SSE_PARTICLES
	}
	timer.Stop();
	FrameStats.particleProcessingIntegrate += timer.GetMs();

	timer.Start();
	for (int i = 0; i < aliveParticles; ++i)
	{
#ifdef SSE_PARTICLES
		ldsSSE[i].y += timeInSeconds;	
#else // Not SSE_PARTICLES
		// No velocity decay.
		lifeDurations[i] += timeInSeconds;
#endif // SSE_PARTICLES
	}
	timer.Stop();
	FrameStats.particleProcessingOldify = timer.GetMs();


	timer.Start();
	for (int i = 0; i < aliveParticles; ++i)
	{
#ifdef SSE_PARTICLES
		if (ldsSSE[i].y > ldsSSE[i].x)
		{
			int lastIndex = aliveParticles - 1;
			positionsSSE[i] = positionsSSE[lastIndex];
			velocitiesSSE[i] = velocitiesSSE[lastIndex];
			colorsSSE[i] = colorsSSE[lastIndex];
			ldsSSE[i] = ldsSSE[lastIndex];
			// Decrement i so we don't skip processing of the one we moved back.
			--i;
			// Decrement alive particles.
			--aliveParticles;
		}			
#else // Not SSE_PARTICLES
			// If duration has elapsed life-time..
		if (lifeDurations[i] > lifeTimes[i])
		{
			int lastIndex = aliveParticles - 1;
			// Kill it, by moving in the last used data to replace it.
			positions[i] = positions[lastIndex];
			velocities[i] = velocities[lastIndex];
			lifeDurations[i] = lifeDurations[lastIndex];
			colors[i] = colors[lastIndex];
			lifeTimes[i] = lifeTimes[lastIndex];
			scales[i] = scales[lastIndex];

			// Decrement i so we don't skip processing of the one we moved back.
			--i;
			// Decrement alive particles.
			--aliveParticles;
		}
#endif
	}
	timer.Stop();
	FrameStats.particleProcessingRedead += timer.GetMs();
}
Example #26
int conv2D(float* in, float* out, int data_size_X, int data_size_Y,
           float* kernel)
{

    // the x coordinate of the kernel's center
    int kern_cent_X = (KERNX - 1)/2;
    // the y coordinate of the kernel's center
    int kern_cent_Y = (KERNY - 1)/2;

    int blocksize = 512; // must be multiple of 4 (or possibly 8/12/16? if loop unrolling)
    int blocksize_Y = 16;

    __m128 kernel_vector, vector1, output_vector1 , vector2, output_vector2;

    int padding_x = (KERNX / 2);  // can we assume that kernel is a square matrix??
    int padding_y = (KERNY /2);
    int padded_size = (data_size_X + 2*padding_x) * (data_size_Y + 2*padding_y); // not initialized to zero?
    float* padded_in = malloc(padded_size * sizeof(float));

    int x,y;

    memset(padded_in, 0, padded_size * sizeof(float));

    for(y = 0; y < data_size_Y; y++) {
        memcpy(padded_in+(padding_x)+(y+padding_y)*(data_size_X+2*padding_y), in + (y*data_size_X), sizeof(float)*data_size_X);
    }

    int a, b, i, j;

    int k;
    float local_kern[KERNX*KERNY];

    for(k = 0; k < KERNX*KERNY; k++) {
        local_kern[k] = kernel[k];
    }


    omp_set_num_threads(15);
    # pragma omp parallel
    {

//    printf("There are %d threads running\n",omp_get_num_threads());

        # pragma omp for private(a, b, i, j, x, y, kernel_vector, vector1, output_vector1, vector2, output_vector2) firstprivate(local_kern, padded_in) schedule(static)
        for(y = 0; y < data_size_Y; y+=blocksize_Y) {
            for(x = 0; x < data_size_X; x+=blocksize) {
                for(a = x; a < x + blocksize && a <= data_size_X-8; a+=8) {
                    for(b = y; b < y + blocksize_Y && b < data_size_Y; b++) {
                        // set output vector to 0
                        output_vector1 = _mm_setzero_ps();
                        output_vector2 = _mm_setzero_ps();

                        for(i = -kern_cent_X; i <= kern_cent_X; i++) {         // inner loop; after all iterations, write 4 output sums
                            for(j = -kern_cent_Y; j <= kern_cent_Y; j++) {

                                kernel_vector = _mm_load1_ps(local_kern + ((kern_cent_X-i) + (kern_cent_Y-j)*KERNX));

                                vector1 = _mm_loadu_ps(padded_in + ((a+i+padding_x) + (b+j+padding_y)*(data_size_X+2*padding_y)));
                                vector2 = _mm_loadu_ps(padded_in + 4 + ((a+i+padding_x) + (b+j+padding_y)*(data_size_X+2*padding_y)));

                                vector1 = _mm_mul_ps(kernel_vector, vector1);
                                vector2 = _mm_mul_ps(kernel_vector,vector2);

                                output_vector1 = _mm_add_ps(vector1, output_vector1);
                                output_vector2 = _mm_add_ps(vector2, output_vector2);

                            }
                        }
                        // After inner loop completes, write output vector to output matrix
                        // must be storeu; can't use store aligned
                        _mm_storeu_ps(out + (a + b*data_size_X), output_vector1);
                        _mm_storeu_ps(out + 4 + (a + b*data_size_X), output_vector2);
                        //printf("Thread number %d is writing to line : %d\n", omp_get_thread_num(), b);
                    }
                }
            }
        }


    } // end parallel

    float output_float, kernel_float, input_float, product_float;
    for(b = 0; b < data_size_Y; b++) {
        for(a = (data_size_X/8)*8; a < data_size_X; a++) {
            // set output to 0
            output_float = 0.0f;

            for(i = -kern_cent_X; i <= kern_cent_X; i++) {         // inner loop : all kernel elements
                for(j = -kern_cent_Y; j <= kern_cent_Y; j++) {

                    product_float = local_kern[(kern_cent_X - i) + (kern_cent_Y-j)*KERNX] * padded_in[(a+i+padding_x)+(b+j+padding_y)*(data_size_X+2*padding_y)];
                    output_float += product_float;
                }
            }
            out[a + b*data_size_X] = output_float;
        }
    }


    free(padded_in);

    return 1;
}
Example #27
void sgemm( int m, int n, float *A, float *C )
{

    __m128 c_vector1, c_vector2, c_vector3, c_vector4, c_vector5, c_vector6, c_vector7;
    __m128 tmp_vector;
    __m128 mult_vector1, mult_vector2, mult_vector3, mult_vector4, mult_vector5, mult_vector6, mult_vector7;
    #pragma omp parallel
#pragma omp for schedule(dynamic, 4) private(c_vector1, c_vector2, c_vector3, c_vector4, c_vector5, c_vector6, c_vector7, tmp_vector, mult_vector1, mult_vector2, mult_vector3, mult_vector4, mult_vector5, mult_vector6, mult_vector7)
    for( int i = 0; i < (m - m%4)*m; i+=m ) {
	float *c_point = C + i;
	for( int k = 0; k < m - 27; k+=28 ) {
	    c_vector1 = _mm_loadu_ps(c_point + k);
	    c_vector2 = _mm_loadu_ps(c_point + k + 4);
	    c_vector3 = _mm_loadu_ps(c_point + k + 8);
	    c_vector4 = _mm_loadu_ps(c_point + k + 12);
	    c_vector5 = _mm_loadu_ps(c_point + k + 16);
	    c_vector6 = _mm_loadu_ps(c_point + k + 20);
	    c_vector7 = _mm_loadu_ps(c_point + k + 24);
	    float *a_point = A + k;
	    int something = i/m;
	    for( int j = 0; j < n*m; j+=m ) {
		tmp_vector = _mm_load1_ps(A + something + j);
		
		mult_vector1 = _mm_mul_ps(tmp_vector, _mm_loadu_ps(a_point + j));
	 	mult_vector2 = _mm_mul_ps(tmp_vector, _mm_loadu_ps(a_point + 4 + j));
		mult_vector3 = _mm_mul_ps(tmp_vector, _mm_loadu_ps(a_point + 8 + j));
		mult_vector4 = _mm_mul_ps(tmp_vector, _mm_loadu_ps(a_point + 12 + j));
		mult_vector5 = _mm_mul_ps(tmp_vector, _mm_loadu_ps(a_point + 16 + j));
		mult_vector6 = _mm_mul_ps(tmp_vector, _mm_loadu_ps(a_point + 20 + j));
		mult_vector7 = _mm_mul_ps(tmp_vector, _mm_loadu_ps(a_point + 24 + j));
	
		c_vector1 = _mm_add_ps(c_vector1, mult_vector1);
		c_vector2 = _mm_add_ps(c_vector2, mult_vector2);
		c_vector3 = _mm_add_ps(c_vector3, mult_vector3);
		c_vector4 = _mm_add_ps(c_vector4, mult_vector4);
		c_vector5 = _mm_add_ps(c_vector5, mult_vector5);
		c_vector6 = _mm_add_ps(c_vector6, mult_vector6);
		c_vector7 = _mm_add_ps(c_vector7, mult_vector7);
	    }
	    _mm_storeu_ps((c_point + k), c_vector1);
	    _mm_storeu_ps((c_point + k + 4), c_vector2);
	    _mm_storeu_ps((c_point + k + 8), c_vector3);
	    _mm_storeu_ps((c_point + k + 12), c_vector4);
	    _mm_storeu_ps((c_point + k + 16), c_vector5);
	    _mm_storeu_ps((c_point + k + 20), c_vector6);
	    _mm_storeu_ps((c_point + k + 24), c_vector7);
	}
    
        
	//edge cases for loop unrolling
	for( int k = m/28*28; k < m - m%4; k+=4 ) {
	    c_vector1 = _mm_loadu_ps(C + k + i);
	    float* a_point = A + k;
	    for( int j = 0; j < n*m; j+=m ) {
		tmp_vector = _mm_load1_ps(A + i/m + j);	 		
		mult_vector1 = _mm_mul_ps(tmp_vector, _mm_loadu_ps(a_point + j));	
		c_vector1 = _mm_add_ps(c_vector1, mult_vector1);
	    }
	    _mm_storeu_ps((C + k + i), c_vector1); 
	}
    }

    //edge cases
    if (m % 4 != 0) {
	#pragma omp parallel for
    	for (int i = 0; i < m; i++) {
    	    for (int j = m - m % 4; j < m; j++) {
    		for (int k = 0; k < n*m; k+=m) {
    		    *(C + i*m + j) += *(A + j + k) * *(A + i + k);
		    if (m != n) {
			*(C + j*m + i) += *(A + i + k) * *(A + j + k);
		    }
    		}
    	    }
    	}
	if (m != n) {
	    for (int i = m - m%4; i < m; i++) {
		for (int j = m - m%4; j < m; j++) {
		    *(C + i*m + j) /= 2;
		}
	    }
	}
    }     
}
Example #28
void sgemm( int m, int n, int d, float *A, float *C )
{
	#pragma omp parallel
	{
		__m128 vect, ATmatrix, ATvect1, ATvect2, ATvect3, ATvect4, ATvect5, ATvect6, ATvect7, Cmatrix, vect2, vect3, vect4, ATvect3j, AT3j, ATvect2j, AT2j, ATvect1j, AT1j, ATjmatrix, ATj;
		#pragma omp for
		for( int j = 0; j < n/2 * 2; j+=2 ) {
			for( int k = 0; k < m/4 * 4; k+=4 ) {
				ATmatrix =  _mm_load1_ps(A + (j * (n + 1) + (k) * (n)));
				float AT = A[j*(n+1)+k*(n)];
				ATjmatrix =  _mm_load1_ps(A + ((j+1) * (n + 1) + (k) * (n)));
				float ATj = A[(j+1)*(n+1)+k*(n)];

				ATvect1 =  _mm_load1_ps(A + (j * (n + 1) + (k+1) * (n)));
				float AT1 = A[j*(n+1)+(k+1)*(n)];
				ATvect1j =  _mm_load1_ps(A + ((j + 1) * (n + 1) + (k + 1) * (n)));
				float AT1j = A[(j + 1)*(n+1)+(k + 1)*(n)];

				ATvect2 =  _mm_load1_ps(A + (j * (n + 1) + (k+2) * (n)));
				float AT2 = A[j*(n+1)+(k+2)*(n)];
				ATvect2j =  _mm_load1_ps(A + ((j + 1) * (n + 1) + (k + 2) * (n)));
				float AT2j = A[(j + 1)*(n+1)+(k + 2)*(n)];

				ATvect3 =  _mm_load1_ps(A + (j * (n + 1) + (k+3) * (n)));
				float AT3 = A[j*(n+1)+(k+3)*(n)];
				ATvect3j =  _mm_load1_ps(A + ((j + 1) * (n + 1) + (k + 3) * (n)));
				float AT3j = A[(j + 1)*(n+1)+(k + 3)*(n)];

				for( int i = 0; i < n/8 * 8; i+= 8 ) {
					float *temp = C + i + j * n;
					float *tempj = C + i + (j + 1) * n;
					float *tmp = A + i + (k)*(n);
					float *tmp1 = A + i + (k + 1)*(n);
					float *tmp2 = A + i + (k + 2)*(n);
					float *tmp3 = A + i + (k + 3)*(n);

					//i = 0
					Cmatrix = _mm_loadu_ps(temp);
				    vect = _mm_mul_ps(_mm_loadu_ps(tmp), ATmatrix);
				    vect2 = _mm_mul_ps(_mm_loadu_ps(tmp1), ATvect1);
				    vect3 = _mm_mul_ps(_mm_loadu_ps(tmp2), ATvect2);
				    vect4 = _mm_mul_ps(_mm_loadu_ps(tmp3), ATvect3);
				    Cmatrix = _mm_add_ps(Cmatrix, vect);
				    Cmatrix = _mm_add_ps(Cmatrix, vect2);
				    Cmatrix = _mm_add_ps(Cmatrix, vect3);
				    Cmatrix = _mm_add_ps(Cmatrix, vect4);
				    _mm_storeu_ps(temp, Cmatrix);
				    //j + 1
				    Cmatrix = _mm_loadu_ps(tempj);
				    vect = _mm_mul_ps(_mm_loadu_ps(tmp), ATjmatrix);
				    vect2 = _mm_mul_ps(_mm_loadu_ps(tmp1), ATvect1j);
				    vect3 = _mm_mul_ps(_mm_loadu_ps(tmp2), ATvect2j);
				    vect4 = _mm_mul_ps(_mm_loadu_ps(tmp3), ATvect3j);
				    Cmatrix = _mm_add_ps(Cmatrix, vect);
				    Cmatrix = _mm_add_ps(Cmatrix, vect2);
				    Cmatrix = _mm_add_ps(Cmatrix, vect3);
				    Cmatrix = _mm_add_ps(Cmatrix, vect4);
				    _mm_storeu_ps(tempj, Cmatrix);

				    // i = 1

				    Cmatrix = _mm_loadu_ps((temp) + 4);
				    vect = _mm_mul_ps(_mm_loadu_ps((tmp) + 4), ATmatrix);
				    vect2 = _mm_mul_ps(_mm_loadu_ps((tmp1) + 4), ATvect1);
				    vect3 = _mm_mul_ps(_mm_loadu_ps((tmp2) + 4), ATvect2);
				    vect4 = _mm_mul_ps(_mm_loadu_ps((tmp3) + 4), ATvect3);
				    Cmatrix = _mm_add_ps(Cmatrix, vect);
				    Cmatrix = _mm_add_ps(Cmatrix, vect2);
				    Cmatrix = _mm_add_ps(Cmatrix, vect3);
				    Cmatrix = _mm_add_ps(Cmatrix, vect4);
				    _mm_storeu_ps((temp) + 4, Cmatrix);
				    // j + 1
				    Cmatrix = _mm_loadu_ps(tempj + 4);
				    vect = _mm_mul_ps(_mm_loadu_ps(tmp + 4), ATjmatrix);
				    vect2 = _mm_mul_ps(_mm_loadu_ps(tmp1 + 4), ATvect1j);
				    vect3 = _mm_mul_ps(_mm_loadu_ps(tmp2 + 4), ATvect2j);
				    vect4 = _mm_mul_ps(_mm_loadu_ps(tmp3 + 4), ATvect3j);
				    Cmatrix = _mm_add_ps(Cmatrix, vect);
				    Cmatrix = _mm_add_ps(Cmatrix, vect2);
				    Cmatrix = _mm_add_ps(Cmatrix, vect3);
				    Cmatrix = _mm_add_ps(Cmatrix, vect4);
				    _mm_storeu_ps(tempj + 4, Cmatrix);

				}
				for (int i = n/8 * 8; i < n; i += 1) {
					C[i+j*n] += A[i+k*(n)] * AT + A[i+(k+1)*(n)] * AT1 + A[i+(k+2)*(n)] * AT2 + A[i+(k+3)*(n)] * AT3;
					C[i+(j + 1)*n] += A[i+k*(n)] * ATj + A[i+(k+1)*(n)] * AT1j + A[i+(k+2)*(n)] * AT2j + A[i+(k+3)*(n)] * AT3j;
				}
			}
			for (int k = m/4 * 4; k < m; k += 1) {
				ATmatrix =  _mm_load1_ps(A + (j * (n + 1) + (k) * (n)));
				float AT = A[j*(n+1)+k*(n)];

				ATjmatrix =  _mm_load1_ps(A + ((j + 1) * (n + 1) + (k) * (n)));
				float ATj = A[(j + 1)*(n+1)+k*(n)];
				for( int i = 0; i < n/12 * 12; i+= 12 ) {
					float *temp = C + i + j * n;
					float *t2 = A + i + (k)*(n);
					float *tempj = C + i + (j+1) * n;

					//i = 0
					Cmatrix = _mm_loadu_ps(temp);
				    vect = _mm_mul_ps(_mm_loadu_ps(t2), ATmatrix);
				    Cmatrix = _mm_add_ps(Cmatrix, vect);
				    _mm_storeu_ps(temp, Cmatrix);
				    //j + 1
				    Cmatrix = _mm_loadu_ps(tempj);
				    vect = _mm_mul_ps(_mm_loadu_ps(t2), ATjmatrix);
				    Cmatrix = _mm_add_ps(Cmatrix, vect);
				    _mm_storeu_ps(tempj, Cmatrix);

				    //i = 1
				    Cmatrix = _mm_loadu_ps((temp) + 4);
				    vect = _mm_mul_ps(_mm_loadu_ps((t2) + 4), ATmatrix);
				    Cmatrix = _mm_add_ps(Cmatrix, vect);
				    _mm_storeu_ps((temp) + 4, Cmatrix);
				    //j + 1
				    Cmatrix = _mm_loadu_ps((tempj) + 4);
				    vect = _mm_mul_ps(_mm_loadu_ps((t2) + 4), ATjmatrix);
				    Cmatrix = _mm_add_ps(Cmatrix, vect);
				    _mm_storeu_ps((tempj) + 4, Cmatrix);

				    //i = 2
				    Cmatrix = _mm_loadu_ps((temp) + 8);
				    vect = _mm_mul_ps(_mm_loadu_ps((t2) + 8), ATmatrix);
				    Cmatrix = _mm_add_ps(Cmatrix, vect);
				    _mm_storeu_ps((temp) + 8, Cmatrix);
				    //j + 1
				    Cmatrix = _mm_loadu_ps((tempj) + 8);
				    vect = _mm_mul_ps(_mm_loadu_ps((t2) + 8), ATjmatrix);
				    Cmatrix = _mm_add_ps(Cmatrix, vect);
				    _mm_storeu_ps((tempj) + 8, Cmatrix);
				}
				for (int i = n/12 * 12; i < n; i += 1) {
					C[i+j*n] += A[i+k*(n)] * AT;
					C[i+(j+1)*n] += A[i+k*(n)] * ATj;
				}
			}
		}
	}
	#pragma omp parallel
	{
		__m128 vect, ATmatrix, ATvect1, ATvect2, ATvect3, ATvect4, ATvect5, ATvect6, ATvect7, Cmatrix, vect2, vect3, vect4, ATvect3j, AT3j, ATvect2j, AT2j, ATvect1j, AT1j, ATjmatrix, ATj;
		#pragma omp for
		for (int j = n/2 * 2; j < n; j ++) {
			for( int k = 0; k < m/4 * 4; k+=4 ) {
				ATmatrix =  _mm_load1_ps(A + (j * (n + 1) + (k) * (n)));
				float AT = A[j*(n+1)+k*(n)];

				ATvect1 =  _mm_load1_ps(A + (j * (n + 1) + (k+1) * (n)));
				float AT1 = A[j*(n+1)+(k+1)*(n)];

				ATvect2 =  _mm_load1_ps(A + (j * (n + 1) + (k+2) * (n)));
				float AT2 = A[j*(n+1)+(k+2)*(n)];

				ATvect3 =  _mm_load1_ps(A + (j * (n + 1) + (k+3) * (n)));
				float AT3 = A[j*(n+1)+(k+3)*(n)];
				for( int i = 0; i < n/8 * 8; i+= 8 ) {
					float *temp = C + i + j * n;
					float *tmp = A + i + (k)*(n);
					float *tmp1 = A + i + (k + 1)*(n);
					float *tmp2 = A + i + (k + 2)*(n);
					float *tmp3 = A + i + (k + 3)*(n);
					//i = 0
					Cmatrix = _mm_loadu_ps(temp);
				    vect = _mm_mul_ps(_mm_loadu_ps(tmp), ATmatrix);
				    vect2 = _mm_mul_ps(_mm_loadu_ps(tmp1), ATvect1);
				    vect3 = _mm_mul_ps(_mm_loadu_ps(tmp2), ATvect2);
				    vect4 = _mm_mul_ps(_mm_loadu_ps(tmp3), ATvect3);
				    Cmatrix = _mm_add_ps(Cmatrix, vect);
				    Cmatrix = _mm_add_ps(Cmatrix, vect2);
				    Cmatrix = _mm_add_ps(Cmatrix, vect3);
				    Cmatrix = _mm_add_ps(Cmatrix, vect4);
				    _mm_storeu_ps(temp, Cmatrix);

				    //i = 1
				    Cmatrix = _mm_loadu_ps((temp) + 4);
				    vect = _mm_mul_ps(_mm_loadu_ps((tmp) + 4), ATmatrix);
				    vect2 = _mm_mul_ps(_mm_loadu_ps((tmp1) + 4), ATvect1);
				    vect3 = _mm_mul_ps(_mm_loadu_ps((tmp2) + 4), ATvect2);
				    vect4 = _mm_mul_ps(_mm_loadu_ps((tmp3) + 4), ATvect3);
				    Cmatrix = _mm_add_ps(Cmatrix, vect);
				    Cmatrix = _mm_add_ps(Cmatrix, vect2);
				    Cmatrix = _mm_add_ps(Cmatrix, vect3);
				    Cmatrix = _mm_add_ps(Cmatrix, vect4);
				    _mm_storeu_ps((temp) + 4, Cmatrix);
				    
				}
				for (int i = n/8 * 8; i < n; i += 1) {
					C[i+j*n] += A[i+k*(n)] * AT + A[i+(k+1)*(n)] * AT1 + A[i+(k+2)*(n)] * AT2 + A[i+(k+3)*(n)] * AT3;
				}
			}
			for (int k = m/4 * 4; k < m; k += 1) {
				ATmatrix =  _mm_load1_ps(A + (j * (n + 1) + (k) * (n)));
				float AT = A[j*(n+1)+k*(n)];
				for( int i = 0; i < n/8 * 8; i+= 8 ) {

					float *temp = C + i + j * n;
					float *tempj = C + i + (j + 1) * n;
					//i = 0
					Cmatrix = _mm_loadu_ps(temp);
				    vect = _mm_mul_ps(_mm_loadu_ps(A + i + (k)*(n)), ATmatrix);
				    Cmatrix = _mm_add_ps(Cmatrix, vect);
				    _mm_storeu_ps(temp, Cmatrix);

				    //i = 1
				    Cmatrix = _mm_loadu_ps((temp) + 4);
				    vect = _mm_mul_ps(_mm_loadu_ps((A + i + (k)*(n)) + 4), ATmatrix);
				    Cmatrix = _mm_add_ps(Cmatrix, vect);
				    _mm_storeu_ps((temp) + 4, Cmatrix);
				}
				for (int i = n/8 * 8; i < n; i += 1) {
					C[i+j*n] += A[i+k*(n)] * AT;
				}
			}
		}
	}
}
Example #29
void inplace_center_and_trace_atom_major(float* coords, float* traces, const int n_frames, const int n_atoms)
{
    /* Center a trajectory containing multiple conformations inplace.
       The coordinates are store in float, but the accumulation is done in
       double.

       Also compute the traces of the centered conformations, which are necessary
       for RMSD.
    */ 
    int i, k;
    float* confp;
    __m128d sx_, sy_, sz_, trace_;
    __m128 mux_, muy_, muz_;
    float sxf, syf, szf;
    double sx[2], sy[2], sz[2], trace[2];
    __m128 x, y, z, x2, y2, z2;

    #ifdef _OPENMP
    #pragma omp parallel for default(none) shared(coords, traces) \
        private(sx_, sy_, sz_, trace_, mux_, muy_, muz_, sxf, syf, szf, \
        confp, i, x, y, z, x2, y2, z2, sx, sy, sz, trace)
    #endif
    for (k = 0; k < n_frames; k++) {
        confp = &coords[k * n_atoms * 3];
        sx_ = sy_ = sz_ = trace_ = _mm_setzero_pd();
        for (i = 0; i < n_atoms/4; i++) {
            aos_deinterleaved_loadu(confp, &x, &y, &z);

            /* accumulate the sums of each coordinate in double */
            /* get the first two values from each float4 */
            sx_ = _mm_add_pd(sx_, _mm_cvtps_pd(x));
            sy_ = _mm_add_pd(sy_, _mm_cvtps_pd(y));
            sz_ = _mm_add_pd(sz_, _mm_cvtps_pd(z));
            /* and shuffle in the second two values */
            sx_ = _mm_add_pd(sx_, _mm_cvtps_pd(_mm_movehl_ps(x, x)));
            sy_ = _mm_add_pd(sy_, _mm_cvtps_pd(_mm_movehl_ps(y, y)));
            sz_ = _mm_add_pd(sz_, _mm_cvtps_pd(_mm_movehl_ps(z, z)));
            confp += 12;
        }
        /* copy the summed coordinates out of the SSE registers */
        _mm_storeu_pd(sx, sx_);
        _mm_storeu_pd(sy, sy_);
        _mm_storeu_pd(sz, sz_);

        /* Add the last couple entries that weren't a factor of four */
        for (i = 0; i < n_atoms % 4; i++) {
            sx[0] += confp[i*3 + 0];
            sy[0] += confp[i*3 + 1];
            sz[0] += confp[i*3 + 2];
        }

        /* Put everything into the first value. We're doing this here, as */
        /* opposed to using a SSE horizontal add. */
        sx[0] += sx[1];
        sy[0] += sy[1];
        sz[0] += sz[1];

        /* Now we want mean x, y, and z positions */
        sx[0] /= n_atoms;
        sy[0] /= n_atoms;
        sz[0] /= n_atoms;

        /* Load these mean positions back into the SSE registers */
        sxf = (float) sx[0];
        syf = (float) sy[0];
        szf = (float) sz[0];
        mux_ = _mm_load1_ps(&sxf);
        muy_ = _mm_load1_ps(&syf);
        muz_ = _mm_load1_ps(&szf);

        /* And subtract them out */
        confp = &coords[k * n_atoms * 3];
        for (i = 0; i < n_atoms/4; i++) {
            aos_deinterleaved_loadu(confp, &x, &y, &z);
            x = _mm_sub_ps(x, mux_);
            y = _mm_sub_ps(y, muy_);
            z = _mm_sub_ps(z, muz_);

            x2 = _mm_mul_ps(x, x);
            y2 = _mm_mul_ps(y, y);
            z2 = _mm_mul_ps(z, z);
            trace_ = _mm_add_pd(trace_, _mm_cvtps_pd(x2));
            trace_ = _mm_add_pd(trace_, _mm_cvtps_pd(y2));
            trace_ = _mm_add_pd(trace_, _mm_cvtps_pd(z2));
            trace_ = _mm_add_pd(trace_, _mm_cvtps_pd(_mm_movehl_ps(x2, x2)));
            trace_ = _mm_add_pd(trace_, _mm_cvtps_pd(_mm_movehl_ps(y2, y2)));
            trace_ = _mm_add_pd(trace_, _mm_cvtps_pd(_mm_movehl_ps(z2, z2)));

            aos_interleaved_storeu(confp, x, y, z);
            confp += 12;
        }
        _mm_storeu_pd(trace, trace_);

        for (i = 0; i < n_atoms % 4; i++) {
            confp[i*3 + 0] -= sxf;
            confp[i*3 + 1] -= syf;
            confp[i*3 + 2] -= szf;
            trace[0] += confp[i*3 + 0]*confp[i*3 + 0];
            trace[0] += confp[i*3 + 1]*confp[i*3 + 1];
            trace[0] += confp[i*3 + 2]*confp[i*3 + 2];
        }
        trace[0] += trace[1];
        if (traces != NULL)
            traces[k] = (float) trace[0];
    }
}
Example #30
void sgemm( int m, int n, float *A, float *C )
{
    __m128 a;
    __m128 a1;
    __m128 a2; 
    __m128 a3;
    __m128 a4;
    __m128 a5;
    
    __m128 b;
    __m128 b1;
    __m128 b2;
    __m128 b3;
    __m128 b4;
    __m128 b5;
    __m128 b6;
    __m128 b7;
    __m128 b8;
    __m128 b9;
    __m128 b10;
    __m128 b11;
    __m128 b12;
    /*
    __m128 b13;
    __m128 b14;
    __m128 b15;
    __m128 b16;
    __m128 b17;
    __m128 b18;
    __m128 b19;
    __m128 b20;
    */
    
    __m128 c;
    __m128 c1;
    __m128 c2;
    __m128 c3;
    __m128 c4;
    
    int i, j, k, l;
    int mod = m%4;
    int end = m/4 * 4;
    int total = n*m;
    float num[4];
    float* A_address;
    float* C_address;
    int m3 = 3 * m;
    int m2 = 2 * m;
    int end1 = total/m3 * m3;
#pragma omp parallel for private(a, a1, a2, a3, b, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, c, c1, c2, c3, c4, i, j, k, l)
    for( i = 0; i < end; i += 4 ){
	for( k = 0; k < end; k += 4 ) {
	    c1 = _mm_setzero_ps();
	    c2 = _mm_setzero_ps();
	    c3 = _mm_setzero_ps();
	    c4 = _mm_setzero_ps();
	    float* A_address1 = A + i;
	    float* A_address2 = A + k;
	    float* A_address21 = A + k + 1;
	    for( j = 0; j < end1; j += m3, A_address1 += m3, A_address2 += m3, A_address21 += m3){
		a1 = _mm_loadu_ps(A_address1);
		a2 = _mm_loadu_ps(A_address1 + m);
		a3 = _mm_loadu_ps(A_address1 + m2);
		
		b1 = _mm_load1_ps(A_address2);
		b2 = _mm_load1_ps(A_address2 + m);
		b3 = _mm_load1_ps(A_address2 + m2);
		/*
		b4 = _mm_load1_ps(A_address2 + m3);
		b5 = _mm_load1_ps(A_address2 + m4);
		*/
		
		b4 = _mm_load1_ps(A_address21);
		b5 = _mm_load1_ps(A_address21 + m);
		b6 = _mm_load1_ps(A_address21 + m2);
		/*
		b9 = _mm_load1_ps(A_address21 + m3);
		b10 = _mm_load1_ps(A_address21 + m4);
		*/
		b7 = _mm_load1_ps(A + k + 2 + j);
		b8 = _mm_load1_ps(A + k + 2 + j + m);
		b9 = _mm_load1_ps(A + k + 2 + j + m2);
		/*
		b14 = _mm_load1_ps(A + k + 2 + j + m3);
		b15 = _mm_load1_ps(A + k + 2 + j + m4);
		*/
		
		b10 = _mm_load1_ps(A + k + 3 + j);
		b11 = _mm_load1_ps(A + k + 3 + j + m);
		b12 = _mm_load1_ps(A + k + 3 + j + m2);
		/*
		b19 = _mm_load1_ps(A + k + 3 + j + m3);
		b20 = _mm_load1_ps(A + k + 3 + j + m4);
		*/
		
		c1 = _mm_add_ps(c1, _mm_mul_ps(a1, b1));
		c1 = _mm_add_ps(c1, _mm_mul_ps(a2, b2));
		c1 = _mm_add_ps(c1, _mm_mul_ps(a3, b3));
		/*
		c1 = _mm_add_ps(c1, _mm_mul_ps(a4, b4));
		c1 = _mm_add_ps(c1, _mm_mul_ps(a5, b5));
		*/
		c2 = _mm_add_ps(c2, _mm_mul_ps(a1, b4));
		c2 = _mm_add_ps(c2, _mm_mul_ps(a2, b5));
		c2 = _mm_add_ps(c2, _mm_mul_ps(a3, b6));
		/*
		c2 = _mm_add_ps(c2, _mm_mul_ps(a4, b9));
		c2 = _mm_add_ps(c2, _mm_mul_ps(a5, b10));
		*/
		c3 = _mm_add_ps(c3, _mm_mul_ps(a1, b7));
		c3 = _mm_add_ps(c3, _mm_mul_ps(a2, b8));
		c3 = _mm_add_ps(c3, _mm_mul_ps(a3, b9));
		/*
		c3 = _mm_add_ps(c3, _mm_mul_ps(a4, b14));
		c3 = _mm_add_ps(c3, _mm_mul_ps(a5, b15));
		*/
		
		c4 = _mm_add_ps(c4, _mm_mul_ps(a1, b10));
		c4 = _mm_add_ps(c4, _mm_mul_ps(a2, b11));
		c4 = _mm_add_ps(c4, _mm_mul_ps(a3, b12));
		/*
		c4 = _mm_add_ps(c4, _mm_mul_ps(a4, b19));
		c4 = _mm_add_ps(c4, _mm_mul_ps(a5, b20));
		*/
		
	    }
	    for( j = end1; j < total; j += m){
		a = _mm_loadu_ps(A + i + j);
		
		b1 = _mm_load1_ps(A + k + j);
		b2 = _mm_load1_ps(A + k + 1 + j);
		b3 = _mm_load1_ps(A + k + 2 + j);
		b4 = _mm_load1_ps(A + k + 3 + j);
		
		c1 = _mm_add_ps(c1, _mm_mul_ps(a, b1));
		c2 = _mm_add_ps(c2, _mm_mul_ps(a, b2));
		c3 = _mm_add_ps(c3, _mm_mul_ps(a, b3));
		c4 = _mm_add_ps(c4, _mm_mul_ps(a, b4));
	    }
	    _mm_storeu_ps(C + i + (k)*m, c1);
	    _mm_storeu_ps(C + i + (k+1)*m, c2);
	    _mm_storeu_ps(C + i + (k+2)*m, c3);
	    _mm_storeu_ps(C + i + (k+3)*m, c4);
	}
	for(k = end; k < m; k++){
	    float* A_address1 = A + i;
	    float* A_address2 = A + k;
	    c = _mm_setzero_ps();
	    for( j = 0; j < end1; j += m3, A_address1 += m3, A_address2 += m3){
		a1 = _mm_loadu_ps(A_address1);
		a2 = _mm_loadu_ps(A + i + j + m);
		a3 = _mm_loadu_ps(A + i + j + m2);
		
		b1 = _mm_load1_ps(A_address2);
		b2 = _mm_load1_ps(A + k + j + m);
		b3 = _mm_load1_ps(A + k + j + m2);
		
		c = _mm_add_ps(c, _mm_mul_ps(a1, b1));
		c = _mm_add_ps(c, _mm_mul_ps(a2, b2));
		c = _mm_add_ps(c, _mm_mul_ps(a3, b3));
	    }
	    for( j = end1; j < total; j += m){
		a = _mm_loadu_ps(A + i + j);
		
		b = _mm_load1_ps(A + k + j);
		
		c = _mm_add_ps(c, _mm_mul_ps(a, b));
	    }
	    _mm_storeu_ps(C + i + k*m, c);
	}
    }
    if (mod != 0){
	if (mod == 3){
	    for( i = end; i < m; i +=4 ){
		for( k = 0; k < m; k++ ) {
		    A_address = A + i;
		    c = _mm_setzero_ps();
		    for( j = 0; j < total; j += m ) {
			a = _mm_setr_ps(*(A_address),*(A_address + 1),*(A_address + 2), 0);
			b = _mm_load1_ps(A + k + j);
			c = _mm_add_ps(c, _mm_mul_ps(a, b));
			A_address += m;
		    }
		    _mm_storeu_ps(num, c);
		    for (l = 0; l < 3; l ++){
			*(C + i + k*m + l) = num[l];
		    }
		}
	    }
	}
	else if (mod == 2){
	    for( i = end; i < m; i +=4 ){
		for( k = 0; k < m; k++ ) {
		    A_address = A + i;
		    c = _mm_setzero_ps();
		    for( j = 0; j < total; j += m ) {
			a = _mm_setr_ps(*(A_address),*(A_address + 1),0 ,0);
			b = _mm_load1_ps(A + k + j);
			c = _mm_add_ps(c, _mm_mul_ps(a, b));
			A_address += m;
		    }
		    _mm_storeu_ps(num, c);
		    for (l = 0; l < 2; l ++){
			*(C + i + k*m + l) = num[l];
		    }
		}
	    }
	}
	else if (mod == 1){
	    for( i = end; i < m; i +=4 ){
		for( k = 0; k < m; k++ ) {
		    A_address = A + i;
		    c = _mm_setzero_ps();
		    for( j = 0; j < total; j += m ) {
			a = _mm_setr_ps(*(A_address), 0, 0, 0);
			b = _mm_load1_ps(A + k + j);
			c = _mm_add_ps(c, _mm_mul_ps(a, b));
			A_address += m;
		    }
		    _mm_storeu_ps(num, c);
		    for (l = 0; l < 1; l ++){
			*(C + i + k*m + l) = num[l];
		    }
		}
	    }
	}
    }
}