Example #1
#include <stdio.h>
#include <emmintrin.h> /* SSE2 intrinsics */

int main( int argc, char **argv ) {
    /* set   A  =   |1 3|,     B  =   |3 0|       C =   |0 0|
                    |2 4|             |0 2|             |0 0|  */
    double A[4] = {1,2,3,4}, B[4] = {3,0,0,2}, C[4] = {0,0,0,0};

    /*   We are computing C = C + A x B, which means:
         C[0] += A[0]*B[0] + A[2]*B[1]
         C[1] += A[1]*B[0] + A[3]*B[1]
         C[2] += A[0]*B[2] + A[2]*B[3]
         C[3] += A[1]*B[2] + A[3]*B[3] */

    /* load entire matrix C into SIMD variables */
    __m128d c1 = _mm_loadu_pd( C+0 ); /* c1 = (C[0],C[1]) */
    __m128d c2 = _mm_loadu_pd( C+2 ); /* c2 = (C[2],C[3]) */

    for( int i = 0; i < 2; i++ ) {
        __m128d a  = _mm_loadu_pd( A+i*2 ); /* load next column of A */
        __m128d b1 = _mm_load1_pd( B+0+i );
        __m128d b2 = _mm_load1_pd( B+2+i ); /* load next row of B */

        c1 = _mm_add_pd( c1, _mm_mul_pd( a, b1 ) ); /* multiply and add */
        c2 = _mm_add_pd( c2, _mm_mul_pd( a, b2 ) );
    }

    /* store the result back to the C array */
    _mm_storeu_pd( C+0, c1 ); /* (C[0],C[1]) = c1 */
    _mm_storeu_pd( C+2, c2 ); /* (C[2],C[3]) = c2 */

    /* output whatever we've got */
    printf( "|%g %g| * |%g %g| = |%g %g|\n", A[0], A[2], B[0], B[2], C[0], C[2] );
    printf( "|%g %g|   |%g %g|   |%g %g|\n", A[1], A[3], B[1], B[3], C[1], C[3] );

    return 0;
}
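For comparison, a minimal scalar sketch of the same column-major C += A*B update (not part of the original example; works for any 2x2 matrices laid out as above):

static void matmul2x2_ref( const double A[4], const double B[4], double C[4] ) {
    /* column-major: element (row i, col j) lives at index i + 2*j */
    for (int j = 0; j < 2; j++)
        for (int k = 0; k < 2; k++)
            for (int i = 0; i < 2; i++)
                C[i + 2*j] += A[i + 2*k] * B[k + 2*j];
}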
Example #2
static void
scalarmultiply_f64_ns_sse2_unroll2 (double *dest, double *src1, double *val, int n)
{
  __m128d xmm1;

  /* Initial operations to align the destination pointer */
  for (; ((long)dest & 15) && (n > 0); n--) {
    *dest++ = *src1++ * *val;
  }
  xmm1 = _mm_load_pd1(val);
  for (; n >= 4; n -= 4) {
    __m128d xmm0;
    xmm0 = _mm_loadu_pd(src1);
    xmm0 = _mm_mul_pd(xmm0, xmm1);
    _mm_store_pd(dest, xmm0);
    xmm0 = _mm_loadu_pd(src1 + 2);
    xmm0 = _mm_mul_pd(xmm0, xmm1);
    _mm_store_pd(dest + 2, xmm0);
    dest += 4;
    src1 += 4;
  }
  for (; n > 0; n--) {
    *dest++ = *src1++ * *val;
  }
}
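A minimal usage sketch (hypothetical buffers; only the function above is taken from the listing):

    double src[5] = {1, 2, 3, 4, 5}, dst[5], factor = 2.5;
    scalarmultiply_f64_ns_sse2_unroll2(dst, src, &factor, 5);
    /* dst now holds {2.5, 5.0, 7.5, 10.0, 12.5}; a few leading elements are
       handled by the scalar peel loop until dst is 16-byte aligned, the rest
       in vector pairs, and any leftovers by the final scalar loop */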
Example #3
// multiply *p by v and applied to all n
COREARRAY_DLL_DEFAULT void vec_f64_mul(double *p, size_t n, double v)
{
#if defined(COREARRAY_SIMD_AVX)

	const __m256d v4 = _mm256_set1_pd(v);

	switch ((size_t)p & 0x1F)
	{
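	/* The case labels below fall through on purpose: they peel off 1-3
	   leading elements until p reaches 32-byte alignment for the aligned
	   AVX loop under case 0x00. */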
	case 0x08:
		if (n > 0) { (*p++) *= v; n--; }
	case 0x10:
		if (n > 0) { (*p++) *= v; n--; }
	case 0x18:
		if (n > 0) { (*p++) *= v; n--; }
	case 0x00:
		for (; n >= 4; n-=4)
		{
			_mm256_store_pd(p, _mm256_mul_pd(_mm256_load_pd(p), v4));
			p += 4;
		}
		if (n >= 2)
		{
			_mm_store_pd(p, _mm_mul_pd(_mm_load_pd(p), _mm256_castpd256_pd128(v4)));
			p += 2; n -= 2;
		}
		break;
	default:
		for (; n >= 4; n-=4)
		{
			_mm256_storeu_pd(p, _mm256_mul_pd(_mm256_loadu_pd(p), v4));
			p += 4;
		}
		if (n >= 2)
		{
			_mm_storeu_pd(p, _mm_mul_pd(_mm_loadu_pd(p), _mm256_castpd256_pd128(v4)));
			p += 2; n -= 2;
		}
	}

#elif defined(COREARRAY_SIMD_SSE2)

	const __m128d v2 = _mm_set1_pd(v);

	switch ((size_t)p & 0x0F)
	{
	case 0x08:
		if (n > 0) { (*p++) *= v; n--; }
	case 0x00:
		for (; n >= 2; n-=2, p+=2)
			_mm_store_pd(p, _mm_mul_pd(_mm_load_pd(p), v2));
		break;
	default:
		for (; n >= 2; n-=2, p+=2)
			_mm_storeu_pd(p, _mm_mul_pd(_mm_loadu_pd(p), v2));
	}

#endif

	for (; n > 0; n--) (*p++) *= v;
}
Example #4
// *p += (*s) * v
COREARRAY_DLL_DEFAULT double *vec_f64_addmul(double *p, const double *s,
	size_t n, double v)
{
#if defined(COREARRAY_SIMD_SSE2)

	const __m128d v2 = _mm_set1_pd(v);

	switch ((size_t)p & 0x0F)
	{
	case 0x08:
		if (n > 0) { (*p++) += (*s++) * v; n--; }
	case 0x00:
		for (; n >= 2; n -= 2)
		{
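			/* p is 16-byte aligned in this branch, but s may not be, hence loadu for s */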
			_mm_store_pd(p, _mm_add_pd(_mm_load_pd(p),
				_mm_mul_pd(_mm_loadu_pd(s), v2)));
			p += 2; s += 2;
		}
		break;
	default:
		for (; n >= 2; n-=2)
		{
			_mm_storeu_pd(p, _mm_add_pd(_mm_loadu_pd(p),
				_mm_mul_pd(_mm_loadu_pd(s), v2)));
			p += 2; s += 2;
		}
	}

#endif

	for (; n > 0; n--) (*p++) += (*s++) * v;
	return p;
}
Example #5
void sgemm( int m, int n, double *A, double *C ) {
    /* Computes C = A * A^T for an m x n matrix A (column-major, A[i + k*m]).
       Assumes transpose(m, n, A, B) stores B = A^T row-contiguously, i.e.
       B[k + r*n] == A[r + k*m], so each dot product reads B with unit stride. */
    int i, j, k, jtn, ceiling;
    double B[n * m];
    double buf[2];
    __m128d sum, ab, cd, ef, AB, CD, EF;
    transpose(m, n, A, B);
    for (i = 0; i < m; i += 1) {
        for (j = 0; j < m; j += 1) {
            jtn = j * n;
            sum = _mm_setzero_pd(); /* reset the accumulator for C[i][j] */
            for (k = 0, ceiling = n - 5; k < ceiling; k += 6) {
                ab = _mm_loadu_pd(B + k + i * n);
                cd = _mm_loadu_pd(B + k + 2 + i * n);
                ef = _mm_loadu_pd(B + k + 4 + i * n);
                AB = _mm_loadu_pd(B + k + jtn);
                CD = _mm_loadu_pd(B + k + 2 + jtn);
                EF = _mm_loadu_pd(B + k + 4 + jtn);
                sum = _mm_add_pd(sum, _mm_mul_pd(ab, AB));
                sum = _mm_add_pd(sum, _mm_mul_pd(cd, CD));
                sum = _mm_add_pd(sum, _mm_mul_pd(ef, EF));
            }
            _mm_storeu_pd(buf, sum);
            C[i + j * m] = buf[0] + buf[1]; /* horizontal sum of the two lanes */
            /* scalar cleanup for the n % 6 leftover columns */
            for ( ; k < n; k += 1) {
                C[i + j * m] += A[i + k * m] * B[k + jtn];
            }
        }
    }
}
Example #6
static void
filterButter(const Float_t* input, Float_t* output, size_t nSamples, const Float_t* kernel)
{   
#ifdef HAVE_SSE2
    __m128d __kernel, __result, __temp;
    __declspec(align(16)) Float_t __temp2[2];

    while (nSamples--) {
        __kernel = _mm_loadr_pd(&kernel[0]);
        __temp = _mm_loadu_pd(&input[-1]);
        __result = _mm_mul_pd(__temp, __kernel);
        __kernel = _mm_loadr_pd(&kernel[4]);
        __temp = _mm_loadu_pd(&output[-2]);
        __temp = _mm_mul_pd(__kernel, __temp);
        __result = _mm_sub_pd(__result, __temp);
        _mm_store_pd(__temp2, __result);
        *output = __temp2[0]
                + __temp2[1]
                + input [-2] * kernel[2];
        ++output;
        ++input;
    }
#else
    while (nSamples--) {
        *output =  
               input [0]  * kernel[0] - output[-1] * kernel[1]
             + input [-1] * kernel[2] - output[-2] * kernel[3]
             + input [-2] * kernel[4];
        ++output;
        ++input;
    }
#endif
}
Example #7
/* use compiler intrinsics for 2x parallel processing */
static inline double chi2_intrinsic_double(int n, const double* x, const double* y) {
    double result=0;
    const __m128d eps = _mm_set1_pd(DBL_MIN);
    const __m128d zero = _mm_setzero_pd();
    __m128d chi2 = _mm_setzero_pd();    

    for ( ; n>1; n-=2) {
        const __m128d a = _mm_loadu_pd(x);
        const __m128d b = _mm_loadu_pd(y);
	x+=2;
	y+=2;
        const __m128d a_plus_b = _mm_add_pd(a,b);
        const __m128d a_plus_b_plus_eps = _mm_add_pd(a_plus_b,eps);
        const __m128d a_minus_b = _mm_sub_pd(a,b);
        const __m128d a_minus_b_sq = _mm_mul_pd(a_minus_b, a_minus_b);
        const __m128d quotient = _mm_div_pd(a_minus_b_sq, a_plus_b_plus_eps);
        chi2 = _mm_add_pd(chi2, quotient);
    }
    const __m128d shuffle = _mm_shuffle_pd(chi2, chi2, _MM_SHUFFLE2(0,1));
    const __m128d sum = _mm_add_pd(chi2, shuffle);
// with SSE3, we could use hadd_pd, but the difference is negligible 
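//   e.g. (SSE3, <pmmintrin.h>):  chi2 = _mm_hadd_pd(chi2, chi2);  _mm_store_sd(&result, chi2);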

    _mm_store_sd(&result,sum);
    _mm_empty();
    if (n)
        result += chi2_baseline_double(n, x, y); // remaining entries
    return result;
}
Example #8
static inline void
inner_product_gdouble_linear_1_sse2 (gdouble * o, const gdouble * a,
    const gdouble * b, gint len, const gdouble * icoeff, gint bstride)
{
  gint i = 0;
  __m128d sum[2], t;
  const gdouble *c[2] = { (gdouble *) ((gint8 *) b + 0 * bstride),
    (gdouble *) ((gint8 *) b + 1 * bstride)
  };

  sum[0] = sum[1] = _mm_setzero_pd ();

  for (; i < len; i += 4) {
    t = _mm_loadu_pd (a + i + 0);
    sum[0] = _mm_add_pd (sum[0], _mm_mul_pd (t, _mm_load_pd (c[0] + i + 0)));
    sum[1] = _mm_add_pd (sum[1], _mm_mul_pd (t, _mm_load_pd (c[1] + i + 0)));
    t = _mm_loadu_pd (a + i + 2);
    sum[0] = _mm_add_pd (sum[0], _mm_mul_pd (t, _mm_load_pd (c[0] + i + 2)));
    sum[1] = _mm_add_pd (sum[1], _mm_mul_pd (t, _mm_load_pd (c[1] + i + 2)));
  }
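  /* linear interpolation between the two filter phases:
     sum = sum[1] + icoeff[0] * (sum[0] - sum[1]) */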
  sum[0] = _mm_mul_pd (_mm_sub_pd (sum[0], sum[1]), _mm_load1_pd (icoeff));
  sum[0] = _mm_add_pd (sum[0], sum[1]);
  sum[0] = _mm_add_sd (sum[0], _mm_unpackhi_pd (sum[0], sum[0]));
  _mm_store_sd (o, sum[0]);
}
Example #9
double	vector_ps_double (const double* pa,const double* pb,size_t n)
{
    size_t k;
    /* multiply 4 doubles at a time */
    size_t q = n / 4;
    size_t r = n % 4;
    double w;
    _mm_prefetch (pa,_MM_HINT_NTA);
    _mm_prefetch (pb,_MM_HINT_NTA);
    if (q > 0) {
	__m128d acc1 = _mm_setzero_pd();
	__m128d acc2 = _mm_setzero_pd();
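	/* two independent accumulators break the addpd dependency chain between iterations */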
	if (ALGEBRA_IS_ALIGNED(pa) && ALGEBRA_IS_ALIGNED(pb)) {
	    for (k=0;k<q;k++) {
		/* Load 2 doubles from each array */
		__m128d i1 = _mm_load_pd(pa);
		__m128d j1 = _mm_load_pd(pb);
		__m128d i2 = _mm_load_pd(pa+2);
		__m128d j2 = _mm_load_pd(pb+2);
		/* advance by 4 doubles in total (2 for i and 2 for j) */
		/* Multiply */
		__m128d s1 = _mm_mul_pd(i1,j1);
		__m128d s2 = _mm_mul_pd(i2,j2);
		pa += 4;
		pb += 4;
		/* Accumulate */
		acc1 = _mm_add_pd(acc1,s1);
		acc2 = _mm_add_pd(acc2,s2);
	    }
	}
	else {
	    for (k=0;k<q;k++) {
		/* Load 2 doubles from each array */
		__m128d i1 = _mm_loadu_pd(pa);
		__m128d j1 = _mm_loadu_pd(pb);
		__m128d i2 = _mm_loadu_pd(pa+2);
		__m128d j2 = _mm_loadu_pd(pb+2);
		/* Multiply */
		__m128d s1 = _mm_mul_pd(i1,j1);
		__m128d s2 = _mm_mul_pd(i2,j2);
		pa += 4;
		pb += 4;
		/* Accumulate */
		acc1 = _mm_add_pd(acc1,s1);
		acc2 = _mm_add_pd(acc2,s2);
	    }
	}
	/* Final sum */
	acc1 = _mm_add_pd(acc1,acc2);
	acc1 = _mm_hadd_pd(acc1,acc1);
	_mm_store_sd(&w,acc1);
    }
    else {
	w = 0;
    }
    for (k=0;k<r;k++)
	w += (*pa++) * (*pb++);
    return w;
}
Example #10
value complex_add(value vx, value vy)
{
    CAMLparam2(vx, vy);
    CAMLlocal1(vz);
    vz = caml_alloc(Double_array_tag, 2);
    _mm_storeu_pd((double*) vz,
                  _mm_loadu_pd((double const*) vx) + _mm_loadu_pd((double const*) vy));
    CAMLreturn(vz);
}
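Note that the `+` between the two _mm_loadu_pd results relies on the GCC/Clang vector-extension operators on __m128d; written with intrinsics only, the store line would be:

    _mm_storeu_pd((double*) vz,
                  _mm_add_pd(_mm_loadu_pd((double const*) vx),
                             _mm_loadu_pd((double const*) vy)));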
Example #11
void
mlib_FIR_tap2f_d64s(
    mlib_d64 *pdst,
    const mlib_d64 *psrc,
    mlib_d64 *pflt,
    mlib_s32 n)
{
	mlib_s32 j;
	mlib_d64 src1_1, src2_1;
	mlib_d64 src1_2, src2_2;
	mlib_d64 dflt1 = pflt[0], dflt2 = pflt[1];
	__m128d sdflt1, sdflt2, ssrc1, ssrc2, smul1, smul2;

	sdflt2 = _mm_set1_pd(dflt2);
	sdflt1 = _mm_set1_pd(dflt1);

	if ((mlib_addr)psrc & 15) {

#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
		for (j = 0; j < n; j++) {
			ssrc1 = _mm_loadu_pd(psrc);
			ssrc2 = _mm_loadu_pd(psrc + 2);

			smul1 = _mm_mul_pd(sdflt2, ssrc1);
			smul2 = _mm_mul_pd(sdflt1, ssrc2);

			smul1 = _mm_add_pd(smul1, smul2);

			_mm_storeu_pd(pdst, smul1);

			psrc += 2;
			pdst += 2;
		}
	} else {

#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
		for (j = 0; j < n; j++) {
			ssrc1 = _mm_load_pd(psrc);
			ssrc2 = _mm_load_pd(psrc + 2);

			smul1 = _mm_mul_pd(sdflt2, ssrc1);
			smul2 = _mm_mul_pd(sdflt1, ssrc2);

			smul1 = _mm_add_pd(smul1, smul2);

			_mm_storeu_pd(pdst, smul1);

			psrc += 2;
			pdst += 2;
		}
	}
}
Example #12
static void
sse3_test_haddpd (double *i1, double *i2, double *r)
{
  __m128d t1 = _mm_loadu_pd (i1);
  __m128d t2 = _mm_loadu_pd (i2);

  t1 = _mm_hadd_pd (t1, t2);
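  /* t1 now holds (i1[0] + i1[1], i2[0] + i2[1]) */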

  _mm_storeu_pd (r, t1);
}
Example #13
File: dgemm-sse.c  Project: WillCh/cs267
/** This function uses SSE to implement the multiplication. **/
void square_dgemm(int lda, double* A, double* B, double* C) {
    // define the variable here

    register __m128d cTmp, aTmp, bTmp; 

    for (int j = 0; j < lda; j++) {
        for (int k = 0; k < lda; k++) {
            // broadcast B's value into both lanes of bTmp
            bTmp = _mm_load1_pd(B + k + j*lda);
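            /* the i-loops below compute C[:, j] += B[k + j*lda] * A[:, k]
               (an axpy) with unit-stride access to both A and C */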

            double* adda_mid = A + k*lda;
            double* addc_mid = C + j*lda;
            for (int i = 0; i < lda/8*8; i += 8) {
                double* adda = adda_mid + i;
                double* addc = addc_mid + i;
                
                aTmp = _mm_loadu_pd(adda);
                cTmp = _mm_loadu_pd(addc);
                cTmp = _mm_add_pd(cTmp, _mm_mul_pd(bTmp, aTmp));
                _mm_storeu_pd(addc, cTmp);

                aTmp = _mm_loadu_pd(adda + 2);
                cTmp = _mm_loadu_pd(addc + 2);
                cTmp = _mm_add_pd(cTmp, _mm_mul_pd(bTmp, aTmp));
                _mm_storeu_pd((addc + 2), cTmp);

                aTmp = _mm_loadu_pd(adda + 4);
                cTmp = _mm_loadu_pd(addc + 4);
                cTmp = _mm_add_pd(cTmp, _mm_mul_pd(bTmp, aTmp));
                _mm_storeu_pd((addc + 4), cTmp);

                aTmp = _mm_loadu_pd(adda + 6);
                cTmp = _mm_loadu_pd(addc + 6);
                cTmp = _mm_add_pd(cTmp, _mm_mul_pd(bTmp, aTmp));
                _mm_storeu_pd((addc + 6), cTmp);
            }

            for (int i = lda/8*8; i < lda/2*2; i += 2) {
                double* adda = adda_mid + i;
                double* addc = addc_mid + i;
                
                aTmp = _mm_loadu_pd(adda);
                cTmp = _mm_loadu_pd(addc);
                cTmp = _mm_add_pd(cTmp, _mm_mul_pd(bTmp, aTmp));
                _mm_storeu_pd(addc, cTmp);
            }

            // the last case
            for (int i = lda/2*2; i < lda; i ++) {
                C[i + j*lda] += A[i + k*lda] * B[k+j*lda];

            }
        }
    }
}
Example #14
static void
sse3_test_movddup_reg (double *i1, double *r)
{
  __m128d t1 = _mm_loadu_pd (i1);
  __m128d t2 = _mm_loadu_pd (&cnst1[0]);

  t1  = _mm_mul_pd (t1, t2);
  t2  = _mm_movedup_pd (t1);

  _mm_storeu_pd (r, t2);
}
Example #15
value complex_mul(value vab, value vcd)
{
    CAMLparam2(vab, vcd);
    CAMLlocal1(vz);
    vz = caml_alloc(Double_array_tag, 2);
    __m128d ab, cd, aa, bb, dc, ac_ad, bd_bc;
    ab = _mm_loadu_pd((double const*) vab);   /* (a, b) */
    cd = _mm_loadu_pd((double const*) vcd);   /* (c, d) */
    aa    = _mm_unpacklo_pd(ab, ab);          /* (a, a) */
    bb    = _mm_unpackhi_pd(ab, ab);          /* (b, b) */
    dc    = _mm_shuffle_pd(cd, cd, 1);        /* (d, c) */
    ac_ad = _mm_mul_pd(aa, cd);               /* (a*c, a*d) */
    bd_bc = _mm_mul_pd(bb, dc);               /* (b*d, b*c) */
    /* (a + bi)(c + di) = (a*c - b*d) + (a*d + b*c)i */
    _mm_storeu_pd((double*) vz, _mm_addsub_pd(ac_ad, bd_bc));
    CAMLreturn(vz);
}
Example #16
void transpose_misaligned(double *a, double *b, int N1, int N2, double factor) {

    int i,j,k,k1,it,jt,itt,jtt,it_bound,jt_bound,itt_bound,jtt_bound;
    int conflict,tmp,tmpN,offset,line_offset,setnum,set[8192/(4*sizeof(double))];
    double *pA, *pB;


    register __m128d x, y, z, w, t, t1,fac_vector;

    fac_vector = _mm_load_sd(&factor);
    fac_vector = _mm_unpacklo_pd(fac_vector,fac_vector);

    itt_bound = (N1/tilesize)*tilesize;
    for (itt = 0; itt < itt_bound; itt=itt+5*tilesize) {
        jtt_bound =(N2/tilesize)*tilesize;
        for (jtt = 0; jtt < jtt_bound; jtt=jtt+5*tilesize) {
            it_bound = (itt+5*tilesize > itt_bound)?itt_bound:itt+5*tilesize;
            for (it = itt; it < it_bound; it = it+tilesize) {
                jt_bound = (jtt+5*tilesize>jtt_bound)?jtt_bound:jtt+5*tilesize;
                for (jt = jtt; jt < jt_bound; jt = jt+tilesize) {
                    k = 0;
                    for (j = jt; j < jt+tilesize; j=j+2) {
                        for (i = it; i < it+tilesize; i=i+2) {
                            pA = a+i*N2+j;
                            pB = b+j*N1+i;
                            x = _mm_loadu_pd(pA);
                            x = _mm_mul_pd(x,fac_vector);
                            y = _mm_loadu_pd(pA + N2);
                            y = _mm_mul_pd(y,fac_vector);
                            z = _mm_shuffle_pd( x, y, 0);
                            w = _mm_shuffle_pd( x, y, 3);
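                            /* z = (x[0], y[0]) and w = (x[1], y[1]): the 2x2 block is transposed */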
                            _mm_storeu_pd(pB,z);
                            _mm_storeu_pd(pB + N1,w);
                        }
                    }
                }
            }
        }
        for (i = itt; i < itt+5*tilesize && i < itt_bound; i++) {
            for (j = jtt_bound; j < N2; j++) {
                b[j*N1+i] = factor * a[i*N2+j];
            }
        }
    }
    for (i = itt_bound; i < N1; i++) {
        for (j = 0; j < N2; j++) {
            b[j*N1+i] = factor * a[i*N2+j];
        }
    }
}
Example #17
static double* copy_block(int lda, int M, int N, double* A, double* new_A) {

    int M_even = turn_even(M);
    int N_even = turn_even(N);
    int i_step;
    __m128d a;

    for (int j=0; j<N; j++) {
        for (int i=0; i<M; i+=I_STRIDE) {
            i_step = min(I_STRIDE, M-i);
            if (i_step==1) {            
                new_A[i+j*M_even] = A[i+j*lda];
            } else {
                a = _mm_loadu_pd(A+i+j*lda);
                _mm_store_pd(new_A+i+j*M_even, a);
            }
        }
    }
    if (N % 2) {
        for (int i=0; i<M_even; i++) {
            new_A[i+(N_even-1)*M_even] = 0.0;
        }
    } 
    return new_A;
}
Example #18
static void
TEST (void)
{
  union128d s1;
  union128 u, s2;
  double source1[2] = {123.345, 67.3321};
  float  e[4] = {5633.098, 93.21, 3.34, 4555.2};

  s1.x = _mm_loadu_pd (source1);
  s2.x = _mm_loadu_ps (e);

  __asm("" : "+v"(s1.x), "+v"(s2.x));
  u.x = test(s2.x, s1.x);

  e[0] = (float)source1[0];

  if (check_union128(u, e))
#if DEBUG
  {
      printf ("sse2_test_cvtsd2ss_1; check_union128 failed\n");
      printf ("\t [%f,%f,%f,%f],[%f,%f]\n", s2.a[0], s2.a[1], s2.a[2], s2.a[3],
    		  s1.a[0], s1.a[1]);
      printf ("\t -> \t[%f,%f,%f,%f]\n", u.a[0], u.a[1], u.a[2], u.a[3]);
      printf ("\texpect\t[%f,%f,%f,%f]\n", e[0], e[1], e[2], e[3]);
  }
#else
    abort ();
#endif
}
Example #19
File: clamp_sse.c  Project: thewb/mokoiax
static void
clamphigh_f64_sse (double *dest, const double *src1, int n, const double *src2_1)
{
  __m128d xmm1;
  double max = *src2_1;

  /* Initial operations to align the destination pointer */
  for (; ((long)dest & 15) && (n > 0); n--) {
    double x = *src1++;
    if (x > max)
      x = max;
    *dest++ = x;
  }
  xmm1 = _mm_set1_pd(max);
  for (; n >= 2; n -= 2) {
    __m128d xmm0;
    xmm0 = _mm_loadu_pd(src1);
    xmm0 = _mm_min_pd(xmm0, xmm1);
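    /* min(x, max) replaces any element above max with max */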
    _mm_store_pd(dest, xmm0);
    dest += 2;
    src1 += 2;
  }
  for (; n > 0; n--) {
    double x = *src1++;
    if (x > max)
      x = max;
    *dest++ = x;
  }
}
Example #20
void trigo_vsin_vml_sse2(double* dst, const double* src, size_t length) {
    size_t i = length;

    while (i) {
        if (!SimdUtils::isAligned(dst, 16) || i == 1) {
            __m128d d = _mm_load_sd(src);
            _mm_store_sd(dst, sin_vml_pd(d));

            dst++;
            src++;

            if (--i == 0)
                break;
        }

        while (i >= 2) {
            __m128d d = _mm_loadu_pd(src);
            _mm_store_pd(dst, sin_vml_pd(d));

            dst += 2;
            src += 2;
            i -= 2;
        }
    }
}
Example #21
__m128d test_mm_loadu_pd(double const* A) {
  // DAG-LABEL: test_mm_loadu_pd
  // DAG: load <2 x double>, <2 x double>* %{{.*}}, align 1
  //
  // ASM-LABEL: test_mm_loadu_pd
  // ASM: movupd
  return _mm_loadu_pd(A);
}
Example #22
static void
sse3_test_movddup_reg_subsume_unaligned (double *i1, double *r)
{
  __m128d t1 = _mm_loadu_pd (i1);
  __m128d t2 = _mm_movedup_pd (t1);

  _mm_storeu_pd (r, t2);
}
Example #23
/* This function mallocs a new aligned block of memory for the matrix and
 * then copies the original values into it. The new matrix's size is
 * a multiple of 8, which makes it easier to handle the boundary.
 * The new matrix's layout is:
 *		[[C O],
 *		 [O O]]
 * */
double* matrix_padding(double* old_matrix, int old_size, int new_size){
	double* new_matrix;
	/* Allocate aligned space according to the new size*/
	posix_memalign((void**)&new_matrix, 16, sizeof(double)*new_size*new_size);
	/* Copy data.
	 * Handle odd/even old size sepatately to avoid if-branches in
	 * any loops.
	 */
	if(old_size%2 == 1) {
		for(int i=0; i<old_size; i++) {
			for(int j=0; j<old_size - 1; j+=2) {
				__m128d v1 = _mm_loadu_pd(old_matrix + i*old_size + j);
				_mm_store_pd(new_matrix + i*new_size + j, v1);
			}
			new_matrix[i*new_size+old_size-1]=old_matrix[(i+1)*old_size-1];
			for(int j=old_size; j<new_size; j++) {
				new_matrix[i*new_size + j] = 0;
			}
	     }
	}else {
	    for(int i=0; i<old_size; i++) {
			for(int j=0; j<old_size; j+=2) {
				__m128d v1 = _mm_loadu_pd(old_matrix + i*old_size + j);
				_mm_store_pd(new_matrix + i*new_size + j, v1);
			}
			for(int j=old_size; j<new_size; j++) {
				new_matrix[i*new_size + j] = 0;
			}
		}
	}
	/* Set extra space with ZERO. */
	__m128d v_zero = _mm_setzero_pd();
	for(int i=old_size; i<new_size; i++) {
		double* addr = new_matrix + i * new_size;
		/* new_size is a multiple of 8, so step by 8; stepping by 10 could write past the last row */
		for(int j=0; j<new_size; j+=8) {
			_mm_store_pd(addr+j, v_zero);
			_mm_store_pd(addr+j+2, v_zero);
			_mm_store_pd(addr+j+4, v_zero);
			_mm_store_pd(addr+j+6, v_zero);
		}
	}
	return new_matrix;
}
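A hypothetical usage sketch (the matrix A and the sizes are illustrative, not from the original project):

	int old_size = 5, new_size = 8;   /* next multiple of 8 */
	double *padded = matrix_padding(A, old_size, new_size);
	/* ... operate on the aligned, zero-padded 8x8 copy ... */
	free(padded);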
Example #24
ALGEBRA_INLINE void		vector_addm_double_aligned_32 (double* v1,double lambda,const double* v2,size_t n)
{
	size_t k;
	
	__m128d l1 = _mm_load1_pd(&lambda);

	size_t q = n / 2;
	size_t r = n % 2;
	if(q > 0) {
		if (ALGEBRA_IS_ALIGNED(v1) && ALGEBRA_IS_ALIGNED(v2)) {
			for (k=0;k<q;k++) {
				/* Load 2 values from each array */
				__m128d i1 = _mm_load_pd(v1);
				__m128d j1 = _mm_load_pd(v2);
				/* multiply */
					   j1 = _mm_mul_pd(j1, l1);
				/* add */
				i1 = _mm_add_pd(i1,j1);
				/* Store */
				_mm_store_pd(v1, i1);
				v1 += 2;
				v2 += 2;
			}
		}
		else {		
			for (k=0;k<q;k++) {
				/* Load 2 values from each array */
				__m128d i1 = _mm_loadu_pd(v1);
				__m128d j1 = _mm_loadu_pd(v2);
					   j1 = _mm_mul_pd(j1, l1);
				/* Add */
				i1 = _mm_add_pd(i1,j1);
				/* Store */
				_mm_storeu_pd(v1, i1);
				v1 += 2;
				v2 += 2;
			}
		}
	}
	
	for(k = 0 ; k<r ; k++)
		v1[k] += lambda*v2[k];

}
Example #25
SSE_FUNCTION static void
add_f64_sse2 (double *dest, double *src1, double *src2, int n)
{
    __m128d xmm0, xmm1;
    while (((long)dest & 15) && (0 < n)) {
        *dest++ = *src1++ + *src2++;
        n--;
    }
    while (1 < n) {
        xmm0 = _mm_loadu_pd(src1);
        xmm1 = _mm_loadu_pd(src2);
        xmm0 = _mm_add_pd(xmm0, xmm1);
        _mm_store_pd(dest, xmm0);
        dest += 2;
        src1 += 2;
        src2 += 2;
        n -= 2;
    }
    while (0 < n) {
        *dest++ = *src1++ + *src2++;
        n--;
    }
}
Example #26
static inline void
inner_product_gdouble_cubic_1_sse2 (gdouble * o, const gdouble * a,
    const gdouble * b, gint len, const gdouble * icoeff, gint bstride)
{
  gint i;
  __m128d f[2], sum[4], t;
  const gdouble *c[4] = { (gdouble *) ((gint8 *) b + 0 * bstride),
    (gdouble *) ((gint8 *) b + 1 * bstride),
    (gdouble *) ((gint8 *) b + 2 * bstride),
    (gdouble *) ((gint8 *) b + 3 * bstride)
  };

  f[0] = _mm_loadu_pd (icoeff + 0);
  f[1] = _mm_loadu_pd (icoeff + 2);
  sum[0] = sum[1] = sum[2] = sum[3] = _mm_setzero_pd ();

  for (i = 0; i < len; i += 2) {
    t = _mm_loadu_pd (a + i + 0);
    sum[0] = _mm_add_pd (sum[0], _mm_mul_pd (t, _mm_load_pd (c[0] + i)));
    sum[1] = _mm_add_pd (sum[1], _mm_mul_pd (t, _mm_load_pd (c[1] + i)));
    sum[2] = _mm_add_pd (sum[2], _mm_mul_pd (t, _mm_load_pd (c[2] + i)));
    sum[3] = _mm_add_pd (sum[3], _mm_mul_pd (t, _mm_load_pd (c[3] + i)));
  }
  sum[0] =
      _mm_mul_pd (sum[0], _mm_shuffle_pd (f[0], f[0], _MM_SHUFFLE2 (0, 0)));
  sum[1] =
      _mm_mul_pd (sum[1], _mm_shuffle_pd (f[0], f[0], _MM_SHUFFLE2 (1, 1)));
  sum[2] =
      _mm_mul_pd (sum[2], _mm_shuffle_pd (f[1], f[1], _MM_SHUFFLE2 (0, 0)));
  sum[3] =
      _mm_mul_pd (sum[3], _mm_shuffle_pd (f[1], f[1], _MM_SHUFFLE2 (1, 1)));
  sum[0] = _mm_add_pd (sum[0], sum[1]);
  sum[2] = _mm_add_pd (sum[2], sum[3]);
  sum[0] = _mm_add_pd (sum[0], sum[2]);
  sum[0] = _mm_add_sd (sum[0], _mm_unpackhi_pd (sum[0], sum[0]));
  _mm_store_sd (o, sum[0]);
}
Example #27
static inline void
inner_product_gdouble_full_1_sse2 (gdouble * o, const gdouble * a,
    const gdouble * b, gint len, const gdouble * icoeff, gint bstride)
{
  gint i = 0;
  __m128d sum = _mm_setzero_pd ();

  for (; i < len; i += 8) {
    sum =
        _mm_add_pd (sum, _mm_mul_pd (_mm_loadu_pd (a + i + 0),
            _mm_load_pd (b + i + 0)));
    sum =
        _mm_add_pd (sum, _mm_mul_pd (_mm_loadu_pd (a + i + 2),
            _mm_load_pd (b + i + 2)));
    sum =
        _mm_add_pd (sum, _mm_mul_pd (_mm_loadu_pd (a + i + 4),
            _mm_load_pd (b + i + 4)));
    sum =
        _mm_add_pd (sum, _mm_mul_pd (_mm_loadu_pd (a + i + 6),
            _mm_load_pd (b + i + 6)));
  }
  sum = _mm_add_sd (sum, _mm_unpackhi_pd (sum, sum));
  _mm_store_sd (o, sum);
}
Example #28
void computeDensitySSE(const double * const currentCell, double *density)
{
    __m128d vsum = _mm_set1_pd(0.0);
    int i;
    for (i = 0; i < PARAMQ - 1; i += 2)
    {
        __m128d v = _mm_loadu_pd(&currentCell[i]);
        vsum = _mm_add_pd(vsum, v);
    }
    vsum = _mm_hadd_pd(vsum, vsum);
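    /* after the horizontal add both lanes hold the full sum, so storing the high lane is fine */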
    _mm_storeh_pd(density, vsum);
    if (i < PARAMQ)
    {
        *density += currentCell[i];
    }
}
Example #29
static void
TEST (void)
{
  union128d s1;
  union128 u, s2;
  double source1[2] = {123.345, 67.3321};
  float  e[4] = {5633.098, 93.21, 3.34, 4555.2};

  s1.x = _mm_loadu_pd (source1);
  s2.x = _mm_loadu_ps (e);

  u.x = test(s2.x, s1.x);

  e[0] = (float)source1[0];

  if (check_union128(u, e))
    abort ();
}
Example #30
void computeVelocitySSE(const double * const currentCell, const double * const density, double *velocity)
{
    __m128d v0, v1, v2;
    int i;
    v0 = v1 = v2 = _mm_setzero_pd();
    for (i = 0; i < PARAMQ - 1; i += 2)
    {
        __m128d vc, vl0, vl1, vl2;
        __m128i vtemp;

        vc = _mm_loadu_pd(&currentCell[i]);
        vtemp = _mm_loadu_si128((__m128i *)&LATTICEVELOCITIES2[0][i]);
        vl0 = _mm_cvtepi32_pd(vtemp);
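        /* _mm_cvtepi32_pd converts only the two low 32-bit lanes of vtemp to doubles */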
        vtemp = _mm_loadu_si128((__m128i *)&LATTICEVELOCITIES2[1][i]);
        vl1 = _mm_cvtepi32_pd(vtemp);
        vtemp = _mm_loadu_si128((__m128i *)&LATTICEVELOCITIES2[2][i]);
        vl2 = _mm_cvtepi32_pd(vtemp);
        v0 = _mm_add_pd(v0, _mm_mul_pd(vc, vl0));
        v1 = _mm_add_pd(v1, _mm_mul_pd(vc, vl1));
        v2 = _mm_add_pd(v2, _mm_mul_pd(vc, vl2));
    }
    v0 = _mm_hadd_pd(v0, v0);
    v1 = _mm_hadd_pd(v1, v1);
    v2 = _mm_hadd_pd(v2, v2);
    _mm_store_sd (&velocity[0], v0);
    _mm_store_sd (&velocity[1], v1);
    _mm_store_sd (&velocity[2], v2);
    if (i < PARAMQ)
    {
        velocity[0] += currentCell[i] * LATTICEVELOCITIES2[0][i];
        velocity[1] += currentCell[i] * LATTICEVELOCITIES2[1][i];
        velocity[2] += currentCell[i] * LATTICEVELOCITIES2[2][i];
    }
    velocity[0] = velocity[0] / (*density);
    velocity[1] = velocity[1] / (*density);
    velocity[2] = velocity[2] / (*density);
}