Example #1
0
    static std::complex<double> ZDot(int n, const double* A, const std::complex<double>* B)
    {
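        // Assumes the surrounding template helpers from the original source:
        // IsAligned() tests 16-byte alignment, and Maybe<bool>::increment /
        // ::plus / ::conj select real-vs-complex stepping and optional
        // conjugation at compile time via the c2 flag.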
        if (n) {
#ifdef __SSE2__
            std::complex<double> sum(0);
            while (n && !IsAligned(A) ) {
                sum += *A * *B;
                ++A;
                Maybe<!c2>::increment(B);
                --n;
            }

            int n_2 = (n>>1);
            int nb = n-(n_2<<1);

            if (n_2) {
                union { __m128d xm; double xd[2]; } xsum;
                xsum.xm = _mm_set1_pd(0.);
                __m128d xsum2 = _mm_set1_pd(0.);
                const std::complex<double>* B1 = Maybe<!c2>::plus(B,1);
                assert(IsAligned(A));
                assert(IsAligned(B));
                do {
                    const __m128d& xA = *(const __m128d*)(A);
                    const __m128d& xB1 = *(const __m128d*)(B);
                    const __m128d& xB2 = *(const __m128d*)(B1);
                    A += 2;
                    Maybe<!c2>::increment(B,2);
                    Maybe<!c2>::increment(B1,2);
                    __m128d xA1 = _mm_shuffle_pd(xA,xA,_MM_SHUFFLE2(0,0));
                    __m128d xA2 = _mm_shuffle_pd(xA,xA,_MM_SHUFFLE2(1,1));
                    __m128d x1 = _mm_mul_pd(xA1,xB1);
                    __m128d x2 = _mm_mul_pd(xA2,xB2);
                    xsum.xm = _mm_add_pd(xsum.xm,x1);
                    xsum2 = _mm_add_pd(xsum2,x2);
                } while (--n_2);
                xsum.xm = _mm_add_pd(xsum.xm,xsum2);
                sum += std::complex<double>(xsum.xd[0],xsum.xd[1]);
            }
            if (nb) {
                sum += *A * *B;
                ++A;
                Maybe<!c2>::increment(B);
            }
            return Maybe<c2>::conj(sum);
#else
            std::complex<double> sum = 0.;
            do {
                sum += *A * *B;
                ++A;
                Maybe<!c2>::increment(B);
            } while (--n);
            return Maybe<c2>::conj(sum);
#endif
        } else {
            return 0.;
        }
    }
Example #2
0
void transpose_misaligned(double *a, double *b, int N1, int N2, double factor) {

    int i,j,k,k1,it,jt,itt,jtt,it_bound,jt_bound,itt_bound,jtt_bound;
    int conflict,tmp,tmpN,offset,line_offset,setnum,set[8192/(4*sizeof(double))];
    double *pA, *pB;


    register __m128d x, y, z, w, t, t1,fac_vector;

    fac_vector = _mm_load_sd(&factor);
    fac_vector = _mm_unpacklo_pd(fac_vector,fac_vector);
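    /* tilesize is assumed to be an even, file-scope blocking constant
       (its definition is not shown in this snippet). */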

    itt_bound = (N1/tilesize)*tilesize;
    for (itt = 0; itt < itt_bound; itt=itt+5*tilesize) {
        jtt_bound =(N2/tilesize)*tilesize;
        for (jtt = 0; jtt < jtt_bound; jtt=jtt+5*tilesize) {
            it_bound = (itt+5*tilesize > itt_bound)?itt_bound:itt+5*tilesize;
            for (it = itt; it < it_bound; it = it+tilesize) {
                jt_bound = (jtt+5*tilesize > jtt_bound) ? jtt_bound : jtt+5*tilesize;
                for (jt = jtt; jt < jt_bound; jt = jt+tilesize) {
                    k = 0;
                    for (j = jt; j < jt+tilesize; j=j+2) {
                        for (i = it; i < it+tilesize; i=i+2) {
                            pA = a+i*N2+j;
                            pB = b+j*N1+i;
                            x = _mm_loadu_pd(pA);
                            x = _mm_mul_pd(x,fac_vector);
                            y = _mm_loadu_pd(pA + N2);
                            y = _mm_mul_pd(y,fac_vector);
                            z = _mm_shuffle_pd( x, y, 0);
                            w = _mm_shuffle_pd( x, y, 3);
                            _mm_storeu_pd(pB,z);
                            _mm_storeu_pd(pB + N1,w);
                        }
                    }
                }
            }
        }
        for (i = itt; i < itt+5*tilesize && i < itt_bound; i++) {
            for (j = jtt_bound; j < N2; j++) {
                b[j*N1+i] = factor * a[i*N2+j];
            }
        }
    }
    for (i = itt_bound; i < N1; i++) {
        for (j = 0; j < N2; j++) {
            b[j*N1+i] = factor * a[i*N2+j];
        }
    }
}
Example #3
0
dcomplex zdotc_( int*      n,
                 dcomplex* x, int* inc_x,
                 dcomplex* z, int* inc_z )
{
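	/* Assumed surrounding definitions (not shown in this snippet):
	   dcomplex is a struct with double members real and imag, and
	   v2df_t is a union of __m128d v and double d[2]. */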
	dcomplex* restrict x1;
	dcomplex* restrict z1;
	int                i;
	v2df_t rho1v;
	v2df_t z11v, z12v;
	v2df_t x1v, x1rv;
	dcomplex rho;
	int    n1 = *n;
	int    incx = *inc_x;
	int    incz = *inc_z;

	x1 = x;
	z1 = z;

	rho1v.v = _mm_setzero_pd();

	{
		v2df_t bcac, adbd;

		for ( i = 0; i < n1; ++i )
		{
			z11v.v = _mm_loaddup_pd( ( double* )&(z1->real) );
			z12v.v = _mm_loaddup_pd( ( double* )&(z1->imag) );

			x1v.v  = _mm_load_pd( ( double* )x1 );
			x1rv.v = _mm_shuffle_pd( x1v.v, x1v.v, _MM_SHUFFLE2 (0,1) );
			bcac.v = x1rv.v * z11v.v;
			adbd.v = x1v.v  * z12v.v;
			rho1v.v = rho1v.v + _mm_addsub_pd( bcac.v, adbd.v );

			x1 += incx;
			z1 += incz;
		}

		rho1v.v = _mm_shuffle_pd( rho1v.v, rho1v.v, _MM_SHUFFLE2 (0,1) );

		rho1v.d[1] = -rho1v.d[1];
	}

	rho.real = rho1v.d[0];
	rho.imag = rho1v.d[1];

	return rho;
}
Example #4
0
void transpose_aligned(double *a, double *b, int N1, int N2, double factor) {

    int i,j,k,k1,it,jt,itt,jtt,conflict,tmp,tmpN;
    double *pA, *pB;


    register __m128d x, y, z, w,fac_vector;

    fac_vector = _mm_load_sd(&factor);
    fac_vector = _mm_unpacklo_pd(fac_vector,fac_vector);
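    /* Assumptions not visible in this snippet: buf is a file-scope,
       16-byte-aligned scratch array of at least tilesize*tilesize doubles,
       and tilesize == 8 (the write-back loop below hard-codes four
       2-element loads/stores per row). */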

    for (it = 0; it < N1; it=it+tilesize) {
        for (jt = 0; jt < N2; jt=jt+tilesize) {

            k = 0;
            for (j = jt; j < jt+tilesize; j=j+2) {
                for (i = it; i < it+tilesize; i=i+2) {
                    pA = a+i*N2+j;
                    x = _mm_load_pd(pA);
                    y = _mm_load_pd(pA + N2);
                    x = _mm_mul_pd(x,fac_vector);
                    y = _mm_mul_pd(y,fac_vector);
                    z = _mm_shuffle_pd( x, y, 0);
                    w = _mm_shuffle_pd( x, y, 3);
                    k = (j-jt)*tilesize + (i-it);
                    _mm_store_pd(buf + k,z);
                    _mm_store_pd(buf + k + tilesize,w);
                }
            }

            k = 0;
            k1 = 0;
            for (j = jt; j < jt+tilesize; j++) {
                pB = b+j*N1+it;
                k = (j-jt)*tilesize;
                x = _mm_load_pd(&buf[k]);
                y = _mm_load_pd(&buf[k]+2);
                z = _mm_load_pd(&buf[k]+2*2);
                w = _mm_load_pd(&buf[k]+3*2);
                _mm_stream_pd(pB,x);
                _mm_stream_pd(pB+2,y);
                _mm_stream_pd(pB+2*2,z);
                _mm_stream_pd(pB+3*2,w);

            }
        }
    }
}
Example #5
0
/* use compiler intrinsics for 2x parallel processing */
static inline double chi2_intrinsic_double(int n, const double* x, const double* y) {
    double result=0;
    const __m128d eps = _mm_set1_pd(DBL_MIN);
    const __m128d zero = _mm_setzero_pd();
    __m128d chi2 = _mm_setzero_pd();    
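    /* chi2_baseline_double() is assumed to be a plain scalar fallback,
       defined elsewhere, used below for the odd trailing element. */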

    for ( ; n>1; n-=2) {
        const __m128d a = _mm_loadu_pd(x);
        const __m128d b = _mm_loadu_pd(y);
        x+=2;
        y+=2;
        const __m128d a_plus_b = _mm_add_pd(a,b);
        const __m128d a_plus_b_plus_eps = _mm_add_pd(a_plus_b,eps);
        const __m128d a_minus_b = _mm_sub_pd(a,b);
        const __m128d a_minus_b_sq = _mm_mul_pd(a_minus_b, a_minus_b);
        const __m128d quotient = _mm_div_pd(a_minus_b_sq, a_plus_b_plus_eps);
        chi2 = _mm_add_pd(chi2, quotient);
    }
    const __m128d shuffle = _mm_shuffle_pd(chi2, chi2, _MM_SHUFFLE2(0,1));
    const __m128d sum = _mm_add_pd(chi2, shuffle);
// with SSE3, we could use hadd_pd, but the difference is negligible 

    _mm_store_sd(&result,sum);
    _mm_empty();
    if (n)
        result += chi2_baseline_double(n, x, y); // remaining entries
    return result;
}
Example #6
0
/* xvm_dot:
 *   Return the dot product of the two given vectors.
 */
double xvm_dot(const double x[], const double y[], uint64_t N) {
	double r = 0.0;
#if defined(__SSE2__) && !defined(XVM_ANSI)
	assert(x != NULL && ((uintptr_t)x % 16) == 0);
	assert(y != NULL && ((uintptr_t)y % 16) == 0);
	uint64_t n, d = N % 4;
	__m128d s0 = _mm_setzero_pd();
	__m128d s1 = _mm_setzero_pd();
	for (n = 0; n < N - d; n += 4) {
		const __m128d x0 = _mm_load_pd(x + n    );
		const __m128d x1 = _mm_load_pd(x + n + 2);
		const __m128d y0 = _mm_load_pd(y + n    );
		const __m128d y1 = _mm_load_pd(y + n + 2);
		const __m128d r0 = _mm_mul_pd(x0, y0);
		const __m128d r1 = _mm_mul_pd(x1, y1);
		s0 = _mm_add_pd(s0, r0);
		s1 = _mm_add_pd(s1, r1);
	}
	s0 = _mm_add_pd(s0, s1);
	s1 = _mm_shuffle_pd(s0, s0, _MM_SHUFFLE2(1, 1));
	s0 = _mm_add_pd(s0, s1);
	_mm_store_sd(&r, s0);
	for ( ; n < N; n++)
		r += x[n] * y[n];
#else
	for (uint64_t n = 0; n < N; n++)
		r += x[n] * y[n];
#endif
	return r;
}
Example #7
0
    inline
    double operator[](int i) const
    {
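        // Assumes the enclosing class wraps an __m512d member named val
        // (eight doubles); _mm512_extractf64x2_pd requires AVX-512DQ.
        // Returns element i for i in 0..7.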
        __m128d buf0;
        if (i < 4) {
            if (i < 2) {
                buf0 = _mm512_extractf64x2_pd(val, 0);
            } else {
                buf0 = _mm512_extractf64x2_pd(val, 1);
            }
        } else {
            if (i < 6) {
                buf0 = _mm512_extractf64x2_pd(val, 2);
            } else {
                buf0 = _mm512_extractf64x2_pd(val, 3);
            }
        }

        i &= 1;

        if (i == 0) {
            return _mm_cvtsd_f64(buf0);
        }

        buf0 = _mm_shuffle_pd(buf0, buf0, 1);
        return _mm_cvtsd_f64(buf0);
    }
Example #8
0
__m128d test_mm_shuffle_pd(__m128d A, __m128d B) {
  // DAG-LABEL: test_mm_shuffle_pd
  // DAG: shufflevector <2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x i32> <i32 1, i32 2>
  //
  // ASM-LABEL: test_mm_shuffle_pd
  // ASM: shufpd $1,
  return _mm_shuffle_pd(A, B, 1);
}
Example #9
0
int fft4a_(double *a, double *b, double *w, int *l)
{
    int j, j0, j1, j2, j3, j4, j5, j6, j7;
    /* double x0, y0, x1, y1, x2, y2, x3, y3, wi1, wi2, wi3, wr1, wr2, wr3; */
    __m128d t0, t1, t2, t3, t4, w1, w2, w3;

    for (j = 0; j < *l; j++) {
	j0 = j << 1;
	j1 = j0 + (*l << 1);
	j2 = j1 + (*l << 1);
	j3 = j2 + (*l << 1);
	j4 = j << 3;
	j5 = j4 + 2;
	j6 = j5 + 2;
	j7 = j6 + 2;
	/* wr1 = w[j0];
	wi1 = w[j0 + 1];
	wr2 = wr1 * wr1 - wi1 * wi1;
	wi2 = wr1 * wi1 + wr1 * wi1;
	wr3 = wr1 * wr2 - wi1 * wi2;
	wi3 = wr1 * wi2 + wi1 * wr2; */
	w1 = _mm_load_pd(&w[j0]);
	w2 = ZMUL(w1, w1);
	w3 = ZMUL(w1, w2);
	/* x0 = a[j0] + a[j2];
	y0 = a[j0 + 1] + a[j2 + 1];
	x1 = a[j0] - a[j2];
	y1 = a[j0 + 1] - a[j2 + 1];
	x2 = a[j1] + a[j3];
	y2 = a[j1 + 1] + a[j3 + 1];
	x3 = a[j1 + 1] - a[j3 + 1];
	y3 = a[j3] - a[j1]; */
	t0 = _mm_load_pd(&a[j0]);
	t2 = _mm_load_pd(&a[j2]);
	t1 = _mm_sub_pd(t0, t2);
	t0 = _mm_add_pd(t0, t2);
	t3 = _mm_load_pd(&a[j1]);
	t4 = _mm_load_pd(&a[j3]);
	t2 = _mm_add_pd(t3, t4);
	t3 = _mm_xor_pd(_mm_sub_pd(t3, t4), _mm_set_sd(-0.0));
	t3 = _mm_shuffle_pd(t3, t3, 1);
	/* b[j4] = x0 + x2;
	b[j4 + 1] = y0 + y2;
	b[j6] = wr2 * (x0 - x2) - wi2 * (y0 - y2);
	b[j6 + 1] = wr2 * (y0 - y2) + wi2 * (x0 - x2);
	b[j5] = wr1 * (x1 + x3) - wi1 * (y1 + y3);
	b[j5 + 1] = wr1 * (y1 + y3) + wi1 * (x1 + x3);
	b[j7] = wr3 * (x1 - x3) - wi3 * (y1 - y3);
	b[j7 + 1] = wr3 * (y1 - y3) + wi3 * (x1 - x3); */
	_mm_store_pd(&b[j4], _mm_add_pd(t0, t2));
	_mm_store_pd(&b[j6], ZMUL(w2, _mm_sub_pd(t0, t2)));
	_mm_store_pd(&b[j5], ZMUL(w1, _mm_add_pd(t1, t3)));
	_mm_store_pd(&b[j7], ZMUL(w3, _mm_sub_pd(t1, t3)));
    }
    return 0;
}
Example #10
0
__inline __m128d Length(__m128d vec1,__m128d vec2)
{
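	/* Apparent intent: with vec1 = [x, y] and vec2 = [z, *], the low lane
	   of the result is sqrt(x*x + y*y + z*z) (a 3-component length); the
	   high lane simply carries vec1's upper element through _mm_sqrt_sd. */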
	__m128d result1 = _mm_mul_pd(vec1, vec1);
	__m128d result2 = _mm_mul_sd(vec2, vec2);
	__m128d result3 = _mm_shuffle_pd(result1, result1, 1);
	__m128d result4 = _mm_add_sd(result1, result2);
	__m128d result5 = _mm_add_sd(result4, result3);
	__m128d result6 = _mm_sqrt_sd(vec1, result5);
	return result6;
}
Example #11
0
double Point::Cross(const Point &point) const
{
#ifdef __SSE3__
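	// Note: `b *= v` relies on the GCC/Clang vector-operator extension, and
	// the reinterpret_cast returns the low lane of the hsub result, i.e.
	// x * point.y - y * point.x (equivalently _mm_cvtsd_f64(b)).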
	__m128d b = _mm_shuffle_pd(point.v, point.v, 0x01);
	b *= v;
	b = _mm_hsub_pd(b, b);
	return reinterpret_cast<double &>(b);
#else
	return x * point.y - y * point.x;
#endif
}
Example #12
0
static __inline __m128d ZMUL(__m128d a, __m128d b)
{
    __m128d ar, ai;

    ar = _mm_movedup_pd(a);       /* ar = [a.r a.r] */
    ar = _mm_mul_pd(ar, b);       /* ar = [a.r*b.r a.r*b.i] */
    ai = _mm_unpackhi_pd(a, a);   /* ai = [a.i a.i] */
    b = _mm_shuffle_pd(b, b, 1);  /* b = [b.i b.r] */
    ai = _mm_mul_pd(ai, b);       /* ai = [a.i*b.i a.i*b.r] */

    return _mm_addsub_pd(ar, ai); /* [a.r*b.r-a.i*b.i a.r*b.i+a.i*b.r] */
}
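A minimal usage sketch for ZMUL (not from any of the collected sources): it multiplies one complex pair with ZMUL and with plain scalar arithmetic so the two results can be compared by eye. It assumes the [real, imag] packed layout described in the comments above and an SSE3-capable compiler; ZMUL as defined above is assumed to be visible in the same translation unit.

#include <stdio.h>
#include <pmmintrin.h>  /* SSE3: _mm_movedup_pd, _mm_addsub_pd used by ZMUL */

int main(void)
{
    const double a[2] = { 1.5, -2.0 };  /* a.r, a.i */
    const double b[2] = { 0.5,  3.0 };  /* b.r, b.i */
    double c[2];

    /* packed complex multiply via ZMUL defined above */
    _mm_storeu_pd(c, ZMUL(_mm_loadu_pd(a), _mm_loadu_pd(b)));

    printf("simd:   %g + %gi\n", c[0], c[1]);
    printf("scalar: %g + %gi\n",
           a[0]*b[0] - a[1]*b[1],   /* a.r*b.r - a.i*b.i */
           a[0]*b[1] + a[1]*b[0]);  /* a.r*b.i + a.i*b.r */
    return 0;
}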
Example #13
0
int fft3a_(double *a, double *b, double *w, int *l)
{
    /* static double c31 = .86602540378443865;
    static double c32 = .5; */
    static __m128d c31, c32;

    int j, j0, j1, j2, j3, j4, j5;
    /* double x0, y0, x1, y1, x2, y2, wi1, wi2, wr1, wr2; */
    __m128d t0, t1, t2, t3, w1, w2;

    c31 = _mm_set1_pd(0.86602540378443865);
    c32 = _mm_set1_pd(0.5);

    for (j = 0; j < *l; j++) {
        j0 = j << 1;
	j1 = j0 + (*l << 1);
	j2 = j1 + (*l << 1);
	j3 = j * 6;
	j4 = j3 + 2;
	j5 = j4 + 2;
	/* wr1 = w[j0];
	wi1 = w[j0 + 1];
	wr2 = wr1 * wr1 - wi1 * wi1;
	wi2 = wr1 * wi1 + wr1 * wi1; */
	w1 = _mm_load_pd(&w[j0]);
	w2 = ZMUL(w1, w1);
	/* x0 = a[j1] + a[j2];
	y0 = a[j1 + 1] + a[j2 + 1];
	x1 = a[j0] - c32 * x0;
	y1 = a[j0 + 1] - c32 * y0;
	x2 = c31 * (a[j1 + 1] - a[j2 + 1]);
	y2 = c31 * (a[j2] - a[j1]); */
	t1 = _mm_load_pd(&a[j1]);
	t2 = _mm_load_pd(&a[j2]);
	t0 = _mm_add_pd(t1, t2);
	t2 = _mm_xor_pd(_mm_sub_pd(t1, t2), _mm_set_sd(-0.0));
	t2 = _mm_mul_pd(c31, _mm_shuffle_pd(t2, t2, 1));
	t3 = _mm_load_pd(&a[j0]);
	t1 = _mm_sub_pd(t3, _mm_mul_pd(c32, t0));
	/* b[j3] = a[j0] + x0;
	b[j3 + 1] = a[j0 + 1] + y0;
	b[j4] = wr1 * (x1 + x2) - wi1 * (y1 + y2);
	b[j4 + 1] = wr1 * (y1 + y2) + wi1 * (x1 + x2);
	b[j5] = wr2 * (x1 - x2) - wi2 * (y1 - y2);
	b[j5 + 1] = wr2 * (y1 - y2) + wi2 * (x1 - x2); */
	_mm_store_pd(&b[j3], _mm_add_pd(t3, t0));
	_mm_store_pd(&b[j4], ZMUL(w1, _mm_add_pd(t1, t2)));
	_mm_store_pd(&b[j5], ZMUL(w2, _mm_sub_pd(t1, t2)));
    }
    return 0;
}
Example #14
0
value complex_mul(value vab, value vcd)
{
    CAMLparam2(vab, vcd);
    CAMLlocal1(vz);
    vz = caml_alloc(Double_array_tag, 2);
    __m128d ab, cd, ac_bd, ba, bc_ad;
    ab = _mm_loadu_pd((double const*) vab);
    cd = _mm_loadu_pd((double const*) vcd);
    ac_bd = _mm_mul_pd(ab, cd);
    ba    = _mm_shuffle_pd(ab, ab, 1);
    bc_ad = _mm_mul_pd(ba, cd);
    _mm_storeu_pd((double*) vz, _mm_addsub_pd(ac_bd, bc_ad));
    CAMLreturn(vz);
}
Example #15
0
void AES_192_Key_Expansion (const unsigned char *userkey, 
                                    unsigned char *key) 
        { 
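            /* KEY_192_ASSIST() is assumed to be defined as in the original
               AES-NI sample this comes from; _mm_shuffle_pd is used below to
               splice 64-bit halves of the 192-bit round keys into 128-bit
               key-schedule slots. */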
            __m128i temp1, temp2, temp3, temp4; 
            __m128i *Key_Schedule = (__m128i*)key; 
         
            temp1 = _mm_loadu_si128((__m128i*)userkey); 
            temp3 = _mm_loadu_si128((__m128i*)(userkey+16)); 
         
            Key_Schedule[0]=temp1; 
            Key_Schedule[1]=temp3; 
            temp2=_mm_aeskeygenassist_si128 (temp3,0x1); 
            KEY_192_ASSIST(&temp1, &temp2, &temp3); 
            Key_Schedule[1] = (__m128i)_mm_shuffle_pd((__m128d)Key_Schedule[1], 
                                                                  (__m128d)temp1,0); 
            Key_Schedule[2] = (__m128i)_mm_shuffle_pd((__m128d)temp1,(__m128d)temp3,1); 
            temp2=_mm_aeskeygenassist_si128 (temp3,0x2);
            KEY_192_ASSIST(&temp1, &temp2, &temp3); 
            Key_Schedule[3]=temp1; 
            Key_Schedule[4]=temp3; 
            temp2=_mm_aeskeygenassist_si128 (temp3,0x4); 
            KEY_192_ASSIST(&temp1, &temp2, &temp3); 
            Key_Schedule[4] = (__m128i)_mm_shuffle_pd((__m128d)Key_Schedule[4], 
            (__m128d)temp1,0); 
            Key_Schedule[5] = (__m128i)_mm_shuffle_pd((__m128d)temp1,(__m128d)temp3,1); 
            temp2=_mm_aeskeygenassist_si128 (temp3,0x8); 
            KEY_192_ASSIST(&temp1, &temp2, &temp3); 
            Key_Schedule[6]=temp1; 
            Key_Schedule[7]=temp3; 
            temp2=_mm_aeskeygenassist_si128 (temp3,0x10); 
            KEY_192_ASSIST(&temp1, &temp2, &temp3); 
            Key_Schedule[7] = (__m128i)_mm_shuffle_pd((__m128d)Key_Schedule[7], 
            (__m128d)temp1,0); 
            Key_Schedule[8] = (__m128i)_mm_shuffle_pd((__m128d)temp1,(__m128d)temp3,1); 
            temp2=_mm_aeskeygenassist_si128 (temp3,0x20); 
            KEY_192_ASSIST(&temp1, &temp2, &temp3); 
            Key_Schedule[9]=temp1; 
            Key_Schedule[10]=temp3; 
            temp2=_mm_aeskeygenassist_si128 (temp3,0x40); 
            KEY_192_ASSIST(&temp1, &temp2, &temp3); 
            Key_Schedule[10] = (__m128i)_mm_shuffle_pd((__m128d)Key_Schedule[10], 
            (__m128d)temp1,0); 
            Key_Schedule[11] = (__m128i)_mm_shuffle_pd((__m128d)temp1,(__m128d)temp3,1); 
            temp2=_mm_aeskeygenassist_si128 (temp3,0x80); 
            KEY_192_ASSIST(&temp1, &temp2, &temp3); 
            Key_Schedule[12]=temp1;    
        } 
Example #16
0
static inline void
inner_product_gdouble_cubic_1_sse2 (gdouble * o, const gdouble * a,
    const gdouble * b, gint len, const gdouble * icoeff, gint bstride)
{
  gint i;
  __m128d f[2], sum[4], t;
  const gdouble *c[4] = { (gdouble *) ((gint8 *) b + 0 * bstride),
    (gdouble *) ((gint8 *) b + 1 * bstride),
    (gdouble *) ((gint8 *) b + 2 * bstride),
    (gdouble *) ((gint8 *) b + 3 * bstride)
  };

  f[0] = _mm_loadu_pd (icoeff + 0);
  f[1] = _mm_loadu_pd (icoeff + 2);
  sum[0] = sum[1] = sum[2] = sum[3] = _mm_setzero_pd ();

  for (i = 0; i < len; i += 2) {
    t = _mm_loadu_pd (a + i + 0);
    sum[0] = _mm_add_pd (sum[0], _mm_mul_pd (t, _mm_load_pd (c[0] + i)));
    sum[1] = _mm_add_pd (sum[1], _mm_mul_pd (t, _mm_load_pd (c[1] + i)));
    sum[2] = _mm_add_pd (sum[2], _mm_mul_pd (t, _mm_load_pd (c[2] + i)));
    sum[3] = _mm_add_pd (sum[3], _mm_mul_pd (t, _mm_load_pd (c[3] + i)));
  }
  sum[0] =
      _mm_mul_pd (sum[0], _mm_shuffle_pd (f[0], f[0], _MM_SHUFFLE2 (0, 0)));
  sum[1] =
      _mm_mul_pd (sum[1], _mm_shuffle_pd (f[0], f[0], _MM_SHUFFLE2 (1, 1)));
  sum[2] =
      _mm_mul_pd (sum[2], _mm_shuffle_pd (f[1], f[1], _MM_SHUFFLE2 (0, 0)));
  sum[3] =
      _mm_mul_pd (sum[3], _mm_shuffle_pd (f[1], f[1], _MM_SHUFFLE2 (1, 1)));
  sum[0] = _mm_add_pd (sum[0], sum[1]);
  sum[2] = _mm_add_pd (sum[2], sum[3]);
  sum[0] = _mm_add_pd (sum[0], sum[2]);
  sum[0] = _mm_add_sd (sum[0], _mm_unpackhi_pd (sum[0], sum[0]));
  _mm_store_sd (o, sum[0]);
}
Example #17
0
// from Intel's sample intrin_double_sample.c
void multiply_SSE3(double xr, double xi, double yr, double yi,
    complex_num *z)
{
    __m128d num1, num2, num3;

    // Duplicates lower vector element into upper vector element.
    //   num1: [x.real, x.real]

    num1 = _mm_loaddup_pd(&xr);

    // Move y elements into a vector
    //   num2: [y.img, y.real]

    num2 = _mm_set_pd(yi, yr);

    // Multiplies vector elements
    //   num3: [(x.real*y.img), (x.real*y.real)]

    num3 = _mm_mul_pd(num2, num1);

    //   num1: [x.img, x.img]

    num1 = _mm_loaddup_pd(&xi);

    // Swaps the vector elements
    //   num2: [y.real, y.img]

    num2 = _mm_shuffle_pd(num2, num2, 1);

    //   num2: [(x.img*y.real), (x.img*y.img)]

    num2 = _mm_mul_pd(num2, num1);

    // Adds upper vector element while subtracting lower vector element
    //   num3: [((x.real *y.img)+(x.img*y.real)),
    //          ((x.real*y.real)-(x.img*y.img))]

    num3 = _mm_addsub_pd(num3, num2);

    // Stores the elements of num3 into z

    _mm_storeu_pd((double *)z, num3);

}
Example #18
0
#include <stdio.h>
#include <emmintrin.h>

int main(){

  __m128d a,b,c;

  double res[2] __attribute__((aligned(16)));

  a = _mm_set_pd(1,2);  /* a = [2, 1] (low, high) */
  b = _mm_set_pd(3,4);  /* b = [4, 3] */

  c = _mm_shuffle_pd(a,b, _MM_SHUFFLE2(0,1));  /* low = a[1], high = b[0] */

  _mm_store_pd(res, c);

  /* prints: 1.000000 4.000000 */
  printf("%f %f\n", res[0], res[1]);

  return 0;
}
Example #19
0
// only compute the necessary indices of su2_i = subgroup( U*staple^\dagger )
void
only_subgroup( GLU_complex *s0 ,
	       GLU_complex *s1 ,
	       double *scale ,
	       const GLU_complex U[ NCNC ] ,
	       const GLU_complex staple[ NCNC ] ,
	       const size_t su2_index )
{
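  // Assumed from the surrounding library: SSE2_MUL_CONJ and SSE2_MULCONJ are
  // complex multiplies with one operand conjugated, and SSE2_FMA(a,b,c)
  // computes a*b + c; each __m128d holds one complex double.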
  const __m128d *u = (const __m128d*)U ;
  const __m128d *s = (const __m128d*)staple ;

  register __m128d sm0 ; 
  register __m128d sm1 ;
#if NC == 3
  switch( su2_index%3 ) { // I don't like this
    // rotation 1
    //  |  s0   s1  0 |
    //  | -s1*  s0* 0 |
    //  |  0     0  1 |
  case 0 :
    sm0 = _mm_add_pd(
		     // temp0
		     _mm_add_pd( SSE2_MUL_CONJ( *( u + 0 ) , *( s + 0 ) ) ,
				 _mm_add_pd( SSE2_MUL_CONJ( *( u + 1 ) , *( s + 1 ) ) ,
					     SSE2_MUL_CONJ( *( u + 2 ) , *( s + 2 ) ) ) ) ,
		      // temp3^*
		     _mm_add_pd( SSE2_MULCONJ( *( u + 3 ) , *( s + 3 ) ) ,
				 _mm_add_pd( SSE2_MULCONJ( *( u + 4 ) , *( s + 4 ) ) ,
					     SSE2_MULCONJ( *( u + 5 ) , *( s + 5 ) ) ) ) ) ;
    sm1 = _mm_sub_pd( 
		     // temp1
		     _mm_add_pd( SSE2_MUL_CONJ( *( u + 0 ) , *( s + 3 ) ) ,
				 _mm_add_pd( SSE2_MUL_CONJ( *( u + 1 ) , *( s + 4 ) ) ,
					     SSE2_MUL_CONJ( *( u + 2 ) , *( s + 5 ) ) ) ) ,
		     // temp2^*
		     _mm_add_pd( SSE2_MULCONJ( *( u + 3 ) , *( s + 0 ) ) ,
				 _mm_add_pd( SSE2_MULCONJ( *( u + 4 ) , *( s + 1 ) ) ,
					     SSE2_MULCONJ( *( u + 5 ) , *( s + 2 ) ) ) ) ) ;
    break ;
  case 1 :
    // rotation 2
    //  |  1    0   0  |
    //  |  0   s0  s1  |
    //  |  0  -s1* s0* |
    sm0 = _mm_add_pd( 
		     // temp0
		     _mm_add_pd( SSE2_MUL_CONJ( *( u + 3 ) , *( s + 3 ) ) ,
				 _mm_add_pd( SSE2_MUL_CONJ( *( u + 4 ) , *( s + 4 ) ) ,
					     SSE2_MUL_CONJ( *( u + 5 ) , *( s + 5 ) ) ) ) ,
		     // temp3^*
		     _mm_add_pd( SSE2_MULCONJ( *( u + 6 ) , *( s + 6 ) ) ,
				 _mm_add_pd( SSE2_MULCONJ( *( u + 7 ) , *( s + 7 ) ) ,
					     SSE2_MULCONJ( *( u + 8 ) , *( s + 8 ) ) ) ) ) ;
    sm1 = _mm_sub_pd(
		     // temp1
		     _mm_add_pd( SSE2_MUL_CONJ( *( u + 3 ) , *( s + 6 ) ) ,
				 _mm_add_pd( SSE2_MUL_CONJ( *( u + 4 ) , *( s + 7 ) ) ,
					     SSE2_MUL_CONJ( *( u + 5 ) , *( s + 8 ) ) ) ) ,
		     // temp2^*
		     _mm_add_pd( SSE2_MULCONJ( *( u + 6 ) , *( s + 3 ) ) ,
				 _mm_add_pd( SSE2_MULCONJ( *( u + 7 ) , *( s + 4 ) ) ,
					     SSE2_MULCONJ( *( u + 8 ) , *( s + 5 ) ) ) ) ) ;
    break ;
  case 2 :
    // rotation 3
    //  | s0*  0  -s1 |
    //  |  0   1   0  |
    //  | s1   0  s0  |
    sm0 = _mm_add_pd( 
		     // temp3^*
		     _mm_add_pd( SSE2_MULCONJ( *( u + 0 ) , *( s + 0 ) ) ,
				 _mm_add_pd( SSE2_MULCONJ( *( u + 1 ) , *( s + 1 ) ) ,
					     SSE2_MULCONJ( *( u + 2 ) , *( s + 2 ) ) ) ) ,
		     // temp0
		     _mm_add_pd( SSE2_MUL_CONJ( *( u + 6 ) , *( s + 6 ) ) ,
				 _mm_add_pd( SSE2_MUL_CONJ( *( u + 7 ) , *( s + 7 ) ) ,
					     SSE2_MUL_CONJ( *( u + 8 ) , *( s + 8 ) ) ) ) ) ;
    sm1 = _mm_sub_pd(
		     // temp1
		     _mm_add_pd( SSE2_MUL_CONJ( *( u + 6 ) , *( s + 0 ) ) ,
				 _mm_add_pd( SSE2_MUL_CONJ( *( u + 7 ) , *( s + 1 ) ) ,
					     SSE2_MUL_CONJ( *( u + 8 ) , *( s + 2 ) ) ) ) ,
		     // temp2^*
		     _mm_add_pd( SSE2_MULCONJ( *( u + 0 ) , *( s + 6 ) ) ,
				 _mm_add_pd( SSE2_MULCONJ( *( u + 1 ) , *( s + 7 ) ) ,
					     SSE2_MULCONJ( *( u + 2 ) , *( s + 8 ) ) ) ) ) ;
    break ;
  }
#elif NC == 2
  sm0 = _mm_add_pd( 
		   // temp0
		   _mm_add_pd( SSE2_MUL_CONJ( *( u + 0 ) , *( s + 0 ) ) , 
			       SSE2_MUL_CONJ( *( u + 1 ) , *( s + 1 ) ) ) ,
		   // temp3^*
		   _mm_add_pd( SSE2_MULCONJ( *( u + 2 ) , *( s + 2 ) ) , 
			       SSE2_MULCONJ( *( u + 3 ) , *( s + 3 ) ) ) ) ;
  
  sm1 = _mm_sub_pd( 
		   // temp1
		   _mm_add_pd( SSE2_MUL_CONJ( *( u + 0 ) , *( s + 2 ) ) , 
			       SSE2_MUL_CONJ( *( u + 1 ) , *( s + 3 ) ) ) ,
		   // temp2^*
		   _mm_add_pd( SSE2_MULCONJ( *( u + 2 ) , *( s + 0 ) ) ,
			       SSE2_MULCONJ( *( u + 3 ) , *( s + 1 ) ) ) ) ;
#else
  // su(N) version
  const size_t row_a = Latt.su2_data[ su2_index ].idx_a / NC ;
  const size_t col_b = Latt.su2_data[ su2_index ].idx_b % NC ;

  // prefetch the staple & link indices
  const __m128d *S1 = ( s + NC * row_a ) , *S2 = ( s + NC * col_b ) ; 
  const __m128d *U1 = ( u + NC * row_a ) , *U2 = ( u + NC * col_b ) ; 

  // initialise to zero & perform multiplication
  sm0 = _mm_setzero_pd() ; sm1 = _mm_setzero_pd() ;

  size_t i ;
  for( i = 0 ; i < NC ; i++ ) {
    sm0 = _mm_add_pd( sm0 ,
		      _mm_add_pd( SSE2_MUL_CONJ( *U1 , *S1 ) ,
				  SSE2_MULCONJ( *U2 , *S2 ) ) ) ;
    sm1 = _mm_add_pd( sm1 ,
		      _mm_sub_pd( SSE2_MUL_CONJ( *U1 , *S2 ) ,
				  SSE2_MULCONJ( *U2 , *S1 ) ) ) ;
    // increment our pointers
    S1++ , S2++ , U1++ , U2++ ;
  }
#endif

  // puts the norm in both parts
  register __m128d z = SSE2_FMA( sm0 , sm0 , _mm_mul_pd( sm1 , sm1 ) ) ; 
  z = _mm_add_pd( z , _mm_shuffle_pd( z , z , 1 ) ) ;
  z = _mm_sqrt_pd( z ) ;
  z = _mm_div_pd( _mm_set1_pd( 1.0 ) , z ) ;
  sm0 = _mm_mul_pd( sm0 , z ) ;
  sm1 = _mm_mul_pd( sm1 , z ) ;

  // poke back into *s0 and *s1 and *scale
  _mm_store_pd( (void*)s0 , sm0 ) ; 
  _mm_store_pd( (void*)s1 , sm1 ) ; 
  _mm_store_sd( (void*)scale , z ) ;

  return ;
}
Example #20
0
 BOOST_FORCEINLINE
 __m128d shuffle(__m128d const lower, __m128d const upper)
 {
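   // lower_i0 and upper_i0 are assumed to be integral template parameters of
   // the enclosing code, selecting one element from each operand.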
   return _mm_shuffle_pd(lower, upper, _MM_SHUFFLE2(upper_i0, lower_i0));
 }
Example #21
0
void transpose_4321_loop_3241_( double *unsorted, double *sorted,
        int *p_dim1, int *p_dim2, int *p_dim3, int *p_dim4, double *p_factor ) {

    int dim1,dim2,dim3,dim4;
    int dim1mod,dim2mod,dim3mod,dim4mod;
    unsigned int old_offset,new_offset;
    unsigned int j1,j2,j3,j4;
    double factor = *p_factor;
    double *pA, *pB;
    register __m128d x, y, z, w, t, t1,fac_vector;
    unsigned int N1,N2;
    fac_vector = _mm_load_sd(&factor);
    fac_vector = _mm_unpacklo_pd(fac_vector,fac_vector);


    dim1 = *p_dim1;
    dim2 = *p_dim2;
    dim3 = *p_dim3;
    dim4 = *p_dim4;

    N1 = dim1*dim2*dim3;   /* stride of j4 in sorted */
    N2 = dim2*dim3*dim4;   /* stride of j1 in unsorted */

    dim1mod = (int) floor( (float)dim1 / (float) 4);
    dim2mod = (int) floor( (float)dim2 / (float) 4);
    dim3mod = (int) floor( (float)dim3 / (float) 4);
    dim4mod = (int) floor( (float)dim4 / (float) 4);

    /* pluto start (dim1,dim2,dim3,dim4) */
#pragma ivdep
#pragma parallel
#pragma loop count min(10) max(80) avg(40)
#pragma unroll
    for( j3 = 0; j3<dim3; j3++) {
#pragma loop count min(10) max(80) avg(40)
#pragma unroll
        for( j2 = 0; j2<dim2; j2++) {
#pragma loop count min(10) max(80) avg(40)
#pragma unroll
#pragma vector always
            for( j4 = 0; j4<dim4; j4+=2) {
#pragma loop count min(10) max(80) avg(40)
#pragma unroll
#pragma vector always
                for( j1 = 0; j1<dim1; j1+=2) {
                    //sorted[j1+dim1*(j2+dim2*(j3+dim3*j4))] = unsorted[j4+dim4*(j3+dim3*(j2+dim2*j1))] * factor;

                    pA = unsorted + j4+dim4*(j3+dim3*(j2+dim2*j1));
                    pB = sorted   + j1+dim1*(j2+dim2*(j3+dim3*j4));
                    x = _mm_loadu_pd(pA);
                    x = _mm_mul_pd(x,fac_vector);
                    y = _mm_loadu_pd(pA + N2);
                    y = _mm_mul_pd(y,fac_vector);
                    z = _mm_shuffle_pd( x, y, 0);
                    w = _mm_shuffle_pd( x, y, 3);
                    _mm_storeu_pd(pB,z);
                    _mm_storeu_pd(pB + N1,w);

                }
            }
        }
    }
    /* pluto end */
    return;
}
Example #22
0
int fft8b_(double *a, double *b, double *w, int *m, int *l)
{
    /* static double c81 = .70710678118654752; */
    static __m128d c81;

    int i, i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, i12, i13, i14, i15, j, j0;
    /* double u0, v0, u1, x0, y0, x1, y1, x2, y2, x3, y3, v1, x4, y4, x5, y5,
             x6, y6, x7, y7, u2, v2, u3, v3, wi1, wi2, wi3, wi4, wi5, wi6,
             wi7, wr1, wr2, wr3, wr4, wr5, wr6, wr7; */
    __m128d t0, t1, t2, t3, t4, t5, t6, t7, t8, u0, u1, u2, u3, w1, w2, w3, w4, w5, w6, w7;

    c81 = _mm_set1_pd(0.70710678118654752);

    for (i = 0; i < *m; i++) {
        i0 = i << 1;
	i1 = i0 + (*m * *l << 1);
	i2 = i1 + (*m * *l << 1);
	i3 = i2 + (*m * *l << 1);
	i4 = i3 + (*m * *l << 1);
	i5 = i4 + (*m * *l << 1);
	i6 = i5 + (*m * *l << 1);
	i7 = i6 + (*m * *l << 1);
	i8 = i << 1;
	i9 = i8 + (*m << 1);
	i10 = i9 + (*m << 1);
	i11 = i10 + (*m << 1);
	i12 = i11 + (*m << 1);
	i13 = i12 + (*m << 1);
	i14 = i13 + (*m << 1);
	i15 = i14 + (*m << 1);
	/* x0 = a[i0] + a[i4];
	y0 = a[i0 + 1] + a[i4 + 1];
	x1 = a[i0] - a[i4];
	y1 = a[i0 + 1] - a[i4 + 1];
	x2 = a[i2] + a[i6];
	y2 = a[i2 + 1] + a[i6 + 1];
	x3 = a[i2 + 1] - a[i6 + 1];
	y3 = a[i6] - a[i2]; */
	t0 = _mm_load_pd(&a[i0]);
	t2 = _mm_load_pd(&a[i4]);
	t1 = _mm_sub_pd(t0, t2);
	t0 = _mm_add_pd(t0, t2);
	t3 = _mm_load_pd(&a[i2]);
	t4 = _mm_load_pd(&a[i6]);
	t2 = _mm_add_pd(t3, t4);
	t3 = _mm_xor_pd(_mm_sub_pd(t3, t4), _mm_set_sd(-0.0));
	t3 = _mm_shuffle_pd(t3, t3, 1);
	/* u0 = x0 + x2;
	v0 = y0 + y2;
	u1 = x0 - x2;
	v1 = y0 - y2; */
	u0 = _mm_add_pd(t0, t2);
	u1 = _mm_sub_pd(t0, t2);
	/* x4 = a[i1] + a[i5];
	y4 = a[i1 + 1] + a[i5 + 1];
	x5 = a[i1] - a[i5];
	y5 = a[i1 + 1] - a[i5 + 1];
	x6 = a[i3] + a[i7];
	y6 = a[i3 + 1] + a[i7 + 1];
	x7 = a[i3] - a[i7];
	y7 = a[i3 + 1] - a[i7 + 1]; */
	t4 = _mm_load_pd(&a[i1]);
	t6 = _mm_load_pd(&a[i5]);
	t5 = _mm_sub_pd(t4, t6);
	t4 = _mm_add_pd(t4, t6);
	t7 = _mm_load_pd(&a[i3]);
	t8 = _mm_load_pd(&a[i7]);
	t6 = _mm_add_pd(t7, t8);
	t7 = _mm_sub_pd(t7, t8);
	/* u2 = x4 + x6;
	v2 = y4 + y6;
	u3 = y4 - y6;
	v3 = x6 - x4; */
	u2 = _mm_add_pd(t4, t6);
	u3 = _mm_xor_pd(_mm_sub_pd(t4, t6), _mm_set_sd(-0.0));
	u3 = _mm_shuffle_pd(u3, u3, 1);
	/* b[i8] = u0 + u2;
	b[i8 + 1] = v0 + v2;
	b[i12] = u0 - u2;
	b[i12 + 1] = v0 - v2;
	b[i10] = u1 + u3;
	b[i10 + 1] = v1 + v3;
	b[i14] = u1 - u3;
	b[i14 + 1] = v1 - v3; */
	_mm_store_pd(&b[i8], _mm_add_pd(u0, u2));
	_mm_store_pd(&b[i12], _mm_sub_pd(u0, u2));
	_mm_store_pd(&b[i10], _mm_add_pd(u1, u3));
	_mm_store_pd(&b[i14], _mm_sub_pd(u1, u3));
	/* u0 = x1 + c81 * (x5 - x7);
	v0 = y1 + c81 * (y5 - y7);
	u1 = x1 - c81 * (x5 - x7);
	v1 = y1 - c81 * (y5 - y7);
	u2 = x3 + c81 * (y5 + y7);
	v2 = y3 - c81 * (x5 + x7);
	u3 = x3 - c81 * (y5 + y7);
	v3 = y3 + c81 * (x5 + x7); */
	u1 = _mm_mul_pd(c81, _mm_sub_pd(t5, t7));
	u0 = _mm_add_pd(t1, u1);
	u1 = _mm_sub_pd(t1, u1);
	u3 = _mm_xor_pd(_mm_mul_pd(c81, _mm_add_pd(t5, t7)), _mm_set_sd(-0.0));
	u3 = _mm_shuffle_pd(u3, u3, 1);
	u2 = _mm_add_pd(t3, u3);
	u3 = _mm_sub_pd(t3, u3);
	/* b[i9] = u0 + u2;
	b[i9 + 1] = v0 + v2;
	b[i13] = u1 + u3;
	b[i13 + 1] = v1 + v3;
	b[i11] = u1 - u3;
	b[i11 + 1] = v1 - v3;
	b[i15] = u0 - u2;
	b[i15 + 1] = v0 - v2; */
	_mm_store_pd(&b[i9], _mm_add_pd(u0, u2));
	_mm_store_pd(&b[i13], _mm_add_pd(u1, u3));
	_mm_store_pd(&b[i11], _mm_sub_pd(u1, u3));
	_mm_store_pd(&b[i15], _mm_sub_pd(u0, u2));
    }
    for (j = 1; j < *l; j++) {
        j0 = j << 1;
	/* wr1 = w[j0];
	wi1 = w[j0 + 1];
	wr2 = wr1 * wr1 - wi1 * wi1;
	wi2 = wr1 * wi1 + wr1 * wi1;
	wr3 = wr1 * wr2 - wi1 * wi2;
	wi3 = wr1 * wi2 + wi1 * wr2;
	wr4 = wr2 * wr2 - wi2 * wi2;
	wi4 = wr2 * wi2 + wr2 * wi2;
	wr5 = wr2 * wr3 - wi2 * wi3;
	wi5 = wr2 * wi3 + wi2 * wr3;
	wr6 = wr3 * wr3 - wi3 * wi3;
	wi6 = wr3 * wi3 + wr3 * wi3;
	wr7 = wr3 * wr4 - wi3 * wi4;
	wi7 = wr3 * wi4 + wi3 * wr4; */
	w1 = _mm_load_pd(&w[j0]);
	w2 = ZMUL(w1, w1);
	w3 = ZMUL(w1, w2);
	w4 = ZMUL(w2, w2);
	w5 = ZMUL(w2, w3);
	w6 = ZMUL(w3, w3);
	w7 = ZMUL(w3, w4);
	for (i = 0; i < *m; i++) {
	    i0 = (i << 1) + (j * *m << 1);
	    i1 = i0 + (*m * *l << 1);
	    i2 = i1 + (*m * *l << 1);
	    i3 = i2 + (*m * *l << 1);
	    i4 = i3 + (*m * *l << 1);
	    i5 = i4 + (*m * *l << 1);
	    i6 = i5 + (*m * *l << 1);
	    i7 = i6 + (*m * *l << 1);
	    i8 = (i << 1) + (j * *m << 4);
	    i9 = i8 + (*m << 1);
	    i10 = i9 + (*m << 1);
	    i11 = i10 + (*m << 1);
	    i12 = i11 + (*m << 1);
	    i13 = i12 + (*m << 1);
	    i14 = i13 + (*m << 1);
	    i15 = i14 + (*m << 1);
	    /* x0 = a[i0] + a[i4];
	    y0 = a[i0 + 1] + a[i4 + 1];
	    x1 = a[i0] - a[i4];
	    y1 = a[i0 + 1] - a[i4 + 1];
	    x2 = a[i2] + a[i6];
	    y2 = a[i2 + 1] + a[i6 + 1];
	    x3 = a[i2 + 1] - a[i6 + 1];
	    y3 = a[i6] - a[i2]; */
	    t0 = _mm_load_pd(&a[i0]);
	    t2 = _mm_load_pd(&a[i4]);
	    t1 = _mm_sub_pd(t0, t2);
	    t0 = _mm_add_pd(t0, t2);
	    t3 = _mm_load_pd(&a[i2]);
	    t4 = _mm_load_pd(&a[i6]);
	    t2 = _mm_add_pd(t3, t4);
	    t3 = _mm_xor_pd(_mm_sub_pd(t3, t4), _mm_set_sd(-0.0));
	    t3 = _mm_shuffle_pd(t3, t3, 1);
	    /* u0 = x0 + x2;
	    v0 = y0 + y2;
	    u1 = x0 - x2;
	    v1 = y0 - y2; */
	    u0 = _mm_add_pd(t0, t2);
	    u1 = _mm_sub_pd(t0, t2);
	    /* x4 = a[i1] + a[i5];
	    y4 = a[i1 + 1] + a[i5 + 1];
	    x5 = a[i1] - a[i5];
	    y5 = a[i1 + 1] - a[i5 + 1];
	    x6 = a[i3] + a[i7];
	    y6 = a[i3 + 1] + a[i7 + 1];
	    x7 = a[i3] - a[i7];
	    y7 = a[i3 + 1] - a[i7 + 1]; */
	    t4 = _mm_load_pd(&a[i1]);
	    t6 = _mm_load_pd(&a[i5]);
	    t5 = _mm_sub_pd(t4, t6);
	    t4 = _mm_add_pd(t4, t6);
	    t7 = _mm_load_pd(&a[i3]);
	    t8 = _mm_load_pd(&a[i7]);
	    t6 = _mm_add_pd(t7, t8);
	    t7 = _mm_sub_pd(t7, t8);
	    /* u2 = x4 + x6;
	    v2 = y4 + y6;
	    u3 = y4 - y6;
	    v3 = x6 - x4; */
	    u2 = _mm_add_pd(t4, t6);
	    u3 = _mm_xor_pd(_mm_sub_pd(t4, t6), _mm_set_sd(-0.0));
	    u3 = _mm_shuffle_pd(u3, u3, 1);
	    /* b[i8] = u0 + u2;
	    b[i8 + 1] = v0 + v2;
	    b[i12] = wr4 * (u0 - u2) - wi4 * (v0 - v2);
	    b[i12 + 1] = wr4 * (v0 - v2) + wi4 * (u0 - u2);
	    b[i10] = wr2 * (u1 + u3) - wi2 * (v1 + v3);
	    b[i10 + 1] = wr2 * (v1 + v3) + wi2 * (u1 + u3);
	    b[i14] = wr6 * (u1 - u3) - wi6 * (v1 - v3);
	    b[i14 + 1] = wr6 * (v1 - v3) + wi6 * (u1 - u3); */
	    _mm_store_pd(&b[i8], _mm_add_pd(u0, u2));
	    _mm_store_pd(&b[i12], ZMUL(w4, _mm_sub_pd(u0, u2)));
	    _mm_store_pd(&b[i10], ZMUL(w2, _mm_add_pd(u1, u3)));
	    _mm_store_pd(&b[i14], ZMUL(w6, _mm_sub_pd(u1, u3)));
	    /* u0 = x1 + c81 * (x5 - x7);
	    v0 = y1 + c81 * (y5 - y7);
	    u1 = x1 - c81 * (x5 - x7);
	    v1 = y1 - c81 * (y5 - y7);
	    u2 = x3 + c81 * (y5 + y7);
	    v2 = y3 - c81 * (x5 + x7);
	    u3 = x3 - c81 * (y5 + y7);
	    v3 = y3 + c81 * (x5 + x7); */
	    u1 = _mm_mul_pd(c81, _mm_sub_pd(t5, t7));
	    u0 = _mm_add_pd(t1, u1);
	    u1 = _mm_sub_pd(t1, u1);
	    u3 = _mm_xor_pd(_mm_mul_pd(c81, _mm_add_pd(t5, t7)), _mm_set_sd(-0.0));
	    u3 = _mm_shuffle_pd(u3, u3, 1);
	    u2 = _mm_add_pd(t3, u3);
	    u3 = _mm_sub_pd(t3, u3);
	    /* b[i9] = wr1 * (u0 + u2) - wi1 * (v0 + v2);
	    b[i9 + 1] = wr1 * (v0 + v2) + wi1 * (u0 + u2);
	    b[i13] = wr5 * (u1 + u3) - wi5 * (v1 + v3);
	    b[i13 + 1] = wr5 * (v1 + v3) + wi5 * (u1 + u3);
	    b[i11] = wr3 * (u1 - u3) - wi3 * (v1 - v3);
	    b[i11 + 1] = wr3 * (v1 - v3) + wi3 * (u1 - u3);
	    b[i15] = wr7 * (u0 - u2) - wi7 * (v0 - v2);
	    b[i15 + 1] = wr7 * (v0 - v2) + wi7 * (u0 - u2); */
	    _mm_store_pd(&b[i9], ZMUL(w1, _mm_add_pd(u0, u2)));
	    _mm_store_pd(&b[i13], ZMUL(w5, _mm_add_pd(u1, u3)));
	    _mm_store_pd(&b[i11], ZMUL(w3, _mm_sub_pd(u1, u3)));
	    _mm_store_pd(&b[i15], ZMUL(w7, _mm_sub_pd(u0, u2)));
	}
    }
    return 0;
}
Example #23
0
void tce_sort_6_simd(double* unsorted,double* sorted,
                     int a, int b, int c, int d, int e, int f,
                     int i, int j, int k, int l, int m, int n,
                     double factor) {
    int id[6],jd[6],ia,ib,j1,j2,j3,j4,j5,j6;
    int l1,l2,l3,l4,l5,l6;
    int ia1,ia2,ia3,ia4,ia5,ia6;
    int ib1,ib2,ib3,ib4,ib5,ib6;
    int rangea1,rangea2,rangea3,rangea4,rangea5,rangea6;
    int rangeb1,rangeb2,rangeb3,rangeb4,rangeb5,rangeb6;
    int range[6],order[6],order_r[6];
    int jj1,jj2,jj3,jj4,jj5,jj6;
    int jj1_bound,jj2_bound,jj3_bound,jj4_bound,jj5_bound,jj6_bound;
    int N1,N2;

    double *pA, *pB;
    register __m128d x, y, z, w, p, q,fac_vector;

    fac_vector = _mm_load_sd(&factor);
    fac_vector = _mm_unpacklo_pd(fac_vector,fac_vector);
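    // tilesize is assumed to be an even, file-scope blocking constant shared
    // by the tce_sort_* kernels.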

    jd[0] = a;
    jd[1] = b;
    jd[2] = c;
    jd[3] = d;
    jd[4] = e;
    jd[5] = f;

    // prefer writes
    range[0] = b*c*d*e*f;
    range[1] = c*d*e*f;
    range[2] = d*e*f;
    range[3] = e*f;
    range[4] = f;
    range[5] = 1;

    l1 = jd[i];
    l2 = jd[j];
    l3 = jd[k];
    l4 = jd[l];
    l5 = jd[m];
    l6 = jd[n];


    rangea1 = range[i];
    rangea2 = range[j];
    rangea3 = range[k];
    rangea4 = range[l];
    rangea5 = range[m];
    rangea6 = range[n];


    rangeb1 = l2*l3*l4*l5*l6;
    rangeb2 = l3*l4*l5*l6;
    rangeb3 = l4*l5*l6;
    rangeb4 = l5*l6;
    rangeb5 = l6;
    rangeb6 = 1;

    // here vectorization can rely on the compiler
    if (n == 5) {
        for (j1 = 0; j1 < l1; j1++) {
            ia1 = j1*rangea1;
            ib1 = j1*rangeb1;
            for (j2 = 0; j2 < l2; j2++) {
                ia2 = ia1 + j2*rangea2;
                ib2 = ib1 + j2*rangeb2;
                for (j3 = 0; j3 < l3; j3++) {
                    ia3 = ia2 + j3*rangea3;
                    ib3 = ib2 + j3*rangeb3;
                    for (j4 = 0; j4 < l4; j4++) {
                        ia4 = ia3 + j4*rangea4;
                        ib4 = ib3 + j4*rangeb4;
                        for (j5 = 0; j5 < l5; j5++) {
                            ia5 = ia4 + j5*rangea5;
                            ib5 = ib4 + j5*rangeb5;
                            for (j6 = 0; j6 < l6; j6++) {
                                ia = ia5 + j6*rangea6;
                                ib = ib5 + j6*rangeb6;
                                sorted[ib] = unsorted[ia] * factor;
                            }
                        }
                    }
                }
            }
        }
    }

    if (m == 5) {
        for (j1 = 0; j1 < l1; j1++) {
            ia1 = j1*rangea1;
            ib1 = j1*rangeb1;
            for (j2 = 0; j2 < l2; j2++) {
                ia2 = ia1 + j2*rangea2;
                ib2 = ib1 + j2*rangeb2;
                for (j3 = 0; j3 < l3; j3++) {
                    ia3 = ia2 + j3*rangea3;
                    ib3 = ib2 + j3*rangeb3;
                    for (j4 = 0; j4 < l4; j4++) {
                        ia4 = ia3 + j4*rangea4;
                        ib4 = ib3 + j4*rangeb4;
                        for (j5 = 0; j5 < l5; j5 += tilesize) {
                            for (j6 = 0; j6 < l6; j6 += tilesize) {
                                jj5_bound = (j5 + tilesize > l5)? l5 :j5+tilesize;
                                for (jj5 = j5; jj5 < jj5_bound; jj5 += 2) {
                                    ia5 = ia4 + jj5*rangea5;
                                    ib5 = ib4 + jj5*rangeb5;
                                    jj6_bound = (j6 + tilesize > l6)? l6:j6+tilesize;
                                    for (jj6 = j6; jj6 < jj6_bound; jj6 += 2) {
                                        ia = ia5 + jj6*rangea6;
                                        ib = ib5 + jj6*rangeb6;
                                        N1 = rangeb5;
                                        N2 = rangea6;

                                        pA = unsorted+ia;
                                        pB = sorted+ib;
                                        x = _mm_loadu_pd(pA);
                                        x = _mm_mul_pd(x,fac_vector);
                                        y = _mm_loadu_pd(pA + N2);
                                        y = _mm_mul_pd(y,fac_vector);
                                        z = _mm_shuffle_pd( x, y, 0);
                                        w = _mm_shuffle_pd( x, y, 3);
                                        _mm_storeu_pd(pB,z);
                                        _mm_storeu_pd(pB + N1,w);
                                    }
                                }
                            }
                        }
                    }
                }
            }
        }
    }

    if (l == 5) {
        for (j1 = 0; j1 < l1; j1++) {
            ia1 = j1*rangea1;
            ib1 = j1*rangeb1;
            for (j2 = 0; j2 < l2; j2++) {
                ia2 = ia1 + j2*rangea2;
                ib2 = ib1 + j2*rangeb2;
                for (j3 = 0; j3 < l3; j3++) {
                    ia3 = ia2 + j3*rangea3;
                    ib3 = ib2 + j3*rangeb3;
                    for (j4 = 0; j4 < l4; j4 += tilesize) {
                        for (j5 = 0; j5 < l5; j5++) {
                            ia5 = ia3 + j5*rangea5;
                            ib5 = ib3 + j5*rangeb5;
                            for (j6 = 0; j6 < l6; j6 += tilesize) {
                                jj4_bound = (j4 + tilesize > l4)? l4 :j4+tilesize;
                                for (jj4 = j4; jj4 < jj4_bound; jj4 += 2) {
                                    ia4 = ia5 + jj4*rangea4;
                                    ib4 = ib5 + jj4*rangeb4;
                                    jj6_bound = (j6 + tilesize > l6)? l6:j6+tilesize;
                                    for (jj6 = j6; jj6 < jj6_bound; jj6 += 2) {
                                        ia = ia4 + jj6*rangea6;
                                        ib = ib4 + jj6*rangeb6;
                                        N1 = rangeb4;
                                        N2 = rangea6;

                                        pA = unsorted+ia;
                                        pB = sorted+ib;
                                        x = _mm_loadu_pd(pA);
                                        x = _mm_mul_pd(x,fac_vector);
                                        y = _mm_loadu_pd(pA + N2);
                                        y = _mm_mul_pd(y,fac_vector);
                                        z = _mm_shuffle_pd( x, y, 0);
                                        w = _mm_shuffle_pd( x, y, 3);
                                        _mm_storeu_pd(pB,z);
                                        _mm_storeu_pd(pB + N1,w);
                                    }
                                }
                            }
                        }
                    }
                }
            }
        }
    }

    if (k == 5) {
        for (j1 = 0; j1 < l1; j1++) {
            ia1 = j1*rangea1;
            ib1 = j1*rangeb1;
            for (j2 = 0; j2 < l2; j2++) {
                ia2 = ia1 + j2*rangea2;
                ib2 = ib1 + j2*rangeb2;
                for (j3 = 0; j3 < l3; j3 += tilesize) {
                    for (j4 = 0; j4 < l4; j4++) {
                        ia4 = ia2 + j4*rangea4;
                        ib4 = ib2 + j4*rangeb4;
                        for (j5 = 0; j5 < l5; j5++) {
                            ia5 = ia4 + j5*rangea5;
                            ib5 = ib4 + j5*rangeb5;
                            for (j6 = 0; j6 < l6; j6 += tilesize) {
                                jj3_bound = (j3 + tilesize > l3)? l3 :j3+tilesize;
                                for (jj3 = j3; jj3 < jj3_bound; jj3 += 2) {
                                    ia3 = ia5 + jj3*rangea3;
                                    ib3 = ib5 + jj3*rangeb3;
                                    jj6_bound = (j6 + tilesize > l6)? l6:j6+tilesize;
                                    for (jj6 = j6; jj6 < jj6_bound; jj6 += 2) {
                                        ia = ia3 + jj6*rangea6;
                                        ib = ib3 + jj6*rangeb6;
                                        N1 = rangeb3;
                                        N2 = rangea6;

                                        pA = unsorted+ia;
                                        pB = sorted+ib;
                                        x = _mm_loadu_pd(pA);
                                        x = _mm_mul_pd(x,fac_vector);
                                        y = _mm_loadu_pd(pA + N2);
                                        y = _mm_mul_pd(y,fac_vector);
                                        z = _mm_shuffle_pd( x, y, 0);
                                        w = _mm_shuffle_pd( x, y, 3);
                                        _mm_storeu_pd(pB,z);
                                        _mm_storeu_pd(pB + N1,w);
                                    }
                                }
                            }
                        }
                    }
                }
            }
        }
    }


    if (j == 5) {
        for (j1 = 0; j1 < l1; j1++) {
            ia1 = j1*rangea1;
            ib1 = j1*rangeb1;
            for (j2 = 0; j2 < l2; j2 += tilesize) {
                for (j3 = 0; j3 < l3; j3++) {
                    ia3 = ia1 + j3*rangea3;
                    ib3 = ib1 + j3*rangeb3;
                    for (j4 = 0; j4 < l4; j4++) {
                        ia4 = ia3 + j4*rangea4;
                        ib4 = ib3 + j4*rangeb4;
                        for (j5 = 0; j5 < l5; j5++) {
                            ia5 = ia4 + j5*rangea5;
                            ib5 = ib4 + j5*rangeb5;
                            for (j6 = 0; j6 < l6; j6 += tilesize) {
                                jj2_bound = (j2 + tilesize > l2)? l2 :j2+tilesize;
                                for (jj2 = j2; jj2 < jj2_bound; jj2 += 2) {
                                    ia2 = ia5 + jj2*rangea2;
                                    ib2 = ib5 + jj2*rangeb2;
                                    jj6_bound = (j6 + tilesize > l6)? l6:j6+tilesize;
                                    for (jj6 = j6; jj6 < jj6_bound; jj6 += 2) {
                                        ia = ia2 + jj6*rangea6;
                                        ib = ib2 + jj6*rangeb6;
                                        N1 = rangeb2;
                                        N2 = rangea6;

                                        pA = unsorted+ia;
                                        pB = sorted+ib;
                                        x = _mm_loadu_pd(pA);
                                        x = _mm_mul_pd(x,fac_vector);
                                        y = _mm_loadu_pd(pA + N2);
                                        y = _mm_mul_pd(y,fac_vector);
                                        z = _mm_shuffle_pd( x, y, 0);
                                        w = _mm_shuffle_pd( x, y, 3);
                                        _mm_storeu_pd(pB,z);
                                        _mm_storeu_pd(pB + N1,w);
                                    }
                                }
                            }
                        }
                    }
                }
            }
        }
    }

    if (i == 5) {
        for (j1 = 0; j1 < l1; j1 += tilesize) {
            for (j2 = 0; j2 < l2; j2++) {
                ia2 = j2*rangea2;
                ib2 = j2*rangeb2;
                for (j3 = 0; j3 < l3; j3++) {
                    ia3 = ia2 + j3*rangea3;
                    ib3 = ib2 + j3*rangeb3;
                    for (j4 = 0; j4 < l4; j4++) {
                        ia4 = ia3 + j4*rangea4;
                        ib4 = ib3 + j4*rangeb4;
                        for (j5 = 0; j5 < l5; j5++) {
                            ia5 = ia4 + j5*rangea5;
                            ib5 = ib4 + j5*rangeb5;
                            for (j6 = 0; j6 < l6; j6 += tilesize) {
                                jj1_bound = (j1 + tilesize > l1)? l1 :j1+tilesize;
                                for (jj1 = j1; jj1 < jj1_bound; jj1 += 2) {
                                    ia1 = ia5 + jj1*rangea1;
                                    ib1 = ib5 + jj1*rangeb1;
                                    jj6_bound = (j6 + tilesize > l6)? l6:j6+tilesize;
                                    for (jj6 = j6; jj6 < jj6_bound; jj6 += 2) {
                                        ia = ia1 + jj6*rangea6;
                                        ib = ib1 + jj6*rangeb6;
                                        N1 = rangeb1;
                                        N2 = rangea6;

                                        pA = unsorted+ia;
                                        pB = sorted+ib;
                                        x = _mm_loadu_pd(pA);
                                        x = _mm_mul_pd(x,fac_vector);
                                        y = _mm_loadu_pd(pA + N2);
                                        y = _mm_mul_pd(y,fac_vector);
                                        z = _mm_shuffle_pd( x, y, 0);
                                        w = _mm_shuffle_pd( x, y, 3);
                                        _mm_storeu_pd(pB,z);
                                        _mm_storeu_pd(pB + N1,w);
                                    }
                                }
                            }
                        }
                    }
                }
            }
        }
    }

}
Example #24
0
void
ffts_transpose(uint64_t *in, uint64_t *out, int w, int h)
{
#ifdef HAVE_NEON
#if 0
    neon_transpose4(in, out, w, h);
#else
    neon_transpose8(in, out, w, h);
#endif
#elif HAVE_SSE2
    uint64_t FFTS_ALIGN(64) tmp[TSIZE*TSIZE];
    int tx, ty;
    /* int x; */
    int y;
    int tw = w / TSIZE;
    int th = h / TSIZE;
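    /* TSIZE is assumed to be 8: the tile loops below hand-unroll exactly
       eight rows/columns per tile. */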

    for (ty = 0; ty < th; ty++) {
        for (tx = 0; tx < tw; tx++) {
            uint64_t *ip0 = in + w*TSIZE*ty + tx * TSIZE;
            uint64_t *op0 = tmp; /* out + h*TSIZE*tx + ty*TSIZE; */

            /* copy/transpose to tmp */
            for (y = 0; y < TSIZE; y += 2) {
                /* for (x=0;x<TSIZE;x+=2) {
                   op[x*TSIZE] = ip[x];
                */
                __m128d q0 = _mm_load_pd((double*)(ip0 + 0*w));
                __m128d q1 = _mm_load_pd((double*)(ip0 + 1*w));
                __m128d q2 = _mm_load_pd((double*)(ip0 + 2*w));
                __m128d q3 = _mm_load_pd((double*)(ip0 + 3*w));
                __m128d q4 = _mm_load_pd((double*)(ip0 + 4*w));
                __m128d q5 = _mm_load_pd((double*)(ip0 + 5*w));
                __m128d q6 = _mm_load_pd((double*)(ip0 + 6*w));
                __m128d q7 = _mm_load_pd((double*)(ip0 + 7*w));

                __m128d t0 = _mm_shuffle_pd(q0, q1, _MM_SHUFFLE2(0, 0));
                __m128d t1 = _mm_shuffle_pd(q0, q1, _MM_SHUFFLE2(1, 1));
                __m128d t2 = _mm_shuffle_pd(q2, q3, _MM_SHUFFLE2(0, 0));
                __m128d t3 = _mm_shuffle_pd(q2, q3, _MM_SHUFFLE2(1, 1));
                __m128d t4 = _mm_shuffle_pd(q4, q5, _MM_SHUFFLE2(0, 0));
                __m128d t5 = _mm_shuffle_pd(q4, q5, _MM_SHUFFLE2(1, 1));
                __m128d t6 = _mm_shuffle_pd(q6, q7, _MM_SHUFFLE2(0, 0));
                __m128d t7 = _mm_shuffle_pd(q6, q7, _MM_SHUFFLE2(1, 1));

                ip0 += 2;
                /* _mm_store_pd((double *)(op0 + y*h + x), t0);
                   _mm_store_pd((double *)(op0 + y*h + x + h), t1);
                   */

                _mm_store_pd((double*)(op0 + 0        ), t0);
                _mm_store_pd((double*)(op0 + 0 + TSIZE), t1);
                _mm_store_pd((double*)(op0 + 2        ), t2);
                _mm_store_pd((double*)(op0 + 2 + TSIZE), t3);
                _mm_store_pd((double*)(op0 + 4        ), t4);
                _mm_store_pd((double*)(op0 + 4 + TSIZE), t5);
                _mm_store_pd((double*)(op0 + 6        ), t6);
                _mm_store_pd((double*)(op0 + 6 + TSIZE), t7);
                /* } */

                op0 += 2*TSIZE;
            }

            op0 = out + h*tx*TSIZE + ty*TSIZE;
            ip0 = tmp;
            for (y = 0; y < TSIZE; y += 1) {
                /* memcpy(op0, ip0, TSIZE * sizeof(*ip0)); */

                __m128d q0 = _mm_load_pd((double*)(ip0 + 0));
                __m128d q1 = _mm_load_pd((double*)(ip0 + 2));
                __m128d q2 = _mm_load_pd((double*)(ip0 + 4));
                __m128d q3 = _mm_load_pd((double*)(ip0 + 6));

                _mm_store_pd((double*)(op0 + 0), q0);
                _mm_store_pd((double*)(op0 + 2), q1);
                _mm_store_pd((double*)(op0 + 4), q2);
                _mm_store_pd((double*)(op0 + 6), q3);

                op0 += h;
                ip0 += TSIZE;
            }
        }
    }
    /*
    size_t i,j;
    for(i=0;i<w;i+=2) {
    for(j=0;j<h;j+=2) {
    //		out[i*h + j] = in[j*w + i];
    __m128d q0 = _mm_load_pd((double *)(in + j*w + i));
    __m128d q1 = _mm_load_pd((double *)(in + j*w + i + w));
    __m128d t0 = _mm_shuffle_pd(q0, q1, _MM_SHUFFLE2(0, 0));
    __m128d t1 = _mm_shuffle_pd(q0, q1, _MM_SHUFFLE2(1, 1));
    _mm_store_pd((double *)(out + i*h + j), t0);
    _mm_store_pd((double *)(out + i*h + j + h), t1);
    }
    }
    */
#else
    const int bw = 1;
    const int bh = 8;
    int i = 0, j = 0;

    for (; i <= h - bh; i += bh) {
        for (j = 0; j <= w - bw; j += bw) {
            uint64_t const *ib = &in[w*i + j];
            uint64_t *ob = &out[h*j + i];

            uint64_t s_0_0 = ib[0*w + 0];
            uint64_t s_1_0 = ib[1*w + 0];
            uint64_t s_2_0 = ib[2*w + 0];
            uint64_t s_3_0 = ib[3*w + 0];
            uint64_t s_4_0 = ib[4*w + 0];
            uint64_t s_5_0 = ib[5*w + 0];
            uint64_t s_6_0 = ib[6*w + 0];
            uint64_t s_7_0 = ib[7*w + 0];

            ob[0*h + 0] = s_0_0;
            ob[0*h + 1] = s_1_0;
            ob[0*h + 2] = s_2_0;
            ob[0*h + 3] = s_3_0;
            ob[0*h + 4] = s_4_0;
            ob[0*h + 5] = s_5_0;
            ob[0*h + 6] = s_6_0;
            ob[0*h + 7] = s_7_0;
        }
    }

    if (i < h) {
        int i1;

        for (i1 = 0; i1 < w; i1++) {
            for (j = i; j < h; j++) {
                out[i1*h + j] = in[j*w + i1];
            }
        }
    }

    if (j < w) {
        int j1;

        for (i = j; i < w; i++) {
            for (j1 = 0; j1 < h; j1++) {
                out[i*h + j1] = in[j1*w + i];
            }
        }
    }
#endif
}
Example #25
0
void tce_sort_4_simd(double* unsorted,double* sorted,
                     int a, int b, int c, int d,
                     int i, int j, int k, int l,
                     double factor) {
    int id[4],jd[4],ia,ib,j1,j2,j3,j4;
    int l1,l2,l3,l4;
    int ia1,ia2,ia3,ia4;
    int ib1,ib2,ib3,ib4;
    int rangea1,rangea2,rangea3,rangea4;
    int rangeb1,rangeb2,rangeb3,rangeb4;
    int range[4],order[4],order_r[4];
    int jj1,jj2,jj3,jj4;
    int jj1_bound,jj2_bound,jj3_bound,jj4_bound;
    int count,ir,jr,kr,lr,N1,N2;

    double *pA, *pB;
    register __m128d x, y, z, w, t, t1,fac_vector;

    fac_vector = _mm_load_sd(&factor);
    fac_vector = _mm_unpacklo_pd(fac_vector,fac_vector);

    jd[0] = a;
    jd[1] = b;
    jd[2] = c;
    jd[3] = d;
    // prefer writes

    range[0] = b*c*d;
    range[1] = c*d;
    range[2] = d;
    range[3] = 1;

    l1 = jd[i];
    l2 = jd[j];
    l3 = jd[k];
    l4 = jd[l];

    rangea1 = range[i];
    rangea2 = range[j];
    rangea3 = range[k];
    rangea4 = range[l];

    rangeb1 = l2*l3*l4;
    rangeb2 = l3*l4;
    rangeb3 = l4;
    rangeb4 = 1;


    // here vectorization can rely on the compiler
    if (l == 3) {
        for (j1 = 0; j1 < l1; j1++) {
            ia1 = j1*rangea1;
            ib1 = j1*rangeb1;
            for (j2 = 0; j2 < l2; j2++) {
                ia2 = ia1 + j2*rangea2;
                ib2 = ib1 + j2*rangeb2;
                for (j3 = 0; j3 < l3; j3++) {
                    ia3 = ia2 + j3*rangea3;
                    ib3 = ib2 + j3*rangeb3;
                    for (j4 = 0; j4 < l4; j4++) {
                        ia = ia3 + j4*rangea4;
                        ib = ib3 + j4*rangeb4;
                        sorted[ib] = unsorted[ia] * factor;
                    }
                }
            }
        }
    }

    if (k == 3) {
        for (j1 = 0; j1 < l1; j1++) {
            ia1 = j1*rangea1;
            ib1 = j1*rangeb1;
            for (j2 = 0; j2 < l2; j2++) {
                ia2 = ia1 + j2*rangea2;
                ib2 = ib1 + j2*rangeb2;
                for (j3 = 0; j3 < l3; j3 += tilesize) {
                    for (j4 = 0; j4 < l4; j4 += tilesize) {
                        jj3_bound = (j3 + tilesize > l3)? l3 :j3+tilesize;
                        for (jj3 = j3; jj3 < jj3_bound; jj3 += 2) {
                            ia3 = ia2 + jj3*rangea3;
                            ib3 = ib2 + jj3*rangeb3;
                            jj4_bound = (j4 + tilesize > l4)? l4:j4+tilesize;
                            for (jj4 = j4; jj4 < jj4_bound; jj4 += 2) {
                                ia = ia3 + jj4*rangea4;
                                ib = ib3 + jj4*rangeb4;
                                N1 = rangeb3;
                                N2 = rangea4;

                                pA = unsorted+ia;
                                pB = sorted+ib;
                                x = _mm_loadu_pd(pA);
                                x = _mm_mul_pd(x,fac_vector);
                                y = _mm_loadu_pd(pA + N2);
                                y = _mm_mul_pd(y,fac_vector);
                                z = _mm_shuffle_pd( x, y, 0);
                                w = _mm_shuffle_pd( x, y, 3);
                                _mm_storeu_pd(pB,z);
                                _mm_storeu_pd(pB + N1,w);
                            }
                        }
                    }
                }
            }
        }
    }

    if (j == 3) {
        for (j1 = 0; j1 < l1; j1++) {
            ia1 = j1*rangea1;
            ib1 = j1*rangeb1;
            for (j2 = 0; j2 < l2; j2 += tilesize) {
                for (j3 = 0; j3 < l3; j3++) {
                    ia3 = ia1 + j3*rangea3;
                    ib3 = ib1 + j3*rangeb3;
                    for (j4 = 0; j4 < l4; j4 += tilesize) {
                        jj2_bound = (j2 + tilesize > l2)? l2 :j2+tilesize;
                        for (jj2 = j2; jj2 < jj2_bound; jj2 += 2) {
                            ia2 = ia3 + jj2*rangea2;
                            ib2 = ib3 + jj2*rangeb2;
                            jj4_bound = (j4 + tilesize > l4)? l4:j4+tilesize;
                            for (jj4 = j4; jj4 < jj4_bound; jj4 += 2) {
                                ia = ia2 + jj4*rangea4;
                                ib = ib2 + jj4*rangeb4;
                                N1 = rangeb2;
                                N2 = rangea4;

                                pA = unsorted+ia;
                                pB = sorted+ib;
                                x = _mm_loadu_pd(pA);
                                x = _mm_mul_pd(x,fac_vector);
                                y = _mm_loadu_pd(pA + N2);
                                y = _mm_mul_pd(y,fac_vector);
                                z = _mm_shuffle_pd( x, y, 0);
                                w = _mm_shuffle_pd( x, y, 3);
                                _mm_storeu_pd(pB,z);
                                _mm_storeu_pd(pB + N1,w);
                            }
                        }
                    }
                }
            }
        }
    }

    if (i == 3) {
        for (j1 = 0; j1 < l1; j1 += tilesize) {
            for (j2 = 0; j2 < l2; j2++) {
                ia2 = j2*rangea2;
                ib2 = j2*rangeb2;
                for (j3 = 0; j3 < l3; j3++) {
                    ia3 = ia2 + j3*rangea3;
                    ib3 = ib2 + j3*rangeb3;
                    for (j4 = 0; j4 < l4; j4 += tilesize) {
                        jj1_bound = (j1 + tilesize > l1)? l1 :j1+tilesize;
                        for (jj1 = j1; jj1 < jj1_bound; jj1 += 2) {
                            ia1 = ia3 + jj1*rangea1;
                            ib1 = ib3 + jj1*rangeb1;
                            jj4_bound = (j4 + tilesize > l4)? l4:j4+tilesize;
                            for (jj4 = j4; jj4 < jj4_bound; jj4 += 2) {
                                ia = ia1 + jj4*rangea4;
                                ib = ib1 + jj4*rangeb4;
                                N1 = rangeb1;
                                N2 = rangea4;

                                pA = unsorted+ia;
                                pB = sorted+ib;
                                x = _mm_loadu_pd(pA);
                                x = _mm_mul_pd(x,fac_vector);
                                y = _mm_loadu_pd(pA + N2);
                                y = _mm_mul_pd(y,fac_vector);
                                z = _mm_shuffle_pd( x, y, 0);
                                w = _mm_shuffle_pd( x, y, 3);
                                _mm_storeu_pd(pB,z);
                                _mm_storeu_pd(pB + N1,w);
                            }
                        }
                    }
                }
            }
        }
    }

}
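/* For reference, every branch above reduces to the same 2x2 micro-kernel:
 * load two rows of two doubles, scale by `factor`, and use _mm_shuffle_pd to
 * store them back transposed.  A standalone sketch (the names src, dst,
 * srcStride and dstStride are illustrative, not taken from the routine above):
 */
#include <emmintrin.h>

static void transpose2x2_scale(const double *src, double *dst,
                               int srcStride, int dstStride, double factor)
{
    __m128d fac = _mm_set1_pd(factor);
    __m128d r0  = _mm_mul_pd(_mm_loadu_pd(src),             fac); /* a00 a01 */
    __m128d r1  = _mm_mul_pd(_mm_loadu_pd(src + srcStride), fac); /* a10 a11 */
    __m128d c0  = _mm_shuffle_pd(r0, r1, 0);                      /* a00 a10 */
    __m128d c1  = _mm_shuffle_pd(r0, r1, 3);                      /* a01 a11 */
    _mm_storeu_pd(dst,             c0);
    _mm_storeu_pd(dst + dstStride, c1);
}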
Example #26
0
void AES_Key_Expansion_PARA_3(
        const unsigned char *userkey1,
        const unsigned char *userkey2,
        const unsigned char *userkey3,
        unsigned char *key1,
        unsigned char *key2,
        unsigned char *key3)
{
    __m128i temp1_1, temp2_1;
    __m128i temp1_2, temp2_2, temp3_2, temp4_2; 
    __m128i temp1_3, temp2_3;

    __m128i *Key_Schedule1 = (__m128i*)key1; 
    __m128i *Key_Schedule2 = (__m128i*)key2; 
    __m128i *Key_Schedule3 = (__m128i*)key3; 
     
    temp1_1 = _mm_loadu_si128((__m128i*)userkey1); 
    temp1_2 = _mm_loadu_si128((__m128i*)userkey2); 
    temp3_2 = _mm_loadu_si128((__m128i*)(userkey2+16)); 
    temp1_3 = _mm_loadu_si128((__m128i*)userkey3); 
    Key_Schedule1[0] = temp1_1; 
    Key_Schedule2[0] = temp1_2; 
    Key_Schedule2[1] = temp3_2; 
    Key_Schedule3[0] = temp1_3; 
    temp2_1 = _mm_aeskeygenassist_si128 (temp1_1 ,0x1); 
    temp2_2 = _mm_aeskeygenassist_si128 (temp3_2 ,0x1); 
    temp2_3 = _mm_aeskeygenassist_si128 (temp1_3 ,0x1); 
    temp1_1 = AES_128_ASSIST(temp1_1, temp2_1); 
    KEY_192_ASSIST(&temp1_2, &temp2_2, &temp3_2); 
    temp1_3 = AES_128_ASSIST(temp1_3, temp2_3); 
    /*KEY_ASSIST_PARA_3(
            temp1_1, temp2_1, &temp1_1,
            &temp1_2, &temp2_2, &temp3_2,
            temp1_3, temp2_3, &temp1_3);*/
    Key_Schedule1[1] = temp1_1; 
    Key_Schedule2[1] = (__m128i)_mm_shuffle_pd((__m128d)Key_Schedule2[1], (__m128d)temp1_2,0); 
    Key_Schedule2[2] = (__m128i)_mm_shuffle_pd((__m128d)temp1_2, (__m128d)temp3_2,1); 
    Key_Schedule3[1] = temp1_3; 
    temp2_1 = _mm_aeskeygenassist_si128 (temp1_1,0x2); 
    temp2_2 = _mm_aeskeygenassist_si128 (temp3_2,0x2);
    temp2_3 = _mm_aeskeygenassist_si128 (temp1_3,0x2); 
    temp1_1 = AES_128_ASSIST(temp1_1, temp2_1); 
    KEY_192_ASSIST(&temp1_2, &temp2_2, &temp3_2); 
    temp1_3 = AES_128_ASSIST(temp1_3, temp2_3); 
    /*KEY_ASSIST_PARA_3(
            temp1_1, temp2_1, &temp1_1,
            &temp1_2, &temp2_2, &temp3_2,
            temp1_3, temp2_3, &temp1_3);*/
    Key_Schedule1[2] = temp1_1;   
    Key_Schedule2[3] = temp1_2; 
    Key_Schedule2[4] = temp3_2; 
    Key_Schedule3[2] = temp1_3;   
    temp2_1 = _mm_aeskeygenassist_si128 (temp1_1,0x4); 
    temp2_2 = _mm_aeskeygenassist_si128 (temp3_2,0x4); 
    temp2_3 = _mm_aeskeygenassist_si128 (temp1_3,0x4); 
    temp1_1 = AES_128_ASSIST(temp1_1, temp2_1); 
    KEY_192_ASSIST(&temp1_2, &temp2_2, &temp3_2); 
    temp1_3 = AES_128_ASSIST(temp1_3, temp2_3); 
    /*KEY_ASSIST_PARA_3(
            temp1_1, temp2_1, &temp1_1,
            &temp1_2, &temp2_2, &temp3_2,
            temp1_3, temp2_3, &temp1_3);*/
    Key_Schedule1[3] = temp1_1; 
    Key_Schedule2[4] = (__m128i)_mm_shuffle_pd((__m128d)Key_Schedule2[4], (__m128d)temp1_2,0); 
    Key_Schedule2[5] = (__m128i)_mm_shuffle_pd((__m128d)temp1_2,(__m128d)temp3_2,1); 
    Key_Schedule3[3] = temp1_3; 
    temp2_1 = _mm_aeskeygenassist_si128 (temp1_1,0x8); 
    temp2_2 = _mm_aeskeygenassist_si128 (temp3_2,0x8); 
    temp2_3 = _mm_aeskeygenassist_si128 (temp1_3,0x8); 
    temp1_1 = AES_128_ASSIST(temp1_1, temp2_1); 
    KEY_192_ASSIST(&temp1_2, &temp2_2, &temp3_2); 
    temp1_3 = AES_128_ASSIST(temp1_3, temp2_3); 
    /*KEY_ASSIST_PARA_3(
            temp1_1, temp2_1, &temp1_1,
            &temp1_2, &temp2_2, &temp3_2,
            temp1_3, temp2_3, &temp1_3);*/
    Key_Schedule1[4] = temp1_1; 
    Key_Schedule2[6] = temp1_2; 
    Key_Schedule2[7] = temp3_2; 
    Key_Schedule3[4] = temp1_3; 
    temp2_1 = _mm_aeskeygenassist_si128 (temp1_1,0x10); 
    temp2_2 = _mm_aeskeygenassist_si128 (temp3_2,0x10); 
    temp2_3 = _mm_aeskeygenassist_si128 (temp1_3,0x10); 
    temp1_1 = AES_128_ASSIST(temp1_1, temp2_1); 
    KEY_192_ASSIST(&temp1_2, &temp2_2, &temp3_2); 
    temp1_3 = AES_128_ASSIST(temp1_3, temp2_3); 
    /*KEY_ASSIST_PARA_3(
            temp1_1, temp2_1, &temp1_1,
            &temp1_2, &temp2_2, &temp3_2,
            temp1_3, temp2_3, &temp1_3);*/
    Key_Schedule1[5] = temp1_1; 
    Key_Schedule2[7] = (__m128i)_mm_shuffle_pd((__m128d)Key_Schedule2[7], (__m128d)temp1_2,0); 
    Key_Schedule2[8] = (__m128i)_mm_shuffle_pd((__m128d)temp1_2,(__m128d)temp3_2,1); 
    Key_Schedule3[5] = temp1_3; 
    temp2_1 = _mm_aeskeygenassist_si128 (temp1_1,0x20); 
    temp2_2 = _mm_aeskeygenassist_si128 (temp3_2,0x20); 
    temp2_3 = _mm_aeskeygenassist_si128 (temp1_3,0x20); 
    temp1_1 = AES_128_ASSIST(temp1_1, temp2_1); 
    KEY_192_ASSIST(&temp1_2, &temp2_2, &temp3_2); 
    temp1_3 = AES_128_ASSIST(temp1_3, temp2_3); 
    /*KEY_ASSIST_PARA_3(
            temp1_1, temp2_1, &temp1_1,
            &temp1_2, &temp2_2, &temp3_2,
            temp1_3, temp2_3, &temp1_3);*/
    Key_Schedule1[6] = temp1_1; 
    Key_Schedule2[9] = temp1_2; 
    Key_Schedule2[10]= temp3_2; 
    Key_Schedule3[6] = temp1_3; 
    temp2_1 = _mm_aeskeygenassist_si128 (temp1_1,0x40); 
    temp2_2 = _mm_aeskeygenassist_si128 (temp3_2,0x40); 
    temp2_3 = _mm_aeskeygenassist_si128 (temp1_3,0x40); 
    temp1_1 = AES_128_ASSIST(temp1_1, temp2_1); 
    KEY_192_ASSIST(&temp1_2, &temp2_2, &temp3_2); 
    temp1_3 = AES_128_ASSIST(temp1_3, temp2_3); 
    /*KEY_ASSIST_PARA_3(
            temp1_1, temp2_1, &temp1_1,
            &temp1_2, &temp2_2, &temp3_2,
            temp1_3, temp2_3, &temp1_3);*/
    Key_Schedule1[7] = temp1_1; 
    Key_Schedule2[10]= (__m128i)_mm_shuffle_pd((__m128d)Key_Schedule2[10], (__m128d)temp1_2,0); 
    Key_Schedule2[11]= (__m128i)_mm_shuffle_pd((__m128d)temp1_2,(__m128d)temp3_2,1); 
    Key_Schedule3[7] = temp1_3; 
    temp2_1 = _mm_aeskeygenassist_si128 (temp1_1,0x80); 
    temp2_2 = _mm_aeskeygenassist_si128 (temp3_2,0x80); 
    temp2_3 = _mm_aeskeygenassist_si128 (temp1_3,0x80); 
    temp1_1 = AES_128_ASSIST(temp1_1, temp2_1); 
    KEY_192_ASSIST(&temp1_2, &temp2_2, &temp3_2); 
    temp1_3 = AES_128_ASSIST(temp1_3, temp2_3); 
    /*KEY_ASSIST_PARA_3(
            temp1_1, temp2_1, &temp1_1,
            &temp1_2, &temp2_2, &temp3_2,
            temp1_3, temp2_3, &temp1_3);*/
    Key_Schedule1[8] = temp1_1;     
    Key_Schedule2[12]= temp1_2;    
    Key_Schedule3[8] = temp1_3;     
    temp2_1 = _mm_aeskeygenassist_si128 (temp1_1,0x1b); 
    temp2_3 = _mm_aeskeygenassist_si128 (temp1_3,0x1b); 
    temp1_1 = AES_128_ASSIST(temp1_1, temp2_1); 
    temp1_3 = AES_128_ASSIST(temp1_3, temp2_3); 
    Key_Schedule1[9] = temp1_1; 
    Key_Schedule3[9] = temp1_3; 
    temp2_1 = _mm_aeskeygenassist_si128 (temp1_1,0x36); 
    temp2_3 = _mm_aeskeygenassist_si128 (temp1_3,0x36); 
    temp1_1 = AES_128_ASSIST(temp1_1, temp2_1); 
    temp1_3 = AES_128_ASSIST(temp1_3, temp2_3); 
    Key_Schedule1[10] = temp1_1; 
    Key_Schedule3[10] = temp1_3; 
}
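/* AES_128_ASSIST and KEY_192_ASSIST are not shown in this example.  As a
 * hedged sketch, the 128-bit helper is commonly written as below (following
 * the pattern in Intel's AES-NI white paper); the version this code links
 * against may differ.  KEY_192_ASSIST applies the same shift-and-xor folding
 * but also updates the extra 64-bit half of the 192-bit schedule. */
#include <emmintrin.h>

static inline __m128i AES_128_ASSIST_sketch(__m128i temp1, __m128i temp2)
{
    __m128i temp3;
    temp2 = _mm_shuffle_epi32(temp2, 0xff);   /* broadcast the RotWord/SubWord result */
    temp3 = _mm_slli_si128(temp1, 0x4);
    temp1 = _mm_xor_si128(temp1, temp3);      /* fold in key << 32 */
    temp3 = _mm_slli_si128(temp3, 0x4);
    temp1 = _mm_xor_si128(temp1, temp3);      /* fold in key << 64 */
    temp3 = _mm_slli_si128(temp3, 0x4);
    temp1 = _mm_xor_si128(temp1, temp3);      /* fold in key << 96 */
    return _mm_xor_si128(temp1, temp2);
}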
static inline __m128d
my_invrsq_pd(__m128d x)
{
	const __m128d three = _mm_set1_pd(3.0);
	const __m128d half  = _mm_set1_pd(0.5);
	
	__m128  t  = _mm_rsqrt_ps(_mm_cvtpd_ps(x)); /* Convert to single precision and do _mm_rsqrt_ps() */
	__m128d t1 = _mm_cvtps_pd(t); /* Convert back to double precision */
	
	/* First Newton-Raphson step, accuracy is now 24 bits */
	__m128d t2 = _mm_mul_pd(half,_mm_mul_pd(t1,_mm_sub_pd(three,_mm_mul_pd(x,_mm_mul_pd(t1,t1)))));
	
	/* Return second Newton-Raphson step, accuracy ~48 bits */
	return _mm_mul_pd(half,_mm_mul_pd(t2,_mm_sub_pd(three,_mm_mul_pd(x,_mm_mul_pd(t2,t2)))));
}
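/* Usage sketch for my_invrsq_pd() (illustrative values, not part of the kernel
 * below; assumes SSE2 / <emmintrin.h>): approximate 1/sqrt(x) for two doubles. */
static void my_invrsq_pd_demo(void)
{
    double in[2]  = { 2.0, 9.0 };
    double out[2];
    _mm_storeu_pd(out, my_invrsq_pd(_mm_loadu_pd(in)));
    /* out[0] ~= 0.70710678, out[1] ~= 0.33333333 (roughly 48-bit accuracy) */
}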

/* Extract the imm-th 32-bit integer from a __m128i: shift right by 4*imm bytes
 * and take the low 32 bits.  Despite the epi64 name, this indexes 32-bit lanes
 * (the table indices produced by _mm_cvttpd_epi32 sit in the two low 32-bit
 * lanes), and it shadows the SSE4.1 intrinsic of the same name. */
#define _mm_extract_epi64(x, imm) \
    _mm_cvtsi128_si32(_mm_srli_si128((x), 4 * (imm)))
	
void nb_kernel400_x86_64_sse2(int *           p_nri,
                    int *           iinr,
                    int *           jindex,
                    int *           jjnr,
                    int *           shift,
                    double *         shiftvec,
                    double *         fshift,
                    int *           gid,
                    double *         pos,
                    double *         faction,
                    double *         charge,
                    double *         p_facel,
                    double *         p_krf,
                    double *         p_crf,
                    double *         Vc,
                    int *           type,
                    int *           p_ntype,
                    double *         vdwparam,
                    double *         Vvdw,
                    double *         p_tabscale,
                    double *         VFtab,
                    double *         invsqrta,
                    double *         dvda,
                    double *         p_gbtabscale,
                    double *         GBtab,
                    int *           p_nthreads,
                    int *           count,
                    void *          mtx,
                    int *           outeriter,
                    int *           inneriter,
                    double *         work)
{
	int           nri,ntype,nthreads,offset;
	int           n,ii,is3,ii3,k,nj0,nj1,jnr1,jnr2,j13,j23,ggid;
	double        facel,krf,crf,tabscl,gbtabscl,vct,vgbt;
	double        shX,shY,shZ,isai_d,dva;
	gmx_gbdata_t *gbdata;
	float *        gpol;

	__m128d       ix,iy,iz,jx,jy,jz;
	__m128d		  dx,dy,dz,t1,t2,t3;
	__m128d		  fix,fiy,fiz,rsq11,rinv,r,fscal,rt,eps,eps2;
	__m128d		  q,iq,qq,isai,isaj,isaprod,vcoul,gbscale,dvdai,dvdaj;
	__m128d       Y,F,G,H,Fp,VV,FF,vgb,fijC,dvdatmp,dvdasum,vctot,vgbtot,n0d;
	__m128d		  xmm0,xmm1,xmm2,xmm3,xmm4,xmm5,xmm6,xmm7,xmm8;
	__m128d       fac,tabscale,gbtabscale;
	__m128i       n0,nnn;
	
	const __m128d neg    = _mm_set1_pd(-1.0);
	const __m128d zero   = _mm_setzero_pd();
	const __m128d half   = _mm_set1_pd(0.5);
	const __m128d two    = _mm_set1_pd(2.0);
	const __m128d three  = _mm_set1_pd(3.0);
	
	gbdata     = (gmx_gbdata_t *)work;
	gpol       = gbdata->gpol;

	nri        = *p_nri;
	ntype      = *p_ntype;
	nthreads   = *p_nthreads; 
    facel      = (*p_facel) * (1.0 - (1.0/gbdata->gb_epsilon_solvent));       
	krf        = *p_krf;
	crf        = *p_crf;
	tabscl     = *p_tabscale;
	gbtabscl   = *p_gbtabscale;
	nj1        = 0;
	
	/* Splat variables */
	fac        = _mm_load1_pd(&facel);
	tabscale   = _mm_load1_pd(&tabscl);
	gbtabscale = _mm_load1_pd(&gbtabscl);
		
	/* Keep compiler happy */
	dvdatmp = _mm_setzero_pd();
	vgb     = _mm_setzero_pd();
	dvdaj   = _mm_setzero_pd();
	isaj    = _mm_setzero_pd();
	vcoul   = _mm_setzero_pd();
	t1      = _mm_setzero_pd();
	t2      = _mm_setzero_pd();
	t3      = _mm_setzero_pd();

	jnr1=jnr2=0;
	j13=j23=0;
	
	for(n=0;n<nri;n++)
	{
		is3     = 3*shift[n];
		shX     = shiftvec[is3];
		shY     = shiftvec[is3+1];
		shZ     = shiftvec[is3+2];
		
		nj0     = jindex[n];      
        nj1     = jindex[n+1];  
		offset  = (nj1-nj0)%2;
		
		ii      = iinr[n];
		ii3     = ii*3;
		
		ix      = _mm_set1_pd(shX+pos[ii3+0]);
		iy      = _mm_set1_pd(shY+pos[ii3+1]);
		iz      = _mm_set1_pd(shZ+pos[ii3+2]);
		q       = _mm_set1_pd(charge[ii]);
		
		iq      = _mm_mul_pd(fac,q); 
		isai_d  = invsqrta[ii];
		isai    = _mm_load1_pd(&isai_d);
		
		fix     = _mm_setzero_pd();
		fiy     = _mm_setzero_pd();
		fiz     = _mm_setzero_pd();
		dvdasum = _mm_setzero_pd();
		vctot   = _mm_setzero_pd();
		vgbtot  = _mm_setzero_pd();
		
		for(k=nj0;k<nj1-offset; k+=2)
		{
			jnr1    = jjnr[k];
			jnr2    = jjnr[k+1];
						
			j13     = jnr1 * 3;
			j23     = jnr2 * 3;
			
			/* Load coordinates */
			xmm1    = _mm_loadu_pd(pos+j13); /* x1 y1 */
			xmm2    = _mm_loadu_pd(pos+j23); /* x2 y2 */
			
			xmm5    = _mm_load_sd(pos+j13+2); /* z1 - */
			xmm6    = _mm_load_sd(pos+j23+2); /* z2 - */
			
			/* transpose */
			jx      = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0)); 
			jy      = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1)); 
			jz      = _mm_shuffle_pd(xmm5,xmm6,_MM_SHUFFLE2(0,0)); 
			
			/* distances */
			dx      = _mm_sub_pd(ix,jx);
			dy		= _mm_sub_pd(iy,jy);
			dz		= _mm_sub_pd(iz,jz);
			
			rsq11   = _mm_add_pd( _mm_add_pd( _mm_mul_pd(dx,dx) , _mm_mul_pd(dy,dy) ) , _mm_mul_pd(dz,dz) );
			rinv    = my_invrsq_pd(rsq11);
						
			/* Load invsqrta */
			isaj	= _mm_loadl_pd(isaj,invsqrta+jnr1);
			isaj	= _mm_loadh_pd(isaj,invsqrta+jnr2);
			isaprod = _mm_mul_pd(isai,isaj);
			
			/* Load charges */
			q		= _mm_loadl_pd(q,charge+jnr1);
			q		= _mm_loadh_pd(q,charge+jnr2);
			qq		= _mm_mul_pd(iq,q);
			
			vcoul	= _mm_mul_pd(qq,rinv);
			fscal	= _mm_mul_pd(vcoul,rinv);
			qq		= _mm_mul_pd(isaprod,qq);
			qq		= _mm_mul_pd(qq,neg);
			gbscale	= _mm_mul_pd(isaprod,gbtabscale);
			
			/* Load dvdaj */
			dvdaj	= _mm_loadl_pd(dvdaj, dvda+jnr1);
			dvdaj	= _mm_loadh_pd(dvdaj, dvda+jnr2);
			
			r		= _mm_mul_pd(rsq11,rinv);
			rt		= _mm_mul_pd(r,gbscale);
			n0		= _mm_cvttpd_epi32(rt);
			n0d		= _mm_cvtepi32_pd(n0);
			eps		= _mm_sub_pd(rt,n0d);
			eps2	= _mm_mul_pd(eps,eps);
			
			nnn		= _mm_slli_epi64(n0,2);
			
			xmm1	= _mm_load_pd(GBtab+(_mm_extract_epi64(nnn,0)));   /* Y1 F1 */
			xmm2	= _mm_load_pd(GBtab+(_mm_extract_epi64(nnn,1)));   /* Y2 F2 */
			xmm3	= _mm_load_pd(GBtab+(_mm_extract_epi64(nnn,0))+2); /* G1 H1 */
			xmm4	= _mm_load_pd(GBtab+(_mm_extract_epi64(nnn,1))+2); /* G2 H2 */
			
			Y		= _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0)); /* Y1 Y2 */
			F		= _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1)); /* F1 F2 */
			G		= _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(0,0)); /* G1 G2 */
			H		= _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(1,1)); /* H1 H2 */
			
			G		= _mm_mul_pd(G,eps);
			H		= _mm_mul_pd(H,eps2);
			Fp		= _mm_add_pd(F,G);
			Fp		= _mm_add_pd(Fp,H);
			VV		= _mm_mul_pd(Fp,eps);
			VV		= _mm_add_pd(Y,VV);
			H		= _mm_mul_pd(two,H);
			FF		= _mm_add_pd(Fp,G);
			FF		= _mm_add_pd(FF,H);
			vgb		= _mm_mul_pd(qq,VV);
			fijC	= _mm_mul_pd(qq,FF);
			fijC	= _mm_mul_pd(fijC,gbscale);
			
			dvdatmp = _mm_mul_pd(fijC,r);
			dvdatmp	= _mm_add_pd(vgb,dvdatmp);
			dvdatmp = _mm_mul_pd(dvdatmp,neg);
			dvdatmp = _mm_mul_pd(dvdatmp,half);
			dvdasum	= _mm_add_pd(dvdasum,dvdatmp);
			
			xmm1	= _mm_mul_pd(dvdatmp,isaj);
			xmm1	= _mm_mul_pd(xmm1,isaj);
			dvdaj	= _mm_add_pd(dvdaj,xmm1);
			
			/* store dvda */
			_mm_storel_pd(dvda+jnr1,dvdaj);
			_mm_storeh_pd(dvda+jnr2,dvdaj);
			
			vctot	= _mm_add_pd(vctot,vcoul);
			vgbtot  = _mm_add_pd(vgbtot,vgb);
					
			fscal	= _mm_sub_pd(fijC,fscal);
			fscal	= _mm_mul_pd(fscal,neg);
			fscal	= _mm_mul_pd(fscal,rinv);
						
			/* calculate partial force terms */
			t1		= _mm_mul_pd(fscal,dx);
			t2		= _mm_mul_pd(fscal,dy);
			t3		= _mm_mul_pd(fscal,dz);
			
			/* update the i force */
			fix		= _mm_add_pd(fix,t1);
			fiy		= _mm_add_pd(fiy,t2);
			fiz		= _mm_add_pd(fiz,t3);
			
			/* accumulate forces from memory */
			xmm1	= _mm_loadu_pd(faction+j13); /* fx1 fy1 */
			xmm2	= _mm_loadu_pd(faction+j23); /* fx2 fy2 */
			
			xmm5	= _mm_load1_pd(faction+j13+2); /* fz1 fz1 */
			xmm6	= _mm_load1_pd(faction+j23+2); /* fz2 fz2 */
			
			/* transpose */
			xmm7	= _mm_shuffle_pd(xmm5,xmm6,_MM_SHUFFLE2(0,0)); /* fz1 fz2 */
			xmm5	= _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0)); /* fx1 fx2 */
			xmm6	= _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1)); /* fy1 fy2 */
			
			/* subtract partial forces */
			xmm5	= _mm_sub_pd(xmm5,t1);
			xmm6	= _mm_sub_pd(xmm6,t2);
			xmm7	= _mm_sub_pd(xmm7,t3);
			
			xmm1	= _mm_shuffle_pd(xmm5,xmm6,_MM_SHUFFLE2(0,0)); /* fx1 fy1 */
			xmm2	= _mm_shuffle_pd(xmm5,xmm6,_MM_SHUFFLE2(1,1)); /* fx2 fy2 */
			
			/* store fx and fy */
			_mm_storeu_pd(faction+j13,xmm1);
			_mm_storeu_pd(faction+j23,xmm2);
			
			/* .. then fz */
			_mm_storel_pd(faction+j13+2,xmm7);
			_mm_storeh_pd(faction+j23+2,xmm7);
		}

		/* In double precision, offset can only be either 0 or 1 */
		if(offset!=0)
		{
			jnr1	= jjnr[k];
			j13		= jnr1*3;
			
			jx      = _mm_load_sd(pos+j13);
			jy      = _mm_load_sd(pos+j13+1);
			jz      = _mm_load_sd(pos+j13+2);
						
			isaj	= _mm_load_sd(invsqrta+jnr1);
			isaprod = _mm_mul_sd(isai,isaj);
			dvdaj	= _mm_load_sd(dvda+jnr1);
			q		= _mm_load_sd(charge+jnr1);
			qq      = _mm_mul_sd(iq,q);
			
			dx      = _mm_sub_sd(ix,jx);
			dy		= _mm_sub_sd(iy,jy);
			dz		= _mm_sub_sd(iz,jz);
			
			rsq11   = _mm_add_pd( _mm_add_pd( _mm_mul_pd(dx,dx) , _mm_mul_pd(dy,dy) ) , _mm_mul_pd(dz,dz) );
			rinv    = my_invrsq_pd(rsq11);
						
			vcoul	= _mm_mul_sd(qq,rinv);
			fscal	= _mm_mul_sd(vcoul,rinv);
			qq		= _mm_mul_sd(isaprod,qq);
			qq		= _mm_mul_sd(qq,neg);
			gbscale	= _mm_mul_sd(isaprod,gbtabscale);
			
			r		= _mm_mul_sd(rsq11,rinv);
			rt		= _mm_mul_sd(r,gbscale);
			n0		= _mm_cvttpd_epi32(rt);
			n0d		= _mm_cvtepi32_pd(n0);
			eps		= _mm_sub_sd(rt,n0d);
			eps2	= _mm_mul_sd(eps,eps);
			
			nnn		= _mm_slli_epi64(n0,2);
			
			xmm1	= _mm_load_pd(GBtab+(_mm_extract_epi64(nnn,0))); 
			xmm2	= _mm_load_pd(GBtab+(_mm_extract_epi64(nnn,1))); 
			xmm3	= _mm_load_pd(GBtab+(_mm_extract_epi64(nnn,0))+2); 
			xmm4	= _mm_load_pd(GBtab+(_mm_extract_epi64(nnn,1))+2); 
			
			Y		= _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0)); 
			F		= _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1)); 
			G		= _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(0,0)); 
			H		= _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(1,1)); 
			
			G		= _mm_mul_sd(G,eps);
			H		= _mm_mul_sd(H,eps2);
			Fp		= _mm_add_sd(F,G);
			Fp		= _mm_add_sd(Fp,H);
			VV		= _mm_mul_sd(Fp,eps);
			VV		= _mm_add_sd(Y,VV);
			H		= _mm_mul_sd(two,H);
			FF		= _mm_add_sd(Fp,G);
			FF		= _mm_add_sd(FF,H);
			vgb		= _mm_mul_sd(qq,VV);
			fijC	= _mm_mul_sd(qq,FF);
			fijC	= _mm_mul_sd(fijC,gbscale);
			
			dvdatmp = _mm_mul_sd(fijC,r);
			dvdatmp	= _mm_add_sd(vgb,dvdatmp);
			dvdatmp = _mm_mul_sd(dvdatmp,neg);
			dvdatmp = _mm_mul_sd(dvdatmp,half);
			dvdasum	= _mm_add_sd(dvdasum,dvdatmp);
			
			xmm1	= _mm_mul_sd(dvdatmp,isaj);
			xmm1	= _mm_mul_sd(xmm1,isaj);
			dvdaj	= _mm_add_sd(dvdaj,xmm1);
			
			/* store dvda */
			_mm_storel_pd(dvda+jnr1,dvdaj);
			
			vctot	= _mm_add_sd(vctot,vcoul);
			vgbtot  = _mm_add_sd(vgbtot,vgb);
						
			fscal	= _mm_sub_sd(fijC,fscal);
			fscal	= _mm_mul_sd(fscal,neg);
			fscal	= _mm_mul_sd(fscal,rinv);
								
			/* calculate partial force terms */
			t1		= _mm_mul_sd(fscal,dx);
			t2		= _mm_mul_sd(fscal,dy);
			t3		= _mm_mul_sd(fscal,dz);
			
			/* update the i force */
			fix		= _mm_add_sd(fix,t1);
			fiy		= _mm_add_sd(fiy,t2);
			fiz		= _mm_add_sd(fiz,t3);
			
			/* accumulate forces from memory */
			xmm5	= _mm_load_sd(faction+j13);   /* fx */
			xmm6    = _mm_load_sd(faction+j13+1); /* fy */
			xmm7    = _mm_load_sd(faction+j13+2); /* fz */
						
			/* subtract partial forces */
			xmm5	= _mm_sub_sd(xmm5,t1);
			xmm6	= _mm_sub_sd(xmm6,t2);
			xmm7	= _mm_sub_sd(xmm7,t3);
			
			/* store forces */
			_mm_store_sd(faction+j13,xmm5);
			_mm_store_sd(faction+j13+1,xmm6);
			_mm_store_sd(faction+j13+2,xmm7);
		}
		
		/* fix/fiy/fiz now each contain two partial terms that should both be
		 * added to the i particle forces
		 */
		t1		 = _mm_unpacklo_pd(t1,fix);
		t2		 = _mm_unpacklo_pd(t2,fiy);
		t3		 = _mm_unpacklo_pd(t3,fiz);
				
		fix		 = _mm_add_pd(fix,t1);
		fiy		 = _mm_add_pd(fiy,t2);
		fiz		 = _mm_add_pd(fiz,t3);
		
		fix      = _mm_shuffle_pd(fix,fix,_MM_SHUFFLE2(1,1));
		fiy      = _mm_shuffle_pd(fiy,fiy,_MM_SHUFFLE2(1,1));
		fiz      = _mm_shuffle_pd(fiz,fiz,_MM_SHUFFLE2(1,1));
		
		/* Load i forces from memory */
		xmm1     = _mm_load_sd(faction+ii3);
		xmm2     = _mm_load_sd(faction+ii3+1);
		xmm3     = _mm_load_sd(faction+ii3+2);
		
		/* Add to i force */
		fix      = _mm_add_sd(fix,xmm1);
		fiy      = _mm_add_sd(fiy,xmm2);
		fiz      = _mm_add_sd(fiz,xmm3);
	
		/* store i forces to memory */
		_mm_store_sd(faction+ii3,fix);
		_mm_store_sd(faction+ii3+1,fiy);
		_mm_store_sd(faction+ii3+2,fiz);
				
		/* now do dvda */
		dvdatmp  = _mm_unpacklo_pd(dvdatmp,dvdasum);
		dvdasum  = _mm_add_pd(dvdasum,dvdatmp);
		_mm_storeh_pd(&dva,dvdasum);
		dvda[ii] = dvda[ii] + dva*isai_d*isai_d;
		
		ggid	 = gid[n];
		
		/* Coulomb potential */
		vcoul	 = _mm_unpacklo_pd(vcoul,vctot);
		vctot	 = _mm_add_pd(vctot,vcoul);
		_mm_storeh_pd(&vct,vctot);
		Vc[ggid] = Vc[ggid] + vct;
		
		/* GB potential */
		vgb  	 = _mm_unpacklo_pd(vgb,vgbtot);
		vgbtot	 = _mm_add_pd(vgbtot,vgb);
		_mm_storeh_pd(&vgbt,vgbtot);
		gpol[ggid] = gpol[ggid] + vgbt;
	}
	
	*outeriter   = nri;            
    *inneriter   = nj1; 
	
}
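/* The Y/F/G/H lookup above is a standard cubic-spline table interpolation.
 * A scalar sketch of what the vector code computes per atom pair (names match
 * the vector variables; this helper is illustrative, not part of the kernel): */
static double gbtab_interp_sketch(const double *GBtab, double r, double gbscale,
                                  double qq, double *fijC_out)
{
    double rt  = r * gbscale;
    int    n0  = (int) rt;                 /* table bin */
    double eps = rt - n0;
    const double *p = GBtab + 4*n0;        /* each bin stores Y, F, G, H */
    double Y = p[0], F = p[1], G = p[2], H = p[3];
    double Fp = F + G*eps + H*eps*eps;
    double VV = Y + Fp*eps;                          /* potential */
    double FF = Fp + G*eps + 2.0*H*eps*eps;          /* d(potential)/d(eps) */
    *fijC_out = qq * FF * gbscale;                   /* force-table term */
    return qq * VV;                                  /* vgb */
}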
Example #28
0
inline static __m128d
absfac( const __m128d a )
{
  register const __m128d z2 = _mm_shuffle_pd( a , a , 1 ) ;
  return _mm_add_pd( _mm_mul_pd( a , a ) , _mm_mul_pd( z2 , z2 ) ) ;
}
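/* Usage sketch for absfac() (illustrative): for a complex double packed as
 * {re, im}, both lanes of the result hold the squared magnitude re*re + im*im. */
static void absfac_demo(void)
{
    double z[2] = { 3.0, 4.0 };    /* 3 + 4i */
    double r[2];
    _mm_storeu_pd(r, absfac(_mm_loadu_pd(z)));
    /* r[0] == r[1] == 25.0 */
}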
Example #29
0
    inline void Cryptor::expandKey192(const unsigned char *key,
                                      unsigned char *schedule) {
      __m128i *keySchedule = (__m128i*) schedule;

      // Save the first 128 bits of the key as the first one.
      __m128i tmp = _mm_loadu_si128((__m128i*) key);
      if (!bigEndian) {
        reverse_m128i(tmp); // swap byte-order => big-endian.
      }
      keySchedule[0] = tmp;

      // The next 64 bits (bytes 16..23 of the 24-byte key) as the second,
      // zero-padded to a full 128-bit block.
      unsigned char buf[16];
      memset(buf, 0, sizeof(buf));
      memcpy(buf, key + 16, 8); // 64 bits, not 64 bytes: only 8 key bytes remain
      
      __m128i tmp3 = _mm_loadu_si128((__m128i*) buf);
      if (!bigEndian) {
        reverse_m128i(tmp3); // swap byte-order => big-endian.
      }
      keySchedule[1] = tmp3;

      __m128i tmp2 = _mm_aeskeygenassist_si128(tmp3, 0x1);
      assistKey192(&tmp, &tmp2, &tmp3);
      keySchedule[1] =
        (__m128i) _mm_shuffle_pd((__m128d) keySchedule[1],
                                 (__m128d) tmp, 0);
      keySchedule[2] =
        (__m128i) _mm_shuffle_pd((__m128d) tmp, (__m128d) tmp3, 1);
      
      tmp2 = _mm_aeskeygenassist_si128(tmp3, 0x2);
      assistKey192(&tmp, &tmp2, &tmp3);
      keySchedule[3] = tmp;
      keySchedule[4] = tmp3;

      tmp2 = _mm_aeskeygenassist_si128(tmp3, 0x4);
      assistKey192(&tmp, &tmp2, &tmp3);
      keySchedule[4] =
        (__m128i) _mm_shuffle_pd((__m128d) keySchedule[4],
                                 (__m128d) tmp, 0);
      keySchedule[5] = (__m128i) _mm_shuffle_pd((__m128d) tmp,
                                                (__m128d) tmp3, 1);
      
      tmp2 = _mm_aeskeygenassist_si128(tmp3, 0x8);
      assistKey192(&tmp, &tmp2, &tmp3);
      keySchedule[6] = tmp;
      keySchedule[7] = tmp3;
      
      tmp2 = _mm_aeskeygenassist_si128(tmp3, 0x10);
      assistKey192(&tmp, &tmp2, &tmp3);
      keySchedule[7] =
        (__m128i) _mm_shuffle_pd((__m128d) keySchedule[7],
                                 (__m128d) tmp, 0);
      keySchedule[8] = (__m128i) _mm_shuffle_pd((__m128d) tmp,
                                                (__m128d) tmp3, 1);
 
      tmp2 = _mm_aeskeygenassist_si128(tmp3, 0x20);
      assistKey192(&tmp, &tmp2, &tmp3);
      keySchedule[9] = tmp;
      keySchedule[10] = tmp3;
      
      tmp2 = _mm_aeskeygenassist_si128(tmp3, 0x40);
      assistKey192(&tmp, &tmp2, &tmp3);
      keySchedule[10] =
        (__m128i) _mm_shuffle_pd((__m128d) keySchedule[10],
                                 (__m128d) tmp, 0);
      keySchedule[11] = (__m128i) _mm_shuffle_pd((__m128d) tmp,
                                                 (__m128d) tmp3, 1);
 
      tmp2 = _mm_aeskeygenassist_si128(tmp3, 0x80);
      assistKey192(&tmp, &tmp2, &tmp3);
      keySchedule[12] = tmp;
      keySchedule[13] = tmp3;      
    }
Example #30
0
/* Encryption key setup */
static void aes_key_setup_enc(__m128i rk[], const u8* cipherKey, int keylen)
{
    switch (keylen) {
        case 16:
        {
            /* 128 bit key setup */
            rk[0] = _mm_loadu_si128((const __m128i*) cipherKey);
            rk[1] = KEYEXP128(rk[0], 0x01);
            rk[2] = KEYEXP128(rk[1], 0x02);
            rk[3] = KEYEXP128(rk[2], 0x04);
            rk[4] = KEYEXP128(rk[3], 0x08);
            rk[5] = KEYEXP128(rk[4], 0x10);
            rk[6] = KEYEXP128(rk[5], 0x20);
            rk[7] = KEYEXP128(rk[6], 0x40);
            rk[8] = KEYEXP128(rk[7], 0x80);
            rk[9] = KEYEXP128(rk[8], 0x1B);
            rk[10] = KEYEXP128(rk[9], 0x36);
            break;
        }
        case 24:
        {
            /* 192 bit key setup */
            __m128i temp[2];
            rk[0] = _mm_loadu_si128((const __m128i*) cipherKey);
            rk[1] = _mm_loadu_si128((const __m128i*) (cipherKey+16));
            temp[0] = KEYEXP192(rk[0], rk[1], 0x01);
            temp[1] = KEYEXP192_2(temp[0], rk[1]);
            rk[1] = (__m128i)_mm_shuffle_pd((__m128d)rk[1], (__m128d)temp[0], 0);
            rk[2] = (__m128i)_mm_shuffle_pd((__m128d)temp[0], (__m128d)temp[1], 1);
            rk[3] = KEYEXP192(temp[0], temp[1], 0x02);
            rk[4] = KEYEXP192_2(rk[3], temp[1]);
            temp[0] = KEYEXP192(rk[3], rk[4], 0x04);
            temp[1] = KEYEXP192_2(temp[0], rk[4]);
            rk[4] = (__m128i)_mm_shuffle_pd((__m128d)rk[4], (__m128d)temp[0], 0);
            rk[5] = (__m128i)_mm_shuffle_pd((__m128d)temp[0], (__m128d)temp[1], 1);
            rk[6] = KEYEXP192(temp[0], temp[1], 0x08);
            rk[7] = KEYEXP192_2(rk[6], temp[1]);
            temp[0] = KEYEXP192(rk[6], rk[7], 0x10);
            temp[1] = KEYEXP192_2(temp[0], rk[7]);
            rk[7] = (__m128i)_mm_shuffle_pd((__m128d)rk[7], (__m128d)temp[0], 0);
            rk[8] = (__m128i)_mm_shuffle_pd((__m128d)temp[0], (__m128d)temp[1], 1);
            rk[9] = KEYEXP192(temp[0], temp[1], 0x20);
            rk[10] = KEYEXP192_2(rk[9], temp[1]);
            temp[0] = KEYEXP192(rk[9], rk[10], 0x40);
            temp[1] = KEYEXP192_2(temp[0], rk[10]);
            rk[10] = (__m128i)_mm_shuffle_pd((__m128d)rk[10], (__m128d) temp[0], 0);
            rk[11] = (__m128i)_mm_shuffle_pd((__m128d)temp[0],(__m128d) temp[1], 1);
            rk[12] = KEYEXP192(temp[0], temp[1], 0x80);
            break;
        }
        case 32:
        {
            /* 256 bit key setup */
            rk[0] = _mm_loadu_si128((const __m128i*) cipherKey);
            rk[1] = _mm_loadu_si128((const __m128i*) (cipherKey+16));
            rk[2] = KEYEXP256(rk[0], rk[1], 0x01);
            rk[3] = KEYEXP256_2(rk[1], rk[2]);
            rk[4] = KEYEXP256(rk[2], rk[3], 0x02);
            rk[5] = KEYEXP256_2(rk[3], rk[4]);
            rk[6] = KEYEXP256(rk[4], rk[5], 0x04);
            rk[7] = KEYEXP256_2(rk[5], rk[6]);
            rk[8] = KEYEXP256(rk[6], rk[7], 0x08);
            rk[9] = KEYEXP256_2(rk[7], rk[8]);
            rk[10] = KEYEXP256(rk[8], rk[9], 0x10);
            rk[11] = KEYEXP256_2(rk[9], rk[10]);
            rk[12] = KEYEXP256(rk[10], rk[11], 0x20);
            rk[13] = KEYEXP256_2(rk[11], rk[12]);
            rk[14] = KEYEXP256(rk[12], rk[13], 0x40);
            break;
        }
    }
}
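/* The KEYEXP* macros used above are not shown.  As a hedged sketch, KEYEXP128
 * is commonly built from _mm_aeskeygenassist_si128 plus the same shift-and-xor
 * folding seen earlier; the macros in this code base may differ in detail. */
#include <emmintrin.h>
#include <wmmintrin.h>   /* _mm_aeskeygenassist_si128 (requires AES-NI) */

static inline __m128i aes128_keyexpand_sketch(__m128i key)
{
    /* Three xor-with-shift steps; net effect: word[i] ^= word[i-1] ^ ... ^ word[0] */
    key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
    key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
    return _mm_xor_si128(key, _mm_slli_si128(key, 4));
}

/* Hypothetical equivalent of KEYEXP128(K, RCON); RCON must be a compile-time
 * constant because _mm_aeskeygenassist_si128 takes an immediate. */
#define KEYEXP128_SKETCH(K, RCON) \
    _mm_xor_si128(aes128_keyexpand_sketch(K), \
        _mm_shuffle_epi32(_mm_aeskeygenassist_si128((K), (RCON)), 0xff))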