Example #1
void AVX2FMA3DNoise(Vector3d& result, const Vector3d& EPoint)
{

#if CHECK_FUNCTIONAL
    Vector3d param(EPoint);
#endif

    AVX2TABLETYPE *mp;

    // TODO FIXME - global statistics reference
    // Stats[Calls_To_DNoise]++;

    const __m256d ONE_PD = _mm256_set1_pd(1.0);
    const __m128i short_si128 = _mm_set1_epi32(0xffff);

    const __m256d xyzn = _mm256_setr_pd(EPoint[X], EPoint[Y], EPoint[Z], 0);
    const __m256d epsy = _mm256_set1_pd(1.0 - EPSILON);
    const __m256d xyzn_e = _mm256_sub_pd(xyzn, epsy);
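    // Negative coordinates are nudged down by (1 - EPSILON) before the truncating
    // convert below, so _mm256_cvttpd_epi32 effectively rounds toward -infinity
    // (a vectorized floor) instead of toward zero.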
    const __m128i tmp_xyzn = _mm256_cvttpd_epi32(_mm256_blendv_pd(xyzn, xyzn_e, xyzn));

    const __m128i noise_min_xyzn = _mm_setr_epi32(NOISE_MINX, NOISE_MINY, NOISE_MINZ, 0);

    const __m256d xyz_ixyzn = _mm256_sub_pd(xyzn, _mm256_cvtepi32_pd(tmp_xyzn));
    const __m256d xyz_jxyzn = _mm256_sub_pd(xyz_ixyzn, ONE_PD);

    const __m128i i_xyzn = _mm_and_si128(_mm_sub_epi32(tmp_xyzn, noise_min_xyzn),
        _mm_set1_epi32(0xfff));

    const __m256d s_xyzn = _mm256_mul_pd(xyz_ixyzn,
        _mm256_mul_pd(xyz_ixyzn,
            _mm256_sub_pd(_mm256_set1_pd(3.0),
                _mm256_add_pd(xyz_ixyzn, xyz_ixyzn))));

    const __m256d t_xyzn = _mm256_sub_pd(ONE_PD, s_xyzn);

    const __m256d txtysxsy = _mm256_permute2f128_pd(t_xyzn, s_xyzn, 0x20);
    const __m256d txsxtxsx = PERMUTE4x64(txtysxsy, _MM_SHUFFLE(2, 0, 2, 0));
    const __m256d tytysysy = PERMUTE4x64(txtysxsy, _MM_SHUFFLE(3, 3, 1, 1));

    const __m256d txtysxtytxsysxsy = _mm256_mul_pd(txsxtxsx, tytysysy);

    const __m256d incrsump_s1 = _mm256_mul_pd(txtysxtytxsysxsy, PERMUTE4x64(t_xyzn, _MM_SHUFFLE(2, 2, 2, 2)));
    const __m256d incrsump_s2 = _mm256_mul_pd(txtysxtytxsysxsy, PERMUTE4x64(s_xyzn, _MM_SHUFFLE(2, 2, 2, 2)));

    int ints[4];
    _mm_storeu_si128((__m128i*)(ints), i_xyzn);

    const int ixiy_hash = Hash2d(ints[0], ints[1]);
    const int jxiy_hash = Hash2d(ints[0] + 1, ints[1]);
    const int ixjy_hash = Hash2d(ints[0], ints[1] + 1);
    const int jxjy_hash = Hash2d(ints[0] + 1, ints[1] + 1);

    const int iz = ints[2];

    const __m256d iii = _mm256_blend_pd(PERMUTE4x64(xyz_ixyzn, _MM_SHUFFLE(2, 1, 0, 0)), _mm256_set_pd(0, 0, 0, 0.5), 0x1);
    const __m256d jjj = _mm256_blend_pd(PERMUTE4x64(xyz_jxyzn, _MM_SHUFFLE(2, 1, 0, 0)), _mm256_set_pd(0, 0, 0, 0.5), 0x1);

    __m256d ss;
    __m256d blend;

    __m256d x = _mm256_setzero_pd(), y = _mm256_setzero_pd(), z = _mm256_setzero_pd();


    mp = &AVX2RTable[Hash1dRTableIndexAVX(ixiy_hash, iz)];
    ss = PERMUTE4x64(incrsump_s1, _MM_SHUFFLE(0, 0, 0, 0));
    //     blend = _mm256_blend_pd(iii, jjj, 0);

    INCSUMAVX_VECTOR(mp, ss, iii);

    mp = &AVX2RTable[Hash1dRTableIndexAVX(jxiy_hash, iz)];
    ss = PERMUTE4x64(incrsump_s1, _MM_SHUFFLE(1, 1, 1, 1));
    blend = _mm256_blend_pd(iii, jjj, 2);

    INCSUMAVX_VECTOR(mp, ss, blend);

    mp = &AVX2RTable[Hash1dRTableIndexAVX(jxjy_hash, iz)];
    ss = PERMUTE4x64(incrsump_s1, _MM_SHUFFLE(3, 3, 3, 3));
    blend = _mm256_blend_pd(iii, jjj, 6);

    INCSUMAVX_VECTOR(mp, ss, blend);

    mp = &AVX2RTable[Hash1dRTableIndexAVX(ixjy_hash, iz)];
    ss = PERMUTE4x64(incrsump_s1, _MM_SHUFFLE(2, 2, 2, 2));
    blend = _mm256_blend_pd(iii, jjj, 4);

    INCSUMAVX_VECTOR(mp, ss, blend);

    mp = &AVX2RTable[Hash1dRTableIndexAVX(ixjy_hash, iz + 1)];
    ss = PERMUTE4x64(incrsump_s2, _MM_SHUFFLE(2, 2, 2, 2));
    blend = _mm256_blend_pd(iii, jjj, 12);

    INCSUMAVX_VECTOR(mp, ss, blend);

    mp = &AVX2RTable[Hash1dRTableIndexAVX(jxjy_hash, iz + 1)];
    ss = PERMUTE4x64(incrsump_s2, _MM_SHUFFLE(3, 3, 3, 3));
    //     blend = _mm256_blend_pd(iii, jjj, 14);

    INCSUMAVX_VECTOR(mp, ss, jjj);

    mp = &AVX2RTable[Hash1dRTableIndexAVX(jxiy_hash, iz + 1)];
    ss = PERMUTE4x64(incrsump_s2, _MM_SHUFFLE(1, 1, 1, 1));
    blend = _mm256_blend_pd(iii, jjj, 10);

    INCSUMAVX_VECTOR(mp, ss, blend);

    mp = &AVX2RTable[Hash1dRTableIndexAVX(ixiy_hash, iz + 1)];
    ss = PERMUTE4x64(incrsump_s2, _MM_SHUFFLE(0, 0, 0, 0));
    blend = _mm256_blend_pd(iii, jjj, 8);

    INCSUMAVX_VECTOR(mp, ss, blend);


    __m256d xy = _mm256_hadd_pd(x,y);
    __m128d xy_up = _mm256_extractf128_pd(xy,1);
    xy_up = _mm_add_pd(_mm256_castpd256_pd128(xy),xy_up);
    _mm_storeu_pd(&result[X],xy_up);

    __m128d z_up = _mm256_extractf128_pd(z,1);
    z_up = _mm_add_pd(_mm256_castpd256_pd128(z),z_up);
    z_up = _mm_hadd_pd(z_up,z_up);
    result[Z] = _mm_cvtsd_f64(z_up);


#if CHECK_FUNCTIONAL
    {
        Vector3d portable_res;
        PortableDNoise(portable_res , param);
        if (fabs(portable_res[X] - result[X]) >= EPSILON)
        {
            throw POV_EXCEPTION_STRING("DNoise X error");
        }
        if (fabs(portable_res[Y] - result[Y]) >= EPSILON)
        {
            throw POV_EXCEPTION_STRING("DNoise Y error");
        }
        if (fabs(portable_res[Z] - result[Z]) >= EPSILON)
        {
            throw POV_EXCEPTION_STRING("DNoise Z error");
        }

    }

#endif



    _mm256_zeroupper();
    return;

}
Example #2
/*
Naive implementation of Matrix Matrix Multiplication

@param A input matrix
@param B input matrix
@param C output matrix
*/
inline
void	naive(const Matrix& A, const Matrix& B, Matrix& C){
	//preload dimensions for faster access
	int dimM = C.getDimM();
	int dimN = C.getDimN();
	int dimL = A.getDimN();
	
	for (int m = 0; m < dimM; m+=4){				///rows of c
		for (int n = 0; n < dimN; n+=4){			///cols of c	
			//do calculation of a 4x4 block
			//std::cout << m << "\t" << n << std::endl;
			__m256d*	pA = A.get(m, 0);
			__m256d*	pB = A.get(m+1, 0);
			__m256d*	pC = A.get(m+2, 0);
			__m256d*	pD = A.get(m+3, 0);
			__m256d*	pK = B.getT(0, n);
			__m256d*	pL = B.getT(0, n+1);
			__m256d*	pM = B.getT(0, n+2);
			__m256d*	pN = B.getT(0, n+3);
			//std::cout << pA << "\t" << pB << "\t" << pC << "\t" << pD << std::endl;
			__m256d		K = _mm256_setzero_pd();
			__m256d		L = _mm256_setzero_pd();
			__m256d		M = _mm256_setzero_pd();
			__m256d		N = _mm256_setzero_pd();
			__m256d		O = _mm256_setzero_pd();
			__m256d		P = _mm256_setzero_pd();
			__m256d		Q = _mm256_setzero_pd();
			__m256d		R = _mm256_setzero_pd();
			__m256d		S = _mm256_setzero_pd();
			__m256d		T = _mm256_setzero_pd();
			__m256d		U = _mm256_setzero_pd();
			__m256d		V = _mm256_setzero_pd();
			__m256d		W = _mm256_setzero_pd();
			__m256d		X = _mm256_setzero_pd();
			__m256d		Y = _mm256_setzero_pd();
			__m256d		Z = _mm256_setzero_pd();
			for (int l = 0; l < dimL; l+=4){
				//std::cout <<"mul" << std::endl;
				K = K + (*pA) * (*pK);
				L = L + (*pA) * (*pL);
				M = M + (*pA) * (*pM);
				N = N + (*pA) * (*pN);
				O = O + (*pB) * (*pK);
				P = P + (*pB) * (*pL);
				Q = Q + (*pB) * (*pM);
				R = R + (*pB) * (*pN);
				S = S + (*pC) * (*pK);
				T = T + (*pC) * (*pL);
				U = U + (*pC) * (*pM);
				V = V + (*pC) * (*pN);
				W = W + (*pD) * (*pK);
				X = X + (*pD) * (*pL);
				Y = Y + (*pD) * (*pM);
				Z = Z + (*pD) * (*pN);
				//std::cout << "inc" <<std::endl;
				pA++;
				pB++;
				pC++;
				pD++;
				pK++;
				pL++;
				pM++;
				pN++;
			}
			// {a[0]+a[1], b[0]+b[1], a[2]+a[3], b[2]+b[3]}
			__m256d sumab = _mm256_hadd_pd(K, L);
			// {c[0]+c[1], d[0]+d[1], c[2]+c[3], d[2]+d[3]}
			__m256d sumcd = _mm256_hadd_pd(M, N);

			// {a[0]+a[1], b[0]+b[1], c[2]+c[3], d[2]+d[3]}
			__m256d blend = _mm256_blend_pd(sumab, sumcd, 0b1100);
			// {a[2]+a[3], b[2]+b[3], c[0]+c[1], d[0]+d[1]}
			__m256d perm = _mm256_permute2f128_pd(sumab, sumcd, 0x21);

			__m256d sum =  _mm256_add_pd(perm, blend);

			C.set(m, n, sum);
			//C(m  , n)     = K[0] + K[1] + K[2] + K[3];
			//C(m  , n+1)   = L[0] + L[1] + L[2] + L[3];
			//C(m  , n+2)   = M[0] + M[1] + M[2] + M[3];
			//C(m  , n+3)   = N[0] + N[1] + N[2] + N[3];

			// {a[0]+a[1], b[0]+b[1], a[2]+a[3], b[2]+b[3]}
			sumab = _mm256_hadd_pd(O, P);
			// {c[0]+c[1], d[0]+d[1], c[2]+c[3], d[2]+d[3]}
			sumcd = _mm256_hadd_pd(Q, R);

			// {a[0]+a[1], b[0]+b[1], c[2]+c[3], d[2]+d[3]}
			blend = _mm256_blend_pd(sumab, sumcd, 0b1100);
			// {a[2]+a[3], b[2]+b[3], c[0]+c[1], d[0]+d[1]}
			perm = _mm256_permute2f128_pd(sumab, sumcd, 0x21);

			sum =  _mm256_add_pd(perm, blend);
			
			C.set(m+1, n, sum);
			//C(m+1, n  )   = O[0] + O[1] + O[2] + O[3];
			//C(m+1, n+1)   = P[0] + P[1] + P[2] + P[3];
			//C(m+1, n+2)   = Q[0] + Q[1] + Q[2] + Q[3];
			//C(m+1, n+3)   = R[0] + R[1] + R[2] + R[3];
			
			// {a[0]+a[1], b[0]+b[1], a[2]+a[3], b[2]+b[3]}
			sumab = _mm256_hadd_pd(S, T);
			// {c[0]+c[1], d[0]+d[1], c[2]+c[3], d[2]+d[3]}
			sumcd = _mm256_hadd_pd(U, V);

			// {a[0]+a[1], b[0]+b[1], c[2]+c[3], d[2]+d[3]}
			blend = _mm256_blend_pd(sumab, sumcd, 0b1100);
			// {a[2]+a[3], b[2]+b[3], c[0]+c[1], d[0]+d[1]}
			perm = _mm256_permute2f128_pd(sumab, sumcd, 0x21);

			sum =  _mm256_add_pd(perm, blend);
			
			C.set(m+2, n, sum);
			//C(m+2, n  )   = S[0] + S[1] + S[2] + S[3];
			//C(m+2, n+1)   = T[0] + T[1] + T[2] + T[3];
			//C(m+2, n+2)   = U[0] + U[1] + U[2] + U[3];
			//C(m+2, n+3)   = V[0] + V[1] + V[2] + V[3];
			
			// {a[0]+a[1], b[0]+b[1], a[2]+a[3], b[2]+b[3]}
			sumab = _mm256_hadd_pd(W, X);
			// {c[0]+c[1], d[0]+d[1], c[2]+c[3], d[2]+d[3]}
			sumcd = _mm256_hadd_pd(Y, Z);

			// {a[0]+a[1], b[0]+b[1], c[2]+c[3], d[2]+d[3]}
			blend = _mm256_blend_pd(sumab, sumcd, 0b1100);
			// {a[2]+a[3], b[2]+b[3], c[0]+c[1], d[0]+d[1]}
			perm = _mm256_permute2f128_pd(sumab, sumcd, 0x21);

			sum =  _mm256_add_pd(perm, blend);
			
			C.set(m+3, n, sum);
			
			//C(m+3, n  )   = W[0] + W[1] + W[2] + W[3];
			//C(m+3, n+1)   = X[0] + X[1] + X[2] + X[3];
			//C(m+3, n+2)   = Y[0] + Y[1] + Y[2] + Y[3];
			//C(m+3, n+3)   = Z[0] + Z[1] + Z[2] + Z[3];
		}
	}
}
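
The hadd/blend/permute2f128 sequence used above to collapse four accumulators into one result row is a self-contained pattern. A minimal sketch, assuming only AVX and <immintrin.h> (the helper name hsum4 is made up for illustration):

#include <immintrin.h>

// Return {sum(a), sum(b), sum(c), sum(d)} given four __m256d accumulators.
static inline __m256d hsum4(__m256d a, __m256d b, __m256d c, __m256d d)
{
	__m256d sumab = _mm256_hadd_pd(a, b);                       // {a0+a1, b0+b1, a2+a3, b2+b3}
	__m256d sumcd = _mm256_hadd_pd(c, d);                       // {c0+c1, d0+d1, c2+c3, d2+d3}
	__m256d blend = _mm256_blend_pd(sumab, sumcd, 0b1100);      // {a0+a1, b0+b1, c2+c3, d2+d3}
	__m256d perm  = _mm256_permute2f128_pd(sumab, sumcd, 0x21); // {a2+a3, b2+b3, c0+c1, d0+d1}
	return _mm256_add_pd(perm, blend);                          // {sum(a), sum(b), sum(c), sum(d)}
}
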
Example #3
/**
  * Calculate all values in one step per pixel. Requires grabbing the neighboring pixels.
  */
FORCE_INLINE double single_pixel(
        double *im, int center, int top, int left, int right, int bottom,
        const __m256i mask1110,
        const __m256d rgb0W,
        const __m256d onehalf,
        const __m256d minustwelvehalf){
//    double r = im[center];
//    double g = im[center+1];
//    double b = im[center+2];

//    double r1 = im[top];
//    double g1 = im[top+1];
//    double b1 = im[top+2];
//    double r2 = im[left];
//    double g2 = im[left+1];
//    double b2 = im[left+2];
//    double r3 = im[right];
//    double g3 = im[right+1];
//    double b3 = im[right+2];
//    double r4 = im[bottom];
//    double g4 = im[bottom+1];
//    double b4 = im[bottom+2];

    __m256d c = _mm256_maskload_pd(&(im[center]),mask1110);
    __m256d c1 = _mm256_loadu_pd(&(im[top]));
    __m256d c2 = _mm256_loadu_pd(&(im[left]));
    __m256d c3 = _mm256_loadu_pd(&(im[right]));
    __m256d c4 = _mm256_loadu_pd(&(im[bottom]));

    COST_INC_LOAD(20);

//    double grey = rw * r + gw * g + bw * b;
//    double grey1 = rw * r1 + gw * g1 + bw * b1;
//    double grey2 = rw * r2 + gw * g2 + bw * b2;
//    double grey3 = rw * r3 + gw * g3 + bw * b3;
//    double grey4 = rw * r4 + gw * g4 + bw * b4;

    __m256d greyc = _mm256_mul_pd(c,rgb0W);
    __m256d grey1 = _mm256_mul_pd(c1,rgb0W);
    __m256d grey2 = _mm256_mul_pd(c2,rgb0W);
    __m256d grey3 = _mm256_mul_pd(c3,rgb0W);
    __m256d grey4 = _mm256_mul_pd(c4,rgb0W);

    //AVX: double: horizontal add for 1 vector
     __m256d c_perm = _mm256_permute2f128_pd(c, c, 0b00100001);//1,2
     __m256d c_h = _mm256_hadd_pd(c,c_perm);
     __m128d c_h_lo = _mm256_extractf128_pd (c_h, 0);// lo
     __m128d c_h_hi = _mm256_extractf128_pd (c_h, 1);// hi
     double c_hsum_lo = _mm_cvtsd_f64(c_h_lo);
     double c_hsum_hi = _mm_cvtsd_f64(c_h_hi);
     double c_hsum = c_hsum_lo + c_hsum_hi;

     //AVX: double: horizontal add for 1 vector
      __m256d greyc_perm = _mm256_permute2f128_pd(greyc, greyc, 0b00100001);//1,2
      __m256d greyc_h = _mm256_hadd_pd(greyc,greyc_perm);
      __m128d greyc_h_lo = _mm256_extractf128_pd (greyc_h, 0);// lo
      __m128d greyc_h_hi = _mm256_extractf128_pd (greyc_h, 1);// hi
      double greyc_hsum_lo = _mm_cvtsd_f64(greyc_h_lo);
      double greyc_hsum_hi = _mm_cvtsd_f64(greyc_h_hi);
      double greyc_hsum = greyc_hsum_lo + greyc_hsum_hi;

    //AVX: _m256d: horizontal add for 4 vectors at once
    __m256d grey12 = _mm256_hadd_pd(grey1,grey2);
    __m256d grey34 = _mm256_hadd_pd(grey3,grey4);
    __m256d grey_1234_blend = _mm256_blend_pd(grey12, grey34, 0b1100); //0011
    __m256d grey_1234_perm = _mm256_permute2f128_pd(grey12, grey34, 0b00100001);//1,2
    __m256d grey_1234 =  _mm256_add_pd(grey_1234_perm, grey_1234_blend);

    //AVX: double: horizontal add for 1 vector
     __m256d grey1234_perm = _mm256_permute2f128_pd(grey_1234, grey_1234, 0b00100001);//1,2
     __m256d grey1234_h = _mm256_hadd_pd(grey_1234,grey1234_perm);
     __m128d grey1234_h_lo = _mm256_extractf128_pd (grey1234_h, 0);// lo
     __m128d grey1234_h_hi = _mm256_extractf128_pd (grey1234_h, 1);// hi
     double grey1234_hsum_lo = _mm_cvtsd_f64(grey1234_h_lo);
     double grey1234_hsum_hi = _mm_cvtsd_f64(grey1234_h_hi);
     double grey1234_sum = grey1234_hsum_lo + grey1234_hsum_hi;

    COST_INC_ADD(10); //+ operations wasted on AVX
    COST_INC_MUL(15); //+ operations wasted on AVX

    double mu = c_hsum / 3.0;
    COST_INC_ADD(2);
    COST_INC_DIV(1);

//    double rmu = r-mu;
//    double gmu = g-mu;
//    double bmu = b-mu;

    __m256d c_mu = _mm256_set1_pd(mu);
    __m256d c_rgbmu = _mm256_sub_pd(c,c_mu);
    COST_INC_ADD(3); //+1 operations wasted on AVX

//    double rz = r-0.5;
//    double gz = g-0.5;
//    double bz = b-0.5;

    __m256d c_rgbz = _mm256_sub_pd(c,onehalf);
    COST_INC_ADD(3); //+1 operations wasted on AVX

//    double rzrz = rz*rz;
//    double gzgz = gz*gz;
//    double bzbz = bz*bz;

    __m256d c_rgbz_sq = _mm256_mul_pd(c_rgbz,c_rgbz);
    COST_INC_MUL(3); //+1 operations wasted on AVX

//    double re = exp(-12.5*rzrz);
//    double ge = exp(-12.5*gzgz);
//    double be = exp(-12.5*bzbz);

    __m256d c_rgbe_tmp = _mm256_mul_pd(minustwelvehalf,c_rgbz_sq);

    __m128 c_rgbe_tmp_ps = _mm256_cvtpd_ps(c_rgbe_tmp);
    __m128 c_rgbe_ps = exp_ps(c_rgbe_tmp_ps);
    __m256d c_rgbe = _mm256_cvtps_pd(c_rgbe_ps);

    COST_INC_EXP(3);
    COST_INC_MUL(3); //+1 operations wasted on AVX

//    double t1 = sqrt((rmu*rmu + gmu*gmu + bmu*bmu)/3.0);
    __m256d c_rgbmu_sq = _mm256_mul_pd(c_rgbmu,c_rgbmu);

    __m128d t1_tmp1_lo = _mm256_extractf128_pd (c_rgbmu_sq, 0);// lo
    __m128d t1_tmp1_hi = _mm256_extractf128_pd (c_rgbmu_sq, 1);// hi
    __m128d t1_tmp1_lo_sum = _mm_hadd_pd (t1_tmp1_lo, t1_tmp1_lo);
    double t1_tmp1_hi_lo = _mm_cvtsd_f64(t1_tmp1_hi);
    double t1_tmp1_lo_sum_lo = _mm_cvtsd_f64(t1_tmp1_lo_sum);

    double t1_tmp1 = t1_tmp1_lo_sum_lo + t1_tmp1_hi_lo;

    double t1_tmp2 = t1_tmp1 / 3.0;
    double t1 = sqrt(t1_tmp2);

    COST_INC_SQRT(1);
    COST_INC_ADD(3);
    COST_INC_MUL(3); //+1 operations wasted on AVX
    COST_INC_DIV(1);
    double t2 = fabs(t1);
    COST_INC_ABS(1);

//    double t3 = re*ge*be;

    __m128d t3_tmp1_lo = _mm256_extractf128_pd (c_rgbe, 0);// lo
    __m128d t3_tmp1_hi = _mm256_extractf128_pd (c_rgbe, 1);// hi

    double t3_tmp1_lo_lo = _mm_cvtsd_f64(t3_tmp1_lo);
    double t3_tmp1_hi_lo = _mm_cvtsd_f64(t3_tmp1_hi);
    __m128d t3_tmp1_lo_swapped = _mm_permute_pd(t3_tmp1_lo, 1);// swap
    double t3_tmp1_lo_hi = _mm_cvtsd_f64(t3_tmp1_lo_swapped);

    double t3 = t3_tmp1_lo_lo * t3_tmp1_lo_hi * t3_tmp1_hi_lo;

    COST_INC_MUL(2);
    double t4 = fabs(t3);
    COST_INC_ABS(1);

    double t5 = t2 * t4;
    COST_INC_MUL(1);

//    double t6 = -4.0*grey+grey1+grey2+grey3+grey4;

    double minusfour_times_grey = -4.0*greyc_hsum;
    double t6 = minusfour_times_grey+grey1234_sum;

    COST_INC_MUL(1);
    COST_INC_ADD(2); //2 operations saved due to AVX

    double t7 = fabs(t6);
    COST_INC_ABS(1);

    double t8 = t5 * t7;
    COST_INC_MUL(1);

    double t9 = t8 + 1.0E-12;
    COST_INC_ADD(1);

    return t9;
}
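
Each of the "horizontal add for 1 vector" blocks above follows the same permute2f128 + hadd + extract recipe. A minimal sketch, assuming AVX and <immintrin.h> (hsum1 is a hypothetical helper name):

#include <immintrin.h>

// Sum the four double lanes of v.
static inline double hsum1(__m256d v)
{
    __m256d swapped = _mm256_permute2f128_pd(v, v, 0x21); // {v2, v3, v0, v1}
    __m256d h       = _mm256_hadd_pd(v, swapped);         // {v0+v1, v2+v3, v2+v3, v0+v1}
    __m128d lo      = _mm256_extractf128_pd(h, 0);        // {v0+v1, v2+v3}
    __m128d hi      = _mm256_extractf128_pd(h, 1);        // {v2+v3, v0+v1}
    return _mm_cvtsd_f64(lo) + _mm_cvtsd_f64(hi);         // v0+v1+v2+v3
}
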
Example #4
DBL AVX2FMA3Noise(const Vector3d& EPoint, int noise_generator)
{
    AVX2TABLETYPE *mp;
    DBL sum = 0.0;

    // TODO FIXME - global statistics reference
    // Stats[Calls_To_Noise]++;

    if (noise_generator == kNoiseGen_Perlin)
    {
        // The 1.59 and 0.985 are to correct for some biasing problems with
        // the random # generator used to create the noise tables.  Final
        // range of values is about 5.0e-4 below 0.0 and above 1.0.  Mean
        // value is 0.49 (ideally it would be 0.5).
        sum = 0.5 * (1.59 * SolidNoise(EPoint) + 0.985);

        // Clamp final value to 0-1 range
        if (sum < 0.0) sum = 0.0;
        if (sum > 1.0) sum = 1.0;

        return sum;
    }

    const __m256d ONE_PD = _mm256_set1_pd(1);
    const __m128i short_si128 = _mm_set1_epi32(0xffff);

    const __m256d xyzn = _mm256_setr_pd(EPoint[X], EPoint[Y], EPoint[Z], 0);
    const __m256d epsy = _mm256_set1_pd(1.0 - EPSILON);
    const __m256d xyzn_e = _mm256_sub_pd(xyzn, epsy);
    const __m128i tmp_xyzn = _mm256_cvttpd_epi32(_mm256_blendv_pd(xyzn, xyzn_e, xyzn));

    const __m128i noise_min_xyzn = _mm_setr_epi32(NOISE_MINX, NOISE_MINY, NOISE_MINZ, 0);

    const __m256d xyz_ixyzn = _mm256_sub_pd(xyzn, _mm256_cvtepi32_pd(tmp_xyzn));
    const __m256d xyz_jxyzn = _mm256_sub_pd(xyz_ixyzn, ONE_PD);

    const __m128i i_xyzn = _mm_and_si128(_mm_sub_epi32(tmp_xyzn, noise_min_xyzn),
        _mm_set1_epi32(0xfff));

    const __m256d s_xyzn = _mm256_mul_pd(xyz_ixyzn,
        _mm256_mul_pd(xyz_ixyzn,
            _mm256_sub_pd(_mm256_set1_pd(3.0),
                _mm256_add_pd(xyz_ixyzn, xyz_ixyzn))));

    const __m256d t_xyzn = _mm256_sub_pd(ONE_PD, s_xyzn);

    const __m256d txtysxsy = _mm256_permute2f128_pd(t_xyzn, s_xyzn, 0x20);
    const __m256d txsxtxsx = PERMUTE4x64(txtysxsy, _MM_SHUFFLE(2, 0, 2, 0));
    const __m256d tytysysy = PERMUTE4x64(txtysxsy, _MM_SHUFFLE(3, 3, 1, 1));

    const __m256d txtysxtytxsysxsy = _mm256_mul_pd(txsxtxsx, tytysysy);

    const __m256d incrsump_s1 = _mm256_mul_pd(txtysxtytxsysxsy, PERMUTE4x64(t_xyzn, _MM_SHUFFLE(2, 2, 2, 2)));
    const __m256d incrsump_s2 = _mm256_mul_pd(txtysxtytxsysxsy, PERMUTE4x64(s_xyzn, _MM_SHUFFLE(2, 2, 2, 2)));

    int ints[4];
    _mm_storeu_si128((__m128i*)(ints), i_xyzn);

    const int ixiy_hash = Hash2d(ints[0], ints[1]);
    const int jxiy_hash = Hash2d(ints[0] + 1, ints[1]);
    const int ixjy_hash = Hash2d(ints[0], ints[1] + 1);
    const int jxjy_hash = Hash2d(ints[0] + 1, ints[1] + 1);

    const int iz = ints[2];

    const __m256d iii = _mm256_blend_pd(PERMUTE4x64(xyz_ixyzn, _MM_SHUFFLE(2, 1, 0, 0)), _mm256_set_pd(0, 0, 0, 0.5), 0x1);
    const __m256d jjj = _mm256_blend_pd(PERMUTE4x64(xyz_jxyzn, _MM_SHUFFLE(2, 1, 0, 0)), _mm256_set_pd(0, 0, 0, 0.5), 0x1);

    __m256d sumr = _mm256_setzero_pd();
    __m256d sumr1 = _mm256_setzero_pd();


    mp = &AVX2RTable[Hash1dRTableIndexAVX(ixiy_hash, iz)];
    INCSUMAVX_NOBLEND(sumr, mp, PERMUTE4x64(incrsump_s1, _MM_SHUFFLE(0, 0, 0, 0)), iii);

    mp = &AVX2RTable[Hash1dRTableIndexAVX(jxiy_hash, iz)];
    INCSUMAVX(sumr1, mp, PERMUTE4x64(incrsump_s1, _MM_SHUFFLE(1, 1, 1, 1)), iii, jjj, 2);

    mp = &AVX2RTable[Hash1dRTableIndexAVX(ixjy_hash, iz)];
    INCSUMAVX(sumr, mp, PERMUTE4x64(incrsump_s1, _MM_SHUFFLE(2, 2, 2, 2)), iii, jjj, 4);

    mp = &AVX2RTable[Hash1dRTableIndexAVX(jxjy_hash, iz)];
    INCSUMAVX(sumr1, mp, PERMUTE4x64(incrsump_s1, _MM_SHUFFLE(3, 3, 3, 3)), iii, jjj, 6);

    mp = &AVX2RTable[Hash1dRTableIndexAVX(ixiy_hash, iz + 1)];
    INCSUMAVX(sumr, mp, PERMUTE4x64(incrsump_s2, _MM_SHUFFLE(0, 0, 0, 0)), iii, jjj, 8);

    mp = &AVX2RTable[Hash1dRTableIndexAVX(jxiy_hash, iz + 1)];
    INCSUMAVX(sumr1, mp, PERMUTE4x64(incrsump_s2, _MM_SHUFFLE(1, 1, 1, 1)), iii, jjj, 10);

    mp = &AVX2RTable[Hash1dRTableIndexAVX(ixjy_hash, iz + 1)];
    INCSUMAVX(sumr, mp, PERMUTE4x64(incrsump_s2, _MM_SHUFFLE(2, 2, 2, 2)), iii, jjj, 12);

    mp = &AVX2RTable[Hash1dRTableIndexAVX(jxjy_hash, iz + 1)];
    INCSUMAVX_NOBLEND(sumr1, mp, PERMUTE4x64(incrsump_s2, _MM_SHUFFLE(3, 3, 3, 3)), jjj);

    {
        sumr = _mm256_add_pd(sumr, sumr1);

        __m128d sumr_up = _mm256_extractf128_pd(sumr,1);
        sumr_up = _mm_add_pd(_mm256_castpd256_pd128(sumr),sumr_up);
        sumr_up = _mm_hadd_pd(sumr_up,sumr_up);
        sum = _mm_cvtsd_f64(sumr_up);
    }

    if (noise_generator == kNoiseGen_RangeCorrected)
    {
        /* details of range here:
        Min, max: -1.05242, 0.988997
        Mean: -0.0191481, Median: -0.535493, Std Dev: 0.256828

        We want to change it to as close to [0,1] as possible.
        */
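        /* Note: 0.48985582 is roughly 1/(0.988997 + 1.05242), i.e. one over the
           observed span, so the shift-and-scale below maps the sum to about [0,1]. */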
        sum += 1.05242;
        sum *= 0.48985582;
        /*sum *= 0.5;
        sum += 0.5;*/

        if (sum < 0.0)
            sum = 0.0;
        if (sum > 1.0)
            sum = 1.0;
    }
    else
    {
        sum = sum + 0.5;                     /* range at this point is about -0.5 to 0.5 */

        if (sum < 0.0)
            sum = 0.0;
        if (sum > 1.0)
            sum = 1.0;
    }



#if CHECK_FUNCTIONAL
    {
        DBL orig_sum = PortableNoise(EPoint, noise_generator);
        if (fabs(orig_sum - sum) >= EPSILON)
        {
            throw POV_EXCEPTION_STRING("Noise error");
        }

    }

#endif

    _mm256_zeroupper();
    return (sum);
}
Example #5
// it moves horizontally inside a block
void kernel_dtrmv_u_n_8_lib4(int kmax, double *A0, int sda, double *x, double *y, int alg)
	{

	if(kmax<=0) 
		return;
	
	double *A1 = A0 + 4*sda;

	const int lda = 4;
	
	int k;

	__m128d
		tmp0,
		z_0, y_0_1, a_00_10;

	__m256d
		zeros,
		ax_temp,
		a_00_10_20_30, a_01_11_21_31,
		a_40_50_60_70, a_41_51_61_71,
		x_0, x_1,
		y_0_1_2_3, y_0_1_2_3_b, z_0_1_2_3,
		y_4_5_6_7, y_4_5_6_7_b, z_4_5_6_7;
	
/*	y_0_1_2_3   = _mm256_setzero_pd();	*/
/*	y_4_5_6_7   = _mm256_setzero_pd();	*/
/*	y_0_1_2_3_b = _mm256_setzero_pd();	*/
/*	y_4_5_6_7_b = _mm256_setzero_pd();	*/
		
	zeros = _mm256_setzero_pd();
	
/*	y_0_1_2_3   = _mm256_setzero_pd();	*/
/*	y_0_1_2_3_b = _mm256_setzero_pd();	*/
/*	y_0_1_2_3_c = _mm256_setzero_pd();	*/
/*	y_0_1_2_3_d = _mm256_setzero_pd();*/
	
	// upper triangular
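	// The columns are visited out of order (2nd, 1st, 4th, 3rd) so that each
	// accumulator can be initialized directly with a multiply instead of a
	// zero-and-add; the blends drop the contributions that would come from
	// below the diagonal.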

	// second col (avoid zero y_0_1)
	z_0     = _mm_loaddup_pd( &x[1] );
	a_00_10 = _mm_load_pd( &A0[0+lda*1] );
	y_0_1   = _mm_mul_pd( a_00_10, z_0 );

	// first col
	z_0     = _mm_load_sd( &x[0] );
	a_00_10 = _mm_load_sd( &A0[0+lda*0] );
	tmp0    = _mm_mul_sd( a_00_10, z_0 );
	y_0_1   = _mm_add_sd( y_0_1, tmp0 );
	y_0_1_2_3_b = _mm256_castpd128_pd256( y_0_1 );
	y_0_1_2_3_b = _mm256_blend_pd( y_0_1_2_3_b, zeros, 0xc ); // zero the undefined upper half left by the cast

	// fourth col (avoid zero y_0_1_2_3)
	x_1     = _mm256_broadcast_sd( &x[3] );
	a_01_11_21_31 = _mm256_load_pd( &A0[0+lda*3] );
	y_0_1_2_3 = _mm256_mul_pd( a_01_11_21_31, x_1 );

	// third col
	x_0     = _mm256_broadcast_sd( &x[2] );
	x_0     = _mm256_blend_pd( x_0, zeros, 0x8 );
	a_00_10_20_30 = _mm256_load_pd( &A0[0+lda*2] );
	ax_temp = _mm256_mul_pd( a_00_10_20_30, x_0 );
	y_0_1_2_3_b = _mm256_add_pd( y_0_1_2_3_b, ax_temp );


	A0 += 4*lda;
	A1 += 4*lda;
	x  += 4;


	// upper squared
	x_0 = _mm256_broadcast_sd( &x[0] );
	x_1 = _mm256_broadcast_sd( &x[1] );

	a_00_10_20_30 = _mm256_load_pd( &A0[0+lda*0] );
	a_01_11_21_31 = _mm256_load_pd( &A0[0+lda*1] );

	ax_temp = _mm256_mul_pd( a_00_10_20_30, x_0 );
	y_0_1_2_3 = _mm256_add_pd( y_0_1_2_3, ax_temp );
	ax_temp = _mm256_mul_pd( a_01_11_21_31, x_1 );
	y_0_1_2_3_b = _mm256_add_pd( y_0_1_2_3_b, ax_temp );

	x_0 = _mm256_broadcast_sd( &x[2] );
	x_1 = _mm256_broadcast_sd( &x[3] );

	a_00_10_20_30 = _mm256_load_pd( &A0[0+lda*2] );
	a_01_11_21_31 = _mm256_load_pd( &A0[0+lda*3] );

	ax_temp = _mm256_mul_pd( a_00_10_20_30, x_0 );
	y_0_1_2_3 = _mm256_add_pd( y_0_1_2_3, ax_temp );
	ax_temp = _mm256_mul_pd( a_01_11_21_31, x_1 );
	y_0_1_2_3_b = _mm256_add_pd( y_0_1_2_3_b, ax_temp );


	// lower triangular


	// second col (avoid zero y_0_1)
	z_0     = _mm_loaddup_pd( &x[1] );
	a_00_10 = _mm_load_pd( &A1[0+lda*1] );
	y_0_1   = _mm_mul_pd( a_00_10, z_0 );

	// first col
	z_0     = _mm_load_sd( &x[0] );
	a_00_10 = _mm_load_sd( &A1[0+lda*0] );
	tmp0    = _mm_mul_sd( a_00_10, z_0 );
	y_0_1   = _mm_add_sd( y_0_1, tmp0 );
	y_4_5_6_7_b = _mm256_castpd128_pd256( y_0_1 );
	y_4_5_6_7_b = _mm256_blend_pd( y_4_5_6_7_b, zeros, 0xc ); // zero the undefined upper half left by the cast

	// fourth col (avoid zero y_4_5_6_7)
	x_1     = _mm256_broadcast_sd( &x[3] );
	a_01_11_21_31 = _mm256_load_pd( &A1[0+lda*3] );
	y_4_5_6_7 = _mm256_mul_pd( a_01_11_21_31, x_1 );

	// third col
	x_0     = _mm256_broadcast_sd( &x[2] );
	x_0     = _mm256_blend_pd( x_0, zeros, 0x8 );
	a_00_10_20_30 = _mm256_load_pd( &A1[0+lda*2] );
	ax_temp = _mm256_mul_pd( a_00_10_20_30, x_0 );
	y_4_5_6_7_b = _mm256_add_pd( y_4_5_6_7_b, ax_temp );


	A0 += 4*lda;
	A1 += 4*lda;
	x  += 4;


	k=8;
	for(; k<kmax-3; k+=4)
		{

/*		__builtin_prefetch( A0 + 4*lda );*/
/*		__builtin_prefetch( A1 + 4*lda );*/

		x_0 = _mm256_broadcast_sd( &x[0] );
		x_1 = _mm256_broadcast_sd( &x[1] );

		a_00_10_20_30 = _mm256_load_pd( &A0[0+lda*0] );
		a_40_50_60_70 = _mm256_load_pd( &A1[0+lda*0] );
		a_01_11_21_31 = _mm256_load_pd( &A0[0+lda*1] );
		a_41_51_61_71 = _mm256_load_pd( &A1[0+lda*1] );

		ax_temp = _mm256_mul_pd( a_00_10_20_30, x_0 );
		y_0_1_2_3 = _mm256_add_pd( y_0_1_2_3, ax_temp );
		ax_temp = _mm256_mul_pd( a_40_50_60_70, x_0 );
		y_4_5_6_7 = _mm256_add_pd( y_4_5_6_7, ax_temp );
		ax_temp = _mm256_mul_pd( a_01_11_21_31, x_1 );
		y_0_1_2_3_b = _mm256_add_pd( y_0_1_2_3_b, ax_temp );
		ax_temp = _mm256_mul_pd( a_41_51_61_71, x_1 );
		y_4_5_6_7_b = _mm256_add_pd( y_4_5_6_7_b, ax_temp );

/*		__builtin_prefetch( A0 + 5*lda );*/
/*		__builtin_prefetch( A1 + 5*lda );*/

		x_0 = _mm256_broadcast_sd( &x[2] );
		x_1 = _mm256_broadcast_sd( &x[3] );

		a_00_10_20_30 = _mm256_load_pd( &A0[0+lda*2] );
		a_40_50_60_70 = _mm256_load_pd( &A1[0+lda*2] );
		a_01_11_21_31 = _mm256_load_pd( &A0[0+lda*3] );
		a_41_51_61_71 = _mm256_load_pd( &A1[0+lda*3] );

		ax_temp = _mm256_mul_pd( a_00_10_20_30, x_0 );
		y_0_1_2_3 = _mm256_add_pd( y_0_1_2_3, ax_temp );
		ax_temp = _mm256_mul_pd( a_40_50_60_70, x_0 );
		y_4_5_6_7 = _mm256_add_pd( y_4_5_6_7, ax_temp );
		ax_temp = _mm256_mul_pd( a_01_11_21_31, x_1 );
		y_0_1_2_3_b = _mm256_add_pd( y_0_1_2_3_b, ax_temp );
		ax_temp = _mm256_mul_pd( a_41_51_61_71, x_1 );
		y_4_5_6_7_b = _mm256_add_pd( y_4_5_6_7_b, ax_temp );
	
		A0 += 4*lda;
		A1 += 4*lda;
		x  += 4;

		}
		
	if(kmax%4>=2)
		{

		x_0 = _mm256_broadcast_sd( &x[0] );
		x_1 = _mm256_broadcast_sd( &x[1] );

		a_00_10_20_30 = _mm256_load_pd( &A0[0+lda*0] );
		a_40_50_60_70 = _mm256_load_pd( &A1[0+lda*0] );
		a_01_11_21_31 = _mm256_load_pd( &A0[0+lda*1] );
		a_41_51_61_71 = _mm256_load_pd( &A1[0+lda*1] );

		ax_temp = _mm256_mul_pd( a_00_10_20_30, x_0 );
		y_0_1_2_3 = _mm256_add_pd( y_0_1_2_3, ax_temp );
		ax_temp = _mm256_mul_pd( a_40_50_60_70, x_0 );
		y_4_5_6_7 = _mm256_add_pd( y_4_5_6_7, ax_temp );
		ax_temp = _mm256_mul_pd( a_01_11_21_31, x_1 );
		y_0_1_2_3_b = _mm256_add_pd( y_0_1_2_3_b, ax_temp );
		ax_temp = _mm256_mul_pd( a_41_51_61_71, x_1 );
		y_4_5_6_7_b = _mm256_add_pd( y_4_5_6_7_b, ax_temp );
		
		A0 += 2*lda;
		A1 += 2*lda;
		x  += 2;

		}
	
	y_0_1_2_3 = _mm256_add_pd( y_0_1_2_3, y_0_1_2_3_b );
	y_4_5_6_7 = _mm256_add_pd( y_4_5_6_7, y_4_5_6_7_b );

	if(kmax%2==1)
		{

		x_0 = _mm256_broadcast_sd( &x[0] );

		a_00_10_20_30 = _mm256_load_pd( &A0[0+lda*0] );
		a_40_50_60_70 = _mm256_load_pd( &A1[0+lda*0] );

		ax_temp = _mm256_mul_pd( a_00_10_20_30, x_0 );
		y_0_1_2_3 = _mm256_add_pd( y_0_1_2_3, ax_temp );
		ax_temp = _mm256_mul_pd( a_40_50_60_70, x_0 );
		y_4_5_6_7 = _mm256_add_pd( y_4_5_6_7, ax_temp );
		
/*		A0 += 1*lda;*/
/*		A1 += 1*lda;*/
/*		x  += 1;*/

		}

	if(alg==0)
		{
		_mm256_storeu_pd(&y[0], y_0_1_2_3);
		_mm256_storeu_pd(&y[4], y_4_5_6_7);
		}
	else if(alg==1)
		{
		z_0_1_2_3 = _mm256_loadu_pd( &y[0] );
		z_4_5_6_7 = _mm256_loadu_pd( &y[4] );

		z_0_1_2_3 = _mm256_add_pd( z_0_1_2_3, y_0_1_2_3 );
		z_4_5_6_7 = _mm256_add_pd( z_4_5_6_7, y_4_5_6_7 );

		_mm256_storeu_pd(&y[0], z_0_1_2_3);
		_mm256_storeu_pd(&y[4], z_4_5_6_7);
		}
	else // alg==-1
		{
		z_0_1_2_3 = _mm256_loadu_pd( &y[0] );
		z_4_5_6_7 = _mm256_loadu_pd( &y[4] );

		z_0_1_2_3 = _mm256_sub_pd( z_0_1_2_3, y_0_1_2_3 );
		z_4_5_6_7 = _mm256_sub_pd( z_4_5_6_7, y_4_5_6_7 );

		_mm256_storeu_pd(&y[0], z_0_1_2_3);
		_mm256_storeu_pd(&y[4], z_4_5_6_7);
		}

	}
Example #6
// it moves horizontally inside a block (A upper triangular)
void kernel_dtrmv_u_n_4_lib4(int kmax, double *A, double *x, double *y, int alg)
	{

	if(kmax<=0) 
		return;
	
	const int lda = 4;
	
	int k;
	
	__m128d
		tmp0,
		z_0, y_0_1, a_00_10;

	__m256d
		zeros,
		ax_temp,
		a_00_10_20_30, a_01_11_21_31, a_02_12_22_32, a_03_13_23_33,
		x_0, x_1, x_2, x_3,
		y_0_1_2_3, y_0_1_2_3_b, y_0_1_2_3_c, y_0_1_2_3_d, z_0_1_2_3;
		
	zeros = _mm256_setzero_pd();
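	// Four separate accumulators (y_0_1_2_3 .. y_0_1_2_3_d) are used so that the
	// adds for consecutive columns do not depend on each other; they are folded
	// back together after the main loop.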
	
/*	y_0_1_2_3   = _mm256_setzero_pd();	*/
/*	y_0_1_2_3_b = _mm256_setzero_pd();	*/
/*	y_0_1_2_3_c = _mm256_setzero_pd();	*/
	y_0_1_2_3_d = _mm256_setzero_pd();
	
	// second col (avoid zero y_0_1)
	z_0     = _mm_loaddup_pd( &x[1] );
	a_00_10 = _mm_load_pd( &A[0+lda*1] );
	y_0_1   = _mm_mul_pd( a_00_10, z_0 );

	// first col
	z_0     = _mm_load_sd( &x[0] );
	a_00_10 = _mm_load_sd( &A[0+lda*0] );
	tmp0    = _mm_mul_sd( a_00_10, z_0 );
	y_0_1   = _mm_add_sd( y_0_1, tmp0 );
	y_0_1_2_3_c = _mm256_castpd128_pd256( y_0_1 );
	y_0_1_2_3_c = _mm256_blend_pd( y_0_1_2_3_c, y_0_1_2_3_d, 0xc );

	// fourth col (avoid zero y_0_1_2_3)
	x_3     = _mm256_broadcast_sd( &x[3] );
	a_03_13_23_33 = _mm256_load_pd( &A[0+lda*3] );
	y_0_1_2_3 = _mm256_mul_pd( a_03_13_23_33, x_3 );

	// third col
	x_2     = _mm256_broadcast_sd( &x[2] );
	x_2     = _mm256_blend_pd( x_2, zeros, 0x8 );
	a_02_12_22_32 = _mm256_load_pd( &A[0+lda*2] );
	y_0_1_2_3_b = _mm256_mul_pd( a_02_12_22_32, x_2 );

	A += 4*lda;
	x += 4;

	k=4;
	for(; k<kmax-3; k+=4)
		{

		x_0 = _mm256_broadcast_sd( &x[0] );
		x_1 = _mm256_broadcast_sd( &x[1] );

		a_00_10_20_30 = _mm256_load_pd( &A[0+lda*0] );
		a_01_11_21_31 = _mm256_load_pd( &A[0+lda*1] );

		x_2 = _mm256_broadcast_sd( &x[2] );
		x_3 = _mm256_broadcast_sd( &x[3] );

		a_02_12_22_32 = _mm256_load_pd( &A[0+lda*2] );
		a_03_13_23_33 = _mm256_load_pd( &A[0+lda*3] );

		ax_temp = _mm256_mul_pd( a_00_10_20_30, x_0 );
		y_0_1_2_3 = _mm256_add_pd( y_0_1_2_3, ax_temp );
		ax_temp = _mm256_mul_pd( a_01_11_21_31, x_1 );
		y_0_1_2_3_b = _mm256_add_pd( y_0_1_2_3_b, ax_temp );

		ax_temp = _mm256_mul_pd( a_02_12_22_32, x_2 );
		y_0_1_2_3_c = _mm256_add_pd( y_0_1_2_3_c, ax_temp );
		ax_temp = _mm256_mul_pd( a_03_13_23_33, x_3 );
		y_0_1_2_3_d = _mm256_add_pd( y_0_1_2_3_d, ax_temp );
		
		A += 4*lda;
		x += 4;

		}
	
	y_0_1_2_3 = _mm256_add_pd( y_0_1_2_3, y_0_1_2_3_c );
	y_0_1_2_3_b = _mm256_add_pd( y_0_1_2_3_b, y_0_1_2_3_d );

	if(kmax%4>=2)
		{

		x_0 = _mm256_broadcast_sd( &x[0] );
		x_1 = _mm256_broadcast_sd( &x[1] );

		a_00_10_20_30 = _mm256_load_pd( &A[0+lda*0] );
		a_01_11_21_31 = _mm256_load_pd( &A[0+lda*1] );

		ax_temp = _mm256_mul_pd( a_00_10_20_30, x_0 );
		y_0_1_2_3 = _mm256_add_pd( y_0_1_2_3, ax_temp );
		ax_temp = _mm256_mul_pd( a_01_11_21_31, x_1 );
		y_0_1_2_3_b = _mm256_add_pd( y_0_1_2_3_b, ax_temp );

		A += 2*lda;
		x += 2;

		}

	y_0_1_2_3 = _mm256_add_pd( y_0_1_2_3, y_0_1_2_3_b );

	if(kmax%2==1)
		{

		x_0 = _mm256_broadcast_sd( &x[0] );

		a_00_10_20_30 = _mm256_load_pd( &A[0+lda*0] );

		ax_temp = _mm256_mul_pd( a_00_10_20_30, x_0 );
		y_0_1_2_3 = _mm256_add_pd( y_0_1_2_3, ax_temp );
		
		}

	if(alg==0)
		{
		_mm256_storeu_pd(&y[0], y_0_1_2_3);
		}
	else if(alg==1)
		{
		z_0_1_2_3 = _mm256_loadu_pd( &y[0] );

		z_0_1_2_3 = _mm256_add_pd ( z_0_1_2_3, y_0_1_2_3 );

		_mm256_storeu_pd(&y[0], z_0_1_2_3);
		}
	else // alg==-1
		{
		z_0_1_2_3 = _mm256_loadu_pd( &y[0] );

		z_0_1_2_3 = _mm256_sub_pd ( z_0_1_2_3, y_0_1_2_3 );

		_mm256_storeu_pd(&y[0], z_0_1_2_3);
		}

	}
Example #7
// it moves vertically across blocks
void kernel_dtrmv_u_t_8_lib4(int kmax, double *A, int sda, double *x, double *y, int alg)
	{

/*	if(kmax<=0) */
/*		return;*/
	
	const int lda = 4;
	
/*	__builtin_prefetch( A + 0*lda );*/
/*	__builtin_prefetch( A + 2*lda );*/
/*	__builtin_prefetch( A + 4*lda );*/
/*	__builtin_prefetch( A + 6*lda );*/

/*	double *tA, *tx;*/

	int k;
/*	int ka = kmax-kna; // number from aligned position*/
	
	__m256d
		zeros,
		tmp0, tmp1, 
		a_00_10_20_30, a_01_11_21_31, a_02_12_22_32, a_03_13_23_33,
		x_0_1_2_3,
		y_00, y_11, y_22, y_33, y_44, y_55, y_66, y_77;
	
/*	__m128d*/
/*		ax_temp,*/
/*		a_00_10, a_01_11, a_02_12, a_03_13,*/
/*		x_0_1,*/
/*		y_0, y_1, y_2, y_3, y_4, y_5, y_6, y_7;*/
	
	y_00 = _mm256_setzero_pd();
	y_11 = _mm256_setzero_pd();
	y_22 = _mm256_setzero_pd();
	y_33 = _mm256_setzero_pd();
	y_44 = _mm256_setzero_pd();
	y_55 = _mm256_setzero_pd();
	y_66 = _mm256_setzero_pd();
	y_77 = _mm256_setzero_pd();
		
	k=0;
	for(; k<kmax-7; k+=8)
		{

/*		__builtin_prefetch( A + sda*lda + 0*lda );*/
/*		__builtin_prefetch( A + sda*lda + 2*lda );*/

		x_0_1_2_3 = _mm256_loadu_pd( &x[0] );

		a_00_10_20_30 = _mm256_load_pd( &A[0+lda*0] );
		a_01_11_21_31 = _mm256_load_pd( &A[0+lda*1] );
		a_02_12_22_32 = _mm256_load_pd( &A[0+lda*2] );
		a_03_13_23_33 = _mm256_load_pd( &A[0+lda*3] );
	
		tmp0 = _mm256_mul_pd( a_00_10_20_30, x_0_1_2_3 );
		tmp1 = _mm256_mul_pd( a_01_11_21_31, x_0_1_2_3 );
		y_00 = _mm256_add_pd( y_00, tmp0 );
		y_11 = _mm256_add_pd( y_11, tmp1 );
		tmp0 = _mm256_mul_pd( a_02_12_22_32, x_0_1_2_3 );
		tmp1 = _mm256_mul_pd( a_03_13_23_33, x_0_1_2_3 );
		y_22 = _mm256_add_pd( y_22, tmp0 );
		y_33 = _mm256_add_pd( y_33, tmp1 );
	
/*		__builtin_prefetch( A + sda*lda + 4*lda );*/
/*		__builtin_prefetch( A + sda*lda + 6*lda );*/

		a_00_10_20_30 = _mm256_load_pd( &A[0+lda*4] );
		a_01_11_21_31 = _mm256_load_pd( &A[0+lda*5] );
		a_02_12_22_32 = _mm256_load_pd( &A[0+lda*6] );
		a_03_13_23_33 = _mm256_load_pd( &A[0+lda*7] );
	
		tmp0 = _mm256_mul_pd( a_00_10_20_30, x_0_1_2_3 );
		tmp1 = _mm256_mul_pd( a_01_11_21_31, x_0_1_2_3 );
		y_44 = _mm256_add_pd( y_44, tmp0 );
		y_55 = _mm256_add_pd( y_55, tmp1 );
		tmp0 = _mm256_mul_pd( a_02_12_22_32, x_0_1_2_3 );
		tmp1 = _mm256_mul_pd( a_03_13_23_33, x_0_1_2_3 );
		y_66 = _mm256_add_pd( y_66, tmp0 );
		y_77 = _mm256_add_pd( y_77, tmp1 );

		A += 4 + (sda-1)*lda;
		x += 4;


/*		__builtin_prefetch( A + sda*lda + 0*lda );*/
/*		__builtin_prefetch( A + sda*lda + 2*lda );*/

		x_0_1_2_3 = _mm256_loadu_pd( &x[0] );

		a_00_10_20_30 = _mm256_load_pd( &A[0+lda*0] );
		a_01_11_21_31 = _mm256_load_pd( &A[0+lda*1] );
		a_02_12_22_32 = _mm256_load_pd( &A[0+lda*2] );
		a_03_13_23_33 = _mm256_load_pd( &A[0+lda*3] );
	
		tmp0 = _mm256_mul_pd( a_00_10_20_30, x_0_1_2_3 );
		tmp1 = _mm256_mul_pd( a_01_11_21_31, x_0_1_2_3 );
		y_00 = _mm256_add_pd( y_00, tmp0 );
		y_11 = _mm256_add_pd( y_11, tmp1 );
		tmp0 = _mm256_mul_pd( a_02_12_22_32, x_0_1_2_3 );
		tmp1 = _mm256_mul_pd( a_03_13_23_33, x_0_1_2_3 );
		y_22 = _mm256_add_pd( y_22, tmp0 );
		y_33 = _mm256_add_pd( y_33, tmp1 );
	
/*		__builtin_prefetch( A + sda*lda + 4*lda );*/
/*		__builtin_prefetch( A + sda*lda + 6*lda );*/

		a_00_10_20_30 = _mm256_load_pd( &A[0+lda*4] );
		a_01_11_21_31 = _mm256_load_pd( &A[0+lda*5] );
		a_02_12_22_32 = _mm256_load_pd( &A[0+lda*6] );
		a_03_13_23_33 = _mm256_load_pd( &A[0+lda*7] );
	
		tmp0 = _mm256_mul_pd( a_00_10_20_30, x_0_1_2_3 );
		tmp1 = _mm256_mul_pd( a_01_11_21_31, x_0_1_2_3 );
		y_44 = _mm256_add_pd( y_44, tmp0 );
		y_55 = _mm256_add_pd( y_55, tmp1 );
		tmp0 = _mm256_mul_pd( a_02_12_22_32, x_0_1_2_3 );
		tmp1 = _mm256_mul_pd( a_03_13_23_33, x_0_1_2_3 );
		y_66 = _mm256_add_pd( y_66, tmp0 );
		y_77 = _mm256_add_pd( y_77, tmp1 );

		A += 4 + (sda-1)*lda;
		x += 4;

		}
/*	for(; k<ka-3; k+=4)*/
/*		{*/

/*		__builtin_prefetch( A + sda*lda + 0*lda );*/
/*		__builtin_prefetch( A + sda*lda + 2*lda );*/

/*		x_0_1_2_3 = _mm256_loadu_pd( &x[0] );*/

/*		a_00_10_20_30 = _mm256_load_pd( &A[0+lda*0] );*/
/*		a_01_11_21_31 = _mm256_load_pd( &A[0+lda*1] );*/
/*		a_02_12_22_32 = _mm256_load_pd( &A[0+lda*2] );*/
/*		a_03_13_23_33 = _mm256_load_pd( &A[0+lda*3] );*/
/*	*/
/*		aaxx_temp = _mm256_mul_pd( a_00_10_20_30, x_0_1_2_3 );*/
/*		y_00 = _mm256_add_pd( y_00, aaxx_temp );*/
/*		aaxx_temp = _mm256_mul_pd( a_01_11_21_31, x_0_1_2_3 );*/
/*		y_11 = _mm256_add_pd( y_11, aaxx_temp );*/
/*		aaxx_temp = _mm256_mul_pd( a_02_12_22_32, x_0_1_2_3 );*/
/*		y_22 = _mm256_add_pd( y_22, aaxx_temp );*/
/*		aaxx_temp = _mm256_mul_pd( a_03_13_23_33, x_0_1_2_3 );*/
/*		y_33 = _mm256_add_pd( y_33, aaxx_temp );*/
/*	*/
/*		__builtin_prefetch( A + sda*lda + 4*lda );*/
/*		__builtin_prefetch( A + sda*lda + 6*lda );*/

/*		a_00_10_20_30 = _mm256_load_pd( &A[0+lda*4] );*/
/*		a_01_11_21_31 = _mm256_load_pd( &A[0+lda*5] );*/
/*		a_02_12_22_32 = _mm256_load_pd( &A[0+lda*6] );*/
/*		a_03_13_23_33 = _mm256_load_pd( &A[0+lda*7] );*/
/*	*/
/*		aaxx_temp = _mm256_mul_pd( a_00_10_20_30, x_0_1_2_3 );*/
/*		y_44 = _mm256_add_pd( y_44, aaxx_temp );*/
/*		aaxx_temp = _mm256_mul_pd( a_01_11_21_31, x_0_1_2_3 );*/
/*		y_55 = _mm256_add_pd( y_55, aaxx_temp );*/
/*		aaxx_temp = _mm256_mul_pd( a_02_12_22_32, x_0_1_2_3 );*/
/*		y_66 = _mm256_add_pd( y_66, aaxx_temp );*/
/*		aaxx_temp = _mm256_mul_pd( a_03_13_23_33, x_0_1_2_3 );*/
/*		y_77 = _mm256_add_pd( y_77, aaxx_temp );*/

/*		A += 4 + (sda-1)*lda;*/
/*		x += 4;*/

/*		}*/
		
	zeros = _mm256_setzero_pd();

	// top triangle
	x_0_1_2_3 = _mm256_loadu_pd( &x[0] );

	a_00_10_20_30 = _mm256_load_pd( &A[0+lda*0] );
	a_00_10_20_30 = _mm256_blend_pd( a_00_10_20_30, zeros, 0xe );
	a_01_11_21_31 = _mm256_load_pd( &A[0+lda*1] );
	a_01_11_21_31 = _mm256_blend_pd( a_01_11_21_31, zeros, 0xc );
	a_02_12_22_32 = _mm256_load_pd( &A[0+lda*2] );
	a_02_12_22_32 = _mm256_blend_pd( a_02_12_22_32, zeros, 0x8 );
	a_03_13_23_33 = _mm256_load_pd( &A[0+lda*3] );
	
	tmp0 = _mm256_mul_pd( a_00_10_20_30, x_0_1_2_3 );
	tmp1 = _mm256_mul_pd( a_01_11_21_31, x_0_1_2_3 );
	y_00 = _mm256_add_pd( y_00, tmp0 );
	y_11 = _mm256_add_pd( y_11, tmp1 );
	tmp0 = _mm256_mul_pd( a_02_12_22_32, x_0_1_2_3 );
	tmp1 = _mm256_mul_pd( a_03_13_23_33, x_0_1_2_3 );
	y_22 = _mm256_add_pd( y_22, tmp0 );
	y_33 = _mm256_add_pd( y_33, tmp1 );

	// top square
	a_00_10_20_30 = _mm256_load_pd( &A[0+lda*4] );
	a_01_11_21_31 = _mm256_load_pd( &A[0+lda*5] );
	a_02_12_22_32 = _mm256_load_pd( &A[0+lda*6] );
	a_03_13_23_33 = _mm256_load_pd( &A[0+lda*7] );

	tmp0 = _mm256_mul_pd( a_00_10_20_30, x_0_1_2_3 );
	tmp1 = _mm256_mul_pd( a_01_11_21_31, x_0_1_2_3 );
	y_44 = _mm256_add_pd( y_44, tmp0 );
	y_55 = _mm256_add_pd( y_55, tmp1 );
	tmp0 = _mm256_mul_pd( a_02_12_22_32, x_0_1_2_3 );
	tmp1 = _mm256_mul_pd( a_03_13_23_33, x_0_1_2_3 );
	y_66 = _mm256_add_pd( y_66, tmp0 );
	y_77 = _mm256_add_pd( y_77, tmp1 );

	A += 4 + (sda-1)*lda;
	x += 4;

	// bottom triangle
	x_0_1_2_3 = _mm256_loadu_pd( &x[0] );

	a_00_10_20_30 = _mm256_load_pd( &A[0+lda*4] );
	a_00_10_20_30 = _mm256_blend_pd( a_00_10_20_30, zeros, 0xe );
	a_01_11_21_31 = _mm256_load_pd( &A[0+lda*5] );
	a_01_11_21_31 = _mm256_blend_pd( a_01_11_21_31, zeros, 0xc );
	a_02_12_22_32 = _mm256_load_pd( &A[0+lda*6] );
	a_02_12_22_32 = _mm256_blend_pd( a_02_12_22_32, zeros, 0x8 );
	a_03_13_23_33 = _mm256_load_pd( &A[0+lda*7] );

	tmp0 = _mm256_mul_pd( a_00_10_20_30, x_0_1_2_3 );
	tmp1 = _mm256_mul_pd( a_01_11_21_31, x_0_1_2_3 );
	y_44 = _mm256_add_pd( y_44, tmp0 );
	y_55 = _mm256_add_pd( y_55, tmp1 );
	tmp0 = _mm256_mul_pd( a_02_12_22_32, x_0_1_2_3 );
	tmp1 = _mm256_mul_pd( a_03_13_23_33, x_0_1_2_3 );
	y_66 = _mm256_add_pd( y_66, tmp0 );
	y_77 = _mm256_add_pd( y_77, tmp1 );

	// store
	__m256d
		y_0_1_2_3, y_4_5_6_7;
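
	// Reduction: hadd pairs the eight column accumulators, then the two
	// permute2f128 calls line up matching 128-bit halves so that one add per
	// group yields {y0,y1,y2,y3} in y_00 and {y4,y5,y6,y7} in y_44.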

	y_00 = _mm256_hadd_pd(y_00, y_11);
	y_22 = _mm256_hadd_pd(y_22, y_33);
	y_44 = _mm256_hadd_pd(y_44, y_55);
	y_66 = _mm256_hadd_pd(y_66, y_77);

	y_11 = _mm256_permute2f128_pd(y_22, y_00, 2 );	
	y_00 = _mm256_permute2f128_pd(y_22, y_00, 19);	
	y_55 = _mm256_permute2f128_pd(y_66, y_44, 2 );	
	y_44 = _mm256_permute2f128_pd(y_66, y_44, 19);	

	y_00 = _mm256_add_pd( y_00, y_11 );
	y_44 = _mm256_add_pd( y_44, y_55 );

	if(alg==0)
		{
		_mm256_storeu_pd(&y[0], y_00);
		_mm256_storeu_pd(&y[4], y_44);
		}
	else if(alg==1)
		{
		y_0_1_2_3 = _mm256_loadu_pd( &y[0] );
		y_4_5_6_7 = _mm256_loadu_pd( &y[4] );

		y_0_1_2_3 = _mm256_add_pd( y_0_1_2_3, y_00 );
		y_4_5_6_7 = _mm256_add_pd( y_4_5_6_7, y_44 );

		_mm256_storeu_pd(&y[0], y_0_1_2_3);
		_mm256_storeu_pd(&y[4], y_4_5_6_7);
		}
	else // alg==-1
		{
		y_0_1_2_3 = _mm256_loadu_pd( &y[0] );
		y_4_5_6_7 = _mm256_loadu_pd( &y[4] );
	
		y_0_1_2_3 = _mm256_sub_pd( y_0_1_2_3, y_00 );
		y_4_5_6_7 = _mm256_sub_pd( y_4_5_6_7, y_44 );
	
		_mm256_storeu_pd(&y[0], y_0_1_2_3);
		_mm256_storeu_pd(&y[4], y_4_5_6_7);
		}

	}
Example #8
// it moves vertically across blocks
void kernel_dtrmv_u_t_4_lib4(int kmax, double *A, int sda, double *x, double *y, int alg)
	{

/*	if(kmax<=0) */
/*		return;*/
	
	const int lda = 4;
	
/*	__builtin_prefetch( A + 0*lda );*/
/*	__builtin_prefetch( A + 2*lda );*/

/*	double *tA, *tx;*/

	int k;
	
	__m256d
		zeros,
		tmp0, tmp1,
		a_00_10_20_30, a_01_11_21_31, a_02_12_22_32, a_03_13_23_33,
		x_0_1_2_3,
		y_00, y_11, y_22, y_33;

	y_00 = _mm256_setzero_pd();
	y_11 = _mm256_setzero_pd();
	y_22 = _mm256_setzero_pd();
	y_33 = _mm256_setzero_pd();

	k=0;
	for(; k<kmax-7; k+=8)
		{
		
/*		__builtin_prefetch( A + sda*lda + 0*lda );*/
/*		__builtin_prefetch( A + sda*lda + 2*lda );*/

		x_0_1_2_3 = _mm256_loadu_pd( &x[0] );

		a_00_10_20_30 = _mm256_load_pd( &A[0+lda*0] );
		a_01_11_21_31 = _mm256_load_pd( &A[0+lda*1] );
		a_02_12_22_32 = _mm256_load_pd( &A[0+lda*2] );
		a_03_13_23_33 = _mm256_load_pd( &A[0+lda*3] );
		
		tmp0 = _mm256_mul_pd( a_00_10_20_30, x_0_1_2_3 );
		tmp1 = _mm256_mul_pd( a_01_11_21_31, x_0_1_2_3 );
		y_00 = _mm256_add_pd( y_00, tmp0 );
		y_11 = _mm256_add_pd( y_11, tmp1 );
		tmp0 = _mm256_mul_pd( a_02_12_22_32, x_0_1_2_3 );
		tmp1 = _mm256_mul_pd( a_03_13_23_33, x_0_1_2_3 );
		y_22 = _mm256_add_pd( y_22, tmp0 );
		y_33 = _mm256_add_pd( y_33, tmp1 );
		
		A += 4 + (sda-1)*lda;
		x += 4;


/*		__builtin_prefetch( A + sda*lda + 0*lda );*/
/*		__builtin_prefetch( A + sda*lda + 2*lda );*/

		x_0_1_2_3 = _mm256_loadu_pd( &x[0] );

		a_00_10_20_30 = _mm256_load_pd( &A[0+lda*0] );
		a_01_11_21_31 = _mm256_load_pd( &A[0+lda*1] );
		a_02_12_22_32 = _mm256_load_pd( &A[0+lda*2] );
		a_03_13_23_33 = _mm256_load_pd( &A[0+lda*3] );
		
		tmp0 = _mm256_mul_pd( a_00_10_20_30, x_0_1_2_3 );
		tmp1 = _mm256_mul_pd( a_01_11_21_31, x_0_1_2_3 );
		y_00 = _mm256_add_pd( y_00, tmp0 );
		y_11 = _mm256_add_pd( y_11, tmp1 );
		tmp0 = _mm256_mul_pd( a_02_12_22_32, x_0_1_2_3 );
		tmp1 = _mm256_mul_pd( a_03_13_23_33, x_0_1_2_3 );
		y_22 = _mm256_add_pd( y_22, tmp0 );
		y_33 = _mm256_add_pd( y_33, tmp1 );
		
		A += 4 + (sda-1)*lda;
		x += 4;

		}
	for(; k<kmax-3; k+=4)
		{
		
/*		__builtin_prefetch( A + sda*lda + 0*lda );*/
/*		__builtin_prefetch( A + sda*lda + 2*lda );*/

		x_0_1_2_3 = _mm256_loadu_pd( &x[0] );

		a_00_10_20_30 = _mm256_load_pd( &A[0+lda*0] );
		a_01_11_21_31 = _mm256_load_pd( &A[0+lda*1] );
		a_02_12_22_32 = _mm256_load_pd( &A[0+lda*2] );
		a_03_13_23_33 = _mm256_load_pd( &A[0+lda*3] );
		
		tmp0 = _mm256_mul_pd( a_00_10_20_30, x_0_1_2_3 );
		tmp1 = _mm256_mul_pd( a_01_11_21_31, x_0_1_2_3 );
		y_00 = _mm256_add_pd( y_00, tmp0 );
		y_11 = _mm256_add_pd( y_11, tmp1 );
		tmp0 = _mm256_mul_pd( a_02_12_22_32, x_0_1_2_3 );
		tmp1 = _mm256_mul_pd( a_03_13_23_33, x_0_1_2_3 );
		y_22 = _mm256_add_pd( y_22, tmp0 );
		y_33 = _mm256_add_pd( y_33, tmp1 );
		
		A += 4 + (sda-1)*lda;
		x += 4;

		}

	zeros = _mm256_setzero_pd();
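
	// Diagonal (triangular) block: blend each loaded column with zeros to drop
	// the entries below the diagonal before forming the final dot products.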

	x_0_1_2_3 = _mm256_loadu_pd( &x[0] );

	a_00_10_20_30 = _mm256_load_pd( &A[0+lda*0] );
	a_00_10_20_30 = _mm256_blend_pd( a_00_10_20_30, zeros, 0xe );
	a_01_11_21_31 = _mm256_load_pd( &A[0+lda*1] );
	a_01_11_21_31 = _mm256_blend_pd( a_01_11_21_31, zeros, 0xc );
	a_02_12_22_32 = _mm256_load_pd( &A[0+lda*2] );
	a_02_12_22_32 = _mm256_blend_pd( a_02_12_22_32, zeros, 0x8 );
	a_03_13_23_33 = _mm256_load_pd( &A[0+lda*3] );
	
	tmp0 = _mm256_mul_pd( a_00_10_20_30, x_0_1_2_3 );
	tmp1 = _mm256_mul_pd( a_01_11_21_31, x_0_1_2_3 );
	y_00 = _mm256_add_pd( y_00, tmp0 );
	y_11 = _mm256_add_pd( y_11, tmp1 );
	tmp0 = _mm256_mul_pd( a_02_12_22_32, x_0_1_2_3 );
	tmp1 = _mm256_mul_pd( a_03_13_23_33, x_0_1_2_3 );
	y_22 = _mm256_add_pd( y_22, tmp0 );
	y_33 = _mm256_add_pd( y_33, tmp1 );

	__m256d
		y_0_1_2_3;

	y_00 = _mm256_hadd_pd(y_00, y_11);
	y_22 = _mm256_hadd_pd(y_22, y_33);

	y_11 = _mm256_permute2f128_pd(y_22, y_00, 2 );	
	y_00 = _mm256_permute2f128_pd(y_22, y_00, 19);	

	y_00 = _mm256_add_pd( y_00, y_11 );

	if(alg==0)
		{
		_mm256_storeu_pd(&y[0], y_00);
		}
	else if(alg==1)
		{
		y_0_1_2_3 = _mm256_loadu_pd( &y[0] );

		y_0_1_2_3 = _mm256_add_pd( y_0_1_2_3, y_00 );

		_mm256_storeu_pd(&y[0], y_0_1_2_3);
		}
	else // alg==-1
		{
		y_0_1_2_3 = _mm256_loadu_pd( &y[0] );
	
		y_0_1_2_3 = _mm256_sub_pd( y_0_1_2_3, y_00 );
	
		_mm256_storeu_pd(&y[0], y_0_1_2_3);
		}

	}
Example #9
// it moves vertically across blocks
void kernel_dsymv_4_lib4(int kmax, double *A, int sda, double *x_n, double *y_n, double *z_n, double *x_t, double *y_t, double *z_t, int tri, int alg)
	{
	
	if(kmax<=0) 
		return;

/*printf("\nciao %d\n", kmax);	*/
	const int bs = 4;
	
	__builtin_prefetch( A + bs*0 );
	__builtin_prefetch( A + bs*2 );

	int k, ka;
	ka = kmax; // number from aligned position

	double k_left;
	
//	double *sA, *sy_n, *sx_t;

	static double d_mask[4]  = {0.5, 1.5, 2.5, 3.5};

	__m256d
		v_mask,
		zeros, temp,
		a_00, a_01, a_02, a_03,
		x_n_0, x_n_1, x_n_2, x_n_3, y_n_0,
		x_t_0, y_t_0, y_t_1, y_t_2, y_t_3;
	
	__m256i
		i_mask;

#if 0
	__m128d
		stemp,
		sa_00, sa_01, sa_02, sa_03,
		sx_n_0, sx_n_1, sx_n_2, sx_n_3, sy_n_0,
		sx_t_0, sy_t_0, sy_t_1, sy_t_2, sy_t_3;
#endif
	
	zeros = _mm256_setzero_pd();

	x_n_0 = _mm256_broadcast_sd( &x_n[0] );
	x_n_1 = _mm256_broadcast_sd( &x_n[1] );
	x_n_2 = _mm256_broadcast_sd( &x_n[2] );
	x_n_3 = _mm256_broadcast_sd( &x_n[3] );

	if(alg==-1) // TODO xor
		{
		x_n_0 = _mm256_sub_pd( zeros, x_n_0 );
		x_n_1 = _mm256_sub_pd( zeros, x_n_1 );
		x_n_2 = _mm256_sub_pd( zeros, x_n_2 );
		x_n_3 = _mm256_sub_pd( zeros, x_n_3 );
		}

	y_t_0 = _mm256_setzero_pd();
	y_t_1 = _mm256_setzero_pd();
	y_t_2 = _mm256_setzero_pd();
	y_t_3 = _mm256_setzero_pd();
	
#if 0
	sx_n_0 = _mm256_castpd256_pd128( x_n_0 );
	sx_n_1 = _mm256_castpd256_pd128( x_n_1 );
	sx_n_2 = _mm256_castpd256_pd128( x_n_2 );
	sx_n_3 = _mm256_castpd256_pd128( x_n_3 );

	sy_t_0 = _mm256_castpd256_pd128( y_t_0 );
	sy_t_1 = _mm256_castpd256_pd128( y_t_1 );
	sy_t_2 = _mm256_castpd256_pd128( y_t_2 );
	sy_t_3 = _mm256_castpd256_pd128( y_t_3 );

	k = bs*(ka/bs);
	sA = A + (ka/bs)*sda*bs;
	sy_n = y_n + (ka/bs)*bs;
	sx_t = x_t + (ka/bs)*bs;

	for(; k<ka; k++)
		{
		
		sy_n_0 = _mm_load_sd( &sy_n[0] );
		sx_t_0 = _mm_load_sd( &sx_t[0] );
		
		sa_00 = _mm_load_sd( &sA[0+bs*0] );
		sa_01 = _mm_load_sd( &sA[0+bs*1] );
		sa_02 = _mm_load_sd( &sA[0+bs*2] );
		sa_03 = _mm_load_sd( &sA[0+bs*3] );
		
		stemp  = _mm_mul_sd( sa_00, sx_n_0 );
		sy_n_0 = _mm_add_sd( sy_n_0, stemp );
		stemp  = _mm_mul_sd( sa_00, sx_t_0 );
		sy_t_0 = _mm_add_sd( sy_t_0, stemp );
		stemp  = _mm_mul_sd( sa_01, sx_n_1 );
		sy_n_0 = _mm_add_sd( sy_n_0, stemp );
		stemp  = _mm_mul_sd( sa_01, sx_t_0 );
		sy_t_1 = _mm_add_sd( sy_t_1, stemp );
		stemp  = _mm_mul_sd( sa_02, sx_n_2 );
		sy_n_0 = _mm_add_sd( sy_n_0, stemp );
		stemp  = _mm_mul_sd( sa_02, sx_t_0 );
		sy_t_2 = _mm_add_sd( sy_t_2, stemp );
		stemp  = _mm_mul_sd( sa_03, sx_n_3 );
		sy_n_0 = _mm_add_sd( sy_n_0, stemp );
		stemp  = _mm_mul_sd( sa_03, sx_t_0 );
		sy_t_3 = _mm_add_sd( sy_t_3, stemp );
		
		_mm_store_sd( &sy_n[0], sy_n_0 );

		
		sA += 1;
		sy_n += 1;
		sx_t += 1;

		}

	y_t_0 = _mm256_castpd128_pd256( sy_t_0 );
	y_t_1 = _mm256_castpd128_pd256( sy_t_1 );
	y_t_2 = _mm256_castpd128_pd256( sy_t_2 );
	y_t_3 = _mm256_castpd128_pd256( sy_t_3 );
#endif

	k=0;

	// corner
	if(tri==1)
		{
		
		__builtin_prefetch( A + sda*bs +bs*0 );
		__builtin_prefetch( A + sda*bs +bs*2 );

		y_n_0 = _mm256_loadu_pd( &y_n[0] );
		x_t_0 = _mm256_loadu_pd( &x_t[0] );
		
		a_00 = _mm256_load_pd( &A[0+bs*0] );
		a_01 = _mm256_load_pd( &A[0+bs*1] );
		a_02 = _mm256_load_pd( &A[0+bs*2] );
		a_03 = _mm256_load_pd( &A[0+bs*3] );
		
		temp  = _mm256_mul_pd( a_00, x_n_0 );
		y_n_0 = _mm256_add_pd( y_n_0, temp );
		temp  = _mm256_mul_pd( a_00, x_t_0 );
		temp  = _mm256_blend_pd( zeros, temp, 14 );
		y_t_0 = _mm256_add_pd( y_t_0, temp );
		temp  = _mm256_mul_pd( a_01, x_n_1 );
		temp  = _mm256_blend_pd( zeros, temp, 14 );
		y_n_0 = _mm256_add_pd( y_n_0, temp );
		temp  = _mm256_mul_pd( a_01, x_t_0 );
		temp  = _mm256_blend_pd( zeros, temp, 12 );
		y_t_1 = _mm256_add_pd( y_t_1, temp );
		temp  = _mm256_mul_pd( a_02, x_n_2 );
		temp  = _mm256_blend_pd( zeros, temp, 12 );
		y_n_0 = _mm256_add_pd( y_n_0, temp );
		temp  = _mm256_mul_pd( a_02, x_t_0 );
		temp  = _mm256_blend_pd( zeros, temp, 8 );
		y_t_2 = _mm256_add_pd( y_t_2, temp );
		temp  = _mm256_mul_pd( a_03, x_n_3 );
		temp  = _mm256_blend_pd( zeros, temp, 8 );
		y_n_0 = _mm256_add_pd( y_n_0, temp );
		
		_mm256_storeu_pd( &z_n[0], y_n_0 );
		

		A += sda*bs;
		y_n += 4;
		z_n += 4;
		x_t += 4;

		k += 4;

		}

	for(; k<ka-7; k+=2*bs)
		{
		
		__builtin_prefetch( A + sda*bs +bs*0 );
		__builtin_prefetch( A + sda*bs +bs*2 );

		y_n_0 = _mm256_loadu_pd( &y_n[0] );
		x_t_0 = _mm256_loadu_pd( &x_t[0] );
		
		a_00 = _mm256_load_pd( &A[0+bs*0] );
		a_01 = _mm256_load_pd( &A[0+bs*1] );
		a_02 = _mm256_load_pd( &A[0+bs*2] );
		a_03 = _mm256_load_pd( &A[0+bs*3] );
		
		temp  = _mm256_mul_pd( a_00, x_n_0 );
		y_n_0 = _mm256_add_pd( y_n_0, temp );
		temp  = _mm256_mul_pd( a_00, x_t_0 );
		y_t_0 = _mm256_add_pd( y_t_0, temp );
		temp  = _mm256_mul_pd( a_01, x_n_1 );
		y_n_0 = _mm256_add_pd( y_n_0, temp );
		temp  = _mm256_mul_pd( a_01, x_t_0 );
		y_t_1 = _mm256_add_pd( y_t_1, temp );
		temp  = _mm256_mul_pd( a_02, x_n_2 );
		y_n_0 = _mm256_add_pd( y_n_0, temp );
		temp  = _mm256_mul_pd( a_02, x_t_0 );
		y_t_2 = _mm256_add_pd( y_t_2, temp );
		temp  = _mm256_mul_pd( a_03, x_n_3 );
		y_n_0 = _mm256_add_pd( y_n_0, temp );
		temp  = _mm256_mul_pd( a_03, x_t_0 );
		y_t_3 = _mm256_add_pd( y_t_3, temp );
		
		_mm256_storeu_pd( &z_n[0], y_n_0 );

		
		A += sda*bs;
		y_n += 4;
		z_n += 4;
		x_t += 4;

		__builtin_prefetch( A + sda*bs +bs*0 );
		__builtin_prefetch( A + sda*bs +bs*2 );

		y_n_0 = _mm256_loadu_pd( &y_n[0] );
		x_t_0 = _mm256_loadu_pd( &x_t[0] );
		
		a_00 = _mm256_load_pd( &A[0+bs*0] );
		a_01 = _mm256_load_pd( &A[0+bs*1] );
		a_02 = _mm256_load_pd( &A[0+bs*2] );
		a_03 = _mm256_load_pd( &A[0+bs*3] );
		
		temp  = _mm256_mul_pd( a_00, x_n_0 );
		y_n_0 = _mm256_add_pd( y_n_0, temp );
		temp  = _mm256_mul_pd( a_00, x_t_0 );
		y_t_0 = _mm256_add_pd( y_t_0, temp );
		temp  = _mm256_mul_pd( a_01, x_n_1 );
		y_n_0 = _mm256_add_pd( y_n_0, temp );
		temp  = _mm256_mul_pd( a_01, x_t_0 );
		y_t_1 = _mm256_add_pd( y_t_1, temp );
		temp  = _mm256_mul_pd( a_02, x_n_2 );
		y_n_0 = _mm256_add_pd( y_n_0, temp );
		temp  = _mm256_mul_pd( a_02, x_t_0 );
		y_t_2 = _mm256_add_pd( y_t_2, temp );
		temp  = _mm256_mul_pd( a_03, x_n_3 );
		y_n_0 = _mm256_add_pd( y_n_0, temp );
		temp  = _mm256_mul_pd( a_03, x_t_0 );
		y_t_3 = _mm256_add_pd( y_t_3, temp );
		
		_mm256_storeu_pd( &z_n[0], y_n_0 );

		
		A += sda*bs;
		y_n += 4;
		z_n += 4;
		x_t += 4;

		}

	for(; k<ka-3; k+=bs)
		{
		
		__builtin_prefetch( A + sda*bs +bs*0 );
		__builtin_prefetch( A + sda*bs +bs*2 );

		y_n_0 = _mm256_loadu_pd( &y_n[0] );
		x_t_0 = _mm256_loadu_pd( &x_t[0] );
		
		a_00 = _mm256_load_pd( &A[0+bs*0] );
		a_01 = _mm256_load_pd( &A[0+bs*1] );
		a_02 = _mm256_load_pd( &A[0+bs*2] );
		a_03 = _mm256_load_pd( &A[0+bs*3] );
		
		temp  = _mm256_mul_pd( a_00, x_n_0 );
		y_n_0 = _mm256_add_pd( y_n_0, temp );
		temp  = _mm256_mul_pd( a_00, x_t_0 );
		y_t_0 = _mm256_add_pd( y_t_0, temp );
		temp  = _mm256_mul_pd( a_01, x_n_1 );
		y_n_0 = _mm256_add_pd( y_n_0, temp );
		temp  = _mm256_mul_pd( a_01, x_t_0 );
		y_t_1 = _mm256_add_pd( y_t_1, temp );
		temp  = _mm256_mul_pd( a_02, x_n_2 );
		y_n_0 = _mm256_add_pd( y_n_0, temp );
		temp  = _mm256_mul_pd( a_02, x_t_0 );
		y_t_2 = _mm256_add_pd( y_t_2, temp );
		temp  = _mm256_mul_pd( a_03, x_n_3 );
		y_n_0 = _mm256_add_pd( y_n_0, temp );
		temp  = _mm256_mul_pd( a_03, x_t_0 );
		y_t_3 = _mm256_add_pd( y_t_3, temp );
		
		_mm256_storeu_pd( &z_n[0], y_n_0 );

		
		A += sda*bs;
		y_n += 4;
		z_n += 4;
		x_t += 4;

		}
	if(k<ka)
		{

		k_left = ka-k;
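		// d_mask[i] - k_left is negative exactly when i < k_left, so the sign
		// bits of the difference form the element mask used by maskload/maskstore
		// to handle the k_left trailing rows.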
		v_mask  = _mm256_sub_pd( _mm256_loadu_pd( d_mask ), _mm256_broadcast_sd( &k_left ) );
		i_mask  = _mm256_castpd_si256( v_mask );

//		__builtin_prefetch( A + sda*bs +bs*0 );
//		__builtin_prefetch( A + sda*bs +bs*2 );

		y_n_0 = _mm256_loadu_pd( &y_n[0] );
		x_t_0 = _mm256_maskload_pd( &x_t[0], i_mask );
		
		a_00 = _mm256_load_pd( &A[0+bs*0] );
		a_01 = _mm256_load_pd( &A[0+bs*1] );
		a_02 = _mm256_load_pd( &A[0+bs*2] );
		a_03 = _mm256_load_pd( &A[0+bs*3] );
		
		temp  = _mm256_mul_pd( a_00, x_n_0 );
		y_n_0 = _mm256_add_pd( y_n_0, temp );
		temp  = _mm256_mul_pd( a_00, x_t_0 );
		y_t_0 = _mm256_add_pd( y_t_0, temp );
		temp  = _mm256_mul_pd( a_01, x_n_1 );
		y_n_0 = _mm256_add_pd( y_n_0, temp );
		temp  = _mm256_mul_pd( a_01, x_t_0 );
		y_t_1 = _mm256_add_pd( y_t_1, temp );
		temp  = _mm256_mul_pd( a_02, x_n_2 );
		y_n_0 = _mm256_add_pd( y_n_0, temp );
		temp  = _mm256_mul_pd( a_02, x_t_0 );
		y_t_2 = _mm256_add_pd( y_t_2, temp );
		temp  = _mm256_mul_pd( a_03, x_n_3 );
		y_n_0 = _mm256_add_pd( y_n_0, temp );
		temp  = _mm256_mul_pd( a_03, x_t_0 );
		y_t_3 = _mm256_add_pd( y_t_3, temp );
		
		_mm256_maskstore_pd( &z_n[0], i_mask, y_n_0 );

		
//		A += sda*bs;
//		y_n += 4;
//		z_n += 4;
//		x_t += 4;

		}
	
	__m256d
		y_0_1_2_3;

	y_t_0 = _mm256_hadd_pd( y_t_0, y_t_1 );
	y_t_2 = _mm256_hadd_pd( y_t_2, y_t_3 );

	y_t_1 = _mm256_permute2f128_pd( y_t_2, y_t_0, 2  );	
	y_t_0 = _mm256_permute2f128_pd( y_t_2, y_t_0, 19 );	

	y_t_0 = _mm256_add_pd( y_t_0, y_t_1 );

	if(alg==1)
		{
		y_0_1_2_3 = _mm256_loadu_pd( &y_t[0] );
		y_0_1_2_3 = _mm256_add_pd( y_0_1_2_3, y_t_0 );
		_mm256_storeu_pd( &z_t[0], y_0_1_2_3 );
		}
	else // alg==-1
		{
		y_0_1_2_3 = _mm256_loadu_pd( &y_t[0] );
		y_0_1_2_3 = _mm256_sub_pd( y_0_1_2_3, y_t_0 );
		_mm256_storeu_pd( &z_t[0], y_0_1_2_3 );
		}
	
	}
Example #10
void rnn_int_d8x4_var2(
    int    k,
    double *aa,
    double *a,
    double *bb,
    double *b,
    double *c,
    aux_t  *aux
    )
{
  int    i;
  double neg2 = -2.0;
  double dzero = 0.0;
  v4df_t c03_0, c03_1, c03_2, c03_3;
  v4df_t c47_0, c47_1, c47_2, c47_3;
  v4df_t tmpc03_0, tmpc03_1, tmpc03_2, tmpc03_3;
  v4df_t tmpc47_0, tmpc47_1, tmpc47_2, tmpc47_3;
  v4df_t c_tmp;
  v4df_t a03, a47;
  v4df_t A03, A47; // prefetched A 

  v4df_t b0, b1, b2, b3;
  v4df_t B0; // prefetched B
  v4df_t aa_tmp, bb_tmp;


  int k_iter = k / 2;
  int k_left = k % 2;

  __asm__ volatile( "prefetcht0 0(%0)    \n\t" : :"r"( a ) );
  __asm__ volatile( "prefetcht2 0(%0)    \n\t" : :"r"( aux->b_next ) );
  __asm__ volatile( "prefetcht0 0(%0)    \n\t" : :"r"( c ) );


  c03_0.v = _mm256_setzero_pd();
  c03_1.v = _mm256_setzero_pd();
  c03_2.v = _mm256_setzero_pd();
  c03_3.v = _mm256_setzero_pd();
  c47_0.v = _mm256_setzero_pd();
  c47_1.v = _mm256_setzero_pd();
  c47_2.v = _mm256_setzero_pd();
  c47_3.v = _mm256_setzero_pd();


  // Load a03
  a03.v = _mm256_load_pd(      (double*)a         );
  // Load a47
  a47.v = _mm256_load_pd(      (double*)( a + 4 ) );
  // Load (b0,b1,b2,b3)
  b0.v  = _mm256_load_pd(      (double*)b         );
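
  // Main loop: unrolled by two (k_iter = k / 2). A03/A47 and B0 for the second
  // half of each iteration are preloaded while the first half executes; any odd
  // remaining update is handled by the k_left loop below.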

  for ( i = 0; i < k_iter; ++i ) {
    __asm__ volatile( "prefetcht0 192(%0)    \n\t" : :"r"(a) );

    // Preload A03
    A03.v = _mm256_load_pd(      (double*)( a + 8 ) );

    c_tmp.v = _mm256_mul_pd( a03.v  , b0.v    );
    c03_0.v = _mm256_add_pd( c_tmp.v, c03_0.v );
    c_tmp.v = _mm256_mul_pd( a47.v  , b0.v    );
    c47_0.v = _mm256_add_pd( c_tmp.v, c47_0.v );

    // Preload A47
    A47.v = _mm256_load_pd(      (double*)( a + 12 ) );

    // Shuffle b ( 1, 0, 3, 2 )
    b1.v  = _mm256_shuffle_pd( b0.v, b0.v, 0x5 );

    c_tmp.v = _mm256_mul_pd( a03.v  , b1.v    );
    c03_1.v = _mm256_add_pd( c_tmp.v, c03_1.v );
    c_tmp.v = _mm256_mul_pd( a47.v  , b1.v    );
    c47_1.v = _mm256_add_pd( c_tmp.v, c47_1.v );

    // Permute b ( 3, 2, 1, 0 )
    b2.v  = _mm256_permute2f128_pd( b1.v, b1.v, 0x1 );

    // Preload B0
    B0.v  = _mm256_load_pd(      (double*)( b + 4 ) );

    c_tmp.v = _mm256_mul_pd( a03.v  , b2.v    );
    c03_2.v = _mm256_add_pd( c_tmp.v, c03_2.v );
    c_tmp.v = _mm256_mul_pd( a47.v  , b2.v    );
    c47_2.v = _mm256_add_pd( c_tmp.v, c47_2.v );

    // Shuffle b ( 3, 2, 1, 0 )
    b3.v  = _mm256_shuffle_pd( b2.v, b2.v, 0x5 );

    c_tmp.v = _mm256_mul_pd( a03.v  , b3.v    );
    c03_3.v = _mm256_add_pd( c_tmp.v, c03_3.v );
    c_tmp.v = _mm256_mul_pd( a47.v  , b3.v    );
    c47_3.v = _mm256_add_pd( c_tmp.v, c47_3.v );


    // Iteration #1
    __asm__ volatile( "prefetcht0 512(%0)    \n\t" : :"r"(a) );

    // Preload a03 ( next iteration )
    a03.v = _mm256_load_pd(      (double*)( a + 16 ) );

    c_tmp.v = _mm256_mul_pd( A03.v  , B0.v    );
    c03_0.v = _mm256_add_pd( c_tmp.v, c03_0.v );

    b1.v  = _mm256_shuffle_pd( B0.v, B0.v, 0x5 );

    c_tmp.v = _mm256_mul_pd( A47.v  , B0.v    );
    c47_0.v = _mm256_add_pd( c_tmp.v, c47_0.v );
    c_tmp.v = _mm256_mul_pd( A03.v  , b1.v    );
    c03_1.v = _mm256_add_pd( c_tmp.v, c03_1.v );

    // Preload a47 ( next iteration )
    a47.v = _mm256_load_pd(      (double*)( a + 20 ) );

    // Permute b ( 3, 2, 1, 0 )
    b2.v  = _mm256_permute2f128_pd( b1.v, b1.v, 0x1 );

    c_tmp.v = _mm256_mul_pd( A47.v  , b1.v    );
    c47_1.v = _mm256_add_pd( c_tmp.v, c47_1.v );
    c_tmp.v = _mm256_mul_pd( A03.v  , b2.v    );
    c03_2.v = _mm256_add_pd( c_tmp.v, c03_2.v );

    // Shuffle b ( 2, 3, 0, 1 )
    b3.v  = _mm256_shuffle_pd( b2.v, b2.v, 0x5 );

    c_tmp.v = _mm256_mul_pd( A47.v  , b2.v    );
    c47_2.v = _mm256_add_pd( c_tmp.v, c47_2.v );

    // Load b0 ( next iteration )
    b0.v  = _mm256_load_pd(      (double*)( b + 8 ) );

    c_tmp.v = _mm256_mul_pd( A03.v  , b3.v    );
    c03_3.v = _mm256_add_pd( c_tmp.v, c03_3.v );
    c_tmp.v = _mm256_mul_pd( A47.v  , b3.v    );
    c47_3.v = _mm256_add_pd( c_tmp.v, c47_3.v );

    a += 16;
    b += 8;
  }

  for ( i = 0; i < k_left; ++i ) {
    a03.v = _mm256_load_pd(      (double*)a         );
    //printf( "a03 = %lf, %lf, %lf, %lf\n", a03.d[0], a03.d[1], a03.d[2], a03.d[3] );

    a47.v = _mm256_load_pd(      (double*)( a + 4 ) );
    //printf( "a47 = %lf, %lf, %lf, %lf\n", a47.d[0], a47.d[1], a47.d[2], a47.d[3] );

    b0.v  = _mm256_load_pd(      (double*)b         );
    //printf( "b0  = %lf, %lf, %lf, %lf\n", b0.d[0], b0.d[1], b0.d[2], b0.d[3] );

    c_tmp.v = _mm256_mul_pd( a03.v  , b0.v    );
    c03_0.v = _mm256_add_pd( c_tmp.v, c03_0.v );
    c_tmp.v = _mm256_mul_pd( a47.v  , b0.v    );
    c47_0.v = _mm256_add_pd( c_tmp.v, c47_0.v );

    // Shuffle b ( 1, 0, 3, 2 )
    b1.v  = _mm256_shuffle_pd( b0.v, b0.v, 0x5 );

    c_tmp.v = _mm256_mul_pd( a03.v  , b1.v    );
    c03_1.v = _mm256_add_pd( c_tmp.v, c03_1.v );
    c_tmp.v = _mm256_mul_pd( a47.v  , b1.v    );
    c47_1.v = _mm256_add_pd( c_tmp.v, c47_1.v );

    // Permute b ( 3, 2, 1, 0 )
    b2.v  = _mm256_permute2f128_pd( b1.v, b1.v, 0x1 );

    c_tmp.v = _mm256_mul_pd( a03.v  , b2.v    );
    c03_2.v = _mm256_add_pd( c_tmp.v, c03_2.v );
    c_tmp.v = _mm256_mul_pd( a47.v  , b2.v    );
    c47_2.v = _mm256_add_pd( c_tmp.v, c47_2.v );

    // Shuffle b ( 2, 3, 0, 1 )
    b3.v  = _mm256_shuffle_pd( b2.v, b2.v, 0x5 );

    c_tmp.v = _mm256_mul_pd( a03.v  , b3.v    );
    c03_3.v = _mm256_add_pd( c_tmp.v, c03_3.v );
    c_tmp.v = _mm256_mul_pd( a47.v  , b3.v    );
    c47_3.v = _mm256_add_pd( c_tmp.v, c47_3.v );

    a += 8;
    b += 4;
  }
 

  // Prefetch aa and bb
  __asm__ volatile( "prefetcht0 0(%0)    \n\t" : :"r"( aa ) );
  __asm__ volatile( "prefetcht0 0(%0)    \n\t" : :"r"( bb ) );


  tmpc03_0.v = _mm256_blend_pd( c03_0.v, c03_1.v, 0x6 );
  tmpc03_1.v = _mm256_blend_pd( c03_1.v, c03_0.v, 0x6 );
  
  tmpc03_2.v = _mm256_blend_pd( c03_2.v, c03_3.v, 0x6 );
  tmpc03_3.v = _mm256_blend_pd( c03_3.v, c03_2.v, 0x6 );

  tmpc47_0.v = _mm256_blend_pd( c47_0.v, c47_1.v, 0x6 );
  tmpc47_1.v = _mm256_blend_pd( c47_1.v, c47_0.v, 0x6 );

  tmpc47_2.v = _mm256_blend_pd( c47_2.v, c47_3.v, 0x6 );
  tmpc47_3.v = _mm256_blend_pd( c47_3.v, c47_2.v, 0x6 );

  c03_0.v    = _mm256_permute2f128_pd( tmpc03_0.v, tmpc03_2.v, 0x30 );
  c03_3.v    = _mm256_permute2f128_pd( tmpc03_2.v, tmpc03_0.v, 0x30 );

  c03_1.v    = _mm256_permute2f128_pd( tmpc03_1.v, tmpc03_3.v, 0x30 );
  c03_2.v    = _mm256_permute2f128_pd( tmpc03_3.v, tmpc03_1.v, 0x30 );

  c47_0.v    = _mm256_permute2f128_pd( tmpc47_0.v, tmpc47_2.v, 0x30 );
  c47_3.v    = _mm256_permute2f128_pd( tmpc47_2.v, tmpc47_0.v, 0x30 );

  c47_1.v    = _mm256_permute2f128_pd( tmpc47_1.v, tmpc47_3.v, 0x30 );
  c47_2.v    = _mm256_permute2f128_pd( tmpc47_3.v, tmpc47_1.v, 0x30 );

  //printf( "rank-k\n" );
  //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[0], c03_1.d[0], c03_2.d[0], c03_3.d[0] );
  //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[1], c03_1.d[1], c03_2.d[1], c03_3.d[1] );
  //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[2], c03_1.d[2], c03_2.d[2], c03_3.d[2] );
  //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[3], c03_1.d[3], c03_2.d[3], c03_3.d[3] );
  //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[0], c47_1.d[0], c47_2.d[0], c47_3.d[0] );
  //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[1], c47_1.d[1], c47_2.d[1], c47_3.d[1] );
  //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[2], c47_1.d[2], c47_2.d[2], c47_3.d[2] );
  //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[3], c47_1.d[3], c47_2.d[3], c47_3.d[3] );

  __asm__ volatile( "prefetcht0 0(%0)    \n\t" : :"r"( aux->I ) );
  __asm__ volatile( "prefetcht0 0(%0)    \n\t" : :"r"( aux->D ) );


  //for ( i = 0; i < k; i++ ) {
  //  a03.v = _mm256_load_pd(      (double*)a         );
  //  a47.v = _mm256_load_pd(      (double*)( a + 4 ) );
  //  b0.v  = _mm256_broadcast_sd( (double*)b         );
  //  b1.v  = _mm256_broadcast_sd( (double*)( b + 1 ) );
  //  b2.v  = _mm256_broadcast_sd( (double*)( b + 2 ) );
  //  b3.v  = _mm256_broadcast_sd( (double*)( b + 3 ) );

  //  a += DKS_MR;
  //  b += DKS_NR;

  //  c_tmp.v = _mm256_mul_pd( a03.v  , b0.v    );
  //  c03_0.v = _mm256_add_pd( c_tmp.v, c03_0.v );
  //  c_tmp.v = _mm256_mul_pd( a03.v  , b1.v    );
  //  c03_1.v = _mm256_add_pd( c_tmp.v, c03_1.v );
  //  c_tmp.v = _mm256_mul_pd( a03.v  , b2.v    );
  //  c03_2.v = _mm256_add_pd( c_tmp.v, c03_2.v );
  //  c_tmp.v = _mm256_mul_pd( a03.v  , b3.v    );
  //  c03_3.v = _mm256_add_pd( c_tmp.v, c03_3.v );

  //  c_tmp.v = _mm256_mul_pd( a47.v  , b0.v    );
  //  c47_0.v = _mm256_add_pd( c_tmp.v, c47_0.v );
  //  c_tmp.v = _mm256_mul_pd( a47.v  , b1.v    );
  //  c47_1.v = _mm256_add_pd( c_tmp.v, c47_1.v );
  //  c_tmp.v = _mm256_mul_pd( a47.v  , b2.v    );
  //  c47_2.v = _mm256_add_pd( c_tmp.v, c47_2.v );
  //  c_tmp.v = _mm256_mul_pd( a47.v  , b3.v    );
  //  c47_3.v = _mm256_add_pd( c_tmp.v, c47_3.v );
  //}
  
  aa_tmp.v = _mm256_broadcast_sd( &neg2 );
  //c03_0.v  = _mm256_mul_pd( aa_tmp.v, c03_0.v );
  //c03_1.v  = _mm256_mul_pd( aa_tmp.v, c03_1.v );
  //c03_2.v  = _mm256_mul_pd( aa_tmp.v, c03_2.v );
  //c03_3.v  = _mm256_mul_pd( aa_tmp.v, c03_3.v );
  //c47_0.v  = _mm256_mul_pd( aa_tmp.v, c47_0.v );
  //c47_1.v  = _mm256_mul_pd( aa_tmp.v, c47_1.v );
  //c47_2.v  = _mm256_mul_pd( aa_tmp.v, c47_2.v );
  //c47_3.v  = _mm256_mul_pd( aa_tmp.v, c47_3.v );
  //
  c03_0.v  = _mm256_mul_pd( aa_tmp.v, c03_0.v );
  c03_1.v  = _mm256_mul_pd( aa_tmp.v, c03_1.v );
  c03_2.v  = _mm256_mul_pd( aa_tmp.v, c03_2.v );
  c03_3.v  = _mm256_mul_pd( aa_tmp.v, c03_3.v );
  c47_0.v  = _mm256_mul_pd( aa_tmp.v, c47_0.v );
  c47_1.v  = _mm256_mul_pd( aa_tmp.v, c47_1.v );
  c47_2.v  = _mm256_mul_pd( aa_tmp.v, c47_2.v );
  c47_3.v  = _mm256_mul_pd( aa_tmp.v, c47_3.v );


  //printf( "scale -2 \n" );
  //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[0], c03_1.d[0], c03_2.d[0], c03_3.d[0] );
  //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[1], c03_1.d[1], c03_2.d[1], c03_3.d[1] );
  //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[2], c03_1.d[2], c03_2.d[2], c03_3.d[2] );
  //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[3], c03_1.d[3], c03_2.d[3], c03_3.d[3] );
  //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[0], c47_1.d[0], c47_2.d[0], c47_3.d[0] );
  //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[1], c47_1.d[1], c47_2.d[1], c47_3.d[1] );
  //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[2], c47_1.d[2], c47_2.d[2], c47_3.d[2] );
  //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[3], c47_1.d[3], c47_2.d[3], c47_3.d[3] );


  aa_tmp.v = _mm256_load_pd( (double*)aa );
  c03_0.v  = _mm256_add_pd( aa_tmp.v, c03_0.v );
  c03_1.v  = _mm256_add_pd( aa_tmp.v, c03_1.v );
  c03_2.v  = _mm256_add_pd( aa_tmp.v, c03_2.v );
  c03_3.v  = _mm256_add_pd( aa_tmp.v, c03_3.v );

  //printf( "aa03 = %lf, %lf, %lf, %lf\n", aa_tmp.d[0], aa_tmp.d[1], aa_tmp.d[2], aa_tmp.d[3] );
  //printf( "bb03 = %lf, %lf, %lf, %lf\n", bb[ 0 ], bb[ 1 ], bb[ 2 ], bb[ 3 ] );

  aa_tmp.v = _mm256_load_pd( (double*)( aa + 4 ) );
  c47_0.v  = _mm256_add_pd( aa_tmp.v, c47_0.v );
  c47_1.v  = _mm256_add_pd( aa_tmp.v, c47_1.v );
  c47_2.v  = _mm256_add_pd( aa_tmp.v, c47_2.v );
  c47_3.v  = _mm256_add_pd( aa_tmp.v, c47_3.v );
  

  //printf( "add a^2\n" );
  //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[0], c03_1.d[0], c03_2.d[0], c03_3.d[0] );
  //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[1], c03_1.d[1], c03_2.d[1], c03_3.d[1] );
  //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[2], c03_1.d[2], c03_2.d[2], c03_3.d[2] );
  //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[3], c03_1.d[3], c03_2.d[3], c03_3.d[3] );
  //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[0], c47_1.d[0], c47_2.d[0], c47_3.d[0] );
  //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[1], c47_1.d[1], c47_2.d[1], c47_3.d[1] );
  //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[2], c47_1.d[2], c47_2.d[2], c47_3.d[2] );
  //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[3], c47_1.d[3], c47_2.d[3], c47_3.d[3] );


  bb_tmp.v = _mm256_broadcast_sd( (double*)bb );
  c03_0.v  = _mm256_add_pd( bb_tmp.v, c03_0.v );
  c47_0.v  = _mm256_add_pd( bb_tmp.v, c47_0.v );

  bb_tmp.v = _mm256_broadcast_sd( (double*)( bb + 1 ) );
  c03_1.v  = _mm256_add_pd( bb_tmp.v, c03_1.v );
  c47_1.v  = _mm256_add_pd( bb_tmp.v, c47_1.v );

  bb_tmp.v = _mm256_broadcast_sd( (double*)( bb + 2 ) );
  c03_2.v  = _mm256_add_pd( bb_tmp.v, c03_2.v );
  c47_2.v  = _mm256_add_pd( bb_tmp.v, c47_2.v );

  bb_tmp.v = _mm256_broadcast_sd( (double*)( bb + 3 ) );
  c03_3.v  = _mm256_add_pd( bb_tmp.v, c03_3.v );
  c47_3.v  = _mm256_add_pd( bb_tmp.v, c47_3.v );



  // Check if there is any illegal value (clamp negative values to zero)
  c_tmp.v  = _mm256_broadcast_sd( &dzero );
  c03_0.v  = _mm256_max_pd( c_tmp.v, c03_0.v );
  c03_1.v  = _mm256_max_pd( c_tmp.v, c03_1.v );
  c03_2.v  = _mm256_max_pd( c_tmp.v, c03_2.v );
  c03_3.v  = _mm256_max_pd( c_tmp.v, c03_3.v );
  c47_0.v  = _mm256_max_pd( c_tmp.v, c47_0.v );
  c47_1.v  = _mm256_max_pd( c_tmp.v, c47_1.v );
  c47_2.v  = _mm256_max_pd( c_tmp.v, c47_2.v );
  c47_3.v  = _mm256_max_pd( c_tmp.v, c47_3.v );


  // Transpose c03/c47 _0, _1, _2, _3 into row vectors
  tmpc03_0.v = _mm256_shuffle_pd( c03_0.v, c03_1.v, 0x0 );
  tmpc03_1.v = _mm256_shuffle_pd( c03_0.v, c03_1.v, 0xF );

  tmpc03_2.v = _mm256_shuffle_pd( c03_2.v, c03_3.v, 0x0 );
  tmpc03_3.v = _mm256_shuffle_pd( c03_2.v, c03_3.v, 0xF );

  tmpc47_0.v = _mm256_shuffle_pd( c47_0.v, c47_1.v, 0x0 );
  tmpc47_1.v = _mm256_shuffle_pd( c47_0.v, c47_1.v, 0xF );

  tmpc47_2.v = _mm256_shuffle_pd( c47_2.v, c47_3.v, 0x0 );
  tmpc47_3.v = _mm256_shuffle_pd( c47_2.v, c47_3.v, 0xF );

  c03_0.v    = _mm256_permute2f128_pd( tmpc03_0.v, tmpc03_2.v, 0x20 );
  c03_2.v    = _mm256_permute2f128_pd( tmpc03_0.v, tmpc03_2.v, 0x31 );

  c03_1.v    = _mm256_permute2f128_pd( tmpc03_1.v, tmpc03_3.v, 0x20 );
  c03_3.v    = _mm256_permute2f128_pd( tmpc03_1.v, tmpc03_3.v, 0x31 );

  c47_0.v    = _mm256_permute2f128_pd( tmpc47_0.v, tmpc47_2.v, 0x20 );
  c47_2.v    = _mm256_permute2f128_pd( tmpc47_0.v, tmpc47_2.v, 0x31 );

  c47_1.v    = _mm256_permute2f128_pd( tmpc47_1.v, tmpc47_3.v, 0x20 );
  c47_3.v    = _mm256_permute2f128_pd( tmpc47_1.v, tmpc47_3.v, 0x31 );


  // c03_0;
  // c03_1;
  // c03_2;
  // c03_3;
  // c47_0;
  // c47_1;
  // c47_2;
  // c47_3;


  _mm256_store_pd( c     , c03_0.v );
  _mm256_store_pd( c +  4, c03_1.v );
  _mm256_store_pd( c +  8, c03_2.v );
  _mm256_store_pd( c + 12, c03_3.v );
  _mm256_store_pd( c + 16, c47_0.v );
  _mm256_store_pd( c + 20, c47_1.v );
  _mm256_store_pd( c + 24, c47_2.v );
  _mm256_store_pd( c + 28, c47_3.v );
}
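
Judging from the intrinsics, this 8x4 micro-kernel computes a block of squared pairwise distances: the rank-k product of the packed panels a and b is scaled by -2, the precomputed squared norms aa[i] and bb[j] are added, negatives caused by rounding are clamped to zero, the block is transposed in-register, and the result is stored row by row into c. A minimal scalar sketch of that computation follows; the packing layout (a stored 8-wide by k, b stored 4-wide by k) is inferred from the loads and is an assumption, not taken from the original source.

/* Hypothetical scalar reference for what rnn_int_d8x4_var2 appears to compute:
   an 8x4 block of squared Euclidean distances, assuming a is packed 8-wide
   by k, b is packed 4-wide by k, aa[i] = |a_i|^2 and bb[j] = |b_j|^2. */
static void rnn_d8x4_scalar_sketch( int k, const double *aa, const double *a,
                                    const double *bb, const double *b, double *c )
{
  for ( int i = 0; i < 8; i ++ ) {
    for ( int j = 0; j < 4; j ++ ) {
      double dot = 0.0;
      for ( int p = 0; p < k; p ++ ) {
        dot += a[ p * 8 + i ] * b[ p * 4 + j ];   // rank-k update
      }
      double d2 = -2.0 * dot + aa[ i ] + bb[ j ]; // |a_i - b_j|^2
      c[ i * 4 + j ] = d2 > 0.0 ? d2 : 0.0;       // clamp rounding error
    }
  }
}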
Example #11
void ks_gaussian_int_d8x4(
    int    k,
    double alpha,
    double *u,
    double *aa,
    double *a,
    double *bb,
    double *b,
    double *w,
    aux_t  *aux
    )
{
  int    i;
  double neg2 = -2.0;
  double dzero = 0.0;

  v4df_t c03_0, c03_1, c03_2, c03_3;
  v4df_t c47_0, c47_1, c47_2, c47_3;
  v4df_t tmpc03_0, tmpc03_1, tmpc03_2, tmpc03_3;
  v4df_t tmpc47_0, tmpc47_1, tmpc47_2, tmpc47_3;
  v4df_t c_tmp;
  v4df_t u03;
  v4df_t u47;
  v4df_t a03, a47;
  v4df_t A03, A47; // prefetched A 

  v4df_t b0, b1, b2, b3;
  v4df_t B0; // prefetched B

  v4df_t aa_tmp, bb_tmp;
  v4df_t w_tmp;


  //// Inline vdExp()
  //const double log2e  =  1.4426950408889634073599;
  //const double maxlog =  7.09782712893383996843e2; // log( 2**1024 )
  //const double minlog = -7.08396418532264106224e2; // log( 2**-1024 )
  //const double one    =  1.0;
  //const double c1     =  6.93145751953125E-1;
  //const double c2     =  1.42860682030941723212E-6;

  //// Original Remez Order 11 coefficients
  //const double w11    =  3.5524625185478232665958141148891055719216674475023e-8;
  //const double w10    =  2.5535368519306500343384723775435166753084614063349e-7;
  //const double w9     =  2.77750562801295315877005242757916081614772210463065e-6;
  //const double w8     =  2.47868893393199945541176652007657202642495832996107e-5;
  //const double w7     =  1.98419213985637881240770890090795533564573406893163e-4;
  //const double w6     =  1.3888869684178659239014256260881685824525255547326e-3;
  //const double w5     =  8.3333337052009872221152811550156335074160546333973e-3;
  //const double w4     =  4.1666666621080810610346717440523105184720007971655e-2;
  //const double w3     =  0.166666666669960803484477734308515404418108830469798;
  //const double w2     =  0.499999999999877094481580370323249951329122224389189;
  //const double w1     =  1.0000000000000017952745258419615282194236357388884;
  //const double w0     =  0.99999999999999999566016490920259318691496540598896;

  // Remez Order 11 polynomial approximation
  //const double w0     =  9.9999999999999999694541216787022234814339814028865e-1;
  //const double w1     =  1.0000000000000013347525109964212249781265243645457;
  //const double w2     =  4.9999999999990426011279542064313207349934058355357e-1;
  //const double w3     =  1.6666666666933781279020916199156875162816850273886e-1;
  //const double w4     =  4.1666666628388978913396218847247771982698350546174e-2;
  //const double w5     =  8.3333336552944126722390410619859929515740995889372e-3;
  //const double w6     =  1.3888871805082296012945081624687544823497126781709e-3;
  //const double w7     =  1.9841863599469418342286677256362193951266072398489e-4;
  //const double w8     =  2.4787899938611697691690479138150629377630767114546e-5;
  //const double w9     =  2.7764095757136528235740765949934667970688427190168e-6;
  //const double w10    =  2.5602485412126369546033948405199058329040797134573e-7;
  //const double w11    =  3.5347283721656121939634391175390704621351283546671e-8;

  // Remez Order 9 polynomial approximation
//  const double w0     =  9.9999999999998657717890998293462356769270934668652e-1;
//  const double w1     =  1.0000000000041078023971691258305486059867172736079;
//  const double w2     =  4.9999999979496223000111361187419539211772440139043e-1;
//  const double w3     =  1.6666667059968250851708016603646727895353772273675e-1;
//  const double w4     =  4.1666628655740875994884332519499013211594753124142e-2;
//  const double w5     =  8.3335428149736685441705398632467122758546893330069e-3;
//  const double w6     =  1.3881912931358424526285652289974115047170651985345e-3;
//  const double w7     =  1.9983735415194021112767942931416179152416729204150e-4;
//  const double w8     =  2.3068467290270483679711135625155862511780587976925e-5;
//  const double w9     =  3.8865682386514872192656192137071689334005518164704e-6;




  //v4df_t a03_0, a03_1, a03_2, a03_3;
  //v4df_t a47_0, a47_1, a47_2, a47_3;
  //v4df_t p03_0, p03_1, p03_2, p03_3;
  //v4df_t p47_0, p47_1, p47_2, p47_3;
  //v4df_t y, l2e, tmp, p;
  //v4li_t k03_0, k03_1, k03_2, k03_3;
  //v4li_t k47_0, k47_1, k47_2, k47_3;
  //v4li_t offset;
  //v4li_t k1, k2;
  //__m128d p1, p2;









  int k_iter = k / 2;
  int k_left = k % 2;

  __asm__ volatile( "prefetcht0 0(%0)    \n\t" : :"r"( a ) );
  __asm__ volatile( "prefetcht2 0(%0)    \n\t" : :"r"( aux->b_next ) );


  c03_0.v = _mm256_setzero_pd();
  c03_1.v = _mm256_setzero_pd();
  c03_2.v = _mm256_setzero_pd();
  c03_3.v = _mm256_setzero_pd();
  c47_0.v = _mm256_setzero_pd();
  c47_1.v = _mm256_setzero_pd();
  c47_2.v = _mm256_setzero_pd();
  c47_3.v = _mm256_setzero_pd();


  // Load a03
  a03.v = _mm256_load_pd(      (double*)a         );
  // Load a47
  a47.v = _mm256_load_pd(      (double*)( a + 4 ) );
  // Load (b0,b1,b2,b3)
  b0.v  = _mm256_load_pd(      (double*)b         );

  for ( i = 0; i < k_iter; ++i ) {
    __asm__ volatile( "prefetcht0 192(%0)    \n\t" : :"r"(a) );

    // Preload A03
    A03.v = _mm256_load_pd(      (double*)( a + 8 ) );

    c_tmp.v = _mm256_mul_pd( a03.v  , b0.v    );
    c03_0.v = _mm256_add_pd( c_tmp.v, c03_0.v );
    c_tmp.v = _mm256_mul_pd( a47.v  , b0.v    );
    c47_0.v = _mm256_add_pd( c_tmp.v, c47_0.v );

    // Preload A47
    A47.v = _mm256_load_pd(      (double*)( a + 12 ) );

    // Shuffle b ( 1, 0, 3, 2 )
    b1.v  = _mm256_shuffle_pd( b0.v, b0.v, 0x5 );

    c_tmp.v = _mm256_mul_pd( a03.v  , b1.v    );
    c03_1.v = _mm256_add_pd( c_tmp.v, c03_1.v );
    c_tmp.v = _mm256_mul_pd( a47.v  , b1.v    );
    c47_1.v = _mm256_add_pd( c_tmp.v, c47_1.v );

    // Permute b ( 3, 2, 1, 0 )
    b2.v  = _mm256_permute2f128_pd( b1.v, b1.v, 0x1 );

    // Preload B0
    B0.v  = _mm256_load_pd(      (double*)( b + 4 ) );

    c_tmp.v = _mm256_mul_pd( a03.v  , b2.v    );
    c03_2.v = _mm256_add_pd( c_tmp.v, c03_2.v );
    c_tmp.v = _mm256_mul_pd( a47.v  , b2.v    );
    c47_2.v = _mm256_add_pd( c_tmp.v, c47_2.v );

    // Shuffle b ( 2, 3, 0, 1 )
    b3.v  = _mm256_shuffle_pd( b2.v, b2.v, 0x5 );

    c_tmp.v = _mm256_mul_pd( a03.v  , b3.v    );
    c03_3.v = _mm256_add_pd( c_tmp.v, c03_3.v );
    c_tmp.v = _mm256_mul_pd( a47.v  , b3.v    );
    c47_3.v = _mm256_add_pd( c_tmp.v, c47_3.v );


    // Iteration #1
    __asm__ volatile( "prefetcht0 512(%0)    \n\t" : :"r"(a) );

    // Preload a03 ( next iteration )
    a03.v = _mm256_load_pd(      (double*)( a + 16 ) );

    c_tmp.v = _mm256_mul_pd( A03.v  , B0.v    );
    c03_0.v = _mm256_add_pd( c_tmp.v, c03_0.v );

    b1.v  = _mm256_shuffle_pd( B0.v, B0.v, 0x5 );

    c_tmp.v = _mm256_mul_pd( A47.v  , B0.v    );
    c47_0.v = _mm256_add_pd( c_tmp.v, c47_0.v );
    c_tmp.v = _mm256_mul_pd( A03.v  , b1.v    );
    c03_1.v = _mm256_add_pd( c_tmp.v, c03_1.v );

    // Preload a47 ( next iteration )
    a47.v = _mm256_load_pd(      (double*)( a + 20 ) );

    // Permute b ( 3, 2, 1, 0 )
    b2.v  = _mm256_permute2f128_pd( b1.v, b1.v, 0x1 );

    c_tmp.v = _mm256_mul_pd( A47.v  , b1.v    );
    c47_1.v = _mm256_add_pd( c_tmp.v, c47_1.v );
    c_tmp.v = _mm256_mul_pd( A03.v  , b2.v    );
    c03_2.v = _mm256_add_pd( c_tmp.v, c03_2.v );

    // Shuffle b ( 2, 3, 0, 1 )
    b3.v  = _mm256_shuffle_pd( b2.v, b2.v, 0x5 );

    c_tmp.v = _mm256_mul_pd( A47.v  , b2.v    );
    c47_2.v = _mm256_add_pd( c_tmp.v, c47_2.v );

    // Load b0 ( next iteration )
    b0.v  = _mm256_load_pd(      (double*)( b + 8 ) );

    c_tmp.v = _mm256_mul_pd( A03.v  , b3.v    );
    c03_3.v = _mm256_add_pd( c_tmp.v, c03_3.v );
    c_tmp.v = _mm256_mul_pd( A47.v  , b3.v    );
    c47_3.v = _mm256_add_pd( c_tmp.v, c47_3.v );

    a += 16;
    b += 8;
  }

  for ( i = 0; i < k_left; ++i ) {
    a03.v = _mm256_load_pd(      (double*)a         );
    //printf( "a03 = %lf, %lf, %lf, %lf\n", a03.d[0], a03.d[1], a03.d[2], a03.d[3] );

    a47.v = _mm256_load_pd(      (double*)( a + 4 ) );
    //printf( "a47 = %lf, %lf, %lf, %lf\n", a47.d[0], a47.d[1], a47.d[2], a47.d[3] );

    b0.v  = _mm256_load_pd(      (double*)b         );
    //printf( "b0  = %lf, %lf, %lf, %lf\n", b0.d[0], b0.d[1], b0.d[2], b0.d[3] );

    c_tmp.v = _mm256_mul_pd( a03.v  , b0.v    );
    c03_0.v = _mm256_add_pd( c_tmp.v, c03_0.v );
    c_tmp.v = _mm256_mul_pd( a47.v  , b0.v    );
    c47_0.v = _mm256_add_pd( c_tmp.v, c47_0.v );

    // Shuffle b ( 1, 0, 3, 2 )
    b1.v  = _mm256_shuffle_pd( b0.v, b0.v, 0x5 );

    c_tmp.v = _mm256_mul_pd( a03.v  , b1.v    );
    c03_1.v = _mm256_add_pd( c_tmp.v, c03_1.v );
    c_tmp.v = _mm256_mul_pd( a47.v  , b1.v    );
    c47_1.v = _mm256_add_pd( c_tmp.v, c47_1.v );

    // Permute b ( 3, 2, 1, 0 )
    b2.v  = _mm256_permute2f128_pd( b1.v, b1.v, 0x1 );

    c_tmp.v = _mm256_mul_pd( a03.v  , b2.v    );
    c03_2.v = _mm256_add_pd( c_tmp.v, c03_2.v );
    c_tmp.v = _mm256_mul_pd( a47.v  , b2.v    );
    c47_2.v = _mm256_add_pd( c_tmp.v, c47_2.v );

    // Shuffle b ( 2, 3, 0, 1 )
    b3.v  = _mm256_shuffle_pd( b2.v, b2.v, 0x5 );

    c_tmp.v = _mm256_mul_pd( a03.v  , b3.v    );
    c03_3.v = _mm256_add_pd( c_tmp.v, c03_3.v );
    c_tmp.v = _mm256_mul_pd( a47.v  , b3.v    );
    c47_3.v = _mm256_add_pd( c_tmp.v, c47_3.v );

    a += 8;
    b += 4;
  }
 

  // Prefetch aa and bb
  __asm__ volatile( "prefetcht0 0(%0)    \n\t" : :"r"( aa ) );
  __asm__ volatile( "prefetcht0 0(%0)    \n\t" : :"r"( bb ) );


  tmpc03_0.v = _mm256_blend_pd( c03_0.v, c03_1.v, 0x6 );
  tmpc03_1.v = _mm256_blend_pd( c03_1.v, c03_0.v, 0x6 );
  
  tmpc03_2.v = _mm256_blend_pd( c03_2.v, c03_3.v, 0x6 );
  tmpc03_3.v = _mm256_blend_pd( c03_3.v, c03_2.v, 0x6 );

  tmpc47_0.v = _mm256_blend_pd( c47_0.v, c47_1.v, 0x6 );
  tmpc47_1.v = _mm256_blend_pd( c47_1.v, c47_0.v, 0x6 );

  tmpc47_2.v = _mm256_blend_pd( c47_2.v, c47_3.v, 0x6 );
  tmpc47_3.v = _mm256_blend_pd( c47_3.v, c47_2.v, 0x6 );

  c03_0.v    = _mm256_permute2f128_pd( tmpc03_0.v, tmpc03_2.v, 0x30 );
  c03_3.v    = _mm256_permute2f128_pd( tmpc03_2.v, tmpc03_0.v, 0x30 );

  c03_1.v    = _mm256_permute2f128_pd( tmpc03_1.v, tmpc03_3.v, 0x30 );
  c03_2.v    = _mm256_permute2f128_pd( tmpc03_3.v, tmpc03_1.v, 0x30 );

  c47_0.v    = _mm256_permute2f128_pd( tmpc47_0.v, tmpc47_2.v, 0x30 );
  c47_3.v    = _mm256_permute2f128_pd( tmpc47_2.v, tmpc47_0.v, 0x30 );

  c47_1.v    = _mm256_permute2f128_pd( tmpc47_1.v, tmpc47_3.v, 0x30 );
  c47_2.v    = _mm256_permute2f128_pd( tmpc47_3.v, tmpc47_1.v, 0x30 );

  //printf( "rank-k\n" );
  //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[0], c03_1.d[0], c03_2.d[0], c03_3.d[0] );
  //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[1], c03_1.d[1], c03_2.d[1], c03_3.d[1] );
  //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[2], c03_1.d[2], c03_2.d[2], c03_3.d[2] );
  //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[3], c03_1.d[3], c03_2.d[3], c03_3.d[3] );
  //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[0], c47_1.d[0], c47_2.d[0], c47_3.d[0] );
  //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[1], c47_1.d[1], c47_2.d[1], c47_3.d[1] );
  //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[2], c47_1.d[2], c47_2.d[2], c47_3.d[2] );
  //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[3], c47_1.d[3], c47_2.d[3], c47_3.d[3] );



  //for ( i = 0; i < k; i++ ) {
  //  a03.v = _mm256_load_pd(      (double*)a         );
  //  a47.v = _mm256_load_pd(      (double*)( a + 4 ) );
  //  b0.v  = _mm256_broadcast_sd( (double*)b         );
  //  b1.v  = _mm256_broadcast_sd( (double*)( b + 1 ) );
  //  b2.v  = _mm256_broadcast_sd( (double*)( b + 2 ) );
  //  b3.v  = _mm256_broadcast_sd( (double*)( b + 3 ) );

  //  a += DKS_MR;
  //  b += DKS_NR;

  //  c_tmp.v = _mm256_mul_pd( a03.v  , b0.v    );
  //  c03_0.v = _mm256_add_pd( c_tmp.v, c03_0.v );
  //  c_tmp.v = _mm256_mul_pd( a03.v  , b1.v    );
  //  c03_1.v = _mm256_add_pd( c_tmp.v, c03_1.v );
  //  c_tmp.v = _mm256_mul_pd( a03.v  , b2.v    );
  //  c03_2.v = _mm256_add_pd( c_tmp.v, c03_2.v );
  //  c_tmp.v = _mm256_mul_pd( a03.v  , b3.v    );
  //  c03_3.v = _mm256_add_pd( c_tmp.v, c03_3.v );

  //  c_tmp.v = _mm256_mul_pd( a47.v  , b0.v    );
  //  c47_0.v = _mm256_add_pd( c_tmp.v, c47_0.v );
  //  c_tmp.v = _mm256_mul_pd( a47.v  , b1.v    );
  //  c47_1.v = _mm256_add_pd( c_tmp.v, c47_1.v );
  //  c_tmp.v = _mm256_mul_pd( a47.v  , b2.v    );
  //  c47_2.v = _mm256_add_pd( c_tmp.v, c47_2.v );
  //  c_tmp.v = _mm256_mul_pd( a47.v  , b3.v    );
  //  c47_3.v = _mm256_add_pd( c_tmp.v, c47_3.v );
  //}
  
  aa_tmp.v = _mm256_broadcast_sd( &neg2 );
  //c03_0.v  = _mm256_mul_pd( aa_tmp.v, c03_0.v );
  //c03_1.v  = _mm256_mul_pd( aa_tmp.v, c03_1.v );
  //c03_2.v  = _mm256_mul_pd( aa_tmp.v, c03_2.v );
  //c03_3.v  = _mm256_mul_pd( aa_tmp.v, c03_3.v );
  //c47_0.v  = _mm256_mul_pd( aa_tmp.v, c47_0.v );
  //c47_1.v  = _mm256_mul_pd( aa_tmp.v, c47_1.v );
  //c47_2.v  = _mm256_mul_pd( aa_tmp.v, c47_2.v );
  //c47_3.v  = _mm256_mul_pd( aa_tmp.v, c47_3.v );
  //
  c03_0.v  = _mm256_mul_pd( aa_tmp.v, c03_0.v );
  c03_1.v  = _mm256_mul_pd( aa_tmp.v, c03_1.v );
  c03_2.v  = _mm256_mul_pd( aa_tmp.v, c03_2.v );
  c03_3.v  = _mm256_mul_pd( aa_tmp.v, c03_3.v );
  c47_0.v  = _mm256_mul_pd( aa_tmp.v, c47_0.v );
  c47_1.v  = _mm256_mul_pd( aa_tmp.v, c47_1.v );
  c47_2.v  = _mm256_mul_pd( aa_tmp.v, c47_2.v );
  c47_3.v  = _mm256_mul_pd( aa_tmp.v, c47_3.v );


  //printf( "scale -2 \n" );
  //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[0], c03_1.d[0], c03_2.d[0], c03_3.d[0] );
  //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[1], c03_1.d[1], c03_2.d[1], c03_3.d[1] );
  //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[2], c03_1.d[2], c03_2.d[2], c03_3.d[2] );
  //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[3], c03_1.d[3], c03_2.d[3], c03_3.d[3] );
  //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[0], c47_1.d[0], c47_2.d[0], c47_3.d[0] );
  //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[1], c47_1.d[1], c47_2.d[1], c47_3.d[1] );
  //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[2], c47_1.d[2], c47_2.d[2], c47_3.d[2] );
  //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[3], c47_1.d[3], c47_2.d[3], c47_3.d[3] );


  aa_tmp.v = _mm256_load_pd( (double*)aa );
  c03_0.v  = _mm256_add_pd( aa_tmp.v, c03_0.v );
  c03_1.v  = _mm256_add_pd( aa_tmp.v, c03_1.v );
  c03_2.v  = _mm256_add_pd( aa_tmp.v, c03_2.v );
  c03_3.v  = _mm256_add_pd( aa_tmp.v, c03_3.v );

  //printf( "aa03 = %lf, %lf, %lf, %lf\n", aa_tmp.d[0], aa_tmp.d[1], aa_tmp.d[2], aa_tmp.d[3] );
  //printf( "bb03 = %lf, %lf, %lf, %lf\n", bb[ 0 ], bb[ 1 ], bb[ 2 ], bb[ 3 ] );

  aa_tmp.v = _mm256_load_pd( (double*)( aa + 4 ) );
  c47_0.v  = _mm256_add_pd( aa_tmp.v, c47_0.v );
  c47_1.v  = _mm256_add_pd( aa_tmp.v, c47_1.v );
  c47_2.v  = _mm256_add_pd( aa_tmp.v, c47_2.v );
  c47_3.v  = _mm256_add_pd( aa_tmp.v, c47_3.v );
  

  //printf( "add a^2\n" );
  //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[0], c03_1.d[0], c03_2.d[0], c03_3.d[0] );
  //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[1], c03_1.d[1], c03_2.d[1], c03_3.d[1] );
  //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[2], c03_1.d[2], c03_2.d[2], c03_3.d[2] );
  //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[3], c03_1.d[3], c03_2.d[3], c03_3.d[3] );
  //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[0], c47_1.d[0], c47_2.d[0], c47_3.d[0] );
  //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[1], c47_1.d[1], c47_2.d[1], c47_3.d[1] );
  //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[2], c47_1.d[2], c47_2.d[2], c47_3.d[2] );
  //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[3], c47_1.d[3], c47_2.d[3], c47_3.d[3] );


  // Prefetch u
  __asm__ volatile( "prefetcht0 0(%0)    \n\t" : :"r"( u ) );


  bb_tmp.v = _mm256_broadcast_sd( (double*)bb );
  c03_0.v  = _mm256_add_pd( bb_tmp.v, c03_0.v );
  c47_0.v  = _mm256_add_pd( bb_tmp.v, c47_0.v );

  bb_tmp.v = _mm256_broadcast_sd( (double*)( bb + 1 ) );
  c03_1.v  = _mm256_add_pd( bb_tmp.v, c03_1.v );
  c47_1.v  = _mm256_add_pd( bb_tmp.v, c47_1.v );

  bb_tmp.v = _mm256_broadcast_sd( (double*)( bb + 2 ) );
  c03_2.v  = _mm256_add_pd( bb_tmp.v, c03_2.v );
  c47_2.v  = _mm256_add_pd( bb_tmp.v, c47_2.v );

  bb_tmp.v = _mm256_broadcast_sd( (double*)( bb + 3 ) );
  c03_3.v  = _mm256_add_pd( bb_tmp.v, c03_3.v );
  c47_3.v  = _mm256_add_pd( bb_tmp.v, c47_3.v );



  // Check if there is any illegal value (clamp negative values to zero)
  c_tmp.v  = _mm256_broadcast_sd( &dzero );
  c03_0.v  = _mm256_max_pd( c_tmp.v, c03_0.v );
  c03_1.v  = _mm256_max_pd( c_tmp.v, c03_1.v );
  c03_2.v  = _mm256_max_pd( c_tmp.v, c03_2.v );
  c03_3.v  = _mm256_max_pd( c_tmp.v, c03_3.v );
  c47_0.v  = _mm256_max_pd( c_tmp.v, c47_0.v );
  c47_1.v  = _mm256_max_pd( c_tmp.v, c47_1.v );
  c47_2.v  = _mm256_max_pd( c_tmp.v, c47_2.v );
  c47_3.v  = _mm256_max_pd( c_tmp.v, c47_3.v );



  // Scale before the kernel evaluation
  aa_tmp.v = _mm256_broadcast_sd( &alpha );
  c03_0.v  = _mm256_mul_pd( aa_tmp.v, c03_0.v );
  c03_1.v  = _mm256_mul_pd( aa_tmp.v, c03_1.v );
  c03_2.v  = _mm256_mul_pd( aa_tmp.v, c03_2.v );
  c03_3.v  = _mm256_mul_pd( aa_tmp.v, c03_3.v );
  c47_0.v  = _mm256_mul_pd( aa_tmp.v, c47_0.v );
  c47_1.v  = _mm256_mul_pd( aa_tmp.v, c47_1.v );
  c47_2.v  = _mm256_mul_pd( aa_tmp.v, c47_2.v );
  c47_3.v  = _mm256_mul_pd( aa_tmp.v, c47_3.v );


  // Preload u03, u47
  u03.v    = _mm256_load_pd( (double*)u );
  u47.v    = _mm256_load_pd( (double*)( u + 4 ) );

  // Prefetch w
  __asm__ volatile( "prefetcht0 0(%0)    \n\t" : :"r"( w ) );


  #include "ks_exp_int_d8x4.h"

  //printf( "square distance\n" );
  //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[0], c03_1.d[0], c03_2.d[0], c03_3.d[0] );
  //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[1], c03_1.d[1], c03_2.d[1], c03_3.d[1] );
  //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[2], c03_1.d[2], c03_2.d[2], c03_3.d[2] );
  //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[3], c03_1.d[3], c03_2.d[3], c03_3.d[3] );
  //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[0], c47_1.d[0], c47_2.d[0], c47_3.d[0] );
  //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[1], c47_1.d[1], c47_2.d[1], c47_3.d[1] );
  //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[2], c47_1.d[2], c47_2.d[2], c47_3.d[2] );
  //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[3], c47_1.d[3], c47_2.d[3], c47_3.d[3] );

  //for ( i = 0; i < 4; i++ ) {
  //  if ( c03_0.d[ i ] != c03_0.d[ i ] ) {
  //    printf( "error Nan: c03_0[ %d ]\n", i );
  //  }
  //  if ( c03_1.d[ i ] != c03_1.d[ i ] ) {
  //    printf( "error Nan: c03_1[ %d ]\n", i );
  //  }
  //  if ( c03_2.d[ i ] != c03_2.d[ i ] ) {
  //    printf( "error Nan: c03_2[ %d ]\n", i );
  //  }
  //  if ( c03_3.d[ i ] != c03_3.d[ i ] ) {
  //    printf( "error Nan: c03_3[ %d ]\n", i );
  //  }
  //  if ( c47_0.d[ i ] != c47_0.d[ i ] ) {
  //    printf( "error Nan: c47_0[ %d ]\n", i );
  //  }
  //  if ( c47_1.d[ i ] != c47_1.d[ i ] ) {
  //    printf( "error Nan: c47_1[ %d ]\n", i );
  //  }
  //  if ( c47_2.d[ i ] != c47_2.d[ i ] ) {
  //    printf( "error Nan: c47_2[ %d ]\n", i );
  //  }
  //  if ( c47_3.d[ i ] != c47_3.d[ i ] ) {
  //    printf( "error Nan: c47_3[ %d ]\n", i );
  //  }
  //}



//  tmp.v     = _mm256_broadcast_sd( &maxlog );
//  c03_0.v   = _mm256_min_pd( tmp.v, c03_0.v ); 
//  c03_1.v   = _mm256_min_pd( tmp.v, c03_1.v ); 
//  c03_2.v   = _mm256_min_pd( tmp.v, c03_2.v ); 
//  c03_3.v   = _mm256_min_pd( tmp.v, c03_3.v ); 
//  c47_0.v   = _mm256_min_pd( tmp.v, c47_0.v ); 
//  c47_1.v   = _mm256_min_pd( tmp.v, c47_1.v ); 
//  c47_2.v   = _mm256_min_pd( tmp.v, c47_2.v ); 
//  c47_3.v   = _mm256_min_pd( tmp.v, c47_3.v ); 
//  tmp.v     = _mm256_broadcast_sd( &minlog );
//  c03_0.v   = _mm256_max_pd( tmp.v, c03_0.v ); 
//  c03_1.v   = _mm256_max_pd( tmp.v, c03_1.v ); 
//  c03_2.v   = _mm256_max_pd( tmp.v, c03_2.v ); 
//  c03_3.v   = _mm256_max_pd( tmp.v, c03_3.v ); 
//  c47_0.v   = _mm256_max_pd( tmp.v, c47_0.v ); 
//  c47_1.v   = _mm256_max_pd( tmp.v, c47_1.v ); 
//  c47_2.v   = _mm256_max_pd( tmp.v, c47_2.v ); 
//  c47_3.v   = _mm256_max_pd( tmp.v, c47_3.v ); 
//
//  // a = c / log2e
//  // c = a * ln2 = k * ln2 + w, ( w in [ -ln2, ln2 ] )
//  l2e.v         = _mm256_broadcast_sd( &log2e );
//  a03_0.v       = _mm256_mul_pd( l2e.v, c03_0.v );
//  a03_1.v       = _mm256_mul_pd( l2e.v, c03_1.v );
//  a03_2.v       = _mm256_mul_pd( l2e.v, c03_2.v );
//  a03_3.v       = _mm256_mul_pd( l2e.v, c03_3.v );
//  a47_0.v       = _mm256_mul_pd( l2e.v, c47_0.v );
//  a47_1.v       = _mm256_mul_pd( l2e.v, c47_1.v );
//  a47_2.v       = _mm256_mul_pd( l2e.v, c47_2.v );
//  a47_3.v       = _mm256_mul_pd( l2e.v, c47_3.v );
//
//  // Check if a < 0 
//  tmp.v         = _mm256_setzero_pd();
//  p03_0.v       = _mm256_cmp_pd( a03_0.v, tmp.v, 1 );
//  p03_1.v       = _mm256_cmp_pd( a03_1.v, tmp.v, 1 );
//  p03_2.v       = _mm256_cmp_pd( a03_2.v, tmp.v, 1 );
//  p03_3.v       = _mm256_cmp_pd( a03_3.v, tmp.v, 1 );
//  p47_0.v       = _mm256_cmp_pd( a47_0.v, tmp.v, 1 );
//  p47_1.v       = _mm256_cmp_pd( a47_1.v, tmp.v, 1 );
//  p47_2.v       = _mm256_cmp_pd( a47_2.v, tmp.v, 1 );
//  p47_3.v       = _mm256_cmp_pd( a47_3.v, tmp.v, 1 );
//  tmp.v         = _mm256_broadcast_sd( &one );
//  p03_0.v       = _mm256_and_pd( tmp.v, p03_0.v );
//  p03_1.v       = _mm256_and_pd( tmp.v, p03_1.v );
//  p03_2.v       = _mm256_and_pd( tmp.v, p03_2.v );
//  p03_3.v       = _mm256_and_pd( tmp.v, p03_3.v );
//  p47_0.v       = _mm256_and_pd( tmp.v, p47_0.v );
//  p47_1.v       = _mm256_and_pd( tmp.v, p47_1.v );
//  p47_2.v       = _mm256_and_pd( tmp.v, p47_2.v );
//  p47_3.v       = _mm256_and_pd( tmp.v, p47_3.v );
//  // If a < 0 ( w < 0 ), then a - 1 =  ( k - 1 ) + w / ln2 
//  a03_0.v       = _mm256_sub_pd( a03_0.v, p03_0.v );
//  a03_1.v       = _mm256_sub_pd( a03_1.v, p03_1.v );
//  a03_2.v       = _mm256_sub_pd( a03_2.v, p03_2.v );
//  a03_3.v       = _mm256_sub_pd( a03_3.v, p03_3.v );
//  a47_0.v       = _mm256_sub_pd( a47_0.v, p47_0.v );
//  a47_1.v       = _mm256_sub_pd( a47_1.v, p47_1.v );
//  a47_2.v       = _mm256_sub_pd( a47_2.v, p47_2.v );
//  a47_3.v       = _mm256_sub_pd( a47_3.v, p47_3.v );
//  // Compute floor( a ) by two conversions
//  // if a < 0, p = k - 1
//  // else    , p = k
//  k03_0.v       = _mm256_cvttpd_epi32( a03_0.v );
//  k03_1.v       = _mm256_cvttpd_epi32( a03_1.v );
//  k03_2.v       = _mm256_cvttpd_epi32( a03_2.v );
//  k03_3.v       = _mm256_cvttpd_epi32( a03_3.v );
//  k47_0.v       = _mm256_cvttpd_epi32( a47_0.v );
//  k47_1.v       = _mm256_cvttpd_epi32( a47_1.v );
//  k47_2.v       = _mm256_cvttpd_epi32( a47_2.v );
//  k47_3.v       = _mm256_cvttpd_epi32( a47_3.v );
//  p03_0.v       = _mm256_cvtepi32_pd( k03_0.v );
//  p03_1.v       = _mm256_cvtepi32_pd( k03_1.v );
//  p03_2.v       = _mm256_cvtepi32_pd( k03_2.v );
//  p03_3.v       = _mm256_cvtepi32_pd( k03_3.v );
//  p47_0.v       = _mm256_cvtepi32_pd( k47_0.v );
//  p47_1.v       = _mm256_cvtepi32_pd( k47_1.v );
//  p47_2.v       = _mm256_cvtepi32_pd( k47_2.v );
//  p47_3.v       = _mm256_cvtepi32_pd( k47_3.v );
//
//  // ---------------------
//  // x -= p * ln2
//  // ---------------------
//  // c1 = ln2
//  // if a < 0, a = ( k - 1 ) * ln2
//  // else    , a = k * ln2
//  // if a < 0, x -= ( k - 1 ) * ln2
//  // else    , x -= k * ln2
//  //
//  tmp.v         = _mm256_broadcast_sd( &c1 );
//  a03_0.v       = _mm256_mul_pd( tmp.v, p03_0.v );
//  a03_1.v       = _mm256_mul_pd( tmp.v, p03_1.v );
//  a03_2.v       = _mm256_mul_pd( tmp.v, p03_2.v );
//  a03_3.v       = _mm256_mul_pd( tmp.v, p03_3.v );
//  a47_0.v       = _mm256_mul_pd( tmp.v, p47_0.v );
//  a47_1.v       = _mm256_mul_pd( tmp.v, p47_1.v );
//  a47_2.v       = _mm256_mul_pd( tmp.v, p47_2.v );
//  a47_3.v       = _mm256_mul_pd( tmp.v, p47_3.v );
//  c03_0.v       = _mm256_sub_pd( c03_0.v, a03_0.v );
//  c03_1.v       = _mm256_sub_pd( c03_1.v, a03_1.v );
//  c03_2.v       = _mm256_sub_pd( c03_2.v, a03_2.v );
//  c03_3.v       = _mm256_sub_pd( c03_3.v, a03_3.v );
//  c47_0.v       = _mm256_sub_pd( c47_0.v, a47_0.v );
//  c47_1.v       = _mm256_sub_pd( c47_1.v, a47_1.v );
//  c47_2.v       = _mm256_sub_pd( c47_2.v, a47_2.v );
//  c47_3.v       = _mm256_sub_pd( c47_3.v, a47_3.v );
//  tmp.v         = _mm256_broadcast_sd( &c2 );
//  a03_0.v       = _mm256_mul_pd( tmp.v, p03_0.v );
//  a03_1.v       = _mm256_mul_pd( tmp.v, p03_1.v );
//  a03_2.v       = _mm256_mul_pd( tmp.v, p03_2.v );
//  a03_3.v       = _mm256_mul_pd( tmp.v, p03_3.v );
//  a47_0.v       = _mm256_mul_pd( tmp.v, p47_0.v );
//  a47_1.v       = _mm256_mul_pd( tmp.v, p47_1.v );
//  a47_2.v       = _mm256_mul_pd( tmp.v, p47_2.v );
//  a47_3.v       = _mm256_mul_pd( tmp.v, p47_3.v );
//  c03_0.v       = _mm256_sub_pd( c03_0.v, a03_0.v );
//  c03_1.v       = _mm256_sub_pd( c03_1.v, a03_1.v );
//  c03_2.v       = _mm256_sub_pd( c03_2.v, a03_2.v );
//  c03_3.v       = _mm256_sub_pd( c03_3.v, a03_3.v );
//  c47_0.v       = _mm256_sub_pd( c47_0.v, a47_0.v );
//  c47_1.v       = _mm256_sub_pd( c47_1.v, a47_1.v );
//  c47_2.v       = _mm256_sub_pd( c47_2.v, a47_2.v );
//  c47_3.v       = _mm256_sub_pd( c47_3.v, a47_3.v );
//
//
//  //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[0], c03_1.d[0], c03_2.d[0], c03_3.d[0] );
//  //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[1], c03_1.d[1], c03_2.d[1], c03_3.d[1] );
//  //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[2], c03_1.d[2], c03_2.d[2], c03_3.d[2] );
//  //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[3], c03_1.d[3], c03_2.d[3], c03_3.d[3] );
//  //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[0], c47_1.d[0], c47_2.d[0], c47_3.d[0] );
//  //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[1], c47_1.d[1], c47_2.d[1], c47_3.d[1] );
//  //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[2], c47_1.d[2], c47_2.d[2], c47_3.d[2] );
//  //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[3], c47_1.d[3], c47_2.d[3], c47_3.d[3] );
//
//
//  // Prefetch u
//  __asm__ volatile( "prefetcht0 0(%0)    \n\t" : :"r"( u ) );
//
//
//
//  // Compute e^x using polynomial approximation
//  // a = w10 + w11 * x
//  tmp.v         = _mm256_broadcast_sd( &w11 );
//  //tmp.v         = _mm256_broadcast_sd( &w9 );
//  a03_0.v       = _mm256_mul_pd( c03_0.v, tmp.v );
//  a03_1.v       = _mm256_mul_pd( c03_1.v, tmp.v );
//  a03_2.v       = _mm256_mul_pd( c03_2.v, tmp.v );
//  a03_3.v       = _mm256_mul_pd( c03_3.v, tmp.v );
//  a47_0.v       = _mm256_mul_pd( c47_0.v, tmp.v );
//  a47_1.v       = _mm256_mul_pd( c47_1.v, tmp.v );
//  a47_2.v       = _mm256_mul_pd( c47_2.v, tmp.v );
//  a47_3.v       = _mm256_mul_pd( c47_3.v, tmp.v );
//  tmp.v         = _mm256_broadcast_sd( &w10 );
//  //tmp.v         = _mm256_broadcast_sd( &w8 );
//  a03_0.v       = _mm256_add_pd( a03_0.v, tmp.v );
//  a03_1.v       = _mm256_add_pd( a03_1.v, tmp.v );
//  a03_2.v       = _mm256_add_pd( a03_2.v, tmp.v );
//  a03_3.v       = _mm256_add_pd( a03_3.v, tmp.v );
//  a47_0.v       = _mm256_add_pd( a47_0.v, tmp.v );
//  a47_1.v       = _mm256_add_pd( a47_1.v, tmp.v );
//  a47_2.v       = _mm256_add_pd( a47_2.v, tmp.v );
//  a47_3.v       = _mm256_add_pd( a47_3.v, tmp.v );
//
//
//  // a = w8 + ( w9 + ( w10 + w11 * x ) * x ) * x
//  tmp.v         = _mm256_broadcast_sd( &w9 );
//  a03_0.v       = _mm256_mul_pd( a03_0.v, c03_0.v );
//  a03_1.v       = _mm256_mul_pd( a03_1.v, c03_1.v );
//  a03_2.v       = _mm256_mul_pd( a03_2.v, c03_2.v );
//  a03_3.v       = _mm256_mul_pd( a03_3.v, c03_3.v );
//  a47_0.v       = _mm256_mul_pd( a47_0.v, c47_0.v );
//  a47_1.v       = _mm256_mul_pd( a47_1.v, c47_1.v );
//  a47_2.v       = _mm256_mul_pd( a47_2.v, c47_2.v );
//  a47_3.v       = _mm256_mul_pd( a47_3.v, c47_3.v );
//  a03_0.v       = _mm256_add_pd( a03_0.v, tmp.v );
//  a03_1.v       = _mm256_add_pd( a03_1.v, tmp.v );
//  a03_2.v       = _mm256_add_pd( a03_2.v, tmp.v );
//  a03_3.v       = _mm256_add_pd( a03_3.v, tmp.v );
//  a47_0.v       = _mm256_add_pd( a47_0.v, tmp.v );
//  a47_1.v       = _mm256_add_pd( a47_1.v, tmp.v );
//  a47_2.v       = _mm256_add_pd( a47_2.v, tmp.v );
//  a47_3.v       = _mm256_add_pd( a47_3.v, tmp.v );
//  tmp.v         = _mm256_broadcast_sd( &w8 );
//  a03_0.v       = _mm256_mul_pd( a03_0.v, c03_0.v );
//  a03_1.v       = _mm256_mul_pd( a03_1.v, c03_1.v );
//  a03_2.v       = _mm256_mul_pd( a03_2.v, c03_2.v );
//  a03_3.v       = _mm256_mul_pd( a03_3.v, c03_3.v );
//  a47_0.v       = _mm256_mul_pd( a47_0.v, c47_0.v );
//  a47_1.v       = _mm256_mul_pd( a47_1.v, c47_1.v );
//  a47_2.v       = _mm256_mul_pd( a47_2.v, c47_2.v );
//  a47_3.v       = _mm256_mul_pd( a47_3.v, c47_3.v );
//  a03_0.v       = _mm256_add_pd( a03_0.v, tmp.v );
//  a03_1.v       = _mm256_add_pd( a03_1.v, tmp.v );
//  a03_2.v       = _mm256_add_pd( a03_2.v, tmp.v );
//  a03_3.v       = _mm256_add_pd( a03_3.v, tmp.v );
//  a47_0.v       = _mm256_add_pd( a47_0.v, tmp.v );
//  a47_1.v       = _mm256_add_pd( a47_1.v, tmp.v );
//  a47_2.v       = _mm256_add_pd( a47_2.v, tmp.v );
//  a47_3.v       = _mm256_add_pd( a47_3.v, tmp.v );
//
//
//  tmp.v         = _mm256_broadcast_sd( &w7 );
//  a03_0.v       = _mm256_mul_pd( a03_0.v, c03_0.v );
//  a03_1.v       = _mm256_mul_pd( a03_1.v, c03_1.v );
//  a03_2.v       = _mm256_mul_pd( a03_2.v, c03_2.v );
//  a03_3.v       = _mm256_mul_pd( a03_3.v, c03_3.v );
//  a47_0.v       = _mm256_mul_pd( a47_0.v, c47_0.v );
//  a47_1.v       = _mm256_mul_pd( a47_1.v, c47_1.v );
//  a47_2.v       = _mm256_mul_pd( a47_2.v, c47_2.v );
//  a47_3.v       = _mm256_mul_pd( a47_3.v, c47_3.v );
//  a03_0.v       = _mm256_add_pd( a03_0.v, tmp.v );
//  a03_1.v       = _mm256_add_pd( a03_1.v, tmp.v );
//  a03_2.v       = _mm256_add_pd( a03_2.v, tmp.v );
//  a03_3.v       = _mm256_add_pd( a03_3.v, tmp.v );
//  a47_0.v       = _mm256_add_pd( a47_0.v, tmp.v );
//  a47_1.v       = _mm256_add_pd( a47_1.v, tmp.v );
//  a47_2.v       = _mm256_add_pd( a47_2.v, tmp.v );
//  a47_3.v       = _mm256_add_pd( a47_3.v, tmp.v );
//  tmp.v         = _mm256_broadcast_sd( &w6 );
//  a03_0.v       = _mm256_mul_pd( a03_0.v, c03_0.v );
//  a03_1.v       = _mm256_mul_pd( a03_1.v, c03_1.v );
//  a03_2.v       = _mm256_mul_pd( a03_2.v, c03_2.v );
//  a03_3.v       = _mm256_mul_pd( a03_3.v, c03_3.v );
//  a47_0.v       = _mm256_mul_pd( a47_0.v, c47_0.v );
//  a47_1.v       = _mm256_mul_pd( a47_1.v, c47_1.v );
//  a47_2.v       = _mm256_mul_pd( a47_2.v, c47_2.v );
//  a47_3.v       = _mm256_mul_pd( a47_3.v, c47_3.v );
//  a03_0.v       = _mm256_add_pd( a03_0.v, tmp.v );
//  a03_1.v       = _mm256_add_pd( a03_1.v, tmp.v );
//  a03_2.v       = _mm256_add_pd( a03_2.v, tmp.v );
//  a03_3.v       = _mm256_add_pd( a03_3.v, tmp.v );
//  a47_0.v       = _mm256_add_pd( a47_0.v, tmp.v );
//  a47_1.v       = _mm256_add_pd( a47_1.v, tmp.v );
//  a47_2.v       = _mm256_add_pd( a47_2.v, tmp.v );
//  a47_3.v       = _mm256_add_pd( a47_3.v, tmp.v );
//
//
//  tmp.v         = _mm256_broadcast_sd( &w5 );
//  a03_0.v       = _mm256_mul_pd( a03_0.v, c03_0.v );
//  a03_1.v       = _mm256_mul_pd( a03_1.v, c03_1.v );
//  a03_2.v       = _mm256_mul_pd( a03_2.v, c03_2.v );
//  a03_3.v       = _mm256_mul_pd( a03_3.v, c03_3.v );
//  a47_0.v       = _mm256_mul_pd( a47_0.v, c47_0.v );
//  a47_1.v       = _mm256_mul_pd( a47_1.v, c47_1.v );
//  a47_2.v       = _mm256_mul_pd( a47_2.v, c47_2.v );
//  a47_3.v       = _mm256_mul_pd( a47_3.v, c47_3.v );
//  a03_0.v       = _mm256_add_pd( a03_0.v, tmp.v );
//  a03_1.v       = _mm256_add_pd( a03_1.v, tmp.v );
//  a03_2.v       = _mm256_add_pd( a03_2.v, tmp.v );
//  a03_3.v       = _mm256_add_pd( a03_3.v, tmp.v );
//  a47_0.v       = _mm256_add_pd( a47_0.v, tmp.v );
//  a47_1.v       = _mm256_add_pd( a47_1.v, tmp.v );
//  a47_2.v       = _mm256_add_pd( a47_2.v, tmp.v );
//  a47_3.v       = _mm256_add_pd( a47_3.v, tmp.v );
//  tmp.v         = _mm256_broadcast_sd( &w4 );
//  a03_0.v       = _mm256_mul_pd( a03_0.v, c03_0.v );
//  a03_1.v       = _mm256_mul_pd( a03_1.v, c03_1.v );
//  a03_2.v       = _mm256_mul_pd( a03_2.v, c03_2.v );
//  a03_3.v       = _mm256_mul_pd( a03_3.v, c03_3.v );
//  a47_0.v       = _mm256_mul_pd( a47_0.v, c47_0.v );
//  a47_1.v       = _mm256_mul_pd( a47_1.v, c47_1.v );
//  a47_2.v       = _mm256_mul_pd( a47_2.v, c47_2.v );
//  a47_3.v       = _mm256_mul_pd( a47_3.v, c47_3.v );
//  a03_0.v       = _mm256_add_pd( a03_0.v, tmp.v );
//  a03_1.v       = _mm256_add_pd( a03_1.v, tmp.v );
//  a03_2.v       = _mm256_add_pd( a03_2.v, tmp.v );
//  a03_3.v       = _mm256_add_pd( a03_3.v, tmp.v );
//  a47_0.v       = _mm256_add_pd( a47_0.v, tmp.v );
//  a47_1.v       = _mm256_add_pd( a47_1.v, tmp.v );
//  a47_2.v       = _mm256_add_pd( a47_2.v, tmp.v );
//  a47_3.v       = _mm256_add_pd( a47_3.v, tmp.v );
//
//
//  // Prefetch w
//  __asm__ volatile( "prefetcht0 0(%0)    \n\t" : :"r"( w ) );
//  // Preload u03
//  u03.v    = _mm256_load_pd( (double*)u );
//
//
//  tmp.v         = _mm256_broadcast_sd( &w3 );
//  a03_0.v       = _mm256_mul_pd( a03_0.v, c03_0.v );
//  a03_1.v       = _mm256_mul_pd( a03_1.v, c03_1.v );
//  a03_2.v       = _mm256_mul_pd( a03_2.v, c03_2.v );
//  a03_3.v       = _mm256_mul_pd( a03_3.v, c03_3.v );
//  a47_0.v       = _mm256_mul_pd( a47_0.v, c47_0.v );
//  a47_1.v       = _mm256_mul_pd( a47_1.v, c47_1.v );
//  a47_2.v       = _mm256_mul_pd( a47_2.v, c47_2.v );
//  a47_3.v       = _mm256_mul_pd( a47_3.v, c47_3.v );
//  a03_0.v       = _mm256_add_pd( a03_0.v, tmp.v );
//  a03_1.v       = _mm256_add_pd( a03_1.v, tmp.v );
//  a03_2.v       = _mm256_add_pd( a03_2.v, tmp.v );
//  a03_3.v       = _mm256_add_pd( a03_3.v, tmp.v );
//  a47_0.v       = _mm256_add_pd( a47_0.v, tmp.v );
//  a47_1.v       = _mm256_add_pd( a47_1.v, tmp.v );
//  a47_2.v       = _mm256_add_pd( a47_2.v, tmp.v );
//  a47_3.v       = _mm256_add_pd( a47_3.v, tmp.v );
//  tmp.v         = _mm256_broadcast_sd( &w2 );
//  a03_0.v       = _mm256_mul_pd( a03_0.v, c03_0.v );
//  a03_1.v       = _mm256_mul_pd( a03_1.v, c03_1.v );
//  a03_2.v       = _mm256_mul_pd( a03_2.v, c03_2.v );
//  a03_3.v       = _mm256_mul_pd( a03_3.v, c03_3.v );
//  a47_0.v       = _mm256_mul_pd( a47_0.v, c47_0.v );
//  a47_1.v       = _mm256_mul_pd( a47_1.v, c47_1.v );
//  a47_2.v       = _mm256_mul_pd( a47_2.v, c47_2.v );
//  a47_3.v       = _mm256_mul_pd( a47_3.v, c47_3.v );
//  a03_0.v       = _mm256_add_pd( a03_0.v, tmp.v );
//  a03_1.v       = _mm256_add_pd( a03_1.v, tmp.v );
//  a03_2.v       = _mm256_add_pd( a03_2.v, tmp.v );
//  a03_3.v       = _mm256_add_pd( a03_3.v, tmp.v );
//  a47_0.v       = _mm256_add_pd( a47_0.v, tmp.v );
//  a47_1.v       = _mm256_add_pd( a47_1.v, tmp.v );
//  a47_2.v       = _mm256_add_pd( a47_2.v, tmp.v );
//  a47_3.v       = _mm256_add_pd( a47_3.v, tmp.v );
//
//
//  tmp.v         = _mm256_broadcast_sd( &w1 );
//  a03_0.v       = _mm256_mul_pd( a03_0.v, c03_0.v );
//  a03_1.v       = _mm256_mul_pd( a03_1.v, c03_1.v );
//  a03_2.v       = _mm256_mul_pd( a03_2.v, c03_2.v );
//  a03_3.v       = _mm256_mul_pd( a03_3.v, c03_3.v );
//  a47_0.v       = _mm256_mul_pd( a47_0.v, c47_0.v );
//  a47_1.v       = _mm256_mul_pd( a47_1.v, c47_1.v );
//  a47_2.v       = _mm256_mul_pd( a47_2.v, c47_2.v );
//  a47_3.v       = _mm256_mul_pd( a47_3.v, c47_3.v );
//  a03_0.v       = _mm256_add_pd( a03_0.v, tmp.v );
//  a03_1.v       = _mm256_add_pd( a03_1.v, tmp.v );
//  a03_2.v       = _mm256_add_pd( a03_2.v, tmp.v );
//  a03_3.v       = _mm256_add_pd( a03_3.v, tmp.v );
//  a47_0.v       = _mm256_add_pd( a47_0.v, tmp.v );
//  a47_1.v       = _mm256_add_pd( a47_1.v, tmp.v );
//  a47_2.v       = _mm256_add_pd( a47_2.v, tmp.v );
//  a47_3.v       = _mm256_add_pd( a47_3.v, tmp.v );
//  tmp.v         = _mm256_broadcast_sd( &w0 );
//  a03_0.v       = _mm256_mul_pd( a03_0.v, c03_0.v );
//  a03_1.v       = _mm256_mul_pd( a03_1.v, c03_1.v );
//  a03_2.v       = _mm256_mul_pd( a03_2.v, c03_2.v );
//  a03_3.v       = _mm256_mul_pd( a03_3.v, c03_3.v );
//  a47_0.v       = _mm256_mul_pd( a47_0.v, c47_0.v );
//  a47_1.v       = _mm256_mul_pd( a47_1.v, c47_1.v );
//  a47_2.v       = _mm256_mul_pd( a47_2.v, c47_2.v );
//  a47_3.v       = _mm256_mul_pd( a47_3.v, c47_3.v );
//  a03_0.v       = _mm256_add_pd( a03_0.v, tmp.v );
//  a03_1.v       = _mm256_add_pd( a03_1.v, tmp.v );
//  a03_2.v       = _mm256_add_pd( a03_2.v, tmp.v );
//  a03_3.v       = _mm256_add_pd( a03_3.v, tmp.v );
//  a47_0.v       = _mm256_add_pd( a47_0.v, tmp.v );
//  a47_1.v       = _mm256_add_pd( a47_1.v, tmp.v );
//  a47_2.v       = _mm256_add_pd( a47_2.v, tmp.v );
//  a47_3.v       = _mm256_add_pd( a47_3.v, tmp.v );
//
//
//  // Preload u47
//  u47.v    = _mm256_load_pd( (double*)( u + 4 ) );
//
//
//  offset.v      = _mm_setr_epi32( 1023, 1023, 0, 0 );
//  k1.v          = _mm_set_epi32( 0, 0, k03_0.d[ 1 ], k03_0.d[ 0 ]);
//  k2.v          = _mm_set_epi32( 0, 0, k03_0.d[ 3 ], k03_0.d[ 2 ]);
//  k1.v          = _mm_add_epi32( k1.v, offset.v );
//  k2.v          = _mm_add_epi32( k2.v, offset.v );
//  k1.v          = _mm_slli_epi32( k1.v, 20 );
//  k2.v          = _mm_slli_epi32( k2.v, 20 );
//  k1.v          = _mm_shuffle_epi32( k1.v, _MM_SHUFFLE( 1, 3, 0, 2 ) );
//  k2.v          = _mm_shuffle_epi32( k2.v, _MM_SHUFFLE( 1, 3, 0, 2 ) );
//  p1            = _mm_castsi128_pd( k1.v );
//  p2            = _mm_castsi128_pd( k2.v );
//  p03_0.v       = _mm256_set_m128d( p2, p1 );
//  k1.v          = _mm_set_epi32( 0, 0, k03_1.d[ 1 ], k03_1.d[ 0 ]);
//  k2.v          = _mm_set_epi32( 0, 0, k03_1.d[ 3 ], k03_1.d[ 2 ]);
//  k1.v          = _mm_add_epi32( k1.v, offset.v );
//  k2.v          = _mm_add_epi32( k2.v, offset.v );
//  k1.v          = _mm_slli_epi32( k1.v, 20 );
//  k2.v          = _mm_slli_epi32( k2.v, 20 );
//  k1.v          = _mm_shuffle_epi32( k1.v, _MM_SHUFFLE( 1, 3, 0, 2 ) );
//  k2.v          = _mm_shuffle_epi32( k2.v, _MM_SHUFFLE( 1, 3, 0, 2 ) );
//  p1            = _mm_castsi128_pd( k1.v );
//  p2            = _mm_castsi128_pd( k2.v );
//  p03_1.v       = _mm256_set_m128d( p2, p1 );
//  k1.v          = _mm_set_epi32( 0, 0, k03_2.d[ 1 ], k03_2.d[ 0 ]);
//  k2.v          = _mm_set_epi32( 0, 0, k03_2.d[ 3 ], k03_2.d[ 2 ]);
//  k1.v          = _mm_add_epi32( k1.v, offset.v );
//  k2.v          = _mm_add_epi32( k2.v, offset.v );
//  k1.v          = _mm_slli_epi32( k1.v, 20 );
//  k2.v          = _mm_slli_epi32( k2.v, 20 );
//  k1.v          = _mm_shuffle_epi32( k1.v, _MM_SHUFFLE( 1, 3, 0, 2 ) );
//  k2.v          = _mm_shuffle_epi32( k2.v, _MM_SHUFFLE( 1, 3, 0, 2 ) );
//  p1            = _mm_castsi128_pd( k1.v );
//  p2            = _mm_castsi128_pd( k2.v );
//  p03_2.v       = _mm256_set_m128d( p2, p1 );
//  k1.v          = _mm_set_epi32( 0, 0, k03_3.d[ 1 ], k03_3.d[ 0 ]);
//  k2.v          = _mm_set_epi32( 0, 0, k03_3.d[ 3 ], k03_3.d[ 2 ]);
//  k1.v          = _mm_add_epi32( k1.v, offset.v );
//  k2.v          = _mm_add_epi32( k2.v, offset.v );
//  k1.v          = _mm_slli_epi32( k1.v, 20 );
//  k2.v          = _mm_slli_epi32( k2.v, 20 );
//  k1.v          = _mm_shuffle_epi32( k1.v, _MM_SHUFFLE( 1, 3, 0, 2 ) );
//  k2.v          = _mm_shuffle_epi32( k2.v, _MM_SHUFFLE( 1, 3, 0, 2 ) );
//  p1            = _mm_castsi128_pd( k1.v );
//  p2            = _mm_castsi128_pd( k2.v );
//  p03_3.v       = _mm256_set_m128d( p2, p1 );
//  k1.v          = _mm_set_epi32( 0, 0, k47_0.d[ 1 ], k47_0.d[ 0 ]);
//  k2.v          = _mm_set_epi32( 0, 0, k47_0.d[ 3 ], k47_0.d[ 2 ]);
//  k1.v          = _mm_add_epi32( k1.v, offset.v );
//  k2.v          = _mm_add_epi32( k2.v, offset.v );
//  k1.v          = _mm_slli_epi32( k1.v, 20 );
//  k2.v          = _mm_slli_epi32( k2.v, 20 );
//  k1.v          = _mm_shuffle_epi32( k1.v, _MM_SHUFFLE( 1, 3, 0, 2 ) );
//  k2.v          = _mm_shuffle_epi32( k2.v, _MM_SHUFFLE( 1, 3, 0, 2 ) );
//  p1            = _mm_castsi128_pd( k1.v );
//  p2            = _mm_castsi128_pd( k2.v );
//  p47_0.v       = _mm256_set_m128d( p2, p1 );
//  k1.v          = _mm_set_epi32( 0, 0, k47_1.d[ 1 ], k47_1.d[ 0 ]);
//  k2.v          = _mm_set_epi32( 0, 0, k47_1.d[ 3 ], k47_1.d[ 2 ]);
//  k1.v          = _mm_add_epi32( k1.v, offset.v );
//  k2.v          = _mm_add_epi32( k2.v, offset.v );
//  k1.v          = _mm_slli_epi32( k1.v, 20 );
//  k2.v          = _mm_slli_epi32( k2.v, 20 );
//  k1.v          = _mm_shuffle_epi32( k1.v, _MM_SHUFFLE( 1, 3, 0, 2 ) );
//  k2.v          = _mm_shuffle_epi32( k2.v, _MM_SHUFFLE( 1, 3, 0, 2 ) );
//  p1            = _mm_castsi128_pd( k1.v );
//  p2            = _mm_castsi128_pd( k2.v );
//  p47_1.v       = _mm256_set_m128d( p2, p1 );
//  k1.v          = _mm_set_epi32( 0, 0, k47_2.d[ 1 ], k47_2.d[ 0 ]);
//  k2.v          = _mm_set_epi32( 0, 0, k47_2.d[ 3 ], k47_2.d[ 2 ]);
//  k1.v          = _mm_add_epi32( k1.v, offset.v );
//  k2.v          = _mm_add_epi32( k2.v, offset.v );
//  k1.v          = _mm_slli_epi32( k1.v, 20 );
//  k2.v          = _mm_slli_epi32( k2.v, 20 );
//  k1.v          = _mm_shuffle_epi32( k1.v, _MM_SHUFFLE( 1, 3, 0, 2 ) );
//  k2.v          = _mm_shuffle_epi32( k2.v, _MM_SHUFFLE( 1, 3, 0, 2 ) );
//  p1            = _mm_castsi128_pd( k1.v );
//  p2            = _mm_castsi128_pd( k2.v );
//  p47_2.v       = _mm256_set_m128d( p2, p1 );
//  k1.v          = _mm_set_epi32( 0, 0, k47_3.d[ 1 ], k47_3.d[ 0 ]);
//  k2.v          = _mm_set_epi32( 0, 0, k47_3.d[ 3 ], k47_3.d[ 2 ]);
//  k1.v          = _mm_add_epi32( k1.v, offset.v );
//  k2.v          = _mm_add_epi32( k2.v, offset.v );
//  k1.v          = _mm_slli_epi32( k1.v, 20 );
//  k2.v          = _mm_slli_epi32( k2.v, 20 );
//  k1.v          = _mm_shuffle_epi32( k1.v, _MM_SHUFFLE( 1, 3, 0, 2 ) );
//  k2.v          = _mm_shuffle_epi32( k2.v, _MM_SHUFFLE( 1, 3, 0, 2 ) );
//  p1            = _mm_castsi128_pd( k1.v );
//  p2            = _mm_castsi128_pd( k2.v );
//  p47_3.v       = _mm256_set_m128d( p2, p1 );
//  
// 
//  //u03.v    = _mm256_load_pd( (double*)u );
//  //u47.v    = _mm256_load_pd( (double*)( u + 4 ) );
//
//
//  c03_0.v       = _mm256_mul_pd( a03_0.v, p03_0.v );
//  c03_1.v       = _mm256_mul_pd( a03_1.v, p03_1.v );
//  c03_2.v       = _mm256_mul_pd( a03_2.v, p03_2.v );
//  c03_3.v       = _mm256_mul_pd( a03_3.v, p03_3.v );
//  c47_0.v       = _mm256_mul_pd( a47_0.v, p47_0.v );
//  c47_1.v       = _mm256_mul_pd( a47_1.v, p47_1.v );
//  c47_2.v       = _mm256_mul_pd( a47_2.v, p47_2.v );
//  c47_3.v       = _mm256_mul_pd( a47_3.v, p47_3.v );



  //for ( i = 0; i < 4; i++ ) {
  //  if ( c03_0.d[ i ] != c03_0.d[ i ] ) {
  //    printf( "error exp Nan: c03_0[ %d ]\n", i );
  //  }
  //  if ( c03_1.d[ i ] != c03_1.d[ i ] ) {
  //    printf( "error exp Nan: c03_1[ %d ]\n", i );
  //  }
  //  if ( c03_2.d[ i ] != c03_2.d[ i ] ) {
  //    printf( "error exp Nan: c03_2[ %d ]\n", i );
  //  }
  //  if ( c03_3.d[ i ] != c03_3.d[ i ] ) {
  //    printf( "error exp Nan: c03_3[ %d ]\n", i );
  //  }
  //  if ( c47_0.d[ i ] != c47_0.d[ i ] ) {
  //    printf( "error exp Nan: c47_0[ %d ]\n", i );
  //  }
  //  if ( c47_1.d[ i ] != c47_1.d[ i ] ) {
  //    printf( "error exp Nan: c47_1[ %d ]\n", i );
  //  }
  //  if ( c47_2.d[ i ] != c47_2.d[ i ] ) {
  //    printf( "error exp Nan: c47_2[ %d ]\n", i );
  //  }
  //  if ( c47_3.d[ i ] != c47_3.d[ i ] ) {
  //    printf( "error exp Nan: c47_3[ %d ]\n", i );
  //  }
  //}




  //printf( "exp\n" );
  //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[0], c03_1.d[0], c03_2.d[0], c03_3.d[0] );
  //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[1], c03_1.d[1], c03_2.d[1], c03_3.d[1] );
  //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[2], c03_1.d[2], c03_2.d[2], c03_3.d[2] );
  //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[3], c03_1.d[3], c03_2.d[3], c03_3.d[3] );
  //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[0], c47_1.d[0], c47_2.d[0], c47_3.d[0] );
  //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[1], c47_1.d[1], c47_2.d[1], c47_3.d[1] );
  //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[2], c47_1.d[2], c47_2.d[2], c47_3.d[2] );
  //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[3], c47_1.d[3], c47_2.d[3], c47_3.d[3] );

  //printf( "w\n" );
  //printf( "%lf, %lf, %lf, %lf\n", w[0], w[3], w[3], w[3] );


  //u03.v    = _mm256_load_pd( (double*)u );
  //u47.v    = _mm256_load_pd( (double*)( u + 4 ) );

  w_tmp.v  = _mm256_broadcast_sd( (double*)w );
  c03_0.v  = _mm256_mul_pd( w_tmp.v, c03_0.v );
  c47_0.v  = _mm256_mul_pd( w_tmp.v, c47_0.v );
  u03.v    = _mm256_add_pd( u03.v, c03_0.v );
  u47.v    = _mm256_add_pd( u47.v, c47_0.v );
 

  //for ( i = 0; i < 4; i++ ) {
  //  if ( w_tmp.d[ i ] != w_tmp.d[ i ] ) {
  //    printf( "error w_tmp Nan: w_tmp[ %d ]\n", i );
  //  }
  //}


  w_tmp.v  = _mm256_broadcast_sd( (double*)( w + 1 ) );
  c03_1.v  = _mm256_mul_pd( w_tmp.v, c03_1.v );
  c47_1.v  = _mm256_mul_pd( w_tmp.v, c47_1.v );
  u03.v    = _mm256_add_pd( u03.v, c03_1.v );
  u47.v    = _mm256_add_pd( u47.v, c47_1.v );


  //for ( i = 0; i < 4; i++ ) {
  //  if ( w_tmp.d[ i ] != w_tmp.d[ i ] ) {
  //    printf( "error w_tmp Nan: w_tmp[ %d ]\n", i );
  //  }
  //}

  w_tmp.v  = _mm256_broadcast_sd( (double*)( w + 2 ) );
  c03_2.v  = _mm256_mul_pd( w_tmp.v, c03_2.v );
  c47_2.v  = _mm256_mul_pd( w_tmp.v, c47_2.v );
  u03.v    = _mm256_add_pd( u03.v, c03_2.v );
  u47.v    = _mm256_add_pd( u47.v, c47_2.v );


  //for ( i = 0; i < 4; i++ ) {
  //  if ( w_tmp.d[ i ] != w_tmp.d[ i ] ) {
  //    printf( "error w_tmp Nan: w_tmp[ %d ]\n", i );
  //  }
  //}

  w_tmp.v  = _mm256_broadcast_sd( (double*)( w + 3 ) );
  c03_3.v  = _mm256_mul_pd( w_tmp.v, c03_3.v );
  c47_3.v  = _mm256_mul_pd( w_tmp.v, c47_3.v );
  u03.v    = _mm256_add_pd( u03.v, c03_3.v );
  u47.v    = _mm256_add_pd( u47.v, c47_3.v );


  //for ( i = 0; i < 4; i++ ) {
  //  if ( w_tmp.d[ i ] != w_tmp.d[ i ] ) {
  //    printf( "error w_tmp Nan: w_tmp[ %d ]\n", i );
  //  }
  //}



  _mm256_store_pd( (double*)u, u03.v );
  _mm256_store_pd( (double*)( u + 4 ), u47.v );


  //for ( i = 0; i < 4; i++ ) {
  //  if ( c03_0.d[ i ] != c03_0.d[ i ] ) {
  //    printf( "error gemv Nan: c03_0[ %d ]\n", i );
  //    exit( 1 );
  //  }
  //  if ( c03_1.d[ i ] != c03_1.d[ i ] ) {
  //    printf( "error gemv Nan: c03_1[ %d ]\n", i );
  //    exit( 1 );
  //  }
  //  if ( c03_2.d[ i ] != c03_2.d[ i ] ) {
  //    printf( "error gemv Nan: c03_2[ %d ]\n", i );
  //    exit( 1 );
  //  }
  //  if ( c03_3.d[ i ] != c03_3.d[ i ] ) {
  //    printf( "error gemv Nan: c03_3[ %d ]\n", i );
  //    exit( 1 );
  //  }
  //  if ( c47_0.d[ i ] != c47_0.d[ i ] ) {
  //    printf( "error gemv Nan: c47_0[ %d ]\n", i );
  //    exit( 1 );
  //  }
  //  if ( c47_1.d[ i ] != c47_1.d[ i ] ) {
  //    printf( "error gemv Nan: c47_1[ %d ]\n", i );
  //    exit( 1 );
  //  }
  //  if ( c47_2.d[ i ] != c47_2.d[ i ] ) {
  //    printf( "error gemv Nan: c47_2[ %d ]\n", i );
  //    exit( 1 );
  //  }
  //  if ( c47_3.d[ i ] != c47_3.d[ i ] ) {
  //    printf( "error gemv Nan: c47_3[ %d ]\n", i );
  //    exit( 1 );
  //  }
  //}


  //for ( i = 0; i < 4; i ++ ) {
  //  if ( w[ i ] != w[ i ] ) {
  //    printf( "GSKS error w Nan: w03[ %d ]\n", i );
  //  }
  //}


  //for ( i = 0; i < 4; i++ ) {
  //  if ( u03.d[ i ] != u03.d[ i ] ) {
  //    printf( "GSKS error u Nan: u03[ %d ]\n", i );
  //  }
  //  if ( u47.d[ i ] != u47.d[ i ] ) {
  //    printf( "GSKS error u Nan: u47[ %d ]\n", i );
  //  }
  //}



  //printf( "%lf\n", u03.d[0] );
  //printf( "%lf\n", u03.d[1] );
  //printf( "%lf\n", u03.d[2] );
  //printf( "%lf\n", u03.d[3] );
  //printf( "%lf\n", u47.d[0] );
  //printf( "%lf\n", u47.d[1] );
  //printf( "%lf\n", u47.d[2] );
  //printf( "%lf\n", u47.d[3] );
}
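
Reading through the intrinsics and the included exp kernel, this micro-kernel appears to evaluate an 8x4 block of the Gaussian kernel and accumulate a kernel-matrix-vector product: the squared distances are formed exactly as in the previous example, scaled by alpha, exponentiated, weighted by w[j], and added into u[i]. A minimal scalar sketch under the same packing assumptions (a packed 8-wide by k, b packed 4-wide by k, aa/bb the squared norms) follows; exp() here stands in for the vectorized evaluation done by ks_exp_int_d8x4.h.

#include <math.h>

/* Hypothetical scalar reference for what ks_gaussian_int_d8x4 appears to do:
   u[i] += sum_j w[j] * exp( alpha * |a_i - b_j|^2 ) over an 8x4 block. */
static void ks_gaussian_d8x4_scalar_sketch( int k, double alpha, double *u,
                                            const double *aa, const double *a,
                                            const double *bb, const double *b,
                                            const double *w )
{
  for ( int i = 0; i < 8; i ++ ) {
    for ( int j = 0; j < 4; j ++ ) {
      double dot = 0.0;
      for ( int p = 0; p < k; p ++ ) {
        dot += a[ p * 8 + i ] * b[ p * 4 + j ];
      }
      double d2 = -2.0 * dot + aa[ i ] + bb[ j ];
      if ( d2 < 0.0 ) d2 = 0.0;               // guard against rounding error
      u[ i ] += w[ j ] * exp( alpha * d2 );   // Gaussian kernel accumulation
    }
  }
}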