Example #1
		inline float64x4_t dot(const float64x4_t ymm1, const float64x4_t ymm2)
		{
			float64x4_t mul0 = _mm256_mul_pd(ymm1, ymm2);
			float64x4_t hadd0 = _mm256_hadd_pd(mul0, mul0);
			float64x2_t ext0 = _mm256_extractf128_pd(hadd0, 0);
			float64x2_t ext1 = _mm256_extractf128_pd(hadd0, 1);
			float64x2_t add0 = _mm_add_pd(ext0, ext1);
			return _mm256_broadcast_pd(&add0);
		}
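
For reference, a self-contained sketch of the same multiply/hadd/extract reduction, assuming float64x4_t and float64x2_t are plain aliases for __m256d and __m128d (those aliases are not shown in the example above; names and test values here are illustrative):

#include <immintrin.h>
#include <stdio.h>

typedef __m256d float64x4_t;  /* assumed alias */
typedef __m128d float64x2_t;  /* assumed alias */

static inline double dot4(float64x4_t a, float64x4_t b)
{
    float64x4_t mul  = _mm256_mul_pd(a, b);            /* a0*b0 .. a3*b3            */
    float64x4_t hadd = _mm256_hadd_pd(mul, mul);       /* pairwise sums in each lane */
    float64x2_t lo   = _mm256_castpd256_pd128(hadd);   /* a0*b0 + a1*b1             */
    float64x2_t hi   = _mm256_extractf128_pd(hadd, 1); /* a2*b2 + a3*b3             */
    return _mm_cvtsd_f64(_mm_add_pd(lo, hi));          /* full dot product          */
}

int main(void)
{
    float64x4_t a = _mm256_setr_pd(1.0, 2.0, 3.0, 4.0);
    float64x4_t b = _mm256_setr_pd(5.0, 6.0, 7.0, 8.0);
    printf("%f\n", dot4(a, b));  /* 5 + 12 + 21 + 32 = 70 */
    return 0;
}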
Example #2
inline void rotate_left_wm1(F64vec4 *v0, const F64vec4 v1)
{
    // {1.0, 2.0, 3.0, 4.0};
    // {5.0, 6.0, 7.0, 8.0};

    const __m128d hiv0      = _mm256_extractf128_pd(*v0, 1); // {3.0, 4.0}
    const __m128d phiv0     = _mm_permute_pd(hiv0, 0x1); // {4.0, 3.0}
    const __m256d shufv1    = _mm256_permute_pd(v1, 0x1); // {6.0, 5.0, 7.0, 7.0} (imm 0x1 only swaps the low 128-bit lane)
    const __m128d shufv1_lo = _mm256_extractf128_pd(shufv1, 0); // {6.0, 5.0}
    const __m128d shufv1_hi = _mm256_extractf128_pd(shufv1, 1); // {7.0, 7.0}
    const __m128d v1_blend  = _mm_blend_pd(shufv1_lo, shufv1_hi, 0x2); // blend   {6.0, 7.0};
    const __m256d inserted  = _mm256_insertf128_pd(shufv1, v1_blend, 1); // insert  {6.0, 5.0, 6.0, 7.0};
    const __m256d blended   = _mm256_blend_pd(_mm256_castpd128_pd256(phiv0), inserted, 0xE); // {4.0, 5.0, 6.0, 7.0}
    *v0                     = blended;
}
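
Tracing the same sequence on raw __m256d values makes the lane bookkeeping easy to check; a minimal sketch (F64vec4 is assumed here to be a thin wrapper over __m256d, as in Intel's dvec.h, so plain intrinsics are used):

#include <immintrin.h>
#include <stdio.h>

int main(void)
{
    __m256d v0       = _mm256_setr_pd(1.0, 2.0, 3.0, 4.0);
    const __m256d v1 = _mm256_setr_pd(5.0, 6.0, 7.0, 8.0);

    /* same sequence as rotate_left_wm1, inlined on raw __m256d */
    const __m128d hiv0      = _mm256_extractf128_pd(v0, 1);              /* {3, 4}       */
    const __m128d phiv0     = _mm_permute_pd(hiv0, 0x1);                 /* {4, 3}       */
    const __m256d shufv1    = _mm256_permute_pd(v1, 0x1);                /* {6, 5, 7, 7} */
    const __m128d shufv1_lo = _mm256_castpd256_pd128(shufv1);            /* {6, 5}       */
    const __m128d shufv1_hi = _mm256_extractf128_pd(shufv1, 1);          /* {7, 7}       */
    const __m128d v1_blend  = _mm_blend_pd(shufv1_lo, shufv1_hi, 0x2);   /* {6, 7}       */
    const __m256d inserted  = _mm256_insertf128_pd(shufv1, v1_blend, 1); /* {6, 5, 6, 7} */
    v0 = _mm256_blend_pd(_mm256_castpd128_pd256(phiv0), inserted, 0xE);  /* {4, 5, 6, 7} */

    double out[4];
    _mm256_storeu_pd(out, v0);
    printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]);  /* prints: 4 5 6 7 */
    return 0;
}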
Example #3
double HodgkinHuxley::dV(double *V, double I) {
	const double C = 1.0;
	const double gNa = 120.0;
	const double gK = 36.0;
	const double gL = 0.3;
	const double ENa = 50.0;
	const double EK = -77.0;
	const double EL = -54.4;
#ifdef __AVX__
/*
AVX is an Intel instruction-set extension that operates on 4 doubles at once.
In this case it seems to be slower than well-optimized scalar FPU code, though.
*/
	double Va[] __attribute__ ((aligned (32))) = {V[0], V[0], V[0], 1.0},
		   Ea[] __attribute__ ((aligned (32))) = {EL, ENa, EK, 0.0},
		   Ga[] __attribute__ ((aligned (32))) = {-gL, -gNa * pow(V[2], 3.0) * V[3], -gK * pow(V[1], 4.0), I};
	
	// load V
	__m256d Vr = _mm256_load_pd(Va);
	// load E
	__m256d Er = _mm256_load_pd(Ea);
	// load G
	__m256d Gr = _mm256_load_pd(Ga);
	// subtract
	Vr = _mm256_sub_pd(Vr, Er);
	// dot product (why does Intel not provide _mm256_dp_pd?)
	Vr = _mm256_mul_pd(Vr, Gr);
	__m256d temp = _mm256_hadd_pd(Vr, Vr);
	__m128d lo128 = _mm256_extractf128_pd(temp, 0);
	__m128d hi128 = _mm256_extractf128_pd(temp, 1);
	__m128d dotproduct = _mm_add_pd(lo128, hi128);
	
	double sseVal;
	// store
	_mm_storel_pd(&sseVal, dotproduct);
	sseVal /= C;
		
	return sseVal;
#else
	return (-gL * (V[0] - EL) - gNa * pow(V[2], 3.0) * V[3] * (V[0] - ENa)
		- gK * pow(V[1], 4.0) * (V[0] - EK) + I) / C;
#endif
}
Example #4
void
test1bit (void)
{
  d1 = _mm256_extractf128_pd (e2, k4);	  /* { dg-error "the last argument must be a 1-bit immediate" } */
  a1 = _mm256_extractf128_ps (b2, k4);	  /* { dg-error "the last argument must be a 1-bit immediate" } */
  i1 = _mm256_extractf128_si256 (l2, k4); /* { dg-error "the last argument must be a 1-bit immediate" } */
  e1 = _mm256_insertf128_pd (e2, d1, k4); /* { dg-error "the last argument must be a 1-bit immediate" } */
  b1 = _mm256_insertf128_ps (b2, a1, k4); /* { dg-error "the last argument must be a 1-bit immediate" } */
  l1 = _mm256_insertf128_si256 (l2, i1, k4);/* { dg-error "the last argument must be a 1-bit immediate" } */
}
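
For contrast, the same intrinsics are accepted when the lane selector is a literal 0 or 1; a minimal sketch (the function names are illustrative and not part of the GCC test file above):

#include <immintrin.h>

/* With a compile-time 0/1 immediate, no diagnostic is emitted. */
__m128d take_low  (__m256d v)            { return _mm256_extractf128_pd(v, 0); }
__m128d take_high (__m256d v)            { return _mm256_extractf128_pd(v, 1); }
__m256d put_high  (__m256d v, __m128d h) { return _mm256_insertf128_pd(v, h, 1); }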
Example #5
 double hadd(const vector4d& rhs)
 {
     // rhs = (x0, x1, x2, x3)
     // tmp = (x2, x3, x0, x1)
     __m256d tmp = _mm256_permute2f128_pd(rhs, rhs, 1);
     // tmp = (x2+x0, x3+x1, -, -)
     tmp = _mm256_add_pd(rhs, tmp);
     // tmp = (x2+x0+x3+x1, -, -, -)
     tmp = _mm256_hadd_pd(tmp, tmp);
     return _mm_cvtsd_f64(_mm256_extractf128_pd(tmp, 0));
 }
Example #6
		inline float64x4_t mat4_mul_vec4(const float64x4_t ymm[4], const float64x4_t ymm_v)
		{
			float64x4_t perm0 = _mm256_permute_pd(ymm_v, 0x0); // x x z z
			float64x4_t perm1 = _mm256_permute_pd(ymm_v, 0xF); // y y w w

			// _mm256_broadcast_pd needs an addressable __m128d, so keep the extracted halves in named locals.
			float64x2_t xx = _mm256_extractf128_pd(perm0, 0);
			float64x2_t yy = _mm256_extractf128_pd(perm1, 0);
			float64x2_t zz = _mm256_extractf128_pd(perm0, 1);
			float64x2_t ww = _mm256_extractf128_pd(perm1, 1);

			float64x4_t bcast0 = _mm256_broadcast_pd(&xx); // x x x x
			float64x4_t bcast1 = _mm256_broadcast_pd(&yy); // y y y y
			float64x4_t bcast2 = _mm256_broadcast_pd(&zz); // z z z z
			float64x4_t bcast3 = _mm256_broadcast_pd(&ww); // w w w w

			float64x4_t mul0 = _mm256_mul_pd(ymm[0], bcast0);
			float64x4_t mul1 = _mm256_mul_pd(ymm[1], bcast1);
			float64x4_t mul2 = _mm256_mul_pd(ymm[2], bcast2);
			float64x4_t mul3 = _mm256_mul_pd(ymm[3], bcast3);

			float64x4_t add0 = _mm256_add_pd(mul0, mul1);
			float64x4_t add1 = _mm256_add_pd(mul2, mul3);
			float64x4_t add2 = _mm256_add_pd(add0, add1);

			return add2;
		}
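
A quick sanity check for the routine above, assuming a column-major layout (ymm[i] holds column i of the matrix); the test values are illustrative, and the typedefs plus mat4_mul_vec4 from the example are assumed to be in scope:

#include <immintrin.h>
#include <stdio.h>

/* assumes the float64x4_t/float64x2_t aliases and mat4_mul_vec4 above */
int main(void)
{
	float64x4_t cols[4] = {
		_mm256_setr_pd(1.0, 0.0, 0.0, 0.0), // column 0
		_mm256_setr_pd(0.0, 2.0, 0.0, 0.0), // column 1
		_mm256_setr_pd(0.0, 0.0, 3.0, 0.0), // column 2
		_mm256_setr_pd(0.0, 0.0, 0.0, 4.0), // column 3
	};
	float64x4_t v = _mm256_setr_pd(1.0, 1.0, 1.0, 1.0);

	double out[4];
	_mm256_storeu_pd(out, mat4_mul_vec4(cols, v));
	printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]); // expect: 1 2 3 4
	return 0;
}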
Example #7
ALGEBRA_INLINE double	vector_ps_double (const double* pa,const double* pb,size_t n) {
    if(ALGEBRA_IS_ALIGNED(pa) && ALGEBRA_IS_ALIGNED(pb)) {
        size_t q = n/4;
        size_t r = n%4;
        double w = 0;

        if(q>0) {
            __m256d acc = _mm256_setzero_pd();
            __m256d i1 = _mm256_load_pd(pa);
            __m256d j1 = _mm256_load_pd(pb);
            pa += 4;
            pb += 4;
            __m256d s = _mm256_mul_pd(i1, j1);
            acc = _mm256_add_pd(acc, s);

            while(--q != 0) {
                // load
                i1 = _mm256_load_pd(pa);
                j1 = _mm256_load_pd(pb);
                pa += 4;
                pb += 4;
                // multiply
                s = _mm256_mul_pd(i1, j1);
                // accumulate
                acc = _mm256_add_pd(acc, s);
            }
            // final sum
            // horizontal add
            acc = _mm256_hadd_pd(acc, acc);
            // swap the high and low 128-bit halves
            __m256d accp = _mm256_permute2f128_pd(acc, acc, 1);
            // add vertical
            acc = _mm256_add_pd(acc, accp);
            // extract
            _mm_store_sd(&w,  _mm256_extractf128_pd(acc,0));
        }
        return w + vector_ps_double_basic(pa, pb, r);
    }
    return vector_ps_double_basic(pa, pb, n);
}
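
The scalar helper vector_ps_double_basic is not shown above; a plausible version (the name and signature are taken from the call sites, the body is an assumption) would be:

#include <stddef.h>

/* Assumed fallback: plain scalar dot product, used for unaligned inputs
   and for the remaining n%4 elements. */
static double vector_ps_double_basic(const double* pa, const double* pb, size_t n)
{
    double w = 0.0;
    for (size_t i = 0; i < n; ++i)
        w += pa[i] * pb[i];
    return w;
}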
Example #8
int main(void)
{
   // std::cout<<std::endl<<" Compute inner product..."<<std::endl<<std::endl;

    // INIT VECTOR
    //double vec1    [_PBM_SIZE] __attribute__((aligned(_CBSIM_DBL_ALIGN_)));//__declspec(align(n))
    //double vec2    [_PBM_SIZE] __attribute__((aligned(_CBSIM_DBL_ALIGN_)));
     //__declspec(align(_CBSIM_DBL_ALIGN_)) double vec1    [_PBM_SIZE];
     //__declspec(align(_CBSIM_DBL_ALIGN_)) double vec2    [_PBM_SIZE];

    //double *vec1 = _aligned_malloc(_PBM_SIZE*sizeof *vec1,_CBSIM_DBL_ALIGN_);
    //double *vec2 = _aligned_malloc(_PBM_SIZE*sizeof *vec2,_CBSIM_DBL_ALIGN_);


	double *vec1 =(double *)_mm_malloc(sizeof(double)*_PBM_SIZE,32);
    double *vec2 =(double *)_mm_malloc(sizeof(double)*_PBM_SIZE,32);

    double result = 0.0;
//    tbb::tick_count t1, t2;
    int loopsToDo = 10000;
    for (int i=0 ; i < _PBM_SIZE ; i++)
    {
        vec1[i] = static_cast<double>(i)*0.01;
        vec2[i] = static_cast<double>(i)*0.01;
    }


// SERIAL ***********************************************************************************
//    t1 = tbb::tick_count::now();


    for (int z=0 ; z < loopsToDo ; z++)
    {
    //__m256d ymm0;
    //__m256d ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7;//, ymm8, ymm9, ymm10, ymm11, ymm12, ymm13, ymm14, ymm15, ymm16, ymm17, ymm18;
    //ymm0 = _mm256_setzero_pd(); // accumulator
    //double res0 = 0.0, res1 = 0.0, res2 = 0.0, res3 = 0.0;
    //__m256d acc = _mm256_setzero_pd();
    //double res[4] __attribute__((aligned(_CBSIM_DBL_ALIGN_))) = {0.0, 0.0, 0.0, 0.0};
        result = 0.0;
    //double res[2] __attribute__((aligned(_CBSIM_DBL_ALIGN_))) = {0.0, 0.0};
    for (int i=0 ; i < _PBM_SIZE; i+=8)
    {
/*        __m256d ymm1 = _mm256_load_pd(&vec1[i]);
        __m256d ymm2 = _mm256_load_pd(&vec2[i]);
        __m256d ymm3 = _mm256_mul_pd( ymm1, ymm2 );

        __m128d xmm1 = _mm256_extractf128_pd(ymm3,0);
        __m128d xmm2 = _mm256_extractf128_pd(ymm3,1);
        __m128d xmm3 = _mm_hadd_pd(xmm1,xmm2);

        _mm_store_pd(&res[0],xmm3);

        //_mm256_store_pd(&res[0],ymm12);
        result += (res[0] + res[1]);// + (res[2] + res[3]);
*/
        __assume_aligned(&vec1[0],32);
        __assume_aligned(&vec2[0],32);
       __m256d ymm1 = _mm256_load_pd(&vec1[i]);
        __m256d ymm2 = _mm256_load_pd(&vec2[i]);
        __m256d ymm3 = _mm256_mul_pd( ymm1, ymm2 );

        __m256d ymm4 = _mm256_load_pd(&vec1[i+4]);
        __m256d ymm5 = _mm256_load_pd(&vec2[i+4]);
        __m256d ymm6 = _mm256_mul_pd( ymm4, ymm5 );

        __m256d ymm7 = _mm256_add_pd( ymm3, ymm6);
        
        __m128d xmm1 = _mm256_extractf128_pd(ymm7,0);
        __m128d xmm2 = _mm256_extractf128_pd(ymm7,1);
        __m128d xmm3 = _mm_hadd_pd(xmm1,xmm2);
        double res[2] __attribute__((aligned(_CBSIM_DBL_ALIGN_))) = {0.0, 0.0};
        _mm_store_pd(&res[0],xmm3);

        //_mm256_store_pd(&res[0],ymm12);
        result += (res[0] + res[1]);// + (res[2] + res[3]);

        //__m256d ymm0 = _mm256_add_pd( ymm0, ymm7);

/*        //__assume_aligned(&vec1[0],32);
        //__assume_aligned(&vec2[0],32);
        __m256d ymm1 = _mm256_load_pd(&vec1[i]);
        __m256d ymm2 = _mm256_load_pd(&vec2[i]);
        __m256d ymm3 = _mm256_mul_pd( ymm1, ymm2 );

        __m256d ymm4 = _mm256_load_pd(&vec1[i+4]);
        __m256d ymm5 = _mm256_load_pd(&vec2[i+4]);
        //__m256d ymm6 = _mm256_mul_pd( ymm4, ymm5 );

        //__m256d ymm7 = _mm256_add_pd( ymm3, ymm6);
        __m256d ymm6 = _mm256_fmadd_pd (ymm4,ymm5,ymm3);
        //ymm0 = _mm256_add_pd( ymm0, ymm7);


        __m128d xmm1 = _mm256_extractf128_pd(ymm6,0);
        __m128d xmm2 = _mm256_extractf128_pd(ymm6,1);;
        __m128d xmm3 = _mm_hadd_pd(xmm1,xmm2);

        _mm_store_pd(&res[0],xmm3);

        //_mm256_store_pd(&res[0],ymm12);
        result += (res[0] + res[1]);// + (res[2] + res[3]);

        //_mm256_store_pd(&res[0],ymm6);
        //result_SIMD_INTRINSICS += (res[0] + res[1]) + (res[2] + res[3]);
*/

//#define _VER_AVX
#ifdef _VER_AVX
        __m256d ymm1 = _mm256_load_pd(&vec1[i]);
        __m256d ymm2 = _mm256_load_pd(&vec2[i]);
        __m256d ymm3 = _mm256_mul_pd( ymm1, ymm2 );

        __m256d ymm4 = _mm256_load_pd(&vec1[i+4]);
        __m256d ymm5 = _mm256_load_pd(&vec2[i+4]);
        __m256d ymm6 = _mm256_mul_pd( ymm4, ymm5 );

        __m256d ymm7 = _mm256_load_pd(&vec1[i+8]);
        __m256d ymm8 = _mm256_load_pd(&vec2[i+8]);
        __m256d ymm9 = _mm256_mul_pd( ymm7, ymm8 );

        __m256d ymm10 = _mm256_load_pd(&vec1[i+12]);
        __m256d ymm11 = _mm256_load_pd(&vec2[i+12]);
        __m256d ymm12 = _mm256_mul_pd( ymm10, ymm11 );

        __m256d ymm13 = _mm256_add_pd( ymm3, ymm6);
        __m256d ymm14 = _mm256_add_pd( ymm9, ymm12);
        __m256d ymm15 = _mm256_add_pd( ymm13, ymm14);

        __m128d xmm1 = _mm256_extractf128_pd(ymm15,0);
        __m128d xmm2 = _mm256_extractf128_pd(ymm15,1);
        __m128d xmm3 = _mm_hadd_pd(xmm1,xmm2);
        double res_SIMD_INTRINSICS[2] __attribute__((aligned(_CBSIM_DBL_ALIGN_))) = {0.0, 0.0};
        _mm_store_pd(&res_SIMD_INTRINSICS[0],xmm3);

        result += (res_SIMD_INTRINSICS[0] + res_SIMD_INTRINSICS[1]);

        //ymm0 = _mm256_add_pd( ymm0, ymm13);
        //ymm0 = _mm256_add_pd( ymm0, ymm14);
#endif
//#define _VER_AVX2
#ifdef _VER_AVX2
        __m256d ymm1 = _mm256_load_pd(&vec1[i]);
        __m256d ymm2 = _mm256_load_pd(&vec1[i+4]);
        __m256d ymm3 = _mm256_load_pd(&vec1[i+8]);
        __m256d ymm4 = _mm256_load_pd(&vec1[i+12]);
        //__m256d ymm13 = _mm256_load_pd(&vec1[i+16]);
        //__m256d ymm14 = _mm256_load_pd(&vec1[i+20]);
        //__m256d ymm15 = _mm256_load_pd(&vec1[i+24]);
        //__m256d ymm16 = _mm256_load_pd(&vec1[i+28]);

        __m256d ymm5 = _mm256_load_pd(&vec2[i]);
        __m256d ymm6 = _mm256_load_pd(&vec2[i+4]);
        __m256d ymm7 = _mm256_load_pd(&vec2[i+8]);
        __m256d ymm8 = _mm256_load_pd(&vec2[i+12]);
        //__m256d ymm17 = _mm256_load_pd(&vec2[i+16]);
        //__m256d ymm18 = _mm256_load_pd(&vec2[i+20]);
        //__m256d ymm19 = _mm256_load_pd(&vec2[i+24]);
        //__m256d ymm20 = _mm256_load_pd(&vec2[i+28]);

        __m256d ymm9 = _mm256_mul_pd(ymm1,ymm5);
        __m256d ymm10 = _mm256_fmadd_pd(ymm2,ymm6,ymm9);
        //__m256d ymm11 = _mm256_mul_pd(ymm3,ymm7);
        __m256d ymm11 = _mm256_fmadd_pd(ymm3,ymm7,ymm10);
        __m256d ymm12 = _mm256_fmadd_pd(ymm4,ymm8,ymm11);
        //ymm12 = _mm256_hadd_pd(ymm10,ymm12);
        //__m256d ymm21 = _mm256_fmadd_pd(ymm13,ymm17,ymm12);
        //__m256d ymm22 = _mm256_fmadd_pd(ymm14,ymm18,ymm21);
        //__m256d ymm23 = _mm256_fmadd_pd(ymm15,ymm19,ymm22);
        //__m256d ymm24 = _mm256_fmadd_pd(ymm16,ymm20,ymm23);
        __m128d xmm1 = _mm256_extractf128_pd(ymm12,0);
        __m128d xmm2 = _mm256_extractf128_pd(ymm12,1);
        __m128d xmm3 = _mm_hadd_pd(xmm1,xmm2);
        double res[2] __attribute__((aligned(_CBSIM_DBL_ALIGN_))) = {0.0, 0.0};
        _mm_store_pd(&res[0],xmm3);

        //_mm256_store_pd(&res[0],ymm12);
        result += (res[0] + res[1]);// + (res[2] + res[3]);
#endif

/*        // Performing 4 dot product at one time
        ymm1  = _mm256_load_pd(&vec1[i]);    // x[0]
        ymm2  = _mm256_load_pd(&vec1[i+4]);  // x[1]
        ymm3  = _mm256_load_pd(&vec1[i+8]);  // x[2]
        ymm4  = _mm256_load_pd(&vec1[i+12]); // x[3]

        ymm5  = _mm256_load_pd(&vec2[i]);    // y[0]
        ymm6  = _mm256_load_pd(&vec2[i+4]);  // y[1]
        ymm7  = _mm256_load_pd(&vec2[i+8]);  // y[2]
        ymm8  = _mm256_load_pd(&vec2[i+12]); // y[3]

        ymm9  = _mm256_mul_pd( ymm1, ymm5 );   // xy0
        ymm10 = _mm256_mul_pd( ymm2, ymm6 );   // xy1
        ymm11 = _mm256_hadd_pd( ymm9, ymm10 ); // low to high: xy00+xy01 xy10+xy11 xy02+xy03 xy12+xy13

        ymm12 = _mm256_mul_pd( ymm3, ymm7 ); // xy2
        ymm13 = _mm256_mul_pd( ymm4, ymm8 ); // xy3
        ymm14 = _mm256_hadd_pd( ymm12, ymm13 );  // low to high: xy20+xy21 xy30+xy31 xy22+xy23 xy32+xy33

        ymm15 = _mm256_permute2f128_pd( ymm11, ymm14, 0x21 ); // low to high: xy02+xy03 xy12+xy13 xy20+xy21 xy30+xy31

        ymm1  = _mm256_blend_pd( ymm11, ymm14, 0b1100); // low to high: xy00+xy01 xy10+xy11 xy22+xy23 xy32+xy33

        ymm2 = _mm256_add_pd( ymm15, ymm1 );

        ymm0 = _mm256_add_pd( ymm0, ymm2 );
*/

/*        __m256d x[4], y[4];
        x[0] = _mm256_load_pd(&vec1[i]);
        x[1] = _mm256_load_pd(&vec1[i+4]);
        x[2] = _mm256_load_pd(&vec1[i+8]);
        x[3] = _mm256_load_pd(&vec1[i+12]);
        y[0] = _mm256_load_pd(&vec2[i]);
        y[1] = _mm256_load_pd(&vec2[i+4]);
        y[2] = _mm256_load_pd(&vec2[i+8]);
        y[3] = _mm256_load_pd(&vec2[i+12]);

        __m256d xy0 = _mm256_mul_pd( x[0], y[0] );
        __m256d xy1 = _mm256_mul_pd( x[1], y[1] );
        // low to high: xy00+xy01 xy10+xy11 xy02+xy03 xy12+xy13
        __m256d temp01 = _mm256_hadd_pd( xy0, xy1 );

        __m256d xy2 = _mm256_mul_pd( x[2], y[2] );
        __m256d xy3 = _mm256_mul_pd( x[3], y[3] );
        // low to high: xy20+xy21 xy30+xy31 xy22+xy23 xy32+xy33
        __m256d temp23 = _mm256_hadd_pd( xy2, xy3 );

        // low to high: xy02+xy03 xy12+xy13 xy20+xy21 xy30+xy31
        __m256d swapped = _mm256_permute2f128_pd( temp01, temp23, 0x21 );

        // low to high: xy00+xy01 xy10+xy11 xy22+xy23 xy32+xy33
        __m256d blended = _mm256_blend_pd(temp01, temp23, 0b1100);

        __m256d dotproduct = _mm256_add_pd( swapped, blended );
*/
        //ymm0 = _mm256_add_pd(ymm0,dotproduct);

/*        __m128d xmm1 = _mm256_extractf128_pd(dotproduct,0);
        __m128d xmm2 = _mm256_extractf128_pd(dotproduct,1);;
        __m128d xmm3 = _mm_hadd_pd(xmm1,xmm2);
        double res[2] __attribute__((aligned(_CBSIM_DBL_ALIGN_))) = {0.0, 0.0};
        _mm_store_pd(&res[0],xmm3);

        //_mm256_store_pd(&res[0],ymm12);
        result += (res[0] + res[1]);// + (res[2] + res[3]);
*/

//        _mm256_store_pd(&res[0],dotproduct);
//        result += (res[0] + res[1]) + (res[2] + res[3]);

        //result_SIMD_INTRINSICS += dotproduct[0] + dotproduct[1] + dotproduct[2] + dotproduct[3];

        //double res[4] __attribute__((aligned(_CBSIM_DBL_ALIGN_)));
        //_mm256_store_pd(&res[0],ymm0);
        //result_SIMD_INTRINSICS += res[0] + res[1] + res[2] + res[3];
        //double* res = (double*)&ymm0;
        //result_SIMD_INTRINSICS += res[0] + res[1] + res[2] + res[3];
    }
    //double* res = (double*)&ymm0;
    //result_SIMD_INTRINSICS += res[0] + res[1] + res[2] + res[3];
    //double res[4] __attribute__((aligned(_CBSIM_DBL_ALIGN_)));
    //_mm256_store_pd(&res[0],ymm0);
    //result_SIMD_INTRINSICS += res[0] + res[1] + res[2] + res[3];
}


//    t2 = tbb::tick_count::now();
//    double exec_time = 1000.0*(t2-t1).seconds();


    //std::cout << std::setiosflags(std::ios::fixed) << std::setprecision(5);
    std::cout<<std::endl<<"RESULTS: " <<std::endl;
    std::cout<<"result_intrin ----------: "<< result << std::endl;
    //std::cout<<"result_intrin ----------: "<< result << ", time: " << 1000.0*(t2-t1).seconds() << " ms" << std::endl;

    std::cout<<std::endl<<"Program end. "<<std::endl<<std::endl;
    return 0;
}
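
Stripped of the commented-out variants, the active path of the inner loop above boils down to a small helper; a self-contained sketch (the name is illustrative; as in the example, n is assumed to be a multiple of 8 and the pointers 32-byte aligned):

#include <immintrin.h>

/* Two 4-wide products per iteration, reduced with extractf128 + hadd. */
static double dot_avx_unrolled2(const double* a, const double* b, int n)
{
    double result = 0.0;
    for (int i = 0; i < n; i += 8) {
        __m256d p0 = _mm256_mul_pd(_mm256_load_pd(&a[i]),     _mm256_load_pd(&b[i]));
        __m256d p1 = _mm256_mul_pd(_mm256_load_pd(&a[i + 4]), _mm256_load_pd(&b[i + 4]));
        __m256d s  = _mm256_add_pd(p0, p1);

        __m128d lo = _mm256_castpd256_pd128(s);
        __m128d hi = _mm256_extractf128_pd(s, 1);
        __m128d h  = _mm_hadd_pd(lo, hi);   /* {s0+s1, s2+s3} */

        double tmp[2];
        _mm_storeu_pd(tmp, h);
        result += tmp[0] + tmp[1];
    }
    return result;
}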
Example #9
void AVX2FMA3DNoise(Vector3d& result, const Vector3d& EPoint)
{

#if CHECK_FUNCTIONAL
    Vector3d param(EPoint);
#endif

    AVX2TABLETYPE *mp;

    // TODO FIXME - global statistics reference
    // Stats[Calls_To_DNoise]++;

    const __m256d ONE_PD = _mm256_set1_pd(1.0);
    const __m128i short_si128 = _mm_set1_epi32(0xffff);

    const __m256d xyzn = _mm256_setr_pd(EPoint[X], EPoint[Y], EPoint[Z], 0);
    const __m256d epsy = _mm256_set1_pd(1.0 - EPSILON);
    const __m256d xyzn_e = _mm256_sub_pd(xyzn, epsy);
    const __m128i tmp_xyzn = _mm256_cvttpd_epi32(_mm256_blendv_pd(xyzn, xyzn_e, xyzn));

    const __m128i noise_min_xyzn = _mm_setr_epi32(NOISE_MINX, NOISE_MINY, NOISE_MINZ, 0);

    const __m256d xyz_ixyzn = _mm256_sub_pd(xyzn, _mm256_cvtepi32_pd(tmp_xyzn));
    const __m256d xyz_jxyzn = _mm256_sub_pd(xyz_ixyzn, ONE_PD);

    const __m128i i_xyzn = _mm_and_si128(_mm_sub_epi32(tmp_xyzn, noise_min_xyzn),
        _mm_set1_epi32(0xfff));

    const __m256d s_xyzn = _mm256_mul_pd(xyz_ixyzn,
        _mm256_mul_pd(xyz_ixyzn,
            _mm256_sub_pd(_mm256_set1_pd(3.0),
                _mm256_add_pd(xyz_ixyzn, xyz_ixyzn))));

    const __m256d t_xyzn = _mm256_sub_pd(ONE_PD, s_xyzn);

    const __m256d txtysxsy = _mm256_permute2f128_pd(t_xyzn, s_xyzn, 0x20);
    const __m256d txsxtxsx = PERMUTE4x64(txtysxsy, _MM_SHUFFLE(2, 0, 2, 0));
    const __m256d tytysysy = PERMUTE4x64(txtysxsy, _MM_SHUFFLE(3, 3, 1, 1));

    const __m256d txtysxtytxsysxsy = _mm256_mul_pd(txsxtxsx, tytysysy);

    const __m256d incrsump_s1 = _mm256_mul_pd(txtysxtytxsysxsy, PERMUTE4x64(t_xyzn, _MM_SHUFFLE(2, 2, 2, 2)));
    const __m256d incrsump_s2 = _mm256_mul_pd(txtysxtytxsysxsy, PERMUTE4x64(s_xyzn, _MM_SHUFFLE(2, 2, 2, 2)));

    int ints[4];
    _mm_storeu_si128((__m128i*)(ints), i_xyzn);

    const int ixiy_hash = Hash2d(ints[0], ints[1]);
    const int jxiy_hash = Hash2d(ints[0] + 1, ints[1]);
    const int ixjy_hash = Hash2d(ints[0], ints[1] + 1);
    const int jxjy_hash = Hash2d(ints[0] + 1, ints[1] + 1);

    const int iz = ints[2];

    const __m256d iii = _mm256_blend_pd(PERMUTE4x64(xyz_ixyzn, _MM_SHUFFLE(2, 1, 0, 0)), _mm256_set_pd(0, 0, 0, 0.5), 0x1);
    const __m256d jjj = _mm256_blend_pd(PERMUTE4x64(xyz_jxyzn, _MM_SHUFFLE(2, 1, 0, 0)), _mm256_set_pd(0, 0, 0, 0.5), 0x1);

    __m256d ss;
    __m256d blend;

    __m256d x = _mm256_setzero_pd(), y = _mm256_setzero_pd(), z = _mm256_setzero_pd();


    mp = &AVX2RTable[Hash1dRTableIndexAVX(ixiy_hash, iz)];
    ss = PERMUTE4x64(incrsump_s1, _MM_SHUFFLE(0, 0, 0, 0));
    //     blend = _mm256_blend_pd(iii, jjj, 0);

    INCSUMAVX_VECTOR(mp, ss, iii);

    mp = &AVX2RTable[Hash1dRTableIndexAVX(jxiy_hash, iz)];
    ss = PERMUTE4x64(incrsump_s1, _MM_SHUFFLE(1, 1, 1, 1));
    blend = _mm256_blend_pd(iii, jjj, 2);

    INCSUMAVX_VECTOR(mp, ss, blend);

    mp = &AVX2RTable[Hash1dRTableIndexAVX(jxjy_hash, iz)];
    ss = PERMUTE4x64(incrsump_s1, _MM_SHUFFLE(3, 3, 3, 3));
    blend = _mm256_blend_pd(iii, jjj, 6);

    INCSUMAVX_VECTOR(mp, ss, blend);

    mp = &AVX2RTable[Hash1dRTableIndexAVX(ixjy_hash, iz)];
    ss = PERMUTE4x64(incrsump_s1, _MM_SHUFFLE(2, 2, 2, 2));
    blend = _mm256_blend_pd(iii, jjj, 4);

    INCSUMAVX_VECTOR(mp, ss, blend);

    mp = &AVX2RTable[Hash1dRTableIndexAVX(ixjy_hash, iz + 1)];
    ss = PERMUTE4x64(incrsump_s2, _MM_SHUFFLE(2, 2, 2, 2));
    blend = _mm256_blend_pd(iii, jjj, 12);

    INCSUMAVX_VECTOR(mp, ss, blend);

    mp = &AVX2RTable[Hash1dRTableIndexAVX(jxjy_hash, iz + 1)];
    ss = PERMUTE4x64(incrsump_s2, _MM_SHUFFLE(3, 3, 3, 3));
    //     blend = _mm256_blend_pd(iii, jjj, 14);

    INCSUMAVX_VECTOR(mp, ss, jjj);

    mp = &AVX2RTable[Hash1dRTableIndexAVX(jxiy_hash, iz + 1)];
    ss = PERMUTE4x64(incrsump_s2, _MM_SHUFFLE(1, 1, 1, 1));
    blend = _mm256_blend_pd(iii, jjj, 10);

    INCSUMAVX_VECTOR(mp, ss, blend);

    mp = &AVX2RTable[Hash1dRTableIndexAVX(ixiy_hash, iz + 1)];
    ss = PERMUTE4x64(incrsump_s2, _MM_SHUFFLE(0, 0, 0, 0));
    blend = _mm256_blend_pd(iii, jjj, 8);

    INCSUMAVX_VECTOR(mp, ss, blend);


    __m256d xy = _mm256_hadd_pd(x,y);
    __m128d xy_up = _mm256_extractf128_pd(xy,1);
    xy_up = _mm_add_pd(_mm256_castpd256_pd128(xy),xy_up);
    _mm_storeu_pd(&result[X],xy_up);

    __m128d z_up = _mm256_extractf128_pd(z,1);
    z_up = _mm_add_pd(_mm256_castpd256_pd128(z),z_up);
    z_up = _mm_hadd_pd(z_up,z_up);
    result[Z] = _mm_cvtsd_f64(z_up);


#if CHECK_FUNCTIONAL
    {
        Vector3d portable_res;
        PortableDNoise(portable_res , param);
        if (fabs(portable_res[X] - result[X]) >= EPSILON)
        {
            throw POV_EXCEPTION_STRING("DNoise X error");
        }
        if (fabs(portable_res[Y] - result[Y]) >= EPSILON)
        {
            throw POV_EXCEPTION_STRING("DNoise Y error");
        }
        if (fabs(portable_res[Z] - result[Z]) >= EPSILON)
        {
            throw POV_EXCEPTION_STRING("DNoise Z error");
        }

    }

#endif



    _mm256_zeroupper();
    return;

}
Example #10
DBL AVX2FMA3Noise(const Vector3d& EPoint, int noise_generator)
{
    AVX2TABLETYPE *mp;
    DBL sum = 0.0;

    // TODO FIXME - global statistics reference
    // Stats[Calls_To_Noise]++;

    if (noise_generator == kNoiseGen_Perlin)
    {
        // The 1.59 and 0.985 are to correct for some biasing problems with
        // the random # generator used to create the noise tables.  Final
        // range of values is about 5.0e-4 below 0.0 and above 1.0.  Mean
        // value is 0.49 (ideally it would be 0.5).
        sum = 0.5 * (1.59 * SolidNoise(EPoint) + 0.985);

        // Clamp final value to 0-1 range
        if (sum < 0.0) sum = 0.0;
        if (sum > 1.0) sum = 1.0;

        return sum;
    }

    const __m256d ONE_PD = _mm256_set1_pd(1);
    const __m128i short_si128 = _mm_set1_epi32(0xffff);

    const __m256d xyzn = _mm256_setr_pd(EPoint[X], EPoint[Y], EPoint[Z], 0);
    const __m256d epsy = _mm256_set1_pd(1.0 - EPSILON);
    const __m256d xyzn_e = _mm256_sub_pd(xyzn, epsy);
    const __m128i tmp_xyzn = _mm256_cvttpd_epi32(_mm256_blendv_pd(xyzn, xyzn_e, xyzn));

    const __m128i noise_min_xyzn = _mm_setr_epi32(NOISE_MINX, NOISE_MINY, NOISE_MINZ, 0);

    const __m256d xyz_ixyzn = _mm256_sub_pd(xyzn, _mm256_cvtepi32_pd(tmp_xyzn));
    const __m256d xyz_jxyzn = _mm256_sub_pd(xyz_ixyzn, ONE_PD);

    const __m128i i_xyzn = _mm_and_si128(_mm_sub_epi32(tmp_xyzn, noise_min_xyzn),
        _mm_set1_epi32(0xfff));

    const __m256d s_xyzn = _mm256_mul_pd(xyz_ixyzn,
        _mm256_mul_pd(xyz_ixyzn,
            _mm256_sub_pd(_mm256_set1_pd(3.0),
                _mm256_add_pd(xyz_ixyzn, xyz_ixyzn))));

    const __m256d t_xyzn = _mm256_sub_pd(ONE_PD, s_xyzn);

    const __m256d txtysxsy = _mm256_permute2f128_pd(t_xyzn, s_xyzn, 0x20);
    const __m256d txsxtxsx = PERMUTE4x64(txtysxsy, _MM_SHUFFLE(2, 0, 2, 0));
    const __m256d tytysysy = PERMUTE4x64(txtysxsy, _MM_SHUFFLE(3, 3, 1, 1));

    const __m256d txtysxtytxsysxsy = _mm256_mul_pd(txsxtxsx, tytysysy);

    const __m256d incrsump_s1 = _mm256_mul_pd(txtysxtytxsysxsy, PERMUTE4x64(t_xyzn, _MM_SHUFFLE(2, 2, 2, 2)));
    const __m256d incrsump_s2 = _mm256_mul_pd(txtysxtytxsysxsy, PERMUTE4x64(s_xyzn, _MM_SHUFFLE(2, 2, 2, 2)));

    int ints[4];
    _mm_storeu_si128((__m128i*)(ints), i_xyzn);

    const int ixiy_hash = Hash2d(ints[0], ints[1]);
    const int jxiy_hash = Hash2d(ints[0] + 1, ints[1]);
    const int ixjy_hash = Hash2d(ints[0], ints[1] + 1);
    const int jxjy_hash = Hash2d(ints[0] + 1, ints[1] + 1);

    const int iz = ints[2];

    const __m256d iii = _mm256_blend_pd(PERMUTE4x64(xyz_ixyzn, _MM_SHUFFLE(2, 1, 0, 0)), _mm256_set_pd(0, 0, 0, 0.5), 0x1);
    const __m256d jjj = _mm256_blend_pd(PERMUTE4x64(xyz_jxyzn, _MM_SHUFFLE(2, 1, 0, 0)), _mm256_set_pd(0, 0, 0, 0.5), 0x1);

    __m256d sumr = _mm256_setzero_pd();
    __m256d sumr1 = _mm256_setzero_pd();


    mp = &AVX2RTable[Hash1dRTableIndexAVX(ixiy_hash, iz)];
    INCSUMAVX_NOBLEND(sumr, mp, PERMUTE4x64(incrsump_s1, _MM_SHUFFLE(0, 0, 0, 0)), iii);

    mp = &AVX2RTable[Hash1dRTableIndexAVX(jxiy_hash, iz)];
    INCSUMAVX(sumr1, mp, PERMUTE4x64(incrsump_s1, _MM_SHUFFLE(1, 1, 1, 1)), iii, jjj, 2);

    mp = &AVX2RTable[Hash1dRTableIndexAVX(ixjy_hash, iz)];
    INCSUMAVX(sumr, mp, PERMUTE4x64(incrsump_s1, _MM_SHUFFLE(2, 2, 2, 2)), iii, jjj, 4);

    mp = &AVX2RTable[Hash1dRTableIndexAVX(jxjy_hash, iz)];
    INCSUMAVX(sumr1, mp, PERMUTE4x64(incrsump_s1, _MM_SHUFFLE(3, 3, 3, 3)), iii, jjj, 6);

    mp = &AVX2RTable[Hash1dRTableIndexAVX(ixiy_hash, iz + 1)];
    INCSUMAVX(sumr, mp, PERMUTE4x64(incrsump_s2, _MM_SHUFFLE(0, 0, 0, 0)), iii, jjj, 8);

    mp = &AVX2RTable[Hash1dRTableIndexAVX(jxiy_hash, iz + 1)];
    INCSUMAVX(sumr1, mp, PERMUTE4x64(incrsump_s2, _MM_SHUFFLE(1, 1, 1, 1)), iii, jjj, 10);

    mp = &AVX2RTable[Hash1dRTableIndexAVX(ixjy_hash, iz + 1)];
    INCSUMAVX(sumr, mp, PERMUTE4x64(incrsump_s2, _MM_SHUFFLE(2, 2, 2, 2)), iii, jjj, 12);

    mp = &AVX2RTable[Hash1dRTableIndexAVX(jxjy_hash, iz + 1)];
    INCSUMAVX_NOBLEND(sumr1, mp, PERMUTE4x64(incrsump_s2, _MM_SHUFFLE(3, 3, 3, 3)), jjj);

    {
        sumr = _mm256_add_pd(sumr, sumr1);

        __m128d sumr_up = _mm256_extractf128_pd(sumr,1);
        sumr_up = _mm_add_pd(_mm256_castpd256_pd128(sumr),sumr_up);
        sumr_up = _mm_hadd_pd(sumr_up,sumr_up);
        sum = _mm_cvtsd_f64(sumr_up);
    }

    if (noise_generator == kNoiseGen_RangeCorrected)
    {
        /* details of range here:
        Min, max: -1.05242, 0.988997
        Mean: -0.0191481, Median: -0.535493, Std Dev: 0.256828

        We want to change it to as close to [0,1] as possible.
        */
        sum += 1.05242;
        sum *= 0.48985582;  /* ~= 1 / (1.05242 + 0.988997), rescaling the range to [0,1] */
        /*sum *= 0.5;
        sum += 0.5;*/

        if (sum < 0.0)
            sum = 0.0;
        if (sum > 1.0)
            sum = 1.0;
    }
    else
    {
        sum = sum + 0.5;                     /* range at this point -0.5 - 0.5... */

        if (sum < 0.0)
            sum = 0.0;
        if (sum > 1.0)
            sum = 1.0;
    }



#if CHECK_FUNCTIONAL
    {
        DBL orig_sum = PortableNoise(EPoint, noise_generator);
        if (fabs(orig_sum - sum) >= EPSILON)
        {
            throw POV_EXCEPTION_STRING("Noise error");
        }

    }

#endif

    _mm256_zeroupper();
    return (sum);
}
Example #11
inline void rotate_left_wm2(F64vec4 *v0, const F64vec4 v1)
{
    *v0 = _mm256_castpd128_pd256(_mm256_extractf128_pd(*v0, 1));    // low lane  = old high lane of v0
    *v0 = _mm256_insertf128_pd(*v0, _mm256_castpd256_pd128(v1), 1); // high lane = low lane of v1; result {v0[2], v0[3], v1[0], v1[1]}
}
Example #12
/**
  * Calculate all values in one step per pixel. Requires grabbing the neighboring pixels.
  */
FORCE_INLINE double single_pixel(
        double *im, int center, int top, int left, int right, int bottom,
        const __m256i mask1110,
        const __m256d rgb0W,
        const __m256d onehalf,
        const __m256d minustwelvehalf){
//    double r = im[center];
//    double g = im[center+1];
//    double b = im[center+2];

//    double r1 = im[top];
//    double g1 = im[top+1];
//    double b1 = im[top+2];
//    double r2 = im[left];
//    double g2 = im[left+1];
//    double b2 = im[left+2];
//    double r3 = im[right];
//    double g3 = im[right+1];
//    double b3 = im[right+2];
//    double r4 = im[bottom];
//    double g4 = im[bottom+1];
//    double b4 = im[bottom+2];

    __m256d c = _mm256_maskload_pd(&(im[center]),mask1110);
    __m256d c1 = _mm256_loadu_pd(&(im[top]));
    __m256d c2 = _mm256_loadu_pd(&(im[left]));
    __m256d c3 = _mm256_loadu_pd(&(im[right]));
    __m256d c4 = _mm256_loadu_pd(&(im[bottom]));

    COST_INC_LOAD(20);

//    double grey = rw * r + gw * g + bw * b;
//    double grey1 = rw * r1 + gw * g1 + bw * b1;
//    double grey2 = rw * r2 + gw * g2 + bw * b2;
//    double grey3 = rw * r3 + gw * g3 + bw * b3;
//    double grey4 = rw * r4 + gw * g4 + bw * b4;

    __m256d greyc = _mm256_mul_pd(c,rgb0W);
    __m256d grey1 = _mm256_mul_pd(c1,rgb0W);
    __m256d grey2 = _mm256_mul_pd(c2,rgb0W);
    __m256d grey3 = _mm256_mul_pd(c3,rgb0W);
    __m256d grey4 = _mm256_mul_pd(c4,rgb0W);

    //AVX: double: horizontal add for 1 vector
     __m256d c_perm = _mm256_permute2f128_pd(c, c, 0b00100001);//1,2
     __m256d c_h = _mm256_hadd_pd(c,c_perm);
     __m128d c_h_lo = _mm256_extractf128_pd (c_h, 0);// lo
     __m128d c_h_hi = _mm256_extractf128_pd (c_h, 1);// hi
     double c_hsum_lo = _mm_cvtsd_f64(c_h_lo);
     double c_hsum_hi = _mm_cvtsd_f64(c_h_hi);
     double c_hsum = c_hsum_lo + c_hsum_hi;

     //AVX: double: horizontal add for 1 vector
      __m256d greyc_perm = _mm256_permute2f128_pd(greyc, greyc, 0b00100001);//1,2
      __m256d greyc_h = _mm256_hadd_pd(greyc,greyc_perm);
      __m128d greyc_h_lo = _mm256_extractf128_pd (greyc_h, 0);// lo
      __m128d greyc_h_hi = _mm256_extractf128_pd (greyc_h, 1);// hi
      double greyc_hsum_lo = _mm_cvtsd_f64(greyc_h_lo);
      double greyc_hsum_hi = _mm_cvtsd_f64(greyc_h_hi);
      double greyc_hsum = greyc_hsum_lo + greyc_hsum_hi;

    //AVX: _m256d: horizontal add for 4 vectors at once
    __m256d grey12 = _mm256_hadd_pd(grey1,grey2);
    __m256d grey34 = _mm256_hadd_pd(grey3,grey4);
    __m256d grey_1234_blend = _mm256_blend_pd(grey12, grey34, 0b1100); //0011
    __m256d grey_1234_perm = _mm256_permute2f128_pd(grey12, grey34, 0b00100001);//1,2
    __m256d grey_1234 =  _mm256_add_pd(grey_1234_perm, grey_1234_blend);

    //AVX: double: horizontal add for 1 vector
     __m256d grey1234_perm = _mm256_permute2f128_pd(grey_1234, grey_1234, 0b00100001);//1,2
     __m256d grey1234_h = _mm256_hadd_pd(grey_1234,grey1234_perm);
     __m128d grey1234_h_lo = _mm256_extractf128_pd (grey1234_h, 0);// lo
     __m128d grey1234_h_hi = _mm256_extractf128_pd (grey1234_h, 1);// hi
     double grey1234_hsum_lo = _mm_cvtsd_f64(grey1234_h_lo);
     double grey1234_hsum_hi = _mm_cvtsd_f64(grey1234_h_hi);
     double grey1234_sum = grey1234_hsum_lo + grey1234_hsum_hi;

    COST_INC_ADD(10); //+ operations wasted on AVX
    COST_INC_MUL(15); //+ operations wasted on AVX

    double mu = c_hsum / 3.0;
    COST_INC_ADD(2);
    COST_INC_DIV(1);

//    double rmu = r-mu;
//    double gmu = g-mu;
//    double bmu = b-mu;

    __m256d c_mu = _mm256_set1_pd(mu);
    __m256d c_rgbmu = _mm256_sub_pd(c,c_mu);
    COST_INC_ADD(3); //+1 operations wasted on AVX

//    double rz = r-0.5;
//    double gz = g-0.5;
//    double bz = b-0.5;

    __m256d c_rgbz = _mm256_sub_pd(c,onehalf);
    COST_INC_ADD(3); //+1 operations wasted on AVX

//    double rzrz = rz*rz;
//    double gzgz = gz*gz;
//    double bzbz = bz*bz;

    __m256d c_rgbz_sq = _mm256_mul_pd(c_rgbz,c_rgbz);
    COST_INC_MUL(3); //+1 operations wasted on AVX

//    double re = exp(-12.5*rzrz);
//    double ge = exp(-12.5*gzgz);
//    double be = exp(-12.5*bzbz);

    __m256d c_rgbe_tmp = _mm256_mul_pd(minustwelvehalf,c_rgbz_sq);

    __m128 c_rgbe_tmp_ps = _mm256_cvtpd_ps(c_rgbe_tmp);
    __m128 c_rgbe_ps = exp_ps(c_rgbe_tmp_ps);
    __m256d c_rgbe = _mm256_cvtps_pd(c_rgbe_ps);

    COST_INC_EXP(3);
    COST_INC_MUL(3); //+1 operations wasted on AVX

//    double t1 = sqrt((rmu*rmu + gmu*gmu + bmu*bmu)/3.0);
    __m256d c_rgbmu_sq = _mm256_mul_pd(c_rgbmu,c_rgbmu);

    __m128d t1_tmp1_lo = _mm256_extractf128_pd (c_rgbmu_sq, 0);// lo
    __m128d t1_tmp1_hi = _mm256_extractf128_pd (c_rgbmu_sq, 1);// hi
    __m128d t1_tmp1_lo_sum = _mm_hadd_pd (t1_tmp1_lo, t1_tmp1_lo);
    double t1_tmp1_hi_lo = _mm_cvtsd_f64(t1_tmp1_hi);
    double t1_tmp1_lo_sum_lo = _mm_cvtsd_f64(t1_tmp1_lo_sum);

    double t1_tmp1 = t1_tmp1_lo_sum_lo + t1_tmp1_hi_lo;

    double t1_tmp2 = t1_tmp1 / 3.0;
    double t1 = sqrt(t1_tmp2);

    COST_INC_SQRT(1);
    COST_INC_ADD(3);
    COST_INC_MUL(3); //+1 operations wasted on AVX
    COST_INC_DIV(1);
    double t2 = fabs(t1);
    COST_INC_ABS(1);

//    double t3 = re*ge*be;

    __m128d t3_tmp1_lo = _mm256_extractf128_pd (c_rgbe, 0);// lo
    __m128d t3_tmp1_hi = _mm256_extractf128_pd (c_rgbe, 1);// hi

    double t3_tmp1_lo_lo = _mm_cvtsd_f64(t3_tmp1_lo);
    double t3_tmp1_hi_lo = _mm_cvtsd_f64(t3_tmp1_hi);
    __m128d t3_tmp1_lo_swapped = _mm_permute_pd(t3_tmp1_lo, 1);// swap
    double t3_tmp1_lo_hi = _mm_cvtsd_f64(t3_tmp1_lo_swapped);

    double t3 = t3_tmp1_lo_lo * t3_tmp1_lo_hi * t3_tmp1_hi_lo;

    COST_INC_MUL(2);
    double t4 = fabs(t3);
    COST_INC_ABS(1);

    double t5 = t2 * t4;
    COST_INC_MUL(1);

//    double t6 = -4.0*grey+grey1+grey2+grey3+grey4;

    double minusfour_times_grey = -4.0*greyc_hsum;
    double t6 = minusfour_times_grey+grey1234_sum;

    COST_INC_MUL(1);
    COST_INC_ADD(2); //2 operations saved due to AVX

    double t7 = fabs(t6);
    COST_INC_ABS(1);

    double t8 = t5 * t7;
    COST_INC_MUL(1);

    double t9 = t8 + 1.0E-12;
    COST_INC_ADD(1);

    return t9;
}
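
The "horizontal add for 1 vector" pattern above is repeated three times; factored out as a self-contained helper (a sketch, not part of the original), it reads:

#include <immintrin.h>

/* Sum of all four doubles in v, using the permute2f128 + hadd pattern above. */
static inline double hsum256_pd(__m256d v)
{
    __m256d perm = _mm256_permute2f128_pd(v, v, 0x21); /* swap 128-bit halves */
    __m256d h    = _mm256_hadd_pd(v, perm);            /* pairwise sums       */
    return _mm_cvtsd_f64(_mm256_castpd256_pd128(h))    /* v0 + v1             */
         + _mm_cvtsd_f64(_mm256_extractf128_pd(h, 1)); /* v2 + v3             */
}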
Example #13
// it moves vertically across blocks
void kernel_dtrmv_u_t_1_lib4(int kmax, double *A, int sda, double *x, double *y, int alg)
	{

/*	if(kmax<=0) */
/*		return;*/
	
	const int lda = 4;
	
	double *tA, *tx;

	int k;
	
	__m256d
		tmp0,
		a_00_10_20_30,
		x_0_1_2_3,
		y_00;
	
	
	y_00 = _mm256_setzero_pd();

	k=0;
	for(; k<kmax-3; k+=4)
		{
		
		x_0_1_2_3 = _mm256_loadu_pd( &x[0] );

		a_00_10_20_30 = _mm256_load_pd( &A[0+lda*0] );
		
		tmp0 = _mm256_mul_pd( a_00_10_20_30, x_0_1_2_3 );
		y_00 = _mm256_add_pd( y_00, tmp0 );
		
		A += 4 + (sda-1)*lda;
		x += 4;

		}

	__m128d
		tm0,
		a_00_10, a_01_11,
		x_0_1,
		y_0, y_1, y_0_1;
	
	tm0 = _mm256_extractf128_pd( y_00, 0x1 );
	y_0 = _mm256_castpd256_pd128( y_00 );
	y_0 = _mm_add_pd( y_0, tm0 );

	if(k<kmax-1)
		{
		
		x_0_1 = _mm_loadu_pd( &x[0] );

		a_00_10 = _mm_load_pd( &A[0+lda*0] );
		
		tm0 = _mm_mul_pd( a_00_10, x_0_1 );
		y_0 = _mm_add_pd( y_0, tm0 );
		
		A += 2;
		x += 2;

		}
	
	x_0_1 = _mm_load_sd( &x[0] );
	a_00_10 = _mm_load_sd( &A[0+lda*0] );
	tm0 = _mm_mul_sd( a_00_10, x_0_1 );
	y_0 = _mm_add_sd( y_0, tm0 );

	y_0 = _mm_hadd_pd( y_0, y_0 );


	if(alg==0)
		{
		_mm_store_sd(&y[0], y_0);
		}
	else if(alg==1)
		{
		y_0_1 = _mm_load_sd( &y[0] );

		y_0_1 = _mm_add_sd( y_0_1, y_0 );

		_mm_store_sd(&y[0], y_0_1);
		}
	else // alg==-1
		{
		y_0_1 = _mm_load_sd( &y[0] );
	
		y_0_1 = _mm_sub_sd( y_0_1, y_0 );
	
		_mm_store_sd(&y[0], y_0_1);
		}

	}
Example #14
void kernel_dgemv_t_1_lib4(int kmax, double *A, int sda, double *x, double *y, int alg)
	{
	if(kmax<=0) 
		return;
	
	const int lda = 4;
	
	double *tA, *tx;

	int k;
	int ka = kmax; // number of elements from the aligned position
	
	__m256d
		aaxx_temp,
		a_00_10_20_30,
		x_0_1_2_3,
		y_00;
	
	__m128d
		ax_temp,
		a_00_10,
		x_0_1,
		y_0, y_1, y_0_1;
	
	y_00 = _mm256_setzero_pd();
	
	y_0 = _mm256_castpd256_pd128(y_00);

	k = lda*(ka/lda);
	tA = A + (ka/lda)*sda*lda;
	tx = x + (ka/lda)*lda;

	for(; k<ka; k++)
		{
		x_0_1 = _mm_load_sd( &tx[0] );

		a_00_10 = _mm_load_sd( &tA[0+lda*0] );
		
/*		y_0 += a_00_10 * x_0_1;*/
		ax_temp = _mm_mul_sd( a_00_10, x_0_1 );	
		y_0 = _mm_add_sd (y_0, ax_temp );
		
		tA += 1;
		tx += 1;

		}

	y_00 = _mm256_castpd128_pd256(y_0);

	k=0;
	for(; k<ka-3; k+=4)
		{
		
		x_0_1_2_3 = _mm256_loadu_pd( &x[0] );

		a_00_10_20_30 = _mm256_load_pd( &A[0+lda*0] );
		
/*		y_00 += a_00_10_20_30 * x_0_1_2_3;*/
		aaxx_temp = _mm256_mul_pd( a_00_10_20_30, x_0_1_2_3 );
		y_00 = _mm256_add_pd( y_00, aaxx_temp );
		
		A += 4 + (sda-1)*lda;
		x += 4;

		}

	y_00 = _mm256_hadd_pd(y_00, y_00);

	y_1 = _mm256_extractf128_pd(y_00, 1);
	y_0 = _mm256_castpd256_pd128(y_00);

/*	y_0 += y_1;*/
	y_0 = _mm_add_sd( y_0, y_1 );

	if(alg==0)
		{
		_mm_store_sd(&y[0], y_0);
		}
	else if(alg==1)
		{
		y_0_1 = _mm_load_sd( &y[0] );

/*		y_0_1 += y_0;*/
		y_0_1 = _mm_add_sd( y_0_1, y_0 );

		_mm_store_sd(&y[0], y_0_1);
		}
	else // alg==-1
		{
		y_0_1 = _mm_load_sd( &y[0] );
	
/*		y_0_1 -= y_0;*/
		y_0_1 = _mm_sub_sd( y_0_1, y_0 );
	
		_mm_store_sd(&y[0], y_0_1);
		}

	}
Example #15
__m128d test_mm256_extractf128_pd_1(__m256d a) {
  // CHECK-LABEL: @test_mm256_extractf128_pd_1
  // CHECK: shufflevector{{.*}}<i32 2, i32 3>
  return _mm256_extractf128_pd(a, 1);
}
Example #16
__m128d test_mm256_extractf128_pd_0(__m256d a) {
  // CHECK-LABEL: @test_mm256_extractf128_pd_0
  // CHECK: shufflevector{{.*}}<i32 0, i32 1>
  return _mm256_extractf128_pd(a, 0);
}
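
The CHECK patterns encode the lane-to-element mapping: index 1 selects elements 2 and 3, index 0 selects elements 0 and 1. A scalar model of the same mapping (a sketch, not part of the clang test file):

#include <immintrin.h>

/* Scalar model of the shufflevector the tests check for. */
static __m128d extract_lane_model(__m256d a, int lane)
{
    double tmp[4];
    _mm256_storeu_pd(tmp, a);
    return lane ? _mm_setr_pd(tmp[2], tmp[3])   /* <i32 2, i32 3> */
                : _mm_setr_pd(tmp[0], tmp[1]);  /* <i32 0, i32 1> */
}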