Example #1
0
/******************************************************************
*
* SPLIT RADIX PRECOMPUTED AND VECTORIZED FFT MULTIPLICATION
*
******************************************************************/
void sr_vector_mul(ring_t *r, const ring_t *x, const ring_t *y){
  // printf("\n\n**************split-radix FAST**************\n");

  fft_vector_forward(&vctr_x,x);
  fft_vector_forward(&vctr_y,y);
  
  __m256d real_x,imag_x,real_y,imag_y,imag_temp,real_temp;
  // double a,b,c,d;
  for (int i = 0; i < CPLXDIM; i+=4)
  {
    real_x = _mm256_load_pd(vctr_x.real+i);
    imag_x = _mm256_load_pd(vctr_x.imag+i);
    real_y = _mm256_load_pd(vctr_y.real+i);
    imag_y = _mm256_load_pd(vctr_y.imag+i);

    //(a + ib) * (c + id) = (ac - bd) + i(ad+bc)
    //real_temp = bd
    real_temp = _mm256_mul_pd(imag_x,imag_y);
    //imag_temp = ad
    imag_temp = _mm256_mul_pd(real_x,imag_y);
     
    real_x = _mm256_fmsub_pd(real_x,real_y,real_temp);
    imag_x = _mm256_fmadd_pd(imag_x,real_y,imag_temp);

    real_y = _mm256_set1_pd(CPLXDIM);
    real_x = _mm256_div_pd(real_x,real_y);
    imag_x = _mm256_div_pd(imag_x,real_y);

    _mm256_store_pd(vctr_res.real+i,real_x);
    _mm256_store_pd(vctr_res.imag+i,imag_x);
  }
  fft_vector_backward(&vctr_res,r);
}
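For reference, here is a minimal scalar sketch of what the vector loop above computes, assuming the same split real/imag layout as the vctr_* buffers; the helper name is hypothetical and not part of the original code.

/* Hedged sketch: scalar point-wise complex product with 1/CPLXDIM scaling,
 * equivalent to the AVX loop in sr_vector_mul. */
static void cplx_pointwise_mul_scalar(double *res_re, double *res_im,
                                      const double *x_re, const double *x_im,
                                      const double *y_re, const double *y_im)
{
    for (int i = 0; i < CPLXDIM; ++i) {
        double a = x_re[i], b = x_im[i];
        double c = y_re[i], d = y_im[i];
        /* (a + ib) * (c + id) = (ac - bd) + i(ad + bc), scaled by 1/CPLXDIM */
        res_re[i] = (a * c - b * d) / CPLXDIM;
        res_im[i] = (a * d + b * c) / CPLXDIM;
    }
}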
Example #2
0
    /*!
     * \copydoc avx_vec::fmadd
     */
    ETL_STATIC_INLINE(avx_simd_double) fmadd(avx_simd_double a, avx_simd_double b, avx_simd_double c) {
#ifdef __FMA__
        return _mm256_fmadd_pd(a.value, b.value, c.value);
#else
        return add(mul(a, b), c);
#endif
    }
Example #3
0
/******************************************************************
*
* NEGACYCLIC FFT LOOK UP TABLE
*
******************************************************************/
void negacyc_mul(ring_t *r, const ring_t *x, const ring_t *y)
{
  phi_forward(&vector_x,x);
  phi_forward(&vector_y,y);

  __m256d real_x,imag_x,real_y,imag_y,imag_temp,real_temp,dim;
  dim = _mm256_set1_pd(CPLXDIM);
  // double a,b,c,d;
  for (int i = 0; i < CPLXDIM; i+=4)
  {
    real_x = _mm256_load_pd(vector_x.real+i);
    imag_x = _mm256_load_pd(vector_x.imag+i);
    real_y = _mm256_load_pd(vector_y.real+i);
    imag_y = _mm256_load_pd(vector_y.imag+i);

    //(a + ib) * (c + id) = (ac - bd) + i(ad+bc)
    //real_temp = bd
    real_temp = _mm256_mul_pd(imag_x,imag_y);
    //imag_temp = ad
    imag_temp = _mm256_mul_pd(real_x,imag_y);
 
    real_x = _mm256_fmsub_pd(real_x,real_y,real_temp);
    imag_x = _mm256_fmadd_pd(imag_x,real_y,imag_temp);

    
    real_x = _mm256_div_pd(real_x,dim);
    imag_x = _mm256_div_pd(imag_x,dim);

    _mm256_store_pd(vector_res.real+i,real_x);
    _mm256_store_pd(vector_res.imag+i,imag_x);
  }
  phi_backward(&vector_res,r);
  // print_cplx(&vec_res,CPLXDIM);
}
Example #4
0
    inline vector4d fma(const vector4d& x, const vector4d& y, const vector4d& z)
    {
#ifdef __FMA__
        return _mm256_fmadd_pd(x, y, z);
#else
        return x * y + z;
#endif
    }
Example #5
0
void zdotu_soa(
                const int    N,
                const double* da,
                const double* db,
                const int    ix,
                const double* dc,
                const double* dd,
                const int    iy,
                double*  res
                )
{
        __m256d ymm0;
        __m256d ymm1;
        __m256d ymm2;
        __m256d ymm3;
        __m256d ymm4 = _mm256_setzero_pd();
        __m256d ymm5 = _mm256_setzero_pd();
        //
        int ii;
//#pragma unroll
        for(ii = 0; ii < N/4; ii++)
        {
		_mm_prefetch((const char*)(da + 4*ii) + 0x200, _MM_HINT_T2);
		_mm_prefetch((const char*)(db + 4*ii) + 0x200, _MM_HINT_T2);
		_mm_prefetch((const char*)(dc + 4*ii) + 0x200, _MM_HINT_T2);
		_mm_prefetch((const char*)(dd + 4*ii) + 0x200, _MM_HINT_T2);
                //IACA_START;
                // 8*4*4 = 128 bytes
                ymm0 = _mm256_loadu_pd(da + 4*ii);
                ymm1 = _mm256_loadu_pd(db + 4*ii);
                ymm2 = _mm256_loadu_pd(dc + 4*ii);
                ymm3 = _mm256_loadu_pd(dd + 4*ii);
                // 2*4*4 = 32 flops
                ymm4 = _mm256_fmsub_pd(ymm0, ymm2, _mm256_fmsub_pd(ymm1, ymm3, ymm4));
                ymm5 = _mm256_fmadd_pd(ymm0, ymm3, _mm256_fmadd_pd(ymm1, ymm2, ymm5));
		// flops/byte ratio = 1/4
                //IACA_END
        }
        double* re = (double*)&ymm4;
        double* im = (double*)&ymm5;
	//
        res[0] = re[0] + re[1] + re[2] + re[3];
        res[1] = im[0] + im[1] + im[2] + im[3];
}
Example #6
0
void zdotu_aos(
                const int    N,
                const double* dx,
                const int    ix,
                const double* dy,
                const int    iy,
		double*  res
                )
{
	__m256d ymm0;
	__m256d ymm1;
	__m256d ymm2;
	__m256d ymm3;
	__m256d ymm4 = _mm256_setzero_pd();
	__m256d ymm5 = _mm256_setzero_pd();
	//
	int ii = 0;
	//for(ii = 0; ii < N/2; ii++)
	do
	{
		//IACA_START;
		ymm0 = _mm256_loadu_pd(dx + 4*ii);	
		ymm1 = _mm256_loadu_pd(dy + 4*ii);	
		//
		ymm4 = _mm256_fmadd_pd(ymm1, ymm0, ymm4);
		ymm2 = _mm256_permute_pd(ymm1, 0x5);
		ymm5 = _mm256_fmadd_pd(ymm2, ymm0, ymm5);
		ii++;
		//
	} while (ii < N/2);
	//IACA_END
	double* re = (double*)&ymm4;
	double* im = (double*)&ymm5;
	res[0] = re[0] - re[1] + re[2] - re[3];
	res[1] = im[0] + im[1] + im[2] + im[3];
}
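Both zdotu kernels read the accumulators back by casting the address of a __m256d to double*. A store-based reduction avoids relying on that type punning; the helper below is a sketch, not part of the original source.

/* Hedged sketch: horizontal sum of a __m256d via an explicit aligned store. */
static inline double hsum256_pd(__m256d v)
{
    double tmp[4] __attribute__((aligned(32)));
    _mm256_store_pd(tmp, v);
    return tmp[0] + tmp[1] + tmp[2] + tmp[3];
}

With it, res[1] = hsum256_pd(ymm5) in either kernel; the alternating-sign reduction for res[0] in the AoS version would still need the per-lane signs applied before summing.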
Example #7
0
void calculate_fma_double (unsigned char * out, double X0, double Y0, double scale, unsigned YSTART, unsigned SX, unsigned SY)
{
    __m256d dd = _mm256_set1_pd (scale);
    __m256d XX0 = _mm256_set1_pd (X0);

    for (unsigned j = YSTART; j < SY; j++)	{
        __m256d y0 = _mm256_set1_pd (j*scale + Y0);
        for (unsigned i = 0; i < SX; i += 4)	{

            __m128i ind = _mm_setr_epi32 (i, i + 1, i + 2, i + 3);
            __m256d x0 = _mm256_fmadd_pd (dd, _mm256_cvtepi32_pd (ind), XX0);
            __m256d x = x0;
            __m256d y = y0;
            __m256i counts = _mm256_setzero_si256 ();
            __m256i cmp_mask = _mm256_set1_epi32 (0xFFFFFFFFu);

            for (unsigned n = 0; n < 255; n++)	{
                __m256d x2 = _mm256_mul_pd (x, x);
                __m256d y2 = _mm256_mul_pd (y, y);
                __m256d abs = _mm256_add_pd (x2, y2);
                __m256i cmp = _mm256_castpd_si256 (_mm256_cmp_pd (abs, _mm256_set1_pd (4), _CMP_LT_OS));
                cmp_mask = _mm256_and_si256 (cmp_mask, cmp);
                if (_mm256_testz_si256 (cmp_mask, cmp_mask)) {
                    break;
                }
                counts = _mm256_sub_epi64 (counts, cmp_mask);
                __m256d t = _mm256_add_pd (x, x);
                y = _mm256_fmadd_pd (t, y, y0);
                x = _mm256_add_pd (_mm256_sub_pd (x2, y2), x0);
            }
            __m256i result = _mm256_shuffle_epi8 (counts, _mm256_setr_epi8 (0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8));
            *(uint32_t*) out = _mm_extract_epi16 (_mm256_extracti128_si256 (result, 0), 0) | (_mm_extract_epi16 (_mm256_extracti128_si256 (result, 1), 0) << 16);
            out += 4;
        }
    }
}
Example #8
0
void gvrotg_fma(double *c, double *s, double *r, double a, double b)
{
#if defined(__FMA__)
    register __m256d x0, x1, t0, t2, u0, u1, one, b0, b1;
    if (b == 0.0) {
        *c = 1.0;
        *s = 0.0;
        *r = a;
        return;
    }
    if (a == 0.0) {
        *c = 0.0;
        *s = 1.0;
        *r = b;
        return;
    }

    // set_pd() order: [3, 2, 1, 0]
    // x[0], x[1]: |a| > |b|,  x[2],x[3]: |b| > |a|

    one = _mm256_set1_pd(1.0);
    x0  = _mm256_set_pd(1.0, a, b, 1.0);   // x0 = {1, a,   b,   1}
    x1  = _mm256_set_pd(1.0, b, a, 1.0);   // x1 = {1, b,   a,   1}
    t0  = _mm256_div_pd(x0, x1);           // t0 = {1, a/b, b/a, 1}
    t2  = _mm256_fmadd_pd(t0, t0, one);    // t2 = {1, 1+(a/b)^2, (b/a)^2+1, 1}
    u0  = _mm256_sqrt_pd(t2);              // u0 = {1, sqrt(1+(a/b)^2), sqrt((b/a)^2+1), 1}
    u1  = _mm256_div_pd(one, u0);
    b0  = _mm256_blend_pd(u0, u1, 0x9);    // b0 = {1/u(a),   u(a),   u(b), 1/u(b)} 
    b0  = _mm256_mul_pd(b0, x1);           // b0 = {1/u(a), b*u(a), a*u(b), 1/u(b)} 
    b1  = _mm256_mul_pd(t0, u1);           // b1 = {1/u(a), t*u(a), t*u(b), 1/u(b)} 

    if (fabs(b) > fabs(a)) {
      *s = b0[3];
      *r = b0[2];
      *c = b1[2];
      if (signbit(b)) {
          *s = -(*s);
          *c = -(*c);
          *r = -(*r);
      }
    } else {
      *c = b0[0];
      *r = b0[1];
      *s = b1[1];
    }
#endif
}
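The body above is compiled only when __FMA__ is defined, so on a non-FMA build the outputs are never written. A possible scalar fallback for an #else branch is sketched below; it mirrors the |a| versus |b| split and the signbit handling of the vector path, and its sign convention is an assumption that should be checked against the caller.

#include <math.h>

/* Hedged sketch: scalar Givens rotation fallback (assumed convention:
 * same sign flip for negative b as the FMA path above). */
static void gvrotg_scalar(double *c, double *s, double *r, double a, double b)
{
    if (b == 0.0) { *c = 1.0; *s = 0.0; *r = a; return; }
    if (a == 0.0) { *c = 0.0; *s = 1.0; *r = b; return; }
    if (fabs(b) > fabs(a)) {
        double t = a / b;
        double u = sqrt(1.0 + t * t);
        *s = 1.0 / u;
        *c = t * (*s);
        *r = b * u;
        if (signbit(b)) { *s = -(*s); *c = -(*c); *r = -(*r); }
    } else {
        double t = b / a;
        double u = sqrt(1.0 + t * t);
        *c = 1.0 / u;
        *s = t * (*c);
        *r = a * u;
    }
}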
Example #9
0
void SimpleClean::PartialSubtractImageAVX(double *image, size_t imgWidth, size_t imgHeight, const double *psf, size_t psfWidth, size_t psfHeight, size_t x, size_t y, double factor, size_t startY, size_t endY)
{
	size_t startX, endX;
	int offsetX = (int) x - (int) (psfWidth/2), offsetY = (int) y - (int) (psfHeight/2);
	
	if(offsetX > 0)
		startX = offsetX;
	else
		startX = 0;
	
	if(offsetY > (int) startY)
		startY = offsetY;
	
	endX = std::min(x + psfWidth/2, imgWidth);
	
	size_t unAlignedCount = (endX - startX) % 4;
	endX -= unAlignedCount;
	
	endY = std::min(y + psfHeight/2, endY);
	
	const __m256d mFactor = _mm256_set1_pd(-factor);
	for(size_t ypos = startY; ypos < endY; ++ypos)
	{
		double *imageIter = image + ypos * imgWidth + startX;
		const double *psfIter = psf + (ypos - offsetY) * psfWidth + startX - offsetX;
		for(size_t xpos = startX; xpos != endX; xpos+=4)
		{
			__m256d
				imgVal = _mm256_loadu_pd(imageIter),
				psfVal = _mm256_loadu_pd(psfIter);
#ifdef __FMA__
			_mm256_storeu_pd(imageIter, _mm256_fmadd_pd(psfVal, mFactor, imgVal));
#else
			_mm256_storeu_pd(imageIter, _mm256_add_pd(imgVal, _mm256_mul_pd(psfVal, mFactor)));
#endif
			imageIter+=4;
			psfIter+=4;
		}
		for(size_t xpos = endX; xpos!=endX + unAlignedCount; ++xpos)
		{
			*imageIter -= *psfIter * factor;
			++imageIter;
			++psfIter;
		}
	}
}
Example #10
0
void
check_mm256_fmadd_pd (__m256d __A, __m256d __B, __m256d __C)
{
  union256d a, b, c, e;
  a.x = __A;
  b.x = __B;
  c.x = __C;
  double d[4];
  int i;
  e.x = _mm256_fmadd_pd (__A, __B, __C);
  for (i = 0; i < 4; i++)
    {
      d[i] = a.a[i] * b.a[i] + c.a[i];
    }
  if (check_union256d (e, d))
    abort ();
}
Example #11
0
    div(avx_simd_complex_double<T> lhs, avx_simd_complex_double<T> rhs) {
        //lhs = [x1.real, x1.img, x2.real, x2.img]
        //rhs = [y1.real, y1.img, y2.real, y2.img]

        //ymm0 = [y1.real, y1.real, y2.real, y2.real]
        __m256d ymm0 = _mm256_movedup_pd(rhs.value);

        //ymm1 = [y1.imag, y1.imag, y2.imag, y2.imag]
        __m256d ymm1 = _mm256_permute_pd(rhs.value, 0b1111);

        //ymm2 = [x1.img, x1.real, x2.img, x2.real]
        __m256d ymm2 = _mm256_permute_pd(lhs.value, 0b0101);

        //ymm4 = [x.img * y.img, x.real * y.img]
        __m256d ymm4 = _mm256_mul_pd(ymm2, ymm1);

        //ymm5 = subadd((lhs * ymm0), ymm4)

#ifdef __FMA__
        __m256d ymm5 = _mm256_fmsubadd_pd(lhs.value, ymm0, ymm4);
#else
        __m256d t1   = _mm256_mul_pd(lhs.value, ymm0);
        __m256d t2   = _mm256_sub_pd(_mm256_set1_pd(0.0), ymm4);
        __m256d ymm5 = _mm256_addsub_pd(t1, t2);
#endif

        //ymm3 = [y.imag^2, y.imag^2]
        __m256d ymm3 = _mm256_mul_pd(ymm1, ymm1);

        //ymm0 = (ymm0 * ymm0 + ymm3)

#ifdef __FMA__
        ymm0 = _mm256_fmadd_pd(ymm0, ymm0, ymm3);
#else
        __m256d t3   = _mm256_mul_pd(ymm0, ymm0);
        ymm0         = _mm256_add_pd(t3, ymm3);
#endif

        //result = ymm5 / ymm0
        return _mm256_div_pd(ymm5, ymm0);
    }
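As a cross-check, the scalar formula the lane arithmetic above implements is (a+bi)/(c+di) = ((ac+bd) + (bc-ad)i)/(c^2+d^2); a small sketch, independent of the etl wrapper types:

/* Hedged sketch: scalar complex division matching the vector lanes above. */
static void cdiv_scalar(double a, double b, double c, double d,
                        double *re, double *im)
{
    double denom = c * c + d * d;      /* ymm0*ymm0 + ymm3        */
    *re = (a * c + b * d) / denom;     /* fmsubadd, even lanes    */
    *im = (b * c - a * d) / denom;     /* fmsubadd, odd lanes     */
}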
Example #12
0
/******************************************************************
*
* SPLIT RADIX PRECOMPUTED AND VECTORIZED NON RECURSIVE FFT MULTIPLICATION
*
******************************************************************/
void sr_vector_nonrec_mul(ring_t *r, const ring_t *x, const ring_t *y){
  fft_vector_nonrec_forward(&vec_x,x);
  fft_vector_nonrec_forward(&vec_y,y);
  __m256d real_x,imag_x,real_y,imag_y,imag_temp,real_temp;
  // double a,b,c,d;
  for (int i = 0; i < CPLXDIM; i+=4)
  {
    real_x = _mm256_load_pd(vec_x.real+i);
    imag_x = _mm256_load_pd(vec_x.imag+i);
    real_y = _mm256_load_pd(vec_y.real+i);
    imag_y = _mm256_load_pd(vec_y.imag+i);

    //(a + ib) * (c + id) = (ac - bd) + i(ad+bc)
    //real_temp = bd
    real_temp = _mm256_mul_pd(imag_x,imag_y);
    //imag_temp = ad
    imag_temp = _mm256_mul_pd(real_x,imag_y);
    //REPLACES THE COMMENTED-OUT SECTION BELOW
    //real_x = ac
    // real_x = _mm256_mul_pd(real_x,real_y);
    // //imag_x = bc
    // imag_x = _mm256_mul_pd(imag_x,real_y);
    // //real_x = ac - bd => real_x - real_temp
    // real_x = _mm256_sub_pd(real_x,real_temp);
    // //imag_x = ad + bc => imag_temp + imag_x
    // imag_x = _mm256_add_pd(imag_x,imag_temp);
    //THESE ARE NOT WORKING 
    real_x = _mm256_fmsub_pd(real_x,real_y,real_temp);
    imag_x = _mm256_fmadd_pd(imag_x,real_y,imag_temp);

    real_y = _mm256_set1_pd(CPLXDIM);
    real_x = _mm256_div_pd(real_x,real_y);
    imag_x = _mm256_div_pd(imag_x,real_y);

    _mm256_store_pd(vec_res.real+i,real_x);
    _mm256_store_pd(vec_res.imag+i,imag_x);

  }
  fft_vector_nonrec_backward(&vec_res,r);
}
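The fmsub/fmadd pair above computes the same ac - bd and ad + bc as the commented-out mul/sub/add sequence, differing only in the fused rounding. If the FMA intrinsics misbehave on a particular build, a guard in the style of Example #2 keeps a working fallback; the snippet below is a sketch of how those two lines could be wrapped.

/* Hedged sketch: point-wise product with an explicit FMA guard. */
#ifdef __FMA__
    real_x = _mm256_fmsub_pd(real_x, real_y, real_temp);
    imag_x = _mm256_fmadd_pd(imag_x, real_y, imag_temp);
#else
    real_x = _mm256_sub_pd(_mm256_mul_pd(real_x, real_y), real_temp);
    imag_x = _mm256_add_pd(_mm256_mul_pd(imag_x, real_y), imag_temp);
#endif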
Example #13
0
double compute_pi_leibniz_fma(size_t n)
{
	double pi = 0.0;
	register __m256d ymm0, ymm1, ymm2, ymm3, ymm4;

	ymm0 = _mm256_setzero_pd();
	ymm1 = _mm256_set1_pd(2.0);
	ymm2 = _mm256_set1_pd(1.0);
	ymm3 = _mm256_set_pd(1.0, -1.0, 1.0, -1.0);
	
	for (size_t i = 0; i + 4 <= n; i += 4) {
		ymm4 = _mm256_set_pd(i, i + 1.0, i + 2.0, i + 3.0);
		ymm4 = _mm256_fmadd_pd(ymm1, ymm4, ymm2);
		ymm4 = _mm256_div_pd(ymm3, ymm4);
		ymm0 = _mm256_add_pd(ymm0, ymm4);
	}
	double tmp[4] __attribute__((aligned(32)));
	_mm256_store_pd(tmp, ymm0);
	pi += tmp[0] + tmp[1] + tmp[2] + tmp[3];

	return pi * 4.0;
}
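A minimal usage sketch (the term count is arbitrary; the Leibniz series converges slowly, so n mainly trades time for accuracy):

#include <stdio.h>

int main(void)
{
	/* Hedged example: sum 10^8 terms of the Leibniz series. */
	double pi = compute_pi_leibniz_fma(100000000);
	printf("pi ~= %.12f\n", pi);
	return 0;
}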
Example #14
0
int main(void)
{
   // std::cout<<std::endl<<" Compute inner product..."<<std::endl<<std::endl;

    // INIT VECTOR
    //double vec1    [_PBM_SIZE] __attribute__((aligned(_CBSIM_DBL_ALIGN_)));//__declspec(align(n))
    //double vec2    [_PBM_SIZE] __attribute__((aligned(_CBSIM_DBL_ALIGN_)));
     //__declspec(align(_CBSIM_DBL_ALIGN_)) double vec1    [_PBM_SIZE];
     //__declspec(align(_CBSIM_DBL_ALIGN_)) double vec2    [_PBM_SIZE];

    //double *vec1 = _aligned_malloc(_PBM_SIZE*sizeof *vec1,_CBSIM_DBL_ALIGN_);
    //double *vec2 = _aligned_malloc(_PBM_SIZE*sizeof *vec2,_CBSIM_DBL_ALIGN_);


	double *vec1 =(double *)_mm_malloc(sizeof(double)*_PBM_SIZE,32);
    double *vec2 =(double *)_mm_malloc(sizeof(double)*_PBM_SIZE,32);

    double result = 0.0;
//    tbb::tick_count t1, t2;
    int loopsToDo = 10000;
    for (int i=0 ; i < _PBM_SIZE ; i++)
    {
        vec1[i] = static_cast<double>(i)*0.01;
        vec2[i] = static_cast<double>(i)*0.01;
    }


// SERIAL ***********************************************************************************
//    t1 = tbb::tick_count::now();


    for (int z=0 ; z < loopsToDo ; z++)
    {
    //__m256d ymm0;
    //__m256d ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7;//, ymm8, ymm9, ymm10, ymm11, ymm12, ymm13, ymm14, ymm15, ymm16, ymm17, ymm18;
    //ymm0 = _mm256_setzero_pd(); // accumulator
    //double res0 = 0.0, res1 = 0.0, res2 = 0.0, res3 = 0.0;
    //__m256d acc = _mm256_setzero_pd();
    //double res[4] __attribute__((aligned(_CBSIM_DBL_ALIGN_))) = {0.0, 0.0, 0.0, 0.0};
        result = 0.0;
    //double res[2] __attribute__((aligned(_CBSIM_DBL_ALIGN_))) = {0.0, 0.0};
    for (int i=0 ; i < _PBM_SIZE; i+=8)
    {
/*        __m256d ymm1 = _mm256_load_pd(&vec1[i]);
        __m256d ymm2 = _mm256_load_pd(&vec2[i]);
        __m256d ymm3 = _mm256_mul_pd( ymm1, ymm2 );

        __m128d xmm1 = _mm256_extractf128_pd(ymm3,0);
        __m128d xmm2 = _mm256_extractf128_pd(ymm3,1);
        __m128d xmm3 = _mm_hadd_pd(xmm1,xmm2);

        _mm_store_pd(&res[0],xmm3);

        //_mm256_store_pd(&res[0],ymm12);
        result += (res[0] + res[1]);// + (res[2] + res[3]);
*/
        __assume_aligned(&vec1[0],32);
        __assume_aligned(&vec2[0],32);
       __m256d ymm1 = _mm256_load_pd(&vec1[i]);
        __m256d ymm2 = _mm256_load_pd(&vec2[i]);
        __m256d ymm3 = _mm256_mul_pd( ymm1, ymm2 );

        __m256d ymm4 = _mm256_load_pd(&vec1[i+4]);
        __m256d ymm5 = _mm256_load_pd(&vec2[i+4]);
        __m256d ymm6 = _mm256_mul_pd( ymm4, ymm5 );

        __m256d ymm7 = _mm256_add_pd( ymm3, ymm6);
        
        __m128d xmm1 = _mm256_extractf128_pd(ymm7,0);
        __m128d xmm2 = _mm256_extractf128_pd(ymm7,1);
        __m128d xmm3 = _mm_hadd_pd(xmm1,xmm2);
        double res[2] __attribute__((aligned(_CBSIM_DBL_ALIGN_))) = {0.0, 0.0};
        _mm_store_pd(&res[0],xmm3);

        //_mm256_store_pd(&res[0],ymm12);
        result += (res[0] + res[1]);// + (res[2] + res[3]);

        //__m256d ymm0 = _mm256_add_pd( ymm0, ymm7);

/*        //__assume_aligned(&vec1[0],32);
        //__assume_aligned(&vec2[0],32);
        __m256d ymm1 = _mm256_load_pd(&vec1[i]);
        __m256d ymm2 = _mm256_load_pd(&vec2[i]);
        __m256d ymm3 = _mm256_mul_pd( ymm1, ymm2 );

        __m256d ymm4 = _mm256_load_pd(&vec1[i+4]);
        __m256d ymm5 = _mm256_load_pd(&vec2[i+4]);
        //__m256d ymm6 = _mm256_mul_pd( ymm4, ymm5 );

        //__m256d ymm7 = _mm256_add_pd( ymm3, ymm6);
        __m256d ymm6 = _mm256_fmadd_pd (ymm4,ymm5,ymm3);
        //ymm0 = _mm256_add_pd( ymm0, ymm7);


        __m128d xmm1 = _mm256_extractf128_pd(ymm6,0);
        __m128d xmm2 = _mm256_extractf128_pd(ymm6,1);;
        __m128d xmm3 = _mm_hadd_pd(xmm1,xmm2);

        _mm_store_pd(&res[0],xmm3);

        //_mm256_store_pd(&res[0],ymm12);
        result += (res[0] + res[1]);// + (res[2] + res[3]);

        //_mm256_store_pd(&res[0],ymm6);
        //result_SIMD_INTRINSICS += (res[0] + res[1]) + (res[2] + res[3]);
*/

//#define _VER_AVX
#ifdef _VER_AVX
        __m256d ymm1 = _mm256_load_pd(&vec1[i]);
        __m256d ymm2 = _mm256_load_pd(&vec2[i]);
        __m256d ymm3 = _mm256_mul_pd( ymm1, ymm2 );

        __m256d ymm4 = _mm256_load_pd(&vec1[i+4]);
        __m256d ymm5 = _mm256_load_pd(&vec2[i+4]);
        __m256d ymm6 = _mm256_mul_pd( ymm4, ymm5 );

        __m256d ymm7 = _mm256_load_pd(&vec1[i+8]);
        __m256d ymm8 = _mm256_load_pd(&vec2[i+8]);
        __m256d ymm9 = _mm256_mul_pd( ymm7, ymm8 );

        __m256d ymm10 = _mm256_load_pd(&vec1[i+12]);
        __m256d ymm11 = _mm256_load_pd(&vec2[i+12]);
        __m256d ymm12 = _mm256_mul_pd( ymm10, ymm11 );

        __m256d ymm13 = _mm256_add_pd( ymm3, ymm6);
        __m256d ymm14 = _mm256_add_pd( ymm9, ymm12);
        __m256d ymm15 = _mm256_add_pd( ymm13, ymm14);

        __m128d xmm1 = _mm256_extractf128_pd(ymm15,0);
        __m128d xmm2 = _mm256_extractf128_pd(ymm15,1);
        __m128d xmm3 = _mm_hadd_pd(xmm1,xmm2);
        double res_SIMD_INTRINSICS[2] __attribute__((aligned(_CBSIM_DBL_ALIGN_))) = {0.0, 0.0};
        _mm_store_pd(&res_SIMD_INTRINSICS[0],xmm3);

        result += (res_SIMD_INTRINSICS[0] + res_SIMD_INTRINSICS[1]);

        //ymm0 = _mm256_add_pd( ymm0, ymm13);
        //ymm0 = _mm256_add_pd( ymm0, ymm14);
#endif
//#define _VER_AVX2
#ifdef _VER_AVX2
        __m256d ymm1 = _mm256_load_pd(&vec1[i]);
        __m256d ymm2 = _mm256_load_pd(&vec1[i+4]);
        __m256d ymm3 = _mm256_load_pd(&vec1[i+8]);
        __m256d ymm4 = _mm256_load_pd(&vec1[i+12]);
        //__m256d ymm13 = _mm256_load_pd(&vec1[i+16]);
        //__m256d ymm14 = _mm256_load_pd(&vec1[i+20]);
        //__m256d ymm15 = _mm256_load_pd(&vec1[i+24]);
        //__m256d ymm16 = _mm256_load_pd(&vec1[i+28]);

        __m256d ymm5 = _mm256_load_pd(&vec2[i]);
        __m256d ymm6 = _mm256_load_pd(&vec2[i+4]);
        __m256d ymm7 = _mm256_load_pd(&vec2[i+8]);
        __m256d ymm8 = _mm256_load_pd(&vec2[i+12]);
        //__m256d ymm17 = _mm256_load_pd(&vec2[i+16]);
        //__m256d ymm18 = _mm256_load_pd(&vec2[i+20]);
        //__m256d ymm19 = _mm256_load_pd(&vec2[i+24]);
        //__m256d ymm20 = _mm256_load_pd(&vec2[i+28]);

        __m256d ymm9 = _mm256_mul_pd(ymm1,ymm5);
        __m256d ymm10 = _mm256_fmadd_pd(ymm2,ymm6,ymm9);
        //__m256d ymm11 = _mm256_mul_pd(ymm3,ymm7);
        __m256d ymm11 = _mm256_fmadd_pd(ymm3,ymm7,ymm10);
        __m256d ymm12 = _mm256_fmadd_pd(ymm4,ymm8,ymm11);
        //ymm12 = _mm256_hadd_pd(ymm10,ymm12);
        //__m256d ymm21 = _mm256_fmadd_pd(ymm13,ymm17,ymm12);
        //__m256d ymm22 = _mm256_fmadd_pd(ymm14,ymm18,ymm21);
        //__m256d ymm23 = _mm256_fmadd_pd(ymm15,ymm19,ymm22);
        //__m256d ymm24 = _mm256_fmadd_pd(ymm16,ymm20,ymm23);
        __m128d xmm1 = _mm256_extractf128_pd(ymm12,0);
        __m128d xmm2 = _mm256_extractf128_pd(ymm12,1);
        __m128d xmm3 = _mm_hadd_pd(xmm1,xmm2);
        double res[2] __attribute__((aligned(_CBSIM_DBL_ALIGN_))) = {0.0, 0.0};
        _mm_store_pd(&res[0],xmm3);

        //_mm256_store_pd(&res[0],ymm12);
        result += (res[0] + res[1]);// + (res[2] + res[3]);
#endif

/*        // Performing 4 dot product at one time
        ymm1  = _mm256_load_pd(&vec1[i]);    // x[0]
        ymm2  = _mm256_load_pd(&vec1[i+4]);  // x[1]
        ymm3  = _mm256_load_pd(&vec1[i+8]);  // x[2]
        ymm4  = _mm256_load_pd(&vec1[i+12]); // x[3]

        ymm5  = _mm256_load_pd(&vec2[i]);    // y[0]
        ymm6  = _mm256_load_pd(&vec2[i+4]);  // y[1]
        ymm7  = _mm256_load_pd(&vec2[i+8]);  // y[2]
        ymm8  = _mm256_load_pd(&vec2[i+12]); // y[3]

        ymm9  = _mm256_mul_pd( ymm1, ymm5 );   // xy0
        ymm10 = _mm256_mul_pd( ymm2, ymm6 );   // xy1
        ymm11 = _mm256_hadd_pd( ymm9, ymm10 ); // low to high: xy00+xy01 xy10+xy11 xy02+xy03 xy12+xy13

        ymm12 = _mm256_mul_pd( ymm3, ymm7 ); // xy2
        ymm13 = _mm256_mul_pd( ymm4, ymm8 ); // xy3
        ymm14 = _mm256_hadd_pd( ymm12, ymm13 );  // low to high: xy20+xy21 xy30+xy31 xy22+xy23 xy32+xy33

        ymm15 = _mm256_permute2f128_pd( ymm11, ymm14, 0x21 ); // low to high: xy02+xy03 xy12+xy13 xy20+xy21 xy30+xy31

        ymm1  = _mm256_blend_pd( ymm11, ymm14, 0b1100); // low to high: xy00+xy01 xy10+xy11 xy22+xy23 xy32+xy33

        ymm2 = _mm256_add_pd( ymm15, ymm1 );

        ymm0 = _mm256_add_pd( ymm0, ymm2 );
*/

/*        __m256d x[4], y[4];
        x[0] = _mm256_load_pd(&vec1[i]);
        x[1] = _mm256_load_pd(&vec1[i+4]);
        x[2] = _mm256_load_pd(&vec1[i+8]);
        x[3] = _mm256_load_pd(&vec1[i+12]);
        y[0] = _mm256_load_pd(&vec2[i]);
        y[1] = _mm256_load_pd(&vec2[i+4]);
        y[2] = _mm256_load_pd(&vec2[i+8]);
        y[3] = _mm256_load_pd(&vec2[i+12]);

        __m256d xy0 = _mm256_mul_pd( x[0], y[0] );
        __m256d xy1 = _mm256_mul_pd( x[1], y[1] );
        // low to high: xy00+xy01 xy10+xy11 xy02+xy03 xy12+xy13
        __m256d temp01 = _mm256_hadd_pd( xy0, xy1 );

        __m256d xy2 = _mm256_mul_pd( x[2], y[2] );
        __m256d xy3 = _mm256_mul_pd( x[3], y[3] );
        // low to high: xy20+xy21 xy30+xy31 xy22+xy23 xy32+xy33
        __m256d temp23 = _mm256_hadd_pd( xy2, xy3 );

        // low to high: xy02+xy03 xy12+xy13 xy20+xy21 xy30+xy31
        __m256d swapped = _mm256_permute2f128_pd( temp01, temp23, 0x21 );

        // low to high: xy00+xy01 xy10+xy11 xy22+xy23 xy32+xy33
        __m256d blended = _mm256_blend_pd(temp01, temp23, 0b1100);

        __m256d dotproduct = _mm256_add_pd( swapped, blended );
*/
        //ymm0 = _mm256_add_pd(ymm0,dotproduct);

/*        __m128d xmm1 = _mm256_extractf128_pd(dotproduct,0);
        __m128d xmm2 = _mm256_extractf128_pd(dotproduct,1);;
        __m128d xmm3 = _mm_hadd_pd(xmm1,xmm2);
        double res[2] __attribute__((aligned(_CBSIM_DBL_ALIGN_))) = {0.0, 0.0};
        _mm_store_pd(&res[0],xmm3);

        //_mm256_store_pd(&res[0],ymm12);
        result += (res[0] + res[1]);// + (res[2] + res[3]);
*/

//        _mm256_store_pd(&res[0],dotproduct);
//        result += (res[0] + res[1]) + (res[2] + res[3]);

        //result_SIMD_INTRINSICS += dotproduct[0] + dotproduct[1] + dotproduct[2] + dotproduct[3];

        //double res[4] __attribute__((aligned(_CBSIM_DBL_ALIGN_)));
        //_mm256_store_pd(&res[0],ymm0);
        //result_SIMD_INTRINSICS += res[0] + res[1] + res[2] + res[3];
        //double* res = (double*)&ymm0;
        //result_SIMD_INTRINSICS += res[0] + res[1] + res[2] + res[3];
    }
    //double* res = (double*)&ymm0;
    //result_SIMD_INTRINSICS += res[0] + res[1] + res[2] + res[3];
    //double res[4] __attribute__((aligned(_CBSIM_DBL_ALIGN_)));
    //_mm256_store_pd(&res[0],ymm0);
    //result_SIMD_INTRINSICS += res[0] + res[1] + res[2] + res[3];
}


//    t2 = tbb::tick_count::now();
//    double exec_time = 1000.0*(t2-t1).seconds();


    //std::cout << std::setiosflags(std::ios::fixed) << std::setprecision(5);
    std::cout<<std::endl<<"RESULTS: " <<std::endl;
    std::cout<<"result_intrin ----------: "<< result << std::endl;
    //std::cout<<"result_intrin ----------: "<< result << ", time: " << 1000.0*(t2-t1).seconds() << " ms" << std::endl;

    std::cout<<std::endl<<"Program end. "<<std::endl<<std::endl;
    _mm_free(vec1);
    _mm_free(vec2);
    return 0;
}
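The active path in the loop above reduces to a scalar inside every iteration; the commented-out ymm0 accumulator hints at the usual alternative of accumulating in a register and reducing once per pass. A sketch under the same _PBM_SIZE and 32-byte alignment assumptions (FMA required, otherwise use mul+add):

// Hedged sketch: accumulate the dot product in a vector register,
// reduce to a scalar once after the loop.
__m256d acc = _mm256_setzero_pd();
for (int i = 0; i < _PBM_SIZE; i += 8) {
    __m256d a0 = _mm256_load_pd(&vec1[i]);
    __m256d b0 = _mm256_load_pd(&vec2[i]);
    __m256d a1 = _mm256_load_pd(&vec1[i + 4]);
    __m256d b1 = _mm256_load_pd(&vec2[i + 4]);
    acc = _mm256_fmadd_pd(a0, b0, acc);
    acc = _mm256_fmadd_pd(a1, b1, acc);
}
double tmp[4] __attribute__((aligned(32)));
_mm256_store_pd(tmp, acc);
result = tmp[0] + tmp[1] + tmp[2] + tmp[3];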
Example #15
0
static inline PetscErrorCode TensorContract_FMA(PetscInt dof,PetscInt P,PetscInt Q,const PetscReal Rf[],const PetscReal Sf[],const PetscReal Tf[],TensorMode tmode,const PetscScalar xx[],PetscScalar yy[])
{

  PetscFunctionBegin;
  if (tmode == TENSOR_TRANSPOSE) {PetscInt tmp = Q; Q = P; P = tmp;}
  {
    PetscReal R[Q][P],S[Q][P],T[Q][P];
    const PetscScalar (*x)[P*P*P][NE] = (const PetscScalar(*)[P*P*P][NE])xx;
    PetscScalar       (*y)[Q*Q*Q][NE] =       (PetscScalar(*)[Q*Q*Q][NE])yy;
    PetscScalar u[dof][Q*P*P][NE]_align,v[dof][Q*Q*P][NE]_align;

    for (PetscInt i=0; i<Q; i++) {
      for (PetscInt j=0; j<P; j++) {
        R[i][j] = tmode == TENSOR_EVAL ? Rf[i*P+j] : Rf[j*Q+i];
        S[i][j] = tmode == TENSOR_EVAL ? Sf[i*P+j] : Sf[j*Q+i];
        T[i][j] = tmode == TENSOR_EVAL ? Tf[i*P+j] : Tf[j*Q+i];
      }
    }

    // u[l,a,j,k] = R[a,i] x[l,i,j,k]
    for (PetscInt l=0; l<dof; l++) {
      for (PetscInt a=0; a<Q; a++) {
        __m256d r[P];
        for (PetscInt i=0; i<P; i++) r[i] = _mm256_set1_pd(R[a][i]);
        for (PetscInt jk=0; jk<P*P; jk++) {
          __m256d u_lajk = _mm256_setzero_pd();
          for (PetscInt i=0; i<P; i++) {
            u_lajk = _mm256_fmadd_pd(r[i],_mm256_load_pd(x[l][i*P*P+jk]),u_lajk);
          }
          _mm256_store_pd(u[l][a*P*P+jk],u_lajk);
        }
      }
    }

    // v[l,a,b,k] = S[b,j] u[l,a,j,k]
    for (PetscInt l=0; l<dof; l++) {
      for (PetscInt b=0; b<Q; b++) {
        __m256d s[P];
        for (int j=0; j<P; j++) s[j] = _mm256_set1_pd(S[b][j]);
        for (PetscInt a=0; a<Q; a++) {
          for (PetscInt k=0; k<P; k++) {
            __m256d v_labk = _mm256_setzero_pd();
            for (PetscInt j=0; j<P; j++) {
              v_labk = _mm256_fmadd_pd(s[j],_mm256_load_pd(u[l][(a*P+j)*P+k]),v_labk);
            }
            _mm256_store_pd(v[l][(a*Q+b)*P+k],v_labk);
          }
        }
      }
    }

    // y[l,a,b,c] = T[c,k] v[l,a,b,k]
    for (PetscInt l=0; l<dof; l++) {
      for (PetscInt c=0; c<Q; c++) {
        __m256d t[P];
        for (int k=0; k<P; k++) t[k] = _mm256_set1_pd(T[c][k]);
        for (PetscInt ab=0; ab<Q*Q; ab++) {
          __m256d y_labc = _mm256_load_pd(y[l][ab*Q+c]);
          for (PetscInt k=0; k<P; k++) {
            // for (PetscInt e=0; e<NE; e++) y[l][ab*Q+c][e] += T[c][k] * v[l][ab*P+k][e];
            y_labc = _mm256_fmadd_pd(t[k],_mm256_load_pd(v[l][ab*P+k]),y_labc);
          }
          _mm256_store_pd(y[l][ab*Q+c],y_labc);
        }
      }
    }
    PetscLogFlops(dof*(Q*P*P*P+Q*Q*P*P+Q*Q*Q*P)*NE*2);
  }
  PetscFunctionReturn(0);
}
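For orientation, the scalar form of the first contraction stage above (u[l,a,j,k] = sum_i R[a,i] * x[l,i,j,k]), written over the NE-wide element blocks; a sketch only, mirroring the commented scalar line in the third stage.

/* Hedged sketch: scalar equivalent of the first contraction stage. */
for (PetscInt l=0; l<dof; l++)
  for (PetscInt a=0; a<Q; a++)
    for (PetscInt jk=0; jk<P*P; jk++)
      for (PetscInt e=0; e<NE; e++) {
        PetscScalar sum = 0;
        for (PetscInt i=0; i<P; i++) sum += R[a][i] * x[l][i*P*P+jk][e];
        u[l][a*P*P+jk][e] = sum;
      }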
Example #16
0
__m256d test_mm256_fmadd_pd(__m256d a, __m256d b, __m256d c) {
  // CHECK-LABEL: test_mm256_fmadd_pd
  // CHECK: @llvm.x86.fma.vfmadd.pd.256
  return _mm256_fmadd_pd(a, b, c);
}
Example #17
0
__m256d
check_mm256_fmadd_pd (__m256d a, __m256d b, __m256d c)
{
  return _mm256_fmadd_pd (a, b, c);
}