Example #1
/******************************************************************
*
* SPLIT RADIX PRECOMPUTED AND VECTORIZED FFT MULTIPLICATION
*
******************************************************************/
void sr_vector_mul(ring_t *r, const ring_t *x, const ring_t *y){
  // printf("\n\n**************split-radix FAST**************\n");

  fft_vector_forward(&vctr_x,x);
  fft_vector_forward(&vctr_y,y);
  
  __m256d real_x,imag_x,real_y,imag_y,imag_temp,real_temp;
  // double a,b,c,d;
  for (int i = 0; i < CPLXDIM; i+=4)
  {
    real_x = _mm256_load_pd(vctr_x.real+i);
    imag_x = _mm256_load_pd(vctr_x.imag+i);
    real_y = _mm256_load_pd(vctr_y.real+i);
    imag_y = _mm256_load_pd(vctr_y.imag+i);

    //(a + ib) * (c + id) = (ac - bd) + i(ad+bc)
    //real_temp = bd
    real_temp = _mm256_mul_pd(imag_x,imag_y);
    //imag_temp = ad
    imag_temp = _mm256_mul_pd(real_x,imag_y);
     
    real_x = _mm256_fmsub_pd(real_x,real_y,real_temp);
    imag_x = _mm256_fmadd_pd(imag_x,real_y,imag_temp);

    real_y = _mm256_set1_pd(CPLXDIM);
    real_x = _mm256_div_pd(real_x,real_y);
    imag_x = _mm256_div_pd(imag_x,real_y);

    _mm256_store_pd(vctr_res.real+i,real_x);
    _mm256_store_pd(vctr_res.imag+i,imag_x);
  }
  fft_vector_backward(&vctr_res,r);
}
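/* For reference, a minimal scalar sketch of what the AVX loop above computes per
   index -- the pointwise complex product (a+ib)(c+id) folded together with the
   1/CPLXDIM scaling -- assuming, as the loads suggest, separate real/imag arrays
   of CPLXDIM doubles. The function name is illustrative, not from the source. */
static void cplx_pointwise_mul_scalar(double *res_re, double *res_im,
                                      const double *x_re, const double *x_im,
                                      const double *y_re, const double *y_im)
{
  for (int i = 0; i < CPLXDIM; ++i) {
    double re = x_re[i] * y_re[i] - x_im[i] * y_im[i];   // ac - bd
    double im = x_re[i] * y_im[i] + x_im[i] * y_re[i];   // ad + bc
    res_re[i] = re / CPLXDIM;   // same scaling as the vectorized loop
    res_im[i] = im / CPLXDIM;
  }
}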
Example #2
/******************************************************************
*
* NEGACYCLIC FFT LOOK UP TABLE
*
******************************************************************/
void negacyc_mul(ring_t *r, const ring_t *x, const ring_t *y)
{
  phi_forward(&vector_x,x);
  phi_forward(&vector_y,y);

  __m256d real_x,imag_x,real_y,imag_y,imag_temp,real_temp,dim;
  dim = _mm256_set1_pd(CPLXDIM);
  // double a,b,c,d;
  for (int i = 0; i < CPLXDIM; i+=4)
  {
    real_x = _mm256_load_pd(vector_x.real+i);
    imag_x = _mm256_load_pd(vector_x.imag+i);
    real_y = _mm256_load_pd(vector_y.real+i);
    imag_y = _mm256_load_pd(vector_y.imag+i);

    //(a + ib) * (c + id) = (ac - bd) + i(ad+bc)
    //real_temp = bd
    real_temp = _mm256_mul_pd(imag_x,imag_y);
    //imag_temp = ad
    imag_temp = _mm256_mul_pd(real_x,imag_y);
 
    real_x = _mm256_fmsub_pd(real_x,real_y,real_temp);
    imag_x = _mm256_fmadd_pd(imag_x,real_y,imag_temp);

    
    real_x = _mm256_div_pd(real_x,dim);
    imag_x = _mm256_div_pd(imag_x,dim);

    _mm256_store_pd(vector_res.real+i,real_x);
    _mm256_store_pd(vector_res.imag+i,imag_x);
  }
  phi_backward(&vector_res,r);
  // print_cplx(&vec_res,CPLXDIM);
}
// grid must be initialized to 0s.
void gridKernel_scatter(
    const complexd gcf[w_planes][over][over][max_supp][max_supp]
  , Double4c grid[grid_size][grid_size]
  // , const Pregridded uvw[baselines][timesteps][channels]
  // , const Double4c vis[baselines][timesteps][channels]
  , const Pregridded uvw[baselines * timesteps * channels]
  , const Double4c vis[baselines * timesteps * channels]
  ) {
  for (int sv = 0; sv < max_supp; sv++) { // Moved from 2-levels below according to Romein
    for (int i = 0; i < baselines * timesteps * channels; i++) {
      auto gcf_layer = gcf[0][0][0] + uvw[i].gcf_layer_offset;
#ifdef __AVX__
      // We port Romein's CPU code to doubles here (for MS2 we didn't)
      // vis0 covers XX and XY, vis1 -- YX and YY
      __m256d vis0 = _mm256_load_pd((const double *) &vis[i].XX);
      __m256d vis1 = _mm256_load_pd((const double *) &vis[i].YX);
#endif
      for (int su = 0; su < max_supp; su++) {
      // NOTE: Romein writes about moving this to the very outer scope
      // (2 levels up) to make things more cache-friendly.
      // I don't know whether it is actually more cache-friendly,
      // but I do see that gcc generates code four times shorter for it!
      // We need to investigate this!
      // for (int sv = 0; sv < max_supp; sv++) {
        int gsu, gsv;
        gsu = uvw[i].u + su - max_supp/2;
        gsv = uvw[i].v + sv - max_supp/2;
#ifdef __AVX__
        __m256d weight_r = _mm256_set1_pd(gcf_layer[su][sv].real());
        __m256d weight_i = _mm256_set1_pd(gcf_layer[su][sv].imag());

        /* _mm256_permute_pd(x, 5) swaps adjacent doubles pairs of x:
           d0, d1, d2, d3 goes to d1, d0, d3, d2
         */
        #define __DO(p,f)                                     \
        __m256d * gridptr##p = (__m256d *) &grid[gsu][gsv].f; \
        __m256d tr##p = _mm256_mul_pd(weight_r, vis##p);      \
        __m256d ti##p = _mm256_mul_pd(weight_i, vis##p);      \
        __m256d tp##p = _mm256_permute_pd(ti##p, 5);          \
        __m256d t##p = _mm256_addsub_pd(tr##p, tp##p);        \
        gridptr##p[p] = _mm256_add_pd(gridptr##p[p], t##p)

        __DO(0, XX);
        __DO(1, YX);
#else
        #define __GRID_POL(pol) grid[gsu][gsv].pol += vis[i].pol * gcf_layer[su][sv]
        __GRID_POL(XX);
        __GRID_POL(XY);
        __GRID_POL(YX);
        __GRID_POL(YY);
#endif
      }
    }
  }
}
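/* A hedged scalar reading of the __DO macro above: with w = gcf_layer[su][sv]
   and a complex visibility v, the mul/permute/addsub sequence is an ordinary
   complex multiply-accumulate applied to each packed polarization. An
   illustrative helper (not part of the original source) doing the same thing
   one polarization at a time: */
static inline void grid_cmac(double *g_re, double *g_im,
                             double w_re, double w_im,
                             double v_re, double v_im)
{
  *g_re += w_re * v_re - w_im * v_im;
  *g_im += w_re * v_im + w_im * v_re;
}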
Example #4
void sum_avx(double* c, double* a, double* b, int len)
{
    __m256d rA_AVX, rB_AVX, rC_AVX;   // variables for AVX

    for (int i = 0; i < len; i += 4)
    {
        rA_AVX = _mm256_load_pd(&a[i]);
        rB_AVX = _mm256_load_pd(&b[i]);
        rC_AVX = _mm256_add_pd(rA_AVX, rB_AVX);
        _mm256_store_pd(&c[i], rC_AVX);
    }
}
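/* The loop above assumes len is a multiple of 4 and that a, b and c are
   32-byte aligned (otherwise the aligned load/store intrinsics fault).
   A hedged sketch of a variant without either assumption (immintrin.h
   assumed, as above): */
void sum_avx_any(double* c, double* a, double* b, int len)
{
    int i = 0;
    for (; i + 4 <= len; i += 4)    // unaligned loads/stores: no alignment requirement
        _mm256_storeu_pd(&c[i],
                         _mm256_add_pd(_mm256_loadu_pd(&a[i]), _mm256_loadu_pd(&b[i])));
    for (; i < len; ++i)            // scalar tail for the last 1-3 elements
        c[i] = a[i] + b[i];
}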
Example #5
// multiply each of the n doubles starting at p by v
COREARRAY_DLL_DEFAULT void vec_f64_mul(double *p, size_t n, double v)
{
#if defined(COREARRAY_SIMD_AVX)

	const __m256d v4 = _mm256_set1_pd(v);
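	// The switch below peels 1-3 leading doubles (deliberate fall-through)
	// until p is 32-byte aligned so the main loop can use aligned load/store;
	// pointers that cannot be aligned that way take the default (unaligned) path.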

	switch ((size_t)p & 0x1F)
	{
	case 0x08:
		if (n > 0) { (*p++) *= v; n--; }
	case 0x10:
		if (n > 0) { (*p++) *= v; n--; }
	case 0x18:
		if (n > 0) { (*p++) *= v; n--; }
	case 0x00:
		for (; n >= 4; n-=4)
		{
			_mm256_store_pd(p, _mm256_mul_pd(_mm256_load_pd(p), v4));
			p += 4;
		}
		if (n >= 2)
		{
			_mm_store_pd(p, _mm_mul_pd(_mm_load_pd(p), _mm256_castpd256_pd128(v4)));
			p += 2; n -= 2;
		}
		break;
	default:
		for (; n >= 4; n-=4)
		{
			_mm256_storeu_pd(p, _mm256_mul_pd(_mm256_loadu_pd(p), v4));
			p += 4;
		}
		if (n >= 2)
		{
			_mm_storeu_pd(p, _mm_mul_pd(_mm_loadu_pd(p), _mm256_castpd256_pd128(v4)));
			p += 2; n -= 2;
		}
	}

#elif defined(COREARRAY_SIMD_SSE2)

	const __m128d v2 = _mm_set1_pd(v);

	switch ((size_t)p & 0x0F)
	{
	case 0x08:
		if (n > 0) { (*p++) *= v; n--; }
	case 0x00:
		for (; n >= 2; n-=2, p+=2)
			_mm_store_pd(p, _mm_mul_pd(_mm_load_pd(p), v2));
		break;
	default:
		for (; n >= 2; n-=2, p+=2)
			_mm_storeu_pd(p, _mm_mul_pd(_mm_loadu_pd(p), v2));
	}

#endif

	for (; n > 0; n--) (*p++) *= v;
}
Example #6
irreg_poly_area_func_sign(double, _avx) {
    if (__builtin_expect(is_null(cords) || cords_len == 0, 0))
        return 0;

    __m256d
        curr,
        forw,
        coef_0,
        coef_1,
        end = _mm256_load_pd((const double *)cords),
        accum_sum = _mm256_setzero_pd();
    double accum_sum_aux;

    unsigned long index;
    for (index = 0; index < (cords_len - 4); index += 4) {
        curr = end;                                                 // x0,y0,x1,y1
        forw = _mm256_load_pd((const double *)&cords[index + 2]);   // x2,y2,x3,y3
        end = _mm256_load_pd((const double *)&cords[index + 4]);    // x4,y4,x5,y5

        coef_0 = _mm256_permute2f128_pd(curr, forw, 0b00110001); // x1, y1, x3, y3
        coef_1 = _mm256_permute2f128_pd(forw, end, 0b00100000); // x2, y2, x4, y4

        //_mm256_hsub_pd(a, b) == a0 - a1, b0 - b1, a2 - a3, b2 - b3
        accum_sum = _mm256_add_pd(
            accum_sum,
            _mm256_hsub_pd( // x0*y1 - y0*x1, x1*y2 - y1x2, x2*y3 - y2*x3, x3*y4 - y3*x4
                _mm256_mul_pd( // x0*y1, y0*x1, x2*y3, y2*x3
                    _mm256_permute2f128_pd(curr, forw, 0b00100000),  // x0, y0, x2, y2
                    _mm256_shuffle_pd(coef_0, coef_0, 0b0101)  // y1, x1, y3, x3
                ),
                _mm256_mul_pd(coef_0, _mm256_shuffle_pd(coef_1, coef_1, 0b0101)) // y2, x2, y4, x4
                // ^^^^^^^^^^^^^^^  x1*y2, y1*x2, x3*y4, y3*x4
            )
        );
    }

    accum_sum = _mm256_hadd_pd(accum_sum, _mm256_permute2f128_pd(accum_sum, accum_sum, 1)); // a0+a1, a2+a3, a2+a3, a0+a1
    accum_sum = _mm256_hadd_pd(accum_sum, accum_sum); // a0+a1+a2+a3, ...
    for (accum_sum_aux = _mm_cvtsd_f64(_mm256_castpd256_pd128(accum_sum)); index < (cords_len - 1); index++)
        accum_sum_aux += _calc_diff_of_adj_prods(cords, index);

    return accum_sum_aux;
//    return scalar_half(scalar_abs(accum_sum_aux));
}
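/* For reference, the quantity the vector loop accumulates is the shoelace
   (surveyor's) sum over adjacent vertices; the commented-out return hints that
   the polygon area is half its absolute value. A hedged scalar sketch,
   assuming cords is an array of (x, y) pairs as the casts above suggest
   (names are illustrative, not from the original source): */
static double shoelace_sum(const double (*pts)[2], unsigned long n)
{
    double s = 0.0;
    for (unsigned long i = 0; i + 1 < n; ++i)   // x_i*y_{i+1} - y_i*x_{i+1}
        s += pts[i][0] * pts[i + 1][1] - pts[i][1] * pts[i + 1][0];
    return s;
}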
Example #7
void
printcounters(struct counter *ctrs, uint64_t duration)
{
    struct metrics s = {0};

    uint64_t thisBytesWritten = pcm->bytesWritten();
    uint64_t thisBytesRead = pcm->bytesRead();
    memset(threadspercore, 0, gbl.ncores * sizeof(int));
    s.timestamp = _rdtsc();
    s.duration = duration;
    for (int cpu = 0; cpu < gbl.ncpus; ++cpu)
    {
        double delta[NEVENTS];
        // volatile because another thread is changing it.
        volatile struct counter *p = &ctrs[cpu];

        for (int i = 0; i < NEVENTS; ++i)
        {
            union {
                __m256d c;
                uint64_t values[4];
            } t;
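            // The 256-bit load/store pair below is used purely as one wide copy:
            // it snapshots the four packed 64-bit counter words in a single read
            // and mirrors them into lastctr for the next delta computation.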
            t.c = _mm256_load_pd((const double *)&p->counts[i][0]);
            delta[i] = perf_scale_delta(t.values, lastctr[cpu].counts[i]);
            _mm256_store_pd((double *)&lastctr[cpu].counts[i][0], t.c);
            if (delta[i] < 0)
                delta[i] = 0;
            sevents[i] += delta[i];
        }

        //printf("clocks %g duration %lu\n", delta[clocks], duration);
        if (2*delta[clocks] > duration)
        {
            int thiscore = pcm->getSocketId(cpu)  * gbl.corespersocket +
                pcm->getCoreId(cpu);
            ++s.nthreads;
            ++threadspercore[thiscore];
        }
        s.dsimd += delta[simd_dp];
        s.dsse += delta[sse_dp];
        s.dscalar += delta[scalar_dp];
        s.ssimd += delta[simd_sp];
        s.ssse += delta[sse_sp];
        s.sscalar += delta[scalar_sp];
        s.instrs += delta[instrs];
    }
    s.rbytes = thisBytesRead - lastBytesRead;
    s.wbytes = thisBytesWritten - lastBytesWritten;
    lastBytesRead = thisBytesRead;
    lastBytesWritten = thisBytesWritten;
    for (int i = 0; i < gbl.ncores; ++i)
        if (threadspercore[i])
            ++s.ncores;

    sample(&s);
}
double HodgkinHuxley::dV(double *V, double I) {
	const double C = 1.0;
	const double gNa = 120.0;
	const double gK = 36.0;
	const double gL = 0.3;
	const double ENa = 50.0;
	const double EK = -77.0;
	const double EL = -54.4;
#ifdef __AVX__
/*
AVX is an instruction set from Intel which allows simultaneous operation
on 4 doubles. Seems to be slower than optimized FPU, though.
*/
	double Va[] __attribute__ ((aligned (32))) = {V[0], V[0], V[0], 1.0},
		   Ea[] __attribute__ ((aligned (32))) = {EL, ENa, EK, 0.0},
		   Ga[] __attribute__ ((aligned (32))) = {-gL, -gNa * pow(V[2], 3.0) * V[3], -gK * pow(V[1], 4.0), I};
	
	// load V
	__m256d Vr = _mm256_load_pd(Va);
	// load E
	__m256d Er = _mm256_load_pd(Ea);
	// load G
	__m256d Gr = _mm256_load_pd(Ga);
	// subtract
	Vr = _mm256_sub_pd(Vr, Er);
	// dot product (why does intel not have _mm256_dp_pd ?)
	Vr = _mm256_mul_pd(Vr, Gr);
	__m256d temp = _mm256_hadd_pd(Vr, Vr);
	__m128d lo128 = _mm256_extractf128_pd(temp, 0);
	__m128d hi128 = _mm256_extractf128_pd(temp, 1);
	__m128d dotproduct = _mm_add_pd(lo128, hi128);
	
	double sseVal;
	// store
	_mm_storel_pd(&sseVal, dotproduct);
	sseVal /= C;
		
	return sseVal;
#else
	return (-gL * (V[0] - EL) - gNa * pow(V[2], 3.0) * V[3] * (V[0] - ENa)
		- gK * pow(V[1], 4.0) * (V[0] - EK) + I) / C;
#endif
}
Example #9
inline void transpose_4x4block_AVX_64(double* A, double* B, const size_t lda,
                                   const size_t ldb) {
    __m256d row0 = _mm256_load_pd(&A[0*ldb]);
    __m256d row1 = _mm256_load_pd(&A[1*ldb]);
    __m256d row2 = _mm256_load_pd(&A[2*ldb]);
    __m256d row3 = _mm256_load_pd(&A[3*ldb]);
    __m256d tmp3, tmp2, tmp1, tmp0;
    tmp0 = _mm256_unpacklo_pd(row0, row1);
    tmp1 = _mm256_unpackhi_pd(row0, row1);
    tmp2 = _mm256_unpacklo_pd(row2, row3);
    tmp3 = _mm256_unpackhi_pd(row2, row3);
    row0 = _mm256_permute2f128_pd(tmp0, tmp2, 0x20);
    row1 = _mm256_permute2f128_pd(tmp1, tmp3, 0x20);
    row2 = _mm256_permute2f128_pd(tmp0, tmp2, 0x31);
    row3 = _mm256_permute2f128_pd(tmp1, tmp3, 0x31);
    _mm256_store_pd(&B[0*lda], row0);
    _mm256_store_pd(&B[1*lda], row1);
    _mm256_store_pd(&B[2*lda], row2);
    _mm256_store_pd(&B[3*lda], row3);

}
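/* A hedged usage sketch: covering a full n-by-n row-major transpose with the
   4x4 kernel above. Assumptions: n is a multiple of 4, A and B are 32-byte
   aligned and do not alias; note the kernel reads A with stride ldb and writes
   B with stride lda, so passing n for both works here. */
void transpose_AVX_64(double* A, double* B, size_t n) {
    for (size_t i = 0; i < n; i += 4)
        for (size_t j = 0; j < n; j += 4)
            transpose_4x4block_AVX_64(&A[i*n + j], &B[j*n + i], n, n);
}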
Example #10
/******************************************************************
*
* SPLIT RADIX PRECOMPUTED AND VECTORIZED NON RECURSIVE FFT MULTIPLICATION
*
******************************************************************/
void sr_vector_nonrec_mul(ring_t *r, const ring_t *x, const ring_t *y){
  fft_vector_nonrec_forward(&vec_x,x);
  fft_vector_nonrec_forward(&vec_y,y);
  __m256d real_x,imag_x,real_y,imag_y,imag_temp,real_temp;
  // double a,b,c,d;
  for (int i = 0; i < CPLXDIM; i+=4)
  {
    real_x = _mm256_load_pd(vec_x.real+i);
    imag_x = _mm256_load_pd(vec_x.imag+i);
    real_y = _mm256_load_pd(vec_y.real+i);
    imag_y = _mm256_load_pd(vec_y.imag+i);

    //(a + ib) * (c + id) = (ac - bd) + i(ad+bc)
    //real_temp = bd
    real_temp = _mm256_mul_pd(imag_x,imag_y);
    //imag_temp = ad
    imag_temp = _mm256_mul_pd(real_x,imag_y);
    //REPLACES THE COMMENTED-OUT SECTION BELOW
    //real_x = ac
    // real_x = _mm256_mul_pd(real_x,real_y);
    // //imag_x = bc
    // imag_x = _mm256_mul_pd(imag_x,real_y);
    // //real_x = ac - bd => real_x - real_temp
    // real_x = _mm256_sub_pd(real_x,real_temp);
    // //imag_x = ad + bc => imag_temp + imag_x
    // imag_x = _mm256_add_pd(imag_x,imag_temp);
    //THESE ARE NOT WORKING 
    real_x = _mm256_fmsub_pd(real_x,real_y,real_temp);
    imag_x = _mm256_fmadd_pd(imag_x,real_y,imag_temp);

    real_y = _mm256_set1_pd(CPLXDIM);
    real_x = _mm256_div_pd(real_x,real_y);
    imag_x = _mm256_div_pd(imag_x,real_y);

    _mm256_store_pd(vec_res.real+i,real_x);
    _mm256_store_pd(vec_res.imag+i,imag_x);

  }
  fft_vector_nonrec_backward(&vec_res,r);
}
Example #11
ALGEBRA_INLINE double	vector_ps_double (const double* pa,const double* pb,size_t n) {
    if(ALGEBRA_IS_ALIGNED(pa) && ALGEBRA_IS_ALIGNED(pb)) {
        size_t q = n/4;
        size_t r = n%4;
        double w = 0;

        if(q>0) {
            __m256d acc = _mm256_setzero_pd();
            __m256d i1 = _mm256_load_pd(pa);
            __m256d j1 = _mm256_load_pd(pb);
            pa += 4;
            pb += 4;
            __m256d s = _mm256_mul_pd(i1, j1);
            acc = _mm256_add_pd(acc, s);

            while(--q != 0) {
                // load
                i1 = _mm256_load_pd(pa);
                j1 = _mm256_load_pd(pb);
                pa += 4;
                pb += 4;
                // multiply
                s = _mm256_mul_pd(i1, j1);
                // accumulate
                acc = _mm256_add_pd(acc, s);            
            }
            // final sum
            // add horizontal
            acc = _mm256_hadd_pd(acc, acc);
            // swap the high and low 128-bit halves
            __m256d accp = _mm256_permute2f128_pd(acc, acc, 1);
            // add vertical
            acc = _mm256_add_pd(acc, accp);
            // extract
            _mm_store_sd(&w,  _mm256_extractf128_pd(acc,0));
        }
        return w + vector_ps_double_basic(pa, pb, r);
    }
    return vector_ps_double_basic(pa, pb, n);
}
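/* The final reduction above (hadd, cross-lane permute, add, extract) is a
   standard horizontal-sum idiom; a hedged standalone form of it, for clarity
   (helper name is illustrative, not from the original source): */
static inline double hsum256_pd(__m256d v)
{
    __m256d h = _mm256_hadd_pd(v, v);                               // (v0+v1, v0+v1, v2+v3, v2+v3)
    __m256d s = _mm256_add_pd(h, _mm256_permute2f128_pd(h, h, 1));  // add the swapped 128-bit halves
    return _mm_cvtsd_f64(_mm256_castpd256_pd128(s));                // every lane now holds the total
}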
// this function assumes data is stored in col-major
// if data is in row major, call it like matmul4x4(B, A, C)
void matmul4x4(double *A, double *B, double *C) {
    __m256d col[4], sum[4];
    //load every column into registers
    for(int i=0; i<4; i++)  
      col[i] = _mm256_load_pd(&A[i*4]);
    for(int i=0; i<4; i++) {
        sum[i] = _mm256_setzero_pd();      
        for(int j=0; j<4; j++) {
            sum[i] = _mm256_add_pd(_mm256_mul_pd(_mm256_set1_pd(B[i*4+j]), col[j]), sum[i]);
        }           
    }
    for(int i=0; i<4; i++) 
      _mm256_store_pd(&C[i*4], sum[i]); 
}
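/* A hedged usage sketch for the kernel above: a column-major 4x4 product
   against the identity, so C should come back equal to A. The buffers must be
   32-byte aligned for the aligned loads/stores. */
void matmul4x4_example(void) {
    double A[16] __attribute__((aligned(32)));
    double B[16] __attribute__((aligned(32)));   // identity in column-major layout
    double C[16] __attribute__((aligned(32)));
    for (int i = 0; i < 16; ++i) { A[i] = (double)i; B[i] = (i % 5 == 0) ? 1.0 : 0.0; }
    matmul4x4(A, B, C);   // C = A * B; for row-major inputs call matmul4x4(B, A, C)
}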
static inline void matmul_4xkxkx4(int lda, int K, double* a, double* b, double* c)
{
  __m256d a_coli, bi0, bi1, bi2, bi3;
  __m256d c_col0, c_col1, c_col2, c_col3;

  /* layout of 4x4 c matrix
      00 01 02 03
      10 11 12 13
      20 21 22 23
      30 31 32 33
  */
  double* c01_ptr = c + lda;
  double* c02_ptr = c01_ptr + lda;
  double* c03_ptr = c02_ptr + lda;

  // load old value of c
  c_col0 = _mm256_loadu_pd(c);
  c_col1 = _mm256_loadu_pd(c01_ptr);
  c_col2 = _mm256_loadu_pd(c02_ptr);
  c_col3 = _mm256_loadu_pd(c03_ptr);

  // for every column of a (or every row of b)
  for (int i = 0; i < K; ++i) 
  {
    a_coli = _mm256_load_pd(a);
    a += 4;

    bi0 = _mm256_broadcast_sd(b++);
    bi1 = _mm256_broadcast_sd(b++);
    bi2 = _mm256_broadcast_sd(b++);
    bi3 = _mm256_broadcast_sd(b++);

    c_col0 = _mm256_add_pd(c_col0, _mm256_mul_pd(a_coli, bi0));
    c_col1 = _mm256_add_pd(c_col1, _mm256_mul_pd(a_coli, bi1));
    c_col2 = _mm256_add_pd(c_col2, _mm256_mul_pd(a_coli, bi2));
    c_col3 = _mm256_add_pd(c_col3, _mm256_mul_pd(a_coli, bi3));
  }

  _mm256_storeu_pd(c, c_col0);
  _mm256_storeu_pd(c01_ptr, c_col1);
  _mm256_storeu_pd(c02_ptr, c_col2);
  _mm256_storeu_pd(c03_ptr, c_col3);
}
Example #14
 /*!
  * \brief Load a packed vector from the given aligned memory location
  */
 ETL_STATIC_INLINE(avx_simd_double) load(const double* memory) {
     return _mm256_load_pd(memory);
 }
Example #15
// the kernel moves horizontally within a block (A is upper triangular)
void kernel_dtrmv_u_n_4_lib4(int kmax, double *A, double *x, double *y, int alg)
	{

	if(kmax<=0) 
		return;
	
	const int lda = 4;
	
	int k;
	
	__m128d
		tmp0,
		z_0, y_0_1, a_00_10;

	__m256d
		zeros,
		ax_temp,
		a_00_10_20_30, a_01_11_21_31, a_02_12_22_32, a_03_13_23_33,
		x_0, x_1, x_2, x_3,
		y_0_1_2_3, y_0_1_2_3_b, y_0_1_2_3_c, y_0_1_2_3_d, z_0_1_2_3;
		
	zeros = _mm256_setzero_pd();
	
/*	y_0_1_2_3   = _mm256_setzero_pd();	*/
/*	y_0_1_2_3_b = _mm256_setzero_pd();	*/
/*	y_0_1_2_3_c = _mm256_setzero_pd();	*/
	y_0_1_2_3_d = _mm256_setzero_pd();
	
	// second col (avoid zero y_0_1)
	z_0     = _mm_loaddup_pd( &x[1] );
	a_00_10 = _mm_load_pd( &A[0+lda*1] );
	y_0_1   = _mm_mul_pd( a_00_10, z_0 );

	// first col
	z_0     = _mm_load_sd( &x[0] );
	a_00_10 = _mm_load_sd( &A[0+lda*0] );
	tmp0    = _mm_mul_sd( a_00_10, z_0 );
	y_0_1   = _mm_add_sd( y_0_1, tmp0 );
	y_0_1_2_3_c = _mm256_castpd128_pd256( y_0_1 );
	y_0_1_2_3_c = _mm256_blend_pd( y_0_1_2_3_c, y_0_1_2_3_d, 0xc );

	// fourth col (avoid zero y_0_1_2_3)
	x_3     = _mm256_broadcast_sd( &x[3] );
	a_03_13_23_33 = _mm256_load_pd( &A[0+lda*3] );
	y_0_1_2_3 = _mm256_mul_pd( a_03_13_23_33, x_3 );

	// third col
	x_2     = _mm256_broadcast_sd( &x[2] );
	x_2     = _mm256_blend_pd( x_2, zeros, 0x8 );
	a_02_12_22_32 = _mm256_load_pd( &A[0+lda*2] );
	y_0_1_2_3_b = _mm256_mul_pd( a_02_12_22_32, x_2 );

	A += 4*lda;
	x += 4;

	k=4;
	for(; k<kmax-3; k+=4)
		{

		x_0 = _mm256_broadcast_sd( &x[0] );
		x_1 = _mm256_broadcast_sd( &x[1] );

		a_00_10_20_30 = _mm256_load_pd( &A[0+lda*0] );
		a_01_11_21_31 = _mm256_load_pd( &A[0+lda*1] );

		x_2 = _mm256_broadcast_sd( &x[2] );
		x_3 = _mm256_broadcast_sd( &x[3] );

		a_02_12_22_32 = _mm256_load_pd( &A[0+lda*2] );
		a_03_13_23_33 = _mm256_load_pd( &A[0+lda*3] );

		ax_temp = _mm256_mul_pd( a_00_10_20_30, x_0 );
		y_0_1_2_3 = _mm256_add_pd( y_0_1_2_3, ax_temp );
		ax_temp = _mm256_mul_pd( a_01_11_21_31, x_1 );
		y_0_1_2_3_b = _mm256_add_pd( y_0_1_2_3_b, ax_temp );

		ax_temp = _mm256_mul_pd( a_02_12_22_32, x_2 );
		y_0_1_2_3_c = _mm256_add_pd( y_0_1_2_3_c, ax_temp );
		ax_temp = _mm256_mul_pd( a_03_13_23_33, x_3 );
		y_0_1_2_3_d = _mm256_add_pd( y_0_1_2_3_d, ax_temp );
		
		A += 4*lda;
		x += 4;

		}
	
	y_0_1_2_3 = _mm256_add_pd( y_0_1_2_3, y_0_1_2_3_c );
	y_0_1_2_3_b = _mm256_add_pd( y_0_1_2_3_b, y_0_1_2_3_d );

	if(kmax%4>=2)
		{

		x_0 = _mm256_broadcast_sd( &x[0] );
		x_1 = _mm256_broadcast_sd( &x[1] );

		a_00_10_20_30 = _mm256_load_pd( &A[0+lda*0] );
		a_01_11_21_31 = _mm256_load_pd( &A[0+lda*1] );

		ax_temp = _mm256_mul_pd( a_00_10_20_30, x_0 );
		y_0_1_2_3 = _mm256_add_pd( y_0_1_2_3, ax_temp );
		ax_temp = _mm256_mul_pd( a_01_11_21_31, x_1 );
		y_0_1_2_3_b = _mm256_add_pd( y_0_1_2_3_b, ax_temp );

		A += 2*lda;
		x += 2;

		}

	y_0_1_2_3 = _mm256_add_pd( y_0_1_2_3, y_0_1_2_3_b );

	if(kmax%2==1)
		{

		x_0 = _mm256_broadcast_sd( &x[0] );

		a_00_10_20_30 = _mm256_load_pd( &A[0+lda*0] );

		ax_temp = _mm256_mul_pd( a_00_10_20_30, x_0 );
		y_0_1_2_3 = _mm256_add_pd( y_0_1_2_3, ax_temp );
		
		}

	if(alg==0)
		{
		_mm256_storeu_pd(&y[0], y_0_1_2_3);
		}
	else if(alg==1)
		{
		z_0_1_2_3 = _mm256_loadu_pd( &y[0] );

		z_0_1_2_3 = _mm256_add_pd ( z_0_1_2_3, y_0_1_2_3 );

		_mm256_storeu_pd(&y[0], z_0_1_2_3);
		}
	else // alg==-1
		{
		z_0_1_2_3 = _mm256_loadu_pd( &y[0] );

		z_0_1_2_3 = _mm256_sub_pd ( z_0_1_2_3, y_0_1_2_3 );

		_mm256_storeu_pd(&y[0], z_0_1_2_3);
		}

	}
static inline PetscErrorCode TensorContract_FMA(PetscInt dof,PetscInt P,PetscInt Q,const PetscReal Rf[],const PetscReal Sf[],const PetscReal Tf[],TensorMode tmode,const PetscScalar xx[],PetscScalar yy[])
{

  PetscFunctionBegin;
  if (tmode == TENSOR_TRANSPOSE) {PetscInt tmp = Q; Q = P; P = tmp;}
  {
    PetscReal R[Q][P],S[Q][P],T[Q][P];
    const PetscScalar (*x)[P*P*P][NE] = (const PetscScalar(*)[P*P*P][NE])xx;
    PetscScalar       (*y)[Q*Q*Q][NE] =       (PetscScalar(*)[Q*Q*Q][NE])yy;
    PetscScalar u[dof][Q*P*P][NE]_align,v[dof][Q*Q*P][NE]_align;

    for (PetscInt i=0; i<Q; i++) {
      for (PetscInt j=0; j<P; j++) {
        R[i][j] = tmode == TENSOR_EVAL ? Rf[i*P+j] : Rf[j*Q+i];
        S[i][j] = tmode == TENSOR_EVAL ? Sf[i*P+j] : Sf[j*Q+i];
        T[i][j] = tmode == TENSOR_EVAL ? Tf[i*P+j] : Tf[j*Q+i];
      }
    }

    // u[l,a,j,k] = R[a,i] x[l,i,j,k]
    for (PetscInt l=0; l<dof; l++) {
      for (PetscInt a=0; a<Q; a++) {
        __m256d r[P];
        for (PetscInt i=0; i<P; i++) r[i] = _mm256_set1_pd(R[a][i]);
        for (PetscInt jk=0; jk<P*P; jk++) {
          __m256d u_lajk = _mm256_setzero_pd();
          for (PetscInt i=0; i<P; i++) {
            u_lajk = _mm256_fmadd_pd(r[i],_mm256_load_pd(x[l][i*P*P+jk]),u_lajk);
          }
          _mm256_store_pd(u[l][a*P*P+jk],u_lajk);
        }
      }
    }

    // v[l,a,b,k] = S[b,j] u[l,a,j,k]
    for (PetscInt l=0; l<dof; l++) {
      for (PetscInt b=0; b<Q; b++) {
        __m256d s[P];
        for (int j=0; j<P; j++) s[j] = _mm256_set1_pd(S[b][j]);
        for (PetscInt a=0; a<Q; a++) {
          for (PetscInt k=0; k<P; k++) {
            __m256d v_labk = _mm256_setzero_pd();
            for (PetscInt j=0; j<P; j++) {
              v_labk = _mm256_fmadd_pd(s[j],_mm256_load_pd(u[l][(a*P+j)*P+k]),v_labk);
            }
            _mm256_store_pd(v[l][(a*Q+b)*P+k],v_labk);
          }
        }
      }
    }

    // y[l,a,b,c] = T[c,k] v[l,a,b,k]
    for (PetscInt l=0; l<dof; l++) {
      for (PetscInt c=0; c<Q; c++) {
        __m256d t[P];
        for (int k=0; k<P; k++) t[k] = _mm256_set1_pd(T[c][k]);
        for (PetscInt ab=0; ab<Q*Q; ab++) {
          __m256d y_labc = _mm256_load_pd(y[l][ab*Q+c]);
          for (PetscInt k=0; k<P; k++) {
            // for (PetscInt e=0; e<NE; e++) y[l][ab*Q+c][e] += T[c][k] * v[l][ab*P+k][e];
            y_labc = _mm256_fmadd_pd(t[k],_mm256_load_pd(v[l][ab*P+k]),y_labc);
          }
          _mm256_store_pd(y[l][ab*Q+c],y_labc);
        }
      }
    }
    PetscLogFlops(dof*(Q*P*P*P+Q*Q*P*P+Q*Q*Q*P)*NE*2);
  }
  PetscFunctionReturn(0);
}
Example #17
// the kernel moves vertically across blocks
void kernel_dtrmv_u_t_1_lib4(int kmax, double *A, int sda, double *x, double *y, int alg)
	{

/*	if(kmax<=0) */
/*		return;*/
	
	const int lda = 4;
	
	double *tA, *tx;

	int k;
	
	__m256d
		tmp0,
		a_00_10_20_30,
		x_0_1_2_3,
		y_00;
	
	
	y_00 = _mm256_setzero_pd();

	k=0;
	for(; k<kmax-3; k+=4)
		{
		
		x_0_1_2_3 = _mm256_loadu_pd( &x[0] );

		a_00_10_20_30 = _mm256_load_pd( &A[0+lda*0] );
		
		tmp0 = _mm256_mul_pd( a_00_10_20_30, x_0_1_2_3 );
		y_00 = _mm256_add_pd( y_00, tmp0 );
		
		A += 4 + (sda-1)*lda;
		x += 4;

		}

	__m128d
		tm0,
		a_00_10, a_01_11,
		x_0_1,
		y_0, y_1, y_0_1;
	
	tm0 = _mm256_extractf128_pd( y_00, 0x1 );
	y_0 = _mm256_castpd256_pd128( y_00 );
	y_0 = _mm_add_pd( y_0, tm0 );

	if(k<kmax-1)
		{
		
		x_0_1 = _mm_loadu_pd( &x[0] );

		a_00_10 = _mm_load_pd( &A[0+lda*0] );
		
		tm0 = _mm_mul_pd( a_00_10, x_0_1 );
		y_0 = _mm_add_pd( y_0, tm0 );
		
		A += 2;
		x += 2;

		}
	
	x_0_1 = _mm_load_sd( &x[0] );
	a_00_10 = _mm_load_sd( &A[0+lda*0] );
	tm0 = _mm_mul_sd( a_00_10, x_0_1 );
	y_0 = _mm_add_sd( y_0, tm0 );

	y_0 = _mm_hadd_pd( y_0, y_0 );


	if(alg==0)
		{
		_mm_store_sd(&y[0], y_0);
		}
	else if(alg==1)
		{
		y_0_1 = _mm_load_sd( &y[0] );

		y_0_1 = _mm_add_sd( y_0_1, y_0 );

		_mm_store_sd(&y[0], y_0_1);
		}
	else // alg==-1
		{
		y_0_1 = _mm_load_sd( &y[0] );
	
		y_0_1 = _mm_sub_sd( y_0_1, y_0 );
	
		_mm_store_sd(&y[0], y_0_1);
		}

	}
Example #18
int main()
{
    constexpr size_t N = 100 << 20;
    constexpr size_t N_pd = N/sizeof(double);
    constexpr size_t N_ps = N/sizeof(float);

    printf("Comparing std::log to mm256_log_pd/mm256_log_ps with %zuMiB input data\n", N>>20);

    double * data_pd = (double*)_mm_malloc(N_pd*sizeof(double), 32);
    float  * data_ps = (float*) _mm_malloc(N_ps*sizeof(float),  32);

    double * outl_pd = (double*)_mm_malloc(N_pd*sizeof(double), 32);
    float  * outl_ps = (float*) _mm_malloc(N_ps*sizeof(float),  32);

    double * outa_pd = (double*)_mm_malloc(N_pd*sizeof(double), 32);
    float  * outa_ps = (float*) _mm_malloc(N_ps*sizeof(float),  32);

    double * err_pd = (double*)_mm_malloc(N_pd*sizeof(double), 32);
    float  * err_ps = (float*) _mm_malloc(N_ps*sizeof(float),  32);

    size_t * idx_pd = (size_t*)malloc(N_pd*sizeof(size_t));
    size_t * idx_ps = (size_t*)malloc(N_ps*sizeof(size_t));

    if(data_pd == nullptr || data_ps == nullptr ||
       outl_pd == nullptr || outl_ps == nullptr ||
       outa_pd == nullptr || outa_ps == nullptr ||
        err_pd == nullptr ||  err_ps == nullptr) {
        return 1;
    }

    auto rng = std::mt19937(hrc::now().time_since_epoch().count());

    printf("Filling double input data... ");
    fflush(stdout);
    for(size_t i = 0; i < N_pd; ++i) {
        data_pd[i] = /*100.0  */ std::generate_canonical<double, 64>(rng);
    }
    printf("done\n");

    printf("Filling float input data... ");
    fflush(stdout);
    for(size_t i = 0; i < N_ps; ++i) {
        data_ps[i] = /*100.0f */ std::generate_canonical<float, 32>(rng);
    }
    printf("done\n\n");

    printf("Testing serial run:\n\n");

    printf("Running std::log double... ");
    fflush(stdout);
    auto log_pd_s_time_start = hrc::now();
    for(size_t i = 0; i < N_pd; i += 8) {
        outl_pd[i+0] = std::log(data_pd[i+0]);
        outl_pd[i+1] = std::log(data_pd[i+1]);
        outl_pd[i+2] = std::log(data_pd[i+2]);
        outl_pd[i+3] = std::log(data_pd[i+3]);
        outl_pd[i+4] = std::log(data_pd[i+4]);
        outl_pd[i+5] = std::log(data_pd[i+5]);
        outl_pd[i+6] = std::log(data_pd[i+6]);
        outl_pd[i+7] = std::log(data_pd[i+7]);
    }
    auto log_pd_s_time_end = hrc::now();
    printf("done in %lums\n", std::chrono::duration_cast<ms>(log_pd_s_time_end - log_pd_s_time_start).count());

    printf("Running mm256_log_pd... ");
    fflush(stdout);
    auto avx_pd_s_time_start = hrc::now();
    for(size_t i = 0; i < N_pd; i += 8) {
        _mm256_store_pd(outa_pd+i+0, mm256_log_pd(_mm256_load_pd(data_pd+i+0)));
        _mm256_store_pd(outa_pd+i+4, mm256_log_pd(_mm256_load_pd(data_pd+i+4)));
    }
    auto avx_pd_s_time_end = hrc::now();
    printf("done in %lums\n", std::chrono::duration_cast<ms>(avx_pd_s_time_end - avx_pd_s_time_start).count());

    printf("Running std::log float... ");
    fflush(stdout);
    auto log_ps_s_time_start = hrc::now();
    for(size_t i = 0; i < N_ps; i += 16) {
        outl_ps[i+ 0] = std::log(data_ps[i+ 0]);
        outl_ps[i+ 1] = std::log(data_ps[i+ 1]);
        outl_ps[i+ 2] = std::log(data_ps[i+ 2]);
        outl_ps[i+ 3] = std::log(data_ps[i+ 3]);
        outl_ps[i+ 4] = std::log(data_ps[i+ 4]);
        outl_ps[i+ 5] = std::log(data_ps[i+ 5]);
        outl_ps[i+ 6] = std::log(data_ps[i+ 6]);
        outl_ps[i+ 7] = std::log(data_ps[i+ 7]);
        outl_ps[i+ 8] = std::log(data_ps[i+ 8]);
        outl_ps[i+ 9] = std::log(data_ps[i+ 9]);
        outl_ps[i+10] = std::log(data_ps[i+10]);
        outl_ps[i+11] = std::log(data_ps[i+11]);
        outl_ps[i+12] = std::log(data_ps[i+12]);
        outl_ps[i+13] = std::log(data_ps[i+13]);
        outl_ps[i+14] = std::log(data_ps[i+14]);
        outl_ps[i+15] = std::log(data_ps[i+15]);
    }
    auto log_ps_s_time_end = hrc::now();
    printf("done in %lums\n", std::chrono::duration_cast<ms>(log_ps_s_time_end - log_ps_s_time_start).count());

    printf("Running mm256_log_ps... ");
    fflush(stdout);
    auto avx_ps_s_time_start = hrc::now();
    for(size_t i = 0; i < N_ps; i += 16) {
        _mm256_store_ps(outa_ps+i+0, mm256_log_ps(_mm256_load_ps(data_ps+i+0)));
        _mm256_store_ps(outa_ps+i+8, mm256_log_ps(_mm256_load_ps(data_ps+i+8)));
    }
    auto avx_ps_s_time_end = hrc::now();
    printf("done in %lums\n", std::chrono::duration_cast<ms>(avx_ps_s_time_end - avx_ps_s_time_start).count());


    printf("\n\nTesting parallel run:\n\n");

    printf("Running std::log double... ");
    fflush(stdout);
    auto log_pd_p_time_start = hrc::now();
    #pragma omp parallel for
    for(size_t i = 0; i < N_pd; i += 8) {
        outl_pd[i+0] = std::log(data_pd[i+0]);
        outl_pd[i+1] = std::log(data_pd[i+1]);
        outl_pd[i+2] = std::log(data_pd[i+2]);
        outl_pd[i+3] = std::log(data_pd[i+3]);
        outl_pd[i+4] = std::log(data_pd[i+4]);
        outl_pd[i+5] = std::log(data_pd[i+5]);
        outl_pd[i+6] = std::log(data_pd[i+6]);
        outl_pd[i+7] = std::log(data_pd[i+7]);
    }
    auto log_pd_p_time_end = hrc::now();
    printf("done in %lums\n", std::chrono::duration_cast<ms>(log_pd_p_time_end - log_pd_p_time_start).count());

    printf("Running mm256_log_pd... ");
    fflush(stdout);
    auto avx_pd_p_time_start = hrc::now();
    #pragma omp parallel for
    for(size_t i = 0; i < N_pd; i += 8) {
        _mm256_store_pd(outa_pd+i+0, mm256_log_pd(_mm256_load_pd(data_pd+i+0)));
        _mm256_store_pd(outa_pd+i+4, mm256_log_pd(_mm256_load_pd(data_pd+i+4)));
    }
    auto avx_pd_p_time_end = hrc::now();
    printf("done in %lums\n", std::chrono::duration_cast<ms>(avx_pd_p_time_end - avx_pd_p_time_start).count());

    printf("Running std::log float... ");
    fflush(stdout);
    auto log_ps_p_time_start = hrc::now();
    #pragma omp parallel for
    for(size_t i = 0; i < N_ps; i += 16) {
        outl_ps[i+ 0] = std::log(data_ps[i+ 0]);
        outl_ps[i+ 1] = std::log(data_ps[i+ 1]);
        outl_ps[i+ 2] = std::log(data_ps[i+ 2]);
        outl_ps[i+ 3] = std::log(data_ps[i+ 3]);
        outl_ps[i+ 4] = std::log(data_ps[i+ 4]);
        outl_ps[i+ 5] = std::log(data_ps[i+ 5]);
        outl_ps[i+ 6] = std::log(data_ps[i+ 6]);
        outl_ps[i+ 7] = std::log(data_ps[i+ 7]);
        outl_ps[i+ 8] = std::log(data_ps[i+ 8]);
        outl_ps[i+ 9] = std::log(data_ps[i+ 9]);
        outl_ps[i+10] = std::log(data_ps[i+10]);
        outl_ps[i+11] = std::log(data_ps[i+11]);
        outl_ps[i+12] = std::log(data_ps[i+12]);
        outl_ps[i+13] = std::log(data_ps[i+13]);
        outl_ps[i+14] = std::log(data_ps[i+14]);
        outl_ps[i+15] = std::log(data_ps[i+15]);
    }
    auto log_ps_p_time_end = hrc::now();
    printf("done in %lums\n", std::chrono::duration_cast<ms>(log_ps_p_time_end - log_ps_p_time_start).count());

    printf("Running mm256_log_ps... ");
    fflush(stdout);
    auto avx_ps_p_time_start = hrc::now();
    #pragma omp parallel for
    for(size_t i = 0; i < N_ps; i += 16) {
        _mm256_store_ps(outa_ps+i+0, mm256_log_ps(_mm256_load_ps(data_ps+i+0)));
        _mm256_store_ps(outa_ps+i+8, mm256_log_ps(_mm256_load_ps(data_ps+i+8)));
    }
    auto avx_ps_p_time_end = hrc::now();
    printf("done in %lums\n", std::chrono::duration_cast<ms>(avx_ps_p_time_end - avx_ps_p_time_start).count());

    printf("\nCalculating errors... ");
    fflush(stdout);
    #pragma omp parallel for
    for(size_t i = 0; i < N_pd; ++i) {
        err_pd[i] = std::abs(1.0 - outa_pd[i]/outl_pd[i]);
    }
    #pragma omp parallel for
    for(size_t i = 0; i < N_ps; ++i) {
        err_ps[i] = std::abs(1.0 - outa_ps[i]/outl_ps[i]);
    }
    #pragma omp parallel for
    for(size_t i = 0; i < N_pd; ++i) {
        idx_pd[i] = i;
    }
    #pragma omp parallel for
    for(size_t i = 0; i < N_ps; ++i) {
        idx_ps[i] = i;
    }

    std::sort(idx_pd, idx_pd+N_pd, [&](size_t a, size_t b){ return err_pd[a] < err_pd[b]; });
    std::sort(idx_ps, idx_ps+N_ps, [&](size_t a, size_t b){ return err_ps[a] < err_ps[b]; });
    printf("done\n");

    printf("\n\nSummary:\n");
    double lsd_s = std::chrono::duration_cast<sd>(log_pd_s_time_end - log_pd_s_time_start).count();
    double asd_s = std::chrono::duration_cast<sd>(avx_pd_s_time_end - avx_pd_s_time_start).count();
    double lss_s = std::chrono::duration_cast<sd>(log_ps_s_time_end - log_ps_s_time_start).count();
    double ass_s = std::chrono::duration_cast<sd>(avx_ps_s_time_end - avx_ps_s_time_start).count();
    double lpd_s = std::chrono::duration_cast<sd>(log_pd_p_time_end - log_pd_p_time_start).count();
    double apd_s = std::chrono::duration_cast<sd>(avx_pd_p_time_end - avx_pd_p_time_start).count();
    double lps_s = std::chrono::duration_cast<sd>(log_ps_p_time_end - log_ps_p_time_start).count();
    double aps_s = std::chrono::duration_cast<sd>(avx_ps_p_time_end - avx_ps_p_time_start).count();

    printf(" Algorithm    | Data Type | parallel | time      | speed      | min rel err  | max rel err  | 90%% rel err\n");
    printf("-----------------------------------------------------------------------------------------------------------\n");
    printf(" std::log     | double    |  false   | %6.1f ms | %4.2f GiB/s\n", 1000.0 * lsd_s, N / lsd_s / (1<<30));
    printf(" mm256_log_pd | double    |  false   | %6.1f ms | %4.2f GiB/s | %e | %e | %e\n", 1000.0 * asd_s, N / asd_s / (1<<30), err_pd[idx_pd[0]], err_pd[idx_pd[N_pd-1]], err_pd[idx_pd[90*N_pd/100]]);
    printf(" std::log     | float     |  false   | %6.1f ms | %4.2f GiB/s\n", 1000.0 * lss_s, N / lss_s / (1<<30));
    printf(" mm256_log_ps | float     |  false   | %6.1f ms | %4.2f GiB/s | %e | %e | %e\n", 1000.0 * ass_s, N / ass_s / (1<<30), err_ps[idx_ps[0]], err_ps[idx_ps[N_ps-1]], err_ps[idx_ps[90*N_ps/100]]);
    printf(" std::log     | double    |  true    | %6.1f ms | %4.2f GiB/s\n", 1000.0 * lpd_s, N / lpd_s / (1<<30));
    printf(" mm256_log_pd | double    |  true    | %6.1f ms | %4.2f GiB/s | %e | %e | %e\n", 1000.0 * apd_s, N / apd_s / (1<<30), err_pd[idx_pd[0]], err_pd[idx_pd[N_pd-1]], err_pd[idx_pd[90*N_pd/100]]);
    printf(" std::log     | float     |  true    | %6.1f ms | %4.2f GiB/s\n", 1000.0 * lps_s, N / lps_s / (1<<30));
    printf(" mm256_log_ps | float     |  true    | %6.1f ms | %4.2f GiB/s | %e | %e | %e\n", 1000.0 * aps_s, N / aps_s / (1<<30), err_ps[idx_ps[0]], err_ps[idx_ps[N_ps-1]], err_ps[idx_ps[90*N_ps/100]]);

    _mm_free(data_pd);
    _mm_free(data_ps);
    _mm_free(outl_pd);
    _mm_free(outl_ps);
    _mm_free(outa_pd);
    _mm_free(outa_ps);
    _mm_free(err_pd);
    _mm_free(err_ps);

    free(idx_pd);
    free(idx_ps);

    return 0;
}
Example #19
inline F64vec4 load(const double *r)
{
    return _mm256_load_pd(r);
}
Example #20
void core::Vector3::normalize(void)
{
#if defined(VTX_USE_AVX)
	ALIGNED_32 platform::F64_t vector[] = {this->x, this->y, this->z, 0};
	ALIGNED_32 platform::F64_t reciprocalVector[] = {1.0, 1.0, 1.0, 1.0};
	__m256d simdvector;
	__m256d result;
	__m256d recp;

	simdvector = _mm256_load_pd(vector);
	recp = _mm256_load_pd(reciprocalVector);

	result = _mm256_mul_pd(simdvector, simdvector);

	result = _mm256_hadd_pd(result, result);
	result = _mm256_hadd_pd(result, result);

	result = _mm256_sqrt_pd(result);

	result = _mm256_div_pd(recp, result);

	simdvector = _mm256_mul_pd(simdvector, result);

	_mm256_store_pd(vector, simdvector);

	this->x = vector[0];
	this->y = vector[1];
	this->z = vector[2];

#elif defined(VTX_USE_SSE)
	// Must pad with a trailing 0, to store in 128-bit register
	ALIGNED_16 core::F32_t vector[] = {this->x, this->y, this->z, 0};
	__m128 simdvector;
	__m128 result;
	simdvector = _mm_load_ps(vector);
	
	// (X^2, Y^2, Z^2, 0^2)
	result = _mm_mul_ps(simdvector, simdvector);
	
	// Add all elements together, giving us (X^2 + Y^2 + Z^2 + 0^2)
	result = _mm_hadd_ps(result, result);
	result = _mm_hadd_ps(result, result);
	
	// Calculate square root, giving us sqrt(X^2 + Y^2 + Z^2 + 0^2)
	result = _mm_sqrt_ps(result);

	// Calculate reciprocal, giving us 1 / sqrt(X^2 + Y^2 + Z^2 + 0^2)
	result = _mm_rcp_ps(result);

	// Finally, multiply the result with our original vector.
	simdvector = _mm_mul_ps(simdvector, result);

	_mm_store_ps(vector, simdvector);
	
	this->x = vector[0];
	this->y = vector[1];
	this->z = vector[2];
#else
	core::F64_t num = 1.0 / std::sqrt(std::pow(this->x, 2) + std::pow(this->y, 2) + std::pow(this->z, 2));
	this->x *= num;
	this->y *= num;
	this->z *= num;
#endif
}
Example #21
ALGEBRA_INLINE void		vector_addm_double_aligned_32 (double* v1,double lambda,const double* v2,size_t n)
{
	size_t k;
	
	__m256d l1 = _mm256_broadcast_sd(&lambda);
	__m256d l2 = _mm256_broadcast_sd(&lambda);
	__m256d l3 = _mm256_broadcast_sd(&lambda);
	__m256d l4 = _mm256_broadcast_sd(&lambda);

	size_t q = n / 16;
	size_t r = n % 16;
	if(q > 0) {
		if (ALGEBRA_IS_ALIGNED(v1) && ALGEBRA_IS_ALIGNED(v2)) {
			for (k=0;k<q;k++) {
				/* Load 16 values (four AVX vectors) from each array */
				__m256d i1 = _mm256_load_pd(v1);
				__m256d j1 = _mm256_load_pd(v2);
				__m256d i2 = _mm256_load_pd(v1+4);
				__m256d j2 = _mm256_load_pd(v2+4);
				__m256d i3 = _mm256_load_pd(v1+8);
				__m256d j3 = _mm256_load_pd(v2+8);
				__m256d i4 = _mm256_load_pd(v1+12);
				__m256d j4 = _mm256_load_pd(v2+12);
				/* Multiply by lambda */
					   j1 = _mm256_mul_pd(j1, l1);
					   j2 = _mm256_mul_pd(j2, l2);
					   j3 = _mm256_mul_pd(j3, l3);
					   j4 = _mm256_mul_pd(j4, l4);
				/* Add */
				i1 = _mm256_add_pd(i1,j1);
				i2 = _mm256_add_pd(i2,j2);
				i3 = _mm256_add_pd(i3,j3);
				i4 = _mm256_add_pd(i4,j4);
				/* Store */
				_mm256_store_pd(v1, i1);
				_mm256_store_pd(v1+4, i2);
				_mm256_store_pd(v1+8, i3);
				_mm256_store_pd(v1+12, i4);
				v1 += 16;
				v2 += 16;
			}
		}
		else {		
			for (k=0;k<q;k++) {
				/* Load 16 values (four AVX vectors) from each array */
				__m256d i1 = _mm256_loadu_pd(v1);
				__m256d j1 = _mm256_loadu_pd(v2);
				__m256d i2 = _mm256_loadu_pd(v1+4);
				__m256d j2 = _mm256_loadu_pd(v2+4);
				__m256d i3 = _mm256_loadu_pd(v1+8);
				__m256d j3 = _mm256_loadu_pd(v2+8);
				__m256d i4 = _mm256_loadu_pd(v1+12);
				__m256d j4 = _mm256_loadu_pd(v2+12);
				/* Multiply by lambda */
					   j1 = _mm256_mul_pd(j1, l1);
					   j2 = _mm256_mul_pd(j2, l2);
					   j3 = _mm256_mul_pd(j3, l3);
					   j4 = _mm256_mul_pd(j4, l4);
				/* Add */
				i1 = _mm256_add_pd(i1,j1);
				i2 = _mm256_add_pd(i2,j2);
				i3 = _mm256_add_pd(i3,j3);
				i4 = _mm256_add_pd(i4,j4);
				/* Store */
				_mm256_storeu_pd(v1, i1);
				_mm256_storeu_pd(v1+4, i2);
				_mm256_storeu_pd(v1+8, i3);
				_mm256_storeu_pd(v1+12, i4);
				v1 += 16;
				v2 += 16;
			}
		}
	}
	
	for(k = 0 ; k<r ; k++)
		v1[k] += lambda*v2[k];
}
Example #22
 /*!
  * \brief Load a packed vector from the given aligned memory location
  */
 ETL_STATIC_INLINE(avx_simd_complex_double<etl::complex<double>>) load(const etl::complex<double>* memory) {
     return _mm256_load_pd(reinterpret_cast<const double*>(memory));
 }
double bst_compute_129_m256_maskstore_root_aligned( void*_bst_obj, double* p, double* q, size_t nn ) {
    segments_t* mem = (segments_t*) _bst_obj;
    int n, i, r, l_end, j, l_end_pre;
    double t, e_tmp;
    double* e = mem->e, *w = mem->w;
    int* root = mem->r;
    __m256d v_tmp;
    __m256d v00, v01, v02, v03;
    __m256d v10, v11, v12, v13;
    __m256d v20, v21, v22, v23;
    __m256d v30, v31, v32, v33;
    __m256i v_cur_roots;
    __m256 v_rootmask0, v_rootmask1;
    // initialization
    // mem->n = nn;
    n = nn; // keep a signed copy: subtractions with n can go negative, which size_t arithmetic would turn into bugs

    int idx1, idx1_root;
    int idx2;
    int idx3, idx3_root;
    int pad_root, pad, pad_r;
    
    idx1      = ((int) mem->e_sz) - 1;
    idx1_root = ((int) mem->r_sz);
    // the convention is that at iteration i, idx1 points to the first element of line i+1
    e[idx1++] = q[n];
    
    // pad contains the padding for row i+1
    // for row n it's always 3
    pad = 3;
    pad_root = 7;
    for (i = n-1; i >= 0; --i) {
        idx1      -= 2*(n-i)+1 + pad;
        idx1_root -= 2*(n-i)+1 + pad_root;
        idx2       = idx1 + 1;
        e[idx1]    = q[i];
        w[idx1]    = q[i];
        for (j = i+1; j < n+1; ++j,++idx2) {
            e[idx2] = INFINITY;
            w[idx2] = w[idx2-1] + p[j-1] + q[j];
        }
        idx2     += pad; // padding of line i+1
        // idx2 now points to the first element of the next line

        idx3      = idx1;
        idx3_root = idx1_root;
        pad_r     = pad;
        for (r = i; r < n; ++r) {
            pad_r     = (pad_r+1)&3; // padding of line r+1
            idx1      = idx3;
            idx1_root = idx3_root;
            l_end     = idx2 + (n-r);
            // l_end points to the first entry after the current row
            e_tmp     = e[idx1++];
            idx1_root++;
            // run scalar iterations until a multiple of 16 doubles is left
            // 16 = 4 256-bit vectors of 4 doubles each
            l_end_pre = idx2 + ((n-r)&15);
            for( ; (idx2 < l_end_pre) && (idx2 < l_end); ++idx2 ) {
                t = e_tmp + e[idx2] + w[idx1];
                if (t < e[idx1]) {
                    e[idx1] = t;
                    root[idx1_root] = r;
                }
                idx1++;
                idx1_root++;
            }
            
            v_tmp = _mm256_set_pd( e_tmp, e_tmp, e_tmp, e_tmp );
            // process the remainder 16 doubles at a time (4 vectors of 4)
            v_cur_roots = _mm256_set_epi32(r, r, r, r, r, r, r, r);
            for( ; idx2 < l_end; idx2 += 16 ) {
                v01 = _mm256_load_pd( &w[idx1   ] );
                v11 = _mm256_load_pd( &w[idx1+ 4] );
                v21 = _mm256_load_pd( &w[idx1+ 8] );
                v31 = _mm256_load_pd( &w[idx1+12] );

                v00 = _mm256_load_pd( &e[idx2   ] );
                v01 = _mm256_add_pd( v01, v_tmp ); 
                v10 = _mm256_load_pd( &e[idx2+ 4] );
                v11 = _mm256_add_pd( v11, v_tmp );
                v20 = _mm256_load_pd( &e[idx2+ 8] );
                v21 = _mm256_add_pd( v21, v_tmp );
                v30 = _mm256_load_pd( &e[idx2+12] );
                v31 = _mm256_add_pd( v31, v_tmp );

                v01 = _mm256_add_pd( v01, v00 );
                v03 = _mm256_load_pd( &e[idx1   ] );
                v11 = _mm256_add_pd( v11, v10 );
                v13 = _mm256_load_pd( &e[idx1+ 4] );
                v21 = _mm256_add_pd( v21, v20 );
                v23 = _mm256_load_pd( &e[idx1+ 8] );
                v31 = _mm256_add_pd( v31, v30 );
                v33 = _mm256_load_pd( &e[idx1+12] );

                v02 = _mm256_cmp_pd( v01, v03, _CMP_LT_OQ );
                v12 = _mm256_cmp_pd( v11, v13, _CMP_LT_OQ );
                v22 = _mm256_cmp_pd( v21, v23, _CMP_LT_OQ );
                v32 = _mm256_cmp_pd( v31, v33, _CMP_LT_OQ );

                _mm256_maskstore_pd( &e[idx1   ],
                        _mm256_castpd_si256( v02 ), v01 );
                _mm256_maskstore_pd( &e[idx1+ 4],
                        _mm256_castpd_si256( v12 ), v11 );

                v_rootmask0 = _mm256_insertf128_ps(
                        _mm256_castps128_ps256(
                            _mm256_cvtpd_ps(v02)),
                            _mm256_cvtpd_ps(v12) , 1
                    );

                _mm256_maskstore_pd( &e[idx1+ 8],
                        _mm256_castpd_si256( v22 ), v21 );
                _mm256_maskstore_pd( &e[idx1+12], 
                        _mm256_castpd_si256( v32 ), v31 );
                v_rootmask1 = _mm256_insertf128_ps(
                        _mm256_castps128_ps256(
                            _mm256_cvtpd_ps(v22)),
                            _mm256_cvtpd_ps(v32) , 1
                    );
                
                _mm256_maskstore_ps( (float *) &root[idx1_root    ],
                        _mm256_castps_si256( v_rootmask0 ),
                        _mm256_castsi256_ps( v_cur_roots ) );
                _mm256_maskstore_ps( (float *) &root[idx1_root + 8],
                        _mm256_castps_si256( v_rootmask1 ),
                        _mm256_castsi256_ps( v_cur_roots ) );
                idx1      += 16;
                idx1_root += 16;
            }
            idx2 += pad_r;
            idx3++;
            idx3_root++;
        }
        pad      = (pad     -1)&3;
        pad_root = (pad_root-1)&7;
    }
    // the index of the last item of the first row is ((n/4)+1)*4-1, due to the padding
    // if n is even, the total number of entries in the first
    // row of the table is odd, so we need padding
    return e[ ((n/4)+1)*4 - 1 ];
}
int main(void)
{
   // std::cout<<std::endl<<" Compute inner product..."<<std::endl<<std::endl;

    // INIT VECTOR
    //double vec1    [_PBM_SIZE] __attribute__((aligned(_CBSIM_DBL_ALIGN_)));//__declspec(align(n))
    //double vec2    [_PBM_SIZE] __attribute__((aligned(_CBSIM_DBL_ALIGN_)));
     //__declspec(align(_CBSIM_DBL_ALIGN_)) double vec1    [_PBM_SIZE];
     //__declspec(align(_CBSIM_DBL_ALIGN_)) double vec2    [_PBM_SIZE];

    //double *vec1 = _aligned_malloc(_PBM_SIZE*sizeof *vec1,_CBSIM_DBL_ALIGN_);
    //double *vec2 = _aligned_malloc(_PBM_SIZE*sizeof *vec2,_CBSIM_DBL_ALIGN_);


	double *vec1 =(double *)_mm_malloc(sizeof(double)*_PBM_SIZE,32);
    double *vec2 =(double *)_mm_malloc(sizeof(double)*_PBM_SIZE,32);

    double result = 0.0;
//    tbb::tick_count t1, t2;
    int loopsToDo = 10000;
    for (int i=0 ; i < _PBM_SIZE ; i++)
    {
        vec1[i] = static_cast<double>(i)*0.01;
        vec2[i] = static_cast<double>(i)*0.01;
    }


// SERIAL ***********************************************************************************
//    t1 = tbb::tick_count::now();


    for (int z=0 ; z < loopsToDo ; z++)
    {
    //__m256d ymm0;
    //__m256d ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7;//, ymm8, ymm9, ymm10, ymm11, ymm12, ymm13, ymm14, ymm15, ymm16, ymm17, ymm18;
    //ymm0 = _mm256_setzero_pd(); // accumulator
    //double res0 = 0.0, res1 = 0.0, res2 = 0.0, res3 = 0.0;
    //__m256d acc = _mm256_setzero_pd();
    //double res[4] __attribute__((aligned(_CBSIM_DBL_ALIGN_))) = {0.0, 0.0, 0.0, 0.0};
        result = 0.0;
    //double res[2] __attribute__((aligned(_CBSIM_DBL_ALIGN_))) = {0.0, 0.0};
    for (int i=0 ; i < _PBM_SIZE; i+=8)
    {
/*        __m256d ymm1 = _mm256_load_pd(&vec1[i]);
        __m256d ymm2 = _mm256_load_pd(&vec2[i]);
        __m256d ymm3 = _mm256_mul_pd( ymm1, ymm2 );

        __m128d xmm1 = _mm256_extractf128_pd(ymm3,0);
        __m128d xmm2 = _mm256_extractf128_pd(ymm3,1);
        __m128d xmm3 = _mm_hadd_pd(xmm1,xmm2);

        _mm_store_pd(&res[0],xmm3);

        //_mm256_store_pd(&res[0],ymm12);
        result += (res[0] + res[1]);// + (res[2] + res[3]);
*/
        __assume_aligned(&vec1[0],32);
        __assume_aligned(&vec2[0],32);
       __m256d ymm1 = _mm256_load_pd(&vec1[i]);
        __m256d ymm2 = _mm256_load_pd(&vec2[i]);
        __m256d ymm3 = _mm256_mul_pd( ymm1, ymm2 );

        __m256d ymm4 = _mm256_load_pd(&vec1[i+4]);
        __m256d ymm5 = _mm256_load_pd(&vec2[i+4]);
        __m256d ymm6 = _mm256_mul_pd( ymm4, ymm5 );

        __m256d ymm7 = _mm256_add_pd( ymm3, ymm6);
        
        __m128d xmm1 = _mm256_extractf128_pd(ymm7,0);
        __m128d xmm2 = _mm256_extractf128_pd(ymm7,1);
        __m128d xmm3 = _mm_hadd_pd(xmm1,xmm2);
        double res[2] __attribute__((aligned(_CBSIM_DBL_ALIGN_))) = {0.0, 0.0};
        _mm_store_pd(&res[0],xmm3);

        //_mm256_store_pd(&res[0],ymm12);
        result += (res[0] + res[1]);// + (res[2] + res[3]);

        //__m256d ymm0 = _mm256_add_pd( ymm0, ymm7);

/*        //__assume_aligned(&vec1[0],32);
        //__assume_aligned(&vec2[0],32);
        __m256d ymm1 = _mm256_load_pd(&vec1[i]);
        __m256d ymm2 = _mm256_load_pd(&vec2[i]);
        __m256d ymm3 = _mm256_mul_pd( ymm1, ymm2 );

        __m256d ymm4 = _mm256_load_pd(&vec1[i+4]);
        __m256d ymm5 = _mm256_load_pd(&vec2[i+4]);
        //__m256d ymm6 = _mm256_mul_pd( ymm4, ymm5 );

        //__m256d ymm7 = _mm256_add_pd( ymm3, ymm6);
        __m256d ymm6 = _mm256_fmadd_pd (ymm4,ymm5,ymm3);
        //ymm0 = _mm256_add_pd( ymm0, ymm7);


        __m128d xmm1 = _mm256_extractf128_pd(ymm6,0);
        __m128d xmm2 = _mm256_extractf128_pd(ymm6,1);;
        __m128d xmm3 = _mm_hadd_pd(xmm1,xmm2);

        _mm_store_pd(&res[0],xmm3);

        //_mm256_store_pd(&res[0],ymm12);
        result += (res[0] + res[1]);// + (res[2] + res[3]);

        //_mm256_store_pd(&res[0],ymm6);
        //result_SIMD_INTRINSICS += (res[0] + res[1]) + (res[2] + res[3]);
*/

//#define _VER_AVX
#ifdef _VER_AVX
        __m256d ymm1 = _mm256_load_pd(&vec1[i]);
        __m256d ymm2 = _mm256_load_pd(&vec2[i]);
        __m256d ymm3 = _mm256_mul_pd( ymm1, ymm2 );

        __m256d ymm4 = _mm256_load_pd(&vec1[i+4]);
        __m256d ymm5 = _mm256_load_pd(&vec2[i+4]);
        __m256d ymm6 = _mm256_mul_pd( ymm4, ymm5 );

        __m256d ymm7 = _mm256_load_pd(&vec1[i+8]);
        __m256d ymm8 = _mm256_load_pd(&vec2[i+8]);
        __m256d ymm9 = _mm256_mul_pd( ymm7, ymm8 );

        __m256d ymm10 = _mm256_load_pd(&vec1[i+12]);
        __m256d ymm11 = _mm256_load_pd(&vec2[i+12]);
        __m256d ymm12 = _mm256_mul_pd( ymm10, ymm11 );

        __m256d ymm13 = _mm256_add_pd( ymm3, ymm6);
        __m256d ymm14 = _mm256_add_pd( ymm9, ymm12);
        __m256d ymm15 = _mm256_add_pd( ymm13, ymm14);

        __m128d xmm1 = _mm256_extractf128_pd(ymm15,0);
        __m128d xmm2 = _mm256_extractf128_pd(ymm15,1);
        __m128d xmm3 = _mm_hadd_pd(xmm1,xmm2);
        double res_SIMD_INTRINSICS[2] __attribute__((aligned(_CBSIM_DBL_ALIGN_))) = {0.0, 0.0};
        _mm_store_pd(&res_SIMD_INTRINSICS[0],xmm3);

        result += (res_SIMD_INTRINSICS[0] + res_SIMD_INTRINSICS[1]);

        //ymm0 = _mm256_add_pd( ymm0, ymm13);
        //ymm0 = _mm256_add_pd( ymm0, ymm14);
#endif
//#define _VER_AVX2
#ifdef _VER_AVX2
        __m256d ymm1 = _mm256_load_pd(&vec1[i]);
        __m256d ymm2 = _mm256_load_pd(&vec1[i+4]);
        __m256d ymm3 = _mm256_load_pd(&vec1[i+8]);
        __m256d ymm4 = _mm256_load_pd(&vec1[i+12]);
        //__m256d ymm13 = _mm256_load_pd(&vec1[i+16]);
        //__m256d ymm14 = _mm256_load_pd(&vec1[i+20]);
        //__m256d ymm15 = _mm256_load_pd(&vec1[i+24]);
        //__m256d ymm16 = _mm256_load_pd(&vec1[i+28]);

        __m256d ymm5 = _mm256_load_pd(&vec2[i]);
        __m256d ymm6 = _mm256_load_pd(&vec2[i+4]);
        __m256d ymm7 = _mm256_load_pd(&vec2[i+8]);
        __m256d ymm8 = _mm256_load_pd(&vec2[i+12]);
        //__m256d ymm17 = _mm256_load_pd(&vec2[i+16]);
        //__m256d ymm18 = _mm256_load_pd(&vec2[i+20]);
        //__m256d ymm19 = _mm256_load_pd(&vec2[i+24]);
        //__m256d ymm20 = _mm256_load_pd(&vec2[i+28]);

        __m256d ymm9 = _mm256_mul_pd(ymm1,ymm5);
        __m256d ymm10 = _mm256_fmadd_pd(ymm2,ymm6,ymm9);
        //__m256d ymm11 = _mm256_mul_pd(ymm3,ymm7);
        __m256d ymm11 = _mm256_fmadd_pd(ymm3,ymm7,ymm10);
        __m256d ymm12 = _mm256_fmadd_pd(ymm4,ymm8,ymm11);
        //ymm12 = _mm256_hadd_pd(ymm10,ymm12);
        //__m256d ymm21 = _mm256_fmadd_pd(ymm13,ymm17,ymm12);
        //__m256d ymm22 = _mm256_fmadd_pd(ymm14,ymm18,ymm21);
        //__m256d ymm23 = _mm256_fmadd_pd(ymm15,ymm19,ymm22);
        //__m256d ymm24 = _mm256_fmadd_pd(ymm16,ymm20,ymm23);
        __m128d xmm1 = _mm256_extractf128_pd(ymm12,0);
        __m128d xmm2 = _mm256_extractf128_pd(ymm12,1);
        __m128d xmm3 = _mm_hadd_pd(xmm1,xmm2);
        double res[2] __attribute__((aligned(_CBSIM_DBL_ALIGN_))) = {0.0, 0.0};
        _mm_store_pd(&res[0],xmm3);

        //_mm256_store_pd(&res[0],ymm12);
        result += (res[0] + res[1]);// + (res[2] + res[3]);
#endif

/*        // Performing 4 dot product at one time
        ymm1  = _mm256_load_pd(&vec1[i]);    // x[0]
        ymm2  = _mm256_load_pd(&vec1[i+4]);  // x[1]
        ymm3  = _mm256_load_pd(&vec1[i+8]);  // x[2]
        ymm4  = _mm256_load_pd(&vec1[i+12]); // x[3]

        ymm5  = _mm256_load_pd(&vec2[i]);    // y[0]
        ymm6  = _mm256_load_pd(&vec2[i+4]);  // y[1]
        ymm7  = _mm256_load_pd(&vec2[i+8]);  // y[2]
        ymm8  = _mm256_load_pd(&vec2[i+12]); // y[3]

        ymm9  = _mm256_mul_pd( ymm1, ymm5 );   // xy0
        ymm10 = _mm256_mul_pd( ymm2, ymm6 );   // xy1
        ymm11 = _mm256_hadd_pd( ymm9, ymm10 ); // low to high: xy00+xy01 xy10+xy11 xy02+xy03 xy12+xy13

        ymm12 = _mm256_mul_pd( ymm3, ymm7 ); // xy2
        ymm13 = _mm256_mul_pd( ymm4, ymm8 ); // xy3
        ymm14 = _mm256_hadd_pd( ymm12, ymm13 );  // low to high: xy20+xy21 xy30+xy31 xy22+xy23 xy32+xy33

        ymm15 = _mm256_permute2f128_pd( ymm11, ymm14, 0x21 ); // low to high: xy02+xy03 xy12+xy13 xy20+xy21 xy30+xy31

        ymm1  = _mm256_blend_pd( ymm11, ymm14, 0b1100); // low to high: xy00+xy01 xy10+xy11 xy22+xy23 xy32+xy33

        ymm2 = _mm256_add_pd( ymm15, ymm1 );

        ymm0 = _mm256_add_pd( ymm0, ymm2 );
*/

/*        __m256d x[4], y[4];
        x[0] = _mm256_load_pd(&vec1[i]);
        x[1] = _mm256_load_pd(&vec1[i+4]);
        x[2] = _mm256_load_pd(&vec1[i+8]);
        x[3] = _mm256_load_pd(&vec1[i+12]);
        y[0] = _mm256_load_pd(&vec2[i]);
        y[1] = _mm256_load_pd(&vec2[i+4]);
        y[2] = _mm256_load_pd(&vec2[i+8]);
        y[3] = _mm256_load_pd(&vec2[i+12]);

        __m256d xy0 = _mm256_mul_pd( x[0], y[0] );
        __m256d xy1 = _mm256_mul_pd( x[1], y[1] );
        // low to high: xy00+xy01 xy10+xy11 xy02+xy03 xy12+xy13
        __m256d temp01 = _mm256_hadd_pd( xy0, xy1 );

        __m256d xy2 = _mm256_mul_pd( x[2], y[2] );
        __m256d xy3 = _mm256_mul_pd( x[3], y[3] );
        // low to high: xy20+xy21 xy30+xy31 xy22+xy23 xy32+xy33
        __m256d temp23 = _mm256_hadd_pd( xy2, xy3 );

        // low to high: xy02+xy03 xy12+xy13 xy20+xy21 xy30+xy31
        __m256d swapped = _mm256_permute2f128_pd( temp01, temp23, 0x21 );

        // low to high: xy00+xy01 xy10+xy11 xy22+xy23 xy32+xy33
        __m256d blended = _mm256_blend_pd(temp01, temp23, 0b1100);

        __m256d dotproduct = _mm256_add_pd( swapped, blended );
*/
        //ymm0 = _mm256_add_pd(ymm0,dotproduct);

/*        __m128d xmm1 = _mm256_extractf128_pd(dotproduct,0);
        __m128d xmm2 = _mm256_extractf128_pd(dotproduct,1);;
        __m128d xmm3 = _mm_hadd_pd(xmm1,xmm2);
        double res[2] __attribute__((aligned(_CBSIM_DBL_ALIGN_))) = {0.0, 0.0};
        _mm_store_pd(&res[0],xmm3);

        //_mm256_store_pd(&res[0],ymm12);
        result += (res[0] + res[1]);// + (res[2] + res[3]);
*/

//        _mm256_store_pd(&res[0],dotproduct);
//        result += (res[0] + res[1]) + (res[2] + res[3]);

        //result_SIMD_INTRINSICS += dotproduct[0] + dotproduct[1] + dotproduct[2] + dotproduct[3];

        //double res[4] __attribute__((aligned(_CBSIM_DBL_ALIGN_)));
        //_mm256_store_pd(&res[0],ymm0);
        //result_SIMD_INTRINSICS += res[0] + res[1] + res[2] + res[3];
        //double* res = (double*)&ymm0;
        //result_SIMD_INTRINSICS += res[0] + res[1] + res[2] + res[3];
    }
    //double* res = (double*)&ymm0;
    //result_SIMD_INTRINSICS += res[0] + res[1] + res[2] + res[3];
    //double res[4] __attribute__((aligned(_CBSIM_DBL_ALIGN_)));
    //_mm256_store_pd(&res[0],ymm0);
    //result_SIMD_INTRINSICS += res[0] + res[1] + res[2] + res[3];
}


//    t2 = tbb::tick_count::now();
//    double exec_time = 1000.0*(t2-t1).seconds();


    //std::cout << std::setiosflags(std::ios::fixed) << std::setprecision(5);
    std::cout<<std::endl<<"RESULTS: " <<std::endl;
    std::cout<<"result_intrin ----------: "<< result << std::endl;
    //std::cout<<"result_intrin ----------: "<< result << ", time: " << 1000.0*(t2-t1).seconds() << " ms" << std::endl;

    std::cout<<std::endl<<"Program end. "<<std::endl<<std::endl;
    return 0;
}
Example #25
void ks_tanh_int_d8x4(
    int    k,
    int    rhs,
    double *h,  // NOP
    double *u,
    double *aa, // NOP
    double *a,
    double *bb, // NOP
    double *b,
    double *w,
    double *c,
    ks_t   *ker,
    aux_t  *aux
    )
{
  int    i, rhs_left;
  double scal = ker->scal;
  double cons = ker->cons;


  v4df_t    c03_0,    c03_1,    c03_2,    c03_3;
  v4df_t    c47_0,    c47_1,    c47_2,    c47_3;
  v4df_t tmpc03_0, tmpc03_1, tmpc03_2, tmpc03_3;
  v4df_t tmpc47_0, tmpc47_1, tmpc47_2, tmpc47_3;
  v4df_t u03, u47;
  v4df_t a03, a47, A03, A47; // prefetched A 
  v4df_t b0, b1, b2, b3, B0; // prefetched B
  v4df_t c_tmp, aa_tmp, bb_tmp, w_tmp;


  // Rank-k update segment
  #include "ks_rank_k_int_d8x4.h"


  // Accumulate
  if ( aux->pc ) {
    tmpc03_0.v = _mm256_load_pd( (double*)( c      ) );
    c03_0.v    = _mm256_add_pd( tmpc03_0.v, c03_0.v );
    tmpc47_0.v = _mm256_load_pd( (double*)( c + 4  ) );
    c47_0.v    = _mm256_add_pd( tmpc47_0.v, c47_0.v );
    tmpc03_1.v = _mm256_load_pd( (double*)( c + 8  ) );
    c03_1.v    = _mm256_add_pd( tmpc03_1.v, c03_1.v );
    tmpc47_1.v = _mm256_load_pd( (double*)( c + 12 ) );
    c47_1.v    = _mm256_add_pd( tmpc47_1.v, c47_1.v );
    tmpc03_2.v = _mm256_load_pd( (double*)( c + 16 ) );
    c03_2.v    = _mm256_add_pd( tmpc03_2.v, c03_2.v );
    tmpc47_2.v = _mm256_load_pd( (double*)( c + 20 ) );
    c47_2.v    = _mm256_add_pd( tmpc47_2.v, c47_2.v );
    tmpc03_3.v = _mm256_load_pd( (double*)( c + 24 ) );
    c03_3.v    = _mm256_add_pd( tmpc03_3.v, c03_3.v );
    tmpc47_3.v = _mm256_load_pd( (double*)( c + 28 ) );
    c47_3.v    = _mm256_add_pd( tmpc47_3.v, c47_3.v );
  }


  // Scale before the kernel evaluation
  c_tmp.v  = _mm256_broadcast_sd( &scal );
  c03_0.v  = _mm256_mul_pd( c_tmp.v, c03_0.v );
  c03_1.v  = _mm256_mul_pd( c_tmp.v, c03_1.v );
  c03_2.v  = _mm256_mul_pd( c_tmp.v, c03_2.v );
  c03_3.v  = _mm256_mul_pd( c_tmp.v, c03_3.v );
  c47_0.v  = _mm256_mul_pd( c_tmp.v, c47_0.v );
  c47_1.v  = _mm256_mul_pd( c_tmp.v, c47_1.v );
  c47_2.v  = _mm256_mul_pd( c_tmp.v, c47_2.v );
  c47_3.v  = _mm256_mul_pd( c_tmp.v, c47_3.v );


  // Shift before the kernel evaluation
  c_tmp.v  = _mm256_broadcast_sd( &cons );
  c03_0.v  = _mm256_add_pd( c_tmp.v, c03_0.v );
  c03_1.v  = _mm256_add_pd( c_tmp.v, c03_1.v );
  c03_2.v  = _mm256_add_pd( c_tmp.v, c03_2.v );
  c03_3.v  = _mm256_add_pd( c_tmp.v, c03_3.v );
  c47_0.v  = _mm256_add_pd( c_tmp.v, c47_0.v );
  c47_1.v  = _mm256_add_pd( c_tmp.v, c47_1.v );
  c47_2.v  = _mm256_add_pd( c_tmp.v, c47_2.v );
  c47_3.v  = _mm256_add_pd( c_tmp.v, c47_3.v );


  // Preload u03, u47
  u03.v    = _mm256_load_pd( (double*)u );
  u47.v    = _mm256_load_pd( (double*)( u + 4 ) );


  // Prefetch u and w
  __asm__ volatile( "prefetcht0 0(%0)    \n\t" : :"r"( u + 8 ) );
  __asm__ volatile( "prefetcht0 0(%0)    \n\t" : :"r"( w ) );


  // c = tanh( c );
  c03_0.v  = _mm256_tanh_pd( c03_0.v );
  c03_1.v  = _mm256_tanh_pd( c03_1.v );
  c03_2.v  = _mm256_tanh_pd( c03_2.v );
  c03_3.v  = _mm256_tanh_pd( c03_3.v );
  c47_0.v  = _mm256_tanh_pd( c47_0.v );
  c47_1.v  = _mm256_tanh_pd( c47_1.v );
  c47_2.v  = _mm256_tanh_pd( c47_2.v );
  c47_3.v  = _mm256_tanh_pd( c47_3.v );
  
  
  // Multiple rhs kernel summation.
  #include "ks_kernel_summation_int_d8x4.h"

}
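
Between the two included micro-kernel headers, the visible part of this routine applies the same elementwise map to every entry of the 8x4 accumulator. A scalar sketch of that step (assumed semantics, not the library's reference code):

#include <math.h>

/* Sketch: the accumulator entries produced by the rank-k update are
 * mapped through c = tanh(scal * c + cons) before the weighted summation. */
static void tanh_kernel_ref(double *c, int n, double scal, double cons)
{
    for (int i = 0; i < n; i++)
        c[i] = tanh(scal * c[i] + cons);
}
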
Exemple #26
0
 inline vector4d& vector4d::load_aligned(const double* src)
 {
     m_value = _mm256_load_pd(src);
     return *this;
 }
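
load_aligned wraps _mm256_load_pd, which requires a 32-byte aligned source pointer; unaligned data would have to go through _mm256_loadu_pd instead. A minimal caller sketch (hypothetical buffer, C11 aligned_alloc):

#include <immintrin.h>
#include <stdlib.h>

int main(void)
{
    /* aligned_alloc: the size must be a multiple of the alignment. */
    double *src = aligned_alloc(32, 4 * sizeof(double));
    if (!src) return 1;
    src[0] = src[1] = src[2] = src[3] = 1.0;
    __m256d v = _mm256_load_pd(src);   /* safe: src is 32-byte aligned */
    (void)v;
    free(src);
    return 0;
}
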
Exemple #27
0
void gaussian_int_d8x6(
    int    k,
    int    rhs,
    //double *h,
    double *u,
    double *aa,
    double *a,
    double *bb,
    double *b,
    double *w,
    double *c,
    ks_t   *ker,
    aux_t  *aux
    )
{
  int    i;
  double alpha = ker->scal;
  // 16 registers.
  v4df_t c03_0, c03_1, c03_2, c03_3, c03_4, c03_5;
  v4df_t c47_0, c47_1, c47_2, c47_3, c47_4, c47_5;
  v4df_t a03, a47, b0, b1;

  #include <rank_k_int_d8x6.h>
  #include <sq2nrm_int_d8x6.h>

  // Scale before the kernel evaluation
  a03.v   = _mm256_broadcast_sd( &alpha );
  c03_0.v = _mm256_mul_pd( a03.v, c03_0.v );
  c03_1.v = _mm256_mul_pd( a03.v, c03_1.v );
  c03_2.v = _mm256_mul_pd( a03.v, c03_2.v );
  c03_3.v = _mm256_mul_pd( a03.v, c03_3.v );
  c03_4.v = _mm256_mul_pd( a03.v, c03_4.v );
  c03_5.v = _mm256_mul_pd( a03.v, c03_5.v );

  c47_0.v = _mm256_mul_pd( a03.v, c47_0.v );
  c47_1.v = _mm256_mul_pd( a03.v, c47_1.v );
  c47_2.v = _mm256_mul_pd( a03.v, c47_2.v );
  c47_3.v = _mm256_mul_pd( a03.v, c47_3.v );
  c47_4.v = _mm256_mul_pd( a03.v, c47_4.v );
  c47_5.v = _mm256_mul_pd( a03.v, c47_5.v );

  // Prefetch u, w
  __asm__ volatile( "prefetcht0 0(%0)    \n\t" : :"r"( u ) );
  __asm__ volatile( "prefetcht0 0(%0)    \n\t" : :"r"( w ) );

  // c = exp( c )
  c03_0.v = _mm256_exp_pd( c03_0.v );
  c03_1.v = _mm256_exp_pd( c03_1.v );
  c03_2.v = _mm256_exp_pd( c03_2.v );
  c03_3.v = _mm256_exp_pd( c03_3.v );
  c03_4.v = _mm256_exp_pd( c03_4.v );
  c03_5.v = _mm256_exp_pd( c03_5.v );

  c47_0.v = _mm256_exp_pd( c47_0.v );
  c47_1.v = _mm256_exp_pd( c47_1.v );
  c47_2.v = _mm256_exp_pd( c47_2.v );
  c47_3.v = _mm256_exp_pd( c47_3.v );
  c47_4.v = _mm256_exp_pd( c47_4.v );
  c47_5.v = _mm256_exp_pd( c47_5.v );

  // Preload u03, u47 (reusing the a03/a47 registers to hold them)
  a03.v    = _mm256_load_pd( (double*)  u       );
  a47.v    = _mm256_load_pd( (double*)( u + 4 ) );

  // Multiple rhs weighted sum.
  #include<weighted_sum_int_d8x6.h>

  //if ( u[ 0 ] != u[ 0 ] ) printf( "u[ 0 ] nan\n" );
  //if ( u[ 1 ] != u[ 1 ] ) printf( "u[ 1 ] nan\n" );
  //if ( u[ 2 ] != u[ 2 ] ) printf( "u[ 2 ] nan\n" );
  //if ( u[ 3 ] != u[ 3 ] ) printf( "u[ 3 ] nan\n" );
  //if ( u[ 4 ] != u[ 4 ] ) printf( "u[ 4 ] nan\n" );
  //if ( u[ 5 ] != u[ 5 ] ) printf( "u[ 5 ] nan\n" );
  //if ( u[ 6 ] != u[ 6 ] ) printf( "u[ 6 ] nan\n" );
  //if ( u[ 7 ] != u[ 7 ] ) printf( "u[ 7 ] nan\n" );

  //if ( w[ 0 ] != w[ 0 ] ) printf( "w[ 0 ] nan\n" );
  //if ( w[ 1 ] != w[ 1 ] ) printf( "w[ 1 ] nan\n" );
  //if ( w[ 2 ] != w[ 2 ] ) printf( "w[ 2 ] nan\n" );
  //if ( w[ 3 ] != w[ 3 ] ) printf( "w[ 3 ] nan\n" );
  //if ( w[ 4 ] != w[ 4 ] ) printf( "w[ 4 ] nan\n" );
  //if ( w[ 5 ] != w[ 5 ] ) printf( "w[ 5 ] nan\n" );
}
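
Stripped of the included rank-k and squared-norm micro-kernels, the visible part of this routine maps each accumulator entry through exp(alpha * c), i.e. a Gaussian kernel value, before the weighted sum over the right-hand sides. A scalar sketch of that elementwise step (assumed semantics):

#include <math.h>

/* Sketch: scale the accumulator by alpha and exponentiate. */
static void gaussian_kernel_ref(double *c, int n, double alpha)
{
    for (int i = 0; i < n; i++)
        c[i] = exp(alpha * c[i]);
}
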
Exemple #28
0
// the kernel moves horizontally inside a block: A is stored in 4-row panels (lda = 4), with A1 = A0 + 4*sda pointing to the second panel
void kernel_dtrmv_u_n_8_lib4(int kmax, double *A0, int sda, double *x, double *y, int alg)
	{

	if(kmax<=0) 
		return;
	
	double *A1 = A0 + 4*sda;

	const int lda = 4;
	
	int k;

	__m128d
		tmp0,
		z_0, y_0_1, a_00_10;

	__m256d
		zeros,
		ax_temp,
		a_00_10_20_30, a_01_11_21_31,
		a_40_50_60_70, a_41_51_61_71,
		x_0, x_1,
		y_0_1_2_3, y_0_1_2_3_b, z_0_1_2_3,
		y_4_5_6_7, y_4_5_6_7_b, z_4_5_6_7;
	
/*	y_0_1_2_3   = _mm256_setzero_pd();	*/
/*	y_4_5_6_7   = _mm256_setzero_pd();	*/
/*	y_0_1_2_3_b = _mm256_setzero_pd();	*/
/*	y_4_5_6_7_b = _mm256_setzero_pd();	*/
		
	zeros = _mm256_setzero_pd();
	
/*	y_0_1_2_3   = _mm256_setzero_pd();	*/
/*	y_0_1_2_3_b = _mm256_setzero_pd();	*/
/*	y_0_1_2_3_c = _mm256_setzero_pd();	*/
/*	y_0_1_2_3_d = _mm256_setzero_pd();*/
	
	// upper triangular

	// second col (avoid zero y_0_1)
	z_0     = _mm_loaddup_pd( &x[1] );
	a_00_10 = _mm_load_pd( &A0[0+lda*1] );
	y_0_1   = _mm_mul_pd( a_00_10, z_0 );

	// first col
	z_0     = _mm_load_sd( &x[0] );
	a_00_10 = _mm_load_sd( &A0[0+lda*0] );
	tmp0    = _mm_mul_sd( a_00_10, z_0 );
	y_0_1   = _mm_add_sd( y_0_1, tmp0 );
	y_0_1_2_3_b = _mm256_castpd128_pd256( y_0_1 );
	y_0_1_2_3_b = _mm256_blend_pd( y_0_1_2_3_b, zeros, 0xc ); // clear the two upper lanes left undefined by the 128->256 cast

	// fourth col (avoid zero y_0_1_2_3)
	x_1     = _mm256_broadcast_sd( &x[3] );
	a_01_11_21_31 = _mm256_load_pd( &A0[0+lda*3] );
	y_0_1_2_3 = _mm256_mul_pd( a_01_11_21_31, x_1 );

	// third col
	x_0     = _mm256_broadcast_sd( &x[2] );
	x_0     = _mm256_blend_pd( x_0, zeros, 0x8 );
	a_00_10_20_30 = _mm256_load_pd( &A0[0+lda*2] );
	ax_temp = _mm256_mul_pd( a_00_10_20_30, x_0 );
	y_0_1_2_3_b = _mm256_add_pd( y_0_1_2_3_b, ax_temp );


	A0 += 4*lda;
	A1 += 4*lda;
	x  += 4;


	// upper squared
	x_0 = _mm256_broadcast_sd( &x[0] );
	x_1 = _mm256_broadcast_sd( &x[1] );

	a_00_10_20_30 = _mm256_load_pd( &A0[0+lda*0] );
	a_01_11_21_31 = _mm256_load_pd( &A0[0+lda*1] );

	ax_temp = _mm256_mul_pd( a_00_10_20_30, x_0 );
	y_0_1_2_3 = _mm256_add_pd( y_0_1_2_3, ax_temp );
	ax_temp = _mm256_mul_pd( a_01_11_21_31, x_1 );
	y_0_1_2_3_b = _mm256_add_pd( y_0_1_2_3_b, ax_temp );

	x_0 = _mm256_broadcast_sd( &x[2] );
	x_1 = _mm256_broadcast_sd( &x[3] );

	a_00_10_20_30 = _mm256_load_pd( &A0[0+lda*2] );
	a_01_11_21_31 = _mm256_load_pd( &A0[0+lda*3] );

	ax_temp = _mm256_mul_pd( a_00_10_20_30, x_0 );
	y_0_1_2_3 = _mm256_add_pd( y_0_1_2_3, ax_temp );
	ax_temp = _mm256_mul_pd( a_01_11_21_31, x_1 );
	y_0_1_2_3_b = _mm256_add_pd( y_0_1_2_3_b, ax_temp );


	// lower triangular


	// second col (avoid zero y_0_1)
	z_0     = _mm_loaddup_pd( &x[1] );
	a_00_10 = _mm_load_pd( &A1[0+lda*1] );
	y_0_1   = _mm_mul_pd( a_00_10, z_0 );

	// first col
	z_0     = _mm_load_sd( &x[0] );
	a_00_10 = _mm_load_sd( &A1[0+lda*0] );
	tmp0    = _mm_mul_sd( a_00_10, z_0 );
	y_0_1   = _mm_add_sd( y_0_1, tmp0 );
	y_4_5_6_7_b = _mm256_castpd128_pd256( y_0_1 );
	y_4_5_6_7_b = _mm256_blend_pd( y_4_5_6_7_b, zeros, 0xc ); // clear the two upper lanes left undefined by the 128->256 cast

	// fourth col (avoid zero y_4_5_6_7)
	x_1     = _mm256_broadcast_sd( &x[3] );
	a_01_11_21_31 = _mm256_load_pd( &A1[0+lda*3] );
	y_4_5_6_7 = _mm256_mul_pd( a_01_11_21_31, x_1 );

	// third col
	x_0     = _mm256_broadcast_sd( &x[2] );
	x_0     = _mm256_blend_pd( x_0, zeros, 0x8 );
	a_00_10_20_30 = _mm256_load_pd( &A1[0+lda*2] );
	ax_temp = _mm256_mul_pd( a_00_10_20_30, x_0 );
	y_4_5_6_7_b = _mm256_add_pd( y_4_5_6_7_b, ax_temp );


	A0 += 4*lda;
	A1 += 4*lda;
	x  += 4;


	k=8;
	for(; k<kmax-3; k+=4)
		{

/*		__builtin_prefetch( A0 + 4*lda );*/
/*		__builtin_prefetch( A1 + 4*lda );*/

		x_0 = _mm256_broadcast_sd( &x[0] );
		x_1 = _mm256_broadcast_sd( &x[1] );

		a_00_10_20_30 = _mm256_load_pd( &A0[0+lda*0] );
		a_40_50_60_70 = _mm256_load_pd( &A1[0+lda*0] );
		a_01_11_21_31 = _mm256_load_pd( &A0[0+lda*1] );
		a_41_51_61_71 = _mm256_load_pd( &A1[0+lda*1] );

		ax_temp = _mm256_mul_pd( a_00_10_20_30, x_0 );
		y_0_1_2_3 = _mm256_add_pd( y_0_1_2_3, ax_temp );
		ax_temp = _mm256_mul_pd( a_40_50_60_70, x_0 );
		y_4_5_6_7 = _mm256_add_pd( y_4_5_6_7, ax_temp );
		ax_temp = _mm256_mul_pd( a_01_11_21_31, x_1 );
		y_0_1_2_3_b = _mm256_add_pd( y_0_1_2_3_b, ax_temp );
		ax_temp = _mm256_mul_pd( a_41_51_61_71, x_1 );
		y_4_5_6_7_b = _mm256_add_pd( y_4_5_6_7_b, ax_temp );

/*		__builtin_prefetch( A0 + 5*lda );*/
/*		__builtin_prefetch( A1 + 5*lda );*/

		x_0 = _mm256_broadcast_sd( &x[2] );
		x_1 = _mm256_broadcast_sd( &x[3] );

		a_00_10_20_30 = _mm256_load_pd( &A0[0+lda*2] );
		a_40_50_60_70 = _mm256_load_pd( &A1[0+lda*2] );
		a_01_11_21_31 = _mm256_load_pd( &A0[0+lda*3] );
		a_41_51_61_71 = _mm256_load_pd( &A1[0+lda*3] );

		ax_temp = _mm256_mul_pd( a_00_10_20_30, x_0 );
		y_0_1_2_3 = _mm256_add_pd( y_0_1_2_3, ax_temp );
		ax_temp = _mm256_mul_pd( a_40_50_60_70, x_0 );
		y_4_5_6_7 = _mm256_add_pd( y_4_5_6_7, ax_temp );
		ax_temp = _mm256_mul_pd( a_01_11_21_31, x_1 );
		y_0_1_2_3_b = _mm256_add_pd( y_0_1_2_3_b, ax_temp );
		ax_temp = _mm256_mul_pd( a_41_51_61_71, x_1 );
		y_4_5_6_7_b = _mm256_add_pd( y_4_5_6_7_b, ax_temp );
	
		A0 += 4*lda;
		A1 += 4*lda;
		x  += 4;

		}
		
	if(kmax%4>=2)
		{

		x_0 = _mm256_broadcast_sd( &x[0] );
		x_1 = _mm256_broadcast_sd( &x[1] );

		a_00_10_20_30 = _mm256_load_pd( &A0[0+lda*0] );
		a_40_50_60_70 = _mm256_load_pd( &A1[0+lda*0] );
		a_01_11_21_31 = _mm256_load_pd( &A0[0+lda*1] );
		a_41_51_61_71 = _mm256_load_pd( &A1[0+lda*1] );

		ax_temp = _mm256_mul_pd( a_00_10_20_30, x_0 );
		y_0_1_2_3 = _mm256_add_pd( y_0_1_2_3, ax_temp );
		ax_temp = _mm256_mul_pd( a_40_50_60_70, x_0 );
		y_4_5_6_7 = _mm256_add_pd( y_4_5_6_7, ax_temp );
		ax_temp = _mm256_mul_pd( a_01_11_21_31, x_1 );
		y_0_1_2_3_b = _mm256_add_pd( y_0_1_2_3_b, ax_temp );
		ax_temp = _mm256_mul_pd( a_41_51_61_71, x_1 );
		y_4_5_6_7_b = _mm256_add_pd( y_4_5_6_7_b, ax_temp );
		
		A0 += 2*lda;
		A1 += 2*lda;
		x  += 2;

		}
	
	y_0_1_2_3 = _mm256_add_pd( y_0_1_2_3, y_0_1_2_3_b );
	y_4_5_6_7 = _mm256_add_pd( y_4_5_6_7, y_4_5_6_7_b );

	if(kmax%2==1)
		{

		x_0 = _mm256_broadcast_sd( &x[0] );

		a_00_10_20_30 = _mm256_load_pd( &A0[0+lda*0] );
		a_40_50_60_70 = _mm256_load_pd( &A1[0+lda*0] );

		ax_temp = _mm256_mul_pd( a_00_10_20_30, x_0 );
		y_0_1_2_3 = _mm256_add_pd( y_0_1_2_3, ax_temp );
		ax_temp = _mm256_mul_pd( a_40_50_60_70, x_0 );
		y_4_5_6_7 = _mm256_add_pd( y_4_5_6_7, ax_temp );
		
/*		A0 += 1*lda;*/
/*		A1 += 1*lda;*/
/*		x  += 1;*/

		}

	if(alg==0)
		{
		_mm256_storeu_pd(&y[0], y_0_1_2_3);
		_mm256_storeu_pd(&y[4], y_4_5_6_7);
		}
	else if(alg==1)
		{
		z_0_1_2_3 = _mm256_loadu_pd( &y[0] );
		z_4_5_6_7 = _mm256_loadu_pd( &y[4] );

		z_0_1_2_3 = _mm256_add_pd( z_0_1_2_3, y_0_1_2_3 );
		z_4_5_6_7 = _mm256_add_pd( z_4_5_6_7, y_4_5_6_7 );

		_mm256_storeu_pd(&y[0], z_0_1_2_3);
		_mm256_storeu_pd(&y[4], z_4_5_6_7);
		}
	else // alg==-1
		{
		z_0_1_2_3 = _mm256_loadu_pd( &y[0] );
		z_4_5_6_7 = _mm256_loadu_pd( &y[4] );

		z_0_1_2_3 = _mm256_sub_pd( z_0_1_2_3, y_0_1_2_3 );
		z_4_5_6_7 = _mm256_sub_pd( z_4_5_6_7, y_4_5_6_7 );

		_mm256_storeu_pd(&y[0], z_0_1_2_3);
		_mm256_storeu_pd(&y[4], z_4_5_6_7);
		}

	}
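
A scalar reference may help when reading the panel arithmetic above: the kernel appears to compute z = A*x, where A is 8 x kmax with an upper-triangular leading 8x8 block read from two 4-row panels (A0 and A1, lda = 4), and then stores, adds or subtracts z into y depending on alg. The sketch below (assumed semantics, plain row-major storage, illustrative names) captures that contract:

/* Scalar sketch of kernel_dtrmv_u_n_8_lib4 under the assumptions above. */
static void dtrmv_u_n_8_ref(int kmax, const double *A, int lda_row,
                            const double *x, double *y, int alg)
{
    for (int i = 0; i < 8; i++) {
        double z = 0.0;
        for (int j = i; j < kmax; j++)   /* upper triangular: row i starts at column i */
            z += A[i * lda_row + j] * x[j];
        if (alg == 0)      y[i]  = z;
        else if (alg == 1) y[i] += z;
        else               y[i] -= z;    /* alg == -1 */
    }
}
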
Exemple #29
0
Color3 sampleFourier3(float * const coeffs[3], const double *recip, size_t nCoeffs,
        Float sample, Float &pdf, Float &phi) {
    bool flip = false;
    if (sample < 0.5f) {
        sample *= 2.0f;
    } else {
        sample = 1.0f - 2.0f * (sample - 0.5f);
        flip = true;
    }

    int iterations = 0;

    double a = 0.0,
           c = math::Pi_d,
           coeff0 = coeffs[0][0],
           y = coeff0*math::Pi_d*sample,
           deriv = 0.0,
           b = 0.5 * math::Pi_d,
           cosB = 0,
           sinB = 1;

    if (nCoeffs > 10 && sample != 0 && sample != 1) {
        float stddev = std::sqrt(2.0f / 3.0f * std::log(coeffs[0][1] / coeffs[0][2]));
        if (std::isfinite(stddev)) {
            b = std::min(c, (double) math::normal_quantile(0.5f + sample / 2) * stddev);
            cosB = std::cos(b);
            sinB = std::sqrt(1 - cosB * cosB);
        }
    }

    #if FOURIER_SCALAR != 1
        __m256d factorB_prev, factorB_cur;
    #endif

    while (true) {
        #if FOURIER_SCALAR == 1
            double cosB_prev = cosB,
                   sinB_prev = -sinB,
                   sinB_cur  = 0.0,
                   cosB_cur  = 1.0,
                   value     = coeff0 * b;

            deriv = coeff0;

            for (size_t j=1; j<nCoeffs; ++j) {
                double sinB_next = 2.0*cosB*sinB_cur - sinB_prev,
                       cosB_next = 2.0*cosB*cosB_cur - cosB_prev,
                       coeff     = (double) coeffs[0][j];

                value += coeff * recip[j] * sinB_next;
                deriv += coeff * cosB_next;

                sinB_prev = sinB_cur; sinB_cur = sinB_next;
                cosB_prev = cosB_cur; cosB_cur = cosB_next;
            }
        #else
            initializeRecurrence(cosB, factorB_prev, factorB_cur);

            __m256d
                sinB_prev  = _mm256_set1_pd(-sinB),
                sinB_cur   = _mm256_set1_pd(0.0),
                cosB_prev  = _mm256_set1_pd(cosB),
                cosB_cur   = _mm256_set1_pd(1.0),
                value_vec  = _mm256_set_sd(coeff0 * b),
                deriv_vec  = _mm256_set_sd(coeff0);

            for (size_t j=1; j<nCoeffs; j+=4) {
                __m128 coeff_vec_f = _mm_load_ps(coeffs[0]+j);
                __m256d recip_vec  = _mm256_load_pd(recip+j);
                __m256d coeff_vec  = _mm256_cvtps_pd(coeff_vec_f);

                __m256d sinB_next = _mm256_add_pd(
                        _mm256_mul_pd(factorB_prev, sinB_prev),
                        _mm256_mul_pd(factorB_cur, sinB_cur));

                __m256d cosB_next = _mm256_add_pd(
                        _mm256_mul_pd(factorB_prev, cosB_prev),
                        _mm256_mul_pd(factorB_cur, cosB_cur));

                value_vec = _mm256_add_pd(value_vec, _mm256_mul_pd(
                    _mm256_mul_pd(recip_vec, coeff_vec), sinB_next));
                deriv_vec = _mm256_add_pd(deriv_vec, _mm256_mul_pd(coeff_vec, cosB_next));

                sinB_prev = _mm256_splat2_pd(sinB_next);
                cosB_prev = _mm256_splat2_pd(cosB_next);
                sinB_cur  = _mm256_splat3_pd(sinB_next);
                cosB_cur  = _mm256_splat3_pd(cosB_next);
            }

            double value = simd::hadd(value_vec);
            deriv = simd::hadd(deriv_vec);
        #endif

        value -= y;

        if (std::abs(value) <= 1e-5 * coeff0 || ++iterations > 20)
            break;
        else if (value > 0.0)
            c = b;
        else
            a = b;

        b -= value / deriv;

        if (!(b >= a && b <= c))
            b = 0.5f * (a + c);

        cosB = std::cos(b);
        sinB = std::sqrt(1-cosB*cosB);
    }

    double Y = deriv;
    if (flip)
        b = 2.0*math::Pi_d - b;

    pdf = (Float) (math::InvTwoPi_d * Y / coeff0);
    phi = (Float) b;

    #if FOURIER_SCALAR == 1
        double cosB_prev = cosB,
               cosB_cur  = 1.0;

        double R = coeffs[1][0];
        double B = coeffs[2][0];

        for (size_t j=1; j<nCoeffs; ++j) {
            double cosB_next = 2.0*cosB*cosB_cur - cosB_prev,
                   coeffR    = (double) coeffs[1][j],
                   coeffB    = (double) coeffs[2][j];

            R += coeffR * cosB_next;
            B += coeffB * cosB_next;

            cosB_prev = cosB_cur; cosB_cur = cosB_next;
        }
    #else
        __m256d
            cosB_prev  = _mm256_set1_pd(cosB),
            cosB_cur   = _mm256_set1_pd(1.0),
            R_vec  = _mm256_set_sd(coeffs[1][0]),
            B_vec  = _mm256_set_sd(coeffs[2][0]);

        for (size_t j=1; j<nCoeffs; j+=4) {
            __m128 coeff_R_vec_f = _mm_load_ps(coeffs[1]+j);
            __m128 coeff_B_vec_f = _mm_load_ps(coeffs[2]+j);
            __m256d coeff_R_vec  = _mm256_cvtps_pd(coeff_R_vec_f);
            __m256d coeff_B_vec  = _mm256_cvtps_pd(coeff_B_vec_f);

            __m256d cosB_next = _mm256_add_pd(
                    _mm256_mul_pd(factorB_prev, cosB_prev),
                    _mm256_mul_pd(factorB_cur, cosB_cur));

            R_vec = _mm256_add_pd(R_vec, _mm256_mul_pd(coeff_R_vec, cosB_next));
            B_vec = _mm256_add_pd(B_vec, _mm256_mul_pd(coeff_B_vec, cosB_next));

            cosB_prev = _mm256_splat2_pd(cosB_next);
            cosB_cur  = _mm256_splat3_pd(cosB_next);
        }

        double R = simd::hadd(R_vec);
        double B = simd::hadd(B_vec);
    #endif

    double G = 1.39829 * Y - 0.100913 * B - 0.297375 * R;
    return Color3((Float) R, (Float) G, (Float) B)
        * (2 * math::Pi) * (Float) (coeff0 / Y);
}
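
Both branches of the sampling loop above advance sin(j*b) and cos(j*b) with the recurrence sin((j+1)b) = 2*cos(b)*sin(j*b) - sin((j-1)*b) (and the same form for cosine), so each Newton iteration needs only one cos/sin evaluation. A small self-contained check of that recurrence:

#include <math.h>
#include <stdio.h>

int main(void)
{
    double b = 0.7, cosB = cos(b);
    double sin_prev = -sin(b), sin_cur = 0.0;   /* sin(-b), sin(0) */
    double cos_prev =  cosB,   cos_cur = 1.0;   /* cos(-b), cos(0) */
    for (int j = 1; j <= 5; j++) {
        double sin_next = 2.0 * cosB * sin_cur - sin_prev;
        double cos_next = 2.0 * cosB * cos_cur - cos_prev;
        printf("j=%d  sin: %+.12f vs %+.12f   cos: %+.12f vs %+.12f\n",
               j, sin_next, sin(j * b), cos_next, cos(j * b));
        sin_prev = sin_cur; sin_cur = sin_next;
        cos_prev = cos_cur; cos_cur = cos_next;
    }
    return 0;
}
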
void ntt_transform(poly out, const poly o)
{ 
  int s, pos = 0, offset;
  __m256d vt,vo0,vo10,vo11,vo20,vo21,vo22,vo23,vc,vp,vpinv,neg2,neg4;
  __m256d vx0,vx1,vx2,vx3,vx4,vx5,vx6,vx7;
  
  vpinv = _mm256_set_pd(PARAM_APPROX_P_INVERSE, PARAM_APPROX_P_INVERSE, PARAM_APPROX_P_INVERSE, PARAM_APPROX_P_INVERSE);
  vp    = _mm256_set_pd(8383489., 8383489., 8383489., 8383489.);

  bitrev(out);

  vo10 = _mm256_load_pd(o+pos);
  vo20 = _mm256_load_pd(o+pos+4);
  neg2 = _mm256_load_pd(_neg2);
  neg4 = _mm256_load_pd(_neg4);
                                  
  // m = 2, m = 4, m = 8 (3 levels merged)
  for(s = 0; s<POLY_DEG; s+=8)
  {
    // No multiplication by omega required; the corresponding twiddle factor is 1
    vx0 = _mm256_load_pd(out+s);
    vt = _mm256_mul_pd(vx0,neg2);
    vx0 = _mm256_hadd_pd(vx0,vt);

    vx1 = _mm256_load_pd(out+s+4);
    vt = _mm256_mul_pd(vx1,neg2);
    vx1 = _mm256_hadd_pd(vx1,vt);

    vx0 = _mm256_mul_pd(vx0, vo10);
    vc = _mm256_mul_pd(vx0, vpinv);
    vc = _mm256_round_pd(vc,0x08);
    vc = _mm256_mul_pd(vc, vp);
    vx0 = _mm256_sub_pd(vx0,vc);
    vt = _mm256_permute2f128_pd (vx0, vx0, 0x01); // now contains x2,x3,x0,x1
    vx0 = _mm256_mul_pd(vx0, neg4);
    vx0 = _mm256_add_pd(vx0, vt);

    vx1 = _mm256_mul_pd(vx1, vo10);
    vc = _mm256_mul_pd(vx1, vpinv);
    vc = _mm256_round_pd(vc,0x08);
    vc = _mm256_mul_pd(vc, vp);
    vx1 = _mm256_sub_pd(vx1,vc);
    vt = _mm256_permute2f128_pd (vx1, vx1, 0x01); // now contains x2,x3,x0,x1
    vx1 = _mm256_mul_pd(vx1, neg4);
    vx1 = _mm256_add_pd(vx1, vt);

    vt = _mm256_mul_pd(vx1, vo20);
    vc = _mm256_mul_pd(vt, vpinv);
    vc = _mm256_round_pd(vc,0x08);
    vc = _mm256_mul_pd(vc, vp);
    vt = _mm256_sub_pd(vt,vc);
    vx1 = _mm256_sub_pd(vx0, vt);
    _mm256_store_pd(out+s+4, vx1);

    vx0 = _mm256_add_pd(vx0, vt);
    _mm256_store_pd(out+s+0, vx0);
  }
  
  pos += 8;

// m = 16, m = 32, m = 64 (3 levels merged)
  for(offset = 0; offset < 8; offset+=4)
  {
    vo0 = _mm256_load_pd(o+pos+offset);
    vo10 = _mm256_load_pd(o+pos+offset+8);
    vo11 = _mm256_load_pd(o+pos+offset+16);

    for(s = 0; s<POLY_DEG; s+=64)
    {
      vx1 = _mm256_load_pd(out+offset+s+8);
      vt = _mm256_mul_pd(vx1, vo0);
      vc = _mm256_mul_pd(vt, vpinv);
      vc = _mm256_round_pd(vc,0x08);
      vc = _mm256_mul_pd(vc, vp);
      vt = _mm256_sub_pd(vt,vc);
      vx0 = _mm256_load_pd(out+offset+s+0);
      vx1 = _mm256_sub_pd(vx0, vt);
      //  _mm256_store_pd(out+offset+s+8, vx1);
      vx0 = _mm256_add_pd(vx0, vt);
      //  _mm256_store_pd(out+offset+s+0, vx0);

      vx3 = _mm256_load_pd(out+offset+s+24);
      vt = _mm256_mul_pd(vx3, vo0);
      vc = _mm256_mul_pd(vt, vpinv);
      vc = _mm256_round_pd(vc,0x08);
      vc = _mm256_mul_pd(vc, vp);
      vt = _mm256_sub_pd(vt,vc);
      vx2 = _mm256_load_pd(out+offset+s+16);
      vx3 = _mm256_sub_pd(vx2, vt);
      //  _mm256_store_pd(out+offset+s+24, vx3);
      vx2 = _mm256_add_pd(vx2, vt);
      //  _mm256_store_pd(out+offset+s+16, vx2);

      vx5 = _mm256_load_pd(out+offset+s+40);
      vt = _mm256_mul_pd(vx5, vo0);
      vc = _mm256_mul_pd(vt, vpinv);
      vc = _mm256_round_pd(vc,0x08);
      vc = _mm256_mul_pd(vc, vp);
      vt = _mm256_sub_pd(vt,vc);
      vx4 = _mm256_load_pd(out+offset+s+32);
      vx5 = _mm256_sub_pd(vx4, vt);
      //  _mm256_store_pd(out+offset+s+40, vx5);
      vx4 = _mm256_add_pd(vx4, vt);
      //  _mm256_store_pd(out+offset+s+32, vx4);

      vx7 = _mm256_load_pd(out+offset+s+56);
      vt = _mm256_mul_pd(vx7, vo0);
      vc = _mm256_mul_pd(vt, vpinv);
      vc = _mm256_round_pd(vc,0x08);
      vc = _mm256_mul_pd(vc, vp);
      vt = _mm256_sub_pd(vt,vc);
      vx6 = _mm256_load_pd(out+offset+s+48);
      vx7 = _mm256_sub_pd(vx6, vt);
      //  _mm256_store_pd(out+offset+s+56, vx7);
      vx6 = _mm256_add_pd(vx6, vt);
      //  _mm256_store_pd(out+offset+s+48, vx6);


      //  vx2 = _mm256_load_pd(out+offset+s+16);
      vt = _mm256_mul_pd(vx2, vo10);
      vc = _mm256_mul_pd(vt, vpinv);
      vc = _mm256_round_pd(vc,0x08);
      vc = _mm256_mul_pd(vc, vp);
      vt = _mm256_sub_pd(vt,vc);
      //  vx0 = _mm256_load_pd(out+offset+s+0);
      vx2 = _mm256_sub_pd(vx0, vt);
      //  _mm256_store_pd(out+offset+s+16, vx2);
      vx0 = _mm256_add_pd(vx0, vt);
      //  _mm256_store_pd(out+offset+s+0, vx0);

      //  vx6 = _mm256_load_pd(out+offset+s+48);
      vt = _mm256_mul_pd(vx6, vo10);
      vc = _mm256_mul_pd(vt, vpinv);
      vc = _mm256_round_pd(vc,0x08);
      vc = _mm256_mul_pd(vc, vp);
      vt = _mm256_sub_pd(vt,vc);
      //  vx4 = _mm256_load_pd(out+offset+s+32);
      vx6 = _mm256_sub_pd(vx4, vt);
      //  _mm256_store_pd(out+offset+s+48, vx6);
      vx4 = _mm256_add_pd(vx4, vt);
      //  _mm256_store_pd(out+offset+s+32, vx4);


      //  vx3 = _mm256_load_pd(out+offset+s+24);
      vt = _mm256_mul_pd(vx3, vo11);
      vc = _mm256_mul_pd(vt, vpinv);
      vc = _mm256_round_pd(vc,0x08);
      vc = _mm256_mul_pd(vc, vp);
      vt = _mm256_sub_pd(vt,vc);
      //  vx1 = _mm256_load_pd(out+offset+s+8);
      vx3 = _mm256_sub_pd(vx1, vt);
      //  _mm256_store_pd(out+offset+s+24, vx3);
      vx1 = _mm256_add_pd(vx1, vt);
      //  _mm256_store_pd(out+offset+s+8, vx1);

      //  vx7 = _mm256_load_pd(out+offset+s+56);
      vt = _mm256_mul_pd(vx7, vo11);
      vc = _mm256_mul_pd(vt, vpinv);
      vc = _mm256_round_pd(vc,0x08);
      vc = _mm256_mul_pd(vc, vp);
      vt = _mm256_sub_pd(vt,vc);
      //  vx5 = _mm256_load_pd(out+offset+s+40);
      vx7 = _mm256_sub_pd(vx5, vt);
      //  _mm256_store_pd(out+offset+s+56, vx7);
      vx5 = _mm256_add_pd(vx5, vt);
      //  _mm256_store_pd(out+offset+s+40, vx5);



      //  vx4 = _mm256_load_pd(out+offset+s+32);
      vo20 = _mm256_load_pd(o+pos+offset+24);
      vt = _mm256_mul_pd(vx4, vo20);
      vc = _mm256_mul_pd(vt, vpinv);
      vc = _mm256_round_pd(vc,0x08);
      vc = _mm256_mul_pd(vc, vp);
      vt = _mm256_sub_pd(vt,vc);
      //  vx0 = _mm256_load_pd(out+offset+s+0);
      vx4 = _mm256_sub_pd(vx0, vt);
      _mm256_store_pd(out+offset+s+32, vx4);
      vx0 = _mm256_add_pd(vx0, vt);
      _mm256_store_pd(out+offset+s+0, vx0);

      //  vx5 = _mm256_load_pd(out+offset+s+40);
      vo21 = _mm256_load_pd(o+pos+offset+32);
      vt = _mm256_mul_pd(vx5, vo21);
      vc = _mm256_mul_pd(vt, vpinv);
      vc = _mm256_round_pd(vc,0x08);
      vc = _mm256_mul_pd(vc, vp);
      vt = _mm256_sub_pd(vt,vc);
      //  vx1 = _mm256_load_pd(out+offset+s+8);
      vx5 = _mm256_sub_pd(vx1, vt);
      _mm256_store_pd(out+offset+s+40, vx5);
      vx1 = _mm256_add_pd(vx1, vt);
      _mm256_store_pd(out+offset+s+8, vx1);

      //  vx6 = _mm256_load_pd(out+offset+s+48);
      vo22 = _mm256_load_pd(o+pos+offset+40);
      vt = _mm256_mul_pd(vx6, vo22);
      vc = _mm256_mul_pd(vt, vpinv);
      vc = _mm256_round_pd(vc,0x08);
      vc = _mm256_mul_pd(vc, vp);
      vt = _mm256_sub_pd(vt,vc);
      //  vx2 = _mm256_load_pd(out+offset+s+16);
      vx6 = _mm256_sub_pd(vx2, vt);
      _mm256_store_pd(out+offset+s+48, vx6);
      vx2 = _mm256_add_pd(vx2, vt);
      _mm256_store_pd(out+offset+s+16, vx2);

      //  vx7 = _mm256_load_pd(out+offset+s+56);
      vo23 = _mm256_load_pd(o+pos+offset+48);
      vt = _mm256_mul_pd(vx7, vo23);
      vc = _mm256_mul_pd(vt, vpinv);
      vc = _mm256_round_pd(vc,0x08);
      vc = _mm256_mul_pd(vc, vp);
      vt = _mm256_sub_pd(vt,vc);
      //  vx3 = _mm256_load_pd(out+offset+s+24);
      vx7 = _mm256_sub_pd(vx3, vt);
      _mm256_store_pd(out+offset+s+56, vx7);
      vx3 = _mm256_add_pd(vx3, vt);
      _mm256_store_pd(out+offset+s+24, vx3);
    }
  }


  pos += 56;

  // m = 128, m=256, m=512 (3 levels merged)
  for(offset=0;offset<64;offset+=4)
  {
    vo0 = _mm256_load_pd(o+pos+offset);
    vo10 = _mm256_load_pd(o+pos+offset+64);
    vo11 = _mm256_load_pd(o+pos+offset+128);

    for(s = 0; s<POLY_DEG; s+=512)
    {
      vx1 = _mm256_load_pd(out+offset+s+64);
      vt = _mm256_mul_pd(vx1, vo0);
      vc = _mm256_mul_pd(vt, vpinv);
      vc = _mm256_round_pd(vc,0x08);
      vc = _mm256_mul_pd(vc, vp);
      vt = _mm256_sub_pd(vt,vc);
      vx0 = _mm256_load_pd(out+offset+s+0);
      vx1 = _mm256_sub_pd(vx0, vt);
      //_mm256_store_pd(out+offset+s+64, vx1);
      vx0 = _mm256_add_pd(vx0, vt);
      //_mm256_store_pd(out+offset+s+0, vx0);

      vx3 = _mm256_load_pd(out+offset+s+192);
      vt = _mm256_mul_pd(vx3, vo0);
      vc = _mm256_mul_pd(vt, vpinv);
      vc = _mm256_round_pd(vc,0x08);
      vc = _mm256_mul_pd(vc, vp);
      vt = _mm256_sub_pd(vt,vc);
      vx2 = _mm256_load_pd(out+offset+s+128);
      vx3 = _mm256_sub_pd(vx2, vt);
      //_mm256_store_pd(out+offset+s+192, vx3);
      vx2 = _mm256_add_pd(vx2, vt);
      //_mm256_store_pd(out+offset+s+128, vx2);

      vx5 = _mm256_load_pd(out+offset+s+320);
      vt = _mm256_mul_pd(vx5, vo0);
      vc = _mm256_mul_pd(vt, vpinv);
      vc = _mm256_round_pd(vc,0x08);
      vc = _mm256_mul_pd(vc, vp);
      vt = _mm256_sub_pd(vt,vc);
      vx4 = _mm256_load_pd(out+offset+s+256);
      vx5 = _mm256_sub_pd(vx4, vt);
      //_mm256_store_pd(out+offset+s+320, vx5);
      vx4 = _mm256_add_pd(vx4, vt);
      //_mm256_store_pd(out+offset+s+256, vx4);

      vx7 = _mm256_load_pd(out+offset+s+448);
      vt = _mm256_mul_pd(vx7, vo0);
      vc = _mm256_mul_pd(vt, vpinv);
      vc = _mm256_round_pd(vc,0x08);
      vc = _mm256_mul_pd(vc, vp);
      vt = _mm256_sub_pd(vt,vc);
      vx6 = _mm256_load_pd(out+offset+s+384);
      vx7 = _mm256_sub_pd(vx6, vt);
      //_mm256_store_pd(out+offset+s+448, vx7);
      vx6 = _mm256_add_pd(vx6, vt);
      //_mm256_store_pd(out+offset+s+384, vx6);

    

      //vx2 = _mm256_load_pd(out+offset+s+128);
      vt = _mm256_mul_pd(vx2, vo10);
      vc = _mm256_mul_pd(vt, vpinv);
      vc = _mm256_round_pd(vc,0x08);
      vc = _mm256_mul_pd(vc, vp);
      vt = _mm256_sub_pd(vt,vc);
      //vx0 = _mm256_load_pd(out+offset+s+0);
      vx2 = _mm256_sub_pd(vx0, vt);
      //_mm256_store_pd(out+offset+s+128, vx2);
      vx0 = _mm256_add_pd(vx0, vt);
      //_mm256_store_pd(out+offset+s+0, vx0);

      //vx3 = _mm256_load_pd(out+offset+s+192);
      vt = _mm256_mul_pd(vx3, vo11);
      vc = _mm256_mul_pd(vt, vpinv);
      vc = _mm256_round_pd(vc,0x08);
      vc = _mm256_mul_pd(vc, vp);
      vt = _mm256_sub_pd(vt,vc);
      //vx1 = _mm256_load_pd(out+offset+s+64);
      vx3 = _mm256_sub_pd(vx1, vt);
      //_mm256_store_pd(out+offset+s+192, vx3);
      vx1 = _mm256_add_pd(vx1, vt);
      //_mm256_store_pd(out+offset+s+64, vx1);

      //vx6 = _mm256_load_pd(out+offset+s+384);
      vt = _mm256_mul_pd(vx6, vo10);
      vc = _mm256_mul_pd(vt, vpinv);
      vc = _mm256_round_pd(vc,0x08);
      vc = _mm256_mul_pd(vc, vp);
      vt = _mm256_sub_pd(vt,vc);
      //vx4 = _mm256_load_pd(out+offset+s+256);
      vx6 = _mm256_sub_pd(vx4, vt);
      //_mm256_store_pd(out+offset+s+384, vx6);
      vx4 = _mm256_add_pd(vx4, vt);
      //_mm256_store_pd(out+offset+s+256, vx4);

      //vx7 = _mm256_load_pd(out+offset+s+448);
      vt = _mm256_mul_pd(vx7, vo11);
      vc = _mm256_mul_pd(vt, vpinv);
      vc = _mm256_round_pd(vc,0x08);
      vc = _mm256_mul_pd(vc, vp);
      vt = _mm256_sub_pd(vt,vc);
      //vx5 = _mm256_load_pd(out+offset+s+320);
      vx7 = _mm256_sub_pd(vx5, vt);
      //_mm256_store_pd(out+offset+s+448, vx7);
      vx5 = _mm256_add_pd(vx5, vt);
      //_mm256_store_pd(out+offset+s+320, vx5);


    
      //vx4 = _mm256_load_pd(out+offset+s+256);
      vo20 = _mm256_load_pd(o+pos+offset+192);
      vt = _mm256_mul_pd(vx4, vo20);
      vc = _mm256_mul_pd(vt, vpinv);
      vc = _mm256_round_pd(vc,0x08);
      vc = _mm256_mul_pd(vc, vp);
      vt = _mm256_sub_pd(vt,vc);
      //vx0 = _mm256_load_pd(out+offset+s+0);
      vx4 = _mm256_sub_pd(vx0, vt);
      _mm256_store_pd(out+offset+s+256, vx4);
      vx0 = _mm256_add_pd(vx0, vt);
      _mm256_store_pd(out+offset+s+0, vx0);

      //vx5 = _mm256_load_pd(out+offset+s+320);
      vo21 = _mm256_load_pd(o+pos+offset+256);
      vt = _mm256_mul_pd(vx5, vo21);
      vc = _mm256_mul_pd(vt, vpinv);
      vc = _mm256_round_pd(vc,0x08);
      vc = _mm256_mul_pd(vc, vp);
      vt = _mm256_sub_pd(vt,vc);
      //vx1 = _mm256_load_pd(out+offset+s+64);
      vx5 = _mm256_sub_pd(vx1, vt);
      _mm256_store_pd(out+offset+s+320, vx5);
      vx1 = _mm256_add_pd(vx1, vt);
      _mm256_store_pd(out+offset+s+64, vx1);

      //vx6 = _mm256_load_pd(out+offset+s+384);
      vo22 = _mm256_load_pd(o+pos+offset+320);
      vt = _mm256_mul_pd(vx6, vo22);
      vc = _mm256_mul_pd(vt, vpinv);
      vc = _mm256_round_pd(vc,0x08);
      vc = _mm256_mul_pd(vc, vp);
      vt = _mm256_sub_pd(vt,vc);
      //vx2 = _mm256_load_pd(out+offset+s+128);
      vx6 = _mm256_sub_pd(vx2, vt);
      _mm256_store_pd(out+offset+s+384, vx6);
      vx2 = _mm256_add_pd(vx2, vt);
      _mm256_store_pd(out+offset+s+128, vx2);

      //vx7 = _mm256_load_pd(out+offset+s+448);
      vo23 = _mm256_load_pd(o+pos+offset+384);
      vt = _mm256_mul_pd(vx7, vo23);
      vc = _mm256_mul_pd(vt, vpinv);
      vc = _mm256_round_pd(vc,0x08);
      vc = _mm256_mul_pd(vc, vp);
      vt = _mm256_sub_pd(vt,vc);
      //vx3 = _mm256_load_pd(out+offset+s+192);
      vx7 = _mm256_sub_pd(vx3, vt);
      _mm256_store_pd(out+offset+s+448, vx7);

      vx3 = _mm256_add_pd(vx3, vt);
      _mm256_store_pd(out+offset+s+192, vx3);
    }
  }
}
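
Every butterfly in ntt_transform uses the same reduction idiom: multiply by the precomputed approximate inverse of the modulus, round to the nearest integer (the 0x08 immediate of _mm256_round_pd), and subtract that multiple of p, keeping the double-precision coefficients small modulo p = 8383489. A scalar sketch of the idiom (PARAM_APPROX_P_INVERSE is assumed to be approximately 1/p):

#include <math.h>
#include <stdio.h>

int main(void)
{
    const double p = 8383489.0, pinv = 1.0 / p;
    double t = 1234567.0 * 7654321.0;   /* a butterfly-style product, exactly representable */
    double c = nearbyint(t * pinv);     /* round to nearest, as _mm256_round_pd(..., 0x08) does */
    double r = t - c * p;               /* centered representative of t mod p */
    printf("t mod p (centered) = %.0f\n", r);
    return 0;
}
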