Example #1
/******************************************************************
*
* NEGACYCLIC FFT LOOK UP TABLE
*
******************************************************************/
void negacyc_mul(ring_t *r, const ring_t *x, const ring_t *y)
{
  phi_forward(&vector_x,x);
  phi_forward(&vector_y,y);

  __m256d real_x,imag_x,real_y,imag_y,imag_temp,real_temp,dim;
  dim = _mm256_set1_pd(CPLXDIM);
  // double a,b,c,d;
  for (int i = 0; i < CPLXDIM; i+=4)
  {
    real_x = _mm256_load_pd(vector_x.real+i);
    imag_x = _mm256_load_pd(vector_x.imag+i);
    real_y = _mm256_load_pd(vector_y.real+i);
    imag_y = _mm256_load_pd(vector_y.imag+i);

    //(a + ib) * (c + id) = (ac - bd) + i(ad+bc)
    //real_temp = bd
    real_temp = _mm256_mul_pd(imag_x,imag_y);
    //imag_temp = ad
    imag_temp = _mm256_mul_pd(real_x,imag_y);
 
    real_x = _mm256_fmsub_pd(real_x,real_y,real_temp);
    imag_x = _mm256_fmadd_pd(imag_x,real_y,imag_temp);

    
    real_x = _mm256_div_pd(real_x,dim);
    imag_x = _mm256_div_pd(imag_x,dim);

    _mm256_store_pd(vector_res.real+i,real_x);
    _mm256_store_pd(vector_res.imag+i,imag_x);
  }
  phi_backward(&vector_res,r);
  // print_cplx(&vec_res,CPLXDIM);
}
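/* A minimal scalar sketch of the pointwise step above, for comparison with the AVX
 * loop: multiply the two transformed vectors element-wise as complex numbers and
 * apply the same 1/CPLXDIM scaling. The cplx_t layout (separate real[]/imag[]
 * arrays) is an assumption standing in for the vector_x/vector_y type. */
typedef struct { double real[CPLXDIM]; double imag[CPLXDIM]; } cplx_t;

static void negacyc_pointwise_scalar(cplx_t *res, const cplx_t *x, const cplx_t *y)
{
  for (int i = 0; i < CPLXDIM; ++i) {
    double a = x->real[i], b = x->imag[i];
    double c = y->real[i], d = y->imag[i];
    /* (a + ib)(c + id) = (ac - bd) + i(ad + bc), scaled by 1/CPLXDIM */
    res->real[i] = (a*c - b*d) / CPLXDIM;
    res->imag[i] = (a*d + b*c) / CPLXDIM;
  }
}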
Example #2
/******************************************************************
*
* SPLIT RADIX PRECOMPUTED AND VECTORIZED FFT MULTIPLICATION
*
******************************************************************/
void sr_vector_mul(ring_t *r, const ring_t *x, const ring_t *y){
  // printf("\n\n**************split-radix FAST**************\n");

  fft_vector_forward(&vctr_x,x);
  fft_vector_forward(&vctr_y,y);
  
  __m256d real_x,imag_x,real_y,imag_y,imag_temp,real_temp;
  // double a,b,c,d;
  for (int i = 0; i < CPLXDIM; i+=4)
  {
    real_x = _mm256_load_pd(vctr_x.real+i);
    imag_x = _mm256_load_pd(vctr_x.imag+i);
    real_y = _mm256_load_pd(vctr_y.real+i);
    imag_y = _mm256_load_pd(vctr_y.imag+i);

    //(a + ib) * (c + id) = (ac - bd) + i(ad+bc)
    //real_temp = bd
    real_temp = _mm256_mul_pd(imag_x,imag_y);
    //imag_temp = ad
    imag_temp = _mm256_mul_pd(real_x,imag_y);
     
    real_x = _mm256_fmsub_pd(real_x,real_y,real_temp);
    imag_x = _mm256_fmadd_pd(imag_x,real_y,imag_temp);

    real_y = _mm256_set1_pd(CPLXDIM);
    real_x = _mm256_div_pd(real_x,real_y);
    imag_x = _mm256_div_pd(imag_x,real_y);

    _mm256_store_pd(vctr_res.real+i,real_x);
    _mm256_store_pd(vctr_res.imag+i,imag_x);
  }
  fft_vector_backward(&vctr_res,r);
}
Example #3
void convert_simd_avx(const int32_t * u, double * y, size_t n, double slope)
{
	const int32_t * u_end = u + n;
	const int32_t * u_current = u;
	double * y_current = y;
	
	__m128i mmx_u1, mmx_u2;
    __m256d mmx_y1, mmx_y2, mmx_y3, mmx_y4;
    __m256d mmx_slope_4 = _mm256_set1_pd(slope);
	
	{
		for (; u_current < u_end; u_current += 8, y_current += 8)
		{
			/* Load 8 input values into two SSE registers (4 x int32 each) */
			mmx_u1 = _mm_load_si128(  (const __m128i *) u_current);
			mmx_u2 = _mm_load_si128(  (const __m128i *) (u_current + 4));
		
			mmx_y1 = _mm256_cvtepi32_pd(mmx_u1);
			mmx_y2 = _mm256_cvtepi32_pd(mmx_u2);
			
			mmx_y3 = _mm256_mul_pd(mmx_y1, mmx_slope_4);    /* Apply slope */
			mmx_y4 = _mm256_mul_pd(mmx_y2, mmx_slope_4);    /* Apply slope */
			
			_mm256_store_pd(y_current, mmx_y3);
			_mm256_store_pd(y_current+4, mmx_y4);			
		}
	}
	
}
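/* Minimal usage sketch for convert_simd_avx, assuming C11 aligned_alloc is available:
 * the loads/stores above are the aligned variants, so u needs 16-byte alignment,
 * y needs 32-byte alignment, and n is assumed to be a multiple of 8. */
#include <stdint.h>
#include <stdlib.h>

void convert_simd_avx_example(void)
{
	size_t n = 64;                                  /* multiple of 8 */
	int32_t *u = aligned_alloc(32, n * sizeof *u);  /* 32-byte alignment also covers the 16-byte loads */
	double  *y = aligned_alloc(32, n * sizeof *y);
	for (size_t i = 0; i < n; ++i) u[i] = (int32_t)i;
	convert_simd_avx(u, y, n, 0.5);                 /* y[i] = 0.5 * u[i] */
	free(u); free(y);
}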
Example #4
LIBXSMM_EXTERN_C LIBXSMM_RETARGETABLE
void stream_vector_set( const double i_scalar,
                        double*       io_c,
                        const int     i_length) {
  int l_n = 0;
  int l_trip_prolog = 0;
  int l_trip_stream = 0;
  
  /* init the trip counts */
  stream_init( i_length, (size_t)io_c, &l_trip_prolog, &l_trip_stream );

  /* run the prologue */
  for ( ; l_n < l_trip_prolog;  l_n++ ) {
    io_c[l_n] = i_scalar;
  }
  /* run the bulk, hopefully using streaming stores */
#if defined(__SSE3__) && defined(__AVX__) && !defined(__AVX512F__)
  {
    /* we need manual unrolling as the compiler otherwise generates 
       too many dependencies */
    const __m256d vec_scalar = _mm256_broadcast_sd(&i_scalar);
    for ( ; l_n < l_trip_stream;  l_n+=8 ) {
#ifdef DISABLE_NONTEMPORAL_STORES
      _mm256_store_pd(  &(io_c[l_n]),   vec_scalar );
      _mm256_store_pd(  &(io_c[l_n+4]), vec_scalar );
#else
      _mm256_stream_pd( &(io_c[l_n]),   vec_scalar );
      _mm256_stream_pd( &(io_c[l_n+4]), vec_scalar );
#endif
    }
  }
#elif defined(__SSE3__) && defined(__AVX__) && defined(__AVX512F__)
  {
    const __m512d vec_scalar = _mm512_broadcastsd_pd(_mm_load_sd(&i_scalar));
    for ( ; l_n < l_trip_stream;  l_n+=8 ) {
#ifdef DISABLE_NONTEMPORAL_STORES
      _mm512_store_pd(  &(io_c[l_n]), vec_scalar );
#else
      _mm512_stream_pd( &(io_c[l_n]), vec_scalar );
#endif
    }
  }
#else
  for ( ; l_n < l_trip_stream;  l_n++ ) {
    io_c[l_n] = i_scalar;
  }
#endif
  /* run the epilogue */
  for ( ; l_n < i_length;  l_n++ ) {
    io_c[l_n] = i_scalar;
  }
}
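/* stream_init itself is not shown in this example; the sketch below only illustrates
 * the kind of split it presumably computes (an assumption, not the LIBXSMM code):
 * peel scalar iterations until io_c reaches a 64-byte boundary, size the bulk trip
 * count as a multiple of 8 so the unrolled (streaming) stores stay aligned, and
 * leave the remainder to the epilogue. */
static void stream_init_sketch( const int length, const size_t addr,
                                int* o_trip_prolog, int* o_trip_stream ) {
  /* assumes the destination is at least 8-byte aligned (it holds doubles) */
  size_t misalign = addr % 64;
  int l_peel = (misalign == 0) ? 0 : (int)((64 - misalign) / sizeof(double));
  if ( l_peel > length ) l_peel = length;
  *o_trip_prolog = l_peel;
  /* bulk part: the largest multiple of 8 elements that still fits */
  *o_trip_stream = l_peel + ( ( (length - l_peel) / 8 ) * 8 );
}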
Example #5
// multiply all n elements pointed to by p by v
COREARRAY_DLL_DEFAULT void vec_f64_mul(double *p, size_t n, double v)
{
#if defined(COREARRAY_SIMD_AVX)

	const __m256d v4 = _mm256_set1_pd(v);

	switch ((size_t)p & 0x1F)
	{
	case 0x08:
		if (n > 0) { (*p++) *= v; n--; }
	case 0x10:
		if (n > 0) { (*p++) *= v; n--; }
	case 0x18:
		if (n > 0) { (*p++) *= v; n--; }
	case 0x00:
		for (; n >= 4; n-=4)
		{
			_mm256_store_pd(p, _mm256_mul_pd(_mm256_load_pd(p), v4));
			p += 4;
		}
		if (n >= 2)
		{
			_mm_store_pd(p, _mm_mul_pd(_mm_load_pd(p), _mm256_castpd256_pd128(v4)));
			p += 2; n -= 2;
		}
		break;
	default:
		for (; n >= 4; n-=4)
		{
			_mm256_storeu_pd(p, _mm256_mul_pd(_mm256_loadu_pd(p), v4));
			p += 4;
		}
		if (n >= 2)
		{
			_mm_storeu_pd(p, _mm_mul_pd(_mm_loadu_pd(p), _mm256_castpd256_pd128(v4)));
			p += 2; n -= 2;
		}
	}

#elif defined(COREARRAY_SIMD_SSE2)

	const __m128d v2 = _mm_set1_pd(v);

	switch ((size_t)p & 0x0F)
	{
	case 0x08:
		if (n > 0) { (*p++) *= v; n--; }
	case 0x00:
		for (; n >= 2; n-=2, p+=2)
			_mm_store_pd(p, _mm_mul_pd(_mm_load_pd(p), v2));
		break;
	default:
		for (; n >= 2; n-=2, p+=2)
			_mm_storeu_pd(p, _mm_mul_pd(_mm_loadu_pd(p), v2));
	}

#endif

	for (; n > 0; n--) (*p++) *= v;
}
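/* Usage sketch: because of the alignment switch above and the trailing scalar loop,
 * vec_f64_mul accepts any pointer alignment and any length. */
void vec_f64_mul_example()
{
	double buf[7] = { 1, 2, 3, 4, 5, 6, 7 };
	vec_f64_mul(buf + 1, 6, 2.0);   /* deliberately odd offset and odd length */
	/* buf is now { 1, 4, 6, 8, 10, 12, 14 } */
}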
Example #6
double compute_pi(size_t dt)
{
    size_t i;
    double pi = 0.0;
    double delta = 1.0 / dt;
    register __m256d ymm0, ymm1, ymm2, ymm3, ymm4;
    ymm0 = _mm256_set1_pd(1.0);
    ymm1 = _mm256_set1_pd(delta);
    ymm2 = _mm256_set_pd(delta * 3, delta * 2, delta * 1, 0.0);
    ymm4 = _mm256_setzero_pd();

    for (i = 0; i + 4 <= dt; i += 4) {   /* avoids size_t wrap-around when dt < 4 */
        ymm3 = _mm256_set1_pd(i * delta);
        ymm3 = _mm256_add_pd(ymm3, ymm2);
        ymm3 = _mm256_mul_pd(ymm3, ymm3);
        ymm3 = _mm256_add_pd(ymm0, ymm3);
        ymm3 = _mm256_div_pd(ymm1, ymm3);
        ymm4 = _mm256_add_pd(ymm4, ymm3);
    }
    double tmp[4] __attribute__((aligned(32)));
    _mm256_store_pd(tmp, ymm4);
    pi += tmp[0] + tmp[1] + tmp[2] + tmp[3];

    return pi * 4.0;
}
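/* Scalar reference for comparison: the loop above evaluates the same left-rectangle
 * approximation of pi = integral_0^1 4/(1+x^2) dx, four points per iteration. */
double compute_pi_scalar(size_t dt)
{
    double delta = 1.0 / dt;
    double sum = 0.0;
    for (size_t i = 0; i < dt; i++) {
        double x = i * delta;
        sum += delta / (1.0 + x * x);
    }
    return sum * 4.0;
}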
Example #7
static inline unsigned int evaluatePopcount(INT_TYPE v_N, char *precomputed)
{
#ifdef __AVX            	       	   	      
  unsigned long int
    res[4] __attribute__ ((aligned (BYTE_ALIGNMENT)));
	     
  unsigned int a, b;
	     
  _mm256_store_pd((double*)res, v_N);
  
  a = __builtin_popcountl(res[0]) + __builtin_popcountl(res[1]);
  b = __builtin_popcountl(res[2]) + __builtin_popcountl(res[3]);
	     
  return (a + b);	            
#else      	       
  unsigned int
    sum = 0,
    counts[INTS_PER_VECTOR] __attribute__ ((aligned (BYTE_ALIGNMENT)));

  VECTOR_STORE((CAST)counts, v_N);

  sum += BIT_COUNT(counts[0], precomputed) + BIT_COUNT(counts[1], precomputed);
  sum += BIT_COUNT(counts[2], precomputed) + BIT_COUNT(counts[3], precomputed);          

  return sum;
#endif
}
Example #8
double compute_pi_leibniz_avx_opt(size_t n)
{
	double pi = 0.0;
	register __m256d ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7, ymm8;
	register __m256d ymm9, ymm10, ymm11, ymm12, ymm13;

	ymm0 = _mm256_set_pd(1.0, -1.0, 1.0, -1.0);
	ymm1 = _mm256_set_pd(1.0, 3.0, 5.0, 7.0);
	ymm2 = _mm256_set_pd(9.0, 11.0, 13.0, 15.0);
	ymm3 = _mm256_set_pd(17.0, 19.0, 21.0, 23.0);
	ymm4 = _mm256_set_pd(25.0, 27.0, 29.0, 31.0);
	ymm13 = _mm256_set1_pd(32.0);

	ymm5 = _mm256_setzero_pd();
	ymm6 = _mm256_setzero_pd();
	ymm7 = _mm256_setzero_pd();
	ymm8 = _mm256_setzero_pd();
	
	for (size_t i = 0; i + 16 <= n; i += 16) {   /* avoids size_t wrap-around when n < 16 */
		ymm9 = _mm256_div_pd(ymm0, ymm1);
		ymm1 = _mm256_add_pd(ymm1, ymm13);
		ymm10 = _mm256_div_pd(ymm0, ymm2);
		ymm2 = _mm256_add_pd(ymm2, ymm13);
		ymm11 = _mm256_div_pd(ymm0, ymm3);
		ymm3 = _mm256_add_pd(ymm3, ymm13);
		ymm12 = _mm256_div_pd(ymm0, ymm4);
		ymm4 = _mm256_add_pd(ymm4, ymm13);

		ymm5 = _mm256_add_pd(ymm5, ymm9);
		ymm6 = _mm256_add_pd(ymm6, ymm10);
		ymm7 = _mm256_add_pd(ymm7, ymm11);
		ymm8 = _mm256_add_pd(ymm8, ymm12);
	}
	double tmp[4] __attribute__((aligned(32)));
	_mm256_store_pd(tmp, ymm5);
	pi += tmp[0] + tmp[1] + tmp[2] + tmp[3];
	_mm256_store_pd(tmp, ymm6);
	pi += tmp[0] + tmp[1] + tmp[2] + tmp[3];
	_mm256_store_pd(tmp, ymm7);
	pi += tmp[0] + tmp[1] + tmp[2] + tmp[3];
	_mm256_store_pd(tmp, ymm8);
	pi += tmp[0] + tmp[1] + tmp[2] + tmp[3];

	return pi * 4.0;
}
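/* Scalar reference for comparison: the vector code above sums the Leibniz series
 * pi/4 = sum_{k>=0} (-1)^k / (2k+1), sixteen terms per iteration. */
double compute_pi_leibniz_scalar(size_t n)
{
	double sum = 0.0;
	for (size_t k = 0; k < n; k++)
		sum += ((k & 1) ? -1.0 : 1.0) / (2.0 * k + 1.0);
	return sum * 4.0;
}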
Example #9
void
printcounters(struct counter *ctrs, uint64_t duration)
{
    struct metrics s = {0};

    uint64_t thisBytesWritten = pcm->bytesWritten();
    uint64_t thisBytesRead = pcm->bytesRead();
    memset(threadspercore, 0, gbl.ncores * sizeof(int));
    s.timestamp = _rdtsc();
    s.duration = duration;
    for (int cpu = 0; cpu < gbl.ncpus; ++cpu)
    {
        double delta[NEVENTS];
        // volatile because another thread is changing it.
        volatile struct counter *p = &ctrs[cpu];

        for (int i = 0; i < NEVENTS; ++i)
        {
            union {
                __m256d c;
                uint64_t values[4];
            } t;
            t.c = _mm256_load_pd((const double *)&p->counts[i][0]);
            delta[i] = perf_scale_delta(t.values, lastctr[cpu].counts[i]);
            _mm256_store_pd((double *)&lastctr[cpu].counts[i][0], t.c);
            if (delta[i] < 0)
                delta[i] = 0;
            sevents[i] += delta[i];
        }

        //printf("clocks %g duration %lu\n", delta[clocks], duration);
        if (2*delta[clocks] > duration)
        {
            int thiscore = pcm->getSocketId(cpu)  * gbl.corespersocket +
                pcm->getCoreId(cpu);
            ++s.nthreads;
            ++threadspercore[thiscore];
        }
        s.dsimd += delta[simd_dp];
        s.dsse += delta[sse_dp];
        s.dscalar += delta[scalar_dp];
        s.ssimd += delta[simd_sp];
        s.ssse += delta[sse_sp];
        s.sscalar += delta[scalar_sp];
        s.instrs += delta[instrs];
    }
    s.rbytes = thisBytesRead - lastBytesRead;
    s.wbytes = thisBytesWritten - lastBytesWritten;
    lastBytesRead = thisBytesRead;
    lastBytesWritten = thisBytesWritten;
    for (int i = 0; i < gbl.ncores; ++i)
        if (threadspercore[i])
            ++s.ncores;

    sample(&s);
}
Example #10
void sum_avx(double* c, double* a, double* b, int len)
{
    __m256d rA_AVX, rB_AVX, rC_AVX;   // variables for AVX

    for (int i = 0; i < len; i += 4)
    {
        rA_AVX = _mm256_load_pd(&a[i]);
        rB_AVX = _mm256_load_pd(&b[i]);
        rC_AVX = _mm256_add_pd(rA_AVX, rB_AVX);
        _mm256_store_pd(&c[i], rC_AVX);
    }
}
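/* Usage sketch (assuming C11/C++17 aligned_alloc): _mm256_load_pd/_mm256_store_pd are
 * the aligned forms, so a, b and c must be 32-byte aligned and len is assumed to be a
 * multiple of 4 since there is no scalar tail loop. */
#include <stdlib.h>

void sum_avx_example()
{
    int len = 8;
    double* a = (double*)aligned_alloc(32, len * sizeof(double));
    double* b = (double*)aligned_alloc(32, len * sizeof(double));
    double* c = (double*)aligned_alloc(32, len * sizeof(double));
    for (int i = 0; i < len; ++i) { a[i] = i; b[i] = 10.0 * i; }
    sum_avx(c, a, b, len);          // c[i] = a[i] + b[i]
    free(a); free(b); free(c);
}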
Example #11
inline void transpose_4x4block_AVX_64(double* A, double* B, const size_t lda,
                                   const size_t ldb) {
    __m256d row0 = _mm256_load_pd(&A[0*ldb]);
    __m256d row1 = _mm256_load_pd(&A[1*ldb]);
    __m256d row2 = _mm256_load_pd(&A[2*ldb]);
    __m256d row3 = _mm256_load_pd(&A[3*ldb]);
    __m256d tmp3, tmp2, tmp1, tmp0;
    tmp0 = _mm256_unpacklo_pd(row0, row1);
    tmp1 = _mm256_unpackhi_pd(row0, row1);
    tmp2 = _mm256_unpacklo_pd(row2, row3);
    tmp3 = _mm256_unpackhi_pd(row2, row3);
    row0 = _mm256_permute2f128_pd(tmp0, tmp2, 0x20);
    row1 = _mm256_permute2f128_pd(tmp1, tmp3, 0x20);
    row2 = _mm256_permute2f128_pd(tmp0, tmp2, 0x31);
    row3 = _mm256_permute2f128_pd(tmp1, tmp3, 0x31);
    _mm256_store_pd(&B[0*lda], row0);
    _mm256_store_pd(&B[1*lda], row1);
    _mm256_store_pd(&B[2*lda], row2);
    _mm256_store_pd(&B[3*lda], row3);

}
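/* Usage sketch: transpose one contiguous 4x4 block (lda = ldb = 4). Both buffers must
 * be 32-byte aligned for the aligned loads/stores above; note that the function reads
 * A with stride ldb and writes B with stride lda. */
inline void transpose_4x4_example() {
    double A[16] __attribute__((aligned(32)));
    double B[16] __attribute__((aligned(32)));
    for (int i = 0; i < 16; ++i) A[i] = i;      // A[r*4 + c]
    transpose_4x4block_AVX_64(A, B, 4, 4);
    // now B[c*4 + r] == A[r*4 + c], i.e. B holds the transpose of A
}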
Example #12
/******************************************************************
*
* SPLIT RADIX PRECOMPUTED AND VECTORIZED NON RECURSIVE FFT MULTIPLICATION
*
******************************************************************/
void sr_vector_nonrec_mul(ring_t *r, const ring_t *x, const ring_t *y){
  fft_vector_nonrec_forward(&vec_x,x);
  fft_vector_nonrec_forward(&vec_y,y);
  __m256d real_x,imag_x,real_y,imag_y,imag_temp,real_temp;
  // double a,b,c,d;
  for (int i = 0; i < CPLXDIM; i+=4)
  {
    real_x = _mm256_load_pd(vec_x.real+i);
    imag_x = _mm256_load_pd(vec_x.imag+i);
    real_y = _mm256_load_pd(vec_y.real+i);
    imag_y = _mm256_load_pd(vec_y.imag+i);

    //(a + ib) * (c + id) = (ac - bd) + i(ad+bc)
    //real_temp = bd
    real_temp = _mm256_mul_pd(imag_x,imag_y);
    //imag_temp = ad
    imag_temp = _mm256_mul_pd(real_x,imag_y);
    //REPLACES THE COMMENTED-OUT SECTION BELOW
    //real_x = ac
    // real_x = _mm256_mul_pd(real_x,real_y);
    // //imag_x = bc
    // imag_x = _mm256_mul_pd(imag_x,real_y);
    // //real_x = ac - bd => real_x - real_temp
    // real_x = _mm256_sub_pd(real_x,real_temp);
    // //imag_x = ad + bc => imag_temp + imag_x
    // imag_x = _mm256_add_pd(imag_x,imag_temp);
    //THESE ARE NOT WORKING 
    real_x = _mm256_fmsub_pd(real_x,real_y,real_temp);
    imag_x = _mm256_fmadd_pd(imag_x,real_y,imag_temp);

    real_y = _mm256_set1_pd(CPLXDIM);
    real_x = _mm256_div_pd(real_x,real_y);
    imag_x = _mm256_div_pd(imag_x,real_y);

    _mm256_store_pd(vec_res.real+i,real_x);
    _mm256_store_pd(vec_res.imag+i,imag_x);

  }
  fft_vector_nonrec_backward(&vec_res,r);
}
Example #13
// this function assumes data is stored in col-major
// if data is in row major, call it like matmul4x4(B, A, C)
void matmul4x4(double *A, double *B, double *C) {
    __m256d col[4], sum[4];
    //load every column into registers
    for(int i=0; i<4; i++)  
      col[i] = _mm256_load_pd(&A[i*4]);
    for(int i=0; i<4; i++) {
        sum[i] = _mm256_setzero_pd();      
        for(int j=0; j<4; j++) {
            sum[i] = _mm256_add_pd(_mm256_mul_pd(_mm256_set1_pd(B[i*4+j]), col[j]), sum[i]);
        }           
    }
    for(int i=0; i<4; i++) 
      _mm256_store_pd(&C[i*4], sum[i]); 
}
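/* Usage sketch: with column-major 4x4 matrices in 32-byte aligned storage (required
 * by the aligned load/store above), matmul4x4 computes C = A * B; for row-major data,
 * swap the first two arguments as the comment above describes. */
void matmul4x4_example() {
    double A[16] __attribute__((aligned(32)));
    double B[16] __attribute__((aligned(32)));
    double C[16] __attribute__((aligned(32)));
    // identity in A, arbitrary values in B: C comes back equal to B
    for (int i = 0; i < 16; i++) { A[i] = (i % 5 == 0) ? 1.0 : 0.0; B[i] = i; }
    matmul4x4(A, B, C);
}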
Example #14
static inline unsigned int populationCount(INT_TYPE v_N)
{
#ifdef __AVX
  {
    unsigned long int
      res[4] __attribute__ ((aligned (BYTE_ALIGNMENT)));
    unsigned int a, b;
    
    _mm256_store_pd((double*)res, v_N);
    
    a = __builtin_popcountl(res[0]) + __builtin_popcountl(res[1]);
    b = __builtin_popcountl(res[2]) + __builtin_popcountl(res[3]);
    
    return (a + b);	   
  }
#else	  
  return (vectorCount(v_N)); 
#endif
}
Example #15
double compute_pi_euler_avx(size_t n)
{
	double pi = 0.0;
	register __m256d ymm0, ymm1, ymm2, ymm3;
	ymm0 = _mm256_setzero_pd();
    ymm1 = _mm256_set1_pd(1.0);
    ymm2 = _mm256_set1_pd(6.0);

    for (size_t i = 0; i + 4 <= n; i += 4) {
        /* start the series at k = 1: including k = 0 would divide by zero */
        ymm3 = _mm256_set_pd(i + 1.0, i + 2.0, i + 3.0, i + 4.0);
        ymm3 = _mm256_mul_pd(ymm3, ymm3);
        ymm3 = _mm256_div_pd(ymm1, ymm3);  
        ymm0 = _mm256_add_pd(ymm0, ymm3);
    }
    /* scale the sum by 6 before taking the square root: pi = sqrt(6 * sum 1/k^2) */
    ymm3 = _mm256_mul_pd(ymm2, ymm0);
    double tmp[4] __attribute__((aligned(32)));
    _mm256_store_pd(tmp, ymm3);
    pi += tmp[0] + tmp[1] + tmp[2] + tmp[3];

    return sqrt( pi );
}
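/* Scalar reference for comparison (requires <math.h>): the Euler/Basel identity used
 * above is pi = sqrt(6 * sum_{k>=1} 1/k^2). */
double compute_pi_euler_scalar(size_t n)
{
	double sum = 0.0;
	for (size_t k = 1; k <= n; k++)
		sum += 1.0 / ((double)k * (double)k);
	return sqrt( 6.0 * sum );
}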
Example #16
double compute_pi_leibniz_fma(size_t n)
{
	double pi = 0.0;
	register __m256d ymm0, ymm1, ymm2, ymm3, ymm4;

	ymm0 = _mm256_setzero_pd();
	ymm1 = _mm256_set1_pd(2.0);
	ymm2 = _mm256_set1_pd(1.0);
	ymm3 = _mm256_set_pd(1.0, -1.0, 1.0, -1.0);
	
	for (size_t i = 0; i + 4 <= n; i += 4) {   /* avoids size_t wrap-around when n < 4 */
		ymm4 = _mm256_set_pd(i, i + 1.0, i + 2.0, i + 3.0);
		ymm4 = _mm256_fmadd_pd(ymm1, ymm4, ymm2);
		ymm4 = _mm256_div_pd(ymm3, ymm4);
		ymm0 = _mm256_add_pd(ymm0, ymm4);
	}
	double tmp[4] __attribute__((aligned(32)));
	_mm256_store_pd(tmp, ymm0);
	pi += tmp[0] + tmp[1] + tmp[2] + tmp[3];

	return pi * 4.0;
}
Example #17
static inline PetscErrorCode TensorContract_FMA(PetscInt dof,PetscInt P,PetscInt Q,const PetscReal Rf[],const PetscReal Sf[],const PetscReal Tf[],TensorMode tmode,const PetscScalar xx[],PetscScalar yy[])
{

  PetscFunctionBegin;
  if (tmode == TENSOR_TRANSPOSE) {PetscInt tmp = Q; Q = P; P = tmp;}
  {
    PetscReal R[Q][P],S[Q][P],T[Q][P];
    const PetscScalar (*x)[P*P*P][NE] = (const PetscScalar(*)[P*P*P][NE])xx;
    PetscScalar       (*y)[Q*Q*Q][NE] =       (PetscScalar(*)[Q*Q*Q][NE])yy;
    PetscScalar u[dof][Q*P*P][NE]_align,v[dof][Q*Q*P][NE]_align;

    for (PetscInt i=0; i<Q; i++) {
      for (PetscInt j=0; j<P; j++) {
        R[i][j] = tmode == TENSOR_EVAL ? Rf[i*P+j] : Rf[j*Q+i];
        S[i][j] = tmode == TENSOR_EVAL ? Sf[i*P+j] : Sf[j*Q+i];
        T[i][j] = tmode == TENSOR_EVAL ? Tf[i*P+j] : Tf[j*Q+i];
      }
    }

    // u[l,a,j,k] = R[a,i] x[l,i,j,k]
    for (PetscInt l=0; l<dof; l++) {
      for (PetscInt a=0; a<Q; a++) {
        __m256d r[P];
        for (PetscInt i=0; i<P; i++) r[i] = _mm256_set1_pd(R[a][i]);
        for (PetscInt jk=0; jk<P*P; jk++) {
          __m256d u_lajk = _mm256_setzero_pd();
          for (PetscInt i=0; i<P; i++) {
            u_lajk = _mm256_fmadd_pd(r[i],_mm256_load_pd(x[l][i*P*P+jk]),u_lajk);
          }
          _mm256_store_pd(u[l][a*P*P+jk],u_lajk);
        }
      }
    }

    // v[l,a,b,k] = S[b,j] u[l,a,j,k]
    for (PetscInt l=0; l<dof; l++) {
      for (PetscInt b=0; b<Q; b++) {
        __m256d s[P];
        for (int j=0; j<P; j++) s[j] = _mm256_set1_pd(S[b][j]);
        for (PetscInt a=0; a<Q; a++) {
          for (PetscInt k=0; k<P; k++) {
            __m256d v_labk = _mm256_setzero_pd();
            for (PetscInt j=0; j<P; j++) {
              v_labk = _mm256_fmadd_pd(s[j],_mm256_load_pd(u[l][(a*P+j)*P+k]),v_labk);
            }
            _mm256_store_pd(v[l][(a*Q+b)*P+k],v_labk);
          }
        }
      }
    }

    // y[l,a,b,c] = T[c,k] v[l,a,b,k]
    for (PetscInt l=0; l<dof; l++) {
      for (PetscInt c=0; c<Q; c++) {
        __m256d t[P];
        for (int k=0; k<P; k++) t[k] = _mm256_set1_pd(T[c][k]);
        for (PetscInt ab=0; ab<Q*Q; ab++) {
          __m256d y_labc = _mm256_load_pd(y[l][ab*Q+c]);
          for (PetscInt k=0; k<P; k++) {
            // for (PetscInt e=0; e<NE; e++) y[l][ab*Q+c][e] += T[c][k] * v[l][ab*P+k][e];
            y_labc = _mm256_fmadd_pd(t[k],_mm256_load_pd(v[l][ab*P+k]),y_labc);
          }
          _mm256_store_pd(y[l][ab*Q+c],y_labc);
        }
      }
    }
    PetscLogFlops(dof*(Q*P*P*P+Q*Q*P*P+Q*Q*Q*P)*NE*2);
  }
  PetscFunctionReturn(0);
}
Example #18
void CalculateBasisComponents(const MDoubleArray& weights, const BaryCoords& coords,
                              const MIntArray& triangleVertices, const MPointArray& points,
                              const MFloatVectorArray& normals, const MIntArray& sampleIds,
                              double* alignedStorage,
                              MPoint& origin, MVector& up, MVector& normal) {
  // Start with the recreated point and normal using the barycentric coordinates of the hit point.
  unsigned int hitIndex = weights.length()-1;
#ifdef __AVX__
  __m256d originV = Dot4<MPoint>(coords[0], coords[1], coords[2], 0.0,
                                points[triangleVertices[0]], points[triangleVertices[1]],
                                points[triangleVertices[2]], MPoint::origin);
  __m256d hitNormalV = Dot4<MVector>(coords[0], coords[1], coords[2], 0.0,
                                normals[triangleVertices[0]], normals[triangleVertices[1]],
                                normals[triangleVertices[2]], MVector::zero);
  __m256d hitWeightV = _mm256_set1_pd(weights[hitIndex]);
  // Create the barycentric point and normal.
  __m256d normalV = _mm256_mul_pd(hitNormalV, hitWeightV);
  // Then use the weighted adjacent data.
  for (unsigned int j = 0; j < hitIndex; j += 4) {
    __m256d tempNormal = Dot4<MVector>(weights[j], weights[j+1], weights[j+2], weights[j+3],
                                       normals[sampleIds[j]], normals[sampleIds[j+1]],
                                       normals[sampleIds[j+2]], normals[sampleIds[j+3]]);
    normalV = _mm256_add_pd(tempNormal, normalV);
  }

  _mm256_store_pd(alignedStorage, originV);
  origin.x = alignedStorage[0];
  origin.y = alignedStorage[1];
  origin.z = alignedStorage[2];
  _mm256_store_pd(alignedStorage, normalV);
  normal.x = alignedStorage[0];
  normal.y = alignedStorage[1];
  normal.z = alignedStorage[2];

  // Calculate the up vector
  const MPoint& pt1 = points[triangleVertices[0]];
  const MPoint& pt2 = points[triangleVertices[1]];
  __m256d p1 = _mm256_set_pd(pt1.w, pt1.z, pt1.y, pt1.x);
  __m256d p2 = _mm256_set_pd(pt2.w, pt2.z, pt2.y, pt2.x);
  p1 = _mm256_add_pd(p1, p2);
  __m256d half = _mm256_set_pd(0.5, 0.5, 0.5, 0.5);
  p1 = _mm256_mul_pd(p1, half);
  __m256d upV = _mm256_sub_pd(p1, originV);
  _mm256_store_pd(alignedStorage, upV);
  up.x = alignedStorage[0];
  up.y = alignedStorage[1];
  up.z = alignedStorage[2];
#else
  MVector hitNormal;
  // Create the barycentric point and normal.
  for (int i = 0; i < 3; ++i) {
    origin += points[triangleVertices[i]] * coords[i];
    hitNormal += MVector(normals[triangleVertices[i]]) * coords[i];
  }
  // Use crawl data to calculate normal
  normal = hitNormal * weights[hitIndex];
  for (unsigned int j = 0; j < hitIndex; j++) {
    normal += MVector(normals[sampleIds[j]]) * weights[j];
  }

  // Calculate the up vector
  // The triangle vertices are sorted by decreasing barycentric coordinates so the first two are
  // the two closest vertices in the triangle.
  up = ((points[triangleVertices[0]] + points[triangleVertices[1]]) * 0.5) - origin;
#endif
  normal.normalize();
  GetValidUp(weights, points, sampleIds, origin, normal, up);
}
Example #19
 /*!
  * \brief Aligned store of the given packed vector at the
  * given memory position
  */
 ETL_STATIC_INLINE(void) store(etl::complex<double>* memory, avx_simd_complex_double<etl::complex<double>> value) {
     _mm256_store_pd(reinterpret_cast<double*>(memory), value.value);
 }
Example #20
 /*!
  * \brief Aligned store of the given packed vector at the
  * given memory position
  */
 ETL_STATIC_INLINE(void) store(double* memory, avx_simd_double value) {
     _mm256_store_pd(memory, value.value);
 }
Example #21
LIBXSMM_EXTERN_C LIBXSMM_RETARGETABLE
void stream_update_helmholtz_no_h2( const double* i_g1,
                                    const double* i_g2,
                                    const double* i_g3, 
                                    const double* i_tm1,
                                    const double* i_tm2,
                                    const double* i_tm3,
                                    double*       io_c,
                                    const double  i_h1,
                                    const int     i_length) {
  int l_n = 0;
  int l_trip_prolog = 0;
  int l_trip_stream = 0;
  
  /* init the trip counts */
  stream_init( i_length, (size_t)io_c, &l_trip_prolog, &l_trip_stream );
  
  /* run the prologue */
  for ( ; l_n < l_trip_prolog;  l_n++ ) {
    io_c[l_n] =   i_h1*(i_g1[l_n]*i_tm1[l_n] + i_g2[l_n]*i_tm2[l_n] + i_g3[l_n]*i_tm3[l_n]);
  }
  /* run the bulk, hopefully using streaming stores */
#if defined(__SSE3__) && defined(__AVX__) && !defined(__AVX512F__)
  {
    const __m256d vec_h1 = _mm256_broadcast_sd(&i_h1);
    /* we need manual unrolling as the compiler otherwise generates 
       too many dependencies */
    for ( ; l_n < l_trip_stream;  l_n+=8 ) {
      __m256d vec_g1_1, vec_g2_1, vec_g3_1, vec_tm1_1, vec_tm2_1, vec_tm3_1;
      __m256d vec_g1_2, vec_g2_2, vec_g3_2, vec_tm1_2, vec_tm2_2, vec_tm3_2;

      vec_g1_1 = _mm256_loadu_pd(&(i_g1[l_n]));
      vec_tm1_1 = _mm256_loadu_pd(&(i_tm1[l_n]));
      vec_g1_2 = _mm256_loadu_pd(&(i_g1[l_n+4]));
      vec_tm1_2 = _mm256_loadu_pd(&(i_tm1[l_n+4]));

      vec_g1_1 = _mm256_mul_pd(vec_g1_1, vec_tm1_1);
      vec_g2_1 = _mm256_loadu_pd(&(i_g2[l_n]));
      vec_g1_2 = _mm256_mul_pd(vec_g1_2, vec_tm1_2);
      vec_g2_2 = _mm256_loadu_pd(&(i_g2[l_n+4]));

      vec_tm2_1 = _mm256_loadu_pd(&(i_tm2[l_n]));
      vec_g2_1 = _mm256_mul_pd(vec_g2_1, vec_tm2_1);
      vec_tm2_2 = _mm256_loadu_pd(&(i_tm2[l_n+4]));
      vec_g2_2 = _mm256_mul_pd(vec_g2_2, vec_tm2_2);

      vec_g3_1 = _mm256_loadu_pd(&(i_g3[l_n]));
      vec_tm3_1 = _mm256_loadu_pd(&(i_tm3[l_n]));
      vec_g3_2 = _mm256_loadu_pd(&(i_g3[l_n+4]));
      vec_tm3_2 = _mm256_loadu_pd(&(i_tm3[l_n+4]));

      vec_g3_1 = _mm256_mul_pd(vec_g3_1, vec_tm3_1);
      vec_g3_2 = _mm256_mul_pd(vec_g3_2, vec_tm3_2);
      vec_g1_1 = _mm256_add_pd(vec_g1_1, vec_g2_1);
      vec_g1_2 = _mm256_add_pd(vec_g1_2, vec_g2_2);

      vec_g1_1 = _mm256_add_pd(vec_g1_1, vec_g3_1);
#ifdef DISABLE_NONTEMPORAL_STORES
      _mm256_store_pd(  &(io_c[l_n]), _mm256_mul_pd(vec_g1_1, vec_h1) );
#else
      _mm256_stream_pd( &(io_c[l_n]), _mm256_mul_pd(vec_g1_1, vec_h1) );
#endif
      vec_g1_2 = _mm256_add_pd(vec_g1_2, vec_g3_2);
#ifdef DISABLE_NONTEMPORAL_STORES
      _mm256_store_pd(  &(io_c[l_n+4]), _mm256_mul_pd(vec_g1_2, vec_h1) );
#else
      _mm256_stream_pd( &(io_c[l_n+4]), _mm256_mul_pd(vec_g1_2, vec_h1) );
#endif
    }
  }
#elif defined(__SSE3__) && defined(__AVX__) && defined(__AVX512F__)
  {
    const __m512d vec_h1 = _mm512_broadcastsd_pd(_mm_load_sd(&i_h1));
    for ( ; l_n < l_trip_stream;  l_n+=8 ) {
      __m512d vec_g1, vec_g2, vec_g3, vec_tm1, vec_tm2, vec_tm3;
      vec_g1 = _mm512_loadu_pd(&(i_g1[l_n]));
      vec_tm1 = _mm512_loadu_pd(&(i_tm1[l_n]));
      vec_g1 = _mm512_mul_pd(vec_g1, vec_tm1);
      vec_g2 = _mm512_loadu_pd(&(i_g2[l_n]));
      vec_tm2 = _mm512_loadu_pd(&(i_tm2[l_n]));
      vec_g2 = _mm512_mul_pd(vec_g2, vec_tm2);
      vec_g3 = _mm512_loadu_pd(&(i_g3[l_n]));
      vec_tm3 = _mm512_loadu_pd(&(i_tm3[l_n]));
      vec_g3 = _mm512_mul_pd(vec_g3, vec_tm3);
      vec_g1 = _mm512_add_pd(vec_g1, vec_g2);
      vec_g1 = _mm512_add_pd(vec_g1, vec_g3);
#ifdef DISABLE_NONTEMPORAL_STORES
      _mm512_store_pd(  &(io_c[l_n]), _mm512_mul_pd(vec_g1, vec_h1) );
#else
      _mm512_stream_pd( &(io_c[l_n]), _mm512_mul_pd(vec_g1, vec_h1) );
#endif
    }
  }
#else
  for ( ; l_n < l_trip_stream;  l_n++ ) {
    io_c[l_n] =   i_h1*(i_g1[l_n]*i_tm1[l_n] + i_g2[l_n]*i_tm2[l_n] + i_g3[l_n]*i_tm3[l_n]);
  }
#endif
  /* run the epilogue */
  for ( ; l_n < i_length;  l_n++ ) {
    io_c[l_n] =   i_h1*(i_g1[l_n]*i_tm1[l_n] + i_g2[l_n]*i_tm2[l_n] + i_g3[l_n]*i_tm3[l_n]);
  }
}
Example #22
void test (double *e, __m256d a)
{
  _mm256_store_pd (e, a);
}
Example #23
void core::Vector3::normalize(void)
{
#if defined(VTX_USE_AVX)
	ALIGNED_32 platform::F64_t vector[] = {this->x, this->y, this->z, 0};
	ALIGNED_32 platform::F64_t reciprocalVector[] = {1.0, 1.0, 1.0, 1.0};
	__m256d simdvector;
	__m256d result;
	__m256d recp;

	simdvector = _mm256_load_pd(vector);
	recp = _mm256_load_pd(reciprocalVector);

	result = _mm256_mul_pd(simdvector, simdvector);

	// _mm256_hadd_pd only adds within each 128-bit lane, so a second hadd would
	// double-count; swap the two halves and add instead, giving X^2+Y^2+Z^2 in every lane.
	result = _mm256_hadd_pd(result, result);
	result = _mm256_add_pd(result, _mm256_permute2f128_pd(result, result, 0x01));

	result = _mm256_sqrt_pd(result);

	result = _mm256_div_pd(recp, result);

	simdvector = _mm256_mul_pd(simdvector, result);

	_mm256_store_pd(vector, simdvector);

	this->x = vector[0];
	this->y = vector[1];
	this->z = vector[2];

#elif defined(VTX_USE_SSE)
	// Must pad with a trailing 0, to store in 128-bit register
	ALIGNED_16 core::F32_t vector[] = {this->x, this->y, this->z, 0};
	__m128 simdvector;
	__m128 result;
	simdvector = _mm_load_ps(vector);
	
	// (X^2, Y^2, Z^2, 0^2)
	result = _mm_mul_ps(simdvector, simdvector);
	
	// Add all elements together, giving us (X^2 + Y^2 + Z^2 + 0^2)
	result = _mm_hadd_ps(result, result);
	result = _mm_hadd_ps(result, result);
	
	// Calculate square root, giving us sqrt(X^2 + Y^2 + Z^2 + 0^2)
	result = _mm_sqrt_ps(result);

	// Calculate reciprocal, giving us 1 / sqrt(X^2 + Y^2 + Z^2 + 0^2)
	result = _mm_rcp_ps(result);

	// Finally, multiply the result with our original vector.
	simdvector = _mm_mul_ps(simdvector, result);

	_mm_store_ps(vector, simdvector);
	
	this->x = vector[0];
	this->y = vector[1];
	this->z = vector[2];
#else
	core::F64_t num = 1.0 / std::sqrt(std::pow(this->x, 2) + std::pow(this->y, 2) + std::pow(this->z, 2));
	this->x *= num;
	this->y *= num;
	this->z *= num;
#endif
}
Example #24
inline void store(double *r, const F64vec4 v)
{
    _mm256_store_pd(r, v);
}
Example #25
void ntt_transform(poly out, const poly o)
{ 
  int s, pos = 0, offset;
  __m256d vt,vo0,vo10,vo11,vo20,vo21,vo22,vo23,vc,vp,vpinv,neg2,neg4;
  __m256d vx0,vx1,vx2,vx3,vx4,vx5,vx6,vx7;
  
  vpinv = _mm256_set_pd(PARAM_APPROX_P_INVERSE, PARAM_APPROX_P_INVERSE, PARAM_APPROX_P_INVERSE, PARAM_APPROX_P_INVERSE);
  vp    = _mm256_set_pd(8383489., 8383489., 8383489., 8383489.);

  bitrev(out);

  vo10 = _mm256_load_pd(o+pos);
  vo20 = _mm256_load_pd(o+pos+4);
  neg2 = _mm256_load_pd(_neg2);
  neg4 = _mm256_load_pd(_neg4);
                                  
  // m = 2, m = 4, m = 8 (3 levels merged)
  for(s = 0; s<POLY_DEG; s+=8)
  {
    // No multiplication with omega required, respective value is 1
    vx0 = _mm256_load_pd(out+s);
    vt = _mm256_mul_pd(vx0,neg2);
    vx0 = _mm256_hadd_pd(vx0,vt);

    vx1 = _mm256_load_pd(out+s+4);
    vt = _mm256_mul_pd(vx1,neg2);
    vx1 = _mm256_hadd_pd(vx1,vt);

    vx0 = _mm256_mul_pd(vx0, vo10);
    vc = _mm256_mul_pd(vx0, vpinv);
    vc = _mm256_round_pd(vc,0x08);
    vc = _mm256_mul_pd(vc, vp);
    vx0 = _mm256_sub_pd(vx0,vc);
    vt = _mm256_permute2f128_pd (vx0, vx0, 0x01); // now contains x2,x3,x0,x1
    vx0 = _mm256_mul_pd(vx0, neg4);
    vx0 = _mm256_add_pd(vx0, vt);

    vx1 = _mm256_mul_pd(vx1, vo10);
    vc = _mm256_mul_pd(vx1, vpinv);
    vc = _mm256_round_pd(vc,0x08);
    vc = _mm256_mul_pd(vc, vp);
    vx1 = _mm256_sub_pd(vx1,vc);
    vt = _mm256_permute2f128_pd (vx1, vx1, 0x01); // now contains x2,x3,x0,x1
    vx1 = _mm256_mul_pd(vx1, neg4);
    vx1 = _mm256_add_pd(vx1, vt);

    vt = _mm256_mul_pd(vx1, vo20);
    vc = _mm256_mul_pd(vt, vpinv);
    vc = _mm256_round_pd(vc,0x08);
    vc = _mm256_mul_pd(vc, vp);
    vt = _mm256_sub_pd(vt,vc);
    vx1 = _mm256_sub_pd(vx0, vt);
    _mm256_store_pd(out+s+4, vx1);

    vx0 = _mm256_add_pd(vx0, vt);
    _mm256_store_pd(out+s+0, vx0);
  }
  
  pos += 8;

// m = 16, m = 32, m = 64 (3 levels merged)
  for(offset = 0; offset < 8; offset+=4)
  {
    vo0 = _mm256_load_pd(o+pos+offset);
    vo10 = _mm256_load_pd(o+pos+offset+8);
    vo11 = _mm256_load_pd(o+pos+offset+16);

    for(s = 0; s<POLY_DEG; s+=64)
    {
      vx1 = _mm256_load_pd(out+offset+s+8);
      vt = _mm256_mul_pd(vx1, vo0);
      vc = _mm256_mul_pd(vt, vpinv);
      vc = _mm256_round_pd(vc,0x08);
      vc = _mm256_mul_pd(vc, vp);
      vt = _mm256_sub_pd(vt,vc);
      vx0 = _mm256_load_pd(out+offset+s+0);
      vx1 = _mm256_sub_pd(vx0, vt);
      //  _mm256_store_pd(out+offset+s+8, vx1);
      vx0 = _mm256_add_pd(vx0, vt);
      //  _mm256_store_pd(out+offset+s+0, vx0);

      vx3 = _mm256_load_pd(out+offset+s+24);
      vt = _mm256_mul_pd(vx3, vo0);
      vc = _mm256_mul_pd(vt, vpinv);
      vc = _mm256_round_pd(vc,0x08);
      vc = _mm256_mul_pd(vc, vp);
      vt = _mm256_sub_pd(vt,vc);
      vx2 = _mm256_load_pd(out+offset+s+16);
      vx3 = _mm256_sub_pd(vx2, vt);
      //  _mm256_store_pd(out+offset+s+24, vx3);
      vx2 = _mm256_add_pd(vx2, vt);
      //  _mm256_store_pd(out+offset+s+16, vx2);

      vx5 = _mm256_load_pd(out+offset+s+40);
      vt = _mm256_mul_pd(vx5, vo0);
      vc = _mm256_mul_pd(vt, vpinv);
      vc = _mm256_round_pd(vc,0x08);
      vc = _mm256_mul_pd(vc, vp);
      vt = _mm256_sub_pd(vt,vc);
      vx4 = _mm256_load_pd(out+offset+s+32);
      vx5 = _mm256_sub_pd(vx4, vt);
      //  _mm256_store_pd(out+offset+s+40, vx5);
      vx4 = _mm256_add_pd(vx4, vt);
      //  _mm256_store_pd(out+offset+s+32, vx4);

      vx7 = _mm256_load_pd(out+offset+s+56);
      vt = _mm256_mul_pd(vx7, vo0);
      vc = _mm256_mul_pd(vt, vpinv);
      vc = _mm256_round_pd(vc,0x08);
      vc = _mm256_mul_pd(vc, vp);
      vt = _mm256_sub_pd(vt,vc);
      vx6 = _mm256_load_pd(out+offset+s+48);
      vx7 = _mm256_sub_pd(vx6, vt);
      //  _mm256_store_pd(out+offset+s+56, vx7);
      vx6 = _mm256_add_pd(vx6, vt);
      //  _mm256_store_pd(out+offset+s+48, vx6);


      //  vx2 = _mm256_load_pd(out+offset+s+16);
      vt = _mm256_mul_pd(vx2, vo10);
      vc = _mm256_mul_pd(vt, vpinv);
      vc = _mm256_round_pd(vc,0x08);
      vc = _mm256_mul_pd(vc, vp);
      vt = _mm256_sub_pd(vt,vc);
      //  vx0 = _mm256_load_pd(out+offset+s+0);
      vx2 = _mm256_sub_pd(vx0, vt);
      //  _mm256_store_pd(out+offset+s+16, vx2);
      vx0 = _mm256_add_pd(vx0, vt);
      //  _mm256_store_pd(out+offset+s+0, vx0);

      //  vx6 = _mm256_load_pd(out+offset+s+48);
      vt = _mm256_mul_pd(vx6, vo10);
      vc = _mm256_mul_pd(vt, vpinv);
      vc = _mm256_round_pd(vc,0x08);
      vc = _mm256_mul_pd(vc, vp);
      vt = _mm256_sub_pd(vt,vc);
      //  vx4 = _mm256_load_pd(out+offset+s+32);
      vx6 = _mm256_sub_pd(vx4, vt);
      //  _mm256_store_pd(out+offset+s+48, vx6);
      vx4 = _mm256_add_pd(vx4, vt);
      //  _mm256_store_pd(out+offset+s+32, vx4);


      //  vx3 = _mm256_load_pd(out+offset+s+24);
      vt = _mm256_mul_pd(vx3, vo11);
      vc = _mm256_mul_pd(vt, vpinv);
      vc = _mm256_round_pd(vc,0x08);
      vc = _mm256_mul_pd(vc, vp);
      vt = _mm256_sub_pd(vt,vc);
      //  vx1 = _mm256_load_pd(out+offset+s+8);
      vx3 = _mm256_sub_pd(vx1, vt);
      //  _mm256_store_pd(out+offset+s+24, vx3);
      vx1 = _mm256_add_pd(vx1, vt);
      //  _mm256_store_pd(out+offset+s+8, vx1);

      //  vx7 = _mm256_load_pd(out+offset+s+56);
      vt = _mm256_mul_pd(vx7, vo11);
      vc = _mm256_mul_pd(vt, vpinv);
      vc = _mm256_round_pd(vc,0x08);
      vc = _mm256_mul_pd(vc, vp);
      vt = _mm256_sub_pd(vt,vc);
      //  vx5 = _mm256_load_pd(out+offset+s+40);
      vx7 = _mm256_sub_pd(vx5, vt);
      //  _mm256_store_pd(out+offset+s+56, vx7);
      vx5 = _mm256_add_pd(vx5, vt);
      //  _mm256_store_pd(out+offset+s+40, vx5);



      //  vx4 = _mm256_load_pd(out+offset+s+32);
      vo20 = _mm256_load_pd(o+pos+offset+24);
      vt = _mm256_mul_pd(vx4, vo20);
      vc = _mm256_mul_pd(vt, vpinv);
      vc = _mm256_round_pd(vc,0x08);
      vc = _mm256_mul_pd(vc, vp);
      vt = _mm256_sub_pd(vt,vc);
      //  vx0 = _mm256_load_pd(out+offset+s+0);
      vx4 = _mm256_sub_pd(vx0, vt);
      _mm256_store_pd(out+offset+s+32, vx4);
      vx0 = _mm256_add_pd(vx0, vt);
      _mm256_store_pd(out+offset+s+0, vx0);

      //  vx5 = _mm256_load_pd(out+offset+s+40);
      vo21 = _mm256_load_pd(o+pos+offset+32);
      vt = _mm256_mul_pd(vx5, vo21);
      vc = _mm256_mul_pd(vt, vpinv);
      vc = _mm256_round_pd(vc,0x08);
      vc = _mm256_mul_pd(vc, vp);
      vt = _mm256_sub_pd(vt,vc);
      //  vx1 = _mm256_load_pd(out+offset+s+8);
      vx5 = _mm256_sub_pd(vx1, vt);
      _mm256_store_pd(out+offset+s+40, vx5);
      vx1 = _mm256_add_pd(vx1, vt);
      _mm256_store_pd(out+offset+s+8, vx1);

      //  vx6 = _mm256_load_pd(out+offset+s+48);
      vo22 = _mm256_load_pd(o+pos+offset+40);
      vt = _mm256_mul_pd(vx6, vo22);
      vc = _mm256_mul_pd(vt, vpinv);
      vc = _mm256_round_pd(vc,0x08);
      vc = _mm256_mul_pd(vc, vp);
      vt = _mm256_sub_pd(vt,vc);
      //  vx2 = _mm256_load_pd(out+offset+s+16);
      vx6 = _mm256_sub_pd(vx2, vt);
      _mm256_store_pd(out+offset+s+48, vx6);
      vx2 = _mm256_add_pd(vx2, vt);
      _mm256_store_pd(out+offset+s+16, vx2);

      //  vx7 = _mm256_load_pd(out+offset+s+56);
      vo23 = _mm256_load_pd(o+pos+offset+48);
      vt = _mm256_mul_pd(vx7, vo23);
      vc = _mm256_mul_pd(vt, vpinv);
      vc = _mm256_round_pd(vc,0x08);
      vc = _mm256_mul_pd(vc, vp);
      vt = _mm256_sub_pd(vt,vc);
      //  vx3 = _mm256_load_pd(out+offset+s+24);
      vx7 = _mm256_sub_pd(vx3, vt);
      _mm256_store_pd(out+offset+s+56, vx7);
      vx3 = _mm256_add_pd(vx3, vt);
      _mm256_store_pd(out+offset+s+24, vx3);
    }
  }


  pos += 56;

  // m = 128, m=256, m=512 (3 levels merged)
  for(offset=0;offset<64;offset+=4)
  {
    vo0 = _mm256_load_pd(o+pos+offset);
    vo10 = _mm256_load_pd(o+pos+offset+64);
    vo11 = _mm256_load_pd(o+pos+offset+128);

    for(s = 0; s<POLY_DEG; s+=512)
    {
      vx1 = _mm256_load_pd(out+offset+s+64);
      vt = _mm256_mul_pd(vx1, vo0);
      vc = _mm256_mul_pd(vt, vpinv);
      vc = _mm256_round_pd(vc,0x08);
      vc = _mm256_mul_pd(vc, vp);
      vt = _mm256_sub_pd(vt,vc);
      vx0 = _mm256_load_pd(out+offset+s+0);
      vx1 = _mm256_sub_pd(vx0, vt);
      //_mm256_store_pd(out+offset+s+64, vx1);
      vx0 = _mm256_add_pd(vx0, vt);
      //_mm256_store_pd(out+offset+s+0, vx0);

      vx3 = _mm256_load_pd(out+offset+s+192);
      vt = _mm256_mul_pd(vx3, vo0);
      vc = _mm256_mul_pd(vt, vpinv);
      vc = _mm256_round_pd(vc,0x08);
      vc = _mm256_mul_pd(vc, vp);
      vt = _mm256_sub_pd(vt,vc);
      vx2 = _mm256_load_pd(out+offset+s+128);
      vx3 = _mm256_sub_pd(vx2, vt);
      //_mm256_store_pd(out+offset+s+192, vx3);
      vx2 = _mm256_add_pd(vx2, vt);
      //_mm256_store_pd(out+offset+s+128, vx2);

      vx5 = _mm256_load_pd(out+offset+s+320);
      vt = _mm256_mul_pd(vx5, vo0);
      vc = _mm256_mul_pd(vt, vpinv);
      vc = _mm256_round_pd(vc,0x08);
      vc = _mm256_mul_pd(vc, vp);
      vt = _mm256_sub_pd(vt,vc);
      vx4 = _mm256_load_pd(out+offset+s+256);
      vx5 = _mm256_sub_pd(vx4, vt);
      //_mm256_store_pd(out+offset+s+320, vx5);
      vx4 = _mm256_add_pd(vx4, vt);
      //_mm256_store_pd(out+offset+s+256, vx4);

      vx7 = _mm256_load_pd(out+offset+s+448);
      vt = _mm256_mul_pd(vx7, vo0);
      vc = _mm256_mul_pd(vt, vpinv);
      vc = _mm256_round_pd(vc,0x08);
      vc = _mm256_mul_pd(vc, vp);
      vt = _mm256_sub_pd(vt,vc);
      vx6 = _mm256_load_pd(out+offset+s+384);
      vx7 = _mm256_sub_pd(vx6, vt);
      //_mm256_store_pd(out+offset+s+448, vx7);
      vx6 = _mm256_add_pd(vx6, vt);
      //_mm256_store_pd(out+offset+s+384, vx6);

    

      //vx2 = _mm256_load_pd(out+offset+s+128);
      vt = _mm256_mul_pd(vx2, vo10);
      vc = _mm256_mul_pd(vt, vpinv);
      vc = _mm256_round_pd(vc,0x08);
      vc = _mm256_mul_pd(vc, vp);
      vt = _mm256_sub_pd(vt,vc);
      //vx0 = _mm256_load_pd(out+offset+s+0);
      vx2 = _mm256_sub_pd(vx0, vt);
      //_mm256_store_pd(out+offset+s+128, vx2);
      vx0 = _mm256_add_pd(vx0, vt);
      //_mm256_store_pd(out+offset+s+0, vx0);

      //vx3 = _mm256_load_pd(out+offset+s+192);
      vt = _mm256_mul_pd(vx3, vo11);
      vc = _mm256_mul_pd(vt, vpinv);
      vc = _mm256_round_pd(vc,0x08);
      vc = _mm256_mul_pd(vc, vp);
      vt = _mm256_sub_pd(vt,vc);
      //vx1 = _mm256_load_pd(out+offset+s+64);
      vx3 = _mm256_sub_pd(vx1, vt);
      //_mm256_store_pd(out+offset+s+192, vx3);
      vx1 = _mm256_add_pd(vx1, vt);
      //_mm256_store_pd(out+offset+s+64, vx1);

      //vx6 = _mm256_load_pd(out+offset+s+384);
      vt = _mm256_mul_pd(vx6, vo10);
      vc = _mm256_mul_pd(vt, vpinv);
      vc = _mm256_round_pd(vc,0x08);
      vc = _mm256_mul_pd(vc, vp);
      vt = _mm256_sub_pd(vt,vc);
      //vx4 = _mm256_load_pd(out+offset+s+256);
      vx6 = _mm256_sub_pd(vx4, vt);
      //_mm256_store_pd(out+offset+s+384, vx6);
      vx4 = _mm256_add_pd(vx4, vt);
      //_mm256_store_pd(out+offset+s+256, vx4);

      //vx7 = _mm256_load_pd(out+offset+s+448);
      vt = _mm256_mul_pd(vx7, vo11);
      vc = _mm256_mul_pd(vt, vpinv);
      vc = _mm256_round_pd(vc,0x08);
      vc = _mm256_mul_pd(vc, vp);
      vt = _mm256_sub_pd(vt,vc);
      //vx5 = _mm256_load_pd(out+offset+s+320);
      vx7 = _mm256_sub_pd(vx5, vt);
      //_mm256_store_pd(out+offset+s+448, vx7);
      vx5 = _mm256_add_pd(vx5, vt);
      //_mm256_store_pd(out+offset+s+320, vx5);


    
      //vx4 = _mm256_load_pd(out+offset+s+256);
      vo20 = _mm256_load_pd(o+pos+offset+192);
      vt = _mm256_mul_pd(vx4, vo20);
      vc = _mm256_mul_pd(vt, vpinv);
      vc = _mm256_round_pd(vc,0x08);
      vc = _mm256_mul_pd(vc, vp);
      vt = _mm256_sub_pd(vt,vc);
      //vx0 = _mm256_load_pd(out+offset+s+0);
      vx4 = _mm256_sub_pd(vx0, vt);
      _mm256_store_pd(out+offset+s+256, vx4);
      vx0 = _mm256_add_pd(vx0, vt);
      _mm256_store_pd(out+offset+s+0, vx0);

      //vx5 = _mm256_load_pd(out+offset+s+320);
      vo21 = _mm256_load_pd(o+pos+offset+256);
      vt = _mm256_mul_pd(vx5, vo21);
      vc = _mm256_mul_pd(vt, vpinv);
      vc = _mm256_round_pd(vc,0x08);
      vc = _mm256_mul_pd(vc, vp);
      vt = _mm256_sub_pd(vt,vc);
      //vx1 = _mm256_load_pd(out+offset+s+64);
      vx5 = _mm256_sub_pd(vx1, vt);
      _mm256_store_pd(out+offset+s+320, vx5);
      vx1 = _mm256_add_pd(vx1, vt);
      _mm256_store_pd(out+offset+s+64, vx1);

      //vx6 = _mm256_load_pd(out+offset+s+384);
      vo22 = _mm256_load_pd(o+pos+offset+320);
      vt = _mm256_mul_pd(vx6, vo22);
      vc = _mm256_mul_pd(vt, vpinv);
      vc = _mm256_round_pd(vc,0x08);
      vc = _mm256_mul_pd(vc, vp);
      vt = _mm256_sub_pd(vt,vc);
      //vx2 = _mm256_load_pd(out+offset+s+128);
      vx6 = _mm256_sub_pd(vx2, vt);
      _mm256_store_pd(out+offset+s+384, vx6);
      vx2 = _mm256_add_pd(vx2, vt);
      _mm256_store_pd(out+offset+s+128, vx2);

      //vx7 = _mm256_load_pd(out+offset+s+448);
      vo23 = _mm256_load_pd(o+pos+offset+384);
      vt = _mm256_mul_pd(vx7, vo23);
      vc = _mm256_mul_pd(vt, vpinv);
      vc = _mm256_round_pd(vc,0x08);
      vc = _mm256_mul_pd(vc, vp);
      vt = _mm256_sub_pd(vt,vc);
      //vx3 = _mm256_load_pd(out+offset+s+192);
      vx7 = _mm256_sub_pd(vx3, vt);
      _mm256_store_pd(out+offset+s+448, vx7);

      vx3 = _mm256_add_pd(vx3, vt);
      _mm256_store_pd(out+offset+s+192, vx3);
    }
  }
}
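/* The block repeated throughout ntt_transform keeps coefficients reduced modulo
 * p = 8383489 using only floating-point operations: t -= round(t * (1/p)) * p.
 * A scalar sketch of that single step (PARAM_APPROX_P_INVERSE is the constant from
 * the source above; nearbyint from <math.h> plays the role of
 * _mm256_round_pd(..., 0x08), i.e. round-to-nearest): */
static double reduce_mod_p(double t)
{
  const double p    = 8383489.;
  const double pinv = PARAM_APPROX_P_INVERSE;  /* approximately 1/p */
  double c = nearbyint(t * pinv);
  return t - c * p;                            /* centered representative of t mod p */
}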
Example #26
void rnn_int_d8x4_var2(
    int    k,
    double *aa,
    double *a,
    double *bb,
    double *b,
    double *c,
    aux_t  *aux
    )
{
  int    i;
  double neg2 = -2.0;
  double dzero = 0.0;
  v4df_t c03_0, c03_1, c03_2, c03_3;
  v4df_t c47_0, c47_1, c47_2, c47_3;
  v4df_t tmpc03_0, tmpc03_1, tmpc03_2, tmpc03_3;
  v4df_t tmpc47_0, tmpc47_1, tmpc47_2, tmpc47_3;
  v4df_t c_tmp;
  v4df_t a03, a47;
  v4df_t A03, A47; // prefetched A 

  v4df_t b0, b1, b2, b3;
  v4df_t B0; // prefetched B
  v4df_t aa_tmp, bb_tmp;


  int k_iter = k / 2;
  int k_left = k % 2;

  __asm__ volatile( "prefetcht0 0(%0)    \n\t" : :"r"( a ) );
  __asm__ volatile( "prefetcht2 0(%0)    \n\t" : :"r"( aux->b_next ) );
  __asm__ volatile( "prefetcht0 0(%0)    \n\t" : :"r"( c ) );


  c03_0.v = _mm256_setzero_pd();
  c03_1.v = _mm256_setzero_pd();
  c03_2.v = _mm256_setzero_pd();
  c03_3.v = _mm256_setzero_pd();
  c47_0.v = _mm256_setzero_pd();
  c47_1.v = _mm256_setzero_pd();
  c47_2.v = _mm256_setzero_pd();
  c47_3.v = _mm256_setzero_pd();


  // Load a03
  a03.v = _mm256_load_pd(      (double*)a         );
  // Load a47
  a47.v = _mm256_load_pd(      (double*)( a + 4 ) );
  // Load (b0,b1,b2,b3)
  b0.v  = _mm256_load_pd(      (double*)b         );

  for ( i = 0; i < k_iter; ++i ) {
    __asm__ volatile( "prefetcht0 192(%0)    \n\t" : :"r"(a) );

    // Preload A03
    A03.v = _mm256_load_pd(      (double*)( a + 8 ) );

    c_tmp.v = _mm256_mul_pd( a03.v  , b0.v    );
    c03_0.v = _mm256_add_pd( c_tmp.v, c03_0.v );
    c_tmp.v = _mm256_mul_pd( a47.v  , b0.v    );
    c47_0.v = _mm256_add_pd( c_tmp.v, c47_0.v );

    // Preload A47
    A47.v = _mm256_load_pd(      (double*)( a + 12 ) );

    // Shuffle b ( 1, 0, 3, 2 )
    b1.v  = _mm256_shuffle_pd( b0.v, b0.v, 0x5 );

    c_tmp.v = _mm256_mul_pd( a03.v  , b1.v    );
    c03_1.v = _mm256_add_pd( c_tmp.v, c03_1.v );
    c_tmp.v = _mm256_mul_pd( a47.v  , b1.v    );
    c47_1.v = _mm256_add_pd( c_tmp.v, c47_1.v );

    // Permute b ( 3, 2, 1, 0 )
    b2.v  = _mm256_permute2f128_pd( b1.v, b1.v, 0x1 );

    // Preload B0
    B0.v  = _mm256_load_pd(      (double*)( b + 4 ) );

    c_tmp.v = _mm256_mul_pd( a03.v  , b2.v    );
    c03_2.v = _mm256_add_pd( c_tmp.v, c03_2.v );
    c_tmp.v = _mm256_mul_pd( a47.v  , b2.v    );
    c47_2.v = _mm256_add_pd( c_tmp.v, c47_2.v );

    // Shuffle b ( 3, 2, 1, 0 )
    b3.v  = _mm256_shuffle_pd( b2.v, b2.v, 0x5 );

    c_tmp.v = _mm256_mul_pd( a03.v  , b3.v    );
    c03_3.v = _mm256_add_pd( c_tmp.v, c03_3.v );
    c_tmp.v = _mm256_mul_pd( a47.v  , b3.v    );
    c47_3.v = _mm256_add_pd( c_tmp.v, c47_3.v );


    // Iteration #1
    __asm__ volatile( "prefetcht0 512(%0)    \n\t" : :"r"(a) );

    // Preload a03 ( next iteration )
    a03.v = _mm256_load_pd(      (double*)( a + 16 ) );

    c_tmp.v = _mm256_mul_pd( A03.v  , B0.v    );
    c03_0.v = _mm256_add_pd( c_tmp.v, c03_0.v );

    b1.v  = _mm256_shuffle_pd( B0.v, B0.v, 0x5 );

    c_tmp.v = _mm256_mul_pd( A47.v  , B0.v    );
    c47_0.v = _mm256_add_pd( c_tmp.v, c47_0.v );
    c_tmp.v = _mm256_mul_pd( A03.v  , b1.v    );
    c03_1.v = _mm256_add_pd( c_tmp.v, c03_1.v );

    // Preload a47 ( next iteration )
    a47.v = _mm256_load_pd(      (double*)( a + 20 ) );

    // Permute b ( 3, 2, 1, 0 )
    b2.v  = _mm256_permute2f128_pd( b1.v, b1.v, 0x1 );

    c_tmp.v = _mm256_mul_pd( A47.v  , b1.v    );
    c47_1.v = _mm256_add_pd( c_tmp.v, c47_1.v );
    c_tmp.v = _mm256_mul_pd( A03.v  , b2.v    );
    c03_2.v = _mm256_add_pd( c_tmp.v, c03_2.v );

    // Shuffle b ( 3, 2, 1, 0 )
    b3.v  = _mm256_shuffle_pd( b2.v, b2.v, 0x5 );

    c_tmp.v = _mm256_mul_pd( A47.v  , b2.v    );
    c47_2.v = _mm256_add_pd( c_tmp.v, c47_2.v );

    // Load b0 ( next iteration )
    b0.v  = _mm256_load_pd(      (double*)( b + 8 ) );

    c_tmp.v = _mm256_mul_pd( A03.v  , b3.v    );
    c03_3.v = _mm256_add_pd( c_tmp.v, c03_3.v );
    c_tmp.v = _mm256_mul_pd( A47.v  , b3.v    );
    c47_3.v = _mm256_add_pd( c_tmp.v, c47_3.v );

    a += 16;
    b += 8;
  }

  for ( i = 0; i < k_left; ++i ) {
    a03.v = _mm256_load_pd(      (double*)a         );
    //printf( "a03 = %lf, %lf, %lf, %lf\n", a03.d[0], a03.d[1], a03.d[2], a03.d[3] );

    a47.v = _mm256_load_pd(      (double*)( a + 4 ) );
    //printf( "a47 = %lf, %lf, %lf, %lf\n", a47.d[0], a47.d[1], a47.d[2], a47.d[3] );

    b0.v  = _mm256_load_pd(      (double*)b         );
    //printf( "b0  = %lf, %lf, %lf, %lf\n", b0.d[0], b0.d[1], b0.d[2], b0.d[3] );

    c_tmp.v = _mm256_mul_pd( a03.v  , b0.v    );
    c03_0.v = _mm256_add_pd( c_tmp.v, c03_0.v );
    c_tmp.v = _mm256_mul_pd( a47.v  , b0.v    );
    c47_0.v = _mm256_add_pd( c_tmp.v, c47_0.v );

    // Shuffle b ( 1, 0, 3, 2 )
    b1.v  = _mm256_shuffle_pd( b0.v, b0.v, 0x5 );

    c_tmp.v = _mm256_mul_pd( a03.v  , b1.v    );
    c03_1.v = _mm256_add_pd( c_tmp.v, c03_1.v );
    c_tmp.v = _mm256_mul_pd( a47.v  , b1.v    );
    c47_1.v = _mm256_add_pd( c_tmp.v, c47_1.v );

    // Permute b ( 3, 2, 1, 0 )
    b2.v  = _mm256_permute2f128_pd( b1.v, b1.v, 0x1 );

    c_tmp.v = _mm256_mul_pd( a03.v  , b2.v    );
    c03_2.v = _mm256_add_pd( c_tmp.v, c03_2.v );
    c_tmp.v = _mm256_mul_pd( a47.v  , b2.v    );
    c47_2.v = _mm256_add_pd( c_tmp.v, c47_2.v );

    // Shuffle b ( 3, 2, 1, 0 )
    b3.v  = _mm256_shuffle_pd( b2.v, b2.v, 0x5 );

    c_tmp.v = _mm256_mul_pd( a03.v  , b3.v    );
    c03_3.v = _mm256_add_pd( c_tmp.v, c03_3.v );
    c_tmp.v = _mm256_mul_pd( a47.v  , b3.v    );
    c47_3.v = _mm256_add_pd( c_tmp.v, c47_3.v );

    a += 8;
    b += 4;
  }
 

  // Prefetch aa and bb
  __asm__ volatile( "prefetcht0 0(%0)    \n\t" : :"r"( aa ) );
  __asm__ volatile( "prefetcht0 0(%0)    \n\t" : :"r"( bb ) );


  tmpc03_0.v = _mm256_blend_pd( c03_0.v, c03_1.v, 0x6 );
  tmpc03_1.v = _mm256_blend_pd( c03_1.v, c03_0.v, 0x6 );
  
  tmpc03_2.v = _mm256_blend_pd( c03_2.v, c03_3.v, 0x6 );
  tmpc03_3.v = _mm256_blend_pd( c03_3.v, c03_2.v, 0x6 );

  tmpc47_0.v = _mm256_blend_pd( c47_0.v, c47_1.v, 0x6 );
  tmpc47_1.v = _mm256_blend_pd( c47_1.v, c47_0.v, 0x6 );

  tmpc47_2.v = _mm256_blend_pd( c47_2.v, c47_3.v, 0x6 );
  tmpc47_3.v = _mm256_blend_pd( c47_3.v, c47_2.v, 0x6 );

  c03_0.v    = _mm256_permute2f128_pd( tmpc03_0.v, tmpc03_2.v, 0x30 );
  c03_3.v    = _mm256_permute2f128_pd( tmpc03_2.v, tmpc03_0.v, 0x30 );

  c03_1.v    = _mm256_permute2f128_pd( tmpc03_1.v, tmpc03_3.v, 0x30 );
  c03_2.v    = _mm256_permute2f128_pd( tmpc03_3.v, tmpc03_1.v, 0x30 );

  c47_0.v    = _mm256_permute2f128_pd( tmpc47_0.v, tmpc47_2.v, 0x30 );
  c47_3.v    = _mm256_permute2f128_pd( tmpc47_2.v, tmpc47_0.v, 0x30 );

  c47_1.v    = _mm256_permute2f128_pd( tmpc47_1.v, tmpc47_3.v, 0x30 );
  c47_2.v    = _mm256_permute2f128_pd( tmpc47_3.v, tmpc47_1.v, 0x30 );

  //printf( "rank-k\n" );
  //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[0], c03_1.d[0], c03_2.d[0], c03_3.d[0] );
  //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[1], c03_1.d[1], c03_2.d[1], c03_3.d[1] );
  //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[2], c03_1.d[2], c03_2.d[2], c03_3.d[2] );
  //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[3], c03_1.d[3], c03_2.d[3], c03_3.d[3] );
  //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[0], c47_1.d[0], c47_2.d[0], c47_3.d[0] );
  //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[1], c47_1.d[1], c47_2.d[1], c47_3.d[1] );
  //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[2], c47_1.d[2], c47_2.d[2], c47_3.d[2] );
  //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[3], c47_1.d[3], c47_2.d[3], c47_3.d[3] );

  __asm__ volatile( "prefetcht0 0(%0)    \n\t" : :"r"( aux->I ) );
  __asm__ volatile( "prefetcht0 0(%0)    \n\t" : :"r"( aux->D ) );


  //for ( i = 0; i < k; i++ ) {
  //  a03.v = _mm256_load_pd(      (double*)a         );
  //  a47.v = _mm256_load_pd(      (double*)( a + 4 ) );
  //  b0.v  = _mm256_broadcast_sd( (double*)b         );
  //  b1.v  = _mm256_broadcast_sd( (double*)( b + 1 ) );
  //  b2.v  = _mm256_broadcast_sd( (double*)( b + 2 ) );
  //  b3.v  = _mm256_broadcast_sd( (double*)( b + 3 ) );

  //  a += DKS_MR;
  //  b += DKS_NR;

  //  c_tmp.v = _mm256_mul_pd( a03.v  , b0.v    );
  //  c03_0.v = _mm256_add_pd( c_tmp.v, c03_0.v );
  //  c_tmp.v = _mm256_mul_pd( a03.v  , b1.v    );
  //  c03_1.v = _mm256_add_pd( c_tmp.v, c03_1.v );
  //  c_tmp.v = _mm256_mul_pd( a03.v  , b2.v    );
  //  c03_2.v = _mm256_add_pd( c_tmp.v, c03_2.v );
  //  c_tmp.v = _mm256_mul_pd( a03.v  , b3.v    );
  //  c03_3.v = _mm256_add_pd( c_tmp.v, c03_3.v );

  //  c_tmp.v = _mm256_mul_pd( a47.v  , b0.v    );
  //  c47_0.v = _mm256_add_pd( c_tmp.v, c47_0.v );
  //  c_tmp.v = _mm256_mul_pd( a47.v  , b1.v    );
  //  c47_1.v = _mm256_add_pd( c_tmp.v, c47_1.v );
  //  c_tmp.v = _mm256_mul_pd( a47.v  , b2.v    );
  //  c47_2.v = _mm256_add_pd( c_tmp.v, c47_2.v );
  //  c_tmp.v = _mm256_mul_pd( a47.v  , b3.v    );
  //  c47_3.v = _mm256_add_pd( c_tmp.v, c47_3.v );
  //}
  
  aa_tmp.v = _mm256_broadcast_sd( &neg2 );
  //c03_0.v  = _mm256_mul_pd( aa_tmp.v, c03_0.v );
  //c03_1.v  = _mm256_mul_pd( aa_tmp.v, c03_1.v );
  //c03_2.v  = _mm256_mul_pd( aa_tmp.v, c03_2.v );
  //c03_3.v  = _mm256_mul_pd( aa_tmp.v, c03_3.v );
  //c47_0.v  = _mm256_mul_pd( aa_tmp.v, c47_0.v );
  //c47_1.v  = _mm256_mul_pd( aa_tmp.v, c47_1.v );
  //c47_2.v  = _mm256_mul_pd( aa_tmp.v, c47_2.v );
  //c47_3.v  = _mm256_mul_pd( aa_tmp.v, c47_3.v );
  //
  c03_0.v  = _mm256_mul_pd( aa_tmp.v, c03_0.v );
  c03_1.v  = _mm256_mul_pd( aa_tmp.v, c03_1.v );
  c03_2.v  = _mm256_mul_pd( aa_tmp.v, c03_2.v );
  c03_3.v  = _mm256_mul_pd( aa_tmp.v, c03_3.v );
  c47_0.v  = _mm256_mul_pd( aa_tmp.v, c47_0.v );
  c47_1.v  = _mm256_mul_pd( aa_tmp.v, c47_1.v );
  c47_2.v  = _mm256_mul_pd( aa_tmp.v, c47_2.v );
  c47_3.v  = _mm256_mul_pd( aa_tmp.v, c47_3.v );


  //printf( "scale -2 \n" );
  //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[0], c03_1.d[0], c03_2.d[0], c03_3.d[0] );
  //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[1], c03_1.d[1], c03_2.d[1], c03_3.d[1] );
  //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[2], c03_1.d[2], c03_2.d[2], c03_3.d[2] );
  //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[3], c03_1.d[3], c03_2.d[3], c03_3.d[3] );
  //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[0], c47_1.d[0], c47_2.d[0], c47_3.d[0] );
  //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[1], c47_1.d[1], c47_2.d[1], c47_3.d[1] );
  //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[2], c47_1.d[2], c47_2.d[2], c47_3.d[2] );
  //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[3], c47_1.d[3], c47_2.d[3], c47_3.d[3] );


  aa_tmp.v = _mm256_load_pd( (double*)aa );
  c03_0.v  = _mm256_add_pd( aa_tmp.v, c03_0.v );
  c03_1.v  = _mm256_add_pd( aa_tmp.v, c03_1.v );
  c03_2.v  = _mm256_add_pd( aa_tmp.v, c03_2.v );
  c03_3.v  = _mm256_add_pd( aa_tmp.v, c03_3.v );

  //printf( "aa03 = %lf, %lf, %lf, %lf\n", aa_tmp.d[0], aa_tmp.d[1], aa_tmp.d[2], aa_tmp.d[3] );
  //printf( "bb03 = %lf, %lf, %lf, %lf\n", bb[ 0 ], bb[ 1 ], bb[ 2 ], bb[ 3 ] );

  aa_tmp.v = _mm256_load_pd( (double*)( aa + 4 ) );
  c47_0.v  = _mm256_add_pd( aa_tmp.v, c47_0.v );
  c47_1.v  = _mm256_add_pd( aa_tmp.v, c47_1.v );
  c47_2.v  = _mm256_add_pd( aa_tmp.v, c47_2.v );
  c47_3.v  = _mm256_add_pd( aa_tmp.v, c47_3.v );
  

  //printf( "add a^2\n" );
  //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[0], c03_1.d[0], c03_2.d[0], c03_3.d[0] );
  //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[1], c03_1.d[1], c03_2.d[1], c03_3.d[1] );
  //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[2], c03_1.d[2], c03_2.d[2], c03_3.d[2] );
  //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[3], c03_1.d[3], c03_2.d[3], c03_3.d[3] );
  //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[0], c47_1.d[0], c47_2.d[0], c47_3.d[0] );
  //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[1], c47_1.d[1], c47_2.d[1], c47_3.d[1] );
  //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[2], c47_1.d[2], c47_2.d[2], c47_3.d[2] );
  //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[3], c47_1.d[3], c47_2.d[3], c47_3.d[3] );


  bb_tmp.v = _mm256_broadcast_sd( (double*)bb );
  c03_0.v  = _mm256_add_pd( bb_tmp.v, c03_0.v );
  c47_0.v  = _mm256_add_pd( bb_tmp.v, c47_0.v );

  bb_tmp.v = _mm256_broadcast_sd( (double*)( bb + 1 ) );
  c03_1.v  = _mm256_add_pd( bb_tmp.v, c03_1.v );
  c47_1.v  = _mm256_add_pd( bb_tmp.v, c47_1.v );

  bb_tmp.v = _mm256_broadcast_sd( (double*)( bb + 2 ) );
  c03_2.v  = _mm256_add_pd( bb_tmp.v, c03_2.v );
  c47_2.v  = _mm256_add_pd( bb_tmp.v, c47_2.v );

  bb_tmp.v = _mm256_broadcast_sd( (double*)( bb + 3 ) );
  c03_3.v  = _mm256_add_pd( bb_tmp.v, c03_3.v );
  c47_3.v  = _mm256_add_pd( bb_tmp.v, c47_3.v );



  // Clamp any illegal (negative) value to zero
  c_tmp.v  = _mm256_broadcast_sd( &dzero );
  c03_0.v  = _mm256_max_pd( c_tmp.v, c03_0.v );
  c03_1.v  = _mm256_max_pd( c_tmp.v, c03_1.v );
  c03_2.v  = _mm256_max_pd( c_tmp.v, c03_2.v );
  c03_3.v  = _mm256_max_pd( c_tmp.v, c03_3.v );
  c47_0.v  = _mm256_max_pd( c_tmp.v, c47_0.v );
  c47_1.v  = _mm256_max_pd( c_tmp.v, c47_1.v );
  c47_2.v  = _mm256_max_pd( c_tmp.v, c47_2.v );
  c47_3.v  = _mm256_max_pd( c_tmp.v, c47_3.v );


  // Transpose c03/c47 _0, _1, _2, _3 to be the row vector
  tmpc03_0.v = _mm256_shuffle_pd( c03_0.v, c03_1.v, 0x0 );
  tmpc03_1.v = _mm256_shuffle_pd( c03_0.v, c03_1.v, 0xF );

  tmpc03_2.v = _mm256_shuffle_pd( c03_2.v, c03_3.v, 0x0 );
  tmpc03_3.v = _mm256_shuffle_pd( c03_2.v, c03_3.v, 0xF );

  tmpc47_0.v = _mm256_shuffle_pd( c47_0.v, c47_1.v, 0x0 );
  tmpc47_1.v = _mm256_shuffle_pd( c47_0.v, c47_1.v, 0xF );

  tmpc47_2.v = _mm256_shuffle_pd( c47_2.v, c47_3.v, 0x0 );
  tmpc47_3.v = _mm256_shuffle_pd( c47_2.v, c47_3.v, 0xF );

  c03_0.v    = _mm256_permute2f128_pd( tmpc03_0.v, tmpc03_2.v, 0x20 );
  c03_2.v    = _mm256_permute2f128_pd( tmpc03_0.v, tmpc03_2.v, 0x31 );

  c03_1.v    = _mm256_permute2f128_pd( tmpc03_1.v, tmpc03_3.v, 0x20 );
  c03_3.v    = _mm256_permute2f128_pd( tmpc03_1.v, tmpc03_3.v, 0x31 );

  c47_0.v    = _mm256_permute2f128_pd( tmpc47_0.v, tmpc47_2.v, 0x20 );
  c47_2.v    = _mm256_permute2f128_pd( tmpc47_0.v, tmpc47_2.v, 0x31 );

  c47_1.v    = _mm256_permute2f128_pd( tmpc47_1.v, tmpc47_3.v, 0x20 );
  c47_3.v    = _mm256_permute2f128_pd( tmpc47_1.v, tmpc47_3.v, 0x31 );


  // c03_0;
  // c03_1;
  // c03_2;
  // c03_3;
  // c47_0;
  // c47_1;
  // c47_2;
  // c47_3;


  _mm256_store_pd( c     , c03_0.v );
  _mm256_store_pd( c +  4, c03_1.v );
  _mm256_store_pd( c +  8, c03_2.v );
  _mm256_store_pd( c + 12, c03_3.v );
  _mm256_store_pd( c + 16, c47_0.v );
  _mm256_store_pd( c + 20, c47_1.v );
  _mm256_store_pd( c + 24, c47_2.v );
  _mm256_store_pd( c + 28, c47_3.v );
}
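/* A hedged scalar sketch of what this kernel appears to compute: a rank-k update of
 * pairwise squared distances, c[i][j] = max(0, aa[i] - 2 * sum_p a[p][i]*b[p][j] + bb[j]),
 * with A packed 8 rows per k-step and B packed 4 columns per k-step. The row-major
 * c[i*4 + j] output layout is an assumption read off the final register transpose,
 * not a documented contract of the original kernel. */
void rnn_d8x4_scalar_sketch(
    int    k,
    double *aa,
    double *a,
    double *bb,
    double *b,
    double *c
    )
{
  int i, j, p;
  for ( i = 0; i < 8; i ++ ) {
    for ( j = 0; j < 4; j ++ ) {
      double dot = 0.0;
      for ( p = 0; p < k; p ++ ) dot += a[ p * 8 + i ] * b[ p * 4 + j ];
      double d = aa[ i ] - 2.0 * dot + bb[ j ];
      c[ i * 4 + j ] = d > 0.0 ? d : 0.0; // clamp negative round-off, as the vector code does
    }
  }
}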
Example #27
LIBXSMM_EXTERN_C LIBXSMM_RETARGETABLE
void stream_vector_compscale( const double* i_a,
                              const double* i_b,
                              double*       io_c,
                              const int     i_length) {
  int l_n = 0;
  int l_trip_prolog = 0;
  int l_trip_stream = 0;
  
  /* init the trip counts */
  stream_init( i_length, (size_t)io_c, &l_trip_prolog, &l_trip_stream );

  /* run the prologue */
  for ( ; l_n < l_trip_prolog;  l_n++ ) {
    io_c[l_n] = i_a[l_n]*i_b[l_n];
  }
  /* run the bulk, hopefully using streaming stores */
#if defined(__SSE3__) && defined(__AVX__) && !defined(__AVX512F__)
  {
    /* we need manual unrolling as the compiler otherwise generates 
       too many dependencies */
    for ( ; l_n < l_trip_stream;  l_n+=8 ) {
      __m256d vec_a_1, vec_b_1;
      __m256d vec_a_2, vec_b_2;

      vec_a_1 = _mm256_loadu_pd(&(i_a[l_n]));
      vec_a_2 = _mm256_loadu_pd(&(i_a[l_n+4]));
      vec_b_1 = _mm256_loadu_pd(&(i_b[l_n]));
      vec_b_2 = _mm256_loadu_pd(&(i_b[l_n+4]));

#ifdef DISABLE_NONTEMPORAL_STORES
      _mm256_store_pd(  &(io_c[l_n]),   _mm256_mul_pd( vec_a_1, vec_b_1 ) );
      _mm256_store_pd(  &(io_c[l_n+4]), _mm256_mul_pd( vec_a_2, vec_b_2 ) );
#else
      _mm256_stream_pd( &(io_c[l_n]),   _mm256_mul_pd( vec_a_1, vec_b_1 ) );
      _mm256_stream_pd( &(io_c[l_n+4]), _mm256_mul_pd( vec_a_2, vec_b_2 ) );
#endif
    }
  }
#elif defined(__SSE3__) && defined(__AVX__) && defined(__AVX512F__)
  {
    for ( ; l_n < l_trip_stream;  l_n+=8 ) {
      __m512d vec_a, vec_b;

      vec_a = _mm512_loadu_pd(&(i_a[l_n]));
      vec_b = _mm512_loadu_pd(&(i_b[l_n]));

#ifdef DISABLE_NONTEMPORAL_STORES
      _mm512_store_pd(  &(io_c[l_n]), _mm512_mul_pd( vec_a, vec_b ) );
#else
      _mm512_stream_pd( &(io_c[l_n]), _mm512_mul_pd( vec_a, vec_b ) );
#endif
    }
  }
#else
  for ( ; l_n < l_trip_stream;  l_n++ ) {
    io_c[l_n] = i_a[l_n]*i_b[l_n];
  }
#endif
  /* run the epilogue */
  for ( ; l_n < i_length;  l_n++ ) {
    io_c[l_n] = i_a[l_n]*i_b[l_n];
  }
}
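
The stream_init routine used above is not shown in this example. A plausible sketch (an assumption, not the library's actual code) that is consistent with how the two trip counts are consumed would let the prologue run until the output pointer reaches 64-byte alignment and hand the bulk loop a multiple of 8 elements:

#include <stddef.h>

/* Hypothetical stream_init: split [0, i_length) into prologue / bulk / epilogue. */
static void stream_init_sketch( const int i_length, const size_t i_out_addr,
                                int* o_trip_prolog, int* o_trip_stream ) {
  /* doubles needed until the output address is 64-byte (cache-line) aligned */
  int l_prolog = (int)( ( (64u - (i_out_addr % 64u)) % 64u ) / sizeof(double) );
  if ( l_prolog > i_length ) l_prolog = i_length;
  /* the bulk loop advances 8 doubles per iteration; the rest goes to the epilogue */
  *o_trip_prolog = l_prolog;
  *o_trip_stream = l_prolog + ( ( (i_length - l_prolog) / 8 ) * 8 );
}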
Example #28
0
ALGEBRA_INLINE void		vector_addm_double_aligned_32 (double* v1,double lambda,const double* v2,size_t n)
{
	size_t k;
	
	__m256d l1 = _mm256_broadcast_sd(&lambda);
	__m256d l2 = _mm256_broadcast_sd(&lambda);
	__m256d l3 = _mm256_broadcast_sd(&lambda);
	__m256d l4 = _mm256_broadcast_sd(&lambda);

	size_t q = n / 16;
	size_t r = n % 16;
	if(q > 0) {
		if (ALGEBRA_IS_ALIGNED(v1) && ALGEBRA_IS_ALIGNED(v2)) {
			for (k=0;k<q;k++) {
				/* Load 4 AVX registers (16 doubles) from each array */
				__m256d i1 = _mm256_load_pd(v1);
				__m256d j1 = _mm256_load_pd(v2);
				__m256d i2 = _mm256_load_pd(v1+4);
				__m256d j2 = _mm256_load_pd(v2+4);
				__m256d i3 = _mm256_load_pd(v1+8);
				__m256d j3 = _mm256_load_pd(v2+8);
				__m256d i4 = _mm256_load_pd(v1+12);
				__m256d j4 = _mm256_load_pd(v2+12);
				/* Multiply by lambda */
					   j1 = _mm256_mul_pd(j1, l1);
					   j2 = _mm256_mul_pd(j2, l2);
					   j3 = _mm256_mul_pd(j3, l3);
					   j4 = _mm256_mul_pd(j4, l4);
				/* Add */
				i1 = _mm256_add_pd(i1,j1);
				i2 = _mm256_add_pd(i2,j2);
				i3 = _mm256_add_pd(i3,j3);
				i4 = _mm256_add_pd(i4,j4);
				/* Store back to v1 */
				_mm256_store_pd(v1, i1);
				_mm256_store_pd(v1+4, i2);
				_mm256_store_pd(v1+8, i3);
				_mm256_store_pd(v1+12, i4);
				v1 += 16;
				v2 += 16;
			}
		}
		else {		
			for (k=0;k<q;k++) {
				/* Load 4 AVX registers (16 doubles) from each array */
				__m256d i1 = _mm256_loadu_pd(v1);
				__m256d j1 = _mm256_loadu_pd(v2);
				__m256d i2 = _mm256_loadu_pd(v1+4);
				__m256d j2 = _mm256_loadu_pd(v2+4);
				__m256d i3 = _mm256_loadu_pd(v1+8);
				__m256d j3 = _mm256_loadu_pd(v2+8);
				__m256d i4 = _mm256_loadu_pd(v1+12);
				__m256d j4 = _mm256_loadu_pd(v2+12);
				/* Multiply by lambda */
					   j1 = _mm256_mul_pd(j1, l1);
					   j2 = _mm256_mul_pd(j2, l2);
					   j3 = _mm256_mul_pd(j3, l3);
					   j4 = _mm256_mul_pd(j4, l4);
				/* Add */
				i1 = _mm256_add_pd(i1,j1);
				i2 = _mm256_add_pd(i2,j2);
				i3 = _mm256_add_pd(i3,j3);
				i4 = _mm256_add_pd(i4,j4);
				/* Store back to v1 */
				_mm256_storeu_pd(v1, i1);
				_mm256_storeu_pd(v1+4, i2);
				_mm256_storeu_pd(v1+8, i3);
				_mm256_storeu_pd(v1+12, i4);
				v1 += 16;
				v2 += 16;
			}
		}
	}
	
	for(k = 0 ; k<r ; k++)
		v1[k] += lambda*v2[k];
}
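
On FMA-capable hardware the separate multiply and add above can be fused. A minimal sketch of that variant (hypothetical name; assumes the target supports FMA, e.g. compiled with -mfma) follows; the unaligned loads and stores keep it valid for any pointer alignment.

#include <immintrin.h>
#include <stddef.h>

static void vector_addm_double_fma_sketch( double* v1, double lambda,
                                           const double* v2, size_t n )
{
    __m256d vl = _mm256_broadcast_sd( &lambda );
    size_t k = 0;
    for ( ; k + 4 <= n; k += 4 ) {
        __m256d a = _mm256_loadu_pd( v1 + k );
        __m256d b = _mm256_loadu_pd( v2 + k );
        /* v1[k..k+3] += lambda * v2[k..k+3] in one fused operation */
        _mm256_storeu_pd( v1 + k, _mm256_fmadd_pd( vl, b, a ) );
    }
    for ( ; k < n; ++k )          /* scalar tail */
        v1[k] += lambda * v2[k];
}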
Example #29
0
 inline void vector4d::store_aligned(double* dst) const
 {
     _mm256_store_pd(dst, m_value);
 }
Example #30
0
int main()
{
    constexpr size_t N = 100 << 20;
    constexpr size_t N_pd = N/sizeof(double);
    constexpr size_t N_ps = N/sizeof(float);

    printf("Comparing std::log to mm256_log_pd/mm256_log_ps with %zuMiB input data\n", N>>20);

    double * data_pd = (double*)_mm_malloc(N_pd*sizeof(double), 32);
    float  * data_ps = (float*) _mm_malloc(N_ps*sizeof(float),  32);

    double * outl_pd = (double*)_mm_malloc(N_pd*sizeof(double), 32);
    float  * outl_ps = (float*) _mm_malloc(N_ps*sizeof(float),  32);

    double * outa_pd = (double*)_mm_malloc(N_pd*sizeof(double), 32);
    float  * outa_ps = (float*) _mm_malloc(N_ps*sizeof(float),  32);

    double * err_pd = (double*)_mm_malloc(N_pd*sizeof(double), 32);
    float  * err_ps = (float*) _mm_malloc(N_ps*sizeof(float),  32);

    size_t * idx_pd = (size_t*)malloc(N_pd*sizeof(size_t));
    size_t * idx_ps = (size_t*)malloc(N_ps*sizeof(size_t));

    if(data_pd == nullptr || data_ps == nullptr ||
       outl_pd == nullptr || outl_ps == nullptr ||
       outa_pd == nullptr || outa_ps == nullptr ||
        err_pd == nullptr ||  err_ps == nullptr ||
        idx_pd == nullptr ||  idx_ps == nullptr) {
        return 1;
    }

    auto rng = std::mt19937(hrc::now().time_since_epoch().count());

    printf("Filling double input data... ");
    fflush(stdout);
    for(size_t i = 0; i < N_pd; ++i) {
        data_pd[i] = /*100.0  */ std::generate_canonical<double, 64>(rng);
    }
    printf("done\n");

    printf("Filling float input data... ");
    fflush(stdout);
    for(size_t i = 0; i < N_ps; ++i) {
        data_ps[i] = /*100.0f */ std::generate_canonical<float, 32>(rng);
    }
    printf("done\n\n");

    printf("Testing serial run:\n\n");

    printf("Running std::log double... ");
    fflush(stdout);
    auto log_pd_s_time_start = hrc::now();
    for(size_t i = 0; i < N_pd; i += 8) {
        outl_pd[i+0] = std::log(data_pd[i+0]);
        outl_pd[i+1] = std::log(data_pd[i+1]);
        outl_pd[i+2] = std::log(data_pd[i+2]);
        outl_pd[i+3] = std::log(data_pd[i+3]);
        outl_pd[i+4] = std::log(data_pd[i+4]);
        outl_pd[i+5] = std::log(data_pd[i+5]);
        outl_pd[i+6] = std::log(data_pd[i+6]);
        outl_pd[i+7] = std::log(data_pd[i+7]);
    }
    auto log_pd_s_time_end = hrc::now();
    printf("done in %lums\n", std::chrono::duration_cast<ms>(log_pd_s_time_end - log_pd_s_time_start).count());

    printf("Running mm256_log_pd... ");
    fflush(stdout);
    auto avx_pd_s_time_start = hrc::now();
    for(size_t i = 0; i < N_pd; i += 8) {
        _mm256_store_pd(outa_pd+i+0, mm256_log_pd(_mm256_load_pd(data_pd+i+0)));
        _mm256_store_pd(outa_pd+i+4, mm256_log_pd(_mm256_load_pd(data_pd+i+4)));
    }
    auto avx_pd_s_time_end = hrc::now();
    printf("done in %lums\n", std::chrono::duration_cast<ms>(avx_pd_s_time_end - avx_pd_s_time_start).count());

    printf("Running std::log float... ");
    fflush(stdout);
    auto log_ps_s_time_start = hrc::now();
    for(size_t i = 0; i < N_ps; i += 16) {
        outl_ps[i+ 0] = std::log(data_ps[i+ 0]);
        outl_ps[i+ 1] = std::log(data_ps[i+ 1]);
        outl_ps[i+ 2] = std::log(data_ps[i+ 2]);
        outl_ps[i+ 3] = std::log(data_ps[i+ 3]);
        outl_ps[i+ 4] = std::log(data_ps[i+ 4]);
        outl_ps[i+ 5] = std::log(data_ps[i+ 5]);
        outl_ps[i+ 6] = std::log(data_ps[i+ 6]);
        outl_ps[i+ 7] = std::log(data_ps[i+ 7]);
        outl_ps[i+ 8] = std::log(data_ps[i+ 8]);
        outl_ps[i+ 9] = std::log(data_ps[i+ 9]);
        outl_ps[i+10] = std::log(data_ps[i+10]);
        outl_ps[i+11] = std::log(data_ps[i+11]);
        outl_ps[i+12] = std::log(data_ps[i+12]);
        outl_ps[i+13] = std::log(data_ps[i+13]);
        outl_ps[i+14] = std::log(data_ps[i+14]);
        outl_ps[i+15] = std::log(data_ps[i+15]);
    }
    auto log_ps_s_time_end = hrc::now();
    printf("done in %lums\n", std::chrono::duration_cast<ms>(log_ps_s_time_end - log_ps_s_time_start).count());

    printf("Running mm256_log_ps... ");
    fflush(stdout);
    auto avx_ps_s_time_start = hrc::now();
    for(size_t i = 0; i < N_ps; i += 16) {
        _mm256_store_ps(outa_ps+i+0, mm256_log_ps(_mm256_load_ps(data_ps+i+0)));
        _mm256_store_ps(outa_ps+i+8, mm256_log_ps(_mm256_load_ps(data_ps+i+8)));
    }
    auto avx_ps_s_time_end = hrc::now();
    printf("done in %lums\n", std::chrono::duration_cast<ms>(avx_ps_s_time_end - avx_ps_s_time_start).count());


    printf("\n\nTesting parallel run:\n\n");

    printf("Running std::log double... ");
    fflush(stdout);
    auto log_pd_p_time_start = hrc::now();
    #pragma omp parallel for
    for(size_t i = 0; i < N_pd; i += 8) {
        outl_pd[i+0] = std::log(data_pd[i+0]);
        outl_pd[i+1] = std::log(data_pd[i+1]);
        outl_pd[i+2] = std::log(data_pd[i+2]);
        outl_pd[i+3] = std::log(data_pd[i+3]);
        outl_pd[i+4] = std::log(data_pd[i+4]);
        outl_pd[i+5] = std::log(data_pd[i+5]);
        outl_pd[i+6] = std::log(data_pd[i+6]);
        outl_pd[i+7] = std::log(data_pd[i+7]);
    }
    auto log_pd_p_time_end = hrc::now();
    printf("done in %lums\n", std::chrono::duration_cast<ms>(log_pd_p_time_end - log_pd_p_time_start).count());

    printf("Running mm256_log_pd... ");
    fflush(stdout);
    auto avx_pd_p_time_start = hrc::now();
    #pragma omp parallel for
    for(size_t i = 0; i < N_pd; i += 8) {
        _mm256_store_pd(outa_pd+i+0, mm256_log_pd(_mm256_load_pd(data_pd+i+0)));
        _mm256_store_pd(outa_pd+i+4, mm256_log_pd(_mm256_load_pd(data_pd+i+4)));
    }
    auto avx_pd_p_time_end = hrc::now();
    printf("done in %lums\n", std::chrono::duration_cast<ms>(avx_pd_p_time_end - avx_pd_p_time_start).count());

    printf("Running std::log float... ");
    fflush(stdout);
    auto log_ps_p_time_start = hrc::now();
    #pragma omp parallel for
    for(size_t i = 0; i < N_ps; i += 16) {
        outl_ps[i+ 0] = std::log(data_ps[i+ 0]);
        outl_ps[i+ 1] = std::log(data_ps[i+ 1]);
        outl_ps[i+ 2] = std::log(data_ps[i+ 2]);
        outl_ps[i+ 3] = std::log(data_ps[i+ 3]);
        outl_ps[i+ 4] = std::log(data_ps[i+ 4]);
        outl_ps[i+ 5] = std::log(data_ps[i+ 5]);
        outl_ps[i+ 6] = std::log(data_ps[i+ 6]);
        outl_ps[i+ 7] = std::log(data_ps[i+ 7]);
        outl_ps[i+ 8] = std::log(data_ps[i+ 8]);
        outl_ps[i+ 9] = std::log(data_ps[i+ 9]);
        outl_ps[i+10] = std::log(data_ps[i+10]);
        outl_ps[i+11] = std::log(data_ps[i+11]);
        outl_ps[i+12] = std::log(data_ps[i+12]);
        outl_ps[i+13] = std::log(data_ps[i+13]);
        outl_ps[i+14] = std::log(data_ps[i+14]);
        outl_ps[i+15] = std::log(data_ps[i+15]);
    }
    auto log_ps_p_time_end = hrc::now();
    printf("done in %lums\n", std::chrono::duration_cast<ms>(log_ps_p_time_end - log_ps_p_time_start).count());

    printf("Running mm256_log_ps... ");
    fflush(stdout);
    auto avx_ps_p_time_start = hrc::now();
    #pragma omp parallel for
    for(size_t i = 0; i < N_ps; i += 16) {
        _mm256_store_ps(outa_ps+i+0, mm256_log_ps(_mm256_load_ps(data_ps+i+0)));
        _mm256_store_ps(outa_ps+i+8, mm256_log_ps(_mm256_load_ps(data_ps+i+8)));
    }
    auto avx_ps_p_time_end = hrc::now();
    printf("done in %lums\n", std::chrono::duration_cast<ms>(avx_ps_p_time_end - avx_ps_p_time_start).count());

    printf("\nCalculating errors... ");
    fflush(stdout);
    #pragma omp parallel for
    for(size_t i = 0; i < N_pd; ++i) {
        err_pd[i] = std::abs(1.0 - outa_pd[i]/outl_pd[i]);
    }
    #pragma omp parallel for
    for(size_t i = 0; i < N_ps; ++i) {
        err_ps[i] = std::abs(1.0 - outa_ps[i]/outl_ps[i]);
    }
    #pragma omp parallel for
    for(size_t i = 0; i < N_pd; ++i) {
        idx_pd[i] = i;
    }
    #pragma omp parallel for
    for(size_t i = 0; i < N_ps; ++i) {
        idx_ps[i] = i;
    }

    std::sort(idx_pd, idx_pd+N_pd, [&](size_t a, size_t b){ return err_pd[a] < err_pd[b]; });
    std::sort(idx_ps, idx_ps+N_ps, [&](size_t a, size_t b){ return err_ps[a] < err_ps[b]; });
    printf("done\n");

    printf("\n\nSummary:\n");
    double lsd_s = std::chrono::duration_cast<sd>(log_pd_s_time_end - log_pd_s_time_start).count();
    double asd_s = std::chrono::duration_cast<sd>(avx_pd_s_time_end - avx_pd_s_time_start).count();
    double lss_s = std::chrono::duration_cast<sd>(log_ps_s_time_end - log_ps_s_time_start).count();
    double ass_s = std::chrono::duration_cast<sd>(avx_ps_s_time_end - avx_ps_s_time_start).count();
    double lpd_s = std::chrono::duration_cast<sd>(log_pd_p_time_end - log_pd_p_time_start).count();
    double apd_s = std::chrono::duration_cast<sd>(avx_pd_p_time_end - avx_pd_p_time_start).count();
    double lps_s = std::chrono::duration_cast<sd>(log_ps_p_time_end - log_ps_p_time_start).count();
    double aps_s = std::chrono::duration_cast<sd>(avx_ps_p_time_end - avx_ps_p_time_start).count();

    printf(" Algorithm    | Data Type | parallel | time      | speed      | min rel err  | max rel err  | 90%% rel err\n");
    printf("-----------------------------------------------------------------------------------------------------------\n");
    printf(" std::log     | double    |  false   | %6.1f ms | %4.2f GiB/s\n", 1000.0 * lsd_s, N / lsd_s / (1<<30));
    printf(" mm256_log_pd | double    |  false   | %6.1f ms | %4.2f GiB/s | %e | %e | %e\n", 1000.0 * asd_s, N / asd_s / (1<<30), err_pd[idx_pd[0]], err_pd[idx_pd[N_pd-1]], err_pd[idx_pd[90*N_pd/100]]);
    printf(" std::log     | float     |  false   | %6.1f ms | %4.2f GiB/s\n", 1000.0 * lss_s, N / lss_s / (1<<30));
    printf(" mm256_log_ps | float     |  false   | %6.1f ms | %4.2f GiB/s | %e | %e | %e\n", 1000.0 * ass_s, N / ass_s / (1<<30), err_ps[idx_ps[0]], err_ps[idx_ps[N_ps-1]], err_ps[idx_ps[90*N_ps/100]]);
    printf(" std::log     | double    |  true    | %6.1f ms | %4.2f GiB/s\n", 1000.0 * lpd_s, N / lpd_s / (1<<30));
    printf(" mm256_log_pd | double    |  true    | %6.1f ms | %4.2f GiB/s | %e | %e | %e\n", 1000.0 * apd_s, N / apd_s / (1<<30), err_pd[idx_pd[0]], err_pd[idx_pd[N_pd-1]], err_pd[idx_pd[90*N_pd/100]]);
    printf(" std::log     | float     |  true    | %6.1f ms | %4.2f GiB/s\n", 1000.0 * lps_s, N / lps_s / (1<<30));
    printf(" mm256_log_ps | float     |  true    | %6.1f ms | %4.2f GiB/s | %e | %e | %e\n", 1000.0 * aps_s, N / aps_s / (1<<30), err_ps[idx_ps[0]], err_ps[idx_ps[N_ps-1]], err_ps[idx_ps[90*N_ps/100]]);

    _mm_free(data_pd);
    _mm_free(data_ps);
    _mm_free(outl_pd);
    _mm_free(outl_ps);
    _mm_free(outa_pd);
    _mm_free(outa_ps);
    _mm_free(err_pd);
    _mm_free(err_ps);

    free(idx_pd);
    free(idx_ps);

    return 0;
}
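
The chrono aliases hrc, ms and sd used throughout this benchmark are defined outside the snippet. Assumed definitions (hypothetical, not taken from the original) that make the timing reports and the GiB/s arithmetic consistent would be:

#include <chrono>

// Assumed aliases; the original defines them elsewhere in the program.
using hrc = std::chrono::high_resolution_clock;  // hrc::now() timestamps
using ms  = std::chrono::milliseconds;           // integer millisecond reports
using sd  = std::chrono::duration<double>;       // seconds as double, for GiB/s math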