/****************************************************************** * * SPLIT RADIX PRECOMPUTED AND VECTORIZED FFT MULTIPLICATION * ******************************************************************/ void sr_vector_mul(ring_t *r, const ring_t *x, const ring_t *y){ // printf("\n\n**************split-radix FAST**************\n"); fft_vector_forward(&vctr_x,x); fft_vector_forward(&vctr_y,y); __m256d real_x,imag_x,real_y,imag_y,imag_temp,real_temp; // double a,b,c,d; for (int i = 0; i < CPLXDIM; i+=4) { real_x = _mm256_load_pd(vctr_x.real+i); imag_x = _mm256_load_pd(vctr_x.imag+i); real_y = _mm256_load_pd(vctr_y.real+i); imag_y = _mm256_load_pd(vctr_y.imag+i); //(a + ib) * (c + id) = (ac - bd) + i(ad+bc) //real_temp = bd real_temp = _mm256_mul_pd(imag_x,imag_y); //imag_temp = ad imag_temp = _mm256_mul_pd(real_x,imag_y); real_x = _mm256_fmsub_pd(real_x,real_y,real_temp); imag_x = _mm256_fmadd_pd(imag_x,real_y,imag_temp); real_y = _mm256_set1_pd(CPLXDIM); real_x = _mm256_div_pd(real_x,real_y); imag_x = _mm256_div_pd(imag_x,real_y); _mm256_store_pd(vctr_res.real+i,real_x); _mm256_store_pd(vctr_res.imag+i,imag_x); } fft_vector_backward(&vctr_res,r); }
/****************************************************************** * * NEGACYCLIC FFT LOOK UP TABLE * ******************************************************************/ void negacyc_mul(ring_t *r, const ring_t *x, const ring_t *y) { phi_forward(&vector_x,x); phi_forward(&vector_y,y); __m256d real_x,imag_x,real_y,imag_y,imag_temp,real_temp,dim; dim = _mm256_set1_pd(CPLXDIM); // double a,b,c,d; for (int i = 0; i < CPLXDIM; i+=4) { real_x = _mm256_load_pd(vector_x.real+i); imag_x = _mm256_load_pd(vector_x.imag+i); real_y = _mm256_load_pd(vector_y.real+i); imag_y = _mm256_load_pd(vector_y.imag+i); //(a + ib) * (c + id) = (ac - bd) + i(ad+bc) //real_temp = bd real_temp = _mm256_mul_pd(imag_x,imag_y); //imag_temp = ad imag_temp = _mm256_mul_pd(real_x,imag_y); real_x = _mm256_fmsub_pd(real_x,real_y,real_temp); imag_x = _mm256_fmadd_pd(imag_x,real_y,imag_temp); real_x = _mm256_div_pd(real_x,dim); imag_x = _mm256_div_pd(imag_x,dim); _mm256_store_pd(vector_res.real+i,real_x); _mm256_store_pd(vector_res.imag+i,imag_x); } phi_backward(&vector_res,r); // print_cplx(&vec_res,CPLXDIM); }
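/*
 * A minimal scalar sketch of the pointwise step that sr_vector_mul and
 * negacyc_mul vectorize above: (a + ib)(c + id) = (ac - bd) + i(ad + bc),
 * followed by the 1/CPLXDIM scaling. The split real/imag layout (cplx_t)
 * is a hypothetical stand-in inferred from the loads above, not the
 * project's actual vector type.
 */
typedef struct { double real[CPLXDIM]; double imag[CPLXDIM]; } cplx_t;

static void pointwise_mul_ref(cplx_t *res, const cplx_t *x, const cplx_t *y)
{
  for (int i = 0; i < CPLXDIM; ++i) {
    double a = x->real[i], b = x->imag[i];
    double c = y->real[i], d = y->imag[i];
    res->real[i] = (a * c - b * d) / CPLXDIM;
    res->imag[i] = (a * d + b * c) / CPLXDIM;
  }
}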
// grid must be initialized to 0s. void gridKernel_scatter( const complexd gcf[w_planes][over][over][max_supp][max_supp] , Double4c grid[grid_size][grid_size] // , const Pregridded uvw[baselines][timesteps][channels] // , const Double4c vis[baselines][timesteps][channels] , const Pregridded uvw[baselines * timesteps * channels] , const Double4c vis[baselines * timesteps * channels] ) { for (int sv = 0; sv < max_supp; sv++) { // Moved from 2-levels below according to Romein for (int i = 0; i < baselines * timesteps * channels; i++) { auto gcf_layer = gcf[0][0][0] + uvw[i].gcf_layer_offset; #ifdef __AVX__ // We port Romein CPU code to doubles here (for MS2 we didn't) // vis0 covers XX and XY, vis1 -- YX and YY __m256d vis0 = _mm256_load_pd((const double *) &vis[i].XX); __m256d vis1 = _mm256_load_pd((const double *) &vis[i].YX); #endif for (int su = 0; su < max_supp; su++) { // NOTE: Romein writes about moving this to the very outer scope // (2 levels up) to make things more cache-friendly. // I don't know whether it is cache frendly // but what I definitely see gcc generates 4-times shorter code for it! // We need to investigate this! // for (int sv = 0; sv < max_supp; sv++) { int gsu, gsv; gsu = uvw[i].u + su - max_supp/2; gsv = uvw[i].v + sv - max_supp/2; #ifdef __AVX__ __m256d weight_r = _mm256_set1_pd(gcf_layer[su][sv].real()); __m256d weight_i = _mm256_set1_pd(gcf_layer[su][sv].imag()); /* _mm256_permute_pd(x, 5) swaps adjacent doubles pairs of x: d0, d1, d2, d3 goes to d1, d0, d3, d2 */ #define __DO(p,f) \ __m256d * gridptr##p = (__m256d *) &grid[gsu][gsv].f; \ __m256d tr##p = _mm256_mul_pd(weight_r, vis##p); \ __m256d ti##p = _mm256_mul_pd(weight_i, vis##p); \ __m256d tp##p = _mm256_permute_pd(ti##p, 5); \ __m256d t##p = _mm256_addsub_pd(tr##p, tp##p); \ gridptr##p[p] = _mm256_add_pd(gridptr##p[p], t##p) __DO(0, XX); __DO(1, YX); #else #define __GRID_POL(pol) grid[gsu][gsv].pol += vis[i].pol * gcf_layer[su][sv] __GRID_POL(XX); __GRID_POL(XY); __GRID_POL(YX); __GRID_POL(YY); #endif } } } }
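/*
 * A minimal sketch of the scalar-times-packed-complex idiom used in the AVX
 * branch above, assuming the same data layout (two complex doubles per
 * __m256d as re0, im0, re1, im1). _mm256_permute_pd(x, 5) swaps each (re, im)
 * pair and _mm256_addsub_pd subtracts in even lanes and adds in odd lanes,
 * giving (wr*re - wi*im, wr*im + wi*re) per complex element. This helper is
 * illustrative only, not part of the gridding code.
 */
#include <immintrin.h>

static inline __m256d cmul_scalar_by_packed(double wr, double wi, __m256d vis)
{
  __m256d weight_r = _mm256_set1_pd(wr);
  __m256d weight_i = _mm256_set1_pd(wi);
  __m256d tr = _mm256_mul_pd(weight_r, vis);  /* wr*re, wr*im, ... */
  __m256d ti = _mm256_mul_pd(weight_i, vis);  /* wi*re, wi*im, ... */
  __m256d tp = _mm256_permute_pd(ti, 5);      /* wi*im, wi*re, ... */
  return _mm256_addsub_pd(tr, tp);            /* complex product   */
}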
void sum_avx(double* c, double* a, double* b, int len)
{
  __m256d rA_AVX, rB_AVX, rC_AVX;   // variables for AVX

  for (int i = 0; i < len; i += 4)
  {
    rA_AVX = _mm256_load_pd(&a[i]);
    rB_AVX = _mm256_load_pd(&b[i]);
    rC_AVX = _mm256_add_pd(rA_AVX, rB_AVX);
    _mm256_store_pd(&c[i], rC_AVX);
  }
}
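/*
 * Hedged usage sketch for sum_avx: the aligned load/store intrinsics above
 * require all three pointers to be 32-byte aligned and len to be a multiple
 * of 4, otherwise the final iteration reads and writes past the end of the
 * arrays (or faults on the misaligned access).
 */
#include <immintrin.h>

int sum_avx_example(void)
{
  const int len = 1024;                                  /* multiple of 4 */
  double *a = (double*)_mm_malloc(len * sizeof(double), 32);
  double *b = (double*)_mm_malloc(len * sizeof(double), 32);
  double *c = (double*)_mm_malloc(len * sizeof(double), 32);
  if (!a || !b || !c) return 1;
  for (int i = 0; i < len; ++i) { a[i] = i; b[i] = 2.0 * i; }

  sum_avx(c, a, b, len);                                 /* c[i] = a[i] + b[i] */

  _mm_free(a); _mm_free(b); _mm_free(c);
  return 0;
}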
// multiply *p by v and applied to all n COREARRAY_DLL_DEFAULT void vec_f64_mul(double *p, size_t n, double v) { #if defined(COREARRAY_SIMD_AVX) const __m256d v4 = _mm256_set1_pd(v); switch ((size_t)p & 0x1F) { case 0x08: if (n > 0) { (*p++) *= v; n--; } case 0x10: if (n > 0) { (*p++) *= v; n--; } case 0x18: if (n > 0) { (*p++) *= v; n--; } case 0x00: for (; n >= 4; n-=4) { _mm256_store_pd(p, _mm256_mul_pd(_mm256_load_pd(p), v4)); p += 4; } if (n >= 2) { _mm_store_pd(p, _mm_mul_pd(_mm_load_pd(p), _mm256_castpd256_pd128(v4))); p += 2; n -= 2; } break; default: for (; n >= 4; n-=4) { _mm256_storeu_pd(p, _mm256_mul_pd(_mm256_loadu_pd(p), v4)); p += 4; } if (n >= 2) { _mm_storeu_pd(p, _mm_mul_pd(_mm_loadu_pd(p), _mm256_castpd256_pd128(v4))); p += 2; n -= 2; } } #elif defined(COREARRAY_SIMD_SSE2) const __m128d v2 = _mm_set1_pd(v); switch ((size_t)p & 0x0F) { case 0x08: if (n > 0) { (*p++) *= v; n--; } case 0x00: for (; n >= 2; n-=2, p+=2) _mm_store_pd(p, _mm_mul_pd(_mm_load_pd(p), v2)); break; default: for (; n >= 2; n-=2, p+=2) _mm_storeu_pd(p, _mm_mul_pd(_mm_loadu_pd(p), v2)); } #endif for (; n > 0; n--) (*p++) *= v; }
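/*
 * A simpler sketch of the head-peeling idea behind vec_f64_mul: the switch on
 * ((size_t)p & 0x1F) above falls through to consume up to three leading
 * doubles so the main loop can use aligned loads and stores. The same idea
 * written with an explicit peel loop (illustrative, not the library's code):
 */
#include <immintrin.h>
#include <stdint.h>
#include <stddef.h>

static void scale_f64_sketch(double *p, size_t n, double v)
{
  /* peel scalar elements until p is 32-byte aligned (or n runs out) */
  while (n > 0 && ((uintptr_t)p & 0x1F) != 0) { *p++ *= v; n--; }

  const __m256d v4 = _mm256_set1_pd(v);
  for (; n >= 4; n -= 4, p += 4)
    _mm256_store_pd(p, _mm256_mul_pd(_mm256_load_pd(p), v4));

  for (; n > 0; n--)   /* scalar tail */
    *p++ *= v;
}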
irreg_poly_area_func_sign(double, _avx) { if (__builtin_expect(is_null(cords) || cords_len == 0, 0)) return 0; __m256d curr, forw, coef_0, coef_1, end = _mm256_load_pd((const double *)cords), accum_sum = _mm256_setzero_pd(); double accum_sum_aux; unsigned long index; for (index = 0; index < (cords_len - 4); index += 4) { curr = end; // x0,y0,x1,y1 forw = _mm256_load_pd((const double *)&cords[index + 2]); // x2,y2,x3,y3 end = _mm256_load_pd((const double *)&cords[index + 4]); // x4,y4,x5,y5 coef_0 = _mm256_permute2f128_pd(curr, forw, 0b00110001); // x1, y1, x3, y3 coef_1 = _mm256_permute2f128_pd(forw, end, 0b00100000); // x2, y2, x4, y4 //_mm256_hsub_pd(a, b) == a0 - a1, b0 - b1, a2 - a3, b2 - b3 accum_sum = _mm256_add_pd( accum_sum, _mm256_hsub_pd( // x0*y1 - y0*x1, x1*y2 - y1x2, x2*y3 - y2*x3, x3*y4 - y3*x4 _mm256_mul_pd( // x0*y1, y0*x1, x2*y3, y2*x3 _mm256_permute2f128_pd(curr, forw, 0b00100000), // x0, y0, x2, y2 _mm256_shuffle_pd(coef_0, coef_0, 0b0101) // y1, x1, y3, x3 ), _mm256_mul_pd(coef_0, _mm256_shuffle_pd(coef_1, coef_1, 0b0101)) // y2, x2, y4, x4 // ^^^^^^^^^^^^^^^ x1*y2, y1*x2, x3*y4, y3*x4 ) ); } accum_sum = _mm256_hadd_pd(accum_sum, _mm256_permute2f128_pd(accum_sum, accum_sum, 1)); // a0+a1, a2+a3, a2+a3, a0+a1 accum_sum = _mm256_hadd_pd(accum_sum, accum_sum); // a0+a1+a2+a3, ... for (accum_sum_aux = _mm_cvtsd_f64(_mm256_castpd256_pd128(accum_sum)); index < (cords_len - 1); index++) accum_sum_aux += _calc_diff_of_adj_prods(cords, index); return accum_sum_aux; // return scalar_half(scalar_abs(accum_sum_aux)); }
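/*
 * Scalar reference for the cross-product accumulation vectorized above (the
 * shoelace sum without the final abs/half, matching the commented-out
 * return). point_t is a hypothetical two-double point standing in for
 * whatever cords actually is in this code base.
 */
typedef struct { double x, y; } point_t;

static double poly_cross_sum_ref(const point_t *cords, unsigned long cords_len)
{
  double accum = 0.0;
  for (unsigned long i = 0; i + 1 < cords_len; ++i)
    accum += cords[i].x * cords[i + 1].y - cords[i].y * cords[i + 1].x;
  return accum;  /* a caller computing the area would take fabs(accum) / 2 */
}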
void printcounters(struct counter *ctrs, uint64_t duration) { struct metrics s = {0}; uint64_t thisBytesWritten = pcm->bytesWritten(); uint64_t thisBytesRead = pcm->bytesRead(); memset(threadspercore, 0, gbl.ncores * sizeof(int)); s.timestamp = _rdtsc(); s.duration = duration; for (int cpu = 0; cpu < gbl.ncpus; ++cpu) { double delta[NEVENTS]; // volatile because another thread is changing it. volatile struct counter *p = &ctrs[cpu]; for (int i = 0; i < NEVENTS; ++i) { union { __m256d c; uint64_t values[4]; } t; t.c = _mm256_load_pd((const double *)&p->counts[i][0]); delta[i] = perf_scale_delta(t.values, lastctr[cpu].counts[i]); _mm256_store_pd((double *)&lastctr[cpu].counts[i][0], t.c); if (delta[i] < 0) delta[i] = 0; sevents[i] += delta[i]; } //printf("clocks %g duration %lu\n", delta[clocks], duration); if (2*delta[clocks] > duration) { int thiscore = pcm->getSocketId(cpu) * gbl.corespersocket + pcm->getCoreId(cpu); ++s.nthreads; ++threadspercore[thiscore]; } s.dsimd += delta[simd_dp]; s.dsse += delta[sse_dp]; s.dscalar += delta[scalar_dp]; s.ssimd += delta[simd_sp]; s.ssse += delta[sse_sp]; s.sscalar += delta[scalar_sp]; s.instrs += delta[instrs]; } s.rbytes = thisBytesRead - lastBytesRead; s.wbytes = thisBytesWritten - lastBytesWritten; lastBytesRead = thisBytesRead; lastBytesWritten = thisBytesWritten; for (int i = 0; i < gbl.ncores; ++i) if (threadspercore[i]) ++s.ncores; sample(&s); }
double HodgkinHuxley::dV(double *V, double I) { const double C = 1.0; const double gNa = 120.0; const double gK = 36.0; const double gL = 0.3; const double ENa = 50.0; const double EK = -77.0; const double EL = -54.4; #ifdef __AVX__ /* AVX is an instruction set from Intel which allows simultaneous operation on 4 doubles. Seems to be slower than optimized FPU, though. */ double Va[] __attribute__ ((aligned (32))) = {V[0], V[0], V[0], 1.0}, Ea[] __attribute__ ((aligned (32))) = {EL, ENa, EK, 0.0}, Ga[] __attribute__ ((aligned (32))) = {-gL, -gNa * pow(V[2], 3.0) * V[3], -gK * pow(V[1], 4.0), I}; // load V __m256d Vr = _mm256_load_pd(Va); // load E __m256d Er = _mm256_load_pd(Ea); // load G __m256d Gr = _mm256_load_pd(Ga); // subtract Vr = _mm256_sub_pd(Vr, Er); // dot product (why does intel not have _mm256_dp_pd ?) Vr = _mm256_mul_pd(Vr, Gr); __m256d temp = _mm256_hadd_pd(Vr, Vr); __m128d lo128 = _mm256_extractf128_pd(temp, 0); __m128d hi128 = _mm256_extractf128_pd(temp, 1); __m128d dotproduct = _mm_add_pd(lo128, hi128); double sseVal; // store _mm_storel_pd(&sseVal, dotproduct); sseVal /= C; return sseVal; #else return (-gL * (V[0] - EL) - gNa * pow(V[2], 3.0) * V[3] * (V[0] - ENa) - gK * pow(V[1], 4.0) * (V[0] - EK) + I) / C; #endif }
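/*
 * A minimal sketch of the horizontal-sum pattern used at the end of dV above
 * (there is no _mm256_dp_pd, so the 4-element dot product finishes with this
 * reduction): hadd folds adjacent pairs within each 128-bit lane, then the
 * two lanes are added and the low element is extracted.
 */
#include <immintrin.h>

static inline double hsum256_pd(__m256d v)
{
  __m256d t   = _mm256_hadd_pd(v, v);          /* v0+v1, v0+v1, v2+v3, v2+v3 */
  __m128d lo  = _mm256_castpd256_pd128(t);     /* v0+v1, v0+v1               */
  __m128d hi  = _mm256_extractf128_pd(t, 1);   /* v2+v3, v2+v3               */
  __m128d sum = _mm_add_sd(lo, hi);            /* total in the low lane      */
  return _mm_cvtsd_f64(sum);
}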
inline void transpose_4x4block_AVX_64(double* A, double* B, const size_t lda, const size_t ldb)
{
  __m256d row0 = _mm256_load_pd(&A[0*ldb]);
  __m256d row1 = _mm256_load_pd(&A[1*ldb]);
  __m256d row2 = _mm256_load_pd(&A[2*ldb]);
  __m256d row3 = _mm256_load_pd(&A[3*ldb]);
  __m256d tmp3, tmp2, tmp1, tmp0;

  tmp0 = _mm256_unpacklo_pd(row0, row1);
  tmp1 = _mm256_unpackhi_pd(row0, row1);
  tmp2 = _mm256_unpacklo_pd(row2, row3);
  tmp3 = _mm256_unpackhi_pd(row2, row3);

  row0 = _mm256_permute2f128_pd(tmp0, tmp2, 0x20);
  row1 = _mm256_permute2f128_pd(tmp1, tmp3, 0x20);
  row2 = _mm256_permute2f128_pd(tmp0, tmp2, 0x31);
  row3 = _mm256_permute2f128_pd(tmp1, tmp3, 0x31);

  _mm256_store_pd(&B[0*lda], row0);
  _mm256_store_pd(&B[1*lda], row1);
  _mm256_store_pd(&B[2*lda], row2);
  _mm256_store_pd(&B[3*lda], row3);
}
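/*
 * Hedged usage sketch for the 4x4 kernel above: transpose a square row-major
 * n x n matrix by tiling it into 4x4 blocks (assumes n is a multiple of 4 and
 * both buffers are 32-byte aligned, so every block row is aligned too). The
 * kernel reads A with stride ldb and writes B with stride lda, so a common
 * leading dimension n is passed for both arguments here.
 */
#include <stddef.h>

static void transpose_blocked_AVX_64(double* A, double* B, const size_t n)
{
  for (size_t i = 0; i < n; i += 4)
    for (size_t j = 0; j < n; j += 4)
      transpose_4x4block_AVX_64(&A[i * n + j], &B[j * n + i], n, n);
}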
/****************************************************************** * * SPLIT RADIX PRECOMPUTED AND VECTORIZED NON RECURSIVE FFT MULTIPLICATION * ******************************************************************/ void sr_vector_nonrec_mul(ring_t *r, const ring_t *x, const ring_t *y){ fft_vector_nonrec_forward(&vec_x,x); fft_vector_nonrec_forward(&vec_y,y); __m256d real_x,imag_x,real_y,imag_y,imag_temp,real_temp; // double a,b,c,d; for (int i = 0; i < CPLXDIM; i+=4) { real_x = _mm256_load_pd(vec_x.real+i); imag_x = _mm256_load_pd(vec_x.imag+i); real_y = _mm256_load_pd(vec_y.real+i); imag_y = _mm256_load_pd(vec_y.imag+i); //(a + ib) * (c + id) = (ac - bd) + i(ad+bc) //real_temp = bd real_temp = _mm256_mul_pd(imag_x,imag_y); //imag_temp = ad imag_temp = _mm256_mul_pd(real_x,imag_y); //REPLACED FOR COMMENTED SECTION //real_x = ac // real_x = _mm256_mul_pd(real_x,real_y); // //imag_x = bc // imag_x = _mm256_mul_pd(imag_x,real_y); // //real_x = ac - bd => real_x - real_temp // real_x = _mm256_sub_pd(real_x,real_temp); // //imag_x = ad + bc => imag_temp + imag_x // imag_x = _mm256_add_pd(imag_x,imag_temp); //THESE ARE NOT WORKING real_x = _mm256_fmsub_pd(real_x,real_y,real_temp); imag_x = _mm256_fmadd_pd(imag_x,real_y,imag_temp); real_y = _mm256_set1_pd(CPLXDIM); real_x = _mm256_div_pd(real_x,real_y); imag_x = _mm256_div_pd(imag_x,real_y); _mm256_store_pd(vec_res.real+i,real_x); _mm256_store_pd(vec_res.imag+i,imag_x); } fft_vector_nonrec_backward(&vec_res,r); }
ALGEBRA_INLINE double vector_ps_double (const double* pa, const double* pb, size_t n)
{
  if(ALGEBRA_IS_ALIGNED(pa) && ALGEBRA_IS_ALIGNED(pb)) {
    size_t q = n/4;
    size_t r = n%4;
    double w = 0;

    if(q > 0) {
      __m256d acc = _mm256_setzero_pd();
      __m256d i1 = _mm256_load_pd(pa);
      __m256d j1 = _mm256_load_pd(pb);
      pa += 4;
      pb += 4;
      __m256d s = _mm256_mul_pd(i1, j1);
      acc = _mm256_add_pd(acc, s);

      while(--q != 0) {
        // load
        i1 = _mm256_load_pd(pa);
        j1 = _mm256_load_pd(pb);
        pa += 4;
        pb += 4;
        // multiply
        s = _mm256_mul_pd(i1, j1);
        // accumulate
        acc = _mm256_add_pd(acc, s);
      }

      // final sum
      // horizontal add
      acc = _mm256_hadd_pd(acc, acc);
      // swap the upper and lower 128 bits
      __m256d accp = _mm256_permute2f128_pd(acc, acc, 1);
      // vertical add
      acc = _mm256_add_pd(acc, accp);
      // extract
      _mm_store_sd(&w, _mm256_extractf128_pd(acc, 0));
    }
    return w + vector_ps_double_basic(pa, pb, r);
  }
  return vector_ps_double_basic(pa, pb, n);
}
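/*
 * The aligned path above hands its remainder (and the whole unaligned case)
 * to vector_ps_double_basic, which is not shown in this snippet. A plain
 * scalar dot product like the following is presumably what it does; this is
 * an assumption for illustration, not the library's actual code.
 */
ALGEBRA_INLINE double vector_ps_double_basic_sketch (const double* pa, const double* pb, size_t n)
{
  double w = 0.0;
  for (size_t i = 0; i < n; ++i)
    w += pa[i] * pb[i];
  return w;
}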
// this function assumes data is stored in col-major
// if data is in row major, call it like matmul4x4(B, A, C)
void matmul4x4(double *A, double *B, double *C)
{
  __m256d col[4], sum[4];

  //load every column into registers
  for(int i=0; i<4; i++)
    col[i] = _mm256_load_pd(&A[i*4]);

  for(int i=0; i<4; i++) {
    sum[i] = _mm256_setzero_pd();
    for(int j=0; j<4; j++) {
      sum[i] = _mm256_add_pd(_mm256_mul_pd(_mm256_set1_pd(B[i*4+j]), col[j]), sum[i]);
    }
  }

  for(int i=0; i<4; i++)
    _mm256_store_pd(&C[i*4], sum[i]);
}
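/*
 * Scalar reference for matmul4x4 above, using the same column-major
 * convention (element (row r, column c) stored at M[c*4 + r]). Useful for
 * checking the vectorized kernel against random inputs.
 */
static void matmul4x4_ref(const double *A, const double *B, double *C)
{
  for (int c = 0; c < 4; ++c)
    for (int r = 0; r < 4; ++r) {
      double s = 0.0;
      for (int k = 0; k < 4; ++k)
        s += A[k * 4 + r] * B[c * 4 + k];
      C[c * 4 + r] = s;
    }
}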
static inline void matmul_4xkxkx4(int lda, int K, double* a, double* b, double* c) { __m256d a_coli, bi0, bi1, bi2, bi3; __m256d c_col0, c_col1, c_col2, c_col3; /* layout of 4x4 c matrix 00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33 */ double* c01_ptr = c + lda; double* c02_ptr = c01_ptr + lda; double* c03_ptr = c02_ptr + lda; // load old value of c c_col0 = _mm256_loadu_pd(c); c_col1 = _mm256_loadu_pd(c01_ptr); c_col2 = _mm256_loadu_pd(c02_ptr); c_col3 = _mm256_loadu_pd(c03_ptr); // for every column of a (or every row of b) for (int i = 0; i < K; ++i) { a_coli = _mm256_load_pd(a); a += 4; bi0 = _mm256_broadcast_sd(b++); bi1 = _mm256_broadcast_sd(b++); bi2 = _mm256_broadcast_sd(b++); bi3 = _mm256_broadcast_sd(b++); c_col0 = _mm256_add_pd(c_col0, _mm256_mul_pd(a_coli, bi0)); c_col1 = _mm256_add_pd(c_col1, _mm256_mul_pd(a_coli, bi1)); c_col2 = _mm256_add_pd(c_col2, _mm256_mul_pd(a_coli, bi2)); c_col3 = _mm256_add_pd(c_col3, _mm256_mul_pd(a_coli, bi3)); } _mm256_storeu_pd(c, c_col0); _mm256_storeu_pd(c01_ptr, c_col1); _mm256_storeu_pd(c02_ptr, c_col2); _mm256_storeu_pd(c03_ptr, c_col3); }
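/*
 * A sketch of the rank-1 update at the heart of matmul_4xkxkx4 written with
 * FMA (requires an FMA-capable target, e.g. -mfma): each mul+add pair in the
 * loop body above fuses into one _mm256_fmadd_pd. This is a standalone
 * illustration with hypothetical names, not the kernel's own code.
 */
#include <immintrin.h>

static inline void rank1_update_4x4(__m256d a_coli, const double *b, __m256d c_col[4])
{
  c_col[0] = _mm256_fmadd_pd(a_coli, _mm256_broadcast_sd(b + 0), c_col[0]);
  c_col[1] = _mm256_fmadd_pd(a_coli, _mm256_broadcast_sd(b + 1), c_col[1]);
  c_col[2] = _mm256_fmadd_pd(a_coli, _mm256_broadcast_sd(b + 2), c_col[2]);
  c_col[3] = _mm256_fmadd_pd(a_coli, _mm256_broadcast_sd(b + 3), c_col[3]);
}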
/*! * \brief Load a packed vector from the given aligned memory location */ ETL_STATIC_INLINE(avx_simd_double) load(const double* memory) { return _mm256_load_pd(memory); }
// it moves horizontally inside a block (A upper triangular) void kernel_dtrmv_u_n_4_lib4(int kmax, double *A, double *x, double *y, int alg) { if(kmax<=0) return; const int lda = 4; int k; __m128d tmp0, z_0, y_0_1, a_00_10; __m256d zeros, ax_temp, a_00_10_20_30, a_01_11_21_31, a_02_12_22_32, a_03_13_23_33, x_0, x_1, x_2, x_3, y_0_1_2_3, y_0_1_2_3_b, y_0_1_2_3_c, y_0_1_2_3_d, z_0_1_2_3; zeros = _mm256_setzero_pd(); /* y_0_1_2_3 = _mm256_setzero_pd(); */ /* y_0_1_2_3_b = _mm256_setzero_pd(); */ /* y_0_1_2_3_c = _mm256_setzero_pd(); */ y_0_1_2_3_d = _mm256_setzero_pd(); // second col (avoid zero y_0_1) z_0 = _mm_loaddup_pd( &x[1] ); a_00_10 = _mm_load_pd( &A[0+lda*1] ); y_0_1 = _mm_mul_pd( a_00_10, z_0 ); // first col z_0 = _mm_load_sd( &x[0] ); a_00_10 = _mm_load_sd( &A[0+lda*0] ); tmp0 = _mm_mul_sd( a_00_10, z_0 ); y_0_1 = _mm_add_sd( y_0_1, tmp0 ); y_0_1_2_3_c = _mm256_castpd128_pd256( y_0_1 ); y_0_1_2_3_c = _mm256_blend_pd( y_0_1_2_3_c, y_0_1_2_3_d, 0xc ); // forth col (avoid zero y_0_1_2_3) x_3 = _mm256_broadcast_sd( &x[3] ); a_03_13_23_33 = _mm256_load_pd( &A[0+lda*3] ); y_0_1_2_3 = _mm256_mul_pd( a_03_13_23_33, x_3 ); // first col x_2 = _mm256_broadcast_sd( &x[2] ); x_2 = _mm256_blend_pd( x_2, zeros, 0x8 ); a_02_12_22_32 = _mm256_load_pd( &A[0+lda*2] ); y_0_1_2_3_b = _mm256_mul_pd( a_02_12_22_32, x_2 ); A += 4*lda; x += 4; k=4; for(; k<kmax-3; k+=4) { x_0 = _mm256_broadcast_sd( &x[0] ); x_1 = _mm256_broadcast_sd( &x[1] ); a_00_10_20_30 = _mm256_load_pd( &A[0+lda*0] ); a_01_11_21_31 = _mm256_load_pd( &A[0+lda*1] ); x_2 = _mm256_broadcast_sd( &x[2] ); x_3 = _mm256_broadcast_sd( &x[3] ); a_02_12_22_32 = _mm256_load_pd( &A[0+lda*2] ); a_03_13_23_33 = _mm256_load_pd( &A[0+lda*3] ); ax_temp = _mm256_mul_pd( a_00_10_20_30, x_0 ); y_0_1_2_3 = _mm256_add_pd( y_0_1_2_3, ax_temp ); ax_temp = _mm256_mul_pd( a_01_11_21_31, x_1 ); y_0_1_2_3_b = _mm256_add_pd( y_0_1_2_3_b, ax_temp ); ax_temp = _mm256_mul_pd( a_02_12_22_32, x_2 ); y_0_1_2_3_c = _mm256_add_pd( y_0_1_2_3_c, ax_temp ); ax_temp = _mm256_mul_pd( a_03_13_23_33, x_3 ); y_0_1_2_3_d = _mm256_add_pd( y_0_1_2_3_d, ax_temp ); A += 4*lda; x += 4; } y_0_1_2_3 = _mm256_add_pd( y_0_1_2_3, y_0_1_2_3_c ); y_0_1_2_3_b = _mm256_add_pd( y_0_1_2_3_b, y_0_1_2_3_d ); if(kmax%4>=2) { x_0 = _mm256_broadcast_sd( &x[0] ); x_1 = _mm256_broadcast_sd( &x[1] ); a_00_10_20_30 = _mm256_load_pd( &A[0+lda*0] ); a_01_11_21_31 = _mm256_load_pd( &A[0+lda*1] ); ax_temp = _mm256_mul_pd( a_00_10_20_30, x_0 ); y_0_1_2_3 = _mm256_add_pd( y_0_1_2_3, ax_temp ); ax_temp = _mm256_mul_pd( a_01_11_21_31, x_1 ); y_0_1_2_3_b = _mm256_add_pd( y_0_1_2_3_b, ax_temp ); A += 2*lda; x += 2; } y_0_1_2_3 = _mm256_add_pd( y_0_1_2_3, y_0_1_2_3_b ); if(kmax%2==1) { x_0 = _mm256_broadcast_sd( &x[0] ); a_00_10_20_30 = _mm256_load_pd( &A[0+lda*0] ); ax_temp = _mm256_mul_pd( a_00_10_20_30, x_0 ); y_0_1_2_3 = _mm256_add_pd( y_0_1_2_3, ax_temp ); } if(alg==0) { _mm256_storeu_pd(&y[0], y_0_1_2_3); } else if(alg==1) { z_0_1_2_3 = _mm256_loadu_pd( &y[0] ); z_0_1_2_3 = _mm256_add_pd ( z_0_1_2_3, y_0_1_2_3 ); _mm256_storeu_pd(&y[0], z_0_1_2_3); } else // alg==-1 { z_0_1_2_3 = _mm256_loadu_pd( &y[0] ); z_0_1_2_3 = _mm256_sub_pd ( z_0_1_2_3, y_0_1_2_3 ); _mm256_storeu_pd(&y[0], z_0_1_2_3); } }
static inline PetscErrorCode TensorContract_FMA(PetscInt dof,PetscInt P,PetscInt Q,const PetscReal Rf[],const PetscReal Sf[],const PetscReal Tf[],TensorMode tmode,const PetscScalar xx[],PetscScalar yy[]) { PetscFunctionBegin; if (tmode == TENSOR_TRANSPOSE) {PetscInt tmp = Q; Q = P; P = tmp;} { PetscReal R[Q][P],S[Q][P],T[Q][P]; const PetscScalar (*x)[P*P*P][NE] = (const PetscScalar(*)[P*P*P][NE])xx; PetscScalar (*y)[P*P*P][NE] = (PetscScalar(*)[Q*Q*Q][NE])yy; PetscScalar u[dof][Q*P*P][NE]_align,v[dof][Q*Q*P][NE]_align; for (PetscInt i=0; i<Q; i++) { for (PetscInt j=0; j<P; j++) { R[i][j] = tmode == TENSOR_EVAL ? Rf[i*P+j] : Rf[j*Q+i]; S[i][j] = tmode == TENSOR_EVAL ? Sf[i*P+j] : Sf[j*Q+i]; T[i][j] = tmode == TENSOR_EVAL ? Tf[i*P+j] : Tf[j*Q+i]; } } // u[l,a,j,k] = R[a,i] x[l,i,j,k] for (PetscInt l=0; l<dof; l++) { for (PetscInt a=0; a<Q; a++) { __m256d r[P]; for (PetscInt i=0; i<P; i++) r[i] = _mm256_set1_pd(R[a][i]); for (PetscInt jk=0; jk<P*P; jk++) { __m256d u_lajk = _mm256_setzero_pd(); for (PetscInt i=0; i<P; i++) { u_lajk = _mm256_fmadd_pd(r[i],_mm256_load_pd(x[l][i*P*P+jk]),u_lajk); } _mm256_store_pd(u[l][a*P*P+jk],u_lajk); } } } // v[l,a,b,k] = S[b,j] u[l,a,j,k] for (PetscInt l=0; l<dof; l++) { for (PetscInt b=0; b<Q; b++) { __m256d s[P]; for (int j=0; j<P; j++) s[j] = _mm256_set1_pd(S[b][j]); for (PetscInt a=0; a<Q; a++) { for (PetscInt k=0; k<P; k++) { __m256d v_labk = _mm256_setzero_pd(); for (PetscInt j=0; j<P; j++) { v_labk = _mm256_fmadd_pd(s[j],_mm256_load_pd(u[l][(a*P+j)*P+k]),v_labk); } _mm256_store_pd(v[l][(a*Q+b)*P+k],v_labk); } } } } // y[l,a,b,c] = T[c,k] v[l,a,b,k] for (PetscInt l=0; l<dof; l++) { for (PetscInt c=0; c<Q; c++) { __m256d t[P]; for (int k=0; k<P; k++) t[k] = _mm256_set1_pd(T[c][k]); for (PetscInt ab=0; ab<Q*Q; ab++) { __m256d y_labc = _mm256_load_pd(y[l][ab*Q+c]); for (PetscInt k=0; k<P; k++) { // for (PetscInt e=0; e<NE; e++) y[l][ab*Q+c][e] += T[c][k] * v[l][ab*P+k][e]; y_labc = _mm256_fmadd_pd(t[k],_mm256_load_pd(v[l][ab*P+k]),y_labc); } _mm256_store_pd(y[l][ab*Q+c],y_labc); } } } PetscLogFlops(dof*(Q*P*P*P+Q*Q*P*P+Q*Q*Q*P)*NE*2); } PetscFunctionReturn(0); }
// it moves vertically across blocks void kernel_dtrmv_u_t_1_lib4(int kmax, double *A, int sda, double *x, double *y, int alg) { /* if(kmax<=0) */ /* return;*/ const int lda = 4; double *tA, *tx; int k; __m256d tmp0, a_00_10_20_30, x_0_1_2_3, y_00; y_00 = _mm256_setzero_pd(); k=0; for(; k<kmax-3; k+=4) { x_0_1_2_3 = _mm256_loadu_pd( &x[0] ); a_00_10_20_30 = _mm256_load_pd( &A[0+lda*0] ); tmp0 = _mm256_mul_pd( a_00_10_20_30, x_0_1_2_3 ); y_00 = _mm256_add_pd( y_00, tmp0 ); A += 4 + (sda-1)*lda; x += 4; } __m128d tm0, a_00_10, a_01_11, x_0_1, y_0, y_1, y_0_1; tm0 = _mm256_extractf128_pd( y_00, 0x1 ); y_0 = _mm256_castpd256_pd128( y_00 ); y_0 = _mm_add_pd( y_0, tm0 ); if(k<kmax-1) { x_0_1 = _mm_loadu_pd( &x[0] ); a_00_10 = _mm_load_pd( &A[0+lda*0] ); tm0 = _mm_mul_pd( a_00_10, x_0_1 ); y_0 = _mm_add_pd( y_0, tm0 ); A += 2; x += 2; } x_0_1 = _mm_load_sd( &x[0] ); a_00_10 = _mm_load_sd( &A[0+lda*0] ); tm0 = _mm_mul_sd( a_00_10, x_0_1 ); y_0 = _mm_add_sd( y_0, tm0 ); y_0 = _mm_hadd_pd( y_0, y_0 ); if(alg==0) { _mm_store_sd(&y[0], y_0); } else if(alg==1) { y_0_1 = _mm_load_sd( &y[0] ); y_0_1 = _mm_add_sd( y_0_1, y_0 ); _mm_store_sd(&y[0], y_0_1); } else // alg==-1 { y_0_1 = _mm_load_sd( &y[0] ); y_0_1 = _mm_sub_sd( y_0_1, y_0 ); _mm_store_sd(&y[0], y_0_1); } }
int main() { constexpr size_t N = 100 << 20; constexpr size_t N_pd = N/sizeof(double); constexpr size_t N_ps = N/sizeof(float); printf("Comparing std::log to mm256_log_pd/mm256_log_ps with %zuMiB input data\n", N>>20); double * data_pd = (double*)_mm_malloc(N_pd*sizeof(double), 32); float * data_ps = (float*) _mm_malloc(N_ps*sizeof(float), 32); double * outl_pd = (double*)_mm_malloc(N_pd*sizeof(double), 32); float * outl_ps = (float*) _mm_malloc(N_ps*sizeof(float), 32); double * outa_pd = (double*)_mm_malloc(N_pd*sizeof(double), 32); float * outa_ps = (float*) _mm_malloc(N_ps*sizeof(float), 32); double * err_pd = (double*)_mm_malloc(N_pd*sizeof(double), 32); float * err_ps = (float*) _mm_malloc(N_ps*sizeof(float), 32); size_t * idx_pd = (size_t*)malloc(N_pd*sizeof(size_t)); size_t * idx_ps = (size_t*)malloc(N_ps*sizeof(size_t)); if(data_pd == nullptr || data_ps == nullptr || outl_pd == nullptr || outl_ps == nullptr || outa_pd == nullptr || outa_ps == nullptr || err_pd == nullptr || err_ps == nullptr) { return 1; } auto rng = std::mt19937(hrc::now().time_since_epoch().count()); printf("Filling double input data... "); fflush(stdout); for(size_t i = 0; i < N_pd; ++i) { data_pd[i] = /*100.0 */ std::generate_canonical<double, 64>(rng); } printf("done\n"); printf("Filling float input data... "); fflush(stdout); for(size_t i = 0; i < N_ps; ++i) { data_ps[i] = /*100.0f */ std::generate_canonical<float, 32>(rng); } printf("done\n\n"); printf("Testing serial run:\n\n"); printf("Running std::log double... "); fflush(stdout); auto log_pd_s_time_start = hrc::now(); for(size_t i = 0; i < N_pd; i += 8) { outl_pd[i+0] = std::log(data_pd[i+0]); outl_pd[i+1] = std::log(data_pd[i+1]); outl_pd[i+2] = std::log(data_pd[i+2]); outl_pd[i+3] = std::log(data_pd[i+3]); outl_pd[i+4] = std::log(data_pd[i+4]); outl_pd[i+5] = std::log(data_pd[i+5]); outl_pd[i+6] = std::log(data_pd[i+6]); outl_pd[i+7] = std::log(data_pd[i+7]); } auto log_pd_s_time_end = hrc::now(); printf("done in %lums\n", std::chrono::duration_cast<ms>(log_pd_s_time_end - log_pd_s_time_start).count()); printf("Running mm256_log_pd... "); fflush(stdout); auto avx_pd_s_time_start = hrc::now(); for(size_t i = 0; i < N_pd; i += 8) { _mm256_store_pd(outa_pd+i+0, mm256_log_pd(_mm256_load_pd(data_pd+i+0))); _mm256_store_pd(outa_pd+i+4, mm256_log_pd(_mm256_load_pd(data_pd+i+4))); } auto avx_pd_s_time_end = hrc::now(); printf("done in %lums\n", std::chrono::duration_cast<ms>(avx_pd_s_time_end - avx_pd_s_time_start).count()); printf("Running std::log float... "); fflush(stdout); auto log_ps_s_time_start = hrc::now(); for(size_t i = 0; i < N_ps; i += 16) { outl_ps[i+ 0] = std::log(data_ps[i+ 0]); outl_ps[i+ 1] = std::log(data_ps[i+ 1]); outl_ps[i+ 2] = std::log(data_ps[i+ 2]); outl_ps[i+ 3] = std::log(data_ps[i+ 3]); outl_ps[i+ 4] = std::log(data_ps[i+ 4]); outl_ps[i+ 5] = std::log(data_ps[i+ 5]); outl_ps[i+ 6] = std::log(data_ps[i+ 6]); outl_ps[i+ 7] = std::log(data_ps[i+ 7]); outl_ps[i+ 8] = std::log(data_ps[i+ 8]); outl_ps[i+ 9] = std::log(data_ps[i+ 9]); outl_ps[i+10] = std::log(data_ps[i+10]); outl_ps[i+11] = std::log(data_ps[i+11]); outl_ps[i+12] = std::log(data_ps[i+12]); outl_ps[i+13] = std::log(data_ps[i+13]); outl_ps[i+14] = std::log(data_ps[i+14]); outl_ps[i+15] = std::log(data_ps[i+15]); } auto log_ps_s_time_end = hrc::now(); printf("done in %lums\n", std::chrono::duration_cast<ms>(log_ps_s_time_end - log_ps_s_time_start).count()); printf("Running mm256_log_ps... 
"); fflush(stdout); auto avx_ps_s_time_start = hrc::now(); for(size_t i = 0; i < N_ps; i += 16) { _mm256_store_ps(outa_ps+i+0, mm256_log_ps(_mm256_load_ps(data_ps+i+0))); _mm256_store_ps(outa_ps+i+8, mm256_log_ps(_mm256_load_ps(data_ps+i+8))); } auto avx_ps_s_time_end = hrc::now(); printf("done in %lums\n", std::chrono::duration_cast<ms>(avx_ps_s_time_end - avx_ps_s_time_start).count()); printf("\n\nTesting parallel run:\n\n"); printf("Running std::log double... "); fflush(stdout); auto log_pd_p_time_start = hrc::now(); #pragma omp parallel for for(size_t i = 0; i < N_pd; i += 8) { outl_pd[i+0] = std::log(data_pd[i+0]); outl_pd[i+1] = std::log(data_pd[i+1]); outl_pd[i+2] = std::log(data_pd[i+2]); outl_pd[i+3] = std::log(data_pd[i+3]); outl_pd[i+4] = std::log(data_pd[i+4]); outl_pd[i+5] = std::log(data_pd[i+5]); outl_pd[i+6] = std::log(data_pd[i+6]); outl_pd[i+7] = std::log(data_pd[i+7]); } auto log_pd_p_time_end = hrc::now(); printf("done in %lums\n", std::chrono::duration_cast<ms>(log_pd_p_time_end - log_pd_p_time_start).count()); printf("Running mm256_log_pd... "); fflush(stdout); auto avx_pd_p_time_start = hrc::now(); #pragma omp parallel for for(size_t i = 0; i < N_pd; i += 8) { _mm256_store_pd(outa_pd+i+0, mm256_log_pd(_mm256_load_pd(data_pd+i+0))); _mm256_store_pd(outa_pd+i+4, mm256_log_pd(_mm256_load_pd(data_pd+i+4))); } auto avx_pd_p_time_end = hrc::now(); printf("done in %lums\n", std::chrono::duration_cast<ms>(avx_pd_p_time_end - avx_pd_p_time_start).count()); printf("Running std::log float... "); fflush(stdout); auto log_ps_p_time_start = hrc::now(); #pragma omp parallel for for(size_t i = 0; i < N_ps; i += 16) { outl_ps[i+ 0] = std::log(data_ps[i+ 0]); outl_ps[i+ 1] = std::log(data_ps[i+ 1]); outl_ps[i+ 2] = std::log(data_ps[i+ 2]); outl_ps[i+ 3] = std::log(data_ps[i+ 3]); outl_ps[i+ 4] = std::log(data_ps[i+ 4]); outl_ps[i+ 5] = std::log(data_ps[i+ 5]); outl_ps[i+ 6] = std::log(data_ps[i+ 6]); outl_ps[i+ 7] = std::log(data_ps[i+ 7]); outl_ps[i+ 8] = std::log(data_ps[i+ 8]); outl_ps[i+ 9] = std::log(data_ps[i+ 9]); outl_ps[i+10] = std::log(data_ps[i+10]); outl_ps[i+11] = std::log(data_ps[i+11]); outl_ps[i+12] = std::log(data_ps[i+12]); outl_ps[i+13] = std::log(data_ps[i+13]); outl_ps[i+14] = std::log(data_ps[i+14]); outl_ps[i+15] = std::log(data_ps[i+15]); } auto log_ps_p_time_end = hrc::now(); printf("done in %lums\n", std::chrono::duration_cast<ms>(log_ps_p_time_end - log_ps_p_time_start).count()); printf("Running mm256_log_ps... "); fflush(stdout); auto avx_ps_p_time_start = hrc::now(); #pragma omp parallel for for(size_t i = 0; i < N_ps; i += 16) { _mm256_store_ps(outa_ps+i+0, mm256_log_ps(_mm256_load_ps(data_ps+i+0))); _mm256_store_ps(outa_ps+i+8, mm256_log_ps(_mm256_load_ps(data_ps+i+8))); } auto avx_ps_p_time_end = hrc::now(); printf("done in %lums\n", std::chrono::duration_cast<ms>(avx_ps_p_time_end - avx_ps_p_time_start).count()); printf("\nCalculating errors... 
"); fflush(stdout); #pragma omp parallel for for(size_t i = 0; i < N_pd; ++i) { err_pd[i] = std::abs(1.0 - outa_pd[i]/outl_pd[i]); } #pragma omp parallel for for(size_t i = 0; i < N_ps; ++i) { err_ps[i] = std::abs(1.0 - outa_ps[i]/outl_ps[i]); } #pragma omp parallel for for(size_t i = 0; i < N_pd; ++i) { idx_pd[i] = i; } #pragma omp parallel for for(size_t i = 0; i < N_ps; ++i) { idx_ps[i] = i; } std::sort(idx_pd, idx_pd+N_pd, [&](size_t a, size_t b){ return err_pd[a] < err_pd[b]; }); std::sort(idx_ps, idx_ps+N_ps, [&](size_t a, size_t b){ return err_ps[a] < err_ps[b]; }); printf("done\n"); printf("\n\nSummary:\n"); double lsd_s = std::chrono::duration_cast<sd>(log_pd_s_time_end - log_pd_s_time_start).count(); double asd_s = std::chrono::duration_cast<sd>(avx_pd_s_time_end - avx_pd_s_time_start).count(); double lss_s = std::chrono::duration_cast<sd>(log_ps_s_time_end - log_ps_s_time_start).count(); double ass_s = std::chrono::duration_cast<sd>(avx_ps_s_time_end - avx_ps_s_time_start).count(); double lpd_s = std::chrono::duration_cast<sd>(log_pd_p_time_end - log_pd_p_time_start).count(); double apd_s = std::chrono::duration_cast<sd>(avx_pd_p_time_end - avx_pd_p_time_start).count(); double lps_s = std::chrono::duration_cast<sd>(log_ps_p_time_end - log_ps_p_time_start).count(); double aps_s = std::chrono::duration_cast<sd>(avx_ps_p_time_end - avx_ps_p_time_start).count(); printf(" Algorithm | Data Type | parallel | time | speed | min rel err | max rel err | 90%% rel err\n"); printf("-----------------------------------------------------------------------------------------------------------\n"); printf(" std::log | double | false | %6.1f ms | %4.2f GiB/s\n", 1000.0 * lsd_s, N / lsd_s / (1<<30)); printf(" mm256_log_pd | double | false | %6.1f ms | %4.2f GiB/s | %e | %e | %e\n", 1000.0 * asd_s, N / asd_s / (1<<30), err_pd[idx_pd[0]], err_pd[idx_pd[N_pd-1]], err_pd[idx_pd[90*N_pd/100]]); printf(" std::log | float | false | %6.1f ms | %4.2f GiB/s\n", 1000.0 * lss_s, N / lss_s / (1<<30)); printf(" mm256_log_ps | float | false | %6.1f ms | %4.2f GiB/s | %e | %e | %e\n", 1000.0 * ass_s, N / ass_s / (1<<30), err_ps[idx_ps[0]], err_ps[idx_ps[N_ps-1]], err_ps[idx_ps[90*N_ps/100]]); printf(" std::log | double | true | %6.1f ms | %4.2f GiB/s\n", 1000.0 * lpd_s, N / lpd_s / (1<<30)); printf(" mm256_log_pd | double | true | %6.1f ms | %4.2f GiB/s | %e | %e | %e\n", 1000.0 * apd_s, N / apd_s / (1<<30), err_pd[idx_pd[0]], err_pd[idx_pd[N_pd-1]], err_pd[idx_pd[90*N_pd/100]]); printf(" std::log | float | true | %6.1f ms | %4.2f GiB/s\n", 1000.0 * lps_s, N / lps_s / (1<<30)); printf(" mm256_log_ps | float | true | %6.1f ms | %4.2f GiB/s | %e | %e | %e\n", 1000.0 * aps_s, N / aps_s / (1<<30), err_ps[idx_ps[0]], err_ps[idx_ps[N_ps-1]], err_ps[idx_ps[90*N_ps/100]]); _mm_free(data_pd); _mm_free(data_ps); _mm_free(outl_pd); _mm_free(outl_ps); _mm_free(outa_pd); _mm_free(outa_ps); _mm_free(err_pd); _mm_free(err_ps); free(idx_pd); free(idx_ps); return 0; }
inline F64vec4 load(const double *r) { return _mm256_load_pd(r); }
void core::Vector3::normalize(void) { #if defined(VTX_USE_AVX) ALIGNED_32 platform::F64_t vector[] = {this->x, this->y, this->z, 0}; ALIGNED_32 platform::F64_t reciprocalVector[] = {1.0, 1.0, 1.0, 1.0}; __m256d simdvector; __m256d result; __m256d recp; simdvector = _mm256_load_pd(vector); recp = _mm256_load_pd(reciprocalVector); result = _mm256_mul_pd(simdvector, simdvector); result = _mm256_hadd_pd(result, result); result = _mm256_hadd_pd(result, result); result = _mm256_sqrt_pd(result); result = _mm256_div_pd(recp, result); simdvector = _mm256_mul_pd(simdvector, result); _mm256_store_pd(vector, simdvector); this->x = vector[0]; this->y = vector[1]; this->z = vector[2]; #elif defined(VTX_USE_SSE) // Must pad with a trailing 0, to store in 128-bit register ALIGNED_16 core::F32_t vector[] = {this->x, this->y, this->z, 0}; __m128 simdvector; __m128 result; simdvector = _mm_load_ps(vector); // (X^2, Y^2, Z^2, 0^2) result = _mm_mul_ps(simdvector, simdvector); // Add all elements together, giving us (X^2 + Y^2 + Z^2 + 0^2) result = _mm_hadd_ps(result, result); result = _mm_hadd_ps(result, result); // Calculate square root, giving us sqrt(X^2 + Y^2 + Z^2 + 0^2) result = _mm_sqrt_ps(result); // Calculate reciprocal, giving us 1 / sqrt(X^2 + Y^2 + Z^2 + 0^2) result = _mm_rcp_ps(result); // Finally, multiply the result with our original vector. simdvector = _mm_mul_ps(simdvector, result); _mm_store_ps(vector, simdvector); this->x = vector[0]; this->y = vector[1]; this->z = vector[2]; #else core::F64_t num = 1.0 / std::sqrt(std::pow(this->x, 2) + std::pow(this->y, 2) + std::pow(this->z, 2)); this->x *= num; this->y *= num; this->z *= num; #endif }
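/*
 * Note on the SSE branch above: _mm_rcp_ps is only an approximate reciprocal
 * (about 12 bits of precision), so the normalized vector can be noticeably
 * less accurate than the AVX branch, which uses an exact division. If full
 * single precision matters, one Newton-Raphson step refines the estimate; a
 * sketch (r1 = r0 * (2 - x * r0)):
 */
#include <xmmintrin.h>

static inline __m128 rcp_refined_ps(__m128 x)
{
  __m128 r   = _mm_rcp_ps(x);                 /* initial ~12-bit estimate */
  __m128 two = _mm_set1_ps(2.0f);
  return _mm_mul_ps(r, _mm_sub_ps(two, _mm_mul_ps(x, r)));
}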
ALGEBRA_INLINE void vector_addm_double_aligned_32 (double* v1, double lambda, const double* v2, size_t n)
{
  size_t k;

  __m256d l1 = _mm256_broadcast_sd(&lambda);
  __m256d l2 = _mm256_broadcast_sd(&lambda);
  __m256d l3 = _mm256_broadcast_sd(&lambda);
  __m256d l4 = _mm256_broadcast_sd(&lambda);

  size_t q = n / 16;
  size_t r = n % 16;
  if(q > 0) {
    if (ALGEBRA_IS_ALIGNED(v1) && ALGEBRA_IS_ALIGNED(v2)) {
      for (k=0;k<q;k++) {
        /* Load 4 vectors from each array */
        __m256d i1 = _mm256_load_pd(v1);
        __m256d j1 = _mm256_load_pd(v2);
        __m256d i2 = _mm256_load_pd(v1+4);
        __m256d j2 = _mm256_load_pd(v2+4);
        __m256d i3 = _mm256_load_pd(v1+8);
        __m256d j3 = _mm256_load_pd(v2+8);
        __m256d i4 = _mm256_load_pd(v1+12);
        __m256d j4 = _mm256_load_pd(v2+12);
        /* Multiply */
        j1 = _mm256_mul_pd(j1, l1);
        j2 = _mm256_mul_pd(j2, l2);
        j3 = _mm256_mul_pd(j3, l3);
        j4 = _mm256_mul_pd(j4, l4);
        /* Add */
        i1 = _mm256_add_pd(i1,j1);
        i2 = _mm256_add_pd(i2,j2);
        i3 = _mm256_add_pd(i3,j3);
        i4 = _mm256_add_pd(i4,j4);
        /* Store */
        _mm256_store_pd(v1, i1);
        _mm256_store_pd(v1+4, i2);
        _mm256_store_pd(v1+8, i3);
        _mm256_store_pd(v1+12, i4);
        v1 += 16;
        v2 += 16;
      }
    }
    else {
      for (k=0;k<q;k++) {
        /* Load 4 vectors from each array */
        __m256d i1 = _mm256_loadu_pd(v1);
        __m256d j1 = _mm256_loadu_pd(v2);
        __m256d i2 = _mm256_loadu_pd(v1+4);
        __m256d j2 = _mm256_loadu_pd(v2+4);
        __m256d i3 = _mm256_loadu_pd(v1+8);
        __m256d j3 = _mm256_loadu_pd(v2+8);
        __m256d i4 = _mm256_loadu_pd(v1+12);
        __m256d j4 = _mm256_loadu_pd(v2+12);
        /* Multiply */
        j1 = _mm256_mul_pd(j1, l1);
        j2 = _mm256_mul_pd(j2, l2);
        j3 = _mm256_mul_pd(j3, l3);
        j4 = _mm256_mul_pd(j4, l4);
        /* Add */
        i1 = _mm256_add_pd(i1,j1);
        i2 = _mm256_add_pd(i2,j2);
        i3 = _mm256_add_pd(i3,j3);
        i4 = _mm256_add_pd(i4,j4);
        /* Store */
        _mm256_storeu_pd(v1, i1);
        _mm256_storeu_pd(v1+4, i2);
        _mm256_storeu_pd(v1+8, i3);
        _mm256_storeu_pd(v1+12, i4);
        v1 += 16;
        v2 += 16;
      }
    }
  }

  for(k = 0 ; k<r ; k++)
    v1[k] += lambda*v2[k];
}
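/*
 * Scalar reference for vector_addm_double_aligned_32 above: v1 += lambda * v2
 * for n elements (a daxpy-style update), the same thing the remainder loop at
 * the end of that function already does. Handy for validating the unrolled
 * SIMD paths.
 */
ALGEBRA_INLINE void vector_addm_double_ref (double* v1, double lambda, const double* v2, size_t n)
{
  for (size_t k = 0; k < n; ++k)
    v1[k] += lambda * v2[k];
}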
/*! * \brief Load a packed vector from the given aligned memory location */ ETL_STATIC_INLINE(avx_simd_complex_double<etl::complex<double>>) load(const etl::complex<double>* memory) { return _mm256_load_pd(reinterpret_cast<const double*>(memory)); }
double bst_compute_129_m256_maskstore_root_aligned( void*_bst_obj, double* p, double* q, size_t nn ) { segments_t* mem = (segments_t*) _bst_obj; int n, i, r, l_end, j, l_end_pre; double t, e_tmp; double* e = mem->e, *w = mem->w; int* root = mem->r; __m256d v_tmp; __m256d v00, v01, v02, v03; __m256d v10, v11, v12, v13; __m256d v20, v21, v22, v23; __m256d v30, v31, v32, v33; __m256i v_cur_roots; __m256 v_rootmask0, v_rootmask1; // initialization // mem->n = nn; n = nn; // subtractions with n potentially negative. say hello to all the bugs int idx1, idx1_root; int idx2; int idx3, idx3_root; int pad_root, pad, pad_r; idx1 = ((int) mem->e_sz) - 1; idx1_root = ((int) mem->r_sz); // the conventio is that iteration i, idx1 points to the first element of line i+1 e[idx1++] = q[n]; // pad contains the padding for row i+1 // for row n it's always 3 pad = 3; pad_root = 7; for (i = n-1; i >= 0; --i) { idx1 -= 2*(n-i)+1 + pad; idx1_root -= 2*(n-i)+1 + pad_root; idx2 = idx1 + 1; e[idx1] = q[i]; w[idx1] = q[i]; for (j = i+1; j < n+1; ++j,++idx2) { e[idx2] = INFINITY; w[idx2] = w[idx2-1] + p[j-1] + q[j]; } idx2 += pad; // padding of line i+1 // idx2 now points to the first element of the next line idx3 = idx1; idx3_root = idx1_root; pad_r = pad; for (r = i; r < n; ++r) { pad_r = (pad_r+1)&3; // padding of line r+1 idx1 = idx3; idx1_root = idx3_root; l_end = idx2 + (n-r); // l_end points to the first entry after the current row e_tmp = e[idx1++]; idx1_root++; // calculate until a multiple of 8 doubles is left // 8 = 4 * 2 128-bit vectors l_end_pre = idx2 + ((n-r)&15); for( ; (idx2 < l_end_pre) && (idx2 < l_end); ++idx2 ) { t = e_tmp + e[idx2] + w[idx1]; if (t < e[idx1]) { e[idx1] = t; root[idx1_root] = r; } idx1++; idx1_root++; } v_tmp = _mm256_set_pd( e_tmp, e_tmp, e_tmp, e_tmp ); // execute the shit for 4 vectors of size 2 v_cur_roots = _mm256_set_epi32(r, r, r, r, r, r, r, r); for( ; idx2 < l_end; idx2 += 16 ) { v01 = _mm256_load_pd( &w[idx1 ] ); v11 = _mm256_load_pd( &w[idx1+ 4] ); v21 = _mm256_load_pd( &w[idx1+ 8] ); v31 = _mm256_load_pd( &w[idx1+12] ); v00 = _mm256_load_pd( &e[idx2 ] ); v01 = _mm256_add_pd( v01, v_tmp ); v10 = _mm256_load_pd( &e[idx2+ 4] ); v11 = _mm256_add_pd( v11, v_tmp ); v20 = _mm256_load_pd( &e[idx2+ 8] ); v21 = _mm256_add_pd( v21, v_tmp ); v30 = _mm256_load_pd( &e[idx2+12] ); v31 = _mm256_add_pd( v31, v_tmp ); v01 = _mm256_add_pd( v01, v00 ); v03 = _mm256_load_pd( &e[idx1 ] ); v11 = _mm256_add_pd( v11, v10 ); v13 = _mm256_load_pd( &e[idx1+ 4] ); v21 = _mm256_add_pd( v21, v20 ); v23 = _mm256_load_pd( &e[idx1+ 8] ); v31 = _mm256_add_pd( v31, v30 ); v33 = _mm256_load_pd( &e[idx1+12] ); v02 = _mm256_cmp_pd( v01, v03, _CMP_LT_OQ ); v12 = _mm256_cmp_pd( v11, v13, _CMP_LT_OQ ); v22 = _mm256_cmp_pd( v21, v23, _CMP_LT_OQ ); v32 = _mm256_cmp_pd( v31, v33, _CMP_LT_OQ ); _mm256_maskstore_pd( &e[idx1 ], _mm256_castpd_si256( v02 ), v01 ); _mm256_maskstore_pd( &e[idx1+ 4], _mm256_castpd_si256( v12 ), v11 ); v_rootmask0 = _mm256_insertf128_ps( _mm256_castps128_ps256( _mm256_cvtpd_ps(v02)), _mm256_cvtpd_ps(v12) , 1 ); _mm256_maskstore_pd( &e[idx1+ 8], _mm256_castpd_si256( v22 ), v21 ); _mm256_maskstore_pd( &e[idx1+12], _mm256_castpd_si256( v32 ), v31 ); v_rootmask1 = _mm256_insertf128_ps( _mm256_castps128_ps256( _mm256_cvtpd_ps(v22)), _mm256_cvtpd_ps(v32) , 1 ); _mm256_maskstore_ps( &root[idx1_root ], _mm256_castps_si256( v_rootmask0 ), _mm256_castsi256_ps( v_cur_roots ) ); _mm256_maskstore_ps( &root[idx1_root + 8], _mm256_castps_si256( v_rootmask1 ), _mm256_castsi256_ps( v_cur_roots ) ); idx1 
+= 16; idx1_root += 16; } idx2 += pad_r; idx3++; idx3_root++; } pad = (pad -1)&3; pad_root = (pad_root-1)&7; } // the index of the last item of the first row is ((n/4)+1)*4-1, due to the padding // if n is even, the total number of entries in the first // row of the table is odd, so we need padding return e[ ((n/4)+1)*4 - 1 ]; }
int main(void) { // std::cout<<std::endl<<" Compute inner product..."<<std::endl<<std::endl; // INIT VECTOR //double vec1 [_PBM_SIZE] __attribute__((aligned(_CBSIM_DBL_ALIGN_)));//__declspec(align(n)) //double vec2 [_PBM_SIZE] __attribute__((aligned(_CBSIM_DBL_ALIGN_))); //__declspec(align(_CBSIM_DBL_ALIGN_)) double vec1 [_PBM_SIZE]; //__declspec(align(_CBSIM_DBL_ALIGN_)) double vec2 [_PBM_SIZE]; //double *vec1 = _aligned_malloc(_PBM_SIZE*sizeof *vec1,_CBSIM_DBL_ALIGN_); //double *vec2 = _aligned_malloc(_PBM_SIZE*sizeof *vec2,_CBSIM_DBL_ALIGN_); double *vec1 =(double *)_mm_malloc(sizeof(double)*_PBM_SIZE,32); double *vec2 =(double *)_mm_malloc(sizeof(double)*_PBM_SIZE,32); double result = 0.0; // tbb::tick_count t1, t2; int loopsToDo = 10000; for (int i=0 ; i < _PBM_SIZE ; i++) { vec1[i] = static_cast<double>(i)*0.01; vec2[i] = static_cast<double>(i)*0.01; } // SERIAL *********************************************************************************** // t1 = tbb::tick_count::now(); for (int z=0 ; z < loopsToDo ; z++) { //__m256d ymm0; //__m256d ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7;//, ymm8, ymm9, ymm10, ymm11, ymm12, ymm13, ymm14, ymm15, ymm16, ymm17, ymm18; //ymm0 = _mm256_setzero_pd(); // accumulator //double res0 = 0.0, res1 = 0.0, res2 = 0.0, res3 = 0.0; //__m256d acc = _mm256_setzero_pd(); //double res[4] __attribute__((aligned(_CBSIM_DBL_ALIGN_))) = {0.0, 0.0, 0.0, 0.0}; result = 0.0; //double res[2] __attribute__((aligned(_CBSIM_DBL_ALIGN_))) = {0.0, 0.0}; for (int i=0 ; i < _PBM_SIZE; i+=8) { /* __m256d ymm1 = _mm256_load_pd(&vec1[i]); __m256d ymm2 = _mm256_load_pd(&vec2[i]); __m256d ymm3 = _mm256_mul_pd( ymm1, ymm2 ); __m128d xmm1 = _mm256_extractf128_pd(ymm3,0); __m128d xmm2 = _mm256_extractf128_pd(ymm3,1); __m128d xmm3 = _mm_hadd_pd(xmm1,xmm2); _mm_store_pd(&res[0],xmm3); //_mm256_store_pd(&res[0],ymm12); result += (res[0] + res[1]);// + (res[2] + res[3]); */ __assume_aligned(&vec1[0],32); __assume_aligned(&vec2[0],32); __m256d ymm1 = _mm256_load_pd(&vec1[i]); __m256d ymm2 = _mm256_load_pd(&vec2[i]); __m256d ymm3 = _mm256_mul_pd( ymm1, ymm2 ); __m256d ymm4 = _mm256_load_pd(&vec1[i+4]); __m256d ymm5 = _mm256_load_pd(&vec2[i+4]); __m256d ymm6 = _mm256_mul_pd( ymm4, ymm5 ); __m256d ymm7 = _mm256_add_pd( ymm3, ymm6); __m128d xmm1 = _mm256_extractf128_pd(ymm7,0); __m128d xmm2 = _mm256_extractf128_pd(ymm7,1);; __m128d xmm3 = _mm_hadd_pd(xmm1,xmm2); double res[2] __attribute__((aligned(_CBSIM_DBL_ALIGN_))) = {0.0, 0.0}; _mm_store_pd(&res[0],xmm3); //_mm256_store_pd(&res[0],ymm12); result += (res[0] + res[1]);// + (res[2] + res[3]); //__m256d ymm0 = _mm256_add_pd( ymm0, ymm7); /* //__assume_aligned(&vec1[0],32); //__assume_aligned(&vec2[0],32); __m256d ymm1 = _mm256_load_pd(&vec1[i]); __m256d ymm2 = _mm256_load_pd(&vec2[i]); __m256d ymm3 = _mm256_mul_pd( ymm1, ymm2 ); __m256d ymm4 = _mm256_load_pd(&vec1[i+4]); __m256d ymm5 = _mm256_load_pd(&vec2[i+4]); //__m256d ymm6 = _mm256_mul_pd( ymm4, ymm5 ); //__m256d ymm7 = _mm256_add_pd( ymm3, ymm6); __m256d ymm6 = _mm256_fmadd_pd (ymm4,ymm5,ymm3); //ymm0 = _mm256_add_pd( ymm0, ymm7); __m128d xmm1 = _mm256_extractf128_pd(ymm6,0); __m128d xmm2 = _mm256_extractf128_pd(ymm6,1);; __m128d xmm3 = _mm_hadd_pd(xmm1,xmm2); _mm_store_pd(&res[0],xmm3); //_mm256_store_pd(&res[0],ymm12); result += (res[0] + res[1]);// + (res[2] + res[3]); //_mm256_store_pd(&res[0],ymm6); //result_SIMD_INTRINSICS += (res[0] + res[1]) + (res[2] + res[3]); */ //#define _VER_AVX #ifdef _VER_AVX __m256d ymm1 = _mm256_load_pd(&vec1[i]); __m256d ymm2 = _mm256_load_pd(&vec2[i]); 
__m256d ymm3 = _mm256_mul_pd( ymm1, ymm2 ); __m256d ymm4 = _mm256_load_pd(&vec1[i+4]); __m256d ymm5 = _mm256_load_pd(&vec2[i+4]); __m256d ymm6 = _mm256_mul_pd( ymm4, ymm5 ); __m256d ymm7 = _mm256_load_pd(&vec1[i+8]); __m256d ymm8 = _mm256_load_pd(&vec2[i+8]); __m256d ymm9 = _mm256_mul_pd( ymm7, ymm8 ); __m256d ymm10 = _mm256_load_pd(&vec1[i+12]); __m256d ymm11 = _mm256_load_pd(&vec2[i+12]); __m256d ymm12 = _mm256_mul_pd( ymm10, ymm11 ); __m256d ymm13 = _mm256_add_pd( ymm3, ymm6); __m256d ymm14 = _mm256_add_pd( ymm9, ymm12); __m256d ymm15 = _mm256_add_pd( ymm13, ymm14); __m128d xmm1 = _mm256_extractf128_pd(ymm15,0); __m128d xmm2 = _mm256_extractf128_pd(ymm15,1);; __m128d xmm3 = _mm_hadd_pd(xmm1,xmm2); double res_SIMD_INTRINSICS[2] __attribute__((aligned(_CBSIM_DBL_ALIGN_))) = {0.0, 0.0}; _mm_store_pd(&res_SIMD_INTRINSICS[0],xmm3); result += (res_SIMD_INTRINSICS[0] + res_SIMD_INTRINSICS[1]); //ymm0 = _mm256_add_pd( ymm0, ymm13); //ymm0 = _mm256_add_pd( ymm0, ymm14); #endif //#define _VER_AVX2 #ifdef _VER_AVX2 __m256d ymm1 = _mm256_load_pd(&vec1[i]); __m256d ymm2 = _mm256_load_pd(&vec1[i+4]); __m256d ymm3 = _mm256_load_pd(&vec1[i+8]); __m256d ymm4 = _mm256_load_pd(&vec1[i+12]); //__m256d ymm13 = _mm256_load_pd(&vec1[i+16]); //__m256d ymm14 = _mm256_load_pd(&vec1[i+20]); //__m256d ymm15 = _mm256_load_pd(&vec1[i+24]); //__m256d ymm16 = _mm256_load_pd(&vec1[i+28]); __m256d ymm5 = _mm256_load_pd(&vec2[i]); __m256d ymm6 = _mm256_load_pd(&vec2[i+4]); __m256d ymm7 = _mm256_load_pd(&vec2[i+8]); __m256d ymm8 = _mm256_load_pd(&vec2[i+12]); //__m256d ymm17 = _mm256_load_pd(&vec2[i+16]); //__m256d ymm18 = _mm256_load_pd(&vec2[i+20]); //__m256d ymm19 = _mm256_load_pd(&vec2[i+24]); //__m256d ymm20 = _mm256_load_pd(&vec2[i+28]); __m256d ymm9 = _mm256_mul_pd(ymm1,ymm5); __m256d ymm10 = _mm256_fmadd_pd(ymm2,ymm6,ymm9); //__m256d ymm11 = _mm256_mul_pd(ymm3,ymm7); __m256d ymm11 = _mm256_fmadd_pd(ymm3,ymm7,ymm10); __m256d ymm12 = _mm256_fmadd_pd(ymm4,ymm8,ymm11); //ymm12 = _mm256_hadd_pd(ymm10,ymm12); //__m256d ymm21 = _mm256_fmadd_pd(ymm13,ymm17,ymm12); //__m256d ymm22 = _mm256_fmadd_pd(ymm14,ymm18,ymm21); //__m256d ymm23 = _mm256_fmadd_pd(ymm15,ymm19,ymm22); //__m256d ymm24 = _mm256_fmadd_pd(ymm16,ymm20,ymm23); __m128d xmm1 = _mm256_extractf128_pd(ymm12,0); __m128d xmm2 = _mm256_extractf128_pd(ymm12,1);; __m128d xmm3 = _mm_hadd_pd(xmm1,xmm2); double res[2] __attribute__((aligned(_CBSIM_DBL_ALIGN_))) = {0.0, 0.0}; _mm_store_pd(&res[0],xmm3); //_mm256_store_pd(&res[0],ymm12); result += (res[0] + res[1]);// + (res[2] + res[3]); #endif /* // Performing 4 dot product at one time ymm1 = _mm256_load_pd(&vec1[i]); // x[0] ymm2 = _mm256_load_pd(&vec1[i+4]); // x[1] ymm3 = _mm256_load_pd(&vec1[i+8]); // x[2] ymm4 = _mm256_load_pd(&vec1[i+12]); // x[3] ymm5 = _mm256_load_pd(&vec2[i]); // y[0] ymm6 = _mm256_load_pd(&vec2[i+4]); // y[1] ymm7 = _mm256_load_pd(&vec2[i+8]); // y[2] ymm8 = _mm256_load_pd(&vec2[i+12]); // y[3] ymm9 = _mm256_mul_pd( ymm1, ymm5 ); // xy0 ymm10 = _mm256_mul_pd( ymm2, ymm6 ); // xy1 ymm11 = _mm256_hadd_pd( ymm9, ymm10 ); // low to high: xy00+xy01 xy10+xy11 xy02+xy03 xy12+xy13 ymm12 = _mm256_mul_pd( ymm3, ymm7 ); // xy2 ymm13 = _mm256_mul_pd( ymm4, ymm8 ); // xy3 ymm14 = _mm256_hadd_pd( ymm12, ymm13 ); // low to high: xy20+xy21 xy30+xy31 xy22+xy23 xy32+xy33 ymm15 = _mm256_permute2f128_pd( ymm11, ymm14, 0x21 ); // low to high: xy02+xy03 xy12+xy13 xy20+xy21 xy30+xy31 ymm1 = _mm256_blend_pd( ymm11, ymm14, 0b1100); // low to high: xy00+xy01 xy10+xy11 xy22+xy23 xy32+xy33 ymm2 = _mm256_add_pd( ymm15, ymm1 
); ymm0 = _mm256_add_pd( ymm0, ymm2 ); */ /* __m256d x[4], y[4]; x[0] = _mm256_load_pd(&vec1[i]); x[1] = _mm256_load_pd(&vec1[i+4]); x[2] = _mm256_load_pd(&vec1[i+8]); x[3] = _mm256_load_pd(&vec1[i+12]); y[0] = _mm256_load_pd(&vec2[i]); y[1] = _mm256_load_pd(&vec2[i+4]); y[2] = _mm256_load_pd(&vec2[i+8]); y[3] = _mm256_load_pd(&vec2[i+12]); __m256d xy0 = _mm256_mul_pd( x[0], y[0] ); __m256d xy1 = _mm256_mul_pd( x[1], y[1] ); // low to high: xy00+xy01 xy10+xy11 xy02+xy03 xy12+xy13 __m256d temp01 = _mm256_hadd_pd( xy0, xy1 ); __m256d xy2 = _mm256_mul_pd( x[2], y[2] ); __m256d xy3 = _mm256_mul_pd( x[3], y[3] ); // low to high: xy20+xy21 xy30+xy31 xy22+xy23 xy32+xy33 __m256d temp23 = _mm256_hadd_pd( xy2, xy3 ); // low to high: xy02+xy03 xy12+xy13 xy20+xy21 xy30+xy31 __m256d swapped = _mm256_permute2f128_pd( temp01, temp23, 0x21 ); // low to high: xy00+xy01 xy10+xy11 xy22+xy23 xy32+xy33 __m256d blended = _mm256_blend_pd(temp01, temp23, 0b1100); __m256d dotproduct = _mm256_add_pd( swapped, blended ); */ //ymm0 = _mm256_add_pd(ymm0,dotproduct); /* __m128d xmm1 = _mm256_extractf128_pd(dotproduct,0); __m128d xmm2 = _mm256_extractf128_pd(dotproduct,1);; __m128d xmm3 = _mm_hadd_pd(xmm1,xmm2); double res[2] __attribute__((aligned(_CBSIM_DBL_ALIGN_))) = {0.0, 0.0}; _mm_store_pd(&res[0],xmm3); //_mm256_store_pd(&res[0],ymm12); result += (res[0] + res[1]);// + (res[2] + res[3]); */ // _mm256_store_pd(&res[0],dotproduct); // result += (res[0] + res[1]) + (res[2] + res[3]); //result_SIMD_INTRINSICS += dotproduct[0] + dotproduct[1] + dotproduct[2] + dotproduct[3]; //double res[4] __attribute__((aligned(_CBSIM_DBL_ALIGN_))); //_mm256_store_pd(&res[0],ymm0); //result_SIMD_INTRINSICS += res[0] + res[1] + res[2] + res[3]; //double* res = (double*)&ymm0; //result_SIMD_INTRINSICS += res[0] + res[1] + res[2] + res[3]; } //double* res = (double*)&ymm0; //result_SIMD_INTRINSICS += res[0] + res[1] + res[2] + res[3]; //double res[4] __attribute__((aligned(_CBSIM_DBL_ALIGN_))); //_mm256_store_pd(&res[0],ymm0); //result_SIMD_INTRINSICS += res[0] + res[1] + res[2] + res[3]; } // t2 = tbb::tick_count::now(); // double exec_time = 1000.0*(t2-t1).seconds(); //std::cout << std::setiosflags(std::ios::fixed) << std::setprecision(5); std::cout<<std::endl<<"RESULTS: " <<std::endl; std::cout<<"result_intrin ----------: "<< result << std::endl; //std::cout<<"result_intrin ----------: "<< result << ", time: " << 1000.0*(t2-t1).seconds() << " ms" << std::endl; std::cout<<std::endl<<"Program end. "<<std::endl<<std::endl; return 0; }
void ks_tanh_int_d8x4( int k, int rhs, double *h, // NOP double *u, double *aa, // NOP double *a, double *bb, // NOP double *b, double *w, double *c, ks_t *ker, aux_t *aux ) { int i, rhs_left; double scal = ker->scal; double cons = ker->cons; v4df_t c03_0, c03_1, c03_2, c03_3; v4df_t c47_0, c47_1, c47_2, c47_3; v4df_t tmpc03_0, tmpc03_1, tmpc03_2, tmpc03_3; v4df_t tmpc47_0, tmpc47_1, tmpc47_2, tmpc47_3; v4df_t u03, u47; v4df_t a03, a47, A03, A47; // prefetched A v4df_t b0, b1, b2, b3, B0; // prefetched B v4df_t c_tmp, aa_tmp, bb_tmp, w_tmp; // Rank-k update segment #include "ks_rank_k_int_d8x4.h" // Accumulate if ( aux->pc ) { tmpc03_0.v = _mm256_load_pd( (double*)( c ) ); c03_0.v = _mm256_add_pd( tmpc03_0.v, c03_0.v ); tmpc47_0.v = _mm256_load_pd( (double*)( c + 4 ) ); c47_0.v = _mm256_add_pd( tmpc47_0.v, c47_0.v ); tmpc03_1.v = _mm256_load_pd( (double*)( c + 8 ) ); c03_1.v = _mm256_add_pd( tmpc03_1.v, c03_1.v ); tmpc47_1.v = _mm256_load_pd( (double*)( c + 12 ) ); c47_1.v = _mm256_add_pd( tmpc47_1.v, c47_1.v ); tmpc03_2.v = _mm256_load_pd( (double*)( c + 16 ) ); c03_2.v = _mm256_add_pd( tmpc03_2.v, c03_2.v ); tmpc47_2.v = _mm256_load_pd( (double*)( c + 20 ) ); c47_2.v = _mm256_add_pd( tmpc47_2.v, c47_2.v ); tmpc03_3.v = _mm256_load_pd( (double*)( c + 24 ) ); c03_3.v = _mm256_add_pd( tmpc03_3.v, c03_3.v ); tmpc47_3.v = _mm256_load_pd( (double*)( c + 28 ) ); c47_3.v = _mm256_add_pd( tmpc47_3.v, c47_3.v ); } // Scale before the kernel evaluation c_tmp.v = _mm256_broadcast_sd( &scal ); c03_0.v = _mm256_mul_pd( c_tmp.v, c03_0.v ); c03_1.v = _mm256_mul_pd( c_tmp.v, c03_1.v ); c03_2.v = _mm256_mul_pd( c_tmp.v, c03_2.v ); c03_3.v = _mm256_mul_pd( c_tmp.v, c03_3.v ); c47_0.v = _mm256_mul_pd( c_tmp.v, c47_0.v ); c47_1.v = _mm256_mul_pd( c_tmp.v, c47_1.v ); c47_2.v = _mm256_mul_pd( c_tmp.v, c47_2.v ); c47_3.v = _mm256_mul_pd( c_tmp.v, c47_3.v ); // Shift before the kernel evaluation c_tmp.v = _mm256_broadcast_sd( &cons ); c03_0.v = _mm256_add_pd( c_tmp.v, c03_0.v ); c03_1.v = _mm256_add_pd( c_tmp.v, c03_1.v ); c03_2.v = _mm256_add_pd( c_tmp.v, c03_2.v ); c03_3.v = _mm256_add_pd( c_tmp.v, c03_3.v ); c47_0.v = _mm256_add_pd( c_tmp.v, c47_0.v ); c47_1.v = _mm256_add_pd( c_tmp.v, c47_1.v ); c47_2.v = _mm256_add_pd( c_tmp.v, c47_2.v ); c47_3.v = _mm256_add_pd( c_tmp.v, c47_3.v ); // Preload u03, u47 u03.v = _mm256_load_pd( (double*)u ); u47.v = _mm256_load_pd( (double*)( u + 4 ) ); // Prefetch u and w __asm__ volatile( "prefetcht0 0(%0) \n\t" : :"r"( u + 8 ) ); __asm__ volatile( "prefetcht0 0(%0) \n\t" : :"r"( w ) ); // c = tanh( c ); c03_0.v = _mm256_tanh_pd( c03_0.v ); c03_1.v = _mm256_tanh_pd( c03_1.v ); c03_2.v = _mm256_tanh_pd( c03_2.v ); c03_3.v = _mm256_tanh_pd( c03_3.v ); c47_0.v = _mm256_tanh_pd( c47_0.v ); c47_1.v = _mm256_tanh_pd( c47_1.v ); c47_2.v = _mm256_tanh_pd( c47_2.v ); c47_3.v = _mm256_tanh_pd( c47_3.v ); // Multiple rhs kernel summation. #include "ks_kernel_summation_int_d8x4.h" }
inline vector4d& vector4d::load_aligned(const double* src) { m_value = _mm256_load_pd(src); return *this; }
void gaussian_int_d8x6( int k, int rhs, //double *h, double *u, double *aa, double *a, double *bb, double *b, double *w, double *c, ks_t *ker, aux_t *aux ) { int i; double alpha = ker->scal; // 16 registers. v4df_t c03_0, c03_1, c03_2, c03_3, c03_4, c03_5; v4df_t c47_0, c47_1, c47_2, c47_3, c47_4, c47_5; v4df_t a03, a47, b0, b1; #include <rank_k_int_d8x6.h> #include <sq2nrm_int_d8x6.h> // Scale before the kernel evaluation a03.v = _mm256_broadcast_sd( &alpha ); c03_0.v = _mm256_mul_pd( a03.v, c03_0.v ); c03_1.v = _mm256_mul_pd( a03.v, c03_1.v ); c03_2.v = _mm256_mul_pd( a03.v, c03_2.v ); c03_3.v = _mm256_mul_pd( a03.v, c03_3.v ); c03_4.v = _mm256_mul_pd( a03.v, c03_4.v ); c03_5.v = _mm256_mul_pd( a03.v, c03_5.v ); c47_0.v = _mm256_mul_pd( a03.v, c47_0.v ); c47_1.v = _mm256_mul_pd( a03.v, c47_1.v ); c47_2.v = _mm256_mul_pd( a03.v, c47_2.v ); c47_3.v = _mm256_mul_pd( a03.v, c47_3.v ); c47_4.v = _mm256_mul_pd( a03.v, c47_4.v ); c47_5.v = _mm256_mul_pd( a03.v, c47_5.v ); // Prefetch u, w __asm__ volatile( "prefetcht0 0(%0) \n\t" : :"r"( u ) ); __asm__ volatile( "prefetcht0 0(%0) \n\t" : :"r"( w ) ); // c = exp( c ) c03_0.v = _mm256_exp_pd( c03_0.v ); c03_1.v = _mm256_exp_pd( c03_1.v ); c03_2.v = _mm256_exp_pd( c03_2.v ); c03_3.v = _mm256_exp_pd( c03_3.v ); c03_4.v = _mm256_exp_pd( c03_4.v ); c03_5.v = _mm256_exp_pd( c03_5.v ); c47_0.v = _mm256_exp_pd( c47_0.v ); c47_1.v = _mm256_exp_pd( c47_1.v ); c47_2.v = _mm256_exp_pd( c47_2.v ); c47_3.v = _mm256_exp_pd( c47_3.v ); c47_4.v = _mm256_exp_pd( c47_4.v ); c47_5.v = _mm256_exp_pd( c47_5.v ); // Preload u03, u47 a03.v = _mm256_load_pd( (double*) u ); a47.v = _mm256_load_pd( (double*)( u + 4 ) ); // Multiple rhs weighted sum. #include<weighted_sum_int_d8x6.h> //if ( u[ 0 ] != u[ 0 ] ) printf( "u[ 0 ] nan\n" ); //if ( u[ 1 ] != u[ 1 ] ) printf( "u[ 1 ] nan\n" ); //if ( u[ 2 ] != u[ 2 ] ) printf( "u[ 2 ] nan\n" ); //if ( u[ 3 ] != u[ 3 ] ) printf( "u[ 3 ] nan\n" ); //if ( u[ 4 ] != u[ 4 ] ) printf( "u[ 4 ] nan\n" ); //if ( u[ 5 ] != u[ 5 ] ) printf( "u[ 5 ] nan\n" ); //if ( u[ 6 ] != u[ 6 ] ) printf( "u[ 6 ] nan\n" ); //if ( u[ 7 ] != u[ 7 ] ) printf( "u[ 7 ] nan\n" ); //if ( w[ 0 ] != w[ 0 ] ) printf( "w[ 0 ] nan\n" ); //if ( w[ 1 ] != w[ 1 ] ) printf( "w[ 1 ] nan\n" ); //if ( w[ 2 ] != w[ 2 ] ) printf( "w[ 2 ] nan\n" ); //if ( w[ 3 ] != w[ 3 ] ) printf( "w[ 3 ] nan\n" ); //if ( w[ 4 ] != w[ 4 ] ) printf( "w[ 4 ] nan\n" ); //if ( w[ 5 ] != w[ 5 ] ) printf( "w[ 5 ] nan\n" ); }
// it moves horizontally inside a block
void kernel_dtrmv_u_n_8_lib4(int kmax, double *A0, int sda, double *x, double *y, int alg)
{
  if(kmax<=0)
    return;

  double *A1 = A0 + 4*sda;

  const int lda = 4;

  int k;

  __m128d
    tmp0,
    z_0, y_0_1, a_00_10;

  __m256d
    zeros, ax_temp,
    a_00_10_20_30, a_01_11_21_31,
    a_40_50_60_70, a_41_51_61_71,
    x_0, x_1,
    y_0_1_2_3, y_0_1_2_3_b, z_0_1_2_3,
    y_4_5_6_7, y_4_5_6_7_b, z_4_5_6_7;

/*  y_0_1_2_3   = _mm256_setzero_pd(); */
/*  y_4_5_6_7   = _mm256_setzero_pd(); */
/*  y_0_1_2_3_b = _mm256_setzero_pd(); */
/*  y_4_5_6_7_b = _mm256_setzero_pd(); */
  zeros = _mm256_setzero_pd();

/*  y_0_1_2_3   = _mm256_setzero_pd(); */
/*  y_0_1_2_3_b = _mm256_setzero_pd(); */
/*  y_0_1_2_3_c = _mm256_setzero_pd(); */
/*  y_0_1_2_3_d = _mm256_setzero_pd(); */

  // upper triangular

  // second col (avoid zero y_0_1)
  z_0     = _mm_loaddup_pd( &x[1] );
  a_00_10 = _mm_load_pd( &A0[0+lda*1] );
  y_0_1   = _mm_mul_pd( a_00_10, z_0 );

  // first col
  z_0     = _mm_load_sd( &x[0] );
  a_00_10 = _mm_load_sd( &A0[0+lda*0] );
  tmp0    = _mm_mul_sd( a_00_10, z_0 );
  y_0_1   = _mm_add_sd( y_0_1, tmp0 );

  y_0_1_2_3_b = _mm256_castpd128_pd256( y_0_1 );
  y_0_1_2_3_b = _mm256_blend_pd( y_0_1_2_3_b, zeros, 0xc ); // zero the (undefined) upper two lanes

  // fourth col (avoid zero y_0_1_2_3)
  x_1           = _mm256_broadcast_sd( &x[3] );
  a_01_11_21_31 = _mm256_load_pd( &A0[0+lda*3] );
  y_0_1_2_3     = _mm256_mul_pd( a_01_11_21_31, x_1 );

  // third col
  x_0           = _mm256_broadcast_sd( &x[2] );
  x_0           = _mm256_blend_pd( x_0, zeros, 0x8 );
  a_00_10_20_30 = _mm256_load_pd( &A0[0+lda*2] );
  ax_temp       = _mm256_mul_pd( a_00_10_20_30, x_0 );
  y_0_1_2_3_b   = _mm256_add_pd( y_0_1_2_3_b, ax_temp );

  A0 += 4*lda;
  A1 += 4*lda;
  x  += 4;

  // upper squared
  x_0 = _mm256_broadcast_sd( &x[0] );
  x_1 = _mm256_broadcast_sd( &x[1] );

  a_00_10_20_30 = _mm256_load_pd( &A0[0+lda*0] );
  a_01_11_21_31 = _mm256_load_pd( &A0[0+lda*1] );

  ax_temp     = _mm256_mul_pd( a_00_10_20_30, x_0 );
  y_0_1_2_3   = _mm256_add_pd( y_0_1_2_3, ax_temp );
  ax_temp     = _mm256_mul_pd( a_01_11_21_31, x_1 );
  y_0_1_2_3_b = _mm256_add_pd( y_0_1_2_3_b, ax_temp );

  x_0 = _mm256_broadcast_sd( &x[2] );
  x_1 = _mm256_broadcast_sd( &x[3] );

  a_00_10_20_30 = _mm256_load_pd( &A0[0+lda*2] );
  a_01_11_21_31 = _mm256_load_pd( &A0[0+lda*3] );

  ax_temp     = _mm256_mul_pd( a_00_10_20_30, x_0 );
  y_0_1_2_3   = _mm256_add_pd( y_0_1_2_3, ax_temp );
  ax_temp     = _mm256_mul_pd( a_01_11_21_31, x_1 );
  y_0_1_2_3_b = _mm256_add_pd( y_0_1_2_3_b, ax_temp );

  // lower triangular

  // second col (avoid zero y_0_1)
  z_0     = _mm_loaddup_pd( &x[1] );
  a_00_10 = _mm_load_pd( &A1[0+lda*1] );
  y_0_1   = _mm_mul_pd( a_00_10, z_0 );

  // first col
  z_0     = _mm_load_sd( &x[0] );
  a_00_10 = _mm_load_sd( &A1[0+lda*0] );
  tmp0    = _mm_mul_sd( a_00_10, z_0 );
  y_0_1   = _mm_add_sd( y_0_1, tmp0 );

  y_4_5_6_7_b = _mm256_castpd128_pd256( y_0_1 );
  y_4_5_6_7_b = _mm256_blend_pd( y_4_5_6_7_b, zeros, 0xc ); // zero the (undefined) upper two lanes

  // fourth col (avoid zero y_4_5_6_7)
  x_1           = _mm256_broadcast_sd( &x[3] );
  a_01_11_21_31 = _mm256_load_pd( &A1[0+lda*3] );
  y_4_5_6_7     = _mm256_mul_pd( a_01_11_21_31, x_1 );

  // third col
  x_0           = _mm256_broadcast_sd( &x[2] );
  x_0           = _mm256_blend_pd( x_0, zeros, 0x8 );
  a_00_10_20_30 = _mm256_load_pd( &A1[0+lda*2] );
  ax_temp       = _mm256_mul_pd( a_00_10_20_30, x_0 );
  y_4_5_6_7_b   = _mm256_add_pd( y_4_5_6_7_b, ax_temp );

  A0 += 4*lda;
  A1 += 4*lda;
  x  += 4;

  k=8;
  for(; k<kmax-3; k+=4)
    {
/*    __builtin_prefetch( A0 + 4*lda );*/
/*    __builtin_prefetch( A1 + 4*lda );*/

    x_0 = _mm256_broadcast_sd( &x[0] );
    x_1 = _mm256_broadcast_sd( &x[1] );

    a_00_10_20_30 = _mm256_load_pd( &A0[0+lda*0] );
    a_40_50_60_70 = _mm256_load_pd( &A1[0+lda*0] );
    a_01_11_21_31 = _mm256_load_pd( &A0[0+lda*1] );
    a_41_51_61_71 = _mm256_load_pd( &A1[0+lda*1] );

    ax_temp     = _mm256_mul_pd( a_00_10_20_30, x_0 );
    y_0_1_2_3   = _mm256_add_pd( y_0_1_2_3, ax_temp );
    ax_temp     = _mm256_mul_pd( a_40_50_60_70, x_0 );
    y_4_5_6_7   = _mm256_add_pd( y_4_5_6_7, ax_temp );
    ax_temp     = _mm256_mul_pd( a_01_11_21_31, x_1 );
    y_0_1_2_3_b = _mm256_add_pd( y_0_1_2_3_b, ax_temp );
    ax_temp     = _mm256_mul_pd( a_41_51_61_71, x_1 );
    y_4_5_6_7_b = _mm256_add_pd( y_4_5_6_7_b, ax_temp );

/*    __builtin_prefetch( A0 + 5*lda );*/
/*    __builtin_prefetch( A1 + 5*lda );*/

    x_0 = _mm256_broadcast_sd( &x[2] );
    x_1 = _mm256_broadcast_sd( &x[3] );

    a_00_10_20_30 = _mm256_load_pd( &A0[0+lda*2] );
    a_40_50_60_70 = _mm256_load_pd( &A1[0+lda*2] );
    a_01_11_21_31 = _mm256_load_pd( &A0[0+lda*3] );
    a_41_51_61_71 = _mm256_load_pd( &A1[0+lda*3] );

    ax_temp     = _mm256_mul_pd( a_00_10_20_30, x_0 );
    y_0_1_2_3   = _mm256_add_pd( y_0_1_2_3, ax_temp );
    ax_temp     = _mm256_mul_pd( a_40_50_60_70, x_0 );
    y_4_5_6_7   = _mm256_add_pd( y_4_5_6_7, ax_temp );
    ax_temp     = _mm256_mul_pd( a_01_11_21_31, x_1 );
    y_0_1_2_3_b = _mm256_add_pd( y_0_1_2_3_b, ax_temp );
    ax_temp     = _mm256_mul_pd( a_41_51_61_71, x_1 );
    y_4_5_6_7_b = _mm256_add_pd( y_4_5_6_7_b, ax_temp );

    A0 += 4*lda;
    A1 += 4*lda;
    x  += 4;
    }

  if(kmax%4>=2)
    {
    x_0 = _mm256_broadcast_sd( &x[0] );
    x_1 = _mm256_broadcast_sd( &x[1] );

    a_00_10_20_30 = _mm256_load_pd( &A0[0+lda*0] );
    a_40_50_60_70 = _mm256_load_pd( &A1[0+lda*0] );
    a_01_11_21_31 = _mm256_load_pd( &A0[0+lda*1] );
    a_41_51_61_71 = _mm256_load_pd( &A1[0+lda*1] );

    ax_temp     = _mm256_mul_pd( a_00_10_20_30, x_0 );
    y_0_1_2_3   = _mm256_add_pd( y_0_1_2_3, ax_temp );
    ax_temp     = _mm256_mul_pd( a_40_50_60_70, x_0 );
    y_4_5_6_7   = _mm256_add_pd( y_4_5_6_7, ax_temp );
    ax_temp     = _mm256_mul_pd( a_01_11_21_31, x_1 );
    y_0_1_2_3_b = _mm256_add_pd( y_0_1_2_3_b, ax_temp );
    ax_temp     = _mm256_mul_pd( a_41_51_61_71, x_1 );
    y_4_5_6_7_b = _mm256_add_pd( y_4_5_6_7_b, ax_temp );

    A0 += 2*lda;
    A1 += 2*lda;
    x  += 2;
    }

  y_0_1_2_3 = _mm256_add_pd( y_0_1_2_3, y_0_1_2_3_b );
  y_4_5_6_7 = _mm256_add_pd( y_4_5_6_7, y_4_5_6_7_b );

  if(kmax%2==1)
    {
    x_0 = _mm256_broadcast_sd( &x[0] );

    a_00_10_20_30 = _mm256_load_pd( &A0[0+lda*0] );
    a_40_50_60_70 = _mm256_load_pd( &A1[0+lda*0] );

    ax_temp   = _mm256_mul_pd( a_00_10_20_30, x_0 );
    y_0_1_2_3 = _mm256_add_pd( y_0_1_2_3, ax_temp );
    ax_temp   = _mm256_mul_pd( a_40_50_60_70, x_0 );
    y_4_5_6_7 = _mm256_add_pd( y_4_5_6_7, ax_temp );

/*    A0 += 1*lda;*/
/*    A1 += 1*lda;*/
/*    x  += 1;*/
    }

  if(alg==0)
    {
    _mm256_storeu_pd(&y[0], y_0_1_2_3);
    _mm256_storeu_pd(&y[4], y_4_5_6_7);
    }
  else if(alg==1)
    {
    z_0_1_2_3 = _mm256_loadu_pd( &y[0] );
    z_4_5_6_7 = _mm256_loadu_pd( &y[4] );

    z_0_1_2_3 = _mm256_add_pd( z_0_1_2_3, y_0_1_2_3 );
    z_4_5_6_7 = _mm256_add_pd( z_4_5_6_7, y_4_5_6_7 );

    _mm256_storeu_pd(&y[0], z_0_1_2_3);
    _mm256_storeu_pd(&y[4], z_4_5_6_7);
    }
  else // alg==-1
    {
    z_0_1_2_3 = _mm256_loadu_pd( &y[0] );
    z_4_5_6_7 = _mm256_loadu_pd( &y[4] );

    z_0_1_2_3 = _mm256_sub_pd( z_0_1_2_3, y_0_1_2_3 );
    z_4_5_6_7 = _mm256_sub_pd( z_4_5_6_7, y_4_5_6_7 );

    _mm256_storeu_pd(&y[0], z_0_1_2_3);
    _mm256_storeu_pd(&y[4], z_4_5_6_7);
    }
  }
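/*
 * Reference sketch (hypothetical, not from the original source): what
 * kernel_dtrmv_u_n_8_lib4 computes, ignoring the 4-row panel packing
 * (A0/A1 with stride sda) and using a plain column-major matrix instead.
 * t = U[0:8, 0:kmax] * x with U upper triangular; alg selects whether t is
 * stored into y (alg==0), added to it (alg==1) or subtracted (alg==-1).
 */
static void dtrmv_u_n_8_ref(int kmax, const double *A, int lda,
                            const double *x, double *y, int alg)
{
  double t[8] = {0.0};
  int i, j;
  for (j = 0; j < kmax; j++)        /* column j of U            */
    for (i = 0; i < 8; i++)         /* rows 0..7 of the panel   */
      if (i <= j)                   /* upper-triangular entries */
        t[i] += A[i + j*lda] * x[j];
  for (i = 0; i < 8; i++)
    y[i] = (alg == 0) ? t[i] : (alg == 1) ? y[i] + t[i] : y[i] - t[i];
}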
Color3 sampleFourier3(float * const coeffs[3], const double *recip, size_t nCoeffs,
                      Float sample, Float &pdf, Float &phi) {
    bool flip = false;
    if (sample < 0.5f) {
        sample *= 2.0f;
    } else {
        sample = 1.0f - 2.0f * (sample - 0.5f);
        flip = true;
    }

    int iterations = 0;

    double a = 0.0,
           c = math::Pi_d,
           coeff0 = coeffs[0][0],
           y = coeff0*math::Pi_d*sample,
           deriv = 0.0,
           b = 0.5 * math::Pi_d,
           cosB = 0,
           sinB = 1;

    if (nCoeffs > 10 && sample != 0 && sample != 1) {
        float stddev = std::sqrt(2.0f / 3.0f * std::log(coeffs[0][1] / coeffs[0][2]));

        if (std::isfinite(stddev)) {
            b = std::min(c, (double) math::normal_quantile(0.5f + sample / 2) * stddev);
            cosB = std::cos(b);
            sinB = std::sqrt(1 - cosB * cosB);
        }
    }

#if FOURIER_SCALAR != 1
    __m256d factorB_prev, factorB_cur;
#endif

    while (true) {
#if FOURIER_SCALAR == 1
        double cosB_prev = cosB, sinB_prev = -sinB,
               sinB_cur = 0.0, cosB_cur = 1.0,
               value = coeff0 * b;
        deriv = coeff0;

        for (size_t j=1; j<nCoeffs; ++j) {
            double sinB_next = 2.0*cosB*sinB_cur - sinB_prev,
                   cosB_next = 2.0*cosB*cosB_cur - cosB_prev,
                   coeff = (double) coeffs[0][j];

            value += coeff * recip[j] * sinB_next;
            deriv += coeff * cosB_next;

            sinB_prev = sinB_cur; sinB_cur = sinB_next;
            cosB_prev = cosB_cur; cosB_cur = cosB_next;
        }
#else
        initializeRecurrence(cosB, factorB_prev, factorB_cur);

        __m256d
            sinB_prev = _mm256_set1_pd(-sinB),
            sinB_cur  = _mm256_set1_pd(0.0),
            cosB_prev = _mm256_set1_pd(cosB),
            cosB_cur  = _mm256_set1_pd(1.0),
            value_vec = _mm256_set_sd(coeff0 * b),
            deriv_vec = _mm256_set_sd(coeff0);

        for (size_t j=1; j<nCoeffs; j+=4) {
            __m128 coeff_vec_f = _mm_load_ps(coeffs[0]+j);
            __m256d recip_vec = _mm256_load_pd(recip+j);
            __m256d coeff_vec = _mm256_cvtps_pd(coeff_vec_f);

            __m256d sinB_next = _mm256_add_pd(
                _mm256_mul_pd(factorB_prev, sinB_prev),
                _mm256_mul_pd(factorB_cur, sinB_cur));

            __m256d cosB_next = _mm256_add_pd(
                _mm256_mul_pd(factorB_prev, cosB_prev),
                _mm256_mul_pd(factorB_cur, cosB_cur));

            value_vec = _mm256_add_pd(value_vec, _mm256_mul_pd(
                _mm256_mul_pd(recip_vec, coeff_vec), sinB_next));
            deriv_vec = _mm256_add_pd(deriv_vec, _mm256_mul_pd(coeff_vec, cosB_next));

            sinB_prev = _mm256_splat2_pd(sinB_next);
            cosB_prev = _mm256_splat2_pd(cosB_next);
            sinB_cur  = _mm256_splat3_pd(sinB_next);
            cosB_cur  = _mm256_splat3_pd(cosB_next);
        }

        double value = simd::hadd(value_vec);
        deriv = simd::hadd(deriv_vec);
#endif

        value -= y;

        if (std::abs(value) <= 1e-5 * coeff0 || ++iterations > 20)
            break;
        else if (value > 0.0)
            c = b;
        else
            a = b;

        b -= value / deriv;

        if (!(b >= a && b <= c))
            b = 0.5f * (a + c);

        cosB = std::cos(b);
        sinB = std::sqrt(1-cosB*cosB);
    }

    double Y = deriv;

    if (flip)
        b = 2.0*math::Pi_d - b;

    pdf = (Float) (math::InvTwoPi_d * Y / coeff0);
    phi = (Float) b;

#if FOURIER_SCALAR == 1
    double cosB_prev = cosB, cosB_cur = 1.0;
    double R = coeffs[1][0];
    double B = coeffs[2][0];

    for (size_t j=1; j<nCoeffs; ++j) {
        double cosB_next = 2.0*cosB*cosB_cur - cosB_prev,
               coeffR = (double) coeffs[1][j],
               coeffB = (double) coeffs[2][j];

        R += coeffR * cosB_next;
        B += coeffB * cosB_next;

        cosB_prev = cosB_cur; cosB_cur = cosB_next;
    }
#else
    __m256d
        cosB_prev = _mm256_set1_pd(cosB),
        cosB_cur  = _mm256_set1_pd(1.0),
        R_vec     = _mm256_set_sd(coeffs[1][0]),
        B_vec     = _mm256_set_sd(coeffs[2][0]);

    for (size_t j=1; j<nCoeffs; j+=4) {
        __m128 coeff_R_vec_f = _mm_load_ps(coeffs[1]+j);
        __m128 coeff_B_vec_f = _mm_load_ps(coeffs[2]+j);
        __m256d coeff_R_vec = _mm256_cvtps_pd(coeff_R_vec_f);
        __m256d coeff_B_vec = _mm256_cvtps_pd(coeff_B_vec_f);

        __m256d cosB_next = _mm256_add_pd(
            _mm256_mul_pd(factorB_prev, cosB_prev),
            _mm256_mul_pd(factorB_cur, cosB_cur));

        R_vec = _mm256_add_pd(R_vec, _mm256_mul_pd(coeff_R_vec, cosB_next));
        B_vec = _mm256_add_pd(B_vec, _mm256_mul_pd(coeff_B_vec, cosB_next));

        cosB_prev = _mm256_splat2_pd(cosB_next);
        cosB_cur  = _mm256_splat3_pd(cosB_next);
    }

    double R = simd::hadd(R_vec);
    double B = simd::hadd(B_vec);
#endif

    double G = 1.39829 * Y - 0.100913 * B - 0.297375 * R;

    return Color3((Float) R, (Float) G, (Float) B) * (2 * math::Pi) * (Float) (coeff0 / Y);
}
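/*
 * Reference sketch (hypothetical helper, not part of sampleFourier3): both
 * the scalar and the AVX paths above evaluate a truncated cosine series and
 * its running integral with the three-term recurrences
 *   cos(k b) = 2 cos(b) cos((k-1) b) - cos((k-2) b)
 *   sin(k b) = 2 cos(b) sin((k-1) b) - sin((k-2) b),
 * where recip[k] is assumed to hold 1/k. "f_b" below corresponds to the
 * variable called deriv in the Newton loop and "F_b" to value.
 */
#include <math.h>
#include <stddef.h>

static void eval_fourier_ref(const float *a, const double *recip, size_t n,
                             double b, double *f_b, double *F_b)
{
  double cosB = cos(b), sinB = sin(b);
  double cos_prev = cosB, cos_cur = 1.0;   /* cos(-b), cos(0) */
  double sin_prev = -sinB, sin_cur = 0.0;  /* sin(-b), sin(0) */
  *f_b = a[0];          /* f(b) = sum_k a[k] cos(k b)         */
  *F_b = a[0] * b;      /* F(b) = integral of f over [0, b]   */
  for (size_t k = 1; k < n; ++k) {
    double cos_next = 2.0*cosB*cos_cur - cos_prev;
    double sin_next = 2.0*cosB*sin_cur - sin_prev;
    *f_b += (double) a[k] * cos_next;
    *F_b += (double) a[k] * recip[k] * sin_next;  /* sin(k b)/k */
    cos_prev = cos_cur; cos_cur = cos_next;
    sin_prev = sin_cur; sin_cur = sin_next;
  }
}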
void ntt_transform(poly out, const poly o)
{
  int s, pos = 0, offset;
  __m256d vt,vo0,vo10,vo11,vo20,vo21,vo22,vo23,vc,vp,vpinv,neg2,neg4;
  __m256d vx0,vx1,vx2,vx3,vx4,vx5,vx6,vx7;

  vpinv = _mm256_set_pd(PARAM_APPROX_P_INVERSE, PARAM_APPROX_P_INVERSE,
                        PARAM_APPROX_P_INVERSE, PARAM_APPROX_P_INVERSE);
  vp    = _mm256_set_pd(8383489., 8383489., 8383489., 8383489.);

  bitrev(out);

  vo10 = _mm256_load_pd(o+pos);
  vo20 = _mm256_load_pd(o+pos+4);
  neg2 = _mm256_load_pd(_neg2);
  neg4 = _mm256_load_pd(_neg4);

  // m = 2, m = 4, m = 8 (3 levels merged)
  for(s = 0; s < POLY_DEG; s += 8)
  {
    // No multiplication with omega required, respective value is 1
    vx0 = _mm256_load_pd(out+s);
    vt  = _mm256_mul_pd(vx0, neg2);
    vx0 = _mm256_hadd_pd(vx0, vt);

    vx1 = _mm256_load_pd(out+s+4);
    vt  = _mm256_mul_pd(vx1, neg2);
    vx1 = _mm256_hadd_pd(vx1, vt);

    vx0 = _mm256_mul_pd(vx0, vo10);
    vc  = _mm256_mul_pd(vx0, vpinv);
    vc  = _mm256_round_pd(vc, 0x08);
    vc  = _mm256_mul_pd(vc, vp);
    vx0 = _mm256_sub_pd(vx0, vc);
    vt  = _mm256_permute2f128_pd(vx0, vx0, 0x01); // now contains x2,x3,x0,x1
    vx0 = _mm256_mul_pd(vx0, neg4);
    vx0 = _mm256_add_pd(vx0, vt);

    vx1 = _mm256_mul_pd(vx1, vo10);
    vc  = _mm256_mul_pd(vx1, vpinv);
    vc  = _mm256_round_pd(vc, 0x08);
    vc  = _mm256_mul_pd(vc, vp);
    vx1 = _mm256_sub_pd(vx1, vc);
    vt  = _mm256_permute2f128_pd(vx1, vx1, 0x01); // now contains x2,x3,x0,x1
    vx1 = _mm256_mul_pd(vx1, neg4);
    vx1 = _mm256_add_pd(vx1, vt);

    vt  = _mm256_mul_pd(vx1, vo20);
    vc  = _mm256_mul_pd(vt, vpinv);
    vc  = _mm256_round_pd(vc, 0x08);
    vc  = _mm256_mul_pd(vc, vp);
    vt  = _mm256_sub_pd(vt, vc);
    vx1 = _mm256_sub_pd(vx0, vt);
    _mm256_store_pd(out+s+4, vx1);
    vx0 = _mm256_add_pd(vx0, vt);
    _mm256_store_pd(out+s+0, vx0);
  }
  pos += 8;

  // m = 16, m = 32, m = 64 (3 levels merged)
  for(offset = 0; offset < 8; offset += 4)
  {
    vo0  = _mm256_load_pd(o+pos+offset);
    vo10 = _mm256_load_pd(o+pos+offset+8);
    vo11 = _mm256_load_pd(o+pos+offset+16);

    for(s = 0; s < POLY_DEG; s += 64)
    {
      vx1 = _mm256_load_pd(out+offset+s+8);
      vt  = _mm256_mul_pd(vx1, vo0);
      vc  = _mm256_mul_pd(vt, vpinv);
      vc  = _mm256_round_pd(vc, 0x08);
      vc  = _mm256_mul_pd(vc, vp);
      vt  = _mm256_sub_pd(vt, vc);
      vx0 = _mm256_load_pd(out+offset+s+0);
      vx1 = _mm256_sub_pd(vx0, vt);  // _mm256_store_pd(out+offset+s+8, vx1);
      vx0 = _mm256_add_pd(vx0, vt);  // _mm256_store_pd(out+offset+s+0, vx0);

      vx3 = _mm256_load_pd(out+offset+s+24);
      vt  = _mm256_mul_pd(vx3, vo0);
      vc  = _mm256_mul_pd(vt, vpinv);
      vc  = _mm256_round_pd(vc, 0x08);
      vc  = _mm256_mul_pd(vc, vp);
      vt  = _mm256_sub_pd(vt, vc);
      vx2 = _mm256_load_pd(out+offset+s+16);
      vx3 = _mm256_sub_pd(vx2, vt);  // _mm256_store_pd(out+offset+s+24, vx3);
      vx2 = _mm256_add_pd(vx2, vt);  // _mm256_store_pd(out+offset+s+16, vx2);

      vx5 = _mm256_load_pd(out+offset+s+40);
      vt  = _mm256_mul_pd(vx5, vo0);
      vc  = _mm256_mul_pd(vt, vpinv);
      vc  = _mm256_round_pd(vc, 0x08);
      vc  = _mm256_mul_pd(vc, vp);
      vt  = _mm256_sub_pd(vt, vc);
      vx4 = _mm256_load_pd(out+offset+s+32);
      vx5 = _mm256_sub_pd(vx4, vt);  // _mm256_store_pd(out+offset+s+40, vx5);
      vx4 = _mm256_add_pd(vx4, vt);  // _mm256_store_pd(out+offset+s+32, vx4);

      vx7 = _mm256_load_pd(out+offset+s+56);
      vt  = _mm256_mul_pd(vx7, vo0);
      vc  = _mm256_mul_pd(vt, vpinv);
      vc  = _mm256_round_pd(vc, 0x08);
      vc  = _mm256_mul_pd(vc, vp);
      vt  = _mm256_sub_pd(vt, vc);
      vx6 = _mm256_load_pd(out+offset+s+48);
      vx7 = _mm256_sub_pd(vx6, vt);  // _mm256_store_pd(out+offset+s+56, vx7);
      vx6 = _mm256_add_pd(vx6, vt);  // _mm256_store_pd(out+offset+s+48, vx6);

      // vx2 = _mm256_load_pd(out+offset+s+16);
      vt  = _mm256_mul_pd(vx2, vo10);
      vc  = _mm256_mul_pd(vt, vpinv);
      vc  = _mm256_round_pd(vc, 0x08);
      vc  = _mm256_mul_pd(vc, vp);
      vt  = _mm256_sub_pd(vt, vc);
      // vx0 = _mm256_load_pd(out+offset+s+0);
      vx2 = _mm256_sub_pd(vx0, vt);  // _mm256_store_pd(out+offset+s+16, vx2);
      vx0 = _mm256_add_pd(vx0, vt);  // _mm256_store_pd(out+offset+s+0, vx0);

      // vx6 = _mm256_load_pd(out+offset+s+48);
      vt  = _mm256_mul_pd(vx6, vo10);
      vc  = _mm256_mul_pd(vt, vpinv);
      vc  = _mm256_round_pd(vc, 0x08);
      vc  = _mm256_mul_pd(vc, vp);
      vt  = _mm256_sub_pd(vt, vc);
      // vx4 = _mm256_load_pd(out+offset+s+32);
      vx6 = _mm256_sub_pd(vx4, vt);  // _mm256_store_pd(out+offset+s+48, vx6);
      vx4 = _mm256_add_pd(vx4, vt);  // _mm256_store_pd(out+offset+s+32, vx4);

      // vx3 = _mm256_load_pd(out+offset+s+24);
      vt  = _mm256_mul_pd(vx3, vo11);
      vc  = _mm256_mul_pd(vt, vpinv);
      vc  = _mm256_round_pd(vc, 0x08);
      vc  = _mm256_mul_pd(vc, vp);
      vt  = _mm256_sub_pd(vt, vc);
      // vx1 = _mm256_load_pd(out+offset+s+8);
      vx3 = _mm256_sub_pd(vx1, vt);  // _mm256_store_pd(out+offset+s+24, vx3);
      vx1 = _mm256_add_pd(vx1, vt);  // _mm256_store_pd(out+offset+s+8, vx1);

      // vx7 = _mm256_load_pd(out+offset+s+56);
      vt  = _mm256_mul_pd(vx7, vo11);
      vc  = _mm256_mul_pd(vt, vpinv);
      vc  = _mm256_round_pd(vc, 0x08);
      vc  = _mm256_mul_pd(vc, vp);
      vt  = _mm256_sub_pd(vt, vc);
      // vx5 = _mm256_load_pd(out+offset+s+40);
      vx7 = _mm256_sub_pd(vx5, vt);  // _mm256_store_pd(out+offset+s+56, vx7);
      vx5 = _mm256_add_pd(vx5, vt);  // _mm256_store_pd(out+offset+s+40, vx5);

      // vx4 = _mm256_load_pd(out+offset+s+32);
      vo20 = _mm256_load_pd(o+pos+offset+24);
      vt  = _mm256_mul_pd(vx4, vo20);
      vc  = _mm256_mul_pd(vt, vpinv);
      vc  = _mm256_round_pd(vc, 0x08);
      vc  = _mm256_mul_pd(vc, vp);
      vt  = _mm256_sub_pd(vt, vc);
      // vx0 = _mm256_load_pd(out+offset+s+0);
      vx4 = _mm256_sub_pd(vx0, vt);
      _mm256_store_pd(out+offset+s+32, vx4);
      vx0 = _mm256_add_pd(vx0, vt);
      _mm256_store_pd(out+offset+s+0, vx0);

      // vx5 = _mm256_load_pd(out+offset+s+40);
      vo21 = _mm256_load_pd(o+pos+offset+32);
      vt  = _mm256_mul_pd(vx5, vo21);
      vc  = _mm256_mul_pd(vt, vpinv);
      vc  = _mm256_round_pd(vc, 0x08);
      vc  = _mm256_mul_pd(vc, vp);
      vt  = _mm256_sub_pd(vt, vc);
      // vx1 = _mm256_load_pd(out+offset+s+8);
      vx5 = _mm256_sub_pd(vx1, vt);
      _mm256_store_pd(out+offset+s+40, vx5);
      vx1 = _mm256_add_pd(vx1, vt);
      _mm256_store_pd(out+offset+s+8, vx1);

      // vx6 = _mm256_load_pd(out+offset+s+48);
      vo22 = _mm256_load_pd(o+pos+offset+40);
      vt  = _mm256_mul_pd(vx6, vo22);
      vc  = _mm256_mul_pd(vt, vpinv);
      vc  = _mm256_round_pd(vc, 0x08);
      vc  = _mm256_mul_pd(vc, vp);
      vt  = _mm256_sub_pd(vt, vc);
      // vx2 = _mm256_load_pd(out+offset+s+16);
      vx6 = _mm256_sub_pd(vx2, vt);
      _mm256_store_pd(out+offset+s+48, vx6);
      vx2 = _mm256_add_pd(vx2, vt);
      _mm256_store_pd(out+offset+s+16, vx2);

      // vx7 = _mm256_load_pd(out+offset+s+56);
      vo23 = _mm256_load_pd(o+pos+offset+48);
      vt  = _mm256_mul_pd(vx7, vo23);
      vc  = _mm256_mul_pd(vt, vpinv);
      vc  = _mm256_round_pd(vc, 0x08);
      vc  = _mm256_mul_pd(vc, vp);
      vt  = _mm256_sub_pd(vt, vc);
      // vx3 = _mm256_load_pd(out+offset+s+24);
      vx7 = _mm256_sub_pd(vx3, vt);
      _mm256_store_pd(out+offset+s+56, vx7);
      vx3 = _mm256_add_pd(vx3, vt);
      _mm256_store_pd(out+offset+s+24, vx3);
    }
  }
  pos += 56;

  // m = 128, m = 256, m = 512 (3 levels merged)
  for(offset = 0; offset < 64; offset += 4)
  {
    vo0  = _mm256_load_pd(o+pos+offset);
    vo10 = _mm256_load_pd(o+pos+offset+64);
    vo11 = _mm256_load_pd(o+pos+offset+128);

    for(s = 0; s < POLY_DEG; s += 512)
    {
      vx1 = _mm256_load_pd(out+offset+s+64);
      vt  = _mm256_mul_pd(vx1, vo0);
      vc  = _mm256_mul_pd(vt, vpinv);
      vc  = _mm256_round_pd(vc, 0x08);
      vc  = _mm256_mul_pd(vc, vp);
      vt  = _mm256_sub_pd(vt, vc);
      vx0 = _mm256_load_pd(out+offset+s+0);
      vx1 = _mm256_sub_pd(vx0, vt);  // _mm256_store_pd(out+offset+s+64, vx1);
      vx0 = _mm256_add_pd(vx0, vt);  // _mm256_store_pd(out+offset+s+0, vx0);

      vx3 = _mm256_load_pd(out+offset+s+192);
      vt  = _mm256_mul_pd(vx3, vo0);
      vc  = _mm256_mul_pd(vt, vpinv);
      vc  = _mm256_round_pd(vc, 0x08);
      vc  = _mm256_mul_pd(vc, vp);
      vt  = _mm256_sub_pd(vt, vc);
      vx2 = _mm256_load_pd(out+offset+s+128);
      vx3 = _mm256_sub_pd(vx2, vt);  // _mm256_store_pd(out+offset+s+192, vx3);
      vx2 = _mm256_add_pd(vx2, vt);  // _mm256_store_pd(out+offset+s+128, vx2);

      vx5 = _mm256_load_pd(out+offset+s+320);
      vt  = _mm256_mul_pd(vx5, vo0);
      vc  = _mm256_mul_pd(vt, vpinv);
      vc  = _mm256_round_pd(vc, 0x08);
      vc  = _mm256_mul_pd(vc, vp);
      vt  = _mm256_sub_pd(vt, vc);
      vx4 = _mm256_load_pd(out+offset+s+256);
      vx5 = _mm256_sub_pd(vx4, vt);  // _mm256_store_pd(out+offset+s+320, vx5);
      vx4 = _mm256_add_pd(vx4, vt);  // _mm256_store_pd(out+offset+s+256, vx4);

      vx7 = _mm256_load_pd(out+offset+s+448);
      vt  = _mm256_mul_pd(vx7, vo0);
      vc  = _mm256_mul_pd(vt, vpinv);
      vc  = _mm256_round_pd(vc, 0x08);
      vc  = _mm256_mul_pd(vc, vp);
      vt  = _mm256_sub_pd(vt, vc);
      vx6 = _mm256_load_pd(out+offset+s+384);
      vx7 = _mm256_sub_pd(vx6, vt);  // _mm256_store_pd(out+offset+s+448, vx7);
      vx6 = _mm256_add_pd(vx6, vt);  // _mm256_store_pd(out+offset+s+384, vx6);

      // vx2 = _mm256_load_pd(out+offset+s+128);
      vt  = _mm256_mul_pd(vx2, vo10);
      vc  = _mm256_mul_pd(vt, vpinv);
      vc  = _mm256_round_pd(vc, 0x08);
      vc  = _mm256_mul_pd(vc, vp);
      vt  = _mm256_sub_pd(vt, vc);
      // vx0 = _mm256_load_pd(out+offset+s+0);
      vx2 = _mm256_sub_pd(vx0, vt);  // _mm256_store_pd(out+offset+s+128, vx2);
      vx0 = _mm256_add_pd(vx0, vt);  // _mm256_store_pd(out+offset+s+0, vx0);

      // vx3 = _mm256_load_pd(out+offset+s+192);
      vt  = _mm256_mul_pd(vx3, vo11);
      vc  = _mm256_mul_pd(vt, vpinv);
      vc  = _mm256_round_pd(vc, 0x08);
      vc  = _mm256_mul_pd(vc, vp);
      vt  = _mm256_sub_pd(vt, vc);
      // vx1 = _mm256_load_pd(out+offset+s+64);
      vx3 = _mm256_sub_pd(vx1, vt);  // _mm256_store_pd(out+offset+s+192, vx3);
      vx1 = _mm256_add_pd(vx1, vt);  // _mm256_store_pd(out+offset+s+64, vx1);

      // vx6 = _mm256_load_pd(out+offset+s+384);
      vt  = _mm256_mul_pd(vx6, vo10);
      vc  = _mm256_mul_pd(vt, vpinv);
      vc  = _mm256_round_pd(vc, 0x08);
      vc  = _mm256_mul_pd(vc, vp);
      vt  = _mm256_sub_pd(vt, vc);
      // vx4 = _mm256_load_pd(out+offset+s+256);
      vx6 = _mm256_sub_pd(vx4, vt);  // _mm256_store_pd(out+offset+s+384, vx6);
      vx4 = _mm256_add_pd(vx4, vt);  // _mm256_store_pd(out+offset+s+256, vx4);

      // vx7 = _mm256_load_pd(out+offset+s+448);
      vt  = _mm256_mul_pd(vx7, vo11);
      vc  = _mm256_mul_pd(vt, vpinv);
      vc  = _mm256_round_pd(vc, 0x08);
      vc  = _mm256_mul_pd(vc, vp);
      vt  = _mm256_sub_pd(vt, vc);
      // vx5 = _mm256_load_pd(out+offset+s+320);
      vx7 = _mm256_sub_pd(vx5, vt);  // _mm256_store_pd(out+offset+s+448, vx7);
      vx5 = _mm256_add_pd(vx5, vt);  // _mm256_store_pd(out+offset+s+320, vx5);

      // vx4 = _mm256_load_pd(out+offset+s+256);
      vo20 = _mm256_load_pd(o+pos+offset+192);
      vt  = _mm256_mul_pd(vx4, vo20);
      vc  = _mm256_mul_pd(vt, vpinv);
      vc  = _mm256_round_pd(vc, 0x08);
      vc  = _mm256_mul_pd(vc, vp);
      vt  = _mm256_sub_pd(vt, vc);
      // vx0 = _mm256_load_pd(out+offset+s+0);
      vx4 = _mm256_sub_pd(vx0, vt);
      _mm256_store_pd(out+offset+s+256, vx4);
      vx0 = _mm256_add_pd(vx0, vt);
      _mm256_store_pd(out+offset+s+0, vx0);

      // vx5 = _mm256_load_pd(out+offset+s+320);
      vo21 = _mm256_load_pd(o+pos+offset+256);
      vt  = _mm256_mul_pd(vx5, vo21);
      vc  = _mm256_mul_pd(vt, vpinv);
      vc  = _mm256_round_pd(vc, 0x08);
      vc  = _mm256_mul_pd(vc, vp);
      vt  = _mm256_sub_pd(vt, vc);
      // vx1 = _mm256_load_pd(out+offset+s+64);
      vx5 = _mm256_sub_pd(vx1, vt);
      _mm256_store_pd(out+offset+s+320, vx5);
      vx1 = _mm256_add_pd(vx1, vt);
      _mm256_store_pd(out+offset+s+64, vx1);

      // vx6 = _mm256_load_pd(out+offset+s+384);
      vo22 = _mm256_load_pd(o+pos+offset+320);
      vt  = _mm256_mul_pd(vx6, vo22);
      vc  = _mm256_mul_pd(vt, vpinv);
      vc  = _mm256_round_pd(vc, 0x08);
      vc  = _mm256_mul_pd(vc, vp);
      vt  = _mm256_sub_pd(vt, vc);
      // vx2 = _mm256_load_pd(out+offset+s+128);
      vx6 = _mm256_sub_pd(vx2, vt);
      _mm256_store_pd(out+offset+s+384, vx6);
      vx2 = _mm256_add_pd(vx2, vt);
      _mm256_store_pd(out+offset+s+128, vx2);

      // vx7 = _mm256_load_pd(out+offset+s+448);
      vo23 = _mm256_load_pd(o+pos+offset+384);
      vt  = _mm256_mul_pd(vx7, vo23);
      vc  = _mm256_mul_pd(vt, vpinv);
      vc  = _mm256_round_pd(vc, 0x08);
      vc  = _mm256_mul_pd(vc, vp);
      vt  = _mm256_sub_pd(vt, vc);
      // vx3 = _mm256_load_pd(out+offset+s+192);
      vx7 = _mm256_sub_pd(vx3, vt);
      _mm256_store_pd(out+offset+s+448, vx7);
      vx3 = _mm256_add_pd(vx3, vt);
      _mm256_store_pd(out+offset+s+192, vx3);
    }
  }
}
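/*
 * Reference sketch (hypothetical, not from the original source): every
 * butterfly above uses the same floating-point modular reduction after the
 * twiddle multiplication: t = a*omega; c = round(t * pinv); t -= c*p, with
 * p = 8383489 and pinv an approximation of 1/p (PARAM_APPROX_P_INVERSE).
 * _mm256_round_pd(x, 0x08) rounds to nearest, which nearbyint matches under
 * the default rounding mode; the result is a signed representative of
 * t mod p of roughly half-prime magnitude. The butterfly then forms x +/- t.
 */
#include <math.h>

static double mulmod_ref(double a, double omega, double p, double pinv)
{
  double t = a * omega;
  double c = nearbyint(t * pinv);  /* quotient estimate               */
  return t - c * p;                /* t reduced mod p (signed result) */
}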