/******************************************************************
 *
 * NEGACYCLIC FFT LOOK UP TABLE
 *
 ******************************************************************/
void negacyc_mul(ring_t *r, const ring_t *x, const ring_t *y)
{
  phi_forward(&vector_x, x);
  phi_forward(&vector_y, y);
  __m256d real_x, imag_x, real_y, imag_y, imag_temp, real_temp, dim;
  dim = _mm256_set1_pd(CPLXDIM);
  // double a,b,c,d;
  for (int i = 0; i < CPLXDIM; i += 4)
  {
    real_x = _mm256_load_pd(vector_x.real+i);
    imag_x = _mm256_load_pd(vector_x.imag+i);
    real_y = _mm256_load_pd(vector_y.real+i);
    imag_y = _mm256_load_pd(vector_y.imag+i);
    //(a + ib) * (c + id) = (ac - bd) + i(ad + bc)
    //real_temp = bd
    real_temp = _mm256_mul_pd(imag_x, imag_y);
    //imag_temp = ad
    imag_temp = _mm256_mul_pd(real_x, imag_y);
    //real_x = ac - bd, imag_x = bc + ad
    real_x = _mm256_fmsub_pd(real_x, real_y, real_temp);
    imag_x = _mm256_fmadd_pd(imag_x, real_y, imag_temp);
    //scale by 1/CPLXDIM
    real_x = _mm256_div_pd(real_x, dim);
    imag_x = _mm256_div_pd(imag_x, dim);
    _mm256_store_pd(vector_res.real+i, real_x);
    _mm256_store_pd(vector_res.imag+i, imag_x);
  }
  phi_backward(&vector_res, r);
  // print_cplx(&vec_res,CPLXDIM);
}
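The fused-multiply pattern above relies on the identity (a + ib)(c + id) = (ac - bd) + i(bc + ad): _mm256_fmsub_pd(real_x, real_y, real_temp) produces ac - bd per lane and _mm256_fmadd_pd(imag_x, real_y, imag_temp) produces bc + ad. A minimal scalar sketch of the same identity, for reference only (names are illustrative, not from the original source):

/* Scalar reference for one complex product, matching the vector lanes above. */
static void cplx_mul_ref(double a, double b, double c, double d,
                         double *re, double *im)
{
    double bd = b * d;   /* real_temp */
    double ad = a * d;   /* imag_temp */
    *re = a * c - bd;    /* _mm256_fmsub_pd(real_x, real_y, real_temp) */
    *im = b * c + ad;    /* _mm256_fmadd_pd(imag_x, real_y, imag_temp) */
}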
/******************************************************************
 *
 * SPLIT RADIX PRECOMPUTED AND VECTORIZED FFT MULTIPLICATION
 *
 ******************************************************************/
void sr_vector_mul(ring_t *r, const ring_t *x, const ring_t *y)
{
  // printf("\n\n**************split-radix FAST**************\n");
  fft_vector_forward(&vctr_x, x);
  fft_vector_forward(&vctr_y, y);
  __m256d real_x, imag_x, real_y, imag_y, imag_temp, real_temp;
  // double a,b,c,d;
  for (int i = 0; i < CPLXDIM; i += 4)
  {
    real_x = _mm256_load_pd(vctr_x.real+i);
    imag_x = _mm256_load_pd(vctr_x.imag+i);
    real_y = _mm256_load_pd(vctr_y.real+i);
    imag_y = _mm256_load_pd(vctr_y.imag+i);
    //(a + ib) * (c + id) = (ac - bd) + i(ad + bc)
    //real_temp = bd
    real_temp = _mm256_mul_pd(imag_x, imag_y);
    //imag_temp = ad
    imag_temp = _mm256_mul_pd(real_x, imag_y);
    real_x = _mm256_fmsub_pd(real_x, real_y, real_temp);
    imag_x = _mm256_fmadd_pd(imag_x, real_y, imag_temp);
    //scale by 1/CPLXDIM
    real_y = _mm256_set1_pd(CPLXDIM);
    real_x = _mm256_div_pd(real_x, real_y);
    imag_x = _mm256_div_pd(imag_x, real_y);
    _mm256_store_pd(vctr_res.real+i, real_x);
    _mm256_store_pd(vctr_res.imag+i, imag_x);
  }
  fft_vector_backward(&vctr_res, r);
}
void convert_simd_avx(const int32_t * u, double * y, size_t n, double slope)
{
  const int32_t * u_end = u + n;
  const int32_t * u_current = u;
  double * y_current = y;
  __m128i mmx_u1, mmx_u2;
  __m256d mmx_y1, mmx_y2, mmx_y3, mmx_y4;
  __m256d mmx_slope_4 = _mm256_set1_pd(slope);

  for (; u_current < u_end; u_current += 8, y_current += 8)
  {
    /* Load 8 input values into two SSE registers (4 x int32 each) */
    mmx_u1 = _mm_load_si128( (const __m128i *) u_current );
    /* Offset must be applied to the int32 pointer before the cast */
    mmx_u2 = _mm_load_si128( (const __m128i *) (u_current + 4) );

    /* Widen to double */
    mmx_y1 = _mm256_cvtepi32_pd(mmx_u1);
    mmx_y2 = _mm256_cvtepi32_pd(mmx_u2);

    /* Apply slope */
    mmx_y3 = _mm256_mul_pd(mmx_y1, mmx_slope_4);
    mmx_y4 = _mm256_mul_pd(mmx_y2, mmx_slope_4);

    _mm256_store_pd(y_current,     mmx_y3);
    _mm256_store_pd(y_current + 4, mmx_y4);
  }
}
LIBXSMM_EXTERN_C LIBXSMM_RETARGETABLE void stream_vector_set( const double i_scalar,
                                                              double*      io_c,
                                                              const int    i_length)
{
  int l_n = 0;
  int l_trip_prolog = 0;
  int l_trip_stream = 0;

  /* init the trip counts */
  stream_init( i_length, (size_t)io_c, &l_trip_prolog, &l_trip_stream );

  /* run the prologue */
  for ( ; l_n < l_trip_prolog; l_n++ ) {
    io_c[l_n] = i_scalar;
  }
  /* run the bulk, hopefully using streaming stores */
#if defined(__SSE3__) && defined(__AVX__) && !defined(__AVX512F__)
  {
    /* we need manual unrolling as the compiler otherwise generates too many dependencies */
    const __m256d vec_scalar = _mm256_broadcast_sd(&i_scalar);
    for ( ; l_n < l_trip_stream; l_n+=8 ) {
#ifdef DISABLE_NONTEMPORAL_STORES
      _mm256_store_pd(  &(io_c[l_n]),   vec_scalar );
      _mm256_store_pd(  &(io_c[l_n+4]), vec_scalar );
#else
      _mm256_stream_pd( &(io_c[l_n]),   vec_scalar );
      _mm256_stream_pd( &(io_c[l_n+4]), vec_scalar );
#endif
    }
  }
#elif defined(__SSE3__) && defined(__AVX__) && defined(__AVX512F__)
  {
    const __m512d vec_scalar = _mm512_broadcastsd_pd(_mm_load_sd(&i_scalar));
    for ( ; l_n < l_trip_stream; l_n+=8 ) {
#ifdef DISABLE_NONTEMPORAL_STORES
      _mm512_store_pd(  &(io_c[l_n]), vec_scalar );
#else
      _mm512_stream_pd( &(io_c[l_n]), vec_scalar );
#endif
    }
  }
#else
  for ( ; l_n < l_trip_stream; l_n++ ) {
    io_c[l_n] = i_scalar;
  }
#endif
  /* run the epilogue */
  for ( ; l_n < i_length; l_n++ ) {
    io_c[l_n] = i_scalar;
  }
}
// multiply *p by v, applied to all n elements
COREARRAY_DLL_DEFAULT void vec_f64_mul(double *p, size_t n, double v)
{
#if defined(COREARRAY_SIMD_AVX)
	const __m256d v4 = _mm256_set1_pd(v);

	// peel off leading elements until p reaches a 32-byte boundary
	// (the case labels fall through intentionally)
	switch ((size_t)p & 0x1F)
	{
	case 0x08:
		if (n > 0) { (*p++) *= v; n--; }
	case 0x10:
		if (n > 0) { (*p++) *= v; n--; }
	case 0x18:
		if (n > 0) { (*p++) *= v; n--; }
	case 0x00:
		// aligned bulk loop
		for (; n >= 4; n-=4)
		{
			_mm256_store_pd(p, _mm256_mul_pd(_mm256_load_pd(p), v4));
			p += 4;
		}
		if (n >= 2)
		{
			_mm_store_pd(p, _mm_mul_pd(_mm_load_pd(p), _mm256_castpd256_pd128(v4)));
			p += 2; n -= 2;
		}
		break;
	default:
		// p cannot be aligned by peeling whole doubles: use unaligned accesses
		for (; n >= 4; n-=4)
		{
			_mm256_storeu_pd(p, _mm256_mul_pd(_mm256_loadu_pd(p), v4));
			p += 4;
		}
		if (n >= 2)
		{
			_mm_storeu_pd(p, _mm_mul_pd(_mm_loadu_pd(p), _mm256_castpd256_pd128(v4)));
			p += 2; n -= 2;
		}
	}

#elif defined(COREARRAY_SIMD_SSE2)
	const __m128d v2 = _mm_set1_pd(v);

	switch ((size_t)p & 0x0F)
	{
	case 0x08:
		if (n > 0) { (*p++) *= v; n--; }
	case 0x00:
		for (; n >= 2; n-=2, p+=2)
			_mm_store_pd(p, _mm_mul_pd(_mm_load_pd(p), v2));
		break;
	default:
		for (; n >= 2; n-=2, p+=2)
			_mm_storeu_pd(p, _mm_mul_pd(_mm_loadu_pd(p), v2));
	}

#endif
	// scalar tail
	for (; n > 0; n--) (*p++) *= v;
}
double compute_pi(size_t dt)
{
    int i;
    double pi = 0.0;
    double delta = 1.0 / dt;
    register __m256d ymm0, ymm1, ymm2, ymm3, ymm4;

    ymm0 = _mm256_set1_pd(1.0);
    ymm1 = _mm256_set1_pd(delta);
    ymm2 = _mm256_set_pd(delta * 3, delta * 2, delta * 1, 0.0);
    ymm4 = _mm256_setzero_pd();

    // accumulate delta / (1 + x^2) for four x values per iteration
    // (assumes dt >= 4; any remainder of dt modulo 4 is not processed)
    for (i = 0; i <= dt - 4; i += 4) {
        ymm3 = _mm256_set1_pd(i * delta);
        ymm3 = _mm256_add_pd(ymm3, ymm2);
        ymm3 = _mm256_mul_pd(ymm3, ymm3);
        ymm3 = _mm256_add_pd(ymm0, ymm3);
        ymm3 = _mm256_div_pd(ymm1, ymm3);
        ymm4 = _mm256_add_pd(ymm4, ymm3);
    }

    double tmp[4] __attribute__((aligned(32)));
    _mm256_store_pd(tmp, ymm4);
    pi += tmp[0] + tmp[1] + tmp[2] + tmp[3];

    return pi * 4.0;
}
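The loop above is a four-lane Riemann sum for pi = 4 * integral from 0 to 1 of dx/(1+x^2): lane k of ymm3 holds x = (i+k)*delta, and ymm1 divided by (1+x^2) contributes delta/(1+x^2) to the accumulators in ymm4. A minimal scalar equivalent of the same sum, shown here only as a reference (it is not part of the original file):

#include <stddef.h>

/* Scalar reference for the same left-endpoint Riemann sum of 4/(1+x^2) on [0,1]. */
double compute_pi_scalar(size_t dt)
{
    double delta = 1.0 / dt;
    double sum = 0.0;
    for (size_t i = 0; i < dt; ++i) {
        double x = i * delta;
        sum += delta / (1.0 + x * x);
    }
    return sum * 4.0;
}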
static inline unsigned int evaluatePopcount(INT_TYPE v_N, char *precomputed)
{
#ifdef __AVX
  unsigned long int res[4] __attribute__ ((aligned (BYTE_ALIGNMENT)));
  unsigned int a, b;

  /* spill the 256-bit vector to memory and popcount its four 64-bit words */
  _mm256_store_pd((double*)res, v_N);

  a = __builtin_popcountl(res[0]) + __builtin_popcountl(res[1]);
  b = __builtin_popcountl(res[2]) + __builtin_popcountl(res[3]);

  return (a + b);
#else
  unsigned int
    sum = 0,
    counts[INTS_PER_VECTOR] __attribute__ ((aligned (BYTE_ALIGNMENT)));

  VECTOR_STORE((CAST)counts, v_N);

  sum += BIT_COUNT(counts[0], precomputed) + BIT_COUNT(counts[1], precomputed);
  sum += BIT_COUNT(counts[2], precomputed) + BIT_COUNT(counts[3], precomputed);

  return sum;
#endif
}
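Storing the vector through _mm256_store_pd and then reading the buffer as unsigned long works because the store writes 32 raw bytes; the double lanes are never interpreted as floating-point values. An alternative sketch that keeps the reinterpretation explicit in the integer domain, assuming the vector really is a __m256d in this build (this helper is hypothetical, not part of the original code):

#include <immintrin.h>

/* Hypothetical alternative: cast the vector to __m256i, use the integer store,
 * and popcount the four 64-bit words. */
static inline unsigned int popcount256_pd(__m256d v)
{
    unsigned long long w[4] __attribute__ ((aligned (32)));
    _mm256_store_si256((__m256i *)w, _mm256_castpd_si256(v));
    return __builtin_popcountll(w[0]) + __builtin_popcountll(w[1])
         + __builtin_popcountll(w[2]) + __builtin_popcountll(w[3]);
}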
double compute_pi_leibniz_avx_opt(size_t n)
{
    double pi = 0.0;
    register __m256d ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7, ymm8;
    register __m256d ymm9, ymm10, ymm11, ymm12, ymm13;

    ymm0 = _mm256_set_pd(1.0, -1.0, 1.0, -1.0);
    ymm1 = _mm256_set_pd(1.0, 3.0, 5.0, 7.0);
    ymm2 = _mm256_set_pd(9.0, 11.0, 13.0, 15.0);
    ymm3 = _mm256_set_pd(17.0, 19.0, 21.0, 23.0);
    ymm4 = _mm256_set_pd(25.0, 27.0, 29.0, 31.0);
    ymm13 = _mm256_set1_pd(32.0);
    ymm5 = _mm256_setzero_pd();
    ymm6 = _mm256_setzero_pd();
    ymm7 = _mm256_setzero_pd();
    ymm8 = _mm256_setzero_pd();

    for (int i = 0; i <= n - 16; i += 16) {
        ymm9 = _mm256_div_pd(ymm0, ymm1);
        ymm1 = _mm256_add_pd(ymm1, ymm13);
        ymm10 = _mm256_div_pd(ymm0, ymm2);
        ymm2 = _mm256_add_pd(ymm2, ymm13);
        ymm11 = _mm256_div_pd(ymm0, ymm3);
        ymm3 = _mm256_add_pd(ymm3, ymm13);
        ymm12 = _mm256_div_pd(ymm0, ymm4);
        ymm4 = _mm256_add_pd(ymm4, ymm13);

        ymm5 = _mm256_add_pd(ymm5, ymm9);
        ymm6 = _mm256_add_pd(ymm6, ymm10);
        ymm7 = _mm256_add_pd(ymm7, ymm11);
        ymm8 = _mm256_add_pd(ymm8, ymm12);
    }

    double tmp[4] __attribute__((aligned(32)));

    _mm256_store_pd(tmp, ymm5);
    pi += tmp[0] + tmp[1] + tmp[2] + tmp[3];
    _mm256_store_pd(tmp, ymm6);
    pi += tmp[0] + tmp[1] + tmp[2] + tmp[3];
    _mm256_store_pd(tmp, ymm7);
    pi += tmp[0] + tmp[1] + tmp[2] + tmp[3];
    _mm256_store_pd(tmp, ymm8);
    pi += tmp[0] + tmp[1] + tmp[2] + tmp[3];

    return pi * 4.0;
}
void printcounters(struct counter *ctrs, uint64_t duration) { struct metrics s = {0}; uint64_t thisBytesWritten = pcm->bytesWritten(); uint64_t thisBytesRead = pcm->bytesRead(); memset(threadspercore, 0, gbl.ncores * sizeof(int)); s.timestamp = _rdtsc(); s.duration = duration; for (int cpu = 0; cpu < gbl.ncpus; ++cpu) { double delta[NEVENTS]; // volatile because another thread is changing it. volatile struct counter *p = &ctrs[cpu]; for (int i = 0; i < NEVENTS; ++i) { union { __m256d c; uint64_t values[4]; } t; t.c = _mm256_load_pd((const double *)&p->counts[i][0]); delta[i] = perf_scale_delta(t.values, lastctr[cpu].counts[i]); _mm256_store_pd((double *)&lastctr[cpu].counts[i][0], t.c); if (delta[i] < 0) delta[i] = 0; sevents[i] += delta[i]; } //printf("clocks %g duration %lu\n", delta[clocks], duration); if (2*delta[clocks] > duration) { int thiscore = pcm->getSocketId(cpu) * gbl.corespersocket + pcm->getCoreId(cpu); ++s.nthreads; ++threadspercore[thiscore]; } s.dsimd += delta[simd_dp]; s.dsse += delta[sse_dp]; s.dscalar += delta[scalar_dp]; s.ssimd += delta[simd_sp]; s.ssse += delta[sse_sp]; s.sscalar += delta[scalar_sp]; s.instrs += delta[instrs]; } s.rbytes = thisBytesRead - lastBytesRead; s.wbytes = thisBytesWritten - lastBytesWritten; lastBytesRead = thisBytesRead; lastBytesWritten = thisBytesWritten; for (int i = 0; i < gbl.ncores; ++i) if (threadspercore[i]) ++s.ncores; sample(&s); }
void sum_avx(double* c, double* a, double* b, int len)
{
  __m256d rA_AVX, rB_AVX, rC_AVX;   // variables for AVX

  for (int i = 0; i < len; i += 4) {
    rA_AVX = _mm256_load_pd(&a[i]);
    rB_AVX = _mm256_load_pd(&b[i]);
    rC_AVX = _mm256_add_pd(rA_AVX, rB_AVX);
    _mm256_store_pd(&c[i], rC_AVX);
  }
}
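Because sum_avx uses the aligned load/store forms, all three arrays must start on a 32-byte boundary and len should be a multiple of 4; an unaligned pointer faults, and a non-multiple length makes the last iteration read and write past the end of the arrays. A minimal calling sketch under those assumptions, with sum_avx from above in scope (the buffer size is only illustrative):

#include <immintrin.h>

int main(void)
{
    const int len = 1024;   /* multiple of 4 */
    double *a = (double*)_mm_malloc(len * sizeof(double), 32);
    double *b = (double*)_mm_malloc(len * sizeof(double), 32);
    double *c = (double*)_mm_malloc(len * sizeof(double), 32);

    for (int i = 0; i < len; ++i) { a[i] = i; b[i] = 2.0 * i; }

    sum_avx(c, a, b, len);   /* c[i] = a[i] + b[i] */

    _mm_free(a); _mm_free(b); _mm_free(c);
    return 0;
}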
inline void transpose_4x4block_AVX_64(double* A, double* B, const size_t lda, const size_t ldb)
{
  // read a 4x4 block of A (row stride ldb)
  __m256d row0 = _mm256_load_pd(&A[0*ldb]);
  __m256d row1 = _mm256_load_pd(&A[1*ldb]);
  __m256d row2 = _mm256_load_pd(&A[2*ldb]);
  __m256d row3 = _mm256_load_pd(&A[3*ldb]);

  __m256d tmp3, tmp2, tmp1, tmp0;
  // interleave pairs of rows ...
  tmp0 = _mm256_unpacklo_pd(row0, row1);
  tmp1 = _mm256_unpackhi_pd(row0, row1);
  tmp2 = _mm256_unpacklo_pd(row2, row3);
  tmp3 = _mm256_unpackhi_pd(row2, row3);
  // ... then swap 128-bit lanes to finish the 4x4 transpose
  row0 = _mm256_permute2f128_pd(tmp0, tmp2, 0x20);
  row1 = _mm256_permute2f128_pd(tmp1, tmp3, 0x20);
  row2 = _mm256_permute2f128_pd(tmp0, tmp2, 0x31);
  row3 = _mm256_permute2f128_pd(tmp1, tmp3, 0x31);

  // write the transposed block to B (row stride lda)
  _mm256_store_pd(&B[0*lda], row0);
  _mm256_store_pd(&B[1*lda], row1);
  _mm256_store_pd(&B[2*lda], row2);
  _mm256_store_pd(&B[3*lda], row3);
}
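A full matrix transpose can be built by applying this 4x4 kernel to every tile. A minimal sketch for a square row-major matrix, assuming N is a multiple of 4 and both buffers are 32-byte aligned so the aligned loads and stores are legal (the wrapper name is illustrative, not from the original source):

#include <stddef.h>

/* Transpose an N x N row-major matrix src into dst using the 4x4 AVX kernel above.
 * Assumes N % 4 == 0 and both pointers are 32-byte aligned. */
void transpose_avx(double* dst, double* src, size_t N)
{
    for (size_t r = 0; r < N; r += 4)
        for (size_t c = 0; c < N; c += 4)
            /* read the block at (r, c) of src, write it transposed at (c, r) of dst */
            transpose_4x4block_AVX_64(&src[r * N + c], &dst[c * N + r], N, N);
}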
/******************************************************************
 *
 * SPLIT RADIX PRECOMPUTED AND VECTORIZED NON RECURSIVE FFT MULTIPLICATION
 *
 ******************************************************************/
void sr_vector_nonrec_mul(ring_t *r, const ring_t *x, const ring_t *y)
{
  fft_vector_nonrec_forward(&vec_x, x);
  fft_vector_nonrec_forward(&vec_y, y);
  __m256d real_x, imag_x, real_y, imag_y, imag_temp, real_temp;
  // double a,b,c,d;
  for (int i = 0; i < CPLXDIM; i += 4)
  {
    real_x = _mm256_load_pd(vec_x.real+i);
    imag_x = _mm256_load_pd(vec_x.imag+i);
    real_y = _mm256_load_pd(vec_y.real+i);
    imag_y = _mm256_load_pd(vec_y.imag+i);
    //(a + ib) * (c + id) = (ac - bd) + i(ad+bc)
    //real_temp = bd
    real_temp = _mm256_mul_pd(imag_x, imag_y);
    //imag_temp = ad
    imag_temp = _mm256_mul_pd(real_x, imag_y);

    //REPLACED FOR COMMENTED SECTION
    //real_x = ac
    // real_x = _mm256_mul_pd(real_x,real_y);
    // //imag_x = bc
    // imag_x = _mm256_mul_pd(imag_x,real_y);
    // //real_x = ac - bd => real_x - real_temp
    // real_x = _mm256_sub_pd(real_x,real_temp);
    // //imag_x = ad + bc => imag_temp + imag_x
    // imag_x = _mm256_add_pd(imag_x,imag_temp);

    //THESE ARE NOT WORKING
    real_x = _mm256_fmsub_pd(real_x, real_y, real_temp);
    imag_x = _mm256_fmadd_pd(imag_x, real_y, imag_temp);

    real_y = _mm256_set1_pd(CPLXDIM);
    real_x = _mm256_div_pd(real_x, real_y);
    imag_x = _mm256_div_pd(imag_x, real_y);

    _mm256_store_pd(vec_res.real+i, real_x);
    _mm256_store_pd(vec_res.imag+i, imag_x);
  }
  fft_vector_nonrec_backward(&vec_res, r);
}
// this function assumes data is stored in col-major
// if data is in row major, call it like matmul4x4(B, A, C)
void matmul4x4(double *A, double *B, double *C)
{
    __m256d col[4], sum[4];

    // load every column of A into a register
    for(int i=0; i<4; i++)
        col[i] = _mm256_load_pd(&A[i*4]);

    // column i of C is the combination of A's columns weighted by column i of B
    for(int i=0; i<4; i++) {
        sum[i] = _mm256_setzero_pd();
        for(int j=0; j<4; j++) {
            sum[i] = _mm256_add_pd(_mm256_mul_pd(_mm256_set1_pd(B[i*4+j]), col[j]), sum[i]);
        }
    }

    for(int i=0; i<4; i++)
        _mm256_store_pd(&C[i*4], sum[i]);
}
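A minimal calling sketch, assuming matmul4x4 from above is in scope and the 4x4 matrices are stored column-major in 32-byte-aligned arrays (the initial values are only illustrative):

#include <stdio.h>

int main(void)
{
    /* Column-major 4x4 matrices; alignment is required by the aligned loads/stores. */
    double A[16] __attribute__((aligned(32)));
    double B[16] __attribute__((aligned(32)));
    double C[16] __attribute__((aligned(32)));

    /* B = identity (1.0 at linear indices 0, 5, 10, 15), so C should equal A. */
    for (int i = 0; i < 16; ++i) { A[i] = i; B[i] = (i % 5 == 0) ? 1.0 : 0.0; }

    matmul4x4(A, B, C);   /* C = A * B */

    for (int i = 0; i < 16; ++i) printf("%g%c", C[i], (i % 4 == 3) ? '\n' : ' ');
    return 0;
}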
static inline unsigned int populationCount(INT_TYPE v_N)
{
#ifdef __AVX
  {
    unsigned long int res[4] __attribute__ ((aligned (BYTE_ALIGNMENT)));
    unsigned int a, b;

    _mm256_store_pd((double*)res, v_N);

    a = __builtin_popcountl(res[0]) + __builtin_popcountl(res[1]);
    b = __builtin_popcountl(res[2]) + __builtin_popcountl(res[3]);

    return (a + b);
  }
#else
  return (vectorCount(v_N));
#endif
}
double compute_pi_euler_avx(size_t n)
{
    double pi = 0.0;
    register __m256d ymm0, ymm1, ymm2, ymm3;

    ymm0 = _mm256_setzero_pd();
    ymm1 = _mm256_set1_pd(1.0);
    ymm2 = _mm256_set1_pd(6.0);

    // Basel series: sum of 1/k^2 for k = 1..n, four terms per iteration
    // (denominators start at 1 to avoid a division by zero in the first lane;
    //  assumes n >= 4)
    for (int i = 0; i <= n - 4; i += 4) {
        ymm3 = _mm256_set_pd(i + 1.0, i + 2.0, i + 3.0, i + 4.0);
        ymm3 = _mm256_mul_pd(ymm3, ymm3);
        ymm3 = _mm256_div_pd(ymm1, ymm3);
        ymm0 = _mm256_add_pd(ymm0, ymm3);
    }
    // pi^2 = 6 * sum, so scale before the horizontal reduction
    ymm3 = _mm256_mul_pd(ymm2, ymm0);

    double tmp[4] __attribute__((aligned(32)));
    _mm256_store_pd(tmp, ymm3);
    pi += tmp[0] + tmp[1] + tmp[2] + tmp[3];

    return sqrt( pi );
}
double compute_pi_leibniz_fma(size_t n)
{
    double pi = 0.0;
    register __m256d ymm0, ymm1, ymm2, ymm3, ymm4;

    ymm0 = _mm256_setzero_pd();
    ymm1 = _mm256_set1_pd(2.0);
    ymm2 = _mm256_set1_pd(1.0);
    ymm3 = _mm256_set_pd(1.0, -1.0, 1.0, -1.0);

    for (int i = 0; i <= n - 4; i += 4) {
        ymm4 = _mm256_set_pd(i, i + 1.0, i + 2.0, i + 3.0);
        ymm4 = _mm256_fmadd_pd(ymm1, ymm4, ymm2);   // 2*k + 1
        ymm4 = _mm256_div_pd(ymm3, ymm4);           // (+/-1) / (2*k + 1)
        ymm0 = _mm256_add_pd(ymm0, ymm4);
    }

    double tmp[4] __attribute__((aligned(32)));
    _mm256_store_pd(tmp, ymm0);
    pi += tmp[0] + tmp[1] + tmp[2] + tmp[3];

    return pi * 4.0;
}
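Both Leibniz kernels above evaluate pi/4 = 1 - 1/3 + 1/5 - 1/7 + ...; the fmadd forms the odd denominator 2k+1 while the constant sign vector alternates the sign per lane. A scalar reference of the same partial sum, shown only for comparison:

#include <stddef.h>

/* Scalar reference: partial Leibniz sum over the first n terms. */
double compute_pi_leibniz_scalar(size_t n)
{
    double sum = 0.0;
    for (size_t k = 0; k < n; ++k) {
        double term = 1.0 / (2.0 * k + 1.0);
        sum += (k % 2 == 0) ? term : -term;
    }
    return sum * 4.0;
}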
static inline PetscErrorCode TensorContract_FMA(PetscInt dof,PetscInt P,PetscInt Q,const PetscReal Rf[],const PetscReal Sf[],const PetscReal Tf[],TensorMode tmode,const PetscScalar xx[],PetscScalar yy[]) { PetscFunctionBegin; if (tmode == TENSOR_TRANSPOSE) {PetscInt tmp = Q; Q = P; P = tmp;} { PetscReal R[Q][P],S[Q][P],T[Q][P]; const PetscScalar (*x)[P*P*P][NE] = (const PetscScalar(*)[P*P*P][NE])xx; PetscScalar (*y)[P*P*P][NE] = (PetscScalar(*)[Q*Q*Q][NE])yy; PetscScalar u[dof][Q*P*P][NE]_align,v[dof][Q*Q*P][NE]_align; for (PetscInt i=0; i<Q; i++) { for (PetscInt j=0; j<P; j++) { R[i][j] = tmode == TENSOR_EVAL ? Rf[i*P+j] : Rf[j*Q+i]; S[i][j] = tmode == TENSOR_EVAL ? Sf[i*P+j] : Sf[j*Q+i]; T[i][j] = tmode == TENSOR_EVAL ? Tf[i*P+j] : Tf[j*Q+i]; } } // u[l,a,j,k] = R[a,i] x[l,i,j,k] for (PetscInt l=0; l<dof; l++) { for (PetscInt a=0; a<Q; a++) { __m256d r[P]; for (PetscInt i=0; i<P; i++) r[i] = _mm256_set1_pd(R[a][i]); for (PetscInt jk=0; jk<P*P; jk++) { __m256d u_lajk = _mm256_setzero_pd(); for (PetscInt i=0; i<P; i++) { u_lajk = _mm256_fmadd_pd(r[i],_mm256_load_pd(x[l][i*P*P+jk]),u_lajk); } _mm256_store_pd(u[l][a*P*P+jk],u_lajk); } } } // v[l,a,b,k] = S[b,j] u[l,a,j,k] for (PetscInt l=0; l<dof; l++) { for (PetscInt b=0; b<Q; b++) { __m256d s[P]; for (int j=0; j<P; j++) s[j] = _mm256_set1_pd(S[b][j]); for (PetscInt a=0; a<Q; a++) { for (PetscInt k=0; k<P; k++) { __m256d v_labk = _mm256_setzero_pd(); for (PetscInt j=0; j<P; j++) { v_labk = _mm256_fmadd_pd(s[j],_mm256_load_pd(u[l][(a*P+j)*P+k]),v_labk); } _mm256_store_pd(v[l][(a*Q+b)*P+k],v_labk); } } } } // y[l,a,b,c] = T[c,k] v[l,a,b,k] for (PetscInt l=0; l<dof; l++) { for (PetscInt c=0; c<Q; c++) { __m256d t[P]; for (int k=0; k<P; k++) t[k] = _mm256_set1_pd(T[c][k]); for (PetscInt ab=0; ab<Q*Q; ab++) { __m256d y_labc = _mm256_load_pd(y[l][ab*Q+c]); for (PetscInt k=0; k<P; k++) { // for (PetscInt e=0; e<NE; e++) y[l][ab*Q+c][e] += T[c][k] * v[l][ab*P+k][e]; y_labc = _mm256_fmadd_pd(t[k],_mm256_load_pd(v[l][ab*P+k]),y_labc); } _mm256_store_pd(y[l][ab*Q+c],y_labc); } } } PetscLogFlops(dof*(Q*P*P*P+Q*Q*P*P+Q*Q*Q*P)*NE*2); } PetscFunctionReturn(0); }
void CalculateBasisComponents(const MDoubleArray& weights, const BaryCoords& coords, const MIntArray& triangleVertices, const MPointArray& points, const MFloatVectorArray& normals, const MIntArray& sampleIds, double* alignedStorage, MPoint& origin, MVector& up, MVector& normal) { // Start with the recreated point and normal using the barycentric coordinates of the hit point. unsigned int hitIndex = weights.length()-1; #ifdef __AVX__ __m256d originV = Dot4<MPoint>(coords[0], coords[1], coords[2], 0.0, points[triangleVertices[0]], points[triangleVertices[1]], points[triangleVertices[2]], MPoint::origin); __m256d hitNormalV = Dot4<MVector>(coords[0], coords[1], coords[2], 0.0, normals[triangleVertices[0]], normals[triangleVertices[1]], normals[triangleVertices[2]], MVector::zero); __m256d hitWeightV = _mm256_set1_pd(weights[hitIndex]); // Create the barycentric point and normal. __m256d normalV = _mm256_mul_pd(hitNormalV, hitWeightV); // Then use the weighted adjacent data. for (unsigned int j = 0; j < hitIndex; j += 4) { __m256d tempNormal = Dot4<MVector>(weights[j], weights[j+1], weights[j+2], weights[j+3], normals[sampleIds[j]], normals[sampleIds[j+1]], normals[sampleIds[j+2]], normals[sampleIds[j+3]]); normalV = _mm256_add_pd(tempNormal, normalV); } _mm256_store_pd(alignedStorage, originV); origin.x = alignedStorage[0]; origin.y = alignedStorage[1]; origin.z = alignedStorage[2]; _mm256_store_pd(alignedStorage, normalV); normal.x = alignedStorage[0]; normal.y = alignedStorage[1]; normal.z = alignedStorage[2]; // Calculate the up vector const MPoint& pt1 = points[triangleVertices[0]]; const MPoint& pt2 = points[triangleVertices[1]]; __m256d p1 = _mm256_set_pd(pt1.w, pt1.z, pt1.y, pt1.x); __m256d p2 = _mm256_set_pd(pt2.w, pt2.z, pt2.y, pt2.x); p1 = _mm256_add_pd(p1, p2); __m256d half = _mm256_set_pd(0.5, 0.5, 0.5, 0.5); p1 = _mm256_mul_pd(p1, half); __m256d upV = _mm256_sub_pd(p1, originV); _mm256_store_pd(alignedStorage, upV); up.x = alignedStorage[0]; up.y = alignedStorage[1]; up.z = alignedStorage[2]; #else MVector hitNormal; // Create the barycentric point and normal. for (int i = 0; i < 3; ++i) { origin += points[triangleVertices[i]] * coords[i]; hitNormal += MVector(normals[triangleVertices[i]]) * coords[i]; } // Use crawl data to calculate normal normal = hitNormal * weights[hitIndex]; for (unsigned int j = 0; j < hitIndex; j++) { normal += MVector(normals[sampleIds[j]]) * weights[j]; } // Calculate the up vector // The triangle vertices are sorted by decreasing barycentric coordinates so the first two are // the two closest vertices in the triangle. up = ((points[triangleVertices[0]] + points[triangleVertices[1]]) * 0.5) - origin; #endif normal.normalize(); GetValidUp(weights, points, sampleIds, origin, normal, up); }
/*!
 * \brief Aligned store of the given packed vector at the
 * given memory position
 */
ETL_STATIC_INLINE(void) store(etl::complex<double>* memory, avx_simd_complex_double<etl::complex<double>> value) {
    _mm256_store_pd(reinterpret_cast<double*>(memory), value.value);
}
/*!
 * \brief Aligned store of the given packed vector at the
 * given memory position
 */
ETL_STATIC_INLINE(void) store(double* memory, avx_simd_double value) {
    _mm256_store_pd(memory, value.value);
}
LIBXSMM_EXTERN_C LIBXSMM_RETARGETABLE void stream_update_helmholtz_no_h2( const double* i_g1, const double* i_g2, const double* i_g3, const double* i_tm1, const double* i_tm2, const double* i_tm3, double* io_c, const double i_h1, const int i_length) { int l_n = 0; int l_trip_prolog = 0; int l_trip_stream = 0; /* init the trip counts */ stream_init( i_length, (size_t)io_c, &l_trip_prolog, &l_trip_stream ); /* run the prologue */ for ( ; l_n < l_trip_prolog; l_n++ ) { io_c[l_n] = i_h1*(i_g1[l_n]*i_tm1[l_n] + i_g2[l_n]*i_tm2[l_n] + i_g3[l_n]*i_tm3[l_n]); } /* run the bulk, hopefully using streaming stores */ #if defined(__SSE3__) && defined(__AVX__) && !defined(__AVX512F__) { const __m256d vec_h1 = _mm256_broadcast_sd(&i_h1); /* we need manual unrolling as the compiler otherwise generates too many dependencies */ for ( ; l_n < l_trip_stream; l_n+=8 ) { __m256d vec_g1_1, vec_g2_1, vec_g3_1, vec_tm1_1, vec_tm2_1, vec_tm3_1; __m256d vec_g1_2, vec_g2_2, vec_g3_2, vec_tm1_2, vec_tm2_2, vec_tm3_2; vec_g1_1 = _mm256_loadu_pd(&(i_g1[l_n])); vec_tm1_1 = _mm256_loadu_pd(&(i_tm1[l_n])); vec_g1_2 = _mm256_loadu_pd(&(i_g1[l_n+4])); vec_tm1_2 = _mm256_loadu_pd(&(i_tm1[l_n+4])); vec_g1_1 = _mm256_mul_pd(vec_g1_1, vec_tm1_1); vec_g2_1 = _mm256_loadu_pd(&(i_g2[l_n])); vec_g1_2 = _mm256_mul_pd(vec_g1_2, vec_tm1_2); vec_g2_2 = _mm256_loadu_pd(&(i_g2[l_n+4])); vec_tm2_1 = _mm256_loadu_pd(&(i_tm2[l_n])); vec_g2_1 = _mm256_mul_pd(vec_g2_1, vec_tm2_1); vec_tm2_2 = _mm256_loadu_pd(&(i_tm2[l_n+4])); vec_g2_2 = _mm256_mul_pd(vec_g2_2, vec_tm2_2); vec_g3_1 = _mm256_loadu_pd(&(i_g3[l_n])); vec_tm3_1 = _mm256_loadu_pd(&(i_tm3[l_n])); vec_g3_2 = _mm256_loadu_pd(&(i_g3[l_n+4])); vec_tm3_2 = _mm256_loadu_pd(&(i_tm3[l_n+4])); vec_g3_1 = _mm256_mul_pd(vec_g3_1, vec_tm3_1); vec_g3_2 = _mm256_mul_pd(vec_g3_2, vec_tm3_2); vec_g1_1 = _mm256_add_pd(vec_g1_1, vec_g2_1); vec_g1_2 = _mm256_add_pd(vec_g1_2, vec_g2_2); vec_g1_1 = _mm256_add_pd(vec_g1_1, vec_g3_1); #ifdef DISABLE_NONTEMPORAL_STORES _mm256_store_pd( &(io_c[l_n]), _mm256_mul_pd(vec_g1_1, vec_h1) ); #else _mm256_stream_pd( &(io_c[l_n]), _mm256_mul_pd(vec_g1_1, vec_h1) ); #endif vec_g1_2 = _mm256_add_pd(vec_g1_2, vec_g3_2); #ifdef DISABLE_NONTEMPORAL_STORES _mm256_store_pd( &(io_c[l_n+4]), _mm256_mul_pd(vec_g1_2, vec_h1) ); #else _mm256_stream_pd( &(io_c[l_n+4]), _mm256_mul_pd(vec_g1_2, vec_h1) ); #endif } } #elif defined(__SSE3__) && defined(__AVX__) && defined(__AVX512F__) { const __m512d vec_h1 = _mm512_broadcastsd_pd(_mm_load_sd(&i_h1)); for ( ; l_n < l_trip_stream; l_n+=8 ) { __m512d vec_g1, vec_g2, vec_g3, vec_tm1, vec_tm2, vec_tm3; vec_g1 = _mm512_loadu_pd(&(i_g1[l_n])); vec_tm1 = _mm512_loadu_pd(&(i_tm1[l_n])); vec_g1 = _mm512_mul_pd(vec_g1, vec_tm1); vec_g2 = _mm512_loadu_pd(&(i_g2[l_n])); vec_tm2 = _mm512_loadu_pd(&(i_tm2[l_n])); vec_g2 = _mm512_mul_pd(vec_g2, vec_tm2); vec_g3 = _mm512_loadu_pd(&(i_g3[l_n])); vec_tm3 = _mm512_loadu_pd(&(i_tm3[l_n])); vec_g3 = _mm512_mul_pd(vec_g3, vec_tm3); vec_g1 = _mm512_add_pd(vec_g1, vec_g2); vec_g1 = _mm512_add_pd(vec_g1, vec_g3); #ifdef DISABLE_NONTEMPORAL_STORES _mm512_store_pd( &(io_c[l_n]), _mm512_mul_pd(vec_g1, vec_h1) ); #else _mm512_stream_pd( &(io_c[l_n]), _mm512_mul_pd(vec_g1, vec_h1) ); #endif } } #else for ( ; l_n < l_trip_stream; l_n++ ) { io_c[l_n] = i_h1*(i_g1[l_n]*i_tm1[l_n] + i_g2[l_n]*i_tm2[l_n] + i_g3[l_n]*i_tm3[l_n]); } #endif /* run the epilogue */ for ( ; l_n < i_length; l_n++ ) { io_c[l_n] = i_h1*(i_g1[l_n]*i_tm1[l_n] + i_g2[l_n]*i_tm2[l_n] + i_g3[l_n]*i_tm3[l_n]); } }
void
test (double *e, __m256d a)
{
  _mm256_store_pd (e, a);
}
void core::Vector3::normalize(void) { #if defined(VTX_USE_AVX) ALIGNED_32 platform::F64_t vector[] = {this->x, this->y, this->z, 0}; ALIGNED_32 platform::F64_t reciprocalVector[] = {1.0, 1.0, 1.0, 1.0}; __m256d simdvector; __m256d result; __m256d recp; simdvector = _mm256_load_pd(vector); recp = _mm256_load_pd(reciprocalVector); result = _mm256_mul_pd(simdvector, simdvector); result = _mm256_hadd_pd(result, result); result = _mm256_hadd_pd(result, result); result = _mm256_sqrt_pd(result); result = _mm256_div_pd(recp, result); simdvector = _mm256_mul_pd(simdvector, result); _mm256_store_pd(vector, simdvector); this->x = vector[0]; this->y = vector[1]; this->z = vector[2]; #elif defined(VTX_USE_SSE) // Must pad with a trailing 0, to store in 128-bit register ALIGNED_16 core::F32_t vector[] = {this->x, this->y, this->z, 0}; __m128 simdvector; __m128 result; simdvector = _mm_load_ps(vector); // (X^2, Y^2, Z^2, 0^2) result = _mm_mul_ps(simdvector, simdvector); // Add all elements together, giving us (X^2 + Y^2 + Z^2 + 0^2) result = _mm_hadd_ps(result, result); result = _mm_hadd_ps(result, result); // Calculate square root, giving us sqrt(X^2 + Y^2 + Z^2 + 0^2) result = _mm_sqrt_ps(result); // Calculate reciprocal, giving us 1 / sqrt(X^2 + Y^2 + Z^2 + 0^2) result = _mm_rcp_ps(result); // Finally, multiply the result with our original vector. simdvector = _mm_mul_ps(simdvector, result); _mm_store_ps(vector, simdvector); this->x = vector[0]; this->y = vector[1]; this->z = vector[2]; #else core::F64_t num = 1.0 / std::sqrt(std::pow(this->x, 2) + std::pow(this->y, 2) + std::pow(this->z, 2)); this->x *= num; this->y *= num; this->z *= num; #endif }
inline void store(double *r, const F64vec4 v) { _mm256_store_pd(r, v); }
void ntt_transform(poly out, const poly o) { int s, pos = 0, offset; __m256d vt,vo0,vo10,vo11,vo20,vo21,vo22,vo23,vc,vp,vpinv,neg2,neg4; __m256d vx0,vx1,vx2,vx3,vx4,vx5,vx6,vx7; vpinv = _mm256_set_pd(PARAM_APPROX_P_INVERSE, PARAM_APPROX_P_INVERSE, PARAM_APPROX_P_INVERSE, PARAM_APPROX_P_INVERSE); vp = _mm256_set_pd(8383489., 8383489., 8383489., 8383489.); bitrev(out); vo10 = _mm256_load_pd(o+pos); vo20 = _mm256_load_pd(o+pos+4); neg2 = _mm256_load_pd(_neg2); neg4 = _mm256_load_pd(_neg4); // m = 2, m = 4, m = 8 (3 levels merged) for(s = 0; s<POLY_DEG; s+=8) { // No multiplication with omega required, respective value is 1 vx0 = _mm256_load_pd(out+s); vt = _mm256_mul_pd(vx0,neg2); vx0 = _mm256_hadd_pd(vx0,vt); vx1 = _mm256_load_pd(out+s+4); vt = _mm256_mul_pd(vx1,neg2); vx1 = _mm256_hadd_pd(vx1,vt); vx0 = _mm256_mul_pd(vx0, vo10); vc = _mm256_mul_pd(vx0, vpinv); vc = _mm256_round_pd(vc,0x08); vc = _mm256_mul_pd(vc, vp); vx0 = _mm256_sub_pd(vx0,vc); vt = _mm256_permute2f128_pd (vx0, vx0, 0x01); // now contains x2,x3,x0,x1 vx0 = _mm256_mul_pd(vx0, neg4); vx0 = _mm256_add_pd(vx0, vt); vx1 = _mm256_mul_pd(vx1, vo10); vc = _mm256_mul_pd(vx1, vpinv); vc = _mm256_round_pd(vc,0x08); vc = _mm256_mul_pd(vc, vp); vx1 = _mm256_sub_pd(vx1,vc); vt = _mm256_permute2f128_pd (vx1, vx1, 0x01); // now contains x2,x3,x0,x1 vx1 = _mm256_mul_pd(vx1, neg4); vx1 = _mm256_add_pd(vx1, vt); vt = _mm256_mul_pd(vx1, vo20); vc = _mm256_mul_pd(vt, vpinv); vc = _mm256_round_pd(vc,0x08); vc = _mm256_mul_pd(vc, vp); vt = _mm256_sub_pd(vt,vc); vx1 = _mm256_sub_pd(vx0, vt); _mm256_store_pd(out+s+4, vx1); vx0 = _mm256_add_pd(vx0, vt); _mm256_store_pd(out+s+0, vx0); } pos += 8; // m = 16, m = 32, m = 64 (3 levels merged) for(offset = 0; offset < 8; offset+=4) { vo0 = _mm256_load_pd(o+pos+offset); vo10 = _mm256_load_pd(o+pos+offset+8); vo11 = _mm256_load_pd(o+pos+offset+16); for(s = 0; s<POLY_DEG; s+=64) { vx1 = _mm256_load_pd(out+offset+s+8); vt = _mm256_mul_pd(vx1, vo0); vc = _mm256_mul_pd(vt, vpinv); vc = _mm256_round_pd(vc,0x08); vc = _mm256_mul_pd(vc, vp); vt = _mm256_sub_pd(vt,vc); vx0 = _mm256_load_pd(out+offset+s+0); vx1 = _mm256_sub_pd(vx0, vt); // _mm256_store_pd(out+offset+s+8, vx1); vx0 = _mm256_add_pd(vx0, vt); // _mm256_store_pd(out+offset+s+0, vx0); vx3 = _mm256_load_pd(out+offset+s+24); vt = _mm256_mul_pd(vx3, vo0); vc = _mm256_mul_pd(vt, vpinv); vc = _mm256_round_pd(vc,0x08); vc = _mm256_mul_pd(vc, vp); vt = _mm256_sub_pd(vt,vc); vx2 = _mm256_load_pd(out+offset+s+16); vx3 = _mm256_sub_pd(vx2, vt); // _mm256_store_pd(out+offset+s+24, vx3); vx2 = _mm256_add_pd(vx2, vt); // _mm256_store_pd(out+offset+s+16, vx2); vx5 = _mm256_load_pd(out+offset+s+40); vt = _mm256_mul_pd(vx5, vo0); vc = _mm256_mul_pd(vt, vpinv); vc = _mm256_round_pd(vc,0x08); vc = _mm256_mul_pd(vc, vp); vt = _mm256_sub_pd(vt,vc); vx4 = _mm256_load_pd(out+offset+s+32); vx5 = _mm256_sub_pd(vx4, vt); // _mm256_store_pd(out+offset+s+40, vx5); vx4 = _mm256_add_pd(vx4, vt); // _mm256_store_pd(out+offset+s+32, vx4); vx7 = _mm256_load_pd(out+offset+s+56); vt = _mm256_mul_pd(vx7, vo0); vc = _mm256_mul_pd(vt, vpinv); vc = _mm256_round_pd(vc,0x08); vc = _mm256_mul_pd(vc, vp); vt = _mm256_sub_pd(vt,vc); vx6 = _mm256_load_pd(out+offset+s+48); vx7 = _mm256_sub_pd(vx6, vt); // _mm256_store_pd(out+offset+s+56, vx7); vx6 = _mm256_add_pd(vx6, vt); // _mm256_store_pd(out+offset+s+48, vx6); // vx2 = _mm256_load_pd(out+offset+s+16); vt = _mm256_mul_pd(vx2, vo10); vc = _mm256_mul_pd(vt, vpinv); vc = _mm256_round_pd(vc,0x08); vc = _mm256_mul_pd(vc, vp); vt = 
_mm256_sub_pd(vt,vc); // vx0 = _mm256_load_pd(out+offset+s+0); vx2 = _mm256_sub_pd(vx0, vt); // _mm256_store_pd(out+offset+s+16, vx2); vx0 = _mm256_add_pd(vx0, vt); // _mm256_store_pd(out+offset+s+0, vx0); // vx6 = _mm256_load_pd(out+offset+s+48); vt = _mm256_mul_pd(vx6, vo10); vc = _mm256_mul_pd(vt, vpinv); vc = _mm256_round_pd(vc,0x08); vc = _mm256_mul_pd(vc, vp); vt = _mm256_sub_pd(vt,vc); // vx4 = _mm256_load_pd(out+offset+s+32); vx6 = _mm256_sub_pd(vx4, vt); // _mm256_store_pd(out+offset+s+48, vx6); vx4 = _mm256_add_pd(vx4, vt); // _mm256_store_pd(out+offset+s+32, vx4); // vx3 = _mm256_load_pd(out+offset+s+24); vt = _mm256_mul_pd(vx3, vo11); vc = _mm256_mul_pd(vt, vpinv); vc = _mm256_round_pd(vc,0x08); vc = _mm256_mul_pd(vc, vp); vt = _mm256_sub_pd(vt,vc); // vx1 = _mm256_load_pd(out+offset+s+8); vx3 = _mm256_sub_pd(vx1, vt); // _mm256_store_pd(out+offset+s+24, vx3); vx1 = _mm256_add_pd(vx1, vt); // _mm256_store_pd(out+offset+s+8, vx1); // vx7 = _mm256_load_pd(out+offset+s+56); vt = _mm256_mul_pd(vx7, vo11); vc = _mm256_mul_pd(vt, vpinv); vc = _mm256_round_pd(vc,0x08); vc = _mm256_mul_pd(vc, vp); vt = _mm256_sub_pd(vt,vc); // vx5 = _mm256_load_pd(out+offset+s+40); vx7 = _mm256_sub_pd(vx5, vt); // _mm256_store_pd(out+offset+s+56, vx7); vx5 = _mm256_add_pd(vx5, vt); // _mm256_store_pd(out+offset+s+40, vx5); // vx4 = _mm256_load_pd(out+offset+s+32); vo20 = _mm256_load_pd(o+pos+offset+24); vt = _mm256_mul_pd(vx4, vo20); vc = _mm256_mul_pd(vt, vpinv); vc = _mm256_round_pd(vc,0x08); vc = _mm256_mul_pd(vc, vp); vt = _mm256_sub_pd(vt,vc); // vx0 = _mm256_load_pd(out+offset+s+0); vx4 = _mm256_sub_pd(vx0, vt); _mm256_store_pd(out+offset+s+32, vx4); vx0 = _mm256_add_pd(vx0, vt); _mm256_store_pd(out+offset+s+0, vx0); // vx5 = _mm256_load_pd(out+offset+s+40); vo21 = _mm256_load_pd(o+pos+offset+32); vt = _mm256_mul_pd(vx5, vo21); vc = _mm256_mul_pd(vt, vpinv); vc = _mm256_round_pd(vc,0x08); vc = _mm256_mul_pd(vc, vp); vt = _mm256_sub_pd(vt,vc); // vx1 = _mm256_load_pd(out+offset+s+8); vx5 = _mm256_sub_pd(vx1, vt); _mm256_store_pd(out+offset+s+40, vx5); vx1 = _mm256_add_pd(vx1, vt); _mm256_store_pd(out+offset+s+8, vx1); // vx6 = _mm256_load_pd(out+offset+s+48); vo22 = _mm256_load_pd(o+pos+offset+40); vt = _mm256_mul_pd(vx6, vo22); vc = _mm256_mul_pd(vt, vpinv); vc = _mm256_round_pd(vc,0x08); vc = _mm256_mul_pd(vc, vp); vt = _mm256_sub_pd(vt,vc); // vx2 = _mm256_load_pd(out+offset+s+16); vx6 = _mm256_sub_pd(vx2, vt); _mm256_store_pd(out+offset+s+48, vx6); vx2 = _mm256_add_pd(vx2, vt); _mm256_store_pd(out+offset+s+16, vx2); // vx7 = _mm256_load_pd(out+offset+s+56); vo23 = _mm256_load_pd(o+pos+offset+48); vt = _mm256_mul_pd(vx7, vo23); vc = _mm256_mul_pd(vt, vpinv); vc = _mm256_round_pd(vc,0x08); vc = _mm256_mul_pd(vc, vp); vt = _mm256_sub_pd(vt,vc); // vx3 = _mm256_load_pd(out+offset+s+24); vx7 = _mm256_sub_pd(vx3, vt); _mm256_store_pd(out+offset+s+56, vx7); vx3 = _mm256_add_pd(vx3, vt); _mm256_store_pd(out+offset+s+24, vx3); } } pos += 56; // m = 128, m=256, m=512 (3 levels merged) for(offset=0;offset<64;offset+=4) { vo0 = _mm256_load_pd(o+pos+offset); vo10 = _mm256_load_pd(o+pos+offset+64); vo11 = _mm256_load_pd(o+pos+offset+128); for(s = 0; s<POLY_DEG; s+=512) { vx1 = _mm256_load_pd(out+offset+s+64); vt = _mm256_mul_pd(vx1, vo0); vc = _mm256_mul_pd(vt, vpinv); vc = _mm256_round_pd(vc,0x08); vc = _mm256_mul_pd(vc, vp); vt = _mm256_sub_pd(vt,vc); vx0 = _mm256_load_pd(out+offset+s+0); vx1 = _mm256_sub_pd(vx0, vt); //_mm256_store_pd(out+offset+s+64, vx1); vx0 = _mm256_add_pd(vx0, vt); 
//_mm256_store_pd(out+offset+s+0, vx0); vx3 = _mm256_load_pd(out+offset+s+192); vt = _mm256_mul_pd(vx3, vo0); vc = _mm256_mul_pd(vt, vpinv); vc = _mm256_round_pd(vc,0x08); vc = _mm256_mul_pd(vc, vp); vt = _mm256_sub_pd(vt,vc); vx2 = _mm256_load_pd(out+offset+s+128); vx3 = _mm256_sub_pd(vx2, vt); //_mm256_store_pd(out+offset+s+192, vx3); vx2 = _mm256_add_pd(vx2, vt); //_mm256_store_pd(out+offset+s+128, vx2); vx5 = _mm256_load_pd(out+offset+s+320); vt = _mm256_mul_pd(vx5, vo0); vc = _mm256_mul_pd(vt, vpinv); vc = _mm256_round_pd(vc,0x08); vc = _mm256_mul_pd(vc, vp); vt = _mm256_sub_pd(vt,vc); vx4 = _mm256_load_pd(out+offset+s+256); vx5 = _mm256_sub_pd(vx4, vt); //_mm256_store_pd(out+offset+s+320, vx5); vx4 = _mm256_add_pd(vx4, vt); //_mm256_store_pd(out+offset+s+256, vx4); vx7 = _mm256_load_pd(out+offset+s+448); vt = _mm256_mul_pd(vx7, vo0); vc = _mm256_mul_pd(vt, vpinv); vc = _mm256_round_pd(vc,0x08); vc = _mm256_mul_pd(vc, vp); vt = _mm256_sub_pd(vt,vc); vx6 = _mm256_load_pd(out+offset+s+384); vx7 = _mm256_sub_pd(vx6, vt); //_mm256_store_pd(out+offset+s+448, vx7); vx6 = _mm256_add_pd(vx6, vt); //_mm256_store_pd(out+offset+s+384, vx6); //vx2 = _mm256_load_pd(out+offset+s+128); vt = _mm256_mul_pd(vx2, vo10); vc = _mm256_mul_pd(vt, vpinv); vc = _mm256_round_pd(vc,0x08); vc = _mm256_mul_pd(vc, vp); vt = _mm256_sub_pd(vt,vc); //vx0 = _mm256_load_pd(out+offset+s+0); vx2 = _mm256_sub_pd(vx0, vt); //_mm256_store_pd(out+offset+s+128, vx2); vx0 = _mm256_add_pd(vx0, vt); //_mm256_store_pd(out+offset+s+0, vx0); //vx3 = _mm256_load_pd(out+offset+s+192); vt = _mm256_mul_pd(vx3, vo11); vc = _mm256_mul_pd(vt, vpinv); vc = _mm256_round_pd(vc,0x08); vc = _mm256_mul_pd(vc, vp); vt = _mm256_sub_pd(vt,vc); //vx1 = _mm256_load_pd(out+offset+s+64); vx3 = _mm256_sub_pd(vx1, vt); //_mm256_store_pd(out+offset+s+192, vx3); vx1 = _mm256_add_pd(vx1, vt); //_mm256_store_pd(out+offset+s+64, vx1); //vx6 = _mm256_load_pd(out+offset+s+384); vt = _mm256_mul_pd(vx6, vo10); vc = _mm256_mul_pd(vt, vpinv); vc = _mm256_round_pd(vc,0x08); vc = _mm256_mul_pd(vc, vp); vt = _mm256_sub_pd(vt,vc); //vx4 = _mm256_load_pd(out+offset+s+256); vx6 = _mm256_sub_pd(vx4, vt); //_mm256_store_pd(out+offset+s+384, vx6); vx4 = _mm256_add_pd(vx4, vt); //_mm256_store_pd(out+offset+s+256, vx4); //vx7 = _mm256_load_pd(out+offset+s+448); vt = _mm256_mul_pd(vx7, vo11); vc = _mm256_mul_pd(vt, vpinv); vc = _mm256_round_pd(vc,0x08); vc = _mm256_mul_pd(vc, vp); vt = _mm256_sub_pd(vt,vc); //vx5 = _mm256_load_pd(out+offset+s+320); vx7 = _mm256_sub_pd(vx5, vt); //_mm256_store_pd(out+offset+s+448, vx7); vx5 = _mm256_add_pd(vx5, vt); //_mm256_store_pd(out+offset+s+320, vx5); //vx4 = _mm256_load_pd(out+offset+s+256); vo20 = _mm256_load_pd(o+pos+offset+192); vt = _mm256_mul_pd(vx4, vo20); vc = _mm256_mul_pd(vt, vpinv); vc = _mm256_round_pd(vc,0x08); vc = _mm256_mul_pd(vc, vp); vt = _mm256_sub_pd(vt,vc); //vx0 = _mm256_load_pd(out+offset+s+0); vx4 = _mm256_sub_pd(vx0, vt); _mm256_store_pd(out+offset+s+256, vx4); vx0 = _mm256_add_pd(vx0, vt); _mm256_store_pd(out+offset+s+0, vx0); //vx5 = _mm256_load_pd(out+offset+s+320); vo21 = _mm256_load_pd(o+pos+offset+256); vt = _mm256_mul_pd(vx5, vo21); vc = _mm256_mul_pd(vt, vpinv); vc = _mm256_round_pd(vc,0x08); vc = _mm256_mul_pd(vc, vp); vt = _mm256_sub_pd(vt,vc); //vx1 = _mm256_load_pd(out+offset+s+64); vx5 = _mm256_sub_pd(vx1, vt); _mm256_store_pd(out+offset+s+320, vx5); vx1 = _mm256_add_pd(vx1, vt); _mm256_store_pd(out+offset+s+64, vx1); //vx6 = _mm256_load_pd(out+offset+s+384); vo22 = _mm256_load_pd(o+pos+offset+320); 
vt = _mm256_mul_pd(vx6, vo22); vc = _mm256_mul_pd(vt, vpinv); vc = _mm256_round_pd(vc,0x08); vc = _mm256_mul_pd(vc, vp); vt = _mm256_sub_pd(vt,vc); //vx2 = _mm256_load_pd(out+offset+s+128); vx6 = _mm256_sub_pd(vx2, vt); _mm256_store_pd(out+offset+s+384, vx6); vx2 = _mm256_add_pd(vx2, vt); _mm256_store_pd(out+offset+s+128, vx2); //vx7 = _mm256_load_pd(out+offset+s+448); vo23 = _mm256_load_pd(o+pos+offset+384); vt = _mm256_mul_pd(vx7, vo23); vc = _mm256_mul_pd(vt, vpinv); vc = _mm256_round_pd(vc,0x08); vc = _mm256_mul_pd(vc, vp); vt = _mm256_sub_pd(vt,vc); //vx3 = _mm256_load_pd(out+offset+s+192); vx7 = _mm256_sub_pd(vx3, vt); _mm256_store_pd(out+offset+s+448, vx7); vx3 = _mm256_add_pd(vx3, vt); _mm256_store_pd(out+offset+s+192, vx3); } } }
void rnn_int_d8x4_var2( int k, double *aa, double *a, double *bb, double *b, double *c, aux_t *aux ) { int i; double neg2 = -2.0; double dzero = 0.0; v4df_t c03_0, c03_1, c03_2, c03_3; v4df_t c47_0, c47_1, c47_2, c47_3; v4df_t tmpc03_0, tmpc03_1, tmpc03_2, tmpc03_3; v4df_t tmpc47_0, tmpc47_1, tmpc47_2, tmpc47_3; v4df_t c_tmp; v4df_t a03, a47; v4df_t A03, A47; // prefetched A v4df_t b0, b1, b2, b3; v4df_t B0; // prefetched B v4df_t aa_tmp, bb_tmp; int k_iter = k / 2; int k_left = k % 2; __asm__ volatile( "prefetcht0 0(%0) \n\t" : :"r"( a ) ); __asm__ volatile( "prefetcht2 0(%0) \n\t" : :"r"( aux->b_next ) ); __asm__ volatile( "prefetcht0 0(%0) \n\t" : :"r"( c ) ); c03_0.v = _mm256_setzero_pd(); c03_1.v = _mm256_setzero_pd(); c03_2.v = _mm256_setzero_pd(); c03_3.v = _mm256_setzero_pd(); c47_0.v = _mm256_setzero_pd(); c47_1.v = _mm256_setzero_pd(); c47_2.v = _mm256_setzero_pd(); c47_3.v = _mm256_setzero_pd(); // Load a03 a03.v = _mm256_load_pd( (double*)a ); // Load a47 a47.v = _mm256_load_pd( (double*)( a + 4 ) ); // Load (b0,b1,b2,b3) b0.v = _mm256_load_pd( (double*)b ); for ( i = 0; i < k_iter; ++i ) { __asm__ volatile( "prefetcht0 192(%0) \n\t" : :"r"(a) ); // Preload A03 A03.v = _mm256_load_pd( (double*)( a + 8 ) ); c_tmp.v = _mm256_mul_pd( a03.v , b0.v ); c03_0.v = _mm256_add_pd( c_tmp.v, c03_0.v ); c_tmp.v = _mm256_mul_pd( a47.v , b0.v ); c47_0.v = _mm256_add_pd( c_tmp.v, c47_0.v ); // Preload A47 A47.v = _mm256_load_pd( (double*)( a + 12 ) ); // Shuffle b ( 1, 0, 3, 2 ) b1.v = _mm256_shuffle_pd( b0.v, b0.v, 0x5 ); c_tmp.v = _mm256_mul_pd( a03.v , b1.v ); c03_1.v = _mm256_add_pd( c_tmp.v, c03_1.v ); c_tmp.v = _mm256_mul_pd( a47.v , b1.v ); c47_1.v = _mm256_add_pd( c_tmp.v, c47_1.v ); // Permute b ( 3, 2, 1, 0 ) b2.v = _mm256_permute2f128_pd( b1.v, b1.v, 0x1 ); // Preload B0 B0.v = _mm256_load_pd( (double*)( b + 4 ) ); c_tmp.v = _mm256_mul_pd( a03.v , b2.v ); c03_2.v = _mm256_add_pd( c_tmp.v, c03_2.v ); c_tmp.v = _mm256_mul_pd( a47.v , b2.v ); c47_2.v = _mm256_add_pd( c_tmp.v, c47_2.v ); // Shuffle b ( 3, 2, 1, 0 ) b3.v = _mm256_shuffle_pd( b2.v, b2.v, 0x5 ); c_tmp.v = _mm256_mul_pd( a03.v , b3.v ); c03_3.v = _mm256_add_pd( c_tmp.v, c03_3.v ); c_tmp.v = _mm256_mul_pd( a47.v , b3.v ); c47_3.v = _mm256_add_pd( c_tmp.v, c47_3.v ); // Iteration #1 __asm__ volatile( "prefetcht0 512(%0) \n\t" : :"r"(a) ); // Preload a03 ( next iteration ) a03.v = _mm256_load_pd( (double*)( a + 16 ) ); c_tmp.v = _mm256_mul_pd( A03.v , B0.v ); c03_0.v = _mm256_add_pd( c_tmp.v, c03_0.v ); b1.v = _mm256_shuffle_pd( B0.v, B0.v, 0x5 ); c_tmp.v = _mm256_mul_pd( A47.v , B0.v ); c47_0.v = _mm256_add_pd( c_tmp.v, c47_0.v ); c_tmp.v = _mm256_mul_pd( A03.v , b1.v ); c03_1.v = _mm256_add_pd( c_tmp.v, c03_1.v ); // Preload a47 ( next iteration ) a47.v = _mm256_load_pd( (double*)( a + 20 ) ); // Permute b ( 3, 2, 1, 0 ) b2.v = _mm256_permute2f128_pd( b1.v, b1.v, 0x1 ); c_tmp.v = _mm256_mul_pd( A47.v , b1.v ); c47_1.v = _mm256_add_pd( c_tmp.v, c47_1.v ); c_tmp.v = _mm256_mul_pd( A03.v , b2.v ); c03_2.v = _mm256_add_pd( c_tmp.v, c03_2.v ); // Shuffle b ( 3, 2, 1, 0 ) b3.v = _mm256_shuffle_pd( b2.v, b2.v, 0x5 ); c_tmp.v = _mm256_mul_pd( A47.v , b2.v ); c47_2.v = _mm256_add_pd( c_tmp.v, c47_2.v ); // Load b0 ( next iteration ) b0.v = _mm256_load_pd( (double*)( b + 8 ) ); c_tmp.v = _mm256_mul_pd( A03.v , b3.v ); c03_3.v = _mm256_add_pd( c_tmp.v, c03_3.v ); c_tmp.v = _mm256_mul_pd( A47.v , b3.v ); c47_3.v = _mm256_add_pd( c_tmp.v, c47_3.v ); a += 16; b += 8; } for ( i = 0; i < k_left; ++i ) { a03.v = _mm256_load_pd( (double*)a ); 
//printf( "a03 = %lf, %lf, %lf, %lf\n", a03.d[0], a03.d[1], a03.d[2], a03.d[3] ); a47.v = _mm256_load_pd( (double*)( a + 4 ) ); //printf( "a47 = %lf, %lf, %lf, %lf\n", a47.d[0], a47.d[1], a47.d[2], a47.d[3] ); b0.v = _mm256_load_pd( (double*)b ); //printf( "b0 = %lf, %lf, %lf, %lf\n", b0.d[0], b0.d[1], b0.d[2], b0.d[3] ); c_tmp.v = _mm256_mul_pd( a03.v , b0.v ); c03_0.v = _mm256_add_pd( c_tmp.v, c03_0.v ); c_tmp.v = _mm256_mul_pd( a47.v , b0.v ); c47_0.v = _mm256_add_pd( c_tmp.v, c47_0.v ); // Shuffle b ( 1, 0, 3, 2 ) b1.v = _mm256_shuffle_pd( b0.v, b0.v, 0x5 ); c_tmp.v = _mm256_mul_pd( a03.v , b1.v ); c03_1.v = _mm256_add_pd( c_tmp.v, c03_1.v ); c_tmp.v = _mm256_mul_pd( a47.v , b1.v ); c47_1.v = _mm256_add_pd( c_tmp.v, c47_1.v ); // Permute b ( 3, 2, 1, 0 ) b2.v = _mm256_permute2f128_pd( b1.v, b1.v, 0x1 ); c_tmp.v = _mm256_mul_pd( a03.v , b2.v ); c03_2.v = _mm256_add_pd( c_tmp.v, c03_2.v ); c_tmp.v = _mm256_mul_pd( a47.v , b2.v ); c47_2.v = _mm256_add_pd( c_tmp.v, c47_2.v ); // Shuffle b ( 3, 2, 1, 0 ) b3.v = _mm256_shuffle_pd( b2.v, b2.v, 0x5 ); c_tmp.v = _mm256_mul_pd( a03.v , b3.v ); c03_3.v = _mm256_add_pd( c_tmp.v, c03_3.v ); c_tmp.v = _mm256_mul_pd( a47.v , b3.v ); c47_3.v = _mm256_add_pd( c_tmp.v, c47_3.v ); a += 8; b += 4; } // Prefetch aa and bb __asm__ volatile( "prefetcht0 0(%0) \n\t" : :"r"( aa ) ); __asm__ volatile( "prefetcht0 0(%0) \n\t" : :"r"( bb ) ); tmpc03_0.v = _mm256_blend_pd( c03_0.v, c03_1.v, 0x6 ); tmpc03_1.v = _mm256_blend_pd( c03_1.v, c03_0.v, 0x6 ); tmpc03_2.v = _mm256_blend_pd( c03_2.v, c03_3.v, 0x6 ); tmpc03_3.v = _mm256_blend_pd( c03_3.v, c03_2.v, 0x6 ); tmpc47_0.v = _mm256_blend_pd( c47_0.v, c47_1.v, 0x6 ); tmpc47_1.v = _mm256_blend_pd( c47_1.v, c47_0.v, 0x6 ); tmpc47_2.v = _mm256_blend_pd( c47_2.v, c47_3.v, 0x6 ); tmpc47_3.v = _mm256_blend_pd( c47_3.v, c47_2.v, 0x6 ); c03_0.v = _mm256_permute2f128_pd( tmpc03_0.v, tmpc03_2.v, 0x30 ); c03_3.v = _mm256_permute2f128_pd( tmpc03_2.v, tmpc03_0.v, 0x30 ); c03_1.v = _mm256_permute2f128_pd( tmpc03_1.v, tmpc03_3.v, 0x30 ); c03_2.v = _mm256_permute2f128_pd( tmpc03_3.v, tmpc03_1.v, 0x30 ); c47_0.v = _mm256_permute2f128_pd( tmpc47_0.v, tmpc47_2.v, 0x30 ); c47_3.v = _mm256_permute2f128_pd( tmpc47_2.v, tmpc47_0.v, 0x30 ); c47_1.v = _mm256_permute2f128_pd( tmpc47_1.v, tmpc47_3.v, 0x30 ); c47_2.v = _mm256_permute2f128_pd( tmpc47_3.v, tmpc47_1.v, 0x30 ); //printf( "rank-k\n" ); //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[0], c03_1.d[0], c03_2.d[0], c03_3.d[0] ); //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[1], c03_1.d[1], c03_2.d[1], c03_3.d[1] ); //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[2], c03_1.d[2], c03_2.d[2], c03_3.d[2] ); //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[3], c03_1.d[3], c03_2.d[3], c03_3.d[3] ); //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[0], c47_1.d[0], c47_2.d[0], c47_3.d[0] ); //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[1], c47_1.d[1], c47_2.d[1], c47_3.d[1] ); //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[2], c47_1.d[2], c47_2.d[2], c47_3.d[2] ); //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[3], c47_1.d[3], c47_2.d[3], c47_3.d[3] ); __asm__ volatile( "prefetcht0 0(%0) \n\t" : :"r"( aux->I ) ); __asm__ volatile( "prefetcht0 0(%0) \n\t" : :"r"( aux->D ) ); //for ( i = 0; i < k; i++ ) { // a03.v = _mm256_load_pd( (double*)a ); // a47.v = _mm256_load_pd( (double*)( a + 4 ) ); // b0.v = _mm256_broadcast_sd( (double*)b ); // b1.v = _mm256_broadcast_sd( (double*)( b + 1 ) ); // b2.v = _mm256_broadcast_sd( (double*)( b + 2 ) ); // b3.v = _mm256_broadcast_sd( (double*)( b + 3 ) ); // a += DKS_MR; // b += DKS_NR; // c_tmp.v = 
_mm256_mul_pd( a03.v , b0.v ); // c03_0.v = _mm256_add_pd( c_tmp.v, c03_0.v ); // c_tmp.v = _mm256_mul_pd( a03.v , b1.v ); // c03_1.v = _mm256_add_pd( c_tmp.v, c03_1.v ); // c_tmp.v = _mm256_mul_pd( a03.v , b2.v ); // c03_2.v = _mm256_add_pd( c_tmp.v, c03_2.v ); // c_tmp.v = _mm256_mul_pd( a03.v , b3.v ); // c03_3.v = _mm256_add_pd( c_tmp.v, c03_3.v ); // c_tmp.v = _mm256_mul_pd( a47.v , b0.v ); // c47_0.v = _mm256_add_pd( c_tmp.v, c47_0.v ); // c_tmp.v = _mm256_mul_pd( a47.v , b1.v ); // c47_1.v = _mm256_add_pd( c_tmp.v, c47_1.v ); // c_tmp.v = _mm256_mul_pd( a47.v , b2.v ); // c47_2.v = _mm256_add_pd( c_tmp.v, c47_2.v ); // c_tmp.v = _mm256_mul_pd( a47.v , b3.v ); // c47_3.v = _mm256_add_pd( c_tmp.v, c47_3.v ); //} aa_tmp.v = _mm256_broadcast_sd( &neg2 ); //c03_0.v = _mm256_mul_pd( aa_tmp.v, c03_0.v ); //c03_1.v = _mm256_mul_pd( aa_tmp.v, c03_1.v ); //c03_2.v = _mm256_mul_pd( aa_tmp.v, c03_2.v ); //c03_3.v = _mm256_mul_pd( aa_tmp.v, c03_3.v ); //c47_0.v = _mm256_mul_pd( aa_tmp.v, c47_0.v ); //c47_1.v = _mm256_mul_pd( aa_tmp.v, c47_1.v ); //c47_2.v = _mm256_mul_pd( aa_tmp.v, c47_2.v ); //c47_3.v = _mm256_mul_pd( aa_tmp.v, c47_3.v ); // c03_0.v = _mm256_mul_pd( aa_tmp.v, c03_0.v ); c03_1.v = _mm256_mul_pd( aa_tmp.v, c03_1.v ); c03_2.v = _mm256_mul_pd( aa_tmp.v, c03_2.v ); c03_3.v = _mm256_mul_pd( aa_tmp.v, c03_3.v ); c47_0.v = _mm256_mul_pd( aa_tmp.v, c47_0.v ); c47_1.v = _mm256_mul_pd( aa_tmp.v, c47_1.v ); c47_2.v = _mm256_mul_pd( aa_tmp.v, c47_2.v ); c47_3.v = _mm256_mul_pd( aa_tmp.v, c47_3.v ); //printf( "scale -2 \n" ); //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[0], c03_1.d[0], c03_2.d[0], c03_3.d[0] ); //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[1], c03_1.d[1], c03_2.d[1], c03_3.d[1] ); //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[2], c03_1.d[2], c03_2.d[2], c03_3.d[2] ); //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[3], c03_1.d[3], c03_2.d[3], c03_3.d[3] ); //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[0], c47_1.d[0], c47_2.d[0], c47_3.d[0] ); //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[1], c47_1.d[1], c47_2.d[1], c47_3.d[1] ); //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[2], c47_1.d[2], c47_2.d[2], c47_3.d[2] ); //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[3], c47_1.d[3], c47_2.d[3], c47_3.d[3] ); aa_tmp.v = _mm256_load_pd( (double*)aa ); c03_0.v = _mm256_add_pd( aa_tmp.v, c03_0.v ); c03_1.v = _mm256_add_pd( aa_tmp.v, c03_1.v ); c03_2.v = _mm256_add_pd( aa_tmp.v, c03_2.v ); c03_3.v = _mm256_add_pd( aa_tmp.v, c03_3.v ); //printf( "aa03 = %lf, %lf, %lf, %lf\n", aa_tmp.d[0], aa_tmp.d[1], aa_tmp.d[2], aa_tmp.d[3] ); //printf( "bb03 = %lf, %lf, %lf, %lf\n", bb[ 0 ], bb[ 1 ], bb[ 2 ], bb[ 3 ] ); aa_tmp.v = _mm256_load_pd( (double*)( aa + 4 ) ); c47_0.v = _mm256_add_pd( aa_tmp.v, c47_0.v ); c47_1.v = _mm256_add_pd( aa_tmp.v, c47_1.v ); c47_2.v = _mm256_add_pd( aa_tmp.v, c47_2.v ); c47_3.v = _mm256_add_pd( aa_tmp.v, c47_3.v ); //printf( "add a^2\n" ); //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[0], c03_1.d[0], c03_2.d[0], c03_3.d[0] ); //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[1], c03_1.d[1], c03_2.d[1], c03_3.d[1] ); //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[2], c03_1.d[2], c03_2.d[2], c03_3.d[2] ); //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[3], c03_1.d[3], c03_2.d[3], c03_3.d[3] ); //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[0], c47_1.d[0], c47_2.d[0], c47_3.d[0] ); //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[1], c47_1.d[1], c47_2.d[1], c47_3.d[1] ); //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[2], c47_1.d[2], c47_2.d[2], c47_3.d[2] ); //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[3], c47_1.d[3], c47_2.d[3], c47_3.d[3] ); 
bb_tmp.v = _mm256_broadcast_sd( (double*)bb ); c03_0.v = _mm256_add_pd( bb_tmp.v, c03_0.v ); c47_0.v = _mm256_add_pd( bb_tmp.v, c47_0.v ); bb_tmp.v = _mm256_broadcast_sd( (double*)( bb + 1 ) ); c03_1.v = _mm256_add_pd( bb_tmp.v, c03_1.v ); c47_1.v = _mm256_add_pd( bb_tmp.v, c47_1.v ); bb_tmp.v = _mm256_broadcast_sd( (double*)( bb + 2 ) ); c03_2.v = _mm256_add_pd( bb_tmp.v, c03_2.v ); c47_2.v = _mm256_add_pd( bb_tmp.v, c47_2.v ); bb_tmp.v = _mm256_broadcast_sd( (double*)( bb + 3 ) ); c03_3.v = _mm256_add_pd( bb_tmp.v, c03_3.v ); c47_3.v = _mm256_add_pd( bb_tmp.v, c47_3.v ); // Check if there is any illegle value c_tmp.v = _mm256_broadcast_sd( &dzero ); c03_0.v = _mm256_max_pd( c_tmp.v, c03_0.v ); c03_1.v = _mm256_max_pd( c_tmp.v, c03_1.v ); c03_2.v = _mm256_max_pd( c_tmp.v, c03_2.v ); c03_3.v = _mm256_max_pd( c_tmp.v, c03_3.v ); c47_0.v = _mm256_max_pd( c_tmp.v, c47_0.v ); c47_1.v = _mm256_max_pd( c_tmp.v, c47_1.v ); c47_2.v = _mm256_max_pd( c_tmp.v, c47_2.v ); c47_3.v = _mm256_max_pd( c_tmp.v, c47_3.v ); // Transpose c03/c47 _0, _1, _2, _3 to be the row vector tmpc03_0.v = _mm256_shuffle_pd( c03_0.v, c03_1.v, 0x0 ); tmpc03_1.v = _mm256_shuffle_pd( c03_0.v, c03_1.v, 0xF ); tmpc03_2.v = _mm256_shuffle_pd( c03_2.v, c03_3.v, 0x0 ); tmpc03_3.v = _mm256_shuffle_pd( c03_2.v, c03_3.v, 0xF ); tmpc47_0.v = _mm256_shuffle_pd( c47_0.v, c47_1.v, 0x0 ); tmpc47_1.v = _mm256_shuffle_pd( c47_0.v, c47_1.v, 0xF ); tmpc47_2.v = _mm256_shuffle_pd( c47_2.v, c47_3.v, 0x0 ); tmpc47_3.v = _mm256_shuffle_pd( c47_2.v, c47_3.v, 0xF ); c03_0.v = _mm256_permute2f128_pd( tmpc03_0.v, tmpc03_2.v, 0x20 ); c03_2.v = _mm256_permute2f128_pd( tmpc03_0.v, tmpc03_2.v, 0x31 ); c03_1.v = _mm256_permute2f128_pd( tmpc03_1.v, tmpc03_3.v, 0x20 ); c03_3.v = _mm256_permute2f128_pd( tmpc03_1.v, tmpc03_3.v, 0x31 ); c47_0.v = _mm256_permute2f128_pd( tmpc47_0.v, tmpc47_2.v, 0x20 ); c47_2.v = _mm256_permute2f128_pd( tmpc47_0.v, tmpc47_2.v, 0x31 ); c47_1.v = _mm256_permute2f128_pd( tmpc47_1.v, tmpc47_3.v, 0x20 ); c47_3.v = _mm256_permute2f128_pd( tmpc47_1.v, tmpc47_3.v, 0x31 ); // c03_0; // c03_1; // c03_2; // c03_3; // c47_0; // c47_1; // c47_2; // c47_3; _mm256_store_pd( c , c03_0.v ); _mm256_store_pd( c + 4, c03_1.v ); _mm256_store_pd( c + 8, c03_2.v ); _mm256_store_pd( c + 12, c03_3.v ); _mm256_store_pd( c + 16, c47_0.v ); _mm256_store_pd( c + 20, c47_1.v ); _mm256_store_pd( c + 24, c47_2.v ); _mm256_store_pd( c + 28, c47_3.v ); }
LIBXSMM_EXTERN_C LIBXSMM_RETARGETABLE void stream_vector_compscale( const double* i_a, const double* i_b, double* io_c, const int i_length) { int l_n = 0; int l_trip_prolog = 0; int l_trip_stream = 0; /* init the trip counts */ stream_init( i_length, (size_t)io_c, &l_trip_prolog, &l_trip_stream ); /* run the prologue */ for ( ; l_n < l_trip_prolog; l_n++ ) { io_c[l_n] = i_a[l_n]*i_b[l_n]; } /* run the bulk, hopefully using streaming stores */ #if defined(__SSE3__) && defined(__AVX__) && !defined(__AVX512F__) { /* we need manual unrolling as the compiler otherwise generates too many dependencies */ for ( ; l_n < l_trip_stream; l_n+=8 ) { __m256d vec_a_1, vec_b_1; __m256d vec_a_2, vec_b_2; vec_a_1 = _mm256_loadu_pd(&(i_a[l_n])); vec_a_2 = _mm256_loadu_pd(&(i_a[l_n+4])); vec_b_1 = _mm256_loadu_pd(&(i_b[l_n])); vec_b_2 = _mm256_loadu_pd(&(i_b[l_n+4])); #ifdef DISABLE_NONTEMPORAL_STORES _mm256_store_pd( &(io_c[l_n]), _mm256_mul_pd( vec_a_1, vec_b_1 ) ); _mm256_store_pd( &(io_c[l_n+4]), _mm256_mul_pd( vec_a_2, vec_b_2 ) ); #else _mm256_stream_pd( &(io_c[l_n]), _mm256_mul_pd( vec_a_1, vec_b_1 ) ); _mm256_stream_pd( &(io_c[l_n+4]), _mm256_mul_pd( vec_a_2, vec_b_2 ) ); #endif } } #elif defined(__SSE3__) && defined(__AVX__) && defined(__AVX512F__) { for ( ; l_n < l_trip_stream; l_n+=8 ) { __m512d vec_a, vec_b; vec_a = _mm512_loadu_pd(&(i_a[l_n])); vec_b = _mm512_loadu_pd(&(i_b[l_n])); #ifdef DISABLE_NONTEMPORAL_STORES _mm512_store_pd( &(io_c[l_n]), _mm512_mul_pd( vec_a, vec_b ) ); #else _mm512_stream_pd( &(io_c[l_n]), _mm512_mul_pd( vec_a, vec_b ) ); #endif } } #else for ( ; l_n < l_trip_stream; l_n++ ) { io_c[l_n] = i_a[l_n]*i_b[l_n]; } #endif /* run the epilogue */ for ( ; l_n < i_length; l_n++ ) { io_c[l_n] = i_a[l_n]*i_b[l_n]; } }
ALGEBRA_INLINE void vector_addm_double_aligned_32 (double* v1,double lambda,const double* v2,size_t n) { size_t k; __m256d l1 = _mm256_broadcast_sd(&lambda); __m256d l2 = _mm256_broadcast_sd(&lambda); __m256d l3 = _mm256_broadcast_sd(&lambda); __m256d l4 = _mm256_broadcast_sd(&lambda); size_t q = n / 16; size_t r = n % 16; if(q > 0) { if (ALGEBRA_IS_ALIGNED(v1) && ALGEBRA_IS_ALIGNED(v2)) { for (k=0;k<q;k++) { /* Charge 4 valeurs de chaque tableau */ __m256d i1 = _mm256_load_pd(v1); __m256d j1 = _mm256_load_pd(v2); __m256d i2 = _mm256_load_pd(v1+4); __m256d j2 = _mm256_load_pd(v2+4); __m256d i3 = _mm256_load_pd(v1+8); __m256d j3 = _mm256_load_pd(v2+8); __m256d i4 = _mm256_load_pd(v1+12); __m256d j4 = _mm256_load_pd(v2+12); /* multiplie */ j1 = _mm256_mul_pd(j1, l1); j2 = _mm256_mul_pd(j2, l2); j3 = _mm256_mul_pd(j3, l3); j4 = _mm256_mul_pd(j4, l4); /* Additionne */ i1 = _mm256_add_pd(i1,j1); i2 = _mm256_add_pd(i2,j2); i3 = _mm256_add_pd(i3,j3); i4 = _mm256_add_pd(i4,j4); /* Sauvegarde */ _mm256_store_pd(v1, i1); _mm256_store_pd(v1+4, i2); _mm256_store_pd(v1+8, i3); _mm256_store_pd(v1+12, i4); v1 += 16; v2 += 16; } } else { for (k=0;k<q;k++) { /* Charge 4 valeurs de chaque tableau */ __m256d i1 = _mm256_loadu_pd(v1); __m256d j1 = _mm256_loadu_pd(v2); __m256d i2 = _mm256_loadu_pd(v1+4); __m256d j2 = _mm256_loadu_pd(v2+4); __m256d i3 = _mm256_loadu_pd(v1+8); __m256d j3 = _mm256_loadu_pd(v2+8); __m256d i4 = _mm256_loadu_pd(v1+12); __m256d j4 = _mm256_loadu_pd(v2+12); /* multiplie */ j1 = _mm256_mul_pd(j1, l1); j2 = _mm256_mul_pd(j2, l2); j3 = _mm256_mul_pd(j3, l3); j4 = _mm256_mul_pd(j4, l4); /* Additionne */ i1 = _mm256_add_pd(i1,j1); i2 = _mm256_add_pd(i2,j2); i3 = _mm256_add_pd(i3,j3); i4 = _mm256_add_pd(i4,j4); /* Sauvegarde */ _mm256_storeu_pd(v1, i1); _mm256_storeu_pd(v1+4, i2); _mm256_storeu_pd(v1+8, i3); _mm256_storeu_pd(v1+12, i4); v1 += 16; v2 += 16; } } } for(k = 0 ; k<r ; k++) v1[k] += lambda*v2[k]; }
inline void vector4d::store_aligned(double* dst) const { _mm256_store_pd(dst, m_value); }
int main() { constexpr size_t N = 100 << 20; constexpr size_t N_pd = N/sizeof(double); constexpr size_t N_ps = N/sizeof(float); printf("Comparing std::log to mm256_log_pd/mm256_log_ps with %zuMiB input data\n", N>>20); double * data_pd = (double*)_mm_malloc(N_pd*sizeof(double), 32); float * data_ps = (float*) _mm_malloc(N_ps*sizeof(float), 32); double * outl_pd = (double*)_mm_malloc(N_pd*sizeof(double), 32); float * outl_ps = (float*) _mm_malloc(N_ps*sizeof(float), 32); double * outa_pd = (double*)_mm_malloc(N_pd*sizeof(double), 32); float * outa_ps = (float*) _mm_malloc(N_ps*sizeof(float), 32); double * err_pd = (double*)_mm_malloc(N_pd*sizeof(double), 32); float * err_ps = (float*) _mm_malloc(N_ps*sizeof(float), 32); size_t * idx_pd = (size_t*)malloc(N_pd*sizeof(size_t)); size_t * idx_ps = (size_t*)malloc(N_ps*sizeof(size_t)); if(data_pd == nullptr || data_ps == nullptr || outl_pd == nullptr || outl_ps == nullptr || outa_pd == nullptr || outa_ps == nullptr || err_pd == nullptr || err_ps == nullptr) { return 1; } auto rng = std::mt19937(hrc::now().time_since_epoch().count()); printf("Filling double input data... "); fflush(stdout); for(size_t i = 0; i < N_pd; ++i) { data_pd[i] = /*100.0 */ std::generate_canonical<double, 64>(rng); } printf("done\n"); printf("Filling float input data... "); fflush(stdout); for(size_t i = 0; i < N_ps; ++i) { data_ps[i] = /*100.0f */ std::generate_canonical<float, 32>(rng); } printf("done\n\n"); printf("Testing serial run:\n\n"); printf("Running std::log double... "); fflush(stdout); auto log_pd_s_time_start = hrc::now(); for(size_t i = 0; i < N_pd; i += 8) { outl_pd[i+0] = std::log(data_pd[i+0]); outl_pd[i+1] = std::log(data_pd[i+1]); outl_pd[i+2] = std::log(data_pd[i+2]); outl_pd[i+3] = std::log(data_pd[i+3]); outl_pd[i+4] = std::log(data_pd[i+4]); outl_pd[i+5] = std::log(data_pd[i+5]); outl_pd[i+6] = std::log(data_pd[i+6]); outl_pd[i+7] = std::log(data_pd[i+7]); } auto log_pd_s_time_end = hrc::now(); printf("done in %lums\n", std::chrono::duration_cast<ms>(log_pd_s_time_end - log_pd_s_time_start).count()); printf("Running mm256_log_pd... "); fflush(stdout); auto avx_pd_s_time_start = hrc::now(); for(size_t i = 0; i < N_pd; i += 8) { _mm256_store_pd(outa_pd+i+0, mm256_log_pd(_mm256_load_pd(data_pd+i+0))); _mm256_store_pd(outa_pd+i+4, mm256_log_pd(_mm256_load_pd(data_pd+i+4))); } auto avx_pd_s_time_end = hrc::now(); printf("done in %lums\n", std::chrono::duration_cast<ms>(avx_pd_s_time_end - avx_pd_s_time_start).count()); printf("Running std::log float... "); fflush(stdout); auto log_ps_s_time_start = hrc::now(); for(size_t i = 0; i < N_ps; i += 16) { outl_ps[i+ 0] = std::log(data_ps[i+ 0]); outl_ps[i+ 1] = std::log(data_ps[i+ 1]); outl_ps[i+ 2] = std::log(data_ps[i+ 2]); outl_ps[i+ 3] = std::log(data_ps[i+ 3]); outl_ps[i+ 4] = std::log(data_ps[i+ 4]); outl_ps[i+ 5] = std::log(data_ps[i+ 5]); outl_ps[i+ 6] = std::log(data_ps[i+ 6]); outl_ps[i+ 7] = std::log(data_ps[i+ 7]); outl_ps[i+ 8] = std::log(data_ps[i+ 8]); outl_ps[i+ 9] = std::log(data_ps[i+ 9]); outl_ps[i+10] = std::log(data_ps[i+10]); outl_ps[i+11] = std::log(data_ps[i+11]); outl_ps[i+12] = std::log(data_ps[i+12]); outl_ps[i+13] = std::log(data_ps[i+13]); outl_ps[i+14] = std::log(data_ps[i+14]); outl_ps[i+15] = std::log(data_ps[i+15]); } auto log_ps_s_time_end = hrc::now(); printf("done in %lums\n", std::chrono::duration_cast<ms>(log_ps_s_time_end - log_ps_s_time_start).count()); printf("Running mm256_log_ps... 
"); fflush(stdout); auto avx_ps_s_time_start = hrc::now(); for(size_t i = 0; i < N_ps; i += 16) { _mm256_store_ps(outa_ps+i+0, mm256_log_ps(_mm256_load_ps(data_ps+i+0))); _mm256_store_ps(outa_ps+i+8, mm256_log_ps(_mm256_load_ps(data_ps+i+8))); } auto avx_ps_s_time_end = hrc::now(); printf("done in %lums\n", std::chrono::duration_cast<ms>(avx_ps_s_time_end - avx_ps_s_time_start).count()); printf("\n\nTesting parallel run:\n\n"); printf("Running std::log double... "); fflush(stdout); auto log_pd_p_time_start = hrc::now(); #pragma omp parallel for for(size_t i = 0; i < N_pd; i += 8) { outl_pd[i+0] = std::log(data_pd[i+0]); outl_pd[i+1] = std::log(data_pd[i+1]); outl_pd[i+2] = std::log(data_pd[i+2]); outl_pd[i+3] = std::log(data_pd[i+3]); outl_pd[i+4] = std::log(data_pd[i+4]); outl_pd[i+5] = std::log(data_pd[i+5]); outl_pd[i+6] = std::log(data_pd[i+6]); outl_pd[i+7] = std::log(data_pd[i+7]); } auto log_pd_p_time_end = hrc::now(); printf("done in %lums\n", std::chrono::duration_cast<ms>(log_pd_p_time_end - log_pd_p_time_start).count()); printf("Running mm256_log_pd... "); fflush(stdout); auto avx_pd_p_time_start = hrc::now(); #pragma omp parallel for for(size_t i = 0; i < N_pd; i += 8) { _mm256_store_pd(outa_pd+i+0, mm256_log_pd(_mm256_load_pd(data_pd+i+0))); _mm256_store_pd(outa_pd+i+4, mm256_log_pd(_mm256_load_pd(data_pd+i+4))); } auto avx_pd_p_time_end = hrc::now(); printf("done in %lums\n", std::chrono::duration_cast<ms>(avx_pd_p_time_end - avx_pd_p_time_start).count()); printf("Running std::log float... "); fflush(stdout); auto log_ps_p_time_start = hrc::now(); #pragma omp parallel for for(size_t i = 0; i < N_ps; i += 16) { outl_ps[i+ 0] = std::log(data_ps[i+ 0]); outl_ps[i+ 1] = std::log(data_ps[i+ 1]); outl_ps[i+ 2] = std::log(data_ps[i+ 2]); outl_ps[i+ 3] = std::log(data_ps[i+ 3]); outl_ps[i+ 4] = std::log(data_ps[i+ 4]); outl_ps[i+ 5] = std::log(data_ps[i+ 5]); outl_ps[i+ 6] = std::log(data_ps[i+ 6]); outl_ps[i+ 7] = std::log(data_ps[i+ 7]); outl_ps[i+ 8] = std::log(data_ps[i+ 8]); outl_ps[i+ 9] = std::log(data_ps[i+ 9]); outl_ps[i+10] = std::log(data_ps[i+10]); outl_ps[i+11] = std::log(data_ps[i+11]); outl_ps[i+12] = std::log(data_ps[i+12]); outl_ps[i+13] = std::log(data_ps[i+13]); outl_ps[i+14] = std::log(data_ps[i+14]); outl_ps[i+15] = std::log(data_ps[i+15]); } auto log_ps_p_time_end = hrc::now(); printf("done in %lums\n", std::chrono::duration_cast<ms>(log_ps_p_time_end - log_ps_p_time_start).count()); printf("Running mm256_log_ps... "); fflush(stdout); auto avx_ps_p_time_start = hrc::now(); #pragma omp parallel for for(size_t i = 0; i < N_ps; i += 16) { _mm256_store_ps(outa_ps+i+0, mm256_log_ps(_mm256_load_ps(data_ps+i+0))); _mm256_store_ps(outa_ps+i+8, mm256_log_ps(_mm256_load_ps(data_ps+i+8))); } auto avx_ps_p_time_end = hrc::now(); printf("done in %lums\n", std::chrono::duration_cast<ms>(avx_ps_p_time_end - avx_ps_p_time_start).count()); printf("\nCalculating errors... 
"); fflush(stdout); #pragma omp parallel for for(size_t i = 0; i < N_pd; ++i) { err_pd[i] = std::abs(1.0 - outa_pd[i]/outl_pd[i]); } #pragma omp parallel for for(size_t i = 0; i < N_ps; ++i) { err_ps[i] = std::abs(1.0 - outa_ps[i]/outl_ps[i]); } #pragma omp parallel for for(size_t i = 0; i < N_pd; ++i) { idx_pd[i] = i; } #pragma omp parallel for for(size_t i = 0; i < N_ps; ++i) { idx_ps[i] = i; } std::sort(idx_pd, idx_pd+N_pd, [&](size_t a, size_t b){ return err_pd[a] < err_pd[b]; }); std::sort(idx_ps, idx_ps+N_ps, [&](size_t a, size_t b){ return err_ps[a] < err_ps[b]; }); printf("done\n"); printf("\n\nSummary:\n"); double lsd_s = std::chrono::duration_cast<sd>(log_pd_s_time_end - log_pd_s_time_start).count(); double asd_s = std::chrono::duration_cast<sd>(avx_pd_s_time_end - avx_pd_s_time_start).count(); double lss_s = std::chrono::duration_cast<sd>(log_ps_s_time_end - log_ps_s_time_start).count(); double ass_s = std::chrono::duration_cast<sd>(avx_ps_s_time_end - avx_ps_s_time_start).count(); double lpd_s = std::chrono::duration_cast<sd>(log_pd_p_time_end - log_pd_p_time_start).count(); double apd_s = std::chrono::duration_cast<sd>(avx_pd_p_time_end - avx_pd_p_time_start).count(); double lps_s = std::chrono::duration_cast<sd>(log_ps_p_time_end - log_ps_p_time_start).count(); double aps_s = std::chrono::duration_cast<sd>(avx_ps_p_time_end - avx_ps_p_time_start).count(); printf(" Algorithm | Data Type | parallel | time | speed | min rel err | max rel err | 90%% rel err\n"); printf("-----------------------------------------------------------------------------------------------------------\n"); printf(" std::log | double | false | %6.1f ms | %4.2f GiB/s\n", 1000.0 * lsd_s, N / lsd_s / (1<<30)); printf(" mm256_log_pd | double | false | %6.1f ms | %4.2f GiB/s | %e | %e | %e\n", 1000.0 * asd_s, N / asd_s / (1<<30), err_pd[idx_pd[0]], err_pd[idx_pd[N_pd-1]], err_pd[idx_pd[90*N_pd/100]]); printf(" std::log | float | false | %6.1f ms | %4.2f GiB/s\n", 1000.0 * lss_s, N / lss_s / (1<<30)); printf(" mm256_log_ps | float | false | %6.1f ms | %4.2f GiB/s | %e | %e | %e\n", 1000.0 * ass_s, N / ass_s / (1<<30), err_ps[idx_ps[0]], err_ps[idx_ps[N_ps-1]], err_ps[idx_ps[90*N_ps/100]]); printf(" std::log | double | true | %6.1f ms | %4.2f GiB/s\n", 1000.0 * lpd_s, N / lpd_s / (1<<30)); printf(" mm256_log_pd | double | true | %6.1f ms | %4.2f GiB/s | %e | %e | %e\n", 1000.0 * apd_s, N / apd_s / (1<<30), err_pd[idx_pd[0]], err_pd[idx_pd[N_pd-1]], err_pd[idx_pd[90*N_pd/100]]); printf(" std::log | float | true | %6.1f ms | %4.2f GiB/s\n", 1000.0 * lps_s, N / lps_s / (1<<30)); printf(" mm256_log_ps | float | true | %6.1f ms | %4.2f GiB/s | %e | %e | %e\n", 1000.0 * aps_s, N / aps_s / (1<<30), err_ps[idx_ps[0]], err_ps[idx_ps[N_ps-1]], err_ps[idx_ps[90*N_ps/100]]); _mm_free(data_pd); _mm_free(data_ps); _mm_free(outl_pd); _mm_free(outl_ps); _mm_free(outa_pd); _mm_free(outa_ps); _mm_free(err_pd); _mm_free(err_ps); free(idx_pd); free(idx_ps); return 0; }