void fill_lanes_naive(int N, int *iarr, int *jarr, int *marr, int *base, int *offs,
                      real *x, real *f, real rsq, void *data)
{
    __assume_aligned(iarr, 64);
    __assume_aligned(jarr, 64);
#pragma simd
    for (int idx = 0; idx < N; idx++) {
        const int i = iarr[idx];
        const int j = jarr[idx];
        const int M = marr[i];
        const real xi = x[i];
        const int *idxs = base + offs[i];
        real acc_fi = 0;
        const real xj = x[j];
        const real dxij = xi - xj;
        real acc_fj = 0;
        for (int k = 0; k < M; k++) {
            const int kk = idxs[k];
            const real xk = x[kk];
            const real dxik = xi - xk;
            if (dxik * dxik > rsq) continue;
            real fi = 0, fj = 0, fk = 0;
            compute_f(dxij, dxik, &fi, &fj, &fk);
            acc_fj += fj;
            acc_fi += fi;
            memory_reduce_add(&f[kk], fk);
        }
        memory_reduce_add(&f[j], acc_fj);
        memory_reduce_add(&f[i], acc_fi);
    }
}
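/* Hedged sketch, not part of the original source: the snippet above relies on two helpers
 * defined elsewhere. compute_f() evaluates the force contributions for the (i, j, k)
 * triple, and memory_reduce_add() is assumed to accumulate into f[] without races when
 * different lanes or threads scatter to the same index. A minimal definition under that
 * assumption: */
typedef double real;   /* assumed; the actual typedef lives elsewhere in the project */

static inline void memory_reduce_add(real *dst, real val)
{
#pragma omp atomic
    *dst += val;       /* atomic accumulate so concurrent scattered updates stay correct */
}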
int optimal(int weight, int idx, const item_t item[])
{
    if (idx < 0) {
        return 0;
    }
    __assume_aligned(memo, 64);
    if (weight < item[idx].weight) {
        if (idx - 1 >= 0) {
            if (memo[(idx - 1) * (capacity + 1) + weight] == 0) {
                memo[(idx - 1) * (capacity + 1) + weight] = optimal(weight, idx - 1, item);
            }
            return memo[(idx - 1) * (capacity + 1) + weight];
        }
        return 0;
    }
    if (idx - 1 >= 0) {
        if (memo[(idx - 1) * (capacity + 1) + weight] == 0) {
            memo[(idx - 1) * (capacity + 1) + weight] = optimal(weight, idx - 1, item);
        }
        if (weight - item[idx].weight >= 0) {
            if (memo[(idx - 1) * (capacity + 1) + (weight - item[idx].weight)] == 0) {
                memo[(idx - 1) * (capacity + 1) + (weight - item[idx].weight)] =
                    optimal(weight - item[idx].weight, idx - 1, item);
            }
            int aux = (memo[(idx - 1) * (capacity + 1) + weight] >=
                       (memo[(idx - 1) * (capacity + 1) + (weight - item[idx].weight)] + item[idx].value))
                          ? memo[(idx - 1) * (capacity + 1) + weight]
                          : (memo[(idx - 1) * (capacity + 1) + (weight - item[idx].weight)] + item[idx].value);
            return aux;
        } else {
            return (memo[(idx - 1) * (capacity + 1) + weight] >= item[idx].value)
                       ? memo[(idx - 1) * (capacity + 1) + weight]
                       : item[idx].value;
        }
    }
    return item[idx].value;
}
__attribute__((target(mic)))
double distancia(double *p1, double *p2, int DIM)
{
    int i = 0;
    double suma = 0.0;
    double aux, aux2;
    __assume_aligned(p1, 64);
    __assume_aligned(p2, 64);
#pragma vector aligned
#pragma ivdep
#pragma simd
    for (i = 0; i < DIM; i++) {
        suma += (p1[i] - p2[i]) * (p1[i] - p2[i]);
    }
    return sqrt(suma);
}
void do_work(value_type *v, size_t n)
{
    size_t i;
    __assume_aligned(v, 64);
    __assume(n % 16 == 0);
    for (i = 0; i < n; ++i) {
        v[i] += 1.2;
    }
}
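/* Hedged usage sketch (assumption, not from the original source): __assume_aligned(v, 64)
 * and __assume(n % 16 == 0) are promises to the compiler, so the caller must actually
 * provide a 64-byte-aligned buffer whose length is a multiple of 16; value_type is taken
 * to be double here for illustration. */
#include <stddef.h>
#include <immintrin.h>

typedef double value_type;   /* assumed; the real typedef is defined elsewhere */

void example_caller(void)
{
    size_t n = 1024;          /* multiple of 16, matching the __assume() hint */
    value_type *v = (value_type *)_mm_malloc(n * sizeof(value_type), 64);
    for (size_t i = 0; i < n; ++i)
        v[i] = 0.0;
    do_work(v, n);            /* both the alignment and trip-count promises hold */
    _mm_free(v);
}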
static void mvmt_do(const float *mvmt, float *g, float *dm, int nact, int ngleft, int ngtot)
{
#pragma omp parallel for
    for (int ia = 0; ia < nact; ia++) {
        register float tmp = dm[ia];
#ifdef __INTEL_COMPILER
#pragma unroll
        __assume_aligned(mvmt, 128);
        __assume_aligned(g, 128);
        __assume_aligned(dm, 128);
#endif
#ifdef __INTEL_COMPILER
#pragma vector aligned
#pragma ivdep
#pragma simd vectorlength(16) assert
#endif
        for (int ig = 0; ig < ngleft; ig++) {
            tmp += mvmt[ig + ia * ngtot] * g[ig];
        }
        dm[ia] = tmp;
    }
}
template <class T>
void MICStencil<T>::operator()(Matrix2D<T>& mtx, unsigned int nIters)
{
    unsigned int uDimWithHalo = mtx.GetNumRows();
    unsigned int uHaloWidth   = LINESIZE / sizeof(T);
    unsigned int uImgElements = uDimWithHalo * uDimWithHalo;

    __declspec(target(mic), align(LINESIZE))  T* pIn       = mtx.GetFlatData();
    __declspec(target(mic), align(sizeof(T))) T  wcenter   = this->wCenter;
    __declspec(target(mic), align(sizeof(T))) T  wdiag     = this->wDiagonal;
    __declspec(target(mic), align(sizeof(T))) T  wcardinal = this->wCardinal;

    #pragma offload target(mic) in(pIn:length(uImgElements) ALLOC RETAIN)
    {
        // Just copy pIn to compute the copy transfer time
    }

    #pragma offload target(mic) in(pIn:length(uImgElements) REUSE RETAIN) \
                                in(uImgElements) in(uDimWithHalo)         \
                                in(wcenter) in(wdiag) in(wcardinal)
    {
        unsigned int uRowPartitions = sysconf(_SC_NPROCESSORS_ONLN) / 4 - 1;
        unsigned int uColPartitions = 4; // Threads per core for KNC

        unsigned int uRowTileSize = (uDimWithHalo - 2 * uHaloWidth) / uRowPartitions;
        unsigned int uColTileSize = (uDimWithHalo - 2 * uHaloWidth) / uColPartitions;
        uRowTileSize = ((uDimWithHalo - 2 * uHaloWidth) % uRowPartitions > 0)
                           ? (uRowTileSize + 1) : (uRowTileSize);

        // Should use the "Halo Val" when filling the memory space
        T *pTmp  = (T*)pIn;
        T *pCrnt = (T*)memset((T*)_mm_malloc(uImgElements * sizeof(T), LINESIZE), 0,
                              uImgElements * sizeof(T));

        #pragma omp parallel firstprivate(pTmp, pCrnt, uRowTileSize, uColTileSize, uHaloWidth, uDimWithHalo)
        {
            unsigned int uThreadId  = omp_get_thread_num();

            unsigned int uRowTileId = uThreadId / uColPartitions;
            unsigned int uColTileId = uThreadId % uColPartitions;

            unsigned int uStartLine = uRowTileId * uRowTileSize + uHaloWidth;
            unsigned int uStartCol  = uColTileId * uColTileSize + uHaloWidth;

            unsigned int uEndLine = uStartLine + uRowTileSize;
            uEndLine = (uEndLine > (uDimWithHalo - uHaloWidth)) ? uDimWithHalo - uHaloWidth
                                                                : uEndLine;

            unsigned int uEndCol = uStartCol + uColTileSize;
            uEndCol = (uEndCol > (uDimWithHalo - uHaloWidth)) ? uDimWithHalo - uHaloWidth
                                                              : uEndCol;

            T cardinal0 = 0.0;
            T diagonal0 = 0.0;
            T center0   = 0.0;

            unsigned int cntIterations, i, j;
            for (cntIterations = 0; cntIterations < nIters; cntIterations++) {
                // Do Stencil Operation
                for (i = uStartLine; i < uEndLine; i++) {
                    T *pCenter = &pTmp [i * uDimWithHalo];
                    T *pTop    = pCenter - uDimWithHalo;
                    T *pBottom = pCenter + uDimWithHalo;
                    T *pOut    = &pCrnt[i * uDimWithHalo];

                    __assume_aligned(pCenter, 64);
                    __assume_aligned(pTop,    64);
                    __assume_aligned(pBottom, 64);
                    __assume_aligned(pOut,    64);

                    #pragma simd vectorlengthfor(float)
                    for (j = uStartCol; j < uEndCol; j++) {
                        cardinal0 = pCenter[j - 1] + pCenter[j + 1] + pTop[j] + pBottom[j];
                        diagonal0 = pTop[j - 1] + pTop[j + 1] + pBottom[j - 1] + pBottom[j + 1];
                        center0   = pCenter[j];

                        pOut[j] = wcardinal * cardinal0 + wdiag * diagonal0 + wcenter * center0;
                    }
                }

                #pragma omp barrier

                // Switch pointers
                T* pAux = pTmp;
                pTmp    = pCrnt;
                pCrnt   = pAux;
            } // End For
        } // End Parallel

        _mm_free(pCrnt);
    } // End Offload

    #pragma offload target(mic) out(pIn:length(uImgElements) REUSE FREE)
    {
        // Just copy back pIn
    }
}
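/* Hedged sketch (assumption, not shown in this file): the ALLOC / FREE / RETAIN / REUSE
 * tokens in the #pragma offload clauses above are presumed to be thin macros over the
 * alloc_if()/free_if() offload modifiers, along these lines: */
#define ALLOC  alloc_if(1)
#define FREE   free_if(1)
#define RETAIN free_if(0)
#define REUSE  alloc_if(0)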
// host stub function
void op_par_loop_update(char const *name, op_set set, op_arg arg0, op_arg arg1,
                        op_arg arg2, op_arg arg3, op_arg arg4) {

  int nargs = 5;
  op_arg args[5];

  args[0] = arg0;
  args[1] = arg1;
  args[2] = arg2;
  args[3] = arg3;
  args[4] = arg4;

  // create aligned pointers for dats
  ALIGNED_double const double *__restrict__ ptr0 = (double *)arg0.data;
  __assume_aligned(ptr0, double_ALIGN);
  ALIGNED_double double *__restrict__ ptr1 = (double *)arg1.data;
  __assume_aligned(ptr1, double_ALIGN);
  ALIGNED_double double *__restrict__ ptr2 = (double *)arg2.data;
  __assume_aligned(ptr2, double_ALIGN);
  ALIGNED_double const double *__restrict__ ptr3 = (double *)arg3.data;
  __assume_aligned(ptr3, double_ALIGN);

  // initialise timers
  double cpu_t1, cpu_t2, wall_t1, wall_t2;
  op_timing_realloc(4);
  op_timers_core(&cpu_t1, &wall_t1);

  if (OP_diags > 2) {
    printf(" kernel routine w/o indirection: update");
  }

  int exec_size = op_mpi_halo_exchanges(set, nargs, args);

  if (exec_size > 0) {

#ifdef VECTORIZE
#pragma novector
    for (int n = 0; n < (exec_size / SIMD_VEC) * SIMD_VEC; n += SIMD_VEC) {
      double dat4[SIMD_VEC] = {0.0};
#pragma simd
      for (int i = 0; i < SIMD_VEC; i++) {
        update(&(ptr0)[4 * (n + i)], &(ptr1)[4 * (n + i)], &(ptr2)[4 * (n + i)],
               &(ptr3)[1 * (n + i)], &dat4[i]);
      }
      for (int i = 0; i < SIMD_VEC; i++) {
        *(double *)arg4.data += dat4[i];
      }
    }
    // remainder
    for (int n = (exec_size / SIMD_VEC) * SIMD_VEC; n < exec_size; n++) {
#else
    for (int n = 0; n < exec_size; n++) {
#endif
      update(&(ptr0)[4 * n], &(ptr1)[4 * n], &(ptr2)[4 * n], &(ptr3)[1 * n],
             (double *)arg4.data);
    }
  }

  // combine reduction data
  op_mpi_reduce(&arg4, (double *)arg4.data);
  op_mpi_set_dirtybit(nargs, args);

  // update kernel record
  op_timers_core(&cpu_t2, &wall_t2);
  OP_kernels[4].name = name;
  OP_kernels[4].count += 1;
  OP_kernels[4].time += wall_t2 - wall_t1;
  OP_kernels[4].transfer += (float)set->size * arg0.size;
  OP_kernels[4].transfer += (float)set->size * arg1.size * 2.0f;
  OP_kernels[4].transfer += (float)set->size * arg2.size * 2.0f;
  OP_kernels[4].transfer += (float)set->size * arg3.size;
}
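/* Hedged sketch, not taken from this file: the elemental update() kernel called by the stub
 * above is assumed to update a 4-component state per set element (matching the 4-double
 * strides on ptr0..ptr2, the single double on ptr3) and to accumulate a scalar residual
 * into the arg4 reduction, roughly like this: */
static inline void update(const double *qold, double *q, double *res, const double *adt,
                          double *rms)
{
  double del, adti = 1.0 / (*adt);
  for (int n = 0; n < 4; n++) {
    del    = adti * res[n];   /* scaled residual for this component */
    q[n]   = qold[n] - del;   /* advance the solution from the old state */
    res[n] = 0.0;             /* reset the residual for the next sweep */
    *rms  += del * del;       /* contribute to the global reduction */
  }
}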
int main(void) {
    // std::cout<<std::endl<<" Compute inner product..."<<std::endl<<std::endl;

    // INIT VECTOR
    //double vec1 [_PBM_SIZE] __attribute__((aligned(_CBSIM_DBL_ALIGN_)));//__declspec(align(n))
    //double vec2 [_PBM_SIZE] __attribute__((aligned(_CBSIM_DBL_ALIGN_)));
    //__declspec(align(_CBSIM_DBL_ALIGN_)) double vec1 [_PBM_SIZE];
    //__declspec(align(_CBSIM_DBL_ALIGN_)) double vec2 [_PBM_SIZE];
    //double *vec1 = _aligned_malloc(_PBM_SIZE*sizeof *vec1,_CBSIM_DBL_ALIGN_);
    //double *vec2 = _aligned_malloc(_PBM_SIZE*sizeof *vec2,_CBSIM_DBL_ALIGN_);
    double *vec1 = (double *)_mm_malloc(sizeof(double) * _PBM_SIZE, 32);
    double *vec2 = (double *)_mm_malloc(sizeof(double) * _PBM_SIZE, 32);

    double result = 0.0;
    // tbb::tick_count t1, t2;
    int loopsToDo = 10000;

    for (int i = 0; i < _PBM_SIZE; i++) {
        vec1[i] = static_cast<double>(i) * 0.01;
        vec2[i] = static_cast<double>(i) * 0.01;
    }

    // SERIAL ***********************************************************************************
    // t1 = tbb::tick_count::now();
    for (int z = 0; z < loopsToDo; z++) {
        //__m256d ymm0;
        //__m256d ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7;//, ymm8, ymm9, ymm10, ymm11, ymm12, ymm13, ymm14, ymm15, ymm16, ymm17, ymm18;
        //ymm0 = _mm256_setzero_pd(); // accumulator
        //double res0 = 0.0, res1 = 0.0, res2 = 0.0, res3 = 0.0;
        //__m256d acc = _mm256_setzero_pd();
        //double res[4] __attribute__((aligned(_CBSIM_DBL_ALIGN_))) = {0.0, 0.0, 0.0, 0.0};
        result = 0.0;
        //double res[2] __attribute__((aligned(_CBSIM_DBL_ALIGN_))) = {0.0, 0.0};

        for (int i = 0; i < _PBM_SIZE; i += 8) {
            /*
            __m256d ymm1 = _mm256_load_pd(&vec1[i]);
            __m256d ymm2 = _mm256_load_pd(&vec2[i]);
            __m256d ymm3 = _mm256_mul_pd( ymm1, ymm2 );
            __m128d xmm1 = _mm256_extractf128_pd(ymm3,0);
            __m128d xmm2 = _mm256_extractf128_pd(ymm3,1);
            __m128d xmm3 = _mm_hadd_pd(xmm1,xmm2);
            _mm_store_pd(&res[0],xmm3);
            //_mm256_store_pd(&res[0],ymm12);
            result += (res[0] + res[1]);// + (res[2] + res[3]);
            */

            __assume_aligned(&vec1[0], 32);
            __assume_aligned(&vec2[0], 32);
            __m256d ymm1 = _mm256_load_pd(&vec1[i]);
            __m256d ymm2 = _mm256_load_pd(&vec2[i]);
            __m256d ymm3 = _mm256_mul_pd(ymm1, ymm2);
            __m256d ymm4 = _mm256_load_pd(&vec1[i + 4]);
            __m256d ymm5 = _mm256_load_pd(&vec2[i + 4]);
            __m256d ymm6 = _mm256_mul_pd(ymm4, ymm5);
            __m256d ymm7 = _mm256_add_pd(ymm3, ymm6);
            __m128d xmm1 = _mm256_extractf128_pd(ymm7, 0);
            __m128d xmm2 = _mm256_extractf128_pd(ymm7, 1);
            __m128d xmm3 = _mm_hadd_pd(xmm1, xmm2);
            double res[2] __attribute__((aligned(_CBSIM_DBL_ALIGN_))) = {0.0, 0.0};
            _mm_store_pd(&res[0], xmm3);
            //_mm256_store_pd(&res[0],ymm12);
            result += (res[0] + res[1]);// + (res[2] + res[3]);
            //__m256d ymm0 = _mm256_add_pd( ymm0, ymm7);

            /*
            //__assume_aligned(&vec1[0],32);
            //__assume_aligned(&vec2[0],32);
            __m256d ymm1 = _mm256_load_pd(&vec1[i]);
            __m256d ymm2 = _mm256_load_pd(&vec2[i]);
            __m256d ymm3 = _mm256_mul_pd( ymm1, ymm2 );
            __m256d ymm4 = _mm256_load_pd(&vec1[i+4]);
            __m256d ymm5 = _mm256_load_pd(&vec2[i+4]);
            //__m256d ymm6 = _mm256_mul_pd( ymm4, ymm5 );
            //__m256d ymm7 = _mm256_add_pd( ymm3, ymm6);
            __m256d ymm6 = _mm256_fmadd_pd (ymm4,ymm5,ymm3);
            //ymm0 = _mm256_add_pd( ymm0, ymm7);
            __m128d xmm1 = _mm256_extractf128_pd(ymm6,0);
            __m128d xmm2 = _mm256_extractf128_pd(ymm6,1);
            __m128d xmm3 = _mm_hadd_pd(xmm1,xmm2);
            _mm_store_pd(&res[0],xmm3);
            //_mm256_store_pd(&res[0],ymm12);
            result += (res[0] + res[1]);// + (res[2] + res[3]);
            //_mm256_store_pd(&res[0],ymm6);
            //result_SIMD_INTRINSICS += (res[0] + res[1]) + (res[2] + res[3]);
            */

            //#define _VER_AVX
#ifdef _VER_AVX
            __m256d ymm1 = _mm256_load_pd(&vec1[i]);
            __m256d ymm2 = _mm256_load_pd(&vec2[i]);
            __m256d ymm3 = _mm256_mul_pd(ymm1, ymm2);
            __m256d ymm4 = _mm256_load_pd(&vec1[i + 4]);
            __m256d ymm5 = _mm256_load_pd(&vec2[i + 4]);
            __m256d ymm6 = _mm256_mul_pd(ymm4, ymm5);
            __m256d ymm7 = _mm256_load_pd(&vec1[i + 8]);
            __m256d ymm8 = _mm256_load_pd(&vec2[i + 8]);
            __m256d ymm9 = _mm256_mul_pd(ymm7, ymm8);
            __m256d ymm10 = _mm256_load_pd(&vec1[i + 12]);
            __m256d ymm11 = _mm256_load_pd(&vec2[i + 12]);
            __m256d ymm12 = _mm256_mul_pd(ymm10, ymm11);
            __m256d ymm13 = _mm256_add_pd(ymm3, ymm6);
            __m256d ymm14 = _mm256_add_pd(ymm9, ymm12);
            __m256d ymm15 = _mm256_add_pd(ymm13, ymm14);
            __m128d xmm1 = _mm256_extractf128_pd(ymm15, 0);
            __m128d xmm2 = _mm256_extractf128_pd(ymm15, 1);
            __m128d xmm3 = _mm_hadd_pd(xmm1, xmm2);
            double res_SIMD_INTRINSICS[2] __attribute__((aligned(_CBSIM_DBL_ALIGN_))) = {0.0, 0.0};
            _mm_store_pd(&res_SIMD_INTRINSICS[0], xmm3);
            result += (res_SIMD_INTRINSICS[0] + res_SIMD_INTRINSICS[1]);
            //ymm0 = _mm256_add_pd( ymm0, ymm13);
            //ymm0 = _mm256_add_pd( ymm0, ymm14);
#endif

            //#define _VER_AVX2
#ifdef _VER_AVX2
            __m256d ymm1 = _mm256_load_pd(&vec1[i]);
            __m256d ymm2 = _mm256_load_pd(&vec1[i + 4]);
            __m256d ymm3 = _mm256_load_pd(&vec1[i + 8]);
            __m256d ymm4 = _mm256_load_pd(&vec1[i + 12]);
            //__m256d ymm13 = _mm256_load_pd(&vec1[i+16]);
            //__m256d ymm14 = _mm256_load_pd(&vec1[i+20]);
            //__m256d ymm15 = _mm256_load_pd(&vec1[i+24]);
            //__m256d ymm16 = _mm256_load_pd(&vec1[i+28]);
            __m256d ymm5 = _mm256_load_pd(&vec2[i]);
            __m256d ymm6 = _mm256_load_pd(&vec2[i + 4]);
            __m256d ymm7 = _mm256_load_pd(&vec2[i + 8]);
            __m256d ymm8 = _mm256_load_pd(&vec2[i + 12]);
            //__m256d ymm17 = _mm256_load_pd(&vec2[i+16]);
            //__m256d ymm18 = _mm256_load_pd(&vec2[i+20]);
            //__m256d ymm19 = _mm256_load_pd(&vec2[i+24]);
            //__m256d ymm20 = _mm256_load_pd(&vec2[i+28]);
            __m256d ymm9 = _mm256_mul_pd(ymm1, ymm5);
            __m256d ymm10 = _mm256_fmadd_pd(ymm2, ymm6, ymm9);
            //__m256d ymm11 = _mm256_mul_pd(ymm3,ymm7);
            __m256d ymm11 = _mm256_fmadd_pd(ymm3, ymm7, ymm10);
            __m256d ymm12 = _mm256_fmadd_pd(ymm4, ymm8, ymm11);
            //ymm12 = _mm256_hadd_pd(ymm10,ymm12);
            //__m256d ymm21 = _mm256_fmadd_pd(ymm13,ymm17,ymm12);
            //__m256d ymm22 = _mm256_fmadd_pd(ymm14,ymm18,ymm21);
            //__m256d ymm23 = _mm256_fmadd_pd(ymm15,ymm19,ymm22);
            //__m256d ymm24 = _mm256_fmadd_pd(ymm16,ymm20,ymm23);
            __m128d xmm1 = _mm256_extractf128_pd(ymm12, 0);
            __m128d xmm2 = _mm256_extractf128_pd(ymm12, 1);
            __m128d xmm3 = _mm_hadd_pd(xmm1, xmm2);
            double res[2] __attribute__((aligned(_CBSIM_DBL_ALIGN_))) = {0.0, 0.0};
            _mm_store_pd(&res[0], xmm3);
            //_mm256_store_pd(&res[0],ymm12);
            result += (res[0] + res[1]);// + (res[2] + res[3]);
#endif

            /*
            // Performing 4 dot products at one time
            ymm1 = _mm256_load_pd(&vec1[i]);     // x[0]
            ymm2 = _mm256_load_pd(&vec1[i+4]);   // x[1]
            ymm3 = _mm256_load_pd(&vec1[i+8]);   // x[2]
            ymm4 = _mm256_load_pd(&vec1[i+12]);  // x[3]
            ymm5 = _mm256_load_pd(&vec2[i]);     // y[0]
            ymm6 = _mm256_load_pd(&vec2[i+4]);   // y[1]
            ymm7 = _mm256_load_pd(&vec2[i+8]);   // y[2]
            ymm8 = _mm256_load_pd(&vec2[i+12]);  // y[3]
            ymm9  = _mm256_mul_pd( ymm1, ymm5 ); // xy0
            ymm10 = _mm256_mul_pd( ymm2, ymm6 ); // xy1
            ymm11 = _mm256_hadd_pd( ymm9, ymm10 );  // low to high: xy00+xy01 xy10+xy11 xy02+xy03 xy12+xy13
            ymm12 = _mm256_mul_pd( ymm3, ymm7 ); // xy2
            ymm13 = _mm256_mul_pd( ymm4, ymm8 ); // xy3
            ymm14 = _mm256_hadd_pd( ymm12, ymm13 ); // low to high: xy20+xy21 xy30+xy31 xy22+xy23 xy32+xy33
            ymm15 = _mm256_permute2f128_pd( ymm11, ymm14, 0x21 ); // low to high: xy02+xy03 xy12+xy13 xy20+xy21 xy30+xy31
            ymm1  = _mm256_blend_pd( ymm11, ymm14, 0b1100);       // low to high: xy00+xy01 xy10+xy11 xy22+xy23 xy32+xy33
            ymm2  = _mm256_add_pd( ymm15, ymm1 );
            ymm0  = _mm256_add_pd( ymm0, ymm2 );
            */

            /*
            __m256d x[4], y[4];
            x[0] = _mm256_load_pd(&vec1[i]);
            x[1] = _mm256_load_pd(&vec1[i+4]);
            x[2] = _mm256_load_pd(&vec1[i+8]);
            x[3] = _mm256_load_pd(&vec1[i+12]);
            y[0] = _mm256_load_pd(&vec2[i]);
            y[1] = _mm256_load_pd(&vec2[i+4]);
            y[2] = _mm256_load_pd(&vec2[i+8]);
            y[3] = _mm256_load_pd(&vec2[i+12]);
            __m256d xy0 = _mm256_mul_pd( x[0], y[0] );
            __m256d xy1 = _mm256_mul_pd( x[1], y[1] );
            // low to high: xy00+xy01 xy10+xy11 xy02+xy03 xy12+xy13
            __m256d temp01 = _mm256_hadd_pd( xy0, xy1 );
            __m256d xy2 = _mm256_mul_pd( x[2], y[2] );
            __m256d xy3 = _mm256_mul_pd( x[3], y[3] );
            // low to high: xy20+xy21 xy30+xy31 xy22+xy23 xy32+xy33
            __m256d temp23 = _mm256_hadd_pd( xy2, xy3 );
            // low to high: xy02+xy03 xy12+xy13 xy20+xy21 xy30+xy31
            __m256d swapped = _mm256_permute2f128_pd( temp01, temp23, 0x21 );
            // low to high: xy00+xy01 xy10+xy11 xy22+xy23 xy32+xy33
            __m256d blended = _mm256_blend_pd(temp01, temp23, 0b1100);
            __m256d dotproduct = _mm256_add_pd( swapped, blended );
            */
            //ymm0 = _mm256_add_pd(ymm0,dotproduct);
            /*
            __m128d xmm1 = _mm256_extractf128_pd(dotproduct,0);
            __m128d xmm2 = _mm256_extractf128_pd(dotproduct,1);
            __m128d xmm3 = _mm_hadd_pd(xmm1,xmm2);
            double res[2] __attribute__((aligned(_CBSIM_DBL_ALIGN_))) = {0.0, 0.0};
            _mm_store_pd(&res[0],xmm3);
            //_mm256_store_pd(&res[0],ymm12);
            result += (res[0] + res[1]);// + (res[2] + res[3]);
            */
            // _mm256_store_pd(&res[0],dotproduct);
            // result += (res[0] + res[1]) + (res[2] + res[3]);
            //result_SIMD_INTRINSICS += dotproduct[0] + dotproduct[1] + dotproduct[2] + dotproduct[3];

            //double res[4] __attribute__((aligned(_CBSIM_DBL_ALIGN_)));
            //_mm256_store_pd(&res[0],ymm0);
            //result_SIMD_INTRINSICS += res[0] + res[1] + res[2] + res[3];
            //double* res = (double*)&ymm0;
            //result_SIMD_INTRINSICS += res[0] + res[1] + res[2] + res[3];
        }
        //double* res = (double*)&ymm0;
        //result_SIMD_INTRINSICS += res[0] + res[1] + res[2] + res[3];
        //double res[4] __attribute__((aligned(_CBSIM_DBL_ALIGN_)));
        //_mm256_store_pd(&res[0],ymm0);
        //result_SIMD_INTRINSICS += res[0] + res[1] + res[2] + res[3];
    }
    // t2 = tbb::tick_count::now();
    // double exec_time = 1000.0*(t2-t1).seconds();

    //std::cout << std::setiosflags(std::ios::fixed) << std::setprecision(5);
    std::cout << std::endl << "RESULTS: " << std::endl;
    std::cout << "result_intrin ----------: " << result << std::endl;
    //std::cout<<"result_intrin ----------: "<< result << ", time: " << 1000.0*(t2-t1).seconds() << " ms" << std::endl;
    std::cout << std::endl << "Program end. " << std::endl << std::endl;

    return 0;
}
void set_element(int index, int res)
{
    __assume_aligned(memo, 64);
    memo[index] = res;
}
int get_element(int index)
{
    __assume_aligned(memo, 64);
    return memo[index];
}
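/* Hedged sketch, not present in the source: optimal(), set_element() and get_element() all
 * rely on a global memo table of nitems x (capacity+1) ints plus a global capacity. For the
 * __assume_aligned(memo, 64) hints to be valid, that table has to be allocated with 64-byte
 * alignment; init_memo() below is a hypothetical helper illustrating one way to do that. */
#include <string.h>
#include <immintrin.h>

static int capacity;   /* knapsack capacity, read by optimal() */
static int *memo;      /* memoization table, 64-byte aligned */

static void init_memo(int nitems, int cap)
{
    capacity = cap;
    memo = (int *)_mm_malloc((size_t)nitems * (cap + 1) * sizeof(int), 64);
    memset(memo, 0, (size_t)nitems * (cap + 1) * sizeof(int));   /* 0 = "not computed yet" */
}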