int main(int argc, char *argv[])
{
    // Print information on CPU optimization in effect
    MKLVersion ver;
    MKLGetVersion(&ver);
    printf("Processor optimization: %s\n", ver.Processor);

    omp_set_num_threads(8);
    mkl_set_dynamic(true);

    unsigned short A     = (unsigned short) std::strtoul(argv[1], (char **) 0, 10);
    unsigned short K_min = (unsigned short) std::strtoul(argv[2], (char **) 0, 10);
    unsigned short K_max = (unsigned short) std::strtoul(argv[3], (char **) 0, 10);
    unsigned short P     = (unsigned short) std::strtoul(argv[4], (char **) 0, 10);
    unsigned short Parita_Orbitale = (unsigned short) std::strtoul(argv[5], (char **) 0, 10); // 0 = even; 1 = odd; 2 = all

    unsigned short modo_cluster = 0;
    if (A == 4) {
        if (argc == 7) {
            modo_cluster = (unsigned short) std::strtoul(argv[6], (char **) 0, 10); // set to 1 to use the H mode for the 4-body case
        }
    }

    init_main_0(A, K_min, K_max, P, modo_cluster, Parita_Orbitale);
    return 0;
}
double* ttm(double *X, const int Dim, long int *XDimSize,
            double *U, long int *UDimSize, char tflag,
            double *Y, long int *YDimSize,
            int ModeComn, int *ModeCom, int ModeComLen)
{
    // Decide transa and transb according to tflag and the relationship
    // between ModeComn and ModeCom.
    CBLAS_TRANSPOSE transa = CblasNoTrans;
    CBLAS_TRANSPOSE transb = CblasNoTrans;
    long int m, n, k;
    double alpha, beta;
    long int rsu, csu, rsx, csx, rsy, csy;
    long int XLoopStride = 0, YLoopStride = 0;
    alpha = 1.0;
    beta = 1.0;

    /**** Generated TM algo according to different modes and dimensions ****/
    m = 10; n = 1000; k = 1000;
    rsu = 1000; csu = 1;
    rsx = 1000; csx = 1;
    rsy = 1000; csy = 1;

    #pragma omp parallel for default(shared) schedule(static) num_threads(1)
    for (long int i0 = 0; i0 < 1000; i0++) {
        mkl_set_dynamic(0);
        mkl_set_num_threads_local(8);
        XLoopStride = i0 * 1000000;
        YLoopStride = i0 * 10000;
        cblas_dgemm(CblasRowMajor, transa, transb, m, n, k,
                    alpha, U, rsu, X + XLoopStride, rsx,
                    beta, Y + YLoopStride, rsy);
    }
    return Y;
}
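// A minimal, self-contained sketch of the same batched row-major dgemm pattern
// that ttm() above uses, shrunk to toy sizes so it can actually be allocated
// and run. The dimensions (m = 2, n = 4, k = 3, nslices = 5) and the function
// name ttm_pattern_demo are illustrative assumptions, not values from the
// generated code above.
#include <vector>
#include <cstdio>
#include <mkl.h>

int ttm_pattern_demo()
{
    const long int m = 2, n = 4, k = 3, nslices = 5;
    std::vector<double> U(m * k, 1.0);            // m x k factor, row-major
    std::vector<double> X(nslices * k * n, 1.0);  // nslices slices, each k x n
    std::vector<double> Y(nslices * m * n, 0.0);  // nslices slices, each m x n

    mkl_set_dynamic(0);  // keep MKL's thread count fixed inside the loop
    for (long int i0 = 0; i0 < nslices; i0++) {
        cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, m, n, k,
                    1.0, U.data(), k,
                    X.data() + i0 * k * n, n,
                    0.0, Y.data() + i0 * m * n, n);
    }
    printf("Y[0] = %f\n", Y[0]);  // expect k = 3.0 for all-ones inputs
    return 0;
}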
int main(int argc, char *argv[])
{
    if (argc == 1) {
        std::cerr << "Usage: A m_max K_min K_max nucleons S_tot equal_masses [H_mode]" << std::endl;
        return 1;
    }

    // Print information on CPU optimization in effect
    MKLVersion ver;
    MKLGetVersion(&ver);
    printf("Processor optimization: %s\n", ver.Processor);

    omp_set_num_threads(8);
    mkl_set_dynamic(true);

    unsigned short A     = (unsigned short) std::strtoul(argv[1], (char **) 0, 10);
    unsigned short m_max = (unsigned short) std::strtoul(argv[2], (char **) 0, 10);
    unsigned short K_min = (unsigned short) std::strtoul(argv[3], (char **) 0, 10);
    unsigned short K_max = (unsigned short) std::strtoul(argv[4], (char **) 0, 10);

    std::vector<char> nucleoni;
    for (short i = 5; i < 5 + A; ++i) {
        nucleoni.push_back(*argv[i]);
    }

    unsigned short S_tot = (unsigned short) std::strtoul(argv[5 + A], (char **) 0, 10);
    bool controllo = (bool) std::strtoul(argv[6 + A], (char **) 0, 10);

    bool modo_cluster = false;
    if (A == 4) {
        beta_riferimento = 2.;
        if (argc == 8 + A) {
            modo_cluster = (bool) std::strtoul(argv[7 + A], (char **) 0, 10); // set to 1 to use the H mode for the 4-body case
        }
    }

    Init_Corpi_LS Corp(nucleoni, controllo);

    std::string Nome_dir_vec = "lista_vettori";
    int crea_dir = mkdir(Nome_dir_vec.c_str(), 0777);

    init_main_0(A, m_max, K_min, K_max, S_tot, Corp, modo_cluster);
    return 0;
}
void matmul_init()
{
#ifdef WITH_MKL
    mkl_set_dynamic(1);
#endif
    // omp_set_dynamic(1);
}
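// A minimal sketch (not part of the snippets above) of the save/restore pattern
// around mkl_set_dynamic that the Evaluate() routines below use: query the
// current MKL threading state, force a single-threaded section, then put the
// old settings back. All calls are standard MKL service functions; the function
// name run_single_threaded_section is a hypothetical example.
#include <mkl.h>

void run_single_threaded_section()
{
    int saved_threads = mkl_get_max_threads();
    int saved_dynamic = mkl_get_dynamic();

    mkl_set_num_threads(1);  // BLAS calls in this section use one thread
    mkl_set_dynamic(0);      // and MKL may not silently change that

    /* ... small BLAS calls that are faster without threading overhead ... */

    mkl_set_num_threads(saved_threads);
    mkl_set_dynamic(saved_dynamic);
}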
void StVKReducedStiffnessMatrix::Evaluate(double * q, double * Rq)
{
  // this is the same as EvaluateSubset with start=0, end=quadraticSize
  /*
  int i,j,k;
  int output;

  // reset to free terms
  int index = 0;
  int indexEntry = 0;
  for(output=0; output<r; output++)
  {
    for(i=output; i<r; i++)
    {
      Rq[indexEntry] = freeCoef_[index];
      index++;
      indexEntry++;
    }
    indexEntry += output + 1;
  }

  // add linear terms
  index = 0;
  indexEntry = 0;
  for(output=0; output<r; output++)
  {
    for(i=output; i<r; i++)
    {
      for(j=0; j<r; j++)
      {
        Rq[indexEntry] += linearCoef_[index] * q[j];
        index++;
      }
      indexEntry++;
    }
    indexEntry += output + 1;
  }

  // add quadratic terms
  index = 0;
  indexEntry = 0;
  for(output=0; output<r; output++)
  {
    for(i=output; i<r; i++)
    {
      for(j=0; j<r; j++)
        for(k=j; k<r; k++)
        {
          Rq[indexEntry] += quadraticCoef_[index] * q[j] * q[k];
          index++;
        }
      indexEntry++;
    }
    indexEntry += output + 1;
  }

  // make symmetric
  for(output=0; output<r; output++)
    for(i=0; i<output; i++)
      Rq[ELT(r,i,output)] = Rq[ELT(r,output,i)];
  */

  if (useSingleThread)
  {
    #if defined(WIN32) || defined(linux)
      mkl_max_threads = mkl_get_max_threads();
      mkl_dynamic = mkl_get_dynamic();
      mkl_set_num_threads(1);
      mkl_set_dynamic(0);
    #elif defined(__APPLE__)
      //setenv("VECLIB_MAXIMUM_THREADS", "1", true);
    #endif
  }

  // reset to free terms
  memcpy(buffer1, freeCoef_, sizeof(double) * quadraticSize);

  // add linear terms
  // multiply linearCoef_ and q
  // linearCoef_ is r x quadraticSize array
  cblas_dgemv(CblasColMajor, CblasTrans, r, quadraticSize, 1.0, linearCoef_, r, q, 1, 1.0, buffer1, 1);

  // compute qiqj
  int index = 0;
  for(int output=0; output<r; output++)
    for(int i=output; i<r; i++)
    {
      qiqj[index] = q[output] * q[i];
      index++;
    }

  // update Rq
  // quadraticCoef_ is quadraticSize x quadraticSize matrix
  // each column gives quadratic coef for one matrix entry
  cblas_dgemv(CblasColMajor, CblasTrans, quadraticSize, quadraticSize, 1.0, quadraticCoef_, quadraticSize, qiqj, 1, 1.0, buffer1, 1);

  // unpack into a symmetric matrix
  int i1 = 0, j1 = 0;
  for(int i=0; i<quadraticSize; i++)
  {
    Rq[ELT(r,i1,j1)] = buffer1[i];
    Rq[ELT(r,j1,i1)] = buffer1[i];
    j1++;
    if (j1 == r)
    {
      i1++;
      j1 = i1;
    }
  }

  if (useSingleThread)
  {
    #if defined(WIN32) || defined(linux)
      mkl_set_num_threads(mkl_max_threads);
      mkl_set_dynamic(mkl_dynamic);
    #elif defined(__APPLE__)
      //unsetenv("VECLIB_MAXIMUM_THREADS");
    #endif
  }
}
void StVKReducedInternalForces::Evaluate(double * q, double * fq)
{
  /*
  // unoptimized version

  // reset to zero
  int i,j,k,l;
  for(l=0; l<r; l++)
    fq[l] = 0;

  // add linear terms
  int index = 0;
  for(l=0; l<r; l++)
    for(i=0; i<r; i++)
    {
      fq[l] += linearCoef_[index] * q[i];
      index++;
    }

  // add quadratic terms
  index = 0;
  for(l=0; l<r; l++)
    for(i=0; i<r; i++)
      for(j=i; j<r; j++)
      {
        fq[l] += quadraticCoef_[index] * q[i] * q[j];
        index++;
      }

  // add cubic terms
  index = 0;
  for(l=0; l<r; l++)
    for(i=0; i<r; i++)
      for(j=i; j<r; j++)
        for(k=j; k<r; k++)
        {
          fq[l] += cubicCoef_[index] * q[i] * q[j] * q[k];
          index++;
        }
  */

  if (useSingleThread)
  {
    #if defined(_WIN32) || defined(WIN32) || defined(linux)
      mkl_max_threads = mkl_get_max_threads();
      mkl_dynamic = mkl_get_dynamic();
      mkl_set_num_threads(1);
      mkl_set_dynamic(0);
    #elif defined(__APPLE__)
      //setenv("VECLIB_MAXIMUM_THREADS", "1", true);
    #endif
  }

  // add linear terms
  // multiply linearCoef_ and q
  // linearCoef_ is r x r array
  cblas_dgemv(CblasColMajor, CblasTrans, r, r, 1.0, linearCoef_, r, q, 1, 0.0, fq, 1);

  // compute qiqj
  int index = 0;
  for(int output=0; output<r; output++)
    for(int i=output; i<r; i++)
    {
      qiqj[index] = q[output] * q[i];
      index++;
    }

  // add quadratic terms
  // quadraticCoef_ is quadraticSize x r matrix
  // each column gives quadratic coef for one force vector component
  cblas_dgemv(CblasColMajor, CblasTrans, quadraticSize, r, 1.0, quadraticCoef_, quadraticSize, qiqj, 1, 1.0, fq, 1);

  // add cubic terms
  // cubicCoef_ is cubicSize x r matrix
  // each column gives cubicSize coef for one force vector component
  int size = quadraticSize;
  double * qiqjPos = qiqj;
  double * cubicCoefPos = cubicCoef_;
  for(int i=0; i<r; i++)
  {
    cblas_dgemv(CblasColMajor, CblasTrans, size, r, q[i], cubicCoefPos, cubicSize, qiqjPos, 1, 1.0, fq, 1);
    int param = r - i;
    size -= param;
    qiqjPos += param;
    cubicCoefPos += param * (param + 1) / 2;
  }

  if (addGravity)
  {
    for(int i=0; i<r; i++)
      fq[i] -= reducedGravityForce[i];
  }

  if (useSingleThread)
  {
    #if defined(_WIN32) || defined(WIN32) || defined(linux)
      mkl_set_num_threads(mkl_max_threads);
      mkl_set_dynamic(mkl_dynamic);
    #elif defined(__APPLE__)
      //unsetenv("VECLIB_MAXIMUM_THREADS");
    #endif
  }
}
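// A small standalone sketch (illustrative only, not taken from the library
// above) of the packed product vector qiqj that both Evaluate() routines build
// before their cblas_dgemv calls: all products q[i]*q[j] with i <= j, stored in
// the same row-by-row order the reduced coefficient matrices expect. For r
// modes the packed length is r*(r+1)/2. The function name qiqj_demo and the
// sample values of q are assumptions for the demo.
#include <vector>
#include <cstdio>

int qiqj_demo()
{
    const int r = 4;
    std::vector<double> q = {1.0, 2.0, 3.0, 4.0};
    std::vector<double> qiqj(r * (r + 1) / 2);

    int index = 0;
    for (int output = 0; output < r; output++)
        for (int i = output; i < r; i++)
            qiqj[index++] = q[output] * q[i];

    printf("packed length = %d (expected %d)\n", index, r * (r + 1) / 2);
    return 0;
}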
double FastMatmul(Matrix<Scalar>& A, Matrix<Scalar>& B, Matrix<Scalar>& C,
                  int num_steps, double x=1e-8,
                  Scalar alpha=Scalar(1.0), Scalar beta=Scalar(0.0))
{
    MemoryManager<Scalar> mem_mngr;
#ifdef _PARALLEL_
    mem_mngr.Allocate(2, 2, 2, 8, num_steps, A.m(), A.n(), B.n());
#endif
    A.set_multiplier(alpha);
    int num_multiplies_per_step = 8;
    int total_multiplies = pow(num_multiplies_per_step, num_steps);

    // Set parameters needed for all types of parallelism.
    int num_threads = 0;
#ifdef _PARALLEL_
# pragma omp parallel
    {
        if (omp_get_thread_num() == 0) {
            num_threads = omp_get_num_threads();
        }
    }
    omp_set_nested(1);
#endif

#if defined(_PARALLEL_) && (_PARALLEL_ == _BFS_PAR_)
# pragma omp parallel
    {
        mkl_set_num_threads_local(1);
        mkl_set_dynamic(0);
    }
#endif

#if defined(_PARALLEL_) && (_PARALLEL_ == _DFS_PAR_)
    mkl_set_dynamic(0);
#endif

#if defined(_PARALLEL_) && (_PARALLEL_ == _HYBRID_PAR_)
    if (num_threads > total_multiplies) {
        mkl_set_dynamic(0);
    } else {
# pragma omp parallel
        {
            mkl_set_num_threads_local(1);
            mkl_set_dynamic(0);
        }
    }
#endif

    LockAndCounter locker(total_multiplies - (total_multiplies % num_threads));
    using FpMilliseconds = std::chrono::duration<float, std::chrono::milliseconds::period>;
    auto t1 = std::chrono::high_resolution_clock::now();

#ifdef _PARALLEL_
# pragma omp parallel
    {
# pragma omp single
#endif
        FastMatmulRecursive(locker, mem_mngr, A, B, C, num_steps, num_steps, 0, x, num_threads, beta);
#ifdef _PARALLEL_
    }
#endif
    auto t2 = std::chrono::high_resolution_clock::now();
    return FpMilliseconds(t2 - t1).count();
}
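// A freestanding sketch (illustrative assumptions, not part of the driver
// above) of the per-thread MKL setup that FastMatmul uses under BFS
// parallelism: inside an OpenMP parallel region each thread pins its own MKL
// calls to one thread with mkl_set_num_threads_local and disables dynamic
// adjustment, so the OpenMP tasks, not MKL, own the parallelism. The matrix
// size n = 64 and the function name per_thread_mkl_demo are made up for the
// demo.
#include <vector>
#include <cstdio>
#include <omp.h>
#include <mkl.h>

int per_thread_mkl_demo()
{
    const int n = 64;
    #pragma omp parallel
    {
        mkl_set_num_threads_local(1);  // thread-local setting, affects only this thread
        mkl_set_dynamic(0);

        // Each OpenMP thread runs its own sequential dgemm.
        std::vector<double> A(n * n, 1.0), B(n * n, 1.0), C(n * n, 0.0);
        cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, n, n, n,
                    1.0, A.data(), n, B.data(), n, 0.0, C.data(), n);

        #pragma omp critical
        printf("thread %d: C[0] = %f\n", omp_get_thread_num(), C[0]);  // expect 64.0
    }
    return 0;
}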
void FastMatmulRecursive(LockAndCounter& locker, MemoryManager<Scalar>& mem_mngr,
                         Matrix<Scalar>& A, Matrix<Scalar>& B, Matrix<Scalar>& C,
                         int total_steps, int steps_left, int start_index,
                         double x, int num_threads, Scalar beta)
{
    // Update multipliers
    C.UpdateMultiplier(A.multiplier());
    C.UpdateMultiplier(B.multiplier());
    A.set_multiplier(Scalar(1.0));
    B.set_multiplier(Scalar(1.0));

    // Base case for recursion
    if (steps_left == 0) {
        MatMul(A, B, C);
        return;
    }

    Matrix<Scalar> A11 = A.Subblock(2, 2, 1, 1);
    Matrix<Scalar> A12 = A.Subblock(2, 2, 1, 2);
    Matrix<Scalar> A21 = A.Subblock(2, 2, 2, 1);
    Matrix<Scalar> A22 = A.Subblock(2, 2, 2, 2);
    Matrix<Scalar> B11 = B.Subblock(2, 2, 1, 1);
    Matrix<Scalar> B12 = B.Subblock(2, 2, 1, 2);
    Matrix<Scalar> B21 = B.Subblock(2, 2, 2, 1);
    Matrix<Scalar> B22 = B.Subblock(2, 2, 2, 2);
    Matrix<Scalar> C11 = C.Subblock(2, 2, 1, 1);
    Matrix<Scalar> C12 = C.Subblock(2, 2, 1, 2);
    Matrix<Scalar> C21 = C.Subblock(2, 2, 2, 1);
    Matrix<Scalar> C22 = C.Subblock(2, 2, 2, 2);

    // Matrices to store the results of multiplications.
#ifdef _PARALLEL_
    Matrix<Scalar> M1(mem_mngr.GetMem(start_index, 1, total_steps - steps_left, M), C11.m(), C11.m(), C11.n(), C.multiplier());
    Matrix<Scalar> M2(mem_mngr.GetMem(start_index, 2, total_steps - steps_left, M), C11.m(), C11.m(), C11.n(), C.multiplier());
    Matrix<Scalar> M3(mem_mngr.GetMem(start_index, 3, total_steps - steps_left, M), C11.m(), C11.m(), C11.n(), C.multiplier());
    Matrix<Scalar> M4(mem_mngr.GetMem(start_index, 4, total_steps - steps_left, M), C11.m(), C11.m(), C11.n(), C.multiplier());
    Matrix<Scalar> M5(mem_mngr.GetMem(start_index, 5, total_steps - steps_left, M), C11.m(), C11.m(), C11.n(), C.multiplier());
    Matrix<Scalar> M6(mem_mngr.GetMem(start_index, 6, total_steps - steps_left, M), C11.m(), C11.m(), C11.n(), C.multiplier());
    Matrix<Scalar> M7(mem_mngr.GetMem(start_index, 7, total_steps - steps_left, M), C11.m(), C11.m(), C11.n(), C.multiplier());
    Matrix<Scalar> M8(mem_mngr.GetMem(start_index, 8, total_steps - steps_left, M), C11.m(), C11.m(), C11.n(), C.multiplier());
#else
    Matrix<Scalar> M1(C11.m(), C11.n(), C.multiplier());
    Matrix<Scalar> M2(C11.m(), C11.n(), C.multiplier());
    Matrix<Scalar> M3(C11.m(), C11.n(), C.multiplier());
    Matrix<Scalar> M4(C11.m(), C11.n(), C.multiplier());
    Matrix<Scalar> M5(C11.m(), C11.n(), C.multiplier());
    Matrix<Scalar> M6(C11.m(), C11.n(), C.multiplier());
    Matrix<Scalar> M7(C11.m(), C11.n(), C.multiplier());
    Matrix<Scalar> M8(C11.m(), C11.n(), C.multiplier());
#endif

#if defined(_PARALLEL_) && (_PARALLEL_ == _BFS_PAR_ || _PARALLEL_ == _HYBRID_PAR_)
    bool sequential1 = should_launch_task(8, total_steps, steps_left, start_index, 1, num_threads);
    bool sequential2 = should_launch_task(8, total_steps, steps_left, start_index, 2, num_threads);
    bool sequential3 = should_launch_task(8, total_steps, steps_left, start_index, 3, num_threads);
    bool sequential4 = should_launch_task(8, total_steps, steps_left, start_index, 4, num_threads);
    bool sequential5 = should_launch_task(8, total_steps, steps_left, start_index, 5, num_threads);
    bool sequential6 = should_launch_task(8, total_steps, steps_left, start_index, 6, num_threads);
    bool sequential7 = should_launch_task(8, total_steps, steps_left, start_index, 7, num_threads);
    bool sequential8 = should_launch_task(8, total_steps, steps_left, start_index, 8, num_threads);
#else
    bool sequential1 = false;
    bool sequential2 = false;
    bool sequential3 = false;
    bool sequential4 = false;
    bool sequential5 = false;
    bool sequential6 = false;
    bool sequential7 = false;
    bool sequential8 = false;
#endif

    // M1 = (1 * A11) * (1 * B11)
#if defined(_PARALLEL_) && (_PARALLEL_ == _BFS_PAR_ || _PARALLEL_ == _HYBRID_PAR_)
# pragma omp task if(sequential1) shared(mem_mngr, locker) untied
    {
#endif
        M1.UpdateMultiplier(Scalar(1));
        M1.UpdateMultiplier(Scalar(1));
        FastMatmulRecursive(locker, mem_mngr, A11, B11, M1, total_steps, steps_left - 1, (start_index + 1 - 1) * 8, x, num_threads, Scalar(0.0));
#ifndef _PARALLEL_
#endif
#if defined(_PARALLEL_) && (_PARALLEL_ == _BFS_PAR_ || _PARALLEL_ == _HYBRID_PAR_)
        locker.Decrement();
    }
    if (should_task_wait(8, total_steps, steps_left, start_index, 1, num_threads)) {
# pragma omp taskwait
# if defined(_PARALLEL_) && (_PARALLEL_ == _HYBRID_PAR_)
        SwitchToDFS(locker, num_threads);
# endif
    }
#endif

    // M2 = (1 * A12) * (1 * B21)
#if defined(_PARALLEL_) && (_PARALLEL_ == _BFS_PAR_ || _PARALLEL_ == _HYBRID_PAR_)
# pragma omp task if(sequential2) shared(mem_mngr, locker) untied
    {
#endif
        M2.UpdateMultiplier(Scalar(1));
        M2.UpdateMultiplier(Scalar(1));
        FastMatmulRecursive(locker, mem_mngr, A12, B21, M2, total_steps, steps_left - 1, (start_index + 2 - 1) * 8, x, num_threads, Scalar(0.0));
#ifndef _PARALLEL_
#endif
#if defined(_PARALLEL_) && (_PARALLEL_ == _BFS_PAR_ || _PARALLEL_ == _HYBRID_PAR_)
        locker.Decrement();
    }
    if (should_task_wait(8, total_steps, steps_left, start_index, 2, num_threads)) {
# pragma omp taskwait
# if defined(_PARALLEL_) && (_PARALLEL_ == _HYBRID_PAR_)
        SwitchToDFS(locker, num_threads);
# endif
    }
#endif

    // M3 = (1 * A11) * (1 * B12)
#if defined(_PARALLEL_) && (_PARALLEL_ == _BFS_PAR_ || _PARALLEL_ == _HYBRID_PAR_)
# pragma omp task if(sequential3) shared(mem_mngr, locker) untied
    {
#endif
        M3.UpdateMultiplier(Scalar(1));
        M3.UpdateMultiplier(Scalar(1));
        FastMatmulRecursive(locker, mem_mngr, A11, B12, M3, total_steps, steps_left - 1, (start_index + 3 - 1) * 8, x, num_threads, Scalar(0.0));
#ifndef _PARALLEL_
#endif
#if defined(_PARALLEL_) && (_PARALLEL_ == _BFS_PAR_ || _PARALLEL_ == _HYBRID_PAR_)
        locker.Decrement();
    }
    if (should_task_wait(8, total_steps, steps_left, start_index, 3, num_threads)) {
# pragma omp taskwait
# if defined(_PARALLEL_) && (_PARALLEL_ == _HYBRID_PAR_)
        SwitchToDFS(locker, num_threads);
# endif
    }
#endif

    // M4 = (1 * A12) * (1 * B22)
#if defined(_PARALLEL_) && (_PARALLEL_ == _BFS_PAR_ || _PARALLEL_ == _HYBRID_PAR_)
# pragma omp task if(sequential4) shared(mem_mngr, locker) untied
    {
#endif
        M4.UpdateMultiplier(Scalar(1));
        M4.UpdateMultiplier(Scalar(1));
        FastMatmulRecursive(locker, mem_mngr, A12, B22, M4, total_steps, steps_left - 1, (start_index + 4 - 1) * 8, x, num_threads, Scalar(0.0));
#ifndef _PARALLEL_
#endif
#if defined(_PARALLEL_) && (_PARALLEL_ == _BFS_PAR_ || _PARALLEL_ == _HYBRID_PAR_)
        locker.Decrement();
    }
    if (should_task_wait(8, total_steps, steps_left, start_index, 4, num_threads)) {
# pragma omp taskwait
# if defined(_PARALLEL_) && (_PARALLEL_ == _HYBRID_PAR_)
        SwitchToDFS(locker, num_threads);
# endif
    }
#endif

    // M5 = (1 * A21) * (1 * B11)
#if defined(_PARALLEL_) && (_PARALLEL_ == _BFS_PAR_ || _PARALLEL_ == _HYBRID_PAR_)
# pragma omp task if(sequential5) shared(mem_mngr, locker) untied
    {
#endif
        M5.UpdateMultiplier(Scalar(1));
        M5.UpdateMultiplier(Scalar(1));
        FastMatmulRecursive(locker, mem_mngr, A21, B11, M5, total_steps, steps_left - 1, (start_index + 5 - 1) * 8, x, num_threads, Scalar(0.0));
#ifndef _PARALLEL_
#endif
#if defined(_PARALLEL_) && (_PARALLEL_ == _BFS_PAR_ || _PARALLEL_ == _HYBRID_PAR_)
        locker.Decrement();
    }
    if (should_task_wait(8, total_steps, steps_left, start_index, 5, num_threads)) {
# pragma omp taskwait
# if defined(_PARALLEL_) && (_PARALLEL_ == _HYBRID_PAR_)
        SwitchToDFS(locker, num_threads);
# endif
    }
#endif

    // M6 = (1 * A22) * (1 * B21)
#if defined(_PARALLEL_) && (_PARALLEL_ == _BFS_PAR_ || _PARALLEL_ == _HYBRID_PAR_)
# pragma omp task if(sequential6) shared(mem_mngr, locker) untied
    {
#endif
        M6.UpdateMultiplier(Scalar(1));
        M6.UpdateMultiplier(Scalar(1));
        FastMatmulRecursive(locker, mem_mngr, A22, B21, M6, total_steps, steps_left - 1, (start_index + 6 - 1) * 8, x, num_threads, Scalar(0.0));
#ifndef _PARALLEL_
#endif
#if defined(_PARALLEL_) && (_PARALLEL_ == _BFS_PAR_ || _PARALLEL_ == _HYBRID_PAR_)
        locker.Decrement();
    }
    if (should_task_wait(8, total_steps, steps_left, start_index, 6, num_threads)) {
# pragma omp taskwait
# if defined(_PARALLEL_) && (_PARALLEL_ == _HYBRID_PAR_)
        SwitchToDFS(locker, num_threads);
# endif
    }
#endif

    // M7 = (1 * A21) * (1 * B12)
#if defined(_PARALLEL_) && (_PARALLEL_ == _BFS_PAR_ || _PARALLEL_ == _HYBRID_PAR_)
# pragma omp task if(sequential7) shared(mem_mngr, locker) untied
    {
#endif
        M7.UpdateMultiplier(Scalar(1));
        M7.UpdateMultiplier(Scalar(1));
        FastMatmulRecursive(locker, mem_mngr, A21, B12, M7, total_steps, steps_left - 1, (start_index + 7 - 1) * 8, x, num_threads, Scalar(0.0));
#ifndef _PARALLEL_
#endif
#if defined(_PARALLEL_) && (_PARALLEL_ == _BFS_PAR_ || _PARALLEL_ == _HYBRID_PAR_)
        locker.Decrement();
    }
    if (should_task_wait(8, total_steps, steps_left, start_index, 7, num_threads)) {
# pragma omp taskwait
# if defined(_PARALLEL_) && (_PARALLEL_ == _HYBRID_PAR_)
        SwitchToDFS(locker, num_threads);
# endif
    }
#endif

    // M8 = (1 * A22) * (1 * B22)
#if defined(_PARALLEL_) && (_PARALLEL_ == _BFS_PAR_ || _PARALLEL_ == _HYBRID_PAR_)
# pragma omp task if(sequential8) shared(mem_mngr, locker) untied
    {
#endif
        M8.UpdateMultiplier(Scalar(1));
        M8.UpdateMultiplier(Scalar(1));
        FastMatmulRecursive(locker, mem_mngr, A22, B22, M8, total_steps, steps_left - 1, (start_index + 8 - 1) * 8, x, num_threads, Scalar(0.0));
#ifndef _PARALLEL_
#endif
#if defined(_PARALLEL_) && (_PARALLEL_ == _BFS_PAR_ || _PARALLEL_ == _HYBRID_PAR_)
        locker.Decrement();
    }
    if (should_task_wait(8, total_steps, steps_left, start_index, 8, num_threads)) {
# pragma omp taskwait
    }
#endif

    M_Add1(M1, M2, C11, x, false, beta);
    M_Add2(M3, M4, C12, x, false, beta);
    M_Add3(M5, M6, C21, x, false, beta);
    M_Add4(M7, M8, C22, x, false, beta);

    // Handle edge cases with dynamic peeling
#if defined(_PARALLEL_) && (_PARALLEL_ == _BFS_PAR_ || _PARALLEL_ == _HYBRID_PAR_)
    if (total_steps == steps_left) {
        mkl_set_num_threads_local(num_threads);
        mkl_set_dynamic(0);
    }
#endif
    DynamicPeeling(A, B, C, 2, 2, 2, beta);
}