double* ttm(double *X, const int Dim, long int *XDimSize,
            double *U, long int *UDimSize, char tflag,
            double *Y, long int *YDimSize,
            int ModeComn, int *ModeCom, int ModeComLen)
{
  // Decide transa and transb according to tflag and the relationship
  // between ModeComn and ModeCom.
  CBLAS_TRANSPOSE transa = CblasNoTrans;
  CBLAS_TRANSPOSE transb = CblasNoTrans;
  long int m, n, k;
  double alpha, beta;
  long int rsu, csu, rsx, csx, rsy, csy;
  long int XLoopStride = 0, YLoopStride = 0;
  alpha = 1.0;
  beta = 1.0;

  /**** Generated TTM algorithm for these particular modes and dimensions ****/
  m = 10; n = 1000; k = 1000;
  rsu = 1000; csu = 1;
  rsx = 1000; csx = 1;
  rsy = 1000; csy = 1;

  #pragma omp parallel for default(shared) schedule(static) num_threads(1)
  for (long int i0 = 0; i0 < 1000; i0++) {
    // Each OpenMP thread runs its own 8-way-threaded MKL GEMM.
    mkl_set_dynamic(0);
    mkl_set_num_threads_local(8);
    XLoopStride = i0 * 1000000;
    YLoopStride = i0 * 10000;
    // Y[i0] (10 x 1000) = U (10 x 1000) * X[i0] (1000 x 1000), accumulated (beta = 1).
    cblas_dgemm(CblasRowMajor, transa, transb, m, n, k, alpha,
                U, rsu, X + XLoopStride, rsx, beta, Y + YLoopStride, rsy);
  }
  return Y;
}
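The kernel above leans on MKL's thread-local controls to nest an MKL team inside each OpenMP thread. Below is a minimal, self-contained sketch of that same OpenMP-outside / MKL-inside pattern; the dimensions, thread counts, and buffer names are illustrative and not taken from the generated routine above.

#include <mkl.h>
#include <omp.h>
#include <vector>

// Multiply each 2-D slice of a 3-D tensor X by a factor matrix U,
// giving the slices of Y: Y[i] = U * X[i].  Outer parallelism comes
// from OpenMP; each OpenMP thread restricts its own MKL team size with
// the thread-local setting so the two levels do not oversubscribe.
int main() {
  const long slices = 8, m = 4, k = 16, n = 16;   // illustrative sizes
  std::vector<double> X(slices * k * n, 1.0), U(m * k, 1.0), Y(slices * m * n, 0.0);

  mkl_set_dynamic(0);                 // keep MKL from resizing teams on its own
  #pragma omp parallel for schedule(static)
  for (long i = 0; i < slices; ++i) {
    mkl_set_num_threads_local(1);     // one MKL thread per OpenMP thread (thread-local)
    cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
                m, n, k, 1.0,
                U.data(), k,
                X.data() + i * k * n, n,
                0.0,
                Y.data() + i * m * n, n);
  }
  return 0;
}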
double FastMatmul(Matrix<Scalar>& A, Matrix<Scalar>& B, Matrix<Scalar>& C,
                  int num_steps, double x=1e-8,
                  Scalar alpha=Scalar(1.0), Scalar beta=Scalar(0.0)) {
  MemoryManager<Scalar> mem_mngr;
#ifdef _PARALLEL_
  mem_mngr.Allocate(2, 2, 2, 8, num_steps, A.m(), A.n(), B.n());
#endif
  A.set_multiplier(alpha);
  int num_multiplies_per_step = 8;
  int total_multiplies = pow(num_multiplies_per_step, num_steps);

  // Set parameters needed for all types of parallelism.
  int num_threads = 0;
#ifdef _PARALLEL_
# pragma omp parallel
  {
    if (omp_get_thread_num() == 0) { num_threads = omp_get_num_threads(); }
  }
  omp_set_nested(1);
#endif

#if defined(_PARALLEL_) && (_PARALLEL_ == _BFS_PAR_)
# pragma omp parallel
  {
    mkl_set_num_threads_local(1);
    mkl_set_dynamic(0);
  }
#endif

#if defined(_PARALLEL_) && (_PARALLEL_ == _DFS_PAR_)
  mkl_set_dynamic(0);
#endif

#if defined(_PARALLEL_) && (_PARALLEL_ == _HYBRID_PAR_)
  if (num_threads > total_multiplies) {
    mkl_set_dynamic(0);
  } else {
# pragma omp parallel
    {
      mkl_set_num_threads_local(1);
      mkl_set_dynamic(0);
    }
  }
#endif

  LockAndCounter locker(total_multiplies - (total_multiplies % num_threads));
  using FpMilliseconds = std::chrono::duration<float, std::chrono::milliseconds::period>;
  auto t1 = std::chrono::high_resolution_clock::now();

#ifdef _PARALLEL_
# pragma omp parallel
  {
# pragma omp single
#endif
    FastMatmulRecursive(locker, mem_mngr, A, B, C, num_steps, num_steps, 0, x,
                        num_threads, beta);
#ifdef _PARALLEL_
  }
#endif
  auto t2 = std::chrono::high_resolution_clock::now();
  return FpMilliseconds(t2 - t1).count();
}
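FastMatmul's setup boils down to: measure the OpenMP team size, enable nested parallelism, then either give MKL all the threads (DFS) or pin each task worker to a single MKL thread (BFS/hybrid). A condensed standalone sketch of just that configuration logic follows; the use_bfs switch stands in for the _PARALLEL_ macro machinery and is not part of the library.

#include <mkl.h>
#include <omp.h>
#include <cstdio>

int main() {
  // Measure the OpenMP team size the same way FastMatmul does.
  int num_threads = 0;
  #pragma omp parallel
  {
    if (omp_get_thread_num() == 0) num_threads = omp_get_num_threads();
  }
  omp_set_nested(1);              // allow OpenMP tasks to create nested teams

  const bool use_bfs = true;      // illustrative switch between BFS and DFS parallelism
  mkl_set_dynamic(0);             // MKL must not resize teams behind our back
  if (use_bfs) {
    // BFS/hybrid: task-level parallelism outside, sequential MKL inside each worker.
    #pragma omp parallel
    { mkl_set_num_threads_local(1); }
  }
  // DFS (use_bfs == false): leave MKL's global thread count alone so each
  // leaf GEMM uses the whole machine.

  std::printf("OpenMP team size: %d\n", num_threads);
  return 0;
}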
void TaskManager :: Loop(int thd)
{
  /*
  static Timer tADD("add entry counter");
  static Timer tCASready1("spin-CAS ready tick1");
  static Timer tCASready2("spin-CAS ready tick2");
  static Timer tCASyield("spin-CAS yield");
  static Timer tCAS1("spin-CAS wait");
  static Timer texit("exit zone");
  static Timer tdec("decrement");
  */
  thread_id = thd;

  int thds = GetNumThreads();
  int mynode = num_nodes * thd/thds;

  NodeData & mynode_data = *(nodedata[mynode]);

  TaskInfo ti;
  ti.nthreads = thds;
  ti.thread_nr = thd;
  // ti.nnodes = num_nodes;
  // ti.node_nr = mynode;

#ifdef USE_NUMA
  numa_run_on_node (mynode);
#endif

  active_workers++;
  workers_on_node[mynode]++;
  int jobdone = 0;

#ifdef USE_MKL
  auto mkl_max = mkl_get_max_threads();
  mkl_set_num_threads_local(1);
#endif

  while (!done)
    {
      if (complete[mynode] > jobdone)
        jobdone = complete[mynode];

      if (jobnr == jobdone)
        {
          // RegionTracer t(ti.thread_nr, tCASyield, ti.task_nr);
          if (sleep)
            this_thread::sleep_for(chrono::microseconds(sleep_usecs));
          else
            {
#ifdef WIN32
              this_thread::yield();
#else  // WIN32
              sched_yield();
#endif // WIN32
            }
          continue;
        }

      {
        // RegionTracer t(ti.thread_nr, tADD, ti.task_nr);

        // non-atomic fast check ...
        if ( (mynode_data.participate & 1) == 0) continue;

        int oldval = mynode_data.participate += 2;
        if ( (oldval & 1) == 0)
          { // job not active, going out again
            mynode_data.participate -= 2;
            continue;
          }
      }

      if (startup_function) (*startup_function)();

      IntRange mytasks = Range(int(ntasks)).Split (mynode, num_nodes);

      try
        {
          while (1)
            {
              if (mynode_data.start_cnt >= mytasks.Size()) break;
              int mytask = mynode_data.start_cnt.fetch_add(1, memory_order_relaxed);
              if (mytask >= mytasks.Size()) break;

              ti.task_nr = mytasks.First()+mytask;
              ti.ntasks = ntasks;

              {
                RegionTracer t(ti.thread_nr, jobnr, RegionTracer::ID_JOB, ti.task_nr);
                (*func)(ti);
              }
            }
        }
      catch (Exception e)
        {
          {
            // cout << "got exception in TM" << endl;
            lock_guard<mutex> guard(copyex_mutex);
            delete ex;
            ex = new Exception (e);
            mynode_data.start_cnt = mytasks.Size();
          }
        }

#ifndef __MIC__
      atomic_thread_fence (memory_order_release);
#endif // __MIC__

      if (cleanup_function) (*cleanup_function)();

      jobdone = jobnr;
      mynode_data.participate-=2;

      {
        int oldpart = 1;
        if (mynode_data.participate.compare_exchange_strong (oldpart, 0))
          {
            if (jobdone < jobnr.load())
              { // reopen gate
                mynode_data.participate |= 1;
              }
            else
              {
                if (mynode != 0)
                  mynode_data.start_cnt = 0;
                complete[mynode] = jobnr.load();
              }
          }
      }
    }

#ifdef USE_MKL
  mkl_set_num_threads_local(mkl_max);
#endif

  workers_on_node[mynode]--;
  active_workers--;
}
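The USE_MKL bracket in Loop saves the global MKL thread count, forces the worker to sequential MKL for its lifetime, and restores the setting on exit. Below is a stripped-down sketch of that save/set/restore pattern with plain std::thread workers; the worker body is a placeholder, not NGSolve code.

#include <mkl.h>
#include <thread>
#include <vector>
#include <cstdio>

// Worker threads that call MKL from inside their own tasks usually want
// sequential MKL, otherwise every task would try to spawn a full MKL team.
void worker(int id) {
  int saved = mkl_get_max_threads();   // global default before any change
  mkl_set_num_threads_local(1);        // affects only this thread

  // ... run tasks that may call MKL kernels sequentially ...
  std::printf("worker %d running with 1 MKL thread (global default %d)\n", id, saved);

  mkl_set_num_threads_local(saved);    // restore; passing 0 would also fall back to the global value
}

int main() {
  std::vector<std::thread> pool;
  for (int i = 0; i < 4; ++i) pool.emplace_back(worker, i);
  for (auto& t : pool) t.join();
  return 0;
}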
void FastMatmulRecursive(LockAndCounter& locker, MemoryManager<Scalar>& mem_mngr,
                         Matrix<Scalar>& A, Matrix<Scalar>& B, Matrix<Scalar>& C,
                         int total_steps, int steps_left, int start_index,
                         double x, int num_threads, Scalar beta) {
  // Update multipliers
  C.UpdateMultiplier(A.multiplier());
  C.UpdateMultiplier(B.multiplier());
  A.set_multiplier(Scalar(1.0));
  B.set_multiplier(Scalar(1.0));

  // Base case for recursion
  if (steps_left == 0) {
    MatMul(A, B, C);
    return;
  }

  Matrix<Scalar> A11 = A.Subblock(2, 2, 1, 1);
  Matrix<Scalar> A12 = A.Subblock(2, 2, 1, 2);
  Matrix<Scalar> A21 = A.Subblock(2, 2, 2, 1);
  Matrix<Scalar> A22 = A.Subblock(2, 2, 2, 2);
  Matrix<Scalar> B11 = B.Subblock(2, 2, 1, 1);
  Matrix<Scalar> B12 = B.Subblock(2, 2, 1, 2);
  Matrix<Scalar> B21 = B.Subblock(2, 2, 2, 1);
  Matrix<Scalar> B22 = B.Subblock(2, 2, 2, 2);
  Matrix<Scalar> C11 = C.Subblock(2, 2, 1, 1);
  Matrix<Scalar> C12 = C.Subblock(2, 2, 1, 2);
  Matrix<Scalar> C21 = C.Subblock(2, 2, 2, 1);
  Matrix<Scalar> C22 = C.Subblock(2, 2, 2, 2);

  // Matrices to store the results of multiplications.
#ifdef _PARALLEL_
  Matrix<Scalar> M1(mem_mngr.GetMem(start_index, 1, total_steps - steps_left, M), C11.m(), C11.m(), C11.n(), C.multiplier());
  Matrix<Scalar> M2(mem_mngr.GetMem(start_index, 2, total_steps - steps_left, M), C11.m(), C11.m(), C11.n(), C.multiplier());
  Matrix<Scalar> M3(mem_mngr.GetMem(start_index, 3, total_steps - steps_left, M), C11.m(), C11.m(), C11.n(), C.multiplier());
  Matrix<Scalar> M4(mem_mngr.GetMem(start_index, 4, total_steps - steps_left, M), C11.m(), C11.m(), C11.n(), C.multiplier());
  Matrix<Scalar> M5(mem_mngr.GetMem(start_index, 5, total_steps - steps_left, M), C11.m(), C11.m(), C11.n(), C.multiplier());
  Matrix<Scalar> M6(mem_mngr.GetMem(start_index, 6, total_steps - steps_left, M), C11.m(), C11.m(), C11.n(), C.multiplier());
  Matrix<Scalar> M7(mem_mngr.GetMem(start_index, 7, total_steps - steps_left, M), C11.m(), C11.m(), C11.n(), C.multiplier());
  Matrix<Scalar> M8(mem_mngr.GetMem(start_index, 8, total_steps - steps_left, M), C11.m(), C11.m(), C11.n(), C.multiplier());
#else
  Matrix<Scalar> M1(C11.m(), C11.n(), C.multiplier());
  Matrix<Scalar> M2(C11.m(), C11.n(), C.multiplier());
  Matrix<Scalar> M3(C11.m(), C11.n(), C.multiplier());
  Matrix<Scalar> M4(C11.m(), C11.n(), C.multiplier());
  Matrix<Scalar> M5(C11.m(), C11.n(), C.multiplier());
  Matrix<Scalar> M6(C11.m(), C11.n(), C.multiplier());
  Matrix<Scalar> M7(C11.m(), C11.n(), C.multiplier());
  Matrix<Scalar> M8(C11.m(), C11.n(), C.multiplier());
#endif

#if defined(_PARALLEL_) && (_PARALLEL_ == _BFS_PAR_ || _PARALLEL_ == _HYBRID_PAR_)
  bool sequential1 = should_launch_task(8, total_steps, steps_left, start_index, 1, num_threads);
  bool sequential2 = should_launch_task(8, total_steps, steps_left, start_index, 2, num_threads);
  bool sequential3 = should_launch_task(8, total_steps, steps_left, start_index, 3, num_threads);
  bool sequential4 = should_launch_task(8, total_steps, steps_left, start_index, 4, num_threads);
  bool sequential5 = should_launch_task(8, total_steps, steps_left, start_index, 5, num_threads);
  bool sequential6 = should_launch_task(8, total_steps, steps_left, start_index, 6, num_threads);
  bool sequential7 = should_launch_task(8, total_steps, steps_left, start_index, 7, num_threads);
  bool sequential8 = should_launch_task(8, total_steps, steps_left, start_index, 8, num_threads);
#else
  bool sequential1 = false;
  bool sequential2 = false;
  bool sequential3 = false;
  bool sequential4 = false;
  bool sequential5 = false;
  bool sequential6 = false;
  bool sequential7 = false;
  bool sequential8 = false;
#endif

  // M1 = (1 * A11) * (1 * B11)
#if defined(_PARALLEL_) && (_PARALLEL_ == _BFS_PAR_ || _PARALLEL_ == _HYBRID_PAR_)
# pragma omp task if(sequential1) shared(mem_mngr, locker) untied
  {
#endif
  M1.UpdateMultiplier(Scalar(1));
  M1.UpdateMultiplier(Scalar(1));
  FastMatmulRecursive(locker, mem_mngr, A11, B11, M1, total_steps, steps_left - 1, (start_index + 1 - 1) * 8, x, num_threads, Scalar(0.0));
#ifndef _PARALLEL_
#endif
#if defined(_PARALLEL_) && (_PARALLEL_ == _BFS_PAR_ || _PARALLEL_ == _HYBRID_PAR_)
  locker.Decrement();
  }
  if (should_task_wait(8, total_steps, steps_left, start_index, 1, num_threads)) {
# pragma omp taskwait
# if defined(_PARALLEL_) && (_PARALLEL_ == _HYBRID_PAR_)
    SwitchToDFS(locker, num_threads);
# endif
  }
#endif

  // M2 = (1 * A12) * (1 * B21)
#if defined(_PARALLEL_) && (_PARALLEL_ == _BFS_PAR_ || _PARALLEL_ == _HYBRID_PAR_)
# pragma omp task if(sequential2) shared(mem_mngr, locker) untied
  {
#endif
  M2.UpdateMultiplier(Scalar(1));
  M2.UpdateMultiplier(Scalar(1));
  FastMatmulRecursive(locker, mem_mngr, A12, B21, M2, total_steps, steps_left - 1, (start_index + 2 - 1) * 8, x, num_threads, Scalar(0.0));
#ifndef _PARALLEL_
#endif
#if defined(_PARALLEL_) && (_PARALLEL_ == _BFS_PAR_ || _PARALLEL_ == _HYBRID_PAR_)
  locker.Decrement();
  }
  if (should_task_wait(8, total_steps, steps_left, start_index, 2, num_threads)) {
# pragma omp taskwait
# if defined(_PARALLEL_) && (_PARALLEL_ == _HYBRID_PAR_)
    SwitchToDFS(locker, num_threads);
# endif
  }
#endif

  // M3 = (1 * A11) * (1 * B12)
#if defined(_PARALLEL_) && (_PARALLEL_ == _BFS_PAR_ || _PARALLEL_ == _HYBRID_PAR_)
# pragma omp task if(sequential3) shared(mem_mngr, locker) untied
  {
#endif
  M3.UpdateMultiplier(Scalar(1));
  M3.UpdateMultiplier(Scalar(1));
  FastMatmulRecursive(locker, mem_mngr, A11, B12, M3, total_steps, steps_left - 1, (start_index + 3 - 1) * 8, x, num_threads, Scalar(0.0));
#ifndef _PARALLEL_
#endif
#if defined(_PARALLEL_) && (_PARALLEL_ == _BFS_PAR_ || _PARALLEL_ == _HYBRID_PAR_)
  locker.Decrement();
  }
  if (should_task_wait(8, total_steps, steps_left, start_index, 3, num_threads)) {
# pragma omp taskwait
# if defined(_PARALLEL_) && (_PARALLEL_ == _HYBRID_PAR_)
    SwitchToDFS(locker, num_threads);
# endif
  }
#endif

  // M4 = (1 * A12) * (1 * B22)
#if defined(_PARALLEL_) && (_PARALLEL_ == _BFS_PAR_ || _PARALLEL_ == _HYBRID_PAR_)
# pragma omp task if(sequential4) shared(mem_mngr, locker) untied
  {
#endif
  M4.UpdateMultiplier(Scalar(1));
  M4.UpdateMultiplier(Scalar(1));
  FastMatmulRecursive(locker, mem_mngr, A12, B22, M4, total_steps, steps_left - 1, (start_index + 4 - 1) * 8, x, num_threads, Scalar(0.0));
#ifndef _PARALLEL_
#endif
#if defined(_PARALLEL_) && (_PARALLEL_ == _BFS_PAR_ || _PARALLEL_ == _HYBRID_PAR_)
  locker.Decrement();
  }
  if (should_task_wait(8, total_steps, steps_left, start_index, 4, num_threads)) {
# pragma omp taskwait
# if defined(_PARALLEL_) && (_PARALLEL_ == _HYBRID_PAR_)
    SwitchToDFS(locker, num_threads);
# endif
  }
#endif

  // M5 = (1 * A21) * (1 * B11)
#if defined(_PARALLEL_) && (_PARALLEL_ == _BFS_PAR_ || _PARALLEL_ == _HYBRID_PAR_)
# pragma omp task if(sequential5) shared(mem_mngr, locker) untied
  {
#endif
  M5.UpdateMultiplier(Scalar(1));
  M5.UpdateMultiplier(Scalar(1));
  FastMatmulRecursive(locker, mem_mngr, A21, B11, M5, total_steps, steps_left - 1, (start_index + 5 - 1) * 8, x, num_threads, Scalar(0.0));
#ifndef _PARALLEL_
#endif
#if defined(_PARALLEL_) && (_PARALLEL_ == _BFS_PAR_ || _PARALLEL_ == _HYBRID_PAR_)
  locker.Decrement();
  }
  if (should_task_wait(8, total_steps, steps_left, start_index, 5, num_threads)) {
# pragma omp taskwait
# if defined(_PARALLEL_) && (_PARALLEL_ == _HYBRID_PAR_)
    SwitchToDFS(locker, num_threads);
# endif
  }
#endif

  // M6 = (1 * A22) * (1 * B21)
#if defined(_PARALLEL_) && (_PARALLEL_ == _BFS_PAR_ || _PARALLEL_ == _HYBRID_PAR_)
# pragma omp task if(sequential6) shared(mem_mngr, locker) untied
  {
#endif
  M6.UpdateMultiplier(Scalar(1));
  M6.UpdateMultiplier(Scalar(1));
  FastMatmulRecursive(locker, mem_mngr, A22, B21, M6, total_steps, steps_left - 1, (start_index + 6 - 1) * 8, x, num_threads, Scalar(0.0));
#ifndef _PARALLEL_
#endif
#if defined(_PARALLEL_) && (_PARALLEL_ == _BFS_PAR_ || _PARALLEL_ == _HYBRID_PAR_)
  locker.Decrement();
  }
  if (should_task_wait(8, total_steps, steps_left, start_index, 6, num_threads)) {
# pragma omp taskwait
# if defined(_PARALLEL_) && (_PARALLEL_ == _HYBRID_PAR_)
    SwitchToDFS(locker, num_threads);
# endif
  }
#endif

  // M7 = (1 * A21) * (1 * B12)
#if defined(_PARALLEL_) && (_PARALLEL_ == _BFS_PAR_ || _PARALLEL_ == _HYBRID_PAR_)
# pragma omp task if(sequential7) shared(mem_mngr, locker) untied
  {
#endif
  M7.UpdateMultiplier(Scalar(1));
  M7.UpdateMultiplier(Scalar(1));
  FastMatmulRecursive(locker, mem_mngr, A21, B12, M7, total_steps, steps_left - 1, (start_index + 7 - 1) * 8, x, num_threads, Scalar(0.0));
#ifndef _PARALLEL_
#endif
#if defined(_PARALLEL_) && (_PARALLEL_ == _BFS_PAR_ || _PARALLEL_ == _HYBRID_PAR_)
  locker.Decrement();
  }
  if (should_task_wait(8, total_steps, steps_left, start_index, 7, num_threads)) {
# pragma omp taskwait
# if defined(_PARALLEL_) && (_PARALLEL_ == _HYBRID_PAR_)
    SwitchToDFS(locker, num_threads);
# endif
  }
#endif

  // M8 = (1 * A22) * (1 * B22)
#if defined(_PARALLEL_) && (_PARALLEL_ == _BFS_PAR_ || _PARALLEL_ == _HYBRID_PAR_)
# pragma omp task if(sequential8) shared(mem_mngr, locker) untied
  {
#endif
  M8.UpdateMultiplier(Scalar(1));
  M8.UpdateMultiplier(Scalar(1));
  FastMatmulRecursive(locker, mem_mngr, A22, B22, M8, total_steps, steps_left - 1, (start_index + 8 - 1) * 8, x, num_threads, Scalar(0.0));
#ifndef _PARALLEL_
#endif
#if defined(_PARALLEL_) && (_PARALLEL_ == _BFS_PAR_ || _PARALLEL_ == _HYBRID_PAR_)
  locker.Decrement();
  }
  if (should_task_wait(8, total_steps, steps_left, start_index, 8, num_threads)) {
# pragma omp taskwait
  }
#endif

  M_Add1(M1, M2, C11, x, false, beta);
  M_Add2(M3, M4, C12, x, false, beta);
  M_Add3(M5, M6, C21, x, false, beta);
  M_Add4(M7, M8, C22, x, false, beta);

  // Handle edge cases with dynamic peeling
#if defined(_PARALLEL_) && (_PARALLEL_ == _BFS_PAR_ || _PARALLEL_ == _HYBRID_PAR_)
  if (total_steps == steps_left) {
    mkl_set_num_threads_local(num_threads);
    mkl_set_dynamic(0);
  }
#endif
  DynamicPeeling(A, B, C, 2, 2, 2, beta);
}
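Each Mi product above is wrapped in an OpenMP task whose if() clause decides between spawning (BFS) and running inline (DFS), with a taskwait before the additions combine the results. The sketch below isolates that control pattern; the recursion body is a stub and the inline cutoff is illustrative, not the library's should_launch_task / should_task_wait logic.

#include <omp.h>

// Recursive divide-and-conquer where each of the 8 subproblems may become an
// OpenMP task.  When the if() clause evaluates to false the task runs
// immediately in the encountering thread, which is how the DFS fallback works.
static void recurse(int steps_left, int num_threads) {
  if (steps_left == 0) return;               // base case: the real code calls MatMul here
  bool run_inline = (steps_left <= 1) || (num_threads <= 1);  // stand-in for should_launch_task(...)
  for (int i = 0; i < 8; ++i) {
    #pragma omp task if(!run_inline) untied
    recurse(steps_left - 1, num_threads);
  }
  #pragma omp taskwait                       // all eight products must finish before the additions
}

int main() {
  #pragma omp parallel
  {
    #pragma omp single
    recurse(3, omp_get_max_threads());
  }
  return 0;
}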