// icpc -fopenmp -std=c++11 -O3 dealloc_test.cpp -o dealloc_test_par void run() { using FpMilliseconds = std::chrono::duration<float, std::chrono::milliseconds::period>; static_assert(std::chrono::treat_as_floating_point<FpMilliseconds::rep>::value, "Rep required to be floating point"); int m = 22000; int k = 2000; int n = 22000; double *A = memory(m, k); double *B = memory(k, n); double *C = memory(m, n); std::vector<double *> data; for (int i = 0; i < 26; ++i) { data.push_back(memory(m / 4, n / 4)); } for (int i = 0; i < 26; ++i) { #pragma omp task fill(m / 4, n / 4, data[i]); } #pragma omp taskwait auto t1 = std::chrono::high_resolution_clock::now(); for (double *p : data) { #pragma omp task delete [] p; } auto t2 = std::chrono::high_resolution_clock::now(); auto time_ms = FpMilliseconds(t2 - t1); std::cout << "time: " << time_ms.count() << " ms" << std::endl; }
// Runs `func` exactly once and returns the elapsed time in milliseconds.
//
// func:    callable to time; it is invoked with no arguments and anything it
//          throws propagates to the caller.
// returns: elapsed time of the call in (fractional) milliseconds.
double Time(std::function<void ()> func) {
    // Use a double representation: the function returns double, so going
    // through a float duration would only throw away precision on long runs.
    using FpMilliseconds =
        std::chrono::duration<double, std::chrono::milliseconds::period>;
    static_assert(std::chrono::treat_as_floating_point<FpMilliseconds::rep>::value,
                  "Rep required to be floating point");

    // steady_clock is monotonic, so the result can never be negative or
    // distorted by a wall-clock adjustment that happens while func runs.
    const auto start = std::chrono::steady_clock::now();
    func();
    const auto stop = std::chrono::steady_clock::now();

    return FpMilliseconds(stop - start).count();
}
double FastMatmul(Matrix<Scalar>& A, Matrix<Scalar>& B, Matrix<Scalar>& C, int num_steps, double x=1e-8, Scalar alpha=Scalar(1.0), Scalar beta=Scalar(0.0)) { MemoryManager<Scalar> mem_mngr; #ifdef _PARALLEL_ mem_mngr.Allocate(2, 2, 2, 8, num_steps, A.m(), A.n(), B.n()); #endif A.set_multiplier(alpha); int num_multiplies_per_step = 8; int total_multiplies = pow(num_multiplies_per_step, num_steps); // Set parameters needed for all types of parallelism. int num_threads = 0; #ifdef _PARALLEL_ # pragma omp parallel { if (omp_get_thread_num() == 0) { num_threads = omp_get_num_threads(); } } omp_set_nested(1); #endif #if defined(_PARALLEL_) && (_PARALLEL_ == _BFS_PAR_) # pragma omp parallel { mkl_set_num_threads_local(1); mkl_set_dynamic(0); } #endif #if defined(_PARALLEL_) && (_PARALLEL_ == _DFS_PAR_) mkl_set_dynamic(0); #endif #if defined(_PARALLEL_) && (_PARALLEL_ == _HYBRID_PAR_) if (num_threads > total_multiplies) { mkl_set_dynamic(0); } else { # pragma omp parallel { mkl_set_num_threads_local(1); mkl_set_dynamic(0); } } #endif LockAndCounter locker(total_multiplies - (total_multiplies % num_threads)); using FpMilliseconds = std::chrono::duration<float, std::chrono::milliseconds::period>; auto t1 = std::chrono::high_resolution_clock::now(); #ifdef _PARALLEL_ # pragma omp parallel { # pragma omp single #endif FastMatmulRecursive(locker, mem_mngr, A, B, C, num_steps, num_steps, 0, x, num_threads, beta); #ifdef _PARALLEL_ } #endif auto t2 = std::chrono::high_resolution_clock::now(); return FpMilliseconds(t2 - t1).count(); }