Пример #1
0
// icpc -fopenmp -std=c++11 -O3 dealloc_test.cpp -o dealloc_test_par
// Deallocation benchmark: allocates a set of buffers through memory(), fills
// them in OpenMP tasks, then prints how many milliseconds it takes to release
// them again with one OpenMP task per buffer.
//
// This function does not open a parallel region itself; the task pragmas rely
// on whatever region (if any) the caller has set up.
void run() {
  using FpMilliseconds =
      std::chrono::duration<float, std::chrono::milliseconds::period>;
  static_assert(std::chrono::treat_as_floating_point<FpMilliseconds::rep>::value,
                "Rep required to be floating point");
  const int m = 22000;
  const int k = 2000;
  const int n = 22000;
  const int num_blocks = 26;

  // Not touched by the measurement; allocated ahead of the timed buffers and
  // released at the end of the function.
  double *A = memory(m, k);
  double *B = memory(k, n);
  double *C = memory(m, n);

  std::vector<double *> data;
  data.reserve(num_blocks);
  for (int i = 0; i < num_blocks; ++i) {
    data.push_back(memory(m / 4, n / 4));
  }
  for (double *block : data) {
    // Each task captures only its own buffer pointer.
    #pragma omp task firstprivate(block)
    fill(m / 4, n / 4, block);
  }
  #pragma omp taskwait

  // steady_clock is monotonic, which is what an interval measurement needs.
  const auto t1 = std::chrono::steady_clock::now();
  for (double *p : data) {
    #pragma omp task firstprivate(p)
    delete [] p;
  }
  // Wait for the deallocation tasks before stopping the clock; without this
  // the interval could cover only the creation of the tasks.
  #pragma omp taskwait
  const auto t2 = std::chrono::steady_clock::now();
  const auto time_ms = FpMilliseconds(t2 - t1);
  std::cout << "time: " << time_ms.count() << " ms" << std::endl;

  // Released with delete [] like the buffers in `data`, which come from the
  // same memory() helper.
  delete [] A;
  delete [] B;
  delete [] C;
}
Пример #2
0
// Run func once and return the elapsed time of that call in milliseconds.
//
// Any exception thrown by func propagates to the caller.
double Time(std::function<void ()> func) {
  // Keep the interval in double precision: the result is returned as a
  // double, so a float representation would only discard digits.
  using FpMilliseconds =
      std::chrono::duration<double, std::chrono::milliseconds::period>;
  static_assert(std::chrono::treat_as_floating_point<FpMilliseconds::rep>::value,
                "Rep required to be floating point");
  // steady_clock never goes backwards, so the difference of two readings is a
  // valid interval even if the system time is adjusted during the call.
  const auto t1 = std::chrono::steady_clock::now();
  func();
  const auto t2 = std::chrono::steady_clock::now();
  return FpMilliseconds(t2 - t1).count();
}
Пример #3
0
// Entry point that configures threading, runs FastMatmulRecursive on A, B and
// C, and returns the elapsed time of that recursive call in milliseconds.
//
//   A, B, C    matrices forwarded to FastMatmulRecursive
//   num_steps  number of recursive steps; also the exponent used to compute
//              the total multiply count (8 per step)
//   x          forwarded unchanged to FastMatmulRecursive
//   alpha      installed on A through set_multiplier() before the run
//   beta       forwarded unchanged to FastMatmulRecursive
//
// The threading setup is selected at compile time through _PARALLEL_ and its
// _BFS_PAR_ / _DFS_PAR_ / _HYBRID_PAR_ values.
double FastMatmul(Matrix<Scalar>& A, Matrix<Scalar>& B, Matrix<Scalar>& C,
                  int num_steps, double x=1e-8, Scalar alpha=Scalar(1.0), Scalar beta=Scalar(0.0)) {
    MemoryManager<Scalar> mem_mngr;
#ifdef _PARALLEL_
    mem_mngr.Allocate(2, 2, 2, 8, num_steps, A.m(), A.n(), B.n());
#endif
    A.set_multiplier(alpha);

    // total_multiplies = 8^num_steps, computed with integers. A floating-point
    // pow() can return a value just below the exact power, which would then
    // truncate to the wrong count. A negative step count yields 0, which is
    // what the truncated pow() result used to be.
    const int num_multiplies_per_step = 8;
    int total_multiplies = (num_steps < 0) ? 0 : 1;
    for (int step = 0; step < num_steps; ++step) {
        total_multiplies *= num_multiplies_per_step;
    }

    // Set parameters needed for all types of parallelism.
    int num_threads = 0;
#ifdef _PARALLEL_
    // Ask the runtime how many threads a parallel region actually gets.
    # pragma omp parallel
    {
        if (omp_get_thread_num() == 0) {
            num_threads = omp_get_num_threads();
        }
    }
    omp_set_nested(1);
#endif

#if defined(_PARALLEL_) && (_PARALLEL_ == _BFS_PAR_)
    # pragma omp parallel
    {
        mkl_set_num_threads_local(1);
        mkl_set_dynamic(0);
    }
#endif

#if defined(_PARALLEL_) && (_PARALLEL_ == _DFS_PAR_)
    mkl_set_dynamic(0);
#endif

#if defined(_PARALLEL_) && (_PARALLEL_ == _HYBRID_PAR_)
    if (num_threads > total_multiplies) {
        mkl_set_dynamic(0);
    } else {
        # pragma omp parallel
        {
            mkl_set_num_threads_local(1);
            mkl_set_dynamic(0);
        }
    }
#endif

    // num_threads stays 0 when _PARALLEL_ is not defined; guard the modulo so
    // that build does not divide by zero. A remainder of 0 is also what a
    // single thread would give.
    const int leftover_multiplies =
        (num_threads > 0) ? (total_multiplies % num_threads) : 0;
    LockAndCounter locker(total_multiplies - leftover_multiplies);

    // Double-precision milliseconds on a monotonic clock: the value is
    // returned as a double and must be a valid interval.
    using FpMilliseconds = std::chrono::duration<double, std::chrono::milliseconds::period>;
    const auto t1 = std::chrono::steady_clock::now();

#ifdef _PARALLEL_
    # pragma omp parallel
    {
        # pragma omp single
#endif
        FastMatmulRecursive(locker, mem_mngr, A, B, C, num_steps, num_steps, 0, x, num_threads, beta);
#ifdef _PARALLEL_
    }
#endif
    const auto t2 = std::chrono::steady_clock::now();
    return FpMilliseconds(t2 - t1).count();
}