void cpu_benchmark(int n, int samples, double &duration, double &GFLOPS) { type_precision* A = new type_precision[n * n]; type_precision* B = new type_precision[n * n]; type_precision* C = new type_precision[n * n]; cputime_type start_tick, end_tick; duration = 9999999999.0; int b = 0; for (int i = 0; i < samples; i++) { re_random_vec(A, n*n); re_random_vec(B, n*n); re_random_vec(C, n*n); get_ticks(start_tick); cblas_sgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, n, n, n, 1.0, A, n, B, n, 1.0, C, n); get_ticks(end_tick); duration = min(duration, static_cast<double>(ticks2sec(end_tick, start_tick))); int a = 0; for (int j = 0; j < n * n ; j++) { a += A[j] + B[j] + C[j]; } b += a; } //!2nnn - nn + 2nn (from+c) GFLOPS = gemm_flops(n, n, n, 0); cout << b; delete []A; delete []B; delete []C; }
/// Runs one distributed C = alpha*A*B + beta*C with pdgemm_ and reports timing.
///
/// A (m x k) and C (m x n) are created with block_cyclic_mat_t::random; B is
/// created from a test array whose entries are 0, 1, 2, ... via createWithArray.
/// The process for which grid->iam() == 0 prints the summary and a 10 x 10
/// dump of its local part of C.
///
/// @param m_global  Global row count of A and C.
/// @param n_global  Global column count of B and C.
/// @param k_global  Global column count of A / row count of B.
static void dgemm_driver(blas_idx_t m_global, blas_idx_t n_global, blas_idx_t k_global)
{
    auto grid = std::make_shared<blacs_grid_t>();

    auto a = block_cyclic_mat_t::random(grid, m_global, k_global);
    // auto b = block_cyclic_mat_t::random(grid, k_global, n_global);
    auto c = block_cyclic_mat_t::random(grid, m_global, n_global);

    // for test TODO
    // B must be k x n for op(A) * op(B) with no transposes, so the test array
    // and the matrix built from it are sized k_global x n_global.
    const std::size_t b_elems =
        static_cast<std::size_t>(k_global) * static_cast<std::size_t>(n_global);
    // NOTE(review): dd is never freed in this function; whether
    // createWithArray copies the data or keeps/owns the pointer is not
    // visible here — confirm before adding a delete[].
    double *dd = new double[b_elems];
    for (std::size_t i = 0; i < b_elems; ++i)
    {
        dd[i] = static_cast<double>(i);
    }
    auto b = block_cyclic_mat_t::createWithArray(grid, k_global, n_global, dd);

    // Matrix computation starts here.
    MPI_Barrier(MPI_COMM_WORLD);

    double alpha = 1.0, beta = 0.0;

    double t0 = MPI_Wtime();
    char NEIN = 'N'; // 'N' means: do not transpose
    blas_idx_t ia = 1, ja = 1, ib = 1, jb = 1, ic = 1, jc = 1;

    // sub(C) = alpha*op(sub(A))*op(sub(B)) + beta*sub(C)
    pdgemm_ (NEIN, NEIN, m_global, n_global, k_global,
             alpha,
             a->local_data(), ia, ja, a->descriptor(),
             b->local_data(), ib, jb, b->descriptor(),
             beta,
             c->local_data(), ic, jc, c->descriptor() );

    double t1 = MPI_Wtime() - t0;

    double t_glob;
    // Take the longest elapsed time over all processes; only rank 0 of
    // MPI_COMM_WORLD receives the result.
    MPI_Reduce(&t1, &t_glob, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);

    if (grid->iam() == 0) // the process whose grid id is 0
    {
        double gflops = gemm_flops(m_global, n_global, k_global)/t_glob/grid->nprocs();

        // NOTE(review): %d assumes blas_idx_t and the grid accessors are
        // int-sized — TODO confirm.
        printf("\n"
               "MATRIX MULTIPLY BENCHMARK SUMMARY\n"
               "=================================\n"
               "M = %d\tN = %d\tK = %d\tNP = %d\tNP_ROW = %d\tNP_COL = %d\n"
               "Time for PxGEMM = %10.7f seconds\tGFlops/Proc = %10.7f\n",
               m_global, n_global, k_global, grid->nprocs(), grid->nprows(), grid->npcols(),
               t_glob, gflops);
        fflush(stdout);

        // Debug dump of the first 10 x 10 entries of this process's local C.
        // NOTE(review): the index uses k_global as the stride and is not
        // bounded by the local matrix size; this presumably reads out of
        // range for small problems — verify against the local layout.
        for (int i = 0; i < 10; i++)
        {
            for (int j = 0; j < 10; j++)
            {
                printf("%f ", c->local_data()[i*k_global + j]);
            }
            printf("\n");
        }
        fflush(stdout);
    }
}