static void chol_driver(blas_idx_t n_global)
{
    auto grid = std::make_shared<blacs_grid_t>();
    auto a    = make_tridiagonal(grid, n_global);

    // Compute Cholesky factorization of A in-place
    char uplo = 'U';
    blas_idx_t ia = 1, ja = 1, info;

    MPI_Barrier(MPI_COMM_WORLD);
    double t0 = MPI_Wtime();

    pdpotrf_(uplo, n_global, a->local_data(), ia, ja, a->descriptor(), info);
    assert(info == 0);

    double t1 = MPI_Wtime() - t0;

    double t_glob;
    MPI_Reduce(&t1, &t_glob, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);

    if (grid->iam() == 0)
    {
        double gflops = potrf_flops(n_global) / t_glob / grid->nprocs();
        printf("\n"
               "MATRIX CHOLESKY FACTORIZATION BENCHMARK SUMMARY\n"
               "===============================================\n"
               "N = %d\tNP = %d\tNP_ROW = %d\tNP_COL = %d\n"
               "Time for PxPOTRF = %10.7f seconds\tGflops/Proc = %10.7f\n",
               n_global, grid->nprocs(), grid->nprows(), grid->npcols(),
               t_glob, gflops);
        fflush(stdout);
    }
}
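// Note: potrf_flops() is referenced above but not defined in this excerpt.
// A minimal sketch of a plausible definition, assuming the standard ~n^3/3
// flop count for a Cholesky factorization, scaled to Gflop so that dividing
// by seconds yields Gflop/s as printed above:
static double potrf_flops(blas_idx_t n)
{
    double dn = static_cast<double>(n);
    return dn * dn * dn / 3.0 * 1.0e-9;   // ~n^3/3 flops, in units of Gflop
}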
void BaseLB::LDStats::print()
{
#if CMK_LBDB_ON
  int i;
  CkPrintf("------------- Processor Data: %d -------------\n", nprocs());
  for (int pe = 0; pe < nprocs(); pe++) {
    struct ProcStats &proc = procs[pe];
    CkPrintf("Proc %d (%d) Speed %d Total = %f Idle = %f Bg = %f nObjs = %d",
             pe, proc.pe, proc.pe_speed, proc.total_walltime, proc.idletime,
             proc.bg_walltime, proc.n_objs);
#if CMK_LB_CPUTIMER
    CkPrintf(" CPU Total %f Bg %f", proc.total_cputime, proc.bg_cputime);
#endif
    CkPrintf("\n");
  }
  CkPrintf("------------- Object Data: %d objects -------------\n", n_objs);
  for (i = 0; i < n_objs; i++) {
    LDObjData &odata = objData[i];
    CkPrintf("Object %d\n", i);
    CkPrintf(" id = %d %d %d %d\n", odata.objID().id[0], odata.objID().id[1],
             odata.objID().id[2], odata.objID().id[3]);
    CkPrintf(" OM id = %d\t", odata.omID().id);
    CkPrintf(" Mig. = %d\n", odata.migratable);
#if CMK_LB_CPUTIMER
    CkPrintf(" CPU = %f\t", odata.cpuTime);
#endif
    CkPrintf(" Wall = %f\n", odata.wallTime);
  }
  CkPrintf("------------- Comm Data: %d records -------------\n", n_comm);
  CkVec<LDCommData> &cdata = commData;
  for (i = 0; i < n_comm; i++) {
    CkPrintf("Link %d\n", i);

    LDObjid &sid = cdata[i].sender.objID();
    if (cdata[i].from_proc())
      CkPrintf(" sender PE = %d\t", cdata[i].src_proc);
    else
      CkPrintf(" sender id = %d:[%d %d %d %d]\t",
               cdata[i].sender.omID().id, sid.id[0], sid.id[1], sid.id[2], sid.id[3]);

    LDObjid &rid = cdata[i].receiver.get_destObj().objID();
    if (cdata[i].recv_type() == LD_PROC_MSG)
      CkPrintf(" receiver PE = %d\n", cdata[i].receiver.proc());
    else
      CkPrintf(" receiver id = %d:[%d %d %d %d]\n",
               cdata[i].receiver.get_destObj().omID().id,
               rid.id[0], rid.id[1], rid.id[2], rid.id[3]);

    CkPrintf(" messages = %d\t", cdata[i].messages);
    CkPrintf(" bytes = %d\n", cdata[i].bytes);
  }
  CkPrintf("------------- Object to PE mapping -------------\n");
  for (i = 0; i < n_objs; i++)
    CkPrintf(" %d", from_proc[i]);
  CkPrintf("\n");
#endif
}
void BaseLB::LDStats::normalize_speed()
{
  double maxspeed = 0.0;
  for (int pe = 0; pe < nprocs(); pe++) {
    if (procs[pe].pe_speed > maxspeed)
      maxspeed = procs[pe].pe_speed;
  }
  for (int pe = 0; pe < nprocs(); pe++)
    procs[pe].pe_speed /= maxspeed;
}
void BaseLB::LDStats::pup(PUP::er &p)
{
  int i;
  p(count);
  p(n_objs);
  p(n_migrateobjs);
  p(n_comm);
  if (p.isUnpacking()) {
    // The user can specify a number of simulated processors other than the real # of procs.
    int maxpe = nprocs() > LBSimulation::simProcs ? nprocs() : LBSimulation::simProcs;
    procs = new ProcStats[maxpe];
    objData.resize(n_objs);
    commData.resize(n_comm);
    from_proc.resize(n_objs);
    to_proc.resize(n_objs);
    objHash = NULL;
  }
  // Ignore the background load when unpacking if the user changed the # of procs;
  // otherwise load everything.
  if (p.isUnpacking() && LBSimulation::procsChanged) {
    ProcStats dummy;
    for (i = 0; i < nprocs(); i++) p|dummy;
  }
  else
    for (i = 0; i < nprocs(); i++) p|procs[i];
  for (i = 0; i < n_objs; i++) p|objData[i];
  for (i = 0; i < n_objs; i++) p|from_proc[i];
  for (i = 0; i < n_objs; i++) p|to_proc[i];
  // Reset to_proc when unpacking
  if (p.isUnpacking())
    for (i = 0; i < n_objs; i++) to_proc[i] = from_proc[i];
  for (i = 0; i < n_comm; i++) p|commData[i];
  if (p.isUnpacking())
    count = LBSimulation::simProcs;
  if (p.isUnpacking()) {
    objHash = NULL;
    if (_lb_args.lbversion() <= 1)
      for (i = 0; i < nprocs(); i++) procs[i].pe = i;
  }
}
double BaseLB::LDStats::computeAverageLoad()
{
  int i, numAvail = 0;
  double total = 0;

  // Sum the wall time of all objects
  for (i = 0; i < n_objs; i++)
    total += objData[i].wallTime;

  // Add the background load of every available processor
  for (i = 0; i < nprocs(); i++)
    if (procs[i].available == CmiTrue) {
      total += procs[i].bg_walltime;
      numAvail++;
    }

  double averageLoad = total / numAvail;
  return averageLoad;
}
int BaseLB::LDStats::useMem()
{
  // Calculate the memory usage of this LB (superclass).
  return sizeof(LDStats) + sizeof(ProcStats) * nprocs() +
         (sizeof(LDObjData) + 2 * sizeof(int)) * n_objs +
         sizeof(LDCommData) * n_comm;
}
void BaseLB::LDStats::computeNonlocalComm(int &nmsgs, int &nbytes)
{
#if CMK_LBDB_ON
  nmsgs = 0;
  nbytes = 0;

  makeCommHash();

  int mcast_count = 0;
  for (int cidx = 0; cidx < n_comm; cidx++) {
    LDCommData &cdata = commData[cidx];
    int senderPE, receiverPE;
    if (cdata.from_proc())
      senderPE = cdata.src_proc;
    else {
      int idx = getHash(cdata.sender);
      if (idx == -1) continue;    // sender has just migrated?
      senderPE = to_proc[idx];
      CmiAssert(senderPE != -1);
    }
    CmiAssert(senderPE < nprocs() && senderPE >= 0);

    // Find the receiver: either a point-to-point message or a multicast
    int receiver_type = cdata.receiver.get_type();
    if (receiver_type == LD_PROC_MSG || receiver_type == LD_OBJ_MSG) {
      if (receiver_type == LD_PROC_MSG)
        receiverPE = cdata.receiver.proc();
      else {    // LD_OBJ_MSG
        int idx = getHash(cdata.receiver.get_destObj());
        if (idx == -1) {          // receiver outside this domain
          if (complete_flag) continue;
          else receiverPE = -1;
        }
        else {
          receiverPE = to_proc[idx];
          CmiAssert(receiverPE < nprocs() && receiverPE >= 0);
        }
      }
      if (senderPE != receiverPE) {
        nmsgs += cdata.messages;
        nbytes += cdata.bytes;
      }
    }
    else if (receiver_type == LD_OBJLIST_MSG) {
      int nobjs;
      LDObjKey *objs = cdata.receiver.get_destObjs(nobjs);
      mcast_count++;
      CkVec<int> pes;
      for (int i = 0; i < nobjs; i++) {
        int idx = getHash(objs[i]);
        CmiAssert(idx != -1);
        if (idx == -1) continue;  // receiver has just been removed?
        receiverPE = to_proc[idx];
        CmiAssert(receiverPE < nprocs() && receiverPE >= 0);
        int exist = 0;
        for (int p = 0; p < pes.size(); p++)
          if (receiverPE == pes[p]) { exist = 1; break; }
        if (exist) continue;
        pes.push_back(receiverPE);
        if (senderPE != receiverPE) {
          nmsgs += cdata.messages;
          nbytes += cdata.bytes;
        }
      }
    }
  }    // end of for
#endif
}
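// A hedged usage sketch (not part of the original source): once a balancing
// strategy has filled in to_proc[], a caller holding a BaseLB::LDStats* could
// report how much communication the new mapping leaves off-processor. The
// helper name report_nonlocal_comm is illustrative only.
static void report_nonlocal_comm(BaseLB::LDStats *stats)
{
  int nmsgs = 0, nbytes = 0;
  stats->computeNonlocalComm(nmsgs, nbytes);   // counts messages/bytes crossing PE boundaries
  CkPrintf("Non-local communication after mapping: %d messages, %d bytes\n",
           nmsgs, nbytes);
}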
/// Benchmark driver: invert a random N x N matrix.
/// The computed inverse is stored in ai, a copy of A.
/// n_global: the order of the matrix
static void inv_driver(blas_idx_t n_global)
{
    auto grid = std::make_shared<blacs_grid_t>();

    // Debug code (disabled): initialize a small 3x3 matrix by hand
    //n_global = 3;
    //double *aaa = new double(n_global*n_global);
    //for (int i = 0; i < 9; i++)
    //{
    //    aaa[i] = i + 1;
    //}
    //aaa[8] = 10;
    //auto a = block_cyclic_mat_t::createWithArray(grid, n_global, n_global, aaa);

    // Create an NxN random matrix A
    auto a = block_cyclic_mat_t::random(grid, n_global, n_global);

    // Create an NxN matrix to hold A^{-1}
    auto ai = block_cyclic_mat_t::constant(grid, n_global, n_global);

    // Copy A to A^{-1} since it will be overwritten during factorization
    std::copy_n(a->local_data(), a->local_size(), ai->local_data());

    MPI_Barrier(MPI_COMM_WORLD);
    double t0 = MPI_Wtime();

    // Factorize A
    blas_idx_t ia = 1, ja = 1;
    std::vector<blas_idx_t> ipiv(a->local_rows() + a->row_block_size() + 100);
    blas_idx_t info;

    // The routine name reads D-GE-TRF: D for double precision, GE for a general
    // matrix, TRF for triangular factorization, i.e. the usual LU decomposition.
    pdgetrf_(n_global, n_global, ai->local_data(), ia, ja, ai->descriptor(),
             ipiv.data(), info);
    assert(info == 0);
    double t_factor = MPI_Wtime() - t0;

    // Compute A^{-1} based on the LU factorization

    // Query the workspace sizes for the double and integer work arrays on each process
    blas_idx_t lwork  = 10;
    blas_idx_t liwork = 10;
    std::vector<double>     work(lwork);
    std::vector<blas_idx_t> iwork(liwork);

    lwork = liwork = -1;    // -1 requests a workspace query that returns the optimal sizes
    pdgetri_(n_global, ai->local_data(), ia, ja, ai->descriptor(),
             ipiv.data(), work.data(), lwork, iwork.data(), liwork, info);
    assert(info == 0);
    lwork  = static_cast<blas_idx_t>(work[0]);
    liwork = static_cast<blas_idx_t>(iwork[0]);
    work.resize(lwork);
    iwork.resize(liwork);

    // Now compute the inverse
    t0 = MPI_Wtime();
    pdgetri_(n_global, ai->local_data(), ia, ja, ai->descriptor(),
             ipiv.data(), work.data(), lwork, iwork.data(), liwork, info);
    assert(info == 0);
    double t_solve = MPI_Wtime() - t0;

    // Verify that the inverse is correct using A*A^{-1} = I
    auto identity = block_cyclic_mat_t::diagonal(grid, n_global, n_global);

    // Compute I = A * A^{-1} - I and verify that ||I|| is small
    char nein = 'N';
    double alpha = 1.0, beta = -1.0;
    pdgemm_(nein, nein, n_global, n_global, n_global, alpha,
            a->local_data(),  ia, ja, a->descriptor(),
            ai->local_data(), ia, ja, ai->descriptor(), beta,
            identity->local_data(), ia, ja, identity->descriptor());

    // Compute the 1-norm of the result
    char norm = '1';
    work.resize(identity->local_cols());
    double err = pdlange_(norm, n_global, n_global, identity->local_data(),
                          ia, ja, identity->descriptor(), work.data());

    double t_total = t_factor + t_solve;
    double t_glob;
    MPI_Reduce(&t_total, &t_glob, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);

    if (grid->iam() == 0)
    {
        double gflops = getri_flops(n_global) / t_glob / grid->nprocs();
        printf("\n"
               "MATRIX INVERSE BENCHMARK SUMMARY\n"
               "================================\n"
               "N = %d\tNP = %d\tNP_ROW = %d\tNP_COL = %d\n"
               "Time for PxGETRF + PxGETRI = %10.7f seconds\tGflops/Proc = %10.7f, Error = %f\n",
               n_global, grid->nprocs(), grid->nprows(), grid->npcols(),
               t_glob, gflops, err);
        fflush(stdout);
    }
}
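// Note: getri_flops() is referenced above but not shown in this excerpt.
// A minimal sketch of a plausible definition, assuming the usual counts of
// roughly (2/3)n^3 flops for the LU factorization (GETRF) plus (4/3)n^3 for
// forming the inverse from it (GETRI), about 2n^3 in total, scaled to Gflop:
static double getri_flops(blas_idx_t n)
{
    double dn = static_cast<double>(n);
    return 2.0 * dn * dn * dn * 1.0e-9;   // ~2n^3 flops for GETRF + GETRI, in Gflop
}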
static void dgemm_driver(blas_idx_t m_global, blas_idx_t n_global, blas_idx_t k_global)
{
    auto grid = std::make_shared<blacs_grid_t>();
    auto a = block_cyclic_mat_t::random(grid, m_global, k_global);
    // auto b = block_cyclic_mat_t::random(grid, k_global, n_global);
    auto c = block_cyclic_mat_t::random(grid, m_global, n_global);

    // For testing (TODO): build B from a deterministic array instead of random data
    double *dd = new double[m_global * k_global];
    for (int i = 0; i < m_global * k_global; i++)
    {
        dd[i] = i;
    }
    auto b = block_cyclic_mat_t::createWithArray(grid, m_global, k_global, dd);

    // The matrix computation starts here
    MPI_Barrier(MPI_COMM_WORLD);

    double alpha = 1.0, beta = 0.0;
    double t0 = MPI_Wtime();

    char NEIN = 'N';    // 'N' means the operand is not transposed
    blas_idx_t ia = 1, ja = 1, ib = 1, jb = 1, ic = 1, jc = 1;

    // sub(C) = alpha*op(sub(A))*op(sub(B)) + beta*sub(C)
    pdgemm_(NEIN, NEIN, m_global, n_global, k_global,
            alpha,
            a->local_data(), ia, ja, a->descriptor(),
            b->local_data(), ib, jb, b->descriptor(),
            beta,
            c->local_data(), ic, jc, c->descriptor());

    double t1 = MPI_Wtime() - t0;

    // Take the maximum time over all processes
    double t_glob;
    MPI_Reduce(&t1, &t_glob, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);

    if (grid->iam() == 0)    // rank 0 only
    {
        double gflops = gemm_flops(m_global, n_global, k_global) / t_glob / grid->nprocs();
        printf("\n"
               "MATRIX MULTIPLY BENCHMARK SUMMARY\n"
               "=================================\n"
               "M = %d\tN = %d\tK = %d\tNP = %d\tNP_ROW = %d\tNP_COL = %d\n"
               "Time for PxGEMM = %10.7f seconds\tGFlops/Proc = %10.7f\n",
               m_global, n_global, k_global, grid->nprocs(), grid->nprows(),
               grid->npcols(), t_glob, gflops);
        fflush(stdout);

        // Debug output (TODO): dump a 10x10 corner of the raw local buffer of C
        for (int i = 0; i < 10; i++)
        {
            for (int j = 0; j < 10; j++)
            {
                printf("%f ", c->local_data()[i * k_global + j]);
            }
            printf("\n");
        }
        fflush(stdout);
    }
}
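// A minimal sketch of how these drivers might be invoked; the real program's
// entry point is not part of this excerpt. The fixed problem size below and
// the bare MPI_Init/MPI_Finalize pairing are assumptions, and blacs_grid_t is
// assumed to set up and tear down the BLACS process grid in its constructor
// and destructor, as the drivers above suggest.
int main(int argc, char **argv)
{
    MPI_Init(&argc, &argv);

    blas_idx_t n = 1000;     // hypothetical problem size

    dgemm_driver(n, n, n);   // C = A * B benchmark
    chol_driver(n);          // Cholesky factorization benchmark
    inv_driver(n);           // matrix inverse benchmark

    MPI_Finalize();
    return 0;
}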