Example #1
static void chol_driver(blas_idx_t n_global)
{
    auto grid = std::make_shared<blacs_grid_t>();    
    auto a    = make_tridiagonal(grid, n_global);    

    // Compute Cholesky factorization of A in-place
    char       uplo     = 'U';
    blas_idx_t ia       = 1, ja = 1, info;

    MPI_Barrier (MPI_COMM_WORLD);
    double t0 = MPI_Wtime();
    pdpotrf_ (uplo, n_global, a->local_data(), ia, ja, a->descriptor(), info);
    assert(info == 0);

    double t1 = MPI_Wtime() - t0;
  
    double t_glob;
    MPI_Reduce(&t1, &t_glob, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);

    if (grid->iam() == 0) 
    {
        double gflops = potrf_flops(n_global)/t_glob/grid->nprocs();
        printf("\n"
            "MATRIX CHOLESKY FACTORIZATION BENCHMARK SUMMARY\n"
            "===============================================\n"
            "N = %d\tNP = %d\tNP_ROW = %d\tNP_COL = %d\n"
            "Time for PxPOTRF = %10.7f seconds\tGflops/Proc = %10.7f\n",
            n_global, grid->nprocs(), grid->nprows(), grid->npcols(), 
            t_glob, gflops);
        fflush(stdout);
    }
}
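The Gflop rate printed above relies on a `potrf_flops` helper that is not part of this listing; a minimal sketch, assuming the standard n^3/3 flop count for a Cholesky factorization and a result expressed in units of 10^9 flops:

// Hypothetical helper (not shown in the listing above): approximate operation
// count for PxPOTRF on an n x n matrix, returned in units of 1e9 flops so that
// dividing by elapsed seconds yields Gflop/s directly.
static double potrf_flops(blas_idx_t n)
{
    double nd = static_cast<double>(n);
    return (nd * nd * nd / 3.0) * 1e-9;   // ~n^3/3 floating point operations
}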
Example #2
void BaseLB::LDStats::print()
{
#if CMK_LBDB_ON
  int i;
  CkPrintf("------------- Processor Data: %d -------------\n", nprocs());
  for(int pe=0; pe < nprocs(); pe++) {
    struct ProcStats &proc = procs[pe];

    CkPrintf("Proc %d (%d) Speed %d Total = %f Idle = %f Bg = %f nObjs = %d",
      pe, proc.pe, proc.pe_speed, proc.total_walltime, proc.idletime,
      proc.bg_walltime, proc.n_objs);
#if CMK_LB_CPUTIMER
    CkPrintf(" CPU Total %f Bg %f", proc.total_cputime, proc.bg_cputime);
#endif
    CkPrintf("\n");
  }

  CkPrintf("------------- Object Data: %d objects -------------\n", n_objs);
  for(i=0; i < n_objs; i++) {
      LDObjData &odata = objData[i];
      CkPrintf("Object %d\n",i);
      CkPrintf("     id = %d %d %d %d\n",odata.objID().id[0],odata.objID().id[1
], odata.objID().id[2], odata.objID().id[3]);
      CkPrintf("  OM id = %d\t",odata.omID().id);
      CkPrintf("   Mig. = %d\n",odata.migratable);
#if CMK_LB_CPUTIMER
      CkPrintf("    CPU = %f\t",odata.cpuTime);
#endif
      CkPrintf("   Wall = %f\n",odata.wallTime);
  }

  CkPrintf("------------- Comm Data: %d records -------------\n", n_comm);
  CkVec<LDCommData> &cdata = commData;
  for(i=0; i < n_comm; i++) {
      CkPrintf("Link %d\n",i);

      LDObjid &sid = cdata[i].sender.objID();
      if (cdata[i].from_proc())
	CkPrintf("    sender PE = %d\t",cdata[i].src_proc);
      else
	CkPrintf("    sender id = %d:[%d %d %d %d]\t",
		 cdata[i].sender.omID().id,sid.id[0], sid.id[1], sid.id[2], sid.id[3]);

      LDObjid &rid = cdata[i].receiver.get_destObj().objID();
      if (cdata[i].recv_type() == LD_PROC_MSG)
	CkPrintf("  receiver PE = %d\n",cdata[i].receiver.proc());
      else	
	CkPrintf("  receiver id = %d:[%d %d %d %d]\n",
		 cdata[i].receiver.get_destObj().omID().id,rid.id[0],rid.id[1],rid.id[2],rid.id[3]);
      
      CkPrintf("     messages = %d\t",cdata[i].messages);
      CkPrintf("        bytes = %d\n",cdata[i].bytes);
  }
  CkPrintf("------------- Object to PE mapping -------------\n");
  for (i=0; i<n_objs; i++) CkPrintf(" %d", from_proc[i]);
  CkPrintf("\n");
#endif
}
Example #3
void BaseLB::LDStats::normalize_speed() {
  double maxspeed = 0.0;

  for(int pe=0; pe < nprocs(); pe++) {
    if (procs[pe].pe_speed > maxspeed) maxspeed = procs[pe].pe_speed;
  }
  for(int pe=0; pe < nprocs(); pe++)
    procs[pe].pe_speed /= maxspeed;
}
Example #4
void BaseLB::LDStats::pup(PUP::er &p)
{
  int i;
  p(count);
  p(n_objs);
  p(n_migrateobjs);
  p(n_comm);
  if (p.isUnpacking()) {
    // user can specify simulated processors other than the real # of procs.
    int maxpe = nprocs() > LBSimulation::simProcs ? nprocs() : LBSimulation::simProcs;
    procs = new ProcStats[maxpe];
    objData.resize(n_objs);
    commData.resize(n_comm);
    from_proc.resize(n_objs);
    to_proc.resize(n_objs);
    objHash = NULL;
  }
  // ignore the background load when unpacking if the user changed the # of procs;
  // otherwise load everything
  if (p.isUnpacking() && LBSimulation::procsChanged) {
    ProcStats dummy;
    for (i=0; i<nprocs(); i++) p|dummy;
  }
  else
    for (i=0; i<nprocs(); i++) p|procs[i];
  for (i=0; i<n_objs; i++) p|objData[i]; 
  for (i=0; i<n_objs; i++) p|from_proc[i]; 
  for (i=0; i<n_objs; i++) p|to_proc[i]; 
  // reset to_proc when unpacking
  if (p.isUnpacking())
    for (i=0; i<n_objs; i++) to_proc[i] = from_proc[i];
  for (i=0; i<n_comm; i++) p|commData[i];
  if (p.isUnpacking())
    count = LBSimulation::simProcs;
  if (p.isUnpacking()) {
    objHash = NULL;
    if (_lb_args.lbversion() <= 1) 
      for (i=0; i<nprocs(); i++) procs[i].pe = i;
  }
}
Example #5
double BaseLB::LDStats::computeAverageLoad()
{
  int i, numAvail=0;
  double total = 0;
  for (i=0; i<n_objs; i++) total += objData[i].wallTime;
  for (i=0; i<nprocs(); i++)
    if (procs[i].available == CmiTrue) {
        total += procs[i].bg_walltime;
	numAvail++;
    }
  double averageLoad = total/numAvail;
  return averageLoad;
}
Example #6
int BaseLB::LDStats::useMem() { 
  // calculate the memory usage of this LB (superclass).
  return sizeof(LDStats) + sizeof(ProcStats) * nprocs() +
	 (sizeof(LDObjData) + 2 * sizeof(int)) * n_objs +
 	 sizeof(LDCommData) * n_comm;
}
Example #7
void BaseLB::LDStats::computeNonlocalComm(int &nmsgs, int &nbytes)
{
#if CMK_LBDB_ON
  nmsgs = 0;
  nbytes = 0;

  makeCommHash();

  int mcast_count = 0;
  for (int cidx=0; cidx < n_comm; cidx++) {
    LDCommData& cdata = commData[cidx];
    int senderPE, receiverPE;
    if (cdata.from_proc())
      senderPE = cdata.src_proc;
    else {
      int idx = getHash(cdata.sender);
      if (idx == -1) continue;    // sender has just migrated?
      senderPE = to_proc[idx];
      CmiAssert(senderPE != -1);
    }
    CmiAssert(senderPE < nprocs() && senderPE >= 0);

    // find the receiver: point-to-point and multicast are the two cases
    int receiver_type = cdata.receiver.get_type();
    if (receiver_type == LD_PROC_MSG || receiver_type == LD_OBJ_MSG) {
      if (receiver_type == LD_PROC_MSG)
        receiverPE = cdata.receiver.proc();
      else {  // LD_OBJ_MSG
        int idx = getHash(cdata.receiver.get_destObj());
        if (idx == -1) {          // receiver outside this domain
          if (complete_flag) continue;
          else receiverPE = -1;
        }
        else {
          receiverPE = to_proc[idx];
          CmiAssert(receiverPE < nprocs() && receiverPE >= 0);
        }
      }
      if (senderPE != receiverPE) {
        nmsgs += cdata.messages;
        nbytes += cdata.bytes;
      }
    }
    else if (receiver_type == LD_OBJLIST_MSG) {
      int nobjs;
      LDObjKey *objs = cdata.receiver.get_destObjs(nobjs);
      mcast_count++;
      CkVec<int> pes;
      for (int i=0; i<nobjs; i++) {
        int idx = getHash(objs[i]);
        CmiAssert(idx != -1);
        if (idx == -1) continue;    // receiver has just been removed?
        receiverPE = to_proc[idx];
        CmiAssert(receiverPE < nprocs() && receiverPE >= 0);
        int exist = 0;
        for (int p=0; p<pes.size(); p++)
          if (receiverPE == pes[p]) { exist=1; break; }
        if (exist) continue;
        pes.push_back(receiverPE);
        if (senderPE != receiverPE) {
          nmsgs += cdata.messages;
          nbytes += cdata.bytes;
        }
      }
    }
  }   // end of for
#endif
}
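A minimal usage sketch for the routine above, assuming a populated `LDStats` pointer named `stats` (the name is a placeholder for illustration):

// Hypothetical call site: report how much of the recorded communication
// crosses processor boundaries under the current to_proc[] mapping.
int nmsgs = 0, nbytes = 0;
stats->computeNonlocalComm(nmsgs, nbytes);
CkPrintf("Non-local communication: %d messages, %d bytes\n", nmsgs, nbytes);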
Example #8
///  The computed result is stored in matrix a
///  n_global: the order of the matrix
static void inv_driver(blas_idx_t n_global)		
{

    auto grid = std::make_shared<blacs_grid_t>();
	
	//// self code
	//n_global = 3;
	//double *aaa = new double(n_global*n_global);
	//for (int i = 0; i < 9; i++)
	//{
	//	aaa[i] = i + 1;
	//}
	//aaa[8] = 10;
	//auto a = block_cyclic_mat_t::createWithArray(grid, n_global, n_global, aaa);


    // Create a NxN random matrix A
    auto a = block_cyclic_mat_t::random(grid, n_global, n_global);        

    // Create a NxN matrix to hold A^{-1}
    auto ai = block_cyclic_mat_t::constant(grid, n_global, n_global);

    // Copy A to A^{-1} since it will be overwritten during factorization
    std::copy_n(a->local_data(), a->local_size(), ai->local_data());

    MPI_Barrier (MPI_COMM_WORLD);

    double t0 = MPI_Wtime();
    
    // Factorize A 
    blas_idx_t ia = 1, ja = 1;
    std::vector<blas_idx_t> ipiv(a->local_rows() + a->row_block_size() + 100);
    blas_idx_t info;

    // The routine name decodes as D-GE-TRF:
    //   D   - the matrix holds double-precision values
    //   GE  - the matrix is a general (unsymmetric) matrix
    //   TRF - triangular factorization, i.e. the usual LU decomposition
    pdgetrf_(n_global, n_global, 
        ai->local_data(), ia, ja, ai->descriptor(), 
        ipiv.data(), 
        info);
    assert(info == 0);
    double t_factor = MPI_Wtime() - t0;

    // Compute A^{-1} based on the LU factorization

    // Compute workspace for double and integer work arrays on each process
    blas_idx_t lwork  = 10;
    blas_idx_t liwork = 10;
    std::vector<double>     work (lwork); 
    std::vector<blas_idx_t> iwork(liwork);

    lwork = liwork = -1;   

    // Workspace query: compute the required values of lwork and liwork
    pdgetri_(n_global, 
        ai->local_data(), ia, ja, ai->descriptor(), 
        ipiv.data(), 
        work.data(), lwork, iwork.data(), liwork, info);
    assert(info == 0);
    lwork  = static_cast<blas_idx_t>(work[0]);
    liwork = static_cast<size_t>(iwork[0]);
    work.resize(lwork);
    iwork.resize(liwork);

    // Now compute the inverse
    t0 = MPI_Wtime();
    pdgetri_(n_global, 
        ai->local_data(), ia, ja, ai->descriptor(), 
        ipiv.data(), 
        work.data(), lwork, iwork.data(), liwork, info);
    assert(info == 0);
    double t_solve = MPI_Wtime() - t0;

    // Verify that the inverse is correct using A*A^{-1} = I
    auto identity = block_cyclic_mat_t::diagonal(grid, n_global, n_global);

    // Compute A * A^{-1} - I (overwriting `identity`) and verify that its 1-norm is small
    char nein = 'N';
    double alpha = 1.0, beta = -1.0;
    pdgemm_(nein, nein, n_global, n_global, n_global, alpha, 
        a->local_data() , ia, ja, a->descriptor(),
        ai->local_data(), ia, ja, ai->descriptor(),
        beta,
        identity->local_data(), ia, ja, identity->descriptor());

    // Compute 1-norm of the result
    char norm='1';
    work.resize(identity->local_cols());
    double err = pdlange_(norm, n_global, n_global, 
        identity->local_data(), ia, ja, identity->descriptor(), work.data());

    double t_total = t_factor + t_solve;
    double t_glob;
    MPI_Reduce(&t_total, &t_glob, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);

    if (grid->iam() == 0) 
    {
        double gflops = getri_flops(n_global)/t_glob/grid->nprocs();
        printf("\n"
            "MATRIX INVERSE BENCHMARK SUMMARY\n"
            "================================\n"
            "N = %d\tNP = %d\tNP_ROW = %d\tNP_COL = %d\n"
            "Time for PxGETRF + PxGETRI = %10.7f seconds\tGflops/Proc = %10.7f, Error = %f\n",
            n_global, grid->nprocs(), grid->nprows(), grid->npcols(), 
            t_glob, gflops, err);
        fflush(stdout);
    }
}
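As in the Cholesky driver, the `getri_flops` helper is not shown in this listing; a minimal sketch, assuming roughly 2/3·n^3 flops for PxGETRF plus 4/3·n^3 for PxGETRI, i.e. about 2·n^3 in total:

// Hypothetical helper (not shown in the listing above): approximate operation
// count for inverting an n x n matrix via LU factorization followed by
// PxGETRI, returned in units of 1e9 flops.
static double getri_flops(blas_idx_t n)
{
    double nd = static_cast<double>(n);
    return (2.0 * nd * nd * nd) * 1e-9;   // ~2/3 n^3 (GETRF) + ~4/3 n^3 (GETRI)
}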
Example #9
static void dgemm_driver(blas_idx_t m_global, blas_idx_t n_global, blas_idx_t k_global)
{
    auto grid = std::make_shared<blacs_grid_t>();

    auto a = block_cyclic_mat_t::random(grid, m_global, k_global);
  //  auto b = block_cyclic_mat_t::random(grid, k_global, n_global);
    auto c = block_cyclic_mat_t::random(grid, m_global, n_global);
	

    // For testing (TODO): build B from an explicit array instead of random data.
    // B must be k_global x n_global to match the pdgemm_ call below.
    double *dd = new double[k_global*n_global];
    for (int i = 0; i < k_global*n_global; i++)
    {
        dd[i] = i;
    }
    auto b = block_cyclic_mat_t::createWithArray(grid, k_global, n_global, dd);
    // The distributed matrix computation starts here.

    MPI_Barrier(MPI_COMM_WORLD);

    double alpha = 1.0, beta = 0.0;

    double t0 = MPI_Wtime();
    char NEIN = 'N';    // 'N' means no transpose
    blas_idx_t ia = 1, ja = 1, ib = 1, jb = 1, ic = 1, jc = 1;

	// sub(C) = alpha*op(sub(A))*op(sub(B)) + beta*sub(C)
    pdgemm_ (NEIN, NEIN, m_global, n_global, k_global, 
        alpha, 
        a->local_data(), ia, ja, a->descriptor(), 
        b->local_data(), ib, jb, b->descriptor(),
        beta,
        c->local_data(), ic, jc, c->descriptor()  
		);
    
    double t1 = MPI_Wtime() - t0;
    double t_glob;
    // Take the maximum elapsed time over all processes
    MPI_Reduce(&t1, &t_glob, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);


    if (grid->iam() == 0) // 进程号为0的进程
    { 
        double gflops = gemm_flops(m_global, n_global, k_global)/t_glob/grid->nprocs();

        printf("\n"
            "MATRIX MULTIPLY BENCHMARK SUMMARY\n"
            "=================================\n"
            "M = %d\tN = %d\tK = %d\tNP = %d\tNP_ROW = %d\tNP_COL = %d\n"
            "Time for PxGEMM = %10.7f seconds\tGFlops/Proc = %10.7f\n", 
            m_global, n_global, k_global, grid->nprocs(), grid->nprows(), grid->npcols(),
            t_glob, gflops); fflush(stdout);

		for (int i = 0; i < 10; i++)
		{
			for (int j = 0; j < 10; j++)
			{
				printf("%f ", c->local_data()[i*k_global + j]);
			}
			printf("\n");
		}
		fflush(stdout);
    }
}
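The `gemm_flops` helper used in this driver is also external to the listing; a minimal sketch, assuming the standard 2·m·n·k flop count for a general matrix multiply:

// Hypothetical helper (not shown in the listing above): approximate operation
// count for C = A*B with A (m x k) and B (k x n), in units of 1e9 flops.
static double gemm_flops(blas_idx_t m, blas_idx_t n, blas_idx_t k)
{
    return 2.0 * static_cast<double>(m) * static_cast<double>(n)
               * static_cast<double>(k) * 1e-9;
}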