コード例 #1
0
ファイル: QMP_perf.c プロジェクト: 6twirl9/qmp
/**
 * Simultaneous send/receive timing test.
 *
 * For each of pargv->loops rounds, every channel's receive handle is
 * started, then every send handle, and then all sends and all receives are
 * waited on.  The whole run is bracketed by get_current_time() calls (the
 * original comments treat that value as milliseconds).  Node 0 prints the
 * resulting bandwidth and the time per round.
 *
 * smem and rmem are not used by this function.
 */
static void
test_simultaneous_send (int** smem,
                        int** rmem,
                        QMP_msghandle_t* sendh,
                        QMP_msghandle_t* recvh,
                        struct perf_argv* pargv)
{
  const int channels = pargv->num_channels;
  const int rounds = pargv->loops;
  const int words = pargv->size;
  const int dims = pargv->ndims;
  double t_begin, t_end, elapsed, mbps;
  int round, ch;
  QMP_status_t rc;

  /* Line all nodes up before the clock starts. */
  QMP_barrier();
  t_begin = get_current_time ();

  round = 0;
  while (round < rounds) {
    /* Post every receive first ... */
    for (ch = 0; ch < channels; ch++) {
      rc = QMP_start (recvh[ch]);
      if (rc != QMP_SUCCESS) {
        QMP_printf ("Start receiving failed: %s\n", QMP_error_string(rc));
      }
    }

    /* ... then kick off every send. */
    for (ch = 0; ch < channels; ch++) {
      rc = QMP_start (sendh[ch]);
      if (rc != QMP_SUCCESS) {
        QMP_printf ("Start sending failed: %s\n", QMP_error_string(rc));
      }
    }

    /* Complete the sends, then the receives. */
    for (ch = 0; ch < channels; ch++) {
      if (QMP_wait (sendh[ch]) != QMP_SUCCESS) {
        QMP_printf ("Error in sending %d\n", ch );
      }
    }

    for (ch = 0; ch < channels; ch++) {
      if (QMP_wait (recvh[ch]) != QMP_SUCCESS) {
        QMP_printf ("Error in receiving %d\n", ch);
      }
    }

    round++;
  }
  t_end = get_current_time ();

  /* Only node 0 reports. */
  if (QMP_get_node_number() == 0) {
    elapsed = t_end - t_begin; /* in milli seconds */

    /* 4 bytes per word; counted over rounds, channels and dimensions. */
    mbps = words/(double)1000.0 * 4.0 * rounds * channels*dims/elapsed;

    QMP_printf ("Simultaneous send B/W for datasize %d is %g (MB/s)", words * 4, mbps);

    QMP_printf ("Time difference is %lf micro seconds", elapsed*1000.0/rounds);
  }
  QMP_barrier();
}
コード例 #2
0
ファイル: sysfunc.C プロジェクト: DeanHowarth/QUDA-CPS
unsigned int sync() {
QMP_status_t sync_status = QMP_barrier(); 
if (sync_status != QMP_SUCCESS) {
      QMP_error("Error in QMP sync:%s\n", QMP_error_string(sync_status));
      return 0;
}
return 1;
}
コード例 #3
0
ファイル: wilson_test.c プロジェクト: usqcd-software/qopqdp
/**
 * Benchmark QOP_wilson_invert.
 *
 * The inverter is run nit+1 times between QMP barriers; the first run
 * (i == 0) is printed but left out of the accumulated totals.  The totals
 * are summed over nodes with QMP_sum_double and the per-run, per-node
 * averages are written back to res_arg->final_iter, info->final_sec and
 * info->final_flop.
 *
 * Uses the names nit, flw and kappa, which are defined outside this
 * function.  The squared norm of the first result is kept in a static and
 * later calls print a message if their norm differs from it by more than
 * a relative 1e-3.
 *
 * @param info     receives per-run timing/flop data and the final averages
 * @param inv_arg  inverter arguments, passed through to QOP_wilson_invert
 * @param res_arg  residual arguments; final_iter is overwritten with the
 *                 average iteration count
 * @param out      solution field, zeroed before the first run
 * @param in       source field
 * @return averaged final_flop / (1e6 * final_sec); 0 if nit is not positive
 */
double
bench_inv(QOP_info_t *info, QOP_invert_arg_t *inv_arg,
	  QOP_resid_arg_t *res_arg, QDP_DiracFermion *out,
	  QDP_DiracFermion *in)
{
  static QLA_Real r2s=-1, r2;
  double sec=0, flop=0, mf=0;
  int i, iter=0;
  QOP_DiracFermion *qopout, *qopin;

  QDP_D_eq_zero(out, QDP_all);
  qopout = QOP_create_D_from_qdp(out);
  qopin = QOP_create_D_from_qdp(in);
  for(i=0; i<=nit; i++) {
    QMP_barrier();
    QOP_wilson_invert(info, flw, inv_arg, res_arg, kappa, qopout, qopin);
    QMP_barrier();
    /* Print the flop count as a floating-point value: converting it to int
       is undefined once it no longer fits in an int. */
    printf("%i\t%i\t%g\t%.0f\n", i, res_arg->final_iter, info->final_sec, (double)info->final_flop);
    if(i>0) {
      /* run 0 is excluded from the averages */
      iter += res_arg->final_iter;
      sec += info->final_sec;
      flop += info->final_flop;
    }
  }
  QOP_destroy_D(qopout);
  QOP_destroy_D(qopin);
  QDP_r_eq_norm2_D(&r2, out, QDP_even);
  if(r2s<0) r2s = r2;
  if(fabs(1-r2/r2s)>1e-3) {
    printf0("first norm = %g  this norm = %g\n", r2s, r2);
  }
  /* Summing 1 over all nodes leaves the node count in mf. */
  mf = 1;
  QMP_sum_double(&mf);
  QMP_sum_double(&sec);
  QMP_sum_double(&flop);
  if(nit <= 0) {
    /* Nothing was accumulated; avoid the integer division by zero. */
    res_arg->final_iter = 0;
    info->final_sec = 0;
    info->final_flop = 0;
    return 0;
  }
  res_arg->final_iter = iter/nit;
  info->final_sec = sec/(mf*nit);
  info->final_flop = flop/(mf*nit);
  mf = info->final_flop/(1e6*info->final_sec);
  return mf;
}
コード例 #4
0
ファイル: comm_qmp.cpp プロジェクト: ckallidonis/quda
// Invoke the QMP barrier and hand its returned status to the QMP_CHECK macro
// (defined elsewhere; how it reacts to a failure is not visible here).
void comm_barrier(void)
{
  QMP_CHECK( QMP_barrier() );  
}
コード例 #5
0
ファイル: testLittleDiracOp.C プロジェクト: paboyle/BFM
// Exercise BfmLittleDiracOperator on a hot-start gauge field.
//
// Steps, in order:
//   1. set up DWF parameters, a bfm_qdp<double> instance and a gauge field;
//   2. build a one-vector subspace from a gaussian source;
//   3. project/promote a vector and print the norm of the difference;
//   4. compute the little Dirac matrix and compare phi^dag Mdag M phi
//      against project/apply/promote;
//   5. compare P Mdag M P against the little operator;
//   6. apply the little-operator inverse and print the A^-1 A difference.
//
// Only solver == DWF is supported; for anything else a message is printed
// and the process exits with status 0.
// Fermion fields obtained from allocFermion() are not released here.
void test_solver(BfmSolver solver)
{

  g5dParams parms;

  int Ls=16;
  double M5=1.8;
  double mq=0.0001;
  double wilson_lo = 0.05;
  double wilson_hi = 6.8;
  double shamir_lo = 0.025;
  double shamir_hi = 1.7;
  double ht_scale=1.7;
  double hw_scale=1.0;

  if ( solver != DWF ) {
    // Print the reason before exiting; the message used to sit after
    // exit(0) and could never be reached.
    Printf("Should be testing HtCayleyTanh aka DWF\n");
    exit(0);
  }
  parms.pDWF(mq,M5,Ls);

  multi1d<LatticeColorMatrix> u(4);
  HotSt(u);
  //  ArchivGauge_t Header ; readArchiv(Header,u,"ckpoint_lat.3000");

  multi1d<LatticeFermion> src(Ls);

  /* Rudy calculate some eigenvectors */

  BfmWrapperParams BWP;
  BWP.BfmInverter = BfmInv_CG;
  BWP.BfmMatrix   = BfmMat_M;
  BWP.BfmPrecision= Bfm64bit;
  BWP.MaxIter     = 10000;
  BWP.RsdTarget.resize(1);
  BWP.RsdTarget[0]= 1.0e-9;
  BWP.Delta = 1.0e-4;
  BWP.BAP = parms;
  BfmWrapper bfm(BWP);

  bfmarg bfma;
#if defined(QDP_USE_OMP_THREADS)
  bfma.Threads(omp_get_max_threads());
#else
  bfma.Threads(16);
#endif
  bfma.Verbose(0);

  //Physics parameters
  bfmActionParams *bfmap = (bfmActionParams *) &bfma;
  *bfmap = bfm.invParam.BAP;

  // Algorithm & code control
  bfma.time_report_iter=-100;
  bfma.max_iter     = bfm.invParam.MaxIter;
  bfma.residual     = toDouble(bfm.invParam.RsdTarget[0]);

  int lx = QDP::Layout::subgridLattSize()[0];
  int ly = QDP::Layout::subgridLattSize()[1];
  int lz = QDP::Layout::subgridLattSize()[2];
  int lt = QDP::Layout::subgridLattSize()[3];
  //Geometry
  bfma.node_latt[0] = lx;
  bfma.node_latt[1] = ly;
  bfma.node_latt[2] = lz;
  bfma.node_latt[3] = lt;

  // A direction is flagged local_comm only when it is not split across nodes.
  multi1d<int> procs = QDP::Layout::logicalSize();
  for(int mu=0;mu<4;mu++){
    if (procs[mu]>1) bfma.local_comm[mu] = 0;
    else             bfma.local_comm[mu] = 1;
  }

  // Bfm object
  bfm_qdp<double> bfm_eig;
  bfm_eig.init(bfma);

  //Gauge field import
  bfm_eig.importGauge(u);

  //Subspace
#define NumberGaussian (1)
  Fermion_t subspace[NumberGaussian];
  Fermion_t check;
  Fermion_t mp;
  Fermion_t mmp;
  Fermion_t tmp_t;
  check = bfm_eig.allocFermion();
     mp = bfm_eig.allocFermion();
    mmp = bfm_eig.allocFermion();
  tmp_t = bfm_eig.allocFermion();
  // NOTE(review): src has not been filled yet at this point; check is
  // imported again from the gaussian source inside the loop below (g==0).
  bfm_eig.importFermion(src,check,1);

  QDPIO::cout << "Ls = "<<Ls<<endl;
  for(int g=0;g<NumberGaussian;g++){
    for(int s=0;s<Ls;s++){
      gaussian(src[s]);
    }
    subspace[g]=bfm_eig.allocFermion();
    bfm_eig.importFermion(src,subspace[g],1); // Half parity gaussian
    if ( g==0) {
      bfm_eig.importFermion(src,check,1);
    }
    // Zero src, then export the subspace vector back to print its norm.
    for(int s=0;s<Ls;s++){
      src[s]=zero;
    }
    bfm_eig.exportFermion(src,subspace[g],1);
    QDPIO::cout << "Subspace norm " << norm2(src)<<endl;
  }
  for(int s=0;s<Ls;s++){
    gaussian(src[s]);
  }
  QDPIO::cout << "Got here " << endl;

  //  Handle< LinearOperatorArray<T> > linop =GetLinOp(u, parms);
  int block[5];
  for(int i=0;i<5;i++) block[i]=4;

  QDPIO::cout << "Initialised dirac op"<<endl;
  BfmLittleDiracOperator ldop(Ls,NumberGaussian,block,subspace,&bfm_eig);

  int ns = ldop.SubspaceDimension();
  QDPIO::cout << "subspace dimension is "<< ns<<endl;
  ns = ldop.SubspaceLocalDimension();
  QDPIO::cout << "subspace dimension per node is "<< ns<<endl;

  std::vector<std::complex<double> > decomp(ns);
  ldop.ProjectToSubspace(check,decomp);
  if (QMP_is_primary_node()){
    FILE * fp = fopen("coeff.dat","w");
    if (fp == NULL) {
      // Do not hand a null stream to fprintf/fclose; the coefficients are
      // still printed through QDPIO::cout below.
      fprintf(stderr,"test_solver: cannot open coeff.dat for writing\n");
    } else {
      for(int s=0;s<ns;s++){
        fprintf(fp,"coeff %d %le %le\n",s,real(decomp[s]),imag(decomp[s]));
      }
      fclose(fp);
    }
  }
  for(int s=0;s<ns;s++){
    QDPIO::cout << "coeff "<<s<<" " << real(decomp[s]) << " " << imag(decomp[s])<<endl;
  }
  ldop.PromoteFromSubspace(decomp,mp);

  // NOTE(review): in this and the following parallel regions every loop
  // iteration writes the same shared scalar (n, n2 or inn). Presumably the
  // bfm threaded routines must be entered by all nthread threads and return
  // the same value to each -- confirm against the bfm documentation.
  double n;
#pragma omp parallel
  {
    omp_set_num_threads(bfm_eig.nthread);
#pragma omp for
    for(int t=0;t<bfm_eig.nthread;t++) {
      bfm_eig.axpy(check,mp,check,-1);
      n = bfm_eig.norm(check);
    }
  }
  QDPIO::cout << "project/promote n2diff "<< n<<endl;
  QMP_barrier();

  QDPIO::cout << "Computing little dirac matrix"<<endl;
  ldop.ComputeLittleMatrixColored();

  QDPIO::cout << "Done"<<endl;

  std::vector<std::complex<double> > Aphi(ns);
  //        phi^dag DdagD phi = |Dphi|^2 with phi a subspace vector
  //        should be equal to Project/Apply/Promote + inner product

#pragma omp parallel
  {
#pragma omp for
    for(int t=0;t<bfm_eig.nthread;t++) {
      bfm_eig.Mprec(subspace[0],mp,tmp_t,0);
    }
  }

  QDPIO::cout << "Applied BFM matrix "<<endl;

  double n2;
#pragma omp parallel
  {
    omp_set_num_threads(bfm_eig.nthread);
#pragma omp for
    for(int t=0;t<bfm_eig.nthread;t++) {
      n2 = bfm_eig.norm(mp);
    }
  }

  QDPIO::cout << "Applied BFM matrix "<<n2<<endl;

  ldop.ProjectToSubspace(subspace[0],decomp);
  QDPIO::cout << "Projected to subspace "<<endl;
  ldop.Apply(decomp,Aphi);
  QDPIO::cout << "Applied A "<<endl;
  ldop.PromoteFromSubspace(Aphi,check);
  QDPIO::cout << "Promoted "<<endl;

  complex<double> inn;
#pragma omp parallel
  {
#pragma omp for
    for(int t=0;t<bfm_eig.nthread;t++) {
      inn = bfm_eig.inner(subspace[0],check);
    }
  }

  // Real and imaginary parts are separated by a space so the two numbers
  // do not run together in the output.
  QDPIO::cout << "phi^dag Ddag D phi check " << n2 << " " <<real(inn) << " " << imag(inn) <<endl;

  std::vector<std::complex<double> > AinvAphi(ns);
  ldop.ProjectToSubspace(subspace[0],decomp);
  ldop.Apply(decomp,Aphi);
  for(int s=0;s<ns;s++){
    QDPIO::cout << "Aphi "<<s<<" " << real(Aphi[s]) <<" " << imag(Aphi[s])<<endl;
  }
  ldop.PromoteFromSubspace(Aphi,check);

  // mmp = Mprec(dag) Mprec subspace[0], then projected and promoted.
#pragma omp parallel
  {
#pragma omp for
    for(int t=0;t<bfm_eig.nthread;t++) {
      bfm_eig.Mprec(subspace[0],mp,tmp_t,0);
      bfm_eig.Mprec(mp,mmp,tmp_t,1);
    }
  }
  ldop.ProjectToSubspace(mmp,decomp);
  ldop.PromoteFromSubspace(decomp,mmp);
#pragma omp parallel
  {
#pragma omp for
    for(int t=0;t<bfm_eig.nthread;t++) {
      bfm_eig.axpy(check,mmp,check,-1.0);
      n2 = bfm_eig.norm(check);
    }
  }
  QDPIO::cout << "PMdagMP check n2diff "<< n2<<endl;


  QMP_barrier();
  QDPIO::cout << "Applying inverse"<<endl;
  ldop.ApplyInverse(Aphi,AinvAphi);
  QMP_barrier();
  for(int s=0;s<ns;s++){
    QDPIO::cout << "AinvAphi "<<s<<" " << real(AinvAphi[s]) << " " << imag(AinvAphi[s])<<endl;
  }
  ldop.PromoteFromSubspace(AinvAphi,check);

#pragma omp parallel
  {
#pragma omp for
    for(int t=0;t<bfm_eig.nthread;t++) {
      bfm_eig.axpy(check,subspace[0],check,-1.0);
      n2 = bfm_eig.norm(check);
    }
  }
  QDPIO::cout << "AinvA check n2diff "<< n2<<endl;


}
コード例 #6
0
ファイル: QMP_perf.c プロジェクト: 6twirl9/qmp
/**
 * Entry point of the QMP point-to-point performance test.
 *
 * Initialises QMP, parses the command line into a perf_argv, builds a
 * logical topology from the (power-of-two) node count, broadcasts the
 * parameters, and then runs whichever of the simultaneous, ping-pong and
 * one-way tests are selected in pargv.option.  Each selected test is run
 * for message sizes minsize, minsize*facsize, ... up to maxsize.
 *
 * @return 0 on success, -1 if QMP initialisation or the topology
 *         declaration fails (other failures exit or abort).
 */
int
main (int argc, char** argv)
{
  int             i, nc;
  QMP_status_t      status;
  int       **smem, **rmem;
  QMP_msgmem_t    *recvmem;
  QMP_msghandle_t *recvh;
  QMP_msgmem_t    *sendmem;
  QMP_msghandle_t *sendh;
  struct perf_argv pargv;
  QMP_thread_level_t req, prv;

  /** 
   * Simple point to point topology 
   */
  int dims[4] = {2,2,2,2};
  int ndims = 1;

  req = QMP_THREAD_SINGLE;
  status = QMP_init_msg_passing (&argc, &argv, req, &prv);
  if (status != QMP_SUCCESS) {
    fprintf (stderr, "QMP_init failed\n");
    return -1;
  }
  if(QMP_get_node_number()==0) {
    printf("finished init\n");
  }
  fflush(stdout);

  if (parse_options (argc, argv, &pargv) == -1) {
    if(QMP_get_node_number()==0) {
      usage (argv[0]);
    }
    exit (1);
  }

  {
    /* Factor the node count into powers of two: the first maxdims factors
       each open a dimension of extent 2, further factors double the
       existing extents round-robin. */
    int maxdims = 4;
    int k=0;
    int nodes = QMP_get_number_of_nodes();
    ndims = 0;
    while( (nodes&1) == 0 ) {
      if(ndims<maxdims) ndims++;
      else {
        dims[k] *= 2;
        k++;
        if(k>=maxdims) k = 0;
      }
      nodes /= 2;
    }
    if(nodes != 1) {
      QMP_error("invalid number of nodes %i", QMP_get_number_of_nodes());
      QMP_error(" must power of 2");
      QMP_abort(1);
    }
    pargv.ndims = ndims;
  }

  status = QMP_declare_logical_topology (dims, ndims);
  if (status != QMP_SUCCESS) {
    fprintf (stderr, "Cannot declare logical grid\n");
    return -1;
  }

  /* do a broadcast of parameter */
  if (QMP_broadcast (&pargv, sizeof (pargv)) != QMP_SUCCESS) {
    QMP_printf ("Broadcast parameter failed\n");
    exit (1);
  }

  {
    /* Nodes whose logical coordinates sum to an even number are senders. */
    int k=1;
    const int *lc = QMP_get_logical_coordinates();
    for(i=0; i<ndims; i++) k += lc[i];
    pargv.sender = k&1;
  }

  QMP_printf("%s options: num_channels[%d] verify[%d] option[%d] datasize[%d] numloops[%d] sender[%d] strided_send[%i] strided_recv[%i] strided_array_send[%i] ",
             argv[0], pargv.num_channels, pargv.verify, 
             pargv.option, pargv.size, pargv.loops, pargv.sender,
             strided_send, strided_recv, strided_array_send);
  fflush(stdout);

  /* The sweeps below advance with i *= facsize starting from minsize; with
     facsize < 2 or minsize < 1 the size never grows and the loop would
     never end, so reject that combination before starting. */
  if ((pargv.option & (TEST_SIMUL | TEST_PINGPONG | TEST_ONEWAY)) &&
      pargv.minsize <= pargv.maxsize &&
      (pargv.minsize < 1 || pargv.facsize < 2)) {
    QMP_error("size sweep cannot terminate: minsize %d facsize %d",
              (int)pargv.minsize, (int)pargv.facsize);
    QMP_abort(1);
  }

  /**
   * Create memory
   */
  nc = pargv.num_channels;
  smem = malloc(nc * sizeof *smem);
  rmem = malloc(nc * sizeof *rmem);
  sendmem = malloc(ndims * nc * sizeof *sendmem);
  recvmem = malloc(ndims * nc * sizeof *recvmem);
  sendh = malloc(nc * sizeof *sendh);
  recvh = malloc(nc * sizeof *recvh);

  /* A zero-sized request may legitimately return NULL, so only treat NULL
     as failure when a non-empty table was requested. */
  if ((nc != 0 && (!smem || !rmem || !sendh || !recvh)) ||
      (ndims * nc != 0 && (!sendmem || !recvmem))) {
    QMP_error("cannot allocate message tables");
    QMP_abort(1);
  }

  QMP_barrier();
  if(QMP_get_node_number()==0) {
    printf("\n");
  }
  fflush(stdout);

  if(pargv.option & TEST_SIMUL) {
    /* Run with only this test's bit set, then restore the full option. */
    int opts = pargv.option;
    pargv.option = TEST_SIMUL;
    if(QMP_get_node_number()==0) {
      QMP_printf("starting simultaneous sends");
    }
    fflush(stdout);
    for(i=pargv.minsize; i<=pargv.maxsize; i*=pargv.facsize) {
      pargv.size = i;
      create_msgs(smem, rmem, sendmem, recvmem, sendh, recvh, ndims, nc, i, &pargv);
      test_simultaneous_send (smem, rmem, sendh, recvh, &pargv);
      check_mem(rmem, ndims, nc, i);
      free_msgs(smem, rmem, sendmem, recvmem, sendh, recvh, ndims, nc);
    }
    if(QMP_get_node_number()==0) {
      QMP_printf("finished simultaneous sends\n");
    }
    fflush(stdout);
    pargv.option = opts;
  }

  if(pargv.option & TEST_PINGPONG) {
    int opts = pargv.option;
    pargv.option = TEST_PINGPONG;
    if(QMP_get_node_number()==0) {
      QMP_printf("starting ping pong sends");
    }
    fflush(stdout);
    for(i=pargv.minsize; i<=pargv.maxsize; i*=pargv.facsize) {
      pargv.size = i;
      create_msgs(smem, rmem, sendmem, recvmem, sendh, recvh, ndims, nc, i, &pargv);
      if(pargv.verify) {
        test_pingpong_verify(smem, rmem, sendh, recvh, &pargv);
      } else {
        test_pingpong(smem, rmem, sendh, recvh, &pargv);
      }
      check_mem(rmem, ndims, nc, i);
      free_msgs(smem, rmem, sendmem, recvmem, sendh, recvh, ndims, nc);
    }
    if(QMP_get_node_number()==0) {
      QMP_printf("finished ping pong sends\n");
    }
    fflush(stdout);
    pargv.option = opts;
  }

  if(pargv.option & TEST_ONEWAY) {
    int opts = pargv.option;
    pargv.option = TEST_ONEWAY;
    if(QMP_get_node_number()==0) {
      QMP_printf("starting one way sends");
    }
    fflush(stdout);
    for(i=pargv.minsize; i<=pargv.maxsize; i*=pargv.facsize) {
      pargv.size = i;
      create_msgs(smem, rmem, sendmem, recvmem, sendh, recvh, ndims, nc, i, &pargv);
      test_oneway (smem, rmem, sendh, recvh, &pargv);
      /* Only the receiving side has received data to check. */
      if(!pargv.sender) check_mem(rmem, ndims, nc, i);
      free_msgs(smem, rmem, sendmem, recvmem, sendh, recvh, ndims, nc);
    }
    if(QMP_get_node_number()==0) {
      QMP_printf("finished one way sends");
    }
    fflush(stdout);
    pargv.option = opts;
  }


  /**
   * Free memory 
   */
  free (smem);
  free (rmem);

  free (sendh);
  free (recvh);

  free (sendmem);
  free (recvmem);

  QMP_finalize_msg_passing ();

  return 0;
}
コード例 #7
0
ファイル: QMP_perf.c プロジェクト: 6twirl9/qmp
/**
 * One-way blast test.
 *
 * A node with pargv->sender set starts and waits on every send handle for
 * pargv->loops rounds; any other node does the same with the receive
 * handles.  Each side times its own loop with get_current_time() (treated
 * as milliseconds by the original comments).  Node 0 prints from the
 * sender branch before that branch's barrier, node 1 prints from the
 * receiver branch after its barrier.  Both branches pass the same number
 * of barriers.
 *
 * smem and rmem are not used by this function.
 */
static void
test_oneway (int** smem,
             int** rmem,
             QMP_msghandle_t* sendh,
             QMP_msghandle_t* recvh,
             struct perf_argv* pargv)
{
  const int channels = pargv->num_channels;
  const int rounds = pargv->loops;
  const int words = pargv->size;
  const QMP_bool_t is_sender = pargv->sender;
  const int dims = pargv->ndims;
  double t_begin, t_end, elapsed, mbps;
  int round, ch;
  QMP_status_t rc;

  QMP_barrier();
  if (!is_sender) {
    /* Receiving side. */
    t_begin = get_current_time ();
    for (round = 0; round < rounds; round++) {
      for (ch = 0; ch < channels; ch++) {
        rc = QMP_start (recvh[ch]);
        if (rc != QMP_SUCCESS) {
          QMP_printf ("Start receiving failed: %s\n", QMP_error_string(rc));
        }
      }

      for (ch = 0; ch < channels; ch++) {
        if (QMP_wait (recvh[ch]) != QMP_SUCCESS) {
          QMP_printf ("Error in receiving %d\n", ch);
        }
      }
    }
    t_end = get_current_time (); /* In milli seconds */

    elapsed = t_end - t_begin; /* elapsed time in milli seconds */

    /* 4 bytes per word. */
    mbps = words/(double)1000.0 * 4 * rounds*channels*dims/elapsed;

    /* The receiver reports after the barrier, from node 1. */
    QMP_barrier();
    if (QMP_get_node_number() == 1) {
      QMP_printf ("Oneway Bandwidth for datasize %d is %g (MB/s)", 
                  words * 4, mbps);
      QMP_printf ("time is %lf micro seconds", elapsed*1000.0/rounds);
      fflush(stdout);
    }
  }
  else {
    /* Sending side. */
    t_begin = get_current_time ();
    for (round = 0; round < rounds; round++) {
      for (ch = 0; ch < channels; ch++) {
        rc = QMP_start (sendh[ch]);
        if (rc != QMP_SUCCESS) {
          QMP_printf ("Start sending failed: %s\n", QMP_error_string(rc));
        }
      }

      for (ch = 0; ch < channels; ch++) {
        if (QMP_wait (sendh[ch]) != QMP_SUCCESS) {
          QMP_printf ("Error in sending %d\n", ch );
        }
      }
    }
    t_end = get_current_time (); /* In milli seconds */

    elapsed = t_end - t_begin; /* elapsed time in milli seconds */

    /* 4 bytes per word. */
    mbps = words/(double)1000.0 * 4 * rounds*channels*dims/elapsed;

    /* The sender reports before the barrier, from node 0. */
    if (QMP_get_node_number() == 0) {
      QMP_printf ("Oneway Bandwidth for datasize %d is %g (MB/s)", 
                  words * 4, mbps);
      QMP_printf ("time is %lf micro seconds", elapsed*1000.0/rounds);
      fflush(stdout);
    }
    QMP_barrier();
  }
  QMP_barrier();
}
コード例 #8
0
ファイル: QMP_perf.c プロジェクト: 6twirl9/qmp
/**
 * Ping-pong test with verification of the received data.
 *
 * In every round each channel's receive buffer is zeroed and its send
 * buffer is filled with the pattern  round + word*channel + nc*nc.  A node
 * with pargv->sender set sends on all channels and then receives; any
 * other node receives first and then sends.  Afterwards the receive
 * buffers are compared against the same pattern and every mismatch is
 * printed.  The buffer fill and the verification are inside the timed
 * region.  Every node prints the bandwidth and RTT/2 figures.
 */
static void
test_pingpong_verify (int** smem,
                      int** rmem,
                      QMP_msghandle_t* sendh,
                      QMP_msghandle_t* recvh,
                      struct perf_argv* pargv)
{
  const int channels = pargv->num_channels;
  const int rounds = pargv->loops;
  const int words = pargv->size;
  const QMP_bool_t is_sender = pargv->sender;
  const int dims = pargv->ndims;
  double t_begin, t_end, elapsed, mbps;
  int round, ch, w;
  QMP_status_t rc;

  QMP_barrier();
  t_begin = get_current_time ();

  for (round = 0; round < rounds; round++) {
    /* Clear the receive buffers and write this round's pattern. */
    for (ch = 0; ch < channels; ch++) {
      for (w = 0; w < words; w++) {
        rmem[ch][w] = 0;
        smem[ch][w] = round + w * ch + channels*channels;
      }
    }

    if (!is_sender) {
      /* Pong side: receive everything, then send. */
      for (ch = 0; ch < channels; ch++) {
        rc = QMP_start (recvh[ch]);
        if (rc != QMP_SUCCESS) {
          QMP_printf ("Start receiving failed: %s\n", QMP_error_string(rc));
        }
      }

      for (ch = 0; ch < channels; ch++) {
        if (QMP_wait (recvh[ch]) != QMP_SUCCESS) {
          QMP_printf ("Error in receiving %d\n", ch);
        }
      }

      for (ch = 0; ch < channels; ch++) {
        rc = QMP_start (sendh[ch]);
        if (rc != QMP_SUCCESS) {
          QMP_printf ("Start sending failed: %s\n", QMP_error_string(rc));
        }
      }

      for (ch = 0; ch < channels; ch++) {
        if (QMP_wait (sendh[ch]) != QMP_SUCCESS) {
          QMP_printf ("Error in sending %d\n", ch );
        }
      }
    }
    else {
      /* Ping side: send everything, then receive. */
      for (ch = 0; ch < channels; ch++) {
        rc = QMP_start (sendh[ch]);
        if (rc != QMP_SUCCESS) {
          QMP_printf ("Start sending failed: %s\n", QMP_error_string(rc));
        }
      }

      for (ch = 0; ch < channels; ch++) {
        if (QMP_wait (sendh[ch]) != QMP_SUCCESS) {
          QMP_printf ("Error in sending %d\n", ch );
        }
      }

      for (ch = 0; ch < channels; ch++) {
        rc = QMP_start (recvh[ch]);
        if (rc != QMP_SUCCESS) {
          QMP_printf ("Start receiving failed: %s\n", QMP_error_string(rc));
        }
      }

      for (ch = 0; ch < channels; ch++) {
        if (QMP_wait (recvh[ch]) != QMP_SUCCESS) {
          QMP_printf ("Error in receiving %d\n", ch);
        }
      }
    }

    /* verify memory */
    for (ch = 0; ch < channels; ch++) {
      for (w = 0; w < words; w++) {
        if (rmem[ch][w] != round + w * ch + channels*channels) {
          QMP_printf ("Receiving memory error for memory %d %d %d\n", 
                      ch, w, rmem[ch][w]);
        }
      }
    }
  }
  t_end = get_current_time (); /* In milli seconds */

  elapsed = t_end - t_begin; /* elapsed time in milli seconds */

  /* Factor 2: data travels in both directions; 4 bytes per word. */
  mbps = 2 * words/(double)1000.0 * 4 * rounds*channels*dims/elapsed;

  QMP_printf ("Ping Pong Bandwidth for datasize %d is %g (MB/s)", 
              words * 4, mbps);

  QMP_printf ("RTT/2 is %lf micro seconds", elapsed*1000.0/rounds/2);
}