Beispiel #1
0
  /**
   * Main chare constructor for the ping benchmark.
   *
   * Optional command-line arguments, in order:
   *   argv[1] totalPayload, argv[2] PEsPerNode, argv[3] CharesPerPE,
   *   argv[4] iterations (defaults to NITER).
   * The per-chare payload is totalPayload / PEsPerNode / CharesPerPE.
   *
   * Creates the Ping1 array with CkNumNodes() * PEsPerNode * CharesPerPE
   * elements, prints the TopoManager coordinates of every
   * CmiMyNodeSize()-th PE, creates the TraceControl object and calls
   * start(nodeIndex) on the first PEsPerNode * CharesPerPE array elements.
   *
   * @param m  argument message; deleted before the constructor returns.
   */
  main(CkArgMsg* m)
  {
      if(m->argc < 5)
          CkPrintf("Usage: payload PEs CharesPerPE iteration\n");

      niter = 0;
      iterations=NITER;
      payload=PAYLOAD;
      if(m->argc>1)
          totalPayload=atoi(m->argv[1]);
      if(m->argc>2)
          PEsPerNode = atoi(m->argv[2]);
      if(m->argc>3)
          CharesPerPE = atoi(m->argv[3]);
      if(m->argc>4)
          iterations=atoi(m->argv[4]);

      // atoi() returns 0 for non-numeric input, and both values are used as
      // divisors and as array-size factors below, so reject anything that is
      // not strictly positive instead of dividing by zero.
      if(PEsPerNode <= 0 || CharesPerPE <= 0)
      {
          CkPrintf("Error: PEs (%d) and CharesPerPE (%d) must both be positive\n",
                   PEsPerNode, CharesPerPE);
          delete m;
          CkExit();
          return;
      }

      payload = totalPayload/PEsPerNode/CharesPerPE;
      mainProxy = thishandle;
      arr1 = CProxy_Ping1::ckNew(CkNumNodes()* PEsPerNode * CharesPerPE );
      start_time = CkWallTimer();
      nodeIndex = 1;

      // Print the coordinates of one PE per stride of CmiMyNodeSize().
      int x,y,z,t;
      TopoManager tmgr;
      for(int i=0; i<CmiNumPes(); i+=CmiMyNodeSize())
      {
          tmgr.rankToCoordinates(i, x,y, z, t);
          CkPrintf(" %d  [%d:%d:%d:%d]\n", i, x, y, z, t);
      }
      CkPrintf("NodeIndex Chares       Workers        NoOfMsgs        Bytes           Total           Time(us)\n");
      _traceControl = CProxy_TraceControl::ckNew();

      // Start the first PEsPerNode * CharesPerPE elements of the array.
      for(int i=0; i<PEsPerNode * CharesPerPE; i++)
          arr1[i].start(nodeIndex);
      delete m;
  };
Beispiel #2
0
    /**
     * Main chare constructor for the TopoManager test.
     *
     * Prints the torus dimensions reported by TopoManager, then for every PE
     * the coordinates TopoManager maps it to. On platforms selected by the
     * CMK_BLUEGENEL / CMK_BLUEGENEP / XT3_TOPOLOGY / XT4_TOPOLOGY /
     * XT5_TOPOLOGY macros it also prints the coordinates obtained from the
     * platform-specific call so the two can be compared. Finally prints a
     * "contiguity metric" and exits.
     *
     * @param m  argument message; not inspected, deleted before exit.
     */
    Main(CkArgMsg* m) {
#if CMK_BLUEGENEL
      BGLPersonality bgl_p;
      int i = rts_get_personality(&bgl_p, sizeof(BGLPersonality));
#elif CMK_BLUEGENEP
      DCMF_Hardware_t bgp_hwt;
      DCMF_Hardware(&bgp_hwt);
#elif XT3_TOPOLOGY
      XT3TorusManager xt3tm;
#elif XT4_TOPOLOGY || XT5_TOPOLOGY
      XTTorusManager xttm;
#endif

      mainProxy = thishandle;
      CkPrintf("Testing TopoManager .... \n");
      TopoManager tmgr;
      CkPrintf("Torus Size [%d] [%d] [%d] [%d]\n", tmgr.getDimNX(), tmgr.getDimNY(), tmgr.getDimNZ(), tmgr.getDimNT());

#if CMK_BLUEGENEP
      CkPrintf("Torus Size [%d] [%d] [%d] [%d]\n", bgp_hwt.xSize, bgp_hwt.ySize, bgp_hwt.zSize, bgp_hwt.tSize);
#endif
      int x, y, z, t;

      for(int i=0; i<CkNumPes(); i++) {
        tmgr.rankToCoordinates(i, x, y, z, t);
        CkPrintf("---- Processor %d ---> x %d y %d z %d t %d\n", i, x, y, z, t);
#if CMK_BLUEGENEL
        unsigned int tmp_t, tmp_x, tmp_y, tmp_z;
        rts_coordinatesForRank(i, &tmp_x, &tmp_y, &tmp_z, &tmp_t);
        CkPrintf("Real Processor %d ---> x %d y %d z %d t %d\n", i, tmp_x, tmp_y, tmp_z, tmp_t);
#elif CMK_BLUEGENEP
        unsigned int tmp_t, tmp_x, tmp_y, tmp_z;
    #if (DCMF_VERSION_MAJOR >= 3)
        DCMF_NetworkCoord_t nc;
        DCMF_Messager_rank2network(i, DCMF_DEFAULT_NETWORK, &nc);
        tmp_x = nc.torus.x;
        tmp_y = nc.torus.y;
        tmp_z = nc.torus.z;
        tmp_t = nc.torus.t;
    #else
        // Look up the rank being iterated; the previous argument `c` is not
        // declared anywhere in this constructor.
        DCMF_Messager_rank2torus(i, &tmp_x, &tmp_y, &tmp_z, &tmp_t);
    #endif
        CkPrintf("Real Processor %d ---> x %d y %d z %d t %d\n", i, tmp_x, tmp_y, tmp_z, tmp_t);
#elif XT3_TOPOLOGY
        int tmp_t, tmp_x, tmp_y, tmp_z;
        xt3tm.realRankToCoordinates(i, tmp_x, tmp_y, tmp_z, tmp_t);
        CkPrintf("Real Processor %d ---> x %d y %d z %d t %d\n", i, tmp_x, tmp_y, tmp_z, tmp_t);
#elif XT4_TOPOLOGY || XT5_TOPOLOGY
        int tmp_t, tmp_x, tmp_y, tmp_z;
        xttm.realRankToCoordinates(i, tmp_x, tmp_y, tmp_z, tmp_t);
        CkPrintf("Real Processor %d ---> x %d y %d z %d t %d\n", i, tmp_x, tmp_y, tmp_z, tmp_t);
#endif
      } // end of for loop

      // Ratio of PEs to torus slots (X*Y*Z*T) as reported by TopoManager.
      int size = tmgr.getDimNX() * tmgr.getDimNY() * tmgr.getDimNZ();
      CkPrintf("Torus Contiguity Metric %d : %d [%f] \n", size, CkNumPes()/tmgr.getDimNT(), (float)(CkNumPes())/(tmgr.getDimNT()*size) );

      // Release the argument message before shutting down.
      delete m;
      CkExit();
    };
Beispiel #3
0
int main(int argc, char *argv[]) {
  int numprocs, myrank;
  MPI_Init(&argc, &argv);
  MPI_Comm_size(MPI_COMM_WORLD, &numprocs);
  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
  double sendTime, recTime, min, avg, max;
  double time[3] = {0.0, 0.0, 0.0};
  int msg_size;
  MPI_Status mstat;
  int i=0,j, pe, pe1, pe2, trial, hops;
  char name[30];
  char locname[30];
  char blockname[50];
  double newTime, oldTime;
  double storeTime[NUM_MSGS];
  double recvTime[NUM_MSGS];
  double storeBw[NUM_MSGS];
  char *send_buf = (char *)malloc(MAX_MSG_SIZE);
  char *recv_buf = (char *)malloc(MAX_MSG_SIZE);
  FILE *locf;
  for(i = 0; i < MAX_MSG_SIZE; i++) {
    recv_buf[i] = send_buf[i] = (char) (i & 0xff);
  }

  // allocate the routing map.
  int *rmap = (int *) malloc(sizeof(int) * numprocs);
  int *smap = (int *) malloc(sizeof(int) * numprocs);
  
  TopoManager *tmgr;
  int dimNZ, numRG, x, y, z, t, bcastSend[3], bcastRecv[3];

  if(myrank == 0) {
    tmgr = new TopoManager();
#if CREATE_JOBS
    numRG = tmgr->getDimNX() * (tmgr->getDimNY() - 2) * 2 * tmgr->getDimNT();
#else
    numRG = tmgr->getDimNX() * tmgr->getDimNY() * 2;
#endif
    dimNZ = tmgr->getDimNZ();
    for (int i=1; i<numprocs; i++) {
      bcastSend[0] = dimNZ;
      bcastSend[1] = numRG;
      tmgr->rankToCoordinates(i, x, y, z, t);
      bcastSend[2] = z;
      MPI_Send(bcastSend, 3, MPI_INT, i, 1, MPI_COMM_WORLD);
    }
    tmgr->rankToCoordinates(0, x, y, z, t);
  } else {
      MPI_Recv(bcastRecv, 3, MPI_INT, 0, 1, MPI_COMM_WORLD, &mstat);
      dimNZ = bcastRecv[0];
      numRG = bcastRecv[1];
      z = bcastRecv[2];
  }
  MPI_Barrier(MPI_COMM_WORLD);
  if (myrank == 0) {
    printf("Torus Dimensions %d %d %d %d\n", tmgr->getDimNX(), tmgr->getDimNY(), dimNZ, tmgr->getDimNT());
  }
#if USE_HPM
    HPM_Init();
#endif
  for (hops=0; hops < 1; hops++) {
    // To print the recv times for certain ranks
    int *pmap = (int *) malloc(sizeof(int) * numprocs);
    if (myrank == 0) {
      // Rank 0 makes up a routing map.
      build_process_map(numprocs, smap, rmap, pmap, 2);
    }
    // Broadcast the routing map.
    MPI_Bcast(smap, numprocs, MPI_INT, 0, MPI_COMM_WORLD);
    MPI_Bcast(rmap, numprocs, MPI_INT, 0, MPI_COMM_WORLD);
    MPI_Bcast(pmap, numprocs, MPI_INT, 0, MPI_COMM_WORLD);
    sprintf(blockname, "Block_%d.hpm",hops);
    if (myrank == 0) {
       printf( " Broadcasted the map \n");
    }
#if USE_HPM
    HPM_Start(blockname);
#endif
#if CREATE_JOBS
    sprintf(name, "xt4_job_%d_%d.dat", numprocs, hops);
#else
    sprintf(name, "bgp_line_%d_%d.dat", numprocs, hops);
#endif
	if(pmap[myrank]>0)
	{
	sprintf(locname, "bgp_print_%d.dat", myrank);
	locf = fopen(locname, "a");
   	}
    for (msg_size=MIN_MSG_SIZE; msg_size<=MAX_MSG_SIZE; msg_size=(msg_size<<1)) {
      for (trial=0; trial<1; trial++) {
	 if (myrank == 0) {
	     printf( " Going to begin the trial \n");
	  }
	pe1 = smap[myrank]; // Am I a sender?
	pe2 = rmap[myrank]; // Am I a reciever? 
	MPI_Barrier(MPI_COMM_WORLD);
	// Actual Data Transfer
	if(pe1 != -1) {
	    sendTime = MPI_Wtime();
	    oldTime = sendTime;
	    j=0;
	    for(i=0; i<NUM_MSGS; i++)
	    {
		  storeTime[i] = MPI_Wtime(); // Just before the next send operation
		  MPI_Send(send_buf, msg_size, MPI_CHAR, pe1, 1, MPI_COMM_WORLD);
	    }
	    MPI_Recv(recv_buf, msg_size, MPI_CHAR, pe1, 1, MPI_COMM_WORLD, &mstat);
	    recTime = (MPI_Wtime() - sendTime) / (NUM_MSGS+1);
	    //printf(" My Rank : %d Experiment: %d  MSG_SIZE: %d -- Completed send recv \n", myrank, hops, msg_size);
	  }
	if(pe2 !=1)
	{
	    sendTime = MPI_Wtime();
	    oldTime = sendTime;
	    j=0;
	    for(i=0; i<NUM_MSGS; i++)
	      {
		  MPI_Recv(recv_buf, msg_size, MPI_CHAR, pe2, 1, MPI_COMM_WORLD, &mstat);
		  recvTime[i] = MPI_Wtime(); // Just after the next recv operation
	      }	  
	    MPI_Send(send_buf, msg_size, MPI_CHAR, pe2, 1, MPI_COMM_WORLD);
	    recTime = (MPI_Wtime() - sendTime) / (NUM_MSGS+1);
        }
	// Recv times sent back to the Senders for b/w calculations 
	if(myrank==0)
	{
	  printf(" My Rank : %d Experiment: %d  MSG_SIZE: %d -- Reached barrier in middle \n", myrank, hops, msg_size);
	}
	pe1 = smap[myrank]; // Am I a sender?
	pe2 = rmap[myrank]; // Am I a reciever? 
	MPI_Barrier(MPI_COMM_WORLD);
	if(pe1 != -1) {
	    MPI_Recv(recvTime, NUM_MSGS, MPI_DOUBLE, pe1, 1, MPI_COMM_WORLD, &mstat);
	    if(pmap[myrank]==1)
	    {
	      printf(" My Rank : %d Hops: %d  MSG_SIZE: %d Sender Side Exp trial: %d   Avg recv time %g \n", myrank, hops, msg_size, trial, recTime );
	      //printf(" My Rank : %d Hops: %d  MSG_SIZE: %d Sender Side Exp trial: %d   Recv time %g \n", myrank, hops, msg_size, trial, recvTime );
	      for(i=0;i<NUM_MSGS; i++)
		{
		  storeBw[i]= msg_size/(recvTime[i] - storeTime[i]);
		  fprintf(locf,  "%d   %d   %d   %g   %g   %g   %g \n", hops, myrank, msg_size, 500000*(storeTime[i]+recvTime[i]), storeBw[i],1000000*recvTime[i],1000000*storeTime[i]); 
		}
	    }
	  }
	if(pe2 !=1) 
	    {
	      MPI_Send(recvTime, NUM_MSGS, MPI_DOUBLE, pe2, 1, MPI_COMM_WORLD);
	    }
      } // end for loop of trials
    } // end for loop of msgs
    if(pmap[myrank]>0)
	{
  		fflush(NULL);
		fclose(locf);
	}
    free(pmap);
#if USE_HPM
  HPM_Stop(blockname);
#endif
  } // end for loop of hops
#if USE_HPM
  HPM_Print();
#endif
  if(myrank == 0)
    printf("Program Complete\n");
  MPI_Finalize();
  return 0;
}
Beispiel #4
0
/**
 * Initializes the CPU-topology information held in `cpuTopo` (declared
 * outside this function) and then raises the CcdTOPOLOGY_AVAIL condition.
 *
 * Flags consumed from argv:
 *   +obtain_cpu_topology  turn topology gathering on
 *   +skip_cpu_topology    turn topology gathering off (checked last, so it
 *                         wins if both flags are present)
 *   +show_cpu_topology    have PE 0 call cpuTopo.print() at the end
 *
 * How the PE -> node-ID table is filled is chosen at compile time:
 *   - CMK_BIGSIM_CHARM:              node ID = PE / BgGetNumWorkThread()
 *   - CMK_BLUEGENEL / CMK_BLUEGENEP: via TopoManager (4 coordinates)
 *   - CMK_BLUEGENEQ:                 via TopoManager (6 coordinates)
 *   - CMK_CRAYXT / XE / XC:          via getXTNodeID(), then renumbered
 *   - otherwise:                     each PE contributes its IP address to a
 *                                    CmiReduce and waits until `done`
 *                                    reaches CmiMyNodeSize()
 *
 * NOTE(review): several early-return paths (BigSim branch, the comm-thread
 * return, and the `_noip` return) leave without raising CcdTOPOLOGY_AVAIL,
 * whereas the "skip" path does raise it -- confirm this is intended.
 *
 * @param argv  command-line argument vector passed to CmiGetArgFlagDesc.
 */
extern "C" void LrtsInitCpuTopo(char **argv)
{
  static skt_ip_t myip;
  hostnameMsg  *msg;
  // Only assigned on PE 0 (below) and only read on PE 0 (generic branch).
  double startT;
 
  int obtain_flag = 1;              // default on
  int show_flag = 0;                // default not show topology

  // Only rank 0 creates the lock used by the wait loops further down.
  if (CmiMyRank() ==0) {
     topoLock = CmiCreateLock();
  }

  // Builds with __FAULT__ set default to NOT gathering the topology.
#if __FAULT__
  obtain_flag = 0;
#endif
  if(CmiGetArgFlagDesc(argv,"+obtain_cpu_topology",
					   "obtain cpu topology info"))
    obtain_flag = 1;
  if (CmiGetArgFlagDesc(argv,"+skip_cpu_topology",
                               "skip the processof getting cpu topology info"))
    obtain_flag = 0;
  if(CmiGetArgFlagDesc(argv,"+show_cpu_topology",
					   "Show cpu topology info"))
    show_flag = 1;

  // Register the two message handlers (under BigSim only on node rank 0).
#if CMK_BIGSIM_CHARM
  if (BgNodeRank() == 0)
#endif
  {
  cpuTopoHandlerIdx =
     CmiRegisterHandler((CmiHandler)cpuTopoHandler);
  cpuTopoRecvHandlerIdx =
     CmiRegisterHandler((CmiHandler)cpuTopoRecvHandler);
  }

  // Gathering disabled: sort whatever cpuTopo currently holds, synchronize,
  // notify listeners and leave.
  if (!obtain_flag) {
    if (CmiMyRank() == 0) cpuTopo.sort();
    CmiNodeAllBarrier();
    CcdRaiseCondition(CcdTOPOLOGY_AVAIL);      // call callbacks
    return;
  }

  // PE 0 records the start time for the "gathered in N seconds" message.
  if (CmiMyPe() == 0) {
#if CMK_BIGSIM_CHARM
    if (BgNodeRank() == 0)
#endif
      startT = CmiWallTimer();
  }

#if CMK_BIGSIM_CHARM
  // BigSim: every run of BgGetNumWorkThread() consecutive PEs shares a node.
  if (BgNodeRank() == 0)
  {
    //int numPes = BgNumNodes()*BgGetNumWorkThread();
    int numPes = cpuTopo.numPes = CkNumPes();
    cpuTopo.nodeIDs = new int[numPes];
    CpuTopology::supported = 1;
    int wth = BgGetNumWorkThread();
    for (int i=0; i<numPes; i++) {
      int nid = i / wth;
      cpuTopo.nodeIDs[i] = nid;
    }
    cpuTopo.sort();
  }
  // NOTE(review): returns without raising CcdTOPOLOGY_AVAIL.
  return;
#else

#if CMK_USE_GM
  CmiBarrier();
#endif


  // Disabled code, kept as found.
#if 0
  if (gethostname(hostname, 999)!=0) {
      strcpy(hostname, "");
  }
#endif
#if CMK_BLUEGENEL || CMK_BLUEGENEP
  // Blue Gene/L and /P: a PE's node ID is the rank found at the same
  // (x, y, z) coordinates with the fourth coordinate forced to 0.
  if (CmiMyRank() == 0) {
    TopoManager tmgr;

    int numPes = cpuTopo.numPes = CmiNumPes();
    cpuTopo.nodeIDs = new int[numPes];
    CpuTopology::supported = 1;

    int x, y, z, t, nid;
    for(int i=0; i<numPes; i++) {
      tmgr.rankToCoordinates(i, x, y, z, t);
      nid = tmgr.coordinatesToRank(x, y, z, 0);
      cpuTopo.nodeIDs[i] = nid;
    }
    cpuTopo.sort();
    if (CmiMyPe()==0)  CmiPrintf("Charm++> Running on %d unique compute nodes (%d-way SMP).\n", cpuTopo.numNodes, CmiNumCores());
  }
  CmiNodeAllBarrier();
#elif CMK_BLUEGENEQ
  // Blue Gene/Q: same idea with six coordinates; the last one is forced to 0.
  if (CmiMyRank() == 0) {
   TopoManager tmgr;

    int numPes = cpuTopo.numPes = CmiNumPes();
    cpuTopo.nodeIDs = new int[numPes];
    CpuTopology::supported = 1;

    int a, b, c, d, e, t, nid;
    for(int i=0; i<numPes; i++) {
      tmgr.rankToCoordinates(i, a, b, c, d, e, t);
      nid = tmgr.coordinatesToRank(a, b, c, d, e, 0);
      cpuTopo.nodeIDs[i] = nid;
    }
    cpuTopo.sort();
    if (CmiMyPe()==0)  CmiPrintf("Charm++> Running on %d unique compute nodes (%d-way SMP).\n", cpuTopo.numNodes, CmiNumCores());
  }
  CmiNodeAllBarrier();
#elif CMK_CRAYXT || CMK_CRAYXE || CMK_CRAYXC
  // Cray XT/XE/XC: fetch the node ID of each PE via getXTNodeID(), then
  // renumber the IDs into a dense sequence starting at 0.
  if(CmiMyRank() == 0) {
    int numPes = cpuTopo.numPes = CmiNumPes();
    int numNodes = CmiNumNodes();
    cpuTopo.nodeIDs = new int[numPes];
    CpuTopology::supported = 1;

    int nid;
    for(int i=0; i<numPes; i++) {
      nid = getXTNodeID(CmiNodeOf(i), numNodes);
      cpuTopo.nodeIDs[i] = nid;
    }
    int prev = -1;
    nid = -1;

    // this assumes that all cores on a node have consecutive MPI rank IDs
    // and then changes nodeIDs to 0 to numNodes-1
    for(int i=0; i<numPes; i++) {
      if(cpuTopo.nodeIDs[i] != prev) {
	prev = cpuTopo.nodeIDs[i];
	cpuTopo.nodeIDs[i] = ++nid;
      }
      else
	cpuTopo.nodeIDs[i] = nid;
    }
    cpuTopo.sort();
    if (CmiMyPe()==0)  CmiPrintf("Charm++> Running on %d unique compute nodes (%d-way SMP).\n", cpuTopo.numNodes, CmiNumCores());
  }
  CmiNodeAllBarrier();

#else

  // Generic path: discover nodes by reducing every PE's IP address.
  bool topoInProgress = true;

  // A PE number >= CmiNumPes() is treated as the communication thread: it
  // joins the barrier, optionally drives network progress until `done`
  // reaches CmiMyNodeSize(), and then returns.
  if (CmiMyPe() >= CmiNumPes()) {
    CmiNodeAllBarrier();         // comm thread waiting
#if CMK_MACHINE_PROGRESS_DEFINED
    // `! CMK_CRAYXT` always holds here: this code sits in the #else of the
    // CMK_CRAYXT || CMK_CRAYXE || CMK_CRAYXC test above.
#if ! CMK_CRAYXT
    while (topoInProgress) {
      CmiNetworkProgress();
      CmiLock(topoLock);
      topoInProgress = done < CmiMyNodeSize();
      CmiUnlock(topoLock);
    }
#endif
#endif
    return;    /* comm thread return */
  }

    /* get my ip address */
  if (CmiMyRank() == 0)
  {
  // `!CMK_BLUEGENEQ` always holds here: the CMK_BLUEGENEQ case was handled
  // by an earlier #elif of the enclosing conditional.
  #if CMK_HAS_GETHOSTNAME && !CMK_BLUEGENEQ
    myip = skt_my_ip();        /* not thread safe, so only calls on rank 0 */
    // fprintf(stderr, "[%d] IP is %d.%d.%d.%d\n", CmiMyPe(), myip.data[0],myip.data[1],myip.data[2],myip.data[3]);
  #elif CMK_BPROC
    myip = skt_innode_my_ip();
  #else
    // No way to obtain an address: warn once (PE 0) and flag it via _noip.
    if (!CmiMyPe())
    CmiPrintf("CmiInitCPUTopology Warning: Can not get unique name for the compute nodes. \n");
    _noip = 1; 
  #endif
    cpuTopo.numPes = CmiNumPes();
  }

  // Make sure rank 0 has finished writing myip / _noip before others read.
  CmiNodeAllBarrier();
  // NOTE(review): returns without raising CcdTOPOLOGY_AVAIL.
  if (_noip) return; 

    /* prepare a msg to send */
  // One hostnameMsg header immediately followed by a single _procInfo record.
  msg = (hostnameMsg *)CmiAlloc(sizeof(hostnameMsg)+sizeof(_procInfo));
  msg->n = 1;
  msg->procs = (_procInfo*)((char*)msg + sizeof(hostnameMsg));
  CmiSetHandler((char *)msg, cpuTopoHandlerIdx);
  msg->procs[0].pe = CmiMyPe();
  msg->procs[0].ip = myip;
  msg->procs[0].ncores = CmiNumCores();
  msg->procs[0].rank = 0;
  msg->procs[0].nodeID = 0;
  CmiReduce(msg, sizeof(hostnameMsg)+sizeof(_procInfo), combineMessage);

  // blocking here
  // Poll the scheduler until `done` (updated outside this function) reaches
  // CmiMyNodeSize(); `done` is read under topoLock.
  while (topoInProgress) {
    CsdSchedulePoll();
    CmiLock(topoLock);
    topoInProgress = done < CmiMyNodeSize();
    CmiUnlock(topoLock);
  }

  if (CmiMyPe() == 0) {
    // This inner CMK_BIGSIM_CHARM test can never be active: the code is
    // already inside the #else of the outer CMK_BIGSIM_CHARM conditional.
#if CMK_BIGSIM_CHARM
    if (BgNodeRank() == 0)
#endif
      CmiPrintf("Charm++> cpu topology info is gathered in %.3f seconds.\n", CmiWallTimer()-startT);
  }
#endif

#endif   /* !CMK_BIGSIM_CHARM */

  // now every one should have the node info
  CcdRaiseCondition(CcdTOPOLOGY_AVAIL);      // call callbacks
  if (CmiMyPe() == 0 && show_flag) cpuTopo.print();
}
Beispiel #5
0
int main(int argc, char *argv[]) {
  int numprocs, myrank, grank;
  MPI_Init(&argc, &argv);
  MPI_Comm_size(MPI_COMM_WORLD, &numprocs);
  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);

  MPI_Group orig_group, new_group; 
  MPI_Comm new_comm; 

  /* Extract the original group handle */ 
  MPI_Comm_group(MPI_COMM_WORLD, &orig_group); 

  double sendTime, recvTime, min, avg, max;
  double time[3] = {0.0, 0.0, 0.0};
  int msg_size;
  MPI_Status mstat;
  int i=0, pe, trial, hops;
  char name[30];

  char *send_buf = (char *)malloc(MAX_MSG_SIZE);
  char *recv_buf = (char *)malloc(MAX_MSG_SIZE);

  for(i = 0; i < MAX_MSG_SIZE; i++) {
    recv_buf[i] = send_buf[i] = (char) (i & 0xff);
  }

  // allocate the routing map.
  int *map = (int *) malloc(sizeof(int) * numprocs);
  TopoManager *tmgr;
  int dimNZ, numRG, x, y, z, t, bcastSend[3], bcastRecv[3];

  if(myrank == 0) {
    tmgr = new TopoManager();
#if CREATE_JOBS
    numRG = tmgr->getDimNX() * (tmgr->getDimNY() - 2) * 2 * tmgr->getDimNT();
#else
    numRG = tmgr->getDimNX() * tmgr->getDimNY() * 2 * tmgr->getDimNT();
#endif
    dimNZ = tmgr->getDimNZ();
    for (int i=1; i<numprocs; i++) {
      bcastSend[0] = dimNZ;
      bcastSend[1] = numRG;
      tmgr->rankToCoordinates(i, x, y, z, t);
      bcastSend[2] = z;
      MPI_Send(bcastSend, 3, MPI_INT, i, 1, MPI_COMM_WORLD);
    }
    tmgr->rankToCoordinates(0, x, y, z, t);
  } else {
      MPI_Recv(bcastRecv, 3, MPI_INT, 0, 1, MPI_COMM_WORLD, &mstat);
      dimNZ = bcastRecv[0];
      numRG = bcastRecv[1];
      z = bcastRecv[2];
  }

  MPI_Barrier(MPI_COMM_WORLD);

  if (myrank == 0) {
    printf("Torus Dimensions %d %d %d %d\n", tmgr->getDimNX(), tmgr->getDimNY(), dimNZ, tmgr->getDimNT());
  }

#if CREATE_JOBS
  for (hops=0; hops < 2; hops++) {
#else
  for (hops=0; hops < dimNZ/2; hops++) {
#endif
    int *mapRG = (int *) malloc(sizeof(int) * numRG);
    if (myrank == 0) {
      // Rank 0 makes up a routing map.
      build_process_map(numprocs, map, hops, numRG, mapRG);
    }

    // Broadcast the routing map.
    MPI_Bcast(map, numprocs, MPI_INT, 0, MPI_COMM_WORLD);
    MPI_Bcast(mapRG, numRG, MPI_INT, 0, MPI_COMM_WORLD);

    MPI_Group_incl(orig_group, numRG, mapRG, &new_group);
    MPI_Comm_create(MPI_COMM_WORLD, new_group, &new_comm);
    MPI_Group_rank(new_group, &grank);
    
#if CREATE_JOBS
    sprintf(name, "xt4_job_%d_%d.dat", numprocs, hops);
#else
    sprintf(name, "xt4_line_%d_%d.dat", numprocs, hops);
#endif
   
    for (msg_size=MIN_MSG_SIZE; msg_size<=MAX_MSG_SIZE; msg_size=(msg_size<<1)) {
      for (trial=0; trial<10; trial++) {

	pe = map[myrank];
	if(pe != -1) {
          if(grank != MPI_UNDEFINED) MPI_Barrier(new_comm);

	  if(myrank < pe) {
	    // warmup
	    for(i=0; i<2; i++) {
	      MPI_Send(send_buf, msg_size, MPI_CHAR, pe, 1, MPI_COMM_WORLD);
	      MPI_Recv(recv_buf, msg_size, MPI_CHAR, pe, 1, MPI_COMM_WORLD, &mstat);
	    }

	    sendTime = MPI_Wtime();
	    for(i=0; i<NUM_MSGS; i++)
	      MPI_Send(send_buf, msg_size, MPI_CHAR, pe, 1, MPI_COMM_WORLD);
	    for(i=0; i<NUM_MSGS; i++)
	      MPI_Recv(recv_buf, msg_size, MPI_CHAR, pe, 1, MPI_COMM_WORLD, &mstat);
	    recvTime = (MPI_Wtime() - sendTime) / NUM_MSGS;
	
	    // cooldown
	    for(i=0; i<2; i++) {
	      MPI_Send(send_buf, msg_size, MPI_CHAR, pe, 1, MPI_COMM_WORLD);
	      MPI_Recv(recv_buf, msg_size, MPI_CHAR, pe, 1, MPI_COMM_WORLD, &mstat);
	    }

	    if(grank != MPI_UNDEFINED) MPI_Barrier(new_comm);
	  } else {
	    // warmup
	    for(i=0; i<2; i++) {
	      MPI_Recv(recv_buf, msg_size, MPI_CHAR, pe, 1, MPI_COMM_WORLD, &mstat);
	      MPI_Send(send_buf, msg_size, MPI_CHAR, pe, 1, MPI_COMM_WORLD);
	    }

	    sendTime = MPI_Wtime();
	    for(i=0; i<NUM_MSGS; i++)
	      MPI_Recv(recv_buf, msg_size, MPI_CHAR, pe, 1, MPI_COMM_WORLD, &mstat);
	    for(i=0; i<NUM_MSGS; i++)
	      MPI_Send(send_buf, msg_size, MPI_CHAR, pe, 1, MPI_COMM_WORLD);
	    recvTime = (MPI_Wtime() - sendTime) / NUM_MSGS;

	    // cooldown
	    for(i=0; i<2; i++) {
	      MPI_Recv(recv_buf, msg_size, MPI_CHAR, pe, 1, MPI_COMM_WORLD, &mstat);
	      MPI_Send(send_buf, msg_size, MPI_CHAR, pe, 1, MPI_COMM_WORLD);
	    }

	    if(grank != MPI_UNDEFINED) MPI_Barrier(new_comm);
	  }

	  if(grank != MPI_UNDEFINED) {
  	    MPI_Allreduce(&recvTime, &min, 1, MPI_DOUBLE, MPI_MIN, new_comm);
  	    MPI_Allreduce(&recvTime, &avg, 1, MPI_DOUBLE, MPI_SUM, new_comm);
	    MPI_Allreduce(&recvTime, &max, 1, MPI_DOUBLE, MPI_MAX, new_comm);
          }

	  avg /= numRG;

	} // end if map[pe] != -1
	if(grank == 0) {
	  time[0] += min;
	  time[1] += avg;
	  time[2] += max;
	}
      } // end for loop of trials
      if (grank == 0) {
	FILE *outf = fopen(name, "a");
	fprintf(outf, "%d %g %g %g\n", msg_size, time[0]/10, time[1]/10, time[2]/10);
	fflush(NULL);
	fclose(outf);
	time[0] = time[1] = time[2] = 0.0;
      }
    } // end for loop of msgs
    free(mapRG);
  } // end for loop of hops

  if(grank == 0)
    printf("Program Complete\n");

  MPI_Finalize();
  return 0;
}