Exemple #1
0
  main(CkArgMsg* m)
  {
      if(m->argc < 5)
          CkPrintf("Usage: payload PEs CharesPerPE iteration\n");

      niter = 0;
      iterations=NITER;
      payload=PAYLOAD;
      if(m->argc>1)
          totalPayload=atoi(m->argv[1]);
      if(m->argc>2)
          PEsPerNode = atoi(m->argv[2]);
      if(m->argc>3)
          CharesPerPE = atoi(m->argv[3]);
      if(m->argc>4)
          iterations=atoi(m->argv[4]);
 
      payload = totalPayload/PEsPerNode/CharesPerPE;
      mainProxy = thishandle;
      arr1 = CProxy_Ping1::ckNew(CkNumNodes()* PEsPerNode * CharesPerPE );
      start_time = CkWallTimer();
      nodeIndex = 1;
      int x,y,z,t;
      TopoManager tmgr;
      for(int i=0; i<CmiNumPes(); i+=CmiMyNodeSize())
      {
          tmgr.rankToCoordinates(i, x,y, z, t);
          CkPrintf(" %d  [%d:%d:%d:%d]\n", i, x, y, z, t);
      }
      CkPrintf("NodeIndex Chares       Workers        NoOfMsgs        Bytes           Total           Time(us)\n");
      _traceControl = CProxy_TraceControl::ckNew();
      for(int i=0; i<PEsPerNode * CharesPerPE; i++)
          arr1[i].start(nodeIndex);
      delete m;
  };
Exemple #2
0
    Main(CkArgMsg* m) {
#if CMK_BLUEGENEL
      BGLPersonality bgl_p;
      int i = rts_get_personality(&bgl_p, sizeof(BGLPersonality));
#elif CMK_BLUEGENEP
      DCMF_Hardware_t bgp_hwt;
      DCMF_Hardware(&bgp_hwt);
#elif XT3_TOPOLOGY
      XT3TorusManager xt3tm;
#elif XT4_TOPOLOGY || XT5_TOPOLOGY
      XTTorusManager xttm;
#endif

      mainProxy = thishandle;
      CkPrintf("Testing TopoManager .... \n");
      TopoManager tmgr;
      CkPrintf("Torus Size [%d] [%d] [%d] [%d]\n", tmgr.getDimNX(), tmgr.getDimNY(), tmgr.getDimNZ(), tmgr.getDimNT());

#if CMK_BLUEGENEP
      CkPrintf("Torus Size [%d] [%d] [%d] [%d]\n", bgp_hwt.xSize, bgp_hwt.ySize, bgp_hwt.zSize, bgp_hwt.tSize);
#endif
      int x, y, z, t;

      for(int i=0; i<CkNumPes(); i++) {
	tmgr.rankToCoordinates(i, x, y, z, t);
	CkPrintf("---- Processor %d ---> x %d y %d z %d t %d\n", i, x, y, z, t);
#if CMK_BLUEGENEL
	unsigned int tmp_t, tmp_x, tmp_y, tmp_z;
	rts_coordinatesForRank(i, &tmp_x, &tmp_y, &tmp_z, &tmp_t);
	CkPrintf("Real Processor %d ---> x %d y %d z %d t %d\n", i, tmp_x, tmp_y, tmp_z, tmp_t);
#elif CMK_BLUEGENEP
	unsigned int tmp_t, tmp_x, tmp_y, tmp_z;
    #if (DCMF_VERSION_MAJOR >= 3)
	DCMF_NetworkCoord_t nc;
	DCMF_Messager_rank2network(i, DCMF_DEFAULT_NETWORK, &nc);
	tmp_x = nc.torus.x;
	tmp_y = nc.torus.y;
	tmp_z = nc.torus.z;
	tmp_t = nc.torus.t;
    #else
	DCMF_Messager_rank2torus(c, &tmp_x, &tmp_y, &tmp_z, &tmp_t);
    #endif
	CkPrintf("Real Processor %d ---> x %d y %d z %d t %d\n", i, tmp_x, tmp_y, tmp_z, tmp_t);
#elif XT3_TOPOLOGY
	int tmp_t, tmp_x, tmp_y, tmp_z;
	xt3tm.realRankToCoordinates(i, tmp_x, tmp_y, tmp_z, tmp_t);
	CkPrintf("Real Processor %d ---> x %d y %d z %d t %d\n", i, tmp_x, tmp_y, tmp_z, tmp_t);
#elif XT4_TOPOLOGY || XT5_TOPOLOGY
	int tmp_t, tmp_x, tmp_y, tmp_z;
	xttm.realRankToCoordinates(i, tmp_x, tmp_y, tmp_z, tmp_t);
	CkPrintf("Real Processor %d ---> x %d y %d z %d t %d\n", i, tmp_x, tmp_y, tmp_z, tmp_t);
#endif
      } // end of for loop

      int size = tmgr.getDimNX() * tmgr.getDimNY() * tmgr.getDimNZ();
      CkPrintf("Torus Contiguity Metric %d : %d [%f] \n", size, CkNumPes()/tmgr.getDimNT(), (float)(CkNumPes())/(tmgr.getDimNT()*size) );
      CkExit();
    };
Exemple #3
0
ComputeMap::ComputeMap(int x, int y, int z, int tx, int ty, int tz) {
  X = x;
  Y = y;
  Z = z;
  mapping = new int[X*Y*Z];

  TopoManager tmgr;
  int dimX, dimY, dimZ, dimT;

#if USE_TOPOMAP
  dimX = tmgr.getDimNX();
  dimY = tmgr.getDimNY();
  dimZ = tmgr.getDimNZ();
  dimT = tmgr.getDimNT();
#elif USE_BLOCKMAP
  dimX = tx;
  dimY = ty;
  dimZ = tz;
  dimT = 1;
#endif

  // we are assuming that the no. of chares in each dimension is a 
  // multiple of the torus dimension
  int numCharesPerPe = X*Y*Z/CkNumPes();

  int numCharesPerPeX = X / dimX;
  int numCharesPerPeY = Y / dimY;
  int numCharesPerPeZ = Z / dimZ;

  if(dimT < 2) {    // one core per node
    if(CkMyPe()==0) CkPrintf("DATA: %d %d %d %d : %d %d %d\n", dimX, dimY, dimZ, dimT, numCharesPerPeX, numCharesPerPeY, numCharesPerPeZ);
    for(int i=0; i<dimX; i++)
      for(int j=0; j<dimY; j++)
        for(int k=0; k<dimZ; k++)
          for(int ci=i*numCharesPerPeX; ci<(i+1)*numCharesPerPeX; ci++)
            for(int cj=j*numCharesPerPeY; cj<(j+1)*numCharesPerPeY; cj++)
              for(int ck=k*numCharesPerPeZ; ck<(k+1)*numCharesPerPeZ; ck++) {
#if USE_TOPOMAP
                mapping[ci*Y*Z + cj*Z + ck] = tmgr.coordinatesToRank(i, j, k);
#elif USE_BLOCKMAP
                mapping[ci*Y*Z + cj*Z + ck] = i + j*dimX + k*dimX*dimY;
#endif
              }
  } else {          // multiple cores per node
    // In this case, we split the chares in the X dimension among the
    // cores on the same node.
    numCharesPerPeX /= dimT;
      if(CkMyPe()==0) CkPrintf("%d %d %d : %d %d %d %d : %d %d %d \n", x, y, z, dimX, dimY, dimZ, dimT, numCharesPerPeX, numCharesPerPeY, numCharesPerPeZ);
      for(int i=0; i<dimX; i++)
        for(int j=0; j<dimY; j++)
          for(int k=0; k<dimZ; k++)
            for(int l=0; l<dimT; l++)
              for(int ci=(dimT*i+l)*numCharesPerPeX; ci<(dimT*i+l+1)*numCharesPerPeX; ci++)
                for(int cj=j*numCharesPerPeY; cj<(j+1)*numCharesPerPeY; cj++)
                  for(int ck=k*numCharesPerPeZ; ck<(k+1)*numCharesPerPeZ; ck++) {
                    mapping[ci*Y*Z + cj*Z + ck] = tmgr.coordinatesToRank(i, j, k, l);
                  }
  }
}
Exemple #4
0
void build_process_map(int size, int *smap, int *rmap, int *pmap, int file)
{
  TopoManager tmgr;
  int pe, pe1, pe2, x, y, z1, t;
  int dimNX, dimNY, dimNZ, dimNT;
  dimNX = tmgr.getDimNX();
  dimNY = tmgr.getDimNY();
  dimNZ = tmgr.getDimNZ();
  dimNT = tmgr.getDimNT();
  int count = 0;
  for(int i=0; i<size; i++)
  {
      smap[i]=-1;
      rmap[i]=-1;
      pmap[i]=-1;

  }
  cout << "Loading Map" << endl;
  char name[50];
  sprintf(name,"%d.map",file);
  ifstream mapFile(name);
  string line_s;
  while(mapFile.good()   ){
    #ifdef DEBUG
      cout << "     >  Loading " << name << endl;
    #endif
    int c1,c2,c3,c4,c5,c6;
    getline(mapFile,line_s);
    istringstream line(line_s);
    line >> c1 >> c2 >>c3 >> c4 >> c5>> c6;

    for(int i=0;i<dimNZ;i++)
    {
      pe = tmgr.coordinatesToRank(c1, c2, i, 0);
      pe1 = tmgr.coordinatesToRank(c4, c5, i, 0);
      smap[pe] = pe1;
      rmap[pe1] = pe;
      if(i==0)
      {
	pmap[pe] =1;
	pmap[pe1]=2;
      }
    }
  }
  dump_map(size,rmap);
  dump_map(size,smap);
}
Exemple #5
0
/**
 * This function implements a strategy similar to the one used in the 
 * centralized case in NamdCentLB.
 */
CLBMigrateMsg* NamdHybridLB::GrpLevelStrategy(LDStats* stats) {
  int numProcessors = stats->nprocs();	// number of processors at group level
  int numPatches = PatchMap::Object()->numPatches();
  ComputeMap *computeMap = ComputeMap::Object();
  const int numComputes = computeMap->numComputes();
  const int numGroupComputes = stats->n_migrateobjs;
  const SimParameters* simParams = Node::Object()->simParameters;

  if ( ! processorArray ) processorArray = new processorInfo[numProcessors];
  // these data structures are global and need to be distributed
  if ( ! patchArray ) patchArray = new patchInfo[numPatches];
  if ( ! computeArray ) computeArray = new computeInfo[numGroupComputes];
  if ( ! from_procs ) from_procs = new int[numGroupComputes];

  int nMoveableComputes = buildData(stats);
  CmiAssert(nMoveableComputes <= numGroupComputes);


#if LDB_DEBUG
#define DUMP_LDBDATA 1
#define LOAD_LDBDATA 1
#endif

#if DUMP_LDBDATA 
  dumpDataASCII("ldbd_before", numProcessors, numPatches, nMoveableComputes);
#elif LOAD_LDBDATA
  loadDataASCII("ldbd_before.5", numProcessors, numPatches, nMoveableComputes);
  // CkExit();
#endif

  double averageLoad = 0.;
  double avgCompute;
  double maxCompute;
  int maxComputeId;
  int numPesAvailable;
  {
   int i;
   double total = 0.;
   maxCompute = 0.;
   int maxi = 0;
   for (i=0; i<nMoveableComputes; i++) {
      double load = computeArray[i].load;
      total += load;
      if ( load > maxCompute ) { maxCompute = load;  maxi = i; }
   }
   avgCompute = total / nMoveableComputes;
   maxComputeId = computeArray[maxi].handle.id.id[0];

    int P = stats->nprocs();
   numPesAvailable = 0;
   for (i=0; i<P; i++) {
      if (processorArray[i].available) {
        ++numPesAvailable;
        total += processorArray[i].backgroundLoad;
      }
   }
   if (numPesAvailable == 0)
     NAMD_die("No processors available for load balancing!\n");

   averageLoad = total/numPesAvailable;
  }

  int i_split = 0;
  double maxUnsplit = 0.;

  if ( step() == 1 ) {
    for (int i=0; i<nMoveableComputes; i++) {
      const int cid = computeArray[i].handle.id.id[0];
      if ( computeMap->numPartitions(cid) == 0 ) {
        const double load = computeArray[i].load;
        if ( load > maxUnsplit ) maxUnsplit = load;
        continue;
      }
      ++i_split;
    }
  }

  {
    SplitComputesMsg *msg = new(i_split,i_split) SplitComputesMsg;
    msg->maxUnsplit = maxUnsplit;
    msg->averageLoad = averageLoad;
    msg->avgCompute = avgCompute;
    msg->maxCompute = maxCompute;
    msg->maxComputeId = maxComputeId;
    msg->nMoveableComputes = nMoveableComputes;
    msg->numPesAvailable = numPesAvailable;
    msg->n = i_split;

    if ( step() == 1 ) {
      i_split = 0;
      for (int i=0; i<nMoveableComputes; i++) {
        computeArray[i].processor = computeArray[i].oldProcessor;
        const int cid = computeArray[i].handle.id.id[0];
        if ( computeMap->numPartitions(cid) == 0 ) {
          continue;
        }
        msg->cid[i_split] = cid;
        msg->load[i_split] = computeArray[i].load;
        ++i_split;
      }
    }

    thisProxy[0].splitComputes(msg);
  }

  if ( step() == 1 ) {
    // compute splitting only
  } else if (simParams->ldbStrategy == LDBSTRAT_DEFAULT) { // default
    if (step() < 4)
      TorusLB(computeArray, patchArray, processorArray,
                  nMoveableComputes, numPatches, numProcessors);
    else
      RefineTorusLB(computeArray, patchArray, processorArray,
                  nMoveableComputes, numPatches, numProcessors, 1);
  } else if (simParams->ldbStrategy == LDBSTRAT_COMPREHENSIVE) {
    TorusLB(computeArray, patchArray, processorArray,
                  nMoveableComputes, numPatches, numProcessors);
  } else if (simParams->ldbStrategy == LDBSTRAT_REFINEONLY) {
    RefineTorusLB(computeArray, patchArray, processorArray,
                  nMoveableComputes, numPatches, numProcessors, 1);
  } else if (simParams->ldbStrategy == LDBSTRAT_OLD) {
    NAMD_die("Old load balancer strategy is not compatible with hybrid balancer.");
    if (step() < 4)
      Alg7(computeArray, patchArray, processorArray,
                  nMoveableComputes, numPatches, numProcessors);
    else
      RefineOnly(computeArray, patchArray, processorArray,
                  nMoveableComputes, numPatches, numProcessors);
  }

#if LDB_DEBUG && USE_TOPOMAP
  TopoManager tmgr;
  int pe1, pe2, pe3, hops=0;
  /* This is double counting the hops
  for(int i=0; i<nMoveableComputes; i++)
  {
    pe1 = computeArray[i].processor;
    pe2 = patchArray[computeArray[i].patch1].processor;
    pe3 = patchArray[computeArray[i].patch2].processor;
    hops += tmgr.getHopsBetweenRanks(pe1, pe2);
    if(computeArray[i].patch1 != computeArray[i].patch2)
      hops += tmgr.getHopsBetweenRanks(pe1, pe3);  
  }*/
  for (int i=0; i<numPatches; i++)  {
    //int num = patchArray[i].proxiesOn.numElements();
    pe1 = patchArray[i].processor;
    Iterator nextProc;
    processorInfo *p = (processorInfo *)patchArray[i].proxiesOn.iterator((Iterator *)&nextProc);
    while (p) {
      pe2 = p->Id;
      hops += tmgr.getHopsBetweenRanks(pe1, pe2);
      p = (processorInfo *)patchArray[i].proxiesOn.next((Iterator*)&nextProc);
    }
  }
  CkPrintf("Load Balancing: Number of Hops: %d\n", hops);
#endif

#if DUMP_LDBDATA
  dumpDataASCII("ldbd_after", numProcessors, numPatches, nMoveableComputes);
#elif LOAD_LDBDATA
  dumpDataASCII("ldbd_after.5", numProcessors, numPatches, nMoveableComputes);
  // loadDataASCII("ldbd_after", numProcessors, numPatches, nMoveableComputes);
  // CkExit();
#endif

  // For error checking:
  // Count up computes, to see if somebody doesn't have any computes
  int i;
#if 0
  int* computeCount = new int[numProcessors];
  for(i=0; i<numProcessors; i++)
    computeCount[i]=0;
  for(i=0; i<nMoveableComputes; i++)
    computeCount[computeArray[i].processor]++;
  for(i=0; i<numProcessors; i++) {
    if (computeCount[i]==0)
      iout << iINFO <<"Warning: Processor " << i 
	   << " has NO moveable computes.\n" << endi;
  }
  delete [] computeCount;
#endif
  
  CkVec<MigrateInfo *> migrateInfo;
  for(i=0;i<nMoveableComputes;i++) {
    if (computeArray[i].processor != from_procs[i]+stats->procs[0].pe) {
      /* CkPrintf("[%d] Obj %d migrating from %d (%d) to %d\n",
                     CkMyPe(),computeArray[i].handle.id.id[0],
			 from_procs[i], computeArray[i].oldProcessor, computeArray[i].processor); */
      MigrateInfo *migrateMe = new MigrateInfo;
      migrateMe->obj = computeArray[i].handle;
      //migrateMe->from_pe = computeArray[i].oldProcessor;
      int frompe = from_procs[i];
      if (frompe == numProcessors)
        frompe = -1;
      else
        frompe = frompe + stats->procs[0].pe;
      migrateMe->from_pe = frompe;
      migrateMe->to_pe = computeArray[i].processor;
      if (frompe == -1) {
          // don't know yet which processor this compute belongs to, but
	  // inform receiver
        LDObjData obj;
        obj.handle = computeArray[i].handle;
        thisProxy[computeArray[i].processor].ObjMigrated(obj, NULL, 0, currentLevel-1);
      } 
      migrateInfo.insertAtEnd(migrateMe);

      // sneak in updates to ComputeMap
      //ERASE CkPrintf("%d setting %d to processor %d\n",CkMyPe(),computeArray[i].handle.id.id[0],computeArray[i].processor);
      computeMap->setNewNode(computeArray[i].handle.id.id[0],
				computeArray[i].processor);
    }
  }
  // CkPrintf("LOAD BALANCING READY %d\n",CkMyPe()); 

  LBMigrateMsg* msg;
  msg = createMigrateMsg(migrateInfo, numProcessors);

  peLoads = new double [numProcessors]; 
  startPE = processorArray[0].Id;
  endPE = processorArray[numProcessors-1].Id;
  // CkPrintf("[%d] numProcessors=%d, %d to %d\n",CkMyPe(),numProcessors,processorArray[0].Id,processorArray[numProcessors-1].Id);
  for (i=0; i<numProcessors; i++) {
	peLoads[i] = processorArray[i].load;
  }


  delete [] from_procs;
  delete [] processorArray;
  delete [] patchArray;
  delete [] computeArray;

  from_procs = NULL;
  processorArray = NULL;
  patchArray = NULL;
  computeArray = NULL;
  
  return msg;

}
Exemple #6
0
int main(int argc, char *argv[]) {
  int numprocs, myrank;
  MPI_Init(&argc, &argv);
  MPI_Comm_size(MPI_COMM_WORLD, &numprocs);
  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
  double sendTime, recTime, min, avg, max;
  double time[3] = {0.0, 0.0, 0.0};
  int msg_size;
  MPI_Status mstat;
  int i=0,j, pe, pe1, pe2, trial, hops;
  char name[30];
  char locname[30];
  char blockname[50];
  double newTime, oldTime;
  double storeTime[NUM_MSGS];
  double recvTime[NUM_MSGS];
  double storeBw[NUM_MSGS];
  char *send_buf = (char *)malloc(MAX_MSG_SIZE);
  char *recv_buf = (char *)malloc(MAX_MSG_SIZE);
  FILE *locf;
  for(i = 0; i < MAX_MSG_SIZE; i++) {
    recv_buf[i] = send_buf[i] = (char) (i & 0xff);
  }

  // allocate the routing map.
  int *rmap = (int *) malloc(sizeof(int) * numprocs);
  int *smap = (int *) malloc(sizeof(int) * numprocs);
  
  TopoManager *tmgr;
  int dimNZ, numRG, x, y, z, t, bcastSend[3], bcastRecv[3];

  if(myrank == 0) {
    tmgr = new TopoManager();
#if CREATE_JOBS
    numRG = tmgr->getDimNX() * (tmgr->getDimNY() - 2) * 2 * tmgr->getDimNT();
#else
    numRG = tmgr->getDimNX() * tmgr->getDimNY() * 2;
#endif
    dimNZ = tmgr->getDimNZ();
    for (int i=1; i<numprocs; i++) {
      bcastSend[0] = dimNZ;
      bcastSend[1] = numRG;
      tmgr->rankToCoordinates(i, x, y, z, t);
      bcastSend[2] = z;
      MPI_Send(bcastSend, 3, MPI_INT, i, 1, MPI_COMM_WORLD);
    }
    tmgr->rankToCoordinates(0, x, y, z, t);
  } else {
      MPI_Recv(bcastRecv, 3, MPI_INT, 0, 1, MPI_COMM_WORLD, &mstat);
      dimNZ = bcastRecv[0];
      numRG = bcastRecv[1];
      z = bcastRecv[2];
  }
  MPI_Barrier(MPI_COMM_WORLD);
  if (myrank == 0) {
    printf("Torus Dimensions %d %d %d %d\n", tmgr->getDimNX(), tmgr->getDimNY(), dimNZ, tmgr->getDimNT());
  }
#if USE_HPM
    HPM_Init();
#endif
  for (hops=0; hops < 1; hops++) {
    // To print the recv times for certain ranks
    int *pmap = (int *) malloc(sizeof(int) * numprocs);
    if (myrank == 0) {
      // Rank 0 makes up a routing map.
      build_process_map(numprocs, smap, rmap, pmap, 2);
    }
    // Broadcast the routing map.
    MPI_Bcast(smap, numprocs, MPI_INT, 0, MPI_COMM_WORLD);
    MPI_Bcast(rmap, numprocs, MPI_INT, 0, MPI_COMM_WORLD);
    MPI_Bcast(pmap, numprocs, MPI_INT, 0, MPI_COMM_WORLD);
    sprintf(blockname, "Block_%d.hpm",hops);
    if (myrank == 0) {
       printf( " Broadcasted the map \n");
    }
#if USE_HPM
    HPM_Start(blockname);
#endif
#if CREATE_JOBS
    sprintf(name, "xt4_job_%d_%d.dat", numprocs, hops);
#else
    sprintf(name, "bgp_line_%d_%d.dat", numprocs, hops);
#endif
	if(pmap[myrank]>0)
	{
	sprintf(locname, "bgp_print_%d.dat", myrank);
	locf = fopen(locname, "a");
   	}
    for (msg_size=MIN_MSG_SIZE; msg_size<=MAX_MSG_SIZE; msg_size=(msg_size<<1)) {
      for (trial=0; trial<1; trial++) {
	 if (myrank == 0) {
	     printf( " Going to begin the trial \n");
	  }
	pe1 = smap[myrank]; // Am I a sender?
	pe2 = rmap[myrank]; // Am I a reciever? 
	MPI_Barrier(MPI_COMM_WORLD);
	// Actual Data Transfer
	if(pe1 != -1) {
	    sendTime = MPI_Wtime();
	    oldTime = sendTime;
	    j=0;
	    for(i=0; i<NUM_MSGS; i++)
	    {
		  storeTime[i] = MPI_Wtime(); // Just before the next send operation
		  MPI_Send(send_buf, msg_size, MPI_CHAR, pe1, 1, MPI_COMM_WORLD);
	    }
	    MPI_Recv(recv_buf, msg_size, MPI_CHAR, pe1, 1, MPI_COMM_WORLD, &mstat);
	    recTime = (MPI_Wtime() - sendTime) / (NUM_MSGS+1);
	    //printf(" My Rank : %d Experiment: %d  MSG_SIZE: %d -- Completed send recv \n", myrank, hops, msg_size);
	  }
	if(pe2 !=1)
	{
	    sendTime = MPI_Wtime();
	    oldTime = sendTime;
	    j=0;
	    for(i=0; i<NUM_MSGS; i++)
	      {
		  MPI_Recv(recv_buf, msg_size, MPI_CHAR, pe2, 1, MPI_COMM_WORLD, &mstat);
		  recvTime[i] = MPI_Wtime(); // Just after the next recv operation
	      }	  
	    MPI_Send(send_buf, msg_size, MPI_CHAR, pe2, 1, MPI_COMM_WORLD);
	    recTime = (MPI_Wtime() - sendTime) / (NUM_MSGS+1);
        }
	// Recv times sent back to the Senders for b/w calculations 
	if(myrank==0)
	{
	  printf(" My Rank : %d Experiment: %d  MSG_SIZE: %d -- Reached barrier in middle \n", myrank, hops, msg_size);
	}
	pe1 = smap[myrank]; // Am I a sender?
	pe2 = rmap[myrank]; // Am I a reciever? 
	MPI_Barrier(MPI_COMM_WORLD);
	if(pe1 != -1) {
	    MPI_Recv(recvTime, NUM_MSGS, MPI_DOUBLE, pe1, 1, MPI_COMM_WORLD, &mstat);
	    if(pmap[myrank]==1)
	    {
	      printf(" My Rank : %d Hops: %d  MSG_SIZE: %d Sender Side Exp trial: %d   Avg recv time %g \n", myrank, hops, msg_size, trial, recTime );
	      //printf(" My Rank : %d Hops: %d  MSG_SIZE: %d Sender Side Exp trial: %d   Recv time %g \n", myrank, hops, msg_size, trial, recvTime );
	      for(i=0;i<NUM_MSGS; i++)
		{
		  storeBw[i]= msg_size/(recvTime[i] - storeTime[i]);
		  fprintf(locf,  "%d   %d   %d   %g   %g   %g   %g \n", hops, myrank, msg_size, 500000*(storeTime[i]+recvTime[i]), storeBw[i],1000000*recvTime[i],1000000*storeTime[i]); 
		}
	    }
	  }
	if(pe2 !=1) 
	    {
	      MPI_Send(recvTime, NUM_MSGS, MPI_DOUBLE, pe2, 1, MPI_COMM_WORLD);
	    }
      } // end for loop of trials
    } // end for loop of msgs
    if(pmap[myrank]>0)
	{
  		fflush(NULL);
		fclose(locf);
	}
    free(pmap);
#if USE_HPM
  HPM_Stop(blockname);
#endif
  } // end for loop of hops
#if USE_HPM
  HPM_Print();
#endif
  if(myrank == 0)
    printf("Program Complete\n");
  MPI_Finalize();
  return 0;
}
Exemple #7
0
    Main(CkArgMsg* m) {
        if ( (m->argc != 3) && (m->argc != 7) ) {
            CkPrintf("%s [array_size] [block_size]\n", m->argv[0]);
            CkPrintf("OR %s [array_size_X] [array_size_Y] [array_size_Z] [block_size_X] [block_size_Y] [block_size_Z]\n", m->argv[0]);
            CkAbort("Abort");
        }

        // set iteration counter to zero
        iterations = 0;

        // store the main proxy
        mainProxy = thisProxy;

        if(m->argc == 3) {
            arrayDimX = arrayDimY = arrayDimZ = atoi(m->argv[1]);
            blockDimX = blockDimY = blockDimZ = atoi(m->argv[2]);
        }
        else if (m->argc == 7) {
            arrayDimX = atoi(m->argv[1]);
            arrayDimY = atoi(m->argv[2]);
            arrayDimZ = atoi(m->argv[3]);
            blockDimX = atoi(m->argv[4]);
            blockDimY = atoi(m->argv[5]);
            blockDimZ = atoi(m->argv[6]);
        }

        if (arrayDimX < blockDimX || arrayDimX % blockDimX != 0)
            CkAbort("array_size_X % block_size_X != 0!");
        if (arrayDimY < blockDimY || arrayDimY % blockDimY != 0)
            CkAbort("array_size_Y % block_size_Y != 0!");
        if (arrayDimZ < blockDimZ || arrayDimZ % blockDimZ != 0)
            CkAbort("array_size_Z % block_size_Z != 0!");

        num_chare_x = arrayDimX / blockDimX;
        num_chare_y = arrayDimY / blockDimY;
        num_chare_z = arrayDimZ / blockDimZ;

        // print info
        CkPrintf("\nSTENCIL COMPUTATION WITH NO BARRIERS\n");
        CkPrintf("Running Jacobi on %d processors with (%d, %d, %d) chares\n", CkNumPes(), num_chare_x, num_chare_y, num_chare_z);
        CkPrintf("Array Dimensions: %d %d %d\n", arrayDimX, arrayDimY, arrayDimZ);
        CkPrintf("Block Dimensions: %d %d %d\n", blockDimX, blockDimY, blockDimZ);

        // Create new array of worker chares
#if USE_TOPOMAP
        CProxy_JacobiMap map = CProxy_JacobiMap::ckNew(num_chare_x, num_chare_y, num_chare_z);
        CkPrintf("Topology Mapping is being done ... \n");
        CkArrayOptions opts(num_chare_x, num_chare_y, num_chare_z);
        opts.setMap(map);
        array = CProxy_Jacobi::ckNew(opts);
#else
        array = CProxy_Jacobi::ckNew(num_chare_x, num_chare_y, num_chare_z);
#endif

        TopoManager tmgr;
        CkArray *jarr = array.ckLocalBranch();
        int jmap[num_chare_x][num_chare_y][num_chare_z];

        int hops=0, p;
        for(int i=0; i<num_chare_x; i++)
            for(int j=0; j<num_chare_y; j++)
                for(int k=0; k<num_chare_z; k++) {
                    jmap[i][j][k] = jarr->procNum(CkArrayIndex3D(i, j, k));
                }

        for(int i=0; i<num_chare_x; i++)
            for(int j=0; j<num_chare_y; j++)
                for(int k=0; k<num_chare_z; k++) {
                    p = jmap[i][j][k];
                    hops += tmgr.getHopsBetweenRanks(p, jmap[wrap_x(i+1)][j][k]);
                    hops += tmgr.getHopsBetweenRanks(p, jmap[wrap_x(i-1)][j][k]);
                    hops += tmgr.getHopsBetweenRanks(p, jmap[i][wrap_y(j+1)][k]);
                    hops += tmgr.getHopsBetweenRanks(p, jmap[i][wrap_y(j-1)][k]);
                    hops += tmgr.getHopsBetweenRanks(p, jmap[i][j][wrap_z(k+1)]);
                    hops += tmgr.getHopsBetweenRanks(p, jmap[i][j][wrap_z(k-1)]);
                }
        CkPrintf("Total Hops: %d\n", hops);

#ifdef JACOBI_OPENMP
        CProxy_OmpInitializer ompInit =
            CProxy_OmpInitializer::ckNew(4);
#else
        //Start the computation
        start();
#endif
    }
Exemple #8
0
extern "C" void LrtsInitCpuTopo(char **argv)
{
  static skt_ip_t myip;
  hostnameMsg  *msg;
  double startT;
 
  int obtain_flag = 1;              // default on
  int show_flag = 0;                // default not show topology

  if (CmiMyRank() ==0) {
     topoLock = CmiCreateLock();
  }

#if __FAULT__
  obtain_flag = 0;
#endif
  if(CmiGetArgFlagDesc(argv,"+obtain_cpu_topology",
					   "obtain cpu topology info"))
    obtain_flag = 1;
  if (CmiGetArgFlagDesc(argv,"+skip_cpu_topology",
                               "skip the processof getting cpu topology info"))
    obtain_flag = 0;
  if(CmiGetArgFlagDesc(argv,"+show_cpu_topology",
					   "Show cpu topology info"))
    show_flag = 1;

#if CMK_BIGSIM_CHARM
  if (BgNodeRank() == 0)
#endif
  {
  cpuTopoHandlerIdx =
     CmiRegisterHandler((CmiHandler)cpuTopoHandler);
  cpuTopoRecvHandlerIdx =
     CmiRegisterHandler((CmiHandler)cpuTopoRecvHandler);
  }

  if (!obtain_flag) {
    if (CmiMyRank() == 0) cpuTopo.sort();
    CmiNodeAllBarrier();
    CcdRaiseCondition(CcdTOPOLOGY_AVAIL);      // call callbacks
    return;
  }

  if (CmiMyPe() == 0) {
#if CMK_BIGSIM_CHARM
    if (BgNodeRank() == 0)
#endif
      startT = CmiWallTimer();
  }

#if CMK_BIGSIM_CHARM
  if (BgNodeRank() == 0)
  {
    //int numPes = BgNumNodes()*BgGetNumWorkThread();
    int numPes = cpuTopo.numPes = CkNumPes();
    cpuTopo.nodeIDs = new int[numPes];
    CpuTopology::supported = 1;
    int wth = BgGetNumWorkThread();
    for (int i=0; i<numPes; i++) {
      int nid = i / wth;
      cpuTopo.nodeIDs[i] = nid;
    }
    cpuTopo.sort();
  }
  return;
#else

#if CMK_USE_GM
  CmiBarrier();
#endif


#if 0
  if (gethostname(hostname, 999)!=0) {
      strcpy(hostname, "");
  }
#endif
#if CMK_BLUEGENEL || CMK_BLUEGENEP
  if (CmiMyRank() == 0) {
    TopoManager tmgr;

    int numPes = cpuTopo.numPes = CmiNumPes();
    cpuTopo.nodeIDs = new int[numPes];
    CpuTopology::supported = 1;

    int x, y, z, t, nid;
    for(int i=0; i<numPes; i++) {
      tmgr.rankToCoordinates(i, x, y, z, t);
      nid = tmgr.coordinatesToRank(x, y, z, 0);
      cpuTopo.nodeIDs[i] = nid;
    }
    cpuTopo.sort();
    if (CmiMyPe()==0)  CmiPrintf("Charm++> Running on %d unique compute nodes (%d-way SMP).\n", cpuTopo.numNodes, CmiNumCores());
  }
  CmiNodeAllBarrier();
#elif CMK_BLUEGENEQ
  if (CmiMyRank() == 0) {
   TopoManager tmgr;

    int numPes = cpuTopo.numPes = CmiNumPes();
    cpuTopo.nodeIDs = new int[numPes];
    CpuTopology::supported = 1;

    int a, b, c, d, e, t, nid;
    for(int i=0; i<numPes; i++) {
      tmgr.rankToCoordinates(i, a, b, c, d, e, t);
      nid = tmgr.coordinatesToRank(a, b, c, d, e, 0);
      cpuTopo.nodeIDs[i] = nid;
    }
    cpuTopo.sort();
    if (CmiMyPe()==0)  CmiPrintf("Charm++> Running on %d unique compute nodes (%d-way SMP).\n", cpuTopo.numNodes, CmiNumCores());
  }
  CmiNodeAllBarrier();
#elif CMK_CRAYXT || CMK_CRAYXE || CMK_CRAYXC
  if(CmiMyRank() == 0) {
    int numPes = cpuTopo.numPes = CmiNumPes();
    int numNodes = CmiNumNodes();
    cpuTopo.nodeIDs = new int[numPes];
    CpuTopology::supported = 1;

    int nid;
    for(int i=0; i<numPes; i++) {
      nid = getXTNodeID(CmiNodeOf(i), numNodes);
      cpuTopo.nodeIDs[i] = nid;
    }
    int prev = -1;
    nid = -1;

    // this assumes that all cores on a node have consecutive MPI rank IDs
    // and then changes nodeIDs to 0 to numNodes-1
    for(int i=0; i<numPes; i++) {
      if(cpuTopo.nodeIDs[i] != prev) {
	prev = cpuTopo.nodeIDs[i];
	cpuTopo.nodeIDs[i] = ++nid;
      }
      else
	cpuTopo.nodeIDs[i] = nid;
    }
    cpuTopo.sort();
    if (CmiMyPe()==0)  CmiPrintf("Charm++> Running on %d unique compute nodes (%d-way SMP).\n", cpuTopo.numNodes, CmiNumCores());
  }
  CmiNodeAllBarrier();

#else

  bool topoInProgress = true;

  if (CmiMyPe() >= CmiNumPes()) {
    CmiNodeAllBarrier();         // comm thread waiting
#if CMK_MACHINE_PROGRESS_DEFINED
#if ! CMK_CRAYXT
    while (topoInProgress) {
      CmiNetworkProgress();
      CmiLock(topoLock);
      topoInProgress = done < CmiMyNodeSize();
      CmiUnlock(topoLock);
    }
#endif
#endif
    return;    /* comm thread return */
  }

    /* get my ip address */
  if (CmiMyRank() == 0)
  {
  #if CMK_HAS_GETHOSTNAME && !CMK_BLUEGENEQ
    myip = skt_my_ip();        /* not thread safe, so only calls on rank 0 */
    // fprintf(stderr, "[%d] IP is %d.%d.%d.%d\n", CmiMyPe(), myip.data[0],myip.data[1],myip.data[2],myip.data[3]);
  #elif CMK_BPROC
    myip = skt_innode_my_ip();
  #else
    if (!CmiMyPe())
    CmiPrintf("CmiInitCPUTopology Warning: Can not get unique name for the compute nodes. \n");
    _noip = 1; 
  #endif
    cpuTopo.numPes = CmiNumPes();
  }

  CmiNodeAllBarrier();
  if (_noip) return; 

    /* prepare a msg to send */
  msg = (hostnameMsg *)CmiAlloc(sizeof(hostnameMsg)+sizeof(_procInfo));
  msg->n = 1;
  msg->procs = (_procInfo*)((char*)msg + sizeof(hostnameMsg));
  CmiSetHandler((char *)msg, cpuTopoHandlerIdx);
  msg->procs[0].pe = CmiMyPe();
  msg->procs[0].ip = myip;
  msg->procs[0].ncores = CmiNumCores();
  msg->procs[0].rank = 0;
  msg->procs[0].nodeID = 0;
  CmiReduce(msg, sizeof(hostnameMsg)+sizeof(_procInfo), combineMessage);

  // blocking here
  while (topoInProgress) {
    CsdSchedulePoll();
    CmiLock(topoLock);
    topoInProgress = done < CmiMyNodeSize();
    CmiUnlock(topoLock);
  }

  if (CmiMyPe() == 0) {
#if CMK_BIGSIM_CHARM
    if (BgNodeRank() == 0)
#endif
      CmiPrintf("Charm++> cpu topology info is gathered in %.3f seconds.\n", CmiWallTimer()-startT);
  }
#endif

#endif   /* __BIGSIM__ */

  // now every one should have the node info
  CcdRaiseCondition(CcdTOPOLOGY_AVAIL);      // call callbacks
  if (CmiMyPe() == 0 && show_flag) cpuTopo.print();
}
Exemple #9
0
void build_process_map(int size, int *map, int dist, int numRG, int *mapRG)
{
  TopoManager tmgr;
  int pe1, pe2, x, y, z, t;
  int dimNX, dimNY, dimNZ, dimNT;

  dimNX = tmgr.getDimNX();
  dimNY = tmgr.getDimNY();
  dimNZ = tmgr.getDimNZ();
  dimNT = tmgr.getDimNT();

  int count = 0;

#if CREATE_JOBS
  for(int i=0; i<size; i++)
    map[i] = -1;

  // assumes a cubic partition such as 8 x 8 x 8
  // inner brick is always used
  for(int i=0; i<dimNX; i++)
    for(int j=1; j<dimNY-1; j++)
      for(int k=1; k<dimNZ-1; k++)
	for(int l=0; l<dimNT; l++) {
          if(k == 2 || k == dimNZ-3) {
	    pe1 = tmgr.coordinatesToRank(i, j, k, l);
            if(k == 2)
              pe2 = tmgr.coordinatesToRank(i, j, dimNZ-3, l);
            else
              pe2 = tmgr.coordinatesToRank(i, j, 2, l);
	    map[pe1] = pe2;
            mapRG[count++] = pe1;
	    printf("%d ", pe1);
	  }
	}

  printf("\n");
  if(dist == 1) {
    // outer brick is used only when dist == 1
    for(int i=0; i<dimNX; i++)
      for(int j=0; j<dimNY; j++)
	for(int k=0; k<dimNZ; k++)
	  for(int l=0; l<dimNT; l++) {
	    if(j == 0 || j == dimNY-1 || k == 0 || k == dimNZ-1) {
	      pe1 = tmgr.coordinatesToRank(i, j, k, l);
	      pe2 = tmgr.coordinatesToRank(i, k, j, l);
	      if(j == 0 && k == 0)
		pe2 = tmgr.coordinatesToRank(i, dimNY-1, dimNZ-1, l);
	      else if(j == dimNY-1 && k == dimNZ-1)
		pe2 = tmgr.coordinatesToRank(i, 0, 0, l);
	      map[pe1] = pe2;
	    }
	  }
  }
#else
  for(int i=0; i<dimNX; i++)
    for(int j=0; j<dimNY; j++)
      for(int k=0; k<dimNZ; k++)
	for(int l=0; l<dimNT; l++) {
	  pe1 = tmgr.coordinatesToRank(i, j, k, l);
	  if( abs(dimNZ - 1 - 2*k) <= (2*dist+1) ) {
	    pe2 = tmgr.coordinatesToRank(i, j, (dimNZ-1-k), l);
	    map[pe1] = pe2;

	    if(i==0 && j==0 && l==0) {
	      printf("Hops %d [%d] [%d]\n", 2*dist+1, pe1, pe2);
	    }

	    if(k == dimNZ/2-1 || k == dimNZ/2)
	      mapRG[count++] = pe1;
	  } else
	    map[pe1] = -1;
	}
#endif
  printf("Barrier Process %d %d\n", count, numRG);
  check_map(size, map);
}
Exemple #10
0
int main(int argc, char *argv[]) {
  int numprocs, myrank, grank;
  MPI_Init(&argc, &argv);
  MPI_Comm_size(MPI_COMM_WORLD, &numprocs);
  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);

  MPI_Group orig_group, new_group; 
  MPI_Comm new_comm; 

  /* Extract the original group handle */ 
  MPI_Comm_group(MPI_COMM_WORLD, &orig_group); 

  double sendTime, recvTime, min, avg, max;
  double time[3] = {0.0, 0.0, 0.0};
  int msg_size;
  MPI_Status mstat;
  int i=0, pe, trial, hops;
  char name[30];

  char *send_buf = (char *)malloc(MAX_MSG_SIZE);
  char *recv_buf = (char *)malloc(MAX_MSG_SIZE);

  for(i = 0; i < MAX_MSG_SIZE; i++) {
    recv_buf[i] = send_buf[i] = (char) (i & 0xff);
  }

  // allocate the routing map.
  int *map = (int *) malloc(sizeof(int) * numprocs);
  TopoManager *tmgr;
  int dimNZ, numRG, x, y, z, t, bcastSend[3], bcastRecv[3];

  if(myrank == 0) {
    tmgr = new TopoManager();
#if CREATE_JOBS
    numRG = tmgr->getDimNX() * (tmgr->getDimNY() - 2) * 2 * tmgr->getDimNT();
#else
    numRG = tmgr->getDimNX() * tmgr->getDimNY() * 2 * tmgr->getDimNT();
#endif
    dimNZ = tmgr->getDimNZ();
    for (int i=1; i<numprocs; i++) {
      bcastSend[0] = dimNZ;
      bcastSend[1] = numRG;
      tmgr->rankToCoordinates(i, x, y, z, t);
      bcastSend[2] = z;
      MPI_Send(bcastSend, 3, MPI_INT, i, 1, MPI_COMM_WORLD);
    }
    tmgr->rankToCoordinates(0, x, y, z, t);
  } else {
      MPI_Recv(bcastRecv, 3, MPI_INT, 0, 1, MPI_COMM_WORLD, &mstat);
      dimNZ = bcastRecv[0];
      numRG = bcastRecv[1];
      z = bcastRecv[2];
  }

  MPI_Barrier(MPI_COMM_WORLD);

  if (myrank == 0) {
    printf("Torus Dimensions %d %d %d %d\n", tmgr->getDimNX(), tmgr->getDimNY(), dimNZ, tmgr->getDimNT());
  }

#if CREATE_JOBS
  for (hops=0; hops < 2; hops++) {
#else
  for (hops=0; hops < dimNZ/2; hops++) {
#endif
    int *mapRG = (int *) malloc(sizeof(int) * numRG);
    if (myrank == 0) {
      // Rank 0 makes up a routing map.
      build_process_map(numprocs, map, hops, numRG, mapRG);
    }

    // Broadcast the routing map.
    MPI_Bcast(map, numprocs, MPI_INT, 0, MPI_COMM_WORLD);
    MPI_Bcast(mapRG, numRG, MPI_INT, 0, MPI_COMM_WORLD);

    MPI_Group_incl(orig_group, numRG, mapRG, &new_group);
    MPI_Comm_create(MPI_COMM_WORLD, new_group, &new_comm);
    MPI_Group_rank(new_group, &grank);
    
#if CREATE_JOBS
    sprintf(name, "xt4_job_%d_%d.dat", numprocs, hops);
#else
    sprintf(name, "xt4_line_%d_%d.dat", numprocs, hops);
#endif
   
    for (msg_size=MIN_MSG_SIZE; msg_size<=MAX_MSG_SIZE; msg_size=(msg_size<<1)) {
      for (trial=0; trial<10; trial++) {

	pe = map[myrank];
	if(pe != -1) {
          if(grank != MPI_UNDEFINED) MPI_Barrier(new_comm);

	  if(myrank < pe) {
	    // warmup
	    for(i=0; i<2; i++) {
	      MPI_Send(send_buf, msg_size, MPI_CHAR, pe, 1, MPI_COMM_WORLD);
	      MPI_Recv(recv_buf, msg_size, MPI_CHAR, pe, 1, MPI_COMM_WORLD, &mstat);
	    }

	    sendTime = MPI_Wtime();
	    for(i=0; i<NUM_MSGS; i++)
	      MPI_Send(send_buf, msg_size, MPI_CHAR, pe, 1, MPI_COMM_WORLD);
	    for(i=0; i<NUM_MSGS; i++)
	      MPI_Recv(recv_buf, msg_size, MPI_CHAR, pe, 1, MPI_COMM_WORLD, &mstat);
	    recvTime = (MPI_Wtime() - sendTime) / NUM_MSGS;
	
	    // cooldown
	    for(i=0; i<2; i++) {
	      MPI_Send(send_buf, msg_size, MPI_CHAR, pe, 1, MPI_COMM_WORLD);
	      MPI_Recv(recv_buf, msg_size, MPI_CHAR, pe, 1, MPI_COMM_WORLD, &mstat);
	    }

	    if(grank != MPI_UNDEFINED) MPI_Barrier(new_comm);
	  } else {
	    // warmup
	    for(i=0; i<2; i++) {
	      MPI_Recv(recv_buf, msg_size, MPI_CHAR, pe, 1, MPI_COMM_WORLD, &mstat);
	      MPI_Send(send_buf, msg_size, MPI_CHAR, pe, 1, MPI_COMM_WORLD);
	    }

	    sendTime = MPI_Wtime();
	    for(i=0; i<NUM_MSGS; i++)
	      MPI_Recv(recv_buf, msg_size, MPI_CHAR, pe, 1, MPI_COMM_WORLD, &mstat);
	    for(i=0; i<NUM_MSGS; i++)
	      MPI_Send(send_buf, msg_size, MPI_CHAR, pe, 1, MPI_COMM_WORLD);
	    recvTime = (MPI_Wtime() - sendTime) / NUM_MSGS;

	    // cooldown
	    for(i=0; i<2; i++) {
	      MPI_Recv(recv_buf, msg_size, MPI_CHAR, pe, 1, MPI_COMM_WORLD, &mstat);
	      MPI_Send(send_buf, msg_size, MPI_CHAR, pe, 1, MPI_COMM_WORLD);
	    }

	    if(grank != MPI_UNDEFINED) MPI_Barrier(new_comm);
	  }

	  if(grank != MPI_UNDEFINED) {
  	    MPI_Allreduce(&recvTime, &min, 1, MPI_DOUBLE, MPI_MIN, new_comm);
  	    MPI_Allreduce(&recvTime, &avg, 1, MPI_DOUBLE, MPI_SUM, new_comm);
	    MPI_Allreduce(&recvTime, &max, 1, MPI_DOUBLE, MPI_MAX, new_comm);
          }

	  avg /= numRG;

	} // end if map[pe] != -1
	if(grank == 0) {
	  time[0] += min;
	  time[1] += avg;
	  time[2] += max;
	}
      } // end for loop of trials
      if (grank == 0) {
	FILE *outf = fopen(name, "a");
	fprintf(outf, "%d %g %g %g\n", msg_size, time[0]/10, time[1]/10, time[2]/10);
	fflush(NULL);
	fclose(outf);
	time[0] = time[1] = time[2] = 0.0;
      }
    } // end for loop of msgs
    free(mapRG);
  } // end for loop of hops

  if(grank == 0)
    printf("Program Complete\n");

  MPI_Finalize();
  return 0;
}
Exemple #11
0
  Main(CkArgMsg *args) {

    if (args->argc >= 3) {
      dataSizeMin = atoi(args->argv[1]);
      dataSizeMax = atoi(args->argv[2]);
    }
    else {
      dataSizeMin = 32;
      dataSizeMax = 16384;
    }
    bufferSize =
      args->argc == 4 ? atoi(args->argv[3]) : TRAM_BUFFER_SIZE;
    CkPrintf("size of envelope: %d\n\n", sizeof(envelope));
    delete args;

    iters = dataSizeMin / DATA_ITEM_SIZE;
    allToAllGroup = CProxy_Participant::ckNew();

#if !CMK_BLUEGENEQ
    int nDims = 2;
    int dims[2] = {CkNumNodes(), CkNumPes() / CkNumNodes()};
    CkPrintf("TEST 1: Using %dD TRAM Topology: %d %d\n",
             nDims, dims[0], dims[1]);

    // Alternative 3D topology
    // int nDims = 3;
    // int dim1 = CkNumNodes();
    // int dim2 = 1;
    // if (dim1 != 1) {
    //   while (dim2 < dim1) {
    //     dim2 *= 2;
    //     dim1 /= 2;
    //   }
    // }
    // int dims[3] = {dim1, dim2, CkNumPes() / CkNumNodes()};
    // CkPrintf("Topology: %d %d %d\n", dims[0], dims[1], dims[2]);
#else
    TopoManager tmgr;

    int nDims = 3;
    int dims[3] = {tmgr.getDimNA() * tmgr.getDimNB(),
                   tmgr.getDimNC() * tmgr.getDimND() * tmgr.getDimNE(),
                   tmgr.getDimNT()};
    CkPrintf("TEST 1: Using %dD TRAM Topology: %d %d %d\n",
             nDims, dims[0], dims[1], dims[2]);

    // Alternative TRAM topologies for Blue Gene/Q using Topology Manager
    // int nDims = 4;
    // int dims[4] = {tmgr.getDimNA() * tmgr.getDimNB(), tmgr.getDimNC(),
    //                tmgr.getDimND() * tmgr.getDimNE(), tmgr.getDimNT()};

    // int nDims = 6;
    // int dims[6] = {tmgr.getDimNA(), tmgr.getDimNB(), tmgr.getDimNC(),
    //                tmgr.getDimND() * tmgr.getDimNE(),
    //                tmgr.getDimNT() / 8, 8};
#endif

    mainProxy = thisProxy;


    aggregator = CProxy_GroupMeshStreamer<DataItem, Participant,
                                          SimpleMeshRouter>::
    ckNew(nDims, dims, allToAllGroup, bufferSize, 1, 0.1);
    testType = usingTram;
  }