main(CkArgMsg* m) {
  if(m->argc < 5)
    CkPrintf("Usage: payload PEs CharesPerPE iteration\n");

  niter = 0;
  iterations = NITER;
  payload = PAYLOAD;
  if(m->argc > 1) totalPayload = atoi(m->argv[1]);
  if(m->argc > 2) PEsPerNode = atoi(m->argv[2]);
  if(m->argc > 3) CharesPerPE = atoi(m->argv[3]);
  if(m->argc > 4) iterations = atoi(m->argv[4]);
  payload = totalPayload / PEsPerNode / CharesPerPE;

  mainProxy = thishandle;
  arr1 = CProxy_Ping1::ckNew(CkNumNodes() * PEsPerNode * CharesPerPE);
  start_time = CkWallTimer();
  nodeIndex = 1;

  int x, y, z, t;
  TopoManager tmgr;
  for(int i=0; i<CmiNumPes(); i+=CmiMyNodeSize()) {
    tmgr.rankToCoordinates(i, x, y, z, t);
    CkPrintf(" %d [%d:%d:%d:%d]\n", i, x, y, z, t);
  }
  CkPrintf("NodeIndex Chares Workers NoOfMsgs Bytes Total Time(us)\n");

  _traceControl = CProxy_TraceControl::ckNew();

  for(int i=0; i<PEsPerNode * CharesPerPE; i++)
    arr1[i].start(nodeIndex);

  delete m;
}
Main(CkArgMsg* m) {
#if CMK_BLUEGENEL
  BGLPersonality bgl_p;
  int i = rts_get_personality(&bgl_p, sizeof(BGLPersonality));
#elif CMK_BLUEGENEP
  DCMF_Hardware_t bgp_hwt;
  DCMF_Hardware(&bgp_hwt);
#elif XT3_TOPOLOGY
  XT3TorusManager xt3tm;
#elif XT4_TOPOLOGY || XT5_TOPOLOGY
  XTTorusManager xttm;
#endif

  mainProxy = thishandle;
  CkPrintf("Testing TopoManager .... \n");

  TopoManager tmgr;
  CkPrintf("Torus Size [%d] [%d] [%d] [%d]\n", tmgr.getDimNX(), tmgr.getDimNY(),
           tmgr.getDimNZ(), tmgr.getDimNT());
#if CMK_BLUEGENEP
  CkPrintf("Torus Size [%d] [%d] [%d] [%d]\n", bgp_hwt.xSize, bgp_hwt.ySize,
           bgp_hwt.zSize, bgp_hwt.tSize);
#endif

  int x, y, z, t;
  for(int i=0; i<CkNumPes(); i++) {
    tmgr.rankToCoordinates(i, x, y, z, t);
    CkPrintf("---- Processor %d ---> x %d y %d z %d t %d\n", i, x, y, z, t);

#if CMK_BLUEGENEL
    unsigned int tmp_t, tmp_x, tmp_y, tmp_z;
    rts_coordinatesForRank(i, &tmp_x, &tmp_y, &tmp_z, &tmp_t);
    CkPrintf("Real Processor %d ---> x %d y %d z %d t %d\n", i, tmp_x, tmp_y, tmp_z, tmp_t);
#elif CMK_BLUEGENEP
    unsigned int tmp_t, tmp_x, tmp_y, tmp_z;
#if (DCMF_VERSION_MAJOR >= 3)
    DCMF_NetworkCoord_t nc;
    DCMF_Messager_rank2network(i, DCMF_DEFAULT_NETWORK, &nc);
    tmp_x = nc.torus.x;
    tmp_y = nc.torus.y;
    tmp_z = nc.torus.z;
    tmp_t = nc.torus.t;
#else
    DCMF_Messager_rank2torus(i, &tmp_x, &tmp_y, &tmp_z, &tmp_t);
#endif
    CkPrintf("Real Processor %d ---> x %d y %d z %d t %d\n", i, tmp_x, tmp_y, tmp_z, tmp_t);
#elif XT3_TOPOLOGY
    int tmp_t, tmp_x, tmp_y, tmp_z;
    xt3tm.realRankToCoordinates(i, tmp_x, tmp_y, tmp_z, tmp_t);
    CkPrintf("Real Processor %d ---> x %d y %d z %d t %d\n", i, tmp_x, tmp_y, tmp_z, tmp_t);
#elif XT4_TOPOLOGY || XT5_TOPOLOGY
    int tmp_t, tmp_x, tmp_y, tmp_z;
    xttm.realRankToCoordinates(i, tmp_x, tmp_y, tmp_z, tmp_t);
    CkPrintf("Real Processor %d ---> x %d y %d z %d t %d\n", i, tmp_x, tmp_y, tmp_z, tmp_t);
#endif
  } // end of for loop

  int size = tmgr.getDimNX() * tmgr.getDimNY() * tmgr.getDimNZ();
  CkPrintf("Torus Contiguity Metric %d : %d [%f] \n", size, CkNumPes()/tmgr.getDimNT(),
           (float)(CkNumPes())/(tmgr.getDimNT()*size));
  CkExit();
}
ComputeMap::ComputeMap(int x, int y, int z, int tx, int ty, int tz) {
  X = x; Y = y; Z = z;
  mapping = new int[X*Y*Z];

  TopoManager tmgr;
  int dimX, dimY, dimZ, dimT;
#if USE_TOPOMAP
  dimX = tmgr.getDimNX();
  dimY = tmgr.getDimNY();
  dimZ = tmgr.getDimNZ();
  dimT = tmgr.getDimNT();
#elif USE_BLOCKMAP
  dimX = tx;
  dimY = ty;
  dimZ = tz;
  dimT = 1;
#endif

  // we are assuming that the no. of chares in each dimension is a
  // multiple of the torus dimension
  int numCharesPerPe = X*Y*Z/CkNumPes();
  int numCharesPerPeX = X / dimX;
  int numCharesPerPeY = Y / dimY;
  int numCharesPerPeZ = Z / dimZ;

  if(dimT < 2) {    // one core per node
    if(CkMyPe()==0)
      CkPrintf("DATA: %d %d %d %d : %d %d %d\n", dimX, dimY, dimZ, dimT,
               numCharesPerPeX, numCharesPerPeY, numCharesPerPeZ);
    for(int i=0; i<dimX; i++)
      for(int j=0; j<dimY; j++)
        for(int k=0; k<dimZ; k++)
          for(int ci=i*numCharesPerPeX; ci<(i+1)*numCharesPerPeX; ci++)
            for(int cj=j*numCharesPerPeY; cj<(j+1)*numCharesPerPeY; cj++)
              for(int ck=k*numCharesPerPeZ; ck<(k+1)*numCharesPerPeZ; ck++) {
#if USE_TOPOMAP
                mapping[ci*Y*Z + cj*Z + ck] = tmgr.coordinatesToRank(i, j, k);
#elif USE_BLOCKMAP
                mapping[ci*Y*Z + cj*Z + ck] = i + j*dimX + k*dimX*dimY;
#endif
              }
  } else {          // multiple cores per node
    // In this case, we split the chares in the X dimension among the
    // cores on the same node.
    numCharesPerPeX /= dimT;
    if(CkMyPe()==0)
      CkPrintf("%d %d %d : %d %d %d %d : %d %d %d \n", x, y, z, dimX, dimY, dimZ, dimT,
               numCharesPerPeX, numCharesPerPeY, numCharesPerPeZ);
    for(int i=0; i<dimX; i++)
      for(int j=0; j<dimY; j++)
        for(int k=0; k<dimZ; k++)
          for(int l=0; l<dimT; l++)
            for(int ci=(dimT*i+l)*numCharesPerPeX; ci<(dimT*i+l+1)*numCharesPerPeX; ci++)
              for(int cj=j*numCharesPerPeY; cj<(j+1)*numCharesPerPeY; cj++)
                for(int ck=k*numCharesPerPeZ; ck<(k+1)*numCharesPerPeZ; ck++) {
                  mapping[ci*Y*Z + cj*Z + ck] = tmgr.coordinatesToRank(i, j, k, l);
                }
  }
}
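The lookup side of this map is not part of the excerpt above. A minimal sketch of how the mapping array would typically be consumed, assuming ComputeMap derives from Charm++'s CkArrayMap and the worker array is indexed in 3D; the procNum override below is an illustration reusing the member names X, Y, Z, and mapping from the constructor, not the original code:

// Hypothetical lookup (not in the original excerpt): return the PE that owns
// chare (x, y, z) using the same linearization as the constructor above.
int ComputeMap::procNum(int /*arrayHdl*/, const CkArrayIndex &idx) {
  const int *c = idx.data();                  // 3D chare index
  return mapping[c[0]*Y*Z + c[1]*Z + c[2]];
}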
void build_process_map(int size, int *smap, int *rmap, int *pmap, int file) {
  TopoManager tmgr;
  int pe, pe1, pe2, x, y, z1, t;
  int dimNX, dimNY, dimNZ, dimNT;

  dimNX = tmgr.getDimNX();
  dimNY = tmgr.getDimNY();
  dimNZ = tmgr.getDimNZ();
  dimNT = tmgr.getDimNT();

  int count = 0;
  for(int i=0; i<size; i++) {
    smap[i] = -1;
    rmap[i] = -1;
    pmap[i] = -1;
  }

  cout << "Loading Map" << endl;
  char name[50];
  sprintf(name, "%d.map", file);
  ifstream mapFile(name);
  string line_s;

  while(mapFile.good()) {
#ifdef DEBUG
    cout << " > Loading " << name << endl;
#endif
    int c1, c2, c3, c4, c5, c6;
    getline(mapFile, line_s);
    istringstream line(line_s);
    line >> c1 >> c2 >> c3 >> c4 >> c5 >> c6;

    for(int i=0; i<dimNZ; i++) {
      pe  = tmgr.coordinatesToRank(c1, c2, i, 0);
      pe1 = tmgr.coordinatesToRank(c4, c5, i, 0);
      smap[pe] = pe1;
      rmap[pe1] = pe;
      if(i==0) {
        pmap[pe] = 1;
        pmap[pe1] = 2;
      }
    }
  }

  dump_map(size, rmap);
  dump_map(size, smap);
}
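dump_map is called above but not defined in this excerpt; presumably it prints each rank's partner so the generated pairing can be inspected. A minimal sketch under that assumption (the behavior is inferred, not taken from the source):

// Hypothetical helper (not in the original excerpt): print rank -> partner
// for every mapped entry of a routing map.
void dump_map(int size, int *map) {
  for (int i = 0; i < size; i++)
    if (map[i] != -1)
      cout << i << " -> " << map[i] << endl;
}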
/**
 * This function implements a strategy similar to the one used in the
 * centralized case in NamdCentLB.
 */
CLBMigrateMsg* NamdHybridLB::GrpLevelStrategy(LDStats* stats) {
  int numProcessors = stats->nprocs();   // number of processors at group level
  int numPatches = PatchMap::Object()->numPatches();
  ComputeMap *computeMap = ComputeMap::Object();
  const int numComputes = computeMap->numComputes();
  const int numGroupComputes = stats->n_migrateobjs;
  const SimParameters* simParams = Node::Object()->simParameters;

  if ( ! processorArray ) processorArray = new processorInfo[numProcessors];
  // these data structures are global and need to be distributed
  if ( ! patchArray ) patchArray = new patchInfo[numPatches];
  if ( ! computeArray ) computeArray = new computeInfo[numGroupComputes];
  if ( ! from_procs ) from_procs = new int[numGroupComputes];

  int nMoveableComputes = buildData(stats);
  CmiAssert(nMoveableComputes <= numGroupComputes);

#if LDB_DEBUG
#define DUMP_LDBDATA 1
#define LOAD_LDBDATA 1
#endif

#if DUMP_LDBDATA
  dumpDataASCII("ldbd_before", numProcessors, numPatches, nMoveableComputes);
#elif LOAD_LDBDATA
  loadDataASCII("ldbd_before.5", numProcessors, numPatches, nMoveableComputes);
  // CkExit();
#endif

  double averageLoad = 0.;
  double avgCompute;
  double maxCompute;
  int maxComputeId;
  int numPesAvailable;
  {
    int i;
    double total = 0.;
    maxCompute = 0.;
    int maxi = 0;
    for (i=0; i<nMoveableComputes; i++) {
      double load = computeArray[i].load;
      total += load;
      if ( load > maxCompute ) { maxCompute = load; maxi = i; }
    }
    avgCompute = total / nMoveableComputes;
    maxComputeId = computeArray[maxi].handle.id.id[0];

    int P = stats->nprocs();
    numPesAvailable = 0;
    for (i=0; i<P; i++) {
      if (processorArray[i].available) {
        ++numPesAvailable;
        total += processorArray[i].backgroundLoad;
      }
    }
    if (numPesAvailable == 0)
      NAMD_die("No processors available for load balancing!\n");

    averageLoad = total/numPesAvailable;
  }

  int i_split = 0;
  double maxUnsplit = 0.;

  if ( step() == 1 ) {
    for (int i=0; i<nMoveableComputes; i++) {
      const int cid = computeArray[i].handle.id.id[0];
      if ( computeMap->numPartitions(cid) == 0 ) {
        const double load = computeArray[i].load;
        if ( load > maxUnsplit ) maxUnsplit = load;
        continue;
      }
      ++i_split;
    }
  }

  {
    SplitComputesMsg *msg = new(i_split,i_split) SplitComputesMsg;
    msg->maxUnsplit = maxUnsplit;
    msg->averageLoad = averageLoad;
    msg->avgCompute = avgCompute;
    msg->maxCompute = maxCompute;
    msg->maxComputeId = maxComputeId;
    msg->nMoveableComputes = nMoveableComputes;
    msg->numPesAvailable = numPesAvailable;
    msg->n = i_split;

    if ( step() == 1 ) {
      i_split = 0;
      for (int i=0; i<nMoveableComputes; i++) {
        computeArray[i].processor = computeArray[i].oldProcessor;
        const int cid = computeArray[i].handle.id.id[0];
        if ( computeMap->numPartitions(cid) == 0 ) continue;
        msg->cid[i_split] = cid;
        msg->load[i_split] = computeArray[i].load;
        ++i_split;
      }
    }

    thisProxy[0].splitComputes(msg);
  }

  if ( step() == 1 ) {
    // compute splitting only
  } else if (simParams->ldbStrategy == LDBSTRAT_DEFAULT) {   // default
    if (step() < 4)
      TorusLB(computeArray, patchArray, processorArray,
              nMoveableComputes, numPatches, numProcessors);
    else
      RefineTorusLB(computeArray, patchArray, processorArray,
                    nMoveableComputes, numPatches, numProcessors, 1);
  } else if (simParams->ldbStrategy == LDBSTRAT_COMPREHENSIVE) {
    TorusLB(computeArray, patchArray, processorArray,
            nMoveableComputes, numPatches, numProcessors);
  } else if (simParams->ldbStrategy == LDBSTRAT_REFINEONLY) {
    RefineTorusLB(computeArray, patchArray, processorArray,
                  nMoveableComputes, numPatches, numProcessors, 1);
  } else if (simParams->ldbStrategy == LDBSTRAT_OLD) {
    NAMD_die("Old load balancer strategy is not compatible with hybrid balancer.");
    if (step() < 4)
      Alg7(computeArray, patchArray, processorArray,
           nMoveableComputes, numPatches, numProcessors);
    else
      RefineOnly(computeArray, patchArray, processorArray,
                 nMoveableComputes, numPatches, numProcessors);
  }

#if LDB_DEBUG && USE_TOPOMAP
  TopoManager tmgr;
  int pe1, pe2, pe3, hops=0;
  /* This is double counting the hops
  for(int i=0; i<nMoveableComputes; i++) {
    pe1 = computeArray[i].processor;
    pe2 = patchArray[computeArray[i].patch1].processor;
    pe3 = patchArray[computeArray[i].patch2].processor;
    hops += tmgr.getHopsBetweenRanks(pe1, pe2);
    if(computeArray[i].patch1 != computeArray[i].patch2)
      hops += tmgr.getHopsBetweenRanks(pe1, pe3);
  }*/
  for (int i=0; i<numPatches; i++) {
    //int num = patchArray[i].proxiesOn.numElements();
    pe1 = patchArray[i].processor;
    Iterator nextProc;
    processorInfo *p = (processorInfo *)patchArray[i].proxiesOn.iterator((Iterator *)&nextProc);
    while (p) {
      pe2 = p->Id;
      hops += tmgr.getHopsBetweenRanks(pe1, pe2);
      p = (processorInfo *)patchArray[i].proxiesOn.next((Iterator*)&nextProc);
    }
  }
  CkPrintf("Load Balancing: Number of Hops: %d\n", hops);
#endif

#if DUMP_LDBDATA
  dumpDataASCII("ldbd_after", numProcessors, numPatches, nMoveableComputes);
#elif LOAD_LDBDATA
  dumpDataASCII("ldbd_after.5", numProcessors, numPatches, nMoveableComputes);
  // loadDataASCII("ldbd_after", numProcessors, numPatches, nMoveableComputes);
  // CkExit();
#endif

  // For error checking:
  // Count up computes, to see if somebody doesn't have any computes
  int i;
#if 0
  int* computeCount = new int[numProcessors];
  for(i=0; i<numProcessors; i++)
    computeCount[i] = 0;
  for(i=0; i<nMoveableComputes; i++)
    computeCount[computeArray[i].processor]++;
  for(i=0; i<numProcessors; i++) {
    if (computeCount[i]==0)
      iout << iINFO << "Warning: Processor " << i << " has NO moveable computes.\n" << endi;
  }
  delete [] computeCount;
#endif

  CkVec<MigrateInfo *> migrateInfo;
  for(i=0; i<nMoveableComputes; i++) {
    if (computeArray[i].processor != from_procs[i]+stats->procs[0].pe) {
      /* CkPrintf("[%d] Obj %d migrating from %d (%d) to %d\n",
                  CkMyPe(), computeArray[i].handle.id.id[0],
                  from_procs[i], computeArray[i].oldProcessor, computeArray[i].processor); */
      MigrateInfo *migrateMe = new MigrateInfo;
      migrateMe->obj = computeArray[i].handle;
      //migrateMe->from_pe = computeArray[i].oldProcessor;
      int frompe = from_procs[i];
      if (frompe == numProcessors)
        frompe = -1;
      else
        frompe = frompe + stats->procs[0].pe;
      migrateMe->from_pe = frompe;
      migrateMe->to_pe = computeArray[i].processor;
      if (frompe == -1) {
        // don't know yet which processor this compute belongs to, but
        // inform receiver
        LDObjData obj;
        obj.handle = computeArray[i].handle;
        thisProxy[computeArray[i].processor].ObjMigrated(obj, NULL, 0, currentLevel-1);
      }
      migrateInfo.insertAtEnd(migrateMe);

      // sneak in updates to ComputeMap
      //ERASE CkPrintf("%d setting %d to processor %d\n",CkMyPe(),computeArray[i].handle.id.id[0],computeArray[i].processor);
      computeMap->setNewNode(computeArray[i].handle.id.id[0], computeArray[i].processor);
    }
  }
  // CkPrintf("LOAD BALANCING READY %d\n",CkMyPe());

  LBMigrateMsg* msg;
  msg = createMigrateMsg(migrateInfo, numProcessors);

  peLoads = new double [numProcessors];
  startPE = processorArray[0].Id;
  endPE = processorArray[numProcessors-1].Id;
  // CkPrintf("[%d] numProcessors=%d, %d to %d\n",CkMyPe(),numProcessors,processorArray[0].Id,processorArray[numProcessors-1].Id);
  for (i=0; i<numProcessors; i++) {
    peLoads[i] = processorArray[i].load;
  }

  delete [] from_procs;
  delete [] processorArray;
  delete [] patchArray;
  delete [] computeArray;

  from_procs = NULL;
  processorArray = NULL;
  patchArray = NULL;
  computeArray = NULL;

  return msg;
}
int main(int argc, char *argv[]) {
  int numprocs, myrank;

  MPI_Init(&argc, &argv);
  MPI_Comm_size(MPI_COMM_WORLD, &numprocs);
  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);

  double sendTime, recTime, min, avg, max;
  double time[3] = {0.0, 0.0, 0.0};
  int msg_size;
  MPI_Status mstat;
  int i=0, j, pe, pe1, pe2, trial, hops;
  char name[30];
  char locname[30];
  char blockname[50];
  double newTime, oldTime;
  double storeTime[NUM_MSGS];
  double recvTime[NUM_MSGS];
  double storeBw[NUM_MSGS];

  char *send_buf = (char *)malloc(MAX_MSG_SIZE);
  char *recv_buf = (char *)malloc(MAX_MSG_SIZE);
  FILE *locf;

  for(i = 0; i < MAX_MSG_SIZE; i++) {
    recv_buf[i] = send_buf[i] = (char) (i & 0xff);
  }

  // allocate the routing map.
  int *rmap = (int *) malloc(sizeof(int) * numprocs);
  int *smap = (int *) malloc(sizeof(int) * numprocs);

  TopoManager *tmgr;
  int dimNZ, numRG, x, y, z, t, bcastSend[3], bcastRecv[3];

  if(myrank == 0) {
    tmgr = new TopoManager();
#if CREATE_JOBS
    numRG = tmgr->getDimNX() * (tmgr->getDimNY() - 2) * 2 * tmgr->getDimNT();
#else
    numRG = tmgr->getDimNX() * tmgr->getDimNY() * 2;
#endif
    dimNZ = tmgr->getDimNZ();
    for (int i=1; i<numprocs; i++) {
      bcastSend[0] = dimNZ;
      bcastSend[1] = numRG;
      tmgr->rankToCoordinates(i, x, y, z, t);
      bcastSend[2] = z;
      MPI_Send(bcastSend, 3, MPI_INT, i, 1, MPI_COMM_WORLD);
    }
    tmgr->rankToCoordinates(0, x, y, z, t);
  } else {
    MPI_Recv(bcastRecv, 3, MPI_INT, 0, 1, MPI_COMM_WORLD, &mstat);
    dimNZ = bcastRecv[0];
    numRG = bcastRecv[1];
    z = bcastRecv[2];
  }

  MPI_Barrier(MPI_COMM_WORLD);

  if (myrank == 0) {
    printf("Torus Dimensions %d %d %d %d\n", tmgr->getDimNX(), tmgr->getDimNY(), dimNZ, tmgr->getDimNT());
  }

#if USE_HPM
  HPM_Init();
#endif

  for (hops=0; hops < 1; hops++) {
    // To print the recv times for certain ranks
    int *pmap = (int *) malloc(sizeof(int) * numprocs);

    if (myrank == 0) {
      // Rank 0 makes up a routing map.
      build_process_map(numprocs, smap, rmap, pmap, 2);
    }

    // Broadcast the routing map.
    MPI_Bcast(smap, numprocs, MPI_INT, 0, MPI_COMM_WORLD);
    MPI_Bcast(rmap, numprocs, MPI_INT, 0, MPI_COMM_WORLD);
    MPI_Bcast(pmap, numprocs, MPI_INT, 0, MPI_COMM_WORLD);

    sprintf(blockname, "Block_%d.hpm", hops);

    if (myrank == 0) {
      printf(" Broadcasted the map \n");
    }

#if USE_HPM
    HPM_Start(blockname);
#endif

#if CREATE_JOBS
    sprintf(name, "xt4_job_%d_%d.dat", numprocs, hops);
#else
    sprintf(name, "bgp_line_%d_%d.dat", numprocs, hops);
#endif

    if(pmap[myrank] > 0) {
      sprintf(locname, "bgp_print_%d.dat", myrank);
      locf = fopen(locname, "a");
    }

    for (msg_size=MIN_MSG_SIZE; msg_size<=MAX_MSG_SIZE; msg_size=(msg_size<<1)) {
      for (trial=0; trial<1; trial++) {

        if (myrank == 0) {
          printf(" Going to begin the trial \n");
        }

        pe1 = smap[myrank];   // Am I a sender?
        pe2 = rmap[myrank];   // Am I a receiver?

        MPI_Barrier(MPI_COMM_WORLD);

        // Actual Data Transfer
        if(pe1 != -1) {
          sendTime = MPI_Wtime();
          oldTime = sendTime;
          j = 0;
          for(i=0; i<NUM_MSGS; i++) {
            storeTime[i] = MPI_Wtime();    // Just before the next send operation
            MPI_Send(send_buf, msg_size, MPI_CHAR, pe1, 1, MPI_COMM_WORLD);
          }
          MPI_Recv(recv_buf, msg_size, MPI_CHAR, pe1, 1, MPI_COMM_WORLD, &mstat);
          recTime = (MPI_Wtime() - sendTime) / (NUM_MSGS+1);
          //printf(" My Rank : %d Experiment: %d MSG_SIZE: %d -- Completed send recv \n", myrank, hops, msg_size);
        }

        if(pe2 != -1) {
          sendTime = MPI_Wtime();
          oldTime = sendTime;
          j = 0;
          for(i=0; i<NUM_MSGS; i++) {
            MPI_Recv(recv_buf, msg_size, MPI_CHAR, pe2, 1, MPI_COMM_WORLD, &mstat);
            recvTime[i] = MPI_Wtime();     // Just after the next recv operation
          }
          MPI_Send(send_buf, msg_size, MPI_CHAR, pe2, 1, MPI_COMM_WORLD);
          recTime = (MPI_Wtime() - sendTime) / (NUM_MSGS+1);
        }

        // Recv times sent back to the Senders for b/w calculations
        if(myrank == 0) {
          printf(" My Rank : %d Experiment: %d MSG_SIZE: %d -- Reached barrier in middle \n", myrank, hops, msg_size);
        }

        pe1 = smap[myrank];   // Am I a sender?
        pe2 = rmap[myrank];   // Am I a receiver?

        MPI_Barrier(MPI_COMM_WORLD);

        if(pe1 != -1) {
          MPI_Recv(recvTime, NUM_MSGS, MPI_DOUBLE, pe1, 1, MPI_COMM_WORLD, &mstat);
          if(pmap[myrank] == 1) {
            printf(" My Rank : %d Hops: %d MSG_SIZE: %d Sender Side Exp trial: %d Avg recv time %g \n", myrank, hops, msg_size, trial, recTime);
            //printf(" My Rank : %d Hops: %d MSG_SIZE: %d Sender Side Exp trial: %d Recv time %g \n", myrank, hops, msg_size, trial, recvTime);
            for(i=0; i<NUM_MSGS; i++) {
              storeBw[i] = msg_size/(recvTime[i] - storeTime[i]);
              fprintf(locf, "%d %d %d %g %g %g %g \n", hops, myrank, msg_size,
                      500000*(storeTime[i]+recvTime[i]), storeBw[i],
                      1000000*recvTime[i], 1000000*storeTime[i]);
            }
          }
        }

        if(pe2 != -1) {
          MPI_Send(recvTime, NUM_MSGS, MPI_DOUBLE, pe2, 1, MPI_COMM_WORLD);
        }

      } // end for loop of trials
    } // end for loop of msgs

    if(pmap[myrank] > 0) {
      fflush(NULL);
      fclose(locf);
    }
    free(pmap);

#if USE_HPM
    HPM_Stop(blockname);
#endif
  } // end for loop of hops

#if USE_HPM
  HPM_Print();
#endif

  if(myrank == 0)
    printf("Program Complete\n");

  MPI_Finalize();
  return 0;
}
Main(CkArgMsg* m) {
  if ( (m->argc != 3) && (m->argc != 7) ) {
    CkPrintf("%s [array_size] [block_size]\n", m->argv[0]);
    CkPrintf("OR %s [array_size_X] [array_size_Y] [array_size_Z] [block_size_X] [block_size_Y] [block_size_Z]\n", m->argv[0]);
    CkAbort("Abort");
  }

  // set iteration counter to zero
  iterations = 0;

  // store the main proxy
  mainProxy = thisProxy;

  if(m->argc == 3) {
    arrayDimX = arrayDimY = arrayDimZ = atoi(m->argv[1]);
    blockDimX = blockDimY = blockDimZ = atoi(m->argv[2]);
  } else if (m->argc == 7) {
    arrayDimX = atoi(m->argv[1]);
    arrayDimY = atoi(m->argv[2]);
    arrayDimZ = atoi(m->argv[3]);
    blockDimX = atoi(m->argv[4]);
    blockDimY = atoi(m->argv[5]);
    blockDimZ = atoi(m->argv[6]);
  }

  if (arrayDimX < blockDimX || arrayDimX % blockDimX != 0)
    CkAbort("array_size_X % block_size_X != 0!");
  if (arrayDimY < blockDimY || arrayDimY % blockDimY != 0)
    CkAbort("array_size_Y % block_size_Y != 0!");
  if (arrayDimZ < blockDimZ || arrayDimZ % blockDimZ != 0)
    CkAbort("array_size_Z % block_size_Z != 0!");

  num_chare_x = arrayDimX / blockDimX;
  num_chare_y = arrayDimY / blockDimY;
  num_chare_z = arrayDimZ / blockDimZ;

  // print info
  CkPrintf("\nSTENCIL COMPUTATION WITH NO BARRIERS\n");
  CkPrintf("Running Jacobi on %d processors with (%d, %d, %d) chares\n",
           CkNumPes(), num_chare_x, num_chare_y, num_chare_z);
  CkPrintf("Array Dimensions: %d %d %d\n", arrayDimX, arrayDimY, arrayDimZ);
  CkPrintf("Block Dimensions: %d %d %d\n", blockDimX, blockDimY, blockDimZ);

  // Create new array of worker chares
#if USE_TOPOMAP
  CProxy_JacobiMap map = CProxy_JacobiMap::ckNew(num_chare_x, num_chare_y, num_chare_z);
  CkPrintf("Topology Mapping is being done ... \n");
  CkArrayOptions opts(num_chare_x, num_chare_y, num_chare_z);
  opts.setMap(map);
  array = CProxy_Jacobi::ckNew(opts);
#else
  array = CProxy_Jacobi::ckNew(num_chare_x, num_chare_y, num_chare_z);
#endif

  TopoManager tmgr;
  CkArray *jarr = array.ckLocalBranch();
  int jmap[num_chare_x][num_chare_y][num_chare_z];
  int hops=0, p;

  for(int i=0; i<num_chare_x; i++)
    for(int j=0; j<num_chare_y; j++)
      for(int k=0; k<num_chare_z; k++) {
        jmap[i][j][k] = jarr->procNum(CkArrayIndex3D(i, j, k));
      }

  for(int i=0; i<num_chare_x; i++)
    for(int j=0; j<num_chare_y; j++)
      for(int k=0; k<num_chare_z; k++) {
        p = jmap[i][j][k];
        hops += tmgr.getHopsBetweenRanks(p, jmap[wrap_x(i+1)][j][k]);
        hops += tmgr.getHopsBetweenRanks(p, jmap[wrap_x(i-1)][j][k]);
        hops += tmgr.getHopsBetweenRanks(p, jmap[i][wrap_y(j+1)][k]);
        hops += tmgr.getHopsBetweenRanks(p, jmap[i][wrap_y(j-1)][k]);
        hops += tmgr.getHopsBetweenRanks(p, jmap[i][j][wrap_z(k+1)]);
        hops += tmgr.getHopsBetweenRanks(p, jmap[i][j][wrap_z(k-1)]);
      }
  CkPrintf("Total Hops: %d\n", hops);

#ifdef JACOBI_OPENMP
  CProxy_OmpInitializer ompInit = CProxy_OmpInitializer::ckNew(4);
#else
  //Start the computation
  start();
#endif
}
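The wrap_x, wrap_y, and wrap_z helpers used in the hop count above are not shown in this excerpt; they are presumably periodic wrap-around macros over the chare grid, along these lines (an assumption, not the original definitions):

// Hypothetical periodic-wrap helpers: keep neighbor indices inside
// [0, num_chare_*) so the 3D stencil wraps around the torus.
#define wrap_x(a) (((a) + num_chare_x) % num_chare_x)
#define wrap_y(a) (((a) + num_chare_y) % num_chare_y)
#define wrap_z(a) (((a) + num_chare_z) % num_chare_z)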
extern "C" void LrtsInitCpuTopo(char **argv) {
  static skt_ip_t myip;
  hostnameMsg *msg;
  double startT;

  int obtain_flag = 1;    // default on
  int show_flag = 0;      // default not show topology

  if (CmiMyRank() == 0) {
    topoLock = CmiCreateLock();
  }

#if __FAULT__
  obtain_flag = 0;
#endif
  if(CmiGetArgFlagDesc(argv, "+obtain_cpu_topology",
                       "obtain cpu topology info"))
    obtain_flag = 1;
  if (CmiGetArgFlagDesc(argv, "+skip_cpu_topology",
                        "skip the process of getting cpu topology info"))
    obtain_flag = 0;
  if(CmiGetArgFlagDesc(argv, "+show_cpu_topology",
                       "Show cpu topology info"))
    show_flag = 1;

#if CMK_BIGSIM_CHARM
  if (BgNodeRank() == 0)
#endif
  {
    cpuTopoHandlerIdx = CmiRegisterHandler((CmiHandler)cpuTopoHandler);
    cpuTopoRecvHandlerIdx = CmiRegisterHandler((CmiHandler)cpuTopoRecvHandler);
  }

  if (!obtain_flag) {
    if (CmiMyRank() == 0) cpuTopo.sort();
    CmiNodeAllBarrier();
    CcdRaiseCondition(CcdTOPOLOGY_AVAIL);    // call callbacks
    return;
  }

  if (CmiMyPe() == 0) {
#if CMK_BIGSIM_CHARM
    if (BgNodeRank() == 0)
#endif
      startT = CmiWallTimer();
  }

#if CMK_BIGSIM_CHARM
  if (BgNodeRank() == 0) {
    //int numPes = BgNumNodes()*BgGetNumWorkThread();
    int numPes = cpuTopo.numPes = CkNumPes();
    cpuTopo.nodeIDs = new int[numPes];
    CpuTopology::supported = 1;
    int wth = BgGetNumWorkThread();
    for (int i=0; i<numPes; i++) {
      int nid = i / wth;
      cpuTopo.nodeIDs[i] = nid;
    }
    cpuTopo.sort();
  }
  return;
#else

#if CMK_USE_GM
  CmiBarrier();
#endif

#if 0
  if (gethostname(hostname, 999)!=0) {
    strcpy(hostname, "");
  }
#endif

#if CMK_BLUEGENEL || CMK_BLUEGENEP
  if (CmiMyRank() == 0) {
    TopoManager tmgr;

    int numPes = cpuTopo.numPes = CmiNumPes();
    cpuTopo.nodeIDs = new int[numPes];
    CpuTopology::supported = 1;

    int x, y, z, t, nid;
    for(int i=0; i<numPes; i++) {
      tmgr.rankToCoordinates(i, x, y, z, t);
      nid = tmgr.coordinatesToRank(x, y, z, 0);
      cpuTopo.nodeIDs[i] = nid;
    }
    cpuTopo.sort();
    if (CmiMyPe()==0)
      CmiPrintf("Charm++> Running on %d unique compute nodes (%d-way SMP).\n",
                cpuTopo.numNodes, CmiNumCores());
  }
  CmiNodeAllBarrier();
#elif CMK_BLUEGENEQ
  if (CmiMyRank() == 0) {
    TopoManager tmgr;

    int numPes = cpuTopo.numPes = CmiNumPes();
    cpuTopo.nodeIDs = new int[numPes];
    CpuTopology::supported = 1;

    int a, b, c, d, e, t, nid;
    for(int i=0; i<numPes; i++) {
      tmgr.rankToCoordinates(i, a, b, c, d, e, t);
      nid = tmgr.coordinatesToRank(a, b, c, d, e, 0);
      cpuTopo.nodeIDs[i] = nid;
    }
    cpuTopo.sort();
    if (CmiMyPe()==0)
      CmiPrintf("Charm++> Running on %d unique compute nodes (%d-way SMP).\n",
                cpuTopo.numNodes, CmiNumCores());
  }
  CmiNodeAllBarrier();
#elif CMK_CRAYXT || CMK_CRAYXE || CMK_CRAYXC
  if(CmiMyRank() == 0) {
    int numPes = cpuTopo.numPes = CmiNumPes();
    int numNodes = CmiNumNodes();
    cpuTopo.nodeIDs = new int[numPes];
    CpuTopology::supported = 1;

    int nid;
    for(int i=0; i<numPes; i++) {
      nid = getXTNodeID(CmiNodeOf(i), numNodes);
      cpuTopo.nodeIDs[i] = nid;
    }
    int prev = -1;
    nid = -1;

    // this assumes that all cores on a node have consecutive MPI rank IDs
    // and then changes nodeIDs to 0 to numNodes-1
    for(int i=0; i<numPes; i++) {
      if(cpuTopo.nodeIDs[i] != prev) {
        prev = cpuTopo.nodeIDs[i];
        cpuTopo.nodeIDs[i] = ++nid;
      }
      else
        cpuTopo.nodeIDs[i] = nid;
    }
    cpuTopo.sort();
    if (CmiMyPe()==0)
      CmiPrintf("Charm++> Running on %d unique compute nodes (%d-way SMP).\n",
                cpuTopo.numNodes, CmiNumCores());
  }
  CmiNodeAllBarrier();
#else

  bool topoInProgress = true;

  if (CmiMyPe() >= CmiNumPes()) {
    CmiNodeAllBarrier();       // comm thread waiting
#if CMK_MACHINE_PROGRESS_DEFINED
#if ! CMK_CRAYXT
    while (topoInProgress) {
      CmiNetworkProgress();
      CmiLock(topoLock);
      topoInProgress = done < CmiMyNodeSize();
      CmiUnlock(topoLock);
    }
#endif
#endif
    return;    /* comm thread return */
  }

  /* get my ip address */
  if (CmiMyRank() == 0) {
#if CMK_HAS_GETHOSTNAME && !CMK_BLUEGENEQ
    myip = skt_my_ip();        /* not thread safe, so only calls on rank 0 */
    // fprintf(stderr, "[%d] IP is %d.%d.%d.%d\n", CmiMyPe(), myip.data[0],myip.data[1],myip.data[2],myip.data[3]);
#elif CMK_BPROC
    myip = skt_innode_my_ip();
#else
    if (!CmiMyPe())
      CmiPrintf("CmiInitCPUTopology Warning: Can not get unique name for the compute nodes. \n");
    _noip = 1;
#endif
    cpuTopo.numPes = CmiNumPes();
  }

  CmiNodeAllBarrier();
  if (_noip) return;

  /* prepare a msg to send */
  msg = (hostnameMsg *)CmiAlloc(sizeof(hostnameMsg)+sizeof(_procInfo));
  msg->n = 1;
  msg->procs = (_procInfo*)((char*)msg + sizeof(hostnameMsg));
  CmiSetHandler((char *)msg, cpuTopoHandlerIdx);
  msg->procs[0].pe = CmiMyPe();
  msg->procs[0].ip = myip;
  msg->procs[0].ncores = CmiNumCores();
  msg->procs[0].rank = 0;
  msg->procs[0].nodeID = 0;
  CmiReduce(msg, sizeof(hostnameMsg)+sizeof(_procInfo), combineMessage);

  // blocking here
  while (topoInProgress) {
    CsdSchedulePoll();
    CmiLock(topoLock);
    topoInProgress = done < CmiMyNodeSize();
    CmiUnlock(topoLock);
  }

  if (CmiMyPe() == 0) {
#if CMK_BIGSIM_CHARM
    if (BgNodeRank() == 0)
#endif
      CmiPrintf("Charm++> cpu topology info is gathered in %.3f seconds.\n",
                CmiWallTimer()-startT);
  }
#endif
#endif   /* __BIGSIM__ */

  // now every one should have the node info
  CcdRaiseCondition(CcdTOPOLOGY_AVAIL);    // call callbacks
  if (CmiMyPe() == 0 && show_flag) cpuTopo.print();
}
void build_process_map(int size, int *map, int dist, int numRG, int *mapRG) {
  TopoManager tmgr;
  int pe1, pe2, x, y, z, t;
  int dimNX, dimNY, dimNZ, dimNT;

  dimNX = tmgr.getDimNX();
  dimNY = tmgr.getDimNY();
  dimNZ = tmgr.getDimNZ();
  dimNT = tmgr.getDimNT();

  int count = 0;

#if CREATE_JOBS
  for(int i=0; i<size; i++)
    map[i] = -1;

  // assumes a cubic partition such as 8 x 8 x 8
  // inner brick is always used
  for(int i=0; i<dimNX; i++)
    for(int j=1; j<dimNY-1; j++)
      for(int k=1; k<dimNZ-1; k++)
        for(int l=0; l<dimNT; l++) {
          if(k == 2 || k == dimNZ-3) {
            pe1 = tmgr.coordinatesToRank(i, j, k, l);
            if(k == 2)
              pe2 = tmgr.coordinatesToRank(i, j, dimNZ-3, l);
            else
              pe2 = tmgr.coordinatesToRank(i, j, 2, l);
            map[pe1] = pe2;
            mapRG[count++] = pe1;
            printf("%d ", pe1);
          }
        }
  printf("\n");

  if(dist == 1) {
    // outer brick is used only when dist == 1
    for(int i=0; i<dimNX; i++)
      for(int j=0; j<dimNY; j++)
        for(int k=0; k<dimNZ; k++)
          for(int l=0; l<dimNT; l++) {
            if(j == 0 || j == dimNY-1 || k == 0 || k == dimNZ-1) {
              pe1 = tmgr.coordinatesToRank(i, j, k, l);
              pe2 = tmgr.coordinatesToRank(i, k, j, l);
              if(j == 0 && k == 0)
                pe2 = tmgr.coordinatesToRank(i, dimNY-1, dimNZ-1, l);
              else if(j == dimNY-1 && k == dimNZ-1)
                pe2 = tmgr.coordinatesToRank(i, 0, 0, l);
              map[pe1] = pe2;
            }
          }
  }
#else
  for(int i=0; i<dimNX; i++)
    for(int j=0; j<dimNY; j++)
      for(int k=0; k<dimNZ; k++)
        for(int l=0; l<dimNT; l++) {
          pe1 = tmgr.coordinatesToRank(i, j, k, l);
          if( abs(dimNZ - 1 - 2*k) <= (2*dist+1) ) {
            pe2 = tmgr.coordinatesToRank(i, j, (dimNZ-1-k), l);
            map[pe1] = pe2;
            if(i==0 && j==0 && l==0)
              printf("Hops %d [%d] [%d]\n", 2*dist+1, pe1, pe2);
            if(k == dimNZ/2-1 || k == dimNZ/2)
              mapRG[count++] = pe1;
          } else
            map[pe1] = -1;
        }
#endif

  printf("Barrier Process %d %d\n", count, numRG);
  check_map(size, map);
}
int main(int argc, char *argv[]) {
  int numprocs, myrank, grank;

  MPI_Init(&argc, &argv);
  MPI_Comm_size(MPI_COMM_WORLD, &numprocs);
  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);

  MPI_Group orig_group, new_group;
  MPI_Comm new_comm;

  /* Extract the original group handle */
  MPI_Comm_group(MPI_COMM_WORLD, &orig_group);

  double sendTime, recvTime, min, avg, max;
  double time[3] = {0.0, 0.0, 0.0};
  int msg_size;
  MPI_Status mstat;
  int i=0, pe, trial, hops;
  char name[30];

  char *send_buf = (char *)malloc(MAX_MSG_SIZE);
  char *recv_buf = (char *)malloc(MAX_MSG_SIZE);

  for(i = 0; i < MAX_MSG_SIZE; i++) {
    recv_buf[i] = send_buf[i] = (char) (i & 0xff);
  }

  // allocate the routing map.
  int *map = (int *) malloc(sizeof(int) * numprocs);

  TopoManager *tmgr;
  int dimNZ, numRG, x, y, z, t, bcastSend[3], bcastRecv[3];

  if(myrank == 0) {
    tmgr = new TopoManager();
#if CREATE_JOBS
    numRG = tmgr->getDimNX() * (tmgr->getDimNY() - 2) * 2 * tmgr->getDimNT();
#else
    numRG = tmgr->getDimNX() * tmgr->getDimNY() * 2 * tmgr->getDimNT();
#endif
    dimNZ = tmgr->getDimNZ();
    for (int i=1; i<numprocs; i++) {
      bcastSend[0] = dimNZ;
      bcastSend[1] = numRG;
      tmgr->rankToCoordinates(i, x, y, z, t);
      bcastSend[2] = z;
      MPI_Send(bcastSend, 3, MPI_INT, i, 1, MPI_COMM_WORLD);
    }
    tmgr->rankToCoordinates(0, x, y, z, t);
  } else {
    MPI_Recv(bcastRecv, 3, MPI_INT, 0, 1, MPI_COMM_WORLD, &mstat);
    dimNZ = bcastRecv[0];
    numRG = bcastRecv[1];
    z = bcastRecv[2];
  }

  MPI_Barrier(MPI_COMM_WORLD);

  if (myrank == 0) {
    printf("Torus Dimensions %d %d %d %d\n", tmgr->getDimNX(), tmgr->getDimNY(), dimNZ, tmgr->getDimNT());
  }

#if CREATE_JOBS
  for (hops=0; hops < 2; hops++) {
#else
  for (hops=0; hops < dimNZ/2; hops++) {
#endif
    int *mapRG = (int *) malloc(sizeof(int) * numRG);

    if (myrank == 0) {
      // Rank 0 makes up a routing map.
      build_process_map(numprocs, map, hops, numRG, mapRG);
    }

    // Broadcast the routing map.
    MPI_Bcast(map, numprocs, MPI_INT, 0, MPI_COMM_WORLD);
    MPI_Bcast(mapRG, numRG, MPI_INT, 0, MPI_COMM_WORLD);

    MPI_Group_incl(orig_group, numRG, mapRG, &new_group);
    MPI_Comm_create(MPI_COMM_WORLD, new_group, &new_comm);
    MPI_Group_rank(new_group, &grank);

#if CREATE_JOBS
    sprintf(name, "xt4_job_%d_%d.dat", numprocs, hops);
#else
    sprintf(name, "xt4_line_%d_%d.dat", numprocs, hops);
#endif

    for (msg_size=MIN_MSG_SIZE; msg_size<=MAX_MSG_SIZE; msg_size=(msg_size<<1)) {
      for (trial=0; trial<10; trial++) {

        pe = map[myrank];
        if(pe != -1) {
          if(grank != MPI_UNDEFINED) MPI_Barrier(new_comm);

          if(myrank < pe) {
            // warmup
            for(i=0; i<2; i++) {
              MPI_Send(send_buf, msg_size, MPI_CHAR, pe, 1, MPI_COMM_WORLD);
              MPI_Recv(recv_buf, msg_size, MPI_CHAR, pe, 1, MPI_COMM_WORLD, &mstat);
            }

            sendTime = MPI_Wtime();
            for(i=0; i<NUM_MSGS; i++)
              MPI_Send(send_buf, msg_size, MPI_CHAR, pe, 1, MPI_COMM_WORLD);
            for(i=0; i<NUM_MSGS; i++)
              MPI_Recv(recv_buf, msg_size, MPI_CHAR, pe, 1, MPI_COMM_WORLD, &mstat);
            recvTime = (MPI_Wtime() - sendTime) / NUM_MSGS;

            // cooldown
            for(i=0; i<2; i++) {
              MPI_Send(send_buf, msg_size, MPI_CHAR, pe, 1, MPI_COMM_WORLD);
              MPI_Recv(recv_buf, msg_size, MPI_CHAR, pe, 1, MPI_COMM_WORLD, &mstat);
            }

            if(grank != MPI_UNDEFINED) MPI_Barrier(new_comm);
          } else {
            // warmup
            for(i=0; i<2; i++) {
              MPI_Recv(recv_buf, msg_size, MPI_CHAR, pe, 1, MPI_COMM_WORLD, &mstat);
              MPI_Send(send_buf, msg_size, MPI_CHAR, pe, 1, MPI_COMM_WORLD);
            }

            sendTime = MPI_Wtime();
            for(i=0; i<NUM_MSGS; i++)
              MPI_Recv(recv_buf, msg_size, MPI_CHAR, pe, 1, MPI_COMM_WORLD, &mstat);
            for(i=0; i<NUM_MSGS; i++)
              MPI_Send(send_buf, msg_size, MPI_CHAR, pe, 1, MPI_COMM_WORLD);
            recvTime = (MPI_Wtime() - sendTime) / NUM_MSGS;

            // cooldown
            for(i=0; i<2; i++) {
              MPI_Recv(recv_buf, msg_size, MPI_CHAR, pe, 1, MPI_COMM_WORLD, &mstat);
              MPI_Send(send_buf, msg_size, MPI_CHAR, pe, 1, MPI_COMM_WORLD);
            }

            if(grank != MPI_UNDEFINED) MPI_Barrier(new_comm);
          }

          if(grank != MPI_UNDEFINED) {
            MPI_Allreduce(&recvTime, &min, 1, MPI_DOUBLE, MPI_MIN, new_comm);
            MPI_Allreduce(&recvTime, &avg, 1, MPI_DOUBLE, MPI_SUM, new_comm);
            MPI_Allreduce(&recvTime, &max, 1, MPI_DOUBLE, MPI_MAX, new_comm);
          }
          avg /= numRG;

        } // end if map[pe] != -1

        if(grank == 0) {
          time[0] += min;
          time[1] += avg;
          time[2] += max;
        }
      } // end for loop of trials

      if (grank == 0) {
        FILE *outf = fopen(name, "a");
        fprintf(outf, "%d %g %g %g\n", msg_size, time[0]/10, time[1]/10, time[2]/10);
        fflush(NULL);
        fclose(outf);
        time[0] = time[1] = time[2] = 0.0;
      }
    } // end for loop of msgs

    free(mapRG);
  } // end for loop of hops

  if(grank == 0)
    printf("Program Complete\n");

  MPI_Finalize();
  return 0;
}
Main(CkArgMsg *args) {
  if (args->argc >= 3) {
    dataSizeMin = atoi(args->argv[1]);
    dataSizeMax = atoi(args->argv[2]);
  } else {
    dataSizeMin = 32;
    dataSizeMax = 16384;
  }

  bufferSize = args->argc == 4 ? atoi(args->argv[3]) : TRAM_BUFFER_SIZE;

  CkPrintf("size of envelope: %d\n\n", (int) sizeof(envelope));
  delete args;

  iters = dataSizeMin / DATA_ITEM_SIZE;
  allToAllGroup = CProxy_Participant::ckNew();

#if !CMK_BLUEGENEQ
  int nDims = 2;
  int dims[2] = {CkNumNodes(), CkNumPes() / CkNumNodes()};
  CkPrintf("TEST 1: Using %dD TRAM Topology: %d %d\n", nDims, dims[0], dims[1]);

  // Alternative 3D topology
  // int nDims = 3;
  // int dim1 = CkNumNodes();
  // int dim2 = 1;
  // if (dim1 != 1) {
  //   while (dim2 < dim1) {
  //     dim2 *= 2;
  //     dim1 /= 2;
  //   }
  // }
  // int dims[3] = {dim1, dim2, CkNumPes() / CkNumNodes()};
  // CkPrintf("Topology: %d %d %d\n", dims[0], dims[1], dims[2]);
#else
  TopoManager tmgr;
  int nDims = 3;
  int dims[3] = {tmgr.getDimNA() * tmgr.getDimNB(),
                 tmgr.getDimNC() * tmgr.getDimND() * tmgr.getDimNE(),
                 tmgr.getDimNT()};
  CkPrintf("TEST 1: Using %dD TRAM Topology: %d %d %d\n", nDims,
           dims[0], dims[1], dims[2]);

  // Alternative TRAM topologies for Blue Gene/Q using Topology Manager
  // int nDims = 4;
  // int dims[4] = {tmgr.getDimNA() * tmgr.getDimNB(), tmgr.getDimNC(),
  //                tmgr.getDimND() * tmgr.getDimNE(), tmgr.getDimNT()};
  // int nDims = 6;
  // int dims[6] = {tmgr.getDimNA(), tmgr.getDimNB(), tmgr.getDimNC(),
  //                tmgr.getDimND() * tmgr.getDimNE(),
  //                tmgr.getDimNT() / 8, 8};
#endif

  mainProxy = thisProxy;
  aggregator =
    CProxy_GroupMeshStreamer<DataItem, Participant, SimpleMeshRouter>::
    ckNew(nDims, dims, allToAllGroup, bufferSize, 1, 0.1);

  testType = usingTram;
}